From 3f97531b2c319fe4a460e5d2317fcedf3b0b058a Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Fri, 1 May 2026 12:29:49 +0200
Subject: [PATCH 1/5] feat: live UI server, vendored C library, marketing
 README
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three changes that work together:

1. Visualization is now a live HTTP server, not a static file. The
   `<meta http-equiv="refresh" content="10">` is gone — interactive state
   (filter selections, scroll, click highlights) survives across data
   updates. The `visualize` subcommand starts a tokio + axum server with
   GET / (page), GET /data (JSON), GET /events (SSE). A background task
   re-scans every --interval seconds and only emits a `data-changed`
   event when the new content hash differs from the previous one. The
   page subscribes to /events and reloads only on real change.

2. The C library is now vendored under crates/rayforce-sys/vendor/.
   build.rs compiles it via `cc::Build` on every fresh build — no
   external checkout, no submodule, no env var. RAYFORCE_DIR still works
   as an override for C-side development.

3. README rewritten as a product page (~100 lines). Drops the wall of
   `cargo run -q -p ... -- ...` commands and example outputs that made
   it look like an internal scratchpad. Leads with what raysense does,
   why it matters, and the four commands you actually need.
---
 README.md                                     |  372 +-
 crates/rayforce-sys/Cargo.toml                |    1 +
 crates/rayforce-sys/build.rs                  |  127 +-
 crates/rayforce-sys/vendor/rayforce/LICENSE   |   21 +
 .../vendor/rayforce/include/rayforce.h        |  418 ++
 .../vendor/rayforce/src/core/block.c          |   82 +
 .../vendor/rayforce/src/core/block.h          |   45 +
 .../vendor/rayforce/src/core/epoll.c          |  250 +
 .../vendor/rayforce/src/core/iocp.c           |   60 +
 .../vendor/rayforce/src/core/ipc.c            | 1117 +++
 .../vendor/rayforce/src/core/ipc.h            |   96 +
 .../vendor/rayforce/src/core/kqueue.c         |  248 +
 .../vendor/rayforce/src/core/morsel.c         |  122 +
 .../vendor/rayforce/src/core/morsel.h         |   41 +
 .../vendor/rayforce/src/core/numparse.c       |  452 ++
 .../vendor/rayforce/src/core/numparse.h       |   77 +
 .../vendor/rayforce/src/core/platform.c       |  464 ++
 .../vendor/rayforce/src/core/platform.h       |  178 +
 .../vendor/rayforce/src/core/poll.c           |  122 +
 .../vendor/rayforce/src/core/poll.h           |  115 +
 .../vendor/rayforce/src/core/pool.c           |  504 ++
 .../vendor/rayforce/src/core/pool.h           |   95 +
 .../vendor/rayforce/src/core/profile.h        |  161 +
 .../vendor/rayforce/src/core/progress.c       |  170 +
 .../vendor/rayforce/src/core/runtime.c        |  367 +
 .../vendor/rayforce/src/core/runtime.h        |  136 +
 .../vendor/rayforce/src/core/sock.c           |  201 +
 .../vendor/rayforce/src/core/sock.h           |   47 +
 .../vendor/rayforce/src/core/types.c          |   57 +
 .../vendor/rayforce/src/core/types.h          |   45 +
 .../rayforce-sys/vendor/rayforce/src/io/csv.c | 1821 +++++
 .../rayforce-sys/vendor/rayforce/src/io/csv.h |   34 +
 .../vendor/rayforce/src/lang/cal.h            |   84 +
 .../vendor/rayforce/src/lang/compile.c        |  518 ++
 .../vendor/rayforce/src/lang/env.c            |  658 ++
 .../vendor/rayforce/src/lang/env.h            |  118 +
 .../vendor/rayforce/src/lang/eval.c           | 2626 +++++++
 .../vendor/rayforce/src/lang/eval.h           |  298 +
 .../vendor/rayforce/src/lang/format.c         | 1074 +++
 .../vendor/rayforce/src/lang/format.h         |   50 +
 .../vendor/rayforce/src/lang/internal.h       |  514 ++
 .../vendor/rayforce/src/lang/nfo.c            |  100 +
 .../vendor/rayforce/src/lang/nfo.h            |   69 +
 .../vendor/rayforce/src/lang/parse.c          |  881 +++
 .../vendor/rayforce/src/lang/parse.h          |   39 +
 .../vendor/rayforce/src/lang/syscmd.c         |  359 +
 .../vendor/rayforce/src/lang/syscmd.h         |  103 +
 .../vendor/rayforce/src/mem/arena.c           |  160 +
 .../vendor/rayforce/src/mem/arena.h           |   60 +
 .../vendor/rayforce/src/mem/cow.c             |   79 +
 .../vendor/rayforce/src/mem/cow.h             |   43 +
 .../vendor/rayforce/src/mem/heap.c            | 1601 +++++
 .../vendor/rayforce/src/mem/heap.h            |  404 ++
 .../vendor/rayforce/src/mem/sys.c             |  122 +
 .../vendor/rayforce/src/mem/sys.h             |   49 +
 .../vendor/rayforce/src/ops/agg.c             |  509 ++
 .../vendor/rayforce/src/ops/arith.c           |  422 ++
 .../vendor/rayforce/src/ops/builtins.c        | 2681 +++++++
 .../vendor/rayforce/src/ops/cmp.c             |  330 +
 .../vendor/rayforce/src/ops/collection.c      | 2040 ++++++
 .../vendor/rayforce/src/ops/datalog.c         | 4325 +++++++++++
 .../vendor/rayforce/src/ops/datalog.h         |  344 +
 .../vendor/rayforce/src/ops/dump.c            |  254 +
 .../vendor/rayforce/src/ops/embedding.c       |  870 +++
 .../vendor/rayforce/src/ops/exec.c            | 2272 ++++++
 .../vendor/rayforce/src/ops/exec.h            |   29 +
 .../vendor/rayforce/src/ops/expr.c            | 1776 +++++
 .../vendor/rayforce/src/ops/filter.c          |  685 ++
 .../vendor/rayforce/src/ops/fuse.c            |  210 +
 .../vendor/rayforce/src/ops/fuse.h            |   29 +
 .../vendor/rayforce/src/ops/fvec.c            |  101 +
 .../vendor/rayforce/src/ops/fvec.h            |   52 +
 .../vendor/rayforce/src/ops/glob.c            |  102 +
 .../vendor/rayforce/src/ops/glob.h            |   43 +
 .../vendor/rayforce/src/ops/graph.c           | 1822 +++++
 .../vendor/rayforce/src/ops/graph.h           |   29 +
 .../vendor/rayforce/src/ops/group.c           | 4392 ++++++++++++
 .../vendor/rayforce/src/ops/hash.h            |  252 +
 .../vendor/rayforce/src/ops/idxop.c           |  734 ++
 .../vendor/rayforce/src/ops/idxop.h           |  171 +
 .../vendor/rayforce/src/ops/internal.h        |  992 +++
 .../vendor/rayforce/src/ops/join.c            | 1972 +++++
 .../vendor/rayforce/src/ops/journal.c         |  191 +
 .../vendor/rayforce/src/ops/journal.h         |   64 +
 .../vendor/rayforce/src/ops/lftj.c            |  258 +
 .../vendor/rayforce/src/ops/lftj.h            |  136 +
 .../vendor/rayforce/src/ops/linkop.c          |  328 +
 .../vendor/rayforce/src/ops/linkop.h          |  105 +
 .../vendor/rayforce/src/ops/ops.h             |  726 ++
 .../vendor/rayforce/src/ops/opt.c             | 2031 ++++++
 .../vendor/rayforce/src/ops/opt.h             |   29 +
 .../vendor/rayforce/src/ops/pipe.c            |   63 +
 .../vendor/rayforce/src/ops/pipe.h            |   43 +
 .../vendor/rayforce/src/ops/pivot.c           |  666 ++
 .../vendor/rayforce/src/ops/plan.c            |   31 +
 .../vendor/rayforce/src/ops/plan.h            |   29 +
 .../vendor/rayforce/src/ops/query.c           | 6329 +++++++++++++++++
 .../vendor/rayforce/src/ops/rerank.c          |  546 ++
 .../vendor/rayforce/src/ops/rowsel.c          |  445 ++
 .../vendor/rayforce/src/ops/rowsel.h          |  187 +
 .../vendor/rayforce/src/ops/sort.c            | 3682 ++++++++++
 .../vendor/rayforce/src/ops/string.c          |  604 ++
 .../vendor/rayforce/src/ops/strop.c           |  281 +
 .../vendor/rayforce/src/ops/system.c          |  827 +++
 .../vendor/rayforce/src/ops/tblop.c           |  948 +++
 .../vendor/rayforce/src/ops/temporal.c        |  665 ++
 .../vendor/rayforce/src/ops/temporal.h        |   84 +
 .../vendor/rayforce/src/ops/traverse.c        | 2641 +++++++
 .../vendor/rayforce/src/ops/window.c          | 1223 ++++
 .../vendor/rayforce/src/store/col.c           |  954 +++
 .../vendor/rayforce/src/store/col.h           |   34 +
 .../vendor/rayforce/src/store/csr.c           |  529 ++
 .../vendor/rayforce/src/store/csr.h           |   79 +
 .../vendor/rayforce/src/store/fileio.c        |  270 +
 .../vendor/rayforce/src/store/fileio.h        |   54 +
 .../vendor/rayforce/src/store/hnsw.c          |  972 +++
 .../vendor/rayforce/src/store/hnsw.h          |  133 +
 .../vendor/rayforce/src/store/journal.c       |  656 ++
 .../vendor/rayforce/src/store/journal.h       |  123 +
 .../vendor/rayforce/src/store/meta.c          |   43 +
 .../vendor/rayforce/src/store/meta.h          |   33 +
 .../vendor/rayforce/src/store/part.c          |  503 ++
 .../vendor/rayforce/src/store/part.h          |   33 +
 .../vendor/rayforce/src/store/serde.c         |  984 +++
 .../vendor/rayforce/src/store/serde.h         |   81 +
 .../vendor/rayforce/src/store/splay.c         |  229 +
 .../vendor/rayforce/src/store/splay.h         |   34 +
 .../vendor/rayforce/src/table/dict.c          |  609 ++
 .../vendor/rayforce/src/table/dict.h          |   68 +
 .../vendor/rayforce/src/table/sym.c           | 1251 ++++
 .../vendor/rayforce/src/table/sym.h           |  139 +
 .../vendor/rayforce/src/table/table.c         |  238 +
 .../vendor/rayforce/src/table/table.h         |   40 +
 .../vendor/rayforce/src/vec/atom.c            |  208 +
 .../vendor/rayforce/src/vec/atom.h            |   36 +
 .../vendor/rayforce/src/vec/embedding.h       |   38 +
 .../vendor/rayforce/src/vec/list.c            |  299 +
 .../vendor/rayforce/src/vec/list.h            |   36 +
 .../vendor/rayforce/src/vec/sel.c             |  190 +
 .../vendor/rayforce/src/vec/str.c             |   90 +
 .../vendor/rayforce/src/vec/str.h             |  103 +
 .../vendor/rayforce/src/vec/vec.c             | 1361 ++++
 .../vendor/rayforce/src/vec/vec.h             |   58 +
 crates/raysense-cli/Cargo.toml                |    4 +
 crates/raysense-cli/src/lib.rs                |  204 +-
 145 files changed, 79003 insertions(+), 368 deletions(-)
 create mode 100644 crates/rayforce-sys/vendor/rayforce/LICENSE
 create mode 100644 crates/rayforce-sys/vendor/rayforce/include/rayforce.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/block.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/block.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/epoll.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/iocp.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/ipc.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/ipc.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/kqueue.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/morsel.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/morsel.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/numparse.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/numparse.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/platform.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/platform.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/poll.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/poll.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/pool.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/pool.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/profile.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/progress.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/runtime.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/runtime.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/sock.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/sock.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/types.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/core/types.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/io/csv.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/io/csv.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/lang/cal.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/lang/compile.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/lang/env.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/lang/env.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/lang/eval.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/lang/eval.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/lang/format.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/lang/format.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/lang/internal.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/lang/nfo.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/lang/nfo.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/lang/parse.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/lang/parse.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/lang/syscmd.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/lang/syscmd.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/mem/arena.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/mem/arena.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/mem/cow.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/mem/cow.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/mem/heap.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/mem/heap.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/mem/sys.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/mem/sys.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/agg.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/arith.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/builtins.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/cmp.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/collection.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/datalog.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/datalog.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/dump.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/embedding.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/exec.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/exec.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/expr.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/filter.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/fuse.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/fuse.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/fvec.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/fvec.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/glob.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/glob.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/graph.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/graph.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/group.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/hash.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/idxop.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/idxop.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/internal.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/join.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/journal.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/journal.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/lftj.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/lftj.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/linkop.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/linkop.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/ops.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/opt.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/opt.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/pipe.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/pipe.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/pivot.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/plan.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/plan.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/query.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/rerank.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/rowsel.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/rowsel.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/sort.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/string.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/strop.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/system.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/tblop.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/temporal.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/temporal.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/traverse.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/ops/window.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/store/col.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/store/col.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/store/csr.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/store/csr.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/store/fileio.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/store/fileio.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/store/hnsw.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/store/hnsw.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/store/journal.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/store/journal.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/store/meta.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/store/meta.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/store/part.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/store/part.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/store/serde.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/store/serde.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/store/splay.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/store/splay.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/table/dict.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/table/dict.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/table/sym.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/table/sym.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/table/table.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/table/table.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/vec/atom.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/vec/atom.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/vec/embedding.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/vec/list.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/vec/list.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/vec/sel.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/vec/str.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/vec/str.h
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/vec/vec.c
 create mode 100644 crates/rayforce-sys/vendor/rayforce/src/vec/vec.h
diff --git a/README.md b/README.md
index c313d4c..4017b72 100644
--- a/README.md
+++ b/README.md
@@ -23,347 +23,97 @@
 
 # Raysense
 
-Raysense is local architectural telemetry for AI coding agents.
+**Architectural X-ray for your codebase. Live, local, agent-ready.**
 
-It scans a repository, extracts files/functions/imports, resolves local
-dependency edges, classifies imports, computes graph health, and can materialize
-the scan into Rayforce-backed memory tables.
+Point Raysense at a repository and it tells you, in seconds, where the
+load-bearing files are, which modules are tangled, where complexity is
+hiding, and which parts of the codebase are bus-factor-of-one. It runs
+locally, ships zero data anywhere, and exposes everything to AI coding
+agents through MCP.
 
-## Current Test Commands
+## Why
 
-```bash
-cargo run -q -p raysense-cli -- health .
-cargo run -q -p raysense-cli -- edges .
-cargo run -q -p raysense-cli -- observe . --memory
-```
+LLM coding agents read source one file at a time. They don't see the
+*shape* of your project: the cycles, the god files, the dead code, the
+files that change together every commit. Raysense computes that shape
+once and serves it back as queryable structure — to your agents, to
+your CI, and to a live dashboard you can keep open while you work.
 
-Against Rayforce from this workspace layout:
+## Install
 
 ```bash
-cargo run -q -p raysense-cli -- health ../rayforce
-cargo run -q -p raysense-cli -- edges ../rayforce | head
-cargo run -q -p raysense-cli -- observe ../rayforce --memory
-cargo run -q -p raysense-cli -- baseline save ../rayforce
-cargo run -q -p raysense-cli -- baseline diff ../rayforce
-```
-
-Current Rayforce baseline:
-
-```text
-score 77
-quality_signal 7708
-coverage_score 100
-structural_score 72
-facts files=190 functions=2662 calls=25704 call_edges=15492 imports=1039
-entry_points total=50 binaries=6 examples=4 tests=40
-imports local=657 external=0 system=382 unresolved=0
-graph resolved_edges=657 cycles=0
-coupling local_edges=657 cross_module_edges=240 cross_module_ratio=0.365 cross_unstable_edges=200 cross_unstable_ratio=0.304 entropy=0.824 entropy_bits=3.201 entropy_pairs=15 average_module_cohesion=0.667 cohesive_module_count=18 god_files=2 unstable_hotspots=4
-calls total=25704 resolved_edges=15492 resolution_ratio=0.603 max_function_fan_in=2537 max_function_fan_out=293
-size max_file_lines=6329 max_function_lines=2334 large_files=63 long_functions=209
-test_gap production_files=150 test_files=40 files_without_nearby_tests=150
-dsm modules=5 module_edges=240
-root_causes modularity=0.635 acyclicity=1.000 depth=1.000 equality=0.450 redundancy=0.952
-architecture depth=3 max_blast_radius=25 max_blast_radius_file=src/ops/query.c max_non_foundation_blast_radius=12 max_non_foundation_blast_radius_file=src/runtime/eval.c attack_surface_files=45 attack_surface_ratio=0.703 upward_violations=3 upward_violation_ratio=0.012 average_distance_from_main_sequence=0.214
-complexity max=131 avg=3.904 gini=0.550 dead_functions=50 duplicate_groups=20 redundancy_ratio=0.048
-evolution available=true commits_sampled=500 changed_files=190
-rules warnings=7 info=31
+cargo install raysense
 ```
 
-## Commands
+Or build from source — see [Building from source](#building-from-source) below.
 
-Install from crates.io after building a local Rayforce library:
+## Use
 
-```sh
-git clone git@github.com:RayforceDB/rayforce.git
-make -C rayforce lib
-RAYFORCE_DIR="$PWD/rayforce" cargo install raysense
-```
+Four things, one binary.
 
-For library use:
+**Live dashboard.** Open it once, leave it open. The page reloads only when
+a re-scan finds a real change in your code, never on a blind refresh timer.
 
-```sh
-cargo add raysense
-```
-
-```text
-raysense observe <path> [--json] [--memory] [--config <path>]
-raysense health <path> [--json] [--config <path>]
-raysense edges <path> [--all] [--config <path>]
-raysense memory <path> [--config <path>]
-raysense check [path] [--json] [--sarif <path>] [--config <path>]
-raysense gate [path] [--save] [--baseline <path>] [--json] [--config <path>]
-raysense watch [path] [--interval <seconds>] [--config <path>]
-raysense visualize [path] [--watch] [--interval <seconds>] [--output <path>] [--config <path>]
-raysense plugin list [path] [--config <path>]
-raysense plugin add <name> <extensions...> [--file-name <name>] [--path <path>] [--config <path>]
-raysense plugin add-standard [--path <path>] [--config <path>]
-raysense plugin remove <name> [--path <path>] [--config <path>]
-raysense plugin validate <dir> [--json]
-raysense plugin scaffold <name> <extension> [--path <path>]
-raysense plugin init <name> <extension> [--path <path>] [--config <path>]
-raysense policy list
-raysense policy init <preset> [path] [--config <path>]
-raysense trend record [path] [--config <path>]
-raysense trend show [path] [--json] [--config <path>]
-raysense remediate [path] [--json] [--config <path>]
-raysense what-if [path] [--ignore <pattern>] [--generated <pattern>] [--json] [--config <path>]
-raysense baseline save <path> [--output <path>] [--config <path>]
-raysense baseline diff <path> [--baseline <path>] [--config <path>] [--json]
-raysense baseline tables [--baseline <path>] [--json]
-raysense baseline table <name> [--baseline <path>] [--columns <a,b>] [--filter <column:op:value>] [--filter-mode <all|any>] [--sort <column[:asc|desc]>] [--desc] [--offset <n>] [--limit <n>] [--json]
-raysense mcp
-raysense rayforce-version
+```bash
+raysense visualize .
 ```
 
-If `<path>/.raysense.toml` exists, health-producing commands load it
-automatically. `--config` overrides that path.
-Project-local plugin manifests under `.raysense/plugins/*/plugin.toml` are also
-loaded during scans, using the same fields as `[[scan.plugins]]`.
-When `.raysense/plugins/<name>/queries/tags.scm` is present and the plugin
-selects a compiled grammar with `grammar = "rust"`, `c`, `cpp`, `python`, or
-`typescript`, or with `grammar_path` and optional `grammar_symbol`, Raysense
-uses query captures for functions and imports before falling back to token
-prefixes.
-
-`raysense mcp` runs a stdio MCP server for agents. It exposes tools to read and
-write config, run health, inspect scan facts, list dependency edges, read
-hotspots, read rule findings, read DSM module edges, inspect architecture,
-coupling, cycles, hottest files/functions, blast radius, module levels, run
-what-if config simulations, and materialize memory table summaries. It can also
-write visualization dashboards, emit SARIF reports, apply policy presets,
-save/diff baselines, and query saved baseline tables with projection, filters,
-sorting, and pagination. Agent session tools can save an in-memory baseline,
-rescan, end the session, check rules, inspect evolution, inspect DSM data,
-inspect test gaps, list configured language plugins, and add generic or
-standard plugin profiles, remove plugin profiles, or validate local plugin
-directories. It can also scaffold project-local plugin templates.
+**Health report.** A single number out of 100, plus A–F grades on six
+dimensions, plus the rules currently failing.
 
-`raysense visualize` writes a self-refreshing local HTML dashboard with file
-size blocks, module graph edges, hotspots, rules, complexity, test gaps, and an
-embedded telemetry JSON payload. Use `--watch` to keep regenerating the page
-from fresh scans.
-
-Baselines are stored under `<path>/.raysense/baseline` by default. The manifest
-is JSON for fast agent diffs, and baseline tables are written under `tables/`
-in Rayforce splayed-table format.
-
-Baseline table filters use `column:op:value`, where `op` is one of `eq`, `ne`,
-`in`, `not_in`, `contains`, `starts_with`, `ends_with`, `regex`, `not_regex`,
-`gt`, `gte`, `lt`, or `lte`. Filters default to AND semantics; use
-`--filter-mode any` for OR.
-Repeat `--sort` to apply ordered multi-column sorting.
-
-CLI examples:
-
-```sh
-raysense baseline save .
-raysense baseline tables --baseline .raysense/baseline
-raysense baseline table files --baseline .raysense/baseline --columns path,language,lines --filter 'language:in:["c","rust"]' --sort language:asc --sort lines:desc --limit 10
-raysense baseline table files --baseline .raysense/baseline --columns path --filter 'path:regex:^src/ops/.*\.c$' --filter 'path:not_regex:query' --limit 10
+```bash
+raysense health .
 ```
 
-MCP query example:
+**CI gate.** Exit non-zero if any rule fails or scores drop against a
+saved baseline.
 
-```json
-{
-  "name": "raysense_baseline_table_read",
-  "arguments": {
-    "baseline_path": ".raysense/baseline",
-    "table": "files",
-    "columns": ["path", "language", "lines"],
-    "filters": [
-      {"column": "language", "op": "in", "value": ["c", "rust"]},
-      {"column": "path", "op": "regex", "value": "^src/.*\\.(c|rs)$"}
-    ],
-    "filter_mode": "all",
-    "sort": [
-      {"column": "language", "direction": "asc"},
-      {"column": "lines", "direction": "desc"}
-    ],
-    "limit": 10
-  }
-}
+```bash
+raysense check .
 ```
 
-Release checks:
+**Agent connector.** Hook Raysense into Claude, Cursor, or any MCP-capable
+client. 40+ tools — scan, edges, hotspots, what-if simulation, baseline
+diff, evolution metrics — all queryable.
 
-```sh
-cargo package -p rayforce-sys
-cargo package -p raysense-core
-cargo package -p raysense-memory
-cargo package -p raysense-cli
-cargo package -p raysense
+```bash
+raysense mcp
 ```
 
-Run the `publish` workflow manually with `dry_run=true` before publishing a
-release. The workflow publishes packages in dependency order, waits for each
-new package to appear in the registry index, and then runs a post-release
-install and library smoke check.
-
-Example config:
+## What it measures
 
-```toml
-[scan]
-ignored_paths = ["target", "fixtures/generated"]
-generated_paths = ["**/generated/*"]
-enabled_languages = []
-disabled_languages = []
-module_roots = ["crates", "src"]
-test_roots = ["tests"]
-public_api_paths = ["src/lib.rs"]
+- **Coupling, cohesion, instability** — Robert Martin's stable-foundation
+  model, plus blast radius and main-sequence distance.
+- **Complexity** — cyclomatic and cognitive, per function and aggregated.
+- **Cycles and depth** — strongly-connected components, longest acyclic
+  path, upward-layer violations.
+- **Evolution** — bus factor, change-coupling pairs, temporal hotspots
+  (churn × complexity), file age.
+- **Types and inheritance** — type facts with base-class extraction
+  (Python and TypeScript via tree-sitter, others via line parsing).
+- **Test gaps** — files without nearby tests, ranked by risk.
+- **Six A–F dimensions** — modularity, acyclicity, depth, equality,
+  redundancy, structural uniformity. One 0–100 quality signal.
 
-[[scan.plugins]]
-name = "foo"
-grammar = "rust"
-grammar_path = "grammars/foo.so"
-grammar_symbol = "tree_sitter_foo"
-extensions = ["foo"]
-file_names = ["Foofile"]
-function_prefixes = ["function "]
-import_prefixes = ["load "]
-call_suffixes = ["("]
-abstract_type_prefixes = ["interface "]
-concrete_type_prefixes = ["class ", "type "]
-tags_query = """
-(function_item
-  name: (identifier) @name) @definition.function
-"""
-package_index_files = ["index.foo"]
-test_path_patterns = ["tests/*", "*_test.foo"]
-source_roots = ["src"]
-ignored_paths = ["build/*"]
-local_import_prefixes = ["."]
-max_function_complexity = 15
-max_cognitive_complexity = 20
-max_file_lines = 500
-max_function_lines = 80
-resolver_alias_files = ["foo.config.json"]
-namespace_separator = "."
-module_prefix_files = ["mod.foo"]
-module_prefix_directives = ["package "]
-entry_point_patterns = ["main"]
-test_module_patterns = ["tests/*"]
-test_attribute_patterns = ["@Test"]
-parameter_node_kinds = ["parameter"]
-complexity_node_kinds = ["if_statement", "while_statement"]
-logical_operator_kinds = ["&&", "||"]
-abstract_base_classes = ["Base"]
+## Configuration
 
-[rules]
-min_quality_signal = 0
-min_modularity = 0.0
-min_acyclicity = 0.0
-min_depth = 0.0
-min_equality = 0.0
-min_redundancy = 0.0
-max_cycles = 0
-max_coupling_ratio = 1.0
-max_function_complexity = 15
-max_cognitive_complexity = 0
-max_file_lines = 0
-max_function_lines = 0
-no_god_files = true
-high_file_fan_in = 50
-high_file_fan_out = 15
-large_file_lines = 500
-max_large_file_findings = 20
-low_call_resolution_min_calls = 100
-low_call_resolution_ratio = 0.5
-high_function_fan_in = 200
-high_function_fan_out = 100
-max_call_hotspot_findings = 5
-max_upward_layer_violations = 0
-no_tests_detected = true
+Everything is overridable in `.raysense.toml` at the repo root: rule
+thresholds, plugin language definitions, baseline scoring, what-if
+ignored paths. Per-language rule overrides let one language demand
+stricter caps than another. `raysense --help` lists every flag.
 
-[[boundaries.forbidden_edges]]
-from = "src"
-to = "test"
-reason = "runtime code must not depend on tests"
+## Building from source
 
-[[boundaries.layers]]
-name = "core"
-path = "src/core/*"
-order = 0
+The C dependency is vendored. Clone and build — that's it:
 
-[score]
-modularity_weight = 1.0
-acyclicity_weight = 1.0
-depth_weight = 1.0
-equality_weight = 1.0
-redundancy_weight = 1.0
-structural_uniformity_weight = 0.0
+```bash
+git clone https://github.com/RayforceDB/raysense.git
+cd raysense
+cargo build --release
 ```
 
-## Status
-
-The first testable version has grammar-backed support for Rust, C/C++, Python,
-and TypeScript, plus a built-in generic catalog for common project languages
-and formats:
+No external setup, no submodules, no environment variables.
 
-- Configurable scan filtering by ignored paths and enabled/disabled languages.
-- Configurable module roots for DSM and architecture grouping.
-- Generic configured language plugins by file extension with configurable
-  function, import, and call token extraction.
-- Standard language plugin profiles can be listed through MCP or materialized
-  into project config with `raysense plugin add-standard`.
-- Project-local plugin manifests can be loaded from
-  `.raysense/plugins/*/plugin.toml`.
-- Built-in generic analyzers for Go, Java, Kotlin, Scala, C#, PHP, Ruby, Swift,
-  shell, SQL, Lua, Perl, Dart, Elixir, Haskell, OCaml, F#, Clojure, Solidity,
-  protobuf, GraphQL, build/config formats, and other common file types.
-- Tree-sitter-backed Rust, C, C++, Python, and TypeScript function discovery
-  with lightweight fallback extraction.
-- Tree-sitter-backed Rust `use`/`mod`, C/C++ include, Python import, and
-  TypeScript import extraction with lightweight fallback extraction.
-- Tree-sitter-backed Rust, C, C++, Python, and TypeScript call facts with
-  enclosing function ids.
-- Conservative call-edge resolution for unambiguous function names.
-- Function-level call metrics: resolution ratio, fan-in/fan-out, and top
-  called/calling functions.
-- Project profile inference for reusable include-root discovery.
-- Entry point facts for binaries, examples, and tests.
-- Local, external, system, and unresolved import classification.
-- Graph metrics: resolved edges, cycles, fan-in, fan-out.
-- Health summary with score, 0-10000 quality signal, root-cause scores,
-  import breakdown, hotspots, coupling, size, entry point, test-gap, DSM,
-  architecture, complexity, and evolution metrics.
-- Source-aware complexity, duplicate-body grouping, and public API aware
-  dead-function filtering.
-- Semantic-shape duplicate grouping for code that is structurally similar after
-  names and literals are normalized.
-- Ecosystem-aware module grouping for common monorepo, Rust, Python, Java, and
-  Kotlin layouts.
-- Test-gap candidates include expected test file paths for each unmatched
-  production file.
-- Framework-aware test-gap naming for Rust, Python, TypeScript, Go, Java, and
-  .NET-style projects.
-- Built-in policy presets for Rust crates, monorepos, backend services, and
-  libraries.
-- Remediation suggestions are exposed through the CLI and MCP.
-- Persisted trend samples can be recorded and read back for score/rule deltas.
-- Score calibration weights can be configured for the root-cause dimensions.
-- Built-in rules for high fan-in, production dependencies on test paths,
-  large-file/no-test findings, call-resolution/function-call hotspots, max
-  cycles, max coupling, max function complexity, god-file pressure, and ordered
-  layer constraints.
-- Rule thresholds can be configured with TOML.
-- Forbidden top-level module dependencies can be configured with TOML.
-- Config read/write, health runs, scan facts, edges, hotspots, rule findings,
-  module edges, architecture, coupling, cycles, hottest files/functions, blast
-  radius, module levels, what-if simulations, session start/end, rescans, rule
-  checks, evolution, DSM, test gaps, plugin listing, remediation suggestions,
-  trend metrics, policy presets, memory summaries, and saved baseline table
-  queries are exposed through the MCP interface.
-- Baseline save/diff is available through the CLI and MCP, with Rayforce
-  splayed-table storage for baseline tables.
-- MCP session baselines are persisted by default and can be compared across
-  process restarts.
-- CLI quality gate, watch loop, plugin management, and generated self-refreshing
-  local HTML architecture visualization are available.
-- Rayforce table materialization for scan facts, call facts, call edges,
-  health summary, hotspots, rules, module edges, and changed-file evolution
-  metrics.
+## License
 
-CI runs on pushes and pull requests. Publish runs when a release is published
-and can also be started manually.
+MIT. See [LICENSE](LICENSE).
diff --git a/crates/rayforce-sys/Cargo.toml b/crates/rayforce-sys/Cargo.toml
index 0726ddd..2e82ffe 100644
--- a/crates/rayforce-sys/Cargo.toml
+++ b/crates/rayforce-sys/Cargo.toml
@@ -29,3 +29,4 @@ description = "Rust FFI bindings for Rayforce used by Raysense"
 links = "rayforce"
 
 [build-dependencies]
+cc = "1"
diff --git a/crates/rayforce-sys/build.rs b/crates/rayforce-sys/build.rs
index a6ad1af..3ecc531 100644
--- a/crates/rayforce-sys/build.rs
+++ b/crates/rayforce-sys/build.rs
@@ -21,38 +21,23 @@
  *   SOFTWARE.
  */
 
+//! Compile the vendored C library directly via `cc`. No external checkout
+//! required — `cargo build` works from a fresh clone with no extra steps.
+//! Set `RAYFORCE_DIR` only if you want to link against an outside build for
+//! development.
+
 use std::env;
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 
 fn main() {
     let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
-    let repo_root = manifest_dir.join("../..");
-    let checkout_dir = repo_root.join("deps/rayforce");
-    let sibling_dir = repo_root.join("../rayforce");
-    let rayforce_dir = env::var_os("RAYFORCE_DIR").map(PathBuf::from).unwrap_or({
-        if checkout_dir.exists() {
-            checkout_dir
-        } else {
-            sibling_dir
-        }
-    });
 
-    let include_dir = rayforce_dir.join("include");
-    let lib_dir = rayforce_dir.clone();
-    let lib_path = lib_dir.join("librayforce.a");
-
-    if !lib_path.exists() {
-        panic!(
-            "missing {}; build Rayforce with `make -C {} lib` or set RAYFORCE_DIR",
-            lib_path.display(),
-            rayforce_dir.display()
-        );
+    if let Some(external_dir) = env::var_os("RAYFORCE_DIR") {
+        link_external(PathBuf::from(external_dir));
+    } else {
+        compile_vendored(&manifest_dir.join("vendor/rayforce"));
     }
 
-    println!("cargo:include={}", include_dir.display());
-    println!("cargo:rustc-link-search=native={}", lib_dir.display());
-    println!("cargo:rustc-link-lib=static=rayforce");
-
     if env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("linux") {
         println!("cargo:rustc-link-lib=m");
         println!("cargo:rustc-link-lib=pthread");
@@ -61,9 +46,101 @@ fn main() {
     }
 
     println!("cargo:rerun-if-env-changed=RAYFORCE_DIR");
+}
+
+/// Default path: build the vendored sources with `cc::Build`. Excludes the
+/// REPL-only sources (`src/app/{main,repl,term}.c`); only the library is needed.
+fn compile_vendored(vendor_dir: &Path) {
+    let include_dir = vendor_dir.join("include");
+    let src_dir = vendor_dir.join("src");
+    let mut build = cc::Build::new();
+    build
+        .std("c17")
+        .include(&include_dir)
+        .include(&src_dir)
+        .flag_if_supported("-fPIC")
+        .flag_if_supported("-Wno-unused-parameter")
+        .flag_if_supported("-Wno-unused-but-set-variable")
+        .flag_if_supported("-Wno-unused-variable")
+        .flag_if_supported("-Wno-unused-function");
+
+    if let Ok(profile) = env::var("PROFILE") {
+        if profile == "release" {
+            build
+                .opt_level(3)
+                .flag_if_supported("-funroll-loops")
+                .flag_if_supported("-fomit-frame-pointer")
+                .flag_if_supported("-fno-math-errno");
+        }
+    }
+
+    let mut count = 0usize;
+    for entry in walk_c_sources(&src_dir) {
+        if entry.ends_with(Path::new("app/main.c"))
+            || entry.ends_with(Path::new("app/repl.c"))
+            || entry.ends_with(Path::new("app/term.c"))
+        {
+            continue;
+        }
+        println!("cargo:rerun-if-changed={}", entry.display());
+        build.file(&entry);
+        count += 1;
+    }
+    if count == 0 {
+        panic!(
+            "no C sources found under {} — vendor/ is empty?",
+            src_dir.display()
+        );
+    }
+    println!("cargo:rerun-if-changed={}", include_dir.display());
+    println!("cargo:include={}", include_dir.display());
+    build.compile("rayforce");
+}
+
+/// Optional: link against an externally-built `librayforce.a`. Used only for
+/// rayforce development; everyone else gets the vendored compile path above.
+fn link_external(rayforce_dir: PathBuf) {
+    let include_dir = rayforce_dir.join("include");
+    let lib_path = rayforce_dir.join("librayforce.a");
+    if !lib_path.exists() {
+        panic!(
+            "RAYFORCE_DIR={} but {} is missing — build with `make -C {} lib`",
+            rayforce_dir.display(),
+            lib_path.display(),
+            rayforce_dir.display(),
+        );
+    }
+    println!("cargo:include={}", include_dir.display());
+    println!(
+        "cargo:rustc-link-search=native={}",
+        rayforce_dir.display()
+    );
+    println!("cargo:rustc-link-lib=static=rayforce");
     println!("cargo:rerun-if-changed={}", lib_path.display());
     println!(
         "cargo:rerun-if-changed={}",
         include_dir.join("rayforce.h").display()
     );
 }
+
+/// Collect every `*.c` file under `root`, sorted for a stable build order.
+/// Unreadable directories are skipped. Pure-std (no walkdir dep) by design.
+fn walk_c_sources(root: &Path) -> Vec<PathBuf> {
+    let mut out = Vec::new();
+    let mut stack = vec![root.to_path_buf()];
+    while let Some(dir) = stack.pop() {
+        let Ok(entries) = std::fs::read_dir(&dir) else {
+            continue;
+        };
+        for entry in entries.flatten() {
+            let path = entry.path();
+            if path.is_dir() {
+                stack.push(path);
+            } else if path.extension().and_then(|s| s.to_str()) == Some("c") {
+                out.push(path);
+            }
+        }
+    }
+    out.sort();
+    out
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/LICENSE b/crates/rayforce-sys/vendor/rayforce/LICENSE
new file mode 100644
index 0000000..d52e496
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Anton Kundenko <singaraiona@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/crates/rayforce-sys/vendor/rayforce/include/rayforce.h b/crates/rayforce-sys/vendor/rayforce/include/rayforce.h
new file mode 100644
index 0000000..f5b83f9
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/include/rayforce.h
@@ -0,0 +1,418 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_H
+#define RAY_H
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <assert.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* ===== Semantic Versioning ===== */
+
+#define RAY_VERSION_MAJOR 2
+#define RAY_VERSION_MINOR 1
+#define RAY_VERSION_PATCH 0
+
+/* Packed version number: decimal MMmmpp (major*10000 + minor*100 + patch) */
+#define RAY_VERSION_NUMBER \
+    ((RAY_VERSION_MAJOR * 10000) + (RAY_VERSION_MINOR * 100) + RAY_VERSION_PATCH)
+
+/* Compile-time version check: true if lib version >= (major, minor, patch) */
+#define RAY_VERSION_AT_LEAST(major, minor, patch) \
+    (RAY_VERSION_NUMBER >= ((major) * 10000 + (minor) * 100 + (patch)))
+
+/* Runtime version query */
+int  ray_version_major(void);
+int  ray_version_minor(void);
+int  ray_version_patch(void);
+const char* ray_version_string(void);
+
+/* ===== Type Constants ===== */
+
+#define RAY_LIST       0
+#define RAY_BOOL       1
+#define RAY_U8         2
+#define RAY_I16        3
+#define RAY_I32        4
+#define RAY_I64        5
+#define RAY_F32        6
+#define RAY_F64        7
+#define RAY_DATE       8
+#define RAY_TIME       9
+#define RAY_TIMESTAMP 10
+#define RAY_GUID      11
+/* Unified dictionary-encoded string column (adaptive width) */
+#define RAY_SYM       12
+/* Variable-length string column (inline + pool) */
+#define RAY_STR       13
+
+/* Compound types */
+#define RAY_INDEX     97   /* Accelerator index attached to a vector (see ops/idxop.h) */
+#define RAY_TABLE     98
+#define RAY_DICT      99
+
+/* Function types (Rayforce-compatible) */
+#define RAY_LAMBDA    100   /* User-defined function (compiled body + env) */
+#define RAY_UNARY     101   /* Unary builtin: ray_t* (*)(ray_t*) */
+#define RAY_BINARY    102   /* Binary builtin: ray_t* (*)(ray_t*, ray_t*) */
+#define RAY_VARY      103   /* Variadic builtin: ray_t* (*)(ray_t**, int64_t) */
+#define RAY_ERROR     127   /* Error object: 8-byte packed ASCII code in sdata */
+#define RAY_NULL      126   /* Null / void — singleton static object */
+
+/* ===== Error Handling ===== */
+
+typedef enum {
+    RAY_OK = 0,
+    RAY_ERR_OOM,
+    RAY_ERR_TYPE,
+    RAY_ERR_RANGE,
+    RAY_ERR_LENGTH,
+    RAY_ERR_RANK,
+    RAY_ERR_DOMAIN,
+    RAY_ERR_NYI,
+    RAY_ERR_IO,
+    RAY_ERR_SCHEMA,
+    RAY_ERR_CORRUPT,
+    RAY_ERR_CANCEL,
+    RAY_ERR_PARSE,
+    RAY_ERR_NAME,
+    RAY_ERR_LIMIT,
+    RAY_ERR_RESERVED
+} ray_err_t;
+
+#define RAY_IS_ERR(p)    ((p) != NULL && (uintptr_t)(p) > 31 && ((ray_t*)(p))->type == RAY_ERROR)
+
+/* ===== Core Type: ray_t (32-byte block/object header) ===== */
+
+typedef union ray_t {
+    /* Allocated: object header */
+    struct {
+        /* Bytes 0-15: nullable bitmask / slice / ext nullmap / index */
+        union {
+            uint8_t  nullmap[16];
+            struct { union ray_t* slice_parent; int64_t slice_offset; };
+            struct { union ray_t* ext_nullmap;  union ray_t* sym_dict; };
+            struct { union ray_t* str_ext_null; union ray_t* str_pool; };
+            /* RAY_ATTR_HAS_INDEX (vectors): ray_t* of type RAY_INDEX
+             * carrying both the accelerator payload and the saved nullmap
+             * bytes.  _idx_pad is reserved (must be NULL).  See ops/idxop.h. */
+            struct { union ray_t* index;        union ray_t* _idx_pad; };
+            /* RAY_ATTR_HAS_LINK (vectors, RAY_I32/RAY_I64 only): bytes 8-15
+             * hold an int64 sym ID naming the target table.  link_lo[8]
+             * aliases bytes 0-7 (inline nullmap bits OR ext_nullmap pointer
+             * OR HAS_INDEX index pointer, depending on the other arm in use).
+             * See ops/linkop.h. */
+            struct { uint8_t link_lo[8];        int64_t link_target; };
+        };
+        /* Bytes 16-31: metadata + value */
+        uint8_t  mmod;       /* 0=heap, 1=file-mmap */
+        uint8_t  order;      /* block order (block size = 2^order) */
+        int8_t   type;       /* negative=atom, positive=vector, 0=LIST */
+        uint8_t  attrs;      /* attribute flags */
+        uint32_t rc;         /* reference count (0=free) */
+        union {
+            uint8_t  b8;     /* BOOL atom */
+            uint8_t  u8;     /* U8 atom */
+            int16_t  i16;    /* I16 atom */
+            int32_t  i32;    /* I32 atom */
+            uint32_t u32;
+            int64_t  i64;    /* I64/SYMBOL/DATE/TIME/TIMESTAMP atom */
+            double   f64;    /* F64 atom */
+            union ray_t* obj; /* pointer to child (long strings, GUID) */
+            struct { uint8_t slen; char sdata[7]; }; /* SSO string (<=7 bytes) */
+            int64_t  len;    /* vector element count */
+        };
+        uint8_t  data[];     /* element data (flexible array member) */
+    };
+    /* Free: buddy allocator block (fl_prev/fl_next overlay bytes 0-15) */
+    struct {
+        union ray_t* fl_prev;
+        union ray_t* fl_next;
+    };
+} ray_t;
+
+/* Global null singleton — always valid, retain/release are no-ops (ARENA flag) */
+extern ray_t __ray_null;
+#define RAY_NULL_OBJ  (&__ray_null)
+#define RAY_IS_NULL(p) ((p) == RAY_NULL_OBJ)
+
+/* Global last-resort OOM error sentinel — returned by ray_error when its
+ * own ray_alloc fails (deep OOM, e.g. heap can't even satisfy the 32-byte
+ * error header).  ARENA-flagged like RAY_NULL_OBJ so retain/release are
+ * no-ops; slen=3 / sdata="oom" so RAY_IS_ERR() and ray_err_code() both
+ * work without touching the heap.  Carries no per-VM message — we have
+ * no heap to format one into.  Without this fallback, hard OOM would
+ * silently bypass every `if (RAY_IS_ERR(x)) return x;` guard upstream. */
+extern ray_t __ray_oom;
+#define RAY_OOM_OBJ   (&__ray_oom)
+
+/* Error object creation (defined in core/runtime.c) */
+ray_t* ray_error(const char* code, const char* fmt, ...);
+const char* ray_err_code_str(ray_err_t e);
+ray_err_t ray_err_from_obj(ray_t* err);
+const char* ray_err_code(ray_t* err);
+/* Free a RAY_ERROR object.  ray_release() is a deliberate no-op for
+ * error ray_t* (see src/mem/cow.c), so callers that hold the sole
+ * reference and want the block reclaimed must use this helper instead —
+ * otherwise the error leaks until heap teardown. */
+void ray_error_free(ray_t* err);
+
+/* ===== Accessor Macros ===== */
+
+#define RAY_ATTR_SLICE  0x10
+
+#define ray_type(v)       ((v)->type)
+#define ray_is_atom(v)    ((v)->type < 0 || (v)->type >= RAY_LAMBDA)
+#define ray_is_vec(v)     ((v)->type >= RAY_BOOL && (v)->type <= RAY_STR)
+#define ray_len(v)        ((v)->len)
+
+/* Element type sizes indexed by type tag — covers all uint8_t values.
+ * Only types 1-14 (vectors) have non-zero entries. */
+extern const uint8_t ray_type_sizes[256];
+
+static inline void* ray_data_fn(ray_t* v) {
+    if (__builtin_expect(!!(v->attrs & RAY_ATTR_SLICE), 0))
+        return (char*)v->slice_parent->data
+               + v->slice_offset * ray_type_sizes[(uint8_t)v->type];
+    return (void*)v->data;
+}
+#define ray_slice_data(v) ray_data_fn(v)  /* alias — ray_data is always slice-safe */
+#define ray_data(v)       ray_data_fn(v)
+
+/* ===== Memory Allocator API ===== */
+
+ray_t*    ray_alloc(size_t data_size);
+/* NOTE: ray_free supports cross-thread free via foreign_blocks list.
+ * Blocks freed from a non-owning thread are deferred and coalesced
+ * when the owning heap flushes foreign blocks. */
+void     ray_free(ray_t* v);
+
+/* ===== Memory Budget API ===== */
+
+int64_t  ray_mem_budget(void);      /* returns memory budget in bytes */
+bool     ray_mem_pressure(void);    /* true if calling thread's usage exceeds budget */
+
+/* ===== Interrupt API =====
+ * Long-running queries poll ray_interrupted() at morsel granularity
+ * and bail out with a "cancel" error. The REPL's SIGINT handler wires
+ * Ctrl-C to ray_request_interrupt(); embedders can call it from their
+ * own signal handlers or cancellation threads. */
+
+void     ray_request_interrupt(void);
+void     ray_clear_interrupt(void);
+bool     ray_interrupted(void);
+
+/* ===== Progress API =====
+ * Pull-based, main-thread only. Worker threads never touch progress
+ * state. The executor calls ray_progress_update() at natural sync
+ * points (between ops, after pool dispatches, at pivot phase
+ * boundaries); the update only fires the user callback once the
+ * query has been running for at least min_ms and at most once per
+ * tick_interval_ms. Embedders register a callback to visualize
+ * long-running queries; leaving it unset has zero runtime cost. */
+
+typedef struct {
+    const char* op_name;      /* coarse: scan, group, pivot, join, ... */
+    const char* phase;        /* optional finer label, e.g. "pivot: dedupe" */
+    uint64_t    rows_done;
+    uint64_t    rows_total;   /* 0 = indeterminate */
+    double      elapsed_sec;
+    int64_t     mem_used;     /* bytes: buddy + direct mmap */
+    int64_t     mem_budget;   /* bytes: auto-detected memory budget */
+    bool        final;        /* true on the last tick of a query — renderers
+                                 use this to clear the line */
+} ray_progress_t;
+
+typedef void (*ray_progress_cb)(const ray_progress_t* snapshot, void* user);
+
+/* Register a progress callback. Set cb=NULL to disable. min_ms is the
+ * show-after threshold: queries finishing under it fire
+ * zero callbacks. tick_interval_ms throttles updates once active. */
+void ray_progress_set_callback(ray_progress_cb cb, void* user,
+                                uint64_t min_ms, uint64_t tick_interval_ms);
+
+/* Update progress state. Safe to call from the main thread only.
+ * phase/op_name may be NULL to keep the previous value. Counters
+ * always overwrite — 0 is a valid "starting fresh" value. Fires the
+ * registered callback if the show-after and tick-interval gates pass. */
+void ray_progress_update(const char* op_name, const char* phase,
+                         uint64_t rows_done, uint64_t rows_total);
+
+/* Relabel without touching the counters — for wrappers like exec_node
+ * that only know which operator is about to run but not its rows. A
+ * subsequent ray_progress_update from inside the op will advance the
+ * counters; until then the renderer shows an indeterminate bar. */
+void ray_progress_label(const char* op_name, const char* phase);
+
+/* Mark the end of the current query. Clears state and fires a final
+ * "100%" tick if the query ran long enough to have shown the bar. */
+void ray_progress_end(void);
+
+/* ===== COW / Ref Counting API ===== */
+
+void     ray_retain(ray_t* v);
+void     ray_release(ray_t* v);
+
+/* ===== Atom Constructors ===== */
+
+ray_t* ray_bool(bool val);
+ray_t* ray_u8(uint8_t val);
+ray_t* ray_i16(int16_t val);
+ray_t* ray_i32(int32_t val);
+ray_t* ray_i64(int64_t val);
+ray_t* ray_f32(float val);
+ray_t* ray_f64(double val);
+ray_t* ray_str(const char* s, size_t len);
+ray_t* ray_sym(int64_t id);
+ray_t* ray_date(int64_t val);
+ray_t* ray_time(int64_t val);
+ray_t* ray_timestamp(int64_t val);
+ray_t* ray_guid(const uint8_t* bytes);
+ray_t* ray_typed_null(int8_t type);
+
+/* Null bitmap check for atoms — bit 0 of nullmap[0] marks typed nulls.
+ * Also matches RAY_NULL_OBJ (the untyped null singleton). */
+#define RAY_ATOM_IS_NULL(x) (RAY_IS_NULL(x) || ((x)->type < 0 && ((x)->nullmap[0] & 1)))
+
+/* ===== Vector API ===== */
+
+ray_t* ray_vec_new(int8_t type, int64_t capacity);
+
+/* RAY_SYM index width — encoded in the lower 2 bits of the vector's
+ * `attrs` byte.  Pick the smallest width that fits the destination
+ * symbol-table size; W64 is the safe default when growing globally. */
+#define RAY_SYM_W8    0  /* uint8_t  indices, ≤255 entries */
+#define RAY_SYM_W16   1  /* uint16_t indices, ≤65,535 */
+#define RAY_SYM_W32   2  /* uint32_t indices, ≤4,294,967,295 */
+#define RAY_SYM_W64   3  /* int64_t  indices, unbounded */
+
+ray_t* ray_sym_vec_new(uint8_t sym_width, int64_t capacity);  /* RAY_SYM with adaptive width */
+ray_t* ray_vec_append(ray_t* vec, const void* elem);
+ray_t* ray_vec_set(ray_t* vec, int64_t idx, const void* elem);
+void* ray_vec_get(ray_t* vec, int64_t idx);
+ray_t* ray_vec_slice(ray_t* vec, int64_t offset, int64_t len);
+ray_t* ray_vec_concat(ray_t* a, ray_t* b);
+ray_t* ray_vec_from_raw(int8_t type, const void* data, int64_t count);
+ray_t* ray_vec_insert_at(ray_t* vec, int64_t idx, const void* elem);
+ray_t* ray_vec_insert_vec_at(ray_t* vec, int64_t idx, ray_t* src);
+ray_t* ray_vec_insert_many(ray_t* vec, ray_t* idxs, ray_t* vals);
+
+/* Null bitmap ops */
+void     ray_vec_set_null(ray_t* vec, int64_t idx, bool is_null);
+ray_err_t ray_vec_set_null_checked(ray_t* vec, int64_t idx, bool is_null);
+bool     ray_vec_is_null(ray_t* vec, int64_t idx);
+
+/* ===== String Vector API ===== */
+
+ray_t* ray_str_vec_append(ray_t* vec, const char* s, size_t len);
+const char* ray_str_vec_get(ray_t* vec, int64_t idx, size_t* out_len);
+ray_t* ray_str_vec_set(ray_t* vec, int64_t idx, const char* s, size_t len);
+ray_t* ray_str_vec_insert_at(ray_t* vec, int64_t idx, const char* s, size_t len);
+ray_t* ray_str_vec_compact(ray_t* vec);
+
+/* ===== String API ===== */
+
+const char* ray_str_ptr(ray_t* s);
+size_t      ray_str_len(ray_t* s);
+int         ray_str_cmp(ray_t* a, ray_t* b);
+
+/* ===== List API ===== */
+
+ray_t* ray_list_new(int64_t capacity);
+ray_t* ray_list_append(ray_t* list, ray_t* item);
+ray_t* ray_list_get(ray_t* list, int64_t idx);
+ray_t* ray_list_set(ray_t* list, int64_t idx, ray_t* item);
+ray_t* ray_list_insert_at(ray_t* list, int64_t idx, ray_t* item);
+ray_t* ray_list_insert_many(ray_t* list, ray_t* idxs, ray_t* vals);
+
+/* ===== Symbol Intern Table API ===== */
+
+ray_err_t ray_sym_init(void);
+void     ray_sym_destroy(void);
+int64_t  ray_sym_intern(const char* str, size_t len);
+int64_t  ray_sym_find(const char* str, size_t len);
+ray_t*    ray_sym_str(int64_t id);
+uint32_t ray_sym_count(void);
+bool     ray_sym_ensure_cap(uint32_t needed);
+ray_err_t ray_sym_save(const char* path);
+ray_err_t ray_sym_load(const char* path);
+
+/* ===== Environment API =====
+ *
+ * Thread-safety: the environment is shared global state.  Concurrent calls
+ * to ray_env_get() and ray_env_set() require external synchronization by
+ * the caller. */
+
+ray_t*    ray_env_get(int64_t sym_id);
+ray_err_t ray_env_set(int64_t sym_id, ray_t* val);
+
+/* ===== Table API ===== */
+
+ray_t*       ray_table_new(int64_t ncols);
+ray_t*       ray_table_add_col(ray_t* tbl, int64_t name_id, ray_t* col_vec);
+ray_t*       ray_table_get_col(ray_t* tbl, int64_t name_id);
+ray_t*       ray_table_get_col_idx(ray_t* tbl, int64_t idx);
+int64_t     ray_table_col_name(ray_t* tbl, int64_t idx);
+void        ray_table_set_col_name(ray_t* tbl, int64_t idx, int64_t name_id);
+int64_t     ray_table_ncols(ray_t* tbl);
+int64_t     ray_table_nrows(ray_t* tbl);
+ray_t*       ray_table_schema(ray_t* tbl);
+
+/* ===== Dict API =====
+ *
+ * A dict is a 2-pointer block (type=RAY_DICT, len=2) holding [keys, vals].
+ * Pair count is keys->len.
+ *
+ * keys:  Either a typed vector (RAY_SYM / RAY_I64 / RAY_F64 / RAY_STR /
+ *        RAY_GUID / RAY_DATE / RAY_TIME / RAY_TIMESTAMP / RAY_I32 /
+ *        RAY_I16 / RAY_BOOL / RAY_U8 / RAY_F32) when every key shares
+ *        one atom type, or a RAY_LIST of boxed atoms when keys are
+ *        heterogeneous.  Typed-vec lookup honors the keys' null bitmap
+ *        so a null key never collides with a legitimate zero/sentinel.
+ * vals:  Either a typed vector when every value shares one atom type,
+ *        or a RAY_LIST otherwise (the form parsed from {…} literals,
+ *        which keep value expressions unevaluated until probed).
+ *
+ * Layout matches RAY_TABLE; only the type tag and the contract on
+ * `vals` (a RAY_LIST of column vectors for tables) differ.
+ */
+
+ray_t*  ray_dict_new(ray_t* keys, ray_t* vals);          /* consumes both */
+ray_t*  ray_dict_keys(ray_t* d);                         /* borrowed */
+ray_t*  ray_dict_vals(ray_t* d);                         /* borrowed */
+int64_t ray_dict_len(ray_t* d);                          /* keys->len */
+ray_t*  ray_dict_get(ray_t* d, ray_t* key_atom);         /* owned, NULL if missing */
+ray_t*  ray_dict_upsert(ray_t* d, ray_t* key_atom, ray_t* val); /* COW; consumes d */
+ray_t*  ray_dict_remove(ray_t* d, ray_t* key_atom);             /* COW; consumes d */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RAY_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/block.c b/crates/rayforce-sys/vendor/rayforce/src/core/block.c
new file mode 100644
index 0000000..1401925
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/block.c
@@ -0,0 +1,82 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "block.h"
+#include "core/platform.h"
+#include "../mem/heap.h"
+#include "../ops/ops.h"
+#include "../table/sym.h"
+
+/* Fallback ray_alloc: a weak symbol that the buddy allocator replaces at
+ * link time.  Memory comes from ray_vm_alloc (mmap): page-aligned, zeroed. */
+__attribute__((weak))
+ray_t* ray_alloc(size_t size) {
+    const size_t page_mask = (size_t)4095;
+    size_t bytes = (size < 32) ? 32 : size;   /* never below the 32-byte header */
+    bytes = (bytes + page_mask) & ~page_mask; /* round up to a 4096 multiple */
+    ray_t* blk = (ray_t*)ray_vm_alloc(bytes);
+    return blk ? blk : ray_error("oom", NULL);
+}
+
+size_t ray_block_size(ray_t* v) { /* total bytes of v: 32-byte header + payload */
+    if (ray_is_atom(v)) return 32; /* atoms: header only */
+    /* LIST (type=0) stores child pointers */
+    if (v->type == RAY_LIST) return 32 + (size_t)ray_len(v) * sizeof(ray_t*);
+    /* TABLE / DICT: 2-pointer block [keys, vals] */
+    if (v->type == RAY_TABLE || v->type == RAY_DICT) return 32 + 2 * sizeof(ray_t*);
+    /* RAY_SEL: variable layout — meta + seg_flags + seg_popcnt + bits */
+    if (v->type == RAY_SEL) {
+        int64_t nrows = ray_len(v);
+        if (nrows < 0) return 32; /* negative length: report the header only */
+        uint32_t n_segs = (uint32_t)((nrows + RAY_MORSEL_ELEMS - 1) / RAY_MORSEL_ELEMS); /* ceil(nrows / RAY_MORSEL_ELEMS) */
+        uint32_t n_words = (uint32_t)((nrows + 63) / 64); /* ceil(nrows / 64): one bit per row */
+        size_t dsz = sizeof(ray_sel_meta_t);
+        dsz += (n_segs + 7u) & ~(size_t)7;           /* seg_flags: 1 byte per segment, 8-aligned */
+        dsz += ((size_t)n_segs * 2 + 7u) & ~(size_t)7; /* seg_popcnt: 2 bytes per segment, 8-aligned */
+        dsz += (size_t)n_words * 8;                   /* bits: 8 bytes per 64-row word */
+        return 32 + dsz;
+    }
+    /* Vectors: header (32 bytes) + len * elem_size.
+     * Use ray_sym_elem_size for SYM columns to respect narrow widths. */
+    int8_t t = ray_type(v);
+    if (t <= 0 || t >= RAY_TYPE_COUNT) return 32; /* type tag out of vector range: header only */
+    return 32 + (size_t)ray_len(v) * ray_sym_elem_size(t, v->attrs);
+}
+
+ray_t* ray_block_copy(ray_t* src) { /* shallow copy of src into a new block with rc = 1 */
+    size_t sz = ray_block_size(src);
+    ray_t* dst = ray_alloc(sz);
+    if (!dst || RAY_IS_ERR(dst)) return dst ? dst : ray_error("oom", NULL); /* alloc may fail as NULL or as an error object */
+    /* Save allocator metadata before memcpy overwrites the header */
+    uint8_t new_mmod = dst->mmod;
+    uint8_t new_order = dst->order;
+    memcpy(dst, src, sz);
+    dst->mmod = new_mmod;
+    dst->order = new_order;
+    ray_atomic_store(&dst->rc, 1);
+    if (!ray_retain_owned_refs(dst)) { /* the copied child pointers need references of their own */
+        ray_free(dst);
+        return ray_error("oom", NULL);
+    }
+    return dst;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/block.h b/crates/rayforce-sys/vendor/rayforce/src/core/block.h
new file mode 100644
index 0000000..1c50969
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/block.h
@@ -0,0 +1,45 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_BLOCK_H
+#define RAY_BLOCK_H
+
+/*
+ * block.h — Internal block header utilities.
+ *
+ * Provides ray_block_size() and ray_block_copy(). The core ray_t struct and
+ * accessor macros (ray_type, ray_is_atom, ray_is_vec, ray_len, ray_data,
+ * ray_elem_size) are defined in <rayforce.h>.
+ */
+#include <rayforce.h>
+#include <string.h>
+
+/* Compute total block size in bytes (header + data) */
+size_t ray_block_size(ray_t* v);
+
+/* Allocate a new block and shallow-copy header + data from src.
+ * Retains child refs (STR/LIST/TABLE pointers) via ray_retain_owned_refs.
+ * Requires ray_alloc (declared in rayforce.h, provided by the buddy allocator). */
+ray_t* ray_block_copy(ray_t* src);
+
+#endif /* RAY_BLOCK_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/epoll.c b/crates/rayforce-sys/vendor/rayforce/src/core/epoll.c
new file mode 100644
index 0000000..3452be8
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/epoll.c
@@ -0,0 +1,250 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#if defined(__linux__)
+
+#include "core/poll.h"
+#include "mem/sys.h"
+#include <sys/epoll.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+#define RAY_POLL_MAX_EVENTS 64
+#define RAY_POLL_INITIAL_CAP 16
+
+ray_poll_t* ray_poll_create(void)
+{
+    /* Build an empty poller around a fresh epoll instance; NULL on failure. */
+    int epfd = epoll_create1(0);
+    if (epfd < 0) return NULL;
+    const size_t slots_sz = RAY_POLL_INITIAL_CAP * sizeof(ray_selector_t*);
+    ray_poll_t* p = (ray_poll_t*)ray_sys_alloc(sizeof(*p));
+    ray_selector_t** slots = p ? (ray_selector_t**)ray_sys_alloc(slots_sz) : NULL;
+    if (!slots) { /* either allocation failed: release what was obtained */
+        close(epfd);
+        if (p) ray_sys_free(p);
+        return NULL;
+    }
+
+    memset(p, 0, sizeof(*p));
+    memset(slots, 0, slots_sz);
+    p->fd      = epfd;
+    p->code    = -1;   /* ray_poll_run loops while code stays negative */
+    p->sel_cap = RAY_POLL_INITIAL_CAP;
+    p->sels    = slots;
+    return p;
+}
+
+void ray_poll_destroy(ray_poll_t* poll)
+{
+    if (!poll) return;
+    /* Tear down every live selector.  Same order as ray_poll_deregister: drop
+     * the fd from epoll first, then run close_fn (which may close the fd). */
+    for (uint32_t i = 0; i < poll->n_sels; i++) {
+        ray_selector_t* sel = poll->sels[i];
+        if (!sel) continue;
+        epoll_ctl((int)poll->fd, EPOLL_CTL_DEL, (int)sel->fd, NULL);
+        if (sel->close_fn) sel->close_fn(poll, sel);
+        if (sel->rx.buf) ray_poll_buf_free(sel->rx.buf);
+        ray_poll_buf_free(sel->tx.buf);
+        ray_sys_free(sel);
+        poll->sels[i] = NULL;
+    }
+
+    if (poll->sels) ray_sys_free(poll->sels);
+    close((int)poll->fd);
+    ray_sys_free(poll);
+}
+
+int64_t ray_poll_register(ray_poll_t* poll, ray_poll_reg_t* reg)
+{
+    if (!poll || !reg) return -1;
+    /* Create a selector for reg->fd, add it to epoll for EPOLLIN and return
+     * its id, or -1 on failure.  First: find a free slot or grow the table. */
+    int64_t id = -1;
+    for (uint32_t i = 0; i < poll->n_sels; i++) {
+        if (!poll->sels[i]) { id = (int64_t)i; break; }
+    }
+    if (id < 0) {
+        if (poll->n_sels >= poll->sel_cap) {
+            uint32_t new_cap = poll->sel_cap * 2; /* double the slot table */
+            ray_selector_t** ns = (ray_selector_t**)ray_sys_alloc(
+                new_cap * sizeof(ray_selector_t*));
+            if (!ns) return -1;
+            memcpy(ns, poll->sels, poll->n_sels * sizeof(ray_selector_t*));
+            memset(ns + poll->n_sels, 0,
+                   (new_cap - poll->n_sels) * sizeof(ray_selector_t*));
+            ray_sys_free(poll->sels);
+            poll->sels    = ns;
+            poll->sel_cap = new_cap;
+        }
+        id = (int64_t)poll->n_sels;
+        poll->n_sels++;
+    }
+
+    ray_selector_t* sel = (ray_selector_t*)ray_sys_alloc(sizeof(ray_selector_t));
+    if (!sel) return -1; /* the chosen slot stays NULL and can be reused later */
+    memset(sel, 0, sizeof(*sel));
+
+    sel->fd       = reg->fd;
+    sel->id       = id;
+    sel->type     = reg->type;
+    sel->data     = reg->data;
+    sel->open_fn  = reg->open_fn;
+    sel->close_fn = reg->close_fn;
+    sel->error_fn = reg->error_fn;
+    sel->data_fn  = reg->data_fn;
+    sel->rx.recv_fn = reg->recv_fn;
+    sel->rx.read_fn = reg->read_fn;
+    sel->tx.send_fn = reg->send_fn;
+
+    poll->sels[id] = sel;
+
+    /* Register with epoll */
+    struct epoll_event ev;
+    ev.events  = EPOLLIN;
+    ev.data.u64 = (uint64_t)id; /* ray_poll_run maps events back to the slot via this id */
+
+    if (epoll_ctl((int)poll->fd, EPOLL_CTL_ADD, (int)reg->fd, &ev) < 0) {
+        poll->sels[id] = NULL;
+        ray_sys_free(sel);
+        return -1;
+    }
+
+    if (sel->open_fn) sel->open_fn(poll, sel); /* invoked only after epoll accepted the fd */
+    return id;
+}
+
+void ray_poll_deregister(ray_poll_t* poll, int64_t id)
+{
+    /* Remove selector `id`.  Bounds are checked in 64 bits so that an id
+     * >= 2^32 cannot wrap into range and index past the slot table. */
+    if (!poll || id < 0 || (uint64_t)id >= poll->n_sels) return;
+    ray_selector_t* sel = poll->sels[id];
+    if (!sel) return; /* slot already free */
+    epoll_ctl((int)poll->fd, EPOLL_CTL_DEL, (int)sel->fd, NULL);
+    if (sel->close_fn) sel->close_fn(poll, sel);
+    if (sel->rx.buf) ray_poll_buf_free(sel->rx.buf);
+    ray_poll_buf_free(sel->tx.buf); ray_sys_free(sel);
+    poll->sels[id] = NULL;
+}
+
+int64_t ray_poll_run(ray_poll_t* poll)
+{
+    if (!poll) return -1;
+    /* Dispatch epoll events until poll->code turns non-negative, then return it; -1 if epoll_wait fails. */
+    struct epoll_event events[RAY_POLL_MAX_EVENTS];
+
+    while (poll->code < 0) {
+        int n = epoll_wait((int)poll->fd, events, RAY_POLL_MAX_EVENTS, -1); /* timeout -1: block until an event arrives */
+        if (n < 0) {
+            if (errno == EINTR) continue;
+            return -1;
+        }
+
+        for (int i = 0; i < n; i++) {
+            uint64_t eid = events[i].data.u64; /* selector id stored at registration */
+            ray_selector_t* sel = NULL;
+
+            if (eid < poll->n_sels)
+                sel = poll->sels[eid];
+            if (!sel) continue; /* empty slot: nothing to dispatch to */
+            /* NOTE(review): ray_poll_register reuses free slots, so a stale event in this batch could reach a selector registered mid-batch — confirm. */
+            /* Process readable data first — even if hangup is also set.
+             * A client may send a message and close; epoll reports both
+             * EPOLLIN and EPOLLHUP in the same event. */
+            if (events[i].events & EPOLLIN) {
+                /* Loop: read data → call read_fn → if state advanced,
+                 * read more. Handles multi-phase protocols (handshake →
+                 * header → payload) arriving in a single epoll event. */
+                for (;;) {
+                    /* Fill rx buffer */
+                    if (sel->rx.recv_fn && sel->rx.buf) {
+                        while (sel->rx.buf->offset < sel->rx.buf->size) {
+                            int64_t nr = sel->rx.recv_fn(
+                                sel->fd,
+                                sel->rx.buf->data + sel->rx.buf->offset,
+                                sel->rx.buf->size - sel->rx.buf->offset);
+                            if (nr <= 0) {
+                                if (nr < 0 && errno == EINTR) continue; /* interrupted: retry the read */
+                                if (nr < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
+                                    break; /* no more data available right now */
+                                /* Error or peer closed mid-read */
+                                if (sel->error_fn)
+                                    sel->error_fn(poll, sel);
+                                else
+                                    ray_poll_deregister(poll, sel->id);
+                                goto next_event;
+                            }
+                            sel->rx.buf->offset += nr;
+                        }
+                    }
+
+                    /* Not enough data for current phase */
+                    if (sel->rx.buf && sel->rx.buf->offset < sel->rx.buf->size)
+                        break;
+
+                    /* Call read_fn — may advance state and request new buffer */
+                    if (!sel->rx.read_fn) break;
+                    ray_t* obj = sel->rx.read_fn(poll, sel);
+
+                    /* Re-validate: read_fn may have deregistered this selector */
+                    if (eid >= poll->n_sels || !poll->sels[eid]) goto next_event;
+                    sel = poll->sels[eid];
+
+                    if (obj && sel->data_fn)
+                        sel->data_fn(poll, sel, obj);
+
+                    /* If data_fn deregistered the selector, stop */
+                    if (eid >= poll->n_sels || !poll->sels[eid]) goto next_event;
+                    sel = poll->sels[eid];
+
+                    /* If no rx buffer (state machine done or not set), stop */
+                    if (!sel->rx.buf) break;
+                    /* If buffer already has enough data for next phase, loop */
+                    if (sel->rx.buf->offset >= sel->rx.buf->size) continue;
+                    /* Otherwise try reading more (may EAGAIN → break) */
+                }
+            }
+
+            /* Error / hangup — after data is drained */
+            if (events[i].events & (EPOLLERR | EPOLLHUP | EPOLLRDHUP)) {
+                /* Re-check: selector may have been freed by data_fn */
+                if (eid < poll->n_sels && poll->sels[eid]) {
+                    sel = poll->sels[eid];
+                    if (sel->error_fn)
+                        sel->error_fn(poll, sel);
+                    else
+                        ray_poll_deregister(poll, sel->id);
+                }
+            }
+
+        next_event:; /* target for paths where the selector is gone or already handled */
+        }
+    }
+
+    return poll->code;
+}
+
+#endif /* __linux__ */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/iocp.c b/crates/rayforce-sys/vendor/rayforce/src/core/iocp.c
new file mode 100644
index 0000000..8bbfac8
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/iocp.c
@@ -0,0 +1,60 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#if defined(RAY_OS_WINDOWS)
+
+#include "core/poll.h"
+#include <stdio.h>
+
+/* Windows IOCP implementation — stub for now.
+ * Full IOCP support is deferred to a future release. */
+
+ray_poll_t* ray_poll_create(void)
+{
+    fprintf(stderr, "ray_poll_create: IOCP not yet implemented\n");
+    return NULL; /* stub: always fails, after logging to stderr */
+}
+
+void ray_poll_destroy(ray_poll_t* poll)
+{
+    (void)poll; /* stub: nothing to release */
+}
+
+int64_t ray_poll_register(ray_poll_t* poll, ray_poll_reg_t* reg)
+{
+    (void)poll; (void)reg;
+    return -1; /* stub: registration always fails */
+}
+
+void ray_poll_deregister(ray_poll_t* poll, int64_t id)
+{
+    (void)poll; (void)id; /* stub: no-op */
+}
+
+int64_t ray_poll_run(ray_poll_t* poll)
+{
+    (void)poll;
+    return -1; /* stub: returns immediately with failure */
+}
+
+#endif /* RAY_OS_WINDOWS */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/ipc.c b/crates/rayforce-sys/vendor/rayforce/src/core/ipc.c
new file mode 100644
index 0000000..4fa7419
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/ipc.c
@@ -0,0 +1,1117 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_OS_WINDOWS
+  #define _GNU_SOURCE
+#endif
+
+#include "core/ipc.h"
+#include "mem/sys.h"
+#include "store/journal.h"
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+
+#ifdef RAY_OS_WINDOWS
+  #define WIN32_LEAN_AND_MEAN
+  #include <winsock2.h>
+  #include <ws2tcpip.h>
+#else
+  #include <unistd.h>
+  #include <sys/socket.h>
+#endif
+
+#if defined(__linux__)
+  #include <sys/epoll.h>
+  #define RAY_IPC_MAX_EVENTS 64
+#elif defined(__APPLE__)
+  #include <sys/event.h>
+  #define RAY_IPC_MAX_EVENTS 64
+#endif
+
+#include "lang/eval.h"
+
+/* ===== Compression (delta + RLE) ===== */
+
+size_t ray_ipc_compress(const uint8_t* src, size_t len,
+                        uint8_t* dst, size_t dst_cap)
+{   /* Returns the compressed size written to dst, or 0 when no compressed output is produced. */
+    if (len <= RAY_IPC_COMPRESS_THRESHOLD) return 0; /* at or below the threshold: do not compress */
+
+    /* Step 1: delta-encode into temporary buffer */
+    uint8_t* delta = (uint8_t*)ray_sys_alloc(len);
+    if (!delta) return 0;
+
+    delta[0] = src[0];
+    for (size_t i = 1; i < len; i++)
+        delta[i] = (uint8_t)(src[i] - src[i - 1]); /* byte-wise difference, modulo 256 */
+
+    /* Step 2: RLE-compress the delta stream */
+    size_t di = 0; /* write position in dst */
+    size_t si = 0; /* read position in delta */
+
+    while (si < len) {
+        if (si + 1 < len && delta[si] == delta[si + 1]) {
+            uint8_t val = delta[si];
+            size_t run = 1;
+            while (si + run < len && delta[si + run] == val && run < 127)
+                run++;
+            if (di + 2 > dst_cap) { ray_sys_free(delta); return 0; } /* dst too small */
+            dst[di++] = (uint8_t)run; /* control byte 2..127: run length */
+            dst[di++] = val;
+            si += run;
+        } else {
+            size_t start = si;
+            size_t llen = 0;
+            while (si < len && llen < 128) { /* gather literals until a repeat starts, at most 128 */
+                if (si + 1 < len && delta[si] == delta[si + 1])
+                    break;
+                si++;
+                llen++;
+            }
+            if (di + 1 + llen > dst_cap) { ray_sys_free(delta); return 0; } /* dst too small */
+            dst[di++] = (uint8_t)(-(int8_t)llen); /* control byte: negated literal count */
+            memcpy(dst + di, delta + start, llen);
+            di += llen;
+        }
+    }
+
+    ray_sys_free(delta);
+    if (di >= len) return 0; /* output is not smaller than the input */
+    return di;
+}
+
+size_t ray_ipc_decompress(const uint8_t* src, size_t clen,
+                          uint8_t* dst, size_t dst_len)
+{   /* Inverse of ray_ipc_compress.  Returns bytes written to dst, or 0 on failure. */
+    uint8_t* decoded = (uint8_t*)ray_sys_alloc(dst_len); /* holds the delta stream after RLE expansion */
+    if (!decoded) return 0;
+
+    size_t si = 0; /* read position in src */
+    size_t di = 0; /* write position in decoded */
+
+    while (si < clen && di < dst_len) {
+        int8_t count = (int8_t)src[si++]; /* control byte: > 0 run, <= 0 literals */
+        if (count > 0) {
+            if (si >= clen) { ray_sys_free(decoded); return 0; } /* run value byte missing */
+            uint8_t val = src[si++];
+            size_t n = (size_t)count;
+            if (di + n > dst_len) { ray_sys_free(decoded); return 0; } /* would overflow the output */
+            memset(decoded + di, val, n);
+            di += n;
+        } else {
+            size_t n = (size_t)(-(int)count); /* literal count, 0..128 */
+            if (si + n > clen || di + n > dst_len) {
+                ray_sys_free(decoded);
+                return 0;
+            }
+            memcpy(decoded + di, src + si, n);
+            si += n;
+            di += n;
+        }
+    }
+
+    /* Un-delta */
+    if (di == 0) { ray_sys_free(decoded); return 0; }
+    dst[0] = decoded[0];
+    for (size_t i = 1; i < di; i++)
+        dst[i] = (uint8_t)(decoded[i] + dst[i - 1]); /* running sum, modulo 256 */
+
+    ray_sys_free(decoded);
+    return di; /* may be less than dst_len if src ran out; callers compare */
+}
+
+/* ===== Shared protocol helpers ===== */
+
+#define RAY_IPC_PHASE_HANDSHAKE 0
+#define RAY_IPC_PHASE_HEADER    1
+#define RAY_IPC_PHASE_PAYLOAD   2
+#define RAY_IPC_PHASE_CREDS     3
+
+/* Compare len bytes with no early exit, so timing does not leak the password. */
+static bool ct_eq(const void* a, const void* b, size_t len) {
+    const volatile uint8_t* lhs = a;
+    const volatile uint8_t* rhs = b;
+    volatile uint8_t acc = 0;   /* accumulates every differing bit */
+    for (size_t k = 0; k != len; ++k)
+        acc |= (uint8_t)(lhs[k] ^ rhs[k]);
+    return !acc;
+}
+
+/* Validate credential buffer against secret. Returns true if password matches.
+ * buf holds "user:password" (one trailing NUL is ignored), cred_len bytes long;
+ * when there is no ':' the whole buffer is taken as the password.
+ * secret MUST point to a char[256] buffer (zero-padded beyond the password);
+ * all 256 bytes are compared in constant time, with no strlen on the secret. */
+static bool validate_creds(const uint8_t* buf, uint8_t cred_len,
+                           const char* secret) {
+    if (cred_len == 0) return false; /* empty credentials never match */
+    const char* creds = (const char*)buf;
+    const char* colon = memchr(creds, ':', cred_len); /* first ':' separates user from password */
+    const char* pw = colon ? colon + 1 : creds;
+    size_t pw_len = colon ? (size_t)(cred_len - (pw - creds)) : cred_len;
+    if (pw_len > 0 && pw[pw_len - 1] == '\0') pw_len--; /* drop one trailing NUL */
+    if (pw_len > 255) pw_len = 255; /* keep at least one zero byte in pw_buf */
+
+    /* Zero-pad pw into a 256-byte buffer, then compare all 256 bytes
+     * against the secret buffer (also 256 bytes, zero-padded at init).
+     * Matching passwords produce identical 256-byte buffers. */
+    uint8_t pw_buf[256] = {0};
+    memcpy(pw_buf, pw, pw_len);
+    return ct_eq(pw_buf, secret, 256);
+}
+
+static void send_response(ray_sock_t fd, ray_t* result)
+{   /* Serialize result and send it on fd as one RESP frame: header, then body. */
+    int64_t ser_size = ray_serde_size(result);
+    if (ser_size <= 0) return;
+
+    uint8_t* payload = (uint8_t*)ray_sys_alloc((size_t)ser_size);
+    if (!payload) return;
+    ray_ser_raw(payload, result);
+
+    uint8_t* send_buf = NULL;
+    size_t   send_len = 0;
+    uint8_t  flags    = 0;
+    /* A compressed body stores the raw size in a 4-byte prefix: skip compression when it would not fit. */
+    if ((size_t)ser_size > RAY_IPC_COMPRESS_THRESHOLD && (uint64_t)ser_size <= 0xFFFFFFFFu) {
+        uint8_t* comp = (uint8_t*)ray_sys_alloc((size_t)ser_size);
+        if (comp) {
+            size_t clen = ray_ipc_compress(payload, (size_t)ser_size,
+                                           comp, (size_t)ser_size);
+            if (clen > 0 && clen + 4 < (size_t)ser_size) { /* only when prefix + data is still smaller */
+                send_len = clen + 4;
+                send_buf = (uint8_t*)ray_sys_alloc(send_len);
+                if (send_buf) {
+                    uint32_t uncomp = (uint32_t)ser_size; /* lossless: range checked above */
+                    memcpy(send_buf, &uncomp, 4);
+                    memcpy(send_buf + 4, comp, clen);
+                    flags = RAY_IPC_FLAG_COMPRESSED;
+                }
+            }
+            ray_sys_free(comp);
+        }
+    }
+
+    if (!send_buf) { /* not compressed: send the serialized bytes as-is */
+        send_buf = payload;
+        send_len = (size_t)ser_size;
+        payload  = NULL; /* ownership moved to send_buf */
+    }
+
+    ray_ipc_header_t hdr = {
+        .prefix  = RAY_SERDE_PREFIX,
+        .version = RAY_SERDE_WIRE_VERSION,
+        .flags   = flags,
+        .endian  = 0,
+        .msgtype = RAY_IPC_MSG_RESP,
+        .size    = (int64_t)send_len,
+    };
+    ray_sock_send(fd, &hdr, sizeof(hdr));
+    ray_sock_send(fd, send_buf, send_len);
+
+    ray_sys_free(send_buf);
+    if (payload) ray_sys_free(payload);
+}
+
+static ray_t* eval_payload(uint8_t* payload, size_t payload_len,
+                           ray_ipc_header_t* hdr)
+{   /* Journal the frame (SYNC only), decompress if flagged, deserialize, evaluate. */
+    /* Journal hook: log every inbound SYNC message (state-mutation
+     * channel in q's model) before evaluation, so a crash mid-handler
+     * still leaves the message on disk for replay.  We write the raw
+     * inbound bytes — header + payload — verbatim, no decompression
+     * round-trip.  Async messages and responses are not logged, so
+     * background pings and result frames don't pollute the log.
+     * No-op when no journal is open or during in-progress replay.
+     *
+     * RAY_IPC_FLAG_RESTRICTED is captured into a LOCAL header copy:
+     * we mark the persisted frame with the connection's restricted
+     * state at write time so replay can re-impose it.  Without this
+     * a `-U` client's writes silently elevate to full privilege on
+     * crash-recovery, since replay runs on the main thread with no
+     * IPC connection context.  The bit is meaningless on the live
+     * IPC wire and doesn't affect this handler's eval — that uses
+     * the connection's own flag set by the caller above us.
+     *
+     * If the journal write fails (disk full, EIO), we ABORT the
+     * eval and return an error to the client.  q's documented
+     * behaviour: "the message has not been logged so we cannot
+     * accept it".  Silently evaluating un-logged mutations defeats
+     * the entire durability premise of `-l`/`-L`. */
+    if (ray_journal_is_open() && hdr->msgtype == RAY_IPC_MSG_SYNC) {
+        ray_ipc_header_t log_hdr = *hdr;
+        if (ray_eval_get_restricted())
+            log_hdr.flags |= RAY_IPC_FLAG_RESTRICTED;
+        ray_err_t je = ray_journal_write_bytes(&log_hdr, payload, (int64_t)payload_len);
+        if (je != RAY_OK) {
+            fprintf(stderr, "log: ERROR  journal write failed (rc=%d) — refusing to evaluate\n", (int)je);
+            return ray_error("io", "journal write failed; mutation refused");
+        }
+    }
+
+    uint8_t* decompressed = NULL;
+    if (hdr->flags & RAY_IPC_FLAG_COMPRESSED) {
+        if (payload_len < 4) return NULL; /* too short to hold the 4-byte size prefix */
+        uint32_t uncomp_size;
+        memcpy(&uncomp_size, payload, 4);
+        if (uncomp_size == 0 || uncomp_size > 256u * 1024u * 1024u) return NULL; /* reject empty or > 256 MiB claims */
+        decompressed = (uint8_t*)ray_sys_alloc(uncomp_size);
+        if (!decompressed) return NULL;
+        size_t dlen = ray_ipc_decompress(payload + 4, payload_len - 4,
+                                         decompressed, uncomp_size);
+        if (dlen != uncomp_size) { /* must expand to exactly the declared size */
+            ray_sys_free(decompressed);
+            return NULL;
+        }
+        payload     = decompressed;
+        payload_len = uncomp_size;
+    }
+
+    int64_t de_len = (int64_t)payload_len;
+    ray_t*  msg    = ray_de_raw(payload, &de_len);
+    if (decompressed) ray_sys_free(decompressed);
+
+    ray_t* result = NULL;
+    if (msg && !RAY_IS_ERR(msg)) {
+        if (msg->type == -RAY_STR) { /* string message: evaluate it as source text */
+            const char* str  = ray_str_ptr(msg);
+            size_t      slen = ray_str_len(msg);
+            if (str && slen > 0) {
+                char* tmp = (char*)ray_sys_alloc(slen + 1); /* NUL-terminated copy for ray_eval_str */
+                if (tmp) {
+                    memcpy(tmp, str, slen);
+                    tmp[slen] = '\0';
+                    result = ray_eval_str(tmp);
+                    ray_sys_free(tmp);
+                }
+            }
+            ray_release(msg);
+        } else {
+            result = ray_eval(msg);
+            ray_release(msg);
+        }
+    }
+    return result ? result : RAY_NULL_OBJ; /* this path never returns NULL */
+}
+
+/* ======================================================================
+ * Poll-based IPC (new API)
+ * ====================================================================== */
+
+/* Per-connection state stored in selector->data */
+typedef struct {
+    ray_ipc_header_t hdr;          /* last header copied in by ipc_read_header */
+    uint8_t          phase;        /* one of RAY_IPC_PHASE_* */
+    int64_t          listener_id;  /* id of the listener selector */
+    bool             auth_required;  /* server has -u/-U */
+    bool             restricted;     /* server has -U */
+} ray_ipc_conn_data_t;
+
+static ray_t* ipc_read_handshake(ray_poll_t* poll, ray_selector_t* sel);
+static ray_t* ipc_read_creds(ray_poll_t* poll, ray_selector_t* sel);
+static ray_t* ipc_read_header(ray_poll_t* poll, ray_selector_t* sel);
+static ray_t* ipc_read_payload(ray_poll_t* poll, ray_selector_t* sel);
+static ray_t* ipc_on_data(ray_poll_t* poll, ray_selector_t* sel, void* data);
+static void   ipc_on_close(ray_poll_t* poll, ray_selector_t* sel);
+
+/* Wrappers matching ray_io_fn signature for socket recv/send */
+static int64_t ipc_recv_fn(int64_t fd, uint8_t* buf, int64_t len) {
+    return ray_sock_recv((ray_sock_t)fd, buf, (size_t)len); /* result passed through unchanged */
+}
+static int64_t ipc_send_fn(int64_t fd, uint8_t* buf, int64_t len) {
+    return ray_sock_send((ray_sock_t)fd, buf, (size_t)len); /* result passed through unchanged */
+}
+
+/* Listener read callback: accept one pending connection and register it. */
+static ray_t* ipc_accept(ray_poll_t* poll, ray_selector_t* sel)
+{
+    ray_sock_t client = ray_sock_accept((ray_sock_t)sel->fd);
+    if (client == RAY_INVALID_SOCK) return NULL;
+    ray_sock_set_nonblocking(client);
+
+    ray_ipc_conn_data_t* conn = (ray_ipc_conn_data_t*)ray_sys_alloc(sizeof(*conn));
+    if (!conn) { ray_sock_close(client); return NULL; }
+    memset(conn, 0, sizeof(*conn));
+    conn->phase         = RAY_IPC_PHASE_HANDSHAKE;
+    conn->listener_id   = sel->id;
+    conn->auth_required = (poll->auth_secret[0] != '\0');
+    conn->restricted    = poll->restricted;
+
+    ray_poll_reg_t reg = {
+        .fd       = (int64_t)client,
+        .type     = RAY_SEL_SOCKET,
+        .recv_fn  = ipc_recv_fn,
+        .send_fn  = ipc_send_fn,
+        .read_fn  = ipc_read_handshake,
+        .data_fn  = ipc_on_data,
+        .close_fn = ipc_on_close,
+        .data     = conn,
+    };
+
+    int64_t conn_id = ray_poll_register(poll, &reg);
+    if (conn_id < 0) {
+        /* Registration failed: the poller owns nothing, so clean up here. */
+        ray_sock_close(client);
+        ray_sys_free(conn);
+        return NULL;
+    }
+
+    /* The handshake is 2 bytes; ask the poller to buffer exactly that. */
+    ray_selector_t* registered = ray_poll_get(poll, conn_id);
+    if (registered) ray_poll_rx_request(poll, registered, 2);
+    return NULL;
+}
+
+static ray_t* ipc_read_handshake(ray_poll_t* poll, ray_selector_t* sel)
+{   /* Handshake: the peer sends 2 bytes (byte 0 = wire version); we answer with 2. */
+    if (!sel->rx.buf || sel->rx.buf->offset < 2) return NULL;
+    ray_ipc_conn_data_t* cd = (ray_ipc_conn_data_t*)sel->data;
+
+    /* Refuse peers speaking a different wire version BEFORE we commit to
+     * exchanging any serialized payloads.  Without this check a new
+     * server would happily send v3-layout values to a v2 client, which
+     * would misparse every atom after the version-bump byte. */
+    if (sel->rx.buf->data[0] != RAY_SERDE_WIRE_VERSION) {
+        ray_poll_deregister(poll, sel->id);
+        return NULL;
+    }
+
+    /* Send handshake response: version + auth_required flag */
+    uint8_t resp[2] = { RAY_SERDE_WIRE_VERSION, cd->auth_required ? 0x01 : 0x00 };
+    ray_sock_send((ray_sock_t)sel->fd, resp, 2);
+
+    if (cd->auth_required) {
+        cd->phase = RAY_IPC_PHASE_CREDS;    /* credentials are read next */
+        sel->rx.read_fn = ipc_read_creds;
+        ray_poll_rx_request(poll, sel, 1);  /* length byte first */
+        return NULL;
+    }
+
+    cd->phase = RAY_IPC_PHASE_HEADER;       /* no auth: go straight to message headers */
+    sel->rx.read_fn = ipc_read_header;
+    ray_poll_rx_request(poll, sel, sizeof(ray_ipc_header_t));
+    return NULL;
+}
+
+static ray_t* ipc_read_creds(ray_poll_t* poll, ray_selector_t* sel)
+{   /* Credential stage: one length byte followed by that many credential bytes. */
+    if (!sel->rx.buf || sel->rx.buf->offset < 1) return NULL;   /* length byte not in yet */
+    uint8_t cred_len = sel->rx.buf->data[0];
+
+    if (sel->rx.buf->offset < 1 + cred_len) {
+        ray_poll_rx_request(poll, sel, 1 + cred_len);   /* now ask for length byte + body */
+        return NULL;
+    }
+
+    ray_ipc_conn_data_t* cd = (ray_ipc_conn_data_t*)sel->data;
+
+    bool ok = validate_creds(sel->rx.buf->data + 1, cred_len,
+                             poll->auth_secret);
+    uint8_t result = ok ? 0x00 : 0x01;   /* reply byte: 0x00 accepted, 0x01 rejected */
+    ray_sock_send((ray_sock_t)sel->fd, &result, 1);
+
+    if (!ok) {
+        ray_poll_deregister(poll, sel->id);   /* rejected peers are disconnected */
+        return NULL;
+    }
+
+    cd->phase = RAY_IPC_PHASE_HEADER;   /* authenticated: start reading message frames */
+    sel->rx.read_fn = ipc_read_header;
+    ray_poll_rx_request(poll, sel, sizeof(ray_ipc_header_t));
+    return NULL;
+}
+
+static ray_t* ipc_read_header(ray_poll_t* poll, ray_selector_t* sel)
+{   /* Header stage: validate the fixed-size frame header, then request its payload. */
+    if (!sel->rx.buf ||
+        sel->rx.buf->offset < (int64_t)sizeof(ray_ipc_header_t))
+        return NULL;
+
+    ray_ipc_conn_data_t* cd = (ray_ipc_conn_data_t*)sel->data;
+    memcpy(&cd->hdr, sel->rx.buf->data, sizeof(ray_ipc_header_t));
+
+    if (cd->hdr.prefix != RAY_SERDE_PREFIX ||
+        cd->hdr.version != RAY_SERDE_WIRE_VERSION ||
+        cd->hdr.size <= 0 ||                      /* empty frames are refused */
+        cd->hdr.size > 256 * 1024 * 1024) {       /* 256 MiB payload cap */
+        ray_poll_deregister(poll, sel->id);       /* malformed header: drop the peer */
+        return NULL;
+    }
+
+    cd->phase = RAY_IPC_PHASE_PAYLOAD;
+    sel->rx.read_fn = ipc_read_payload;
+    ray_poll_rx_request(poll, sel, cd->hdr.size);
+
+    return NULL;
+}
+
+static ray_t* ipc_read_payload(ray_poll_t* poll, ray_selector_t* sel)
+{   /* Payload stage: evaluate one complete message and reply when it is a sync request. */
+    ray_ipc_conn_data_t* cd = (ray_ipc_conn_data_t*)sel->data;
+
+    if (!sel->rx.buf || sel->rx.buf->offset < cd->hdr.size)
+        return NULL;
+
+    bool prev_restricted = ray_eval_get_restricted();   /* restored right after the eval */
+    ray_eval_set_restricted(cd->restricted);
+
+    /* Eval and produce result */
+    ray_t* result = eval_payload(sel->rx.buf->data,
+                                 (size_t)sel->rx.buf->offset, &cd->hdr);
+
+    ray_eval_set_restricted(prev_restricted);
+
+    /* Send response for sync messages */
+    if (cd->hdr.msgtype == RAY_IPC_MSG_SYNC)
+        send_response((ray_sock_t)sel->fd, result);
+    if (result != RAY_NULL_OBJ) ray_release(result);   /* RAY_NULL_OBJ is never released */
+
+    /* Reset for next message */
+    cd->phase = RAY_IPC_PHASE_HEADER;
+    sel->rx.read_fn = ipc_read_header;
+    ray_poll_rx_request(poll, sel, sizeof(ray_ipc_header_t));
+
+    return NULL;
+}
+
+static ray_t* ipc_on_data(ray_poll_t* poll, ray_selector_t* sel, void* data)
+{   /* data_fn installed by ipc_accept: a no-op that ignores every argument and returns NULL. */
+    (void)poll; (void)sel; (void)data;
+    return NULL;
+}
+
+static void ipc_on_close(ray_poll_t* poll, ray_selector_t* sel)
+{   /* close_fn for both listener and connection selectors: free attached data, close the fd. */
+    (void)poll;
+    if (sel->data) {                  /* ray_ipc_listen registers the listener without data */
+        ray_sys_free(sel->data);
+        sel->data = NULL;
+    }
+    ray_sock_close((ray_sock_t)sel->fd);
+}
+
+int64_t ray_ipc_listen(ray_poll_t* poll, uint16_t port)
+{
+    /* Open a non-blocking listening socket on `port` and register it with `poll`,
+     * using ipc_accept as its read callback.  Returns the selector id, or -1. */
+    if (!poll) return -1;
+    ray_sock_t lfd = ray_sock_listen(port);
+    if (lfd == RAY_INVALID_SOCK) return -1;
+    ray_sock_set_nonblocking(lfd);
+
+    ray_poll_reg_t lreg = {
+        .fd       = (int64_t)lfd,
+        .type     = RAY_SEL_SOCKET,
+        .read_fn  = ipc_accept,
+        .close_fn = ipc_on_close,
+    };
+
+    int64_t sel_id = ray_poll_register(poll, &lreg);
+    if (sel_id >= 0) return sel_id;
+    ray_sock_close(lfd);   /* registration failed: do not leak the socket */
+    return -1;
+}
+
+/* ======================================================================
+ * Server API
+ * ====================================================================== */
+
+static void conn_close(ray_ipc_server_t* srv, ray_ipc_conn_t* c)
+{   /* Drop one connection: unwatch its fd, close it, free its buffer, compact srv->conns. */
+#if defined(__linux__)
+    epoll_ctl(srv->poll_fd, EPOLL_CTL_DEL, c->fd, NULL);
+#elif defined(__APPLE__)
+    struct kevent kev;
+    EV_SET(&kev, c->fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
+    kevent(srv->poll_fd, &kev, 1, NULL, 0, NULL);
+#else
+    (void)srv;   /* select() path: ray_ipc_poll rebuilds its fd_set on every call */
+#endif
+
+    ray_sock_close(c->fd);
+    if (c->rx_buf) ray_sys_free(c->rx_buf);
+    c->fd      = RAY_INVALID_SOCK;
+    c->rx_buf  = NULL;
+    c->rx_len  = 0;
+    c->rx_need = 0;
+
+    uint32_t idx = (uint32_t)(c - srv->conns);   /* slot index of c within srv->conns */
+    if (idx + 1 < srv->n_conns)                  /* not the last slot: fill the hole... */
+        srv->conns[idx] = srv->conns[srv->n_conns - 1];   /* ...so *c is now that connection */
+    if (srv->n_conns > 0) srv->n_conns--;
+}
+
+static void conn_on_handshake(ray_ipc_server_t* srv, ray_ipc_conn_t* c)
+{
+    /* Refuse peers speaking a different wire version up front — see the
+     * matching check in ipc_read_handshake. */
+    if (!c->rx_buf || c->rx_buf[0] != RAY_SERDE_WIRE_VERSION) {
+        conn_close(srv, c);
+        return;
+    }
+
+    bool auth_req = (srv->auth_secret[0] != '\0');   /* empty secret disables auth */
+    uint8_t resp[2] = { RAY_SERDE_WIRE_VERSION, auth_req ? 0x01 : 0x00 };
+    ray_sock_send(c->fd, resp, 2);   /* reply: our wire version + auth-required flag */
+
+    ray_sys_free(c->rx_buf);         /* conn_on_readable reallocates when rx_buf is NULL */
+    c->rx_buf  = NULL;
+    c->rx_len  = 0;
+
+    if (auth_req) {
+        c->rx_need = 1; /* length byte */
+        c->phase   = RAY_IPC_PHASE_CREDS;
+        return;
+    }
+
+    c->rx_need = sizeof(ray_ipc_header_t);
+    c->phase   = RAY_IPC_PHASE_HEADER;
+}
+
+static void conn_on_header(ray_ipc_server_t* srv, ray_ipc_conn_t* c)
+{   /* Header stage: validate the frame header and allocate a buffer for its payload. */
+    memcpy(&c->hdr, c->rx_buf, sizeof(ray_ipc_header_t));
+
+    if (c->hdr.prefix != RAY_SERDE_PREFIX) { conn_close(srv, c); return; }
+    if (c->hdr.version != RAY_SERDE_WIRE_VERSION) { conn_close(srv, c); return; }
+    if (c->hdr.size <= 0)                  { conn_close(srv, c); return; }
+    if (c->hdr.size > 256 * 1024 * 1024)   { conn_close(srv, c); return; }   /* 256 MiB cap */
+
+    ray_sys_free(c->rx_buf);
+    c->rx_buf = (uint8_t*)ray_sys_alloc((size_t)c->hdr.size);
+    if (!c->rx_buf) { conn_close(srv, c); return; }   /* allocation failed: drop the peer */
+    c->rx_len  = 0;
+    c->rx_need = (size_t)c->hdr.size;
+    c->phase   = RAY_IPC_PHASE_PAYLOAD;
+}
+
+static void conn_on_payload(ray_ipc_server_t* srv, ray_ipc_conn_t* c)
+{   /* Payload stage: evaluate the message, reply if it is sync, then await the next header. */
+    bool prev = ray_eval_get_restricted();     /* restored right after the eval */
+    ray_eval_set_restricted(srv->restricted);
+
+    ray_t* result = eval_payload(c->rx_buf, c->rx_len, &c->hdr);
+
+    ray_eval_set_restricted(prev);
+
+    if (c->hdr.msgtype == RAY_IPC_MSG_SYNC)
+        send_response(c->fd, result);
+    if (result != RAY_NULL_OBJ) ray_release(result);   /* RAY_NULL_OBJ is never released */
+
+    ray_sys_free(c->rx_buf);
+    c->rx_buf  = NULL;
+    c->rx_len  = 0;
+    c->rx_need = sizeof(ray_ipc_header_t);
+    c->phase   = RAY_IPC_PHASE_HEADER;
+}
+
+static void conn_on_creds(ray_ipc_server_t* srv, ray_ipc_conn_t* c)
+{   /* Credential stage, entered twice: after the length byte, then after the full body. */
+    if (c->rx_len == 1 && c->rx_buf[0] != 0) {
+        /* Got length byte — reallocate buffer for full credential */
+        uint8_t cred_len = c->rx_buf[0];
+        size_t need = 1 + (size_t)cred_len;
+        uint8_t* newbuf = (uint8_t*)ray_sys_alloc(need);
+        if (!newbuf) { conn_close(srv, c); return; }
+        newbuf[0] = cred_len;
+        ray_sys_free(c->rx_buf);
+        c->rx_buf  = newbuf;
+        c->rx_need = need;
+        return;
+    }
+
+    /* A zero length byte has no body to wait for: it falls through to here and is
+     * rejected, so the peer receives its 0x01 reply instead of a stalled connection. */
+    uint8_t cred_len = c->rx_buf[0];
+    bool ok = cred_len > 0 && validate_creds(c->rx_buf + 1, cred_len, srv->auth_secret);
+    uint8_t result = ok ? 0x00 : 0x01;   /* reply byte: 0x00 accepted, 0x01 rejected */
+    ray_sock_send(c->fd, &result, 1);
+    if (!ok) {
+        conn_close(srv, c);
+        return;
+    }
+
+    ray_sys_free(c->rx_buf);
+    c->rx_buf  = NULL;
+    c->rx_len  = 0;
+    c->rx_need = sizeof(ray_ipc_header_t);
+    c->phase   = RAY_IPC_PHASE_HEADER;
+}
+
+static void conn_on_readable(ray_ipc_server_t* srv, ray_ipc_conn_t* c)
+{   /* Read toward rx_need bytes; once the stage is complete, dispatch on c->phase. */
+    if (!c->rx_buf) {
+        c->rx_buf = (uint8_t*)ray_sys_alloc(c->rx_need);   /* sized for the current stage */
+        if (!c->rx_buf) { conn_close(srv, c); return; }
+    }
+
+    int64_t n = ray_sock_recv(c->fd, c->rx_buf + c->rx_len,
+                              c->rx_need - c->rx_len);
+    if (n <= 0) { conn_close(srv, c); return; }   /* any non-positive result ends the connection */
+    c->rx_len += (size_t)n;
+
+    if (c->rx_len < c->rx_need) return;   /* partial read: wait for the next readable event */
+
+    switch (c->phase) {
+    case RAY_IPC_PHASE_HANDSHAKE: conn_on_handshake(srv, c); break;
+    case RAY_IPC_PHASE_CREDS:     conn_on_creds(srv, c);     break;
+    case RAY_IPC_PHASE_HEADER:    conn_on_header(srv, c);    break;
+    case RAY_IPC_PHASE_PAYLOAD:   conn_on_payload(srv, c);   break;
+    }
+}
+
+ray_err_t ray_ipc_server_init(ray_ipc_server_t* srv, uint16_t port)
+{   /* Reset *srv, open a non-blocking listener on `port`, and create the platform poll fd. */
+    memset(srv, 0, sizeof(*srv));   /* also wipes auth_secret and restricted */
+    srv->listen_fd = ray_sock_listen(port);
+    if (srv->listen_fd == RAY_INVALID_SOCK) return RAY_ERR_IO;
+    ray_sock_set_nonblocking(srv->listen_fd);
+
+#if defined(__linux__)
+    srv->poll_fd = epoll_create1(0);
+    if (srv->poll_fd < 0) {
+        ray_sock_close(srv->listen_fd);
+        return RAY_ERR_IO;
+    }
+    struct epoll_event ev = { .events = EPOLLIN, .data.fd = srv->listen_fd };
+    epoll_ctl(srv->poll_fd, EPOLL_CTL_ADD, srv->listen_fd, &ev);   /* NOTE(review): result unchecked */
+#elif defined(__APPLE__)
+    srv->poll_fd = kqueue();
+    if (srv->poll_fd < 0) {
+        ray_sock_close(srv->listen_fd);
+        return RAY_ERR_IO;
+    }
+    struct kevent kev;
+    EV_SET(&kev, srv->listen_fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
+    kevent(srv->poll_fd, &kev, 1, NULL, 0, NULL);                  /* NOTE(review): result unchecked */
+#else
+    srv->poll_fd = -1;   /* no kernel poll object: ray_ipc_poll uses select() here */
+#endif
+
+    srv->running = true;
+    return RAY_OK;
+}
+
+void ray_ipc_server_destroy(ray_ipc_server_t* srv)
+{   /* Close every tracked connection, the listener, and the poll fd; clears srv->running. */
+    for (uint32_t i = 0; i < srv->n_conns; i++) {
+        ray_ipc_conn_t* c = &srv->conns[i];
+        if (c->fd != RAY_INVALID_SOCK) {
+            if (c->rx_buf) ray_sys_free(c->rx_buf);   /* partially received data is discarded */
+            ray_sock_close(c->fd);
+        }
+    }
+    srv->n_conns = 0;
+
+    ray_sock_close(srv->listen_fd);
+    srv->listen_fd = RAY_INVALID_SOCK;
+
+    if (srv->poll_fd >= 0) {   /* init leaves it at -1 on the select() fallback */
+#ifndef RAY_OS_WINDOWS
+        close(srv->poll_fd);
+#endif
+    }
+    srv->poll_fd = -1;
+    srv->running = false;
+}
+
+int ray_ipc_poll(ray_ipc_server_t* srv, int timeout_ms)
+{   /* One poll round. Returns -1 on a wait error, else `ready` (never incremented on the select path). */
+    int ready = 0;   /* events whose fd is neither the listener nor a tracked connection */
+
+#if defined(__linux__)
+    struct epoll_event events[RAY_IPC_MAX_EVENTS];
+    int nfds = epoll_wait(srv->poll_fd, events, RAY_IPC_MAX_EVENTS, timeout_ms);
+    if (nfds < 0) return (errno == EINTR) ? 0 : -1;   /* an interrupted wait reports 0 */
+
+    for (int i = 0; i < nfds; i++) {
+        int fd = events[i].data.fd;
+
+        if (fd == srv->listen_fd) {
+            ray_sock_t new_fd = ray_sock_accept(srv->listen_fd);
+            if (new_fd == RAY_INVALID_SOCK) continue;
+            ray_sock_set_nonblocking(new_fd);
+            if (srv->n_conns >= RAY_IPC_MAX_CONNS) {   /* table full: refuse the peer */
+                ray_sock_close(new_fd);
+                continue;
+            }
+            ray_ipc_conn_t* c = &srv->conns[srv->n_conns++];
+            c->fd      = new_fd;
+            c->rx_buf  = NULL;
+            c->rx_len  = 0;
+            c->rx_need = 2;   /* the 2-byte handshake comes first */
+            c->phase   = RAY_IPC_PHASE_HANDSHAKE;
+            struct epoll_event cev = { .events = EPOLLIN, .data.fd = new_fd };
+            epoll_ctl(srv->poll_fd, EPOLL_CTL_ADD, new_fd, &cev);   /* NOTE(review): result unchecked */
+        } else {
+            bool found = false;
+            for (uint32_t j = 0; j < srv->n_conns; j++) {   /* linear fd -> connection lookup */
+                if (srv->conns[j].fd == fd) {
+                    conn_on_readable(srv, &srv->conns[j]);   /* may close and compact conns */
+                    found = true;
+                    break;
+                }
+            }
+            if (!found) ready++;
+        }
+    }
+
+#elif defined(__APPLE__)
+    struct kevent events[RAY_IPC_MAX_EVENTS];
+    struct timespec ts;
+    struct timespec* tsp = NULL;   /* stays NULL for a negative timeout_ms */
+    if (timeout_ms >= 0) {
+        ts.tv_sec  = timeout_ms / 1000;
+        ts.tv_nsec = (timeout_ms % 1000) * 1000000L;
+        tsp = &ts;
+    }
+    int nfds = kevent(srv->poll_fd, NULL, 0, events, RAY_IPC_MAX_EVENTS, tsp);
+    if (nfds < 0) return (errno == EINTR) ? 0 : -1;   /* an interrupted wait reports 0 */
+
+    for (int i = 0; i < nfds; i++) {
+        int fd = (int)events[i].ident;
+
+        if (fd == srv->listen_fd) {
+            ray_sock_t new_fd = ray_sock_accept(srv->listen_fd);
+            if (new_fd == RAY_INVALID_SOCK) continue;
+            ray_sock_set_nonblocking(new_fd);
+            if (srv->n_conns >= RAY_IPC_MAX_CONNS) {   /* table full: refuse the peer */
+                ray_sock_close(new_fd);
+                continue;
+            }
+            ray_ipc_conn_t* c = &srv->conns[srv->n_conns++];
+            c->fd      = new_fd;
+            c->rx_buf  = NULL;
+            c->rx_len  = 0;
+            c->rx_need = 2;   /* the 2-byte handshake comes first */
+            c->phase   = RAY_IPC_PHASE_HANDSHAKE;
+            struct kevent kev;
+            EV_SET(&kev, new_fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
+            kevent(srv->poll_fd, &kev, 1, NULL, 0, NULL);   /* NOTE(review): result unchecked */
+        } else {
+            bool found = false;
+            for (uint32_t j = 0; j < srv->n_conns; j++) {   /* linear fd -> connection lookup */
+                if (srv->conns[j].fd == fd) {
+                    conn_on_readable(srv, &srv->conns[j]);   /* may close and compact conns */
+                    found = true;
+                    break;
+                }
+            }
+            if (!found) ready++;
+        }
+    }
+
+#else  /* Windows: select-based fallback */
+    fd_set rfds;
+    FD_ZERO(&rfds);
+    FD_SET(srv->listen_fd, &rfds);
+    ray_sock_t maxfd = srv->listen_fd;
+    for (uint32_t i = 0; i < srv->n_conns; i++) {   /* the watch set is rebuilt on every call */
+        FD_SET(srv->conns[i].fd, &rfds);
+        if (srv->conns[i].fd > maxfd) maxfd = srv->conns[i].fd;
+    }
+
+    struct timeval tv;
+    struct timeval* tvp = NULL;   /* stays NULL for a negative timeout_ms */
+    if (timeout_ms >= 0) {
+        tv.tv_sec  = timeout_ms / 1000;
+        tv.tv_usec = (timeout_ms % 1000) * 1000;
+        tvp = &tv;
+    }
+
+    int nfds = select((int)(maxfd + 1), &rfds, NULL, NULL, tvp);
+    if (nfds < 0) return (errno == EINTR) ? 0 : -1;
+
+    if (FD_ISSET(srv->listen_fd, &rfds)) {
+        ray_sock_t new_fd = ray_sock_accept(srv->listen_fd);
+        if (new_fd != RAY_INVALID_SOCK) {
+            ray_sock_set_nonblocking(new_fd);
+            if (srv->n_conns >= RAY_IPC_MAX_CONNS) {   /* table full: refuse the peer */
+                ray_sock_close(new_fd);
+            } else {
+                ray_ipc_conn_t* c = &srv->conns[srv->n_conns++];
+                c->fd      = new_fd;
+                c->rx_buf  = NULL;
+                c->rx_len  = 0;
+                c->rx_need = 2;   /* the 2-byte handshake comes first */
+                c->phase   = RAY_IPC_PHASE_HANDSHAKE;
+            }
+        }
+    }
+
+    for (uint32_t i = srv->n_conns; i > 0; ) {   /* backwards: conn_close refills slots from the tail */
+        --i;
+        if (srv->conns[i].fd != RAY_INVALID_SOCK && FD_ISSET(srv->conns[i].fd, &rfds))
+            conn_on_readable(srv, &srv->conns[i]);
+    }
+#endif
+
+    return ready;
+}
+
+/* ===== Client API ===== */
+
+static ray_sock_t g_client_fds[RAY_IPC_MAX_CONNS];   /* handle -> socket; RAY_INVALID_SOCK = free */
+static int        g_client_count = 0;                /* one past the highest slot ever assigned */
+static bool       g_client_init = false;             /* true once client_init has run */
+
+static void client_init(void) {   /* one-time setup: mark every handle slot as free */
+    if (g_client_init) return;
+    for (int i = 0; i < RAY_IPC_MAX_CONNS; i++)
+        g_client_fds[i] = RAY_INVALID_SOCK;
+    g_client_init = true;
+}
+
+static int64_t recv_full(ray_sock_t fd, void* buf, size_t len) {
+    /* Keep reading until `len` bytes are in; any recv result <= 0 yields -1. */
+    for (size_t done = 0; done < len; ) {
+        int64_t got = ray_sock_recv(fd, (uint8_t*)buf + done, len - done);
+        if (got <= 0) return -1;
+        done += (size_t)got;
+    }
+    return (int64_t)len;
+}
+
+static int64_t client_send_msg(int64_t handle, ray_t* msg, uint8_t msgtype)
+{   /* Serialize msg and send header + payload. Returns 0, -2 for a bad/closed handle, else -1. */
+    if (handle < 0 || handle >= RAY_IPC_MAX_CONNS) return -2;
+    ray_sock_t fd = g_client_fds[handle];
+    if (fd == RAY_INVALID_SOCK) return -2;
+
+    int64_t ser_size = ray_serde_size(msg);
+    if (ser_size <= 0) return -1;
+
+    uint8_t* payload = (uint8_t*)ray_sys_alloc((size_t)ser_size);
+    if (!payload) return -1;
+    ray_ser_raw(payload, msg);
+
+    uint8_t* send_buf = NULL;
+    size_t   send_len = 0;
+    uint8_t  flags    = 0;
+
+    if ((size_t)ser_size > RAY_IPC_COMPRESS_THRESHOLD) {   /* only larger payloads are tried */
+        uint8_t* comp = (uint8_t*)ray_sys_alloc((size_t)ser_size);
+        if (comp) {
+            size_t clen = ray_ipc_compress(payload, (size_t)ser_size,
+                                           comp, (size_t)ser_size);
+            if (clen > 0 && clen + 4 < (size_t)ser_size) {   /* must win even with the 4-byte prefix */
+                send_len = clen + 4;
+                send_buf = (uint8_t*)ray_sys_alloc(send_len);
+                if (send_buf) {
+                    uint32_t uncomp = (uint32_t)ser_size;
+                    memcpy(send_buf, &uncomp, 4);        /* prefix: uncompressed size */
+                    memcpy(send_buf + 4, comp, clen);
+                    flags = RAY_IPC_FLAG_COMPRESSED;
+                }
+            }
+            ray_sys_free(comp);
+        }
+    }
+
+    if (!send_buf) {              /* not compressed: send the serialized bytes as they are */
+        send_buf = payload;
+        send_len = (size_t)ser_size;
+        payload  = NULL;          /* now owned through send_buf, freed once below */
+    }
+
+    ray_ipc_header_t hdr = {
+        .prefix  = RAY_SERDE_PREFIX,
+        .version = RAY_SERDE_WIRE_VERSION,
+        .flags   = flags,
+        .endian  = 0,
+        .msgtype = msgtype,
+        .size    = (int64_t)send_len,   /* bytes following the header, prefix included */
+    };
+
+    int64_t rc = ray_sock_send(fd, &hdr, sizeof(hdr));
+    if (rc < 0) { ray_sys_free(send_buf); if (payload) ray_sys_free(payload); return -1; }
+    rc = ray_sock_send(fd, send_buf, send_len);
+
+    ray_sys_free(send_buf);
+    if (payload) ray_sys_free(payload);   /* still set only when the compressed copy was sent */
+    return rc < 0 ? -1 : 0;
+}
+
+int64_t ray_ipc_connect(const char* host, uint16_t port,
+                         const char* user, const char* password)
+{   /* Returns a handle >= 0, or: -1 I/O or other failure, -2 creds needed, -3 rejected, -4 version. */
+    client_init();
+
+    ray_sock_t fd = ray_sock_connect(host, port, 5000);
+    if (fd == RAY_INVALID_SOCK) return -1;
+
+    uint8_t hs[2] = { RAY_SERDE_WIRE_VERSION, 0x00 };
+    if (ray_sock_send(fd, hs, 2) < 0) {
+        ray_sock_close(fd);
+        return -1;
+    }
+
+    uint8_t resp[2];   /* server reply: wire version + auth-required flag */
+    if (recv_full(fd, resp, 2) < 0) {
+        ray_sock_close(fd);
+        return -1;
+    }
+
+    /* Refuse a peer that speaks a different wire version.  This gives
+     * the new client an explicit error at connect time rather than
+     * silently sending a v3 payload to a server that would misparse
+     * every atom. */
+    if (resp[0] != RAY_SERDE_WIRE_VERSION) {
+        ray_sock_close(fd);
+        return -4; /* wire version mismatch */
+    }
+
+    /* Auth required? */
+    if (resp[1] == 0x01) {
+        if (!password) {
+            ray_sock_close(fd);
+            return -2; /* auth required but no creds */
+        }
+        char cred[256];
+        int cred_len;
+        if (user && user[0])
+            cred_len = snprintf(cred, sizeof(cred), "%s:%s", user, password);
+        else
+            cred_len = snprintf(cred, sizeof(cred), ":%s", password);
+        if (cred_len < 0 || cred_len >= UINT8_MAX) {   /* text + NUL must fit the 1-byte length */
+            ray_sock_close(fd);
+            return -1;
+        }
+        cred_len++; /* include null terminator */
+        uint8_t len_byte = (uint8_t)cred_len;   /* cannot truncate: cred_len <= 255 here */
+        if (ray_sock_send(fd, &len_byte, 1) < 0 ||
+            ray_sock_send(fd, cred, cred_len) < 0) {
+            ray_sock_close(fd);
+            return -1;
+        }
+        uint8_t auth_result;
+        if (recv_full(fd, &auth_result, 1) < 0 || auth_result != 0x00) {
+            ray_sock_close(fd);
+            return -3; /* auth rejected */
+        }
+    } else if (resp[1] != 0x00) {   /* any other flag value is treated as a protocol error */
+        ray_sock_close(fd);
+        return -1;
+    }
+
+#ifdef RAY_OS_WINDOWS
+    { DWORD z = 0;
+      setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, (const char*)&z, sizeof(z));
+      setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, (const char*)&z, sizeof(z)); }
+#else
+    { struct timeval z = {0, 0};   /* zero timeval: clear both socket timeouts */
+      setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &z, sizeof(z));
+      setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &z, sizeof(z)); }
+#endif
+
+    for (int i = 0; i < RAY_IPC_MAX_CONNS; i++) {   /* first free slot becomes the handle */
+        if (g_client_fds[i] == RAY_INVALID_SOCK) {
+            g_client_fds[i] = fd;
+            if (i >= g_client_count) g_client_count = i + 1;
+            return (int64_t)i;
+        }
+    }
+
+    ray_sock_close(fd);   /* handle table full */
+    return -1;
+}
+
+void ray_ipc_close(int64_t handle)
+{   /* Out-of-range and already-closed handles are silently ignored. */
+    if (handle < 0 || handle >= RAY_IPC_MAX_CONNS) return;
+    ray_sock_t* slot = &g_client_fds[handle];
+    if (*slot != RAY_INVALID_SOCK) ray_sock_close(*slot);
+    *slot = RAY_INVALID_SOCK;
+}
+
+ray_t* ray_ipc_send(int64_t handle, ray_t* msg)
+{   /* Synchronous request: send msg, block for the reply frame, return the decoded value. */
+    { int64_t sr = client_send_msg(handle, msg, RAY_IPC_MSG_SYNC);
+      if (sr == -2) return ray_error("io", "connection closed");
+      if (sr < 0) return ray_error("io", "ipc send failed"); }
+
+    ray_sock_t fd = g_client_fds[handle];   /* handle was range-checked by client_send_msg */
+
+    ray_ipc_header_t hdr;
+    if (recv_full(fd, &hdr, sizeof(hdr)) < 0) {
+        ray_ipc_close(handle);
+        return ray_error("io", "ipc recv header failed");
+    }
+    if (hdr.prefix != RAY_SERDE_PREFIX || hdr.size <= 0) {
+        ray_ipc_close(handle);
+        return ray_error("io", "ipc bad response header");
+    }
+    if (hdr.version != RAY_SERDE_WIRE_VERSION) {
+        ray_ipc_close(handle);
+        return ray_error("version", "ipc peer wire version mismatch");
+    }
+    if (hdr.size > 256 * 1024 * 1024) {   /* 256 MiB cap, same limit the server enforces */
+        ray_ipc_close(handle);
+        return ray_error("io", "ipc response too large");
+    }
+
+    uint8_t* payload = (uint8_t*)ray_sys_alloc((size_t)hdr.size);
+    if (!payload) { ray_ipc_close(handle); return ray_error("oom", NULL); }   /* payload unread: stream unusable */
+    if (recv_full(fd, payload, (size_t)hdr.size) < 0) {
+        ray_sys_free(payload);
+        ray_ipc_close(handle);
+        return ray_error("io", "ipc recv payload failed");
+    }
+
+    uint8_t* deser_buf     = payload;
+    size_t   deser_len     = (size_t)hdr.size;
+    uint8_t* decompressed  = NULL;
+
+    if (hdr.flags & RAY_IPC_FLAG_COMPRESSED) {   /* payload = 4-byte uncompressed size + data */
+        if (deser_len < 4) { ray_sys_free(payload); return ray_error("io", "ipc compressed payload too short"); }
+        uint32_t uncomp_size;
+        memcpy(&uncomp_size, payload, 4);
+        decompressed = (uint8_t*)ray_sys_alloc(uncomp_size);
+        if (!decompressed) { ray_sys_free(payload); return ray_error("oom", NULL); }
+        size_t dlen = ray_ipc_decompress(payload + 4, deser_len - 4,
+                                         decompressed, uncomp_size);
+        if (dlen != uncomp_size) {
+            ray_sys_free(decompressed);
+            ray_sys_free(payload);
+            return ray_error("io", "ipc decompress failed");
+        }
+        deser_buf = decompressed;
+        deser_len = uncomp_size;
+    }
+
+    int64_t de_len = (int64_t)deser_len;
+    ray_t*  result = ray_de_raw(deser_buf, &de_len);
+
+    if (decompressed) ray_sys_free(decompressed);
+    ray_sys_free(payload);
+
+    return result ? result : RAY_NULL_OBJ;   /* a NULL decode result is reported as RAY_NULL_OBJ */
+}
+
+ray_err_t ray_ipc_send_async(int64_t handle, ray_t* msg)
+{
+    /* Sends msg tagged RAY_IPC_MSG_ASYNC and does not read a reply;
+     * every client_send_msg failure is reported as RAY_ERR_IO. */
+    return client_send_msg(handle, msg, RAY_IPC_MSG_ASYNC) < 0 ? RAY_ERR_IO : RAY_OK;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/ipc.h b/crates/rayforce-sys/vendor/rayforce/src/core/ipc.h
new file mode 100644
index 0000000..ec10ddf
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/ipc.h
@@ -0,0 +1,96 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_IPC_H
+#define RAY_IPC_H
+
+#include <rayforce.h>
+#include "core/poll.h"
+#include "core/sock.h"
+#include "store/serde.h"
+
+/* ===== Compression ===== */
+
+#define RAY_IPC_COMPRESS_THRESHOLD 2000
+
+size_t ray_ipc_compress(const uint8_t* src, size_t len,
+                        uint8_t* dst, size_t dst_cap);
+size_t ray_ipc_decompress(const uint8_t* src, size_t clen,
+                          uint8_t* dst, size_t dst_len);
+
+/* ===== Message types ===== */
+
+#define RAY_IPC_MSG_ASYNC  0
+#define RAY_IPC_MSG_SYNC   1
+#define RAY_IPC_MSG_RESP   2
+
+#define RAY_IPC_FLAG_COMPRESSED 0x01
+/* Set by the journal hook in core/ipc.c eval_payload when the inbound
+ * IPC message arrived on a `-U` restricted connection.  Used ONLY for
+ * persisted log frames; the live IPC path ignores it (the connection's
+ * restricted state is the source of truth there).  Replay reads the
+ * bit to re-impose the original sender's restrictions, otherwise a
+ * crash + restart silently elevates restricted commands to full
+ * privilege. */
+#define RAY_IPC_FLAG_RESTRICTED 0x02
+#define RAY_IPC_MAX_CONNS 256
+
+/* ===== Poll-based IPC (new API) ===== */
+
+/* Register IPC listener on poll. Returns selector id or -1. */
+int64_t ray_ipc_listen(ray_poll_t* poll, uint16_t port);
+
+/* ===== Legacy server API (own epoll/kqueue/select loop, not ray_poll_t; for tests) ===== */
+
+typedef struct ray_ipc_conn {
+    ray_sock_t        fd;        /* connection socket; RAY_INVALID_SOCK once closed */
+    uint8_t*          rx_buf;    /* receive buffer for the current stage; NULL until allocated */
+    size_t            rx_len;    /* bytes received into rx_buf so far */
+    size_t            rx_need;   /* bytes required to complete the current stage */
+    uint8_t           phase;     /* RAY_IPC_PHASE_* value selecting the stage handler */
+    ray_ipc_header_t  hdr;       /* header copied from the most recent header stage */
+} ray_ipc_conn_t;
+
+typedef struct ray_ipc_server {
+    ray_sock_t        listen_fd;        /* listening socket */
+    int               poll_fd;          /* epoll/kqueue fd; -1 where select() is used */
+    ray_ipc_conn_t    conns[RAY_IPC_MAX_CONNS];   /* first n_conns entries are live */
+    uint32_t          n_conns;          /* number of live entries in conns */
+    bool              running;          /* set by init, cleared by destroy */
+    char              auth_secret[256]; /* password from -u/-U; empty string = no auth */
+    bool              restricted;       /* -U mode */
+} ray_ipc_server_t;
+
+ray_err_t ray_ipc_server_init(ray_ipc_server_t* srv, uint16_t port);
+void      ray_ipc_server_destroy(ray_ipc_server_t* srv);
+int       ray_ipc_poll(ray_ipc_server_t* srv, int timeout_ms);
+
+/* ===== Client API (blocking, no poll needed) ===== */
+
+int64_t   ray_ipc_connect(const char* host, uint16_t port,
+                           const char* user, const char* password);
+void      ray_ipc_close(int64_t handle);
+ray_t*    ray_ipc_send(int64_t handle, ray_t* msg);
+ray_err_t ray_ipc_send_async(int64_t handle, ray_t* msg);
+
+#endif /* RAY_IPC_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/kqueue.c b/crates/rayforce-sys/vendor/rayforce/src/core/kqueue.c
new file mode 100644
index 0000000..4c76021
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/kqueue.c
@@ -0,0 +1,248 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#if defined(__APPLE__)
+
+#include "core/poll.h"
+#include "mem/sys.h"
+#include <sys/event.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+#define RAY_POLL_MAX_EVENTS 64
+#define RAY_POLL_INITIAL_CAP 16
+
+ray_poll_t* ray_poll_create(void)
+{   /* Allocate a poll object backed by a fresh kqueue; returns NULL on any failure. */
+    int fd = kqueue();
+    if (fd < 0) return NULL;
+
+    ray_poll_t* poll = (ray_poll_t*)ray_sys_alloc(sizeof(ray_poll_t));
+    if (!poll) { close(fd); return NULL; }
+
+    memset(poll, 0, sizeof(*poll));
+    poll->fd      = fd;
+    poll->code    = -1;   /* ray_poll_run keeps looping while code is negative */
+    poll->sel_cap = RAY_POLL_INITIAL_CAP;
+    poll->sels    = (ray_selector_t**)ray_sys_alloc(
+                        poll->sel_cap * sizeof(ray_selector_t*));
+    if (!poll->sels) {
+        close(fd);
+        ray_sys_free(poll);
+        return NULL;
+    }
+    memset(poll->sels, 0, poll->sel_cap * sizeof(ray_selector_t*));   /* NULL = free slot */
+    return poll;
+}
+
+void ray_poll_destroy(ray_poll_t* poll)
+{   /* Tear down every live selector, then the kqueue and the poll object itself. */
+    if (!poll) return;
+
+    for (uint32_t i = 0; i < poll->n_sels; i++) {
+        ray_selector_t* sel = poll->sels[i];
+        if (!sel) continue;
+        struct kevent kev;   /* unwatch before close_fn, the order ray_poll_deregister uses */
+        EV_SET(&kev, (uintptr_t)sel->fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
+        kevent((int)poll->fd, &kev, 1, NULL, 0, NULL);
+        if (sel->close_fn) sel->close_fn(poll, sel);   /* may close sel->fd */
+        if (sel->rx.buf) ray_poll_buf_free(sel->rx.buf);
+        ray_poll_buf_free(sel->tx.buf);
+        ray_sys_free(sel);
+        poll->sels[i] = NULL;
+    }
+
+    if (poll->sels) ray_sys_free(poll->sels);
+    close((int)poll->fd);
+    ray_sys_free(poll);
+}
+
+int64_t ray_poll_register(ray_poll_t* poll, ray_poll_reg_t* reg)
+{   /* Create a selector from *reg, watch its fd for reads; returns the selector id or -1. */
+    if (!poll || !reg) return -1;
+
+    /* Find free slot or grow */
+    int64_t id = -1;
+    for (uint32_t i = 0; i < poll->n_sels; i++) {
+        if (!poll->sels[i]) { id = (int64_t)i; break; }   /* reuse a deregistered slot */
+    }
+    if (id < 0) {
+        if (poll->n_sels >= poll->sel_cap) {
+            uint32_t new_cap = poll->sel_cap * 2;   /* table doubles when full */
+            ray_selector_t** ns = (ray_selector_t**)ray_sys_alloc(
+                new_cap * sizeof(ray_selector_t*));
+            if (!ns) return -1;
+            memcpy(ns, poll->sels, poll->n_sels * sizeof(ray_selector_t*));
+            memset(ns + poll->n_sels, 0,
+                   (new_cap - poll->n_sels) * sizeof(ray_selector_t*));
+            ray_sys_free(poll->sels);
+            poll->sels    = ns;
+            poll->sel_cap = new_cap;
+        }
+        id = (int64_t)poll->n_sels;
+        poll->n_sels++;
+    }
+
+    ray_selector_t* sel = (ray_selector_t*)ray_sys_alloc(sizeof(ray_selector_t));
+    if (!sel) return -1;   /* slot `id` stays NULL, so a later register can reuse it */
+    memset(sel, 0, sizeof(*sel));
+
+    sel->fd       = reg->fd;
+    sel->id       = id;
+    sel->type     = reg->type;
+    sel->data     = reg->data;
+    sel->open_fn  = reg->open_fn;
+    sel->close_fn = reg->close_fn;
+    sel->error_fn = reg->error_fn;
+    sel->data_fn  = reg->data_fn;
+    sel->rx.recv_fn = reg->recv_fn;
+    sel->rx.read_fn = reg->read_fn;
+    sel->tx.send_fn = reg->send_fn;
+
+    poll->sels[id] = sel;
+
+    /* Register with kqueue */
+    struct kevent kev;
+    EV_SET(&kev, (uintptr_t)reg->fd, EVFILT_READ, EV_ADD, 0, 0,
+           (void*)(uintptr_t)id);   /* udata carries the id; ray_poll_run maps it back */
+
+    if (kevent((int)poll->fd, &kev, 1, NULL, 0, NULL) < 0) {
+        poll->sels[id] = NULL;   /* roll back; reg->fd and reg->data stay with the caller */
+        ray_sys_free(sel);
+        return -1;
+    }
+
+    if (sel->open_fn) sel->open_fn(poll, sel);
+    return id;
+}
+
+void ray_poll_deregister(ray_poll_t* poll, int64_t id)
+{
+    if (!poll || id < 0 || (uint64_t)id >= poll->n_sels) return; /* no 32-bit truncation */
+    ray_selector_t* sel = poll->sels[id];
+    if (!sel) return;                   /* slot already free */
+    /* Stop watching the fd; the kevent result is deliberately ignored. */
+    struct kevent kev;
+    EV_SET(&kev, (uintptr_t)sel->fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
+    kevent((int)poll->fd, &kev, 1, NULL, 0, NULL);
+    /* close_fn runs while the selector and its slot are still valid. */
+    if (sel->close_fn) sel->close_fn(poll, sel);
+    if (sel->rx.buf) ray_poll_buf_free(sel->rx.buf);
+    if (sel->tx.buf) ray_poll_buf_free(sel->tx.buf);
+    ray_sys_free(sel);
+    poll->sels[id] = NULL;              /* slot becomes reusable by register */
+}
+
+int64_t ray_poll_run(ray_poll_t* poll)
+{
+    if (!poll) return -1;
+    /* Event loop: waits in kevent() and dispatches until poll->code >= 0. */
+    struct kevent events[RAY_POLL_MAX_EVENTS];
+
+    while (poll->code < 0) {
+        int n = kevent((int)poll->fd, NULL, 0, events,
+                       RAY_POLL_MAX_EVENTS, NULL);
+        if (n < 0) {
+            if (errno == EINTR) continue;   /* interrupted: just retry */
+            return -1;
+        }
+
+        for (int i = 0; i < n; i++) {
+            uint64_t eid = (uint64_t)(uintptr_t)events[i].udata;  /* slot id set at registration */
+            ray_selector_t* sel = NULL;
+
+            if (eid < poll->n_sels)
+                sel = poll->sels[eid];
+            if (!sel) continue;             /* out-of-range id or freed slot */
+
+            /* EV_ERROR on a non-read filter: report it and skip the read path */
+            if ((events[i].flags & EV_ERROR) && events[i].filter != EVFILT_READ) {
+                if (sel->error_fn)
+                    sel->error_fn(poll, sel);
+                else
+                    ray_poll_deregister(poll, sel->id);
+                continue;
+            }
+
+            /* Process readable data first — even if EOF is also set.
+             * A client may send a message and close simultaneously. */
+            if (events[i].filter == EVFILT_READ) {
+                for (;;) {
+                    if (sel->rx.recv_fn && sel->rx.buf) {
+                        while (sel->rx.buf->offset < sel->rx.buf->size) {  /* fill rx.buf */
+                            int64_t nr = sel->rx.recv_fn(
+                                sel->fd,
+                                sel->rx.buf->data + sel->rx.buf->offset,
+                                sel->rx.buf->size - sel->rx.buf->offset);
+                            if (nr <= 0) {
+                                if (nr < 0 && errno == EINTR) continue;
+                                if (nr < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
+                                    break;          /* nothing more to read right now */
+                                if (sel->error_fn)  /* nr == 0 or a hard error */
+                                    sel->error_fn(poll, sel);
+                                else
+                                    ray_poll_deregister(poll, sel->id);
+                                goto next_event;
+                            }
+                            sel->rx.buf->offset += nr;
+                        }
+                    }
+                    if (sel->rx.buf && sel->rx.buf->offset < sel->rx.buf->size)
+                        break;                      /* buffer not full yet: wait for more */
+                    if (!sel->rx.read_fn) break;
+                    ray_t* obj = sel->rx.read_fn(poll, sel);
+
+                    /* Re-validate: read_fn may have deregistered this selector */
+                    if (eid >= poll->n_sels || !poll->sels[eid]) goto next_event;
+                    sel = poll->sels[eid];
+
+                    if (obj && sel->data_fn)
+                        sel->data_fn(poll, sel, obj);
+                    if (eid >= poll->n_sels || !poll->sels[eid]) goto next_event;  /* same check after data_fn */
+                    sel = poll->sels[eid];
+                    if (!sel->rx.buf) break;
+                    if (sel->rx.buf->offset >= sel->rx.buf->size) continue;  /* NOTE(review): redundant, for(;;) repeats anyway */
+                }
+            }
+
+            /* EOF / error — after data is drained */
+            if (events[i].flags & (EV_EOF | EV_ERROR)) {
+                if (eid < poll->n_sels && poll->sels[eid]) {
+                    sel = poll->sels[eid];
+                    if (sel->type == RAY_SEL_STDIN) goto next_event; /* Ctrl-D handled by read_fn */
+                    if (sel->error_fn)
+                        sel->error_fn(poll, sel);
+                    else
+                        ray_poll_deregister(poll, sel->id);
+                }
+            }
+
+        next_event:;
+        }
+    }
+
+    return poll->code;      /* >= 0 here: the value that ended the loop */
+}
+
+#endif /* __APPLE__ */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/morsel.c b/crates/rayforce-sys/vendor/rayforce/src/core/morsel.c
new file mode 100644
index 0000000..3184cc3
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/morsel.c
@@ -0,0 +1,122 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "core/morsel.h"
+#include "core/platform.h"
+#include "mem/heap.h"
+#include "table/sym.h"
+#include "ops/idxop.h"
+#include <string.h>
+
+/* --------------------------------------------------------------------------
+ * ray_morsel_init
+ *
+ * Prepare `m` to iterate the whole of `vec`: offset 0, len = ray_len(vec),
+ * cached element size, and no current morsel until ray_morsel_next runs.
+ * When vec->mmod == 1 the full data range is passed to ray_vm_advise_seq.
+ * -------------------------------------------------------------------------- */
+
+void ray_morsel_init(ray_morsel_t* m, ray_t* vec) {
+    m->vec = vec;
+    m->offset = 0;
+    m->len = ray_len(vec);
+    m->elem_size = ray_sym_elem_size(vec->type, vec->attrs);
+    m->morsel_len = 0;      /* so the first ray_morsel_next does not advance */
+    m->morsel_ptr = NULL;
+    m->null_bits = NULL;
+
+    /* One-time hint for mmap'd vectors */
+    if (vec->mmod == 1) {
+        ray_vm_advise_seq(ray_data(vec), (size_t)m->len * m->elem_size);
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * ray_morsel_next
+ *
+ * Advance to the next morsel. Returns true if a morsel is available, false
+ * once offset reaches len (the morsel fields are then left untouched). Sets
+ * morsel_ptr/morsel_len for the chunk and null_bits to the bitmap byte that
+ * holds bit `offset`, or NULL (no nulls, or inline bitmap and offset >= 128).
+ * -------------------------------------------------------------------------- */
+
+bool ray_morsel_next(ray_morsel_t* m) {
+    m->offset += m->morsel_len;         /* skip past the previous morsel */
+    if (m->offset >= m->len) return false;
+
+    int64_t remaining = m->len - m->offset;
+    m->morsel_len = remaining < RAY_MORSEL_ELEMS ? remaining : RAY_MORSEL_ELEMS;
+    m->morsel_ptr = (uint8_t*)ray_data(m->vec) + (size_t)m->offset * m->elem_size;
+
+    /* Null bitmap: only if HAS_NULLS.
+     * M5: null_bits points to the byte containing bit (m->offset).
+     * Callers must account for (m->offset % 8) bit offset within the
+     * first byte of null_bits when testing individual null bits.
+     *
+     * HAS_INDEX path: when an accelerator index is attached, the parent's
+     * 16-byte nullmap union holds the index pointer instead of bitmap data
+     * (or ext_nullmap pointer).  The original bytes are preserved inside
+     * ix->saved_nullmap.  Route through that snapshot here so null-aware
+     * loops still see the correct bits. */
+    m->null_bits = NULL;
+    if (m->vec->attrs & RAY_ATTR_HAS_NULLS) {
+        if (m->vec->attrs & RAY_ATTR_HAS_INDEX) {
+            ray_index_t* ix = ray_index_payload(m->vec->index);
+            if (ix->saved_attrs & RAY_ATTR_NULLMAP_EXT) {
+                ray_t* ext;             /* saved bytes hold the ext-bitmap pointer */
+                memcpy(&ext, &ix->saved_nullmap[0], sizeof(ext));
+                m->null_bits = (uint8_t*)ray_data(ext) + (m->offset / 8);
+            } else if (m->offset < 128) {   /* saved inline bitmap: 128 bits */
+                m->null_bits = ix->saved_nullmap + (m->offset / 8);
+            }
+        } else if (m->vec->attrs & RAY_ATTR_NULLMAP_EXT) {
+            /* External bitmap: point to correct byte offset */
+            ray_t* ext = m->vec->ext_nullmap;
+            m->null_bits = (uint8_t*)ray_data(ext) + (m->offset / 8);
+        } else if (m->offset < 128) {
+            /* Inline bitmap is 16 bytes = 128 bits; vectors with HAS_NULLS
+             * and >128 elements must use external nullmap (RAY_ATTR_NULLMAP_EXT).
+             * Returns null_bits=NULL for offset>=128 when using inline bitmap. */
+            m->null_bits = m->vec->nullmap + (m->offset / 8);
+        }
+    }
+
+    return true;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_morsel_init_range
+ *
+ * Initialize a morsel iterator over a sub-range [start, end) of the vector
+ * (parallel dispatch: one disjoint range per worker). Bounds are not checked.
+ * -------------------------------------------------------------------------- */
+
+void ray_morsel_init_range(ray_morsel_t* m, ray_t* vec, int64_t start, int64_t end) {
+    m->vec = vec;
+    m->offset = start;      /* may be unaligned to 8: see null_bits in ray_morsel_next */
+    m->len = end;           /* exclusive end index, not an element count */
+    m->elem_size = ray_sym_elem_size(vec->type, vec->attrs);
+    m->morsel_len = 0;      /* so the first ray_morsel_next starts at `start` */
+    m->morsel_ptr = NULL;
+    m->null_bits = NULL;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/morsel.h b/crates/rayforce-sys/vendor/rayforce/src/core/morsel.h
new file mode 100644
index 0000000..db7c80c
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/morsel.h
@@ -0,0 +1,41 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_MORSEL_H
+#define RAY_MORSEL_H
+
+/*
+ * morsel.h -- Morsel iterator infrastructure.
+ *
+ * A morsel is a chunk of up to RAY_MORSEL_ELEMS (1024) elements from a vector.
+ * The iterator advances through the vector one morsel at a time, providing
+ * direct data pointers and null bitmap pointers for each chunk.
+ */
+
+#include "ops/ops.h"
+
+/* Initialize a morsel iterator over a sub-range [start, end) of vec.
+ * Used by parallel dispatch to partition work across workers. */
+void ray_morsel_init_range(ray_morsel_t* m, ray_t* vec, int64_t start, int64_t end);
+
+#endif /* RAY_MORSEL_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/numparse.c b/crates/rayforce-sys/vendor/rayforce/src/core/numparse.c
new file mode 100644
index 0000000..408443b
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/numparse.c
@@ -0,0 +1,452 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "core/numparse.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* ----------------------------------------------------------------------------
+ * SWAR digit detection
+ *
+ * Load 8 bytes as a little-endian u64 and use the standard Lemire trick:
+ *   - subtract 0x30 from each byte → if any byte was < '0' the result
+ *     underflows and the high bit of that lane is set
+ *   - add 0x46 (= 0x7F - 0x39) to each byte → if any byte was > '9'
+ *     the result exceeds 0x7F and the high bit of that lane is set
+ *   - OR the two and mask with 0x80...80; zero ⇔ all bytes in '0'..'9'
+ * ---------------------------------------------------------------------------- */
+
+#define LANE8_BIT 0x8080808080808080ULL
+#define LANE4_BIT 0x80808080U
+
+bool ray_is_8_digits(const void *p) {
+    uint64_t chunk;                                   /* true iff p[0..7] are all '0'..'9' */
+    memcpy(&chunk, p, 8);                             /* alignment-safe; reads exactly 8 bytes */
+    uint64_t under = chunk - 0x3030303030303030ULL;   /* < '0' → MSB set */
+    uint64_t over  = chunk + 0x4646464646464646ULL;   /* > '9' → MSB set */
+    return ((under | over) & LANE8_BIT) == 0;
+}
+
+bool ray_is_4_digits(const void *p) {
+    uint32_t chunk;                       /* true iff p[0..3] are all '0'..'9' */
+    memcpy(&chunk, p, 4);                 /* alignment-safe; reads exactly 4 bytes */
+    uint32_t under = chunk - 0x30303030U; /* < '0' → MSB set */
+    uint32_t over  = chunk + 0x46464646U; /* > '9' → MSB set */
+    return ((under | over) & LANE4_BIT) == 0;
+}
+
+/* ----------------------------------------------------------------------------
+ * SWAR digit accumulation
+ *
+ * Three-stage fold (bytes -> pairs -> quads) from the "fast atoi"
+ * literature.  Inputs must already be all ASCII digits and the host
+ * little-endian; the explicit form stays tight at -O0 too (sanitizer build).
+ * ---------------------------------------------------------------------------- */
+
+uint64_t ray_parse_8_digits(const void *p) {
+    uint64_t chunk;
+    memcpy(&chunk, p, 8);
+    chunk -= 0x3030303030303030ULL;                   /* now each byte ∈ 0..9 */
+
+    /* Fold pairs of digits into 16-bit words: tens*10 + ones.  The
+     * memory-low byte of each pair holds the tens digit (it printed
+     * first), so on a little-endian load the tens are at chunk's even
+     * bytes and the ones are at the odd bytes. */
+    uint64_t tens  = chunk        & 0x000F000F000F000FULL;
+    uint64_t ones  = (chunk >> 8) & 0x000F000F000F000FULL;
+    uint64_t pairs = tens * 10 + ones;                /* 4 × 16-bit values 0..99 */
+
+    /* Fold pairs-of-pairs into 32-bit words: p_hi*100 + p_lo, where
+     * p_hi holds the more-significant pair of each quad (printed first). */
+    uint64_t p_hi   = pairs        & 0x000000FF000000FFULL;
+    uint64_t p_lo   = (pairs >> 16) & 0x000000FF000000FFULL;
+    uint64_t quads  = p_hi * 100 + p_lo;              /* 2 × 32-bit values 0..9999 */
+
+    /* Final fold: low 32 bits hold the more-significant quad. */
+    return (quads & 0xFFFFFFFFULL) * 10000 + (quads >> 32);
+}
+
+uint32_t ray_parse_4_digits(const void *p) {
+    uint32_t chunk;                                   /* p[0..3] must be ASCII digits */
+    memcpy(&chunk, p, 4);
+    chunk -= 0x30303030U;                             /* now each byte ∈ 0..9 */
+    uint32_t tens  = chunk        & 0x000F000FU;      /* bytes 0 and 2 (printed first) */
+    uint32_t ones  = (chunk >> 8) & 0x000F000FU;      /* bytes 1 and 3 */
+    uint32_t pairs = tens * 10 + ones;                /* low 16 = pair1, high 16 = pair2 */
+    return (pairs & 0xFFFFU) * 100 + (pairs >> 16);
+}
+
+/* ----------------------------------------------------------------------------
+ * Integer parsers
+ * ---------------------------------------------------------------------------- */
+
+#define IS_DIGIT(c) ((unsigned)((unsigned char)(c) - '0') < 10u)
+
+size_t ray_parse_i64(const char *src, size_t len, int64_t *dst) {
+    if (len == 0) return 0;
+    /* Returns bytes consumed; 0 = no digits or out of int64 range (*dst untouched). */
+    size_t i = 0;
+    int neg = 0;
+    if (src[0] == '-') { neg = 1; i = 1; }
+    else if (src[0] == '+') { i = 1; }
+    if (i == len) return 0;                          /* a bare sign is not a number */
+
+    size_t digit_start = i;
+
+    /* Strip leading zeros — they don't contribute to the significant
+     * digit count and would otherwise force an overly strict cap below
+     * (e.g. "00000000000000000001" is just 1, not a 20-digit value). */
+    while (i < len && src[i] == '0') i++;
+    size_t sig_start = i;
+
+    uint64_t result = 0;
+
+    /* SWAR: first 8 digits */
+    if (i + 8 <= len && ray_is_8_digits(src + i)) {
+        result = ray_parse_8_digits(src + i);
+        i += 8;
+        /* Second 8-digit chunk: result is in [0, 1e8), so result*1e8 +
+         * (1e8-1) < 1e16 cannot wrap u64.  The `<= 922337203` test is
+         * therefore always true here and is kept only as a guard. */
+        if (i + 8 <= len && result <= 922337203ULL && ray_is_8_digits(src + i)) {
+            result = result * 100000000ULL + ray_parse_8_digits(src + i);
+            i += 8;
+        }
+    }
+
+    /* Scalar tail with strict 19-digit cap.  INT64_MAX (and |INT64_MIN|)
+     * have 19 decimal digits; anything past that always overflows i64
+     * and may also overflow u64 in a way where the wrapped value lands
+     * back inside [0, INT64_MAX], silently misparsing oversized inputs
+     * as small in-range values.  Cut off before that can happen. */
+    while (i < len && IS_DIGIT(src[i])) {
+        if ((size_t)(i - sig_start) >= 19) return 0; /* too many sig digits */
+        uint64_t prev = result;
+        result = result * 10 + (uint64_t)(src[i] - '0');
+        if (result < prev) return 0;                 /* u64 wrap (defensive) */
+        i++;
+    }
+
+    if (i == digit_start) return 0;                  /* no digits at all */
+
+    /* Fit into int64 with proper handling of INT64_MIN. */
+    if (neg) {
+        if (result > (uint64_t)INT64_MAX + 1ULL) return 0;
+        *dst = (int64_t)(0u - result);                /* avoids signed UB */
+    } else {
+        if (result > (uint64_t)INT64_MAX) return 0;
+        *dst = (int64_t)result;
+    }
+    return i;
+}
+
+size_t ray_parse_i32(const char *src, size_t len, int32_t *dst) {
+    int64_t v;                                    /* parse wide, then range-check */
+    size_t n = ray_parse_i64(src, len, &v);
+    if (n == 0) return 0;
+    if (v < INT32_MIN || v > INT32_MAX) return 0; /* out of range: fail, *dst untouched */
+    *dst = (int32_t)v;
+    return n;
+}
+
+/* ----------------------------------------------------------------------------
+ * Float parser
+ *
+ * Layout: [+-]digits[.digits][eE[+-]digits]
+ * Also accepts NaN, Inf, +Inf, -Inf (case-insensitive prefix; we match
+ * the same forms the language printer emits and that .csv.write produces).
+ * ---------------------------------------------------------------------------- */
+
+static const double g_pow10[] = {
+    1e0,  1e1,  1e2,  1e3,  1e4,  1e5,  1e6,  1e7,
+    1e8,  1e9,  1e10, 1e11, 1e12, 1e13, 1e14, 1e15,
+    1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22
+};
+
+static inline int icmp3(const char *p, char a, char b, char c) {  /* p[0..2] vs a,b,c; ^0x20 also accepts the other ASCII case */
+    unsigned char x = (unsigned char)p[0], y = (unsigned char)p[1], z = (unsigned char)p[2];
+    return (x == (unsigned char)a || x == (unsigned char)(a ^ 0x20)) &&
+           (y == (unsigned char)b || y == (unsigned char)(b ^ 0x20)) &&
+           (z == (unsigned char)c || z == (unsigned char)(c ^ 0x20));
+}
+
+/* Apply 10^e to val.
+ *
+ * For |e| ≤ 22 the pow10 table entries are exact f64 (10^k for k ≤ 22 is
+ * representable), so a single multiply / divide is correctly rounded.
+ *
+ * For positive e > 22 we use libm `pow(10, e)` and a *single* multiply.
+ * Chaining `val *= 1e22` instead would accumulate ~½ ulp per step and
+ * thirteen steps is enough to push values right at the DBL_MAX boundary
+ * (e.g. 1.7976931348623158e308) over the rounding threshold into +inf,
+ * even though the correctly-rounded f64 is still finite.
+ *
+ * For negative e, single multiply via `pow(10, -324)` would underflow to
+ * zero before the multiply could lift the result back into the denormal
+ * range — so 2.2250738585072014e-308 becomes 0.  We chain by 1e22 in
+ * that direction; chained division stays well-conditioned all the way
+ * down to the smallest denormal. */
+static inline double scale_pow10(double val, int e) {
+    if (e == 0) return val;
+    if (e > 0) {
+        if (e <= 22) return val * g_pow10[e];   /* exact table entry */
+        return val * pow(10.0, (double)e);      /* one multiply, no chaining */
+    } else {
+        int ne = -e;
+        if (ne <= 22) return val / g_pow10[ne];
+        while (ne > 22) {                       /* divide down in steps of 1e22 */
+            val /= 1e22;
+            if (val == 0.0) return val;         /* already underflowed: stop early */
+            ne -= 22;
+        }
+        return val / g_pow10[ne];               /* remaining 1..22 powers */
+    }
+}
+
+size_t ray_parse_f64(const char *src, size_t len, double *dst) {
+    if (len == 0) return 0;
+    /* Returns bytes consumed (0 = no number at src); writes *dst on success. */
+    size_t i = 0;
+    int neg = 0;
+    if (src[0] == '-') { neg = 1; i = 1; }
+    else if (src[0] == '+') { i = 1; }
+
+    /* NaN / Inf */
+    if (i + 3 <= len && icmp3(src + i, 'n', 'a', 'n')) {
+        *dst = __builtin_nan("");                 /* a leading sign is ignored for NaN */
+        return i + 3;
+    }
+    if (i + 3 <= len && icmp3(src + i, 'i', 'n', 'f')) {
+        *dst = neg ? -__builtin_inf() : __builtin_inf();
+        return i + 3;                             /* only the 3-letter prefix is consumed */
+    }
+
+    if (i == len) return 0;
+
+    /* Build a single decimal mantissa in u64 plus a signed power-of-ten
+     * offset, then finalize with one multiply.  This avoids two pitfalls
+     * the earlier hand-rolled accumulator had:
+     *
+     *   1. A purely positional fractional cap dropped meaningful trailing
+     *      digits when leading zeros took up the budget — so 1e-19 written
+     *      as "0.0000000000000000001" came back as 0.
+     *   2. Chained `val *= 1e22` for large exponents accumulated rounding
+     *      error past DBL_MAX, turning DBL_MAX itself into inf.
+     */
+    uint64_t mantissa = 0;
+    int      mant_digits = 0;          /* significant digits captured */
+    int      dec_offset  = 0;          /* power of 10 to apply at the end */
+    bool     have_digit  = false;
+
+    /* ---- integer part ----------------------------------------------- */
+
+    /* Skip leading zeros (don't count as significant). */
+    while (i < len && src[i] == '0') { i++; have_digit = true; }
+
+    /* SWAR fast path for the first 8 / 16 sig digits. */
+    if (i + 8 <= len && ray_is_8_digits(src + i)) {
+        mantissa = ray_parse_8_digits(src + i);
+        mant_digits = 8;
+        i += 8;
+        have_digit = true;
+        if (i + 8 <= len && ray_is_8_digits(src + i)) {
+            mantissa = mantissa * 100000000ULL + ray_parse_8_digits(src + i);
+            mant_digits = 16;
+            i += 8;
+        }
+    }
+
+    /* Scalar tail of the integer part.  Past 18 sig digits we drop
+     * further integer digits but keep their magnitude via dec_offset. */
+    while (i < len && IS_DIGIT(src[i])) {
+        if (mant_digits < 18) {
+            mantissa = mantissa * 10 + (uint64_t)(src[i] - '0');
+            mant_digits++;
+        } else {
+            dec_offset++;
+        }
+        i++;
+        have_digit = true;
+    }
+
+    /* ---- fractional part -------------------------------------------- */
+
+    if (i < len && src[i] == '.') {
+        i++;
+        /* Leading zeros in the fractional part (when the mantissa is
+         * still 0) shift the decimal point but contribute no significant
+         * digit. */
+        if (mantissa == 0) {
+            while (i < len && src[i] == '0') {
+                dec_offset--;
+                i++;
+                have_digit = true;
+            }
+        }
+
+        /* SWAR fast path for the first 8 sig fractional digits. */
+        if (i + 8 <= len && mant_digits + 8 <= 18 && ray_is_8_digits(src + i)) {
+            mantissa = mantissa * 100000000ULL + ray_parse_8_digits(src + i);
+            mant_digits += 8;
+            dec_offset -= 8;
+            i += 8;
+            have_digit = true;
+        }
+
+        /* Scalar tail of the fractional part.  Past 18 sig digits we
+         * skip further fractional digits — they are below f64 precision
+         * and they don't shift the magnitude (no dec_offset change). */
+        while (i < len && IS_DIGIT(src[i])) {
+            if (mant_digits < 18) {
+                mantissa = mantissa * 10 + (uint64_t)(src[i] - '0');
+                mant_digits++;
+                dec_offset--;
+            }
+            i++;
+            have_digit = true;
+        }
+    }
+
+    if (!have_digit) return 0;                    /* e.g. "", "-", "." */
+
+    /* ---- explicit exponent ------------------------------------------ */
+
+    if (i < len && (src[i] == 'e' || src[i] == 'E')) {
+        size_t e_at = i;
+        i++;
+        int e_neg = 0;
+        if (i < len) {
+            if (src[i] == '-') { e_neg = 1; i++; }
+            else if (src[i] == '+') { i++; }
+        }
+        size_t e_start = i;
+        int exp_v = 0;
+        bool exp_capped = false;
+        while (i < len && IS_DIGIT(src[i])) {
+            if (exp_v <= 999) exp_v = exp_v * 10 + (src[i] - '0');   /* stays <= 9999 */
+            else exp_capped = true;
+            i++;
+        }
+        if (i == e_start) {
+            /* "1e" with no digits — rewind; the 'e' is not part of the number. */
+            i = e_at;
+        } else {
+            int e = exp_capped ? 10000 : exp_v;   /* 10000 forces the inf / zero branch below */
+            dec_offset += e_neg ? -e : e;
+        }
+    }
+
+    /* ---- finalize: val = mantissa * 10^dec_offset ------------------- */
+
+    /* Fast path applies only when the conversion is provably correctly
+     * rounded — i.e. both factors of the final multiply are exact f64s:
+     *
+     *   - (double)mantissa is exact for mantissa ≤ 2^53.  Significant
+     *     digits ≤ 15 keeps mantissa ≤ 10^15 - 1 < 2^53.
+     *   - g_pow10[|k|] is exact for |k| ≤ 22: 10^k = 2^k · 5^k and
+     *     5^22 < 2^53, so every table entry fits the 53-bit significand
+     *     without rounding (10^23 is the first power that does not).
+     *
+     * Outside that window — high-precision mantissas, large exponents,
+     * or boundary-near values — defer to libc strtod on the original
+     * substring.  glibc strtod is correctly rounded, so this fixes:
+     *   • DBL_MAX-edge overshoot (1.7976931348623158e308 → +inf in the
+     *     fast path; strtod rounds to DBL_MAX);
+     *   • DBL_MAX_PREV mismatch (1.7976931348623155e308 — fast path
+     *     gives DBL_MAX, strtod correctly gives DBL_MAX_PREV);
+     *   • Denormal underflow (mantissa·pow(10,-324) zeroes out before
+     *     scale_pow10's chained division could keep the result alive).
+     *
+     * Most CSV / lang values land in the fast path: they have ≤ 15
+     * significant digits and modest exponents.  The slow lane is
+     * reserved for inputs where the trade-off is correctness over
+     * speed. */
+    double val = 0.0;
+    bool   need_strtod = false;
+
+    if (mantissa == 0) {
+        val = 0.0;
+    } else if (dec_offset > 308) {
+        val = __builtin_inf();
+    } else if (dec_offset < -342) {                  /* below denormal range */
+        val = 0.0;
+    } else if (mant_digits <= 15 && dec_offset >= -22 && dec_offset <= 22) {
+        val = (double)mantissa;
+        if (dec_offset > 0)      val *= g_pow10[dec_offset];
+        else if (dec_offset < 0) val /= g_pow10[-dec_offset];
+    } else {
+        need_strtod = true;
+    }
+
+    if (need_strtod) {
+        char stackbuf[128];                       /* strtod needs a NUL-terminated copy */
+        char *buf = (i + 1 <= sizeof(stackbuf)) ? stackbuf : malloc(i + 1);
+        if (buf) {
+            memcpy(buf, src, i);
+            buf[i] = '\0';
+            char *endp = NULL;
+            double v = strtod(buf, &endp);
+            bool ok = (endp == buf + i);          /* must consume exactly what we scanned */
+            if (buf != stackbuf) free(buf);
+            if (ok) {
+                /* strtod already applied the leading sign in buf, so
+                 * don't apply `neg` again. */
+                *dst = v;
+                return i;
+            }
+        }
+        /* Strtod unusable (OOM on a giant literal, or it stopped early —
+         * e.g. LC_NUMERIC uses a decimal point other than '.').  Fall
+         * back to the approximate scale_pow10 result so we still return
+         * a sensible value rather than 0. */
+        val = scale_pow10((double)mantissa, dec_offset);
+    }
+
+    *dst = neg ? -val : val;
+    return i;
+}
+
+/* ----------------------------------------------------------------------------
+ * Hexadecimal (no 0x prefix, either case, at most 16 digits consumed)
+ * ---------------------------------------------------------------------------- */
+
+size_t ray_parse_u64_hex(const char *src, size_t len, uint64_t *dst) {
+    uint64_t v = 0;
+    size_t i = 0;
+    while (i < len && i < 16) {         /* 16 nibbles fill a u64; a 17th digit is left unread */
+        unsigned char c = (unsigned char)src[i];
+        unsigned d;
+        if (c >= '0' && c <= '9') d = (unsigned)(c - '0');
+        else if (c >= 'a' && c <= 'f') d = (unsigned)(c - 'a' + 10);
+        else if (c >= 'A' && c <= 'F') d = (unsigned)(c - 'A' + 10);
+        else break;                     /* first non-hex byte ends the number */
+        v = (v << 4) | d;
+        i++;
+    }
+    if (i == 0) return 0;               /* no hex digit at src: *dst untouched */
+    *dst = v;
+    return i;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/numparse.h b/crates/rayforce-sys/vendor/rayforce/src/core/numparse.h
new file mode 100644
index 0000000..fca548c
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/numparse.h
@@ -0,0 +1,77 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_CORE_NUMPARSE_H
+#define RAY_CORE_NUMPARSE_H
+
+/* ============================================================================
+ * numparse — unified (ptr, len) → value parsers
+ *
+ * Used by both the language tokenizer (src/lang/parse.c) and the CSV
+ * reader (src/io/csv.c).  All parsers share the same shape:
+ *
+ *     size_t consumed = ray_parse_X(src, len, &out);
+ *
+ *   - returns the number of bytes consumed from `src`
+ *   - 0 means "no progress" — parse failed at byte 0, *out unchanged
+ *   - the language tokenizer advances its cursor by `consumed`
+ *   - the CSV reader treats `consumed != len` as a null/invalid field
+ *
+ * No leading whitespace is stripped; callers strip first if they need to.
+ * Optional sign characters (`+` / `-`) ARE consumed (except by ray_parse_u64_hex).
+ *
+ * SWAR primitives are also exported (used by fast date / time parsers
+ * that consume fixed-width digit groups).
+ * ============================================================================ */
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+size_t ray_parse_i64(const char *src, size_t len, int64_t  *dst);
+size_t ray_parse_i32(const char *src, size_t len, int32_t  *dst);
+size_t ray_parse_f64(const char *src, size_t len, double   *dst);
+size_t ray_parse_u64_hex(const char *src, size_t len, uint64_t *dst);
+
+/* ----------------------------------------------------------------------------
+ * SWAR (SIMD Within A Register) digit primitives.
+ *
+ * Caller must guarantee 8 readable bytes at `p` for the 8-digit forms,
+ * 4 for the 4-digit forms.  All loads are unaligned via memcpy.
+ * Little-endian assumed (x86_64 / aarch64 in normal mode).
+ * ---------------------------------------------------------------------------- */
+
+bool     ray_is_8_digits   (const void *p);
+bool     ray_is_4_digits   (const void *p);
+uint64_t ray_parse_8_digits(const void *p);
+uint32_t ray_parse_4_digits(const void *p);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RAY_CORE_NUMPARSE_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/platform.c b/crates/rayforce-sys/vendor/rayforce/src/core/platform.c
new file mode 100644
index 0000000..a386b32
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/platform.c
@@ -0,0 +1,464 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+/* Feature test macros must come before any includes */
+#if defined(__linux__)
+  #define _GNU_SOURCE
+#endif
+
+#include "platform.h"
+
+/* ==========================================================================
+ * Linux / macOS (POSIX)
+ * ========================================================================== */
+#if defined(RAY_OS_LINUX) || defined(RAY_OS_MACOS)
+
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <pthread.h>
+#include "mem/sys.h"
+
+/* --------------------------------------------------------------------------
+ * Virtual memory
+ * -------------------------------------------------------------------------- */
+/* Map `size` bytes of private anonymous read/write memory; NULL if mmap fails. */
+void* ray_vm_alloc(size_t size) {
+    void* region = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    return (region != MAP_FAILED) ? region : NULL;
+}
+
+void ray_vm_free(void* ptr, size_t size) {  /* NULL ptr is a no-op */
+    if (ptr != NULL) (void)munmap(ptr, size);
+}
+
+/* Map the whole file at `path` as a private (copy-on-write) read/write mapping.
+ * Returns the mapping and stores its length in *out_size (when non-NULL);
+ * returns NULL with *out_size == 0 on any failure or when the file is empty. */
+void* ray_vm_map_file(const char* path, size_t* out_size) {
+    /* Zero the length first so every failure path reports it consistently. */
+    if (out_size) *out_size = 0;
+
+    int fd = open(path, O_RDONLY);
+    if (fd < 0) return NULL;
+
+    struct stat st;
+    if (fstat(fd, &st) != 0 || st.st_size <= 0) {
+        close(fd);
+        return NULL;
+    }
+
+    size_t len = (size_t)st.st_size;
+    void* p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+    close(fd);  /* the mapping stays valid after the descriptor is closed */
+
+    if (p == MAP_FAILED) return NULL;
+
+    if (out_size) *out_size = len;
+    return p;
+}
+
+void ray_vm_unmap_file(void* ptr, size_t size) {  /* NULL ptr is a no-op */
+    if (ptr != NULL) (void)munmap(ptr, size);
+}
+
+void ray_vm_advise_seq(void* ptr, size_t size) {  /* hint: sequential access */
+    if (ptr != NULL) (void)madvise(ptr, size, MADV_SEQUENTIAL);
+}
+
+void ray_vm_advise_willneed(void* ptr, size_t size) {  /* hint: needed soon */
+    if (ptr != NULL) (void)madvise(ptr, size, MADV_WILLNEED);
+}
+
+void ray_vm_release(void* ptr, size_t size) {  /* tell the kernel the range's pages are not needed */
+#if defined(RAY_OS_MACOS)
+    const int advice = MADV_FREE;
+#else
+    const int advice = MADV_DONTNEED;
+#endif
+    if (ptr != NULL) (void)madvise(ptr, size, advice);
+}
+
+/* Map `size` bytes whose start address is a multiple of `alignment` by
+ * over-mapping size + alignment bytes and unmapping the slack on both sides.
+ * `alignment` must be a non-zero power of two (the mask arithmetic below is
+ * meaningless otherwise).  Returns NULL on bad arguments or mmap failure. */
+void* ray_vm_alloc_aligned(size_t size, size_t alignment) {
+    if (alignment == 0 || (alignment & (alignment - 1)) != 0) return NULL;
+    if (size > SIZE_MAX - alignment) return NULL;  /* size + alignment would wrap */
+    size_t total = size + alignment;
+    void* mem = mmap(NULL, total, PROT_READ | PROT_WRITE,
+                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (mem == MAP_FAILED) return NULL;
+
+    uintptr_t addr = (uintptr_t)mem;
+    uintptr_t aligned = (addr + alignment - 1) & ~(uintptr_t)(alignment - 1);
+    uintptr_t end = addr + total;
+    uintptr_t aligned_end = aligned + size;
+
+    if (aligned > addr) munmap(mem, aligned - addr);                       /* leading slack  */
+    if (end > aligned_end) munmap((void*)aligned_end, end - aligned_end);  /* trailing slack */
+    return (void*)aligned;
+}
+
+/* --------------------------------------------------------------------------
+ * Threading
+ * -------------------------------------------------------------------------- */
+
+/* pthread entry expects void*(*)(void*), but ray_thread_fn is void(*)(void*).
+ * Use a small trampoline to bridge the signatures.                          */
+typedef struct {
+    ray_thread_fn fn;   /* user entry point the new thread will call */
+    void*        arg;  /* opaque argument passed to fn */
+} ray_thread_trampoline_t;
+
+static void* thread_trampoline(void* raw) {
+    /* Copy the fields out first: the heap context is released before fn runs. */
+    ray_thread_fn entry = ((ray_thread_trampoline_t*)raw)->fn;
+    void*         param = ((ray_thread_trampoline_t*)raw)->arg;
+    ray_sys_free(raw);
+    entry(param);
+    return NULL;
+}
+
+/* Start a thread running fn(arg) and store its handle in *t.  Any failure
+ * (context allocation or pthread_create) is reported as RAY_ERR_OOM. */
+ray_err_t ray_thread_create(ray_thread_t* t, ray_thread_fn fn, void* arg) {
+    ray_thread_trampoline_t* boxed = (ray_thread_trampoline_t*)ray_sys_alloc(sizeof(*boxed));
+    if (boxed == NULL) return RAY_ERR_OOM;
+    boxed->fn  = fn;
+    boxed->arg = arg;
+    pthread_t handle;
+    if (pthread_create(&handle, NULL, thread_trampoline, boxed) == 0) {
+        *t = (ray_thread_t)handle;
+        return RAY_OK;
+    }
+    ray_sys_free(boxed);  /* the thread never started, so the context is still ours */
+    return RAY_ERR_OOM;
+}
+
+ray_err_t ray_thread_join(ray_thread_t t) {  /* RAY_ERR_IO if pthread_join fails */
+    if (pthread_join((pthread_t)t, NULL) != 0) return RAY_ERR_IO;
+    return RAY_OK;
+}
+
+uint32_t ray_thread_count(void) {  /* online CPUs, or 1 if sysconf reports none */
+    const long online = sysconf(_SC_NPROCESSORS_ONLN);
+    return (online < 1) ? 1u : (uint32_t)online;
+}
+
+/* --------------------------------------------------------------------------
+ * Semaphore
+ * -------------------------------------------------------------------------- */
+#if defined(RAY_OS_MACOS)
+
+ray_err_t ray_sem_init(ray_sem_t* s, uint32_t initial_value) {
+    dispatch_semaphore_t created = dispatch_semaphore_create((long)initial_value);
+    *s = created;  /* stored even when NULL, matching the error return below */
+    return (created != NULL) ? RAY_OK : RAY_ERR_OOM;
+}
+
+void ray_sem_destroy(ray_sem_t* s) {
+    /* Plain C is not ARC-managed, so the object is released by hand. */
+    if (*s != NULL) dispatch_release(*s);
+    *s = NULL;
+}
+
+void ray_sem_wait(ray_sem_t* s) {  /* waits with no timeout */
+    (void)dispatch_semaphore_wait(*s, DISPATCH_TIME_FOREVER);
+}
+
+void ray_sem_signal(ray_sem_t* s) {  /* the call's return value is ignored */
+    (void)dispatch_semaphore_signal(*s);
+}
+
+#else /* Linux */
+
+ray_err_t ray_sem_init(ray_sem_t* s, uint32_t initial_value) {
+    return (sem_init(s, /*pshared=*/0, initial_value) != 0) ? RAY_ERR_OOM : RAY_OK;
+}
+
+void ray_sem_destroy(ray_sem_t* s) {
+    (void)sem_destroy(s);
+}
+
+void ray_sem_wait(ray_sem_t* s) {
+    while (sem_wait(s) != 0) continue;  /* meant for EINTR, but every failure is retried */
+}
+
+void ray_sem_signal(ray_sem_t* s) {
+    (void)sem_post(s);
+}
+
+#endif /* macOS vs Linux semaphore */
+
+/* ==========================================================================
+ * Windows
+ * ========================================================================== */
+#elif defined(RAY_OS_WINDOWS)
+
+#ifndef WIN32_LEAN_AND_MEAN
+  #define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+
+/* --------------------------------------------------------------------------
+ * Virtual memory
+ * -------------------------------------------------------------------------- */
+void* ray_vm_alloc(size_t size) {  /* reserve + commit in one call; NULL on failure */
+    return VirtualAlloc(NULL, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
+}
+
+void ray_vm_free(void* ptr, size_t size) {
+    (void)size;  /* MEM_RELEASE takes a size of 0 and frees the whole reservation */
+    if (ptr != NULL) VirtualFree(ptr, 0, MEM_RELEASE);
+}
+
+/* Map the whole file at `path` copy-on-write (FILE_MAP_COPY).  Returns the view
+ * and stores the file size in *out_size (when non-NULL); returns NULL with
+ * *out_size == 0 on any failure. */
+void* ray_vm_map_file(const char* path, size_t* out_size) {
+    if (out_size) *out_size = 0;  /* failure paths must not leave this unset */
+
+    HANDLE hFile = CreateFileA(path, GENERIC_READ, FILE_SHARE_READ, NULL,
+                               OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+    if (hFile == INVALID_HANDLE_VALUE) return NULL;
+
+    LARGE_INTEGER file_size;
+    void* p = NULL;
+
+    /* Any failing step leaves p == NULL and falls through to the cleanup. */
+    if (GetFileSizeEx(hFile, &file_size)) {
+        HANDLE hMap = CreateFileMappingA(hFile, NULL, PAGE_WRITECOPY, 0, 0, NULL);
+        if (hMap) {
+            p = MapViewOfFile(hMap, FILE_MAP_COPY, 0, 0, 0);
+            CloseHandle(hMap);  /* the mapping keeps the file open internally */
+        }
+    }
+    CloseHandle(hFile);
+
+    if (!p) return NULL;
+
+    if (out_size) *out_size = (size_t)file_size.QuadPart;
+    return p;
+}
+
+void ray_vm_unmap_file(void* ptr, size_t size) {
+    (void)size;  /* UnmapViewOfFile takes only the view's base address */
+    if (ptr != NULL) UnmapViewOfFile(ptr);
+}
+
+/* Best-effort prefetch hint via PrefetchVirtualMemory; failure is ignored. */
+void ray_vm_advise_willneed(void* ptr, size_t size) {
+    WIN32_MEMORY_RANGE_ENTRY entry;
+    entry.VirtualAddress = ptr; entry.NumberOfBytes = size;
+    PrefetchVirtualMemory(GetCurrentProcess(), 1, &entry, 0);
+}
+void ray_vm_advise_seq(void* ptr, size_t size) { ray_vm_advise_willneed(ptr, size); }  /* same prefetch */
+
+void ray_vm_release(void* ptr, size_t size) {
+    /* DiscardVirtualMemory is Win8.1+; its result is ignored and there is no
+     * decommit/recommit fallback when it fails. */
+    if (ptr != NULL) (void)DiscardVirtualMemory(ptr, size);
+}
+
+void* ray_vm_alloc_aligned(size_t size, size_t alignment) {
+    /* Over-allocate, find aligned offset. Can't trim on Windows, so the
+     * pool header's vm_base field stores the original base for VirtualFree.
+     * Reject a zero / non-power-of-two alignment and a wrapping byte count. */
+    if (!alignment || (alignment & (alignment - 1)) || size > SIZE_MAX - alignment) return NULL;
+    void* mem = VirtualAlloc(NULL, size + alignment, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
+    if (!mem) return NULL;
+    return (void*)(((uintptr_t)mem + alignment - 1) & ~(uintptr_t)(alignment - 1));
+}
+
+/* --------------------------------------------------------------------------
+ * Threading
+ * -------------------------------------------------------------------------- */
+typedef struct {
+    ray_thread_fn fn;   /* user entry point the new thread will call */
+    void*        arg;  /* opaque argument passed to fn */
+} ray_thread_trampoline_t;
+
+static DWORD WINAPI thread_trampoline(LPVOID raw) {
+    const ray_thread_trampoline_t snapshot = *(const ray_thread_trampoline_t*)raw;
+    HeapFree(GetProcessHeap(), 0, raw);  /* safe: fn/arg were copied out above */
+    (snapshot.fn)(snapshot.arg);
+    return 0;
+}
+
+/* Start a thread running fn(arg); *t receives the thread HANDLE.  Allocation
+ * and CreateThread failures are both reported as RAY_ERR_OOM. */
+ray_err_t ray_thread_create(ray_thread_t* t, ray_thread_fn fn, void* arg) {
+    HANDLE heap = GetProcessHeap();
+    ray_thread_trampoline_t* boxed = HeapAlloc(heap, 0, sizeof(*boxed));
+    if (boxed == NULL) return RAY_ERR_OOM;
+    boxed->fn  = fn;
+    boxed->arg = arg;
+
+    HANDLE thread = CreateThread(NULL, 0, thread_trampoline, boxed, 0, NULL);
+    if (thread == NULL) HeapFree(heap, 0, boxed);  /* never started: context is still ours */
+    else *t = (ray_thread_t)thread;
+    return (thread != NULL) ? RAY_OK : RAY_ERR_OOM;
+}
+
+ray_err_t ray_thread_join(ray_thread_t t) {  /* the handle is closed even if the wait fails */
+    const DWORD waited = WaitForSingleObject((HANDLE)t, INFINITE);
+    CloseHandle((HANDLE)t);
+    return (waited != WAIT_OBJECT_0) ? RAY_ERR_IO : RAY_OK;
+}
+
+uint32_t ray_thread_count(void) {
+    SYSTEM_INFO info;
+    GetSystemInfo(&info);
+    return (uint32_t)info.dwNumberOfProcessors;
+}
+
+/* --------------------------------------------------------------------------
+ * Semaphore
+ * -------------------------------------------------------------------------- */
+ray_err_t ray_sem_init(ray_sem_t* s, uint32_t initial_value) {  /* unnamed, max count LONG_MAX */
+    *s = CreateSemaphoreA(NULL, (LONG)initial_value, LONG_MAX, NULL);
+    return (*s != NULL) ? RAY_OK : RAY_ERR_OOM;
+}
+
+void ray_sem_destroy(ray_sem_t* s) {  /* leaves *s == NULL */
+    if (*s != NULL) CloseHandle(*s);
+    *s = NULL;
+}
+
+void ray_sem_wait(ray_sem_t* s) {  /* the wait result is not checked */
+    (void)WaitForSingleObject(*s, INFINITE);
+}
+
+void ray_sem_signal(ray_sem_t* s) {  /* raises the count by one */
+    (void)ReleaseSemaphore(*s, 1, NULL);
+}
+
+#endif /* RAY_OS_WINDOWS */
+
+/* ==========================================================================
+ * WASM (Emscripten)
+ *
+ * Single-threaded by construction.  VM allocs are plain malloc; mmap of
+ * files goes through MEMFS via mmap()/munmap() (still works in emscripten
+ * for files written into the in-memory FS).  Thread/semaphore ops are
+ * stubs — pool.c will see thread_count() == 1 and skip worker creation.
+ * ========================================================================== */
+#if defined(RAY_OS_WASM)
+
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "mem/sys.h"
+
+/* WASM: page-aligned read/write region; NULL when both strategies fail. */
+void* ray_vm_alloc(size_t size) {
+    /* Emscripten supports MAP_ANONYMOUS, which yields a page-aligned region
+     * directly; that is the preferred path. */
+    void* region = mmap(NULL, size, PROT_READ | PROT_WRITE,
+                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (region != MAP_FAILED) return region;
+
+    /* Fallback (not expected on MEMFS): aligned_alloc needs a size that is
+     * a multiple of the alignment, so round up to a whole 64KB WASM page.
+     * May itself return NULL. */
+    const size_t wasm_page = 65536u;
+    const size_t rounded   = (size + (wasm_page - 1)) & ~(wasm_page - 1);
+    return aligned_alloc(wasm_page, rounded);
+}
+
+void ray_vm_free(void* ptr, size_t size) {
+    /* A failed munmap is taken to mean ptr came from the aligned_alloc fallback. */
+    if (ptr != NULL && munmap(ptr, size) != 0) free(ptr);
+}
+
+/* Map the whole file at `path` privately (read/write, copy-on-write).  Stores
+ * the length in *out_size when non-NULL; on any failure or an empty file
+ * returns NULL with *out_size == 0. */
+void* ray_vm_map_file(const char* path, size_t* out_size) {
+    if (out_size) *out_size = 0;  /* also covers the open() and mmap() failures */
+
+    int fd = open(path, O_RDONLY);
+    if (fd < 0) return NULL;
+
+    struct stat st;
+    void* p = MAP_FAILED;
+    if (fstat(fd, &st) == 0 && st.st_size > 0)
+        p = mmap(NULL, (size_t)st.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+    close(fd);
+    if (p == MAP_FAILED) return NULL;
+
+    if (out_size) *out_size = (size_t)st.st_size;
+    return p;
+}
+
+void ray_vm_unmap_file(void* ptr, size_t size) {  /* NULL ptr is a no-op */
+    if (ptr != NULL) (void)munmap(ptr, size);
+}
+
+/* The advisory VM hints have no WASM counterpart, so all three do nothing. */
+void ray_vm_advise_seq(void* ptr, size_t size)      { (void)ptr; (void)size; }
+void ray_vm_advise_willneed(void* ptr, size_t size) { (void)ptr; (void)size; }
+void ray_vm_release(void* ptr, size_t size)         { (void)ptr; (void)size; }
+
+void* ray_vm_alloc_aligned(size_t size, size_t alignment) {
+    /* Round size up to a multiple of alignment (mask trick: power-of-two
+     * alignments only), as aligned_alloc requires. */
+    const size_t mask = alignment - 1;
+    return aligned_alloc(alignment, (size + mask) & ~mask);
+}
+
+/* Threading — return errors / 1.  pool.c with n_workers==0 (the result of
+ * thread_count==1 ⇒ ncpu-1 == 0) never invokes thread_create. */
+ray_err_t ray_thread_create(ray_thread_t* t, ray_thread_fn fn, void* arg) {
+    (void)t; (void)fn; (void)arg;
+    return RAY_ERR_NYI;  /* no thread is ever started in the WASM build */
+}
+
+ray_err_t ray_thread_join(ray_thread_t t) {
+    (void)t;
+    return RAY_OK;  /* nothing to wait for */
+}
+
+uint32_t ray_thread_count(void) { return 1; }  /* always reports a single thread */
+
+/* Semaphore — counter-only.  Single-threaded so wait never blocks (the
+ * counter must already be positive when wait fires). */
+ray_err_t ray_sem_init(ray_sem_t* s, uint32_t initial_value) {
+    *s = (int32_t)initial_value;  /* the semaphore is just a counter here */
+    return RAY_OK;
+}
+
+void ray_sem_destroy(ray_sem_t* s) { (void)s; }  /* nothing to release */
+
+void ray_sem_wait(ray_sem_t* s) {
+    if (*s >= 1) *s -= 1;  /* never blocks; a zero count is left untouched */
+}
+
+void ray_sem_signal(ray_sem_t* s) { *s += 1; }
+
+#endif /* RAY_OS_WASM */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/platform.h b/crates/rayforce-sys/vendor/rayforce/src/core/platform.h
new file mode 100644
index 0000000..cad406a
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/platform.h
@@ -0,0 +1,178 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_PLATFORM_H
+#define RAY_PLATFORM_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* --------------------------------------------------------------------------
+ * OS detection
+ * -------------------------------------------------------------------------- */
+/* Detect WASM/Emscripten *before* Linux so we don't pull in Linux-only
+ * headers (madvise, sem_*, pthread) that emscripten's sysroot stubs out.
+ * platform.c provides a dedicated WASM arm with malloc/MEMFS shims. */
+#if defined(__EMSCRIPTEN__)
+  #define RAY_OS_WASM    1
+#elif defined(__linux__)
+  #define RAY_OS_LINUX   1
+#elif defined(__APPLE__) && defined(__MACH__)
+  #define RAY_OS_MACOS   1
+#elif defined(_WIN32)
+  #define RAY_OS_WINDOWS 1
+#else
+  #error "Unsupported platform"
+#endif
+
+/* --------------------------------------------------------------------------
+ * Compiler hints
+ * -------------------------------------------------------------------------- */
+#if !defined(RAY_LIKELY)
+#if defined(__GNUC__) || defined(__clang__)
+  #define RAY_LIKELY(x)   __builtin_expect(!!(x), 1)
+  #define RAY_UNLIKELY(x) __builtin_expect(!!(x), 0)
+  #define RAY_ALIGN(n)    __attribute__((aligned(n)))
+  #define RAY_INLINE      static inline __attribute__((always_inline))
+#elif defined(_MSC_VER)
+  #define RAY_LIKELY(x)   (x)
+  #define RAY_UNLIKELY(x) (x)
+  #define RAY_ALIGN(n)    __declspec(align(n))
+  #define RAY_INLINE      static __forceinline
+#else
+  #define RAY_LIKELY(x)   (x)
+  #define RAY_UNLIKELY(x) (x)
+  #define RAY_ALIGN(n)
+  #define RAY_INLINE      static inline
+#endif
+#endif /* !RAY_LIKELY */
+
+/* --------------------------------------------------------------------------
+ * Thread-local storage
+ * -------------------------------------------------------------------------- */
+#if !defined(RAY_TLS)
+#if defined(_MSC_VER)
+  #define RAY_TLS __declspec(thread)
+#else
+  #define RAY_TLS _Thread_local
+#endif
+#endif /* !RAY_TLS */
+
+/* --------------------------------------------------------------------------
+ * Atomics
+ * -------------------------------------------------------------------------- */
+#if !defined(ray_atomic_inc)
+#if defined(_MSC_VER)
+  #include <intrin.h>
+  /* _InterlockedIncrement/_InterlockedDecrement return the NEW value; the
+   * macros subtract/add 1 so they yield the OLD value like fetch_add/fetch_sub.
+   * On ARM use _nf (no fence) / _rel variants for relaxed/release semantics.
+   * NOTE: unlike the GCC/Clang ray_atomic_cas below, the MSVC ray_atomic_cas
+   * does not write the observed value back to *expected when it fails. */
+  #if defined(_M_ARM) || defined(_M_ARM64)
+    #define ray_atomic_inc(p)   (_InterlockedIncrement_nf((volatile long*)(p)) - 1)
+    #define ray_atomic_dec(p)   (_InterlockedDecrement_rel((volatile long*)(p)) + 1)
+    #define ray_atomic_fence_acquire()  __dmb(_ARM_BARRIER_ISH)
+  #else
+    #define ray_atomic_inc(p)   (_InterlockedIncrement((volatile long*)(p)) - 1)
+    #define ray_atomic_dec(p)   (_InterlockedDecrement((volatile long*)(p)) + 1)
+    #define ray_atomic_fence_acquire()  _ReadWriteBarrier()
+  #endif
+  #define ray_atomic_load(p)  _InterlockedOr((volatile long*)(p), 0)
+  #define ray_atomic_store(p, v) _InterlockedExchange((volatile long*)(p), (long)(v))
+  #define ray_atomic_cas(p, expected, desired) \
+      (_InterlockedCompareExchange((volatile long*)(p), (long)(desired), (long)(*(expected))) == (long)(*(expected)))
+#else
+  #include <stdatomic.h>
+  #define ray_atomic_inc(p)   __atomic_fetch_add(p, 1, __ATOMIC_RELAXED)
+  #define ray_atomic_dec(p)   __atomic_fetch_sub(p, 1, __ATOMIC_RELEASE)
+  #define ray_atomic_load(p)  __atomic_load_n(p, __ATOMIC_ACQUIRE)
+  #define ray_atomic_store(p, v) __atomic_store_n(p, v, __ATOMIC_RELEASE)
+  #define ray_atomic_cas(p, expected, desired) \
+      __atomic_compare_exchange_n(p, expected, desired, 0, \
+          __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)
+  #define ray_atomic_fence_acquire()  __atomic_thread_fence(__ATOMIC_ACQUIRE)
+#endif
+#endif /* !ray_atomic_inc */
+
+/* --------------------------------------------------------------------------
+ * Pull in the public header for ray_err_t, ray_t, etc.
+ * -------------------------------------------------------------------------- */
+#include <rayforce.h>
+
+/* --------------------------------------------------------------------------
+ * Thread types
+ * -------------------------------------------------------------------------- */
+#if defined(_WIN32)
+  typedef void* ray_thread_t;
+#else
+  typedef unsigned long ray_thread_t;
+#endif
+
+typedef void (*ray_thread_fn)(void* arg);
+
+/* --------------------------------------------------------------------------
+ * Platform VM API
+ * -------------------------------------------------------------------------- */
+void* ray_vm_alloc(size_t size);                            /* read/write region; NULL on failure */
+void  ray_vm_free(void* ptr, size_t size);                  /* ptr may be NULL */
+void* ray_vm_map_file(const char* path, size_t* out_size);  /* private copy-on-write view of the whole file; NULL on failure */
+void  ray_vm_unmap_file(void* ptr, size_t size);            /* ptr may be NULL */
+void  ray_vm_advise_seq(void* ptr, size_t size);            /* access hint; no-op on WASM */
+void  ray_vm_advise_willneed(void* ptr, size_t size);       /* prefetch hint; no-op on WASM */
+void  ray_vm_release(void* ptr, size_t size);               /* contents no longer needed; no-op on WASM */
+void* ray_vm_alloc_aligned(size_t size, size_t alignment);  /* alignment must be a power of two */
+
+/* --------------------------------------------------------------------------
+ * Threading API
+ * -------------------------------------------------------------------------- */
+ray_err_t ray_thread_create(ray_thread_t* t, ray_thread_fn fn, void* arg);
+ray_err_t ray_thread_join(ray_thread_t t);
+uint32_t ray_thread_count(void);
+
+void ray_parallel_begin(void);
+void ray_parallel_end(void);
+extern _Atomic(uint32_t) ray_parallel_flag;
+
+/* --------------------------------------------------------------------------
+ * Semaphore (platform-specific, not in the public header)
+ * -------------------------------------------------------------------------- */
+#if defined(RAY_OS_WINDOWS)
+  typedef void* ray_sem_t;  /* HANDLE */
+#elif defined(RAY_OS_MACOS)
+  #include <dispatch/dispatch.h>
+  typedef dispatch_semaphore_t ray_sem_t;
+#elif defined(RAY_OS_WASM)
+  /* WASM is single-threaded by construction; semaphores are no-op stubs. */
+  typedef int32_t ray_sem_t;
+#else
+  #include <semaphore.h>
+  typedef sem_t ray_sem_t;
+#endif
+
+ray_err_t ray_sem_init(ray_sem_t* s, uint32_t initial_value);
+void     ray_sem_destroy(ray_sem_t* s);
+void     ray_sem_wait(ray_sem_t* s);
+void     ray_sem_signal(ray_sem_t* s);
+
+#endif /* RAY_PLATFORM_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/poll.c b/crates/rayforce-sys/vendor/rayforce/src/core/poll.c
new file mode 100644
index 0000000..5e29140
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/poll.c
@@ -0,0 +1,122 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "core/poll.h"
+#include "mem/sys.h"
+#include <string.h>
+
+#ifndef RAY_OS_WINDOWS
+#include <unistd.h>
+#endif
+
+/* ===== Shared (platform-independent) poll helpers ===== */
+
+void ray_poll_exit(ray_poll_t* poll, int64_t code)
+{   /* Record `code` on the poll object; a NULL poll is ignored. */
+    if (poll != NULL) poll->code = code;
+}
+
+/* Selector stored at index `id`, or NULL when poll is NULL or id is out of range. */
+ray_selector_t* ray_poll_get(ray_poll_t* poll, int64_t id)
+{
+    if (!poll || id < 0 || (uint64_t)id >= (uint64_t)poll->n_sels) return NULL;  /* 64-bit compare: no truncation */
+    return poll->sels[id];
+}
+
+/* Allocate a buffer header plus `size` payload bytes (offset 0, no successor).
+ * Returns NULL if size is negative or the allocation fails. */
+ray_poll_buf_t* ray_poll_buf_new(int64_t size)
+{
+    if (size < 0) return NULL;  /* (size_t)size would wrap the byte count */
+    ray_poll_buf_t* buf = (ray_poll_buf_t*)ray_sys_alloc(sizeof(*buf) + (size_t)size);
+    if (!buf) return NULL;
+    buf->next = NULL; buf->size = size; buf->offset = 0;
+    return buf;
+}
+
+/* Free `buf` and every buffer chained after it through ->next (NULL is fine). */
+void ray_poll_buf_free(ray_poll_buf_t* buf)
+{
+    for (ray_poll_buf_t* following; buf != NULL; buf = following) {
+        following = buf->next;  /* read the link before the node is freed */
+        ray_sys_free(buf);
+    }
+}
+
+/* Prepare sel's receive buffer for a `size`-byte read: rewind and reuse the
+ * current buffer when it is big enough, otherwise replace it. */
+void ray_poll_rx_request(ray_poll_t* poll, ray_selector_t* sel, int64_t size)
+{
+    (void)poll;
+    ray_poll_buf_t* current = sel->rx.buf;
+    if (current != NULL && current->size >= size) {
+        current->offset = 0;
+        current->size   = size;  /* recorded size shrinks to the request */
+        return;
+    }
+    if (current != NULL) ray_poll_buf_free(current);
+    sel->rx.buf = ray_poll_buf_new(size);
+}
+
+/* Replace sel's rx buffer with one `extra` bytes larger, copying the first `offset` bytes. */
+void ray_poll_rx_extend(ray_poll_t* poll, ray_selector_t* sel, int64_t extra)
+{
+    (void)poll;
+    ray_poll_buf_t* old = sel->rx.buf;
+    if (old == NULL) {  /* nothing to preserve: start a fresh buffer */
+        sel->rx.buf = ray_poll_buf_new(extra);
+        return;
+    }
+    ray_poll_buf_t* grown = ray_poll_buf_new(old->size + extra);
+    if (grown == NULL) return;  /* allocation failed: the old buffer is kept */
+    if (old->offset > 0) memcpy(grown->data, old->data, (size_t)old->offset);
+    grown->offset = old->offset;
+    ray_poll_buf_free(old);
+    sel->rx.buf = grown;
+}
+
+/* Send buf[offset..size) on sel in a loop, then free buf (always, even when
+ * the transfer stops early because a send returned <= 0). */
+void ray_poll_send(ray_poll_t* poll, ray_selector_t* sel,
+                   ray_poll_buf_t* buf)
+{
+    (void)poll;
+    if (sel == NULL || buf == NULL) return;
+    for (;;) {
+        const int64_t remaining = buf->size - buf->offset;
+        if (remaining <= 0) break;
+        int64_t sent;
+        if (sel->tx.send_fn != NULL) {
+            sent = sel->tx.send_fn(sel->fd, buf->data + buf->offset, remaining);
+        } else {
+#ifdef RAY_OS_WINDOWS
+            sent = -1;  /* must have send_fn on Windows */
+#else
+            sent = (int64_t)write((int)sel->fd, buf->data + buf->offset, (size_t)remaining);
+#endif
+        }
+        if (sent <= 0) break;
+        buf->offset += sent;
+    }
+    ray_poll_buf_free(buf);
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/poll.h b/crates/rayforce-sys/vendor/rayforce/src/core/poll.h
new file mode 100644
index 0000000..1424629
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/poll.h
@@ -0,0 +1,115 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_POLL_H
+#define RAY_POLL_H
+
+#include <rayforce.h>
+
+/* Forward declarations */
+typedef struct ray_poll     ray_poll_t;
+typedef struct ray_selector ray_selector_t;
+
+/* ===== Selector types ===== */
+
+#define RAY_SEL_STDIN   0
+#define RAY_SEL_SOCKET  3
+
+/* ===== Callbacks ===== */
+/* ray_io_fn returns a byte count; ray_poll_send() stops on a result <= 0. */
+typedef int64_t (*ray_io_fn)(int64_t fd, uint8_t* buf, int64_t len);
+typedef ray_t*  (*ray_read_fn)(ray_poll_t* poll, ray_selector_t* sel);
+typedef void    (*ray_event_fn)(ray_poll_t* poll, ray_selector_t* sel);
+typedef ray_t*  (*ray_poll_data_fn)(ray_poll_t* poll, ray_selector_t* sel, void* data);
+
+/* ===== Buffer ===== */
+/* Variable-length I/O buffer; `data` is a C99 flexible array member. */
+typedef struct ray_poll_buf {
+    struct ray_poll_buf* next;    /* link to another buffer (presumably a queue — confirm) */
+    int64_t              size;    /* number of bytes to transfer */
+    int64_t              offset;  /* progress cursor: ray_poll_send() loops while offset < size */
+    uint8_t              data[];  /* payload bytes */
+} ray_poll_buf_t;
+
+/* ===== Selector — one per registered fd ===== */
+/* Per-fd state: identity, event callbacks and the rx/tx buffers. */
+struct ray_selector {
+    int64_t          fd;        /* descriptor passed to send_fn / write() */
+    int64_t          id;        /* presumably the id taken by ray_poll_get() — confirm */
+    uint8_t          type;      /* presumably a RAY_SEL_* value — confirm */
+    void*            data;
+    ray_event_fn     open_fn;
+    ray_event_fn     close_fn;
+    ray_event_fn     error_fn;
+    ray_poll_data_fn      data_fn;
+    struct { ray_poll_buf_t* buf; ray_io_fn recv_fn; ray_read_fn read_fn; } rx;
+    struct { ray_poll_buf_t* buf; ray_io_fn send_fn; }                      tx;
+};
+
+/* ===== Registration ===== */
+/* Argument to ray_poll_register(); its fields have ray_selector counterparts. */
+typedef struct ray_poll_reg {
+    int64_t          fd;
+    uint8_t          type;
+    ray_event_fn     open_fn;
+    ray_event_fn     close_fn;
+    ray_event_fn     error_fn;
+    ray_poll_data_fn      data_fn;
+    ray_io_fn        recv_fn;
+    ray_io_fn        send_fn;
+    ray_read_fn      read_fn;
+    void*            data;
+} ray_poll_reg_t;
+
+/* ===== Poll ===== */
+/* Poll instance: OS handle, exit code, selector table and auth settings. */
+struct ray_poll {
+    int64_t          fd;       /* epoll/kqueue/iocp handle */
+    int64_t          code;     /* exit code (-1 = running) */
+    ray_selector_t** sels;     /* selector array */
+    uint32_t         n_sels;   /* presumably entries in use — confirm in poll.c */
+    uint32_t         sel_cap;  /* presumably allocated slots in sels — confirm */
+    char             auth_secret[256]; /* password from -u/-U, empty = no auth */
+    bool             restricted;       /* true if -U (read-only IPC mode) */
+};
+
+/* ===== API ===== */
+/* Poll lifecycle, selector registration and the event loop. */
+ray_poll_t*     ray_poll_create(void);
+void            ray_poll_destroy(ray_poll_t* poll);
+int64_t         ray_poll_register(ray_poll_t* poll, ray_poll_reg_t* reg);
+void            ray_poll_deregister(ray_poll_t* poll, int64_t id);
+int64_t         ray_poll_run(ray_poll_t* poll);
+void            ray_poll_exit(ray_poll_t* poll, int64_t code);
+ray_selector_t* ray_poll_get(ray_poll_t* poll, int64_t id);
+/* Buffer helpers: rx_request/rx_extend size sel->rx.buf; send writes buf, then frees it. */
+ray_poll_buf_t* ray_poll_buf_new(int64_t size);
+void            ray_poll_buf_free(ray_poll_buf_t* buf);
+void            ray_poll_rx_request(ray_poll_t* poll, ray_selector_t* sel,
+                                    int64_t size);
+void            ray_poll_rx_extend(ray_poll_t* poll, ray_selector_t* sel,
+                                   int64_t extra);
+void            ray_poll_send(ray_poll_t* poll, ray_selector_t* sel,
+                              ray_poll_buf_t* buf);
+
+#endif /* RAY_POLL_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/pool.c b/crates/rayforce-sys/vendor/rayforce/src/core/pool.c
new file mode 100644
index 0000000..cb62277
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/pool.c
@@ -0,0 +1,504 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "core/pool.h"
+#include "mem/cow.h"
+#include "mem/heap.h"
+#include "mem/sys.h"
+#include <string.h>
+#include <sched.h>
+
+/* Task granularity: RAY_DISPATCH_MORSELS * RAY_MORSEL_ELEMS elements per task */
+#define TASK_GRAIN  ((int64_t)RAY_DISPATCH_MORSELS * RAY_MORSEL_ELEMS)
+
+/* Maximum ring capacity (power of 2) */
+#define MAX_RING_CAP  (1u << 16)
+
+/* --------------------------------------------------------------------------
+ * Worker thread entry
+ * -------------------------------------------------------------------------- */
+/* Start-up arguments for one worker, allocated by ray_pool_create(). */
+typedef struct {
+    ray_pool_t* pool;
+    uint32_t   worker_id;   /* 1-based (0 = main thread) */
+} worker_ctx_t;
+/* Thread body: sleep on work_ready, drain the task ring, exit on shutdown. */
+static void worker_loop(void* arg) {
+    worker_ctx_t wctx = *(worker_ctx_t*)arg;  /* copy out, then free the allocation */
+    ray_sys_free(arg);
+
+    ray_pool_t* pool = wctx.pool;
+
+    /* Each worker thread gets its own heap */
+    ray_heap_init();
+    ray_rc_sync = true;  /* workers always use atomic refcounting */
+
+    for (;;) {
+        ray_sem_wait(&pool->work_ready);
+
+        /* Shutdown is checked once per wake-up, before any task is claimed. */
+        if (atomic_load_explicit(&pool->shutdown, memory_order_acquire))
+            break;
+
+        /* Claim and execute tasks until ring is drained */
+        for (;;) {
+            uint32_t idx = atomic_fetch_add_explicit(&pool->task_tail, 1,
+                                                     memory_order_acq_rel);
+            if (idx >= atomic_load_explicit(&pool->task_count,
+                                            memory_order_acquire))
+                break;  /* index past the published count: nothing left to claim */
+
+            /* Skip execution if query was cancelled */
+            if (RAY_UNLIKELY(atomic_load_explicit(&pool->cancelled,
+                                                  memory_order_relaxed))) {
+                atomic_fetch_sub_explicit(&pool->pending, 1,
+                                          memory_order_acq_rel);  /* skipped tasks still count as done */
+                continue;
+            }
+
+            ray_pool_task_t* t = &pool->tasks[idx & (pool->task_cap - 1)];
+            t->fn(t->ctx, wctx.worker_id, t->start, t->end);
+
+            atomic_fetch_sub_explicit(&pool->pending, 1,
+                                      memory_order_acq_rel);
+        }
+
+        /* No ray_heap_gc() here — removing worker GC between dispatch rounds
+         * ensures main can safely modify worker heaps in ray_parallel_end().
+         * Eager madvise in heap_coalesce already releases pages on free. */
+    }
+
+    ray_heap_destroy();
+}
+
+/* --------------------------------------------------------------------------
+ * ray_pool_create
+ * -------------------------------------------------------------------------- */
+
+/* Undo a partially completed ray_pool_create(): stop and join the first
+ * `started` workers, then release the thread array, semaphore and ring. */
+static void pool_create_rollback(ray_pool_t* pool, uint32_t started) {
+    /* Raise shutdown before signalling so every woken worker sees it. */
+    atomic_store_explicit(&pool->shutdown, 1, memory_order_release);
+    for (uint32_t k = 0; k < started; k++) {
+        ray_sem_signal(&pool->work_ready);
+    }
+    for (uint32_t k = 0; k < started; k++) {
+        ray_thread_join(pool->threads[k]);
+    }
+    ray_sys_free(pool->threads);
+    ray_sem_destroy(&pool->work_ready);
+    ray_sys_free(pool->tasks);
+}
+
+/* Initialise `pool` with a 1024-slot task ring and `n_workers` background
+ * threads.
+ *   pool      - caller-provided storage; overwritten unconditionally.
+ *   n_workers - thread count; 0 selects ray_thread_count() - 1 (or no
+ *               threads when ray_thread_count() reports at most 1).
+ * Returns RAY_OK, RAY_ERR_OOM, or the error from ray_sem_init() /
+ * ray_thread_create(). On failure everything acquired so far is released. */
+ray_err_t ray_pool_create(ray_pool_t* pool, uint32_t n_workers) {
+    /* conc-L7: memset zeroes all fields including the `cancelled` atomic,
+     * which resets any cancellation state from a prior pool instance. */
+    memset(pool, 0, sizeof(*pool));
+    /* H3: Re-initialize atomic fields after memset — memset produces a
+     * valid zero bit pattern on all supported platforms, but C11 requires
+     * atomic_init for well-defined atomic semantics. */
+    atomic_init(&pool->shutdown, 0);
+    atomic_init(&pool->task_tail, 0);
+    atomic_init(&pool->task_count, 0);
+    atomic_init(&pool->pending, 0);
+    atomic_init(&pool->cancelled, 0);
+
+    if (n_workers == 0) {
+        /* Auto-detect: the main thread counts as one of the workers. */
+        uint32_t ncpu = ray_thread_count();
+        n_workers = (ncpu > 1) ? ncpu - 1 : 0;
+    }
+    pool->n_workers = n_workers;
+
+    /* Task ring: starts at 1024 slots; dispatch grows it when needed. */
+    pool->task_cap  = 1024;
+    pool->task_head = 0;
+    pool->tasks = (ray_pool_task_t*)ray_sys_alloc(pool->task_cap * sizeof(ray_pool_task_t));
+    if (!pool->tasks) return RAY_ERR_OOM;
+
+    /* Semaphore the workers sleep on between dispatches. */
+    ray_err_t err = ray_sem_init(&pool->work_ready, 0);
+    if (err != RAY_OK) {
+        ray_sys_free(pool->tasks);
+        return err;
+    }
+
+    /* No background threads requested: the caller's thread does all work. */
+    if (n_workers == 0) return RAY_OK;
+
+    /* One handle per background thread. */
+    pool->threads = (ray_thread_t*)ray_sys_alloc(n_workers * sizeof(ray_thread_t));
+    if (!pool->threads) {
+        ray_sem_destroy(&pool->work_ready);
+        ray_sys_free(pool->tasks);
+        return RAY_ERR_OOM;
+    }
+
+    /* Spawn workers; ids are 1-based because the main thread is worker 0. */
+    for (uint32_t i = 0; i < n_workers; i++) {
+        worker_ctx_t* wctx = (worker_ctx_t*)ray_sys_alloc(sizeof(worker_ctx_t));
+        if (!wctx) {
+            pool_create_rollback(pool, i);  /* tear down workers 0..i-1 */
+            return RAY_ERR_OOM;
+        }
+        wctx->pool = pool;
+        wctx->worker_id = i + 1;
+
+        /* On success the worker owns wctx and frees it in worker_loop(). */
+        err = ray_thread_create(&pool->threads[i], worker_loop, wctx);
+        if (err != RAY_OK) {
+            ray_sys_free(wctx);  /* thread never started: still ours to free */
+            pool_create_rollback(pool, i);
+            return err;
+        }
+    }
+
+    return RAY_OK;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_pool_free
+ * -------------------------------------------------------------------------- */
+
+void ray_pool_free(ray_pool_t* pool) {
+    /* Stop the workers, release everything `pool` owns, then zero it. */
+    uint32_t w;
+    if (pool == NULL) return;
+
+    /* Raise shutdown first, then post one wake-up per worker. */
+    atomic_store_explicit(&pool->shutdown, 1, memory_order_release);
+    for (w = 0; w < pool->n_workers; w++)
+        ray_sem_signal(&pool->work_ready);
+
+    /* Every worker must have exited before shared state is freed. */
+    for (w = 0; w < pool->n_workers; w++)
+        ray_thread_join(pool->threads[w]);
+
+    ray_sys_free(pool->threads);
+    ray_sem_destroy(&pool->work_ready);
+    ray_sys_free(pool->tasks);
+    memset(pool, 0, sizeof(*pool));
+}
+
+/* --------------------------------------------------------------------------
+ * ray_pool_dispatch
+ * -------------------------------------------------------------------------- */
+
+/* Run fn over [0, total_elems) split into tasks; blocks until all are done.
+ * M2: Caller (ray_execute) must reset pool->cancelled before dispatching.
+ * The cancelled flag is per-query state; failing to clear it causes all
+ * subsequent dispatches to skip task execution. */
+void ray_pool_dispatch(ray_pool_t* pool, ray_pool_fn fn, void* ctx,
+                      int64_t total_elems) {
+    if (total_elems <= 0) return;
+
+    /* Calculate number of tasks. Overflow guard: total_elems + grain - 1 could
+     * wrap for extreme values; the count saturates instead of truncating. */
+    int64_t grain = TASK_GRAIN;
+    if (RAY_UNLIKELY(total_elems > INT64_MAX - grain + 1))
+        total_elems = INT64_MAX - grain + 1;
+    int64_t want = (total_elems + grain - 1) / grain;
+    uint32_t n_tasks = (want > (int64_t)UINT32_MAX) ? UINT32_MAX : (uint32_t)want;
+
+    /* conc-L6: Ring growth is safe without synchronization because dispatch is
+     * single-producer: only the main thread (the dispatch caller) writes to
+     * task_head, tasks[], and task_cap. Workers only read via task_tail after
+     * the publish fence (task_count store-release). */
+    if (n_tasks > pool->task_cap) {
+        uint32_t new_cap = pool->task_cap;
+        while (new_cap < n_tasks && new_cap < MAX_RING_CAP) new_cap *= 2;
+        if (new_cap > pool->task_cap) {
+            ray_pool_task_t* new_tasks = (ray_pool_task_t*)ray_sys_realloc(
+                pool->tasks, new_cap * sizeof(ray_pool_task_t));
+            if (new_tasks) {
+                pool->tasks = new_tasks;
+                pool->task_cap = new_cap;
+            }
+        }
+    }
+
+    /* Clamp n_tasks to task_cap to prevent ring overflow. The coarser grain is
+     * rounded up, so n_tasks is recomputed from it: otherwise trailing tasks
+     * could start past total_elems and receive an inverted range. */
+    if (n_tasks > pool->task_cap) {
+        grain = total_elems / pool->task_cap + (total_elems % pool->task_cap != 0);
+        n_tasks = (uint32_t)(total_elems / grain + (total_elems % grain != 0));
+    }
+
+    /* Fill task ring */
+    for (uint32_t i = 0; i < n_tasks; i++) {
+        int64_t start = (int64_t)i * grain;
+        int64_t end = (total_elems - start > grain) ? start + grain : total_elems;
+        uint32_t slot = i & (pool->task_cap - 1);
+        pool->tasks[slot].fn = fn;
+        pool->tasks[slot].ctx = ctx;
+        pool->tasks[slot].start = start;
+        pool->tasks[slot].end = end;
+    }
+
+    /* NOTE(review): a worker woken late by a leftover work_ready signal may see
+     * the new task_count before task_tail is reset below — confirm benign. */
+    pool->task_head = n_tasks;
+    atomic_store_explicit(&pool->task_count, n_tasks, memory_order_release);
+    atomic_store_explicit(&pool->task_tail, 0, memory_order_release);
+    atomic_store_explicit(&pool->pending, n_tasks, memory_order_release);
+
+    /* Mark parallel region: workers are about to run, cross-heap
+     * freelist modification is unsafe until spin-wait completes. */
+    atomic_store_explicit(&ray_parallel_flag, 1, memory_order_release);
+    /* Main thread enters atomic refcount mode during parallel dispatch */
+    ray_rc_sync = true;
+
+    /* Wake worker threads */
+    for (uint32_t i = 0; i < pool->n_workers; i++)
+        ray_sem_signal(&pool->work_ready);
+
+    /* Main thread participates as worker 0 */
+    for (;;) {
+        uint32_t idx = atomic_fetch_add_explicit(&pool->task_tail, 1,
+                                                 memory_order_acq_rel);
+        if (idx >= n_tasks) break;
+        if (RAY_UNLIKELY(atomic_load_explicit(&pool->cancelled,
+                                              memory_order_relaxed))) {
+            atomic_fetch_sub_explicit(&pool->pending, 1, memory_order_acq_rel);
+            continue;
+        }
+
+        ray_pool_task_t* t = &pool->tasks[idx & (pool->task_cap - 1)];
+        t->fn(t->ctx, 0, t->start, t->end);
+        atomic_fetch_sub_explicit(&pool->pending, 1, memory_order_acq_rel);
+    }
+
+    /* Spin-wait for workers to finish remaining tasks.
+     * No semaphore — avoids surplus-signal bug between consecutive dispatches. */
+    {
+        unsigned spin_count = 0;
+        while (atomic_load_explicit(&pool->pending, memory_order_acquire) > 0) {
+#if defined(__x86_64__) || defined(__i386__)
+            __builtin_ia32_pause();
+#elif defined(__aarch64__)
+            __asm__ volatile("yield" ::: "memory");
+#endif
+            if (++spin_count % 1024 == 0) sched_yield();
+        }
+    }
+
+    /* All tasks done, workers heading to sem_wait (no GC in loop).
+     * Safe for main to modify worker heaps between dispatches. */
+    atomic_store_explicit(&ray_parallel_flag, 0, memory_order_release);
+
+    /* Memory fence ensures all worker RC operations are visible before
+     * main thread switches to non-atomic refcounting.  Workers may still
+     * be between pending-- and sem_wait. */
+    atomic_thread_fence(memory_order_seq_cst);
+    ray_rc_sync = false;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_pool_dispatch_n — dispatch exactly n_tasks tasks, each [i, i+1)
+ * -------------------------------------------------------------------------- */
+
+void ray_pool_dispatch_n(ray_pool_t* pool, ray_pool_fn fn, void* ctx,
+                         uint32_t n_tasks) {
+    if (n_tasks == 0) return;
+
+    /* Grow ring if needed. Best effort: growth stops at MAX_RING_CAP and a
+     * failed realloc keeps the old ring. */
+    if (n_tasks > pool->task_cap) {
+        uint32_t new_cap = pool->task_cap;
+        while (new_cap < n_tasks && new_cap < MAX_RING_CAP) new_cap *= 2;
+        if (new_cap > pool->task_cap) {
+            ray_pool_task_t* new_tasks = (ray_pool_task_t*)ray_sys_realloc(
+                pool->tasks, new_cap * sizeof(ray_pool_task_t));
+            if (new_tasks) {
+                pool->tasks = new_tasks;
+                pool->task_cap = new_cap;
+            }
+        }
+    }
+
+    /* Run the partitions in batches of at most task_cap. The usual case is a
+     * single batch; when the ring could not grow enough, the remaining
+     * partitions run in later batches instead of being silently dropped. */
+    for (uint32_t base = 0, batch; base < n_tasks; base += batch) {
+        batch = n_tasks - base;
+        if (batch > pool->task_cap) batch = pool->task_cap;
+
+        /* Fill task ring: one task per partition, range [base+i, base+i+1) */
+        for (uint32_t i = 0; i < batch; i++) {
+            uint32_t slot = i & (pool->task_cap - 1);
+            pool->tasks[slot].fn = fn;
+            pool->tasks[slot].ctx = ctx;
+            pool->tasks[slot].start = (int64_t)base + i;
+            pool->tasks[slot].end = (int64_t)base + i + 1;
+        }
+
+        pool->task_head = batch;
+        atomic_store_explicit(&pool->task_count, batch, memory_order_release);
+        atomic_store_explicit(&pool->task_tail, 0, memory_order_release);
+        atomic_store_explicit(&pool->pending, batch, memory_order_release);
+
+        atomic_store_explicit(&ray_parallel_flag, 1, memory_order_release);
+        ray_rc_sync = true;
+
+        /* Wake worker threads */
+        for (uint32_t i = 0; i < pool->n_workers; i++)
+            ray_sem_signal(&pool->work_ready);
+
+        /* Main thread participates as worker 0 */
+        for (;;) {
+            uint32_t idx = atomic_fetch_add_explicit(&pool->task_tail, 1,
+                                                     memory_order_acq_rel);
+            if (idx >= batch) break;
+            if (RAY_UNLIKELY(atomic_load_explicit(&pool->cancelled,
+                                                  memory_order_relaxed))) {
+                atomic_fetch_sub_explicit(&pool->pending, 1, memory_order_acq_rel);
+                continue;
+            }
+            ray_pool_task_t* t = &pool->tasks[idx & (pool->task_cap - 1)];
+            t->fn(t->ctx, 0, t->start, t->end);
+            atomic_fetch_sub_explicit(&pool->pending, 1, memory_order_acq_rel);
+        }
+
+        /* Spin-wait for workers to finish this batch before refilling the ring */
+        unsigned spin_count = 0;
+        while (atomic_load_explicit(&pool->pending, memory_order_acquire) > 0) {
+#if defined(__x86_64__) || defined(__i386__)
+            __builtin_ia32_pause();
+#elif defined(__aarch64__)
+            __asm__ volatile("yield" ::: "memory");
+#endif
+            if (++spin_count % 1024 == 0) sched_yield();
+        }
+
+        atomic_store_explicit(&ray_parallel_flag, 0, memory_order_release);
+        atomic_thread_fence(memory_order_seq_cst);
+        ray_rc_sync = false;
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Global pool singleton (lazy init)
+ * -------------------------------------------------------------------------- */
+
+/* L4: Global singleton; not destroyed at program exit (OS reclaims resources).
+ * May cause ASan leak reports — suppress via LSAN_OPTIONS=detect_leaks=0 or
+ * an explicit ray_pool_destroy() call before exit. */
+static ray_pool_t  g_pool;
+static _Atomic(uint32_t) g_pool_init_state = 0;  /* 0=uninit, 1=initializing, 2=ready, 3=destroying */
+
+ray_pool_t* ray_pool_get(void) {
+    /* Returns the global pool, creating it on first use, or NULL if it is
+     * unavailable. Fast path: the singleton is already up. */
+    uint32_t seen = atomic_load_explicit(&g_pool_init_state, memory_order_acquire);
+    if (seen == 2) return &g_pool;
+
+    /* Uninitialized: race to claim the 0 -> 1 transition; the winner builds
+     * the pool with an auto-detected worker count. */
+    if (seen == 0) {
+        uint32_t uninit = 0;
+        bool claimed = atomic_compare_exchange_strong_explicit(
+            &g_pool_init_state, &uninit, 1,
+            memory_order_acq_rel, memory_order_acquire);
+        if (claimed) {
+            bool ok = (ray_pool_create(&g_pool, 0) == RAY_OK);
+            /* Publish 2 (ready) on success, or 0 so a later call can retry. */
+            atomic_store_explicit(&g_pool_init_state, ok ? 2u : 0u,
+                                  memory_order_release);
+            return ok ? &g_pool : NULL;
+        }
+    }
+
+    /* Someone else is initializing (1) or destroying (3): spin until the
+     * state settles. M7: a settled state of 0 means the pool is unavailable
+     * (init failed or destroy completed) and NULL is returned; 2 means it is
+     * ready. */
+    for (unsigned spin_count = 1; ; spin_count++) {
+        uint32_t s = atomic_load_explicit(&g_pool_init_state, memory_order_acquire);
+        if (s == 2) return &g_pool;
+        if (s == 0) return NULL;
+#if defined(__x86_64__) || defined(__i386__)
+        __builtin_ia32_pause();
+#elif defined(__aarch64__)
+        __asm__ volatile("yield" ::: "memory");
+#endif
+        if (spin_count % 1024 == 0) sched_yield();
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Public API wrappers (declared in rayforce.h)
+ * -------------------------------------------------------------------------- */
+
+/* conc-L4: If ray_pool_init() is called when the pool is already initialized
+ * (state==2), the n_workers parameter is silently ignored and the existing
+ * pool configuration is preserved. This is by design — the pool is a
+ * singleton and reconfiguration requires ray_pool_destroy() + ray_pool_init().
+ * Returns RAY_OK only when the pool is actually ready (state==2); if a
+ * concurrent init fails or a destroy is in flight, this call waits and then
+ * performs the initialization itself instead of reporting a false success. */
+ray_err_t ray_pool_init(uint32_t n_workers) {
+    for (unsigned spin_count = 1; ; spin_count++) {
+        uint32_t expected = 0;
+        if (atomic_compare_exchange_strong_explicit(&g_pool_init_state, &expected, 1,
+                                                    memory_order_acq_rel,
+                                                    memory_order_acquire))
+            break;  /* 0 -> 1: this thread owns initialization */
+        if (expected == 2) return RAY_OK;  /* already initialized */
+        /* expected is 1 (initializing) or 3 (destroying): wait and re-check */
+#if defined(__x86_64__) || defined(__i386__)
+        __builtin_ia32_pause();
+#elif defined(__aarch64__)
+        __asm__ volatile("yield" ::: "memory");
+#endif
+        if (spin_count % 1024 == 0) sched_yield();
+    }
+    ray_err_t err = ray_pool_create(&g_pool, n_workers);
+    /* Publish the outcome: 2 = ready, 0 = failed (a later call may retry). */
+    atomic_store_explicit(&g_pool_init_state, err == RAY_OK ? 2u : 0u,
+                          memory_order_release);
+    return err;
+}
+
+void ray_pool_destroy(void) {
+    /* Tear down only if we win the 2 (ready) -> 3 (destroying) transition. */
+    uint32_t ready = 2;
+    if (atomic_compare_exchange_strong_explicit(&g_pool_init_state, &ready, 3,
+                                                memory_order_acq_rel, memory_order_acquire)) {
+        ray_pool_free(&g_pool);
+        /* Back to 0 (uninitialized) so the pool can be created again. */
+        atomic_store_explicit(&g_pool_init_state, 0, memory_order_release);
+    }
+}
+
+void ray_cancel(void) {
+    /* Flag a live pool only; ray_pool_get() would lazily create one here. */
+    if (atomic_load_explicit(&g_pool_init_state, memory_order_acquire) == 2)
+        atomic_store_explicit(&g_pool.cancelled, 1, memory_order_release);
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/pool.h b/crates/rayforce-sys/vendor/rayforce/src/core/pool.h
new file mode 100644
index 0000000..3252755
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/pool.h
@@ -0,0 +1,95 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_POOL_H
+#define RAY_POOL_H
+
+/*
+ * pool.h -- Persistent thread pool for parallel morsel dispatch.
+ *
+ * Workers sleep on a semaphore and wake when ray_pool_dispatch() submits tasks.
+ * The main thread participates as worker 0 (no thread spawned for it).
+ * Each worker initializes its own thread-local heap via ray_heap_init().
+ */
+
+#include "core/platform.h"
+#include "ops/ops.h"
+
+/* Callback: process elements [start, end) with the given worker_id */
+typedef void (*ray_pool_fn)(void* ctx, uint32_t worker_id, int64_t start, int64_t end);
+
+/* A single work item in the task ring */
+typedef struct {
+    ray_pool_fn  fn;       /* callback to run */
+    void*       ctx;      /* passed through to fn unchanged */
+    int64_t     start;    /* first element (inclusive) */
+    int64_t     end;      /* one past the last element (exclusive) */
+} ray_pool_task_t;
+
+/* Thread pool */
+struct ray_pool {
+    ray_thread_t*       threads;       /* worker thread handles [n_workers] */
+    uint32_t           n_workers;     /* number of background threads (nproc - 1) */
+    _Atomic(uint32_t)  shutdown;      /* set to 1 to make workers exit their loop */
+
+    /* SPMC task ring (single producer = main, multi consumer = workers + main) */
+    ray_pool_task_t*    tasks;         /* ring buffer [task_cap] */
+    uint32_t           task_cap;      /* power of 2 */
+    uint32_t           task_head;     /* next to write (main only, no atomic needed) */
+    _Atomic(uint32_t)  task_tail;     /* next to claim (workers and main, atomic_fetch_add) */
+    _Atomic(uint32_t)  task_count;    /* total tasks submitted this dispatch */
+
+    /* Barrier */
+    _Atomic(uint32_t)  pending;       /* tasks not yet finished; dispatch spins until 0 */
+    ray_sem_t           work_ready;    /* workers sleep here */
+
+    /* Query cancellation — set by ray_cancel(), checked per-morsel */
+    _Atomic(uint32_t)  cancelled;
+};
+
+/* Total workers = n_workers + 1 (main thread is worker 0) */
+#define ray_pool_total_workers(p) ((p)->n_workers + 1)
+
+/* Initialize pool with n_workers background threads (0 = auto-detect,
+ * nproc - 1). Returns RAY_OK, or an error after releasing partial state. */
+ray_err_t ray_pool_create(ray_pool_t* pool, uint32_t n_workers);
+
+/* Shut down workers, free all resources and zero *pool; NULL is a no-op */
+void ray_pool_free(ray_pool_t* pool);
+
+/* Dispatch fn over [0, total_elems) partitioned into morsel-sized tasks.
+ * Blocks until all tasks complete. Main thread participates as worker 0. */
+void ray_pool_dispatch(ray_pool_t* pool, ray_pool_fn fn, void* ctx, int64_t total_elems);
+
+/* Dispatch exactly n_tasks tasks, each with range [i, i+1).
+ * Used for partition-parallel workloads where each task is one partition. */
+void ray_pool_dispatch_n(ray_pool_t* pool, ray_pool_fn fn, void* ctx, uint32_t n_tasks);
+
+/* Global pool (lazy singleton): created on first use; may return NULL */
+ray_pool_t* ray_pool_get(void);
+
+/* Public pool init/destroy (moved from rayforce.h) */
+ray_err_t ray_pool_init(uint32_t n_workers);
+void     ray_pool_destroy(void);
+
+#endif /* RAY_POOL_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/profile.h b/crates/rayforce-sys/vendor/rayforce/src/core/profile.h
new file mode 100644
index 0000000..e89c495
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/profile.h
@@ -0,0 +1,161 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_PROFILE_H
+#define RAY_PROFILE_H
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#if defined(RAY_OS_WINDOWS)
+#include <windows.h>
+#else
+#include <time.h>
+/* clock_gettime / CLOCK_MONOTONIC may be hidden under strict -std=c17 without
+ * _POSIX_C_SOURCE; declare them by hand.  NOTE(review): clock id 1 is assumed — confirm per target. */
+#ifndef CLOCK_MONOTONIC
+#define CLOCK_MONOTONIC 1
+int clock_gettime(int clk_id, struct timespec *tp);
+#endif
+#endif
+
+/* ===== Span-based execution profiler =====
+ *
+ * Zero overhead when inactive — every call guards on g_ray_profile.active.
+ * Activated by REPL :t command; lives entirely outside hot morsel loops.
+ */
+
+#define RAY_PROFILE_SPANS_MAX 2048
+
+typedef enum {
+    RAY_PROF_SPAN_START,   /* recorded by ray_profile_span_start */
+    RAY_PROF_SPAN_END,     /* recorded by ray_profile_span_end */
+    RAY_PROF_SPAN_TICK     /* recorded by ray_profile_tick */
+} ray_prof_span_type_t;
+
+typedef struct {
+    ray_prof_span_type_t type; /* START / END / TICK marker kind */
+    const char*          msg;  /* caller's string pointer, stored without copying */
+    int64_t              ts;   /* nanoseconds (monotonic) */
+} ray_prof_span_t;
+
+/* Progress callback — set by REPL to render progress bar.
+ * Called at morsel boundaries; receives done/total/label. */
+typedef void (*ray_progress_fn)(int64_t done, int64_t total, const char* label);
+
+typedef struct {
+    bool              active;               /* gate: span/tick/progress helpers return early when false */
+    int32_t           n;                    /* spans[] slots used so far; recording stops at RAY_PROFILE_SPANS_MAX */
+    /* Progress tracking */
+    int64_t           progress_total;       /* set by ray_profile_progress_begin */
+    int64_t           progress_done;        /* accumulated by ray_profile_progress_advance */
+    const char*       progress_label;       /* caller's pointer, stored without copying */
+    int64_t           progress_last_render; /* ns timestamp of last render */
+    ray_progress_fn   progress_cb;          /* set by REPL; NULL = no-op  */
+    ray_prof_span_t   spans[RAY_PROFILE_SPANS_MAX];
+} ray_profile_t;
+
+/* Single global instance */
+extern ray_profile_t g_ray_profile;
+
+static inline int64_t ray_profile_now_ns(void) {
+#ifndef RAY_OS_WINDOWS
+    struct timespec now;   /* CLOCK_MONOTONIC reading, folded into a single ns count below */
+    clock_gettime(CLOCK_MONOTONIC, &now);
+    return (int64_t)now.tv_nsec + (int64_t)now.tv_sec * 1000000000LL;
+#else
+    LARGE_INTEGER ticks_per_sec, ticks;   /* performance counter: ticks / frequency = seconds */
+    QueryPerformanceFrequency(&ticks_per_sec);
+    QueryPerformanceCounter(&ticks);
+    return (int64_t)((double)ticks.QuadPart / (double)ticks_per_sec.QuadPart * 1e9);
+#endif
+}
+
+static inline void ray_profile_reset(void) {
+    ray_profile_t* prof = &g_ray_profile;   /* `active` and `progress_cb` are left as they are */
+    prof->n = 0;
+    prof->progress_label = NULL;
+    prof->progress_total = prof->progress_done = 0;
+    prof->progress_last_render = 0;
+}
+
+static inline void ray_profile_span_start(const char* name) {
+    /* Append a START marker; silently dropped when inactive or the span table is full. */
+    if (!g_ray_profile.active || g_ray_profile.n >= RAY_PROFILE_SPANS_MAX) return;
+    ray_prof_span_t* slot = g_ray_profile.spans + g_ray_profile.n++;
+    slot->ts   = ray_profile_now_ns();
+    slot->msg  = name;
+    slot->type = RAY_PROF_SPAN_START;
+}
+
+static inline void ray_profile_span_end(const char* name) {
+    /* Append an END marker; silently dropped when inactive or the span table is full. */
+    if (!g_ray_profile.active || g_ray_profile.n >= RAY_PROFILE_SPANS_MAX) return;
+    ray_prof_span_t* slot = g_ray_profile.spans + g_ray_profile.n++;
+    slot->ts   = ray_profile_now_ns();
+    slot->msg  = name;
+    slot->type = RAY_PROF_SPAN_END;
+}
+
+static inline void ray_profile_tick(const char* msg) {
+    /* Append a TICK marker; silently dropped when inactive or the span table is full. */
+    if (!g_ray_profile.active || g_ray_profile.n >= RAY_PROFILE_SPANS_MAX) return;
+    ray_prof_span_t* slot = g_ray_profile.spans + g_ray_profile.n++;
+    slot->ts   = ray_profile_now_ns();
+    slot->msg  = msg;
+    slot->type = RAY_PROF_SPAN_TICK;
+}
+
+/* Progress bar — called between morsels / pipeline stages */
+static inline void ray_profile_progress_begin(const char* label, int64_t total) {
+    if (!g_ray_profile.active) return;    /* profiler off: progress state is left untouched */
+    g_ray_profile.progress_done  = 0;
+    g_ray_profile.progress_total = total;
+    g_ray_profile.progress_label = label; /* pointer is stored as-is, no copy */
+}
+
+/* Throttled: renders at most every 100ms to avoid terminal spam */
+#define RAY_PROGRESS_RENDER_INTERVAL_NS (100 * 1000000LL)
+
+static inline void ray_profile_progress_advance(int64_t delta) {
+    if (!g_ray_profile.active) return;
+    g_ray_profile.progress_done += delta;
+    /* No renderer installed, or no positive total: the counter is still kept current. */
+    if (!g_ray_profile.progress_cb || g_ray_profile.progress_total <= 0) return;
+    int64_t now_ns = ray_profile_now_ns();
+    int64_t since_render = now_ns - g_ray_profile.progress_last_render;
+    if (since_render <= RAY_PROGRESS_RENDER_INTERVAL_NS) return;   /* throttled */
+    g_ray_profile.progress_last_render = now_ns;
+    g_ray_profile.progress_cb(g_ray_profile.progress_done,
+                              g_ray_profile.progress_total,
+                              g_ray_profile.progress_label);
+}
+
+static inline void ray_profile_progress_end(void) {
+    if (!g_ray_profile.active) return;
+    /* Drop the label and zero both counters; progress_cb and last_render are not touched. */
+    g_ray_profile.progress_total = g_ray_profile.progress_done = 0;
+    g_ray_profile.progress_label = NULL;
+}
+
+#endif /* RAY_PROFILE_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/progress.c b/crates/rayforce-sys/vendor/rayforce/src/core/progress.c
new file mode 100644
index 0000000..0b00f0a
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/progress.c
@@ -0,0 +1,170 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+/*   Pull-based progress reporting. Zero cost when no callback is
+ *   registered; single main-thread pointer/int stores at sync points
+ *   otherwise. Workers never touch this state.
+ */
+
+#if !defined(_WIN32) && !defined(_POSIX_C_SOURCE)
+#define _POSIX_C_SOURCE 200809L
+#endif
+
+#include "rayforce.h"
+#include "mem/heap.h"
+#include <time.h>
+#include <string.h>
+
+static ray_progress_cb g_cb;
+static void*           g_user;
+static uint64_t        g_min_ms = 2000;
+static uint64_t        g_tick_ms = 100;
+
+/* Active-query state — only touched by the main executor thread.
+ * A dedicated thread would need atomics, but since every writer is
+ * the main thread we can use plain loads/stores. */
+static const char* g_op_name;
+static const char* g_phase;
+static uint64_t    g_rows_done;
+static uint64_t    g_rows_total;
+static uint64_t    g_start_ns;
+static uint64_t    g_last_fire_ns;
+static bool        g_showing;
+
+static inline uint64_t mono_ns(void) {
+    struct timespec now;   /* coarse monotonic clock when the platform defines one, else plain monotonic */
+#ifndef CLOCK_MONOTONIC_COARSE
+    clock_gettime(CLOCK_MONOTONIC, &now);
+#else
+    clock_gettime(CLOCK_MONOTONIC_COARSE, &now);
+#endif
+    return (uint64_t)now.tv_nsec + (uint64_t)now.tv_sec * 1000000000ull;
+}
+
+void ray_progress_set_callback(ray_progress_cb cb, void* user,
+                                uint64_t min_ms, uint64_t tick_interval_ms) {
+    g_user = user;
+    g_cb   = cb;                       /* NULL cb: update/label return early, nothing is reported */
+    g_min_ms  = min_ms ? min_ms : g_min_ms;                          /* 0 = keep current value */
+    g_tick_ms = tick_interval_ms ? tick_interval_ms : g_tick_ms;     /* 0 = keep current value */
+}
+
+static void fire(uint64_t now_ns, bool final) {
+    ray_mem_stats_t heap;                 /* filled by ray_mem_stats(); feeds mem_used below */
+    ray_mem_stats(&heap);
+    ray_progress_t snapshot = {
+        .final       = final,
+        .op_name     = g_op_name ? g_op_name : "",   /* callback never receives NULL strings */
+        .phase       = g_phase ? g_phase : "",
+        .rows_total  = g_rows_total,
+        .rows_done   = g_rows_done,
+        .elapsed_sec = (double)(now_ns - g_start_ns) / 1e9,
+        .mem_budget  = ray_mem_budget(),
+        .mem_used    = (int64_t)(heap.bytes_allocated + heap.direct_bytes),
+    };
+    g_cb(&snapshot, g_user);
+    g_showing = true;                     /* a frame has been delivered for this query */
+    g_last_fire_ns = now_ns;
+}
+
+void ray_progress_update(const char* op_name, const char* phase,
+                         uint64_t rows_done, uint64_t rows_total) {
+    if (!g_cb) return;
+
+    /* g_start_ns == 0 means "no query in flight" (ray_progress_end
+     * zeroes it), so the first update of a query doubles as the begin
+     * hook and stamps the start time. */
+    if (g_start_ns == 0) {
+        g_showing = false;
+        g_last_fire_ns = 0;
+        g_start_ns = mono_ns();
+    }
+
+    /* Labels are sticky: NULL leaves the previous name/phase alone so
+     * callers can tick without relabeling.  Row counters are never
+     * sticky — they are stored unconditionally, because 0 is a real
+     * value and a stale total from the prior op/phase would make the
+     * next phase render wrong percentages. */
+    g_rows_total = rows_total;
+    g_rows_done  = rows_done;
+    if (phase)   g_phase = phase;
+    if (op_name) g_op_name = op_name;
+
+    const uint64_t now = mono_ns();
+    if ((now - g_start_ns) / 1000000ull < g_min_ms) return;   /* query still younger than min_ms */
+
+    /* Nothing fired yet: count a full tick as elapsed so the first frame is not throttled. */
+    uint64_t ms_since_fire = g_last_fire_ns ? (now - g_last_fire_ns) / 1000000ull : g_tick_ms;
+    if (ms_since_fire < g_tick_ms) return;
+
+    fire(now, false);
+}
+
+void ray_progress_label(const char* op_name, const char* phase) {
+    if (!g_cb) return;
+    if (g_start_ns == 0) {               /* first call of a query: stamp the start time */
+        g_showing = false;
+        g_last_fire_ns = 0;
+        g_start_ns = mono_ns();
+    }
+    /* Entering a new op.  The name is sticky (NULL keeps the old one),
+     * but the phase is replaced unconditionally: a leftover phase such
+     * as "pivot: dedupe" from the previous op must not show up under
+     * the next op.  Pass NULL when the new op has no phase. */
+    if (op_name) g_op_name = op_name;
+    g_phase = phase;
+    /* Counters restart at zero so an op that has not yet reported a
+     * row total gets an indeterminate bar rather than inheriting the
+     * previous op's percentages; the op's first ray_progress_update
+     * supplies real numbers. */
+    g_rows_total = 0;
+    g_rows_done = 0;
+
+    const uint64_t now = mono_ns();
+    if ((now - g_start_ns) / 1000000ull < g_min_ms) return;
+    /* Nothing fired yet: count a full tick as elapsed so the first frame is not throttled. */
+    uint64_t ms_since_fire = g_last_fire_ns ? (now - g_last_fire_ns) / 1000000ull : g_tick_ms;
+    if (ms_since_fire < g_tick_ms) return;
+    fire(now, false);
+}
+
+void ray_progress_end(void) {
+    if (!g_cb) {                      /* nobody to notify: only re-arm the lazy start clock */
+        g_start_ns = 0;
+        return;
+    }
+    /* Emit the closing frame only when a bar was actually drawn, so
+     * short queries stay silent.  A known total is pinned to 100%. */
+    if (g_showing) {
+        if (g_rows_total) g_rows_done = g_rows_total;
+        fire(mono_ns(), true);
+    }
+    /* Back to idle: the next update/label call starts a fresh query. */
+    g_showing = false;
+    g_last_fire_ns = 0;
+    g_start_ns = 0;
+    g_rows_total = 0;
+    g_rows_done = 0;
+    g_phase = NULL;
+    g_op_name = NULL;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/runtime.c b/crates/rayforce-sys/vendor/rayforce/src/core/runtime.c
new file mode 100644
index 0000000..0aeab15
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/runtime.c
@@ -0,0 +1,367 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "runtime.h"
+#include "mem/heap.h"
+#include "mem/sys.h"
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <errno.h>
+#ifdef RAY_OS_WINDOWS
+#include <windows.h>
+#else
+#include <unistd.h>
+#endif
+
+/* Forward-declare lang init/destroy to avoid eval.h ray_vm_t conflict */
+extern ray_err_t ray_lang_init(void);
+extern void      ray_lang_destroy(void);
+
+/* ===== Global state ===== */
+
+ray_runtime_t *__RUNTIME = NULL;
+_Thread_local ray_vm_t *__VM = NULL;
+
+/* Static null singleton — type RAY_NULL, ARENA flag makes retain/release no-ops */
+ray_t __ray_null = { .type = RAY_NULL, .attrs = RAY_ATTR_ARENA, .rc = 0, .len = 0 };
+
+/* Static last-resort OOM error — used when ray_error itself can't allocate
+ * the small block it needs to construct a fresh error.  Tagged with
+ * RAY_ATTR_ARENA so retain/release are no-ops, matching __ray_null.  Code
+ * is "oom" inline (slen=3).  Per-VM message is dropped under deep OOM since
+ * we have no heap to format anything new — callers get the bare type/code.
+ *
+ * Without this sentinel, hard OOM (heap can't satisfy even the 32-byte
+ * error header) makes ray_error return NULL, which silently bypasses
+ * every `if (RAY_IS_ERR(x)) return x;` guard upstream and reintroduces
+ * exactly the silent-failure pathology the wrapper-level fixes were
+ * meant to close. */
+ray_t __ray_oom = {
+    .type  = RAY_ERROR,
+    .attrs = RAY_ATTR_ARENA,
+    .rc    = 0,
+    /* slen / sdata share a union with len / i64 / etc. (bytes 16-31) —
+     * pick one designated path for that union or clang's
+     * -Winitializer-overrides flags it under -Werror. */
+    .slen  = 3,
+    .sdata = { 'o', 'o', 'm', 0, 0, 0, 0 },
+};
+
+/* ===== Error code to string ===== */
+
+const char* ray_err_code_str(ray_err_t e) {
+    /* Short name for an error code.  Never NULL: out-of-range or unmapped codes yield "error". */
+    static const char* codes[] = {
+        [RAY_OK]          = "ok",
+        [RAY_ERR_OOM]     = "oom",
+        [RAY_ERR_TYPE]    = "type",
+        [RAY_ERR_RANGE]   = "range",
+        [RAY_ERR_LENGTH]  = "length",
+        [RAY_ERR_RANK]    = "rank",
+        [RAY_ERR_DOMAIN]  = "domain",
+        [RAY_ERR_NYI]     = "nyi",
+        [RAY_ERR_IO]      = "io",
+        [RAY_ERR_SCHEMA]  = "schema",
+        [RAY_ERR_CORRUPT] = "corrupt",
+        [RAY_ERR_CANCEL]  = "cancel",
+        [RAY_ERR_PARSE]   = "parse",
+        [RAY_ERR_NAME]    = "name",
+        [RAY_ERR_LIMIT]   = "limit",
+        /* "reserve" (not "reserved") because the err->sdata inline field
+         * is capped at 7 bytes — the past-tense form would truncate. */
+        [RAY_ERR_RESERVED] = "reserve",
+    };
+    return ((unsigned)e < sizeof(codes)/sizeof(codes[0]) && codes[e]) ? codes[e] : "error";
+}
+
+ray_err_t ray_err_from_obj(ray_t* err) {
+    /* Map an error object's inline code text back to its enum value. */
+    if (!err || err->type != RAY_ERROR) return RAY_ERR_DOMAIN;
+    static const struct { const char* text; int len; ray_err_t code; } known[] = {
+        {"oom",     3, RAY_ERR_OOM},     {"type",    4, RAY_ERR_TYPE},
+        {"range",   5, RAY_ERR_RANGE},   {"length",  6, RAY_ERR_LENGTH},
+        {"rank",    4, RAY_ERR_RANK},    {"domain",  6, RAY_ERR_DOMAIN},
+        {"nyi",     3, RAY_ERR_NYI},     {"io",      2, RAY_ERR_IO},
+        {"schema",  6, RAY_ERR_SCHEMA},  {"corrupt", 7, RAY_ERR_CORRUPT},
+        {"cancel",  6, RAY_ERR_CANCEL},  {"parse",   5, RAY_ERR_PARSE},
+        {"name",    4, RAY_ERR_NAME},    {"limit",   5, RAY_ERR_LIMIT},
+        {"reserve", 7, RAY_ERR_RESERVED},
+    };
+    for (size_t k = 0; k < sizeof(known)/sizeof(known[0]); k++)
+        if (err->slen == known[k].len &&
+            memcmp(err->sdata, known[k].text, (size_t)known[k].len) == 0) return known[k].code;
+    return RAY_ERR_DOMAIN;   /* non-error input and unrecognised codes share this result */
+}
+
+/* ===== Error API ===== */
+
+static ray_t* ray_verror(const char* code, const char* fmt, va_list ap) {
+    /* Populate / clear the per-VM message buffer FIRST.  On the deep-OOM
+     * path below we return the static __ray_oom sentinel, but that path
+     * still has to leave __VM->err.msg consistent with this call —
+     * otherwise ray_error_msg() returns text from whatever earlier error
+     * happened to land in the buffer last, which a user would naturally
+     * read as the message for THIS error.  The vsnprintf target is a
+     * fixed-size member of __VM (allocated at runtime-init), so this
+     * step does not depend on the heap and stays valid even when
+     * ray_alloc below fails. */
+    if (__VM) {
+        if (fmt) vsnprintf(__VM->err.msg, sizeof(__VM->err.msg), fmt, ap);
+        else     __VM->err.msg[0] = '\0';
+    }
+
+    ray_t* err = ray_alloc(0);
+    if (!err) return &__ray_oom;  /* sentinel — see __ray_oom comment */
+    err->type = RAY_ERROR;
+    err->slen = 0;
+    memset(err->sdata, 0, 7);
+    if (code) {
+        size_t len = strlen(code);
+        if (len > 7) len = 7;
+        memcpy(err->sdata, code, len);
+        err->slen = (uint8_t)len;
+    }
+    return err;
+}
+
+ray_t* ray_error(const char* code, const char* fmt, ...) {
+    if (fmt) {
+        va_list ap;
+        va_start(ap, fmt);
+        ray_t* err = ray_verror(code, fmt, ap);
+        va_end(ap);
+        return err;
+    }
+    /* No format string — skip va_list entirely for portability.  Clear
+     * the per-VM message buffer FIRST so the deep-OOM sentinel path
+     * doesn't leave stale text from an earlier error visible. */
+    if (__VM) __VM->err.msg[0] = '\0';
+    ray_t* err = ray_alloc(0);
+    if (!err) return &__ray_oom;  /* sentinel — see __ray_oom comment */
+    err->type = RAY_ERROR;
+    err->slen = 0;
+    memset(err->sdata, 0, 7);
+    if (code) {
+        size_t len = strlen(code);
+        if (len > 7) len = 7;
+        memcpy(err->sdata, code, len);
+        err->slen = (uint8_t)len;
+    }
+    return err;
+}
+
+void ray_error_free(ray_t* err) {
+    /* Three inputs are deliberately ignored:
+     *   - NULL and non-error values: callers often hand over a result
+     *     that might be either an error or a real value;
+     *   - the static OOM sentinel: it lives in BSS, not the heap, and
+     *     freeing it would corrupt the buddy allocator's bookkeeping. */
+    if (!err || !RAY_IS_ERR(err) || err == RAY_OOM_OBJ) return;
+    /* ray_free and ray_release_owned_refs both bail out on RAY_IS_ERR
+     * (the refcount system deliberately does not track error objects),
+     * so disguise the block as a leaf atom (-RAY_I64): no owned
+     * children, hence the safest shape for the standard free path.
+     * rc is already 1 from ray_alloc, so ray_free hands the block back
+     * to the buddy allocator.  The caller must not touch err afterwards. */
+    err->type = -RAY_I64;
+    ray_free(err);
+}
+
+const char* ray_err_code(ray_t* err) {
+    if (!err || err->type != RAY_ERROR) return NULL;
+    static _Thread_local char buf[8];   /* sdata is 7 bytes, unterminated when full: copy + NUL */
+    size_t n = err->slen < sizeof(buf) ? (size_t)err->slen : sizeof(buf) - 1;  /* clamp a bad slen */
+    memcpy(buf, err->sdata, n);
+    buf[n] = '\0';
+    return buf;                         /* valid until this thread's next call */
+}
+
+const char* ray_error_msg(void) {
+    /* NULL when this thread has no VM or its detail buffer is empty. */
+    return (__VM && __VM->err.msg[0]) ? __VM->err.msg : NULL;
+}
+
+void ray_error_clear(void) {
+    if (__VM != NULL) *__VM->err.msg = '\0';   /* empty string = "no detail"; no-op without a VM */
+}
+
+/* ===== Lifecycle ===== */
+
+/* Shared body of the ray_runtime_create* entry points.  sym_path (may be
+ * NULL) names a persisted sym table to load; out_sym_err (may be NULL)
+ * receives that load's outcome.  Returns NULL if the runtime or main VM
+ * cannot be allocated — after undoing the heap/sym init done here. */
+static ray_runtime_t* runtime_create_impl(const char* sym_path,
+                                           ray_err_t* out_sym_err) {
+    if (out_sym_err) *out_sym_err = RAY_OK;
+
+    /* Init subsystems */
+    ray_heap_init();
+    ray_sym_init();
+
+    /* Allocate runtime and set __VM + mem_budget BEFORE any file I/O so
+     * that ray_error() has a live VM to record diagnostics against and
+     * allocations are bounded by the budget.  Each failure exit below
+     * also undoes the subsystem init so a retried create starts clean. */
+    ray_runtime_t* rt = (ray_runtime_t*)ray_sys_alloc(sizeof(ray_runtime_t));
+    if (!rt) { ray_sym_destroy(); ray_heap_destroy(); return NULL; }
+    memset(rt, 0, sizeof(*rt));
+
+    /* Create main VM (id=0) */
+    rt->n_vms = 1;
+    rt->vms = (ray_vm_t**)ray_sys_alloc(sizeof(ray_vm_t*));
+    if (!rt->vms) { ray_sys_free(rt); ray_sym_destroy(); ray_heap_destroy(); return NULL; }
+    rt->vms[0] = (ray_vm_t*)ray_sys_alloc(sizeof(ray_vm_t));
+    if (!rt->vms[0]) { ray_sys_free(rt->vms); ray_sys_free(rt); ray_sym_destroy(); ray_heap_destroy(); return NULL; }
+    memset(rt->vms[0], 0, sizeof(ray_vm_t));
+    rt->vms[0]->id = 0;
+    __VM = rt->vms[0];
+
+    /* Detect memory budget: 80% of physical RAM */
+#ifdef RAY_OS_WINDOWS
+    MEMORYSTATUSEX ms;
+    ms.dwLength = sizeof(ms);
+    if (GlobalMemoryStatusEx(&ms))
+        rt->mem_budget = (int64_t)(ms.ullTotalPhys * 0.8);
+    else
+        rt->mem_budget = (int64_t)(4ULL << 30);
+#else
+    long pages = sysconf(_SC_PHYS_PAGES);
+    long psize = sysconf(_SC_PAGESIZE);
+    if (pages > 0 && psize > 0)
+        rt->mem_budget = (int64_t)((double)pages * (double)psize * 0.8);
+    else
+        rt->mem_budget = (int64_t)(4ULL << 30);
+#endif
+
+    /* __RUNTIME must be visible before ray_sym_load so mem_budget checks
+     * and ray_error() both operate against the live runtime. */
+    __RUNTIME = rt;
+
+    /* Load persisted symbol table BEFORE ray_lang_init interns builtins.
+     * Ordering: __VM + mem_budget are live so file I/O errors surface via
+     * ray_error() and allocations are budget-bounded.  Still before
+     * ray_lang_init so persisted user symbol IDs keep their slots and
+     * builtins append afterwards. */
+    if (sym_path) {
+        /* Pre-flight size check: refuse a file that would blow past the
+         * memory budget before ray_col_load is ever touched.  ENOENT is the
+         * normal first-run case and stays RAY_OK; any other stat failure
+         * (EACCES, ENOTDIR, EIO, ...) is surfaced as RAY_ERR_IO so the caller
+         * does not silently continue with an empty sym table. */
+        struct stat st;
+        if (stat(sym_path, &st) == 0) {
+            /* Allow the sym file plus 2x working headroom: a well-formed
+             * file is a list of interned strings, so its in-memory
+             * footprint is the file size times a small constant. */
+            if (st.st_size > 0 &&
+                (int64_t)st.st_size > rt->mem_budget / 2) {
+                if (out_sym_err) *out_sym_err = RAY_ERR_OOM;
+                /* Continue startup with empty sym table; caller decides
+                 * whether to treat this as fatal. */
+            } else {
+                ray_err_t sym_err = ray_sym_load(sym_path);
+                if (out_sym_err) *out_sym_err = sym_err;
+                /* RAY_ERR_CORRUPT and I/O errors are non-fatal here:
+                 * caller inspects out_sym_err to decide recovery. */
+            }
+        } else if (errno != ENOENT) {
+            if (out_sym_err) *out_sym_err = RAY_ERR_IO;
+        }
+        /* ENOENT: out_sym_err stays RAY_OK (absent file = normal first run). */
+    }
+
+    /* Init language (env + builtins) — must be after __VM is set and
+     * after sym_load so persisted user IDs keep their slots.
+     * NOTE(review): the ray_err_t result is ignored here — confirm intended. */
+    ray_lang_init();
+
+    return rt;
+}
+
+ray_runtime_t* ray_runtime_create(int argc, char** argv) {
+    (void)argc; (void)argv;                   /* accepted but unused */
+    return runtime_create_impl(NULL, NULL);   /* no sym file, so no load result to report */
+}
+
+ray_runtime_t* ray_runtime_create_with_sym(const char* sym_path) {
+    return runtime_create_impl(sym_path, NULL);   /* sym-load result is discarded */
+}
+
+ray_runtime_t* ray_runtime_create_with_sym_err(const char* sym_path,
+                                               ray_err_t* out_sym_err) {
+    return runtime_create_impl(sym_path, out_sym_err);   /* *out_sym_err receives the sym-load result */
+}
+
+/* ===== Main event loop accessors =====
+ * The poll is opaque to runtime.h (stored as `void*`) so adding it
+ * doesn't drag poll.h into every TU that includes runtime.h.  Set
+ * once by main.c after ray_poll_create; read by runtime-level
+ * builtins (.sys.listen, .sys.cmd "listen N"). */
+
+void ray_runtime_set_poll(void* poll) {
+    if (__RUNTIME != NULL) __RUNTIME->poll = poll;   /* silently ignored while no runtime exists */
+}
+
+void* ray_runtime_get_poll(void) {
+    return !__RUNTIME ? NULL : __RUNTIME->poll;   /* NULL: no runtime, or no poll registered yet */
+}
+
+/* ===== Memory Budget API ===== */
+
+int64_t ray_mem_budget(void) {
+    return !__RUNTIME ? 0 : __RUNTIME->mem_budget;   /* 0 while no runtime exists */
+}
+
+bool ray_mem_pressure(void) {
+    if (__RUNTIME == NULL) return false;   /* no runtime, hence no budget to exceed */
+    ray_mem_stats_t heap;
+    ray_mem_stats(&heap);
+    return __RUNTIME->mem_budget < (int64_t)(heap.bytes_allocated + heap.direct_bytes);   /* strictly over budget */
+}
+
+void ray_runtime_destroy(ray_runtime_t* rt) {
+    if (!rt) return;
+
+    /* Teardown order: language layer first, then each VM (releasing the
+     * objects it still holds), and only afterwards the sym table and heap. */
+    ray_lang_destroy();
+    for (int32_t k = 0; k < rt->n_vms; k++) {
+        ray_vm_t* vm = rt->vms[k];
+        if (vm->raise_val) ray_release(vm->raise_val);
+        if (vm->trace) { ray_release(vm->trace); vm->trace = NULL; }
+        ray_sys_free(vm);
+    }
+    ray_sys_free(rt->vms);
+
+    /* Clear the global handles before the remaining subsystems go away. */
+    __RUNTIME = NULL;
+    __VM = NULL;
+    ray_sym_destroy();
+    ray_heap_destroy();
+
+    ray_sys_free(rt);
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/runtime.h b/crates/rayforce-sys/vendor/rayforce/src/core/runtime.h
new file mode 100644
index 0000000..5d8e509
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/runtime.h
@@ -0,0 +1,136 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+#ifndef RAY_RUNTIME_H
+#define RAY_RUNTIME_H
+
+#include <rayforce.h>
+
+/* ===== Error Info (per-VM, ephemeral) ===== */
+
+typedef struct {
+    char msg[256];   /* detail text: formatted by ray_error, read by ray_error_msg, "" = none */
+} ray_err_info_t;
+
+/* ===== Scope Frame (moved from env.c) ===== */
+
+#define RAY_SCOPE_CAP  64
+#define RAY_FRAME_CAP  64
+
+typedef struct {
+    int64_t keys[RAY_FRAME_CAP];   /* NOTE(review): presumably ids of the bound names — confirm in env.c */
+    ray_t*  vals[RAY_FRAME_CAP];   /* presumably paired with keys[] by index — confirm */
+    int32_t count;                 /* presumably slots in use (<= RAY_FRAME_CAP) — confirm */
+} ray_scope_frame_t;
+
+/* ===== VM sub-types ===== */
+
+#define RAY_VM_STACK_SIZE 1024
+#define RAY_VM_TRAP_SIZE  16
+
+typedef struct {
+    ray_t   *fn;
+    int32_t  fp;
+    int32_t  ip;
+} ray_vm_ctx_t;   /* element type of ray_vm_t.rs[]; NOTE(review): looks like a saved call context — confirm */
+
+typedef struct {
+    int32_t  rp;
+    int32_t  sp;
+    int32_t  handler_ip;
+    ray_t   *fn;
+    int32_t  fp;
+    int32_t  n_locals;
+} ray_vm_trap_t;   /* element type of ray_vm_t.ts[]; NOTE(review): looks like saved state for a handler — confirm */
+
+/* ===== Per-thread VM ===== */
+
+typedef struct {
+    /* hot path */
+    int32_t          sp;
+    int32_t          fp;
+    int32_t          rp;
+    int32_t          id;          /* 0 for the main VM created with the runtime */
+    ray_t           *fn;
+    void            *heap;
+    int32_t          tp;
+    /* stacks */
+    ray_t           *ps[RAY_VM_STACK_SIZE];
+    ray_vm_ctx_t     rs[RAY_VM_STACK_SIZE];
+    ray_vm_trap_t    ts[RAY_VM_TRAP_SIZE];
+    /* cold — error/debug */
+    ray_err_info_t   err;         /* detail message of the most recent ray_error on this VM */
+    ray_t           *nfo;
+    ray_t           *trace;       /* released by ray_runtime_destroy when non-NULL */
+    ray_t           *raise_val;   /* released by ray_runtime_destroy when non-NULL */
+    /* scope */
+    ray_scope_frame_t scope_stack[RAY_SCOPE_CAP];
+    int32_t          scope_depth;
+} ray_vm_t;
+
+/* ===== Runtime ===== */
+
+typedef struct ray_runtime_s {
+    ray_vm_t       **vms;          /* array of n_vms VM pointers; vms[0] is the main VM (id 0) */
+    int32_t          n_vms;        /* set to 1 when the runtime is created */
+    int64_t          mem_budget;   /* 80% of physical RAM, bytes */
+    void            *poll;         /* opaque ray_poll_t* — see ray_runtime_(set|get)_poll */
+} ray_runtime_t;
+
+/* Global runtime + per-thread VM */
+extern ray_runtime_t *__RUNTIME;
+extern _Thread_local ray_vm_t *__VM;
+
+/* Lifecycle */
+ray_runtime_t* ray_runtime_create(int argc, char** argv);
+void           ray_runtime_destroy(ray_runtime_t* rt);
+
+/* Main event-loop accessors.  The host (main.c) registers the poll it
+ * created; runtime-level builtins read it back through these to avoid
+ * pulling poll.h into runtime.h (and to keep TUs that include
+ * runtime.h decoupled from the eval-VM definition that conflicts with
+ * the unrelated `ray_vm_t` declared above). */
+void  ray_runtime_set_poll(void* poll);
+void* ray_runtime_get_poll(void);
+
+/* Persistent-consumer lifecycle: load the sym table from `sym_path` (if
+ * present) before builtins register, so user-interned IDs keep the same
+ * slots across process restarts.  The _err variant surfaces the load
+ * result via `out_sym_err` (RAY_OK / RAY_ERR_CORRUPT / I/O errors) so
+ * callers can decide recovery policy; the plain variant discards it. */
+ray_runtime_t* ray_runtime_create_with_sym(const char* sym_path);
+ray_runtime_t* ray_runtime_create_with_sym_err(const char* sym_path,
+                                               ray_err_t* out_sym_err);
+
+/* Error API — allocates ray_t with type=RAY_ERROR, sets __VM->err.msg */
+ray_t* ray_error(const char* code, const char* fmt, ...);
+/* Error code of a RAY_ERROR object as a NUL-terminated copy in a thread-local buffer (reused by the next call); NULL if not an error */
+const char* ray_err_code(ray_t* err);
+/* ray_error_free() is published in include/rayforce.h */
+
+/* Read VM error detail message (NULL if empty) */
+const char* ray_error_msg(void);
+
+/* Clear VM error detail */
+void ray_error_clear(void);
+
+#endif /* RAY_RUNTIME_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/sock.c b/crates/rayforce-sys/vendor/rayforce/src/core/sock.c
new file mode 100644
index 0000000..2983b9d
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/sock.c
@@ -0,0 +1,201 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_OS_WINDOWS
+  #define _GNU_SOURCE
+#endif
+
+#include "core/sock.h"
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+
+#ifdef RAY_OS_WINDOWS
+  #define WIN32_LEAN_AND_MEAN
+  #include <winsock2.h>
+  #include <ws2tcpip.h>
+#else
+  #include <fcntl.h>
+  #include <netdb.h>
+  #include <sys/socket.h>
+  #include <sys/time.h>
+  #include <arpa/inet.h>
+  #include <netinet/tcp.h>
+  #include <poll.h>
+  #include <unistd.h>
+#endif
+
+/* ===== Socket Implementation ===== */
+
+ray_sock_t ray_sock_listen(uint16_t port)
+{
+    /* Open an IPv4 TCP socket bound to every local interface on `port`
+     * and start listening.  Returns RAY_INVALID_SOCK on any failure. */
+    struct sockaddr_in sa;
+    int reuse = 1;
+    ray_sock_t lfd = (ray_sock_t)socket(AF_INET, SOCK_STREAM, 0);
+    if (lfd == RAY_INVALID_SOCK) return RAY_INVALID_SOCK;
+
+    /* Best effort: the result of enabling SO_REUSEADDR is not checked. */
+    setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, (const char*)&reuse, sizeof(reuse));
+
+    memset(&sa, 0, sizeof(sa));
+    sa.sin_family      = AF_INET;
+    sa.sin_port        = htons(port);
+    sa.sin_addr.s_addr = htonl(INADDR_ANY);
+
+    /* Backlog of 128; the socket is closed again if bind or listen fails. */
+    if (bind(lfd, (struct sockaddr*)&sa, sizeof(sa)) < 0 || listen(lfd, 128) < 0) {
+        ray_sock_close(lfd);
+        return RAY_INVALID_SOCK;
+    }
+    return lfd;
+}
+
+ray_sock_t ray_sock_accept(ray_sock_t srv)
+{
+    /* Accept one connection from `srv`, retrying while accept() is EINTR. */
+    ray_sock_t conn;
+    for (;;) {
+        conn = (ray_sock_t)accept(srv, NULL, NULL);
+        if (conn != RAY_INVALID_SOCK) break;
+        if (errno != EINTR) return RAY_INVALID_SOCK;
+    }
+    int nodelay = 1;   /* best effort: the setsockopt result is not checked */
+    setsockopt(conn, IPPROTO_TCP, TCP_NODELAY, (const char*)&nodelay, sizeof(nodelay));
+    return conn;
+}
+
+ray_sock_t ray_sock_connect(const char* host, uint16_t port, int timeout_ms)
+{
+    /* Connect to host:port over TCP, trying every resolved address in turn
+     * (ray_sock_listen binds IPv4 only, so an IPv6-first name must fall
+     * through).  Returns the socket, or RAY_INVALID_SOCK on failure. */
+    struct addrinfo hints, *res = NULL;
+    memset(&hints, 0, sizeof(hints));
+    hints.ai_family   = AF_UNSPEC;
+    hints.ai_socktype = SOCK_STREAM;
+
+    char port_str[8];
+    snprintf(port_str, sizeof(port_str), "%u", (unsigned)port);
+
+    if (getaddrinfo(host, port_str, &hints, &res) != 0 || !res)
+        return RAY_INVALID_SOCK;
+
+    ray_sock_t fd = RAY_INVALID_SOCK;
+    for (struct addrinfo* ai = res; ai; ai = ai->ai_next) {
+        fd = (ray_sock_t)socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
+        if (fd == RAY_INVALID_SOCK) continue;
+
+        /* Set send/recv timeout if requested */
+        if (timeout_ms > 0) {
+#ifdef RAY_OS_WINDOWS
+            DWORD tv = (DWORD)timeout_ms;
+            setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, (const char*)&tv, sizeof(tv));
+            setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, (const char*)&tv, sizeof(tv));
+#else
+            struct timeval tv;
+            tv.tv_sec  = timeout_ms / 1000;
+            tv.tv_usec = (timeout_ms % 1000) * 1000;
+            setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
+            setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv));
+#endif
+        }
+        if (connect(fd, ai->ai_addr, (socklen_t)ai->ai_addrlen) == 0) break;
+        ray_sock_close(fd);   /* this address failed — try the next one */
+        fd = RAY_INVALID_SOCK;
+    }
+    freeaddrinfo(res);
+    if (fd == RAY_INVALID_SOCK) return RAY_INVALID_SOCK;
+
+    int yes = 1;
+    setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (const char*)&yes, sizeof(yes));
+    return fd;
+}
+
+int64_t ray_sock_send(ray_sock_t s, const void* buf, size_t len)
+{
+    /* Write all `len` bytes of `buf` to `s`, looping over partial sends.
+     * Returns `len`, or -1 on an error other than EINTR/EAGAIN/EWOULDBLOCK. */
+    const uint8_t* cur  = (const uint8_t*)buf;
+    size_t         left = len;
+    while (left != 0) {
+#ifdef RAY_OS_WINDOWS
+        int sent = send(s, (const char*)cur, (int)left, 0);
+#else
+        ssize_t sent = send(s, cur, left, MSG_NOSIGNAL);
+#endif
+        if (sent >= 0) {
+            cur  += sent;
+            left -= (size_t)sent;
+            continue;
+        }
+        if (errno == EINTR) continue;   /* interrupted — just retry */
+        if (errno != EAGAIN && errno != EWOULDBLOCK) return -1;
+        /* Would block: wait for write-readiness before the next attempt */
+        struct pollfd pfd = { .fd = s, .events = POLLOUT };
+        poll(&pfd, 1, -1);
+    }
+    return (int64_t)len;
+}
+
+int64_t ray_sock_recv(ray_sock_t s, void* buf, size_t len)
+{
+    /* One recv() of up to `len` bytes, restarted only when interrupted.
+     * Returns the byte count (0 = peer closed) or -1 on any other error. */
+#ifdef RAY_OS_WINDOWS
+    int got;
+    do { got = recv(s, (char*)buf, (int)len, 0); }
+    while (got < 0 && errno == EINTR);
+#else
+    ssize_t got;
+    do { got = recv(s, buf, len, 0); }
+    while (got < 0 && errno == EINTR);
+#endif
+    return got < 0 ? -1 : (int64_t)got;
+}
+
+void ray_sock_close(ray_sock_t s)
+{
+    /* Release the socket handle; RAY_INVALID_SOCK is silently ignored. */
+#ifdef RAY_OS_WINDOWS
+    if (s != RAY_INVALID_SOCK) closesocket(s);
+#else
+    if (s != RAY_INVALID_SOCK) close(s);
+#endif
+}
+
+ray_err_t ray_sock_set_nonblocking(ray_sock_t s)
+{
+    /* Switch `s` to non-blocking mode.
+     * Returns RAY_OK, or RAY_ERR_IO when the OS call fails. */
+#ifdef RAY_OS_WINDOWS
+    u_long enable = 1;
+    return ioctlsocket(s, FIONBIO, &enable) == 0 ? RAY_OK : RAY_ERR_IO;
+#else
+    int cur = fcntl(s, F_GETFL, 0);
+    if (cur >= 0 && fcntl(s, F_SETFL, cur | O_NONBLOCK) >= 0)
+        return RAY_OK;
+    return RAY_ERR_IO;
+#endif
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/sock.h b/crates/rayforce-sys/vendor/rayforce/src/core/sock.h
new file mode 100644
index 0000000..0e6575c
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/sock.h
@@ -0,0 +1,47 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_SOCK_H
+#define RAY_SOCK_H
+
+#include <rayforce.h>
+
+/* ===== Socket Abstraction ===== */
+
+#ifdef RAY_OS_WINDOWS
+  typedef intptr_t ray_sock_t;
+  #define RAY_INVALID_SOCK ((ray_sock_t)-1)
+#else
+  typedef int ray_sock_t;
+  #define RAY_INVALID_SOCK (-1)
+#endif
+
+ray_sock_t ray_sock_listen(uint16_t port);
+ray_sock_t ray_sock_accept(ray_sock_t srv);
+ray_sock_t ray_sock_connect(const char* host, uint16_t port, int timeout_ms);
+int64_t    ray_sock_send(ray_sock_t s, const void* buf, size_t len);
+int64_t    ray_sock_recv(ray_sock_t s, void* buf, size_t len);
+void       ray_sock_close(ray_sock_t s);
+ray_err_t  ray_sock_set_nonblocking(ray_sock_t s);
+
+#endif /* RAY_SOCK_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/types.c b/crates/rayforce-sys/vendor/rayforce/src/core/types.c
new file mode 100644
index 0000000..e811bac
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/types.c
@@ -0,0 +1,57 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "core/types.h"
+
+/* Element sizes indexed by type tag.  Only vector types 0-13 have non-zero
+ * entries; RAY_SEL (14) and all remaining indices are zero (safe for non-vector types). */
+const uint8_t ray_type_sizes[256] = {
+    [0]  = 8,    /* RAY_LIST      — pointer-sized (ray_t*) */
+    [1]  = 1,    /* RAY_BOOL      */
+    [2]  = 1,    /* RAY_U8        */
+    [3]  = 2,    /* RAY_I16       */
+    [4]  = 4,    /* RAY_I32       */
+    [5]  = 8,    /* RAY_I64       */
+    [6]  = 4,    /* RAY_F32       */
+    [7]  = 8,    /* RAY_F64       */
+    [8]  = 4,    /* RAY_DATE      */
+    [9]  = 4,    /* RAY_TIME      */
+    [10] = 8,    /* RAY_TIMESTAMP */
+    [11] = 16,   /* RAY_GUID      */
+    [12] = 8,    /* RAY_SYM       — W64 default; narrow widths use ray_sym_elem_size */
+    [13] = 16,   /* RAY_STR       — sizeof(ray_str_t) */
+    [14] = 0,    /* RAY_SEL       — variable-size layout, no elem_size */
+};
+
+/* ===== Semantic Version API ===== */
+
+/* Stringify helpers to build version string from header macros */
+#define RAY_VER_STR_(x) #x
+#define RAY_VER_STR(x)  RAY_VER_STR_(x)
+#define RAY_VERSION_STRING_ \
+    RAY_VER_STR(RAY_VERSION_MAJOR) "." RAY_VER_STR(RAY_VERSION_MINOR) "." RAY_VER_STR(RAY_VERSION_PATCH)
+
+int  ray_version_major(void)         { return RAY_VERSION_MAJOR; }   /* major component from the header macro */
+int  ray_version_minor(void)         { return RAY_VERSION_MINOR; }   /* minor component from the header macro */
+int  ray_version_patch(void)         { return RAY_VERSION_PATCH; }   /* patch component from the header macro */
+const char* ray_version_string(void) { return RAY_VERSION_STRING_; } /* "MAJOR.MINOR.PATCH" string literal built at compile time */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/types.h b/crates/rayforce-sys/vendor/rayforce/src/core/types.h
new file mode 100644
index 0000000..18b5231
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/core/types.h
@@ -0,0 +1,45 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_TYPES_H
+#define RAY_TYPES_H
+
+/*
+ * types.h — Internal types header.
+ *
+ * The canonical type definitions (ray_t, type constants, attribute flags)
+ * live in <rayforce.h> (the public header).
+ * Internal .c files can include either rayforce.h directly or types.h.
+ */
+#include <rayforce.h>
+
+/* Number of types (positive range): must be > max type ID */
+#define RAY_TYPE_COUNT 15
+
+/* Type sizes lookup table (defined in types.c) */
+extern const uint8_t ray_type_sizes[256];
+
+/* Element size for a given type tag */
+#define ray_elem_size(t)  (ray_type_sizes[(t)])
+
+#endif /* RAY_TYPES_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/io/csv.c b/crates/rayforce-sys/vendor/rayforce/src/io/csv.c
new file mode 100644
index 0000000..499db1c
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/io/csv.c
@@ -0,0 +1,1821 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+/* ============================================================================
+ * csv.c — Fast parallel CSV reader
+ *
+ * Design:
+ *   1. mmap + MAP_POPULATE for zero-copy file access
+ *   2. memchr-based newline scan for row offset discovery
+ *   3. Single-pass: sample-based type inference, then parallel value parsing
+ *   4. Inline integer/float parsers (bypass strtoll/strtod overhead)
+ *   5. Parallel row parsing via ray_pool_dispatch
+ *   6. Per-worker local sym tables, merged post-parse on main thread
+ * ============================================================================ */
+
+#if defined(__linux__)
+  #define _GNU_SOURCE
+#endif
+
+#include "csv.h"
+#include "mem/heap.h"
+#include "mem/sys.h"
+#include "core/numparse.h"
+#include "core/pool.h"
+#include "lang/format.h"
+#include "ops/hash.h"
+#include "store/fileio.h"
+#include "table/sym.h"
+#include "vec/str.h"
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdarg.h>
+
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#ifndef RAY_OS_WINDOWS
+#include <unistd.h>
+#endif
+#include <sys/mman.h>
+
+/* --------------------------------------------------------------------------
+ * Constants
+ * -------------------------------------------------------------------------- */
+
+#define CSV_MAX_COLS      256
+#define CSV_SAMPLE_ROWS   100
+
+/* --------------------------------------------------------------------------
+ * mmap flags
+ * -------------------------------------------------------------------------- */
+
+#ifdef __linux__
+  #define MMAP_FLAGS (MAP_PRIVATE | MAP_POPULATE)
+#else
+  #define MMAP_FLAGS MAP_PRIVATE
+#endif
+
+/* --------------------------------------------------------------------------
+ * Scratch memory helpers (same pattern as exec.c).
+ * Uses ray_alloc/ray_free (buddy allocator) instead of malloc/free.
+ * -------------------------------------------------------------------------- */
+
+static inline void* scratch_alloc(ray_t** hdr_out, size_t nbytes) {
+    /* Header goes to *hdr_out (NULL on failure); returns the data pointer. */
+    ray_t* blk = ray_alloc(nbytes);
+    *hdr_out = blk;
+    return blk ? ray_data(blk) : NULL;
+}
+
+static inline void* scratch_realloc(ray_t** hdr_out, size_t old_bytes, size_t new_bytes) {
+    /* Allocate-copy-free resize; NULL (and *hdr_out untouched) on failure. */
+    ray_t* fresh = ray_alloc(new_bytes);
+    if (!fresh) return NULL;
+    ray_t* prev = *hdr_out;
+    if (prev) {
+        memcpy(ray_data(fresh), ray_data(prev), new_bytes < old_bytes ? new_bytes : old_bytes);
+        ray_free(prev);
+    }
+    *hdr_out = fresh;
+    return ray_data(fresh);
+}
+
+static inline void scratch_free(ray_t* hdr) {
+    if (hdr != NULL) ray_free(hdr);   /* a NULL header means nothing to release */
+}
+
+/* Hash uses wyhash from ops/hash.h (ray_hash_bytes) — much faster than FNV-1a
+ * for short strings typical in CSV columns. */
+
+/* String reference — raw pointer into mmap'd buffer + length.
+ * Used during parse phase; interned into sym table after parse. */
+typedef struct {
+    const char* ptr;   /* start of the field text; not NUL-terminated */
+    uint32_t    len;   /* field length in bytes */
+} csv_strref_t;
+
+/* --------------------------------------------------------------------------
+ * Type inference
+ * -------------------------------------------------------------------------- */
+
+typedef enum {
+    CSV_TYPE_UNKNOWN = 0,   /* empty field or null sentinel: no type evidence */
+    CSV_TYPE_BOOL,          /* 1 — promote_csv_type relies on BOOL < I64 < F64 */
+    CSV_TYPE_I64,           /* 2 */
+    CSV_TYPE_F64,           /* 3 */
+    CSV_TYPE_STR,           /* fallback when observed types cannot be reconciled */
+    CSV_TYPE_DATE,          /* YYYY-MM-DD */
+    CSV_TYPE_TIME,          /* HH:MM:SS[.fff] */
+    CSV_TYPE_TIMESTAMP,     /* date followed by a time of day */
+    CSV_TYPE_GUID           /* never returned by detect_type; presumably chosen by an explicit schema — verify */
+} csv_type_t;
+
+static csv_type_t detect_type(const char* f, size_t len) {
+    /* Classify one raw field (not NUL-terminated) for column type inference.
+     * Empty fields and null sentinels yield CSV_TYPE_UNKNOWN. */
+    if (len == 0) return CSV_TYPE_UNKNOWN;
+
+    /* Common null sentinel strings → UNKNOWN (will become NULL) */
+    if ((len == 3 && (memcmp(f, "N/A", 3) == 0 || memcmp(f, "n/a", 3) == 0)) ||
+        (len == 2 && (memcmp(f, "NA", 2) == 0 || memcmp(f, "na", 2) == 0)) ||
+        (len == 4 && (memcmp(f, "null", 4) == 0 || memcmp(f, "NULL", 4) == 0 ||
+                      memcmp(f, "None", 4) == 0 || memcmp(f, "none", 4) == 0)) ||
+        (len == 1 && f[0] == '.'))  /* bare dot — not a valid value */
+        return CSV_TYPE_UNKNOWN;
+
+    /* NaN/Inf literals → float */
+    if (len == 3) {
+        if ((f[0]=='n'||f[0]=='N') && (f[1]=='a'||f[1]=='A') && (f[2]=='n'||f[2]=='N'))
+            return CSV_TYPE_F64;
+        if ((f[0]=='i'||f[0]=='I') && (f[1]=='n'||f[1]=='N') && (f[2]=='f'||f[2]=='F'))
+            return CSV_TYPE_F64;
+    }
+    if ((len == 4 && (f[0]=='+' || f[0]=='-')) &&
+        (f[1]=='i'||f[1]=='I') && (f[2]=='n'||f[2]=='N') && (f[3]=='f'||f[3]=='F'))
+        return CSV_TYPE_F64;
+
+    /* Boolean */
+    if ((len == 4 && (memcmp(f, "true", 4) == 0 || memcmp(f, "TRUE", 4) == 0)) ||
+        (len == 5 && (memcmp(f, "false", 5) == 0 || memcmp(f, "FALSE", 5) == 0)))
+        return CSV_TYPE_BOOL;
+
+    /* Numeric scan: [sign] digits [. digits] [e [sign] digits].  `has_digit`
+     * is reset at the exponent marker so both the mantissa and the
+     * exponent must contain a digit ("e5", "1e", "1e5.2" are not numeric). */
+    const char* p = f;
+    const char* end = f + len;
+    if (*p == '-' || *p == '+') p++;
+    bool has_dot = false, has_e = false, has_digit = false;
+    while (p < end) {
+        unsigned char c = (unsigned char)*p;
+        if (c >= '0' && c <= '9') { has_digit = true; p++; continue; }
+        if (c == '.' && !has_dot && !has_e) { has_dot = true; p++; continue; }
+        if ((c == 'e' || c == 'E') && !has_e && has_digit) {
+            has_e = true; has_digit = false; p++;
+            if (p < end && (*p == '-' || *p == '+')) p++;
+            continue;
+        }
+        break;
+    }
+    if (p == end && has_digit)
+        return (has_dot || has_e) ? CSV_TYPE_F64 : CSV_TYPE_I64;
+
+    /* Date: YYYY-MM-DD (exactly 10 chars) or Timestamp: YYYY-MM-DD{T| }HH:MM:SS */
+    if (len >= 10 && f[4] == '-' && f[7] == '-') {
+        bool is_date = true;
+        for (int i = 0; i < 10; i++) {
+            if (i == 4 || i == 7) continue;
+            if ((unsigned)(f[i] - '0') > 9) { is_date = false; break; }
+        }
+        if (is_date) {
+            if (len == 10) return CSV_TYPE_DATE;
+            if (len >= 19 && (f[10] == 'T' || f[10] == ' ') &&
+                f[13] == ':' && f[16] == ':') {
+                const int tp[] = {11,12,14,15,17,18};
+                bool is_ts = true;
+                for (int i = 0; i < 6; i++) {
+                    if ((unsigned)(f[tp[i]] - '0') > 9) { is_ts = false; break; }
+                }
+                if (is_ts) return CSV_TYPE_TIMESTAMP;
+            }
+        }
+    }
+
+    /* Time: HH:MM:SS[.ffffff] (at least 8 chars) */
+    if (len >= 8 && f[2] == ':' && f[5] == ':') {
+        const int tp[] = {0,1,3,4,6,7};
+        bool is_time = true;
+        for (int i = 0; i < 6; i++) {
+            if ((unsigned)(f[tp[i]] - '0') > 9) { is_time = false; break; }
+        }
+        if (is_time) return CSV_TYPE_TIME;
+    }
+
+    return CSV_TYPE_STR;
+}
+
+static csv_type_t promote_csv_type(csv_type_t cur, csv_type_t obs) {
+    /* Fold a newly observed cell type `obs` into the running column type `cur`. */
+    if (cur == CSV_TYPE_UNKNOWN || cur == obs) return obs;
+    if (obs == CSV_TYPE_UNKNOWN) return cur;
+
+    /* Numeric widening BOOL ⊂ I64 ⊂ F64: the enum order is 1 < 2 < 3, so the
+     * wider of two distinct numeric types is simply the larger enum value. */
+    if (cur <= CSV_TYPE_F64 && obs <= CSV_TYPE_F64)
+        return cur > obs ? cur : obs;
+
+    /* DATE + TIMESTAMP → TIMESTAMP */
+    bool cur_dt = (cur == CSV_TYPE_DATE || cur == CSV_TYPE_TIMESTAMP);
+    bool obs_dt = (obs == CSV_TYPE_DATE || obs == CSV_TYPE_TIMESTAMP);
+    if (cur_dt && obs_dt) return CSV_TYPE_TIMESTAMP;
+
+    /* Everything else (STR involved, DATE+I64, TIME+BOOL, ...) → STR */
+    return CSV_TYPE_STR;
+}
+
+/* --------------------------------------------------------------------------
+ * Zero-copy field scanner
+ *
+ * Returns pointer past the field's trailing delimiter (or at newline/end).
+ * Sets *out and *out_len to the field content. For unquoted fields, *out
+ * points directly into the mmap buffer. For quoted fields with escaped
+ * quotes, content is unescaped into esc_buf.
+ * -------------------------------------------------------------------------- */
+
+static const char* scan_field_quoted(const char* p, const char* buf_end,
+                                     char delim,
+                                     const char** out, size_t* out_len,
+                                     char* esc_buf, char** dyn_esc) {
+    p++; /* skip opening quote (scan_field only dispatches here when *p == '"') */
+    const char* fld_start = p;
+    bool has_escape = false;   /* set once a doubled quote ("") is seen */
+
+    while (p < buf_end) {      /* find the closing quote; "" counts as an escaped quote */
+        if (*p == '"') {
+            if (p + 1 < buf_end && *(p + 1) == '"') {
+                has_escape = true;
+                p += 2;
+            } else {
+                break; /* closing quote */
+            }
+        } else {
+            p++;
+        }
+    }
+    size_t raw_len = (size_t)(p - fld_start);   /* bytes between the quotes, escapes still doubled */
+    if (p < buf_end && *p == '"') p++; /* skip closing quote (absent if the buffer ended first) */
+
+    if (has_escape) {
+        /* Unescaped length <= raw_len; assumes esc_buf holds >= 8192 bytes — TODO confirm at call sites. */
+        char* dest = esc_buf;
+        if (RAY_UNLIKELY(raw_len > 8192)) {
+            /* Field too large for stack buffer — dynamically allocate */
+            dest = (char*)ray_sys_alloc(raw_len);
+            if (!dest) {
+                /* OOM: fall back to raw (quotes remain) */
+                *out = fld_start;
+                *out_len = raw_len;
+                goto advance;
+            }
+            *dyn_esc = dest;   /* heap buffer handed to the caller — presumably released there; verify */
+        }
+        size_t olen = 0;
+        for (const char* s = fld_start; s < fld_start + raw_len; s++) {
+            if (*s == '"' && s + 1 < fld_start + raw_len && *(s + 1) == '"') {
+                dest[olen++] = '"';   /* collapse "" into a single quote */
+                s++;
+            } else {
+                dest[olen++] = *s;
+            }
+        }
+        *out = dest;
+        *out_len = olen;
+    } else {
+        *out = fld_start;      /* no escapes: zero-copy view into the input buffer */
+        *out_len = raw_len;
+    }
+
+advance:
+    /* Advance past delimiter */
+    if (p < buf_end && *p == delim) p++;
+    /* Don't advance past newline — caller handles row boundaries */
+    return p;
+}
+
+RAY_INLINE const char* scan_field(const char* p, const char* buf_end,
+                                  char delim,
+                                  const char** out, size_t* out_len,
+                                  char* esc_buf, char** dyn_esc) {
+    /* Scan one field starting at `p`; quoted fields are delegated to
+     * scan_field_quoted.  Returns the position after the field and its
+     * trailing delimiter, stopping at (not past) a row terminator. */
+    if (RAY_LIKELY(p < buf_end)) {
+        if (RAY_UNLIKELY(*p == '"'))
+            return scan_field_quoted(p, buf_end, delim, out, out_len, esc_buf, dyn_esc);
+        /* Unquoted: runs up to the delimiter, CR, LF or end of buffer */
+        const char* stop = p;
+        while (stop < buf_end && *stop != delim && *stop != '\n' && *stop != '\r') stop++;
+        *out = p;
+        *out_len = (size_t)(stop - p);
+        return (stop < buf_end && *stop == delim) ? stop + 1 : stop;
+    }
+    /* Already at the end of the buffer: empty field */
+    *out = p;
+    *out_len = 0;
+    return p;
+}
+
+/* --------------------------------------------------------------------------
+ * Numeric field parsers — thin wrappers over core/numparse with the
+ * CSV semantics that the *entire* field must be consumed; otherwise
+ * the cell is null.
+ * -------------------------------------------------------------------------- */
+
+RAY_INLINE int64_t fast_i64(const char* p, size_t len, bool* is_null) {
+    int64_t parsed = 0;
+    size_t used = ray_parse_i64(p, len, &parsed);
+    *is_null = !(used > 0 && used == len);   /* must consume the entire field */
+    return *is_null ? 0 : parsed;
+}
+
+RAY_INLINE double fast_f64(const char* p, size_t len, bool* is_null) {
+    double parsed = 0.0;
+    size_t used = ray_parse_f64(p, len, &parsed);
+    *is_null = !(used > 0 && used == len);   /* must consume the entire field */
+    return *is_null ? 0.0 : parsed;
+}
+
+/* --------------------------------------------------------------------------
+ * Fast inline date/time parsers
+ *
+ * DATE:      YYYY-MM-DD        → int32_t  (days since 2000-01-01)
+ * TIME:      HH:MM:SS[.fff]    → int32_t  (milliseconds since midnight)
+ * TIMESTAMP: YYYY-MM-DD{T| }HH:MM:SS[.fffffffff] → int64_t (ns since 2000-01-01)
+ *
+ * Uses Howard Hinnant's civil-calendar algorithm (public domain) for the
+ * date→days conversion — O(1) arithmetic, no lookup tables.
+ * -------------------------------------------------------------------------- */
+
+RAY_INLINE int32_t civil_to_days(int y, int m, int d) {
+    /* March-based year: Jan/Feb count as months 10/11 of the previous year */
+    int mp = (m > 2) ? m - 3 : m + 9;
+    if (m <= 2) y--;
+    int era = (y >= 0 ? y : y - 399) / 400;
+    int yoe = y - era * 400;
+    int doe = yoe * 365 + yoe / 4 - yoe / 100 + (153 * mp + 2) / 5 + d - 1;
+    return (int32_t)(era * 146097 + doe - 719468 - 10957);
+}
+
+RAY_INLINE int32_t fast_date(const char* p, size_t len, bool* is_null) {
+    if (RAY_UNLIKELY(len < 10)) { *is_null = true; return 0; }
+    for (int i = 0; i < 10; i++)   /* every non-separator slot of YYYY-MM-DD must be a digit */
+        if (i != 4 && i != 7 && (unsigned)(p[i] - '0') > 9) { *is_null = true; return 0; }
+    int y = (p[0]-'0')*1000 + (p[1]-'0')*100 + (p[2]-'0')*10 + (p[3]-'0');
+    int m = (p[5]-'0')*10 + (p[6]-'0'), d = (p[8]-'0')*10 + (p[9]-'0');
+    *is_null = (m < 1 || m > 12 || d < 1 || d > 31);   /* range check only; day is not checked per month */
+    return *is_null ? 0 : civil_to_days(y, m, d);
+}
+
+/* TIME → int32_t milliseconds since midnight; null unless HH, MM and SS are all digits */
+RAY_INLINE int32_t fast_time(const char* p, size_t len, bool* is_null) {
+    *is_null = true;   /* cleared only once HH:MM:SS has been validated */
+    if (RAY_UNLIKELY(len < 8)) return 0;
+    for (int i = 0; i < 8; i++)   /* rejects signs/spaces that used to yield negative fields */
+        if (i != 2 && i != 5 && (unsigned)(p[i] - '0') > 9) return 0;
+    int h  = (p[0]-'0')*10 + (p[1]-'0');
+    int mi = (p[3]-'0')*10 + (p[4]-'0');
+    int s  = (p[6]-'0')*10 + (p[7]-'0');
+    if (RAY_UNLIKELY(h > 23 || mi > 59 || s > 59)) return 0;
+    *is_null = false;
+    int32_t ms = h * 3600000 + mi * 60000 + s * 1000;
+    /* Fractional seconds → milliseconds (first 3 digits; shorter is zero-padded) */
+    if (len > 8 && p[8] == '.') {
+        int frac = 0, digits = 0;
+        for (size_t i = 9; i < len && digits < 3 && (unsigned)(p[i] - '0') <= 9; i++, digits++)
+            frac = frac * 10 + (p[i] - '0');
+        while (digits < 3) { frac *= 10; digits++; }
+        ms += (int32_t)frac;
+    }
+    return ms;
+}
+
+/* Timestamp time component → int64_t nanoseconds.
+ * RAY_TIMESTAMP is nanoseconds since 2000-01-01 (matching
+ * src/lang/format.c:ts_to_parts and csv_write_timestamp).  Accept up
+ * to 9 fractional digits; shorter fractions are right-padded with zeros,
+ * longer ones truncated.  Null unless HH, MM and SS are all digits. */
+RAY_INLINE int64_t fast_time_ns(const char* p, size_t len, bool* is_null) {
+    *is_null = true;   /* cleared only once HH:MM:SS has been validated */
+    if (RAY_UNLIKELY(len < 8)) return 0;
+    for (int i = 0; i < 8; i++)   /* rejects signs/spaces that used to yield negative fields */
+        if (i != 2 && i != 5 && (unsigned)(p[i] - '0') > 9) return 0;
+    int h  = (p[0]-'0')*10 + (p[1]-'0');
+    int mi = (p[3]-'0')*10 + (p[4]-'0');
+    int s  = (p[6]-'0')*10 + (p[7]-'0');
+    if (RAY_UNLIKELY(h > 23 || mi > 59 || s > 59)) return 0;
+    *is_null = false;
+    int64_t ns = (int64_t)h * 3600000000000LL + (int64_t)mi * 60000000000LL +
+                 (int64_t)s * 1000000000LL;
+    if (len > 8 && p[8] == '.') {
+        int64_t frac = 0;
+        int digits = 0;
+        for (size_t i = 9; i < len && digits < 9 && (unsigned)(p[i] - '0') <= 9; i++, digits++)
+            frac = frac * 10 + (int64_t)(p[i] - '0');
+        while (digits < 9) { frac *= 10; digits++; }
+        ns += frac;
+    }
+    return ns;
+}
+
+RAY_INLINE int64_t fast_timestamp(const char* p, size_t len, bool* is_null) {
+    /* 10-byte date, one separator byte (skipped without inspection), then the
+     * time of day.  Either component failing to parse makes the cell null. */
+    const int64_t ns_per_day = 86400000000000LL;
+    if (RAY_UNLIKELY(len < 19)) { *is_null = true; return 0; }
+    int32_t day_no = fast_date(p, 10, is_null);
+    if (*is_null) return 0;
+    int64_t tod_ns = fast_time_ns(p + 11, len - 11, is_null);
+    if (*is_null) return 0;
+    return (int64_t)day_no * ns_per_day + tod_ns;
+}
+
+/* --------------------------------------------------------------------------
+ * Null-aware boolean parser
+ * -------------------------------------------------------------------------- */
+
+RAY_INLINE uint8_t fast_bool(const char* s, size_t len, bool* is_null) {
+    /* Accepts true/TRUE/1 and false/FALSE/0; anything else is null (returns 0). */
+    uint8_t val = 0;
+    bool ok = false;
+    switch (len) {
+    case 1: ok = (s[0] == '0' || s[0] == '1'); val = (s[0] == '1'); break;
+    case 4: ok = (memcmp(s, "true", 4) == 0 || memcmp(s, "TRUE", 4) == 0); val = ok; break;
+    case 5: ok = (memcmp(s, "false", 5) == 0 || memcmp(s, "FALSE", 5) == 0); break;
+    }
+    *is_null = !ok;
+    return ok ? val : 0;
+}
+
+/* --------------------------------------------------------------------------
+ * GUID parser (mirrors csv_write_guid: 8-4-4-4-12 hex, 36 chars).
+ * Writes 16 bytes to `dst`.  Sets *is_null on shape or hex mismatch.
+ * -------------------------------------------------------------------------- */
+
+RAY_INLINE int hex_nibble(unsigned char c) {
+    /* 0..15 for a hex digit of either case, -1 for anything else */
+    unsigned lower = c | 0x20u;   /* folds 'A'..'F' onto 'a'..'f' */
+    if (c >= '0' && c <= '9') return c - '0';
+    return (lower >= 'a' && lower <= 'f') ? (int)(lower - 'a') + 10 : -1;
+}
+
+RAY_INLINE void fast_guid(const char* p, size_t len, uint8_t* dst, bool* is_null) {
+    /* Text offset of the high nibble of each output byte in the 36-char
+     * 8-4-4-4-12 form; the dashes sit at offsets 8, 13, 18 and 23. */
+    static const uint8_t hi_at[16] = { 0,2,4,6,  9,11, 14,16, 19,21, 24,26,28,30,32,34 };
+
+    *is_null = true;   /* cleared only after all 16 bytes have decoded */
+    if (RAY_UNLIKELY(len != 36)) return;
+    if (RAY_UNLIKELY(p[8] != '-' || p[13] != '-' || p[18] != '-' || p[23] != '-')) return;
+
+    for (int b = 0; b < 16; b++) {
+        int hi = hex_nibble((unsigned char)p[hi_at[b]]);
+        int lo = hex_nibble((unsigned char)p[hi_at[b] + 1]);
+        if (RAY_UNLIKELY(hi < 0 || lo < 0)) return;   /* dst may be partially written */
+        dst[b] = (uint8_t)(hi * 16 + lo);
+    }
+
+    *is_null = false;
+}
+
+/* --------------------------------------------------------------------------
+ * Row offsets builder — memchr-accelerated
+ *
+ * Records the byte offset (relative to `buf`) at which every data row
+ * starts.  Quote-free input takes a fast path that scans for '\n' with
+ * memchr; input containing any '"' falls back to a byte-by-byte scan that
+ * tracks quote parity so newlines embedded in quoted fields do not split
+ * rows.  Returns the exact row count.
+ *
+ * Offsets are allocated via scratch_alloc. Caller frees with scratch_free.
+ * When there is no data, or an allocation fails, the function returns 0
+ * and sets both *offsets_out and *hdr_out to NULL.
+ * -------------------------------------------------------------------------- */
+
+/* Append `off` to the offsets array, doubling the capacity first when it is
+ * full.  On reallocation failure the scratch block is released and false is
+ * returned. */
+static bool csv_row_offsets_append(ray_t** hdr, int64_t** offs,
+                                   int64_t* count, int64_t* cap, int64_t off) {
+    if (*count >= *cap) {
+        *cap *= 2;
+        *offs = (int64_t*)scratch_realloc(hdr,
+            (size_t)*count * sizeof(int64_t),
+            (size_t)*cap * sizeof(int64_t));
+        if (!*offs) { scratch_free(*hdr); return false; }
+    }
+    (*offs)[(*count)++] = off;
+    return true;
+}
+
+static int64_t build_row_offsets(const char* buf, size_t buf_size,
+                                  size_t data_offset,
+                                  int64_t** offsets_out, ray_t** hdr_out) {
+    const char* cur = buf + data_offset;
+    const char* const end = buf + buf_size;
+
+    /* Outputs stay NULL on every early return below. */
+    *offsets_out = NULL;
+    *hdr_out = NULL;
+
+    /* Leading blank lines are NOT skipped: an empty line in the data section
+     * is a null row (write-csv emits them for null-valued single-column
+     * tables).  Header-level whitespace is consumed by the header parser
+     * before data_offset. */
+    if (cur >= end) return 0;
+
+    /* Initial capacity guess: one row per 40 bytes plus headroom.  The
+     * append helper grows the array if the guess is too small. */
+    const size_t remaining = (size_t)(end - cur);
+    int64_t cap = (int64_t)(remaining / 40) + 16;
+    ray_t* hdr = NULL;
+    int64_t* offs = (int64_t*)scratch_alloc(&hdr, (size_t)cap * sizeof(int64_t));
+    if (!offs) return 0;
+
+    /* The first row always starts at data_offset. */
+    int64_t count = 0;
+    offs[count++] = (int64_t)(cur - buf);
+
+    if (RAY_LIKELY(memchr(cur, '"', remaining) == NULL)) {
+        /* Fast path: no quotes anywhere, so every '\n' ends a row.
+         * Only '\n' is searched for; input using bare '\r' line endings is
+         * seen as a single row.  Empty lines are kept as rows. */
+        const char* nl;
+        while ((nl = (const char*)memchr(cur, '\n', (size_t)(end - cur))) != NULL) {
+            cur = nl + 1;
+            /* Tolerate the unusual "\n\r" ending. */
+            if (cur < end && *cur == '\r') cur++;
+            /* A terminator at end of input does not open another row. */
+            if (cur >= end) break;
+            if (!csv_row_offsets_append(&hdr, &offs, &count, &cap,
+                                        (int64_t)(cur - buf)))
+                return 0;
+        }
+    } else {
+        /* Slow path: walk every byte, toggling quote state on '"'.
+         * Empty lines are kept as rows. */
+        bool quoted = false;
+        while (cur < end) {
+            const char ch = *cur++;
+            if (ch == '"') { quoted = !quoted; continue; }
+            if (quoted || (ch != '\n' && ch != '\r')) continue;
+
+            /* Unquoted line terminator: fold "\r\n" into one. */
+            if (ch == '\r' && cur < end && *cur == '\n') cur++;
+            if (cur < end &&
+                !csv_row_offsets_append(&hdr, &offs, &count, &cap,
+                                        (int64_t)(cur - buf)))
+                return 0;
+        }
+    }
+
+    *offsets_out = offs;
+    *hdr_out = hdr;
+    return count;
+}
+
+/* --------------------------------------------------------------------------
+ * Batch-intern string columns after parse.
+ * Single-threaded — walks each string column, interns into global sym table,
+ * writes sym IDs into the final uint32_t column.
+ *
+ * Columns whose parse type is not CSV_TYPE_STR, or whose resolved type is
+ * RAY_STR, are skipped.  Rows flagged in the column's nullmap get id 0.
+ * When col_max_ids is non-NULL, the largest id written for column c is
+ * stored in col_max_ids[c].
+ *
+ * Returns false if the sym table cannot be pre-grown (remaining columns are
+ * then left untouched) or if any individual intern call fails (the affected
+ * cell gets id 0 and processing continues).
+ * -------------------------------------------------------------------------- */
+
+static bool csv_intern_strings(csv_strref_t** str_refs, int n_cols,
+                                const csv_type_t* col_types,
+                                const int8_t* resolved_types,
+                                void** col_data, int64_t n_rows,
+                                int64_t* col_max_ids,
+                                uint8_t** col_nullmaps) {
+    bool ok = true;
+    for (int c = 0; c < n_cols; c++) {
+        if (col_types[c] != CSV_TYPE_STR) continue;
+        /* RAY_STR columns are materialized directly; skip sym interning. */
+        if (resolved_types[c] == RAY_STR) continue;
+        csv_strref_t* refs = str_refs[c];
+        uint32_t* ids = (uint32_t*)col_data[c];
+        uint8_t* nm = col_nullmaps ? col_nullmaps[c] : NULL;
+        int64_t max_id = 0;
+
+        /* Pre-grow: upper bound is n_rows unique strings on top of what is
+         * already interned.  The sum is formed in 64 bits and saturated at
+         * UINT32_MAX; a 32-bit add could wrap and request a capacity below
+         * the current symbol count. */
+        uint64_t want = (uint64_t)ray_sym_count()
+                      + (uint64_t)(n_rows > 0 ? n_rows : 0);
+        if (want > UINT32_MAX) want = UINT32_MAX;
+        if (!ray_sym_ensure_cap((uint32_t)want))
+            return false;  /* OOM: cannot grow sym table */
+
+        for (int64_t r = 0; r < n_rows; r++) {
+            /* Null cells carry no text to intern. */
+            if (nm && (nm[r >> 3] & (1u << (r & 7)))) {
+                ids[r] = 0;
+                continue;
+            }
+            uint32_t hash = (uint32_t)ray_hash_bytes(refs[r].ptr, refs[r].len);
+            int64_t id = ray_sym_intern_prehashed(hash, refs[r].ptr, refs[r].len);
+            /* A failed intern is remembered but does not stop the pass. */
+            if (id < 0) { ok = false; id = 0; }
+            ids[r] = (uint32_t)id;
+            if (id > max_id) max_id = id;
+        }
+        if (col_max_ids) col_max_ids[c] = max_id;
+    }
+    return ok;
+}
+
+/* Release the heap copies made for escaped CSV string fields.
+ * A strref whose pointer lies inside the mapped input [buf, buf+buf_size)
+ * is borrowed and left alone; any other non-NULL pointer was allocated by
+ * the parse loop and is passed to ray_sys_free.  Only CSV_TYPE_STR columns
+ * with a non-NULL strref array are visited. */
+static void csv_free_escaped_strrefs(csv_strref_t** str_refs, int n_cols,
+                                      const csv_type_t* col_types,
+                                      int64_t n_rows,
+                                      const char* buf, size_t buf_size) {
+    const char* const map_lo = buf;
+    const char* const map_hi = buf + buf_size;
+
+    for (int col = 0; col < n_cols; col++) {
+        if (col_types[col] != CSV_TYPE_STR) continue;
+        const csv_strref_t* refs = str_refs[col];
+        if (!refs) continue;
+
+        for (int64_t row = 0; row < n_rows; row++) {
+            const char* ptr = refs[row].ptr;
+            if (!ptr) continue;
+            const bool borrowed = (ptr >= map_lo && ptr < map_hi);
+            if (!borrowed) ray_sys_free((void*)ptr);
+        }
+    }
+}
+
+/* Build the RAY_STR columns from the strrefs collected during parsing.
+ * Each column is processed in two passes: the first sums the bytes of all
+ * strings longer than RAY_STR_INLINE_MAX so the column's string pool can be
+ * allocated once at its final size; the second fills the ray_str_t cells.
+ * This avoids the repeated realloc/COW path that ray_str_vec_set would
+ * take for a freshly-owned vector.
+ *
+ * Returns false if a column's pool would exceed UINT32_MAX bytes or the
+ * pool allocation fails; true otherwise. */
+static bool csv_fill_str_cols(csv_strref_t** str_refs, int n_cols,
+                              const int8_t* resolved_types,
+                              ray_t** col_vecs, int64_t n_rows,
+                              uint8_t** col_nullmaps) {
+    for (int col = 0; col < n_cols; col++) {
+        if (resolved_types[col] != RAY_STR) continue;
+
+        const csv_strref_t* src = str_refs[col];
+        const uint8_t* nulls = col_nullmaps ? col_nullmaps[col] : NULL;
+        ray_t* vec = col_vecs[col];
+        ray_str_t* cells = (ray_str_t*)ray_data(vec);
+
+        /* Pass 1: size the pool.  ray_str_t.pool_off is u32, so the pool is
+         * capped at 4 GiB; accumulate in u64 so the sum itself cannot wrap
+         * and reject totals that would not fit the offset field. */
+        uint64_t need = 0;
+        for (int64_t row = 0; row < n_rows; row++) {
+            const bool is_null = nulls && ((nulls[row >> 3] >> (row & 7)) & 1u);
+            if (is_null) continue;
+            const uint32_t n = src[row].len;
+            if (n > RAY_STR_INLINE_MAX) need += n;
+        }
+        if (need > UINT32_MAX) return false;
+
+        if (need != 0) {
+            ray_t* pool = ray_alloc((size_t)need);
+            if (!pool || RAY_IS_ERR(pool)) return false;
+            pool->type = RAY_U8;
+            pool->len = 0;
+            vec->str_pool = pool;
+        }
+
+        char* pool_bytes = NULL;
+        if (vec->str_pool) pool_bytes = (char*)ray_data(vec->str_pool);
+        uint32_t used = 0;
+
+        /* Pass 2: write every cell; null rows stay all-zero. */
+        for (int64_t row = 0; row < n_rows; row++) {
+            ray_str_t* cell = &cells[row];
+            memset(cell, 0, sizeof(ray_str_t));
+
+            const bool is_null = nulls && ((nulls[row >> 3] >> (row & 7)) & 1u);
+            if (is_null) continue;
+
+            const char* text = src[row].ptr;
+            const uint32_t n = src[row].len;
+            cell->len = n;
+            if (n > RAY_STR_INLINE_MAX) {
+                /* Long string: 4-byte prefix in the cell, full text in the pool. */
+                memcpy(cell->prefix, text, 4);
+                cell->pool_off = used;
+                memcpy(pool_bytes + used, text, n);
+                used += n;  /* cannot wrap: need <= UINT32_MAX */
+            } else if (n > 0) {
+                /* Short string: stored inline in the cell. */
+                memcpy(cell->data, text, n);
+            }
+        }
+        if (vec->str_pool) vec->str_pool->len = (int64_t)used;
+    }
+    return true;
+}
+
+/* --------------------------------------------------------------------------
+ * Stage 9b helper: dispatch csv_fill_str_cols and csv_intern_strings on
+ * separate threads when a pool is available.  They write to disjoint
+ * column data, and intern_strings is the only one that touches the
+ * global sym table (so it stays single-threaded; we just run it in
+ * parallel with fill_str_cols).
+ * -------------------------------------------------------------------------- */
+
+/* Argument block shared by both invocations of csv_finalize_task.
+ * The input fields are forwarded unchanged to csv_fill_str_cols and
+ * csv_intern_strings; the two result flags are written by the task. */
+typedef struct {
+    csv_strref_t**    str_refs;        /* per-column strref arrays (both helpers) */
+    int               n_cols;          /* number of columns */
+    const csv_type_t* parse_types;     /* passed to csv_intern_strings as col_types */
+    const int8_t*     resolved_types;  /* per-column resolved type (both helpers) */
+    void**            col_data;        /* passed to csv_intern_strings */
+    ray_t**           col_vecs;        /* passed to csv_fill_str_cols */
+    int64_t           n_rows;          /* number of rows */
+    int64_t*          sym_max_ids;     /* passed to csv_intern_strings as col_max_ids */
+    uint8_t**         col_nullmaps;    /* per-column null bitmaps (both helpers) */
+    bool              fill_ok;         /* out: return value of csv_fill_str_cols */
+    bool              intern_ok;       /* out: return value of csv_intern_strings */
+} csv_finalize_ctx_t;
+
+/* Task body for the finalize stage.  `arg` is a csv_finalize_ctx_t.
+ * The invocation with start == 0 runs csv_fill_str_cols and records its
+ * result in fill_ok; any other start runs csv_intern_strings and records
+ * its result in intern_ok.  worker_id and end_idx are unused. */
+static void csv_finalize_task(void* arg, uint32_t worker_id,
+                              int64_t start, int64_t end_idx) {
+    (void)worker_id;
+    (void)end_idx;
+    csv_finalize_ctx_t* fin = (csv_finalize_ctx_t*)arg;
+
+    if (start != 0) {
+        fin->intern_ok = csv_intern_strings(fin->str_refs, fin->n_cols,
+                                            fin->parse_types, fin->resolved_types,
+                                            fin->col_data, fin->n_rows,
+                                            fin->sym_max_ids, fin->col_nullmaps);
+        return;
+    }
+    fin->fill_ok = csv_fill_str_cols(fin->str_refs, fin->n_cols,
+                                     fin->resolved_types, fin->col_vecs,
+                                     fin->n_rows, fin->col_nullmaps);
+}
+
+/* --------------------------------------------------------------------------
+ * Parallel parse context and callback
+ * -------------------------------------------------------------------------- */
+
+/* Read-mostly state handed to every csv_parse_fn invocation.  Workers read
+ * the input description and write parsed values into the per-column output
+ * arrays at the row indices they were given. */
+typedef struct {
+    const char*       buf;          /* start of the input buffer */
+    size_t            buf_size;     /* buffer length; buf + buf_size bounds the last row */
+    const int64_t*    row_offsets;  /* byte offset from buf to the start of each row */
+    int64_t           n_rows;       /* total row count (used to find each row's end) */
+    int               n_cols;       /* number of columns per row */
+    char              delim;        /* field delimiter passed to scan_field */
+    const csv_type_t* col_types;    /* per-column parse type selecting the field parser */
+    void**            col_data;     /* non-const: workers write parsed values into columns */
+    csv_strref_t**    str_refs;     /* [n_cols] — strref arrays for string columns, NULL for others */
+    uint8_t**         col_nullmaps; /* per-column null bitmaps: bit (row & 7) of byte row >> 3 */
+    bool*             worker_had_null; /* [n_workers * n_cols] */
+} csv_par_ctx_t;
+
+/* Parallel parse callback: parses rows [start, end_row) of the input
+ * described by the csv_par_ctx_t in `arg`.
+ *
+ * For every field the parser matching the column's csv_type_t is run and
+ * the value stored at index `row` of the column's output array.  A field
+ * that parses as null sets the row's bit in the column nullmap and flags
+ * worker_had_null[worker_id * n_cols + c].  Rows that end before all
+ * columns were seen get default values plus null bits for the remainder.
+ *
+ * String fields are recorded as (ptr, len) strrefs.  Fields that scan_field
+ * returned outside the input buffer (unescaped text) are persisted on the
+ * heap; if that allocation fails the cell is stored as null so no strref
+ * ever points at the stack scratch buffer. */
+static void csv_parse_fn(void* arg, uint32_t worker_id,
+                          int64_t start, int64_t end_row) {
+    csv_par_ctx_t* ctx = (csv_par_ctx_t*)arg;
+    char esc_buf[8192];  /* scratch handed to scan_field for escaped fields */
+    const char* buf_end = ctx->buf + ctx->buf_size;
+    bool* my_had_null = &ctx->worker_had_null[(size_t)worker_id * (size_t)ctx->n_cols];
+
+    for (int64_t row = start; row < end_row; row++) {
+        const char* p = ctx->buf + ctx->row_offsets[row];
+        /* A row ends where the next one starts; the last row ends at EOF. */
+        const char* row_end = (row + 1 < ctx->n_rows)
+            ? ctx->buf + ctx->row_offsets[row + 1]
+            : buf_end;
+
+        for (int c = 0; c < ctx->n_cols; c++) {
+            /* Guard: if past row boundary, fill remaining columns with defaults + null */
+            if (p >= row_end) {
+                for (; c < ctx->n_cols; c++) {
+                    switch (ctx->col_types[c]) {
+                        case CSV_TYPE_BOOL: ((uint8_t*)ctx->col_data[c])[row] = 0; break;
+                        case CSV_TYPE_I64:  ((int64_t*)ctx->col_data[c])[row] = 0; break;
+                        case CSV_TYPE_F64:  ((double*)ctx->col_data[c])[row] = 0.0; break;
+                        case CSV_TYPE_DATE: ((int32_t*)ctx->col_data[c])[row] = 0; break;
+                        case CSV_TYPE_TIME: ((int32_t*)ctx->col_data[c])[row] = 0; break;
+                        case CSV_TYPE_TIMESTAMP:
+                            ((int64_t*)ctx->col_data[c])[row] = 0; break;
+                        case CSV_TYPE_GUID:
+                            memset((uint8_t*)ctx->col_data[c] + (size_t)row * 16, 0, 16);
+                            break;
+                        case CSV_TYPE_STR:
+                            ctx->str_refs[c][row].ptr = NULL;
+                            ctx->str_refs[c][row].len = 0;
+                            break;
+                        default: break;
+                    }
+                    ctx->col_nullmaps[c][row >> 3] |= (uint8_t)(1u << (row & 7));
+                    my_had_null[c] = true;
+                }
+                break;
+            }
+
+            const char* fld;
+            size_t flen;
+            char* dyn_esc = NULL;
+            p = scan_field(p, buf_end, ctx->delim, &fld, &flen, esc_buf, &dyn_esc);
+
+            /* Strip trailing \r from last field of row */
+            if (c == ctx->n_cols - 1 && flen > 0 && fld[flen - 1] == '\r')
+                flen--;
+
+            /* Each case stores the parsed value and reports nullness here;
+             * the nullmap is updated once, after the switch. */
+            bool is_null = false;
+            switch (ctx->col_types[c]) {
+                case CSV_TYPE_BOOL:
+                    ((uint8_t*)ctx->col_data[c])[row] = fast_bool(fld, flen, &is_null);
+                    break;
+                case CSV_TYPE_I64:
+                    ((int64_t*)ctx->col_data[c])[row] = fast_i64(fld, flen, &is_null);
+                    break;
+                case CSV_TYPE_F64:
+                    ((double*)ctx->col_data[c])[row] = fast_f64(fld, flen, &is_null);
+                    break;
+                case CSV_TYPE_DATE:
+                    ((int32_t*)ctx->col_data[c])[row] = fast_date(fld, flen, &is_null);
+                    break;
+                case CSV_TYPE_TIME:
+                    ((int32_t*)ctx->col_data[c])[row] = fast_time(fld, flen, &is_null);
+                    break;
+                case CSV_TYPE_TIMESTAMP:
+                    ((int64_t*)ctx->col_data[c])[row] = fast_timestamp(fld, flen, &is_null);
+                    break;
+                case CSV_TYPE_GUID: {
+                    uint8_t* slot = (uint8_t*)ctx->col_data[c] + (size_t)row * 16;
+                    fast_guid(fld, flen, slot, &is_null);
+                    /* fast_guid may have written some bytes before failing. */
+                    if (is_null) memset(slot, 0, 16);
+                    break;
+                }
+                case CSV_TYPE_STR: {
+                    /* An empty string field is a null cell. */
+                    is_null = (flen == 0);
+                    /* fld may point into esc_buf (stack) or dyn_esc
+                     * (freed below) — both die before csv_fill_str_cols
+                     * reads the strref.  Persist escaped fields. */
+                    if (!is_null && (fld < ctx->buf || fld >= buf_end)) {
+                        if (dyn_esc && fld == dyn_esc) {
+                            dyn_esc = NULL; /* transfer ownership */
+                        } else {
+                            char* cp = (char*)ray_sys_alloc(flen);
+                            if (cp) {
+                                memcpy(cp, fld, flen);
+                                fld = cp;
+                            } else {
+                                /* Out of memory: store a null cell rather
+                                 * than a strref into transient storage,
+                                 * which would later be read and freed. */
+                                is_null = true;
+                            }
+                        }
+                    }
+                    if (is_null) {
+                        ctx->str_refs[c][row].ptr = NULL;
+                        ctx->str_refs[c][row].len = 0;
+                    } else {
+                        ctx->str_refs[c][row].ptr = fld;
+                        ctx->str_refs[c][row].len = (uint32_t)flen;
+                    }
+                    break;
+                }
+                default:
+                    break;
+            }
+            if (is_null) {
+                ctx->col_nullmaps[c][row >> 3] |= (uint8_t)(1u << (row & 7));
+                my_had_null[c] = true;
+            }
+            /* Release the dynamic escape buffer unless a strref took it over. */
+            if (RAY_UNLIKELY(dyn_esc != NULL)) ray_sys_free(dyn_esc);
+        }
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Serial parse fallback (small files or no thread pool)
+ * -------------------------------------------------------------------------- */
+
+/* Single-threaded parse of all `n_rows` rows of `buf`.
+ *
+ * For every field the parser matching the column's csv_type_t is run and
+ * the value stored at index `row` of col_data[c].  A field that parses as
+ * null sets the row's bit in col_nullmaps[c] and sets col_had_null[c].
+ * Rows that end before all columns were seen get default values plus null
+ * bits for the remaining columns.
+ *
+ * String fields are recorded as (ptr, len) strrefs in str_refs[c].  Fields
+ * that scan_field returned outside the input buffer (unescaped text) are
+ * persisted on the heap; if that allocation fails the cell is stored as
+ * null so no strref ever points at the stack scratch buffer. */
+static void csv_parse_serial(const char* buf, size_t buf_size,
+                              const int64_t* row_offsets, int64_t n_rows,
+                              int n_cols, char delim,
+                              const csv_type_t* col_types, void** col_data,
+                              csv_strref_t** str_refs,
+                              uint8_t** col_nullmaps, bool* col_had_null) {
+    char esc_buf[8192];  /* scratch handed to scan_field for escaped fields */
+    const char* buf_end = buf + buf_size;
+
+    for (int64_t row = 0; row < n_rows; row++) {
+        const char* p = buf + row_offsets[row];
+        /* A row ends where the next one starts; the last row ends at EOF. */
+        const char* row_end = (row + 1 < n_rows)
+            ? buf + row_offsets[row + 1]
+            : buf_end;
+
+        for (int c = 0; c < n_cols; c++) {
+            /* Guard: if past row boundary, fill remaining columns with defaults + null */
+            if (p >= row_end) {
+                for (; c < n_cols; c++) {
+                    switch (col_types[c]) {
+                        case CSV_TYPE_BOOL: ((uint8_t*)col_data[c])[row] = 0; break;
+                        case CSV_TYPE_I64:  ((int64_t*)col_data[c])[row] = 0; break;
+                        case CSV_TYPE_F64:  ((double*)col_data[c])[row] = 0.0; break;
+                        case CSV_TYPE_DATE: ((int32_t*)col_data[c])[row] = 0; break;
+                        case CSV_TYPE_TIME: ((int32_t*)col_data[c])[row] = 0; break;
+                        case CSV_TYPE_TIMESTAMP:
+                            ((int64_t*)col_data[c])[row] = 0; break;
+                        case CSV_TYPE_GUID:
+                            memset((uint8_t*)col_data[c] + (size_t)row * 16, 0, 16);
+                            break;
+                        case CSV_TYPE_STR:
+                            str_refs[c][row].ptr = NULL;
+                            str_refs[c][row].len = 0;
+                            break;
+                        default: break;
+                    }
+                    col_nullmaps[c][row >> 3] |= (uint8_t)(1u << (row & 7));
+                    col_had_null[c] = true;
+                }
+                break;
+            }
+
+            const char* fld;
+            size_t flen;
+            char* dyn_esc = NULL;
+            p = scan_field(p, buf_end, delim, &fld, &flen, esc_buf, &dyn_esc);
+
+            /* Strip trailing \r from last field of row */
+            if (c == n_cols - 1 && flen > 0 && fld[flen - 1] == '\r')
+                flen--;
+
+            /* Each case stores the parsed value and reports nullness here;
+             * the nullmap is updated once, after the switch. */
+            bool is_null = false;
+            switch (col_types[c]) {
+                case CSV_TYPE_BOOL:
+                    ((uint8_t*)col_data[c])[row] = fast_bool(fld, flen, &is_null);
+                    break;
+                case CSV_TYPE_I64:
+                    ((int64_t*)col_data[c])[row] = fast_i64(fld, flen, &is_null);
+                    break;
+                case CSV_TYPE_F64:
+                    ((double*)col_data[c])[row] = fast_f64(fld, flen, &is_null);
+                    break;
+                case CSV_TYPE_DATE:
+                    ((int32_t*)col_data[c])[row] = fast_date(fld, flen, &is_null);
+                    break;
+                case CSV_TYPE_TIME:
+                    ((int32_t*)col_data[c])[row] = fast_time(fld, flen, &is_null);
+                    break;
+                case CSV_TYPE_TIMESTAMP:
+                    ((int64_t*)col_data[c])[row] = fast_timestamp(fld, flen, &is_null);
+                    break;
+                case CSV_TYPE_GUID: {
+                    uint8_t* slot = (uint8_t*)col_data[c] + (size_t)row * 16;
+                    fast_guid(fld, flen, slot, &is_null);
+                    /* fast_guid may have written some bytes before failing. */
+                    if (is_null) memset(slot, 0, 16);
+                    break;
+                }
+                case CSV_TYPE_STR: {
+                    /* An empty string field is a null cell. */
+                    is_null = (flen == 0);
+                    /* fld may point into esc_buf (stack) or dyn_esc
+                     * (freed below) — both die before csv_fill_str_cols
+                     * reads the strref.  Persist escaped fields. */
+                    if (!is_null && (fld < buf || fld >= buf_end)) {
+                        if (dyn_esc && fld == dyn_esc) {
+                            dyn_esc = NULL; /* transfer ownership */
+                        } else {
+                            char* cp = (char*)ray_sys_alloc(flen);
+                            if (cp) {
+                                memcpy(cp, fld, flen);
+                                fld = cp;
+                            } else {
+                                /* Out of memory: store a null cell rather
+                                 * than a strref into transient storage,
+                                 * which would later be read and freed. */
+                                is_null = true;
+                            }
+                        }
+                    }
+                    if (is_null) {
+                        str_refs[c][row].ptr = NULL;
+                        str_refs[c][row].len = 0;
+                    } else {
+                        str_refs[c][row].ptr = fld;
+                        str_refs[c][row].len = (uint32_t)flen;
+                    }
+                    break;
+                }
+                default:
+                    break;
+            }
+            if (is_null) {
+                col_nullmaps[c][row >> 3] |= (uint8_t)(1u << (row & 7));
+                col_had_null[c] = true;
+            }
+            /* Release the dynamic escape buffer unless a strref took it over. */
+            if (RAY_UNLIKELY(dyn_esc != NULL)) ray_sys_free(dyn_esc);
+        }
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * ray_read_csv_opts — main CSV parser
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_read_csv_opts(const char* path, char delimiter, bool header,
+                        const int8_t* col_types_in, int32_t n_types) {
+    /* ---- 1. Open file and get size ---- */
+    int fd = open(path, O_RDONLY);
+    if (fd < 0) return ray_error("io", NULL);
+
+    struct stat st;
+    if (fstat(fd, &st) != 0 || st.st_size <= 0) {
+        close(fd);
+        return ray_error("io", NULL);
+    }
+    size_t file_size = (size_t)st.st_size;
+
+    /* ---- 2. mmap the file ---- */
+    char* buf = (char*)mmap(NULL, file_size, PROT_READ, MMAP_FLAGS, fd, 0);
+    close(fd);
+    if (buf == MAP_FAILED) return ray_error("io", NULL);
+
+#ifdef __APPLE__
+    madvise(buf, file_size, MADV_SEQUENTIAL);
+#endif
+
+    const char* buf_end = buf + file_size;
+    ray_t* result = NULL;
+
+    /* ---- 3. Detect delimiter ---- */
+    /* Delimiter auto-detected from header row only. Files where the header
+     * has a different delimiter distribution than data rows may be misdetected;
+     * pass an explicit delimiter for such files.  Scanning additional data rows
+     * was considered but adds complexity for a rare edge case. */
+    if (delimiter == 0) {
+        int commas = 0, tabs = 0;
+        for (const char* p = buf; p < buf_end && *p != '\n'; p++) {
+            if (*p == ',') commas++;
+            if (*p == '\t') tabs++;
+        }
+        delimiter = (tabs > commas) ? '\t' : ',';
+    }
+
+    /* ---- 4. Count columns from first line ---- */
+    int ncols = 1;
+    {
+        const char* p = buf;
+        bool in_quote = false;
+        while (p < buf_end && (in_quote || (*p != '\n' && *p != '\r'))) {
+            if (*p == '"') in_quote = !in_quote;
+            else if (!in_quote && *p == delimiter) ncols++;
+            p++;
+        }
+    }
+    if (ncols > CSV_MAX_COLS) {
+        munmap(buf, file_size);
+        /* fd already closed after mmap (line 1044) — do not close again */
+        return ray_error("range", NULL);  /* too many columns */
+    }
+
+    /* ---- 5. Parse header row ---- */
+    const char* p = buf;
+    char esc_buf[8192];
+    int64_t col_name_ids[CSV_MAX_COLS];
+
+    if (header) {
+        for (int c = 0; c < ncols; c++) {
+            const char* fld;
+            size_t flen;
+            char* dyn_esc = NULL;
+            p = scan_field(p, buf_end, delimiter, &fld, &flen, esc_buf, &dyn_esc);
+            col_name_ids[c] = ray_sym_intern(fld, flen);
+            if (dyn_esc) ray_sys_free(dyn_esc);
+        }
+        /* Consume exactly one line terminator (\r, \n, or \r\n) after the
+         * header row — NOT a run of newlines, because subsequent empty
+         * lines are null data rows. */
+        if (p < buf_end && *p == '\r') p++;
+        if (p < buf_end && *p == '\n') p++;
+    } else {
+        for (int c = 0; c < ncols; c++) {
+            char name[32];
+            snprintf(name, sizeof(name), "V%d", c + 1);
+            col_name_ids[c] = ray_sym_intern(name, strlen(name));
+        }
+    }
+
+    size_t data_offset = (size_t)(p - buf);
+
+    /* ---- 6. Build row offsets (memchr-accelerated) ---- */
+    ray_t* row_offsets_hdr = NULL;
+    int64_t* row_offsets = NULL;
+    int64_t n_rows = build_row_offsets(buf, file_size, data_offset,
+                                        &row_offsets, &row_offsets_hdr);
+
+    if (n_rows == 0) {
+        /* Empty file → empty table */
+        ray_t* tbl = ray_table_new(ncols);
+        if (!tbl || RAY_IS_ERR(tbl)) goto fail_unmap;
+        for (int c = 0; c < ncols; c++) {
+            ray_t* empty_vec = ray_vec_new(RAY_F64, 0);
+            if (empty_vec && !RAY_IS_ERR(empty_vec)) {
+                tbl = ray_table_add_col(tbl, col_name_ids[c], empty_vec);
+                ray_release(empty_vec);
+            }
+        }
+        munmap(buf, file_size);
+        return tbl;
+    }
+
+    /* ---- 7. Resolve column types ---- */
+    int8_t resolved_types[CSV_MAX_COLS];
+    if (col_types_in && n_types >= ncols) {
+        /* Explicit types provided by caller — validate against known types */
+        for (int c = 0; c < ncols; c++) {
+            int8_t t = col_types_in[c];
+            if (t < RAY_BOOL || t >= RAY_TYPE_COUNT || t == RAY_TABLE) {
+                /* Invalid type constant — fall through to error */
+                goto fail_offsets;
+            }
+            resolved_types[c] = t;
+        }
+    } else if (!col_types_in) {
+        /* Auto-infer from sample rows */
+        csv_type_t col_types[CSV_MAX_COLS];
+        memset(col_types, 0, (size_t)ncols * sizeof(csv_type_t));
+        /* Type inference from first 100 rows. Heterogeneous CSVs with type
+         * changes after row 100 will be mistyped. Use explicit schema
+         * (col_types_in) for such files. */
+        int64_t sample_n = (n_rows < CSV_SAMPLE_ROWS) ? n_rows : CSV_SAMPLE_ROWS;
+        for (int64_t r = 0; r < sample_n; r++) {
+            const char* rp = buf + row_offsets[r];
+            for (int c = 0; c < ncols; c++) {
+                const char* fld;
+                size_t flen;
+                char* dyn_esc = NULL;
+                rp = scan_field(rp, buf_end, delimiter, &fld, &flen, esc_buf, &dyn_esc);
+                csv_type_t t = detect_type(fld, flen);
+                if (dyn_esc) ray_sys_free(dyn_esc);
+                col_types[c] = promote_csv_type(col_types[c], t);
+            }
+        }
+        for (int c = 0; c < ncols; c++) {
+            switch (col_types[c]) {
+                case CSV_TYPE_BOOL:      resolved_types[c] = RAY_BOOL;      break;
+                case CSV_TYPE_I64:       resolved_types[c] = RAY_I64;       break;
+                case CSV_TYPE_F64:       resolved_types[c] = RAY_F64;       break;
+                case CSV_TYPE_DATE:      resolved_types[c] = RAY_DATE;      break;
+                case CSV_TYPE_TIME:      resolved_types[c] = RAY_TIME;      break;
+                case CSV_TYPE_TIMESTAMP: resolved_types[c] = RAY_TIMESTAMP; break;
+                default:                 resolved_types[c] = RAY_SYM;       break;
+            }
+        }
+    } else {
+        /* col_types_in provided but too short — error */
+        goto fail_offsets;
+    }
+
+    /* ---- 8. Allocate column vectors ---- */
+    ray_t* col_vecs[CSV_MAX_COLS];
+    void* col_data[CSV_MAX_COLS];
+
+    for (int c = 0; c < ncols; c++) {
+        int8_t type = resolved_types[c];
+        /* String columns: allocate RAY_SYM at W32 (4B/elem) for sym IDs.
+         * After intern, narrow to W8/W16 if max sym ID permits. */
+        col_vecs[c] = (type == RAY_SYM) ? ray_sym_vec_new(RAY_SYM_W32, n_rows)
+                                        : ray_vec_new(type, n_rows);
+        if (!col_vecs[c] || RAY_IS_ERR(col_vecs[c])) {
+            for (int j = 0; j < c; j++) ray_release(col_vecs[j]);
+            goto fail_offsets;
+        }
+        /* len set early so parallel workers can write to full extent;
+         * parse errors return before table is used. */
+        col_vecs[c]->len = n_rows;
+        col_data[c] = ray_data(col_vecs[c]);
+    }
+
+    /* ---- 8b. Pre-allocate nullmaps for all columns ---- */
+    uint8_t* col_nullmaps[CSV_MAX_COLS];
+    bool col_had_null[CSV_MAX_COLS];
+    if (ncols > 0) memset(col_had_null, 0, (size_t)ncols * sizeof(bool));
+
+    for (int c = 0; c < ncols; c++) {
+        ray_t* vec = col_vecs[c];
+        /* RAY_STR aliases bytes 8-15 of the header with str_pool — inline
+         * nullmap would corrupt the pool pointer, so force external. */
+        bool force_ext = (resolved_types[c] == RAY_STR);
+        if (n_rows <= 128 && !force_ext) {
+            vec->attrs |= RAY_ATTR_HAS_NULLS;
+            memset(vec->nullmap, 0, 16);
+            col_nullmaps[c] = vec->nullmap;
+        } else {
+            size_t bmp_bytes = ((size_t)n_rows + 7) / 8;
+            ray_t* ext = ray_vec_new(RAY_U8, (int64_t)bmp_bytes);
+            if (!ext || RAY_IS_ERR(ext)) {
+                for (int j = 0; j <= c; j++) ray_release(col_vecs[j]);
+                goto fail_offsets;
+            }
+            ext->len = (int64_t)bmp_bytes;
+            memset(ray_data(ext), 0, bmp_bytes);
+            vec->ext_nullmap = ext;
+            vec->attrs |= RAY_ATTR_HAS_NULLS | RAY_ATTR_NULLMAP_EXT;
+            col_nullmaps[c] = (uint8_t*)ray_data(ext);
+        }
+    }
+
+    /* Build csv_type_t array for parse functions (maps td types → csv types) */
+    csv_type_t parse_types[CSV_MAX_COLS];
+    for (int c = 0; c < ncols; c++) {
+        switch (resolved_types[c]) {
+            case RAY_BOOL:      parse_types[c] = CSV_TYPE_BOOL;      break;
+            case RAY_I64:       parse_types[c] = CSV_TYPE_I64;       break;
+            case RAY_F64:       parse_types[c] = CSV_TYPE_F64;       break;
+            case RAY_DATE:      parse_types[c] = CSV_TYPE_DATE;      break;
+            case RAY_TIME:      parse_types[c] = CSV_TYPE_TIME;      break;
+            case RAY_TIMESTAMP: parse_types[c] = CSV_TYPE_TIMESTAMP; break;
+            case RAY_GUID:      parse_types[c] = CSV_TYPE_GUID;      break;
+            default:           parse_types[c] = CSV_TYPE_STR;       break;
+        }
+    }
+
+    /* ---- 9. Parse data ---- */
+    int64_t sym_max_ids[CSV_MAX_COLS];
+    memset(sym_max_ids, 0, (size_t)ncols * sizeof(int64_t));
+
+    /* Check if any string columns exist */
+    int has_str_cols = 0;
+    for (int c = 0; c < ncols; c++) {
+        if (parse_types[c] == CSV_TYPE_STR) { has_str_cols = 1; break; }
+    }
+
+    /* Allocate strref arrays for string columns (temporary, freed after intern) */
+    csv_strref_t* str_ref_bufs[CSV_MAX_COLS];
+    ray_t* str_ref_hdrs[CSV_MAX_COLS];
+    memset(str_ref_bufs, 0, sizeof(str_ref_bufs));
+    memset(str_ref_hdrs, 0, sizeof(str_ref_hdrs));
+    for (int c = 0; c < ncols; c++) {
+        if (parse_types[c] == CSV_TYPE_STR) {
+            size_t sz = (size_t)n_rows * sizeof(csv_strref_t);
+            str_ref_bufs[c] = (csv_strref_t*)scratch_alloc(&str_ref_hdrs[c], sz);
+            if (!str_ref_bufs[c]) {
+                for (int j = 0; j < ncols; j++) ray_release(col_vecs[j]);
+                for (int j = 0; j < c; j++) scratch_free(str_ref_hdrs[j]);
+                goto fail_offsets;
+            }
+        }
+    }
+
+    {
+        ray_pool_t* pool = ray_pool_get();
+        bool use_parallel = pool && n_rows > 8192;
+
+        if (use_parallel) {
+            uint32_t n_workers = ray_pool_total_workers(pool);
+            size_t whn_sz = (size_t)n_workers * (size_t)ncols * sizeof(bool);
+            bool* worker_had_null_buf = (bool*)ray_sys_alloc(whn_sz);
+            if (!worker_had_null_buf) {
+                use_parallel = false;
+            } else {
+                memset(worker_had_null_buf, 0, whn_sz);
+
+                csv_par_ctx_t ctx = {
+                    .buf              = buf,
+                    .buf_size         = file_size,
+                    .row_offsets      = row_offsets,
+                    .n_rows           = n_rows,
+                    .n_cols           = ncols,
+                    .delim            = delimiter,
+                    .col_types        = parse_types,
+                    .col_data         = col_data,
+                    .str_refs         = str_ref_bufs,
+                    .col_nullmaps     = col_nullmaps,
+                    .worker_had_null  = worker_had_null_buf,
+                };
+
+                ray_pool_dispatch(pool, csv_parse_fn, &ctx, n_rows);
+
+                /* OR worker null flags into col_had_null */
+                for (uint32_t w = 0; w < n_workers; w++) {
+                    for (int c = 0; c < ncols; c++) {
+                        if (worker_had_null_buf[(size_t)w * (size_t)ncols + (size_t)c])
+                            col_had_null[c] = true;
+                    }
+                }
+                ray_sys_free(worker_had_null_buf);
+            }
+        }
+
+        if (!use_parallel) {
+            csv_parse_serial(buf, file_size, row_offsets, n_rows,
+                             ncols, delimiter, parse_types, col_data,
+                             str_ref_bufs, col_nullmaps, col_had_null);
+        }
+    }
+
+    /* ---- 9b. Materialize RAY_STR columns AND batch-intern sym columns ----
+     * These two phases touch disjoint columns and (after the GUID fix)
+     * intern_strings is the only one that mutates the global sym table.
+     * Dispatch them as two thread-pool tasks so they overlap in wall time
+     * — typically saves the smaller of the two phases. */
+    if (has_str_cols) {
+        csv_finalize_ctx_t fctx = {
+            .str_refs       = str_ref_bufs,
+            .n_cols         = ncols,
+            .parse_types    = parse_types,
+            .resolved_types = resolved_types,
+            .col_data       = col_data,
+            .col_vecs       = col_vecs,
+            .n_rows         = n_rows,
+            .sym_max_ids    = sym_max_ids,
+            .col_nullmaps   = col_nullmaps,
+            .fill_ok        = true,
+            .intern_ok      = true,
+        };
+        ray_pool_t* fpool = ray_pool_get();
+        if (fpool && ray_pool_total_workers(fpool) >= 2) {
+            ray_pool_dispatch_n(fpool, csv_finalize_task, &fctx, 2);
+        } else {
+            csv_finalize_task(&fctx, 0, 0, 1);
+            csv_finalize_task(&fctx, 0, 1, 2);
+        }
+        if (!fctx.fill_ok || !fctx.intern_ok) {
+            csv_free_escaped_strrefs(str_ref_bufs, ncols, parse_types, n_rows, buf, file_size);
+            for (int c = 0; c < ncols; c++) scratch_free(str_ref_hdrs[c]);
+            for (int c = 0; c < ncols; c++) ray_release(col_vecs[c]);
+            goto fail_offsets;
+        }
+    }
+
+    /* Free heap-allocated escaped string copies, then strref buffers */
+    csv_free_escaped_strrefs(str_ref_bufs, ncols, parse_types, n_rows, buf, file_size);
+    for (int c = 0; c < ncols; c++) scratch_free(str_ref_hdrs[c]);
+
+    /* ---- 9c. Strip nullmaps from all-valid columns ---- */
+    for (int c = 0; c < ncols; c++) {
+        if (col_had_null[c]) continue;
+        ray_t* vec = col_vecs[c];
+        if (vec->attrs & RAY_ATTR_NULLMAP_EXT) {
+            ray_release(vec->ext_nullmap);
+            vec->ext_nullmap = NULL;
+        }
+        vec->attrs &= (uint8_t)~(RAY_ATTR_HAS_NULLS | RAY_ATTR_NULLMAP_EXT);
+        /* RAY_STR stores str_pool in bytes 8-15 of the header — don't wipe. */
+        if (vec->type != RAY_STR) memset(vec->nullmap, 0, 16);
+    }
+
+    /* ---- 10. Narrow sym columns to optimal width ---- */
+    for (int c = 0; c < ncols; c++) {
+        if (resolved_types[c] != RAY_SYM) continue;
+        uint8_t new_w = ray_sym_dict_width(sym_max_ids[c]);
+        if (new_w >= RAY_SYM_W32) continue; /* already at W32, no savings */
+        ray_t* narrow = ray_sym_vec_new(new_w, n_rows);
+        if (!narrow || RAY_IS_ERR(narrow)) continue;
+        narrow->len = n_rows;
+        const uint32_t* src = (const uint32_t*)col_data[c];
+        void* dst = ray_data(narrow);
+        if (new_w == RAY_SYM_W8) {
+            uint8_t* d = (uint8_t*)dst;
+            for (int64_t r = 0; r < n_rows; r++) d[r] = (uint8_t)src[r];
+        } else { /* RAY_SYM_W16 */
+            uint16_t* d = (uint16_t*)dst;
+            for (int64_t r = 0; r < n_rows; r++) d[r] = (uint16_t)src[r];
+        }
+        /* Transfer nullmap to narrowed vector */
+        if (col_vecs[c]->attrs & RAY_ATTR_HAS_NULLS) {
+            narrow->attrs |= (col_vecs[c]->attrs & (RAY_ATTR_HAS_NULLS | RAY_ATTR_NULLMAP_EXT));
+            if (col_vecs[c]->attrs & RAY_ATTR_NULLMAP_EXT) {
+                narrow->ext_nullmap = col_vecs[c]->ext_nullmap;
+                ray_retain(narrow->ext_nullmap);
+            } else {
+                memcpy(narrow->nullmap, col_vecs[c]->nullmap, 16);
+            }
+        }
+        ray_release(col_vecs[c]);
+        col_vecs[c] = narrow;
+        col_data[c] = dst;
+    }
+
+    /* ---- 11. Build table ---- */
+    {
+        ray_t* tbl = ray_table_new(ncols);
+        if (!tbl || RAY_IS_ERR(tbl)) {
+            for (int c = 0; c < ncols; c++) ray_release(col_vecs[c]);
+            goto fail_offsets;
+        }
+
+        for (int c = 0; c < ncols; c++) {
+            tbl = ray_table_add_col(tbl, col_name_ids[c], col_vecs[c]);
+            ray_release(col_vecs[c]);
+        }
+
+        result = tbl;
+    }
+
+    /* ---- 12. Cleanup ---- */
+    scratch_free(row_offsets_hdr);
+    munmap(buf, file_size);
+    return result;
+
+    /* Error paths */
+fail_offsets:
+    scratch_free(row_offsets_hdr);
+fail_unmap:
+    munmap(buf, file_size);
+    return ray_error("oom", NULL);
+}
+
+/* --------------------------------------------------------------------------
+ * ray_read_csv — convenience wrapper: forwards `path` to ray_read_csv_opts
+ * with delimiter 0, header = true and no explicit column types (NULL, 0).
+ * -------------------------------------------------------------------------- */
+ray_t* ray_read_csv(const char* path) {
+    return ray_read_csv_opts(path, 0, true, NULL, 0);
+}
+
+/* ============================================================================
+ * ray_write_csv — Write a table to a CSV file (RFC 4180)
+ *
+ * Writes header row with column names, then data rows.
+ * Strings containing commas, quotes, or newlines are quoted.
+ * Returns RAY_OK on success, error code on failure.
+ * ============================================================================ */
+
+/* -----------------------------------------------------------------------------
+ * write-csv writer state
+ *
+ * Pairs the output FILE* with a sticky error flag.  The cw_* helpers set
+ * `err` on the first failed write and become no-ops afterwards, so callers
+ * can emit fields unconditionally; ray_write_csv then returns RAY_ERR_IO.
+ * --------------------------------------------------------------------------- */
+
+typedef struct csv_writer_t {
+    FILE*     fp;   /* stream being written */
+    int       err;  /* 0 = OK, non-zero = sticky error */
+} csv_writer_t;
+
+/* Emit one byte; a failed fputc latches w->err.  No-op once an error is set. */
+static inline void cw_putc(csv_writer_t* w, int c) {
+    if (!w->err && fputc(c, w->fp) == EOF) w->err = 1;
+}
+
+/* Write `len` bytes of `s`; skipped when len == 0 or an error is already set. */
+static inline void cw_write(csv_writer_t* w, const char* s, size_t len) {
+    if (!w->err && len != 0 && fwrite(s, 1, len, w->fp) != len) w->err = 1;
+}
+
+/* Write a NUL-terminated string via cw_write; a NULL pointer is ignored. */
+static inline void cw_puts(csv_writer_t* w, const char* s) {
+    if (s) cw_write(w, s, strlen(s));
+}
+
+/* Bounded, error-propagating fprintf replacement: formats into a 64-byte
+ * stack buffer and latches w->err on a formatting failure or truncation. */
+static void cw_printf(csv_writer_t* w, const char* fmt, ...) {
+    if (w->err) return;
+    char out[64];
+    va_list args;
+    va_start(args, fmt);
+    int written = vsnprintf(out, sizeof(out), fmt, args);
+    va_end(args);
+    if (written < 0 || (size_t)written >= sizeof(out)) { w->err = 1; return; }
+    cw_write(w, out, (size_t)written);
+}
+
+/* Write a string field.  A field containing ',', '"', CR or LF is wrapped in
+ * double quotes and every embedded '"' is doubled (RFC 4180 quoting). */
+static void csv_write_str(csv_writer_t* w, const char* s, size_t len) {
+    bool quoted = false;
+    for (size_t k = 0; k < len && !quoted; k++) {
+        char ch = s[k];
+        quoted = (ch == ',' || ch == '"' || ch == '\n' || ch == '\r');
+    }
+    if (!quoted) {
+        cw_write(w, s, len);
+        return;
+    }
+    cw_putc(w, '"');
+    size_t run = 0;  /* start of the run not yet written */
+    for (size_t k = 0; k < len; k++) {
+        if (s[k] != '"') continue;
+        /* Flush the text before this quote, emit the extra '"', then
+         * restart the run AT the quote so the original '"' follows it. */
+        cw_write(w, s + run, k - run);
+        cw_putc(w, '"');
+        run = k;
+    }
+    cw_write(w, s + run, len - run);
+    cw_putc(w, '"');
+}
+
+static void csv_write_date(csv_writer_t* w, int32_t v) {
+    /* days since 2000-01-01 → YYYY-MM-DD via civil_from_days (Hinnant).
+     * z is widened to 64 bits: v + 730425 overflows int32 for extreme v. */
+    int64_t z = (int64_t)v + 10957 + 719468;      /* re-base to 0000-03-01 */
+    int64_t era = (z >= 0 ? z : z - 146096) / 146097;   /* floor(z / 146097) */
+    uint32_t doe = (uint32_t)(z - era * 146097);        /* day of era, [0, 146096] */
+    uint32_t yoe = (doe - doe/1460 + doe/36524 - doe/146096) / 365;
+    uint32_t doy = doe - (365*yoe + yoe/4 - yoe/100);   /* day of March-based year */
+    uint32_t mp = (5*doy + 2) / 153;                    /* month index, 0 = March */
+    int32_t  d = (int32_t)(doy - (153*mp + 2)/5 + 1);
+    int32_t  m = (int32_t)(mp < 10 ? mp + 3 : mp - 9);
+    int32_t  y = (int32_t)((int64_t)yoe + era * 400) + (m <= 2 ? 1 : 0);
+    cw_printf(w, "%04d-%02d-%02d", y, m, d);
+}
+
+static void csv_write_time(csv_writer_t* w, int32_t ms) {
+    /* RAY_TIME is a signed ms-of-day; negative values are negative durations
+     * (Rayforce convention).  They are printed as "-" plus the magnitude
+     * instead of being wrapped modulo one day, so the sign survives. */
+    bool negative = ms < 0;
+    /* Take the magnitude in 64 bits so negating INT32_MIN cannot overflow. */
+    int64_t wide = ms;
+    uint32_t mag = (uint32_t)(negative ? -wide : wide);
+    uint32_t hours   = mag / 3600000u;
+    uint32_t minutes = (mag / 60000u) % 60u;
+    uint32_t seconds = (mag / 1000u) % 60u;
+    uint32_t millis  = mag % 1000u;
+    if (negative) cw_putc(w, '-');
+    if (millis != 0) cw_printf(w, "%02u:%02u:%02u.%03u", hours, minutes, seconds, millis);
+    else             cw_printf(w, "%02u:%02u:%02u", hours, minutes, seconds);
+}
+
+static void csv_write_timestamp(csv_writer_t* w, int64_t ns) {
+    /* RAY_TIMESTAMP stores *nanoseconds* since 2000-01-01, matching
+     * the language-level formatter (src/lang/format.c:ts_to_parts).
+     * C's / and % truncate toward zero, so a negative remainder is
+     * folded back into [0, NS_PER_DAY) by borrowing one day. */
+    const int64_t NS_PER_DAY = 86400000000000LL;
+    int64_t day_no = ns / NS_PER_DAY;
+    int64_t of_day = ns % NS_PER_DAY;
+    if (of_day < 0) { of_day += NS_PER_DAY; day_no -= 1; }
+    /* |day_no| is at most INT64_MAX / NS_PER_DAY + 1 = 106,752 days
+     * (about ±292 years around 2000), so narrowing to the int32 day
+     * count that csv_write_date expects cannot lose information. */
+    csv_write_date(w, (int32_t)day_no);
+    cw_putc(w, 'T');
+    const uint64_t NS_PER_SEC = 1000000000ULL;
+    uint64_t rest  = (uint64_t)of_day;
+    uint64_t secs  = rest / NS_PER_SEC;
+    uint32_t nanos = (uint32_t)(rest % NS_PER_SEC);
+    uint32_t hh = (uint32_t)(secs / 3600u);
+    uint32_t mm = (uint32_t)((secs / 60u) % 60u);
+    uint32_t ss = (uint32_t)(secs % 60u);
+    if (nanos != 0) cw_printf(w, "%02u:%02u:%02u.%09u", hh, mm, ss, nanos);
+    else            cw_printf(w, "%02u:%02u:%02u", hh, mm, ss);
+}
+
+static void csv_write_f64(csv_writer_t* w, double v) {
+    /* nan/±inf get fixed spellings; finite values use round-trip %.17g,
+     * formatted by cw_printf into its bounded 64-byte stack buffer. */
+    if (isnan(v))      cw_puts(w, "nan");
+    else if (isinf(v)) cw_puts(w, v < 0 ? "-inf" : "inf");
+    else               cw_printf(w, "%.17g", v);
+}
+
+static void csv_write_guid(csv_writer_t* w, const uint8_t* g) {
+    /* 16 bytes at g → RFC 4122 text xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx (lowercase hex, 36 chars) */
+    cw_printf(w,
+        "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+        g[0], g[1], g[2],  g[3],  g[4],  g[5],  g[6],  g[7],
+        g[8], g[9], g[10], g[11], g[12], g[13], g[14], g[15]);
+}
+
+/* Per-column lookup state, resolved once by csv_col_info_init so the row
+ * loop can index data directly without repeating slice/null bookkeeping. */
+typedef struct csv_col_info_t {
+    ray_t*        col;            /* original column (may be sliced) */
+    ray_t*        data_owner;     /* slice_parent or col */
+    int64_t       base_row;       /* slice_offset or 0 */
+    const void*   data;           /* ray_data(data_owner) */
+    int8_t        type;           /* col->type (0 when col is NULL) */
+    uint8_t       attrs;          /* of data_owner */
+    bool          has_nulls;      /* requires per-row ray_vec_is_null probe */
+} csv_col_info_t;
+
+static void csv_col_info_init(csv_col_info_t* ci, ray_t* col) {
+    /* Fill *ci for one table column (col may be NULL).  A slice view
+     * (RAY_ATTR_SLICE with a slice_parent) reads its data from the parent
+     * starting at slice_offset; any other column owns its own data. */
+    bool sliced = col && (col->attrs & RAY_ATTR_SLICE) && col->slice_parent;
+    ray_t* owner = sliced ? col->slice_parent : col;
+
+    ci->col        = col;
+    ci->data_owner = owner;
+    ci->base_row   = sliced ? col->slice_offset : 0;
+    ci->type       = col ? col->type : 0;
+    ci->attrs      = owner ? owner->attrs : 0;
+    ci->data       = owner ? ray_data(owner) : NULL;
+
+    /* The flag may sit on the view or on its parent, so test both; when
+     * neither is set the writer skips the per-row ray_vec_is_null probe. */
+    ci->has_nulls  = (col   && (col->attrs   & RAY_ATTR_HAS_NULLS)) ||
+                     (owner && (owner->attrs & RAY_ATTR_HAS_NULLS));
+}
+
+/* Write row r of one column as a single CSV field (no delimiter, no newline).
+ * Nulls, NULL columns and unhandled types produce an empty field.  `r` is
+ * relative to the (possibly sliced) column: direct reads of the owner's data
+ * use dr = base_row + r, while ray_vec_is_null / ray_str_vec_get are given
+ * the original column together with r. */
+static void csv_write_cell(csv_writer_t* w, const csv_col_info_t* ci, int64_t r) {
+    if (!ci->col) return;
+    /* Null cell -> empty field (consistent with read-csv). */
+    if (ci->has_nulls && ray_vec_is_null(ci->col, r)) return;
+
+    int64_t dr = ci->base_row + r;
+    int8_t t   = ci->type;
+    const void* d = ci->data;
+
+    switch (t) {
+    case RAY_I64:
+        cw_printf(w, "%" PRId64, ((const int64_t*)d)[dr]);
+        break;
+    case RAY_I32:
+        cw_printf(w, "%" PRId32, ((const int32_t*)d)[dr]);
+        break;
+    case RAY_I16:
+        cw_printf(w, "%d", (int)((const int16_t*)d)[dr]);
+        break;
+    case RAY_BOOL:
+        cw_puts(w, ((const uint8_t*)d)[dr] ? "true" : "false");
+        break;
+    case RAY_U8:
+        cw_printf(w, "%u", (unsigned)((const uint8_t*)d)[dr]);
+        break;
+    case RAY_F64:
+        csv_write_f64(w, ((const double*)d)[dr]);
+        break;
+    case RAY_DATE:
+        csv_write_date(w, ((const int32_t*)d)[dr]);
+        break;
+    case RAY_TIME:
+        csv_write_time(w, ((const int32_t*)d)[dr]);
+        break;
+    case RAY_TIMESTAMP:
+        csv_write_timestamp(w, ((const int64_t*)d)[dr]);
+        break;
+    case RAY_SYM: {
+        int64_t sym = ray_read_sym(d, dr, t, ci->attrs);
+        ray_t* s = ray_sym_str(sym);
+        if (s) csv_write_str(w, ray_str_ptr(s), ray_str_len(s));
+        /* unknown sym id -> empty field rather than a phantom value */
+        break;
+    }
+    case RAY_STR: {
+        /* ray_str_vec_get accepts the original (possibly sliced) col and
+         * resolves the parent+offset internally.  It returns NULL for
+         * nulls, which we already filtered above, so treat NULL as
+         * empty-but-valid (e.g. a 0-length inline string). */
+        size_t slen = 0;
+        const char* sp = ray_str_vec_get(ci->col, r, &slen);
+        csv_write_str(w, sp ? sp : "", slen);
+        break;
+    }
+    case RAY_GUID:
+        csv_write_guid(w, (const uint8_t*)d + dr * 16);
+        break;
+    case RAY_LIST: {
+        /* LIST cells: each element is itself a ray_t*, so render it with
+         * ray_fmt and write the resulting text as one string field.  For
+         * nested tables / lists-of-lists this is a best-effort flat form.
+         * csv_write_str quotes the field whenever the text contains a
+         * comma, quote or newline, so column alignment is preserved.
+         * Missing or error elements (and ray_fmt failures) -> empty field. */
+        ray_t** elems = (ray_t**)d;
+        ray_t* e = elems[dr];
+        if (!e || RAY_IS_ERR(e)) return;
+        ray_t* fmt = ray_fmt(e, false);
+        if (!fmt || RAY_IS_ERR(fmt)) return;
+        csv_write_str(w, ray_str_ptr(fmt), ray_str_len(fmt));
+        ray_release(fmt);
+        break;
+    }
+    default:
+        /* Unhandled type: emit an empty field rather than corrupting
+         * downstream columns.  Callers can inspect the file and see
+         * the missing data explicitly. */
+        break;
+    }
+}
+
+ray_err_t ray_write_csv(ray_t* table, const char* path) {
+    if (!table || !path || path[0] == '\0') return RAY_ERR_TYPE;
+
+    int64_t ncols = ray_table_ncols(table);
+    int64_t nrows = ray_table_nrows(table);
+    if (ncols <= 0) return RAY_ERR_TYPE;
+
+    /* Crash-safe write: stream into "<path>.tmp", fsync it, then rename over
+     * `path`, so an interrupted write never leaves a partial destination
+     * (mirrors ray_col_save).  A tmp path that does not fit is RAY_ERR_IO. */
+    char tmp_path[1024];
+    if (snprintf(tmp_path, sizeof(tmp_path), "%s.tmp", path) >= (int)sizeof(tmp_path))
+        return RAY_ERR_IO;
+
+    FILE* fp = fopen(tmp_path, "wb");
+    if (!fp) return RAY_ERR_IO;
+
+    csv_writer_t w = { .fp = fp, .err = 0 };
+
+    /* Resolve every column once (slice parent, nullability, type) so
+     * the hot loop just indexes into pre-computed pointers. */
+    ray_t* col_info_block = ray_alloc((size_t)ncols * sizeof(csv_col_info_t));
+    if (!col_info_block || RAY_IS_ERR(col_info_block)) {
+        fclose(fp);
+        remove(tmp_path);
+        return RAY_ERR_OOM;
+    }
+    csv_col_info_t* ci = (csv_col_info_t*)ray_data(col_info_block);
+    for (int64_t c = 0; c < ncols; c++)
+        csv_col_info_init(&ci[c], ray_table_get_col_idx(table, c));
+
+    /* Header row: column names (a name id with no string yields an empty field) */
+    for (int64_t c = 0; c < ncols; c++) {
+        if (c > 0) cw_putc(&w, ',');
+        int64_t name_id = ray_table_col_name(table, c);
+        ray_t* name_atom = ray_sym_str(name_id);
+        if (name_atom)
+            csv_write_str(&w, ray_str_ptr(name_atom), ray_str_len(name_atom));
+    }
+    cw_putc(&w, '\n');
+
+    /* Data rows: the sticky error is checked once per row to stop early */
+    for (int64_t r = 0; r < nrows && !w.err; r++) {
+        for (int64_t c = 0; c < ncols; c++) {
+            if (c > 0) cw_putc(&w, ',');
+            csv_write_cell(&w, &ci[c], r);
+        }
+        cw_putc(&w, '\n');
+    }
+
+    ray_free(col_info_block);
+
+    /* Flush the stdio buffer and close; a failure of either counts as a write error. */
+    if (fflush(fp) != 0) w.err = 1;
+    int close_err = (fclose(fp) != 0);
+    if (close_err) w.err = 1;
+
+    if (w.err) {
+        remove(tmp_path);
+        return RAY_ERR_IO;
+    }
+
+    /* Reopen the (already closed) temp file and fsync it so the rename is backed by durable bytes. */
+    ray_fd_t fd = ray_file_open(tmp_path, RAY_OPEN_READ | RAY_OPEN_WRITE);
+    if (fd == RAY_FD_INVALID) { remove(tmp_path); return RAY_ERR_IO; }
+    ray_err_t sync_err = ray_file_sync(fd);
+    ray_file_close(fd);
+    if (sync_err != RAY_OK) { remove(tmp_path); return sync_err; }
+
+    ray_err_t rn_err = ray_file_rename(tmp_path, path);
+    if (rn_err != RAY_OK) { remove(tmp_path); return rn_err; }
+
+    return RAY_OK;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/io/csv.h b/crates/rayforce-sys/vendor/rayforce/src/io/csv.h
new file mode 100644
index 0000000..2240ae4
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/io/csv.h
@@ -0,0 +1,34 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_CSV_H
+#define RAY_CSV_H
+
+#include <rayforce.h>
+
+ray_t* ray_read_csv(const char* path);  /* = ray_read_csv_opts(path, 0, true, NULL, 0) */
+ray_t* ray_read_csv_opts(const char* path, char delimiter, bool header,
+                        const int8_t* col_types, int32_t n_types);
+ray_err_t ray_write_csv(ray_t* table, const char* path);  /* RAY_OK, or an error code on failure */
+
+#endif /* RAY_CSV_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/cal.h b/crates/rayforce-sys/vendor/rayforce/src/lang/cal.h
new file mode 100644
index 0000000..a36aee8
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/lang/cal.h
@@ -0,0 +1,84 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_CAL_H
+#define RAY_CAL_H
+
+#include <stdint.h>
+
+/* ===== Calendar primitives shared by format.c and parse.c ===== */
+
+#define RAY_DATE_EPOCH 2000
+
+/* Cumulative days before each month: MONTHDAYS[leap][m] for 0-based month m.
+ * Entry 0 = Jan start (0 days); entry 12 = year length (365, or 366 if leap). */
+static const uint32_t MONTHDAYS[2][13] = {
+    {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365},
+    {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366},
+};
+
+static inline int date_leap_year(int year) {  /* Gregorian rule; yields 1 or 0 */
+    return year % 400 == 0 || (year % 100 != 0 && year % 4 == 0);
+}
+
+static inline int32_t date_years_by_days(int yy) {  /* days in the first yy Gregorian years */
+    return (int32_t)(365 * (int64_t)yy + (yy / 4 - yy / 100 + yy / 400));
+}
+
+/* Decode: days-since-epoch (day 0 = RAY_DATE_EPOCH-01-01) → year/month/day, m and d 1-based */
+static inline void date_to_ymd(int32_t days, int* y, int* m, int* d) {
+    int32_t offset = days + date_years_by_days(RAY_DATE_EPOCH - 1);  /* re-base onto year 1 */
+    double approx = (double)offset / 365.2425;
+    int32_t years = (int32_t)(approx >= 0.0 ? approx + 0.5 : approx - 0.5);
+    /* years is a rounded estimate of whole elapsed years; undo an overshoot. */
+    if (date_years_by_days(years) > offset)
+        years -= 1;
+
+    int32_t rem = offset - date_years_by_days(years);  /* 0-based day within year yy */
+    int yy = years + 1;
+    int leap = date_leap_year(yy);
+    int mid = 0;
+    /* Scan down for the last month whose start is not after rem (rem / start != 0). */
+    for (mid = 12; mid > 0; mid--)
+        if (MONTHDAYS[leap][mid] != 0 && rem / (int32_t)MONTHDAYS[leap][mid] != 0)
+            break;
+    /* NOTE(review): mid == 12 means rem >= year length; it is folded to January. */
+    if (mid == 12 || mid < 0)
+        mid = 0;
+
+    *y = yy;
+    *m = 1 + mid % 12;
+    *d = 1 + rem - (int32_t)MONTHDAYS[leap][mid];
+}
+
+/* Encode: year/month/day → days-since-epoch.  Out-of-range year/month are clamped. */
+static inline int32_t ymd_to_date(int year, int month, int day) {
+    int yy = (year > 0) ? year - 1 : 0;
+    int leap = date_leap_year(year);
+    /* Clamp to the 13-entry MONTHDAYS row: month <= 0 → 0, month > 13 → 12. */
+    int mm = (month <= 0) ? 0 : (month > 13 ? 12 : month - 1);
+    int32_t mdays = (int32_t)MONTHDAYS[leap][mm];
+    return date_years_by_days(yy) - date_years_by_days(RAY_DATE_EPOCH - 1) + mdays + day - 1;
+}
+
+#endif /* RAY_CAL_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/compile.c b/crates/rayforce-sys/vendor/rayforce/src/lang/compile.c
new file mode 100644
index 0000000..61bc2cf
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/lang/compile.c
@@ -0,0 +1,518 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "lang/eval.h"
+#include "lang/env.h"
+#include "lang/nfo.h"
+#include <stdbool.h>
+#include <string.h>
+
+/* ── Compiler state ──
+ * Internal buffers are ray_t objects whose data area holds the raw
+ * bytes / pointers. This avoids calling malloc/free. */
+typedef struct {
+    ray_t    *code_obj;   /* RAY_U8 vector used as growable byte buffer */
+    uint8_t *code;       /* == ray_data(code_obj) */
+    int32_t  code_len;   /* bytes emitted so far */
+    int32_t  code_cap;   /* bytes requested for code_obj's data area */
+
+    ray_t    *consts_obj; /* RAY_LIST used as growable pointer array */
+    ray_t   **consts;     /* == ray_data(consts_obj) */
+    int32_t  n_consts;    /* pool entries in use; each one holds a retained ref */
+    int32_t  consts_cap;  /* pointer slots requested for consts_obj */
+
+    struct { int64_t sym_id; int32_t slot; } locals[256];  /* symbol → local slot; slots are emitted as one byte */
+    int32_t  n_locals;   /* entries used in locals[] */
+    int32_t  max_locals; /* high-water mark of n_locals; stored as the lambda's local count */
+    bool     error;      /* sticky failure flag; ray_compile discards the output when set */
+    ray_t   *lambda;     /* the lambda being compiled (for 'self' resolution) */
+
+    ray_t    *dbg_obj;   /* I64 vector: pairs of [offset, span.id] */
+    int32_t   dbg_len;   /* zeroed by compiler_init; not otherwise written in this file */
+} compiler_t;
+
+static void compile_expr(compiler_t *c, ray_t *ast);
+
+static bool compiler_init(compiler_t *c) {  /* zero *c and allocate both buffers; false if either allocation returns NULL */
+    memset(c, 0, sizeof(*c));
+    c->code_cap = 256;                       /* initial bytecode capacity; emit() doubles it on demand */
+    c->code_obj = ray_alloc(c->code_cap);
+    if (!c->code_obj) return false;
+    c->code_obj->type = RAY_U8;
+    c->code_obj->len = 0;                    /* length is tracked in code_len, not in the object */
+    c->code = (uint8_t *)ray_data(c->code_obj);
+
+    c->consts_cap = 16;                      /* initial pool capacity; add_constant() doubles it on demand */
+    c->consts_obj = ray_alloc(c->consts_cap * sizeof(ray_t *));
+    if (!c->consts_obj) { ray_release(c->code_obj); return false; }  /* undo the first allocation */
+    c->consts_obj->type = RAY_LIST;
+    c->consts_obj->len = 0;                  /* len stays 0; elements are released by hand in compiler_destroy */
+    c->consts = (ray_t **)ray_data(c->consts_obj);
+    memset(c->consts, 0, c->consts_cap * sizeof(ray_t *));
+    return true;
+}
+
+static void compiler_destroy(compiler_t *c) {  /* releases pool entries and both buffers; dbg_obj is NOT released here */
+    for (int32_t i = 0; i < c->n_consts; i++)
+        if (c->consts[i]) ray_release(c->consts[i]);  /* drop the ref taken by add_constant */
+    ray_release(c->consts_obj);
+    ray_release(c->code_obj);
+}
+
+/* ── Debug info helpers ── */
+static void dbg_append(compiler_t* c, int32_t offset, int64_t span_id) {  /* append one [offset, span_id] pair to c->dbg_obj */
+    if (!c->dbg_obj) {                           /* vector is created lazily on first use */
+        c->dbg_obj = ray_vec_new(RAY_I64, 0);
+        if (!c->dbg_obj) return;                 /* best effort: debug info is silently skipped */
+    }
+    int64_t off64 = (int64_t)offset;             /* both elements are stored as I64 */
+    c->dbg_obj = ray_vec_append(c->dbg_obj, &off64);   /* NOTE(review): result is not checked for NULL/error — confirm ray_vec_append's contract */
+    c->dbg_obj = ray_vec_append(c->dbg_obj, &span_id);
+}
+
+#define EMIT_DBG(c, ast) do { /* record (code offset, span id) when the lambda has nfo and a non-zero span for ast */ \
+    if ((c)->lambda && LAMBDA_NFO((c)->lambda)) { \
+        ray_span_t _sp = ray_nfo_get(LAMBDA_NFO((c)->lambda), (ast)); \
+        if (_sp.id != 0) dbg_append((c), (c)->code_len, _sp.id); \
+    } \
+} while(0)
+
+/* ── Emit helpers ── */
+static void emit(compiler_t *c, uint8_t byte) {  /* append one byte of bytecode, doubling the buffer when full */
+    if (c->code_len >= c->code_cap) {
+        int32_t new_cap = c->code_cap * 2;
+        ray_t *new_obj = ray_alloc(new_cap);
+        if (!new_obj) { c->error = true; return; }  /* byte is dropped; code_len is left unchanged */
+        new_obj->type = RAY_U8;
+        new_obj->len = 0;
+        memcpy(ray_data(new_obj), c->code, c->code_len);  /* carry over what was emitted so far */
+        ray_release(c->code_obj);
+        c->code_obj = new_obj;
+        c->code = (uint8_t *)ray_data(new_obj);
+        c->code_cap = new_cap;
+    }
+    c->code[c->code_len++] = byte;
+}
+
+static void emit_const(compiler_t *c, int32_t idx) {
+    /* Emit a load of constant-pool entry `idx`: OP_LOADCONST carries one
+     * operand byte, OP_LOADCONST_W two (high byte first).  An index that
+     * does not fit in 16 bits cannot be encoded, so flag c->error rather
+     * than silently truncating it to a different pool entry. */
+    if (idx < 0 || idx > 0xFFFF) { c->error = true; return; }
+    emit(c, idx < 256 ? OP_LOADCONST : OP_LOADCONST_W);
+    if (idx >= 256) emit(c, (uint8_t)(idx >> 8));
+    emit(c, (uint8_t)(idx & 0xFF));
+}
+
+/* ── Constant pool ── */
+static int32_t add_constant(compiler_t *c, ray_t *value) {
+    /* Return value's pool index, reusing an entry that is the same object or an equal scalar atom. */
+    for (int32_t i = 0; i < c->n_consts; i++) {
+        ray_t *v = c->consts[i];
+        if (v == value) return i;
+        if (v->type == value->type && ray_is_atom(v)) {
+            if (v->type == -RAY_I64 && v->i64 == value->i64) return i;
+            if (v->type == -RAY_F64 && memcmp(&v->f64, &value->f64, sizeof v->f64) == 0) return i;  /* bitwise: 0.0 and -0.0 stay distinct */
+            if (v->type == -RAY_BOOL && v->b8 == value->b8) return i;
+            if (v->type == -RAY_SYM && v->i64 == value->i64 && v->attrs == value->attrs) return i;
+        }
+    }
+    if (c->n_consts >= c->consts_cap) {   /* pool full: move to a buffer twice the size */
+        int32_t new_cap = c->consts_cap * 2;
+        ray_t *new_obj = ray_alloc(new_cap * sizeof(ray_t *));
+        if (!new_obj || RAY_IS_ERR(new_obj)) { c->error = true; return c->n_consts; }  /* value is not stored */
+        new_obj->type = RAY_LIST;
+        new_obj->len = 0;
+        ray_t **new_arr = (ray_t **)ray_data(new_obj);
+        memcpy(new_arr, c->consts, c->n_consts * sizeof(ray_t *));
+        memset(new_arr + c->n_consts, 0, (new_cap - c->n_consts) * sizeof(ray_t *));
+        ray_release(c->consts_obj);       /* old buffer has len 0, so its entries are left to the new one */
+        c->consts_obj = new_obj;
+        c->consts = new_arr;
+        c->consts_cap = new_cap;
+    }
+    ray_retain(value);                    /* the pool holds its own reference */
+    c->consts[c->n_consts] = value;
+    return c->n_consts++;
+}
+
+/* ── Local variable tracking ── */
+static int32_t find_local(compiler_t *c, int64_t sym_id) {
+    for (int32_t k = c->n_locals; k-- > 0; )   /* most recently added binding is checked first */
+        if (c->locals[k].sym_id == sym_id) return c->locals[k].slot;
+    return -1;                                 /* no local carries this symbol */
+}
+
+static int32_t add_local(compiler_t *c, int64_t sym_id) {  /* bind sym_id to the next free slot; -1 when the table is full */
+    if (c->n_locals >= 256) return -1;         /* locals[] holds 256 entries */
+    int32_t slot = c->n_locals;                /* slot number equals the table index */
+    c->locals[c->n_locals].sym_id = sym_id;
+    c->locals[c->n_locals].slot = slot;
+    c->n_locals++;
+    if (c->n_locals > c->max_locals) c->max_locals = c->n_locals;  /* high-water mark */
+    return slot;
+}
+
+/* ── Jump helpers ── */
+static int32_t emit_jump(compiler_t *c, uint8_t opcode) {  /* emit opcode plus a 2-byte placeholder operand */
+    emit(c, opcode);
+    int32_t patch_pos = c->code_len;   /* position of the operand, to be handed to patch_jump */
+    emit(c, 0);
+    emit(c, 0);
+    return patch_pos;
+}
+
+static void patch_jump(compiler_t *c, int32_t pos) {
+    /* Back-patch the 2-byte operand at pos (high byte first) with the distance from its end to the end of code. */
+    int32_t raw = c->code_len - pos - 2;   /* negative only if the placeholder bytes were never emitted (failed emit) */
+    if (c->error || pos < 0 || raw < 0 || raw > 32767) { c->error = true; return; }  /* never write past code_len */
+    c->code[pos]     = (uint8_t)(raw >> 8);
+    c->code[pos + 1] = (uint8_t)(raw & 0xFF);
+}
+
+/* Cached sym IDs for special forms */
+static _Thread_local int64_t sf_set = -1, sf_let = -1, sf_if = -1, sf_do = -1, sf_fn = -1, sf_self = -1, sf_try = -1;
+
+static void init_sf_syms(void) {  /* intern the special-form names once per thread (the cache is _Thread_local) */
+    if (sf_set >= 0) return;      /* sf_set doubles as the "already initialised" marker */
+    sf_set  = ray_sym_intern("set", 3);
+    sf_let  = ray_sym_intern("let", 3);
+    sf_if   = ray_sym_intern("if",  2);
+    sf_do   = ray_sym_intern("do",  2);
+    sf_fn   = ray_sym_intern("fn",  2);
+    sf_self = ray_sym_intern("self", 4);
+    sf_try  = ray_sym_intern("try",  3);
+}
+
+/* ── Compile a list (special form or function call) ── */
+static void compile_list(compiler_t *c, ray_t *ast) {
+    if (c->error) return;
+    EMIT_DBG(c, ast);
+    int64_t n = ray_len(ast);
+    if (n == 0) { c->error = true; return; }
+    ray_t **elems = (ray_t **)ray_data(ast);
+    ray_t *head = elems[0];
+
+    init_sf_syms();
+
+    /* Check for special forms by name */
+    if (head->type == -RAY_SYM && (head->attrs & RAY_ATTR_NAME)) {
+        int64_t sym_id = head->i64;
+
+        /* (set name value) — dynamic eval (set modifies global env) */
+        if (sym_id == sf_set && n == 3) {
+            int32_t idx = add_constant(c, ast);
+            emit_const(c, idx);
+            emit(c, OP_CALLD);
+            emit(c, 0);
+            return;
+        }
+
+        /* (let name value) — compile value, store in local slot.
+         * Reserved names (`.sys.*`, `.os.*`, `.csv.*`, `.ipc.*`) are
+         * refused here so a compiled lambda can't shadow a builtin
+         * through its local-slot table — the same guard
+         * ray_env_set_local enforces on the tree-walking path.
+         * Setting c->error aborts bytecode emission; call_lambda
+         * then falls back to the tree-walking interpreter which
+         * raises the proper `reserve` error via ray_let_fn. */
+        if (sym_id == sf_let && n == 3) {
+            ray_t *name_obj = elems[1];
+            if (name_obj->type != -RAY_SYM ||
+                ray_sym_is_reserved(name_obj->i64)) {
+                c->error = true;
+                return;
+            }
+            compile_expr(c, elems[2]);
+            emit(c, OP_DUP);
+            int32_t slot = find_local(c, name_obj->i64);
+            if (slot < 0) slot = add_local(c, name_obj->i64);
+            if (slot < 0) { c->error = true; return; }
+            emit(c, OP_STOREENV);
+            emit(c, (uint8_t)slot);
+            return;
+        }
+
+        /* (if cond then else?) */
+        if (sym_id == sf_if && n >= 3) {
+            compile_expr(c, elems[1]);
+            int32_t jmpf_pos = emit_jump(c, OP_JMPF);
+            compile_expr(c, elems[2]);
+            if (n >= 4) {
+                int32_t jmp_pos = emit_jump(c, OP_JMP);
+                patch_jump(c, jmpf_pos);
+                compile_expr(c, elems[3]);
+                patch_jump(c, jmp_pos);
+            } else {
+                int32_t jmp_pos = emit_jump(c, OP_JMP);
+                patch_jump(c, jmpf_pos);
+                ray_t *zero = ray_alloc(0);
+                zero->type = -RAY_I64;
+                zero->i64 = 0;
+                int32_t idx = add_constant(c, zero);
+                ray_release(zero);
+                emit_const(c, idx);
+                patch_jump(c, jmp_pos);
+            }
+            return;
+        }
+
+        /* (do expr1 expr2 ...) */
+        if (sym_id == sf_do && n >= 2) {
+            for (int64_t i = 1; i < n; i++) {
+                if (i > 1) emit(c, OP_POP);
+                compile_expr(c, elems[i]);
+            }
+            return;
+        }
+
+        /* (fn [params] body...) — nested lambda via dynamic eval */
+        if (sym_id == sf_fn && n >= 3) {
+            int32_t idx = add_constant(c, ast);
+            emit_const(c, idx);
+            emit(c, OP_CALLD);
+            emit(c, 0);
+            return;
+        }
+
+        /* (try body handler) — compile to OP_TRAP/OP_TRAP_END */
+        if (sym_id == sf_try && n == 3) {
+            /* Reserve a hidden local for err_val */
+            int32_t err_slot = add_local(c, -1);
+            if (err_slot < 0) { c->error = true; return; }
+
+            int32_t trap_pos = emit_jump(c, OP_TRAP);
+            compile_expr(c, elems[1]);       /* body */
+            emit(c, OP_TRAP_END);
+            int32_t jmp_pos = emit_jump(c, OP_JMP);
+            patch_jump(c, trap_pos);         /* handler starts here */
+            /* err_val is on stack (pushed by vm_error_cleanup).
+             * Stash it, compile handler fn, reload err_val, call. */
+            emit(c, OP_STOREENV);
+            emit(c, (uint8_t)err_slot);
+            compile_expr(c, elems[2]);       /* handler fn */
+            emit(c, OP_LOADENV);
+            emit(c, (uint8_t)err_slot);
+            emit(c, OP_CALLF);
+            emit(c, 1);                     /* call handler(err_val) */
+            patch_jump(c, jmp_pos);          /* end */
+            return;
+        }
+    }
+
+    /* Self-recursive call: emit OP_CALLS (lean frame reuse, no fn object) */
+    if (head->type == -RAY_SYM && (head->attrs & RAY_ATTR_NAME) &&
+        head->i64 == sf_self) {
+        int64_t argc = n - 1;
+        if (argc > 64) { c->error = true; return; }
+        for (int64_t i = 1; i < n; i++)
+            compile_expr(c, elems[i]);
+        emit(c, OP_CALLS);
+        emit(c, (uint8_t)argc);
+        return;
+    }
+
+    /* Look up head at compile time to determine call type */
+    ray_t *fn = NULL;
+    if (head->type == -RAY_SYM && (head->attrs & RAY_ATTR_NAME))
+        fn = ray_env_get(head->i64);
+
+    /* Unrecognized special form: dynamic eval on entire form */
+    if (fn && (fn->attrs & RAY_FN_SPECIAL_FORM)) {
+        int32_t idx = add_constant(c, ast);
+        emit_const(c, idx);
+        emit(c, OP_CALLD);
+        emit(c, 0);
+        return;
+    }
+
+    /* General function call: compile head, args, then dispatch.
+     * If head resolved to a builtin at compile time, emit LOADCONST
+     * instead of RESOLVE to skip the runtime hash lookup. */
+    if (fn && (fn->type == RAY_UNARY || fn->type == RAY_BINARY || fn->type == RAY_VARY)) {
+        int32_t idx = add_constant(c, fn);
+        emit_const(c, idx);
+    } else {
+        compile_expr(c, head);
+    }
+    int64_t argc = n - 1;
+    if (argc > 64) { c->error = true; return; }
+    for (int64_t i = 1; i < n; i++)
+        compile_expr(c, elems[i]);
+
+    /* Record call-site span so errors point to the call expression, not the last arg */
+    EMIT_DBG(c, ast);
+
+    if (fn) {
+        switch (fn->type) {
+        case RAY_UNARY:
+            if (argc == 1) { emit(c, OP_CALL1); return; }
+            break;
+        case RAY_BINARY:
+            if (argc == 2) { emit(c, OP_CALL2); return; }
+            break;
+        case RAY_VARY:
+            emit(c, OP_CALLN);
+            emit(c, (uint8_t)argc);
+            return;
+        case RAY_LAMBDA:
+            emit(c, OP_CALLF);
+            emit(c, (uint8_t)argc);
+            return;
+        default:
+            break;
+        }
+    }
+
+    emit(c, OP_CALLF);
+    emit(c, (uint8_t)argc);
+}
+
+/* ── Compile expression ── */
+static void compile_expr(compiler_t *c, ray_t *ast) {
+    /* Emits the code for one expression:
+     *   - name atom bound to a local   → OP_LOADENV slot
+     *   - any other name atom          → OP_RESOLVE / OP_RESOLVE_W on its
+     *                                    constant-pool entry
+     *   - other atoms, non-list values and empty lists → constant load
+     *   - non-empty lists              → compile_list
+     * Emits nothing once c->error is set, or for a NULL / error ast. */
+    if (c->error) return;
+    if (!ast || RAY_IS_ERR(ast)) return;
+    EMIT_DBG(c, ast);
+
+    if (ray_is_atom(ast) && ast->type == -RAY_SYM && (ast->attrs & RAY_ATTR_NAME)) {
+        int32_t slot = find_local(c, ast->i64);
+        if (slot >= 0) {
+            emit(c, OP_LOADENV);
+            emit(c, (uint8_t)slot);
+            return;
+        }
+        int32_t idx = add_constant(c, ast);
+        /* The operand is at most 16 bits; a larger pool index cannot be
+         * encoded, so fail compilation instead of truncating it. */
+        if (idx < 0 || idx > 0xFFFF) { c->error = true; return; }
+        if (idx < 256) {
+            emit(c, OP_RESOLVE);
+            emit(c, (uint8_t)idx);
+        } else {
+            emit(c, OP_RESOLVE_W);
+            emit(c, (uint8_t)(idx >> 8));
+            emit(c, (uint8_t)(idx & 0xFF));
+        }
+        return;
+    }
+
+    /* Literal: non-name atom, non-list object, or empty list. */
+    if (ray_is_atom(ast) || ast->type != RAY_LIST || ray_len(ast) == 0) {
+        int32_t idx = add_constant(c, ast);
+        emit_const(c, idx);
+        return;
+    }
+
+    compile_list(c, ast);
+}
+
+/* ── Public API ── */
+/* Compile lambda's body to bytecode and attach bytecode, constant pool, local
+ * count and debug map to it.  On any failure the lambda is left uncompiled. */
+void ray_compile(ray_t *lambda) {
+    if (LAMBDA_IS_COMPILED(lambda)) return;
+
+    compiler_t c;
+    if (!compiler_init(&c)) return;
+    c.lambda = lambda;
+
+    /* Register params as locals */
+    ray_t *params_list = LAMBDA_PARAMS(lambda);
+    int64_t param_count = ray_len(params_list);
+    int64_t *param_ids = (int64_t*)ray_data(params_list);
+    for (int64_t i = 0; i < param_count; i++) {
+        if (add_local(&c, param_ids[i]) < 0) { c.error = true; break; }
+    }
+
+    /* Compile body expressions */
+    ray_t *body = LAMBDA_BODY(lambda);
+    int64_t body_count = ray_len(body);
+    ray_t **body_exprs = (ray_t **)ray_data(body);
+    for (int64_t i = 0; i < body_count; i++) {
+        if (i > 0) emit(&c, OP_POP);
+        compile_expr(&c, body_exprs[i]);
+    }
+    emit(&c, OP_RET);
+
+    if (c.error) {
+        if (c.dbg_obj) ray_release(c.dbg_obj);
+        compiler_destroy(&c);
+        return;
+    }
+
+    /* Build bytecode vector (dbg_obj is not covered by compiler_destroy, so release it on failure) */
+    ray_t *bc = ray_alloc(c.code_len);
+    if (!bc) { if (c.dbg_obj) ray_release(c.dbg_obj); compiler_destroy(&c); return; }
+    bc->type = RAY_U8;
+    bc->len = c.code_len;
+    memcpy(ray_data(bc), c.code, c.code_len);
+
+    /* Build constants list */
+    ray_t *consts = ray_alloc(c.n_consts * sizeof(ray_t *));
+    if (!consts) { if (c.dbg_obj) ray_release(c.dbg_obj); ray_release(bc); compiler_destroy(&c); return; }
+    consts->type = RAY_LIST;
+    consts->len = c.n_consts;
+    ray_t **cpool = (ray_t **)ray_data(consts);
+    for (int32_t i = 0; i < c.n_consts; i++) {
+        ray_retain(c.consts[i]);
+        cpool[i] = c.consts[i];
+    }
+
+    LAMBDA_BC(lambda) = bc;
+    LAMBDA_CONSTS(lambda) = consts;
+    LAMBDA_NLOCALS(lambda) = c.max_locals;
+    lambda->attrs |= RAY_FN_COMPILED;
+
+    /* dbg_obj ownership moves to the lambda, so it is not released here */
+    if (c.dbg_obj) LAMBDA_DBG(lambda) = c.dbg_obj;
+
+    compiler_destroy(&c);
+}
+
+ray_span_t ray_bc_dbg_get(ray_t* dbg, int32_t ip) {
+    /* Return the span recorded at the greatest code offset <= ip (zeroed span if none).  dbg holds
+     * [offset, span.id] pairs; a trailing unpaired element is skipped so data[i + 1] stays in bounds. */
+    ray_span_t span = {0};
+    if (!dbg || dbg->len == 0) return span;
+    int64_t* data = (int64_t*)ray_data(dbg);
+    int64_t best_offset = -1;
+    for (int64_t i = 0; i + 1 < dbg->len; i += 2) {
+        if (data[i] <= ip && data[i] > best_offset) {   /* strict >: the first pair at a given offset wins */
+            best_offset = data[i];
+            span.id = data[i + 1];
+        }
+    }
+    return span;
+}
+
+void ray_compile_reset(void) {  /* forget the cached special-form sym IDs so init_sf_syms re-interns them (calling thread only) */
+    sf_set = sf_let = sf_if = sf_do = sf_fn = sf_self = sf_try = -1;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/env.c b/crates/rayforce-sys/vendor/rayforce/src/lang/env.c
new file mode 100644
index 0000000..8bb2a50
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/lang/env.c
@@ -0,0 +1,658 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "lang/env.h"
+#include "table/sym.h"
+#include "table/dict.h"
+#include "ops/temporal.h"
+#include "ops/linkop.h"
+#include <stdatomic.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* ---- Function constructors ---- */
+
+/* Builtin name stored inline in nullmap[2..15] (max 13 chars + null).
+ * Bytes 0-1 reserved for DAG opcode (any type, not just binary). */
+static void fn_set_name(ray_t* obj, const char* name) {  /* store name (truncated to 13 bytes) at nullmap+2 */
+    memset(obj->nullmap, 0, 16);        /* clears all 16 bytes, including bytes 0-1 */
+    size_t len = strlen(name);
+    if (len > 13) len = 13;             /* longer names are cut; the zero fill leaves a terminating NUL */
+    memcpy(obj->nullmap + 2, name, len);
+}
+
+ray_t* ray_fn_unary(const char* name, uint8_t fn_attrs, ray_unary_fn fn) {  /* build a RAY_UNARY function object */
+    ray_t* obj = ray_alloc(0);
+    if (!obj) return ray_error("oom", NULL);
+    obj->type = RAY_UNARY;
+    obj->attrs = fn_attrs;
+    obj->i64 = (int64_t)(uintptr_t)fn;   /* function pointer is stored in the i64 payload */
+    fn_set_name(obj, name);
+    return obj;
+}
+
+ray_t* ray_fn_binary(const char* name, uint8_t fn_attrs, ray_binary_fn fn) {  /* build a RAY_BINARY function object */
+    ray_t* obj = ray_alloc(0);
+    if (!obj) return ray_error("oom", NULL);
+    obj->type = RAY_BINARY;
+    obj->attrs = fn_attrs;
+    obj->i64 = (int64_t)(uintptr_t)fn;   /* function pointer is stored in the i64 payload */
+    fn_set_name(obj, name);
+    return obj;
+}
+
+ray_t* ray_fn_vary(const char* name, uint8_t fn_attrs, ray_vary_fn fn) {  /* build a RAY_VARY function object */
+    ray_t* obj = ray_alloc(0);
+    if (!obj) return ray_error("oom", NULL);
+    obj->type = RAY_VARY;
+    obj->attrs = fn_attrs;
+    obj->i64 = (int64_t)(uintptr_t)fn;   /* function pointer is stored in the i64 payload */
+    fn_set_name(obj, name);
+    return obj;
+}
+
+/* ---- Global environment ---- */
+
+/* Spinlock protecting g_env mutations (taken by env_bind_global_impl; the lookup paths in this file do not take it) */
+static _Atomic(int) g_env_lock = 0;
+static inline void env_lock(void) {
+    while (atomic_exchange_explicit(&g_env_lock, 1, memory_order_acquire)) {  /* spin until the previous value was 0 */
+#if defined(__x86_64__) || defined(__i386__)
+        __builtin_ia32_pause();   /* x86 spin-wait hint; other targets spin without it */
+#endif
+    }
+}
+static inline void env_unlock(void) {
+    atomic_store_explicit(&g_env_lock, 0, memory_order_release);
+}
+
+#define ENV_CAP 1024
+
+static struct {
+    int64_t keys[ENV_CAP];
+    ray_t*   vals[ENV_CAP];
+    /* Per-slot flag: 1 iff this binding was last written by user code
+     * (ray_env_set / ray_env_set_local-promoted-to-global), 0 if the
+     * latest writer was builtin registration (ray_env_bind / _flat).
+     * Powers ray_env_list_user, which the journal snapshot uses to
+     * pick which globals to dump to <base>.qdb.  A user `(set + 42)`
+     * over a builtin flips the slot to user=1 so the override is
+     * preserved across snapshot/restore. */
+    uint8_t  user[ENV_CAP];
+    int32_t count;
+} g_env;
+
+/* ---- Local scope stack ---- */
+
+#define SCOPE_CAP  64
+#define FRAME_CAP  64
+
+typedef struct {
+    int64_t keys[FRAME_CAP];
+    ray_t*   vals[FRAME_CAP];
+    int32_t count;
+} ray_scope_frame_t;
+
+static _Thread_local ray_scope_frame_t scope_stack[SCOPE_CAP];
+static _Thread_local int32_t scope_depth = 0;
+
+int32_t ray_env_scope_depth(void) { return scope_depth; }
+int32_t ray_env_global_count(void) { return g_env.count; }
+
+ray_err_t ray_env_init(void) {  /* reset the global table and this thread's scope depth; always returns RAY_OK */
+    memset(&g_env, 0, sizeof(g_env));   /* existing bindings are overwritten, not released */
+    scope_depth = 0;
+    return RAY_OK;
+}
+
+void ray_env_destroy(void) {  /* unwind this thread's scopes, release every global binding, clear the table */
+    /* Pop any remaining scopes */
+    while (scope_depth > 0) ray_env_pop_scope();   /* scope_depth is _Thread_local: only the caller's stack is unwound */
+    for (int32_t i = 0; i < g_env.count; i++) {
+        if (g_env.vals[i]) ray_release(g_env.vals[i]);
+    }
+    memset(&g_env, 0, sizeof(g_env));
+}
+
+/* Flat lookup by exact sym_id: innermost scope frame first, then outer
+ * frames, then the global table.  Returns the stored (borrowed) pointer,
+ * or NULL when no binding matches.  No dotted-path walking happens here. */
+static ray_t* env_lookup_flat(int64_t sym_id) {
+    int32_t depth = scope_depth;
+    while (depth-- > 0) {
+        const ray_scope_frame_t* frame = &scope_stack[depth];
+        for (int32_t k = 0; k < frame->count; k++)
+            if (frame->keys[k] == sym_id) return frame->vals[k];
+    }
+    /* Not in any local frame: fall back to the globals. */
+    for (int32_t g = 0; g < g_env.count; g++)
+        if (g_env.keys[g] == sym_id) return g_env.vals[g];
+    return NULL;
+}
+
+ray_t* ray_env_get(int64_t sym_id) {
+    /* Flat lookup first — covers every non-dotted name AND every
+     * reserved builtin like `.sys.gc` which is bound both flat (so it
+     * resolves without a dotted walk) and inside the `.sys`
+     * namespace dict (for REPL introspection). */
+    ray_t* flat = env_lookup_flat(sym_id);
+    if (flat) return flat;
+    if (!ray_sym_is_dotted(sym_id)) return NULL;
+
+    /* Dotted walk: head resolves via scope+global, rest are sym-keyed
+     * container probes — dicts probe the keys SYM vec and read the
+     * matching slot from the vals LIST, tables look up by schema sym
+     * id, anything else is surfaced as "undefined" (NULL).  Missing
+     * intermediate keys also return NULL so the evaluator's name-error
+     * reporting stays consistent with plain names.  Returning env-owned
+     * pointers (never fresh allocations) keeps the caller's retain/release
+     * balance correct. */
+    const int64_t* segs;
+    int n = ray_sym_segs(sym_id, &segs);
+    if (n < 2) return NULL;  /* defensive — dotted bit without segments */
+
+    ray_t* v = env_lookup_flat(segs[0]);
+    for (int i = 1; v && i < n; i++) {   /* stops at the first segment that fails to resolve */
+        v = ray_container_probe_sym(v, segs[i]);
+    }
+    return v;
+}
+
+/* Owned-ref variant.  Always returns rc>=1 on success; caller must
+ * release.  Beyond ray_env_get's container walk, each dotted segment
+ * may also be resolved by dereferencing a linked column
+ * (ray_link_deref) or, when the container probe misses, by applying a
+ * RAY_UNARY function bound to that segment name (e.g. `date.dd`,
+ * `ts.hh`) to the current value.  Both are treated as fresh results,
+ * which is exactly why this function has a different retain contract
+ * from ray_env_get. */
+ray_t* ray_env_resolve(int64_t sym_id) {
+    /* Flat lookup first — short-circuits dotted reserved builtins
+     * (`.sys.gc`, `.os.getenv`, …) that are additionally bound flat
+     * alongside their namespace dict.  Non-dotted names take the
+     * same path. */
+    ray_t* flat = env_lookup_flat(sym_id);
+    if (flat) { ray_retain(flat); return flat; }
+    if (!ray_sym_is_dotted(sym_id)) return NULL;
+
+    const int64_t* segs;
+    int n = ray_sym_segs(sym_id, &segs);
+    if (n < 2) return NULL;
+
+    /* `v` is either a borrowed env/container pointer (fresh=false) or a
+     * fresh link-deref / method-call result (fresh=true).  When switching
+     * between the two we must release the previous fresh value to avoid leaks. */
+    ray_t* v = env_lookup_flat(segs[0]);
+    bool   fresh = false;
+
+    for (int i = 1; v && i < n; i++) {
+        ray_t* next = NULL;
+        bool   next_fresh = false;
+        /* Linked column: deref segs[i] as a target field name (returns
+         * a fresh owning result, columns the same length as v).  Errors
+         * from ray_link_deref (e.g. "nyi: target table has a parted
+         * column") must be surfaced to the caller — silently downgrading
+         * to NULL would convert a real wrong-answer-bug guard into a
+         * confusing "name undefined" message. */
+        if (ray_link_has(v)) {
+            next = ray_link_deref(v, segs[i]);
+            if (next && RAY_IS_ERR(next)) {
+                if (fresh) ray_release(v);
+                return next;
+            }
+            next_fresh = (next != NULL);
+        }
+        if (!next) next = ray_container_probe_sym(v, segs[i]);   /* borrowed result: next_fresh stays false */
+        if (next) {
+            if (fresh) ray_release(v);
+            v = next;
+            fresh = next_fresh;
+            continue;
+        }
+
+        /* Container probe miss — try method dispatch: look up the
+         * segment as a callable in env, and if it's a unary function,
+         * apply it to the current value.  This makes `ts.ss`, `d.dd`,
+         * or any future `x.some_fn` work the same way, with the
+         * segment resolution going through the normal function
+         * registration path instead of a bespoke table.
+         *
+         * Walk both scope and global env looking for a RAY_UNARY
+         * binding — a local non-callable (e.g. a column named `ss`
+         * pushed into scope by the select fallback) must not shadow
+         * the globally-registered accessor function. */
+        ray_t* fn = NULL;
+        for (int32_t d = scope_depth - 1; d >= 0 && !fn; d--) {
+            ray_scope_frame_t* f = &scope_stack[d];
+            for (int32_t k = 0; k < f->count; k++) {
+                if (f->keys[k] == segs[i] && f->vals[k]
+                    && f->vals[k]->type == RAY_UNARY) {
+                    fn = f->vals[k];
+                    break;
+                }
+            }
+        }
+        if (!fn) {
+            for (int32_t k = 0; k < g_env.count; k++) {
+                if (g_env.keys[k] == segs[i] && g_env.vals[k]
+                    && g_env.vals[k]->type == RAY_UNARY) {
+                    fn = g_env.vals[k];
+                    break;
+                }
+            }
+        }
+        if (fn) {
+            ray_unary_fn f = (ray_unary_fn)(uintptr_t)fn->i64;   /* function pointer lives in the i64 payload */
+            ray_t* r = f(v);
+            if (fresh) ray_release(v);
+            if (!r || RAY_IS_ERR(r)) return NULL;   /* NOTE(review): error result is dropped (not returned or released), unlike the link path — confirm intended */
+            v = r;
+            fresh = true;
+            continue;
+        }
+
+        /* Nothing matched — propagate "undefined". */
+        if (fresh) ray_release(v);
+        return NULL;
+    }
+
+    if (!v) return NULL;
+    if (!fresh) ray_retain(v);   /* hand back an owned ref */
+    return v;
+}
+
+/* Flat-binding helpers: mutate a specific scope (global or top frame) by
+ * sym_id.  Used by both the simple and dotted set paths.  Passing val=NULL
+ * means "delete" — if a slot exists, release its value and compact the
+ * slot out of the array (no-op if the slot doesn't exist).  This matches
+ * ray_del_fn's contract via ray_env_set(sym, NULL) and also covers the
+ * cascade-up case in env_set_dotted where every dict in a dotted path was
+ * emptied by the delete. */
+static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) {  /* set, replace or (val == NULL) delete a global */
+    env_lock();                                   /* every return path below unlocks first */
+    for (int32_t i = 0; i < g_env.count; i++) {
+        if (g_env.keys[i] == sym_id) {
+            if (val == NULL) {                    /* delete: release, then shift the tail down one slot */
+                if (g_env.vals[i]) ray_release(g_env.vals[i]);
+                for (int32_t j = i; j + 1 < g_env.count; j++) {
+                    g_env.keys[j] = g_env.keys[j + 1];
+                    g_env.vals[j] = g_env.vals[j + 1];
+                    g_env.user[j] = g_env.user[j + 1];
+                }
+                g_env.count--;
+                env_unlock();
+                return RAY_OK;
+            }
+            if (g_env.vals[i]) ray_release(g_env.vals[i]);   /* NOTE(review): old value is released before val is retained — confirm val can never be the same object with rc 1 */
+            ray_retain(val);
+            g_env.vals[i] = val;
+            /* User write upgrades a builtin slot to user-defined, so a
+             * (set + 42) override survives snapshot/restore.  A builtin
+             * re-bind (e.g. theoretical hot reload) leaves the existing
+             * flag alone — once user, always user, until the slot is
+             * deleted. */
+            if (is_user) g_env.user[i] = 1;
+            env_unlock();
+            return RAY_OK;
+        }
+    }
+    if (val == NULL) {   /* deleting an absent binding: no-op */
+        env_unlock();
+        return RAY_OK;
+    }
+    if (g_env.count >= ENV_CAP) {                 /* table is a fixed ENV_CAP-entry array */
+        env_unlock();
+        return RAY_ERR_OOM;
+    }
+    g_env.keys[g_env.count] = sym_id;             /* append a new binding */
+    ray_retain(val);
+    g_env.vals[g_env.count] = val;
+    g_env.user[g_env.count] = is_user ? 1 : 0;
+    g_env.count++;
+    env_unlock();
+    return RAY_OK;
+}
+
+/* Function-pointer-shaped wrapper used by env_set_dotted's bind_fn
+ * indirection — preserves the existing signature. */
+static ray_err_t env_bind_global(int64_t sym_id, ray_t* val) {
+    return env_bind_global_impl(sym_id, val, 0);
+}
+
+/* User-flagged sibling: identical except the slot is marked user=1.
+ * Used by ray_env_set and the dotted-set path it drives. */
+static ray_err_t env_bind_global_user(int64_t sym_id, ray_t* val) {
+    return env_bind_global_impl(sym_id, val, 1);
+}
+
+static ray_err_t env_bind_local(int64_t sym_id, ray_t* val) {  /* set, replace or (val == NULL) delete in the top scope frame */
+    ray_scope_frame_t* f = &scope_stack[scope_depth - 1];   /* NOTE(review): no guard for scope_depth == 0 here — confirm callers check */
+    for (int32_t i = 0; i < f->count; i++) {
+        if (f->keys[i] == sym_id) {
+            if (val == NULL) {                 /* delete: release, then shift the tail down one slot */
+                if (f->vals[i]) ray_release(f->vals[i]);
+                for (int32_t j = i; j + 1 < f->count; j++) {
+                    f->keys[j] = f->keys[j + 1];
+                    f->vals[j] = f->vals[j + 1];
+                }
+                f->count--;
+                return RAY_OK;
+            }
+            if (f->vals[i]) ray_release(f->vals[i]);   /* replace: drop the old ref, take a new one */
+            ray_retain(val);
+            f->vals[i] = val;
+            return RAY_OK;
+        }
+    }
+    if (val == NULL) return RAY_OK;            /* deleting an absent binding: no-op */
+    if (f->count >= FRAME_CAP) return RAY_ERR_OOM;   /* frame is a fixed FRAME_CAP-entry array */
+    f->keys[f->count] = sym_id;
+    ray_retain(val);
+    f->vals[f->count] = val;
+    f->count++;
+    return RAY_OK;
+}
+
+/* Dotted-path write.  base_lookup(head_sym) returns the current binding in
+ * the scope we are writing to (global or local frame), or NULL.  bind_fn
+ * rebinds the new top-level dict in that same scope.  Walks the existing
+ * chain for intermediate dicts, then COW-rebuilds bottom-up, auto-creating
+ * missing intermediates as empty dicts.  val == NULL deletes the leaf key. */
+static ray_err_t env_set_dotted(int64_t sym_id, ray_t* val,
+                                ray_t* (*base_lookup)(int64_t),
+                                ray_err_t (*bind_fn)(int64_t, ray_t*)) {
+    const int64_t* segs;
+    int n = ray_sym_segs(sym_id, &segs);
+    if (n < 2) return RAY_ERR_TYPE;   /* dotted flag without segments */
+
+    /* Walk existing chain to the deepest parent that still exists.  Record
+     * each level's dict pointer (borrowed) so we can rebuild upward.  Any
+     * non-dict intermediate is an error, as is a path too deep for parents[]. */
+    ray_t* parents[256];
+    if ((size_t)(n - 1) > sizeof parents / sizeof parents[0]) return RAY_ERR_OOM;
+    parents[0] = base_lookup(segs[0]);
+    if (parents[0] && parents[0]->type != RAY_DICT) return RAY_ERR_TYPE;
+
+    /* parents[i] is the dict at path prefix segs[0..i].  If an intermediate
+     * key is missing, parents[i+1..n-2] are NULL and ray_dict_upsert will
+     * create fresh dicts on the way back up. */
+    for (int i = 1; i < n - 1; i++) {
+        if (!parents[i - 1]) { parents[i] = NULL; continue; }
+        ray_t* child = ray_dict_probe_sym_borrowed(parents[i - 1], segs[i]);
+        if (child && child->type != RAY_DICT)
+            return RAY_ERR_TYPE;
+        parents[i] = child;
+    }
+
+    /* Delete path: (del ns.x) lowers to ray_env_set(sym_id, NULL).  The
+     * non-dotted path removes the env slot; the dotted path must actually
+     * remove the key from the leaf dict and rebuild the chain — otherwise
+     * the user would see a zombie entry like {:x NULL} instead of the
+     * key being gone.  No-op cleanly if any part of the path is missing.
+     * If the leaf-removal empties the containing dict, we must not rebind
+     * {} upward — that would leave a stale empty namespace.  Instead
+     * cascade up: at each level, if `cur` is empty, delete that key from
+     * its parent instead of upserting it.  If the cascade reaches the
+     * head with an empty dict, we rebind the head to NULL (env_bind_*
+     * treats NULL as "remove the slot"). */
+    int start_i;
+    ray_t* cur;
+    bool deleting = (val == NULL);
+    if (deleting) {
+        ray_t* leaf_parent = parents[n - 2];
+        if (!leaf_parent) return RAY_OK;
+        if (!ray_dict_probe_sym_borrowed(leaf_parent, segs[n - 1])) return RAY_OK;
+        ray_retain(leaf_parent);
+        ray_t* k = ray_sym(segs[n - 1]);
+        cur = ray_dict_remove(leaf_parent, k);
+        ray_release(k);
+        if (!cur || RAY_IS_ERR(cur)) return RAY_ERR_OOM;
+        start_i = n - 2;   /* rebuild from the parent of the deleted key up */
+    } else {
+        ray_retain(val);
+        cur = val;
+        start_i = n - 1;
+    }
+
+    /* Build new chain bottom-up.  ray_dict_upsert consumes its `dict` arg,
+     * so we retain parents before passing.  Missing-parent levels are
+     * created from a fresh empty dict.  On failure we release cur and bail
+     * — parents are env-owned borrowed refs. */
+    for (int i = start_i; i >= 1; i--) {
+        ray_t* parent = parents[i - 1];
+
+        if (deleting && cur && cur->type == RAY_DICT && ray_dict_len(cur) == 0) {
+            /* Cascade: the rebuilt child became empty, so remove the key
+             * at this level rather than storing {}.  If parent is absent
+             * too, nothing more to do. */
+            ray_release(cur);
+            if (!parent) { cur = NULL; break; }
+            ray_retain(parent);
+            ray_t* k = ray_sym(segs[i]);
+            cur = ray_dict_remove(parent, k);
+            ray_release(k);
+            if (!cur || RAY_IS_ERR(cur)) return RAY_ERR_OOM;
+            continue;
+        }
+
+        ray_t* dict_in;
+        if (parent) {
+            ray_retain(parent);
+            dict_in = parent;
+        } else {
+            ray_t* keys = ray_sym_vec_new(RAY_SYM_W64, 1);
+            ray_t* vals = ray_list_new(1);
+            dict_in = ray_dict_new(keys, vals);
+            if (!dict_in || RAY_IS_ERR(dict_in)) { ray_release(cur); return RAY_ERR_OOM; }
+        }
+        ray_t* k = ray_sym(segs[i]);
+        ray_t* next = ray_dict_upsert(dict_in, k, cur);
+        ray_release(k);
+        ray_release(cur);
+        if (!next || RAY_IS_ERR(next)) return RAY_ERR_OOM;
+        cur = next;
+    }
+
+    /* If cascade reduced the head-level dict to empty (or propagated up
+     * past a missing parent), rebind the head as NULL so the stale empty
+     * namespace disappears from introspection and from future lookups. */
+    ray_t* to_bind = cur;
+    if (deleting && cur && cur->type == RAY_DICT && ray_dict_len(cur) == 0) {
+        to_bind = NULL;
+    }
+    ray_err_t err = bind_fn(segs[0], to_bind);
+    if (cur) ray_release(cur);
+    return err;
+}
+
+/* Scope-specific base lookups used by env_set_dotted.  Global flavour:
+ * the value bound to sym_id in g_env (borrowed, not retained), or NULL. */
+static ray_t* lookup_global(int64_t sym_id) {
+    int32_t slot = 0;
+    while (slot < g_env.count && g_env.keys[slot] != sym_id) slot++;
+    return slot < g_env.count ? g_env.vals[slot] : NULL;
+}
+
+/* Value bound to sym_id in the innermost frame (borrowed), else NULL. */
+static ray_t* lookup_top_frame(int64_t sym_id) {
+    if (scope_depth <= 0) return NULL;
+    ray_scope_frame_t* top = &scope_stack[scope_depth - 1];
+    int32_t slot = 0;
+    while (slot < top->count && top->keys[slot] != sym_id) slot++;
+    return slot < top->count ? top->vals[slot] : NULL;
+}
+
+/* Reserved system namespace test: true when the symbol's interned name
+ * begins with a dot (e.g. `.sys.gc`, `.os.getenv`).  Builtin registration
+ * fills these names through ray_env_bind, while the user-level binders
+ * reject them so a system binding cannot be shadowed in any scope.
+ * Symbols with no name string, or an empty one, are not reserved. */
+bool ray_sym_is_reserved(int64_t sym_id) {
+    ray_t* name = ray_sym_str(sym_id);
+    if (name == NULL) return false;
+    const char* text = ray_str_ptr(name);
+    if (ray_str_len(name) == 0 || text == NULL) return false;
+    return text[0] == '.';
+}
+
+/* Builtin-registration binder: no reserved-name check, never sets the user flag. */
+ray_err_t ray_env_bind(int64_t sym_id, ray_t* val) {
+    if (!ray_sym_is_dotted(sym_id)) return env_bind_global(sym_id, val);
+    /* Dotted names are written through the nested-dict path. */
+    return env_set_dotted(sym_id, val, lookup_global, env_bind_global);
+}
+
+ray_err_t ray_env_bind_flat(int64_t sym_id, ray_t* val) {
+    return env_bind_global(sym_id, val);   /* no dotted-path handling, even if sym_id is dotted */
+}
+
+ray_err_t ray_env_set(int64_t sym_id, ray_t* val) {
+    /* Names starting with `.` belong to builtin registration only. */
+    if (ray_sym_is_reserved(sym_id)) return RAY_ERR_RESERVED;
+    /* Same routing as ray_env_bind, except every write goes through the
+     * user-flagged binder.  That flag is what ray_env_list_user filters
+     * on, so a slot bound via the plain binder would be left out of the
+     * journal snapshot (<base>.qdb). */
+    if (!ray_sym_is_dotted(sym_id)) return env_bind_global_user(sym_id, val);
+    return env_set_dotted(sym_id, val, lookup_global, env_bind_global_user);
+}
+
+/* Open a fresh, empty innermost frame; RAY_ERR_OOM once SCOPE_CAP frames exist. */
+ray_err_t ray_env_push_scope(void) {
+    if (scope_depth >= SCOPE_CAP) return RAY_ERR_OOM;
+    scope_stack[scope_depth++].count = 0;
+    return RAY_OK;
+}
+
+/* Close the innermost frame and release every value it still binds.
+ * Calling it with no open scope is a no-op. */
+void ray_env_pop_scope(void) {
+    if (scope_depth <= 0) return;
+    ray_scope_frame_t* top = &scope_stack[--scope_depth];
+    for (int32_t slot = 0; slot < top->count; slot++)
+        if (top->vals[slot]) ray_release(top->vals[slot]);
+    top->count = 0;
+}
+
+/* ---- Iteration ---- */
+
+int32_t ray_env_list(int64_t* sym_ids, ray_t** vals, int32_t max_entries) {
+    int32_t n = 0;   /* entries written; stays 0 for a negative capacity */
+    for (; n < g_env.count && n < max_entries; n++) {
+        sym_ids[n] = g_env.keys[n];
+        vals[n]    = g_env.vals[n];   /* borrowed: not retained */
+    }
+    return n;
+}
+
+int32_t ray_env_list_user(int64_t* sym_ids, ray_t** vals, int32_t max_entries) {
+    int32_t written = 0;   /* only slots with the user flag set are copied */
+    for (int32_t slot = 0; written < max_entries && slot < g_env.count; slot++) {
+        if (g_env.user[slot]) {
+            sym_ids[written] = g_env.keys[slot];
+            vals[written++]  = g_env.vals[slot];
+        }
+    }
+    return written;
+}
+
+/* ---- Prefix lookup ---- */
+
+static const char* s_keywords[] = {   /* NULL-terminated keyword table */
+    "def", "do", "false", "fn", "if", "let", "set", "true", NULL
+};
+
+/* qsort comparator: a and b point at `const char*` slots; orders by strcmp */
+static int cmp_str_ptr(const void* a, const void* b) {
+    return strcmp(*(const char**)a, *(const char**)b);
+}
+
+bool ray_env_has_name(const char* name, int64_t len) {
+    if (name == NULL || len < 1) return false;
+    const size_t want = (size_t)len;
+    /* Pass 1: exact match against every global binding's interned name. */
+    for (int32_t slot = 0; slot < g_env.count; slot++) {
+        ray_t* sym = ray_sym_str(g_env.keys[slot]);
+        const char* bound = sym ? ray_str_ptr(sym) : NULL;
+        if (bound && strlen(bound) == want && memcmp(bound, name, want) == 0)
+            return true;
+    }
+    /* Pass 2: the static keyword table. */
+    for (const char** kw = s_keywords; *kw != NULL; kw++) {
+        if (strlen(*kw) == want && memcmp(*kw, name, want) == 0) return true;
+    }
+    return false;
+}
+
+/* Append cand to results[0..*count) when it starts with prefix[0..len) and
+ * is not already listed (compared by string content).  The caller
+ * guarantees room for one more entry. */
+static void prefix_collect(const char* cand, const char* prefix, int64_t len,
+                           const char** results, int64_t* count) {
+    if ((int64_t)strlen(cand) < len) return;
+    if (strncmp(cand, prefix, (size_t)len) != 0) return;
+    for (int64_t j = 0; j < *count; j++) {
+        if (strcmp(results[j], cand) == 0) return;   /* duplicate */
+    }
+    results[(*count)++] = cand;
+}
+
+/* Collect up to max_results names that begin with prefix[0..len): global
+ * env bindings first, then the keyword table.  The capacity check runs
+ * before sorting, so a full buffer holds the first matches found, which
+ * are then returned in strcmp order.  Returns the number stored. */
+int64_t ray_env_lookup_prefix(const char* prefix, int64_t len,
+                              const char** results, int64_t max_results) {
+    int64_t count = 0;
+
+    /* Global env keys; slots whose name cannot be read are skipped. */
+    for (int32_t slot = 0; slot < g_env.count && count < max_results; slot++) {
+        ray_t* sym = ray_sym_str(g_env.keys[slot]);
+        const char* name = sym ? ray_str_ptr(sym) : NULL;
+        if (name) prefix_collect(name, prefix, len, results, &count);
+    }
+
+    /* Static keyword list. */
+    for (const char** kw = s_keywords; *kw && count < max_results; kw++) {
+        prefix_collect(*kw, prefix, len, results, &count);
+    }
+
+    /* Sort alphabetically */
+    if (count > 1) {
+        qsort((void*)results, (size_t)count, sizeof(const char*), cmp_str_ptr);
+    }
+    return count;
+}
+
+ray_err_t ray_env_set_local(int64_t sym_id, ray_t* val) {
+    /* Reserved names (.sys.*, .os.*, .csv.*, .ipc.*) are filled only by
+     * builtin registration (ray_env_bind); refusing them here keeps
+     * `(let .sys.gc 99)` or a `.sys.gc` lambda parameter from shadowing. */
+    if (ray_sym_is_reserved(sym_id)) return RAY_ERR_RESERVED;
+    /* No scope open: fall back to a global user binding. */
+    if (scope_depth <= 0) return ray_env_set(sym_id, val);
+    /* Otherwise write into the innermost frame, dotted or flat. */
+    return ray_sym_is_dotted(sym_id)
+        ? env_set_dotted(sym_id, val, lookup_top_frame, env_bind_local)
+        : env_bind_local(sym_id, val);
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/env.h b/crates/rayforce-sys/vendor/rayforce/src/lang/env.h
new file mode 100644
index 0000000..e92b528
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/lang/env.h
@@ -0,0 +1,118 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_ENV_H
+#define RAY_ENV_H
+
+#include <rayforce.h>
+#include "lang/eval.h"
+
+/* Create function objects. Name stored inline in nullmap[2..15] (bytes
+ * 0-1 hold the DAG opcode). The function pointer is in the i64 field. */
+ray_t* ray_fn_unary(const char* name, uint8_t fn_attrs, ray_unary_fn fn);
+ray_t* ray_fn_binary(const char* name, uint8_t fn_attrs, ray_binary_fn fn);
+ray_t* ray_fn_vary(const char* name, uint8_t fn_attrs, ray_vary_fn fn);
+
+/* Builtin name: NUL-terminated, max 13 chars, stored in nullmap[2..15]
+ * (bytes 0-1 carry the DAG opcode).  Returns a pointer into fn's nullmap. */
+static inline const char* ray_fn_name(const ray_t* fn) {
+    return (const char*)fn->nullmap + 2;
+}
+
+/* Global environment: symbol -> function object dict */
+ray_err_t ray_env_init(void);
+void     ray_env_destroy(void);
+ray_t*    ray_env_get(int64_t sym_id);
+
+/* User-facing binder.  Refuses any name starting with `.` — that root is
+ * reserved for system namespaces (.sys, .os, .io, .ipc, …) populated by
+ * builtin registration.  Returns RAY_ERR_RESERVED in that case. */
+ray_err_t ray_env_set(int64_t sym_id, ray_t* val);
+
+/* Internal binder used by builtin registration.  Like ray_env_set but
+ * WITHOUT the reserved-namespace guard and without marking the slot as
+ * user-defined.  Do NOT call this from user-exposed paths; it is the
+ * intended way to populate `.sys` / `.os` etc. during ray_lang_init. */
+ray_err_t ray_env_bind(int64_t sym_id, ray_t* val);
+
+/* Flat variant of ray_env_bind: writes the binding directly into the
+ * global env hash without traversing dotted-segment dict upserts.
+ * Used to register every fully-qualified builtin name (`.sys.gc`,
+ * `.os.getenv`, …) alongside the root namespace dict, so prefix
+ * lookup (REPL completion + highlighter) enumerates them all. */
+ray_err_t ray_env_bind_flat(int64_t sym_id, ray_t* val);
+
+/* True if a symbol's interned name starts with `.` — i.e. it belongs to
+ * the reserved namespace populated at startup by builtin registration.
+ * User-level binders (ray_env_set, ray_env_set_local, lambda parameter
+ * installer) refuse such names so system bindings can't be shadowed. */
+bool ray_sym_is_reserved(int64_t sym_id);
+
+/* Resolve a name for a Rayfall expression (tree-walking eval or bytecode
+ * op_resolve): returns an OWNED ref (rc >= 1) that the caller must
+ * release, or NULL if undefined.  Unlike ray_env_get which returns a
+ * borrowed ref and leaves refcount management to the caller, env_resolve
+ * retains before returning — so name-resolution sites can drop their
+ * manual ray_retain and still participate in the dotted-sym temporal
+ * extraction path (e.g. `trades.Time.dd`), which allocates fresh values
+ * mid-walk. */
+ray_t*    ray_env_resolve(int64_t sym_id);
+
+/* Prefix lookup: scan global env + keywords for names starting with prefix.
+ * Fills results[] with pointers to interned name strings (valid until next
+ * sym table mutation).  Returns count of matches (up to max_results).
+ * Results are sorted alphabetically. */
+int64_t ray_env_lookup_prefix(const char* prefix, int64_t len,
+                              const char** results, int64_t max_results);
+
+/* True iff `name[0..len)` is an exact-match global env binding or
+ * keyword.  Does NOT intern the probed string (unlike ray_env_get which
+ * would need a sym_id).  Used by the REPL highlighter to decide whether
+ * to paint the current word green — the prefix-lookup API returns only
+ * the first-matching entry, which would misclassify `de` as non-builtin
+ * when an alphabetically-earlier `desc`/`del` hits the same prefix. */
+bool ray_env_has_name(const char* name, int64_t len);
+
+/* Iterate global environment entries.
+ * Fills sym_ids[] and vals[] with up to max_entries items.
+ * Returns count of entries written. */
+int32_t ray_env_list(int64_t* sym_ids, ray_t** vals, int32_t max_entries);
+
+/* Iterate ONLY user-defined bindings (slots last written via ray_env_set,
+ * not ray_env_bind).  Powers the journal snapshot — the .qdb file would
+ * otherwise carry every builtin, which is wasteful and breaks on reload
+ * because builtin function objects hold absolute pointers from the prior
+ * process.  A user `(set + 42)` over a builtin flips the slot to user-
+ * defined, so explicit overrides are preserved. */
+int32_t ray_env_list_user(int64_t* sym_ids, ray_t** vals, int32_t max_entries);
+
+/* Total number of bindings currently in the global env (builtins +
+ * user).  Useful for sizing buffers before ray_env_list. */
+int32_t ray_env_global_count(void);
+
+/* Local scope stack for lexical binding (let, do, lambda) */
+ray_err_t ray_env_push_scope(void);
+void ray_env_pop_scope(void);
+ray_err_t ray_env_set_local(int64_t sym_id, ray_t* val);
+
+#endif /* RAY_ENV_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/eval.c b/crates/rayforce-sys/vendor/rayforce/src/lang/eval.c
new file mode 100644
index 0000000..7d3442e
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/lang/eval.c
@@ -0,0 +1,2626 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "lang/eval.h"
+#include "lang/internal.h"
+#include "lang/env.h"
+#include "lang/nfo.h"
+#include "lang/parse.h"
+#include "core/types.h"
+#include "ops/ops.h"
+#include "ops/temporal.h"
+#include "ops/datalog.h"
+#include "ops/idxop.h"
+#include "ops/linkop.h"
+#include "table/sym.h"
+#include "core/profile.h"
+#include "table/sym.h"
+#include "mem/heap.h"
+#include "mem/sys.h"
+/* store/serde.h, store/splay.h, store/part.h moved to system.c */
+/* ray_lang_print, ray_cast_fn, etc. moved to ops/builtins.c */
+/* ray_error() is declared in <rayforce.h> (included via eval.h) */
+
+#include <string.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <signal.h>
+#include <time.h>
+
+/* Maximum recursion depth for ray_eval() to prevent stack overflow */
+#define RAY_EVAL_MAX_DEPTH 512
+_Thread_local static int eval_depth = 0;   /* per-thread counter, starts at 0 */
+
+/* Thread-local nfo for eval context — tracks source locations during evaluation */
+static _Thread_local ray_t* g_eval_nfo = NULL;
+
+/* Thread-local error trace — list of [span_i64, filename, fn_name, source] frames; released by ray_clear_error_trace() */
+static _Thread_local ray_t* g_error_trace = NULL;
+
+/* Interrupt flag — one per process (not _Thread_local); set by the REPL signal handler, polled by eval/VM loops */
+static volatile sig_atomic_t g_eval_interrupted = 0;
+
+void ray_request_interrupt(void)      { g_eval_interrupted = 1; }
+void ray_clear_interrupt(void)        { g_eval_interrupted = 0; }
+bool ray_interrupted(void)            { return g_eval_interrupted != 0; }
+
+/* Legacy ray_eval_* names — forward to the ray_* interrupt functions; kept for existing callers. */
+void ray_eval_request_interrupt(void) { ray_request_interrupt(); }
+void ray_eval_clear_interrupt(void)   { ray_clear_interrupt(); }
+int  ray_eval_is_interrupted(void)    { return ray_interrupted(); }   /* int, not bool */
+
+ray_t* ray_eval_get_nfo(void) { return g_eval_nfo; }   /* borrowed; may be NULL */
+void   ray_eval_set_nfo(ray_t* nfo) { g_eval_nfo = nfo; }   /* stored as-is: no retain/release here */
+
+ray_t* ray_get_error_trace(void) { return g_error_trace; }   /* borrowed; may be NULL */
+void   ray_clear_error_trace(void) {
+    if (g_error_trace) { ray_release(g_error_trace); g_error_trace = NULL; }   /* drop this thread's trace */
+}
+
+/* ══════════════════════════════════════════
+ * Restricted-mode check
+ * ══════════════════════════════════════════ */
+
+static _Thread_local bool g_eval_restricted = false;   /* per-thread switch, off by default */
+
+void ray_eval_set_restricted(bool on) { g_eval_restricted = on; }
+bool ray_eval_get_restricted(void)    { return g_eval_restricted; }
+
+static inline bool fn_is_restricted(ray_t* fn_obj) {
+    return g_eval_restricted ? (fn_obj->attrs & RAY_FN_RESTRICTED) != 0 : false;
+}
+
+/* ══════════════════════════════════════════
+ * Error handling: try / raise
+ * ══════════════════════════════════════════ */
+
+static _Thread_local ray_t *__raise_val = NULL;
+
+/* (raise value) — raise an error with the given value */
+ray_t* ray_raise_fn(ray_t* val) {
+    if (__raise_val) ray_release(__raise_val);
+    ray_retain(val);
+    __raise_val = val;
+    return ray_error("domain", NULL);
+}
+
+/* (try expr handler) — special form, both args arrive unevaluated.
+ * Evaluates expr and returns its value on success.  On error, evaluates
+ * handler and applies it to the (raise ...) payload, or to integer 0. */
+ray_t* ray_try_fn(ray_t* expr, ray_t* handler_expr) {
+    ray_t* outcome = ray_eval(expr);
+    if (!RAY_IS_ERR(outcome)) return outcome;
+
+    /* Take over the pending raise payload and clear the slot. */
+    ray_t* payload = __raise_val;
+    __raise_val = NULL;
+    if (payload == NULL) payload = make_i64(0);
+
+    ray_t* handler = ray_eval(handler_expr);
+    if (RAY_IS_ERR(handler)) {
+        ray_release(payload);
+        return handler;
+    }
+
+    /* Only lambdas and unary builtins are accepted as handlers. */
+    ray_t* answer;
+    switch (handler->type) {
+        case RAY_LAMBDA: {
+            ray_t* argv[1] = { payload };
+            answer = call_lambda(handler, argv, 1);
+            break;
+        }
+        case RAY_UNARY:
+            answer = ((ray_unary_fn)(uintptr_t)handler->i64)(payload);
+            break;
+        default:
+            answer = ray_error("type", NULL);
+    }
+    ray_release(payload);
+    ray_release(handler);
+    return answer;
+}
+
+/* ══════════════════════════════════════════
+ * FN_ATOMIC auto-mapping helpers
+ * ══════════════════════════════════════════ */
+
+/* Convert a typed vector to a boxed list.  If already a list, retains
+ * and returns it directly.  Caller owns the returned object. */
+ray_t* to_boxed_list(ray_t* x) {
+    if (!x || RAY_IS_ERR(x)) return x;
+    if (x->type == RAY_LIST) { ray_retain(x); return x; }
+    if (!ray_is_vec(x)) return ray_error("type", NULL);
+
+    int64_t len = ray_len(x);
+    ray_t* list = ray_alloc(len * sizeof(ray_t*));
+    if (!list) return ray_error("oom", NULL);
+    list->type = RAY_LIST;
+    list->len = len;
+    ray_t** dst = (ray_t**)ray_data(list);
+    for (int64_t i = 0; i < len; i++) {
+        int alloc = 0;   /* typed vecs always yield a fresh atom: we own it */
+        dst[i] = collection_elem(x, i, &alloc);
+        if (RAY_IS_ERR(dst[i])) {
+            ray_t* err = dst[i];   /* copy out: dst dangles once list is freed */
+            for (int64_t j = 0; j < i; j++) ray_release(dst[j]);
+            list->len = 0;   /* slots 0..i-1 already dropped, the rest never set */
+            ray_release(list);
+            return err;
+        }
+    }
+    return list;
+}
+
+/* Box a typed-vector argument for builtins that only understand lists.
+ * When x is a non-NULL, non-error vector, *_bx and the return value are
+ * both the result of to_boxed_list(x) — which the caller must release,
+ * and which may itself be an error value.  Otherwise *_bx is NULL and x
+ * is returned untouched. */
+ray_t* unbox_vec_arg(ray_t* x, ray_t** _bx) {
+    const bool boxable = x != NULL && !RAY_IS_ERR(x) && ray_is_vec(x);
+    ray_t* boxed = boxable ? to_boxed_list(x) : NULL;
+    *_bx = boxed;
+    return boxable ? boxed : x;
+}
+
+/* Build an owned, zero-valued atom whose type matches the element type
+ * of `coll` (a typed vector or a RAY_LIST).  The atomic-map helpers use
+ * it solely to probe empty collections: calling a unary or binary `fn`
+ * on a representative scalar reveals the output type even though the
+ * input holds no elements.
+ *
+ * Symbol / string / GUID columns get atoms of their own element type.
+ * Substituting i64(0) there would make, e.g., `(== empty_sym_col 'foo)`
+ * probe an integer comparison and report I64 rather than the BOOL that
+ * a non-empty input produces.  NULL, RAY_LIST and any element type not
+ * listed below fall back to ray_i64(0). */
+static ray_t* zero_atom_for_elem_type(ray_t* coll) {
+    static const uint8_t zero_guid[16] = {0};
+    /* No collection, or a boxed list with no single element type. */
+    if (coll == NULL || coll->type == RAY_LIST) return ray_i64(0);
+    switch (coll->type) {
+        case RAY_I32:       return ray_i32(0);
+        case RAY_I16:       return ray_i16(0);
+        case RAY_U8:        return ray_u8(0);
+        case RAY_BOOL:      return make_bool(0);
+        case RAY_F64:       return make_f64(0.0);
+        case RAY_DATE:      return ray_date(0);
+        case RAY_TIME:      return ray_time(0);
+        case RAY_TIMESTAMP: return ray_timestamp(0);
+        case RAY_SYM:       return ray_sym(0);
+        case RAY_STR:       return ray_str("", 0);
+        case RAY_GUID:      return ray_guid(zero_guid);
+        /* I64 and every unrecognised element type share the fallback. */
+        case RAY_I64:
+        default:            return ray_i64(0);
+    }
+}
+
+
+/* Map a binary function element-wise over collections.
+ * Both args can be collections (zip-map) or one scalar (broadcast).
+ * Produces typed vectors when output is numeric/bool, boxed lists otherwise. */
+ray_t* atomic_map_binary_op(ray_binary_fn fn, uint16_t dag_opcode, ray_t* left, ray_t* right) {
+    int left_coll = is_collection(left);
+    int right_coll = is_collection(right);
+
+    if (!left_coll && !right_coll) return fn(left, right);
+
+    int64_t len;
+    if (left_coll && right_coll) {
+        len = ray_len(left) < ray_len(right) ? ray_len(left) : ray_len(right);
+    } else {
+        len = left_coll ? ray_len(left) : ray_len(right);
+    }
+
+    if (len == 0) {
+        /* Empty collection — no first element to probe, so fabricate a
+         * zero-valued atom of each operand's element type and run `fn`
+         * on it to learn the output type.  Without this the result was
+         * hardcoded to I64 and lost the semantics of type-preserving
+         * ops (e.g. `(xbar empty_TIME_col 10000)` returned an I64 empty
+         * vector instead of a TIME one). */
+        ray_t* la = left_coll  ? zero_atom_for_elem_type(left)  : left;
+        ray_t* ra = right_coll ? zero_atom_for_elem_type(right) : right;
+        ray_t* probe = (la && ra && !RAY_IS_ERR(la) && !RAY_IS_ERR(ra))
+                       ? fn(la, ra) : NULL;
+        if (left_coll  && la) ray_release(la);
+        if (right_coll && ra) ray_release(ra);
+        if (probe && !RAY_IS_ERR(probe) && probe->type < 0) {
+            int8_t t = (int8_t)(-probe->type);
+            ray_release(probe);
+            return ray_vec_new(t, 0);
+        }
+        if (probe && !RAY_IS_ERR(probe)) ray_release(probe);
+        return ray_vec_new(RAY_I64, 0);
+    }
+
+    /* Probe first element to determine output type */
+    int la0 = 0, ra0 = 0;
+    ray_t* a0 = left_coll  ? collection_elem(left, 0, &la0)  : left;
+    ray_t* b0 = right_coll ? collection_elem(right, 0, &ra0) : right;
+    ray_t* e0;
+    if (RAY_IS_ERR(a0) || RAY_IS_ERR(b0)) {
+        e0 = ray_error("type", NULL);
+    } else if (is_collection(a0) || is_collection(b0)) {
+        e0 = atomic_map_binary(fn, a0, b0);
+    } else {
+        e0 = fn(a0, b0);
+    }
+    if (la0) ray_release(a0);
+    if (ra0) ray_release(b0);
+    if (RAY_IS_ERR(e0)) return e0;
+
+    int8_t out_type = -(e0->type);  /* atom type (-RAY_I64) → vector type (RAY_I64) */
+
+    /* If either input is a boxed list (mixed types), always use boxed list output
+     * to preserve type heterogeneity */
+    int force_boxed = (left_coll && left->type == RAY_LIST) ||
+                      (right_coll && right->type == RAY_LIST);
+
+    /* When the probed result is a null atom, the fn already chose the correct
+     * result type (e.g., division returns left-operand-typed null).  Skip the
+     * wider-wins promotion so the typed null lands in the right vector type. */
+    int e0_null = RAY_ATOM_IS_NULL(e0);
+
+    /* When the probed result is a boolean (from comparison ops like ==, <, etc.),
+     * preserve the bool output type — do not promote to wider integer type. */
+    int e0_bool = (e0->type == -RAY_BOOL);
+
+    /* When LEFT is scalar broadcast to RIGHT vector, the output type follows
+     * the RIGHT vector's element type for integer types,
+     * unless float or temporal promotion is involved. */
+    if (!e0_null && !e0_bool && !left_coll && right_coll && ray_is_vec(right) && out_type != RAY_F64) {
+        int8_t vec_type = right->type;
+        /* Only override for integer family: if probed type is wider int, downcast */
+        int out_is_int = (out_type == RAY_I64 || out_type == RAY_I32 || out_type == RAY_I16 || out_type == RAY_U8);
+        int vec_is_int = (vec_type == RAY_I64 || vec_type == RAY_I32 || vec_type == RAY_I16 || vec_type == RAY_U8);
+        if (out_is_int && vec_is_int)
+            out_type = vec_type;
+        /* For temporal: only override if both are same temporal family */
+        if ((vec_type == RAY_DATE || vec_type == RAY_TIME || vec_type == RAY_TIMESTAMP) &&
+            out_type == vec_type)
+            out_type = vec_type; /* no-op, just keep it */
+    }
+    /* When LEFT is vector and RIGHT is scalar, output follows WIDER integer
+     * type between left vector and right scalar */
+    if (!e0_null && !e0_bool && left_coll && !right_coll && ray_is_vec(left) && out_type != RAY_F64 &&
+        ray_is_atom(right)) {
+        int8_t vt = left->type, st = -(right->type);
+        int vt_int = (vt == RAY_I64 || vt == RAY_I32 || vt == RAY_I16 || vt == RAY_U8);
+        int st_int = (st == RAY_I64 || st == RAY_I32 || st == RAY_I16 || st == RAY_U8);
+        int out_is_int = (out_type == RAY_I64 || out_type == RAY_I32 || out_type == RAY_I16 || out_type == RAY_U8);
+        if (out_is_int && vt_int && st_int)
+            out_type = (vt >= st) ? vt : st; /* wider wins */
+    }
+    /* When both are vectors, output type follows wider integer type */
+    if (!e0_null && !e0_bool && left_coll && right_coll && ray_is_vec(left) && ray_is_vec(right) && out_type != RAY_F64) {
+        int8_t lt = left->type, rt = right->type;
+        int lt_int = (lt == RAY_I64 || lt == RAY_I32 || lt == RAY_I16 || lt == RAY_U8);
+        int rt_int = (rt == RAY_I64 || rt == RAY_I32 || rt == RAY_I16 || rt == RAY_U8);
+        if (lt_int && rt_int) {
+            /* Pick wider: I64 > I32 > I16 > U8 (using type tag ordering) */
+            out_type = (lt >= rt) ? lt : rt;
+        }
+    }
+
+    /* When LEFT is a vector collection, override i32 output to match the
+     * left vector type or i64 (e.g., [DATE]-DATE → i64, [i64]-i32 → i64).
+     * Keeps i32 only when left vector is actually i32. */
+    if (!e0_null && !e0_bool && out_type == RAY_I32 && left_coll && ray_is_vec(left) && left->type != RAY_I32) {
+        out_type = RAY_I64;
+    }
+
+    /* ══════════════════════════════════════════════════════════════
+     * FAST PATH: opcode-driven vectorized execution.
+     * I64 ops use direct array loops (lowest overhead).
+     * F64/comparison ops route through DAG executor.
+     * ══════════════════════════════════════════════════════════════ */
+
+    /* Direct array loops — only for cross-temporal and mixed-width cases
+     * that the DAG can't handle. All same-type ops go through DAG. */
+    if (0 && !force_boxed && (dag_opcode == OP_DIV || dag_opcode == OP_MOD)) {
+        int8_t ltype = left_coll ? left->type : -(left->type);
+        int8_t rtype = right_coll ? right->type : -(right->type);
+        int esz_l = (ltype == RAY_I64 || ltype == RAY_TIMESTAMP) ? 8 :
+                    (ltype == RAY_I32 || ltype == RAY_DATE || ltype == RAY_TIME) ? 4 :
+                    (ltype == RAY_I16) ? 2 : (ltype == RAY_U8) ? 1 : 0;
+        int esz_r = (rtype == RAY_I64 || rtype == RAY_TIMESTAMP) ? 8 :
+                    (rtype == RAY_I32 || rtype == RAY_DATE || rtype == RAY_TIME) ? 4 :
+                    (rtype == RAY_I16) ? 2 : (rtype == RAY_U8) ? 1 : 0;
+        int lv = left_coll && ray_is_vec(left) && esz_l > 0;
+        int rv = right_coll && ray_is_vec(right) && esz_r > 0;
+        int ls = !left_coll && esz_l > 0;
+        int rs = !right_coll && esz_r > 0;
+
+        /* Cross-type temporal arithmetic (DATE+TIME→TIMESTAMP) needs eval-level
+         * conversion — only use fast path when types are compatible for raw arithmetic */
+        int8_t ltype2 = lv ? left->type : -(left->type);
+        int8_t rtype2 = rv ? right->type : -(right->type);
+        int same_class = (esz_l == esz_r) || /* same storage width */
+                         (ltype2 == RAY_I64 && rtype2 == RAY_I64) || /* both i64 */
+                         (ltype2 == RAY_TIMESTAMP && rtype2 == RAY_TIMESTAMP) ||
+                         /* scalar int + any integer vec is fine (just adds raw values) */
+                         (ls && (rtype2 == ltype2 || ltype2 == RAY_I64)) ||
+                         (rs && (ltype2 == rtype2 || rtype2 == RAY_I64));
+        /* Reject cross-temporal: DATE+TIME, TIMESTAMP+DATE, etc. */
+        int l_temporal = (ltype2==RAY_DATE||ltype2==RAY_TIME||ltype2==RAY_TIMESTAMP);
+        int r_temporal = (rtype2==RAY_DATE||rtype2==RAY_TIME||rtype2==RAY_TIMESTAMP);
+        if (l_temporal && r_temporal && ltype2 != rtype2) same_class = 0;
+
+        if (same_class && ((ls && rv) || (lv && rs) || (lv && rv))) {
+            /* Read elements as i64 regardless of storage width */
+            #define READ_INT(ptr, esz, i) \
+                ((esz)==8 ? ((int64_t*)(ptr))[(i)] : \
+                 (esz)==4 ? (int64_t)((int32_t*)(ptr))[(i)] : \
+                 (esz)==2 ? (int64_t)((int16_t*)(ptr))[(i)] : \
+                            (int64_t)((uint8_t*)(ptr))[(i)])
+            #define SCALAR_INT(obj) \
+                (((obj)->type==-RAY_I64||(obj)->type==-RAY_TIMESTAMP) ? (obj)->i64 : \
+                 ((obj)->type==-RAY_I32||(obj)->type==-RAY_DATE||(obj)->type==-RAY_TIME) ? (int64_t)(obj)->i32 : \
+                 ((obj)->type==-RAY_I16) ? (int64_t)(obj)->i16 : (int64_t)(obj)->u8)
+
+            /* Reuse input buffer when rc==1 and type matches (avoids allocation).
+             * Retain so the caller's ray_release(left/right) doesn't free our output. */
+            ray_t* vec;
+            if (lv && left->rc == 1 && left->type == out_type) {
+                vec = left;
+                ray_retain(vec);  /* caller will release left; we keep ownership */
+            } else if (rv && right->rc == 1 && right->type == out_type) {
+                vec = right;
+                ray_retain(vec);
+            } else {
+                vec = ray_vec_new(out_type, len);
+            }
+            if (!vec || RAY_IS_ERR(vec)) { ray_release(e0); return vec; }
+            vec->len = len;
+
+            void* ldata = lv ? ray_data(left) : NULL;
+            void* rdata = rv ? ray_data(right) : NULL;
+            int64_t lsv = ls ? SCALAR_INT(left) : 0;
+            int64_t rsv = rs ? SCALAR_INT(right) : 0;
+            int out_esz = ray_elem_size(out_type);
+            int l_atom_null = ls && RAY_ATOM_IS_NULL(left);
+            int r_atom_null = rs && RAY_ATOM_IS_NULL(right);
+
+            #define LA(i) (ldata ? READ_INT(ldata, esz_l, i) : lsv)
+            #define RA(i) (rdata ? READ_INT(rdata, esz_r, i) : rsv)
+
+            /* Hoist null check: skip per-element null testing when no nulls */
+            bool l_has_nulls = l_atom_null || (lv && (left->attrs & RAY_ATTR_HAS_NULLS));
+            bool r_has_nulls = r_atom_null || (rv && (right->attrs & RAY_ATTR_HAS_NULLS));
+            bool any_nulls = l_has_nulls || r_has_nulls;
+            void* out_data = ray_data(vec);  /* hoist out of loop */
+
+            if (!any_nulls) {
+                /* Fast path: no nulls — tight loop, no per-element checks */
+                for (int64_t i = 0; i < len; i++) {
+                    int64_t a = LA(i), b = RA(i);
+                    int64_t r;
+                    switch (dag_opcode) {
+                    case OP_ADD: r = (int64_t)((uint64_t)a + (uint64_t)b); break;
+                    case OP_SUB: r = (int64_t)((uint64_t)a - (uint64_t)b); break;
+                    case OP_MUL: r = (int64_t)((uint64_t)a * (uint64_t)b); break;
+                    case OP_DIV: if (b==0) { if (out_esz==8) ((int64_t*)out_data)[i]=0; else if (out_esz==4) ((int32_t*)out_data)[i]=0; else if (out_esz==2) ((int16_t*)out_data)[i]=0; else ((uint8_t*)out_data)[i]=0; ray_vec_set_null(vec,i,true); continue; }
+                                r=a/b; if ((a^b)<0 && r*b!=a) r--; break;
+                    case OP_MOD: if (b==0) { if (out_esz==8) ((int64_t*)out_data)[i]=0; else if (out_esz==4) ((int32_t*)out_data)[i]=0; else if (out_esz==2) ((int16_t*)out_data)[i]=0; else ((uint8_t*)out_data)[i]=0; ray_vec_set_null(vec,i,true); continue; }
+                                r=a%b; if (r && (r^b)<0) r+=b; break;
+                    default: r = 0; break;
+                    }
+                    if (out_esz == 8)      ((int64_t*)out_data)[i] = r;
+                    else if (out_esz == 4)  ((int32_t*)out_data)[i] = (int32_t)r;
+                    else if (out_esz == 2)  ((int16_t*)out_data)[i] = (int16_t)r;
+                    else                    ((uint8_t*)out_data)[i] = (uint8_t)r;
+                }
+            } else {
+                /* Slow path: check nulls per element */
+                #define ISNULL_L(i) (l_atom_null || (lv && ray_vec_is_null(left, i)))
+                #define ISNULL_R(i) (r_atom_null || (rv && ray_vec_is_null(right, i)))
+                for (int64_t i = 0; i < len; i++) {
+                    int64_t a = LA(i), b = RA(i);
+                    int64_t r;
+                    if (ISNULL_L(i) || ISNULL_R(i)) {
+                        if (out_esz == 8)      ((int64_t*)out_data)[i] = 0;
+                        else if (out_esz == 4)  ((int32_t*)out_data)[i] = 0;
+                        else if (out_esz == 2)  ((int16_t*)out_data)[i] = 0;
+                        else                    ((uint8_t*)out_data)[i] = 0;
+                        ray_vec_set_null(vec, i, true);
+                        continue;
+                    }
+                    switch (dag_opcode) {
+                    case OP_ADD: r = (int64_t)((uint64_t)a + (uint64_t)b); break;
+                    case OP_SUB: r = (int64_t)((uint64_t)a - (uint64_t)b); break;
+                    case OP_MUL: r = (int64_t)((uint64_t)a * (uint64_t)b); break;
+                    case OP_DIV: if (b==0) { ((int64_t*)out_data)[i]=0; ray_vec_set_null(vec,i,true); continue; }
+                                r=a/b; if ((a^b)<0 && r*b!=a) r--; break;
+                    case OP_MOD: if (b==0) { ((int64_t*)out_data)[i]=0; ray_vec_set_null(vec,i,true); continue; }
+                                r=a%b; if (r && (r^b)<0) r+=b; break;
+                    default: r = 0; break;
+                    }
+                    if (out_esz == 8)      ((int64_t*)out_data)[i] = r;
+                    else if (out_esz == 4)  ((int32_t*)out_data)[i] = (int32_t)r;
+                    else if (out_esz == 2)  ((int16_t*)out_data)[i] = (int16_t)r;
+                    else                    ((uint8_t*)out_data)[i] = (uint8_t)r;
+                }
+                #undef ISNULL_L
+                #undef ISNULL_R
+            }
+            #undef LA
+            #undef RA
+            #undef READ_INT
+            #undef SCALAR_INT
+            ray_release(e0);
+            return vec;
+        }
+    }
+
+    /* DAG executor — for F64 and comparisons */
+    if (!force_boxed && dag_opcode > 0) {
+        int is_idiv = (dag_opcode == OP_DIV || dag_opcode == OP_MOD);
+        int is_cmp  = (dag_opcode >= OP_EQ && dag_opcode <= OP_GE);
+
+        /* Classify operands: numeric/temporal vectors or scalars */
+        int8_t lt = left_coll ? left->type : -(left->type);
+        int8_t rt = right_coll ? right->type : -(right->type);
+        #define IS_NUM_TYPE(t) ((t)==RAY_I64||(t)==RAY_F64||(t)==RAY_I32||(t)==RAY_I16|| \
+                                (t)==RAY_U8||(t)==RAY_DATE||(t)==RAY_TIME||(t)==RAY_TIMESTAMP)
+        int l_num_vec = left_coll && ray_is_vec(left) && IS_NUM_TYPE(lt);
+        int r_num_vec = right_coll && ray_is_vec(right) && IS_NUM_TYPE(rt);
+        int l_num_scalar = !left_coll && IS_NUM_TYPE(lt);
+        int r_num_scalar = !right_coll && IS_NUM_TYPE(rt);
+        #undef IS_NUM_TYPE
+
+        int can_dag = (l_num_vec || r_num_vec) &&
+                      (l_num_vec || l_num_scalar) && (r_num_vec || r_num_scalar);
+        /* Null scalar atoms lose their null bit in DAG constants — use slow path */
+        if (l_num_scalar && RAY_ATOM_IS_NULL(left)) can_dag = 0;
+        if (r_num_scalar && RAY_ATOM_IS_NULL(right)) can_dag = 0;
+        /* TODO: migrate expr.c to bitmap nulls and remove this bail-out.
+         * DAG executor (expr.c) still uses sentinel-based null checks. */
+        if (l_num_vec && (left->attrs & RAY_ATTR_HAS_NULLS)) can_dag = 0;
+        if (r_num_vec && (right->attrs & RAY_ATTR_HAS_NULLS)) can_dag = 0;
+
+        /* Div/mod: only I64×I64 (executor has floor-div semantics for I64) */
+        if (is_idiv && !(lt == RAY_I64 && rt == RAY_I64)) can_dag = 0;
+        /* Comparisons: same-type only (cross-type promotion loses type info) */
+        if (is_cmp && lt != rt) can_dag = 0;
+        /* Cross-type temporal: DAG promote() loses type tag (int+TIMESTAMP→I64 not TIMESTAMP) */
+        {   int lt_temp = (lt==RAY_DATE||lt==RAY_TIME||lt==RAY_TIMESTAMP);
+            int rt_temp = (rt==RAY_DATE||rt==RAY_TIME||rt==RAY_TIMESTAMP);
+            if ((lt_temp || rt_temp) && lt != rt) can_dag = 0;
+        }
+
+        if (can_dag) {
+                ray_graph_t* g = ray_graph_new(NULL);
+                if (g) {
+                    /* Build left operand node */
+                    ray_op_t* lop = NULL;
+                    if (l_num_scalar) {
+                        if (left->type == -RAY_F64)
+                            lop = ray_const_f64(g, left->f64);
+                        else {
+                            int64_t sv = as_i64(left);
+                            lop = ray_const_i64(g, sv);
+                            if (lop) lop->out_type = -(left->type);
+                        }
+                    } else {
+                        lop = ray_const_vec(g, left);
+                    }
+                    ray_op_t* rop = NULL;
+                    if (r_num_scalar) {
+                        if (right->type == -RAY_F64)
+                            rop = ray_const_f64(g, right->f64);
+                        else {
+                            int64_t sv = as_i64(right);
+                            rop = ray_const_i64(g, sv);
+                            if (rop) rop->out_type = -(right->type);
+                        }
+                    } else {
+                        rop = ray_const_vec(g, right);
+                    }
+                    if (lop && rop) {
+                        ray_op_t* root = ray_binop(g, dag_opcode, lop, rop);
+                        if (root) {
+                            /* For integer floor-division: ray_binop sets F64 output
+                             * for OP_DIV; override to I64 for floor-div with null prop */
+                            if (is_idiv) root->out_type = RAY_I64;
+                            ray_t* result = ray_execute(g, root);
+                            ray_graph_free(g);
+                            if (result && !RAY_IS_ERR(result)) {
+                                /* Restore temporal type tag if promote() collapsed it */
+                                if (ray_is_vec(result) && result->type != out_type &&
+                                    ray_elem_size(result->type) == ray_elem_size(out_type))
+                                    result->type = out_type;
+                                /* Floor-div post-pass (OP_DIV only) */
+                                if (dag_opcode == OP_DIV && ray_is_vec(result) &&
+                                    result->type == RAY_F64) {
+                                    double* d = (double*)ray_data(result);
+                                    for (int64_t fi = 0; fi < result->len; fi++)
+                                        d[fi] = floor(d[fi]);
+                                }
+                                ray_release(e0);
+                                return result;
+                            }
+                        } else { ray_graph_free(g); }
+                    } else { ray_graph_free(g); }
+                }
+            }
+        }
+    /* SLOW PATH: per-element scalar loop (fallback for mixed types, temporal, etc.) */
+    if (!force_boxed &&
+        (out_type == RAY_I64 || out_type == RAY_F64 || out_type == RAY_I32 ||
+         out_type == RAY_I16 || out_type == RAY_BOOL || out_type == RAY_U8 ||
+         out_type == RAY_DATE || out_type == RAY_TIME || out_type == RAY_TIMESTAMP)) {
+        ray_t* vec = ray_vec_new(out_type, len);
+        if (RAY_IS_ERR(vec)) { ray_release(e0); return vec; }
+        vec->len = len;
+        store_typed_elem(vec, 0, e0);
+        ray_release(e0);
+
+        for (int64_t i = 1; i < len; i++) {
+            int la = 0, ra = 0;
+            ray_t* a = left_coll  ? collection_elem(left, i, &la)  : left;
+            ray_t* b = right_coll ? collection_elem(right, i, &ra) : right;
+            ray_t* elem = (RAY_IS_ERR(a) || RAY_IS_ERR(b))
+                         ? ray_error("type", NULL) : fn(a, b);
+            if (la) ray_release(a);
+            if (ra) ray_release(b);
+            if (RAY_IS_ERR(elem)) { ray_release(vec); return elem; }
+            store_typed_elem(vec, i, elem);
+            ray_release(elem);
+        }
+        return vec;
+    }
+
+    /* Determine scalar int type for list+scalar coercion.
+     * When a boxed list is combined with a scalar, integer results
+     * are coerced to the scalar's integer type (K/q semantics). */
+    int8_t scalar_int_type = 0;
+    if (force_boxed) {
+        ray_t* scalar = (!left_coll) ? left : (!right_coll ? right : NULL);
+        if (scalar && ray_is_atom(scalar)) {
+            int8_t st = scalar->type;
+            if (st == -RAY_I16 || st == -RAY_I32 || st == -RAY_I64 || st == -RAY_U8)
+                scalar_int_type = st;
+        }
+    }
+
+    /* Coerce an integer atom to the scalar's integer type */
+    #define COERCE_TO_SCALAR(elem) do { \
+        if (scalar_int_type && ray_is_atom(elem) && elem->type != scalar_int_type && \
+            elem->type != -RAY_F64 && is_numeric(elem)) { \
+            int64_t _v = as_i64(elem); \
+            ray_t* _coerced = make_typed_int(scalar_int_type, _v); \
+            ray_release(elem); \
+            elem = _coerced; \
+        } \
+    } while(0)
+
+    /* Fallback: boxed list for non-numeric output or mixed-type input */
+    COERCE_TO_SCALAR(e0);
+    ray_t* result = ray_alloc(len * sizeof(ray_t*));
+    if (!result) { ray_release(e0); return ray_error("oom", NULL); }
+    result->type = RAY_LIST;
+    result->len = len;
+    ray_t** out = (ray_t**)ray_data(result);
+    out[0] = e0;  /* first element already computed */
+
+    for (int64_t i = 1; i < len; i++) {
+        int la = 0, ra = 0;
+        ray_t* a = left_coll  ? collection_elem(left, i, &la)  : left;
+        ray_t* b = right_coll ? collection_elem(right, i, &ra) : right;
+        ray_t* elem;
+        if (RAY_IS_ERR(a) || RAY_IS_ERR(b)) {
+            elem = ray_error("type", NULL);
+        } else if (is_collection(a) || is_collection(b)) {
+            /* Recursive auto-map when list element is itself a collection */
+            elem = atomic_map_binary(fn, a, b);
+        } else {
+            elem = fn(a, b);
+        }
+        if (la) ray_release(a);
+        if (ra) ray_release(b);
+        if (RAY_IS_ERR(elem)) {
+            for (int64_t j = 0; j < i; j++) ray_release(out[j]);
+            ray_release(result);
+            return elem;
+        }
+        COERCE_TO_SCALAR(elem);
+        out[i] = elem;
+    }
+    #undef COERCE_TO_SCALAR
+    return result;
+}
+
+/* Apply a unary function to each element of a collection.
+ * Numeric, bool and temporal results are packed into a typed vector;
+ * every other result kind is gathered into a boxed RAY_LIST. */
+ray_t* atomic_map_unary(ray_unary_fn fn, ray_t* arg) {
+    if (!is_collection(arg)) return fn(arg);
+
+    const int64_t count = ray_len(arg);
+
+    if (count == 0) {
+        /* No elements to inspect: call fn on a fabricated zero atom of the
+         * element type just to discover the result type. If that gives no
+         * typed atom, the empty result defaults to I64. */
+        ray_t* zero = zero_atom_for_elem_type(arg);
+        ray_t* sample = NULL;
+        if (zero && !RAY_IS_ERR(zero)) sample = fn(zero);
+        if (zero) ray_release(zero);
+        if (!sample || RAY_IS_ERR(sample)) return ray_vec_new(RAY_I64, 0);
+        int8_t empty_type = (sample->type < 0) ? (int8_t)(-sample->type) : RAY_I64;
+        ray_release(sample);
+        return ray_vec_new(empty_type, 0);
+    }
+
+    /* The result for element 0 decides how the output is represented */
+    int first_owned = 0;
+    ray_t* first_in = collection_elem(arg, 0, &first_owned);
+    ray_t* first = RAY_IS_ERR(first_in) ? first_in : fn(first_in);
+    if (first_owned) ray_release(first_in);
+    if (RAY_IS_ERR(first)) return first;
+
+    const int8_t kind = -(first->type);
+    const int packs_typed =
+        kind == RAY_BOOL || kind == RAY_U8 || kind == RAY_I16 ||
+        kind == RAY_I32 || kind == RAY_I64 || kind == RAY_F64 ||
+        kind == RAY_DATE || kind == RAY_TIME || kind == RAY_TIMESTAMP;
+
+    if (!packs_typed) {
+        /* Boxed RAY_LIST: slot 0 takes over the already-computed result */
+        ray_t* boxed = ray_alloc(count * sizeof(ray_t*));
+        if (!boxed) { ray_release(first); return ray_error("oom", NULL); }
+        boxed->type = RAY_LIST;
+        boxed->len = count;
+        ray_t** slots = (ray_t**)ray_data(boxed);
+        slots[0] = first;
+        for (int64_t k = 1; k < count; k++) {
+            int owned = 0;
+            ray_t* in = collection_elem(arg, k, &owned);
+            ray_t* res = RAY_IS_ERR(in) ? in : fn(in);
+            if (owned) ray_release(in);
+            if (RAY_IS_ERR(res)) {
+                for (int64_t j = 0; j < k; j++) ray_release(slots[j]);
+                ray_release(boxed);
+                return res;
+            }
+            slots[k] = res;
+        }
+        return boxed;
+    }
+
+    /* Typed vector: each result is stored via store_typed_elem, then released */
+    ray_t* packed = ray_vec_new(kind, count);
+    if (RAY_IS_ERR(packed)) { ray_release(first); return packed; }
+    packed->len = count;
+    store_typed_elem(packed, 0, first);
+    ray_release(first);
+
+    for (int64_t k = 1; k < count; k++) {
+        int owned = 0;
+        ray_t* in = collection_elem(arg, k, &owned);
+        ray_t* res = RAY_IS_ERR(in) ? in : fn(in);
+        if (owned) ray_release(in);
+        if (RAY_IS_ERR(res)) { ray_release(packed); return res; }
+        store_typed_elem(packed, k, res);
+        ray_release(res);
+    }
+    return packed;
+}
+
+/* ══════════════════════════════════════════
+ * Higher-order functions: map, pmap, fold, scan, filter, apply
+ * ══════════════════════════════════════════ */
+
+/* Invoke a function object on one argument and hand back its result.
+ * RAY_UNARY builtins and RAY_LAMBDA values are accepted; every other
+ * function type produces a "type" error. fn and arg are not released here. */
+ray_t* call_fn1(ray_t* fn, ray_t* arg) {
+    if (fn_is_restricted(fn)) return ray_error("access", "restricted");
+
+    if (fn->type == RAY_LAMBDA) {
+        ray_t* argv[1] = { arg };
+        return call_lambda(fn, argv, 1);
+    }
+    if (fn->type != RAY_UNARY) return ray_error("type", NULL);
+    /* Builtin: ones flagged atomic are mapped element-wise over collections */
+    ray_unary_fn builtin = (ray_unary_fn)(uintptr_t)fn->i64;
+    int spread = (fn->attrs & RAY_FN_ATOMIC) && is_collection(arg);
+    return spread ? atomic_map_unary(builtin, arg) : builtin(arg);
+}
+
+/* Invoke a function object on two arguments; nothing is released here.
+ * Atomic RAY_BINARY builtins are mapped element-wise when either argument
+ * is a collection. A RAY_UNARY fn is applied to `a` only (b is ignored).
+ * Any other function type produces a "type" error. */
+ray_t* call_fn2(ray_t* fn, ray_t* a, ray_t* b) {
+    if (fn_is_restricted(fn)) return ray_error("access", "restricted");
+    if (fn->type == RAY_LAMBDA) {
+        ray_t* argv[2] = { a, b };
+        return call_lambda(fn, argv, 2);
+    }
+    if (fn->type == RAY_UNARY) {
+        ray_unary_fn one = (ray_unary_fn)(uintptr_t)fn->i64;
+        return one(a);
+    }
+    if (fn->type != RAY_BINARY) return ray_error("type", NULL);
+    ray_binary_fn two = (ray_binary_fn)(uintptr_t)fn->i64;
+    if (!(fn->attrs & RAY_FN_ATOMIC)) return two(a, b);
+    if (!is_collection(a) && !is_collection(b)) return two(a, b);
+    return atomic_map_binary(two, a, b);
+}
+
+
+/* ══════════════════════════════════════════
+ * Sorting builtins
+ * ══════════════════════════════════════════ */
+
+/* Gather: build a new vector of n elements with result[i] = vec[idx[i]].
+ * STR, SYM (same width, dictionary shared), LIST (elements retained) and
+ * fixed-width types are supported; null flags are copied on every path
+ * except LIST. idx entries are used as-is, without bounds checks. */
+ray_t* gather_by_idx(ray_t* vec, int64_t* idx, int64_t n) {
+    int8_t type = vec->type;
+    /* Check nulls once — resolve through slices */
+    bool has_nulls = (vec->attrs & RAY_ATTR_HAS_NULLS) ||
+                     ((vec->attrs & RAY_ATTR_SLICE) && vec->slice_parent &&
+                      (vec->slice_parent->attrs & RAY_ATTR_HAS_NULLS));
+
+    if (type == RAY_STR) {
+        ray_t* result = ray_vec_new(type, n);
+        if (RAY_IS_ERR(result)) return result;
+        result->len = n;
+        for (int64_t i = 0; i < n; i++) {
+            bool is_null = has_nulls && ray_vec_is_null(vec, idx[i]);
+            size_t slen = 0;
+            const char* s = is_null ? NULL : ray_str_vec_get(vec, idx[i], &slen);
+            result = ray_str_vec_set(result, i, s ? s : "", s ? slen : 0);
+            /* The set may hand back a failure; stop before touching it again */
+            if (!result || RAY_IS_ERR(result)) return result ? result : ray_error("oom", NULL);
+            if (is_null) ray_vec_set_null(result, i, true);
+        }
+        return result;
+    }
+
+    /* RAY_SYM: use adaptive width, create with matching width */
+    if (type == RAY_SYM) {
+        uint8_t w = vec->attrs & RAY_SYM_W_MASK;
+        ray_t* result = ray_sym_vec_new(w, n);
+        if (RAY_IS_ERR(result)) return result;
+        result->len = n;
+        uint8_t esz = (uint8_t)RAY_SYM_ELEM(w);
+        char* src = (char*)ray_data(vec);
+        char* dst = (char*)ray_data(result);
+        switch (esz) {
+        case 8: for (int64_t i = 0; i < n; i++) memcpy(dst + i*8, src + idx[i]*8, 8); break;
+        case 4: for (int64_t i = 0; i < n; i++) memcpy(dst + i*4, src + idx[i]*4, 4); break;
+        case 2: for (int64_t i = 0; i < n; i++) memcpy(dst + i*2, src + idx[i]*2, 2); break;
+        case 1: for (int64_t i = 0; i < n; i++) dst[i] = src[idx[i]]; break;
+        default: for (int64_t i = 0; i < n; i++) memcpy(dst + i*esz, src + idx[i]*esz, esz); break;
+        }
+        if (vec->sym_dict) {
+            ray_retain(vec->sym_dict);
+            result->sym_dict = vec->sym_dict;
+        }
+        if (has_nulls) {
+            for (int64_t i = 0; i < n; i++)
+                if (ray_vec_is_null(vec, idx[i]))
+                    ray_vec_set_null(result, i, true);
+        }
+        return result;
+    }
+
+    /* LIST: pointer gather with retain */
+    if (type == RAY_LIST) {
+        ray_t* result = ray_alloc(n * sizeof(ray_t*));
+        if (!result || RAY_IS_ERR(result)) return result ? result : ray_error("oom", NULL);
+        result->type = type;
+        result->len = n;
+        ray_t** src_ptrs = (ray_t**)ray_data(vec);
+        ray_t** dst_ptrs = (ray_t**)ray_data(result);
+        for (int64_t i = 0; i < n; i++) {
+            dst_ptrs[i] = src_ptrs[idx[i]];
+            if (dst_ptrs[i]) ray_retain(dst_ptrs[i]);
+        }
+        return result;
+    }
+
+    ray_t* result = ray_vec_new(type, n);
+    if (RAY_IS_ERR(result)) return result;
+    result->len = n;
+    uint8_t esz = ray_type_sizes[type];
+    char* src = (char*)ray_data(vec);
+    char* dst = (char*)ray_data(result);
+    /* Typed gather — compiler constant esz enables vectorization, alias-safe */
+    switch (esz) {
+    case 16: for (int64_t i = 0; i < n; i++) memcpy(dst + i*16, src + idx[i]*16, 16); break;
+    case 8: for (int64_t i = 0; i < n; i++) memcpy(dst + i*8, src + idx[i]*8, 8); break;
+    case 4: for (int64_t i = 0; i < n; i++) memcpy(dst + i*4, src + idx[i]*4, 4); break;
+    case 2: for (int64_t i = 0; i < n; i++) memcpy(dst + i*2, src + idx[i]*2, 2); break;
+    case 1: for (int64_t i = 0; i < n; i++) dst[i] = src[idx[i]]; break;
+    default: for (int64_t i = 0; i < n; i++) memcpy(dst + i*esz, src + idx[i]*esz, esz); break;
+    }
+
+    /* Propagate null bitmap */
+    if (has_nulls) {
+        for (int64_t i = 0; i < n; i++)
+            if (ray_vec_is_null(vec, idx[i]))
+                ray_vec_set_null(result, i, true);
+    }
+    return result;
+}
+
+/* ══════════════════════════════════════════
+ * Table construction and access
+ * ══════════════════════════════════════════ */
+
+/* (list v1 v2 ...) — gather the n args into a new RAY_LIST, retaining each */
+ray_t* ray_list_fn(ray_t** args, int64_t n) {
+    ray_t* list = ray_alloc(n * sizeof(ray_t*));
+    if (!list) return ray_error("oom", NULL);
+    list->type = RAY_LIST;
+    list->len = n;
+    ray_t** slots = (ray_t**)ray_data(list);
+    for (int64_t k = 0; k < n; k++) {
+        slots[k] = args[k];
+        ray_retain(args[k]);
+    }
+    return list;
+}
+
+/* (table [col_names] (list col1 col2 ...)) — build a RAY_TABLE */
+ray_t* ray_table_fn(ray_t* names, ray_t* cols) {
+    ray_t *_bxn = NULL, *_bxc = NULL;
+    names = unbox_vec_arg(names, &_bxn);
+    if (RAY_IS_ERR(names)) return names;
+    cols = unbox_vec_arg(cols, &_bxc);
+    if (RAY_IS_ERR(cols)) { if (_bxn) ray_release(_bxn); return cols; }
+    if (!is_list(names) || !is_list(cols)) { if (_bxn) ray_release(_bxn); if (_bxc) ray_release(_bxc); return ray_error("type", NULL); }
+    int64_t ncols = ray_len(names);
+    if (ray_len(cols) != ncols) { if (_bxn) ray_release(_bxn); if (_bxc) ray_release(_bxc); return ray_error("domain", NULL); }
+
+    ray_t** name_elems = (ray_t**)ray_data(names);
+    ray_t** col_elems = (ray_t**)ray_data(cols);
+    int64_t expected_rows = -1;
+
+    ray_t* tbl = ray_table_new(ncols);
+    if (RAY_IS_ERR(tbl)) { if (_bxn) ray_release(_bxn); if (_bxc) ray_release(_bxc); return tbl; }
+
+    for (int64_t i = 0; i < ncols; i++) {
+        if (name_elems[i]->type != -RAY_SYM)
+            { ray_release(tbl); if (_bxn) ray_release(_bxn); if (_bxc) ray_release(_bxc); return ray_error("type", NULL); }
+        int64_t name_id = name_elems[i]->i64;
+
+        /* Convert Rayfall list (or typed vec) to typed column vector */
+        ray_t* col_src = col_elems[i];
+
+        /* Single atom → wrap in a 1-element vector */
+        ray_t* atom_wrap = NULL;
+        if (ray_is_atom(col_src) && col_src->type != -RAY_SYM) {
+            int8_t atype = -col_src->type;
+            if (atype == RAY_GUID) {
+                atom_wrap = ray_vec_new(RAY_GUID, 1);
+                if (!RAY_IS_ERR(atom_wrap) && col_src->obj)
+                    memcpy(ray_data(atom_wrap), ray_data(col_src->obj), 16);
+                if (!RAY_IS_ERR(atom_wrap)) atom_wrap->len = 1;
+            } else if (atype == RAY_TIMESTAMP || atype == RAY_I64 || atype == RAY_SYM) {
+                atom_wrap = ray_vec_new(atype, 1);
+                if (!RAY_IS_ERR(atom_wrap)) { ((int64_t*)ray_data(atom_wrap))[0] = col_src->i64; atom_wrap->len = 1; }
+            } else if (atype == RAY_F64) {
+                atom_wrap = ray_vec_new(RAY_F64, 1);
+                if (!RAY_IS_ERR(atom_wrap)) { ((double*)ray_data(atom_wrap))[0] = col_src->f64; atom_wrap->len = 1; }
+            } else if (atype == RAY_DATE || atype == RAY_TIME || atype == RAY_I32) {
+                atom_wrap = ray_vec_new(atype, 1);
+                if (!RAY_IS_ERR(atom_wrap)) { ((int32_t*)ray_data(atom_wrap))[0] = col_src->i32; atom_wrap->len = 1; }
+            } else if (atype == RAY_BOOL) {
+                atom_wrap = ray_vec_new(RAY_BOOL, 1);
+                if (!RAY_IS_ERR(atom_wrap)) { ((uint8_t*)ray_data(atom_wrap))[0] = col_src->b8; atom_wrap->len = 1; }
+            }
+            if (atom_wrap && !RAY_IS_ERR(atom_wrap)) col_src = atom_wrap;
+        }
+
+        /* If the column is already a typed vector, use it directly */
+        if (ray_is_vec(col_src)) {
+            int64_t nrows = ray_len(col_src);
+            if (expected_rows < 0) expected_rows = nrows;
+            else if (nrows != expected_rows)
+                { ray_release(tbl); if (_bxn) ray_release(_bxn); if (_bxc) ray_release(_bxc); return ray_error("domain", NULL); }
+            ray_retain(col_src);
+            tbl = ray_table_add_col(tbl, name_id, col_src);
+            ray_release(col_src);
+            if (RAY_IS_ERR(tbl)) { if (_bxn) ray_release(_bxn); if (_bxc) ray_release(_bxc); return tbl; }
+            continue;
+        }
+
+        if (!is_list(col_src))
+            { ray_release(tbl); if (_bxn) ray_release(_bxn); if (_bxc) ray_release(_bxc); return ray_error("type", NULL); }
+        int64_t nrows = ray_len(col_src);
+
+        /* Validate all columns have consistent row count */
+        if (expected_rows < 0) expected_rows = nrows;
+        else if (nrows != expected_rows)
+            { ray_release(tbl); if (_bxn) ray_release(_bxn); if (_bxc) ray_release(_bxc); return ray_error("domain", NULL); }
+
+        ray_t** row_elems = (ray_t**)ray_data(col_src);
+
+        /* If the LIST contains non-atom values (e.g. nested vectors for an
+         * embedding column), store the LIST as the column directly rather
+         * than trying to build a typed vector from non-atomic elements. */
+        if (nrows > 0 && row_elems[0] && !ray_is_atom(row_elems[0])) {
+            ray_retain(col_src);
+            tbl = ray_table_add_col(tbl, name_id, col_src);
+            ray_release(col_src);
+            if (RAY_IS_ERR(tbl)) { if (_bxn) ray_release(_bxn); if (_bxc) ray_release(_bxc); return tbl; }
+            continue;
+        }
+
+        /* Determine column type from elements (scan for mixed I64/F64 → F64) */
+        int8_t col_type = RAY_I64;
+        if (nrows > 0) {
+            if (row_elems[0]->type == -RAY_F64) col_type = RAY_F64;
+            else if (row_elems[0]->type == -RAY_BOOL) col_type = RAY_BOOL;
+            else if (row_elems[0]->type == -RAY_SYM) col_type = RAY_SYM;
+            else if (row_elems[0]->type == -RAY_STR) col_type = RAY_STR;
+            else if (row_elems[0]->type == -RAY_GUID) col_type = RAY_GUID;
+            else if (row_elems[0]->type == -RAY_TIMESTAMP) col_type = RAY_TIMESTAMP;
+            else if (row_elems[0]->type == -RAY_DATE) col_type = RAY_DATE;
+            else if (row_elems[0]->type == -RAY_TIME) col_type = RAY_TIME;
+            /* RAY_CHAR removed — char atoms are now -RAY_STR */
+        }
+        /* Promote I64 → F64 if any element is F64 */
+        if (col_type == RAY_I64) {
+            for (int64_t j = 0; j < nrows; j++) {
+                if (row_elems[j]->type == -RAY_F64) { col_type = RAY_F64; break; }
+            }
+        }
+
+        ray_t* col_vec = ray_vec_new(col_type, nrows);
+        if (RAY_IS_ERR(col_vec))
+            { ray_release(tbl); if (_bxn) ray_release(_bxn); if (_bxc) ray_release(_bxc); return col_vec; }
+
+        for (int64_t j = 0; j < nrows; j++) {
+            if (col_type == RAY_STR) {
+                if (row_elems[j]->type != -RAY_STR) {
+                    ray_release(col_vec); ray_release(tbl);
+                    if (_bxn) ray_release(_bxn);
+                    if (_bxc) ray_release(_bxc);
+                    return ray_error("type", NULL);
+                }
+                const char *sptr = ray_str_ptr(row_elems[j]);
+                size_t slen = ray_str_len(row_elems[j]);
+                col_vec = ray_str_vec_append(col_vec, sptr, slen);
+            } else if (col_type == RAY_GUID) {
+                if (row_elems[j]->type != -RAY_GUID || !row_elems[j]->obj) {
+                    ray_release(col_vec); ray_release(tbl);
+                    if (_bxn) ray_release(_bxn);
+                    if (_bxc) ray_release(_bxc);
+                    return ray_error("type", NULL);
+                }
+                col_vec = ray_vec_append(col_vec, ray_data(row_elems[j]->obj));
+            } else {
+                /* Validate each element matches the column type (allow I64→F64 promotion) */
+                int type_ok = (row_elems[j]->type == -col_type);
+                if (!type_ok && col_type == RAY_F64 && row_elems[j]->type == -RAY_I64) type_ok = 1;
+                if (!type_ok) {
+                    ray_release(col_vec); ray_release(tbl);
+                    if (_bxn) ray_release(_bxn);
+                    if (_bxc) ray_release(_bxc);
+                    return ray_error("type", NULL);
+                }
+                void* val_ptr;
+                double promoted;
+                if (col_type == RAY_F64 && row_elems[j]->type == -RAY_I64) {
+                    promoted = (double)row_elems[j]->i64;
+                    val_ptr = &promoted;
+                } else if (col_type == RAY_I64) val_ptr = &row_elems[j]->i64;
+                else if (col_type == RAY_F64) val_ptr = &row_elems[j]->f64;
+                else if (col_type == RAY_BOOL) val_ptr = &row_elems[j]->b8;
+                else val_ptr = &row_elems[j]->i64; /* SYM/TIMESTAMP/DATE/TIME stored as i64 */
+                col_vec = ray_vec_append(col_vec, val_ptr);
+            }
+            if (RAY_IS_ERR(col_vec))
+                { ray_release(tbl); if (_bxn) ray_release(_bxn); if (_bxc) ray_release(_bxc); return col_vec; }
+        }
+
+        tbl = ray_table_add_col(tbl, name_id, col_vec);
+        ray_release(col_vec);
+        if (RAY_IS_ERR(tbl)) { if (_bxn) ray_release(_bxn); if (_bxc) ray_release(_bxc); return tbl; }
+    }
+
+    if (_bxn) ray_release(_bxn);
+    if (_bxc) ray_release(_bxc);
+    return tbl;
+}
+
+/* (key dict/table) — dict keys, or a table's column names as a SYM vector */
+ray_t* ray_key_fn(ray_t* x) {
+    if (x->type == RAY_DICT) {
+        ray_t* dict_keys = ray_dict_keys(x);
+        if (!dict_keys) return ray_error("type", NULL);
+        ray_retain(dict_keys);  /* the caller receives its own reference */
+        return dict_keys;
+    }
+    if (x->type != RAY_TABLE) return ray_error("type", NULL);
+    const int64_t width = ray_table_ncols(x);
+    ray_t* names = ray_vec_new(RAY_SYM, width);
+    if (RAY_IS_ERR(names)) return names;
+    int64_t* ids = (int64_t*)ray_data(names);
+    for (int64_t col = 0; col < width; col++)
+        ids[col] = ray_table_col_name(x, col);
+    names->len = width;
+    return names;
+}
+
+/* (value dict/table) — dict values, or a table's columns gathered into a list */
+ray_t* ray_value_fn(ray_t* x) {
+    if (x->type == RAY_DICT) {
+        ray_t* dict_vals = ray_dict_vals(x);
+        if (!dict_vals) return ray_error("type", NULL);
+        ray_retain(dict_vals);
+        return dict_vals;
+    }
+    if (x->type != RAY_TABLE) return ray_error("type", NULL);
+
+    /* Table: append every column, in order, to a freshly created list. */
+    const int64_t width = ray_table_ncols(x);
+    ray_t* cols = ray_list_new(width);
+    if (!cols) return ray_error("oom", NULL);
+    if (RAY_IS_ERR(cols)) return cols;
+    for (int64_t idx = 0; idx < width; idx++) {
+        cols = ray_list_append(cols, ray_table_get_col_idx(x, idx));
+        if (RAY_IS_ERR(cols)) break;  /* the append error is returned as-is */
+    }
+    return cols;
+}
+
+
+
+/* ray_lang_print, fmt_interpolate, ray_println_fn, ray_show_fn, ray_format_fn,
+ * ray_resolve_fn, ray_timeit_fn, ray_exit_fn, resolve_type_name,
+ * ray_read_csv_fn, ray_write_csv_fn, cast_match, ray_cast_fn, ray_type_fn,
+ * ray_read_file_fn, ray_load_file_fn, ray_write_file_fn
+ * moved to ops/builtins.c */
+
+/* ══════════════════════════════════════════
+ * Special forms: set, let, if, do
+ * ══════════════════════════════════════════ */
+
+/* (set name value) — evaluate `val_expr` and bind it in the global env. Args arrive unevaluated. */
+ray_t* ray_set_fn(ray_t* name_obj, ray_t* val_expr) {
+    if (name_obj->type != -RAY_SYM)
+        return ray_error("type", NULL);
+    ray_t* bound = ray_eval(val_expr);
+    /* Lazy handles are materialized before binding; an error from either step propagates */
+    if (!RAY_IS_ERR(bound) && ray_is_lazy(bound))
+        bound = ray_lazy_materialize(bound);
+    if (RAY_IS_ERR(bound))
+        return bound;
+    const ray_err_t rc = ray_env_set(name_obj->i64, bound);
+    if (rc == RAY_OK)
+        return bound;  /* set returns the value */
+    /* Binding failed: drop our reference and report the env's error code */
+    ray_release(bound);
+    return ray_error(ray_err_code_str(rc), NULL);
+}
+
+/* (let name value) — evaluate `val_expr` and bind it in the local scope. Args arrive unevaluated. */
+ray_t* ray_let_fn(ray_t* name_obj, ray_t* val_expr) {
+    if (name_obj->type != -RAY_SYM)
+        return ray_error("type", NULL);
+    ray_t* bound = ray_eval(val_expr);
+    /* Lazy handles are materialized before binding */
+    if (!RAY_IS_ERR(bound) && ray_is_lazy(bound))
+        bound = ray_lazy_materialize(bound);
+    if (RAY_IS_ERR(bound)) return bound;
+    const ray_err_t rc = ray_env_set_local(name_obj->i64, bound);
+    if (rc == RAY_OK) return bound;
+    ray_release(bound);
+    return ray_error(ray_err_code_str(rc), NULL);
+}
+
+/* (if cond then else?) — evaluate the condition, then exactly one branch. Args arrive unevaluated. */
+ray_t* ray_cond_fn(ray_t** args, int64_t n) {
+    if (n < 2) return ray_error("domain", NULL);
+    ray_t* test = ray_eval(args[0]);
+    /* Materialize lazy handles before testing truthiness */
+    if (!RAY_IS_ERR(test) && ray_is_lazy(test))
+        test = ray_lazy_materialize(test);
+    if (RAY_IS_ERR(test)) return test;
+    const int take_then = is_truthy(test);
+    ray_release(test);
+
+    /* No else branch: return 0 */
+    if (!take_then && n < 3)
+        return make_i64(0);
+    return ray_eval(args[take_then ? 1 : 2]);
+}
+
+/* (do expr1 expr2 ...) — run the expressions in order inside a fresh local scope; yields the last value. */
+ray_t* ray_do_fn(ray_t** args, int64_t n) {
+    if (n == 0) return make_i64(0);
+    if (ray_env_push_scope() != RAY_OK) return ray_error("oom", NULL);
+    ray_t* last = NULL;
+    int64_t pos = 0;
+    while (pos < n) {
+        if (last) ray_release(last);  /* intermediate values are discarded */
+        last = ray_eval(args[pos++]);
+        if (RAY_IS_ERR(last))
+            break;  /* the first error ends the sequence and is returned as-is */
+    }
+    /* The scope is popped on both the success and the error path */
+    ray_env_pop_scope();
+    return last;
+}
+
+/* ══════════════════════════════════════════
+ * Lambda functions
+ * ══════════════════════════════════════════ */
+
+/* (fn [params...] body...) — create a lambda object.
+ * Stores params list and body expressions in data area.
+ * Returns the new RAY_LAMBDA, or a "domain" / "reserve" / "oom" error. */
+ray_t* ray_fn(ray_t** args, int64_t n) {
+    if (n < 2) return ray_error("domain", NULL);
+    /* args[0] = param vector (list of name symbols), args[1..n-1] = body exprs */
+    ray_t* params_list = args[0];
+
+    /* Reject lambda parameters named under the reserved `.` namespace.
+     * Even though the bytecode VM resolves them to slot indices rather
+     * than env entries, a user-defined fn with `.sys.gc` as a parameter
+     * would silently override the builtin inside the body via the
+     * compile-time name→slot map — that counts as shadowing and is
+     * disallowed for the same reason `(let .sys.gc ...)` is.  Lambda
+     * param lists are SYM *vectors* (not RAY_LISTs): `[a b c]` of all
+     * syms is stored as a flat i64 sym-id array. */
+    if (params_list) {
+        int64_t nparams = ray_len(params_list);
+        if (params_list->type == RAY_SYM) {
+            int64_t* ids = (int64_t*)ray_data(params_list);
+            for (int64_t i = 0; i < nparams; i++)
+                if (ray_sym_is_reserved(ids[i]))
+                    return ray_error("reserve",
+                        "lambda parameter '%s' is in the reserved namespace",
+                        ray_str_ptr(ray_sym_str(ids[i])));
+        } else if (params_list->type == RAY_LIST) {
+            ray_t** pelems = (ray_t**)ray_data(params_list);
+            for (int64_t i = 0; i < nparams; i++) {
+                ray_t* p = pelems[i];
+                if (p && p->type == -RAY_SYM && ray_sym_is_reserved(p->i64))
+                    return ray_error("reserve",
+                        "lambda parameter '%s' is in the reserved namespace",
+                        ray_str_ptr(ray_sym_str(p->i64)));
+            }
+        }
+    }
+
+    /* Create lambda object with space for 7 slots:
+     * [0] params, [1] body, [2] bytecode, [3] constants, [4] n_locals,
+     * [5] nfo (source location), [6] dbg (debug metadata) */
+    ray_t* lambda = ray_alloc(7 * sizeof(ray_t*));
+    if (!lambda || RAY_IS_ERR(lambda)) return ray_error("oom", NULL);
+    lambda->type = RAY_LAMBDA;
+    lambda->attrs = 0;
+    lambda->len = 0;
+
+    /* Define every slot before anything below can fail, so the OOM path
+     * never passes ray_release a lambda with indeterminate slot contents. */
+    LAMBDA_BODY(lambda) = NULL;
+    LAMBDA_BC(lambda) = NULL;
+    LAMBDA_CONSTS(lambda) = NULL;
+    LAMBDA_NLOCALS(lambda) = 0;
+    LAMBDA_NFO(lambda) = NULL;
+    LAMBDA_DBG(lambda) = NULL;
+
+    /* Store params list */
+    ray_retain(params_list);
+    LAMBDA_PARAMS(lambda) = params_list;
+
+    /* Build body list: wrap body expressions in a RAY_LIST */
+    int64_t body_count = n - 1;
+    ray_t* body = ray_alloc(body_count * sizeof(ray_t*));
+    if (!body || RAY_IS_ERR(body)) {
+        ray_release(params_list);  /* undo the retain taken for slot [0] */
+        ray_release(lambda);  /* NOTE(review): assumes this does not also drop slot [0] — confirm */
+        return ray_error("oom", NULL);
+    }
+    body->type = RAY_LIST;
+    body->len = body_count;
+    ray_t** body_elems = (ray_t**)ray_data(body);
+    for (int64_t i = 0; i < body_count; i++) {
+        ray_retain(args[i + 1]);
+        body_elems[i] = args[i + 1];
+    }
+    LAMBDA_BODY(lambda) = body;
+
+    /* Attach source location info from current eval context (may be NULL) */
+    LAMBDA_NFO(lambda) = g_eval_nfo;
+    if (g_eval_nfo) ray_retain(g_eval_nfo);
+
+    return lambda;
+}
+
+/* Build a [span_i64, filename, fn_name, source] frame from a resolved span
+ * and append it to g_error_trace.  Shared by the bytecode and eval paths. */
+static void append_error_frame(ray_t* nfo, ray_span_t span) {
+    if (span.id == 0) return;  /* no resolved span: nothing to record */
+
+    ray_t* frame = ray_alloc(4 * sizeof(ray_t*));
+    if (!frame || RAY_IS_ERR(frame)) return;  /* best effort: drop the frame on OOM */
+    frame->type = RAY_LIST;
+    frame->len = 4;
+    ray_t** fe = (ray_t**)ray_data(frame);
+
+    fe[0] = ray_i64(span.id);
+    if (nfo && NFO_FILENAME(nfo)) {
+        fe[1] = NFO_FILENAME(nfo);
+        ray_retain(fe[1]);
+    } else {
+        fe[1] = ray_str("<unknown>", 9);
+    }
+    fe[2] = NULL;  /* fn_name slot is left empty here */
+    if (nfo && NFO_SOURCE(nfo)) {
+        fe[3] = NFO_SOURCE(nfo);
+        ray_retain(fe[3]);
+    } else {
+        fe[3] = ray_str("", 0);
+    }
+
+    if (!g_error_trace) {
+        g_error_trace = ray_alloc(sizeof(ray_t*));
+        if (!g_error_trace || RAY_IS_ERR(g_error_trace)) { g_error_trace = NULL; ray_release(frame); return; }
+        g_error_trace->type = RAY_LIST;
+        g_error_trace->len = 1;
+        ((ray_t**)ray_data(g_error_trace))[0] = frame;  /* our reference moves into the list */
+    } else {
+        g_error_trace = ray_list_append(g_error_trace, frame);
+        ray_release(frame);
+    }
+}
+
+/* Build a single error trace frame from a lambda's debug/nfo info at the given
+ * bytecode IP. */
+static void add_error_frame(ray_t* fn, int32_t ip) {
+    if (!fn || fn->type != RAY_LAMBDA) return;
+    ray_t* dbg = LAMBDA_DBG(fn);
+    ray_t* nfo = LAMBDA_NFO(fn);
+    if (!dbg && !nfo) return;
+
+    ray_span_t span = {0};
+    if (dbg) span = ray_bc_dbg_get(dbg, ip);
+    append_error_frame(nfo, span);
+}
+
+/* Add error frame from eval context (nfo + AST node) for call-site errors. */
+static void add_eval_error_frame(ray_t* nfo, ray_t* node) {
+    if (!nfo || !node) return;
+    append_error_frame(nfo, ray_nfo_get(nfo, node));
+}
+
+/* Execute compiled bytecode for a lambda. */
+static ray_t* vm_exec(ray_t* lambda, ray_t** call_args, int64_t argc);
+
+/* Call a lambda.  It is compiled lazily on first use; when it ends up
+ * compiled the bytecode VM runs it, otherwise the body expressions are
+ * evaluated one by one in a fresh local scope. */
+ray_t* call_lambda(ray_t* lambda, ray_t** call_args, int64_t argc) {
+    /* Lazy compilation on first call */
+    if (!LAMBDA_IS_COMPILED(lambda))
+        ray_compile(lambda);
+
+    /* Compiled (now or earlier): hand over to the bytecode executor */
+    if (LAMBDA_IS_COMPILED(lambda))
+        return vm_exec(lambda, call_args, argc);
+
+    /* Fallback: tree-walking interpreter */
+    ray_t* formals = LAMBDA_PARAMS(lambda);
+    ray_t* body = LAMBDA_BODY(lambda);
+    const int64_t nformals = ray_len(formals);
+
+    if (argc != nformals)
+        return ray_error("arity", "expected %" PRId64 " args, got %" PRId64, nformals, argc);
+
+    if (ray_env_push_scope() != RAY_OK)
+        return ray_error("oom", NULL);
+
+    /* Bind 'self' to the current lambda for recursion; the symbol id is
+     * interned on first use and cached in a function-local static. */
+    static int64_t self_sym_id = -1;
+    if (self_sym_id < 0)
+        self_sym_id = ray_sym_intern("self", 4);
+    ray_env_set_local(self_sym_id, lambda);
+
+    /* Bind each argument to its parameter id (argc == nformals at this point) */
+    int64_t* formal_ids = (int64_t*)ray_data(formals);
+    for (int64_t k = 0; k < nformals; k++)
+        (void)ray_env_set_local(formal_ids[k], call_args[k]);
+
+    /* Evaluate the body expressions in order; only the last value is kept */
+    ray_t** exprs = (ray_t**)ray_data(body);
+    const int64_t nexprs = ray_len(body);
+    ray_t* last = NULL;
+    for (int64_t e = 0; e < nexprs; e++) {
+        if (last) ray_release(last);
+        last = ray_eval(exprs[e]);
+        if (RAY_IS_ERR(last))
+            break;  /* an error ends the body early and becomes the result */
+    }
+
+    /* Scope teardown is shared by the normal and the error exit */
+    ray_env_pop_scope();
+    return last;
+}
+
+/* ══════════════════════════════════════════
+ * Stack-based VM executor (computed goto, frame-based)
+ * ══════════════════════════════════════════ */
+
+static _Thread_local ray_vm_t *__VM = NULL;
+
+/* Execute `lambda`'s compiled bytecode; `call_args` are borrowed and the result is owned by the caller.
+ * Errors unwind to the innermost OP_TRAP frame, else the VM is torn down and an error is returned. */
+static ray_t* vm_exec(ray_t* lambda, ray_t** call_args, int64_t argc) {
+    /* Computed goto dispatch table */
+    static void *dispatch[OP__COUNT] = {
+        [OP_RET]        = &&op_ret,
+        [OP_JMP]        = &&op_jmp,
+        [OP_JMPF]       = &&op_jmpf,
+        [OP_LOADCONST]  = &&op_loadconst,
+        [OP_LOADENV]    = &&op_loadenv,
+        [OP_STOREENV]   = &&op_storeenv,
+        [OP_POP]        = &&op_pop,
+        [OP_RESOLVE]    = &&op_resolve,
+        [OP_CALL1]      = &&op_call1,
+        [OP_CALL2]      = &&op_call2,
+        [OP_CALLN]      = &&op_calln,
+        [OP_CALLF]      = &&op_callf,
+        [OP_CALLS]      = &&op_calls,
+        [OP_CALLD]      = &&op_calld,
+        [OP_DUP]        = &&op_dup,
+        [OP_LOADCONST_W] = &&op_loadconst_w,
+        [OP_RESOLVE_W]  = &&op_resolve_w,
+        [OP_TRAP]       = &&op_trap,
+        [OP_TRAP_END]   = &&op_trap_end,
+    };
+
+    /* Arity check before allocating VM state */
+    const int64_t param_count = ray_len(LAMBDA_PARAMS(lambda));
+    if (argc != param_count)
+        return ray_error("arity", "expected %" PRId64 " args, got %" PRId64, param_count, argc);
+
+    ray_t *vm_block = ray_alloc(sizeof(ray_vm_t));
+    if (!vm_block || RAY_IS_ERR(vm_block)) return ray_error("oom", NULL);
+    ray_vm_t *vmp = (ray_vm_t *)ray_data(vm_block);
+    memset(vmp, 0, sizeof(ray_vm_t));
+    ray_vm_t *vm_prev = __VM;  /* enclosing VM of a nested run, if any; restored on every exit path */
+    __VM = vmp;
+
+#define vm (*vmp)
+    /* Set up initial frame */
+    vm.fn = lambda;
+    ray_retain(lambda);
+    int32_t n_locals = LAMBDA_NLOCALS(lambda);
+    vm.fp = 0;
+    vm.sp = n_locals;
+
+    /* Bind parameters into local slots */
+    for (int64_t i = 0; i < param_count && i < argc; i++) {
+        ray_retain(call_args[i]);
+        vm.ps[i] = call_args[i];
+    }
+
+    uint8_t *code = (uint8_t *)ray_data(LAMBDA_BC(lambda));
+    ray_t **cpool = (ray_t **)ray_data(LAMBDA_CONSTS(lambda));
+    int32_t ip = 0;
+    ray_t *vm_err_obj = NULL;
+
+#define DISPATCH() goto *dispatch[code[ip++]]
+#define PUSH(v)    do { if (vm.sp >= VM_STACK_SIZE) goto vm_error_limit; vm.ps[vm.sp++] = (v); } while(0)
+#define POP()      (vm.ps[--vm.sp])
+#define PEEK()     (vm.ps[vm.sp - 1])
+#define LOCAL(s)   (vm.ps[vm.fp + (s)])
+
+    DISPATCH();
+
+op_loadconst: {
+    uint8_t idx = code[ip++];
+    ray_t *val = cpool[idx];
+    ray_retain(val);
+    PUSH(val);
+    DISPATCH();
+}
+
+op_loadconst_w: {
+    uint16_t idx = (uint16_t)(code[ip] << 8) | code[ip + 1];
+    ip += 2;
+    ray_t *val = cpool[idx];
+    ray_retain(val);
+    PUSH(val);
+    DISPATCH();
+}
+
+op_loadenv: {
+    uint8_t slot = code[ip++];
+    ray_t *val = LOCAL(slot);
+    if (val) ray_retain(val);
+    else val = make_i64(0);
+    PUSH(val);
+    DISPATCH();
+}
+
+op_storeenv: {
+    uint8_t slot = code[ip++];
+    ray_t *val = POP();
+    if (LOCAL(slot)) ray_release(LOCAL(slot));
+    LOCAL(slot) = val;
+    DISPATCH();
+}
+
+op_pop: {
+    if (vm.sp > vm.fp + n_locals) {
+        ray_t *val = POP();
+        if (val) ray_release(val);
+    }
+    DISPATCH();
+}
+
+op_dup: {
+    ray_t *val = PEEK();
+    ray_retain(val);
+    PUSH(val);
+    DISPATCH();
+}
+
+op_resolve: {
+    uint8_t idx = code[ip++];
+    ray_t *name_obj = cpool[idx];
+    ray_t *val = ray_env_resolve(name_obj->i64);
+    if (!val) goto vm_error_name;
+    /* env_resolve returns an owned ref (rc >= 1); no extra retain needed.
+     * It can also return a real error (e.g. nyi from a parted-target link
+     * deref inside the dotted walker) — surface that as a VM error rather
+     * than pushing it onto the stack as if it were a normal value. */
+    if (RAY_IS_ERR(val)) { vm_err_obj = val; goto vm_error; }
+    PUSH(val);
+    DISPATCH();
+}
+
+op_resolve_w: {
+    uint16_t idx = (uint16_t)((code[ip] << 8) | code[ip + 1]);
+    ip += 2;
+    ray_t *name_obj = cpool[idx];
+    ray_t *val = ray_env_resolve(name_obj->i64);
+    if (!val) goto vm_error_name;
+    if (RAY_IS_ERR(val)) { vm_err_obj = val; goto vm_error; }
+    PUSH(val);
+    DISPATCH();
+}
+
+op_jmp: {
+    int16_t offset = (int16_t)((code[ip] << 8) | code[ip + 1]);
+    ip += 2;
+    ip += offset;
+    if (offset < 0 && g_eval_interrupted) goto vm_error_limit;
+    DISPATCH();
+}
+
+op_jmpf: {
+    int16_t offset = (int16_t)((code[ip] << 8) | code[ip + 1]);
+    ip += 2;
+    ray_t *cond = POP();
+    int truthy = is_truthy(cond);
+    ray_release(cond);
+    if (!truthy) ip += offset;
+    DISPATCH();
+}
+
+op_call1: {
+    ray_t *arg = POP();
+    ray_t *fn_obj = POP();
+    ray_unary_fn fn = (ray_unary_fn)(uintptr_t)fn_obj->i64;
+    ray_t *result;
+    if (RAY_UNLIKELY(RAY_IS_NULL(arg))) {
+        result = (fn == (ray_unary_fn)ray_nil_fn || fn == (ray_unary_fn)ray_type_fn)
+                 ? fn(arg) : ray_error("type", NULL);
+    } else if ((fn_obj->attrs & RAY_FN_ATOMIC) && arg->type >= 0)
+        result = atomic_map_unary(fn, arg);
+    else
+        result = fn(arg);
+    ray_release(arg);
+    ray_release(fn_obj);
+    if (RAY_IS_ERR(result)) { vm_err_obj = result; goto vm_error; }
+    PUSH(result);
+    DISPATCH();
+}
+
+op_call2: {
+    ray_t *right = POP();
+    ray_t *left = POP();
+    ray_t *fn_obj = POP();
+    ray_binary_fn fn = (ray_binary_fn)(uintptr_t)fn_obj->i64;
+    ray_t *result;
+    if (RAY_UNLIKELY(RAY_IS_NULL(left) || RAY_IS_NULL(right))) {
+        result = (fn == (ray_binary_fn)ray_eq_fn || fn == (ray_binary_fn)ray_neq_fn)
+                 ? fn(left, right) : ray_error("type", NULL);
+    /* Fast path: atoms have negative type — skip collection check entirely.
+     * Only call is_collection when at least one arg has type >= 0 (vector/list). */
+    } else if ((fn_obj->attrs & RAY_FN_ATOMIC) && (left->type >= 0 || right->type >= 0))
+        result = atomic_map_binary_op(fn, RAY_FN_OPCODE(fn_obj), left, right);
+    else
+        result = fn(left, right);
+    ray_release(left);
+    ray_release(right);
+    ray_release(fn_obj);
+    if (RAY_IS_ERR(result)) { vm_err_obj = result; goto vm_error; }
+    PUSH(result);
+    DISPATCH();
+}
+
+op_calln: {
+    uint8_t n = code[ip++];
+    if (n > 64) goto vm_error;
+    ray_t *fn_args[64];
+    for (int32_t i = n - 1; i >= 0; i--)
+        fn_args[i] = POP();
+    ray_t *fn_obj = POP();
+    ray_vary_fn fn = (ray_vary_fn)(uintptr_t)fn_obj->i64;
+    ray_t *result = fn(fn_args, n);
+    for (int32_t i = 0; i < n; i++)
+        ray_release(fn_args[i]);
+    ray_release(fn_obj);
+    if (RAY_IS_ERR(result)) { vm_err_obj = result; goto vm_error; }
+    PUSH(result);
+    DISPATCH();
+}
+
+op_callf: {
+    uint8_t n = code[ip++];
+    if (n > 64) goto vm_error;
+    ray_t *fn_args[64];
+    for (int32_t i = n - 1; i >= 0; i--)
+        fn_args[i] = POP();
+    ray_t *fn_obj = POP();
+
+    /* Compiled lambda: push frame, switch to callee bytecode */
+    if (fn_obj->type == RAY_LAMBDA) {
+        if (!LAMBDA_IS_COMPILED(fn_obj))
+            ray_compile(fn_obj);
+
+        if (LAMBDA_IS_COMPILED(fn_obj)) {
+            /* All checks before any VM state mutation.
+             * Stack limits take priority over arity (safety first). */
+            int64_t pcnt = ray_len(LAMBDA_PARAMS(fn_obj));
+            int32_t callee_locals = LAMBDA_NLOCALS(fn_obj);
+            if (vm.rp >= VM_STACK_SIZE ||
+                vm.sp + callee_locals >= VM_STACK_SIZE) {
+                for (int32_t i = 0; i < n; i++)
+                    if (fn_args[i]) ray_release(fn_args[i]);
+                ray_release(fn_obj);
+                goto vm_error_limit;
+            }
+            if (n != pcnt) {
+                for (int32_t i = 0; i < n; i++)
+                    if (fn_args[i]) ray_release(fn_args[i]);
+                ray_release(fn_obj);
+                vm_err_obj = ray_error("arity", "expected %" PRId64 " args, got %d", pcnt, n);
+                goto vm_error;
+            }
+
+            /* Push return frame */
+            vm.rs[vm.rp++] = (vm_ctx_t){ .fn = vm.fn, .fp = vm.fp, .ip = ip };
+
+            /* Set up new frame */
+            vm.fn = fn_obj;  /* takes ownership of stack ref */
+            vm.fp = vm.sp;
+            vm.sp += callee_locals;
+            n_locals = callee_locals;
+
+            /* Bind parameters */
+            int64_t bind = pcnt < n ? pcnt : n;
+            for (int64_t i = 0; i < bind; i++)
+                LOCAL(i) = fn_args[i];  /* transfer ownership from args */
+            for (int32_t i = (int32_t)bind; i < callee_locals; i++)
+                LOCAL(i) = NULL;
+            for (int64_t i = bind; i < n; i++)
+                ray_release(fn_args[i]);  /* excess args */
+
+            /* Check for Ctrl-C interrupt on each compiled call */
+            if (g_eval_interrupted) goto vm_error_limit;
+
+            /* Switch to callee bytecode */
+            code = (uint8_t *)ray_data(LAMBDA_BC(fn_obj));
+            cpool = (ray_t **)ray_data(LAMBDA_CONSTS(fn_obj));
+            ip = 0;
+            DISPATCH();
+        }
+    }
+
+    /* Non-lambda or uncompiled: dispatch by type */
+    {
+        ray_t *result;
+        switch (fn_obj->type) {
+        case RAY_UNARY:
+            if (fn_is_restricted(fn_obj)) { for (int32_t i = 0; i < n; i++) ray_release(fn_args[i]); result = ray_error("access", "restricted"); break; }
+            if (n != 1) { for (int32_t i = 0; i < n; i++) ray_release(fn_args[i]); result = ray_error("arity", "expected 1 arg, got %d", n); break; }
+            result = ((ray_unary_fn)(uintptr_t)fn_obj->i64)(fn_args[0]);
+            ray_release(fn_args[0]);
+            break;
+        case RAY_BINARY:
+            if (fn_is_restricted(fn_obj)) { for (int32_t i = 0; i < n; i++) ray_release(fn_args[i]); result = ray_error("access", "restricted"); break; }
+            if (n != 2) { for (int32_t i = 0; i < n; i++) ray_release(fn_args[i]); result = ray_error("arity", "expected 2 args, got %d", n); break; }
+            result = ((ray_binary_fn)(uintptr_t)fn_obj->i64)(fn_args[0], fn_args[1]);
+            ray_release(fn_args[0]);
+            ray_release(fn_args[1]);
+            break;
+        case RAY_VARY:
+            if (fn_is_restricted(fn_obj)) { for (int32_t i = 0; i < n; i++) ray_release(fn_args[i]); result = ray_error("access", "restricted"); break; }
+            result = ((ray_vary_fn)(uintptr_t)fn_obj->i64)(fn_args, n);
+            for (int32_t i = 0; i < n; i++) ray_release(fn_args[i]);
+            break;
+        case RAY_LAMBDA:
+            result = call_lambda(fn_obj, fn_args, n);
+            for (int32_t i = 0; i < n; i++) ray_release(fn_args[i]);
+            break;
+        default:
+            for (int32_t i = 0; i < n; i++) ray_release(fn_args[i]);
+            result = ray_error("type", NULL);
+            break;
+        }
+        ray_release(fn_obj);
+        if (RAY_IS_ERR(result)) { vm_err_obj = result; goto vm_error; }
+        PUSH(result);
+        DISPATCH();
+    }
+}
+
+op_calls: {
+    /* Self-recursive call — lean path matching rayforce 1.
+     * No fn object on stack. Args are already at sp-argc..sp.
+     * Push return frame, set fp so args become locals, extend for extra locals. */
+    uint8_t argc = code[ip++];
+
+    /* Stack overflow guard */
+    if (RAY_UNLIKELY(vm.rp >= VM_STACK_SIZE)) goto vm_error_limit;
+    if (RAY_UNLIKELY(vm.sp + n_locals >= VM_STACK_SIZE)) goto vm_error_limit;
+
+    /* Push return frame (fn=NULL signals self-call to OP_RET) */
+    vm.rs[vm.rp++] = (vm_ctx_t){ .fn = NULL, .fp = vm.fp, .ip = ip };
+
+    /* Args on stack become the new frame's first locals.
+     * Compiler guarantees argc == param count, so argc <= n_locals. */
+    vm.fp = vm.sp - argc;
+
+    /* Extend stack for extra locals beyond params (let bindings etc.) */
+    for (int32_t i = argc; i < n_locals; i++)
+        vm.ps[vm.sp++] = NULL;
+
+    ip = 0;
+    DISPATCH();
+}
+
+op_calld: {
+    /* Dynamic dispatch: evaluate AST directly via ray_eval */
+    uint8_t n = code[ip++];
+    if (n > 64) goto vm_error;  /* fn_args below holds at most 64 entries (same guard as OP_CALLN/OP_CALLF) */
+    if (n == 0) {
+        /* n=0: the AST itself is on the stack, eval it directly */
+        ray_t *ast = POP();
+        ray_t *result = ray_eval(ast);
+        ray_release(ast);
+        if (RAY_IS_ERR(result)) { vm_err_obj = result; goto vm_error; }
+        PUSH(result);
+        DISPATCH();
+    }
+    /* n>0: build call list and eval */
+    ray_t *fn_args[64];
+    for (int32_t i = n - 1; i >= 0; i--)
+        fn_args[i] = POP();
+    ray_t *fn_obj = POP();
+
+    ray_t *call_list = ray_alloc((n + 1) * sizeof(ray_t *));
+    if (!call_list || RAY_IS_ERR(call_list)) {
+        for (int32_t i = 0; i < n; i++) ray_release(fn_args[i]);
+        ray_release(fn_obj);
+        goto vm_error;
+    }
+    call_list->type = RAY_LIST;
+    call_list->len = n + 1;
+    ray_t **elems = (ray_t **)ray_data(call_list);
+    elems[0] = fn_obj;
+    for (int32_t i = 0; i < n; i++)
+        elems[i + 1] = fn_args[i];
+
+    ray_t *result = ray_eval(call_list);
+    ray_release(call_list);
+    if (RAY_IS_ERR(result)) { vm_err_obj = result; goto vm_error; }
+    PUSH(result);
+    DISPATCH();
+}
+
+op_ret: {
+    ray_t *result;
+    bool from_stack = (vm.sp > vm.fp + n_locals);
+    if (from_stack) {
+        result = POP();
+        ray_retain(result);  /* prevent free during cleanup if aliased in locals */
+    } else {
+        result = RAY_NULL_OBJ;
+    }
+
+    /* Clean up current frame — release all locals and leftover stack slots */
+    while (vm.sp > vm.fp) {
+        ray_t *v = vm.ps[--vm.sp];
+        if (v) ray_release(v);
+    }
+
+    /* Undo protective retain — POP's reference is the caller's ownership */
+    if (from_stack) ray_release(result);
+
+    if (vm.rp == 0) {
+        /* Top-level return */
+        ray_release(vm.fn);
+        __VM = vm_prev;
+#undef vm
+        ray_free(vm_block);
+        return result;  /* caller owns the POP'd reference */
+#define vm (*vmp)
+    }
+
+    /* Pop return frame */
+    vm.rp--;
+    vm.fp = vm.rs[vm.rp].fp;
+    ip = vm.rs[vm.rp].ip;
+    if (vm.rs[vm.rp].fn) {
+        /* Normal call: restore caller's function */
+        ray_release(vm.fn);
+        vm.fn = vm.rs[vm.rp].fn;
+        code = (uint8_t *)ray_data(LAMBDA_BC(vm.fn));
+        cpool = (ray_t **)ray_data(LAMBDA_CONSTS(vm.fn));
+        n_locals = LAMBDA_NLOCALS(vm.fn);
+    }
+    /* Self-call (fn==NULL): vm.fn/code/cpool/n_locals are already correct */
+    PUSH(result);
+    DISPATCH();
+}
+
+op_trap: {
+    int16_t offset = (int16_t)((code[ip] << 8) | code[ip + 1]);
+    ip += 2;
+    if (vm.tp >= VM_TRAP_SIZE) goto vm_error_limit;
+    vm.ts[vm.tp++] = (vm_trap_t){
+        .rp = vm.rp, .sp = vm.sp, .handler_ip = ip + offset,
+        .fn = vm.fn, .fp = vm.fp, .n_locals = n_locals
+    };
+    ray_retain(vm.fn);
+    DISPATCH();
+}
+
+op_trap_end: {
+    if (vm.tp > 0) {
+        vm.tp--;
+        ray_release(vm.ts[vm.tp].fn);
+    }
+    DISPATCH();
+}
+
+    const char *vm_err_str = "domain";
+    const char *vm_err_detail = NULL;
+    goto vm_error_cleanup;
+
+vm_error_limit:
+    vm_err_str = "limit";
+    vm_err_detail = "stack overflow";
+    goto vm_error_cleanup;
+
+vm_error_name:
+    vm_err_str = "name";
+    vm_err_detail = NULL;
+    goto vm_error_cleanup;
+
+vm_error:
+    vm_err_str = "domain";
+    vm_err_detail = NULL;
+
+vm_error_cleanup: {
+    /* Check for trap frame */
+    if (vm.tp > 0) {
+        vm.tp--;
+        vm_trap_t trap = vm.ts[vm.tp];
+
+        /* Clean up return frames above trap point */
+        while (vm.rp > trap.rp) {
+            vm.rp--;
+            if (vm.rs[vm.rp].fn) ray_release(vm.rs[vm.rp].fn);
+        }
+
+        /* Clean up stack above trap point */
+        while (vm.sp > trap.sp) {
+            ray_t *v = vm.ps[--vm.sp];
+            if (v) ray_release(v);
+        }
+
+        /* Get error value — prefer vm_err_obj (VM-detected errors like
+         * arity mismatch) over __raise_val (user raise expressions) */
+        ray_t *err_val = vm_err_obj ? vm_err_obj : __raise_val;
+        vm_err_obj = NULL;
+        __raise_val = NULL;
+        if (!err_val) err_val = make_i64(0);
+
+        /* Restore context and push error value */
+        ray_release(vm.fn);
+        vm.fn = trap.fn;  /* takes ownership from trap frame */
+        vm.fp = trap.fp;
+        n_locals = trap.n_locals;
+        code = (uint8_t *)ray_data(LAMBDA_BC(vm.fn));
+        cpool = (ray_t **)ray_data(LAMBDA_CONSTS(vm.fn));
+        ip = trap.handler_ip;
+        PUSH(err_val);
+        DISPATCH();
+    }
+
+    /* No trap frame — regular error cleanup */
+
+    /* Build error trace: current frame + callers from return stack */
+    add_error_frame(vm.fn, ip > 0 ? ip - 1 : 0);
+    for (int32_t i = vm.rp - 1; i >= 0; i--) {
+        if (vm.rs[i].fn)
+            add_error_frame(vm.rs[i].fn, vm.rs[i].ip > 0 ? vm.rs[i].ip - 1 : 0);
+    }
+
+    for (int32_t i = 0; i < vm.sp; i++)
+        if (vm.ps[i]) ray_release(vm.ps[i]);
+    ray_release(vm.fn);
+    for (int32_t i = 0; i < vm.rp; i++)
+        if (vm.rs[i].fn) ray_release(vm.rs[i].fn);
+    for (int32_t i = 0; i < vm.tp; i++)
+        ray_release(vm.ts[i].fn);
+    __VM = vm_prev;
+#undef vm
+    ray_free(vm_block);
+    if (vm_err_obj)
+        return vm_err_obj;
+    if (vm_err_detail)
+        return ray_error(vm_err_str, "%s", vm_err_detail);
+    return ray_error(vm_err_str, NULL);
+}
+
+#undef DISPATCH
+#undef PUSH
+#undef POP
+#undef PEEK
+#undef LOCAL
+#undef vm
+}
+
+
+/* ray_enlist_fn, ray_dict_fn, ray_nil_fn, ray_where_fn, ray_group_fn,
+ * ray_concat_fn, ray_raze_fn, ray_within_fn, ray_fdiv_fn
+ * moved to ops/builtins.c */
+
+/* ══════════════════════════════════════════
+ * Builtin registration
+ * ══════════════════════════════════════════ */
+
+/* Bind `obj` under `name` in the global env.  For reserved-namespace
+ * names like `.sys.gc`:
+ *
+ *   - `.sys` itself is a RAY_DICT in the env (keys SYM vec + vals
+ *     LIST).  Typing `.sys` at the REPL returns the whole dict for
+ *     introspection.
+ *   - `.sys.gc` is ALSO bound flat in the env, pointing at the same
+ *     function object.  This keeps direct lookup O(1), surfaces the
+ *     full name to `ray_env_lookup_prefix` (so tab completion and
+ *     REPL highlighting continue to see every reserved builtin),
+ *     and lets error messages cite the fully-qualified name.
+ *
+ * The two bindings are created at startup and kept in sync — writes
+ * to any `.`-prefixed name are refused by ray_env_set, so user code
+ * can't drift them apart.  Names may nest deeper than two levels
+ * (e.g. `.db.splayed.set`): reg_bind walks the intermediate segments
+ * as nested sub-dicts, and its asserts bound the depth by MAX_DEPTH. */
+/* Fetch the sub-dict stored under `key` in `parent`, creating and
+ * inserting an empty one when the key is missing or holds something
+ * that is not a RAY_DICT.
+ *
+ *   parent     dict to search; may be replaced by the upsert
+ *   key        key to look up in `parent`
+ *   out_child  receives the sub-dict; the caller owns that reference
+ *
+ * Returns `parent` unchanged when the sub-dict already existed,
+ * otherwise whatever ray_dict_upsert hands back after the insert. */
+static ray_t* dict_get_or_create_subdict(ray_t* parent, ray_t* key,
+                                         ray_t** out_child) {
+    ray_t* existing = ray_dict_get(parent, key);
+    if (existing) {
+        /* Reuse the stored value only when it really is a dict. */
+        if (!RAY_IS_ERR(existing) && existing->type == RAY_DICT) {
+            *out_child = existing;
+            return parent;
+        }
+        ray_release(existing);
+    }
+
+    /* Nothing usable under `key`: build an empty dict to store there. */
+    ray_t* keys = ray_sym_vec_new(RAY_SYM_W64, 4);
+    ray_t* vals = ray_list_new(4);
+    assert(keys && !RAY_IS_ERR(keys) && vals && !RAY_IS_ERR(vals));
+    ray_t* child = ray_dict_new(keys, vals);
+    assert(child && !RAY_IS_ERR(child));
+
+    /* One reference is kept for the caller; the upsert takes its own.
+     * NOTE(review): if ray_dict_new already returns an owned reference
+     * this retain looks like one too many — confirm the refcount. */
+    ray_retain(child);
+    *out_child = child;
+    return ray_dict_upsert(parent, key, child);
+}
+
+/* Bind builtin `obj` under `name` in the global environment.
+ *
+ * Plain names are bound directly.  Reserved dotted names (leading `.`
+ * with at least two segments, e.g. `.sys.gc` or `.db.splayed.set`)
+ * are bound twice: once as a leaf inside the nested dict rooted at
+ * the first segment, and once flat under the fully-qualified symbol.
+ *
+ * The bind calls are made unconditionally and only their results are
+ * asserted, so registration still happens when assert() is compiled
+ * out with NDEBUG. */
+static void reg_bind(const char* name, ray_t* obj) {
+    int64_t sym = ray_sym_intern(name, strlen(name));
+    if (name[0] == '.' && ray_sym_is_dotted(sym)) {
+        const int64_t* segs;
+        int nsegs = ray_sym_segs(sym, &segs);
+        assert(nsegs >= 2 && "reg_bind: dotted reserved name must have ≥ 2 segments");
+
+        int64_t root_sym = segs[0];      /* e.g. `.sys` or `.db` */
+        int64_t leaf_sym = segs[nsegs-1];/* leaf action sym */
+
+        /* 1. Get-or-create the root dict bound at `.<ns>`. */
+        ray_t* root = ray_env_get(root_sym);
+        if (root) {
+            ray_retain(root);
+        } else {
+            ray_t* keys = ray_sym_vec_new(RAY_SYM_W64, 4);
+            ray_t* vals = ray_list_new(4);
+            assert(keys && !RAY_IS_ERR(keys) && vals && !RAY_IS_ERR(vals));
+            root = ray_dict_new(keys, vals);
+            assert(root && !RAY_IS_ERR(root));
+        }
+
+        /* 2. For each intermediate segment, descend into (or create)
+         *    a sub-dict.  Two-level names skip this loop entirely
+         *    and fall through to the leaf upsert below.  After the
+         *    walk, `cur` points at the dict that should hold the
+         *    leaf; `chain[]` records the parents we still need to
+         *    write back so a COW upsert at the deepest level
+         *    propagates upward through every parent. */
+        enum { MAX_DEPTH = 4 };
+        ray_t* chain[MAX_DEPTH] = { root };
+        int64_t chain_keys[MAX_DEPTH] = { 0 };
+        int chain_len = 1;
+        ray_t* cur = root;
+        for (int i = 1; i < nsegs - 1; i++) {
+            assert(chain_len < MAX_DEPTH);
+            ray_t* mid_key = ray_sym(segs[i]);
+            ray_t* child = NULL;
+            cur = dict_get_or_create_subdict(cur, mid_key, &child);
+            ray_release(mid_key);
+            chain[chain_len - 1] = cur;
+            chain_keys[chain_len - 1] = segs[i];
+            chain[chain_len++] = child;
+            cur = child;
+        }
+
+        /* 3. Upsert the leaf into the deepest dict, then walk back up
+         *    re-upserting any COWd parents into their parents. */
+        ray_t* leaf_key = ray_sym(leaf_sym);
+        ray_t* deepest = ray_dict_upsert(cur, leaf_key, obj);
+        ray_release(leaf_key);
+        chain[chain_len - 1] = deepest;
+        for (int i = chain_len - 1; i > 0; i--) {
+            ray_t* parent_key = ray_sym(chain_keys[i - 1]);
+            chain[i - 1] = ray_dict_upsert(chain[i - 1], parent_key, chain[i]);
+            ray_release(parent_key);
+            ray_release(chain[i]);  /* dict_upsert retained */
+        }
+
+        /* 4. Bind the (possibly-COWd) root and the flat fully-qualified
+         *    name so ray_env_lookup_prefix (REPL completion / syntax
+         *    highlighting) enumerates every reserved builtin by name.
+         *    ray_env_bind_flat skips the dotted-walk so this doesn't
+         *    re-upsert into the same dict we just built.
+         *    The calls live outside assert() so they are not compiled
+         *    away together with the check under NDEBUG. */
+        int root_bound = (ray_env_bind(root_sym, chain[0]) == RAY_OK);
+        assert(root_bound);
+        (void)root_bound;
+        ray_release(chain[0]);
+        int flat_bound = (ray_env_bind_flat(sym, obj) == RAY_OK);
+        assert(flat_bound);
+        (void)flat_bound;
+        return;
+    }
+    /* Ordinary (non-reserved) name: a single direct binding. */
+    int bound = (ray_env_bind(sym, obj) == RAY_OK);
+    assert(bound);
+    (void)bound;
+}
+
+/* Create a two-argument builtin from `fn` with the given attribute
+ * flags and bind it under `name` through reg_bind. */
+static void register_binary(const char* name, uint8_t attrs, ray_binary_fn fn) {
+    ray_t* builtin = ray_fn_binary(name, attrs, fn);
+
+    reg_bind(name, builtin);
+    /* Drop the local reference once the binding has been made. */
+    ray_release(builtin);
+}
+
+/* Same as register_binary, but additionally tags the function object
+ * with a DAG `opcode` (used for vectorized execution) before it is
+ * bound under `name`. */
+static void register_binary_op(const char* name, uint8_t attrs, ray_binary_fn fn, uint16_t opcode) {
+    ray_t* builtin = ray_fn_binary(name, attrs, fn);
+
+    RAY_FN_SET_OPCODE(builtin, opcode);
+    reg_bind(name, builtin);
+    /* Drop the local reference once the binding has been made. */
+    ray_release(builtin);
+}
+
+/* Create a one-argument builtin from `fn` with the given attribute
+ * flags and bind it under `name` through reg_bind. */
+static void register_unary(const char* name, uint8_t attrs, ray_unary_fn fn) {
+    ray_t* builtin = ray_fn_unary(name, attrs, fn);
+
+    reg_bind(name, builtin);
+    /* Drop the local reference once the binding has been made. */
+    ray_release(builtin);
+}
+
+/* Same as register_unary, but additionally tags the function object
+ * with `opcode` via RAY_FN_SET_OPCODE before it is bound under `name`. */
+static void register_unary_op(const char* name, uint8_t attrs, ray_unary_fn fn, uint16_t opcode) {
+    ray_t* builtin = ray_fn_unary(name, attrs, fn);
+
+    RAY_FN_SET_OPCODE(builtin, opcode);
+    reg_bind(name, builtin);
+    /* Drop the local reference once the binding has been made. */
+    ray_release(builtin);
+}
+
+/* Create a variadic builtin from `fn` with the given attribute flags
+ * and bind it under `name` through reg_bind. */
+static void register_vary(const char* name, uint8_t attrs, ray_vary_fn fn) {
+    ray_t* builtin = ray_fn_vary(name, attrs, fn);
+
+    reg_bind(name, builtin);
+    /* Drop the local reference once the binding has been made. */
+    ray_release(builtin);
+}
+
+/* Register every builtin function in the global environment.
+ *
+ * Each line creates one function object through a register_* helper,
+ * giving its language-level name, its RAY_FN_* attribute flags and
+ * the C implementation; the *_op variants also attach a DAG opcode.
+ * Names starting with `.` go through reg_bind's reserved-namespace
+ * path.  Called from ray_lang_init after the env is initialised. */
+static void ray_register_builtins(void) {
+    register_binary_op("+",   RAY_FN_ATOMIC, ray_add_fn, OP_ADD);
+    register_binary_op("-",   RAY_FN_ATOMIC, ray_sub_fn, OP_SUB);
+    register_binary_op("*",   RAY_FN_ATOMIC, ray_mul_fn, OP_MUL);
+    register_binary_op("/",   RAY_FN_ATOMIC, ray_div_fn, OP_DIV);
+    register_binary_op("%",   RAY_FN_ATOMIC, ray_mod_fn, OP_MOD);
+    register_binary_op(">",   RAY_FN_ATOMIC, ray_gt_fn,  OP_GT);
+    register_binary_op("<",   RAY_FN_ATOMIC, ray_lt_fn,  OP_LT);
+    register_binary_op(">=",  RAY_FN_ATOMIC, ray_gte_fn,    OP_GE);
+    register_binary_op("<=",  RAY_FN_ATOMIC, ray_lte_fn,    OP_LE);
+    register_binary_op("==",  RAY_FN_ATOMIC, ray_eq_fn,  OP_EQ);
+    register_binary_op("!=",  RAY_FN_ATOMIC, ray_neq_fn,    OP_NE);
+    /* Special-form so args are passed unevaluated and the kernel can
+     * short-circuit on the first determining scalar (matches v1 and the
+     * Lisp/Clojure convention). */
+    register_vary("and", RAY_FN_SPECIAL_FORM, ray_and_vary_fn);
+    register_vary("or",  RAY_FN_SPECIAL_FORM, ray_or_vary_fn);
+    register_unary_op("not",  RAY_FN_NONE,   ray_not_fn, OP_NOT);
+    register_unary_op("neg",  RAY_FN_ATOMIC, ray_neg_fn, OP_NEG);
+    register_unary("round",   RAY_FN_ATOMIC, ray_round_fn);
+    register_unary_op("floor", RAY_FN_ATOMIC, ray_floor_fn, OP_FLOOR);
+    register_unary_op("ceil",  RAY_FN_ATOMIC, ray_ceil_fn,  OP_CEIL);
+    register_unary_op("abs",   RAY_FN_ATOMIC, ray_abs_fn,  OP_ABS);
+    register_unary_op("sqrt",  RAY_FN_ATOMIC, ray_sqrt_fn, OP_SQRT);
+    register_unary_op("log",   RAY_FN_ATOMIC, ray_log_fn,  OP_LOG);
+    register_unary_op("exp",   RAY_FN_ATOMIC, ray_exp_fn,  OP_EXP);
+
+    /* Special forms */
+    register_binary("set", RAY_FN_SPECIAL_FORM | RAY_FN_RESTRICTED, ray_set_fn);
+    register_binary("let", RAY_FN_SPECIAL_FORM, ray_let_fn);
+    register_vary("if",    RAY_FN_SPECIAL_FORM, ray_cond_fn);
+    register_vary("do",    RAY_FN_SPECIAL_FORM, ray_do_fn);
+    register_vary("fn",    RAY_FN_SPECIAL_FORM, ray_fn);
+
+    /* Aggregation builtins */
+    register_unary("sum",   RAY_FN_AGGR, ray_sum_fn);
+    register_unary("count", RAY_FN_AGGR, ray_count_fn);
+    register_unary("avg",   RAY_FN_AGGR, ray_avg_fn);
+    register_unary("min",   RAY_FN_AGGR, ray_min_fn);
+    register_unary("max",   RAY_FN_AGGR, ray_max_fn);
+    register_unary("first", RAY_FN_NONE, ray_first_fn);
+    register_unary("last",  RAY_FN_NONE, ray_last_fn);
+    register_unary("med",   RAY_FN_AGGR, ray_med_fn);
+    register_unary("dev",        RAY_FN_AGGR, ray_dev_fn);
+    register_unary("stddev",     RAY_FN_AGGR, ray_stddev_fn);
+    register_unary("stddev_pop", RAY_FN_AGGR, ray_stddev_pop_fn);
+    /* dev_pop is bound to the same implementation as stddev_pop. */
+    register_unary("dev_pop",    RAY_FN_AGGR, ray_stddev_pop_fn);
+    register_unary("var",        RAY_FN_AGGR, ray_var_fn);
+    register_unary("var_pop",    RAY_FN_AGGR, ray_var_pop_fn);
+
+    /* Error handling */
+    register_unary("raise", RAY_FN_NONE, ray_raise_fn);
+    register_binary("try",  RAY_FN_SPECIAL_FORM, ray_try_fn);
+
+    /* Higher-order functions */
+    register_vary("map",    RAY_FN_NONE, ray_map_fn);
+    register_vary("pmap",   RAY_FN_NONE, ray_pmap_fn);
+    register_vary("fold",   RAY_FN_NONE, ray_fold_fn);
+    register_vary("scan",   RAY_FN_NONE, ray_scan_fn);
+    register_binary("filter", RAY_FN_NONE, ray_filter_fn);
+    register_vary("apply",  RAY_FN_NONE, ray_apply_fn);
+
+    /* Collection operations */
+    register_unary("distinct", RAY_FN_NONE, ray_distinct_fn);
+    register_binary("in",      RAY_FN_NONE, ray_in_fn);
+    register_binary("except",  RAY_FN_NONE, ray_except_fn);
+    register_binary("union",   RAY_FN_NONE, ray_union_fn);
+    register_binary("sect",    RAY_FN_NONE, ray_sect_fn);
+    register_binary("take",    RAY_FN_NONE, ray_take_fn);
+    register_binary("at",      RAY_FN_NONE, ray_at_fn);
+    register_binary("find",    RAY_FN_NONE, ray_find_fn);
+    register_unary("reverse",  RAY_FN_NONE, ray_reverse_fn);
+    register_unary("til",      RAY_FN_NONE, ray_til_fn);
+
+    /* Sorting operations */
+    register_unary("asc",      RAY_FN_NONE, ray_asc_fn);
+    register_unary("desc",     RAY_FN_NONE, ray_desc_fn);
+    register_unary("iasc",     RAY_FN_NONE, ray_iasc_fn);
+    register_unary("idesc",    RAY_FN_NONE, ray_idesc_fn);
+    register_unary("rank",     RAY_FN_NONE, ray_rank_fn);
+    register_binary("xasc",    RAY_FN_NONE, ray_xasc_fn);
+    register_binary("xdesc",   RAY_FN_NONE, ray_xdesc_fn);
+
+    /* Table operations */
+    register_vary("list",      RAY_FN_NONE, ray_list_fn);
+    register_binary("table",   RAY_FN_NONE, ray_table_fn);
+    register_unary("key",      RAY_FN_NONE, ray_key_fn);
+    register_unary("value",    RAY_FN_NONE, ray_value_fn);
+    register_binary("union-all",      RAY_FN_NONE, ray_union_all_fn);
+    /* table-distinct removed — distinct dispatches on type */
+
+    /* Query operations */
+    register_vary("select",    RAY_FN_SPECIAL_FORM, ray_select_fn);
+    register_vary("update",    RAY_FN_SPECIAL_FORM | RAY_FN_RESTRICTED, ray_update_fn);
+    register_vary("insert",    RAY_FN_SPECIAL_FORM | RAY_FN_RESTRICTED, ray_insert_fn);
+    register_vary("upsert",    RAY_FN_SPECIAL_FORM | RAY_FN_RESTRICTED, ray_upsert_fn);
+    register_binary("xbar",    RAY_FN_ATOMIC, ray_xbar_fn);
+
+    /* Join operations */
+    register_vary("left-join",   RAY_FN_NONE, ray_left_join_fn);
+    register_vary("inner-join",  RAY_FN_NONE, ray_inner_join_fn);
+    register_vary("anti-join",   RAY_FN_NONE, ray_anti_join_fn);
+    register_vary("window-join", RAY_FN_SPECIAL_FORM, ray_window_join_fn);
+    /* NOTE(review): window-join1 is bound to the same
+     * ray_window_join_fn as window-join — confirm the alias is
+     * intended rather than a missing dedicated implementation. */
+    register_vary("window-join1", RAY_FN_SPECIAL_FORM, ray_window_join_fn);
+    register_vary("asof-join",   RAY_FN_NONE, ray_asof_join_fn);
+
+    /* I/O builtins */
+    register_vary("println",    RAY_FN_NONE, ray_println_fn);
+    register_vary("show",       RAY_FN_NONE, ray_show_fn);
+    register_vary("format",     RAY_FN_NONE, ray_format_fn);
+    register_vary(".csv.read",  RAY_FN_RESTRICTED, ray_read_csv_fn);
+    register_vary(".csv.write", RAY_FN_RESTRICTED, ray_write_csv_fn);
+    register_binary("as",       RAY_FN_NONE, ray_cast_fn);
+    register_unary("type",      RAY_FN_NONE, ray_type_fn);
+    register_unary("read",      RAY_FN_RESTRICTED, ray_read_file_fn);
+    register_binary("write",    RAY_FN_RESTRICTED, ray_write_file_fn);
+    register_unary("load",      RAY_FN_RESTRICTED, ray_load_file_fn);
+    register_unary("exit",      RAY_FN_RESTRICTED, ray_exit_fn);
+    register_vary("resolve",    RAY_FN_SPECIAL_FORM, ray_resolve_fn);
+    register_vary("timeit",     RAY_FN_SPECIAL_FORM, ray_timeit_fn);
+
+    /* Additional builtins (ported from rayforce) */
+    register_vary("enlist",     RAY_FN_NONE, ray_enlist_fn);
+    register_binary("dict",     RAY_FN_NONE, ray_dict_fn);
+    register_unary("nil?",      RAY_FN_NONE, ray_nil_fn);
+    register_unary("where",     RAY_FN_NONE, ray_where_fn);
+    register_unary("group",     RAY_FN_NONE, ray_group_fn);
+    register_binary("concat",   RAY_FN_NONE, ray_concat_fn);
+    register_unary("raze",      RAY_FN_NONE, ray_raze_fn);
+    register_binary("within",   RAY_FN_NONE, ray_within_fn);
+    register_binary("div",      RAY_FN_ATOMIC, ray_fdiv_fn);
+    register_binary("rand",     RAY_FN_NONE, ray_rand_fn);
+    register_binary("bin",      RAY_FN_NONE, ray_bin_fn);
+    register_binary("binr",     RAY_FN_NONE, ray_binr_fn);
+    register_vary("map-left",   RAY_FN_NONE, ray_map_left_fn);
+    register_vary("map-right",  RAY_FN_NONE, ray_map_right_fn);
+
+    /* String operations */
+    register_binary("split",     RAY_FN_NONE, ray_split_fn);
+
+    /* Serialization */
+    register_unary("ser",        RAY_FN_NONE, ray_ser_fn);
+    register_unary("de",         RAY_FN_NONE, ray_de_fn);
+
+    /* Splayed / partitioned table I/O */
+    /* Database storage — splayed and parted table I/O.  Kept under a
+     * dedicated `.db.*` namespace so format-specific siblings stay
+     * grouped (set/get/mount per format) and there's room to grow
+     * without polluting the top-level builtin namespace. */
+    register_vary(".db.splayed.set",   RAY_FN_RESTRICTED, ray_set_splayed_fn);
+    register_vary(".db.splayed.get",   RAY_FN_NONE,       ray_get_splayed_fn);
+    register_vary(".db.splayed.mount", RAY_FN_NONE,       ray_db_splayed_mount_fn);
+    register_vary(".db.parted.get",    RAY_FN_NONE,       ray_get_parted_fn);
+    register_vary(".db.parted.mount",  RAY_FN_NONE,       ray_db_parted_mount_fn);
+
+    /* GUID generation */
+    register_unary("guid",       RAY_FN_NONE, ray_guid_fn);
+
+    /* In-place mutation */
+    register_vary("alter",       RAY_FN_SPECIAL_FORM, ray_alter_fn);
+
+    /* Pattern matching */
+    register_binary("like",      RAY_FN_NONE, ray_like_fn);
+
+    /* Temporal clocks */
+    register_unary("date",       RAY_FN_NONE, ray_date_clock_fn);
+    register_unary("time",       RAY_FN_NONE, ray_time_clock_fn);
+    register_unary("timestamp",  RAY_FN_NONE, ray_timestamp_clock_fn);
+
+    /* Temporal field accessors: unary builtins that map 1:1 onto
+     * ray_temporal_extract.  Registered here so `(ss ts)` / `(dd d)`
+     * participate in the normal call machinery and `ts.ss` / `d.dd`
+     * resolve through env_resolve's "is segment a callable" lookup
+     * instead of a bespoke sym→field table. */
+    register_unary("ss",         RAY_FN_NONE, ray_extract_ss_fn);
+    register_unary("hh",         RAY_FN_NONE, ray_extract_hh_fn);
+    register_unary("minute",     RAY_FN_NONE, ray_extract_minute_fn);
+    register_unary("yyyy",       RAY_FN_NONE, ray_extract_yyyy_fn);
+    register_unary("mm",         RAY_FN_NONE, ray_extract_mm_fn);
+    register_unary("dd",         RAY_FN_NONE, ray_extract_dd_fn);
+    register_unary("dow",        RAY_FN_NONE, ray_extract_dow_fn);
+    register_unary("doy",        RAY_FN_NONE, ray_extract_doy_fn);
+
+    /* Eval, parse, print, meta */
+    register_unary("eval",       RAY_FN_NONE, ray_eval_builtin_fn);
+    register_unary("parse",      RAY_FN_NONE, ray_parse_builtin_fn);
+    register_vary("print",       RAY_FN_NONE, ray_print_fn);
+    register_unary("meta",       RAY_FN_NONE, ray_meta_fn);
+
+    /* System builtins — bound under the reserved `.sys.*` namespace so
+     * user code can't shadow them and a glance at the name identifies
+     * the category. */
+    register_vary (".sys.gc",   RAY_FN_NONE,        ray_gc_fn);
+    register_unary(".sys.exec", RAY_FN_RESTRICTED,  ray_system_fn);
+    /* Registry-dispatched system commands.  `.sys.cmd "name args"` is
+     * the kdb-style entry point; the per-command direct builtins below
+     * skip the string parse for callers that already have a typed arg
+     * in hand.  All share the table in lang/syscmd.c. */
+    register_unary(".sys.cmd",    RAY_FN_RESTRICTED, ray_syscmd_string_dispatch_fn);
+    register_vary (".sys.timeit", RAY_FN_NONE,       ray_sys_timeit_fn);
+    register_unary(".sys.listen", RAY_FN_RESTRICTED, ray_sys_listen_fn);
+    register_vary (".sys.env",    RAY_FN_NONE,       ray_sys_env_fn);
+
+    /* OS env / process interaction under `.os.*` */
+    register_unary( ".os.getenv", RAY_FN_RESTRICTED,  ray_getenv_fn);
+    register_binary(".os.setenv", RAY_FN_RESTRICTED,  ray_setenv_fn);
+    /* Filesystem metadata (issue #36): size + listing.  Predicates
+     * (exists / is-file / is-dir) are reachable via `try` on these
+     * or via shell fallback through `.sys.cmd`. */
+    register_unary( ".os.size",   RAY_FN_NONE,        ray_os_size_fn);
+    register_unary( ".os.list",   RAY_FN_NONE,        ray_os_list_fn);
+
+    /* IPC client primitives under `.ipc.*` */
+    register_unary( ".ipc.open",  RAY_FN_RESTRICTED,  ray_hopen_fn);
+    register_unary( ".ipc.close", RAY_FN_RESTRICTED,  ray_hclose_fn);
+    register_binary(".ipc.send",  RAY_FN_RESTRICTED,  ray_hsend_fn);
+
+    /* Transaction-log journaling under `.log.*` — q/kdb's -l/-L feature.
+     * The CLI flags -l <base> / -L <base> call ray_journal_open() at
+     * startup; these builtins expose the same machinery to Rayfall code
+     * for manual control (open from a script, snapshot on demand, etc). */
+    register_vary(".log.open",     RAY_FN_RESTRICTED, ray_log_open_fn);
+    register_unary(".log.write",   RAY_FN_NONE,       ray_log_write_fn);
+    register_unary(".log.replay",  RAY_FN_RESTRICTED, ray_log_replay_fn);
+    register_unary(".log.validate",RAY_FN_NONE,       ray_log_validate_fn);
+    register_vary(".log.roll",     RAY_FN_RESTRICTED, ray_log_roll_fn);
+    register_vary(".log.snapshot", RAY_FN_RESTRICTED, ray_log_snapshot_fn);
+    register_vary(".log.sync",     RAY_FN_NONE,       ray_log_sync_fn);
+    register_vary(".log.close",    RAY_FN_RESTRICTED, ray_log_close_fn);
+
+    /* quote — special form (unevaluated argument) */
+    register_vary("quote",       RAY_FN_SPECIAL_FORM, ray_quote_fn);
+
+    /* return — early return (identity) */
+    register_unary("return",     RAY_FN_NONE, ray_return_fn);
+
+    /* args — command line arguments */
+    register_unary("args",       RAY_FN_NONE, ray_args_fn);
+
+    /* rc — reference count */
+    register_unary("rc",         RAY_FN_NONE, ray_rc_fn);
+
+    /* diverse — check if all elements unique */
+    register_unary("diverse",    RAY_FN_NONE, ray_diverse_fn);
+
+    /* get — dictionary/table lookup (alias for at).
+     * NOTE(review): bound to ray_get_fn, not ray_at_fn — confirm the
+     * two really behave the same before relying on "alias". */
+    register_binary("get",       RAY_FN_NONE, ray_get_fn);
+
+    /* remove — remove key from dict */
+    register_binary("remove",    RAY_FN_NONE, ray_remove_fn);
+
+    /* row — single row from table */
+    register_binary("row",       RAY_FN_NONE, ray_row_fn);
+
+    /* timer — high-res monotonic nanosecond timestamp */
+    register_unary("timer",      RAY_FN_NONE, ray_timer_fn);
+
+    /* env — list all global environment bindings */
+    register_unary("env",        RAY_FN_NONE, ray_env_fn);
+
+    /* Directional fold/scan variants */
+    register_vary("fold-left",   RAY_FN_NONE, ray_fold_left_fn);
+    register_vary("fold-right",  RAY_FN_NONE, ray_fold_right_fn);
+    register_vary("scan-left",   RAY_FN_NONE, ray_scan_left_fn);
+    register_vary("scan-right",  RAY_FN_NONE, ray_scan_right_fn);
+
+    /* del, modify, pivot remain top-level language primitives.
+     * Runtime/heap introspection moves under `.sys.*`. */
+    register_vary("del",          RAY_FN_SPECIAL_FORM | RAY_FN_RESTRICTED, ray_del_fn);
+    register_vary(".sys.build", RAY_FN_NONE, ray_internals_fn);
+    register_vary(".sys.mem",   RAY_FN_NONE, ray_memstat_fn);
+    register_vary("modify",     RAY_FN_RESTRICTED, ray_modify_fn);
+    register_vary("pivot",      RAY_FN_NONE, ray_pivot_fn);
+    register_vary(".sys.info",  RAY_FN_NONE, ray_sysinfo_fn);
+    register_unary("sym-name",   RAY_FN_NONE, ray_sym_name_fn);
+    register_binary("unify",     RAY_FN_NONE, ray_unify_fn);
+    register_binary("xrank",     RAY_FN_NONE, ray_xrank_fn);
+
+    /* EAV triple storage */
+    register_vary("datoms",        RAY_FN_NONE, ray_datoms_fn);
+    register_vary("assert-fact",   RAY_FN_NONE, ray_assert_fact_fn);
+    register_vary("retract-fact",  RAY_FN_NONE, ray_retract_fact_fn);
+    register_vary("scan-eav",      RAY_FN_NONE, ray_scan_eav_fn);
+    register_vary("pull",          RAY_FN_NONE, ray_pull_fn);
+
+    /* Datalog */
+    register_vary("rule",         RAY_FN_SPECIAL_FORM, ray_rule_fn);
+    register_vary("query",        RAY_FN_SPECIAL_FORM, ray_query_fn);
+
+    /* Programmatic Datalog API */
+    register_vary("dl-program",    RAY_FN_NONE, ray_dl_program_fn);
+    register_vary("dl-add-edb",    RAY_FN_NONE, ray_dl_add_edb_fn);
+    register_unary("dl-stratify",  RAY_FN_NONE, ray_dl_stratify_fn);
+    register_unary("dl-eval",      RAY_FN_NONE, ray_dl_eval_fn);
+    register_binary("dl-query",    RAY_FN_NONE, ray_dl_query_fn);
+    register_binary("dl-provenance", RAY_FN_NONE, ray_dl_provenance_fn);
+
+    /* Vector similarity / embeddings / HNSW */
+    register_binary("cos-dist",    RAY_FN_NONE, ray_cos_dist_fn);
+    register_binary("inner-prod",  RAY_FN_NONE, ray_inner_prod_fn);
+    register_binary("l2-dist",     RAY_FN_NONE, ray_l2_dist_fn);
+    register_unary ("norm",        RAY_FN_NONE, ray_norm_fn);
+    register_vary  ("knn",         RAY_FN_NONE, ray_knn_fn);
+    register_vary  ("hnsw-build",  RAY_FN_NONE, ray_hnsw_build_fn);
+    register_vary  ("ann",         RAY_FN_NONE, ray_ann_fn);
+    register_unary ("hnsw-free",   RAY_FN_NONE, ray_hnsw_free_fn);
+    register_binary("hnsw-save",   RAY_FN_RESTRICTED, ray_hnsw_save_fn);
+    register_unary ("hnsw-load",   RAY_FN_RESTRICTED, ray_hnsw_load_fn);
+    register_unary ("hnsw-info",   RAY_FN_NONE, ray_hnsw_info_fn);
+
+    /* Per-vector accelerator indices (see src/ops/idxop.h) */
+    register_unary (".idx.zone",   RAY_FN_NONE, ray_idx_zone_fn);
+    register_unary (".idx.hash",   RAY_FN_NONE, ray_idx_hash_fn);
+    register_unary (".idx.sort",   RAY_FN_NONE, ray_idx_sort_fn);
+    register_unary (".idx.bloom",  RAY_FN_NONE, ray_idx_bloom_fn);
+    register_unary (".idx.drop",   RAY_FN_NONE, ray_idx_drop_fn);
+    register_unary (".idx.has?",   RAY_FN_NONE, ray_idx_has_fn);
+    register_unary (".idx.info",   RAY_FN_NONE, ray_idx_info_fn);
+
+    /* Linked columns (see src/ops/linkop.h) */
+    register_binary(".col.link",   RAY_FN_NONE, ray_col_link_fn);
+    register_unary (".col.unlink", RAY_FN_NONE, ray_col_unlink_fn);
+    register_unary (".col.link?",  RAY_FN_NONE, ray_col_link_p_fn);
+    register_unary (".col.target", RAY_FN_NONE, ray_col_target_fn);
+}
+
+/* ══════════════════════════════════════════
+ * Runtime lifecycle
+ * ══════════════════════════════════════════ */
+
+/* Bring up the language runtime: initialise the global environment
+ * and, only if that succeeded, register all builtins.  Returns the
+ * status reported by ray_env_init (RAY_OK on success). */
+ray_err_t ray_lang_init(void) {
+    ray_err_t status = ray_env_init();
+    if (status == RAY_OK)
+        ray_register_builtins();
+    return status;
+}
+
+/* Tear down the language runtime: drop any value left over from a
+ * `raise`, then reset the Datalog rules, the global environment and
+ * the compiler state, in that order. */
+void ray_lang_destroy(void) {
+    /* Release a raise value that was never consumed. */
+    if (__raise_val != NULL) {
+        ray_release(__raise_val);
+        __raise_val = NULL;
+    }
+
+    ray_dl_reset_rules();   /* global Datalog rule storage */
+    ray_env_destroy();
+    ray_compile_reset();
+}
+
+/* ══════════════════════════════════════════
+ * Tree-walking evaluator
+ * ══════════════════════════════════════════ */
+
+/* Evaluate one AST node and return the result; NULL stands for the
+ * `null` value and an error object reports failure.
+ *
+ *   - NULL or error input is returned unchanged.
+ *   - A SYM atom flagged RAY_ATTR_NAME is resolved through the env
+ *     (the name `null` evaluates to NULL); every other atom and every
+ *     non-list object evaluates to itself (retained).
+ *   - A non-empty list is a call: element 0 is evaluated to obtain
+ *     the callee, the remaining elements are its arguments.  Binary
+ *     and variadic callees flagged RAY_FN_SPECIAL_FORM receive their
+ *     arguments unevaluated; otherwise arguments are evaluated first.
+ *
+ * A head that evaluates to NULL is reported as a "type" error, the
+ * same as any other non-callable head.  Recursion is limited to
+ * RAY_EVAL_MAX_DEPTH and stops when g_eval_interrupted is set. */
+ray_t* ray_eval(ray_t* obj) {
+    if (!obj || RAY_IS_ERR(obj)) return obj;
+
+    /* Check for external interrupt (e.g. Ctrl-C from REPL) */
+    if (g_eval_interrupted) return ray_error("limit", "interrupted");
+
+    if (++eval_depth > RAY_EVAL_MAX_DEPTH) {
+        eval_depth--;
+        return ray_error("limit", "eval depth exceeded");
+    }
+
+    ray_t* ret;
+
+    /* Atoms: return themselves (retain) */
+    if (ray_is_atom(obj)) {
+        /* Name reference: resolve from env */
+        if (obj->type == -RAY_SYM && (obj->attrs & RAY_ATTR_NAME)) {
+            /* Check for null keyword — compare by string, not cached sym_id,
+             * because sym table may be reinitialized between test runs */
+            {
+                ray_t* name_str = ray_sym_str(obj->i64);
+                if (name_str && ray_str_len(name_str) == 4 &&
+                    memcmp(ray_str_ptr(name_str), "null", 4) == 0) {
+                    ray_release(name_str);
+                    ret = NULL; goto out;
+                }
+                if (name_str) ray_release(name_str);
+            }
+
+            ray_t* val = ray_env_resolve(obj->i64);
+            if (!val) {
+                ray_t* ns = ray_sym_str(obj->i64);
+                if (ns) {
+                    ret = ray_error("name", "'%.*s' undefined",
+                                    (int)ray_str_len(ns), ray_str_ptr(ns));
+                    ray_release(ns);
+                } else {
+                    ret = ray_error("name", NULL);
+                }
+                goto out;
+            }
+            /* env_resolve may also return a real error (e.g. nyi from a
+             * parted-target link deref inside the dotted walker) — surface
+             * it directly rather than treating it as a found value. */
+            if (RAY_IS_ERR(val)) { ret = val; goto out; }
+            /* env_resolve hands back an owned ref; no extra retain. */
+            ret = val; goto out;
+        }
+        ray_retain(obj);
+        ret = obj; goto out;
+    }
+
+    /* Non-list vectors (incl. RAY_DICT/RAY_TABLE): return themselves —
+     * dict literals are self-evaluating; values stay unevaluated.  Use
+     * the (dict ...) builtin for evaluated construction. */
+    if (obj->type != RAY_LIST) { ray_retain(obj); ret = obj; goto out; }
+
+    /* Empty list */
+    if (ray_len(obj) == 0) { ray_retain(obj); ret = obj; goto out; }
+
+    /* List: evaluate first element, dispatch by type */
+    ray_t** elems = (ray_t**)ray_data(obj);
+    ray_t* head = ray_eval(elems[0]);
+    /* ray_eval yields NULL for `null` (and for a NULL element); NULL is
+     * not callable, so report it instead of dereferencing it below. */
+    if (!head) { ret = ray_error("type", NULL); goto out; }
+    if (RAY_IS_ERR(head)) { ret = head; goto out; }
+
+    int64_t n = ray_len(obj);
+
+    switch (head->type) {
+        case RAY_UNARY: {
+            if (n != 2) { ray_release(head); ret = ray_error("arity", "expected 1 arg, got %d", (int)(n-1)); goto out; }
+            if (fn_is_restricted(head)) { ray_release(head); ret = ray_error("access", "restricted"); goto out; }
+            ray_unary_fn fn = (ray_unary_fn)(uintptr_t)head->i64;
+            uint8_t fn_attrs = head->attrs;
+            ray_t* arg = ray_eval(elems[1]);
+            ray_release(head);
+            if (arg && RAY_IS_ERR(arg)) { ret = arg; goto out; }
+            ray_t* result;
+            if (!arg || RAY_IS_NULL(arg)) {
+                /* Only nil?/type/ser safely handle null */
+                result = (fn == (ray_unary_fn)ray_nil_fn || fn == (ray_unary_fn)ray_type_fn ||
+                          fn == (ray_unary_fn)ray_ser_fn) ? fn(arg) : ray_error("type", NULL);
+            } else if ((fn_attrs & RAY_FN_ATOMIC) && is_collection(arg))
+                result = atomic_map_unary(fn, arg);
+            else
+                result = fn(arg);
+            if (arg) ray_release(arg);
+            ret = result; goto out;
+        }
+        case RAY_BINARY: {
+            if (n != 3) { ray_release(head); ret = ray_error("arity", "expected 2 args, got %d", (int)(n-1)); goto out; }
+            if (fn_is_restricted(head)) { ray_release(head); ret = ray_error("access", "restricted"); goto out; }
+            ray_binary_fn fn = (ray_binary_fn)(uintptr_t)head->i64;
+            uint8_t fn_attrs = head->attrs;
+            if (fn_attrs & RAY_FN_SPECIAL_FORM) {
+                ray_release(head);
+                ret = fn(elems[1], elems[2]); goto out;
+            }
+            ray_t* left = ray_eval(elems[1]);
+            if (left && RAY_IS_ERR(left)) {
+                ray_release(head);
+                ret = left; goto out;
+            }
+            ray_t* right = ray_eval(elems[2]);
+            if (right && RAY_IS_ERR(right)) {
+                ray_release(head); if (left) ray_release(left);
+                ret = right; goto out;
+            }
+            /* If either arg is NULL/void, only == and != can handle it */
+            if (!left || !right || RAY_IS_NULL(left) || RAY_IS_NULL(right)) {
+                ray_t* result = (fn == (ray_binary_fn)ray_eq_fn || fn == (ray_binary_fn)ray_neq_fn)
+                                    ? fn(left, right)
+                                    : ray_error("type", NULL);
+                ray_release(head);
+                /* Either operand may be a bare NULL here; guard the
+                 * releases like every other possibly-NULL release in
+                 * this function. */
+                if (left) ray_release(left);
+                if (right) ray_release(right);
+                ret = result; goto out;
+            }
+            uint16_t fn_opcode = RAY_FN_OPCODE(head);
+            ray_release(head);
+            ray_t* result;
+            if ((fn_attrs & RAY_FN_ATOMIC) && (is_collection(left) || is_collection(right)))
+                result = atomic_map_binary_op(fn, fn_opcode, left, right);
+            else
+                result = fn(left, right);
+            ray_release(left);
+            ray_release(right);
+            ret = result; goto out;
+        }
+        case RAY_VARY: {
+            if (fn_is_restricted(head)) { ray_release(head); ret = ray_error("access", "restricted"); goto out; }
+            ray_vary_fn fn = (ray_vary_fn)(uintptr_t)head->i64;
+            if (head->attrs & RAY_FN_SPECIAL_FORM) {
+                ray_release(head);
+                ret = fn(elems + 1, n - 1); goto out;
+            }
+            int64_t argc = n - 1;
+            /* Arguments are staged in a fixed 64-slot stack array. */
+            if (argc > 64) { ray_release(head); ret = ray_error("domain", NULL); goto out; }
+            ray_t* args[64];
+            for (int64_t i = 0; i < argc; i++) {
+                args[i] = ray_eval(elems[i + 1]);
+                if (!args[i] || RAY_IS_ERR(args[i])) {
+                    ray_t* err = (!args[i]) ? ray_error("type", NULL) : args[i];
+                    for (int64_t j = 0; j < i; j++) ray_release(args[j]);
+                    ray_release(head);
+                    ret = err; goto out;
+                }
+            }
+            ray_release(head);
+            ray_t* result = fn(args, argc);
+            for (int64_t i = 0; i < argc; i++) ray_release(args[i]);
+            ret = result; goto out;
+        }
+        case RAY_LAMBDA: {
+            int64_t argc = n - 1;
+            /* Arguments are staged in a fixed 64-slot stack array. */
+            if (argc > 64) { ray_release(head); ret = ray_error("domain", NULL); goto out; }
+            ray_t* args[64];
+            for (int64_t i = 0; i < argc; i++) {
+                args[i] = ray_eval(elems[i + 1]);
+                if (!args[i] || RAY_IS_ERR(args[i])) {
+                    ray_t* err = (!args[i]) ? ray_error("type", NULL) : args[i];
+                    for (int64_t j = 0; j < i; j++) ray_release(args[j]);
+                    ray_release(head);
+                    ret = err; goto out;
+                }
+            }
+            ray_t* result = call_lambda(head, args, argc);
+            for (int64_t i = 0; i < argc; i++) ray_release(args[i]);
+            ray_release(head);
+            /* Record the call site in the error trace. */
+            if (RAY_IS_ERR(result))
+                add_eval_error_frame(g_eval_nfo, obj);
+            ret = result; goto out;
+        }
+        default:
+            /* Head evaluated to something that is not callable. */
+            ray_release(head);
+            ret = ray_error("type", NULL); goto out;
+    }
+
+out:
+    eval_depth--;
+    /* End-of-top-level-expression cleanup hook. Every path that
+     * entered ray_eval — REPL, IPC, ray_eval_str, file mode — exits
+     * through here; firing ray_progress_end exactly when the depth
+     * returns to 0 guarantees the progress bar is cleared no matter
+     * which builtin drove the update (including ray_group_fn etc.
+     * that bypass ray_execute). */
+    if (eval_depth == 0) ray_progress_end();
+    return ret;
+}
+
+/* Parse and evaluate `source` as a single "repl" unit.
+ *
+ * Clears the error trace, builds a source-info object for `source`,
+ * parses with it, and evaluates the parsed tree with g_eval_nfo
+ * temporarily pointing at that source info.  A parse error is
+ * returned as-is; otherwise the result of ray_eval is returned. */
+ray_t* ray_eval_str(const char* source) {
+    ray_clear_error_trace();
+
+    ray_t* src_nfo = ray_nfo_create("repl", 4, source, strlen(source));
+    ray_t* ast = ray_parse_with_nfo(source, src_nfo);
+    ray_t* outcome;
+
+    if (RAY_IS_ERR(ast)) {
+        /* Parse failed: the error object itself is the result. */
+        outcome = ast;
+    } else {
+        /* Swap in this source's info for the duration of the eval,
+         * then put the previous one back. */
+        ray_t* saved_nfo = g_eval_nfo;
+        g_eval_nfo = src_nfo;
+        outcome = ray_eval(ast);
+        g_eval_nfo = saved_nfo;
+        ray_release(ast);
+    }
+
+    ray_release(src_nfo);
+    return outcome;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/eval.h b/crates/rayforce-sys/vendor/rayforce/src/lang/eval.h
new file mode 100644
index 0000000..df86e73
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/lang/eval.h
@@ -0,0 +1,298 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_EVAL_H
+#define RAY_EVAL_H
+
+#include <rayforce.h>
+#include <stdio.h>
+#include "lang/nfo.h"
+
+/* ===== Function Attribute Flags (stored in attrs byte) ===== */
+
+#define RAY_FN_NONE          0x00
+#define RAY_FN_LEFT_ATOMIC   0x01  /* auto-map left arg over vectors */
+#define RAY_FN_RIGHT_ATOMIC  0x02  /* auto-map right arg over vectors */
+#define RAY_FN_ATOMIC        0x04  /* auto-map all args over vectors */
+#define RAY_FN_AGGR          0x08  /* aggregation function */
+#define RAY_FN_SPECIAL_FORM  0x10  /* receives unevaluated args */
+#define RAY_FN_RESTRICTED    0x20  /* forbidden during -U restricted IPC evals */
+
+/* AST name flag (distinguishes symbol literal from variable reference). NOTE(review): same bit value as RAY_FN_RESTRICTED (0x20); presumably safe because this flag is only set on SYM atoms, never on function objects - confirm. */
+#define RAY_ATTR_NAME        0x20  /* ray_t SYM atom with this flag = name reference */
+
+/* Function type signatures */
+typedef ray_t* (*ray_unary_fn)(ray_t*);
+typedef ray_t* (*ray_binary_fn)(ray_t*, ray_t*);
+typedef ray_t* (*ray_vary_fn)(ray_t**, int64_t);
+
+/* DAG opcode stored in nullmap[0..1] for binary builtins with DAG-exec paths.
+ * Eliminates dispatch table lookups in atomic_map_binary — just read the opcode. */
+#define RAY_FN_OPCODE(fn)        (*(uint16_t*)(fn)->nullmap)
+#define RAY_FN_SET_OPCODE(fn,op) (*(uint16_t*)(fn)->nullmap = (uint16_t)(op))
+
+/* ===== VM Bytecode Opcodes ===== */
+
+enum {               /* values are consecutive starting at OP_RET = 0, so declaration order fixes each opcode's number */
+    OP_RET = 0,       /* return top of stack */
+    OP_JMP,           /* unconditional jump (2-byte signed offset) */
+    OP_JMPF,          /* jump if false (2-byte signed offset) */
+    OP_LOADCONST,     /* push constant pool[operand] (1-byte index) */
+    OP_LOADENV,       /* push local variable (1-byte slot index) */
+    OP_STOREENV,      /* pop and store into local (1-byte slot index) */
+    OP_POP,           /* discard top of stack */
+    OP_RESOLVE,       /* resolve global name: constant pool[operand] is sym_id */
+    OP_CALL1,         /* call unary: pop fn + 1 arg, push result */
+    OP_CALL2,         /* call binary: pop fn + 2 args, push result */
+    OP_CALLN,         /* call variadic: operand = argc, pop fn + N args */
+    OP_CALLF,         /* call compiled lambda: push frame, jump to callee */
+    OP_CALLS,         /* tail call: reuse frame */
+    OP_CALLD,         /* dynamic dispatch: fallback to ray_eval() */
+    OP_DUP,           /* duplicate top of stack */
+    OP_LOADCONST_W,   /* push constant pool[operand] (2-byte index) */
+    OP_RESOLVE_W,     /* resolve global name: 2-byte constant pool index */
+    OP_TRAP,          /* push trap frame, 2-byte handler offset */
+    OP_TRAP_END,      /* pop trap frame (success path) */
+    OP__COUNT         /* sentinel: equals the number of opcodes declared above */
+};
+
+/* ===== Compiled Lambda Layout =====
+ *
+ * A RAY_LAMBDA object with attrs & RAY_FN_COMPILED stores compiled
+ * bytecode in its data area:
+ *
+ *   data[0] = ray_t* params_list   (same as interpreted)
+ *   data[1] = ray_t* body          (parsed body, same as interpreted)
+ *   data[2] = ray_t* bytecode      (RAY_U8 vector of opcodes)
+ *   data[3] = ray_t* constants     (RAY_LIST of constant pool entries)
+ *   data[4] = int32_t n_locals    (number of local slots needed)
+ *   data[5] = ray_t* nfo          (source location info, NULL if absent)
+ *   data[6] = ray_t* dbg          (debug metadata, NULL if absent)
+ */
+
+#define RAY_FN_COMPILED  0x40   /* lambda has been compiled to bytecode */
+
+#define LAMBDA_PARAMS(lam)    (((ray_t**)ray_data(lam))[0])
+#define LAMBDA_BODY(lam)      (((ray_t**)ray_data(lam))[1])
+#define LAMBDA_BC(lam)        (((ray_t**)ray_data(lam))[2])
+#define LAMBDA_CONSTS(lam)    (((ray_t**)ray_data(lam))[3])
+#define LAMBDA_NLOCALS(lam)   (*((int32_t*)&((ray_t**)ray_data(lam))[4]))
+#define LAMBDA_NFO(lam)       (((ray_t**)ray_data(lam))[5])
+#define LAMBDA_DBG(lam)       (((ray_t**)ray_data(lam))[6])
+
+#define LAMBDA_IS_COMPILED(lam) ((lam)->attrs & RAY_FN_COMPILED)
+
+/* ===== VM Types ===== */
+
+#define VM_STACK_SIZE 1024
+
+typedef struct {    /* element type of the VM return stack (ray_vm_t.rs) */
+    ray_t   *fn;     /* lambda being executed */
+    int32_t fp;     /* frame pointer */
+    int32_t ip;     /* instruction pointer */
+} vm_ctx_t;
+
+typedef struct {       /* element type of the VM trap-frame array (ray_vm_t.ts): VM state recorded at a trap point */
+    int32_t  rp;        /* return stack depth at trap point */
+    int32_t  sp;        /* stack depth at trap point */
+    int32_t  handler_ip;/* IP of handler code */
+    ray_t    *fn;        /* function containing handler code */
+    int32_t  fp;        /* frame pointer at trap point */
+    int32_t  n_locals;  /* n_locals at trap point */
+} vm_trap_t;
+
+#define VM_TRAP_SIZE 16
+
+typedef struct {                   /* bytecode VM state: int32 stack cursors plus three fixed-capacity arrays */
+    int32_t  sp;                    /* stack pointer */
+    int32_t  fp;                    /* frame pointer */
+    int32_t  rp;                    /* return stack pointer */
+    int32_t  id;                    /* VM identifier */
+    ray_t    *fn;                    /* current lambda */
+    void    *heap;                  /* heap pointer (future use) */
+    int32_t  tp;                    /* trap stack pointer */
+    ray_t    *ps[VM_STACK_SIZE];     /* program stack (VM_STACK_SIZE = 1024 slots) */
+    vm_ctx_t rs[VM_STACK_SIZE];     /* return stack (same capacity as ps) */
+    vm_trap_t ts[VM_TRAP_SIZE];     /* trap frames (VM_TRAP_SIZE = 16) */
+} ray_vm_t;
+
+/* ===== Public API ===== */
+
+/* Initialize the Rayfall runtime: symbols, environment, builtins. */
+ray_err_t ray_lang_init(void);
+void     ray_lang_destroy(void);
+
+/* Evaluate a parsed ray_t object tree. */
+ray_t* ray_eval(ray_t* obj);
+
+/* Parse + eval convenience. */
+ray_t* ray_eval_str(const char* source);
+
+/* Compile a lambda's body to bytecode. Called lazily on first invocation. */
+void ray_compile(ray_t* lambda);
+
+/* Reset compiler cached state (call from ray_lang_destroy). */
+void ray_compile_reset(void);
+
+/* Look up the source span for a bytecode IP from a lambda's debug vector.
+ * Returns a span with id==0 if not found. */
+ray_span_t ray_bc_dbg_get(ray_t* dbg, int32_t ip);
+
+/* Print a ray_t value to a FILE stream. */
+void ray_lang_print(FILE* fp, ray_t* val);
+
+/* Interrupt support: allow external code (REPL signal handler) to request
+ * that the evaluator abort early.  ray_eval() and the bytecode VM check
+ * this flag at function-call and loop boundaries. */
+void ray_eval_request_interrupt(void);
+void ray_eval_clear_interrupt(void);
+int  ray_eval_is_interrupted(void);
+
+/* Return the current eval context's nfo (source location) object, or NULL. */
+ray_t* ray_eval_get_nfo(void);
+void   ray_eval_set_nfo(ray_t* nfo);
+
+/* Error trace: list of [span_i64, filename, fn_name, source] frames built when
+ * a VM error propagates without a trap.  Cleared at the start of ray_eval_str. */
+ray_t* ray_get_error_trace(void);
+void   ray_clear_error_trace(void);
+
+/* Restricted mode: when true, builtins with RAY_FN_RESTRICTED are blocked. */
+void ray_eval_set_restricted(bool on);
+bool ray_eval_get_restricted(void);
+
+/* ===== Rayfall Builtin Functions ===== */
+
+/* Arithmetic */
+ray_t* ray_add_fn(ray_t* a, ray_t* b);
+ray_t* ray_sub_fn(ray_t* a, ray_t* b);
+ray_t* ray_mul_fn(ray_t* a, ray_t* b);
+ray_t* ray_div_fn(ray_t* a, ray_t* b);
+ray_t* ray_mod_fn(ray_t* a, ray_t* b);
+
+/* Comparison */
+ray_t* ray_gt_fn(ray_t* a, ray_t* b);
+ray_t* ray_lt_fn(ray_t* a, ray_t* b);
+ray_t* ray_gte_fn(ray_t* a, ray_t* b);
+ray_t* ray_lte_fn(ray_t* a, ray_t* b);
+ray_t* ray_eq_fn(ray_t* a, ray_t* b);
+ray_t* ray_neq_fn(ray_t* a, ray_t* b);
+
+/* Logic */
+ray_t* ray_and_fn(ray_t* a, ray_t* b);
+ray_t* ray_or_fn(ray_t* a, ray_t* b);
+ray_t* ray_and_vary_fn(ray_t** args, int64_t n);
+ray_t* ray_or_vary_fn(ray_t** args, int64_t n);
+ray_t* ray_not_fn(ray_t* x);
+ray_t* ray_neg_fn(ray_t* x);
+
+/* Aggregation */
+ray_t* ray_sum_fn(ray_t* x);
+ray_t* ray_count_fn(ray_t* x);
+ray_t* ray_avg_fn(ray_t* x);
+ray_t* ray_min_fn(ray_t* x);
+ray_t* ray_max_fn(ray_t* x);
+ray_t* ray_first_fn(ray_t* x);
+ray_t* ray_last_fn(ray_t* x);
+ray_t* ray_med_fn(ray_t* x);
+ray_t* ray_dev_fn(ray_t* x);
+ray_t* ray_stddev_fn(ray_t* x);
+ray_t* ray_stddev_pop_fn(ray_t* x);
+ray_t* ray_var_fn(ray_t* x);
+ray_t* ray_var_pop_fn(ray_t* x);
+
+/* Higher-order */
+ray_t* ray_map_fn(ray_t** args, int64_t n);
+ray_t* ray_pmap_fn(ray_t** args, int64_t n);
+ray_t* ray_fold_fn(ray_t** args, int64_t n);
+ray_t* ray_scan_fn(ray_t** args, int64_t n);
+ray_t* ray_filter_fn(ray_t* vec, ray_t* mask);
+ray_t* ray_apply_fn(ray_t** args, int64_t n);
+
+/* Collection */
+ray_t* ray_distinct_fn(ray_t* x);
+ray_t* ray_in_fn(ray_t* val, ray_t* vec);
+ray_t* ray_except_fn(ray_t* vec1, ray_t* vec2);
+ray_t* ray_union_fn(ray_t* vec1, ray_t* vec2);
+ray_t* ray_sect_fn(ray_t* vec1, ray_t* vec2);
+ray_t* ray_take_fn(ray_t* vec, ray_t* n_obj);
+ray_t* ray_at_fn(ray_t* vec, ray_t* idx);
+ray_t* ray_find_fn(ray_t* vec, ray_t* val);
+ray_t* ray_til_fn(ray_t* x);
+ray_t* ray_reverse_fn(ray_t* x);
+
+/* Table construction */
+ray_t* ray_list_fn(ray_t** args, int64_t n);
+ray_t* ray_table_fn(ray_t* names, ray_t* cols);
+ray_t* ray_key_fn(ray_t* x);
+ray_t* ray_value_fn(ray_t* x);
+
+/* Query */
+ray_t* ray_select_fn(ray_t** args, int64_t n);
+ray_t* ray_update_fn(ray_t** args, int64_t n);
+ray_t* ray_insert_fn(ray_t** args, int64_t n);
+ray_t* ray_upsert_fn(ray_t** args, int64_t n);
+ray_t* ray_xbar_fn(ray_t* col, ray_t* bucket);
+
+/* Joins */
+ray_t* ray_left_join_fn(ray_t** args, int64_t n);
+ray_t* ray_inner_join_fn(ray_t** args, int64_t n);
+ray_t* ray_window_join_fn(ray_t** args, int64_t n);
+
+/* I/O */
+ray_t* ray_println_fn(ray_t** args, int64_t n);
+ray_t* ray_read_csv_fn(ray_t** args, int64_t n);
+ray_t* ray_write_csv_fn(ray_t** args, int64_t n);
+ray_t* ray_read_file_fn(ray_t* path_obj);
+ray_t* ray_write_file_fn(ray_t* path_obj, ray_t* content);
+
+/* Vector similarity / embeddings / HNSW.
+ * cos-dist and l2-dist return distance (lower = closer); inner-prod is
+ * the raw mathematical dot product. */
+ray_t* ray_cos_dist_fn(ray_t* a, ray_t* b);
+ray_t* ray_inner_prod_fn(ray_t* a, ray_t* b);
+ray_t* ray_l2_dist_fn(ray_t* a, ray_t* b);
+ray_t* ray_norm_fn(ray_t* x);
+ray_t* ray_knn_fn(ray_t** args, int64_t n);
+ray_t* ray_hnsw_build_fn(ray_t** args, int64_t n);
+ray_t* ray_ann_fn(ray_t** args, int64_t n);
+ray_t* ray_hnsw_free_fn(ray_t* h);
+ray_t* ray_hnsw_save_fn(ray_t* h, ray_t* path);
+ray_t* ray_hnsw_load_fn(ray_t* path);
+ray_t* ray_hnsw_info_fn(ray_t* h);
+
+/* Cast and type */
+ray_t* ray_cast_fn(ray_t* type_sym, ray_t* val);
+ray_t* ray_type_fn(ray_t* val);
+
+/* Special forms */
+ray_t* ray_set_fn(ray_t* name_obj, ray_t* val_expr);
+ray_t* ray_let_fn(ray_t* name_obj, ray_t* val_expr);
+ray_t* ray_cond_fn(ray_t** args, int64_t n);
+ray_t* ray_do_fn(ray_t** args, int64_t n);
+ray_t* ray_fn(ray_t** args, int64_t n);
+ray_t* ray_raise_fn(ray_t* val);
+ray_t* ray_try_fn(ray_t* expr, ray_t* handler_expr);
+
+
+#endif /* RAY_EVAL_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/format.c b/crates/rayforce-sys/vendor/rayforce/src/lang/format.c
new file mode 100644
index 0000000..dc88fe8
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/lang/format.c
@@ -0,0 +1,1074 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "lang/format.h"
+#include "lang/env.h"
+#include "table/sym.h"
+#include "lang/eval.h"
+#include "ops/ops.h"    /* RAY_LAZY, ray_lazy_materialize */
+#include "mem/heap.h"
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <inttypes.h>
+#include <limits.h>
+
+/* ===== Internal growable buffer ===== */
+
+typedef struct {
+    char*   buf;    /* byte area of the backing block, i.e. ray_data(block) */
+    int32_t len;    /* bytes written so far */
+    int32_t cap;    /* bytes available in buf before it must grow */
+    ray_t*  block;  /* ray_alloc'd backing block */
+} fmt_buf_t;
+
+static void fmt_init(fmt_buf_t* b) {
+    ray_t* backing = ray_alloc(256);             /* start with a 256-byte block */
+    b->cap   = 256;
+    b->len   = 0;                                /* empty: nothing written yet */
+    b->block = backing;
+    b->buf   = (char*)ray_data(backing);
+}
+
+static void fmt_destroy(fmt_buf_t* b) {
+    /* Safe to call twice: a buffer with no backing block is left alone. */
+    if (!b->block) return;
+    ray_free(b->block);
+    b->cap   = 0;
+    b->len   = 0;
+    b->buf   = NULL;
+    b->block = NULL;
+}
+
+static void fmt_ensure(fmt_buf_t* b, int32_t extra) {
+    const int32_t needed = b->len + extra;        /* total bytes the caller wants to hold */
+    if (needed <= b->cap) return;                 /* already fits: no reallocation */
+    int32_t grown = b->cap;                       /* NOTE(review): doubling from cap == 0 never terminates; relies on fmt_init having run */
+    do { grown *= 2; } while (grown < needed);    /* at least one doubling is always required here */
+    ray_t* fresh = ray_alloc((size_t)grown);
+    char*  dst   = (char*)ray_data(fresh);
+    memcpy(dst, b->buf, (size_t)b->len);          /* carry over what was written so far */
+    ray_free(b->block);
+    b->cap   = grown;
+    b->buf   = dst;
+    b->block = fresh;
+}
+
+static void fmt_putc(fmt_buf_t* b, char c) {
+    fmt_ensure(b, 1);                 /* guarantee room for one more byte */
+    *(b->buf + b->len++) = c;         /* store at the end, then advance len */
+}
+
+static void fmt_puts(fmt_buf_t* b, const char* s) {
+    const int32_t nbytes = (int32_t)strlen(s);     /* the NUL terminator is not appended */
+    fmt_ensure(b, nbytes);                         /* may move b->buf, so index it afterwards */
+    memcpy(&b->buf[b->len], s, (size_t)nbytes);
+    b->len += nbytes;
+}
+
+static void fmt_printf(fmt_buf_t* b, const char* fmt, ...) {
+    /* printf-style append.  vsnprintf's NUL terminator lands in the buffer
+     * but is not counted in len, so the next write overwrites it. */
+    va_list args;
+    int32_t room = b->cap - b->len;
+
+    /* First pass: format straight into whatever space is left. */
+    va_start(args, fmt);
+    int written = vsnprintf(b->buf + b->len, (size_t)room, fmt, args);
+    va_end(args);
+
+    if (written < 0) return; /* encoding error */
+
+    if (written >= room) {
+        /* Output was truncated: grow to fit it plus the NUL, then redo. */
+        fmt_ensure(b, written + 1);
+        va_start(args, fmt);
+        vsnprintf(b->buf + b->len, (size_t)(b->cap - b->len), fmt, args);
+        va_end(args);
+    }
+
+    b->len += written;
+}
+
+static void fmt_putn(fmt_buf_t* b, const char* s, int32_t n) {
+    if (!s || n <= 0) return;   /* append n raw bytes; skip empty/invalid spans (callers narrow size_t to int32_t, and memcpy from NULL or with a negative count is undefined) */
+    fmt_ensure(b, n);
+    memcpy(b->buf + b->len, s, (size_t)n); b->len += n;
+}
+
+static ray_t* fmt_to_str(fmt_buf_t* b) {
+    ray_t* str_obj = ray_str(b->buf, (size_t)b->len);   /* build the result while buf is still valid */
+    fmt_destroy(b);                                     /* b must be re-initialised before any reuse */
+    return str_obj;
+}
+
+/* ===== Static globals ===== */
+
+static int g_precision = FMT_DEFAULT_PRECISION;
+static int g_row_width = FMT_DEFAULT_ROW_WIDTH;
+
+/* ===== Public API ===== */
+
+void ray_fmt_set_precision(int digits) {
+    if (digits < 0 || digits > 20) return;   /* values outside 0..20 are ignored */
+    g_precision = digits;
+}
+
+void ray_fmt_set_width(int cols) {
+    if (cols <= 0) return;   /* non-positive widths are ignored */
+    g_row_width = cols;
+}
+
+/* Map a type tag to its display name: negative tag (atom) -> lowercase, positive (vector/collection) -> uppercase; "?" if unknown. */
+const char* ray_type_name(int8_t type) {
+    const int is_atom = type < 0;
+    switch (is_atom ? -type : type) {
+    case RAY_BOOL:      return is_atom ? "b8"        : "B8";
+    case RAY_U8:        return is_atom ? "u8"        : "U8";
+    case RAY_I16:       return is_atom ? "i16"       : "I16";
+    case RAY_I32:       return is_atom ? "i32"       : "I32";
+    case RAY_I64:       return is_atom ? "i64"       : "I64";
+    case RAY_F32:       return is_atom ? "f32"       : "F32";
+    case RAY_F64:       return is_atom ? "f64"       : "F64";
+    case RAY_DATE:      return is_atom ? "date"      : "DATE";
+    case RAY_TIME:      return is_atom ? "time"      : "TIME";
+    case RAY_TIMESTAMP: return is_atom ? "timestamp" : "TIMESTAMP";
+    case RAY_SYM:       return is_atom ? "sym"       : "SYM";
+    case RAY_STR:       return is_atom ? "str"       : "STR";
+    case RAY_GUID:      return is_atom ? "guid"      : "GUID";
+    case RAY_TABLE:     return "TABLE";
+    case RAY_DICT:      return "DICT";
+    case RAY_LIST:      return "LIST";
+    case RAY_INDEX:     return "INDEX";
+    default:            return "?";
+    }
+}
+
+/* ===== Atom formatters ===== */
+
+static void fmt_bool(fmt_buf_t* b, uint8_t val) {   /* any non-zero byte prints as "true" */
+    fmt_puts(b, val ? "true" : "false");
+}
+
+static void fmt_u8(fmt_buf_t* b, uint8_t val) {     /* bytes print as 0x-prefixed, two-digit lowercase hex */
+    fmt_printf(b, "0x%02x", val);
+}
+
+
+static void fmt_i16(fmt_buf_t* b, int16_t val) {    /* signed decimal, no padding */
+    fmt_printf(b, "%d", (int)val);
+}
+
+static void fmt_i32(fmt_buf_t* b, int32_t val) {    /* signed decimal, no padding */
+    fmt_printf(b, "%d", (int)val);
+}
+
+static void fmt_i64(fmt_buf_t* b, int64_t val) {    /* signed decimal; PRId64 selects the right length modifier for int64_t */
+    fmt_printf(b, "%" PRId64, val);
+}
+
+static void fmt_f64(fmt_buf_t* b, double val) {
+    /* Render val with g_precision fractional digits.
+     *   - exponential form when log10(|val|) is above 6 or below -1,
+     *     fixed form otherwise (zero always uses fixed form);
+     *   - "?" is emitted if snprintf fails or the text does not fit.
+     * Zero deliberately shares the bounds-checked 64-byte path below:
+     * a dedicated 16-byte scratch buffer cannot hold "0." plus the
+     * up-to-20 digits ray_fmt_set_precision() accepts, and a truncated
+     * snprintf result would then be read and copied past its end. */
+    if (val == -0.0 && signbit(val)) val = 0.0; /* normalize -0.0 */
+    double absval = val < 0 ? -val : val;
+    /* log10(0) is -inf and would select %e; pin zero to order 0. */
+    double order = (val == 0.0) ? 0.0 : log10(absval);
+
+    /* Format with requested precision */
+    char tmp[64];
+    int n;
+    if (order > 6 || order < -1)
+        n = snprintf(tmp, sizeof(tmp), "%.*e", g_precision, val);
+    else
+        n = snprintf(tmp, sizeof(tmp), "%.*f", g_precision, val);
+
+    if (n <= 0 || n >= (int)sizeof(tmp)) {
+        fmt_puts(b, "?");
+        return;
+    }
+
+    /* Strip trailing zeros after decimal point, keeping at least one
+     * digit after '.'.  Do NOT touch exponential notation. */
+    char* dot = strchr(tmp, '.');
+    char* e   = strchr(tmp, 'e');
+    if (dot && !e) {
+        char* end = tmp + n - 1;
+        while (end > dot + 1 && *end == '0')
+            end--;
+        n = (int)(end - tmp + 1);
+    }
+
+    fmt_putn(b, tmp, (int32_t)n);
+}
+
+static void fmt_f32(fmt_buf_t* b, float val) {      /* widened to double, so fmt_f64's notation and precision rules apply */
+    fmt_f64(b, (double)val);
+}
+
+static void fmt_guid(fmt_buf_t* b, const uint8_t* bytes) {
+    /* Format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+     * 16 bytes as lowercase hex pairs, with a dash written before the
+     * bytes at offsets 4, 6, 8 and 10. */
+    static const char digits[] = "0123456789abcdef";
+    for (int i = 0; i < 16; i++) {
+        if (i == 4 || i == 6 || i == 8 || i == 10)
+            fmt_putc(b, '-');
+        const uint8_t hi = bytes[i] >> 4;
+        const uint8_t lo = bytes[i] & 0x0F;
+        fmt_putc(b, digits[hi]);
+        fmt_putc(b, digits[lo]);
+    }
+}
+
+static void fmt_sym(fmt_buf_t* b, int64_t sym_id) {
+    ray_t* name = ray_sym_str(sym_id);
+    if (!name || RAY_IS_ERR(name)) {
+        fmt_puts(b, "0Ns");              /* id did not resolve: show the null-symbol literal */
+        return;
+    }
+    const char* text   = ray_str_ptr(name);
+    size_t      nbytes = ray_str_len(name);
+    fmt_putn(b, text, (int32_t)nbytes);
+    ray_release(name);                   /* drop the reference obtained from ray_sym_str */
+}
+
+/* ===== Date/time/timestamp helpers ===== */
+
+#include "lang/cal.h"
+
+static void time_to_hms(int32_t ms, int* h, int* min, int* s, int* ms_out) {
+    /* Split |ms| (milliseconds) into hours / minutes / seconds / millis.
+     * Only magnitudes are produced; the caller prints the sign.  The value
+     * is widened to 64 bits so negating INT32_MIN cannot overflow. */
+    int64_t val  = ms < 0 ? -(int64_t)ms : (int64_t)ms;
+    int64_t secs = val / 1000;
+    *ms_out = (int)(val % 1000);
+    *h      = (int)(secs / 3600);
+    *min    = (int)((secs % 3600) / 60);
+    *s      = (int)(secs % 60);
+}
+
+#define NSECS_IN_DAY ((int64_t)24 * 60 * 60 * 1000000000LL)
+
+static void ts_to_parts(int64_t ns, int* y, int* mo, int* d,
+                         int* h, int* mi, int* s, int* nanos) {
+    /* Floor-divide by one day so a negative timestamp lands on the
+     * previous day with a non-negative time-of-day remainder. */
+    int64_t day_no = ns / NSECS_IN_DAY;
+    int64_t in_day = ns % NSECS_IN_DAY;
+    if (in_day < 0) {
+        in_day += NSECS_IN_DAY;
+        day_no -= 1;
+    }
+
+    date_to_ymd((int32_t)day_no, y, mo, d);
+
+    /* Break the time-of-day into whole seconds and leftover nanoseconds. */
+    int64_t whole_secs = in_day / 1000000000LL;
+    *nanos = (int)(in_day - whole_secs * 1000000000LL);
+    *h  = (int)(whole_secs / 3600);
+    *mi = (int)((whole_secs / 60) % 60);
+    *s  = (int)(whole_secs % 60);
+}
+
+static void fmt_date(fmt_buf_t* b, int32_t val) {
+    int year, month, day;
+    date_to_ymd(val, &year, &month, &day);                 /* calendar split is done by lang/cal.h */
+    fmt_printf(b, "%04d.%02d.%02d", year, month, day);     /* YYYY.MM.DD, zero-padded */
+}
+
+static void fmt_time(fmt_buf_t* b, int32_t val) {
+    int hours, mins, secs, millis;
+    if (val < 0) fmt_putc(b, '-');                      /* time_to_hms yields magnitudes only */
+    time_to_hms(val, &hours, &mins, &secs, &millis);
+    fmt_printf(b, "%02d:%02d:%02d.%03d", hours, mins, secs, millis);
+}
+
+static void fmt_timestamp(fmt_buf_t* b, int64_t val) {
+    int year, month, day, hour, minute, second, nanos;   /* date and time-of-day parts of val */
+    ts_to_parts(val, &year, &month, &day, &hour, &minute, &second, &nanos);
+    fmt_printf(b, "%04d.%02d.%02dD%02d:%02d:%02d.%09d", year, month, day, hour, minute, second, nanos);
+}
+
+static void fmt_str_atom(fmt_buf_t* b, ray_t* obj, int full) {
+    (void)full;                                  /* parameter is intentionally unused */
+    const char* text   = ray_str_ptr(obj);
+    size_t      nbytes = ray_str_len(obj);
+    fmt_putc(b, '"');                            /* bytes are emitted verbatim between quotes; no escaping */
+    fmt_putn(b, text, (int32_t)nbytes);
+    fmt_putc(b, '"');
+}
+
+/* ===== Forward declarations ===== */
+
+static void fmt_obj(fmt_buf_t* b, ray_t* obj, int mode);
+
+/* ===== Null literal display (type → "0Nx" string) ===== */
+
+static const char* null_literal(int8_t type) {   /* matches the un-negated RAY_* tag (the caller in view passes vec->type) */
+    switch (type) {
+    case RAY_BOOL:      return "0Nb";
+    case RAY_U8:        return "0Nu";
+    case RAY_I16:       return "0Nh";
+    case RAY_I32:       return "0Ni";
+    case RAY_I64:       return "0Nl";
+    case RAY_F64:       return "0Nf";
+    case RAY_F32:       return "0Ne";
+    case RAY_DATE:      return "0Nd";
+    case RAY_TIME:      return "0Nt";
+    case RAY_TIMESTAMP: return "0Np";
+    case RAY_SYM:       return "0Ns";
+    case RAY_STR:       return "0Nc";
+    case RAY_GUID:      return "0Ng";
+    default:            return "null";   /* any type without a typed literal */
+    }
+}
+
+/* ===== Vector element formatter ===== */
+
+static void fmt_raw_elem(fmt_buf_t* b, ray_t* vec, int64_t idx) {
+    /* Append element idx of vec.  Null slots print as the typed null literal. */
+    if (ray_vec_is_null(vec, idx)) {
+        fmt_puts(b, null_literal(vec->type));
+        return;
+    }
+    /* Fixed-width types: read the slot through a typed view of ray_data(). */
+    switch (vec->type) {
+    case RAY_BOOL:      fmt_bool(b, ((bool*)ray_data(vec))[idx]); break;
+    case RAY_U8:        fmt_u8(b, ((uint8_t*)ray_data(vec))[idx]); break;
+
+    case RAY_I16:       fmt_i16(b, ((int16_t*)ray_data(vec))[idx]); break;
+    case RAY_I32:       fmt_i32(b, ((int32_t*)ray_data(vec))[idx]); break;
+    case RAY_I64:       fmt_i64(b, ((int64_t*)ray_data(vec))[idx]); break;
+    case RAY_F32:       fmt_f32(b, ((float*)ray_data(vec))[idx]); break;
+    case RAY_F64:       fmt_f64(b, ((double*)ray_data(vec))[idx]); break;
+    case RAY_DATE:      fmt_date(b, ((int32_t*)ray_data(vec))[idx]); break;
+    case RAY_TIME:      fmt_time(b, ((int32_t*)ray_data(vec))[idx]); break;
+    case RAY_TIMESTAMP: fmt_timestamp(b, ((int64_t*)ray_data(vec))[idx]); break;
+    case RAY_SYM: {
+        int64_t sym_id = ray_read_sym(ray_data(vec), idx, vec->type, vec->attrs);
+        fmt_sym(b, sym_id);
+        break;
+    }
+    case RAY_STR: {
+        /* A NULL element pointer renders as "" (the same fallback
+         * fmt_dict uses) so the slot never vanishes from the output. */
+        size_t slen = 0;
+        const char* p = ray_str_vec_get(vec, idx, &slen);
+        fmt_putc(b, '"');
+        if (p) fmt_putn(b, p, (int32_t)slen);
+        fmt_putc(b, '"');
+        break;
+    }
+    case RAY_GUID:
+        fmt_guid(b, ((uint8_t*)ray_data(vec)) + idx * 16);
+        break;
+    case RAY_LIST: {
+        ray_t* child = ((ray_t**)ray_data(vec))[idx];
+        if (child) {
+            ray_t* s = ray_fmt(child, 1);
+            if (s && !RAY_IS_ERR(s)) {
+                fmt_putn(b, ray_str_ptr(s), (int32_t)ray_str_len(s));
+                ray_release(s);
+            } else {
+                fmt_puts(b, "?");
+            }
+        } else {
+            fmt_puts(b, "null");
+        }
+        break;
+    }
+    default:
+        fmt_puts(b, "?");
+        break;
+    }
+}
+
+/* ===== Vector formatter ===== */
+
+static void fmt_vector(fmt_buf_t* b, ray_t* vec, int limit) {
+    const int64_t count = ray_len(vec);
+    if (count == 0) { fmt_puts(b, "[]"); return; }
+
+    fmt_puts(b, "[");
+    const int32_t body_start = b->len;   /* width is measured from just after '[' */
+
+    for (int64_t i = 0; i < count; i++) {
+        if (i != 0) fmt_putc(b, ' ');
+
+        const int32_t elem_start = b->len;
+        fmt_raw_elem(b, vec, i);
+
+        /* limit <= 0 disables truncation; otherwise keep going while within budget. */
+        if (limit <= 0 || (b->len - body_start) <= limit) continue;
+        /* Over budget: drop this element (its separator stays) and
+         * close with the truncation marker. */
+        b->len = elem_start;
+        fmt_puts(b, "..]");
+        return;
+    }
+
+    fmt_puts(b, "]");
+}
+
+/* ===== List formatter ===== */
+
+static void fmt_list(fmt_buf_t* b, ray_t* list, int mode) {
+    /* Three output shapes: "()" for an empty list, "[a b c]" when every
+     * element is an atom of one type, otherwise a parenthesised form
+     * whose prefix and length cap depend on mode (see below). */
+    const int64_t count = ray_len(list);
+    if (count == 0) { fmt_puts(b, "()"); return; }
+
+    /* Homogeneity scan: advance while each element is non-NULL, not an
+     * error, and has the same type tag as element 0. */
+    ray_t** elems = (ray_t**)ray_data(list);
+    if (elems && count > 0 && elems[0] && !RAY_IS_ERR(elems[0]) && ray_is_atom(elems[0])) {
+        const int8_t want = elems[0]->type;
+        int64_t same = 1;
+        while (same < count && elems[same] && !RAY_IS_ERR(elems[same]) &&
+               elems[same]->type == want)
+            same++;
+        if (same == count) {
+            fmt_puts(b, "[");
+            fmt_obj(b, elems[0], mode);
+            for (int64_t i = 1; i < count; i++) {
+                fmt_putc(b, ' ');
+                fmt_obj(b, elems[i], mode);
+            }
+            fmt_puts(b, "]");
+            return;
+        }
+    }
+
+    /* mode 0 = compact/round-trippable: "(list ...)" prefix required
+     * mode 1 = REPL display: "(...)" matching rayforce 1 output */
+    fmt_puts(b, mode == 0 ? "(list " : "(");
+
+    /* Only mode 1 caps the element count, at FMT_LIST_MAX_HEIGHT. */
+    const int64_t cap   = (mode == 1) ? FMT_LIST_MAX_HEIGHT : count;
+    const int64_t shown = (cap < count) ? cap : count;
+
+    for (int64_t i = 0; i < shown; i++) {
+        if (i != 0) fmt_putc(b, ' ');
+        fmt_obj(b, ray_list_get(list, i), mode);
+    }
+
+    if (shown < count) fmt_puts(b, " ..");
+    fmt_puts(b, ")");
+}
+
+/* ===== Dict formatter ===== */
+
+static void fmt_dict(fmt_buf_t* b, ray_t* dict, int mode) {
+    ray_t* keys = ray_dict_keys(dict);
+    ray_t* vals = ray_dict_vals(dict);
+    int64_t npairs = keys ? keys->len : 0;
+    if (npairs == 0) { fmt_puts(b, "{}"); return; }
+
+    int64_t max_pairs = (mode == 1) ? FMT_LIST_MAX_HEIGHT : npairs;
+    int64_t show = npairs < max_pairs ? npairs : max_pairs;
+
+    fmt_puts(b, "{");
+    for (int64_t i = 0; i < show; i++) {
+        if (i > 0) fmt_putc(b, ' ');
+        /* Render key: synthesize an atom view from the keys vector.  When
+         * the source slot is flagged null in the keys' bitmap, set the
+         * synthesized atom's nullmap bit 0 so fmt_obj renders the proper
+         * null literal.  Without this, nullable GUID/STR/sym keys render
+         * as their underlying bytes (e.g. the 16-zero-byte GUID), losing
+         * null semantics. */
+        bool k_is_null = (keys->type != RAY_LIST) && ray_vec_is_null(keys, i);
+        ray_t k_atom_storage;
+        ray_t* k_atom = NULL;
+        memset(&k_atom_storage, 0, sizeof(k_atom_storage));
+        bool k_owned = false;   /* true if k_atom is a fresh allocation */
+        if (keys->type == RAY_SYM) {
+            k_atom_storage.type = -RAY_SYM;
+            k_atom_storage.i64  = ray_read_sym(ray_data(keys), i, RAY_SYM, keys->attrs);
+            k_atom = &k_atom_storage;
+        } else if (keys->type == RAY_STR) {
+            size_t slen = 0;
+            const char* sp = ray_str_vec_get(keys, i, &slen);
+            k_atom = ray_str(sp ? sp : "", sp ? slen : 0);
+            k_owned = true;
+        } else if (keys->type == RAY_I64 || keys->type == RAY_TIMESTAMP) {
+            k_atom_storage.type = (int8_t)-keys->type;
+            k_atom_storage.i64  = ((int64_t*)ray_data(keys))[i];
+            k_atom = &k_atom_storage;
+        } else if (keys->type == RAY_I32 || keys->type == RAY_DATE || keys->type == RAY_TIME) {
+            k_atom_storage.type = (int8_t)-keys->type;
+            k_atom_storage.i32  = ((int32_t*)ray_data(keys))[i];
+            k_atom = &k_atom_storage;
+        } else if (keys->type == RAY_I16) {
+            k_atom_storage.type = -RAY_I16;
+            k_atom_storage.i16  = ((int16_t*)ray_data(keys))[i];
+            k_atom = &k_atom_storage;
+        } else if (keys->type == RAY_BOOL || keys->type == RAY_U8) {
+            k_atom_storage.type = (int8_t)-keys->type;
+            k_atom_storage.u8   = ((uint8_t*)ray_data(keys))[i];
+            k_atom = &k_atom_storage;
+        } else if (keys->type == RAY_F64) {
+            k_atom_storage.type = -RAY_F64;
+            k_atom_storage.f64  = ((double*)ray_data(keys))[i];
+            k_atom = &k_atom_storage;
+        } else if (keys->type == RAY_F32) {
+            k_atom_storage.type = -RAY_F32;
+            k_atom_storage.f64  = (double)((float*)ray_data(keys))[i];
+            k_atom = &k_atom_storage;
+        } else if (keys->type == RAY_GUID) {
+            /* GUID atoms keep their 16-byte payload in a heap-allocated
+             * child block; the stack-local view trick from the other
+             * branches doesn't carry the bytes (fmt_obj would deref a
+             * bogus inline data[] pointer).  Build a real atom. */
+            k_atom = ray_guid(((const uint8_t*)ray_data(keys)) + i * 16);
+            k_owned = (k_atom && !RAY_IS_ERR(k_atom));
+        } else if (keys->type == RAY_LIST) {
+            /* Borrowed — do NOT release. */
+            k_atom = ((ray_t**)ray_data(keys))[i];
+        }
+        if (k_is_null && k_atom) k_atom->nullmap[0] |= 1;
+        if (k_atom) fmt_obj(b, k_atom, mode);
+        fmt_putc(b, ':');
+
+        /* Render value: borrow from vals (LIST) or synthesize a typed atom
+         * directly from index i (do NOT route through k_atom — for STR keys
+         * k_atom is a fresh allocation we'll release just below).  */
+        if (vals && vals->type == RAY_LIST) {
+            ray_t* v = ray_list_get(vals, i);
+            fmt_obj(b, v, mode);
+        } else if (vals && i < vals->len) {
+            bool v_is_null = ray_vec_is_null(vals, i);
+            ray_t v_storage; memset(&v_storage, 0, sizeof(v_storage));
+            ray_t* v_atom = NULL;
+            bool   v_owned = false;
+            switch (vals->type) {
+                case RAY_BOOL:
+                case RAY_U8:        v_storage.type = (int8_t)-vals->type;
+                                    v_storage.u8   = ((uint8_t*)ray_data(vals))[i];
+                                    v_atom = &v_storage; break;
+                case RAY_I16:       v_storage.type = -RAY_I16;
+                                    v_storage.i16  = ((int16_t*)ray_data(vals))[i];
+                                    v_atom = &v_storage; break;
+                case RAY_I32:
+                case RAY_DATE:
+                case RAY_TIME:      v_storage.type = (int8_t)-vals->type;
+                                    v_storage.i32  = ((int32_t*)ray_data(vals))[i];
+                                    v_atom = &v_storage; break;
+                case RAY_I64:
+                case RAY_TIMESTAMP: v_storage.type = (int8_t)-vals->type;
+                                    v_storage.i64  = ((int64_t*)ray_data(vals))[i];
+                                    v_atom = &v_storage; break;
+                case RAY_F32:       v_storage.type = -RAY_F32;
+                                    v_storage.f64  = (double)((float*)ray_data(vals))[i];
+                                    v_atom = &v_storage; break;
+                case RAY_F64:       v_storage.type = -RAY_F64;
+                                    v_storage.f64  = ((double*)ray_data(vals))[i];
+                                    v_atom = &v_storage; break;
+                case RAY_SYM:       v_storage.type = -RAY_SYM;
+                                    v_storage.i64  = ray_read_sym(ray_data(vals), i, RAY_SYM, vals->attrs);
+                                    v_atom = &v_storage; break;
+                case RAY_STR: {
+                    size_t vl = 0;
+                    const char* vp = ray_str_vec_get(vals, i, &vl);
+                    v_atom = ray_str(vp ? vp : "", vp ? vl : 0);
+                    v_owned = true;
+                    break;
+                }
+                case RAY_GUID:
+                    v_atom = ray_guid(((const uint8_t*)ray_data(vals)) + i * 16);
+                    v_owned = (v_atom && !RAY_IS_ERR(v_atom));
+                    break;
+                default: break;
+            }
+            if (v_is_null && v_atom) v_atom->nullmap[0] |= 1;
+            if (v_atom) fmt_obj(b, v_atom, mode);
+            if (v_owned && v_atom) ray_release(v_atom);
+        }
+
+        if (k_owned && k_atom) ray_release(k_atom);
+    }
+    if (npairs > show) fmt_puts(b, " ..");
+    fmt_puts(b, "}");
+}
+
+/* ===== Box-drawing glyphs (UTF-8) ===== */
+
+#define G_TL "\xe2\x94\x8c"    /* ┌ U+250C top-left corner */
+#define G_TR "\xe2\x94\x90"    /* ┐ U+2510 top-right corner */
+#define G_BL "\xe2\x94\x94"    /* └ U+2514 bottom-left corner */
+#define G_BR "\xe2\x94\x98"    /* ┘ U+2518 bottom-right corner */
+#define G_H  "\xe2\x94\x80"    /* ─ U+2500 horizontal rule */
+#define G_V  "\xe2\x94\x82"    /* │ U+2502 vertical rule */
+#define G_TT "\xe2\x94\xac"    /* ┬ U+252C tee on a top border */
+#define G_BT "\xe2\x94\xb4"    /* ┴ U+2534 tee on a bottom border */
+#define G_LT "\xe2\x94\x9c"    /* ├ U+251C tee on the left edge */
+#define G_RT "\xe2\x94\xa4"    /* ┤ U+2524 tee on the right edge */
+#define G_X  "\xe2\x94\xbc"    /* ┼ U+253C crossing */
+#define G_HDOTS "\xe2\x80\xa6" /* … U+2026 ellipsis (3 bytes, 1 display column) */
+#define G_VDOTS "\xe2\x94\x86" /* ┆ U+2506 dashed vertical, used on the hidden-rows marker line */
+
+/* ===== Table formatter helpers ===== */
+
+static void fmt_centered(fmt_buf_t* b, const char* s, int32_t slen, int32_t width) {
+    /* Centre s in a width-byte field; an odd leftover space lands on the right. */
+    int32_t lead = (width - slen) / 2;
+    for (int32_t k = lead; k > 0; k--) fmt_putc(b, ' ');
+    fmt_putn(b, s, slen);
+    for (int32_t k = width - slen - lead; k > 0; k--) fmt_putc(b, ' ');
+}
+
+/* Pre-formatted cell storage; the stack path holds FMT_TABLE_MAX_WIDTH * FMT_TABLE_MAX_HEIGHT = 200 cells */
+#define FMT_CELL_BUF_SIZE 64
+
+typedef struct {
+    char    str[FMT_CELL_BUF_SIZE]; /* cell text, clipped to FMT_CELL_BUF_SIZE - 1 bytes plus a NUL */
+    int32_t len;                    /* bytes of str in use, not counting the NUL */
+} fmt_cell_t;
+
+/* Render a table into b.
+ *   mode 0      compact one-line form: (table [names] (list col1 col2 ...))
+ *   mode 1      boxed grid clipped to FMT_TABLE_MAX_WIDTH columns and
+ *               FMT_TABLE_MAX_HEIGHT rows (first and last rows are kept)
+ *   otherwise   boxed grid of every row and column; scratch arrays move to
+ *               the heap once the table exceeds the mode-1 limits
+ * If a heap scratch block cannot be allocated, "error: oom" is emitted instead. */
+static void fmt_table(fmt_buf_t* b, ray_t* tbl, int mode) {
+    int64_t ncols = ray_table_ncols(tbl);
+    int64_t nrows = ray_table_nrows(tbl);
+
+    /* Compact mode: round-trippable (table [names] (list col1 col2 ...)) */
+    if (mode == 0) {
+        fmt_puts(b, "(table [");
+        for (int64_t i = 0; i < ncols; i++) {
+            if (i > 0) fmt_putc(b, ' ');
+            int64_t name_id = ray_table_col_name(tbl, i);
+            ray_t* name_str = ray_sym_str(name_id);
+            if (name_str && !RAY_IS_ERR(name_str)) {
+                fmt_putn(b, ray_str_ptr(name_str), (int32_t)ray_str_len(name_str));
+                ray_release(name_str);
+            }
+        }
+        fmt_puts(b, "] (list ");
+        for (int64_t i = 0; i < ncols; i++) {
+            if (i > 0) fmt_putc(b, ' ');
+            ray_t* col = ray_table_get_col_idx(tbl, i);
+            if (col) {
+                fmt_obj(b, col, mode);
+            }
+        }
+        fmt_puts(b, "))");
+        return;
+    }
+
+    /* Full mode (1) and show mode (2) */
+    int64_t table_width  = ncols;
+    int64_t table_height = nrows;
+
+    if (mode == 1) {
+        if (table_width > FMT_TABLE_MAX_WIDTH)
+            table_width = FMT_TABLE_MAX_WIDTH;
+        if (table_height > FMT_TABLE_MAX_HEIGHT)
+            table_height = FMT_TABLE_MAX_HEIGHT;
+    }
+
+    if (table_width == 0) {
+        fmt_puts(b, "<table>");
+        return;
+    }
+
+    bool has_hidden_cols = (table_width < ncols);
+    bool has_hidden_rows = (table_height < nrows);
+
+    /* Allocate metadata arrays.  For mode 1 they fit on the stack
+     * (max 10 cols x 20 rows).  For mode 2 we heap-allocate. */
+    bool heap_alloc = (table_width > FMT_TABLE_MAX_WIDTH ||
+                       table_height > FMT_TABLE_MAX_HEIGHT);
+
+    int32_t     col_widths_stack[FMT_TABLE_MAX_WIDTH];
+    const char* col_names_stack[FMT_TABLE_MAX_WIDTH];
+    int32_t     col_name_lens_stack[FMT_TABLE_MAX_WIDTH];
+    const char* col_types_stack[FMT_TABLE_MAX_WIDTH];
+    int32_t     col_type_lens_stack[FMT_TABLE_MAX_WIDTH];
+    ray_t*      name_refs_stack[FMT_TABLE_MAX_WIDTH];
+    fmt_cell_t  cells_stack[FMT_TABLE_MAX_WIDTH * FMT_TABLE_MAX_HEIGHT];
+
+    /* Heap-backed pointers (NULL when using stack) */
+    ray_t* heap_widths_blk = NULL;
+    ray_t* heap_names_blk  = NULL;
+    ray_t* heap_nlen_blk   = NULL;
+    ray_t* heap_types_blk  = NULL;
+    ray_t* heap_tlen_blk   = NULL;
+    ray_t* heap_refs_blk   = NULL;
+    ray_t* heap_cells_blk  = NULL;
+
+    int32_t*     col_widths;
+    const char** col_names;
+    int32_t*     col_name_lens;
+    const char** col_types;
+    int32_t*     col_type_lens;
+    ray_t**      name_refs;
+    fmt_cell_t*  cells;
+
+    if (!heap_alloc) {
+        col_widths    = col_widths_stack;
+        col_names     = col_names_stack;
+        col_name_lens = col_name_lens_stack;
+        col_types     = col_types_stack;
+        col_type_lens = col_type_lens_stack;
+        name_refs     = name_refs_stack;
+        cells         = cells_stack;
+    } else {
+        heap_widths_blk = ray_alloc((size_t)(table_width * (int64_t)sizeof(int32_t)));
+        heap_names_blk  = ray_alloc((size_t)(table_width * (int64_t)sizeof(const char*)));
+        heap_nlen_blk   = ray_alloc((size_t)(table_width * (int64_t)sizeof(int32_t)));
+        heap_types_blk  = ray_alloc((size_t)(table_width * (int64_t)sizeof(const char*)));
+        heap_tlen_blk   = ray_alloc((size_t)(table_width * (int64_t)sizeof(int32_t)));
+        heap_refs_blk   = ray_alloc((size_t)(table_width * (int64_t)sizeof(ray_t*)));
+        heap_cells_blk  = ray_alloc((size_t)(table_width * table_height * (int64_t)sizeof(fmt_cell_t)));
+        if (!heap_widths_blk || !heap_names_blk || !heap_nlen_blk || !heap_types_blk ||
+            !heap_tlen_blk || !heap_refs_blk || !heap_cells_blk) {
+            /* Out of memory: give back whatever was obtained and emit an error
+             * marker rather than writing through a NULL block below. */
+            if (heap_widths_blk) ray_free(heap_widths_blk);
+            if (heap_names_blk)  ray_free(heap_names_blk);
+            if (heap_nlen_blk)   ray_free(heap_nlen_blk);
+            if (heap_types_blk)  ray_free(heap_types_blk);
+            if (heap_tlen_blk)   ray_free(heap_tlen_blk);
+            if (heap_refs_blk)   ray_free(heap_refs_blk);
+            if (heap_cells_blk)  ray_free(heap_cells_blk);
+            fmt_puts(b, "error: oom");
+            return;
+        }
+
+        col_widths    = (int32_t*)ray_data(heap_widths_blk);
+        col_names     = (const char**)ray_data(heap_names_blk);
+        col_name_lens = (int32_t*)ray_data(heap_nlen_blk);
+        col_types     = (const char**)ray_data(heap_types_blk);
+        col_type_lens = (int32_t*)ray_data(heap_tlen_blk);
+        name_refs     = (ray_t**)ray_data(heap_refs_blk);
+        cells         = (fmt_cell_t*)ray_data(heap_cells_blk);
+    }
+
+    /* Pre-format cells and calculate column widths */
+    for (int64_t ci = 0; ci < table_width; ci++) {
+        /* Column name */
+        int64_t name_id = ray_table_col_name(tbl, ci);
+        ray_t* name_str = ray_sym_str(name_id);
+        name_refs[ci] = name_str;
+        if (name_str && !RAY_IS_ERR(name_str)) {
+            col_names[ci]     = ray_str_ptr(name_str);
+            col_name_lens[ci] = (int32_t)ray_str_len(name_str);
+        } else {
+            col_names[ci]     = "?";
+            col_name_lens[ci] = 1;
+            name_refs[ci]     = NULL;
+        }
+
+        /* Column type */
+        ray_t* col_vec = ray_table_get_col_idx(tbl, ci);
+        const char* tname = ray_type_name(col_vec ? col_vec->type : 0);
+        col_types[ci]     = tname;
+        col_type_lens[ci] = (int32_t)strlen(tname);
+
+        /* Start with max of name and type lengths */
+        int32_t max_w = col_name_lens[ci];
+        if (col_type_lens[ci] > max_w) max_w = col_type_lens[ci];
+
+        int64_t col_len = col_vec ? ray_len(col_vec) : 0;
+
+        /* Format every shown row: the head half reads from the top of the
+         * column, the tail half reads its last rows when some are hidden. */
+        int64_t half = table_height / 2;
+        for (int64_t ri = 0; ri < table_height; ri++) {
+            fmt_cell_t* cell = &cells[ci * table_height + ri];
+            int64_t src_idx = ri;
+            if (ri >= half && has_hidden_rows && table_height != col_len)
+                src_idx = col_len - table_height + ri;
+            if (src_idx >= 0 && src_idx < col_len) {
+                fmt_buf_t tmp;
+                fmt_init(&tmp);
+                fmt_raw_elem(&tmp, col_vec, src_idx);
+                int32_t clen = tmp.len < FMT_CELL_BUF_SIZE - 1 ? tmp.len : FMT_CELL_BUF_SIZE - 1;
+                memcpy(cell->str, tmp.buf, (size_t)clen);
+                cell->str[clen] = '\0';
+                cell->len = clen;
+                fmt_destroy(&tmp);
+            } else {
+                memcpy(cell->str, "NA", 3);
+                cell->len = 2;
+            }
+            if (cell->len > max_w) max_w = cell->len;
+        }
+
+        col_widths[ci] = max_w + 2; /* +2 for padding (1 space each side) */
+    }
+
+    /* Calculate total width (sum of col widths + separators between columns) */
+    int32_t total_width = 0;
+    for (int64_t ci = 0; ci < table_width; ci++)
+        total_width += col_widths[ci];
+    total_width += (int32_t)(table_width - 1); /* separators between columns */
+
+    /* Format footer to check if we need to widen the last column */
+    char footer[128];
+    int footer_len = snprintf(footer, sizeof(footer),
+        " %" PRId64 " rows (%" PRId64 " shown) %" PRId64 " columns (%" PRId64 " shown)",
+        nrows, table_height, ncols, table_width);
+
+    if (total_width < footer_len) {
+        col_widths[table_width - 1] += footer_len - total_width;
+        total_width = footer_len;
+    }
+
+    /* Extra width for hidden columns indicator */
+    if (has_hidden_cols)
+        total_width += 4; /* "───┐" or " … │" */
+
+    /* === Render === */
+
+    /* 1. Top border: ┌──┬──┐ */
+    fmt_puts(b, G_TL);
+    for (int64_t ci = 0; ci < table_width; ci++) {
+        for (int32_t j = 0; j < col_widths[ci]; j++)
+            fmt_puts(b, G_H);
+        if (ci < table_width - 1 || has_hidden_cols)
+            fmt_puts(b, G_TT);
+        else
+            fmt_puts(b, G_TR);
+    }
+    if (has_hidden_cols) {
+        fmt_puts(b, G_H G_H G_H G_TR);
+    }
+
+    /* 2. Header row: │ name │ (centered) */
+    fmt_putc(b, '\n');
+    fmt_puts(b, G_V);
+    for (int64_t ci = 0; ci < table_width; ci++) {
+        fmt_centered(b, col_names[ci], col_name_lens[ci], col_widths[ci]);
+        fmt_puts(b, G_V);
+    }
+    if (has_hidden_cols) {
+        fmt_puts(b, " " G_HDOTS " " G_V);
+    }
+
+    /* 3. Type row: │ type │ (centered) */
+    fmt_putc(b, '\n');
+    fmt_puts(b, G_V);
+    for (int64_t ci = 0; ci < table_width; ci++) {
+        fmt_centered(b, col_types[ci], col_type_lens[ci], col_widths[ci]);
+        fmt_puts(b, G_V);
+    }
+    if (has_hidden_cols) {
+        fmt_puts(b, " " G_HDOTS " " G_V);
+    }
+
+    /* 4. Separator: ├──┼──┤ */
+    fmt_putc(b, '\n');
+    fmt_puts(b, G_LT);
+    for (int64_t ci = 0; ci < table_width; ci++) {
+        for (int32_t j = 0; j < col_widths[ci]; j++)
+            fmt_puts(b, G_H);
+        if (ci < table_width - 1 || has_hidden_cols)
+            fmt_puts(b, G_X);
+        else
+            fmt_puts(b, G_RT);
+    }
+    if (has_hidden_cols) {
+        fmt_puts(b, G_H G_H G_H G_RT);
+    }
+
+    /* 5. Data rows */
+    int64_t half = table_height / 2;
+    for (int64_t ri = 0; ri < table_height; ri++) {
+        fmt_putc(b, '\n');
+
+        /* 6. Truncation indicator row between head and tail */
+        if (has_hidden_rows && ri == half) {
+            fmt_puts(b, G_VDOTS);
+            for (int64_t ci = 0; ci < table_width; ci++) {
+                /* Center the ellipsis (3 bytes, 1 display char) */
+                int32_t left  = (col_widths[ci] - 1) / 2;
+                int32_t right = col_widths[ci] - 1 - left;
+                for (int32_t p = 0; p < left; p++)  fmt_putc(b, ' ');
+                fmt_puts(b, G_HDOTS);
+                for (int32_t p = 0; p < right; p++) fmt_putc(b, ' ');
+                fmt_puts(b, G_VDOTS);
+            }
+            if (has_hidden_cols) {
+                fmt_puts(b, " " G_HDOTS " " G_VDOTS);
+            }
+            fmt_putc(b, '\n');
+        }
+
+        /* Data row: │ val │ (left-aligned with 1-space padding) */
+        fmt_puts(b, G_V);
+        for (int64_t ci = 0; ci < table_width; ci++) {
+            fmt_cell_t* cell = &cells[ci * table_height + ri];
+            fmt_putc(b, ' ');
+            fmt_putn(b, cell->str, cell->len);
+            int32_t pad = col_widths[ci] - cell->len - 1;
+            for (int32_t p = 0; p < pad; p++)
+                fmt_putc(b, ' ');
+            fmt_puts(b, G_V);
+        }
+        if (has_hidden_cols) {
+            fmt_puts(b, " " G_HDOTS " " G_V);
+        }
+    }
+
+    /* 7. Bottom border (separator before footer): ├──┴──┤ */
+    fmt_putc(b, '\n');
+    fmt_puts(b, G_LT);
+    for (int64_t ci = 0; ci < table_width; ci++) {
+        for (int32_t j = 0; j < col_widths[ci]; j++)
+            fmt_puts(b, G_H);
+        if (ci < table_width - 1 || has_hidden_cols)
+            fmt_puts(b, G_BT);
+        else
+            fmt_puts(b, G_RT);
+    }
+    if (has_hidden_cols) {
+        fmt_puts(b, G_H G_H G_H G_RT);
+    }
+
+    /* 8. Footer row: │ N rows (M shown) C columns (K shown) │ */
+    fmt_putc(b, '\n');
+    fmt_puts(b, G_V);
+    fmt_putn(b, footer, footer_len);
+    for (int32_t i = footer_len; i < total_width; i++)
+        fmt_putc(b, ' ');
+    fmt_puts(b, G_V);
+
+    /* Final bottom border: └───┘ */
+    fmt_putc(b, '\n');
+    fmt_puts(b, G_BL);
+    for (int32_t i = 0; i < total_width; i++)
+        fmt_puts(b, G_H);
+    fmt_puts(b, G_BR);
+
+    /* Release name string refs */
+    for (int64_t ci = 0; ci < table_width; ci++) {
+        if (name_refs[ci]) ray_release(name_refs[ci]);
+    }
+
+    /* Free heap allocations if used */
+    if (heap_alloc) {
+        ray_free(heap_widths_blk);
+        ray_free(heap_names_blk);
+        ray_free(heap_nlen_blk);
+        ray_free(heap_types_blk);
+        ray_free(heap_tlen_blk);
+        ray_free(heap_refs_blk);
+        ray_free(heap_cells_blk);
+    }
+}
+
+/* ===== Core dispatch ===== */
+
+static void fmt_obj(fmt_buf_t* b, ray_t* obj, int mode) {
+    if (!obj || RAY_IS_NULL(obj)) { fmt_puts(b, "null"); return; }
+    /* Error objects print "error: " followed by at most 7 bytes of sdata. */
+    if (RAY_IS_ERR(obj)) {
+        char ecode[8] = {0};
+        memcpy(ecode, obj->sdata, obj->slen < 7 ? obj->slen : 7);
+        fmt_puts(b, "error: ");
+        fmt_puts(b, ecode);
+        return;
+    }
+    int8_t tag = obj->type;
+    if (tag >= 0) {
+        /* Non-atoms: typed vectors are tested first, then the tag picks a renderer. */
+        if (ray_is_vec(obj)) {
+            int cap = (mode == 1) ? g_row_width : -1;
+            fmt_vector(b, obj, cap);
+            return;
+        }
+        switch (tag) {
+        case RAY_LIST:   fmt_list(b, obj, mode);  break;
+        case RAY_TABLE:  fmt_table(b, obj, mode); break;
+        case RAY_DICT:   fmt_dict(b, obj, mode);  break;
+        case RAY_LAMBDA: fmt_puts(b, "lambda");   break;
+        case RAY_UNARY:
+        case RAY_BINARY:
+        case RAY_VARY: {
+            /* Named functions are wrapped in angle brackets so they cannot be
+             * mistaken for a sym or a string (e.g. `{getenv:<.os.getenv> …}`);
+             * unnamed ones fall back to a per-arity placeholder. */
+            const char* fname = ray_fn_name(obj);
+            if (!fname[0]) {
+                fmt_puts(b, tag == RAY_UNARY  ? "<builtin/1>" :
+                            tag == RAY_BINARY ? "<builtin/2>" : "<builtin/n>");
+                break;
+            }
+            fmt_puts(b, "<"); fmt_puts(b, fname); fmt_puts(b, ">");
+            break;
+        }
+        /* A lazy value is formatted as whatever it materializes to. */
+        case RAY_LAZY:   fmt_obj(b, ray_lazy_materialize(obj), mode); break;
+        default:         fmt_printf(b, "<%s>", ray_type_name(tag));  break;
+        }
+        return;
+    }
+
+    /* Atoms (negative tag): a set null bit takes precedence over the payload. */
+    if (RAY_ATOM_IS_NULL(obj)) { fmt_puts(b, null_literal(-tag)); return; }
+    switch (-tag) {
+    case RAY_BOOL:      fmt_bool(b, obj->b8);            break;
+    case RAY_U8:        fmt_u8(b, obj->u8);              break;
+    case RAY_I16:       fmt_i16(b, obj->i16);            break;
+    case RAY_I32:       fmt_i32(b, obj->i32);            break;
+    case RAY_I64:       fmt_i64(b, obj->i64);            break;
+    case RAY_F32:       fmt_f32(b, (float)obj->f64);     break;
+    case RAY_F64:       fmt_f64(b, obj->f64);            break;
+    case RAY_DATE:      fmt_date(b, obj->i32);           break;
+    case RAY_TIME:      fmt_time(b, obj->i32);           break;
+    case RAY_TIMESTAMP: fmt_timestamp(b, obj->i64);      break;
+    case RAY_SYM:       fmt_sym(b, obj->i64);            break;
+    case RAY_STR:       fmt_str_atom(b, obj, mode > 0);  break;
+    case RAY_GUID:      fmt_guid(b, obj->obj ? (const uint8_t*)ray_data(obj->obj) : (const uint8_t*)ray_data(obj)); break;
+    default:            fmt_puts(b, "?");                break;
+    }
+}
+
+ray_t* ray_fmt(ray_t* obj, int mode) {
+    /* Render obj into a scratch buffer, then let fmt_to_str produce the result. */
+    fmt_buf_t scratch; fmt_init(&scratch);
+    fmt_obj(&scratch, obj, mode);
+    return fmt_to_str(&scratch);
+}
+
+void ray_fmt_print(FILE* fp, ray_t* obj, int mode) {
+    /* Format obj, write the bytes to fp, then drop the temporary string. */
+    ray_t* text = ray_fmt(obj, mode);
+    if (!text) return; /* nothing is written when ray_fmt yields NULL */
+    fwrite(ray_str_ptr(text), 1, ray_str_len(text), fp);
+    ray_release(text);
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/format.h b/crates/rayforce-sys/vendor/rayforce/src/lang/format.h
new file mode 100644
index 0000000..894aa5b
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/lang/format.h
@@ -0,0 +1,50 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_LANG_FORMAT_H
+#define RAY_LANG_FORMAT_H
+
+#include <rayforce.h>
+#include <stdio.h>
+
+#define FMT_TABLE_MAX_WIDTH   10  /* most columns a mode-1 table shows */
+#define FMT_TABLE_MAX_HEIGHT  20  /* most rows a mode-1 table shows */
+#define FMT_LIST_MAX_HEIGHT   50
+#define FMT_DEFAULT_ROW_WIDTH 80
+#define FMT_DEFAULT_PRECISION  2
+
+/* Format a ray_t value into a new ray_t string (RAY_STR atom).
+ * mode: 0 = compact, 1 = full (REPL), 2 = show (no limits); ray_fmt_print releases the result with ray_release */
+ray_t* ray_fmt(ray_t* obj, int mode);
+
+/* Format with ray_fmt and write the resulting bytes to fp */
+void ray_fmt_print(FILE* fp, ray_t* obj, int mode);
+
+/* Display settings */
+void ray_fmt_set_precision(int digits);
+void ray_fmt_set_width(int cols);
+
+/* Type name string for a type code (e.g. "i64"); internal.h notes atoms are lowercase, vectors uppercase */
+const char* ray_type_name(int8_t type);
+
+#endif /* RAY_LANG_FORMAT_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/internal.h b/crates/rayforce-sys/vendor/rayforce/src/lang/internal.h
new file mode 100644
index 0000000..3b47b09
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/lang/internal.h
@@ -0,0 +1,514 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+/**   Shared helpers for eval.c split — included by arith.c, cmp.c, agg.c, etc.
+ *   Small hot-path helpers are static inline; larger functions that remain in
+ *   eval.c are declared extern.
+ */
+
+#ifndef RAY_LANG_INTERNAL_H
+#define RAY_LANG_INTERNAL_H
+
+#include "lang/eval.h"
+#include "lang/format.h"
+#include "core/types.h"
+#include "mem/heap.h"
+#include "table/sym.h"
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* ══════════════════════════════════════════
+ * Atom constructors
+ * ══════════════════════════════════════════ */
+
+static inline ray_t* make_i64(int64_t v) {
+    ray_t* atom = ray_alloc(0); /* no payload requested; the value goes in the i64 field */
+    if (atom == NULL) return ray_error("oom", NULL);
+    atom->i64  = v;
+    atom->type = -RAY_I64;
+    return atom;
+}
+
+static inline ray_t* make_f64(double v) {
+    ray_t* atom = ray_alloc(0); /* no payload requested; the value goes in the f64 field */
+    if (atom == NULL) return ray_error("oom", NULL);
+    atom->f64  = v;
+    atom->type = -RAY_F64;
+    return atom;
+}
+
+static inline ray_t* make_i16(int16_t v) { /* i16 atom; forwards to ray_i16 */
+    return ray_i16(v);
+}
+
+static inline ray_t* make_i32(int32_t v) { /* i32 atom; forwards to ray_i32 */
+    return ray_i32(v);
+}
+
+static inline ray_t* make_u8(uint8_t v) { /* u8 atom; forwards to ray_u8 */
+    return ray_u8(v);
+}
+
+static inline ray_t* make_bool(uint8_t v) {
+    ray_t* atom = ray_alloc(0); /* no payload requested; the value goes in the b8 field */
+    if (atom == NULL) return ray_error("oom", NULL);
+    atom->b8   = v;
+    atom->type = -RAY_BOOL;
+    return atom;
+}
+
+/* ══════════════════════════════════════════
+ * Type checks and numeric extraction
+ * ══════════════════════════════════════════ */
+
+/* True for BOOL, U8, I16, I32, I64 and F64 atoms; F32 and temporal atoms are not matched */
+static inline int is_numeric(ray_t* x) {
+    const int8_t t = x->type;
+    return t == -RAY_BOOL || t == -RAY_U8  || t == -RAY_I16 ||
+           t == -RAY_I32  || t == -RAY_I64 || t == -RAY_F64;
+}
+
+static inline int is_temporal(ray_t* x) { /* DATE, TIME or TIMESTAMP atom? */
+    const int8_t t = x->type;
+    return t == -RAY_DATE || t == -RAY_TIME || t == -RAY_TIMESTAMP;
+}
+
+/* Temporal atom -> nanoseconds for cross-temporal comparison: DATE = days, TIME = ms, TIMESTAMP = ns; others 0 */
+static inline int64_t temporal_as_ns(ray_t* x) {
+    const int8_t t = x->type;
+    return t == -RAY_TIMESTAMP ? x->i64
+         : t == -RAY_DATE      ? (int64_t)x->i32 * 86400000000000LL
+         : t == -RAY_TIME      ? (int64_t)x->i32 * 1000000LL
+         :                       0;
+}
+
+/* Extract the value of an integer or BOOL atom as int64_t, reading the field its constructor wrote */
+static inline int64_t as_i64(ray_t* x) {
+    if (x->type == -RAY_I32)  return (int64_t)x->i32;
+    if (x->type == -RAY_I16)  return (int64_t)x->i16;
+    if (x->type == -RAY_U8)   return (int64_t)x->u8;
+    if (x->type == -RAY_BOOL) return (int64_t)x->b8; /* make_bool fills b8 only */
+    return x->i64; /* RAY_I64, and the fallback for every other type */
+}
+
+/* Numeric view of an atom as double; a one-byte STR atom yields its byte value */
+static inline double as_f64(ray_t* x) {
+    switch (x->type) {
+    case -RAY_F64:  return x->f64;
+    case -RAY_I16:  return (double)x->i16;
+    case -RAY_U8:   return (double)x->u8;
+    case -RAY_BOOL: return (double)x->b8;
+    case -RAY_I32: case -RAY_DATE: case -RAY_TIME: return (double)x->i32;
+    case -RAY_STR:  if (ray_str_len(x) == 1) return (double)(unsigned char)x->sdata[0]; break;
+    }
+    return (double)x->i64; /* I64, TIMESTAMP, longer STR and everything else */
+}
+
+static inline int is_float_op(ray_t* a, ray_t* b) { /* true when either operand is an F64 atom */
+    return a->type == -RAY_F64 || b->type == -RAY_F64;
+}
+
+/* ══════════════════════════════════════════
+ * Null/type helpers
+ * ══════════════════════════════════════════ */
+
+/* RAY_ATOM_IS_NULL and ray_typed_null are in rayforce.h */
+
+/* Typed null of the type two operands promote to.
+ * The first kind in the order F64, I64, I32, I16 that either operand has
+ * decides the result.
+ * When neither operand has any of them the null is I64. */
+static inline ray_t* null_for_promoted(ray_t* a, ray_t* b) {
+    const int8_t order[4] = { -RAY_F64, -RAY_I64, -RAY_I32, -RAY_I16 };
+    for (int k = 0; k < 4; k++) {
+        if (a->type == order[k] || b->type == order[k])
+            return ray_typed_null(order[k]);
+    }
+    return ray_typed_null(-RAY_I64);
+}
+
+/* ══════════════════════════════════════════
+ * Type promotion
+ * ══════════════════════════════════════════ */
+
+/* Pick the integer atom type (negative code) that two numeric operands promote to.
+ * Precedence: I64, then I32, then U8 only when both sides are U8, then I16;
+ * every remaining pairing (for example U8 with BOOL) widens to I64. */
+static inline int8_t promote_int_type(ray_t* a, ray_t* b) {
+    const int8_t ta = a->type;
+    const int8_t tb = b->type;
+    if (ta == -RAY_I64 || tb == -RAY_I64) return -RAY_I64;
+    if (ta == -RAY_I32 || tb == -RAY_I32) return -RAY_I32;
+    if (ta == -RAY_U8  && tb == -RAY_U8)  return -RAY_U8;
+    /* A lone U8 paired with I16 takes this branch as well. */
+    if (ta == -RAY_I16 || tb == -RAY_I16) return -RAY_I16;
+    return -RAY_I64;
+}
+
+/* Promote integer type following the right operand's type (K/q semantics for sub);
+ * the left operand is consulted only when the right one is not I64/I32/I16/U8. */
+static inline int8_t promote_int_type_right(ray_t* a, ray_t* b) {
+    ray_t* const pick[2] = { b, a };
+    for (int k = 0; k < 2; k++) {
+        const int8_t t = pick[k]->type;
+        if (t == -RAY_I64 || t == -RAY_I32 || t == -RAY_I16 || t == -RAY_U8)
+            return t;
+    }
+    return -RAY_I64;
+}
+
+/* Create a result atom of the given type from an int64_t value.
+ * I16, I32 and U8 narrow val with a plain cast; any other atom_type
+ * produces an I64 atom. */
+static inline ray_t* make_typed_int(int8_t atom_type, int64_t val) {
+    if (atom_type == -RAY_I16) return make_i16((int16_t)val);
+    if (atom_type == -RAY_I32) return make_i32((int32_t)val);
+    if (atom_type == -RAY_U8)  return make_u8((uint8_t)val);
+    return make_i64(val);
+}
+
+/* ══════════════════════════════════════════
+ * Type name helper
+ * ══════════════════════════════════════════ */
+
+/* Removed: type_sym_name() — use ray_type_name() directly.
+ * Lowercase for atoms (negative type), uppercase for vectors (positive). */
+
+/* ══════════════════════════════════════════
+ * Truthiness
+ * ══════════════════════════════════════════ */
+
+/* Truthiness: the RAY_NULL singleton, typed null atoms, false, I64 0 and F64 0.0 are falsy.
+ * Everything else is truthy, including zero-valued I32/I16/U8 atoms. */
+static inline int is_truthy(ray_t* x) {
+    if (RAY_IS_NULL(x) || RAY_ATOM_IS_NULL(x)) return 0;
+    const int8_t t = x->type;
+    return t == -RAY_BOOL ? x->b8
+         : t == -RAY_I64  ? (x->i64 != 0)
+         : t == -RAY_F64  ? (x->f64 != 0.0) : 1;
+}
+
+/* ══════════════════════════════════════════
+ * Collection helpers
+ * ══════════════════════════════════════════ */
+
+static inline int is_list(ray_t* x) { /* non-NULL, not an error, and a boxed RAY_LIST */
+    return x && !RAY_IS_ERR(x) && x->type == RAY_LIST;
+}
+
+/* True when x is non-NULL, not an error, and either a boxed list or a typed vector */
+static inline int is_collection(ray_t* x) {
+    return x && !RAY_IS_ERR(x) && (x->type == RAY_LIST || ray_is_vec(x));
+}
+
+/* Return element i of coll as an atom.
+ * A boxed list yields the stored pointer as-is (*allocated = 0, nothing to
+ * release).  A typed vector yields a freshly built atom (*allocated = 1) that
+ * the caller must release; a null slot becomes a typed null of the element
+ * type.  An unsupported vector type yields ray_error("type") with *allocated = 0. */
+static inline ray_t* collection_elem(ray_t* coll, int64_t i, int *allocated) {
+    const int8_t vt = coll->type;
+    if (vt == RAY_LIST) {
+        *allocated = 0;
+        return ((ray_t**)ray_data(coll))[i];
+    }
+    *allocated = 1;
+    if (ray_vec_is_null(coll, i)) return ray_typed_null(-vt);
+    void* base = ray_data(coll);
+    switch (vt) {
+        case RAY_BOOL:      return ray_bool(((bool*)base)[i]);
+        case RAY_U8:        return ray_u8(((uint8_t*)base)[i]);
+        case RAY_I16:       return ray_i16(((int16_t*)base)[i]);
+        case RAY_I32:       return ray_i32(((int32_t*)base)[i]);
+        case RAY_I64:       return ray_i64(((int64_t*)base)[i]);
+        case RAY_F64:       return ray_f64(((double*)base)[i]);
+        case RAY_DATE:      return ray_date((int64_t)((int32_t*)base)[i]);
+        case RAY_TIME:      return ray_time((int64_t)((int32_t*)base)[i]);
+        case RAY_TIMESTAMP: return ray_timestamp(((int64_t*)base)[i]);
+        case RAY_SYM:       return ray_sym(ray_read_sym(base, i, vt, coll->attrs));
+        case RAY_GUID:      return ray_guid(((uint8_t*)base) + i * 16);
+        case RAY_STR: {
+            /* Missing string data is turned into an empty string atom. */
+            size_t nbytes = 0;
+            const char* text = ray_str_vec_get(coll, i, &nbytes);
+            return text ? ray_str(text, nbytes) : ray_str("", 0);
+        }
+        default:
+            *allocated = 0;
+            return ray_error("type", NULL);
+    }
+}
+
+/* Extract a value from an atom for storage, handling cross-type casting.
+ * Returns the value as int64_t; narrow atoms are read through the field their constructor wrote. */
+static inline int64_t elem_as_i64(ray_t* elem) {
+    if (elem->type == -RAY_I32)  return (int64_t)elem->i32;
+    if (elem->type == -RAY_I16)  return (int64_t)elem->i16;
+    if (elem->type == -RAY_U8)   return (int64_t)elem->u8;
+    if (elem->type == -RAY_BOOL) return (int64_t)elem->b8; /* make_bool fills b8 only */
+    if (elem->type == -RAY_F64)  return (int64_t)elem->f64;
+    /* I64, TIMESTAMP, DATE, TIME, SYM and every other type read the 64-bit field.
+     * NOTE(review): as_f64/temporal_as_ns read DATE and TIME through i32 — confirm which field ray_date/ray_time fill. */
+    return elem->i64;
+}
+
+/* Write atom `elem` into slot i of typed vector `vec`, converting numeric
+ * widths via elem_as_i64.  Returns 0 after handling the slot, or -1 when
+ * vec->type has no case below; the atom's own type is never validated. */
+static inline int store_typed_elem(ray_t* vec, int64_t i, ray_t* elem) {
+    void* slots = ray_data(vec);
+    if (RAY_ATOM_IS_NULL(elem)) {   /* null atom: zero the slot, then flag it */
+        int width = ray_elem_size(vec->type);
+        memset((char*)slots + i * width, 0, width);
+        ray_vec_set_null(vec, i, true);
+        return 0;
+    }
+    switch (vec->type) {
+        case RAY_I64: case RAY_TIMESTAMP:          ((int64_t*)slots)[i] = elem_as_i64(elem); return 0;
+        case RAY_F64:       ((double*)slots)[i]  = (elem->type == -RAY_F64) ? elem->f64 : (double)elem_as_i64(elem); return 0;
+        case RAY_I32: case RAY_DATE: case RAY_TIME: ((int32_t*)slots)[i] = (int32_t)elem_as_i64(elem); return 0;
+        case RAY_I16:       ((int16_t*)slots)[i] = (int16_t)elem_as_i64(elem); return 0;
+        case RAY_BOOL:      ((bool*)slots)[i]    = elem->b8;  return 0;
+        case RAY_U8:        ((uint8_t*)slots)[i] = (uint8_t)elem_as_i64(elem); return 0;
+        case RAY_SYM:       ray_write_sym(slots, i, (uint64_t)elem->i64, vec->type, vec->attrs); return 0;
+        /* A GUID atom with a NULL obj writes nothing but still reports 0. */
+        case RAY_GUID:      if (elem->obj) memcpy(((uint8_t*)slots) + i * 16, ray_data(elem->obj), 16); return 0;
+        default: return -1;
+    }
+}
+
+/* ══════════════════════════════════════════
+ * Extern forward declarations — larger functions that stay in eval.c
+ * ══════════════════════════════════════════ */
+
+ray_t* atomic_map_binary_op(ray_binary_fn fn, uint16_t dag_opcode, ray_t* left, ray_t* right);
+ray_t* atomic_map_unary(ray_unary_fn fn, ray_t* arg);
+ray_t* to_boxed_list(ray_t* x);
+ray_t* unbox_vec_arg(ray_t* x, ray_t** _bx);
+ray_t* call_lambda(ray_t* lambda, ray_t** call_args, int64_t argc);
+ray_t* call_fn1(ray_t* fn, ray_t* arg);
+ray_t* call_fn2(ray_t* fn, ray_t* a, ray_t* b);
+ray_t* gather_by_idx(ray_t* vec, int64_t* idx, int64_t n);
+ray_t* ray_sort(ray_t** cols, uint8_t* descs, uint8_t* nulls_first,
+                uint8_t n_cols, int64_t nrows);
+int    char_str_cmp(ray_t* a, ray_t* b, int *out);
+int    is_comparable(ray_t* x);
+
+/* Arithmetic builtins (formerly static in eval.c, now in arith.c) */
+ray_t* ray_round_fn(ray_t* x);
+ray_t* ray_floor_fn(ray_t* x);
+ray_t* ray_ceil_fn(ray_t* x);
+ray_t* ray_abs_fn(ray_t* x);
+ray_t* ray_sqrt_fn(ray_t* x);
+ray_t* ray_log_fn(ray_t* x);
+ray_t* ray_exp_fn(ray_t* x);
+
+/* Collection helpers (formerly static in eval.c, now in collection.c) */
+int    atom_eq(ray_t* a, ray_t* b);
+ray_t* list_to_typed_vec(ray_t* list, int8_t orig_vec_type);
+
+/* Collection builtins (formerly static in eval.c, now in collection.c) */
+ray_t* ray_map_fn(ray_t** args, int64_t n);
+ray_t* ray_pmap_fn(ray_t** args, int64_t n);
+ray_t* ray_fold_fn(ray_t** args, int64_t n);
+ray_t* ray_scan_fn(ray_t** args, int64_t n);
+ray_t* ray_filter_fn(ray_t* vec, ray_t* mask);
+ray_t* ray_apply_fn(ray_t** args, int64_t n);
+ray_t* ray_distinct_fn(ray_t* x);
+ray_t* ray_in_fn(ray_t* val, ray_t* vec);
+ray_t* ray_except_fn(ray_t* vec1, ray_t* vec2);
+ray_t* ray_union_fn(ray_t* vec1, ray_t* vec2);
+ray_t* ray_sect_fn(ray_t* vec1, ray_t* vec2);
+ray_t* ray_take_fn(ray_t* vec, ray_t* n_obj);
+ray_t* ray_at_fn(ray_t* vec, ray_t* idx);
+ray_t* ray_find_fn(ray_t* vec, ray_t* val);
+ray_t* ray_til_fn(ray_t* x);
+ray_t* ray_reverse_fn(ray_t* x);
+ray_t* ray_rand_fn(ray_t* a, ray_t* b);
+ray_t* ray_bin_fn(ray_t* sorted, ray_t* val);
+ray_t* ray_binr_fn(ray_t* sorted, ray_t* val);
+ray_t* ray_map_left_fn(ray_t** args, int64_t n);
+ray_t* ray_map_right_fn(ray_t** args, int64_t n);
+ray_t* ray_fold_left_fn(ray_t** args, int64_t n);
+ray_t* ray_fold_right_fn(ray_t** args, int64_t n);
+ray_t* ray_scan_left_fn(ray_t** args, int64_t n);
+ray_t* ray_scan_right_fn(ray_t** args, int64_t n);
+ray_t* ray_enlist_fn(ray_t** args, int64_t n);
+
+/* String builtins (formerly static in eval.c, now in str_builtin.c) */
+ray_t* ray_split_fn(ray_t* str, ray_t* delim);
+ray_t* ray_like_fn(ray_t* x, ray_t* pattern);
+ray_t* ray_sym_name_fn(ray_t* x);
+
+/* Table builtins (formerly static in eval.c, now in table_builtin.c) */
+uint16_t pivot_fn_to_agg_op(ray_t* fn);
+ray_t* ray_pivot_fn(ray_t** args, int64_t n);
+ray_t* ray_modify_fn(ray_t** args, int64_t n);
+ray_t* ray_alter_fn(ray_t** args, int64_t n);
+ray_t* ray_del_fn(ray_t** args, int64_t n);
+ray_t* ray_row_fn(ray_t* tbl, ray_t* idx);
+ray_t* ray_union_all_fn(ray_t* t1, ray_t* t2);
+ray_t* ray_table_distinct_fn(ray_t* tbl);
+ray_t* ray_unify_fn(ray_t* a, ray_t* b);
+
+/* Concat (formerly static in eval.c, now extern for table_builtin.c) */
+ray_t* ray_concat_fn(ray_t* a, ray_t* b);
+
+/* Temporal builtins (formerly static in eval.c, now in temporal.c) */
+ray_t* ray_date_clock_fn(ray_t* arg);
+ray_t* ray_time_clock_fn(ray_t* arg);
+ray_t* ray_timestamp_clock_fn(ray_t* arg);
+
+/* Sort builtins (formerly static in eval.c, now in sort.c) */
+ray_t* ray_asc_fn(ray_t* x);
+ray_t* ray_desc_fn(ray_t* x);
+ray_t* ray_iasc_fn(ray_t* x);
+ray_t* ray_idesc_fn(ray_t* x);
+ray_t* ray_rank_fn(ray_t* x);
+ray_t* sort_table_by_keys(ray_t* tbl, ray_t* keys, uint8_t descending);
+ray_t* ray_xasc_fn(ray_t* tbl, ray_t* keys);
+ray_t* ray_xdesc_fn(ray_t* tbl, ray_t* keys);
+ray_t* ray_xrank_fn(ray_t* n_obj, ray_t* vec);
+
+/* Datalog builtins (formerly static in eval.c, now in datalog_builtin.c) */
+ray_t* ray_datoms_fn(ray_t** args, int64_t n);
+ray_t* ray_assert_fact_fn(ray_t** args, int64_t n);
+ray_t* ray_retract_fact_fn(ray_t** args, int64_t n);
+ray_t* ray_scan_eav_fn(ray_t** args, int64_t n);
+ray_t* ray_pull_fn(ray_t** args, int64_t n);
+ray_t* ray_rule_fn(ray_t** args, int64_t n);
+ray_t* ray_query_fn(ray_t** args, int64_t n);
+ray_t* ray_dl_program_fn(ray_t** args, int64_t n);
+ray_t* ray_dl_add_edb_fn(ray_t** args, int64_t n);
+ray_t* ray_dl_stratify_fn(ray_t* x);
+ray_t* ray_dl_eval_fn(ray_t* x);
+ray_t* ray_dl_query_fn(ray_t* prog_obj, ray_t* pred_obj);
+ray_t* ray_dl_provenance_fn(ray_t* prog_obj, ray_t* pred_obj);
+void   ray_dl_reset_rules(void);
+
+/* System builtins (formerly static in eval.c, now in system.c) */
+ray_t* ray_eval_builtin_fn(ray_t* x);
+ray_t* ray_parse_builtin_fn(ray_t* x);
+ray_t* ray_print_fn(ray_t** args, int64_t n);
+ray_t* ray_meta_fn(ray_t* x);
+ray_t* ray_gc_fn(ray_t** args, int64_t n);
+ray_t* ray_system_fn(ray_t* x);
+/* `.sys.cmd "name args"` — registry-dispatched system commands with
+ * shell fallback (see lang/syscmd.h). */
+ray_t* ray_syscmd_string_dispatch_fn(ray_t* x);
+/* Direct typed entry points sharing the syscmd registry. timeit and
+ * env are variadic so they accept the zero-arg toggle/list shape. */
+ray_t* ray_sys_listen_fn(ray_t* x);
+ray_t* ray_sys_timeit_fn(ray_t** args, int64_t n);
+ray_t* ray_sys_env_fn(ray_t** args, int64_t n);
+ray_t* ray_getenv_fn(ray_t* x);
+/* Filesystem metadata under .os.* (issue #36).  Lean two: size +
+ * directory-list.  Existence/is-file/is-dir reachable via try on
+ * either of these, or via the shell fallback in .sys.cmd. */
+ray_t* ray_os_size_fn(ray_t* x);
+ray_t* ray_os_list_fn(ray_t* x);
+ray_t* ray_setenv_fn(ray_t* name, ray_t* val);
+ray_t* ray_quote_fn(ray_t** args, int64_t n);
+ray_t* ray_return_fn(ray_t* x);
+ray_t* ray_args_fn(ray_t* x);
+ray_t* ray_rc_fn(ray_t* x);
+ray_t* ray_diverse_fn(ray_t* x);
+ray_t* ray_get_fn(ray_t* dict, ray_t* key);
+ray_t* ray_remove_fn(ray_t* dict, ray_t* key);
+ray_t* ray_timer_fn(ray_t* x);
+ray_t* ray_env_fn(ray_t* x);
+ray_t* ray_internals_fn(ray_t** args, int64_t n);
+ray_t* ray_memstat_fn(ray_t** args, int64_t n);
+ray_t* ray_sysinfo_fn(ray_t** args, int64_t n);
+ray_t* ray_ser_fn(ray_t* val);
+ray_t* ray_de_fn(ray_t* val);
+ray_t* ray_hopen_fn(ray_t* x);
+ray_t* ray_hclose_fn(ray_t* x);
+ray_t* ray_hsend_fn(ray_t* handle, ray_t* msg);
+ray_t* ray_set_splayed_fn(ray_t** args, int64_t n);
+ray_t* ray_get_splayed_fn(ray_t** args, int64_t n);
+ray_t* ray_get_parted_fn(ray_t** args, int64_t n);
+/* Bulk-load entry points: walk a root directory, find every splayed
+ * (resp. parted) child, bind it as a Rayfall global, return the
+ * resulting {name → table} dict. */
+ray_t* ray_db_splayed_mount_fn(ray_t** args, int64_t n);
+ray_t* ray_db_parted_mount_fn(ray_t** args, int64_t n);
+ray_t* ray_guid_fn(ray_t* n_arg);
+
+/* Transaction-log journaling (.log.*) — q's -l/-L feature.
+ * Implementations live in src/ops/journal.c; the on-disk machinery
+ * is src/store/journal.c. */
+ray_t* ray_log_open_fn(ray_t** args, int64_t n);
+ray_t* ray_log_write_fn(ray_t* expr);
+ray_t* ray_log_replay_fn(ray_t* path);
+ray_t* ray_log_validate_fn(ray_t* path);
+ray_t* ray_log_roll_fn(ray_t** args, int64_t n);
+ray_t* ray_log_snapshot_fn(ray_t** args, int64_t n);
+ray_t* ray_log_sync_fn(ray_t** args, int64_t n);
+ray_t* ray_log_close_fn(ray_t** args, int64_t n);
+
+/* Group (formerly static in eval.c, now extern for query.c) */
+ray_t* ray_group_fn(ray_t* x);
+
+/* I/O and formatting builtins (formerly in eval.c, now in ops/builtins.c) */
+ray_t* ray_println_fn(ray_t** args, int64_t n);
+ray_t* ray_show_fn(ray_t** args, int64_t n);
+ray_t* ray_format_fn(ray_t** args, int64_t n);
+ray_t* ray_resolve_fn(ray_t** args, int64_t n);
+ray_t* ray_timeit_fn(ray_t** args, int64_t n);
+ray_t* ray_exit_fn(ray_t* arg);
+ray_t* ray_read_csv_fn(ray_t** args, int64_t n);
+ray_t* ray_write_csv_fn(ray_t** args, int64_t n);
+ray_t* ray_cast_fn(ray_t* type_sym, ray_t* val);
+ray_t* ray_type_fn(ray_t* val);
+ray_t* ray_read_file_fn(ray_t* path_obj);
+ray_t* ray_load_file_fn(ray_t* path_obj);
+ray_t* ray_write_file_fn(ray_t* path_obj, ray_t* content);
+
+/* Misc builtins (formerly in eval.c, now in ops/builtins.c) */
+ray_t* ray_dict_fn(ray_t* keys, ray_t* vals);
+ray_t* ray_nil_fn(ray_t* x);
+ray_t* ray_where_fn(ray_t* x);
+ray_t* ray_raze_fn(ray_t* x);
+ray_t* ray_within_fn(ray_t* vals, ray_t* range);
+ray_t* ray_fdiv_fn(ray_t* a, ray_t* b);
+
+/* Query bridge builtins (formerly in eval.c, now in ops/query.c) */
+ray_t* ray_select_fn(ray_t** args, int64_t n);
+ray_t* ray_update_fn(ray_t** args, int64_t n);
+ray_t* ray_insert_fn(ray_t** args, int64_t n);
+ray_t* ray_upsert_fn(ray_t** args, int64_t n);
+ray_t* ray_xbar_fn(ray_t* col, ray_t* bucket);
+ray_t* ray_left_join_fn(ray_t** args, int64_t n);
+ray_t* ray_inner_join_fn(ray_t** args, int64_t n);
+ray_t* ray_anti_join_fn(ray_t** args, int64_t n);
+ray_t* ray_window_join_fn(ray_t** args, int64_t n);
+ray_t* ray_asof_join_fn(ray_t** args, int64_t n);
+
+/* Convenience wrapper: forwards to atomic_map_binary_op with dag_opcode 0 ("no DAG opcode"). */
+static inline ray_t* atomic_map_binary(ray_binary_fn fn, ray_t* left, ray_t* right) {
+    return atomic_map_binary_op(fn, 0, left, right);
+}
+
+#endif /* RAY_LANG_INTERNAL_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/nfo.c b/crates/rayforce-sys/vendor/rayforce/src/lang/nfo.c
new file mode 100644
index 0000000..8169889
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/lang/nfo.c
@@ -0,0 +1,100 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "nfo.h"
+#include <stdint.h>
+
+ray_t* ray_nfo_create(const char* filename, size_t fname_len,
+                      const char* source,   size_t src_len) {
+    /* part[] is laid out exactly like the finished list:
+     * [0] filename, [1] source, [2] keys, [3] vals. */
+    ray_t* part[4];
+
+    part[0] = ray_str(filename, fname_len);
+    if (RAY_IS_ERR(part[0])) return part[0];
+
+    part[1] = ray_str(source, src_len);
+    if (RAY_IS_ERR(part[1])) { ray_release(part[0]); return part[1]; }
+
+    part[2] = ray_vec_new(RAY_I64, 0);
+    if (RAY_IS_ERR(part[2])) { ray_release(part[0]); ray_release(part[1]); return part[2]; }
+
+    part[3] = ray_vec_new(RAY_I64, 0);
+    if (RAY_IS_ERR(part[3])) { ray_release(part[0]); ray_release(part[1]); ray_release(part[2]); return part[3]; }
+
+    /* Raw block sized for four pointers, tagged as a list by hand. */
+    ray_t* list = ray_alloc(4 * sizeof(ray_t*));
+    if (!list || RAY_IS_ERR(list)) {
+        for (int k = 0; k < 4; k++) ray_release(part[k]);
+        return ray_error("oom", NULL);
+    }
+    list->type = RAY_LIST;
+    list->len = 4;
+    /* The list takes over the single reference each part already carries. */
+    for (int k = 0; k < 4; k++) ((ray_t**)ray_data(list))[k] = part[k];
+    return list;
+}
+
+void ray_nfo_insert(ray_t* nfo, ray_t* node, ray_span_t span) {
+    ray_t* old_keys = NFO_KEYS(nfo);
+    ray_t* old_vals = NFO_VALS(nfo);
+    if (old_keys == NULL || old_vals == NULL) return;
+
+    /* Key is the node's address; value is the packed span id. */
+    int64_t node_addr = (intptr_t)node;
+    int64_t span_id   = span.id;
+    ray_t* grown_keys = ray_vec_append(old_keys, &node_addr);
+    ray_t* grown_vals = ray_vec_append(old_vals, &span_id);
+
+    /* A different pointer back means the vector moved.  Per the original
+     * note, ray_vec_append has then already transferred ownership and the
+     * old pointer is dead, so the slot is overwritten with no retain/release.
+     * NOTE(review): neither result is tested with RAY_IS_ERR — confirm. */
+    ray_t** slot = (ray_t**)ray_data(nfo);
+    if (grown_keys != old_keys) slot[2] = grown_keys;
+    if (grown_vals != old_vals) slot[3] = grown_vals;
+}
+
+ray_span_t ray_nfo_get(ray_t* nfo, ray_t* node) {
+    ray_span_t found = { .id = 0 };   /* id 0 doubles as "nothing recorded" */
+    if (!nfo) return found;
+
+    ray_t* key_vec = NFO_KEYS(nfo);
+    ray_t* val_vec = NFO_VALS(nfo);
+    if (!key_vec || !val_vec) return found;
+
+    /* keys[] holds node addresses, vals[] the span id stored alongside. */
+    int64_t  wanted = (intptr_t)node;
+    int64_t  count  = ray_len(key_vec);
+    int64_t* keys   = (int64_t*)ray_data(key_vec);
+    int64_t* vals   = (int64_t*)ray_data(val_vec);
+
+    /* Linear scan from the front; the first matching key wins. */
+    int64_t at = 0;
+    while (at < count && keys[at] != wanted)
+        at++;
+
+    if (at < count)
+        found.id = vals[at];
+    return found;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/nfo.h b/crates/rayforce-sys/vendor/rayforce/src/lang/nfo.h
new file mode 100644
index 0000000..fe42050
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/lang/nfo.h
@@ -0,0 +1,69 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_NFO_H
+#define RAY_NFO_H
+
+#include <rayforce.h>
+
+/* ===== Source Span ===== */
+
+/* 8-byte source location: four 16-bit fields (start_line, end_line, start_col,
+ * end_col, in that order) overlaid on one int64.  id == 0 means "no span". */
+typedef union ray_span_t {
+    int64_t id;                  /* the whole span viewed as one 64-bit value */
+    struct {
+        uint16_t start_line;     /* line of the span's first character */
+        uint16_t end_line;       /* line of the span's last character */
+        uint16_t start_col;      /* column of the span's first character */
+        uint16_t end_col;        /* column of the span's last character */
+    };
+} ray_span_t;
+
+/* ===== Nfo Object ===== */
+
+/* An nfo is a RAY_LIST with 4 elements:
+ *   [0] filename  (RAY_STR atom)
+ *   [1] source    (RAY_STR atom)
+ *   [2] keys      (RAY_I64 vector — intptr_t node pointers)
+ *   [3] vals      (RAY_I64 vector — span ids)
+ */
+
+#define NFO_FILENAME(nfo)  ray_list_get((nfo), 0)
+#define NFO_SOURCE(nfo)    ray_list_get((nfo), 1)
+#define NFO_KEYS(nfo)      ray_list_get((nfo), 2)
+#define NFO_VALS(nfo)      ray_list_get((nfo), 3)
+
+/* Create a new nfo object for the given source file.
+ * Returns a RAY_LIST or ray_error() on failure. */
+ray_t* ray_nfo_create(const char* filename, size_t fname_len,
+                      const char* source,   size_t src_len);
+
+/* Record the source span for an AST node. */
+void ray_nfo_insert(ray_t* nfo, ray_t* node, ray_span_t span);
+
+/* Look up the source span for an AST node.
+ * Returns a span with id==0 if not found. */
+ray_span_t ray_nfo_get(ray_t* nfo, ray_t* node);
+
+#endif /* RAY_NFO_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/parse.c b/crates/rayforce-sys/vendor/rayforce/src/lang/parse.c
new file mode 100644
index 0000000..213a685
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/lang/parse.c
@@ -0,0 +1,881 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "lang/parse.h"
+#include "lang/nfo.h"
+#include "lang/env.h"
+#include "core/numparse.h"
+#include "table/sym.h"   /* RAY_SYM_W64 */
+#include <string.h>
+#include <limits.h>
+#include <stdint.h>
+#include <math.h>
+
+/* ══════════════════════════════════════════
+ * ASCII dispatch table (128 bytes)
+ * PA(c): one indexed read for c < 128; any other byte maps to PA_ERR.
+ * ══════════════════════════════════════════ */
+
+#define PA_ERR     0
+#define PA_DIGIT   1
+#define PA_ALPHA   2
+#define PA_STRING  3
+#define PA_QUOTE   4    /* ' symbol prefix */
+#define PA_LPAREN  5
+#define PA_RPAREN  6
+#define PA_LBRACK  7
+#define PA_RBRACK  8
+#define PA_LBRACE  9
+#define PA_RBRACE  10
+#define PA_COLON   11
+#define PA_WS      12
+#define PA_END     13
+#define PA_MINUS   14
+#define PA_SEMI    15   /* ; comment */
+
+static const char _PA[128] =
+/*  NUL (PA_END); \t at 0x09, \n at 0x0a, \r at 0x0d are PA_WS; rest PA_ERR */
+    "\x0d\x00\x00\x00\x00\x00\x00\x00\x00\x0c\x0c\x00\x00\x0c\x00\x00"
+/*  0x10 .. 0x1f: remaining control characters, all PA_ERR          */
+    "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+/*  SP   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /  */
+    "\x0c\x02\x03\x02\x02\x02\x02\x04\x05\x06\x02\x02\x02\x0e\x02\x02"
+/*  0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?  */
+    "\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x0b\x0f\x02\x02\x02\x02"
+/*  @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O  */
+    "\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02"
+/*  P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _  */
+    "\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x07\x00\x08\x02\x02"
+/*  `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o  */
+    "\x00\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02"
+/*  p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~   DEL */
+    "\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x09\x02\x0a\x02\x00";
+
+#define PA(c) ((unsigned char)(c) < 128 ? (int)(unsigned char)_PA[(unsigned char)(c)] : PA_ERR)
+
+/* ══════════════════════════════════════════
+ * Parser state
+ * ══════════════════════════════════════════ */
+
+typedef struct {
+    const char *src;    /* presumably the start of the input buffer — TODO confirm */
+    const char *pos;    /* read cursor: the next byte to be consumed */
+    int32_t line;       /* incremented for every '\n' consumed */
+    int32_t col;        /* incremented per other byte, reset to 0 at '\n' */
+    ray_t  *nfo;        /* span table fed via ray_nfo_insert; NULL disables recording */
+} ray_parser_t;
+
+static void advance(ray_parser_t *p, int32_t n) {
+    /* Consume n bytes: '\n' opens a new line at column 0, any other byte is one column. */
+    const char *stop = p->pos + n;
+    for (const char *c = p->pos; c < stop; c++)
+        if (*c == '\n') { p->line++; p->col = 0; } else p->col++;
+    p->pos = stop;
+}
+
+/* Re-sync line/col after p->pos was moved directly: replays [old_pos, p->pos). */
+static void fixup_pos(ray_parser_t *p, const char *old_pos) {
+    while (old_pos < p->pos) {
+        if (*old_pos++ != '\n') { p->col++; continue; }
+        p->line++; p->col = 0;
+    }
+}
+
+/* Store the span running from (sl, sc) to the parser's current position for
+ * `node`; does nothing without an nfo table or when node is an error object. */
+static void nfo_record(ray_parser_t *p, ray_t *node,
+                        int32_t sl, int32_t sc) {
+    if (p->nfo == NULL || RAY_IS_ERR(node)) return;
+    int32_t last_col = p->col > 0 ? p->col - 1 : 0;   /* column of the last consumed byte */
+    ray_span_t where;
+    where.start_line = (uint16_t)sl;      where.start_col = (uint16_t)sc;
+    where.end_line   = (uint16_t)p->line; where.end_col   = (uint16_t)last_col;
+    ray_nfo_insert(p->nfo, node, where);
+}
+
+static void skip_ws_and_comments(ray_parser_t *p) {
+    /* Eat blanks, tabs, CR, LF and ';' line comments until something else
+     * (including the terminating NUL) is under the cursor. */
+    for (;;) {
+        char ch = *p->pos;
+        if (ch == '\n') { p->line++; p->col = 0; p->pos++; }
+        else if (ch == ' ' || ch == '\t' || ch == '\r') { p->col++; p->pos++; }
+        else if (ch == ';') {
+            /* Comment body runs up to, not including, the '\n' or NUL. */
+            do { p->col++; p->pos++; } while (*p->pos && *p->pos != '\n');
+        }
+        else return;
+    }
+}
+
+/* Forward declarations */
+static ray_t* parse_expr(ray_parser_t *p);
+
+/* ── Date/time/timestamp helpers ── */
+
+#include "lang/cal.h"
+
+#define PARSE_NSECS_IN_DAY ((int64_t)24 * 60 * 60 * 1000000000LL)
+
+/* Try to parse a time literal [-]HH:MM:SS[.f[f[f]]] starting from 'start'.
+ * Returns the char past the end on success, NULL on failure (*ms_out untouched).
+ * Writes the signed millisecond value into *ms_out; ".5" means 500 ms. */
+static const char* try_parse_time(const char* start, int32_t *ms_out) {
+    const char* c = start;
+    int sign = 1;
+    if (*c == '-') { sign = -1; c++; }
+
+    /* HH — && short-circuits, so c[1] is only read when c[0] is a digit */
+    if (!(c[0] >= '0' && c[0] <= '9' && c[1] >= '0' && c[1] <= '9')) return NULL;
+    int hh = (c[0] - '0') * 10 + (c[1] - '0'); c += 2;
+    if (*c != ':') return NULL;
+    c++;
+
+    /* MM */
+    if (!(c[0] >= '0' && c[0] <= '9' && c[1] >= '0' && c[1] <= '9')) return NULL;
+    int mm = (c[0] - '0') * 10 + (c[1] - '0'); c += 2;
+    if (*c != ':') return NULL;
+    c++;
+
+    /* SS */
+    if (!(c[0] >= '0' && c[0] <= '9' && c[1] >= '0' && c[1] <= '9')) return NULL;
+    int ss = (c[0] - '0') * 10 + (c[1] - '0'); c += 2;
+
+    /* .f[f[f]] — fraction of a second: each digit is weighted by its place */
+    int ms = 0;
+    if (*c == '.') {
+        c++;
+        if (!(*c >= '0' && *c <= '9')) return NULL;
+        ms = (*c - '0') * 100; c++;                                   /* tenths */
+        if (*c >= '0' && *c <= '9') { ms += (*c - '0') * 10; c++; }   /* hundredths */
+        if (*c >= '0' && *c <= '9') { ms += (*c - '0'); c++; }        /* thousandths */
+    }
+
+    *ms_out = sign * (int32_t)((hh * 3600 + mm * 60 + ss) * 1000 + ms);
+    return c;
+}
+
+/* ── Number parsing (with hex, nulls, typed suffixes, date/time/timestamp); returns an atom or ray_error ── */
+static ray_t* parse_number(ray_parser_t *p) {
+    const char *start = p->pos;
+    int is_neg = 0;
+    if (*p->pos == '-') { is_neg = 1; p->pos++; }
+
+    /* Hex literal: 0x.. — yields a u8 atom; the value is cut to 8 bits and is_neg is not applied */
+    if (p->pos[0] == '0' && p->pos[1] == 'x') {
+        p->pos += 2;
+        uint64_t v;
+        size_t n = ray_parse_u64_hex(p->pos, SIZE_MAX, &v);
+        if (n == 0) return ray_error("parse", NULL);
+        p->pos += n;
+        return ray_u8((uint8_t)v);
+    }
+
+    /* Null literal: 0N{h,i,d,t,p,l,f,s} or bare 0N (defaults to i64 null). */
+    if (!is_neg && p->pos[0] == '0' && p->pos[1] == 'N') {
+        switch (p->pos[2]) {
+        case 'h': p->pos += 3; return ray_typed_null(-RAY_I16);
+        case 'i': p->pos += 3; return ray_typed_null(-RAY_I32);
+        case 'd': p->pos += 3; return ray_typed_null(-RAY_DATE);
+        case 't': p->pos += 3; return ray_typed_null(-RAY_TIME);
+        case 'p': p->pos += 3; return ray_typed_null(-RAY_TIMESTAMP);
+        case 'l': p->pos += 3; return ray_typed_null(-RAY_I64);
+        case 'f': p->pos += 3; return ray_typed_null(-RAY_F64);
+        case 's': p->pos += 3; return ray_typed_null(-RAY_SYM);
+        }
+        /* Bare 0N: only if the next char is not an identifier continuation
+         * (letter/digit/underscore), else fall through to plain number. */
+        char c2 = p->pos[2];
+        if (!((c2 >= 'a' && c2 <= 'z') || (c2 >= 'A' && c2 <= 'Z') ||
+              (c2 >= '0' && c2 <= '9') || c2 == '_')) {
+            p->pos += 2;
+            return ray_typed_null(-RAY_I64);
+        }
+    }
+
+    /* Scan digits */
+    const char *dstart = p->pos;
+    while (*p->pos >= '0' && *p->pos <= '9') p->pos++;
+    int ndigits = (int)(p->pos - dstart);
+
+    /* Date/Timestamp: YYYY.MM.DD or YYYY.MM.DDDhh:mm:ss.nnnnnnnnn */
+    if (ndigits == 4 && !is_neg && *p->pos == '.' &&
+        p->pos[1] >= '0' && p->pos[1] <= '9' &&
+        p->pos[2] >= '0' && p->pos[2] <= '9' &&
+        p->pos[3] == '.') {
+        int year = (int)ray_parse_4_digits(dstart);
+        p->pos++; /* skip first '.' */
+        int month = (p->pos[0] - '0') * 10 + (p->pos[1] - '0');
+        p->pos += 2;
+        if (*p->pos != '.') { p->pos = start; goto plain_number; }
+        p->pos++; /* skip second '.' */
+        if (!(p->pos[0] >= '0' && p->pos[0] <= '9' &&
+              p->pos[1] >= '0' && p->pos[1] <= '9')) {
+            p->pos = start; goto plain_number;
+        }
+        int day = (p->pos[0] - '0') * 10 + (p->pos[1] - '0');
+        p->pos += 2;
+
+        int32_t days = ymd_to_date(year, month, day);
+
+        /* Check for timestamp separator 'D' */
+        if (*p->pos == 'D') {
+            p->pos++; /* skip D */
+            /* Parse HH:MM:SS.nnnnnnnnn.  The whole "HH:MM:SS." shape is checked
+             * before anything is consumed: && stops at the first mismatch, so a
+             * truncated literal (e.g. one ending right after "HH:") is rejected
+             * without ever reading past the terminating NUL. */
+            const char* t = p->pos;
+            if (!(t[0] >= '0' && t[0] <= '9' && t[1] >= '0' && t[1] <= '9' && t[2] == ':' &&
+                  t[3] >= '0' && t[3] <= '9' && t[4] >= '0' && t[4] <= '9' && t[5] == ':' &&
+                  t[6] >= '0' && t[6] <= '9' && t[7] >= '0' && t[7] <= '9' && t[8] == '.'))
+                return ray_error("parse", NULL);
+            int hh = (t[0] - '0') * 10 + (t[1] - '0');
+            int mi = (t[3] - '0') * 10 + (t[4] - '0');
+            int ss = (t[6] - '0') * 10 + (t[7] - '0');
+            p->pos += 9; /* step over "HH:MM:SS." */
+            /* Parse fractional seconds (up to 9 digits for nanoseconds) */
+            const char* fstart = p->pos;
+            while (*p->pos >= '0' && *p->pos <= '9') p->pos++;
+            int flen = (int)(p->pos - fstart);
+            uint64_t nanos = 0;
+            for (int i = 0; i < flen && i < 9; i++)
+                nanos = nanos * 10 + (uint64_t)(fstart[i] - '0');
+            /* Pad to 9 digits */
+            for (int i = flen; i < 9; i++) nanos *= 10;
+
+            int64_t day_ns = (int64_t)days * PARSE_NSECS_IN_DAY;
+            int64_t time_ns = ((int64_t)hh * 3600 + mi * 60 + ss) * 1000000000LL + (int64_t)nanos;
+            return ray_timestamp(day_ns + time_ns);
+        }
+
+        return ray_date(days);
+    }
+
+    /* Time literal: HH:MM:SS.mmm (detected by colon after 2 digits from digit-start) */
+    if (ndigits == 2 && *p->pos == ':') {
+        p->pos = start; /* reset — let try_parse_time handle sign */
+        int32_t ms;
+        const char* end = try_parse_time(start, &ms);
+        if (end) { p->pos = end; return ray_time(ms); }
+        /* Not a valid time — fall through to regular number parsing */
+        p->pos = start;
+        if (is_neg) p->pos++;
+        while (*p->pos >= '0' && *p->pos <= '9') p->pos++;
+    }
+
+plain_number:;
+    /* At this point p->pos is past the digits. Check for float */
+    int is_float = 0;
+    if (*p->pos == '.' && p->pos[1] >= '0' && p->pos[1] <= '9') {
+        is_float = 1;
+        p->pos++;
+        while (*p->pos >= '0' && *p->pos <= '9') p->pos++;
+    }
+    if (*p->pos == 'e' || *p->pos == 'E') {
+        is_float = 1;
+        p->pos++;
+        if (*p->pos == '+' || *p->pos == '-') p->pos++;
+        while (*p->pos >= '0' && *p->pos <= '9') p->pos++;
+    }
+
+    size_t span = (size_t)(p->pos - start);
+
+    if (is_float) {
+        double v = 0.0;
+        if (ray_parse_f64(start, span, &v) == 0)
+            return ray_error("parse", NULL);
+        return ray_f64(v);
+    }
+
+    /* Integer parse — overflow signalled by `n == 0` (digits present but
+     * value didn't fit int64).  Promote to f64 in that case, matching the
+     * historical strtoll/ERANGE → strtod behavior. */
+    int64_t v = 0;
+    size_t n = ray_parse_i64(start, span, &v);
+    if (n == 0) {
+        double fv = 0.0;
+        if (ray_parse_f64(start, span, &fv) == 0)
+            return ray_error("parse", NULL);
+        return ray_f64(fv);
+    }
+
+    /* Type suffix: h (i16), i (i32); out-of-range values give a "domain" error */
+    if (*p->pos == 'h') {
+        p->pos++;
+        if (v < -32767 || v > 32767) return ray_error("domain", NULL);
+        return ray_i16((int16_t)v);
+    }
+    if (*p->pos == 'i') {
+        p->pos++;
+        if (v < -2147483647LL || v > 2147483647LL) return ray_error("domain", NULL);
+        return ray_i32((int32_t)v);
+    }
+
+    return ray_i64(v);
+}
+
+/* ── String parsing with escape sequence decoding; p->pos must sit on the opening quote ── */
+static ray_t* parse_string(ray_parser_t *p) {
+    p->pos++; /* skip opening " */
+    const char *start = p->pos;
+
+    /* First pass: scan for closing " and check for escapes */
+    bool has_escape = false;
+    const char *scan = p->pos;
+    while (*scan && *scan != '"') {
+        if (*scan == '\\' && scan[1]) { has_escape = true; scan++; } /* step over the escaped char so \" does not end the string */
+        scan++;
+    }
+    size_t raw_len = (size_t)(scan - start);
+    if (*scan != '"') return ray_error("parse", NULL); /* unterminated string */
+    scan++;
+    p->pos = scan; /* cursor now sits past the closing quote, even if decoding fails below */
+
+    if (!has_escape) return ray_str(start, raw_len); /* fast path: raw bytes, no length cap */
+
+    /* Decode escape sequences into a temporary buffer */
+    char buf[4096];
+    size_t out = 0;
+    const char *r = start;
+    const char *end = start + raw_len;
+    while (r < end) {
+        if (out >= sizeof(buf) - 2) /* keep room for the two bytes an unknown escape emits */
+            return ray_error("domain", NULL);  /* string too long for escape buffer */
+        if (*r == '\\' && r + 1 < end) {
+            r++;
+            switch (*r) {
+            case 'n':  buf[out++] = '\n'; r++; break;
+            case 't':  buf[out++] = '\t'; r++; break;
+            case 'r':  buf[out++] = '\r'; r++; break;
+            case '\\': buf[out++] = '\\'; r++; break;
+            case '"':  buf[out++] = '"';  r++; break;
+            case '0': case '1': case '2': case '3':
+            case '4': case '5': case '6': case '7': {
+                /* Octal escape: \OOO (1-3 digits); the result is narrowed to one char */
+                char ch = (char)(*r - '0'); r++;
+                if (r < end && *r >= '0' && *r <= '7') {
+                    ch = (char)((ch << 3) | (*r - '0')); r++;
+                    if (r < end && *r >= '0' && *r <= '7') {
+                        ch = (char)((ch << 3) | (*r - '0')); r++;
+                    }
+                }
+                buf[out++] = ch;
+                break;
+            }
+            default:   buf[out++] = '\\'; buf[out++] = *r; r++; break; /* unknown escape: keep backslash and char verbatim */
+            }
+        } else {
+            buf[out++] = *r++;
+        }
+    }
+    return ray_str(buf, out);
+}
+
+/* ── Symbol/char parsing: 'name -> symbol atom, 'a' or '\n' -> 1-char string, bare ' -> null symbol ── */
+static ray_t* parse_symbol(ray_parser_t *p) {
+    p->pos++; /* skip ' */
+    const char *start = p->pos;
+
+    /* Empty symbol (bare tick at end or before terminator) */
+    if (*p->pos == 0 || *p->pos == ' ' || *p->pos == '\t' || *p->pos == '\n' ||
+        *p->pos == ')' || *p->pos == ']' || *p->pos == '}') {
+        /* Null symbol 0Ns */
+        return ray_typed_null(-RAY_SYM);
+    }
+
+    /* Char literal: 'X' or '\n' etc. */
+    if (*p->pos == '\\') {
+        /* Escape sequence char literal; esc points at the char after the backslash */
+        const char *esc = p->pos + 1;
+        char ch;
+        int esc_len = 1;
+        switch (*esc) {
+        case 'n':  ch = '\n'; break;
+        case 'r':  ch = '\r'; break;
+        case 't':  ch = '\t'; break;
+        case '\\': ch = '\\'; break;
+        case '\'': ch = '\''; break;
+        case '0': case '1': case '2': case '3':
+        case '4': case '5': case '6': case '7': {
+            /* Octal escape: \OOO (1-3 digits, esc_len counts them) */
+            ch = (char)(*esc - '0');
+            if (esc[1] >= '0' && esc[1] <= '7') {
+                ch = (char)((ch << 3) | (esc[1] - '0'));
+                if (esc[2] >= '0' && esc[2] <= '7') {
+                    ch = (char)((ch << 3) | (esc[2] - '0'));
+                    esc_len = 3;
+                } else {
+                    esc_len = 2;
+                }
+            }
+            break;
+        }
+        default: ch = *esc; break;
+        }
+        if (*esc != '\0' && esc[esc_len] == '\'') {
+            /* Closing quote found — it's a char literal (*esc != 0 keeps esc[esc_len] inside the NUL-terminated input) */
+            p->pos = esc + esc_len + 1;
+            return ray_str(&ch, 1);
+        }
+        /* Not a char literal — fall through to symbol parsing */
+    } else if (start[1] == '\'') {
+        /* Simple char literal like 'a' (start[0] is non-NUL here, so start[1] is readable) */
+        char ch = *start;
+        p->pos = start + 2; /* skip char + closing quote */
+        return ray_str(&ch, 1);
+    }
+
+    /* Regular symbol: letters, digits, '_' and '.' */
+    while (PA(*p->pos) == PA_ALPHA || PA(*p->pos) == PA_DIGIT || *p->pos == '_' || *p->pos == '.')
+        p->pos++;
+    size_t len = (size_t)(p->pos - start);
+    if (len == 0) return ray_typed_null(-RAY_SYM); /* empty symbol */
+    int64_t id = ray_sym_intern(start, len);
+    return ray_sym(id);
+}
+
+/* ── Name parsing: true/false become bool atoms, anything else a symbol atom flagged RAY_ATTR_NAME ── */
+static ray_t* parse_name(ray_parser_t *p) {
+    const char *start = p->pos;
+    /* Name chars: alpha, digit, _, ., -, !, ?, +, *, /, %, <, >, =, &, | */
+    while (PA(*p->pos) == PA_ALPHA || PA(*p->pos) == PA_DIGIT
+           || *p->pos == '_' || *p->pos == '.' || *p->pos == '-'
+           || *p->pos == '!' || *p->pos == '?' || *p->pos == '+'
+           || *p->pos == '*' || *p->pos == '/' || *p->pos == '%'
+           || *p->pos == '<' || *p->pos == '>' || *p->pos == '='
+           || *p->pos == '&' || *p->pos == '|')
+        p->pos++;
+    size_t len = (size_t)(p->pos - start);
+    if (len == 0) return ray_error("parse", NULL); /* current char cannot start a name */
+
+    /* Check for true/false */
+    if (len == 4 && memcmp(start, "true", 4) == 0)  return ray_bool(true);
+    if (len == 5 && memcmp(start, "false", 5) == 0) return ray_bool(false);
+    /* null is handled as a name that resolves to NULL at eval time */
+
+    /* Return as name symbol (with RAY_ATTR_NAME flag) */
+    int64_t id = ray_sym_intern(start, len);
+    ray_t* s = ray_sym(id);
+    if (!RAY_IS_ERR(s)) s->attrs |= RAY_ATTR_NAME; /* an error result is returned untouched */
+    return s;
+}
+
+/* ── Vector literal: [1 2 3] — yields a typed vector, or an error for mixed/unsupported element types ── */
+static ray_t* parse_vector(ray_parser_t *p) {
+    advance(p, 1); /* skip [ */
+
+    /* Collect parsed elements into a temporary array (at most 4096; more is a "limit" error) */
+    ray_t* elems[4096];
+    int32_t count = 0;
+
+    skip_ws_and_comments(p);
+    while (*p->pos && *p->pos != ']') {
+        if (count >= 4096) {
+            for (int32_t i = 0; i < count; i++) ray_release(elems[i]);
+            return ray_error("limit", NULL);
+        }
+        ray_t* elem = parse_expr(p);
+        if (RAY_IS_ERR(elem)) {
+            for (int32_t i = 0; i < count; i++) ray_release(elems[i]);
+            return elem;
+        }
+        elems[count++] = elem;
+        skip_ws_and_comments(p);
+    }
+    if (*p->pos != ']') { /* input ended before the closing bracket */
+        for (int32_t i = 0; i < count; i++) ray_release(elems[i]);
+        return ray_error("parse", NULL);
+    }
+    advance(p, 1); /* skip ] */
+
+    if (count == 0) {
+        /* Empty vector -> empty i64 vector */
+        return ray_vec_new(RAY_I64, 0);
+    }
+
+    /* Determine element types.
+     * A name parsed inside [...] has its RAY_ATTR_NAME flag cleared in the
+     * loop below, so it is compared and stored as a plain symbol atom. */
+    int8_t first_type = elems[0]->type;
+    bool homogeneous = true;
+    bool has_float = (first_type == -RAY_F64);
+    bool has_int   = (first_type == -RAY_I64);
+    bool all_numeric = (first_type == -RAY_I64 || first_type == -RAY_F64);
+
+    for (int32_t i = 0; i < count; i++) {
+        /* Inside [...], names are symbol literals, not variable references */
+        if (elems[i]->attrs & RAY_ATTR_NAME) {
+            elems[i]->attrs &= ~RAY_ATTR_NAME;
+            /* type is already -RAY_SYM from parse_expr */
+        }
+        if (i == 0) continue; /* element 0 already seeded the flags above */
+        int8_t t = elems[i]->type;
+        if (t != first_type) homogeneous = false;
+        if (t == -RAY_F64)      has_float = true;
+        else if (t == -RAY_I64) has_int = true;
+        if (t != -RAY_I64 && t != -RAY_F64) all_numeric = false;
+    }
+
+    /* All same atom type -> typed vector (atom type tags are negative, vector tags their negation) */
+    if (homogeneous && first_type < 0) {
+        int8_t vec_type = -first_type;
+        ray_t* vec = ray_vec_new(vec_type, count);
+        if (RAY_IS_ERR(vec)) {
+            for (int32_t i = 0; i < count; i++) ray_release(elems[i]);
+            return vec;
+        }
+        switch (vec_type) {
+            case RAY_I64: case RAY_TIMESTAMP: {
+                int64_t* d = (int64_t*)ray_data(vec);
+                for (int32_t i = 0; i < count; i++) d[i] = elems[i]->i64;
+                break;
+            }
+            case RAY_F64: {
+                double* d = (double*)ray_data(vec);
+                for (int32_t i = 0; i < count; i++) d[i] = elems[i]->f64;
+                break;
+            }
+            case RAY_I32: case RAY_DATE: case RAY_TIME: {
+                int32_t* d = (int32_t*)ray_data(vec);
+                for (int32_t i = 0; i < count; i++) d[i] = elems[i]->i32;
+                break;
+            }
+            case RAY_I16: {
+                int16_t* d = (int16_t*)ray_data(vec);
+                for (int32_t i = 0; i < count; i++) d[i] = elems[i]->i16;
+                break;
+            }
+            case RAY_BOOL: {
+                bool* d = (bool*)ray_data(vec);
+                for (int32_t i = 0; i < count; i++) d[i] = elems[i]->b8;
+                break;
+            }
+            case RAY_SYM: {
+                int64_t* d = (int64_t*)ray_data(vec);
+                for (int32_t i = 0; i < count; i++) d[i] = elems[i]->i64;
+                break;
+            }
+            case RAY_U8: {
+                uint8_t* d = (uint8_t*)ray_data(vec);
+                for (int32_t i = 0; i < count; i++) d[i] = elems[i]->u8;
+                break;
+            }
+            case RAY_STR: {
+                /* String vectors are built with ray_str_vec_append into a fresh svec; vec from above is freed unused */
+                ray_t* svec = ray_vec_new(RAY_STR, count);
+                if (RAY_IS_ERR(svec)) {
+                    ray_free(vec);
+                    for (int32_t i = 0; i < count; i++) ray_release(elems[i]);
+                    return svec;
+                }
+                for (int32_t i = 0; i < count; i++) {
+                    const char* s = ray_str_ptr(elems[i]);
+                    size_t slen = ray_str_len(elems[i]);
+                    svec = ray_str_vec_append(svec, s, slen);
+                    if (RAY_IS_ERR(svec)) {
+                        for (int32_t j = 0; j < count; j++) ray_release(elems[j]);
+                        ray_free(vec);
+                        return svec;
+                    }
+                }
+                ray_free(vec);
+                for (int32_t i = 0; i < count; i++) ray_release(elems[i]);
+                return svec;
+            }
+            default: ray_free(vec); goto boxed_list; /* atom type with no case above */
+        }
+        vec->len = count;
+        for (int32_t i = 0; i < count; i++) { /* carry atom nulls over, then drop the atoms */
+            if (RAY_ATOM_IS_NULL(elems[i]))
+                ray_vec_set_null(vec, i, true);
+            ray_release(elems[i]);
+        }
+        return vec;
+    }
+
+    /* Mixed int/float -> promote to f64 */
+    if (has_float && has_int && all_numeric) {
+        ray_t* vec = ray_vec_new(RAY_F64, count);
+        if (RAY_IS_ERR(vec)) {
+            for (int32_t i = 0; i < count; i++) ray_release(elems[i]);
+            return vec;
+        }
+        double* d = (double*)ray_data(vec);
+        for (int32_t i = 0; i < count; i++) {
+            d[i] = (elems[i]->type == -RAY_F64) ? elems[i]->f64
+                                                 : (double)elems[i]->i64;
+        }
+        vec->len = count;
+        for (int32_t i = 0; i < count; i++) { /* carry atom nulls over, then drop the atoms */
+            if (RAY_ATOM_IS_NULL(elems[i]))
+                ray_vec_set_null(vec, i, true);
+            ray_release(elems[i]);
+        }
+        return vec;
+    }
+
+boxed_list:
+    /* Mixed or unsupported element types — despite the label no boxed list is built; domain error */
+    for (int32_t i = 0; i < count; i++) ray_release(elems[i]);
+    return ray_error("domain", NULL);
+}
+
+/* ── Dict literal: {key: val key: val ...} ──
+ *
+ * Builds a RAY_DICT block holding [keys, vals].
+ * Keys are emitted as a RAY_SYM vector when every key is a bareword sym
+ * literal, as a RAY_STR vector when every key is a quoted string literal,
+ * or as a heterogeneous RAY_LIST otherwise.  Values stay unevaluated in
+ * a RAY_LIST so dict literals remain self-evaluating (the (dict ...)
+ * builtin evaluates them on demand).
+ */
+static ray_t* parse_dict(ray_parser_t *p) {
+    advance(p, 1); /* skip { */
+
+    /* Build keys+vals as generic RAY_LISTs of atoms first (each created
+     * with ray_list_new(8) and extended via ray_list_append); keys are
+     * narrowed to a typed vector afterwards when homogeneous. */
+    ray_t* key_list = ray_list_new(8);
+    if (RAY_IS_ERR(key_list)) return key_list;
+    ray_t* vals = ray_list_new(8);
+    if (RAY_IS_ERR(vals)) { ray_release(key_list); return vals; }
+
+    bool all_sym = true; /* cleared by the first quoted-string key */
+    bool all_str = true; /* cleared by the first bareword key */
+
+    skip_ws_and_comments(p);
+    while (*p->pos && *p->pos != '}') {
+        ray_t* key_atom = NULL;
+        if (*p->pos == '"') {
+            const char *sk_before = p->pos;
+            key_atom = parse_string(p);
+            fixup_pos(p, sk_before);
+            if (RAY_IS_ERR(key_atom)) { ray_release(key_list); ray_release(vals); return key_atom; }
+            all_sym = false;
+        } else {
+            const char *kstart = p->pos; /* bareword key: letters, digits, '_' and '-' */
+            while (PA(*p->pos) == PA_ALPHA || PA(*p->pos) == PA_DIGIT
+                   || *p->pos == '_' || *p->pos == '-')
+                p->pos++;
+            p->col += (int32_t)(p->pos - kstart); /* pos was moved by hand, so advance the column here */
+            size_t klen = (size_t)(p->pos - kstart);
+            if (klen == 0) { ray_release(key_list); ray_release(vals); return ray_error("parse", NULL); }
+            int64_t kid = ray_sym_intern(kstart, klen);
+            key_atom = ray_sym(kid);
+            if (RAY_IS_ERR(key_atom)) { ray_release(key_list); ray_release(vals); return key_atom; }
+            all_str = false;
+        }
+
+        skip_ws_and_comments(p);
+        if (*p->pos != ':') { ray_release(key_atom); ray_release(key_list); ray_release(vals); return ray_error("parse", NULL); }
+        advance(p, 1);
+        skip_ws_and_comments(p);
+
+        ray_t* val = parse_expr(p);
+        if (RAY_IS_ERR(val)) { ray_release(key_atom); ray_release(key_list); ray_release(vals); return val; }
+
+        key_list = ray_list_append(key_list, key_atom);
+        ray_release(key_atom);
+        if (RAY_IS_ERR(key_list)) { ray_release(vals); ray_release(val); return key_list; }
+
+        vals = ray_list_append(vals, val);
+        ray_release(val);
+        if (RAY_IS_ERR(vals)) { ray_release(key_list); return vals; }
+
+        skip_ws_and_comments(p);
+    }
+    if (*p->pos != '}') { ray_release(key_list); ray_release(vals); return ray_error("parse", NULL); }
+    advance(p, 1); /* skip } */
+
+    /* Narrow keys to a typed vector when homogeneous. */
+    int64_t n_pairs = key_list->len;
+    ray_t** key_atoms = (ray_t**)ray_data(key_list);
+    ray_t* keys;
+    if (n_pairs > 0 && all_sym) {
+        keys = ray_sym_vec_new(RAY_SYM_W64, n_pairs);
+        if (RAY_IS_ERR(keys)) { ray_release(key_list); ray_release(vals); return keys; }
+        for (int64_t i = 0; i < n_pairs; i++) {
+            int64_t id = key_atoms[i]->i64; /* the symbol id stored in the key atom */
+            keys = ray_vec_append(keys, &id);
+            if (RAY_IS_ERR(keys)) { ray_release(key_list); ray_release(vals); return keys; }
+        }
+        ray_release(key_list);
+    } else if (n_pairs > 0 && all_str) {
+        keys = ray_vec_new(RAY_STR, n_pairs);
+        if (RAY_IS_ERR(keys)) { ray_release(key_list); ray_release(vals); return keys; }
+        for (int64_t i = 0; i < n_pairs; i++) {
+            keys = ray_str_vec_append(keys, ray_str_ptr(key_atoms[i]), ray_str_len(key_atoms[i]));
+            if (RAY_IS_ERR(keys)) { ray_release(key_list); ray_release(vals); return keys; }
+        }
+        ray_release(key_list);
+    } else {
+        keys = key_list;  /* heterogeneous or empty — use the LIST as-is */
+    }
+    return ray_dict_new(keys, vals); /* NOTE(review): presumably takes over keys and vals — confirm ownership */
+}
+
+/* ── List (s-expression): (fn arg1 arg2 ...) ── */
+static ray_t* parse_list(ray_parser_t *p) {
+    advance(p, 1); /* consume the opening ( */
+    ray_t* items = ray_list_new(4);
+    if (RAY_IS_ERR(items)) return items;
+    /* One element per iteration until ')' or end of input; whitespace and
+     * comments are skipped before the first element and after each one. */
+    for (skip_ws_and_comments(p); *p->pos != '\0' && *p->pos != ')'; skip_ws_and_comments(p)) {
+        ray_t* item = parse_expr(p);
+        if (RAY_IS_ERR(item)) { ray_release(items); return item; }
+        items = ray_list_append(items, item);
+        ray_release(item); /* drop our reference once the append has been attempted */
+        if (RAY_IS_ERR(items)) return items;
+    }
+    /* Input ended before the closing paren: discard the partial list. */
+    if (*p->pos != ')') { ray_release(items); return ray_error("parse", NULL); }
+    advance(p, 1); /* consume the closing ) */
+    return items;
+}
+
+/* ── Main expression dispatch: skips whitespace/comments, then picks a parser from the class of the next char ── */
+static ray_t* parse_expr(ray_parser_t *p) {
+    skip_ws_and_comments(p);
+
+    int32_t sl = p->line, sc = p->col; /* start position, handed to nfo_record with the result */
+    const char *before = p->pos;
+    ray_t *result;
+
+    switch (PA(*p->pos)) {
+        case PA_END:    return ray_error("parse", NULL); /* nothing left to parse */
+        case PA_DIGIT:  result = parse_number(p); break;
+        case PA_MINUS:
+            if (p->pos[1] >= '0' && p->pos[1] <= '9')
+                result = parse_number(p);
+            else
+                result = parse_name(p);  /* standalone '-' or '-name' */
+            break;
+        case PA_ALPHA:  result = parse_name(p); break;
+        case PA_STRING: result = parse_string(p); break;
+        case PA_QUOTE:  result = parse_symbol(p); break;
+        case PA_LPAREN: result = parse_list(p); break;
+        case PA_LBRACK: result = parse_vector(p); break;
+        case PA_LBRACE: result = parse_dict(p); break;
+        case PA_RPAREN: return ray_error("parse", NULL); /* closer with no opener at this level */
+        case PA_RBRACK: return ray_error("parse", NULL);
+        case PA_RBRACE: return ray_error("parse", NULL);
+        case PA_COLON: {
+            /* Keyword literal :name — parse as symbol (like 'name) */
+            p->pos++;  /* skip : */
+            const char *kstart = p->pos;
+            while (PA(*p->pos) == PA_ALPHA || PA(*p->pos) == PA_DIGIT
+                   || *p->pos == '_' || *p->pos == '.' || *p->pos == '-'
+                   || *p->pos == '/' || *p->pos == '?')
+                p->pos++;
+            size_t klen = (size_t)(p->pos - kstart);
+            if (klen == 0) { result = ray_error("parse", "empty keyword"); break; }
+            int64_t kid = ray_sym_intern(kstart, klen);
+            result = ray_sym(kid); /* plain symbol atom: RAY_ATTR_NAME is not set here */
+            break;
+        }
+        default:        result = parse_name(p); break;  /* operators like +, *, etc. */
+    }
+
+    /* Fixup line/col: leaf parsers advance pos without updating line/col.
+     * Compound parsers (list/vector/dict) use advance() internally and
+     * call skip_ws_and_comments, so their line/col is already accurate. */
+    if (PA(*before) != PA_LPAREN && PA(*before) != PA_LBRACK && PA(*before) != PA_LBRACE)
+        fixup_pos(p, before);
+    nfo_record(p, result, sl, sc); /* called for error results too */
+    return result;
+}
+
+/* ── Internal parse driver (shared by public APIs): one form is returned as-is, several are wrapped in (do ...) ── */
+static ray_t* parse_source(ray_parser_t *p) {
+    ray_t* first = parse_expr(p);
+    if (RAY_IS_ERR(first)) return first;
+
+    /* Check if there are more expressions after the first */
+    skip_ws_and_comments(p);
+    if (*p->pos == '\0') return first;  /* single expression */
+
+    /* Multiple expressions: collect into (do expr1 expr2 ...) */
+    ray_t* exprs[256];
+    int32_t count = 0;
+    exprs[count++] = first;
+
+    while (*p->pos) {
+        if (count >= 256) {
+            for (int32_t i = 0; i < count; i++) ray_release(exprs[i]);
+            return ray_error("domain", NULL);  /* too many top-level expressions */
+        }
+        ray_t* expr = parse_expr(p);
+        if (RAY_IS_ERR(expr)) {
+            for (int32_t i = 0; i < count; i++) ray_release(exprs[i]);
+            return expr;
+        }
+        exprs[count++] = expr;
+        skip_ws_and_comments(p);
+    }
+
+    /* Build (do expr1 expr2 ...) list */
+    int32_t sl = p->line, sc = p->col; /* NOTE(review): read after all forms were parsed, i.e. the end-of-input position — confirm intended */
+    ray_t* do_list = ray_alloc((count + 1) * sizeof(ray_t*));
+    if (!do_list) {
+        for (int32_t i = 0; i < count; i++) ray_release(exprs[i]);
+        return ray_error("oom", NULL);
+    }
+    do_list->type = RAY_LIST;
+    do_list->len = 0; /* stays 0 until every slot is filled, so the release below sees an empty list */
+    ray_t** elems = (ray_t**)ray_data(do_list);
+    /* Build a name-reference atom for "do" so parsing is independent of runtime */
+    ray_t* do_sym = ray_alloc(0); /* no payload bytes requested; only the fields set below are used */
+    if (!do_sym) {
+        ray_release(do_list);
+        for (int32_t i = 0; i < count; i++) ray_release(exprs[i]);
+        return ray_error("oom", NULL);
+    }
+    do_sym->type = -RAY_SYM;
+    do_sym->attrs = RAY_ATTR_NAME;
+    do_sym->i64 = ray_sym_intern("do", 2);
+    elems[0] = do_sym;
+    for (int32_t i = 0; i < count; i++)
+        elems[i + 1] = exprs[i]; /* pointers are moved into the list; nothing is retained or released here */
+    do_list->len = count + 1;
+    nfo_record(p, do_list, sl, sc);
+    return do_list;
+}
+
+/* ── Public API ── */
+ray_t* ray_parse(const char* source) { /* parse without source-location tracking */
+    return ray_parse_with_nfo(source, NULL); /* nfo == NULL */
+}
+
+ray_t* ray_parse_with_nfo(const char* source, ray_t* nfo) { /* source: NUL-terminated text; nfo may be NULL */
+    if (!source) return ray_error("parse", NULL); /* a NULL source is reported as a parse error */
+    ray_parser_t p = {
+        .src  = source,
+        .pos  = source, /* cursor starts at the first byte */
+        .line = 0,      /* line and column both start at 0 */
+        .col  = 0,
+        .nfo  = nfo     /* stored as given, no check here */
+    };
+    return parse_source(&p);
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/parse.h b/crates/rayforce-sys/vendor/rayforce/src/lang/parse.h
new file mode 100644
index 0000000..ea08375
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/lang/parse.h
@@ -0,0 +1,39 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_PARSE_H
+#define RAY_PARSE_H
+
+#include <rayforce.h>
+
+/* Parse a Rayfall source string into a ray_t object tree.
+ * Returns a single expression, or a (do expr1 expr2 ...) list if the
+ * source contains multiple top-level forms; an error object on failure. */
+ray_t* ray_parse(const char* source);
+
+/* Parse with source-location tracking.  If nfo is non-NULL every AST
+ * node produced by the parser will have its span recorded in the nfo
+ * object (created via ray_nfo_create in lang/nfo.h); ray_parse passes NULL. */
+ray_t* ray_parse_with_nfo(const char* source, ray_t* nfo);
+
+#endif /* RAY_PARSE_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/syscmd.c b/crates/rayforce-sys/vendor/rayforce/src/lang/syscmd.c
new file mode 100644
index 0000000..4b97909
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/lang/syscmd.c
@@ -0,0 +1,359 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "lang/syscmd.h"
+/* Avoid both lang/internal.h and core/runtime.h here: they each
+ * transitively pull a different struct definition for `ray_vm_t`
+ * (lang/eval.h vs core/runtime.h) and we don't need the VM internals
+ * — only the runtime accessors and the lang-side builtins.  The
+ * runtime exposes its main poll via opaque-pointer accessors
+ * declared inline below. */
+#include "core/poll.h"
+#include "core/ipc.h"
+#include "core/profile.h"
+#include "lang/env.h"
+#include "table/sym.h"
+
+/* Forward decls of the bare runtime accessors — these are defined in
+ * core/runtime.c.  Pulling the full runtime.h would re-trigger the
+ * dual ray_vm_t typedef; one extern keeps us decoupled. */
+void* ray_runtime_get_poll(void);
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* The public header arrives via syscmd.h.  This helper used to come from
+ * lang/internal.h; it is defined here so this TU stays clear of the
+ * runtime/eval VM clash.  True for a non-NULL, non-error object with type < 0. */
+static inline int ray_is_atom_local(ray_t* x) { return !(x == NULL || RAY_IS_ERR(x) || x->type >= 0); }
+
+/* ══════════════════════════════════════════
+ * Argument parsing helpers — handlers receive a Rayfall ray_t* but
+ * the .sys.cmd / REPL paths arrive with a raw char slice.  These
+ * helpers coerce both into the typed value the handler wants,
+ * keeping the per-handler code free of input-shape branching.
+ * ══════════════════════════════════════════ */
+
+static int arg_is_null(const ray_t* arg) { /* 1 for a NULL pointer or when RAY_IS_NULL(arg) holds, else 0 */
+    return (arg == NULL) ? 1 : (RAY_IS_NULL(arg) ? 1 : 0);
+}
+
+/* Parse signed decimal int64 out of a ray_t (atom or string).  Returns 0 and
+ * sets *err=1 if the arg can't be coerced or a string's digits exceed INT64_MAX. */
+static int64_t arg_as_i64(ray_t* arg, int* err) {
+    *err = 0;
+    if (arg_is_null(arg)) { *err = 1; return 0; }
+    if (arg->type == -RAY_I64) return arg->i64;
+    if (arg->type == -RAY_I32) return (int64_t)arg->i32;
+    if (arg->type == -RAY_I16) return (int64_t)arg->i16;
+    if (arg->type == -RAY_U8)  return (int64_t)arg->u8;
+    if (arg->type == -RAY_BOOL) return (int64_t)arg->b8;
+    if (arg->type == -RAY_STR) {
+        const char* p = ray_str_ptr(arg);
+        size_t len = ray_str_len(arg), i = 0;
+        int64_t v = 0, sign = 1;
+        while (i < len && (p[i] == ' ' || p[i] == '\t')) i++;  /* leading blanks */
+        if (i < len && (p[i] == '+' || p[i] == '-')) { if (p[i] == '-') sign = -1; i++; }
+        if (i >= len || p[i] < '0' || p[i] > '9') { *err = 1; return 0; }  /* need at least one digit */
+        for (; i < len && p[i] >= '0' && p[i] <= '9'; i++) {
+            if (v > (INT64_MAX - (p[i] - '0')) / 10) { *err = 1; return 0; }  /* next step would overflow (UB) */
+            v = v * 10 + (p[i] - '0');
+        }
+        return sign * v;  /* anything after the digits is ignored */
+    }
+    *err = 1; return 0;  /* unsupported type */
+}
+
+/* ══════════════════════════════════════════
+ * Handlers
+ * ══════════════════════════════════════════ */
+
+/* timeit/t — toggle the profiler.
+ *   no arg → toggle
+ *   0      → disable
+ *   nonzero→ enable
+ * REPL callers get a status line and NULL; other callers get 1/0 as an i64 atom. */
+static ray_t* h_timeit(ray_t* arg, ray_syscmd_ctx_t* ctx) {
+    bool enable = !g_ray_profile.active;  /* no-arg default: flip the current state */
+    if (!arg_is_null(arg)) {
+        int bad = 0;
+        int64_t requested = arg_as_i64(arg, &bad);
+        if (bad) return ray_error("type", ":t expects an integer (0 = off, 1 = on)");
+        enable = (requested != 0);
+    }
+    g_ray_profile.active = enable;
+
+    /* Outside the REPL nothing is printed; the new state is the return value. */
+    if (!ctx || !ctx->repl) return ray_i64(enable ? 1 : 0);
+
+    const char* state = enable ? "on" : "off";
+    if (ctx->color) fprintf(stdout, "\033[1;33m");
+    fprintf(stdout, ". Timeit is %s.", state);
+    if (ctx->color) fprintf(stdout, "\033[0m");
+    fprintf(stdout, "\n");
+    return NULL;
+}
+
+/* listen N — bind an IPC listener on PORT using the runtime's main
+ * poll instance.  Errors with `type` if the argument is not an integer,
+ * `domain` if it is outside 1..65535, `nyi` if no main loop is wired
+ * (i.e. the host didn't call ray_runtime_create from main.c, or libray is
+ * used as an embedded library without a poll), and `io` if the bind fails
+ * (port in use, permission, etc.).  Returns the listener id on success. */
+static ray_t* h_listen(ray_t* arg, ray_syscmd_ctx_t* ctx) {
+    (void)ctx;
+    int err = 0;
+    int64_t port = arg_as_i64(arg, &err);
+    if (err) return ray_error("type", "listen expects a port number");
+    if (port <= 0 || port > 65535) return ray_error("domain", "listen: port out of range (1..65535)");
+
+    ray_poll_t* poll = (ray_poll_t*)ray_runtime_get_poll();
+    if (!poll) return ray_error("nyi", "listen: no main event loop attached");
+
+    int64_t id = ray_ipc_listen(poll, (uint16_t)port); /* cast cannot truncate: range checked above */
+    if (id < 0) {
+        int e = errno; /* NOTE(review): assumes ray_ipc_listen leaves errno describing the failure — confirm */
+        return ray_error("io", "listen: bind to port %lld failed: %s",
+                         (long long)port, strerror(e ? e : EADDRINUSE)); /* errno 0 falls back to EADDRINUSE text */
+    }
+    return ray_i64(id);
+}
+
+/* env — list defined globals.  REPL prints one "name type-label" row per
+ * global; the non-REPL path returns just the entry count (see h_env). */
+static const char* type_label_short(ray_t* v) {
+    if (!v) return "null";
+
+    /* Types with a dedicated label are matched by exact tag first. */
+    const int tag = v->type;
+    if (tag == RAY_LAMBDA) return "lambda";
+    if (tag == RAY_UNARY || tag == RAY_BINARY || tag == RAY_VARY) return "fn";
+    if (tag == RAY_TABLE) return "table";
+    if (tag == RAY_DICT)  return "dict";
+    if (tag == RAY_LIST)  return "list";
+
+    /* Everything else is classified by the sign of the tag. */
+    if (tag < 0) return "atom";
+    if (tag > 0) return "vec";
+    return "?";
+}
+
+static ray_t* h_env(ray_t* arg, ray_syscmd_ctx_t* ctx) { /* env handler: REPL prints the globals, other callers get the count */
+    (void)arg;
+    int64_t sym_ids[512];
+    ray_t*  vals[512];
+    int32_t n = ray_env_list(sym_ids, vals, 512); /* NOTE(review): presumably fills at most 512 entries and returns how many — confirm n <= 512 */
+    if (ctx && ctx->repl) {
+        for (int32_t i = 0; i < n; i++) {
+            ray_t* s = ray_sym_str(sym_ids[i]);
+            const char* name = s ? ray_str_ptr(s) : "?"; /* NOTE(review): printed with %s, so this assumes NUL-terminated text — confirm */
+            fprintf(stdout, "  %-20s %s\n", name, type_label_short(vals[i]));
+        }
+        fprintf(stdout, "(%d entries)\n", n);
+        return NULL;
+    }
+    /* Non-REPL: just return the count.  Returning the full env as a
+     * Rayfall list is doable but not needed for the .sys.cmd "env"
+     * use case (which is purely informational). */
+    return ray_i64(n);
+}
+
+/* clear — REPL-only screen clear. */
+static ray_t* h_clear(ray_t* arg, ray_syscmd_ctx_t* ctx) {
+    (void)arg;
+    /* Silent no-op unless this is a REPL context with color enabled. */
+    if (!ctx || !ctx->repl || !ctx->color) return NULL;
+    fprintf(stdout, "\033[2J\033[H");
+    fflush(stdout);
+    return NULL;
+}
+
+/* help/? — REPL-only.  Walks the table to print every command's
+ * one-liner so we never get out of sync with what's registered. */
+static ray_t* h_help(ray_t* arg, ray_syscmd_ctx_t* ctx) {
+    (void)arg;
+    if (!ctx || !ctx->repl) return RAY_NULL_OBJ; /* NOTE(review): this path returns RAY_NULL_OBJ while the REPL path returns NULL — confirm intended */
+    bool color = ctx->color;
+    if (color) fprintf(stdout, "\033[1;33m");
+    fprintf(stdout, ". Commands list:");
+    if (color) fprintf(stdout, "\033[0m");
+    fprintf(stdout, "\n");
+    if (color) fprintf(stdout, "\033[90m");
+    size_t n = 0;
+    const ray_syscmd_t* tbl = ray_syscmd_table(&n);
+    for (size_t i = 0; i < n; i++) {
+        char tag[32]; /* snprintf truncates a longer name/alias pair to fit */
+        if (tbl[i].alias) snprintf(tag, sizeof(tag), ":%s/:%s", tbl[i].name, tbl[i].alias);
+        else              snprintf(tag, sizeof(tag), ":%s",     tbl[i].name);
+        fprintf(stdout, "  %-12s - %s\n", tag, tbl[i].help ? tbl[i].help : "");
+    }
+    if (color) fprintf(stdout, "\033[0m");
+    fprintf(stdout, "\n");
+    return NULL;
+}
+
+/* q/quit — REPL-only graceful exit.  Terminates the process with status 0. */
+static ray_t* h_quit(ray_t* arg, ray_syscmd_ctx_t* ctx) {
+    (void)arg; (void)ctx;
+    /* Defer to the standard exit path so atexit handlers run. */
+    exit(0);
+    return NULL; /* not reached: exit() does not return */
+}
+
+/* ══════════════════════════════════════════
+ * Registry — each row: name, alias (or NULL), handler, flag bits, help text
+ * ══════════════════════════════════════════ */
+
+static const ray_syscmd_t TABLE[] = {
+    { "help",   "?",  h_help,   RAY_SYSCMD_REPL_ONLY,            "Display this help."                       },
+    { "timeit", "t",  h_timeit, 0,                                "Toggle profiling on/off (or :t 0|1)."     },
+    { "env",    NULL, h_env,    0,                                "List defined globals."                    },
+    { "clear",  NULL, h_clear,  RAY_SYSCMD_REPL_ONLY,            "Clear the screen."                        },
+    { "listen", NULL, h_listen, RAY_SYSCMD_RESTRICTED,           "Start IPC listener on PORT."              },
+    { "q",      NULL, h_quit,   RAY_SYSCMD_REPL_ONLY,            "Exit the REPL."                           },
+    { "quit",   NULL, h_quit,   RAY_SYSCMD_REPL_ONLY,            "Exit the REPL."                           },
+};
+static const size_t TABLE_LEN = sizeof(TABLE) / sizeof(TABLE[0]); /* number of rows in TABLE */
+
+const ray_syscmd_t* ray_syscmd_lookup(const char* name, size_t name_len) {
+    if (name == NULL || name_len == 0) return NULL;
+    for (const ray_syscmd_t* ent = TABLE; ent != TABLE + TABLE_LEN; ent++) {
+        const char* keys[2] = { ent->name, ent->alias };  /* name first, then alias */
+        for (size_t k = 0; k < 2; k++) {
+            if (!keys[k] || strlen(keys[k]) != name_len) continue;
+            if (memcmp(keys[k], name, name_len) == 0) return ent;
+        }
+    }
+    return NULL;
+}
+
+const ray_syscmd_t* ray_syscmd_table(size_t* out_count) {
+    if (out_count != NULL) { *out_count = TABLE_LEN; }  /* the count is optional */
+    return &TABLE[0];
+}
+
+/* ══════════════════════════════════════════
+ * Dispatcher used by `.sys.cmd "..."` and the REPL `:` path.
+ *
+ * Splits the string into (command, args).  Looks up the command in
+ * the registry.  If found, builds a Rayfall arg from the args slice
+ * (RAY_NULL_OBJ for empty, otherwise an owned RAY_STR — the handler
+ * can then re-coerce via arg_as_i64 etc.) and calls the handler.
+ *
+ * On miss with allow_shell=true, falls through to system() so users
+ * can do `(.sys.cmd "ls -la")` the kdb way.  With allow_shell=false
+ * (REPL path), returns "domain" so a typo'd `:foo` doesn't hand the
+ * shell anything by accident.
+ * ══════════════════════════════════════════ */
+ray_t* ray_syscmd_dispatch(const char* str, size_t len,
+                           ray_syscmd_ctx_t* ctx, bool allow_shell) {
+    /* Skip leading blanks (space / tab only); nothing left => domain error. */
+    size_t i = 0;
+    while (i < len && (str[i] == ' ' || str[i] == '\t')) i++;
+    if (i >= len) return ray_error("domain", "empty command");
+
+    /* First word = command name (until whitespace). */
+    size_t name_start = i;
+    while (i < len && str[i] != ' ' && str[i] != '\t') i++;
+    size_t name_len = i - name_start;
+
+    /* Args = rest, leading whitespace trimmed (trailing blanks are kept). */
+    while (i < len && (str[i] == ' ' || str[i] == '\t')) i++;
+    const char* args_p = str + i;
+    size_t args_len = len - i;
+
+    const ray_syscmd_t* e = ray_syscmd_lookup(str + name_start, name_len);
+    if (!e) {
+        if (!allow_shell)
+            return ray_error("domain", "unknown command");
+        /* Shell fallback — pass the entire original string verbatim
+         * so quoting/redirection survives.  NOTE(review): system() yields
+         * a wait status, not a bare exit code — confirm vs .sys.exec. */
+        char* cmd = (char*)malloc(len + 1);  /* NUL-terminated copy of the slice */
+        if (!cmd) return ray_error("oom", NULL);
+        memcpy(cmd, str, len);
+        cmd[len] = '\0';
+        int rc = system(cmd);
+        free(cmd);
+        return ray_i64(rc);
+    }
+
+    if (!e->fn) return ray_error("nyi", "command has no handler");
+
+    /* REPL-only commands (clear / q / help) are reachable only when
+     * a REPL context was supplied — typing them in a Rayfall script
+     * via .sys.cmd would have no useful effect, so reject early with
+     * a clear domain error rather than silently no-op'ing. */
+    if ((e->flags & RAY_SYSCMD_REPL_ONLY) && (!ctx || !ctx->repl))
+        return ray_error("domain", "command is REPL-only");
+
+    ray_t* arg = (args_len > 0) ? ray_str(args_p, args_len) : RAY_NULL_OBJ;
+    ray_t* result = e->fn(arg, ctx);
+    if (arg && arg != RAY_NULL_OBJ) ray_release(arg);  /* drop the temporary arg string */
+    return result ? result : RAY_NULL_OBJ;  /* NULL from a handler means "no value" */
+}
+
+/* ══════════════════════════════════════════
+ * Rayfall builtins:
+ *   (.sys.cmd "name args")  → string-dispatched
+ *   (.sys.<name> arg)       → direct, typed
+ *
+ * The direct builtins are registered from eval.c at startup; this
+ * file just exposes the entry point each one wraps.
+ * ══════════════════════════════════════════ */
+
+ray_t* ray_syscmd_string_dispatch_fn(ray_t* x) {
+    /* Only a string atom is accepted; dispatch runs REPL-less with shell fallback. */
+    bool is_str_atom = ray_is_atom_local(x) && x->type == -RAY_STR;
+    if (!is_str_atom) return ray_error("type", ".sys.cmd expects a string");
+    return ray_syscmd_dispatch(ray_str_ptr(x), ray_str_len(x), NULL, true);
+}
+
+/* Adapter for direct `.sys.<name>` calls: the caller's arg goes to the
+ * named handler untouched, with a context that carries no REPL. */
+static ray_t* invoke_by_name(const char* name, ray_t* arg) {
+    const ray_syscmd_t* cmd = ray_syscmd_lookup(name, strlen(name));
+    if (cmd == NULL || cmd->fn == NULL) return ray_error("nyi", NULL);
+    if ((cmd->flags & RAY_SYSCMD_REPL_ONLY) != 0)
+        return ray_error("domain", "command is REPL-only");
+    ray_syscmd_ctx_t no_repl = { .repl = NULL, .color = false };
+    ray_t* out = cmd->fn(arg, &no_repl);
+    return out != NULL ? out : RAY_NULL_OBJ;
+}
+
+/* listen requires an arg, so it stays unary: x goes straight to the handler. */
+ray_t* ray_sys_listen_fn(ray_t* x) { return invoke_by_name("listen", x); }
+
+/* timeit and env work with or without an argument ((.sys.timeit) =>
+ * toggle, (.sys.env) => list).  Registering them variadic in eval.c
+ * follows `.sys.gc`'s convention and spares users the arity error that
+ * a bare `(.sys.env)` would otherwise raise. */
+ray_t* ray_sys_timeit_fn(ray_t** args, int64_t n) {
+    return invoke_by_name("timeit", (n <= 0) ? RAY_NULL_OBJ : args[0]);
+}
+ray_t* ray_sys_env_fn(ray_t** args, int64_t n) {
+    ray_t* first = (n <= 0) ? RAY_NULL_OBJ : args[0];  /* the argument is optional */
+    return invoke_by_name("env", first);
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/syscmd.h b/crates/rayforce-sys/vendor/rayforce/src/lang/syscmd.h
new file mode 100644
index 0000000..f68729d
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/lang/syscmd.h
@@ -0,0 +1,103 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+/*
+ * syscmd.h — single registry of system-level commands.
+ *
+ * One source of truth feeding three entry points:
+ *
+ *   1. `.sys.cmd "name args"`  — string-dispatched Rayfall builtin
+ *   2. `:name args`            — REPL terminal command
+ *   3. `(.sys.<name> arg)`     — direct typed Rayfall builtin (per entry)
+ *
+ * Each handler is invoked with one Rayfall arg (or RAY_NULL_OBJ for
+ * commands that take none) and an optional REPL context that carries
+ * the surface-specific state (color flag, repl pointer for things like
+ * `:clear` and `:q`).  Handlers parse / coerce the arg themselves so
+ * callers don't have to special-case whether `:t 1` came in as the
+ * string "1" or the integer 1.
+ *
+ * Unknown command names dispatched through `.sys.cmd` fall through to
+ * the host shell via system(3) — matches the kdb+ `system "..."`
+ * convention so existing muscle memory works.
+ */
+
+#ifndef RAY_LANG_SYSCMD_H
+#define RAY_LANG_SYSCMD_H
+
+#include <rayforce.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ray_repl;
+
+typedef struct ray_syscmd_ctx {  /* surface-specific state passed to every handler */
+    struct ray_repl *repl;   /* non-NULL when invoked from the REPL */
+    bool             color;  /* terminal supports ANSI; only meaningful with `repl` */
+} ray_syscmd_ctx_t;
+
+/* Handler contract:
+ *   - `arg` is RAY_NULL_OBJ when no argument was supplied; otherwise an
+ *     owned reference the caller manages (handlers do not retain).
+ *   - Returning NULL means "no value" (treated as RAY_NULL_OBJ by the
+ *     Rayfall surface; suppressed from REPL print).
+ *   - Errors are returned as ray_error(...) values.
+ */
+typedef ray_t* (*ray_syscmd_handler_t)(ray_t* arg, ray_syscmd_ctx_t* ctx);
+
+/* Entry flags */
+#define RAY_SYSCMD_REPL_ONLY   0x01  /* not exposed via .sys.cmd / .sys.<name> */
+#define RAY_SYSCMD_RESTRICTED  0x02  /* honors --restricted IPC mode */
+
+typedef struct ray_syscmd {
+    const char*          name;       /* primary command name, e.g. "timeit" */
+    const char*          alias;      /* short alias e.g. "t"; NULL if none */
+    ray_syscmd_handler_t fn;         /* handler, called as fn(arg, ctx) */
+    int                  flags;      /* bitwise OR of RAY_SYSCMD_* entry flags */
+    const char*          help;       /* one-line help text */
+} ray_syscmd_t;
+
+/* Look up a command by name or alias.  `name_len` lets the caller pass
+ * an unterminated slice (e.g. straight out of the REPL or .sys.cmd
+ * tokeniser).  Returns NULL if not found. */
+const ray_syscmd_t* ray_syscmd_lookup(const char* name, size_t name_len);
+
+/* Walk the table.  Returns the entry array + count. */
+const ray_syscmd_t* ray_syscmd_table(size_t* out_count);
+
+/* Parse `"name args..."` into (command, arg-string), look the command
+ * up, dispatch.  Unknown names fall through to system(str) when
+ * `allow_shell` is set (the .sys.cmd path uses true; the REPL passes
+ * false so `:foo` doesn't accidentally exec arbitrary shell). */
+ray_t* ray_syscmd_dispatch(const char* str, size_t len,
+                           ray_syscmd_ctx_t* ctx, bool allow_shell);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RAY_LANG_SYSCMD_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/mem/arena.c b/crates/rayforce-sys/vendor/rayforce/src/mem/arena.c
new file mode 100644
index 0000000..26f5636
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/mem/arena.c
@@ -0,0 +1,160 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "arena.h"
+#include "heap.h"
+#include "sys.h"
+#include <string.h>
+
+/* 32-byte alignment for ray_t */
+#define ARENA_ALIGN 32
+#define ARENA_ALIGN_UP(x) (((x) + ARENA_ALIGN - 1) & ~(size_t)(ARENA_ALIGN - 1))
+
+/* Each chunk is a contiguous block of memory with a bump pointer. */
+typedef struct ray_arena_chunk {
+    struct ray_arena_chunk* next;  /* previously-current chunk; NULL at the tail */
+    size_t cap;    /* usable capacity (excluding this header) */
+    size_t used;   /* bytes used so far */
+} ray_arena_chunk_t;
+
+/* Arena header: new chunks are pushed at the head, which serves allocations. */
+struct ray_arena {
+    ray_arena_chunk_t* chunks;     /* linked list of all chunks (head = current) */
+    size_t            chunk_size; /* default chunk capacity */
+};
+
+/* A chunk's payload begins right after its header, padded to ARENA_ALIGN. */
+static inline char* chunk_data(ray_arena_chunk_t* c) {
+    const size_t header_bytes = ARENA_ALIGN_UP(sizeof(ray_arena_chunk_t));
+    return &((char*)c)[header_bytes];
+}
+
+static ray_arena_chunk_t* arena_new_chunk(size_t min_cap) {
+    /* Header is padded to ARENA_ALIGN; refuse sizes whose total would wrap. */
+    const size_t header_bytes = ARENA_ALIGN_UP(sizeof(ray_arena_chunk_t));
+    if (SIZE_MAX - header_bytes < min_cap) return NULL;
+    ray_arena_chunk_t* chunk = (ray_arena_chunk_t*)ray_sys_alloc(header_bytes + min_cap);
+    if (chunk == NULL) return NULL;
+    chunk->used = 0;
+    chunk->cap  = min_cap;
+    chunk->next = NULL;
+    return chunk;
+}
+
+ray_arena_t* ray_arena_new(size_t chunk_size) {
+    /* Clamp to a 256-byte floor, then round up to the arena alignment. */
+    size_t cap = (chunk_size < 256) ? 256 : chunk_size;
+    cap = ARENA_ALIGN_UP(cap);
+
+    ray_arena_t* arena = (ray_arena_t*)ray_sys_alloc(sizeof(ray_arena_t));
+    if (arena == NULL) return NULL;
+
+    /* The arena always starts with one chunk; release the header on failure. */
+    arena->chunk_size = cap;
+    arena->chunks = arena_new_chunk(cap);
+    if (arena->chunks == NULL) {
+        ray_sys_free(arena);
+        return NULL;
+    }
+    return arena;
+}
+
+ray_t* ray_arena_alloc(ray_arena_t* arena, size_t nbytes) {
+    /* Bump-allocate a 32-byte header plus nbytes, rounded up to ARENA_ALIGN.
+     * Returns NULL for a NULL arena, a size that would wrap, or OOM. */
+    if (!arena) return NULL;
+    if (nbytes > SIZE_MAX - 32 - (ARENA_ALIGN - 1)) return NULL;
+    size_t block_size = ARENA_ALIGN_UP(32 + nbytes);
+    ray_arena_chunk_t* c = arena->chunks;
+    /* Compare against the remaining room: `used + block_size` can wrap
+     * for huge requests and would then wrongly pass a `> cap` test. */
+    if (!c || block_size > c->cap - c->used) {
+        size_t new_cap = arena->chunk_size;
+        if (block_size > new_cap) new_cap = ARENA_ALIGN_UP(block_size);
+        ray_arena_chunk_t* nc = arena_new_chunk(new_cap);
+        if (!nc) return NULL;
+        nc->next = arena->chunks;   /* new chunk becomes the head */
+        arena->chunks = nc;
+        c = nc;
+    }
+
+    char* base = chunk_data(c);
+    ray_t* v = (ray_t*)(base + c->used);
+    c->used += block_size;
+
+    /* Only the 32-byte header is zeroed; the data area is left as-is. */
+    memset(v, 0, 32);
+    v->attrs = RAY_ATTR_ARENA;
+    v->rc = 1;
+    return v;
+}
+
+bool ray_arena_reserve(ray_arena_t* arena, size_t bytes) {
+    if (!arena || bytes > SIZE_MAX - (ARENA_ALIGN - 1)) return false;  /* ALIGN_UP would wrap */
+    if (bytes == 0) return true;
+    ray_arena_chunk_t* c = arena->chunks;
+    if (c && (c->cap - c->used) >= bytes) return true;  /* head chunk already has room */
+    size_t new_cap = arena->chunk_size;
+    if (bytes > new_cap) new_cap = ARENA_ALIGN_UP(bytes);
+    ray_arena_chunk_t* nc = arena_new_chunk(new_cap);
+    if (!nc) return false;
+    nc->next = arena->chunks;   /* new chunk becomes the head */
+    arena->chunks = nc;
+    return true;
+}
+
+size_t ray_arena_total_used(const ray_arena_t* arena) {
+    /* Sum every chunk's bump offset, head to tail; a NULL arena counts as 0. */
+    size_t sum = 0;
+    const ray_arena_chunk_t* it = arena ? arena->chunks : NULL;
+    for (; it != NULL; it = it->next)
+        sum += it->used;
+    return sum;
+}
+
+void ray_arena_reset(ray_arena_t* arena) {
+    if (arena == NULL || arena->chunks == NULL) return;
+
+    /* The head (most recently allocated) chunk survives and is rewound;
+     * the older chunks are detached from it and released one at a time. */
+    ray_arena_chunk_t* head = arena->chunks;
+    ray_arena_chunk_t* stale = head->next;
+    head->next = NULL;
+    head->used = 0;
+    while (stale != NULL) {
+        ray_arena_chunk_t* following = stale->next;
+        ray_sys_free(stale);
+        stale = following;
+    }
+}
+
+void ray_arena_destroy(ray_arena_t* arena) {
+    if (arena == NULL) return;
+    /* Release every chunk (reading `next` before the free), then the header. */
+    for (ray_arena_chunk_t* cur = arena->chunks; cur != NULL; ) {
+        ray_arena_chunk_t* following = cur->next;
+        ray_sys_free(cur);
+        cur = following;
+    }
+    ray_sys_free(arena);
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/mem/arena.h b/crates/rayforce-sys/vendor/rayforce/src/mem/arena.h
new file mode 100644
index 0000000..f405a2c
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/mem/arena.h
@@ -0,0 +1,60 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_ARENA_H
+#define RAY_ARENA_H
+
+#include <rayforce.h>
+#include <stdbool.h>
+
+typedef struct ray_arena ray_arena_t;
+
+/* Create arena with given chunk size (bytes). Chunks allocated via ray_sys_alloc. */
+ray_arena_t* ray_arena_new(size_t chunk_size);
+
+/* Allocate ray_t* block with nbytes of data space.
+ * Returns 32-byte aligned ray_t* with RAY_ATTR_ARENA set, rc=1.
+ * Returns NULL on OOM. */
+ray_t* ray_arena_alloc(ray_arena_t* arena, size_t nbytes);
+
+/* Ensure the arena can serve subsequent allocations totalling at least
+ * `bytes` without the head chunk needing to grow.  If the head chunk has
+ * enough free space already, this is a no-op; otherwise a new chunk with
+ * capacity >= `bytes` is allocated and becomes the head.  Returns true on
+ * success, false on OOM.  Useful for making a sequence of follow-on
+ * allocations infallible, which is necessary when commits to multiple
+ * data structures must be atomic. */
+bool ray_arena_reserve(ray_arena_t* arena, size_t bytes);
+
+/* Total bytes currently used across every chunk in this arena.  Diagnostic
+ * introspection — monotonically grows with ray_arena_alloc, resets on
+ * ray_arena_reset.  Safe to call at any time. */
+size_t ray_arena_total_used(const ray_arena_t* arena);
+
+/* Reset arena — rewind the head chunk to zero and free every other chunk. */
+void ray_arena_reset(ray_arena_t* arena);
+
+/* Destroy arena — free all backing memory. */
+void ray_arena_destroy(ray_arena_t* arena);
+
+#endif /* RAY_ARENA_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/mem/cow.c b/crates/rayforce-sys/vendor/rayforce/src/mem/cow.c
new file mode 100644
index 0000000..6a453d9
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/mem/cow.c
@@ -0,0 +1,79 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "cow.h"
+#include "heap.h"
+
+/* Thread-local flag: when false (default), refcount uses plain inc/dec.
+ * The thread pool sets this to true before dispatching parallel work.
+ * Mirrors rayforce 1's VM->rc_sync fast path. */
+RAY_TLS bool ray_rc_sync = false;
+
+/* --------------------------------------------------------------------------
+ * ray_retain
+ * -------------------------------------------------------------------------- */
+
+void ray_retain(ray_t* v) {
+    /* NULL, error values and arena-owned blocks are left untouched. */
+    if (v == NULL || RAY_IS_ERR(v) || (v->attrs & RAY_ATTR_ARENA)) return;
+    if (RAY_UNLIKELY(ray_rc_sync))
+        ray_atomic_inc(&v->rc);   /* rc_sync set: atomic increment */
+    else
+        v->rc++;                  /* default: plain increment */
+}
+
+/* --------------------------------------------------------------------------
+ * ray_release
+ * -------------------------------------------------------------------------- */
+
+void ray_release(ray_t* v) {
+    if (!v || RAY_IS_ERR(v)) return;        /* nothing to release */
+    if (v->attrs & RAY_ATTR_ARENA) return;  /* arena blocks are not refcounted here */
+    uint32_t prev;                          /* count as it was before this release */
+    if (RAY_LIKELY(!ray_rc_sync)) {
+        prev = v->rc--;
+    } else {
+        prev = ray_atomic_dec(&v->rc);  /* presumably yields the pre-decrement value — confirm */
+    }
+    if (prev == 1) {  /* this was the last reference */
+        if (RAY_UNLIKELY(ray_rc_sync))
+            ray_atomic_fence_acquire();
+        ray_free(v);
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * ray_cow
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_cow(ray_t* v) {
+    if (!v || RAY_IS_ERR(v)) return v;  /* NULL / error values pass through */
+    if (v->attrs & RAY_ATTR_ARENA) return v;  /* arena-owned, no-op */
+    uint32_t rc = RAY_LIKELY(!ray_rc_sync) ? v->rc : ray_atomic_load(&v->rc);
+    if (rc == 1) return v;  /* sole owner -- mutate in place */
+    ray_t* copy = ray_alloc_copy(v);
+    if (!copy || RAY_IS_ERR(copy)) return copy;  /* failed copy: v is not released */
+    /* No rc store here: the original note says ray_alloc_copy() sets copy->rc = 1. */
+    ray_release(v);  /* drop the caller's reference to the shared original */
+    return copy;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/mem/cow.h b/crates/rayforce-sys/vendor/rayforce/src/mem/cow.h
new file mode 100644
index 0000000..b42643b
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/mem/cow.h
@@ -0,0 +1,43 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_COW_H
+#define RAY_COW_H
+
+/*
+ * cow.h -- COW (Copy-on-Write) ref counting.
+ *
+ * ray_retain: increment reference count
+ * ray_release: decrement reference count, free when it reaches zero
+ * ray_cow: copy-on-write — return same pointer if sole owner, else copy
+ */
+
+#include <rayforce.h>
+#include "core/platform.h"
+
+/* Thread-local flag: plain (false) vs atomic (true) refcount ops.
+ * Default is false (fast single-threaded path).
+ * The thread pool sets true before parallel dispatch. */
+extern RAY_TLS bool ray_rc_sync;
+
+#endif /* RAY_COW_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/mem/heap.c b/crates/rayforce-sys/vendor/rayforce/src/mem/heap.c
new file mode 100644
index 0000000..8f93c5f
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/mem/heap.c
@@ -0,0 +1,1601 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#if defined(__APPLE__)
+#  define _DARWIN_C_SOURCE
+#elif !defined(_WIN32)
+#  define _GNU_SOURCE       /* ftruncate, MAP_SHARED, etc. */
+#endif
+
+#include "heap.h"
+#include "cow.h"
+#include "sys.h"
+#include "core/platform.h"
+#include "table/sym.h"
+#include "lang/eval.h"
+#include "store/hnsw.h"
+#include "ops/idxop.h"
+#include <string.h>
+#include <stdlib.h>     /* getenv */
+#include <stdio.h>      /* snprintf */
+#include <unistd.h>     /* getpid, close, ftruncate, unlink */
+#include <fcntl.h>      /* open, fcntl, F_PREALLOCATE on macOS */
+#include <errno.h>
+#include <sys/mman.h>   /* mmap, munmap */
+#include <sys/stat.h>   /* O_*  modes */
+#include <sys/types.h>
+#include <stdatomic.h>
+
+/* Portable disk-block preallocation.  Returns 0 on success, errno-style
+ * code on failure (matching posix_fallocate's contract).  Linux has
+ * posix_fallocate natively.  macOS uses fcntl(F_PREALLOCATE) — try
+ * contiguous first, fall back to non-contiguous, then ftruncate to
+ * extend the file size if needed (F_PREALLOCATE doesn't grow the file
+ * beyond its current size). */
+static int heap_preallocate(int fd, off_t offset, off_t len) {
+#if defined(__APPLE__)
+    fstore_t fs = {
+        .fst_flags    = F_ALLOCATECONTIG | F_ALLOCATEALL,
+        .fst_posmode  = F_PEOFPOSMODE,
+        .fst_offset   = 0,
+        .fst_length   = offset + len,  /* NOTE(review): whole target size, not just the delta — confirm */
+        .fst_bytesalloc = 0,
+    };
+    if (fcntl(fd, F_PREALLOCATE, &fs) == -1) {
+        /* Retry without contiguous-only constraint. */
+        fs.fst_flags = F_ALLOCATEALL;
+        if (fcntl(fd, F_PREALLOCATE, &fs) == -1) return errno ? errno : -1;  /* -1 if errno is 0 */
+    }
+    /* F_PREALLOCATE reserves blocks without growing the logical size, so
+     * extend with ftruncate to keep mmap'd writes from SIGBUS.  NOTE(review):
+     * ftruncate would also shrink a longer file — confirm callers only grow. */
+    if (ftruncate(fd, offset + len) != 0) return errno ? errno : -1;
+    return 0;
+#else
+    return posix_fallocate(fd, offset, len);
+#endif
+}
+
+/* --------------------------------------------------------------------------
+ * Static asserts
+ * -------------------------------------------------------------------------- */
+_Static_assert(sizeof(ray_pool_hdr_t) <= 16,
+               "ray_pool_hdr_t must fit in nullmap (16 bytes)");
+
+/* --------------------------------------------------------------------------
+ * Thread-local state
+ * -------------------------------------------------------------------------- */
+RAY_TLS ray_heap_t*     ray_tl_heap  = NULL;
+
+/* Stats tracking — always enabled (plain integer ops, negligible vs atomics).
+ * All stats go through the per-heap struct (ray_tl_heap->stats) so that
+ * heap merges keep bytes_allocated accurate.
+ *
+ * bytes_allocated is only modified by the owning thread (alloc/local-free)
+ * or by the main thread during GC flush (return_to_owner=true, workers idle).
+ * No atomics needed. */
+#define RAY_STAT(x) (x)
+
+/* --------------------------------------------------------------------------
+ * Bitmap-based heap ID allocator (atomic CAS, reusable IDs)
+ *
+ * Each bit in the bitmap represents one heap ID. Acquiring sets a bit,
+ * releasing clears it. IDs are reused after release (unlike a monotonic
+ * counter). Cursor rotates to spread contention across words.
+ * -------------------------------------------------------------------------- */
+static _Atomic(uint64_t) g_heap_id_bitmap[RAY_HEAP_ID_WORDS] = { [0] = 1ULL };  /* bit 0 pre-set: id 0 is never handed out */
+static _Atomic(uint64_t) g_heap_id_cursor = 0;  /* rotating start word for heap_id_acquire */
+
+ray_heap_t* ray_heap_registry[RAY_HEAP_REGISTRY_SIZE];  /* NOTE(review): presumably indexed by heap id — confirm */
+
+/* Pending-merge queue head (lock-free LIFO) */
+_Atomic(ray_heap_t*) ray_heap_pending_merge = NULL;
+
+static int heap_id_acquire(void) {
+    uint64_t start = atomic_fetch_add_explicit(&g_heap_id_cursor, 1,
+                                                memory_order_relaxed);
+    for (uint64_t off = 0; off < RAY_HEAP_ID_WORDS; off++) {  /* visit each word once */
+        uint64_t idx = (start + off) % RAY_HEAP_ID_WORDS;
+        uint64_t word = atomic_load_explicit(&g_heap_id_bitmap[idx],
+                                              memory_order_relaxed);
+        while (~word != 0ULL) {  /* at least one clear (free) bit in this word */
+            uint64_t free_bits = ~word;
+            uint64_t bit = (uint64_t)__builtin_ctzll(free_bits);  /* lowest free bit */
+            uint64_t mask = 1ULL << bit;
+            uint64_t new_word = word | mask;
+            if (atomic_compare_exchange_weak_explicit(
+                    &g_heap_id_bitmap[idx], &word, new_word,
+                    memory_order_acq_rel, memory_order_relaxed)) {
+                return (int)(idx * 64 + bit);  /* id = word index * 64 + bit index */
+            }
+            /* CAS failed — word updated, retry with new value */
+        }
+    }
+    return -1;  /* pool exhausted */
+}
+
+static void heap_id_release(int id) {
+    /* Out-of-range ids are ignored; otherwise clear the id's bit atomically. */
+    if (id < 0 || id >= (int)RAY_HEAP_ID_BITS) return;
+    const uint64_t word_idx = (uint64_t)id / 64;
+    const uint64_t bit_idx  = (uint64_t)id % 64;
+    atomic_fetch_and_explicit(&g_heap_id_bitmap[word_idx],
+                               ~(1ULL << bit_idx), memory_order_release);
+}
+
+/* --------------------------------------------------------------------------
+ * Parallel flag
+ * -------------------------------------------------------------------------- */
+_Atomic(uint32_t) ray_parallel_flag = 0;
+
+/* --------------------------------------------------------------------------
+ * Helpers
+ * -------------------------------------------------------------------------- */
+
+static uint8_t ceil_log2(size_t n) {
+    if (n <= 1) return 0;  /* also keeps n - 1 non-zero for the clz below */
+    return (uint8_t)(64 - __builtin_clzll(n - 1));  /* bit length of n - 1 */
+}
+
+uint8_t ray_order_for_size(size_t data_size) {
+    /* If adding the 32-byte ray_t header would wrap, report an order one
+     * past the maximum; otherwise round up and clamp to the minimum order. */
+    if (SIZE_MAX - 32 < data_size) return RAY_HEAP_MAX_ORDER + 1;
+    const uint8_t order = ceil_log2(data_size + 32);
+    return (order < RAY_ORDER_MIN) ? RAY_ORDER_MIN : order;
+}
+
+/* --------------------------------------------------------------------------
+ * Pool management
+ *
+ * Self-aligned pools: pool base = ptr & ~(pool_size - 1).
+ * First min-block (64B at offset 0) reserved for pool header.
+ * Remaining space split via cascading buddy split.
+ *
+ * For oversized blocks (order > POOL_ORDER), pool_order = order + 1
+ * so the cascading split produces a right-half block of the needed order.
+ * -------------------------------------------------------------------------- */
+
+static bool heap_add_pool(ray_heap_t* h, uint8_t order);
+
+/* --------------------------------------------------------------------------
+ * Freelist operations (circular sentinel via fl_prev/fl_next)
+ *
+ * Each freelist[order] is a ray_fl_head_t sentinel. fl_remove() unlinks a
+ * block from ANY circular list without needing the head pointer — enabling
+ * safe cross-heap buddy coalescing.
+ * -------------------------------------------------------------------------- */
+
+RAY_INLINE void heap_insert_block(ray_heap_t* h, ray_t* blk, uint8_t order) {
+    ray_fl_head_t* sentinel = &h->freelist[order];  /* list head for this order */
+    ray_t* old_front = sentinel->fl_next;
+    blk->fl_next = old_front;                       /* blk becomes the new front */
+    blk->fl_prev = (ray_t*)sentinel;
+    old_front->fl_prev = blk;
+    sentinel->fl_next = blk;
+    ray_atomic_store(&blk->rc, 0);                  /* rc == 0 is the free marker */
+    blk->order = order;
+    h->avail |= (1ULL << order);                    /* advertise a free block at this order */
+}
+
+/* heap_remove_block: not called at present; kept for future coalescing paths */
+static void __attribute__((unused))
+heap_remove_block(ray_heap_t* h, ray_t* blk, uint8_t order) {
+    fl_remove(blk);  /* circular unlink; needs no head pointer, so it works across heaps */
+    bool drained = fl_empty(&h->freelist[order]);
+    if (drained) h->avail &= ~(1ULL << order);
+}
+
+RAY_INLINE void heap_split_block(ray_heap_t* h, ray_t* blk,
+                                uint8_t target_order, uint8_t block_order) {
+    /* Halve repeatedly: blk keeps the low half, each high half goes free. */
+    while (block_order-- > target_order) {
+        ray_t* upper = (ray_t*)((char*)blk + BSIZEOF(block_order));
+        upper->mmod  = 0;
+        upper->order = block_order;
+        heap_insert_block(h, upper, block_order);
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Coalescing: merge block with buddies up to pool_order
+ *
+ * Pool header at offset 0 has rc=1 and order=RAY_ORDER_MIN, so buddy
+ * checks always fail before reaching the header. Safe sentinel.
+ * -------------------------------------------------------------------------- */
+
+static void heap_coalesce(ray_heap_t* h, ray_t* blk,
+                          uintptr_t pool_base, uint8_t pool_order) {  /* free blk into h, merging free buddies */
+    uint8_t order = blk->order;
+
+    /* During parallel execution, skip coalescing entirely — buddies may
+     * belong to other heaps' freelists, and fl_remove would corrupt them. */
+    if (atomic_load_explicit(&ray_parallel_flag, memory_order_relaxed) != 0) {
+        heap_insert_block(h, blk, order);
+        return;
+    }
+
+    for (;; order++) {
+        if (order >= pool_order) break;  /* never merge up to the whole pool */
+
+        ray_t* buddy = ray_buddy_of(blk, order, pool_base);
+        __builtin_prefetch(buddy, 0, 1);
+
+        uint32_t buddy_rc = ray_atomic_load(&buddy->rc);
+        if (buddy_rc != 0 || buddy->order != order) break;  /* buddy not free, or free at another order */
+
+        fl_remove(buddy);
+        if (fl_empty(&h->freelist[order]))
+            h->avail &= ~(1ULL << order);
+
+        blk = (buddy < blk) ? buddy : blk;  /* merged block starts at the lower address */
+    }
+
+    heap_insert_block(h, blk, order);  /* order now reflects every merge performed */
+}
+
+/* --------------------------------------------------------------------------
+ * heap_add_pool: map a new self-aligned pool able to serve `order`; false on failure
+ * -------------------------------------------------------------------------- */
+
+static bool heap_add_pool(ray_heap_t* h, uint8_t order) {
+    if (h->pool_count >= RAY_MAX_POOLS) return false;  /* pool table is full */
+
+    uint8_t pool_order;
+    if (order >= RAY_HEAP_POOL_ORDER)
+        pool_order = order + 1;  /* need one order larger for header + block */
+    else
+        pool_order = RAY_HEAP_POOL_ORDER;
+
+    if (pool_order > RAY_HEAP_MAX_ORDER) return false;
+    size_t pool_size = BSIZEOF(pool_order);
+
+    void* mem = ray_vm_alloc_aligned(pool_size, pool_size);
+    int   swap_fd  = -1;      /* stays -1 unless the file-backed fallback is taken */
+    char* swap_path = NULL;   /* stays NULL unless the file-backed fallback is taken */
+
+    if (!mem) {
+        /* Anonymous mmap refused — usually means RAM+swap can't satisfy
+         * pool_size right now.  Fall back to file-backed mmap: create a
+         * tempfile in h->swap_path, reserve `pool_size` bytes of disk
+         * blocks (so writes won't SIGBUS later on disk-full), then map
+         * the file at a self-aligned address using the anonymous-VM
+         * reservation trick — no over-allocation of file or disk. */
+        static _Atomic uint64_t swap_counter = 0;
+        uint64_t cnt = atomic_fetch_add_explicit(&swap_counter, 1, memory_order_relaxed);
+
+        size_t plen = strlen(h->swap_path);
+        size_t need = plen + 64;  /* room for "rayheap_<pid>_<heap>_<cnt>.dat" */
+        swap_path = (char*)ray_sys_alloc(need);
+        if (!swap_path) return false;
+        snprintf(swap_path, need, "%srayheap_%d_%u_%llu.dat",
+                 h->swap_path, (int)getpid(), (unsigned)h->id,
+                 (unsigned long long)cnt);
+
+        swap_fd = open(swap_path, O_RDWR | O_CREAT | O_EXCL, 0600);
+        if (swap_fd < 0) {
+            ray_sys_free(swap_path);
+            return false;
+        }
+
+        /* Reserve EXACTLY pool_size bytes of disk blocks AND grow the
+         * file to pool_size.  Crucial that the file is empty (EOF=0)
+         * when this runs: macOS F_PREALLOCATE with F_PEOFPOSMODE
+         * extends past the current EOF, so doing this before any other
+         * ftruncate keeps the reservation == pool_size, not 2x.  ENOSPC
+         * here surfaces as a clean false return -> ray_alloc NULL ->
+         * ray_error("oom") at the wrapper layer. */
+        if (heap_preallocate(swap_fd, 0, (off_t)pool_size) != 0) {
+            close(swap_fd);
+            unlink(swap_path);
+            ray_sys_free(swap_path);
+            return false;
+        }
+
+        /* Reserve 2*pool_size of address space anonymously to guarantee
+         * a self-aligned subrange exists.  PROT_NONE is enough — we
+         * never read/write the anon mapping; it just holds the address
+         * range so the kernel won't hand it out to a concurrent mmap.
+         * After computing the aligned subrange, free the slack and
+         * MAP_FIXED the file-backed mapping over the kept region. */
+        size_t reserve_size = pool_size + pool_size;
+        void* anon = mmap(NULL, reserve_size, PROT_NONE,
+                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+        if (anon == MAP_FAILED) {
+            close(swap_fd);
+            unlink(swap_path);
+            ray_sys_free(swap_path);
+            return false;
+        }
+
+        uintptr_t addr    = (uintptr_t)anon;
+        uintptr_t aligned = (addr + pool_size - 1) & ~(pool_size - 1);  /* round up to pool_size */
+        if (aligned > addr)
+            munmap(anon, aligned - addr);  /* release the slack below the aligned range */
+        uintptr_t end         = addr + reserve_size;
+        uintptr_t aligned_end = aligned + pool_size;
+        if (end > aligned_end)
+            munmap((void*)aligned_end, end - aligned_end);  /* release the slack above it */
+
+        /* MAP_FIXED replaces the kept anon mapping atomically with the
+         * file-backed one.  No address-space race since the kept range
+         * is still anon-reserved at this point. */
+        void* mapped = mmap((void*)aligned, pool_size,
+                            PROT_READ | PROT_WRITE,
+                            MAP_SHARED | MAP_FIXED, swap_fd, 0);
+        if (mapped == MAP_FAILED) {
+            munmap((void*)aligned, pool_size);  /* drop the still-anonymous reservation */
+            close(swap_fd);
+            unlink(swap_path);
+            ray_sys_free(swap_path);
+            return false;
+        }
+
+        mem = (void*)aligned;
+    }
+
+    /* --- Write pool header at offset 0 --- */
+    ray_t* hdr_block = (ray_t*)mem;
+    memset(hdr_block, 0, BSIZEOF(RAY_ORDER_MIN));
+    hdr_block->mmod  = 0;
+    hdr_block->order = RAY_ORDER_MIN;
+    ray_atomic_store(&hdr_block->rc, 1);  /* sentinel: never free */
+
+    ray_pool_hdr_t* hdr = (ray_pool_hdr_t*)hdr_block;  /* overlay on nullmap */
+    hdr->heap_id    = h->id;
+    hdr->pool_order = pool_order;
+    hdr->vm_base    = mem;  /* on POSIX, same as aligned base */
+
+    /* --- Cascading split: split from pool_order down to RAY_ORDER_MIN.
+     *     Right half of each split → freelist.
+     *     Leftmost min-block = pool header (already set, rc=1). --- */
+    for (uint8_t o = pool_order; o > RAY_ORDER_MIN; o--) {
+        ray_t* right = (ray_t*)((char*)mem + BSIZEOF(o - 1));
+        right->mmod  = 0;
+        right->order = (uint8_t)(o - 1);
+        heap_insert_block(h, right, (uint8_t)(o - 1));
+    }
+
+    /* --- Track pool --- */
+    h->pools[h->pool_count].base       = mem;
+    h->pools[h->pool_count].pool_order = pool_order;
+    h->pools[h->pool_count].backed     = (swap_fd >= 0) ? 1 : 0;
+    h->pools[h->pool_count].swap_fd    = swap_fd;
+    h->pools[h->pool_count].swap_path  = swap_path;  /* NULL when not backed */
+    h->pool_count++;
+
+    return true;
+}
+
+/* --------------------------------------------------------------------------
+ * Slab cache flush (with coalescing for GC effectiveness)
+ * -------------------------------------------------------------------------- */
+
+static void heap_flush_slabs(ray_heap_t* h) {
+    /* Drain every slab stack, handing each cached block to heap_coalesce. */
+    for (int s = 0; s < RAY_SLAB_ORDERS; s++) {
+        while (h->slabs[s].count > 0) {
+            h->slabs[s].count--;
+            ray_t* cached = h->slabs[s].stack[h->slabs[s].count];
+            int pool_idx = heap_find_pool(h, cached);
+            if (pool_idx >= 0) {
+                /* Tracked pool: use the heap's own record of base/order. */
+                heap_coalesce(h, cached, (uintptr_t)h->pools[pool_idx].base,
+                              h->pools[pool_idx].pool_order);
+                continue;
+            }
+            /* Untracked: fall back to the pool header from ray_pool_of. */
+            ray_pool_hdr_t* pool = ray_pool_of(cached);
+            if (!pool) continue;
+            heap_coalesce(h, cached, (uintptr_t)pool, pool->pool_order);
+        }
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Foreign blocks flush
+ *
+ * When return_to_owner is true, returns each foreign block to its owning
+ * heap (via pool header heap_id → global registry). This ensures workers
+ * can reuse their pools across queries instead of allocating new ones.
+ *
+ * return_to_owner must only be true when workers are idle (on semaphore),
+ * i.e. ray_parallel_flag == 0. When false, nothing is flushed at all.
+ * -------------------------------------------------------------------------- */
+
+static void heap_flush_foreign(ray_heap_t* h, bool return_to_owner) {
+    /* When workers are active (return_to_owner=false), skip entirely.
+     * Foreign blocks stay queued until the proper GC flush after workers
+     * finish. Absorbing foreign blocks locally would let them be re-
+     * allocated under a different heap while pool ownership stays with
+     * the original heap, corrupting bytes_allocated accounting. */
+    if (!return_to_owner) return;
+
+    ray_t* blk = h->foreign;
+    while (blk) {
+        ray_t* next = blk->fl_next;  /* read first: coalescing relinks blk */
+        ray_pool_hdr_t* phdr = ray_pool_of(blk);
+        if (!phdr) { blk = next; continue; }  /* no pool header: block is dropped from the queue */
+        uint16_t owner_id = phdr->heap_id;
+        ray_heap_t* owner = ray_heap_registry[owner_id % RAY_HEAP_REGISTRY_SIZE];
+        if (owner && owner->id == owner_id && owner != h) {
+            /* Return to owner and decrement owner's bytes_allocated.
+             * Safe: workers are idle (return_to_owner=true implies
+             * ray_parallel_flag==0). */
+            int pidx = heap_find_pool(owner, blk);
+            uintptr_t pb;
+            uint8_t po;
+            if (pidx >= 0) {
+                pb = (uintptr_t)owner->pools[pidx].base;
+                po = owner->pools[pidx].pool_order;
+            } else {
+                pb = (uintptr_t)phdr;
+                po = phdr->pool_order;
+            }
+            RAY_STAT(owner->stats.bytes_allocated -= BSIZEOF(blk->order));
+            heap_coalesce(owner, blk, pb, po);
+        } else {
+            /* Owner gone (destroyed/unregistered) — coalesce locally.
+             * No stats adjustment: the owner's stats were destroyed
+             * with the heap, and h never charged the alloc. */
+            int pidx = heap_find_pool(h, blk);
+            uintptr_t pb;
+            uint8_t po;
+            if (pidx >= 0) {
+                pb = (uintptr_t)h->pools[pidx].base;
+                po = h->pools[pidx].pool_order;
+            } else {
+                /* phdr is non-NULL here: it was checked at the top of the loop. */
+                pb = (uintptr_t)phdr;
+                po = phdr->pool_order;
+            }
+            heap_coalesce(h, blk, pb, po);
+        }
+        blk = next;
+    }
+    h->foreign = NULL;  /* every queued block was returned, absorbed, or dropped */
+}
+
+/* --------------------------------------------------------------------------
+ * Owned-reference helpers
+ * -------------------------------------------------------------------------- */
+
+static bool ray_atom_str_is_sso(const ray_t* s) {
+    /* SSO: a length of 1..7, or an empty string with no backing obj. */
+    if (s->slen == 0) return s->obj == NULL;
+    return s->slen >= 1 && s->slen <= 7;
+}
+
+static bool ray_atom_owns_obj(const ray_t* v) {
+    /* GUID atoms own a non-NULL obj; string atoms own it unless SSO. */
+    if (v->type == -RAY_GUID) return v->obj != NULL;
+    return v->type == -RAY_STR && !ray_atom_str_is_sso(v);
+}
+
+static void ray_release_owned_refs(ray_t* v) {  /* release every reference/resource that v owns */
+    if (!v || RAY_IS_ERR(v)) return;
+
+    if (ray_is_atom(v)) {
+        if (v->type == RAY_LAMBDA) {
+            /* Lambda stores [params, body, bytecode, constants, n_locals, nfo, dbg] in ray_data */
+            ray_t** slots = (ray_t**)ray_data(v);
+            for (int i = 0; i < 4; i++) {  /* only the first four slots are released as refs here */
+                if (slots[i] && !RAY_IS_ERR(slots[i]))
+                    ray_release(slots[i]);
+            }
+            /* Release optional debug info slots */
+            if (LAMBDA_NFO(v)) ray_release(LAMBDA_NFO(v));
+            if (LAMBDA_DBG(v)) ray_release(LAMBDA_DBG(v));
+            return;
+        }
+        if (v->type == RAY_LAZY) {
+            ray_graph_t* g = RAY_LAZY_GRAPH(v);
+            if (g) {
+                ray_graph_free(g);
+                RAY_LAZY_GRAPH(v) = NULL;  /* cleared so a repeat call finds nothing to free */
+            }
+            return;
+        }
+        /* I64 atom tagged as an HNSW handle owns a ray_hnsw_t — free it
+         * when the atom's rc drops to zero so rebindings and scope-exit
+         * don't leak the (potentially large) index graph. */
+        if (v->type == -RAY_I64 && (v->attrs & RAY_ATTR_HNSW)) {
+            ray_hnsw_t* idx = (ray_hnsw_t*)(uintptr_t)v->i64;
+            if (idx) ray_hnsw_free(idx);
+            v->i64 = 0;
+            v->attrs &= (uint8_t)~RAY_ATTR_HNSW;
+            return;
+        }
+        if (ray_atom_owns_obj(v) && v->obj && !RAY_IS_ERR(v->obj))
+            ray_release(v->obj);
+        return;
+    }
+
+    if (v->attrs & RAY_ATTR_SLICE) {
+        if (v->slice_parent && !RAY_IS_ERR(v->slice_parent))
+            ray_release(v->slice_parent);
+        return;  /* for a slice, only the parent ref is released */
+    }
+
+    /* RAY_INDEX block: release per-kind payload children + saved-nullmap
+     * pointers.  Must run before the LIST/TABLE compound checks below
+     * (which would mistreat the data[] payload as child pointers). */
+    if (v->type == RAY_INDEX) {
+        ray_index_t* ix = ray_index_payload(v);
+        ray_index_release_payload(ix);
+        ray_index_release_saved(ix);
+        return;
+    }
+
+    /* Vector with attached index: nullmap[0..7] holds an owning ref to
+     * the index ray_t.  The index owns the displaced ext_nullmap/str_pool/
+     * sym_dict, so we must NOT also try to release those off the parent —
+     * they aren't there anymore.  Skip the NULLMAP_EXT and STR_pool branches. */
+    if (v->attrs & RAY_ATTR_HAS_INDEX) {
+        if (v->index && !RAY_IS_ERR(v->index))
+            ray_release(v->index);
+        return;
+    }
+
+    if ((v->attrs & RAY_ATTR_NULLMAP_EXT) &&
+        v->ext_nullmap && !RAY_IS_ERR(v->ext_nullmap))
+        ray_release(v->ext_nullmap);
+
+    if (v->type == RAY_STR && v->str_pool && !RAY_IS_ERR(v->str_pool))
+        ray_release(v->str_pool);
+
+    if (RAY_IS_PARTED(v->type)) {
+        int64_t n_segs = v->len;
+        ray_t** segs = (ray_t**)ray_data(v);
+        for (int64_t i = 0; i < n_segs; i++) {
+            if (segs[i] && !RAY_IS_ERR(segs[i]))
+                ray_release(segs[i]);
+        }
+        return;
+    }
+
+    if (v->type == RAY_MAPCOMMON) {
+        ray_t** ptrs = (ray_t**)ray_data(v);
+        if (ptrs[0] && !RAY_IS_ERR(ptrs[0])) ray_release(ptrs[0]);
+        if (ptrs[1] && !RAY_IS_ERR(ptrs[1])) ray_release(ptrs[1]);
+        return;
+    }
+
+    if (v->type == RAY_TABLE || v->type == RAY_DICT) {
+        ray_t** slots = (ray_t**)ray_data(v);
+        if (slots[0] && !RAY_IS_ERR(slots[0])) ray_release(slots[0]);
+        if (slots[1] && !RAY_IS_ERR(slots[1])) ray_release(slots[1]);
+        return;
+    }
+
+    if (v->type == RAY_LIST) {
+        ray_t** ptrs = (ray_t**)ray_data(v);
+        for (int64_t i = 0; i < v->len; i++) {
+            ray_t* child = ptrs[i];
+            if (child && !RAY_IS_ERR(child)) ray_release(child);
+        }
+    }
+}
+
+bool ray_retain_owned_refs(ray_t* v) {  /* take a ref on everything v owns; false only when the HNSW clone fails */
+    if (!v || RAY_IS_ERR(v)) return true;
+
+    if (ray_is_atom(v)) {
+        if (v->type == RAY_LAMBDA) {
+            ray_t** slots = (ray_t**)ray_data(v);
+            for (int i = 0; i < 4; i++) {  /* same four slots that ray_release_owned_refs releases */
+                if (slots[i] && !RAY_IS_ERR(slots[i]))
+                    ray_retain(slots[i]);
+            }
+            if (LAMBDA_NFO(v)) ray_retain(LAMBDA_NFO(v));
+            if (LAMBDA_DBG(v)) ray_retain(LAMBDA_DBG(v));
+            return true;
+        }
+        /* Lazy handles own their graph uniquely — no retain on copy. NOTE(review): if the graph pointer was memcpy'd into a copy, both would free it — confirm */
+        if (v->type == RAY_LAZY) return true;
+        /* HNSW handle owns its ray_hnsw_t uniquely.  Deep-clone the index
+         * so the copy is an independent owner with the same semantics as
+         * the source.  On clone-OOM, detach the copy (so caller can free
+         * it cleanly) and signal failure — the caller must not treat the
+         * copy as a valid handle. */
+        if (v->type == -RAY_I64 && (v->attrs & RAY_ATTR_HNSW)) {
+            ray_hnsw_t* src = (ray_hnsw_t*)(uintptr_t)v->i64;
+            if (src) {
+                ray_hnsw_t* dup = ray_hnsw_clone(src);
+                if (!dup) {
+                    v->i64 = 0;
+                    v->attrs &= (uint8_t)~RAY_ATTR_HNSW;
+                    return false;
+                }
+                v->i64 = (int64_t)(uintptr_t)dup;  /* v now points at its own clone */
+            }
+            return true;
+        }
+        if (ray_atom_owns_obj(v) && v->obj && !RAY_IS_ERR(v->obj))
+            ray_retain(v->obj);
+        return true;
+    }
+
+    if (v->attrs & RAY_ATTR_SLICE) {
+        if (v->slice_parent && !RAY_IS_ERR(v->slice_parent))
+            ray_retain(v->slice_parent);
+        return true;
+    }
+
+    if (v->type == RAY_INDEX) {
+        ray_index_t* ix = ray_index_payload(v);
+        ray_index_retain_payload(ix);
+        ray_index_retain_saved(ix);
+        return true;
+    }
+
+    if (v->attrs & RAY_ATTR_HAS_INDEX) {
+        if (v->index && !RAY_IS_ERR(v->index))
+            ray_retain(v->index);
+        return true;  /* mirrors the release path: ext_nullmap/str_pool are not touched */
+    }
+
+    if ((v->attrs & RAY_ATTR_NULLMAP_EXT) &&
+        v->ext_nullmap && !RAY_IS_ERR(v->ext_nullmap))
+        ray_retain(v->ext_nullmap);
+
+    if (v->type == RAY_STR && v->str_pool && !RAY_IS_ERR(v->str_pool))
+        ray_retain(v->str_pool);
+
+    if (RAY_IS_PARTED(v->type)) {
+        int64_t n_segs = v->len;
+        ray_t** segs = (ray_t**)ray_data(v);
+        for (int64_t i = 0; i < n_segs; i++) {
+            if (segs[i] && !RAY_IS_ERR(segs[i]))
+                ray_retain(segs[i]);
+        }
+        return true;
+    }
+
+    if (v->type == RAY_MAPCOMMON) {
+        ray_t** ptrs = (ray_t**)ray_data(v);
+        if (ptrs[0] && !RAY_IS_ERR(ptrs[0])) ray_retain(ptrs[0]);
+        if (ptrs[1] && !RAY_IS_ERR(ptrs[1])) ray_retain(ptrs[1]);
+        return true;
+    }
+
+    if (v->type == RAY_TABLE || v->type == RAY_DICT) {
+        ray_t** slots = (ray_t**)ray_data(v);
+        if (slots[0] && !RAY_IS_ERR(slots[0])) ray_retain(slots[0]);
+        if (slots[1] && !RAY_IS_ERR(slots[1])) ray_retain(slots[1]);
+        return true;
+    }
+
+    if (v->type == RAY_LIST) {
+        ray_t** ptrs = (ray_t**)ray_data(v);
+        for (int64_t i = 0; i < v->len; i++) {
+            ray_t* child = ptrs[i];
+            if (child && !RAY_IS_ERR(child)) ray_retain(child);
+        }
+    }
+    return true;
+}
+
+static void ray_detach_owned_refs(ray_t* v) {  /* forget v's owned refs without releasing them */
+    if (!v || RAY_IS_ERR(v)) return;
+
+    if (ray_is_atom(v)) {
+        if (v->type == RAY_LAMBDA) {
+            ray_t** slots = (ray_t**)ray_data(v);
+            for (int i = 0; i < 4; i++) slots[i] = NULL;
+            LAMBDA_NFO(v) = NULL;
+            LAMBDA_DBG(v) = NULL;
+            return;
+        }
+        if (v->type == RAY_LAZY) {
+            RAY_LAZY_GRAPH(v) = NULL;
+            RAY_LAZY_OP(v)    = NULL;
+            return;
+        }
+        /* HNSW handle: ownership has been transferred elsewhere; stop the
+         * rc→0 cleanup hook from freeing the (now-foreign) index. */
+        if (v->type == -RAY_I64 && (v->attrs & RAY_ATTR_HNSW)) {
+            v->i64 = 0;
+            v->attrs &= (uint8_t)~RAY_ATTR_HNSW;
+            return;
+        }
+        if (ray_atom_owns_obj(v)) v->obj = NULL;
+        return;
+    }
+
+    if (v->attrs & RAY_ATTR_SLICE) {
+        v->slice_parent = NULL;
+        v->slice_offset = 0;
+        v->attrs &= (uint8_t)~RAY_ATTR_SLICE;  /* no longer a slice once the parent is gone */
+        return;
+    }
+
+    if (v->type == RAY_INDEX) {
+        ray_index_t* ix = ray_index_payload(v);
+        switch ((ray_idx_kind_t)ix->kind) {  /* null the per-kind payload pointers */
+        case RAY_IDX_HASH:  ix->u.hash.table = ix->u.hash.chain = NULL; break;
+        case RAY_IDX_SORT:  ix->u.sort.perm = NULL; break;
+        case RAY_IDX_BLOOM: ix->u.bloom.bits = NULL; break;
+        default: break;
+        }
+        memset(ix->saved_nullmap, 0, 16);
+        ix->saved_attrs = 0;
+        return;
+    }
+
+    if (v->attrs & RAY_ATTR_HAS_INDEX) {
+        v->index    = NULL;
+        v->_idx_pad = NULL;
+        v->attrs   &= (uint8_t)~RAY_ATTR_HAS_INDEX;
+        return;
+    }
+
+    if (v->attrs & RAY_ATTR_NULLMAP_EXT) {
+        v->ext_nullmap = NULL;
+        v->attrs &= (uint8_t)~RAY_ATTR_NULLMAP_EXT;
+    }
+
+    if (v->type == RAY_STR) {
+        v->str_pool = NULL;
+    }
+
+    if (RAY_IS_PARTED(v->type)) {
+        int64_t n_segs = v->len;
+        ray_t** segs = (ray_t**)ray_data(v);
+        for (int64_t i = 0; i < n_segs; i++)
+            segs[i] = NULL;
+        return;
+    }
+
+    if (v->type == RAY_MAPCOMMON) {
+        ray_t** ptrs = (ray_t**)ray_data(v);
+        ptrs[0] = NULL;
+        ptrs[1] = NULL;
+        return;
+    }
+
+    if (v->type == RAY_TABLE || v->type == RAY_DICT) {
+        ray_t** slots = (ray_t**)ray_data(v);
+        slots[0] = NULL;
+        slots[1] = NULL;
+        v->len = 0;
+        return;
+    }
+
+    if (v->type == RAY_LIST) {
+        v->len = 0;  /* zero length: the release loop then visits no children */
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * ray_alloc
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_alloc(size_t data_size) {  /* allocate a block for data_size payload bytes; NULL on failure */
+    ray_heap_t* h = ray_tl_heap;
+    if (RAY_UNLIKELY(!h)) {
+        ray_heap_init();
+        h = ray_tl_heap;
+        if (!h) return NULL;
+    }
+
+    uint8_t order = ray_order_for_size(data_size);
+    if (order > RAY_HEAP_MAX_ORDER) return NULL;
+
+    /* Slab fast path */
+    if (RAY_LIKELY(IS_SLAB_ORDER(order))) {
+        int idx = SLAB_INDEX(order);
+        if (RAY_LIKELY(h->slabs[idx].count > 0)) {
+            ray_t* v = h->slabs[idx].stack[--h->slabs[idx].count];
+
+            /* Zero full 32-byte header (hot path).
+             * Nullmap (bytes 0-15) must be cleared for null-bit correctness. */
+            memset(v, 0, 32);
+            v->order = order;
+            if (RAY_UNLIKELY(ray_rc_sync))
+                ray_atomic_store(&v->rc, 1);
+            else
+                v->rc = 1;
+
+            RAY_STAT(h->stats.alloc_count++);
+            RAY_STAT(h->stats.slab_hits++);
+            RAY_STAT(h->stats.bytes_allocated += BSIZEOF(order));
+            RAY_STAT(h->stats.peak_bytes = h->stats.bytes_allocated > h->stats.peak_bytes
+                ? h->stats.bytes_allocated : h->stats.peak_bytes);
+            return v;
+        }
+    }
+
+    /* Find free block via avail bitmask.
+     * Avail bits can be stale from cross-heap fl_remove, so we loop
+     * to find a genuinely non-empty freelist. */
+    uint64_t candidates = h->avail & (UINT64_MAX << order);  /* orders >= requested with a set bit */
+
+    if (RAY_UNLIKELY(candidates == 0)) {
+        heap_flush_foreign(h, false);  /* returns immediately when false: foreign blocks wait for the GC flush */
+
+        candidates = h->avail & (UINT64_MAX << order);
+
+        if (candidates == 0) {
+            if (!heap_add_pool(h, order)) return NULL;
+            candidates = h->avail & (UINT64_MAX << order);
+            if (candidates == 0) return NULL;
+        }
+    }
+
+    /* Scan past stale avail bits (cross-heap fl_remove may have emptied lists) */
+    uint8_t found_order;
+    for (;;) {
+        if (candidates == 0) {
+            if (!heap_add_pool(h, order)) return NULL;
+            candidates = h->avail & (UINT64_MAX << order);
+            if (candidates == 0) return NULL;
+        }
+        found_order = (uint8_t)__builtin_ctzll(candidates);  /* lowest candidate order */
+        if (!fl_empty(&h->freelist[found_order])) break;
+        /* Clear stale bit and try next */
+        h->avail &= ~(1ULL << found_order);
+        candidates &= ~(1ULL << found_order);
+    }
+
+    /* Pop from circular sentinel freelist */
+    ray_fl_head_t* head = &h->freelist[found_order];
+    ray_t* blk = head->fl_next;
+    fl_remove(blk);
+    if (fl_empty(head))
+        h->avail &= ~(1ULL << found_order);
+
+    /* Split down to requested order */
+    heap_split_block(h, blk, order, found_order);
+
+    /* Zero ray_t header and set metadata */
+    memset(blk, 0, 32);
+    blk->mmod  = 0;
+    blk->order = order;
+    if (RAY_UNLIKELY(ray_rc_sync))
+        ray_atomic_store(&blk->rc, 1);
+    else
+        blk->rc = 1;
+
+    RAY_STAT(h->stats.alloc_count++);
+    RAY_STAT(h->stats.bytes_allocated += BSIZEOF(order));
+    RAY_STAT(h->stats.peak_bytes = h->stats.bytes_allocated > h->stats.peak_bytes
+        ? h->stats.bytes_allocated : h->stats.peak_bytes);
+
+    return blk;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_free
+ * -------------------------------------------------------------------------- */
+
+void ray_free(ray_t* v) {  /* release v's owned refs, then return its block to the allocator */
+    if (!v || RAY_IS_ERR(v)) return;
+    if (v->attrs & RAY_ATTR_ARENA) return;  /* arena-owned, bulk-freed */
+
+    /* Guard: keep rc=1 while releasing children so buddy coalescing
+     * won't merge this block prematurely (it checks buddy_rc==0). */
+    ray_atomic_store(&v->rc, 1);
+
+    ray_release_owned_refs(v);
+
+    ray_heap_t* h = ray_tl_heap;
+
+    /* File-mapped: munmap */
+    if (v->mmod == 1) {
+        if (v->type == RAY_TABLE || v->type == RAY_DICT || v->type == RAY_LIST) return;
+        if (v->type > 0 && v->type < RAY_TYPE_COUNT) {
+            uint8_t esz = ray_sym_elem_size(v->type, v->attrs);
+            size_t data_size = 32 + (size_t)v->len * esz;  /* header + elements */
+            if (v->attrs & RAY_ATTR_NULLMAP_EXT)
+                data_size += ((size_t)v->len + 7) / 8;  /* plus one null bit per element */
+            size_t mapped_size = (data_size + 4095) & ~(size_t)4095;  /* round up to 4096 */
+            ray_vm_unmap_file(v, mapped_size);
+        } else {
+            ray_vm_unmap_file(v, 4096);
+        }
+        if (h) RAY_STAT(h->stats.free_count++);
+        return;
+    }
+
+    /* Legacy mmod==2 guard */
+    if (v->mmod == 2) return;
+
+    if (!h) return;
+
+    uint8_t order = v->order;
+
+    if (order < RAY_ORDER_MIN || order > RAY_HEAP_MAX_ORDER) return;  /* implausible order: leave the block alone */
+
+    size_t block_size = BSIZEOF(order);
+
+    /* O(1) ownership check via pool header heap_id.
+     * ray_pool_of() derives pool base in O(1) via self-aligned AND mask.
+     * Pool header stores heap_id stamped at pool creation. */
+    ray_pool_hdr_t* phdr = ray_pool_of(v);
+    if (!phdr) return;
+    bool is_local = (phdr->heap_id == h->id);
+
+    /* Slab fast path (same heap only) */
+    if (IS_SLAB_ORDER(order) && is_local) {
+        int idx = SLAB_INDEX(order);
+        if (h->slabs[idx].count < RAY_SLAB_CACHE_SIZE) {
+            /* Mark rc=1 so buddy coalescing skips slab-cached blocks.
+             * Blocks freed via ray_release arrive with rc=0; without this,
+             * a buddy being freed would see rc==0 and incorrectly merge
+             * with the slab-cached block, causing overlapping allocations.
+             * Must be atomic: buddy coalescing on another thread reads rc. */
+            ray_atomic_store(&v->rc, 1);
+            h->slabs[idx].stack[h->slabs[idx].count++] = v;
+            RAY_STAT(h->stats.free_count++);
+            RAY_STAT(h->stats.bytes_allocated -= block_size);
+            return;
+        }
+    }
+
+    /* Foreign: different heap — enqueue to foreign list for later
+     * return to the owner during GC (flush with return_to_owner=true).
+     * Do NOT adjust any heap's bytes_allocated here: the block stays
+     * counted on the owning heap until properly returned and coalesced. */
+    if (!is_local) {
+        v->fl_next = h->foreign;  /* push onto the singly linked foreign list */
+        h->foreign = v;
+        RAY_STAT(h->stats.free_count++);
+        return;
+    }
+
+    /* Local block — coalesce with buddy */
+    heap_coalesce(h, v, (uintptr_t)phdr, phdr->pool_order);
+
+    RAY_STAT(h->stats.free_count++);
+    RAY_STAT(h->stats.bytes_allocated -= block_size);
+}
+
+/* --------------------------------------------------------------------------
+ * ray_alloc_copy
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_alloc_copy(ray_t* v) {  /* shallow copy with rc=1; owned refs are retained */
+    if (!v || RAY_IS_ERR(v)) return NULL;
+    size_t data_size;  /* payload bytes copied after the 32B header */
+    if (ray_is_atom(v)) {
+        /* NOTE(review): header-only copy, yet lambda slots live in ray_data — confirm lambdas never reach this path. */
+        data_size = 0;
+    } else if (v->type == RAY_TABLE || v->type == RAY_DICT) {
+        data_size = 2 * sizeof(ray_t*);
+    } else if (RAY_IS_PARTED(v->type) || v->type == RAY_MAPCOMMON) {
+        int64_t n_ptrs = v->len;
+        if (v->type == RAY_MAPCOMMON) n_ptrs = 2;
+        if (n_ptrs < 0 || (uint64_t)n_ptrs > SIZE_MAX / sizeof(ray_t*))
+            return ray_error("oom", NULL);  /* same overflow guard as the LIST branch */
+        data_size = (size_t)n_ptrs * sizeof(ray_t*);
+    } else if (v->type == RAY_LIST) {
+        /* RAY_LIST has type==0; the generic branch below (t <= 0) would give
+         * data_size=0 and lose every element pointer.  Handle explicitly. */
+        if (v->len < 0 || (uint64_t)v->len > SIZE_MAX / sizeof(ray_t*))
+            return ray_error("oom", NULL);
+        data_size = (size_t)ray_len(v) * sizeof(ray_t*);
+    } else {
+        int8_t t = ray_type(v);
+        if (t <= 0 || t >= RAY_TYPE_COUNT)
+            data_size = 0;
+        else {
+            uint8_t esz = ray_sym_elem_size(t, v->attrs);
+            if (v->len < 0 || (esz > 0 && (uint64_t)v->len > SIZE_MAX / esz))
+                return ray_error("oom", NULL);
+            data_size = (size_t)ray_len(v) * esz;
+        }
+    }
+    ray_t* copy = ray_alloc(data_size);
+    if (!copy) return NULL;
+    uint8_t new_order = copy->order;
+    uint8_t new_mmod  = copy->mmod;
+    memcpy(copy, v, 32 + data_size);
+    copy->mmod  = new_mmod;
+    copy->order = new_order;
+    /* The copy is heap-allocated even when v is arena-owned; without this
+     * ray_free would return early on the inherited flag and never free it. */
+    copy->attrs &= (uint8_t)~RAY_ATTR_ARENA;
+    if (RAY_UNLIKELY(ray_rc_sync))
+        ray_atomic_store(&copy->rc, 1);
+    else
+        copy->rc = 1;
+    if (!ray_retain_owned_refs(copy)) {
+        /* Deep-clone failed (e.g. HNSW index OOM); the copy was neutralized. */
+        ray_free(copy);
+        return ray_error("oom", NULL);
+    }
+    return copy;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_scratch_alloc / ray_scratch_realloc
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_scratch_alloc(size_t data_size) {
+    return ray_alloc(data_size);  /* scratch blocks come straight from ray_alloc */
+}
+
+ray_t* ray_scratch_realloc(ray_t* v, size_t new_data_size) {  /* move v into a fresh block; v is left untouched on failure */
+    ray_t* new_v = ray_alloc(new_data_size);
+    if (!new_v) return NULL;
+    if (v && !RAY_IS_ERR(v)) {
+        size_t old_data;  /* payload bytes currently held by v */
+        if (ray_is_atom(v))
+            old_data = 0;
+        else if (v->type == RAY_LIST) {
+            if (v->len < 0) { old_data = 0; }
+            else old_data = (size_t)ray_len(v) * sizeof(ray_t*);
+        } else if (v->type == RAY_TABLE || v->type == RAY_DICT) {
+            old_data = 2 * sizeof(ray_t*);
+        } else if (RAY_IS_PARTED(v->type) || v->type == RAY_MAPCOMMON) {
+            int64_t n_ptrs = v->len;
+            if (v->type == RAY_MAPCOMMON) n_ptrs = 2;
+            if (n_ptrs < 0) n_ptrs = 0;
+            old_data = (size_t)n_ptrs * sizeof(ray_t*);
+        } else {
+            int8_t t = ray_type(v);
+            old_data = (t > 0 && t < RAY_TYPE_COUNT && v->len >= 0) ?
+                       (size_t)ray_len(v) * ray_sym_elem_size(t, v->attrs) : 0;
+        }
+        /* Clamp old_data to actual allocation size */
+        if (v->mmod == 0 && v->order >= RAY_ORDER_MIN) {
+            size_t alloc_data = BSIZEOF(v->order) - 32;
+            if (old_data > alloc_data) old_data = alloc_data;
+        }
+        size_t copy_data = old_data < new_data_size ? old_data : new_data_size;  /* never overrun the new block */
+        uint8_t new_mmod = new_v->mmod;
+        uint8_t new_order = new_v->order;
+        memcpy(new_v, v, 32 + copy_data);  /* NOTE(review): also copies attrs, incl. RAY_ATTR_ARENA, on which ray_free returns early — confirm */
+        new_v->mmod = new_mmod;
+        new_v->order = new_order;
+        if (RAY_UNLIKELY(ray_rc_sync))
+            ray_atomic_store(&new_v->rc, 1);
+        else
+            new_v->rc = 1;
+        /* Ownership transfers via memcpy — no retain needed on new_v.
+         * Detach nulls old pointers so ray_free won't double-release. */
+        if (!(v->attrs & RAY_ATTR_ARENA)) {
+            ray_detach_owned_refs(v);
+            ray_free(v);
+        }
+    }
+    return new_v;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_mem_stats
+ * -------------------------------------------------------------------------- */
+
+/* Fill `*out` with a snapshot of allocator statistics.
+ * The per-heap counters come from the calling thread's heap (all zero when
+ * the thread has none); sys_current / sys_peak are then overwritten with the
+ * values reported by ray_sys_get_stat(). */
+void ray_mem_stats(ray_mem_stats_t* out) {
+    ray_heap_t* heap = ray_tl_heap;
+    if (!heap)
+        memset(out, 0, sizeof(*out));
+    else
+        *out = heap->stats;
+
+    int64_t sys_cur = 0;
+    int64_t sys_max = 0;
+    ray_sys_get_stat(&sys_cur, &sys_max);
+    out->sys_peak    = (size_t)sys_max;
+    out->sys_current = (size_t)sys_cur;
+}
+
+/* --------------------------------------------------------------------------
+ * Heap lifecycle
+ * -------------------------------------------------------------------------- */
+
+/* Create the calling thread's heap if it does not have one yet.
+ *
+ * The heap struct is zero-filled memory from ray_vm_alloc(), sized up to a
+ * multiple of 4096 bytes.  It gets a heap ID, is entered into
+ * ray_heap_registry, has its freelist sentinels initialised and its swap
+ * directory resolved, and is finally published through ray_tl_heap.
+ * Fails silently (ray_tl_heap stays NULL) when the allocation fails or no
+ * heap ID is available. */
+void ray_heap_init(void) {
+    if (ray_tl_heap) return;
+
+    const size_t heap_sz = (sizeof(ray_heap_t) + 4095) & ~(size_t)4095;
+    ray_heap_t* h = (ray_heap_t*)ray_vm_alloc(heap_sz);
+    if (!h) return;
+    memset(h, 0, heap_sz);
+
+    /* Take a heap ID; give the memory back if none is left. */
+    int id = heap_id_acquire();
+    if (id < 0) {
+        ray_vm_free(h, heap_sz);
+        return;
+    }
+    h->id = (uint16_t)id;
+
+    /* Make the heap discoverable through the global registry. */
+    ray_heap_registry[h->id % RAY_HEAP_REGISTRY_SIZE] = h;
+
+    /* Every freelist starts out as an empty circular list. */
+    for (int ord = 0; ord < RAY_HEAP_FL_SIZE; ord++)
+        fl_init(&h->freelist[ord]);
+
+    /* Swap directory for file-backed pools: RAY_HEAP_SWAP when it is set,
+     * non-empty and leaves at least 16 bytes of headroom in swap_path;
+     * otherwise "./".  A trailing '/' is guaranteed so a file name can be
+     * appended directly. */
+    const char* dir = "./";
+    const char* env = getenv("RAY_HEAP_SWAP");
+    if (env && env[0] != '\0' && strlen(env) < sizeof(h->swap_path) - 16)
+        dir = env;
+
+    size_t dir_len = strlen(dir);
+    memcpy(h->swap_path, dir, dir_len + 1);   /* +1 copies the terminator */
+    if (dir_len > 0 && dir_len < sizeof(h->swap_path) - 1
+        && h->swap_path[dir_len - 1] != '/') {
+        h->swap_path[dir_len]     = '/';
+        h->swap_path[dir_len + 1] = '\0';
+    }
+
+    ray_tl_heap = h;
+}
+
+/* Tear down the calling thread's heap: unregister it, unmap every pool it
+ * tracks, free the heap struct and finally release its heap ID.
+ * Does nothing when the thread has no heap. */
+void ray_heap_destroy(void) {
+    ray_heap_t* h = ray_tl_heap;
+    if (!h) return;
+
+    const uint16_t saved_id = h->id;
+
+    /* Drop the registry entry first. */
+    ray_heap_registry[saved_id % RAY_HEAP_REGISTRY_SIZE] = NULL;
+
+    /* Slab caches and the foreign list are deliberately NOT flushed: every
+     * pool is about to be unmapped anyway, and flushing would coalesce
+     * blocks and fl_remove buddies from other heaps' freelists, racing with
+     * concurrent worker destruction during ray_pool_free(). */
+
+    for (uint32_t i = 0; i < h->pool_count; i++) {
+        /* The original mapping base lives in the pool header, so read it
+         * before the pool goes away. */
+        void* vm_base = ((ray_pool_hdr_t*)h->pools[i].base)->vm_base;
+        ray_vm_free(vm_base, BSIZEOF(h->pools[i].pool_order));
+
+        if (!h->pools[i].backed) continue;
+
+        /* File-backed pool: close the fd and remove the tempfile so the
+         * swap directory does not accumulate orphans. */
+        if (h->pools[i].swap_fd >= 0)
+            close(h->pools[i].swap_fd);
+        if (h->pools[i].swap_path) {
+            unlink(h->pools[i].swap_path);
+            ray_sys_free(h->pools[i].swap_path);
+        }
+    }
+
+    ray_vm_free(h, (sizeof(ray_heap_t) + 4095) & ~(size_t)4095);
+    ray_tl_heap = NULL;
+
+    /* The ID becomes reusable only after all memory has been freed. */
+    heap_id_release(saved_id);
+}
+
+/* --------------------------------------------------------------------------
+ * Return worker-pool blocks from this heap's freelists to their owners.
+ *
+ * After ray_alloc flushes foreign blocks locally (coalesce + madvise),
+ * worker-pool blocks sit on main's freelists with released physical pages.
+ * This function walks the freelists, finds blocks whose pool header
+ * heap_id != ours, removes them, and inserts into the owning worker heap.
+ * Workers can then reuse their pools without allocating new ones.
+ *
+ * Iteration safety: heap_coalesce() may fl_remove the buddy of the block it
+ * is given from whatever freelist that buddy is on — including the list
+ * being walked here.  A successor pointer saved before the call can therefore
+ * end up on the owner's freelist, so the walk resumes from the predecessor
+ * instead.  The predecessor is either the sentinel or a block this walk
+ * decided to keep; kept blocks never share a pool with the block being
+ * returned, so coalescing cannot unlink them.
+ *
+ * Blocks whose registered owner is `h` itself (a pool h owns but does not
+ * track) are left in place: coalescing them would re-insert into the list
+ * being scanned.
+ *
+ * ONLY safe when workers are idle (on semaphore, ray_parallel_flag == 0).
+ * -------------------------------------------------------------------------- */
+
+static void heap_return_foreign_freelist(ray_heap_t* h) {
+    for (int order = RAY_ORDER_MIN; order < RAY_HEAP_FL_SIZE; order++) {
+        ray_fl_head_t* head = &h->freelist[order];
+        ray_t* blk = head->fl_next;
+        while (blk != (ray_t*)head) {
+            /* Found in one of h's own pools — block is local, keep it. */
+            if (heap_find_pool(h, blk) >= 0) {
+                blk = blk->fl_next;
+                continue;
+            }
+
+            /* Foreign block — find owner via pool header (GC path). */
+            ray_pool_hdr_t* phdr = ray_pool_of(blk);
+            ray_heap_t* owner = phdr
+                ? ray_heap_registry[phdr->heap_id % RAY_HEAP_REGISTRY_SIZE]
+                : NULL;
+            if (!owner || owner == h || owner->id != phdr->heap_id) {
+                /* No usable owner (or it is us): leave the block here. */
+                blk = blk->fl_next;
+                continue;
+            }
+
+            /* Remember a node that is guaranteed to survive the coalesce. */
+            ray_t* anchor = blk->fl_prev;
+            fl_remove(blk);
+
+            /* Coalesce on owner for defragmentation.  Prefer the owner's
+             * pool table; fall back to the pool header when untracked. */
+            int opidx = heap_find_pool(owner, blk);
+            uintptr_t pb;
+            uint8_t po;
+            if (opidx >= 0) {
+                pb = (uintptr_t)owner->pools[opidx].base;
+                po = owner->pools[opidx].pool_order;
+            } else {
+                pb = (uintptr_t)phdr;
+                po = phdr->pool_order;
+            }
+            heap_coalesce(owner, blk, pb, po);
+
+            /* Re-read the successor: coalescing may have unlinked it. */
+            blk = anchor->fl_next;
+        }
+        /* Checked once per order, after the walk, so the bit is also cleared
+         * when coalescing (not only our own fl_remove) emptied this list.
+         * Higher-order lists emptied the same way are handled when the outer
+         * loop reaches them. */
+        if (fl_empty(head))
+            h->avail &= ~(1ULL << order);
+    }
+}
+
+/* Garbage-collect from the calling thread's heap.
+ *
+ * Always flushes this heap's foreign list and slab caches.  When no parallel
+ * region is active (ray_parallel_flag == 0) it additionally returns foreign
+ * blocks on this heap's freelists to their owners and unmaps oversized pools
+ * that are completely free.  Does nothing when the thread has no heap. */
+void ray_heap_gc(void) {
+    ray_heap_t* h = ray_tl_heap;
+    if (!h) return;
+
+    bool safe = (atomic_load_explicit(&ray_parallel_flag, memory_order_relaxed) == 0);
+
+    /* Phase 1: Flush main heap's foreign blocks and slab caches.
+     * When safe (workers idle), return foreign blocks to their owners
+     * so worker pools become reusable. */
+    heap_flush_foreign(h, safe);
+    heap_flush_slabs(h);
+
+    if (safe) {
+        /* Phase 2: Return foreign blocks absorbed onto our freelists
+         * back to their owning worker heaps. */
+        heap_return_foreign_freelist(h);
+
+        /* Phase 3: Skip worker heaps — we cannot safely touch their
+         * foreign lists or slab caches because workers may still be
+         * between pending-- and sem_wait, calling ray_free which
+         * modifies wh->foreign and wh->slabs.  Workers flush their
+         * own foreign/slabs on their next dispatch entry.
+         * TODO: full cross-heap reclamation requires a worker
+         * quiescence barrier. */
+
+        /* Phase 4: Reclaim OVERSIZED empty pools.
+         *
+         * Standard pools (pool_order <= RAY_HEAP_POOL_ORDER) are never
+         * unmapped here, and a heap always keeps at least one pool.  Only
+         * oversized pools (pool_order > RAY_HEAP_POOL_ORDER) are candidates —
+         * these are one-off large allocations.
+         *
+         * For each candidate pool (owned by registered heap gh):
+         *   (a) sum the free bytes that gh's own freelists and slab caches
+         *       hold inside the pool's address range; the pool counts as
+         *       empty only when that sum reaches the pool's capacity.
+         *       Computing emptiness this way avoids atomic live_count
+         *       operations on the alloc/free hot path.
+         *   (b) scan every OTHER registered heap's foreign list; a single
+         *       block from this pool found there vetoes reclamation, because
+         *       that list would still point into the pool after unmapping.
+         *
+         * An empty, unreferenced pool has its blocks unlinked from gh's
+         * freelists and slab caches, is unmapped, and its entry is replaced
+         * by the last entry of gh's pool table.
+         *
+         * NOTE(review): phase 3 says worker slab caches must not be touched,
+         * yet this phase reads and rewrites gh->slabs / gh->freelist for
+         * every registered heap, not just h — confirm this is safe under the
+         * same "workers idle" assumption. */
+        for (int hid = 0; hid < RAY_HEAP_REGISTRY_SIZE; hid++) {
+            ray_heap_t* gh = ray_heap_registry[hid];
+            if (!gh) continue;
+
+            /* p advances only when the current entry is kept; after a
+             * removal the swapped-in entry is examined at the same index. */
+            for (uint32_t p = 0; p < gh->pool_count; ) {
+                ray_pool_hdr_t* phdr = (ray_pool_hdr_t*)gh->pools[p].base;
+
+                /* Skip standard pools and last-remaining pool */
+                if (phdr->pool_order <= RAY_HEAP_POOL_ORDER
+                    || gh->pool_count <= 1) {
+                    p++;
+                    continue;
+                }
+
+                /* Pool address range [pb, pe).  Capacity excludes one
+                 * minimum-order block: the first one holds the pool header
+                 * and is never free. */
+                uint8_t po = phdr->pool_order;
+                uintptr_t pb = (uintptr_t)phdr;
+                uintptr_t pe = pb + BSIZEOF(po);
+                size_t pool_capacity = BSIZEOF(po) - BSIZEOF(RAY_ORDER_MIN);
+
+                /* (a) Sum free bytes from owning heap's freelist + slabs */
+                size_t free_bytes = 0;
+                for (int ord = RAY_ORDER_MIN; ord < RAY_HEAP_FL_SIZE; ord++) {
+                    ray_fl_head_t* fh = &gh->freelist[ord];
+                    ray_t* blk = fh->fl_next;
+                    while (blk != (ray_t*)fh) {
+                        if ((uintptr_t)blk >= pb && (uintptr_t)blk < pe)
+                            free_bytes += BSIZEOF(ord);
+                        blk = blk->fl_next;
+                    }
+                }
+                /* Slab stack si caches blocks of order RAY_SLAB_MIN + si. */
+                for (int si = 0; si < RAY_SLAB_ORDERS; si++) {
+                    for (uint32_t j = 0; j < gh->slabs[si].count; j++) {
+                        ray_t* sb = gh->slabs[si].stack[j];
+                        if ((uintptr_t)sb >= pb && (uintptr_t)sb < pe)
+                            free_bytes += BSIZEOF(RAY_SLAB_MIN + si);
+                    }
+                }
+
+                /* (b) Check if ANY blocks from this pool are still in other
+                 *     heaps' foreign lists.  If so, we cannot munmap —
+                 *     those blocks are threaded into the foreign list and
+                 *     dereferencing them after munmap would crash.
+                 *     The pool is simply retried on a later GC pass. */
+                bool has_foreign = false;
+                for (int fh_id = 0; fh_id < RAY_HEAP_REGISTRY_SIZE && !has_foreign; fh_id++) {
+                    ray_heap_t* fh_heap = ray_heap_registry[fh_id];
+                    if (!fh_heap || fh_heap == gh) continue;
+                    ray_t* fb = fh_heap->foreign;
+                    while (fb) {
+                        if ((uintptr_t)fb >= pb && (uintptr_t)fb < pe) {
+                            has_foreign = true;
+                            break;
+                        }
+                        fb = fb->fl_next;
+                    }
+                }
+
+                if (free_bytes < pool_capacity || has_foreign) {
+                    p++;
+                    continue;  /* pool has live allocations or dangling foreign refs */
+                }
+
+                /* Pool is empty and no foreign-list refs — safe to munmap.
+                 * Remove blocks from owning heap's freelists and slab caches.
+                 * The successor is saved before unlinking, and the avail bit
+                 * is cleared as soon as a list runs empty. */
+                for (int ord = RAY_ORDER_MIN; ord < RAY_HEAP_FL_SIZE; ord++) {
+                    ray_fl_head_t* fh = &gh->freelist[ord];
+                    ray_t* blk = fh->fl_next;
+                    while (blk != (ray_t*)fh) {
+                        ray_t* next = blk->fl_next;
+                        if ((uintptr_t)blk >= pb && (uintptr_t)blk < pe) {
+                            fl_remove(blk);
+                            if (fl_empty(fh))
+                                gh->avail &= ~(1ULL << ord);
+                        }
+                        blk = next;
+                    }
+                }
+                /* Compact each slab stack in place, dropping entries that
+                 * point into the pool being removed. */
+                for (int si = 0; si < RAY_SLAB_ORDERS; si++) {
+                    uint32_t dst = 0;
+                    for (uint32_t j = 0; j < gh->slabs[si].count; j++) {
+                        ray_t* sb = gh->slabs[si].stack[j];
+                        if ((uintptr_t)sb >= pb && (uintptr_t)sb < pe)
+                            continue;
+                        gh->slabs[si].stack[dst++] = sb;
+                    }
+                    gh->slabs[si].count = dst;
+                }
+
+                /* phdr lives inside the pool: vm_base is read here, before
+                 * the mapping disappears, and phdr is not used afterwards. */
+                ray_vm_free(phdr->vm_base, BSIZEOF(po));
+                /* File-backed pools also need their fd closed and tempfile
+                 * unlinked, mirroring the heap_destroy path. */
+                if (gh->pools[p].backed) {
+                    if (gh->pools[p].swap_fd >= 0) close(gh->pools[p].swap_fd);
+                    if (gh->pools[p].swap_path) {
+                        unlink(gh->pools[p].swap_path);
+                        ray_sys_free(gh->pools[p].swap_path);
+                    }
+                }
+                /* Swap-remove: the last entry takes the freed slot. */
+                gh->pools[p] = gh->pools[--gh->pool_count];
+                /* Don't increment p — check swapped entry */
+            }
+        }
+    }
+
+}
+
+/* Hand the bulk of every large free block on the calling thread's freelists
+ * to ray_vm_release().
+ *
+ * Only orders >= 13 (blocks of at least 8192 bytes) are visited.  The first
+ * 4096 bytes of each block are kept because they hold the free-list links
+ * this walk keeps following; everything after them is released.
+ * NOTE(review): 4096 looks like an assumed page size — confirm behaviour on
+ * targets with larger pages. */
+void ray_heap_release_pages(void) {
+    ray_heap_t* h = ray_tl_heap;
+    if (!h) return;
+
+    for (int order = 13; order < RAY_HEAP_FL_SIZE; order++) {
+        const size_t bsize = BSIZEOF(order);
+        ray_fl_head_t* sentinel = &h->freelist[order];
+
+        for (ray_t* cur = sentinel->fl_next; cur != (ray_t*)sentinel; cur = cur->fl_next) {
+            if (bsize > 4096)
+                ray_vm_release((char*)cur + 4096, bsize - 4096);
+        }
+    }
+}
+
+/* Absorb heap `src` into the calling thread's heap (dst).
+ *
+ * Moves, in order: statistics, slab-cached blocks, blocks on src's foreign
+ * list, all freelists, and finally the pool table.  Afterwards src holds no
+ * slabs, no foreign blocks, empty freelists and zero pools; the src struct
+ * itself is NOT freed here.  Does nothing when either heap is NULL. */
+void ray_heap_merge(ray_heap_t* src) {
+    ray_heap_t* dst = ray_tl_heap;
+    if (!dst || !src) return;
+
+    /* Merge stats: dst inherits src's outstanding allocations so that
+     * future local frees of those blocks correctly decrement dst.
+     * Counters are summed; peak_bytes takes the larger of the two. */
+    dst->stats.alloc_count     += src->stats.alloc_count;
+    dst->stats.free_count      += src->stats.free_count;
+    dst->stats.bytes_allocated += src->stats.bytes_allocated;
+    dst->stats.slab_hits       += src->stats.slab_hits;
+    dst->stats.direct_count    += src->stats.direct_count;
+    dst->stats.direct_bytes    += src->stats.direct_bytes;
+    if (src->stats.peak_bytes > dst->stats.peak_bytes)
+        dst->stats.peak_bytes = src->stats.peak_bytes;
+
+    /* Transfer slabs: fit into dst cache, coalesce overflow */
+    for (int i = 0; i < RAY_SLAB_ORDERS; i++) {
+        /* Pop from src and push onto dst until dst's stack is full. */
+        while (src->slabs[i].count > 0 && dst->slabs[i].count < RAY_SLAB_CACHE_SIZE)
+            dst->slabs[i].stack[dst->slabs[i].count++] =
+                src->slabs[i].stack[--src->slabs[i].count];
+        /* Whatever did not fit goes to dst's freelists via coalescing. */
+        while (src->slabs[i].count > 0) {
+            ray_t* blk = src->slabs[i].stack[--src->slabs[i].count];
+            /* src's pools are transferred only at the end of this function,
+             * so blocks from them are resolved through the pool header. */
+            int pidx = heap_find_pool(dst, blk);
+            uintptr_t pb;
+            uint8_t po;
+            if (pidx >= 0) {
+                pb = (uintptr_t)dst->pools[pidx].base;
+                po = dst->pools[pidx].pool_order;
+            } else {
+                ray_pool_hdr_t* phdr = ray_pool_of(blk);
+                if (!phdr) continue;   /* no pool header: block is dropped */
+                pb = (uintptr_t)phdr;
+                po = phdr->pool_order;
+            }
+            heap_coalesce(dst, blk, pb, po);
+        }
+    }
+
+    /* Free foreign blocks via coalescing.  The successor is read before
+     * heap_coalesce because the block's link fields are reused once it is
+     * on a freelist. */
+    ray_t* fblk = src->foreign;
+    while (fblk) {
+        ray_t* next = fblk->fl_next;
+        int pidx = heap_find_pool(dst, fblk);
+        uintptr_t pb;
+        uint8_t po;
+        if (pidx >= 0) {
+            pb = (uintptr_t)dst->pools[pidx].base;
+            po = dst->pools[pidx].pool_order;
+        } else {
+            ray_pool_hdr_t* phdr = ray_pool_of(fblk);
+            if (!phdr) { fblk = next; continue; }
+            pb = (uintptr_t)phdr;
+            po = phdr->pool_order;
+        }
+        heap_coalesce(dst, fblk, pb, po);
+        fblk = next;
+    }
+    src->foreign = NULL;
+
+    /* Merge freelists: circular list splice (src chain into dst chain) */
+    for (int i = RAY_ORDER_MIN; i < RAY_HEAP_FL_SIZE; i++) {
+        if (fl_empty(&src->freelist[i])) continue;
+
+        ray_fl_head_t* src_head = &src->freelist[i];
+        ray_fl_head_t* dst_head = &dst->freelist[i];
+
+        /* Splice: src's chain [src_first...src_last] into dst after sentinel */
+        ray_t* src_first = src_head->fl_next;
+        ray_t* src_last  = src_head->fl_prev;
+        ray_t* dst_first = dst_head->fl_next;
+
+        /* src_first goes after dst sentinel */
+        dst_head->fl_next = src_first;
+        src_first->fl_prev = (ray_t*)dst_head;
+
+        /* src_last connects to old dst_first (the dst sentinel itself when
+         * dst's list was empty, which closes the circle correctly) */
+        src_last->fl_next = dst_first;
+        dst_first->fl_prev = src_last;
+
+        dst->avail |= (1ULL << i);
+
+        /* Reset src sentinel to empty */
+        fl_init(src_head);
+    }
+
+    src->avail = 0;
+
+    /* Transfer pool entries: each pool header's heap_id is rewritten to
+     * dst's ID and the entry is appended to dst's pool table.
+     * NOTE(review): when dst's table is already full, the else-branch still
+     * rewrites heap_id but the entry is not stored anywhere (src->pool_count
+     * is reset below), so the pool ends up tracked by no heap; with NDEBUG
+     * the assert does not fire either.  Confirm this degradation is the
+     * intended one. */
+    for (uint32_t i = 0; i < src->pool_count; i++) {
+        if (dst->pool_count < RAY_MAX_POOLS) {
+            ray_pool_hdr_t* hdr = (ray_pool_hdr_t*)src->pools[i].base;
+            hdr->heap_id = dst->id;
+            dst->pools[dst->pool_count++] = src->pools[i];
+        } else {
+            /* Pool overflow: only triggers at RAY_MAX_POOLS (512 pools = 16GB+).
+             * Fix ownership so blocks free to the correct heap. */
+            ray_pool_hdr_t* hdr = (ray_pool_hdr_t*)src->pools[i].base;
+            hdr->heap_id = dst->id;
+            assert(0 && "ray_heap_merge: pool overflow at RAY_MAX_POOLS");
+        }
+    }
+    src->pool_count = 0;
+}
+
+/* --------------------------------------------------------------------------
+ * Public foreign-blocks flush
+ * -------------------------------------------------------------------------- */
+
+/* Public wrapper around heap_flush_foreign() for the calling thread's heap.
+ * The second argument tells the flush whether no parallel region is active
+ * (ray_parallel_flag == 0).  Does nothing when the thread has no heap. */
+void ray_heap_flush_foreign(void) {
+    ray_heap_t* heap = ray_tl_heap;
+    if (heap) {
+        bool workers_idle =
+            (atomic_load_explicit(&ray_parallel_flag, memory_order_relaxed) == 0);
+        heap_flush_foreign(heap, workers_idle);
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Pending-merge queue (lock-free LIFO)
+ *
+ * Workers that are torn down push their heap onto this queue instead of
+ * destroying it immediately. The main thread drains the queue, merging
+ * each pending heap into its own and then destroying it.
+ * -------------------------------------------------------------------------- */
+
+/* Queue `heap` for a later merge by pushing it onto the global pending list.
+ * The heap is removed from the registry first so that no new foreign blocks
+ * target it.  NULL is ignored. */
+void ray_heap_push_pending(ray_heap_t* heap) {
+    if (!heap) return;
+
+    ray_heap_registry[heap->id % RAY_HEAP_REGISTRY_SIZE] = NULL;
+
+    /* Lock-free LIFO push: link to the head we last observed, then try to
+     * install ourselves as the new head.  A failed CAS refreshes `seen` with
+     * the current head and the loop tries again. */
+    ray_heap_t* seen = atomic_load_explicit(&ray_heap_pending_merge,
+                                            memory_order_relaxed);
+    do {
+        heap->pending_next = seen;
+    } while (!atomic_compare_exchange_weak_explicit(
+                 &ray_heap_pending_merge, &seen, heap,
+                 memory_order_release, memory_order_relaxed));
+}
+
+/* Merge every heap queued by ray_heap_push_pending() into the calling
+ * thread's heap, then free each heap struct and release its heap ID.
+ *
+ * The whole queue is taken in one atomic exchange, so heaps pushed while the
+ * drain is running are picked up by the next call.  Without a thread-local
+ * heap the queue is left untouched. */
+void ray_heap_drain_pending(void) {
+    /* ray_heap_merge() returns immediately when the caller has no heap.
+     * Draining in that state would free each pending heap struct and recycle
+     * its ID while its pools are still mapped and tracked by nobody, so keep
+     * the queue intact for a later drain from a thread that owns a heap. */
+    if (!ray_tl_heap) return;
+
+    /* Atomically steal the entire pending list */
+    ray_heap_t* pending = atomic_exchange_explicit(
+        &ray_heap_pending_merge, NULL,
+        memory_order_acquire);
+    while (pending) {
+        /* Read the link and the ID before the struct is freed. */
+        ray_heap_t* next = pending->pending_next;
+        ray_heap_merge(pending);
+        /* Free the heap struct (pools already transferred by merge) */
+        uint16_t saved_id = pending->id;
+        size_t heap_sz = (sizeof(ray_heap_t) + 4095) & ~(size_t)4095;
+        ray_vm_free(pending, heap_sz);
+        heap_id_release(saved_id);
+        pending = next;
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Scratch arena: bump allocator backed by buddy-allocated 64KB blocks
+ * -------------------------------------------------------------------------- */
+
+/* Bump-allocate `nbytes` (rounded up to a multiple of 16) from arena `a`.
+ *
+ * When the current backing block cannot hold the request, a new one is
+ * obtained from ray_alloc(): a standard arena block, or a block sized for the
+ * request when it is larger than that.  Any room left in the previous block
+ * is abandoned.
+ *
+ * Returns a pointer to the reserved bytes, or NULL when the request is too
+ * large to round up, the arena already holds RAY_ARENA_MAX_BACKING blocks,
+ * or ray_alloc() fails. */
+void* ray_scratch_arena_push(ray_scratch_arena_t* a, size_t nbytes) {
+    /* Rounding up would wrap to a tiny value for sizes within 15 bytes of
+     * SIZE_MAX and hand back far less memory than requested — refuse. */
+    if (nbytes > SIZE_MAX - 15) return NULL;
+
+    /* 16-byte alignment */
+    nbytes = (nbytes + 15) & ~(size_t)15;
+
+    /* Compare against the room that is left instead of forming
+     * a->ptr + nbytes, which can overflow the pointer for large requests. */
+    if (!RAY_LIKELY(a->ptr != NULL && nbytes <= (size_t)(a->end - a->ptr))) {
+        /* Need a new backing block */
+        if (a->n_backing >= RAY_ARENA_MAX_BACKING) return NULL;
+
+        size_t block_data = BSIZEOF(RAY_ARENA_BLOCK_ORDER) - 32;
+        /* If request exceeds standard block, allocate exact-fit */
+        size_t alloc_size = nbytes > block_data ? nbytes : block_data;
+        ray_t* blk = ray_alloc(alloc_size);
+        if (!blk) return NULL;
+
+        a->backing[a->n_backing++] = blk;
+        /* Usable range runs from the payload start to the end of the block. */
+        a->ptr = (char*)ray_data(blk);
+        a->end = (char*)blk + BSIZEOF(blk->order);
+    }
+
+    void* ret = a->ptr;
+    a->ptr += nbytes;
+    return ret;
+}
+
+/* Release every backing block of arena `a` with ray_free() and return the
+ * arena to its empty state (no blocks, no current bump range). */
+void ray_scratch_arena_reset(ray_scratch_arena_t* a) {
+    int idx = 0;
+    while (idx < a->n_backing) {
+        ray_free(a->backing[idx]);
+        idx++;
+    }
+    a->end = NULL;
+    a->ptr = NULL;
+    a->n_backing = 0;
+}
+
+/* --------------------------------------------------------------------------
+ * Parallel begin / end
+ * -------------------------------------------------------------------------- */
+
+/* Enter a parallel region: raise ray_parallel_flag (sequentially consistent
+ * store). */
+void ray_parallel_begin(void) {
+    atomic_store_explicit(&ray_parallel_flag, 1, memory_order_seq_cst);
+}
+/* Leave a parallel region: clear ray_parallel_flag, then run ray_heap_gc()
+ * on the calling thread. */
+void ray_parallel_end(void) {
+    atomic_store_explicit(&ray_parallel_flag, 0, memory_order_seq_cst);
+    ray_heap_gc();
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/mem/heap.h b/crates/rayforce-sys/vendor/rayforce/src/mem/heap.h
new file mode 100644
index 0000000..093f3c4
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/mem/heap.h
@@ -0,0 +1,404 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_HEAP_H
+#define RAY_HEAP_H
+
+/*
+ * heap.h -- Rayforce-style per-thread heap allocator (zero-prefix layout).
+ *
+ * Each thread owns one ray_heap_t. Blocks are allocated from self-aligned
+ * mmap'd pools via buddy splitting. ray_t IS the block — no prefix.
+ *
+ * Pool metadata (heap_id, pool_order) is stored in a pool header at
+ * offset 0 of each self-aligned pool (first min-block reserved).
+ * Pool base is derived in O(1): ptr & ~(pool_size - 1).
+ *
+ * Free-list prev/next overlay nullmap bytes 0-15 of ray_t (unused when free).
+ * rc == 0 indicates a free block (replaces the old ray_blk_t.used flag).
+ *
+ * Cross-thread free uses a foreign_blocks list (checked via pool heap_id).
+ */
+
+#include <rayforce.h>
+#include "core/platform.h"
+#include "ops/ops.h"
+#include <stdint.h>
+
+/* ===== Attribute Flags =====
+ *
+ * The `attrs` byte in ray_t is type-namespaced: the same bit positions carry
+ * different meanings depending on the object's type tag.
+ *
+ *   Bits 0x01-0x03  RAY_SYM vectors:  sym index width (RAY_SYM_W8/W16/W32/W64)
+ *   Bits 0x01-0x10  function objects (RAY_UNARY/BINARY/VARY): RAY_FN_* flags
+ *   Bit  0x04       -RAY_I64 atoms:  RAY_ATTR_HNSW (HNSW handle in .i64)
+ *   Bit  0x04       I32/I64 vectors: RAY_ATTR_HAS_LINK (target-table sym ID in nullmap bytes 8-15)
+ *   Bit  0x08       vectors:         RAY_ATTR_HAS_INDEX (index ray_t* in nullmap[0..7])
+ *   Bit  0x10       vectors:         RAY_ATTR_SLICE
+ *   Bit  0x20       vectors:         RAY_ATTR_NULLMAP_EXT
+ *   Bit  0x20       -RAY_SYM:        RAY_ATTR_NAME (variable reference)
+ *   Bit  0x40       vectors:         RAY_ATTR_HAS_NULLS
+ *   Bit  0x80       all types:       RAY_ATTR_ARENA (arena-allocated, no refcount)
+ *
+ * Overlapping bit values are safe because consumers always check the type tag
+ * before interpreting attrs.
+ */
+
+#ifndef RAY_ATTR_SLICE
+#define RAY_ATTR_SLICE        0x10
+#endif
+#define RAY_ATTR_NULLMAP_EXT  0x20
+#define RAY_ATTR_HAS_NULLS    0x40
+#define RAY_ATTR_ARENA        0x80
+
+/* I64 atom carries an owning ray_hnsw_t* in its .i64 slot.
+ * Checked by HNSW builtins before dereferencing.  User must (hnsw-free h). */
+#define RAY_ATTR_HNSW         0x04
+
+/* Vector is a linked column.  The 8 bytes of the nullmap union at offset
+ * 8 (i.e. parent->_idx_pad / parent->slice_offset / parent->sym_dict /
+ * parent->str_pool slot, depending on which arm is in use) hold an int64
+ * sym ID naming the target table.  Resolved against the global env at
+ * deref time.  Restricted to RAY_I32 / RAY_I64 vectors — STR/SYM/SLICE
+ * already use bytes 8-15 for their own pointers/data so HAS_LINK on
+ * those types would alias.
+ *
+ * Coexists with HAS_INDEX: bytes 0-7 carry the index pointer (or saved
+ * nullmap), bytes 8-15 carry the link sym; both bits can be set on the
+ * same column.  A linked vec with nulls is forced to RAY_ATTR_NULLMAP_EXT
+ * because the inline 128-bit bitmap would alias the link-target slot.
+ *
+ * Same numeric value as RAY_ATTR_HNSW (HNSW handles are -RAY_I64 atoms,
+ * the type tag disambiguates). */
+#define RAY_ATTR_HAS_LINK     0x04
+
+/* Vector carries an attached accelerator index in nullmap[0..7] (a ray_t*
+ * of type RAY_INDEX).  The original 16-byte nullmap union content (inline
+ * bitmap, ext_nullmap, str_ext_null/str_pool, sym_dict) is preserved inside
+ * the index ray_t and restored on detach.
+ *
+ * Attribute-bit invariant when HAS_INDEX is set:
+ *   - HAS_NULLS is *preserved* (not cleared).  Many call sites use it as a
+ *     cheap "do I need null-aware logic?" gate; clearing it would silently
+ *     break correctness for nullable columns.  The bit is authoritative.
+ *   - NULLMAP_EXT is *cleared*.  The parent's ext_nullmap field is now the
+ *     index pointer, not a U8 bitmap vec; readers that gate on NULLMAP_EXT
+ *     and dereference ext_nullmap directly would otherwise read garbage.
+ *     The displaced ext-nullmap pointer (if any) lives in
+ *     ix->saved_nullmap[0..7]; ix->saved_attrs records the original
+ *     NULLMAP_EXT bit for restoration on detach.
+ *
+ * Direct nullmap-byte readers (morsel iteration, ray_vec_is_null) MUST
+ * check HAS_INDEX first and route through ix->saved_nullmap / saved_attrs.
+ * See src/ops/idxop.h. */
+#define RAY_ATTR_HAS_INDEX    0x08
+
+/* ===== Internal Allocator Variants ===== */
+
+/* Copy `v` (header + payload) into a newly allocated block whose refcount
+ * starts at 1.  Can return NULL or an "oom" error value on failure. */
+ray_t*    ray_alloc_copy(ray_t* v);
+/* Scratch allocation of `data_size` payload bytes; forwards to ray_alloc(). */
+ray_t*    ray_scratch_alloc(size_t data_size);
+/* Allocate a new block of `new_data_size` payload bytes, move `v`'s header
+ * and as much payload as fits into it, and free `v` unless it is
+ * arena-backed.  Returns NULL (leaving `v` untouched) when allocation fails. */
+ray_t*    ray_scratch_realloc(ray_t* v, size_t new_data_size);
+
+/* ===== COW (Copy-on-Write) ===== */
+
+ray_t*    ray_cow(ray_t* v);
+
+/* ===== Memory Statistics ===== */
+
+/* Allocator statistics snapshot, as filled in by ray_mem_stats().
+ * All fields except sys_current / sys_peak are per-heap counters; those two
+ * are taken from ray_sys_get_stat(). */
+typedef struct {
+    size_t alloc_count;      /* ray_alloc calls */
+    size_t free_count;       /* ray_free calls */
+    size_t bytes_allocated;  /* currently allocated */
+    size_t peak_bytes;       /* high-water mark */
+    size_t slab_hits;        /* slab cache hits */
+    size_t direct_count;     /* active direct mmaps */
+    size_t direct_bytes;     /* bytes in direct mmaps */
+    size_t sys_current;      /* sys allocator: current mmap'd bytes */
+    size_t sys_peak;         /* sys allocator: peak mmap'd bytes */
+} ray_mem_stats_t;
+
+/* ===== Forward Declarations (internal types) ===== */
+
+typedef struct ray_heap      ray_heap_t;
+typedef struct ray_sym_table ray_sym_table_t;
+typedef struct ray_sym_map   ray_sym_map_t;
+typedef struct ray_task      ray_task_t;
+typedef struct ray_dispatch  ray_dispatch_t;
+
+/* ===== Heap Lifecycle ===== */
+
+/* Create the calling thread's heap; no-op when it already has one. */
+void     ray_heap_init(void);
+/* Unmap all pools of the calling thread's heap and free the heap itself. */
+void     ray_heap_destroy(void);
+/* Move src's stats, slabs, foreign blocks, freelists and pools into the
+ * calling thread's heap.  Does not free the src struct. */
+void     ray_heap_merge(ray_heap_t* src);
+/* Flush the calling thread's foreign-block list. */
+void     ray_heap_flush_foreign(void);
+/* Unregister `heap` and push it onto the lock-free pending-merge list. */
+void     ray_heap_push_pending(ray_heap_t* heap);
+/* Merge and free every heap currently on the pending-merge list. */
+void     ray_heap_drain_pending(void);
+/* NOTE(review): implementation not visible from this header — presumably
+ * maps a payload size to a block order; confirm. */
+uint8_t  ray_order_for_size(size_t data_size);
+/* Fill *out with the calling thread's heap counters plus sys allocator stats. */
+void     ray_mem_stats(ray_mem_stats_t* out);
+
+/* Flush this heap's foreign list and slab caches; when no parallel region is
+ * active, also return foreign blocks and unmap empty oversized pools. */
+void ray_heap_gc(void);
+/* Release all but the first 4096 bytes of every free block of order >= 13
+ * on the calling thread's freelists. */
+void ray_heap_release_pages(void);
+
+/* --------------------------------------------------------------------------
+ * Constants
+ * -------------------------------------------------------------------------- */
+#define RAY_HEAP_POOL_ORDER  25      /* 32 MB standard pool */
+#define RAY_HEAP_MAX_ORDER   38      /* 256 GB max pool */
+#define RAY_HEAP_FL_SIZE     (RAY_HEAP_MAX_ORDER + 1)
+#define RAY_MAX_POOLS        512
+
+/* --------------------------------------------------------------------------
+ * Block size helper
+ * -------------------------------------------------------------------------- */
+#define BSIZEOF(o)    ((size_t)1 << (o))
+
+/* --------------------------------------------------------------------------
+ * Pool header: first min-block (64B) of each self-aligned pool.
+ *
+ * Overlaid on bytes 0-15 of the ray_t at pool offset 0.
+ * The ray_t at pool offset 0 has rc=1 (prevents coalescing) and
+ * order=RAY_ORDER_MIN (correct for buddy math).
+ * -------------------------------------------------------------------------- */
+/* Per-pool metadata kept at the very start of the pool.  The allocator casts
+ * a pool's base address directly to this type. */
+typedef struct {
+    uint16_t heap_id;     /* owning heap ID (for cross-thread free) */
+    uint8_t  pool_order;  /* pool's top order */
+    uint8_t  _pad[5];     /* pads the first three fields out to 8 bytes */
+    void*    vm_base;     /* original mmap base (for ray_vm_free on Windows) */
+} ray_pool_hdr_t;
+
+/* The header is overlaid on the 16 nullmap bytes of the ray_t at pool
+ * offset 0, so it must not grow beyond them. */
+_Static_assert(sizeof(ray_pool_hdr_t) <= 16,
+               "ray_pool_hdr_t must fit in ray_t nullmap (16 bytes)");
+
+/* --------------------------------------------------------------------------
+ * Circular sentinel freelist (Rayforce-style)
+ *
+ * Each freelist[order] is a sentinel node with prev/next pointers at
+ * offsets 0/8 — same layout as ray_t.fl_prev/fl_next. This makes
+ * fl_remove() work without knowing which freelist the block belongs to,
+ * enabling safe cross-heap buddy coalescing.
+ *
+ * Empty list: sentinel.prev = sentinel.next = &sentinel.
+ * -------------------------------------------------------------------------- */
+typedef struct RAY_ALIGN(32) {
+    ray_t* fl_prev;   /* offset 0 — same as ray_t.fl_prev; points at the sentinel itself while the list is empty */
+    ray_t* fl_next;   /* offset 8 (with 8-byte pointers) — same as ray_t.fl_next; also self-referential when empty */
+} ray_fl_head_t;
+
+static inline void fl_init(ray_fl_head_t* h) {  /* reset h to an empty circular list */
+    ray_t* self = (ray_t*)h;                     /* the sentinel viewed as a list node */
+    h->fl_next = h->fl_prev = self;
+}
+
+static inline bool fl_empty(const ray_fl_head_t* h) {  /* true when the sentinel links only to itself */
+    return (const ray_t*)h == h->fl_next;
+}
+
+/* Detach blk from whichever circular list it is on by joining its two neighbours; no head pointer is needed, so it works across heaps. */
+static inline void fl_remove(ray_t* blk) {
+    ray_t* before = blk->fl_prev;
+    before->fl_next = blk->fl_next;
+    blk->fl_next->fl_prev = before;
+}
+
+/* --------------------------------------------------------------------------
+ * Pool tracking entry (in ray_heap_t)
+ *
+ * Pools are normally backed by anonymous mmap.  When anon mmap fails (the
+ * OS refuses an N-byte allocation because RAM+swap can't satisfy it), the
+ * allocator falls back to a file-backed mmap pointed at a tempfile in the
+ * heap's swap directory — this lets fresh allocations exceed RAM, with
+ * dirty pages flushed to disk by the kernel.
+ *
+ *   backed=0:  anonymous mmap (the common case).  swap_fd unused,
+ *              swap_path NULL.
+ *   backed=1:  file-backed mmap.  swap_fd holds the open fd and
+ *              swap_path holds the absolute path; teardown closes the
+ *              fd, unlinks the file, and ray_sys_frees the path string.
+ * -------------------------------------------------------------------------- */
+typedef struct {
+    void*    base;         /* pool base address (self-aligned) */
+    char*    swap_path;    /* tempfile path when backed=1; NULL otherwise (ray_sys_alloc'd) */
+    int      swap_fd;      /* fd when backed=1; -1 otherwise */
+    uint8_t  pool_order;   /* pool order for munmap sizing */
+    uint8_t  backed;       /* 0 = anon mmap, 1 = file-backed swap */
+    uint8_t  _pad[2];
+} ray_pool_entry_t;
+
+/* --------------------------------------------------------------------------
+ * Pool derivation helpers
+ *
+ * ray_pool_of: derive pool header from any block pointer.
+ *
+ * All pools are self-aligned (pool base = multiple of pool_size). Standard
+ * pools (32 MB) are derived in O(1) via a single AND mask. Oversized pools
+ * (> 32 MB) use a downward walk at 32 MB stride to find the pool header.
+ *
+ * Pool header validation: order == RAY_ORDER_MIN, mmod == 0, rc == 1.
+ * These conditions uniquely identify pool header blocks — cascade/split
+ * blocks always have order > RAY_ORDER_MIN.
+ * -------------------------------------------------------------------------- */
+
+static inline ray_pool_hdr_t* ray_pool_of(ray_t* v) {
+    /* Round v down to its 32 MB boundary.  For a standard pool that is the pool base.
+     * NOTE(review): inside an oversized pool this may be an interior address whose bytes are not a header — confirm. */
+    uintptr_t stride = BSIZEOF(RAY_HEAP_POOL_ORDER);  /* 32 MB */
+    uintptr_t base = (uintptr_t)v & ~(stride - 1);
+    ray_pool_hdr_t* hdr = (ray_pool_hdr_t*)base;
+
+    /* Fast path: standard pool header at 32 MB boundary (99%+ of calls) */
+    if (RAY_LIKELY(hdr->pool_order == RAY_HEAP_POOL_ORDER))
+        return hdr;
+
+    /* Oversized pool whose header sits at this very boundary and whose span covers v */
+    if (hdr->pool_order > RAY_HEAP_POOL_ORDER &&
+        hdr->pool_order <= RAY_HEAP_MAX_ORDER &&
+        (uintptr_t)v < base + BSIZEOF(hdr->pool_order))
+        return hdr;
+
+    for (;;) {  /* slow path: step down 32 MB at a time until a header-looking block covers v */
+        if (base < stride) break;  /* another step would underflow the address */
+        base -= stride;
+        hdr = (ray_pool_hdr_t*)base;
+        ray_t* hdr_blk = (ray_t*)base;  /* same address viewed as a block, to test the header signature */
+        if (hdr_blk->order == RAY_ORDER_MIN &&
+            hdr_blk->mmod == 0 &&
+            ray_atomic_load(&hdr_blk->rc) == 1) {
+            if (hdr->pool_order >= RAY_HEAP_POOL_ORDER &&
+                hdr->pool_order <= RAY_HEAP_MAX_ORDER &&
+                (uintptr_t)v < base + BSIZEOF(hdr->pool_order))
+                return hdr;
+        }
+    }
+    ray_pool_hdr_t* fallback = (ray_pool_hdr_t*)((uintptr_t)v & ~(stride - 1));  /* walk ran out: retry v's own boundary */
+    if (fallback->pool_order >= RAY_HEAP_POOL_ORDER &&
+        fallback->pool_order <= RAY_HEAP_MAX_ORDER)
+        return fallback;
+    return NULL;  /* no plausible pool header found */
+}
+
+/* --------------------------------------------------------------------------
+ * Buddy derivation: flip bit `order` of v's offset from the self-aligned pool base
+ * -------------------------------------------------------------------------- */
+static inline ray_t* ray_buddy_of(ray_t* v, uint8_t order, uintptr_t pool_base) {
+    uintptr_t off = (uintptr_t)v - pool_base; return (ray_t*)(pool_base + (off ^ BSIZEOF(order)));
+}
+
+/* --------------------------------------------------------------------------
+ * Slab cache for small blocks (orders 6-10, i.e., 64B-1024B)
+ * -------------------------------------------------------------------------- */
+typedef struct {
+    uint32_t count;                        /* presumably the number of cached blocks in stack[] — TODO confirm */
+    ray_t*    stack[RAY_SLAB_CACHE_SIZE];
+} ray_slab_t;
+
+#define RAY_SLAB_MIN       RAY_ORDER_MIN                                 /* lowest order in the slab range */
+#define RAY_SLAB_MAX       (RAY_ORDER_MIN + RAY_SLAB_ORDERS - 1)         /* highest order in the slab range */
+#define IS_SLAB_ORDER(o)  ((o) >= RAY_SLAB_MIN && (o) <= RAY_SLAB_MAX)   /* note: evaluates o twice */
+#define SLAB_INDEX(o)     ((o) - RAY_SLAB_MIN)                           /* 0-based slot; presumably indexes ray_heap_t.slabs[] */
+
+/* --------------------------------------------------------------------------
+ * Per-thread heap
+ * -------------------------------------------------------------------------- */
+typedef struct ray_heap {
+    uint64_t        avail;                       /* bitmask: bit N set = freelist[N] non-empty */
+    uint16_t        id;                          /* heap identity (for cross-thread free) */
+    ray_t*           foreign;                     /* cross-heap freed blocks (lock-free LIFO via fl_next) */
+    ray_slab_t       slabs[RAY_SLAB_ORDERS];       /* small-block slab caches */
+    ray_fl_head_t    freelist[RAY_HEAP_FL_SIZE];   /* circular sentinel per order */
+    ray_mem_stats_t  stats;
+    uint32_t        pool_count;                  /* number of tracked pools */
+    ray_pool_entry_t pools[RAY_MAX_POOLS];         /* pool tracking for destroy/merge */
+    struct ray_heap* pending_next;                /* link for pending-merge LIFO queue */
+    char            swap_path[256];              /* dir for file-backed pool fallback (RAY_HEAP_SWAP env, default "./") */
+} ray_heap_t;
+
+/* --------------------------------------------------------------------------
+ * Bitmap-based heap ID allocator (atomic CAS, reusable IDs)
+ * -------------------------------------------------------------------------- */
+#define RAY_HEAP_ID_WORDS  16   /* 16 * 64 = 1024 IDs (matches registry size) */
+#define RAY_HEAP_ID_BITS   (RAY_HEAP_ID_WORDS * 64)
+
+/* Global pending-merge queue head (lock-free LIFO) */
+extern _Atomic(ray_heap_t*) ray_heap_pending_merge;
+
+/* --------------------------------------------------------------------------
+ * Pool-list scan: find which pool a block belongs to without reading the
+ * remote pool header (avoids cold cache line 32MB away on hot path).
+ * Returns pool index in h->pools[], or -1 if block is foreign.
+ * -------------------------------------------------------------------------- */
+static inline int heap_find_pool(const ray_heap_t* h, const void* ptr) {
+    const uintptr_t target = (uintptr_t)ptr;
+    for (uint32_t idx = 0; idx < h->pool_count; idx++) {
+        const ray_pool_entry_t* pool = &h->pools[idx];
+        const uintptr_t lo = (uintptr_t)pool->base;   /* pool spans [lo, lo + 2^pool_order) */
+        if (target >= lo && target < lo + BSIZEOF(pool->pool_order)) return (int)idx;
+    }
+    return -1;   /* ptr lies in none of this heap's tracked pools */
+}
+
+/* --------------------------------------------------------------------------
+ * Thread-local state
+ * -------------------------------------------------------------------------- */
+extern RAY_TLS ray_heap_t*     ray_tl_heap;
+
+/* --------------------------------------------------------------------------
+ * Global heap registry: look up any heap by ID so foreign blocks can be
+ * returned to their owning heap instead of accumulating on the freeing heap.
+ * -------------------------------------------------------------------------- */
+#define RAY_HEAP_REGISTRY_SIZE 1024
+extern ray_heap_t* ray_heap_registry[RAY_HEAP_REGISTRY_SIZE];
+
+/* --------------------------------------------------------------------------
+ * Scratch arena: bump-allocator backed by buddy-allocated pages.
+ * O(1) push (pointer bump), O(n_backing) reset (free all backing blocks).
+ * -------------------------------------------------------------------------- */
+#define RAY_ARENA_MAX_BACKING  64
+#define RAY_ARENA_BLOCK_ORDER  16   /* 64 KB backing blocks */
+
+typedef struct {
+    ray_t*   backing[RAY_ARENA_MAX_BACKING];  /* backing blocks held by the arena (capacity RAY_ARENA_MAX_BACKING) */
+    int     n_backing;                         /* presumably the number of used entries in backing[]; 0 after init */
+    char*   ptr;                               /* presumably the bump pointer (next free byte); NULL after init */
+    char*   end;                               /* presumably the end of the current backing block; NULL after init */
+} ray_scratch_arena_t;
+
+static inline void ray_scratch_arena_init(ray_scratch_arena_t* a) {
+    /* Empty state: no backing blocks and no bump window; backing[] itself is left untouched. */
+    a->ptr = a->end = NULL;
+    a->n_backing = 0;
+}
+
+/* Retain all child/owned refs inside a compound block (STR/LIST/TABLE/etc.).
+ * Used by ray_block_copy and ray_alloc_copy after shallow-copying a block.
+ *
+ * Returns true on success, false if a deep-clone of a uniquely-owned
+ * resource (e.g. an HNSW index) failed.  On failure, any owned state that
+ * was memcpy'd into the copy has been neutralized (attr flags cleared,
+ * pointers zeroed) so the caller may safely ray_free(v) without leaks or
+ * double-frees. */
+bool ray_retain_owned_refs(ray_t* v);
+
+void* ray_scratch_arena_push(ray_scratch_arena_t* a, size_t nbytes);
+void  ray_scratch_arena_reset(ray_scratch_arena_t* a);
+
+#endif /* RAY_HEAP_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/mem/sys.c b/crates/rayforce-sys/vendor/rayforce/src/mem/sys.c
new file mode 100644
index 0000000..4d0f34d
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/mem/sys.c
@@ -0,0 +1,122 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "sys.h"
+#include "core/platform.h"
+#include <rayforce.h>
+#include <string.h>
+#include <stdatomic.h>
+
+/* 32-byte header prepended to every sys allocation.
+ * mmap returns page-aligned addresses; data at page+32 is 32-byte aligned,
+ * satisfying RAY_BLOCK_ALIGN for the weak ray_alloc stub. */
+#define SYS_HDR_SIZE 32
+
+typedef struct {
+    size_t map_size;   /* total mmap'd bytes (header + user, page-rounded) */
+    size_t usr_size;   /* user-requested bytes (for realloc memcpy) */
+    /* Padding sized so the struct totals SYS_HDR_SIZE on both 32-bit
+     * (WASM, size_t=4 → pad=24) and 64-bit (Linux/macOS, size_t=8 → pad=16). */
+    char   _pad[SYS_HDR_SIZE - 2 * sizeof(size_t)];
+} sys_hdr_t;
+
+_Static_assert(sizeof(sys_hdr_t) == SYS_HDR_SIZE, "sys_hdr_t must be 32 bytes");
+
+static _Atomic(int64_t) g_sys_current = 0;
+static _Atomic(int64_t) g_sys_peak    = 0;
+
+static inline size_t page_round(size_t n) {  /* round n up to a multiple of 4096; wraps when n > SIZE_MAX - 4095 */
+    const size_t mask = (size_t)4095; return (n + mask) & ~mask;
+}
+
+/* Map `size` bytes (0 is bumped to 1) behind a SYS_HDR_SIZE header and return the user pointer; NULL if the size cannot be page-rounded or ray_vm_alloc fails. */
+void* ray_sys_alloc(size_t size) {
+    if (size == 0) size = 1;
+    /* page_round() adds 4095 before masking: keep that headroom so `total` cannot wrap around to 0. */
+    if (size > SIZE_MAX - SYS_HDR_SIZE - 4095) return NULL;
+    size_t total = page_round(SYS_HDR_SIZE + size);
+    void* p = ray_vm_alloc(total);
+    if (!p) return NULL;
+
+    sys_hdr_t* hdr = (sys_hdr_t*)p;
+    hdr->map_size = total;
+    hdr->usr_size = size;
+
+    /* Account the whole mapping, then lift the recorded peak if we passed it. */
+    int64_t cur = atomic_fetch_add_explicit(&g_sys_current, (int64_t)total,
+                                             memory_order_relaxed) + (int64_t)total;
+    int64_t pk = atomic_load_explicit(&g_sys_peak, memory_order_relaxed);
+    while (cur > pk) {  /* a failed CAS refreshes pk, so the condition is re-tested */
+        if (atomic_compare_exchange_weak_explicit(&g_sys_peak, &pk, cur,
+                memory_order_relaxed, memory_order_relaxed)) break;
+    }
+    return (char*)p + SYS_HDR_SIZE;
+}
+
+/* Release a pointer obtained from ray_sys_alloc (NULL is a no-op) and subtract its mapping size from g_sys_current. */
+void ray_sys_free(void* ptr) {
+    if (ptr == NULL) return;
+    char* map_start = (char*)ptr - SYS_HDR_SIZE;                /* the header sits just before the user data */
+    const size_t mapped = ((sys_hdr_t*)map_start)->map_size;    /* read it before the mapping goes away */
+    ray_vm_free(map_start, mapped);
+    atomic_fetch_sub_explicit(&g_sys_current, (int64_t)mapped, memory_order_relaxed);
+}
+
+/* Resize a ray_sys_alloc'd block.  A NULL ptr behaves like ray_sys_alloc(new_size);
+ * new_size == 0 frees ptr and returns NULL (callers should prefer ray_sys_free for
+ * that).  On failure NULL is returned and ptr stays valid and untouched. */
+void* ray_sys_realloc(void* ptr, size_t new_size) {
+    if (!ptr) return ray_sys_alloc(new_size);
+    if (new_size == 0) { ray_sys_free(ptr); return NULL; }
+    /* page_round() adds 4095 before masking: keep that headroom so new_total cannot wrap to 0. */
+    if (new_size > SIZE_MAX - SYS_HDR_SIZE - 4095) return NULL;
+    sys_hdr_t* old_hdr = (sys_hdr_t*)((char*)ptr - SYS_HDR_SIZE);
+    size_t old_usr = old_hdr->usr_size;
+    size_t new_total = page_round(SYS_HDR_SIZE + new_size);
+
+    /* Same page count — just update user size */
+    if (new_total == old_hdr->map_size) {
+        old_hdr->usr_size = new_size;
+        return ptr;
+    }
+
+    void* new_ptr = ray_sys_alloc(new_size);  /* page count changed: move to a fresh mapping */
+    if (!new_ptr) return NULL;
+    memcpy(new_ptr, ptr, old_usr < new_size ? old_usr : new_size);  /* copy only the surviving prefix */
+    ray_sys_free(ptr);
+    return new_ptr;
+}
+
+/* Copy NUL-terminated s into a fresh ray_sys_alloc block; a NULL input or a failed allocation yields NULL. */
+char* ray_sys_strdup(const char* s) {
+    if (s == NULL) return NULL;
+    const size_t nbytes = strlen(s) + 1;          /* include the terminator */
+    char* copy = (char*)ray_sys_alloc(nbytes);
+    if (copy != NULL) memcpy(copy, s, nbytes);
+    return copy;
+}
+
+void ray_sys_get_stat(int64_t* out_current, int64_t* out_peak) {  /* two independent relaxed reads; both pointers must be non-NULL */
+    const int64_t now = atomic_load_explicit(&g_sys_current, memory_order_relaxed);
+    *out_current = now; *out_peak = atomic_load_explicit(&g_sys_peak, memory_order_relaxed);
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/mem/sys.h b/crates/rayforce-sys/vendor/rayforce/src/mem/sys.h
new file mode 100644
index 0000000..eb53154
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/mem/sys.h
@@ -0,0 +1,49 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_MEM_SYS_H
+#define RAY_MEM_SYS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* --------------------------------------------------------------------------
+ * System-level mmap allocator for infrastructure that can't use the buddy
+ * allocator (cross-thread lifetime, bootstrap, global state).
+ *
+ * Every allocation is tracked. ray_mem_stats() reports the totals so users
+ * can see the full memory footprint.
+ *
+ * Each allocation prepends a 32-byte header (stores mmap size + user size),
+ * so ray_sys_free() needs no size argument.
+ * -------------------------------------------------------------------------- */
+
+void* ray_sys_alloc(size_t size);
+void* ray_sys_realloc(void* ptr, size_t new_size);
+void  ray_sys_free(void* ptr);
+char* ray_sys_strdup(const char* s);
+
+/* Read current sys allocator counters (called by ray_mem_stats in arena.c) */
+void  ray_sys_get_stat(int64_t* out_current, int64_t* out_peak);
+
+#endif /* RAY_MEM_SYS_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/agg.c b/crates/rayforce-sys/vendor/rayforce/src/ops/agg.c
new file mode 100644
index 0000000..c8b5fa3
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/agg.c
@@ -0,0 +1,509 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "lang/internal.h"
+#include "ops/ops.h"
+#include "mem/heap.h"
+
+#include <stdlib.h>  /* qsort (introselect fallback) */
+
+static int dbl_cmp(const void* a, const void* b) {  /* qsort comparator over doubles: -1 / 0 / +1; NaN compares equal to everything */
+    const double lhs = *(const double*)a, rhs = *(const double*)b;
+    return lhs < rhs ? -1 : (lhs > rhs ? 1 : 0);
+}
+
+/* Partition a[lo..hi] (inclusive) so that a[k] holds the value it would have
+ * if the slice were sorted, with everything to the left ≤ and everything to
+ * the right ≥; k is an absolute index with lo ≤ k ≤ hi.  Hoare quickselect
+ * with median-of-three; once 2*bit_length(range) partition rounds are used
+ * up, the remaining slice is handed to qsort.  Mirrors std::nth_element's
+ * contract; DuckDB's quantile path uses the same pattern (extension/core_functions/
+ * aggregate/holistic/quantile.cpp, quantile_sort_tree.hpp:191-195). */
+static void nth_element_dbl(double* a, int64_t lo, int64_t hi, int64_t k) {
+    int depth_limit = 0;
+    for (int64_t r = hi - lo + 1; r > 0; r >>= 1) depth_limit++;  /* bit length of the element count */
+    depth_limit *= 2;
+    while (hi - lo > 16) {  /* slices of 17 or fewer elements are finished by the insertion sort below */
+        if (depth_limit-- <= 0) {
+            qsort(a + lo, (size_t)(hi - lo + 1), sizeof(double), dbl_cmp);  /* budget spent: fully sort what is left */
+            return;
+        }
+        int64_t mid = lo + ((hi - lo) >> 1);
+        if (a[lo] > a[mid]) { double t = a[lo]; a[lo] = a[mid]; a[mid] = t; }  /* order a[lo] <= a[mid] <= a[hi] */
+        if (a[lo] > a[hi])  { double t = a[lo]; a[lo] = a[hi];  a[hi]  = t; }
+        if (a[mid] > a[hi]) { double t = a[mid]; a[mid] = a[hi]; a[hi] = t; }
+        /* Park pivot at hi-1; partition (lo, hi-1) with sentinels at both ends. */
+        { double t = a[mid]; a[mid] = a[hi - 1]; a[hi - 1] = t; }
+        double pivot = a[hi - 1];
+        int64_t i = lo, j = hi - 1;
+        for (;;) {
+            while (a[++i] < pivot) {}  /* stops at the parked pivot at the latest */
+            while (a[--j] > pivot) {}  /* stops at a[lo], which is <= pivot, at the latest */
+            if (i >= j) break;
+            double t = a[i]; a[i] = a[j]; a[j] = t;
+        }
+        /* Restore pivot to its final resting position i. */
+        { double t = a[i]; a[i] = a[hi - 1]; a[hi - 1] = t; }
+        if      (k < i) hi = i - 1;  /* target lies left of the pivot */
+        else if (k > i) lo = i + 1;  /* target lies right of the pivot */
+        else            return;      /* the pivot landed exactly on k */
+    }
+    /* Small range: insertion-sort a[lo..hi], which leaves a[k] in its sorted position. */
+    for (int64_t i = lo + 1; i <= hi; i++) {
+        double key = a[i];
+        int64_t j = i - 1;
+        while (j >= lo && a[j] > key) { a[j + 1] = a[j]; j--; }
+        a[j + 1] = key;
+    }
+}
+
+/* ══════════════════════════════════════════
+ * Aggregation builtins
+ * ══════════════════════════════════════════ */
+
+/* Build a one-op DAG over input vector x using op constructor ctor, materialize it, and RETURN the result from the enclosing function (an "oom" error if the graph cannot be created). */
+#define AGG_VEC_VIA_DAG(x, ctor) do {                       \
+    ray_graph_t* g = ray_graph_new(NULL);                   \
+    if (!g) return ray_error("oom", NULL);                  \
+    ray_op_t* in = ray_graph_input_vec(g, x);              \
+    ray_op_t* op = ctor(g, in);                            \
+    return ray_lazy_materialize(ray_lazy_wrap(g, op));      \
+} while(0)
+
+/* DAG executor returns I64 for all integer types — cast back to original.  r is released only
+ * when a replacement atom is built; for any other orig_type it is returned untouched. */
+static ray_t* recast_i64_to_orig(ray_t* r, int8_t orig_type) {
+    if (!r || RAY_IS_ERR(r)) return r;
+    bool narrow = orig_type == RAY_DATE || orig_type == RAY_TIME || orig_type == RAY_TIMESTAMP ||
+                  orig_type == RAY_I32  || orig_type == RAY_I16  || orig_type == RAY_U8;
+    if (!narrow || !ray_is_atom(r) || r->type != -RAY_I64) return r;
+    int64_t v = r->i64;
+    ray_release(r);  /* every path below hands back a fresh atom, never r */
+    if (orig_type == RAY_DATE)      return ray_date((int32_t)v);
+    if (orig_type == RAY_TIME)      return ray_time(v);
+    if (orig_type == RAY_TIMESTAMP) return ray_timestamp(v);
+    if (orig_type == RAY_I32)       return make_i32((int32_t)v);
+    return orig_type == RAY_I16 ? make_i16((int16_t)v) : make_u8((uint8_t)v);
+}
+
+ray_t* ray_sum_fn(ray_t* x) {  /* sum: lazy input gets OP_SUM appended; atoms, vectors and lists of numeric atoms are handled below */
+    if (ray_is_lazy(x)) return ray_lazy_append(x, OP_SUM);
+    if (ray_is_atom(x)) {
+        /* u8/i16 scalar sum promotes to i64; every other atom is returned as-is (retained) */
+        if (x->type == -RAY_U8)  return make_i64((int64_t)x->u8);
+        if (x->type == -RAY_I16) return make_i64((int64_t)x->i16);
+        ray_retain(x); return x;
+    }
+    if (ray_is_vec(x)) {
+        if (x->type == RAY_DATE) return ray_error("type", NULL);  /* summing a DATE vector is rejected */
+        /* Narrow/temporal types need specific return constructors that the
+         * DAG executor doesn't provide — use scalar path for these. */
+        if (x->type == RAY_I32 || x->type == RAY_I16 || x->type == RAY_U8 ||
+            x->type == RAY_TIME || x->type == RAY_TIMESTAMP) {
+            int64_t n = x->len;
+            bool has_nulls = (x->attrs & RAY_ATTR_HAS_NULLS) != 0;  /* per-element null test only when flagged */
+            int64_t sum = 0;
+            if (x->type == RAY_I32) {
+                int32_t* d = (int32_t*)ray_data(x);
+                if (has_nulls) { for (int64_t i = 0; i < n; i++) if (!ray_vec_is_null(x, i)) sum += d[i]; }
+                else { for (int64_t i = 0; i < n; i++) sum += d[i]; }
+                return make_i64(sum);
+            } else if (x->type == RAY_I16) {
+                int16_t* d = (int16_t*)ray_data(x);
+                if (has_nulls) { for (int64_t i = 0; i < n; i++) if (!ray_vec_is_null(x, i)) sum += d[i]; }
+                else { for (int64_t i = 0; i < n; i++) sum += d[i]; }
+                return make_i64(sum);
+            } else if (x->type == RAY_U8) {
+                uint8_t* d = (uint8_t*)ray_data(x);
+                if (has_nulls) { for (int64_t i = 0; i < n; i++) if (!ray_vec_is_null(x, i)) sum += d[i]; }
+                else { for (int64_t i = 0; i < n; i++) sum += d[i]; }
+                return make_i64(sum);
+            } else if (x->type == RAY_TIME) {  /* TIME data is read as 32-bit values; the result stays a TIME atom */
+                int32_t* d = (int32_t*)ray_data(x);
+                if (has_nulls) { for (int64_t i = 0; i < n; i++) if (!ray_vec_is_null(x, i)) sum += d[i]; }
+                else { for (int64_t i = 0; i < n; i++) sum += d[i]; }
+                return ray_time(sum);
+            } else {  /* RAY_TIMESTAMP: 64-bit values; the result stays a TIMESTAMP atom */
+                int64_t* d = (int64_t*)ray_data(x);
+                if (has_nulls) { for (int64_t i = 0; i < n; i++) if (!ray_vec_is_null(x, i)) sum += d[i]; }
+                else { for (int64_t i = 0; i < n; i++) sum += d[i]; }
+                return ray_timestamp(sum);
+            }
+        }
+        /* I64/F64: parallel morsel-driven reduction via DAG executor */
+        AGG_VEC_VIA_DAG(x, ray_sum);
+    }
+    if (!is_list(x)) return ray_error("type", NULL);
+    int64_t len = ray_len(x);
+    if (len == 0) return make_i64(0);  /* empty list sums to i64 0 */
+    ray_t** elems = (ray_t**)ray_data(x);
+    int has_float = 0;
+    double fsum = 0.0;   /* running total of every element as a double */
+    int64_t isum = 0;    /* running total of the integer elements only */
+    for (int64_t i = 0; i < len; i++) {
+        if (!is_numeric(elems[i])) return ray_error("type", NULL);
+        if (RAY_ATOM_IS_NULL(elems[i])) {
+            if (elems[i]->type == -RAY_F64) has_float = 1;  /* a null F64 still makes the result F64 */
+            continue;  /* nulls add nothing to the sum */
+        }
+        if (elems[i]->type == -RAY_F64) { has_float = 1; fsum += elems[i]->f64; }
+        else if (elems[i]->type == -RAY_I64) { isum += elems[i]->i64; fsum += (double)elems[i]->i64; }
+        else { int64_t v = (int64_t)as_f64(elems[i]); isum += v; fsum += (double)v; }  /* other numeric atoms go through as_f64 */
+    }
+    return has_float ? make_f64(fsum) : make_i64(isum);
+}
+
+/* count: number of elements in x.  Lazy input gets OP_COUNT appended rather
+ * than being evaluated.  Anything that is not a table, dict, atom, vector
+ * or list yields a "type" error. */
+ray_t* ray_count_fn(ray_t* x) {
+    if (ray_is_lazy(x)) return ray_lazy_append(x, OP_COUNT);
+    int64_t n;
+    if      (x->type == RAY_TABLE)                    n = ray_table_nrows(x);
+    else if (x->type == RAY_DICT)                     n = ray_dict_len(x);
+    else if (ray_is_atom(x) && (-x->type) == RAY_STR) n = (int64_t)ray_str_len(x);  /* string length */
+    else if (ray_is_vec(x))                           n = x->len;                   /* nulls included */
+    else if (is_list(x))                              n = ray_len(x);
+    else if (ray_is_atom(x))                          n = 1;                        /* any other scalar */
+    else return ray_error("type", NULL);
+
+    return make_i64(n);
+}
+
+/* avg: arithmetic mean as an F64 atom.  Lazy input gets OP_AVG appended;
+ * vectors are reduced by the DAG executor; lists are averaged here. */
+ray_t* ray_avg_fn(ray_t* x) {
+    if (ray_is_lazy(x)) return ray_lazy_append(x, OP_AVG);
+    if (ray_is_atom(x)) {
+        if (RAY_ATOM_IS_NULL(x)) return ray_typed_null(-RAY_F64);
+        if (!is_numeric(x)) { ray_retain(x); return x; }   /* non-numeric atom comes back unchanged */
+        return make_f64(as_f64(x));
+    }
+    if (ray_is_vec(x)) AGG_VEC_VIA_DAG(x, ray_avg);
+    if (!is_list(x)) return ray_error("type", NULL);
+    const int64_t total = ray_len(x);
+    if (total == 0) return ray_error("domain", NULL);
+    ray_t** items = (ray_t**)ray_data(x);
+    double acc = 0.0; int64_t used = 0;                     /* sum and count of the non-null elements */
+    for (int64_t pos = 0; pos < total; pos++) {
+        ray_t* item = items[pos];
+        if (!is_numeric(item)) return ray_error("type", NULL);
+        if (!RAY_ATOM_IS_NULL(item)) { acc += as_f64(item); used++; }
+    }
+    return used == 0 ? ray_typed_null(-RAY_F64) : make_f64(acc / (double)used);
+}
+
+/* min: lazy → OP_MIN appended; atom → itself, retained; vector → DAG reduction cast back to the element type; list → numeric atoms reduced here, nulls skipped. */
+ray_t* ray_min_fn(ray_t* x) {
+    if (ray_is_lazy(x)) return ray_lazy_append(x, OP_MIN);
+    if (ray_is_atom(x)) { ray_retain(x); return x; }
+    if (ray_is_vec(x)) {
+        int8_t orig_type = x->type;
+        ray_graph_t* g = ray_graph_new(NULL);
+        if (!g) return ray_error("oom", NULL);
+        ray_op_t* op = ray_min_op(g, ray_graph_input_vec(g, x));
+        ray_t* r = ray_lazy_materialize(ray_lazy_wrap(g, op));
+        return recast_i64_to_orig(r, orig_type);
+    }
+    if (!is_list(x)) return ray_error("type", NULL);
+    int64_t len = ray_len(x);
+    if (len == 0) return ray_error("domain", NULL);
+    ray_t** elems = (ray_t**)ray_data(x);
+    int has_float = 0, found = 0; double fmin = 0; int64_t imin = 0;
+    for (int64_t i = 0; i < len; i++) {
+        if (!is_numeric(elems[i])) return ray_error("type", NULL);
+        if (elems[i]->type == -RAY_F64) has_float = 1;
+        if (RAY_ATOM_IS_NULL(elems[i])) continue;
+        double v = as_f64(elems[i]);
+        /* imin is returned only when no F64 was seen.  Narrow integer atoms take their value from v (it used to be forced to 0); F64 is never cast, since converting NaN/inf to an integer is undefined. */
+        if (!found || v < fmin) { fmin = v; imin = elems[i]->type == -RAY_I64 ? elems[i]->i64 : elems[i]->type == -RAY_F64 ? 0 : (int64_t)v; found = 1; }
+    }
+    if (!found) return ray_typed_null(has_float ? -RAY_F64 : -RAY_I64);
+    return has_float ? make_f64(fmin) : make_i64(imin);
+}
+
+/* max: lazy → OP_MAX appended; atom → itself, retained; vector → DAG reduction cast back to the element type; list → numeric atoms reduced here, nulls skipped. */
+ray_t* ray_max_fn(ray_t* x) {
+    if (ray_is_lazy(x)) return ray_lazy_append(x, OP_MAX);
+    if (ray_is_atom(x)) { ray_retain(x); return x; }
+    if (ray_is_vec(x)) {
+        int8_t orig_type = x->type;
+        ray_graph_t* g = ray_graph_new(NULL);
+        if (!g) return ray_error("oom", NULL);
+        ray_op_t* op = ray_max_op(g, ray_graph_input_vec(g, x));
+        ray_t* r = ray_lazy_materialize(ray_lazy_wrap(g, op));
+        return recast_i64_to_orig(r, orig_type);
+    }
+    if (!is_list(x)) return ray_error("type", NULL);
+    int64_t len = ray_len(x);
+    if (len == 0) return ray_error("domain", NULL);
+    ray_t** elems = (ray_t**)ray_data(x);
+    int has_float = 0, found = 0; double fmax = 0; int64_t imax = 0;
+    for (int64_t i = 0; i < len; i++) {
+        if (!is_numeric(elems[i])) return ray_error("type", NULL);
+        if (elems[i]->type == -RAY_F64) has_float = 1;
+        if (RAY_ATOM_IS_NULL(elems[i])) continue;
+        double v = as_f64(elems[i]);
+        /* imax is returned only when no F64 was seen.  Narrow integer atoms take their value from v (it used to be forced to 0); F64 is never cast, since converting NaN/inf to an integer is undefined. */
+        if (!found || v > fmax) { fmax = v; imax = elems[i]->type == -RAY_I64 ? elems[i]->i64 : elems[i]->type == -RAY_F64 ? 0 : (int64_t)v; found = 1; }
+    }
+    if (!found) return ray_typed_null(has_float ? -RAY_F64 : -RAY_I64);
+    return has_float ? make_f64(fmax) : make_i64(imax);
+}
+
+/* first: leading element of x.  Lazy → OP_FIRST appended; string atom → its
+ * first byte as a new string ("domain" error when empty); any other atom is
+ * returned retained; table → row 0 via ray_at_fn; list → element 0, retained. */
+ray_t* ray_first_fn(ray_t* x) {
+    if (ray_is_lazy(x)) return ray_lazy_append(x, OP_FIRST);
+    if (ray_is_atom(x)) {
+        if ((-x->type) != RAY_STR) { ray_retain(x); return x; }
+        if (ray_str_len(x) == 0) return ray_error("domain", NULL);
+        return ray_str(ray_str_ptr(x), 1);
+    }
+    if (x->type == RAY_TABLE) {
+        if (ray_table_nrows(x) == 0) return ray_error("domain", NULL);
+        ray_t* zero = make_i64(0);
+        ray_t* row = ray_at_fn(x, zero);
+        ray_release(zero);
+        return row;
+    }
+    if (ray_is_vec(x)) {
+        if (ray_len(x) == 0) return ray_typed_null(-x->type);
+        /* These element types are fetched with collection_elem so the result
+         * keeps its type; the DAG path would widen DATE/TIME/TIMESTAMP/BOOL/U8
+         * to i64.  Every other vector type is reduced by the DAG. */
+        const bool direct =
+            x->type == RAY_SYM   || x->type == RAY_I32  || x->type == RAY_I16 ||
+            x->type == RAY_GUID  || x->type == RAY_STR  || x->type == RAY_BOOL ||
+            x->type == RAY_U8    || x->type == RAY_DATE || x->type == RAY_TIME ||
+            x->type == RAY_TIMESTAMP;
+        if (!direct) AGG_VEC_VIA_DAG(x, ray_first);
+        int alloc = 0;
+        return collection_elem(x, 0, &alloc);
+    }
+    if (!is_list(x)) return ray_error("type", NULL);
+    if (ray_len(x) == 0) return ray_typed_null(-RAY_I64);
+    ray_t** items = (ray_t**)ray_data(x);
+    ray_t* head = items[0];
+    ray_retain(head);
+    return head;
+}
+
+/* last: trailing element of x.  Lazy → OP_LAST appended; string atom → its
+ * final byte as a new string ("domain" error when empty); any other atom is
+ * returned retained; table → last row via ray_at_fn; list → last element, retained. */
+ray_t* ray_last_fn(ray_t* x) {
+    if (ray_is_lazy(x)) return ray_lazy_append(x, OP_LAST);
+    if (ray_is_atom(x)) {
+        if ((-x->type) != RAY_STR) { ray_retain(x); return x; }
+        size_t n = ray_str_len(x);
+        if (n == 0) return ray_error("domain", NULL);
+        return ray_str(ray_str_ptr(x) + n - 1, 1);
+    }
+    if (x->type == RAY_TABLE) {
+        int64_t rows = ray_table_nrows(x);
+        if (rows == 0) return ray_error("domain", NULL);
+        ray_t* last_idx = make_i64(rows - 1);
+        ray_t* row = ray_at_fn(x, last_idx);
+        ray_release(last_idx);
+        return row;
+    }
+    if (ray_is_vec(x)) {
+        if (ray_len(x) == 0) return ray_typed_null(-x->type);
+        /* Same type list as ray_first_fn: these are fetched with collection_elem so the element type is kept. */
+        const bool direct =
+            x->type == RAY_SYM   || x->type == RAY_I32  || x->type == RAY_I16 ||
+            x->type == RAY_GUID  || x->type == RAY_STR  || x->type == RAY_BOOL ||
+            x->type == RAY_U8    || x->type == RAY_DATE || x->type == RAY_TIME ||
+            x->type == RAY_TIMESTAMP;
+        if (!direct) AGG_VEC_VIA_DAG(x, ray_last);
+        int alloc = 0;
+        return collection_elem(x, ray_len(x) - 1, &alloc);
+    }
+    if (!is_list(x)) return ray_error("type", NULL);
+    int64_t count = ray_len(x);
+    if (count == 0) return ray_typed_null(-RAY_I64);
+    ray_t* tail = ((ray_t**)ray_data(x))[count - 1];
+    ray_retain(tail);
+    return tail;
+}
+
+/* Helper: copy non-null vec elements to double scratch buffer, compacted.
+ * scratch->len is set to the number of non-null values copied.
+ * Returns scratch ray_t* (caller must ray_release), or error. */
+static ray_t* vec_to_f64_scratch(ray_t* x, double** out_vals) {
+    int64_t len = ray_len(x);
+    ray_t* scratch = ray_alloc(len * sizeof(double));
+    if (!scratch) return ray_error("oom", NULL);
+    scratch->type = RAY_F64;
+    double* vals = (double*)ray_data(scratch);
+    int64_t cnt = 0;
+    if (x->type == RAY_I64) {
+        int64_t* d = (int64_t*)ray_data(x);
+        for (int64_t i = 0; i < len; i++) { if (!ray_vec_is_null(x, i)) vals[cnt++] = (double)d[i]; }
+    } else if (x->type == RAY_F64) {
+        double* d = (double*)ray_data(x);
+        for (int64_t i = 0; i < len; i++) { if (!ray_vec_is_null(x, i)) vals[cnt++] = d[i]; }
+    } else if (x->type == RAY_I32) {
+        int32_t* d = (int32_t*)ray_data(x);
+        for (int64_t i = 0; i < len; i++) { if (!ray_vec_is_null(x, i)) vals[cnt++] = (double)d[i]; }
+    } else if (x->type == RAY_I16) {
+        int16_t* d = (int16_t*)ray_data(x);
+        for (int64_t i = 0; i < len; i++) { if (!ray_vec_is_null(x, i)) vals[cnt++] = (double)d[i]; }
+    } else if (x->type == RAY_U8) {
+        uint8_t* d = (uint8_t*)ray_data(x);
+        for (int64_t i = 0; i < len; i++) { if (!ray_vec_is_null(x, i)) vals[cnt++] = (double)d[i]; }
+    } else {
+        ray_release(scratch);
+        return ray_error("type", NULL);
+    }
+    scratch->len = cnt;
+    *out_vals = vals;
+    return scratch;
+}
+
+ray_t* ray_med_fn(ray_t* x) { /* med: median of the non-null values of x, returned as an f64 atom */
+    if (ray_is_lazy(x)) x = ray_lazy_materialize(x); /* NOTE(review): the materialized value is never released here — confirm ownership */
+    if (RAY_IS_ERR(x)) return x;
+    /* Scalar: null → f64 null, numeric → the value itself as f64, anything else → type error */
+    if (ray_is_atom(x)) {
+        if (RAY_ATOM_IS_NULL(x)) return ray_typed_null(-RAY_F64);
+        if (is_numeric(x)) return make_f64(as_f64(x));
+        return ray_error("type", NULL);
+    }
+    int64_t len;
+    ray_t* scratch = NULL;
+    double* vals = NULL;
+    /* Both branches below leave the non-null values packed in vals[0 .. scratch->len). */
+    if (ray_is_vec(x)) {
+        len = ray_len(x);
+        if (len == 0) return ray_typed_null(-RAY_F64); /* empty input → f64 null */
+        scratch = vec_to_f64_scratch(x, &vals);
+        if (RAY_IS_ERR(scratch)) return scratch; /* oom or unsupported element type */
+    } else if (is_list(x)) {
+        len = ray_len(x);
+        if (len == 0) return ray_typed_null(-RAY_F64);
+        ray_t** elems = (ray_t**)ray_data(x);
+        scratch = ray_alloc(len * sizeof(double)); /* room for every element, nulls included */
+        if (!scratch) return ray_error("oom", NULL);
+        scratch->type = RAY_F64;
+        scratch->len = 0;
+        vals = (double*)ray_data(scratch);
+        int64_t cnt_l = 0;
+        for (int64_t i = 0; i < len; i++) {
+            if (ray_is_atom(elems[i]) && RAY_ATOM_IS_NULL(elems[i])) continue; /* null atoms are skipped */
+            if (!is_numeric(elems[i])) { ray_release(scratch); return ray_error("type", NULL); } /* one bad element fails the whole call */
+            vals[cnt_l++] = as_f64(elems[i]);
+        }
+        scratch->len = cnt_l;
+    } else {
+        return ray_error("type", NULL);
+    }
+
+    /* scratch->len holds the count of non-null values (already compacted) */
+    int64_t cnt = scratch->len;
+    if (cnt == 0) { ray_release(scratch); return ray_typed_null(-RAY_F64); } /* every value was null */
+
+    /* O(n) average partial-sort.  Two-call pattern from DuckDB's
+     * QuantileInterpolator::Operation (quantile_sort_tree.hpp:191-195):
+     * for odd n one nth_element places the middle; for even n a second
+     * nth_element on the right half locates the upper middle.  Replaces
+     * an O(n^2) insertion sort that hung for groups larger than ~10k. */
+    int64_t k = cnt / 2;
+    double median;
+    if (cnt % 2 == 1) {
+        nth_element_dbl(vals, 0, cnt - 1, k);
+        median = vals[k];
+    } else {
+        nth_element_dbl(vals, 0, cnt - 1, k - 1); /* lower middle ends up at k-1 */
+        nth_element_dbl(vals, k, cnt - 1, k);     /* smallest of the right part ends up at k */
+        median = (vals[k - 1] + vals[k]) / 2.0;   /* even count: mean of the two middle values */
+    }
+    ray_release(scratch);
+    return make_f64(median);
+}
+
+static ray_t* var_stddev_core(ray_t* x, int sample, int take_sqrt);
+
+/* dev: population standard deviation — same arguments as ray_stddev_pop_fn. */
+ray_t* ray_dev_fn(ray_t* x) { return var_stddev_core(x, 0, 1); }
+
+/* Shared core for variance / stddev in sample or population mode.
+ * sample: nonzero divides the sum of squared deviations by (n-1), zero by n.
+ * take_sqrt: nonzero yields the standard deviation, zero the variance. */
+static ray_t* var_stddev_core(ray_t* x, int sample, int take_sqrt) {
+    if (ray_is_lazy(x)) x = ray_lazy_materialize(x);
+    if (RAY_IS_ERR(x)) return x;
+    if (ray_is_atom(x)) {
+        /* One observation: population spread is 0, sample spread is null. */
+        if (RAY_ATOM_IS_NULL(x)) return ray_typed_null(-RAY_F64);
+        if (!is_numeric(x)) return ray_error("type", NULL);
+        return sample ? ray_typed_null(-RAY_F64) : make_f64(0.0);
+    }
+    if (!ray_is_vec(x) && !is_list(x)) return ray_error("type", NULL);
+    if (ray_len(x) == 0) return ray_typed_null(-RAY_F64);
+    double* samples = NULL;
+    ray_t*  buf = NULL;
+    int64_t n = 0;
+    if (ray_is_vec(x)) {
+        buf = vec_to_f64_scratch(x, &samples);
+        if (RAY_IS_ERR(buf)) return buf;
+        n = buf->len;
+    } else {
+        /* List: pack the non-null numeric atoms into a fresh f64 vec. */
+        int64_t total = ray_len(x);
+        ray_t** items = (ray_t**)ray_data(x);
+        buf = ray_vec_new(RAY_F64, total);
+        if (RAY_IS_ERR(buf)) return buf;
+        samples = (double*)ray_data(buf);
+        for (int64_t i = 0; i < total; i++) {
+            ray_t* item = items[i];
+            if (!is_numeric(item)) { ray_release(buf); return ray_error("type", NULL); }
+            if (!RAY_ATOM_IS_NULL(item)) samples[n++] = as_f64(item);
+        }
+        buf->len = n;
+    }
+    /* Nothing to measure, or a single value in sample mode: result is null. */
+    if (n == 0 || (sample && n <= 1)) {
+        ray_release(buf);
+        return ray_typed_null(-RAY_F64);
+    }
+    /* Two passes: the mean first, then the squared deviations from it. */
+    double total_sum = 0.0;
+    for (int64_t i = 0; i < n; i++) total_sum += samples[i];
+    const double mean = total_sum / (double)n;
+    double ss = 0.0;
+    for (int64_t i = 0; i < n; i++) {
+        const double dev = samples[i] - mean;
+        ss += dev * dev;
+    }
+    ray_release(buf);
+    const double variance = ss / (sample ? (double)(n - 1) : (double)n);
+    return make_f64(take_sqrt ? sqrt(variance) : variance);
+}
+
+ray_t* ray_stddev_fn(ray_t* x)     { return var_stddev_core(x, 1, 1); } /* sample stddev: sqrt(SS / (n-1)) */
+ray_t* ray_stddev_pop_fn(ray_t* x) { return var_stddev_core(x, 0, 1); } /* population stddev: sqrt(SS / n) */
+ray_t* ray_var_fn(ray_t* x)        { return var_stddev_core(x, 1, 0); } /* sample variance: SS / (n-1) */
+ray_t* ray_var_pop_fn(ray_t* x)    { return var_stddev_core(x, 0, 0); } /* population variance: SS / n */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/arith.c b/crates/rayforce-sys/vendor/rayforce/src/ops/arith.c
new file mode 100644
index 0000000..72d92ba
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/arith.c
@@ -0,0 +1,422 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "lang/internal.h"
+
+/* Arithmetic builtins (atom-only).
+ * Vector dispatch goes through the DAG executor. */
+
+ray_t* ray_add_fn(ray_t* a, ray_t* b) {
+    /* add: atom + atom.  Temporal combinations are tried first, in order; plain numeric addition is the final fallback. */
+
+    /* Temporal + integer arithmetic (only int types, not float) */
+    if (is_temporal(a) && is_numeric(b) && b->type != -RAY_F64) {
+        if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b))
+            return ray_typed_null(a->type);
+
+        int64_t v = as_i64(b);
+        if (a->type == -RAY_DATE)      return ray_date(a->i64 + v);
+        if (a->type == -RAY_TIME)      return ray_time(a->i64 + v);
+        if (a->type == -RAY_TIMESTAMP) return ray_timestamp(a->i64 + v);
+    }
+    if (is_numeric(a) && a->type != -RAY_F64 && is_temporal(b)) { /* mirrored operand order; result takes b's type */
+        if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b))
+            return ray_typed_null(b->type);
+
+        int64_t v = as_i64(a);
+        if (b->type == -RAY_DATE)      return ray_date(b->i64 + v);
+        if (b->type == -RAY_TIME)      return ray_time(b->i64 + v);
+        if (b->type == -RAY_TIMESTAMP) return ray_timestamp(b->i64 + v);
+    }
+    /* Reject float + temporal */
+    if ((a->type == -RAY_F64 && is_temporal(b)) || (is_temporal(a) && b->type == -RAY_F64))
+        return ray_error("type", NULL);
+    /* Reject null_numeric + temporal (for null floats etc).  NOTE(review): looks unreachable after the branches above unless is_numeric admits a float type other than F64 — confirm */
+    if (is_numeric(a) && RAY_ATOM_IS_NULL(a) && is_temporal(b))
+        return ray_error("type", NULL);
+    if (is_temporal(a) && is_numeric(b) && RAY_ATOM_IS_NULL(b))
+        return ray_error("type", NULL);
+    /* DATE + TIME → TIMESTAMP (scale factors suggest DATE in days, TIME in ms, TIMESTAMP in ns — presumably; confirm) */
+    if (a->type == -RAY_DATE && b->type == -RAY_TIME) {
+        if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b)) return ray_typed_null(-RAY_TIMESTAMP);
+        return ray_timestamp(a->i64 * 86400000000000LL + b->i64 * 1000000LL);
+    }
+    if (a->type == -RAY_TIME && b->type == -RAY_DATE) {
+        if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b)) return ray_typed_null(-RAY_TIMESTAMP);
+        return ray_timestamp(b->i64 * 86400000000000LL + a->i64 * 1000000LL);
+    }
+    /* TIME + TIME → TIME */
+    if (a->type == -RAY_TIME && b->type == -RAY_TIME) {
+        if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b)) return ray_typed_null(-RAY_TIME);
+        return ray_time(a->i64 + b->i64);
+    }
+    /* TIME + TIMESTAMP → TIMESTAMP (add ms as ns) */
+    if (a->type == -RAY_TIME && b->type == -RAY_TIMESTAMP) {
+        if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b)) return ray_typed_null(-RAY_TIMESTAMP);
+        return ray_timestamp(b->i64 + a->i64 * 1000000LL);
+    }
+    if (a->type == -RAY_TIMESTAMP && b->type == -RAY_TIME) {
+        if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b)) return ray_typed_null(-RAY_TIMESTAMP);
+        return ray_timestamp(a->i64 + b->i64 * 1000000LL);
+    }
+
+    if (!is_numeric(a) || !is_numeric(b)) /* remaining temporal pairs (e.g. DATE + DATE) end up here as a type error */
+        return ray_error("type", "cannot add %s and %s",
+                         ray_type_name(a->type), ray_type_name(b->type));
+    /* Null propagation */
+    if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b)) return null_for_promoted(a, b);
+    if (is_float_op(a, b)) return make_f64(as_f64(a) + as_f64(b));
+    int8_t rt = promote_int_type(a, b); /* integer result type chosen from both operands */
+    return make_typed_int(rt, as_i64(a) + as_i64(b));
+}
+
+ray_t* ray_sub_fn(ray_t* a, ray_t* b) {
+    /* sub: atom - atom.  Temporal cases are matched first, in order; plain numeric subtraction is the fallback. */
+    /* Temporal - int null propagation (both operands) */
+    if (is_temporal(a) && is_numeric(b)) {
+        if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b))
+            return ray_typed_null(a->type);
+    }
+    if (is_numeric(a) && is_temporal(b)) {
+        if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b))
+            return ray_typed_null(b->type);
+    }
+    /* DATE - int → DATE.  NOTE(review): no F64 exclusion here, so a float b is accepted via as_i64, while ray_add_fn rejects float + temporal — confirm */
+    if (a->type == -RAY_DATE && is_numeric(b)) {
+        return ray_date(a->i64 - as_i64(b));
+    }
+    /* DATE - DATE → i32 (days difference) */
+    if (a->type == -RAY_DATE && b->type == -RAY_DATE) {
+        if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b)) return ray_typed_null(-RAY_I32);
+        return ray_i32((int32_t)(a->i64 - b->i64));
+    }
+    /* DATE - TIME → TIMESTAMP */
+    if (a->type == -RAY_DATE && b->type == -RAY_TIME) {
+        if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b)) return ray_typed_null(-RAY_TIMESTAMP);
+        return ray_timestamp(a->i64 * 86400000000000LL - b->i64 * 1000000LL);
+    }
+    /* TIME - int → TIME */
+    if (a->type == -RAY_TIME && is_numeric(b)) {
+        return ray_time(a->i64 - as_i64(b));
+    }
+    /* int - TIME → TIME (negative) */
+    if (is_numeric(a) && b->type == -RAY_TIME) {
+        return ray_time(as_i64(a) - b->i64);
+    }
+    /* TIME - TIME → TIME */
+    if (a->type == -RAY_TIME && b->type == -RAY_TIME) {
+        if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b)) return ray_typed_null(-RAY_TIME);
+        return ray_time(a->i64 - b->i64);
+    }
+    /* TIMESTAMP - int → TIMESTAMP */
+    if (a->type == -RAY_TIMESTAMP && is_numeric(b)) {
+        return ray_timestamp(a->i64 - as_i64(b));
+    }
+    /* TIMESTAMP - TIME → TIMESTAMP */
+    if (a->type == -RAY_TIMESTAMP && b->type == -RAY_TIME) {
+        if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b)) return ray_typed_null(-RAY_TIMESTAMP);
+        return ray_timestamp(a->i64 - b->i64 * 1000000LL);
+    }
+    /* TIMESTAMP - TIMESTAMP → int (nanos difference) */
+    if (a->type == -RAY_TIMESTAMP && b->type == -RAY_TIMESTAMP) {
+        if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b)) return ray_typed_null(-RAY_I64);
+        return make_i64(a->i64 - b->i64);
+    }
+    /* TIMESTAMP - DATE → error */
+    if (a->type == -RAY_TIMESTAMP && b->type == -RAY_DATE)
+        return ray_error("type", NULL);
+
+    if (!is_numeric(a) || !is_numeric(b)) /* e.g. int - DATE or TIME - DATE reach this error */
+        return ray_error("type", "cannot subtract %s and %s",
+                         ray_type_name(a->type), ray_type_name(b->type));
+    /* Null propagation */
+    if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b)) return null_for_promoted(a, b);
+    if (is_float_op(a, b)) {
+        double r = as_f64(a) - as_f64(b);
+        if (r == 0.0) r = 0.0; /* normalize -0.0 to +0.0 */
+        return make_f64(r);
+    }
+    int8_t rt = promote_int_type_right(a, b); /* NOTE(review): add/mul use promote_int_type; the _right variant here looks deliberate — confirm */
+    return make_typed_int(rt, as_i64(a) - as_i64(b));
+}
+
+ray_t* ray_mul_fn(ray_t* a, ray_t* b) {
+    const int a_time = (a->type == -RAY_TIME);
+    const int b_time = (b->type == -RAY_TIME);
+    /* number * TIME and TIME * number both scale the TIME value. */
+    if (b_time && is_numeric(a)) {
+        if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b)) return ray_typed_null(-RAY_TIME);
+        return ray_time(as_i64(a) * b->i64);
+    }
+    if (a_time && is_numeric(b)) {
+        if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b)) return ray_typed_null(-RAY_TIME);
+        return ray_time(a->i64 * as_i64(b));
+    }
+    /* A product of two TIMEs is rejected. */
+    if (a_time && b_time) return ray_error("type", NULL);
+    /* Every remaining combination must be plain numeric. */
+    if (!(is_numeric(a) && is_numeric(b)))
+        return ray_error("type", "cannot multiply %s and %s",
+                         ray_type_name(a->type), ray_type_name(b->type));
+    /* A null on either side propagates as the null chosen by null_for_promoted. */
+    if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b)) return null_for_promoted(a, b);
+    if (is_float_op(a, b)) return make_f64(as_f64(a) * as_f64(b));
+    int8_t result_type = promote_int_type(a, b);
+    return make_typed_int(result_type, as_i64(a) * as_i64(b));
+}
+
+ray_t* ray_div_fn(ray_t* a, ray_t* b) { /* div: floor division of atoms; result type follows a; zero divisor → typed null */
+    /* Temporal / numeric → temporal (same type as left operand) */
+    if (is_temporal(a) && is_numeric(b)) {
+        if (RAY_ATOM_IS_NULL(b) || RAY_ATOM_IS_NULL(a))
+            return ray_typed_null(a->type);
+        int64_t q;
+        if (is_float_op(a, b)) {
+            double bv = as_f64(b);
+            if (bv == 0.0)
+                return ray_typed_null(a->type);
+            double fq = floor((double)a->i64 / bv);
+            /* double→int64 is UB outside [-2^63, 2^63); NaN fails this test too → null */
+            if (!(fq >= -9223372036854775808.0 && fq < 9223372036854775808.0))
+                return ray_typed_null(a->type);
+            q = (int64_t)fq;
+        } else {
+            int64_t bv = as_i64(b), av = a->i64;
+            /* 0 divisor → null; INT64_MIN / -1 is unrepresentable (UB, SIGFPE on x86) → null */
+            if (bv == 0 || (bv == -1 && av == INT64_MIN))
+                return ray_typed_null(a->type);
+            q = av / bv;
+            if ((av ^ bv) < 0 && q * bv != av) q--;
+        }
+        if (a->type == -RAY_TIME)      return ray_time(q);
+        if (a->type == -RAY_DATE)      return ray_date(q);
+        return ray_timestamp(q);
+    }
+    if (!is_numeric(a) || !is_numeric(b))
+        return ray_error("type", "cannot divide %s by %s",
+                         ray_type_name(a->type), ray_type_name(b->type));
+    /* u8: unsigned byte division — div by 0 returns 0 */
+    if (a->type == -RAY_U8) {
+        uint8_t bv = (uint8_t)as_i64(b);
+        if (bv == 0 || RAY_ATOM_IS_NULL(b)) return make_u8(0);
+        if (RAY_ATOM_IS_NULL(a)) return make_u8(0);
+        return make_u8((uint8_t)((uint8_t)as_i64(a) / bv));
+    }
+    /* Null propagation — null operand → typed null matching left operand type */
+    if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b))
+        return ray_typed_null(a->type);
+    /* Floor division: the result is floor(a / b); its type follows the LEFT operand. */
+    if (is_float_op(a, b)) {
+        double bv = as_f64(b);
+        if (bv == 0.0)
+            return ray_typed_null(a->type);
+        double result = floor(as_f64(a) / bv);
+        if (a->type == -RAY_F64) return make_f64(result);
+        /* Range-check BEFORE narrowing: (double)INT64_MAX rounds up to 2^63, so the
+         * upper bound must be exclusive, and i16/i32 need the check as well. */
+        if (!(result >= -9223372036854775808.0 && result < 9223372036854775808.0))
+            return ray_typed_null((a->type == -RAY_I16 || a->type == -RAY_I32) ? a->type : -RAY_I64);
+        if (a->type == -RAY_I16) return make_i16((int16_t)(int64_t)result);
+        if (a->type == -RAY_I32) return make_i32((int32_t)(int64_t)result);
+        return make_i64((int64_t)result);
+    }
+    int64_t bv = as_i64(b);
+    int64_t av = as_i64(a);
+    /* 0 divisor → null; INT64_MIN / -1 is unrepresentable (UB, SIGFPE on x86) → null */
+    if (bv == 0 || (bv == -1 && av == INT64_MIN))
+        return ray_typed_null(a->type);
+    int64_t q = av / bv;
+    if ((av ^ bv) < 0 && q * bv != av) q--;  /* C truncates; step down to the floor when signs differ */
+    if (a->type == -RAY_I16) return make_i16((int16_t)q);
+    if (a->type == -RAY_I32) return make_i32((int32_t)q);
+    return make_i64(q);
+}
+
+ray_t* ray_mod_fn(ray_t* a, ray_t* b) { /* mod: floored modulo of atoms; sign follows divisor b; zero divisor → typed null */
+    /* Temporal % numeric → temporal (same type as left operand) */
+    if (is_temporal(a) && is_numeric(b)) {
+        if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b))
+            return ray_typed_null(a->type);
+        int64_t bv;
+        if (b->type == -RAY_F64) {
+            double bvf = b->f64;
+            /* double→int64 is UB for NaN or outside [-2^63, 2^63) → null instead */
+            if (!(bvf >= -9223372036854775808.0 && bvf < 9223372036854775808.0))
+                return ray_typed_null(a->type);
+            bv = (int64_t)bvf;  /* truncates; 0.0 and |bvf| < 1 become 0 → null below */
+        } else {
+            bv = as_i64(b);
+        }
+        if (bv == 0)
+            return ray_typed_null(a->type);
+        int64_t av = a->i64;
+        int64_t result = 0;  /* x mod -1 is always 0; skipping the division avoids INT64_MIN / -1 */
+        if (bv != -1) {
+            int64_t q = av / bv;
+            if ((av ^ bv) < 0 && q * bv != av) q--;
+            result = av - bv * q;
+        }
+        if (a->type == -RAY_TIME)      return ray_time(result);
+        if (a->type == -RAY_DATE)      return ray_date(result);
+        return ray_timestamp(result);
+    }
+    if (!is_numeric(a) || !is_numeric(b))
+        return ray_error("type", "cannot mod %s by %s",
+                         ray_type_name(a->type), ray_type_name(b->type));
+
+    /* u8: unsigned byte modulo, no null sentinel — mod by 0 returns 0 */
+    if (b->type == -RAY_U8) {
+        uint8_t bv = b->u8;
+        if (bv == 0) return make_u8(0);
+        return make_u8((uint8_t)((uint8_t)as_i64(a) % bv));
+    }
+    /* From here b is not u8; a u8 left operand is treated as a plain integer. */
+    /* Null propagation and division by zero: null type follows RIGHT operand */
+    if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b)) {
+        int8_t rt = (b->type == -RAY_F64 || a->type == -RAY_F64) ? -RAY_F64 : b->type;
+        return ray_typed_null(rt);
+    }
+
+    /* Float modulo: result = a - b * floor(a/b), type follows RIGHT or f64 */
+    if (is_float_op(a, b)) {
+        double av = as_f64(a), bv = as_f64(b);
+        if (bv == 0.0) {
+            int8_t rt = (b->type == -RAY_F64 || a->type == -RAY_F64) ? -RAY_F64 : b->type;
+            return ray_typed_null(rt);
+        }
+        double result = av - bv * floor(av / bv);
+        /* Snap tiny residual to 0 */
+        if (fabs(result) < 1e-12 || fabs(result - fabs(bv)) < 1e-12) result = bv > 0 ? 0.0 : -0.0;
+        if (b->type == -RAY_F64 || a->type == -RAY_F64) return make_f64(result);
+        if (b->type == -RAY_I32) return make_i32((int32_t)(int64_t)result);
+        if (b->type == -RAY_I16) return make_i16((int16_t)(int64_t)result);
+        return make_i64((int64_t)result);
+    }
+    /* Integer modulo: result = a - b * floor(a/b), sign follows b (divisor) */
+    int64_t av = as_i64(a), bv = as_i64(b);
+    if (bv == 0)
+        return ray_typed_null(b->type);
+    int64_t result = 0;  /* x mod -1 is always 0; never evaluate INT64_MIN / -1 (UB, traps on x86) */
+    if (bv != -1) {
+        int64_t q = av / bv;
+        if ((av ^ bv) < 0 && q * bv != av) q--;  /* floor division */
+        result = av - bv * q;
+    }
+    /* Result type follows RIGHT operand (u8 divisors already returned above) */
+    if (b->type == -RAY_I32) return make_i32((int32_t)result);
+    if (b->type == -RAY_I16) return make_i16((int16_t)result);
+    return make_i64(result);
+}
+
+ray_t* ray_neg_fn(ray_t* x) {
+    /* neg: arithmetic negation of a numeric atom, keeping its type.
+     * The most negative value of each signed width has no positive
+     * counterpart, so negating it would overflow (UB).  Per the k/q
+     * convention that case yields a typed null of the same width, which
+     * callers can detect with `nil?`. */
+    /* Null atoms are handed back unchanged (with an extra reference). */
+    if (RAY_ATOM_IS_NULL(x)) { ray_retain(x); return x; }
+    switch (x->type) {
+    case -RAY_F64:
+        return make_f64(-x->f64);
+    case -RAY_I64:
+        return RAY_UNLIKELY(x->i64 == INT64_MIN) ? ray_typed_null(-RAY_I64) : make_i64(-x->i64);
+    case -RAY_I32:
+        return RAY_UNLIKELY(x->i32 == INT32_MIN) ? ray_typed_null(-RAY_I32) : make_i32(-x->i32);
+    case -RAY_I16:
+        return RAY_UNLIKELY(x->i16 == INT16_MIN) ? ray_typed_null(-RAY_I16) : make_i16(-x->i16);
+    default:
+        /* Any other atom type has no negation here. */
+        return ray_error("type", NULL);
+    }
+}
+
+/* round: nearest integer via C round() (halfway cases away from zero); the result is always f64 */
+ray_t* ray_round_fn(ray_t* x) {
+    if (RAY_ATOM_IS_NULL(x)) return ray_typed_null(-RAY_F64);
+    const int is_f64 = (x->type == -RAY_F64);
+    if (!is_f64 && !is_numeric(x)) return ray_error("type", NULL);
+    return make_f64(round(is_f64 ? x->f64 : as_f64(x)));
+}
+
+/* floor: f64 atoms are rounded toward -inf; nulls and other numeric atoms come back unchanged */
+ray_t* ray_floor_fn(ray_t* x) {
+    if (!RAY_ATOM_IS_NULL(x) && x->type == -RAY_F64) return make_f64(floor(x->f64));
+    if (!RAY_ATOM_IS_NULL(x) && !is_numeric(x)) return ray_error("type", NULL);
+    ray_retain(x);
+    return x;
+}
+
+/* ceil: f64 atoms are rounded toward +inf; nulls and other numeric atoms come back unchanged */
+ray_t* ray_ceil_fn(ray_t* x) {
+    if (!RAY_ATOM_IS_NULL(x) && x->type == -RAY_F64) return make_f64(ceil(x->f64));
+    if (!RAY_ATOM_IS_NULL(x) && !is_numeric(x)) return ray_error("type", NULL);
+    ray_retain(x);
+    return x;
+}
+
+/* abs: magnitude of a numeric atom, keeping its type.  The minimum value
+ * of each signed width has no positive counterpart, so it maps to a typed
+ * null of that width (the same convention `neg` uses) rather than to a
+ * negative "absolute value" or to signed-overflow UB. */
+ray_t* ray_abs_fn(ray_t* x) {
+    if (RAY_ATOM_IS_NULL(x)) { ray_retain(x); return x; }
+    switch (x->type) {
+    case -RAY_F64: return make_f64(fabs(x->f64));
+    case -RAY_I64:
+        if (RAY_UNLIKELY(x->i64 == INT64_MIN)) return ray_typed_null(-RAY_I64);
+        return make_i64(x->i64 >= 0 ? x->i64 : -x->i64);
+    case -RAY_I32:
+        if (RAY_UNLIKELY(x->i32 == INT32_MIN)) return ray_typed_null(-RAY_I32);
+        return make_i32(x->i32 >= 0 ? x->i32 : -x->i32);
+    case -RAY_I16:
+        if (RAY_UNLIKELY(x->i16 == INT16_MIN)) return ray_typed_null(-RAY_I16);
+        return make_i16(x->i16 >= 0 ? x->i16 : -x->i16);
+    default:
+        return ray_error("type", NULL);
+    }
+}
+
+/* sqrt: square root of a numeric atom; the result is always f64 (null in → f64 null) */
+ray_t* ray_sqrt_fn(ray_t* x) {
+    if (RAY_ATOM_IS_NULL(x)) return ray_typed_null(-RAY_F64);
+    const int is_f64 = (x->type == -RAY_F64);
+    if (!is_f64 && !is_numeric(x)) return ray_error("type", NULL);
+    return make_f64(sqrt(is_f64 ? x->f64 : as_f64(x)));
+}
+
+/* log: natural logarithm of a numeric atom; the result is always f64 (null in → f64 null) */
+ray_t* ray_log_fn(ray_t* x) {
+    if (RAY_ATOM_IS_NULL(x)) return ray_typed_null(-RAY_F64);
+    const int is_f64 = (x->type == -RAY_F64);
+    if (!is_f64 && !is_numeric(x)) return ray_error("type", NULL);
+    return make_f64(log(is_f64 ? x->f64 : as_f64(x)));
+}
+
+/* exp: e raised to a numeric atom; the result is always f64 (null in → f64 null) */
+ray_t* ray_exp_fn(ray_t* x) {
+    if (RAY_ATOM_IS_NULL(x)) return ray_typed_null(-RAY_F64);
+    const int is_f64 = (x->type == -RAY_F64);
+    if (!is_f64 && !is_numeric(x)) return ray_error("type", NULL);
+    return make_f64(exp(is_f64 ? x->f64 : as_f64(x)));
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/builtins.c b/crates/rayforce-sys/vendor/rayforce/src/ops/builtins.c
new file mode 100644
index 0000000..756e39e
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/builtins.c
@@ -0,0 +1,2681 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+/**   I/O builtins, type casting, and misc builtins extracted from eval.c.
+ */
+
+#include "lang/eval.h"
+#include "lang/internal.h"
+#include "lang/env.h"
+#include "vec/vec.h"
+#include "lang/nfo.h"
+#include "lang/parse.h"
+#include "core/pool.h"
+#include "core/types.h"
+#include "io/csv.h"
+#include "ops/ops.h"
+#include "table/sym.h"
+#include "core/profile.h"
+#include "mem/sys.h"
+#include "lang/format.h"
+
+#include <string.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#if !defined(RAY_OS_WINDOWS)
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#endif
+
+/* ══════════════════════════════════════════
+ * I/O builtins: println, show, format, read-csv, write-csv, as, type
+ * ══════════════════════════════════════════ */
+
+/* Helper: null literal spelling for a typed null atom's (negative) type tag, e.g. "0Ni" for I32. */
+static const char* null_literal_str(int8_t type) {
+    const int base = -type;  /* atom tags are the negated base type */
+    if (base == RAY_I16)       return "0Nh";
+    if (base == RAY_I32)       return "0Ni";
+    if (base == RAY_I64)       return "0Nl";
+    if (base == RAY_F32)       return "0Ne";
+    if (base == RAY_F64)       return "0Nf";
+    if (base == RAY_DATE)      return "0Nd";
+    if (base == RAY_TIME)      return "0Nt";
+    if (base == RAY_TIMESTAMP) return "0Np";
+    if (base == RAY_SYM)       return "0Ns";
+    /* No dedicated literal for this type. */
+    return "null";
+}
+
+/* Helper: print a ray_t value to a file handle */
+void ray_lang_print(FILE* fp, ray_t* val) {
+    if (!val || RAY_IS_ERR(val)) { fprintf(fp, "error"); return; }
+    if (RAY_IS_NULL(val)) { fprintf(fp, "null"); return; }
+    /* Materialize lazy handles before printing */
+    if (ray_is_lazy(val))
+        val = ray_lazy_materialize(val);
+    if (!val || RAY_IS_ERR(val)) { fprintf(fp, "error"); return; }
+    if (RAY_ATOM_IS_NULL(val)) {
+        fprintf(fp, "%s", null_literal_str(val->type));
+        return;
+    }
+    switch (val->type) {
+    case -RAY_I64:  fprintf(fp, "%ld", (long)val->i64); break;
+    case -RAY_F64: {
+        double fv = val->f64;
+        if (fv == 0.0 && signbit(fv)) fv = 0.0;
+        fprintf(fp, "%g", fv);
+        break;
+    }
+    case -RAY_BOOL: fprintf(fp, "%s", val->b8 ? "true" : "false"); break;
+    case -RAY_SYM: {
+        ray_t* s = ray_sym_str(val->i64);
+        if (s) fprintf(fp, "'%.*s", (int)ray_str_len(s), ray_str_ptr(s));
+        else fprintf(fp, "'?");
+        break;
+    }
+    case -RAY_STR: {
+        const char* s = ray_str_ptr(val);
+        size_t slen = ray_str_len(val);
+        fprintf(fp, "%.*s", (int)slen, s);
+        break;
+    }
+    case RAY_LIST: {
+        fprintf(fp, "[");
+        int64_t len = ray_len(val);
+        ray_t** elems = (ray_t**)ray_data(val);
+        for (int64_t i = 0; i < len; i++) {
+            if (i > 0) fprintf(fp, " ");
+            ray_lang_print(fp, elems[i]);
+        }
+        fprintf(fp, "]");
+        break;
+    }
+    case RAY_TABLE:
+        fprintf(fp, "<table %ldx%ld>",
+                (long)ray_table_nrows(val), (long)ray_table_ncols(val));
+        break;
+    case RAY_UNARY: case RAY_BINARY: case RAY_VARY: {
+        const char* name = ray_fn_name(val);
+        fprintf(fp, "%s", name[0] ? name : "builtin");
+        break;
+    }
+    default: {
+        /* Fall back to ray_fmt for everything else: i16, i32, u8, all
+         * vector types (I16/I32/F64/SYM/...), DICT, GUID, temporal, etc.
+         * Without this println on (println 5i) printed "<type:-4>" — a
+         * debug placeholder, not the value. */
+        ray_t* s = ray_fmt(val, 0);
+        if (s && !RAY_IS_ERR(s)) {
+            fprintf(fp, "%.*s", (int)ray_str_len(s), ray_str_ptr(s));
+            ray_release(s);
+        } else {
+            fprintf(fp, "<type:%d>", val->type);
+            if (s) ray_release(s);
+        }
+        break;
+    }
+    }
+}
+
+/* Helper for fmt_interpolate: append `len` bytes from `src` to the growable
+ * buffer (*buf, capacity *cap) at offset *pos, always keeping one spare byte
+ * for the trailing NUL.  Capacity grows by doubling.
+ * Returns 1 on success.  On allocation failure the buffer is freed, *buf is
+ * set to NULL and 0 is returned.
+ * NOTE(review): assumes ray_sys_realloc follows realloc() semantics (the old
+ * block stays valid when it returns NULL) — confirm. */
+static int fmt_buf_append(char** buf, size_t* cap, size_t* pos,
+                          const char* src, size_t len) {
+    size_t need = *pos + len + 1;
+    if (need > *cap) {
+        size_t grown = *cap;
+        while (need > grown) grown *= 2;
+        char* moved = ray_sys_realloc(*buf, grown);
+        if (!moved) {
+            ray_sys_free(*buf);
+            *buf = NULL;
+            return 0;
+        }
+        *buf = moved;
+        *cap = grown;
+    }
+    if (len > 0) memcpy(*buf + *pos, src, len);
+    *pos += len;
+    return 1;
+}
+
+/* Helper: format string with % placeholders, substituting args.
+ *
+ *   fmt, flen  : template bytes and their length
+ *   args/nargs : full argument array; substitution consumes entries starting
+ *                at index arg_start, one per '%'
+ *   out_len    : receives the result length (excluding the trailing NUL);
+ *                only written when a buffer is returned
+ *
+ * Each '%' is replaced by the next argument's text while arguments remain;
+ * once they run out, further '%' characters are copied literally.
+ *
+ * Returns a heap-allocated, NUL-terminated char* (caller must ray_sys_free).
+ * Returns NULL if fmt has no % (caller falls back to plain print) or if an
+ * allocation fails. */
+static char* fmt_interpolate(const char* fmt, size_t flen, ray_t** args, int64_t nargs, int64_t arg_start, size_t* out_len) {
+    /* Quick scan: any % in fmt? */
+    if (flen == 0 || memchr(fmt, '%', flen) == NULL) return NULL;
+
+    /* Build result in a dynamic buffer */
+    size_t cap = flen + 256;
+    char* buf = ray_sys_alloc(cap);
+    if (!buf) return NULL;
+    size_t pos = 0;
+    int64_t ai = arg_start;
+
+    for (size_t i = 0; i < flen; i++) {
+        if (fmt[i] != '%' || ai >= nargs) {
+            /* Literal byte (including a '%' with no argument left). */
+            if (!fmt_buf_append(&buf, &cap, &pos, &fmt[i], 1)) return NULL;
+            continue;
+        }
+
+        char tmp[256];            /* scratch for the snprintf-formatted cases */
+        int tlen = 0;             /* snprintf result when `in_tmp` is set */
+        int in_tmp = 1;           /* 0 when `piece` points at external bytes */
+        const char* piece = tmp;
+        size_t plen = 0;
+        ray_t* owned = NULL;      /* temporary string object to release after copying */
+
+        ray_t* a = args[ai++];
+        if (ray_is_lazy(a)) a = ray_lazy_materialize(a);
+
+        if (!a || RAY_IS_ERR(a)) {
+            tlen = snprintf(tmp, sizeof(tmp), "error");
+        } else if (RAY_ATOM_IS_NULL(a)) {
+            tlen = snprintf(tmp, sizeof(tmp), "%s", null_literal_str(a->type));
+        } else if (a->type == -RAY_I64) {
+            /* long long, not long: long is 32-bit on LLP64 targets and
+             * would truncate the 64-bit value. */
+            tlen = snprintf(tmp, sizeof(tmp), "%lld", (long long)a->i64);
+        } else if (a->type == -RAY_F64) {
+            double fv = a->f64;
+            if (fv == 0.0 && signbit(fv)) fv = 0.0;   /* print -0.0 as 0 */
+            tlen = snprintf(tmp, sizeof(tmp), "%g", fv);
+        } else if (a->type == -RAY_BOOL) {
+            tlen = snprintf(tmp, sizeof(tmp), "%s", a->b8 ? "true" : "false");
+        } else if (a->type == -RAY_STR) {
+            /* Strings are copied straight from the atom, bypassing tmp. */
+            piece = ray_str_ptr(a);
+            plen = ray_str_len(a);
+            in_tmp = 0;
+        } else if (a->type == -RAY_SYM) {
+            ray_t* ss = ray_sym_str(a->i64);
+            if (ss) {
+                piece = ray_str_ptr(ss);
+                plen = ray_str_len(ss);
+                owned = ss;
+                in_tmp = 0;
+            } else {
+                tlen = snprintf(tmp, sizeof(tmp), "'?");
+            }
+        } else {
+            /* Fall back to ray_fmt */
+            ray_t* formatted = ray_fmt(a, 0);
+            if (formatted && !RAY_IS_ERR(formatted)) {
+                piece = ray_str_ptr(formatted);
+                plen = ray_str_len(formatted);
+                owned = formatted;
+                in_tmp = 0;
+            } else {
+                if (formatted) ray_release(formatted);
+                tlen = snprintf(tmp, sizeof(tmp), "<type:%d>", a->type);
+            }
+        }
+
+        if (in_tmp) {
+            /* snprintf returns the untruncated length (or a negative value
+             * on error); clamp to what tmp actually holds. */
+            if (tlen < 0) tlen = 0;
+            if ((size_t)tlen >= sizeof(tmp)) tlen = (int)sizeof(tmp) - 1;
+            piece = tmp;
+            plen = (size_t)tlen;
+        }
+
+        int ok = fmt_buf_append(&buf, &cap, &pos, piece, plen);
+        if (owned) ray_release(owned);
+        if (!ok) return NULL;
+    }
+    buf[pos] = '\0';
+    *out_len = pos;
+    return buf;
+}
+
+/* (println val1 val2 ...) — write the values to stdout followed by '\n'.
+ * With two or more args where the first is a string containing '%', the
+ * remaining args are substituted into it via fmt_interpolate; otherwise the
+ * values are printed space-separated with ray_lang_print.
+ * Lazy args are materialized in place first.  Always returns RAY_NULL_OBJ. */
+ray_t* ray_println_fn(ray_t** args, int64_t n) {
+    int64_t k;
+    for (k = 0; k < n; k++) {
+        if (!ray_is_lazy(args[k])) continue;
+        args[k] = ray_lazy_materialize(args[k]);
+    }
+
+    /* Format-string mode is attempted only for a leading string atom. */
+    char* text = NULL;
+    size_t text_len = 0;
+    ray_t* head = (n >= 2) ? args[0] : NULL;
+    if (head && head->type == -RAY_STR) {
+        const char* tmpl = ray_str_ptr(head);
+        size_t tmpl_len = ray_str_len(head);
+        text = fmt_interpolate(tmpl, tmpl_len, args, n, 1, &text_len);
+    }
+
+    if (text) {
+        fwrite(text, 1, text_len, stdout);
+        ray_sys_free(text);
+    } else {
+        /* Plain mode: no placeholders (or not a format call at all). */
+        for (k = 0; k < n; k++) {
+            if (k > 0) fputc(' ', stdout);
+            ray_lang_print(stdout, args[k]);
+        }
+    }
+    fputc('\n', stdout);
+    fflush(stdout);
+    return RAY_NULL_OBJ;
+}
+
+/* (print val1 val2 ...) — same as println but emits no trailing newline.
+ * A leading string containing '%' (with at least one more arg) switches to
+ * format-string mode; otherwise values are printed space-separated.
+ * Lazy args are materialized in place first.  Always returns RAY_NULL_OBJ. */
+ray_t* ray_print_fn(ray_t** args, int64_t n) {
+    int64_t k;
+    for (k = 0; k < n; k++) {
+        if (!ray_is_lazy(args[k])) continue;
+        args[k] = ray_lazy_materialize(args[k]);
+    }
+
+    /* Format-string mode is attempted only for a leading string atom. */
+    char* text = NULL;
+    size_t text_len = 0;
+    ray_t* head = (n >= 2) ? args[0] : NULL;
+    if (head && head->type == -RAY_STR) {
+        const char* tmpl = ray_str_ptr(head);
+        size_t tmpl_len = ray_str_len(head);
+        text = fmt_interpolate(tmpl, tmpl_len, args, n, 1, &text_len);
+    }
+
+    if (text) {
+        fwrite(text, 1, text_len, stdout);
+        ray_sys_free(text);
+    } else {
+        /* Plain mode: no placeholders (or not a format call at all). */
+        for (k = 0; k < n; k++) {
+            if (k > 0) fputc(' ', stdout);
+            ray_lang_print(stdout, args[k]);
+        }
+    }
+    fflush(stdout);
+    return RAY_NULL_OBJ;
+}
+
+/* (show val1 val2 ...) — write each value to stdout as rendered by
+ * ray_fmt(value, 1), with no separator between values, then a newline.
+ * NULL or error args print the literal text "error"; if ray_fmt itself
+ * fails the value is printed with ray_lang_print instead.
+ * Lazy args are materialized in place.  Always returns RAY_NULL_OBJ. */
+ray_t* ray_show_fn(ray_t** args, int64_t n) {
+    for (int64_t k = 0; k < n; k++) {
+        if (ray_is_lazy(args[k])) args[k] = ray_lazy_materialize(args[k]);
+        ray_t* v = args[k];
+        if (!v || RAY_IS_ERR(v)) {
+            fputs("error", stdout);
+            continue;
+        }
+
+        ray_t* text = ray_fmt(v, 1);
+        if (!text) {
+            ray_lang_print(stdout, v);
+            continue;
+        }
+        if (RAY_IS_ERR(text)) {
+            ray_release(text);
+            ray_lang_print(stdout, v);
+            continue;
+        }
+
+        const char* bytes = ray_str_ptr(text);
+        size_t nbytes = ray_str_len(text);
+        fwrite(bytes, 1, nbytes, stdout);
+        ray_release(text);
+    }
+    fputc('\n', stdout);
+    fflush(stdout);
+    return RAY_NULL_OBJ;
+}
+
+/* (format "hello % world %" a b) — build a string by replacing each '%' in
+ * the template (first arg) with the following args, in order.
+ * Errors: "domain" when called with no args, "type" when the first arg is
+ * missing or not a string atom.
+ * When fmt_interpolate yields nothing (no '%' in the template) the template
+ * itself is returned with an extra reference. */
+ray_t* ray_format_fn(ray_t** args, int64_t n) {
+    if (n < 1) return ray_error("domain", NULL);
+
+    for (int64_t k = 0; k < n; k++) {
+        if (!ray_is_lazy(args[k])) continue;
+        args[k] = ray_lazy_materialize(args[k]);
+    }
+
+    ray_t* tmpl = args[0];
+    if (!tmpl || tmpl->type != -RAY_STR) return ray_error("type", NULL);
+
+    const char* tmpl_bytes = ray_str_ptr(tmpl);
+    size_t tmpl_len = ray_str_len(tmpl);
+    size_t produced = 0;
+    char* text = fmt_interpolate(tmpl_bytes, tmpl_len, args, n, 1, &produced);
+    if (!text) {
+        /* No placeholders: return fmt as-is */
+        ray_retain(tmpl);
+        return tmpl;
+    }
+
+    ray_t* out = ray_str(text, produced);
+    ray_sys_free(text);
+    return out;
+}
+
+/* Helper for ray_resolve_fn: true when `id` is a positive sym ID whose
+ * interned name looks like a user-defined symbol — at least two characters
+ * and not starting with a digit or one of  + - * / < > = ! ? _  . */
+static bool resolve_is_user_sym(int64_t id) {
+    if (id <= 0) return false;
+    ray_t* sn = ray_sym_str(id);
+    if (!sn) return false;
+    size_t slen = ray_str_len(sn);
+    const char* sp = ray_str_ptr(sn);
+    bool ok = slen >= 2 &&
+              !(sp[0] >= '0' && sp[0] <= '9') &&
+              memchr("+-*/<>=!?_", sp[0], 10) == NULL;
+    /* Release the string like the other ray_sym_str callers in this file
+     * do; previously it was dropped without a release on every row. */
+    ray_release(sn);
+    return ok;
+}
+
+/* (resolve x) / (resolve db x) — SPECIAL_FORM: evaluates its own args.
+ * In the two-argument form `db` is evaluated and discarded (kept for compat).
+ *
+ * Behaviour depends on what `x` evaluates to:
+ *   - symbol atom : looked up in the env; returns the bound value with an
+ *                   extra reference, or NULL when the name is unbound
+ *   - table       : returns a new table in which every I64 column whose
+ *                   values ALL look like user-defined sym IDs is replaced by
+ *                   a SYM column (sym names instead of intern IDs); all
+ *                   other columns are carried over unchanged
+ *   - anything else is returned as-is
+ *
+ * Errors: "arity" with no args, "type"/"domain" for a null argument or a
+ * failed materialization; evaluation and allocation errors are passed
+ * through. */
+ray_t* ray_resolve_fn(ray_t** args, int64_t n) {
+    if (n < 1) return ray_error("arity", "resolve expects at least 1 argument");
+
+    /* Evaluate all args */
+    ray_t* tbl = NULL;
+    if (n == 1) {
+        tbl = ray_eval(args[0]);
+    } else {
+        /* (resolve db tbl) — ignore db, use tbl */
+        ray_t* db = ray_eval(args[0]);
+        if (db && !RAY_IS_ERR(db)) ray_release(db);
+        tbl = ray_eval(args[1]);
+    }
+    if (!tbl || RAY_IS_ERR(tbl)) return tbl ? tbl : ray_error("type", "resolve: null argument");
+
+    /* Materialize lazy tables */
+    if (ray_is_lazy(tbl)) {
+        ray_t* mat = ray_lazy_materialize(tbl);
+        ray_release(tbl);
+        if (!mat || RAY_IS_ERR(mat)) return mat ? mat : ray_error("domain", "resolve: materialization failed");
+        tbl = mat;
+    }
+
+    /* Not a table: symbol atoms are env lookups, everything else passes through */
+    if (tbl->type != RAY_TABLE) {
+        if (tbl->type == -RAY_SYM) {
+            ray_t* val = ray_env_get(tbl->i64);
+            ray_release(tbl);
+            if (!val) return NULL;
+            ray_retain(val);
+            return val;
+        }
+        return tbl;
+    }
+
+    int64_t ncols = ray_table_ncols(tbl);
+    int64_t nrows = ray_table_nrows(tbl);
+
+    /* Build a new table replacing I64 columns with SYM columns where possible */
+    ray_t* result = ray_table_new(ncols);
+    if (RAY_IS_ERR(result)) { ray_release(tbl); return result; }
+
+    for (int64_t c = 0; c < ncols; c++) {
+        ray_t* col = ray_table_get_col_idx(tbl, c);
+        int64_t col_name = ray_table_col_name(tbl, c);
+        if (!col) continue;
+
+        /* Convert only when the column is non-empty and ALL values resolve
+         * to user-defined symbols.  This distinguishes symbol references
+         * (name='Alice') from entity IDs (e=1) that merely collide with
+         * low sym IDs. */
+        bool all_user_sym = false;
+        int64_t* data = NULL;
+        if (col->type == RAY_I64) {
+            data = (int64_t*)ray_data(col);
+            all_user_sym = (nrows > 0);
+            for (int64_t r = 0; all_user_sym && r < nrows; r++)
+                all_user_sym = resolve_is_user_sym(data[r]);
+        }
+
+        if (all_user_sym) {
+            /* Convert to SYM column */
+            ray_t* sym_col = ray_vec_new(RAY_SYM, nrows);
+            if (RAY_IS_ERR(sym_col)) { ray_release(result); ray_release(tbl); return sym_col; }
+            for (int64_t r = 0; r < nrows; r++) {
+                sym_col = ray_vec_append(sym_col, &data[r]);
+                if (RAY_IS_ERR(sym_col)) { ray_release(result); ray_release(tbl); return sym_col; }
+            }
+            result = ray_table_add_col(result, col_name, sym_col);
+            ray_release(sym_col);
+        } else {
+            /* Non-I64 column, or I64 that is not all symbols: keep as-is */
+            result = ray_table_add_col(result, col_name, col);
+        }
+        if (RAY_IS_ERR(result)) { ray_release(tbl); return result; }
+    }
+
+    ray_release(tbl);
+    return result;
+}
+
+/* (timeit expr) — evaluate `expr` once and return the elapsed wall time in
+ * milliseconds as an F64 atom.  The evaluation result is discarded; an error
+ * result does not change the return value.
+ * SPECIAL_FORM: receives its argument unevaluated.
+ * Returns a "domain" error when called without an argument. */
+ray_t* ray_timeit_fn(ray_t** args, int64_t n) {
+    if (n < 1) return ray_error("domain", NULL);
+
+    const int64_t started_ns = ray_profile_now_ns();
+    ray_t* value = ray_eval(args[0]);
+    const int64_t finished_ns = ray_profile_now_ns();
+
+    if (value && !RAY_IS_ERR(value)) ray_release(value);
+
+    const int64_t elapsed_ns = finished_ns - started_ns;
+    return make_f64((double)elapsed_ns / 1e6);
+}
+
+/* (exit code) — terminate the process via exit().  The status is the
+ * argument converted with as_i64 when it is numeric, otherwise 0. */
+ray_t* ray_exit_fn(ray_t* arg) {
+    int status = (arg && is_numeric(arg)) ? (int)as_i64(arg) : 0;
+    exit(status);
+    return NULL; /* unreachable */
+}
+
+/* (read-csv path) — read CSV file, return RAY_TABLE */
+/* Helper: map a type-name symbol (e.g. I64, F64, SYMBOL, TIMESTAMP) to its
+ * ray type code.  Matching is exact and case-sensitive.
+ * Returns -1 when the symbol cannot be resolved to a string or the name is
+ * not one of the known type names. */
+static int8_t resolve_type_name(int64_t sym_id) {
+    static const struct {
+        const char* name;
+        size_t      len;
+        int8_t      code;
+    } known[] = {
+        { "I64",       3, RAY_I64 },
+        { "I32",       3, RAY_I32 },
+        { "I16",       3, RAY_I16 },
+        { "F64",       3, RAY_F64 },
+        { "B8",        2, RAY_BOOL },
+        { "U8",        2, RAY_U8 },
+        { "SYMBOL",    6, RAY_SYM },
+        { "STR",       3, RAY_STR },
+        { "F32",       3, RAY_F32 },
+        { "DATE",      4, RAY_DATE },
+        { "TIME",      4, RAY_TIME },
+        { "TIMESTAMP", 9, RAY_TIMESTAMP },
+        { "GUID",      4, RAY_GUID },
+    };
+
+    ray_t* s = ray_sym_str(sym_id);
+    if (!s) return -1;
+
+    const char* text = ray_str_ptr(s);
+    size_t text_len = ray_str_len(s);
+    int8_t code = -1;
+    for (size_t k = 0; k < sizeof(known) / sizeof(known[0]); k++) {
+        if (known[k].len != text_len) continue;
+        if (memcmp(text, known[k].name, text_len) != 0) continue;
+        code = known[k].code;
+        break;
+    }
+
+    ray_release(s);
+    return code;
+}
+
+/* (read-csv "path") or (read-csv [types] "path") — read a CSV file and
+ * return the resulting table.
+ *
+ * When called with two or more args and the first is a SYM vector, it is
+ * treated as a per-column schema of type names (see resolve_type_name) and
+ * the second arg is the path; otherwise the first arg is the path.
+ *
+ * Errors:
+ *   "domain" : no args, or the path string has no data pointer
+ *   "type"   : path is missing / not a string atom, or an unknown type name
+ *   "limit"  : schema has more than 256 columns
+ *   "io"     : the underlying reader returned NULL or an error
+ *
+ * NOTE(review): `path` is handed to the reader as a plain char*; this
+ * presumably relies on ray_str_ptr data being NUL-terminated — confirm. */
+ray_t* ray_read_csv_fn(ray_t** args, int64_t n) {
+    enum { READ_CSV_MAX_COLS = 256 };
+
+    if (n < 1) return ray_error("domain", NULL);
+
+    /* (read-csv [types] "path") or (read-csv "path") */
+    ray_t* path_obj = args[0];
+    ray_t* schema = NULL;
+    if (n >= 2 && args[0] && ray_is_vec(args[0]) && args[0]->type == RAY_SYM) {
+        schema = args[0];
+        path_obj = args[1];
+    }
+
+    /* Guard against a NULL argument before touching ->type, as the other
+     * builtins in this file do. */
+    if (!path_obj || path_obj->type != -RAY_STR) return ray_error("type", NULL);
+    const char* path = ray_str_ptr(path_obj);
+    if (!path) return ray_error("domain", NULL);
+
+    if (schema) {
+        int64_t ncols = schema->len;
+        if (ncols > READ_CSV_MAX_COLS) return ray_error("limit", NULL);
+        int8_t col_types[READ_CSV_MAX_COLS];
+        int64_t* sym_ids = (int64_t*)ray_data(schema);
+        for (int64_t i = 0; i < ncols; i++) {
+            col_types[i] = resolve_type_name(sym_ids[i]);
+            if (col_types[i] < 0) return ray_error("type", NULL);
+        }
+        ray_t* tbl = ray_read_csv_opts(path, 0, true, col_types, (int32_t)ncols);
+        if (!tbl || RAY_IS_ERR(tbl)) return ray_error("io", NULL);
+        return tbl;
+    }
+
+    ray_t* tbl = ray_read_csv(path);
+    if (!tbl || RAY_IS_ERR(tbl)) return ray_error("io", NULL);
+    return tbl;
+}
+
+/* (write-csv table path) — write `table` to the CSV file at `path`.
+ * Returns the I64 atom 0 on success.
+ *
+ * Errors:
+ *   "domain" : fewer than two args, or the path string has no data pointer
+ *   "type"   : either arg is missing, the first is not a table, or the
+ *              second is not a string atom
+ *   otherwise the code string for the ray_err_t returned by ray_write_csv
+ *
+ * NOTE(review): `path` is handed to the writer as a plain char*; this
+ * presumably relies on ray_str_ptr data being NUL-terminated — confirm. */
+ray_t* ray_write_csv_fn(ray_t** args, int64_t n) {
+    if (n < 2) return ray_error("domain", NULL);
+    ray_t* tbl = args[0];
+    ray_t* path_obj = args[1];
+    /* Guard against NULL arguments before touching ->type, as the other
+     * builtins in this file do. */
+    if (!tbl || tbl->type != RAY_TABLE) return ray_error("type", NULL);
+    if (!path_obj || path_obj->type != -RAY_STR) return ray_error("type", NULL);
+    const char* path = ray_str_ptr(path_obj);
+    if (!path) return ray_error("domain", NULL);
+    ray_err_t err = ray_write_csv(tbl, path);
+    if (err != RAY_OK) return ray_error(ray_err_code_str(err), NULL);
+    return make_i64(0);
+}
+
+/* (as 'TypeName value) — type cast */
+/* ASCII case-insensitive comparison of a length-delimited type name against
+ * a NUL-terminated target.  Returns 1 when both have the same length and
+ * match ignoring the case of a-z / A-Z, otherwise 0. */
+static int cast_match(const char* tname, size_t tlen, const char* target) {
+    if (strlen(target) != tlen) return 0;
+
+    const char* p = tname;
+    const char* q = target;
+    for (size_t left = tlen; left > 0; left--, p++, q++) {
+        char x = *p;
+        char y = *q;
+        /* Fold lowercase ASCII letters to uppercase on both sides. */
+        if ('a' <= x && x <= 'z') x = (char)(x - ('a' - 'A'));
+        if ('a' <= y && y <= 'z') y = (char)(y - ('a' - 'A'));
+        if (x != y) return 0;
+    }
+    return 1;
+}
+
+/* Helper: carry null markers from the cast source `val` over to the freshly
+ * built destination `vec`.
+ *   - vector source : delegates to ray_vec_copy_nulls; on failure `vec` is
+ *                     released and an "oom" error is returned
+ *   - list source   : every position whose element is a null atom is marked
+ *                     null in `vec`
+ *   - anything else : nothing to do
+ * Returns `vec` on success. */
+static ray_t* cast_vec_copy_nulls(ray_t* vec, ray_t* val) {
+    if (ray_is_vec(val)) {
+        if (ray_vec_copy_nulls(vec, val) == RAY_OK) return vec;
+        ray_release(vec);
+        return ray_error("oom", NULL);
+    }
+
+    if (val->type != RAY_LIST) return vec;
+
+    ray_t** items = (ray_t**)ray_data(val);
+    int64_t at = 0;
+    while (at < vec->len) {
+        ray_t* item = items[at];
+        if (item && RAY_ATOM_IS_NULL(item))
+            ray_vec_set_null(vec, at, true);
+        at++;
+    }
+    return vec;
+}
+
+/* Bulk-cast loop over [_lo, _hi).  Reads `R` from `_src_p`, writes `W`
+ * to `_dst_p`.  No atom allocations.  The single-threaded path passes
+ * the whole [0, n2) range; the parallel worker passes its slice.
+ *
+ * Parameters:
+ *   R, W     : C element types of the source and destination arrays
+ *   EXPR     : conversion expression; it may refer to `_v`, the current
+ *              source element (of type R)
+ *   _lo, _hi : half-open index range to convert
+ *
+ * Names captured from the expansion site (must be in scope there):
+ *   _src_p, _dst_p : untyped pointers to the source / destination data
+ *   n2             : element count, used by CAST_LOOP only */
+#define CAST_LOOP_RANGE(R, W, EXPR, _lo, _hi) do {                     \
+    const R* _src = (const R*)_src_p;                                  \
+    W* _dst = (W*)_dst_p;                                              \
+    for (int64_t _i = (_lo); _i < (_hi); _i++) {                       \
+        R _v = _src[_i];                                               \
+        _dst[_i] = (EXPR);                                             \
+    }                                                                  \
+} while (0)
+/* Whole-array convenience form: converts indices [0, n2). */
+#define CAST_LOOP(R, W, EXPR) CAST_LOOP_RANGE(R, W, EXPR, 0, n2)
+
+/* Same-byte-rep type relabels: reports whether a vector of type `a` can be
+ * turned into type `b` by copying the bytes unchanged.  True when the types
+ * are equal, or both belong to the same family:
+ *   {I64, TIMESTAMP}  or  {I32, DATE, TIME}. */
+static bool cast_vec_relabel_compat(int8_t a, int8_t b) {
+    if (a == b) return true;
+
+    /* 0 = no family, 1 = 64-bit family, 2 = 32-bit family */
+    int fam_a = 0;
+    int fam_b = 0;
+
+    switch (a) {
+    case RAY_I64: case RAY_TIMESTAMP:            fam_a = 1; break;
+    case RAY_I32: case RAY_DATE: case RAY_TIME:  fam_a = 2; break;
+    default: break;
+    }
+    switch (b) {
+    case RAY_I64: case RAY_TIMESTAMP:            fam_b = 1; break;
+    case RAY_I32: case RAY_DATE: case RAY_TIME:  fam_b = 2; break;
+    default: break;
+    }
+
+    return fam_a != 0 && fam_a == fam_b;
+}
+
+/* cast_vec_numeric_fast (defined further below): vec→vec numeric cast on
+ * raw arrays (no per-element atom allocs).  Returns the populated `vec`
+ * on success, an error object when cancellation is observed, or NULL if
+ * the (in_type, out_type) pair is unsupported here — caller falls back
+ * to the generic path.
+ *
+ * Temporal cross-unit pairs (matched between the per-atom slow path
+ * and the fast path):
+ *   DATE → TIMESTAMP : days * NS_PER_DAY
+ *   TIMESTAMP → DATE : floor-div by NS_PER_DAY (so ns=-1 → -1 day,
+ *                       i.e. 1999-12-31, not 2000-01-01).
+ *   TIMESTAMP → TIME : floor-mod by NS_PER_DAY then /1_000_000
+ *                       (ns→ms within day, always in [0, 86_400_000)).
+ * Plain `% / /` would truncate toward zero per C semantics and give
+ * wrong components for pre-2000 timestamps; the helpers below give
+ * Python-style floor semantics for a positive divisor. */
+/* Nanoseconds in one day (24 * 60 * 60 * 1e9). */
+#define NS_PER_DAY 86400000000000LL
+
+/* Whole days contained in `ns`, rounded toward negative infinity
+ * (e.g. ns = -1 yields -1, not 0). */
+static inline int64_t ts_days_floor(int64_t ns) {
+    int64_t days = ns / NS_PER_DAY;
+    /* C division truncates toward zero; step down when a negative
+     * remainder shows the quotient was rounded up. */
+    if (ns % NS_PER_DAY < 0) days--;
+    return days;
+}
+
+/* Nanoseconds elapsed within the day containing `ns`; the result is always
+ * in [0, NS_PER_DAY), also for negative inputs. */
+static inline int64_t ts_ns_in_day(int64_t ns) {
+    int64_t rem = ns % NS_PER_DAY;
+    return rem < 0 ? rem + NS_PER_DAY : rem;
+}
+
+/* Element-wise cast worker: writes _dst_p[lo..hi) from _src_p[lo..hi).
+ * Used by both the single-threaded fast path and the parallel dispatch.
+ * Returns true on hit; false means caller falls back to the generic
+ * (atom) path.
+ *
+ * Parameters:
+ *   _src_p, _dst_p : raw source / destination element arrays; the names
+ *                    are fixed because CAST_LOOP_RANGE refers to them
+ *   lo, hi         : half-open index range to convert
+ *   in_type        : element type of the source array
+ *   out_type       : element type of the destination array
+ *
+ * When the pair is not handled nothing is written and false is returned.
+ * Same-type pairs (e.g. F64 → F64) have no leaf here. */
+static bool cast_range_worker(const void* _src_p, void* _dst_p,
+                              int64_t lo, int64_t hi,
+                              int8_t in_type, int8_t out_type) {
+    /* Temporal unit conversions.  These are tested before the generic
+     * switch, so they take precedence over the plain integer casts the
+     * switch would otherwise apply to the same pairs. */
+    if (in_type == RAY_DATE && out_type == RAY_TIMESTAMP) {
+        CAST_LOOP_RANGE(int32_t, int64_t, (int64_t)_v * NS_PER_DAY, lo, hi);
+        return true;
+    }
+    if (in_type == RAY_TIMESTAMP && out_type == RAY_DATE) {
+        /* Floor-div, not truncate-toward-zero: ns=-1 must give -1 day
+         * (1999-12-31), not 0 (2000-01-01). */
+        CAST_LOOP_RANGE(int64_t, int32_t, (int32_t)ts_days_floor(_v), lo, hi);
+        return true;
+    }
+    if (in_type == RAY_TIMESTAMP && out_type == RAY_TIME) {
+        /* Floor-mod ns within day, then ns→ms. */
+        CAST_LOOP_RANGE(int64_t, int32_t,
+                        (int32_t)(ts_ns_in_day(_v) / 1000000LL), lo, hi);
+        return true;
+    }
+    /* Generic numeric pairs.  The big switch dispatches on (out_type,
+     * in_type); each leaf is a tight typed loop the compiler vectorizes. */
+    /* CL runs the loop and returns true from the function, so the case
+     * labels below never fall through into each other.  An in_type with
+     * no matching case leaves the inner switch, hits `break`, and ends
+     * at the final `return false`. */
+#define CL(R, W, EXPR) do { CAST_LOOP_RANGE(R, W, EXPR, lo, hi); return true; } while (0)
+    switch (out_type) {
+    case RAY_I64: case RAY_TIMESTAMP:
+        switch (in_type) {
+        case RAY_BOOL:  CL(uint8_t,  int64_t, _v ? 1 : 0);
+        case RAY_U8:    CL(uint8_t,  int64_t, (int64_t)_v);
+        case RAY_I16:   CL(int16_t,  int64_t, (int64_t)_v);
+        case RAY_I32: case RAY_DATE: case RAY_TIME:
+                        CL(int32_t,  int64_t, (int64_t)_v);
+        case RAY_F64:   CL(double,   int64_t, (int64_t)_v);
+        }
+        break;
+    case RAY_I32: case RAY_DATE: case RAY_TIME:
+        switch (in_type) {
+        case RAY_BOOL:  CL(uint8_t,  int32_t, _v ? 1 : 0);
+        case RAY_U8:    CL(uint8_t,  int32_t, (int32_t)_v);
+        case RAY_I16:   CL(int16_t,  int32_t, (int32_t)_v);
+        case RAY_I64: case RAY_TIMESTAMP:
+                        CL(int64_t,  int32_t, (int32_t)_v);
+        case RAY_F64:   CL(double,   int32_t, (int32_t)_v);
+        }
+        break;
+    case RAY_I16:
+        switch (in_type) {
+        case RAY_BOOL:  CL(uint8_t,  int16_t, _v ? 1 : 0);
+        case RAY_U8:    CL(uint8_t,  int16_t, (int16_t)_v);
+        case RAY_I32: case RAY_DATE: case RAY_TIME:
+                        CL(int32_t,  int16_t, (int16_t)_v);
+        case RAY_I64: case RAY_TIMESTAMP:
+                        CL(int64_t,  int16_t, (int16_t)_v);
+        case RAY_F64:   CL(double,   int16_t, (int16_t)_v);
+        }
+        break;
+    case RAY_U8:
+        switch (in_type) {
+        case RAY_BOOL:  CL(uint8_t,  uint8_t, _v ? 1 : 0);
+        case RAY_I16:   CL(int16_t,  uint8_t, (uint8_t)_v);
+        case RAY_I32: case RAY_DATE: case RAY_TIME:
+                        CL(int32_t,  uint8_t, (uint8_t)_v);
+        case RAY_I64: case RAY_TIMESTAMP:
+                        CL(int64_t,  uint8_t, (uint8_t)_v);
+        case RAY_F64:   CL(double,   uint8_t, (uint8_t)_v);
+        }
+        break;
+    case RAY_F64:
+        switch (in_type) {
+        case RAY_BOOL:  CL(uint8_t,  double, _v ? 1.0 : 0.0);
+        case RAY_U8:    CL(uint8_t,  double, (double)_v);
+        case RAY_I16:   CL(int16_t,  double, (double)_v);
+        case RAY_I32: case RAY_DATE: case RAY_TIME:
+                        CL(int32_t,  double, (double)_v);
+        case RAY_I64: case RAY_TIMESTAMP:
+                        CL(int64_t,  double, (double)_v);
+        }
+        break;
+    case RAY_BOOL:
+        /* Any non-zero input becomes 1. */
+        switch (in_type) {
+        case RAY_U8:    CL(uint8_t,  uint8_t, _v != 0 ? 1 : 0);
+        case RAY_I16:   CL(int16_t,  uint8_t, _v != 0 ? 1 : 0);
+        case RAY_I32: case RAY_DATE: case RAY_TIME:
+                        CL(int32_t,  uint8_t, _v != 0 ? 1 : 0);
+        case RAY_I64: case RAY_TIMESTAMP:
+                        CL(int64_t,  uint8_t, _v != 0 ? 1 : 0);
+        case RAY_F64:   CL(double,   uint8_t, _v != 0.0 ? 1 : 0);
+        }
+        break;
+    }
+#undef CL
+    /* No leaf matched: unsupported (in_type, out_type) pair. */
+    return false;
+}
+
+/* Job description passed through ray_pool_dispatch to cast_par_fn: the raw
+ * source/destination arrays plus the (in, out) type pair that each task
+ * forwards to cast_range_worker. */
+typedef struct {
+    const void* src;       /* source element array */
+    void*       dst;       /* destination element array */
+    int8_t      in_type;   /* element type of src */
+    int8_t      out_type;  /* element type of dst */
+} cast_par_ctx_t;
+
+/* Pool task body for the parallel cast: converts the slice [lo, hi) of the
+ * arrays described by `arg` (a cast_par_ctx_t).  `worker_id` is unused.
+ *
+ * The task does nothing when an interrupt (SIGINT via
+ * ray_request_interrupt / ray_interrupted) is pending: the pool's own
+ * per-task gate only looks at `pool->cancelled`, so a Ctrl-C arriving
+ * during dispatch would otherwise not short-circuit the workers.  The
+ * dispatching caller re-checks for cancellation afterwards and reports the
+ * error. */
+static void cast_par_fn(void* arg, uint32_t worker_id, int64_t lo, int64_t hi) {
+    (void)worker_id;
+    if (!ray_interrupted()) {
+        const cast_par_ctx_t* job = (const cast_par_ctx_t*)arg;
+        (void)cast_range_worker(job->src, job->dst, lo, hi,
+                                job->in_type, job->out_type);
+    }
+}
+
+/* Threshold below which the dispatch overhead outweighs the speedup.
+ * Memory-bound conversions saturate ~3 GB/s single-thread; with 8
+ * workers we approach DRAM peak (~25 GB/s).  Below ~256 K elements the
+ * 50 µs dispatch cost dominates. */
+#define CAST_PAR_MIN_ELEMS 262144
+
+/* Cancel-aware bulk cast of the typed vector `val` into the preallocated
+ * vector `vec` of element type `out_type`.
+ *
+ * Parameters:
+ *   val      : source vector; its type and length drive the conversion
+ *   vec      : destination vector, written in place through ray_data(vec)
+ *   out_type : element type to produce
+ *
+ * Strategy, in order:
+ *   1. relabel-compatible types  -> chunked memcpy
+ *   2. large input with a pool of at least two workers -> parallel dispatch
+ *   3. otherwise                 -> chunked single-thread loop
+ *
+ * Returns `vec` on success, an error object ("cancel") when cancellation
+ * is observed, or NULL when the type pair is unsupported.  This function
+ * never releases `vec`. */
+static ray_t* cast_vec_numeric_fast(ray_t* val, ray_t* vec, int8_t out_type) {
+    int8_t in_type = val->type;
+    int64_t n2 = val->len;
+    ray_pool_t* pool = ray_pool_get();
+
+/* A cast is "cancelled" if EITHER:
+ *   (a) the pool's per-query cancel flag is set (e.g. via ray_cancel
+ *       from another thread or a long-query timeout), or
+ *   (b) the eval-loop interrupt flag is set (Ctrl-C / SIGINT, signalled
+ *       by ray_request_interrupt and observed via ray_interrupted).
+ * Both must be polled — they're independent signals and either one
+ * means the user wants the operation to abort. */
+#define CANCELLED() ((pool && atomic_load_explicit(&pool->cancelled,   \
+                                                   memory_order_acquire)) \
+                     || ray_interrupted())
+/* Returns the "cancel" error if cancellation is pending, else `retval`.
+ * Always returns from the enclosing function. */
+#define CHECK_CANCEL_OR(retval) do {                                   \
+    if (CANCELLED()) return ray_error("cancel", NULL);                 \
+    return (retval);                                                   \
+} while (0)
+
+    /* Function-entry cancel check — gates ALL paths below (relabel,
+     * parallel, and chunked single-thread).  Without this, a cancel
+     * pending at entry would still execute the first ~50 µs of any
+     * path before being observed. */
+    if (CANCELLED()) return ray_error("cancel", NULL);
+
+    /* Same byte-rep types: chunked memcpy.  A single
+     * memcpy(_, _, n*esz) on a 10M-element TIMESTAMP relabel is ~80 MB
+     * and ~10 ms of opaque work — cancel arriving during it can't
+     * interrupt the libc copy, so we'd happily return `vec` even if
+     * the user asked to abort.  Break the copy into ~1 MB chunks and
+     * poll cancel between them; max in-flight work between checks is
+     * one chunk (~100 µs at realistic bandwidth). */
+    if (cast_vec_relabel_compat(in_type, out_type)) {
+        size_t esz = (size_t)ray_elem_size(out_type);
+        if (n2 > 0 && esz > 0) {
+            const char* sp = (const char*)ray_data(val);
+            char* dp = (char*)ray_data(vec);
+            size_t total = (size_t)n2 * esz;
+            const size_t chunk_bytes = (size_t)1 << 20;  /* 1 MiB */
+            size_t off = 0;
+            while (off < total) {
+                if (CANCELLED()) return ray_error("cancel", NULL);
+                size_t cn = total - off;
+                if (cn > chunk_bytes) cn = chunk_bytes;
+                memcpy(dp + off, sp + off, cn);
+                off += cn;
+            }
+        }
+        /* Post-check: a cancel landing in the final chunk would have
+         * been missed by the in-loop check (we copy then exit). */
+        if (CANCELLED()) return ray_error("cancel", NULL);
+        return vec;
+    }
+
+    const void* src_p = ray_data(val);
+    void* dst_p = ray_data(vec);
+
+    /* Three return states from this point on (helper does NOT touch
+     * `vec`'s reference count):
+     *
+     *   - `vec`           : success, fully populated, no cancel observed
+     *   - error pointer   : cancellation observed at any point — the
+     *                       helper bails out as soon as it notices,
+     *                       even mid-loop in the single-thread path
+     *   - NULL            : (in_type, out_type) pair unsupported here
+     *                       AND no cancellation observed — caller may
+     *                       safely fall through to the per-atom slow
+     *                       path with `vec` still valid */
+
+    if (pool && n2 >= CAST_PAR_MIN_ELEMS && ray_pool_total_workers(pool) >= 2) {
+        cast_par_ctx_t pctx = { .src = src_p, .dst = dst_p,
+                                .in_type = in_type, .out_type = out_type };
+        /* Probe the worker on a single element to verify the pair is
+         * supported here.  If unsupported, fall through (NULL) — but
+         * still re-check cancel first so a cancel raced into the probe
+         * window is not swallowed. */
+        if (n2 > 0 && cast_range_worker(src_p, dst_p, 0, 1, in_type, out_type)) {
+            ray_pool_dispatch(pool, cast_par_fn, &pctx, n2);
+            if (CANCELLED()) return ray_error("cancel", NULL);
+            return vec;
+        }
+        CHECK_CANCEL_OR(NULL);
+    }
+
+    /* Chunked single-thread path.  Tight typed loops vectorize well
+     * but block cancellation for the whole `n2` range — chunk into
+     * cache-sized pieces so cancel is honored within ~one chunk
+     * (64K elements ≈ 50 µs at realistic bandwidth). */
+    if (n2 == 0)
+        CHECK_CANCEL_OR(vec);
+    /* Re-check cancel right before the first chunk runs (entry cancel
+     * check above is over the whole helper, but if a cancel raced in
+     * between the relabel path and here we want to bail before doing
+     * any work). */
+    if (CANCELLED()) return ray_error("cancel", NULL);
+    int64_t chunk = (int64_t)65536;
+    int64_t lo = 0;
+    int64_t hi = (n2 < chunk) ? n2 : chunk;
+    /* Probe the first chunk; if it fails, the (in, out) pair is
+     * unsupported here and the caller falls through. */
+    if (!cast_range_worker(src_p, dst_p, lo, hi, in_type, out_type))
+        CHECK_CANCEL_OR(NULL);
+    lo = hi;
+    /* Remaining chunks: the pair is known to be supported, so the
+     * worker's return value is not re-examined. */
+    while (lo < n2) {
+        if (CANCELLED()) return ray_error("cancel", NULL);
+        hi = lo + chunk;
+        if (hi > n2) hi = n2;
+        cast_range_worker(src_p, dst_p, lo, hi, in_type, out_type);
+        lo = hi;
+    }
+    CHECK_CANCEL_OR(vec);
+#undef CHECK_CANCEL_OR
+#undef CANCELLED
+}
+
+/* Helper for cast_vec_numeric: true when either the pool's cancel flag
+ * (if a pool exists) or the interrupt flag is set. */
+static bool cast_nulls_cancel_pending(ray_pool_t* pool) {
+    if (pool && atomic_load_explicit(&pool->cancelled, memory_order_acquire))
+        return true;
+    return ray_interrupted();
+}
+
+/* Helper: cast a vector/list `val` to a new vector of `out_type`
+ * (I64, I32, I16, U8, F64, BOOL, DATE, TIME, TIMESTAMP or SYM).
+ * `type_sym` is the target type symbol, forwarded to ray_cast_fn on the
+ * per-element path.
+ *
+ * Three strategies, tried in order:
+ *   1. typed non-string source, non-SYM target: raw-array cast through
+ *      cast_vec_numeric_fast (no per-element atoms)
+ *   2. STR vector -> SYM: intern each string directly
+ *   3. everything else (RAY_LIST and other shapes): cast element by element
+ * Null markers are copied from the source afterwards in every case.
+ *
+ * Returns the new vector, or an error object (the partially built vector
+ * is released first). */
+static ray_t* cast_vec_numeric(ray_t* type_sym, ray_t* val, int8_t out_type) {
+    const int64_t count = val->len;
+    ray_t* vec = ray_vec_new(out_type, count);
+    if (RAY_IS_ERR(vec)) return vec;
+    vec->len = count;
+
+    const bool typed_src = ray_is_vec(val);
+
+    /* Strategy 1: raw-array fast path. */
+    const bool raw_numeric =
+        typed_src && out_type != RAY_SYM &&
+        !(val->type == RAY_STR || val->type == RAY_SYM || val->type == RAY_GUID);
+    if (raw_numeric) {
+        /* The fast helper never releases `vec`:
+         *   error -> cancellation, we release `vec`
+         *   NULL  -> unsupported pair, continue with the slower paths
+         *   vec   -> success */
+        ray_t* fast = cast_vec_numeric_fast(val, vec, out_type);
+        if (RAY_IS_ERR(fast)) {
+            ray_release(vec);
+            return fast;
+        }
+        if (fast != NULL) {
+            /* The null-bitmap copy runs after the cancel-aware cast, so a
+             * cancel arriving around it would be hidden by the success
+             * return.  Check before (to skip the work) and after (to
+             * catch a cancel that landed during it). */
+            ray_pool_t* pool = ray_pool_get();
+            if (cast_nulls_cancel_pending(pool)) {
+                ray_release(vec);
+                return ray_error("cancel", NULL);
+            }
+            ray_t* copied = cast_vec_copy_nulls(vec, val);
+            if (RAY_IS_ERR(copied)) return copied;
+            if (cast_nulls_cancel_pending(pool)) {
+                ray_release(vec);
+                return ray_error("cancel", NULL);
+            }
+            return vec;
+        }
+    }
+
+    /* Strategy 2: STR vec → SYM vec.  Interns straight from each element's
+     * (ptr, len) without building atoms or recursing into ray_cast_fn.
+     * ray_sym_intern uses the table's coarse lock so this stays
+     * single-threaded — but it skips ~150 ns of overhead per row. */
+    if (out_type == RAY_SYM && typed_src && val->type == RAY_STR) {
+        int64_t* sym_ids = (int64_t*)ray_data(vec);
+        int64_t row = 0;
+        while (row < count) {
+            size_t nbytes = 0;
+            const char* bytes = ray_str_vec_get(val, row, &nbytes);
+            int64_t id = bytes ? ray_sym_intern(bytes, nbytes)
+                               : ray_sym_intern("", 0);
+            if (id < 0) {
+                ray_release(vec);
+                return ray_error("oom", NULL);
+            }
+            sym_ids[row] = id;
+            row++;
+        }
+        ray_t* copied = cast_vec_copy_nulls(vec, val);
+        if (RAY_IS_ERR(copied)) return copied;
+        return vec;
+    }
+
+    /* Strategy 3: per-element cast through ray_cast_fn. */
+    void* slots = ray_data(vec);
+    int64_t idx = 0;
+    while (idx < count) {
+        int owned = 0;
+        ray_t* item = collection_elem(val, idx, &owned);
+        if (RAY_IS_ERR(item)) {
+            ray_release(vec);
+            return item;
+        }
+        ray_t* atom = ray_cast_fn(type_sym, item);
+        if (owned) ray_release(item);
+        if (RAY_IS_ERR(atom)) {
+            ray_release(vec);
+            return atom;
+        }
+
+        /* Store the atom's payload using the slot width of out_type. */
+        if (out_type == RAY_I64 || out_type == RAY_TIMESTAMP || out_type == RAY_SYM)
+            ((int64_t*)slots)[idx] = atom->i64;
+        else if (out_type == RAY_I32 || out_type == RAY_DATE || out_type == RAY_TIME)
+            ((int32_t*)slots)[idx] = atom->i32;
+        else if (out_type == RAY_I16)
+            ((int16_t*)slots)[idx] = atom->i16;
+        else if (out_type == RAY_U8)
+            ((uint8_t*)slots)[idx] = atom->u8;
+        else if (out_type == RAY_F64)
+            ((double*)slots)[idx] = atom->f64;
+        else if (out_type == RAY_BOOL)
+            ((bool*)slots)[idx] = atom->b8;
+
+        ray_release(atom);
+        idx++;
+    }
+
+    ray_t* copied = cast_vec_copy_nulls(vec, val);
+    if (RAY_IS_ERR(copied)) return copied;
+    return vec;
+}
+
+/* Cast `val` to the type named by the symbol atom `type_sym`.
+ * Atoms are converted directly; vectors and lists are cast element-wise
+ * (via cast_vec_numeric, or the per-element loops in the STR and GUID
+ * branches).  Returns the cast value, or an error: "type" for an
+ * unsupported source type, "domain" for an unknown name or bad string. */
+ray_t* ray_cast_fn(ray_t* type_sym, ray_t* val) {
+    if (type_sym->type != -RAY_SYM) return ray_error("type", NULL);
+    /* Null propagation: casting a typed null atom produces a typed null of target type */
+    if (ray_is_atom(val) && RAY_ATOM_IS_NULL(val)) {
+        ray_t* s2 = ray_sym_str(type_sym->i64);
+        if (!s2) return ray_error("domain", NULL);
+        const char* tn = ray_str_ptr(s2);
+        size_t tl = ray_str_len(s2);
+        int8_t tt = 0;
+        if (cast_match(tn, tl, "I64") || cast_match(tn, tl, "i64")) tt = -RAY_I64;
+        else if (cast_match(tn, tl, "I32") || cast_match(tn, tl, "i32")) tt = -RAY_I32;
+        else if (cast_match(tn, tl, "I16") || cast_match(tn, tl, "i16")) tt = -RAY_I16;
+        else if (cast_match(tn, tl, "U8") || cast_match(tn, tl, "u8")) tt = -RAY_U8;
+        else if (cast_match(tn, tl, "F64") || cast_match(tn, tl, "f64")) tt = -RAY_F64;
+        else if (cast_match(tn, tl, "BOOL") || cast_match(tn, tl, "bool") || cast_match(tn, tl, "B8") || cast_match(tn, tl, "b8")) tt = -RAY_BOOL;
+        else if (cast_match(tn, tl, "SYMBOL") || cast_match(tn, tl, "symbol") || cast_match(tn, tl, "sym")) tt = -RAY_SYM;
+        else if (cast_match(tn, tl, "DATE") || cast_match(tn, tl, "date")) tt = -RAY_DATE;
+        else if (cast_match(tn, tl, "TIME") || cast_match(tn, tl, "time")) tt = -RAY_TIME;
+        else if (cast_match(tn, tl, "TIMESTAMP") || cast_match(tn, tl, "timestamp")) tt = -RAY_TIMESTAMP;
+        else if (cast_match(tn, tl, "GUID") || cast_match(tn, tl, "guid")) tt = -RAY_GUID;
+        else if (cast_match(tn, tl, "STR") || cast_match(tn, tl, "str")) { ray_release(s2); return ray_str("", 0); }
+        ray_release(s2);
+        if (tt) return ray_typed_null(tt);
+        return ray_error("domain", NULL);
+    }
+    ray_t* s = ray_sym_str(type_sym->i64);
+    if (!s) return ray_error("domain", NULL);
+    const char* tname = ray_str_ptr(s);
+    size_t tlen = ray_str_len(s);
+
+    /* Cast to I64 / i64 */
+    if (cast_match(tname, tlen, "I64") || cast_match(tname, tlen, "i64")) {
+        ray_release(s);
+        if (val->type == -RAY_I64) { ray_retain(val); return val; }
+        if (val->type == -RAY_F64) return make_i64((int64_t)val->f64);
+        if (val->type == -RAY_BOOL) return make_i64(val->b8 ? 1 : 0);
+        if (val->type == -RAY_I32 || val->type == -RAY_DATE || val->type == -RAY_TIME)
+            return make_i64(val->i32);
+        if (val->type == -RAY_TIMESTAMP) return make_i64(val->i64);
+        if (val->type == -RAY_I16) return make_i64(val->i16);
+        if (val->type == -RAY_U8) return make_i64(val->u8);
+        if (val->type == -RAY_STR) {
+            const char* sp = ray_str_ptr(val);
+            if (!sp) return ray_error("domain", NULL);
+            char* end;
+            int64_t v = strtoll(sp, &end, 10);
+            if (end == sp) return ray_error("domain", NULL);
+            return make_i64(v);
+        }
+        /* Vector/list cast */
+        if (ray_is_vec(val) || val->type == RAY_LIST)
+            return cast_vec_numeric(type_sym, val, RAY_I64);
+        return ray_error("type", NULL);
+    }
+    /* Cast to I32 / i32 */
+    if (cast_match(tname, tlen, "I32") || cast_match(tname, tlen, "i32")) {
+        ray_release(s);
+        if (val->type == -RAY_I32) { ray_retain(val); return val; }
+        if (val->type == -RAY_BOOL) return ray_i32(val->b8 ? 1 : 0);
+        if (val->type == -RAY_U8)  return ray_i32((int32_t)val->u8);
+        if (val->type == -RAY_I16) return ray_i32(val->i16);
+        if (val->type == -RAY_I64) return ray_i32((int32_t)val->i64);
+        if (val->type == -RAY_F64) return ray_i32((int32_t)val->f64);
+        if (val->type == -RAY_DATE || val->type == -RAY_TIME) return ray_i32(val->i32);
+        if (val->type == -RAY_TIMESTAMP) return ray_i32((int32_t)val->i64);
+        if (val->type == -RAY_STR) {
+            const char* sp = ray_str_ptr(val); char* end;
+            if (!sp) return ray_error("domain", NULL);
+            long v = strtol(sp, &end, 10);
+            if (end == sp) return ray_error("domain", NULL);
+            return ray_i32((int32_t)v);
+        }
+        /* Vector cast */
+        if (ray_is_vec(val) || val->type == RAY_LIST)
+            return cast_vec_numeric(type_sym, val, RAY_I32);
+        return ray_error("type", NULL);
+    }
+    /* Cast to I16 / i16 */
+    if (cast_match(tname, tlen, "I16") || cast_match(tname, tlen, "i16")) {
+        ray_release(s);
+        if (val->type == -RAY_I16) { ray_retain(val); return val; }
+        if (val->type == -RAY_BOOL) return ray_i16(val->b8 ? 1 : 0);
+        if (val->type == -RAY_U8)  return ray_i16((int16_t)val->u8);
+        if (val->type == -RAY_I32) return ray_i16((int16_t)val->i32);
+        if (val->type == -RAY_I64) return ray_i16((int16_t)val->i64);
+        if (val->type == -RAY_F64) return ray_i16((int16_t)val->f64);
+        if (val->type == -RAY_DATE || val->type == -RAY_TIME) return ray_i16((int16_t)val->i32);
+        if (val->type == -RAY_TIMESTAMP) return ray_i16((int16_t)val->i64);
+        if (val->type == -RAY_STR) {
+            const char* sp = ray_str_ptr(val); char* end;
+            if (!sp) return ray_error("domain", NULL);
+            long v = strtol(sp, &end, 10);
+            if (end == sp) return ray_error("domain", NULL);
+            return ray_i16((int16_t)v);
+        }
+        /* Vector cast */
+        if (ray_is_vec(val) || val->type == RAY_LIST)
+            return cast_vec_numeric(type_sym, val, RAY_I16);
+        return ray_error("type", NULL);
+    }
+    /* Cast to F64 / f64 */
+    if (cast_match(tname, tlen, "F64") || cast_match(tname, tlen, "f64")) {
+        ray_release(s);
+        if (val->type == -RAY_F64) { ray_retain(val); return val; }
+        if (val->type == -RAY_BOOL) return make_f64(val->b8 ? 1.0 : 0.0);
+        if (val->type == -RAY_I64) return make_f64((double)val->i64);
+        if (val->type == -RAY_I32) return make_f64((double)val->i32);
+        if (val->type == -RAY_I16) return make_f64((double)val->i16);
+        if (val->type == -RAY_U8)  return make_f64((double)val->u8);
+        if (val->type == -RAY_DATE || val->type == -RAY_TIME) return make_f64((double)val->i32);
+        if (val->type == -RAY_TIMESTAMP) return make_f64((double)val->i64);
+        if (val->type == -RAY_STR) {
+            const char* sp = ray_str_ptr(val);
+            if (!sp) return ray_error("domain", NULL);
+            char* end;
+            double v = strtod(sp, &end);
+            if (end == sp) return ray_error("domain", NULL);
+            return make_f64(v);
+        }
+        /* Vector cast */
+        if (ray_is_vec(val) || val->type == RAY_LIST)
+            return cast_vec_numeric(type_sym, val, RAY_F64);
+        return ray_error("type", NULL);
+    }
+    /* Cast to BOOL/bool/B8/b8 (same spellings the null-propagation branch accepts) */
+    if (cast_match(tname, tlen, "BOOL") || cast_match(tname, tlen, "bool") || cast_match(tname, tlen, "B8") || cast_match(tname, tlen, "b8")) {
+        ray_release(s);
+        if (val->type == -RAY_BOOL) { ray_retain(val); return val; }
+        if (val->type == -RAY_I64) return make_bool(val->i64 != 0 ? 1 : 0);
+        if (val->type == -RAY_I32) return make_bool(val->i32 != 0 ? 1 : 0);
+        if (val->type == -RAY_I16) return make_bool(val->i16 != 0 ? 1 : 0);
+        if (val->type == -RAY_U8) return make_bool(val->u8 != 0 ? 1 : 0);
+        if (val->type == -RAY_F64) return make_bool(val->f64 != 0.0 ? 1 : 0);
+        if (val->type == -RAY_DATE) return make_bool(val->i32 != 0 ? 1 : 0);
+        if (val->type == -RAY_TIME) return make_bool(val->i32 != 0 ? 1 : 0);
+        if (val->type == -RAY_TIMESTAMP) return make_bool(val->i64 != 0 ? 1 : 0);
+        if (val->type == -RAY_STR) return make_bool(ray_str_len(val) > 0 ? 1 : 0);
+        /* Vector cast: b8/B8 */
+        if (ray_is_vec(val) || val->type == RAY_LIST)
+            return cast_vec_numeric(type_sym, val, RAY_BOOL);
+        return ray_error("type", NULL);
+    }
+    /* Cast to STR/str */
+    if (cast_match(tname, tlen, "STR") || cast_match(tname, tlen, "str")) {
+        ray_release(s);
+        if (val->type == -RAY_STR) { ray_retain(val); return val; }
+        if (val->type == -RAY_SYM) {
+            ray_t* sym_str = ray_sym_str(val->i64);
+            return sym_str ? sym_str : ray_str("", 0);
+        }
+        if (val->type == -RAY_I64) {
+            char buf[32]; int n2 = snprintf(buf, sizeof(buf), "%lld", (long long)val->i64);
+            return ray_str(buf, (size_t)n2);
+        }
+        if (val->type == -RAY_I32) {
+            char buf[32]; int n2 = snprintf(buf, sizeof(buf), "%d", (int)val->i32);
+            return ray_str(buf, (size_t)n2);
+        }
+        if (val->type == -RAY_I16) {
+            char buf[32]; int n2 = snprintf(buf, sizeof(buf), "%d", (int)val->i16);
+            return ray_str(buf, (size_t)n2);
+        }
+        if (val->type == -RAY_F64) {
+            double fv = val->f64;
+            if (fv == 0.0 && signbit(fv)) fv = 0.0;   /* format -0.0 as plain 0 */
+            char buf[32]; int n2 = snprintf(buf, sizeof(buf), "%g", fv);
+            return ray_str(buf, (size_t)n2);
+        }
+        if (val->type == -RAY_BOOL) return val->b8 ? ray_str("true", 4) : ray_str("false", 5);
+        /* Fallback: use ray_fmt for any other atom type */
+        if (ray_is_atom(val)) {
+            ray_t* formatted = ray_fmt(val, 0);
+            if (formatted && !RAY_IS_ERR(formatted)) return formatted;
+            if (formatted) ray_release(formatted);
+        }
+        /* Vector/list -> STR vector: cast each element to string */
+        if (ray_is_vec(val) || val->type == RAY_LIST) {
+            int64_t n2 = val->len;
+            ray_t* vec = ray_vec_new(RAY_STR, n2);
+            if (!vec || RAY_IS_ERR(vec)) return vec ? vec : ray_error("oom", NULL);
+            for (int64_t i = 0; i < n2; i++) {
+                int alloc = 0;
+                ray_t* elem = collection_elem(val, i, &alloc);
+                if (RAY_IS_ERR(elem)) { ray_release(vec); return elem; }
+                ray_t* cast = ray_cast_fn(type_sym, elem);
+                if (alloc) ray_release(elem);
+                if (RAY_IS_ERR(cast)) { ray_release(vec); return cast; }
+                const char* sp = ray_str_ptr(cast);
+                size_t slen = ray_str_len(cast);
+                vec = ray_str_vec_append(vec, sp ? sp : "", sp ? slen : 0);
+                ray_release(cast);
+                if (RAY_IS_ERR(vec)) return vec;
+            }
+            ray_t* result = cast_vec_copy_nulls(vec, val);
+            if (RAY_IS_ERR(result)) return result;
+            return vec;
+        }
+        return ray_error("type", NULL);
+    }
+    /* Cast to SYMBOL/sym */
+    if (cast_match(tname, tlen, "SYMBOL") || cast_match(tname, tlen, "sym") || cast_match(tname, tlen, "symbol")) {
+        ray_release(s);
+        if (val->type == -RAY_SYM) { ray_retain(val); return val; }
+        if (val->type == -RAY_STR) {
+            const char* sp = ray_str_ptr(val);
+            size_t slen = ray_str_len(val);
+            int64_t id = ray_sym_intern(sp ? sp : "", sp ? slen : 0);
+            return id < 0 ? ray_error("oom", NULL) : ray_sym(id);
+        }
+        /* Integer/bool atom -> symbol: convert to plain number string */
+        if (ray_is_atom(val) && (is_numeric(val) || val->type == -RAY_BOOL)) {
+            char buf[64]; int n2;
+            if (val->type == -RAY_BOOL)     n2 = snprintf(buf, sizeof(buf), "%d", (int)val->b8);
+            else if (val->type == -RAY_U8)  n2 = snprintf(buf, sizeof(buf), "%u", (unsigned)val->u8);
+            else if (val->type == -RAY_I16) n2 = snprintf(buf, sizeof(buf), "%d", (int)val->i16);
+            else if (val->type == -RAY_I32) n2 = snprintf(buf, sizeof(buf), "%d", (int)val->i32);
+            else if (val->type == -RAY_F64) {
+                double fv = val->f64;
+                if (fv == 0.0 && signbit(fv)) fv = 0.0;
+                n2 = snprintf(buf, sizeof(buf), "%.17g", fv);
+            }
+            else n2 = snprintf(buf, sizeof(buf), "%lld", (long long)as_i64(val));
+            if (n2 > 0) {
+                int64_t id = ray_sym_intern(buf, (size_t)n2);
+                return id < 0 ? ray_error("oom", NULL) : ray_sym(id);
+            }
+        }
+        /* Temporal/guid atom -> symbol: use ray_fmt for formatting */
+        if (ray_is_atom(val) && (is_temporal(val) || val->type == -RAY_GUID)) {
+            ray_t* formatted = ray_fmt(val, 0);
+            if (formatted && !RAY_IS_ERR(formatted)) {
+                const char* sp = ray_str_ptr(formatted);
+                size_t slen = ray_str_len(formatted);
+                int64_t id = ray_sym_intern(sp ? sp : "", sp ? slen : 0);
+                ray_release(formatted);
+                return id < 0 ? ray_error("oom", NULL) : ray_sym(id);
+            }
+            if (formatted) ray_release(formatted);
+        }
+        /* Vector cast: SYMBOL vec from other vecs */
+        if (ray_is_vec(val) || val->type == RAY_LIST)
+            return cast_vec_numeric(type_sym, val, RAY_SYM);
+        return ray_error("type", NULL);
+    }
+    /* Cast to DATE/date */
+    if (cast_match(tname, tlen, "DATE") || cast_match(tname, tlen, "date")) {
+        ray_release(s);
+        if (val->type == -RAY_DATE) { ray_retain(val); return val; }
+        if (val->type == -RAY_BOOL) return ray_date((int64_t)val->b8);
+        if (val->type == -RAY_U8)  return ray_date((int64_t)val->u8);
+        if (val->type == -RAY_I16) return ray_date((int64_t)val->i16);
+        if (val->type == -RAY_I32) return ray_date((int64_t)val->i32);
+        if (val->type == -RAY_I64) return ray_date(val->i64);
+        if (val->type == -RAY_F64) return ray_date((int64_t)val->f64);
+        if (val->type == -RAY_TIME) return ray_date((int64_t)val->i32);
+        if (val->type == -RAY_TIMESTAMP) return ray_date(ts_days_floor(val->i64));
+        if (val->type == -RAY_STR) {
+            /* Parse "YYYY.MM.DD" format */
+            const char* sp = ray_str_ptr(val);
+            int y, m, d2;
+            if (!sp || sscanf(sp, "%d.%d.%d", &y, &m, &d2) != 3) return ray_error("domain", NULL);
+            if (m < 1 || m > 12) return ray_error("domain", NULL);  /* md2[] only covers months 1..12 */
+            int64_t days = 0;
+            { int ty;
+              for (ty = 2000; ty < y; ty++) days += (ty % 4 == 0 && (ty % 100 != 0 || ty % 400 == 0)) ? 366 : 365;
+              for (ty = y; ty < 2000; ty++) days -= (ty % 4 == 0 && (ty % 100 != 0 || ty % 400 == 0)) ? 366 : 365;
+            }
+            { static const int md2[] = {0,31,28,31,30,31,30,31,31,30,31,30,31};
+              int leap = (y % 4 == 0 && (y % 100 != 0 || y % 400 == 0));
+              for (int mi = 1; mi < m; mi++) days += md2[mi] + (mi == 2 && leap ? 1 : 0);
+              days += d2 - 1;
+            }
+            return ray_date(days);
+        }
+        /* Vector cast */
+        if (ray_is_vec(val) || val->type == RAY_LIST)
+            return cast_vec_numeric(type_sym, val, RAY_DATE);
+        return ray_error("type", NULL);
+    }
+    /* Cast to TIME/time */
+    if (cast_match(tname, tlen, "TIME") || cast_match(tname, tlen, "time")) {
+        ray_release(s);
+        if (val->type == -RAY_TIME) { ray_retain(val); return val; }
+        if (val->type == -RAY_BOOL) return ray_time((int64_t)val->b8);
+        if (val->type == -RAY_U8)  return ray_time((int64_t)val->u8);
+        if (val->type == -RAY_I16) return ray_time((int64_t)val->i16);
+        if (val->type == -RAY_I32) return ray_time((int64_t)val->i32);
+        if (val->type == -RAY_I64) return ray_time(val->i64);
+        if (val->type == -RAY_F64) return ray_time((int64_t)val->f64);
+        if (val->type == -RAY_DATE) return ray_time((int64_t)val->i32);
+        if (val->type == -RAY_TIMESTAMP)
+            /* TIMESTAMP is ns since epoch; TIME stores ms-of-day.  Use
+             * floor-mod (not C-style truncate-toward-zero %) so pre-
+             * 2000 timestamps give time-of-day in [0, 86_400_000) ms,
+             * matching wall-clock semantics. */
+            return ray_time((int64_t)(ts_ns_in_day(val->i64) / 1000000LL));
+        if (val->type == -RAY_STR) {
+            /* Parse "HH:MM:SS[.mmm]" */
+            const char* sp = ray_str_ptr(val);
+            int th = 0, tm = 0, ts = 0, tms = 0;
+            int nr = sp ? sscanf(sp, "%d:%d:%d", &th, &tm, &ts) : 0;  /* NULL text fails the check below */
+            if (nr < 2) return ray_error("domain", NULL);
+            const char* dot = strchr(sp, '.');
+            if (dot) {
+                dot++;
+                char mbuf[4] = "000";
+                int mi = 0;
+                while (*dot >= '0' && *dot <= '9' && mi < 3) mbuf[mi++] = *dot++;
+                tms = (int)strtol(mbuf, NULL, 10);
+            }
+            int32_t ms = (int32_t)th * 3600000 + (int32_t)tm * 60000 + (int32_t)ts * 1000 + tms;
+            return ray_time((int64_t)ms);
+        }
+        /* Vector cast */
+        if (ray_is_vec(val) || val->type == RAY_LIST)
+            return cast_vec_numeric(type_sym, val, RAY_TIME);
+        return ray_error("type", NULL);
+    }
+    /* Cast to TIMESTAMP/timestamp */
+    if (cast_match(tname, tlen, "TIMESTAMP") || cast_match(tname, tlen, "timestamp")) {
+        ray_release(s);
+        if (val->type == -RAY_TIMESTAMP) { ray_retain(val); return val; }
+        if (val->type == -RAY_BOOL) return ray_timestamp((int64_t)val->b8);
+        if (val->type == -RAY_U8)  return ray_timestamp((int64_t)val->u8);
+        if (val->type == -RAY_I16) return ray_timestamp((int64_t)val->i16);
+        if (val->type == -RAY_I32) return ray_timestamp((int64_t)val->i32);
+        if (val->type == -RAY_I64) return ray_timestamp(val->i64);
+        if (val->type == -RAY_F64) return ray_timestamp((int64_t)val->f64);
+        if (val->type == -RAY_TIME) return ray_timestamp((int64_t)val->i32);
+        if (val->type == -RAY_DATE)   /* whole days -> nanoseconds */
+            return ray_timestamp((int64_t)val->i32 * 24LL * 60 * 60 * 1000000000LL);
+        /* ISO string -> timestamp: "YYYY-MM-DD[T ]HH:MM:SS[.nnn...]" or "YYYY.MM.DDDHH:MM:SS.nnn..." */
+        if (val->type == -RAY_STR) {
+            const char* sp = ray_str_ptr(val);
+            size_t sl = ray_str_len(val);
+            if (!sp || sl < 10) return ray_error("domain", NULL);
+            int y, m, d, hh = 0, mm = 0, ss = 0;
+            long long frac = 0;
+            /* Try both date formats: YYYY-MM-DD first, then YYYY.MM.DD */
+            int parsed = sscanf(sp, "%d-%d-%d", &y, &m, &d);
+            if (parsed != 3) parsed = sscanf(sp, "%d.%d.%d", &y, &m, &d);
+            if (parsed != 3 || m < 1 || m > 12) return ray_error("domain", NULL);  /* md[] covers months 1..12 */
+            /* Parse optional time part */
+            if (sl > 10 && (sp[10] == 'T' || sp[10] == ' ' || sp[10] == 'D')) {
+                sscanf(sp + 11, "%d:%d:%d", &hh, &mm, &ss);
+                /* Parse fractional seconds */
+                const char* dot = memchr(sp + 11, '.', sl - 11);
+                if (dot) {
+                    dot++;
+                    char fbuf[10] = "000000000";
+                    int fi = 0;
+                    while (*dot >= '0' && *dot <= '9' && fi < 9) fbuf[fi++] = *dot++;
+                    frac = strtoll(fbuf, NULL, 10);
+                }
+            }
+            /* Convert to days since 2000-01-01 */
+            int64_t days = 0;
+            { int ty;
+              for (ty = 2000; ty < y; ty++) days += (ty % 4 == 0 && (ty % 100 != 0 || ty % 400 == 0)) ? 366 : 365;
+              for (ty = y; ty < 2000; ty++) days -= (ty % 4 == 0 && (ty % 100 != 0 || ty % 400 == 0)) ? 366 : 365;
+            }
+            { static const int md[] = {0,31,28,31,30,31,30,31,31,30,31,30,31};
+              int leap = (y % 4 == 0 && (y % 100 != 0 || y % 400 == 0));
+              for (int mi = 1; mi < m; mi++) days += md[mi] + (mi == 2 && leap ? 1 : 0);
+              days += d - 1;
+            }
+            int64_t ns = days * 86400000000000LL + (int64_t)hh * 3600000000000LL +
+                         (int64_t)mm * 60000000000LL + (int64_t)ss * 1000000000LL + frac;
+            /* Handle timezone offset: Z, +HH:MM, -HH:MM, +HHMM, -HHMM */
+            if (sl > 19) {
+                const char* tz = sp + 19; /* after YYYY-MM-DDTHH:MM:SS */
+                /* Skip fractional seconds */
+                if (tz < sp + sl && *tz == '.') {
+                    tz++;
+                    while (tz < sp + sl && *tz >= '0' && *tz <= '9') tz++;
+                }
+                if (tz < sp + sl) {
+                    if (*tz == '+' || *tz == '-') {   /* 'Z' means UTC: nothing to adjust */
+                        int tz_sign = (*tz == '+') ? 1 : -1;
+                        int tz_hh = 0, tz_mm = 0;
+                        tz++;
+                        /* Parse HH:MM or HHMM */
+                        if (tz + 4 < sp + sl && tz[2] == ':') {
+                            sscanf(tz, "%2d:%2d", &tz_hh, &tz_mm);
+                        } else {
+                            sscanf(tz, "%2d%2d", &tz_hh, &tz_mm);
+                        }
+                        int64_t tz_ns = ((int64_t)tz_hh * 3600 + (int64_t)tz_mm * 60) * 1000000000LL;
+                        ns -= tz_sign * tz_ns;
+                    }
+                }
+            }
+            return ray_timestamp(ns);
+        }
+        /* Vector cast */
+        if (ray_is_vec(val) || val->type == RAY_LIST)
+            return cast_vec_numeric(type_sym, val, RAY_TIMESTAMP);
+        return ray_error("type", NULL);
+    }
+    /* Cast to GUID/guid */
+    if (cast_match(tname, tlen, "GUID") || cast_match(tname, tlen, "guid")) {
+        ray_release(s);
+        if (val->type == -RAY_GUID) { ray_retain(val); return val; }
+        if (val->type == -RAY_STR) {
+            /* Parse UUID string: "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" */
+            const char* sp = ray_str_ptr(val);
+            size_t sl = ray_str_len(val);
+            if (!sp || sl < 36) return ray_error("domain", NULL);
+            uint8_t bytes[16];
+            const char* p = sp; const char* stop = sp + sl;
+            for (int bi = 0; bi < 16; bi++) {
+                if (p < stop && *p == '-') p++;
+                if (stop - p < 2) return ray_error("domain", NULL);  /* extra dashes must not run past the text */
+                char hi = *p++;
+                char lo = *p++;
+                int h = (hi >= 'a') ? hi - 'a' + 10 : (hi >= 'A') ? hi - 'A' + 10 : hi - '0';  /* NOTE: non-hex chars are not rejected */
+                int l = (lo >= 'a') ? lo - 'a' + 10 : (lo >= 'A') ? lo - 'A' + 10 : lo - '0';
+                bytes[bi] = (uint8_t)((h << 4) | l);
+            }
+            return ray_guid(bytes);
+        }
+        /* Vector of GUIDs: empty vector cast */
+        if (ray_is_vec(val) && val->len == 0)
+            return ray_vec_new(RAY_GUID, 0);
+        /* List of strings -> GUID vector */
+        if (val->type == RAY_LIST) {
+            int64_t n2 = val->len;
+            ray_t* vec = ray_vec_new(RAY_GUID, n2);
+            if (RAY_IS_ERR(vec)) return vec;
+            vec->len = n2;
+            uint8_t* data = (uint8_t*)ray_data(vec);
+            ray_t** items = (ray_t**)ray_data(val);
+            for (int64_t i = 0; i < n2; i++) {
+                ray_t* cast = ray_cast_fn(type_sym, items[i]);
+                if (RAY_IS_ERR(cast)) { ray_release(vec); return cast; }
+                if (cast->obj) memcpy(data + i * 16, ray_data(cast->obj), 16);
+                else memcpy(data + i * 16, ray_data(cast), 16);
+                ray_release(cast);
+            }
+            ray_t* result = cast_vec_copy_nulls(vec, val);
+            if (RAY_IS_ERR(result)) return result;
+            return vec;
+        }
+        return ray_error("type", NULL);
+    }
+    /* Cast to U8/u8 */
+    if (cast_match(tname, tlen, "U8") || cast_match(tname, tlen, "u8")) {
+        ray_release(s);
+        if (val->type == -RAY_U8) { ray_retain(val); return val; }
+        if (val->type == -RAY_BOOL) return ray_u8(val->b8 ? 1 : 0);
+        if (val->type == -RAY_I16) return ray_u8((uint8_t)val->i16);
+        if (val->type == -RAY_I32) return ray_u8((uint8_t)val->i32);
+        if (val->type == -RAY_I64) return ray_u8((uint8_t)val->i64);
+        if (val->type == -RAY_F64) return ray_u8((uint8_t)val->f64);
+        if (val->type == -RAY_STR) {
+            const char* sp = ray_str_ptr(val);
+            if (!sp) return ray_error("domain", NULL);
+            char* end; long v = strtol(sp, &end, 10);
+            if (end == sp) return ray_error("domain", NULL);
+            return ray_u8((uint8_t)v);
+        }
+        /* Vector cast */
+        if (ray_is_vec(val) || val->type == RAY_LIST)
+            return cast_vec_numeric(type_sym, val, RAY_U8);
+        return ray_error("type", NULL);
+    }
+    /* Cast to DICT */
+    if (cast_match(tname, tlen, "DICT")) {
+        ray_release(s);
+        if (val->type == RAY_DICT) { ray_retain(val); return val; }
+        /* Table -> Dict */
+        if (val->type == RAY_TABLE) {
+            int64_t ncols = ray_table_ncols(val);
+            ray_t* keys = ray_sym_vec_new(RAY_SYM_W64, ncols);
+            if (RAY_IS_ERR(keys)) return keys;
+            ray_t* vals = ray_list_new(ncols);
+            if (RAY_IS_ERR(vals)) { ray_release(keys); return vals; }
+            for (int64_t c = 0; c < ncols; c++) {
+                int64_t col_name = ray_table_col_name(val, c);
+                ray_t* col_val = ray_table_get_col_idx(val, c);
+                keys = ray_vec_append(keys, &col_name);
+                if (RAY_IS_ERR(keys)) { ray_release(vals); return keys; }
+                vals = ray_list_append(vals, col_val);
+                if (RAY_IS_ERR(vals)) { ray_release(keys); return vals; }
+            }
+            return ray_dict_new(keys, vals);
+        }
+        return ray_error("type", NULL);
+    }
+    /* Cast to TABLE */
+    if (cast_match(tname, tlen, "TABLE")) {
+        ray_release(s);
+        if (val->type == RAY_TABLE) { ray_retain(val); return val; }
+        /* Dict -> Table */
+        if (val->type == RAY_DICT) {
+            ray_t* dkeys = ray_dict_keys(val);
+            ray_t* dvals = ray_dict_vals(val);
+            int64_t ncols = dkeys ? dkeys->len : 0;
+            if (!dkeys || dkeys->type != RAY_SYM || !dvals || dvals->type != RAY_LIST)
+                return ray_error("type", NULL);
+            ray_t** col_ptrs = (ray_t**)ray_data(dvals);
+            ray_t* tbl = ray_table_new(ncols);
+            if (RAY_IS_ERR(tbl)) return tbl;
+            for (int64_t c = 0; c < ncols; c++) {
+                int64_t col_name = ray_read_sym(ray_data(dkeys), c, RAY_SYM, dkeys->attrs);
+                ray_t* col_val = col_ptrs[c];
+                ray_retain(col_val);
+                tbl = ray_table_add_col(tbl, col_name, col_val);
+                ray_release(col_val);
+                if (RAY_IS_ERR(tbl)) return tbl;
+            }
+            return tbl;
+        }
+        return ray_error("type", NULL);
+    }
+    ray_release(s);
+    return ray_error("domain", NULL);
+}
+
+/* (type val) — return the type name of a value as a symbol atom.
+ * A NULL pointer or null value yields `null`; an intern failure (negative
+ * id) yields an "oom" error.  ray_type_name lives in internal.h. */
+ray_t* ray_type_fn(ray_t* val) {
+    const char* name = (!val || RAY_IS_NULL(val)) ? "null" : ray_type_name(val->type);
+    int64_t id = ray_sym_intern(name, strlen(name));
+    if (id < 0) return ray_error("oom", NULL);
+    return ray_sym(id);
+}
+
+/* (read path) — read a whole file and return its contents as a string.
+ * Seek and read failures yield an "io" error instead of truncated data. */
+ray_t* ray_read_file_fn(ray_t* path_obj) {
+    if (path_obj->type != -RAY_STR) return ray_error("type", NULL);
+    const char* path = ray_str_ptr(path_obj);
+    if (!path) return ray_error("domain", NULL);
+    FILE* fp = fopen(path, "rb");
+    if (!fp) return ray_error("io", NULL);
+    long sz = (fseek(fp, 0, SEEK_END) == 0) ? ftell(fp) : -1L;
+    if (sz < 0 || fseek(fp, 0, SEEK_SET) != 0) { fclose(fp); return ray_error("io", NULL); }
+    ray_t* buf = ray_alloc((size_t)sz + 1);   /* +1 for the NUL written below */
+    if (!buf || RAY_IS_ERR(buf)) { fclose(fp); return ray_error("oom", NULL); }
+    char* data = (char*)ray_data(buf);
+    size_t rd = fread(data, 1, (size_t)sz, fp);
+    int failed = ferror(fp);                  /* a short read at EOF is not an error */
+    fclose(fp);
+    if (failed) { ray_release(buf); return ray_error("io", NULL); }
+    data[rd] = '\0';
+    ray_t* result = ray_str(data, rd);
+    ray_release(buf);
+    return result;
+}
+
+/* Parse and evaluate script text that was loaded from `path`.  `text` is a
+ * malloc'd, NUL-terminated buffer of `len` bytes; this helper owns it and
+ * frees it on every path.  The file's nfo is installed only around the
+ * ray_eval call, after which the previous nfo is restored. */
+static ray_t* load_file_eval_text(const char* path, size_t path_len,
+                                  char* text, size_t len) {
+    ray_t* nfo = ray_nfo_create(path, path_len, text, len);
+    ray_t* ast = ray_parse_with_nfo(text, nfo);
+    ray_t* out = ast;   /* a parse error is handed back unchanged */
+    if (!RAY_IS_ERR(ast)) {
+        ray_t* saved_nfo = ray_eval_get_nfo();
+        ray_eval_set_nfo(nfo);
+        out = ray_eval(ast);
+        ray_eval_set_nfo(saved_nfo);
+        ray_release(ast);
+    }
+    ray_release(nfo);
+    free(text);
+    return out;
+}
+
+/* (load path) — read and evaluate a Rayfall script file.
+ * POSIX builds mmap the file and copy it into a NUL-terminated heap
+ * buffer; Windows builds read it with fread.  An empty file returns
+ * ray_i64(0) without being parsed. */
+ray_t* ray_load_file_fn(ray_t* path_obj) {
+    if (path_obj->type != -RAY_STR) return ray_error("type", NULL);
+    const char* path = ray_str_ptr(path_obj);
+    if (!path) return ray_error("domain", NULL);
+    size_t path_len = ray_str_len(path_obj);
+
+#if defined(RAY_OS_WINDOWS)
+    /* Windows: fall back to fread */
+    FILE* fp = fopen(path, "r");
+    if (!fp) return ray_error("io", NULL);
+    fseek(fp, 0, SEEK_END);
+    long fsize = ftell(fp);
+    fseek(fp, 0, SEEK_SET);
+    if (fsize <= 0) {
+        fclose(fp);
+        return fsize < 0 ? ray_error("io", NULL) : ray_i64(0);
+    }
+    char* text = (char*)malloc((size_t)fsize + 1);
+    if (!text) { fclose(fp); return ray_error("oom", NULL); }
+    size_t got = fread(text, 1, (size_t)fsize, fp);
+    fclose(fp);
+    text[got] = '\0';
+    return load_file_eval_text(path, path_len, text, got);
+#else
+    int fd = open(path, O_RDONLY);
+    if (fd < 0) return ray_error("io", NULL);
+    struct stat st;
+    if (fstat(fd, &st) < 0 || st.st_size < 0) { close(fd); return ray_error("io", NULL); }
+    size_t len = (size_t)st.st_size;
+    if (len == 0) { close(fd); return ray_i64(0); }
+    char* mapped = (char*)mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
+    close(fd);
+    if (mapped == MAP_FAILED) return ray_error("io", NULL);
+    /* The mapping is not guaranteed to end in NUL, so parse a terminated copy. */
+    char* text = (char*)malloc(len + 1);
+    if (!text) { munmap(mapped, len); return ray_error("oom", NULL); }
+    memcpy(text, mapped, len);
+    text[len] = '\0';
+    munmap(mapped, len);
+    return load_file_eval_text(path, path_len, text, len);
+#endif
+}
+
+/* (write path content) — open `path` in "wb" mode and write the string
+ * `content` to it.  Returns make_i64(0), or "io" if any byte was lost. */
+ray_t* ray_write_file_fn(ray_t* path_obj, ray_t* content) {
+    if (path_obj->type != -RAY_STR || content->type != -RAY_STR) return ray_error("type", NULL);
+    const char* path = ray_str_ptr(path_obj);
+    const char* data = ray_str_ptr(content);
+    size_t len = ray_str_len(content);
+    if (!path || !data) return ray_error("domain", NULL);
+    FILE* fp = fopen(path, "wb");
+    if (!fp) return ray_error("io", NULL);
+    size_t written = fwrite(data, 1, len, fp);
+    int close_rc = fclose(fp);   /* buffered bytes may only fail when the close flushes them */
+    if (written != len || close_rc != 0) return ray_error("io", NULL);
+    return make_i64(0);
+}
+
+/* ══════════════════════════════════════════
+ * Additional builtins (ported from rayforce)
+ * ══════════════════════════════════════════ */
+
+/* (enlist a b c ...) -> typed vector from atoms.
+ *
+ *   args  array of n argument objects
+ *   n     number of arguments; n == 0 yields an empty I64 vector
+ *
+ * Result shape:
+ *   - every arg is an I64 or F64 atom and both kinds occur -> F64 vector
+ *     (integers are converted to double);
+ *   - all args share one atom type with a vector form handled below ->
+ *     vector of that type, with null atoms marked null in the result
+ *     (the RAY_STR case returns before the null-marking step);
+ *   - anything else -> boxed RAY_LIST holding the args.
+ * Allocation/append errors from the underlying constructors are returned
+ * as-is. */
+ray_t* ray_enlist_fn(ray_t** args, int64_t n) {
+    if (n == 0) return ray_vec_new(RAY_I64, 0);
+    /* Determine type from first arg */
+    int8_t atype = args[0]->type;
+    bool homogeneous = true;
+    bool has_float = (atype == -RAY_F64);
+    bool has_int = (atype == -RAY_I64);
+    /* Stays true only while every argument is an I64 or F64 atom. */
+    bool all_numeric = has_float || has_int;
+    for (int64_t i = 1; i < n; i++) {
+        int8_t t = args[i]->type;
+        if (t != atype) homogeneous = false;
+        if (t == -RAY_F64) has_float = true;
+        else if (t == -RAY_I64) has_int = true;
+        else all_numeric = false;
+    }
+    /* Mixed int/float -> promote to f64.  Promotion is only valid when
+     * nothing else is mixed in: a third atom type would otherwise have
+     * its payload read through ->i64 and turned into a bogus double, so
+     * such inputs fall through to the boxed-list path instead. */
+    if (!homogeneous && all_numeric && has_float && has_int) {
+        ray_t* vec = ray_vec_new(RAY_F64, n);
+        if (RAY_IS_ERR(vec)) return vec;
+        double* d = (double*)ray_data(vec);
+        for (int64_t i = 0; i < n; i++)
+            d[i] = (args[i]->type == -RAY_F64) ? args[i]->f64 : (double)args[i]->i64;
+        vec->len = n;
+        for (int64_t i = 0; i < n; i++) {
+            if (RAY_ATOM_IS_NULL(args[i]))
+                ray_vec_set_null(vec, i, true);
+        }
+        return vec;
+    }
+    if (homogeneous && atype < 0) {
+        int8_t vtype = -atype;
+        ray_t* vec = ray_vec_new(vtype, n);
+        if (RAY_IS_ERR(vec)) return vec;
+        switch (vtype) {
+        case RAY_I64: case RAY_TIMESTAMP: {
+            int64_t* d = (int64_t*)ray_data(vec);
+            for (int64_t i = 0; i < n; i++) d[i] = args[i]->i64;
+            break;
+        }
+        case RAY_F64: {
+            double* d = (double*)ray_data(vec);
+            for (int64_t i = 0; i < n; i++) d[i] = args[i]->f64;
+            break;
+        }
+        case RAY_I32: case RAY_DATE: case RAY_TIME: {
+            int32_t* d = (int32_t*)ray_data(vec);
+            for (int64_t i = 0; i < n; i++) d[i] = args[i]->i32;
+            break;
+        }
+        case RAY_I16: {
+            int16_t* d = (int16_t*)ray_data(vec);
+            for (int64_t i = 0; i < n; i++) d[i] = args[i]->i16;
+            break;
+        }
+        case RAY_BOOL: {
+            bool* d = (bool*)ray_data(vec);
+            for (int64_t i = 0; i < n; i++) d[i] = args[i]->b8;
+            break;
+        }
+        case RAY_SYM: {
+            int64_t* d = (int64_t*)ray_data(vec);
+            for (int64_t i = 0; i < n; i++) d[i] = args[i]->i64;
+            break;
+        }
+        case RAY_U8: {
+            uint8_t* d = (uint8_t*)ray_data(vec);
+            for (int64_t i = 0; i < n; i++) d[i] = args[i]->u8;
+            break;
+        }
+        case RAY_STR: {
+            /* Strings are appended one by one into their own vector, so
+             * the pre-sized block is not needed; drop it first so that
+             * no exit from this case can leak it. */
+            ray_free(vec);
+            ray_t* svec = ray_vec_new(RAY_STR, n);
+            if (RAY_IS_ERR(svec)) return svec;
+            for (int64_t i = 0; i < n; i++) {
+                svec = ray_str_vec_append(svec, ray_str_ptr(args[i]), ray_str_len(args[i]));
+                if (RAY_IS_ERR(svec)) return svec;
+            }
+            return svec;
+        }
+        case RAY_GUID: {
+            uint8_t* d = (uint8_t*)ray_data(vec);
+            for (int64_t i = 0; i < n; i++) {
+                /* A guid atom keeps its 16 bytes either in a child object
+                 * (->obj) or in its own data area. */
+                const uint8_t* gd = args[i]->obj ? (const uint8_t*)ray_data(args[i]->obj) : (const uint8_t*)ray_data(args[i]);
+                memcpy(d + i * 16, gd, 16);
+            }
+            break;
+        }
+        default:
+            /* No typed-vector form handled here: release the unused
+             * block before falling back to a boxed list. */
+            ray_free(vec);
+            goto as_list;
+        }
+        vec->len = n;
+        for (int64_t i = 0; i < n; i++) {
+            if (RAY_ATOM_IS_NULL(args[i]))
+                ray_vec_set_null(vec, i, true);
+        }
+        return vec;
+    }
+as_list:;
+    /* Heterogeneous -> list */
+    ray_t* lst = ray_list_new((int32_t)n);
+    if (RAY_IS_ERR(lst)) return lst;
+    for (int64_t i = 0; i < n; i++) {
+        /* Hold a temporary reference across the append; the caller's own
+         * reference count is unchanged once the pair completes. */
+        ray_retain(args[i]);
+        lst = ray_list_append(lst, args[i]);
+        ray_release(args[i]);
+        if (RAY_IS_ERR(lst)) return lst;
+    }
+    return lst;
+}
+
+/* (dict keys vals) -> dict.  Wraps two parallel containers as a [keys,
+ * vals] block.  When vals is shorter than keys, the tail is filled with
+ * typed null I64.  Both inputs are copied (refs retained) — caller keeps
+ * ownership of the originals.
+ *
+ *   keys  must be a vector, otherwise a "type" error is returned
+ *   vals  RAY_LIST (elements used directly), typed vector (elements
+ *         boxed via collection_elem), or any other object, which is
+ *         repeated for every key
+ *
+ * Returns the new dict, or the error produced while building the value
+ * list. */
+ray_t* ray_dict_fn(ray_t* keys, ray_t* vals) {
+    if (!ray_is_vec(keys)) return ray_error("type", NULL);
+    int64_t n = keys->len;
+
+    /* Hold a fresh ref to keys so ownership is transferred into the dict. */
+    ray_retain(keys);
+
+    /* Materialize vals as RAY_LIST of length n. */
+    ray_t* vlist = ray_list_new(n);
+    if (RAY_IS_ERR(vlist)) { ray_release(keys); return vlist; }
+    for (int64_t i = 0; i < n; i++) {
+        ray_t* v;
+        int alloc = 0;   /* set by collection_elem when v must be released here */
+        if (vals->type == RAY_LIST) {
+            v = (i < vals->len) ? ((ray_t**)ray_data(vals))[i] : NULL;
+        } else if (ray_is_vec(vals)) {
+            /* Same bound as the LIST branch: never index past the end of
+             * a short vals vector; the missing tail is null-filled below. */
+            v = (i < vals->len) ? collection_elem(vals, i, &alloc) : NULL;
+        } else {
+            v = vals;
+        }
+        if (v && !RAY_IS_ERR(v)) {
+            vlist = ray_list_append(vlist, v);
+            if (alloc) ray_release(v);
+        } else {
+            /* Missing or failed element -> typed null I64 placeholder. */
+            ray_t* null_val = ray_typed_null(-RAY_I64);
+            vlist = ray_list_append(vlist, null_val);
+            ray_release(null_val);
+        }
+        if (RAY_IS_ERR(vlist)) { ray_release(keys); return vlist; }
+    }
+    return ray_dict_new(keys, vlist);
+}
+
+/* (nil? x) -> boolean atom.
+ * True when x is a NULL pointer, satisfies RAY_IS_NULL, or is an atom
+ * flagged null by RAY_ATOM_IS_NULL; false for everything else. */
+ray_t* ray_nil_fn(ray_t* x) {
+    /* Short-circuit order matters: x is only inspected once it is known
+     * to be a non-NULL pointer. */
+    bool is_nil = !x
+               || RAY_IS_NULL(x)
+               || (ray_is_atom(x) && RAY_ATOM_IS_NULL(x));
+    return ray_bool(is_nil);
+}
+
+/* (where bool-vec) -> I64 vector holding the positions of the true
+ * elements, in ascending order.  Any input that is not a RAY_BOOL vector
+ * yields a "type" error. */
+ray_t* ray_where_fn(ray_t* x) {
+    if (!(ray_is_vec(x) && x->type == RAY_BOOL))
+        return ray_error("type", NULL);
+
+    bool* flags = (bool*)ray_data(x);
+    const int64_t total = x->len;
+
+    /* Pass 1: count set flags so the result can be sized exactly. */
+    int64_t hits = 0;
+    for (int64_t pos = 0; pos < total; pos++) {
+        if (flags[pos]) hits += 1;
+    }
+
+    ray_t* indices = ray_vec_new(RAY_I64, hits);
+    if (RAY_IS_ERR(indices)) return indices;
+
+    /* Pass 2: write out the position of every set flag. */
+    int64_t* dst = (int64_t*)ray_data(indices);
+    int64_t filled = 0;
+    for (int64_t pos = 0; pos < total; pos++) {
+        if (!flags[pos]) continue;
+        dst[filled] = pos;
+        filled += 1;
+    }
+    indices->len = hits;
+    return indices;
+}
+
+/* (group vec) -> dict mapping each unique value to its indices.  The entry
+ * point is ray_group_fn, defined after the hash-set helpers it relies on. */
+/* ---------------------------------------------------------------------------
+ * Open-address hash set for ray_group_fn's scalar / GUID fast paths.
+ *
+ * Each slot holds either GHT_EMPTY or an already-allocated group index.
+ * Lookups compare keys by calling back into the caller with the stored
+ * group index — the caller already knows whether the key shape is a
+ * plain int64 (scalar) or 16 bytes of guid material in the source
+ * column.  Load factor is capped at 0.5; grow on overflow.
+ *
+ * The table is ref-counted via ray_alloc so the main bookkeeping code
+ * can free it in one place on every exit path.
+ * ------------------------------------------------------------------------- */
+
+/* Slot sentinel meaning "no group stored here".  It occupies the largest
+ * uint32_t value, so a stored group index must never equal it. */
+#define GHT_EMPTY 0xFFFFFFFFu
+
+/* Open-address hash set of group indices.  Collisions are resolved by
+ * stepping to the next slot and wrapping with `mask`. */
+typedef struct group_ht_t {
+    ray_t*     block;   /* backing ray_alloc block; slots points into its data */
+    uint32_t*  slots;   /* cap entries, each GHT_EMPTY or a group index */
+    uint32_t   cap;     /* power of 2 */
+    uint32_t   mask;    /* cap - 1 */
+    uint32_t   count;   /* live entries */
+} group_ht_t;
+
+/* Initialise `h` with at least `initial_cap` slots, rounded up to a power
+ * of two (minimum 16), with every slot set to GHT_EMPTY.
+ *
+ * Returns true on success.  Returns false when the capacity cannot be
+ * rounded up within uint32_t or the allocation fails; in that case `h`
+ * is left in the zeroed state, so group_ht_free() on it is harmless. */
+static bool group_ht_init(group_ht_t* h, uint32_t initial_cap) {
+    h->block = NULL;
+    h->slots = NULL;
+    h->cap = h->mask = h->count = 0;
+
+    /* Above 2^31 the doubling loop below would wrap cap to 0 and never
+     * terminate, so reject such requests up front. */
+    if (initial_cap > 0x80000000u) return false;
+
+    uint32_t cap = 16;
+    while (cap < initial_cap) cap *= 2;
+    ray_t* block = ray_alloc((size_t)cap * sizeof(uint32_t));
+    if (!block || RAY_IS_ERR(block)) return false;
+    h->block = block;
+    h->slots = (uint32_t*)ray_data(block);
+    h->cap   = cap;
+    h->mask  = cap - 1;
+    h->count = 0;
+    for (uint32_t i = 0; i < cap; i++) h->slots[i] = GHT_EMPTY;
+    return true;
+}
+
+/* Release the table's backing block (if any) and reset every field to
+ * zero/NULL. */
+static void group_ht_free(group_ht_t* h) {
+    ray_t* backing = h->block;
+    if (backing != NULL) {
+        ray_free(backing);
+    }
+    h->block = NULL;
+    h->slots = NULL;
+    h->cap   = 0;
+    h->mask  = 0;
+    h->count = 0;
+}
+
+/* Rehash callback: maps a stored group index back to the hash of the key
+ * it stands for.  Growing the table therefore never needs the raw keys —
+ * the caller supplies the gi -> key translation through `ctx`. */
+typedef uint64_t (*group_ht_gi_hash_fn)(uint32_t gi, void* ctx);
+
+/* Double the table's capacity and re-insert every stored group index.
+ * Returns false, leaving `h` untouched, if the doubled capacity wraps
+ * uint32_t or the new block cannot be allocated. */
+static bool group_ht_grow(group_ht_t* h, group_ht_gi_hash_fn hash_gi, void* ctx) {
+    const uint32_t old_cap = h->cap;
+    const uint32_t doubled = old_cap * 2;
+    if (doubled < old_cap) return false;  /* overflow */
+
+    ray_t* fresh = ray_alloc((size_t)doubled * sizeof(uint32_t));
+    if (!fresh || RAY_IS_ERR(fresh)) return false;
+
+    uint32_t* table = (uint32_t*)ray_data(fresh);
+    const uint32_t wrap = doubled - 1;
+    for (uint32_t s = 0; s < doubled; s++) {
+        table[s] = GHT_EMPTY;
+    }
+
+    /* Move each live entry to the first free slot at or after its home
+     * position in the larger table. */
+    for (uint32_t s = 0; s < old_cap; s++) {
+        const uint32_t gi = h->slots[s];
+        if (gi != GHT_EMPTY) {
+            const uint64_t hashed = hash_gi(gi, ctx);
+            uint32_t pos = (uint32_t)(hashed & wrap);
+            while (table[pos] != GHT_EMPTY) {
+                pos = (pos + 1) & wrap;
+            }
+            table[pos] = gi;
+        }
+    }
+
+    ray_free(h->block);
+    h->block = fresh;
+    h->slots = table;
+    h->cap   = doubled;
+    h->mask  = wrap;
+    return true;
+}
+
+/* 64-bit finalizer (Murmur3 fmix64): three xor-shift-by-33 steps
+ * interleaved with two multiplications by fixed odd constants. */
+static inline uint64_t mix64(uint64_t h) {
+    h = (h ^ (h >> 33)) * 0xFF51AFD7ED558CCDULL;
+    h = (h ^ (h >> 33)) * 0xC4CEB9FE1A85EC53ULL;
+    return h ^ (h >> 33);
+}
+
+/* Hash the 16 bytes at `g`: load them as two native-endian 64-bit words,
+ * fold the second into the first with a multiplicative constant, then
+ * run the result through mix64. */
+static inline uint64_t hash_guid(const uint8_t* g) {
+    uint64_t words[2];
+    /* memcpy rather than a pointer cast: g carries no alignment guarantee. */
+    memcpy(words, g, sizeof(words));
+    const uint64_t folded = words[0] ^ (words[1] * 0x9E3779B97F4A7C15ULL);
+    return mix64(folded);
+}
+
+/* Hash a signed 64-bit key: reinterpret it as unsigned and finalize it
+ * with mix64. */
+static inline uint64_t hash_i64(int64_t v) {
+    const uint64_t bits = (uint64_t)v;
+    return mix64(bits);
+}
+
+/* Rehash context for the GUID path.  `base` is the start of the 16-byte
+ * guid column; `gvals[gi]` is the row index of the first occurrence of
+ * group gi, which locates that group's key bytes inside `base`. */
+typedef struct {
+    const uint8_t* base;
+    const int64_t* gvals;
+} ght_guid_ctx_t;
+
+/* group_ht_gi_hash_fn for GUID keys: hash the guid stored at the group's
+ * first-occurrence row. */
+static uint64_t ght_guid_hash_gi(uint32_t gi, void* ctx) {
+    ght_guid_ctx_t* guid_ctx = (ght_guid_ctx_t*)ctx;
+    const int64_t first_row = guid_ctx->gvals[gi];
+    const uint8_t* key_bytes = guid_ctx->base + first_row * 16;
+    return hash_guid(key_bytes);
+}
+
+/* Rehash context for the scalar path: gvals[gi] is the int64 key of
+ * group gi. */
+typedef struct { const int64_t* gvals; } ght_i64_ctx_t;
+
+/* group_ht_gi_hash_fn for scalar keys: hash the group's stored int64. */
+static uint64_t ght_i64_hash_gi(uint32_t gi, void* ctx) {
+    ght_i64_ctx_t* scalar_ctx = (ght_i64_ctx_t*)ctx;
+    const int64_t key = scalar_ctx->gvals[gi];
+    return hash_i64(key);
+}
+
+/* Grow the per-group bookkeeping arrays used by ray_group_fn.
+ * Doubles capacity; copies existing entries; returns false on OOM.
+ * Caller is responsible for cleaning up and returning an error if this fails.
+ *
+ *   val_block / ivblock  backing blocks; replaced with the larger ones
+ *   gvals / idx_vecs     data pointers into those blocks; refreshed
+ *   cur_count            number of live entries to carry over
+ *   max_groups           current capacity; doubled on success
+ *
+ * On failure nothing is modified, so the caller's existing blocks stay
+ * valid and remain the caller's to free. */
+static bool group_grow(ray_t** val_block, ray_t** ivblock,
+                       int64_t** gvals, ray_t*** idx_vecs,
+                       int64_t cur_count, int64_t* max_groups) {
+    const int64_t old_max = *max_groups;
+    /* Validate BEFORE doubling: signed overflow is undefined behaviour,
+     * so a check made on the already-multiplied value cannot be relied
+     * upon.  A non-positive capacity cannot grow by doubling either. */
+    if (old_max <= 0 || old_max > INT64_MAX / 2) return false;
+    const int64_t new_max = old_max * 2;
+    /* Make sure the byte counts below fit in size_t as well. */
+    if ((uint64_t)new_max > SIZE_MAX / sizeof(int64_t) ||
+        (uint64_t)new_max > SIZE_MAX / sizeof(ray_t*)) return false;
+
+    ray_t* new_val = ray_alloc((size_t)new_max * sizeof(int64_t));
+    if (!new_val || RAY_IS_ERR(new_val)) return false;
+    ray_t* new_iv = ray_alloc((size_t)new_max * sizeof(ray_t*));
+    if (!new_iv || RAY_IS_ERR(new_iv)) { ray_free(new_val); return false; }
+    memcpy(ray_data(new_val), *gvals, (size_t)cur_count * sizeof(int64_t));
+    memcpy(ray_data(new_iv), *idx_vecs, (size_t)cur_count * sizeof(ray_t*));
+    ray_free(*val_block);
+    ray_free(*ivblock);
+    *val_block = new_val;
+    *ivblock = new_iv;
+    *gvals = (int64_t*)ray_data(new_val);
+    *idx_vecs = (ray_t**)ray_data(new_iv);
+    *max_groups = new_max;
+    return true;
+}
+
+/* (group x) -> dict mapping each distinct value of x to the I64 vector of
+ * row indices at which it occurs.  Groups appear in order of first
+ * occurrence.
+ *
+ * x must be a vector or a RAY_LIST, otherwise a "type" error is returned.
+ * The strategy depends on x->type:
+ *   - RAY_LIST: linear scan over known groups using atom_eq; keys are
+ *               returned as a RAY_LIST.
+ *   - RAY_GUID: open-address hash set keyed on the 16 guid bytes.
+ *   - RAY_STR:  linear scan comparing length and bytes.
+ *   - other:    scalar hash path; each element is packed into an int64.
+ *               Null rows share one dedicated group and every NaN row
+ *               forms a group of its own.
+ *
+ * Errors: "type", "limit" (LIST/STR paths with more distinct keys than
+ * min(n, 1024)), "oom", "cancel" (GUID scan interrupted) and "domain"
+ * (building the result containers failed). */
+ray_t* ray_group_fn(ray_t* x) {
+    if (!ray_is_vec(x) && x->type != RAY_LIST)
+        return ray_error("type", NULL);
+    int64_t n = x->len;
+    if (n == 0) {
+        ray_t* keys = ray_list_new(0);
+        if (RAY_IS_ERR(keys)) return keys;
+        ray_t* vals = ray_list_new(0);
+        if (RAY_IS_ERR(vals)) { ray_release(keys); return vals; }
+        return ray_dict_new(keys, vals);
+    }
+
+    /* Collect unique values; the scalar and RAY_GUID paths grow these
+     * arrays on demand via group_grow().  The RAY_LIST and RAY_STR
+     * paths below still cap at this initial size (they have their own
+     * side buffers that aren't yet wired into group_grow); starting at
+     * 1024 preserves their prior behaviour. */
+    int64_t max_groups = n < 1024 ? n : 1024;
+    /* ray_alloc results are checked for NULL as well as for an error
+     * object, matching group_ht_init() and group_grow(). */
+    ray_t* val_block = ray_alloc((size_t)(max_groups * sizeof(int64_t)));
+    if (!val_block) return ray_error("oom", NULL);
+    if (RAY_IS_ERR(val_block)) return val_block;
+    int64_t* gvals = (int64_t*)ray_data(val_block);
+
+    /* For each group, store indices in a separate i64 vector */
+    ray_t** idx_vecs = NULL;
+    ray_t* ivblock = ray_alloc((size_t)(max_groups * sizeof(ray_t*)));
+    if (!ivblock || RAY_IS_ERR(ivblock)) {
+        ray_free(val_block);
+        return ivblock ? ivblock : ray_error("oom", NULL);
+    }
+    idx_vecs = (ray_t**)ray_data(ivblock);
+    int64_t ngroups = 0;
+
+    /* For LIST type, use atom_eq-based grouping with stored keys */
+    if (x->type == RAY_LIST) {
+        ray_t** elems = (ray_t**)ray_data(x);
+        /* Store group keys as ray_t* pointers */
+        ray_t* kblock = ray_alloc((size_t)(max_groups * sizeof(ray_t*)));
+        if (!kblock || RAY_IS_ERR(kblock)) {
+            ray_free(val_block); ray_free(ivblock);
+            return kblock ? kblock : ray_error("oom", NULL);
+        }
+        ray_t** gkeys = (ray_t**)ray_data(kblock);
+
+        for (int64_t i = 0; i < n; i++) {
+            ray_t* elem = elems[i];
+            int64_t gi = -1;
+            for (int64_t g = 0; g < ngroups; g++) {
+                if (atom_eq(gkeys[g], elem)) { gi = g; break; }
+            }
+            if (gi < 0) {
+                if (ngroups >= max_groups) {
+                    for (int64_t g = 0; g < ngroups; g++) ray_release(idx_vecs[g]);
+                    ray_free(val_block); ray_free(ivblock); ray_free(kblock);
+                    return ray_error("limit", NULL);
+                }
+                gi = ngroups++;
+                gkeys[gi] = elem;   /* borrowed pointer into x; not retained here */
+                idx_vecs[gi] = ray_vec_new(RAY_I64, 0);
+            }
+            idx_vecs[gi] = ray_vec_append(idx_vecs[gi], &i);
+        }
+        /* Build dict: keys as RAY_LIST (heterogeneous atoms), vals as
+         * RAY_LIST of I64 idx vectors. */
+        ray_t* keys_lst = ray_list_new(ngroups);
+        if (RAY_IS_ERR(keys_lst)) { ray_free(kblock); goto gfail; }
+        ray_t* vals_lst = ray_list_new(ngroups);
+        if (RAY_IS_ERR(vals_lst)) { ray_release(keys_lst); ray_free(kblock); goto gfail; }
+        for (int64_t g = 0; g < ngroups; g++) {
+            keys_lst = ray_list_append(keys_lst, gkeys[g]);
+            if (RAY_IS_ERR(keys_lst)) { ray_release(vals_lst); ray_free(kblock); goto gfail; }
+            vals_lst = ray_list_append(vals_lst, idx_vecs[g]);
+            /* Drop the local reference and clear the slot so gfail does
+             * not release it a second time. */
+            ray_release(idx_vecs[g]);
+            idx_vecs[g] = NULL;
+            if (RAY_IS_ERR(vals_lst)) { ray_release(keys_lst); ray_free(kblock); goto gfail; }
+        }
+        ray_free(val_block); ray_free(ivblock); ray_free(kblock);
+        return ray_dict_new(keys_lst, vals_lst);
+    }
+
+    /* RAY_GUID: 16-byte fixed-width grouping via open-address hash set
+     * keyed on the guid bytes.  Previously this was an O(N²) linear
+     * scan against every existing group, which made (group guid_col)
+     * and (select ... by: OrderId) on a 10M row table effectively
+     * infinite. */
+    if (x->type == RAY_GUID) {
+        const uint8_t* base = (const uint8_t*)ray_data(x);
+        group_ht_t ht;
+        uint32_t seed_cap = (uint32_t)(n < 64 ? 64 : (n < 1048576 ? (n * 2) : 2097152));
+        if (!group_ht_init(&ht, seed_cap)) {
+            ray_free(val_block); ray_free(ivblock);
+            return ray_error("oom", NULL);
+        }
+        ght_guid_ctx_t gctx = { .base = base, .gvals = gvals };
+        ray_progress_update("group", "guid-scan", 0, (uint64_t)n);
+        for (int64_t i = 0; i < n; i++) {
+            /* Every 65536 rows: honour interrupts and report progress. */
+            if (((i) & 65535) == 0) {
+                if (ray_interrupted()) {
+                    for (int64_t g = 0; g < ngroups; g++) ray_release(idx_vecs[g]);
+                    group_ht_free(&ht);
+                    ray_free(val_block); ray_free(ivblock);
+                    return ray_error("cancel", "interrupted");
+                }
+                ray_progress_update(NULL, NULL, (uint64_t)i, (uint64_t)n);
+            }
+            const uint8_t* cur = base + i * 16;
+            uint64_t h = hash_guid(cur);
+            uint32_t slot = (uint32_t)(h & ht.mask);
+            uint32_t gi_found = GHT_EMPTY;
+            while (ht.slots[slot] != GHT_EMPTY) {
+                uint32_t gi = ht.slots[slot];
+                if (memcmp(base + gvals[gi] * 16, cur, 16) == 0) {
+                    gi_found = gi;
+                    break;
+                }
+                slot = (slot + 1) & ht.mask;
+            }
+            int64_t gi;
+            if (gi_found != GHT_EMPTY) {
+                gi = gi_found;
+            } else {
+                if (ngroups >= max_groups) {
+                    if (!group_grow(&val_block, &ivblock, &gvals, &idx_vecs,
+                                    ngroups, &max_groups)) {
+                        for (int64_t g = 0; g < ngroups; g++) ray_release(idx_vecs[g]);
+                        group_ht_free(&ht);
+                        ray_free(val_block); ray_free(ivblock);
+                        return ray_error("oom", NULL);
+                    }
+                    /* gvals moved; keep the rehash context in sync. */
+                    gctx.gvals = gvals;
+                }
+                gi = ngroups++;
+                gvals[gi] = i;  /* store row index of first occurrence */
+                idx_vecs[gi] = ray_vec_new(RAY_I64, 0);
+                ht.slots[slot] = (uint32_t)gi;
+                ht.count++;
+                /* Grow at load factor 0.5 */
+                if (ht.count * 2 > ht.cap) {
+                    if (!group_ht_grow(&ht, ght_guid_hash_gi, &gctx)) {
+                        for (int64_t g = 0; g < ngroups; g++) ray_release(idx_vecs[g]);
+                        group_ht_free(&ht);
+                        ray_free(val_block); ray_free(ivblock);
+                        return ray_error("oom", NULL);
+                    }
+                }
+            }
+            idx_vecs[gi] = ray_vec_append(idx_vecs[gi], &i);
+        }
+        group_ht_free(&ht);
+        /* Keys: dense GUID vector built from collected gvals; vals: LIST of idx vecs. */
+        ray_t* keys_vec = ray_vec_new(RAY_GUID, ngroups);
+        if (RAY_IS_ERR(keys_vec)) goto gfail;
+        for (int64_t g = 0; g < ngroups; g++)
+            keys_vec = ray_vec_append(keys_vec, base + gvals[g] * 16);
+        if (RAY_IS_ERR(keys_vec)) goto gfail;
+        ray_t* vals_lst = ray_list_new(ngroups);
+        if (RAY_IS_ERR(vals_lst)) { ray_release(keys_vec); goto gfail; }
+        for (int64_t g = 0; g < ngroups; g++) {
+            vals_lst = ray_list_append(vals_lst, idx_vecs[g]);
+            ray_release(idx_vecs[g]);
+            idx_vecs[g] = NULL;
+            if (RAY_IS_ERR(vals_lst)) { ray_release(keys_vec); goto gfail; }
+        }
+        ray_free(val_block); ray_free(ivblock);
+        return ray_dict_new(keys_vec, vals_lst);
+    }
+
+    /* RAY_STR: string-based grouping using ray_str_vec_get */
+    if (x->type == RAY_STR) {
+        /* Store group keys as (ptr, len) pairs -- use a scratch block for strings */
+        ray_t* skblock = ray_alloc((size_t)(max_groups * sizeof(ray_t*)));
+        if (!skblock || RAY_IS_ERR(skblock)) {
+            ray_free(val_block); ray_free(ivblock);
+            return skblock ? skblock : ray_error("oom", NULL);
+        }
+        ray_t** str_keys = (ray_t**)ray_data(skblock);
+
+        for (int64_t i = 0; i < n; i++) {
+            size_t slen = 0;
+            const char* sp = ray_str_vec_get(x, i, &slen);
+
+            int64_t gi = -1;
+            for (int64_t g = 0; g < ngroups; g++) {
+                size_t gsl = ray_str_len(str_keys[g]);
+                const char* gsp = ray_str_ptr(str_keys[g]);
+                if (gsl == slen && (slen == 0 || memcmp(gsp, sp, slen) == 0)) {
+                    gi = g; break;
+                }
+            }
+            if (gi < 0) {
+                if (ngroups >= max_groups) {
+                    for (int64_t g = 0; g < ngroups; g++) {
+                        ray_release(str_keys[g]);
+                        ray_release(idx_vecs[g]);
+                    }
+                    ray_free(val_block); ray_free(ivblock); ray_free(skblock);
+                    return ray_error("limit", NULL);
+                }
+                gi = ngroups++;
+                str_keys[gi] = ray_str(sp ? sp : "", slen);
+                idx_vecs[gi] = ray_vec_new(RAY_I64, 0);
+            }
+            idx_vecs[gi] = ray_vec_append(idx_vecs[gi], &i);
+        }
+
+        /* Build dict: keys as RAY_STR vec from str_keys, vals as LIST of idx vecs. */
+        ray_t* keys_vec = ray_vec_new(RAY_STR, ngroups);
+        if (RAY_IS_ERR(keys_vec)) {
+            for (int64_t g = 0; g < ngroups; g++) {
+                ray_release(str_keys[g]);
+                ray_release(idx_vecs[g]);
+            }
+            ray_free(val_block); ray_free(ivblock); ray_free(skblock);
+            return ray_error("domain", NULL);
+        }
+        for (int64_t g = 0; g < ngroups; g++) {
+            keys_vec = ray_str_vec_append(keys_vec, ray_str_ptr(str_keys[g]), ray_str_len(str_keys[g]));
+            ray_release(str_keys[g]);
+        }
+        ray_t* vals_lst = ray_list_new(ngroups);
+        if (RAY_IS_ERR(vals_lst)) {
+            ray_release(keys_vec); ray_free(skblock); goto gfail;
+        }
+        for (int64_t g = 0; g < ngroups; g++) {
+            vals_lst = ray_list_append(vals_lst, idx_vecs[g]);
+            ray_release(idx_vecs[g]);
+            idx_vecs[g] = NULL;
+            if (RAY_IS_ERR(vals_lst)) { ray_release(keys_vec); ray_free(skblock); goto gfail; }
+        }
+        ray_free(val_block); ray_free(ivblock); ray_free(skblock);
+        return ray_dict_new(keys_vec, vals_lst);
+    }
+
+    /* Scalar fast path: every primitive-typed vector packs its group
+     * key into an int64 (sym id, raw integer, date/time/timestamp, bool).
+     * Use an open-address hash set so high-cardinality group-by stays
+     * linear in n rather than the historical O(N²) per-row linear scan. */
+    group_ht_t ht;
+    uint32_t seed_cap = (uint32_t)(n < 64 ? 64 : (n < 1048576 ? (n * 2) : 2097152));
+    if (!group_ht_init(&ht, seed_cap)) {
+        ray_free(val_block); ray_free(ivblock);
+        return ray_error("oom", NULL);
+    }
+    ght_i64_ctx_t sctx = { .gvals = gvals };
+    /* Null routing: null inputs share the same storage value as a legitimate
+     * zero/sentinel (e.g. NULL_I64's atom stores i64=0, NULL_I32 stores
+     * i32=0).  Without a separate null bucket the hash table would conflate
+     * `0Nl` with a real `0`, silently merging two semantically distinct
+     * groups.  Track a single `null_gi` and route every null row there;
+     * non-null rows continue to use the value-keyed hash table. */
+    int64_t null_gi = -1;
+    for (int64_t i = 0; i < n; i++) {
+        if (ray_vec_is_null(x, i)) {
+            if (null_gi < 0) {
+                if (ngroups >= max_groups) {
+                    if (!group_grow(&val_block, &ivblock, &gvals, &idx_vecs,
+                                    ngroups, &max_groups)) {
+                        for (int64_t g = 0; g < ngroups; g++) ray_release(idx_vecs[g]);
+                        group_ht_free(&ht);
+                        ray_free(val_block); ray_free(ivblock);
+                        return ray_error("oom", NULL);
+                    }
+                    sctx.gvals = gvals;
+                }
+                null_gi = ngroups++;
+                gvals[null_gi] = 0;          /* placeholder; key value set later */
+                idx_vecs[null_gi] = ray_vec_new(RAY_I64, 0);
+            }
+            idx_vecs[null_gi] = ray_vec_append(idx_vecs[null_gi], &i);
+            continue;
+        }
+        int64_t v;
+        if (x->type == RAY_SYM || x->type == RAY_I64 || x->type == RAY_TIMESTAMP)
+            v = ((int64_t*)ray_data(x))[i];
+        else if (x->type == RAY_I32 || x->type == RAY_DATE || x->type == RAY_TIME)
+            v = ((int32_t*)ray_data(x))[i];
+        else if (x->type == RAY_I16)
+            v = ((int16_t*)ray_data(x))[i];
+        else if (x->type == RAY_BOOL || x->type == RAY_U8)
+            v = ((uint8_t*)ray_data(x))[i];
+        else if (x->type == RAY_F64 || x->type == RAY_F32) {
+            /* Hash by IEEE-754 bit pattern, not row index — the previous
+             * `v = i` fallback put every float row in its own group and
+             * the keys_vec build path then reinterpreted those row
+             * indices as floats.  Two adjustments keep the bit-pattern
+             * approach consistent with atom_eq's IEEE semantics
+             * (`a->f64 == b->f64`):
+             *   - +0.0 and -0.0 hash equal: canonicalise -0.0 to 0.0.
+             *   - Each NaN is its own group (NaN != NaN under IEEE).
+             *     Route NaN rows through the dedicated nan-group path
+             *     below so the hash table never matches them. */
+            double f = (x->type == RAY_F64)
+                ? ((double*)ray_data(x))[i]
+                : (double)((float*)ray_data(x))[i];
+            if (f != f) {
+                /* NaN — own bucket per row, just like the null routing. */
+                if (ngroups >= max_groups) {
+                    if (!group_grow(&val_block, &ivblock, &gvals, &idx_vecs,
+                                    ngroups, &max_groups)) {
+                        for (int64_t g = 0; g < ngroups; g++) ray_release(idx_vecs[g]);
+                        group_ht_free(&ht);
+                        ray_free(val_block); ray_free(ivblock);
+                        return ray_error("oom", NULL);
+                    }
+                    sctx.gvals = gvals;
+                }
+                int64_t gi_nan = ngroups++;
+                memcpy(&gvals[gi_nan], &f, sizeof(f));
+                idx_vecs[gi_nan] = ray_vec_new(RAY_I64, 0);
+                idx_vecs[gi_nan] = ray_vec_append(idx_vecs[gi_nan], &i);
+                continue;
+            }
+            if (f == 0.0) f = 0.0;   /* canonicalise -0.0 → +0.0 */
+            memcpy(&v, &f, sizeof(v));
+        } else
+            v = i;
+
+        uint64_t h = hash_i64(v);
+        uint32_t slot = (uint32_t)(h & ht.mask);
+        uint32_t gi_found = GHT_EMPTY;
+        while (ht.slots[slot] != GHT_EMPTY) {
+            uint32_t gi = ht.slots[slot];
+            if (gvals[gi] == v) { gi_found = gi; break; }
+            slot = (slot + 1) & ht.mask;
+        }
+        int64_t gi;
+        if (gi_found != GHT_EMPTY) {
+            gi = gi_found;
+        } else {
+            if (ngroups >= max_groups) {
+                if (!group_grow(&val_block, &ivblock, &gvals, &idx_vecs,
+                                ngroups, &max_groups)) {
+                    for (int64_t g = 0; g < ngroups; g++) ray_release(idx_vecs[g]);
+                    group_ht_free(&ht);
+                    ray_free(val_block); ray_free(ivblock);
+                    return ray_error("oom", NULL);
+                }
+                sctx.gvals = gvals;
+            }
+            gi = ngroups++;
+            gvals[gi] = v;
+            idx_vecs[gi] = ray_vec_new(RAY_I64, 0);
+            ht.slots[slot] = (uint32_t)gi;
+            ht.count++;
+            if (ht.count * 2 > ht.cap) {
+                if (!group_ht_grow(&ht, ght_i64_hash_gi, &sctx)) {
+                    for (int64_t g = 0; g < ngroups; g++) ray_release(idx_vecs[g]);
+                    group_ht_free(&ht);
+                    ray_free(val_block); ray_free(ivblock);
+                    return ray_error("oom", NULL);
+                }
+            }
+        }
+        idx_vecs[gi] = ray_vec_append(idx_vecs[gi], &i);
+    }
+    group_ht_free(&ht);
+
+    /* Build dict: keys vec mirrors x's element type; vals LIST of idx vecs. */
+    int8_t key_type = x->type;
+    ray_t* keys_vec;
+    if (key_type == RAY_SYM) keys_vec = ray_sym_vec_new(RAY_SYM_W64, ngroups);
+    else                     keys_vec = ray_vec_new(key_type, ngroups);
+    if (RAY_IS_ERR(keys_vec)) goto gfail;
+
+    for (int64_t g = 0; g < ngroups; g++) {
+        switch (key_type) {
+            case RAY_SYM:
+            case RAY_I64:
+            case RAY_TIMESTAMP: {
+                int64_t v = gvals[g];
+                keys_vec = ray_vec_append(keys_vec, &v); break;
+            }
+            case RAY_I32:
+            case RAY_DATE:
+            case RAY_TIME: {
+                int32_t v = (int32_t)gvals[g];
+                keys_vec = ray_vec_append(keys_vec, &v); break;
+            }
+            case RAY_I16: { int16_t v = (int16_t)gvals[g]; keys_vec = ray_vec_append(keys_vec, &v); break; }
+            case RAY_BOOL:
+            case RAY_U8:  { uint8_t v = (uint8_t)gvals[g]; keys_vec = ray_vec_append(keys_vec, &v); break; }
+            case RAY_F64: {
+                /* gvals[g] holds the IEEE-754 bit pattern packed by the
+                 * row-loop above; reinterpret rather than int->double
+                 * cast (which would produce 0.0/1.0/2.0… instead of the
+                 * actual float values). */
+                double v;
+                memcpy(&v, &gvals[g], sizeof(v));
+                keys_vec = ray_vec_append(keys_vec, &v);
+                break;
+            }
+            case RAY_F32: {
+                /* Stored widened to double by the row loop; narrow back. */
+                double f;
+                memcpy(&f, &gvals[g], sizeof(f));
+                float  v = (float)f;
+                keys_vec = ray_vec_append(keys_vec, &v);
+                break;
+            }
+            default:      keys_vec = ray_vec_append(keys_vec, &gvals[g]); break;
+        }
+        if (RAY_IS_ERR(keys_vec)) goto gfail;
+        /* If the source column had a null at any row in this group, mark
+         * the group's key as null so dict rendering / lookup can recover
+         * the null semantics (the integer-value key alone collides with a
+         * legitimate zero/sentinel value).  All rows in a value-equality
+         * group share the same null-or-not status, so a single probe of
+         * the first row index suffices. */
+        if (idx_vecs[g] && idx_vecs[g]->len > 0) {
+            int64_t first_row = ((int64_t*)ray_data(idx_vecs[g]))[0];
+            if (ray_vec_is_null(x, first_row))
+                ray_vec_set_null(keys_vec, g, true);
+        }
+    }
+
+    ray_t* vals_lst = ray_list_new(ngroups);
+    if (RAY_IS_ERR(vals_lst)) { ray_release(keys_vec); goto gfail; }
+    for (int64_t g = 0; g < ngroups; g++) {
+        vals_lst = ray_list_append(vals_lst, idx_vecs[g]);
+        ray_release(idx_vecs[g]);
+        idx_vecs[g] = NULL;
+        if (RAY_IS_ERR(vals_lst)) { ray_release(keys_vec); goto gfail; }
+    }
+    ray_free(val_block);
+    ray_free(ivblock);
+    return ray_dict_new(keys_vec, vals_lst);
+
+gfail:
+    /* Shared failure exit: release the index vectors that were not yet
+     * handed over (handed-over slots were set to NULL), then the
+     * bookkeeping blocks. */
+    for (int64_t g = 0; g < ngroups; g++)
+        if (idx_vecs[g]) ray_release(idx_vecs[g]);
+    ray_free(val_block);
+    ray_free(ivblock);
+    return ray_error("domain", NULL);
+}
+
+/* (concat a b) -> concatenate vectors/strings/dicts/tables
+ *
+ * The argument shapes are tested in a fixed order and the first match wins:
+ *   1. STR atom + STR atom            -> STR atom (trailing NUL bytes dropped)
+ *   2. vec + vec, same type           -> ray_vec_concat(a, b)
+ *   3. vec + LIST / LIST + vec        -> boxed LIST
+ *   4. LIST + LIST                    -> boxed LIST of the retained elements
+ *   5. vec + vec, different types     -> boxed LIST of boxed elements
+ *   6. atom + vec / vec + atom        -> typed vec (atom type must match)
+ *   7. atom + atom, same non-STR type -> 2-element typed vec
+ *   8. DICT + DICT                    -> a with b's entries upserted
+ *   9. TABLE + TABLE                  -> per-column concat, matched by name
+ *  10. atom + LIST / LIST + atom      -> boxed LIST
+ *  11. atom + atom, different types   -> 2-element boxed LIST
+ * Any other combination returns a "type" error.
+ *
+ * Neither argument is released here; elements stored into a result list are
+ * either retained or freshly boxed by collection_elem. */
+ray_t* ray_concat_fn(ray_t* a, ray_t* b) {
+    /* STR atom + STR atom: join the two payloads, ignoring trailing NUL bytes. */
+    {
+        int a_is_str = ray_is_atom(a) && ((-a->type) == RAY_STR);
+        int b_is_str = ray_is_atom(b) && ((-b->type) == RAY_STR);
+        if (a_is_str && b_is_str) {
+            const char *ap, *bp;
+            size_t la, lb;
+            ap = ray_str_ptr(a); la = ray_str_len(a);
+            bp = ray_str_ptr(b); lb = ray_str_len(b);
+            /* Strip trailing null bytes */
+            while (la > 0 && ap[la - 1] == '\0') la--;
+            while (lb > 0 && bp[lb - 1] == '\0') lb--;
+            /* The joined text is staged on the stack; a result longer than
+             * the staging buffer is rejected with a "limit" error. */
+            char buf[8192];
+            if (la + lb > sizeof(buf)) return ray_error("limit", NULL);
+            memcpy(buf, ap, la);
+            memcpy(buf + la, bp, lb);
+            return ray_str(buf, la + lb);
+        }
+    }
+    /* Vector concat: same type — delegate to ray_vec_concat which handles
+     * null bitmap propagation, SYM width promotion, and STR pool merging. */
+    if (ray_is_vec(a) && ray_is_vec(b) && a->type == b->type)
+        return ray_vec_concat(a, b);
+    /* Concat typed vec + boxed list or boxed list + typed vec -> boxed list */
+    if ((ray_is_vec(a) && b->type == RAY_LIST) || (a->type == RAY_LIST && ray_is_vec(b))) {
+        ray_t* la = (a->type == RAY_LIST) ? a : NULL;
+        ray_t* lb = (b->type == RAY_LIST) ? b : NULL;
+        ray_t* va = ray_is_vec(a) ? a : NULL;
+        ray_t* vb = ray_is_vec(b) ? b : NULL;
+        int64_t na = a->len, nb = b->len;
+        ray_t* result = ray_alloc((na + nb) * sizeof(ray_t*));
+        if (!result) return ray_error("oom", NULL);
+        result->type = RAY_LIST;
+        result->len = na + nb;
+        ray_t** out = (ray_t**)ray_data(result);
+        /* Typed-vector elements are boxed (the list takes the new object);
+         * list elements are shared and therefore retained. */
+        for (int64_t i = 0; i < na; i++) {
+            if (va) {
+                int alloc = 0;
+                out[i] = collection_elem(va, i, &alloc);
+            } else {
+                out[i] = ((ray_t**)ray_data(la))[i];
+                ray_retain(out[i]);
+            }
+        }
+        for (int64_t i = 0; i < nb; i++) {
+            if (vb) {
+                int alloc = 0;
+                out[na + i] = collection_elem(vb, i, &alloc);
+            } else {
+                out[na + i] = ((ray_t**)ray_data(lb))[i];
+                ray_retain(out[na + i]);
+            }
+        }
+        return result;
+    }
+    /* Boxed list concat */
+    if (a->type == RAY_LIST && b->type == RAY_LIST) {
+        int64_t na = a->len, nb = b->len;
+        ray_t* result = ray_alloc((na + nb) * sizeof(ray_t*));
+        if (!result) return ray_error("oom", NULL);
+        result->type = RAY_LIST;
+        result->len = na + nb;
+        ray_t** out = (ray_t**)ray_data(result);
+        ray_t** ae = (ray_t**)ray_data(a);
+        ray_t** be = (ray_t**)ray_data(b);
+        for (int64_t i = 0; i < na; i++) { ray_retain(ae[i]); out[i] = ae[i]; }
+        for (int64_t i = 0; i < nb; i++) { ray_retain(be[i]); out[na + i] = be[i]; }
+        return result;
+    }
+    /* Vector concat: mixed types -> boxed list (preserves original element types) */
+    if (ray_is_vec(a) && ray_is_vec(b) && a->type != b->type) {
+        int64_t na = a->len, nb = b->len;
+        ray_t* result = ray_alloc((na + nb) * sizeof(ray_t*));
+        if (!result) return ray_error("oom", NULL);
+        result->type = RAY_LIST;
+        result->len = na + nb;
+        ray_t** out = (ray_t**)ray_data(result);
+        for (int64_t i = 0; i < na; i++) {
+            int alloc = 0;
+            out[i] = collection_elem(a, i, &alloc);
+            /* collection_elem always allocates for typed vecs, so ownership transfers */
+        }
+        for (int64_t i = 0; i < nb; i++) {
+            int alloc = 0;
+            out[na + i] = collection_elem(b, i, &alloc);
+        }
+        return result;
+    }
+    /* Atom + vector or vector + atom -> append */
+    if (ray_is_atom(a) && ray_is_vec(b) && (-a->type) == b->type) {
+        int64_t nb = b->len;
+        int esz = ray_elem_size(b->type);
+        ray_t* result = ray_vec_new(b->type, 1 + nb);
+        if (RAY_IS_ERR(result)) return result;
+        /* Copy atom value as first element */
+        switch (b->type) {
+        case RAY_I64: case RAY_TIMESTAMP: case RAY_SYM:
+            ((int64_t*)ray_data(result))[0] = a->i64; break;
+        case RAY_F64:
+            ((double*)ray_data(result))[0] = a->f64; break;
+        case RAY_I32: case RAY_DATE: case RAY_TIME:
+            ((int32_t*)ray_data(result))[0] = a->i32; break;
+        case RAY_I16:
+            ((int16_t*)ray_data(result))[0] = a->i16; break;
+        case RAY_BOOL:
+            ((bool*)ray_data(result))[0] = a->b8; break;
+        case RAY_U8:
+            ((uint8_t*)ray_data(result))[0] = a->u8; break;
+        case RAY_GUID: {
+            /* GUID bytes live in a->obj when present, else in the atom's own data. */
+            const uint8_t* gd = a->obj ? (const uint8_t*)ray_data(a->obj) : (const uint8_t*)ray_data((ray_t*)a);
+            memcpy(ray_data(result), gd, 16); break;
+        }
+        default: ray_free(result); return ray_error("type", NULL);
+        }
+        /* NOTE(review): only the raw element bytes of b are copied after the
+         * atom; null markers / per-vector attrs are not visibly carried
+         * over — confirm this is intended. */
+        memcpy((char*)ray_data(result) + esz, ray_data(b), (size_t)(nb * esz));
+        result->len = 1 + nb;
+        return result;
+    }
+    if (ray_is_vec(a) && ray_is_atom(b) && a->type == (-b->type)) {
+        int64_t na = a->len;
+        int esz = ray_elem_size(a->type);
+        ray_t* result = ray_vec_new(a->type, na + 1);
+        if (RAY_IS_ERR(result)) return result;
+        memcpy(ray_data(result), ray_data(a), (size_t)(na * esz));
+        /* Store the atom value in the last slot. */
+        switch (a->type) {
+        case RAY_I64: case RAY_TIMESTAMP: case RAY_SYM:
+            ((int64_t*)ray_data(result))[na] = b->i64; break;
+        case RAY_F64:
+            ((double*)ray_data(result))[na] = b->f64; break;
+        case RAY_I32: case RAY_DATE: case RAY_TIME:
+            ((int32_t*)ray_data(result))[na] = b->i32; break;
+        case RAY_I16:
+            ((int16_t*)ray_data(result))[na] = b->i16; break;
+        case RAY_BOOL:
+            ((bool*)ray_data(result))[na] = b->b8; break;
+        case RAY_U8:
+            ((uint8_t*)ray_data(result))[na] = b->u8; break;
+        case RAY_GUID: {
+            const uint8_t* gd = b->obj ? (const uint8_t*)ray_data(b->obj) : (const uint8_t*)ray_data((ray_t*)b);
+            memcpy((uint8_t*)ray_data(result) + na * 16, gd, 16); break;
+        }
+        default: ray_free(result); return ray_error("type", NULL);
+        }
+        result->len = na + 1;
+        return result;
+    }
+    /* Atom + atom of same type -> 2-element vector */
+    if (ray_is_atom(a) && ray_is_atom(b) && a->type == b->type && a->type != -RAY_STR) {
+        int8_t vtype = -(a->type);
+        ray_t* result = ray_vec_new(vtype, 2);
+        if (RAY_IS_ERR(result)) return result;
+        result->len = 2;
+        switch (vtype) {
+        case RAY_I64: case RAY_TIMESTAMP: case RAY_SYM:
+            ((int64_t*)ray_data(result))[0] = a->i64;
+            ((int64_t*)ray_data(result))[1] = b->i64;
+            break;
+        case RAY_F64:
+            ((double*)ray_data(result))[0] = a->f64;
+            ((double*)ray_data(result))[1] = b->f64;
+            break;
+        case RAY_I32: case RAY_DATE: case RAY_TIME:
+            ((int32_t*)ray_data(result))[0] = a->i32;
+            ((int32_t*)ray_data(result))[1] = b->i32;
+            break;
+        case RAY_I16:
+            ((int16_t*)ray_data(result))[0] = a->i16;
+            ((int16_t*)ray_data(result))[1] = b->i16;
+            break;
+        case RAY_BOOL:
+            ((bool*)ray_data(result))[0] = a->b8;
+            ((bool*)ray_data(result))[1] = b->b8;
+            break;
+        case RAY_U8:
+            ((uint8_t*)ray_data(result))[0] = a->u8;
+            ((uint8_t*)ray_data(result))[1] = b->u8;
+            break;
+        case RAY_GUID: {
+            const uint8_t* ga = a->obj ? (const uint8_t*)ray_data(a->obj) : (const uint8_t*)ray_data((ray_t*)a);
+            const uint8_t* gb = b->obj ? (const uint8_t*)ray_data(b->obj) : (const uint8_t*)ray_data((ray_t*)b);
+            memcpy(ray_data(result), ga, 16);
+            memcpy((uint8_t*)ray_data(result) + 16, gb, 16);
+            break;
+        }
+        default: ray_free(result); return ray_error("type", NULL);
+        }
+        return result;
+    }
+    /* Dict concat: merge — keys/vals from b overwrite a's. */
+    if (a->type == RAY_DICT && b->type == RAY_DICT) {
+        ray_retain(a);
+        ray_t* out = a;
+        ray_t* bk = ray_dict_keys(b);
+        ray_t* bv = ray_dict_vals(b);
+        if (!bk || !bv) return out;
+        int64_t bn = bk->len;
+        for (int64_t i = 0; i < bn; i++) {
+            /* Key i: borrowed from a LIST, synthesized as a stack atom for
+             * SYM / I64 / TIMESTAMP key vectors, otherwise boxed through
+             * collection_elem (k_owned then reports whether we own it). */
+            ray_t k_storage; memset(&k_storage, 0, sizeof(k_storage));
+            ray_t* k = NULL;
+            int k_owned = 0;
+            if (bk->type == RAY_LIST) {
+                k = ((ray_t**)ray_data(bk))[i];
+            } else if (bk->type == RAY_SYM) {
+                k_storage.type = -RAY_SYM;
+                k_storage.i64  = ray_read_sym(ray_data(bk), i, RAY_SYM, bk->attrs);
+                k = &k_storage;
+            } else if (bk->type == RAY_I64 || bk->type == RAY_TIMESTAMP) {
+                k_storage.type = -bk->type;
+                k_storage.i64  = ((int64_t*)ray_data(bk))[i];
+                k = &k_storage;
+            } else {
+                /* Other key vector types fall back to boxing via collection_elem. */
+                k = collection_elem(bk, i, &k_owned);
+            }
+            /* Value i: borrowed from a LIST, otherwise boxed. */
+            ray_t* v;
+            int v_owned = 0;
+            if (bv->type == RAY_LIST) v = ((ray_t**)ray_data(bv))[i];
+            else v = collection_elem(bv, i, &v_owned);
+            out = ray_dict_upsert(out, k, v);
+            /* ray_dict_upsert is also handed borrowed LIST elements above, so
+             * it is assumed to take its own references; release the boxed
+             * temporaries we own so they do not leak. */
+            if (k_owned) ray_release(k);
+            if (v_owned) ray_release(v);
+            if (!out || RAY_IS_ERR(out)) return out;
+        }
+        return out;
+    }
+    /* Table concat: append rows.  The result has a's columns in a's order;
+     * columns that exist only in b are not carried into the result. */
+    if (a->type == RAY_TABLE && b->type == RAY_TABLE) {
+        int64_t ncols_a = ray_table_ncols(a);
+        int64_t ncols_b = ray_table_ncols(b);
+        /* Match columns of a in b by name */
+        ray_t* result = ray_table_new((int32_t)ncols_a);
+        if (RAY_IS_ERR(result)) return result;
+        for (int64_t c = 0; c < ncols_a; c++) {
+            int64_t col_name_a = ray_table_col_name(a, c);
+            ray_t* acol = ray_table_get_col_idx(a, c);
+            /* Find matching column in b by name */
+            ray_t* bcol = NULL;
+            for (int64_t j = 0; j < ncols_b; j++) {
+                if (ray_table_col_name(b, j) == col_name_a) {
+                    bcol = ray_table_get_col_idx(b, j);
+                    break;
+                }
+            }
+            if (!bcol) {
+                /* Column not present in b — schema mismatch is a "value"
+                 * error (the table values have incompatible columns), not
+                 * a "domain" error (which semantically means out-of-range). */
+                ray_release(result);
+                return ray_error("value", NULL);
+            }
+            /* Type check: columns must have the same type */
+            if (acol->type != bcol->type) {
+                ray_release(result);
+                return ray_error("type", NULL);
+            }
+            ray_t* col = ray_concat_fn(acol, bcol);
+            if (RAY_IS_ERR(col)) { ray_release(result); return col; }
+            result = ray_table_add_col(result, col_name_a, col);
+            ray_release(col);
+            if (RAY_IS_ERR(result)) return result;
+        }
+        return result;
+    }
+    /* Atom + boxed list -> prepend atom to list */
+    if (ray_is_atom(a) && b->type == RAY_LIST) {
+        int64_t nb = b->len;
+        ray_t* result = ray_alloc((1 + nb) * sizeof(ray_t*));
+        if (!result) return ray_error("oom", NULL);
+        result->type = RAY_LIST;
+        result->len = 1 + nb;
+        ray_t** out = (ray_t**)ray_data(result);
+        ray_retain(a);
+        out[0] = a;
+        ray_t** be = (ray_t**)ray_data(b);
+        for (int64_t i = 0; i < nb; i++) { ray_retain(be[i]); out[1 + i] = be[i]; }
+        return result;
+    }
+    /* Boxed list + atom -> append atom to list */
+    if (a->type == RAY_LIST && ray_is_atom(b)) {
+        int64_t na = a->len;
+        ray_t* result = ray_alloc((na + 1) * sizeof(ray_t*));
+        if (!result) return ray_error("oom", NULL);
+        result->type = RAY_LIST;
+        result->len = na + 1;
+        ray_t** out = (ray_t**)ray_data(result);
+        ray_t** ae = (ray_t**)ray_data(a);
+        for (int64_t i = 0; i < na; i++) { ray_retain(ae[i]); out[i] = ae[i]; }
+        ray_retain(b);
+        out[na] = b;
+        return result;
+    }
+    /* Atom + atom of different types -> 2-element boxed list */
+    if (ray_is_atom(a) && ray_is_atom(b) && a->type != b->type) {
+        ray_t* result = ray_alloc(2 * sizeof(ray_t*));
+        if (!result) return ray_error("oom", NULL);
+        result->type = RAY_LIST;
+        result->len = 2;
+        ray_t** out = (ray_t**)ray_data(result);
+        ray_retain(a); out[0] = a;
+        ray_retain(b); out[1] = b;
+        return result;
+    }
+    return ray_error("type", NULL);
+}
+
+/* (raze list-of-vecs) -> flattened vector
+ *
+ * Atoms and typed vectors are returned as-is (with an extra reference).
+ * A boxed LIST is folded left-to-right with ray_concat_fn, starting from its
+ * first element; an empty LIST yields a new empty list.  Any other input
+ * type, is a "type" error, and the first error produced by ray_concat_fn is
+ * returned unchanged. */
+ray_t* ray_raze_fn(ray_t* x) {
+    /* Already flat: hand back x itself. */
+    if (ray_is_atom(x) || ray_is_vec(x)) {
+        ray_retain(x);
+        return x;
+    }
+    if (x->type != RAY_LIST) return ray_error("type", NULL);
+
+    const int64_t count = x->len;
+    if (count == 0) return ray_list_new(0);
+
+    ray_t** elems = (ray_t**)ray_data(x);
+    /* The accumulator always holds exactly one reference owned by us. */
+    ray_t* acc = elems[0];
+    ray_retain(acc);
+    int64_t idx = 1;
+    while (idx < count) {
+        ray_t* joined = ray_concat_fn(acc, elems[idx]);
+        ray_release(acc);
+        if (RAY_IS_ERR(joined)) return joined;
+        acc = joined;
+        idx++;
+    }
+    return acc;
+}
+
+/* (within vals [lo hi]) -> bool vector, true where lo <= val <= hi
+ *
+ * vals  : I64, F64, I32, DATE or TIME vector.
+ * range : 2-element vector holding lo and hi.  Its buffer is read with the
+ *         element layout of vals, so its type must share that layout:
+ *         I64/TIMESTAMP for I64 vals, F64 for F64 vals, and I32/DATE/TIME
+ *         for I32/DATE/TIME vals.
+ * Returns a BOOL vector of vals->len elements, a "type" error for
+ * unsupported or mismatched inputs, or the error from ray_vec_new. */
+ray_t* ray_within_fn(ray_t* vals, ray_t* range) {
+    if (!ray_is_vec(vals) || !ray_is_vec(range) || range->len != 2)
+        return ray_error("type", NULL);
+
+    /* Reject a range whose storage does not match vals before touching its
+     * buffer: reading e.g. two int64 bounds out of a 2-element I32 vector
+     * would run past the end of that buffer, and reinterpreting doubles as
+     * integers would yield meaningless bounds. */
+    int range_ok;
+    switch (vals->type) {
+    case RAY_I64:
+        range_ok = (range->type == RAY_I64 || range->type == RAY_TIMESTAMP);
+        break;
+    case RAY_F64:
+        range_ok = (range->type == RAY_F64);
+        break;
+    case RAY_I32: case RAY_DATE: case RAY_TIME:
+        range_ok = (range->type == RAY_I32 || range->type == RAY_DATE ||
+                    range->type == RAY_TIME);
+        break;
+    default:
+        range_ok = 0;   /* unsupported vals type */
+        break;
+    }
+    if (!range_ok) return ray_error("type", NULL);
+
+    int64_t n = vals->len;
+    ray_t* result = ray_vec_new(RAY_BOOL, n);
+    if (RAY_IS_ERR(result)) return result;
+    bool* out = (bool*)ray_data(result);
+
+    if (vals->type == RAY_I64) {
+        int64_t* d = (int64_t*)ray_data(vals);
+        int64_t* r = (int64_t*)ray_data(range);
+        int64_t lo = r[0], hi = r[1];
+        for (int64_t i = 0; i < n; i++) out[i] = (d[i] >= lo && d[i] <= hi);
+    } else if (vals->type == RAY_F64) {
+        double* d = (double*)ray_data(vals);
+        double* r = (double*)ray_data(range);
+        double lo = r[0], hi = r[1];
+        for (int64_t i = 0; i < n; i++) out[i] = (d[i] >= lo && d[i] <= hi);
+    } else {
+        /* I32 / DATE / TIME: all stored as int32 (guaranteed by the switch). */
+        int32_t* d = (int32_t*)ray_data(vals);
+        int32_t* r = (int32_t*)ray_data(range);
+        int32_t lo = r[0], hi = r[1];
+        for (int64_t i = 0; i < n; i++) out[i] = (d[i] >= lo && d[i] <= hi);
+    }
+    result->len = n;
+    return result;
+}
+
+/* (div a b) -> float division (always returns f64)
+ *
+ * Both operands must be numeric atoms, otherwise a "type" error is returned.
+ * A null operand or a zero divisor produces a typed F64 null instead of a
+ * quotient. */
+ray_t* ray_fdiv_fn(ray_t* a, ray_t* b) {
+    const int both_atoms = ray_is_atom(a) && ray_is_atom(b);
+    if (!both_atoms) return ray_error("type", NULL);
+
+    const int both_numeric = is_numeric(a) && is_numeric(b);
+    if (!both_numeric) return ray_error("type", NULL);
+
+    /* Null in, null out. */
+    if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b))
+        return ray_typed_null(-RAY_F64);
+
+    const double numerator = as_f64(a);
+    const double divisor = as_f64(b);
+    if (divisor == 0.0)
+        return ray_typed_null(-RAY_F64);   /* x / 0 is reported as null */
+    return make_f64(numerator / divisor);
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/cmp.c b/crates/rayforce-sys/vendor/rayforce/src/ops/cmp.c
new file mode 100644
index 0000000..f0beae6
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/cmp.c
@@ -0,0 +1,330 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "lang/internal.h"
+#include "ops/ops.h"   /* RAY_LAZY, ray_is_lazy, ray_lazy_materialize */
+
+#include <assert.h>
+
+/* Helper: three-way comparison of two STR atoms.
+ *
+ * Returns -1 (leaving *out untouched) unless both a and b have type
+ * -RAY_STR.  Otherwise returns 0 and stores a memcmp-like result in *out:
+ * the bytes of the common prefix decide first; when they are equal the
+ * shorter string orders first (-1 / 0 / 1). */
+int char_str_cmp(ray_t* a, ray_t* b, int *out) {
+    /* Bitwise '&' on purpose: both type tags are inspected unconditionally. */
+    const int both_str = (a->type == -RAY_STR) & (b->type == -RAY_STR);
+    if (!both_str) return -1;
+
+    const char* lhs = ray_str_ptr(a);
+    size_t lhs_len = ray_str_len(a);
+    const char* rhs = ray_str_ptr(b);
+    size_t rhs_len = ray_str_len(b);
+
+    size_t shared = (rhs_len < lhs_len) ? rhs_len : lhs_len;
+    int order = memcmp(lhs, rhs, shared);
+    if (order == 0) {
+        /* Equal prefix: length breaks the tie. */
+        if (lhs_len > rhs_len) order = 1;
+        else if (lhs_len < rhs_len) order = -1;
+    }
+    *out = order;
+    return 0;
+}
+
+/* Lexicographic three-way compare of two SYM atoms.
+ *
+ * Equal interned ids mean identical text, so that case returns 0 without
+ * consulting the symbol table.  Otherwise both ids are resolved to their
+ * backing strings with ray_sym_str, compared with ray_str_cmp, and the two
+ * string references are released again.
+ *
+ * Invariant: every valid SYM atom resolves to an interned string.  A NULL
+ * from ray_sym_str therefore signals corruption (uninitialised intern
+ * table, out-of-range id, or evicted slot), for which no defensible total
+ * order exists.  The function asserts rather than fabricating an answer:
+ * returning 0 would silently merge distinct symbols, and ordering by raw id
+ * would invent a non-lexicographic order.  This matches v1, which trusted
+ * the same invariant.  Note that assert() is compiled out when NDEBUG is
+ * defined, in which case a NULL would be passed on to ray_str_cmp. */
+int sym_atom_cmp(ray_t* a, ray_t* b) {
+    if (a->i64 != b->i64) {
+        ray_t* sa = ray_sym_str(a->i64);
+        ray_t* sb = ray_sym_str(b->i64);
+        assert(sa && sb && "sym_atom_cmp: corrupted intern table — "
+                           "valid SYM atom must resolve to interned string");
+        int order = ray_str_cmp(sa, sb);
+        ray_release(sa);
+        ray_release(sb);
+        return order;
+    }
+    /* Same interned id: same symbol. */
+    return 0;
+}
+
+/* Comparison */
+/* (> a b) on atoms -> bool atom.
+ *
+ * Tried in order: STR/STR (byte-wise via char_str_cmp), SYM/SYM
+ * (lexicographic via sym_atom_cmp), GUID/GUID (memcmp of the 16 bytes),
+ * temporal/temporal (compared as nanoseconds) and numeric/numeric.  Any
+ * other pairing is a "type" error.  For temporal and numeric operands a
+ * null orders below every non-null value and is not greater than itself.
+ *
+ * Numeric operands are compared as doubles only when is_float_op(a, b)
+ * holds; otherwise as int64, so integers beyond 2^53 (which a double cannot
+ * represent exactly) keep their true ordering.  ray_eq_fn uses the same
+ * split.
+ *
+ * NOTE(review): the GUID branch dereferences ->obj unconditionally; this
+ * assumes GUID atoms always carry an obj buffer — confirm. */
+ray_t* ray_gt_fn(ray_t* a, ray_t* b) {
+    { int c; if (char_str_cmp(a, b, &c) == 0) return make_bool(c > 0 ? 1 : 0); }
+    if (a->type == -RAY_SYM && b->type == -RAY_SYM)
+        return make_bool(sym_atom_cmp(a, b) > 0 ? 1 : 0);
+    if (a->type == -RAY_GUID && b->type == -RAY_GUID)
+        return make_bool(memcmp(ray_data(a->obj), ray_data(b->obj), 16) > 0 ? 1 : 0);
+    /* Temporal comparison (same or cross-temporal via nanosecond conversion) */
+    if (is_temporal(a) && is_temporal(b)) {
+        if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b))
+            return make_bool(RAY_ATOM_IS_NULL(b) && !RAY_ATOM_IS_NULL(a) ? 1 : 0);
+        return make_bool(temporal_as_ns(a) > temporal_as_ns(b) ? 1 : 0);
+    }
+    if (!is_numeric(a) || !is_numeric(b))
+        return ray_error("type", "cannot compare %s and %s",
+                         ray_type_name(a->type), ray_type_name(b->type));
+    int na = RAY_ATOM_IS_NULL(a), nb = RAY_ATOM_IS_NULL(b);
+    if (na && nb) return make_bool(0);       /* null == null → not > */
+    if (na) return make_bool(0);             /* null > X → false */
+    if (nb) return make_bool(1);             /* X > null → true */
+    if (is_float_op(a, b))
+        return make_bool(as_f64(a) > as_f64(b) ? 1 : 0);
+    /* Pure integer operands: compare exactly, without rounding to double. */
+    return make_bool(as_i64(a) > as_i64(b) ? 1 : 0);
+}
+
+/* (< a b) on atoms -> bool atom.
+ *
+ * Tried in order: STR/STR (byte-wise via char_str_cmp), SYM/SYM
+ * (lexicographic via sym_atom_cmp), GUID/GUID (memcmp of the 16 bytes),
+ * temporal/temporal (compared as nanoseconds) and numeric/numeric.  Any
+ * other pairing is a "type" error.  For temporal and numeric operands a
+ * null orders below every non-null value and is not less than itself.
+ *
+ * Numeric operands are compared as doubles only when is_float_op(a, b)
+ * holds; otherwise as int64, so integers beyond 2^53 (which a double cannot
+ * represent exactly) keep their true ordering.  ray_eq_fn uses the same
+ * split.
+ *
+ * NOTE(review): the GUID branch dereferences ->obj unconditionally; this
+ * assumes GUID atoms always carry an obj buffer — confirm. */
+ray_t* ray_lt_fn(ray_t* a, ray_t* b) {
+    { int c; if (char_str_cmp(a, b, &c) == 0) return make_bool(c < 0 ? 1 : 0); }
+    if (a->type == -RAY_SYM && b->type == -RAY_SYM)
+        return make_bool(sym_atom_cmp(a, b) < 0 ? 1 : 0);
+    if (a->type == -RAY_GUID && b->type == -RAY_GUID)
+        return make_bool(memcmp(ray_data(a->obj), ray_data(b->obj), 16) < 0 ? 1 : 0);
+    /* Temporal comparison (same or cross-temporal via nanosecond conversion) */
+    if (is_temporal(a) && is_temporal(b)) {
+        if (RAY_ATOM_IS_NULL(a) || RAY_ATOM_IS_NULL(b))
+            return make_bool(RAY_ATOM_IS_NULL(a) && !RAY_ATOM_IS_NULL(b) ? 1 : 0);
+        return make_bool(temporal_as_ns(a) < temporal_as_ns(b) ? 1 : 0);
+    }
+    if (!is_numeric(a) || !is_numeric(b))
+        return ray_error("type", "cannot compare %s and %s",
+                         ray_type_name(a->type), ray_type_name(b->type));
+    int na = RAY_ATOM_IS_NULL(a), nb = RAY_ATOM_IS_NULL(b);
+    if (na && nb) return make_bool(0);       /* null == null → not < */
+    if (na) return make_bool(1);             /* null < X → true */
+    if (nb) return make_bool(0);             /* X < null → false */
+    if (is_float_op(a, b))
+        return make_bool(as_f64(a) < as_f64(b) ? 1 : 0);
+    /* Pure integer operands: compare exactly, without rounding to double. */
+    return make_bool(as_i64(a) < as_i64(b) ? 1 : 0);
+}
+
+/* (>= a b) on atoms -> bool atom.
+ *
+ * Tried in order: STR/STR (byte-wise via char_str_cmp), SYM/SYM
+ * (lexicographic via sym_atom_cmp), GUID/GUID (memcmp of the 16 bytes),
+ * temporal/temporal (compared as nanoseconds) and numeric/numeric.  Any
+ * other pairing is a "type" error.  For temporal and numeric operands a
+ * null orders below every non-null value and equals another null.
+ *
+ * Numeric operands are compared as doubles only when is_float_op(a, b)
+ * holds; otherwise as int64, so integers beyond 2^53 (which a double cannot
+ * represent exactly) keep their true ordering.  ray_eq_fn uses the same
+ * split.
+ *
+ * NOTE(review): the GUID branch dereferences ->obj unconditionally; this
+ * assumes GUID atoms always carry an obj buffer — confirm. */
+ray_t* ray_gte_fn(ray_t* a, ray_t* b) {
+    { int c; if (char_str_cmp(a, b, &c) == 0) return make_bool(c >= 0 ? 1 : 0); }
+    if (a->type == -RAY_SYM && b->type == -RAY_SYM)
+        return make_bool(sym_atom_cmp(a, b) >= 0 ? 1 : 0);
+    if (a->type == -RAY_GUID && b->type == -RAY_GUID)
+        return make_bool(memcmp(ray_data(a->obj), ray_data(b->obj), 16) >= 0 ? 1 : 0);
+    /* Temporal comparison (same or cross-temporal via nanosecond conversion) */
+    if (is_temporal(a) && is_temporal(b)) {
+        if (RAY_ATOM_IS_NULL(a) && RAY_ATOM_IS_NULL(b)) return make_bool(1);
+        if (RAY_ATOM_IS_NULL(a)) return make_bool(0);
+        if (RAY_ATOM_IS_NULL(b)) return make_bool(1);
+        return make_bool(temporal_as_ns(a) >= temporal_as_ns(b) ? 1 : 0);
+    }
+    if (!is_numeric(a) || !is_numeric(b))
+        return ray_error("type", "cannot compare %s and %s",
+                         ray_type_name(a->type), ray_type_name(b->type));
+    int na = RAY_ATOM_IS_NULL(a), nb = RAY_ATOM_IS_NULL(b);
+    if (na && nb) return make_bool(1);       /* null == null → >= true */
+    if (na) return make_bool(0);             /* null >= X → false */
+    if (nb) return make_bool(1);             /* X >= null → true */
+    if (is_float_op(a, b))
+        return make_bool(as_f64(a) >= as_f64(b) ? 1 : 0);
+    /* Pure integer operands: compare exactly, without rounding to double. */
+    return make_bool(as_i64(a) >= as_i64(b) ? 1 : 0);
+}
+
+/* (<= a b) on atoms -> bool atom.
+ *
+ * Tried in order: STR/STR (byte-wise via char_str_cmp), SYM/SYM
+ * (lexicographic via sym_atom_cmp), GUID/GUID (memcmp of the 16 bytes),
+ * temporal/temporal (compared as nanoseconds) and numeric/numeric.  Any
+ * other pairing is a "type" error.  For temporal and numeric operands a
+ * null orders below every non-null value and equals another null.
+ *
+ * Numeric operands are compared as doubles only when is_float_op(a, b)
+ * holds; otherwise as int64, so integers beyond 2^53 (which a double cannot
+ * represent exactly) keep their true ordering.  ray_eq_fn uses the same
+ * split.
+ *
+ * NOTE(review): the GUID branch dereferences ->obj unconditionally; this
+ * assumes GUID atoms always carry an obj buffer — confirm. */
+ray_t* ray_lte_fn(ray_t* a, ray_t* b) {
+    { int c; if (char_str_cmp(a, b, &c) == 0) return make_bool(c <= 0 ? 1 : 0); }
+    if (a->type == -RAY_SYM && b->type == -RAY_SYM)
+        return make_bool(sym_atom_cmp(a, b) <= 0 ? 1 : 0);
+    if (a->type == -RAY_GUID && b->type == -RAY_GUID)
+        return make_bool(memcmp(ray_data(a->obj), ray_data(b->obj), 16) <= 0 ? 1 : 0);
+    /* Temporal comparison (same or cross-temporal via nanosecond conversion) */
+    if (is_temporal(a) && is_temporal(b)) {
+        if (RAY_ATOM_IS_NULL(a) && RAY_ATOM_IS_NULL(b)) return make_bool(1);
+        if (RAY_ATOM_IS_NULL(a)) return make_bool(1);
+        if (RAY_ATOM_IS_NULL(b)) return make_bool(0);
+        return make_bool(temporal_as_ns(a) <= temporal_as_ns(b) ? 1 : 0);
+    }
+    if (!is_numeric(a) || !is_numeric(b))
+        return ray_error("type", "cannot compare %s and %s",
+                         ray_type_name(a->type), ray_type_name(b->type));
+    int na = RAY_ATOM_IS_NULL(a), nb = RAY_ATOM_IS_NULL(b);
+    if (na && nb) return make_bool(1);       /* null == null → <= true */
+    if (na) return make_bool(1);             /* null <= X → true */
+    if (nb) return make_bool(0);             /* X <= null → false */
+    if (is_float_op(a, b))
+        return make_bool(as_f64(a) <= as_f64(b) ? 1 : 0);
+    /* Pure integer operands: compare exactly, without rounding to double. */
+    return make_bool(as_i64(a) <= as_i64(b) ? 1 : 0);
+}
+
+/* Predicate: returns 1 when x is numeric or temporal, 0 otherwise. */
+int is_comparable(ray_t* x) {
+    if (is_numeric(x)) return 1;
+    return is_temporal(x) ? 1 : 0;
+}
+
+/* (== a b) on atoms -> bool atom.
+ *
+ * Nulls are settled first: two nulls are equal, a null never equals a
+ * non-null.  After that the pairs STR/STR, BOOL/BOOL, SYM/SYM (interned
+ * ids), GUID/GUID (16 bytes), temporal/temporal (as nanoseconds) and
+ * numeric/numeric are handled in that order; numeric pairs compare as
+ * doubles when is_float_op(a, b) holds and as int64 otherwise.  Any other
+ * pairing is a "type" error. */
+ray_t* ray_eq_fn(ray_t* a, ray_t* b) {
+    /* Covers every null form (C NULL, RAY_NULL_OBJ, typed null atoms). */
+    const int a_null = (!a || RAY_ATOM_IS_NULL(a));
+    const int b_null = (!b || RAY_ATOM_IS_NULL(b));
+    if (a_null || b_null)
+        return make_bool((a_null && b_null) ? 1 : 0);
+
+    int ord;
+    if (char_str_cmp(a, b, &ord) == 0)
+        return make_bool(ord == 0 ? 1 : 0);
+
+    int equal;
+    if (a->type == -RAY_BOOL && b->type == -RAY_BOOL) {
+        equal = (a->b8 == b->b8);
+    } else if (a->type == -RAY_SYM && b->type == -RAY_SYM) {
+        equal = (a->i64 == b->i64);
+    } else if (a->type == -RAY_GUID && b->type == -RAY_GUID) {
+        equal = (memcmp(ray_data(a->obj), ray_data(b->obj), 16) == 0);
+    } else if (is_temporal(a) && is_temporal(b)) {
+        /* Same or cross-temporal: both sides converted to nanoseconds. */
+        equal = (temporal_as_ns(a) == temporal_as_ns(b));
+    } else if (!is_numeric(a) || !is_numeric(b)) {
+        return ray_error("type", NULL);
+    } else if (is_float_op(a, b)) {
+        equal = (as_f64(a) == as_f64(b));
+    } else {
+        equal = (as_i64(a) == as_i64(b));
+    }
+    return make_bool(equal ? 1 : 0);
+}
+
+/* (!= a b) on atoms -> bool atom.
+ *
+ * Nulls are settled first: two nulls are not different, a null always
+ * differs from a non-null.  After that the pairs STR/STR, BOOL/BOOL,
+ * SYM/SYM (interned ids), GUID/GUID (16 bytes), temporal/temporal (as
+ * nanoseconds) and numeric/numeric are handled in that order; numeric pairs
+ * compare as doubles when is_float_op(a, b) holds and as int64 otherwise.
+ * Any other pairing is a "type" error. */
+ray_t* ray_neq_fn(ray_t* a, ray_t* b) {
+    /* Covers every null form (C NULL, RAY_NULL_OBJ, typed null atoms). */
+    const int a_null = (!a || RAY_ATOM_IS_NULL(a));
+    const int b_null = (!b || RAY_ATOM_IS_NULL(b));
+    if (a_null || b_null)
+        return make_bool((a_null && b_null) ? 0 : 1);
+
+    int ord;
+    if (char_str_cmp(a, b, &ord) == 0)
+        return make_bool(ord != 0 ? 1 : 0);
+
+    int differ;
+    if (a->type == -RAY_BOOL && b->type == -RAY_BOOL) {
+        differ = (a->b8 != b->b8);
+    } else if (a->type == -RAY_SYM && b->type == -RAY_SYM) {
+        differ = (a->i64 != b->i64);
+    } else if (a->type == -RAY_GUID && b->type == -RAY_GUID) {
+        differ = (memcmp(ray_data(a->obj), ray_data(b->obj), 16) != 0);
+    } else if (is_temporal(a) && is_temporal(b)) {
+        /* Same or cross-temporal: both sides converted to nanoseconds. */
+        differ = (temporal_as_ns(a) != temporal_as_ns(b));
+    } else if (!is_numeric(a) || !is_numeric(b)) {
+        return ray_error("type", NULL);
+    } else if (is_float_op(a, b)) {
+        differ = (as_f64(a) != as_f64(b));
+    } else {
+        differ = (as_i64(a) != as_i64(b));
+    }
+    return make_bool(differ ? 1 : 0);
+}
+
+/* Bool vector element-wise helpers to reduce duplication in and/or.
+ *
+ * BOOL_VEC_BINOP(a, b, op): builds a BOOL vector r with r[i] = a[i] op b[i]
+ * and RETURNS it from the enclosing function (or returns the error object
+ * produced by ray_vec_new).  When the two lengths differ the result is
+ * truncated to the shorter input.  Arguments are parenthesized and the
+ * macro's locals carry a trailing underscore so that caller expressions and
+ * identifiers (e.g. a variable named n or r) cannot be captured. */
+#define BOOL_VEC_BINOP(a, b, op) do {                                   \
+    int64_t n_ = (a)->len < (b)->len ? (a)->len : (b)->len;             \
+    ray_t* r_ = ray_vec_new(RAY_BOOL, n_);                              \
+    if (RAY_IS_ERR(r_)) return r_;                                      \
+    bool* da_ = (bool*)ray_data(a);                                     \
+    bool* db_ = (bool*)ray_data(b);                                     \
+    bool* dr_ = (bool*)ray_data(r_);                                    \
+    for (int64_t i_ = 0; i_ < n_; i_++) dr_[i_] = da_[i_] op db_[i_];   \
+    r_->len = n_;                                                       \
+    return r_;                                                          \
+} while(0)
+
+/* BOOL_VEC_SCALAR_L(vec, sv, op): broadcasts the scalar truth value sv over
+ * the BOOL vector vec, building r with r[i] = vec[i] op sv, and RETURNS r
+ * from the enclosing function (or returns the error object produced by
+ * ray_vec_new).  sv is evaluated exactly once, before the loop, rather than
+ * once per element.  Arguments are parenthesized and the macro's locals
+ * carry a trailing underscore so caller identifiers cannot be captured. */
+#define BOOL_VEC_SCALAR_L(vec, sv, op) do {                             \
+    int64_t n_ = (vec)->len;                                            \
+    ray_t* r_ = ray_vec_new(RAY_BOOL, n_);                              \
+    if (RAY_IS_ERR(r_)) return r_;                                      \
+    const bool sv_ = (sv) ? true : false;                               \
+    bool* dv_ = (bool*)ray_data(vec);                                   \
+    bool* dr_ = (bool*)ray_data(r_);                                    \
+    for (int64_t i_ = 0; i_ < n_; i_++) dr_[i_] = dv_[i_] op sv_;       \
+    r_->len = n_;                                                       \
+    return r_;                                                          \
+} while(0)
+
+/* Logical AND of two values.
+ *
+ * BOOL vec AND BOOL vec  -> element-wise BOOL vector
+ * BOOL vec AND atom      -> vector, each element AND-ed with is_truthy(atom)
+ * atom AND BOOL vec      -> same, with the operands swapped
+ * anything else          -> bool atom: is_truthy(a) && is_truthy(b)
+ * The BOOL_VEC_* macros return from this function themselves. */
+ray_t* ray_and_fn(ray_t* a, ray_t* b) {
+    if (ray_is_vec(a) && a->type == RAY_BOOL) {
+        /* Element-wise for two bool vectors */
+        if (ray_is_vec(b) && b->type == RAY_BOOL)
+            BOOL_VEC_BINOP(a, b, &&);
+        /* Scalar broadcast: vec and scalar */
+        if (ray_is_atom(b))
+            BOOL_VEC_SCALAR_L(a, is_truthy(b), &&);
+    }
+    /* Scalar broadcast: scalar and vec */
+    if (ray_is_atom(a) && ray_is_vec(b) && b->type == RAY_BOOL)
+        BOOL_VEC_SCALAR_L(b, is_truthy(a), &&);
+
+    const int both = is_truthy(a) && is_truthy(b);
+    return make_bool(both ? 1 : 0);
+}
+
+/* Logical OR of two values.
+ *
+ * BOOL vec OR BOOL vec  -> element-wise BOOL vector
+ * BOOL vec OR atom      -> vector, each element OR-ed with is_truthy(atom)
+ * atom OR BOOL vec      -> same, with the operands swapped
+ * anything else         -> bool atom: is_truthy(a) || is_truthy(b)
+ * The BOOL_VEC_* macros return from this function themselves. */
+ray_t* ray_or_fn(ray_t* a, ray_t* b) {
+    if (ray_is_vec(a) && a->type == RAY_BOOL) {
+        /* Element-wise for two bool vectors */
+        if (ray_is_vec(b) && b->type == RAY_BOOL)
+            BOOL_VEC_BINOP(a, b, ||);
+        /* Scalar broadcast: vec or scalar */
+        if (ray_is_atom(b))
+            BOOL_VEC_SCALAR_L(a, is_truthy(b), ||);
+    }
+    /* Scalar broadcast: scalar or vec */
+    if (ray_is_atom(a) && ray_is_vec(b) && b->type == RAY_BOOL)
+        BOOL_VEC_SCALAR_L(b, is_truthy(a), ||);
+
+    const int either = is_truthy(a) || is_truthy(b);
+    return make_bool(either ? 1 : 0);
+}
+
+/* Special-form variadic AND/OR with short-circuit (matches v1).
+ *
+ * `args` are UNEVALUATED AST nodes — registered with RAY_FN_SPECIAL_FORM
+ * so the evaluator hands us raw forms rather than computed values.  We
+ * call ray_eval per arg ourselves and stop as soon as the result is
+ * determined: AND on first scalar falsy, OR on first scalar truthy.
+ *
+ * Mixed scalar+vector: when the running accumulator becomes a *scalar*
+ * with the determining truth value, we return it immediately — same
+ * shape as Lisp/Clojure where short-circuit yields the determinant.
+ * If the accumulator is a vector we cannot short-circuit (subsequent
+ * args may be vectors that still need element-wise combination), so we
+ * fall through to ray_and_fn / ray_or_fn for that step. */
+/* Evaluates one unevaluated argument form with ray_eval.  A lazy result is
+ * replaced by ray_lazy_materialize's output; NULL and error results are
+ * passed through untouched. */
+static ray_t* eval_and_short(ray_t* arg) {
+    ray_t* val = ray_eval(arg);
+    if (val && !RAY_IS_ERR(val) && ray_is_lazy(val))
+        val = ray_lazy_materialize(val);
+    return val;
+}
+
+/* Variadic short-circuit AND over n unevaluated argument forms.
+ *
+ * Fewer than two forms is an "arity" error.  Forms are evaluated left to
+ * right and folded with ray_and_fn.  Evaluation stops, returning the
+ * accumulator, as soon as it is a falsy *scalar*; a vector accumulator never
+ * short-circuits because later arguments still have to be combined
+ * element-wise (so `(and vec false)` yields an all-false vector of the
+ * accumulator's shape rather than a bare scalar).  A NULL or error result
+ * from evaluation or combination is returned immediately. */
+ray_t* ray_and_vary_fn(ray_t** args, int64_t n) {
+    if (n < 2) return ray_error("arity", "expected at least 2 args, got %lld", (long long)n);
+    ray_t* acc = eval_and_short(args[0]);
+    int64_t next_arg = 1;
+    for (;;) {
+        /* Failure of the first evaluation or of the latest combination. */
+        if (!acc || RAY_IS_ERR(acc)) return acc;
+        /* Determined: scalar falsy accumulator. */
+        if (ray_is_atom(acc) && !is_truthy(acc)) return acc;
+        if (next_arg >= n) return acc;
+
+        ray_t* rhs = eval_and_short(args[next_arg++]);
+        if (!rhs || RAY_IS_ERR(rhs)) { ray_release(acc); return rhs; }
+        ray_t* combined = ray_and_fn(acc, rhs);
+        ray_release(acc);
+        ray_release(rhs);
+        acc = combined;
+    }
+}
+
+/* Variadic short-circuit OR over n unevaluated argument forms.
+ *
+ * Fewer than two forms is an "arity" error.  Forms are evaluated left to
+ * right and folded with ray_or_fn.  Evaluation stops, returning the
+ * accumulator, as soon as it is a truthy *scalar*; a vector accumulator
+ * never short-circuits because later arguments still have to be combined
+ * element-wise.  A NULL or error result from evaluation or combination is
+ * returned immediately. */
+ray_t* ray_or_vary_fn(ray_t** args, int64_t n) {
+    if (n < 2) return ray_error("arity", "expected at least 2 args, got %lld", (long long)n);
+    ray_t* acc = eval_and_short(args[0]);
+    int64_t next_arg = 1;
+    for (;;) {
+        /* Failure of the first evaluation or of the latest combination. */
+        if (!acc || RAY_IS_ERR(acc)) return acc;
+        /* Determined: scalar truthy accumulator. */
+        if (ray_is_atom(acc) && is_truthy(acc)) return acc;
+        if (next_arg >= n) return acc;
+
+        ray_t* rhs = eval_and_short(args[next_arg++]);
+        if (!rhs || RAY_IS_ERR(rhs)) { ray_release(acc); return rhs; }
+        ray_t* combined = ray_or_fn(acc, rhs);
+        ray_release(acc);
+        ray_release(rhs);
+        acc = combined;
+    }
+}
+
+/* Unary */
+/* Logical NOT.
+ * A BOOL vector is negated element-wise into a new BOOL vector of the same
+ * length; every other input yields a bool atom holding !is_truthy(x). */
+ray_t* ray_not_fn(ray_t* x) {
+    const int is_bool_vec = ray_is_vec(x) && x->type == RAY_BOOL;
+    if (!is_bool_vec)
+        return make_bool(is_truthy(x) ? 0 : 1);
+
+    const int64_t count = x->len;
+    ray_t* flipped = ray_vec_new(RAY_BOOL, count);
+    if (RAY_IS_ERR(flipped)) return flipped;
+
+    const bool* in = (bool*)ray_data(x);
+    bool* out = (bool*)ray_data(flipped);
+    for (int64_t i = 0; i < count; i++)
+        out[i] = !in[i];
+    flipped->len = count;
+    return flipped;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/collection.c b/crates/rayforce-sys/vendor/rayforce/src/ops/collection.c
new file mode 100644
index 0000000..75783f8
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/collection.c
@@ -0,0 +1,2040 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+/*  Collection / higher-order builtins — extracted from eval.c  */
+
+#include "lang/internal.h"
+#include "core/types.h"
+#include "core/pool.h"
+#include "mem/sys.h"
+#include "ops/hash.h"
+#include <stdlib.h>
+#include <string.h>
+
+/* ══════════════════════════════════════════
+ * Open-addressing hash set used by distinct/union/except/sect/in
+ * to replace O(n×m) linear scans with O(n+m) hash lookups.
+ *
+ * Slots store the row index of the first occurrence in a single vec,
+ * or HS_EMPTY.  Hashing and equality are dispatched through a small
+ * vtable keyed by collection type so the same set can probe a typed
+ * vec or a boxed RAY_LIST without per-call boxing on the typed path.
+ * Nulls aggregate into a separate null-bucket flag, mirroring atom_eq
+ * (all nulls compare equal regardless of typed-null vs RAY_NULL_OBJ).
+ *
+ * Load factor capped at 0.5; growth doubles capacity.
+ * ══════════════════════════════════════════ */
+
+#define HS_EMPTY ((int64_t)-1)
+
+typedef struct hashset_t {
+    int64_t* slots;        /* cap entries; HS_EMPTY (-1) = empty, else row index into src */
+    ray_t*   block;        /* backing alloc for slots; released by hashset_destroy */
+    int64_t  cap;          /* slot count; power of 2 (starts >= 16, grows by doubling) */
+    int64_t  mask;         /* cap - 1, used to wrap probe positions */
+    int64_t  count;        /* live entries (excl. null bucket) */
+    int      null_seen;    /* 1 if any null has been recorded */
+    int64_t  null_idx;     /* row index of first null encountered; HS_EMPTY until then */
+    /* Build-side source, cached at init so hot loops avoid re-deriving it. */
+    ray_t*   src;          /* source vec or list */
+    int8_t   src_type;     /* src->type (0 when src is NULL) */
+    bool     src_has_nulls;  /* RAY_ATTR_HAS_NULLS bit of src->attrs, sampled at init */
+    void*    src_data;     /* ray_data(src): typed data (or RAY_LIST elements) */
+} hashset_t;
+
+/* Hash row i of src; t is the type tag to dispatch on and data the element
+ * storage to index.  Integer and temporal vecs hash the widened value with
+ * ray_hash_i64, F64 vecs with ray_hash_f64; numeric RAY_LIST elements hash
+ * through as_f64 so atoms of different numeric types with equal value collide. */
+static uint64_t hs_hash_row(ray_t* src, int64_t i, int8_t t, void* data) {
+    switch (t) {
+        case RAY_I64:       return ray_hash_i64(((const int64_t*)data)[i]);
+        case RAY_I32:       return ray_hash_i64((int64_t)((const int32_t*)data)[i]);
+        case RAY_I16:       return ray_hash_i64((int64_t)((const int16_t*)data)[i]);
+        case RAY_U8:        return ray_hash_i64((int64_t)((const uint8_t*)data)[i]);
+        case RAY_BOOL:      return ray_hash_i64((int64_t)((const bool*)data)[i]);
+        case RAY_F64:       return ray_hash_f64(((const double*)data)[i]);  /* NOTE(review): typed integer rows use ray_hash_i64 instead — confirm cross-type probes never need these to agree */
+        case RAY_DATE:      return ray_hash_i64((int64_t)((const int32_t*)data)[i]);
+        case RAY_TIME:      return ray_hash_i64((int64_t)((const int32_t*)data)[i]);
+        case RAY_TIMESTAMP: return ray_hash_i64(((const int64_t*)data)[i]);
+        case RAY_SYM: {
+            uint64_t s = ray_read_sym(data, i, src->type, src->attrs);
+            return ray_hash_i64((int64_t)s);
+        }
+        case RAY_GUID:
+            return ray_hash_bytes((const uint8_t*)data + i * 16, 16);
+        case RAY_STR: {
+            size_t l = 0;
+            const char* p = ray_str_vec_get(src, i, &l);
+            return p ? ray_hash_bytes(p, l) : 0;  /* a NULL string pointer hashes to 0 */
+        }
+        case RAY_LIST: {
+            ray_t** elems = (ray_t**)data;
+            ray_t* e = elems[i];
+            if (!e || RAY_ATOM_IS_NULL(e)) return 0;  /* every null element hashes to 0 */
+            /* Numeric coercion: hash as f64 so distinct numeric types
+             * holding the same value collide (atom_eq does the same). */
+            if (is_numeric(e)) return ray_hash_f64(as_f64(e));
+            switch (e->type) {
+                case -RAY_SYM:       return ray_hash_i64(e->i64);
+                case -RAY_DATE:
+                case -RAY_TIME:      return ray_hash_i64((int64_t)e->i32);
+                case -RAY_TIMESTAMP: return ray_hash_i64(e->i64);
+                case -RAY_GUID: {
+                    const uint8_t* g = e->obj
+                        ? (const uint8_t*)ray_data(e->obj)
+                        : (const uint8_t*)ray_data((ray_t*)e);
+                    return ray_hash_bytes(g, 16);
+                }
+                case -RAY_STR:
+                    return ray_hash_bytes(ray_str_ptr(e), ray_str_len(e));
+                default:
+                    /* Any other element kind: hash the type tag only, so equality decides. */
+                    return ray_hash_i64((int64_t)e->type);
+            }
+        }
+        default:
+            return ray_hash_i64(i);  /* unrecognised vec type: hash the row index itself */
+    }
+}
+
+/* Row equality: nonzero if row ai of a_src equals row bi of b_src.  Same-typed non-list vecs compare raw storage; everything else goes through atom_eq. */
+static int hs_eq_rows(ray_t* a_src, int64_t ai, int8_t at, void* a_data,
+                      ray_t* b_src, int64_t bi, int8_t bt, void* b_data) {
+    if (at == bt && at != RAY_LIST) {
+        switch (at) {
+            case RAY_I64:       return ((const int64_t*)a_data)[ai] == ((const int64_t*)b_data)[bi];
+            case RAY_I32:       return ((const int32_t*)a_data)[ai] == ((const int32_t*)b_data)[bi];
+            case RAY_I16:       return ((const int16_t*)a_data)[ai] == ((const int16_t*)b_data)[bi];
+            case RAY_U8:        return ((const uint8_t*)a_data)[ai] == ((const uint8_t*)b_data)[bi];
+            case RAY_BOOL:      return ((const bool*)a_data)[ai] == ((const bool*)b_data)[bi];
+            case RAY_F64:       return ((const double*)a_data)[ai] == ((const double*)b_data)[bi];  /* plain ==, so a NaN row never matches */
+            case RAY_DATE:
+            case RAY_TIME:      return ((const int32_t*)a_data)[ai] == ((const int32_t*)b_data)[bi];
+            case RAY_TIMESTAMP: return ((const int64_t*)a_data)[ai] == ((const int64_t*)b_data)[bi];
+            case RAY_SYM: {
+                uint64_t sa = ray_read_sym(a_data, ai, a_src->type, a_src->attrs);
+                uint64_t sb = ray_read_sym(b_data, bi, b_src->type, b_src->attrs);
+                return sa == sb;
+            }
+            case RAY_GUID:
+                return memcmp((const uint8_t*)a_data + ai * 16,
+                              (const uint8_t*)b_data + bi * 16, 16) == 0;
+            case RAY_STR: {
+                size_t al = 0, bl = 0;
+                const char* ap = ray_str_vec_get(a_src, ai, &al);
+                const char* bp = ray_str_vec_get(b_src, bi, &bl);
+                if (!ap) ap = "";  /* keep memcmp away from a NULL pointer */
+                if (!bp) bp = "";
+                return al == bl && memcmp(ap, bp, al) == 0;
+            }
+        }  /* no default: unlisted types drop through to the boxed fallback */
+    }
+    /* Fall back to atom_eq via boxed values.  Used for cross-type
+     * comparisons (e.g. except over typed I64 vs F64 vec) and the
+     * RAY_LIST path.  collection_elem sets alloc_* when it hands back a
+     * temporary that this function must release again. */
+    int alloc_a = 0, alloc_b = 0;
+    ray_t* a = collection_elem(a_src, ai, &alloc_a);
+    ray_t* b = collection_elem(b_src, bi, &alloc_b);
+    int eq = (a && b) ? atom_eq(a, b) : 0;  /* a failed element fetch counts as "not equal" */
+    if (alloc_a && a) ray_release(a);
+    if (alloc_b && b) ray_release(b);
+    return eq;
+}
+
+/* Null test for row i: a RAY_LIST row is null when its element pointer is NULL
+ * or a null atom; a typed vec row needs HAS_NULLS set and ray_vec_is_null true. */
+static inline int hs_row_is_null(ray_t* src, int64_t i, void* data) {
+    if (src->type != RAY_LIST)
+        return (src->attrs & RAY_ATTR_HAS_NULLS) && ray_vec_is_null(src, i);
+    ray_t* elem = ((ray_t**)data)[i];
+    return !elem || RAY_ATOM_IS_NULL(elem);
+}
+
+/* Prepare hs as an empty set over src with room for about `hint` rows.
+ * Returns false (hs->block left NULL) when the slot array cannot be allocated. */
+static bool hashset_init(hashset_t* hs, ray_t* src, int64_t hint) {
+    /* Smallest power of two >= max(16, 2*hint): hint rows fill at most half. */
+    int64_t want = hint > 0 ? hint * 2 : 16;
+    int64_t nslots = 16;
+    while (nslots < want) nslots *= 2;
+    ray_t* mem = ray_alloc((size_t)nslots * sizeof(int64_t));
+    hs->block = (mem && !RAY_IS_ERR(mem)) ? mem : NULL;
+    if (!hs->block) return false;
+    hs->slots = (int64_t*)ray_data(hs->block);
+    for (int64_t k = 0; k < nslots; k++) hs->slots[k] = HS_EMPTY;
+    hs->cap = nslots; hs->mask = nslots - 1; hs->count = 0;
+    hs->null_seen = 0; hs->null_idx = HS_EMPTY;
+    hs->src = src; hs->src_type = src ? src->type : 0;
+    hs->src_has_nulls = src ? ((src->attrs & RAY_ATTR_HAS_NULLS) != 0) : false;
+    hs->src_data = src ? ray_data(src) : NULL;
+    return true;
+}
+
+static void hashset_destroy(hashset_t* hs) {  /* free slot storage; harmless when block is already NULL */
+    if (hs->block != NULL) ray_release(hs->block);
+    hs->block = NULL; hs->slots = NULL;
+}
+
+static bool hashset_grow(hashset_t* hs) {  /* double the slot array; returns false and leaves hs untouched on failure */
+    int64_t old_cap = hs->cap;
+    int64_t* old_slots = hs->slots;
+    int64_t new_cap = old_cap * 2;
+    if (new_cap < old_cap) return false;  /* guards against the doubling overflowing */
+    ray_t* nb = ray_alloc((size_t)new_cap * sizeof(int64_t));
+    if (!nb || RAY_IS_ERR(nb)) return false;
+    int64_t* ns = (int64_t*)ray_data(nb);
+    for (int64_t i = 0; i < new_cap; i++) ns[i] = HS_EMPTY;
+    int64_t mask = new_cap - 1;
+    for (int64_t i = 0; i < old_cap; i++) {  /* re-home every stored row index */
+        int64_t ridx = old_slots[i];
+        if (ridx == HS_EMPTY) continue;
+        uint64_t h = hs_hash_row(hs->src, ridx, hs->src_type, hs->src_data);
+        int64_t s = (int64_t)(h & (uint64_t)mask);
+        while (ns[s] != HS_EMPTY) s = (s + 1) & mask;  /* linear probe to the next free slot */
+        ns[s] = ridx;
+    }
+    ray_release(hs->block);  /* old storage goes only after the rebuild is complete */
+    hs->block = nb;
+    hs->slots = ns;
+    hs->cap = new_cap;
+    hs->mask = mask;
+    return true;
+}
+
+/* Look up row probe_i of probe_src (its type tag and data passed alongside)
+ * in hs.  Returns the matching build-side row index, or HS_EMPTY when absent. */
+static int64_t hashset_find_xrow(hashset_t* hs, ray_t* probe_src, int64_t probe_i,
+                                  int8_t probe_type, void* probe_data) {
+    if (hs_row_is_null(probe_src, probe_i, probe_data))  /* nulls live in the side bucket, not in slots */
+        return hs->null_seen ? hs->null_idx : HS_EMPTY;
+    uint64_t h = hs_hash_row(probe_src, probe_i, probe_type, probe_data);
+    int64_t s = (int64_t)(h & (uint64_t)hs->mask);
+    while (hs->slots[s] != HS_EMPTY) {  /* NOTE(review): stops only at an empty slot — relies on the table never being completely full */
+        int64_t stored = hs->slots[s];
+        if (hs_eq_rows(probe_src, probe_i, probe_type, probe_data,
+                       hs->src,    stored,   hs->src_type, hs->src_data))
+            return stored;
+        s = (s + 1) & hs->mask;  /* linear probing with wrap-around */
+    }
+    return HS_EMPTY;
+}
+
+/* Context for distinct_sort_cmp.  qsort's comparator takes no user argument,
+ * so distinct_sort_indices stores the source vec, its type tag and its data
+ * pointer here before calling qsort.  _Thread_local keeps it per thread. */
+static _Thread_local ray_t* g_dsort_src;
+static _Thread_local int8_t g_dsort_type;
+static _Thread_local const void* g_dsort_data;
+
+static int distinct_sort_cmp(const void* a, const void* b) {  /* orders two row indices by the values they refer to */
+    int64_t ia = *(const int64_t*)a;
+    int64_t ib = *(const int64_t*)b;
+    switch (g_dsort_type) {
+        case RAY_I64: case RAY_TIMESTAMP: {
+            int64_t va = ((const int64_t*)g_dsort_data)[ia];
+            int64_t vb = ((const int64_t*)g_dsort_data)[ib];
+            return (va > vb) - (va < vb);  /* -1 / 0 / +1 without subtraction overflow */
+        }
+        case RAY_I32: case RAY_DATE: case RAY_TIME: {
+            int32_t va = ((const int32_t*)g_dsort_data)[ia];
+            int32_t vb = ((const int32_t*)g_dsort_data)[ib];
+            return (va > vb) - (va < vb);
+        }
+        case RAY_I16: {
+            int16_t va = ((const int16_t*)g_dsort_data)[ia];
+            int16_t vb = ((const int16_t*)g_dsort_data)[ib];
+            return (va > vb) - (va < vb);
+        }
+        case RAY_U8: case RAY_BOOL: {
+            uint8_t va = ((const uint8_t*)g_dsort_data)[ia];
+            uint8_t vb = ((const uint8_t*)g_dsort_data)[ib];
+            return (va > vb) - (va < vb);
+        }
+        case RAY_F64: {
+            double va = ((const double*)g_dsort_data)[ia];
+            double vb = ((const double*)g_dsort_data)[ib];
+            return (va > vb) - (va < vb);  /* a NaN operand makes both tests false, i.e. it compares "equal" to anything */
+        }
+        default: {
+            /* Any other element kind: box both rows and compare their as_f64 values. */
+            int alloc_a = 0, alloc_b = 0;
+            ray_t* a_e = collection_elem(g_dsort_src, ia, &alloc_a);
+            ray_t* b_e = collection_elem(g_dsort_src, ib, &alloc_b);
+            double va = a_e ? as_f64(a_e) : 0.0;  /* a failed fetch sorts as 0.0 */
+            double vb = b_e ? as_f64(b_e) : 0.0;
+            if (alloc_a && a_e) ray_release(a_e);
+            if (alloc_b && b_e) ray_release(b_e);
+            return (va > vb) - (va < vb);
+        }
+    }
+}
+
+/* Order idx[0..count) by the values those rows hold in src, using qsort with
+ * distinct_sort_cmp; the comparator's thread-local context is set up first. */
+static void distinct_sort_indices(ray_t* src, int64_t* idx, int64_t count) {
+    g_dsort_data = ray_data(src);
+    g_dsort_type = src->type;
+    g_dsort_src = src;
+    qsort(idx, (size_t)count, sizeof(int64_t), distinct_sort_cmp);
+}
+
+/* Insert row i of the build-side src unless an equal row is already present.
+ * Returns true if newly recorded (the first null included), false if duplicate.
+ * A failed grow is ignored: the set keeps its current capacity and carries on. */
+static bool hashset_insert(hashset_t* hs, int64_t i) {
+    if (hs_row_is_null(hs->src, i, hs->src_data)) {  /* nulls are tracked in a side bucket, never in slots */
+        if (hs->null_seen) return false;
+        hs->null_seen = 1;
+        hs->null_idx = i;
+        return true;
+    }
+    if (hs->count * 2 >= hs->cap) {  /* at or above half full: try to double first */
+        if (!hashset_grow(hs)) { /* fall through, may degrade — NOTE(review): with no empty slot left the probe loop below cannot stop; confirm callers pre-size via hashset_init */ }
+    }
+    uint64_t h = hs_hash_row(hs->src, i, hs->src_type, hs->src_data);
+    int64_t s = (int64_t)(h & (uint64_t)hs->mask);
+    while (hs->slots[s] != HS_EMPTY) {
+        int64_t stored = hs->slots[s];
+        if (hs_eq_rows(hs->src, i,      hs->src_type, hs->src_data,
+                       hs->src, stored, hs->src_type, hs->src_data))
+            return false;
+        s = (s + 1) & hs->mask;  /* linear probing with wrap-around */
+    }
+    hs->slots[s] = i;
+    hs->count++;
+    return true;
+}
+
+/* ══════════════════════════════════════════
+ * Higher-order functions
+ * ══════════════════════════════════════════ */
+
+/* (map fn vec) applies unary fn to each element; (map fn val vec) applies
+ * fn(val, elem).  Returns a new RAY_LIST of the results, or the first error hit. */
+ray_t* ray_map_fn(ray_t** args, int64_t n) {
+    if (n < 2) return ray_error("domain", NULL);
+    for (int64_t i = 0; i < n; i++)
+        if (ray_is_lazy(args[i])) args[i] = ray_lazy_materialize(args[i]);
+
+    ray_t* fn = args[0];
+    ray_t* _bx = NULL;
+
+    if (n == 2) {
+        /* Unary map: (map fn vec) */
+        ray_t* vec = unbox_vec_arg(args[1], &_bx);
+        if (RAY_IS_ERR(vec)) return vec;
+        if (!is_list(vec)) { if (_bx) ray_release(_bx); return ray_error("type", NULL); }
+        int64_t len = ray_len(vec);
+        ray_t* result = ray_alloc(len * sizeof(ray_t*));
+        if (!result) { if (_bx) ray_release(_bx); return ray_error("oom", NULL); }
+        result->type = RAY_LIST;
+        result->len = len;
+        ray_t** out = (ray_t**)ray_data(result);
+        ray_t** elems = (ray_t**)ray_data(vec);
+        for (int64_t i = 0; i < len; i++) {
+            out[i] = call_fn1(fn, elems[i]);
+            if (RAY_IS_ERR(out[i])) {
+                ray_t* err = out[i];  /* read it now: out[] lives inside result, which is released below */
+                for (int64_t j = 0; j < i; j++) ray_release(out[j]);
+                result->len = 0; ray_release(result); if (_bx) ray_release(_bx); return err;
+            }
+        }
+        if (_bx) ray_release(_bx);
+        return result;
+    }
+
+    /* Binary map: (map fn val vec) — apply fn(val, elem) */
+    ray_t* val = args[1];
+    ray_t* vec = unbox_vec_arg(args[2], &_bx);
+    if (RAY_IS_ERR(vec)) return vec;
+    /* If vec is not a list, just call fn(val, <original third arg>) once */
+    if (!is_list(vec)) {
+        if (_bx) ray_release(_bx);
+        return call_fn2(fn, val, args[2]);
+    }
+    int64_t len = ray_len(vec);
+    ray_t* result = ray_alloc(len * sizeof(ray_t*));
+    if (!result) { if (_bx) ray_release(_bx); return ray_error("oom", NULL); }
+    result->type = RAY_LIST;
+    result->len = len;
+    ray_t** out = (ray_t**)ray_data(result);
+    ray_t** elems = (ray_t**)ray_data(vec);
+    for (int64_t i = 0; i < len; i++) {
+        out[i] = call_fn2(fn, val, elems[i]);
+        if (RAY_IS_ERR(out[i])) {
+            ray_t* err = out[i];  /* read it now: out[] lives inside result, which is released below */
+            for (int64_t j = 0; j < i; j++) ray_release(out[j]);
+            result->len = 0; ray_release(result); if (_bx) ray_release(_bx); return err;
+        }
+    }
+    if (_bx) ray_release(_bx);
+    return result;
+}
+
+/* (pmap fn vec) / (pmap fn val vec) — currently an alias for map: forwards unchanged to ray_map_fn, so evaluation is sequential. */
+ray_t* ray_pmap_fn(ray_t** args, int64_t n) {
+    return ray_map_fn(args, n);
+}
+
+/* (fold fn vec) or (fold fn init vec) — left fold: acc = fn(acc, elem) per element; returns the final acc or the first error. */
+ray_t* ray_fold_fn(ray_t** args, int64_t n) {
+    if (n < 2) return ray_error("domain", NULL);
+    for (int64_t i = 0; i < n; i++)
+        if (ray_is_lazy(args[i])) args[i] = ray_lazy_materialize(args[i]);
+
+    ray_t* fn = args[0];
+    ray_t* vec;
+    ray_t* acc;
+    ray_t* _bx = NULL;
+    if (n == 2) {
+        /* (fold fn vec) — use first element as initial value */
+        vec = unbox_vec_arg(args[1], &_bx);
+        if (RAY_IS_ERR(vec)) return vec;
+        if (!is_list(vec)) { if (_bx) ray_release(_bx); return ray_error("type", NULL); }
+        int64_t len = ray_len(vec);
+        if (len == 0) { if (_bx) ray_release(_bx); return ray_error("domain", NULL); }  /* nothing to seed acc with */
+        ray_t** elems = (ray_t**)ray_data(vec);
+        ray_retain(elems[0]);  /* acc holds its own reference; it is released after every step */
+        acc = elems[0];
+        for (int64_t i = 1; i < len; i++) {
+            ray_t* next = call_fn2(fn, acc, elems[i]);
+            ray_release(acc);
+            if (RAY_IS_ERR(next)) { if (_bx) ray_release(_bx); return next; }
+            acc = next;
+        }
+        if (_bx) ray_release(_bx);
+        return acc;
+    }
+
+    /* (fold fn init vec) — an empty vec returns init itself (retained) */
+    ray_retain(args[1]);
+    acc = args[1];
+    vec = unbox_vec_arg(args[2], &_bx);
+    if (RAY_IS_ERR(vec)) { ray_release(acc); return vec; }
+    if (!is_list(vec)) { ray_release(acc); if (_bx) ray_release(_bx); return ray_error("type", NULL); }
+    int64_t len = ray_len(vec);
+    ray_t** elems = (ray_t**)ray_data(vec);
+    for (int64_t i = 0; i < len; i++) {
+        ray_t* next = call_fn2(fn, acc, elems[i]);
+        ray_release(acc);  /* the previous accumulator is dropped whether or not the call succeeded */
+        if (RAY_IS_ERR(next)) { if (_bx) ray_release(_bx); return next; }
+        acc = next;
+    }
+    if (_bx) ray_release(_bx);
+    return acc;
+}
+
+/* (scan fn vec) — running fold, returns vector of partial results */
+ray_t* ray_scan_fn(ray_t** args, int64_t n) {
+    if (n < 2) return ray_error("domain", NULL);
+    for (int64_t i = 0; i < n; i++)
+        if (ray_is_lazy(args[i])) args[i] = ray_lazy_materialize(args[i]);
+
+    ray_t* fn = args[0];
+    ray_t* _bx = NULL;
+    ray_t* vec = unbox_vec_arg(args[1], &_bx);
+    if (RAY_IS_ERR(vec)) return vec;
+    if (!is_list(vec)) { if (_bx) ray_release(_bx); return ray_error("type", NULL); }
+    int64_t len = ray_len(vec);
+    if (len == 0) {
+        if (_bx) ray_release(_bx);
+        ray_t* result = ray_alloc(0);
+        if (!result) return ray_error("oom", NULL);
+        result->type = RAY_LIST;
+        result->len = 0;
+        return result;
+    }
+
+    ray_t* result = ray_alloc(len * sizeof(ray_t*));
+    if (!result) { if (_bx) ray_release(_bx); return ray_error("oom", NULL); }
+    result->type = RAY_LIST;
+    result->len = len;
+    ray_t** out = (ray_t**)ray_data(result);
+    ray_t** elems = (ray_t**)ray_data(vec);
+
+    ray_retain(elems[0]);
+    out[0] = elems[0];
+    for (int64_t i = 1; i < len; i++) {
+        out[i] = call_fn2(fn, out[i - 1], elems[i]);
+        if (RAY_IS_ERR(out[i])) {
+            for (int64_t j = 0; j < i; j++) ray_release(out[j]);
+            result->len = 0; ray_release(result); if (_bx) ray_release(_bx);
+            return out[i];
+        }
+    }
+    if (_bx) ray_release(_bx);
+    return result;
+}
+
+/* (filter vec mask) — filter vector by boolean mask */
+ray_t* ray_filter_fn(ray_t* vec, ray_t* mask) {
+    if (ray_is_lazy(vec)) vec = ray_lazy_materialize(vec);
+    if (ray_is_lazy(mask)) mask = ray_lazy_materialize(mask);
+
+    /* Table filter: apply mask to each column */
+    if (vec->type == RAY_TABLE && ray_is_vec(mask) && mask->type == RAY_BOOL) {
+        int64_t ncols = ray_table_ncols(vec);
+        int64_t nrows = ray_table_nrows(vec);
+        if (nrows != mask->len) return ray_error("length", NULL);
+        ray_t* result = ray_table_new(ncols);
+        if (RAY_IS_ERR(result)) return result;
+        for (int64_t c = 0; c < ncols; c++) {
+            int64_t cn = ray_table_col_name(vec, c);
+            ray_t* src_col = ray_table_get_col_idx(vec, c);
+            ray_t* filtered = ray_filter_fn(src_col, mask);
+            if (RAY_IS_ERR(filtered)) { ray_release(result); return filtered; }
+            result = ray_table_add_col(result, cn, filtered);
+            ray_release(filtered);
+            if (RAY_IS_ERR(result)) return result;
+        }
+        return result;
+    }
+
+    /* String filter: STR atom + bool mask → filter characters */
+    if (ray_is_atom(vec) && (-vec->type) == RAY_STR && ray_is_vec(mask) && mask->type == RAY_BOOL) {
+        const char* sp = ray_str_ptr(vec);
+        size_t slen = ray_str_len(vec);
+        int64_t mlen = mask->len;
+        if ((int64_t)slen != mlen) return ray_error("length", NULL);
+        bool* mb = (bool*)ray_data(mask);
+        int64_t count = 0;
+        for (int64_t i = 0; i < mlen; i++) if (mb[i]) count++;
+        char buf[8192];
+        if ((size_t)count > sizeof(buf)) return ray_error("limit", NULL);
+        int64_t j = 0;
+        for (int64_t i = 0; i < mlen; i++) {
+            if (mb[i]) buf[j++] = sp[i];
+        }
+        return ray_str(buf, (size_t)count);
+    }
+
+    /* Fast path: typed vector + typed bool mask */
+    if (ray_is_vec(vec) && ray_is_vec(mask) && mask->type == RAY_BOOL) {
+        int64_t len = vec->len;
+        int64_t mlen = mask->len;
+        if (len != mlen) return ray_error("length", NULL);
+        bool* mb = (bool*)ray_data(mask);
+
+        /* Count true values */
+        int64_t count = 0;
+        for (int64_t i = 0; i < len; i++) if (mb[i]) count++;
+
+        int8_t vtype = vec->type;
+        int esz = ray_elem_size(vtype);
+        ray_t* result = ray_vec_new(vtype, count);
+        if (RAY_IS_ERR(result)) return result;
+        result->len = count;
+        char* src = (char*)ray_data(vec);
+        char* dst = (char*)ray_data(result);
+        int64_t j = 0;
+        for (int64_t i = 0; i < len; i++) {
+            if (mb[i]) {
+                memcpy(dst + j * esz, src + i * esz, esz);
+                if (ray_vec_is_null(vec, i))
+                    ray_vec_set_null(result, j, true);
+                j++;
+            }
+        }
+        return result;
+    }
+
+    /* Fallback: boxed list path */
+    ray_t *_bx1 = NULL, *_bx2 = NULL;
+    vec = unbox_vec_arg(vec, &_bx1);
+    if (RAY_IS_ERR(vec)) return vec;
+    mask = unbox_vec_arg(mask, &_bx2);
+    if (RAY_IS_ERR(mask)) { if (_bx1) ray_release(_bx1); return mask; }
+    if (!is_list(vec) || !is_list(mask)) { if (_bx1) ray_release(_bx1); if (_bx2) ray_release(_bx2); return ray_error("type", NULL); }
+    int64_t len = ray_len(vec);
+    int64_t mlen = ray_len(mask);
+    if (len != mlen) return ray_error("length", NULL);
+
+    ray_t** velems = (ray_t**)ray_data(vec);
+    ray_t** melems = (ray_t**)ray_data(mask);
+
+    /* Validate mask is all booleans */
+    for (int64_t i = 0; i < len; i++) {
+        if (melems[i]->type != -RAY_BOOL) { if (_bx1) ray_release(_bx1); if (_bx2) ray_release(_bx2); return ray_error("type", NULL); }
+    }
+    /* Count true values */
+    int64_t count = 0;
+    for (int64_t i = 0; i < len; i++) {
+        if (melems[i]->b8) count++;
+    }
+
+    ray_t* result = ray_alloc(count * sizeof(ray_t*));
+    if (!result) { if (_bx1) ray_release(_bx1); if (_bx2) ray_release(_bx2); return ray_error("oom", NULL); }
+    result->type = RAY_LIST;
+    result->len = count;
+    ray_t** out = (ray_t**)ray_data(result);
+    int64_t j = 0;
+    for (int64_t i = 0; i < len; i++) {
+        if (melems[i]->b8) {
+            ray_retain(velems[i]);
+            out[j++] = velems[i];
+        }
+    }
+    if (_bx1) ray_release(_bx1);
+    if (_bx2) ray_release(_bx2);
+    return result;
+}
+
+/* (apply fn a b) — zip-apply: result[i] = fn(a[i], b[i]) over the shorter of the two lists; two atoms give a single fn(a, b) call. */
+ray_t* ray_apply_fn(ray_t** args, int64_t n) {
+    if (n < 3) return ray_error("domain", NULL);
+    for (int64_t i = 0; i < n; i++)
+        if (ray_is_lazy(args[i])) args[i] = ray_lazy_materialize(args[i]);
+
+    ray_t* fn = args[0];
+
+    /* If both args are scalars, just call fn(a, b) once */
+    if (ray_is_atom(args[1]) && ray_is_atom(args[2]))
+        return call_fn2(fn, args[1], args[2]);
+
+    ray_t *_bx1 = NULL, *_bx2 = NULL;
+    ray_t* vec1 = unbox_vec_arg(args[1], &_bx1);
+    if (RAY_IS_ERR(vec1)) return vec1;
+    ray_t* vec2 = unbox_vec_arg(args[2], &_bx2);
+    if (RAY_IS_ERR(vec2)) { if (_bx1) ray_release(_bx1); return vec2; }
+    if (!is_list(vec1) || !is_list(vec2)) { if (_bx1) ray_release(_bx1); if (_bx2) ray_release(_bx2); return ray_error("type", NULL); }
+    int64_t len1 = ray_len(vec1);
+    int64_t len2 = ray_len(vec2);
+    int64_t len = len1 < len2 ? len1 : len2;  /* extra elements of the longer list are ignored */
+
+    ray_t* result = ray_alloc(len * sizeof(ray_t*));
+    if (!result) { if (_bx1) ray_release(_bx1); if (_bx2) ray_release(_bx2); return ray_error("oom", NULL); }
+    result->type = RAY_LIST;
+    result->len = len;
+    ray_t** out = (ray_t**)ray_data(result);
+    ray_t** e1 = (ray_t**)ray_data(vec1);
+    ray_t** e2 = (ray_t**)ray_data(vec2);
+
+    for (int64_t i = 0; i < len; i++) {
+        out[i] = call_fn2(fn, e1[i], e2[i]);
+        if (RAY_IS_ERR(out[i])) {
+            ray_t* err = out[i];  /* read it now: out[] lives inside result, which is released below */
+            for (int64_t j = 0; j < i; j++) ray_release(out[j]);
+            result->len = 0; ray_release(result); if (_bx1) ray_release(_bx1); if (_bx2) ray_release(_bx2); return err;
+        }
+    }
+    if (_bx1) ray_release(_bx1);
+    if (_bx2) ray_release(_bx2);
+    return result;
+}
+
+/* ══════════════════════════════════════════
+ * Collection operations
+ * ══════════════════════════════════════════ */
+
+/* Value equality of two objects: returns 1 if equal, 0 otherwise.  Two nulls are equal; differing numeric types compare via as_f64. */
+int atom_eq(ray_t* a, ray_t* b) {
+    int a_null = RAY_ATOM_IS_NULL(a);
+    int b_null = RAY_ATOM_IS_NULL(b);
+    if (a_null && b_null) return 1;
+    if (a_null || b_null) return 0;  /* exactly one side is null */
+    if (a->type != b->type) {
+        if (is_numeric(a) && is_numeric(b))
+            return as_f64(a) == as_f64(b);
+        return 0;  /* different non-numeric types never match */
+    }
+    switch (a->type) {  /* same type from here on; atom tags are the negated type constants */
+    case -RAY_I64:  return a->i64 == b->i64;
+    case -RAY_I32:  return a->i32 == b->i32;
+    case -RAY_I16:  return a->i16 == b->i16;
+    case -RAY_U8:   return a->u8 == b->u8;
+    case -RAY_F64:  return a->f64 == b->f64;
+    case -RAY_BOOL: return a->b8 == b->b8;
+    case -RAY_SYM:  return a->i64 == b->i64;
+    case -RAY_DATE: case -RAY_TIME:
+        return a->i32 == b->i32;
+    case -RAY_TIMESTAMP:
+        return a->i64 == b->i64;
+    case -RAY_GUID: {
+        const uint8_t* ga = a->obj ? (const uint8_t*)ray_data(a->obj) : (const uint8_t*)ray_data((ray_t*)a);
+        const uint8_t* gb = b->obj ? (const uint8_t*)ray_data(b->obj) : (const uint8_t*)ray_data((ray_t*)b);
+        return memcmp(ga, gb, 16) == 0;  /* 16 GUID bytes, read from ->obj when set, else from the atom's own data */
+    }
+    case -RAY_STR:
+        return ray_str_len(a) == ray_str_len(b) &&
+               memcmp(ray_str_ptr(a), ray_str_ptr(b), ray_str_len(a)) == 0;
+    default:
+        /* Positive type tag (vector): same length and byte-identical element storage */
+        if (a->type > 0 && a->type == b->type && a->len == b->len) {
+            int esz = ray_elem_size(a->type);
+            return memcmp(ray_data(a), ray_data(b), (size_t)(a->len * esz)) == 0;
+        }
+        return 0;
+    }
+}
+
+/* Forward declaration */
+ray_t* list_to_typed_vec(ray_t* list, int8_t orig_vec_type);
+
+/* (distinct x) — remove duplicates. Dispatches on type:
+ *   table  → delegated to ray_table_distinct_fn
+ *   vector → one row per distinct value; SYM/GUID/STR keep first-occurrence order, all other types come back sorted by value
+ *   string → unique bytes, sorted ascending; boxed list → duplicates dropped, then atoms moved ahead of non-atoms */
+ray_t* ray_distinct_fn(ray_t* x) {
+    if (ray_is_lazy(x)) x = ray_lazy_materialize(x);
+
+    /* Table distinct: dispatch to table-specific implementation */
+    if (x->type == RAY_TABLE)
+        return ray_table_distinct_fn(x);
+
+    /* String distinct: unique chars, sorted */
+    if (ray_is_atom(x) && (-x->type) == RAY_STR) {
+        const char* sp = ray_str_ptr(x);
+        size_t slen = ray_str_len(x);
+        if (slen == 0) { ray_retain(x); return x; }  /* empty string: return the input itself, retained */
+        char uniq[256];
+        int nu = 0;
+        for (size_t i = 0; i < slen; i++) {
+            int dup = 0;
+            for (int j = 0; j < nu; j++) { if (uniq[j] == sp[i]) { dup = 1; break; } }
+            if (!dup && nu < 256) uniq[nu++] = sp[i];
+        }
+        /* Sort ascending by unsigned byte value */
+        for (int i = 0; i < nu - 1; i++)
+            for (int j = i + 1; j < nu; j++)
+                if ((unsigned char)uniq[i] > (unsigned char)uniq[j]) { char t = uniq[i]; uniq[i] = uniq[j]; uniq[j] = t; }
+        return ray_str(uniq, (size_t)nu);
+    }
+
+    /* Typed vector path: collect the row index of each first occurrence
+     * via the hash set, optionally sort those indices by value, then
+     * gather the surviving rows into the result. */
+    if (ray_is_vec(x)) {
+        int64_t len = ray_len(x);
+        if (len == 0) { ray_retain(x); return x; }
+
+        int64_t idx_stack[256];  /* small inputs keep the index buffer on the stack */
+        int64_t* idx = (len <= 256) ? idx_stack : (int64_t*)ray_sys_alloc((size_t)len * sizeof(int64_t));
+        if (!idx) return ray_error("oom", NULL);
+
+        hashset_t hs;
+        if (!hashset_init(&hs, x, len)) {
+            if (idx != idx_stack) ray_sys_free(idx);
+            return ray_error("oom", NULL);
+        }
+        int64_t count = 0;
+        for (int64_t i = 0; i < len; i++) {
+            if (hashset_insert(&hs, i)) idx[count++] = i;
+        }
+        hashset_destroy(&hs);
+
+        /* Every type except SYM/GUID/STR gets its unique indices sorted by
+         * value (qsort through distinct_sort_indices). */
+        if (x->type != RAY_SYM && x->type != RAY_GUID && x->type != RAY_STR && count > 1) {
+            distinct_sort_indices(x, idx, count);
+        }
+
+        ray_t* result = gather_by_idx(x, idx, count);
+        if (idx != idx_stack) ray_sys_free(idx);
+        return result;
+    }
+
+    ray_t* _bx = NULL;
+    x = unbox_vec_arg(x, &_bx);
+    if (RAY_IS_ERR(x)) return x;
+    if (!is_list(x)) { if (_bx) ray_release(_bx); return ray_error("type", NULL); }
+    int64_t len = ray_len(x);
+    if (len == 0) { if (_bx) ray_release(_bx); ray_retain(x); return x; }
+    ray_t** elems = (ray_t**)ray_data(x);
+
+    ray_t* result = ray_alloc(len * sizeof(ray_t*));
+    if (!result) { if (_bx) ray_release(_bx); return ray_error("oom", NULL); }
+    result->type = RAY_LIST;
+    ray_t** out = (ray_t**)ray_data(result);
+    int64_t count = 0;
+
+    for (int64_t i = 0; i < len; i++) {  /* boxed path: linear scan of the kept elements using atom_eq */
+        int dup = 0;
+        for (int64_t j = 0; j < count; j++) {
+            if (atom_eq(out[j], elems[i])) { dup = 1; break; }
+        }
+        if (!dup) {
+            ray_retain(elems[i]);
+            out[count++] = elems[i];
+        }
+    }
+    result->len = count;
+    /* Reorder so atoms come before non-atoms (swap-based pass) */
+    for (int64_t i = 0; i < count - 1; i++) {
+        for (int64_t j = i + 1; j < count; j++) {
+            int ai = ray_is_atom(out[i]);
+            int aj = ray_is_atom(out[j]);
+            if (!ai && aj) {
+                ray_t* tmp = out[i]; out[i] = out[j]; out[j] = tmp;
+            }
+        }
+    }
+    if (_bx) ray_release(_bx);
+    return result;
+}
+
+/* (in val vec) — membership test; returns a bool atom, a bool vector, or a list of per-element results depending on val */
+ray_t* ray_in_fn(ray_t* val, ray_t* vec) {
+    if (ray_is_lazy(val)) val = ray_lazy_materialize(val);
+    if (ray_is_lazy(vec)) vec = ray_lazy_materialize(vec);
+    /* STR in STR: for each char of val, check membership in vec string (result: one bool per char of val) */
+    if (ray_is_atom(val) && (-val->type) == RAY_STR && ray_is_atom(vec) && (-vec->type) == RAY_STR) {
+        const char* vp = ray_str_ptr(val);
+        size_t vlen = ray_str_len(val);
+        const char* sp = ray_str_ptr(vec);
+        size_t slen = ray_str_len(vec);
+        ray_t* result = ray_vec_new(RAY_BOOL, (int64_t)vlen);
+        if (RAY_IS_ERR(result)) return result;
+        result->len = (int64_t)vlen;
+        bool* out = (bool*)ray_data(result);
+        for (size_t i = 0; i < vlen; i++) {
+            out[i] = false;
+            for (size_t j = 0; j < slen; j++) {
+                if (vp[i] == sp[j]) { out[i] = true; break; }
+            }
+        }
+        return result;
+    }
+    /* Scalar in scalar: equality check (string pairs were already handled above) */
+    if (ray_is_atom(val) && ray_is_atom(vec))
+        return make_bool(atom_eq(val, vec) ? 1 : 0);
+    /* STR in LIST: for each char of val, check membership in list elements */
+    if (ray_is_atom(val) && (-val->type) == RAY_STR && (vec->type == RAY_LIST)) {
+        const char* vp = ray_str_ptr(val);
+        size_t vlen = ray_str_len(val);
+        ray_t* result = ray_vec_new(RAY_BOOL, (int64_t)vlen);
+        if (RAY_IS_ERR(result)) return result;
+        result->len = (int64_t)vlen;
+        bool* out_b = (bool*)ray_data(result);
+        ray_t** list_elems = (ray_t**)ray_data(vec);
+        int64_t list_len = ray_len(vec);
+        for (size_t i = 0; i < vlen; i++) {
+            out_b[i] = false;
+            ray_t* ch = ray_str(&vp[i], 1); /* temporary 1-char string so atom_eq can compare it with list items; NOTE(review): result is not error-checked */
+            for (int64_t j = 0; j < list_len; j++) {
+                if (atom_eq(ch, list_elems[j])) { out_b[i] = true; break; }
+            }
+            ray_release(ch);
+        }
+        return result;
+    }
+    /* Collection val (non-atom): map `in` over each element */
+    if (is_collection(val) && !ray_is_atom(val)) {
+        int64_t vlen = ray_len(val);
+        if (vlen == 0) {
+            /* Empty collection → return empty list */
+            return ray_list_new(0);
+        }
+        /* Hash-based fast path: both sides are typed vecs of compatible
+         * shape and the result is a row-aligned bool vec.  Build a
+         * hashset over `vec` once, probe per element of `val`.  Was
+         * O(len(val)×len(vec)); now O(len(val)+len(vec)). */
+        if (ray_is_vec(val) && ray_is_vec(vec)) {
+            ray_t* result = ray_vec_new(RAY_BOOL, vlen);
+            if (RAY_IS_ERR(result)) return result;
+            result->len = vlen;
+            bool* out = (bool*)ray_data(result);
+            hashset_t hs;
+            if (!hashset_init(&hs, vec, vec->len)) {
+                ray_release(result);
+                return ray_error("oom", NULL);
+            }
+            for (int64_t j = 0; j < vec->len; j++) hashset_insert(&hs, j);
+            int8_t val_type = val->type;
+            void*  val_data = ray_data(val);
+            for (int64_t i = 0; i < vlen; i++) {
+                out[i] = hashset_find_xrow(&hs, val, i, val_type, val_data) != HS_EMPTY;
+            }
+            hashset_destroy(&hs);
+            return result;
+        }
+        /* Probe the first element: its result decides between a bool vector and a list of results */
+        int alloc0 = 0;
+        ray_t* e0 = collection_elem(val, 0, &alloc0);
+        if (RAY_IS_ERR(e0)) return e0;
+        ray_t* r0 = ray_in_fn(e0, vec);
+        if (alloc0) ray_release(e0);
+        if (RAY_IS_ERR(r0)) return r0;
+        if (ray_is_atom(r0) && r0->type == -RAY_BOOL) {
+            /* First result is a scalar bool — collect into a typed bool vector */
+            ray_t* result = ray_vec_new(RAY_BOOL, vlen);
+            if (RAY_IS_ERR(result)) { ray_release(r0); return result; }
+            result->len = vlen;
+            bool* out = (bool*)ray_data(result);
+            out[0] = r0->b8;
+            ray_release(r0);
+            for (int64_t i = 1; i < vlen; i++) {
+                int alloc = 0;
+                ray_t* elem = collection_elem(val, i, &alloc);
+                if (RAY_IS_ERR(elem)) { ray_release(result); return elem; }
+                ray_t* r = ray_in_fn(elem, vec);
+                if (alloc) ray_release(elem);
+                if (RAY_IS_ERR(r)) { ray_release(result); return r; }
+                out[i] = r->b8; /* NOTE(review): unlike r0, r is not re-checked to be a bool atom — confirm mixed lists cannot reach here */
+                ray_release(r);
+            }
+            return result;
+        } else {
+            /* First result is not a scalar bool — collect every result into a list */
+            ray_t* result = ray_list_new(vlen);
+            if (RAY_IS_ERR(result)) { ray_release(r0); return result; }
+            result = ray_list_append(result, r0);
+            ray_release(r0);
+            if (RAY_IS_ERR(result)) return result;
+            for (int64_t i = 1; i < vlen; i++) {
+                int alloc = 0;
+                ray_t* elem = collection_elem(val, i, &alloc);
+                if (RAY_IS_ERR(elem)) { ray_release(result); return elem; }
+                ray_t* r = ray_in_fn(elem, vec);
+                if (alloc) ray_release(elem);
+                if (RAY_IS_ERR(r)) { ray_release(result); return r; }
+                result = ray_list_append(result, r);
+                ray_release(r);
+                if (RAY_IS_ERR(result)) return result;
+            }
+            return result;
+        }
+    }
+    /* Atom val in typed vector: linear scan, one atom_eq per element */
+    if (ray_is_vec(vec) && ray_is_atom(val)) {
+        int64_t len = vec->len;
+        bool has_nulls = (vec->attrs & RAY_ATTR_HAS_NULLS) != 0;
+        for (int64_t i = 0; i < len; i++) {
+            if (has_nulls && ray_vec_is_null(vec, i)) { /* a null slot matches only a null val */
+                if (RAY_ATOM_IS_NULL(val)) return make_bool(1);
+                continue;
+            }
+            int alloc = 0;
+            ray_t* elem = collection_elem(vec, i, &alloc);
+            int eq = atom_eq(val, elem);
+            if (alloc) ray_release(elem);
+            if (eq) return make_bool(1);
+        }
+        return make_bool(0);
+    }
+    ray_t* _bx = NULL; /* set by unbox_vec_arg; released on every exit below when non-NULL */
+    vec = unbox_vec_arg(vec, &_bx);
+    if (RAY_IS_ERR(vec)) return vec;
+    if (!is_list(vec)) { if (_bx) ray_release(_bx); return ray_error("type", NULL); }
+    int64_t len = ray_len(vec);
+    ray_t** elems = (ray_t**)ray_data(vec);
+    for (int64_t i = 0; i < len; i++) {
+        if (atom_eq(val, elems[i])) { if (_bx) ray_release(_bx); return make_bool(1); }
+    }
+    if (_bx) ray_release(_bx);
+    return make_bool(0);
+}
+
+/* Helper: turn a boxed list result into a typed vector of `orig_vec_type`; `list` is consumed unless it is returned as-is */
+ray_t* list_to_typed_vec(ray_t* list, int8_t orig_vec_type) {
+    if (!list || RAY_IS_ERR(list) || list->type != RAY_LIST) return list; /* NULL, errors and non-lists pass through untouched */
+    int64_t count = list->len;
+    /* For SYM and STR types, only convert when empty (to get [] instead of ()) */
+    if (orig_vec_type == RAY_SYM || orig_vec_type == RAY_STR) {
+        if (count == 0) {
+            ray_release(list);
+            return ray_vec_new(orig_vec_type, 0);
+        }
+        return list; /* Keep as boxed list for non-empty SYM/STR */
+    }
+    ray_t* vec = ray_vec_new(orig_vec_type, count);
+    if (RAY_IS_ERR(vec)) { ray_release(list); return vec; } /* every converting path drops `list`; previously it leaked here */
+    vec->len = count;
+    ray_t** elems = (ray_t**)ray_data(list);
+    for (int64_t i = 0; i < count; i++)
+        store_typed_elem(vec, i, elems[i]); /* writes boxed element i into slot i of the typed vector */
+    /* Release the list (ray_release_owned_refs handles child elements) */
+    ray_release(list);
+    return vec;
+}
+
+/* (vec_elem_in helper removed — replaced by hashset-based lookups in
+ * except/union/sect/in/distinct.) */
+
+/* (except vec1 vec2) — elements of vec1 that do not occur in vec2; vec2 may be a collection or a single atom */
+ray_t* ray_except_fn(ray_t* vec1, ray_t* vec2) {
+    if (ray_is_lazy(vec1)) vec1 = ray_lazy_materialize(vec1);
+    if (ray_is_lazy(vec2)) vec2 = ray_lazy_materialize(vec2);
+
+    /* Typed vector path: hash-based.  Was O(len1×len2); now O(len1+len2). */
+    if (ray_is_vec(vec1) && (ray_is_vec(vec2) || ray_is_atom(vec2))) {
+        int64_t len1 = vec1->len;
+        int64_t idx_stack[256]; /* surviving row indices; heap buffer only when vec1 has more than 256 rows */
+        int64_t* idx = (len1 <= 256) ? idx_stack : (int64_t*)ray_sys_alloc((size_t)len1 * sizeof(int64_t));
+        if (!idx) return ray_error("oom", NULL);
+        int64_t count = 0;
+        if (ray_is_atom(vec2)) {
+            /* Scalar: filter out matching elements (single compare per row). */
+            for (int64_t i = 0; i < len1; i++) {
+                int alloc = 0;
+                ray_t* elem = collection_elem(vec1, i, &alloc);
+                int eq = atom_eq(elem, vec2);
+                if (alloc) ray_release(elem);
+                if (!eq) idx[count++] = i;
+            }
+        } else {
+            hashset_t hs;
+            if (!hashset_init(&hs, vec2, vec2->len)) {
+                if (idx != idx_stack) ray_sys_free(idx);
+                return ray_error("oom", NULL);
+            }
+            for (int64_t j = 0; j < vec2->len; j++) hashset_insert(&hs, j);
+            int8_t v1_type = vec1->type;
+            void*  v1_data = ray_data(vec1);
+            for (int64_t i = 0; i < len1; i++) {
+                if (hashset_find_xrow(&hs, vec1, i, v1_type, v1_data) == HS_EMPTY)
+                    idx[count++] = i;
+            }
+            hashset_destroy(&hs);
+        }
+        ray_t* result = gather_by_idx(vec1, idx, count);
+        if (idx != idx_stack) ray_sys_free(idx);
+        return result;
+    }
+
+    /* Boxed list fallback */
+    int8_t orig_type = ray_is_vec(vec1) ? vec1->type : -1; /* remembered so an empty result can stay typed */
+    ray_t *_bx1 = NULL, *_bx2 = NULL;
+    vec1 = unbox_vec_arg(vec1, &_bx1);
+    if (RAY_IS_ERR(vec1)) return vec1;
+    vec2 = unbox_vec_arg(vec2, &_bx2);
+    if (RAY_IS_ERR(vec2)) { if (_bx1) ray_release(_bx1); return vec2; }
+    if (!is_list(vec1) || (!ray_is_atom(vec2) && !is_list(vec2))) { if (_bx1) ray_release(_bx1); if (_bx2) ray_release(_bx2); return ray_error("type", NULL); } /* vec2 must be an atom or a list: its payload is read as ray_t** below (same check as union/sect) */
+    int64_t len1 = ray_len(vec1);
+    ray_t** e1 = (ray_t**)ray_data(vec1);
+
+    ray_t* result = ray_alloc(len1 * sizeof(ray_t*)); /* worst case: nothing is removed */
+    if (!result) { if (_bx1) ray_release(_bx1); if (_bx2) ray_release(_bx2); return ray_error("oom", NULL); }
+    result->type = RAY_LIST;
+    ray_t** out = (ray_t**)ray_data(result);
+    int64_t count = 0;
+
+    if (ray_is_atom(vec2)) {
+        for (int64_t i = 0; i < len1; i++) {
+            if (!atom_eq(e1[i], vec2)) { ray_retain(e1[i]); out[count++] = e1[i]; }
+        }
+    } else {
+        int64_t len2 = ray_len(vec2);
+        ray_t** e2 = (ray_t**)ray_data(vec2);
+        for (int64_t i = 0; i < len1; i++) {
+            int found = 0;
+            for (int64_t j = 0; j < len2; j++) {
+                if (atom_eq(e1[i], e2[j])) { found = 1; break; }
+            }
+            if (!found) { ray_retain(e1[i]); out[count++] = e1[i]; }
+        }
+    }
+    result->len = count;
+    if (_bx1) ray_release(_bx1);
+    if (_bx2) ray_release(_bx2);
+    if (orig_type >= 0 && count == 0) { ray_release(result); return ray_vec_new(orig_type, 0); } /* empty result of a typed input: typed empty vector, not () */
+    return result;
+}
+
+/* (union vec1 vec2) — all of vec1, followed by the vec2 elements that vec1 does not already hold */
+ray_t* ray_union_fn(ray_t* vec1, ray_t* vec2) {
+    if (ray_is_lazy(vec1)) vec1 = ray_lazy_materialize(vec1);
+    if (ray_is_lazy(vec2)) vec2 = ray_lazy_materialize(vec2);
+
+    /* Two typed vectors: hash vec1 once, probe each vec2 row, append the misses. */
+    if (ray_is_vec(vec1) && ray_is_vec(vec2)) {
+        int64_t n2 = vec2->len;
+        int64_t inline_rows[256];
+        int64_t* rows = (n2 > 256) ? (int64_t*)ray_sys_alloc((size_t)n2 * sizeof(int64_t)) : inline_rows;
+        if (!rows) return ray_error("oom", NULL);
+        hashset_t seen;
+        if (!hashset_init(&seen, vec1, vec1->len)) {
+            if (rows != inline_rows) ray_sys_free(rows);
+            return ray_error("oom", NULL);
+        }
+        for (int64_t k = 0; k < vec1->len; k++) hashset_insert(&seen, k);
+        int8_t probe_type = vec2->type;
+        void*  probe_data = ray_data(vec2);
+        int64_t fresh = 0;
+        for (int64_t r = 0; r < n2; r++) {
+            if (hashset_find_xrow(&seen, vec2, r, probe_type, probe_data) != HS_EMPTY) continue;
+            rows[fresh++] = r;
+        }
+        hashset_destroy(&seen);
+        ray_t* tail = gather_by_idx(vec2, rows, fresh);
+        if (rows != inline_rows) ray_sys_free(rows);
+        if (RAY_IS_ERR(tail)) return tail;
+        ray_t* merged = ray_concat_fn(vec1, tail);
+        ray_release(tail);
+        return merged;
+    }
+
+    /* Anything else: work on boxed lists and compare element by element. */
+    ray_t *box1 = NULL, *box2 = NULL;
+    vec1 = unbox_vec_arg(vec1, &box1);
+    if (RAY_IS_ERR(vec1)) return vec1;
+    vec2 = unbox_vec_arg(vec2, &box2);
+    if (RAY_IS_ERR(vec2)) { if (box1) ray_release(box1); return vec2; }
+    if (!(is_list(vec1) && is_list(vec2))) { if (box1) ray_release(box1); if (box2) ray_release(box2); return ray_error("type", NULL); }
+    int64_t n1 = ray_len(vec1), n2 = ray_len(vec2);
+    ray_t** lhs = (ray_t**)ray_data(vec1);
+    ray_t** rhs = (ray_t**)ray_data(vec2);
+
+    ray_t* merged = ray_alloc((n1 + n2) * sizeof(ray_t*));
+    if (!merged) { if (box1) ray_release(box1); if (box2) ray_release(box2); return ray_error("oom", NULL); }
+    merged->type = RAY_LIST;
+    ray_t** out = (ray_t**)ray_data(merged);
+    int64_t used = 0;
+    for (; used < n1; used++) { ray_retain(lhs[used]); out[used] = lhs[used]; }
+    for (int64_t i = 0; i < n2; i++) {
+        /* Scan everything emitted so far; stop at the first equal element. */
+        int64_t j = 0;
+        while (j < used && !atom_eq(out[j], rhs[i])) j++;
+        if (j == used) { ray_retain(rhs[i]); out[used++] = rhs[i]; }
+    }
+    merged->len = used;
+    if (box1) ray_release(box1);
+    if (box2) ray_release(box2);
+    return merged;
+}
+
+/* (sect vec1 vec2) — intersection: the elements of vec1 that also occur in vec2 */
+ray_t* ray_sect_fn(ray_t* vec1, ray_t* vec2) {
+    if (ray_is_lazy(vec1)) vec1 = ray_lazy_materialize(vec1);
+    if (ray_is_lazy(vec2)) vec2 = ray_lazy_materialize(vec2);
+
+    /* Two typed vectors: hash vec2 once, then keep the vec1 rows that hit. */
+    if (ray_is_vec(vec1) && ray_is_vec(vec2)) {
+        int64_t n1 = vec1->len;
+        int64_t inline_rows[256];
+        int64_t* rows = (n1 > 256) ? (int64_t*)ray_sys_alloc((size_t)n1 * sizeof(int64_t)) : inline_rows;
+        if (!rows) return ray_error("oom", NULL);
+        hashset_t seen;
+        if (!hashset_init(&seen, vec2, vec2->len)) {
+            if (rows != inline_rows) ray_sys_free(rows);
+            return ray_error("oom", NULL);
+        }
+        for (int64_t k = 0; k < vec2->len; k++) hashset_insert(&seen, k);
+        int8_t probe_type = vec1->type;
+        void*  probe_data = ray_data(vec1);
+        int64_t kept = 0;
+        for (int64_t r = 0; r < n1; r++) {
+            if (hashset_find_xrow(&seen, vec1, r, probe_type, probe_data) == HS_EMPTY) continue;
+            rows[kept++] = r;
+        }
+        hashset_destroy(&seen);
+        ray_t* picked = gather_by_idx(vec1, rows, kept);
+        if (rows != inline_rows) ray_sys_free(rows);
+        return picked;
+    }
+
+    /* Anything else: work on boxed lists and compare element by element. */
+    ray_t *box1 = NULL, *box2 = NULL;
+    vec1 = unbox_vec_arg(vec1, &box1);
+    if (RAY_IS_ERR(vec1)) return vec1;
+    vec2 = unbox_vec_arg(vec2, &box2);
+    if (RAY_IS_ERR(vec2)) { if (box1) ray_release(box1); return vec2; }
+    if (!(is_list(vec1) && is_list(vec2))) { if (box1) ray_release(box1); if (box2) ray_release(box2); return ray_error("type", NULL); }
+    int64_t n1 = ray_len(vec1);
+    ray_t** lhs = (ray_t**)ray_data(vec1);
+    ray_t** rhs = (ray_t**)ray_data(vec2);
+    int64_t n2 = ray_len(vec2);
+
+    ray_t* common = ray_alloc(n1 * sizeof(ray_t*));
+    if (!common) { if (box1) ray_release(box1); if (box2) ray_release(box2); return ray_error("oom", NULL); }
+    common->type = RAY_LIST;
+    ray_t** out = (ray_t**)ray_data(common);
+    int64_t kept = 0;
+    for (int64_t i = 0; i < n1; i++) {
+        for (int64_t j = 0; j < n2; j++) {
+            if (atom_eq(lhs[i], rhs[j])) { ray_retain(lhs[i]); out[kept++] = lhs[i]; break; }
+        }
+    }
+    common->len = kept;
+    if (box1) ray_release(box1);
+    if (box2) ray_release(box2);
+    return common;
+}
+
+/* (take vec n) — first n elements (positive) or last |n| (negative), wrapping past the end; n = [start amount] slices instead */
+ray_t* ray_take_fn(ray_t* vec, ray_t* n_obj) {
+    if (ray_is_lazy(vec)) vec = ray_lazy_materialize(vec);
+    /* N must be an integer (or 2-elem i64 vector for range-take).  Reject
+     * floats up front: as_i64(f64) reads the bit pattern and would cause
+     * e.g. (take 1.0 2.0) to attempt a 4.6-quintillion-element allocation
+     * and surface as "oom" — misleading for what is really a type error. */
+    if (ray_is_atom(n_obj) && n_obj->type == -RAY_F64)
+        return ray_error("type", NULL);
+    /* Range take: (take collection [start amount]) — slice from start for amount elements */
+    if (ray_is_vec(n_obj) && n_obj->type == RAY_I64 && ray_len(n_obj) == 2) {
+        int64_t* idx = (int64_t*)ray_data(n_obj);
+        int64_t start = idx[0];
+        int64_t amount = idx[1];
+        if (amount < 0) return ray_error("length", NULL);
+
+        /* Table range take: apply the same range to every column */
+        if (vec->type == RAY_TABLE) {
+            int64_t ncols = ray_table_ncols(vec);
+            ray_t* result = ray_table_new(ncols);
+            if (RAY_IS_ERR(result)) return result;
+            for (int64_t i = 0; i < ncols; i++) {
+                ray_t* col = ray_table_get_col_idx(vec, i);
+                int64_t name_id = ray_table_col_name(vec, i);
+                ray_t* taken = ray_take_fn(col, n_obj);
+                if (RAY_IS_ERR(taken)) { ray_release(result); return taken; }
+                result = ray_table_add_col(result, name_id, taken);
+                if (RAY_IS_ERR(result)) { ray_release(taken); return result; }
+            }
+            return result;
+        }
+
+        /* String range take */
+        if (ray_is_atom(vec) && (-vec->type) == RAY_STR) {
+            const char* s = ray_str_ptr(vec);
+            int64_t slen = (int64_t)ray_str_len(vec);
+            if (start < 0) start = slen + start;
+            if (start < 0) start = 0;
+            if (start >= slen) return ray_str("", 0);
+            int64_t avail = slen - start; /* > 0 here; clamp on this instead of start + amount, which can overflow */
+            int64_t count = amount < avail ? amount : avail;
+            return ray_str(s + start, (size_t)count);
+        }
+
+        /* Typed vector range take */
+        if (ray_is_vec(vec)) {
+            int64_t len = ray_len(vec);
+            if (start < 0) start = len + start;
+            if (start < 0) start = 0;
+            if (start >= len) return ray_vec_new(vec->type, 0);
+            /* Clamp on the remaining length; start + amount could overflow int64_t. */
+            int64_t avail = len - start;
+            int64_t count = amount < avail ? amount : avail;
+            int8_t vtype = vec->type;
+            int esz = ray_elem_size(vtype);
+            ray_t* result = ray_vec_new(vtype, count);
+            if (RAY_IS_ERR(result)) return result;
+            result->len = count;
+            memcpy(ray_data(result), (char*)ray_data(vec) + start * esz, (size_t)(count * esz));
+            /* Propagate null bitmap — check parent's flag for slices */
+            bool has_nulls = (vec->attrs & RAY_ATTR_HAS_NULLS) ||
+                             ((vec->attrs & RAY_ATTR_SLICE) && vec->slice_parent &&
+                              (vec->slice_parent->attrs & RAY_ATTR_HAS_NULLS));
+            if (has_nulls) {
+                for (int64_t i = 0; i < count; i++)
+                    if (ray_vec_is_null(vec, start + i))
+                        ray_vec_set_null(result, i, true);
+            }
+            return result;
+        }
+
+        /* Dict range take — slice both keys and vals in parallel. */
+        if (vec->type == RAY_DICT) {
+            ray_t* keys = ray_dict_keys(vec);
+            ray_t* vals = ray_dict_vals(vec);
+            int64_t len = keys ? keys->len : 0;
+            if (start < 0) start = len + start;
+            if (start < 0) start = 0;
+            /* Clamp on the pairs left after start (none once start >= len);
+             * forming start + amount first could overflow int64_t. */
+            int64_t avail = start < len ? len - start : 0;
+            int64_t count = amount < avail ? amount : avail;
+
+            ray_t* nk = ray_vec_slice(keys, start, count);
+            if (!nk || RAY_IS_ERR(nk)) return nk ? nk : ray_error("oom", NULL);
+
+            ray_t* nv;
+            if (vals && vals->type == RAY_LIST) {
+                nv = ray_alloc(count * sizeof(ray_t*));
+                if (!nv) { ray_release(nk); return ray_error("oom", NULL); }
+                nv->type = RAY_LIST;
+                nv->len  = count;
+                ray_t** vsrc = (ray_t**)ray_data(vals);
+                ray_t** vdst = (ray_t**)ray_data(nv);
+                for (int64_t i = 0; i < count; i++) {
+                    vdst[i] = vsrc[start + i];
+                    if (vdst[i]) ray_retain(vdst[i]);
+                }
+            } else {
+                nv = ray_vec_slice(vals, start, count);
+                if (!nv || RAY_IS_ERR(nv)) { ray_release(nk); return nv ? nv : ray_error("oom", NULL); }
+            }
+            return ray_dict_new(nk, nv);
+        }
+
+        /* Boxed list range take */
+        if (vec->type == RAY_LIST) {
+            int64_t len = ray_len(vec);
+            if (start < 0) start = len + start;
+            if (start < 0) start = 0;
+            if (start >= len) {
+                ray_t* result = ray_alloc(0);
+                if (!result) return ray_error("oom", NULL); /* was dereferenced unchecked */
+                result->type = RAY_LIST; result->len = 0;
+                return result;
+            }
+            /* Clamp on the remaining length; start + amount could overflow int64_t. */
+            int64_t avail = len - start;
+            int64_t count = amount < avail ? amount : avail;
+            ray_t** elems = (ray_t**)ray_data(vec);
+            ray_t* result = ray_alloc(count * sizeof(ray_t*));
+            if (!result) return ray_error("oom", NULL);
+            result->type = RAY_LIST;
+            result->len = count;
+            ray_t** out = (ray_t**)ray_data(result);
+            for (int64_t i = 0; i < count; i++) {
+                ray_retain(elems[start + i]);
+                out[i] = elems[start + i];
+            }
+            return result;
+        }
+
+        return ray_error("type", NULL);
+    }
+    /* Char take: (take 'a' n) → string of n copies of char */
+    if (ray_is_atom(vec) && vec->type == -RAY_STR && ray_str_len(vec) == 1 && ray_is_atom(n_obj) && is_numeric(n_obj)) {
+        int64_t n = as_i64(n_obj);
+        char buf[8192];
+        if (n > (int64_t)sizeof(buf) || n < -(int64_t)sizeof(buf)) return ray_error("limit", NULL); /* also rejects INT64_MIN, whose negation overflows */
+        int64_t count = n < 0 ? -n : n;
+        for (int64_t i = 0; i < count; i++) buf[i] = vec->sdata[0];
+        return ray_str(buf, (size_t)count);
+    }
+    /* Scalar take: (take value n) → repeat value n times */
+    if (ray_is_atom(vec) && (-vec->type) != RAY_STR && ray_is_atom(n_obj) && is_numeric(n_obj)) {
+        int64_t n = as_i64(n_obj);
+        int64_t count = n < 0 ? (n == INT64_MIN ? INT64_MAX : -n) : n; /* saturate: -INT64_MIN is undefined */
+        int8_t vtype = -(vec->type);
+        ray_t* result = ray_vec_new(vtype, count);
+        if (RAY_IS_ERR(result)) return result;
+        result->len = count;
+        for (int64_t i = 0; i < count; i++)
+            store_typed_elem(result, i, vec);
+        return result;
+    }
+    /* String take: (take "hello" 3) → "hel", with wrapping extension */
+    if (ray_is_atom(vec) && (-vec->type) == RAY_STR && ray_is_atom(n_obj) && is_numeric(n_obj)) {
+        const char* s = ray_str_ptr(vec);
+        int64_t slen = (int64_t)ray_str_len(vec);
+        int64_t n = as_i64(n_obj);
+        char buf[8192];
+        if (n > (int64_t)sizeof(buf) || n < -(int64_t)sizeof(buf)) return ray_error("limit", NULL); /* also rejects INT64_MIN, whose negation overflows */
+        int64_t abs_n = n < 0 ? -n : n;
+        if (slen == 0) {
+            return ray_str("", 0);
+        }
+        if (n >= 0) {
+            for (int64_t i = 0; i < abs_n; i++) buf[i] = s[i % slen];
+        } else {
+            for (int64_t i = 0; i < abs_n; i++) {
+                int64_t si = slen - (abs_n - i) % slen;
+                if (si == slen) si = 0;
+                buf[i] = s[si];
+            }
+        }
+        return ray_str(buf, (size_t)abs_n);
+    }
+    /* Table take: apply take to each column */
+    if (vec->type == RAY_TABLE && is_numeric(n_obj)) {
+        int64_t ncols = ray_table_ncols(vec);
+        ray_t* result = ray_table_new(ncols);
+        if (RAY_IS_ERR(result)) return result;
+        for (int64_t i = 0; i < ncols; i++) {
+            ray_t* col = ray_table_get_col_idx(vec, i);
+            int64_t name_id = ray_table_col_name(vec, i);
+            ray_t* taken = ray_take_fn(col, n_obj);
+            if (RAY_IS_ERR(taken)) { ray_release(result); return taken; }
+            result = ray_table_add_col(result, name_id, taken);
+            if (RAY_IS_ERR(result)) { ray_release(taken); return result; }
+        }
+        return result;
+    }
+    /* Dict take: apply take to keys and vals in parallel.  Wrapping for
+     * |n| > pair count works the same as for typed vectors. */
+    if (vec->type == RAY_DICT && is_numeric(n_obj)) {
+        ray_t* keys = ray_dict_keys(vec);
+        ray_t* vals = ray_dict_vals(vec);
+        if (!keys) return ray_error("type", NULL);
+        ray_t* nk = ray_take_fn(keys, n_obj);
+        if (RAY_IS_ERR(nk)) return nk;
+        ray_t* nv = vals ? ray_take_fn(vals, n_obj) : ray_list_new(0);
+        if (!nv || RAY_IS_ERR(nv)) { ray_release(nk); return nv ? nv : ray_error("oom", NULL); }
+        return ray_dict_new(nk, nv);
+    }
+    /* Typed vector take with extension */
+    if (ray_is_vec(vec) && is_numeric(n_obj)) {
+        int64_t len = ray_len(vec);
+        int64_t n = as_i64(n_obj);
+        int64_t abs_n = n < 0 ? (n == INT64_MIN ? INT64_MAX : -n) : n; /* saturate: -INT64_MIN is undefined */
+        int8_t vtype = vec->type;
+        int esz = ray_elem_size(vtype);
+        ray_t* result = ray_vec_new(vtype, abs_n);
+        if (RAY_IS_ERR(result)) return result;
+        result->len = abs_n;
+        char* src = (char*)ray_data(vec);
+        char* dst = (char*)ray_data(result);
+        if (len == 0) {
+            memset(dst, 0, (size_t)(abs_n * esz));
+        } else if (n >= 0 && abs_n > 0) {
+            /* Doubling tile-copy: O(log(abs_n/len)) memcpys instead of
+             * abs_n calls of esz bytes each.  Invariant: after every
+             * memcpy `copied` is a multiple of `len`, so dst[0..copied)
+             * holds a perfect tile and we can keep doubling from dst[0].
+             * The final partial copy is < copied so it stays within the
+             * already-tiled prefix. */
+            int64_t to_copy = abs_n < len ? abs_n : len;
+            memcpy(dst, src, (size_t)(to_copy * esz));
+            int64_t copied = to_copy;
+            while (copied + copied <= abs_n) {
+                memcpy(dst + copied * esz, dst, (size_t)(copied * esz));
+                copied *= 2;
+            }
+            int64_t remaining = abs_n - copied;
+            if (remaining > 0)
+                memcpy(dst + copied * esz, dst, (size_t)(remaining * esz));
+        } else if (n < 0) {
+            /* Negative: take from end with wrap */
+            for (int64_t i = 0; i < abs_n; i++) {
+                int64_t si = len - (abs_n - i) % len;
+                if (si == len) si = 0;
+                memcpy(dst + i * esz, src + si * esz, esz);
+            }
+        }
+        /* Propagate null bitmap — check parent's flag for slices */
+        bool has_nulls = len > 0 &&
+                         ((vec->attrs & RAY_ATTR_HAS_NULLS) ||
+                          ((vec->attrs & RAY_ATTR_SLICE) && vec->slice_parent &&
+                           (vec->slice_parent->attrs & RAY_ATTR_HAS_NULLS)));
+        if (has_nulls) {
+            if (n >= 0) {
+                for (int64_t i = 0; i < abs_n; i++)
+                    if (ray_vec_is_null(vec, i % len))
+                        ray_vec_set_null(result, i, true);
+            } else {
+                for (int64_t i = 0; i < abs_n; i++) {
+                    int64_t si = len - (abs_n - i) % len;
+                    if (si == len) si = 0;
+                    if (ray_vec_is_null(vec, si))
+                        ray_vec_set_null(result, i, true);
+                }
+            }
+        }
+        return result;
+    }
+    ray_t* _bx = NULL; /* set by unbox_vec_arg; released on every exit below when non-NULL */
+    vec = unbox_vec_arg(vec, &_bx);
+    if (RAY_IS_ERR(vec)) return vec;
+    if (!is_list(vec) || !is_numeric(n_obj))
+        { if (_bx) ray_release(_bx); return ray_error("type", NULL); }
+    int64_t len = ray_len(vec);
+    int64_t n = as_i64(n_obj);
+    ray_t** elems = (ray_t**)ray_data(vec);
+
+    if (n == INT64_MIN) { if (_bx) ray_release(_bx); return ray_error("limit", NULL); } /* -INT64_MIN is undefined; the wrapped size used to reach ray_alloc */
+    int64_t elem_count = n < 0 ? -n : n;
+    ray_t* result = ray_alloc(elem_count * sizeof(ray_t*));
+    if (!result) { if (_bx) ray_release(_bx); return ray_error("oom", NULL); }
+    result->type = RAY_LIST;
+    result->len = elem_count;
+    ray_t** out = (ray_t**)ray_data(result);
+    if (len == 0) {
+        result->len = 0;
+    } else if (n >= 0) {
+        for (int64_t i = 0; i < elem_count; i++) {
+            ray_retain(elems[i % len]);
+            out[i] = elems[i % len];
+        }
+    } else {
+        for (int64_t i = 0; i < elem_count; i++) {
+            int64_t si = len - (elem_count - i) % len;
+            if (si == len) si = 0;
+            ray_retain(elems[si]);
+            out[i] = elems[si];
+        }
+    }
+    if (_bx) ray_release(_bx);
+    return result;
+}
+
+/* (at vec idx) or (at table 'col) — index into a vector, list, string,
+ * dict or table.
+ *
+ * Dispatch order (first match wins):
+ *   1. table + symbol atom   -> the named column, retained for the caller;
+ *                               "domain" error when the column is absent
+ *   2. table + integer atom  -> dict {col-name: value} for that row;
+ *                               "domain" error when the row is out of range
+ *   3. dict                  -> ray_dict_get result, or a null I64 atom
+ *                               when the key is missing
+ *   4. string atom           -> 1-char string, or a string gathered from a
+ *                               collection of indices ("domain" when any
+ *                               index is out of range, "limit" above 8192)
+ *   5. collection of indices -> RAY_LIST of (at vec i), computed recursively
+ *   6. integer atom          -> element of a typed vector or list; an
+ *                               out-of-range index yields a typed null
+ * Anything else is a "type" error.  Only vec is lazily materialised. */
+ray_t* ray_at_fn(ray_t* vec, ray_t* idx) {
+    if (ray_is_lazy(vec)) vec = ray_lazy_materialize(vec);
+    /* Table column access by symbol key — return the typed vector directly */
+    if (vec->type == RAY_TABLE && idx->type == -RAY_SYM) {
+        ray_t* col = ray_table_get_col(vec, idx->i64);
+        if (!col) return ray_error("domain", NULL);
+        ray_retain(col);
+        return col;
+    }
+
+    /* Table row access by integer index: (at table 0) → {col1: val1, col2: val2} */
+    if (vec->type == RAY_TABLE && ray_is_atom(idx) &&
+        (idx->type == -RAY_I64 || idx->type == -RAY_I32 ||
+         idx->type == -RAY_I16 || idx->type == -RAY_U8)) {
+        int64_t row = as_i64(idx);
+        int64_t nrows = ray_table_nrows(vec);
+        if (row < 0 || row >= nrows) return ray_error("domain", NULL);
+        int64_t ncols = ray_table_ncols(vec);
+        /* Build a dict: keys SYM vec + vals LIST */
+        ray_t* keys = ray_sym_vec_new(RAY_SYM_W64, ncols);
+        if (RAY_IS_ERR(keys)) return keys;
+        ray_t* vals = ray_list_new(ncols);
+        if (RAY_IS_ERR(vals)) { ray_release(keys); return vals; }
+        for (int64_t c = 0; c < ncols; c++) {
+            int64_t key_id = ray_table_col_name(vec, c);
+            keys = ray_vec_append(keys, &key_id);
+            if (RAY_IS_ERR(keys)) { ray_release(vals); return keys; }
+            ray_t* col = ray_table_get_col_idx(vec, c);
+            int alloc = 0;
+            ray_t* val = collection_elem(col, row, &alloc);
+            if (RAY_IS_ERR(val)) { ray_release(keys); ray_release(vals); return val; }
+            vals = ray_list_append(vals, val);
+            /* The list holds its own reference; drop ours if we own one. */
+            if (alloc) ray_release(val);
+            if (RAY_IS_ERR(vals)) { ray_release(keys); return vals; }
+        }
+        return ray_dict_new(keys, vals);
+    }
+
+    /* Dict key access: (at dict key) → value or 0Nl if missing */
+    if (vec->type == RAY_DICT) {
+        ray_t* v = ray_dict_get(vec, idx);
+        if (v) return v;
+        return ray_typed_null(-RAY_I64); /* 0Nl for missing key */
+    }
+
+    /* String indexing: (at "hello" 1) → 'e', (at "hello" [0 4]) → "ho" */
+    if (ray_is_atom(vec) && (-vec->type) == RAY_STR) {
+        const char* s = ray_str_ptr(vec);
+        size_t slen = ray_str_len(vec);
+        if (is_collection(idx)) {
+            /* Multiple indices → build string from chars */
+            int64_t idxlen = ray_len(idx);
+            char buf[8192];
+            if ((size_t)idxlen > sizeof(buf)) return ray_error("limit", NULL);
+            for (int64_t j = 0; j < idxlen; j++) {
+                int alloc = 0;
+                ray_t* ie = collection_elem(idx, j, &alloc);
+                /* Propagate element-extraction failures instead of
+                 * decoding an error object as an index (same handling
+                 * as the vector-index branch below). */
+                if (RAY_IS_ERR(ie)) return ie;
+                int64_t k = as_i64(ie);
+                if (alloc) ray_release(ie);
+                if (k < 0 || (size_t)k >= slen) return ray_error("domain", NULL);
+                buf[j] = s[k];
+            }
+            return ray_str(buf, (size_t)idxlen);
+        }
+        int64_t i = as_i64(idx);
+        if (i < 0 || (size_t)i >= slen) return ray_error("domain", NULL);
+        /* Return 1-char string atom */
+        return ray_str(&s[i], 1);
+    }
+
+    /* Vector index: (at vec [i j k]) → vector of values */
+    if (is_collection(idx) && idx->type != -RAY_SYM) {
+        int64_t idxlen = ray_len(idx);
+        ray_t* result = ray_alloc(idxlen * sizeof(ray_t*));
+        if (!result) return ray_error("oom", NULL);
+        result->type = RAY_LIST;
+        result->len = idxlen;
+        ray_t** out = (ray_t**)ray_data(result);
+        for (int64_t j = 0; j < idxlen; j++) {
+            int alloc = 0;
+            ray_t* idx_elem = collection_elem(idx, j, &alloc);
+            if (RAY_IS_ERR(idx_elem)) {
+                for (int64_t k = 0; k < j; k++) ray_release(out[k]);
+                /* Slots [0, j) were just released by hand and [j, len)
+                 * were never written: hide them all before dropping the
+                 * list so its release cannot touch them again. */
+                result->len = 0;
+                ray_release(result);
+                return idx_elem;
+            }
+            ray_t* val = ray_at_fn(vec, idx_elem);
+            if (alloc) ray_release(idx_elem);
+            if (RAY_IS_ERR(val)) {
+                for (int64_t k = 0; k < j; k++) ray_release(out[k]);
+                result->len = 0; /* see above: no live slots remain */
+                ray_release(result);
+                return val;
+            }
+            out[j] = val;
+        }
+        return result;
+    }
+
+    if (idx->type != -RAY_I64 && idx->type != -RAY_I32 &&
+        idx->type != -RAY_I16 && idx->type != -RAY_U8)
+        return ray_error("type", NULL);
+    int64_t i = as_i64(idx);
+
+    /* Typed vector: extract element directly */
+    if (ray_is_vec(vec)) {
+        int64_t len = ray_len(vec);
+        if (i < 0 || i >= len) return ray_typed_null(-vec->type); /* out of bounds → typed null */
+        int alloc = 0;
+        ray_t* elem = collection_elem(vec, i, &alloc);
+        /* collection_elem always allocates for typed vecs, so elem is owned */
+        return elem;
+    }
+
+    if (!is_list(vec)) return ray_error("type", NULL);
+    int64_t len = ray_len(vec);
+    if (i < 0 || i >= len) return ray_typed_null(-RAY_I64); /* out of bounds → 0Nl */
+    ray_t* elem = ((ray_t**)ray_data(vec))[i];
+    ray_retain(elem);
+    return elem;
+}
+
+/* (find vec val) — index of the first occurrence of val in vec.
+ *
+ * Returns an I64 atom holding the index, or a null I64 atom (0Nl) when
+ * val is not present.  Variants:
+ *   - string atom + 1-char string atom: byte position inside the string
+ *   - val is a collection: a RAY_LIST with one (find vec v) result per
+ *     element (an empty I64 vector when vec itself is empty)
+ *   - typed vector: element-wise search; a null val matches the first
+ *     null slot only
+ *   - anything else is unboxed via unbox_vec_arg and must be a list,
+ *     otherwise a "type" error is returned */
+ray_t* ray_find_fn(ray_t* vec, ray_t* val) {
+    if (ray_is_lazy(vec)) vec = ray_lazy_materialize(vec);
+    if (ray_is_lazy(val)) val = ray_lazy_materialize(val);
+    /* String find: (find "hello" 'l') → index of char in string */
+    if (ray_is_atom(vec) && (-vec->type) == RAY_STR && ray_is_atom(val) && val->type == -RAY_STR && ray_str_len(val) == 1) {
+        const char* s = ray_str_ptr(vec);
+        size_t slen = ray_str_len(vec);
+        char c = ray_str_ptr(val)[0];
+        for (size_t i = 0; i < slen; i++) {
+            if (s[i] == c) return make_i64((int64_t)i);
+        }
+        return ray_typed_null(-RAY_I64);
+    }
+    /* Vector val: (find vec [v1 v2]) → [idx1 idx2] */
+    if (is_collection(val)) {
+        /* If vec is empty, return empty vector */
+        if (is_collection(vec) && ray_len(vec) == 0)
+            return ray_vec_new(RAY_I64, 0);
+        int64_t vlen = ray_len(val);
+        ray_t* result = ray_alloc(vlen * sizeof(ray_t*));
+        if (!result) return ray_error("oom", NULL);
+        result->type = RAY_LIST;
+        result->len = vlen;
+        ray_t** out = (ray_t**)ray_data(result);
+        for (int64_t j = 0; j < vlen; j++) {
+            int alloc = 0;
+            ray_t* ve = collection_elem(val, j, &alloc);
+            /* Keep the per-element result in a local: on failure it must
+             * outlive `result`, whose storage backs out[]. */
+            ray_t* found = ray_find_fn(vec, ve);
+            if (alloc) ray_release(ve);
+            if (RAY_IS_ERR(found)) {
+                for (int64_t k = 0; k < j; k++) ray_release(out[k]);
+                /* Slots [0, j) are already released and the rest were
+                 * never written; hide them before dropping the list. */
+                result->len = 0;
+                ray_release(result);
+                return found;
+            }
+            out[j] = found;
+        }
+        return result;
+    }
+    /* Typed vector: search without boxing */
+    if (ray_is_vec(vec)) {
+        int64_t len = vec->len;
+        bool has_nulls = (vec->attrs & RAY_ATTR_HAS_NULLS) != 0;
+        bool val_null = RAY_ATOM_IS_NULL(val);
+        for (int64_t i = 0; i < len; i++) {
+            if (has_nulls && ray_vec_is_null(vec, i)) {
+                if (val_null) return make_i64(i);
+                continue;
+            }
+            /* A null needle can only match a null slot (handled above). */
+            if (val_null) continue;
+            int alloc = 0;
+            ray_t* elem = collection_elem(vec, i, &alloc);
+            int eq = atom_eq(elem, val);
+            if (alloc) ray_release(elem);
+            if (eq) return make_i64(i);
+        }
+        return ray_typed_null(-RAY_I64);
+    }
+    ray_t* _bx = NULL;
+    vec = unbox_vec_arg(vec, &_bx);
+    if (RAY_IS_ERR(vec)) return vec;
+    if (!is_list(vec)) { if (_bx) ray_release(_bx); return ray_error("type", NULL); }
+    int64_t len = ray_len(vec);
+    ray_t** elems = (ray_t**)ray_data(vec);
+    for (int64_t i = 0; i < len; i++) {
+        if (atom_eq(elems[i], val)) { if (_bx) ray_release(_bx); return make_i64(i); }
+    }
+    if (_bx) ray_release(_bx);
+    return ray_typed_null(-RAY_I64); /* 0Nl = not found */
+}
+
+/* Worker callback for (til n): fills the slice [start, end) of the int64
+ * buffer passed through ctx with the identity sequence (out[i] = i).
+ * worker_id is part of the dispatch signature but is not used. */
+static void til_fill(void* ctx, uint32_t worker_id, int64_t start, int64_t end) {
+    int64_t* seq = (int64_t*)ctx;
+    int64_t pos = start;
+    (void)worker_id;
+    while (pos < end) {
+        seq[pos] = pos;
+        pos++;
+    }
+}
+
+/* (til n) — generate the integer sequence [0, 1, ..., n-1] as an I64 vector.
+ *
+ * x must be an I64 atom ("type" error otherwise); a negative n is a
+ * "domain" error and n == 0 yields an empty I64 vector.  The fill is
+ * handed to ray_pool_dispatch with til_fill as the worker callback. */
+ray_t* ray_til_fn(ray_t* x) {
+    if (!ray_is_atom(x) || x->type != -RAY_I64) return ray_error("type", NULL);
+    int64_t n = x->i64;
+    if (n < 0) return ray_error("domain", NULL);
+    if (n == 0) return ray_vec_new(RAY_I64, 0);
+
+    ray_t* vec = ray_vec_new(RAY_I64, n);
+    /* Never hand a raw NULL back to the evaluator: report allocation
+     * failure as an "oom" error object, as ray_reverse_fn does. */
+    if (!vec) return ray_error("oom", NULL);
+    if (RAY_IS_ERR(vec)) return vec;
+    vec->len = n;
+    int64_t* out = (int64_t*)ray_data(vec);
+    ray_pool_dispatch(ray_pool_get(), til_fill, out, n);
+    return vec;
+}
+
+/* (reverse vec) — return a new collection with the elements of x in
+ * reverse order.
+ *
+ * Typed vectors of length <= 1 are returned as-is (retained).  String
+ * vectors are rebuilt by appending from the back; other typed vectors are
+ * copied element by element, with null flags carried over.  Any other
+ * argument goes through unbox_vec_arg and must be a list ("type" error
+ * otherwise); its elements are retained into a fresh RAY_LIST. */
+ray_t* ray_reverse_fn(ray_t* x) {
+    if (ray_is_lazy(x)) x = ray_lazy_materialize(x);
+
+    if (!ray_is_vec(x)) {
+        /* Boxed list path */
+        ray_t* boxed = NULL;
+        x = unbox_vec_arg(x, &boxed);
+        if (RAY_IS_ERR(x)) return x;
+        if (!is_list(x)) {
+            if (boxed) ray_release(boxed);
+            return ray_error("type", NULL);
+        }
+        int64_t count = ray_len(x);
+        ray_t** items = (ray_t**)ray_data(x);
+
+        ray_t* rev = ray_alloc(count * sizeof(ray_t*));
+        if (!rev) {
+            if (boxed) ray_release(boxed);
+            return ray_error("oom", NULL);
+        }
+        rev->type = RAY_LIST;
+        rev->len = count;
+        ray_t** slots = (ray_t**)ray_data(rev);
+        for (int64_t dst = 0; dst < count; dst++) {
+            ray_t* item = items[count - 1 - dst];
+            ray_retain(item);
+            slots[dst] = item;
+        }
+        if (boxed) ray_release(boxed);
+        return rev;
+    }
+
+    /* Typed vector: reverse directly without boxing */
+    int64_t count = x->len;
+    if (count <= 1) {
+        ray_retain(x);
+        return x;
+    }
+    int8_t kind = x->type;
+    bool src_has_nulls = (x->attrs & RAY_ATTR_HAS_NULLS) != 0;
+
+    if (kind == RAY_STR) {
+        ray_t* rev = ray_vec_new(RAY_STR, count);
+        if (RAY_IS_ERR(rev)) return rev;
+        for (int64_t src = count - 1; src >= 0; src--) {
+            if (src_has_nulls && ray_vec_is_null(x, src)) {
+                /* Append a placeholder, then flag the new slot as null. */
+                rev = ray_str_vec_append(rev, "", 0);
+                if (RAY_IS_ERR(rev)) return rev;
+                ray_vec_set_null(rev, rev->len - 1, true);
+            } else {
+                size_t nbytes;
+                const char* text = ray_str_vec_get(x, src, &nbytes);
+                if (text)
+                    rev = ray_str_vec_append(rev, text, nbytes);
+                else
+                    rev = ray_str_vec_append(rev, "", 0);
+                if (RAY_IS_ERR(rev)) return rev;
+            }
+        }
+        return rev;
+    }
+
+    ray_t* rev;
+    if (kind == RAY_SYM)
+        rev = ray_sym_vec_new(x->attrs & RAY_SYM_W_MASK, count);
+    else
+        rev = ray_vec_new(kind, count);
+    if (!rev) return ray_error("oom", NULL);
+    if (RAY_IS_ERR(rev)) return rev;
+    rev->len = count;
+
+    /* Symbol vectors have a width that depends on their attrs. */
+    int width = ray_elem_size(kind);
+    if (kind == RAY_SYM) width = ray_sym_elem_size(kind, x->attrs);
+    char* from = (char*)ray_data(x);
+    char* to = (char*)ray_data(rev);
+    for (int64_t dst = 0; dst < count; dst++) {
+        const int64_t src = count - 1 - dst;
+        memcpy(to + dst * width, from + src * width, width);
+        if (src_has_nulls && ray_vec_is_null(x, src))
+            ray_vec_set_null(rev, dst, true);
+    }
+    return rev;
+}
+
+/* ══════════════════════════════════════════
+ * Binary search
+ * ══════════════════════════════════════════ */
+
+/* (rand n max) → I64 vector of n pseudo-random values in [0, max).
+ *
+ * Both arguments must be I64 or I32 atoms ("type" error otherwise);
+ * n < 0 or max <= 0 is a "domain" error; n == 0 yields an empty vector.
+ * NOTE(review): each value is rand() % max, and the C library's rand()
+ * never exceeds RAND_MAX, so a max above RAND_MAX + 1 cannot be covered
+ * in full — confirm whether that matters to callers. */
+ray_t* ray_rand_fn(ray_t* a, ray_t* b) {
+    if (!(ray_is_atom(a) && ray_is_atom(b))) return ray_error("type", NULL);
+
+    int64_t count, bound;
+    if (a->type == -RAY_I32) count = a->i32;
+    else if (a->type == -RAY_I64) count = a->i64;
+    else return ray_error("type", NULL);
+
+    if (b->type == -RAY_I32) bound = b->i32;
+    else if (b->type == -RAY_I64) bound = b->i64;
+    else return ray_error("type", NULL);
+
+    if (count < 0 || bound <= 0) return ray_error("domain", NULL);
+    if (count == 0) return ray_vec_new(RAY_I64, 0);
+
+    ray_t* out = ray_vec_new(RAY_I64, count);
+    if (RAY_IS_ERR(out)) return out;
+    int64_t* cells = (int64_t*)ray_data(out);
+    out->len = count;
+    for (int64_t* cell = cells; cell != cells + count; ++cell)
+        *cell = (int64_t)(rand() % bound);
+    return out;
+}
+
+/* (bin sorted-vec val) → rightmost index where sorted[i] <= val, -1 if none.
+ *
+ * sorted must be an I64 vector; it is binary-searched, so it is assumed to
+ * be in ascending order (not verified here).  val may be an I64/I32 atom
+ * (result: I64 atom) or an I64 vector (result: I64 vector with one index
+ * per element).  Every other combination is a "type" error. */
+ray_t* ray_bin_fn(ray_t* sorted, ray_t* val) {
+    if (!ray_is_vec(sorted) || sorted->type != RAY_I64)
+        return ray_error("type", NULL);
+    int64_t* d = (int64_t*)ray_data(sorted);
+    int64_t n = sorted->len;
+
+    if (ray_is_atom(val) && (val->type == -RAY_I64 || val->type == -RAY_I32)) {
+        /* Read the member that matches the atom's type: an I32 atom is
+         * read through ->i32 elsewhere in this file (ray_rand_fn), and
+         * the wider ->i64 view is not guaranteed to hold the same value. */
+        int64_t v = (val->type == -RAY_I32) ? (int64_t)val->i32 : val->i64;
+        int64_t lo = 0, hi = n - 1, result = -1;
+        while (lo <= hi) {
+            int64_t mid = lo + (hi - lo) / 2;
+            if (d[mid] <= v) { result = mid; lo = mid + 1; }
+            else hi = mid - 1;
+        }
+        return ray_i64(result);
+    }
+    if (ray_is_vec(val) && val->type == RAY_I64) {
+        int64_t* vals = (int64_t*)ray_data(val);
+        int64_t vn = val->len;
+        ray_t* rvec = ray_vec_new(RAY_I64, vn);
+        if (RAY_IS_ERR(rvec)) return rvec;
+        int64_t* out = (int64_t*)ray_data(rvec);
+        for (int64_t i = 0; i < vn; i++) {
+            int64_t v = vals[i];
+            int64_t lo = 0, hi = n - 1, r = -1;
+            while (lo <= hi) {
+                int64_t mid = lo + (hi - lo) / 2;
+                if (d[mid] <= v) { r = mid; lo = mid + 1; }
+                else hi = mid - 1;
+            }
+            out[i] = r;
+        }
+        rvec->len = vn;
+        return rvec;
+    }
+    return ray_error("type", NULL);
+}
+
+/* (binr sorted-vec val) → leftmost index where sorted[i] >= val.
+ *
+ * When no element qualifies the result is clamped to n - 1 (so it is -1
+ * for an empty vector).  sorted must be an I64 vector, assumed ascending
+ * (not verified here).  val may be an I64/I32 atom (result: I64 atom) or
+ * an I64 vector (result: I64 vector).  Anything else is a "type" error. */
+ray_t* ray_binr_fn(ray_t* sorted, ray_t* val) {
+    if (!ray_is_vec(sorted) || sorted->type != RAY_I64)
+        return ray_error("type", NULL);
+    int64_t* d = (int64_t*)ray_data(sorted);
+    int64_t n = sorted->len;
+
+    if (ray_is_atom(val) && (val->type == -RAY_I64 || val->type == -RAY_I32)) {
+        /* Read the member that matches the atom's type: an I32 atom is
+         * read through ->i32 elsewhere in this file (ray_rand_fn), and
+         * the wider ->i64 view is not guaranteed to hold the same value. */
+        int64_t v = (val->type == -RAY_I32) ? (int64_t)val->i32 : val->i64;
+        int64_t lo = 0, hi = n - 1, result = n;
+        while (lo <= hi) {
+            int64_t mid = lo + (hi - lo) / 2;
+            if (d[mid] >= v) { result = mid; hi = mid - 1; }
+            else lo = mid + 1;
+        }
+        /* Nothing >= v: clamp to the last valid index. */
+        return ray_i64(result >= n ? n - 1 : result);
+    }
+    if (ray_is_vec(val) && val->type == RAY_I64) {
+        int64_t* vals = (int64_t*)ray_data(val);
+        int64_t vn = val->len;
+        ray_t* rvec = ray_vec_new(RAY_I64, vn);
+        if (RAY_IS_ERR(rvec)) return rvec;
+        int64_t* out = (int64_t*)ray_data(rvec);
+        for (int64_t i = 0; i < vn; i++) {
+            int64_t v = vals[i];
+            int64_t lo = 0, hi = n - 1, r = n;
+            while (lo <= hi) {
+                int64_t mid = lo + (hi - lo) / 2;
+                if (d[mid] >= v) { r = mid; hi = mid - 1; }
+                else lo = mid + 1;
+            }
+            out[i] = r >= n ? n - 1 : r;
+        }
+        rvec->len = vn;
+        return rvec;
+    }
+    return ray_error("type", NULL);
+}
+
+/* ══════════════════════════════════════════
+ * Map variants
+ * ══════════════════════════════════════════ */
+
+/* Shared worker for map-left / map-right.
+ *
+ * Calls fn once per element of vec, pairing each element with `fixed`:
+ * fn(fixed, elem) when fixed_is_left is non-zero, fn(elem, fixed)
+ * otherwise.  When vec is neither a typed vector nor a list, fn is called
+ * exactly once on the pair.  Per-element results are collected in a
+ * scratch array (on the stack up to 4096 entries, ray_sys_alloc beyond
+ * that), passed to ray_enlist_fn, and then released.  The first error
+ * result aborts the loop and is returned after earlier results are
+ * released. */
+static ray_t* map_iterate(ray_t* fn, ray_t* fixed, ray_t* vec, int fixed_is_left) {
+    const int vec_is_coll = ray_is_vec(vec) || vec->type == RAY_LIST;
+    if (!vec_is_coll)
+        return fixed_is_left ? call_fn2(fn, fixed, vec) : call_fn2(fn, vec, fixed);
+
+    int64_t count = vec->len;
+    ray_t* inline_slots[4096];
+    ray_t** slots = inline_slots;
+    if (count > 4096) {
+        slots = (ray_t**)ray_sys_alloc((size_t)count * sizeof(ray_t*));
+        if (!slots) return ray_error("oom", NULL);
+    }
+
+    ray_t* out = NULL;
+    int failed = 0;
+    int64_t done = 0; /* number of successful results stored in slots[] */
+    while (done < count) {
+        int owned = 0;
+        ray_t* item = collection_elem(vec, done, &owned);
+        ray_t* r = fixed_is_left ? call_fn2(fn, fixed, item)
+                                 : call_fn2(fn, item, fixed);
+        if (owned) ray_release(item);
+        if (RAY_IS_ERR(r)) {
+            out = r;
+            failed = 1;
+            break;
+        }
+        slots[done++] = r;
+    }
+
+    if (!failed) out = ray_enlist_fn(slots, count);
+    /* Our references are dropped on success and on failure alike. */
+    for (int64_t k = 0; k < done; k++) ray_release(slots[k]);
+    if (slots != inline_slots) ray_sys_free(slots);
+    return out;
+}
+
+/* (map-left fn fixed vec) → fn(fixed, elem) for every elem of vec.
+ *
+ * Requires exactly three arguments ("domain" error otherwise).  When vec
+ * is a scalar but fixed is a vector or list, the roles swap: fixed is
+ * iterated and each call is fn(elem_of_fixed, vec), so both operands keep
+ * their original argument positions. */
+ray_t* ray_map_left_fn(ray_t** args, int64_t n) {
+    if (n != 3) return ray_error("domain", NULL);
+    ray_t* fn = args[0];
+    ray_t* lhs = args[1]; /* nominally the fixed operand */
+    ray_t* rhs = args[2]; /* nominally the iterated operand */
+
+    const int rhs_is_coll = ray_is_vec(rhs) || rhs->type == RAY_LIST;
+    const int lhs_is_coll = ray_is_vec(lhs) || lhs->type == RAY_LIST;
+
+    if (!rhs_is_coll && lhs_is_coll)
+        return map_iterate(fn, rhs, lhs, 0); /* iterate lhs: fn(elem, rhs) */
+
+    return map_iterate(fn, lhs, rhs, 1); /* fn(lhs, elem) */
+}
+
+/* (map-right fn vec fixed) → fn(elem, fixed) for every elem of vec.
+ *
+ * Requires exactly three arguments ("domain" error otherwise).  When vec
+ * is a scalar but fixed is a vector or list, the roles swap: fixed is
+ * iterated and each call is fn(vec, elem_of_fixed), so both operands keep
+ * their original argument positions. */
+ray_t* ray_map_right_fn(ray_t** args, int64_t n) {
+    if (n != 3) return ray_error("domain", NULL);
+    ray_t* fn = args[0];
+    ray_t* lhs = args[1]; /* nominally the iterated operand */
+    ray_t* rhs = args[2]; /* nominally the fixed operand */
+
+    const int lhs_is_coll = ray_is_vec(lhs) || lhs->type == RAY_LIST;
+    const int rhs_is_coll = ray_is_vec(rhs) || rhs->type == RAY_LIST;
+
+    if (!lhs_is_coll && rhs_is_coll)
+        return map_iterate(fn, lhs, rhs, 1); /* iterate rhs: fn(lhs, elem) */
+
+    return map_iterate(fn, rhs, lhs, 0); /* fn(elem, rhs) */
+}
+
+/* ══════════════════════════════════════════
+ * Fold/scan variants
+ * ══════════════════════════════════════════ */
+
+/* (fold-left fn init coll) — left fold with an explicit initial value.
+ * Pure alias: the arguments are forwarded unchanged to ray_fold_fn. */
+ray_t* ray_fold_left_fn(ray_t** args, int64_t n) {
+    ray_t* folded = ray_fold_fn(args, n);
+    return folded;
+}
+
+/* (fold-right fn coll) / (fold-right fn init coll) — right fold.
+ *
+ * Walks the list from its last element to its first, computing
+ * acc = fn(elem, acc).  With two arguments the last element seeds the
+ * accumulator (an empty list is a "domain" error); with three or more,
+ * args[1] is the seed and args[2] the collection.  Lazy arguments are
+ * materialised in place in args[].  The collection goes through
+ * unbox_vec_arg and must be a list ("type" error otherwise).  The first
+ * error returned by fn ends the fold and is returned. */
+ray_t* ray_fold_right_fn(ray_t** args, int64_t n) {
+    if (n < 2) return ray_error("domain", NULL);
+    for (int64_t a = 0; a < n; a++) {
+        if (ray_is_lazy(args[a])) args[a] = ray_lazy_materialize(args[a]);
+    }
+
+    ray_t* fn = args[0];
+    ray_t* boxed = NULL;
+    ray_t* acc;
+    ray_t* coll;
+    int64_t cursor; /* index of the next element to fold in */
+
+    if (n == 2) {
+        /* Seed with the last element, then fold the rest. */
+        coll = unbox_vec_arg(args[1], &boxed);
+        if (RAY_IS_ERR(coll)) return coll;
+        if (!is_list(coll)) {
+            if (boxed) ray_release(boxed);
+            return ray_error("type", NULL);
+        }
+        int64_t count = ray_len(coll);
+        if (count == 0) {
+            if (boxed) ray_release(boxed);
+            return ray_error("domain", NULL);
+        }
+        acc = ((ray_t**)ray_data(coll))[count - 1];
+        ray_retain(acc);
+        cursor = count - 2;
+    } else {
+        /* Explicit seed: every element takes part in the fold. */
+        acc = args[1];
+        ray_retain(acc);
+        coll = unbox_vec_arg(args[2], &boxed);
+        if (RAY_IS_ERR(coll)) {
+            ray_release(acc);
+            return coll;
+        }
+        if (!is_list(coll)) {
+            ray_release(acc);
+            if (boxed) ray_release(boxed);
+            return ray_error("type", NULL);
+        }
+        cursor = ray_len(coll) - 1;
+    }
+
+    ray_t** items = (ray_t**)ray_data(coll);
+    while (cursor >= 0) {
+        ray_t* step = call_fn2(fn, items[cursor], acc);
+        ray_release(acc);
+        if (RAY_IS_ERR(step)) {
+            if (boxed) ray_release(boxed);
+            return step;
+        }
+        acc = step;
+        cursor--;
+    }
+    if (boxed) ray_release(boxed);
+    return acc;
+}
+
+/* (scan-left fn vec) — running left fold.
+ * Pure alias: the arguments are forwarded unchanged to ray_scan_fn. */
+ray_t* ray_scan_left_fn(ray_t** args, int64_t n) {
+    ray_t* scanned = ray_scan_fn(args, n);
+    return scanned;
+}
+
+/* (scan-right fn vec) — running right fold; returns a RAY_LIST of the
+ * partial results.
+ *
+ * out[len-1] is the last element itself and, going leftwards,
+ * out[i] = fn(vec[i], out[i+1]).  An empty input yields an empty list.
+ * Lazy arguments are materialised in place in args[].  The collection
+ * goes through unbox_vec_arg and must be a list ("type" error otherwise);
+ * fewer than two arguments is a "domain" error.  The first error returned
+ * by fn aborts the scan and is returned. */
+ray_t* ray_scan_right_fn(ray_t** args, int64_t n) {
+    if (n < 2) return ray_error("domain", NULL);
+    for (int64_t i = 0; i < n; i++)
+        if (ray_is_lazy(args[i])) args[i] = ray_lazy_materialize(args[i]);
+
+    ray_t* fn = args[0];
+    ray_t* _bx = NULL;
+    ray_t* vec = unbox_vec_arg(args[1], &_bx);
+    if (RAY_IS_ERR(vec)) return vec;
+    if (!is_list(vec)) { if (_bx) ray_release(_bx); return ray_error("type", NULL); }
+    int64_t len = ray_len(vec);
+    if (len == 0) {
+        if (_bx) ray_release(_bx);
+        ray_t* result = ray_alloc(0);
+        if (!result) return ray_error("oom", NULL);
+        result->type = RAY_LIST;
+        result->len = 0;
+        return result;
+    }
+
+    ray_t* result = ray_alloc(len * sizeof(ray_t*));
+    if (!result) { if (_bx) ray_release(_bx); return ray_error("oom", NULL); }
+    result->type = RAY_LIST;
+    result->len = len;
+    ray_t** out = (ray_t**)ray_data(result);
+    ray_t** elems = (ray_t**)ray_data(vec);
+
+    ray_retain(elems[len - 1]);
+    out[len - 1] = elems[len - 1];
+    for (int64_t i = len - 2; i >= 0; i--) {
+        /* Hold the step result in a local: on failure it has to outlive
+         * `result`, whose storage backs out[]. */
+        ray_t* step = call_fn2(fn, elems[i], out[i + 1]);
+        if (RAY_IS_ERR(step)) {
+            for (int64_t j = i + 1; j < len; j++) ray_release(out[j]);
+            /* Filled slots are released above and the rest were never
+             * written; hide them all before dropping the list. */
+            result->len = 0; ray_release(result); if (_bx) ray_release(_bx);
+            return step;
+        }
+        out[i] = step;
+    }
+    if (_bx) ray_release(_bx);
+    return result;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/datalog.c b/crates/rayforce-sys/vendor/rayforce/src/ops/datalog.c
new file mode 100644
index 0000000..a354412
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/datalog.c
@@ -0,0 +1,4325 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+/*
+ * datalog.c — Datalog evaluation engine for Rayforce
+ *
+ * Compiles Datalog rules into ray_graph_t operation DAGs and evaluates
+ * them to fixpoint using semi-naive evaluation with stratified negation.
+ */
+#include "ops/datalog.h"
+#include "lang/internal.h"
+#include "lang/env.h"
+#include "table/sym.h"
+#include "ops/ops.h"
+#include "ops/hash.h"          /* ray_hash_i64, ray_hash_combine */
+#include "ops/internal.h"      /* col_propagate_str_pool */
+#include "mem/sys.h"           /* ray_sys_alloc / ray_sys_free */
+#include <string.h>
+#include <stdio.h>
+
+/* ========================================================================
+ * Program lifecycle
+ * ======================================================================== */
+
+/* Create an empty, zero-initialised Datalog program.
+ *
+ * The struct lives in the data region of a ray_alloc block (rather than
+ * in a separately aligned allocation), so it must be destroyed with
+ * dl_program_free.  Returns NULL when the allocation fails. */
+dl_program_t* dl_program_new(void) {
+    ray_t* hdr = ray_alloc(sizeof(dl_program_t));
+    dl_program_t* fresh = NULL;
+    if (hdr) {
+        fresh = (dl_program_t*)ray_data(hdr);
+        memset(fresh, 0, sizeof(*fresh));
+    }
+    return fresh;
+}
+
+/* Map a dl_program_t pointer back to the ray_t header of the block that
+ * holds it, so the block can be handed to ray_free.  Relies on the data
+ * region starting 32 bytes after the header. */
+static inline ray_t* dl_prog_block(dl_program_t* prog) {
+    char* payload = (char*)prog;
+    return (ray_t*)(payload - 32);  /* ray_data is at offset 32 */
+}
+
+/* Destroy a program created by dl_program_new.
+ *
+ * For every registered relation, drops the program's reference to its
+ * table and to its three provenance objects (each skipped when NULL or an
+ * error value), then frees the block that holds the program struct.
+ * A NULL prog is a no-op. */
+void dl_program_free(dl_program_t* prog) {
+    if (!prog) return;
+    for (int r = 0; r < prog->n_rels; r++) {
+        dl_rel_t* rel = &prog->rels[r];
+        ray_t* owned[4] = {
+            rel->table,
+            rel->prov_col,
+            rel->prov_src_offsets,
+            rel->prov_src_data,
+        };
+        for (int k = 0; k < 4; k++) {
+            if (owned[k] && !RAY_IS_ERR(owned[k]))
+                ray_release(owned[k]);
+        }
+    }
+    ray_free(dl_prog_block(prog));
+}
+
+/* ========================================================================
+ * Relation management
+ * ======================================================================== */
+
+/* Look up a relation by exact name (strcmp).
+ * Returns its index in prog->rels, or -1 when no relation has that name.
+ * When several relations share a name, the lowest index is returned. */
+int dl_find_rel(dl_program_t* prog, const char* name) {
+    int hit = -1;
+    for (int slot = 0; hit < 0 && slot < prog->n_rels; slot++) {
+        if (!strcmp(prog->rels[slot].name, name))
+            hit = slot;
+    }
+    return hit;
+}
+
+/* Intern the column name "{rel_name}__c{col_idx}" and return its symbol id.
+ * The name is formatted into an 80-byte buffer, so snprintf truncates
+ * longer results before they are interned. */
+static int64_t dl_col_sym(const char* rel_name, int col_idx) {
+    char label[80];
+    snprintf(label, sizeof label, "%s__c%d", rel_name, col_idx);
+    size_t label_len = strlen(label);
+    return ray_sym_intern(label, label_len);
+}
+
+/* Register an extensional (fact) relation backed by `table`.
+ *
+ * The first `arity` columns of `table` are re-exposed through a new table
+ * whose columns are named "{name}__c{i}", so relations cannot collide
+ * when several tables take part in a join.  `name` is truncated to fit
+ * rel->name.
+ *
+ * Returns the new relation index, or -1 when an argument is NULL, the
+ * relation array is full, arity is outside [0, DL_MAX_ARITY], `table` has
+ * fewer than `arity` columns, or a table operation fails.  On failure the
+ * program's relation count is left unchanged. */
+int dl_add_edb(dl_program_t* prog, const char* name, ray_t* table, int arity) {
+    if (!prog || !name || !table || prog->n_rels >= DL_MAX_RELS)
+        return -1;
+    /* col_names[] is only filled for DL_MAX_ARITY slots; a larger arity
+     * would read past it while building the table below. */
+    if (arity < 0 || arity > DL_MAX_ARITY)
+        return -1;
+
+    /* Fill the next free slot, but publish it (n_rels) only once the
+     * table is fully built, so a failure cannot leave a half-initialised
+     * relation visible to dl_find_rel. */
+    int idx = prog->n_rels;
+    dl_rel_t* rel = &prog->rels[idx];
+    memset(rel, 0, sizeof(dl_rel_t));
+
+    size_t name_len = strlen(name);
+    if (name_len >= sizeof(rel->name)) name_len = sizeof(rel->name) - 1;
+    memcpy(rel->name, name, name_len);
+    rel->name[name_len] = '\0';
+
+    rel->arity = arity;
+    rel->is_idb = false;
+
+    /* Build a new table with relation-prefixed column names to avoid
+     * collisions when multiple tables participate in a join. */
+    for (int c = 0; c < arity; c++)
+        rel->col_names[c] = dl_col_sym(name, c);
+
+    ray_t* new_tbl = ray_table_new(arity);
+    if (!new_tbl) return -1;
+    if (RAY_IS_ERR(new_tbl)) { ray_error_free(new_tbl); return -1; }
+    for (int c = 0; c < arity; c++) {
+        ray_t* col = ray_table_get_col_idx(table, c);
+        if (!col) { ray_release(new_tbl); return -1; }
+        /* Keep the previous table so it can be released if the add
+         * fails (same ownership handling as
+         * dl_idb_align_head_const_types). */
+        ray_t* prev = new_tbl;
+        new_tbl = ray_table_add_col(new_tbl, rel->col_names[c], col);
+        if (!new_tbl) { ray_release(prev); return -1; }
+        if (RAY_IS_ERR(new_tbl)) {
+            ray_release(prev);
+            ray_error_free(new_tbl);
+            return -1;
+        }
+    }
+    rel->table = new_tbl;
+    prog->n_rels = idx + 1;
+
+    return idx;
+}
+
+/* Return the index of relation `name`, creating an empty intensional
+ * (derived) relation when none exists yet.
+ *
+ * A new relation gets `arity` empty I64 columns named "{name}__c{i}";
+ * `name` is truncated to fit rel->name.  An existing relation is returned
+ * as-is, without checking its arity.
+ *
+ * Returns -1 when the relation array is full, arity is outside
+ * [0, DL_MAX_ARITY], or building the table fails.  On failure the
+ * program's relation count is left unchanged. */
+int dl_ensure_idb(dl_program_t* prog, const char* name, int arity) {
+    int idx = dl_find_rel(prog, name);
+    if (idx >= 0) return idx;
+
+    if (prog->n_rels >= DL_MAX_RELS) return -1;
+    /* Only DL_MAX_ARITY column names can be recorded per relation. */
+    if (arity < 0 || arity > DL_MAX_ARITY) return -1;
+
+    /* Fill the next free slot, but publish it (n_rels) only once the
+     * table is complete, so a failure cannot leave a relation with a
+     * missing, partial or error-valued table behind. */
+    idx = prog->n_rels;
+    dl_rel_t* rel = &prog->rels[idx];
+    memset(rel, 0, sizeof(dl_rel_t));
+
+    size_t name_len = strlen(name);
+    if (name_len >= sizeof(rel->name)) name_len = sizeof(rel->name) - 1;
+    memcpy(rel->name, name, name_len);
+    rel->name[name_len] = '\0';
+
+    rel->arity = arity;
+    rel->is_idb = true;
+
+    /* Create empty table with arity columns */
+    ray_t* tbl = ray_table_new(arity);
+    if (!tbl) return -1;
+    if (RAY_IS_ERR(tbl)) { ray_error_free(tbl); return -1; }
+
+    for (int c = 0; c < arity; c++) {
+        rel->col_names[c] = dl_col_sym(name, c);
+        ray_t* empty_col = ray_vec_new(RAY_I64, 0);
+        if (!empty_col) { ray_release(tbl); return -1; }
+        if (RAY_IS_ERR(empty_col)) {
+            ray_error_free(empty_col);
+            ray_release(tbl);
+            return -1;
+        }
+        /* Keep the previous table so it can be released if the add
+         * fails (same ownership handling as
+         * dl_idb_align_head_const_types). */
+        ray_t* prev = tbl;
+        tbl = ray_table_add_col(tbl, rel->col_names[c], empty_col);
+        ray_release(empty_col);
+        if (!tbl) { ray_release(prev); return -1; }
+        if (RAY_IS_ERR(tbl)) {
+            ray_release(prev);
+            ray_error_free(tbl);
+            return -1;
+        }
+    }
+
+    rel->table = tbl;
+    prog->n_rels = idx + 1;
+
+    return idx;
+}
+
+/* ========================================================================
+ * Rule management
+ * ======================================================================== */
+
+/* When a rule has a typed head constant at slot c, the IDB relation's
+ * column c must be of that type so ray_vec_concat (used by table_union)
+ * doesn't reject the merge.  Rebuilds matching columns on an *empty* IDB
+ * table in-place.  Safe because schema is established before evaluation.
+ *
+ * Does nothing when the head relation is missing, is not an IDB, has no
+ * valid table, already holds rows, or has a column count different from
+ * its arity.  Conflicting types and any allocation failure are reported
+ * by setting prog->eval_err; the function itself returns nothing. */
+static void dl_idb_align_head_const_types(dl_program_t* prog, const dl_rule_t* rule) {
+    int rel_idx = dl_find_rel(prog, rule->head_pred);
+    if (rel_idx < 0) return;
+    dl_rel_t* rel = &prog->rels[rel_idx];
+    if (!rel->is_idb) return;
+    if (!rel->table || RAY_IS_ERR(rel->table)) return;
+    if (ray_table_nrows(rel->table) != 0) return;  /* types already committed */
+
+    int ncols = (int)ray_table_ncols(rel->table);
+    if (ncols != rel->arity) return;
+
+    /* Pass 1: work out the desired type of every column and detect
+     * whether anything actually has to change. */
+    bool any_change = false;
+    int8_t desired[DL_MAX_ARITY];
+    for (int c = 0; c < rel->arity; c++) {
+        ray_t* col = ray_table_get_col_idx(rel->table, c);
+        int8_t cur = col ? col->type : RAY_I64;
+        int8_t want = rule->head_const_types[c];
+        if (want == 0) {
+            /* Slot has no typed constant in this rule: keep the column as is. */
+            desired[c] = cur;
+        } else if (cur != RAY_I64 && cur != want) {
+            /* First-non-zero-wins policy: once a slot is committed to a
+             * non-default type by a prior rule, any later rule that
+             * disagrees is a program-level conflict.  Mark the program
+             * so dl_eval (which reads eval_err after evaluation) reports
+             * failure — no stderr write from a non-debug code path. */
+            prog->eval_err = true;
+            return;
+        } else {
+            desired[c] = want;
+            if (want != cur) any_change = true;
+        }
+    }
+    if (!any_change) return;
+
+    /* Rebuild the table with typed empty columns.  Alignment is required
+     * for later evaluation to produce type-matching table_union inputs,
+     * so any failure here must also set prog->eval_err = true — silently
+     * returning would leave the IDB schema unaligned and dl_eval would
+     * later hit a ray_vec_concat type mismatch without any error signal. */
+    ray_t* fresh = ray_table_new(rel->arity);
+    if (!fresh) { prog->eval_err = true; return; }
+    if (RAY_IS_ERR(fresh)) { prog->eval_err = true; ray_error_free(fresh); return; }
+    for (int c = 0; c < rel->arity; c++) {
+        ray_t* empty_col = ray_vec_new(desired[c], 0);
+        if (!empty_col) { prog->eval_err = true; ray_release(fresh); return; }
+        if (RAY_IS_ERR(empty_col)) {
+            prog->eval_err = true;
+            ray_error_free(empty_col);
+            ray_release(fresh);
+            return;
+        }
+        /* Remember the table as it was before the add so it can still be
+         * released when ray_table_add_col returns NULL or an error. */
+        ray_t* prev = fresh;
+        fresh = ray_table_add_col(fresh, rel->col_names[c], empty_col);
+        /* Our reference to the column is dropped whether or not the add
+         * succeeded. */
+        ray_release(empty_col);
+        if (!fresh) { prog->eval_err = true; ray_release(prev); return; }
+        if (RAY_IS_ERR(fresh)) {
+            prog->eval_err = true;
+            ray_release(prev);
+            ray_error_free(fresh);
+            return;
+        }
+    }
+    /* Commit: swap the old (empty) table for the freshly typed one. */
+    ray_release(rel->table);
+    rel->table = fresh;
+}
+
+/* Append a copy of `rule` to the program.
+ *
+ * The head predicate's IDB relation is created first when it does not
+ * exist yet, and its empty columns are aligned to any typed head
+ * constants of the rule.  The stored copy has its stratum reset to -1.
+ *
+ * Returns the rule index, or -1 when an argument is NULL, the rule array
+ * is full, or the head relation cannot be created.  Nothing is added to
+ * the rule array in those cases.  A type conflict found during alignment
+ * is reported through prog->eval_err, not through the return value. */
+int dl_add_rule(dl_program_t* prog, const dl_rule_t* rule) {
+    if (!prog || !rule || prog->n_rules >= DL_MAX_RULES)
+        return -1;
+
+    /* Ensure IDB relation exists for the head predicate before the rule
+     * is committed: a rule without a head relation must not be stored. */
+    if (dl_ensure_idb(prog, rule->head_pred, rule->head_arity) < 0)
+        return -1;
+
+    int idx = prog->n_rules++;
+    memcpy(&prog->rules[idx], rule, sizeof(dl_rule_t));
+    prog->rules[idx].stratum = -1;
+
+    /* Align IDB column types to any typed head constants in this rule.
+     * Must run before evaluation so table_union/concat see matching types. */
+    dl_idb_align_head_const_types(prog, rule);
+
+    return idx;
+}
+
+/* ========================================================================
+ * Rule builder helpers
+ * ======================================================================== */
+
+/* Reset `rule` to an empty rule with the given head.
+ *
+ * The whole struct is zeroed, head_pred is copied (truncated to fit the
+ * field), every head slot is marked DL_CONST, and the stratum is set
+ * to -1.  The rule starts with no body atoms and no variables. */
+void dl_rule_init(dl_rule_t* rule, const char* head_pred, int head_arity) {
+    memset(rule, 0, sizeof(*rule));
+
+    const size_t cap = sizeof(rule->head_pred) - 1;
+    size_t pred_len = strlen(head_pred);
+    if (pred_len > cap) pred_len = cap;
+    memcpy(rule->head_pred, head_pred, pred_len);
+    rule->head_pred[pred_len] = '\0';
+
+    for (int slot = 0; slot < DL_MAX_ARITY; slot++)
+        rule->head_vars[slot] = DL_CONST;
+
+    rule->head_arity = head_arity;
+    rule->n_vars = 0;
+    rule->n_body = 0;
+    rule->stratum = -1;
+}
+
+/* Bind head position `pos` to variable `var_idx`.
+ *
+ * Clears any typed-constant marker on that slot and grows rule->n_vars so
+ * that it covers var_idx.  Out-of-range positions are ignored. */
+void dl_rule_head_var(dl_rule_t* rule, int pos, int var_idx) {
+    /* head_arity is stored unchecked by dl_rule_init, so also bound pos
+     * by the capacity of the head arrays. */
+    if (pos < 0 || pos >= rule->head_arity || pos >= DL_MAX_ARITY) return;
+    rule->head_vars[pos] = var_idx;
+    rule->head_const_types[pos] = 0;
+    if (var_idx + 1 > rule->n_vars) rule->n_vars = var_idx + 1;
+}
+
+/* Set head position `pos` to the constant `val` of column type `type`.
+ *
+ * `val` carries the constant's 64-bit payload (dl_rule_head_const_f64
+ * passes the bit pattern of a double).  Types other than RAY_I64, RAY_SYM
+ * and RAY_F64 are stored as RAY_I64.  Out-of-range positions are ignored. */
+void dl_rule_head_const_typed(dl_rule_t* rule, int pos, int64_t val, int8_t type) {
+    /* head_arity is stored unchecked by dl_rule_init, so also bound pos
+     * by the capacity of the head arrays. */
+    if (pos < 0 || pos >= rule->head_arity || pos >= DL_MAX_ARITY) return;
+    /* Default to RAY_I64 if an unrecognized type sneaks through; keeps
+     * old-callers-with-no-type compat when writing to the slot. */
+    if (type != RAY_I64 && type != RAY_SYM && type != RAY_F64)
+        type = RAY_I64;
+    rule->head_vars[pos] = DL_CONST;
+    rule->head_consts[pos] = val;
+    rule->head_const_types[pos] = type;
+}
+
+/* Legacy three-argument entry point for integer head constants, kept for
+ * callers that predate typed head constants.  Equivalent to calling
+ * dl_rule_head_const_typed() with RAY_I64. */
+void dl_rule_head_const(dl_rule_t* rule, int pos, int64_t val) {
+    const int8_t kind = RAY_I64;
+    dl_rule_head_const_typed(rule, pos, val, kind);
+}
+
+/* Set head position `pos` to a double constant.  The value travels through
+ * the int64 constant slot as its raw bit pattern, tagged RAY_F64. */
+void dl_rule_head_const_f64(dl_rule_t* rule, int pos, double val) {
+    int64_t raw;
+    memcpy(&raw, &val, sizeof(raw));   /* bit-cast, not a numeric conversion */
+    dl_rule_head_const_typed(rule, pos, raw, RAY_F64);
+}
+
+/* Append a positive body atom `pred/arity` to `rule`.
+ *
+ * Returns the index of the new body literal, or -1 when the rule already
+ * has DL_MAX_BODY literals.  The predicate name is truncated to fit the
+ * literal's name buffer; every argument slot starts out as DL_CONST. */
+int dl_rule_add_atom(dl_rule_t* rule, const char* pred, int arity) {
+    const int slot = rule->n_body;
+    if (slot >= DL_MAX_BODY) return -1;
+    rule->n_body = slot + 1;
+
+    dl_body_t* atom = &rule->body[slot];
+    memset(atom, 0, sizeof(*atom));
+    atom->type = DL_POS;
+
+    const size_t cap = sizeof(atom->pred) - 1;
+    size_t n = strlen(pred);
+    if (n > cap) n = cap;
+    memcpy(atom->pred, pred, n);
+    atom->pred[n] = '\0';
+
+    atom->arity = arity;
+    for (int k = DL_MAX_ARITY - 1; k >= 0; k--)
+        atom->vars[k] = DL_CONST;
+    return slot;
+}
+
+/* Bind argument `pos` of body literal `body_idx` to variable `var_idx`,
+ * growing rule->n_vars to cover it.
+ *
+ * No-op when `rule` is NULL, `body_idx` is not an existing literal, or
+ * `pos` is outside [0, arity).  `pos` is additionally capped at
+ * DL_MAX_ARITY: the literal's arity is caller-supplied and unvalidated, so
+ * it alone cannot keep the write inside the fixed-size vars array. */
+void dl_body_set_var(dl_rule_t* rule, int body_idx, int pos, int var_idx) {
+    if (!rule) return;
+    if (body_idx < 0 || body_idx >= rule->n_body) return;
+    if (pos < 0 || pos >= rule->body[body_idx].arity || pos >= DL_MAX_ARITY) return;
+    rule->body[body_idx].vars[pos] = var_idx;
+    if (var_idx + 1 > rule->n_vars) rule->n_vars = var_idx + 1;
+}
+
+/* Set argument `pos` of body literal `body_idx` to the constant `val`.
+ *
+ * No-op when `rule` is NULL, `body_idx` is not an existing literal, or
+ * `pos` is outside [0, arity).  `pos` is additionally capped at
+ * DL_MAX_ARITY because the literal's arity is unvalidated and the argument
+ * arrays are fixed-size. */
+void dl_body_set_const(dl_rule_t* rule, int body_idx, int pos, int64_t val) {
+    if (!rule) return;
+    if (body_idx < 0 || body_idx >= rule->n_body) return;
+    if (pos < 0 || pos >= rule->body[body_idx].arity || pos >= DL_MAX_ARITY) return;
+    rule->body[body_idx].vars[pos] = DL_CONST;
+    rule->body[body_idx].const_vals[pos] = val;
+}
+
+/* Append a negated body atom: same as dl_rule_add_atom(), then the new
+ * literal is re-tagged DL_NEG.  Returns the literal index, or the negative
+ * value reported by dl_rule_add_atom() on failure. */
+int dl_rule_add_neg(dl_rule_t* rule, const char* pred, int arity) {
+    const int slot = dl_rule_add_atom(rule, pred, arity);
+    if (slot < 0) return slot;
+    rule->body[slot].type = DL_NEG;
+    return slot;
+}
+
+/* Append a variable-vs-variable comparison literal `lhs_var <cmp_op> rhs_var`.
+ * Returns the literal index, or -1 when the body is full.  rule->n_vars is
+ * grown to cover both operands. */
+int dl_rule_add_cmp(dl_rule_t* rule, int cmp_op, int lhs_var, int rhs_var) {
+    const int slot = rule->n_body;
+    if (slot >= DL_MAX_BODY) return -1;
+    rule->n_body = slot + 1;
+
+    dl_body_t* lit = &rule->body[slot];
+    memset(lit, 0, sizeof(*lit));
+    lit->type    = DL_CMP;
+    lit->cmp_op  = cmp_op;
+    lit->cmp_lhs = lhs_var;
+    lit->cmp_rhs = rhs_var;
+
+    const int top = (lhs_var > rhs_var) ? lhs_var : rhs_var;
+    if (top + 1 > rule->n_vars) rule->n_vars = top + 1;
+    return slot;
+}
+
+/* Append a variable-vs-constant comparison literal `lhs_var <cmp_op> rhs_val`.
+ * The right-hand side is marked DL_CONST and the value is kept in cmp_const.
+ * Returns the literal index, or -1 when the body is full. */
+int dl_rule_add_cmp_const(dl_rule_t* rule, int cmp_op, int lhs_var, int64_t rhs_val) {
+    const int slot = rule->n_body;
+    if (slot >= DL_MAX_BODY) return -1;
+    rule->n_body = slot + 1;
+
+    dl_body_t* lit = &rule->body[slot];
+    memset(lit, 0, sizeof(*lit));
+    lit->type      = DL_CMP;
+    lit->cmp_op    = cmp_op;
+    lit->cmp_lhs   = lhs_var;
+    lit->cmp_rhs   = DL_CONST;
+    lit->cmp_const = rhs_val;
+
+    const int needed = lhs_var + 1;
+    if (needed > rule->n_vars) rule->n_vars = needed;
+    return slot;
+}
+
+/* ========================================================================
+ * Expression tree builders
+ * ======================================================================== */
+
+/* Allocate one expression node via ray_alloc() and return its zero-filled
+ * payload, or NULL when the allocation fails. */
+static dl_expr_t* dl_expr_alloc(void) {
+    ray_t* blk = ray_alloc(sizeof(dl_expr_t));
+    if (blk == NULL) return NULL;
+
+    dl_expr_t* node = (dl_expr_t*)ray_data(blk);
+    memset(node, 0, sizeof(*node));
+    return node;
+}
+
+/* Build an integer-constant expression leaf; NULL on allocation failure. */
+dl_expr_t* dl_expr_const(int64_t val) {
+    dl_expr_t* node = dl_expr_alloc();
+    if (node != NULL) {
+        node->kind = DL_EXPR_CONST;
+        node->const_val = val;
+    }
+    return node;
+}
+
+/* Build a double-constant expression leaf; NULL on allocation failure. */
+dl_expr_t* dl_expr_const_f64(double val) {
+    dl_expr_t* node = dl_expr_alloc();
+    if (node != NULL) {
+        node->kind = DL_EXPR_CONST_F64;
+        node->const_f64 = val;
+    }
+    return node;
+}
+
+/* Build an expression leaf that reads rule variable `var_idx`; NULL on
+ * allocation failure. */
+dl_expr_t* dl_expr_var(int var_idx) {
+    dl_expr_t* node = dl_expr_alloc();
+    if (node != NULL) {
+        node->kind = DL_EXPR_VAR;
+        node->var_idx = var_idx;
+    }
+    return node;
+}
+
+/* Build an interior node applying binary operator `op` to `left` and
+ * `right`.  The child pointers are stored as given (not copied).  Returns
+ * NULL on allocation failure, in which case the children are not touched. */
+dl_expr_t* dl_expr_binop(int op, dl_expr_t* left, dl_expr_t* right) {
+    dl_expr_t* node = dl_expr_alloc();
+    if (node != NULL) {
+        node->kind  = DL_EXPR_BINOP;
+        node->binop = op;
+        node->left  = left;
+        node->right = right;
+    }
+    return node;
+}
+
+/* ========================================================================
+ * Assignment and builtin rule builders
+ * ======================================================================== */
+
+/* Append an assignment literal that binds `target_var` to the value of
+ * `expr`.  `op` is accepted for future assignment operators and is
+ * currently ignored.  Returns the literal index, or -1 when the body is
+ * full.  rule->n_vars is grown to cover target_var. */
+int dl_rule_add_assign(dl_rule_t* rule, int target_var, int op, dl_expr_t* expr) {
+    (void)op;  /* reserved for future assignment operators */
+
+    const int slot = rule->n_body;
+    if (slot >= DL_MAX_BODY) return -1;
+    rule->n_body = slot + 1;
+
+    dl_body_t* lit = &rule->body[slot];
+    memset(lit, 0, sizeof(*lit));
+    lit->type        = DL_ASSIGN;
+    lit->assign_var  = target_var;
+    lit->assign_expr = expr;
+
+    const int needed = target_var + 1;
+    if (needed > rule->n_vars) rule->n_vars = needed;
+    return slot;
+}
+
+/* Append a builtin-predicate literal identified by `builtin_id` with the
+ * given arity.  All argument slots start out as DL_CONST.  Returns the
+ * literal index, or -1 when the body is full. */
+int dl_rule_add_builtin(dl_rule_t* rule, int builtin_id, int arity) {
+    const int slot = rule->n_body;
+    if (slot >= DL_MAX_BODY) return -1;
+    rule->n_body = slot + 1;
+
+    dl_body_t* lit = &rule->body[slot];
+    memset(lit, 0, sizeof(*lit));
+    lit->type       = DL_BUILTIN;
+    lit->builtin_id = builtin_id;
+    lit->arity      = arity;
+
+    int k = 0;
+    while (k < DL_MAX_ARITY) {
+        lit->vars[k] = DL_CONST;
+        k++;
+    }
+    return slot;
+}
+
+/* Return the largest variable index referenced anywhere in the expression
+ * tree `e`, or -1 when the tree is NULL or references no variable. */
+static int dl_expr_max_var(const dl_expr_t* e) {
+    if (e == NULL) return -1;
+
+    switch (e->kind) {
+    case DL_EXPR_VAR:
+        return e->var_idx;
+    case DL_EXPR_BINOP: {
+        const int lhs_max = dl_expr_max_var(e->left);
+        const int rhs_max = dl_expr_max_var(e->right);
+        return (lhs_max > rhs_max) ? lhs_max : rhs_max;
+    }
+    default:
+        /* constants reference no variable */
+        return -1;
+    }
+}
+
+/* Append a comparison literal whose two sides are expression trees.
+ * The trees are stored by pointer.  rule->n_vars is grown to cover every
+ * variable referenced by either tree.  Returns the literal index, or -1
+ * when the body is full. */
+int dl_rule_add_cmp_expr(dl_rule_t* rule, int cmp_op, dl_expr_t* lhs, dl_expr_t* rhs) {
+    const int slot = rule->n_body;
+    if (slot >= DL_MAX_BODY) return -1;
+    rule->n_body = slot + 1;
+
+    dl_body_t* lit = &rule->body[slot];
+    memset(lit, 0, sizeof(*lit));
+    lit->type         = DL_CMP;
+    lit->cmp_op       = cmp_op;
+    lit->cmp_lhs_expr = lhs;
+    lit->cmp_rhs_expr = rhs;
+
+    /* Highest variable index used on either side (-1 if none). */
+    int top = dl_expr_max_var(lhs);
+    const int rhs_top = dl_expr_max_var(rhs);
+    if (rhs_top > top) top = rhs_top;
+    if (top + 1 > rule->n_vars) rule->n_vars = top + 1;
+    return slot;
+}
+
+/* Append an interval literal tying `fact_var` to the pair
+ * (`start_var`, `end_var`).  rule->n_vars is grown to cover all three
+ * variables.  Returns the literal index, or -1 when the body is full. */
+int dl_rule_add_interval(dl_rule_t* rule, int fact_var, int start_var, int end_var) {
+    const int slot = rule->n_body;
+    if (slot >= DL_MAX_BODY) return -1;
+    rule->n_body = slot + 1;
+
+    dl_body_t* lit = &rule->body[slot];
+    memset(lit, 0, sizeof(*lit));
+    lit->type               = DL_INTERVAL;
+    lit->interval_fact_var  = fact_var;
+    lit->interval_start_var = start_var;
+    lit->interval_end_var   = end_var;
+
+    const int used[3] = { fact_var, start_var, end_var };
+    for (int k = 0; k < 3; k++) {
+        if (used[k] + 1 > rule->n_vars) rule->n_vars = used[k] + 1;
+    }
+    return slot;
+}
+
+/* Append an aggregate literal: `target_var` receives aggregate `op` over
+ * column `value_col` of predicate `pred/pred_arity`.  The predicate name is
+ * copied (truncated to the buffer size).  The literal starts with no group
+ * keys; use dl_rule_agg_set_group() to add them.  Returns the literal
+ * index, or -1 when the body is full. */
+int dl_rule_add_agg(dl_rule_t* rule, int op, int target_var,
+                    const char* pred, int pred_arity, int value_col) {
+    const int slot = rule->n_body;
+    if (slot >= DL_MAX_BODY) return -1;
+    rule->n_body = slot + 1;
+
+    dl_body_t* agg = &rule->body[slot];
+    memset(agg, 0, sizeof(dl_body_t));
+    agg->type = DL_AGG;
+    agg->agg_op = op;
+    agg->agg_target_var = target_var;
+    snprintf(agg->agg_pred, sizeof(agg->agg_pred), "%s", pred);
+    agg->agg_arity = pred_arity;
+    agg->agg_value_col = value_col;
+    agg->agg_n_group_keys = 0;
+
+    const int needed = target_var + 1;
+    if (needed > rule->n_vars) rule->n_vars = needed;
+    return slot;
+}
+
+/* Attach group-by keys to the aggregate literal at `body_idx`.
+ *
+ * key_vars[i] is the rule variable bound to the group key and key_cols[i]
+ * the matching column of the aggregated predicate.  rule->n_vars is grown
+ * to cover every key variable.
+ *
+ * Returns 0 on success and -1 when `rule` is NULL, `body_idx` is not an
+ * existing literal, `n_keys` is outside [0, DL_AGG_MAX_KEYS], either key
+ * array is NULL while n_keys > 0, or the literal is not a DL_AGG.  Nothing
+ * is modified on failure. */
+int dl_rule_agg_set_group(dl_rule_t* rule, int body_idx,
+                          const int* key_vars, const int* key_cols, int n_keys) {
+    if (!rule || body_idx < 0 || body_idx >= rule->n_body) return -1;
+    if (n_keys < 0 || n_keys > DL_AGG_MAX_KEYS) return -1;
+    /* The arrays are only read when there is at least one key. */
+    if (n_keys > 0 && (!key_vars || !key_cols)) return -1;
+    dl_body_t* b = &rule->body[body_idx];
+    if (b->type != DL_AGG) return -1;
+    b->agg_n_group_keys = n_keys;
+    for (int i = 0; i < n_keys; i++) {
+        b->agg_group_key_vars[i] = key_vars[i];
+        b->agg_group_key_cols[i] = key_cols[i];
+        if (key_vars[i] + 1 > rule->n_vars)
+            rule->n_vars = key_vars[i] + 1;
+    }
+    return 0;
+}
+
+/* ========================================================================
+ * Stratification — topological sort on negation dependency graph
+ * ======================================================================== */
+
+/* Assign every relation and every rule of `prog` to a stratum.
+ *
+ * A dependency matrix is built from the rules: a positive body atom makes
+ * the head depend positively on the atom's relation, a negated atom or an
+ * aggregate makes it depend negatively.  Strata are then raised by
+ * fixpoint iteration until the constraints hold (negative edge: strictly
+ * higher stratum; positive edge: same or higher stratum).
+ *
+ * On success fills prog->n_strata, prog->strata, prog->strata_sizes and
+ * each rule's stratum, and returns 0.
+ *
+ * Returns -1 without writing any of those fields when `prog` is NULL,
+ * prog->n_rels does not fit the local work arrays, the program is not
+ * stratifiable (cycle through negation/aggregation), or more than
+ * DL_MAX_STRATA strata would be needed. */
+int dl_stratify(dl_program_t* prog) {
+    if (!prog) return -1;
+
+    /* Build dependency graph: for each IDB predicate, which other IDB
+     * predicates does it depend on positively or negatively? */
+    int n = prog->n_rels;
+    /* dep[][] and stratum[] below are sized for DL_MAX_RELS relations. */
+    if (n < 0 || n > DL_MAX_RELS) return -1;
+
+    /* dep[i][j]: 0 = no dep, 1 = positive dep, 2 = negative dep */
+    int dep[DL_MAX_RELS][DL_MAX_RELS];
+    memset(dep, 0, sizeof(dep));
+
+    for (int r = 0; r < prog->n_rules; r++) {
+        dl_rule_t* rule = &prog->rules[r];
+        int head_idx = dl_find_rel(prog, rule->head_pred);
+        if (head_idx < 0) continue;
+
+        for (int b = 0; b < rule->n_body; b++) {
+            dl_body_t* body = &rule->body[b];
+            if (body->type == DL_AGG) {
+                /* Aggregates are non-monotonic: head must live in a higher
+                 * stratum than the predicate being aggregated. */
+                int body_idx = dl_find_rel(prog, body->agg_pred);
+                if (body_idx < 0) continue;
+                dep[head_idx][body_idx] = 2;  /* negative (non-monotonic) dep */
+                continue;
+            }
+            if (body->type != DL_POS && body->type != DL_NEG) continue;
+            int body_idx = dl_find_rel(prog, body->pred);
+            if (body_idx < 0) continue;
+            if (body->type == DL_NEG)
+                dep[head_idx][body_idx] = 2;  /* negative dep */
+            else if (dep[head_idx][body_idx] == 0)
+                dep[head_idx][body_idx] = 1;  /* positive dep (don't override neg) */
+        }
+    }
+
+    /* Assign strata: predicates with no negative dependencies go to stratum 0.
+     * A predicate with a negative dep on stratum S goes to stratum S+1.
+     * Repeat until stable. If unstable after n iterations, there's a cycle. */
+    int stratum[DL_MAX_RELS];
+    memset(stratum, 0, sizeof(stratum));
+
+    for (int iter = 0; iter < n + 1; iter++) {
+        bool changed = false;
+        for (int i = 0; i < n; i++) {
+            for (int j = 0; j < n; j++) {
+                if (dep[i][j] == 2) {
+                    /* Negative dependency: head must be in higher stratum */
+                    if (stratum[i] <= stratum[j]) {
+                        stratum[i] = stratum[j] + 1;
+                        changed = true;
+                    }
+                } else if (dep[i][j] == 1) {
+                    /* Positive dependency: head must be >= stratum */
+                    if (stratum[i] < stratum[j]) {
+                        stratum[i] = stratum[j];
+                        changed = true;
+                    }
+                }
+            }
+        }
+        if (!changed) break;
+        if (iter == n) return -1;  /* unstratifiable negation cycle */
+    }
+
+    /* Build strata arrays */
+    int max_stratum = 0;
+    for (int i = 0; i < n; i++) {
+        if (stratum[i] > max_stratum) max_stratum = stratum[i];
+    }
+    /* The per-stratum arrays hold DL_MAX_STRATA entries.  Refuse programs
+     * that need more instead of publishing an n_strata that points past
+     * them and silently leaving relations out of every stratum. */
+    if (max_stratum >= DL_MAX_STRATA) return -1;
+
+    prog->n_strata = max_stratum + 1;
+    memset(prog->strata_sizes, 0, sizeof(prog->strata_sizes));
+
+    for (int i = 0; i < n; i++) {
+        int s = stratum[i];
+        if (s < DL_MAX_STRATA && prog->strata_sizes[s] < DL_MAX_RELS) {
+            prog->strata[s][prog->strata_sizes[s]++] = i;
+        }
+    }
+
+    /* Assign stratum to each rule */
+    for (int r = 0; r < prog->n_rules; r++) {
+        int head_idx = dl_find_rel(prog, prog->rules[r].head_pred);
+        if (head_idx >= 0)
+            prog->rules[r].stratum = stratum[head_idx];
+    }
+
+    return 0;
+}
+
+/* ========================================================================
+ * Rule compiler — materializing approach
+ *
+ * Instead of building a single graph with joins, we execute each body
+ * atom separately, producing intermediate tables, and join them C-level.
+ * This avoids column-name-collision issues in the graph-level join.
+ * ======================================================================== */
+
+/* ========================================================================
+ * Expression evaluation — compute column from expression tree
+ * ======================================================================== */
+
+/* Helper: materialize the first `nrows` elements of `src` as a freshly
+ * allocated RAY_F64 vector.  A RAY_F64 source is copied verbatim; any other
+ * source is read as int64 and converted element by element.  Returns a new
+ * owned column, or NULL when the allocation fails. */
+static ray_t* dl_col_as_f64(ray_t* src, int64_t nrows) {
+    ray_t* out = ray_vec_new(RAY_F64, nrows);
+    if (!out) return NULL;
+    if (RAY_IS_ERR(out)) { ray_error_free(out); return NULL; }
+    out->len = nrows;
+
+    double* dst = (double*)ray_data(out);
+    if (src->type != RAY_F64) {
+        /* RAY_I64: promote each value */
+        const int64_t* ints = (const int64_t*)ray_data(src);
+        for (int64_t i = 0; i < nrows; i++)
+            dst[i] = (double)ints[i];
+        return out;
+    }
+    memcpy(dst, ray_data(src), (size_t)nrows * sizeof(double));
+    return out;
+}
+
+/* Apply one integer binary operator to a pair of int64 values.
+ *
+ * Add/sub/mul are carried out on the unsigned representation so that
+ * overflow wraps instead of being undefined behaviour.  Division by zero
+ * yields 0.  Division by -1 is computed as a wrapping negation because
+ * INT64_MIN / -1 overflows (and traps on common hardware).  Unknown
+ * operators yield 0. */
+static int64_t dl_expr_i64_binop(int op, int64_t a, int64_t b) {
+    switch (op) {
+    case OP_ADD: return (int64_t)((uint64_t)a + (uint64_t)b);
+    case OP_SUB: return (int64_t)((uint64_t)a - (uint64_t)b);
+    case OP_MUL: return (int64_t)((uint64_t)a * (uint64_t)b);
+    case OP_DIV:
+        if (b == 0)  return 0;
+        if (b == -1) return (int64_t)((uint64_t)0 - (uint64_t)a);
+        return a / b;
+    default:     return 0;
+    }
+}
+
+/* Evaluate an expression tree against the accumulator table.
+ *
+ * var_col[v] gives the column index in `accum` that holds rule variable v.
+ * Returns a new owned vector of length nrows, or NULL when the expression
+ * is NULL, a variable is negative or does not resolve to a RAY_I64/RAY_F64
+ * column, or an allocation fails.  The element type is RAY_F64 if the
+ * expression involves any float constant or any RAY_F64 source column,
+ * otherwise RAY_I64.  Division by zero produces 0 (0.0 for floats). */
+static ray_t* dl_eval_expr(dl_expr_t* expr, ray_t* accum,
+                             int* var_col, int64_t nrows) {
+    if (!expr) return NULL;
+
+    switch (expr->kind) {
+    case DL_EXPR_CONST: {
+        ray_t* col = ray_vec_new(RAY_I64, nrows);
+        if (!col) return NULL;
+        if (RAY_IS_ERR(col)) { ray_error_free(col); return NULL; }
+        col->len = nrows;
+        int64_t* d = (int64_t*)ray_data(col);
+        for (int64_t r = 0; r < nrows; r++)
+            d[r] = expr->const_val;
+        return col;
+    }
+    case DL_EXPR_CONST_F64: {
+        ray_t* col = ray_vec_new(RAY_F64, nrows);
+        if (!col) return NULL;
+        if (RAY_IS_ERR(col)) { ray_error_free(col); return NULL; }
+        col->len = nrows;
+        double* d = (double*)ray_data(col);
+        for (int64_t r = 0; r < nrows; r++)
+            d[r] = expr->const_f64;
+        return col;
+    }
+    case DL_EXPR_VAR: {
+        /* A negative index would read in front of var_col. */
+        if (expr->var_idx < 0) return NULL;
+        int ci = var_col[expr->var_idx];
+        ray_t* src = ray_table_get_col_idx(accum, ci);
+        if (!src) return NULL;
+        if (src->type != RAY_I64 && src->type != RAY_F64) return NULL;
+        size_t elem = (src->type == RAY_F64) ? sizeof(double) : sizeof(int64_t);
+        /* Copy rather than share so the caller can release uniformly. */
+        ray_t* dst = ray_vec_new(src->type, nrows);
+        if (!dst) return NULL;
+        if (RAY_IS_ERR(dst)) { ray_error_free(dst); return NULL; }
+        dst->len = nrows;
+        memcpy(ray_data(dst), ray_data(src), (size_t)nrows * elem);
+        return dst;
+    }
+    case DL_EXPR_BINOP: {
+        ray_t* lv = dl_eval_expr(expr->left, accum, var_col, nrows);
+        ray_t* rv = dl_eval_expr(expr->right, accum, var_col, nrows);
+        if (!lv || !rv) {
+            if (lv) ray_release(lv);
+            if (rv) ray_release(rv);
+            return NULL;
+        }
+        /* Mixed int/float operands are evaluated in double precision. */
+        bool is_f64 = (lv->type == RAY_F64) || (rv->type == RAY_F64);
+        if (is_f64) {
+            ray_t* lf = dl_col_as_f64(lv, nrows);
+            ray_t* rf = dl_col_as_f64(rv, nrows);
+            ray_release(lv); ray_release(rv);
+            if (!lf || !rf) {
+                if (lf) ray_release(lf);
+                if (rf) ray_release(rf);
+                return NULL;
+            }
+            ray_t* out = ray_vec_new(RAY_F64, nrows);
+            if (!out) { ray_release(lf); ray_release(rf); return NULL; }
+            if (RAY_IS_ERR(out)) {
+                ray_error_free(out);
+                ray_release(lf); ray_release(rf); return NULL;
+            }
+            out->len = nrows;
+            double* ld = (double*)ray_data(lf);
+            double* rd = (double*)ray_data(rf);
+            double* od = (double*)ray_data(out);
+            for (int64_t r = 0; r < nrows; r++) {
+                switch (expr->binop) {
+                case OP_ADD: od[r] = ld[r] + rd[r]; break;
+                case OP_SUB: od[r] = ld[r] - rd[r]; break;
+                case OP_MUL: od[r] = ld[r] * rd[r]; break;
+                case OP_DIV: od[r] = rd[r] != 0.0 ? ld[r] / rd[r] : 0.0; break;
+                default:     od[r] = 0.0; break;
+                }
+            }
+            ray_release(lf); ray_release(rf);
+            return out;
+        }
+        ray_t* out = ray_vec_new(RAY_I64, nrows);
+        if (!out) { ray_release(lv); ray_release(rv); return NULL; }
+        if (RAY_IS_ERR(out)) {
+            ray_error_free(out);
+            ray_release(lv); ray_release(rv); return NULL;
+        }
+        out->len = nrows;
+        int64_t* ld = (int64_t*)ray_data(lv);
+        int64_t* rd = (int64_t*)ray_data(rv);
+        int64_t* od = (int64_t*)ray_data(out);
+        for (int64_t r = 0; r < nrows; r++)
+            od[r] = dl_expr_i64_binop(expr->binop, ld[r], rd[r]);
+        ray_release(lv);
+        ray_release(rv);
+        return out;
+    }
+    }
+    return NULL;
+}
+
+/* Helper: build a new table holding every column of `tbl` followed by
+ * `new_col` under the name `name`.  Returns a new owned table.
+ *
+ * `tbl` and `new_col` are left untouched; the caller keeps its own
+ * reference to `new_col`.  On failure the partially built table is
+ * released and the failing call's result (NULL or an error object) is
+ * returned, so nothing leaks and no column is added to an error value. */
+static ray_t* dl_table_add_computed_col(ray_t* tbl, ray_t* new_col, const char* name) {
+    int64_t ncols = ray_table_ncols(tbl);
+    ray_t* out = ray_table_new((int)(ncols + 1));
+    if (!out || RAY_IS_ERR(out)) return out;
+
+    for (int64_t c = 0; c < ncols; c++) {
+        ray_t* col = ray_table_get_col_idx(tbl, c);
+        if (!col) continue;
+        ray_t* next = ray_table_add_col(out, ray_table_col_name(tbl, c), col);
+        /* ray_table_add_col leaves `out` alive when it fails, so the
+         * partial table has to be dropped here. */
+        if (!next || RAY_IS_ERR(next)) {
+            ray_release(out);
+            return next;
+        }
+        out = next;
+    }
+
+    int64_t sym = ray_sym_intern(name, strlen(name));
+    ray_t* next = ray_table_add_col(out, sym, new_col);
+    if (!next || RAY_IS_ERR(next)) {
+        ray_release(out);
+        return next;
+    }
+    return next;
+}
+
+/* ========================================================================
+ * Builtin predicate evaluation helpers
+ * ======================================================================== */
+
+/* before(S, E, T): keep rows where T < S.
+ *
+ * s_col / t_col are the column indices of S and T in `tbl`; both columns
+ * are read as int64.
+ *
+ * Ownership: a NULL or error `tbl` is returned as-is.  In every other case
+ * the result is an owned reference: either `tbl` itself with its refcount
+ * bumped (empty table, missing S/T column, or every row passes) or a newly
+ * built table.  On allocation failure an error object is returned. */
+static ray_t* dl_builtin_before(ray_t* tbl, int s_col, int t_col) {
+    if (!tbl || RAY_IS_ERR(tbl)) return tbl;
+
+    int64_t nrows = ray_table_nrows(tbl);
+    /* Pass-throughs must retain, exactly like the all-rows-match case
+     * below; otherwise the caller cannot release the result uniformly. */
+    if (nrows == 0) { ray_retain(tbl); return tbl; }
+
+    ray_t* s_vec = ray_table_get_col_idx(tbl, s_col);
+    ray_t* t_vec = ray_table_get_col_idx(tbl, t_col);
+    if (!s_vec || !t_vec) { ray_retain(tbl); return tbl; }
+
+    int64_t ncols = ray_table_ncols(tbl);
+    const int64_t* sd = (const int64_t*)ray_data(s_vec);
+    const int64_t* t_data = (const int64_t*)ray_data(t_vec);
+
+    int64_t count = 0;
+    for (int64_t r = 0; r < nrows; r++)
+        if (t_data[r] < sd[r]) count++;
+
+    if (count == nrows) { ray_retain(tbl); return tbl; }
+
+    ray_t* out = ray_table_new((int)ncols);
+    if (!out) return ray_error("memory", "dl_builtin_before: table_new");
+    if (RAY_IS_ERR(out)) return out;
+    for (int64_t c = 0; c < ncols; c++) {
+        ray_t* src = ray_table_get_col_idx(tbl, c);
+        if (!src) continue;
+        /* Allocate with the source's own element width so narrow SYM
+         * columns are neither widened nor over-read. */
+        ray_t* dst = (src->type == RAY_SYM)
+            ? ray_sym_vec_new(src->attrs & RAY_SYM_W_MASK, count)
+            : ray_vec_new(src->type, count);
+        if (!dst || RAY_IS_ERR(dst)) {
+            if (dst) ray_error_free(dst);
+            ray_release(out);
+            return ray_error("memory", "dl_builtin_before: vec_new");
+        }
+        dst->len = count;
+        uint8_t esz = ray_sym_elem_size(src->type, src->attrs);
+        const uint8_t* src_b = (const uint8_t*)ray_data(src);
+        uint8_t* dst_b = (uint8_t*)ray_data(dst);
+        int64_t j = 0;
+        for (int64_t r = 0; r < nrows; r++) {
+            if (t_data[r] < sd[r]) {
+                memcpy(dst_b + (size_t)j * esz,
+                       src_b + (size_t)r * esz,
+                       (size_t)esz);
+                j++;
+            }
+        }
+        if (src->type == RAY_STR) col_propagate_str_pool(dst, src);
+        ray_t* next = ray_table_add_col(out, ray_table_col_name(tbl, c), dst);
+        ray_release(dst);
+        /* ray_table_add_col leaves `out` alive on failure; drop the
+         * partially built table before bailing out. */
+        if (!next) {
+            ray_release(out);
+            return ray_error("memory", "dl_builtin_before: add_col");
+        }
+        if (RAY_IS_ERR(next)) {
+            ray_release(out);
+            return next;
+        }
+        out = next;
+    }
+    return out;
+}
+
+/* duration_since(T1, T2, D): compute D = T2 - T1, append as new column.
+ *
+ * t1_col / t2_col are column indices in `tbl`, read as int64; the new
+ * column is named `out_name`.
+ *
+ * Ownership: a NULL or error `tbl` is returned as-is.  Otherwise the result
+ * is an owned reference: a new table with the extra column, or `tbl`
+ * itself (retained) when an input column is missing or the result column
+ * cannot be allocated. */
+static ray_t* dl_builtin_duration_since(ray_t* tbl, int t1_col, int t2_col,
+                                          const char* out_name) {
+    if (!tbl || RAY_IS_ERR(tbl)) return tbl;
+    int64_t nrows = ray_table_nrows(tbl);
+    ray_t* t1_vec = ray_table_get_col_idx(tbl, t1_col);
+    ray_t* t2_vec = ray_table_get_col_idx(tbl, t2_col);
+    /* A missing input column cannot be dereferenced; pass through. */
+    if (!t1_vec || !t2_vec) { ray_retain(tbl); return tbl; }
+    const int64_t* t1 = (const int64_t*)ray_data(t1_vec);
+    const int64_t* t2 = (const int64_t*)ray_data(t2_vec);
+
+    ray_t* col = ray_vec_new(RAY_I64, nrows);
+    if (!col || RAY_IS_ERR(col)) {
+        if (col) ray_error_free(col);   /* don't leak the error object */
+        ray_retain(tbl);
+        return tbl;
+    }
+    col->len = nrows;
+    int64_t* d = (int64_t*)ray_data(col);
+    for (int64_t r = 0; r < nrows; r++)
+        d[r] = t2[r] - t1[r];
+
+    ray_t* out = dl_table_add_computed_col(tbl, col, out_name);
+    ray_release(col);
+    return out;
+}
+
+/* abs(X, Y): compute Y = |X|, append as new column.
+ *
+ * x_col is the column index of X in `tbl`, read as int64; the new column
+ * is named `out_name`.  |INT64_MIN| is not representable and wraps back to
+ * INT64_MIN.
+ *
+ * Ownership: a NULL or error `tbl` is returned as-is.  Otherwise the result
+ * is an owned reference: a new table with the extra column, or `tbl`
+ * itself (retained) when X is missing or the result column cannot be
+ * allocated. */
+static ray_t* dl_builtin_abs(ray_t* tbl, int x_col, const char* out_name) {
+    if (!tbl || RAY_IS_ERR(tbl)) return tbl;
+    int64_t nrows = ray_table_nrows(tbl);
+    ray_t* x_vec = ray_table_get_col_idx(tbl, x_col);
+    /* A missing input column cannot be dereferenced; pass through. */
+    if (!x_vec) { ray_retain(tbl); return tbl; }
+    const int64_t* xd = (const int64_t*)ray_data(x_vec);
+
+    ray_t* col = ray_vec_new(RAY_I64, nrows);
+    if (!col || RAY_IS_ERR(col)) {
+        if (col) ray_error_free(col);   /* don't leak the error object */
+        ray_retain(tbl);
+        return tbl;
+    }
+    col->len = nrows;
+    int64_t* d = (int64_t*)ray_data(col);
+    for (int64_t r = 0; r < nrows; r++) {
+        /* Negate through uint64 so INT64_MIN wraps instead of invoking
+         * signed-overflow undefined behaviour. */
+        d[r] = xd[r] < 0 ? (int64_t)((uint64_t)0 - (uint64_t)xd[r]) : xd[r];
+    }
+
+    ray_t* out = dl_table_add_computed_col(tbl, col, out_name);
+    ray_release(col);
+    return out;
+}
+
+/* Helper: join two tables on specified column pairs. Returns new owned table.
+ * left_cols[k] and right_cols[k] are column indices in left/right tables.
+ *
+ * Returns NULL when either input is NULL or an error, when the graph cannot
+ * be created, or when n_keys is outside [0, DL_MAX_ARITY] (the key arrays
+ * below are sized DL_MAX_ARITY and the count is passed on as a uint8_t).
+ * If either side has no rows, an empty table carrying only the left
+ * table's columns is returned.  Otherwise the result is whatever
+ * ray_execute() yields for the join. */
+static ray_t* dl_join_tables(ray_t* left, ray_t* right,
+                              const int* left_cols, const int* right_cols, int n_keys) {
+    if (!left || RAY_IS_ERR(left) || !right || RAY_IS_ERR(right)) return NULL;
+    if (ray_table_nrows(left) == 0 || ray_table_nrows(right) == 0) {
+        /* Return an empty table.  Capacity is reserved for both sides, but
+         * only the left table's columns are actually added. */
+        int64_t lnc = ray_table_ncols(left);
+        int64_t rnc = ray_table_ncols(right);
+        ray_t* empty = ray_table_new((int)(lnc + rnc));
+        for (int64_t c = 0; c < lnc; c++) {
+            ray_t* col = ray_table_get_col_idx(left, c);
+            if (!col) continue;
+            ray_t* ec = ray_vec_new(col->type, 0);
+            if (ec && !RAY_IS_ERR(ec)) {
+                empty = ray_table_add_col(empty, ray_table_col_name(left, c), ec);
+                ray_release(ec);
+            }
+        }
+        return empty;
+    }
+
+    /* lkeys/rkeys hold at most DL_MAX_ARITY entries; reject anything that
+     * would overrun them or be truncated by the uint8_t cast below. */
+    if (n_keys < 0 || n_keys > DL_MAX_ARITY) return NULL;
+    if (n_keys > 0 && (!left_cols || !right_cols)) return NULL;
+
+    /* Build unique column names for the join using a single graph */
+    ray_graph_t* g = ray_graph_new(NULL);
+    if (!g) return NULL;
+
+    /* Create copies with unique names: L<i> for left, R<i> for right, so
+     * identically named columns on both sides cannot collide. */
+    int64_t lnc = ray_table_ncols(left);
+    int64_t rnc = ray_table_ncols(right);
+    ray_t* ltbl = ray_table_new((int)lnc);
+    for (int64_t c = 0; c < lnc; c++) {
+        ray_t* col = ray_table_get_col_idx(left, c);
+        if (!col) continue;
+        char name[32]; snprintf(name, sizeof(name), "L%d", (int)c);
+        int64_t sym = ray_sym_intern(name, strlen(name));
+        ltbl = ray_table_add_col(ltbl, sym, col);
+    }
+    ray_t* rtbl = ray_table_new((int)rnc);
+    for (int64_t c = 0; c < rnc; c++) {
+        ray_t* col = ray_table_get_col_idx(right, c);
+        if (!col) continue;
+        char name[32]; snprintf(name, sizeof(name), "R%d", (int)c);
+        int64_t sym = ray_sym_intern(name, strlen(name));
+        rtbl = ray_table_add_col(rtbl, sym, col);
+    }
+
+    uint16_t l_tid = ray_graph_add_table(g, ltbl);
+    uint16_t r_tid = ray_graph_add_table(g, rtbl);
+    ray_op_t* l_op = ray_const_table(g, ltbl);
+    ray_op_t* r_op = ray_const_table(g, rtbl);
+
+    /* One scan op per key column, addressed by the renamed columns. */
+    ray_op_t* lkeys[DL_MAX_ARITY];
+    ray_op_t* rkeys[DL_MAX_ARITY];
+    for (int k = 0; k < n_keys; k++) {
+        char lname[32]; snprintf(lname, sizeof(lname), "L%d", left_cols[k]);
+        char rname[32]; snprintf(rname, sizeof(rname), "R%d", right_cols[k]);
+        lkeys[k] = ray_scan_table(g, l_tid, lname);
+        rkeys[k] = ray_scan_table(g, r_tid, rname);
+    }
+
+    ray_op_t* join = ray_join(g, l_op, lkeys, r_op, rkeys, (uint8_t)n_keys, 0);
+    ray_t* result = ray_execute(g, join);
+    ray_graph_free(g);
+    ray_release(ltbl);
+    ray_release(rtbl);
+    return result;
+}
+
+/* Helper: antijoin two tables on specified column pairs. Returns new owned table.
+ * left_cols[k] and right_cols[k] are column indices in left/right tables.
+ *
+ * A NULL or error `left` is returned as-is.  `left` itself is returned with
+ * its refcount bumped whenever the antijoin is not carried out: `right` is
+ * NULL, an error or empty; `left` is empty; the graph cannot be created; or
+ * n_keys is outside [0, DL_MAX_ARITY] (the key arrays below are sized
+ * DL_MAX_ARITY and the count is passed on as a uint8_t).  Otherwise the
+ * result is whatever ray_execute() yields for the antijoin. */
+static ray_t* dl_antijoin_tables(ray_t* left, ray_t* right,
+                                  const int* left_cols, const int* right_cols, int n_keys) {
+    if (!left || RAY_IS_ERR(left)) return left;
+    if (!right || RAY_IS_ERR(right) || ray_table_nrows(right) == 0) {
+        ray_retain(left); return left;
+    }
+    if (ray_table_nrows(left) == 0) { ray_retain(left); return left; }
+
+    /* lkeys/rkeys hold at most DL_MAX_ARITY entries; fall back to the
+     * pass-through used for other failures rather than overrun them. */
+    if (n_keys < 0 || n_keys > DL_MAX_ARITY ||
+        (n_keys > 0 && (!left_cols || !right_cols))) {
+        ray_retain(left); return left;
+    }
+
+    ray_graph_t* g = ray_graph_new(NULL);
+    if (!g) { ray_retain(left); return left; }
+
+    /* Rename columns to L<i> / R<i> so the two sides cannot collide. */
+    int64_t lnc = ray_table_ncols(left);
+    int64_t rnc = ray_table_ncols(right);
+    ray_t* ltbl = ray_table_new((int)lnc);
+    for (int64_t c = 0; c < lnc; c++) {
+        ray_t* col = ray_table_get_col_idx(left, c);
+        if (!col) continue;
+        char name[32]; snprintf(name, sizeof(name), "L%d", (int)c);
+        ltbl = ray_table_add_col(ltbl, ray_sym_intern(name, strlen(name)), col);
+    }
+    ray_t* rtbl = ray_table_new((int)rnc);
+    for (int64_t c = 0; c < rnc; c++) {
+        ray_t* col = ray_table_get_col_idx(right, c);
+        if (!col) continue;
+        char name[32]; snprintf(name, sizeof(name), "R%d", (int)c);
+        rtbl = ray_table_add_col(rtbl, ray_sym_intern(name, strlen(name)), col);
+    }
+
+    uint16_t l_tid = ray_graph_add_table(g, ltbl);
+    uint16_t r_tid = ray_graph_add_table(g, rtbl);
+    ray_op_t* l_op = ray_const_table(g, ltbl);
+    ray_op_t* r_op = ray_const_table(g, rtbl);
+
+    /* One scan op per key column, addressed by the renamed columns. */
+    ray_op_t* lkeys[DL_MAX_ARITY];
+    ray_op_t* rkeys[DL_MAX_ARITY];
+    for (int k = 0; k < n_keys; k++) {
+        char lname[32]; snprintf(lname, sizeof(lname), "L%d", left_cols[k]);
+        char rname[32]; snprintf(rname, sizeof(rname), "R%d", right_cols[k]);
+        lkeys[k] = ray_scan_table(g, l_tid, lname);
+        rkeys[k] = ray_scan_table(g, r_tid, rname);
+    }
+
+    ray_op_t* aj = ray_antijoin(g, l_op, lkeys, r_op, rkeys, (uint8_t)n_keys);
+    ray_t* result = ray_execute(g, aj);
+    ray_graph_free(g);
+    ray_release(ltbl);
+    ray_release(rtbl);
+    return result;
+}
+
+/* Helper: filter a table to rows where column col_idx == value */
+/* Row comparison helper: does element `row` of `col` equal `value`?
+ * A RAY_I64 column is compared directly; a RAY_SYM column (of any adaptive
+ * width) is read through ray_read_sym() and compared as a sym ID.  Every
+ * other column type compares unequal; the constant-filter path does not
+ * support those and passes its input table through unchanged. */
+static bool dl_col_eq_row(ray_t* col, int64_t row, int64_t value) {
+    if (col->type == RAY_SYM)
+        return ray_read_sym(ray_data(col), row, col->type, col->attrs) == value;
+    if (col->type != RAY_I64)
+        return false;
+    const int64_t* cells = (const int64_t*)ray_data(col);
+    return cells[row] == value;
+}
+
+/* Keep only the rows of `tbl` whose column `col_idx` equals `value`.
+ *
+ * Ownership contract: for a non-NULL input the result is always an owned
+ * reference, so callers can follow the `ray_release(body_tbl); body_tbl =
+ * filtered;` pattern without special cases.  Every pass-through therefore
+ * bumps the refcount of `tbl`; returning it un-retained would leave the
+ * caller's table under-referenced and a later release could hit freed
+ * memory.  Failures while building the filtered table are reported as
+ * error objects. */
+static ray_t* dl_filter_eq(ray_t* tbl, int col_idx, int64_t value) {
+    if (!tbl) return tbl;
+    if (RAY_IS_ERR(tbl)) { ray_retain(tbl); return tbl; }
+
+    const int64_t nrows = ray_table_nrows(tbl);
+    if (nrows == 0) { ray_retain(tbl); return tbl; }
+
+    /* Only I64 and SYM key columns can be compared against the constant;
+     * a missing column or any other type passes through (retained) rather
+     * than being miscompared via raw bytes. */
+    ray_t* key = ray_table_get_col_idx(tbl, col_idx);
+    const bool filterable =
+        key && (key->type == RAY_I64 || key->type == RAY_SYM);
+    if (!filterable) { ray_retain(tbl); return tbl; }
+
+    const int64_t ncols = ray_table_ncols(tbl);
+
+    /* First pass: how many rows survive?  The read is type-aware so
+     * adaptive-width SYM columns are handled. */
+    int64_t keep = 0;
+    for (int64_t r = 0; r < nrows; r++) {
+        if (dl_col_eq_row(key, r, value)) keep++;
+    }
+    if (keep == nrows) { ray_retain(tbl); return tbl; }
+
+    /* Second pass: copy the surviving rows column by column.  Each output
+     * column uses its source's element size (ray_sym_elem_size), so a
+     * narrow SYM column stays narrow instead of being widened to W64. */
+    ray_t* out = ray_table_new((int)ncols);
+    if (!out) return ray_error("memory", "dl_filter_eq: table_new");
+    if (RAY_IS_ERR(out)) return out;
+
+    for (int64_t c = 0; c < ncols; c++) {
+        ray_t* src = ray_table_get_col_idx(tbl, c);
+        if (!src) {
+            ray_release(out);
+            return ray_error("domain", "dl_filter_eq: missing source column");
+        }
+
+        ray_t* dst;
+        if (src->type == RAY_SYM)
+            dst = ray_sym_vec_new(src->attrs & RAY_SYM_W_MASK, keep);
+        else
+            dst = ray_vec_new(src->type, keep);
+        if (!dst || RAY_IS_ERR(dst)) {
+            if (dst) ray_error_free(dst);
+            ray_release(out);
+            return ray_error("memory", "dl_filter_eq: vec_new");
+        }
+        dst->len = keep;
+
+        const uint8_t esz = ray_sym_elem_size(src->type, src->attrs);
+        const size_t width = (size_t)esz;
+        const uint8_t* from = (const uint8_t*)ray_data(src);
+        uint8_t* to = (uint8_t*)ray_data(dst);
+        for (int64_t r = 0; r < nrows; r++) {
+            if (!dl_col_eq_row(key, r, value)) continue;
+            memcpy(to, from + (size_t)r * width, width);
+            to += width;
+        }
+        if (src->type == RAY_STR) col_propagate_str_pool(dst, src);
+
+        ray_t* next = ray_table_add_col(out, ray_table_col_name(tbl, c), dst);
+        ray_release(dst);
+        /* ray_table_add_col does not release `out` on failure, so the
+         * partially built table is dropped here before bailing out. */
+        if (!next) {
+            ray_release(out);
+            return ray_error("memory", "dl_filter_eq: add_col");
+        }
+        if (RAY_IS_ERR(next)) {
+            ray_release(out);
+            return next;
+        }
+        out = next;
+    }
+    return out;
+}
+
+/* Helper: build a column of `nrows` copies of one constant, for a constant
+ * head slot.
+ *
+ * type  - RAY_I64, RAY_SYM or RAY_F64; anything else yields a "type" error.
+ * val   - the constant: an integer, a sym ID, or (for RAY_F64) the bit
+ *         pattern of the double.
+ * width_template - only consulted for RAY_SYM.  When it is a RAY_SYM
+ *         column, its adaptive width is reused so the broadcast matches
+ *         the IDB relation's existing column; otherwise the width defaults
+ *         to W64, and a later table_union against a narrower column would
+ *         hit a ray_vec_concat width mismatch.  A pointer is used instead
+ *         of a numeric width hint because W8 == 0 would make a "zero means
+ *         default" convention ambiguous.
+ *
+ * Returns a fresh, caller-owned vec.  The caller is expected to hand it to
+ * a table via ray_table_add_col (which retains) and then release its own
+ * reference, leaving the table as sole owner.  It must be a real
+ * heap-allocated vec, not a view onto rule-local scratch: the IDB relation
+ * table outlives the per-iteration scratch, and later strata borrow this
+ * column through ray_table_get_col_idx.  Allocation failures are returned
+ * unchanged (NULL or error object). */
+static ray_t* dl_broadcast_const_col(int64_t nrows, int8_t type, int64_t val,
+                                      const ray_t* width_template) {
+    const bool is_sym = (type == RAY_SYM);
+    const bool is_f64 = (type == RAY_F64);
+    if (!is_sym && !is_f64 && type != RAY_I64) {
+        return ray_error("type", NULL);
+    }
+
+    ray_t* v;
+    if (is_sym) {
+        uint8_t width = RAY_SYM_W64;
+        if (width_template != NULL && width_template->type == RAY_SYM)
+            width = width_template->attrs & RAY_SYM_W_MASK;
+        v = ray_sym_vec_new(width, nrows);
+    } else {
+        v = ray_vec_new(type, nrows);
+    }
+    if (v == NULL || RAY_IS_ERR(v)) return v;
+    v->len = nrows;
+
+    if (is_sym) {
+        /* The generic writer copes with whatever width was chosen. */
+        void* cells = ray_data(v);
+        for (int64_t row = 0; row < nrows; row++)
+            ray_write_sym(cells, row, (uint64_t)val, v->type, v->attrs);
+    } else if (is_f64) {
+        /* `val` carries the double's bits; reinterpret, don't convert. */
+        double as_double;
+        memcpy(&as_double, &val, sizeof(as_double));
+        double* cells = (double*)ray_data(v);
+        for (int64_t row = 0; row < nrows; row++)
+            cells[row] = as_double;
+    } else {
+        /* RAY_I64 */
+        int64_t* cells = (int64_t*)ray_data(v);
+        for (int64_t row = 0; row < nrows; row++)
+            cells[row] = val;
+    }
+    return v;
+}
+
+/* Helper: append `col` to the projection table `out` under column `name`.
+ *
+ * The caller's reference on `col` is always released here (the table holds
+ * its own after a successful ray_table_add_col).  On failure the partially
+ * built `out` is released too, because ray_table_add_col does not free its
+ * input on error.
+ *
+ * Returns the table to keep building on, or an error value: a "memory"
+ * error when ray_table_add_col returned NULL, otherwise the error it
+ * produced.  After a failure the caller must not touch `out` again. */
+static ray_t* dl_project_push_col(ray_t* out, int64_t name, ray_t* col) {
+    ray_t* next = ray_table_add_col(out, name, col);
+    ray_release(col);
+    if (!next) {
+        ray_release(out);
+        return ray_error("memory", "dl_project: add_col");
+    }
+    if (RAY_IS_ERR(next)) ray_release(out);
+    return next;
+}
+
+/* Helper: project table to selected columns, producing output with head relation naming.
+ *
+ * For each output slot c (0 <= c < n_out):
+ *   - if col_indices[c] >= 0, copy that column from `tbl`
+ *   - else (constant slot), synthesize a broadcast column from head_consts[c]
+ *     with type head_const_types[c].
+ * Output columns are named head_rel->col_names[c].
+ *
+ * Returns:
+ *   - `tbl` itself, untouched, when it is NULL or an error value;
+ *   - a newly built table on success;
+ *   - an error value on failure ("domain" for a missing head relation /
+ *     slot map / source column / head constant, "type" for a column whose
+ *     element size is unknown, "memory" for allocation failures, or the
+ *     error reported by a callee).  The partially built output is released
+ *     on every failure path. */
+static ray_t* dl_project(ray_t* tbl, const int* col_indices, int n_out,
+                          dl_rel_t* head_rel, const int64_t* head_consts,
+                          const int8_t* head_const_types) {
+    if (!tbl || RAY_IS_ERR(tbl)) return tbl;
+    /* Every slot is named from head_rel->col_names and routed through
+     * col_indices, so a non-empty projection cannot proceed without them.
+     * Report it instead of dereferencing NULL inside the loop. */
+    if (n_out > 0 && (!head_rel || !col_indices))
+        return ray_error("domain", "dl_project: missing head relation or column map");
+    int64_t nrows = ray_table_nrows(tbl);
+    ray_t* out = ray_table_new(n_out);
+    if (!out || RAY_IS_ERR(out))
+        return out ? out : ray_error("memory", "dl_project: table_new");
+    /* If accum collapsed to zero rows (e.g. antijoin removed everything),
+     * its schema may have been dropped too.  Fall back to the IDB's existing
+     * column types so downstream table_union sees a matching schema. */
+    bool empty_accum = (nrows == 0);
+    for (int c = 0; c < n_out; c++) {
+        int src_idx = col_indices[c];
+        ray_t* col = NULL;  /* owned column to append for this slot */
+        if (src_idx >= 0) {
+            ray_t* src = ray_table_get_col_idx(tbl, src_idx);
+            if (!src) {
+                if (!(empty_accum && head_rel->table)) {
+                    ray_release(out);
+                    return ray_error("domain", "dl_project: source column missing");
+                }
+                ray_t* hcol = ray_table_get_col_idx(head_rel->table, c);
+                int8_t htype = hcol ? hcol->type : RAY_I64;
+                /* For SYM columns, preserve the head-relation's
+                 * adaptive-width attrs: ray_vec_new(RAY_SYM, ...) would
+                 * force W64 and a later table_union onto a narrower
+                 * head-rel column would hit the column-count check,
+                 * or worse, produce a width-mismatched merge. */
+                col = (htype == RAY_SYM && hcol)
+                    ? ray_sym_vec_new(hcol->attrs & RAY_SYM_W_MASK, 0)
+                    : ray_vec_new(htype, 0);
+                if (!col || RAY_IS_ERR(col)) {
+                    if (col) ray_error_free(col);
+                    ray_release(out);
+                    return ray_error("memory", "dl_project: empty col");
+                }
+            } else {
+                /* Reject unknown element sizes before allocating anything. */
+                uint8_t esz = ray_sym_elem_size(src->type, src->attrs);
+                if (esz == 0) {
+                    ray_release(out);
+                    return ray_error("type", "dl_project: unsupported column type");
+                }
+                /* Preserve SYM index width: ray_vec_new(RAY_SYM, ...) would
+                 * always produce a W64 vec, so memcpy'ing with the source's
+                 * narrower element size would leave the upper bytes of each
+                 * W64 slot uninitialized.  ray_sym_vec_new mirrors src's
+                 * attrs width. */
+                col = (src->type == RAY_SYM)
+                    ? ray_sym_vec_new(src->attrs & RAY_SYM_W_MASK, nrows)
+                    : ray_vec_new(src->type, nrows);
+                if (!col || RAY_IS_ERR(col)) {
+                    if (col) ray_error_free(col);
+                    ray_release(out);
+                    return ray_error("memory", "dl_project: vec_new");
+                }
+                col->len = nrows;
+                memcpy(ray_data(col), ray_data(src), (size_t)nrows * (size_t)esz);
+                /* RAY_STR stores 16-byte ray_str_t handles inline; strings
+                 * >12 bytes keep their bytes in a per-vector pool referenced
+                 * via pool_off.  The memcpy above copies the handles but not
+                 * the pool, so propagate the source's pool onto the copy or
+                 * later reads through pool_off would land in a NULL pool. */
+                if (src->type == RAY_STR) col_propagate_str_pool(col, src);
+            }
+        } else {
+            /* Constant head slot: materialize an owned broadcast column. */
+            int8_t ctype = head_const_types ? head_const_types[c] : 0;
+            if (ctype == 0) {
+                ray_release(out);
+                return ray_error("domain", "dl_project: unset head-const type");
+            }
+            if (!head_consts) {
+                ray_release(out);
+                return ray_error("domain", "dl_project: missing head-const values");
+            }
+            /* When the head relation's slot is an existing SYM column
+             * (from a prior aligned rule), match its width so
+             * table_union's ray_vec_concat doesn't reject a W64 vs
+             * narrow mismatch. */
+            const ray_t* width_tpl = NULL;
+            if (ctype == RAY_SYM && head_rel->table)
+                width_tpl = ray_table_get_col_idx(head_rel->table, c);
+            col = dl_broadcast_const_col(nrows, ctype, head_consts[c], width_tpl);
+            if (!col || RAY_IS_ERR(col)) {
+                ray_release(out);
+                return col ? col : ray_error("memory", "dl_project: broadcast");
+            }
+        }
+        /* Hands `col` over; on failure `out` has already been released. */
+        out = dl_project_push_col(out, head_rel->col_names[c], col);
+        if (!out || RAY_IS_ERR(out)) return out;
+    }
+    return out;
+}
+
+ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule,
+                          int delta_pos, int rule_idx, ray_graph_t* g) {
+    /* Materializing approach: execute body atoms one at a time.
+     *
+     * For each positive body atom, we get the relation table and apply
+     * constant filters. Then join with the accumulated result.
+     * Variable bindings track which column in the accumulated table
+     * holds each variable's value.
+     *
+     * var_col[v] = column index in `accum` table for variable v.
+     */
+    int var_col[DL_MAX_ARITY * DL_MAX_BODY];  /* column index in accum per variable */
+    bool var_bound[DL_MAX_ARITY * DL_MAX_BODY];
+    memset(var_bound, 0, sizeof(var_bound));
+    memset(var_col, -1, sizeof(var_col));
+
+    ray_t* accum = NULL;  /* accumulated result table */
+
+    for (int b = 0; b < rule->n_body; b++) {
+        dl_body_t* body = &rule->body[b];
+        if (body->type != DL_POS) continue;
+
+        int rel_idx = dl_find_rel(prog, body->pred);
+        if (rel_idx < 0) { if (accum) ray_release(accum); return NULL; }
+        dl_rel_t* rel = &prog->rels[rel_idx];
+        ray_t* body_tbl = rel->table;
+        ray_retain(body_tbl);
+
+        /* Apply constant filters */
+        for (int c = 0; c < body->arity; c++) {
+            if (body->vars[c] == DL_CONST) {
+                ray_t* filtered = dl_filter_eq(body_tbl, c, body->const_vals[c]);
+                ray_release(body_tbl);
+                if (!filtered) {
+                    /* Treat as genuine failure — dl_filter_eq returns an
+                     * owned reference on every non-NULL path, so NULL
+                     * means something went wrong inside the helper. */
+                    if (accum) ray_release(accum);
+                    prog->eval_err = true;
+                    return NULL;
+                }
+                if (RAY_IS_ERR(filtered)) {
+                    ray_error_free(filtered);
+                    if (accum) ray_release(accum);
+                    prog->eval_err = true;
+                    return NULL;
+                }
+                body_tbl = filtered;
+            }
+        }
+
+        if (accum == NULL) {
+            /* First body atom: accum = body_tbl */
+            accum = body_tbl;
+            /* Bind variables to column indices */
+            for (int c = 0; c < body->arity; c++) {
+                int v = body->vars[c];
+                if (v == DL_CONST) continue;
+                if (!var_bound[v]) {
+                    var_bound[v] = true;
+                    var_col[v] = c;
+                }
+            }
+        } else {
+            /* Join accum with body_tbl on shared variables */
+            int lkeys[DL_MAX_ARITY], rkeys[DL_MAX_ARITY];
+            int n_jk = 0;
+            for (int c = 0; c < body->arity; c++) {
+                int v = body->vars[c];
+                if (v == DL_CONST) continue;
+                if (var_bound[v]) {
+                    lkeys[n_jk] = var_col[v];
+                    rkeys[n_jk] = c;
+                    n_jk++;
+                }
+            }
+
+            ray_t* joined;
+            if (n_jk > 0) {
+                joined = dl_join_tables(accum, body_tbl, lkeys, rkeys, n_jk);
+            } else {
+                /* Cross product: use dummy key */
+                int lk0 = 0, rk0 = 0;
+                joined = dl_join_tables(accum, body_tbl, &lk0, &rk0, 0);
+            }
+
+            int64_t accum_ncols = ray_table_ncols(accum);
+            ray_release(accum);
+            ray_release(body_tbl);
+            accum = joined;
+
+            /* Bind new variables: their columns come after left columns in join output.
+             * Join output = [all left cols] + [non-key right cols].
+             * We need to track which right columns appear in output. */
+            int right_col_map[DL_MAX_ARITY]; /* right col c -> output col idx */
+            int out_idx = (int)accum_ncols;
+            for (int c = 0; c < body->arity; c++) {
+                bool is_key = false;
+                for (int k = 0; k < n_jk; k++) {
+                    if (rkeys[k] == c) { is_key = true; break; }
+                }
+                if (is_key) {
+                    right_col_map[c] = -1;  /* key col not in output */
+                } else {
+                    right_col_map[c] = out_idx++;
+                }
+            }
+
+            for (int c = 0; c < body->arity; c++) {
+                int v = body->vars[c];
+                if (v == DL_CONST) continue;
+                if (!var_bound[v]) {
+                    var_bound[v] = true;
+                    var_col[v] = right_col_map[c];
+                }
+            }
+        }
+    }
+
+    /* Rules with only aggregates (no positive body atoms) still need a
+     * one-row binding environment so aggregate results can be projected. */
+    if (!accum) {
+        bool has_agg = false;
+        for (int bi = 0; bi < rule->n_body; bi++) {
+            if (rule->body[bi].type == DL_AGG) {
+                has_agg = true;
+                break;
+            }
+        }
+        if (!has_agg)
+            return NULL;
+        ray_t* one_val = ray_vec_new(RAY_I64, 1);
+        if (!one_val) { prog->eval_err = true; return NULL; }
+        if (RAY_IS_ERR(one_val)) {
+            ray_error_free(one_val);
+            prog->eval_err = true;
+            return NULL;
+        }
+        one_val->len = 1;
+        ((int64_t*)ray_data(one_val))[0] = 0;
+        accum = ray_table_new(1);
+        if (!accum) {
+            ray_release(one_val);
+            prog->eval_err = true;
+            return NULL;
+        }
+        if (RAY_IS_ERR(accum)) {
+            ray_error_free(accum);
+            ray_release(one_val);
+            prog->eval_err = true;
+            return NULL;
+        }
+        int64_t unit_sym = ray_sym_intern("_unit", 5);
+        ray_t* accum_unit = ray_table_add_col(accum, unit_sym, one_val);
+        ray_release(one_val);
+        /* ray_table_add_col doesn't free `accum` on error — release it
+         * ourselves so the partially-built table isn't leaked. */
+        if (!accum_unit) {
+            ray_release(accum);
+            prog->eval_err = true;
+            return NULL;
+        }
+        if (RAY_IS_ERR(accum_unit)) {
+            ray_error_free(accum_unit);
+            ray_release(accum);
+            prog->eval_err = true;
+            return NULL;
+        }
+        accum = accum_unit;
+    }
+
+    if (!accum) return NULL;
+
+    /* Process non-join body literals in declared order.
+     * This ensures dependencies between literals (e.g., interval bind before
+     * assignment, assignment before comparison) are respected. */
+    for (int b = 0; b < rule->n_body; b++) {
+        dl_body_t* body = &rule->body[b];
+        if (body->type == DL_POS) continue;  /* already processed above */
+        if (!accum || RAY_IS_ERR(accum)) break;
+
+        switch (body->type) {
+        case DL_NEG: {
+            int rel_idx = dl_find_rel(prog, body->pred);
+            if (rel_idx < 0) { ray_release(accum); return NULL; }
+            dl_rel_t* rel = &prog->rels[rel_idx];
+
+            /* Apply constant filters to the negated relation first */
+            ray_t* neg_tbl = rel->table;
+            ray_retain(neg_tbl);
+            for (int c = 0; c < body->arity; c++) {
+                if (body->vars[c] == DL_CONST) {
+                    ray_t* filtered = dl_filter_eq(neg_tbl, c, body->const_vals[c]);
+                    ray_release(neg_tbl);
+                    if (!filtered) {
+                        ray_release(accum);
+                        prog->eval_err = true;
+                        return NULL;
+                    }
+                    if (RAY_IS_ERR(filtered)) {
+                        ray_error_free(filtered);
+                        ray_release(accum);
+                        prog->eval_err = true;
+                        return NULL;
+                    }
+                    neg_tbl = filtered;
+                }
+            }
+
+            int lkeys[DL_MAX_ARITY], rkeys[DL_MAX_ARITY];
+            int n_keys = 0;
+            for (int c = 0; c < body->arity; c++) {
+                int v = body->vars[c];
+                if (v == DL_CONST) continue;
+                if (var_bound[v]) {
+                    lkeys[n_keys] = var_col[v];
+                    rkeys[n_keys] = c;
+                    n_keys++;
+                }
+            }
+
+            if (n_keys > 0) {
+                ray_t* result = dl_antijoin_tables(accum, neg_tbl, lkeys, rkeys, n_keys);
+                ray_release(accum);
+                accum = result;
+            }
+            ray_release(neg_tbl);
+            break;
+        }
+
+        case DL_ASSIGN: {
+            int64_t nrows = ray_table_nrows(accum);
+            ray_t* new_col = dl_eval_expr(body->assign_expr, accum, var_col, nrows);
+            /* Silently breaking would leave assign_var unbound and let
+             * the rest of the rule keep compiling with stale bindings,
+             * producing a dl_eval == 0 return alongside wrong rows. */
+            if (!new_col) {
+                ray_release(accum);
+                prog->eval_err = true;
+                return NULL;
+            }
+            if (RAY_IS_ERR(new_col)) {
+                ray_error_free(new_col);
+                ray_release(accum);
+                prog->eval_err = true;
+                return NULL;
+            }
+
+            int new_col_idx = (int)ray_table_ncols(accum);
+            char colname[32];
+            snprintf(colname, sizeof(colname), "_a%d", body->assign_var);
+            ray_t* new_accum = dl_table_add_computed_col(accum, new_col, colname);
+            ray_release(new_col);
+            ray_release(accum);
+            if (!new_accum) { prog->eval_err = true; return NULL; }
+            if (RAY_IS_ERR(new_accum)) {
+                ray_error_free(new_accum);
+                prog->eval_err = true;
+                return NULL;
+            }
+            accum = new_accum;
+
+            var_bound[body->assign_var] = true;
+            var_col[body->assign_var] = new_col_idx;
+            break;
+        }
+
+        case DL_AGG: {
+            if (body->agg_n_group_keys > 0) {
+                /* Grouped aggregation: use rayforce's ray_group on src_table.
+                 *
+                 * Mixed-rule guard: this path assumes accum is the singleton
+                 * _unit placeholder created for aggregate-only rules. If the
+                 * rule has real positive body atoms, accum carries bound
+                 * variables from a prior join that we would need to intersect
+                 * against the group result — not yet supported. Bail early. */
+                bool has_pos = false;
+                for (int bi = 0; bi < rule->n_body; bi++) {
+                    if (rule->body[bi].type == DL_POS) { has_pos = true; break; }
+                }
+                if (has_pos) {
+                    /* nyi: grouped aggregate + positive body atoms.
+                     * Surface via eval_err so dl_eval reports failure
+                     * instead of writing a warning to stderr in a
+                     * non-debug build. */
+                    ray_release(accum);
+                    prog->eval_err = true;
+                    return NULL;
+                }
+
+                int src_idx = dl_find_rel(prog, body->agg_pred);
+                if (src_idx < 0) { ray_release(accum); return NULL; }
+                ray_t* src_table = prog->rels[src_idx].table;
+                int64_t src_nrows = (src_table && !RAY_IS_ERR(src_table))
+                    ? ray_table_nrows(src_table) : 0;
+                if (src_nrows == 0) {
+                    /* No source rows -> no groups -> rule produces no head tuples. */
+                    ray_release(accum);
+                    return NULL;
+                }
+
+                dl_rel_t* src_rel = &prog->rels[src_idx];
+                int nk = body->agg_n_group_keys;
+
+                /* Build a sub-graph that SCANs src_table's columns by symbol name.
+                 * ray_graph_new retains src_table internally; no extra retain needed. */
+                ray_graph_t* gg = ray_graph_new(src_table);
+                if (!gg) {
+                    ray_release(accum);
+                    prog->eval_err = true;
+                    return NULL;
+                }
+
+                ray_op_t* keys_ops[DL_AGG_MAX_KEYS];
+                for (int i = 0; i < nk; i++) {
+                    int kc = body->agg_group_key_cols[i];
+                    if (kc < 0 || kc >= src_rel->arity) {
+                        ray_graph_free(gg);
+                        ray_release(accum);
+                        prog->eval_err = true;
+                        return NULL;
+                    }
+                    int64_t sym = src_rel->col_names[kc];
+                    ray_t* s = ray_sym_str(sym);
+                    if (!s) {
+                        ray_graph_free(gg);
+                        ray_release(accum);
+                        prog->eval_err = true;
+                        return NULL;
+                    }
+                    keys_ops[i] = ray_scan(gg, ray_str_ptr(s));
+                    if (!keys_ops[i]) {
+                        ray_graph_free(gg);
+                        ray_release(accum);
+                        prog->eval_err = true;
+                        return NULL;
+                    }
+                }
+
+                /* Agg input: value column (for COUNT we still pass a column; any
+                 * column works since COUNT only counts rows).  Must be bounds-
+                 * checked — silently clamping to 0 would compute a valid-looking
+                 * but wrong result over an unrelated column. */
+                int value_col = body->agg_value_col;
+                if (body->agg_op != DL_AGG_COUNT &&
+                    (value_col < 0 || value_col >= src_rel->arity)) {
+                    ray_graph_free(gg);
+                    ray_release(accum);
+                    prog->eval_err = true;
+                    return NULL;
+                }
+                if (value_col < 0 || value_col >= src_rel->arity) value_col = 0;
+                ray_t* vs = ray_sym_str(src_rel->col_names[value_col]);
+                if (!vs) {
+                    ray_graph_free(gg);
+                    ray_release(accum);
+                    prog->eval_err = true;
+                    return NULL;
+                }
+                ray_op_t* agg_in = ray_scan(gg, ray_str_ptr(vs));
+                if (!agg_in) {
+                    ray_graph_free(gg);
+                    ray_release(accum);
+                    prog->eval_err = true;
+                    return NULL;
+                }
+
+                uint16_t op_code;
+                switch (body->agg_op) {
+                    case DL_AGG_COUNT: op_code = OP_COUNT; break;
+                    case DL_AGG_SUM:   op_code = OP_SUM;   break;
+                    case DL_AGG_MIN:   op_code = OP_MIN;   break;
+                    case DL_AGG_MAX:   op_code = OP_MAX;   break;
+                    case DL_AGG_AVG:   op_code = OP_AVG;   break;
+                    default:
+                        ray_graph_free(gg);
+                        ray_release(accum); return NULL;
+                }
+
+                ray_op_t* ag_ins[1] = { agg_in };
+                ray_op_t* root = ray_group(gg, keys_ops, (uint8_t)nk, &op_code, ag_ins, 1);
+                if (!root) {
+                    ray_graph_free(gg);
+                    ray_release(accum);
+                    prog->eval_err = true;
+                    return NULL;
+                }
+                ray_t* group_tbl = ray_execute(gg, root);
+                ray_graph_free(gg);
+
+                if (!group_tbl) {
+                    ray_release(accum);
+                    prog->eval_err = true;
+                    return NULL;
+                }
+                if (RAY_IS_ERR(group_tbl)) {
+                    ray_error_free(group_tbl);
+                    ray_release(accum);
+                    prog->eval_err = true;
+                    return NULL;
+                }
+
+                /* Replace accum with group_tbl (schema: key0..key{nk-1}, agg).
+                 * This is valid because the DL_AGG case for aggregate-only rules
+                 * created a singleton _unit accum that we can discard. Mixed
+                 * rules (body atoms + grouped agg) are not supported here; they
+                 * would require a join on shared vars and fall under A5/later. */
+                ray_release(accum);
+                accum = group_tbl;
+
+                /* Bind key variables to the key columns in the group output */
+                for (int i = 0; i < nk; i++) {
+                    int kv = body->agg_group_key_vars[i];
+                    var_bound[kv] = true;
+                    var_col[kv] = i;
+                }
+                /* Bind target variable to the aggregate column (last column) */
+                var_bound[body->agg_target_var] = true;
+                var_col[body->agg_target_var] = nk;  /* agg column immediately follows keys */
+                break;
+            }
+            /* -------- existing scalar path below unchanged -------- */
+            int src_idx = dl_find_rel(prog, body->agg_pred);
+            if (src_idx < 0) {
+                ray_release(accum);
+                return NULL;
+            }
+            dl_rel_t* src_rel_s = &prog->rels[src_idx];
+            ray_t* src_table = src_rel_s->table;
+            int64_t src_nrows = (src_table && !RAY_IS_ERR(src_table))
+                ? ray_table_nrows(src_table)
+                : 0;
+
+            /* Bounds-check value column up front for every value-taking op
+             * (SUM/MIN/MAX/AVG).  Must happen before the empty-source early
+             * returns below, otherwise an out-of-range index on an empty
+             * source would silently emit the SUM identity 0 / 0.0. */
+            bool need_value_col = (body->agg_op == DL_AGG_SUM
+                                   || body->agg_op == DL_AGG_MIN
+                                   || body->agg_op == DL_AGG_MAX
+                                   || body->agg_op == DL_AGG_AVG);
+            if (need_value_col &&
+                (body->agg_value_col < 0 ||
+                 body->agg_value_col >= src_rel_s->arity)) {
+                ray_release(accum);
+                prog->eval_err = true;
+                return NULL;
+            }
+
+            if (src_nrows == 0 && (body->agg_op == DL_AGG_MIN
+                     || body->agg_op == DL_AGG_MAX
+                     || body->agg_op == DL_AGG_AVG)) {
+                /* Empty-source: MIN/MAX/AVG emit no row (matches rayforce core's domain
+                 * error / typed-null semantics). COUNT and SUM keep their identities (0). */
+                ray_release(accum);
+                return NULL;
+            }
+
+            int64_t result_i = 0;
+            double  result_f = 0.0;
+            bool    is_avg   = (body->agg_op == DL_AGG_AVG);
+            /* Float promotion: AVG always emits f64; SUM/MIN/MAX track their
+             * source column type (i64 in -> i64 out; f64 in -> f64 out).
+             * COUNT is always i64.  For empty SUM, we still need to inspect
+             * the column type so the identity (0 / 0.0) is emitted in the
+             * correct result type. */
+            bool    is_float = is_avg;
+            if (need_value_col) {
+                ray_t* vc0 = ray_table_get_col_idx(src_table, body->agg_value_col);
+                if (vc0) {
+                    if (vc0->type == RAY_F64) {
+                        is_float = true;
+                    } else if (vc0->type != RAY_I64) {
+                        /* Non-numeric source: reject regardless of row count. */
+                        ray_release(accum);
+                        prog->eval_err = true;
+                        return NULL;
+                    }
+                }
+            }
+            switch (body->agg_op) {
+            case DL_AGG_COUNT:
+                result_i = src_nrows;
+                break;
+            case DL_AGG_SUM:
+            case DL_AGG_MIN:
+            case DL_AGG_MAX:
+            case DL_AGG_AVG:
+                if (src_nrows > 0) {
+                    ray_t* val_col =
+                        ray_table_get_col_idx(src_table, body->agg_value_col);
+                    if (!val_col) {
+                        ray_release(accum);
+                        return NULL;
+                    }
+                    if (val_col->type == RAY_I64) {
+                        int64_t* vd = (int64_t*)ray_data(val_col);
+                        if (body->agg_op == DL_AGG_SUM) {
+                            result_i = 0;
+                            for (int64_t i = 0; i < src_nrows; i++)
+                                result_i += vd[i];
+                        } else if (body->agg_op == DL_AGG_MIN) {
+                            result_i = vd[0];
+                            for (int64_t i = 1; i < src_nrows; i++) {
+                                if (vd[i] < result_i)
+                                    result_i = vd[i];
+                            }
+                        } else if (body->agg_op == DL_AGG_MAX) {
+                            result_i = vd[0];
+                            for (int64_t i = 1; i < src_nrows; i++) {
+                                if (vd[i] > result_i)
+                                    result_i = vd[i];
+                            }
+                        } else { /* DL_AGG_AVG */
+                            int64_t acc = 0;
+                            for (int64_t i = 0; i < src_nrows; i++)
+                                acc += vd[i];
+                            result_f = (double)acc / (double)src_nrows;
+                        }
+                    } else if (val_col->type == RAY_F64) {
+                        is_float = true;  /* SUM/MIN/MAX promote to f64 */
+                        double* vd = (double*)ray_data(val_col);
+                        if (body->agg_op == DL_AGG_SUM) {
+                            result_f = 0.0;
+                            for (int64_t i = 0; i < src_nrows; i++)
+                                result_f += vd[i];
+                        } else if (body->agg_op == DL_AGG_MIN) {
+                            result_f = vd[0];
+                            for (int64_t i = 1; i < src_nrows; i++) {
+                                if (vd[i] < result_f)
+                                    result_f = vd[i];
+                            }
+                        } else if (body->agg_op == DL_AGG_MAX) {
+                            result_f = vd[0];
+                            for (int64_t i = 1; i < src_nrows; i++) {
+                                if (vd[i] > result_f)
+                                    result_f = vd[i];
+                            }
+                        } else { /* DL_AGG_AVG */
+                            double acc = 0.0;
+                            for (int64_t i = 0; i < src_nrows; i++)
+                                acc += vd[i];
+                            result_f = acc / (double)src_nrows;
+                        }
+                    } else {
+                        /* Non-numeric source column — reject loudly rather than
+                         * silently returning zero. */
+                        ray_release(accum);
+                        prog->eval_err = true;
+                        return NULL;
+                    }
+                }
+                break;
+            default:
+                break;
+            }
+
+            int64_t nrows = ray_table_nrows(accum);
+            if (nrows == 0)
+                break;
+            ray_t* new_col = ray_vec_new(is_float ? RAY_F64 : RAY_I64, nrows);
+            /* Silent break would leave agg_target_var unbound and eval
+             * would keep running with a partially-constructed rule —
+             * surface the allocation failure so dl_eval returns -1. */
+            if (!new_col) {
+                ray_release(accum);
+                prog->eval_err = true;
+                return NULL;
+            }
+            if (RAY_IS_ERR(new_col)) {
+                ray_error_free(new_col);
+                ray_release(accum);
+                prog->eval_err = true;
+                return NULL;
+            }
+            new_col->len = nrows;
+            if (is_float) {
+                double* nd = (double*)ray_data(new_col);
+                for (int64_t r = 0; r < nrows; r++) nd[r] = result_f;
+            } else {
+                int64_t* nd = (int64_t*)ray_data(new_col);
+                for (int64_t r = 0; r < nrows; r++) nd[r] = result_i;
+            }
+
+            int new_col_idx = (int)ray_table_ncols(accum);
+            char colname[32];
+            snprintf(colname, sizeof(colname), "_g%d", body->agg_target_var);
+            ray_t* new_accum = dl_table_add_computed_col(accum, new_col, colname);
+            ray_release(new_col);
+            ray_release(accum);
+            accum = new_accum;
+
+            var_bound[body->agg_target_var] = true;
+            var_col[body->agg_target_var] = new_col_idx;
+            break;
+        }
+
+        case DL_BUILTIN: {
+            switch (body->builtin_id) {
+            case DL_BUILTIN_BEFORE: {
+                int s_col = var_col[body->vars[0]];
+                int t_col = var_col[body->vars[2]];
+                ray_t* filtered = dl_builtin_before(accum, s_col, t_col);
+                ray_release(accum);
+                accum = filtered;
+                break;
+            }
+            case DL_BUILTIN_DURATION_SINCE: {
+                int t1_col = var_col[body->vars[0]];
+                int t2_col = var_col[body->vars[1]];
+                int d_var = body->vars[2];
+                int new_idx = (int)ray_table_ncols(accum);
+                char colname[32];
+                snprintf(colname, sizeof(colname), "_d%d", d_var);
+                ray_t* result = dl_builtin_duration_since(accum, t1_col, t2_col, colname);
+                ray_release(accum);
+                accum = result;
+                var_bound[d_var] = true;
+                var_col[d_var] = new_idx;
+                break;
+            }
+            case DL_BUILTIN_ABS: {
+                int x_col = var_col[body->vars[0]];
+                int y_var = body->vars[1];
+                int new_idx = (int)ray_table_ncols(accum);
+                char colname[32];
+                snprintf(colname, sizeof(colname), "_abs%d", y_var);
+                ray_t* result = dl_builtin_abs(accum, x_col, colname);
+                ray_release(accum);
+                accum = result;
+                var_bound[y_var] = true;
+                var_col[y_var] = new_idx;
+                break;
+            }
+            }
+            break;
+        }
+
+        case DL_CMP: {
+            int64_t nrows = ray_table_nrows(accum);
+            if (nrows == 0) break;
+
+            ray_t* lhs_evaled = NULL;
+            ray_t* rhs_evaled = NULL;
+            ray_t* lhs_src = NULL;  /* borrowed reference for type inspection */
+            ray_t* rhs_src = NULL;
+
+            if (body->cmp_lhs_expr) {
+                lhs_evaled = dl_eval_expr(body->cmp_lhs_expr, accum, var_col, nrows);
+                /* LHS evaluation failure can't be silently skipped — a
+                 * missing filter changes the query's answer. */
+                if (!lhs_evaled) {
+                    prog->eval_err = true;
+                    ray_release(accum);
+                    return NULL;
+                }
+                if (RAY_IS_ERR(lhs_evaled)) {
+                    ray_error_free(lhs_evaled);
+                    prog->eval_err = true;
+                    ray_release(accum);
+                    return NULL;
+                }
+                lhs_src = lhs_evaled;
+            } else {
+                int lhs_col = var_col[body->cmp_lhs];
+                lhs_src = ray_table_get_col_idx(accum, lhs_col);
+                if (!lhs_src) {
+                    prog->eval_err = true;
+                    ray_release(accum);
+                    return NULL;
+                }
+            }
+
+            if (body->cmp_rhs_expr) {
+                rhs_evaled = dl_eval_expr(body->cmp_rhs_expr, accum, var_col, nrows);
+                if (!rhs_evaled) {
+                    if (lhs_evaled) ray_release(lhs_evaled);
+                    prog->eval_err = true;
+                    ray_release(accum);
+                    return NULL;
+                }
+                if (RAY_IS_ERR(rhs_evaled)) {
+                    ray_error_free(rhs_evaled);
+                    if (lhs_evaled) ray_release(lhs_evaled);
+                    prog->eval_err = true;
+                    ray_release(accum);
+                    return NULL;
+                }
+                rhs_src = rhs_evaled;
+            } else if (body->cmp_rhs != DL_CONST) {
+                int rhs_col = var_col[body->cmp_rhs];
+                rhs_src = ray_table_get_col_idx(accum, rhs_col);
+                if (!rhs_src) {
+                    if (lhs_evaled) ray_release(lhs_evaled);
+                    prog->eval_err = true;
+                    ray_release(accum);
+                    return NULL;
+                }
+            }
+            /* else rhs is a constant i64 body->cmp_const */
+
+            /* Reject non-numeric sources — DL_CMP has no meaningful
+             * comparison for SYM/STR columns without an ordering hook. */
+            bool lhs_is_f64 = lhs_src && lhs_src->type == RAY_F64;
+            bool rhs_is_f64 = rhs_src && rhs_src->type == RAY_F64;
+            if (lhs_src && lhs_src->type != RAY_I64 && lhs_src->type != RAY_F64) {
+                if (lhs_evaled) ray_release(lhs_evaled);
+                if (rhs_evaled) ray_release(rhs_evaled);
+                prog->eval_err = true;
+                ray_release(accum);
+                return NULL;
+            }
+            if (rhs_src && rhs_src->type != RAY_I64 && rhs_src->type != RAY_F64) {
+                if (lhs_evaled) ray_release(lhs_evaled);
+                if (rhs_evaled) ray_release(rhs_evaled);
+                prog->eval_err = true;
+                ray_release(accum);
+                return NULL;
+            }
+
+            /* Promote to f64 iff either side is f64.  Otherwise stay in
+             * i64 arithmetic for speed and exact integer semantics. */
+            bool use_f64 = lhs_is_f64 || rhs_is_f64;
+            const int64_t* lhs_i = !use_f64 ? (const int64_t*)ray_data(lhs_src) : NULL;
+            const int64_t* rhs_i = !use_f64 && rhs_src ? (const int64_t*)ray_data(rhs_src) : NULL;
+            const double*  lhs_f = use_f64 && !lhs_is_f64 ? NULL
+                                 : (use_f64 ? (const double*)ray_data(lhs_src) : NULL);
+            const double*  rhs_f = use_f64 && rhs_src && rhs_is_f64
+                                 ? (const double*)ray_data(rhs_src) : NULL;
+
+            ray_t* mask_block = ray_alloc((size_t)nrows * sizeof(bool));
+            if (!mask_block) {
+                if (lhs_evaled) ray_release(lhs_evaled);
+                if (rhs_evaled) ray_release(rhs_evaled);
+                break;
+            }
+            bool* mask = (bool*)ray_data(mask_block);
+            int64_t count = 0;
+            for (int64_t r = 0; r < nrows; r++) {
+                bool pass = false;
+                if (use_f64) {
+                    /* Widen the non-f64 side — mixed arithmetic is already
+                     * supported by dl_eval_expr, and DL_CMP_const is i64. */
+                    double lv = lhs_is_f64 ? lhs_f[r] : (double)((const int64_t*)ray_data(lhs_src))[r];
+                    double rv;
+                    if (rhs_src)
+                        rv = rhs_is_f64 ? rhs_f[r] : (double)((const int64_t*)ray_data(rhs_src))[r];
+                    else
+                        rv = (double)body->cmp_const;
+                    switch (body->cmp_op) {
+                    case DL_CMP_EQ: pass = (lv == rv); break;
+                    case DL_CMP_NE: pass = (lv != rv); break;
+                    case DL_CMP_LT: pass = (lv <  rv); break;
+                    case DL_CMP_LE: pass = (lv <= rv); break;
+                    case DL_CMP_GT: pass = (lv >  rv); break;
+                    case DL_CMP_GE: pass = (lv >= rv); break;
+                    }
+                } else {
+                    int64_t lv = lhs_i[r];
+                    int64_t rv = rhs_i ? rhs_i[r] : body->cmp_const;
+                    switch (body->cmp_op) {
+                    case DL_CMP_EQ: pass = (lv == rv); break;
+                    case DL_CMP_NE: pass = (lv != rv); break;
+                    case DL_CMP_LT: pass = (lv <  rv); break;
+                    case DL_CMP_LE: pass = (lv <= rv); break;
+                    case DL_CMP_GT: pass = (lv >  rv); break;
+                    case DL_CMP_GE: pass = (lv >= rv); break;
+                    }
+                }
+                (void)lhs_f;  /* silence unused warnings in non-f64 paths */
+                mask[r] = pass;
+                if (pass) count++;
+            }
+
+            if (lhs_evaled) ray_release(lhs_evaled);
+            if (rhs_evaled) ray_release(rhs_evaled);
+
+            if (count == nrows) {
+                ray_free(mask_block);
+                break;  /* all rows pass */
+            }
+
+            /* Build filtered table — element-size-aware memcpy so f64
+             * columns and narrow-SYM columns survive the mask unchanged.
+             * Silently `continue`-ing past missing columns would yield
+             * a table with fewer columns than accum, breaking schema
+             * invariants in downstream table_union.  Treat every such
+             * failure as unrecoverable. */
+            int64_t ncols = ray_table_ncols(accum);
+            ray_t* out = ray_table_new((int)ncols);
+            if (!out || RAY_IS_ERR(out)) {
+                if (out && RAY_IS_ERR(out)) ray_error_free(out);
+                ray_free(mask_block);
+                ray_release(accum);
+                prog->eval_err = true;
+                return NULL;
+            }
+            for (int64_t c = 0; c < ncols; c++) {
+                ray_t* src = ray_table_get_col_idx(accum, c);
+                if (!src) {
+                    ray_release(out); ray_free(mask_block);
+                    ray_release(accum);
+                    prog->eval_err = true;
+                    return NULL;
+                }
+                ray_t* dst = (src->type == RAY_SYM)
+                    ? ray_sym_vec_new(src->attrs & RAY_SYM_W_MASK, count)
+                    : ray_vec_new(src->type, count);
+                if (!dst) {
+                    ray_release(out); ray_free(mask_block);
+                    ray_release(accum);
+                    prog->eval_err = true;
+                    return NULL;
+                }
+                if (RAY_IS_ERR(dst)) {
+                    ray_error_free(dst);
+                    ray_release(out); ray_free(mask_block);
+                    ray_release(accum);
+                    prog->eval_err = true;
+                    return NULL;
+                }
+                dst->len = count;
+                uint8_t esz = ray_sym_elem_size(src->type, src->attrs);
+                const uint8_t* sb = (const uint8_t*)ray_data(src);
+                uint8_t* db = (uint8_t*)ray_data(dst);
+                int64_t j = 0;
+                for (int64_t r = 0; r < nrows; r++)
+                    if (mask[r]) {
+                        memcpy(db + (size_t)j * esz, sb + (size_t)r * esz, esz);
+                        j++;
+                    }
+                if (src->type == RAY_STR) col_propagate_str_pool(dst, src);
+                ray_t* next = ray_table_add_col(out, ray_table_col_name(accum, c), dst);
+                ray_release(dst);
+                if (!next) {
+                    ray_release(out); ray_free(mask_block);
+                    ray_release(accum);
+                    prog->eval_err = true;
+                    return NULL;
+                }
+                if (RAY_IS_ERR(next)) {
+                    ray_error_free(next);
+                    ray_release(out); ray_free(mask_block);
+                    ray_release(accum);
+                    prog->eval_err = true;
+                    return NULL;
+                }
+                out = next;
+            }
+            ray_free(mask_block);
+            ray_release(accum);
+            accum = out;
+            break;
+        }
+
+        case DL_INTERVAL: {
+            int fact_col = var_col[body->interval_fact_var];
+            int start_col = fact_col;
+            int end_col = fact_col + 1;
+
+            var_bound[body->interval_start_var] = true;
+            var_col[body->interval_start_var] = start_col;
+
+            var_bound[body->interval_end_var] = true;
+            var_col[body->interval_end_var] = end_col;
+            break;
+        }
+        } /* switch */
+    }
+
+    /* Project to head variables */
+    int head_idx = dl_find_rel(prog, rule->head_pred);
+    if (head_idx < 0) { ray_release(accum); return NULL; }
+    dl_rel_t* head_rel = &prog->rels[head_idx];
+
+    int proj_cols[DL_MAX_ARITY];
+    for (int c = 0; c < rule->head_arity; c++) {
+        int v = rule->head_vars[c];
+        if (v == DL_CONST) {
+            proj_cols[c] = -1;
+        } else {
+            proj_cols[c] = var_col[v];
+        }
+    }
+
+    ray_t* projected = dl_project(accum, proj_cols, rule->head_arity, head_rel,
+                                   rule->head_consts, rule->head_const_types);
+    ray_release(accum);
+
+    /* dl_project now surfaces hard failures (alloc OOM, type errors, add-col
+     * errors) as RAY_ERROR objects.  Catch those here and flag the program
+     * so dl_eval can return -1 instead of silently dropping the rule's
+     * output via the const_table/execute chain. */
+    if (!projected) return NULL;
+    if (RAY_IS_ERR(projected)) {
+        ray_error_free(projected);
+        prog->eval_err = true;
+        return NULL;
+    }
+
+    /* Store result in the graph as a const_table so the caller can execute */
+    ray_op_t* result_node = ray_const_table(g, projected);
+    ray_release(projected);
+    return result_node;
+}
+
+/* ========================================================================
+ * Table utilities for fixpoint evaluation
+ * ======================================================================== */
+
+/* Rename table columns to match the head relation's expected names.
+ * This is needed because ray_select output column names come from the scan
+ * nodes (e.g., "edge__c0"), but we need them to match the head relation
+ * (e.g., "path__c0").
+ *
+ * Returns:
+ *   - `tbl` itself (not retained) when it is NULL or a RAY_ERROR;
+ *   - `tbl` retained when no rename applies (no columns, column count
+ *     differs from target_rel->arity, or every name already matches);
+ *   - otherwise a new owned table that shares `tbl`'s column data;
+ *   - a RAY_ERROR when the renamed table cannot be built. */
+static ray_t* table_rename_cols(ray_t* tbl, dl_rel_t* target_rel) {
+    if (!tbl || RAY_IS_ERR(tbl)) return tbl;
+    int64_t ncols = ray_table_ncols(tbl);
+    if (ncols <= 0) { ray_retain(tbl); return tbl; }
+
+    int arity = target_rel->arity;
+    if (ncols != arity) { ray_retain(tbl); return tbl; }
+
+    /* Check if renaming is needed */
+    bool needs_rename = false;
+    for (int c = 0; c < arity; c++) {
+        if (ray_table_col_name(tbl, c) != target_rel->col_names[c]) {
+            needs_rename = true;
+            break;
+        }
+    }
+    if (!needs_rename) { ray_retain(tbl); return tbl; }
+
+    /* Build new table with correct column names sharing the same column data.
+     * Every step is checked: a skipped column or an ignored add_col failure
+     * would hand the caller a table narrower than the head relation. */
+    ray_t* out = ray_table_new(arity);
+    if (!out || RAY_IS_ERR(out))
+        return out ? out : ray_error("memory", "table_rename_cols: table_new");
+    for (int c = 0; c < arity; c++) {
+        ray_t* col = ray_table_get_col_idx(tbl, c);
+        if (!col) {
+            ray_release(out);
+            return ray_error("domain", "table_rename_cols: missing column");
+        }
+        ray_t* next = ray_table_add_col(out, target_rel->col_names[c], col);
+        if (!next) {
+            ray_release(out);
+            return ray_error("memory", "table_rename_cols: add_col");
+        }
+        if (RAY_IS_ERR(next)) {
+            /* Propagate the original diagnostic to the caller. */
+            ray_release(out);
+            return next;
+        }
+        out = next;
+    }
+    return out;
+}
+
+/* Canonicalize column names to "c0","c1",...
+ *
+ * Returns `tbl` itself (not retained) when it is NULL or a RAY_ERROR.
+ * Otherwise returns a new owned table that shares `tbl`'s column data, or a
+ * RAY_ERROR when the table cannot be built.  A missing column is reported as
+ * an error instead of being skipped: skipping would shift later columns to
+ * different "cN" names than the ones callers scan for. */
+static ray_t* canonicalize(ray_t* tbl) {
+    if (!tbl || RAY_IS_ERR(tbl)) return tbl;
+    int64_t nc = ray_table_ncols(tbl);
+    ray_t* out = ray_table_new((int)nc);
+    if (!out || RAY_IS_ERR(out))
+        return out ? out : ray_error("memory", "canonicalize: table_new");
+    for (int64_t c = 0; c < nc; c++) {
+        ray_t* col = ray_table_get_col_idx(tbl, c);
+        if (!col) {
+            ray_release(out);
+            return ray_error("domain", "canonicalize: missing column");
+        }
+        char buf[16];
+        snprintf(buf, sizeof(buf), "c%d", (int)c);
+        int64_t sym = ray_sym_intern(buf, strlen(buf));
+        ray_t* next = ray_table_add_col(out, sym, col);
+        if (!next) {
+            ray_release(out);
+            return ray_error("memory", "canonicalize: add_col");
+        }
+        if (RAY_IS_ERR(next)) {
+            ray_release(out);
+            return next;
+        }
+        out = next;
+    }
+    return out;
+}
+
+/* Restore original column names from `src` onto `tbl`.
+ *
+ * Consumes `tbl`: on every path past the initial guard the caller's
+ * reference to `tbl` is released.  A NULL or RAY_ERROR `tbl` is returned
+ * unchanged.  Otherwise returns a new owned table that shares `tbl`'s column
+ * data and carries the name of `src`'s column at the same index, or a
+ * RAY_ERROR when that table cannot be built. */
+static ray_t* restore_names(ray_t* tbl, ray_t* src) {
+    if (!tbl || RAY_IS_ERR(tbl)) return tbl;
+    int64_t nc = ray_table_ncols(tbl);
+    ray_t* out = ray_table_new((int)nc);
+    if (!out || RAY_IS_ERR(out)) {
+        ray_release(tbl);
+        return out ? out : ray_error("memory", "restore_names: table_new");
+    }
+    for (int64_t c = 0; c < nc; c++) {
+        ray_t* col = ray_table_get_col_idx(tbl, c);
+        if (!col) {
+            /* Dropping the column would silently narrow the schema. */
+            ray_release(out);
+            ray_release(tbl);
+            return ray_error("domain", "restore_names: missing column");
+        }
+        ray_t* next = ray_table_add_col(out, ray_table_col_name(src, c), col);
+        if (!next || RAY_IS_ERR(next)) {
+            ray_release(out);
+            ray_release(tbl);
+            return next ? next : ray_error("memory", "restore_names: add_col");
+        }
+        out = next;
+    }
+    ray_release(tbl);
+    return out;
+}
+
+/* Row-wise union of two tables that share a schema.  Each result column is
+ * ray_vec_concat(a's column, b's column) and is named after a's column.
+ * The result is always owned by the caller (or is NULL when both inputs
+ * are NULL). */
+static ray_t* table_union(ray_t* a, ray_t* b) {
+    /* Operand triage.  Every pass-through result is retained so callers can
+     * release unconditionally (ray_retain is a no-op on errors).  A NULL
+     * side yields the other side; an error on either side wins over a valid
+     * table so a real failure is never hidden behind the healthy operand. */
+    if (!a) {
+        if (b) ray_retain(b);
+        return b;
+    }
+    if (RAY_IS_ERR(a) || !b) {
+        ray_retain(a);
+        return a;
+    }
+    if (RAY_IS_ERR(b)) {
+        ray_retain(b);
+        return b;
+    }
+
+    /* Compare widths before looking at row counts: an empty operand with a
+     * stripped schema (e.g. 0 rows, 0 cols) must not be allowed to
+     * short-circuit to the other side and change the caller's arity. */
+    int64_t width = ray_table_ncols(a);
+    if (width != ray_table_ncols(b))
+        return ray_error("schema", "table_union: column count mismatch");
+
+    /* An empty side contributes nothing: hand back the other one. */
+    ray_t* passthrough = NULL;
+    if (ray_table_nrows(a) == 0)
+        passthrough = b;
+    else if (ray_table_nrows(b) == 0)
+        passthrough = a;
+    if (passthrough) {
+        ray_retain(passthrough);
+        return passthrough;
+    }
+
+    ray_t* result = ray_table_new((int)width);
+    if (!result)
+        return ray_error("memory", "table_union: table_new");
+    if (RAY_IS_ERR(result))
+        return result;
+
+    for (int64_t idx = 0; idx < width; idx++) {
+        ray_t* lhs = ray_table_get_col_idx(a, idx);
+        ray_t* rhs = ray_table_get_col_idx(b, idx);
+        if (!lhs || !rhs) {
+            /* A dropped column would look like a successful but
+             * schema-incomplete union. */
+            ray_release(result);
+            return ray_error("domain", "table_union: missing column");
+        }
+
+        ray_t* joined = ray_vec_concat(lhs, rhs);
+        if (!joined || RAY_IS_ERR(joined)) {
+            /* Keep the concat's own error (e.g. "type") when there is one;
+             * only a NULL result is reported as "memory". */
+            ray_release(result);
+            return joined ? joined : ray_error("memory", "table_union: concat");
+        }
+
+        ray_t* grown = ray_table_add_col(result, ray_table_col_name(a, idx), joined);
+        ray_release(joined);
+        if (!grown || RAY_IS_ERR(grown)) {
+            ray_release(result);
+            return grown ? grown : ray_error("memory", "table_union: add_col");
+        }
+        result = grown;
+    }
+    return result;
+}
+
+/* Deduplicate table rows on all columns.
+ *
+ * Returns `tbl` itself (not retained) when it is NULL or a RAY_ERROR, and
+ * `tbl` retained when it has at most one row or no columns.  Otherwise the
+ * table is canonicalized to "c0","c1",... names, run through a
+ * ray_distinct plan keyed on every column, and the original column names
+ * are restored on the result.  Returns a RAY_ERROR when the table is wider
+ * than DL_MAX_ARITY or the plan cannot be built. */
+static ray_t* table_distinct(ray_t* tbl) {
+    if (!tbl || RAY_IS_ERR(tbl)) return tbl;
+    int64_t nrows = ray_table_nrows(tbl);
+    if (nrows <= 1) { ray_retain(tbl); return tbl; }
+
+    int64_t ncols = ray_table_ncols(tbl);
+    if (ncols <= 0) { ray_retain(tbl); return tbl; }
+    /* keys[] has DL_MAX_ARITY entries but ray_distinct is told `ncols` keys;
+     * a wider table would make it read uninitialized entries. */
+    if (ncols > DL_MAX_ARITY)
+        return ray_error("domain", "table_distinct: too many columns");
+
+    ray_t* canonical = canonicalize(tbl);
+    if (!canonical || RAY_IS_ERR(canonical))
+        return canonical ? canonical : ray_error("memory", "table_distinct: canonicalize");
+
+    ray_graph_t* g = ray_graph_new(canonical);
+    if (!g) {
+        ray_release(canonical);
+        return ray_error("memory", "table_distinct: graph_new");
+    }
+
+    /* One scan per canonical column; remember if any node failed to build. */
+    ray_op_t* keys[DL_MAX_ARITY];
+    bool plan_ok = true;
+    for (int64_t c = 0; c < ncols; c++) {
+        char buf[16];
+        snprintf(buf, sizeof(buf), "c%d", (int)c);
+        keys[c] = ray_scan(g, buf);
+        if (!keys[c]) plan_ok = false;
+    }
+
+    ray_op_t* dist = plan_ok ? ray_distinct(g, keys, (uint8_t)ncols) : NULL;
+    ray_t* deduped;
+    if (dist) {
+        ray_optimize(g, dist);
+        deduped = ray_execute(g, dist);
+    } else {
+        /* Never optimize or execute a plan with a missing node. */
+        deduped = ray_error("memory", "table_distinct: build plan");
+    }
+    ray_graph_free(g);
+    ray_release(canonical);
+
+    /* restore_names passes NULL / error results through untouched. */
+    return restore_names(deduped, tbl);
+}
+
+/* Anti-join: rows in `left` that don't appear in `right` (same schema).
+ *
+ * Returns `left` itself (not retained) when it is NULL or a RAY_ERROR.
+ * Returns `left` retained when there is nothing to subtract: `right` is
+ * NULL, an error, or empty, or `left` has no rows or no columns.
+ * Otherwise both sides are canonicalized to "c0","c1",... names, joined
+ * with ray_antijoin on every column, and left's original column names are
+ * restored on the result.  Returns a RAY_ERROR when `left` is wider than
+ * DL_MAX_ARITY or the plan cannot be built. */
+static ray_t* table_antijoin(ray_t* left, ray_t* right) {
+    if (!left || RAY_IS_ERR(left)) return left;
+    if (!right || RAY_IS_ERR(right) || ray_table_nrows(right) == 0) {
+        ray_retain(left);
+        return left;
+    }
+    if (ray_table_nrows(left) == 0) {
+        ray_retain(left);
+        return left;
+    }
+
+    int64_t ncols = ray_table_ncols(left);
+    if (ncols <= 0) { ray_retain(left); return left; }
+    /* lkeys[]/rkeys[] have DL_MAX_ARITY entries but ray_antijoin is told
+     * `ncols` keys; a wider table would make it read uninitialized entries. */
+    if (ncols > DL_MAX_ARITY)
+        return ray_error("domain", "table_antijoin: too many columns");
+
+    ray_t* cl = canonicalize(left);
+    if (!cl || RAY_IS_ERR(cl))
+        return cl ? cl : ray_error("memory", "table_antijoin: canonicalize left");
+    ray_t* cr = canonicalize(right);
+    if (!cr || RAY_IS_ERR(cr)) {
+        ray_release(cl);
+        return cr ? cr : ray_error("memory", "table_antijoin: canonicalize right");
+    }
+
+    ray_graph_t* g = ray_graph_new(NULL);
+    if (!g) {
+        ray_release(cl);
+        ray_release(cr);
+        return ray_error("memory", "table_antijoin: graph_new");
+    }
+
+    ray_op_t* l = ray_const_table(g, cl);
+    ray_op_t* r = ray_const_table(g, cr);
+    bool plan_ok = (l != NULL && r != NULL);
+
+    uint16_t l_tid = ray_graph_add_table(g, cl);
+    uint16_t r_tid = ray_graph_add_table(g, cr);
+
+    /* One key scan per canonical column on each side. */
+    ray_op_t* lkeys[DL_MAX_ARITY];
+    ray_op_t* rkeys[DL_MAX_ARITY];
+    for (int64_t c = 0; c < ncols; c++) {
+        char buf[16];
+        snprintf(buf, sizeof(buf), "c%d", (int)c);
+        lkeys[c] = ray_scan_table(g, l_tid, buf);
+        rkeys[c] = ray_scan_table(g, r_tid, buf);
+        if (!lkeys[c] || !rkeys[c]) plan_ok = false;
+    }
+
+    ray_op_t* aj = plan_ok ? ray_antijoin(g, l, lkeys, r, rkeys, (uint8_t)ncols) : NULL;
+    /* Never execute a plan with a missing node; report it instead. */
+    ray_t* raw = aj ? ray_execute(g, aj)
+                    : ray_error("memory", "table_antijoin: build plan");
+    ray_graph_free(g);
+    ray_release(cl);
+    ray_release(cr);
+
+    /* restore_names passes NULL / error results through untouched. */
+    return restore_names(raw, left);
+}
+
+/* Normalize column names of a table to match the target relation's naming scheme.
+ * Returns a new owned table with correct names (shares column data), or
+ * `tbl` retained when the arity differs or the names already match; a NULL
+ * or RAY_ERROR `tbl` is returned as-is.
+ * Currently unused but retained for future use by external callers.
+ *
+ * This is the same operation as table_rename_cols, so it delegates to it
+ * rather than keeping a second copy of the rebuild loop that could drift. */
+static ray_t* normalize_columns(ray_t* tbl, dl_rel_t* rel)
+    __attribute__((unused));
+static ray_t* normalize_columns(ray_t* tbl, dl_rel_t* rel) {
+    return table_rename_cols(tbl, rel);
+}
+
+/* ========================================================================
+ * Provenance helpers
+ * ======================================================================== */
+
+/* Fold row `r` of every column in `col_data` into one hash: the hash of the
+ * first column's cell seeds the value and each further column is mixed in
+ * with ray_hash_combine, in column order.  Column 0 is read unconditionally. */
+static uint64_t dl_row_hash(int64_t** col_data, int64_t ncols, int64_t r) {
+    uint64_t acc = ray_hash_i64(col_data[0][r]);
+    int64_t col = 1;
+    while (col < ncols) {
+        acc = ray_hash_combine(acc, ray_hash_i64(col_data[col][r]));
+        col++;
+    }
+    return acc;
+}
+
+/* True when row `ar` of `a_cols` and row `br` of `b_cols` hold equal values
+ * in each of the first `ncols` columns (vacuously true for ncols <= 0). */
+static bool dl_row_eq(int64_t** a_cols, int64_t ar,
+                     int64_t** b_cols, int64_t br, int64_t ncols) {
+    int64_t col = 0;
+    /* Advance while cells agree; stopping early means a mismatch. */
+    while (col < ncols && a_cols[col][ar] == b_cols[col][br])
+        col++;
+    return col >= ncols;
+}
+
+/* Open-addressing hash set keyed by ref-row tuple.  Slot stores ref row
+ * index; lookup hashes the probe-row tuple and walks the probe chain.
+ * Replaces the per-call O(ref_rows) linear scan in dl_row_in_table —
+ * the previous shape was O(tbl_rows × ref_rows × ncols) which is
+ * quadratic for typical datalog provenance workloads.
+ *
+ * Lifecycle: fill with dl_rowset_init, query with dl_rowset_contains,
+ * release with dl_rowset_destroy.  The set stores only row indices and
+ * raw column pointers into the ref table; it does not retain that table. */
+typedef struct {
+    int64_t* slots;        /* row index, -1 = empty */
+    ray_t*   block;        /* allocation whose data area backs `slots` */
+    int64_t  cap;          /* slot count; a power of two, at least 16 */
+    int64_t  mask;         /* cap - 1, ANDed with a hash or index to wrap */
+    int64_t** ref_cols;    /* cached column data ptrs for ref */
+    int64_t  ncols;        /* column count of ref, compared on every probe */
+} dl_rowset_t;
+
+/* Build a hash set over every row of table `ref`.
+ *
+ * Caches ref's column data pointers, sizes the slot array to a power of two
+ * that is at least 16 and at least twice the row count, and inserts each
+ * row index with linear probing.
+ *
+ * Returns true on success; the set must then be released with
+ * dl_rowset_destroy.  Returns false when an allocation fails or when `ref`
+ * has rows that cannot be hashed (no columns, or a column that cannot be
+ * fetched).  On false nothing is left allocated and `rs` holds NULL
+ * pointers, so calling dl_rowset_destroy afterwards is harmless. */
+static bool dl_rowset_init(dl_rowset_t* rs, ray_t* ref) {
+    int64_t ncols = ray_table_ncols(ref);
+    int64_t nrows = ray_table_nrows(ref);
+    /* Give every field a defined value before the first failure exit. */
+    rs->slots = NULL;
+    rs->block = NULL;
+    rs->cap = 0;
+    rs->mask = 0;
+    rs->ref_cols = NULL;
+    rs->ncols = ncols;
+    /* dl_row_hash reads column 0 unconditionally, so rows without any
+     * column cannot be inserted. */
+    if (ncols <= 0 && nrows > 0) return false;
+    rs->ref_cols = (int64_t**)ray_sys_alloc(sizeof(int64_t*) * (size_t)ncols);
+    if (!rs->ref_cols) return false;
+    for (int64_t c = 0; c < ncols; c++) {
+        ray_t* col = ray_table_get_col_idx(ref, c);
+        if (!col && nrows > 0) {
+            /* The insert loop below would dereference this NULL column. */
+            ray_sys_free(rs->ref_cols);
+            rs->ref_cols = NULL;
+            return false;
+        }
+        rs->ref_cols[c] = col ? (int64_t*)ray_data(col) : NULL;
+    }
+    int64_t cap = 16;
+    while (cap < (nrows > 0 ? nrows * 2 : 16)) cap *= 2;
+    rs->block = ray_alloc((size_t)cap * sizeof(int64_t));
+    if (!rs->block || RAY_IS_ERR(rs->block)) {
+        rs->block = NULL;
+        ray_sys_free(rs->ref_cols);
+        rs->ref_cols = NULL;
+        return false;
+    }
+    rs->slots = (int64_t*)ray_data(rs->block);
+    rs->cap = cap;
+    rs->mask = cap - 1;
+    for (int64_t i = 0; i < cap; i++) rs->slots[i] = -1;
+
+    /* Insert every row index.  cap >= 2 * nrows keeps at least half the
+     * slots empty, so each probe terminates. */
+    for (int64_t r = 0; r < nrows; r++) {
+        uint64_t h = dl_row_hash(rs->ref_cols, ncols, r);
+        int64_t s = (int64_t)(h & (uint64_t)rs->mask);
+        while (rs->slots[s] != -1) s = (s + 1) & rs->mask;
+        rs->slots[s] = r;
+    }
+    return true;
+}
+
+/* Release the slot block and the cached column-pointer array of `rs`.
+ * Each pointer is reset to NULL after it is freed, so a second call on the
+ * same set does nothing. */
+static void dl_rowset_destroy(dl_rowset_t* rs) {
+    if (rs->block != NULL) {
+        ray_release(rs->block);
+        rs->block = NULL;
+    }
+    if (rs->ref_cols != NULL) {
+        ray_sys_free(rs->ref_cols);
+        rs->ref_cols = NULL;
+    }
+}
+
+/* Membership test: is the tuple at row `row` of `tbl_cols` one of the rows
+ * stored in `rs`?  Starts at the tuple's home slot and probes linearly
+ * until a stored row compares equal (true) or an empty slot ends the
+ * chain (false). */
+static bool dl_rowset_contains(dl_rowset_t* rs, int64_t** tbl_cols, int64_t row) {
+    const uint64_t hash = dl_row_hash(tbl_cols, rs->ncols, row);
+    for (int64_t pos = (int64_t)(hash & (uint64_t)rs->mask);
+         rs->slots[pos] != -1;
+         pos = (pos + 1) & rs->mask) {
+        int64_t stored_row = rs->slots[pos];
+        if (dl_row_eq(tbl_cols, row, rs->ref_cols, stored_row, rs->ncols))
+            return true;
+    }
+    return false;
+}
+
+/* Build source provenance for one IDB relation in CSR format.
+ *
+ * For each derived row, extracts head variable bindings from the firing rule
+ * and scans each positive body atom's relation for rows consistent with those
+ * bindings.  Results are stored as two parallel vectors on the relation:
+ *
+ *   prov_src_offsets — I64[nrows+1]: offsets[i] = start index in prov_src_data
+ *                      for derived row i.  offsets[nrows] = total entry count.
+ *   prov_src_data    — I64[total]: each entry = (rel_idx << 32) | row_idx,
+ *                      packed reference to the contributing source row.
+ *                      Row indices are truncated to 32 bits (max ~4 billion rows
+ *                      per relation).
+ *
+ * Body-only variables (not appearing in the head) are unconstrained during
+ * source lookup, so the entry set may be a superset of the true proof.
+ *
+ * Parameters:
+ *   prog  — program whose rules and relations are consulted.
+ *   rel   — relation being explained; rel->table is read, and on success
+ *           rel->prov_src_offsets / rel->prov_src_data are replaced.
+ *   nrows — number of derived rows to process; pd is indexed 0..nrows-1.
+ *   pd    — per-row index into prog->rules of the firing rule; a negative
+ *           entry means no rule is known and the row gets an empty range.
+ *
+ * Failure handling: if the offsets vector or the initial scratch buffer
+ * cannot be allocated the function returns with rel's provenance fields
+ * untouched.  A later allocation failure jumps to `oom`, which releases any
+ * previously stored vectors and leaves both fields NULL. */
+static void dl_build_source_prov(dl_program_t* prog, dl_rel_t* rel,
+                                  int64_t nrows, int64_t* pd) {
+    ray_t* off_vec = ray_vec_new(RAY_I64, nrows + 1);
+    if (!off_vec || RAY_IS_ERR(off_vec)) return;
+    off_vec->len = nrows + 1;
+    int64_t* off = (int64_t*)ray_data(off_vec);
+
+    /* Growable scratch buffer for packed entries; doubled when full. */
+    int64_t buf_cap = (nrows < 16) ? 64 : nrows * 4;
+    ray_t* buf_block = ray_alloc((size_t)buf_cap * sizeof(int64_t));
+    if (!buf_block) { ray_release(off_vec); return; }
+    int64_t* buf = (int64_t*)ray_data(buf_block);
+    int64_t buf_len = 0;
+
+    for (int64_t row = 0; row < nrows; row++) {
+        /* Record the start offset first so skipped rows get an empty range. */
+        off[row] = buf_len;
+        if (pd[row] < 0) continue;
+
+        dl_rule_t* rule = &prog->rules[pd[row]];
+
+        /* Bindings indexed by variable id.
+         * NOTE(review): assumes every variable id is below
+         * DL_MAX_ARITY * DL_MAX_BODY — confirm against the rule parser. */
+        int64_t var_vals[DL_MAX_ARITY * DL_MAX_BODY];
+        bool    var_set [DL_MAX_ARITY * DL_MAX_BODY];
+        memset(var_set, 0, sizeof(var_set));
+
+        /* Extract head variable bindings from this derived row */
+        for (int h = 0; h < rule->head_arity; h++) {
+            int v = rule->head_vars[h];
+            if (v == DL_CONST) continue;
+            ray_t* col = ray_table_get_col_idx(rel->table, h);
+            if (!col) continue;
+            /* NOTE(review): the cell is read as a raw int64 — presumably
+             * every relation column is 8 bytes wide; confirm for other
+             * column types. */
+            var_vals[v] = ((int64_t*)ray_data(col))[row];
+            var_set[v]  = true;
+        }
+
+        /* For each positive body atom, find matching source rows */
+        for (int b = 0; b < rule->n_body; b++) {
+            dl_body_t* body = &rule->body[b];
+            if (body->type != DL_POS) continue;
+
+            int bri = dl_find_rel(prog, body->pred);
+            if (bri < 0) continue;
+            dl_rel_t* brel   = &prog->rels[bri];
+            int64_t   bnrows = ray_table_nrows(brel->table);
+
+            /* Linear scan of the body relation: a row matches when every
+             * constant position and every head-bound variable agrees. */
+            for (int64_t br = 0; br < bnrows; br++) {
+                bool match = true;
+                for (int c = 0; c < body->arity; c++) {
+                    ray_t* bcol = ray_table_get_col_idx(brel->table, c);
+                    if (!bcol) { match = false; break; }
+                    int64_t cell = ((int64_t*)ray_data(bcol))[br];
+                    int     v    = body->vars[c];
+                    if (v == DL_CONST) {
+                        if (cell != body->const_vals[c]) { match = false; break; }
+                    } else if (var_set[v]) {
+                        if (cell != var_vals[v])         { match = false; break; }
+                    }
+                    /* body-only variable: unconstrained, always matches */
+                }
+                if (!match) continue;
+
+                /* Grow the scratch buffer by doubling before appending. */
+                if (buf_len >= buf_cap) {
+                    int64_t   new_cap   = buf_cap * 2;
+                    ray_t*    new_block = ray_alloc((size_t)new_cap * sizeof(int64_t));
+                    if (!new_block) goto oom;
+                    memcpy(ray_data(new_block), buf, (size_t)buf_len * sizeof(int64_t));
+                    ray_free(buf_block);
+                    buf_block = new_block;
+                    buf       = (int64_t*)ray_data(new_block);
+                    buf_cap   = new_cap;
+                }
+                /* Pack (relation index, low 32 bits of the row index). */
+                buf[buf_len++] = ((int64_t)bri << 32) | (int64_t)(uint32_t)br;
+            }
+        }
+    }
+
+    /* Success path: finalize CSR */
+    off[nrows] = buf_len;
+    {
+        /* Request at least one element even when there are no entries;
+         * len is then set to the true count. */
+        ray_t* data_vec = ray_vec_new(RAY_I64, buf_len > 0 ? buf_len : 1);
+        if (!data_vec || RAY_IS_ERR(data_vec)) goto oom;
+        data_vec->len = buf_len;
+        if (buf_len > 0)
+            memcpy(ray_data(data_vec), buf, (size_t)buf_len * sizeof(int64_t));
+        ray_free(buf_block);
+
+        /* Swap in the new vectors, dropping any earlier provenance. */
+        if (rel->prov_src_offsets) ray_release(rel->prov_src_offsets);
+        if (rel->prov_src_data)    ray_release(rel->prov_src_data);
+        rel->prov_src_offsets = off_vec;
+        rel->prov_src_data    = data_vec;
+        return;
+    }
+
+oom:
+    /* Allocation failed — discard partial results, leave both fields NULL */
+    ray_free(buf_block);
+    ray_release(off_vec);
+    if (rel->prov_src_offsets) { ray_release(rel->prov_src_offsets); rel->prov_src_offsets = NULL; }
+    if (rel->prov_src_data)    { ray_release(rel->prov_src_data);    rel->prov_src_data    = NULL; }
+}
+
+/* Build provenance for all IDB relations.
+ *
+ * For every IDB relation with at least one row:
+ *   1. allocate an i64 column with one slot per row, initialised to -1
+ *      ("unknown rule");
+ *   2. for each rule whose head predicate is this relation, compile and run
+ *      the rule against the final tables, and stamp the rule index onto every
+ *      still-unmarked row that appears in the rule's output (first matching
+ *      rule wins);
+ *   3. install the column as rel->prov_col and hand it to
+ *      dl_build_source_prov for the deep (CSR) source provenance.
+ *
+ * This pass is best-effort: a rule that fails to compile or execute is
+ * skipped and does not touch prog->eval_err. */
+static void dl_build_provenance(dl_program_t* prog) {
+    for (int ri = 0; ri < prog->n_rels; ri++) {
+        dl_rel_t* rel = &prog->rels[ri];
+        if (!rel->is_idb) continue;
+
+        int64_t nrows = ray_table_nrows(rel->table);
+        if (nrows == 0) continue;
+
+        /* Allocate provenance column initialized to -1 (unknown) */
+        ray_t* prov = ray_vec_new(RAY_I64, nrows);
+        if (!prov || RAY_IS_ERR(prov)) continue;
+        prov->len = nrows;
+        int64_t* pd = (int64_t*)ray_data(prov);
+        for (int64_t r = 0; r < nrows; r++)
+            pd[r] = -1;
+
+        /* For each rule with this head predicate */
+        for (int r = 0; r < prog->n_rules; r++) {
+            dl_rule_t* rule = &prog->rules[r];
+            if (strcmp(rule->head_pred, rel->name) != 0) continue;
+
+            /* Compile and execute the rule to get its derivable tuples */
+            ray_graph_t* g = ray_graph_new(NULL);
+            if (!g) continue;
+
+            ray_op_t* output = dl_compile_rule(prog, rule, -1, r, g);
+            if (!output) { ray_graph_free(g); continue; }
+
+            ray_t* raw = ray_execute(g, output);
+            ray_graph_free(g);
+            if (!raw) continue;
+            /* Free the error object the same way dl_eval does for the
+             * result of ray_execute, instead of dropping it on the floor. */
+            if (RAY_IS_ERR(raw)) { ray_error_free(raw); continue; }
+
+            ray_t* derived = table_rename_cols(raw, rel);
+            ray_release(raw);
+            if (!derived) continue;
+            if (RAY_IS_ERR(derived)) { ray_error_free(derived); continue; }
+
+            /* Mark rows in rel->table that appear in derived.  Build a
+             * hashset over `derived` once and probe per row of rel —
+             * was O(nrows × derived_rows × ncols), now O(nrows + derived_rows). */
+            dl_rowset_t rs;
+            if (dl_rowset_init(&rs, derived)) {
+                int64_t ncols_t = ray_table_ncols(rel->table);
+                int64_t** tbl_cols = (int64_t**)ray_sys_alloc(sizeof(int64_t*) * (size_t)ncols_t);
+                if (tbl_cols) {
+                    for (int64_t c = 0; c < ncols_t; c++) {
+                        ray_t* col = ray_table_get_col_idx(rel->table, c);
+                        tbl_cols[c] = col ? (int64_t*)ray_data(col) : NULL;
+                    }
+                    for (int64_t row = 0; row < nrows; row++) {
+                        /* Already attributed to an earlier rule. */
+                        if (pd[row] >= 0) continue;
+                        if (dl_rowset_contains(&rs, tbl_cols, row))
+                            pd[row] = r;
+                    }
+                    ray_sys_free(tbl_cols);
+                }
+                dl_rowset_destroy(&rs);
+            }
+            ray_release(derived);
+        }
+
+        /* Replace any provenance column left over from a previous run. */
+        if (rel->prov_col) ray_release(rel->prov_col);
+        rel->prov_col = prov;
+
+        dl_build_source_prov(prog, rel, nrows, pd);
+    }
+}
+
+/* ========================================================================
+ * Semi-naive fixpoint evaluation
+ * ======================================================================== */
+
+/* Evaluate the program to a fixpoint, stratum by stratum.
+ *
+ * For each stratum: Phase A runs every rule once against the full relations
+ * and merges the deduplicated output into the head relation; Phase B then
+ * re-runs rules with one IDB body atom at a time swapped for its delta table
+ * until every delta is empty (or max_iter passes have run).  If
+ * DL_FLAG_PROVENANCE is set, dl_build_provenance runs afterwards.
+ *
+ * Returns 0 on success; -1 when prog is NULL, eval_err was already set on
+ * entry, dl_stratify fails, or any step set prog->eval_err. */
+int dl_eval(dl_program_t* prog) {
+    if (!prog) return -1;
+
+    /* eval_err is sticky: it may have been raised at rule-add time (e.g.
+     * by a head-const type conflict in dl_idb_align_head_const_types) —
+     * resetting here would silently discard that signal.  Additional
+     * failures during stratify/compile/exec below keep setting the flag,
+     * and the final return honors it either way. */
+    if (prog->eval_err) {
+        /* Short-circuit: compile-time errors already stand; don't run
+         * a potentially broken fixpoint. */
+        return -1;
+    }
+
+    /* Stratify if not already done */
+    if (prog->n_strata == 0) {
+        if (dl_stratify(prog) != 0) return -1;
+    }
+
+    /* Process each stratum */
+    for (int s = 0; s < prog->n_strata; s++) {
+        /* Collect rules in this stratum */
+        dl_rule_t* stratum_rules[DL_MAX_RULES];
+        int stratum_rule_idx[DL_MAX_RULES];  /* original index in prog->rules */
+        int n_stratum_rules = 0;
+
+        for (int r = 0; r < prog->n_rules; r++) {
+            if (prog->rules[r].stratum == s) {
+                stratum_rule_idx[n_stratum_rules] = r;
+                stratum_rules[n_stratum_rules++] = &prog->rules[r];
+            }
+        }
+        if (n_stratum_rules == 0) continue;
+
+        /* Phase A: Initial evaluation — evaluate each rule with full relations */
+        /* Group rules by head predicate */
+        for (int ri = 0; ri < n_stratum_rules; ri++) {
+            dl_rule_t* rule = stratum_rules[ri];
+            int head_idx = dl_find_rel(prog, rule->head_pred);
+            if (head_idx < 0) continue;
+            dl_rel_t* head_rel = &prog->rels[head_idx];
+
+            ray_graph_t* g = ray_graph_new(NULL);
+            if (!g) { prog->eval_err = true; continue; }
+
+            ray_op_t* output = dl_compile_rule(prog, rule, -1, stratum_rule_idx[ri], g);
+            if (!output) {
+                /* dl_compile_rule marks eval_err on genuine failures; a bare
+                 * NULL means "rule has no rows this pass" — not a fault. */
+                ray_graph_free(g);
+                continue;
+            }
+
+            ray_t* raw_tuples = ray_execute(g, output);
+            ray_graph_free(g);
+
+            if (!raw_tuples) continue;
+            if (RAY_IS_ERR(raw_tuples)) { prog->eval_err = true; ray_error_free(raw_tuples); continue; }
+
+            /* Rename columns to match head relation's expected names */
+            ray_t* new_tuples = table_rename_cols(raw_tuples, head_rel);
+            ray_release(raw_tuples);
+            if (!new_tuples) continue;
+            if (RAY_IS_ERR(new_tuples)) { prog->eval_err = true; ray_error_free(new_tuples); continue; }
+
+            /* Merge into the head relation's table */
+            ray_t* merged = table_union(head_rel->table, new_tuples);
+            ray_release(new_tuples);
+            if (!merged) { prog->eval_err = true; continue; }
+            if (RAY_IS_ERR(merged)) { prog->eval_err = true; ray_error_free(merged); continue; }
+            ray_t* deduped = table_distinct(merged);
+            ray_release(merged);
+            if (!deduped) { prog->eval_err = true; continue; }
+            if (RAY_IS_ERR(deduped)) { prog->eval_err = true; ray_error_free(deduped); continue; }
+            /* Swap in the merged, deduplicated table only once every step succeeded. */
+            ray_release(head_rel->table);
+            head_rel->table = deduped;
+        }
+
+        /* Phase B: Semi-naive loop — iterate with delta relations */
+        /* For each IDB predicate in this stratum, compute delta as the
+         * difference between current and previous table states. */
+        ray_t* prev_tables[DL_MAX_RELS];
+        ray_t* delta_tables[DL_MAX_RELS];
+        memset(prev_tables, 0, sizeof(prev_tables));
+        memset(delta_tables, 0, sizeof(delta_tables));
+
+        /* Initially, delta = full table (all tuples are new) */
+        for (int p = 0; p < prog->strata_sizes[s]; p++) {
+            int rel_idx = prog->strata[s][p];
+            dl_rel_t* rel = &prog->rels[rel_idx];
+            if (rel->is_idb) {
+                ray_retain(rel->table);
+                delta_tables[rel_idx] = rel->table;
+                /* prev = empty table with same schema as the relation.
+                 * Column types must match rel->table so later ray_vec_concat
+                 * calls don't reject the merge when the relation has
+                 * non-i64 columns (e.g. RAY_SYM from head-constant slots). */
+                prev_tables[rel_idx] = ray_table_new(rel->arity);
+                for (int c = 0; c < rel->arity && c < DL_MAX_ARITY; c++) {
+                    ray_t* src = ray_table_get_col_idx(rel->table, c);
+                    int8_t ctype = src ? src->type : RAY_I64;
+                    ray_t* empty_col = ray_vec_new(ctype, 0);
+                    if (empty_col && !RAY_IS_ERR(empty_col)) {
+                        prev_tables[rel_idx] = ray_table_add_col(
+                            prev_tables[rel_idx], rel->col_names[c], empty_col);
+                        ray_release(empty_col);
+                    }
+                }
+            }
+        }
+
+        /* Semi-naive iteration */
+        /* NOTE(review): reaching max_iter ends the loop without setting
+         * eval_err, so a non-converged stratum is not reported here. */
+        int max_iter = 1000;
+        for (int iter = 0; iter < max_iter; iter++) {
+            /* Check convergence: all deltas empty */
+            bool any_new = false;
+            for (int p = 0; p < prog->strata_sizes[s]; p++) {
+                int rel_idx = prog->strata[s][p];
+                if (delta_tables[rel_idx] &&
+                    !RAY_IS_ERR(delta_tables[rel_idx]) &&
+                    ray_table_nrows(delta_tables[rel_idx]) > 0) {
+                    any_new = true;
+                    break;
+                }
+            }
+            if (!any_new) break;
+
+            /* For each rule, for each positive body position that uses a
+             * delta relation, compile and execute */
+            ray_t* new_tuples_per_rel[DL_MAX_RELS];
+            memset(new_tuples_per_rel, 0, sizeof(new_tuples_per_rel));
+
+            for (int ri = 0; ri < n_stratum_rules; ri++) {
+                dl_rule_t* rule = stratum_rules[ri];
+                int head_idx = dl_find_rel(prog, rule->head_pred);
+                if (head_idx < 0) continue;
+
+                for (int b = 0; b < rule->n_body; b++) {
+                    dl_body_t* body = &rule->body[b];
+                    if (body->type != DL_POS) continue;
+
+                    int body_rel = dl_find_rel(prog, body->pred);
+                    if (body_rel < 0) continue;
+                    if (!prog->rels[body_rel].is_idb) continue;
+                    if (!delta_tables[body_rel] ||
+                        ray_table_nrows(delta_tables[body_rel]) == 0) continue;
+
+                    /* Swap in delta relation for this body position */
+                    /* Every exit path below must restore `saved` before continuing. */
+                    ray_t* saved = prog->rels[body_rel].table;
+                    prog->rels[body_rel].table = delta_tables[body_rel];
+
+                    ray_graph_t* g = ray_graph_new(NULL);
+                    if (!g) {
+                        prog->rels[body_rel].table = saved;
+                        prog->eval_err = true;
+                        continue;
+                    }
+
+                    ray_op_t* output = dl_compile_rule(prog, rule, b, stratum_rule_idx[ri], g);
+                    if (!output) {
+                        ray_graph_free(g);
+                        prog->rels[body_rel].table = saved;
+                        /* dl_compile_rule sets eval_err itself on genuine
+                         * failures; NULL without the flag means "rule yields
+                         * no rows this iteration" and should not fault. */
+                        continue;
+                    }
+
+                    ray_t* raw_result = ray_execute(g, output);
+                    ray_graph_free(g);
+                    prog->rels[body_rel].table = saved;
+
+                    if (!raw_result) continue;
+                    if (RAY_IS_ERR(raw_result)) { prog->eval_err = true; ray_error_free(raw_result); continue; }
+
+                    /* Rename columns to match head relation */
+                    dl_rel_t* head_rel2 = &prog->rels[head_idx];
+                    ray_t* result = table_rename_cols(raw_result, head_rel2);
+                    ray_release(raw_result);
+                    if (!result) continue;
+                    if (RAY_IS_ERR(result)) { prog->eval_err = true; ray_error_free(result); continue; }
+
+                    /* Accumulate new tuples for this head */
+                    if (new_tuples_per_rel[head_idx]) {
+                        ray_t* u = table_union(new_tuples_per_rel[head_idx], result);
+                        ray_release(new_tuples_per_rel[head_idx]);
+                        ray_release(result);
+                        if (!u) {
+                            prog->eval_err = true;
+                            new_tuples_per_rel[head_idx] = NULL;
+                            continue;
+                        }
+                        if (RAY_IS_ERR(u)) {
+                            prog->eval_err = true;
+                            ray_error_free(u);
+                            new_tuples_per_rel[head_idx] = NULL;
+                            continue;
+                        }
+                        new_tuples_per_rel[head_idx] = u;
+                    } else {
+                        new_tuples_per_rel[head_idx] = result;
+                    }
+                }
+            }
+
+            /* For each IDB: dedup new tuples, subtract existing, merge */
+            for (int p = 0; p < prog->strata_sizes[s]; p++) {
+                int rel_idx = prog->strata[s][p];
+                dl_rel_t* rel = &prog->rels[rel_idx];
+                if (!rel->is_idb) continue;
+
+                /* Free old delta */
+                if (delta_tables[rel_idx] && !RAY_IS_ERR(delta_tables[rel_idx]))
+                    ray_release(delta_tables[rel_idx]);
+                delta_tables[rel_idx] = NULL;
+
+                ray_t* new_tuples = new_tuples_per_rel[rel_idx];
+                if (!new_tuples) { delta_tables[rel_idx] = NULL; continue; }
+                if (RAY_IS_ERR(new_tuples)) {
+                    prog->eval_err = true;
+                    ray_error_free(new_tuples);
+                    delta_tables[rel_idx] = NULL;
+                    continue;
+                }
+
+                /* Deduplicate */
+                ray_t* deduped = table_distinct(new_tuples);
+                ray_release(new_tuples);
+                if (!deduped) { prog->eval_err = true; continue; }
+                if (RAY_IS_ERR(deduped)) { prog->eval_err = true; ray_error_free(deduped); continue; }
+
+                /* Subtract existing relation to get true delta */
+                ray_t* delta = table_antijoin(deduped, rel->table);
+                ray_release(deduped);
+                if (!delta) { prog->eval_err = true; continue; }
+                if (RAY_IS_ERR(delta)) { prog->eval_err = true; ray_error_free(delta); continue; }
+
+                delta_tables[rel_idx] = delta;
+
+                /* Merge delta into full relation.  A merge failure here
+                 * leaves delta_tables set but rel->table stale — that would
+                 * desync the fixpoint, so treat it as a hard failure. */
+                if (ray_table_nrows(delta) > 0) {
+                    ray_t* merged = table_union(rel->table, delta);
+                    if (!merged) { prog->eval_err = true; continue; }
+                    if (RAY_IS_ERR(merged)) {
+                        prog->eval_err = true;
+                        ray_error_free(merged);
+                        continue;
+                    }
+                    ray_release(rel->table);
+                    rel->table = merged;
+                }
+            }
+
+            /* Update prev tables */
+            /* NOTE(review): prev_tables is only assigned and released in this
+             * function; nothing here reads its contents back. */
+            for (int p = 0; p < prog->strata_sizes[s]; p++) {
+                int rel_idx = prog->strata[s][p];
+                if (prev_tables[rel_idx] && !RAY_IS_ERR(prev_tables[rel_idx]))
+                    ray_release(prev_tables[rel_idx]);
+                ray_retain(prog->rels[rel_idx].table);
+                prev_tables[rel_idx] = prog->rels[rel_idx].table;
+            }
+        }
+
+        /* Cleanup stratum temporaries */
+        for (int p = 0; p < prog->strata_sizes[s]; p++) {
+            int rel_idx = prog->strata[s][p];
+            if (prev_tables[rel_idx] && !RAY_IS_ERR(prev_tables[rel_idx]))
+                ray_release(prev_tables[rel_idx]);
+            if (delta_tables[rel_idx] && !RAY_IS_ERR(delta_tables[rel_idx]))
+                ray_release(delta_tables[rel_idx]);
+        }
+    }
+
+    /* Build provenance if requested */
+    if (prog->flags & DL_FLAG_PROVENANCE)
+        dl_build_provenance(prog);
+
+    /* Any compile-time or runtime error surfaced by a rule causes dl_eval
+     * to report failure, so callers (notably ray_query_fn) can turn this
+     * into a user-visible "evaluation failed" error instead of shipping a
+     * silently-incomplete result. */
+    return prog->eval_err ? -1 : 0;
+}
+
+/* ========================================================================
+ * Query — retrieve result after evaluation
+ * ======================================================================== */
+
+/* Look up the evaluated table for `pred_name`.
+ * Returns the relation's table pointer as stored in the program (no extra
+ * reference is taken here), or NULL when either argument is NULL or the
+ * predicate is not registered. */
+ray_t* dl_query(dl_program_t* prog, const char* pred_name) {
+    if (prog && pred_name) {
+        int rel_idx = dl_find_rel(prog, pred_name);
+        if (rel_idx >= 0)
+            return prog->rels[rel_idx].table;
+    }
+    return NULL;
+}
+
+/* Return the per-row rule-index provenance column of `pred_name`.
+ * Yields NULL when an argument is NULL, when the program was not built with
+ * DL_FLAG_PROVENANCE, or when the predicate is unknown. */
+ray_t* dl_get_provenance(dl_program_t* prog, const char* pred_name) {
+    if (!prog || !pred_name || !(prog->flags & DL_FLAG_PROVENANCE))
+        return NULL;
+
+    int rel_idx = dl_find_rel(prog, pred_name);
+    return (rel_idx < 0) ? NULL : prog->rels[rel_idx].prov_col;
+}
+
+/* Return the CSR offsets vector of the source provenance for `pred_name`.
+ * Yields NULL when an argument is NULL, when the program was not built with
+ * DL_FLAG_PROVENANCE, or when the predicate is unknown. */
+ray_t* dl_get_provenance_src_offsets(dl_program_t* prog, const char* pred_name) {
+    if (!prog || !pred_name || !(prog->flags & DL_FLAG_PROVENANCE))
+        return NULL;
+
+    int rel_idx = dl_find_rel(prog, pred_name);
+    return (rel_idx < 0) ? NULL : prog->rels[rel_idx].prov_src_offsets;
+}
+
+/* Return the packed source-reference vector of the source provenance for
+ * `pred_name`.  Yields NULL when an argument is NULL, when the program was
+ * not built with DL_FLAG_PROVENANCE, or when the predicate is unknown. */
+ray_t* dl_get_provenance_src_data(dl_program_t* prog, const char* pred_name) {
+    if (!prog || !pred_name || !(prog->flags & DL_FLAG_PROVENANCE))
+        return NULL;
+
+    int rel_idx = dl_find_rel(prog, pred_name);
+    return (rel_idx < 0) ? NULL : prog->rels[rel_idx].prov_src_data;
+}
+
+/* ── Builtins ── */
+
+/* ══════════════════════════════════════════
+ * EAV triple storage — datoms, assert-fact, scan-eav
+ * ══════════════════════════════════════════ */
+
+/* (datoms) — create empty EAV table with schema [e a v].
+ *
+ * Column layout:
+ *   e : RAY_I64
+ *   a : RAY_SYM
+ *   v : RAY_I64 (symbols stored as intern ID, integers as-is)
+ *
+ * Returns the new table, or an error object if the arity is wrong or any
+ * allocation step fails. */
+ray_t* ray_datoms_fn(ray_t** args, int64_t n) {
+    (void)args;
+    if (n != 0) return ray_error("arity", "datoms takes no arguments");
+
+    /* Intern the three column names up front, in e/a/v order. */
+    int64_t col_names[3];
+    col_names[0] = ray_sym_intern("e", 1);
+    col_names[1] = ray_sym_intern("a", 1);
+    col_names[2] = ray_sym_intern("v", 1);
+    const int8_t col_types[3] = { RAY_I64, RAY_SYM, RAY_I64 };
+
+    ray_t* table = ray_table_new(3);
+    if (RAY_IS_ERR(table)) return table;
+
+    for (int i = 0; i < 3; i++) {
+        ray_t* column = ray_vec_new(col_types[i], 0);
+        if (RAY_IS_ERR(column)) {
+            ray_release(table);
+            return column;
+        }
+        table = ray_table_add_col(table, col_names[i], column);
+        ray_release(column);
+        /* On failure `table` already holds the error object to hand back. */
+        if (RAY_IS_ERR(table)) return table;
+    }
+
+    return table;
+}
+
+/* (assert-fact db entity attr value) — append a triple to the datoms table.
+ *
+ * `db` must be a 3-column table, `entity` an i64 atom, `attr` a symbol atom
+ * and `value` either an i64 or a symbol atom (a symbol is stored by its
+ * intern ID).  Returns a new table carrying the extra row, or an error
+ * object. */
+ray_t* ray_assert_fact_fn(ray_t** args, int64_t n) {
+    if (n != 4) return ray_error("arity", "assert-fact expects 4 arguments: db entity attr value");
+
+    ray_t* db     = args[0];
+    ray_t* entity = args[1];
+    ray_t* attr   = args[2];
+    ray_t* value  = args[3];
+
+    if (db->type != RAY_TABLE || ray_table_ncols(db) != 3)
+        return ray_error("type", "assert-fact: first arg must be a datoms table");
+    if (entity->type != -RAY_I64)
+        return ray_error("type", "assert-fact: entity must be an integer");
+    if (attr->type != -RAY_SYM)
+        return ray_error("type", "assert-fact: attr must be a symbol");
+    /* Both accepted value kinds keep their payload in the i64 slot. */
+    if (value->type != -RAY_I64 && value->type != -RAY_SYM)
+        return ray_error("type", "assert-fact: value must be an integer or symbol");
+
+    /* One cell per column, in e/a/v order. */
+    int64_t cells[3];
+    cells[0] = entity->i64;
+    cells[1] = attr->i64;
+    cells[2] = value->i64;
+
+    ray_t* out = ray_table_new(3);
+    if (RAY_IS_ERR(out)) return out;
+
+    for (int64_t ci = 0; ci < 3; ci++) {
+        ray_t* source = ray_table_get_col_idx(db, ci);
+        int64_t name  = ray_table_col_name(db, ci);
+
+        /* Take our own reference, then append; the original comments state
+         * that ray_vec_append's copy-on-write step consumes this reference
+         * when it copies (or fails), so no explicit release of `source`
+         * follows. */
+        ray_retain(source);
+        ray_t* grown = ray_vec_append(source, &cells[ci]);
+        if (RAY_IS_ERR(grown)) {
+            ray_release(out);
+            return grown;
+        }
+
+        out = ray_table_add_col(out, name, grown);
+        ray_release(grown);
+        if (RAY_IS_ERR(out)) return out;
+    }
+
+    return out;
+}
+
+/* (retract-fact db entity attr value) — remove a triple from the datoms table.
+ *
+ * `db` must be a 3-column table, `entity` an i64 atom, `attr` a symbol atom
+ * and `value` either an i64 or a symbol atom (compared by intern ID).
+ * Returns a new table holding every row of `db` except those whose (e, a, v)
+ * equals the given triple, or an error object.
+ *
+ * The attribute column is decoded with ray_read_sym (honouring the column's
+ * type/attrs), the same way scan-eav and pull read it, rather than being
+ * reinterpreted as a raw int64_t array. */
+ray_t* ray_retract_fact_fn(ray_t** args, int64_t n) {
+    if (n != 4) return ray_error("arity", "retract-fact expects 4 arguments: db entity attr value");
+
+    ray_t* db     = args[0];
+    ray_t* entity = args[1];
+    ray_t* attr   = args[2];
+    ray_t* value  = args[3];
+
+    if (db->type != RAY_TABLE || ray_table_ncols(db) != 3)
+        return ray_error("type", "retract-fact: first arg must be a datoms table");
+    if (entity->type != -RAY_I64)
+        return ray_error("type", "retract-fact: entity must be an integer");
+    if (attr->type != -RAY_SYM)
+        return ray_error("type", "retract-fact: attr must be a symbol");
+    /* Integer and symbol values both live in the i64 payload. */
+    if (value->type != -RAY_I64 && value->type != -RAY_SYM)
+        return ray_error("type", "retract-fact: value must be an integer or symbol");
+
+    int64_t match_e = entity->i64;
+    int64_t match_a = attr->i64;
+    int64_t match_v = value->i64;
+
+    /* Get existing columns */
+    ray_t* e_col = ray_table_get_col_idx(db, 0);
+    ray_t* a_col = ray_table_get_col_idx(db, 1);
+    ray_t* v_col = ray_table_get_col_idx(db, 2);
+    int64_t nrows = ray_len(e_col);
+
+    const int64_t* e_data = (const int64_t*)ray_data(e_col);
+    const int64_t* v_data = (const int64_t*)ray_data(v_col);
+
+    /* Build new columns, skipping matching rows */
+    ray_t* new_e = ray_vec_new(RAY_I64, nrows);
+    if (RAY_IS_ERR(new_e)) return new_e;
+    ray_t* new_a = ray_vec_new(RAY_SYM, nrows);
+    if (RAY_IS_ERR(new_a)) { ray_release(new_e); return new_a; }
+    ray_t* new_v = ray_vec_new(RAY_I64, nrows);
+    if (RAY_IS_ERR(new_v)) { ray_release(new_e); ray_release(new_a); return new_v; }
+
+    for (int64_t r = 0; r < nrows; r++) {
+        /* Decode the symbol ID through the column's own encoding. */
+        int64_t a_val = ray_read_sym(ray_data(a_col), r, a_col->type, a_col->attrs);
+        if (e_data[r] == match_e && a_val == match_a && v_data[r] == match_v)
+            continue; /* skip this row */
+        new_e = ray_vec_append(new_e, &e_data[r]);
+        if (RAY_IS_ERR(new_e)) { ray_release(new_a); ray_release(new_v); return new_e; }
+        new_a = ray_vec_append(new_a, &a_val);
+        if (RAY_IS_ERR(new_a)) { ray_release(new_e); ray_release(new_v); return new_a; }
+        new_v = ray_vec_append(new_v, &v_data[r]);
+        if (RAY_IS_ERR(new_v)) { ray_release(new_e); ray_release(new_a); return new_v; }
+    }
+
+    /* Build result table, reusing the column names of `db`. */
+    ray_t* result = ray_table_new(3);
+    if (RAY_IS_ERR(result)) { ray_release(new_e); ray_release(new_a); ray_release(new_v); return result; }
+    result = ray_table_add_col(result, ray_table_col_name(db, 0), new_e);
+    ray_release(new_e);
+    if (RAY_IS_ERR(result)) { ray_release(new_a); ray_release(new_v); return result; }
+    result = ray_table_add_col(result, ray_table_col_name(db, 1), new_a);
+    ray_release(new_a);
+    if (RAY_IS_ERR(result)) { ray_release(new_v); return result; }
+    result = ray_table_add_col(result, ray_table_col_name(db, 2), new_v);
+    ray_release(new_v);
+    return result;
+}
+
+/* (scan-eav db attr)        — rows whose attribute is `attr`, as an [e v] table
+ * (scan-eav db entity attr) — value of the first row matching entity+attr
+ *
+ * `db` must be a 3-column table.  The three-argument form returns an i64
+ * atom, or a "value" error when no row matches. */
+ray_t* ray_scan_eav_fn(ray_t** args, int64_t n) {
+    if (n != 2 && n != 3)
+        return ray_error("arity", "scan-eav expects 2 or 3 arguments");
+
+    ray_t* db = args[0];
+    if (db->type != RAY_TABLE || ray_table_ncols(db) != 3)
+        return ray_error("type", "scan-eav: first arg must be a datoms table");
+
+    ray_t* ent_col  = ray_table_get_col_idx(db, 0);
+    ray_t* attr_col = ray_table_get_col_idx(db, 1);
+    ray_t* val_col  = ray_table_get_col_idx(db, 2);
+    int64_t total   = ray_table_nrows(db);
+
+    if (n == 3) {
+        /* Point lookup: entity + attribute -> single value. */
+        ray_t* want_entity = args[1];
+        ray_t* want_attr   = args[2];
+
+        if (want_entity->type != -RAY_I64)
+            return ray_error("type", "scan-eav: entity must be an integer");
+        if (want_attr->type != -RAY_SYM)
+            return ray_error("type", "scan-eav: attr must be a symbol");
+
+        int64_t target_e = want_entity->i64;
+        int64_t target_a = want_attr->i64;
+
+        const int64_t* ents = (const int64_t*)ray_data(ent_col);
+        const int64_t* vals = (const int64_t*)ray_data(val_col);
+
+        for (int64_t i = 0; i < total; i++) {
+            if (ents[i] == target_e &&
+                ray_read_sym(ray_data(attr_col), i, attr_col->type, attr_col->attrs) == target_a)
+                return ray_i64(vals[i]);
+        }
+
+        return ray_error("value", "scan-eav: no matching triple found");
+    }
+
+    /* Attribute scan: collect (e, v) for every row carrying the attribute. */
+    ray_t* want_attr = args[1];
+    if (want_attr->type != -RAY_SYM)
+        return ray_error("type", "scan-eav: attr must be a symbol");
+    int64_t target_a = want_attr->i64;
+
+    int64_t name_e = ray_sym_intern("e", 1);
+    int64_t name_v = ray_sym_intern("v", 1);
+
+    ray_t* out_e = ray_vec_new(RAY_I64, total);
+    if (RAY_IS_ERR(out_e)) return out_e;
+    ray_t* out_v = ray_vec_new(RAY_I64, total);
+    if (RAY_IS_ERR(out_v)) { ray_release(out_e); return out_v; }
+
+    const int64_t* ents = (const int64_t*)ray_data(ent_col);
+    const int64_t* vals = (const int64_t*)ray_data(val_col);
+
+    for (int64_t i = 0; i < total; i++) {
+        int64_t row_attr = ray_read_sym(ray_data(attr_col), i, attr_col->type, attr_col->attrs);
+        if (row_attr != target_a) continue;
+
+        out_e = ray_vec_append(out_e, &ents[i]);
+        if (RAY_IS_ERR(out_e)) { ray_release(out_v); return out_e; }
+        out_v = ray_vec_append(out_v, &vals[i]);
+        if (RAY_IS_ERR(out_v)) { ray_release(out_e); return out_v; }
+    }
+
+    ray_t* table = ray_table_new(2);
+    if (RAY_IS_ERR(table)) { ray_release(out_e); ray_release(out_v); return table; }
+    table = ray_table_add_col(table, name_e, out_e);
+    ray_release(out_e);
+    if (RAY_IS_ERR(table)) { ray_release(out_v); return table; }
+    table = ray_table_add_col(table, name_v, out_v);
+    ray_release(out_v);
+    return table;
+}
+
+/* (pull db entity) — all attributes of entity as dict
+ * (pull db entity [attrs]) — only specified attributes as dict
+ *
+ * `db` must be a 3-column table and `entity` an i64 atom; the optional third
+ * argument must be a symbol vector.  The result is built with ray_dict_new
+ * from a symbol vector of attribute IDs and a list of i64 value atoms, one
+ * entry per matching row (an attribute that occurs on several rows appears
+ * several times).
+ *
+ * Filter entries are decoded with ray_read_sym, exactly like the attribute
+ * column, instead of reinterpreting the filter vector as raw int64_t. */
+ray_t* ray_pull_fn(ray_t** args, int64_t n) {
+    if (n < 2 || n > 3)
+        return ray_error("arity", "pull expects 2 or 3 arguments: db entity [attrs]");
+
+    ray_t* db     = args[0];
+    ray_t* entity = args[1];
+
+    if (db->type != RAY_TABLE || ray_table_ncols(db) != 3)
+        return ray_error("type", "pull: first arg must be a datoms table");
+    if (entity->type != -RAY_I64)
+        return ray_error("type", "pull: entity must be an integer");
+
+    /* Optional attribute filter */
+    ray_t* attr_filter = NULL;
+    int64_t n_filter = 0;
+    if (n == 3) {
+        attr_filter = args[2];
+        if (!ray_is_vec(attr_filter) || attr_filter->type != RAY_SYM)
+            return ray_error("type", "pull: third arg must be a symbol vector [attr ...]");
+        n_filter = attr_filter->len;
+    }
+
+    int64_t entity_id = entity->i64;
+    ray_t* e_col = ray_table_get_col_idx(db, 0);
+    ray_t* a_col = ray_table_get_col_idx(db, 1);
+    ray_t* v_col = ray_table_get_col_idx(db, 2);
+    int64_t nrows = ray_table_nrows(db);
+
+    const int64_t* e_data = (const int64_t*)ray_data(e_col);
+    const int64_t* v_data = (const int64_t*)ray_data(v_col);
+
+    /* Build dict: keys SYM vec of attribute IDs, vals LIST of i64 atoms. */
+    ray_t* keys = ray_sym_vec_new(RAY_SYM_W64, 8);
+    if (RAY_IS_ERR(keys)) return keys;
+    ray_t* vals = ray_list_new(8);
+    if (RAY_IS_ERR(vals)) { ray_release(keys); return vals; }
+
+    for (int64_t r = 0; r < nrows; r++) {
+        if (e_data[r] != entity_id) continue;
+        int64_t a_val = ray_read_sym(ray_data(a_col), r, a_col->type, a_col->attrs);
+
+        /* Check filter if present */
+        if (attr_filter) {
+            int found = 0;
+            for (int64_t f = 0; f < n_filter; f++) {
+                int64_t wanted = ray_read_sym(ray_data(attr_filter), f,
+                                              attr_filter->type, attr_filter->attrs);
+                if (wanted == a_val) { found = 1; break; }
+            }
+            if (!found) continue;
+        }
+
+        keys = ray_vec_append(keys, &a_val);
+        if (RAY_IS_ERR(keys)) { ray_release(vals); return keys; }
+
+        ray_t* val = ray_i64(v_data[r]);
+        if (RAY_IS_ERR(val)) { ray_release(keys); ray_release(vals); return val; }
+        vals = ray_list_append(vals, val);
+        /* Drop our reference to the atom after handing it to the list. */
+        ray_release(val);
+        if (RAY_IS_ERR(vals)) { ray_release(keys); return vals; }
+    }
+
+    return ray_dict_new(keys, vals);
+}
+
+/* ══════════════════════════════════════════
+ * Datalog — rule definitions and query compilation
+ * ══════════════════════════════════════════ */
+
+/* Return 1 when `x` is a symbol atom whose name begins with '?' (a Datalog
+ * variable), 0 otherwise — including NULL, non-symbol, or unresolvable
+ * symbols. */
+static int is_dl_var(ray_t* x) {
+    if (x == NULL) return 0;
+    if (x->type != -RAY_SYM) return 0;
+
+    ray_t* name = ray_sym_str(x->i64);
+    const char* text = name ? ray_str_ptr(name) : NULL;
+    return (text != NULL) && (text[0] == '?');
+}
+
+/* ══════════════════════════════════════════
+ * Datalog wrappers — thin layer over src/datalog/datalog.h
+ *
+ * Global rule storage lives in g_dl_rules[] / g_dl_n_rules.
+ * ray_rule_fn parses Rayfall (rule ...) syntax and stores rules.
+ * ray_query_fn builds a temporary dl_program_t, copies global rules,
+ * registers the EAV table, evaluates to fixpoint, and returns results.
+ * ══════════════════════════════════════════ */
+
+/* Global rule storage: rules defined via (rule ...) persist across queries */
+static dl_rule_t  g_dl_rules[DL_MAX_RULES];
+static int        g_dl_n_rules = 0;
+
+/* Add every globally stored rule to `prog`, in definition order.
+ * A NULL program is ignored; the result of dl_add_rule is not inspected. */
+void dl_append_global_rules(dl_program_t* prog) {
+    if (prog == NULL) return;
+
+    int next = 0;
+    while (next < g_dl_n_rules) {
+        dl_add_rule(prog, &g_dl_rules[next]);
+        next++;
+    }
+}
+
+/* Variable name -> index map for parsing a single rule or query body */
+typedef struct {
+    int64_t syms[DL_MAX_ARITY * DL_MAX_BODY];
+    int     n;
+} dl_var_map_t;
+
+/* Return the index assigned to `sym_id`, registering it in the next free
+ * slot on first sight.  Returns -1 once the map is full. */
+static int dl_var_get_or_create(dl_var_map_t* map, int64_t sym_id) {
+    const int capacity = DL_MAX_ARITY * DL_MAX_BODY;
+
+    for (int slot = 0; slot < map->n; slot++) {
+        if (map->syms[slot] == sym_id)
+            return slot;
+    }
+
+    if (map->n >= capacity)
+        return -1;
+
+    int fresh = map->n;
+    map->syms[fresh] = sym_id;
+    map->n = fresh + 1;
+    return fresh;
+}
+
+/* Map Rayfall comparison operator name to DL_CMP_* constant.
+ * Returns -1 if `name` is NULL or not a recognized comparison. */
+static int dl_cmp_op_from_name(const char* name) {
+    /* strcmp on a null pointer is undefined; treat it as "not an operator". */
+    if (!name) return -1;
+    if (strcmp(name, ">")  == 0) return DL_CMP_GT;
+    if (strcmp(name, ">=") == 0) return DL_CMP_GE;
+    if (strcmp(name, "<")  == 0) return DL_CMP_LT;
+    if (strcmp(name, "<=") == 0) return DL_CMP_LE;
+    if (strcmp(name, "==") == 0) return DL_CMP_EQ;
+    if (strcmp(name, "!=") == 0) return DL_CMP_NE;
+    return -1;
+}
+
+/* Translate a Rayfall arithmetic operator name into the OP_* code stored in
+ * a dl_expr_t.  Only the one-character names + - * / are recognized; any
+ * other spelling (including the empty string) yields -1. */
+static int dl_arith_op_from_name(const char* name) {
+    if (name[0] == '\0' || name[1] != '\0') return -1;   /* exactly one character */
+    const char c = name[0];
+    return c == '+' ? OP_ADD : c == '-' ? OP_SUB
+         : c == '*' ? OP_MUL : c == '/' ? OP_DIV : -1;
+}
+
+/* Build a dl_expr_t from a Rayfall AST node.  Handles integer and float
+ * constants, ?variables, (op expr expr) with op in + - * /, and bare symbols. */
+static dl_expr_t* dl_build_expr(ray_t* node, dl_var_map_t* vars) {
+    if (!node) return NULL;
+    if (node->type == -RAY_I64)
+        return dl_expr_const(node->i64);
+    if (node->type == -RAY_F64)
+        return dl_expr_const_f64(node->f64);
+    if (node->type == -RAY_SYM && is_dl_var(node)) {
+        int vi = dl_var_get_or_create(vars, node->i64);
+        return (vi >= 0) ? dl_expr_var(vi) : NULL;  /* NULL when the variable map is full */
+    }
+    if (is_list(node) && ray_len(node) == 3) {
+        ray_t** elems = (ray_t**)ray_data(node);
+        if (elems[0]->type == -RAY_SYM) {
+            ray_t* op_str = ray_sym_str(elems[0]->i64);
+            if (op_str) {
+                int op = dl_arith_op_from_name(ray_str_ptr(op_str));
+                if (op >= 0) {
+                    dl_expr_t* l = dl_build_expr(elems[1], vars);  /* operands are built recursively */
+                    dl_expr_t* r = dl_build_expr(elems[2], vars);
+                    if (l && r) return dl_expr_binop(op, l, r);  /* NOTE(review): a lone non-NULL l or r is dropped here; confirm whether it must be freed */
+                }
+            }
+        }
+    }
+    /* Fallback: a non-variable symbol becomes a constant holding its sym ID; a list that reaches this point yields NULL */
+    if (node->type == -RAY_SYM)
+        return dl_expr_const(node->i64);
+    return NULL;
+}
+
+/* True when node is a symbol whose name is exactly "_" (the wildcard). */
+static bool dl_is_wildcard(ray_t* node) {
+    if (node->type != -RAY_SYM) return false;
+    ray_t* s = ray_sym_str(node->i64);
+    return s && ray_str_len(s) == 1 && ray_str_ptr(s)[0] == '_';
+}
+
+
+/* Check if a Rayfall list clause is a triple pattern such as (?e :attr ?v).
+ * It must be a list of exactly 3 elements; what position 0 may hold is
+ * spelled out in the body.  This separates triples from rule invocations,
+ * whose first element is a bare predicate name symbol. */
+static bool dl_is_triple_pattern(ray_t* clause) {
+    if (!is_list(clause) || ray_len(clause) != 3) return false;
+    ray_t** ce = (ray_t**)ray_data(clause);
+    /* Position 0 must be a ?variable, wildcard _, integer constant,
+     * or quoted symbol (not a bare name that could be a rule predicate).
+     * Triple patterns: (?e :attr ?v), (_ :attr ?v), (1 :attr ?v) */
+    if (is_dl_var(ce[0])) return true;
+    if (ce[0]->type == -RAY_I64) return true;
+    if (dl_is_wildcard(ce[0]) && ce[1]->type == -RAY_SYM && !is_dl_var(ce[1]))
+        return true;  /* _ is always wildcard -- reserved, never a predicate */
+    /* Quoted symbol (no RAY_ATTR_NAME) in position 0 + non-var symbol in position 1 */
+    if (ce[0]->type == -RAY_SYM && !(ce[0]->attrs & RAY_ATTR_NAME)) {
+        if (ce[1]->type == -RAY_SYM && !is_dl_var(ce[1]))
+            return true;
+    }
+    return false;
+}
+
+/* Recognise (not <clause>): a 2-element list headed by the symbol `not`. */
+static bool dl_is_negation(ray_t* clause) {
+    if (!is_list(clause)) return false;
+    if (ray_len(clause) != 2) return false;
+    ray_t* head = ((ray_t**)ray_data(clause))[0];
+    ray_t* label = (head->type == -RAY_SYM) ? ray_sym_str(head->i64) : NULL;
+    return label != NULL && strcmp(ray_str_ptr(label), "not") == 0;
+}
+
+/* Recognise a comparison clause such as (> ?x ?y) or (> ?x 100): a list of at
+ * least 3 elements whose head symbol is accepted by dl_cmp_op_from_name(). */
+static bool dl_is_comparison(ray_t* clause) {
+    if (!is_list(clause)) return false;
+    if (ray_len(clause) < 3) return false;
+    ray_t* head = ((ray_t**)ray_data(clause))[0];
+    ray_t* label = (head->type == -RAY_SYM) ? ray_sym_str(head->i64) : NULL;
+    return label != NULL && dl_cmp_op_from_name(ray_str_ptr(label)) >= 0;
+}
+
+/* Recognise an assignment clause (= ?var expr): a 3-element list headed by
+ * the symbol `=` whose second element is a ?variable. */
+static bool dl_is_assignment(ray_t* clause) {
+    if (!is_list(clause)) return false;
+    if (ray_len(clause) != 3) return false;
+    ray_t** parts = (ray_t**)ray_data(clause);
+    ray_t* label = (parts[0]->type == -RAY_SYM) ? ray_sym_str(parts[0]->i64) : NULL;
+    if (label == NULL || strcmp(ray_str_ptr(label), "=") != 0) return false;
+    return is_dl_var(parts[1]) != 0;   /* the assignment target must be a variable */
+}
+
+/* Map an aggregate operator name (count, sum, min, max, avg) to its DL_AGG_*
+ * code; -1 for any other name.  This is the only list of aggregate names. */
+static int dl_agg_op_from_name(const char* n) {
+    if (strcmp(n, "count") == 0) return DL_AGG_COUNT;
+    if (strcmp(n, "sum")   == 0) return DL_AGG_SUM;
+    if (strcmp(n, "min")   == 0) return DL_AGG_MIN;
+    if (strcmp(n, "max")   == 0) return DL_AGG_MAX;
+    if (strcmp(n, "avg")   == 0) return DL_AGG_AVG;
+    return -1;
+}
+
+/* Check if a clause is an aggregate: a list of at least 3 elements whose head
+ * symbol dl_agg_op_from_name() accepts (the parser also treats < 0 as unknown). */
+static bool dl_is_aggregate(ray_t* clause) {
+    if (!is_list(clause) || ray_len(clause) < 3) return false;
+    ray_t** ce = (ray_t**)ray_data(clause);
+    if (ce[0]->type != -RAY_SYM) return false;
+    ray_t* name = ray_sym_str(ce[0]->i64);
+    return name && dl_agg_op_from_name(ray_str_ptr(name)) >= 0;
+}
+
+static bool dl_sym_is_name(ray_t* sym, const char* lit) {  /* is sym a symbol spelled exactly lit? */
+    if (sym == NULL) return false;
+    ray_t* text = (sym->type == -RAY_SYM) ? ray_sym_str(sym->i64) : NULL;
+    return text != NULL && strcmp(ray_str_ptr(text), lit) == 0;
+}
+
+/* Bind position `pos` of body atom `bidx` to the variable or constant that
+ * `node` denotes, evaluating forms such as (quote x) first.  Returns NULL on
+ * success, or an error (variable map full, failed eval, unsupported type). */
+static ray_t* dl_set_body_pos(dl_rule_t* rule, int bidx, int pos,
+                                ray_t* node, dl_var_map_t* vars) {
+    if (is_dl_var(node)) {
+        int vi = dl_var_get_or_create(vars, node->i64);
+        if (vi < 0)   /* map full: never hand -1 to the rule as an index */
+            return ray_error("domain", "rule: too many variables");
+        dl_body_set_var(rule, bidx, pos, vi);
+        return NULL;
+    }
+    if (node->type == -RAY_I64) {
+        dl_body_set_const(rule, bidx, pos, node->i64);
+        return NULL;
+    }
+    if (node->type == -RAY_SYM) {
+        if (dl_is_wildcard(node)) {
+            /* Wildcard: each `_` gets a fresh variable keyed by -1 - vi.
+             * Check capacity first so syms[] is never written past its end. */
+            if (vars->n >= DL_MAX_ARITY * DL_MAX_BODY)
+                return ray_error("domain", "rule: too many variables");
+            int vi = vars->n++;
+            vars->syms[vi] = -1 - vi;
+            dl_body_set_var(rule, bidx, pos, vi);
+        } else {
+            dl_body_set_const(rule, bidx, pos, node->i64);
+        }
+        return NULL;
+    }
+    if (node->type == -RAY_STR) {
+        /* Quoted string literal in body: intern as sym so it compares equal
+         * to other sym-interned constants.  Mirrors the head parser. */
+        int64_t sym = ray_sym_intern(ray_str_ptr(node), ray_str_len(node));
+        dl_body_set_const(rule, bidx, pos, sym);
+        return NULL;
+    }
+    /* For other forms (e.g., (quote x)), evaluate to get constant */
+    ray_t* val = ray_eval(node);
+    if (!val || RAY_IS_ERR(val))
+        return val ? val : ray_error("type", "rule: cannot evaluate constant in body");
+    if (val->type != -RAY_I64 && val->type != -RAY_SYM) {
+        ray_release(val);
+        return ray_error("type", "rule: unsupported constant type in body");
+    }
+    dl_body_set_const(rule, bidx, pos, val->i64);   /* integer value or sym id */
+    ray_release(val);
+    return NULL;
+}
+
+/* Parse one body clause and append the matching literal(s) to `rule`.
+ * Handles triple patterns, negations, aggregates, between, assignments,
+ * comparisons and rule invocations.  Returns NULL on success, else an error. */
+static ray_t* dl_parse_body_clause(dl_rule_t* rule, ray_t* clause,
+                                     dl_var_map_t* vars, dl_program_t* prog) {
+    if (!is_list(clause) || ray_len(clause) < 1)
+        return ray_error("type", "rule/query: body clause must be a list");
+
+    ray_t** ce = (ray_t**)ray_data(clause);
+    int64_t clen = ray_len(clause);
+
+    /* -- Triple pattern: (?e :attr ?v) -- */
+    if (dl_is_triple_pattern(clause)) {
+        /* Register as 3-arity atom on "eav" relation:
+         * position 0 = entity, 1 = attr (constant), 2 = value */
+        int bidx = dl_rule_add_atom(rule, "eav", 3);
+        if (bidx < 0) return ray_error("domain", "rule: too many body literals");
+
+        for (int p = 0; p < 3; p++) {
+            ray_t* err = dl_set_body_pos(rule, bidx, p, ce[p], vars);
+            if (err) return err;
+        }
+        return NULL; /* success */
+    }
+
+    /* -- Negation: (not (?e :attr ?v))  or  (not (rule-name ?args...)) -- */
+    if (dl_is_negation(clause)) {
+        ray_t* inner = ce[1];
+        if (!is_list(inner) || ray_len(inner) < 1)
+            return ray_error("type", "not: inner clause must be a list");
+        ray_t** ie = (ray_t**)ray_data(inner);
+        int64_t ilen = ray_len(inner);
+
+        if (dl_is_triple_pattern(inner)) {
+            /* Negated triple: (not (?e :attr ?v)) */
+            int bidx = dl_rule_add_neg(rule, "eav", 3);
+            if (bidx < 0) return ray_error("domain", "rule: too many body literals");
+
+            for (int p = 0; p < 3; p++) {
+                ray_t* err = dl_set_body_pos(rule, bidx, p, ie[p], vars);
+                if (err) return err;
+            }
+        } else {
+            /* Negated rule invocation: (not (rule-name ?a ?b)) */
+            if (ie[0]->type != -RAY_SYM)
+                return ray_error("type", "not: inner clause head must be a symbol");
+            ray_t* pred_name = ray_sym_str(ie[0]->i64);
+            if (!pred_name)
+                return ray_error("type", "not: cannot resolve predicate name");
+
+            int bidx = dl_rule_add_neg(rule, ray_str_ptr(pred_name), (int)(ilen - 1));
+            if (bidx < 0) return ray_error("domain", "rule: too many body literals");
+
+            for (int64_t j = 1; j < ilen; j++) {
+                ray_t* err = dl_set_body_pos(rule, bidx, (int)(j - 1), ie[j], vars);
+                if (err) return err;
+            }
+        }
+        return NULL;
+    }
+
+    /* -- Aggregate: (count ?N pred) | (sum ?S pred col) | ... [by ?k col ...] -- */
+    if (dl_is_aggregate(clause)) {
+        ray_t* op_str = ray_sym_str(ce[0]->i64);
+        if (!op_str) return ray_error("type", "aggregate: bad operator");
+        int op = dl_agg_op_from_name(ray_str_ptr(op_str));
+        if (op < 0) return ray_error("type", "aggregate: unknown operator");
+
+        if (!is_dl_var(ce[1]))
+            return ray_error("type", "aggregate: first argument must be ?variable");
+        int target_vi = dl_var_get_or_create(vars, ce[1]->i64);
+        if (target_vi < 0)
+            return ray_error("domain", "aggregate: too many variables");
+
+        if (ce[2]->type != -RAY_SYM)
+            return ray_error("type", "aggregate: predicate must be a symbol");
+        ray_t* pred_sym = ray_sym_str(ce[2]->i64);
+        if (!pred_sym)
+            return ray_error("type", "aggregate: cannot resolve predicate name");
+        const char* pred_name = ray_str_ptr(pred_sym);
+
+        /* Record arity=0 as "unknown" when we can't resolve it against the
+         * program (prog=NULL or predicate not yet registered).  The compiler
+         * and env auto-register treat 0 as a wildcard and resolve against the
+         * source relation at evaluation time.  A hardcoded 1 would spuriously
+         * reject any env-bound table whose arity isn't 1. */
+        int pred_arity = 0;
+        if (prog) {
+            int ri = dl_find_rel(prog, pred_name);
+            if (ri >= 0) pred_arity = prog->rels[ri].arity;
+        }
+
+        int i = 3;
+        bool has_value_col = false;
+        int value_col = 0;
+        int key_vars[DL_AGG_MAX_KEYS];
+        int key_cols[DL_AGG_MAX_KEYS];
+        int n_keys = 0;
+
+        while (i < clen) {
+            if (dl_sym_is_name(ce[i], "by")) {
+                i++;
+                while (i < clen) {
+                    if (!is_dl_var(ce[i]))
+                        return ray_error("type", "aggregate: group key must be ?variable");
+                    if (n_keys >= DL_AGG_MAX_KEYS)
+                        return ray_error("domain", "aggregate: too many group keys");
+                    key_vars[n_keys] = dl_var_get_or_create(vars, ce[i]->i64);
+                    if (key_vars[n_keys] < 0)   /* variable map is full */
+                        return ray_error("domain", "aggregate: too many variables");
+                    i++;
+                    if (i >= clen || ce[i]->type != -RAY_I64)
+                        return ray_error("type", "aggregate: group key column must be integer");
+                    key_cols[n_keys] = (int)ce[i]->i64;
+                    i++;
+                    n_keys++;
+                }
+                break;
+            }
+            if (ce[i]->type == -RAY_I64) {
+                if (has_value_col)
+                    return ray_error("type", "aggregate: at most one value column index");
+                has_value_col = true;
+                value_col = (int)ce[i]->i64;
+                i++;
+                continue;
+            }
+            return ray_error("type", "aggregate: unexpected token in aggregate clause");
+        }
+
+        if (op == DL_AGG_COUNT && has_value_col)
+            return ray_error("type", "aggregate: count does not take a value column");
+        if (op != DL_AGG_COUNT && !has_value_col)
+            return ray_error("type", "aggregate: sum/min/max/avg require a value column index");
+
+        int bidx = dl_rule_add_agg(rule, op, target_vi, pred_name, pred_arity, has_value_col ? value_col : 0);
+        if (bidx < 0) return ray_error("domain", "rule: too many body literals");
+        if (n_keys > 0) {
+            if (dl_rule_agg_set_group(rule, bidx, key_vars, key_cols, n_keys) != 0)
+                return ray_error("domain", "aggregate: cannot attach group keys");
+        }
+        return NULL;
+    }
+
+    /* -- Between sugar: (between ?x lo hi) -> (>= ?x lo) and (<= ?x hi) -- */
+    if (clen == 4 && ce[0]->type == -RAY_SYM) {
+        ray_t* nm = ray_sym_str(ce[0]->i64);
+        if (nm && strcmp(ray_str_ptr(nm), "between") == 0) {
+            if (!is_dl_var(ce[1]))
+                return ray_error("type", "between target must be a ?variable");
+            int vi = dl_var_get_or_create(vars, ce[1]->i64);
+            if (vi < 0)
+                return ray_error("domain", "between: too many variables");
+            if (ce[2]->type != -RAY_I64 || ce[3]->type != -RAY_I64)
+                return ray_error("type", "between bounds must be integer constants");
+            if (dl_rule_add_cmp_const(rule, DL_CMP_GE, vi, ce[2]->i64) < 0)
+                return ray_error("domain", "rule: too many body literals");
+            if (dl_rule_add_cmp_const(rule, DL_CMP_LE, vi, ce[3]->i64) < 0)
+                return ray_error("domain", "rule: too many body literals");
+            return NULL;
+        }
+    }
+
+    /* -- Assignment: (= ?var expr) -- */
+    if (dl_is_assignment(clause)) {
+        int target_vi = dl_var_get_or_create(vars, ce[1]->i64);
+        if (target_vi < 0)   /* variable map is full */
+            return ray_error("domain", "rule: too many variables");
+        dl_expr_t* expr = dl_build_expr(ce[2], vars);
+        if (!expr)
+            return ray_error("type", "rule: cannot parse assignment expression");
+        dl_rule_add_assign(rule, target_vi, DL_OP_EQ, expr);
+        return NULL;
+    }
+
+    /* -- Comparison: (> ?x ?y) or (> ?x 100) -- */
+    if (dl_is_comparison(clause)) {
+        ray_t* op_str = ray_sym_str(ce[0]->i64);
+        int cmp_op = dl_cmp_op_from_name(ray_str_ptr(op_str));
+        /* LHS (dl_is_comparison guarantees clen >= 3, so ce[2] exists) */
+        bool lhs_is_var = is_dl_var(ce[1]);
+        int lhs_vi = lhs_is_var ? dl_var_get_or_create(vars, ce[1]->i64) : -1;
+        bool lhs_is_const = (!lhs_is_var && (ce[1]->type == -RAY_I64 || ce[1]->type == -RAY_SYM));
+        int64_t lhs_const = lhs_is_const ? ce[1]->i64 : 0;
+
+        /* RHS */
+        bool rhs_is_var = is_dl_var(ce[2]);
+        int rhs_vi = rhs_is_var ? dl_var_get_or_create(vars, ce[2]->i64) : -1;
+        bool rhs_is_const = !rhs_is_var &&
+                            (ce[2]->type == -RAY_I64 || ce[2]->type == -RAY_SYM);
+        int64_t rhs_const = rhs_is_const ? ce[2]->i64 : 0;
+
+        /* A variable operand with index -1 means the variable map is full. */
+        if ((lhs_is_var && lhs_vi < 0) || (rhs_is_var && rhs_vi < 0))
+            return ray_error("domain", "rule: too many variables");
+
+        if (lhs_is_var && rhs_is_var) {
+            dl_rule_add_cmp(rule, cmp_op, lhs_vi, rhs_vi);
+        } else if (lhs_is_var && rhs_is_const) {
+            if (dl_rule_add_cmp_const(rule, cmp_op, lhs_vi, rhs_const) < 0)
+                return ray_error("domain", "rule: too many body literals");
+        } else if (lhs_is_const && rhs_is_var) {
+            /* Flip: const op var -> var flipped_op const */
+            int flipped = cmp_op;
+            switch (cmp_op) {
+            case DL_CMP_GT: flipped = DL_CMP_LT; break;
+            case DL_CMP_GE: flipped = DL_CMP_LE; break;
+            case DL_CMP_LT: flipped = DL_CMP_GT; break;
+            case DL_CMP_LE: flipped = DL_CMP_GE; break;
+            default: break;
+            }
+            if (dl_rule_add_cmp_const(rule, flipped, rhs_vi, lhs_const) < 0)
+                return ray_error("domain", "rule: too many body literals");
+        } else {
+            /* Expression-based comparison */
+            dl_expr_t* le = dl_build_expr(ce[1], vars);
+            dl_expr_t* re = dl_build_expr(ce[2], vars);
+            if (le && re)
+                dl_rule_add_cmp_expr(rule, cmp_op, le, re);
+            else
+                return ray_error("type", "rule: cannot parse comparison operands");
+        }
+        return NULL;
+    }
+
+    /* -- Rule invocation / positive atom: (pred-name ?a ?b ...) -- */
+    if (ce[0]->type == -RAY_SYM) {
+        ray_t* pred_name = ray_sym_str(ce[0]->i64);
+        if (!pred_name)
+            return ray_error("type", "rule: cannot resolve predicate name");
+
+        int bidx = dl_rule_add_atom(rule, ray_str_ptr(pred_name), (int)(clen - 1));
+        if (bidx < 0) return ray_error("domain", "rule: too many body literals");
+
+        for (int64_t j = 1; j < clen; j++) {
+            ray_t* err = dl_set_body_pos(rule, bidx, (int)(j - 1), ce[j], vars);
+            if (err) return err;
+        }
+        return NULL;
+    }
+
+    return ray_error("type", "rule/query: unrecognized body clause form");
+}
+
+/* Parse head + body clauses into out (shared by rule and query inline rules).
+ * Returns NULL on success, else an error; `out` may then be partially filled. */
+static ray_t* dl_parse_rule_from_head_and_body(dl_rule_t* out, ray_t* head,
+                                                ray_t** body_args, int64_t n_body,
+                                                dl_var_map_t* vars, dl_program_t* prog) {
+    if (!is_list(head) || ray_len(head) < 1)
+        return ray_error("type", "rule: head must be (name ?var ...)");
+    ray_t** hd = (ray_t**)ray_data(head);
+    int64_t hlen = ray_len(head);
+
+    if (hd[0]->type != -RAY_SYM)
+        return ray_error("type", "rule: head name must be a symbol");
+    ray_t* head_name_str = ray_sym_str(hd[0]->i64);
+    if (!head_name_str)
+        return ray_error("type", "rule: cannot resolve head name");
+    if (ray_str_len(head_name_str) == 1 && ray_str_ptr(head_name_str)[0] == '_')
+        return ray_error("domain", "rule: _ is reserved as wildcard");
+
+    int head_arity = (int)(hlen - 1);
+    dl_rule_init(out, ray_str_ptr(head_name_str), head_arity);
+
+    for (int i = 0; i < head_arity; i++) {
+        ray_t* harg = hd[i + 1];
+        if (is_dl_var(harg)) {
+            int vi = dl_var_get_or_create(vars, harg->i64);
+            if (vi < 0)   /* map full: -1 must not become a head variable index */
+                return ray_error("domain", "rule: too many variables");
+            dl_rule_head_var(out, i, vi);
+        } else if (harg->type == -RAY_I64) {
+            dl_rule_head_const_typed(out, i, harg->i64, RAY_I64);
+        } else if (harg->type == -RAY_SYM) {
+            dl_rule_head_const_typed(out, i, harg->i64, RAY_SYM);
+        } else if (harg->type == -RAY_F64) {
+            int64_t bits;
+            memcpy(&bits, &harg->f64, sizeof(bits));
+            dl_rule_head_const_typed(out, i, bits, RAY_F64);
+        } else if (harg->type == -RAY_STR) {
+            /* Intern the string as a sym so it can be stored in a RAY_SYM
+             * column.  Matches the body-literal parser convention. */
+            int64_t sym = ray_sym_intern(ray_str_ptr(harg), ray_str_len(harg));
+            dl_rule_head_const_typed(out, i, sym, RAY_SYM);
+        } else {
+            return ray_error("type", "rule: head arguments must be ?variables or constants");
+        }
+    }
+
+    for (int64_t i = 0; i < n_body; i++) {
+        ray_t* err = dl_parse_body_clause(out, body_args[i], vars, prog);
+        if (err) return err;
+    }
+
+    out->n_vars = vars->n;
+    return NULL;
+}
+
+/* Parse one inline rule written as ((head-name ?a ...) body1 body2 ...). */
+static ray_t* dl_parse_inline_rule(dl_rule_t* out, ray_t* rule_list, dl_program_t* prog) {
+    const int64_t count = is_list(rule_list) ? ray_len(rule_list) : 0;
+    if (count < 1)
+        return ray_error("type", "query: each (rules ...) entry must be a non-empty list");
+
+    dl_var_map_t scope;   /* each inline rule starts from an empty variable map */
+    memset(&scope, 0, sizeof(scope));
+    ray_t** items = (ray_t**)ray_data(rule_list);
+    return dl_parse_rule_from_head_and_body(out, items[0], items + 1, count - 1, &scope, prog);
+}
+
+/* (rule (head-name ?v1 ?v2 ...) clause1 clause2 ...)
+ * Special form: args are NOT evaluated.  The head and body are parsed into
+ * a dl_rule_t, which is then appended to the global rule store. */
+ray_t* ray_rule_fn(ray_t** args, int64_t n) {
+    if (n < 2)
+        return ray_error("arity", "rule expects at least a head and one body clause");
+    if (g_dl_n_rules >= DL_MAX_RULES)
+        return ray_error("domain", "rule: too many rules (max 128)");
+
+    /* Parse into a local first so a failed parse leaves the store untouched. */
+    dl_rule_t parsed;
+    dl_var_map_t scope;
+    memset(&scope, 0, sizeof(scope));
+    ray_t* failure = dl_parse_rule_from_head_and_body(&parsed, args[0], args + 1, n - 1, &scope, NULL);
+    if (failure != NULL) return failure;
+
+    memcpy(&g_dl_rules[g_dl_n_rules++], &parsed, sizeof(parsed));   /* take the next free slot */
+    return ray_bool(true);
+}
+
+/* (query db (find ?a ?b ...) (where clause1 clause2 ...) [(rules ...)])
+ * Optional fourth arg (rules ...) supplies inline rules only (globals ignored).
+ * Special form: db is evaluated, find/where are NOT evaluated.
+ * Creates a temporary dl_program_t, registers the EAV table,
+ * copies global rules (unless inline rules), builds a synthetic query rule, and evaluates. */
+ray_t* ray_query_fn(ray_t** args, int64_t n) {
+    if (n < 3 || n > 4)
+        return ray_error("arity", "query expects: db (find ...) (where ...) [(rules ...)]");
+
+    /* Evaluate db (first arg) */
+    ray_t* db = ray_eval(args[0]);
+    if (!db || RAY_IS_ERR(db)) return db ? db : ray_error("type", "query: db is null");
+    if (db->type != RAY_TABLE) { ray_release(db); return ray_error("type", "query: first arg must be a datoms table"); }
+
+    /* Parse find clause */
+    ray_t* find_clause = args[1];
+    if (!is_list(find_clause) || ray_len(find_clause) < 2) {
+        ray_release(db);
+        return ray_error("type", "query: second arg must be (find ?var ...)");
+    }
+    ray_t** find_elems = (ray_t**)ray_data(find_clause);
+    int64_t find_len = ray_len(find_clause);
+
+    /* Verify it starts with 'find' */
+    if (find_elems[0]->type != -RAY_SYM) {
+        ray_release(db);
+        return ray_error("type", "query: expected (find ...)");
+    }
+    ray_t* find_name = ray_sym_str(find_elems[0]->i64);
+    if (!find_name || strcmp(ray_str_ptr(find_name), "find") != 0) {
+        ray_release(db);
+        return ray_error("type", "query: expected (find ...) as second argument");
+    }
+
+    /* Collect find variable sym IDs */
+    int64_t find_var_syms[DL_MAX_ARITY];
+    int n_find_vars = 0;
+    for (int64_t i = 1; i < find_len && n_find_vars < DL_MAX_ARITY; i++) {
+        if (!is_dl_var(find_elems[i])) {
+            ray_release(db);
+            return ray_error("type", "query: find arguments must be ?variables");
+        }
+        find_var_syms[n_find_vars++] = find_elems[i]->i64;
+    }
+
+    /* Parse where clause */
+    ray_t* where_clause = args[2];
+    if (!is_list(where_clause) || ray_len(where_clause) < 2) {
+        ray_release(db);
+        return ray_error("type", "query: third arg must be (where clause ...)");
+    }
+    ray_t** where_elems = (ray_t**)ray_data(where_clause);
+    int64_t where_len = ray_len(where_clause);
+
+    /* Verify it starts with 'where' */
+    if (where_elems[0]->type != -RAY_SYM) {
+        ray_release(db);
+        return ray_error("type", "query: expected (where ...)");
+    }
+    ray_t* where_name = ray_sym_str(where_elems[0]->i64);
+    if (!where_name || strcmp(ray_str_ptr(where_name), "where") != 0) {
+        ray_release(db);
+        return ray_error("type", "query: expected (where ...) as third argument");
+    }
+
+    /* Optional 4th arg must be (rules ...) — inline rules override globals */
+    ray_t* rules_clause = NULL;
+    if (n == 4) {
+        ray_t* fourth = args[3];
+        if (!is_list(fourth) || ray_len(fourth) < 1) {
+            ray_release(db);
+            return ray_error("type", "query: fourth argument must be (rules ...)");
+        }
+        ray_t** re4 = (ray_t**)ray_data(fourth);
+        if (re4[0]->type != -RAY_SYM) {
+            ray_release(db);
+            return ray_error("type", "query: fourth argument must be (rules ...)");
+        }
+        ray_t* rname = ray_sym_str(re4[0]->i64);
+        if (!rname || strcmp(ray_str_ptr(rname), "rules") != 0) {
+            ray_release(db);
+            return ray_error("type", "query: fourth argument must be (rules ...)");
+        }
+        rules_clause = fourth;
+    }
+
+    /* Build variable map for the query */
+    dl_var_map_t vars;
+    memset(&vars, 0, sizeof(vars));
+
+    /* Pre-populate the variable map with find variables so they get
+     * the lowest indices (0, 1, 2, ...) -- makes projection trivial */
+    for (int i = 0; i < n_find_vars; i++)
+        dl_var_get_or_create(&vars, find_var_syms[i]);
+
+    /* Build synthetic query rule: __query(?find_vars...) :- body_clauses... */
+    dl_rule_t qrule;
+    dl_rule_init(&qrule, "__query", n_find_vars);
+    for (int i = 0; i < n_find_vars; i++)
+        dl_rule_head_var(&qrule, i, i);
+
+    /* Parse body clauses into the query rule */
+    for (int64_t i = 1; i < where_len; i++) {
+        ray_t* err = dl_parse_body_clause(&qrule, where_elems[i], &vars, NULL);
+        if (err) { ray_release(db); return err; }
+    }
+    qrule.n_vars = vars.n;
+
+    /* Create temporary program */
+    dl_program_t* prog = dl_program_new();
+    if (!prog) { ray_release(db); return ray_error("oom", "query: cannot create program"); }
+
+    /* Register the EAV table as a 3-arity "eav" relation.
+     * The 'a' column is RAY_SYM with adaptive width -- the Datalog engine
+     * operates on I64 data only, so convert SYM columns to I64 first. */
+    {
+        int64_t nrows_db = ray_table_nrows(db);
+        ray_t* eav_tbl = ray_table_new(3);
+        for (int c = 0; c < 3; c++) {
+            ray_t* col = ray_table_get_col_idx(db, c);
+            if (!col) continue;
+            if (col->type == RAY_SYM) {
+                /* Convert SYM -> I64: read sym IDs via ray_read_sym */
+                ray_t* i64col = ray_vec_new(RAY_I64, nrows_db);
+                if (i64col && !RAY_IS_ERR(i64col)) {
+                    i64col->len = nrows_db;
+                    int64_t* d = (int64_t*)ray_data(i64col);
+                    for (int64_t r = 0; r < nrows_db; r++)
+                        d[r] = ray_read_sym(ray_data(col), r, col->type, col->attrs);
+                    eav_tbl = ray_table_add_col(eav_tbl, ray_table_col_name(db, c), i64col);
+                    ray_release(i64col);
+                }
+            } else {
+                eav_tbl = ray_table_add_col(eav_tbl, ray_table_col_name(db, c), col);
+            }
+        }
+        dl_add_edb(prog, "eav", eav_tbl, 3);
+        ray_release(eav_tbl);
+    }
+
+    if (rules_clause) {
+        ray_t** re = (ray_t**)ray_data(rules_clause);
+        int64_t rlen = ray_len(rules_clause);
+        for (int64_t i = 1; i < rlen; i++) {
+            dl_rule_t irule;
+            ray_t* rerr = dl_parse_inline_rule(&irule, re[i], prog);
+            if (rerr) {
+                dl_program_free(prog);
+                ray_release(db);
+                return rerr;
+            }
+            if (dl_add_rule(prog, &irule) < 0) {
+                dl_program_free(prog);
+                ray_release(db);
+                return ray_error("domain", "query: too many rules");
+            }
+        }
+    } else {
+        for (int i = 0; i < g_dl_n_rules; i++)
+            dl_add_rule(prog, &g_dl_rules[i]);
+    }
+
+    /* Add the synthetic query rule */
+    dl_add_rule(prog, &qrule);
+
+    /* Auto-register env-bound EDB tables referenced from rule bodies.
+     *
+     * Rationale: the primary `db` argument becomes the `eav` EDB (above).
+     * User rules can also reference additional relations by name
+     * (e.g. `(facts_i64 ?e ?a ?v)`). Rather than force callers to pre-declare
+     * every EDB, scan the program's rule bodies for positive / negative atom
+     * predicates that are not yet known as a relation, look them up in the
+     * global ray env, and register them when they resolve to a RAY_TABLE of
+     * matching arity. SYM columns are converted to I64 (same treatment as
+     * the primary `eav` table).
+     *
+     * Aggregate sources are handled too (`DL_AGG` uses `agg_pred`).
+     * The built-in synthetic "__query" / "eav" names are skipped. */
+    for (int ri = 0; ri < prog->n_rules; ri++) {
+        dl_rule_t* rr = &prog->rules[ri];
+        for (int bi = 0; bi < rr->n_body; bi++) {
+            dl_body_t* bd = &rr->body[bi];
+            const char* pred_name = NULL;
+            int pred_arity = 0;
+
+            if (bd->type == DL_POS || bd->type == DL_NEG) {
+                pred_name = bd->pred;
+                pred_arity = bd->arity;
+            } else if (bd->type == DL_AGG) {
+                pred_name = bd->agg_pred;
+                pred_arity = bd->agg_arity;
+            } else {
+                continue;
+            }
+
+            if (!pred_name || pred_name[0] == '\0') continue;
+            if (strcmp(pred_name, "eav") == 0) continue;
+            if (dl_find_rel(prog, pred_name) >= 0) continue;
+
+            int64_t env_sym = ray_sym_intern(pred_name, strlen(pred_name));
+            ray_t* env_val = ray_env_get(env_sym);
+            if (!env_val || env_val->type != RAY_TABLE) continue;
+            int64_t ncols = ray_table_ncols(env_val);
+            /* pred_arity == 0 is a "not yet known" sentinel used when the
+             * aggregate parser couldn't resolve the source predicate's arity
+             * at parse time (prog=NULL, surface syntax).  Resolve it from the
+             * env-bound table's column count now. */
+            if (pred_arity == 0) pred_arity = (int)ncols;
+            if (ncols != pred_arity) continue;
+
+            int64_t nrows_env = ray_table_nrows(env_val);
+            ray_t* clean = ray_table_new(pred_arity);
+            if (!clean || RAY_IS_ERR(clean)) {
+                if (clean) ray_release(clean);
+                dl_program_free(prog);
+                ray_release(db);
+                return ray_error("memory", "query: failed to create env-backed EDB table");
+            }
+            for (int c = 0; c < pred_arity; c++) {
+                ray_t* col = ray_table_get_col_idx(env_val, c);
+                ray_t* next_clean;
+                if (!col) {
+                    /* Silently skipping would build `clean` with fewer than
+                     * pred_arity columns yet still register it via dl_add_edb
+                     * — the program would see a schema-inconsistent EDB. */
+                    ray_release(clean);
+                    dl_program_free(prog);
+                    ray_release(db);
+                    return ray_error("schema", "query: env-backed EDB table missing expected column");
+                }
+                if (col->type == RAY_SYM) {
+                    ray_t* i64col = ray_vec_new(RAY_I64, nrows_env);
+                    if (!i64col) {
+                        ray_release(clean);
+                        dl_program_free(prog);
+                        ray_release(db);
+                        return ray_error("memory", "query: failed to convert env-backed SYM column");
+                    }
+                    if (RAY_IS_ERR(i64col)) {
+                        ray_error_free(i64col);
+                        ray_release(clean);
+                        dl_program_free(prog);
+                        ray_release(db);
+                        return ray_error("memory", "query: failed to convert env-backed SYM column");
+                    }
+                    i64col->len = nrows_env;
+                    int64_t* d = (int64_t*)ray_data(i64col);
+                    for (int64_t r = 0; r < nrows_env; r++)
+                        d[r] = ray_read_sym(ray_data(col), r, col->type, col->attrs);
+                    next_clean = ray_table_add_col(clean, ray_table_col_name(env_val, c), i64col);
+                    ray_release(i64col);
+                } else {
+                    next_clean = ray_table_add_col(clean, ray_table_col_name(env_val, c), col);
+                }
+                if (!next_clean) {
+                    ray_release(clean);
+                    dl_program_free(prog);
+                    ray_release(db);
+                    return ray_error("memory", "query: failed to build env-backed EDB table");
+                }
+                if (RAY_IS_ERR(next_clean)) {
+                    ray_error_free(next_clean);
+                    ray_release(clean);
+                    dl_program_free(prog);
+                    ray_release(db);
+                    return ray_error("memory", "query: failed to build env-backed EDB table");
+                }
+                clean = next_clean;
+            }
+            if (dl_add_edb(prog, pred_name, clean, pred_arity) < 0) {
+                ray_release(clean);
+                dl_program_free(prog);
+                ray_release(db);
+                return ray_error("domain", "query: failed to register env-backed EDB table");
+            }
+            ray_release(clean);
+        }
+    }
+
+    /* Stratify and evaluate */
+    if (dl_stratify(prog) != 0) {
+        dl_program_free(prog);
+        ray_release(db);
+        return ray_error("domain", "query: unstratifiable negation cycle");
+    }
+
+    if (dl_eval(prog) != 0) {
+        dl_program_free(prog);
+        ray_release(db);
+        return ray_error("domain", "query: evaluation failed");
+    }
+
+    /* Get the result */
+    ray_t* raw = dl_query(prog, "__query");
+    if (!raw || RAY_IS_ERR(raw)) {
+        dl_program_free(prog);
+        ray_release(db);
+        return raw ? raw : ray_error("domain", "query: no result");
+    }
+
+    /* Build result table with user-friendly column names (the ?variable names) */
+    int64_t nrows = ray_table_nrows(raw);
+    int64_t ncols = ray_table_ncols(raw);
+    ray_t* result = ray_table_new(n_find_vars);
+    for (int i = 0; i < n_find_vars && i < (int)ncols; i++) {
+        ray_t* col = ray_table_get_col_idx(raw, i);
+        if (col)
+            result = ray_table_add_col(result, find_var_syms[i], col);
+    }
+
+    /* Handle empty result: ensure schema is correct */
+    if (nrows == 0 && n_find_vars > 0 && ray_table_ncols(result) == 0) {
+        ray_release(result);
+        result = ray_table_new(n_find_vars);
+        for (int i = 0; i < n_find_vars; i++) {
+            ray_t* ev = ray_vec_new(RAY_I64, 0);
+            if (!RAY_IS_ERR(ev)) {
+                result = ray_table_add_col(result, find_var_syms[i], ev);
+                ray_release(ev);
+            }
+        }
+    }
+
+    dl_program_free(prog);
+    ray_release(db);
+    return result;
+}
+
+/* ══════════════════════════════════════════
+ * Programmatic Datalog API builtins
+ * ══════════════════════════════════════════ */
+
+/* Wrap a dl_program_t* in a ray_t atom: the pointer is stored in the i64 field.
+ * On allocation failure prog is freed here, so the caller never leaks it. */
+static ray_t* dl_wrap_program(dl_program_t* prog) {
+    ray_t* obj = ray_alloc(0);
+    if (!obj || RAY_IS_ERR(obj)) { dl_program_free(prog); return ray_error("oom", NULL); }
+    obj->type = -RAY_I64;
+    obj->i64 = (int64_t)(uintptr_t)prog;
+    return obj;
+}
+
+/* Handle atom -> dl_program_t*; NULL unless obj is a -RAY_I64 atom (payload is not validated). */
+static dl_program_t* dl_unwrap_program(ray_t* obj) {
+    return (obj && obj->type == -RAY_I64) ? (dl_program_t*)(uintptr_t)obj->i64 : NULL;
+}
+
+/* (dl-program) — builtin: allocate an empty Datalog program and return its handle atom */
+ray_t* ray_dl_program_fn(ray_t** args, int64_t n) {
+    (void)args;  /* the builtin reads no operands */
+    if (n != 0) return ray_error("arity", "dl-program takes no arguments");
+    dl_program_t* fresh = dl_program_new();
+    return fresh ? dl_wrap_program(fresh)
+                 : ray_error("oom", "dl-program: cannot allocate");
+}
+
+/* (dl-add-edb prog name table arity) — register table as an EDB; name must be a symbol atom */
+ray_t* ray_dl_add_edb_fn(ray_t** args, int64_t n) {
+    if (n != 4) return ray_error("arity", "dl-add-edb expects: prog name table arity");
+    dl_program_t* prog = dl_unwrap_program(args[0]);
+    if (!prog) return ray_error("type", "dl-add-edb: first arg must be a dl-program");
+
+    const char* name = NULL;
+    if (args[1]->type == -RAY_SYM) {
+        ray_t* name_str = ray_sym_str(args[1]->i64);
+        name = name_str ? ray_str_ptr(name_str) : NULL;
+    }
+    if (!name) return ray_error("type", "dl-add-edb: name must be a symbol");
+
+    if (args[2]->type != RAY_TABLE) return ray_error("type", "dl-add-edb: third arg must be a table");
+    if (args[3]->type != -RAY_I64) return ray_error("type", "dl-add-edb: arity must be an integer");
+
+    /* Range-check the i64 before narrowing: otherwise a value such as 2^32 + 2
+     * would silently truncate to a plausible int arity. */
+    int64_t arity = args[3]->i64;
+    if (arity < 0 || arity > DL_MAX_ARITY) return ray_error("domain", "dl-add-edb: arity out of range");
+    int rc = dl_add_edb(prog, name, args[2], (int)arity);
+    return (rc >= 0) ? ray_bool(true) : ray_error("domain", "dl-add-edb: failed");
+}
+
+/* (dl-stratify prog) — compute strata */
+ray_t* ray_dl_stratify_fn(ray_t* x) {
+    dl_program_t* prog = dl_unwrap_program(x);
+    if (!prog) return ray_error("type", "dl-stratify: arg must be a dl-program");
+    int rc = dl_stratify(prog);
+    return (rc == 0) ? ray_bool(true) : ray_error("domain", "dl-stratify: unstratifiable");
+}
+
+/* (dl-eval prog) — evaluate to fixpoint */
+ray_t* ray_dl_eval_fn(ray_t* x) {
+    dl_program_t* prog = dl_unwrap_program(x);
+    if (!prog) return ray_error("type", "dl-eval: arg must be a dl-program");
+    int rc = dl_eval(prog);
+    return (rc == 0) ? ray_bool(true) : ray_error("domain", "dl-eval: evaluation failed");
+}
+
+/* (dl-query prog pred) — return the table backing relation pred, retained for the caller */
+ray_t* ray_dl_query_fn(ray_t* prog_obj, ray_t* pred_obj) {
+    dl_program_t* prog = dl_unwrap_program(prog_obj);
+    if (!prog) return ray_error("type", "dl-query: first arg must be a dl-program");
+
+    /* The predicate is named by a symbol atom; resolve it to its text. */
+    ray_t* sym_text = (pred_obj->type == -RAY_SYM) ? ray_sym_str(pred_obj->i64) : NULL;
+    const char* pred = sym_text ? ray_str_ptr(sym_text) : NULL;
+    if (!pred) return ray_error("type", "dl-query: pred must be a symbol");
+
+    /* dl_query hands back a table the caller does not own, so take our own
+     * reference before returning it. */
+    ray_t* table = dl_query(prog, pred);
+    if (table == NULL) return ray_error("domain", "dl-query: predicate not found");
+    ray_retain(table);
+    return table;
+}
+
+/* (dl-provenance prog pred) — return the provenance column of relation pred, retained */
+ray_t* ray_dl_provenance_fn(ray_t* prog_obj, ray_t* pred_obj) {
+    dl_program_t* prog = dl_unwrap_program(prog_obj);
+    if (!prog) return ray_error("type", "dl-provenance: first arg must be a dl-program");
+
+    /* The predicate is named by a symbol atom; resolve it to its text. */
+    ray_t* sym_text = (pred_obj->type == -RAY_SYM) ? ray_sym_str(pred_obj->i64) : NULL;
+    const char* pred = sym_text ? ray_str_ptr(sym_text) : NULL;
+    if (!pred) return ray_error("type", "dl-provenance: pred must be a symbol");
+
+    /* dl_get_provenance yields NULL when no provenance column is available;
+     * otherwise take a reference on the column before handing it out. */
+    ray_t* column = dl_get_provenance(prog, pred);
+    if (column == NULL) return ray_error("domain", "dl-provenance: not available");
+    ray_retain(column);
+    return column;
+}
+
+/* Reset global Datalog rule storage by zeroing the rule count (called from ray_lang_destroy) */
+void ray_dl_reset_rules(void) {
+    g_dl_n_rules = 0;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/datalog.h b/crates/rayforce-sys/vendor/rayforce/src/ops/datalog.h
new file mode 100644
index 0000000..3141097
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/datalog.h
@@ -0,0 +1,344 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+/*
+ * datalog.h — Datalog evaluation engine for Rayforce
+ *
+ * Compiles Datalog rules into ray_graph_t operation DAGs and evaluates
+ * them to fixpoint using Rayforce's vectorized columnar execution engine.
+ * Supports semi-naive evaluation, stratified negation, and multi-rule heads.
+ */
+#ifndef RAYFORCE_DATALOG_H
+#define RAYFORCE_DATALOG_H
+
+#include "rayforce.h"
+#include "ops/ops.h"
+#include <stdbool.h>
+#include <stdint.h>
+
+/* ===== Body literal types ===== */
+#define DL_POS      0   /* positive atom: pred(X, Y, ...) */
+#define DL_NEG      1   /* negated atom:  not pred(X, Y, ...) */
+#define DL_CMP      2   /* comparison:    X < Y, X = c, etc. */
+#define DL_ASSIGN   3   /* assignment:    X = expr */
+#define DL_BUILTIN  4   /* builtin predicate */
+#define DL_INTERVAL 5   /* interval bind: F @[S, E] */
+#define DL_AGG      6   /* aggregate: (count ?N pred), (sum ?S ?expr pred), ... */
+
+/* ===== Comparison operators (for DL_CMP) ===== */
+#define DL_CMP_EQ   0
+#define DL_CMP_NE   1
+#define DL_CMP_LT   2
+#define DL_CMP_LE   3
+#define DL_CMP_GT   4
+#define DL_CMP_GE   5
+
+/* ===== Aggregate operators (for DL_AGG) ===== */
+#define DL_AGG_COUNT 0
+#define DL_AGG_SUM   1
+#define DL_AGG_MIN   2
+#define DL_AGG_MAX   3
+#define DL_AGG_AVG   4
+
+#define DL_AGG_MAX_KEYS 8
+
+/* ===== Assignment operators (for DL_ASSIGN) ===== */
+#define DL_OP_EQ    0   /* simple assignment: X = expr */
+
+/* ===== Builtin predicate IDs (for DL_BUILTIN) ===== */
+#define DL_BUILTIN_BEFORE          0  /* before(S, E, T): filter T < S */
+#define DL_BUILTIN_DURATION_SINCE  1  /* duration_since(T1, T2, D): D = T2 - T1 */
+#define DL_BUILTIN_ABS             2  /* abs(X, Y): Y = |X| */
+
+/* ===== Expression AST (DL_ASSIGN right-hand sides and DL_CMP operand trees) ===== */
+typedef enum {
+    DL_EXPR_CONST,        /* integer constant (back-compat) */
+    DL_EXPR_CONST_F64,    /* float constant */
+    DL_EXPR_VAR,          /* bound variable reference */
+    DL_EXPR_BINOP,        /* binary op: +, -, *, / */
+} dl_expr_kind_t;
+
+typedef struct dl_expr {
+    dl_expr_kind_t  kind;        /* node kind; the notes below say which kind uses each field */
+    int64_t         const_val;   /* for DL_EXPR_CONST */
+    double          const_f64;   /* for DL_EXPR_CONST_F64 */
+    int             var_idx;     /* for DL_EXPR_VAR */
+    int             binop;       /* for DL_EXPR_BINOP: OP_ADD, OP_SUB, etc. */
+    struct dl_expr *left;        /* for DL_EXPR_BINOP */
+    struct dl_expr *right;       /* for DL_EXPR_BINOP */
+} dl_expr_t;
+
+/* Variable index sentinel: constant value, not a variable */
+#define DL_CONST    (-1)
+
+/* Maximum arity for any relation */
+#define DL_MAX_ARITY 16
+
+/* Maximum number of body literals per rule */
+#define DL_MAX_BODY  16
+
+/* Maximum number of rules in a program */
+#define DL_MAX_RULES 128
+
+/* Maximum number of relations */
+#define DL_MAX_RELS  64
+
+/* Maximum strata */
+#define DL_MAX_STRATA 16
+
+/* Program flags */
+#define DL_FLAG_PROVENANCE  (1 << 0)  /* track which rule derived each tuple */
+
+/* ===== Body literal ===== */
+typedef struct {
+    int     type;                   /* literal kind: DL_POS, DL_NEG, DL_CMP, DL_ASSIGN, DL_BUILTIN, DL_INTERVAL or DL_AGG */
+    char    pred[64];               /* predicate name (for DL_POS/DL_NEG) */
+    int     arity;                  /* number of argument positions */
+    int     vars[DL_MAX_ARITY];    /* variable indices (DL_CONST for constants) */
+    int64_t const_vals[DL_MAX_ARITY]; /* constant values (I64/SYM) */
+    int     cmp_op;                /* comparison operator (for DL_CMP) */
+    int     cmp_lhs;               /* left variable index (for DL_CMP) */
+    int     cmp_rhs;               /* right variable index or DL_CONST */
+    int64_t cmp_const;             /* constant value if cmp_rhs == DL_CONST */
+    int     assign_var;            /* target variable index (for DL_ASSIGN) */
+    dl_expr_t *assign_expr;        /* expression tree (for DL_ASSIGN) */
+    int     builtin_id;            /* builtin ID (for DL_BUILTIN) */
+    dl_expr_t *cmp_lhs_expr;      /* expression tree for LHS (for DL_CMP with expressions) */
+    dl_expr_t *cmp_rhs_expr;      /* expression tree for RHS (for DL_CMP with expressions) */
+    int     interval_fact_var;     /* fact variable index (for DL_INTERVAL) */
+    int     interval_start_var;    /* start variable index (for DL_INTERVAL) */
+    int     interval_end_var;      /* end variable index (for DL_INTERVAL) */
+    int     agg_op;                /* aggregate operator (for DL_AGG) */
+    int     agg_target_var;        /* variable that receives the aggregate result */
+    char    agg_pred[64];          /* predicate name being aggregated over */
+    int     agg_arity;             /* arity of agg_pred (0 = not yet resolved) */
+    int     agg_value_col;         /* column index inside agg_pred to aggregate (sum/min/max/avg) */
+    int     agg_n_group_keys;      /* 0 = scalar; >0 = grouped */
+    int     agg_group_key_vars[DL_AGG_MAX_KEYS]; /* presumably the key_vars given to dl_rule_agg_set_group — TODO confirm */
+    int     agg_group_key_cols[DL_AGG_MAX_KEYS]; /* presumably the key_cols given to dl_rule_agg_set_group — TODO confirm */
+} dl_body_t;
+
+/* ===== Datalog rule: head :- body ===== */
+typedef struct {
+    char    head_pred[64];          /* head predicate name */
+    int     head_arity;             /* number of head argument positions */
+    int     head_vars[DL_MAX_ARITY]; /* variable indices in head */
+    int64_t head_consts[DL_MAX_ARITY]; /* constants (when head_vars[i] == DL_CONST) */
+    int8_t  head_const_types[DL_MAX_ARITY]; /* ray type tag per head slot:
+                                             *   RAY_I64 / RAY_SYM / RAY_F64 when head_vars[i] == DL_CONST,
+                                             *   0 when head_vars[i] is a variable. */
+    int     n_body;                 /* number of body literals */
+    dl_body_t body[DL_MAX_BODY];    /* body literals; n_body gives the count */
+    int     n_vars;                 /* total distinct variable count in rule */
+    int     stratum;                /* assigned stratum (-1 if not yet stratified) */
+} dl_rule_t;
+
+/* ===== Datalog relation ===== */
+typedef struct {
+    char    name[64];               /* relation name */
+    ray_t*  table;                  /* backing columnar table */
+    int     arity;                  /* number of columns */
+    bool    is_idb;                 /* true = derived (intensional); false = EDB (extensional) */
+    int64_t col_names[DL_MAX_ARITY]; /* interned column name symbols */
+    ray_t*  prov_col;               /* provenance column (when DL_FLAG_PROVENANCE) */
+    ray_t*  prov_src_offsets;       /* CSR offsets into prov_src_data, length nrows+1 */
+    ray_t*  prov_src_data;          /* packed source refs: (rel_idx << 32) | row_idx */
+} dl_rel_t;
+
+/* ===== Datalog program ===== */
+typedef struct {
+    dl_rel_t    rels[DL_MAX_RELS];                    /* relations, EDB and IDB alike (see dl_rel_t.is_idb) */
+    int         n_rels;                               /* presumably the number of rels[] slots in use */
+    dl_rule_t   rules[DL_MAX_RULES];                  /* rule storage; dl_add_rule copies rules in */
+    int         n_rules;                              /* presumably the number of rules[] slots in use */
+    int         strata[DL_MAX_STRATA][DL_MAX_RELS]; /* predicate indices per stratum */
+    int         strata_sizes[DL_MAX_STRATA];         /* number of predicates per stratum */
+    int         n_strata;
+    uint32_t    flags;                                /* DL_FLAG_* bitmask */
+    bool        eval_err;                             /* set by compile/eval on
+                                                         unrecoverable failure
+                                                         (distinct from "rule
+                                                         produced no rows"); read
+                                                         by dl_eval to return -1 */
+} dl_program_t;
+
+/* ===== Public API ===== */
+
+/* Create a new empty Datalog program */
+dl_program_t* dl_program_new(void);
+
+/* Free a Datalog program and release all owned tables */
+void dl_program_free(dl_program_t* prog);
+
+/** Append rules registered via the Rayfall (rule ...) special form into a program. */
+void dl_append_global_rules(dl_program_t* prog);
+
+/* Register an EDB (extensional) relation backed by an existing table.
+ * Column names are auto-generated as "c0", "c1", ... unless the table
+ * already has named columns. */
+int dl_add_edb(dl_program_t* prog, const char* name, ray_t* table, int arity);
+
+/* Add a rule to the program. The rule struct is copied. */
+int dl_add_rule(dl_program_t* prog, const dl_rule_t* rule);
+
+/* Compute stratification (topological sort of negation dependency graph).
+ * Returns 0 on success, -1 if program has unstratifiable negation cycle. */
+int dl_stratify(dl_program_t* prog);
+
+/* Evaluate the program to fixpoint using semi-naive evaluation.
+ * Returns 0 on success, -1 on error. */
+int dl_eval(dl_program_t* prog);
+
+/* Query the result of a derived relation after evaluation.
+ * Returns the backing table (caller does NOT own it). */
+ray_t* dl_query(dl_program_t* prog, const char* pred_name);
+
+/* Retrieve the provenance column from a derived relation.
+ * Only valid when DL_FLAG_PROVENANCE is set. Returns the I64 column
+ * of rule indices, or NULL if provenance not enabled/available. */
+ray_t* dl_get_provenance(dl_program_t* prog, const char* pred_name);
+
+/* Retrieve deep provenance source offsets for a derived relation.
+ * Returns an I64 vector of length nrows+1 in CSR format: offsets[i] is the
+ * start index in the source-data vector for derived row i.
+ * Only valid when DL_FLAG_PROVENANCE is set. Returns NULL if unavailable. */
+ray_t* dl_get_provenance_src_offsets(dl_program_t* prog, const char* pred_name);
+
+/* Retrieve deep provenance source data for a derived relation.
+ * Returns a flat I64 vector of packed source references. Each entry encodes
+ * (relation_index << 32) | row_index, identifying which EDB or IDB relation
+ * and row contributed to deriving a given output tuple. Row indices are
+ * truncated to 32 bits (max ~4 billion rows per relation).
+ *
+ * For rules with body-only variables (variables appearing in body atoms but
+ * not in the head), source entries include all body rows consistent with
+ * head-visible bindings. Cross-body join constraints are not re-enforced
+ * during source lookup, so entries may be a superset of the true derivation.
+ *
+ * Only valid when DL_FLAG_PROVENANCE is set. Returns NULL if unavailable. */
+ray_t* dl_get_provenance_src_data(dl_program_t* prog, const char* pred_name);
+
+/* ===== Rule builder helpers ===== */
+
+/* Initialize a rule with the given head predicate and arity */
+void dl_rule_init(dl_rule_t* rule, const char* head_pred, int head_arity);
+
+/* Set a head argument to a variable */
+void dl_rule_head_var(dl_rule_t* rule, int pos, int var_idx);
+
+/* Set a head argument to an I64 constant — backward-compatible
+ * signature. Equivalent to dl_rule_head_const_typed(rule, pos, val,
+ * RAY_I64).  Prefer the typed variant for new code. */
+void dl_rule_head_const(dl_rule_t* rule, int pos, int64_t val);
+
+/* Set a head argument to a typed constant.
+ *   type must be RAY_I64, RAY_SYM, or RAY_F64.
+ *   For RAY_F64 callers should pass a double reinterpreted via memcpy/union
+ *   into val's int64 slot; dl_rule_head_const_f64 is the safe wrapper. */
+void dl_rule_head_const_typed(dl_rule_t* rule, int pos, int64_t val, int8_t type);
+
+/* Convenience wrapper: set a head argument to a RAY_F64 constant. */
+void dl_rule_head_const_f64(dl_rule_t* rule, int pos, double val);
+
+/* Add a positive body atom. Returns body literal index. */
+int dl_rule_add_atom(dl_rule_t* rule, const char* pred, int arity);
+
+/* Set a body atom argument to a variable */
+void dl_body_set_var(dl_rule_t* rule, int body_idx, int pos, int var_idx);
+
+/* Set a body atom argument to a constant */
+void dl_body_set_const(dl_rule_t* rule, int body_idx, int pos, int64_t val);
+
+/* Add a negated body atom. Returns body literal index. */
+int dl_rule_add_neg(dl_rule_t* rule, const char* pred, int arity);
+
+/* Add a comparison. Returns body literal index. */
+int dl_rule_add_cmp(dl_rule_t* rule, int cmp_op, int lhs_var, int rhs_var);
+
+/* Add a comparison with a constant RHS. Returns body literal index. */
+int dl_rule_add_cmp_const(dl_rule_t* rule, int cmp_op, int lhs_var, int64_t rhs_val);
+
+/* Add an assignment: target_var = expr. Returns body literal index. */
+int dl_rule_add_assign(dl_rule_t* rule, int target_var, int op, dl_expr_t* expr);
+
+/* Add a builtin predicate. Returns body literal index.
+ * Arguments are set via dl_body_set_var (same as atoms). */
+int dl_rule_add_builtin(dl_rule_t* rule, int builtin_id, int arity);
+
+/* Add a comparison with expression trees on both sides.
+ * E.g., "X + Y < Z * 2" -> cmp_op=DL_CMP_LT, lhs=binop(+,X,Y), rhs=binop(*,Z,2).
+ * Returns body literal index. */
+int dl_rule_add_cmp_expr(dl_rule_t* rule, int cmp_op, dl_expr_t* lhs, dl_expr_t* rhs);
+
+/* Add an interval bind: decompose two consecutive columns at the fact variable's
+ * position into start_var and end_var. Returns body literal index. */
+int dl_rule_add_interval(dl_rule_t* rule, int fact_var, int start_var, int end_var);
+
+/* Add an aggregate body literal: (op ?target pred col)
+ *  - op: DL_AGG_COUNT (col is ignored), DL_AGG_SUM/MIN/MAX/AVG
+ *  - target_var: variable that receives the aggregate result
+ *  - pred: predicate to aggregate over
+ *  - pred_arity: arity of that predicate (advisory; the evaluator re-resolves
+ *    it against the program's EDB/IDB at compile time)
+ *  - value_col: which column to aggregate (ignored for COUNT)
+ * Returns body literal index. */
+int dl_rule_add_agg(dl_rule_t* rule, int op, int target_var,
+                    const char* pred, int pred_arity, int value_col);
+
+/* Attach group-by keys to an aggregate body literal previously added via
+ * dl_rule_add_agg. body_idx is that builder's return value.
+ * key_vars and key_cols have n_keys entries (<= DL_AGG_MAX_KEYS).
+ * Returns 0 on success, -1 if n_keys is out of range. */
+int dl_rule_agg_set_group(dl_rule_t* rule, int body_idx,
+                          const int* key_vars, const int* key_cols, int n_keys);
+
+/* ===== Expression tree builders ===== */
+
+/* Create a constant expression */
+dl_expr_t* dl_expr_const(int64_t val);
+
+/* Create a float constant expression */
+dl_expr_t* dl_expr_const_f64(double val);
+
+/* Create a variable reference expression */
+dl_expr_t* dl_expr_var(int var_idx);
+
+/* Create a binary operation expression (OP_ADD, OP_SUB, OP_MUL, OP_DIV) */
+dl_expr_t* dl_expr_binop(int op, dl_expr_t* left, dl_expr_t* right);
+
+/* ===== Internal (used by compiler) ===== */
+
+/* Find relation by name. Returns index or -1. */
+int dl_find_rel(dl_program_t* prog, const char* name);
+
+/* Ensure an IDB relation exists for the given head predicate.
+ * Creates it with the correct arity if it doesn't exist yet. */
+int dl_ensure_idb(dl_program_t* prog, const char* name, int arity);
+
+/* Compile one rule into a ray_graph_t for one fixpoint iteration.
+ * delta_pos: which body atom uses the delta relation (-1 for initial pass).
+ * rule_idx: index of this rule in prog->rules (used for provenance).
+ * Returns the output node in g that produces new head tuples. */
+ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule,
+                          int delta_pos, int rule_idx, ray_graph_t* g);
+
+#endif /* RAYFORCE_DATALOG_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/dump.c b/crates/rayforce-sys/vendor/rayforce/src/ops/dump.c
new file mode 100644
index 0000000..3f849e8
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/dump.c
@@ -0,0 +1,254 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "ops.h"
+#include <stdio.h>
+
+/* File-local copy of opt.c's find_ext(): first ext node whose base.id == node_id, else NULL. */
+static ray_op_ext_t* find_ext(ray_graph_t* g, uint32_t node_id) {
+    for (uint32_t slot = 0; slot < g->ext_count; slot++) {
+        ray_op_ext_t* cand = g->ext_nodes[slot];
+        if (cand != NULL && cand->base.id == node_id) return cand;
+    }
+    return NULL;
+}
+
+const char* ray_opcode_name(uint16_t op) {   /* opcode -> static mnemonic string used in plan dumps */
+    switch (op) {
+        case OP_SCAN:          return "SCAN";
+        case OP_CONST:         return "CONST";
+        case OP_NEG:           return "NEG";
+        case OP_ABS:           return "ABS";
+        case OP_NOT:           return "NOT";
+        case OP_SQRT:          return "SQRT";
+        case OP_LOG:           return "LOG";
+        case OP_EXP:           return "EXP";
+        case OP_CEIL:          return "CEIL";
+        case OP_FLOOR:         return "FLOOR";
+        case OP_ISNULL:        return "ISNULL";
+        case OP_CAST:          return "CAST";
+        case OP_ADD:           return "ADD";
+        case OP_SUB:           return "SUB";
+        case OP_MUL:           return "MUL";
+        case OP_DIV:           return "DIV";
+        case OP_MOD:           return "MOD";
+        case OP_EQ:            return "EQ";
+        case OP_NE:            return "NE";
+        case OP_LT:            return "LT";
+        case OP_LE:            return "LE";
+        case OP_GT:            return "GT";
+        case OP_GE:            return "GE";
+        case OP_AND:           return "AND";
+        case OP_OR:            return "OR";
+        case OP_MIN2:          return "MIN2";
+        case OP_MAX2:          return "MAX2";
+        case OP_IF:            return "IF";
+        case OP_LIKE:          return "LIKE";
+        case OP_ILIKE:         return "ILIKE";
+        case OP_UPPER:         return "UPPER";
+        case OP_LOWER:         return "LOWER";
+        case OP_STRLEN:        return "STRLEN";
+        case OP_SUBSTR:        return "SUBSTR";
+        case OP_REPLACE:       return "REPLACE";
+        case OP_TRIM:          return "TRIM";
+        case OP_CONCAT:        return "CONCAT";
+        case OP_EXTRACT:       return "EXTRACT";
+        case OP_DATE_TRUNC:    return "DATE_TRUNC";
+        case OP_SUM:           return "SUM";
+        case OP_PROD:          return "PROD";
+        case OP_MIN:           return "MIN";
+        case OP_MAX:           return "MAX";
+        case OP_COUNT:         return "COUNT";
+        case OP_AVG:           return "AVG";
+        case OP_FIRST:         return "FIRST";
+        case OP_LAST:          return "LAST";
+        case OP_COUNT_DISTINCT:return "COUNT_DISTINCT";
+        case OP_STDDEV:        return "STDDEV";
+        case OP_STDDEV_POP:    return "STDDEV_POP";
+        case OP_VAR:           return "VAR";
+        case OP_VAR_POP:       return "VAR_POP";
+        case OP_FILTER:        return "FILTER";
+        case OP_SORT:          return "SORT";
+        case OP_GROUP:         return "GROUP";
+        case OP_PIVOT:         return "PIVOT";
+        case OP_ANTIJOIN:      return "ANTIJOIN";
+        case OP_JOIN:          return "JOIN";
+        case OP_WINDOW_JOIN:   return "WINDOW_JOIN";
+        case OP_SELECT:        return "SELECT";
+        case OP_HEAD:          return "HEAD";
+        case OP_TAIL:          return "TAIL";
+        case OP_WINDOW:        return "WINDOW";
+        case OP_ALIAS:         return "ALIAS";
+        case OP_MATERIALIZE:   return "MATERIALIZE";
+        case OP_EXPAND:        return "EXPAND";
+        case OP_VAR_EXPAND:    return "VAR_EXPAND";
+        case OP_SHORTEST_PATH: return "SHORTEST_PATH";
+        case OP_WCO_JOIN:      return "WCO_JOIN";
+        case OP_PAGERANK:      return "PAGERANK";
+        case OP_CONNECTED_COMP: return "CONNECTED_COMP";
+        case OP_DIJKSTRA:      return "DIJKSTRA";
+        case OP_LOUVAIN:       return "LOUVAIN";
+        case OP_DEGREE_CENT:   return "DEGREE_CENT";
+        case OP_TOPSORT:       return "TOPSORT";
+        case OP_DFS:           return "DFS";
+        case OP_ASTAR:         return "ASTAR";
+        case OP_K_SHORTEST:    return "K_SHORTEST";
+        case OP_CLUSTER_COEFF: return "CLUSTER_COEFF";
+        case OP_RANDOM_WALK:   return "RANDOM_WALK";
+        case OP_COSINE_SIM:    return "COSINE_SIM";
+        case OP_EUCLIDEAN_DIST:return "EUCLIDEAN_DIST";
+        case OP_KNN:           return "KNN";
+        case OP_HNSW_KNN:     return "HNSW_KNN";
+        case OP_ANN_RERANK:    return "ANN_RERANK";
+        case OP_KNN_RERANK:    return "KNN_RERANK";
+        default:               return "UNKNOWN";   /* any opcode without a case above */
+    }
+}
+
+/* Map a ray type tag (as found in ray_op_t.out_type) to a short printable
+ * name; any tag without an entry below is rendered as "?". */
+static const char* type_name(int8_t t) {
+    if (t == RAY_LIST)      return "LIST";
+    if (t == RAY_BOOL)      return "BOOL";
+    if (t == RAY_U8)        return "U8";
+    if (t == RAY_I16)       return "I16";
+    if (t == RAY_I32)       return "I32";
+    if (t == RAY_I64)       return "I64";
+    if (t == RAY_F64)       return "F64";
+    if (t == RAY_DATE)      return "DATE";
+    if (t == RAY_TIME)      return "TIME";
+    if (t == RAY_TIMESTAMP) return "TIMESTAMP";
+    if (t == RAY_TABLE)     return "TABLE";
+    if (t == RAY_SEL)       return "SEL";
+    if (t == RAY_SYM)       return "SYM";
+    return "?";
+}
+
+static void dump_node(FILE* f, ray_graph_t* g, ray_op_t* node, int depth);
+
+/* Append the opcode-specific "(...)" annotation for node, if it has one.
+ * Every annotation is read from the extended node, so nothing is printed
+ * when ext is NULL. */
+static void dump_annotation(FILE* f, ray_op_t* node, ray_op_ext_t* ext) {
+    if (ext == NULL) return;
+
+    switch (node->opcode) {
+        case OP_SCAN: {
+            /* ext->sym resolved to its symbol text */
+            ray_t* name = ray_sym_str(ext->sym);
+            if (name)
+                fprintf(f, "(%.*s)", (int)name->len, (char*)ray_data(name));
+            break;
+        }
+        case OP_CONST: {
+            ray_t* lit = ext->literal;
+            if (lit == NULL) break;
+            /* NOTE(review): these are the positive RAY_* tags, while atoms elsewhere in
+             * this patch carry negated tags (e.g. -RAY_I64); confirm which form literals use. */
+            if (lit->type == RAY_I64)        fprintf(f, "(%lld)", (long long)lit->i64);
+            else if (lit->type == RAY_F64)   fprintf(f, "(%.6g)", lit->f64);
+            else if (lit->type == RAY_BOOL)  fprintf(f, "(%s)", lit->i64 ? "true" : "false");
+            else if (lit->type == RAY_TABLE) fprintf(f, "(table)");
+            else                             fprintf(f, "(?)");
+            break;
+        }
+        case OP_JOIN: {
+            /* join_type 1 prints as LEFT, 2 as FULL, anything else as INNER */
+            const char* jt = (ext->join.join_type == 1) ? "LEFT"
+                           : (ext->join.join_type == 2) ? "FULL"
+                           : "INNER";
+            fprintf(f, "(%s, keys=%u)", jt, ext->join.n_join_keys);
+            break;
+        }
+        case OP_GROUP:
+            fprintf(f, "(keys=%u, aggs=%u)", ext->n_keys, ext->n_aggs);
+            break;
+        case OP_HEAD:
+        case OP_TAIL:
+            /* the printed N is read from ext->sym for these two opcodes */
+            fprintf(f, "(N=%lld)", (long long)ext->sym);
+            break;
+        default:
+            break;
+    }
+}
+
+/* Recurse into the children that are reached through the extended node rather
+ * than node->inputs: group keys and aggregate inputs for OP_GROUP, the
+ * ext->sort column list for OP_SORT and OP_SELECT.  Other opcodes print none here. */
+static void dump_ext_children(FILE* f, ray_graph_t* g, ray_op_t* node,
+                              ray_op_ext_t* ext, int depth) {
+    if (ext == NULL) return;
+
+    if (node->opcode == OP_GROUP) {
+        /* keys first, then aggregate inputs */
+        for (uint8_t k = 0; k < ext->n_keys; k++)
+            dump_node(f, g, ext->keys[k], depth + 1);
+        for (uint8_t a = 0; a < ext->n_aggs; a++)
+            dump_node(f, g, ext->agg_ins[a], depth + 1);
+    } else if (node->opcode == OP_SORT || node->opcode == OP_SELECT) {
+        /* both opcodes read their column list through ext->sort */
+        for (uint8_t c = 0; c < ext->sort.n_cols; c++)
+            dump_node(f, g, ext->sort.columns[c], depth + 1);
+    }
+}
+
+/* Print one plan node on its own line, then recurse into its children,
+ * indenting two spaces per level.  Line layout:
+ *   <opcode>[annotation] -> <type>[ [fused]][ ~<n> rows] #<id> */
+static void dump_node(FILE* f, ray_graph_t* g, ray_op_t* node, int depth) {
+    if (node == NULL) return;
+
+    /* Indent, then the opcode mnemonic. */
+    for (int lvl = 0; lvl < depth; lvl++)
+        fputs("  ", f);
+    fputs(ray_opcode_name(node->opcode), f);
+
+    /* The extended node (if one exists) feeds both the annotation and the
+     * extra children printed further down. */
+    ray_op_ext_t* ext = find_ext(g, node->id);
+    dump_annotation(f, node, ext);
+
+    /* Common suffix: output type, fusion marker, row estimate, node id. */
+    fprintf(f, " -> %s", type_name(node->out_type));
+    if (node->flags & OP_FLAG_FUSED)
+        fputs(" [fused]", f);
+    if (node->est_rows > 0)
+        fprintf(f, " ~%u rows", node->est_rows);
+    fprintf(f, " #%u", node->id);
+    fputs("\n", f);
+
+    /* Children reached through the extended node are printed first ... */
+    dump_ext_children(f, g, node, ext, depth);
+
+    /* ... then, for every opcode, at most the first two regular inputs. */
+    for (uint8_t in = 0; in < node->arity && in < 2; in++)
+        dump_node(f, g, node->inputs[in], depth + 1);
+}
+
+void ray_graph_dump(ray_graph_t* g, ray_op_t* root, void* out) {
+    FILE* sink = (out != NULL) ? (FILE*)out : stderr;  /* NULL out falls back to stderr */
+    fputs("=== Query Plan ===\n", sink);
+    dump_node(sink, g, root, 0);
+    fputs("==================\n", sink);
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/embedding.c b/crates/rayforce-sys/vendor/rayforce/src/ops/embedding.c
new file mode 100644
index 0000000..4a4fc05
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/embedding.c
@@ -0,0 +1,870 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "ops/internal.h"
+#include "lang/internal.h"
+#include "mem/sys.h"
+
+/* --------------------------------------------------------------------------
+ * exec_cosine_sim: per-row cosine similarity against the op's query vector.
+ *   sim(row) = dot(row, q) / (||row|| * ||q||), or 0.0 when either norm is 0.
+ * Input:  emb_vec — RAY_F32 vector of N rows, each ext->vector.dim floats,
+ *         stored back to back.
+ * Output: RAY_F64 vector of N similarities.
+ * Errors: "nyi" (no ext for the op), "schema" (no query or dim <= 0),
+ *         "type" (input not RAY_F32), "length" (len not a multiple of dim),
+ *         "oom" (result allocation failed).
+ * -------------------------------------------------------------------------- */
+ray_t* exec_cosine_sim(ray_graph_t* g, ray_op_t* op, ray_t* emb_vec) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    int32_t dim = ext->vector.dim;
+    const float* query = ext->vector.query_vec;
+    if (dim <= 0 || !query) return ray_error("schema", NULL);
+    if (emb_vec->type != RAY_F32) return ray_error("type", NULL);
+
+    /* The flat buffer must split evenly into rows of `dim` floats. */
+    int64_t n_floats = emb_vec->len;
+    int64_t nrows = n_floats / dim;
+    if (nrows * dim != n_floats) return ray_error("length", NULL);
+
+    /* ||q|| does not depend on the row, so compute it once. */
+    double qq = 0.0;
+    for (int32_t j = 0; j < dim; j++) qq += (double)query[j] * (double)query[j];
+    const double q_len = sqrt(qq);
+
+    ray_t* result = ray_vec_new(RAY_F64, nrows);
+    if (!result || RAY_IS_ERR(result)) return ray_error("oom", NULL);
+    result->len = nrows;
+
+    /* Walk the buffer one row at a time. */
+    double* sims = (double*)ray_data(result);
+    const float* row = (const float*)ray_data(emb_vec);
+    for (int64_t i = 0; i < nrows; i++, row += dim) {
+        double dot = 0.0, rr = 0.0;
+        for (int32_t j = 0; j < dim; j++) {
+            double x = (double)row[j];
+            dot += x * (double)query[j];
+            rr  += x * x;
+        }
+        double denom = q_len * sqrt(rr);
+        if (denom > 0.0) sims[i] = dot / denom;
+        else             sims[i] = 0.0;
+    }
+
+    return result;
+}
+
+/* --------------------------------------------------------------------------
+ * exec_euclidean_dist: per-row L2 distance to the op's query vector,
+ *   dist(row) = sqrt(sum_j (row[j] - q[j])^2).
+ * Input:  emb_vec — RAY_F32 vector of N rows x ext->vector.dim floats.
+ * Output: RAY_F64 vector of N distances.
+ * Errors: "nyi" (no ext), "schema" (no query or dim <= 0), "type" (not
+ *         RAY_F32), "length" (len not a multiple of dim), "oom".
+ * -------------------------------------------------------------------------- */
+ray_t* exec_euclidean_dist(ray_graph_t* g, ray_op_t* op, ray_t* emb_vec) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    int32_t dim = ext->vector.dim;
+    const float* query = ext->vector.query_vec;
+    if (dim <= 0 || !query) return ray_error("schema", NULL);
+    if (emb_vec->type != RAY_F32) return ray_error("type", NULL);
+
+    int64_t n_floats = emb_vec->len;
+    int64_t nrows = n_floats / dim;
+    if (nrows * dim != n_floats) return ray_error("length", NULL);
+
+    ray_t* result = ray_vec_new(RAY_F64, nrows);
+    if (!result || RAY_IS_ERR(result)) return ray_error("oom", NULL);
+    result->len = nrows;
+
+    double* dists = (double*)ray_data(result);
+    const float* row = (const float*)ray_data(emb_vec);
+    for (int64_t i = 0; i < nrows; i++, row += dim) {
+        double acc = 0.0;
+        for (int32_t j = 0; j < dim; j++) {
+            double diff = (double)row[j] - (double)query[j];
+            acc += diff * diff;
+        }
+        dists[i] = sqrt(acc);
+    }
+    return result;
+}
+
+/* --------------------------------------------------------------------------
+ * exec_knn: brute-force top-K nearest neighbors over a flat RAY_F32 column.
+ *
+ * Dispatches on ext->vector.metric (default COSINE — 0-initialized struct).
+ * Returns RAY_TABLE with _rowid (I64) and _dist (F64), sorted ascending so
+ * lower = closer across all metrics.
+ *
+ * Distance encoding:
+ *   COSINE → 1 - cosine_similarity
+ *   L2     → sqrt(Σ (a - b)^2)
+ *   IP     → -dot(a, b)
+ * -------------------------------------------------------------------------- */
+
+/* One kept candidate: distance to the query plus the row it came from. */
+typedef struct { double dist; int64_t rowid; } knn_entry_t;
+
+/* Offer (dist, rowid) to a max-heap keeping the k smallest distances seen
+ * (heap[0] = largest kept).  Appends and sifts up below capacity; otherwise
+ * replaces the root only if strictly smaller, then sifts down. */
+static void knn_heap_insert(knn_entry_t* heap, int64_t k, int64_t* size,
+                             double dist, int64_t rowid) {
+    const knn_entry_t cand = { dist, rowid };
+    const int64_t n = *size;
+    if (n < k) {
+        int64_t pos = n, up;
+        *size = n + 1;
+        for (; pos > 0; pos = up) {
+            up = (pos - 1) / 2;
+            if (heap[up].dist >= dist) break;
+            heap[pos] = heap[up];
+        }
+        heap[pos] = cand;
+    } else if (dist < heap[0].dist) {
+        int64_t pos = 0;
+        for (;;) {
+            int64_t l = 2 * pos + 1, r = l + 1, kid = pos;
+            double worst = dist;
+            if (l < n && heap[l].dist > worst) { kid = l; worst = heap[l].dist; }
+            if (r < n && heap[r].dist > worst) kid = r;
+            if (kid == pos) break;
+            heap[pos] = heap[kid];
+            pos = kid;
+        }
+        heap[pos] = cand;
+    }
+}
+
+/* Row-to-query distance under `metric`, oriented so smaller = closer:
+ * L2 -> euclidean, IP -> -dot, COSINE -> 1 - cosine (1.0 if a norm is zero). */
+static double knn_row_dist(int32_t metric, const float* row, const float* query,
+                             double q_norm, int32_t dim) {
+    double acc = 0.0, rr = 0.0;
+    if (metric == RAY_HNSW_L2) {
+        for (int32_t j = 0; j < dim; j++) {
+            double diff = (double)row[j] - (double)query[j];
+            acc += diff * diff;
+        }
+        return sqrt(acc);
+    }
+    for (int32_t j = 0; j < dim; j++) {
+        double x = (double)row[j];
+        acc += x * (double)query[j];
+        if (metric == RAY_HNSW_COSINE) rr += x * x;
+    }
+    if (metric == RAY_HNSW_IP) return -acc;
+    double denom = q_norm * sqrt(rr);
+    return (denom > 0.0) ? 1.0 - (acc / denom) : 1.0;
+}
+
+/* exec_knn: brute-force top-K over a flat RAY_F32 column (N rows x dim floats).
+ * query, dim, k and metric come from the op's ext; an out-of-range metric falls
+ * back to cosine and k is clamped to the row count.  Returns {_rowid: I64,
+ * _dist: F64} sorted by ascending distance (an empty table for an empty
+ * column), or an error: "nyi", "schema", "type", "length", "oom". */
+ray_t* exec_knn(ray_graph_t* g, ray_op_t* op, ray_t* emb_vec) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    const float* query = ext->vector.query_vec;
+    int32_t dim = ext->vector.dim;
+    int64_t k = ext->vector.k;
+    int32_t metric = ext->vector.metric;
+    if (metric < RAY_HNSW_COSINE || metric > RAY_HNSW_IP) metric = RAY_HNSW_COSINE;
+
+    if (!query || dim <= 0 || k <= 0) return ray_error("schema", NULL);
+    if (emb_vec->type != RAY_F32) return ray_error("type", NULL);
+
+    int64_t total = emb_vec->len;
+    int64_t nrows = total / dim;
+    if (nrows * dim != total) return ray_error("length", NULL);
+    if (k > nrows) k = nrows;
+    const float* data = (const float*)ray_data(emb_vec);
+
+    /* Precompute query norm once (only used by cosine). */
+    double q_norm = 0.0;
+    if (metric == RAY_HNSW_COSINE) {
+        double q_norm_sq = 0.0;
+        for (int32_t j = 0; j < dim; j++) q_norm_sq += (double)query[j] * (double)query[j];
+        q_norm = sqrt(q_norm_sq);
+    }
+
+    /* k == 0 only for an empty column: skip the heap so a zero-byte request is never read as OOM. */
+    ray_t* heap_hdr = NULL;
+    knn_entry_t* heap = NULL;
+    if (k > 0) {
+        heap = (knn_entry_t*)scratch_alloc(&heap_hdr, (size_t)k * sizeof(knn_entry_t));
+        if (!heap) return ray_error("oom", NULL);
+    }
+    int64_t heap_size = 0;
+
+    for (int64_t i = 0; i < nrows; i++) {
+        double d = knn_row_dist(metric, data + i * dim, query, q_norm, dim);
+        knn_heap_insert(heap, k, &heap_size, d, i);
+    }
+
+    /* Insertion sort ascending by distance (k is small). */
+    for (int64_t i = 1; i < heap_size; i++) {
+        knn_entry_t key = heap[i];
+        int64_t j = i - 1;
+        while (j >= 0 && heap[j].dist > key.dist) {
+            heap[j + 1] = heap[j];
+            j--;
+        }
+        heap[j + 1] = key;
+    }
+
+    ray_t* rowid_vec = ray_vec_new(RAY_I64, heap_size);
+    ray_t* dist_vec  = ray_vec_new(RAY_F64, heap_size);
+    if (!rowid_vec || RAY_IS_ERR(rowid_vec) || !dist_vec || RAY_IS_ERR(dist_vec)) {
+        if (heap) scratch_free(heap_hdr);
+        if (rowid_vec && !RAY_IS_ERR(rowid_vec)) ray_release(rowid_vec);
+        if (dist_vec && !RAY_IS_ERR(dist_vec))   ray_release(dist_vec);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t* rdata = (int64_t*)ray_data(rowid_vec);
+    double*  ddata = (double*)ray_data(dist_vec);
+    for (int64_t i = 0; i < heap_size; i++) { rdata[i] = heap[i].rowid; ddata[i] = heap[i].dist; }
+    rowid_vec->len = heap_size;
+    dist_vec->len  = heap_size;
+    if (heap) scratch_free(heap_hdr);
+
+    ray_t* result = ray_table_new(2);
+    if (!result || RAY_IS_ERR(result)) { ray_release(rowid_vec); ray_release(dist_vec); return ray_error("oom", NULL); }
+    result = ray_table_add_col(result, sym_intern_safe("_rowid", 6), rowid_vec);
+    ray_release(rowid_vec);
+    result = ray_table_add_col(result, sym_intern_safe("_dist", 5), dist_vec);
+    ray_release(dist_vec);
+    return result;
+}
+
+/* exec_hnsw_knn: approximate top-K lookup through a prebuilt HNSW index.
+ * The index pointer, query vector, dim, k and ef_search are all read from the
+ * op's ext.  Result is a table {_rowid: I64, _dist: F64} holding whatever
+ * ray_hnsw_search produced, in the order it produced it.
+ * Errors: "nyi" (no ext), "schema" (missing index/query, dim <= 0, k <= 0),
+ *         "oom" (allocation failure or a negative search result). */
+ray_t* exec_hnsw_knn(ray_graph_t* g, ray_op_t* op) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    ray_hnsw_t* index = (ray_hnsw_t*)ext->hnsw.hnsw_idx;
+    const float* qvec = ext->hnsw.query_vec;
+    int32_t dim       = ext->hnsw.dim;
+    int64_t k         = ext->hnsw.k;
+    int32_t ef_search = ext->hnsw.ef_search;
+    if (!index || !qvec || dim <= 0 || k <= 0) return ray_error("schema", NULL);
+
+    /* Scratch buffers the search writes into: k ids and k distances. */
+    ray_t* ids_hdr = NULL;
+    ray_t* dists_hdr = NULL;
+    int64_t* ids = (int64_t*)scratch_alloc(&ids_hdr, (size_t)k * sizeof(int64_t));
+    if (!ids) return ray_error("oom", NULL);
+    double* dists = (double*)scratch_alloc(&dists_hdr, (size_t)k * sizeof(double));
+    if (!dists) { scratch_free(ids_hdr); return ray_error("oom", NULL); }
+
+    int64_t hits = ray_hnsw_search(index, qvec, dim, k, ef_search, ids, dists);
+    if (hits < 0) {
+        scratch_free(ids_hdr);
+        scratch_free(dists_hdr);
+        return ray_error("oom", NULL);
+    }
+
+    /* Distances are forwarded exactly as the search reported them. */
+    ray_t* rowid_vec = ray_vec_new(RAY_I64, hits);
+    ray_t* dist_vec  = ray_vec_new(RAY_F64, hits);
+    bool rowid_ok = rowid_vec && !RAY_IS_ERR(rowid_vec);
+    bool dist_ok  = dist_vec  && !RAY_IS_ERR(dist_vec);
+    if (!rowid_ok || !dist_ok) {
+        scratch_free(ids_hdr);
+        scratch_free(dists_hdr);
+        if (rowid_ok) ray_release(rowid_vec);
+        if (dist_ok)  ray_release(dist_vec);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t* rowids_out = (int64_t*)ray_data(rowid_vec);
+    double*  dists_out  = (double*)ray_data(dist_vec);
+    for (int64_t i = 0; i < hits; i++) { rowids_out[i] = ids[i]; dists_out[i] = dists[i]; }
+    rowid_vec->len = hits;
+    dist_vec->len  = hits;
+    scratch_free(ids_hdr);
+    scratch_free(dists_hdr);
+
+    ray_t* table = ray_table_new(2);
+    if (!table || RAY_IS_ERR(table)) {
+        ray_release(rowid_vec);
+        ray_release(dist_vec);
+        return ray_error("oom", NULL);
+    }
+    table = ray_table_add_col(table, sym_intern_safe("_rowid", 6), rowid_vec);
+    ray_release(rowid_vec);
+    table = ray_table_add_col(table, sym_intern_safe("_dist", 5), dist_vec);
+    ray_release(dist_vec);
+
+    return table;
+}
+
+/* ==========================================================================
+ *  Rayfall builtins — direct metrics, exact KNN, HNSW lifecycle/query
+ *
+ *  Column shape for all builtins accepting a "column" argument:
+ *    RAY_LIST whose entries are numeric vectors (RAY_F32 preferred,
+ *    RAY_F64/RAY_I32/RAY_I64 coerced to double).  All entries must have
+ *    the same length == D.
+ *
+ *  Output of knn / ann:  table {_rowid: I64, _dist: F64} sorted ascending.
+ * ========================================================================== */
+
+static bool rayvec_is_numeric(ray_t* v) {   /* non-NULL F32/F64/I32/I64 vector? */
+    if (!v || !ray_is_vec(v)) return false;
+    switch (v->type) { case RAY_F32: case RAY_F64: case RAY_I32: case RAY_I64: return true; }
+    return false;
+}
+
+/* Element i of vector v widened to double; 0.0 for any type other than
+ * F32 / F64 / I32 / I64.  No bounds check: the caller keeps i within v. */
+static double rayvec_at_f64(ray_t* v, int64_t i) {
+    const void* base = ray_data(v);
+    if (v->type == RAY_F64) return ((const double*)base)[i];
+    if (v->type == RAY_F32) return (double)((const float*)base)[i];
+    if (v->type == RAY_I64) return (double)((const int64_t*)base)[i];
+    if (v->type == RAY_I32) return (double)((const int32_t*)base)[i];
+    return 0.0;
+}
+
+/* Store numeric vector v in dst as `dim` floats.  Assumes v->len == dim. */
+static void rayvec_to_floats(ray_t* v, float* dst, int32_t dim) {
+    if (v->type != RAY_F32) {
+        for (int32_t j = 0; j < dim; j++) dst[j] = (float)rayvec_at_f64(v, j);
+        return;
+    }
+    memcpy(dst, ray_data(v), (size_t)dim * sizeof(float));
+}
+
+/* Validate list of numeric vectors; *out_dim gets the common length (0 if empty).
+ * Returns 0 on success, non-zero on error (incl. a length above INT32_MAX). */
+static int list_vec_validate(ray_t* list, int32_t* out_dim) {
+    if (!list || list->type != RAY_LIST) return 1;
+    if (list->len <= 0) { *out_dim = 0; return 0; }
+    ray_t* first = ray_list_get(list, 0);
+    if (!rayvec_is_numeric(first) || first->len <= 0 || first->len > INT32_MAX) return 2;
+    int32_t dim = (int32_t)first->len;   /* safe: range-checked on the line above */
+    for (int64_t i = 1; i < list->len; i++) {
+        ray_t* e = ray_list_get(list, i);
+        if (!rayvec_is_numeric(e) || e->len != dim) return 3;
+    }
+    *out_dim = dim;
+    return 0;
+}
+
+/* Pack each entry of `list` (dim elements apiece) into one new float buffer and
+ * set *out_n to the row count.  NULL if empty or OOM; free with ray_sys_free. */
+static float* list_flatten_floats(ray_t* list, int32_t dim, int64_t* out_n) {
+    const int64_t rows = list->len;
+    *out_n = rows;
+    if (rows == 0) return NULL;
+    float* flat = (float*)ray_sys_alloc((size_t)rows * (size_t)dim * sizeof(float));
+    if (flat) {
+        float* dst = flat;
+        for (int64_t i = 0; i < rows; i++, dst += dim)
+            rayvec_to_floats(ray_list_get(list, i), dst, dim);
+    }
+    return flat;
+}
+
+/* Metric kinds — the values the cos-dist / inner-prod / l2-dist builtins return:
+ *   MET_COS_DIST    1 - cos(a, b)          lower = closer, range [0, 2]
+ *   MET_INNER_PROD  dot(a, b), unnegated   sign varies, so not a distance
+ *   MET_L2_DIST     sqrt(sum (a - b)^2)    lower = closer */
+typedef enum { MET_COS_DIST, MET_INNER_PROD, MET_L2_DIST } metric_kind_t;
+
+/* Score `row` against query q (dim doubles, q_norm = ||q||) under metric k. */
+static double row_score(metric_kind_t k, ray_t* row,
+                         const double* q, double q_norm, int32_t dim) {
+    double acc = 0.0;
+    if (k == MET_L2_DIST) {
+        for (int32_t j = 0; j < dim; j++) {
+            double diff = rayvec_at_f64(row, j) - q[j];
+            acc += diff * diff;
+        }
+        return sqrt(acc);
+    }
+    double rr = 0.0;
+    for (int32_t j = 0; j < dim; j++) {
+        double x = rayvec_at_f64(row, j);
+        acc += x * q[j];
+        if (k == MET_COS_DIST) rr += x * x;
+    }
+    if (k == MET_INNER_PROD) return acc;
+    double denom = q_norm * sqrt(rr);
+    if (denom > 0.0) return 1.0 - acc / denom;
+    return 1.0;   /* a zero norm means similarity 0, i.e. distance 1 */
+}
+
+/* Copy query q into a new double[dim] (ray_sys_alloc'd); *q_norm_out = ||q||. */
+static double* query_to_doubles(ray_t* q, int32_t dim, double* q_norm_out) {
+    double* out = (double*)ray_sys_alloc((size_t)dim * sizeof(double));
+    if (!out) return NULL;
+    double sum_sq = 0.0;
+    for (int32_t j = 0; j < dim; j++) {
+        double x = rayvec_at_f64(q, j);
+        out[j] = x;
+        sum_sq += x * x;
+    }
+    *q_norm_out = sqrt(sum_sq);
+    return out;
+}
+
+/* Binary dispatcher for cos-dist / inner-prod / l2-dist.
+ *   LIST x vec (either order) -> F64 vector with one score per list entry;
+ *                                an empty LIST gives an empty vector
+ *   vec x vec                 -> F64 scalar
+ * Errors: "type", "length" (mismatch, empty, or longer than INT32_MAX), "oom". */
+static ray_t* vec_binary_metric(metric_kind_t kind, ray_t* a, ray_t* b) {
+    if (!a || !b) return ray_error("type", NULL);
+
+    ray_t* list = NULL;
+    ray_t* query = NULL;
+    if (a->type == RAY_LIST && rayvec_is_numeric(b))      { list = a; query = b; }
+    else if (b->type == RAY_LIST && rayvec_is_numeric(a)) { list = b; query = a; }
+
+    if (list) {
+        int32_t dim;
+        if (list_vec_validate(list, &dim) != 0) return ray_error("type", NULL);
+        int64_t n = list->len;
+        if (n <= 0) {   /* no rows: dim is unknown, so skip the query length check */
+            ray_t* empty = ray_vec_new(RAY_F64, 0);
+            return (empty && !RAY_IS_ERR(empty)) ? empty : ray_error("oom", NULL);
+        }
+        if (query->len != dim) return ray_error("length", NULL);
+        double q_norm;
+        double* q = query_to_doubles(query, dim, &q_norm);
+        if (!q) return ray_error("oom", NULL);
+        ray_t* result = ray_vec_new(RAY_F64, n);
+        if (!result || RAY_IS_ERR(result)) { ray_sys_free(q); return ray_error("oom", NULL); }
+        result->len = n;
+        double* out = (double*)ray_data(result);
+        for (int64_t i = 0; i < n; i++) out[i] = row_score(kind, ray_list_get(list, i), q, q_norm, dim);
+        ray_sys_free(q);
+        return result;
+    }
+
+    /* vec × vec → scalar */
+    if (!rayvec_is_numeric(a) || !rayvec_is_numeric(b)) return ray_error("type", NULL);
+    if (a->len != b->len || a->len <= 0 || a->len > INT32_MAX) return ray_error("length", NULL);
+    int32_t dim = (int32_t)a->len;
+    double q_norm;
+    double* q = query_to_doubles(b, dim, &q_norm);
+    if (!q) return ray_error("oom", NULL);
+    double v = row_score(kind, a, q, q_norm, dim);
+    ray_sys_free(q);
+    return make_f64(v);
+}
+
+ray_t* ray_cos_dist_fn(ray_t* a, ray_t* b)   { return vec_binary_metric(MET_COS_DIST, a, b); }    /* 1 - cos(a, b) */
+ray_t* ray_inner_prod_fn(ray_t* a, ray_t* b) { return vec_binary_metric(MET_INNER_PROD, a, b); }  /* dot(a, b) */
+ray_t* ray_l2_dist_fn(ray_t* a, ray_t* b)    { return vec_binary_metric(MET_L2_DIST, a, b); }     /* euclidean distance */
+
+/* (norm x): Euclidean norm.  A numeric vector gives an F64 scalar; a LIST of
+ * numeric vectors gives an F64 vector with one norm per entry.
+ * Errors: "type" for NULL / non-numeric input or an invalid list, "oom". */
+ray_t* ray_norm_fn(ray_t* x) {
+    if (!x) return ray_error("type", NULL);
+    if (x->type != RAY_LIST) {
+        if (!rayvec_is_numeric(x)) return ray_error("type", NULL);
+        double sum_sq = 0.0;
+        for (int64_t i = 0; i < x->len; i++) { double e = rayvec_at_f64(x, i); sum_sq += e * e; }
+        return make_f64(sqrt(sum_sq));
+    }
+
+    int32_t dim;
+    if (list_vec_validate(x, &dim) != 0) return ray_error("type", NULL);
+    int64_t rows = x->len;
+    ray_t* norms = ray_vec_new(RAY_F64, rows);
+    if (!norms || RAY_IS_ERR(norms)) return ray_error("oom", NULL);
+    norms->len = rows;
+    double* out = (double*)ray_data(norms);
+    for (int64_t i = 0; i < rows; i++) {
+        ray_t* entry = ray_list_get(x, i);
+        double sum_sq = 0.0;
+        for (int32_t j = 0; j < dim; j++) {
+            double e = rayvec_at_f64(entry, j);
+            sum_sq += e * e;
+        }
+        out[i] = sqrt(sum_sq);
+    }
+    return norms;
+}
+
+/* Map symbol atom 'cosine / 'l2 / 'ip to *out and return 1; return 0 otherwise. */
+static int parse_metric_sym(ray_t* s, ray_hnsw_metric_t* out) {
+    static const struct { const char* name; size_t len; ray_hnsw_metric_t m; } known[] = {
+        { "cosine", 6, RAY_HNSW_COSINE }, { "l2", 2, RAY_HNSW_L2 }, { "ip", 2, RAY_HNSW_IP },
+    };
+    if (!s || s->type != -RAY_SYM) return 0;
+    for (size_t i = 0; i < sizeof(known) / sizeof(known[0]); i++)
+        if (s->i64 == ray_sym_find(known[i].name, known[i].len)) { *out = known[i].m; return 1; }
+    return 0;
+}
+
+/* Integer atom (I64 / I32 / I16) widened to int64_t.
+ * NULL or an atom of any other type gives 0. */
+static int64_t atom_to_i64(ray_t* a) {
+    if (a == NULL) return 0;
+    if (a->type == -RAY_I64) return a->i64;
+    if (a->type == -RAY_I32) return (int64_t)a->i32;
+    if (a->type == -RAY_I16) return (int64_t)a->i16;
+    return 0;
+}
+
+static bool atom_is_int(ray_t* a) {   /* non-NULL atom of type I64, I32 or I16? */
+    return a != NULL && (a->type == -RAY_I16 || a->type == -RAY_I32 || a->type == -RAY_I64);
+}
+
+/* Build the {_rowid: I64, _dist: F64} table returned by knn from `count`
+ * entries that are already in output order.  `ent` may be NULL when count
+ * is 0.  Returns the table, or an "oom" error if any allocation fails. */
+static ray_t* knn_result_table(const knn_entry_t* ent, int64_t count) {
+    ray_t* rv = ray_vec_new(RAY_I64, count);
+    ray_t* dv = ray_vec_new(RAY_F64, count);
+    if (!rv || RAY_IS_ERR(rv) || !dv || RAY_IS_ERR(dv)) {
+        if (rv && !RAY_IS_ERR(rv)) ray_release(rv);
+        if (dv && !RAY_IS_ERR(dv)) ray_release(dv);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t* rd = (int64_t*)ray_data(rv);
+    double*  dd = (double*)ray_data(dv);
+    for (int64_t i = 0; i < count; i++) {
+        rd[i] = ent[i].rowid;
+        dd[i] = ent[i].dist;
+    }
+    rv->len = count;
+    dv->len = count;
+
+    ray_t* tbl = ray_table_new(2);
+    if (!tbl || RAY_IS_ERR(tbl)) {
+        ray_release(rv);
+        ray_release(dv);
+        return ray_error("oom", NULL);
+    }
+    tbl = ray_table_add_col(tbl, sym_intern_safe("_rowid", 6), rv);
+    ray_release(rv);
+    tbl = ray_table_add_col(tbl, sym_intern_safe("_dist",  5), dv);
+    ray_release(dv);
+    return tbl;
+}
+
+/* (knn col query k [metric]) → table {_rowid, _dist}
+ *
+ * Exact (brute-force) nearest-neighbour search over a LIST column.
+ *   col     RAY_LIST of equal-length numeric vectors
+ *   query   numeric vector as long as each column entry
+ *   k       positive integer atom, clamped to the number of rows
+ *   metric  optional symbol: 'cosine (default), 'l2 or 'ip
+ *
+ * _dist is oriented so that lower = closer (cosine distance, euclidean
+ * distance, or the negated inner product) and rows are returned in
+ * ascending _dist order.
+ *
+ * An empty column has no row length to compare the query against, so it
+ * yields an empty table for any numeric query rather than a "length" error.
+ *
+ * Errors: "rank" (argument count), "type", "domain" (unknown metric or
+ *         k <= 0), "length" (query / row length mismatch), "oom". */
+ray_t* ray_knn_fn(ray_t** args, int64_t n) {
+    if (n < 3 || n > 4) return ray_error("rank", NULL);
+    ray_t* col   = args[0];
+    ray_t* query = args[1];
+    ray_t* katom = args[2];
+    if (!col || col->type != RAY_LIST) return ray_error("type", NULL);
+    if (!rayvec_is_numeric(query))     return ray_error("type", NULL);
+    if (!atom_is_int(katom))           return ray_error("type", NULL);
+
+    ray_hnsw_metric_t metric = RAY_HNSW_COSINE;
+    if (n == 4 && !parse_metric_sym(args[3], &metric)) return ray_error("domain", NULL);
+
+    int32_t dim;
+    if (list_vec_validate(col, &dim) != 0) return ray_error("type", NULL);
+    int64_t nrows = col->len;
+    /* dim is only meaningful when there is at least one row to measure. */
+    if (nrows > 0 && query->len != dim) return ray_error("length", NULL);
+
+    int64_t k = atom_to_i64(katom);
+    if (k <= 0) return ray_error("domain", NULL);
+    if (nrows <= 0) return knn_result_table(NULL, 0);
+    if (k > nrows) k = nrows;
+
+    /* Prepare query as doubles (cached across all rows). */
+    double q_norm;
+    double* q = query_to_doubles(query, dim, &q_norm);
+    if (!q) return ray_error("oom", NULL);
+
+    /* Max-heap on distance (root = farthest of top-K kept). */
+    knn_entry_t* heap = (knn_entry_t*)ray_sys_alloc((size_t)k * sizeof(knn_entry_t));
+    if (!heap) { ray_sys_free(q); return ray_error("oom", NULL); }
+    int64_t hsz = 0;
+
+    for (int64_t i = 0; i < nrows; i++) {
+        ray_t* row = ray_list_get(col, i);
+        double d;
+        switch (metric) {
+            case RAY_HNSW_L2:
+                d = row_score(MET_L2_DIST, row, q, q_norm, dim);
+                break;
+            case RAY_HNSW_IP:
+                /* Negate inner product so lower = closer. */
+                d = -row_score(MET_INNER_PROD, row, q, q_norm, dim);
+                break;
+            case RAY_HNSW_COSINE:
+            default:
+                d = row_score(MET_COS_DIST, row, q, q_norm, dim);
+                break;
+        }
+        knn_heap_insert(heap, k, &hsz, d, i);
+    }
+    ray_sys_free(q);
+
+    /* Sort ascending by distance. */
+    for (int64_t i = 1; i < hsz; i++) {
+        knn_entry_t key = heap[i];
+        int64_t j = i - 1;
+        while (j >= 0 && heap[j].dist > key.dist) {
+            heap[j + 1] = heap[j];
+            j--;
+        }
+        heap[j + 1] = key;
+    }
+
+    ray_t* tbl = knn_result_table(heap, hsz);
+    ray_sys_free(heap);
+    return tbl;
+}
+
+/* ---------- HNSW handle plumbing ---------- */
+
+/* Recover the index pointer from an I64 atom tagged RAY_ATTR_HNSW; NULL otherwise. */
+static ray_hnsw_t* hnsw_unwrap(ray_t* h) {
+    bool tagged = h && h->type == -RAY_I64 && (h->attrs & RAY_ATTR_HNSW);
+    if (!tagged) return NULL;
+    return (ray_hnsw_t*)(uintptr_t)h->i64;
+}
+
+static ray_t* hnsw_wrap(ray_hnsw_t* idx) {   /* box idx in an I64 atom tagged RAY_ATTR_HNSW */
+    ray_t* h = ray_alloc(0);
+    if (!h) return ray_error("oom", NULL);
+    if (RAY_IS_ERR(h)) return h;              /* hand the allocator's error back unchanged */
+    h->type = -RAY_I64;  h->attrs |= RAY_ATTR_HNSW;
+    h->i64 = (int64_t)(uintptr_t)idx;
+    return h;
+}
+
+/* (hnsw-build col [metric] [M] [ef_c]) → I64 handle (RAY_ATTR_HNSW)
+ * metric defaults to 'cosine.  M outside 1..512 or ef_c outside 1..4096 is
+ * ignored and the HNSW_DEFAULT_* value kept.  An empty column is a "length" error. */
+ray_t* ray_hnsw_build_fn(ray_t** args, int64_t n) {
+    if (n < 1 || n > 4) return ray_error("rank", NULL);
+    ray_t* col = args[0];
+    if (!col || col->type != RAY_LIST) return ray_error("type", NULL);
+
+    ray_hnsw_metric_t metric = RAY_HNSW_COSINE;
+    if (n >= 2 && !parse_metric_sym(args[1], &metric)) return ray_error("domain", NULL);
+
+    int32_t M = HNSW_DEFAULT_M;
+    int32_t ef_c = HNSW_DEFAULT_EF_C;
+    if (n >= 3) {
+        if (!atom_is_int(args[2])) return ray_error("type", NULL);
+        int64_t want = atom_to_i64(args[2]);
+        if (want >= 1 && want <= 512) M = (int32_t)want;
+    }
+    if (n >= 4) {
+        if (!atom_is_int(args[3])) return ray_error("type", NULL);
+        int64_t want = atom_to_i64(args[3]);
+        if (want >= 1 && want <= 4096) ef_c = (int32_t)want;
+    }
+
+    int32_t dim;
+    if (list_vec_validate(col, &dim) != 0) return ray_error("type", NULL);
+    if (dim <= 0) return ray_error("length", NULL);
+    int64_t n_rows;
+    float* flat = list_flatten_floats(col, dim, &n_rows);
+    if (n_rows > 0 && !flat) return ray_error("oom", NULL);
+
+    /* ray_hnsw_build COPIES the vectors (idx->owns_data == true), so free our scratch. */
+    ray_hnsw_t* idx = ray_hnsw_build(flat, n_rows, dim, metric, M, ef_c);
+    if (flat) ray_sys_free(flat);
+    if (!idx) return ray_error("oom", NULL);
+    ray_t* handle = hnsw_wrap(idx);
+    if (!handle || RAY_IS_ERR(handle)) ray_hnsw_free(idx);
+    return handle;
+}
+
+/* (ann handle query k [ef_s]) → table {_rowid, _dist}
+ * k is clamped to the index's row count (as knn clamps to its column), so
+ * scratch buffers never scale with an oversized request.  ef defaults to
+ * max(k, HNSW_DEFAULT_EF_S); an ef_s outside 1..4096 is ignored. */
+ray_t* ray_ann_fn(ray_t** args, int64_t n) {
+    if (n < 3 || n > 4) return ray_error("rank", NULL);
+    ray_hnsw_t* idx = hnsw_unwrap(args[0]);
+    if (!idx) return ray_error("type", NULL);
+    if (!rayvec_is_numeric(args[1])) return ray_error("type", NULL);
+    if (!atom_is_int(args[2]))       return ray_error("type", NULL);
+
+    int32_t dim = idx->dim;
+    if (args[1]->len != dim) return ray_error("length", NULL);
+    int64_t k = atom_to_i64(args[2]);
+    if (k <= 0) return ray_error("domain", NULL);
+
+    /* Never ask for more rows than the index holds; keep k >= 1 for an empty index. */
+    int64_t k_cap = (int64_t)idx->n_nodes > 0 ? (int64_t)idx->n_nodes : 1;
+    if (k > k_cap) k = k_cap;
+
+    int32_t ef = (k > INT32_MAX) ? INT32_MAX : (int32_t)k;   /* saturate, never truncate */
+    if (ef < HNSW_DEFAULT_EF_S) ef = HNSW_DEFAULT_EF_S;
+    if (n == 4) {
+        if (!atom_is_int(args[3])) return ray_error("type", NULL);
+        int64_t v = atom_to_i64(args[3]);
+        if (v > 0 && v <= 4096) ef = (int32_t)v;
+    }
+
+    /* Scratch: a float copy of the query plus k-sized id / distance buffers. */
+    float* qbuf = (float*)ray_sys_alloc((size_t)dim * sizeof(float));
+    if (!qbuf) return ray_error("oom", NULL);
+    rayvec_to_floats(args[1], qbuf, dim);
+    int64_t* out_ids = (int64_t*)ray_sys_alloc((size_t)k * sizeof(int64_t));
+    double*  out_ds  = (double*)ray_sys_alloc((size_t)k * sizeof(double));
+    if (!out_ids || !out_ds) {
+        ray_sys_free(qbuf); if (out_ids) ray_sys_free(out_ids); if (out_ds) ray_sys_free(out_ds);
+        return ray_error("oom", NULL);
+    }
+
+    /* A negative result leaves both vectors NULL and falls into the shared "oom" exit. */
+    int64_t found = ray_hnsw_search(idx, qbuf, dim, k, ef, out_ids, out_ds);
+    ray_t* rv = (found >= 0) ? ray_vec_new(RAY_I64, found) : NULL;
+    ray_t* dv = (found >= 0) ? ray_vec_new(RAY_F64, found) : NULL;
+    bool rv_ok = rv && !RAY_IS_ERR(rv);
+    bool dv_ok = dv && !RAY_IS_ERR(dv);
+    if (rv_ok && dv_ok) {
+        int64_t* rd = (int64_t*)ray_data(rv);
+        double*  dd = (double*)ray_data(dv);
+        for (int64_t i = 0; i < found; i++) { rd[i] = out_ids[i]; dd[i] = out_ds[i]; }
+        rv->len = found; dv->len = found;
+    }
+    ray_sys_free(qbuf); ray_sys_free(out_ids); ray_sys_free(out_ds);
+    if (!rv_ok || !dv_ok) {
+        if (rv_ok) ray_release(rv);
+        if (dv_ok) ray_release(dv);
+        return ray_error("oom", NULL);
+    }
+    ray_t* tbl = ray_table_new(2);
+    if (!tbl || RAY_IS_ERR(tbl)) { ray_release(rv); ray_release(dv); return ray_error("oom", NULL); }
+    tbl = ray_table_add_col(tbl, sym_intern_safe("_rowid", 6), rv);
+    ray_release(rv);
+    tbl = ray_table_add_col(tbl, sym_intern_safe("_dist",  5), dv);
+    ray_release(dv);
+    return tbl;
+}
+
+/* (hnsw-free handle) → null.  Frees the index, then zeroes i64 and clears the
+ * ATTR, so a second call on the same atom is a "type" error, not a double free. */
+ray_t* ray_hnsw_free_fn(ray_t* h) {
+    ray_hnsw_t* idx = hnsw_unwrap(h);
+    if (idx == NULL) return ray_error("type", NULL);
+    ray_hnsw_free(idx);
+    h->attrs &= ~RAY_ATTR_HNSW;
+    h->i64 = 0;
+    return RAY_NULL_OBJ;
+}
+
+/* (hnsw-save handle path) → null.  path: non-empty string atom of at most 1022
+ * bytes.  Errors: "type" (bad handle or path), "domain" (path length), "io". */
+ray_t* ray_hnsw_save_fn(ray_t* h, ray_t* path) {
+    ray_hnsw_t* idx = hnsw_unwrap(h);
+    if (!idx) return ray_error("type", NULL);
+    if (!path || path->type != -RAY_STR) return ray_error("type", NULL);
+    char cpath[1024];
+    const char* src = ray_str_ptr(path);
+    size_t n = ray_str_len(path);
+    if (!src || n == 0 || n >= 1023) return ray_error("domain", NULL);
+    memcpy(cpath, src, n);
+    cpath[n] = '\0';   /* NUL-terminate the copy before handing it to ray_hnsw_save */
+    if (ray_hnsw_save(idx, cpath) != RAY_OK) return ray_error("io", NULL);
+    return RAY_NULL_OBJ;
+}
+
+/* (hnsw-load path) → I64 handle.  path: non-empty string atom, <= 1022 bytes. */
+ray_t* ray_hnsw_load_fn(ray_t* path) {
+    if (!path || path->type != -RAY_STR) return ray_error("type", NULL);
+    char cpath[1024];
+    const char* src = ray_str_ptr(path);
+    size_t n = ray_str_len(path);
+    if (!src || n == 0 || n >= 1023) return ray_error("domain", NULL);
+    memcpy(cpath, src, n);
+    cpath[n] = '\0';   /* NUL-terminate the copy before handing it to ray_hnsw_load */
+    ray_hnsw_t* idx = ray_hnsw_load(cpath);
+    if (!idx) return ray_error("io", NULL);
+    ray_t* handle = hnsw_wrap(idx);
+    if (!handle || RAY_IS_ERR(handle)) ray_hnsw_free(idx);   /* wrapping failed: drop the index */
+    return handle;
+}
+
+/* (hnsw-info handle) → dict { nrows, dim, metric, nlayers, M, efc }.
+ * Keys avoid hyphens so the 'quote-tick' syntax works: 'nrows, 'dim, etc.
+ * metric is a symbol ('cosine / 'l2 / 'ip); every other value is an I64. */
+ray_t* ray_hnsw_info_fn(ray_t* h) {
+    ray_hnsw_t* idx = hnsw_unwrap(h);
+    if (!idx) return ray_error("type", NULL);
+
+    const char* metric_name;
+    if ((ray_hnsw_metric_t)idx->metric == RAY_HNSW_L2)      metric_name = "l2";
+    else if ((ray_hnsw_metric_t)idx->metric == RAY_HNSW_IP) metric_name = "ip";
+    else                                                     metric_name = "cosine";
+
+    ray_t* keys = ray_sym_vec_new(RAY_SYM_W64, 6);
+    if (RAY_IS_ERR(keys)) return keys;
+    ray_t* vals = ray_list_new(6);
+    if (RAY_IS_ERR(vals)) { ray_release(keys); return vals; }
+
+    struct { const char* name; size_t nlen; ray_t* val; } fields[] = {
+        { "nrows",   5, make_i64(idx->n_nodes)               },
+        { "dim",     3, make_i64((int64_t)idx->dim)          },
+        { "metric",  6, ray_sym(sym_intern_safe(metric_name, strlen(metric_name))) },
+        { "nlayers", 7, make_i64((int64_t)idx->n_layers)     },
+        { "M",       1, make_i64((int64_t)idx->M)            },
+        { "efc",     3, make_i64((int64_t)idx->ef_construction) },
+    };
+    const size_t n_fields = sizeof(fields) / sizeof(fields[0]);
+    for (size_t i = 0; i < n_fields; i++) {
+        int64_t key = sym_intern_safe(fields[i].name, fields[i].nlen);
+        keys = ray_vec_append(keys, &key);
+        vals = ray_list_append(vals, fields[i].val);
+        ray_release(fields[i].val);
+    }
+    return ray_dict_new(keys, vals);
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/exec.c b/crates/rayforce-sys/vendor/rayforce/src/ops/exec.c
new file mode 100644
index 0000000..a28f41a
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/exec.c
@@ -0,0 +1,2272 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "ops/internal.h"
+#include "ops/rowsel.h"
+#include "mem/sys.h"
+
+/* Global profiler instance (zero-initialized = inactive) */
+ray_profile_t g_ray_profile;
+
+/* --------------------------------------------------------------------------
+ * Materialize a MAPCOMMON column into a flat vector of the key type (kv->type).
+ * Expands key_values × row_counts into one key value per row; "oom" error if the vector cannot be allocated.
+ * -------------------------------------------------------------------------- */
+ray_t* materialize_mapcommon(ray_t* mc) {
+    ray_t** mc_ptrs = (ray_t**)ray_data(mc);
+    ray_t* kv = mc_ptrs[0];   /* key_values: typed vec (DATE/I64/SYM) */
+    ray_t* rc = mc_ptrs[1];   /* row_counts: RAY_I64 vec of n_parts */
+    int64_t n_parts = kv->len;
+    int8_t kv_type = kv->type;
+    size_t esz = (size_t)ray_sym_elem_size(kv_type, kv->attrs);   /* bytes per key element */
+    const char* kdata = (const char*)ray_data(kv);
+    const int64_t* counts = (const int64_t*)ray_data(rc);
+
+    int64_t total = 0;   /* output rows = sum of the per-partition counts */
+    for (int64_t p = 0; p < n_parts; p++) total += counts[p];
+
+    ray_t* flat = ray_vec_new(kv_type, total);   /* NOTE(review): esz honours kv->attrs but flat is built from kv_type alone — confirm element widths agree */
+    if (!flat || RAY_IS_ERR(flat)) return ray_error("oom", NULL);
+    flat->len = total;
+
+    /* Pattern-fill: broadcast each partition's key value across its row range.
+     * Typed fill avoids per-element memcpy overhead. */
+    char* out = (char*)ray_data(flat);
+    int64_t off = 0;   /* first output row of the current partition */
+    for (int64_t p = 0; p < n_parts; p++) {
+        int64_t cnt = counts[p];
+        if (esz == 8) {
+            uint64_t v;
+            memcpy(&v, kdata + (size_t)p * 8, 8);   /* memcpy load: no alignment assumption on kdata */
+            uint64_t* dst = (uint64_t*)(out + off * 8);
+            for (int64_t r = 0; r < cnt; r++) dst[r] = v;
+        } else if (esz == 4) {
+            uint32_t v;
+            memcpy(&v, kdata + (size_t)p * 4, 4);
+            uint32_t* dst = (uint32_t*)(out + off * 4);
+            for (int64_t r = 0; r < cnt; r++) dst[r] = v;
+        } else {
+            for (int64_t r = 0; r < cnt; r++)   /* any other element size: byte copy per row */
+                memcpy(out + (off + r) * esz, kdata + (size_t)p * esz, esz);
+        }
+        off += cnt;
+    }
+    return flat;
+}
+
+/* Materialize first N rows of a MAPCOMMON column into a flat typed vector; "oom" error if it cannot be allocated. */
+ray_t* materialize_mapcommon_head(ray_t* mc, int64_t n) {
+    ray_t** mc_ptrs = (ray_t**)ray_data(mc);
+    ray_t* kv = mc_ptrs[0];   /* key values, one per partition */
+    ray_t* rc = mc_ptrs[1];   /* row counts, read as int64_t per partition */
+    int64_t n_parts = kv->len;
+    int8_t kv_type = kv->type;
+    size_t esz = (size_t)ray_sym_elem_size(kv_type, kv->attrs);   /* bytes per key element */
+    const char* kdata = (const char*)ray_data(kv);
+    const int64_t* counts = (const int64_t*)ray_data(rc);
+
+    ray_t* flat = ray_vec_new(kv_type, n);
+    if (!flat || RAY_IS_ERR(flat)) return ray_error("oom", NULL);
+    flat->len = n;   /* NOTE(review): if n exceeds the sum of counts, rows past that sum are never written — confirm callers clamp n */
+
+    char* out = (char*)ray_data(flat);
+    int64_t off = 0;   /* rows written so far */
+    for (int64_t p = 0; p < n_parts && off < n; p++) {
+        int64_t take = counts[p];
+        if (take > n - off) take = n - off;   /* last partition may be cut short at n */
+        if (esz == 8) {
+            uint64_t v;
+            memcpy(&v, kdata + (size_t)p * 8, 8);
+            uint64_t* dst = (uint64_t*)(out + off * 8);
+            for (int64_t r = 0; r < take; r++) dst[r] = v;
+        } else if (esz == 4) {
+            uint32_t v;
+            memcpy(&v, kdata + (size_t)p * 4, 4);
+            uint32_t* dst = (uint32_t*)(out + off * 4);
+            for (int64_t r = 0; r < take; r++) dst[r] = v;
+        } else {
+            for (int64_t r = 0; r < take; r++)   /* any other element size: byte copy per row */
+                memcpy(out + (off + r) * esz, kdata + (size_t)p * esz, esz);
+        }
+        off += take;
+    }
+    return flat;
+}
+
+/* Materialize MAPCOMMON through a boolean filter predicate: emits the partition key of every
+ * row whose pred byte is non-zero.  pass_count sizes the result; "oom" error on allocation failure. */
+ray_t* materialize_mapcommon_filter(ray_t* mc, ray_t* pred, int64_t pass_count) {
+    ray_t** mc_ptrs = (ray_t**)ray_data(mc);
+    ray_t* kv = mc_ptrs[0];
+    ray_t* rc = mc_ptrs[1];
+    int64_t n_parts = kv->len;
+    int8_t kv_type = kv->type;
+    size_t esz = (size_t)ray_sym_elem_size(kv_type, kv->attrs);
+    const char* kdata = (const char*)ray_data(kv);
+    const int64_t* counts = (const int64_t*)ray_data(rc);
+
+    ray_t* flat = ray_vec_new(kv_type, pass_count);
+    if (!flat || RAY_IS_ERR(flat)) return ray_error("oom", NULL);
+    flat->len = pass_count;
+    /* Nothing to emit — and with no partitions the counts[0] read below would be out of bounds. */
+    if (n_parts <= 0 || pass_count <= 0) return flat;
+
+    char* out = (char*)ray_data(flat);
+    int64_t out_idx = 0, row = 0, part_idx = 0;
+    int64_t part_end = counts[0];   /* first row that no longer belongs to part_idx */
+    ray_morsel_t mp;
+    ray_morsel_init(&mp, pred);
+    while (ray_morsel_next(&mp)) {
+        const uint8_t* bits = (const uint8_t*)mp.morsel_ptr;
+        for (int64_t i = 0; i < mp.morsel_len; i++, row++) {
+            while (part_idx < n_parts - 1 && row >= part_end) {   /* advance to the partition holding row */
+                part_idx++;
+                part_end += counts[part_idx];
+            }
+            if (bits[i])
+                memcpy(out + (size_t)out_idx++ * esz,
+                       kdata + (size_t)part_idx * esz, esz);
+        }
+    }
+    return flat;
+}
+
+
+/* ============================================================================
+ * Parallel index gather — used by filter, sort, and join
+ * ============================================================================ */
+/* Worker body: for rows [start, end) and each of c->ncols columns, copies element idx[i] of srcs[col] to element i of dsts[col]. */
+void multi_gather_fn(void* raw, uint32_t wid, int64_t start, int64_t end) {
+    (void)wid;
+    multi_gather_ctx_t* c = (multi_gather_ctx_t*)raw;
+    const int64_t* restrict idx = c->idx;
+    int64_t nc = c->ncols;
+
+    /* Process one column at a time per batch of rows.
+     * This focuses random reads on a single source array, giving the
+     * hardware prefetcher only 1 stream to track (instead of ncols
+     * concurrent streams, which overflows the L2 miss queue). */
+#define MG_BATCH 512
+#define MG_PF    32
+    for (int64_t base = start; base < end; base += MG_BATCH) {
+        int64_t bstart = base;
+        int64_t bend = base + MG_BATCH;
+        if (bend > end) bend = end;   /* final batch may be shorter than MG_BATCH */
+        for (int64_t col = 0; col < nc; col++) {
+            uint8_t e = c->esz[col];   /* element size of this column in bytes */
+            char* src = c->srcs[col];
+            char* dst = c->dsts[col];
+            if (e == 8) {
+                const uint64_t* restrict s8 = (const uint64_t*)src;
+                uint64_t* restrict d8 = (uint64_t*)dst;
+                for (int64_t i = bstart; i < bend; i++) {
+                    if (i + MG_PF < bend)   /* prefetch MG_PF rows ahead, never past this batch */
+                        __builtin_prefetch(&s8[idx[i + MG_PF]], 0, 0);
+                    d8[i] = s8[idx[i]];
+                }
+            } else if (e == 4) {
+                const uint32_t* restrict s4 = (const uint32_t*)src;
+                uint32_t* restrict d4 = (uint32_t*)dst;
+                for (int64_t i = bstart; i < bend; i++) {
+                    if (i + MG_PF < bend)
+                        __builtin_prefetch(&s4[idx[i + MG_PF]], 0, 0);
+                    d4[i] = s4[idx[i]];
+                }
+            } else {
+                for (int64_t i = bstart; i < bend; i++) {   /* any other element size: byte copy */
+                    if (i + MG_PF < bend)
+                        __builtin_prefetch(src + idx[i + MG_PF] * e, 0, 0);
+                    memcpy(dst + i * e, src + idx[i] * e, e);
+                }
+            }
+        }
+    }
+#undef MG_PF
+#undef MG_BATCH
+}
+
+/* Parallel index gather — single column with prefetching; rows [start, end): dst[i] = src[idx[i]] in esz-byte elements */
+void gather_fn(void* raw, uint32_t wid, int64_t start, int64_t end) {
+    (void)wid;
+    gather_ctx_t* c = (gather_ctx_t*)raw;
+    char* restrict src = (char*)ray_data(c->src_col);
+    char* restrict dst = (char*)ray_data(c->dst_col);
+    uint8_t esz = c->esz;   /* element size in bytes */
+    const int64_t* restrict idx = c->idx;
+#define GATHER_PF 16
+
+    if (c->nullable) {   /* nullable mode: a negative index produces a zero-filled element */
+        for (int64_t i = start; i < end; i++) {
+            if (i + GATHER_PF < end) {
+                int64_t pf = idx[i + GATHER_PF];
+                if (pf >= 0) __builtin_prefetch(src + pf * esz, 0, 0);   /* skip prefetch for negative indices */
+            }
+            int64_t r = idx[i];
+            if (r >= 0)
+                memcpy(dst + i * esz, src + r * esz, esz);
+            else
+                memset(dst + i * esz, 0, esz);
+        }
+    } else {   /* every index is used as-is, without a sign check */
+        for (int64_t i = start; i < end; i++) {
+            if (i + GATHER_PF < end)
+                __builtin_prefetch(src + idx[i + GATHER_PF] * esz, 0, 0);
+            memcpy(dst + i * esz, src + idx[i] * esz, esz);
+        }
+    }
+#undef GATHER_PF
+}
+
+/* ============================================================================
+ * Partitioned gather — cache-conscious column rearrangement
+ *
+ * Standard gather:  dst[i] = src[idx[i]] — sequential writes, random reads.
+ * With 10M rows the source data (~hundreds of MB) far exceeds L2 cache, so
+ * every read is a main-memory miss (~60ns even with prefetching).
+ *
+ * Partitioned gather groups work by source ranges: for each 16K-row source
+ * block, process all indices that point into it.  The block fits in L2, so
+ * reads become L2 hits (~5ns).  Output writes become random but the CPU's
+ * store buffer absorbs them without stalling (~20ns effective).
+ *
+ * Three phases:
+ *   1. Histogram  — count indices per source block           (parallel)
+ *   2. Route      — scatter (dest, src) pairs into buckets   (parallel)
+ *   3. Block-gather — per block, source in L2 → fast reads   (parallel)
+ * ============================================================================ */
+
+/* Block = 16K source rows.  16K × 16 cols × 8B = 2MB ≈ L2 cache per core. */
+#define PG_BSHIFT 14                 /* log2 of PG_BSIZE; idx >> PG_BSHIFT is the block number */
+#define PG_BSIZE  (1 << PG_BSHIFT)   /* 16384 */
+#define PG_MIN    (PG_BSIZE * 8)     /* 131072 — below this, routing overhead > benefit */
+
+/* Phase 1+2 use dispatch_n with explicit task-to-range mapping so that
+ * histogram and scatter have consistent per-task assignments regardless
+ * of which worker picks up each task (work-stealing is non-deterministic). */
+
+typedef struct {
+    const int64_t* idx;       /* indices to bin; idx[i] >> PG_BSHIFT picks the slot */
+    int64_t*       hist;      /* n_tasks × n_parts, row-major */
+    int64_t        n_parts;   /* slots per task row */
+    int64_t        n;         /* total rows */
+    uint32_t       n_tasks;   /* number of slices n is divided into */
+} pg_hist_ctx_t;
+
+/* Histogram task.  `start` carries the task id; `wid` and `end` are unused.
+ * Clears this task's histogram row, then counts how many indices of its
+ * slice of idx fall into each slot. */
+static void pg_hist_fn(void* arg, uint32_t wid, int64_t start, int64_t end) {
+    (void)wid; (void)end;
+    pg_hist_ctx_t* ctx = (pg_hist_ctx_t*)arg;
+    const int64_t task_id = start;
+
+    /* Slice bounds: ceil(n / n_tasks) rows per task, clamped to n. */
+    const int64_t per_task = (ctx->n + ctx->n_tasks - 1) / ctx->n_tasks;
+    const int64_t first = task_id * per_task;
+    const int64_t last = (first + per_task > ctx->n) ? ctx->n : first + per_task;
+    /* The row is zeroed even when the slice is empty (first >= last). */
+    int64_t* counts = ctx->hist + task_id * ctx->n_parts;
+    memset(counts, 0, (size_t)ctx->n_parts * sizeof(int64_t));
+    for (int64_t row = first; row < last; row++)
+        counts[ctx->idx[row] >> PG_BSHIFT]++;
+}
+
+typedef struct {
+    const int64_t* idx;       /* indices to route */
+    int32_t*       rdest;     /* out: row position of each routed entry */
+    int32_t*       rsrc;      /* out: idx value of each routed entry */
+    int64_t*       offsets;   /* n_tasks × n_parts write cursors */
+    int64_t        n_parts;
+    int64_t        n;
+    uint32_t       n_tasks;
+} pg_route_ctx_t;
+
+/* Routing task.  `start` carries the task id; `wid` and `end` are unused.
+ * For every row of its slice, stores the (row, idx[row]) pair at the position held by this
+ * task's cursor for slot idx[row] >> PG_BSHIFT and advances that cursor (values narrowed to int32_t). */
+static void pg_route_fn(void* arg, uint32_t wid, int64_t start, int64_t end) {
+    (void)wid; (void)end;
+    pg_route_ctx_t* ctx = (pg_route_ctx_t*)arg;
+    const int64_t task_id = start;
+
+    const int64_t per_task = (ctx->n + ctx->n_tasks - 1) / ctx->n_tasks;
+    const int64_t first = task_id * per_task;
+    const int64_t last = (first + per_task > ctx->n) ? ctx->n : first + per_task;
+
+    int64_t* cursor = ctx->offsets + task_id * ctx->n_parts;
+    int32_t* dest_out = ctx->rdest;
+    int32_t* src_out = ctx->rsrc;
+    for (int64_t row = first; row < last; row++) {
+        const int64_t src_row = ctx->idx[row];
+        const int64_t slot = cursor[src_row >> PG_BSHIFT]++;
+        dest_out[slot] = (int32_t)row;
+        src_out[slot] = (int32_t)src_row;
+    }
+}
+
+/* Phase 3: per-block gather — one task per source block; copies src[rsrc[j]] to dst[rdest[j]] for the block's routed pairs */
+typedef struct {
+    const int32_t* rdest;     /* destination row of each routed pair */
+    const int32_t* rsrc;      /* source row of each routed pair */
+    const int64_t* part_off;  /* partition start offsets (n_parts + 1) */
+    char**         srcs;      /* per-column source base pointers */
+    char**         dsts;      /* per-column destination base pointers */
+    const uint8_t* esz;       /* per-column element size in bytes */
+    int64_t        ncols;
+} pg_block_ctx_t;
+
+static void pg_block_fn(void* arg, uint32_t wid, int64_t start, int64_t end) {
+    (void)wid; (void)end;
+    pg_block_ctx_t* c = (pg_block_ctx_t*)arg;
+    int64_t blk = start;  /* dispatch_n: one task per call */
+
+    int64_t lo = c->part_off[blk];       /* this block's pairs live in [lo, hi) of rdest/rsrc */
+    int64_t hi = c->part_off[blk + 1];
+    if (lo >= hi) return;                /* no index points into this block */
+
+    const int32_t* rd = c->rdest + lo;
+    const int32_t* rs = c->rsrc  + lo;
+    int64_t cnt = hi - lo;
+
+    /* Column-at-a-time: keeps the source block hot in L2.
+     * After the first few reads, the entire 16K-row source slice
+     * is cache-resident, so subsequent reads are L2 hits. */
+    for (int64_t col = 0; col < c->ncols; col++) {
+        uint8_t e = c->esz[col];
+        const char* src = c->srcs[col];
+        char* dst = c->dsts[col];
+        if (e == 8) {
+            const uint64_t* s8 = (const uint64_t*)src;
+            uint64_t* d8 = (uint64_t*)dst;
+            for (int64_t j = 0; j < cnt; j++)
+                d8[rd[j]] = s8[rs[j]];
+        } else if (e == 4) {
+            const uint32_t* s4 = (const uint32_t*)src;
+            uint32_t* d4 = (uint32_t*)dst;
+            for (int64_t j = 0; j < cnt; j++)
+                d4[rd[j]] = s4[rs[j]];
+        } else if (e == 2) {
+            const uint16_t* s2 = (const uint16_t*)src;
+            uint16_t* d2 = (uint16_t*)dst;
+            for (int64_t j = 0; j < cnt; j++)
+                d2[rd[j]] = s2[rs[j]];
+        } else if (e == 1) {
+            for (int64_t j = 0; j < cnt; j++)
+                dst[rd[j]] = src[rs[j]];
+        } else {
+            for (int64_t j = 0; j < cnt; j++)   /* any other element size: byte copy, offsets widened to int64_t */
+                memcpy(dst + (int64_t)rd[j] * e,
+                       src + (int64_t)rs[j] * e, e);
+        }
+    }
+}
+
+/* Plain (non-partitioned) gather.  multi_gather_ctx_t holds at most MGATHER_MAX_COLS
+ * columns, so the columns go through in batches of that size; none are dropped. */
+static void pg_plain_gather(ray_pool_t* pool, const int64_t* idx, int64_t n,
+                            char** srcs, char** dsts, const uint8_t* esz, int64_t ncols) {
+    for (int64_t base = 0; base < ncols; base += MGATHER_MAX_COLS) {
+        multi_gather_ctx_t mg = { .idx = idx, .ncols = 0 };
+        for (int64_t c = base; c < ncols && mg.ncols < MGATHER_MAX_COLS; c++) {
+            mg.srcs[mg.ncols] = srcs[c]; mg.dsts[mg.ncols] = dsts[c]; mg.esz[mg.ncols] = esz[c];
+            mg.ncols++;
+        }
+        if (pool) ray_pool_dispatch(pool, multi_gather_fn, &mg, n);
+        else      multi_gather_fn(&mg, 0, 0, n);
+    }
+}
+
+/* Public entry point: partitioned gather for n >= PG_MIN, fallback otherwise.
+ * n:        number of index entries (output rows)
+ * src_rows: number of rows in the source columns (indices may reference [0, src_rows)) */
+void partitioned_gather(ray_pool_t* pool, const int64_t* idx, int64_t n,
+                        int64_t src_rows, char** srcs, char** dsts,
+                        const uint8_t* esz, int64_t ncols) {
+    /* Fallback for small arrays, no pool, or sizes that do not fit the int32 routing buffers */
+    if (!pool || n < PG_MIN || n > INT32_MAX || src_rows > INT32_MAX) {
+        pg_plain_gather(pool, idx, n, srcs, dsts, esz, ncols);
+        return;
+    }
+
+    /* Partition by SOURCE range — indices can reference any row in [0, src_rows) */
+    int64_t n_parts = (src_rows + PG_BSIZE - 1) >> PG_BSHIFT;
+    uint32_t nw = ray_pool_total_workers(pool);
+
+    /* Allocate routing buffers */
+    ray_t *hist_hdr = NULL, *off_hdr = NULL;
+    ray_t *rdest_hdr = NULL, *rsrc_hdr = NULL, *poff_hdr = NULL;
+    int64_t* hist    = (int64_t*)scratch_alloc(&hist_hdr,
+                           (size_t)nw * (size_t)n_parts * sizeof(int64_t));
+    int64_t* offsets = (int64_t*)scratch_alloc(&off_hdr,
+                           (size_t)nw * (size_t)n_parts * sizeof(int64_t));
+    int32_t* rdest   = (int32_t*)scratch_alloc(&rdest_hdr,
+                           (size_t)n * sizeof(int32_t));
+    int32_t* rsrc    = (int32_t*)scratch_alloc(&rsrc_hdr,
+                           (size_t)n * sizeof(int32_t));
+    int64_t* part_off = (int64_t*)scratch_alloc(&poff_hdr,
+                            (size_t)(n_parts + 1) * sizeof(int64_t));
+
+    if (!hist || !offsets || !rdest || !rsrc || !part_off) {
+        scratch_free(hist_hdr); scratch_free(off_hdr);
+        scratch_free(rdest_hdr); scratch_free(rsrc_hdr);
+        scratch_free(poff_hdr);
+        pg_plain_gather(pool, idx, n, srcs, dsts, esz, ncols);   /* allocation failed → regular gather */
+        return;
+    }
+
+    /* Phase 1: parallel histogram (dispatch_n for deterministic task→range) */
+    pg_hist_ctx_t hctx = {
+        .idx = idx, .hist = hist, .n_parts = n_parts,
+        .n = n, .n_tasks = nw,
+    };
+    ray_pool_dispatch_n(pool, pg_hist_fn, &hctx, nw);
+
+    /* Phase 2: prefix sum → per-task scatter offsets + partition boundaries */
+    int64_t running = 0;
+    for (int64_t p = 0; p < n_parts; p++) {
+        part_off[p] = running;
+        for (uint32_t t = 0; t < nw; t++) {
+            offsets[t * n_parts + p] = running;
+            running += hist[t * n_parts + p];
+        }
+    }
+    part_off[n_parts] = running;
+
+    /* Phase 3: parallel route (same task→range mapping as histogram) */
+    pg_route_ctx_t rctx = {
+        .idx = idx, .rdest = rdest, .rsrc = rsrc,
+        .offsets = offsets, .n_parts = n_parts,
+        .n = n, .n_tasks = nw,
+    };
+    ray_pool_dispatch_n(pool, pg_route_fn, &rctx, nw);
+
+    /* Phase 4: parallel per-block gather */
+    pg_block_ctx_t bctx = {
+        .rdest = rdest, .rsrc = rsrc, .part_off = part_off,
+        .srcs = srcs, .dsts = dsts, .esz = esz, .ncols = ncols,
+    };
+    ray_pool_dispatch_n(pool, pg_block_fn, &bctx, (uint32_t)n_parts);
+
+    scratch_free(hist_hdr); scratch_free(off_hdr);
+    scratch_free(rdest_hdr); scratch_free(rsrc_hdr);
+    scratch_free(poff_hdr);
+}
+
+/* (filter execution moved to filter.c) */
+
+
+/* ============================================================================
+ * Executor forward declarations and shared helpers (scalar broadcast, IN)
+ * ============================================================================ */
+
+/* Forward declarations — exec_node wraps exec_node_inner with profiling */
+/* exec_node declared extern in exec_internal.h */
+static ray_t* exec_node_inner(ray_graph_t* g, ray_op_t* op);
+
+
+
+/* Broadcast a scalar atom to a column vector of nrows elements.
+ * Accepts STR, I64, F64, BOOL and SYM atoms; any other type is a "type" error
+ * and a NULL atom is a "domain" error.  nrows <= 0 yields an empty typed vector.
+ * Returns a new vector (caller owns).  On failure returns ray_error() —
+ * including "oom" when ray_vec_new hands back NULL. */
+ray_t* broadcast_scalar(ray_t* atom, int64_t nrows) {
+    if (!atom) return ray_error("domain", NULL);
+
+    /* Map the atom type (negated vector type) to the column type once. */
+    int8_t at = atom->type;
+    int8_t vt;
+    if      (at == -RAY_STR)  vt = RAY_STR;
+    else if (at == -RAY_I64)  vt = RAY_I64;
+    else if (at == -RAY_F64)  vt = RAY_F64;
+    else if (at == -RAY_BOOL) vt = RAY_BOOL;
+    else if (at == -RAY_SYM)  vt = RAY_SYM;
+    else return ray_error("type", NULL);
+
+    /* Empty table: return an empty vector of the matching type */
+    if (nrows <= 0) {
+        ray_t* empty = ray_vec_new(vt, 0);
+        return empty ? empty : ray_error("oom", NULL);
+    }
+
+    ray_t* vec = ray_vec_new(vt, nrows);
+    if (!vec) return ray_error("oom", NULL);
+    if (RAY_IS_ERR(vec)) return vec;
+
+    /* -RAY_STR → RAY_STR column */
+    if (vt == RAY_STR) {
+        const char* sp = ray_str_ptr(atom);
+        size_t sl = ray_str_len(atom);
+        for (int64_t r = 0; r < nrows; r++) {
+            vec = ray_str_vec_append(vec, sp, sl);
+            if (RAY_IS_ERR(vec)) return vec;
+        }
+        return vec;
+    }
+
+    /* Numeric / bool / sym scalars: copy the atom's payload (1 byte for BOOL,
+     * 8 bytes otherwise) into a zeroed buffer and append it nrows times. */
+    size_t esz = (vt == RAY_BOOL) ? 1 : 8;
+    uint8_t elem[8] = {0};
+    memcpy(elem, &atom->i64, esz);
+    for (int64_t r = 0; r < nrows; r++) {
+        vec = ray_vec_append(vec, elem);
+        if (RAY_IS_ERR(vec)) return vec;
+    }
+    return vec;
+}
+
+/* OP_IN worker — process [start, end) of the BOOL output buffer: ob[i] = (row value found in probe set) XOR negate.
+ * Disjoint slices, no synchronization.  Null rows always produce 0. */
+typedef struct {
+    ray_t*         col;            /* column vector, or atom when col_is_atom */
+    const double*  svf;            /* probe values as double (read when use_double) */
+    const int64_t* svi;            /* probe values as int64 (read otherwise) */
+    int64_t        sv_len;         /* number of probe values */
+    uint8_t*       ob;             /* output bytes, one per row */
+    int8_t         ct;             /* element type used to read col */
+    bool           col_has_nulls;  /* check ray_vec_is_null per row */
+    bool           col_atom_null;  /* every row is treated as null */
+    bool           col_is_atom;    /* read the value from the atom, not from vector data */
+    bool           use_double;     /* compare as double instead of int64 */
+    bool           negate;         /* invert the match result for non-null rows */
+} in_worker_ctx_t;
+
+static void exec_in_worker(void* vctx, uint32_t worker_id,
+                           int64_t start, int64_t end) {
+    (void)worker_id;
+    in_worker_ctx_t* c = (in_worker_ctx_t*)vctx;
+    ray_t* col = c->col;
+    const void* cd = c->col_is_atom ? NULL : ray_data(col);   /* vector data; unused for atoms */
+    int8_t ct = c->ct;
+    uint8_t cattrs = c->col_is_atom ? 0 : col->attrs;
+    uint8_t* ob = c->ob;
+    int64_t sv_len = c->sv_len;
+    int negate = c->negate ? 1 : 0;
+
+    #define IN_READ_I64(dst, idx) do {                                      \
+        switch (ct) {                                                       \
+        case RAY_BOOL: case RAY_U8: (dst) = ((const uint8_t*)cd)[idx]; break; \
+        case RAY_I16:  (dst) = ((const int16_t*)cd)[idx]; break;            \
+        case RAY_I32:  case RAY_DATE: case RAY_TIME:                        \
+                       (dst) = ((const int32_t*)cd)[idx]; break;            \
+        case RAY_I64:  case RAY_TIMESTAMP:                                  \
+                       (dst) = ((const int64_t*)cd)[idx]; break;            \
+        case RAY_SYM:  (dst) = ray_read_sym(cd, (idx), ct, cattrs); break;  \
+        default:       (dst) = 0; break;                                    \
+        }                                                                   \
+    } while (0)
+
+    #define IN_READ_F64(dst, idx) do {                                      \
+        switch (ct) {                                                       \
+        case RAY_BOOL: case RAY_U8: (dst) = (double)((const uint8_t*)cd)[idx]; break; \
+        case RAY_I16:  (dst) = (double)((const int16_t*)cd)[idx]; break;    \
+        case RAY_I32:  case RAY_DATE: case RAY_TIME:                        \
+                       (dst) = (double)((const int32_t*)cd)[idx]; break;    \
+        case RAY_I64:  case RAY_TIMESTAMP:                                  \
+                       (dst) = (double)((const int64_t*)cd)[idx]; break;    \
+        case RAY_F32:  (dst) = (double)((const float*)cd)[idx]; break;      \
+        case RAY_F64:  (dst) = ((const double*)cd)[idx]; break;             \
+        default:       (dst) = 0.0; break;                                  \
+        }                                                                   \
+    } while (0)
+
+    if (c->use_double) {
+        const double* svf = c->svf;
+        for (int64_t i = start; i < end; i++) {
+            bool row_null = c->col_atom_null ||
+                            (c->col_has_nulls && !c->col_is_atom &&
+                             ray_vec_is_null(col, i));
+            if (row_null) { ob[i] = 0; continue; }   /* null rows are 0 regardless of negate */
+            double cv;
+            if (c->col_is_atom) cv = (ct == RAY_F64) ? col->f64 : (double)col->i64;   /* NOTE(review): an F32 atom would be read through i64 here — confirm atoms are never F32 */
+            else IN_READ_F64(cv, i);
+            int found = 0;
+            for (int64_t j = 0; j < sv_len; j++)   /* linear scan of the probe values */
+                if (cv == svf[j]) { found = 1; break; }
+            ob[i] = (uint8_t)(found ^ negate);
+        }
+    } else {
+        const int64_t* svi = c->svi;
+        for (int64_t i = start; i < end; i++) {
+            bool row_null = c->col_atom_null ||
+                            (c->col_has_nulls && !c->col_is_atom &&
+                             ray_vec_is_null(col, i));
+            if (row_null) { ob[i] = 0; continue; }   /* null rows are 0 regardless of negate */
+            int64_t cv;
+            if (c->col_is_atom) cv = col->i64;
+            else IN_READ_I64(cv, i);
+            int found = 0;
+            for (int64_t j = 0; j < sv_len; j++)   /* linear scan of the probe values */
+                if (cv == svi[j]) { found = 1; break; }
+            ob[i] = (uint8_t)(found ^ negate);
+        }
+    }
+    #undef IN_READ_I64
+    #undef IN_READ_F64
+}
+
+/* ============================================================================
+ * exec_in — membership test (col IN set_vec)
+ *
+ * Evaluates each element of `col` against `set`.  Returns a RAY_BOOL
+ * vector of col->len.  For OP_NOT_IN the output is inverted.
+ *
+ * Type handling:
+ *   - SYM ∈ SYM  → compare interned sym IDs as i64
+ *   - Integer-family (BOOL/U8/I16/I32/I64/DATE/TIME/TIMESTAMP) on both
+ *     sides → compare values as int64 (I16 and 32-bit types are
+ *     sign-extended during read; BOOL/U8 are read as unsigned bytes).
+ *   - Any float on either side, mixed with each other or with
+ *     integer family → promote both sides to double and compare with
+ *     `==`.  This covers the common case `(in price [1 2 3])` where
+ *     price is F64 and the set literal parses as I64.
+ *   - SYM mixed with anything else → no matches (type-mismatch; we
+ *     don't error because it's a legal Rayfall comparison that
+ *     simply produces false).
+ *   - RAY_STR: deferred (returns nyi).
+ * ============================================================================ */
+static ray_t* exec_in(ray_graph_t* g, ray_op_t* op, ray_t* col, ray_t* set) {
+    (void)g;
+    bool negate = (op->opcode == OP_NOT_IN);
+
+    int64_t col_len = ray_is_atom(col) ? 1 : col->len;   /* an atom counts as one row */
+    int64_t set_len = ray_is_atom(set) ? 1 : set->len;
+
+    /* Empty col: the main loop produces an empty BOOL result
+     * correctly, but there's nothing to iterate, so short-circuit. */
+    if (col_len == 0) {
+        ray_t* out = ray_vec_new(RAY_BOOL, 0);
+        if (!out || RAY_IS_ERR(out)) return out;
+        out->len = 0;
+        return out;
+    }
+
+    /* NOTE: we intentionally do NOT short-circuit on set_len == 0.
+     * Even for an empty probe, the main loop still needs to check
+     * each col row's null flag so null rows never leak through as
+     * true for `not-in` (the old memset bypass did exactly that). */
+
+    int8_t ct = ray_is_atom(col) ? (int8_t)(-col->type) : col->type;   /* atoms carry the negated type */
+    int8_t st = ray_is_atom(set) ? (int8_t)(-set->type) : set->type;
+    if (RAY_IS_PARTED(ct)) ct = (int8_t)RAY_PARTED_BASETYPE(ct);
+    if (RAY_IS_PARTED(st)) st = (int8_t)RAY_PARTED_BASETYPE(st);
+
+    if (ct == RAY_STR || st == RAY_STR)
+        return ray_error("nyi", "OP_IN on RAY_STR not yet implemented");
+
+    /* Classify each side: 0=int-family, 1=float-family, 2=sym. */
+    #define CLASSIFY(t)                                                    \
+        ((t) == RAY_SYM ? 2 :                                              \
+         ((t) == RAY_F32 || (t) == RAY_F64) ? 1 : 0)
+
+    int col_class = CLASSIFY(ct);
+    int set_class = CLASSIFY(st);
+
+    /* Mixed SYM vs non-SYM → treat as an empty probe.  A SYM set
+     * containing resolved sym IDs has no meaning when compared to a
+     * raw integer column, so nothing can match — but we still drop
+     * through to the main loop so null rows are handled consistently
+     * (they emit 0 regardless of negate). */
+    if ((col_class == 2) != (set_class == 2)) {
+        set_len = 0;
+    }
+
+    /* Float-promoted path: at least one side is float.  Read both as
+     * double and compare. */
+    int use_double = (col_class == 1 || set_class == 1);
+
+    ray_t* out = ray_vec_new(RAY_BOOL, col_len);
+    if (!out || RAY_IS_ERR(out)) return out;
+    out->len = col_len;
+    uint8_t* ob = (uint8_t*)ray_data(out);   /* one result byte per column row */
+
+    /* Null-aware: null rows in the column never pass either `in` or
+     * `not-in`.  Mirrors SQL-style semantics where NULL IN (…) and
+     * NULL NOT IN (…) both yield UNKNOWN / false in a boolean
+     * context.  Also skip null elements when building the probe
+     * buffer so a non-null col row doesn't accidentally match the
+     * sentinel value of a null set element. */
+    bool col_has_nulls = !ray_is_atom(col) && (col->attrs & RAY_ATTR_HAS_NULLS);
+    bool col_atom_null = ray_is_atom(col) && RAY_ATOM_IS_NULL(col);
+    bool set_has_nulls = !ray_is_atom(set) && (set->attrs & RAY_ATTR_HAS_NULLS);
+
+    #define READ_I64(dst, vec, type, idx) do {                             \
+        const void* _d = ray_data(vec);                                    \
+        switch (type) {                                                    \
+        case RAY_BOOL: case RAY_U8: (dst) = ((const uint8_t*)_d)[idx]; break; \
+        case RAY_I16:  (dst) = ((const int16_t*)_d)[idx]; break;           \
+        case RAY_I32:  case RAY_DATE: case RAY_TIME:                       \
+                       (dst) = ((const int32_t*)_d)[idx]; break;           \
+        case RAY_I64:  case RAY_TIMESTAMP:                                 \
+                       (dst) = ((const int64_t*)_d)[idx]; break;           \
+        case RAY_SYM:  (dst) = ray_read_sym(_d, (idx), (type),             \
+                                            (vec)->attrs); break;          \
+        default:       (dst) = 0; break;                                   \
+        }                                                                  \
+    } while (0)
+
+    #define READ_F64(dst, vec, type, idx) do {                             \
+        const void* _d = ray_data(vec);                                    \
+        switch (type) {                                                    \
+        case RAY_BOOL: case RAY_U8: (dst) = (double)((const uint8_t*)_d)[idx]; break; \
+        case RAY_I16:  (dst) = (double)((const int16_t*)_d)[idx]; break;   \
+        case RAY_I32:  case RAY_DATE: case RAY_TIME:                       \
+                       (dst) = (double)((const int32_t*)_d)[idx]; break;   \
+        case RAY_I64:  case RAY_TIMESTAMP:                                 \
+                       (dst) = (double)((const int64_t*)_d)[idx]; break;   \
+        case RAY_F32:  (dst) = (double)((const float*)_d)[idx]; break;     \
+        case RAY_F64:  (dst) = ((const double*)_d)[idx]; break;            \
+        default:       (dst) = 0.0; break;                                 \
+        }                                                                  \
+    } while (0)
+
+    /* Compact probe buffer: drop null set elements up front so the
+     * inner loop doesn't special-case them. */
+    int64_t sv_len = 0;          /* probe values actually stored (nulls skipped) */
+    double  svf_stack[32];       /* sets of up to 32 elements stay on the stack */
+    int64_t svi_stack[32];
+    double* svf = svf_stack;
+    int64_t* svi = svi_stack;
+    ray_t* sv_hdr = NULL;        /* heap block for larger sets; freed before returning */
+    if (set_len > 32) {
+        size_t bytes = (size_t)set_len * (use_double ? sizeof(double) : sizeof(int64_t));
+        sv_hdr = ray_alloc(bytes);
+        if (!sv_hdr) { ray_release(out); return ray_error("oom", NULL); }
+        if (use_double) svf = (double*)ray_data(sv_hdr);
+        else            svi = (int64_t*)ray_data(sv_hdr);
+    }
+
+    /* set_len is 0 when we want to suppress the set entirely
+     * (SYM-vs-non-SYM type mismatch).  Respect it in BOTH the
+     * atom and vec branches so the probe stays empty. */
+    if (use_double) {
+        if (set_len > 0 && ray_is_atom(set)) {
+            if (!RAY_ATOM_IS_NULL(set)) {
+                svf[0] = (st == RAY_F64) ? set->f64 : (double)set->i64;   /* NOTE(review): an F32 atom would be read through i64 — confirm atoms are never F32 */
+                sv_len = 1;
+            }
+        } else if (set_len > 0) {
+            for (int64_t i = 0; i < set_len; i++) {
+                if (set_has_nulls && ray_vec_is_null(set, i)) continue;
+                READ_F64(svf[sv_len], set, st, i);
+                sv_len++;
+            }
+        }
+    } else {
+        if (set_len > 0 && ray_is_atom(set)) {
+            if (!RAY_ATOM_IS_NULL(set)) { svi[0] = set->i64; sv_len = 1; }
+        } else if (set_len > 0) {
+            for (int64_t i = 0; i < set_len; i++) {
+                if (set_has_nulls && ray_vec_is_null(set, i)) continue;
+                READ_I64(svi[sv_len], set, st, i);
+                sv_len++;
+            }
+        }
+    }
+
+    in_worker_ctx_t in_ctx = {
+        .col = col,
+        .svf = svf, .svi = svi, .sv_len = sv_len,
+        .ob = ob, .ct = ct,
+        .col_has_nulls = col_has_nulls,
+        .col_atom_null = col_atom_null,
+        .col_is_atom = ray_is_atom(col),
+        .use_double = use_double,
+        .negate = negate,
+    };
+
+    ray_pool_t* pool = ray_pool_get();
+    if (pool && col_len >= RAY_PARALLEL_THRESHOLD && !ray_is_atom(col))   /* parallel only for large vector columns */
+        ray_pool_dispatch(pool, exec_in_worker, &in_ctx, col_len);
+    else
+        exec_in_worker(&in_ctx, 0, 0, col_len);   /* run the whole range inline */
+
+    if (sv_hdr) ray_free(sv_hdr);
+
+    #undef READ_I64
+    #undef READ_F64
+    #undef CLASSIFY
+    return out;
+}
+
+/* ============================================================================
+ * Recursive executor
+ * ============================================================================ */
+
+/* "Heavy" = pipeline breaker worth profiling: an op named below, or any in OP_EXPAND..OP_KNN_RERANK. */
+static inline bool op_is_heavy(uint16_t opc) {
+    switch (opc) {
+    case OP_FILTER: case OP_SORT: case OP_GROUP: case OP_JOIN: case OP_WINDOW_JOIN:
+    case OP_SELECT: case OP_HEAD: case OP_TAIL: case OP_WINDOW: case OP_PIVOT: return true;
+    default: return opc >= OP_EXPAND && opc <= OP_KNN_RERANK;  /* inclusive opcode range */
+    }
+}
+
+/* Run one operator node: cancellation check, progress/profiling hooks for
+ * heavy opcodes, then the actual dispatch in exec_node_inner. */
+ray_t* exec_node(ray_graph_t* g, ray_op_t* op) {
+    if (op == NULL) return ray_error("nyi", NULL);
+
+    /* Cancellation is polled here, once per operator: fused pipelines
+     * re-enter exec_node often enough to notice Ctrl-C between operators
+     * while the per-row hot path stays free of checks. */
+    if (ray_interrupted()) return ray_error("cancel", "interrupted");
+
+    const bool is_heavy = op_is_heavy(op->opcode);
+    /* Decided once, before execution, so the span start/end calls below
+     * always act on the same answer. */
+    const bool span_open = g_ray_profile.active && is_heavy;
+    const char* label = is_heavy ? ray_opcode_name(op->opcode) : NULL;
+
+    /* Heavy ops relabel the progress display but leave its counters
+     * alone: leaf ops that maintain rows_done/rows_total themselves keep
+     * working, and the rest show an indeterminate spinner-style bar until
+     * they finish or publish their own update. */
+    if (is_heavy) ray_progress_label(label, NULL);
+    if (span_open) ray_profile_span_start(label);
+
+    ray_t* node_result = exec_node_inner(g, op);
+
+    if (span_open) ray_profile_span_end(label);
+    return node_result;
+}
+
+static ray_t* exec_node_inner(ray_graph_t* g, ray_op_t* op) {
+    if (!op) return ray_error("nyi", NULL);
+
+    switch (op->opcode) {
+        case OP_SCAN: {
+            ray_op_ext_t* ext = find_ext(g, op->id);
+            if (!ext) return ray_error("nyi", NULL);
+
+            /* Resolve table: pad[0..1] stores table_id+1 (0 = default g->table) */
+            uint16_t stored_table_id = 0;
+            memcpy(&stored_table_id, ext->base.pad, sizeof(uint16_t));
+            ray_t* scan_tbl;
+            if (stored_table_id > 0 && g->tables && (stored_table_id - 1) < g->n_tables) {
+                scan_tbl = g->tables[stored_table_id - 1];
+            } else {
+                scan_tbl = g->table;
+            }
+            if (!scan_tbl) return ray_error("schema", NULL);
+            ray_t* col = ray_table_get_col(scan_tbl, ext->sym);
+            if (!col) return ray_error("schema", NULL);
+            if (col->type == RAY_MAPCOMMON)
+                return materialize_mapcommon(col);
+            if (RAY_IS_PARTED(col->type)) {
+                /* Concat parted segments into flat vector (cold path) */
+                int8_t base = (int8_t)RAY_PARTED_BASETYPE(col->type);
+                ray_t** sps = (ray_t**)ray_data(col);
+                int64_t total = ray_parted_nrows(col);
+
+                /* RAY_STR: deep-copy to handle multi-pool segments */
+                if (base == RAY_STR)
+                    return parted_flatten_str(sps, col->len, total);
+
+                uint8_t sba = (base == RAY_SYM)
+                            ? parted_first_attrs(sps, col->len) : 0;
+                ray_t* flat = typed_vec_new(base, sba, total);
+                if (!flat || RAY_IS_ERR(flat)) return ray_error("oom", NULL);
+                flat->len = total;
+                ray_t** segs = sps;
+                size_t esz = (size_t)ray_sym_elem_size(base, sba);
+                int64_t off = 0;
+                for (int64_t s = 0; s < col->len; s++) {
+                    if (segs[s] && segs[s]->len > 0 &&
+                        parted_seg_esz_ok(segs[s], base, (uint8_t)esz)) {
+                        memcpy((char*)ray_data(flat) + off * esz,
+                               ray_data(segs[s]), (size_t)segs[s]->len * esz);
+                        off += segs[s]->len;
+                    } else if (segs[s] && segs[s]->len > 0) {
+                        memset((char*)ray_data(flat) + off * esz, 0,
+                               (size_t)segs[s]->len * esz);
+                        off += segs[s]->len;
+                    }
+                }
+                return flat;
+            }
+            ray_retain(col);
+            return col;
+        }
+
+        case OP_CONST: {
+            ray_op_ext_t* ext = find_ext(g, op->id);
+            if (!ext || !ext->literal) return ray_error("nyi", NULL);
+            ray_retain(ext->literal);
+            return ext->literal;
+        }
+
+        case OP_TIL: {
+            ray_op_ext_t* ext = find_ext(g, op->id);
+            if (!ext || !ext->literal) return ray_error("nyi", NULL);
+            int64_t n = ext->literal->i64;
+            if (n <= 0) return ray_vec_new(RAY_I64, 0);
+            ray_t* vec = ray_vec_new(RAY_I64, n);
+            if (!vec || RAY_IS_ERR(vec)) return vec;
+            vec->len = n;
+            int64_t* d = (int64_t*)ray_data(vec);
+            for (int64_t i = 0; i < n; i++) d[i] = i;
+            return vec;
+        }
+
+        /* Membership: col IN set_vec */
+        case OP_IN: case OP_NOT_IN: {
+            ray_t* col = exec_node(g, op->inputs[0]);
+            if (!col || RAY_IS_ERR(col)) return col;
+            ray_t* set = exec_node(g, op->inputs[1]);
+            if (!set || RAY_IS_ERR(set)) { ray_release(col); return set; }
+            ray_t* result = exec_in(g, op, col, set);
+            ray_release(col);
+            ray_release(set);
+            return result;
+        }
+
+        /* Unary element-wise */
+        case OP_NEG: case OP_ABS: case OP_NOT: case OP_SQRT:
+        case OP_LOG: case OP_EXP: case OP_CEIL: case OP_FLOOR: case OP_ROUND:
+        case OP_ISNULL: case OP_CAST:
+        /* Binary element-wise */
+        case OP_ADD: case OP_SUB: case OP_MUL: case OP_DIV: case OP_MOD:
+        case OP_EQ: case OP_NE: case OP_LT: case OP_LE:
+        case OP_GT: case OP_GE: case OP_AND: case OP_OR:
+        case OP_MIN2: case OP_MAX2: {
+            /* Try compiled expression first (fuses entire subtree) */
+            if (g->table) {
+                int64_t nr = ray_table_nrows(g->table);
+                if (nr > 0) {
+                    ray_expr_t ex;
+                    if (expr_compile(g, g->table, op, &ex)) {
+                        ray_t* vec = expr_eval_full(&ex, nr);
+                        if (vec && !RAY_IS_ERR(vec)) return vec;
+                    }
+                }
+            }
+            /* Fallback: recursive per-node evaluation */
+            if (op->arity == 1) {
+                ray_t* input = exec_node(g, op->inputs[0]);
+                if (!input || RAY_IS_ERR(input)) return input;
+                ray_t* result = exec_elementwise_unary(g, op, input);
+                ray_release(input);
+                return result;
+            } else {
+                ray_t* lhs = exec_node(g, op->inputs[0]);
+                ray_t* rhs = exec_node(g, op->inputs[1]);
+                if (!lhs || RAY_IS_ERR(lhs)) { if (rhs && !RAY_IS_ERR(rhs)) ray_release(rhs); return lhs; }
+                if (!rhs || RAY_IS_ERR(rhs)) { ray_release(lhs); return rhs; }
+                ray_t* result = exec_elementwise_binary(g, op, lhs, rhs);
+                ray_release(lhs);
+                ray_release(rhs);
+                return result;
+            }
+        }
+
+        /* Reductions */
+        case OP_SUM: case OP_PROD: case OP_MIN: case OP_MAX:
+        case OP_COUNT: case OP_AVG: case OP_FIRST: case OP_LAST:
+        case OP_STDDEV: case OP_STDDEV_POP: case OP_VAR: case OP_VAR_POP: {
+            ray_t* input = exec_node(g, op->inputs[0]);
+            if (!input || RAY_IS_ERR(input)) return input;
+            /* Compact lazy selection before reducing — filters may have
+             * set g->selection without materializing a compacted table. */
+            bool own_input = (input != g->table);
+            if (g->selection && input->type == RAY_TABLE) {
+                ray_t* compacted = sel_compact(g, input, g->selection);
+                if (own_input) ray_release(input);
+                ray_release(g->selection);
+                g->selection = NULL;
+                input = compacted;
+                own_input = true;
+            }
+            ray_t* result = exec_reduction(g, op, input);
+            if (own_input) ray_release(input);
+            return result;
+        }
+
+        case OP_COUNT_DISTINCT: {
+            ray_t* input = exec_node(g, op->inputs[0]);
+            if (!input || RAY_IS_ERR(input)) return input;
+            ray_t* result = exec_count_distinct(g, op, input);
+            ray_release(input);
+            return result;
+        }
+
+        case OP_FILTER: {
+            /* HAVING fusion: FILTER(GROUP) — evaluate the predicate against
+             * the GROUP result rather than the original input table.
+             * SCAN nodes in the predicate tree resolve column names via
+             * g->table, so we temporarily swap it to the GROUP output. */
+            ray_op_t* filter_child = op->inputs[0];
+            if (filter_child && filter_child->opcode == OP_GROUP) {
+                ray_t* group_result = exec_node(g, filter_child);
+                if (!group_result || RAY_IS_ERR(group_result))
+                    return group_result;
+
+                ray_t* saved_table = g->table;
+                ray_t* saved_sel   = g->selection;
+                g->table     = group_result;
+                g->selection = NULL;
+
+                ray_t* pred = exec_node(g, op->inputs[1]);
+
+                g->table     = saved_table;
+                g->selection = saved_sel;
+
+                if (!pred || RAY_IS_ERR(pred)) {
+                    ray_release(group_result);
+                    return pred;
+                }
+
+                ray_t* result = exec_filter(g, op, group_result, pred);
+                ray_release(pred);
+                ray_release(group_result);
+                return result;
+            }
+
+            ray_t* input = exec_node(g, op->inputs[0]);
+            ray_t* pred  = exec_node(g, op->inputs[1]);
+            if (!input || RAY_IS_ERR(input)) { if (pred && !RAY_IS_ERR(pred)) ray_release(pred); return input; }
+            if (!pred || RAY_IS_ERR(pred)) { ray_release(input); return pred; }
+
+            /* Lazy filter: convert predicate to a rowsel (morsel-local
+             * index list) and install on g->selection instead of
+             * materializing a compacted table.  Only for TABLE inputs —
+             * downstream ops (group-by) walk the rowsel directly,
+             * boundary ops (sort/join/window) compact on demand via
+             * sel_compact.  Vector inputs must still materialize
+             * immediately since downstream ops like COUNT rely on
+             * compacted length. */
+            if (pred->type == RAY_BOOL && input->type == RAY_TABLE) {
+                if (g->selection) {
+                    /* Chained filter: refine the existing selection
+                     * with this predicate in one walk. */
+                    ray_t* merged = ray_rowsel_refine(g->selection, pred);
+                    ray_release(pred);
+                    ray_release(g->selection);
+                    g->selection = merged;  /* may be NULL if all-pass */
+                } else {
+                    ray_t* new_sel = ray_rowsel_from_pred(pred);
+                    ray_release(pred);
+                    g->selection = new_sel;  /* may be NULL if all-pass */
+                }
+                return input;  /* original table, not compacted */
+            }
+
+            /* Eager filter for vector inputs and non-BOOL predicates */
+            ray_t* result = exec_filter(g, op, input, pred);
+            ray_release(input);
+            ray_release(pred);
+            return result;
+        }
+
+        case OP_SORT: {
+            ray_t* input = exec_node(g, op->inputs[0]);
+            if (!input || RAY_IS_ERR(input)) return input;
+            ray_t* tbl = (input->type == RAY_TABLE) ? input : g->table;
+            /* Compact lazy selection before sort (needs dense data) */
+            if (g->selection && tbl && !RAY_IS_ERR(tbl) && tbl->type == RAY_TABLE) {
+                ray_t* compacted = sel_compact(g, tbl, g->selection);
+                if (input != g->table) ray_release(input);
+                ray_release(g->selection);
+                g->selection = NULL;
+                input = compacted;
+                tbl = compacted;
+            }
+            ray_t* result = exec_sort(g, op, tbl, 0);
+            if (input != g->table) ray_release(input);
+            return result;
+        }
+
+        case OP_GROUP: {
+            ray_t* tbl = g->table;
+            ray_t* owned_tbl = NULL;
+
+            /* Factorized pipeline: detect OP_EXPAND (factorized) → OP_GROUP.
+             * When the group key is _src and there's a factorized expand node
+             * in the graph, execute the expand first and pipe its output as
+             * the group input table.  This connects the expand→group pipeline
+             * that would otherwise disconnect since GROUP reads g->table. */
+            {
+                ray_op_ext_t* gext = find_ext(g, op->id);
+                if (gext && gext->n_keys == 1) {
+                    ray_op_ext_t* kx = find_ext(g, gext->keys[0]->id);
+                    int64_t src_sym = ray_sym_intern("_src", 4);
+                    if (kx && kx->base.opcode == OP_SCAN && kx->sym == src_sym) {
+                        /* Find the factorized OP_EXPAND connected to this GROUP.
+                         * The expand must be the one whose output the GROUP
+                         * is scanning (connected via OP_SCAN inputs). */
+                        for (uint32_t ei = 0; ei < g->ext_count; ei++) {
+                            ray_op_ext_t* ex = g->ext_nodes[ei];
+                            if (ex && ex->base.id < g->node_count
+                                && g->nodes[ex->base.id].opcode == OP_EXPAND
+                                && ex->graph.factorized) {
+                                ray_op_t* expand_op = &g->nodes[ex->base.id];
+                                ray_t* expand_result = exec_node(g, expand_op);
+                                if (!expand_result || RAY_IS_ERR(expand_result))
+                                    return expand_result;
+                                if (expand_result->type == RAY_TABLE) {
+                                    ray_t* saved = g->table;
+                                    g->table = expand_result;
+                                    ray_t* result = exec_group(g, op, expand_result, 0);
+                                    g->table = saved;
+                                    ray_release(expand_result);
+                                    return result;
+                                }
+                                ray_release(expand_result);
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+
+            /* Lazy selection is consumed by exec_group itself — all
+             * paths (sequential, DA, radix-parallel) honour the
+             * bitmap via group_rows_range / radix scan loops.  We
+             * must still clear g->selection *after* group runs so
+             * downstream ops (SORT etc.) don't try to sel_compact the
+             * aggregated output with a mismatched-length bitmap. */
+            ray_t* result = exec_group(g, op, tbl, 0);
+            if (owned_tbl) ray_release(owned_tbl);
+            if (g->selection) {
+                ray_release(g->selection);
+                g->selection = NULL;
+            }
+            return result;
+        }
+
+        case OP_PIVOT: {
+            ray_t* tbl = g->table;
+            ray_t* owned_tbl = NULL;
+            if (g->selection) {
+                ray_t* compacted = sel_compact(g, tbl, g->selection);
+                if (!compacted || RAY_IS_ERR(compacted)) return compacted;
+                ray_release(g->selection);
+                g->selection = NULL;
+                owned_tbl = compacted;
+                tbl = compacted;
+            }
+            ray_t* result = exec_pivot(g, op, tbl);
+            if (owned_tbl) ray_release(owned_tbl);
+            return result;
+        }
+
+        case OP_JOIN: {
+            ray_t* left = exec_node(g, op->inputs[0]);
+            ray_t* right = exec_node(g, op->inputs[1]);
+            if (!left || RAY_IS_ERR(left)) { if (right && !RAY_IS_ERR(right)) ray_release(right); return left; }
+            if (!right || RAY_IS_ERR(right)) { ray_release(left); return right; }
+            /* Compact lazy selection before join (needs dense data) */
+            if (g->selection && left && !RAY_IS_ERR(left) && left->type == RAY_TABLE) {
+                ray_t* compacted = sel_compact(g, left, g->selection);
+                ray_release(left);
+                ray_release(g->selection);
+                g->selection = NULL;
+                left = compacted;
+            }
+            ray_t* result = exec_join(g, op, left, right);
+            ray_release(left);
+            ray_release(right);
+            return result;
+        }
+
+        case OP_ANTIJOIN: {
+            ray_t* left = exec_node(g, op->inputs[0]);
+            ray_t* right = exec_node(g, op->inputs[1]);
+            if (!left || RAY_IS_ERR(left)) { if (right && !RAY_IS_ERR(right)) ray_release(right); return left; }
+            if (!right || RAY_IS_ERR(right)) { ray_release(left); return right; }
+            if (g->selection && left && !RAY_IS_ERR(left) && left->type == RAY_TABLE) {
+                ray_t* compacted = sel_compact(g, left, g->selection);
+                ray_release(left);
+                ray_release(g->selection);
+                g->selection = NULL;
+                left = compacted;
+            }
+            ray_t* result = exec_antijoin(g, op, left, right);
+            ray_release(left);
+            ray_release(right);
+            return result;
+        }
+
+        case OP_WINDOW_JOIN: {
+            ray_t* left = exec_node(g, op->inputs[0]);
+            ray_t* right = exec_node(g, op->inputs[1]);
+            if (!left || RAY_IS_ERR(left)) { if (right && !RAY_IS_ERR(right)) ray_release(right); return left; }
+            if (!right || RAY_IS_ERR(right)) { ray_release(left); return right; }
+            if (g->selection && left && !RAY_IS_ERR(left) && left->type == RAY_TABLE) {
+                ray_t* compacted = sel_compact(g, left, g->selection);
+                ray_release(left);
+                ray_release(g->selection);
+                g->selection = NULL;
+                left = compacted;
+            }
+            ray_t* result = exec_window_join(g, op, left, right);
+            ray_release(left);
+            ray_release(right);
+            return result;
+        }
+
+        case OP_WINDOW: {
+            ray_t* input = exec_node(g, op->inputs[0]);
+            if (!input || RAY_IS_ERR(input)) return input;
+            ray_t* wdf = (input->type == RAY_TABLE) ? input : g->table;
+            /* Compact lazy selection before window (needs dense data) */
+            if (g->selection && wdf && !RAY_IS_ERR(wdf) && wdf->type == RAY_TABLE) {
+                ray_t* compacted = sel_compact(g, wdf, g->selection);
+                if (input != g->table) ray_release(input);
+                ray_release(g->selection);
+                g->selection = NULL;
+                input = compacted;
+                wdf = compacted;
+            }
+            ray_t* result = exec_window(g, op, wdf);
+            if (input != g->table) ray_release(input);
+            return result;
+        }
+
+        case OP_HEAD: {
+            ray_op_ext_t* ext = find_ext(g, op->id);
+            int64_t n = ext ? ext->sym : 10;
+
+            /* Fused sort+limit: detect SORT child → only gather N rows */
+            ray_op_t* child_op = op->inputs[0];
+            if (child_op && child_op->opcode == OP_SORT) {
+                ray_t* sort_input = exec_node(g, child_op->inputs[0]);
+                if (!sort_input || RAY_IS_ERR(sort_input)) return sort_input;
+                ray_t* tbl = (sort_input->type == RAY_TABLE) ? sort_input : g->table;
+                /* Compact lazy selection before sort */
+                if (g->selection && tbl && !RAY_IS_ERR(tbl) && tbl->type == RAY_TABLE) {
+                    ray_t* compacted = sel_compact(g, tbl, g->selection);
+                    if (sort_input != g->table) ray_release(sort_input);
+                    ray_release(g->selection);
+                    g->selection = NULL;
+                    sort_input = compacted;
+                    tbl = compacted;
+                }
+                ray_t* result = exec_sort(g, child_op, tbl, n);
+                if (sort_input != g->table) ray_release(sort_input);
+                return result;
+            }
+
+            /* HEAD(GROUP) optimization: pass limit hint to exec_group
+             * so it can short-circuit the per-partition loop when all
+             * GROUP BY keys are MAPCOMMON.  The normal HEAD logic below
+             * still trims the result to N rows regardless. */
+            ray_t* input;
+            if (child_op && child_op->opcode == OP_GROUP) {
+                ray_t* tbl = g->table;
+                if (!tbl || RAY_IS_ERR(tbl)) return tbl;
+                ray_t* owned_tbl = NULL;
+                if (g->selection && tbl->type == RAY_TABLE) {
+                    int needs = 0;
+                    int64_t nc = ray_table_ncols(tbl);
+                    for (int64_t c = 0; c < nc; c++) {
+                        ray_t* col = ray_table_get_col_idx(tbl, c);
+                        if (col && !RAY_IS_PARTED(col->type)
+                            && col->type != RAY_MAPCOMMON) {
+                            needs = 1; break;
+                        }
+                    }
+                    if (needs) {
+                        ray_t* compacted = sel_compact(g, tbl, g->selection);
+                        if (!compacted || RAY_IS_ERR(compacted)) return compacted;
+                        ray_release(g->selection);
+                        g->selection = NULL;
+                        owned_tbl = compacted;
+                        tbl = compacted;
+                    }
+                }
+                input = exec_group(g, child_op, tbl, n);
+                if (owned_tbl) ray_release(owned_tbl);
+            } else if (child_op && child_op->opcode == OP_FILTER) {
+                /* HEAD(FILTER): early-termination filter — gather only
+                 * the first N matching rows instead of all matches. */
+                ray_t* filter_input = exec_node(g, child_op->inputs[0]);
+                if (!filter_input || RAY_IS_ERR(filter_input))
+                    return filter_input;
+
+                /* Compact lazy selection before filter evaluation */
+                ray_t* ftbl = (filter_input->type == RAY_TABLE)
+                           ? filter_input : g->table;
+                if (g->selection && ftbl && ftbl->type == RAY_TABLE) {
+                    ray_t* compacted = sel_compact(g, ftbl, g->selection);
+                    if (filter_input != g->table) ray_release(filter_input);
+                    ray_release(g->selection);
+                    g->selection = NULL;
+                    filter_input = compacted;
+                    ftbl = compacted;
+                }
+
+                /* Swap table for predicate evaluation */
+                ray_t* saved_table = g->table;
+                g->table = ftbl;
+                ray_t* pred = exec_node(g, child_op->inputs[1]);
+                g->table = saved_table;
+
+                if (!pred || RAY_IS_ERR(pred)) {
+                    if (filter_input != saved_table)
+                        ray_release(filter_input);
+                    return pred;
+                }
+
+                ray_t* result = exec_filter_head(ftbl, pred, n);
+                ray_release(pred);
+                if (filter_input != saved_table)
+                    ray_release(filter_input);
+                return result;
+            } else {
+                input = exec_node(g, op->inputs[0]);
+            }
+            if (!input || RAY_IS_ERR(input)) return input;
+            if (input->type == RAY_TABLE) {
+                int64_t ncols = ray_table_ncols(input);
+                int64_t nrows = ray_table_nrows(input);
+                if (n > nrows) n = nrows;
+                ray_t* result = ray_table_new(ncols);
+                for (int64_t c = 0; c < ncols; c++) {
+                    ray_t* col = ray_table_get_col_idx(input, c);
+                    int64_t name_id = ray_table_col_name(input, c);
+                    if (!col) continue;
+                    if (col->type == RAY_MAPCOMMON) {
+                        ray_t* mc_head = materialize_mapcommon_head(col, n);
+                        if (mc_head && !RAY_IS_ERR(mc_head)) {
+                            result = ray_table_add_col(result, name_id, mc_head);
+                            ray_release(mc_head);
+                        }
+                        continue;
+                    }
+                    if (RAY_IS_PARTED(col->type)) {
+                        /* Copy first n rows from parted segments */
+                        int8_t base = (int8_t)RAY_PARTED_BASETYPE(col->type);
+                        ray_t** sp = (ray_t**)ray_data(col);
+                        ray_t* head_vec;
+                        if (base == RAY_STR) {
+                            head_vec = parted_head_str(sp, col->len, n);
+                        } else {
+                            uint8_t ba = (base == RAY_SYM)
+                                       ? parted_first_attrs(sp, col->len) : 0;
+                            uint8_t esz = ray_sym_elem_size(base, ba);
+                            head_vec = typed_vec_new(base, ba, n);
+                            if (head_vec && !RAY_IS_ERR(head_vec)) {
+                                head_vec->len = n;
+                                ray_t** segs = (ray_t**)ray_data(col);
+                                int64_t remaining = n;
+                                int64_t dst_off = 0;
+                                for (int64_t s = 0; s < col->len && remaining > 0; s++) {
+                                    if (!segs[s]) continue;
+                                    int64_t take = segs[s]->len;
+                                    if (take > remaining) take = remaining;
+                                    if (parted_seg_esz_ok(segs[s], base, esz)) {
+                                        memcpy((char*)ray_data(head_vec) + dst_off * esz,
+                                               ray_data(segs[s]), (size_t)take * esz);
+                                    } else {
+                                        memset((char*)ray_data(head_vec) + dst_off * esz,
+                                               0, (size_t)take * esz);
+                                    }
+                                    dst_off += take;
+                                    remaining -= take;
+                                }
+                            }
+                        }
+                        result = ray_table_add_col(result, name_id, head_vec);
+                        ray_release(head_vec);
+                    } else {
+                        /* Flat column: direct copy */
+                        uint8_t esz = col_esz(col);
+                        ray_t* head_vec = col_vec_new(col, n);
+                        if (head_vec && !RAY_IS_ERR(head_vec)) {
+                            head_vec->len = n;
+                            memcpy(ray_data(head_vec), ray_data(col),
+                                   (size_t)n * esz);
+                            col_propagate_nulls_range(head_vec, 0, col, 0, n);
+                        }
+                        result = ray_table_add_col(result, name_id, head_vec);
+                        ray_release(head_vec);
+                    }
+                }
+                ray_release(input);
+                return result;
+            }
+            if (n > input->len) n = input->len;
+            /* Materialized copy for vector head */
+            uint8_t esz = col_esz(input);
+            ray_t* result = col_vec_new(input, n);
+            if (result && !RAY_IS_ERR(result)) {
+                result->len = n;
+                memcpy(ray_data(result), ray_data(input), (size_t)n * esz);
+                col_propagate_nulls_range(result, 0, input, 0, n);
+            }
+            ray_release(input);
+            return result;
+        }
+
+        case OP_TAIL: {
+            ray_op_ext_t* ext = find_ext(g, op->id);
+            ray_t* input = exec_node(g, op->inputs[0]);
+            if (!input || RAY_IS_ERR(input)) return input;
+            int64_t n = ext ? ext->sym : 10;
+            if (input->type == RAY_TABLE) {
+                int64_t ncols = ray_table_ncols(input);
+                int64_t nrows = ray_table_nrows(input);
+                if (n > nrows) n = nrows;
+                int64_t skip = nrows - n;
+                ray_t* result = ray_table_new(ncols);
+                for (int64_t c = 0; c < ncols; c++) {
+                    ray_t* col = ray_table_get_col_idx(input, c);
+                    int64_t name_id = ray_table_col_name(input, c);
+                    if (!col) continue;
+                    if (col->type == RAY_MAPCOMMON) {
+                        /* Materialize last N rows from MAPCOMMON partitions */
+                        ray_t** mc_ptrs = (ray_t**)ray_data(col);
+                        ray_t* kv = mc_ptrs[0];
+                        ray_t* rc = mc_ptrs[1];
+                        int64_t n_parts = kv->len;
+                        size_t esz = (size_t)col_esz(kv);
+                        const char* kdata = (const char*)ray_data(kv);
+                        const int64_t* counts = (const int64_t*)ray_data(rc);
+                        ray_t* flat = col_vec_new(kv, n);
+                        if (flat && !RAY_IS_ERR(flat)) {
+                            flat->len = n;
+                            char* out = (char*)ray_data(flat);
+                            /* Walk partitions from end, fill output from end */
+                            int64_t remaining = n;
+                            int64_t dst = n;
+                            for (int64_t p = n_parts - 1; p >= 0 && remaining > 0; p--) {
+                                int64_t take = counts[p];
+                                if (take > remaining) take = remaining;
+                                dst -= take;
+                                for (int64_t r = 0; r < take; r++)
+                                    memcpy(out + (dst + r) * esz, kdata + (size_t)p * esz, esz);
+                                remaining -= take;
+                            }
+                        }
+                        result = ray_table_add_col(result, name_id, flat);
+                        ray_release(flat);
+                        continue;
+                    }
+                    if (RAY_IS_PARTED(col->type)) {
+                        /* Copy last N rows from parted segments */
+                        int8_t base = (int8_t)RAY_PARTED_BASETYPE(col->type);
+                        ray_t** tsp = (ray_t**)ray_data(col);
+                        ray_t* tail_vec;
+                        if (base == RAY_STR) {
+                            tail_vec = parted_tail_str(tsp, col->len, n);
+                        } else {
+                            uint8_t tba = (base == RAY_SYM)
+                                        ? parted_first_attrs(tsp, col->len) : 0;
+                            uint8_t esz = ray_sym_elem_size(base, tba);
+                            tail_vec = typed_vec_new(base, tba, n);
+                            if (tail_vec && !RAY_IS_ERR(tail_vec)) {
+                                tail_vec->len = n;
+                                ray_t** segs = (ray_t**)ray_data(col);
+                                int64_t remaining = n;
+                                int64_t dst = n;
+                                for (int64_t s = col->len - 1; s >= 0 && remaining > 0; s--) {
+                                    if (!segs[s]) continue;
+                                    int64_t take = segs[s]->len;
+                                    if (take > remaining) take = remaining;
+                                    dst -= take;
+                                    if (parted_seg_esz_ok(segs[s], base, esz)) {
+                                        memcpy((char*)ray_data(tail_vec) + (size_t)dst * esz,
+                                               (char*)ray_data(segs[s]) + (size_t)(segs[s]->len - take) * esz,
+                                               (size_t)take * esz);
+                                    } else {
+                                        memset((char*)ray_data(tail_vec) + (size_t)dst * esz,
+                                               0, (size_t)take * esz);
+                                    }
+                                    remaining -= take;
+                                }
+                            }
+                        }
+                        result = ray_table_add_col(result, name_id, tail_vec);
+                        ray_release(tail_vec);
+                    } else {
+                        /* Flat column: direct copy */
+                        uint8_t esz = col_esz(col);
+                        ray_t* tail_vec = col_vec_new(col, n);
+                        if (tail_vec && !RAY_IS_ERR(tail_vec)) {
+                            tail_vec->len = n;
+                            memcpy(ray_data(tail_vec),
+                                   (char*)ray_data(col) + (size_t)skip * esz,
+                                   (size_t)n * esz);
+                            col_propagate_nulls_range(tail_vec, 0, col, skip, n);
+                        }
+                        result = ray_table_add_col(result, name_id, tail_vec);
+                        ray_release(tail_vec);
+                    }
+                }
+                ray_release(input);
+                return result;
+            }
+            if (n > input->len) n = input->len;
+            int64_t skip = input->len - n;
+            uint8_t esz = col_esz(input);
+            ray_t* result = col_vec_new(input, n);
+            if (result && !RAY_IS_ERR(result)) {
+                result->len = n;
+                memcpy(ray_data(result),
+                       (char*)ray_data(input) + (size_t)skip * esz,
+                       (size_t)n * esz);
+                col_propagate_nulls_range(result, 0, input, skip, n);
+            }
+            ray_release(input);
+            return result;
+        }
+
+        case OP_IF: {
+            return exec_if(g, op);
+        }
+
+        case OP_LIKE: {
+            return exec_like(g, op);
+        }
+
+        case OP_ILIKE: {
+            return exec_ilike(g, op);
+        }
+
+        case OP_UPPER: case OP_LOWER: case OP_TRIM: {
+            return exec_string_unary(g, op);
+        }
+        case OP_STRLEN: {
+            return exec_strlen(g, op);
+        }
+        case OP_SUBSTR: {
+            return exec_substr(g, op);
+        }
+        case OP_REPLACE: {
+            return exec_replace(g, op);
+        }
+        case OP_CONCAT: {
+            return exec_concat(g, op);
+        }
+
+        case OP_EXTRACT: {
+            return exec_extract(g, op);
+        }
+
+        case OP_DATE_TRUNC: {
+            return exec_date_trunc(g, op);
+        }
+
+        case OP_ALIAS: {
+            return exec_node(g, op->inputs[0]);
+        }
+
+        case OP_MATERIALIZE: {
+            return exec_node(g, op->inputs[0]);
+        }
+
+        case OP_SELECT: {
+            /* Column projection: select/compute columns from input table */
+            ray_t* input = exec_node(g, op->inputs[0]);
+            if (!input || RAY_IS_ERR(input)) return input;
+            if (input->type != RAY_TABLE) {
+                ray_release(input);
+                return ray_error("nyi", NULL);
+            }
+            ray_op_ext_t* ext = find_ext(g, op->id);
+            if (!ext) { ray_release(input); return ray_error("nyi", NULL); }
+            uint8_t n_cols = ext->sort.n_cols;
+            ray_op_t** columns = ext->sort.columns;
+            ray_t* result = ray_table_new(n_cols);
+
+            /* Set g->table so SCAN nodes inside expressions resolve correctly */
+            ray_t* saved_table = g->table;
+            g->table = input;
+
+            for (uint8_t c = 0; c < n_cols; c++) {
+                if (columns[c]->opcode == OP_SCAN) {
+                    /* Direct column reference — copy from input table */
+                    ray_op_ext_t* col_ext = find_ext(g, columns[c]->id);
+                    if (!col_ext) continue;
+                    int64_t name_id = col_ext->sym;
+                    ray_t* src_col = ray_table_get_col(input, name_id);
+                    if (src_col) {
+                        ray_retain(src_col);
+                        result = ray_table_add_col(result, name_id, src_col);
+                        ray_release(src_col);
+                    }
+                } else {
+                    /* Expression column — evaluate against input table */
+                    ray_t* vec = exec_node(g, columns[c]);
+                    if (!vec || RAY_IS_ERR(vec)) {
+                        ray_release(result);
+                        g->table = saved_table;
+                        ray_release(input);
+                        return vec ? vec : ray_error("nyi", NULL);
+                    }
+                    /* Broadcast scalar atoms to full column vectors */
+                    if (vec->type < 0) {
+                        int64_t nr = ray_table_nrows(input);
+                        ray_t* col = broadcast_scalar(vec, nr);
+                        ray_release(vec);
+                        vec = col;
+                        if (!vec || RAY_IS_ERR(vec)) {
+                            ray_release(result);
+                            g->table = saved_table;
+                            ray_release(input);
+                            return vec ? vec : ray_error("nyi", NULL);
+                        }
+                    }
+                    /* Synthetic name: _expr_0, _expr_1, ... */
+                    char name_buf[16];
+                    int n = 0;
+                    name_buf[n++] = '_'; name_buf[n++] = 'e';
+                    if (c >= 100) name_buf[n++] = '0' + (c / 100);
+                    if (c >= 10)  name_buf[n++] = '0' + ((c / 10) % 10);
+                    name_buf[n++] = '0' + (c % 10);
+                    int64_t name_id = ray_sym_intern(name_buf, (size_t)n);
+                    result = ray_table_add_col(result, name_id, vec);
+                    ray_release(vec);
+                }
+            }
+
+            g->table = saved_table;
+            ray_release(input);
+            return result;
+        }
+
+        case OP_EXPAND: {
+            ray_t* src = exec_node(g, op->inputs[0]);
+            if (!src || RAY_IS_ERR(src)) return src;
+            ray_t* result = exec_expand(g, op, src);
+            ray_release(src);
+            return result;
+        }
+
+        case OP_VAR_EXPAND: {
+            ray_t* start = exec_node(g, op->inputs[0]);
+            if (!start || RAY_IS_ERR(start)) return start;
+            ray_t* result = exec_var_expand(g, op, start);
+            ray_release(start);
+            return result;
+        }
+
+        case OP_SHORTEST_PATH: {
+            ray_t* src = exec_node(g, op->inputs[0]);
+            ray_t* dst = exec_node(g, op->inputs[1]);
+            if (!src || RAY_IS_ERR(src)) {
+                if (dst && !RAY_IS_ERR(dst)) ray_release(dst);
+                return src;
+            }
+            if (!dst || RAY_IS_ERR(dst)) { ray_release(src); return dst; }
+            ray_t* result = exec_shortest_path(g, op, src, dst);
+            ray_release(src);
+            ray_release(dst);
+            return result;
+        }
+
+        case OP_WCO_JOIN: {
+            return exec_wco_join(g, op);
+        }
+
+        case OP_PAGERANK: {
+            return exec_pagerank(g, op);
+        }
+
+        case OP_CONNECTED_COMP: {
+            return exec_connected_comp(g, op);
+        }
+
+        case OP_DIJKSTRA: {
+            ray_t* src = exec_node(g, op->inputs[0]);
+            if (!src || RAY_IS_ERR(src)) return src;
+            ray_t* dst = op->inputs[1] ? exec_node(g, op->inputs[1]) : NULL;
+            if (dst && RAY_IS_ERR(dst)) { ray_release(src); return dst; }
+            ray_t* result = exec_dijkstra(g, op, src, dst);
+            ray_release(src);
+            if (dst) ray_release(dst);
+            return result;
+        }
+
+        case OP_LOUVAIN: {
+            return exec_louvain(g, op);
+        }
+
+        case OP_DEGREE_CENT: {
+            return exec_degree_cent(g, op);
+        }
+
+        case OP_TOPSORT: {
+            return exec_topsort(g, op);
+        }
+
+        case OP_DFS: {
+            ray_t* src = exec_node(g, op->inputs[0]);
+            if (!src || RAY_IS_ERR(src)) return src;
+            ray_t* result = exec_dfs(g, op, src);
+            ray_release(src);
+            return result;
+        }
+
+        case OP_CLUSTER_COEFF: {
+            return exec_cluster_coeff(g, op);
+        }
+
+        case OP_BETWEENNESS: {
+            return exec_betweenness(g, op);
+        }
+
+        case OP_CLOSENESS: {
+            return exec_closeness(g, op);
+        }
+
+        case OP_MST: {
+            return exec_mst(g, op);
+        }
+
+        case OP_RANDOM_WALK: {
+            ray_t* src = exec_node(g, op->inputs[0]);
+            if (!src || RAY_IS_ERR(src)) return src;
+            ray_t* result = exec_random_walk(g, op, src);
+            ray_release(src);
+            return result;
+        }
+
+        case OP_ASTAR: {
+            ray_t* src = exec_node(g, op->inputs[0]);
+            if (!src || RAY_IS_ERR(src)) return src;
+            ray_t* dst = exec_node(g, op->inputs[1]);
+            if (!dst || RAY_IS_ERR(dst)) { ray_release(src); return dst; }
+            ray_t* result = exec_astar(g, op, src, dst);
+            ray_release(src); ray_release(dst);
+            return result;
+        }
+
+        case OP_K_SHORTEST: {
+            ray_t* src = exec_node(g, op->inputs[0]);
+            if (!src || RAY_IS_ERR(src)) return src;
+            ray_t* dst = exec_node(g, op->inputs[1]);
+            if (!dst || RAY_IS_ERR(dst)) { ray_release(src); return dst; }
+            ray_t* result = exec_k_shortest(g, op, src, dst);
+            ray_release(src); ray_release(dst);
+            return result;
+        }
+
+        case OP_COSINE_SIM: {
+            ray_t* emb = exec_node(g, op->inputs[0]);
+            if (!emb || RAY_IS_ERR(emb)) return emb;
+            ray_t* result = exec_cosine_sim(g, op, emb);
+            ray_release(emb);
+            return result;
+        }
+        case OP_EUCLIDEAN_DIST: {
+            ray_t* emb = exec_node(g, op->inputs[0]);
+            if (!emb || RAY_IS_ERR(emb)) return emb;
+            ray_t* result = exec_euclidean_dist(g, op, emb);
+            ray_release(emb);
+            return result;
+        }
+        case OP_KNN: {
+            ray_t* emb = exec_node(g, op->inputs[0]);
+            if (!emb || RAY_IS_ERR(emb)) return emb;
+            ray_t* result = exec_knn(g, op, emb);
+            ray_release(emb);
+            return result;
+        }
+        case OP_HNSW_KNN: {
+            return exec_hnsw_knn(g, op);
+        }
+        case OP_ANN_RERANK: {
+            ray_t* src = exec_node(g, op->inputs[0]);
+            if (!src || RAY_IS_ERR(src)) return src;
+            ray_t* result = exec_ann_rerank(g, op, src);
+            ray_release(src);
+            return result;
+        }
+        case OP_KNN_RERANK: {
+            ray_t* src = exec_node(g, op->inputs[0]);
+            if (!src || RAY_IS_ERR(src)) return src;
+            ray_t* result = exec_knn_rerank(g, op, src);
+            ray_release(src);
+            return result;
+        }
+
+        default:
+            return ray_error("nyi", NULL);
+    }
+}
+
+/* ============================================================================
+ * ray_execute -- top-level entry point (lazy pool init)
+ * ============================================================================ */
+
+/* Fold one per-segment result into the accumulator.  If either side is
+ * NULL/error the other is returned (retained when valid); otherwise tables
+ * merge column-wise, non-tables via ray_vec_concat, mixed is a "type" error. */
+static ray_t* ray_result_merge(ray_t* accum, ray_t* partial) {
+    bool have_accum   = accum   && !RAY_IS_ERR(accum);
+    bool have_partial = partial && !RAY_IS_ERR(partial);
+    if (!have_accum) {
+        if (have_partial) ray_retain(partial);
+        return partial;
+    }
+    if (!have_partial) {
+        ray_retain(accum);
+        return accum;
+    }
+
+    bool accum_is_tbl   = (accum->type == RAY_TABLE);
+    bool partial_is_tbl = (partial->type == RAY_TABLE);
+    /* A table paired with a non-table cannot be concatenated. */
+    if (accum_is_tbl != partial_is_tbl) return ray_error("type", NULL);
+    /* Two non-table values: concatenate them directly. */
+    if (!accum_is_tbl) return ray_vec_concat(accum, partial);
+    /* Two tables: column count and names come from accum, data from both. */
+    int64_t width = ray_table_ncols(accum);
+    ray_t* out = ray_table_new(width);
+    for (int64_t i = 0; i < width; i++) {
+        int64_t col_name = ray_table_col_name(accum, i);
+        ray_t* lhs = ray_table_get_col_idx(accum, i);
+        ray_t* rhs = ray_table_get_col_idx(partial, i);
+        if (!lhs || !rhs) {
+            ray_release(out);
+            return ray_error("schema", NULL);
+        }
+        ray_t* joined = ray_vec_concat(lhs, rhs);
+        if (!joined || RAY_IS_ERR(joined)) {
+            ray_release(out);
+            return joined;
+        }
+        out = ray_table_add_col(out, col_name, joined);
+        ray_release(joined);
+    }
+    return out;
+}
+
+/* Build a flat table holding segment `seg_idx` of each column of a parted table.
+ * Parted columns contribute segs[seg_idx] itself (retained and shared, not
+ * copied).  MAPCOMMON columns are materialized: the key value at index seg_idx
+ * is broadcast into a new vector of seg_rows elements.  NULL columns are skipped.
+ * Any other column kind makes the call fail with a "schema" error. */
+static ray_t* build_segment_table(ray_t* parted_tbl, int32_t seg_idx) {
+    int64_t ncols = ray_table_ncols(parted_tbl);
+    ray_t* seg_tbl = ray_table_new(ncols);
+    if (!seg_tbl || RAY_IS_ERR(seg_tbl)) return seg_tbl;
+
+    /* Segment row count = length of segs[seg_idx] in the first parted column (stays 0 if that slot is absent) */
+    int64_t seg_rows = 0;
+    for (int64_t c = 0; c < ncols; c++) {
+        ray_t* col = ray_table_get_col_idx(parted_tbl, c);
+        if (col && RAY_IS_PARTED(col->type)) {
+            ray_t** segs = (ray_t**)ray_data(col);
+            if (seg_idx < col->len && segs[seg_idx])
+                seg_rows = segs[seg_idx]->len;
+            break;  /* only the first parted column is consulted */
+        }
+    }
+
+    for (int64_t c = 0; c < ncols; c++) {
+        int64_t name_id = ray_table_col_name(parted_tbl, c);
+        ray_t* col = ray_table_get_col_idx(parted_tbl, c);
+        if (!col) continue;
+
+        if (col->type == RAY_MAPCOMMON) {
+            /* Materialize the partition key for this segment: copy the key
+             * value at seg_idx into every one of seg_rows output slots. */
+            if (col->len < 2) {  /* fewer than two entries is treated as malformed */
+                ray_release(seg_tbl);
+                return ray_error("schema", NULL);
+            }
+            ray_t** mc_ptrs = (ray_t**)ray_data(col);
+            ray_t* kv = mc_ptrs[0];  /* key_values */
+            if (!kv || seg_idx >= kv->len) {  /* no key recorded for this segment */
+                ray_release(seg_tbl);
+                return ray_error("schema", NULL);
+            }
+            int8_t kv_type = kv->type;
+            size_t esz = (size_t)ray_sym_elem_size(kv_type, kv->attrs);
+            if (esz == 0) {  /* element size unknown for this key type */
+                ray_release(seg_tbl);
+                return ray_error("type", NULL);
+            }
+            ray_t* flat = ray_vec_new(kv_type, seg_rows);
+            if (!flat || RAY_IS_ERR(flat)) {
+                ray_release(seg_tbl);
+                return ray_error("oom", NULL);
+            }
+            flat->len = seg_rows;
+            const char* src = (const char*)ray_data(kv) + (size_t)seg_idx * esz;
+            char* dst = (char*)ray_data(flat);
+            if (esz == 8) {  /* 8-byte keys: load once, store as uint64_t per row */
+                uint64_t v; memcpy(&v, src, 8);
+                for (int64_t r = 0; r < seg_rows; r++)
+                    ((uint64_t*)dst)[r] = v;
+            } else if (esz == 4) {  /* 4-byte keys: same idea with uint32_t */
+                uint32_t v; memcpy(&v, src, 4);
+                for (int64_t r = 0; r < seg_rows; r++)
+                    ((uint32_t*)dst)[r] = v;
+            } else {  /* any other width: byte-wise copy per row */
+                for (int64_t r = 0; r < seg_rows; r++)
+                    memcpy(dst + r * esz, src, esz);
+            }
+            seg_tbl = ray_table_add_col(seg_tbl, name_id, flat);
+            ray_release(flat);  /* drop the local reference after adding the column */
+        } else if (RAY_IS_PARTED(col->type)) {
+            ray_t** segs = (ray_t**)ray_data(col);
+            if (seg_idx >= col->len || !segs[seg_idx]) {
+                ray_release(seg_tbl);
+                return ray_error("schema", NULL);
+            }
+            ray_retain(segs[seg_idx]);  /* balanced by the release after add_col */
+            seg_tbl = ray_table_add_col(seg_tbl, name_id, segs[seg_idx]);
+            ray_release(segs[seg_idx]);
+        } else {
+            /* Non-parted, non-MAPCOMMON column in a parted table:
+             * ray_execute_inner() is expected to have disabled streaming already.
+             * Error here as defense-in-depth to avoid silent duplication. */
+            ray_release(seg_tbl);
+            return ray_error("schema", NULL);
+        }
+    }
+    return seg_tbl;
+}
+
+/* Whitelist of opcodes whose per-segment outputs may simply be concatenated.
+ * Membership is limited to scans, element-wise operators, filter, projection
+ * and pass-through nodes; any opcode not listed (joins, aggregations, sorts,
+ * graph ops, ...) needs a specialized merge or global state and is reported
+ * as non-streamable. */
+static bool op_streamable(uint16_t opc) {
+    bool streamable = false;
+    switch (opc) {
+        /* Data access.  OP_CONST is left out on purpose: vector constants
+         * span all rows and would mismatch per-segment lengths; scalar
+         * constants are handled separately by the dag_can_stream walk. */
+        case OP_SCAN:
+        /* Unary element-wise */
+        case OP_NEG: case OP_ABS: case OP_NOT: case OP_ISNULL: case OP_CAST:
+        case OP_SQRT: case OP_LOG: case OP_EXP:
+        case OP_CEIL: case OP_FLOOR: case OP_ROUND:
+        /* Binary arithmetic */
+        case OP_ADD: case OP_SUB: case OP_MUL: case OP_DIV: case OP_MOD:
+        case OP_MIN2: case OP_MAX2:
+        /* Comparison, logic, membership, conditional */
+        case OP_EQ: case OP_NE: case OP_LT: case OP_LE: case OP_GT: case OP_GE:
+        case OP_AND: case OP_OR: case OP_IN: case OP_NOT_IN: case OP_IF:
+        /* String element-wise */
+        case OP_LIKE: case OP_ILIKE: case OP_UPPER: case OP_LOWER: case OP_TRIM:
+        case OP_STRLEN: case OP_SUBSTR: case OP_REPLACE: case OP_CONCAT:
+        /* Temporal element-wise */
+        case OP_EXTRACT: case OP_DATE_TRUNC:
+        /* Structural / pass-through */
+        case OP_FILTER: case OP_SELECT: case OP_ALIAS: case OP_MATERIALIZE:
+            streamable = true;
+            break;
+    }
+    return streamable;
+}
+
+/* Recursively walk the DAG below `op`, checking streamability and looking
+ * for a default-table OP_SCAN.  Returns true if this call reached one.
+ * Sets *ok = false on a vector constant, a secondary-table scan, or any
+ * opcode rejected by op_streamable().  `visited` is a bitmap indexed by
+ * node id; an already-marked node returns false without being re-walked.
+ *
+ * Only inputs[0..1] are followed directly; further operands live in ext nodes:
+ *   OP_SELECT  — ext->sort.columns[0..n_cols-1]
+ *   OP_IF / OP_SUBSTR / OP_REPLACE — node index stored in ext->literal
+ *     (else branch, length arg and replacement, respectively)
+ *   OP_CONCAT  — args 2+:    g->nodes[trail[i-2]] (uint32_t[] after ext) */
+static bool subtree_has_default_scan(ray_graph_t* g, ray_op_t* op, bool* ok,
+                                     uint64_t* visited) {
+    if (!op || !*ok) return false;  /* stop early once the DAG has been rejected */
+    /* Skip already-visited nodes (DAGs may share subexpressions). */
+    uint32_t nid = op->id;
+    if (nid < g->node_count) {  /* ids outside the bitmap are walked without tracking */
+        if (visited[nid / 64] & (1ULL << (nid % 64))) return false;
+        visited[nid / 64] |= (1ULL << (nid % 64));
+    }
+    uint16_t opc = op->opcode;
+    if (opc == OP_CONST) {
+        ray_op_ext_t* ext = find_ext(g, op->id);
+        if (ext && ext->literal && !ray_is_atom(ext->literal))
+            *ok = false;           /* vector constant — can't stream */
+        return false;              /* a constant never counts as a scan */
+    }
+    if (opc == OP_SCAN) {
+        ray_op_ext_t* ext = find_ext(g, op->id);
+        if (ext) {
+            uint16_t stored_id = 0;
+            memcpy(&stored_id, ext->base.pad, sizeof(uint16_t));  /* stored table id: first two bytes of pad */
+            if (stored_id > 0) { *ok = false; return false; }  /* secondary-table scan */
+            return true;           /* default-table scan */
+        }
+        return false;              /* no ext: not counted, but not rejected either */
+    }
+    if (!op_streamable(opc)) { *ok = false; return false; }
+    bool found = false;
+    for (uint8_t i = 0; i < op->arity && i < 2; i++)  /* at most two direct inputs */
+        found |= subtree_has_default_scan(g, op->inputs[i], ok, visited);
+
+    /* Walk hidden operands stored in ext nodes */
+    if (opc == OP_SELECT) {
+        ray_op_ext_t* ext = find_ext(g, op->id);
+        if (ext) {
+            for (uint8_t c = 0; c < ext->sort.n_cols && *ok; c++)
+                found |= subtree_has_default_scan(g, ext->sort.columns[c], ok, visited);
+        }
+    } else if (opc == OP_IF || opc == OP_SUBSTR || opc == OP_REPLACE) {
+        /* 3rd operand stored as node index in ext->literal */
+        ray_op_ext_t* ext = find_ext(g, op->id);
+        if (ext) {
+            uint32_t child_id = (uint32_t)(uintptr_t)ext->literal;
+            if (child_id < g->node_count)  /* out-of-range indices are ignored */
+                found |= subtree_has_default_scan(g, &g->nodes[child_id], ok, visited);
+        }
+    } else if (opc == OP_CONCAT) {
+        /* n_args in ext->sym, args 2+ as uint32_t[] trailing after ext */
+        ray_op_ext_t* ext = find_ext(g, op->id);
+        if (ext) {
+            int n_args = (int)ext->sym;
+            uint32_t* trail = (uint32_t*)((char*)(ext + 1));
+            for (int i = 2; i < n_args && *ok; i++) {
+                if (trail[i - 2] < g->node_count)  /* out-of-range indices are ignored */
+                    found |= subtree_has_default_scan(g, &g->nodes[trail[i - 2]], ok, visited);
+            }
+        }
+    }
+    return found;
+}
+
+/* Decide whether the DAG under `root` may run once per segment with the
+ * partial results glued together by plain concatenation.
+ *
+ * Requirements, all enforced by subtree_has_default_scan():
+ *   - every reachable node is a streamable opcode;
+ *   - OP_CONST literals are atoms (a vector constant spans all rows and
+ *     would not line up with a single segment);
+ *   - no OP_SCAN targets a secondary table (stored_table_id > 0), because
+ *     the streaming loop only swaps g->table;
+ *   - at least one OP_SCAN reads the default table, so table-independent
+ *     results are not duplicated once per partition.
+ *
+ * Also returns false when the visited bitmap cannot be allocated. */
+static bool dag_can_stream(ray_graph_t* g, ray_op_t* root) {
+    uint64_t  inline_bits[16];             /* enough for 16 * 64 = 1024 node ids */
+    ray_t*    heap_hdr = NULL;             /* set only when the bitmap is scratch-allocated */
+    uint64_t* seen = inline_bits;
+    uint32_t  words = (g->node_count + 63) / 64;
+    if (words > 16) {
+        seen = (uint64_t*)scratch_alloc(&heap_hdr, words * 8);
+        if (!seen) return false;
+    }
+    memset(seen, 0, words * 8);
+    bool streamable = true;
+    bool scans_default = subtree_has_default_scan(g, root, &streamable, seen);
+    if (heap_hdr) scratch_free(heap_hdr);
+    return streamable && scans_default;
+}
+
+static ray_t* ray_execute_inner(ray_graph_t* g, ray_op_t* root);
+
+/* Public entry point: run the DAG, then close the progress session. */
+ray_t* ray_execute(ray_graph_t* g, ray_op_t* root) {
+    ray_t* outcome = ray_execute_inner(g, root);
+    /* Always end progress tracking, whatever the outcome: a no-op without a
+     * registered callback, else the final "100% done" tick (only if the bar was shown). */
+    ray_progress_end();
+    return outcome;
+}
+
+/* Run the DAG rooted at `root`.  After resetting the cancellation flag, either
+ * execute it once (flat path) or, when g->table is fully parted and the DAG is
+ * streamable, execute it per segment and concatenate the partial results. */
+static ray_t* ray_execute_inner(ray_graph_t* g, ray_op_t* root) {
+    if (!g || !root) return ray_error("nyi", NULL);
+
+    /* Lazy-init the global thread pool on first call (the code below tolerates a NULL pool) */
+    ray_pool_t* pool = ray_pool_get();
+
+    /* Reset cancellation flag at the start of each query */
+    if (pool)
+        atomic_store_explicit(&pool->cancelled, 0, memory_order_relaxed);
+
+    /* Detect streaming mode: seg_count becomes the segment count shared by all
+     * parted columns of g->table.  A differing count is a malformed table and
+     * is rejected with "schema".  Any column that is neither parted nor
+     * MAPCOMMON disables streaming (seg_count = 0): it would be duplicated
+     * into every segment table and corrupt the concatenated result. */
+    int32_t seg_count = 0;
+    if (g->table) {
+        bool has_flat = false;
+        for (int64_t c = 0; c < ray_table_ncols(g->table); c++) {
+            ray_t* col = ray_table_get_col_idx(g->table, c);
+            if (!col) continue;
+            if (RAY_IS_PARTED(col->type)) {
+                if (seg_count == 0)
+                    seg_count = (int32_t)col->len;
+                else if ((int32_t)col->len != seg_count)
+                    return ray_error("schema", NULL);
+            } else if (col->type != RAY_MAPCOMMON) {
+                has_flat = true;
+            }
+        }
+        if (has_flat)
+            seg_count = 0;  /* fall back to flat materialization */
+    }
+
+    if (seg_count == 0 || !dag_can_stream(g, root)) {
+        /* No streaming (no parted segments, or DAG needs a specialized merge): run the DAG once. */
+        ray_t* result = exec_node(g, root);
+        if (g->selection && result && !RAY_IS_ERR(result)
+            && result->type == RAY_TABLE) {
+            ray_t* compacted = sel_compact(g, result, g->selection);
+            ray_release(result);
+            ray_release(g->selection);
+            g->selection = NULL;
+            result = compacted;
+        }
+        return result;
+    }
+
+    /* Streaming mode: take the pruning mask from the first ext node that has one (if any) */
+    uint64_t* seg_mask = NULL;
+    int64_t   seg_mask_count = 0;
+    for (uint32_t e = 0; e < g->ext_count; e++) {
+        if (g->ext_nodes[e] && g->ext_nodes[e]->seg_mask) {
+            seg_mask = g->ext_nodes[e]->seg_mask;
+            seg_mask_count = g->ext_nodes[e]->seg_mask_count;
+            break;
+        }
+    }
+
+    /* The mask must cover exactly seg_count segments; a mismatch means the
+     * MAPCOMMON key count disagrees with the parted segment count, so report
+     * a schema error rather than silently dropping data. */
+    if (seg_mask && seg_mask_count != (int64_t)seg_count)
+        return ray_error("schema", NULL);
+
+    ray_t* saved_table = g->table;  /* restored on every exit path below */
+    ray_t* result = NULL;           /* accumulator; NULL until the first merge */
+
+    for (int32_t s = 0; s < seg_count; s++) {
+        /* Check pruning mask: a cleared bit means segment s is skipped */
+        if (seg_mask && !(seg_mask[s / 64] & (1ULL << (s % 64))))
+            continue;
+
+        /* Check cancellation before starting work on this segment */
+        if (pool && atomic_load_explicit(&pool->cancelled, memory_order_relaxed)) {
+            g->table = saved_table;
+            if (g->selection) { ray_release(g->selection); g->selection = NULL; }
+            ray_release(result);
+            return ray_error("cancel", NULL);
+        }
+
+        /* Build flat table for this segment and swap g->table.
+         * All operators (OP_SCAN, GROUP, expr_compile, etc.) see flat
+         * columns via g->table, so no special-casing is needed. */
+        ray_t* seg_tbl = build_segment_table(saved_table, s);
+        if (!seg_tbl || RAY_IS_ERR(seg_tbl)) {
+            g->table = saved_table;
+            if (g->selection) { ray_release(g->selection); g->selection = NULL; }
+            ray_release(result);
+            return seg_tbl;
+        }
+        g->table = seg_tbl;
+        if (g->selection) ray_release(g->selection);  /* start each segment with no selection */
+        g->selection = NULL;
+
+        ray_t* partial = exec_node(g, root);
+
+        /* Compact lazy selection for this segment */
+        if (g->selection && partial && !RAY_IS_ERR(partial)
+            && partial->type == RAY_TABLE) {
+            ray_t* compacted = sel_compact(g, partial, g->selection);
+            ray_release(partial);
+            ray_release(g->selection);
+            g->selection = NULL;
+            partial = compacted;
+        }
+
+        g->table = saved_table;  /* restore before any early return below */
+        ray_release(seg_tbl);
+
+        if (!partial || RAY_IS_ERR(partial)) {
+            if (g->selection) { ray_release(g->selection); g->selection = NULL; }
+            ray_release(result);
+            return partial;
+        }
+
+        /* Merge partial into accumulator (the merge hands back its own reference) */
+        ray_t* merged = ray_result_merge(result, partial);
+        ray_release(result);  /* NULL on the first merged segment — presumably tolerated; TODO confirm */
+        ray_release(partial);
+        if (!merged || RAY_IS_ERR(merged)) {
+            if (g->selection) { ray_release(g->selection); g->selection = NULL; }
+            return merged;
+        }
+        result = merged;
+    }
+
+    /* Clean up any lingering selection from the last segment iteration */
+    if (g->selection) { ray_release(g->selection); g->selection = NULL; }
+
+    /* All segments pruned: execute DAG on empty table to get correct
+     * output schema (handles SELECT/PROJECT that reshape columns).
+     * Build a fresh 0-row table — do not mutate shared source vectors. */
+    if (!result) {
+        int64_t ncols = ray_table_ncols(saved_table);
+        ray_t* empty_tbl = ray_table_new(ncols);
+        if (empty_tbl && !RAY_IS_ERR(empty_tbl)) {
+            for (int64_t c = 0; c < ncols; c++) {
+                int64_t name_id = ray_table_col_name(saved_table, c);
+                ray_t* col = ray_table_get_col_idx(saved_table, c);
+                if (!col) continue;
+                int8_t base = col->type;  /* element type for the 0-row stand-in column */
+                if (col->type == RAY_MAPCOMMON) {
+                    ray_t** mc = (ray_t**)ray_data(col);
+                    base = mc[0] ? mc[0]->type : RAY_I64;  /* key vector's type, RAY_I64 if it is missing */
+                } else if (RAY_IS_PARTED(col->type)) {
+                    base = (int8_t)RAY_PARTED_BASETYPE(col->type);
+                }
+                ray_t* ecol = ray_vec_new(base, 0);
+                if (!ecol || RAY_IS_ERR(ecol)) {
+                    /* ray_vec_new rejects RAY_LIST (type 0) and other
+                     * non-standard types; fall back to a raw 0-length
+                     * block with the correct type tag. */
+                    ecol = ray_alloc(0);
+                    if (!ecol || RAY_IS_ERR(ecol)) continue;  /* column is omitted from the empty table */
+                    ecol->type = base;
+                    ecol->len = 0;
+                }
+                empty_tbl = ray_table_add_col(empty_tbl, name_id, ecol);
+                ray_release(ecol);
+            }
+            g->table = empty_tbl;
+            if (g->selection) ray_release(g->selection);
+            g->selection = NULL;
+            result = exec_node(g, root);
+            if (g->selection) {  /* discarded without compaction on this path */
+                ray_release(g->selection);
+                g->selection = NULL;
+            }
+            g->table = saved_table;
+            ray_release(empty_tbl);
+        }
+    }
+
+    if (!result) return ray_error("oom", NULL);  /* empty_tbl unavailable or exec_node returned NULL */
+    return result;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/exec.h b/crates/rayforce-sys/vendor/rayforce/src/ops/exec.h
new file mode 100644
index 0000000..396677e
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/exec.h
@@ -0,0 +1,29 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_EXEC_H
+#define RAY_EXEC_H
+
+#include "ops.h"
+
+#endif /* RAY_EXEC_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/expr.c b/crates/rayforce-sys/vendor/rayforce/src/ops/expr.c
new file mode 100644
index 0000000..b0f2da6
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/expr.c
@@ -0,0 +1,1776 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "ops/internal.h"
+
+/* Convert a numeric atom to both a double and an int64 view; *out_is_f64 is
+ * true when the atom is f64. Returns false for NULL, non-atoms and other types.
+ * A NaN/inf/out-of-range f64 gives *out_i = 0 (the raw cast would be UB). */
+static bool atom_to_numeric(ray_t* atom, double* out_f, int64_t* out_i, bool* out_is_f64) {
+    if (!atom || !ray_is_atom(atom)) return false;
+    switch (atom->type) {
+        case -RAY_F64:
+            *out_f = atom->f64;
+            *out_i = (atom->f64 >= -9223372036854775808.0 && atom->f64 < 9223372036854775808.0)
+                   ? (int64_t)atom->f64 : 0;
+            *out_is_f64 = true;
+            return true;
+        case -RAY_I64: case -RAY_SYM: case -RAY_DATE:
+        case -RAY_TIME: case -RAY_TIMESTAMP:
+            *out_i = atom->i64;
+            *out_f = (double)atom->i64;
+            *out_is_f64 = false;
+            return true;
+        case -RAY_I32:
+            *out_i = (int64_t)atom->i32;
+            *out_f = (double)atom->i32;
+            *out_is_f64 = false;
+            return true;
+        case -RAY_I16:
+            *out_i = (int64_t)atom->i16;
+            *out_f = (double)atom->i16;
+            *out_is_f64 = false;
+            return true;
+        case -RAY_U8: case -RAY_BOOL:
+            *out_i = (int64_t)atom->u8;
+            *out_f = (double)atom->u8;
+            *out_is_f64 = false;
+            return true;
+        default:
+            return false;
+    }
+}
+
+/* Evaluate a numeric constant sub-expression from op graph.
+ * Supports CONST, unary NEG/ABS and binary ADD..MAX2 over constant children.
+ * Returns false if it cannot be folded; f64 results outside int64 give *out_i = 0. */
+static bool eval_const_numeric_expr(ray_graph_t* g, ray_op_t* op,
+                                    double* out_f, int64_t* out_i, bool* out_is_f64) {
+    if (!g || !op || !out_f || !out_i || !out_is_f64) return false;
+
+    if (op->opcode == OP_CONST) {
+        ray_op_ext_t* ext = find_ext(g, op->id);
+        if (!ext || !ext->literal) return false;
+        return atom_to_numeric(ext->literal, out_f, out_i, out_is_f64);
+    }
+
+    if ((op->opcode == OP_NEG || op->opcode == OP_ABS) && op->arity == 1 && op->inputs[0]) {
+        double af = 0.0;
+        int64_t ai = 0;
+        bool a_is_f64 = false;
+        if (!eval_const_numeric_expr(g, op->inputs[0], &af, &ai, &a_is_f64)) return false;
+        if (a_is_f64 || op->out_type == RAY_F64) {
+            double v = a_is_f64 ? af : (double)ai;
+            double r = (op->opcode == OP_NEG) ? -v : fabs(v);
+            *out_f = r;
+            *out_i = (r >= -9223372036854775808.0 && r < 9223372036854775808.0) ? (int64_t)r : 0;
+            *out_is_f64 = true;
+            return true;
+        }
+        /* Unsigned negation avoids UB on INT64_MIN */
+        int64_t r = (op->opcode == OP_NEG)
+                  ? (int64_t)(-(uint64_t)ai)
+                  : (ai < 0 ? (int64_t)(-(uint64_t)ai) : ai);
+        *out_i = r;
+        *out_f = (double)r;
+        *out_is_f64 = false;
+        return true;
+    }
+
+    if (op->arity != 2 || !op->inputs[0] || !op->inputs[1]) return false;
+    if (op->opcode < OP_ADD || op->opcode > OP_MAX2) return false;
+
+    double lf = 0.0, rf = 0.0;
+    int64_t li = 0, ri = 0;
+    bool l_is_f64 = false, r_is_f64 = false;
+    if (!eval_const_numeric_expr(g, op->inputs[0], &lf, &li, &l_is_f64)) return false;
+    if (!eval_const_numeric_expr(g, op->inputs[1], &rf, &ri, &r_is_f64)) return false;
+
+    if (op->out_type == RAY_F64 || l_is_f64 || r_is_f64 || op->opcode == OP_DIV) {
+        double lv = l_is_f64 ? lf : (double)li;
+        double rv = r_is_f64 ? rf : (double)ri;
+        double r = 0.0;
+        switch (op->opcode) {
+            case OP_ADD: r = lv + rv; break;
+            case OP_SUB: r = lv - rv; break;
+            case OP_MUL: r = lv * rv; break;
+            case OP_DIV: r = rv != 0.0 ? lv / rv : NAN; break;
+            case OP_MOD: { if (rv != 0.0) { r = fmod(lv, rv); if (r && ((r > 0) != (rv > 0))) r += rv; } else { r = NAN; } } break;
+            case OP_MIN2: r = lv < rv ? lv : rv; break;
+            case OP_MAX2: r = lv > rv ? lv : rv; break;
+            default: return false;
+        }
+        *out_f = r;
+        *out_i = (r >= -9223372036854775808.0 && r < 9223372036854775808.0) ? (int64_t)r : 0;
+        *out_is_f64 = true;
+        return true;
+    }
+
+    int64_t r = 0;
+    switch (op->opcode) {
+        case OP_ADD: r = (int64_t)((uint64_t)li + (uint64_t)ri); break;
+        case OP_SUB: r = (int64_t)((uint64_t)li - (uint64_t)ri); break;
+        case OP_MUL: r = (int64_t)((uint64_t)li * (uint64_t)ri); break;
+        case OP_DIV: /* not reached: OP_DIV always takes the double path above */
+            if (ri==0) return false;
+            r = li/ri; if ((li^ri)<0 && r*ri!=li) r--;
+            break;
+        case OP_MOD: /* x % -1 is always 0; INT64_MIN % -1 itself is UB */
+            if (ri==0) return false;
+            r = (ri==-1) ? 0 : li%ri; if (r && (r^ri)<0) r+=ri;
+            break;
+        case OP_MIN2: r = li < ri ? li : ri; break;
+        case OP_MAX2: r = li > ri ? li : ri; break;
+        default: return false;
+    }
+    *out_i = r;
+    *out_f = (double)r;
+    *out_is_f64 = false;
+    return true;
+}
+
+/* Fold a constant sub-expression to an exact int64. Fails when the tree is not
+ * constant or its f64 value is non-finite, fractional or outside int64 range. */
+static bool const_expr_to_i64(ray_graph_t* g, ray_op_t* op, int64_t* out) {
+    if (!g || !op || !out) return false;
+    double c_f = 0.0;
+    int64_t c_i = 0;
+    bool c_is_f64 = false;
+    if (!eval_const_numeric_expr(g, op, &c_f, &c_i, &c_is_f64)) return false;
+    if (!c_is_f64) { *out = c_i; return true; }
+    if (!isfinite(c_f)) return false;
+    double ip = 0.0;
+    if (modf(c_f, &ip) != 0.0) return false;
+    /* (double)INT64_MAX rounds up to 2^63, so reject ip >= 2^63 explicitly. */
+    if (ip >= 9223372036854775808.0 || ip < -9223372036854775808.0) return false;
+    *out = (int64_t)ip;
+    return true;
+}
+
+/* True for the column types the integer-linear SUM/AVG plan accepts as terms. */
+static inline bool type_is_linear_i64_col(int8_t t) {
+    if (RAY_IS_SYM(t) || t == RAY_BOOL || t == RAY_U8 || t == RAY_I16) return true;
+    return t == RAY_I32 || t == RAY_DATE || t == RAY_TIME || t == RAY_I64 || t == RAY_TIMESTAMP;
+}
+
+/* Add coeff * scan(sym) to e, merging with an existing term for sym (a term whose
+ * coefficient becomes 0 is removed). Returns false if e is NULL or has no free slot. */
+static bool linear_expr_add_term(linear_expr_i64_t* e, int64_t sym, int64_t coeff) {
+    if (!e) return false;
+    if (coeff == 0) return true;
+    for (uint8_t i = 0; i < e->n_terms; i++) {
+        if (e->syms[i] != sym) continue;
+        /* Wrap mod 2^64 via uint64_t; signed overflow would be undefined. */
+        int64_t next = (int64_t)((uint64_t)e->coeff_i64[i] + (uint64_t)coeff);
+        if (next != 0) { e->coeff_i64[i] = next; return true; }
+        for (uint8_t j = i + 1; j < e->n_terms; j++) {
+            e->syms[j - 1] = e->syms[j];
+            e->coeff_i64[j - 1] = e->coeff_i64[j];
+        }
+        e->n_terms--;
+        return true;
+    }
+    if (e->n_terms >= AGG_LINEAR_MAX_TERMS) return false;
+    e->syms[e->n_terms] = sym;
+    e->coeff_i64[e->n_terms] = coeff;
+    e->n_terms++;
+    return true;
+}
+
+/* Multiply the bias and every coefficient of e by k, wrapping mod 2^64 (no signed-overflow UB). */
+static void linear_expr_scale(linear_expr_i64_t* e, int64_t k) {
+    if (!e || k == 1) return;
+    e->bias_i64 = (int64_t)((uint64_t)e->bias_i64 * (uint64_t)k);
+    for (uint8_t i = 0; i < e->n_terms; i++) e->coeff_i64[i] = (int64_t)((uint64_t)e->coeff_i64[i] * (uint64_t)k);
+}
+
+/* dst += scale * src, term by term, wrapping mod 2^64; returns false if a term cannot be added. */
+static bool linear_expr_add_scaled(linear_expr_i64_t* dst, const linear_expr_i64_t* src, int64_t scale) {
+    if (!dst || !src) return false;
+    dst->bias_i64 = (int64_t)((uint64_t)dst->bias_i64 + (uint64_t)src->bias_i64 * (uint64_t)scale);
+    for (uint8_t i = 0; i < src->n_terms; i++) {
+        int64_t scaled = (int64_t)((uint64_t)src->coeff_i64[i] * (uint64_t)scale);
+        if (!linear_expr_add_term(dst, src->syms[i], scaled)) return false;
+    }
+    return true;
+}
+
+/* Reduce an expression tree to the integer linear form
+ *   bias + sum(coeff[i] * scan(sym[i]))
+ * Accepts +, -, unary - and multiplication by an integer constant. */
+static bool parse_linear_i64_expr(ray_graph_t* g, ray_op_t* op, linear_expr_i64_t* out) {
+    if (!g || !op || !out) return false;
+    memset(out, 0, sizeof(*out));
+
+    /* A subtree that folds to an integer constant contributes bias only. */
+    int64_t konst = 0;
+    if (const_expr_to_i64(g, op, &konst)) {
+        out->bias_i64 = konst;
+        return true;
+    }
+
+    switch (op->opcode) {
+        case OP_SCAN: {
+            ray_op_ext_t* scan_ext = find_ext(g, op->id);
+            if (!scan_ext || scan_ext->base.opcode != OP_SCAN) return false;
+            out->syms[0] = scan_ext->sym;
+            out->coeff_i64[0] = 1;
+            out->n_terms = 1;
+            return true;
+        }
+        case OP_NEG: {
+            if (op->arity != 1 || !op->inputs[0]) break;
+            linear_expr_i64_t operand;
+            if (!parse_linear_i64_expr(g, op->inputs[0], &operand)) return false;
+            linear_expr_scale(&operand, -1);
+            *out = operand;
+            return true;
+        }
+        case OP_ADD:
+        case OP_SUB: {
+            if (op->arity != 2 || !op->inputs[0] || !op->inputs[1]) break;
+            linear_expr_i64_t left, right;
+            if (!parse_linear_i64_expr(g, op->inputs[0], &left)) return false;
+            if (!parse_linear_i64_expr(g, op->inputs[1], &right)) return false;
+            *out = left;
+            return linear_expr_add_scaled(out, &right, op->opcode == OP_SUB ? -1 : 1);
+        }
+        case OP_MUL: {
+            if (op->arity != 2 || !op->inputs[0] || !op->inputs[1]) break;
+            /* Try const * expr first, then expr * const. */
+            int64_t factor = 0;
+            linear_expr_i64_t scaled;
+            for (int c = 0; c < 2; c++) {
+                if (const_expr_to_i64(g, op->inputs[c], &factor) &&
+                    parse_linear_i64_expr(g, op->inputs[1 - c], &scaled)) {
+                    linear_expr_scale(&scaled, factor);
+                    *out = scaled;
+                    return true;
+                }
+            }
+            break;
+        }
+        default: break;
+    }
+    return false;
+}
+
+/* Scalar-aggregate fast path: recognise SUM/AVG inputs that are integer-linear
+ * in table columns, e.g. (v1 + 1) * 2 or v1 + v2 + 1, and fill out_plan. */
+bool try_linear_sumavg_input_i64(ray_graph_t* g, ray_t* tbl, ray_op_t* input_op,
+                                        agg_linear_t* out_plan) {
+    if (!g || !tbl || !input_op || !out_plan) return false;
+    linear_expr_i64_t parsed;
+    if (!parse_linear_i64_expr(g, input_op, &parsed)) return false;
+
+    memset(out_plan, 0, sizeof(*out_plan));
+    out_plan->bias_i64 = parsed.bias_i64;
+    out_plan->n_terms = parsed.n_terms;
+    for (uint8_t t = 0; t < parsed.n_terms; t++) {
+        ray_t* src = ray_table_get_col(tbl, parsed.syms[t]);
+        if (!src || !type_is_linear_i64_col(src->type)) return false;
+        out_plan->coeff_i64[t] = parsed.coeff_i64[t];
+        out_plan->term_types[t] = src->type;
+        out_plan->term_ptrs[t] = ray_data(src);
+    }
+    out_plan->enabled = true;
+    return true;
+}
+
+/* Detect SUM/AVG affine inputs of form (scan +/- const) and return scan vector
+ * plus the additive bias so we can adjust results from (sum,count) directly.
+ * On success *out_vec is the scanned column and out_affine holds the bias in
+ * both i64 and f64 form; returns false when the pattern does not match. */
+bool try_affine_sumavg_input(ray_graph_t* g, ray_t* tbl, ray_op_t* input_op,
+                                    ray_t** out_vec, agg_affine_t* out_affine) {
+    if (!g || !tbl || !input_op || !out_vec || !out_affine) return false;
+    if (input_op->opcode != OP_ADD && input_op->opcode != OP_SUB) return false;
+    if (input_op->arity != 2 || !input_op->inputs[0] || !input_op->inputs[1]) return false;
+
+    ray_op_t* lhs = input_op->inputs[0];
+    ray_op_t* rhs = input_op->inputs[1];
+    ray_op_t* base_op = NULL;
+    int sign = 1;
+    double c_f = 0.0;
+    int64_t c_i = 0;
+    bool c_is_f64 = false;
+
+    double lhs_f = 0.0, rhs_f = 0.0;
+    int64_t lhs_i = 0, rhs_i = 0;
+    bool lhs_is_f64 = false, rhs_is_f64 = false;
+    bool lhs_const = eval_const_numeric_expr(g, lhs, &lhs_f, &lhs_i, &lhs_is_f64);
+    bool rhs_const = eval_const_numeric_expr(g, rhs, &rhs_f, &rhs_i, &rhs_is_f64);
+
+    /* ADD accepts the constant on either side (left side checked first);
+     * SUB only accepts (scan - const), which yields a negated bias. */
+    if (input_op->opcode == OP_ADD && lhs_const) {
+        base_op = rhs;
+        c_f = lhs_f;
+        c_i = lhs_i;
+        c_is_f64 = lhs_is_f64;
+    } else if (rhs_const) {
+        base_op = lhs;
+        sign = (input_op->opcode == OP_ADD) ? 1 : -1;
+        c_f = rhs_f;
+        c_i = rhs_i;
+        c_is_f64 = rhs_is_f64;
+    }
+    if (!base_op) return false;
+
+    /* The non-constant side must be a plain column scan present in tbl. */
+    ray_op_ext_t* base_ext = find_ext(g, base_op->id);
+    if (!base_ext || base_ext->base.opcode != OP_SCAN) return false;
+    ray_t* base_vec = ray_table_get_col(tbl, base_ext->sym);
+    if (!base_vec) return false;
+
+    int8_t bt = base_vec->type;
+    if (bt == RAY_F64) {
+        double bias = (double)sign * (c_is_f64 ? c_f : (double)c_i);
+        out_affine->enabled = true;
+        out_affine->bias_f64 = bias;
+        /* Casting NaN/inf/out-of-range doubles to int64 is UB; store 0 then. */
+        out_affine->bias_i64 =
+            (bias >= -9223372036854775808.0 && bias < 9223372036854775808.0) ? (int64_t)bias : 0;
+        *out_vec = base_vec;
+        return true;
+    }
+
+    if (bt == RAY_I64 || bt == RAY_TIMESTAMP ||
+        bt == RAY_I32 || bt == RAY_I16 || bt == RAY_U8 || bt == RAY_BOOL ||
+        RAY_IS_SYM(bt)) {
+        /* Integer column: the bias must be an exactly representable int64. */
+        int64_t c = 0;
+        if (c_is_f64) {
+            if (!isfinite(c_f)) return false;
+            double ip = 0.0;
+            if (modf(c_f, &ip) != 0.0) return false;
+            /* (double)INT64_MAX rounds to 2^63; use >= so the cast stays in range. */
+            if (ip >= 9223372036854775808.0 || ip < -9223372036854775808.0) return false;
+            c = (int64_t)ip;
+        } else {
+            c = c_i;
+        }
+        out_affine->enabled = true;
+        /* Negate through uint64_t: -INT64_MIN would overflow. */
+        out_affine->bias_i64 = sign > 0 ? c : (int64_t)(-(uint64_t)c);
+        out_affine->bias_f64 = (double)out_affine->bias_i64;
+        *out_vec = base_vec;
+        return true;
+    }
+
+    return false;
+}
+
+/* ============================================================================
+ * Expression Compiler: morsel-batched fused evaluation
+ *
+ * Compiles an expression DAG (e.g. v1 + v2 * 3) into a flat instruction
+ * array. Evaluates in morsel-sized chunks (1024 elements) with scratch
+ * registers — never allocates full-length intermediate vectors.
+ * ============================================================================ */
+
+/* Element-wise opcodes (OP_NEG..OP_CAST and OP_ADD..OP_MAX2) are the ones the compiler fuses. */
+static inline bool expr_is_elementwise(uint16_t op) {
+    return (OP_ADD <= op && op <= OP_MAX2) || (OP_NEG <= op && op <= OP_CAST);
+}
+
+/* Return a register holding src as target type, appending a scratch register and an
+ * OP_CAST when needed. If no register/instruction slot is free, src is returned as is. */
+static uint8_t expr_ensure_type(ray_expr_t* out, uint8_t src, int8_t target) {
+    bool already_typed = (out->regs[src].type == target);
+    bool no_room = (out->n_regs >= EXPR_MAX_REGS || out->n_ins >= EXPR_MAX_INS);
+    if (already_typed || no_room) return src;
+    uint8_t cast_reg = out->n_regs++;
+    out->n_scratch++;
+    out->regs[cast_reg].kind = REG_SCRATCH;
+    out->regs[cast_reg].type = target;
+    expr_ins_t cast = { .opcode = OP_CAST, .dst = cast_reg, .src1 = src, .src2 = 0xFF };
+    out->ins[out->n_ins++] = cast;
+    return cast_reg;
+}
+
+/* Compile the element-wise expression DAG rooted at root into out's flat
+ * register/instruction arrays. Returns false when the tree cannot be fused. */
+bool expr_compile(ray_graph_t* g, ray_t* tbl, ray_op_t* root, ray_expr_t* out) {
+    memset(out, 0, sizeof(*out));
+    if (!root || !g || !tbl) return false;
+    if (root->opcode == OP_SCAN || root->opcode == OP_CONST) return false;
+    if (!expr_is_elementwise(root->opcode)) return false;
+
+    uint32_t nc = g->node_count;
+    if (nc == 0 || nc > 4096) return false; /* VLA: zero length is UB, huge length risks stack overflow */
+    uint8_t node_reg[nc];
+    memset(node_reg, 0xFF, nc * sizeof(uint8_t));
+
+    /* Post-order DFS with explicit stack */
+    /* Depth limit 64 — expressions deeper than 64 levels fall back to non-fused path. */
+    typedef struct { ray_op_t* node; uint8_t phase; } dfs_t;
+    dfs_t dfs[64];
+    int sp = 0;
+    dfs[sp++] = (dfs_t){root, 0};
+
+    while (sp > 0) {
+        dfs_t* top = &dfs[sp - 1];
+        ray_op_t* node = top->node;
+
+        if (node->id < nc && node_reg[node->id] != 0xFF) { sp--; continue; }
+
+        if (top->phase == 0) {
+            top->phase = 1;
+            for (int i = node->arity - 1; i >= 0; i--) {
+                ray_op_t* ch = node->inputs[i];
+                if (!ch) continue;
+                if (ch->id < nc && node_reg[ch->id] != 0xFF) continue;
+                if (sp >= 64) return false;
+                dfs[sp++] = (dfs_t){ch, 0};
+            }
+        } else {
+            sp--;
+            uint8_t r = out->n_regs;
+            if (r >= EXPR_MAX_REGS) return false;
+
+            if (node->opcode == OP_SCAN) {
+                ray_op_ext_t* ext = find_ext(g, node->id);
+                if (!ext) return false;
+                ray_t* col = ray_table_get_col(tbl, ext->sym);
+                if (!col) return false;
+                if (col->type == RAY_MAPCOMMON) return false;
+                if (col->type == RAY_STR) return false; /* RAY_STR needs string comparison path */
+                if (col->attrs & (RAY_ATTR_HAS_NULLS | RAY_ATTR_SLICE)) return false; /* nullable cols need bitmap-aware path */
+                out->regs[r].kind = REG_SCAN;
+                if (RAY_IS_PARTED(col->type)) {
+                    int8_t base = (int8_t)RAY_PARTED_BASETYPE(col->type);
+                    out->regs[r].col_type = base;
+                    out->regs[r].data = NULL; /* resolved per-segment */
+                    out->regs[r].is_parted = true;
+                    out->regs[r].parted_col = col;
+                    out->regs[r].type = (base == RAY_F64) ? RAY_F64 : RAY_I64;
+                    out->has_parted = true;
+                } else {
+                    out->regs[r].col_type = col->type;
+                    out->regs[r].col_attrs = col->attrs;
+                    out->regs[r].data = ray_data(col);
+                    out->regs[r].is_parted = false;
+                    out->regs[r].parted_col = NULL;
+                    out->regs[r].type = (col->type == RAY_F64) ? RAY_F64 : RAY_I64;
+                }
+            } else if (node->opcode == OP_CONST) {
+                ray_op_ext_t* ext = find_ext(g, node->id);
+                if (!ext || !ext->literal) return false;
+                if (RAY_ATOM_IS_NULL(ext->literal)) return false; /* null constants need bitmap-aware path */
+                double cf; int64_t ci; bool is_f64;
+                if (!atom_to_numeric(ext->literal, &cf, &ci, &is_f64)) {
+                    /* Try resolving string constant to symbol intern ID —
+                     * enables fused evaluation of SYM column comparisons
+                     * (e.g. id2 = 'id080' compiles to integer EQ). */
+                    if (ext->literal->type == -RAY_STR) {
+                        const char* s = ray_str_ptr(ext->literal);
+                        size_t slen = ray_str_len(ext->literal);
+                        int64_t sid = ray_sym_find(s, slen);
+                        if (sid < 0) return false;
+                        ci = sid;
+                        cf = (double)sid;
+                        is_f64 = false;
+                    } else {
+                        return false;
+                    }
+                }
+                out->regs[r].kind = REG_CONST;
+                out->regs[r].type = is_f64 ? RAY_F64 : RAY_I64;
+                out->regs[r].const_f64 = cf;
+                out->regs[r].const_i64 = ci;
+            } else if (expr_is_elementwise(node->opcode)) {
+                if (!node->inputs[0] || node->inputs[0]->id >= nc) return false; /* id must index node_reg */
+                uint8_t s1 = node_reg[node->inputs[0]->id];
+                if (s1 == 0xFF) return false;
+                uint8_t s2 = 0xFF;
+                if (node->arity >= 2 && node->inputs[1]) {
+                    s2 = (node->inputs[1]->id < nc) ? node_reg[node->inputs[1]->id] : 0xFF;
+                    if (s2 == 0xFF) return false;
+                }
+
+                int8_t t1 = out->regs[s1].type;
+                int8_t t2 = (s2 != 0xFF) ? out->regs[s2].type : t1;
+                uint16_t op = node->opcode;
+                int8_t ot;
+
+                /* Determine output type */
+                if (op == OP_CAST)
+                    ot = node->out_type;
+                else if ((op >= OP_EQ && op <= OP_GE) ||
+                    op == OP_AND || op == OP_OR || op == OP_NOT)
+                    ot = RAY_BOOL;
+                else if (t1 == RAY_F64 || t2 == RAY_F64 || op == OP_DIV ||
+                         op == OP_SQRT || op == OP_LOG || op == OP_EXP)
+                    ot = RAY_F64;
+                else
+                    ot = RAY_I64;
+
+                /* Type promotion: ensure both sources match for the operation.
+                 * Skip for OP_CAST — the instruction itself IS the conversion. */
+                if (op == OP_CAST) {
+                    /* No promotion needed; CAST handles the conversion */
+                    r = out->n_regs;
+                    if (r >= EXPR_MAX_REGS) return false;
+                } else if (ot == RAY_F64 && s2 != 0xFF) {
+                    /* Arithmetic with f64 output — promote i64 inputs to f64 */
+                    s1 = expr_ensure_type(out, s1, RAY_F64);
+                    s2 = expr_ensure_type(out, s2, RAY_F64);
+                    r = out->n_regs; /* re-read after possible CAST inserts */
+                    if (r >= EXPR_MAX_REGS) return false;
+                } else if (ot == RAY_F64 && s2 == 0xFF) {
+                    /* Unary f64 — promote input */
+                    s1 = expr_ensure_type(out, s1, RAY_F64);
+                    r = out->n_regs;
+                    if (r >= EXPR_MAX_REGS) return false;
+                } else if (ot == RAY_BOOL && s2 != 0xFF && t1 != t2) {
+                    /* Comparison with mixed types — promote both to f64 if either is f64, else i64 */
+                    int8_t pt = (t1 == RAY_F64 || t2 == RAY_F64) ? RAY_F64 : RAY_I64;
+                    s1 = expr_ensure_type(out, s1, pt);
+                    s2 = expr_ensure_type(out, s2, pt);
+                    r = out->n_regs;
+                    if (r >= EXPR_MAX_REGS) return false;
+                }
+
+                out->regs[r].kind = REG_SCRATCH;
+                out->regs[r].type = ot;
+                out->n_scratch++;
+
+                if (out->n_ins >= EXPR_MAX_INS) return false;
+                out->ins[out->n_ins++] = (expr_ins_t){
+                    .opcode = (uint8_t)op, .dst = r, .src1 = s1, .src2 = s2,
+                };
+            } else {
+                return false;
+            }
+
+            out->n_regs++;
+            if (node->id < nc) node_reg[node->id] = r;
+        }
+    }
+
+    if (out->n_regs == 0 || out->n_ins == 0) return false;
+    out->out_reg = out->n_regs - 1;
+    out->out_type = out->regs[out->out_reg].type;
+    return true;
+}
+
+/* ---- Morsel-batched expression evaluator ---- */
+
+/* Widen n rows of a SCAN column, beginning at row start, into the int64 scratch
+ * buffer dst. Narrow integer types are converted element by element; column
+ * types not handled below are loaded as zeros. */
+static void expr_load_i64(int64_t* dst, const void* data, int8_t col_type,
+                          uint8_t col_attrs, int64_t start, int64_t n) {
+    switch (col_type) {
+        case RAY_I64: case RAY_TIMESTAMP:
+            memcpy(dst, (const int64_t*)data + start, (size_t)n * sizeof(int64_t));
+            return;
+        case RAY_SYM:
+            for (int64_t k = 0; k < n; k++) dst[k] = ray_read_sym(data, start + k, col_type, col_attrs);
+            return;
+        case RAY_I32: case RAY_DATE: case RAY_TIME:
+            for (int64_t k = 0; k < n; k++) dst[k] = ((const int32_t*)data)[start + k];
+            return;
+        case RAY_U8: case RAY_BOOL:
+            for (int64_t k = 0; k < n; k++) dst[k] = ((const uint8_t*)data)[start + k];
+            return;
+        case RAY_I16:
+            for (int64_t k = 0; k < n; k++) dst[k] = ((const int16_t*)data)[start + k];
+            return;
+        default:
+            memset(dst, 0, (size_t)n * sizeof(int64_t));
+            return;
+    }
+}
+
+/* Convert n rows of a SCAN column, beginning at row start, into the double
+ * scratch buffer dst. Integer and symbol values are cast to double; column
+ * types not handled below are zero-filled. */
+static void expr_load_f64(double* dst, const void* data, int8_t col_type,
+                          uint8_t col_attrs, int64_t start, int64_t n) {
+    switch (col_type) {
+        case RAY_F64:
+            /* Source is already double: bulk copy. */
+            memcpy(dst, (const double*)data + start, (size_t)n * sizeof(double));
+            return;
+        case RAY_I64: case RAY_TIMESTAMP:
+            for (int64_t k = 0; k < n; k++) dst[k] = (double)((const int64_t*)data)[start + k];
+            return;
+        case RAY_SYM:
+            for (int64_t k = 0; k < n; k++) dst[k] = (double)ray_read_sym(data, start + k, col_type, col_attrs);
+            return;
+        case RAY_I32: case RAY_DATE: case RAY_TIME:
+            for (int64_t k = 0; k < n; k++) dst[k] = (double)((const int32_t*)data)[start + k];
+            return;
+        case RAY_U8: case RAY_BOOL:
+            for (int64_t k = 0; k < n; k++) dst[k] = (double)((const uint8_t*)data)[start + k];
+            return;
+        case RAY_I16:
+            for (int64_t k = 0; k < n; k++) dst[k] = (double)((const int16_t*)data)[start + k];
+            return;
+        default:
+            memset(dst, 0, (size_t)n * sizeof(double));
+            return;
+    }
+}
+
+/* Execute a binary instruction over n elements.
+ * Switch is OUTSIDE the loop so each case auto-vectorizes. */
+static void expr_exec_binary(uint8_t opcode, int8_t dt, void* dp,
+                              int8_t t1, const void* ap,
+                              int8_t t2, const void* bp, int64_t n) {
+    (void)t2;
+    if (dt == RAY_F64) {
+        double* d = (double*)dp;
+        const double* a = (const double*)ap;
+        const double* b = (const double*)bp;
+        switch (opcode) {
+            case OP_ADD: for (int64_t j = 0; j < n; j++) d[j] = a[j] + b[j]; break;
+            case OP_SUB: for (int64_t j = 0; j < n; j++) d[j] = a[j] - b[j]; break;
+            case OP_MUL: for (int64_t j = 0; j < n; j++) d[j] = a[j] * b[j]; break;
+            case OP_DIV: for (int64_t j = 0; j < n; j++) d[j] = b[j] != 0.0 ? a[j] / b[j] : NAN; break;
+            case OP_MOD: for (int64_t j = 0; j < n; j++) {
+                if (b[j] == 0.0) { d[j] = NAN; continue; }
+                double m = fmod(a[j], b[j]);
+                d[j] = (m && ((m > 0) != (b[j] > 0))) ? m + b[j] : m;
+            } break;
+            case OP_MIN2: for (int64_t j = 0; j < n; j++) d[j] = a[j] < b[j] ? a[j] : b[j]; break;
+            case OP_MAX2: for (int64_t j = 0; j < n; j++) d[j] = a[j] > b[j] ? a[j] : b[j]; break;
+            default: break;
+        }
+    } else if (dt == RAY_I64 || dt == RAY_TIMESTAMP) {
+        int64_t* d = (int64_t*)dp;
+        const int64_t* a = (const int64_t*)ap;
+        const int64_t* b = (const int64_t*)bp;
+        switch (opcode) {
+            case OP_ADD: for (int64_t j = 0; j < n; j++) d[j] = (int64_t)((uint64_t)a[j] + (uint64_t)b[j]); break;
+            case OP_SUB: for (int64_t j = 0; j < n; j++) d[j] = (int64_t)((uint64_t)a[j] - (uint64_t)b[j]); break;
+            case OP_MUL: for (int64_t j = 0; j < n; j++) d[j] = (int64_t)((uint64_t)a[j] * (uint64_t)b[j]); break;
+            case OP_DIV: for (int64_t j = 0; j < n; j++) {
+                if (b[j]==0 || (b[j]==-1 && a[j]==((int64_t)1<<63))) { d[j]=0; continue; }
+                int64_t q = a[j]/b[j];
+                if ((a[j]^b[j])<0 && q*b[j]!=a[j]) q--;
+                d[j] = q;
+            } break;
+            case OP_MOD: for (int64_t j = 0; j < n; j++) {
+                if (b[j]==0 || (b[j]==-1 && a[j]==((int64_t)1<<63))) { d[j]=0; continue; }
+                int64_t m = a[j]%b[j];
+                if (m && (m^b[j])<0) m+=b[j];
+                d[j] = m;
+            } break;
+            case OP_MIN2: for (int64_t j = 0; j < n; j++) d[j] = a[j] < b[j] ? a[j] : b[j]; break;
+            case OP_MAX2: for (int64_t j = 0; j < n; j++) d[j] = a[j] > b[j] ? a[j] : b[j]; break;
+            default: break;
+        }
+    } else if (dt == RAY_I32 || dt == RAY_DATE || dt == RAY_TIME) {
+        int32_t* d = (int32_t*)dp;
+        const int32_t* a = (const int32_t*)ap;
+        const int32_t* b = (const int32_t*)bp;
+        switch (opcode) {
+            case OP_ADD: for (int64_t j = 0; j < n; j++) d[j] = (int32_t)((uint32_t)a[j] + (uint32_t)b[j]); break;
+            case OP_SUB: for (int64_t j = 0; j < n; j++) d[j] = (int32_t)((uint32_t)a[j] - (uint32_t)b[j]); break;
+            case OP_MUL: for (int64_t j = 0; j < n; j++) d[j] = (int32_t)((uint32_t)a[j] * (uint32_t)b[j]); break;
+            case OP_DIV: for (int64_t j = 0; j < n; j++) {
+                if (b[j]==0 || (b[j]==-1 && a[j]==((int32_t)1<<31))) { d[j]=0; continue; }
+                int32_t q = a[j]/b[j];
+                if ((a[j]^b[j])<0 && q*b[j]!=a[j]) q--;
+                d[j] = q;
+            } break;
+            case OP_MOD: for (int64_t j = 0; j < n; j++) {
+                if (b[j]==0 || (b[j]==-1 && a[j]==((int32_t)1<<31))) { d[j]=0; continue; }
+                int32_t m = a[j]%b[j];
+                if (m && (m^b[j])<0) m+=b[j];
+                d[j] = m;
+            } break;
+            case OP_MIN2: for (int64_t j = 0; j < n; j++) d[j] = a[j] < b[j] ? a[j] : b[j]; break;
+            case OP_MAX2: for (int64_t j = 0; j < n; j++) d[j] = a[j] > b[j] ? a[j] : b[j]; break;
+            default: break;
+        }
+    } else if (dt == RAY_I16) {
+        int16_t* d = (int16_t*)dp;
+        const int16_t* a = (const int16_t*)ap;
+        const int16_t* b = (const int16_t*)bp;
+        switch (opcode) {
+            case OP_ADD: for (int64_t j = 0; j < n; j++) d[j] = (int16_t)((uint16_t)a[j] + (uint16_t)b[j]); break;
+            case OP_SUB: for (int64_t j = 0; j < n; j++) d[j] = (int16_t)((uint16_t)a[j] - (uint16_t)b[j]); break;
+            case OP_MUL: for (int64_t j = 0; j < n; j++) d[j] = (int16_t)((uint16_t)a[j] * (uint16_t)b[j]); break;
+            case OP_DIV: for (int64_t j = 0; j < n; j++) { d[j] = b[j] ? a[j] / b[j] : 0; } break;
+            case OP_MOD: for (int64_t j = 0; j < n; j++) { d[j] = b[j] ? a[j] % b[j] : 0; } break;
+            case OP_MIN2: for (int64_t j = 0; j < n; j++) d[j] = a[j] < b[j] ? a[j] : b[j]; break;
+            case OP_MAX2: for (int64_t j = 0; j < n; j++) d[j] = a[j] > b[j] ? a[j] : b[j]; break;
+            default: break;
+        }
+    } else if (dt == RAY_U8) {
+        uint8_t* d2 = (uint8_t*)dp;
+        const uint8_t* a2 = (const uint8_t*)ap;
+        const uint8_t* b2 = (const uint8_t*)bp;
+        switch (opcode) {
+            case OP_ADD: for (int64_t j = 0; j < n; j++) d2[j] = a2[j] + b2[j]; break;
+            case OP_SUB: for (int64_t j = 0; j < n; j++) d2[j] = a2[j] - b2[j]; break;
+            case OP_MUL: for (int64_t j = 0; j < n; j++) d2[j] = a2[j] * b2[j]; break;
+            case OP_DIV: for (int64_t j = 0; j < n; j++) { d2[j] = b2[j] ? a2[j] / b2[j] : 0; } break;
+            case OP_MOD: for (int64_t j = 0; j < n; j++) { d2[j] = b2[j] ? a2[j] % b2[j] : 0; } break;
+            case OP_MIN2: for (int64_t j = 0; j < n; j++) d2[j] = a2[j] < b2[j] ? a2[j] : b2[j]; break;
+            case OP_MAX2: for (int64_t j = 0; j < n; j++) d2[j] = a2[j] > b2[j] ? a2[j] : b2[j]; break;
+            default: break;
+        }
+    } else if (dt == RAY_BOOL) {
+        uint8_t* d = (uint8_t*)dp;
+        if (t1 == RAY_F64) {
+            const double* a = (const double*)ap;
+            const double* b = (const double*)bp;
+            /* Null-aware F64 comparisons: NaN is null sentinel.
+             * null == null → true, null < non-null → true, non-null > null → true */
+            #define F64_ISNAN(x) ((x) != (x))
+            switch (opcode) {
+                case OP_EQ: for (int64_t j = 0; j < n; j++) d[j] = (F64_ISNAN(a[j])&&F64_ISNAN(b[j])) ? 1 : (F64_ISNAN(a[j])||F64_ISNAN(b[j])) ? 0 : a[j]==b[j]; break;
+                case OP_NE: for (int64_t j = 0; j < n; j++) d[j] = (F64_ISNAN(a[j])&&F64_ISNAN(b[j])) ? 0 : (F64_ISNAN(a[j])||F64_ISNAN(b[j])) ? 1 : a[j]!=b[j]; break;
+                case OP_LT: for (int64_t j = 0; j < n; j++) d[j] = (F64_ISNAN(a[j])&&F64_ISNAN(b[j])) ? 0 : F64_ISNAN(a[j]) ? 1 : F64_ISNAN(b[j]) ? 0 : a[j]<b[j]; break;
+                case OP_LE: for (int64_t j = 0; j < n; j++) d[j] = (F64_ISNAN(a[j])&&F64_ISNAN(b[j])) ? 1 : F64_ISNAN(a[j]) ? 1 : F64_ISNAN(b[j]) ? 0 : a[j]<=b[j]; break;
+                case OP_GT: for (int64_t j = 0; j < n; j++) d[j] = (F64_ISNAN(a[j])&&F64_ISNAN(b[j])) ? 0 : F64_ISNAN(b[j]) ? 1 : F64_ISNAN(a[j]) ? 0 : a[j]>b[j]; break;
+                case OP_GE: for (int64_t j = 0; j < n; j++) d[j] = (F64_ISNAN(a[j])&&F64_ISNAN(b[j])) ? 1 : F64_ISNAN(b[j]) ? 1 : F64_ISNAN(a[j]) ? 0 : a[j]>=b[j]; break;
+                default: break;
+            }
+            #undef F64_ISNAN
+        } else if (t1 == RAY_I64) {
+            const int64_t* a = (const int64_t*)ap;
+            const int64_t* b = (const int64_t*)bp;
+            /* Plain comparison — null handling via bitmap post-pass.
+             * Values at null positions are zero (from vector init), which
+             * compares correctly for null-as-minimum semantics when both
+             * input null bitmaps are propagated to the result. */
+            switch (opcode) {
+                case OP_EQ: for (int64_t j = 0; j < n; j++) d[j] = a[j]==b[j]; break;
+                case OP_NE: for (int64_t j = 0; j < n; j++) d[j] = a[j]!=b[j]; break;
+                case OP_LT: for (int64_t j = 0; j < n; j++) d[j] = a[j]<b[j]; break;
+                case OP_LE: for (int64_t j = 0; j < n; j++) d[j] = a[j]<=b[j]; break;
+                case OP_GT: for (int64_t j = 0; j < n; j++) d[j] = a[j]>b[j]; break;
+                case OP_GE: for (int64_t j = 0; j < n; j++) d[j] = a[j]>=b[j]; break;
+                default: break;
+            }
+        } else { /* both bool */
+            const uint8_t* a = (const uint8_t*)ap;
+            const uint8_t* b = (const uint8_t*)bp;
+            switch (opcode) {
+                case OP_AND: for (int64_t j = 0; j < n; j++) d[j] = a[j] && b[j]; break;
+                case OP_OR:  for (int64_t j = 0; j < n; j++) d[j] = a[j] || b[j]; break;
+                default: break;
+            }
+        }
+    }
+}
+
+/* Execute one unary instruction over n elements, reading ap and writing dp.
+ * dt = destination register type, t1 = source register type.  Supported:
+ * f64→f64 math ops, i64→i64 NEG/ABS, bool NOT, plus the i64↔f64 casts, which an
+ * f64/i64 destination takes whenever t1 differs from dt.  Other dt: dp untouched. */
+static void expr_exec_unary(uint8_t opcode, int8_t dt, void* dp,
+                             int8_t t1, const void* ap, int64_t n) {
+    if (dt == RAY_F64) {
+        double* d = (double*)dp;
+        if (t1 == RAY_F64) {
+            const double* a = (const double*)ap;
+            switch (opcode) {
+                case OP_NEG:   for (int64_t j = 0; j < n; j++) d[j] = -a[j]; break;
+                case OP_ABS:   for (int64_t j = 0; j < n; j++) d[j] = fabs(a[j]); break;
+                case OP_SQRT:  for (int64_t j = 0; j < n; j++) d[j] = sqrt(a[j]); break;
+                case OP_LOG:   for (int64_t j = 0; j < n; j++) d[j] = log(a[j]); break;
+                case OP_EXP:   for (int64_t j = 0; j < n; j++) d[j] = exp(a[j]); break;
+                case OP_CEIL:  for (int64_t j = 0; j < n; j++) d[j] = ceil(a[j]); break;
+                case OP_FLOOR: for (int64_t j = 0; j < n; j++) d[j] = floor(a[j]); break;
+                case OP_ROUND: for (int64_t j = 0; j < n; j++) d[j] = round(a[j]); break;
+                default: break;
+            }
+        } else { /* CAST i64→f64 */
+            const int64_t* a = (const int64_t*)ap;
+            for (int64_t j = 0; j < n; j++) d[j] = (double)a[j];
+        }
+    } else if (dt == RAY_I64) {
+        int64_t* d = (int64_t*)dp;
+        if (t1 == RAY_I64) {
+            const int64_t* a = (const int64_t*)ap;
+            switch (opcode) {
+                /* Unsigned negation avoids UB on INT64_MIN */
+                case OP_NEG: for (int64_t j = 0; j < n; j++) d[j] = (int64_t)(-(uint64_t)a[j]); break;
+                case OP_ABS: for (int64_t j = 0; j < n; j++) d[j] = a[j] < 0 ? (int64_t)(-(uint64_t)a[j]) : a[j]; break;
+                default: break;
+            }
+        } else { /* CAST f64→i64 — clamp the range; NaN fails both range tests, so map it to 0 first */
+            const double* a = (const double*)ap;
+            for (int64_t j = 0; j < n; j++)
+                d[j] = (a[j] != a[j]) ? 0 : (a[j] >= (double)INT64_MAX) ? INT64_MAX
+                     : (a[j] <= (double)INT64_MIN) ? INT64_MIN
+                     : (int64_t)a[j];
+        }
+    } else if (dt == RAY_BOOL) {
+        uint8_t* d = (uint8_t*)dp;
+        const uint8_t* a = (const uint8_t*)ap;
+        if (opcode == OP_NOT) for (int64_t j = 0; j < n; j++) d[j] = !a[j]; /* only bool op handled */
+    }
+}
+
+/* Evaluate the compiled expression for rows [start, end) (one morsel, n = end - start).
+ * scratch: array of EXPR_MAX_REGS buffers, each EXPR_MORSEL*8 bytes.
+ * Returns the output register's data, indexed from 0 (= row `start`); NULL when n <= 0. */
+static void* expr_eval_morsel(const ray_expr_t* expr, void** scratch,
+                               int64_t start, int64_t end) {
+    int64_t n = end - start;
+    if (n <= 0) return NULL;
+
+    void* rptrs[EXPR_MAX_REGS]; /* per-register data pointer for this morsel; only [0, n_regs) is filled */
+    for (uint8_t r = 0; r < expr->n_regs; r++) {
+        int8_t rt = expr->regs[r].type;     /* type the register is evaluated as */
+        int8_t ct = expr->regs[r].col_type; /* native type of the bound column */
+        switch (expr->regs[r].kind) {
+            case REG_SCAN: {
+                /* Direct pointer if native type matches, else convert */
+                uint8_t ca = expr->regs[r].col_attrs;
+                if (rt == RAY_F64 && ct == RAY_F64) {
+                    rptrs[r] = (double*)expr->regs[r].data + start;
+                } else if (rt == RAY_I64 && (ct == RAY_I64 || ct == RAY_TIMESTAMP)) {
+                    rptrs[r] = (int64_t*)expr->regs[r].data + start;
+                } else if (rt == RAY_I64 && ct == RAY_SYM &&
+                           (ca & RAY_SYM_W_MASK) == RAY_SYM_W64) {
+                    rptrs[r] = (int64_t*)expr->regs[r].data + start; /* 64-bit-wide symbols read as i64 */
+                } else {
+                    /* Anything else is widened into this register's scratch buffer. */
+                    rptrs[r] = scratch[r];
+                    if (rt == RAY_F64)
+                        expr_load_f64(scratch[r], expr->regs[r].data, ct, ca, start, n);
+                    else
+                        expr_load_i64(scratch[r], expr->regs[r].data, ct, ca, start, n);
+                }
+            }
+                break;
+            case REG_CONST:
+                /* Broadcast the constant across the first n slots of the scratch buffer. */
+                rptrs[r] = scratch[r];
+                if (rt == RAY_F64) {
+                    double v = expr->regs[r].const_f64;
+                    double* d = (double*)scratch[r];
+                    for (int64_t j = 0; j < n; j++) d[j] = v;
+                } else {
+                    int64_t v = expr->regs[r].const_i64;
+                    int64_t* d = (int64_t*)scratch[r];
+                    for (int64_t j = 0; j < n; j++) d[j] = v;
+                }
+                break;
+            default: /* REG_SCRATCH — filled later by an instruction that names it as dst */
+                rptrs[r] = scratch[r];
+                break;
+        }
+    }
+
+    /* Run the instructions in order; src2 == 0xFF marks a unary instruction. */
+    for (uint8_t i = 0; i < expr->n_ins; i++) {
+        const expr_ins_t* ins = &expr->ins[i];
+        int8_t dt = expr->regs[ins->dst].type;
+        if (ins->src2 != 0xFF) {
+            expr_exec_binary(ins->opcode, dt, rptrs[ins->dst],
+                             expr->regs[ins->src1].type, rptrs[ins->src1],
+                             expr->regs[ins->src2].type, rptrs[ins->src2], n);
+        } else {
+            expr_exec_unary(ins->opcode, dt, rptrs[ins->dst],
+                            expr->regs[ins->src1].type, rptrs[ins->src1], n);
+        }
+    }
+
+    return rptrs[expr->out_reg]; /* may be a scratch buffer or a direct pointer into column data */
+}
+
+/* Job description passed (as void* ctx) to expr_full_fn for full-vector expression evaluation. */
+typedef struct {
+    const ray_expr_t* expr;     /* compiled expression evaluated on every morsel */
+    void*  out_data;            /* output base; row i is stored at out_data + i * elem_size */
+    int8_t out_type;            /* output element type, used to derive the element size */
+} expr_full_ctx_t;
+
+static void expr_full_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) {
+    (void)worker_id; /* unused: the worker evaluates rows [start, end) of ctx's expression */
+    expr_full_ctx_t* c = (expr_full_ctx_t*)ctx;
+    const ray_expr_t* expr = c->expr;
+    uint8_t esz = ray_elem_size(c->out_type);
+    /* One allocation is carved into EXPR_MAX_REGS slices of EXPR_MORSEL*8 bytes each. */
+    /* Per-worker scratch buffers (heap-allocated via arena, morsel-sized) */
+    ray_t* scratch_hdr = NULL;
+    char* scratch_mem = (char*)scratch_alloc(&scratch_hdr,
+                            (size_t)EXPR_MAX_REGS * EXPR_MORSEL * 8);
+    if (!scratch_mem) return; /* NOTE(review): failure is silent — rows [start, end) of the output stay unwritten */
+    void* scratch[EXPR_MAX_REGS];
+    for (uint8_t r = 0; r < expr->n_regs; r++)
+        scratch[r] = scratch_mem + (size_t)r * EXPR_MORSEL * 8;
+    /* Walk the range morsel by morsel; the last morsel may be shorter than EXPR_MORSEL. */
+    for (int64_t ms = start; ms < end; ms += EXPR_MORSEL) {
+        int64_t me = (ms + EXPR_MORSEL < end) ? ms + EXPR_MORSEL : end;
+        void* result = expr_eval_morsel(expr, scratch, ms, me);
+        if (result) /* result is morsel-relative, so it is copied to absolute row ms */
+            memcpy((char*)c->out_data + ms * esz, result, (size_t)(me - ms) * esz);
+    }
+    scratch_free(scratch_hdr);
+}
+
+/* Fused-unary post-pass.  -INT64_MIN and |INT64_MIN| cannot be represented in
+ * i64; the k/q convention reports that signed overflow as a typed null.  The
+ * element loops negate with unsigned wrap, so an overflowed slot ends up holding
+ * INT64_MIN.  Each slot in [off, off+len) with that value is rewritten: the data
+ * word becomes 0 (keeps the "null position is 0" invariant) and the null bit is
+ * set.  Must be called single-threaded, after pool dispatch has joined. */
+static void mark_i64_overflow_as_null(ray_t* result, int64_t off, int64_t len) {
+    int64_t* const base = (int64_t*)ray_data(result);
+    const int64_t stop = off + len;
+    for (int64_t idx = off; idx < stop; idx++) {
+        if (!RAY_UNLIKELY(base[idx] == INT64_MIN)) continue;
+        base[idx] = 0;
+        ray_vec_set_null(result, idx, true);
+    }
+}
+
+/* Reports whether the compiled expression ends in a unary OP_NEG / OP_ABS that
+ * reads an i64 register and writes an i64 register, with an i64 overall output.
+ * Those are the fused shapes that may leave INT64_MIN behind via signed overflow. */
+static bool expr_last_op_overflows_i64(const ray_expr_t* expr) {
+    if (expr->n_ins == 0 || expr->out_type != RAY_I64) return false;
+    const expr_ins_t* tail = expr->ins + (expr->n_ins - 1);
+    bool neg_or_abs = (tail->opcode == OP_NEG) || (tail->opcode == OP_ABS);
+    bool is_unary   = (tail->src2 == 0xFF);
+    if (!neg_or_abs || !is_unary) return false;
+    return expr->regs[tail->src1].type == RAY_I64 &&
+           expr->regs[tail->dst].type == RAY_I64;
+}
+
+/* Evaluate compiled expression over parted (segmented) columns into a new nrows vector.
+ * Iterates segments as outer loop, rebinds data pointers per segment (inputs are
+ * not copied), then runs expr_full_fn per segment at the running output offset. */
+static ray_t* expr_eval_full_parted(const ray_expr_t* expr, int64_t nrows) {
+    ray_t* out = ray_vec_new(expr->out_type, nrows);
+    if (!out || RAY_IS_ERR(out)) {
+        return out; /* allocation failure / error object is handed back unchanged */
+    }
+    out->len = nrows;
+
+    /* Find first parted register to get segment structure */
+    ray_t* ref_parted = NULL;
+    for (uint8_t r = 0; r < expr->n_regs; r++) {
+        if (expr->regs[r].is_parted) {
+            ref_parted = expr->regs[r].parted_col;
+            break;
+        }
+    }
+    if (!ref_parted) { ray_release(out); return ray_error("nyi", NULL); }
+
+    int64_t n_segs = ref_parted->len; /* segment count comes from the first parted column */
+    uint8_t esz = ray_elem_size(expr->out_type);
+    ray_pool_t* pool = ray_pool_get();
+    int64_t global_off = 0; /* output row at which the current segment starts */
+
+    for (int64_t s = 0; s < n_segs; s++) {
+        /* Determine segment length from any non-NULL parted register */
+        int64_t seg_len = 0;
+        for (uint8_t r = 0; r < expr->n_regs; r++) {
+            if (expr->regs[r].is_parted) {
+                ray_t** segs = (ray_t**)ray_data(expr->regs[r].parted_col);
+                if (segs[s]) { seg_len = segs[s]->len; break; }
+            }
+        }
+        if (seg_len <= 0) continue; /* empty segment, or segment s missing in every parted column */
+
+        /* Stack-copy expr, rebind parted registers to this segment's data */
+        ray_expr_t seg_expr = *expr;
+        bool seg_ok = true;
+        for (uint8_t r = 0; r < seg_expr.n_regs; r++) {
+            if (seg_expr.regs[r].is_parted) {
+                ray_t** segs = (ray_t**)ray_data(seg_expr.regs[r].parted_col);
+                if (!segs[s]) { seg_ok = false; break; }
+                seg_expr.regs[r].data = ray_data(segs[s]);
+            }
+        }
+        if (!seg_ok) {
+            /* Some parted input lacks this segment: emit zeros for its rows instead of evaluating. */
+            memset((char*)ray_data(out) + global_off * esz, 0,
+                   (size_t)seg_len * esz);
+            global_off += seg_len;
+            continue;
+        }
+        /* NOTE(review): global_off + seg_len is not checked against nrows — confirm callers guarantee it. */
+        expr_full_ctx_t ctx = {
+            .expr = &seg_expr,
+            .out_data = (char*)ray_data(out) + global_off * esz,
+            .out_type = expr->out_type,
+        };
+        if (pool && seg_len >= RAY_PARALLEL_THRESHOLD)
+            ray_pool_dispatch(pool, expr_full_fn, &ctx, seg_len);
+        else
+            expr_full_fn(&ctx, 0, 0, seg_len);
+
+        global_off += seg_len;
+    }
+    if (expr_last_op_overflows_i64(expr)) /* trailing i64 NEG/ABS: wrapped INT64_MIN → typed null */
+        mark_i64_overflow_as_null(out, 0, nrows);
+    return out;
+}
+
+/* Run a compiled expression over nrows rows and return a newly allocated
+ * vector of expr->out_type.  Stands in for exec_node() on expression subtrees,
+ * so no intermediate vectors are built. */
+ray_t* expr_eval_full(const ray_expr_t* expr, int64_t nrows) {
+    if (expr->has_parted) return expr_eval_full_parted(expr, nrows); /* segmented inputs */
+
+    ray_t* vec = ray_vec_new(expr->out_type, nrows);
+    if (!vec || RAY_IS_ERR(vec)) return vec;
+    vec->len = nrows;
+
+    expr_full_ctx_t job;
+    job.expr     = expr;
+    job.out_data = ray_data(vec);
+    job.out_type = expr->out_type;
+
+    /* Large inputs are split across the worker pool; small ones run inline. */
+    ray_pool_t* workers = ray_pool_get();
+    bool go_parallel = workers && nrows >= RAY_PARALLEL_THRESHOLD;
+    if (go_parallel) ray_pool_dispatch(workers, expr_full_fn, &job, nrows);
+    else             expr_full_fn(&job, 0, 0, nrows);
+
+    if (expr_last_op_overflows_i64(expr)) mark_i64_overflow_as_null(vec, 0, nrows);
+    return vec;
+}
+
+/* ============================================================================
+ * Null bitmap propagation for element-wise ops
+ * ============================================================================ */
+
+/* Resolve the raw null bitmap bytes for v and store the bit index of v's element 0
+ * in *bit_offset.  For a slice the parent's bitmap is used.  Returns NULL if the
+ * owner has no HAS_NULLS flag, no bitmap bytes, or a 128-bit map too short for [off, off+len). */
+static const uint8_t* nullmap_bits(ray_t* v, int64_t* bit_offset, int64_t len) {
+    ray_t* target = v; /* vector that actually owns the bitmap */
+    int64_t off = 0;   /* v's first element, counted in target's elements */
+    if (v->attrs & RAY_ATTR_SLICE) {
+        target = v->slice_parent;
+        off = v->slice_offset;
+    }
+    if (!(target->attrs & RAY_ATTR_HAS_NULLS)) return NULL;
+    int64_t resolved_off = 0, len_bits = 0;
+    const uint8_t* bits = ray_vec_nullmap_bytes(target, &resolved_off, &len_bits);
+    if (!bits) return NULL;
+    *bit_offset = off + resolved_off; /* note: written even when the range check below returns NULL */
+    /* A 128-bit map is taken to be the inline buffer; refuse ranges that would
+     * read past it.  (The test uses off, not off + resolved_off.) */
+    if (len_bits == 128 && off + len > 128) return NULL;
+    return bits;
+}
+
+/* Writable null-bitmap bytes of a freshly allocated, non-slice destination.
+ * An external bitmap is used when present; otherwise the inline one, unless
+ * dst is RAY_STR or longer than the 128 bits the inline map covers → NULL. */
+static uint8_t* nullmap_bits_mut(ray_t* dst) {
+    bool has_ext = (dst->attrs & RAY_ATTR_NULLMAP_EXT) != 0;
+    if (has_ext) return (uint8_t*)ray_data(dst->ext_nullmap);
+    bool inline_ok = dst->type != RAY_STR && dst->len <= 128;
+    return inline_ok ? dst->nullmap : NULL;
+}
+
+/* OR-merge src's null bits for elements [0, len) into dst.  Whole bytes are OR-ed
+ * directly when src's bitmap is reachable and byte-aligned; the < 8 tail elements,
+ * misaligned slices and RAY_STR without ext nullmap are handled per element. */
+static void propagate_nulls(ray_t* src, ray_t* dst, int64_t len) {
+    int64_t src_off = 0;
+    const uint8_t* sbits = nullmap_bits(src, &src_off, len);
+    if (!sbits) goto slow; /* no accessible bitmap — use element path */
+
+    if (len > 128 && !(dst->attrs & RAY_ATTR_NULLMAP_EXT))
+        ray_vec_set_null(dst, len - 1, false); /* large vector: force ext nullmap alloc */
+    uint8_t* dbits = nullmap_bits_mut(dst);
+    if (!dbits) goto slow; /* ext alloc failed or RAY_STR */
+
+    if ((src_off % 8) == 0) { /* src byte-aligned: bulk OR, but only over bytes fully inside [0, len) */
+        int64_t byte_start = src_off / 8, full = len / 8;
+        uint8_t seen = 0; /* union of the copied bytes: flag HAS_NULLS only if a bit was really set */
+        for (int64_t b = 0; b < full; b++) { seen |= sbits[byte_start + b]; dbits[b] |= sbits[byte_start + b]; }
+        if (seen) dst->attrs |= RAY_ATTR_HAS_NULLS;
+        /* Tail per element, so bits of src's neighbours past len (e.g. a slice's parent) are never copied. */
+        for (int64_t i = full * 8; i < len; i++) if (ray_vec_is_null(src, i)) ray_vec_set_null(dst, i, true);
+        return;
+    }
+
+slow:
+    for (int64_t i = 0; i < len; i++) {
+        if (ray_vec_is_null(src, i))
+            ray_vec_set_null(dst, i, true);
+    }
+}
+
+/* True when opc lies outside the opcode range OP_EQ..OP_OR, i.e. for ops whose result inherits input nulls.
+ * NOTE(review): relies on comparisons and AND/OR being numbered contiguously in that range; comparison results at nulls are set by fix_null_comparisons. */
+static bool op_propagates_null(uint16_t opc) {
+    return opc < OP_EQ || opc > OP_OR;
+}
+
+/* Null test for a scalar operand, which is either an atom or a one-element
+ * vector.  Vectors go through ray_vec_is_null so that slices resolve correctly. */
+static bool scalar_is_null(ray_t* x) {
+    /* Atom: the flag macro answers directly; otherwise probe element 0. */
+    bool atom = ray_is_atom(x);
+    return atom ? RAY_ATOM_IS_NULL(x) : ray_vec_is_null(x, 0);
+}
+
+/* Conservative test: true if v carries HAS_NULLS or is a slice (a slice's nulls are tracked on its parent). */
+static bool vec_may_have_nulls(ray_t* v) {
+    return (v->attrs & RAY_ATTR_HAS_NULLS) || (v->attrs & RAY_ATTR_SLICE);
+}
+
+/* Data base pointer for v.  A slice yields its parent's buffer and advances
+ * *offset by the slice start; a plain vector leaves *offset untouched. */
+static void* resolve_vec_data(ray_t* v, int64_t* offset) {
+    bool sliced = (v->attrs & RAY_ATTR_SLICE) != 0;
+    ray_t* owner = sliced ? v->slice_parent : v;
+    if (sliced)
+        *offset += v->slice_offset;
+    return ray_data(owner);
+}
+
+/* Post-pass over a comparison result (one byte per row in `result`), rows [0, len).
+ * Fix comparison results at null positions using null-as-minimum semantics.
+ * null == null → true, null < x → true, x > null → true, etc. */
+static void fix_null_comparisons(ray_t* lhs, ray_t* rhs, ray_t* result,
+                                  bool l_scalar, bool r_scalar, int64_t len,
+                                  uint16_t opcode) {
+    uint8_t* dst = (uint8_t*)ray_data(result);
+    bool ln_s = l_scalar && scalar_is_null(lhs);      /* left is a null scalar: null on every row */
+    bool rn_s = r_scalar && scalar_is_null(rhs);      /* right is a null scalar */
+    bool l_has = !l_scalar && vec_may_have_nulls(lhs); /* left vector needs a per-row check */
+    bool r_has = !r_scalar && vec_may_have_nulls(rhs); /* right vector needs a per-row check */
+    if (!ln_s && !rn_s && !l_has && !r_has) return;   /* no nulls possible: result already final */
+
+    for (int64_t i = 0; i < len; i++) {
+        bool ln = ln_s || (l_has && ray_vec_is_null(lhs, i));
+        bool rn = rn_s || (r_has && ray_vec_is_null(rhs, i));
+        if (!ln && !rn) continue; /* both sides present: keep the computed value */
+        /* Both null: equal, so EQ/LE/GE hold and NE/LT/GT do not */
+        if (ln && rn) {
+            dst[i] = (opcode == OP_EQ || opcode == OP_LE || opcode == OP_GE) ? 1 : 0;
+            continue;
+        }
+        /* Left null only (null = minimum): left sorts below right */
+        if (ln) {
+            dst[i] = (opcode == OP_LT || opcode == OP_LE || opcode == OP_NE) ? 1 : 0;
+            continue;
+        }
+        /* Right null only: left sorts above right */
+        dst[i] = (opcode == OP_GT || opcode == OP_GE || opcode == OP_NE) ? 1 : 0;
+    }
+}
+
+/* Mark elements [0, len) of result as null (scalar null broadcast).  Whole bitmap bytes are filled in bulk;
+ * the < 8 tail elements (or all of them when no raw bitmap is reachable) use ray_vec_set_null, so bits past len stay clear. */
+static void set_all_null(ray_t* result, int64_t len) {
+    if (len > 128 && !(result->attrs & RAY_ATTR_NULLMAP_EXT))
+        ray_vec_set_null(result, len - 1, false); /* force ext alloc */
+    uint8_t* dbits = nullmap_bits_mut(result);
+    int64_t bulk = dbits ? len / 8 : 0; /* number of bitmap bytes lying entirely inside [0, len) */
+    if (bulk > 0) {
+        memset(dbits, 0xFF, (size_t)bulk); result->attrs |= RAY_ATTR_HAS_NULLS;
+    }
+    for (int64_t i = bulk * 8; i < len; i++) ray_vec_set_null(result, i, true);
+}
+
+/* Null propagation for a binary op.  A null scalar operand on either side nulls
+ * the entire result; otherwise the null bitmap of each vector operand that may
+ * hold nulls is OR-ed into the result. */
+static void propagate_nulls_binary(ray_t* lhs, ray_t* rhs, ray_t* result,
+                                   bool l_scalar, bool r_scalar, int64_t len) {
+    bool scalar_null = (l_scalar && scalar_is_null(lhs)) ||
+                       (r_scalar && scalar_is_null(rhs));
+    if (scalar_null) { set_all_null(result, len); return; }
+
+    if (!l_scalar && vec_may_have_nulls(lhs)) propagate_nulls(lhs, result, len);
+    if (!r_scalar && vec_may_have_nulls(rhs)) propagate_nulls(rhs, result, len);
+}
+
+/* ============================================================================
+ * Element-wise execution
+ * ============================================================================ */
+
+/* Non-fused element-wise unary op: applies op->opcode to every element of input and
+ * returns a new vector of op->out_type (a NULL or error input is returned unchanged).
+ * Covers f64/i64 math, bool NOT, narrow-int CAST and ISNULL; other type/opcode
+ * combinations leave the data buffer unwritten.  Null bits are carried over afterwards. */
+ray_t* exec_elementwise_unary(ray_graph_t* g, ray_op_t* op, ray_t* input) {
+    (void)g;
+    if (!input || RAY_IS_ERR(input)) return input;
+    int64_t len = input->len;
+    int8_t in_type = input->type;
+    int8_t out_type = op->out_type;
+
+    ray_t* result = ray_vec_new(out_type, len);
+    if (!result || RAY_IS_ERR(result)) return result;
+    result->len = len;
+
+    ray_morsel_t m;
+    ray_morsel_init(&m, input);
+    int64_t out_off = 0;
+
+    while (ray_morsel_next(&m)) {
+        int64_t n = m.morsel_len;
+        void* dst = (char*)ray_data(result) + out_off * ray_elem_size(out_type);
+
+        if (in_type == RAY_F64 || in_type == RAY_I64) {
+            for (int64_t i = 0; i < n; i++) {
+                if (in_type == RAY_F64) {
+                    double v = ((double*)m.morsel_ptr)[i];
+                    double r;
+                    switch (op->opcode) {
+                        case OP_NEG:   r = -v; break;
+                        case OP_ABS:   r = fabs(v); break;
+                        case OP_SQRT:  r = sqrt(v); break;
+                        case OP_LOG:   r = log(v); break;
+                        case OP_EXP:   r = exp(v); break;
+                        case OP_CEIL:  r = ceil(v); break;
+                        case OP_FLOOR: r = floor(v); break;
+                        case OP_ROUND: r = round(v); break;
+                        default:       r = v; break;
+                    }
+                    if (out_type == RAY_F64) ((double*)dst)[i] = r;
+                    else if (out_type == RAY_I64) /* clamp as expr_exec_unary does; NaN → 0 (a raw cast is UB) */
+                        ((int64_t*)dst)[i] = (r != r) ? 0 : (r >= (double)INT64_MAX) ? INT64_MAX
+                                           : (r <= (double)INT64_MIN) ? INT64_MIN : (int64_t)r;
+                } else {
+                    int64_t v = ((int64_t*)m.morsel_ptr)[i];
+                    if (out_type == RAY_I64) {
+                        int64_t r;
+                        switch (op->opcode) { /* unsigned negation avoids UB on INT64_MIN */
+                            case OP_NEG: r = (int64_t)(-(uint64_t)v); break;
+                            case OP_ABS: r = v < 0 ? (int64_t)(-(uint64_t)v) : v; break;
+                            default:     r = v; break;
+                        }
+                        ((int64_t*)dst)[i] = r;
+                    } else if (out_type == RAY_F64) {
+                        double r;
+                        switch (op->opcode) {
+                            case OP_NEG:   r = -(double)v; break;
+                            case OP_ABS:   r = fabs((double)v); break;
+                            case OP_SQRT:  r = sqrt((double)v); break;
+                            case OP_LOG:   r = log((double)v); break;
+                            case OP_EXP:   r = exp((double)v); break;
+                            default:       r = (double)v; break;
+                        }
+                        ((double*)dst)[i] = r;
+                    } else if (out_type == RAY_BOOL) {
+                        ((uint8_t*)dst)[i] = 0; /* bool output from i64 input defaults to false */
+                    }
+                }
+            }
+        } else if (in_type == RAY_BOOL && op->opcode == OP_NOT) {
+            for (int64_t i = 0; i < n; i++)
+                ((uint8_t*)dst)[i] = !((uint8_t*)m.morsel_ptr)[i];
+        } else if (op->opcode == OP_CAST) {
+            /* CAST from narrow integer types (I32/I16/U8/BOOL) to I64/F64 */
+            for (int64_t i = 0; i < n; i++) {
+                int64_t v = 0;
+                if (in_type == RAY_I32 || in_type == RAY_DATE || in_type == RAY_TIME)
+                    v = (int64_t)((int32_t*)m.morsel_ptr)[i];
+                else if (in_type == RAY_I16)
+                    v = (int64_t)((int16_t*)m.morsel_ptr)[i];
+                else if (in_type == RAY_U8 || in_type == RAY_BOOL)
+                    v = (int64_t)((uint8_t*)m.morsel_ptr)[i];
+                if (out_type == RAY_I64)       ((int64_t*)dst)[i] = v;
+                else if (out_type == RAY_F64)  ((double*)dst)[i] = (double)v;
+            }
+        }
+
+        out_off += n;
+    }
+
+    /* ISNULL writes every output byte here (1 = null, 0 = present), so the answer never depends on
+     * the input type's branch above or on the buffer's initial contents.  Other ops inherit null bits. */
+    bool may_null = vec_may_have_nulls(input);
+    if (op->opcode == OP_ISNULL) {
+        uint8_t* flags = (uint8_t*)ray_data(result);
+        for (int64_t i = 0; i < len; i++)
+            flags[i] = (may_null && ray_vec_is_null(input, i)) ? 1 : 0;
+    } else if (may_null) {
+        propagate_nulls(input, result, len);
+    }
+
+    /* OP_NEG/OP_ABS over i64: -INT64_MIN and |INT64_MIN| don't fit — surface as typed null (k/q
+     * convention).  The loop used unsigned wrap, so such slots hold INT64_MIN; convert them to null. */
+    if (out_type == RAY_I64 && in_type == RAY_I64 && (op->opcode == OP_NEG || op->opcode == OP_ABS))
+        mark_i64_overflow_as_null(result, 0, len);
+
+    return result;
+}
+
+/* String-comparison kernel for rows [start, end) of a binary element-wise op.
+ * Stores one byte per row at result[start..end): the truth value of
+ * EQ/NE/LT/LE/GT/GE, or 0 for any other opcode.  A scalar operand is
+ * converted once with atom_to_str_t and then shared by every row; a vector
+ * operand is resolved once with str_resolve and indexed per row. */
+static void binary_range_str(ray_op_t* op, ray_t* lhs, ray_t* rhs, ray_t* result,
+                             bool l_scalar, bool r_scalar,
+                             int64_t start, int64_t end) {
+    const uint16_t code = op->opcode;
+    const int64_t count = end - start;
+    uint8_t* out = (uint8_t*)ray_data(result) + start;
+
+    /* Vector operands: element array advanced to `start`, plus its string pool. */
+    const ray_str_t* l_rows = NULL;
+    const ray_str_t* r_rows = NULL;
+    const char* l_chars = NULL;
+    const char* r_chars = NULL;
+    if (!l_scalar) { str_resolve(lhs, &l_rows, &l_chars); l_rows += start; }
+    if (!r_scalar) { str_resolve(rhs, &r_rows, &r_chars); r_rows += start; }
+
+    /* Scalar operands: a single converted element that every row reuses. */
+    ray_str_t l_one = {0}, r_one = {0};
+    if (l_scalar) { atom_to_str_t(lhs, &l_one, &l_chars); l_rows = &l_one; }
+    if (r_scalar) { atom_to_str_t(rhs, &r_one, &r_chars); r_rows = &r_one; }
+    const int64_t l_step = l_scalar ? 0 : 1;
+    const int64_t r_step = r_scalar ? 0 : 1;
+
+    /* A step of 0 pins the scalar side to its single element. */
+    for (int64_t k = 0; k < count; k++) {
+        const ray_str_t* a = l_rows + k * l_step;
+        const ray_str_t* b = r_rows + k * r_step;
+        uint8_t hit;
+        /* Equality ops use ray_str_t_eq, ordering ops use ray_str_t_cmp. */
+        switch (code) {
+            case OP_EQ: hit = ray_str_t_eq(a, l_chars, b, r_chars); break;
+            case OP_NE: hit = !ray_str_t_eq(a, l_chars, b, r_chars); break;
+            case OP_LT: hit = ray_str_t_cmp(a, l_chars, b, r_chars) < 0; break;
+            case OP_LE: hit = ray_str_t_cmp(a, l_chars, b, r_chars) <= 0; break;
+            case OP_GT: hit = ray_str_t_cmp(a, l_chars, b, r_chars) > 0; break;
+            case OP_GE: hit = ray_str_t_cmp(a, l_chars, b, r_chars) >= 0; break;
+            default: hit = 0; break;
+        }
+        out[k] = hit;
+    }
+}
+
+/* Context for parallel RAY_STR binary dispatch; par_binary_str_fn forwards every field to binary_range_str. */
+typedef struct {
+    ray_op_t* op;        /* operation; its opcode selects the comparison */
+    ray_t*    lhs;       /* left operand (vector, or atom when l_scalar) */
+    ray_t*    rhs;       /* right operand (vector, or atom when r_scalar) */
+    ray_t*    result;    /* output vector, one byte per row */
+    bool     l_scalar;   /* true: lhs is a scalar shared by all rows */
+    bool     r_scalar;   /* true: rhs is a scalar shared by all rows */
+} par_binary_str_ctx_t;
+
+static void par_binary_str_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) {
+    /* Worker entry point: unpack the context and compare rows [start, end). */
+    (void)worker_id;
+    const par_binary_str_ctx_t* job = (const par_binary_str_ctx_t*)ctx;
+    binary_range_str(job->op, job->lhs, job->rhs, job->result, job->l_scalar, job->r_scalar, start, end);
+}
+
+/* Inner loop for binary element-wise over a range [start, end) */
+static void binary_range(ray_op_t* op, int8_t out_type,
+                         ray_t* lhs, ray_t* rhs, ray_t* result,
+                         bool l_scalar, bool r_scalar,
+                         double l_f64, double r_f64,
+                         int64_t l_i64, int64_t r_i64,
+                         int64_t start, int64_t end) {
+    uint8_t out_esz = ray_elem_size(out_type);
+    void* dst = (char*)ray_data(result) + start * out_esz;
+    int64_t n = end - start;
+
+    /* Pointers into source data at offset start */
+    double* lp_f64 = NULL; int64_t* lp_i64 = NULL; uint8_t* lp_bool = NULL;
+    double* rp_f64 = NULL; int64_t* rp_i64 = NULL; uint8_t* rp_bool = NULL;
+
+    int32_t* lp_i32 = NULL; uint32_t* lp_u32 = NULL; int16_t* lp_i16 = NULL;
+    int32_t* rp_i32 = NULL; uint32_t* rp_u32 = NULL; int16_t* rp_i16 = NULL;
+
+    /* VLA bound of zero is UB; guarantee >=1 slot.  The fill loops below
+     * are bounded by n so extra slots are harmless. */
+    int64_t _sym_buf_n = n ? n : 1;
+    int64_t lsym_buf[_sym_buf_n], rsym_buf[_sym_buf_n]; /* stack VLA for narrow RAY_SYM (n<=1024) */
+    if (!l_scalar) {
+        int64_t l_off = start;
+        void* l_data = resolve_vec_data(lhs, &l_off);
+        void* lbase = (char*)l_data + l_off * ray_sym_elem_size(lhs->type, lhs->attrs);
+        if (lhs->type == RAY_F64) lp_f64 = (double*)lbase;
+        else if (lhs->type == RAY_I64 || lhs->type == RAY_TIMESTAMP) lp_i64 = (int64_t*)lbase;
+        else if (RAY_IS_SYM(lhs->type)) {
+            uint8_t w = lhs->attrs & RAY_SYM_W_MASK;
+            if (w == RAY_SYM_W64) lp_i64 = (int64_t*)lbase;
+            else if (w == RAY_SYM_W32) lp_u32 = (uint32_t*)lbase;
+            else { for (int64_t j = 0; j < n; j++) lsym_buf[j] = ray_read_sym(l_data, l_off+j, lhs->type, lhs->attrs); lp_i64 = lsym_buf; }
+        }
+        else if (lhs->type == RAY_I32 || lhs->type == RAY_DATE || lhs->type == RAY_TIME) lp_i32 = (int32_t*)lbase;
+        else if (lhs->type == RAY_I16) lp_i16 = (int16_t*)lbase;
+        else if (lhs->type == RAY_BOOL || lhs->type == RAY_U8) lp_bool = (uint8_t*)lbase;
+    }
+    if (!r_scalar) {
+        int64_t r_off = start;
+        void* r_data = resolve_vec_data(rhs, &r_off);
+        void* rbase = (char*)r_data + r_off * ray_sym_elem_size(rhs->type, rhs->attrs);
+        if (rhs->type == RAY_F64) rp_f64 = (double*)rbase;
+        else if (rhs->type == RAY_I64 || rhs->type == RAY_TIMESTAMP) rp_i64 = (int64_t*)rbase;
+        else if (RAY_IS_SYM(rhs->type)) {
+            uint8_t w = rhs->attrs & RAY_SYM_W_MASK;
+            if (w == RAY_SYM_W64) rp_i64 = (int64_t*)rbase;
+            else if (w == RAY_SYM_W32) rp_u32 = (uint32_t*)rbase;
+            else { for (int64_t j = 0; j < n; j++) rsym_buf[j] = ray_read_sym(r_data, r_off+j, rhs->type, rhs->attrs); rp_i64 = rsym_buf; }
+        }
+        else if (rhs->type == RAY_I32 || rhs->type == RAY_DATE || rhs->type == RAY_TIME) rp_i32 = (int32_t*)rbase;
+        else if (rhs->type == RAY_I16) rp_i16 = (int16_t*)rbase;
+        else if (rhs->type == RAY_BOOL || rhs->type == RAY_U8) rp_bool = (uint8_t*)rbase;
+    }
+
+    for (int64_t i = 0; i < n; i++) {
+        double lv, rv;
+        if (lp_f64)       lv = lp_f64[i];
+        else if (lp_i64)  lv = (double)lp_i64[i];
+        else if (lp_i32)  lv = (double)lp_i32[i];
+        else if (lp_u32)  lv = (double)lp_u32[i];
+        else if (lp_i16)  lv = (double)lp_i16[i];
+        else if (lp_bool) lv = (double)lp_bool[i];
+        else if (l_scalar && (lhs->type == -RAY_F64 || lhs->type == RAY_F64)) lv = l_f64;
+        else              lv = (double)l_i64;
+
+        if (rp_f64)       rv = rp_f64[i];
+        else if (rp_i64)  rv = (double)rp_i64[i];
+        else if (rp_i32)  rv = (double)rp_i32[i];
+        else if (rp_u32)  rv = (double)rp_u32[i];
+        else if (rp_i16)  rv = (double)rp_i16[i];
+        else if (rp_bool) rv = (double)rp_bool[i];
+        else if (r_scalar && (rhs->type == -RAY_F64 || rhs->type == RAY_F64)) rv = r_f64;
+        else              rv = (double)r_i64;
+
+        if (out_type == RAY_F64) {
+            double r;
+            switch (op->opcode) {
+                case OP_ADD: r = lv + rv; break;
+                case OP_SUB: r = lv - rv; break;
+                case OP_MUL: r = lv * rv; break;
+                case OP_DIV: r = rv != 0.0 ? lv / rv : NAN; break;
+                case OP_MOD: { if (rv != 0.0) { r = fmod(lv, rv); if (r && ((r > 0) != (rv > 0))) r += rv; } else { r = NAN; } } break;
+                case OP_MIN2: r = lv < rv ? lv : rv; break;
+                case OP_MAX2: r = lv > rv ? lv : rv; break;
+                default: r = 0.0; break;
+            }
+            ((double*)dst)[i] = r;
+        } else if (out_type == RAY_I64 || out_type == RAY_TIMESTAMP) {
+            int64_t li = (int64_t)lv, ri = (int64_t)rv;
+            int64_t r;
+            switch (op->opcode) {
+                case OP_ADD: r = (int64_t)((uint64_t)li + (uint64_t)ri); break;
+                case OP_SUB: r = (int64_t)((uint64_t)li - (uint64_t)ri); break;
+                case OP_MUL: r = (int64_t)((uint64_t)li * (uint64_t)ri); break;
+                case OP_DIV:
+                    if (ri==0 || (ri==-1 && li==((int64_t)1<<63))) { r = 0; }
+                    else { r = li/ri; if ((li^ri)<0 && r*ri!=li) r--; }
+                    break;
+                case OP_MOD:
+                    if (ri==0 || (ri==-1 && li==((int64_t)1<<63))) { r = 0; }
+                    else { r = li%ri; if (r && (r^ri)<0) r+=ri; }
+                    break;
+                case OP_MIN2: r = li < ri ? li : ri; break;
+                case OP_MAX2: r = li > ri ? li : ri; break;
+                default: r = 0; break;
+            }
+            ((int64_t*)dst)[i] = r;
+        } else if (out_type == RAY_I32 || out_type == RAY_DATE || out_type == RAY_TIME) {
+            int32_t li = (int32_t)lv, ri = (int32_t)rv;
+            int32_t r;
+            switch (op->opcode) {
+                case OP_ADD: r = (int32_t)((uint32_t)li + (uint32_t)ri); break;
+                case OP_SUB: r = (int32_t)((uint32_t)li - (uint32_t)ri); break;
+                case OP_MUL: r = (int32_t)((uint32_t)li * (uint32_t)ri); break;
+                case OP_DIV:
+                    if (ri==0 || (ri==-1 && li==((int32_t)1<<31))) { r = 0; }
+                    else { r = li/ri; if ((li^ri)<0 && r*ri!=li) r--; }
+                    break;
+                case OP_MOD:
+                    if (ri==0 || (ri==-1 && li==((int32_t)1<<31))) { r = 0; }
+                    else { r = li%ri; if (r && (r^ri)<0) r+=ri; }
+                    break;
+                case OP_MIN2: r = li < ri ? li : ri; break;
+                case OP_MAX2: r = li > ri ? li : ri; break;
+                default: r = 0; break;
+            }
+            ((int32_t*)dst)[i] = r;
+        } else if (out_type == RAY_I16) {
+            int16_t li = (int16_t)lv, ri = (int16_t)rv;
+            int16_t r;
+            switch (op->opcode) {
+                case OP_ADD: r = (int16_t)((uint16_t)li + (uint16_t)ri); break;
+                case OP_SUB: r = (int16_t)((uint16_t)li - (uint16_t)ri); break;
+                case OP_MUL: r = (int16_t)((uint16_t)li * (uint16_t)ri); break;
+                case OP_DIV: r = ri ? li / ri : 0; break;
+                case OP_MOD: r = ri ? li % ri : 0; break;
+                case OP_MIN2: r = li < ri ? li : ri; break;
+                case OP_MAX2: r = li > ri ? li : ri; break;
+                default: r = 0; break;
+            }
+            ((int16_t*)dst)[i] = r;
+        } else if (out_type == RAY_U8) {
+            uint8_t li = (uint8_t)lv, ri = (uint8_t)rv;
+            uint8_t r;
+            switch (op->opcode) {
+                case OP_ADD: r = li + ri; break;
+                case OP_SUB: r = li - ri; break;
+                case OP_MUL: r = li * ri; break;
+                case OP_DIV: r = ri ? li / ri : 0; break;
+                case OP_MOD: r = ri ? li % ri : 0; break;
+                case OP_MIN2: r = li < ri ? li : ri; break;
+                case OP_MAX2: r = li > ri ? li : ri; break;
+                default: r = 0; break;
+            }
+            ((uint8_t*)dst)[i] = r;
+        } else if (out_type == RAY_BOOL) {
+            /* Read raw I64 values directly for null-aware comparison
+             * when both operands are I64/I32-family (not F64). */
+            int src_is_i64 = (lp_i64 || lp_i32 || lp_u32 || lp_i16 ||
+                              (l_scalar && lhs->type != -RAY_F64 && lhs->type != RAY_F64)) &&
+                             (rp_i64 || rp_i32 || rp_u32 || rp_i16 ||
+                              (r_scalar && rhs->type != -RAY_F64 && rhs->type != RAY_F64));
+            int64_t li64 = (int64_t)lv, ri64 = (int64_t)rv;
+            uint8_t r;
+            if (src_is_i64) {
+                /* No sentinel nulls — fix_null_comparisons handles null positions */
+                switch (op->opcode) {
+                    case OP_EQ: r = li64==ri64; break;
+                    case OP_NE: r = li64!=ri64; break;
+                    case OP_LT: r = li64<ri64; break;
+                    case OP_LE: r = li64<=ri64; break;
+                    case OP_GT: r = li64>ri64; break;
+                    case OP_GE: r = li64>=ri64; break;
+                    case OP_AND: r = (uint8_t)lv && (uint8_t)rv; break;
+                    case OP_OR:  r = (uint8_t)lv || (uint8_t)rv; break;
+                    default: r = 0; break;
+                }
+            } else {
+                /* Null-aware F64 comparisons: NaN is null sentinel */
+                int ln = (lv != lv), rn = (rv != rv); /* NaN check */
+                switch (op->opcode) {
+                    case OP_EQ:  r = (ln&&rn) ? 1 : (ln||rn) ? 0 : lv==rv; break;
+                    case OP_NE:  r = (ln&&rn) ? 0 : (ln||rn) ? 1 : lv!=rv; break;
+                    case OP_LT:  r = (ln&&rn) ? 0 : ln ? 1 : rn ? 0 : lv<rv; break;
+                    case OP_LE:  r = (ln&&rn) ? 1 : ln ? 1 : rn ? 0 : lv<=rv; break;
+                    case OP_GT:  r = (ln&&rn) ? 0 : rn ? 1 : ln ? 0 : lv>rv; break;
+                    case OP_GE:  r = (ln&&rn) ? 1 : rn ? 1 : ln ? 0 : lv>=rv; break;
+                    case OP_AND: r = (uint8_t)lv && (uint8_t)rv; break;
+                    case OP_OR:  r = (uint8_t)lv || (uint8_t)rv; break;
+                    default: r = 0; break;
+                }
+            }
+            ((uint8_t*)dst)[i] = r;
+        }
+    }
+}
+
+/* Arguments of one binary_range call, handed to pool workers through par_binary_fn */
+typedef struct {
+    ray_op_t* op;           /* operation node (binary_range switches on op->opcode) */
+    int8_t   out_type;      /* element type of the result vector */
+    ray_t*    lhs;          /* left operand */
+    ray_t*    rhs;          /* right operand */
+    ray_t*    result;       /* output vector the workers write into */
+    bool     l_scalar;      /* left side is an atom or length-1 vector (broadcast) */
+    bool     r_scalar;      /* right side is an atom or length-1 vector (broadcast) */
+    double   l_f64, r_f64;  /* scalar operand values when the scalar is F64 */
+    int64_t  l_i64, r_i64;  /* scalar operand values otherwise */
+} par_binary_ctx_t;
+
+/* Pool worker entry: runs binary_range over rows [start, end) using the shared context. */
+static void par_binary_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) {
+    const par_binary_ctx_t* job = (const par_binary_ctx_t*)ctx;
+    (void)worker_id;
+    binary_range(job->op, job->out_type, job->lhs, job->rhs, job->result,
+                 job->l_scalar, job->r_scalar, job->l_f64, job->r_f64,
+                 job->l_i64, job->r_i64, start, end);
+}
+/* Element-wise binary op: applies op->opcode to lhs/rhs, broadcasting a scalar side; returns a new op->out_type vector or an error. */
+ray_t* exec_elementwise_binary(ray_graph_t* g, ray_op_t* op, ray_t* lhs, ray_t* rhs) {
+    (void)g;
+    if (!lhs || RAY_IS_ERR(lhs)) return lhs;
+    if (!rhs || RAY_IS_ERR(rhs)) return rhs;
+    /* A side counts as scalar when it is an atom or a length-1 vector */
+    bool l_scalar = ray_is_atom(lhs) || (lhs->type > 0 && lhs->len == 1);
+    bool r_scalar = ray_is_atom(rhs) || (rhs->type > 0 && rhs->len == 1);
+    /* Output length: the vector side's length, or 1 when both sides are scalar */
+    int64_t len = 1;
+    if (!l_scalar && !r_scalar) {
+        if (lhs->len != rhs->len) return ray_error("length", NULL);
+        len = lhs->len;
+    } else if (l_scalar && !r_scalar) {
+        len = rhs->len;
+    } else if (!l_scalar && r_scalar) {
+        len = lhs->len;
+    }
+
+    int8_t out_type = op->out_type;
+    ray_t* result = ray_vec_new(out_type, len);
+    if (!result || RAY_IS_ERR(result)) return result;
+    result->len = len;
+
+    /* RAY_STR comparison (binary_range_str path).  Taken when either side is a RAY_STR vector, or when
+       both sides are string-like scalars: -RAY_STR atom, length-1 RAY_STR vector, or SYM atom. */
+    {
+        bool l_is_str = (!l_scalar && lhs->type == RAY_STR);
+        bool r_is_str = (!r_scalar && rhs->type == RAY_STR);
+        bool l_atom_str = (l_scalar && (lhs->type == -RAY_STR
+                          || lhs->type == RAY_STR
+                          || (RAY_IS_SYM(lhs->type) && ray_is_atom(lhs))));
+        bool r_atom_str = (r_scalar && (rhs->type == -RAY_STR
+                          || rhs->type == RAY_STR
+                          || (RAY_IS_SYM(rhs->type) && ray_is_atom(rhs))));
+
+        if (l_is_str || r_is_str || (l_atom_str && r_atom_str)) {
+            /* Only opcodes in OP_EQ..OP_GE are accepted here; anything else is a type error */
+            uint16_t opc = op->opcode;
+            if (opc < OP_EQ || opc > OP_GE) { ray_release(result); return ray_error("type", NULL); }
+            /* Operand shape checks for the string path:
+               a scalar side must be string-like (l_atom_str / r_atom_str above),
+               a vector side must be a RAY_STR vector; otherwise return a type error. */
+            if (l_scalar && !l_atom_str) { ray_release(result); return ray_error("type", NULL); }
+            if (r_scalar && !r_atom_str) { ray_release(result); return ray_error("type", NULL); }
+            if (!l_scalar && !l_is_str) { ray_release(result); return ray_error("type", NULL); }
+            if (!r_scalar && !r_is_str) { ray_release(result); return ray_error("type", NULL); }
+
+            ray_pool_t* pool = ray_pool_get();
+            if (pool && len >= RAY_PARALLEL_THRESHOLD) {
+                par_binary_str_ctx_t ctx = {
+                    .op = op, .lhs = lhs, .rhs = rhs, .result = result,
+                    .l_scalar = l_scalar, .r_scalar = r_scalar,
+                };
+                ray_pool_dispatch(pool, par_binary_str_fn, &ctx, len);
+                fix_null_comparisons(lhs, rhs, result, l_scalar, r_scalar, len, op->opcode);
+                return result;
+            }
+            binary_range_str(op, lhs, rhs, result, l_scalar, r_scalar, 0, len);
+            fix_null_comparisons(lhs, rhs, result, l_scalar, r_scalar, len, op->opcode);
+            return result;
+        }
+    }
+
+    /* SYM vs STR comparison: resolve string constant to intern ID so we
+       can compare numerically against SYM intern indices.
+       ray_sym_find returns -1 if string not in table → no match. */
+    bool str_resolved = false;
+    int64_t resolved_sym_id = 0;
+    if (r_scalar && rhs->type == -RAY_STR &&
+        RAY_IS_SYM(lhs->type)) {
+        const char* s = ray_str_ptr(rhs);
+        size_t slen = ray_str_len(rhs);
+        resolved_sym_id = ray_sym_find(s, slen);
+        str_resolved = true;
+    } else if (l_scalar && lhs->type == -RAY_STR &&
+               RAY_IS_SYM(rhs->type)) {
+        const char* s = ray_str_ptr(lhs);
+        size_t slen = ray_str_len(lhs);
+        resolved_sym_id = ray_sym_find(s, slen);
+        str_resolved = true;
+    }
+    /* Read each scalar operand once; binary_range receives it as an f64 or an i64 value */
+    double l_f64_val = 0, r_f64_val = 0;
+    int64_t l_i64_val = 0, r_i64_val = 0;
+    if (l_scalar) {
+        if (str_resolved && lhs->type == -RAY_STR)
+            l_i64_val = resolved_sym_id;
+        else if (ray_is_atom(lhs)) {
+            if (lhs->type == -RAY_F64) l_f64_val = lhs->f64;
+            else if (lhs->type == -RAY_I32 || lhs->type == -RAY_DATE || lhs->type == -RAY_TIME)
+                l_i64_val = (int64_t)lhs->i32;
+            else if (lhs->type == -RAY_I16) l_i64_val = (int64_t)lhs->i16;
+            else if (lhs->type == -RAY_U8 || lhs->type == -RAY_BOOL) l_i64_val = (int64_t)lhs->u8;
+            else l_i64_val = lhs->i64;
+        } else {
+            int8_t t = lhs->type;
+            int64_t elem = 0;
+            void* data = resolve_vec_data(lhs, &elem);
+            if (t == RAY_F64) l_f64_val = ((double*)data)[elem];
+            else l_i64_val = read_col_i64(data, elem, t, lhs->attrs);
+        }
+    }
+    if (r_scalar) {
+        if (str_resolved && rhs->type == -RAY_STR)
+            r_i64_val = resolved_sym_id;
+        else if (ray_is_atom(rhs)) {
+            if (rhs->type == -RAY_F64) r_f64_val = rhs->f64;
+            else if (rhs->type == -RAY_I32 || rhs->type == -RAY_DATE || rhs->type == -RAY_TIME)
+                r_i64_val = (int64_t)rhs->i32;
+            else if (rhs->type == -RAY_I16) r_i64_val = (int64_t)rhs->i16;
+            else if (rhs->type == -RAY_U8 || rhs->type == -RAY_BOOL) r_i64_val = (int64_t)rhs->u8;
+            else r_i64_val = rhs->i64;
+        } else {
+            int8_t t = rhs->type;
+            int64_t elem = 0;
+            void* data = resolve_vec_data(rhs, &elem);
+            if (t == RAY_F64) r_f64_val = ((double*)data)[elem];
+            else r_i64_val = read_col_i64(data, elem, t, rhs->attrs);
+        }
+    }
+    /* With a pool and len >= RAY_PARALLEL_THRESHOLD the range is dispatched to workers; otherwise run inline */
+    ray_pool_t* pool = ray_pool_get();
+    if (pool && len >= RAY_PARALLEL_THRESHOLD) {
+        par_binary_ctx_t ctx = {
+            .op = op, .out_type = out_type,
+            .lhs = lhs, .rhs = rhs, .result = result,
+            .l_scalar = l_scalar, .r_scalar = r_scalar,
+            .l_f64 = l_f64_val, .r_f64 = r_f64_val,
+            .l_i64 = l_i64_val, .r_i64 = r_i64_val,
+        };
+        ray_pool_dispatch(pool, par_binary_fn, &ctx, len);
+    } else {
+        binary_range(op, out_type, lhs, rhs, result,
+                     l_scalar, r_scalar,
+                     l_f64_val, r_f64_val, l_i64_val, r_i64_val,
+                     0, len);
+    }
+
+    /* Null handling: opcodes flagged by op_propagates_null() inherit input nulls; the rest are fixed up as comparisons */
+    if (op_propagates_null(op->opcode))
+        propagate_nulls_binary(lhs, rhs, result, l_scalar, r_scalar, len);
+    else
+        fix_null_comparisons(lhs, rhs, result, l_scalar, r_scalar, len, op->opcode);
+
+    /* Div/mod: mark zero-divisor positions as null.
+     * The morsel loop writes 0 for b==0 but can't set bitmap nulls. */
+    uint16_t opc = op->opcode;
+    if (opc == OP_DIV || opc == OP_MOD) {
+        if (!r_scalar) {
+            int8_t rt = rhs->type;
+            if (rt == RAY_I64 || rt == RAY_TIMESTAMP) {
+                const int64_t* b = (const int64_t*)ray_data(rhs); /* NOTE(review): binary_range reads rhs via resolve_vec_data — confirm ray_data is equivalent here */
+                for (int64_t i = 0; i < len; i++)
+                    if (b[i] == 0) ray_vec_set_null(result, i, true);
+            } else if (rt == RAY_I32 || rt == RAY_DATE || rt == RAY_TIME) {
+                const int32_t* b = (const int32_t*)ray_data(rhs);
+                for (int64_t i = 0; i < len; i++)
+                    if (b[i] == 0) ray_vec_set_null(result, i, true);
+            } /* NOTE(review): I16/U8/BOOL divisor vectors are not scanned here — confirm intended */
+            /* F64 div-by-zero produces NaN which is handled by propagate_nulls */
+        } else {
+            /* Scalar divisor: check for zero using the correct type */
+            bool is_zero = false;
+            if (rhs->type == -RAY_F64 || rhs->type == RAY_F64)
+                is_zero = (r_f64_val == 0.0);
+            else
+                is_zero = (r_i64_val == 0);
+            if (is_zero) {
+                for (int64_t i = 0; i < len; i++)
+                    ray_vec_set_null(result, i, true);
+            }
+        }
+    }
+
+    return result;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/filter.c b/crates/rayforce-sys/vendor/rayforce/src/ops/filter.c
new file mode 100644
index 0000000..c7ba85c
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/filter.c
@@ -0,0 +1,685 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "ops/internal.h"
+#include "ops/rowsel.h"
+
+/* ============================================================================
+ * Filter execution — extracted from exec.c
+ * ============================================================================ */
+
+/* Copy the rows named by match_idx (global row numbers, sorted ascending) out of
+ * a parted column into dst_col with one advancing cursor — O(count + n_segs). */
+static void parted_gather_col(ray_t* parted_col, const int64_t* match_idx,
+                               int64_t count, ray_t* dst_col) {
+    const int64_t n_segs = parted_col->len;
+    if (n_segs == 0) return;  /* the VLA below must not have length zero */
+    ray_t** segs = (ray_t**)ray_data(parted_col);
+    const int8_t base = (int8_t)RAY_PARTED_BASETYPE(parted_col->type);
+    uint8_t base_attrs = 0;
+    if (base == RAY_SYM) base_attrs = parted_first_attrs(segs, n_segs);
+    const uint8_t esz = ray_sym_elem_size(base, base_attrs);
+    char* out = (char*)ray_data(dst_col);
+    memset(out, 0, (size_t)count * esz);  /* rows skipped below stay zero */
+
+    /* bounds[k] = total rows in segments 0..k (NULL segments add none) */
+    int64_t bounds[n_segs];
+    int64_t total = 0;
+    for (int64_t k = 0; k < n_segs; k++) {
+        if (segs[k]) total += segs[k]->len;
+        bounds[k] = total;
+    }
+
+    /* The cursor only moves forward, relying on match_idx being ascending */
+    int64_t cur = 0;
+    for (int64_t j = 0; j < count; j++) {
+        const int64_t row = match_idx[j];
+        while (cur < n_segs - 1 && row >= bounds[cur]) cur++;
+        ray_t* part = segs[cur];
+        if (!part || !parted_seg_esz_ok(part, base, esz)) continue;  /* NULL or width mismatch */
+        const int64_t first = cur ? bounds[cur - 1] : 0;
+        const int64_t local = row - first;
+        const char* src = (const char*)ray_data(part);
+        memcpy(out + j * esz, src + local * esz, esz);
+        if ((part->attrs & RAY_ATTR_HAS_NULLS) && ray_vec_is_null(part, local)) {
+            ray_vec_set_null(dst_col, j, true);
+        }
+    }
+}
+
+/* Keep the elements of `input` whose predicate byte is non-zero.
+ * `pass_count` is the output length the caller counted from `pred`. */
+static ray_t* exec_filter_vec(ray_t* input, ray_t* pred, int64_t pass_count) {
+    const uint8_t width = col_esz(input);
+    ray_t* out = col_vec_new(input, pass_count);
+    if (!out || RAY_IS_ERR(out)) return out;
+    out->len = pass_count;
+
+    ray_morsel_t m_in, m_pred;
+    ray_morsel_init(&m_in, input);
+    ray_morsel_init(&m_pred, pred);
+    if (input->len != pred->len) {
+        ray_release(out);
+        return ray_error("length", NULL);
+    }
+
+    char* wr = (char*)ray_data(out);  /* next free output slot */
+    while (ray_morsel_next(&m_in) && ray_morsel_next(&m_pred)) {
+        const uint8_t* keep = (const uint8_t*)m_pred.morsel_ptr;
+        const char* rd = (const char*)m_in.morsel_ptr;
+        for (int64_t k = 0; k < m_in.morsel_len; k++, rd += width) {
+            if (!keep[k]) continue;
+            memcpy(wr, rd, width);
+            wr += width;
+        }
+    }
+
+    col_propagate_str_pool(out, input);
+    col_propagate_nulls_filter(out, input, (const uint8_t*)ray_data(pred), input->len);
+    return out;
+}
+
+/* Sequentially filter a parted column: segments are visited in order and each
+ * consumes the next seg->len predicate bytes (NULL segments consume none). */
+static ray_t* exec_filter_parted_vec(ray_t* parted_col, ray_t* pred,
+                                     int64_t pass_count) {
+    const int8_t base = (int8_t)RAY_PARTED_BASETYPE(parted_col->type);
+    ray_t** segs = (ray_t**)ray_data(parted_col);
+    const int64_t n_segs = parted_col->len;
+    const uint8_t* keep = (const uint8_t*)ray_data(pred);  /* predicate bytes of the current segment */
+
+    if (base == RAY_STR) {
+        /* Strings are re-appended one by one (deep copy) because segments
+         * may each carry their own pool */
+        ray_t* out = ray_vec_new(RAY_STR, pass_count);
+        if (!out || RAY_IS_ERR(out)) return out;
+        for (int64_t s = 0; s < n_segs; s++) {
+            ray_t* part = segs[s];
+            if (!part) continue;
+            const int64_t rows = part->len;
+            /* pool_base stays NULL when the segment has no string pool */
+            const char* pool_base = NULL;
+            if (part->str_pool) pool_base = (const char*)ray_data(part->str_pool);
+            for (int64_t r = 0; r < rows; r++) {
+                if (!keep[r]) continue;
+                out = parted_str_append_elem(out, part, r, pool_base);
+                if (RAY_IS_ERR(out)) return out;
+            }
+            keep += rows;
+        }
+        return out;
+    }
+
+    /* Fixed-width element types; SYM columns take attrs from parted_first_attrs() */
+    uint8_t base_attrs = 0;
+    if (base == RAY_SYM) base_attrs = parted_first_attrs(segs, n_segs);
+    const uint8_t esz = ray_sym_elem_size(base, base_attrs);
+    ray_t* out = typed_vec_new(base, base_attrs, pass_count);
+    if (!out || RAY_IS_ERR(out)) return out;
+    out->len = pass_count;
+
+    int64_t n_out = 0;  /* rows written so far */
+    for (int64_t s = 0; s < n_segs; s++) {
+        ray_t* part = segs[s];
+        if (!part) continue;
+        const int64_t rows = part->len;
+        if (!parted_seg_esz_ok(part, base, esz)) {
+            /* Segment failed the width check: selected rows become zeroed slots */
+            char* dst = (char*)ray_data(out);
+            for (int64_t r = 0; r < rows; r++) {
+                if (!keep[r]) continue;
+                memset(dst + n_out * esz, 0, esz);
+                n_out++;
+            }
+        } else {
+            const char* src = (const char*)ray_data(part);
+            char* dst = (char*)ray_data(out);
+            const bool has_nulls = (part->attrs & RAY_ATTR_HAS_NULLS) != 0;
+            for (int64_t r = 0; r < rows; r++) {
+                if (!keep[r]) continue;
+                memcpy(dst + n_out * esz, src + r * esz, esz);
+                if (has_nulls && ray_vec_is_null(part, r))
+                    ray_vec_set_null(out, n_out, true);
+                n_out++;
+            }
+        }
+        keep += rows;
+    }
+    return out;
+}
+
+/* Filter a table one column at a time.  exec_filter uses this for small or
+ * very wide tables and when its match-index scratch buffer is unavailable. */
+static ray_t* exec_filter_seq(ray_t* input, ray_t* pred, int64_t ncols,
+                             int64_t pass_count) {
+    ray_t* out = ray_table_new(ncols);
+    if (!out || RAY_IS_ERR(out)) return out;
+
+    for (int64_t c = 0; c < ncols; c++) {
+        ray_t* src = ray_table_get_col_idx(input, c);
+        if (!src || RAY_IS_ERR(src)) continue;  /* missing/error column is left out */
+        const int64_t name_id = ray_table_col_name(input, c);
+
+        /* Pick the filter routine that matches the column's storage */
+        ray_t* kept;
+        if (src->type == RAY_MAPCOMMON)    kept = materialize_mapcommon_filter(src, pred, pass_count);
+        else if (RAY_IS_PARTED(src->type)) kept = exec_filter_parted_vec(src, pred, pass_count);
+        else                               kept = exec_filter_vec(src, pred, pass_count);
+
+        if (!kept || RAY_IS_ERR(kept)) {
+            ray_release(out);
+            return kept;
+        }
+        out = ray_table_add_col(out, name_id, kept);
+        ray_release(kept);
+    }
+    return out;
+}
+
+ray_t* exec_filter(ray_graph_t* g, ray_op_t* op, ray_t* input, ray_t* pred) {
+    (void)g;
+    (void)op;
+    if (!input || RAY_IS_ERR(input)) return input;
+    if (!pred || RAY_IS_ERR(pred)) return pred;
+
+    /* Count passing elements — single sequential scan over predicate */
+    int64_t pass_count = 0;
+    {
+        ray_morsel_t mp;
+        ray_morsel_init(&mp, pred);
+        while (ray_morsel_next(&mp)) {
+            uint8_t* bits = (uint8_t*)mp.morsel_ptr;
+            for (int64_t i = 0; i < mp.morsel_len; i++)
+                if (bits[i]) pass_count++;
+        }
+    }
+
+    /* Vector filter — single column, use sequential path */
+    if (input->type != RAY_TABLE)
+        return exec_filter_vec(input, pred, pass_count);
+
+    /* table filter: parallel gather using compact match index */
+    int64_t ncols = ray_table_ncols(input);
+    int64_t nrows = ray_table_nrows(input);
+
+    /* Fall back to sequential for tiny inputs or degenerate tables */
+    if (nrows <= RAY_PARALLEL_THRESHOLD || ncols <= 0)
+        return exec_filter_seq(input, pred, ncols, pass_count);
+
+    /* VLA guard: cap at 256 columns for stack safety (256*16 = 4KB).
+     * Wider tables fall back to sequential filter. */
+    if (ncols > 256) return exec_filter_seq(input, pred, ncols, pass_count);
+
+    /* Build match_idx: match_idx[j] = row of j-th matching element */
+    ray_t* idx_hdr = NULL;
+    int64_t* match_idx = (int64_t*)scratch_alloc(&idx_hdr,
+                                   (size_t)pass_count * sizeof(int64_t));
+    if (!match_idx)
+        return exec_filter_seq(input, pred, ncols, pass_count);
+
+    {
+        int64_t j = 0;
+        ray_morsel_t mp;
+        ray_morsel_init(&mp, pred);
+        int64_t row_base = 0;
+        while (ray_morsel_next(&mp)) {
+            uint8_t* bits = (uint8_t*)mp.morsel_ptr;
+            for (int64_t i = 0; i < mp.morsel_len; i++)
+                if (bits[i]) match_idx[j++] = row_base + i;
+            row_base += mp.morsel_len;
+        }
+    }
+
+    /* Parallel gather — same pattern as sort gather */
+    ray_pool_t* pool = ray_pool_get();
+    ray_t* tbl = ray_table_new(ncols);
+    if (!tbl || RAY_IS_ERR(tbl)) { scratch_free(idx_hdr); return tbl; }
+
+    /* Pre-allocate output columns */
+    ray_t* new_cols[ncols];
+    int64_t col_names[ncols];
+    int64_t valid_ncols = 0;
+
+    bool has_parted_cols = false;
+    for (int64_t c = 0; c < ncols; c++) {
+        ray_t* col = ray_table_get_col_idx(input, c);
+        col_names[c] = ray_table_col_name(input, c);
+        if (!col || RAY_IS_ERR(col)) { new_cols[c] = NULL; continue; }
+        if (col->type == RAY_MAPCOMMON) {
+            /* Materialize MAPCOMMON through filter predicate */
+            new_cols[c] = materialize_mapcommon_filter(col, pred, pass_count);
+            if (new_cols[c] && !RAY_IS_ERR(new_cols[c])) valid_ncols++;
+            else new_cols[c] = NULL;
+            continue;
+        }
+        int8_t out_type = RAY_IS_PARTED(col->type)
+                        ? (int8_t)RAY_PARTED_BASETYPE(col->type)
+                        : col->type;
+        uint8_t out_attrs = 0;
+        if (out_type == RAY_SYM) {
+            if (RAY_IS_PARTED(col->type)) {
+                ray_t** sp = (ray_t**)ray_data(col);
+                out_attrs = parted_first_attrs(sp, col->len);
+            } else {
+                out_attrs = col->attrs;
+            }
+        }
+        if (RAY_IS_PARTED(col->type)) has_parted_cols = true;
+        ray_t* nc = typed_vec_new(out_type, out_attrs, pass_count);
+        if (!nc || RAY_IS_ERR(nc)) { new_cols[c] = NULL; continue; }
+        nc->len = pass_count;
+        new_cols[c] = nc;
+        valid_ncols++;
+    }
+
+    if (has_parted_cols) {
+        /* Parted-aware gather: use parted_gather_col for parted columns,
+         * sequential flat gather for non-parted columns */
+        for (int64_t c = 0; c < ncols; c++) {
+            ray_t* col = ray_table_get_col_idx(input, c);
+            if (!col || !new_cols[c]) continue;
+            if (col->type == RAY_MAPCOMMON) continue; /* already materialized */
+            if (RAY_IS_PARTED(col->type)) {
+                int8_t pbase = (int8_t)RAY_PARTED_BASETYPE(col->type);
+                if (pbase == RAY_STR) {
+                    ray_t** psegs = (ray_t**)ray_data(col);
+                    ray_release(new_cols[c]);
+                    new_cols[c] = parted_gather_str_rows(psegs, col->len,
+                                                         match_idx, pass_count);
+                } else {
+                    parted_gather_col(col, match_idx, pass_count, new_cols[c]);
+                }
+            } else {
+                uint8_t esz = col_esz(col);
+                char* src = (char*)ray_data(col);
+                char* dst = (char*)ray_data(new_cols[c]);
+                for (int64_t i = 0; i < pass_count; i++)
+                    memcpy(dst + i * esz, src + match_idx[i] * esz, esz);
+            }
+        }
+    } else if (pool && valid_ncols > 0 && valid_ncols <= MGATHER_MAX_COLS) {
+        /* Fused multi-column gather */
+        multi_gather_ctx_t mgctx = { .idx = match_idx, .ncols = 0 };
+        for (int64_t c = 0; c < ncols; c++) {
+            if (!new_cols[c]) continue;
+            ray_t* col = ray_table_get_col_idx(input, c);
+            if (col && col->type == RAY_MAPCOMMON) continue; /* already materialized */
+            int64_t ci = mgctx.ncols;
+            mgctx.srcs[ci] = (char*)ray_data(col);
+            mgctx.dsts[ci] = (char*)ray_data(new_cols[c]);
+            mgctx.esz[ci]  = col_esz(col);
+            mgctx.ncols++;
+        }
+        ray_pool_dispatch(pool, multi_gather_fn, &mgctx, pass_count);
+    } else if (pool) {
+        /* Per-column parallel gather */
+        for (int64_t c = 0; c < ncols; c++) {
+            ray_t* col = ray_table_get_col_idx(input, c);
+            if (!col || !new_cols[c]) continue;
+            gather_ctx_t gctx = {
+                .idx = match_idx, .src_col = col, .dst_col = new_cols[c],
+                .esz = col_esz(col), .nullable = false,
+            };
+            ray_pool_dispatch(pool, gather_fn, &gctx, pass_count);
+        }
+    } else {
+        /* Sequential gather with index */
+        for (int64_t c = 0; c < ncols; c++) {
+            ray_t* col = ray_table_get_col_idx(input, c);
+            if (!col || !new_cols[c]) continue;
+            uint8_t esz = col_esz(col);
+            char* src = (char*)ray_data(col);
+            char* dst = (char*)ray_data(new_cols[c]);
+            for (int64_t i = 0; i < pass_count; i++)
+                memcpy(dst + i * esz, src + match_idx[i] * esz, esz);
+        }
+    }
+
+    /* Propagate str_pool for the columns gathered by index above: flat
+     * columns and non-STR parted columns.  STR parted columns are skipped
+     * because they were already deep-copied with their own pool. */
+    for (int64_t c = 0; c < ncols; c++) {
+        if (!new_cols[c]) continue;
+        ray_t* col = ray_table_get_col_idx(input, c);
+        if (!col) continue;
+        if (RAY_IS_PARTED(col->type)) {
+            int8_t pb = (int8_t)RAY_PARTED_BASETYPE(col->type);
+            if (pb != RAY_STR) {
+                ray_t** sp = (ray_t**)ray_data(col);
+                col_propagate_str_pool_parted(new_cols[c], sp, col->len);
+            }
+        } else {
+            col_propagate_str_pool(new_cols[c], col);
+        }
+    }
+
+    for (int64_t c = 0; c < ncols; c++) {
+        if (!new_cols[c]) continue;
+        tbl = ray_table_add_col(tbl, col_names[c], new_cols[c]);
+        ray_release(new_cols[c]);
+    }
+
+    scratch_free(idx_hdr);
+    return tbl;
+}
+
+/* ============================================================================
+ * exec_filter_head — filter table, keeping only the first `limit` matches
+ *
+ * Scans `pred` (RAY_BOOL) morsel by morsel, recording matching row indices
+ * and stopping once `limit` are found; only those rows are gathered.
+ * Returns a new table; `input` or `pred` is handed back as-is when either is
+ * NULL/an error or has the wrong type; "limit"/"oom" errors are possible.
+ * ============================================================================ */
+ray_t* exec_filter_head(ray_t* input, ray_t* pred, int64_t limit) {
+    if (!input || RAY_IS_ERR(input)) return input;
+    if (!pred || RAY_IS_ERR(pred)) return pred;
+    if (input->type != RAY_TABLE || pred->type != RAY_BOOL) return input;
+
+    int64_t ncols = ray_table_ncols(input);
+    int64_t nrows = ray_table_nrows(input);
+    if (limit <= 0 || ncols <= 0) return ray_table_new(0);
+    if (limit > nrows) limit = nrows;
+
+    /* Column cap: the same 256-column limit the VLA-based gathers enforce */
+    if (ncols > 256) return ray_error("limit", "table exceeds 256 columns");
+
+    /* Collect up to `limit` matching row indices, stopping early */
+    ray_t* idx_hdr = NULL;
+    int64_t* match_idx = (int64_t*)scratch_alloc(&idx_hdr,
+                                    (size_t)limit * sizeof(int64_t));
+    if (!match_idx) return ray_error("oom", NULL);
+
+    int64_t found = 0;
+    {
+        ray_morsel_t mp;
+        ray_morsel_init(&mp, pred);
+        int64_t row_base = 0;
+        while (ray_morsel_next(&mp) && found < limit) {
+            uint8_t* bits = (uint8_t*)mp.morsel_ptr;
+            for (int64_t i = 0; i < mp.morsel_len && found < limit; i++)
+                if (bits[i]) match_idx[found++] = row_base + i;
+            row_base += mp.morsel_len;
+        }
+    }
+
+    /* Build result table with gathered rows */
+    ray_t* tbl = ray_table_new(ncols);
+    if (!tbl || RAY_IS_ERR(tbl)) { scratch_free(idx_hdr); return tbl; }
+
+    for (int64_t c = 0; c < ncols; c++) {
+        ray_t* col = ray_table_get_col_idx(input, c);
+        int64_t name_id = ray_table_col_name(input, c);
+        if (!col) continue;
+        int8_t out_type = RAY_IS_PARTED(col->type)
+                        ? (int8_t)RAY_PARTED_BASETYPE(col->type) : col->type;
+        if (out_type == RAY_MAPCOMMON) continue;
+        uint8_t out_attrs = 0;
+        if (out_type == RAY_SYM) {
+            if (RAY_IS_PARTED(col->type)) {
+                ray_t** sp = (ray_t**)ray_data(col);
+                out_attrs = parted_first_attrs(sp, col->len);
+            } else out_attrs = col->attrs;
+        }
+        uint8_t esz = ray_sym_elem_size(out_type, out_attrs);
+        ray_t* new_col = typed_vec_new(out_type, out_attrs, found);
+        if (!new_col || RAY_IS_ERR(new_col)) continue;
+        new_col->len = found;
+        char* dst = (char*)ray_data(new_col);
+        memset(dst, 0, (size_t)found * esz);
+
+        if (RAY_IS_PARTED(col->type)) {
+            ray_t** segs = (ray_t**)ray_data(col);
+            int64_t n_segs = col->len;
+            if (out_type == RAY_STR) {
+                /* Deep-copy STR to handle multi-pool segments */
+                ray_release(new_col);
+                new_col = parted_gather_str_rows(segs, n_segs, match_idx, found);
+                if (!new_col || RAY_IS_ERR(new_col)) continue;  /* gather failed: skip column */
+            } else {
+                /* Non-STR parted gather; rows no segment can serve stay zeroed */
+                int64_t seg_start = 0, cur_seg = 0;
+                int64_t cur_seg_end = (n_segs > 0 && segs[0]) ? segs[0]->len : 0;
+                for (int64_t j = 0; n_segs > 0 && j < found; j++) {
+                    int64_t r = match_idx[j];
+                    while (cur_seg < n_segs - 1 && r >= cur_seg_end) {
+                        seg_start = cur_seg_end;
+                        cur_seg++;
+                        cur_seg_end += segs[cur_seg] ? segs[cur_seg]->len : 0;
+                    }
+                    if (r >= cur_seg_end || !segs[cur_seg] ||
+                        !parted_seg_esz_ok(segs[cur_seg], out_type, esz)) continue;
+                    char* src = (char*)ray_data(segs[cur_seg]);
+                    memcpy(dst + j * esz, src + (r - seg_start) * esz, esz);
+                }
+            }
+        } else {
+            char* src = (char*)ray_data(col);
+            for (int64_t j = 0; j < found; j++)
+                memcpy(dst + j * esz, src + match_idx[j] * esz, esz);
+            col_propagate_str_pool(new_col, col);
+        }
+        tbl = ray_table_add_col(tbl, name_id, new_col);
+        ray_release(new_col);
+    }
+
+    scratch_free(idx_hdr);
+    return tbl;
+}
+
+/* ============================================================================
+ * sel_compact — materialize a table by applying a RAY_SEL bitmap
+ *
+ * Used at boundary ops (sort/join/window) that need dense contiguous data;
+ * gathers like exec_filter.  Returns the dense table or a ray_error object.
+ * ============================================================================ */
+
+ray_t* sel_compact(ray_graph_t* g, ray_t* tbl, ray_t* sel) {
+    (void)g;
+    if (!tbl || RAY_IS_ERR(tbl) || !sel) return tbl;
+
+    int64_t nrows = ray_table_nrows(tbl);
+    ray_rowsel_t* meta = ray_rowsel_meta(sel);
+
+    /* Defensive: the selection must have been built for a table
+     * with this exact row count.  Mismatch means the caller passed
+     * a stale selection — aborting here is strictly safer than
+     * silently gathering via out-of-range indices. */
+    if (meta->nrows != nrows)
+        return ray_error("domain",
+            "sel_compact: selection nrows mismatch (sel=%lld tbl=%lld)",
+            (long long)meta->nrows, (long long)nrows);
+
+    int64_t pass_count = meta->total_pass;
+
+    /* All-pass: nothing to compact.  (In practice this path is
+     * unreachable because ray_rowsel_from_pred returns NULL for
+     * all-pass; the caller skips sel_compact in that case.
+     * Handled here for safety.) */
+    if (pass_count == nrows) { ray_retain(tbl); return tbl; }
+
+    /* None-pass: return empty table with same schema */
+    if (pass_count == 0) {
+        int64_t ncols = ray_table_ncols(tbl);
+        ray_t* empty = ray_table_new(ncols);
+        if (!empty || RAY_IS_ERR(empty)) return empty;
+        for (int64_t c = 0; c < ncols; c++) {
+            ray_t* col = ray_table_get_col_idx(tbl, c);
+            if (!col) continue;
+            int8_t ct = RAY_IS_PARTED(col->type)
+                      ? (int8_t)RAY_PARTED_BASETYPE(col->type) : col->type;
+            ray_t* nc = ray_vec_new(ct, 0);
+            if (nc && !RAY_IS_ERR(nc)) {
+                nc->len = 0;
+                empty = ray_table_add_col(empty, ray_table_col_name(tbl, c), nc);
+                ray_release(nc);
+            }
+        }
+        return empty;
+    }
+
+    int64_t ncols = ray_table_ncols(tbl);
+    if (ncols <= 0) { ray_retain(tbl); return tbl; }
+
+    /* Build match_idx from bitmap */
+    ray_t* idx_hdr = NULL;
+    int64_t* match_idx = (int64_t*)scratch_alloc(&idx_hdr,
+                                       (size_t)pass_count * sizeof(int64_t));
+    if (!match_idx) return ray_error("oom", NULL);  /* never hand back uncompacted rows */
+
+    {
+        const uint8_t*  flags   = ray_rowsel_flags(sel);
+        const uint32_t* offsets = ray_rowsel_offsets(sel);
+        const uint16_t* idx     = ray_rowsel_idx(sel);
+        uint32_t n_segs = meta->n_segs;
+        int64_t j = 0;
+        for (uint32_t seg = 0; seg < n_segs; seg++) {
+            uint8_t f = flags[seg];
+            if (f == RAY_SEL_NONE) continue;
+            int64_t seg_start = (int64_t)seg * RAY_MORSEL_ELEMS;
+            int64_t seg_end = seg_start + RAY_MORSEL_ELEMS;
+            if (seg_end > nrows) seg_end = nrows;
+            if (f == RAY_SEL_ALL) {
+                for (int64_t r = seg_start; r < seg_end; r++)
+                    match_idx[j++] = r;
+            } else {
+                const uint16_t* slice = idx + offsets[seg];
+                uint32_t n = offsets[seg + 1] - offsets[seg];
+                for (uint32_t i = 0; i < n; i++)
+                    match_idx[j++] = seg_start + slice[i];
+            }
+        }
+    }
+
+    /* VLA guard: 256 cols max for stack arrays; checked before `out` exists so it cannot leak */
+    if (ncols > 256) { scratch_free(idx_hdr); return ray_error("limit", "table exceeds 256 columns"); }
+
+    /* Parallel multi-column gather (same pattern as exec_filter) */
+    ray_pool_t* pool = ray_pool_get();
+    ray_t* out = ray_table_new(ncols);
+    if (!out || RAY_IS_ERR(out)) { scratch_free(idx_hdr); return out; }
+
+    ray_t* new_cols[ncols];
+    int64_t col_names[ncols];
+    int64_t valid_ncols = 0;
+    bool has_parted = false;
+
+    for (int64_t c = 0; c < ncols; c++) {
+        ray_t* col = ray_table_get_col_idx(tbl, c);
+        col_names[c] = ray_table_col_name(tbl, c);
+        if (!col || RAY_IS_ERR(col)) { new_cols[c] = NULL; continue; }
+        if (col->type == RAY_MAPCOMMON) { new_cols[c] = NULL; continue; }
+        int8_t ct = RAY_IS_PARTED(col->type)
+                  ? (int8_t)RAY_PARTED_BASETYPE(col->type) : col->type;
+        uint8_t ca = 0;
+        if (ct == RAY_SYM) {
+            if (RAY_IS_PARTED(col->type)) {
+                ray_t** sp = (ray_t**)ray_data(col);
+                ca = parted_first_attrs(sp, col->len);
+            } else ca = col->attrs;
+        }
+        if (RAY_IS_PARTED(col->type)) has_parted = true;
+        ray_t* nc = typed_vec_new(ct, ca, pass_count);
+        if (!nc || RAY_IS_ERR(nc)) { new_cols[c] = NULL; continue; }
+        nc->len = pass_count;
+        new_cols[c] = nc;
+        valid_ncols++;
+    }
+
+    if (has_parted) {
+        for (int64_t c = 0; c < ncols; c++) {
+            ray_t* col = ray_table_get_col_idx(tbl, c);
+            if (!col || !new_cols[c]) continue;
+            if (RAY_IS_PARTED(col->type)) {
+                int8_t pbase = (int8_t)RAY_PARTED_BASETYPE(col->type);
+                if (pbase == RAY_STR) {
+                    ray_t** psegs = (ray_t**)ray_data(col);
+                    ray_release(new_cols[c]);
+                    new_cols[c] = parted_gather_str_rows(psegs, col->len,
+                                                         match_idx, pass_count);
+                } else {
+                    parted_gather_col(col, match_idx, pass_count, new_cols[c]);
+                }
+            } else {
+                uint8_t esz = col_esz(col);
+                char* src = (char*)ray_data(col);
+                char* dst = (char*)ray_data(new_cols[c]);
+                for (int64_t i = 0; i < pass_count; i++)
+                    memcpy(dst + i * esz, src + match_idx[i] * esz, esz);
+            }
+        }
+    } else if (pool && valid_ncols > 0 && valid_ncols <= MGATHER_MAX_COLS) {
+        multi_gather_ctx_t mgctx = { .idx = match_idx, .ncols = 0 };
+        for (int64_t c = 0; c < ncols; c++) {
+            if (!new_cols[c]) continue;
+            ray_t* col = ray_table_get_col_idx(tbl, c);
+            int64_t ci = mgctx.ncols;
+            mgctx.srcs[ci] = (char*)ray_data(col);
+            mgctx.dsts[ci] = (char*)ray_data(new_cols[c]);
+            mgctx.esz[ci]  = col_esz(col);
+            mgctx.ncols++;
+        }
+        ray_pool_dispatch(pool, multi_gather_fn, &mgctx, pass_count);
+    } else if (pool) {
+        for (int64_t c = 0; c < ncols; c++) {
+            ray_t* col = ray_table_get_col_idx(tbl, c);
+            if (!col || !new_cols[c]) continue;
+            gather_ctx_t gctx = {
+                .idx = match_idx, .src_col = col, .dst_col = new_cols[c],
+                .esz = col_esz(col), .nullable = false,
+            };
+            ray_pool_dispatch(pool, gather_fn, &gctx, pass_count);
+        }
+    } else {
+        for (int64_t c = 0; c < ncols; c++) {
+            ray_t* col = ray_table_get_col_idx(tbl, c);
+            if (!col || !new_cols[c]) continue;
+            uint8_t esz = col_esz(col);
+            char* src = (char*)ray_data(col);
+            char* dst = (char*)ray_data(new_cols[c]);
+            for (int64_t i = 0; i < pass_count; i++)
+                memcpy(dst + i * esz, src + match_idx[i] * esz, esz);
+        }
+    }
+
+    for (int64_t c = 0; c < ncols; c++) {
+        if (!new_cols[c]) continue;
+        ray_t* scol = ray_table_get_col_idx(tbl, c);
+        if (scol && RAY_IS_PARTED(scol->type)) {
+            int8_t pb = (int8_t)RAY_PARTED_BASETYPE(scol->type);
+            if (pb != RAY_STR) {
+                ray_t** sp = (ray_t**)ray_data(scol);
+                col_propagate_str_pool_parted(new_cols[c], sp, scol->len);
+            }
+            /* Parted null propagation handled in parted_gather_col / parted_gather_str_rows */
+        } else if (scol) {
+            col_propagate_str_pool(new_cols[c], scol);
+            col_propagate_nulls_gather(new_cols[c], scol, match_idx, pass_count);
+        }
+        out = ray_table_add_col(out, col_names[c], new_cols[c]);
+        ray_release(new_cols[c]);
+    }
+
+    scratch_free(idx_hdr);
+    return out;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/fuse.c b/crates/rayforce-sys/vendor/rayforce/src/ops/fuse.c
new file mode 100644
index 0000000..44606dd
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/fuse.c
@@ -0,0 +1,210 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "fuse.h"
+#include "mem/sys.h"
+#include <string.h>
+
+/* --------------------------------------------------------------------------
+ * Fusion pass: merge element-wise chains into single fused nodes
+ *
+ * Detection: find maximal chains of element-wise ops where each intermediate
+ * has exactly one consumer. Mark chains with OP_FLAG_FUSED.
+ *
+ * For now this is a lightweight implementation that marks fuseable chains
+ * but relies on the executor's existing per-op evaluation. A full bytecode
+ * interpreter over register slots would be added in a production version.
+ * -------------------------------------------------------------------------- */
+
+/* Element-wise opcodes occupy two contiguous ranges (see the opcode definitions
+ * in rayforce.h): unary [OP_NEG=10..OP_CAST=19], binary [OP_ADD=20..OP_MAX2=34]. */
+static bool is_elementwise(uint16_t opcode) {
+    const bool unary  = opcode >= OP_NEG && opcode <= OP_CAST;
+    const bool binary = opcode >= OP_ADD && opcode <= OP_MAX2;
+    return unary || binary;
+}
+
+/* Linear scan of g->ext_nodes for the extension record whose base.id equals
+   node_id; NULL when none matches.  O(ext_count) per call, acceptable for
+   typical graph sizes.  L2: intentionally duplicated — also present in opt.c. */
+static ray_op_ext_t* find_ext(ray_graph_t* g, uint32_t node_id) {
+    for (uint32_t slot = 0; slot < g->ext_count; slot++) {
+        ray_op_ext_t* cand = g->ext_nodes[slot];
+        if (cand && cand->base.id == node_id) return cand;
+    }
+    return NULL;
+}
+
+/* Count references to each node reachable from root (iterative, explicit stack).
+   ref_counts[id] is bumped on every visit; children are pushed on the first only. */
+static void count_refs(ray_graph_t* g, ray_op_t* root, uint32_t* ref_counts) {
+    if (!root) return;
+
+    uint32_t nc = g->node_count;
+    /* M3: Overflow guard — keeps nc * 2 below from wrapping on huge graphs. */
+    if (nc > UINT32_MAX / 2) return;
+    uint32_t stack_cap = nc * 2;
+    uint32_t stack_local[256];  /* backing store when stack_cap <= 256, heap otherwise */
+    uint32_t *stack = stack_cap <= 256 ? stack_local : (uint32_t*)ray_sys_alloc(stack_cap * sizeof(uint32_t));
+    if (!stack) return;
+    int sp = 0;  /* index of the next free stack slot */
+    stack[sp++] = root->id;
+    while (sp > 0) {
+        uint32_t nid = stack[--sp];
+        ray_op_t* n = &g->nodes[nid];
+        ref_counts[nid]++;
+        if (ref_counts[nid] > 1) continue;  /* already counted children */
+        for (int i = 0; i < n->arity && i < 2; i++) {
+            if (n->inputs[i] && sp < (int)stack_cap)  /* a full stack silently skips the push */
+                stack[sp++] = n->inputs[i]->id;
+        }
+        /* M11: 3-input ops (OP_IF, OP_SUBSTR, OP_REPLACE) store the third
+           operand node ID as (uintptr_t)ext->literal. */
+        if (n->opcode == OP_IF || n->opcode == OP_SUBSTR || n->opcode == OP_REPLACE) {
+            ray_op_ext_t* ext = find_ext(g, nid);
+            if (ext) {
+                uint32_t third_id = (uint32_t)(uintptr_t)ext->literal;
+                if (third_id < nc && sp < (int)stack_cap)
+                    stack[sp++] = third_id;
+            }
+        }
+        /* M11: OP_CONCAT stores extra arg IDs (beyond inputs[0..1]) as
+           uint32_t values in trailing bytes after the ext node.
+           ext->sym holds the total arg count. */
+        if (n->opcode == OP_CONCAT) {
+            ray_op_ext_t* ext = find_ext(g, nid);
+            /* M4: Guard against ext->sym < 2 — trailing uint32_t values
+               only exist when there are more than 2 arguments. */
+            if (ext && ext->sym >= 2) {
+                int n_args = (int)ext->sym;
+                uint32_t* trail = (uint32_t*)((char*)(ext + 1));
+                for (int i = 2; i < n_args; i++) {
+                    uint32_t arg_id = trail[i - 2];
+                    if (arg_id < nc && sp < (int)stack_cap)
+                        stack[sp++] = arg_id;
+                }
+            }
+        }
+        /* H2: Count refs for ext node children (GROUP keys/aggs,
+           SORT/SELECT columns, JOIN keys, WINDOW inputs)
+           so fusion ref counts are accurate. */
+        if (n->opcode == OP_GROUP || n->opcode == OP_SORT ||
+            n->opcode == OP_JOIN  || n->opcode == OP_WINDOW_JOIN ||
+            n->opcode == OP_WINDOW ||
+            n->opcode == OP_SELECT) {
+            ray_op_ext_t* ext = find_ext(g, nid);
+            if (ext) {
+                switch (n->opcode) {
+                    case OP_GROUP:
+                        for (uint8_t k = 0; k < ext->n_keys; k++) {
+                            if (ext->keys[k] && sp < (int)stack_cap)
+                                stack[sp++] = ext->keys[k]->id;
+                        }
+                        for (uint8_t a = 0; a < ext->n_aggs; a++) {
+                            if (ext->agg_ins[a] && sp < (int)stack_cap)
+                                stack[sp++] = ext->agg_ins[a]->id;
+                        }
+                        break;
+                    case OP_SORT:
+                    case OP_SELECT:
+                        for (uint8_t k = 0; k < ext->sort.n_cols; k++) {
+                            if (ext->sort.columns[k] && sp < (int)stack_cap)
+                                stack[sp++] = ext->sort.columns[k]->id;
+                        }
+                        break;
+                    case OP_JOIN:
+                        for (uint8_t k = 0; k < ext->join.n_join_keys; k++) {
+                            if (ext->join.left_keys[k] && sp < (int)stack_cap)
+                                stack[sp++] = ext->join.left_keys[k]->id;
+                            if (ext->join.right_keys && ext->join.right_keys[k] && sp < (int)stack_cap)
+                                stack[sp++] = ext->join.right_keys[k]->id;
+                        }
+                        break;
+                    case OP_WINDOW_JOIN:
+                        if (ext->asof.time_key && sp < (int)stack_cap)
+                            stack[sp++] = ext->asof.time_key->id;
+                        for (uint8_t k = 0; k < ext->asof.n_eq_keys; k++) {
+                            if (ext->asof.eq_keys[k] && sp < (int)stack_cap)
+                                stack[sp++] = ext->asof.eq_keys[k]->id;
+                        }
+                        break;
+                    case OP_WINDOW:
+                        for (uint8_t k = 0; k < ext->window.n_part_keys; k++) {
+                            if (ext->window.part_keys[k] && sp < (int)stack_cap)
+                                stack[sp++] = ext->window.part_keys[k]->id;
+                        }
+                        for (uint8_t k = 0; k < ext->window.n_order_keys; k++) {
+                            if (ext->window.order_keys[k] && sp < (int)stack_cap)
+                                stack[sp++] = ext->window.order_keys[k]->id;
+                        }
+                        for (uint8_t f = 0; f < ext->window.n_funcs; f++) {
+                            if (ext->window.func_inputs[f] && sp < (int)stack_cap)
+                                stack[sp++] = ext->window.func_inputs[f]->id;
+                        }
+                        break;
+                    default:
+                        break;
+                }
+            }
+        }
+    }
+    if (stack_cap > 256) ray_sys_free(stack);  /* only the heap-backed stack is freed */
+}
+
+/* Fusion marking pass.  Counts consumers of every node reachable from `root`,
+   then sets OP_FLAG_FUSED on each live element-wise node that has at least one
+   element-wise input (inputs[0..1]) with a reference count of exactly 1.
+   No-op when g or root is NULL, the graph is empty, or the counter alloc fails. */
+void ray_fuse_pass(ray_graph_t* g, ray_op_t* root) {
+    if (!g || !root || g->node_count == 0) return;
+
+    /* Counter array: stack storage up to 256 nodes, ray_sys_alloc beyond. */
+    const uint32_t n_nodes = g->node_count;
+    const bool heap_backed = n_nodes > 256;
+    uint32_t small_counts[256];
+    uint32_t* consumers = small_counts;
+    if (heap_backed) {
+        consumers = (uint32_t*)ray_sys_alloc(n_nodes * sizeof(uint32_t));
+        if (!consumers) return;
+    }
+    memset(consumers, 0, n_nodes * sizeof(uint32_t));
+
+    count_refs(g, root, consumers);
+
+    for (uint32_t id = 0; id < n_nodes; id++) {
+        ray_op_t* node = &g->nodes[id];
+        if (node->flags & OP_FLAG_DEAD) continue;
+        if (!is_elementwise(node->opcode)) continue;
+
+        /* One qualifying producer is enough to mark this node. */
+        for (int k = 0; k < node->arity && k < 2; k++) {
+            ray_op_t* producer = node->inputs[k];
+            if (!producer || !is_elementwise(producer->opcode)) continue;
+            if (consumers[producer->id] != 1) continue;
+            node->flags |= OP_FLAG_FUSED;
+            break;
+        }
+    }
+
+    if (heap_backed) ray_sys_free(consumers);
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/fuse.h b/crates/rayforce-sys/vendor/rayforce/src/ops/fuse.h
new file mode 100644
index 0000000..779bc16
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/fuse.h
@@ -0,0 +1,29 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_FUSE_H
+#define RAY_FUSE_H
+
+#include "ops.h"
+
+#endif /* RAY_FUSE_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/fvec.c b/crates/rayforce-sys/vendor/rayforce/src/ops/fvec.c
new file mode 100644
index 0000000..b5be031
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/fvec.c
@@ -0,0 +1,101 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "fvec.h"
+#include "mem/sys.h"
+#include "table/sym.h"
+#include <string.h>
+#include <stdio.h>
+
+/* Allocate a zero-filled factorized table with room for n_cols column slots.
+   Returns NULL if either ray_sys_alloc call fails (the table is freed first). */
+ray_ftable_t* ray_ftable_new(uint16_t n_cols) {
+    const size_t slots_bytes = (size_t)n_cols * sizeof(ray_fvec_t);
+    ray_ftable_t* table = (ray_ftable_t*)ray_sys_alloc(sizeof(ray_ftable_t));
+    if (!table) return NULL;
+    memset(table, 0, sizeof(ray_ftable_t));
+
+    ray_fvec_t* slots = (ray_fvec_t*)ray_sys_alloc(slots_bytes);
+    if (!slots) { ray_sys_free(table); return NULL; }
+    memset(slots, 0, slots_bytes);
+    table->columns = slots;
+    table->n_cols = n_cols;
+    return table;
+}
+
+/* Free a factorized table (NULL is accepted): release each column vector and the
+   semijoin bitmap, then free the column array and the table struct. */
+void ray_ftable_free(ray_ftable_t* ft) {
+    if (!ft) return;
+    ray_fvec_t* slots = ft->columns;
+    for (uint16_t c = 0; slots && c < ft->n_cols; c++) {
+        if (slots[c].vec) ray_release(slots[c].vec);
+    }
+    if (slots) ray_sys_free(slots);
+    if (ft->semijoin) ray_release(ft->semijoin);
+    ray_sys_free(ft);
+}
+
+ray_t* ray_ftable_materialize(ray_ftable_t* ft) {
+    if (!ft || ft->n_cols == 0) return ray_error("type", NULL);
+    /* Build a dense table with one "_c<slot>" column per slot that has a vector. */
+    ray_t* tbl = ray_table_new(ft->n_cols);
+    if (!tbl || RAY_IS_ERR(tbl)) return tbl;
+
+    for (uint16_t c = 0; c < ft->n_cols; c++) {
+        ray_fvec_t* fv = &ft->columns[c];
+        if (!fv->vec) continue;
+
+        ray_t* col;
+        if (fv->cur_idx >= 0) {
+            /* Flat: replicate the single value at cur_idx `cardinality` times */
+            if (fv->cardinality <= 0) { ray_release(tbl); return ray_error("range", NULL); }
+            col = ray_vec_new(fv->vec->type, fv->cardinality);
+            if (!col || RAY_IS_ERR(col)) { ray_release(tbl); return col ? col : ray_error("oom", NULL); }
+            col->len = fv->cardinality;
+            void* val = ray_vec_get(fv->vec, fv->cur_idx);
+            if (!val) { ray_release(col); ray_release(tbl); return ray_error("range", NULL); }
+            uint8_t esz = ray_sym_elem_size(fv->vec->type, fv->vec->attrs);  /* NOTE(review): col was created without these attrs — confirm widths agree for SYM */
+            char* dst = (char*)ray_data(col);
+            for (int64_t r = 0; r < fv->cardinality; r++)
+                memcpy(dst + r * esz, val, esz);
+        } else {
+            /* Unflat: share the existing vector, taking one extra reference */
+            col = fv->vec;
+            ray_retain(col);
+        }
+        /* Name is "_c" + slot index; col is released right after the add (presumably the table holds its own reference). */
+        char name_buf[12];
+        int n = snprintf(name_buf, sizeof(name_buf), "_c%d", c);
+        int64_t name_id = ray_sym_intern(name_buf, (size_t)n);
+        ray_t* new_tbl = ray_table_add_col(tbl, name_id, col);
+        ray_release(col);
+        if (!new_tbl || RAY_IS_ERR(new_tbl)) {
+            if (new_tbl != tbl) ray_release(tbl);  /* drop the partial table unless it is the value being returned */
+            return new_tbl ? new_tbl : ray_error("oom", NULL);
+        }
+        tbl = new_tbl;
+    }
+
+    return tbl;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/fvec.h b/crates/rayforce-sys/vendor/rayforce/src/ops/fvec.h
new file mode 100644
index 0000000..cbd1a66
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/fvec.h
@@ -0,0 +1,52 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_FVEC_H
+#define RAY_FVEC_H
+
+#include "ops.h"
+
+/* Factorization state -- pipeline concept, NOT added to ray_t.
+ *
+ * Lives in the pipeline context; ray_t itself remains unchanged.  A slot whose
+ * vec is NULL is skipped by ray_ftable_materialize and ray_ftable_free. */
+typedef struct ray_fvec {
+    ray_t*    vec;            /* underlying ray_t vector (I64, SYM, etc.) */
+    int64_t  cur_idx;        /* >= 0: flat (single value at index)      */
+                             /* -1: unflat (full vector is active)      */
+    int64_t  cardinality;    /* for flat: how many rows this represents */
+} ray_fvec_t;
+
+/* Factorized Table -- accumulation buffer for ASP-Join (see ray_ftable_new / ray_ftable_free) */
+typedef struct ray_ftable {
+    ray_fvec_t*  columns;     /* array of factorized vectors   */
+    uint16_t    n_cols;      /* number of slots in columns    */
+    int64_t     n_tuples;    /* factorized tuple count        */
+    ray_t*       semijoin;    /* RAY_SEL bitmap of qualifying keys */
+} ray_ftable_t;
+
+ray_ftable_t* ray_ftable_new(uint16_t n_cols);
+void         ray_ftable_free(ray_ftable_t* ft);
+ray_t*        ray_ftable_materialize(ray_ftable_t* ft);
+
+#endif /* RAY_FVEC_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/glob.c b/crates/rayforce-sys/vendor/rayforce/src/ops/glob.c
new file mode 100644
index 0000000..dea37d1
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/glob.c
@@ -0,0 +1,102 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ */
+
+/*
+ * Iterative glob matcher.  Replaces three pre-existing implementations
+ * that diverged in syntax (eval used *,?,[abc]; DAG used SQL %,_) and
+ * one of which (strop.c::str_glob) blew up exponentially on patterns
+ * like "a*a*a*…a*b" against an a-only string.  This single file is
+ * the only matcher; both call sites delegate here.
+ */
+
+#include "ops/glob.h"
+
+/* Lowercase an ASCII byte; non-ASCII passes through unchanged. */
+static inline char to_lower(char c) {
+    return (c >= 'A' && c <= 'Z') ? (char)(c + 32) : c;
+}
+
+/* Match a single character against a class `[ ... ]`.  On entry *pi
+ * points at the byte after `[`.  On return *pi points one past `]`.
+ * Recognises `[abc]`, `[a-z]`, leading `!` for negation, embedded
+ * `]` is allowed as the first char (after optional `!`). */
+static bool match_class(const char* p, size_t pn, size_t* pi, char c, bool ci) {
+    size_t i = *pi;
+    bool neg = false;
+    if (i < pn && p[i] == '!') { neg = true; i++; }
+    bool matched = false;
+    bool first = true;
+    char ch = ci ? to_lower(c) : c;
+    while (i < pn && (first || p[i] != ']')) {
+        char lo = ci ? to_lower(p[i]) : p[i];
+        if (i + 2 < pn && p[i + 1] == '-' && p[i + 2] != ']') {
+            char hi = ci ? to_lower(p[i + 2]) : p[i + 2];
+            if (ch >= lo && ch <= hi) matched = true;
+            i += 3;
+        } else {
+            if (ch == lo) matched = true;
+            i++;
+        }
+        first = false;
+    }
+    if (i < pn && p[i] == ']') i++;  /* consume closing bracket */
+    *pi = i;
+    return neg ? !matched : matched;
+}
+
+static bool glob_impl(const char* s, size_t sn,
+                     const char* p, size_t pn, bool ci) {
+    size_t si = 0, pi = 0;                      /* cursors into input s and pattern p */
+    size_t star_pi = (size_t)-1, star_si = 0;   /* last '*' seen and its retry point; (size_t)-1 = none */
+    /* Core matcher behind both public entry points; ci folds ASCII case on both sides. */
+    while (si < sn) {
+        if (pi < pn && p[pi] == '*') {
+            star_pi = pi++;        /* remember star, skip it */
+            star_si = si;          /* the star starts out matching zero bytes */
+        } else if (pi < pn && p[pi] == '?') {
+            pi++;
+            si++;
+        } else if (pi < pn && p[pi] == '[') {
+            size_t cls_pi = pi + 1;  /* first byte after '[' */
+            if (match_class(p, pn, &cls_pi, s[si], ci)) {
+                pi = cls_pi;         /* continue after the class */
+                si++;
+            } else if (star_pi != (size_t)-1) {
+                pi = star_pi + 1;    /* backtrack: restart the pattern just after the star ... */
+                si = ++star_si;      /* ... and let the star absorb one more input byte */
+            } else {
+                return false;
+            }
+        } else if (pi < pn) {        /* ordinary pattern byte: compare literally */
+            char a = ci ? to_lower(s[si]) : s[si];
+            char b = ci ? to_lower(p[pi]) : p[pi];
+            if (a == b) {
+                pi++;
+                si++;
+            } else if (star_pi != (size_t)-1) {
+                pi = star_pi + 1;
+                si = ++star_si;
+            } else {
+                return false;
+            }
+        } else if (star_pi != (size_t)-1) {  /* pattern exhausted, input remains: retry via star */
+            pi = star_pi + 1;
+            si = ++star_si;
+        } else {
+            return false;
+        }
+    }
+    /* Consumed all of input — pattern must be at end, modulo trailing stars. */
+    while (pi < pn && p[pi] == '*') pi++;
+    return pi == pn;
+}
+
+bool ray_glob_match(const char* s, size_t sn, const char* p, size_t pn) {
+    return glob_impl(s, sn, p, pn, false);
+}
+
+bool ray_glob_match_ci(const char* s, size_t sn, const char* p, size_t pn) {
+    return glob_impl(s, sn, p, pn, true);
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/glob.h b/crates/rayforce-sys/vendor/rayforce/src/ops/glob.h
new file mode 100644
index 0000000..71bc3a2
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/glob.h
@@ -0,0 +1,43 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ */
+
+#ifndef RAY_OPS_GLOB_H
+#define RAY_OPS_GLOB_H
+
+#include <stdbool.h>
+#include <stddef.h>
+
+/* Glob pattern match, iterative two-pointer (no catastrophic backtracking).
+ * Worst case O(n*m); typical case linear.
+ *
+ * Supported metacharacters:
+ *   *        — matches zero or more characters
+ *   ?        — matches exactly one character
+ *   [abc]    — character class: matches any of a, b, c
+ *   [a-z]    — range
+ *   [!abc]   — negated class
+ *
+ * Matching a literal metacharacter — there is no backslash escape; wrap
+ * the character in a one-element class instead:
+ *   [*]      matches a literal '*'
+ *   [?]      matches a literal '?'
+ *   [[]      matches a literal '['
+ *   []]      matches a literal ']'  (']' as first char inside [...] is literal)
+ *   [-]      matches a literal '-'  (as the sole char, no range to form)
+ *
+ * `ray_glob_match` is case-sensitive.  `ray_glob_match_ci` lowercases ASCII
+ * letters on both sides before comparing (so it matches 'A' against 'a', an
+ * 'A-Z' range matches both case forms, etc.).
+ *
+ * Lenient parsing policy: an unterminated character class (e.g. pattern
+ * "abc[def" with no closing `]`) is accepted — the class consumes input
+ * up to the end of the pattern and the match continues with whatever
+ * `matched` flag accumulated.  This matches glibc fnmatch's permissive
+ * behaviour and avoids surprising `error: parse` mid-search.  Callers
+ * that want strict validation should pre-validate the pattern. */
+bool ray_glob_match(const char* s, size_t sn, const char* p, size_t pn);
+bool ray_glob_match_ci(const char* s, size_t sn, const char* p, size_t pn);
+
+#endif /* RAY_OPS_GLOB_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/graph.c b/crates/rayforce-sys/vendor/rayforce/src/ops/graph.c
new file mode 100644
index 0000000..3d68f46
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/graph.c
@@ -0,0 +1,1822 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "graph.h"
+#include "store/csr.h"
+#include "store/hnsw.h"
+#include "mem/sys.h"
+#include <string.h>
+
+/* --------------------------------------------------------------------------
+ * Graph allocation helpers
+ * -------------------------------------------------------------------------- */
+
+#define GRAPH_INIT_CAP 4096
+
+static inline ray_op_t* graph_fix_ptr(ray_op_t* p, ptrdiff_t delta) {
+    return p ? (ray_op_t*)((char*)p + delta) : NULL;
+}
+
+static void graph_fixup_ext_ptrs(ray_graph_t* g, ptrdiff_t delta) {
+    for (uint32_t i = 0; i < g->ext_count; i++) {
+        ray_op_ext_t* ext = g->ext_nodes[i];
+        if (!ext) continue;
+
+        ext->base.inputs[0] = graph_fix_ptr(ext->base.inputs[0], delta);
+        ext->base.inputs[1] = graph_fix_ptr(ext->base.inputs[1], delta);
+
+        switch (ext->base.opcode) {
+            case OP_SORT:
+                for (uint8_t k = 0; k < ext->sort.n_cols; k++)
+                    ext->sort.columns[k] = graph_fix_ptr(ext->sort.columns[k], delta);
+                break;
+            case OP_GROUP:
+                for (uint8_t k = 0; k < ext->n_keys; k++)
+                    ext->keys[k] = graph_fix_ptr(ext->keys[k], delta);
+                for (uint8_t a = 0; a < ext->n_aggs; a++)
+                    ext->agg_ins[a] = graph_fix_ptr(ext->agg_ins[a], delta);
+                break;
+            case OP_JOIN:
+            case OP_ANTIJOIN:
+                for (uint8_t k = 0; k < ext->join.n_join_keys; k++)
+                    ext->join.left_keys[k] = graph_fix_ptr(ext->join.left_keys[k], delta);
+                if (ext->join.right_keys) {
+                    for (uint8_t k = 0; k < ext->join.n_join_keys; k++)
+                        ext->join.right_keys[k] = graph_fix_ptr(ext->join.right_keys[k], delta);
+                }
+                break;
+            case OP_WINDOW_JOIN:
+                ext->asof.time_key = graph_fix_ptr(ext->asof.time_key, delta);
+                for (uint8_t k = 0; k < ext->asof.n_eq_keys; k++)
+                    ext->asof.eq_keys[k] = graph_fix_ptr(ext->asof.eq_keys[k], delta);
+                break;
+            case OP_WINDOW:
+                for (uint8_t k = 0; k < ext->window.n_part_keys; k++)
+                    ext->window.part_keys[k] = graph_fix_ptr(ext->window.part_keys[k], delta);
+                for (uint8_t k = 0; k < ext->window.n_order_keys; k++)
+                    ext->window.order_keys[k] = graph_fix_ptr(ext->window.order_keys[k], delta);
+                for (uint8_t f = 0; f < ext->window.n_funcs; f++)
+                    ext->window.func_inputs[f] = graph_fix_ptr(ext->window.func_inputs[f], delta);
+                break;
+            case OP_SELECT:
+                for (uint8_t k = 0; k < ext->sort.n_cols; k++)
+                    ext->sort.columns[k] = graph_fix_ptr(ext->sort.columns[k], delta);
+                break;
+            case OP_PIVOT:
+                for (uint8_t k = 0; k < ext->pivot.n_index; k++)
+                    ext->pivot.index_cols[k] = graph_fix_ptr(ext->pivot.index_cols[k], delta);
+                ext->pivot.pivot_col = graph_fix_ptr(ext->pivot.pivot_col, delta);
+                ext->pivot.value_col = graph_fix_ptr(ext->pivot.value_col, delta);
+                break;
+            /* Graph ops: no ray_op_t* pointers in ext union to fix */
+            case OP_EXPAND:
+            case OP_VAR_EXPAND:
+            case OP_SHORTEST_PATH:
+            case OP_WCO_JOIN:
+                break;
+            default:
+                break;
+        }
+    }
+}
+
+/* After realloc moves g->nodes, fix up all stored input pointers.
+   old_base is saved as uintptr_t before realloc to avoid GCC 14
+   -Wuse-after-free on the stale pointer. */
+static void graph_fixup_ptrs(ray_graph_t* g, uintptr_t old_base) {
+    ptrdiff_t delta = (ptrdiff_t)((uintptr_t)g->nodes - old_base);
+    if (delta == 0) return;
+    for (uint32_t i = 0; i < g->node_count; i++) {
+        g->nodes[i].inputs[0] = graph_fix_ptr(g->nodes[i].inputs[0], delta);
+        g->nodes[i].inputs[1] = graph_fix_ptr(g->nodes[i].inputs[1], delta);
+    }
+    graph_fixup_ext_ptrs(g, delta);
+}
+
+/* L3: node_count is uint32_t — theoretical overflow at 2^32 nodes is
+   unreachable in practice (would require ~128 GB for the nodes array). */
+static ray_op_t* graph_alloc_node(ray_graph_t* g) {
+    if (g->node_count >= g->node_cap) {
+        uintptr_t old_base = (uintptr_t)g->nodes;
+        /* H2: Overflow guard — if node_cap is already > UINT32_MAX/2,
+           doubling would wrap around to a smaller value. */
+        if (g->node_cap > UINT32_MAX / 2) return NULL;
+        uint32_t new_cap = g->node_cap * 2;
+        ray_op_t* new_nodes = (ray_op_t*)ray_sys_realloc(g->nodes,
+                                                      new_cap * sizeof(ray_op_t));
+        if (!new_nodes) return NULL;
+        g->nodes = new_nodes;
+        g->node_cap = new_cap;
+        graph_fixup_ptrs(g, old_base);
+    }
+    ray_op_t* n = &g->nodes[g->node_count];
+    memset(n, 0, sizeof(ray_op_t));
+    n->id = g->node_count;
+    g->node_count++;
+    return n;
+}
+
+static ray_op_ext_t* graph_alloc_ext_node_ex(ray_graph_t* g, size_t extra) {
+    /* Extended nodes are 64 bytes; extra bytes appended for inline arrays */
+    ray_op_ext_t* ext = (ray_op_ext_t*)ray_sys_alloc(sizeof(ray_op_ext_t) + extra);
+    if (!ext) return NULL;
+    memset(ext, 0, sizeof(ray_op_ext_t) + extra);
+
+    /* Also add a placeholder in the nodes array for ID tracking */
+    if (g->node_count >= g->node_cap) {
+        if (g->node_cap > UINT32_MAX / 2) { ray_sys_free(ext); return NULL; }
+        uintptr_t old_base = (uintptr_t)g->nodes;
+        uint32_t new_cap = g->node_cap * 2;
+        ray_op_t* new_nodes = (ray_op_t*)ray_sys_realloc(g->nodes,
+                                                      new_cap * sizeof(ray_op_t));
+        if (!new_nodes) { ray_sys_free(ext); return NULL; }
+        g->nodes = new_nodes;
+        g->node_cap = new_cap;
+        graph_fixup_ptrs(g, old_base);
+    }
+    ext->base.id = g->node_count;
+    /* H4: Do NOT copy ext->base to nodes[] here — the caller fills in
+       fields first and then syncs via g->nodes[ext->base.id] = ext->base. */
+    memset(&g->nodes[g->node_count], 0, sizeof(ray_op_t));
+    g->nodes[g->node_count].id = g->node_count;
+    g->node_count++;
+
+    /* Track ext node for cleanup */
+    if (g->ext_count >= g->ext_cap) {
+        if (g->ext_cap > UINT32_MAX / 2) { g->node_count--; ray_sys_free(ext); return NULL; }
+        uint32_t new_cap = g->ext_cap == 0 ? 16 : g->ext_cap * 2;
+        ray_op_ext_t** new_exts = (ray_op_ext_t**)ray_sys_realloc(g->ext_nodes,
+                                                               new_cap * sizeof(ray_op_ext_t*));
+        if (!new_exts) { g->node_count--; ray_sys_free(ext); return NULL; }
+        g->ext_nodes = new_exts;
+        g->ext_cap = new_cap;
+    }
+    g->ext_nodes[g->ext_count++] = ext;
+
+    return ext;
+}
+
+static ray_op_ext_t* graph_alloc_ext_node(ray_graph_t* g) {
+    return graph_alloc_ext_node_ex(g, 0);
+}
+
+/* Pointer to trailing bytes after the ext node */
+#define EXT_TRAIL(ext) ((char*)((ext) + 1))
+
+/* --------------------------------------------------------------------------
+ * ray_graph_new / ray_graph_free
+ * -------------------------------------------------------------------------- */
+
+ray_graph_t* ray_graph_new(ray_t* tbl) {
+    ray_graph_t* g = (ray_graph_t*)ray_sys_alloc(sizeof(ray_graph_t));
+    if (!g) return NULL;
+
+    g->nodes = (ray_op_t*)ray_sys_alloc(GRAPH_INIT_CAP * sizeof(ray_op_t));
+    if (!g->nodes) { ray_sys_free(g); return NULL; }
+    g->node_cap = GRAPH_INIT_CAP;
+    g->node_count = 0;
+    g->table = tbl;
+    if (tbl) ray_retain(tbl);
+
+    g->tables = NULL;
+    g->n_tables = 0;
+
+    g->ext_nodes = NULL;
+    g->ext_count = 0;
+    g->ext_cap = 0;
+    g->selection = NULL;
+
+    g->cexpr_env_top = 0;  /* compile-time lambda/let env, initially empty */
+
+    return g;
+}
+
+void ray_graph_free(ray_graph_t* g) {
+    if (!g) return;
+
+    /* M6: Release OP_CONST literal values before freeing ext nodes */
+    for (uint32_t i = 0; i < g->ext_count; i++) {
+        ray_op_ext_t* ext = g->ext_nodes[i];
+        if (ext && (g->nodes[ext->base.id].opcode == OP_CONST ||
+                    g->nodes[ext->base.id].opcode == OP_TIL) && ext->literal) {
+            ray_release(ext->literal);
+        }
+        /* Release runtime-built SIP bitmaps on graph traversal nodes */
+        if (ext) {
+            uint16_t oc = g->nodes[ext->base.id].opcode;
+            if ((oc == OP_EXPAND || oc == OP_VAR_EXPAND || oc == OP_SHORTEST_PATH)
+                && ext->graph.sip_sel) {
+                ray_release((ray_t*)ext->graph.sip_sel);
+            }
+            if (oc == OP_ASTAR && ext->graph.node_props) {
+                ray_release((ray_t*)ext->graph.node_props);
+            }
+        }
+    }
+    /* Free seg_mask bitmaps (shared across ext nodes — deduplicate) */
+    for (uint32_t i = 0; i < g->ext_count; i++) {
+        ray_op_ext_t* ext = g->ext_nodes[i];
+        if (ext && ext->seg_mask) {
+            uint64_t* mask = ext->seg_mask;
+            ext->seg_mask = NULL;
+            /* Clear same pointer from other ext nodes */
+            for (uint32_t j = i + 1; j < g->ext_count; j++) {
+                if (g->ext_nodes[j] && g->ext_nodes[j]->seg_mask == mask)
+                    g->ext_nodes[j]->seg_mask = NULL;
+            }
+            ray_sys_free(mask);
+        }
+    }
+    /* Free extended nodes */
+    for (uint32_t i = 0; i < g->ext_count; i++) {
+        ray_sys_free(g->ext_nodes[i]);
+    }
+    ray_sys_free(g->ext_nodes);
+
+    ray_sys_free(g->nodes);
+    if (g->table) ray_release(g->table);
+
+    /* Release table registry */
+    if (g->tables) {
+        for (uint16_t i = 0; i < g->n_tables; i++) {
+            if (g->tables[i]) ray_release(g->tables[i]);
+        }
+        ray_sys_free(g->tables);
+    }
+
+    if (g->selection) ray_release(g->selection);
+    ray_sys_free(g);
+}
+
+/* --------------------------------------------------------------------------
+ * Source ops
+ * -------------------------------------------------------------------------- */
+
+/* Source node that reads column `col_name` (interned to a symbol ID) from the
+ * graph's bound table.  Returns NULL if the node cannot be allocated. */
+ray_op_t* ray_scan(ray_graph_t* g, const char* col_name) {
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (!ext) return NULL;
+
+    ext->base.opcode = OP_SCAN;
+    ext->base.arity = 0;
+    int64_t sym_id = ray_sym_intern(col_name, strlen(col_name));
+    ext->sym = sym_id;
+
+    /* Infer output type from the bound table */
+    if (g->table) {
+        ray_t* col = ray_table_get_col(g->table, sym_id);
+        if (col) {
+            ext->base.out_type = col->type;
+            /* Saturate like ray_til: a plain cast wraps for len > UINT32_MAX */
+            ext->base.est_rows = (uint32_t)(col->len > UINT32_MAX ? UINT32_MAX : col->len);
+        }
+    }
+
+    g->nodes[ext->base.id] = ext->base;  /* sync the filled base into nodes[] */
+    return &g->nodes[ext->base.id];
+}
+
+ray_op_t* ray_const_f64(ray_graph_t* g, double val) {
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (!ext) return NULL;
+
+    ext->base.opcode = OP_CONST;
+    ext->base.arity = 0;
+    ext->base.out_type = RAY_F64;
+    ext->literal = ray_f64(val);
+    /* L4: null/error check on allocation result */
+    if (!ext->literal || RAY_IS_ERR(ext->literal)) ext->literal = NULL;
+
+    g->nodes[ext->base.id] = ext->base;
+    return &g->nodes[ext->base.id];
+}
+
+ray_op_t* ray_const_i64(ray_graph_t* g, int64_t val) {
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (!ext) return NULL;
+
+    ext->base.opcode = OP_CONST;
+    ext->base.arity = 0;
+    ext->base.out_type = RAY_I64;
+    ext->literal = ray_i64(val);
+    /* L4: null/error check on allocation result */
+    if (!ext->literal || RAY_IS_ERR(ext->literal)) ext->literal = NULL;
+
+    g->nodes[ext->base.id] = ext->base;
+    return &g->nodes[ext->base.id];
+}
+
+ray_op_t* ray_const_bool(ray_graph_t* g, bool val) {
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (!ext) return NULL;
+
+    ext->base.opcode = OP_CONST;
+    ext->base.arity = 0;
+    ext->base.out_type = RAY_BOOL;
+    ext->literal = ray_bool(val);
+    /* L4: null/error check on allocation result */
+    if (!ext->literal || RAY_IS_ERR(ext->literal)) ext->literal = NULL;
+
+    g->nodes[ext->base.id] = ext->base;
+    return &g->nodes[ext->base.id];
+}
+
+ray_op_t* ray_const_str(ray_graph_t* g, const char* s, size_t len) {
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (!ext) return NULL;
+
+    ext->base.opcode = OP_CONST;
+    ext->base.arity = 0;
+    ext->base.out_type = RAY_SYM;   /* string constants resolve to SYM at exec time */
+    ext->literal = ray_str(s, len);
+    /* L4: null/error check on allocation result */
+    if (!ext->literal || RAY_IS_ERR(ext->literal)) ext->literal = NULL;
+
+    g->nodes[ext->base.id] = ext->base;
+    return &g->nodes[ext->base.id];
+}
+
+/* OP_TIL source node; n is stored as an I64 literal (NULL if that alloc fails). */
+ray_op_t* ray_til(ray_graph_t* g, int64_t n) {
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (!ext) return NULL;
+    ext->base.opcode = OP_TIL;
+    ext->base.arity = 0;
+    ext->base.out_type = RAY_I64;
+    ext->base.est_rows = (uint32_t)(n > UINT32_MAX ? UINT32_MAX : n);
+    ext->literal = ray_i64(n);  /* store n as literal */
+    if (!ext->literal || RAY_IS_ERR(ext->literal)) ext->literal = NULL;  /* L4, as in ray_const_* */
+    g->nodes[ext->base.id] = ext->base;
+    return &g->nodes[ext->base.id];
+}
+
+/* OP_CONST node wrapping `vec`; the graph takes its own reference via ray_retain. */
+ray_op_t* ray_const_vec(ray_graph_t* g, ray_t* vec) {
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (!ext) return NULL;
+    ext->base.opcode = OP_CONST;
+    ext->base.arity = 0;
+    ext->base.out_type = vec->type;
+    /* Saturate like ray_til: a plain cast wraps for len > UINT32_MAX */
+    ext->base.est_rows = (uint32_t)(vec->len > UINT32_MAX ? UINT32_MAX : vec->len);
+    ext->literal = vec;
+    ray_retain(vec);
+    g->nodes[ext->base.id] = ext->base;
+    return &g->nodes[ext->base.id];
+}
+
+/* Generic const-atom constructor.  Handles any scalar atom type
+ * (RAY_SYM, RAY_DATE, RAY_TIME, RAY_TIMESTAMP, RAY_GUID, RAY_NULL,
+ * and any other ray_t* used as an immediate literal).  The executor
+ * OP_CONST handler just returns ext->literal, so the same retain/
+ * store mechanism as ray_const_vec works for atoms too. */
+ray_op_t* ray_const_atom(ray_graph_t* g, ray_t* atom) {
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (!ext) return NULL;
+
+    ext->base.opcode = OP_CONST;
+    ext->base.arity = 0;
+    /* Atom types are stored negated (-RAY_I64 etc); the executor
+     * does not rely on out_type for OP_CONST dispatch, but we keep
+     * it consistent with the source atom. */
+    ext->base.out_type = atom->type;
+    ext->base.est_rows = 1;
+    ext->literal = atom;
+    ray_retain(atom);
+
+    g->nodes[ext->base.id] = ext->base;
+    return &g->nodes[ext->base.id];
+}
+
+ray_op_t* ray_const_table(ray_graph_t* g, ray_t* tbl) {
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (!ext) return NULL;
+
+    ext->base.opcode = OP_CONST;
+    ext->base.arity = 0;
+    ext->base.out_type = RAY_TABLE;
+    ext->literal = tbl;
+    ray_retain(tbl);
+
+    g->nodes[ext->base.id] = ext->base;
+    return &g->nodes[ext->base.id];
+}
+
+/* --------------------------------------------------------------------------
+ * Helper: create unary/binary node
+ * -------------------------------------------------------------------------- */
+
+static ray_op_t* make_unary(ray_graph_t* g, uint16_t opcode, ray_op_t* a, int8_t out_type) {
+    /* Save ID before alloc — realloc may invalidate the pointer */
+    uint32_t a_id = a->id;
+    uint32_t est = a->est_rows;
+    ray_op_t* n = graph_alloc_node(g);
+    if (!n) return NULL;
+    a = &g->nodes[a_id];  /* re-resolve after potential realloc */
+
+    n->opcode = opcode;
+    n->arity = 1;
+    n->inputs[0] = a;
+    n->out_type = out_type;
+    n->est_rows = est;
+    return n;
+}
+
+static ray_op_t* make_binary(ray_graph_t* g, uint16_t opcode, ray_op_t* a, ray_op_t* b, int8_t out_type) {
+    /* Save IDs before alloc — realloc may invalidate the pointers */
+    uint32_t a_id = a->id;
+    uint32_t b_id = b->id;
+    uint32_t est = a->est_rows > b->est_rows ? a->est_rows : b->est_rows;
+    ray_op_t* n = graph_alloc_node(g);
+    if (!n) return NULL;
+    a = &g->nodes[a_id];  /* re-resolve after potential realloc */
+    b = &g->nodes[b_id];
+
+    n->opcode = opcode;
+    n->arity = 2;
+    n->inputs[0] = a;
+    n->inputs[1] = b;
+    n->out_type = out_type;
+    n->est_rows = est;
+    return n;
+}
+
+/* Type promotion: BOOL < U8 < I16 < I32 < I64 < F64.
+ * RAY_STR is its own type class — not promotable to numeric types. */
+static int8_t promote(int8_t a, int8_t b) {
+    if (a == RAY_STR || b == RAY_STR) return RAY_STR;
+    if (a == RAY_F64 || b == RAY_F64) return RAY_F64;
+    if (a == RAY_I64 || b == RAY_I64 || a == RAY_SYM || b == RAY_SYM ||
+        a == RAY_TIMESTAMP || b == RAY_TIMESTAMP) return RAY_I64;
+    if (a == RAY_I32 || b == RAY_I32 ||
+        a == RAY_DATE || b == RAY_DATE || a == RAY_TIME || b == RAY_TIME) return RAY_I32;
+    if (a == RAY_I16 || b == RAY_I16) return RAY_I16;
+    if (a == RAY_U8 || b == RAY_U8) return RAY_U8;
+    return RAY_BOOL;
+}
+
+/* --------------------------------------------------------------------------
+ * Unary element-wise ops
+ * -------------------------------------------------------------------------- */
+
+ray_op_t* ray_neg(ray_graph_t* g, ray_op_t* a)     { return make_unary(g, OP_NEG, a, a->out_type); }
+ray_op_t* ray_abs(ray_graph_t* g, ray_op_t* a)     { return make_unary(g, OP_ABS, a, a->out_type); }
+ray_op_t* ray_not(ray_graph_t* g, ray_op_t* a)     { return make_unary(g, OP_NOT, a, RAY_BOOL); }
+ray_op_t* ray_sqrt_op(ray_graph_t* g, ray_op_t* a) { return make_unary(g, OP_SQRT, a, RAY_F64); }
+ray_op_t* ray_log_op(ray_graph_t* g, ray_op_t* a)  { return make_unary(g, OP_LOG, a, RAY_F64); }
+ray_op_t* ray_exp_op(ray_graph_t* g, ray_op_t* a)  { return make_unary(g, OP_EXP, a, RAY_F64); }
+ray_op_t* ray_ceil_op(ray_graph_t* g, ray_op_t* a) { return make_unary(g, OP_CEIL, a, a->out_type); }
+ray_op_t* ray_floor_op(ray_graph_t* g, ray_op_t* a){ return make_unary(g, OP_FLOOR, a, a->out_type); }
+ray_op_t* ray_round_op(ray_graph_t* g, ray_op_t* a){ return make_unary(g, OP_ROUND, a, a->out_type); }
+ray_op_t* ray_isnull(ray_graph_t* g, ray_op_t* a)  { return make_unary(g, OP_ISNULL, a, RAY_BOOL); }
+
+ray_op_t* ray_cast(ray_graph_t* g, ray_op_t* a, int8_t target_type) {
+    return make_unary(g, OP_CAST, a, target_type);
+}
+
+/* --------------------------------------------------------------------------
+ * Binary element-wise ops
+ * -------------------------------------------------------------------------- */
+
+/* Generic binary op constructor — opcode-driven, no switch/case needed by caller */
+ray_op_t* ray_binop(ray_graph_t* g, uint16_t opcode, ray_op_t* a, ray_op_t* b) {
+    int8_t out;
+    switch (opcode) {
+    case OP_EQ: case OP_NE: case OP_LT: case OP_LE:
+    case OP_GT: case OP_GE: case OP_AND: case OP_OR:
+        out = RAY_BOOL; break;
+    case OP_DIV:
+        out = RAY_F64; break;
+    default:
+        out = promote(a->out_type, b->out_type); break;
+    }
+    return make_binary(g, opcode, a, b, out);
+}
+
+ray_op_t* ray_add(ray_graph_t* g, ray_op_t* a, ray_op_t* b) { return make_binary(g, OP_ADD, a, b, promote(a->out_type, b->out_type)); }
+ray_op_t* ray_sub(ray_graph_t* g, ray_op_t* a, ray_op_t* b) { return make_binary(g, OP_SUB, a, b, promote(a->out_type, b->out_type)); }
+ray_op_t* ray_mul(ray_graph_t* g, ray_op_t* a, ray_op_t* b) { return make_binary(g, OP_MUL, a, b, promote(a->out_type, b->out_type)); }
+ray_op_t* ray_div(ray_graph_t* g, ray_op_t* a, ray_op_t* b) { return make_binary(g, OP_DIV, a, b, RAY_F64); }
+ray_op_t* ray_mod(ray_graph_t* g, ray_op_t* a, ray_op_t* b) { return make_binary(g, OP_MOD, a, b, promote(a->out_type, b->out_type)); }
+
+ray_op_t* ray_eq(ray_graph_t* g, ray_op_t* a, ray_op_t* b) { return make_binary(g, OP_EQ, a, b, RAY_BOOL); }
+ray_op_t* ray_ne(ray_graph_t* g, ray_op_t* a, ray_op_t* b) { return make_binary(g, OP_NE, a, b, RAY_BOOL); }
+ray_op_t* ray_lt(ray_graph_t* g, ray_op_t* a, ray_op_t* b) { return make_binary(g, OP_LT, a, b, RAY_BOOL); }
+ray_op_t* ray_le(ray_graph_t* g, ray_op_t* a, ray_op_t* b) { return make_binary(g, OP_LE, a, b, RAY_BOOL); }
+ray_op_t* ray_gt(ray_graph_t* g, ray_op_t* a, ray_op_t* b) { return make_binary(g, OP_GT, a, b, RAY_BOOL); }
+ray_op_t* ray_ge(ray_graph_t* g, ray_op_t* a, ray_op_t* b) { return make_binary(g, OP_GE, a, b, RAY_BOOL); }
+ray_op_t* ray_and(ray_graph_t* g, ray_op_t* a, ray_op_t* b){ return make_binary(g, OP_AND, a, b, RAY_BOOL); }
+ray_op_t* ray_or(ray_graph_t* g, ray_op_t* a, ray_op_t* b) { return make_binary(g, OP_OR, a, b, RAY_BOOL); }
+ray_op_t* ray_min2(ray_graph_t* g, ray_op_t* a, ray_op_t* b){ return make_binary(g, OP_MIN2, a, b, promote(a->out_type, b->out_type)); }
+ray_op_t* ray_max2(ray_graph_t* g, ray_op_t* a, ray_op_t* b){ return make_binary(g, OP_MAX2, a, b, promote(a->out_type, b->out_type)); }
+ray_op_t* ray_in(ray_graph_t* g, ray_op_t* col, ray_op_t* set){ return make_binary(g, OP_IN, col, set, RAY_BOOL); }
+ray_op_t* ray_not_in(ray_graph_t* g, ray_op_t* col, ray_op_t* set){ return make_binary(g, OP_NOT_IN, col, set, RAY_BOOL); }
+
+ray_op_t* ray_if(ray_graph_t* g, ray_op_t* cond, ray_op_t* then_val, ray_op_t* else_val) {
+    /* 3-input node: cond, then, else — needs ext node */
+    uint32_t cond_id = cond->id;
+    uint32_t then_id = then_val->id;
+    uint32_t else_id = else_val->id;
+    int8_t out_type = promote(then_val->out_type, else_val->out_type);
+    /* IF preserves string types: promote() handles RAY_STR (wins over SYM);
+     * SYM override only applies when neither side is RAY_STR */
+    if (out_type != RAY_STR &&
+        (then_val->out_type == RAY_SYM || else_val->out_type == RAY_SYM))
+        out_type = RAY_SYM;
+    uint32_t est = cond->est_rows;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (!ext) return NULL;
+
+    /* Re-resolve after potential realloc (else_val stored as index, not pointer) */
+    cond = &g->nodes[cond_id];
+    then_val = &g->nodes[then_id];
+
+    ext->base.opcode = OP_IF;
+    ext->base.arity = 2;  /* inputs[0]=cond, inputs[1]=then; else via ext */
+    ext->base.inputs[0] = cond;
+    ext->base.inputs[1] = then_val;
+    ext->base.out_type = out_type;
+    ext->base.est_rows = est;
+    /* Store else_val as a node ID (not a pointer) in the literal field.
+     * Recovered via (uint32_t)(uintptr_t)ext->literal in fuse.c/exec.c. */
+    ext->literal = (ray_t*)(uintptr_t)else_id;
+
+    g->nodes[ext->base.id] = ext->base;
+    return &g->nodes[ext->base.id];
+}
+
+ray_op_t* ray_like(ray_graph_t* g, ray_op_t* input, ray_op_t* pattern) {
+    return make_binary(g, OP_LIKE, input, pattern, RAY_BOOL);
+}
+
+ray_op_t* ray_ilike(ray_graph_t* g, ray_op_t* input, ray_op_t* pattern) {
+    return make_binary(g, OP_ILIKE, input, pattern, RAY_BOOL);
+}
+
+/* String ops */
+ray_op_t* ray_upper(ray_graph_t* g, ray_op_t* a)   { return make_unary(g, OP_UPPER, a, a->out_type == RAY_STR ? RAY_STR : RAY_SYM); }
+ray_op_t* ray_lower(ray_graph_t* g, ray_op_t* a)   { return make_unary(g, OP_LOWER, a, a->out_type == RAY_STR ? RAY_STR : RAY_SYM); }
+ray_op_t* ray_strlen(ray_graph_t* g, ray_op_t* a)  { return make_unary(g, OP_STRLEN, a, RAY_I64); }
+ray_op_t* ray_trim_op(ray_graph_t* g, ray_op_t* a) { return make_unary(g, OP_TRIM, a, a->out_type == RAY_STR ? RAY_STR : RAY_SYM); }
+
+ray_op_t* ray_substr(ray_graph_t* g, ray_op_t* str, ray_op_t* start, ray_op_t* len) {
+    /* 3-input: str=inputs[0], start=inputs[1], len stored via literal field */
+    uint32_t s_id = str->id;
+    uint32_t st_id = start->id;
+    uint32_t l_id = len->id;
+    uint32_t est = str->est_rows;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (!ext) return NULL;
+    str   = &g->nodes[s_id];
+    start = &g->nodes[st_id];
+
+    ext->base.opcode = OP_SUBSTR;
+    ext->base.arity = 2;
+    ext->base.inputs[0] = str;
+    ext->base.inputs[1] = start;
+    ext->base.out_type = (str->out_type == RAY_STR) ? RAY_STR : RAY_SYM;
+    ext->base.est_rows = est;
+    ext->literal = (ray_t*)(uintptr_t)l_id;
+
+    g->nodes[ext->base.id] = ext->base;
+    return &g->nodes[ext->base.id];
+}
+
+/* Builds an OP_REPLACE node.
+ *
+ * `str` and `from` become inputs[0] and inputs[1]. The node ID of `to` is
+ * packed into the ext node's `literal` field (cast through uintptr_t)
+ * instead of being stored as a third input. Output is RAY_STR when `str` is
+ * RAY_STR, RAY_SYM otherwise. Returns NULL if the ext node cannot be allocated.
+ */
+ray_op_t* ray_replace(ray_graph_t* g, ray_op_t* str, ray_op_t* from, ray_op_t* to) {
+    /* Read IDs and the row estimate up front; operands are re-fetched from
+     * g->nodes once the allocation is done. */
+    const uint32_t str_id  = str->id;
+    const uint32_t from_id = from->id;
+    const uint32_t to_id   = to->id;
+    const uint32_t rows    = str->est_rows;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base     = &ext->base;
+    ray_op_t* str_node = &g->nodes[str_id];
+
+    base->opcode    = OP_REPLACE;
+    base->arity     = 2;
+    base->inputs[0] = str_node;
+    base->inputs[1] = &g->nodes[from_id];
+    base->est_rows  = rows;
+    if (str_node->out_type == RAY_STR)
+        base->out_type = RAY_STR;
+    else
+        base->out_type = RAY_SYM;
+    ext->literal = (ray_t*)(uintptr_t)to_id;
+
+    /* Mirror the finished base into the graph's node array and hand that copy back. */
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* Builds a variadic OP_CONCAT node over `n` argument nodes.
+ *
+ * args - array of `n` operand nodes; must be non-NULL.
+ * n    - operand count; must be in [2, 256].
+ *
+ * The first two operands are stored in inputs[0..1]; the IDs of any further
+ * operands are written as uint32_t values into the ext node's trailing
+ * bytes. The total operand count is stored in ext->sym. Output type is
+ * RAY_STR if any operand is RAY_STR, otherwise RAY_SYM.
+ *
+ * Returns the new node, or NULL on invalid arguments or allocation failure.
+ */
+ray_op_t* ray_concat(ray_graph_t* g, ray_op_t** args, int n) {
+    if (!args || n < 2) return NULL;
+    /* M4: bound the operand count so the fixed-size ID buffer below suffices */
+    if (n > 256) return NULL;
+    size_t extra = (size_t)(n - 2) * sizeof(uint32_t);
+
+    /* Everything read through args[] is captured BEFORE the allocation:
+     * growing the graph may move g->nodes and leave args[i] dangling. */
+    uint32_t ids[256];
+    int8_t out_type = RAY_SYM;
+    for (int i = 0; i < n; i++) {
+        ids[i] = args[i]->id;
+        /* RAY_STR if any input is RAY_STR, else RAY_SYM */
+        if (args[i]->out_type == RAY_STR) out_type = RAY_STR;
+    }
+    uint32_t est = args[0]->est_rows;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node_ex(g, extra);
+    if (!ext) return NULL;
+
+    ext->base.opcode = OP_CONCAT;
+    ext->base.arity = 2;
+    ext->base.inputs[0] = &g->nodes[ids[0]];
+    ext->base.inputs[1] = &g->nodes[ids[1]];
+    ext->base.out_type = out_type;
+    ext->base.est_rows = est;
+    ext->sym = n; /* total arg count stored in sym field */
+
+    /* Extra args (index 2 and up) go into the trailing bytes as node IDs */
+    uint32_t* trail = (uint32_t*)EXT_TRAIL(ext);
+    for (int i = 2; i < n; i++) trail[i - 2] = ids[i];
+
+    g->nodes[ext->base.id] = ext->base;
+    return &g->nodes[ext->base.id];
+}
+
+/* --------------------------------------------------------------------------
+ * Reduction ops
+ * -------------------------------------------------------------------------- */
+
+/* Each builder below wraps make_unary() with a fixed opcode and an output-type rule. */
+
+/* OP_SUM: RAY_F64 input keeps RAY_F64, everything else yields RAY_I64. */
+ray_op_t* ray_sum(ray_graph_t* g, ray_op_t* a) {
+    if (a->out_type == RAY_F64)
+        return make_unary(g, OP_SUM, a, RAY_F64);
+    return make_unary(g, OP_SUM, a, RAY_I64);
+}
+
+/* OP_PROD: RAY_F64 input keeps RAY_F64, everything else yields RAY_I64. */
+ray_op_t* ray_prod(ray_graph_t* g, ray_op_t* a) {
+    if (a->out_type == RAY_F64)
+        return make_unary(g, OP_PROD, a, RAY_F64);
+    return make_unary(g, OP_PROD, a, RAY_I64);
+}
+
+/* OP_MIN: output type mirrors the input type. */
+ray_op_t* ray_min_op(ray_graph_t* g, ray_op_t* a) {
+    return make_unary(g, OP_MIN, a, a->out_type);
+}
+
+/* OP_MAX: output type mirrors the input type. */
+ray_op_t* ray_max_op(ray_graph_t* g, ray_op_t* a) {
+    return make_unary(g, OP_MAX, a, a->out_type);
+}
+
+/* OP_COUNT: always RAY_I64. */
+ray_op_t* ray_count(ray_graph_t* g, ray_op_t* a) {
+    return make_unary(g, OP_COUNT, a, RAY_I64);
+}
+
+/* OP_AVG: always RAY_F64. */
+ray_op_t* ray_avg(ray_graph_t* g, ray_op_t* a) {
+    return make_unary(g, OP_AVG, a, RAY_F64);
+}
+
+/* OP_FIRST: output type mirrors the input type. */
+ray_op_t* ray_first(ray_graph_t* g, ray_op_t* a) {
+    return make_unary(g, OP_FIRST, a, a->out_type);
+}
+
+/* OP_LAST: output type mirrors the input type. */
+ray_op_t* ray_last(ray_graph_t* g, ray_op_t* a) {
+    return make_unary(g, OP_LAST, a, a->out_type);
+}
+
+/* OP_COUNT_DISTINCT: always RAY_I64. */
+ray_op_t* ray_count_distinct(ray_graph_t* g, ray_op_t* a) {
+    return make_unary(g, OP_COUNT_DISTINCT, a, RAY_I64);
+}
+
+/* OP_STDDEV: always RAY_F64. */
+ray_op_t* ray_stddev(ray_graph_t* g, ray_op_t* a) {
+    return make_unary(g, OP_STDDEV, a, RAY_F64);
+}
+
+/* OP_STDDEV_POP: always RAY_F64. */
+ray_op_t* ray_stddev_pop(ray_graph_t* g, ray_op_t* a) {
+    return make_unary(g, OP_STDDEV_POP, a, RAY_F64);
+}
+
+/* OP_VAR: always RAY_F64. */
+ray_op_t* ray_var(ray_graph_t* g, ray_op_t* a) {
+    return make_unary(g, OP_VAR, a, RAY_F64);
+}
+
+/* OP_VAR_POP: always RAY_F64. */
+ray_op_t* ray_var_pop(ray_graph_t* g, ray_op_t* a) {
+    return make_unary(g, OP_VAR_POP, a, RAY_F64);
+}
+
+/* --------------------------------------------------------------------------
+ * Structural ops
+ * -------------------------------------------------------------------------- */
+
+/* Builds an OP_FILTER node: inputs[0] is the data node, inputs[1] the
+ * predicate node. The output type follows `input`, and the row estimate is
+ * half of the input's estimate (assumed 50% selectivity).
+ * Returns NULL if the node cannot be allocated. */
+ray_op_t* ray_filter(ray_graph_t* g, ray_op_t* input, ray_op_t* predicate) {
+    /* IDs and estimate are read before allocating; operands are re-fetched by ID after. */
+    const uint32_t in_id   = input->id;
+    const uint32_t pred_id = predicate->id;
+    const uint32_t halved  = input->est_rows / 2;
+
+    ray_op_t* node = graph_alloc_node(g);
+    if (node == NULL) return NULL;
+
+    ray_op_t* in_node = &g->nodes[in_id];
+
+    node->opcode    = OP_FILTER;
+    node->arity     = 2;
+    node->inputs[0] = in_node;
+    node->inputs[1] = &g->nodes[pred_id];
+    node->out_type  = in_node->out_type;
+    node->est_rows  = halved;
+    return node;
+}
+
+/* Builds an OP_SORT node over `table_node`.
+ *
+ * keys        - n_cols sort-key nodes (each is dereferenced for its ID).
+ * descs       - n_cols per-key flags; non-zero is treated as descending.
+ * nulls_first - n_cols per-key flags, or NULL to derive the default from
+ *               descs (NULLS LAST for ASC, NULLS FIRST for DESC).
+ * n_cols      - number of sort keys.
+ *
+ * The key-pointer, desc and nulls_first arrays live in the ext node's
+ * trailing bytes. Returns the new node, or NULL if allocation fails.
+ */
+ray_op_t* ray_sort_op(ray_graph_t* g, ray_op_t* table_node,
+                     ray_op_t** keys, uint8_t* descs, uint8_t* nulls_first,
+                     uint8_t n_cols) {
+    uint32_t table_id = table_node->id;
+    /* L5: n_cols is uint8_t (max 255) so 256-element array is always sufficient. */
+    uint32_t key_ids[256];
+    for (uint8_t i = 0; i < n_cols; i++) key_ids[i] = keys[i]->id;
+
+    size_t keys_sz = (size_t)n_cols * sizeof(ray_op_t*);
+    size_t descs_sz = (size_t)n_cols;
+    size_t nf_sz = (size_t)n_cols;
+    ray_op_ext_t* ext = graph_alloc_ext_node_ex(g, keys_sz + descs_sz + nf_sz);
+    if (!ext) return NULL;
+
+    /* Re-resolve by ID: the allocation may have moved g->nodes */
+    table_node = &g->nodes[table_id];
+
+    ext->base.opcode = OP_SORT;
+    ext->base.arity = 1;
+    ext->base.inputs[0] = table_node;
+    ext->base.out_type = RAY_TABLE;
+    ext->base.est_rows = table_node->est_rows;
+
+    /* Arrays embedded in trailing space — freed with ext node.
+     * Layout: [columns: n_cols * ptr][desc: n_cols * 1B][nulls_first: n_cols * 1B] */
+    char* trail = EXT_TRAIL(ext);
+    ext->sort.columns = (ray_op_t**)trail;
+    for (uint8_t i = 0; i < n_cols; i++)
+        ext->sort.columns[i] = &g->nodes[key_ids[i]];
+    ext->sort.desc = (uint8_t*)(trail + keys_sz);
+    /* Skip the zero-length copy: with n_cols == 0 the caller may pass
+     * descs == NULL, and memcpy requires valid pointers even for size 0. */
+    if (descs_sz > 0) memcpy(ext->sort.desc, descs, descs_sz);
+    ext->sort.nulls_first = (uint8_t*)(trail + keys_sz + descs_sz);
+    if (nulls_first) {
+        memcpy(ext->sort.nulls_first, nulls_first, nf_sz);
+    } else {
+        /* Default: NULLS LAST for ASC, NULLS FIRST for DESC */
+        for (uint8_t i = 0; i < n_cols; i++)
+            ext->sort.nulls_first[i] = descs[i] ? 1 : 0;
+    }
+    ext->sort.n_cols = n_cols;
+
+    g->nodes[ext->base.id] = ext->base;
+    return &g->nodes[ext->base.id];
+}
+
+/* Builds an OP_GROUP node.
+ *
+ * keys / n_keys            - grouping key nodes.
+ * agg_ops, agg_ins, n_aggs - parallel arrays: aggregate opcode and the node
+ *                            it is applied to. Both may be NULL when n_aggs
+ *                            is 0 (ray_distinct calls it that way).
+ *
+ * Key pointers, aggregate opcodes and aggregate input pointers are stored in
+ * the ext node's trailing bytes. Returns the new node, or NULL if the
+ * allocation fails.
+ */
+ray_op_t* ray_group(ray_graph_t* g, ray_op_t** keys, uint8_t n_keys,
+                   uint16_t* agg_ops, ray_op_t** agg_ins, uint8_t n_aggs) {
+    /* Counts are uint8_t (max 255), so 256-entry ID buffers always suffice.
+     * IDs are captured before the allocation below. */
+    uint32_t key_ids[256];
+    uint32_t agg_ids[256];
+    for (uint8_t i = 0; i < n_keys; i++) key_ids[i] = keys[i]->id;
+    for (uint8_t i = 0; i < n_aggs; i++) agg_ids[i] = agg_ins[i]->id;
+
+    size_t keys_sz = (size_t)n_keys * sizeof(ray_op_t*);
+    size_t ops_sz  = (size_t)n_aggs * sizeof(uint16_t);
+    size_t ins_sz  = (size_t)n_aggs * sizeof(ray_op_t*);
+    /* Align ops after keys (pointer-sized), ins after ops (needs ptr alignment) */
+    size_t ops_off = keys_sz;
+    size_t ins_off = ops_off + ops_sz;
+    /* Round ins_off up to pointer alignment */
+    ins_off = (ins_off + sizeof(ray_op_t*) - 1) & ~(sizeof(ray_op_t*) - 1);
+    ray_op_ext_t* ext = graph_alloc_ext_node_ex(g, ins_off + ins_sz);
+    if (!ext) return NULL;
+
+    ext->base.opcode = OP_GROUP;
+    ext->base.arity = 0;
+    ext->base.out_type = RAY_TABLE;
+    /* NOTE(review): keys[0] was already dereferenced in the ID loop above, so
+     * this NULL test cannot protect against a NULL key. When n_keys is 0,
+     * est_rows is not assigned here at all. */
+    if (n_keys > 0 && keys[0])
+        ext->base.est_rows = g->nodes[key_ids[0]].est_rows / 10;  /* rough estimate */
+    /* inputs[0] is filled with the first key even though arity is 0 */
+    ext->base.inputs[0] = n_keys > 0 ? &g->nodes[key_ids[0]] : NULL;
+
+    /* Arrays embedded in trailing space — freed with ext node.
+     * Layout: [keys: n_keys * ptr][agg_ops: n_aggs * 2B][pad][agg_ins: n_aggs * ptr] */
+    char* trail = EXT_TRAIL(ext);
+    ext->keys = (ray_op_t**)trail;
+    for (uint8_t i = 0; i < n_keys; i++)
+        ext->keys[i] = &g->nodes[key_ids[i]];
+    ext->agg_ops = (uint16_t*)(trail + ops_off);
+    /* Guarded so a NULL agg_ops with n_aggs == 0 never reaches memcpy */
+    if (ops_sz > 0) memcpy(ext->agg_ops, agg_ops, ops_sz);
+    ext->agg_ins = (ray_op_t**)(trail + ins_off);
+    for (uint8_t i = 0; i < n_aggs; i++)
+        ext->agg_ins[i] = &g->nodes[agg_ids[i]];
+    ext->n_keys = n_keys;
+    ext->n_aggs = n_aggs;
+
+    /* Publish the finished base into the graph's node array and return that copy */
+    g->nodes[ext->base.id] = ext->base;
+    return &g->nodes[ext->base.id];
+}
+
+/* Builds a distinct-over-keys node by delegating to ray_group() with the
+ * same keys and no aggregates (NULL arrays, count 0). */
+ray_op_t* ray_distinct(ray_graph_t* g, ray_op_t** keys, uint8_t n_keys) {
+    uint16_t*  no_agg_ops = NULL;
+    ray_op_t** no_agg_ins = NULL;
+
+    return ray_group(g, keys, n_keys, no_agg_ops, no_agg_ins, 0);
+}
+
+/* Builds an OP_PIVOT node.
+ *
+ * index_cols / n_index - nodes for the index columns.
+ * pivot_col            - node stored as pivot.pivot_col.
+ * value_col            - node stored as pivot.value_col.
+ * agg_op               - aggregate opcode stored as pivot.agg_op.
+ *
+ * The index-column pointer array is stored in the ext node's trailing
+ * bytes. est_rows is left at 0 (unknown until execution). Returns the new
+ * node, or NULL if allocation fails.
+ */
+ray_op_t* ray_pivot_op(ray_graph_t* g,
+                       ray_op_t** index_cols, uint8_t n_index,
+                       ray_op_t* pivot_col,
+                       ray_op_t* value_col,
+                       uint16_t agg_op) {
+    /* n_index is uint8_t (max 255): size the ID buffer for the full range so
+     * that more than 16 index columns cannot overrun the stack. */
+    uint32_t idx_ids[256];
+    for (uint8_t i = 0; i < n_index; i++) idx_ids[i] = index_cols[i]->id;
+    uint32_t pcol_id = pivot_col->id;
+    uint32_t vcol_id = value_col->id;
+
+    size_t idx_sz = (size_t)n_index * sizeof(ray_op_t*);
+    ray_op_ext_t* ext = graph_alloc_ext_node_ex(g, idx_sz);
+    if (!ext) return NULL;
+
+    ext->base.opcode = OP_PIVOT;
+    ext->base.arity = 0;
+    ext->base.out_type = RAY_TABLE;
+    ext->base.est_rows = 0; /* unknown until execution */
+
+    /* All operands are resolved by ID after the allocation */
+    char* trail = EXT_TRAIL(ext);
+    ext->pivot.index_cols = (ray_op_t**)trail;
+    for (uint8_t i = 0; i < n_index; i++)
+        ext->pivot.index_cols[i] = &g->nodes[idx_ids[i]];
+    ext->pivot.pivot_col = &g->nodes[pcol_id];
+    ext->pivot.value_col = &g->nodes[vcol_id];
+    ext->pivot.agg_op = agg_op;
+    ext->pivot.n_index = n_index;
+
+    g->nodes[ext->base.id] = ext->base;
+    return &g->nodes[ext->base.id];
+}
+
+/* Builds an OP_JOIN node over two table nodes.
+ *
+ * left_keys / right_keys - parallel arrays of n_keys key nodes.
+ * join_type              - stored verbatim in join.join_type.
+ *
+ * Both key-pointer arrays are stored back to back in the ext node's
+ * trailing bytes (left first). The row estimate is the left table's.
+ * Returns the new node, or NULL if allocation fails.
+ */
+ray_op_t* ray_join(ray_graph_t* g,
+                  ray_op_t* left_table, ray_op_t** left_keys,
+                  ray_op_t* right_table, ray_op_t** right_keys,
+                  uint8_t n_keys, uint8_t join_type) {
+    /* Capture all IDs before allocating; operands are re-fetched by ID after. */
+    const uint32_t lhs_id = left_table->id;
+    const uint32_t rhs_id = right_table->id;
+    uint32_t lhs_key_ids[256];
+    uint32_t rhs_key_ids[256];
+    for (uint8_t k = 0; k < n_keys; k++) {
+        lhs_key_ids[k] = left_keys[k]->id;
+        rhs_key_ids[k] = right_keys[k]->id;
+    }
+
+    const size_t one_side_sz = (size_t)n_keys * sizeof(ray_op_t*);
+    ray_op_ext_t* ext = graph_alloc_ext_node_ex(g, one_side_sz * 2);
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base = &ext->base;
+    ray_op_t* lhs  = &g->nodes[lhs_id];
+
+    base->opcode    = OP_JOIN;
+    base->arity     = 2;
+    base->inputs[0] = lhs;
+    base->inputs[1] = &g->nodes[rhs_id];
+    base->out_type  = RAY_TABLE;
+    base->est_rows  = lhs->est_rows;
+
+    /* Trailing bytes: [left_keys: n_keys * ptr][right_keys: n_keys * ptr] */
+    char* trail = EXT_TRAIL(ext);
+    ext->join.left_keys  = (ray_op_t**)trail;
+    ext->join.right_keys = (ray_op_t**)(trail + one_side_sz);
+    for (uint8_t k = 0; k < n_keys; k++) {
+        ext->join.left_keys[k]  = &g->nodes[lhs_key_ids[k]];
+        ext->join.right_keys[k] = &g->nodes[rhs_key_ids[k]];
+    }
+    ext->join.n_join_keys = n_keys;
+    ext->join.join_type   = join_type;
+
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* Builds an OP_ANTIJOIN node over two table nodes.
+ *
+ * Same operand layout as ray_join(): the left and right key-pointer arrays
+ * sit back to back in the ext node's trailing bytes. join.join_type is
+ * fixed to 3 (anti). The row estimate is the left table's.
+ * Returns the new node, or NULL if allocation fails.
+ */
+ray_op_t* ray_antijoin(ray_graph_t* g,
+                      ray_op_t* left_table, ray_op_t** left_keys,
+                      ray_op_t* right_table, ray_op_t** right_keys,
+                      uint8_t n_keys) {
+    /* Capture all IDs before allocating; operands are re-fetched by ID after. */
+    const uint32_t lhs_id = left_table->id;
+    const uint32_t rhs_id = right_table->id;
+    uint32_t lhs_key_ids[256];
+    uint32_t rhs_key_ids[256];
+    for (uint8_t k = 0; k < n_keys; k++) {
+        lhs_key_ids[k] = left_keys[k]->id;
+        rhs_key_ids[k] = right_keys[k]->id;
+    }
+
+    const size_t one_side_sz = (size_t)n_keys * sizeof(ray_op_t*);
+    ray_op_ext_t* ext = graph_alloc_ext_node_ex(g, one_side_sz * 2);
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base = &ext->base;
+    ray_op_t* lhs  = &g->nodes[lhs_id];
+
+    base->opcode    = OP_ANTIJOIN;
+    base->arity     = 2;
+    base->inputs[0] = lhs;
+    base->inputs[1] = &g->nodes[rhs_id];
+    base->out_type  = RAY_TABLE;
+    base->est_rows  = lhs->est_rows;
+
+    /* Trailing bytes: [left_keys: n_keys * ptr][right_keys: n_keys * ptr] */
+    char* trail = EXT_TRAIL(ext);
+    ext->join.left_keys  = (ray_op_t**)trail;
+    ext->join.right_keys = (ray_op_t**)(trail + one_side_sz);
+    for (uint8_t k = 0; k < n_keys; k++) {
+        ext->join.left_keys[k]  = &g->nodes[lhs_key_ids[k]];
+        ext->join.right_keys[k] = &g->nodes[rhs_key_ids[k]];
+    }
+    ext->join.n_join_keys = n_keys;
+    ext->join.join_type   = 3;  /* anti */
+
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* Builds an as-of join node (emitted with opcode OP_WINDOW_JOIN).
+ *
+ * left_table / right_table - become inputs[0] and inputs[1].
+ * time_key                 - node stored as asof.time_key.
+ * eq_keys / n_eq_keys      - equality key nodes; their pointers are stored
+ *                            in the ext node's trailing bytes.
+ * join_type                - stored verbatim in asof.join_type.
+ *
+ * The row estimate is the left table's. Returns the new node, or NULL if
+ * allocation fails.
+ */
+ray_op_t* ray_asof_join(ray_graph_t* g,
+                       ray_op_t* left_table, ray_op_t* right_table,
+                       ray_op_t* time_key,
+                       ray_op_t** eq_keys, uint8_t n_eq_keys,
+                       uint8_t join_type) {
+    /* Capture all IDs before allocating; operands are re-fetched by ID after. */
+    const uint32_t lhs_id = left_table->id;
+    const uint32_t rhs_id = right_table->id;
+    const uint32_t ts_id  = time_key->id;
+    uint32_t eq_key_ids[256];
+    for (uint8_t k = 0; k < n_eq_keys; k++)
+        eq_key_ids[k] = eq_keys[k]->id;
+
+    /* Trailing: [eq_keys: n_eq_keys * ptr] */
+    ray_op_ext_t* ext =
+        graph_alloc_ext_node_ex(g, (size_t)n_eq_keys * sizeof(ray_op_t*));
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base = &ext->base;
+    ray_op_t* lhs  = &g->nodes[lhs_id];
+
+    base->opcode    = OP_WINDOW_JOIN;
+    base->arity     = 2;
+    base->inputs[0] = lhs;
+    base->inputs[1] = &g->nodes[rhs_id];
+    base->out_type  = RAY_TABLE;
+    base->est_rows  = lhs->est_rows;
+
+    ray_op_t** stored_keys = (ray_op_t**)EXT_TRAIL(ext);
+    for (uint8_t k = 0; k < n_eq_keys; k++)
+        stored_keys[k] = &g->nodes[eq_key_ids[k]];
+
+    ext->asof.time_key  = &g->nodes[ts_id];
+    ext->asof.n_eq_keys = n_eq_keys;
+    ext->asof.join_type = join_type;
+    ext->asof.eq_keys   = stored_keys;
+
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* Builds an OP_WINDOW node over `table_node`.
+ *
+ * part_keys / n_part                 - partition key nodes.
+ * order_keys, order_descs / n_order  - ordering key nodes and a parallel
+ *                                      byte-per-key flag array.
+ * func_kinds, func_inputs,
+ * func_params / n_funcs              - parallel arrays describing each
+ *                                      window function: a kind byte, an
+ *                                      input node and an int64 parameter.
+ * frame_type, frame_start, frame_end,
+ * frame_start_n, frame_end_n         - frame settings, stored verbatim.
+ *
+ * All arrays are copied into the ext node's trailing bytes. The row
+ * estimate equals the input table's. Returns the new node, or NULL if
+ * allocation fails.
+ */
+ray_op_t* ray_window_op(ray_graph_t* g, ray_op_t* table_node,
+                       ray_op_t** part_keys, uint8_t n_part,
+                       ray_op_t** order_keys, uint8_t* order_descs, uint8_t n_order,
+                       uint8_t* func_kinds, ray_op_t** func_inputs,
+                       int64_t* func_params, uint8_t n_funcs,
+                       uint8_t frame_type, uint8_t frame_start, uint8_t frame_end,
+                       int64_t frame_start_n, int64_t frame_end_n) {
+    /* Counts are uint8_t (max 255), so 256-entry ID buffers always suffice */
+    uint32_t part_ids[256];
+    uint32_t order_ids[256];
+    uint32_t func_ids[256];
+    for (uint8_t i = 0; i < n_part; i++) part_ids[i] = part_keys[i]->id;
+    for (uint8_t i = 0; i < n_order; i++) order_ids[i] = order_keys[i]->id;
+    for (uint8_t i = 0; i < n_funcs; i++) func_ids[i] = func_inputs[i]->id;
+
+    /* Trailing layout:
+     *   [part_keys:   n_part * ptr]
+     *   [order_keys:  n_order * ptr]
+     *   [order_descs: n_order * 1B]
+     *   [padding to ptr alignment]
+     *   [func_inputs: n_funcs * ptr]
+     *   [func_kinds:  n_funcs * 1B]
+     *   [padding to 8B alignment]
+     *   [func_params: n_funcs * 8B]
+     */
+    size_t pk_sz    = (size_t)n_part  * sizeof(ray_op_t*);
+    size_t ok_sz    = (size_t)n_order * sizeof(ray_op_t*);
+    size_t od_sz    = (size_t)n_order;
+    size_t od_end   = pk_sz + ok_sz + od_sz;
+    /* Round up to the next multiple of the pointer size */
+    size_t fi_off   = (od_end + sizeof(ray_op_t*) - 1) & ~(sizeof(ray_op_t*) - 1);
+    size_t fi_sz    = (size_t)n_funcs * sizeof(ray_op_t*);
+    size_t fk_off   = fi_off + fi_sz;
+    size_t fk_sz    = (size_t)n_funcs;
+    /* Round up to the next multiple of 8 for the int64 parameters */
+    size_t fp_off   = (fk_off + fk_sz + 7) & ~(size_t)7;
+    size_t fp_sz    = (size_t)n_funcs * sizeof(int64_t);
+    size_t total    = fp_off + fp_sz;
+
+    /* Save IDs before alloc — realloc may invalidate pointers */
+    uint32_t table_id = table_node->id;
+    uint32_t est   = table_node->est_rows;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node_ex(g, total);
+    if (!ext) return NULL;
+
+    /* Re-resolve table_node after potential realloc */
+    table_node = &g->nodes[table_id];
+
+    ext->base.opcode   = OP_WINDOW;
+    ext->base.arity    = 1;
+    ext->base.inputs[0] = table_node;
+    ext->base.out_type = RAY_TABLE;
+    ext->base.est_rows = est;  /* window preserves row count */
+
+    /* Fill trailing arrays */
+    char* trail = EXT_TRAIL(ext);
+    ext->window.part_keys = (ray_op_t**)trail;
+    for (uint8_t i = 0; i < n_part; i++)
+        ext->window.part_keys[i] = &g->nodes[part_ids[i]];
+
+    ext->window.order_keys = (ray_op_t**)(trail + pk_sz);
+    for (uint8_t i = 0; i < n_order; i++)
+        ext->window.order_keys[i] = &g->nodes[order_ids[i]];
+
+    ext->window.order_descs = (uint8_t*)(trail + pk_sz + ok_sz);
+    /* Copies are skipped for empty arrays, so the source pointers are only
+     * touched when the corresponding count is non-zero */
+    if (n_order) memcpy(ext->window.order_descs, order_descs, od_sz);
+
+    ext->window.func_inputs = (ray_op_t**)(trail + fi_off);
+    for (uint8_t i = 0; i < n_funcs; i++)
+        ext->window.func_inputs[i] = &g->nodes[func_ids[i]];
+
+    ext->window.func_kinds = (uint8_t*)(trail + fk_off);
+    if (n_funcs) memcpy(ext->window.func_kinds, func_kinds, fk_sz);
+
+    ext->window.func_params = (int64_t*)(trail + fp_off);
+    if (n_funcs) memcpy(ext->window.func_params, func_params, fp_sz);
+
+    ext->window.n_part_keys   = n_part;
+    ext->window.n_order_keys  = n_order;
+    ext->window.n_funcs       = n_funcs;
+    ext->window.frame_type    = frame_type;
+    ext->window.frame_start   = frame_start;
+    ext->window.frame_end     = frame_end;
+    ext->window.frame_start_n = frame_start_n;
+    ext->window.frame_end_n   = frame_end_n;
+
+    /* Publish the finished base into the graph's node array and return that copy */
+    g->nodes[ext->base.id] = ext->base;
+    return &g->nodes[ext->base.id];
+}
+
+/* Builds an OP_SELECT node over `input` projecting the `n_cols` nodes in
+ * `cols`. The column-pointer array lives in the ext node's trailing bytes
+ * and is recorded through the `sort.columns` / `sort.n_cols` fields.
+ * The row estimate equals the input's. Returns NULL if allocation fails. */
+ray_op_t* ray_select(ray_graph_t* g, ray_op_t* input,
+                    ray_op_t** cols, uint8_t n_cols) {
+    /* Capture IDs before allocating; operands are re-fetched by ID after. */
+    const uint32_t src_id = input->id;
+    uint32_t picked_ids[256];
+    for (uint8_t c = 0; c < n_cols; c++)
+        picked_ids[c] = cols[c]->id;
+
+    ray_op_ext_t* ext =
+        graph_alloc_ext_node_ex(g, (size_t)n_cols * sizeof(ray_op_t*));
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base   = &ext->base;
+    ray_op_t* source = &g->nodes[src_id];
+
+    base->opcode    = OP_SELECT;
+    base->arity     = 1;
+    base->inputs[0] = source;
+    base->out_type  = RAY_TABLE;
+    base->est_rows  = source->est_rows;
+
+    /* Array embedded in trailing space — freed with ext node */
+    ray_op_t** projected = (ray_op_t**)EXT_TRAIL(ext);
+    for (uint8_t c = 0; c < n_cols; c++)
+        projected[c] = &g->nodes[picked_ids[c]];
+    ext->sort.columns = projected;
+    ext->sort.n_cols  = n_cols;
+
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* Builds an OP_HEAD node over `input`.
+ *
+ * `n` is stored in ext->sym and, cast to uint32_t, used as the row
+ * estimate. The output type follows the input.
+ * L6: when n is 0, HEAD produces an empty result with the same schema as
+ * the input.
+ * Returns NULL if the ext node cannot be allocated. */
+ray_op_t* ray_head(ray_graph_t* g, ray_op_t* input, int64_t n) {
+    const uint32_t src_id = input->id;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base   = &ext->base;
+    ray_op_t* source = &g->nodes[src_id];
+
+    base->opcode    = OP_HEAD;
+    base->arity     = 1;
+    base->inputs[0] = source;
+    base->out_type  = source->out_type;
+    base->est_rows  = (uint32_t)n;
+    ext->sym        = n;
+
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* Builds an OP_TAIL node over `input`.
+ *
+ * `n` is stored in ext->sym and, cast to uint32_t, used as the row
+ * estimate. The output type follows the input.
+ * Returns NULL if the ext node cannot be allocated. */
+ray_op_t* ray_tail(ray_graph_t* g, ray_op_t* input, int64_t n) {
+    const uint32_t src_id = input->id;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base   = &ext->base;
+    ray_op_t* source = &g->nodes[src_id];
+
+    base->opcode    = OP_TAIL;
+    base->arity     = 1;
+    base->inputs[0] = source;
+    base->out_type  = source->out_type;
+    base->est_rows  = (uint32_t)n;
+    ext->sym        = n;
+
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* Builds an OP_ALIAS node over `input`.
+ *
+ * `name` (a NUL-terminated string; it is passed to strlen) is interned via
+ * ray_sym_intern() and the resulting symbol is stored in ext->sym. Output
+ * type and row estimate are copied from the input.
+ * Returns NULL if the ext node cannot be allocated. */
+ray_op_t* ray_alias(ray_graph_t* g, ray_op_t* input, const char* name) {
+    const uint32_t src_id = input->id;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base   = &ext->base;
+    ray_op_t* source = &g->nodes[src_id];
+
+    base->opcode    = OP_ALIAS;
+    base->arity     = 1;
+    base->inputs[0] = source;
+    base->out_type  = source->out_type;
+    base->est_rows  = source->est_rows;
+    /* Interning happens only once the node exists */
+    ext->sym = ray_sym_intern(name, strlen(name));
+
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* Builds an OP_EXTRACT node over `col`.
+ *
+ * `field` is stored verbatim in ext->sym. The output type is always
+ * RAY_I64 and the row estimate equals the input's.
+ * Returns NULL if the ext node cannot be allocated. */
+ray_op_t* ray_extract(ray_graph_t* g, ray_op_t* col, int64_t field) {
+    /* Read before allocating; the operand is re-fetched by ID afterwards. */
+    const uint32_t src_id = col->id;
+    const uint32_t rows   = col->est_rows;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base = &ext->base;
+
+    base->opcode    = OP_EXTRACT;
+    base->arity     = 1;
+    base->inputs[0] = &g->nodes[src_id];
+    base->out_type  = RAY_I64;
+    base->est_rows  = rows;
+    ext->sym        = field;
+
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* Builds an OP_DATE_TRUNC node over `col`.
+ *
+ * `field` is stored verbatim in ext->sym. The output type is always
+ * RAY_TIMESTAMP (microseconds) and the row estimate equals the input's.
+ * Returns NULL if the ext node cannot be allocated. */
+ray_op_t* ray_date_trunc(ray_graph_t* g, ray_op_t* col, int64_t field) {
+    /* Read before allocating; the operand is re-fetched by ID afterwards. */
+    const uint32_t src_id = col->id;
+    const uint32_t rows   = col->est_rows;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base = &ext->base;
+
+    base->opcode    = OP_DATE_TRUNC;
+    base->arity     = 1;
+    base->inputs[0] = &g->nodes[src_id];
+    base->out_type  = RAY_TIMESTAMP;  /* returns timestamp (microseconds) */
+    base->est_rows  = rows;
+    ext->sym        = field;
+
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* Builds an OP_MATERIALIZE node over `input`; output type and row estimate
+ * are copied from the input. Returns NULL if the node cannot be allocated. */
+ray_op_t* ray_materialize(ray_graph_t* g, ray_op_t* input) {
+    const uint32_t src_id = input->id;
+
+    ray_op_t* node = graph_alloc_node(g);
+    if (node == NULL) return NULL;
+
+    /* Look the operand up again by ID now that the allocation is done */
+    ray_op_t* source = &g->nodes[src_id];
+
+    node->opcode    = OP_MATERIALIZE;
+    node->arity     = 1;
+    node->inputs[0] = source;
+    node->out_type  = source->out_type;
+    node->est_rows  = source->est_rows;
+    return node;
+}
+
+/* --------------------------------------------------------------------------
+ * Multi-table support
+ * -------------------------------------------------------------------------- */
+
+/* Appends `table` to the graph's table list and returns its index.
+ *
+ * The list is grown by one slot via ray_sys_realloc() and the table is
+ * retained with ray_retain(). Returns UINT16_MAX (the error sentinel) when
+ * the list cannot grow: either reallocation failed or the index space is
+ * exhausted. On failure the graph is left unchanged.
+ */
+uint16_t ray_graph_add_table(ray_graph_t* g, ray_t* table) {
+    uint16_t id = g->n_tables;
+    /* Index UINT16_MAX is unusable: it is the error sentinel, ray_scan_table
+     * stores id + 1 in 16 bits, and id + 1 would wrap to 0 here, producing a
+     * zero-size realloc followed by an out-of-bounds store. */
+    if (id == UINT16_MAX) return UINT16_MAX;
+    uint16_t new_cap = (uint16_t)(id + 1);
+
+    ray_t** new_tables = (ray_t**)ray_sys_realloc(g->tables,
+                                                (size_t)new_cap * sizeof(ray_t*));
+    if (!new_tables) return UINT16_MAX;  /* error sentinel */
+    g->tables = new_tables;
+    g->tables[id] = table;
+    ray_retain(table);
+    g->n_tables = new_cap;
+
+    return id;
+}
+
+/* Builds an OP_SCAN node reading column `col_name` from the table
+ * registered under `table_id`.
+ *
+ * The column name is interned and stored in ext->sym. `table_id + 1` is
+ * written into base.pad as a uint16_t (0 = default g->table). If the table
+ * has that column, its type and length become the node's output type and
+ * row estimate; otherwise those fields are not assigned here.
+ *
+ * Returns NULL when `table_id` is out of range, the slot is empty, or the
+ * ext node cannot be allocated.
+ */
+ray_op_t* ray_scan_table(ray_graph_t* g, uint16_t table_id, const char* col_name) {
+    if (table_id >= g->n_tables) return NULL;
+    if (g->tables[table_id] == NULL) return NULL;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base = &ext->base;
+    base->opcode = OP_SCAN;
+    base->arity  = 0;
+
+    const int64_t col_sym = ray_sym_intern(col_name, strlen(col_name));
+    ext->sym = col_sym;
+
+    /* Store table_id+1 in pad[0..1] as uint16_t (0 = default g->table) */
+    const uint16_t biased_id = table_id + 1;
+    memcpy(base->pad, &biased_id, sizeof(uint16_t));
+
+    /* Infer output type from the specified table */
+    ray_t* source_table = g->tables[table_id];
+    ray_t* column = source_table ? ray_table_get_col(source_table, col_sym) : NULL;
+    if (column != NULL) {
+        base->out_type = column->type;
+        base->est_rows = (uint32_t)column->len;
+    }
+
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* --------------------------------------------------------------------------
+ * Graph traversal DAG builders
+ * -------------------------------------------------------------------------- */
+
+/* Builds an OP_EXPAND node: a single-hop expansion (min_depth = max_depth
+ * = 1, no path tracking) from `src_nodes` over `rel` in `direction`.
+ * The row estimate is ten times the source's (rough 10x fan-out guess).
+ * Returns NULL if the ext node cannot be allocated. */
+ray_op_t* ray_expand(ray_graph_t* g, ray_op_t* src_nodes,
+                    ray_rel_t* rel, uint8_t direction) {
+    /* Read before allocating; the operand is re-fetched by ID afterwards. */
+    const uint32_t start_id = src_nodes->id;
+    const uint32_t rows     = src_nodes->est_rows;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base = &ext->base;
+
+    base->opcode    = OP_EXPAND;
+    base->arity     = 1;
+    base->inputs[0] = &g->nodes[start_id];
+    base->out_type  = RAY_TABLE;
+    base->est_rows  = rows * 10;  /* rough estimate: 10x fan-out */
+
+    ext->graph.rel           = rel;
+    ext->graph.direction     = direction;
+    ext->graph.min_depth     = 1;
+    ext->graph.max_depth     = 1;
+    ext->graph.path_tracking = 0;
+
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* Builds an OP_VAR_EXPAND node: a variable-depth expansion from
+ * `start_nodes` over `rel` in `direction`, bounded by min_depth/max_depth.
+ * `track_path` is stored as 1 or 0 in graph.path_tracking. The row
+ * estimate is 100 times the source's (rough guess).
+ * Returns NULL if the ext node cannot be allocated. */
+ray_op_t* ray_var_expand(ray_graph_t* g, ray_op_t* start_nodes,
+                        ray_rel_t* rel, uint8_t direction,
+                        uint8_t min_depth, uint8_t max_depth,
+                        bool track_path) {
+    /* Read before allocating; the operand is re-fetched by ID afterwards. */
+    const uint32_t start_id = start_nodes->id;
+    const uint32_t rows     = start_nodes->est_rows;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base = &ext->base;
+
+    base->opcode    = OP_VAR_EXPAND;
+    base->arity     = 1;
+    base->inputs[0] = &g->nodes[start_id];
+    base->out_type  = RAY_TABLE;
+    base->est_rows  = rows * 100;  /* rough estimate */
+
+    ext->graph.rel       = rel;
+    ext->graph.direction = direction;
+    ext->graph.min_depth = min_depth;
+    ext->graph.max_depth = max_depth;
+    if (track_path)
+        ext->graph.path_tracking = 1;
+    else
+        ext->graph.path_tracking = 0;
+
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* Builds an OP_SHORTEST_PATH node between `src` (inputs[0]) and `dst`
+ * (inputs[1]) over `rel`. Direction is fixed to 0 (forward), min_depth to
+ * 0, path tracking is off, and `max_depth` doubles as the row estimate.
+ * Returns NULL if the ext node cannot be allocated. */
+ray_op_t* ray_shortest_path(ray_graph_t* g, ray_op_t* src, ray_op_t* dst,
+                           ray_rel_t* rel, uint8_t max_depth) {
+    /* Read IDs before allocating; operands are re-fetched by ID afterwards. */
+    const uint32_t from_id = src->id;
+    const uint32_t to_id   = dst->id;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base = &ext->base;
+
+    base->opcode    = OP_SHORTEST_PATH;
+    base->arity     = 2;
+    base->inputs[0] = &g->nodes[from_id];
+    base->inputs[1] = &g->nodes[to_id];
+    base->out_type  = RAY_TABLE;
+    base->est_rows  = max_depth;
+
+    ext->graph.rel           = rel;
+    ext->graph.direction     = 0;  /* forward by default */
+    ext->graph.min_depth     = 0;
+    ext->graph.max_depth     = max_depth;
+    ext->graph.path_tracking = 0;
+
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* --------------------------------------------------------------------------
+ * Graph algorithm builders
+ * -------------------------------------------------------------------------- */
+
+/* Builds an OP_PAGERANK node over `rel` (no operator inputs).
+ * `max_iter` and `damping` are stored verbatim; direction is set to 0. The
+ * row estimate is rel->fwd.n_nodes cast to uint32_t.
+ * Returns NULL when `g` or `rel` is NULL or allocation fails. */
+ray_op_t* ray_pagerank(ray_graph_t* g, ray_rel_t* rel,
+                      uint16_t max_iter, double damping) {
+    if (g == NULL || rel == NULL) return NULL;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base = &ext->base;
+    base->opcode   = OP_PAGERANK;
+    base->arity    = 0;
+    base->out_type = RAY_TABLE;
+    base->est_rows = (uint32_t)rel->fwd.n_nodes;
+
+    ext->graph.rel       = rel;
+    ext->graph.max_iter  = max_iter;
+    ext->graph.damping   = damping;
+    ext->graph.direction = 0;
+
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* Builds an OP_CONNECTED_COMP node over `rel` (no operator inputs).
+ * Direction is set to 2 (both directions, i.e. undirected). The row
+ * estimate is rel->fwd.n_nodes cast to uint32_t.
+ * Returns NULL when `g` or `rel` is NULL or allocation fails. */
+ray_op_t* ray_connected_comp(ray_graph_t* g, ray_rel_t* rel) {
+    if (g == NULL || rel == NULL) return NULL;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base = &ext->base;
+    base->opcode   = OP_CONNECTED_COMP;
+    base->arity    = 0;
+    base->out_type = RAY_TABLE;
+    base->est_rows = (uint32_t)rel->fwd.n_nodes;
+
+    ext->graph.rel       = rel;
+    ext->graph.direction = 2;  /* both directions for undirected */
+
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* Builds an OP_DIJKSTRA node from `src` over `rel`, weighted by the column
+ * named `weight_col`.
+ *
+ * `dst` is optional: with a destination the node has arity 2 and
+ * inputs[1] = dst; without one it has arity 1 and inputs[1] = NULL.
+ * `weight_col` is interned and stored in graph.weight_col_sym. Direction is
+ * set to 0 and the row estimate is rel->fwd.n_nodes cast to uint32_t.
+ *
+ * Returns NULL when `g`, `src`, `rel` or `weight_col` is NULL, or when
+ * allocation fails.
+ */
+ray_op_t* ray_dijkstra(ray_graph_t* g, ray_op_t* src, ray_op_t* dst,
+                      ray_rel_t* rel, const char* weight_col,
+                      uint8_t max_depth) {
+    if (g == NULL || src == NULL || rel == NULL || weight_col == NULL)
+        return NULL;
+
+    /* Save IDs before alloc — realloc may invalidate the pointers */
+    const int      has_dst = (dst != NULL);
+    const uint32_t from_id = src->id;
+    const uint32_t to_id   = has_dst ? dst->id : 0;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base = &ext->base;
+
+    base->opcode    = OP_DIJKSTRA;
+    base->arity     = has_dst ? 2 : 1;
+    base->inputs[0] = &g->nodes[from_id];
+    base->inputs[1] = has_dst ? &g->nodes[to_id] : NULL;
+    base->out_type  = RAY_TABLE;
+    base->est_rows  = (uint32_t)rel->fwd.n_nodes;
+
+    ext->graph.rel            = rel;
+    ext->graph.direction      = 0;
+    ext->graph.max_depth      = max_depth;
+    ext->graph.weight_col_sym = ray_sym_intern(weight_col, (int64_t)strlen(weight_col));
+
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* Builds an OP_LOUVAIN node over `rel` (no operator inputs).
+ * A `max_iter` of 0 is replaced by 100; direction is set to 2. The row
+ * estimate is rel->fwd.n_nodes cast to uint32_t.
+ * Returns NULL when `g` or `rel` is NULL or allocation fails. */
+ray_op_t* ray_louvain(ray_graph_t* g, ray_rel_t* rel, uint16_t max_iter) {
+    if (g == NULL || rel == NULL) return NULL;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base = &ext->base;
+    base->opcode   = OP_LOUVAIN;
+    base->arity    = 0;
+    base->out_type = RAY_TABLE;
+    base->est_rows = (uint32_t)rel->fwd.n_nodes;
+
+    ext->graph.rel = rel;
+    if (max_iter > 0)
+        ext->graph.max_iter = max_iter;
+    else
+        ext->graph.max_iter = 100;
+    ext->graph.direction = 2;
+
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* Builds an OP_DEGREE_CENT node over `rel` (no operator inputs). The row
+ * estimate is rel->fwd.n_nodes cast to uint32_t.
+ * Returns NULL when `g` or `rel` is NULL or allocation fails. */
+ray_op_t* ray_degree_cent(ray_graph_t* g, ray_rel_t* rel) {
+    if (g == NULL || rel == NULL) return NULL;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base = &ext->base;
+    base->opcode   = OP_DEGREE_CENT;
+    base->arity    = 0;
+    base->out_type = RAY_TABLE;
+    base->est_rows = (uint32_t)rel->fwd.n_nodes;
+    ext->graph.rel = rel;
+
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* Builds an OP_TOPSORT node over `rel` (no operator inputs). The row
+ * estimate is rel->fwd.n_nodes cast to uint32_t.
+ * Returns NULL when `g` or `rel` is NULL or allocation fails. */
+ray_op_t* ray_topsort(ray_graph_t* g, ray_rel_t* rel) {
+    if (g == NULL || rel == NULL) return NULL;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base = &ext->base;
+    base->opcode   = OP_TOPSORT;
+    base->arity    = 0;
+    base->out_type = RAY_TABLE;
+    base->est_rows = (uint32_t)rel->fwd.n_nodes;
+    ext->graph.rel = rel;
+
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* Builds an OP_DFS node starting from `src` over `rel`.
+ *
+ * `max_depth` is stored in graph.max_depth and direction is set to 0. The
+ * row estimate is rel->fwd.n_nodes cast to uint32_t.
+ * Returns NULL when `g`, `src` or `rel` is NULL, or when allocation fails.
+ */
+ray_op_t* ray_dfs(ray_graph_t* g, ray_op_t* src, ray_rel_t* rel, uint8_t max_depth) {
+    if (!g || !src || !rel) return NULL;
+
+    /* Save the ID before alloc — growing the graph may move g->nodes and
+     * leave `src` dangling, so it must not be dereferenced afterwards. */
+    uint32_t src_id = src->id;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (!ext) return NULL;
+
+    src = &g->nodes[src_id];
+
+    ext->base.opcode     = OP_DFS;
+    ext->base.arity      = 1;
+    ext->base.inputs[0]  = src;
+    ext->base.out_type   = RAY_TABLE;
+    ext->base.est_rows   = (uint32_t)rel->fwd.n_nodes;
+    ext->graph.rel       = rel;
+    ext->graph.direction = 0;
+    ext->graph.max_depth = max_depth;
+
+    g->nodes[ext->base.id] = ext->base;
+    return &g->nodes[ext->base.id];
+}
+
+/* Builds an OP_WCO_JOIN node (no operator inputs).
+ *
+ * The `n_rels` relation pointers in `rels` are copied into the ext node's
+ * trailing bytes and exposed through wco.rels; `n_rels` and `n_vars` are
+ * stored verbatim. The row estimate is a fixed rough guess of 1000.
+ * Returns NULL if allocation fails.
+ */
+ray_op_t* ray_wco_join(ray_graph_t* g,
+                      ray_rel_t** rels, uint8_t n_rels,
+                      uint8_t n_vars) {
+    const size_t rels_bytes = (size_t)n_rels * sizeof(ray_rel_t*);
+
+    ray_op_ext_t* ext = graph_alloc_ext_node_ex(g, rels_bytes);
+    if (ext == NULL) return NULL;
+
+    ray_op_t* base = &ext->base;
+    base->opcode   = OP_WCO_JOIN;
+    base->arity    = 0;
+    base->out_type = RAY_TABLE;
+    base->est_rows = 1000;  /* rough estimate */
+
+    /* Copy rels array into trailing bytes (skipped when there is nothing to copy) */
+    ray_rel_t** stored_rels = (ray_rel_t**)EXT_TRAIL(ext);
+    if (n_rels != 0)
+        memcpy(stored_rels, rels, rels_bytes);
+
+    ext->wco.rels   = (void**)stored_rels;
+    ext->wco.n_rels = n_rels;
+    ext->wco.n_vars = n_vars;
+
+    ray_op_t* slot = &g->nodes[base->id];
+    *slot = *base;
+    return slot;
+}
+
+/* --------------------------------------------------------------------------
+ * Vector similarity builders
+ * -------------------------------------------------------------------------- */
+
+/* Builds an OP_COSINE_SIM node over the embedding column `emb_col`.
+ *
+ * query_vec - query vector; the pointer is stored as-is (no copy is made
+ *             here), so it presumably has to outlive the node — confirm
+ *             against the executor.
+ * dim       - vector dimension; must be > 0.
+ *
+ * Output type is RAY_F64 and the row estimate equals the input's.
+ * Returns NULL on a NULL argument, dim <= 0, or allocation failure.
+ */
+ray_op_t* ray_cosine_sim(ray_graph_t* g, ray_op_t* emb_col,
+                        const float* query_vec, int32_t dim) {
+    if (!g || !emb_col || !query_vec || dim <= 0) return NULL;
+
+    /* Save the ID before alloc — growing the graph may move g->nodes and
+     * leave `emb_col` dangling, so it must not be dereferenced afterwards. */
+    uint32_t emb_id = emb_col->id;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (!ext) return NULL;
+
+    emb_col = &g->nodes[emb_id];
+
+    ext->base.opcode    = OP_COSINE_SIM;
+    ext->base.arity     = 1;
+    ext->base.inputs[0] = emb_col;
+    ext->base.out_type  = RAY_F64;
+    ext->base.est_rows  = emb_col->est_rows;
+    ext->vector.query_vec = (float*)query_vec;
+    ext->vector.dim       = dim;
+
+    g->nodes[ext->base.id] = ext->base;
+    return &g->nodes[ext->base.id];
+}
+
+/* Builds an OP_EUCLIDEAN_DIST node over the embedding column `emb_col`.
+ *
+ * query_vec - query vector; the pointer is stored as-is (no copy is made
+ *             here), so it presumably has to outlive the node — confirm
+ *             against the executor.
+ * dim       - vector dimension; must be > 0.
+ *
+ * Output type is RAY_F64 and the row estimate equals the input's.
+ * Returns NULL on a NULL argument, dim <= 0, or allocation failure.
+ */
+ray_op_t* ray_euclidean_dist(ray_graph_t* g, ray_op_t* emb_col,
+                            const float* query_vec, int32_t dim) {
+    if (!g || !emb_col || !query_vec || dim <= 0) return NULL;
+
+    /* Save the ID before alloc — growing the graph may move g->nodes and
+     * leave `emb_col` dangling, so it must not be dereferenced afterwards. */
+    uint32_t emb_id = emb_col->id;
+
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (!ext) return NULL;
+
+    emb_col = &g->nodes[emb_id];
+
+    ext->base.opcode    = OP_EUCLIDEAN_DIST;
+    ext->base.arity     = 1;
+    ext->base.inputs[0] = emb_col;
+    ext->base.out_type  = RAY_F64;
+    ext->base.est_rows  = emb_col->est_rows;
+    ext->vector.query_vec = (float*)query_vec;
+    ext->vector.dim       = dim;
+
+    g->nodes[ext->base.id] = ext->base;
+    return &g->nodes[ext->base.id];
+}
+
+ray_op_t* ray_knn(ray_graph_t* g, ray_op_t* emb_col,
+                 const float* query_vec, int32_t dim, int64_t k,
+                 ray_hnsw_metric_t metric) {
+    /* Append an OP_KNN node over emb_col (table output, est. k rows); NULL on bad args or failed alloc. */
+    if (!g || !emb_col || !query_vec || dim <= 0 || k <= 0) return NULL;
+    /* Save the ID before alloc — realloc may invalidate emb_col. */
+    uint32_t emb_id = emb_col->id;
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (!ext) return NULL;
+    emb_col = &g->nodes[emb_id];
+
+    ext->base.opcode    = OP_KNN;
+    ext->base.arity     = 1;
+    ext->base.inputs[0] = emb_col;
+    ext->base.out_type  = RAY_TABLE;
+    ext->base.est_rows  = (uint32_t)k;
+    ext->vector.query_vec = (float*)query_vec;  /* pointer is stored, not copied */
+    ext->vector.dim       = dim;
+    ext->vector.k         = k;
+    ext->vector.metric    = (int32_t)metric;
+    g->nodes[ext->base.id] = ext->base;  /* publish the header into the node table */
+    return &g->nodes[ext->base.id];
+}
+/* Append a nullary OP_CLUSTER_COEFF node over rel; NULL on bad args or failed alloc. */
+ray_op_t* ray_cluster_coeff(ray_graph_t* g, ray_rel_t* rel) {
+    ray_op_ext_t* node = (g && rel) ? graph_alloc_ext_node(g) : NULL;
+    if (!node) return NULL;
+    node->base.out_type   = RAY_TABLE;
+    node->base.arity      = 0;
+    node->base.opcode     = OP_CLUSTER_COEFF;
+    node->base.est_rows   = (uint32_t)rel->fwd.n_nodes;  /* estimate taken from rel->fwd.n_nodes */
+    node->graph.direction = 2;
+    node->graph.rel       = rel;
+    ray_op_t* slot = &g->nodes[node->base.id];
+    *slot = node->base;  /* copy the finished header into the node table */
+    return slot;
+}
+/* Append an OP_RANDOM_WALK node starting from src over rel; NULL on bad args or failed alloc. */
+ray_op_t* ray_random_walk(ray_graph_t* g, ray_op_t* src, ray_rel_t* rel,
+                        uint16_t walk_length) {
+    if (!g || !src || !rel) return NULL;
+    uint32_t src_id = src->id;  /* read before alloc — realloc may invalidate src */
+    ray_op_ext_t* ext = graph_alloc_ext_node(g);
+    if (!ext) return NULL;
+    src = &g->nodes[src_id];
+    ext->base.opcode    = OP_RANDOM_WALK;
+    ext->base.arity     = 1;
+    ext->base.inputs[0] = src;
+    ext->base.out_type  = RAY_TABLE;
+    ext->base.est_rows  = walk_length + 1;
+    ext->graph.rel      = rel;
+    ext->graph.max_iter = walk_length;
+    ext->graph.direction = 0;
+    g->nodes[ext->base.id] = ext->base;  /* publish the header into the node table */
+    return &g->nodes[ext->base.id];
+}
+
+ray_op_t* ray_astar(ray_graph_t* g, ray_op_t* src, ray_op_t* dst,
+                  ray_rel_t* rel, const char* weight_col,
+                  const char* lat_col, const char* lon_col,
+                  ray_t* node_props, uint8_t max_depth) {
+    /* Append a binary OP_ASTAR node (src, dst) over rel; every pointer argument is required. */
+    if (!(g && src && dst && rel && weight_col && lat_col && lon_col && node_props))
+        return NULL;
+
+    /* Capture operand IDs first: allocating the node may relocate g->nodes. */
+    const uint32_t from_id = src->id;
+    const uint32_t to_id   = dst->id;
+    ray_op_ext_t* node = graph_alloc_ext_node(g);
+    if (!node) return NULL;
+    src = &g->nodes[from_id];
+    dst = &g->nodes[to_id];
+
+    node->base.opcode    = OP_ASTAR;
+    node->base.arity     = 2;
+    node->base.inputs[0] = src;
+    node->base.inputs[1] = dst;
+    node->base.out_type  = RAY_TABLE;
+    node->base.est_rows  = (uint32_t)rel->fwd.n_nodes;
+    node->graph.rel       = rel;
+    node->graph.direction = 0;
+    node->graph.max_depth = max_depth;
+    /* Column names are interned to symbols: weight first, then lat, then lon. */
+    node->graph.weight_col_sym    = ray_sym_intern(weight_col, (int64_t)strlen(weight_col));
+    node->graph.coord_col_syms[0] = ray_sym_intern(lat_col, (int64_t)strlen(lat_col));
+    node->graph.coord_col_syms[1] = ray_sym_intern(lon_col, (int64_t)strlen(lon_col));
+    node->graph.node_props = node_props;
+    ray_retain(node_props);  /* retained on behalf of the node that now points at it */
+    ray_op_t* slot = &g->nodes[node->base.id];
+    *slot = node->base;
+    return slot;
+}
+
+ray_op_t* ray_k_shortest(ray_graph_t* g, ray_op_t* src, ray_op_t* dst,
+                       ray_rel_t* rel, const char* weight_col, uint16_t k) {
+    /* Append a binary OP_K_SHORTEST node (src, dst) over rel; k is stored in graph.max_iter. */
+    if (!(g && src && dst && rel && weight_col) || k == 0) return NULL;
+
+    /* Capture operand IDs first: allocating the node may relocate g->nodes. */
+    const uint32_t from_id = src->id;
+    const uint32_t to_id   = dst->id;
+    ray_op_ext_t* node = graph_alloc_ext_node(g);
+    if (!node) return NULL;
+    src = &g->nodes[from_id];
+    dst = &g->nodes[to_id];
+
+    node->base.opcode    = OP_K_SHORTEST;
+    node->base.arity     = 2;
+    node->base.inputs[0] = src;
+    node->base.inputs[1] = dst;
+    node->base.out_type  = RAY_TABLE;
+    node->base.est_rows  = (uint32_t)(k * rel->fwd.n_nodes);
+    node->graph.rel       = rel;
+    node->graph.direction = 0;
+    node->graph.max_iter  = k;
+    node->graph.weight_col_sym = ray_sym_intern(weight_col, (int64_t)strlen(weight_col));
+
+    ray_op_t* slot = &g->nodes[node->base.id];
+    *slot = node->base;
+    return slot;
+}
+/* Append a nullary OP_BETWEENNESS node over rel; sample_size 0 requests the exact computation. */
+ray_op_t* ray_betweenness(ray_graph_t* g, ray_rel_t* rel, uint16_t sample_size) {
+    ray_op_ext_t* node = (g && rel) ? graph_alloc_ext_node(g) : NULL;
+    if (!node) return NULL;
+    node->base.out_type  = RAY_TABLE;
+    node->base.arity     = 0;
+    node->base.opcode    = OP_BETWEENNESS;
+    node->base.est_rows  = (uint32_t)rel->fwd.n_nodes;
+    node->graph.rel       = rel;
+    node->graph.max_iter  = sample_size;  /* 0 = exact */
+    node->graph.direction = 2;            /* undirected BFS */
+    ray_op_t* slot = &g->nodes[node->base.id];
+    *slot = node->base;  /* copy the finished header into the node table */
+    return slot;
+}
+/* Append a nullary OP_CLOSENESS node over rel; sample_size 0 requests the exact computation. */
+ray_op_t* ray_closeness(ray_graph_t* g, ray_rel_t* rel, uint16_t sample_size) {
+    ray_op_ext_t* node = (g && rel) ? graph_alloc_ext_node(g) : NULL;
+    if (!node) return NULL;
+    node->base.out_type  = RAY_TABLE;
+    node->base.arity     = 0;
+    node->base.opcode    = OP_CLOSENESS;
+    node->base.est_rows  = (uint32_t)rel->fwd.n_nodes;
+    node->graph.rel       = rel;
+    node->graph.max_iter  = sample_size;  /* 0 = exact */
+    node->graph.direction = 2;            /* undirected BFS */
+    ray_op_t* slot = &g->nodes[node->base.id];
+    *slot = node->base;  /* copy the finished header into the node table */
+    return slot;
+}
+/* Append a nullary OP_MST node over rel, weighted by the column named weight_col. */
+ray_op_t* ray_mst(ray_graph_t* g, ray_rel_t* rel, const char* weight_col) {
+    ray_op_ext_t* node = (g && rel && weight_col) ? graph_alloc_ext_node(g) : NULL;
+    if (!node) return NULL;
+    node->base.out_type = RAY_TABLE;
+    node->base.arity    = 0;
+    node->base.opcode   = OP_MST;
+    node->base.est_rows = (uint32_t)(rel->fwd.n_nodes > 0 ? rel->fwd.n_nodes - 1 : 0);  /* n_nodes - 1, floored at 0 */
+    node->graph.rel       = rel;
+    node->graph.direction = 2;
+    node->graph.weight_col_sym = ray_sym_intern(weight_col, (int64_t)strlen(weight_col));
+    ray_op_t* slot = &g->nodes[node->base.id];
+    *slot = node->base;  /* copy the finished header into the node table */
+    return slot;
+}
+
+ray_op_t* ray_hnsw_knn(ray_graph_t* g, ray_hnsw_t* idx,
+                       const float* query_vec, int32_t dim,
+                       int64_t k, int32_t ef_search) {
+    /* Append a nullary OP_HNSW_KNN node bound to idx (table output, est. k rows). */
+    if (!(g && idx && query_vec) || dim <= 0 || k <= 0) return NULL;
+    ray_op_ext_t* node = graph_alloc_ext_node(g);
+    if (!node) return NULL;
+
+    node->base.opcode    = OP_HNSW_KNN;
+    node->base.arity     = 0;  /* nullary: reads from index directly */
+    node->base.out_type  = RAY_TABLE;
+    node->base.est_rows  = (uint32_t)k;
+    node->hnsw.hnsw_idx  = idx;
+    node->hnsw.query_vec = (float*)query_vec;  /* pointer is stored, not copied */
+    node->hnsw.dim       = dim;
+    node->hnsw.k         = k;
+    node->hnsw.ef_search = (ef_search <= 0) ? HNSW_DEFAULT_EF_S : ef_search;  /* non-positive → default */
+    ray_op_t* slot = &g->nodes[node->base.id];
+    *slot = node->base;
+    return slot;
+}
+
+ray_op_t* ray_ann_rerank(ray_graph_t* g, ray_op_t* src,
+                         ray_hnsw_t* idx, const float* query_vec,
+                         int32_t dim, int64_t k, int32_t ef_search) {
+    /* Append a unary OP_ANN_RERANK node over src bound to idx; the metric is copied from idx. */
+    if (!(g && src && idx && query_vec) || dim <= 0 || k <= 0) return NULL;
+    const uint32_t input_id = src->id;  /* captured before alloc, which may relocate g->nodes */
+    ray_op_ext_t* node = graph_alloc_ext_node(g);
+    if (!node) return NULL;
+    src = &g->nodes[input_id];
+
+    node->base.opcode     = OP_ANN_RERANK;
+    node->base.arity      = 1;
+    node->base.inputs[0]  = src;
+    node->base.out_type   = RAY_TABLE;
+    node->base.est_rows   = (uint32_t)k;
+    node->rerank.hnsw_idx  = idx;
+    node->rerank.col_sym   = 0;
+    node->rerank.query_vec = (float*)query_vec;  /* pointer is stored, not copied */
+    node->rerank.dim       = dim;
+    node->rerank.metric    = idx->metric;  /* idx was checked non-NULL above */
+    node->rerank.k         = k;
+    node->rerank.ef_search = (ef_search <= 0) ? HNSW_DEFAULT_EF_S : ef_search;  /* non-positive → default */
+    ray_op_t* slot = &g->nodes[node->base.id];
+    *slot = node->base;
+    return slot;
+}
+
+ray_op_t* ray_knn_rerank(ray_graph_t* g, ray_op_t* src,
+                         int64_t col_sym, const float* query_vec,
+                         int32_t dim, int64_t k, ray_hnsw_metric_t metric) {
+    /* Append a unary OP_KNN_RERANK node over src keyed by col_sym; no HNSW index is attached. */
+    if (!(g && src && query_vec) || dim <= 0 || k <= 0 || col_sym <= 0) return NULL;
+    const uint32_t input_id = src->id;  /* captured before alloc, which may relocate g->nodes */
+    ray_op_ext_t* node = graph_alloc_ext_node(g);
+    if (!node) return NULL;
+    src = &g->nodes[input_id];
+
+    node->base.opcode     = OP_KNN_RERANK;
+    node->base.arity      = 1;
+    node->base.inputs[0]  = src;
+    node->base.out_type   = RAY_TABLE;
+    node->base.est_rows   = (uint32_t)k;
+    node->rerank.hnsw_idx  = NULL;
+    node->rerank.col_sym   = col_sym;
+    node->rerank.query_vec = (float*)query_vec;  /* pointer is stored, not copied */
+    node->rerank.dim       = dim;
+    node->rerank.metric    = (int32_t)metric;
+    node->rerank.k         = k;
+    node->rerank.ef_search = 0;
+    ray_op_t* slot = &g->nodes[node->base.id];
+    *slot = node->base;
+    return slot;
+}
+
+/* --------------------------------------------------------------------------
+ * Lazy DAG handles
+ * -------------------------------------------------------------------------- */
+
+ray_op_t* ray_graph_input_vec(ray_graph_t* g, ray_t* vec) {
+    return ray_const_vec(g, vec);  /* thin alias: forwards both arguments unchanged to ray_const_vec */
+}
+
+ray_t* ray_lazy_wrap(ray_graph_t* g, ray_op_t* op) {
+    ray_t* h = ray_alloc(0);
+    if (!h) { ray_graph_free(g); return ray_error("oom", NULL); }
+    h->type  = RAY_LAZY;
+    h->attrs = 0;
+    RAY_LAZY_GRAPH(h) = g;
+    RAY_LAZY_OP(h)    = op;
+    return h;
+}
+
+ray_t* ray_lazy_append(ray_t* lazy, uint16_t opcode) {
+    /* Extend the handle's pending DAG with one unary node for `opcode`. */
+    ray_op_t*    tail  = RAY_LAZY_OP(lazy);
+    ray_graph_t* graph = RAY_LAZY_GRAPH(lazy);
+
+    /* Result type: counts are I64, moments are F64, sum/prod keep F64 and
+     * turn everything else into I64; any other opcode passes the type through. */
+    int8_t result_type = tail->out_type;
+    switch (opcode) {
+        case OP_COUNT: case OP_COUNT_DISTINCT:
+            result_type = RAY_I64;
+            break;
+        case OP_AVG: case OP_STDDEV: case OP_STDDEV_POP: case OP_VAR: case OP_VAR_POP:
+            result_type = RAY_F64;
+            break;
+        case OP_SUM: case OP_PROD:
+            if (tail->out_type != RAY_F64) result_type = RAY_I64;
+            break;
+        default:
+            break;
+    }
+
+    ray_op_t* appended = make_unary(graph, opcode, tail, result_type);
+    if (!appended) return ray_error("oom", NULL);
+    /* Success: the handle now points at the new tail node. */
+    RAY_LAZY_OP(lazy) = appended;
+    return lazy;
+}
+
+ray_t* ray_lazy_materialize(ray_t* val) {
+    if (!ray_is_lazy(val)) return val;  /* not a lazy handle: hand it back untouched */
+
+    /* Execute the pending DAG up to the handle's current op. */
+    ray_graph_t* graph = RAY_LAZY_GRAPH(val);
+    ray_t* out = ray_execute(graph, RAY_LAZY_OP(val));
+
+    ray_graph_free(graph);
+    /* The graph is gone: null the handle's graph slot first so that releasing
+     * the handle cannot free it a second time (see ray_release_owned_refs). */
+    RAY_LAZY_GRAPH(val) = NULL;
+    ray_release(val);
+    return out;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/graph.h b/crates/rayforce-sys/vendor/rayforce/src/ops/graph.h
new file mode 100644
index 0000000..b32a17a
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/graph.h
@@ -0,0 +1,29 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_GRAPH_H
+#define RAY_GRAPH_H
+
+#include "ops.h"
+
+#endif /* RAY_GRAPH_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/group.c b/crates/rayforce-sys/vendor/rayforce/src/ops/group.c
new file mode 100644
index 0000000..c26fffe
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/group.c
@@ -0,0 +1,4392 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "ops/internal.h"
+#include "ops/rowsel.h"
+
+/* ============================================================================
+ * Reduction execution
+ * ============================================================================ */
+/* Running state for one reduction range; float and integer inputs use separate field sets. */
+typedef struct {
+    double sum_f, min_f, max_f, prod_f, first_f, last_f, sum_sq_f;   /* used when the input is F64 */
+    int64_t sum_i, min_i, max_i, prod_i, first_i, last_i, sum_sq_i;  /* used for all integer-like inputs */
+    int64_t cnt;         /* non-null rows folded in */
+    int64_t null_count;  /* rows skipped because the null bitmap flagged them */
+    bool has_first;      /* set once the first non-null row has been recorded */
+} reduce_acc_t;
+
+static void reduce_acc_init(reduce_acc_t* acc) {
+    /* Identity state: everything zero/false except products (1) and the min/max sentinels. */
+    *acc = (reduce_acc_t){
+        .min_f = DBL_MAX,   .max_f = -DBL_MAX,  .prod_f = 1.0,
+        .min_i = INT64_MAX, .max_i = INT64_MIN, .prod_i = 1,
+    };
+}
+
+/* Integer reduction loop — folds rows [start, end) read as native type T into *acc, widening each value to i64 */
+#define REDUCE_LOOP_I(T, base, start, end, acc, has_nulls, null_bm) \
+    do { \
+        const T* d = (const T*)(base); \
+        for (int64_t row = start; row < end; row++) { \
+            if (has_nulls && (null_bm[row/8] >> (row%8)) & 1) { (acc)->null_count++; continue; } /* set bit = null row */ \
+            int64_t v = (int64_t)d[row]; \
+            /* sum/sum_sq/prod may overflow on signed arithmetic — use defined \
+             * unsigned wrap (same semantic, no UBSan whine). */ \
+            (acc)->sum_i    = (int64_t)((uint64_t)(acc)->sum_i    + (uint64_t)v); \
+            (acc)->sum_sq_i = (int64_t)((uint64_t)(acc)->sum_sq_i + (uint64_t)v * (uint64_t)v); \
+            (acc)->prod_i   = (int64_t)((uint64_t)(acc)->prod_i   * (uint64_t)v); \
+            if (v < (acc)->min_i) (acc)->min_i = v; \
+            if (v > (acc)->max_i) (acc)->max_i = v; \
+            if (!(acc)->has_first) { (acc)->first_i = v; (acc)->has_first = true; } /* latch the first non-null value only */ \
+            (acc)->last_i = v; (acc)->cnt++; \
+        } \
+    } while (0)
+
+/* Float reduction loop — folds rows [start, end) of a double column into the *_f fields of *acc, skipping null rows */
+#define REDUCE_LOOP_F(base, start, end, acc, has_nulls, null_bm) \
+    do { \
+        const double* d = (const double*)(base); \
+        for (int64_t row = start; row < end; row++) { \
+            if (has_nulls && (null_bm[row/8] >> (row%8)) & 1) { (acc)->null_count++; continue; } /* set bit = null row */ \
+            double v = d[row]; \
+            (acc)->sum_f += v; (acc)->sum_sq_f += v * v; (acc)->prod_f *= v; \
+            if (v < (acc)->min_f) (acc)->min_f = v; /* a NaN compares false, so it never becomes min or max */ \
+            if (v > (acc)->max_f) (acc)->max_f = v; \
+            if (!(acc)->has_first) { (acc)->first_f = v; (acc)->has_first = true; } /* latch the first non-null value only */ \
+            (acc)->last_f = v; (acc)->cnt++; \
+        } \
+    } while (0)
+/* Fold rows [start, end) of `input` into *acc, dispatching on the column's element type. */
+static void reduce_range(ray_t* input, int64_t start, int64_t end,
+                         reduce_acc_t* acc, bool has_nulls,
+                         const uint8_t* null_bm) {
+    void* base = ray_data(input);
+    switch (input->type) {
+    case RAY_BOOL: case RAY_U8:
+        REDUCE_LOOP_I(uint8_t, base, start, end, acc, has_nulls, null_bm); break;
+    case RAY_I16:
+        REDUCE_LOOP_I(int16_t, base, start, end, acc, has_nulls, null_bm); break;
+    case RAY_I32: case RAY_DATE: case RAY_TIME:
+        REDUCE_LOOP_I(int32_t, base, start, end, acc, has_nulls, null_bm); break;
+    case RAY_I64: case RAY_TIMESTAMP:
+        REDUCE_LOOP_I(int64_t, base, start, end, acc, has_nulls, null_bm); break;
+    case RAY_F64:
+        REDUCE_LOOP_F(base, start, end, acc, has_nulls, null_bm); break;
+    case RAY_SYM: {  /* Adaptive-width SYM columns — use read_col_i64 */
+        for (int64_t row = start; row < end; row++) {
+            if (has_nulls && (null_bm[row/8] >> (row%8)) & 1) { acc->null_count++; continue; }
+            int64_t v = read_col_i64(base, row, input->type, input->attrs);
+            acc->sum_i    = (int64_t)((uint64_t)acc->sum_i    + (uint64_t)v);  /* defined unsigned wrap, as in REDUCE_LOOP_I */
+            acc->sum_sq_i = (int64_t)((uint64_t)acc->sum_sq_i + (uint64_t)v * (uint64_t)v);
+            acc->prod_i   = (int64_t)((uint64_t)acc->prod_i   * (uint64_t)v);
+            if (v < acc->min_i) acc->min_i = v;
+            if (v > acc->max_i) acc->max_i = v;
+            if (!acc->has_first) { acc->first_i = v; acc->has_first = true; }
+            acc->last_i = v; acc->cnt++;
+        }
+        break;
+    }
+    default: break;  /* any other element type leaves *acc untouched */
+    }
+}
+
+/* Context for parallel reduction — one instance is passed to every par_reduce_fn invocation */
+typedef struct {
+    ray_t*         input;      /* column being reduced */
+    reduce_acc_t*  accs;   /* one per worker, indexed by worker_id */
+    bool           has_nulls;  /* whether null_bm must be consulted */
+    const uint8_t* null_bm;    /* null bitmap bytes; a set bit marks a null row */
+} par_reduce_ctx_t;
+/* Pool callback: folds rows [start, end) into the accumulator owned by worker_id. */
+static void par_reduce_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) {
+    const par_reduce_ctx_t* job = ctx;           /* shared job description */
+    reduce_acc_t* mine = job->accs + worker_id;  /* this worker's own accumulator */
+    reduce_range(job->input, start, end, mine, job->has_nulls, job->null_bm);
+}
+
+static void reduce_merge(reduce_acc_t* dst, const reduce_acc_t* src, int8_t in_type) {
+    /* Row counters are type-independent.  first/last are deliberately not
+     * merged here: the caller takes first from the earliest worker that saw
+     * data and last from the latest one. */
+    dst->null_count += src->null_count;
+    dst->cnt        += src->cnt;
+    if (in_type != RAY_F64) {
+        /* Integer path: defined unsigned wrap — matches REDUCE_LOOP_I's per-row path. */
+        dst->sum_i    = (int64_t)((uint64_t)dst->sum_i    + (uint64_t)src->sum_i);
+        dst->sum_sq_i = (int64_t)((uint64_t)dst->sum_sq_i + (uint64_t)src->sum_sq_i);
+        dst->prod_i   = (int64_t)((uint64_t)dst->prod_i   * (uint64_t)src->prod_i);
+        dst->min_i = (src->min_i < dst->min_i) ? src->min_i : dst->min_i;
+        dst->max_i = (src->max_i > dst->max_i) ? src->max_i : dst->max_i;
+    } else {
+        dst->sum_f    += src->sum_f;
+        dst->sum_sq_f += src->sum_sq_f;
+        dst->prod_f   *= src->prod_f;
+        dst->min_f = (src->min_f < dst->min_f) ? src->min_f : dst->min_f;
+        dst->max_f = (src->max_f > dst->max_f) ? src->max_f : dst->max_f;
+    }
+}
+
+/* Hash-based count distinct for integer/float/SYM columns; returns the number of distinct values as an i64 */
+ray_t* exec_count_distinct(ray_graph_t* g, ray_op_t* op, ray_t* input) {
+    (void)g; (void)op;
+    if (!input || RAY_IS_ERR(input)) return input;
+
+    int8_t in_type = input->type;
+    int64_t len = input->len;
+
+    if (len == 0) return ray_i64(0);
+
+    /* Only numeric/ordinal/sym column types are supported */
+    switch (in_type) {
+    case RAY_BOOL: case RAY_U8:
+    case RAY_I16: case RAY_I32: case RAY_I64:
+    case RAY_F64: case RAY_DATE: case RAY_TIME: case RAY_TIMESTAMP:
+    case RAY_SYM:
+        break;
+    default:
+        return ray_error("type", NULL);
+    }
+
+    /* Open-addressing hash set of int64 keys, sized to at least 2x the row count (load <= 50%) */
+    uint64_t cap = (uint64_t)(len < 16 ? 32 : len) * 2;
+    /* Round up to power of 2 */
+    uint64_t c = 1;
+    while (c && c < cap) c <<= 1;
+    if (!c) return ray_error("oom", NULL); /* overflow: cap too large */
+    cap = c;
+
+    ray_t* set_hdr = NULL;   /* NULL-initialised so the OOM cleanup below never reads garbage */
+    int64_t* set = (int64_t*)scratch_calloc(&set_hdr,
+                                             (size_t)cap * sizeof(int64_t));
+    ray_t* used_hdr = NULL;
+    uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr,
+                                              (size_t)cap * sizeof(uint8_t));
+    if (!set || !used) {
+        if (set_hdr) scratch_free(set_hdr);
+        if (used_hdr) scratch_free(used_hdr);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t count = 0;
+    uint64_t mask = cap - 1;
+    void* base = ray_data(input);
+    /* NOTE(review): the null bitmap is not consulted; null rows are hashed by their stored payload — confirm intended. */
+    for (int64_t i = 0; i < len; i++) {
+        int64_t val;
+        if (in_type == RAY_F64) {
+            double fv = ((double*)base)[i];
+            /* Normalize: NaN → canonical NaN, -0.0 → +0.0 */
+            if (fv != fv) fv = (double)NAN;        /* canonical NaN */
+            else if (fv == 0.0) fv = 0.0;          /* +0.0 */
+            memcpy(&val, &fv, sizeof(int64_t));    /* hash floats by bit pattern */
+        } else {
+            val = read_col_i64(base, i, in_type, input->attrs);
+        }
+
+        /* Open-addressing linear probe */
+        uint64_t h = (uint64_t)val * 0x9E3779B97F4A7C15ULL;
+        uint64_t slot = h & mask;
+        while (used[slot]) {
+            if (set[slot] == val) goto next_val;   /* already seen */
+            slot = (slot + 1) & mask;
+        }
+        /* New distinct value */
+        set[slot] = val;
+        used[slot] = 1;
+        count++;
+        next_val:;
+    }
+
+    scratch_free(set_hdr);
+    scratch_free(used_hdr);
+    return ray_i64(count);
+}
+/* Reduce `input` to one scalar according to op->opcode; a TABLE input supports only COUNT. */
+ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) {
+    (void)g;
+    if (!input || RAY_IS_ERR(input)) return input;
+
+    /* TABLE input: COUNT returns row count, others need a column */
+    if (input->type == RAY_TABLE) {
+        if (op->opcode == OP_COUNT)
+            return ray_i64(ray_table_nrows(input));
+        return ray_error("type", NULL);
+    }
+
+    int8_t in_type = input->type;
+    int64_t len = input->len;
+
+    /* Resolve null bitmap once before dispatching.  ray_vec_nullmap_bytes
+     * handles slice / ext / inline / HAS_INDEX uniformly so this works on
+     * vectors that carry an attached accelerator index. */
+    bool has_nulls = (input->attrs & RAY_ATTR_HAS_NULLS) != 0;
+    const uint8_t* null_bm = ray_vec_nullmap_bytes(input, NULL, NULL);
+
+    /* Short-circuit: first/last on numeric columns only need the nearest
+     * non-null row, not a full reduction pass.  Non-numeric types (STR, GUID)
+     * fall through to the general reduction paths below. */
+    if ((op->opcode == OP_FIRST || op->opcode == OP_LAST) &&
+        (in_type == RAY_I64 || in_type == RAY_F64 || in_type == RAY_I32 ||
+         in_type == RAY_I16 || in_type == RAY_BOOL || in_type == RAY_U8 ||
+         in_type == RAY_TIMESTAMP || in_type == RAY_DATE || in_type == RAY_TIME ||
+         in_type == RAY_SYM)) {
+        int64_t row;
+        if (op->opcode == OP_FIRST) {
+            for (row = 0; row < len; row++)
+                if (!has_nulls || !((null_bm[row/8] >> (row%8)) & 1)) break;
+        } else {
+            for (row = len - 1; row >= 0; row--)
+                if (!has_nulls || !((null_bm[row/8] >> (row%8)) & 1)) break;
+        }
+        if (row < 0 || row >= len)
+            return ray_typed_null(-in_type);  /* empty or all-null column */
+        void* base = ray_data(input);
+        if (in_type == RAY_F64) return ray_f64(((const double*)base)[row]);
+        return ray_i64(read_col_i64(base, row, in_type, input->attrs));
+    }
+
+    ray_pool_t* pool = ray_pool_get();
+    if (pool && len >= RAY_PARALLEL_THRESHOLD) {
+        uint32_t nw = ray_pool_total_workers(pool);
+        ray_t* accs_hdr;
+        reduce_acc_t* accs = (reduce_acc_t*)scratch_calloc(&accs_hdr, nw * sizeof(reduce_acc_t));
+        if (!accs) return ray_error("oom", NULL);
+        for (uint32_t i = 0; i < nw; i++) reduce_acc_init(&accs[i]);
+
+        par_reduce_ctx_t ctx = { .input = input, .accs = accs,
+                                 .has_nulls = has_nulls, .null_bm = null_bm };
+        ray_pool_dispatch(pool, par_reduce_fn, &ctx, len);
+
+        /* Merge: worker 0 is the base, merge the rest in order */
+        reduce_acc_t merged;
+        reduce_acc_init(&merged);
+        merged = accs[0];
+        for (uint32_t i = 1; i < nw; i++) {
+            if (!accs[i].has_first) continue;
+            reduce_merge(&merged, &accs[i], in_type);
+        }
+        /* first = accs[first worker with data], last = accs[last worker with data] */
+        for (uint32_t i = 0; i < nw; i++) {
+            if (accs[i].has_first) {
+                if (in_type == RAY_F64) merged.first_f = accs[i].first_f;
+                else merged.first_i = accs[i].first_i;
+                merged.has_first = true;  /* worker 0 may have seen no data; don't rely on its flag */
+                break;
+            }
+        }
+        for (int32_t i = (int32_t)nw - 1; i >= 0; i--) {
+            if (accs[i].has_first) {
+                if (in_type == RAY_F64) merged.last_f = accs[i].last_f;
+                else merged.last_i = accs[i].last_i;
+                break;
+            }
+        }
+        ray_t* result;
+        switch (op->opcode) {
+            case OP_SUM:   result = in_type == RAY_F64 ? ray_f64(merged.sum_f) : ray_i64(merged.sum_i); break;
+            case OP_PROD:  result = in_type == RAY_F64 ? ray_f64(merged.prod_f) : ray_i64(merged.prod_i); break;
+            case OP_MIN:   result = merged.cnt > 0 ? (in_type == RAY_F64 ? ray_f64(merged.min_f) : ray_i64(merged.min_i)) : ray_typed_null(-in_type); break;
+            case OP_MAX:   result = merged.cnt > 0 ? (in_type == RAY_F64 ? ray_f64(merged.max_f) : ray_i64(merged.max_i)) : ray_typed_null(-in_type); break;
+            case OP_COUNT: result = ray_i64(merged.cnt); break;
+            case OP_AVG:   result = merged.cnt > 0 ? ray_f64(in_type == RAY_F64 ? merged.sum_f / merged.cnt : (double)merged.sum_i / merged.cnt) : ray_typed_null(-RAY_F64); break;
+            case OP_FIRST: result = merged.has_first ? (in_type == RAY_F64 ? ray_f64(merged.first_f) : ray_i64(merged.first_i)) : ray_typed_null(-in_type); break;
+            case OP_LAST:  result = merged.has_first ? (in_type == RAY_F64 ? ray_f64(merged.last_f) : ray_i64(merged.last_i)) : ray_typed_null(-in_type); break;
+            case OP_VAR: case OP_VAR_POP:
+            case OP_STDDEV: case OP_STDDEV_POP: {
+                bool insufficient = (op->opcode == OP_VAR || op->opcode == OP_STDDEV) ? merged.cnt <= 1 : merged.cnt <= 0;
+                if (insufficient) { result = ray_typed_null(-RAY_F64); break; }
+                double mean, var_pop;
+                if (in_type == RAY_F64) { mean = merged.sum_f / merged.cnt; var_pop = merged.sum_sq_f / merged.cnt - mean * mean; }
+                else { mean = (double)merged.sum_i / merged.cnt; var_pop = (double)merged.sum_sq_i / merged.cnt - mean * mean; }
+                if (var_pop < 0) var_pop = 0;  /* clamp rounding noise */
+                double val;
+                if (op->opcode == OP_VAR_POP) val = var_pop;
+                else if (op->opcode == OP_VAR) val = var_pop * merged.cnt / (merged.cnt - 1);
+                else if (op->opcode == OP_STDDEV_POP) val = sqrt(var_pop);
+                else val = sqrt(var_pop * merged.cnt / (merged.cnt - 1));
+                result = ray_f64(val);
+                break;
+            }
+            default:       result = ray_error("nyi", NULL); break;
+        }
+        scratch_free(accs_hdr);
+        return result;
+    }
+
+    reduce_acc_t acc;
+    reduce_acc_init(&acc);
+    reduce_range(input, 0, len, &acc, has_nulls, null_bm);
+
+    switch (op->opcode) {
+        case OP_SUM:   return in_type == RAY_F64 ? ray_f64(acc.sum_f) : ray_i64(acc.sum_i);
+        case OP_PROD:  return in_type == RAY_F64 ? ray_f64(acc.prod_f) : ray_i64(acc.prod_i);
+        case OP_MIN:   return acc.cnt > 0 ? (in_type == RAY_F64 ? ray_f64(acc.min_f) : ray_i64(acc.min_i)) : ray_typed_null(-in_type);
+        case OP_MAX:   return acc.cnt > 0 ? (in_type == RAY_F64 ? ray_f64(acc.max_f) : ray_i64(acc.max_i)) : ray_typed_null(-in_type);
+        case OP_COUNT: return ray_i64(acc.cnt);
+        case OP_AVG:   return acc.cnt > 0 ? ray_f64(in_type == RAY_F64 ? acc.sum_f / acc.cnt : (double)acc.sum_i / acc.cnt) : ray_typed_null(-RAY_F64);
+        case OP_FIRST: return acc.has_first ? (in_type == RAY_F64 ? ray_f64(acc.first_f) : ray_i64(acc.first_i)) : ray_typed_null(-in_type);
+        case OP_LAST:  return acc.has_first ? (in_type == RAY_F64 ? ray_f64(acc.last_f) : ray_i64(acc.last_i)) : ray_typed_null(-in_type);
+        case OP_VAR: case OP_VAR_POP:
+        case OP_STDDEV: case OP_STDDEV_POP: {
+            bool insufficient = (op->opcode == OP_VAR || op->opcode == OP_STDDEV) ? acc.cnt <= 1 : acc.cnt <= 0;
+            if (insufficient) return ray_typed_null(-RAY_F64);
+            double mean, var_pop;
+            if (in_type == RAY_F64) { mean = acc.sum_f / acc.cnt; var_pop = acc.sum_sq_f / acc.cnt - mean * mean; }
+            else { mean = (double)acc.sum_i / acc.cnt; var_pop = (double)acc.sum_sq_i / acc.cnt - mean * mean; }
+            if (var_pop < 0) var_pop = 0;  /* clamp rounding noise */
+            double val;
+            if (op->opcode == OP_VAR_POP) val = var_pop;
+            else if (op->opcode == OP_VAR) val = var_pop * acc.cnt / (acc.cnt - 1);
+            else if (op->opcode == OP_STDDEV_POP) val = sqrt(var_pop);
+            else val = sqrt(var_pop * acc.cnt / (acc.cnt - 1));
+            return ray_f64(val);
+        }
+        default:       return ray_error("nyi", NULL);
+    }
+}
+
+/* ============================================================================
+ * Group-by execution — with parallel local hash tables + merge
+ * ============================================================================ */
+
+
+/* Flags controlling which accumulator arrays are allocated */
+/* GHT_NEED_* defined in exec_internal.h */
+
+/* ── Row-layout HT ──────────────────────────────────────────────────────
+ * Keys + accumulators stored inline in both radix entries and group rows.
+ * After phase1 copies data from original columns, phase2 and phase3 never
+ * touch column data again — all access is sequential/local.
+ * ────────────────────────────────────────────────────────────────────── */
+
+/* ght_layout_t defined in exec_internal.h */
+/* Compute byte offsets/strides of the row-layout group HT for the given keys, aggregates and need_flags. */
+ght_layout_t ght_compute_layout(uint8_t n_keys, uint8_t n_aggs,
+                                        ray_t** agg_vecs, uint8_t need_flags,
+                                        const uint16_t* agg_ops,
+                                        const int8_t* key_types) {
+    ght_layout_t ly;
+    memset(&ly, 0, sizeof(ly));  /* offsets/masks not set below stay 0 */
+    ly.n_keys = n_keys;
+    ly.n_aggs = n_aggs;
+    ly.need_flags = need_flags;
+    /* key_types and agg_ops are optional (NULL-checked); agg_vecs is indexed unconditionally. */
+    /* Mark wide keys (those that don't fit in 8 bytes).  For each
+     * wide key, the fat-entry and HT-row key slot stores a source
+     * row index; probe/rehash/scatter resolve the actual bytes via
+     * group_ht_t.key_data[k].  Currently only RAY_GUID is supported. */
+    if (key_types) {
+        for (uint8_t k = 0; k < n_keys && k < 8; k++) {  /* at most 8 keys are examined */
+            if (key_types[k] == RAY_GUID) {
+                ly.wide_key_mask |= (uint8_t)(1u << k);
+                ly.wide_key_esz[k] = 16;
+            }
+        }
+    }
+    /* Give each aggregate with an input vector the next dense value slot; -1 marks "no input". */
+    uint8_t nv = 0;
+    for (uint8_t a = 0; a < n_aggs && a < 8; a++) {  /* at most 8 aggregates are examined */
+        if (agg_vecs[a]) {
+            ly.agg_val_slot[a] = (int8_t)nv;
+            if (agg_vecs[a]->type == RAY_F64)
+                ly.agg_is_f64 |= (1u << a);
+            nv++;
+        } else {
+            ly.agg_val_slot[a] = -1;
+        }
+        if (agg_ops) {
+            if (agg_ops[a] == OP_FIRST) ly.agg_is_first |= (1u << a);
+            if (agg_ops[a] == OP_LAST)  ly.agg_is_last  |= (1u << a);
+        }
+    }
+    ly.n_agg_vals = nv;
+    /* Key region = n_keys*8 + 8-byte null mask slot (stored after last key).
+     * The null mask slot holds a bitmap of which keys were null in the source
+     * row (bit k = key k is null). Folding this slot into hash/memcmp lets
+     * null and 0 form distinct groups. */
+    uint16_t key_region = (uint16_t)((uint16_t)n_keys * 8 + 8);
+    ly.entry_stride = (uint16_t)(8 + key_region + (uint16_t)nv * 8);  /* 8-byte prefix + keys + one 8-byte value per slot */
+    /* Row: same 8-byte prefix and key region, then one nv*8-byte block per requested accumulator kind. */
+    uint16_t off = (uint16_t)(8 + key_region);
+    uint16_t block = (uint16_t)nv * 8;
+    if (need_flags & GHT_NEED_SUM)   { ly.off_sum   = off; off += block; }
+    if (need_flags & GHT_NEED_MIN)   { ly.off_min   = off; off += block; }
+    if (need_flags & GHT_NEED_MAX)   { ly.off_max   = off; off += block; }
+    if (need_flags & GHT_NEED_SUMSQ) { ly.off_sumsq = off; off += block; }
+    ly.row_stride = off;  /* total bytes per group row */
+    return ly;
+}
+
+/* Packed HT slots: [salt:8 | gid:24] in 4 bytes.
+ * Max groups per HT = 16M (24 bits) — ample for partitioned probes.
+ * 4B slots halve cache footprint vs 8B, fitting HT in L2.
+ * HT_PACK masks gid to 24 bits, so larger ids are silently truncated, and
+ * HT_PACK(0xFF, 0xFFFFFF) is bit-identical to HT_EMPTY: the highest id that
+ * can be stored without being mistaken for an empty slot is 0xFFFFFE. */
+#define HT_EMPTY    UINT32_MAX   /* unused-slot marker (all bits set) */
+#define HT_PACK(salt, gid)  (((uint32_t)(uint8_t)(salt) << 24) | ((gid) & 0xFFFFFF))   /* salt in the top byte, gid in the low 24 bits */
+#define HT_GID(s)   ((s) & 0xFFFFFF)   /* group id stored in slot value s */
+#define HT_SALT_V(s) ((uint8_t)((s) >> 24))   /* salt byte stored in slot value s */
+
+/* group_ht_t defined in exec_internal.h */
+
+/* Initialize `ht` with `cap` probe slots and room for `init_grp_cap` group
+ * rows laid out according to `ly` (copied into the table).
+ *
+ * `cap` is used as `cap - 1` probe mask by the callers in this file, so it
+ * is expected to be a power of two.  key_data is cleared; the caller must
+ * populate it via group_ht_set_key_data whenever wide_key_mask != 0.
+ *
+ * Returns false when either scratch allocation fails.  Every plain field is
+ * put into a defined "empty table" state before the first allocation, so a
+ * failed init leaves grp_count == 0 and NULL data pointers instead of
+ * indeterminate values: radix_phase2_fn skips a partition whose init failed,
+ * and radix_phase3_fn still reads that partition's grp_count afterwards. */
+static bool group_ht_init_sized(group_ht_t* ht, uint32_t cap,
+                                 const ght_layout_t* ly, uint32_t init_grp_cap) {
+    ht->ht_cap = cap;
+    ht->oom = 0;
+    ht->layout = *ly;
+    ht->slots = NULL;
+    ht->rows = NULL;
+    ht->grp_cap = 0;
+    ht->grp_count = 0;
+    /* key_data must be populated by the caller via group_ht_set_key_data
+     * whenever wide_key_mask != 0. */
+    memset(ht->key_data, 0, sizeof(ht->key_data));
+
+    ht->slots = (uint32_t*)scratch_alloc(&ht->_h_slots, (size_t)cap * sizeof(uint32_t));
+    if (!ht->slots) return false;
+    memset(ht->slots, 0xFF, (size_t)cap * sizeof(uint32_t)); /* HT_EMPTY = all-1s */
+
+    ht->rows = (char*)scratch_alloc(&ht->_h_rows,
+        (size_t)init_grp_cap * ly->row_stride);
+    if (!ht->rows) return false;
+    /* Publish the row capacity only once the row store actually exists. */
+    ht->grp_cap = init_grp_cap;
+    return true;
+}
+
+/* Initialize `ht` with `cap` probe slots and the default starting capacity
+ * of 256 group rows.  Returns false if an allocation fails. */
+bool group_ht_init(group_ht_t* ht, uint32_t cap, const ght_layout_t* ly) {
+    const uint32_t default_group_rows = 256;
+    return group_ht_init_sized(ht, cap, ly, default_group_rows);
+}
+
+/* Copy the source-column base pointers of wide keys from `kd` into
+ * ht->key_data so probe/rehash can resolve stored row indices to bytes.
+ * Only keys flagged in the layout's wide_key_mask are copied; the call is a
+ * no-op when no key is wide or `kd` is NULL.  Intended to run right after
+ * group_ht_init / group_ht_init_sized. */
+static inline void group_ht_set_key_data(group_ht_t* ht, void** kd) {
+    const uint8_t wide_bits = ht->layout.wide_key_mask;
+    if (wide_bits == 0 || kd == NULL) return;
+
+    const uint8_t nk = ht->layout.n_keys;
+    for (uint8_t k = 0; k < 8 && k < nk; k++) {
+        if ((wide_bits >> k) & 1u)
+            ht->key_data[k] = kd[k];
+    }
+}
+
+void group_ht_free(group_ht_t* ht) {   /* release the table's two scratch blocks; pointers and counters in *ht are left as they were */
+    scratch_free(ht->_h_slots);        /* handle behind ht->slots (probe slot array) */
+    scratch_free(ht->_h_rows);         /* handle behind ht->rows (group row store) */
+}
+
+/* Double the capacity of the group row store.  Returns false (table left
+ * unchanged) when the scratch reallocation fails. */
+static bool group_ht_grow(group_ht_t* ht) {
+    const uint16_t stride  = ht->layout.row_stride;
+    const uint32_t current = ht->grp_cap;
+    const uint32_t doubled = current * 2;
+
+    char* grown = (char*)scratch_realloc(&ht->_h_rows,
+                                         (size_t)current * stride,
+                                         (size_t)doubled * stride);
+    if (grown == NULL) return false;
+
+    ht->grp_cap = doubled;
+    ht->rows = grown;
+    return true;
+}
+
+/* Recompute the probe hash of a stored group row from its inline key slots
+ * (used by rehash, where the original fat entry is no longer available).
+ *
+ * keys       n_keys 8-byte key slots followed by the 8-byte null mask slot.
+ * key_types  per-key type code; RAY_F64 slots are hashed as doubles.
+ * wide_mask  bit k set = slot k holds a source row index, not a value.
+ * wide_esz   per-key byte width of a wide element.
+ * key_data   per-key source column base used to resolve wide keys.
+ *
+ * The result must equal the hash computed when the row was first inserted
+ * (group_rows_range / radix_phase1_fn); otherwise the rebuilt slot array
+ * places the group where later probes never look and the group is created
+ * a second time.  Those insert paths hash a null key as ray_hash_i64(0)
+ * regardless of its type, so the null bit is tested first here.  Hashing a
+ * null wide key as "row 0 of the source column", or a null F64 key through
+ * ray_hash_f64, would diverge from the insert-time hash. */
+static inline uint64_t hash_keys_inline(const int64_t* keys, const int8_t* key_types,
+                                         uint8_t n_keys, uint8_t wide_mask,
+                                         const uint8_t* wide_esz, void* const* key_data) {
+    const int64_t null_mask = keys[n_keys];   /* bit k set = key k is null */
+    uint64_t h = 0;
+    for (uint8_t k = 0; k < n_keys; k++) {
+        uint64_t kh;
+        if (null_mask & (int64_t)(1u << k)) {
+            /* Null key: the slot holds the canonical 0; hash it exactly as
+             * the insert paths do, whatever the key type. */
+            kh = ray_hash_i64(0);
+        } else if (wide_mask & (1u << k)) {
+            /* Wide key: keys[k] is the source row index. Hash the
+             * actual bytes from key_data[k]. */
+            int64_t row_idx = keys[k];
+            uint8_t esz = wide_esz[k];
+            const void* src = (const char*)key_data[k] + (size_t)row_idx * esz;
+            kh = ray_hash_bytes(src, esz);
+        } else if (key_types[k] == RAY_F64) {
+            double dv;
+            memcpy(&dv, &keys[k], 8);
+            kh = ray_hash_f64(dv);
+        } else {
+            kh = ray_hash_i64(keys[k]);
+        }
+        h = (k == 0) ? kh : ray_hash_combine(h, kh);
+    }
+    /* Fold null mask (slot n_keys) into hash so null/0 form distinct groups */
+    if (null_mask)
+        h = ray_hash_combine(h, ray_hash_i64(null_mask));
+    return h;
+}
+
+/* Double the probe slot array and re-insert every existing group.
+ * Group rows are not moved; only the slot -> gid index is rebuilt, using
+ * hashes recomputed from each row's inline keys.  If the larger slot array
+ * cannot be allocated the table is left exactly as it was. */
+static void group_ht_rehash(group_ht_t* ht, const int8_t* key_types) {
+    const uint32_t doubled = ht->ht_cap * 2;
+    const size_t   nbytes  = (size_t)doubled * sizeof(uint32_t);
+
+    ray_t* fresh_hdr = NULL;
+    uint32_t* fresh = (uint32_t*)scratch_alloc(&fresh_hdr, nbytes);
+    if (fresh == NULL) return; /* OOM: keep old HT, it still works (just slower) */
+    memset(fresh, 0xFF, nbytes);   /* every slot starts as HT_EMPTY */
+
+    /* Swap the new array in before re-inserting. */
+    scratch_free(ht->_h_slots);
+    ht->_h_slots = fresh_hdr;
+    ht->slots    = fresh;
+    ht->ht_cap   = doubled;
+
+    const ght_layout_t* ly = &ht->layout;
+    const uint32_t wrap = doubled - 1;
+    for (uint32_t gid = 0; gid < ht->grp_count; gid++) {
+        /* Key slots start right after the 8-byte row header. */
+        const char* row = ht->rows + (size_t)gid * ly->row_stride;
+        const int64_t* keys = (const int64_t*)(row + 8);
+        const uint64_t h = hash_keys_inline(keys, key_types, ly->n_keys,
+                                            ly->wide_key_mask,
+                                            ly->wide_key_esz, ht->key_data);
+        /* Linear probing for the first free slot. */
+        uint32_t at = (uint32_t)(h & wrap);
+        while (fresh[at] != HT_EMPTY)
+            at = (at + 1) & wrap;
+        fresh[at] = HT_PACK(HT_SALT(h), gid);
+    }
+}
+
+/* Seed a freshly created group row from the first fat entry mapped to it.
+ * The accumulator area (everything after the row header, key slots and
+ * null mask) is zeroed; then, for every aggregate that owns an inline value
+ * slot, the raw 8-byte value is copied into each enabled SUM / MIN / MAX
+ * block and its square (always stored as a double) into the SUMSQ block. */
+static inline void init_accum_from_entry(char* row, const char* entry,
+                                          const ght_layout_t* ly) {
+    /* 8-byte header + key slots + null mask slot */
+    const uint16_t accum_start = (uint16_t)(8 + ((uint16_t)ly->n_keys + 1) * 8);
+    if (accum_start < ly->row_stride)
+        memset(row + accum_start, 0, ly->row_stride - accum_start);
+
+    /* Inline values sit at the same distance from the start of the entry. */
+    const char* vals = entry + 8 + ((size_t)ly->n_keys + 1) * 8;
+    const uint8_t flags = ly->need_flags;
+
+    for (uint8_t a = 0; a < ly->n_aggs; a++) {
+        const int8_t slot = ly->agg_val_slot[a];
+        if (slot < 0) continue;   /* aggregate has no inline value */
+
+        const int at = slot * 8;
+        const char* src = vals + at;
+
+        if (flags & GHT_NEED_SUM) memcpy(row + ly->off_sum + at, src, 8);
+        if (flags & GHT_NEED_MIN) memcpy(row + ly->off_min + at, src, 8);
+        if (flags & GHT_NEED_MAX) memcpy(row + ly->off_max + at, src, 8);
+        if (!(flags & GHT_NEED_SUMSQ)) continue;
+
+        /* sumsq = v * v for the first entry */
+        double sq;
+        if (ly->agg_is_f64 & (1u << a)) {
+            double v;
+            memcpy(&v, src, 8);
+            sq = v * v;
+        } else {
+            int64_t v;
+            memcpy(&v, src, 8);
+            sq = (double)v * (double)v;
+        }
+        memcpy(row + ly->off_sumsq + at, &sq, 8);
+    }
+}
+
+/* Row-layout accessors: cast through void* for strict-aliasing safety.
+ * All row offsets are 8-byte aligned by construction. */
+/* ROW_RD/WR macros defined in exec_internal.h */
+
+/* Accumulate into existing group from entry's inline agg values.
+ * For every aggregate that owns an inline value slot, the value is read as
+ * double or int64 (per agg_is_f64) and folded into each enabled block:
+ *   SUM    added, except OP_FIRST aggregates keep the value stored at group
+ *          creation and OP_LAST aggregates overwrite it with the new value;
+ *   MIN    replaced when the new value is smaller;
+ *   MAX    replaced when the new value is larger;
+ *   SUMSQ  v * v added, kept as a double for both source types.
+ * The row's count header is not touched here. */
+static inline void accum_from_entry(char* row, const char* entry,
+                                     const ght_layout_t* ly) {
+    const char* agg_data = entry + 8 + ((size_t)ly->n_keys + 1) * 8;   /* skip header, key slots and null mask */
+    uint8_t na = ly->n_aggs;
+    uint8_t nf = ly->need_flags;
+
+    for (uint8_t a = 0; a < na; a++) {
+        int8_t s = ly->agg_val_slot[a];
+        if (s < 0) continue;   /* aggregate has no inline value */
+        const char* val = agg_data + s * 8;
+
+        uint8_t amask = (1u << a);
+        if (ly->agg_is_f64 & amask) {
+            double v;
+            memcpy(&v, val, 8);
+            if (nf & GHT_NEED_SUM) {
+                if (ly->agg_is_first & amask) { /* keep init value */ }
+                else if (ly->agg_is_last & amask) { memcpy(row + ly->off_sum + s * 8, val, 8); }
+                else { ROW_WR_F64(row, ly->off_sum, s) += v; }
+            }
+            if (nf & GHT_NEED_MIN) { double* p = &ROW_WR_F64(row, ly->off_min, s); if (v < *p) *p = v; }
+            if (nf & GHT_NEED_MAX) { double* p = &ROW_WR_F64(row, ly->off_max, s); if (v > *p) *p = v; }
+            if (nf & GHT_NEED_SUMSQ) { ROW_WR_F64(row, ly->off_sumsq, s) += v * v; }
+        } else {
+            int64_t v;
+            memcpy(&v, val, 8);
+            if (nf & GHT_NEED_SUM) {
+                if (ly->agg_is_first & amask) { /* keep init value */ }
+                else if (ly->agg_is_last & amask) { memcpy(row + ly->off_sum + s * 8, val, 8); }
+                else { ROW_WR_I64(row, ly->off_sum, s) += v; }
+            }
+            if (nf & GHT_NEED_MIN) { int64_t* p = &ROW_WR_I64(row, ly->off_min, s); if (v < *p) *p = v; }
+            if (nf & GHT_NEED_MAX) { int64_t* p = &ROW_WR_I64(row, ly->off_max, s); if (v > *p) *p = v; }
+            if (nf & GHT_NEED_SUMSQ) { ROW_WR_F64(row, ly->off_sumsq, s) += (double)v * (double)v; }   /* integer inputs still square into a double */
+        }
+    }
+}
+
+/* Decide whether two key regions (n_keys slots + trailing null mask slot)
+ * denote the same group.
+ * Without wide keys this is one memcmp over the packed region.  With wide
+ * keys, inline slots are compared by value, while wide slots hold source
+ * row indices: equal indices are trivially equal, different indices are
+ * resolved through key_data[k] and compared bytewise.  The null mask slots
+ * must match as well. */
+static inline bool group_keys_equal(const int64_t* a_keys, const int64_t* b_keys,
+                                      const ght_layout_t* ly, void* const* key_data) {
+    const uint8_t n = ly->n_keys;
+    const uint8_t wide_bits = ly->wide_key_mask;
+
+    /* Hot path: keys and null mask are all inline, compare them in one go. */
+    if (!wide_bits)
+        return 0 == memcmp(a_keys, b_keys, (size_t)(n + 1) * 8);
+
+    for (uint8_t k = 0; k < n; k++) {
+        const int64_t ka = a_keys[k];
+        const int64_t kb = b_keys[k];
+        if (ka == kb) continue;                     /* same value, or same source row */
+        if (!((wide_bits >> k) & 1u)) return false; /* inline key differs */
+
+        /* Different source rows may still carry identical bytes. */
+        const uint8_t width = ly->wide_key_esz[k];
+        const char* column = (const char*)key_data[k];
+        if (memcmp(column + (size_t)ka * width,
+                   column + (size_t)kb * width, width) != 0)
+            return false;
+    }
+    /* Null mask slot must match too */
+    return a_keys[n] == b_keys[n];
+}
+
+/* Probe + accumulate a single fat entry into the HT. Returns updated mask.
+ *
+ * entry      fat entry: [hash:8][n_keys key slots][null mask:8][agg values].
+ * key_types  per-key type codes, needed only if the insert triggers a rehash.
+ * mask       current probe mask (ht_cap - 1); the returned value reflects a
+ *            rehash performed by this call and must be used for later calls.
+ *
+ * A matching group gets its count incremented and the entry's values
+ * accumulated; otherwise a new group row is appended and seeded from the
+ * entry.  The slot array is doubled once it is more than half full.
+ *
+ * Failure is reported by setting ht->oom and dropping the entry:
+ *   - the row store cannot grow;
+ *   - the slot array is completely full, which can only happen after
+ *     rehash allocations failed, and would leave no empty slot to end the
+ *     probe of an unseen key, i.e. an endless loop;
+ *   - the group id would not fit the 24-bit slot field.  The largest id
+ *     that can be stored is 0xFFFFFE, because HT_PACK(0xFF, 0xFFFFFF)
+ *     equals HT_EMPTY and wider ids are silently truncated by HT_PACK. */
+static inline uint32_t group_probe_entry(group_ht_t* ht,
+    const char* entry, const int8_t* key_types, uint32_t mask) {
+    const ght_layout_t* ly = &ht->layout;
+    uint64_t hash = *(const uint64_t*)entry;
+    const char* ekeys = entry + 8;
+    uint8_t salt = HT_SALT(hash);
+    uint32_t slot = (uint32_t)(hash & mask);
+    uint16_t key_bytes = (uint16_t)((ly->n_keys + 1) * 8);
+
+    /* Every slot occupied: linear probing could never terminate on a miss. */
+    if (ht->grp_count >= ht->ht_cap) { ht->oom = 1; return mask; }
+
+    for (;;) {
+        uint32_t sv = ht->slots[slot];
+        if (sv == HT_EMPTY) {
+            /* New group */
+            if (ht->grp_count >= HT_GID(HT_EMPTY)) {
+                /* Next gid is not representable in a packed slot. */
+                ht->oom = 1;
+                return mask;
+            }
+            if (ht->grp_count >= ht->grp_cap) {
+                if (!group_ht_grow(ht)) { ht->oom = 1; return mask; }
+            }
+            uint32_t gid = ht->grp_count++;
+            char* row = ht->rows + (size_t)gid * ly->row_stride;
+            *(int64_t*)row = 1;   /* count = 1 */
+            memcpy(row + 8, ekeys, key_bytes);   /* key slots + null mask */
+            init_accum_from_entry(row, entry, ly);
+            ht->slots[slot] = HT_PACK(salt, gid);
+            if (ht->grp_count * 2 > ht->ht_cap) {
+                /* Keep the load factor at or below 1/2; on allocation
+                 * failure ht_cap (and therefore mask) stays unchanged. */
+                group_ht_rehash(ht, key_types);
+                mask = ht->ht_cap - 1;
+            }
+            return mask;
+        }
+        /* The salt byte filters most mismatches before touching the row. */
+        if (HT_SALT_V(sv) == salt) {
+            uint32_t gid = HT_GID(sv);
+            char* row = ht->rows + (size_t)gid * ly->row_stride;
+            if (group_keys_equal((const int64_t*)(row + 8),
+                                  (const int64_t*)ekeys, ly, ht->key_data)) {
+                (*(int64_t*)row)++;   /* count++ */
+                accum_from_entry(row, entry, ly);
+                return mask;
+            }
+        }
+        slot = (slot + 1) & mask;
+    }
+}
+
+/* Process rows [start, end) from original columns into a local hash table.
+ * Converts each row to a fat entry on the stack, then probes.
+ *
+ * ht         destination table; its layout supplies the key/agg counts and
+ *            the wide-key mask.
+ * key_data   per-key raw column bases; also wired into ht for wide keys.
+ * key_types  per-key type codes (RAY_F64 keys are copied bitwise and hashed
+ *            as doubles).
+ * key_attrs  per-key attrs forwarded to read_col_i64 for the other types.
+ * key_vecs   per-key vectors used only for null detection; may be NULL, in
+ *            which case no key is treated as nullable.
+ * agg_vecs   per-aggregate input vectors; NULL entries add no inline value.
+ * match_idx  optional selection: when non-NULL the source row is
+ *            match_idx[i], otherwise it is i itself.
+ *
+ * The loop stops early when ray_interrupted() reports true at a checkpoint.
+ * Allocation failure inside the probe is reported through ht->oom; this
+ * function itself returns nothing. */
+#define GROUP_PREFETCH_BATCH 16   /* NOTE(review): not referenced by group_rows_range — confirm it is still used elsewhere */
+
+void group_rows_range(group_ht_t* ht, void** key_data, int8_t* key_types,
+                              uint8_t* key_attrs, ray_t** key_vecs, ray_t** agg_vecs,
+                              int64_t start, int64_t end,
+                              const int64_t* match_idx) {
+    const ght_layout_t* ly = &ht->layout;
+    uint8_t nk = ly->n_keys;
+    uint8_t na = ly->n_aggs;
+    uint8_t wide = ly->wide_key_mask;
+    uint32_t mask = ht->ht_cap - 1;
+    /* Stack buffer for one entry: hash + (nk+1) key slots + nv agg_vals.
+     * Max size: 8 + 9*8 + 8*8 = 144 bytes.
+     * NOTE(review): this bound assumes nk <= 8 and at most 8 inline agg
+     * values; nothing in this function enforces it. */
+    char ebuf[8 + 9 * 8 + 8 * 8];
+
+    /* Check which key columns can produce nulls (parent vec's HAS_NULLS
+     * attr for slices) — skips per-row null checks on the fast path. */
+    uint8_t nullable_mask = 0;
+    for (uint8_t k = 0; k < nk; k++) {
+        if (!key_vecs || !key_vecs[k]) continue;
+        ray_t* kv = key_vecs[k];
+        ray_t* src = (kv->attrs & RAY_ATTR_SLICE) ? kv->slice_parent : kv;
+        if (src && (src->attrs & RAY_ATTR_HAS_NULLS))
+            nullable_mask |= (uint8_t)(1u << k);
+    }
+
+    /* Wire the HT's key_data pointer table so probe/rehash can
+     * resolve wide keys via the source columns. */
+    if (wide) group_ht_set_key_data(ht, key_data);
+
+    for (int64_t i = start; i < end; i++) {
+        /* Cancellation checkpoint every 65536 rows — ~150 polls on a
+         * 10M-row ingest, imperceptible in the inner loop and still
+         * sub-100ms response time on Ctrl-C. */
+        if (((i - start) & 65535) == 0 && ray_interrupted()) break;
+        int64_t row = match_idx ? match_idx[i] : i;
+        uint64_t h = 0;
+        int64_t* ek = (int64_t*)(ebuf + 8);   /* key slots follow the 8-byte hash */
+        int64_t null_mask = 0;
+        for (uint8_t k = 0; k < nk; k++) {
+            int8_t t = key_types[k];
+            uint64_t kh;
+            bool is_null = (nullable_mask & (1u << k))
+                           && ray_vec_is_null(key_vecs[k], row);
+            if (is_null) {
+                null_mask |= (int64_t)(1u << k);
+                ek[k] = 0;  /* canonical null value — real 0 differs via null_mask */
+                kh = ray_hash_i64(0);   /* nulls hash the same way for every key type */
+            } else if (wide & (1u << k)) {
+                /* Wide key: store source row index, hash the actual bytes. */
+                uint8_t esz = ly->wide_key_esz[k];
+                const void* src = (const char*)key_data[k] + (size_t)row * esz;
+                ek[k] = row;
+                kh = ray_hash_bytes(src, esz);
+            } else if (t == RAY_F64) {
+                int64_t kv;
+                memcpy(&kv, &((double*)key_data[k])[row], 8);   /* keep the exact bit pattern in the slot */
+                ek[k] = kv;
+                kh = ray_hash_f64(((double*)key_data[k])[row]);
+            } else {
+                int64_t kv = read_col_i64(key_data[k], row, t, key_attrs[k]);
+                ek[k] = kv;
+                kh = ray_hash_i64(kv);
+            }
+            h = (k == 0) ? kh : ray_hash_combine(h, kh);
+        }
+        ek[nk] = null_mask;   /* null bitmap occupies the slot after the last key */
+        if (null_mask) h = ray_hash_combine(h, ray_hash_i64(null_mask));
+        *(uint64_t*)ebuf = h;
+
+        int64_t* ev = (int64_t*)(ebuf + 8 + ((size_t)nk + 1) * 8);   /* inline agg values follow the null mask */
+        uint8_t vi = 0;
+        for (uint8_t a = 0; a < na; a++) {
+            ray_t* ac = agg_vecs[a];
+            if (!ac) continue;
+            if (ac->type == RAY_F64)
+                memcpy(&ev[vi], &((double*)ray_data(ac))[row], 8);
+            else
+                ev[vi] = read_col_i64(ray_data(ac), row, ac->type, ac->attrs);
+            vi++;
+        }
+
+        mask = group_probe_entry(ht, ebuf, key_types, mask);   /* mask changes when the probe rehashes */
+    }
+}
+
+/* ============================================================================
+ * Radix-partitioned parallel group-by
+ *
+ * Phase 1 (parallel): Each worker reads keys+agg values from original columns,
+ *         packs into fat entries (hash, keys, agg_vals), scatters into
+ *         thread-local per-partition buffers.
+ * Phase 2 (parallel): Each partition is aggregated independently using
+ *         inline data — no original column access needed.
+ * Phase 3: Build result columns from inline group rows.
+ * ============================================================================ */
+
+#define RADIX_BITS  8                    /* hash bits used to pick a partition */
+#define RADIX_P     (1u << RADIX_BITS)   /* 256 partitions */
+#define RADIX_MASK  (RADIX_P - 1)        /* low-bit mask selecting a partition index */
+#define RADIX_PART(h) (((uint32_t)((h) >> 16)) & RADIX_MASK)   /* partition index = hash bits 16..23 */
+
+/* Per-worker, per-partition buffer of fat entries.
+ * Grown by doubling in radix_buf_push; entries are entry_stride bytes each. */
+typedef struct {
+    char*    data;           /* flat buffer: data[i * entry_stride] */
+    uint32_t count;          /* number of entries currently stored */
+    uint32_t cap;            /* capacity in entries */
+    bool     oom;            /* set on realloc failure */
+    ray_t*    _hdr;          /* scratch handle backing `data`, passed to scratch_realloc */
+} radix_buf_t;
+
+/* Append one fat entry to a partition buffer, doubling the buffer first
+ * when it is full.
+ * Entry layout: [hash:8][n_keys * 8 key slots][null mask:8]
+ *               [n_agg_vals * 8 inline values].
+ * If the buffer cannot grow, buf->oom is set and the entry is dropped. */
+static inline void radix_buf_push(radix_buf_t* buf, uint16_t entry_stride,
+                                   uint64_t hash, const int64_t* keys, uint8_t n_keys,
+                                   int64_t null_mask,
+                                   const int64_t* agg_vals, uint8_t n_agg_vals) {
+    if (__builtin_expect(buf->count >= buf->cap, 0)) {
+        const uint32_t current = buf->cap;
+        const uint32_t doubled = current * 2;
+        char* grown = (char*)scratch_realloc(&buf->_hdr,
+                                             (size_t)current * entry_stride,
+                                             (size_t)doubled * entry_stride);
+        if (grown == NULL) {
+            buf->oom = true;
+            return;
+        }
+        buf->cap  = doubled;
+        buf->data = grown;
+    }
+
+    const size_t key_bytes = (size_t)n_keys * 8;
+    char* cursor = buf->data + (size_t)buf->count * entry_stride;
+
+    *(uint64_t*)cursor = hash;
+    cursor += 8;
+    memcpy(cursor, keys, key_bytes);
+    cursor += key_bytes;
+    /* Null mask slot sits right after the keys */
+    memcpy(cursor, &null_mask, 8);
+    cursor += 8;
+    if (n_agg_vals)
+        memcpy(cursor, agg_vals, (size_t)n_agg_vals * 8);
+
+    buf->count++;
+}
+
+typedef struct {   /* shared, read-only inputs of radix_phase1_fn plus the per-worker output buffers */
+    void**       key_data;        /* [n_keys] raw key column bases */
+    int8_t*      key_types;       /* [n_keys] key type codes */
+    uint8_t*     key_attrs;       /* [n_keys] attrs forwarded to read_col_i64 */
+    ray_t**      key_vecs;        /* [n_keys] key vectors, consulted for null checks */
+    uint8_t      nullable_mask;   /* bit k = key k column may contain nulls */
+    ray_t**       agg_vecs;       /* [n_aggs] aggregate inputs; NULL entries are skipped */
+    uint32_t     n_workers;
+    radix_buf_t* bufs;        /* [n_workers * RADIX_P] */
+    ght_layout_t layout;
+    /* When non-NULL, workers iterate match_idx[start..end) and
+     * read row=match_idx[i].  When NULL, row=i. */
+    const int64_t* match_idx;
+} radix_phase1_ctx_t;
+
+static void radix_phase1_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) {   /* phase-1 worker: pack rows [start, end) into fat entries and scatter them by hash into this worker's partition buffers */
+    radix_phase1_ctx_t* c = (radix_phase1_ctx_t*)ctx;
+    const ght_layout_t* ly = &c->layout;
+    radix_buf_t* my_bufs = &c->bufs[(size_t)worker_id * RADIX_P];   /* this worker's private row of RADIX_P buffers */
+    uint8_t nk = ly->n_keys;
+    uint8_t na = ly->n_aggs;
+    uint8_t nv = ly->n_agg_vals;
+    uint8_t wide = ly->wide_key_mask;
+    uint16_t estride = ly->entry_stride;
+    const int64_t* match_idx = c->match_idx;
+
+    int64_t keys[8];       /* NOTE(review): fixed size assumes n_keys <= 8 — not checked here */
+    int64_t agg_vals[8];   /* NOTE(review): fixed size assumes at most 8 inline agg values — not checked here */
+
+    uint8_t nullable = c->nullable_mask;
+    for (int64_t i = start; i < end; i++) {
+        /* Cancellation checkpoint every 65536 rows — ~150 polls on a
+         * 10M-row ingest, imperceptible in the inner loop and still
+         * sub-100ms response time on Ctrl-C. */
+        if (((i - start) & 65535) == 0 && ray_interrupted()) break;
+        int64_t row = match_idx ? match_idx[i] : i;
+        uint64_t h = 0;
+        int64_t null_mask = 0;
+        for (uint8_t k = 0; k < nk; k++) {
+            int8_t t = c->key_types[k];
+            uint64_t kh;
+            bool is_null = (nullable & (1u << k))
+                           && ray_vec_is_null(c->key_vecs[k], row);
+            if (is_null) {
+                null_mask |= (int64_t)(1u << k);
+                keys[k] = 0;             /* canonical slot value for a null key */
+                kh = ray_hash_i64(0);    /* nulls hash the same way for every key type */
+            } else if (wide & (1u << k)) {
+                uint8_t esz = ly->wide_key_esz[k];
+                const void* src = (const char*)c->key_data[k] + (size_t)row * esz;
+                keys[k] = row;           /* wide key: slot keeps the source row index */
+                kh = ray_hash_bytes(src, esz);
+            } else if (t == RAY_F64) {
+                int64_t kv;
+                memcpy(&kv, &((double*)c->key_data[k])[row], 8);   /* keep the exact bit pattern in the slot */
+                keys[k] = kv;
+                kh = ray_hash_f64(((double*)c->key_data[k])[row]);
+            } else {
+                int64_t kv = read_col_i64(c->key_data[k], row, t, c->key_attrs[k]);
+                keys[k] = kv;
+                kh = ray_hash_i64(kv);
+            }
+            h = (k == 0) ? kh : ray_hash_combine(h, kh);
+        }
+        if (null_mask) h = ray_hash_combine(h, ray_hash_i64(null_mask));   /* keeps null and 0 keys in different groups */
+
+        uint8_t vi = 0;
+        for (uint8_t a = 0; a < na; a++) {
+            ray_t* ac = c->agg_vecs[a];
+            if (!ac) continue;
+            if (ac->type == RAY_F64)
+                memcpy(&agg_vals[vi], &((double*)ray_data(ac))[row], 8);
+            else
+                agg_vals[vi] = read_col_i64(ray_data(ac), row, ac->type, ac->attrs);
+            vi++;
+        }
+
+        uint32_t part = RADIX_PART(h);
+        radix_buf_push(&my_bufs[part], estride, h, keys, nk, null_mask, agg_vals, nv);   /* on growth failure the entry is dropped and the buffer's oom flag is set */
+    }
+}
+
+/* Feed `n_entries` pre-partitioned fat entries (each `entry_stride` bytes,
+ * hash in the first 8 bytes) into `ht`, prefetching hash-table slots ahead
+ * of the probe.
+ * While entry i is probed, the home slot of entry i + PF_DIST is prefetched;
+ * the first PF_DIST slots are prefetched up front.  Only the slot array is
+ * prefetched, not the group rows.  The probe mask is refreshed after every
+ * probe because an insert may rehash the table. */
+static void group_rows_indirect(group_ht_t* ht, const int8_t* key_types,
+                                 const char* entries, uint32_t n_entries,
+                                 uint16_t entry_stride) {
+    /* D=8 covers ~200ns L2/L3 latency at ~25ns per probe iteration. */
+    enum { PF_DIST = 8 };
+    uint32_t mask = ht->ht_cap - 1;
+
+    /* Prime the prefetch pipeline */
+    for (uint32_t j = 0; j < n_entries && j < PF_DIST; j++) {
+        const uint64_t h = *(const uint64_t*)(entries + (size_t)j * entry_stride);
+        __builtin_prefetch(&ht->slots[(uint32_t)(h & mask)], 0, 1);
+    }
+
+    for (uint32_t i = 0; i < n_entries; i++) {
+        const uint32_t ahead = i + PF_DIST;
+        if (ahead < n_entries) {
+            const uint64_t h =
+                *(const uint64_t*)(entries + (size_t)ahead * entry_stride);
+            __builtin_prefetch(&ht->slots[(uint32_t)(h & mask)], 0, 1);
+        }
+        mask = group_probe_entry(ht, entries + (size_t)i * entry_stride,
+                                 key_types, mask);
+    }
+}
+
+/* Phase 3: build result columns from inline group rows.
+ * One agg_out_t describes the output column of a single aggregate. */
+typedef struct {
+    int8_t  out_type;   /* RAY_F64 selects the double output path; anything else is written as int64 */
+    bool    src_f64;    /* accumulators hold doubles (true) or int64 values (false) */
+    uint16_t agg_op;    /* OP_* code selecting the result formula */
+    bool    affine;     /* when set, a bias is added to SUM (bias * count) and, on the double path, to AVG */
+    double  bias_f64;   /* bias used on the double output path */
+    int64_t bias_i64;   /* bias used on the int64 output path */
+    void*   dst;        /* output data, indexed by global group index; NULL means the aggregate is skipped */
+    ray_t*  vec;        /* output vector, handed to grp_set_null for null results */
+} agg_out_t;
+
+/* Aliases for shared parallel null helpers from internal.h.
+ * The grp_* names are local spellings; the par_* implementations are not
+ * defined in this file section. */
+#define grp_set_null       par_set_null          /* called as grp_set_null(vec, index) by the phase-3 scatter */
+#define grp_prepare_nullmap par_prepare_nullmap
+#define grp_finalize_nulls par_finalize_nulls
+
+typedef struct {   /* inputs of radix_phase3_fn: per-partition tables plus the output columns to fill */
+    group_ht_t*   part_hts;       /* per-partition hash tables, indexed by partition */
+    uint32_t*     part_offsets;   /* per-partition index of its first group in the output columns */
+    char**        key_dsts;       /* [n_keys] output key data buffers */
+    int8_t*       key_types;      /* [n_keys] key type codes */
+    uint8_t*      key_attrs;      /* [n_keys] attrs forwarded to write_col_i64 */
+    uint8_t*      key_esizes;     /* [n_keys] output element size in bytes */
+    ray_t**       key_cols;       /* [n_keys] output key vecs (for null bit writes) */
+    uint8_t       n_keys;
+    agg_out_t*    agg_outs;       /* [n_aggs] output descriptors */
+    uint8_t       n_aggs;
+    /* For wide-key columns (RAY_GUID), the stored key slot is a
+     * source row index and we copy the actual bytes from the source
+     * column here during the result scatter. */
+    void**        key_src_data;   /* [n_keys]; NULL entry if not wide */
+} radix_phase3_ctx_t;
+
+static void radix_phase3_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) {   /* phase-3 worker: scatter the group rows of partitions [start, end) into the result columns */
+    (void)worker_id;
+    radix_phase3_ctx_t* c = (radix_phase3_ctx_t*)ctx;
+    uint8_t nk = c->n_keys;
+    uint8_t na = c->n_aggs;
+
+    for (int64_t p = start; p < end; p++) {
+        group_ht_t* ph = &c->part_hts[p];
+        uint32_t gc = ph->grp_count;
+        if (gc == 0) continue;
+        uint32_t off = c->part_offsets[p];   /* output index of this partition's first group */
+        const ght_layout_t* ly = &ph->layout;
+        uint16_t rs = ly->row_stride;
+
+        /* Single pass over group rows: read each row once, scatter keys + aggs.
+         * Reduces memory traffic from nk+na passes over group data to 1 pass. */
+        for (uint32_t gi = 0; gi < gc; gi++) {
+            const char* row = ph->rows + (size_t)gi * rs;
+            const int64_t* rkeys = (const int64_t*)(const void*)(row + 8);
+            int64_t cnt = *(const int64_t*)(const void*)row;   /* row header = number of source rows in the group */
+            int64_t null_mask = rkeys[nk];                     /* bit k set = key k is null for this group */
+            uint32_t di = off + gi;
+
+            /* Scatter keys to result columns */
+            for (uint8_t k = 0; k < nk; k++) {
+                if (null_mask & (int64_t)(1u << k)) {
+                    if (c->key_cols && c->key_cols[k])
+                        grp_set_null(c->key_cols[k], di);
+                    continue;   /* the data slot of a null key is not written here */
+                }
+                int64_t kv = rkeys[k];
+                int8_t kt = c->key_types[k];
+                char* dst = c->key_dsts[k];
+                uint8_t esz = c->key_esizes[k];
+                size_t doff = (size_t)di * esz;
+                if (ly->wide_key_mask & (1u << k)) {
+                    /* Wide key: kv is the source row index; copy the
+                     * bytes from the source column into the output. */
+                    const char* src = (const char*)c->key_src_data[k];
+                    memcpy(dst + doff, src + (size_t)kv * esz, esz);
+                } else if (kt == RAY_F64) {
+                    memcpy(dst + doff, &kv, 8);   /* slot holds the double's bit pattern */
+                } else {
+                    write_col_i64(dst, di, kv, kt, c->key_attrs[k]);
+                }
+            }
+
+            /* Scatter agg results to result columns */
+            for (uint8_t a = 0; a < na; a++) {
+                agg_out_t* ao = &c->agg_outs[a];
+                if (!ao->dst) continue; /* allocation failed (OOM) */
+                uint16_t op = ao->agg_op;
+                bool sf = ao->src_f64;
+                int8_t s = ly->agg_val_slot[a];   /* value slot inside each accumulator block */
+                if (ao->out_type == RAY_F64) {
+                    double v;
+                    switch (op) {
+                        case OP_SUM:
+                            v = sf ? ROW_RD_F64(row, ly->off_sum, s)
+                                   : (double)ROW_RD_I64(row, ly->off_sum, s);
+                            if (ao->affine) v += ao->bias_f64 * cnt;   /* bias applies once per source row */
+                            break;
+                        case OP_AVG:
+                            v = sf ? ROW_RD_F64(row, ly->off_sum, s) / cnt
+                                   : (double)ROW_RD_I64(row, ly->off_sum, s) / cnt;
+                            if (ao->affine) v += ao->bias_f64;
+                            break;
+                        case OP_MIN:
+                            v = sf ? ROW_RD_F64(row, ly->off_min, s)
+                                   : (double)ROW_RD_I64(row, ly->off_min, s);
+                            break;
+                        case OP_MAX:
+                            v = sf ? ROW_RD_F64(row, ly->off_max, s)
+                                   : (double)ROW_RD_I64(row, ly->off_max, s);
+                            break;
+                        case OP_FIRST: case OP_LAST:   /* FIRST/LAST values are kept in the SUM block */
+                            v = sf ? ROW_RD_F64(row, ly->off_sum, s)
+                                   : (double)ROW_RD_I64(row, ly->off_sum, s);
+                            break;
+                        case OP_VAR: case OP_VAR_POP:
+                        case OP_STDDEV: case OP_STDDEV_POP: {
+                            bool insuf = (op == OP_VAR || op == OP_STDDEV) ? cnt <= 1 : cnt <= 0;   /* sample variants need cnt > 1, population variants cnt > 0 */
+                            if (insuf) { v = 0.0; grp_set_null(ao->vec, di); break; }
+                            double sum_val = sf ? ROW_RD_F64(row, ly->off_sum, s)
+                                                : (double)ROW_RD_I64(row, ly->off_sum, s);
+                            double sq_val = ly->off_sumsq ? ROW_RD_F64(row, ly->off_sumsq, s) : 0.0;   /* off_sumsq == 0 means no SUMSQ block was laid out */
+                            double mean = sum_val / cnt;
+                            double var_pop = sq_val / cnt - mean * mean;   /* mean of squares minus squared mean */
+                            if (var_pop < 0) var_pop = 0;                  /* never report a negative variance */
+                            if (op == OP_VAR_POP) v = var_pop;
+                            else if (op == OP_VAR) v = var_pop * cnt / (cnt - 1);
+                            else if (op == OP_STDDEV_POP) v = sqrt(var_pop);
+                            else v = sqrt(var_pop * cnt / (cnt - 1));
+                            break;
+                        }
+                        default: v = 0.0; break;   /* any other op writes 0 on the double path */
+                    }
+                    ((double*)(void*)ao->dst)[di] = v;
+                } else {
+                    int64_t v;
+                    switch (op) {
+                        case OP_SUM:
+                            v = ROW_RD_I64(row, ly->off_sum, s);
+                            if (ao->affine) v += ao->bias_i64 * cnt;
+                            break;
+                        case OP_COUNT: v = cnt; break;
+                        case OP_MIN:   v = ROW_RD_I64(row, ly->off_min, s); break;
+                        case OP_MAX:   v = ROW_RD_I64(row, ly->off_max, s); break;
+                        case OP_FIRST: case OP_LAST: v = ROW_RD_I64(row, ly->off_sum, s); break;
+                        default:       v = 0; break;   /* any other op writes 0 on the int64 path */
+                    }
+                    ((int64_t*)(void*)ao->dst)[di] = v;
+                }
+            }
+        }
+    }
+}
+
+/* Phase 2: aggregate each partition independently using inline data.
+ * Shared context handed to every radix_phase2_fn worker. */
+typedef struct {
+    int8_t*      key_types;   /* [n_keys] key type codes, needed when a table rehashes */
+    uint8_t      n_keys;
+    uint32_t     n_workers;   /* number of phase-1 workers, i.e. buffer rows in `bufs` */
+    radix_buf_t* bufs;        /* [n_workers * RADIX_P] phase-1 output buffers */
+    group_ht_t*  part_hts;    /* per-partition hash tables, initialized by phase 2 */
+    ght_layout_t layout;
+    /* Shared (read-only) source column bases for wide-key resolution.
+     * Each partition HT stashes the ones matching wide_key_mask. */
+    void**       key_data;
+} radix_phase2_ctx_t;
+
+/* Phase-2 worker: build one hash table per radix partition in [start, end).
+ *
+ * For each partition the entry counts of all phase-1 workers are summed,
+ * a table sized for them is initialized in c->part_hts[p], and every
+ * worker's buffer for that partition is probed into it.  Partitions
+ * without entries are left untouched.
+ *
+ * Failures are reported through the partition table's oom flag.  That
+ * covers a failed table initialization, which would otherwise drop the
+ * whole partition without any trace, as well as failures inside the probe.
+ *
+ * Sizing is done in 64 bits: the per-worker counts are 32-bit and may add
+ * up past 2^32, and doubling a 32-bit capacity towards a target above 2^31
+ * would wrap to 0 and never terminate.  The initial slot count is capped
+ * at 2^25, twice the 24-bit group-id space of a packed slot; the table
+ * still doubles on demand while probing. */
+static void radix_phase2_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) {
+    (void)worker_id;
+    radix_phase2_ctx_t* c = (radix_phase2_ctx_t*)ctx;
+    uint16_t estride = c->layout.entry_stride;
+
+    for (int64_t p = start; p < end; p++) {
+        uint64_t total = 0;
+        for (uint32_t w = 0; w < c->n_workers; w++)
+            total += c->bufs[(size_t)w * RADIX_P + p].count;
+        if (total == 0) continue;
+
+        uint32_t part_ht_cap = 256;
+        {
+            const uint64_t max_initial_cap = (uint64_t)1 << 25;
+            uint64_t target = total * 2;
+            if (target < 256) target = 256;
+            if (target > max_initial_cap) target = max_initial_cap;
+            while (part_ht_cap < target) part_ht_cap *= 2;
+        }
+        /* Pre-size group store to avoid grows. Use next_pow2(total) as upper
+         * bound on groups, capped at 65536 rows; the store doubles on demand
+         * beyond that. */
+        uint32_t init_grp = 256;
+        while (init_grp < total && init_grp < 65536) init_grp *= 2;
+        if (!group_ht_init_sized(&c->part_hts[p], part_ht_cap, &c->layout, init_grp)) {
+            /* Do not lose the partition silently: flag it for the caller. */
+            c->part_hts[p].oom = 1;
+            continue;
+        }
+        /* Wide keys need source-column resolution during probe/rehash. */
+        if (c->layout.wide_key_mask && c->key_data)
+            group_ht_set_key_data(&c->part_hts[p], c->key_data);
+
+        for (uint32_t w = 0; w < c->n_workers; w++) {
+            radix_buf_t* buf = &c->bufs[(size_t)w * RADIX_P + p];
+            if (buf->count == 0) continue;
+            group_rows_indirect(&c->part_hts[p], c->key_types,
+                                buf->data, buf->count, estride);
+        }
+    }
+}
+
+/* ============================================================================
+ * Parallel direct-array accumulation for low-cardinality single integer key
+ * ============================================================================ */
+
+/* Parallel min/max scan for direct-array key range detection: context read by minmax_scan_fn */
+typedef struct {
+    const void* key_data;        /* key column base; element width is chosen from key_type/key_attrs */
+    int8_t      key_type;        /* type code of the key column */
+    uint8_t     key_attrs;       /* only the RAY_SYM_W_MASK bits are read, to pick the RAY_SYM width */
+    int64_t*    per_worker_min;  /* [n_workers] */
+    int64_t*    per_worker_max;  /* [n_workers] */
+    uint32_t    n_workers;       /* slot count; worker ids are reduced modulo this */
+    const int64_t* match_idx;    /* NULL = no selection */
+} minmax_ctx_t;
+
+static void minmax_scan_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) {
+    minmax_ctx_t* mm = (minmax_ctx_t*)ctx;
+    const uint32_t slot = worker_id % mm->n_workers;
+    const int64_t* sel = mm->match_idx;
+    const int8_t ktype = mm->key_type;
+    int64_t lo = INT64_MAX, hi = INT64_MIN;
+
+    /* Scan positions [start, end) reading keys as TYPE and widening each to int64_t;
+     * a non-NULL selection vector maps the position to the actual row first. */
+    #define MINMAX_SCAN_AS(TYPE) \
+        do { \
+            const TYPE* keys = (const TYPE*)mm->key_data; \
+            for (int64_t pos = start; pos < end; pos++) { \
+                const int64_t key = (int64_t)keys[sel ? sel[pos] : pos]; \
+                if (key < lo) lo = key; \
+                if (key > hi) hi = key; \
+            } \
+        } while (0)
+
+    if (ktype == RAY_I64 || ktype == RAY_TIMESTAMP) {
+        MINMAX_SCAN_AS(int64_t);
+    } else if (RAY_IS_SYM(ktype)) {
+        const uint8_t width = mm->key_attrs & RAY_SYM_W_MASK;
+        if (width == RAY_SYM_W64)      MINMAX_SCAN_AS(int64_t);
+        else if (width == RAY_SYM_W32) MINMAX_SCAN_AS(uint32_t);
+        else if (width == RAY_SYM_W16) MINMAX_SCAN_AS(uint16_t);
+        else                           MINMAX_SCAN_AS(uint8_t);
+    } else if (ktype == RAY_BOOL || ktype == RAY_U8) {
+        MINMAX_SCAN_AS(uint8_t);
+    } else if (ktype == RAY_I16) {
+        MINMAX_SCAN_AS(int16_t);
+    } else {  /* RAY_I32, RAY_DATE, RAY_TIME */
+        MINMAX_SCAN_AS(int32_t);
+    }
+    #undef MINMAX_SCAN_AS
+
+    /* Fold into this worker's slot: the slot may already hold results from earlier morsels */
+    if (lo < mm->per_worker_min[slot]) mm->per_worker_min[slot] = lo;
+    if (hi > mm->per_worker_max[slot]) mm->per_worker_max[slot] = hi;
+}
+
+typedef union { double f; int64_t i; } da_val_t;  /* one accumulator cell: .f for RAY_F64 aggs, .i otherwise */
+
+typedef struct {
+    da_val_t* sum;       /* SUM/AVG/FIRST/LAST [n_slots * n_aggs]; NULL for COUNT-only queries */
+    da_val_t* min_val;   /* MIN [n_slots * n_aggs] */
+    da_val_t* max_val;   /* MAX [n_slots * n_aggs] */
+    double*   sumsq_f64; /* sum-of-squares for STDDEV/VAR [n_slots * n_aggs]; readers test it for NULL */
+    int64_t*  count;     /* group counts [n_slots] */
+    /* Arena headers, one per array above; all five are passed to scratch_free by da_accum_free */
+    ray_t* _h_sum;
+    ray_t* _h_min;
+    ray_t* _h_max;
+    ray_t* _h_sumsq;
+    ray_t* _h_count;
+} da_accum_t;
+
+static inline void da_accum_free(da_accum_t* a) {
+    /* Hand every arena header to scratch_free, in declaration order. */
+    ray_t** const owned[] = { &a->_h_sum, &a->_h_min, &a->_h_max, &a->_h_sumsq, &a->_h_count };
+    const size_t n_owned = sizeof(owned) / sizeof(owned[0]);
+    for (size_t i = 0; i < n_owned; i++)
+        scratch_free(*owned[i]);
+}
+
+/* Unified agg result emitter — used by both DA and HT paths.
+ * Arrays indexed by [gi * n_aggs + a], counts by [gi].
+ *
+ * For each aggregate a: allocates an output vector of grp_count rows, fills it
+ * from the accumulator arrays, derives a column name, and appends the column to
+ * *result (replaced by whatever ray_table_add_col returns). An aggregate whose
+ * output vector cannot be allocated is skipped without reporting an error. */
+static void emit_agg_columns(ray_t** result, ray_graph_t* g, const ray_op_ext_t* ext,
+                              ray_t* const* agg_vecs, uint32_t grp_count,
+                              uint8_t n_aggs,
+                              const double*  sum_f64,  const int64_t* sum_i64,
+                              const double*  min_f64,  const double*  max_f64,
+                              const int64_t* min_i64,  const int64_t* max_i64,
+                              const int64_t* counts,
+                              const agg_affine_t* affine,
+                              const double*  sumsq_f64) {
+    for (uint8_t a = 0; a < n_aggs; a++) {
+        uint16_t agg_op = ext->agg_ops[a];
+        ray_t* agg_col = agg_vecs[a];
+        bool is_f64 = agg_col && agg_col->type == RAY_F64;
+        int8_t out_type;
+        switch (agg_op) {
+            case OP_AVG:
+            case OP_STDDEV: case OP_STDDEV_POP:
+            case OP_VAR: case OP_VAR_POP:
+                out_type = RAY_F64; break;
+            case OP_COUNT: out_type = RAY_I64; break;
+            case OP_SUM: case OP_PROD:
+                out_type = is_f64 ? RAY_F64 : RAY_I64; break;
+            default:
+                out_type = agg_col ? agg_col->type : RAY_I64; break;
+        }
+        ray_t* new_col = ray_vec_new(out_type, (int64_t)grp_count);
+        if (!new_col || RAY_IS_ERR(new_col)) continue;
+        new_col->len = (int64_t)grp_count;
+        for (uint32_t gi = 0; gi < grp_count; gi++) {
+            size_t idx = (size_t)gi * n_aggs + a;
+            if (out_type == RAY_F64) {
+                double v;
+                switch (agg_op) {
+                    case OP_SUM:
+                        v = is_f64 ? sum_f64[idx] : (double)sum_i64[idx];
+                        if (affine && affine[a].enabled)
+                            v += affine[a].bias_f64 * counts[gi];
+                        break;
+                    case OP_AVG:
+                        v = is_f64 ? sum_f64[idx] / counts[gi] : (double)sum_i64[idx] / counts[gi];
+                        if (affine && affine[a].enabled)
+                            v += affine[a].bias_f64;
+                        break;
+                    case OP_MIN: v = is_f64 ? min_f64[idx] : (double)min_i64[idx]; break;
+                    case OP_MAX: v = is_f64 ? max_f64[idx] : (double)max_i64[idx]; break;
+                    case OP_FIRST: case OP_LAST:
+                        v = is_f64 ? sum_f64[idx] : (double)sum_i64[idx]; break;
+                    case OP_VAR: case OP_VAR_POP:
+                    case OP_STDDEV: case OP_STDDEV_POP: {
+                        int64_t cnt = counts[gi];
+                        bool insuf = (agg_op == OP_VAR || agg_op == OP_STDDEV) ? cnt <= 1 : cnt <= 0;
+                        if (insuf) { v = 0.0; ray_vec_set_null(new_col, gi, true); break; }
+                        double sum_val = is_f64 ? sum_f64[idx] : (double)sum_i64[idx];
+                        double sq_val = sumsq_f64 ? sumsq_f64[idx] : 0.0;
+                        double mean = sum_val / cnt;
+                        double var_pop = sq_val / cnt - mean * mean;
+                        if (var_pop < 0) var_pop = 0;
+                        if (agg_op == OP_VAR_POP) v = var_pop;
+                        else if (agg_op == OP_VAR) v = var_pop * cnt / (cnt - 1);
+                        else if (agg_op == OP_STDDEV_POP) v = sqrt(var_pop);
+                        else v = sqrt(var_pop * cnt / (cnt - 1));
+                        break;
+                    }
+                    default:     v = 0.0; break;
+                }
+                ((double*)ray_data(new_col))[gi] = v;
+            } else {
+                int64_t v;
+                switch (agg_op) {
+                    case OP_SUM:
+                        v = sum_i64[idx];
+                        if (affine && affine[a].enabled)
+                            v += affine[a].bias_i64 * counts[gi];
+                        break;
+                    case OP_COUNT: v = counts[gi]; break;
+                    case OP_MIN:   v = min_i64[idx]; break;
+                    case OP_MAX:   v = max_i64[idx]; break;
+                    case OP_FIRST: case OP_LAST: v = sum_i64[idx]; break;
+                    default:       v = 0; break;
+                }
+                /* Store at the output column's element width: vectors of the narrow integer
+                 * types hold 4/2/1-byte elements, so an 8-byte store would run past the data. */
+                switch (out_type) {
+                    case RAY_I32: case RAY_DATE: case RAY_TIME:
+                        ((int32_t*)ray_data(new_col))[gi] = (int32_t)v; break;
+                    case RAY_I16: ((int16_t*)ray_data(new_col))[gi] = (int16_t)v; break;
+                    case RAY_U8: case RAY_BOOL: ((uint8_t*)ray_data(new_col))[gi] = (uint8_t)v; break;
+                    default:      ((int64_t*)ray_data(new_col))[gi] = v; break;
+                }
+            }
+        }
+        /* Suffix naming the aggregate; shared by both naming schemes below (empty for other ops). */
+        const char* sfx = "";
+        size_t slen = 0;
+        switch (agg_op) {
+            case OP_SUM:   sfx = "_sum";   slen = 4; break;
+            case OP_COUNT: sfx = "_count"; slen = 6; break;
+            case OP_AVG:   sfx = "_mean";  slen = 5; break;
+            case OP_MIN:   sfx = "_min";   slen = 4; break;
+            case OP_MAX:   sfx = "_max";   slen = 4; break;
+            case OP_FIRST: sfx = "_first"; slen = 6; break;
+            case OP_LAST:  sfx = "_last";  slen = 5; break;
+            case OP_STDDEV:     sfx = "_stddev";     slen = 7; break;
+            case OP_STDDEV_POP: sfx = "_stddev_pop"; slen = 11; break;
+            case OP_VAR:        sfx = "_var";        slen = 4; break;
+            case OP_VAR_POP:    sfx = "_var_pop";    slen = 8; break;
+        }
+        /* Generate unique column name: base_name + agg suffix (e.g. "v1_sum") */
+        ray_op_ext_t* agg_ext = find_ext(g, ext->agg_ins[a]->id);
+        int64_t name_id;
+        if (agg_ext && agg_ext->base.opcode == OP_SCAN) {
+            ray_t* name_atom = ray_sym_str(agg_ext->sym);
+            const char* base = name_atom ? ray_str_ptr(name_atom) : NULL;
+            size_t blen = base ? ray_str_len(name_atom) : 0;
+            char buf[256];
+            if (base && blen + slen < sizeof(buf)) {
+                memcpy(buf, base, blen);
+                memcpy(buf + blen, sfx, slen);
+                name_id = ray_sym_intern(buf, blen + slen);
+            } else {
+                /* No base string, or the combined name does not fit: reuse the scanned symbol. */
+                name_id = agg_ext->sym;
+            }
+        } else {
+            /* Expression agg input — synthetic name like "_e0_sum" */
+            char nbuf[32];
+            int np = 0;
+            nbuf[np++] = '_'; nbuf[np++] = 'e';
+            /* Multi-digit agg index */
+            { uint8_t v = a; char dig[3]; int nd = 0;
+              do { dig[nd++] = (char)('0' + v % 10); v /= 10; } while (v);
+              while (nd--) nbuf[np++] = dig[nd]; }
+            memcpy(nbuf + np, sfx, slen);
+            name_id = ray_sym_intern(nbuf, (size_t)np + slen);
+        }
+        *result = ray_table_add_col(*result, name_id, new_col);
+        ray_release(new_col);
+    }
+}
+
+/* Bitmask for which accumulator arrays are actually needed (carried in the need_flags fields) */
+#define DA_NEED_SUM   0x01  /* da_val_t sum array */
+#define DA_NEED_MIN   0x02  /* da_val_t min_val array */
+#define DA_NEED_MAX   0x04  /* da_val_t max_val array */
+#define DA_NEED_COUNT 0x08  /* count array */
+#define DA_NEED_SUMSQ 0x10  /* sumsq_f64 array (for STDDEV/VAR) */
+
+typedef struct {
+    da_accum_t*    accums;       /* accumulator sets; da_accum_fn indexes this by worker id */
+    uint32_t       n_accums;     /* number of accumulator sets (may < pool workers) */
+    void**         key_ptrs;     /* key data pointers [n_keys] */
+    int8_t*        key_types;    /* key type codes [n_keys] */
+    uint8_t*       key_attrs;    /* key attrs for RAY_SYM width [n_keys] */
+    uint8_t*       key_esz;      /* pre-computed per-key elem size [n_keys] */
+    int64_t*       key_mins;     /* per-key minimum [n_keys] */
+    int64_t*       key_strides;  /* per-key stride [n_keys] */
+    uint8_t        n_keys;       /* number of keys; 1 selects the single-key loop in da_accum_fn */
+    void**         agg_ptrs;     /* agg input data pointers [n_aggs]; a NULL entry is skipped */
+    int8_t*        agg_types;    /* agg input type codes [n_aggs] */
+    uint16_t*      agg_ops;      /* per-agg operation code */
+    uint8_t        n_aggs;       /* number of aggregates; also the per-slot stride of the accumulator arrays */
+    uint8_t        need_flags;   /* DA_NEED_* bitmask */
+    uint32_t       agg_f64_mask; /* bitmask: bit a set if agg[a] is RAY_F64 */
+    bool           all_sum;      /* true when all ops are SUM/AVG/COUNT (no MIN/MAX/FIRST/LAST) */
+    uint32_t       n_slots;      /* slot count; da_accum_fn enables prefetching at >= 4096 */
+    const int64_t* match_idx;    /* NULL = no selection */
+} da_ctx_t;
+
+/* Composite GID from multi-key: sum of (key - key_mins[k]) * key_strides[k]. Overflow is prevented in practice by the DA budget check (DA_PER_WORKER_MAX), which limits total_slots to 262K. */
+static inline int32_t da_composite_gid(da_ctx_t* c, int64_t r) {
+    int32_t gid = 0;
+    for (uint8_t k = 0; k < c->n_keys; k++) {
+        int64_t val = read_by_esz(c->key_ptrs[k], r, c->key_esz[k]);
+        /* read_by_esz sees only the width; key_mins for RAY_I16 come from a signed scan, so sign-extend to match. */
+        if (c->key_esz[k] == 2 && c->key_types && c->key_types[k] == RAY_I16) val = (int16_t)val;
+        gid += (int32_t)((val - c->key_mins[k]) * c->key_strides[k]);
+    }
+    return gid; }
+
+/* Typed composite GID: eliminates per-element switch when all keys share width. 2-byte keys typed RAY_I16 are sign-extended to match their signed key_mins; the check folds away for other widths. */
+#define DEFINE_DA_COMPOSITE_GID_TYPED(SUFFIX, KTYPE) \
+static inline int32_t da_composite_gid_##SUFFIX(da_ctx_t* c, int64_t r) { int32_t gid = 0; \
+    for (uint8_t k = 0; k < c->n_keys; k++) { \
+        int64_t val = (int64_t)((const KTYPE*)c->key_ptrs[k])[r]; \
+        if (sizeof(KTYPE) == 2 && c->key_types && c->key_types[k] == RAY_I16) val = (int16_t)val; \
+        gid += (int32_t)((val - c->key_mins[k]) * c->key_strides[k]); \
+    } \
+    return gid; \
+}
+DEFINE_DA_COMPOSITE_GID_TYPED(u8,  uint8_t)
+DEFINE_DA_COMPOSITE_GID_TYPED(u16, uint16_t)
+DEFINE_DA_COMPOSITE_GID_TYPED(u32, uint32_t)
+DEFINE_DA_COMPOSITE_GID_TYPED(i64, int64_t)
+#undef DEFINE_DA_COMPOSITE_GID_TYPED
+
+static inline void da_read_val(const void* ptr, int8_t type, uint8_t attrs,
+                               int64_t r, double* out_f64, int64_t* out_i64) {
+    if (type != RAY_F64) {
+        /* Non-F64 column: widen through read_col_i64, then mirror the value as a double. */
+        *out_f64 = (double)(*out_i64 = read_col_i64(ptr, r, type, attrs));
+        return;
+    }
+    /* F64 column: load the double, then mirror it as an int64 via a plain cast. */
+    *out_i64 = (int64_t)(*out_f64 = ((const double*)ptr)[r]);
+}
+
+/* Materialize a scalar (atom or len-1 vector) into a full-length vector so group-aggregation loops can read row-wise without out-of-bounds access.
+ * Returns NULL for a NULL/error src, negative nrows, an unsupported type or an empty source vector; an allocation error is returned as-is. */
+static ray_t* materialize_broadcast_input(ray_t* src, int64_t nrows) {
+    if (!src || RAY_IS_ERR(src) || nrows < 0) return NULL;
+    int8_t out_type = ray_is_atom(src) ? (int8_t)-src->type : src->type;
+    if (out_type <= 0 || out_type >= RAY_TYPE_COUNT) return NULL;
+
+    ray_t* out = ray_vec_new(out_type, nrows);
+    if (!out || RAY_IS_ERR(out)) return out;
+    out->len = nrows;
+    if (nrows == 0) return out;
+
+    if (!ray_is_atom(src)) {
+        if (src->len < 1) { ray_release(out); return NULL; }  /* no element 0 to replicate */
+        uint8_t esz = col_esz(src);  /* NOTE(review): assumes out has src's element size — confirm for narrow RAY_SYM */
+        const char* s = (const char*)ray_data(src);
+        char* d = (char*)ray_data(out);
+        for (int64_t i = 0; i < nrows; i++)
+            memcpy(d + (size_t)i * esz, s, esz);
+        return out;
+    }
+
+    switch (src->type) {
+        case -RAY_F64: {
+            double v = src->f64;
+            for (int64_t i = 0; i < nrows; i++) ((double*)ray_data(out))[i] = v;
+            return out;
+        }
+        case -RAY_I64:
+        case -RAY_SYM:
+        case -RAY_TIMESTAMP: {
+            int64_t v = src->i64;
+            for (int64_t i = 0; i < nrows; i++) ((int64_t*)ray_data(out))[i] = v;
+            return out;
+        }
+        case -RAY_DATE:
+        case -RAY_TIME: {
+            int32_t v = (int32_t)src->i64;
+            for (int64_t i = 0; i < nrows; i++) ((int32_t*)ray_data(out))[i] = v;
+            return out;
+        }
+        case -RAY_I32: {
+            int32_t v = src->i32;
+            for (int64_t i = 0; i < nrows; i++) ((int32_t*)ray_data(out))[i] = v;
+            return out;
+        }
+        case -RAY_I16: {
+            int16_t v = src->i16;
+            for (int64_t i = 0; i < nrows; i++) ((int16_t*)ray_data(out))[i] = v;
+            return out;
+        }
+        case -RAY_U8:
+        case -RAY_BOOL: {
+            uint8_t v = src->u8;
+            for (int64_t i = 0; i < nrows; i++) ((uint8_t*)ray_data(out))[i] = v;
+            return out;
+        }
+        default:  /* atom type with no broadcast rule: drop the allocated vector */
+            ray_release(out);
+            return NULL;
+    }
+}
+
+/* ---- Scalar aggregate (n_keys==0): one flat scan, no GID, no hash ---- */
+typedef struct {
+    void**         agg_ptrs;     /* agg input data pointers; a NULL entry is skipped unless agg_linear[a] is enabled */
+    int8_t*        agg_types;    /* agg input type codes; RAY_F64 selects the .f accumulator cell */
+    uint16_t*      agg_ops;      /* per-agg operation code */
+    agg_linear_t*  agg_linear;   /* per-agg linear expressions; may be NULL in scalar_accum_row */
+    uint8_t        n_aggs;
+    uint8_t        need_flags;   /* presumably a DA_NEED_* bitmask as in da_ctx_t — not read by the scan functions */
+    const int64_t* match_idx;    /* NULL = no selection */
+    /* per-worker accumulators (1 slot each) */
+    da_accum_t*    accums;       /* indexed by worker_id */
+    uint32_t       n_accums;
+} scalar_ctx_t;
+
+static inline int64_t scalar_i64_at(const void* ptr, int8_t type, int64_t r) {
+    return read_col_i64(ptr, r, type, 0);  /* row r as int64; attrs=0: agg columns are numeric, never SYM */
+}
+
+/* Tight SIMD-friendly loop for single SUM/AVG on i64 (no mask).
+ * Note: int64 sum can overflow; caller responsibility to use appropriate types. */
+static void scalar_sum_i64_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) {
+    scalar_ctx_t* sc = (scalar_ctx_t*)ctx;
+    const int64_t* restrict col = (const int64_t*)sc->agg_ptrs[0];
+    int64_t partial = 0;  /* summed locally, published to the worker's accumulator once */
+    for (int64_t row = start; row < end; row++)
+        partial += col[row];
+    da_accum_t* mine = &sc->accums[worker_id];
+    mine->sum[0].i += partial;
+    mine->count[0] += end - start;
+}
+
+/* Tight SIMD-friendly loop for single SUM/AVG on f64 (no mask) */
+static void scalar_sum_f64_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) {
+    scalar_ctx_t* sc = (scalar_ctx_t*)ctx;
+    const double* restrict col = (const double*)sc->agg_ptrs[0];
+    double partial = 0.0;  /* rows are added in ascending order, then published once */
+    for (int64_t row = start; row < end; row++)
+        partial += col[row];
+    da_accum_t* mine = &sc->accums[worker_id];
+    mine->sum[0].f += partial;
+    mine->count[0] += end - start;
+}
+
+/* Tight loop for single SUM/AVG on integer linear expression (no mask). */
+static void scalar_sum_linear_i64_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) {
+    scalar_ctx_t* sc = (scalar_ctx_t*)ctx;
+    const agg_linear_t* expr = &sc->agg_linear[0];
+    const int64_t n_rows = end - start;
+
+    /* Sum of (bias + sum_t coeff[t] * term[t][row]) over rows == bias * n_rows + sum_t coeff[t] * colsum(term[t]). */
+    int64_t partial = expr->bias_i64 * n_rows;
+    for (uint8_t t = 0; t < expr->n_terms; t++) {
+        const int64_t k = expr->coeff_i64[t];
+        if (k != 0) {  /* a zero coefficient contributes nothing, so its column is not scanned */
+            int64_t col_sum = 0;
+            for (int64_t row = start; row < end; row++)
+                col_sum += scalar_i64_at(expr->term_ptrs[t], expr->term_types[t], row);
+            partial += k * col_sum;
+        }
+    }
+
+    da_accum_t* mine = &sc->accums[worker_id];
+    mine->sum[0].i += partial;
+    mine->count[0] += n_rows;
+}
+
+/* Generic scalar accumulation: handles all ops, all types, mask */
+/* Folds row r into the single-slot accumulator acc: bumps the row count, then updates each aggregate. */
+static inline void scalar_accum_row(scalar_ctx_t* c, da_accum_t* acc, int64_t r) {
+    const int64_t seen = ++acc->count[0];
+    for (uint8_t a = 0; a < c->n_aggs; a++) {
+        const agg_linear_t* lin = (c->agg_linear && c->agg_linear[a].enabled) ? &c->agg_linear[a] : NULL;
+        double as_f64; int64_t as_i64;
+        if (lin) {
+            as_i64 = lin->bias_i64;
+            for (uint8_t t = 0; t < lin->n_terms; t++)
+                as_i64 += lin->coeff_i64[t] * scalar_i64_at(lin->term_ptrs[t], lin->term_types[t], r);
+            as_f64 = (double)as_i64;
+        } else {
+            if (!c->agg_ptrs[a]) continue;  /* no input column for this aggregate */
+            da_read_val(c->agg_ptrs[a], c->agg_types[a], 0, r, &as_f64, &as_i64);
+        }
+        const bool use_f64 = (c->agg_types[a] == RAY_F64);
+        switch (c->agg_ops[a]) {
+            case OP_SUM: case OP_AVG: case OP_STDDEV: case OP_STDDEV_POP: case OP_VAR: case OP_VAR_POP:
+                if (use_f64) acc->sum[a].f += as_f64;
+                else         acc->sum[a].i += as_i64;
+                if (acc->sumsq_f64) acc->sumsq_f64[a] += as_f64 * as_f64;
+                break;
+            case OP_FIRST:
+                if (seen != 1) break;  /* only the first row this accumulator sees is kept */
+                /* fall through */
+            case OP_LAST:
+                if (use_f64) acc->sum[a].f = as_f64; else acc->sum[a].i = as_i64;
+                break;
+            case OP_MIN:
+                if (use_f64) { if (as_f64 < acc->min_val[a].f) acc->min_val[a].f = as_f64; }
+                else         { if (as_i64 < acc->min_val[a].i) acc->min_val[a].i = as_i64; }
+                break;
+            case OP_MAX:
+                if (use_f64) { if (as_f64 > acc->max_val[a].f) acc->max_val[a].f = as_f64; }
+                else         { if (as_i64 > acc->max_val[a].i) acc->max_val[a].i = as_i64; }
+                break;
+        }
+    }
+}
+
+static void scalar_accum_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) {
+    scalar_ctx_t* sc = (scalar_ctx_t*)ctx;
+    da_accum_t* mine = &sc->accums[worker_id];
+    const int64_t* sel = sc->match_idx;
+
+    /* Walk positions [start, end); with a selection vector each position is
+     * translated to its row index before the row is accumulated. */
+    for (int64_t pos = start; pos < end; pos++)
+        scalar_accum_row(sc, mine, sel ? sel[pos] : pos);
+}
+
+/* Inner DA accumulation for a single row — shared by single-key and multi-key paths.
+ * Fast path for SUM/AVG-only queries: eliminates op-code dispatch and da_read_val
+ * dual-write overhead.  The branch on c->all_sum is perfectly predicted (invariant
+ * across all rows).  Integer sums wrap via uint64_t on both paths (no signed-overflow UB). */
+static inline void da_accum_row(da_ctx_t* c, da_accum_t* acc, int32_t gid, int64_t r) {
+    uint8_t n_aggs = c->n_aggs;
+    acc->count[gid]++;
+    size_t base = (size_t)gid * n_aggs;  /* first cell of slot gid in the [n_slots * n_aggs] arrays */
+
+    if (RAY_LIKELY(c->all_sum)) {
+        /* SUM/AVG/COUNT fast path — no op-code dispatch, typed read only.
+         * COUNT-only queries have acc->sum==NULL; count[gid]++ above suffices. */
+        if (!acc->sum) return;
+        uint32_t f64m = c->agg_f64_mask;
+        for (uint8_t a = 0; a < n_aggs; a++) {
+            if (!c->agg_ptrs[a]) continue;
+            size_t idx = base + a;
+            if (f64m & (1u << a))
+                acc->sum[idx].f += ((const double*)c->agg_ptrs[a])[r];
+            else
+                acc->sum[idx].i = (int64_t)((uint64_t)acc->sum[idx].i +
+                    (uint64_t)read_col_i64(c->agg_ptrs[a], r, c->agg_types[a], 0));
+        }
+        return;
+    }
+
+    for (uint8_t a = 0; a < n_aggs; a++) {
+        if (!c->agg_ptrs[a]) continue;
+        size_t idx = base + a;
+        double fv; int64_t iv;
+        da_read_val(c->agg_ptrs[a], c->agg_types[a], 0, r, &fv, &iv);
+        uint16_t op = c->agg_ops[a];
+        if (op == OP_SUM || op == OP_AVG || op == OP_STDDEV || op == OP_STDDEV_POP || op == OP_VAR || op == OP_VAR_POP) {
+            if (c->agg_types[a] == RAY_F64) acc->sum[idx].f += fv;
+            else acc->sum[idx].i = (int64_t)((uint64_t)acc->sum[idx].i + (uint64_t)iv);
+            if (acc->sumsq_f64) acc->sumsq_f64[idx] += fv * fv;
+        } else if (op == OP_FIRST) {
+            if (acc->count[gid] == 1) {  /* count was bumped above, so 1 means first row of this slot */
+                if (c->agg_types[a] == RAY_F64) acc->sum[idx].f = fv;
+                else acc->sum[idx].i = iv;
+            }
+        } else if (op == OP_LAST) {
+            if (c->agg_types[a] == RAY_F64) acc->sum[idx].f = fv;
+            else acc->sum[idx].i = iv;
+        } else if (op == OP_MIN) {
+            if (c->agg_types[a] == RAY_F64) {
+                if (fv < acc->min_val[idx].f) acc->min_val[idx].f = fv;
+            } else {
+                if (iv < acc->min_val[idx].i) acc->min_val[idx].i = iv;
+            }
+        } else if (op == OP_MAX) {
+            if (c->agg_types[a] == RAY_F64) {
+                if (fv > acc->max_val[idx].f) acc->max_val[idx].f = fv;
+            } else {
+                if (iv > acc->max_val[idx].i) acc->max_val[idx].i = iv;
+            }
+        }
+    }
+}
+
+static void da_accum_fn(void* ctx, uint32_t worker_id, int64_t start, int64_t end) {
+    da_ctx_t* c = (da_ctx_t*)ctx;
+    da_accum_t* acc = &c->accums[worker_id];  /* this worker's private accumulator set */
+    uint8_t n_aggs = c->n_aggs;
+    uint8_t n_keys = c->n_keys;
+    const int64_t* match_idx = c->match_idx;
+    /* Fast path: single key — avoid composite GID loop overhead.
+     * Templated by key element size: the entire loop is stamped out per width
+     * so the compiler generates direct movzbl/movzwl/movl/movq — zero dispatch.
+     * 2-byte keys typed RAY_I16 are read signed so they match their signed key_mins. */
+    #define DA_PF_DIST 8
+    #define DA_SINGLE_KEY_LOOP(KTYPE, KCAST) \
+    do { \
+        const KTYPE* kp = (const KTYPE*)c->key_ptrs[0]; \
+        int64_t kmin = c->key_mins[0]; \
+        bool da_pf = c->n_slots >= 4096; \
+        for (int64_t i = start; i < end; i++) { \
+            int64_t r = match_idx ? match_idx[i] : i; \
+            if (da_pf && RAY_LIKELY(i + DA_PF_DIST < end)) { \
+                int64_t pf_r = match_idx ? match_idx[i + DA_PF_DIST] : (i + DA_PF_DIST); \
+                int64_t pfk = (int64_t)KCAST kp[pf_r]; \
+                __builtin_prefetch(&acc->count[(int32_t)(pfk - kmin)], 1, 1); \
+                if (acc->sum) __builtin_prefetch( \
+                    &acc->sum[(size_t)(int32_t)(pfk - kmin) * n_aggs], 1, 1); \
+            } \
+            int64_t kv = (int64_t)KCAST kp[r]; \
+            da_accum_row(c, acc, (int32_t)(kv - kmin), r); \
+        } \
+    } while (0)
+
+    if (n_keys == 1) {
+        switch (c->key_esz[0]) {
+        case 1: DA_SINGLE_KEY_LOOP(uint8_t, ); break;
+        case 2: if (c->key_types && c->key_types[0] == RAY_I16) DA_SINGLE_KEY_LOOP(int16_t, );
+                else DA_SINGLE_KEY_LOOP(uint16_t, ); break;
+        case 4: DA_SINGLE_KEY_LOOP(uint32_t, (int64_t)); break;
+        default: DA_SINGLE_KEY_LOOP(int64_t, ); break;
+        }
+        #undef DA_SINGLE_KEY_LOOP
+        return;
+    }
+
+    /* Multi-key composite GID — typed inner loop eliminates read_by_esz switch.
+     * When all keys share the same element size, use da_composite_gid_XX(). */
+    #define DA_MULTI_KEY_LOOP(GID_FN) \
+    do { \
+        bool _da_pf = c->n_slots >= 4096; \
+        for (int64_t i = start; i < end; i++) { \
+            int64_t r = match_idx ? match_idx[i] : i; \
+            if (_da_pf && RAY_LIKELY(i + DA_PF_DIST < end)) { \
+                int64_t pf_r = match_idx ? match_idx[i + DA_PF_DIST] : (i + DA_PF_DIST); \
+                int32_t pf_gid = GID_FN(pf_r); \
+                __builtin_prefetch(&acc->count[pf_gid], 1, 1); \
+                if (acc->sum) __builtin_prefetch(&acc->sum[(size_t)pf_gid * n_aggs], 1, 1); \
+            } \
+            da_accum_row(c, acc, GID_FN(r), r); \
+        } \
+    } while (0)
+
+    /* Check if all keys share the same element size */
+    bool uniform_esz = true;
+    for (uint8_t k = 1; k < n_keys; k++)
+        if (c->key_esz[k] != c->key_esz[0]) { uniform_esz = false; break; }
+
+    if (uniform_esz) {
+        switch (c->key_esz[0]) {
+        case 1:
+#define GID_FN(R) da_composite_gid_u8(c, (R))
+            DA_MULTI_KEY_LOOP(GID_FN);
+#undef GID_FN
+            break;
+        case 2:
+#define GID_FN(R) da_composite_gid_u16(c, (R))
+            DA_MULTI_KEY_LOOP(GID_FN);
+#undef GID_FN
+            break;
+        case 4:
+#define GID_FN(R) da_composite_gid_u32(c, (R))
+            DA_MULTI_KEY_LOOP(GID_FN);
+#undef GID_FN
+            break;
+        default:
+#define GID_FN(R) da_composite_gid_i64(c, (R))
+            DA_MULTI_KEY_LOOP(GID_FN);
+#undef GID_FN
+            break;
+        }
+    } else {
+#define GID_FN(R) da_composite_gid(c, (R))
+        DA_MULTI_KEY_LOOP(GID_FN);
+#undef GID_FN
+    }
+    #undef DA_MULTI_KEY_LOOP
+    #undef DA_PF_DIST
+}
+
+/* Parallel DA merge: merge per-worker accumulators into accums[0] by
+ * dispatching disjoint slot ranges across pool workers. */
+typedef struct {
+    da_accum_t* accums;        /* [n_src_workers]; entry 0 is the merge destination */
+    uint32_t    n_src_workers; /* number of source workers to merge (1..n) */
+    uint8_t     need_flags;    /* DA_NEED_* bits selecting which arrays are merged */
+    uint8_t     n_aggs;        /* aggregates per slot (stride of the per-agg arrays) */
+    const int8_t* agg_types;  /* per-agg value type (for typed merge) */
+    const uint16_t* agg_ops;  /* per-agg opcode (for FIRST/LAST merge) */
+} da_merge_ctx_t;
+
+static void da_merge_fn(void* ctx, uint32_t wid, int64_t start, int64_t end) {  /* merges slots [start, end) */
+    (void)wid;
+    da_merge_ctx_t* c = (da_merge_ctx_t*)ctx;
+    da_accum_t* merged = &c->accums[0];  /* worker 0's accumulator doubles as the destination */
+    uint8_t n_aggs = c->n_aggs;
+    const int8_t* agg_types = c->agg_types;
+    for (uint32_t w = 1; w < c->n_src_workers; w++) {  /* fold workers 1..n-1 in ascending order */
+        da_accum_t* wa = &c->accums[w];
+        for (int64_t s = start; s < end; s++) {
+            size_t base = (size_t)s * n_aggs;
+            if (c->need_flags & DA_NEED_SUMSQ) {
+                for (uint8_t a = 0; a < n_aggs; a++)
+                    merged->sumsq_f64[base + a] += wa->sumsq_f64[base + a];
+            }
+            if (c->need_flags & DA_NEED_SUM) {
+                for (uint8_t a = 0; a < n_aggs; a++) {
+                    size_t idx = base + a;
+                    uint16_t aop = c->agg_ops ? c->agg_ops[a] : OP_SUM;  /* no opcode table: treat every agg as a sum */
+                    if (aop == OP_FIRST) {
+                        /* Keep worker 0 value; take from w only if merged has no data */
+                        if (merged->count[s] == 0 && wa->count[s] > 0)
+                            merged->sum[idx] = wa->sum[idx];
+                    } else if (aop == OP_LAST) {
+                        /* Overwrite with last worker that has data */
+                        if (wa->count[s] > 0)
+                            merged->sum[idx] = wa->sum[idx];
+                    } else if (agg_types[a] == RAY_F64)
+                        merged->sum[idx].f += wa->sum[idx].f;
+                    else
+                        merged->sum[idx].i += wa->sum[idx].i;
+                }
+            }
+            if (c->need_flags & DA_NEED_MIN) {
+                for (uint8_t a = 0; a < n_aggs; a++) {
+                    size_t idx = base + a;
+                    if (agg_types[a] == RAY_F64) {
+                        if (wa->min_val[idx].f < merged->min_val[idx].f)
+                            merged->min_val[idx].f = wa->min_val[idx].f;
+                    } else {
+                        if (wa->min_val[idx].i < merged->min_val[idx].i)
+                            merged->min_val[idx].i = wa->min_val[idx].i;
+                    }
+                }
+            }
+            if (c->need_flags & DA_NEED_MAX) {
+                for (uint8_t a = 0; a < n_aggs; a++) {
+                    size_t idx = base + a;
+                    if (agg_types[a] == RAY_F64) {
+                        if (wa->max_val[idx].f > merged->max_val[idx].f)
+                            merged->max_val[idx].f = wa->max_val[idx].f;
+                    } else {
+                        if (wa->max_val[idx].i > merged->max_val[idx].i)
+                            merged->max_val[idx].i = wa->max_val[idx].i;
+                    }
+                }
+            }
+            merged->count[s] += wa->count[s];  /* must stay last: the FIRST merge tests merged->count[s] before w is added */
+        }
+    }
+}
+
+/* ============================================================================
+ * Partition-aware group-by: detect parted columns, then either aggregate each
+ * partition and merge, or concatenate segments and run standard exec_group once.
+ * ============================================================================ */
+ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl,
+                  int64_t group_limit); /* forward decl */
+
+/* Forward declaration — defined below exec_group */
+static ray_t* exec_group_per_partition(ray_t* parted_tbl, ray_op_ext_t* ext,
+                                       int32_t n_parts, const int64_t* key_syms,
+                                       const int64_t* agg_syms, int has_avg,
+                                       int has_stddev, int64_t group_limit);
+
+/* --------------------------------------------------------------------------
+ * exec_group_parted — GROUP BY over a table with parted / MAPCOMMON columns.
+ *
+ * Two strategies, tried in order:
+ *   1. Per-partition exec + merge (exec_group_per_partition).  Eligible only
+ *      when every key and agg input is a plain column SCAN, every agg op is
+ *      in the supported set, and the cardinality gate below passes.
+ *   2. Concat fallback: flatten the referenced columns (or every column when
+ *      an expression key/agg is involved) into one table and run exec_group
+ *      on it once.
+ *
+ * g           graph owning `op`; g->table is swapped to the flat table while
+ *             the fallback runs and restored afterwards.
+ * op          the GROUP BY op; its ext record supplies keys and aggs.
+ * parted_tbl  input table.
+ * group_limit forwarded to the per-partition path.
+ *
+ * Returns the result table, or an error object: "nyi" for shapes this
+ * function cannot handle, "oom" when building the flat table fails,
+ * "domain" when a column's segments do not fit the table's row count.
+ * -------------------------------------------------------------------------- */
+static ray_t* exec_group_parted(ray_graph_t* g, ray_op_t* op, ray_t* parted_tbl,
+                               int64_t group_limit) {
+    int64_t ncols = ray_table_ncols(parted_tbl);
+    if (ncols <= 0) return ray_error("nyi", NULL);
+
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    uint8_t n_keys = ext->n_keys;
+    uint8_t n_aggs = ext->n_aggs;
+
+    /* key_syms[8], agg_syms[8] and needed[16] below are fixed-size stack
+     * arrays indexed by n_keys / n_aggs.  exec_group only applies its own
+     * "> 8" check after dispatching here, so reject oversized queries up
+     * front with the same error instead of writing past the arrays. */
+    if (n_keys > 8 || n_aggs > 8) return ray_error("nyi", NULL);
+
+    /* Find partition count and total rows from first parted column */
+    int32_t n_parts = 0;
+    int64_t total_rows = 0;
+    for (int64_t c = 0; c < ncols; c++) {
+        ray_t* col = ray_table_get_col_idx(parted_tbl, c);
+        if (col && RAY_IS_PARTED(col->type)) {
+            n_parts = (int32_t)col->len;
+            total_rows = ray_parted_nrows(col);
+            break;
+        }
+    }
+    if (n_parts <= 0 || total_rows <= 0) return ray_error("nyi", NULL);
+
+    /* Check eligibility for per-partition exec + merge:
+     * - All keys and agg inputs must be simple SCANs
+     * - Supported agg ops: SUM, COUNT, MIN, MAX, AVG, FIRST, LAST,
+     *   STDDEV, STDDEV_POP, VAR, VAR_POP */
+    int can_partition = 1;
+    int has_avg = 0;
+    int has_stddev = 0;
+    int64_t key_syms[8];
+    for (uint8_t k = 0; k < n_keys && can_partition; k++) {
+        ray_op_ext_t* ke = find_ext(g, ext->keys[k]->id);
+        if (!ke || ke->base.opcode != OP_SCAN) { can_partition = 0; break; }
+        key_syms[k] = ke->sym;
+    }
+    int64_t agg_syms[8];
+    for (uint8_t a = 0; a < n_aggs && can_partition; a++) {
+        uint16_t aop = ext->agg_ops[a];
+        if (aop != OP_SUM && aop != OP_COUNT && aop != OP_MIN &&
+            aop != OP_MAX && aop != OP_AVG && aop != OP_FIRST &&
+            aop != OP_LAST && aop != OP_STDDEV && aop != OP_STDDEV_POP &&
+            aop != OP_VAR && aop != OP_VAR_POP) { can_partition = 0; break; }
+        if (aop == OP_AVG) has_avg = 1;
+        if (aop == OP_STDDEV || aop == OP_STDDEV_POP ||
+            aop == OP_VAR || aop == OP_VAR_POP) has_stddev = 1;
+        ray_op_ext_t* ae = find_ext(g, ext->agg_ins[a]->id);
+        if (!ae || ae->base.opcode != OP_SCAN) { can_partition = 0; break; }
+        agg_syms[a] = ae->sym;
+    }
+
+    /* Cardinality gate: estimate groups from first partition.
+     * Per-partition only wins when #groups << partition_size. */
+    if (can_partition) {
+        int64_t rows_per_part = total_rows / n_parts;
+        int64_t est_groups = 1;
+        for (uint8_t k = 0; k < n_keys; k++) {
+            ray_t* pcol = ray_table_get_col(parted_tbl, key_syms[k]);
+            if (!pcol) { est_groups = rows_per_part; break; }
+            /* MAPCOMMON key: constant per partition — excluded from
+             * per-partition sub-GROUP-BY, contributes 0 to cardinality. */
+            if (pcol->type == RAY_MAPCOMMON) { continue; }
+            if (!RAY_IS_PARTED(pcol->type)) { est_groups = rows_per_part; break; }
+            /* A parted column with no segments has no "first partition". */
+            if (pcol->len <= 0) { est_groups = rows_per_part; break; }
+            ray_t* seg0 = ((ray_t**)ray_data(pcol))[0];
+            if (!seg0 || seg0->len <= 0) { est_groups = rows_per_part; break; }
+            int8_t bt = RAY_PARTED_BASETYPE(pcol->type);
+            int64_t card;
+            if (RAY_IS_SYM(bt)) {
+                /* Distinct symbol ids, counted with a bitmap sized by the
+                 * symbol count. */
+                uint32_t sym_n = ray_sym_count();
+                if (sym_n == 0 || sym_n > 4194304) { est_groups = rows_per_part; break; }
+                size_t bwords = ((size_t)sym_n + 63) / 64;
+                ray_t* bits_hdr = NULL;
+                uint64_t* bits = (uint64_t*)scratch_calloc(&bits_hdr, bwords * 8);
+                if (!bits) { est_groups = rows_per_part; break; }
+                for (int64_t r = 0; r < seg0->len; r++) {
+                    uint32_t id = (uint32_t)ray_read_sym(ray_data(seg0), r, seg0->type, seg0->attrs);
+                    size_t word = (size_t)id / 64;
+                    /* Never write outside the bitmap, whatever the id is. */
+                    if (word < bwords) bits[word] |= 1ULL << (id % 64);
+                }
+                card = 0;
+                for (size_t i = 0; i < bwords; i++)
+                    card += __builtin_popcountll(bits[i]);
+                scratch_free(bits_hdr);
+            } else if (bt == RAY_I64) {
+                /* Value range as an upper bound on distinct values. */
+                const int64_t* v = (const int64_t*)ray_data(seg0);
+                int64_t lo = v[0], hi = v[0];
+                for (int64_t r = 1; r < seg0->len; r++) {
+                    if (v[r] < lo) lo = v[r];
+                    if (v[r] > hi) hi = v[r];
+                }
+                /* hi >= lo, so the unsigned difference is exact; saturate
+                 * instead of overflowing the signed "hi - lo + 1". */
+                uint64_t span = (uint64_t)hi - (uint64_t)lo;
+                card = (span >= (uint64_t)INT64_MAX) ? INT64_MAX
+                                                     : (int64_t)span + 1;
+            } else if (bt == RAY_I32) {
+                const int32_t* v = (const int32_t*)ray_data(seg0);
+                int32_t lo = v[0], hi = v[0];
+                for (int64_t r = 1; r < seg0->len; r++) {
+                    if (v[r] < lo) lo = v[r];
+                    if (v[r] > hi) hi = v[r];
+                }
+                /* Widen before subtracting: the 32-bit difference can overflow. */
+                card = (int64_t)hi - (int64_t)lo + 1;
+            } else {
+                card = seg0->len;
+            }
+            if (card < 1) card = 1;
+            /* Same test as "est_groups * card > rows_per_part", written as a
+             * division so the product cannot overflow.  est_groups is >= 1
+             * here: every path that sets it to rows_per_part also breaks. */
+            if (card > rows_per_part / est_groups) { est_groups = rows_per_part; break; }
+            est_groups *= card;
+        }
+        /* Block per-partition when cardinality is high AND the concat
+         * fallback would fit in memory (< 4 GB estimated).  When concat is
+         * too large, per-partition with batched merge is the only option. */
+        int64_t concat_bytes = total_rows * 8LL * (int64_t)(n_keys + n_aggs);
+        if (est_groups * 100 > rows_per_part &&
+            concat_bytes < 4LL * 1024 * 1024 * 1024)
+            can_partition = 0;
+    }
+
+    /* Try the per-partition path (kept in a separate function) */
+    if (can_partition) {
+        ray_t* result = exec_group_per_partition(parted_tbl, ext, n_parts,
+                                                 key_syms, agg_syms, has_avg,
+                                                 has_stddev, group_limit);
+        if (result) return result;
+        /* NULL = per-partition failed, fall through to concat */
+    }
+
+    /* ---- Concat-only-needed-columns fallback ----
+     * Only concatenates the columns actually referenced by the GROUP BY,
+     * unless a key or agg input is an expression, in which case every
+     * column is copied so the expression can be evaluated. */
+    {
+        /* Collect needed column sym IDs (keys + agg inputs), de-duplicated.
+         * At most 8 + 8 entries thanks to the guard at the top. */
+        int64_t needed[16];
+        int n_needed = 0;
+        int need_all = 0;
+        for (uint8_t k = 0; k < n_keys && !need_all; k++) {
+            ray_op_ext_t* ke = find_ext(g, ext->keys[k]->id);
+            if (!ke || ke->base.opcode != OP_SCAN) {
+                /* Expression key — exec_group evaluates it against the flat
+                 * table, so the columns it reads must be present too. */
+                need_all = 1;
+                break;
+            }
+            int dup = 0;
+            for (int i = 0; i < n_needed; i++)
+                if (needed[i] == ke->sym) { dup = 1; break; }
+            if (!dup) needed[n_needed++] = ke->sym;
+        }
+        for (uint8_t a = 0; a < n_aggs && !need_all; a++) {
+            ray_op_ext_t* ae = find_ext(g, ext->agg_ins[a]->id);
+            if (!ae || ae->base.opcode != OP_SCAN) {
+                /* Expression agg input — need all columns for evaluation. */
+                need_all = 1;
+                break;
+            }
+            int dup = 0;
+            for (int i = 0; i < n_needed; i++)
+                if (needed[i] == ae->sym) { dup = 1; break; }
+            if (!dup) needed[n_needed++] = ae->sym;
+        }
+        if (need_all) n_needed = 0;
+
+        /* Build flat table with only needed columns (or all if n_needed==0) */
+        int64_t cols_to_iter = n_needed > 0 ? (int64_t)n_needed : ncols;
+        ray_t* flat_tbl = ray_table_new(cols_to_iter);
+        if (!flat_tbl || RAY_IS_ERR(flat_tbl))
+            return flat_tbl ? flat_tbl : ray_error("oom", NULL);
+
+        for (int64_t ci = 0; ci < cols_to_iter; ci++) {
+            ray_t* col;
+            int64_t name_id;
+            if (n_needed > 0) {
+                col = ray_table_get_col(parted_tbl, needed[ci]);
+                name_id = needed[ci];
+            } else {
+                col = ray_table_get_col_idx(parted_tbl, ci);
+                name_id = ray_table_col_name(parted_tbl, ci);
+            }
+            if (!col) continue;
+
+            /* Each branch leaves one owned reference in `flat`; the table
+             * takes its own reference in ray_table_add_col below. */
+            ray_t* flat;
+            if (col->type == RAY_MAPCOMMON) {
+                flat = materialize_mapcommon(col);
+                /* Best effort, as before: a column that cannot be
+                 * materialized is left out of the flat table. */
+                if (!flat || RAY_IS_ERR(flat)) continue;
+            } else if (!RAY_IS_PARTED(col->type)) {
+                ray_retain(col);
+                flat = col;
+            } else {
+                int8_t base_type = (int8_t)RAY_PARTED_BASETYPE(col->type);
+                ray_t** segs = (ray_t**)ray_data(col);
+
+                if (base_type == RAY_STR) {
+                    flat = parted_flatten_str(segs, col->len, total_rows);
+                    if (!flat || RAY_IS_ERR(flat)) {
+                        ray_release(flat_tbl);
+                        return ray_error("oom", NULL);
+                    }
+                } else {
+                    uint8_t base_attrs = (base_type == RAY_SYM)
+                                       ? parted_first_attrs(segs, col->len) : 0;
+                    flat = typed_vec_new(base_type, base_attrs, total_rows);
+                    if (!flat || RAY_IS_ERR(flat)) {
+                        ray_release(flat_tbl);
+                        return ray_error("oom", NULL);
+                    }
+                    flat->len = total_rows;
+
+                    size_t elem_size = (size_t)ray_sym_elem_size(base_type, base_attrs);
+                    char* dst = (char*)ray_data(flat);
+                    /* n_parts / total_rows come from the first parted column;
+                     * do not read past this column's own segment array. */
+                    int64_t n_segs = col->len < (int64_t)n_parts
+                                   ? col->len : (int64_t)n_parts;
+                    int64_t offset = 0;
+                    for (int64_t p = 0; p < n_segs; p++) {
+                        ray_t* seg = segs[p];
+                        if (!seg || seg->len <= 0) continue;
+                        /* A segment that does not fit means this column
+                         * disagrees with the table's row count. */
+                        if (seg->len > total_rows - offset) {
+                            ray_release(flat);
+                            ray_release(flat_tbl);
+                            return ray_error("domain", NULL);
+                        }
+                        if (parted_seg_esz_ok(seg, base_type, (uint8_t)elem_size)) {
+                            memcpy(dst + (size_t)offset * elem_size,
+                                   ray_data(seg), (size_t)seg->len * elem_size);
+                        } else {
+                            /* Element width mismatch: fill with zeros. */
+                            memset(dst + (size_t)offset * elem_size,
+                                   0, (size_t)seg->len * elem_size);
+                        }
+                        offset += seg->len;
+                    }
+                    /* Zero any rows no segment covered so the vector never
+                     * exposes unwritten memory. */
+                    if (offset < total_rows)
+                        memset(dst + (size_t)offset * elem_size, 0,
+                               (size_t)(total_rows - offset) * elem_size);
+                }
+            }
+
+            ray_t* grown = ray_table_add_col(flat_tbl, name_id, flat);
+            ray_release(flat);
+            if (!grown || RAY_IS_ERR(grown)) {
+                ray_release(flat_tbl);
+                return ray_error("oom", NULL);
+            }
+            flat_tbl = grown;
+        }
+
+        /* Run the regular GROUP BY against the flat table.
+         * NOTE(review): this passes 0 rather than group_limit, as the code
+         * always has — confirm the limit is applied elsewhere. */
+        ray_t* saved = g->table;
+        g->table = flat_tbl;
+        ray_t* result = exec_group(g, op, flat_tbl, 0);
+        g->table = saved;
+        ray_release(flat_tbl);
+        return result;
+    }
+}
+
+ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl,
+                  int64_t group_limit) {
+    if (!tbl || RAY_IS_ERR(tbl)) return tbl;
+
+    /* Selection-shape guard — runs BEFORE any fast path (parted
+     * dispatch, factorized shortcut) so every exec_group code path
+     * sees the same validated selection state.  A mismatch here
+     * indicates a graph-construction bug: the caller installed a
+     * selection that was built for a different table shape, and
+     * silently ignoring it would return unfiltered results. */
+    if (g->selection) {
+        ray_rowsel_t* sm = ray_rowsel_meta(g->selection);
+        int64_t tbl_nrows = ray_table_nrows(tbl);
+        if (sm->nrows != tbl_nrows)
+            return ray_error("domain",
+                "exec_group: selection nrows mismatch (sel=%lld tbl=%lld)",
+                (long long)sm->nrows, (long long)tbl_nrows);
+    }
+
+    /* Parted dispatch: detect parted input columns */
+    {
+        int64_t nc = ray_table_ncols(tbl);
+        for (int64_t c = 0; c < nc; c++) {
+            ray_t* col = ray_table_get_col_idx(tbl, c);
+            if (col && (RAY_IS_PARTED(col->type) || col->type == RAY_MAPCOMMON)) {
+                /* exec_group_parted has no rowsel plumbing — a
+                 * selection in flight would be silently ignored.
+                 * Reject rather than produce unfiltered results. */
+                if (g->selection)
+                    return ray_error("nyi",
+                        "GROUP BY with selection on parted table");
+                return exec_group_parted(g, op, tbl, group_limit);
+            }
+        }
+    }
+
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    int64_t nrows = ray_table_nrows(tbl);
+    uint8_t n_keys = ext->n_keys;
+    uint8_t n_aggs = ext->n_aggs;
+
+    /* Factorized shortcut: if input is a factorized expand result with
+     * (_src, _count) columns, and GROUP BY _src with COUNT/SUM(_count),
+     * return the pre-aggregated table directly without re-scanning.
+     *
+     * Interaction with g->selection: the factorized _count column
+     * encodes weighted counts, so COUNT(*) must SUM _count to get
+     * the true row count and SUM(_count) is the same thing.
+     * Neither the shortcut (returns verbatim, no filter) nor the
+     * main path (counts rows of the _src table, ignoring _count)
+     * knows how to apply a row filter while preserving those
+     * semantics.
+     *
+     * Other agg shapes — SUM/AVG/MIN/MAX of a non-_count column,
+     * etc. — don't rely on the factorized weighting; the main
+     * path handles them correctly with the selection installed.
+     * So the rejection must mirror the shortcut's exact
+     * compatibility check (all aggs are COUNT or SUM(_count)),
+     * not just the presence of a _count column. */
+    if (g->selection && n_keys == 1 && n_aggs > 0 && nrows > 0) {
+        int64_t cnt_sym_probe = ray_sym_intern("_count", 6);
+        ray_t*  cnt_col_probe = ray_table_get_col(tbl, cnt_sym_probe);
+        ray_op_ext_t* key_ext_probe = find_ext(g, ext->keys[0]->id);
+        int64_t src_sym_probe = ray_sym_intern("_src", 4);
+        if (cnt_col_probe && cnt_col_probe->type == RAY_I64 &&
+            key_ext_probe && key_ext_probe->base.opcode == OP_SCAN &&
+            key_ext_probe->sym == src_sym_probe) {
+            /* Reject on ANY agg whose semantics depend on the
+             * factorized _count weighting: COUNT(*) counts
+             * underlying source rows (not _src table rows) and
+             * SUM(_count) is equivalent.  Even if only one agg in
+             * a mixed query needs weighting, the main path can't
+             * handle it correctly, so fail the whole query rather
+             * than return a mix of right and wrong columns.
+             *
+             * Special case: an empty selection (total_pass == 0)
+             * means every row was filtered out, so the result is
+             * an empty group set regardless of which aggs are
+             * involved.  The main path handles this correctly
+             * even for count-weighted aggs because n_scan == 0
+             * produces no group rows at all.  Let it fall
+             * through. */
+            ray_rowsel_t* sm = ray_rowsel_meta(g->selection);
+            if (sm->total_pass > 0) {
+                bool needs_weighting = false;
+                for (uint8_t a = 0; a < n_aggs; a++) {
+                    uint16_t aop = ext->agg_ops[a];
+                    ray_op_ext_t* agg_ext = find_ext(g, ext->agg_ins[a]->id);
+                    if (aop == OP_COUNT) { needs_weighting = true; break; }
+                    if (aop == OP_SUM && agg_ext &&
+                        agg_ext->base.opcode == OP_SCAN &&
+                        agg_ext->sym == cnt_sym_probe) {
+                        needs_weighting = true; break;
+                    }
+                }
+                if (needs_weighting)
+                    return ray_error("nyi",
+                        "GROUP BY with selection on factorized expand result "
+                        "(COUNT/SUM(_count) semantics)");
+            }
+        }
+    }
+    if (!g->selection && n_keys == 1 && n_aggs > 0 && nrows > 0) {
+        int64_t cnt_sym = ray_sym_intern("_count", 6);
+        ray_t* cnt_col = ray_table_get_col(tbl, cnt_sym);
+        if (cnt_col && cnt_col->type == RAY_I64) {
+            ray_op_ext_t* key_ext = find_ext(g, ext->keys[0]->id);
+            int64_t src_sym = ray_sym_intern("_src", 4);
+            if (key_ext && key_ext->base.opcode == OP_SCAN &&
+                key_ext->sym == src_sym) {
+                /* Verify all aggs are compatible with factorized data:
+                 * COUNT(*) → use _count directly
+                 * SUM(_count) → use _count directly */
+                bool all_compat = true;
+                for (uint8_t a = 0; a < n_aggs; a++) {
+                    uint16_t aop = ext->agg_ops[a];
+                    ray_op_ext_t* agg_ext = find_ext(g, ext->agg_ins[a]->id);
+                    if (aop == OP_COUNT) continue;
+                    if (aop == OP_SUM && agg_ext &&
+                        agg_ext->base.opcode == OP_SCAN &&
+                        agg_ext->sym == cnt_sym) continue;
+                    all_compat = false;
+                    break;
+                }
+                if (all_compat) {
+                    /* The factorized table already has one row per group.
+                     * Build result with _src key + agg columns from _count. */
+                    ray_t* src_col = ray_table_get_col(tbl, src_sym);
+                    if (src_col) {
+                        int64_t out_nkeys = 1;
+                        int64_t out_ncols = out_nkeys + n_aggs;
+                        ray_t* result = ray_table_new((int64_t)out_ncols);
+                        if (!result || RAY_IS_ERR(result))
+                            return ray_error("oom", NULL);
+                        ray_retain(src_col);
+                        ray_t* tmp_r = ray_table_add_col(result, src_sym, src_col);
+                        ray_release(src_col);
+                        if (!tmp_r || RAY_IS_ERR(tmp_r)) {
+                            ray_release(result);
+                            return ray_error("oom", NULL);
+                        }
+                        result = tmp_r;
+                        for (uint8_t a = 0; a < n_aggs; a++) {
+                            ray_retain(cnt_col);
+                            int64_t agg_name = ray_sym_intern("_agg", 4);
+                            if (n_aggs > 1) {
+                                char buf[16];
+                                int n = snprintf(buf, sizeof(buf), "_agg%d", a);
+                                agg_name = ray_sym_intern(buf, (size_t)n);
+                            }
+                            tmp_r = ray_table_add_col(result, agg_name, cnt_col);
+                            ray_release(cnt_col);
+                            if (!tmp_r || RAY_IS_ERR(tmp_r)) {
+                                ray_release(result);
+                                return ray_error("oom", NULL);
+                            }
+                            result = tmp_r;
+                        }
+                        return result;
+                    }
+                }
+            }
+        }
+    }
+
+    if (n_keys > 8 || n_aggs > 8) return ray_error("nyi", NULL);
+
+    /* Extract selection (rowsel) for pushdown.  Workers iterate over
+     * [0, n_scan) and read row=match_idx[i].  When no selection is
+     * present, match_idx is NULL and n_scan equals nrows.  The
+     * match_idx_block must be released on every exec_group exit
+     * path — see the various `goto cleanup` and early returns below.
+     *
+     * The top-of-function guard already rejected nrows mismatches,
+     * so if we reach here with a selection it's guaranteed valid
+     * for `tbl`. */
+    ray_t* match_idx_block = NULL;
+    const int64_t* match_idx = NULL;
+    int64_t n_scan = nrows;
+    if (g->selection) {
+        match_idx_block = ray_rowsel_to_indices(g->selection);
+        if (!match_idx_block) return ray_error("oom", NULL);
+        match_idx = (const int64_t*)ray_data(match_idx_block);
+        n_scan = ray_rowsel_meta(g->selection)->total_pass;
+    }
+
+    /* Resolve key columns (VLA — n_keys ≤ 8; use ≥1 to avoid zero-size VLA UB) */
+    uint8_t vla_keys = n_keys > 0 ? n_keys : 1;
+    ray_t* key_vecs[vla_keys];
+    memset(key_vecs, 0, vla_keys * sizeof(ray_t*));
+
+    uint8_t key_owned[vla_keys]; /* 1 = we allocated via exec_node, must free */
+    memset(key_owned, 0, vla_keys * sizeof(uint8_t));
+    for (uint8_t k = 0; k < n_keys; k++) {
+        ray_op_t* key_op = ext->keys[k];
+        ray_op_ext_t* key_ext = find_ext(g, key_op->id);
+        if (key_ext && key_ext->base.opcode == OP_SCAN) {
+            key_vecs[k] = ray_table_get_col(tbl, key_ext->sym);
+        } else {
+            /* Expression key (CASE WHEN etc) — evaluate against current tbl */
+            ray_t* saved_table = g->table;
+            g->table = tbl;
+            ray_t* vec = exec_node(g, key_op);
+            g->table = saved_table;
+            if (vec && !RAY_IS_ERR(vec)) {
+                key_vecs[k] = vec;
+                key_owned[k] = 1;
+            }
+        }
+    }
+
+    /* Resolve agg input columns (VLA — n_aggs ≤ 8; use ≥1 to avoid zero-size VLA UB) */
+    uint8_t vla_aggs = n_aggs > 0 ? n_aggs : 1;
+    ray_t* agg_vecs[vla_aggs];
+    uint8_t agg_owned[vla_aggs]; /* 1 = we allocated via exec_node, must free */
+    agg_affine_t agg_affine[vla_aggs];
+    agg_linear_t agg_linear[vla_aggs];
+    memset(agg_vecs, 0, vla_aggs * sizeof(ray_t*));
+    memset(agg_owned, 0, vla_aggs * sizeof(uint8_t));
+    memset(agg_affine, 0, vla_aggs * sizeof(agg_affine_t));
+    memset(agg_linear, 0, vla_aggs * sizeof(agg_linear_t));
+
+    for (uint8_t a = 0; a < n_aggs; a++) {
+        ray_op_t* agg_input_op = ext->agg_ins[a];
+        ray_op_ext_t* agg_ext = find_ext(g, agg_input_op->id);
+
+        /* SUM/AVG(scan +/- const): aggregate base scan and apply bias at emit. */
+        uint16_t agg_kind = ext->agg_ops[a];
+        if ((agg_kind == OP_SUM || agg_kind == OP_AVG) &&
+            try_affine_sumavg_input(g, tbl, agg_input_op, &agg_vecs[a], &agg_affine[a])) {
+            continue;
+        }
+
+        /* SUM/AVG(integer-linear expr): scalar path can aggregate directly
+         * without materializing the expression vector. */
+        if (n_keys == 0 && nrows > 0 &&
+            (agg_kind == OP_SUM || agg_kind == OP_AVG) &&
+            try_linear_sumavg_input_i64(g, tbl, agg_input_op, &agg_linear[a])) {
+            continue;
+        }
+
+        if (agg_ext && agg_ext->base.opcode == OP_SCAN) {
+            agg_vecs[a] = ray_table_get_col(tbl, agg_ext->sym);
+        } else if (agg_ext && agg_ext->base.opcode == OP_CONST && agg_ext->literal) {
+            agg_vecs[a] = agg_ext->literal;
+        } else {
+            /* Expression node (ADD/MUL etc) — try compiled expression first */
+            ray_expr_t agg_expr;
+            if (expr_compile(g, tbl, agg_input_op, &agg_expr)) {
+                ray_t* vec = expr_eval_full(&agg_expr, nrows);
+                if (vec && !RAY_IS_ERR(vec)) {
+                    agg_vecs[a] = vec;
+                    agg_owned[a] = 1;
+                    continue;
+                }
+            }
+            /* Fallback: full recursive evaluation */
+            ray_t* saved_table = g->table;
+            g->table = tbl;
+            ray_t* vec = exec_node(g, agg_input_op);
+            g->table = saved_table;
+            if (vec && !RAY_IS_ERR(vec)) {
+                agg_vecs[a] = vec;
+                agg_owned[a] = 1;
+            }
+        }
+    }
+
+    /* Normalize scalar agg inputs to full-length vectors.
+     * Constants and scalar sub-expressions (len=1) must be broadcast to nrows
+     * before row-wise aggregation loops. */
+    for (uint8_t a = 0; a < n_aggs; a++) {
+        if (!agg_vecs[a] || RAY_IS_ERR(agg_vecs[a])) continue;
+        if (ext->agg_ops[a] == OP_COUNT) continue; /* value is ignored for COUNT */
+
+        bool needs_broadcast = ray_is_atom(agg_vecs[a]) ||
+                               (agg_vecs[a]->type > 0 && agg_vecs[a]->len == 1 && nrows > 1);
+        if (!needs_broadcast) continue;
+
+        ray_t* bcast = materialize_broadcast_input(agg_vecs[a], nrows);
+        if (!bcast || RAY_IS_ERR(bcast)) {
+            for (uint8_t i = 0; i < n_aggs; i++) {
+                if (agg_owned[i] && agg_vecs[i]) ray_release(agg_vecs[i]);
+            }
+            for (uint8_t k = 0; k < n_keys; k++) {
+                if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]);
+            }
+            return bcast && RAY_IS_ERR(bcast) ? bcast : ray_error("oom", NULL);
+        }
+
+        if (agg_owned[a]) ray_release(agg_vecs[a]);
+        agg_vecs[a] = bcast;
+        agg_owned[a] = 1;
+    }
+
+    /* Pre-compute key metadata (VLA — n_keys ≤ 8; vla_keys ≥ 1) */
+    void* key_data[vla_keys];
+    int8_t key_types[vla_keys];
+    uint8_t key_attrs[vla_keys];
+    for (uint8_t k = 0; k < n_keys; k++) {
+        if (key_vecs[k]) {
+            key_data[k]  = ray_data(key_vecs[k]);
+            key_types[k] = key_vecs[k]->type;
+            key_attrs[k] = key_vecs[k]->attrs;
+        } else {
+            key_data[k]  = NULL;
+            key_types[k] = 0;
+            key_attrs[k] = 0;
+        }
+    }
+
+    /* ---- Scalar aggregate fast path (n_keys == 0): flat vector scan ---- */
+    if (n_keys == 0 && nrows > 0) {
+        uint8_t need_flags = DA_NEED_COUNT;
+        for (uint8_t a = 0; a < n_aggs; a++) {
+            uint16_t aop = ext->agg_ops[a];
+            if (aop == OP_SUM || aop == OP_AVG || aop == OP_FIRST || aop == OP_LAST)
+                need_flags |= DA_NEED_SUM;
+            else if (aop == OP_STDDEV || aop == OP_STDDEV_POP || aop == OP_VAR || aop == OP_VAR_POP)
+                { need_flags |= DA_NEED_SUM; need_flags |= DA_NEED_SUMSQ; }
+            else if (aop == OP_MIN) need_flags |= DA_NEED_MIN;
+            else if (aop == OP_MAX) need_flags |= DA_NEED_MAX;
+        }
+
+        void* agg_ptrs[vla_aggs];
+        int8_t agg_types[vla_aggs];
+        for (uint8_t a = 0; a < n_aggs; a++) {
+            if (agg_vecs[a]) {
+                agg_ptrs[a]  = ray_data(agg_vecs[a]);
+                agg_types[a] = agg_vecs[a]->type;
+            } else {
+                agg_ptrs[a]  = NULL;
+                agg_types[a] = 0;
+            }
+        }
+
+        ray_pool_t* sc_pool = ray_pool_get();
+        uint32_t sc_n = (sc_pool && nrows >= RAY_PARALLEL_THRESHOLD)
+                        ? ray_pool_total_workers(sc_pool) : 1;
+
+        ray_t* sc_hdr;
+        da_accum_t* sc_acc = (da_accum_t*)scratch_calloc(&sc_hdr,
+            sc_n * sizeof(da_accum_t));
+        if (!sc_acc) goto da_path;
+
+        /* Allocate 1-slot accumulators per worker (n_aggs entries) */
+        bool alloc_ok = true;
+        for (uint32_t w = 0; w < sc_n; w++) {
+            if (need_flags & DA_NEED_SUM) {
+                sc_acc[w].sum = (da_val_t*)scratch_calloc(&sc_acc[w]._h_sum,
+                    n_aggs * sizeof(da_val_t));
+                if (!sc_acc[w].sum) { alloc_ok = false; break; }
+            }
+            if (need_flags & DA_NEED_MIN) {
+                sc_acc[w].min_val = (da_val_t*)scratch_alloc(&sc_acc[w]._h_min,
+                    n_aggs * sizeof(da_val_t));
+                if (!sc_acc[w].min_val) { alloc_ok = false; break; }
+                for (uint8_t a = 0; a < n_aggs; a++) {
+                    if (agg_types[a] == RAY_F64) sc_acc[w].min_val[a].f = DBL_MAX;
+                    else sc_acc[w].min_val[a].i = INT64_MAX;
+                }
+            }
+            if (need_flags & DA_NEED_MAX) {
+                sc_acc[w].max_val = (da_val_t*)scratch_alloc(&sc_acc[w]._h_max,
+                    n_aggs * sizeof(da_val_t));
+                if (!sc_acc[w].max_val) { alloc_ok = false; break; }
+                for (uint8_t a = 0; a < n_aggs; a++) {
+                    if (agg_types[a] == RAY_F64) sc_acc[w].max_val[a].f = -DBL_MAX;
+                    else sc_acc[w].max_val[a].i = INT64_MIN;
+                }
+            }
+            if (need_flags & DA_NEED_SUMSQ) {
+                sc_acc[w].sumsq_f64 = (double*)scratch_calloc(&sc_acc[w]._h_sumsq,
+                    n_aggs * sizeof(double));
+                if (!sc_acc[w].sumsq_f64) { alloc_ok = false; break; }
+            }
+            sc_acc[w].count = (int64_t*)scratch_calloc(&sc_acc[w]._h_count,
+                1 * sizeof(int64_t));
+            if (!sc_acc[w].count) { alloc_ok = false; break; }
+        }
+        if (!alloc_ok) {
+            for (uint32_t w = 0; w < sc_n; w++) da_accum_free(&sc_acc[w]);
+            scratch_free(sc_hdr);
+            goto da_path;
+        }
+
+        scalar_ctx_t sc_ctx = {
+            .agg_ptrs   = agg_ptrs,
+            .agg_types  = agg_types,
+            .agg_ops    = ext->agg_ops,
+            .agg_linear = agg_linear,
+            .n_aggs     = n_aggs,
+            .need_flags = need_flags,
+            .match_idx  = match_idx,
+            .accums     = sc_acc,
+            .n_accums   = sc_n,
+        };
+
+        /* Pick specialized tight loop when possible, else generic.
+         * The specialized scalar_sum_*_fn variants don't honour
+         * match_idx — they read data[r] directly — so they're only
+         * safe when no selection is in flight. */
+        typedef void (*scalar_fn_t)(void*, uint32_t, int64_t, int64_t);
+        scalar_fn_t sc_fn = scalar_accum_fn;
+        if (n_aggs == 1 && !match_idx && agg_ptrs[0] != NULL) {
+            uint16_t op0 = ext->agg_ops[0];
+            int8_t   t0  = agg_types[0];
+            if ((op0 == OP_SUM || op0 == OP_AVG) &&
+                (t0 == RAY_I64 || t0 == RAY_SYM || t0 == RAY_TIMESTAMP))
+                sc_fn = scalar_sum_i64_fn;
+            else if ((op0 == OP_SUM || op0 == OP_AVG) && t0 == RAY_F64)
+                sc_fn = scalar_sum_f64_fn;
+        } else if (n_aggs == 1 && !match_idx && agg_linear[0].enabled) {
+            uint16_t op0 = ext->agg_ops[0];
+            if (op0 == OP_SUM || op0 == OP_AVG)
+                sc_fn = scalar_sum_linear_i64_fn;
+        }
+
+        if (sc_n > 1)
+            ray_pool_dispatch(sc_pool, sc_fn, &sc_ctx, n_scan);
+        else
+            sc_fn(&sc_ctx, 0, 0, n_scan);
+
+        /* Merge per-worker accumulators into sc_acc[0] */
+        da_accum_t* m = &sc_acc[0];
+        for (uint32_t w = 1; w < sc_n; w++) {
+            da_accum_t* wa = &sc_acc[w];
+            if (need_flags & DA_NEED_SUM) {
+                for (uint8_t a = 0; a < n_aggs; a++) {
+                    uint16_t merge_op = ext->agg_ops[a];
+                    if (merge_op == OP_FIRST) {
+                        if (m->count[0] == 0 && wa->count[0] > 0)
+                            m->sum[a] = wa->sum[a];
+                    } else if (merge_op == OP_LAST) {
+                        if (wa->count[0] > 0)
+                            m->sum[a] = wa->sum[a];
+                    } else {
+                        if (agg_types[a] == RAY_F64)
+                            m->sum[a].f += wa->sum[a].f;
+                        else
+                            m->sum[a].i += wa->sum[a].i;
+                    }
+                }
+            }
+            if (need_flags & DA_NEED_SUMSQ) {
+                for (uint8_t a = 0; a < n_aggs; a++)
+                    m->sumsq_f64[a] += wa->sumsq_f64[a];
+            }
+            if (need_flags & DA_NEED_MIN) {
+                for (uint8_t a = 0; a < n_aggs; a++) {
+                    if (agg_types[a] == RAY_F64) {
+                        if (wa->min_val[a].f < m->min_val[a].f)
+                            m->min_val[a].f = wa->min_val[a].f;
+                    } else {
+                        if (wa->min_val[a].i < m->min_val[a].i)
+                            m->min_val[a].i = wa->min_val[a].i;
+                    }
+                }
+            }
+            if (need_flags & DA_NEED_MAX) {
+                for (uint8_t a = 0; a < n_aggs; a++) {
+                    if (agg_types[a] == RAY_F64) {
+                        if (wa->max_val[a].f > m->max_val[a].f)
+                            m->max_val[a].f = wa->max_val[a].f;
+                    } else {
+                        if (wa->max_val[a].i > m->max_val[a].i)
+                            m->max_val[a].i = wa->max_val[a].i;
+                    }
+                }
+            }
+            m->count[0] += wa->count[0];
+        }
+        for (uint32_t w = 1; w < sc_n; w++) da_accum_free(&sc_acc[w]);
+
+        /* Emit 1-row result with no key columns */
+        ray_t* result = ray_table_new(n_aggs);
+        if (!result || RAY_IS_ERR(result)) {
+            da_accum_free(&sc_acc[0]); scratch_free(sc_hdr);
+            for (uint8_t a = 0; a < n_aggs; a++)
+                if (agg_owned[a] && agg_vecs[a]) ray_release(agg_vecs[a]);
+            for (uint8_t k = 0; k < n_keys; k++)
+                if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]);
+            if (match_idx_block) ray_release(match_idx_block);
+            return result ? result : ray_error("oom", NULL);
+        }
+
+        emit_agg_columns(&result, g, ext, agg_vecs, 1, n_aggs,
+                         (double*)m->sum, (int64_t*)m->sum,
+                         (double*)m->min_val, (double*)m->max_val,
+                         (int64_t*)m->min_val, (int64_t*)m->max_val,
+                         m->count, agg_affine, m->sumsq_f64);
+
+        da_accum_free(&sc_acc[0]); scratch_free(sc_hdr);
+        for (uint8_t a = 0; a < n_aggs; a++)
+            if (agg_owned[a] && agg_vecs[a]) ray_release(agg_vecs[a]);
+        for (uint8_t k = 0; k < n_keys; k++)
+            if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]);
+        if (match_idx_block) ray_release(match_idx_block);
+        return result;
+    }
+
+da_path:;
+    /* ---- Direct-array fast path for low-cardinality integer keys ---- */
+    /* Supports multi-key via composite index: product of ranges <= MAX */
+    #define DA_MAX_COMPOSITE_SLOTS 262144  /* 256K slots max */
+    #define DA_MEM_BUDGET      (256ULL << 20)  /* 256 MB total across all workers */
+    #define DA_PER_WORKER_MAX  (6ULL << 20)    /* 6 MB per-worker max */
+    {
+        bool da_eligible = (nrows > 0 && n_keys > 0 && n_keys <= 8);
+        for (uint8_t k = 0; k < n_keys && da_eligible; k++) {
+            if (!key_data[k]) { da_eligible = false; break; }
+            int8_t t = key_types[k];
+            if (t != RAY_I64 && t != RAY_SYM && t != RAY_I32
+                && t != RAY_TIMESTAMP && t != RAY_DATE && t != RAY_TIME
+                && t != RAY_BOOL && t != RAY_U8 && t != RAY_I16) {
+                da_eligible = false;
+            }
+            /* DA path cannot represent nulls — fall back to HT path. */
+            if (key_vecs[k]) {
+                ray_t* src = (key_vecs[k]->attrs & RAY_ATTR_SLICE)
+                             ? key_vecs[k]->slice_parent : key_vecs[k];
+                if (src && (src->attrs & RAY_ATTR_HAS_NULLS))
+                    da_eligible = false;
+            }
+        }
+
+        int64_t da_key_min[8], da_key_range[8], da_key_stride[8];
+        uint64_t total_slots = 1;
+        bool da_fits = false;
+
+
+        if (da_eligible) {
+            da_fits = true;
+            ray_pool_t* mm_pool = ray_pool_get();
+            uint32_t mm_n = (mm_pool && nrows >= RAY_PARALLEL_THRESHOLD)
+                            ? ray_pool_total_workers(mm_pool) : 1;
+            /* VLAs sized by worker count: ~2KB per array with 256 workers; reused for every key. */
+            int64_t mm_mins[mm_n], mm_maxs[mm_n];
+            for (uint8_t k = 0; k < n_keys && da_fits; k++) {
+                int64_t kmin, kmax;
+                for (uint32_t w = 0; w < mm_n; w++) {
+                    mm_mins[w] = INT64_MAX;
+                    mm_maxs[w] = INT64_MIN;
+                }
+                minmax_ctx_t mm_ctx = {
+                    .key_data       = key_data[k],
+                    .key_type       = key_types[k],
+                    .key_attrs      = key_attrs[k],
+                    .per_worker_min = mm_mins,
+                    .per_worker_max = mm_maxs,
+                    .n_workers      = mm_n,
+                    .match_idx      = match_idx,
+                };
+                if (mm_n > 1) {
+                    ray_pool_dispatch(mm_pool, minmax_scan_fn, &mm_ctx, n_scan);
+                } else {
+                    minmax_scan_fn(&mm_ctx, 0, 0, n_scan);
+                }
+                kmin = INT64_MAX; kmax = INT64_MIN;
+                for (uint32_t w = 0; w < mm_n; w++) {
+                    if (mm_mins[w] < kmin) kmin = mm_mins[w];
+                    if (mm_maxs[w] > kmax) kmax = mm_maxs[w];
+                }
+                da_key_min[k]   = kmin;
+                /* kmax - kmin may overflow i64 when keys span full range.
+                 * Compute in uint64_t and reject if the span exceeds i64. */
+                uint64_t span = (uint64_t)kmax - (uint64_t)kmin + 1;
+                if (span > (uint64_t)INT64_MAX) { da_fits = false; break; }
+                da_key_range[k] = (int64_t)span;
+                if (da_key_range[k] <= 0) { da_fits = false; break; }
+                total_slots *= (uint64_t)da_key_range[k];
+                if (total_slots > DA_MAX_COMPOSITE_SLOTS) da_fits = false;
+            }
+        }
+
+        if (da_fits) {
+            /* Compute which accumulator arrays we actually need */
+            uint8_t need_flags = DA_NEED_COUNT; /* always need count */
+            for (uint8_t a = 0; a < n_aggs; a++) {
+                uint16_t aop = ext->agg_ops[a];
+                if (aop == OP_SUM || aop == OP_AVG || aop == OP_FIRST || aop == OP_LAST) need_flags |= DA_NEED_SUM;
+                else if (aop == OP_STDDEV || aop == OP_STDDEV_POP || aop == OP_VAR || aop == OP_VAR_POP)
+                    { need_flags |= DA_NEED_SUM; need_flags |= DA_NEED_SUMSQ; }
+                else if (aop == OP_MIN) need_flags |= DA_NEED_MIN;
+                else if (aop == OP_MAX) need_flags |= DA_NEED_MAX;
+            }
+
+            /* Compute per-worker memory budget.  Actual allocation is 1 union
+             * array per type, but MIN/MAX use conditional random writes that
+             * perform worse than radix-partitioned HT at high group counts.
+             * Weight MIN/MAX at 2x to keep those queries on the HT path. */
+            uint32_t arrays_per_agg = 0;
+            if (need_flags & DA_NEED_SUM) arrays_per_agg += 1;
+            if (need_flags & DA_NEED_MIN) arrays_per_agg += 2; /* 2x: DA MIN slow at high cardinality */
+            if (need_flags & DA_NEED_MAX) arrays_per_agg += 2; /* 2x: DA MAX slow at high cardinality */
+            if (need_flags & DA_NEED_SUMSQ) arrays_per_agg += 1;
+            uint64_t per_worker = total_slots * (arrays_per_agg * n_aggs + 1u) * 8u;
+            if (per_worker > DA_PER_WORKER_MAX)
+                da_fits = false;
+        }
+
+        if (da_fits) {
+            /* Recompute need_flags: the copy above was local to the budget-check block */
+            uint8_t need_flags = DA_NEED_COUNT;
+            bool all_sum = true;
+            for (uint8_t a = 0; a < n_aggs; a++) {
+                uint16_t aop = ext->agg_ops[a];
+                if (aop == OP_SUM || aop == OP_AVG || aop == OP_FIRST || aop == OP_LAST) need_flags |= DA_NEED_SUM;
+                else if (aop == OP_STDDEV || aop == OP_STDDEV_POP || aop == OP_VAR || aop == OP_VAR_POP)
+                    { need_flags |= DA_NEED_SUM; need_flags |= DA_NEED_SUMSQ; }
+                else if (aop == OP_MIN) need_flags |= DA_NEED_MIN;
+                else if (aop == OP_MAX) need_flags |= DA_NEED_MAX;
+                if (aop != OP_SUM && aop != OP_AVG && aop != OP_COUNT)
+                    all_sum = false;
+            }
+
+            /* Compute strides: stride[k] = product of ranges[k+1..n_keys-1].
+             * Guard against overflow: if any product would exceed INT64_MAX,
+             * da_fits is cleared — NOTE(review): nothing below re-checks it. */
+            bool stride_overflow = false;
+            for (uint8_t k = 0; k < n_keys; k++) {
+                int64_t s = 1;
+                for (uint8_t j = k + 1; j < n_keys; j++) {
+                    if (da_key_range[j] != 0 && s > INT64_MAX / da_key_range[j]) {
+                        stride_overflow = true; break;
+                    }
+                    s *= da_key_range[j];
+                }
+                if (stride_overflow) break;
+                da_key_stride[k] = s;
+            }
+            if (stride_overflow) da_fits = false;
+
+            uint32_t n_slots = (uint32_t)total_slots;
+            size_t total = (size_t)n_slots * n_aggs;
+
+            void* agg_ptrs[vla_aggs];
+            int8_t agg_types[vla_aggs];
+            uint32_t agg_f64_mask = 0;
+            for (uint8_t a = 0; a < n_aggs; a++) {
+                if (agg_vecs[a]) {
+                    agg_ptrs[a]  = ray_data(agg_vecs[a]);
+                    agg_types[a] = agg_vecs[a]->type;
+                    if (agg_vecs[a]->type == RAY_F64)
+                        agg_f64_mask |= (1u << a);
+                } else {
+                    agg_ptrs[a]  = NULL;
+                    agg_types[a] = 0;
+                }
+            }
+
+            ray_pool_t* da_pool = ray_pool_get();
+            uint32_t da_n_workers = (da_pool && nrows >= RAY_PARALLEL_THRESHOLD)
+                                    ? ray_pool_total_workers(da_pool) : 1;
+
+            /* Check memory budget — need one accumulator set per worker.
+             * Weight MIN/MAX at 2x in budget (same as eligibility check) to
+             * keep MIN/MAX-heavy queries on the faster radix-HT path. */
+            uint32_t arrays_per_agg = 0;
+            if (need_flags & DA_NEED_SUM) arrays_per_agg += 1;
+            if (need_flags & DA_NEED_MIN) arrays_per_agg += 2;
+            if (need_flags & DA_NEED_MAX) arrays_per_agg += 2;
+            if (need_flags & DA_NEED_SUMSQ) arrays_per_agg += 1;
+            uint64_t per_worker_bytes = (uint64_t)n_slots * (arrays_per_agg * n_aggs + 1u) * 8u;
+            if ((uint64_t)da_n_workers * per_worker_bytes > DA_MEM_BUDGET)
+                da_n_workers = 1;
+
+            ray_t* accums_hdr;
+            da_accum_t* accums = (da_accum_t*)scratch_calloc(&accums_hdr,
+                da_n_workers * sizeof(da_accum_t));
+            if (!accums) goto ht_path;
+
+            bool alloc_ok = true;
+            for (uint32_t w = 0; w < da_n_workers; w++) {
+                if (need_flags & DA_NEED_SUM) {
+                    accums[w].sum = (da_val_t*)scratch_calloc(&accums[w]._h_sum,
+                        total * sizeof(da_val_t));
+                    if (!accums[w].sum) { alloc_ok = false; break; }
+                }
+                if (need_flags & DA_NEED_SUMSQ) {
+                    accums[w].sumsq_f64 = (double*)scratch_calloc(&accums[w]._h_sumsq,
+                        total * sizeof(double));
+                    if (!accums[w].sumsq_f64) { alloc_ok = false; break; }
+                }
+                if (need_flags & DA_NEED_MIN) {
+                    accums[w].min_val = (da_val_t*)scratch_alloc(&accums[w]._h_min,
+                        total * sizeof(da_val_t));
+                    if (!accums[w].min_val) { alloc_ok = false; break; }
+                    for (size_t i = 0; i < total; i++) {
+                        uint8_t a = (uint8_t)(i % n_aggs);
+                        if (agg_types[a] == RAY_F64) accums[w].min_val[i].f = DBL_MAX;
+                        else accums[w].min_val[i].i = INT64_MAX;
+                    }
+                }
+                if (need_flags & DA_NEED_MAX) {
+                    accums[w].max_val = (da_val_t*)scratch_alloc(&accums[w]._h_max,
+                        total * sizeof(da_val_t));
+                    if (!accums[w].max_val) { alloc_ok = false; break; }
+                    for (size_t i = 0; i < total; i++) {
+                        uint8_t a = (uint8_t)(i % n_aggs);
+                        if (agg_types[a] == RAY_F64) accums[w].max_val[i].f = -DBL_MAX;
+                        else accums[w].max_val[i].i = INT64_MIN;
+                    }
+                }
+                accums[w].count = (int64_t*)scratch_calloc(&accums[w]._h_count,
+                    n_slots * sizeof(int64_t));
+                if (!accums[w].count) { alloc_ok = false; break; }
+            }
+            if (!alloc_ok) {
+                for (uint32_t w = 0; w < da_n_workers; w++)
+                    da_accum_free(&accums[w]);
+                scratch_free(accums_hdr);
+                goto ht_path;
+            }
+
+
+            /* Pre-compute per-key element sizes for fast DA reads */
+            uint8_t da_key_esz[n_keys];
+            for (uint8_t k = 0; k < n_keys; k++)
+                da_key_esz[k] = ray_sym_elem_size(key_types[k], key_attrs[k]);
+
+            da_ctx_t da_ctx = {
+                .accums      = accums,
+                .n_accums    = da_n_workers,
+                .key_ptrs    = key_data,
+                .key_types   = key_types,
+                .key_attrs   = key_attrs,
+                .key_esz     = da_key_esz,
+                .key_mins    = da_key_min,
+                .key_strides = da_key_stride,
+                .n_keys      = n_keys,
+                .agg_ptrs    = agg_ptrs,
+                .agg_types   = agg_types,
+                .agg_ops     = ext->agg_ops,
+                .n_aggs      = n_aggs,
+                .need_flags  = need_flags,
+                .agg_f64_mask = agg_f64_mask,
+                .all_sum     = all_sum,
+                .n_slots     = n_slots,
+                .match_idx   = match_idx,
+            };
+
+            if (da_n_workers > 1)
+                ray_pool_dispatch(da_pool, da_accum_fn, &da_ctx, n_scan);
+            else
+                da_accum_fn(&da_ctx, 0, 0, n_scan);
+
+            /* Merge target is always accums[0] */
+            da_accum_t* merged = &accums[0];
+
+            /* Check if any agg is FIRST/LAST (needs ordered per-worker merge) */
+            bool has_first_last = false;
+            for (uint8_t a = 0; a < n_aggs; a++) {
+                uint16_t aop = ext->agg_ops[a];
+                if (aop == OP_FIRST || aop == OP_LAST) { has_first_last = true; break; }
+            }
+
+            /* Merge per-worker accumulators into accums[0].
+             * FIRST/LAST require worker-order-dependent merge (sequential).
+             * All other ops are commutative — dispatch over disjoint slot
+             * ranges for parallel merge. */
+            if (has_first_last) {
+                for (uint32_t w = 1; w < da_n_workers; w++) {
+                    da_accum_t* wa = &accums[w];
+                    if (need_flags & DA_NEED_SUMSQ) {
+                        for (size_t i = 0; i < total; i++)
+                            merged->sumsq_f64[i] += wa->sumsq_f64[i];
+                    }
+                    if (need_flags & DA_NEED_SUM) {
+                        for (uint32_t s = 0; s < n_slots; s++) {
+                            size_t base = (size_t)s * n_aggs;
+                            for (uint8_t a = 0; a < n_aggs; a++) {
+                                size_t idx = base + a;
+                                uint16_t aop = ext->agg_ops[a];
+                                if (aop == OP_SUM || aop == OP_AVG || aop == OP_STDDEV || aop == OP_STDDEV_POP || aop == OP_VAR || aop == OP_VAR_POP) {
+                                    if (agg_types[a] == RAY_F64) merged->sum[idx].f += wa->sum[idx].f;
+                                    else merged->sum[idx].i += wa->sum[idx].i;
+                                } else if (aop == OP_FIRST) {
+                                    if (merged->count[s] == 0 && wa->count[s] > 0)
+                                        merged->sum[idx] = wa->sum[idx];
+                                } else if (aop == OP_LAST) {
+                                    if (wa->count[s] > 0)
+                                        merged->sum[idx] = wa->sum[idx];
+                                }
+                            }
+                        }
+                    }
+                    if (need_flags & DA_NEED_MIN) {
+                        for (size_t i = 0; i < total; i++) {
+                            uint8_t a = (uint8_t)(i % n_aggs);
+                            if (agg_types[a] == RAY_F64) {
+                                if (wa->min_val[i].f < merged->min_val[i].f)
+                                    merged->min_val[i].f = wa->min_val[i].f;
+                            } else {
+                                if (wa->min_val[i].i < merged->min_val[i].i)
+                                    merged->min_val[i].i = wa->min_val[i].i;
+                            }
+                        }
+                    }
+                    if (need_flags & DA_NEED_MAX) {
+                        for (size_t i = 0; i < total; i++) {
+                            uint8_t a = (uint8_t)(i % n_aggs);
+                            if (agg_types[a] == RAY_F64) {
+                                if (wa->max_val[i].f > merged->max_val[i].f)
+                                    merged->max_val[i].f = wa->max_val[i].f;
+                            } else {
+                                if (wa->max_val[i].i > merged->max_val[i].i)
+                                    merged->max_val[i].i = wa->max_val[i].i;
+                            }
+                        }
+                    }
+                    for (uint32_t s = 0; s < n_slots; s++)
+                        merged->count[s] += wa->count[s];
+                }
+            } else if (da_n_workers > 1 && n_slots >= 1024 && da_pool) {
+                /* Parallel merge: dispatch over disjoint slot ranges */
+                da_merge_ctx_t merge_ctx = {
+                    .accums        = accums,
+                    .n_src_workers = da_n_workers,
+                    .need_flags    = need_flags,
+                    .n_aggs        = n_aggs,
+                    .agg_types     = agg_types,
+                    .agg_ops       = ext->agg_ops,
+                };
+                ray_pool_dispatch(da_pool, da_merge_fn, &merge_ctx, (int64_t)n_slots);
+            } else {
+                /* Sequential merge for small slot counts */
+                for (uint32_t w = 1; w < da_n_workers; w++) {
+                    da_accum_t* wa = &accums[w];
+                    if (need_flags & DA_NEED_SUMSQ) {
+                        for (size_t i = 0; i < total; i++)
+                            merged->sumsq_f64[i] += wa->sumsq_f64[i];
+                    }
+                    if (need_flags & DA_NEED_SUM) {
+                        for (uint32_t s = 0; s < n_slots; s++) {
+                            size_t base = (size_t)s * n_aggs;
+                            for (uint8_t a = 0; a < n_aggs; a++) {
+                                size_t idx = base + a;
+                                uint16_t aop = ext->agg_ops[a];
+                                if (aop == OP_FIRST) {
+                                    if (merged->count[s] == 0 && wa->count[s] > 0)
+                                        merged->sum[idx] = wa->sum[idx];
+                                } else if (aop == OP_LAST) {
+                                    if (wa->count[s] > 0)
+                                        merged->sum[idx] = wa->sum[idx];
+                                } else if (agg_types[a] == RAY_F64)
+                                    merged->sum[idx].f += wa->sum[idx].f;
+                                else
+                                    merged->sum[idx].i += wa->sum[idx].i;
+                            }
+                        }
+                    }
+                    if (need_flags & DA_NEED_MIN) {
+                        for (size_t i = 0; i < total; i++) {
+                            uint8_t a = (uint8_t)(i % n_aggs);
+                            if (agg_types[a] == RAY_F64) {
+                                if (wa->min_val[i].f < merged->min_val[i].f)
+                                    merged->min_val[i].f = wa->min_val[i].f;
+                            } else {
+                                if (wa->min_val[i].i < merged->min_val[i].i)
+                                    merged->min_val[i].i = wa->min_val[i].i;
+                            }
+                        }
+                    }
+                    if (need_flags & DA_NEED_MAX) {
+                        for (size_t i = 0; i < total; i++) {
+                            uint8_t a = (uint8_t)(i % n_aggs);
+                            if (agg_types[a] == RAY_F64) {
+                                if (wa->max_val[i].f > merged->max_val[i].f)
+                                    merged->max_val[i].f = wa->max_val[i].f;
+                            } else {
+                                if (wa->max_val[i].i > merged->max_val[i].i)
+                                    merged->max_val[i].i = wa->max_val[i].i;
+                            }
+                        }
+                    }
+                    for (uint32_t s = 0; s < n_slots; s++)
+                        merged->count[s] += wa->count[s];
+                }
+            }
+
+
+
+            for (uint32_t w = 1; w < da_n_workers; w++)
+                da_accum_free(&accums[w]);
+
+            da_val_t* da_sum      = merged->sum;      /* may be NULL if !DA_NEED_SUM */
+            da_val_t* da_min_val  = merged->min_val;  /* may be NULL if !DA_NEED_MIN */
+            da_val_t* da_max_val  = merged->max_val;  /* may be NULL if !DA_NEED_MAX */
+            double*   da_sumsq   = merged->sumsq_f64; /* may be NULL if !DA_NEED_SUMSQ */
+            int64_t*  da_count   = merged->count;
+
+            uint32_t grp_count = 0;
+            for (uint32_t s = 0; s < n_slots; s++)
+                if (da_count[s] > 0) grp_count++;
+
+            int64_t total_cols = n_keys + n_aggs;
+            ray_t* result = ray_table_new(total_cols);
+            if (!result || RAY_IS_ERR(result)) {
+                da_accum_free(&accums[0]); scratch_free(accums_hdr);
+                for (uint8_t a = 0; a < n_aggs; a++)
+                    if (agg_owned[a] && agg_vecs[a]) ray_release(agg_vecs[a]);
+                for (uint8_t k = 0; k < n_keys; k++)
+                    if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]);
+                if (match_idx_block) ray_release(match_idx_block);
+                return result ? result : ray_error("oom", NULL);
+            }
+
+            /* Key columns — decompose composite slot back to per-key values */
+            for (uint8_t k = 0; k < n_keys; k++) {
+                ray_t* src_col = key_vecs[k];
+                if (!src_col) continue;
+                ray_t* key_col = col_vec_new(src_col, (int64_t)grp_count);
+                if (!key_col || RAY_IS_ERR(key_col)) continue;
+                key_col->len = (int64_t)grp_count;
+                uint32_t gi = 0;
+                for (uint32_t s = 0; s < n_slots; s++) {
+                    if (da_count[s] == 0) continue;
+                    int64_t offset = ((int64_t)s / da_key_stride[k]) % da_key_range[k];
+                    int64_t key_val = da_key_min[k] + offset;
+                    write_col_i64(ray_data(key_col), gi, key_val, src_col->type, key_col->attrs);
+                    gi++;
+                }
+                ray_op_ext_t* key_ext = find_ext(g, ext->keys[k]->id);
+                int64_t name_id = key_ext ? key_ext->sym : (int64_t)k;
+                result = ray_table_add_col(result, name_id, key_col);
+                ray_release(key_col);
+            }
+
+            /* Agg columns — compact sparse DA arrays into dense, then emit */
+            size_t dense_total = (size_t)grp_count * n_aggs;
+            ray_t *_h_dsum = NULL, *_h_dmin = NULL, *_h_dmax = NULL;
+            ray_t *_h_dsq = NULL, *_h_dcnt = NULL;
+            da_val_t* dense_sum     = da_sum     ? (da_val_t*)scratch_alloc(&_h_dsum, dense_total * sizeof(da_val_t)) : NULL;
+            da_val_t* dense_min_val = da_min_val ? (da_val_t*)scratch_alloc(&_h_dmin, dense_total * sizeof(da_val_t)) : NULL;
+            da_val_t* dense_max_val = da_max_val ? (da_val_t*)scratch_alloc(&_h_dmax, dense_total * sizeof(da_val_t)) : NULL;
+            double*   dense_sumsq   = da_sumsq   ? (double*)scratch_alloc(&_h_dsq, dense_total * sizeof(double)) : NULL;
+            int64_t*  dense_counts  = (int64_t*)scratch_alloc(&_h_dcnt, grp_count * sizeof(int64_t));
+
+            uint32_t gi = 0;
+            for (uint32_t s = 0; s < n_slots; s++) {
+                if (da_count[s] == 0) continue;
+                dense_counts[gi] = da_count[s];
+                for (uint8_t a = 0; a < n_aggs; a++) {
+                    size_t si = (size_t)s * n_aggs + a;
+                    size_t di = (size_t)gi * n_aggs + a;
+                    if (dense_sum)     dense_sum[di]     = da_sum[si];
+                    if (dense_min_val) dense_min_val[di] = da_min_val[si];
+                    if (dense_max_val) dense_max_val[di] = da_max_val[si];
+                    if (dense_sumsq)   dense_sumsq[di]   = da_sumsq[si];
+                }
+                gi++;
+            }
+
+            emit_agg_columns(&result, g, ext, agg_vecs, grp_count, n_aggs,
+                             (double*)dense_sum, (int64_t*)dense_sum,
+                             (double*)dense_min_val, (double*)dense_max_val,
+                             (int64_t*)dense_min_val, (int64_t*)dense_max_val,
+                             dense_counts, agg_affine, dense_sumsq);
+
+            scratch_free(_h_dsum); scratch_free(_h_dmin);
+            scratch_free(_h_dmax);
+            scratch_free(_h_dsq); scratch_free(_h_dcnt);
+
+            da_accum_free(&accums[0]); scratch_free(accums_hdr);
+            for (uint8_t a = 0; a < n_aggs; a++)
+                if (agg_owned[a] && agg_vecs[a]) ray_release(agg_vecs[a]);
+            for (uint8_t k = 0; k < n_keys; k++)
+                if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]);
+            if (match_idx_block) ray_release(match_idx_block);
+            return result;
+        }
+    }
+
+ht_path:;
+    /* Compute which accumulator arrays the HT needs based on agg ops.
+     * COUNT only reads group row's count field — no accumulator needed. */
+    uint8_t ght_need = 0;
+    for (uint8_t a = 0; a < n_aggs; a++) {
+        uint16_t aop = ext->agg_ops[a];
+        if (aop == OP_SUM || aop == OP_AVG || aop == OP_FIRST || aop == OP_LAST)
+            ght_need |= GHT_NEED_SUM;
+        if (aop == OP_STDDEV || aop == OP_STDDEV_POP || aop == OP_VAR || aop == OP_VAR_POP)
+            { ght_need |= GHT_NEED_SUM; ght_need |= GHT_NEED_SUMSQ; }
+        if (aop == OP_MIN) ght_need |= GHT_NEED_MIN;
+        if (aop == OP_MAX) ght_need |= GHT_NEED_MAX;
+    }
+
+    /* RAY_STR keys still need the eval-level path (variable-width
+     * with a pool).  RAY_GUID uses the wide-key row-indirection
+     * support in the layout; see ght_layout_t.wide_key_mask. */
+    for (uint8_t k = 0; k < n_keys; k++) {
+        if (key_types[k] == RAY_STR) {
+            for (uint8_t kk = 0; kk < n_keys; kk++)
+                if (key_owned[kk] && key_vecs[kk]) ray_release(key_vecs[kk]);
+            for (uint8_t a = 0; a < n_aggs; a++)
+                if (agg_owned[a] && agg_vecs[a]) ray_release(agg_vecs[a]);
+            if (match_idx_block) ray_release(match_idx_block);
+            return ray_error("nyi", NULL);
+        }
+    }
+
+    /* Compute row-layout: keys + agg values inline */
+    ght_layout_t ght_layout = ght_compute_layout(n_keys, n_aggs, agg_vecs, ght_need, ext->agg_ops, key_types);
+
+    /* Right-sized hash table: start small, rehash on load > 0.5 */
+    uint32_t ht_cap = 256;
+    {
+        uint64_t target = (uint64_t)nrows < 65536 ? (uint64_t)nrows : 65536;
+        if (target < 256) target = 256;
+        while (ht_cap < target) ht_cap *= 2;
+    }
+
+    /* Parallel path: radix-partitioned group-by */
+    ray_pool_t* pool = ray_pool_get();
+    uint32_t n_total = pool ? ray_pool_total_workers(pool) : 1;
+
+    group_ht_t single_ht;
+    group_ht_t* final_ht = NULL;
+    ray_t* result = NULL;
+
+    ray_t* radix_bufs_hdr = NULL;
+    radix_buf_t* radix_bufs = NULL;
+    ray_t* part_hts_hdr = NULL;
+    group_ht_t*  part_hts   = NULL;
+
+    if (pool && nrows >= RAY_PARALLEL_THRESHOLD && n_total > 1) {
+        size_t n_bufs = (size_t)n_total * RADIX_P;
+        radix_bufs = (radix_buf_t*)scratch_calloc(&radix_bufs_hdr,
+            n_bufs * sizeof(radix_buf_t));
+        if (!radix_bufs) goto sequential_fallback;
+
+        /* Pre-size each buffer: 1.5x expected, capped so total ≤ 2 GB.
+         * Buffers grow on demand via radix_buf_push doubling. */
+        uint32_t buf_init = (uint32_t)((uint64_t)nrows / (RADIX_P * n_total));
+        if (buf_init < 64) buf_init = 64;
+        buf_init = buf_init + buf_init / 2;  /* 1.5x headroom */
+        uint16_t estride = ght_layout.entry_stride;
+        {
+            /* Cap: total pre-alloc ≤ 2 GB */
+            size_t total_pre = (size_t)n_bufs * buf_init * estride;
+            if (total_pre > (size_t)2 << 30) {
+                buf_init = (uint32_t)(((size_t)2 << 30) / ((size_t)n_bufs * estride));
+                if (buf_init < 64) buf_init = 64;
+            }
+        }
+        for (size_t i = 0; i < n_bufs; i++) {
+            radix_bufs[i].data = (char*)scratch_alloc(
+                &radix_bufs[i]._hdr, (size_t)buf_init * estride);
+            radix_bufs[i].count = 0;
+            radix_bufs[i].cap = buf_init;
+        }
+
+        /* Compute per-key nullability — lets phase1 skip null checks on
+         * key columns with no nulls (the common case). */
+        uint8_t p1_nullable = 0;
+        for (uint8_t k = 0; k < n_keys; k++) {
+            if (!key_vecs[k]) continue;
+            ray_t* src = (key_vecs[k]->attrs & RAY_ATTR_SLICE)
+                         ? key_vecs[k]->slice_parent : key_vecs[k];
+            if (src && (src->attrs & RAY_ATTR_HAS_NULLS))
+                p1_nullable |= (uint8_t)(1u << k);
+        }
+
+        /* Phase 1: parallel hash + copy keys/agg values into fat entries */
+        radix_phase1_ctx_t p1ctx = {
+            .key_data      = key_data,
+            .key_types     = key_types,
+            .key_attrs     = key_attrs,
+            .key_vecs      = key_vecs,
+            .nullable_mask = p1_nullable,
+            .agg_vecs      = agg_vecs,
+            .n_workers     = n_total,
+            .bufs          = radix_bufs,
+            .layout        = ght_layout,
+            .match_idx     = match_idx,
+        };
+        ray_pool_dispatch(pool, radix_phase1_fn, &p1ctx, n_scan);
+        CHECK_CANCEL_GOTO(pool, cleanup);
+
+        /* Check for OOM during phase 1 radix buffer growth */
+        {
+            bool phase1_oom = false;
+            for (size_t i = 0; i < n_bufs; i++) {
+                if (radix_bufs[i].oom) { phase1_oom = true; break; }
+            }
+            if (phase1_oom) {
+                for (size_t i = 0; i < n_bufs; i++) scratch_free(radix_bufs[i]._hdr);
+                scratch_free(radix_bufs_hdr);
+                radix_bufs = NULL;
+                goto sequential_fallback;
+            }
+        }
+
+        /* Phase 2: parallel per-partition aggregation (no column access) */
+        part_hts = (group_ht_t*)scratch_calloc(&part_hts_hdr,
+            RADIX_P * sizeof(group_ht_t));
+        if (!part_hts) {
+            for (size_t i = 0; i < n_bufs; i++) scratch_free(radix_bufs[i]._hdr);
+            scratch_free(radix_bufs_hdr);
+            radix_bufs = NULL;
+            goto sequential_fallback;
+        }
+
+        radix_phase2_ctx_t p2ctx = {
+            .key_types   = key_types,
+            .n_keys      = n_keys,
+            .n_workers   = n_total,
+            .bufs        = radix_bufs,
+            .part_hts    = part_hts,
+            .layout      = ght_layout,
+            .key_data    = key_data,
+        };
+        ray_pool_dispatch_n(pool, radix_phase2_fn, &p2ctx, RADIX_P);
+        CHECK_CANCEL_GOTO(pool, cleanup);
+
+        /* Prefix offsets */
+        uint32_t part_offsets[RADIX_P + 1];
+        part_offsets[0] = 0;
+        for (uint32_t p = 0; p < RADIX_P; p++)
+            part_offsets[p + 1] = part_offsets[p] + part_hts[p].grp_count;
+        uint32_t total_grps = part_offsets[RADIX_P];
+
+        /* Build result directly from partition HTs */
+        int64_t total_cols = n_keys + n_aggs;
+        result = ray_table_new(total_cols);
+        if (!result || RAY_IS_ERR(result)) goto cleanup;
+
+        /* Pre-allocate key columns */
+        ray_t* key_cols[n_keys];
+        char* key_dsts[n_keys];
+        int8_t key_out_types[n_keys];
+        uint8_t key_esizes[n_keys];
+        for (uint8_t k = 0; k < n_keys; k++) {
+            ray_t* src_col = key_vecs[k];
+            key_cols[k] = NULL;
+            key_dsts[k] = NULL;
+            key_out_types[k] = 0;
+            key_esizes[k] = 0;
+            if (!src_col) continue;
+            uint8_t esz = ray_sym_elem_size(src_col->type, src_col->attrs);
+            ray_t* new_col;
+            if (src_col->type == RAY_SYM)
+                new_col = ray_sym_vec_new(src_col->attrs & RAY_SYM_W_MASK, (int64_t)total_grps);
+            else
+                new_col = ray_vec_new(src_col->type, (int64_t)total_grps);
+            if (!new_col || RAY_IS_ERR(new_col)) continue;
+            new_col->len = (int64_t)total_grps;
+            key_cols[k] = new_col;
+            key_dsts[k] = (char*)ray_data(new_col);
+            key_out_types[k] = src_col->type;
+            key_esizes[k] = esz;
+        }
+
+        /* Pre-allocate agg result vectors */
+        agg_out_t agg_outs[n_aggs];
+        ray_t* agg_cols[n_aggs];
+        for (uint8_t a = 0; a < n_aggs; a++) {
+            uint16_t agg_op = ext->agg_ops[a];
+            ray_t* agg_col = agg_vecs[a];
+            bool is_f64 = agg_col && agg_col->type == RAY_F64;
+            int8_t out_type;
+            switch (agg_op) {
+                case OP_AVG:
+                case OP_STDDEV: case OP_STDDEV_POP:
+                case OP_VAR: case OP_VAR_POP:
+                    out_type = RAY_F64; break;
+                case OP_COUNT: out_type = RAY_I64; break;
+                case OP_SUM: case OP_PROD:
+                    out_type = is_f64 ? RAY_F64 : RAY_I64; break;
+                default:
+                    out_type = agg_col ? agg_col->type : RAY_I64; break;
+            }
+            ray_t* new_col = ray_vec_new(out_type, (int64_t)total_grps);
+            if (!new_col || RAY_IS_ERR(new_col)) {
+                agg_cols[a] = NULL;
+                memset(&agg_outs[a], 0, sizeof(agg_outs[a]));
+                continue;
+            }
+            new_col->len = (int64_t)total_grps;
+            agg_cols[a] = new_col;
+            agg_outs[a] = (agg_out_t){
+                .out_type = out_type, .src_f64 = is_f64,
+                .agg_op = agg_op,
+                .affine = agg_affine[a].enabled,
+                .bias_f64 = agg_affine[a].bias_f64,
+                .bias_i64 = agg_affine[a].bias_i64,
+                .dst = ray_data(new_col),
+                .vec = new_col,
+            };
+        }
+
+        /* Pre-allocate nullmaps for agg result vectors (parallel safety) */
+        bool nullmap_prep_ok[n_aggs];
+        for (uint8_t a = 0; a < n_aggs; a++)
+            nullmap_prep_ok[a] = agg_cols[a] && (grp_prepare_nullmap(agg_outs[a].vec) == RAY_OK);
+
+        /* Pre-prepare nullmaps on output key columns for parallel null writes */
+        for (uint8_t k = 0; k < n_keys; k++)
+            if (key_cols[k]) grp_prepare_nullmap(key_cols[k]);
+
+        /* Phase 3: parallel key gather + agg result building from inline rows */
+        {
+            radix_phase3_ctx_t p3ctx = {
+                .part_hts     = part_hts,
+                .part_offsets = part_offsets,
+                .key_dsts     = key_dsts,
+                .key_types    = key_out_types,
+                .key_attrs    = key_attrs,
+                .key_esizes   = key_esizes,
+                .key_cols     = key_cols,
+                .n_keys       = n_keys,
+                .agg_outs     = agg_outs,
+                .n_aggs       = n_aggs,
+                .key_src_data = key_data,
+            };
+            ray_pool_dispatch_n(pool, radix_phase3_fn, &p3ctx, RADIX_P);
+        }
+
+        /* Fixup: if nullmap prep failed for any VAR/STDDEV agg, re-scan
+         * hash tables sequentially to ensure all null bits were set */
+        for (uint8_t a = 0; a < n_aggs; a++) {
+            if (nullmap_prep_ok[a] || !agg_cols[a]) continue;
+            uint16_t op = agg_outs[a].agg_op;
+            if (op != OP_VAR && op != OP_VAR_POP &&
+                op != OP_STDDEV && op != OP_STDDEV_POP) continue;
+            for (uint32_t p = 0; p < RADIX_P; p++) {
+                group_ht_t* ph = &part_hts[p];
+                uint32_t gc = ph->grp_count;
+                uint32_t off = part_offsets[p];
+                uint16_t rs = ph->layout.row_stride;
+                for (uint32_t gi = 0; gi < gc; gi++) {
+                    const char* row = ph->rows + (size_t)gi * rs;
+                    int64_t cnt = *(const int64_t*)(const void*)row;
+                    bool insuf = (op == OP_VAR || op == OP_STDDEV) ? cnt <= 1 : cnt <= 0;
+                    if (insuf) ray_vec_set_null(agg_outs[a].vec, off + gi, true);
+                }
+            }
+        }
+
+        /* Finalize null flags after parallel execution */
+        for (uint8_t a = 0; a < n_aggs; a++) {
+            if (!agg_cols[a]) continue;
+            grp_finalize_nulls(agg_outs[a].vec);
+        }
+        for (uint8_t k = 0; k < n_keys; k++) {
+            if (!key_cols[k]) continue;
+            grp_finalize_nulls(key_cols[k]);
+        }
+
+        /* Add key columns to result */
+        for (uint8_t k = 0; k < n_keys; k++) {
+            if (!key_cols[k]) continue;
+            ray_op_ext_t* key_ext = find_ext(g, ext->keys[k]->id);
+            int64_t name_id = key_ext ? key_ext->sym : k;
+            result = ray_table_add_col(result, name_id, key_cols[k]);
+            ray_release(key_cols[k]);
+        }
+
+        /* Add agg columns to result */
+        for (uint8_t a = 0; a < n_aggs; a++) {
+            if (!agg_cols[a]) continue;
+            uint16_t agg_op = ext->agg_ops[a];
+            ray_op_ext_t* agg_ext = find_ext(g, ext->agg_ins[a]->id);
+            int64_t name_id;
+            if (agg_ext && agg_ext->base.opcode == OP_SCAN) {
+                ray_t* name_atom = ray_sym_str(agg_ext->sym);
+                const char* base = name_atom ? ray_str_ptr(name_atom) : NULL;
+                size_t blen = base ? ray_str_len(name_atom) : 0;
+                const char* sfx = "";
+                size_t slen = 0;
+                switch (agg_op) {
+                    case OP_SUM:   sfx = "_sum";   slen = 4; break;
+                    case OP_COUNT: sfx = "_count"; slen = 6; break;
+                    case OP_AVG:   sfx = "_mean";  slen = 5; break;
+                    case OP_MIN:   sfx = "_min";   slen = 4; break;
+                    case OP_MAX:   sfx = "_max";   slen = 4; break;
+                    case OP_FIRST: sfx = "_first"; slen = 6; break;
+                    case OP_LAST:  sfx = "_last";  slen = 5; break;
+                    case OP_STDDEV:     sfx = "_stddev";     slen = 7; break;
+                    case OP_STDDEV_POP: sfx = "_stddev_pop"; slen = 11; break;
+                    case OP_VAR:        sfx = "_var";        slen = 4; break;
+                    case OP_VAR_POP:    sfx = "_var_pop";    slen = 8; break;
+                }
+                char buf[256];
+                ray_t* name_dyn_hdr = NULL;
+                char* nbp = buf;
+                size_t nbc = sizeof(buf);
+                if (base && blen + slen >= sizeof(buf)) {
+                    nbp = (char*)scratch_alloc(&name_dyn_hdr, blen + slen + 1);
+                    if (nbp) nbc = blen + slen + 1;
+                    else { nbp = buf; nbc = sizeof(buf); }
+                }
+                if (base && blen + slen < nbc) {
+                    memcpy(nbp, base, blen);
+                    memcpy(nbp + blen, sfx, slen);
+                    name_id = ray_sym_intern(nbp, blen + slen);
+                } else {
+                    name_id = agg_ext->sym;
+                }
+                scratch_free(name_dyn_hdr);
+            } else {
+                name_id = (int64_t)(n_keys + a);
+            }
+            result = ray_table_add_col(result, name_id, agg_cols[a]);
+            ray_release(agg_cols[a]);
+        }
+
+        goto cleanup;
+    }
+
+sequential_fallback:;
+    /* Sequential path using row-layout HT */
+    if (!group_ht_init(&single_ht, ht_cap, &ght_layout)) {
+        result = ray_error("oom", NULL);
+        goto cleanup;
+    }
+    group_rows_range(&single_ht, key_data, key_types, key_attrs, key_vecs, agg_vecs,
+                     0, n_scan, match_idx);
+    final_ht = &single_ht;
+    if (ray_interrupted()) { result = ray_error("cancel", "interrupted"); goto cleanup; }
+    if (single_ht.oom) { result = ray_error("oom", NULL); goto cleanup; }
+
+    /* Build result from sequential HT (inline row layout) */
+    {
+    uint32_t grp_count = final_ht->grp_count;
+    const ght_layout_t* ly = &final_ht->layout;
+    int64_t total_cols = n_keys + n_aggs;
+    result = ray_table_new(total_cols);
+    if (!result || RAY_IS_ERR(result)) goto cleanup;
+
+    /* Key columns: read from inline group rows, narrow to original type.
+     * Wide keys store a source row index in the HT slot; resolve it
+     * through the original key column (key_data[k]) and copy bytes. */
+    for (uint8_t k = 0; k < n_keys; k++) {
+        ray_t* src_col = key_vecs[k];
+        if (!src_col) continue;
+        uint8_t esz = col_esz(src_col);
+        int8_t kt = src_col->type;
+
+        ray_t* new_col = col_vec_new(src_col, (int64_t)grp_count);
+        if (!new_col || RAY_IS_ERR(new_col)) continue;
+        new_col->len = (int64_t)grp_count;
+
+        bool is_wide = (ly->wide_key_mask & (1u << k)) != 0;
+        const char* src_base = is_wide ? (const char*)key_data[k] : NULL;
+
+        for (uint32_t gi = 0; gi < grp_count; gi++) {
+            const char* row = final_ht->rows + (size_t)gi * ly->row_stride;
+            const int64_t* rkeys = (const int64_t*)(row + 8);
+            int64_t kv = rkeys[k];
+            int64_t null_mask = rkeys[n_keys];
+            if (null_mask & (int64_t)(1u << k)) {
+                ray_vec_set_null(new_col, (int64_t)gi, true);
+                continue;
+            }
+            if (is_wide) {
+                char* dst = (char*)ray_data(new_col) + (size_t)gi * esz;
+                memcpy(dst, src_base + (size_t)kv * esz, esz);
+            } else if (kt == RAY_F64) {
+                char* dst = (char*)ray_data(new_col) + (size_t)gi * esz;
+                memcpy(dst, &kv, 8);
+            } else {
+                write_col_i64(ray_data(new_col), gi, kv, kt, new_col->attrs);
+            }
+        }
+
+        ray_op_ext_t* key_ext = find_ext(g, ext->keys[k]->id);
+        int64_t name_id = key_ext ? key_ext->sym : k;
+        result = ray_table_add_col(result, name_id, new_col);
+        ray_release(new_col);
+    }
+
+    /* Agg columns from inline accumulators */
+    for (uint8_t a = 0; a < n_aggs; a++) {
+        uint16_t agg_op = ext->agg_ops[a];
+        ray_t* agg_col = agg_vecs[a];
+        bool is_f64 = agg_col && agg_col->type == RAY_F64;
+        int8_t out_type;
+        switch (agg_op) {
+            case OP_AVG:
+            case OP_STDDEV: case OP_STDDEV_POP:
+            case OP_VAR: case OP_VAR_POP:
+                out_type = RAY_F64; break;
+            case OP_COUNT: out_type = RAY_I64; break;
+            case OP_SUM: case OP_PROD:
+                out_type = is_f64 ? RAY_F64 : RAY_I64; break;
+            default:
+                out_type = agg_col ? agg_col->type : RAY_I64; break;
+        }
+        ray_t* new_col = ray_vec_new(out_type, (int64_t)grp_count);
+        if (!new_col || RAY_IS_ERR(new_col)) continue;
+        new_col->len = (int64_t)grp_count;
+
+        int8_t s = ly->agg_val_slot[a]; /* unified accum slot */
+        for (uint32_t gi = 0; gi < grp_count; gi++) {
+            const char* row = final_ht->rows + (size_t)gi * ly->row_stride;
+            int64_t cnt = *(const int64_t*)(const void*)row;
+            if (out_type == RAY_F64) {
+                double v;
+                switch (agg_op) {
+                    case OP_SUM:
+                        v = is_f64 ? ROW_RD_F64(row, ly->off_sum, s)
+                                   : (double)ROW_RD_I64(row, ly->off_sum, s);
+                        if (agg_affine[a].enabled) v += agg_affine[a].bias_f64 * cnt;
+                        break;
+                    case OP_AVG:
+                        v = is_f64 ? ROW_RD_F64(row, ly->off_sum, s) / cnt
+                                   : (double)ROW_RD_I64(row, ly->off_sum, s) / cnt;
+                        if (agg_affine[a].enabled) v += agg_affine[a].bias_f64;
+                        break;
+                    case OP_MIN:
+                        v = is_f64 ? ROW_RD_F64(row, ly->off_min, s)
+                                   : (double)ROW_RD_I64(row, ly->off_min, s);
+                        break;
+                    case OP_MAX:
+                        v = is_f64 ? ROW_RD_F64(row, ly->off_max, s)
+                                   : (double)ROW_RD_I64(row, ly->off_max, s);
+                        break;
+                    case OP_FIRST: case OP_LAST:
+                        v = is_f64 ? ROW_RD_F64(row, ly->off_sum, s)
+                                   : (double)ROW_RD_I64(row, ly->off_sum, s);
+                        break;
+                    case OP_VAR: case OP_VAR_POP:
+                    case OP_STDDEV: case OP_STDDEV_POP: {
+                        bool insuf = (agg_op == OP_VAR || agg_op == OP_STDDEV) ? cnt <= 1 : cnt <= 0;
+                        if (insuf) { v = 0.0; ray_vec_set_null(new_col, gi, true); break; }
+                        double sum_val = is_f64 ? ROW_RD_F64(row, ly->off_sum, s)
+                                                : (double)ROW_RD_I64(row, ly->off_sum, s);
+                        double sq_val = ly->off_sumsq ? ROW_RD_F64(row, ly->off_sumsq, s) : 0.0;
+                        double mean = sum_val / cnt;
+                        double var_pop = sq_val / cnt - mean * mean;
+                        if (var_pop < 0) var_pop = 0;
+                        if (agg_op == OP_VAR_POP) v = var_pop;
+                        else if (agg_op == OP_VAR) v = var_pop * cnt / (cnt - 1);
+                        else if (agg_op == OP_STDDEV_POP) v = sqrt(var_pop);
+                        else v = sqrt(var_pop * cnt / (cnt - 1));
+                        break;
+                    }
+                    default: v = 0.0; break;
+                }
+                ((double*)ray_data(new_col))[gi] = v;
+            } else {
+                int64_t v;
+                switch (agg_op) {
+                    case OP_SUM:
+                        v = ROW_RD_I64(row, ly->off_sum, s);
+                        if (agg_affine[a].enabled) v += agg_affine[a].bias_i64 * cnt;
+                        break;
+                    case OP_COUNT: v = cnt; break;
+                    case OP_MIN:   v = ROW_RD_I64(row, ly->off_min, s); break;
+                    case OP_MAX:   v = ROW_RD_I64(row, ly->off_max, s); break;
+                    case OP_FIRST: case OP_LAST: v = ROW_RD_I64(row, ly->off_sum, s); break;
+                    default:       v = 0; break;
+                }
+                ((int64_t*)ray_data(new_col))[gi] = v;
+            }
+        }
+
+        /* Generate unique column name */
+        ray_op_ext_t* agg_ext = find_ext(g, ext->agg_ins[a]->id);
+        int64_t name_id;
+        if (agg_ext && agg_ext->base.opcode == OP_SCAN) {
+            ray_t* name_atom = ray_sym_str(agg_ext->sym);
+            const char* base = name_atom ? ray_str_ptr(name_atom) : NULL;
+            size_t blen = base ? ray_str_len(name_atom) : 0;
+            const char* sfx = "";
+            size_t slen = 0;
+            switch (agg_op) {
+                case OP_SUM:   sfx = "_sum";   slen = 4; break;
+                case OP_COUNT: sfx = "_count"; slen = 6; break;
+                case OP_AVG:   sfx = "_mean";  slen = 5; break;
+                case OP_MIN:   sfx = "_min";   slen = 4; break;
+                case OP_MAX:   sfx = "_max";   slen = 4; break;
+                case OP_FIRST: sfx = "_first"; slen = 6; break;
+                case OP_LAST:  sfx = "_last";  slen = 5; break;
+                case OP_STDDEV:     sfx = "_stddev";     slen = 7; break;
+                case OP_STDDEV_POP: sfx = "_stddev_pop"; slen = 11; break;
+                case OP_VAR:        sfx = "_var";        slen = 4; break;
+                case OP_VAR_POP:    sfx = "_var_pop";    slen = 8; break;
+            }
+            char buf[256];
+            if (base && blen + slen < sizeof(buf)) {
+                memcpy(buf, base, blen);
+                memcpy(buf + blen, sfx, slen);
+                name_id = ray_sym_intern(buf, blen + slen);
+            } else {
+                name_id = agg_ext->sym;
+            }
+        } else {
+            /* Expression agg input — synthetic name like "_e0_sum" */
+            char nbuf[32];
+            int np = 0;
+            nbuf[np++] = '_'; nbuf[np++] = 'e';
+            /* Multi-digit agg index */
+            { uint8_t v = a; char dig[3]; int nd = 0;
+              do { dig[nd++] = (char)('0' + v % 10); v /= 10; } while (v);
+              while (nd--) nbuf[np++] = dig[nd]; }
+            const char* nsfx = "";
+            size_t nslen = 0;
+            switch (agg_op) {
+                case OP_SUM:   nsfx = "_sum";   nslen = 4; break;
+                case OP_COUNT: nsfx = "_count"; nslen = 6; break;
+                case OP_AVG:   nsfx = "_mean";  nslen = 5; break;
+                case OP_MIN:   nsfx = "_min";   nslen = 4; break;
+                case OP_MAX:   nsfx = "_max";   nslen = 4; break;
+                case OP_FIRST: nsfx = "_first"; nslen = 6; break;
+                case OP_LAST:  nsfx = "_last";  nslen = 5; break;
+                case OP_STDDEV:     nsfx = "_stddev";     nslen = 7; break;
+                case OP_STDDEV_POP: nsfx = "_stddev_pop"; nslen = 11; break;
+                case OP_VAR:        nsfx = "_var";        nslen = 4; break;
+                case OP_VAR_POP:    nsfx = "_var_pop";    nslen = 8; break;
+            }
+            memcpy(nbuf + np, nsfx, nslen);
+            name_id = ray_sym_intern(nbuf, (size_t)np + nslen);
+        }
+        result = ray_table_add_col(result, name_id, new_col);
+        ray_release(new_col);
+    }
+    }
+
+cleanup:
+    if (final_ht == &single_ht) {
+        group_ht_free(&single_ht);
+    }
+    if (radix_bufs) {
+        size_t n_bufs = (size_t)n_total * RADIX_P;
+        for (size_t i = 0; i < n_bufs; i++) scratch_free(radix_bufs[i]._hdr);
+        scratch_free(radix_bufs_hdr);
+    }
+    if (part_hts) {
+        for (uint32_t p = 0; p < RADIX_P; p++) {
+            if (part_hts[p].rows) group_ht_free(&part_hts[p]);
+        }
+        scratch_free(part_hts_hdr);
+    }
+    for (uint8_t a = 0; a < n_aggs; a++)
+        if (agg_owned[a] && agg_vecs[a]) ray_release(agg_vecs[a]);
+    for (uint8_t k = 0; k < n_keys; k++)
+        if (key_owned[k] && key_vecs[k]) ray_release(key_vecs[k]);
+    if (match_idx_block) ray_release(match_idx_block);
+
+    return result;
+}
+
+/* --------------------------------------------------------------------------
+ * exec_group_per_partition — per-partition GROUP BY with merge
+ *
+ * Runs exec_group on each partition independently (zero-copy mmap segments),
+ * then merges the small partial results via a second exec_group pass.
+ *
+ * Merge ops: SUM→SUM, COUNT→SUM, MIN→MIN, MAX→MAX, FIRST→FIRST, LAST→LAST.
+ * AVG: decomposed into SUM+COUNT per partition, merged, then divided.
+ * STDDEV/VAR: decomposed into SUM(x)+SUM(x²)+COUNT(x) per partition,
+ *   merged with SUM, then final variance/stddev computed from merged totals.
+ *
+ * Returns NULL if any step fails (caller falls through to concat path).
+ * -------------------------------------------------------------------------- */
+static ray_t* __attribute__((noinline))
+exec_group_per_partition(ray_t* parted_tbl, ray_op_ext_t* ext,
+                         int32_t n_parts, const int64_t* key_syms,
+                         const int64_t* agg_syms, int has_avg,
+                         int has_stddev, int64_t group_limit) {
+
+    uint8_t n_keys = ext->n_keys;
+    uint8_t n_aggs = ext->n_aggs;
+
+    /* Guard: fixed-size arrays below cap at 24 agg ops.
+     * Each AVG adds 1 extra (COUNT), each STDDEV/VAR adds 2 (SUM_SQ + COUNT).
+     * n_aggs + n_avg + 2*n_std must stay within 24. */
+    if (n_aggs > 8 || n_keys > 8) return NULL;
+
+    /* Identify MAPCOMMON vs PARTED keys.  MAPCOMMON keys are constant
+     * within a partition, so they are excluded from per-partition GROUP BY
+     * and reconstructed after concat. */
+    uint8_t  n_mc_keys = 0;
+    int64_t  mc_sym_ids[8];
+    uint8_t  n_part_keys = 0;
+    int64_t  pk_syms[8];       /* non-MAPCOMMON key sym IDs */
+
+    for (uint8_t k = 0; k < n_keys; k++) {
+        ray_t* pcol = ray_table_get_col(parted_tbl, key_syms[k]);
+        if (pcol && pcol->type == RAY_MAPCOMMON) {
+            mc_sym_ids[n_mc_keys++] = key_syms[k];
+        } else {
+            pk_syms[n_part_keys++] = key_syms[k];
+        }
+    }
+
+    /* LIMIT pushdown: when all GROUP BY keys are MAPCOMMON (n_part_keys==0),
+     * each partition produces exactly 1 group.  Limit the partition loop. */
+    if (group_limit > 0 && n_part_keys == 0 && group_limit < n_parts)
+        n_parts = (int32_t)group_limit;
+
+    /* Decomposition: AVG(x) → SUM(x) + COUNT(x).
+     * STDDEV/VAR(x) → SUM(x) + SUM(x²) + COUNT(x).
+     * Build per-partition agg_ops with decomposed ops, then merge ops. */
+    uint16_t part_ops[24];   /* per-partition agg ops */
+    uint16_t merge_ops[24];  /* merge agg ops */
+    uint8_t  avg_idx[8];     /* which original agg slots are AVG */
+    uint8_t  std_idx[8];     /* which original agg slots are STDDEV/VAR */
+    uint16_t std_orig_op[8]; /* original op for each std slot */
+    uint8_t  n_avg = 0;
+    uint8_t  n_std = 0;
+    uint8_t  part_n_aggs = n_aggs;
+    /* std_sq_slot[i] / std_cnt_slot[i]: indices into part_ops of the SUM(x²) and COUNT slots for the i-th STDDEV/VAR agg */
+    uint8_t  std_sq_slot[8];
+    uint8_t  std_cnt_slot[8];
+
+    for (uint8_t a = 0; a < n_aggs; a++) {
+        uint16_t aop = ext->agg_ops[a];
+        if (aop == OP_AVG) {
+            part_ops[a] = OP_SUM;     /* partition: compute SUM */
+            avg_idx[n_avg++] = a;
+        } else if (aop == OP_STDDEV || aop == OP_STDDEV_POP ||
+                   aop == OP_VAR || aop == OP_VAR_POP) {
+            part_ops[a] = OP_SUM;     /* partition: compute SUM(x) */
+            std_orig_op[n_std] = aop;
+            std_idx[n_std++] = a;
+        } else {
+            part_ops[a] = aop;
+        }
+    }
+    /* Guard: total decomposed slots must fit */
+    if (n_aggs + n_avg + 2 * n_std > 24) return NULL;
+
+    /* Append SUM(x²) for each STDDEV/VAR slot */
+    for (uint8_t i = 0; i < n_std; i++) {
+        std_sq_slot[i] = part_n_aggs;
+        part_ops[part_n_aggs++] = OP_SUM;  /* SUM(x²) */
+    }
+    /* Append COUNT for each AVG column */
+    for (uint8_t i = 0; i < n_avg; i++)
+        part_ops[part_n_aggs++] = OP_COUNT;
+    /* Append COUNT for each STDDEV/VAR column */
+    for (uint8_t i = 0; i < n_std; i++) {
+        std_cnt_slot[i] = part_n_aggs;
+        part_ops[part_n_aggs++] = OP_COUNT;
+    }
+
+    /* Merge ops: SUM→SUM, COUNT→SUM, MIN→MIN, MAX→MAX,
+     * FIRST→FIRST, LAST→LAST, all appended slots → SUM */
+    for (uint8_t a = 0; a < part_n_aggs; a++) {
+        merge_ops[a] = part_ops[a];
+        if (merge_ops[a] == OP_COUNT) merge_ops[a] = OP_SUM;
+    }
+
+    /* Agg input syms for the decomposed ops.
+     * AVG's COUNT uses same input column as the AVG itself.
+     * STDDEV's SUM(x²) and COUNT use same input column as the STDDEV. */
+    int64_t part_agg_syms[24];
+    /* Flag: slot needs x*x graph node (for SUM(x²)) */
+    int part_needs_sq[24];
+    memset(part_needs_sq, 0, sizeof(part_needs_sq));
+
+    for (uint8_t a = 0; a < n_aggs; a++)
+        part_agg_syms[a] = agg_syms[a];
+    /* SUM(x²) slots for STDDEV/VAR */
+    for (uint8_t i = 0; i < n_std; i++) {
+        part_agg_syms[std_sq_slot[i]] = agg_syms[std_idx[i]];
+        part_needs_sq[std_sq_slot[i]] = 1;
+    }
+    /* COUNT slots for AVG */
+    for (uint8_t i = 0; i < n_avg; i++)
+        part_agg_syms[n_aggs + n_std + i] = agg_syms[avg_idx[i]];
+    /* COUNT slots for STDDEV/VAR */
+    for (uint8_t i = 0; i < n_std; i++)
+        part_agg_syms[std_cnt_slot[i]] = agg_syms[std_idx[i]];
+
+    /* ---- Batched incremental merge ----
+     * Process partitions in batches of MERGE_BATCH.  After each batch:
+     *   Phase 1: exec_group each partition in batch → batch_partials[]
+     *   Phase 2: concat (running + batch_partials + MAPCOMMON) → merge_tbl
+     *   Phase 3: merge GROUP BY → new running
+     * Bounds peak memory to O(MERGE_BATCH × groups_per_partition). */
+#define MERGE_BATCH 8
+
+    /* Capture agg column name IDs from first partition result */
+    int64_t agg_name_ids[24];
+    int agg_names_captured = 0;
+
+    ray_t* running = NULL;
+    ray_t* merge_tbl = NULL;      /* last merge table (for column name fixup) */
+
+    for (int32_t batch_start = 0; batch_start < n_parts;
+         batch_start += MERGE_BATCH) {
+
+        int32_t batch_end = batch_start + MERGE_BATCH;
+        if (batch_end > n_parts) batch_end = n_parts;
+        int32_t batch_n = batch_end - batch_start;
+
+        /* Phase 1: exec_group each partition in this batch */
+        ray_t* bp[MERGE_BATCH];
+        memset(bp, 0, sizeof(bp));
+
+        for (int32_t bi = 0; bi < batch_n; bi++) {
+            int32_t p = batch_start + bi;
+
+            /* Collect unique agg input sym IDs (avoid duplicate columns) */
+            int64_t unique_agg[24];
+            int n_unique_agg = 0;
+            for (uint8_t a = 0; a < part_n_aggs; a++) {
+                int dup = 0;
+                for (int j = 0; j < n_unique_agg; j++)
+                    if (unique_agg[j] == part_agg_syms[a]) { dup = 1; break; }
+                if (!dup) {
+                    for (uint8_t k = 0; k < n_keys; k++)
+                        if (key_syms[k] == part_agg_syms[a]) { dup = 1; break; }
+                    if (!dup) unique_agg[n_unique_agg++] = part_agg_syms[a];
+                }
+            }
+
+            ray_t* sub = ray_table_new((int64_t)(n_part_keys + n_unique_agg));
+            if (!sub || RAY_IS_ERR(sub)) goto batch_fail;
+
+            for (uint8_t k = 0; k < n_part_keys; k++) {
+                ray_t* pcol = ray_table_get_col(parted_tbl, pk_syms[k]);
+                if (!pcol || !RAY_IS_PARTED(pcol->type)) {
+                    ray_release(sub); goto batch_fail;
+                }
+                ray_t* seg = ((ray_t**)ray_data(pcol))[p];
+                if (!seg) { ray_release(sub); goto batch_fail; }
+                ray_retain(seg);
+                sub = ray_table_add_col(sub, pk_syms[k], seg);
+                ray_release(seg);
+            }
+            for (int j = 0; j < n_unique_agg; j++) {
+                ray_t* pcol = ray_table_get_col(parted_tbl, unique_agg[j]);
+                if (!pcol || !RAY_IS_PARTED(pcol->type)) {
+                    ray_release(sub); goto batch_fail;
+                }
+                ray_t* seg = ((ray_t**)ray_data(pcol))[p];
+                if (!seg) { ray_release(sub); goto batch_fail; }
+                ray_retain(seg);
+                sub = ray_table_add_col(sub, unique_agg[j], seg);
+                ray_release(seg);
+            }
+
+            ray_graph_t* pg = ray_graph_new(sub);
+            if (!pg) { ray_release(sub); goto batch_fail; }
+
+            ray_op_t* pkeys[8];
+            for (uint8_t k = 0; k < n_part_keys; k++) {
+                ray_t* sym_atom = ray_sym_str(pk_syms[k]);
+                pkeys[k] = ray_scan(pg, ray_str_ptr(sym_atom));
+            }
+            ray_op_t* pagg_ins[24];
+            for (uint8_t a = 0; a < part_n_aggs; a++) {
+                ray_t* sym_atom = ray_sym_str(part_agg_syms[a]);
+                pagg_ins[a] = ray_scan(pg, ray_str_ptr(sym_atom));
+            }
+            for (uint8_t j = 0; j < n_std; j++) {
+                uint8_t sq = std_sq_slot[j];
+                ray_op_t* x = pagg_ins[sq];
+                pagg_ins[sq] = ray_mul(pg, x, x);
+            }
+
+            ray_op_t* proot = ray_group(pg, pkeys, n_part_keys,
+                                       part_ops, pagg_ins, part_n_aggs);
+            proot = ray_optimize(pg, proot);
+            bp[bi] = ray_execute(pg, proot);
+            ray_graph_free(pg);
+            ray_release(sub);
+
+            if (!bp[bi] || RAY_IS_ERR(bp[bi])) goto batch_fail;
+
+            /* Capture agg column name IDs once (all partials share names) */
+            if (!agg_names_captured) {
+                for (uint8_t a = 0; a < part_n_aggs; a++)
+                    agg_name_ids[a] = ray_table_col_name(
+                        bp[bi], (int64_t)n_part_keys + a);
+                agg_names_captured = 1;
+            }
+        }
+
+        /* Phase 2: concat (running + batch_partials + MAPCOMMON) */
+        int64_t mrows = running ? ray_table_nrows(running) : 0;
+        for (int32_t i = 0; i < batch_n; i++)
+            mrows += ray_table_nrows(bp[i]);
+
+        if (merge_tbl) { ray_release(merge_tbl); merge_tbl = NULL; }
+        merge_tbl = ray_table_new((int64_t)(n_keys + part_n_aggs));
+        if (!merge_tbl || RAY_IS_ERR(merge_tbl)) {
+            merge_tbl = NULL; goto batch_fail;
+        }
+
+        /* Key columns */
+        for (uint8_t k = 0; k < n_keys; k++) {
+            int is_mc = 0;
+            for (uint8_t m = 0; m < n_mc_keys; m++)
+                if (mc_sym_ids[m] == key_syms[k]) { is_mc = 1; break; }
+
+            /* Type reference for column allocation */
+            ray_t* tref = NULL;
+            if (running) {
+                tref = ray_table_get_col(running, key_syms[k]);
+            } else if (is_mc) {
+                ray_t* mc_col = ray_table_get_col(parted_tbl, key_syms[k]);
+                tref = ((ray_t**)ray_data(mc_col))[0];
+            } else {
+                tref = ray_table_get_col(bp[0], key_syms[k]);
+            }
+            if (!tref) goto batch_fail;
+
+            size_t esz = (size_t)col_esz(tref);
+            ray_t* flat = col_vec_new(tref, mrows);
+            if (!flat || RAY_IS_ERR(flat)) goto batch_fail;
+            flat->len = mrows;
+            char* out = (char*)ray_data(flat);
+            int64_t off = 0;
+
+            /* Copy from running result */
+            if (running) {
+                ray_t* rc = ray_table_get_col(running, key_syms[k]);
+                if (rc && rc->len > 0) {
+                    memcpy(out, ray_data(rc), (size_t)rc->len * esz);
+                    off = rc->len;
+                }
+            }
+
+            /* Copy from batch partials */
+            for (int32_t i = 0; i < batch_n; i++) {
+                int64_t pnrows = ray_table_nrows(bp[i]);
+                if (is_mc) {
+                    /* MAPCOMMON: replicate this partition's key value */
+                    int32_t p = batch_start + i;
+                    ray_t* mc_col = ray_table_get_col(parted_tbl, key_syms[k]);
+                    ray_t* mc_kv = ((ray_t**)ray_data(mc_col))[0];
+                    const char* kdata = (const char*)ray_data(mc_kv);
+                    for (int64_t r = 0; r < pnrows; r++)
+                        memcpy(out + (size_t)(off + r) * esz,
+                               kdata + (size_t)p * esz, esz);
+                    off += pnrows;
+                } else {
+                    ray_t* pc = ray_table_get_col(bp[i], key_syms[k]);
+                    if (pc && pc->len > 0) {
+                        memcpy(out + (size_t)off * esz,
+                               ray_data(pc), (size_t)pc->len * esz);
+                        off += pc->len;
+                    }
+                }
+            }
+
+            merge_tbl = ray_table_add_col(merge_tbl, key_syms[k], flat);
+            ray_release(flat);
+        }
+
+        /* Agg columns */
+        for (uint8_t a = 0; a < part_n_aggs; a++) {
+            ray_t* tref = running
+                ? ray_table_get_col_idx(running, (int64_t)n_keys + a)
+                : ray_table_get_col_idx(bp[0], (int64_t)n_part_keys + a);
+            if (!tref) goto batch_fail;
+
+            size_t esz = (size_t)col_esz(tref);
+            ray_t* flat = col_vec_new(tref, mrows);
+            if (!flat || RAY_IS_ERR(flat)) goto batch_fail;
+            flat->len = mrows;
+            char* out = (char*)ray_data(flat);
+            int64_t off = 0;
+
+            if (running) {
+                ray_t* rc = ray_table_get_col_idx(running, (int64_t)n_keys + a);
+                if (rc && rc->len > 0) {
+                    memcpy(out, ray_data(rc), (size_t)rc->len * esz);
+                    off = rc->len;
+                }
+            }
+
+            for (int32_t i = 0; i < batch_n; i++) {
+                ray_t* pc = ray_table_get_col_idx(bp[i],
+                                                 (int64_t)n_part_keys + a);
+                if (pc && pc->len > 0) {
+                    memcpy(out + (size_t)off * esz,
+                           ray_data(pc), (size_t)pc->len * esz);
+                    off += pc->len;
+                }
+            }
+
+            merge_tbl = ray_table_add_col(merge_tbl, agg_name_ids[a], flat);
+            ray_release(flat);
+        }
+
+        /* Free batch partials */
+        for (int32_t i = 0; i < batch_n; i++) {
+            ray_release(bp[i]);
+            bp[i] = NULL;
+        }
+
+        /* Phase 3: merge GROUP BY */
+        ray_graph_t* mg = ray_graph_new(merge_tbl);
+        if (!mg) goto batch_fail;
+
+        ray_op_t* mkeys[8];
+        for (uint8_t k = 0; k < n_keys; k++) {
+            ray_t* sym_atom = ray_sym_str(key_syms[k]);
+            mkeys[k] = ray_scan(mg, ray_str_ptr(sym_atom));
+        }
+
+        ray_op_t* magg_ins[24];
+        for (uint8_t a = 0; a < part_n_aggs; a++) {
+            ray_t* agg_name = ray_sym_str(agg_name_ids[a]);
+            magg_ins[a] = ray_scan(mg, ray_str_ptr(agg_name));
+        }
+
+        ray_op_t* mroot = ray_group(mg, mkeys, n_keys,
+                                   merge_ops, magg_ins, part_n_aggs);
+        mroot = ray_optimize(mg, mroot);
+        ray_t* new_running = ray_execute(mg, mroot);
+        ray_graph_free(mg);
+
+        if (running) ray_release(running);
+        running = new_running;
+
+        if (!running || RAY_IS_ERR(running)) {
+            ray_release(merge_tbl);
+            return NULL;
+        }
+
+        /* Rename running's agg columns back to the original partial names.
+         * Without this, each merge adds an extra suffix (e.g. v1_sum → v1_sum_sum). */
+        for (uint8_t a = 0; a < part_n_aggs; a++)
+            ray_table_set_col_name(running, (int64_t)n_keys + a, agg_name_ids[a]);
+
+        continue;
+
+batch_fail:
+        for (int32_t i = 0; i < batch_n; i++)
+            if (bp[i]) ray_release(bp[i]);
+        if (running) ray_release(running);
+        if (merge_tbl) ray_release(merge_tbl);
+        return NULL;
+    }
+
+    ray_t* result = running;
+
+    if (!result || RAY_IS_ERR(result)) {
+        if (merge_tbl) ray_release(merge_tbl);
+        return NULL;
+    }
+
+    int64_t rncols = ray_table_ncols(result);
+
+    /* AVG/STDDEV post-processing: build trimmed table (n_keys + n_aggs cols),
+     * computing final AVG = SUM/COUNT and STDDEV/VAR from SUM, SUM_SQ, COUNT. */
+    if (has_avg || has_stddev) {
+        ray_t* trimmed = ray_table_new((int64_t)(n_keys + n_aggs));
+        if (!trimmed || RAY_IS_ERR(trimmed)) {
+            ray_release(result);
+            if (merge_tbl) ray_release(merge_tbl);
+            return NULL;
+        }
+
+        for (int64_t c = 0; c < (int64_t)(n_keys + n_aggs) && c < rncols; c++) {
+            int64_t nm = ray_table_col_name(result, c);
+
+            /* Check if this agg column is an AVG or STDDEV/VAR slot */
+            int is_avg_slot = 0, is_std_slot = 0;
+            uint8_t avg_i = 0, std_i = 0;
+            if (c >= n_keys) {
+                uint8_t a = (uint8_t)(c - n_keys);
+                for (uint8_t j = 0; j < n_avg; j++) {
+                    if (avg_idx[j] == a) { is_avg_slot = 1; avg_i = j; break; }
+                }
+                for (uint8_t j = 0; j < n_std; j++) {
+                    if (std_idx[j] == a) { is_std_slot = 1; std_i = j; break; }
+                }
+            }
+
+            if (is_avg_slot) {
+                /* AVG = SUM(x) / COUNT(x) */
+                int64_t sum_ci = c;
+                /* AVG COUNT slots: after n_aggs + n_std SUM_SQ slots */
+                int64_t cnt_ci = (int64_t)n_keys + n_aggs + n_std + avg_i;
+                ray_t* sum_col = ray_table_get_col_idx(result, sum_ci);
+                ray_t* cnt_col = (cnt_ci < rncols) ? ray_table_get_col_idx(result, cnt_ci) : NULL;
+                if (!sum_col || !cnt_col) {
+                    if (sum_col) {
+                        ray_retain(sum_col);
+                        trimmed = ray_table_add_col(trimmed, nm, sum_col);
+                        ray_release(sum_col);
+                    }
+                    continue;
+                }
+
+                int64_t nrows = sum_col->len;
+                ray_t* avg_col = ray_vec_new(RAY_F64, nrows);
+                if (!avg_col || RAY_IS_ERR(avg_col)) {
+                    ray_release(trimmed); ray_release(result);
+                    if (merge_tbl) ray_release(merge_tbl);
+                    return NULL;
+                }
+                avg_col->len = nrows;
+
+                double* out = (double*)ray_data(avg_col);
+                if (sum_col->type == RAY_F64) {
+                    const double* sv = (const double*)ray_data(sum_col);
+                    const int64_t* cv = (const int64_t*)ray_data(cnt_col);
+                    for (int64_t r = 0; r < nrows; r++)
+                        out[r] = cv[r] > 0 ? sv[r] / (double)cv[r] : 0.0;
+                } else {
+                    const int64_t* sv = (const int64_t*)ray_data(sum_col);
+                    const int64_t* cv = (const int64_t*)ray_data(cnt_col);
+                    for (int64_t r = 0; r < nrows; r++)
+                        out[r] = cv[r] > 0 ? (double)sv[r] / (double)cv[r] : 0.0;
+                }
+                trimmed = ray_table_add_col(trimmed, nm, avg_col);
+                ray_release(avg_col);
+            } else if (is_std_slot) {
+                /* STDDEV/VAR from merged SUM(x), SUM(x²), COUNT(x):
+                 * var_pop = SUM_SQ/N - (SUM/N)²
+                 * var_samp = var_pop * N/(N-1)
+                 * stddev_pop = sqrt(var_pop), stddev_samp = sqrt(var_samp) */
+                int64_t sum_ci = c;
+                int64_t sq_ci  = (int64_t)n_keys + std_sq_slot[std_i];
+                int64_t cnt_ci = (int64_t)n_keys + std_cnt_slot[std_i];
+                ray_t* sum_col = ray_table_get_col_idx(result, sum_ci);
+                ray_t* sq_col  = (sq_ci < rncols) ? ray_table_get_col_idx(result, sq_ci) : NULL;
+                ray_t* cnt_col = (cnt_ci < rncols) ? ray_table_get_col_idx(result, cnt_ci) : NULL;
+                if (!sum_col || !sq_col || !cnt_col) {
+                    if (sum_col) {
+                        ray_retain(sum_col);
+                        trimmed = ray_table_add_col(trimmed, nm, sum_col);
+                        ray_release(sum_col);
+                    }
+                    continue;
+                }
+
+                int64_t nrows = sum_col->len;
+                ray_t* out_col = ray_vec_new(RAY_F64, nrows);
+                if (!out_col || RAY_IS_ERR(out_col)) {
+                    ray_release(trimmed); ray_release(result);
+                    if (merge_tbl) ray_release(merge_tbl);
+                    return NULL;
+                }
+                out_col->len = nrows;
+                double* out = (double*)ray_data(out_col);
+
+                uint16_t orig_op = std_orig_op[std_i];
+                /* SUM(x) is always F64 after merge (SUM produces F64 for F64 input,
+                 * I64 for integer input; SUM(x²) via ray_mul always produces F64). */
+                const double* sq = (const double*)ray_data(sq_col);
+                const int64_t* cv = (const int64_t*)ray_data(cnt_col);
+                if (sum_col->type == RAY_F64) {
+                    const double* sv = (const double*)ray_data(sum_col);
+                    for (int64_t r = 0; r < nrows; r++) {
+                        double n = (double)cv[r];
+                        if (n <= 0) { out[r] = 0.0; ray_vec_set_null(out_col, r, true); continue; }
+                        double mean = sv[r] / n;
+                        double var_pop = sq[r] / n - mean * mean;
+                        if (var_pop < 0) var_pop = 0;
+                        bool insuf = (orig_op == OP_VAR || orig_op == OP_STDDEV) && n <= 1;
+                        if (insuf) { out[r] = 0.0; ray_vec_set_null(out_col, r, true); continue; }
+                        if (orig_op == OP_VAR_POP)         out[r] = var_pop;
+                        else if (orig_op == OP_VAR)         out[r] = var_pop * n / (n - 1);
+                        else if (orig_op == OP_STDDEV_POP)  out[r] = sqrt(var_pop);
+                        else /* OP_STDDEV */                out[r] = sqrt(var_pop * n / (n - 1));
+                    }
+                } else {
+                    const int64_t* sv = (const int64_t*)ray_data(sum_col);
+                    for (int64_t r = 0; r < nrows; r++) {
+                        double n = (double)cv[r];
+                        if (n <= 0) { out[r] = 0.0; ray_vec_set_null(out_col, r, true); continue; }
+                        double mean = (double)sv[r] / n;
+                        double var_pop = sq[r] / n - mean * mean;
+                        if (var_pop < 0) var_pop = 0;
+                        bool insuf = (orig_op == OP_VAR || orig_op == OP_STDDEV) && n <= 1;
+                        if (insuf) { out[r] = 0.0; ray_vec_set_null(out_col, r, true); continue; }
+                        if (orig_op == OP_VAR_POP)         out[r] = var_pop;
+                        else if (orig_op == OP_VAR)         out[r] = var_pop * n / (n - 1);
+                        else if (orig_op == OP_STDDEV_POP)  out[r] = sqrt(var_pop);
+                        else /* OP_STDDEV */                out[r] = sqrt(var_pop * n / (n - 1));
+                    }
+                }
+                trimmed = ray_table_add_col(trimmed, nm, out_col);
+                ray_release(out_col);
+            } else {
+                ray_t* col = ray_table_get_col_idx(result, c);
+                if (col) {
+                    ray_retain(col);
+                    trimmed = ray_table_add_col(trimmed, nm, col);
+                    ray_release(col);
+                }
+            }
+        }
+        ray_release(result);
+        result = trimmed;
+        rncols = ray_table_ncols(result);
+    }
+
+    /* Agg column names already fixed by ray_table_set_col_name inside batch loop.
+     * Apply final name fixup for the user-facing n_aggs columns (trim decomposed extras). */
+    for (uint8_t a = 0; a < n_aggs && (int64_t)(n_keys + a) < rncols; a++)
+        ray_table_set_col_name(result, (int64_t)n_keys + a, agg_name_ids[a]);
+
+    if (merge_tbl) ray_release(merge_tbl);
+    return result;
+}
+
+/* ══════════════════════════════════════════════════════════════════════
+ * pivot_ingest_run — shared parallel hash-aggregate for pivot
+ *
+ * Mirrors the phase1+phase2 radix pipeline exec_group uses, leaving
+ * the result in per-partition HTs with prefix offsets so the caller
+ * can iterate grouped rows without knowing about the radix internals.
+ * Falls back to a single sequential HT for tiny inputs or when no
+ * pool is available — the caller iterates n_parts ∈ {1, RADIX_P}.
+ * ══════════════════════════════════════════════════════════════════════ */
+
+/* Sequential fallback used by pivot_ingest_run: aggregate rows [0, n_scan)
+ * into the single caller-provided hash table and publish it through `out`
+ * as a one-partition result.
+ *
+ *   out        — result descriptor; out->part_offsets must already point at
+ *                a buffer with room for at least two entries.
+ *   ly         — group-HT layout; only row_stride is read here.
+ *   key_data, key_types, key_attrs, key_vecs, agg_vecs
+ *              — forwarded unchanged to group_rows_range.
+ *   n_scan     — number of input rows to ingest.
+ *   scratch_ht — hash table that receives the groups.
+ *
+ * scratch_ht->oom is not inspected here; pivot_ingest_run checks it after
+ * this function returns. */
+static void pivot_ingest_sequential(pivot_ingest_t* out, const ght_layout_t* ly,
+                                     void** key_data, int8_t* key_types,
+                                     uint8_t* key_attrs, ray_t** key_vecs,
+                                     ray_t** agg_vecs, int64_t n_scan,
+                                     group_ht_t* scratch_ht) {
+    out->part_hts = scratch_ht;
+    out->n_parts = 1;
+    out->row_stride = ly->row_stride;
+    group_rows_range(scratch_ht, key_data, key_types, key_attrs, key_vecs,
+                     agg_vecs, 0, n_scan, NULL);
+    /* One partition: its groups span [0, grp_count). */
+    out->total_grps = scratch_ht->grp_count;
+    out->part_offsets[0] = 0;
+    out->part_offsets[1] = scratch_ht->grp_count;
+}
+
+/* Hash-aggregate n_scan input rows into `out`.
+ *
+ *   out       — zeroed on entry, then filled with the partition hash tables,
+ *               prefix offsets (part_offsets[0..n_parts]) and total_grps.
+ *   ly        — group-HT layout (row_stride, entry_stride, n_keys are read).
+ *   key_data, key_types, key_attrs, key_vecs, agg_vecs
+ *             — key/aggregate inputs, forwarded to the ingest workers.
+ *   n_scan    — number of rows to ingest.
+ *
+ * Returns false on allocation failure or when an out-of-memory condition is
+ * detected during ingest. Returns true on success, and also when the run was
+ * interrupted — the caller has to consult ray_interrupted() in that case.
+ * Whatever was allocated stays reachable from `out`; release it with
+ * pivot_ingest_free. */
+bool pivot_ingest_run(pivot_ingest_t* out,
+                      const ght_layout_t* ly,
+                      void** key_data, int8_t* key_types, uint8_t* key_attrs,
+                      ray_t** key_vecs, ray_t** agg_vecs,
+                      int64_t n_scan) {
+    memset(out, 0, sizeof(*out));
+    out->row_stride = ly->row_stride;
+
+    /* Allocate a small offsets buffer up front (RADIX_P+1 is the max). */
+    out->part_offsets = (uint32_t*)scratch_alloc(&out->_offsets_hdr,
+        (size_t)(RADIX_P + 1) * sizeof(uint32_t));
+    if (!out->part_offsets) return false;
+
+    uint8_t n_keys = ly->n_keys;
+
+    ray_pool_t* pool = ray_pool_get();
+    uint32_t n_total = pool ? ray_pool_total_workers(pool) : 1;
+    bool parallel_ok = (pool && n_scan >= RAY_PARALLEL_THRESHOLD && n_total > 1);
+
+    if (!parallel_ok) {
+        /* Sequential single-HT path — allocate the HT in its own scratch
+         * block and wire part_hts/n_parts immediately so every failure
+         * below funnels through pivot_ingest_free for cleanup. */
+        group_ht_t* seq = (group_ht_t*)scratch_calloc(&out->_part_hts_hdr,
+            sizeof(group_ht_t));
+        if (!seq) return false;
+        out->part_hts = seq;
+        out->n_parts = 1;
+        /* Capacity: smallest power of two >= 2 * n_scan, clamped to
+         * [1024, 2^24]. */
+        uint32_t seq_cap = 1024;
+        uint64_t target = (uint64_t)n_scan * 2;
+        while ((uint64_t)seq_cap < target && seq_cap < (1u << 24)) seq_cap <<= 1;
+        if (!group_ht_init(seq, seq_cap, ly)) return false;
+        pivot_ingest_sequential(out, ly, key_data, key_types, key_attrs,
+                                key_vecs, agg_vecs, n_scan, seq);
+        /* Surface grow-path OOM from group_probe_entry so callers don't
+         * silently see a truncated result. */
+        if (seq->oom) return false;
+        return true;
+    }
+
+    /* ═════ Parallel radix path ═════ */
+    /* One buffer per (worker, partition) pair. */
+    size_t n_bufs = (size_t)n_total * RADIX_P;
+    out->_n_bufs = n_bufs;
+    radix_buf_t* radix_bufs = (radix_buf_t*)scratch_calloc(&out->_radix_bufs_hdr,
+        n_bufs * sizeof(radix_buf_t));
+    if (!radix_bufs) return false;
+    out->_radix_bufs = radix_bufs;
+
+    /* Initial per-buffer capacity: an even share of the rows plus 50%
+     * headroom, at least 64 entries, with the total pre-allocation capped
+     * at 2 GiB. */
+    uint32_t buf_init = (uint32_t)((uint64_t)n_scan / (RADIX_P * n_total));
+    if (buf_init < 64) buf_init = 64;
+    buf_init = buf_init + buf_init / 2;
+    uint16_t estride = ly->entry_stride;
+    {
+        size_t total_pre = (size_t)n_bufs * buf_init * estride;
+        if (total_pre > (size_t)2 << 30) {
+            buf_init = (uint32_t)(((size_t)2 << 30) / ((size_t)n_bufs * estride));
+            if (buf_init < 64) buf_init = 64;
+        }
+    }
+    for (size_t i = 0; i < n_bufs; i++) {
+        radix_bufs[i].data = (char*)scratch_alloc(&radix_bufs[i]._hdr,
+            (size_t)buf_init * estride);
+        /* Fail fast on OOM: a buffer with NULL data must never be handed to
+         * phase 1 while advertising cap == buf_init. The buffers allocated
+         * so far stay reachable through out->_radix_bufs for cleanup. */
+        if (!radix_bufs[i].data) return false;
+        radix_bufs[i].count = 0;
+        radix_bufs[i].cap = buf_init;
+    }
+
+    /* Bit k is set when key column k (or the parent it is sliced from)
+     * carries the HAS_NULLS attribute. */
+    uint8_t p1_nullable = 0;
+    for (uint8_t k = 0; k < n_keys; k++) {
+        if (!key_vecs[k]) continue;
+        ray_t* src = (key_vecs[k]->attrs & RAY_ATTR_SLICE)
+                     ? key_vecs[k]->slice_parent : key_vecs[k];
+        if (src && (src->attrs & RAY_ATTR_HAS_NULLS))
+            p1_nullable |= (uint8_t)(1u << k);
+    }
+
+    radix_phase1_ctx_t p1ctx = {
+        .key_data      = key_data,
+        .key_types     = key_types,
+        .key_attrs     = key_attrs,
+        .key_vecs      = key_vecs,
+        .nullable_mask = p1_nullable,
+        .agg_vecs      = agg_vecs,
+        .n_workers     = n_total,
+        .bufs          = radix_bufs,
+        .layout        = *ly,
+        .match_idx     = NULL,
+    };
+    ray_pool_dispatch(pool, radix_phase1_fn, &p1ctx, n_scan);
+    if (ray_interrupted()) return true; /* caller checks ray_interrupted() */
+    /* Sync point — phase1 drained all rows, so rows_done == n_scan. */
+    ray_progress_update(NULL, "hash-partition", (uint64_t)n_scan, (uint64_t)n_scan);
+
+    for (size_t i = 0; i < n_bufs; i++)
+        if (radix_bufs[i].oom) return false;
+
+    group_ht_t* part_hts = (group_ht_t*)scratch_calloc(&out->_part_hts_hdr,
+        RADIX_P * sizeof(group_ht_t));
+    if (!part_hts) return false;
+
+    radix_phase2_ctx_t p2ctx = {
+        .key_types = key_types,
+        .n_keys    = n_keys,
+        .n_workers = n_total,
+        .bufs      = radix_bufs,
+        .part_hts  = part_hts,
+        .layout    = *ly,
+        .key_data  = key_data,
+    };
+    ray_pool_dispatch_n(pool, radix_phase2_fn, &p2ctx, RADIX_P);
+    /* Publish the tables before any early return so pivot_ingest_free can
+     * release them. */
+    out->part_hts = part_hts;
+    out->n_parts = RADIX_P;
+    if (ray_interrupted()) return true;
+    /* Sync point — partitions materialized; show RADIX_P/RADIX_P. */
+    ray_progress_update(NULL, "per-partition aggregate", RADIX_P, RADIX_P);
+
+    /* OOM detection for the parallel path. Two distinct failure modes
+     * must be caught here so callers never see a silently-truncated
+     * result:
+     *   (a) phase2 init failed — radix_phase2_fn `continue`s when
+     *       group_ht_init_sized returns false, leaving the partition
+     *       HT with NULL rows despite a non-zero buffer count. Every
+     *       entry routed into that partition would be dropped.
+     *   (b) grow-path OOM — group_probe_entry sets part_hts[p].oom
+     *       on scratch_realloc failure and returns without inserting
+     *       the key, silently truncating later groups. */
+    for (uint32_t p = 0; p < RADIX_P; p++) {
+        if (part_hts[p].oom) return false;
+        if (part_hts[p].rows) continue;
+        uint32_t pcount = 0;
+        for (uint32_t w = 0; w < n_total; w++)
+            pcount += radix_bufs[(size_t)w * RADIX_P + p].count;
+        if (pcount) return false;
+    }
+
+    /* Exclusive prefix sum of per-partition group counts. */
+    out->part_offsets[0] = 0;
+    for (uint32_t p = 0; p < RADIX_P; p++)
+        out->part_offsets[p + 1] = out->part_offsets[p] + part_hts[p].grp_count;
+    out->total_grps = out->part_offsets[RADIX_P];
+    return true;
+}
+
+/* Release everything a pivot_ingest_t owns and reset it to all-zero.
+ * Passing NULL is a no-op. */
+void pivot_ingest_free(pivot_ingest_t* out) {
+    if (out == NULL) return;
+
+    /* Partition hash tables: free each table that owns storage, then the
+     * array that holds them. */
+    if (out->part_hts != NULL) {
+        uint32_t idx = 0;
+        while (idx < out->n_parts) {
+            group_ht_t* ht = &out->part_hts[idx];
+            idx++;
+            if (!ht->rows && !ht->slots) continue;
+            group_ht_free(ht);
+        }
+        scratch_free(out->_part_hts_hdr);
+    }
+
+    /* Radix buffers: one scratch block per buffer, plus the array itself. */
+    if (out->_radix_bufs != NULL) {
+        radix_buf_t* buf_arr = (radix_buf_t*)out->_radix_bufs;
+        size_t b = 0;
+        while (b < out->_n_bufs) {
+            scratch_free(buf_arr[b]._hdr);
+            b++;
+        }
+        scratch_free(out->_radix_bufs_hdr);
+    }
+
+    scratch_free(out->_offsets_hdr);
+    memset(out, 0, sizeof(*out));
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/hash.h b/crates/rayforce-sys/vendor/rayforce/src/ops/hash.h
new file mode 100644
index 0000000..814b7bd
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/hash.h
@@ -0,0 +1,252 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+/*
+ * hash.h — Fast wyhash-based hashing for Rayforce
+ *
+ * Based on wyhash final version 4.2 by Wang Yi <godspeed_china@yeah.net>
+ * Original: https://github.com/wangyi-fudan/wyhash
+ *
+ * This is free and unencumbered software released into the public domain
+ * under The Unlicense (https://unlicense.org).
+ * See the original repository for full license text.
+ */
+
+#ifndef RAY_HASH_H
+#define RAY_HASH_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+
+/* ---- Platform detection ------------------------------------------------- */
+
+#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
+  #define RAY_HASH_LIKELY(x)   __builtin_expect(!!(x), 1) /* hint: condition is expected to hold */
+  #define RAY_HASH_UNLIKELY(x) __builtin_expect(!!(x), 0) /* hint: condition is expected to fail */
+#else
+  #define RAY_HASH_LIKELY(x)   (x) /* no hint available: plain condition */
+  #define RAY_HASH_UNLIKELY(x) (x)
+#endif
+
+#if defined(_MSC_VER) && defined(_M_X64) /* MSVC x64: _umul128 is used by ray__wymum */
+  #include <intrin.h>
+  #pragma intrinsic(_umul128)
+#endif
+
+#ifndef RAY_HASH_LITTLE_ENDIAN /* may be pre-defined by the build to override detection */
+  #if defined(RAY_OS_WINDOWS) || defined(__LITTLE_ENDIAN__) || \
+      (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+    #define RAY_HASH_LITTLE_ENDIAN 1
+  #elif defined(__BIG_ENDIAN__) || \
+        (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    #define RAY_HASH_LITTLE_ENDIAN 0
+  #else /* byte order could not be determined: assume little-endian */
+    #define RAY_HASH_LITTLE_ENDIAN 1
+  #endif
+#endif
+
+/* ---- Internal primitives ------------------------------------------------ */
+
+/* Full 64x64 -> 128-bit unsigned multiply, performed in place: on return
+ * *A holds the low 64 bits of the product and *B the high 64 bits. */
+static inline void ray__wymum(uint64_t *A, uint64_t *B) {
+#if defined(__SIZEOF_INT128__)
+    /* Compiler provides a native 128-bit integer type. */
+    __uint128_t product = *A;
+    product *= *B;
+    *A = (uint64_t)product;
+    *B = (uint64_t)(product >> 64);
+#elif defined(_MSC_VER) && defined(_M_X64)
+    /* _umul128 returns the low half and stores the high half through B. */
+    *A = _umul128(*A, *B, B);
+#else
+    /* Portable fallback: schoolbook multiplication on 32-bit halves. */
+    const uint64_t a_hi = *A >> 32;
+    const uint64_t a_lo = (uint32_t)*A;
+    const uint64_t b_hi = *B >> 32;
+    const uint64_t b_lo = (uint32_t)*B;
+
+    const uint64_t hi_hi = a_hi * b_hi;
+    const uint64_t hi_lo = a_hi * b_lo;
+    const uint64_t lo_hi = b_hi * a_lo;
+    const uint64_t lo_lo = a_lo * b_lo;
+
+    /* Add the two cross terms into the low word, tracking each carry. */
+    const uint64_t partial = lo_lo + (hi_lo << 32);
+    uint64_t carry = partial < lo_lo;
+    const uint64_t low = partial + (lo_hi << 32);
+    carry += low < partial;
+
+    const uint64_t high = hi_hi + (hi_lo >> 32) + (lo_hi >> 32) + carry;
+    *A = low;
+    *B = high;
+#endif
+}
+
+/* Fold two 64-bit words into one: multiply them to 128 bits and XOR the
+ * low half of the product with the high half. */
+static inline uint64_t ray__wymix(uint64_t A, uint64_t B) {
+    uint64_t lo = A;
+    uint64_t hi = B;
+    ray__wymum(&lo, &hi);
+    return hi ^ lo;
+}
+
+/* ---- Byte readers (endian-aware) ---------------------------------------- */
+
+/* Read 8 bytes at p (no alignment requirement) as a little-endian 64-bit
+ * value. When RAY_HASH_LITTLE_ENDIAN is 0 the loaded word is byte-swapped. */
+static inline uint64_t ray__wyr8(const uint8_t *p) {
+    uint64_t word;
+    memcpy(&word, p, sizeof word);
+#if !RAY_HASH_LITTLE_ENDIAN
+  #if defined(__GNUC__) || defined(__clang__)
+    word = __builtin_bswap64(word);
+  #elif defined(_MSC_VER)
+    word = _byteswap_uint64(word);
+  #else
+    {
+        /* Manual byte reversal: peel bytes off the bottom, push them on top. */
+        uint64_t reversed = 0;
+        for (int byte = 0; byte < 8; byte++) {
+            reversed = (reversed << 8) | (word & 0xff);
+            word >>= 8;
+        }
+        word = reversed;
+    }
+  #endif
+#endif
+    return word;
+}
+
+/* Read 4 bytes at p (no alignment requirement) as a little-endian 32-bit
+ * value, zero-extended to 64 bits. When RAY_HASH_LITTLE_ENDIAN is 0 the
+ * loaded word is byte-swapped. */
+static inline uint64_t ray__wyr4(const uint8_t *p) {
+    uint32_t word;
+    memcpy(&word, p, sizeof word);
+#if !RAY_HASH_LITTLE_ENDIAN
+  #if defined(__GNUC__) || defined(__clang__)
+    word = __builtin_bswap32(word);
+  #elif defined(_MSC_VER)
+    word = _byteswap_ulong(word);
+  #else
+    {
+        /* Manual byte reversal: peel bytes off the bottom, push them on top. */
+        uint32_t reversed = 0;
+        for (int byte = 0; byte < 4; byte++) {
+            reversed = (reversed << 8) | (word & 0xff);
+            word >>= 8;
+        }
+        word = reversed;
+    }
+  #endif
+#endif
+    return word;
+}
+
+/* Pack a 1..3 byte input into one word: first byte in bits 16..23, middle
+ * byte (index k/2) in bits 8..15, last byte in bits 0..7. Requires k >= 1. */
+static inline uint64_t ray__wyr3(const uint8_t *p, size_t k) {
+    const uint64_t first  = p[0];
+    const uint64_t middle = p[k >> 1];
+    const uint64_t last   = p[k - 1];
+    return (first << 16) | (middle << 8) | last;
+}
+
+/* ---- Secret constants (from wyhash final4.2) ---------------------------- */
+
+static const uint64_t ray__wyp[4] = { /* mixing secrets read by ray_hash_bytes */
+    0x2d358dccaa6c78a5ULL, /* [0]: seed pre-mix and final mix */
+    0x8bb84b93962eacc9ULL, /* [1]: seed pre-mix, first bulk lane, 16-byte loop, final mix */
+    0x4b33a62ed433d4a3ULL, /* [2]: second bulk lane */
+    0x4d5a2da51de1aa47ULL, /* [3]: third bulk lane */
+};
+
+/* ---- Core: hash arbitrary bytes ----------------------------------------- */
+
+/*
+ * ray_hash_bytes -- hash a byte buffer of length `len`.
+ *
+ * This is the full wyhash final4.2 algorithm: ~3 cycles/8 bytes on
+ * modern x86-64. Seed is fixed at 0 for deterministic, repeatable hashing
+ * within a single process lifetime.
+ *
+ * Parameters:
+ *   data -- start of the buffer; it is not read when len == 0.
+ *   len  -- number of bytes to hash.
+ * Returns the 64-bit hash value.
+ *
+ * Input lengths are handled as follows:
+ *   len == 0   -- both mixing words are zero.
+ *   len 1..3   -- first, middle and last byte are packed by ray__wyr3.
+ *   len 4..16  -- four 4-byte reads (which may overlap) taken from the
+ *                 front and the back of the buffer.
+ *   len > 16   -- 48-byte strides over three lanes, then 16-byte strides,
+ *                 then the final 16 bytes of the buffer (which may overlap
+ *                 bytes already consumed).
+ */
+/* L2: Fixed seed=0 is acceptable for in-process dataframe operations;
+ * use a random seed if processing adversarial input (e.g., untrusted
+ * CSV with crafted hash collisions). */
+static inline uint64_t ray_hash_bytes(const void *data, size_t len) {
+    const uint8_t *p = (const uint8_t *)data;
+    uint64_t seed = 0;
+    seed ^= ray__wymix(seed ^ ray__wyp[0], ray__wyp[1]); /* pre-mix the seed with the first two secrets */
+
+    uint64_t a, b; /* the two words fed to the final mix */
+    if (RAY_HASH_LIKELY(len <= 16)) {
+        if (RAY_HASH_LIKELY(len >= 4)) {
+            /* (len >> 3) << 2 is 0 for len < 8, 4 for len 8..15, 8 for len 16 */
+            a = (ray__wyr4(p) << 32) | ray__wyr4(p + ((len >> 3) << 2));
+            b = (ray__wyr4(p + len - 4) << 32) | ray__wyr4(p + len - 4 - ((len >> 3) << 2));
+        } else if (RAY_HASH_LIKELY(len > 0)) {
+            a = ray__wyr3(p, len);
+            b = 0;
+        } else {
+            a = b = 0;
+        }
+    } else {
+        size_t i = len; /* bytes not yet consumed; p + i stays at the end of the buffer */
+        if (RAY_HASH_UNLIKELY(i >= 48)) {
+            uint64_t see1 = seed, see2 = seed; /* two extra lanes next to `seed` */
+            do {
+                seed = ray__wymix(ray__wyr8(p)      ^ ray__wyp[1], ray__wyr8(p + 8)  ^ seed);
+                see1 = ray__wymix(ray__wyr8(p + 16)  ^ ray__wyp[2], ray__wyr8(p + 24) ^ see1);
+                see2 = ray__wymix(ray__wyr8(p + 32)  ^ ray__wyp[3], ray__wyr8(p + 40) ^ see2);
+                p += 48;
+                i -= 48;
+            } while (RAY_HASH_LIKELY(i >= 48));
+            seed ^= see1 ^ see2; /* fold the three lanes back into one */
+        }
+        while (RAY_HASH_UNLIKELY(i > 16)) {
+            seed = ray__wymix(ray__wyr8(p) ^ ray__wyp[1], ray__wyr8(p + 8) ^ seed);
+            i -= 16;
+            p += 16;
+        }
+        a = ray__wyr8(p + i - 16); /* last 16 bytes of the buffer, whatever was consumed above */
+        b = ray__wyr8(p + i - 8);
+    }
+    a ^= ray__wyp[1];
+    b ^= seed;
+    ray__wymum(&a, &b);
+    return ray__wymix(a ^ ray__wyp[0] ^ len, b ^ ray__wyp[1]); /* the length takes part in the final mix */
+}
+
+/* ---- Convenience: hash a single int64 ----------------------------------- */
+
+/*
+ * ray_hash_i64 -- hash a 64-bit integer.
+ *
+ * Applies the wyhash64 two-round mix with the integer supplied as both
+ * inputs: one 128-bit multiply of the whitened value, followed by a second
+ * multiply-and-fold of the re-whitened halves.
+ */
+static inline uint64_t ray_hash_i64(int64_t val) {
+    const uint64_t k0 = 0x2d358dccaa6c78a5ULL;
+    const uint64_t k1 = 0x8bb84b93962eacc9ULL;
+    const uint64_t bits = (uint64_t)val;
+
+    uint64_t lo = bits ^ k0;
+    uint64_t hi = bits ^ k1;
+    ray__wymum(&lo, &hi);
+    return ray__wymix(lo ^ k0, hi ^ k1);
+}
+
+/* ---- Convenience: hash a double ----------------------------------------- */
+
+/*
+ * ray_hash_f64 -- hash a double through its 64-bit representation.
+ *
+ * Any value that compares equal to 0.0 (that is, both +0.0 and -0.0) is
+ * hashed as the all-zero bit pattern, so the two zeros share one hash.
+ *
+ * Note: NaNs are hashed by their raw bits, so distinct NaN payloads give
+ * distinct hashes; SQL NULL is handled separately at a higher level and
+ * never reaches this path.
+ */
+static inline uint64_t ray_hash_f64(double val) {
+    const uint64_t k0 = 0x2d358dccaa6c78a5ULL;
+    const uint64_t k1 = 0x8bb84b93962eacc9ULL;
+
+    /* Zeros keep bits == 0; everything else (NaN included) is copied raw. */
+    uint64_t bits = 0;
+    if (val != 0.0) memcpy(&bits, &val, sizeof(bits));
+
+    uint64_t lo = bits ^ k0;
+    uint64_t hi = bits ^ k1;
+    ray__wymum(&lo, &hi);
+    return ray__wymix(lo ^ k0, hi ^ k1);
+}
+
+/* ---- Combine two hashes ------------------------------------------------- */
+
+/*
+ * ray_hash_combine -- merge two hash values into a single hash.
+ *
+ * Runs the wyhash64 two-input mixer with h1 and h2 as its inputs. The two
+ * inputs are whitened with different constants, so the result depends on
+ * argument order: combine(a,b) != combine(b,a). That is intentional for
+ * multi-column keys, where column order matters.
+ */
+static inline uint64_t ray_hash_combine(uint64_t h1, uint64_t h2) {
+    const uint64_t k0 = 0x2d358dccaa6c78a5ULL;
+    const uint64_t k1 = 0x8bb84b93962eacc9ULL;
+
+    uint64_t lo = h1 ^ k0;
+    uint64_t hi = h2 ^ k1;
+    ray__wymum(&lo, &hi);
+    return ray__wymix(lo ^ k0, hi ^ k1);
+}
+
+#endif /* RAY_HASH_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/idxop.c b/crates/rayforce-sys/vendor/rayforce/src/ops/idxop.c
new file mode 100644
index 0000000..b3817a6
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/idxop.c
@@ -0,0 +1,734 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "idxop.h"
+#include "mem/heap.h"
+#include "mem/cow.h"
+#include "vec/vec.h"
+#include "table/table.h"
+#include "table/sym.h"
+#include "lang/eval.h"
+#include "ops/ops.h"
+#include <math.h>
+#include <string.h>
+
+/* Byte width of one element for the numeric vector types; 0 means "not numeric". */
+static int numeric_elem_size(int8_t t) {
+    if (t == RAY_BOOL || t == RAY_U8)
+        return 1;
+    if (t == RAY_I16) return 2;
+    if (t == RAY_I32 || t == RAY_DATE || t == RAY_F32)
+        return 4;
+    if (t == RAY_I64 || t == RAY_TIME || t == RAY_TIMESTAMP || t == RAY_F64)
+        return 8;
+    return 0;   /* every other type tag is unsupported */
+}
+
+/* Fetch row i of a numeric vector as the 64-bit word that feeds the hash.
+ * Floats are widened to double with -0.0 folded into +0.0; a NaN yields a
+ * word derived from the row number rather than from its bits, so NaN rows
+ * are keyed per row. */
+static uint64_t numeric_key_word(const uint8_t* base, int8_t type, int64_t i) {
+    const int width = numeric_elem_size(type);
+    if (type == RAY_F32 || type == RAY_F64) {
+        double d;
+        if (width == 4) { float f; memcpy(&f, base + i*4, 4); d = (double)f; }
+        else            {          memcpy(&d, base + i*8, 8);                }
+        if (d != d)                      /* only NaN differs from itself */
+            return (uint64_t)i * 0x9E3779B97F4A7C15ULL;
+        if (d == 0.0) d = 0.0;           /* -0.0 == 0.0 holds: overwrite with +0.0 */
+        uint64_t word;
+        memcpy(&word, &d, 8);
+        return word;
+    }
+    /* Integer-like types: 1-byte elements are read unsigned, wider ones are
+     * sign-extended; an unsupported width leaves the word at 0. */
+    int64_t k = 0;
+    if      (width == 1) { k = (int64_t)base[i]; }
+    else if (width == 2) { int16_t s; memcpy(&s, base + i*2, 2); k = (int64_t)s; }
+    else if (width == 4) { int32_t s; memcpy(&s, base + i*4, 4); k = (int64_t)s; }
+    else if (width == 8) {            memcpy(&k, base + i*8, 8);                 }
+    return (uint64_t)k;
+}
+
+/* 64-bit avalanche mix: the SplitMix64 finaliser (Stafford's "Mix13" constants). */
+static inline uint64_t mix64(uint64_t x) {
+    const uint64_t m1 = 0xbf58476d1ce4e5b9ULL;
+    const uint64_t m2 = 0x94d049bb133111ebULL;
+    uint64_t h = x;
+    h = (h ^ (h >> 30)) * m1;   /* xor-shift, then multiply (mod 2^64) */
+    h = (h ^ (h >> 27)) * m2;
+    return h ^ (h >> 31);       /* final xor-shift; note mix64(0) == 0 */
+}
+
+/* Smallest power of two >= n (min 1); saturates at 2^63, since doubling past it wraps to 0 and never ends. */
+static uint64_t next_pow2(uint64_t n) {
+    if (n > (1ULL << 63)) return 1ULL << 63;   /* no 64-bit power of two is >= n */
+    uint64_t p = 1;
+    while (p < n) p <<= 1;                     /* n <= 1 falls straight through with p == 1 */
+    return p;
+}
+
+/* --------------------------------------------------------------------------
+ * Index ray_t allocation / destruction helpers
+ *
+ * The block layout: 32-byte ray_t header + ray_index_t payload in data[].
+ * type = RAY_INDEX, attrs = 0 (the index itself is never sliced or aliased),
+ * len = sizeof(ray_index_t) (so callers can sanity-check the payload size).
+ * -------------------------------------------------------------------------- */
+
+static ray_t* ray_index_alloc(ray_idx_kind_t kind, int8_t parent_type, int64_t parent_len) {
+    ray_t* blk = ray_alloc(sizeof(ray_index_t));
+    if (blk == NULL || RAY_IS_ERR(blk)) return blk;   /* NULL / error goes back untouched */
+    blk->type  = RAY_INDEX;
+    blk->attrs = 0;
+    blk->len   = (int64_t)sizeof(ray_index_t);        /* payload size, for sanity checks */
+    memset(blk->data, 0, sizeof(ray_index_t));        /* fields not set below read as zero */
+    ray_index_t* payload   = ray_index_payload(blk);
+    payload->built_for_len = parent_len;
+    payload->parent_type   = parent_type;
+    payload->kind          = (uint8_t)kind;
+    return blk;
+}
+
+/* Saved-nullmap pointer views: the low / high half of the 16-byte snapshot read as ray_t*. */
+static inline ray_t* saved_lo_ptr(ray_index_t* ix) {
+    ray_t* lo; memcpy(&lo, ix->saved_nullmap, sizeof(lo)); return lo;
+}
+static inline ray_t* saved_hi_ptr(ray_index_t* ix) {
+    ray_t* hi; memcpy(&hi, ix->saved_nullmap + 8, sizeof(hi)); return hi;
+}
+static inline void saved_lo_clear(ray_index_t* ix) {
+    memset(ix->saved_nullmap, 0, 8);       /* zero snapshot bytes 0..7 */
+}
+static inline void saved_hi_clear(ray_index_t* ix) {
+    memset(ix->saved_nullmap + 8, 0, 8);   /* zero snapshot bytes 8..15 */
+}
+
+/* --------------------------------------------------------------------------
+ * Saved-nullmap retain / release
+ *
+ * The saved 16 bytes hold pointers iff (parent_type, saved_attrs) say so:
+ *   - saved_attrs & NULLMAP_EXT  => low 8 bytes are an owning ray_t* (ext nullmap)
+ *                                   *except* RAY_STR uses the same slot for
+ *                                   str_ext_null (also an owning ref) — same
+ *                                   semantics, same ownership.
+ *   - parent_type == RAY_STR     => high 8 bytes are str_pool (owning ref)
+ *   - parent_type == RAY_SYM and saved_attrs & NULLMAP_EXT
+ *                                => high 8 bytes are sym_dict (owning ref)
+ *
+ * For all other type/attr combos the bytes are inline bitmap data, not
+ * pointers, and we leave them alone.
+ * -------------------------------------------------------------------------- */
+
+void ray_index_release_saved(ray_index_t* ix) {
+    /* Which snapshot halves hold owning ray_t* refs (rather than inline bitmap
+     * bytes) is decided by saved_attrs and parent_type alone. */
+    const bool has_ext  = (ix->saved_attrs & RAY_ATTR_NULLMAP_EXT) != 0;
+    const bool hi_owned = ix->parent_type == RAY_STR ||
+                          (ix->parent_type == RAY_SYM && has_ext);
+
+    if (has_ext) {                       /* low half: ext nullmap / str_ext_null */
+        ray_t* lo_ref = saved_lo_ptr(ix);
+        if (lo_ref != NULL && !RAY_IS_ERR(lo_ref))
+            ray_release(lo_ref);
+        saved_lo_clear(ix);              /* slot is zeroed even if nothing was released */
+    }
+    if (hi_owned) {                      /* high half: str_pool, or sym_dict with EXT */
+        ray_t* hi_ref = saved_hi_ptr(ix);
+        if (hi_ref != NULL && !RAY_IS_ERR(hi_ref))
+            ray_release(hi_ref);
+        saved_hi_clear(ix);
+    }
+}
+
+void ray_index_retain_saved(ray_index_t* ix) {
+    const bool has_ext  = (ix->saved_attrs & RAY_ATTR_NULLMAP_EXT) != 0;
+    const bool hi_owned = ix->parent_type == RAY_STR ||
+                          (ix->parent_type == RAY_SYM && has_ext);
+    /* One extra ref per pointer the snapshot owns; inline bitmap bytes are never read as pointers. */
+    if (has_ext) {
+        ray_t* lo_ref = saved_lo_ptr(ix);
+        if (lo_ref != NULL && !RAY_IS_ERR(lo_ref)) ray_retain(lo_ref);
+    }
+    if (hi_owned) {
+        ray_t* hi_ref = saved_hi_ptr(ix);
+        if (hi_ref != NULL && !RAY_IS_ERR(hi_ref)) ray_retain(hi_ref);
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Per-kind payload retain / release
+ * -------------------------------------------------------------------------- */
+
+void ray_index_release_payload(ray_index_t* ix) {
+    /* Drop the refs held by the kind-specific payload and null the slots.
+     * Zone indices and RAY_IDX_NONE own no child blocks, so nothing is
+     * done for them. */
+    const ray_idx_kind_t kind = (ray_idx_kind_t)ix->kind;
+
+    if (kind == RAY_IDX_HASH) {
+        ray_t* table = ix->u.hash.table;
+        ray_t* chain = ix->u.hash.chain;
+        if (table != NULL && !RAY_IS_ERR(table)) ray_release(table);
+        if (chain != NULL && !RAY_IS_ERR(chain)) ray_release(chain);
+        ix->u.hash.table = NULL;
+        ix->u.hash.chain = NULL;
+    } else if (kind == RAY_IDX_SORT) {
+        ray_t* perm = ix->u.sort.perm;
+        if (perm != NULL && !RAY_IS_ERR(perm)) ray_release(perm);
+        ix->u.sort.perm = NULL;
+    } else if (kind == RAY_IDX_BLOOM) {
+        ray_t* bits = ix->u.bloom.bits;
+        if (bits != NULL && !RAY_IS_ERR(bits)) ray_release(bits);
+        ix->u.bloom.bits = NULL;
+    }
+    /* Slots are nulled so a second call finds nothing left to release. */
+}
+
+void ray_index_retain_payload(ray_index_t* ix) {
+    /* Take one extra ref on each child block owned by the kind-specific
+     * payload.  Zone indices and RAY_IDX_NONE own no child blocks, so
+     * they are left alone. */
+    const ray_idx_kind_t kind = (ray_idx_kind_t)ix->kind;
+
+    if (kind == RAY_IDX_HASH) {
+        /* Hash payload: bucket-head table plus per-row chain vector. */
+        ray_t* table = ix->u.hash.table;
+        ray_t* chain = ix->u.hash.chain;
+        if (table != NULL && !RAY_IS_ERR(table)) ray_retain(table);
+        if (chain != NULL && !RAY_IS_ERR(chain)) ray_retain(chain);
+    } else if (kind == RAY_IDX_SORT) {
+        ray_t* perm = ix->u.sort.perm;
+        if (perm != NULL && !RAY_IS_ERR(perm)) ray_retain(perm);
+    } else if (kind == RAY_IDX_BLOOM) {
+        ray_t* bits = ix->u.bloom.bits;
+        if (bits != NULL && !RAY_IS_ERR(bits)) ray_retain(bits);
+    }
+    /* NULL and error-valued slots are skipped in every branch. */
+}
+
+/* --------------------------------------------------------------------------
+ * Zone scan -- compute min/max + null count
+ *
+ * Reads the parent vector before the nullmap is displaced.  Integer paths
+ * cover BOOL/U8/I16/I32/I64/DATE/TIME/TIMESTAMP (all stored in int slots);
+ * float paths cover F32/F64.  RAY_SYM/STR/GUID return RAY_ERR_NYI for now;
+ * those types will get string-aware min/max in the P4 zone work.
+ * -------------------------------------------------------------------------- */
+
+static ray_err_t zone_scan_int(ray_t* v, ray_index_t* ix, int elem_size) {
+    /* Single pass over v: count null rows and track min/max of the rest,
+     * reading each element at the given byte width (1 = unsigned byte). */
+    const uint8_t* bytes = (const uint8_t*)ray_data(v);
+    const int64_t rows = v->len;
+    int64_t lo = INT64_MAX, hi = INT64_MIN;   /* identity values for min / max */
+    int64_t null_rows = 0;
+    bool seen = false;
+
+    for (int64_t r = 0; r < rows; r++) {
+        if (ray_vec_is_null(v, r)) { null_rows++; continue; }
+        int64_t cur;
+        if      (elem_size == 1) { cur = (int64_t)bytes[r]; }
+        else if (elem_size == 2) { int16_t s; memcpy(&s, bytes + r*2, 2); cur = (int64_t)s; }
+        else if (elem_size == 4) { int32_t s; memcpy(&s, bytes + r*4, 4); cur = (int64_t)s; }
+        else if (elem_size == 8) {            memcpy(&cur, bytes + r*8, 8);                 }
+        else return RAY_ERR_TYPE;             /* width this scan cannot decode */
+        lo = cur < lo ? cur : lo;
+        hi = cur > hi ? cur : hi;
+        seen = true;
+    }
+    if (!seen) lo = hi = 0;                   /* empty or all-null: report 0 .. 0 */
+    ix->u.zone.min_i  = lo;
+    ix->u.zone.max_i  = hi;
+    ix->u.zone.n_nulls = null_rows;
+    return RAY_OK;
+}
+
+static ray_err_t zone_scan_float(ray_t* v, ray_index_t* ix, int elem_size) {
+    /* Single pass over v: count null rows and track min/max of the non-NaN
+     * values, widening 4-byte floats to double before comparing. */
+    const uint8_t* bytes = (const uint8_t*)ray_data(v);
+    const int64_t rows = v->len;
+    const bool narrow = (elem_size == 4);     /* any other width is read as an 8-byte double */
+    double lo = INFINITY, hi = -INFINITY;     /* identity values for min / max */
+    int64_t null_rows = 0;
+    bool seen = false;
+
+    for (int64_t r = 0; r < rows; r++) {
+        if (ray_vec_is_null(v, r)) { null_rows++; continue; }
+        double cur;
+        if (narrow) { float f; memcpy(&f, bytes + r*4, 4); cur = (double)f; }
+        else        {          memcpy(&cur, bytes + r*8, 8);                }
+        if (isnan(cur)) continue;             /* NaN: skipped, and not counted as null */
+        lo = cur < lo ? cur : lo;
+        hi = cur > hi ? cur : hi;
+        seen = true;
+    }
+    if (!seen) lo = hi = 0.0;                 /* no usable value: report 0.0 .. 0.0 */
+    ix->u.zone.min_f  = lo;
+    ix->u.zone.max_f  = hi;
+    ix->u.zone.n_nulls = null_rows;
+    return RAY_OK;
+}
+
+static ray_err_t zone_scan(ray_t* v, ray_index_t* ix) {
+    /* Route the scan by element type.  numeric_elem_size() accepts exactly
+     * the types handled here (BOOL/U8, I16, I32/DATE/F32, I64/TIME/
+     * TIMESTAMP/F64) and yields the byte width each scanner needs; 0 marks
+     * every other type, which is reported as not yet implemented. */
+    const int width = numeric_elem_size(v->type);
+    if (width == 0)
+        return RAY_ERR_NYI;
+
+    const bool is_float = (v->type == RAY_F32 || v->type == RAY_F64);
+    if (is_float)
+        return zone_scan_float(v, ix, width);   /* width is 4 or 8 */
+
+    return zone_scan_int(v, ix, width);         /* width is 1, 2, 4 or 8 */
+}
+
+/* --------------------------------------------------------------------------
+ * Attach
+ *
+ * The 16-byte snapshot must be taken AFTER the scan (so the scan reads the
+ * parent's normal nullmap) but BEFORE we overwrite parent->nullmap with the
+ * index pointer.  Ownership transfer: pointers in the snapshot (ext_nullmap,
+ * str_pool, sym_dict) move from parent to ix.  We do NOT retain them here —
+ * the existing refs simply move.  Symmetrically, when we install the index
+ * pointer in parent->nullmap, we transfer that single ref to the parent
+ * (no extra retain).
+ * -------------------------------------------------------------------------- */
+
+static ray_t* attach_finalize(ray_t* parent, ray_t* idx) {
+    /* Final attach step: stash the parent's nullmap state in the index, then
+     * install idx on the parent.  Always returns parent. */
+    ray_index_t* ix = ray_index_payload(idx);
+    /* Snapshot the parent's 16 raw bytes verbatim. */
+    memcpy(ix->saved_nullmap, parent->nullmap, 16);
+    ix->saved_attrs = parent->attrs & (RAY_ATTR_HAS_NULLS | RAY_ATTR_NULLMAP_EXT);
+
+    /* Install the index pointer — overwrites bytes 0-7 with the index ptr.
+     * Bytes 8-15 carry link_target when HAS_LINK is set; preserve them.
+     * Otherwise zero _idx_pad as a tidy default. */
+    parent->index    = idx;
+    if (!(parent->attrs & RAY_ATTR_HAS_LINK)) parent->_idx_pad = NULL;
+    parent->attrs   |= RAY_ATTR_HAS_INDEX;
+    /* Clear NULLMAP_EXT on the parent: vec->ext_nullmap is now the index
+     * pointer, not a U8 nullmap vec, so readers that gate on NULLMAP_EXT
+     * and dereference ext_nullmap would read garbage.  The displaced
+     * pointer lives on in ix->saved_nullmap[0..7].
+     *
+     * HAS_NULLS is deliberately left set on the parent so call sites that
+     * use it as a cheap "any null logic needed?" gate still answer
+     * correctly; the null bits themselves are read through the
+     * HAS_INDEX-aware helpers (ray_vec_is_null / ray_morsel_next). */
+    parent->attrs   &= (uint8_t)~RAY_ATTR_NULLMAP_EXT;
+    return parent;
+}
+
+/* Validate *vp, then make it attachable: drop any existing index and COW to a
+ * private copy.  Returns the (possibly new) parent and stores it in *vp.  Every
+ * failure, including a NULL from ray_cow, comes back as a ray_error value. */
+static ray_t* prepare_attach(ray_t** vp, const char* what) {
+    if (!vp || !*vp || RAY_IS_ERR(*vp))
+        return ray_error("type", "%s: null/error vector", what);
+    ray_t* v = *vp;
+    if (!ray_is_vec(v))
+        return ray_error("type", "%s: index can only attach to a vector", what);
+    if (v->attrs & RAY_ATTR_SLICE)
+        return ray_error("type", "%s: cannot index a slice; materialize first", what);
+    /* Checked before any mutation: a rejected call must not copy v or drop its index. */
+    if (numeric_elem_size(v->type) == 0)
+        return ray_error("nyi", "%s: only numeric vectors supported in v1 (got type %d)",
+                         what, (int)v->type);
+    if (v->attrs & RAY_ATTR_HAS_INDEX) {
+        ray_index_drop(&v);
+        if (!v || RAY_IS_ERR(v)) return v ? v : ray_error("oom", NULL);
+        *vp = v;
+    }
+    v = ray_cow(v);
+    if (!v || RAY_IS_ERR(v)) return v ? v : ray_error("oom", NULL);
+    *vp = v;
+    return v;
+}
+
+ray_t* ray_index_attach_zone(ray_t** vp) {
+    /* Build a zone (min / max / null-count) index for *vp and attach it.
+     * Returns the parent on success or a ray_error; *vp may be replaced. */
+    ray_t* v = prepare_attach(vp, "zone");
+    if (!v || RAY_IS_ERR(v)) return v ? v : ray_error("oom", NULL);
+    ray_t* idx = ray_index_alloc(RAY_IDX_ZONE, v->type, v->len);
+    /* A NULL block is reported as "oom", as in the hash/sort/bloom builders,
+     * so callers that only test RAY_IS_ERR cannot mistake it for success. */
+    if (!idx || RAY_IS_ERR(idx)) return idx ? idx : ray_error("oom", NULL);
+    ray_err_t err = zone_scan(v, ray_index_payload(idx));   /* scan before the nullmap is displaced */
+    if (err == RAY_OK) return attach_finalize(v, idx);
+    ray_release(idx);                                       /* scan failed: discard the index */
+    return ray_error(ray_err_code_str(err), "zone scan failed for type %d", (int)v->type);
+}
+
+/* --------------------------------------------------------------------------
+ * Hash index — separate chaining through row-id links (no probing)
+ *
+ * table[capacity]: each slot is rid+1 of the most recent row that hashed
+ *   into the bucket (0 = empty bucket).
+ * chain[parent->len]: each slot is rid+1 of the next-older row in the same
+ *   bucket's chain (0 = end of chain).
+ *
+ * Lookup `k`: rid = table[hash(k) & mask] - 1; while rid >= 0 compare
+ * parent->data[rid] == k, on miss step rid = chain[rid] - 1.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_index_attach_hash(ray_t** vp) {
+    ray_t* v = prepare_attach(vp, "hash");
+    if (RAY_IS_ERR(v)) return v;
+    /* Builds table + chain from the non-null rows, then attaches; returns the parent or an error. */
+    int64_t n = v->len;
+    /* Capacity: next power of two >= max(8, 2*n), i.e. in [2n, 4n) once n >= 4; `& mask` replaces modulo. */
+    uint64_t cap = next_pow2((uint64_t)(n < 4 ? 8 : 2 * n));
+    if (cap < 8) cap = 8;   /* defensive: the expression above already yields >= 8 */
+
+    ray_t* table = ray_vec_new(RAY_I64, (int64_t)cap);
+    if (!table || RAY_IS_ERR(table)) return table ? table : ray_error("oom", NULL);
+    table->len = (int64_t)cap;
+    memset(ray_data(table), 0, (size_t)cap * sizeof(int64_t));   /* 0 = empty bucket */
+
+    ray_t* chain = ray_vec_new(RAY_I64, n > 0 ? n : 1);   /* request at least 1 slot even for n == 0 */
+    if (!chain || RAY_IS_ERR(chain)) {
+        ray_release(table);
+        return chain ? chain : ray_error("oom", NULL);
+    }
+    chain->len = n;
+    if (n > 0) memset(ray_data(chain), 0, (size_t)n * sizeof(int64_t));   /* 0 = end of chain */
+
+    int64_t* tbl = (int64_t*)ray_data(table);
+    int64_t* chn = (int64_t*)ray_data(chain);
+    const uint8_t* base = (const uint8_t*)ray_data(v);
+    int64_t n_keys = 0;   /* counts non-null rows, duplicates included */
+    uint64_t mask = cap - 1;
+    /* Null rows are skipped: they never enter a bucket and keep chain[i] == 0. */
+    for (int64_t i = 0; i < n; i++) {
+        if (ray_vec_is_null(v, i)) continue;
+        uint64_t h = mix64(numeric_key_word(base, v->type, i));
+        uint64_t slot = h & mask;
+        chn[i] = tbl[slot];     /* link previous head into chain */
+        tbl[slot] = i + 1;      /* this row becomes new head */
+        n_keys++;
+    }
+
+    ray_t* idx = ray_index_alloc(RAY_IDX_HASH, v->type, n);
+    if (!idx || RAY_IS_ERR(idx)) {
+        ray_release(table);
+        ray_release(chain);
+        return idx ? idx : ray_error("oom", NULL);
+    }
+    ray_index_t* ix = ray_index_payload(idx);
+    ix->u.hash.table  = table;   /* the refs created above move into the index */
+    ix->u.hash.chain  = chain;
+    ix->u.hash.mask   = mask;
+    ix->u.hash.n_keys = n_keys;
+
+    return attach_finalize(v, idx);
+}
+
+/* --------------------------------------------------------------------------
+ * Sort index — ascending permutation of row ids
+ *
+ * Delegates to the existing parallel sort builder.  Result is an I64 vec of
+ * length parent->len with default null-handling (nulls last for asc).
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_index_attach_sort(ray_t** vp) {
+    ray_t* parent = prepare_attach(vp, "sort");
+    if (RAY_IS_ERR(parent)) return parent;
+
+    /* One sort key (the vector itself); the two NULL arguments keep the builder's defaults. */
+    ray_t* key_col = parent;
+    ray_t* order = ray_sort_indices(&key_col, NULL, NULL, 1, parent->len);
+    if (order == NULL) return ray_error("oom", NULL);
+    if (RAY_IS_ERR(order)) return order;
+
+    ray_t* idx = ray_index_alloc(RAY_IDX_SORT, parent->type, parent->len);
+    if (idx != NULL && !RAY_IS_ERR(idx)) {
+        ray_index_payload(idx)->u.sort.perm = order;   /* the index now holds the permutation */
+        return attach_finalize(parent, idx);
+    }
+    ray_release(order);                                /* allocation failed: drop our ref */
+    return idx ? idx : ray_error("oom", NULL);
+}
+
+/* --------------------------------------------------------------------------
+ * Bloom filter — m bits, k=3 hashes via double-hashing
+ *
+ * Layout: m is rounded to the next power of two >= max(64, 8*n_non_null).
+ * Each row sets bits at positions (h1 + i*h2) mod m for i in [0..k).
+ * h1, h2 are derived from a single 64-bit mix of the key word.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_index_attach_bloom(ray_t** vp) {
+    ray_t* v = prepare_attach(vp, "bloom");
+    if (RAY_IS_ERR(v)) return v;
+    /* Builds the bit array from the non-null rows, then attaches; returns the parent or an error. */
+    int64_t n = v->len;
+    /* Count non-null rows for sizing. */
+    int64_t n_set = 0;
+    for (int64_t i = 0; i < n; i++) {
+        if (!ray_vec_is_null(v, i)) n_set++;
+    }
+    uint64_t target_bits = (uint64_t)(n_set < 8 ? 64 : 8 * n_set);   /* 8 bits per row, floor of 64 */
+    uint64_t m = next_pow2(target_bits);
+    if (m < 64) m = 64;   /* defensive: target_bits is already >= 64 */
+    uint64_t mbytes = m / 8;
+    uint32_t k = 3;       /* number of bit positions set per row */
+
+    ray_t* bits = ray_vec_new(RAY_U8, (int64_t)mbytes);
+    if (!bits || RAY_IS_ERR(bits)) return bits ? bits : ray_error("oom", NULL);
+    bits->len = (int64_t)mbytes;
+    memset(ray_data(bits), 0, (size_t)mbytes);
+
+    uint8_t* bbuf = (uint8_t*)ray_data(bits);
+    uint64_t mask = m - 1;   /* m is a power of two, so `& mask` is `mod m` */
+    const uint8_t* base = (const uint8_t*)ray_data(v);
+    /* Null rows contribute no bits. */
+    for (int64_t i = 0; i < n; i++) {
+        if (ray_vec_is_null(v, i)) continue;
+        uint64_t h = mix64(numeric_key_word(base, v->type, i));
+        uint64_t h1 = h;
+        uint64_t h2 = mix64(h ^ 0xc6a4a7935bd1e995ULL) | 1ULL;  /* ensure odd */
+        for (uint32_t kk = 0; kk < k; kk++) {
+            uint64_t pos = (h1 + (uint64_t)kk * h2) & mask;   /* kk-th probe: h1 + kk*h2 (mod m) */
+            bbuf[pos >> 3] |= (uint8_t)(1u << (pos & 7));     /* byte pos/8, bit pos%8 */
+        }
+    }
+
+    ray_t* idx = ray_index_alloc(RAY_IDX_BLOOM, v->type, n);
+    if (!idx || RAY_IS_ERR(idx)) {
+        ray_release(bits);
+        return idx ? idx : ray_error("oom", NULL);
+    }
+    ray_index_t* ix = ray_index_payload(idx);
+    ix->u.bloom.bits   = bits;   /* the ref created above moves into the index */
+    ix->u.bloom.m_mask = mask;
+    ix->u.bloom.k      = k;
+    ix->u.bloom.n_keys = n_set;  /* non-null rows, duplicates included */
+
+    return attach_finalize(v, idx);
+}
+
+/* --------------------------------------------------------------------------
+ * Detach (drop)
+ *
+ * Restore the parent's 16-byte nullmap union from the saved snapshot, then
+ * release the index ray_t.  The release path of RAY_INDEX would otherwise
+ * also try to release the saved-nullmap pointers, so we clear the saved
+ * snapshot and saved_attrs first to neutralize that — ownership is moving
+ * back to the parent.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_index_drop(ray_t** vp) {
+    if (!vp) return NULL;                 /* no slot to read: never dereference vp */
+    ray_t* v = *vp;
+    if (!v || RAY_IS_ERR(v)) return v;    /* NULL / error passes through unchanged */
+    if (!(v->attrs & RAY_ATTR_HAS_INDEX)) return v;
+    /* Detach mutates the parent in place; require sole ownership. */
+    v = ray_cow(v);
+    if (!v || RAY_IS_ERR(v)) { *vp = v; return v; }
+    *vp = v;
+
+    /* After ray_cow, *vp may be a freshly copied block.  In ray_alloc_copy,
+     * the index pointer was retained by ray_retain_owned_refs (via the
+     * RAY_ATTR_HAS_INDEX branch we add in heap.c), so v->index here is
+     * still the live, owned index ray_t. */
+    ray_t* idx = v->index;
+    ray_index_t* ix = ray_index_payload(idx);
+
+    /* Shared-index case: another vec may share this RAY_INDEX block via
+     * ray_alloc_copy (rc>1).  Don't clobber the snapshot in that case —
+     * the other holder still reads it.  Copy our own retained refs to
+     * the saved-pointer slots so the bytes we move into v->nullmap are
+     * owned by v.  See vec_drop_index_inplace for the same pattern. */
+    uint8_t saved = ix->saved_attrs;
+    bool shared = ray_atomic_load(&idx->rc) > 1;
+    if (shared) {
+        ray_index_retain_saved(ix);
+    }
+    memcpy(v->nullmap, ix->saved_nullmap, 16);
+    if (!shared) {
+        memset(ix->saved_nullmap, 0, 16);
+        ix->saved_attrs = 0;
+    }
+
+    /* Restore parent attrs.  HAS_NULLS was preserved through the attachment
+     * so we don't need to OR it back in; only NULLMAP_EXT (which we cleared
+     * at attach time) needs to be reinstated from saved_attrs. */
+    v->attrs &= (uint8_t)~RAY_ATTR_HAS_INDEX;
+    if (saved & RAY_ATTR_NULLMAP_EXT) v->attrs |= RAY_ATTR_NULLMAP_EXT;
+
+    /* Release the index.  Per-kind children are released by the RAY_INDEX
+     * branch of ray_release_owned_refs (added in heap.c). */
+    ray_release(idx);
+    return v;
+}
+
+/* --------------------------------------------------------------------------
+ * Info
+ * -------------------------------------------------------------------------- */
+
+static const char* kind_name(ray_idx_kind_t k) {
+    /* Text reported under the "kind" key of the info dict; any value
+     * that is not one of the four index kinds reads as "none". */
+    if (k == RAY_IDX_HASH)  return "hash";
+    if (k == RAY_IDX_SORT)  return "sort";
+    if (k == RAY_IDX_ZONE)  return "zone";
+    if (k == RAY_IDX_BLOOM) return "bloom";
+    return "none";
+}
+
+static ray_t* dict_append_sym_i64(ray_t** keys, ray_t** vals, const char* k, int64_t n) {
+    int64_t key_id = ray_sym_intern(k, strlen(k));
+    ray_t* grown = *keys = ray_vec_append(*keys, &key_id);
+    if (RAY_IS_ERR(grown)) return grown;      /* key append failed: vals is left untouched */
+    ray_t* atom = ray_i64(n);
+    ray_t* list = *vals = ray_list_append(*vals, atom);
+    ray_release(atom);                        /* drop the local ref to the atom */
+    return list;
+}
+
+static ray_t* dict_append_sym_sym(ray_t** keys, ray_t** vals, const char* k, const char* s) {
+    int64_t key_id = ray_sym_intern(k, strlen(k));
+    ray_t* grown = *keys = ray_vec_append(*keys, &key_id);
+    if (RAY_IS_ERR(grown)) return grown;             /* key append failed: vals is left untouched */
+    int64_t sym_id = ray_sym_intern(s, strlen(s));   /* value symbol is interned only after the key is in */
+    ray_t* atom = ray_sym(sym_id);
+    ray_t* list = *vals = ray_list_append(*vals, atom);
+    ray_release(atom);                               /* drop the local ref to the atom */
+    return list;
+}
+
+ray_t* ray_index_info(ray_t* v) {
+    if (!ray_index_has(v)) return RAY_NULL_OBJ;
+    ray_index_t* ix = ray_index_payload(v->index);
+    /* Result: dict of symbol keys -> atoms; common fields first, then per-kind ones. */
+    ray_t* keys = ray_sym_vec_new(RAY_SYM_W64, 8);
+    if (RAY_IS_ERR(keys)) return keys;
+    ray_t* vals = ray_list_new(8);
+    if (RAY_IS_ERR(vals)) { ray_release(keys); return vals; }
+    ray_t* r = dict_append_sym_sym(&keys, &vals, "kind", kind_name((ray_idx_kind_t)ix->kind));
+    if (RAY_IS_ERR(r)) goto fail;
+    r = dict_append_sym_i64(&keys, &vals, "length", ix->built_for_len);
+    if (RAY_IS_ERR(r)) goto fail;
+    r = dict_append_sym_i64(&keys, &vals, "parent_type", (int64_t)ix->parent_type);
+    if (RAY_IS_ERR(r)) goto fail;
+    r = dict_append_sym_i64(&keys, &vals, "saved_attrs", (int64_t)ix->saved_attrs);
+    if (RAY_IS_ERR(r)) goto fail;
+
+    switch ((ray_idx_kind_t)ix->kind) {
+    case RAY_IDX_ZONE:
+        if (ix->parent_type == RAY_F32 || ix->parent_type == RAY_F64) {
+            const char*  names[2]  = { "min", "max" };
+            const double bounds[2] = { ix->u.zone.min_f, ix->u.zone.max_f };
+            for (int b = 0; b < 2; b++) {     /* same stop-at-first-error rule as the i64 helper */
+                int64_t kid = ray_sym_intern(names[b], 3);
+                r = keys = ray_vec_append(keys, &kid);
+                if (RAY_IS_ERR(r)) goto fail;
+                ray_t* fv = ray_f64(bounds[b]);
+                r = vals = ray_list_append(vals, fv); ray_release(fv);
+                if (RAY_IS_ERR(r)) goto fail;
+            }
+        } else {
+            r = dict_append_sym_i64(&keys, &vals, "min", ix->u.zone.min_i);
+            if (RAY_IS_ERR(r)) goto fail;
+            r = dict_append_sym_i64(&keys, &vals, "max", ix->u.zone.max_i);
+            if (RAY_IS_ERR(r)) goto fail;
+        }
+        r = dict_append_sym_i64(&keys, &vals, "n_nulls", ix->u.zone.n_nulls);
+        if (RAY_IS_ERR(r)) goto fail;
+        break;
+    case RAY_IDX_HASH:
+        r = dict_append_sym_i64(&keys, &vals, "capacity", (int64_t)(ix->u.hash.mask + 1));
+        if (RAY_IS_ERR(r)) goto fail;
+        r = dict_append_sym_i64(&keys, &vals, "n_keys",   ix->u.hash.n_keys);
+        if (RAY_IS_ERR(r)) goto fail;
+        break;
+    case RAY_IDX_SORT:
+        r = dict_append_sym_i64(&keys, &vals, "perm_len",
+                                ix->u.sort.perm ? ix->u.sort.perm->len : 0);
+        if (RAY_IS_ERR(r)) goto fail;
+        break;
+    case RAY_IDX_BLOOM:
+        r = dict_append_sym_i64(&keys, &vals, "m_bits", (int64_t)(ix->u.bloom.m_mask + 1));
+        if (RAY_IS_ERR(r)) goto fail;
+        r = dict_append_sym_i64(&keys, &vals, "k", (int64_t)ix->u.bloom.k);
+        if (RAY_IS_ERR(r)) goto fail;
+        r = dict_append_sym_i64(&keys, &vals, "n_keys", ix->u.bloom.n_keys);
+        if (RAY_IS_ERR(r)) goto fail;
+        break;
+    case RAY_IDX_NONE:
+        break;
+    }
+
+    return ray_dict_new(keys, vals);
+
+fail:
+    if (!RAY_IS_ERR(keys)) ray_release(keys);   /* whichever side is the error is not released */
+    if (!RAY_IS_ERR(vals)) ray_release(vals);
+    return r;
+}
+
+/* --------------------------------------------------------------------------
+ * Rayfall builtins (registered from src/lang/eval.c)
+ * -------------------------------------------------------------------------- */
+
+/* Common entry shape: the argument is borrowed, the result is an owning ref to
+ * the (possibly COW-copied) parent.  See heap.c:ray_release on rc transfer. */
+static ray_t* attach_via(ray_t* v, ray_t* (*fn)(ray_t**)) {
+    if (v == NULL || RAY_IS_ERR(v)) return v;
+    ray_t* held = v;
+    ray_retain(held);                       /* the ref this builtin hands back on success */
+    ray_t* status = fn(&held);              /* fn may replace held with a different block */
+    if (!RAY_IS_ERR(status)) return held;
+    ray_release(held); return status;       /* failure: drop our ref, surface the error */
+}
+
+ray_t* ray_idx_zone_fn (ray_t* v) { return attach_via(v, ray_index_attach_zone);  } /* builtin: attach a zone index */
+ray_t* ray_idx_hash_fn (ray_t* v) { return attach_via(v, ray_index_attach_hash);  } /* builtin: attach a hash index */
+ray_t* ray_idx_sort_fn (ray_t* v) { return attach_via(v, ray_index_attach_sort);  } /* builtin: attach a sort index */
+ray_t* ray_idx_bloom_fn(ray_t* v) { return attach_via(v, ray_index_attach_bloom); } /* builtin: attach a bloom index */
+
+ray_t* ray_idx_drop_fn(ray_t* v) {
+    /* Detach follows the same borrowed-in / owned-out protocol as the attach
+     * builtins, and ray_index_drop has the ray_t* (*)(ray_t**) shape that
+     * attach_via expects: NULL and error inputs pass straight through, the
+     * argument is retained before the call, and that ref is released again
+     * only when the call reports an error. */
+    return attach_via(v, ray_index_drop);
+}
+
+ray_t* ray_idx_has_fn(ray_t* v) {
+    return ray_index_has(v) ? ray_bool(1) : ray_bool(0);   /* boolean atom: does v carry an index? */
+}
+
+ray_t* ray_idx_info_fn(ray_t* v) {
+    return ray_index_info(v);   /* builtin shim: forwards unchanged; RAY_NULL_OBJ when v has no index */
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/idxop.h b/crates/rayforce-sys/vendor/rayforce/src/ops/idxop.h
new file mode 100644
index 0000000..5dcc4c3
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/idxop.h
@@ -0,0 +1,171 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_IDXOP_H
+#define RAY_IDXOP_H
+
+/*
+ * idxop.h -- Per-vector accelerator indices.
+ *
+ * A vector with RAY_ATTR_HAS_INDEX set carries a child ray_t of type
+ * RAY_INDEX in its nullmap[0..7] slot.  The index ray_t holds:
+ *   - the kind (hash / sort / zone / bloom)
+ *   - kind-specific payload (hash table + chain vecs, perm vec, min/max, bloom bits)
+ *   - a snapshot of the parent's original 16-byte nullmap union plus
+ *     the relevant attrs bits, so detach can restore the vector to its
+ *     pre-attach state byte-for-byte.
+ *
+ * Attach precondition: parent must not be a slice, must not already
+ * carry an index, must be COW'd to rc==1 by the caller's path.
+ *
+ * Mutation invalidates: any in-place write to the parent vector must
+ * call ray_index_drop() first — a stale index is a wrong-answer bug.
+ */
+
+#include <rayforce.h>
+#include "mem/heap.h"  /* RAY_ATTR_HAS_INDEX, RAY_ATTR_NULLMAP_EXT */
+
+/* Index kinds.  Stored (as a uint8_t) in ray_index_t.kind. */
+typedef enum {
+    RAY_IDX_NONE  = 0,   /* no index attached (what ray_index_kind reports then) */
+    RAY_IDX_HASH  = 1,   /* bucket table + chain vectors */
+    RAY_IDX_SORT  = 2,   /* permutation of row ids in sorted order */
+    RAY_IDX_ZONE  = 3,   /* min / max / null count summary */
+    RAY_IDX_BLOOM = 4,   /* bloom filter bit vector */
+} ray_idx_kind_t;
+
+/* The payload stored inside data[] of a RAY_INDEX ray_t (see ray_index_payload). */
+typedef struct {
+    uint8_t  kind;            /* ray_idx_kind_t */
+    uint8_t  saved_attrs;     /* parent attrs & (HAS_NULLS|NULLMAP_EXT) at attach */
+    int8_t   parent_type;     /* parent->type (for restore-time pointer interp) */
+    uint8_t  reserved;
+    int64_t  built_for_len;   /* parent->len at attach (mismatch -> stale) */
+
+    /* Raw 16-byte snapshot of parent->nullmap union at attach time.
+     * Restored verbatim on detach.  When this contains pointers
+     * (ext_nullmap, str_pool, sym_dict, str_ext_null) they are owned
+     * by THIS ray_t for the duration of the attachment; release-side
+     * of RAY_INDEX walks these based on (parent_type, saved_attrs). */
+    uint8_t  saved_nullmap[16];
+
+    /* Kind-specific payload.  All ray_t* fields are owning refs. */
+    union {
+        struct {                /* RAY_IDX_HASH */
+            /* Bucket chaining via row-id links (no probing).  table has mask+1
+             * entries, each the head rid+1 of its bucket (0 = empty bucket).
+             * chain has parent->len entries, each the next rid+1 in the same
+             * bucket's chain (0 = end of chain).  Lookup: hash key, read
+             * table[hash & mask] for head, walk chain until 0 comparing
+             * parent->data[rid] for equality. */
+            ray_t*   table;     /* RAY_I64 vec, capacity entries */
+            ray_t*   chain;     /* RAY_I64 vec, parent->len entries */
+            uint64_t mask;      /* capacity - 1 (capacity is power of two) */
+            int64_t  n_keys;    /* number of non-null rows indexed */
+        } hash;
+        struct {                /* RAY_IDX_SORT */
+            ray_t* perm;        /* RAY_I64 vec, perm[i] = row id at sorted pos i */
+        } sort;
+        struct {                /* RAY_IDX_ZONE */
+            int64_t min_i;      /* integer min (used when type is int/date/time) */
+            int64_t max_i;      /* integer max */
+            double  min_f;      /* float min (used when type is f32/f64) */
+            double  max_f;      /* float max */
+            int64_t n_nulls;    /* number of null rows (0 if no nulls) */
+        } zone;
+        struct {                /* RAY_IDX_BLOOM */
+            ray_t*   bits;      /* RAY_U8 vec, m/8 bytes */
+            uint64_t m_mask;    /* m - 1 (m is power of two, m bits total) */
+            uint32_t k;         /* number of hash functions */
+            uint32_t _pad;
+            int64_t  n_keys;    /* number of non-null rows added */
+        } bloom;
+    } u;
+} ray_index_t;
+
+/* Inline accessor: views idx->data as the ray_index_t payload.  idx is assumed to be a RAY_INDEX block; nothing is checked here. */
+static inline ray_index_t* ray_index_payload(ray_t* idx) {
+    return (ray_index_t*)idx->data;
+}
+
+/* ===== Attach / Detach ===== */
+
+/* Build an accelerator and attach.  Numeric types only for v1
+ * (BOOL/U8/I16/I32/I64/F32/F64/DATE/TIME/TIMESTAMP — RAY_STR/RAY_SYM/RAY_GUID
+ * deferred until the str_pool/sym_dict displacement sweep is complete).
+ * On success, *vp is the (possibly new) parent vector with HAS_INDEX set.
+ * On failure, *vp is unchanged and a RAY_ERROR is returned. */
+ray_t* ray_index_attach_zone (ray_t** vp);
+ray_t* ray_index_attach_hash (ray_t** vp);
+ray_t* ray_index_attach_sort (ray_t** vp);
+ray_t* ray_index_attach_bloom(ray_t** vp);
+
+/* Drop any attached index from *vp.  No-op if none.  Restores the
+ * pre-attach nullmap state byte-for-byte.  Returns *vp. */
+ray_t* ray_index_drop(ray_t** vp);
+
+/* ===== Introspection ===== */
+
+static inline bool ray_index_has(const ray_t* v) {
+    if (!v || RAY_IS_ERR((ray_t*)v)) return false;        /* nothing to inspect */
+    if (!(v->attrs & RAY_ATTR_HAS_INDEX)) return false;   /* flag not set */
+    return v->index != NULL;                              /* flag set and child present */
+}
+
+/* Kind of the index attached to v; RAY_IDX_NONE when v carries none. */
+static inline ray_idx_kind_t ray_index_kind(const ray_t* v) {
+    return ray_index_has(v) ? (ray_idx_kind_t)ray_index_payload(v->index)->kind
+                            : RAY_IDX_NONE;
+}
+
+/* Returns a fresh RAY_DICT with {kind, length, ...kind-specific...}
+ * or RAY_NULL_OBJ when no index is attached. */
+ray_t* ray_index_info(ray_t* v);
+
+/* ===== Internal helpers (used by retain/release/detach in heap.c
+ * and by mutation paths in vec.c) ===== */
+
+/* Release the saved-nullmap pointers carried by a RAY_INDEX ray_t.
+ * Invoked from ray_release_owned_refs when the index ray_t is freed. */
+void ray_index_release_saved(ray_index_t* ix);
+
+/* Retain the saved-nullmap pointers carried by a RAY_INDEX ray_t.
+ * Invoked from ray_retain_owned_refs after a copy of the index ray_t. */
+void ray_index_retain_saved(ray_index_t* ix);
+
+/* Release per-kind payload children (table/chain, perm, bits). */
+void ray_index_release_payload(ray_index_t* ix);
+
+/* Retain per-kind payload children. */
+void ray_index_retain_payload(ray_index_t* ix);
+
+/* ===== Rayfall builtin entry points (registered from src/lang/eval.c) ===== */
+
+ray_t* ray_idx_zone_fn (ray_t* v);  /* (.idx.zone  v) -> v with zone  attached */
+ray_t* ray_idx_hash_fn (ray_t* v);  /* (.idx.hash  v) -> v with hash  attached */
+ray_t* ray_idx_sort_fn (ray_t* v);  /* (.idx.sort  v) -> v with sort  attached */
+ray_t* ray_idx_bloom_fn(ray_t* v);  /* (.idx.bloom v) -> v with bloom attached */
+ray_t* ray_idx_drop_fn (ray_t* v);  /* (.idx.drop  v) -> v with index removed */
+ray_t* ray_idx_has_fn  (ray_t* v);  /* (.idx.has?  v) -> 0b/1b */
+ray_t* ray_idx_info_fn (ray_t* v);  /* (.idx.info  v) -> dict of metadata */
+
+#endif /* RAY_IDXOP_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/internal.h b/crates/rayforce-sys/vendor/rayforce/src/ops/internal.h
new file mode 100644
index 0000000..bc0dc1b
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/internal.h
@@ -0,0 +1,992 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+/**   Shared helpers for exec.c split — included by expr.c, filter.c, join.c, etc.
+ *   Small hot-path helpers are static inline; larger functions that remain in
+ *   exec.c are declared extern.
+ */
+
+#ifndef RAY_EXEC_INTERNAL_H
+#define RAY_EXEC_INTERNAL_H
+
+#if !defined(RAY_OS_WINDOWS) && !defined(_GNU_SOURCE)
+#define _GNU_SOURCE
+#endif
+
+#include "exec.h"
+#include "hash.h"
+#include "core/pool.h"
+#include "core/profile.h"
+#include "store/csr.h"
+#include "store/hnsw.h"
+#include "lftj.h"
+#include "mem/heap.h"
+#include "table/sym.h"
+#include "table/table.h"
+#include "vec/str.h"
+#include "vec/vec.h"
+#include <string.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <float.h>
+#include <ctype.h>
+
+/* ══════════════════════════════════════════
+ * Parted segment helpers
+ * ══════════════════════════════════════════ */
+
+/* attrs of the first non-NULL segment (used for the SYM width); 0 if there is none. */
+static inline uint8_t parted_first_attrs(ray_t** segs, int64_t n_segs) {
+    int64_t s = 0;
+    while (s < n_segs && !segs[s]) s++;
+    return (s < n_segs) ? segs[s]->attrs : 0;
+}
+
+/* True when seg exists and, for a RAY_SYM base, its element width equals
+ * expected_esz.  Any other base passes as soon as seg is non-NULL. */
+static inline bool parted_seg_esz_ok(ray_t* seg, int8_t base, uint8_t expected_esz) {
+    if (seg == NULL) return false;
+    bool width_matters = (base == RAY_SYM);
+    return !width_matters || ray_sym_elem_size(base, seg->attrs) == expected_esz;
+}
+
+/* ══════════════════════════════════════════
+ * Global profiler
+ * ══════════════════════════════════════════ */
+
+extern ray_profile_t g_ray_profile;
+
+/* ══════════════════════════════════════════
+ * Arena-based scratch allocation helpers
+ * ══════════════════════════════════════════ */
+
+/* Zero-filled scratch buffer of nbytes.  Returns the data pointer and stores
+ * the owning header in *hdr_out (for ray_free later); both are NULL on failure. */
+static inline void* scratch_calloc(ray_t** hdr_out, size_t nbytes) {
+    ray_t* block = ray_alloc(nbytes);
+    *hdr_out = block;
+    if (block == NULL) return NULL;
+    void* bytes = ray_data(block);
+    memset(bytes, 0, nbytes);
+    return bytes;
+}
+
+/* Uninitialized scratch buffer: header goes to *hdr_out, data pointer is returned (both NULL on failure). */
+static inline void* scratch_alloc(ray_t** hdr_out, size_t nbytes) {
+    ray_t* block = ray_alloc(nbytes);
+    *hdr_out = block;
+    if (block == NULL) return NULL;
+    return ray_data(block);
+}
+
+/* Resize by allocate / copy min(old,new) bytes / free.  On failure returns NULL and leaves *hdr_out alone. */
+static inline void* scratch_realloc(ray_t** hdr_out, size_t old_bytes, size_t new_bytes) {
+    ray_t* fresh = ray_alloc(new_bytes);
+    if (fresh == NULL) return NULL;
+    ray_t* prev = *hdr_out;
+    *hdr_out = fresh;
+    if (prev != NULL) {
+        size_t keep = (new_bytes > old_bytes) ? old_bytes : new_bytes;
+        memcpy(ray_data(fresh), ray_data(prev), keep);
+        ray_free(prev);
+    }
+    return ray_data(fresh);
+}
+
+/* Release a scratch buffer through its header; a NULL header is ignored. */
+static inline void scratch_free(ray_t* hdr) {
+    if (hdr != NULL)
+        ray_free(hdr);
+}
+
+/* ══════════════════════════════════════════
+ * Safe sym intern
+ * ══════════════════════════════════════════ */
+
+/* Intern a constant column name for graph-algorithm result tables.  A negative
+ * id from ray_sym_intern (not expected for short constants) is mapped to 0. */
+static inline int64_t sym_intern_safe(const char* s, size_t len) {
+    int64_t interned = ray_sym_intern(s, len);
+    if (interned < 0) return 0;
+    return interned;
+}
+
+/* ══════════════════════════════════════════
+ * Unified column read/write helpers
+ * ══════════════════════════════════════════ */
+
+static inline int64_t read_col_i64(const void* data, int64_t row,
+                                    int8_t type, uint8_t attrs) {
+    /* Load element `row` and widen it to int64_t according to the storage
+     * type.  Signed integer types sign-extend; SYM ids and the one-byte
+     * fallback are read as unsigned. */
+    if (type == RAY_I64 || type == RAY_TIMESTAMP)
+        return ((const int64_t*)data)[row];
+    if (type == RAY_I32 || type == RAY_DATE || type == RAY_TIME)
+        return (int64_t)((const int32_t*)data)[row];
+    if (type == RAY_I16)
+        return (int64_t)((const int16_t*)data)[row];
+    if (type != RAY_SYM)   /* RAY_BOOL, RAY_U8 and every remaining type */
+        return (int64_t)((const uint8_t*)data)[row];
+    /* RAY_SYM: the id width is selected by the width bits in attrs. */
+    int width = attrs & RAY_SYM_W_MASK;
+    if (width == RAY_SYM_W8)  return (int64_t)((const uint8_t*)data)[row];
+    if (width == RAY_SYM_W16) return (int64_t)((const uint16_t*)data)[row];
+    if (width == RAY_SYM_W32) return (int64_t)((const uint32_t*)data)[row];
+    return ((const int64_t*)data)[row];
+}
+
+static inline void write_col_i64(void* data, int64_t row, int64_t val,
+                                  int8_t type, uint8_t attrs) {
+    /* Store val into slot `row`, narrowing to the column's storage type. */
+    if (type == RAY_SYM) {
+        ray_write_sym(data, row, (uint64_t)val, type, attrs);
+    } else if (type == RAY_I64 || type == RAY_TIMESTAMP) {
+        ((int64_t*)data)[row] = val;
+    } else if (type == RAY_I32 || type == RAY_DATE || type == RAY_TIME) {
+        ((int32_t*)data)[row] = (int32_t)val;
+    } else if (type == RAY_I16) {
+        ((int16_t*)data)[row] = (int16_t)val;
+    } else {   /* RAY_BOOL, RAY_U8 and every remaining type: single byte */
+        ((uint8_t*)data)[row] = (uint8_t)val;
+    }
+}
+
+/* ══════════════════════════════════════════
+ * RAY_SYM-aware column helpers
+ * ══════════════════════════════════════════ */
+
+static inline uint8_t col_esz(const ray_t* col) {   /* element size of col as reported by ray_sym_elem_size */
+    return ray_sym_elem_size(col->type, col->attrs);
+}
+
+/* Width-dispatched key load for DA/sort hot loops.
+ * esz is computed once outside the loop, so the branch taken here is
+ * loop-invariant; this avoids going through the ray_read_sym type dispatch
+ * for every element.  1-, 2- and 4-byte elements are read as unsigned;
+ * any other esz is treated as a full int64_t. */
+static inline int64_t read_by_esz(const void* data, int64_t row, uint8_t esz) {
+    if (esz == 1) return (int64_t)((const uint8_t*)data)[row];
+    if (esz == 2) return (int64_t)((const uint16_t*)data)[row];
+    if (esz == 4) return (int64_t)((const uint32_t*)data)[row];
+    return ((const int64_t*)data)[row];
+}
+
+static inline ray_t* col_vec_new(const ray_t* src, int64_t cap) {
+    bool is_sym = (src->type == RAY_SYM);   /* SYM keeps the source's id width */
+    return is_sym ? ray_sym_vec_new(src->attrs & RAY_SYM_W_MASK, cap)
+                  : ray_vec_new(src->type, cap);
+}
+
+/* Share src's str_pool (resolved through slice_parent when src is a slice) with dst.
+ * The new pool is retained BEFORE dst's old one is released, so dst already holding that pool is safe. */
+static inline void col_propagate_str_pool(ray_t* dst, const ray_t* src) {
+    if (src->type != RAY_STR || dst->type != RAY_STR) return;
+    const ray_t* owner = (src->attrs & RAY_ATTR_SLICE) ? src->slice_parent : src;
+    if (!owner->str_pool) return;   /* nothing to share; dst keeps what it has */
+    ray_t* old_pool = dst->str_pool;
+    ray_retain(owner->str_pool);
+    dst->str_pool = owner->str_pool;
+    if (old_pool) ray_release(old_pool);
+}
+
+/* Parted variant: adopt the pool of the first RAY_STR segment that has one.
+ * Only correct when all segments share the same pool, because a
+ * memcpy-gathered result is valid for a single pool only.  Multi-pool
+ * inputs must go through the deep-copy gather path
+ * (parted_gather_str_col) instead. */
+static inline void col_propagate_str_pool_parted(ray_t* dst, ray_t** segs, int64_t n_segs) {
+    if (dst->type != RAY_STR) return;
+    int64_t s = 0;
+    while (s < n_segs && !(segs[s] && segs[s]->type == RAY_STR && segs[s]->str_pool))
+        s++;
+    if (s < n_segs)
+        col_propagate_str_pool(dst, segs[s]);
+}
+
+/* True when every STR segment that has a pool references the same pool (other segments are ignored). */
+static inline bool parted_str_single_pool(ray_t** segs, int64_t n_segs) {
+    ray_t* seen = NULL;
+    for (int64_t s = 0; s < n_segs; s++) {
+        ray_t* p = (segs[s] && segs[s]->type == RAY_STR) ? segs[s]->str_pool : NULL;
+        if (p && seen && p != seen) return false;
+        if (p) seen = p;
+    }
+    return true;
+}
+
+/* ---- Null bitmap propagation helpers ---- */
+
+/* Gather-side null propagation: output row r becomes null when indices[r]
+ * is negative (LEFT/OUTER join fill row) or when src is null at indices[r].
+ * The src bitmap is consulted only if src has RAY_ATTR_HAS_NULLS set. */
+static inline void col_propagate_nulls_gather(ray_t* dst, const ray_t* src,
+                                               const int64_t* indices,
+                                               int64_t count) {
+    const bool check_src = (src->attrs & RAY_ATTR_HAS_NULLS) != 0;
+    for (int64_t r = 0; r < count; r++) {
+        const int64_t from = indices[r];
+        bool is_null = (from < 0) || (check_src && ray_vec_is_null((ray_t*)src, from));
+        if (is_null) ray_vec_set_null(dst, r, true);
+    }
+}
+
+/* Contiguous null propagation (HEAD, TAIL, range inserts): every null in
+ * src[src_off .. src_off+count) is marked at the matching dst[dst_off + k]. */
+static inline void col_propagate_nulls_range(ray_t* dst, int64_t dst_off,
+                                              const ray_t* src, int64_t src_off,
+                                              int64_t count) {
+    if ((src->attrs & RAY_ATTR_HAS_NULLS) == 0) return;
+    for (int64_t k = 0; k < count; k++) {
+        if (!ray_vec_is_null((ray_t*)src, src_off + k)) continue;
+        ray_vec_set_null(dst, dst_off + k, true);
+    }
+}
+
+/* Null propagation through a filter mask (one byte per source row): rows
+ * with mask[i] != 0 are the ones kept, in order; a kept row that is null in
+ * src is marked null at its output position in dst. */
+static inline void col_propagate_nulls_filter(ray_t* dst, const ray_t* src,
+                                               const uint8_t* mask,
+                                               int64_t src_len) {
+    if ((src->attrs & RAY_ATTR_HAS_NULLS) == 0) return;
+    int64_t kept = 0;   /* selected rows seen so far == next output index */
+    for (int64_t i = 0; i < src_len; i++) {
+        if (!mask[i]) continue;
+        if (ray_vec_is_null((ray_t*)src, i))
+            ray_vec_set_null(dst, kept, true);
+        kept++;
+    }
+}
+
+/* Append element local_idx of seg to out.  A null element is appended as an
+ * empty string and then flagged null, unless the append itself failed. */
+static inline ray_t* parted_str_append_elem(ray_t* out, ray_t* seg,
+                                            int64_t local_idx,
+                                            const char* pool_base) {
+    bool is_null = (seg->attrs & RAY_ATTR_HAS_NULLS) && ray_vec_is_null(seg, local_idx);
+    if (!is_null) {
+        ray_str_t* e = (ray_str_t*)ray_data(seg) + local_idx;
+        return ray_str_vec_append(out, ray_str_t_ptr(e, pool_base), e->len);
+    }
+    out = ray_str_vec_append(out, "", 0);
+    if (!RAY_IS_ERR(out))
+        ray_vec_set_null(out, out->len - 1, true);
+    return out;
+}
+
+/* Deep-copy gather from parted RAY_STR segments by global row index.
+ * Each string is resolved against its own segment's pool and appended into
+ * the output vector's pool, so segments with different pools are safe.
+ * row_indices may be in any order; a negative row or one at/after the total
+ * row count is emitted as a null string instead of being read out of bounds. */
+static inline ray_t* parted_gather_str_rows(ray_t** segs, int64_t n_segs,
+                                            const int64_t* row_indices,
+                                            int64_t count) {
+    /* seg_ends[i] = exclusive global end row of segment i (prefix sum). */
+    int64_t stack_ends[64];
+    int64_t* seg_ends = stack_ends;
+    ray_t* ends_hdr = NULL;
+    if (n_segs > 64) {
+        seg_ends = (int64_t*)scratch_alloc(&ends_hdr, (size_t)n_segs * sizeof(int64_t));
+        if (!seg_ends) return ray_error("oom", NULL);
+    }
+    int64_t total = 0;
+    for (int64_t i = 0; i < n_segs; i++) {
+        total += segs[i] ? segs[i]->len : 0;
+        seg_ends[i] = total;
+    }
+    ray_t* out = ray_vec_new(RAY_STR, count);
+    int64_t seg = 0;   /* segment cursor; stays cheap for ascending indices */
+    for (int64_t i = 0; out && !RAY_IS_ERR(out) && i < count; i++) {
+        int64_t row = row_indices[i];
+        if (row < 0 || row >= total) {
+            /* Out of range (e.g. -1 fill rows): null instead of a wild read. */
+            out = ray_str_vec_append(out, "", 0);
+            if (out && !RAY_IS_ERR(out)) ray_vec_set_null(out, out->len - 1, true);
+            continue;
+        }
+        /* The cursor only walks forward, so rewind when indices go backwards. */
+        if (seg > 0 && row < seg_ends[seg - 1]) seg = 0;
+        /* row < total guarantees this stops on a non-empty, non-NULL segment. */
+        while (row >= seg_ends[seg]) seg++;
+        int64_t local = row - ((seg > 0) ? seg_ends[seg - 1] : 0);
+        const char* pool_base = segs[seg]->str_pool
+                              ? (const char*)ray_data(segs[seg]->str_pool) : NULL;
+        out = parted_str_append_elem(out, segs[seg], local, pool_base);
+    }
+    scratch_free(ends_hdr);
+    return out;
+}
+
+/* Deep-copy the first n rows of a parted RAY_STR column into a new vector,
+ * walking segments in order and skipping NULL ones. */
+static inline ray_t* parted_head_str(ray_t** segs, int64_t n_segs, int64_t n) {
+    ray_t* out = ray_vec_new(RAY_STR, n);
+    if (!out || RAY_IS_ERR(out)) return out;
+    int64_t left = n;   /* rows still to copy */
+    for (int64_t s = 0; left > 0 && s < n_segs; s++) {
+        ray_t* seg = segs[s];
+        if (seg == NULL) continue;
+        int64_t take = (seg->len < left) ? seg->len : left;
+        const char* pool_base = seg->str_pool ? (const char*)ray_data(seg->str_pool) : NULL;
+        for (int64_t i = 0; i < take; i++) {
+            out = parted_str_append_elem(out, seg, i, pool_base);
+            if (RAY_IS_ERR(out)) return out;
+        }
+        left -= take;
+    }
+    return out;
+}
+
+/* Deep-copy the last n rows of a parted RAY_STR column into a new vector.
+ * When n exceeds the total row count, n is clamped and every row is copied.
+ * Rows are addressed by global index; NULL segments contribute no rows. */
+static inline ray_t* parted_tail_str(ray_t** segs, int64_t n_segs, int64_t n) {
+    int64_t total = 0;
+    for (int64_t s = 0; s < n_segs; s++)
+        total += segs[s] ? segs[s]->len : 0;
+    int64_t first = total - n;   /* global index of the first row to keep */
+    if (first < 0) { first = 0; n = total; }
+
+    ray_t* out = ray_vec_new(RAY_STR, n);
+    if (!out || RAY_IS_ERR(out)) return out;
+    int64_t seg_base = 0;        /* global index of the current segment's row 0 */
+    for (int64_t s = 0; s < n_segs; s++) {
+        ray_t* seg = segs[s];
+        if (!seg) continue;
+        int64_t seg_end = seg_base + seg->len;
+        int64_t from = (first > seg_base) ? first - seg_base : 0;
+        seg_base = seg_end;
+        if (seg_end <= first) continue;   /* segment lies entirely before the tail */
+        const char* pool_base = seg->str_pool ? (const char*)ray_data(seg->str_pool) : NULL;
+        for (int64_t i = from; i < seg->len; i++) {
+            out = parted_str_append_elem(out, seg, i, pool_base);
+            if (RAY_IS_ERR(out)) return out;
+        }
+    }
+    return out;
+}
+
+/* Deep-copy every row of every non-NULL segment, in segment order, into a new RAY_STR vector of capacity total. */
+static inline ray_t* parted_flatten_str(ray_t** segs, int64_t n_segs, int64_t total) {
+    ray_t* out = ray_vec_new(RAY_STR, total);
+    if (!out || RAY_IS_ERR(out)) return out;
+    for (int64_t s = 0; s < n_segs; s++) {
+        ray_t* seg = segs[s];
+        if (seg == NULL || seg->len <= 0) continue;
+        const char* pool_base = seg->str_pool ? (const char*)ray_data(seg->str_pool) : NULL;
+        for (int64_t i = 0; i < seg->len; i++) {
+            out = parted_str_append_elem(out, seg, i, pool_base);
+            if (RAY_IS_ERR(out)) return out;
+        }
+    }
+    return out;
+}
+
+/* Create a vector from an explicit type + attrs pair (e.g. a parted base
+ * type); for RAY_SYM the id width is taken from attrs. */
+static inline ray_t* typed_vec_new(int8_t type, uint8_t attrs, int64_t cap) {
+    return (type != RAY_SYM) ? ray_vec_new(type, cap)
+                             : ray_sym_vec_new(attrs & RAY_SYM_W_MASK, cap);
+}
+
+/* ══════════════════════════════════════════
+ * Cancellation check
+ * ══════════════════════════════════════════ */
+
+static inline bool pool_cancelled(ray_pool_t* pool) {
+    if (RAY_UNLIKELY(ray_interrupted())) return true;   /* interrupt wins, pool or not */
+    if (pool == NULL) return false;
+    return RAY_UNLIKELY(atomic_load_explicit(&pool->cancelled, memory_order_relaxed));
+}
+
+#define CHECK_CANCEL(pool)                                \
+    do { if (pool_cancelled(pool))                        \
+             return ray_error("cancel", NULL); } while(0)
+
+#define CHECK_CANCEL_GOTO(pool, lbl)                      \
+    do { if (pool_cancelled(pool)) {                      \
+             result = ray_error("cancel", NULL);          \
+             goto lbl;                                    \
+         }                                                \
+    } while(0)
+
+/* ══════════════════════════════════════════
+ * Graph helper: find extended node
+ * ══════════════════════════════════════════ */
+
+static inline ray_op_ext_t* find_ext(ray_graph_t* g, uint32_t node_id) {
+    for (uint32_t i = 0; i < g->ext_count; i++) {   /* linear scan, NULL slots skipped */
+        ray_op_ext_t* ext = g->ext_nodes[i];
+        if (ext != NULL && ext->base.id == node_id) return ext;
+    }
+    return NULL;   /* node_id has no extended entry */
+}
+
+/* ══════════════════════════════════════════
+ * String helpers
+ * ══════════════════════════════════════════ */
+
+/* Convert a string-like value (-RAY_STR atom, RAY_STR vector used as a scalar, or SYM atom) to a ray_str_t plus pool pointer for comparison; any other input yields the empty string. */
+static inline void atom_to_str_t(ray_t* atom, ray_str_t* out, const char** out_pool) {
+    const char* sp;   /* bytes of the source string */
+    size_t sl;        /* its length */
+    if (atom->type == -RAY_STR) {
+        sp = ray_str_ptr(atom);
+        sl = ray_str_len(atom);
+    } else if (atom->type == RAY_STR) {
+        /* RAY_STR vector used as scalar: element 0 is taken; empty vector -> zeroed out */
+        if (atom->len < 1) {
+            memset(out, 0, sizeof(ray_str_t));
+            *out_pool = NULL;
+            return;
+        }
+        /* Resolve slice to parent data — slices have no data of their own,
+         * and str_pool shares the union with slice_offset. */
+        ray_t* src = atom;
+        int64_t idx = 0;
+        if (atom->attrs & RAY_ATTR_SLICE) {
+            src = atom->slice_parent;
+            idx = atom->slice_offset;
+        }
+        const ray_str_t* elems = (const ray_str_t*)ray_data(src);
+        *out = elems[idx];   /* element copied as-is; it stays relative to src's pool */
+        *out_pool = src->str_pool ? (const char*)ray_data(src->str_pool) : NULL;
+        return;
+    } else if (RAY_IS_SYM(atom->type) && ray_is_atom(atom)) {
+        /* SAFETY: ray_sym_str returns a borrowed pointer into the append-only
+         * sym table.  The pointer is valid for the lifetime of the sym table
+         * (i.e., the entire query execution).  If the sym table ever gains
+         * eviction, this must retain the returned atom. */
+        ray_t* s = ray_sym_str(atom->i64);
+        sp = s ? ray_str_ptr(s) : "";
+        sl = s ? ray_str_len(s) : 0;
+    } else {
+        sp = ""; sl = 0;   /* unsupported input: treated as the empty string */
+    }
+    memset(out, 0, sizeof(ray_str_t));
+    out->len = (uint32_t)sl;
+    if (sl <= RAY_STR_INLINE_MAX) {
+        if (sl > 0) memcpy(out->data, sp, sl);   /* short string: bytes stored in the element itself */
+        *out_pool = NULL;
+    } else {
+        memcpy(out->prefix, sp, 4);   /* long string: first 4 bytes kept as prefix */
+        out->pool_off = 0;
+        *out_pool = sp; /* point directly at atom's string data */
+    }
+}
+
+/* Resolve a RAY_STR vec through its slice parent (if any): *elems is v's first element, *pool the owner's pool bytes or NULL. */
+static inline void str_resolve(const ray_t* v, const ray_str_t** elems,
+                               const char** pool) {
+    const ray_t* owner = v;
+    int64_t first = 0;
+    if (v->attrs & RAY_ATTR_SLICE) { owner = v->slice_parent; first = v->slice_offset; }
+    *elems = (const ray_str_t*)ray_data((ray_t*)owner) + first;
+    *pool = owner->str_pool ? (const char*)ray_data(owner->str_pool) : NULL;
+}
+
+/* Resolve element i of a sym/enum column to its text via ray_sym_str; when
+ * that lookup returns NULL the result is "" with length 0. */
+static inline void sym_elem(const ray_t* input, int64_t i,
+                            const char** out_str, size_t* out_len) {
+    int64_t sym_id = ray_read_sym(ray_data((ray_t*)input), i, input->type, input->attrs);
+    ray_t* name = ray_sym_str(sym_id);
+    *out_str = name ? ray_str_ptr(name) : "";
+    *out_len = name ? ray_str_len(name) : 0;
+}
+
+/* ══════════════════════════════════════════
+ * Shared types — used by expr.c and exec.c
+ * ══════════════════════════════════════════ */
+
+typedef struct {
+    bool    enabled;
+    double  bias_f64;
+    int64_t bias_i64;
+} agg_affine_t;
+
+#define AGG_LINEAR_MAX_TERMS 8
+
+typedef struct {
+    bool    enabled;
+    uint8_t n_terms;
+    void*   term_ptrs[AGG_LINEAR_MAX_TERMS];
+    int8_t  term_types[AGG_LINEAR_MAX_TERMS];
+    int64_t coeff_i64[AGG_LINEAR_MAX_TERMS];
+    int64_t bias_i64;
+} agg_linear_t;
+
+typedef struct {
+    uint8_t n_terms;
+    int64_t syms[AGG_LINEAR_MAX_TERMS];
+    int64_t coeff_i64[AGG_LINEAR_MAX_TERMS];
+    int64_t bias_i64;
+} linear_expr_i64_t;
+
+/* ── Expression compiler types ── */
+
+#define EXPR_MAX_REGS 16
+#define EXPR_MAX_INS  48
+#define EXPR_MORSEL   RAY_MORSEL_ELEMS
+
+typedef struct {
+    uint8_t opcode;     /* OP_ADD, OP_NEG, OP_CAST, etc. */
+    uint8_t dst;        /* destination register */
+    uint8_t src1;       /* source 1 register */
+    uint8_t src2;       /* source 2 register (0xFF for unary) */
+} expr_ins_t;
+
+enum { REG_SCAN = 0, REG_CONST = 1, REG_SCRATCH = 2 };
+
+typedef struct {
+    uint8_t n_ins;
+    uint8_t n_regs;
+    uint8_t n_scratch;      /* scratch registers needed */
+    uint8_t out_reg;
+    int8_t  out_type;       /* RAY_F64, RAY_I64, or RAY_BOOL */
+    bool    has_parted;     /* true if any REG_SCAN refs a parted column */
+    struct {
+        uint8_t     kind;       /* REG_SCAN / REG_CONST / REG_SCRATCH */
+        int8_t      type;       /* computational type: RAY_F64 / RAY_I64 / RAY_BOOL */
+        int8_t      col_type;   /* original column type (REG_SCAN only) */
+        uint8_t     col_attrs;  /* column attrs — RAY_SYM width (REG_SCAN only) */
+        bool        is_parted;  /* true if this SCAN refs a parted column */
+        const void* data;       /* column data pointer (REG_SCAN only) */
+        ray_t*       parted_col; /* parted wrapper (is_parted only) */
+        double      const_f64;  /* scalar value (REG_CONST) */
+        int64_t     const_i64;  /* scalar value (REG_CONST) */
+    } regs[EXPR_MAX_REGS];
+    expr_ins_t ins[EXPR_MAX_INS];
+} ray_expr_t;
+
+/* ══════════════════════════════════════════
+ * Shared gather types — used by filter.c, exec.c (sort, join)
+ * ══════════════════════════════════════════ */
+
+#define MGATHER_MAX_COLS 16
+
+typedef struct {
+    const int64_t* idx;
+    char*          srcs[MGATHER_MAX_COLS];
+    char*          dsts[MGATHER_MAX_COLS];
+    uint8_t        esz[MGATHER_MAX_COLS];
+    int64_t        ncols;
+} multi_gather_ctx_t;
+
+typedef struct {
+    int64_t*     idx;
+    ray_t*        src_col;
+    ray_t*        dst_col;
+    uint8_t      esz;
+    bool         nullable;  /* true = idx may contain -1 (LEFT JOIN nulls) */
+} gather_ctx_t;
+
+/* ══════════════════════════════════════════
+ * Shared sort types and constants — used by sort_exec.c, exec.c (window)
+ * ══════════════════════════════════════════ */
+
+#define RADIX_SORT_THRESHOLD 4096  /* switch from comparison to radix sort (presumably a row count) */
+#define SMALL_POOL_THRESHOLD 8192  /* skip pool dispatch below this size */
+#define NEARLY_SORTED_FRAC   0.05  /* threshold for nearly-sorted detection (a fraction; see detect_sortedness) */
+#define MK_PRESCAN_MAX_KEYS  8     /* max sort keys for stack allocation */
+
+typedef struct {            /* comparison context taken by sort_cmp / sort_insertion / sort_merge_recursive */
+    ray_t**       vecs;     /* sort key columns; presumably n_sort entries */
+    uint8_t*     desc;      /* presumably per-key descending flags, parallel to vecs — confirm */
+    uint8_t*     nulls_first;  /* presumably per-key nulls-first flags, parallel to vecs — confirm */
+    uint8_t      n_sort;    /* number of sort keys */
+} sort_cmp_ctx_t;
+
+/* Radix pass context (shared across histogram + scatter phases) */
+typedef struct {
+    const uint64_t*  keys;       /* input keys for this pass (read-only) */
+    const int64_t*   idx;        /* input row indices paired with keys (read-only) */
+    uint64_t*        keys_out;   /* destination for scattered keys */
+    int64_t*         idx_out;    /* destination for scattered indices */
+    int64_t          n;          /* presumably the number of key/index pairs — confirm */
+    uint8_t          shift;      /* presumably the bit shift selecting this pass's digit — confirm */
+    uint32_t         n_tasks;    /* number of tasks; sizes hist and offsets */
+    uint32_t*        hist;       /* flat [n_tasks * 256] */
+    int64_t*         offsets;    /* flat [n_tasks * 256] */
+} radix_pass_ctx_t;
+
+/* Key-encoding context for parallel encode phase (fixed arrays hold up to 16 keys) */
+typedef struct {
+    uint64_t*       keys;      /* output */
+    int64_t*        indices;   /* if non-NULL, initialize indices[i]=i (fused iota) */
+    /* Single-key fields: */
+    const void*     data;      /* raw column data */
+    ray_t*          col;       /* source column (for null bitmap access) */
+    int8_t          type;      /* column type */
+    uint8_t         col_attrs; /* RAY_SYM width attrs */
+    bool            desc;      /* presumably true for descending order — confirm in sort_exec.c */
+    bool            nulls_first; /* for single-key F64: 1=nulls first */
+    /* SYM rank mapping (NULL if not sym): */
+    const uint32_t* enum_rank; /* intern_id → sort rank */
+    /* Composite-key fields (n_keys > 1): */
+    uint8_t         n_keys;    /* number of keys in the composite */
+    ray_t**          vecs;     /* key columns; presumably n_keys entries */
+    int64_t         mins[16];  /* presumably per-key minimum from the prescan — confirm */
+    int64_t         ranges[16]; /* presumably per-key value range from the prescan — confirm */
+    uint8_t         bit_shifts[16]; /* bit offset for key k in composite */
+    uint8_t         descs[16]; /* presumably per-key descending flags — confirm */
+    const uint32_t* enum_ranks[16]; /* per-key rank mappings */
+} radix_encode_ctx_t;
+
+/* Parallel multi-key min/max prescan context (presumably the `arg` of mk_prescan_fn) */
+typedef struct {
+    ray_t*     const* vecs;        /* key columns; the pointer array itself is read-only */
+    uint32_t* const* enum_ranks;   /* per-key rank tables; the pointer array itself is read-only */
+    uint8_t          n_keys;       /* number of keys */
+    int64_t          nrows;        /* number of rows scanned */
+    uint32_t         n_workers;    /* number of workers; presumably sizes pw_mins / pw_maxs */
+    int64_t*         pw_mins;      /* presumably per-worker, per-key minima — layout not visible here */
+    int64_t*         pw_maxs;      /* presumably per-worker, per-key maxima — layout not visible here */
+} mk_prescan_ctx_t;
+
+/* Parallel sort phase 1 context (presumably the `arg` of sort_phase1_fn) */
+typedef struct {
+    const sort_cmp_ctx_t* cmp_ctx;  /* comparison settings shared by all chunks */
+    int64_t*  indices;    /* row indices being sorted */
+    int64_t*  tmp;        /* scratch buffer; presumably the same length as indices */
+    int64_t   nrows;      /* total number of rows */
+    uint32_t  n_chunks;   /* number of chunks the rows are split into */
+} sort_phase1_ctx_t;
+
+/* Parallel merge pass context (presumably the `arg` of sort_merge_fn) */
+typedef struct {
+    const sort_cmp_ctx_t* cmp_ctx;  /* comparison settings */
+    const int64_t*  src;        /* input indices for this pass (read-only) */
+    int64_t*        dst;        /* output indices for this pass */
+    int64_t         nrows;      /* total number of rows */
+    int64_t         run_size;   /* presumably the length of each sorted input run — confirm */
+} sort_merge_ctx_t;
+
+/* Byte width of the radix-sort key for a column of the given type.
+ * Yields 1, 2, 4 or 8: the number of byte passes radix_sort_run needs. */
+static inline uint8_t radix_key_bytes(int8_t type) {
+    if (type == RAY_BOOL || type == RAY_U8)
+        return 1;
+    if (type == RAY_I16) return 2;
+    if (type == RAY_I32 || type == RAY_DATE || type == RAY_TIME)
+        return 4;
+    return 8;  /* everything else, e.g. I64, F64, TIMESTAMP, SYM */
+}
+
+/* ══════════════════════════════════════════
+ * Extern forward declarations — larger functions in exec.c
+ * ══════════════════════════════════════════ */
+
+/* ── exec.c (gather helpers) ── */
+void multi_gather_fn(void* raw, uint32_t wid, int64_t start, int64_t end);  /* raw: presumably a multi_gather_ctx_t* — confirm */
+void gather_fn(void* raw, uint32_t wid, int64_t start, int64_t end);        /* raw: presumably a gather_ctx_t* — confirm */
+void partitioned_gather(ray_pool_t* pool, const int64_t* idx, int64_t n,
+                        int64_t src_rows, char** srcs, char** dsts,
+                        const uint8_t* esz, int64_t ncols);
+
+/* ── filter.c ── */
+ray_t* exec_filter(ray_graph_t* g, ray_op_t* op, ray_t* input, ray_t* pred);
+ray_t* exec_filter_head(ray_t* input, ray_t* pred, int64_t limit);
+ray_t* sel_compact(ray_graph_t* g, ray_t* tbl, ray_t* sel);
+
+/* ── expr.c ── */
+bool try_affine_sumavg_input(ray_graph_t* g, ray_t* tbl, ray_op_t* input_op,
+                             ray_t** out_vec, agg_affine_t* out_affine);
+bool try_linear_sumavg_input_i64(ray_graph_t* g, ray_t* tbl, ray_op_t* input_op,
+                                 agg_linear_t* out_plan);
+bool expr_compile(ray_graph_t* g, ray_t* tbl, ray_op_t* root, ray_expr_t* out);  /* fills *out; meaning of the bool result is presumably "compiled OK" — confirm */
+ray_t* expr_eval_full(const ray_expr_t* expr, int64_t nrows);                    /* takes a ray_expr_t read-only */
+ray_t* exec_elementwise_unary(ray_graph_t* g, ray_op_t* op, ray_t* input);
+ray_t* exec_elementwise_binary(ray_graph_t* g, ray_op_t* op, ray_t* lhs, ray_t* rhs);
+
+/* ── sort_exec.c ── */
+int sort_cmp(const sort_cmp_ctx_t* ctx, int64_t a, int64_t b);  /* a, b: presumably row indices to compare — confirm */
+void sort_insertion(const sort_cmp_ctx_t* ctx, int64_t* arr, int64_t n);
+void sort_merge_recursive(const sort_cmp_ctx_t* ctx,
+                          int64_t* arr, int64_t* tmp, int64_t n);
+void sort_phase1_fn(void* arg, uint32_t worker_id, int64_t start, int64_t end);  /* arg: presumably sort_phase1_ctx_t* */
+void sort_merge_fn(void* arg, uint32_t worker_id, int64_t start, int64_t end);   /* arg: presumably sort_merge_ctx_t* */
+void key_introsort(uint64_t* keys, int64_t* idx, int64_t n);
+double detect_sortedness(ray_pool_t* pool, const uint64_t* keys, int64_t n);
+uint8_t compute_key_nbytes(ray_pool_t* pool, const uint64_t* keys,
+                            int64_t n, uint8_t type_max);
+int64_t* radix_sort_run(ray_pool_t* pool, uint64_t* keys, int64_t* indices,
+                         uint64_t* keys_tmp, int64_t* idx_tmp,
+                         int64_t n, uint8_t n_bytes,
+                         uint64_t** sorted_keys_out);  /* n_bytes: number of byte passes, see radix_key_bytes */
+uint64_t* packed_radix_sort_run(ray_pool_t* pool, uint64_t* data,
+                                 uint64_t* tmp, int64_t n, uint8_t n_bytes);
+int64_t* msd_radix_sort_run(ray_pool_t* pool, uint64_t* keys, int64_t* indices,
+                              uint64_t* keys_tmp, int64_t* idx_tmp,
+                              int64_t n, uint8_t n_bytes,
+                              uint64_t** sorted_keys_out);
+void radix_encode_fn(void* arg, uint32_t wid, int64_t start, int64_t end);  /* arg: presumably radix_encode_ctx_t* */
+void mk_prescan_fn(void* arg, uint32_t wid, int64_t start, int64_t end);    /* arg: presumably mk_prescan_ctx_t* */
+uint32_t* build_enum_rank(ray_t* col, int64_t nrows, ray_t** hdr_out);      /* hdr_out: presumably the allocation header to free — confirm */
+ray_t* exec_sort(ray_graph_t* g, ray_op_t* op, ray_t* tbl, int64_t limit);
+
+/* ── join.c: join executors, each taking a left and a right table ── */
+ray_t* exec_join(ray_graph_t* g, ray_op_t* op, ray_t* left_table, ray_t* right_table);
+ray_t* exec_antijoin(ray_graph_t* g, ray_op_t* op,
+                     ray_t* left_table, ray_t* right_table);
+ray_t* exec_window_join(ray_graph_t* g, ray_op_t* op,
+                        ray_t* left_table, ray_t* right_table);
+
+/* ── group.c ── */
+ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input);
+ray_t* exec_count_distinct(ray_graph_t* g, ray_op_t* op, ray_t* input);
+ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl, int64_t group_limit);
+
+/* Group HT types and helpers — shared with pivot (exec.c) */
+#define GHT_NEED_SUM   0x01   /* GHT_NEED_* are distinct bits; presumably OR-ed into need_flags */
+#define GHT_NEED_MIN   0x02
+#define GHT_NEED_MAX   0x04
+#define GHT_NEED_SUMSQ 0x08
+
+typedef struct {              /* hash-table row layout; produced by ght_compute_layout() */
+    uint16_t entry_stride;
+    uint16_t row_stride;
+    uint8_t  n_keys;          /* number of group keys */
+    uint8_t  n_aggs;          /* number of aggregates */
+    uint8_t  n_agg_vals;
+    uint8_t  need_flags;      /* presumably a mask of GHT_NEED_* bits — confirm in group.c */
+    uint8_t  agg_is_f64;      /* presumably a per-aggregate bitmask — confirm in group.c */
+    uint8_t  agg_is_first;    /* presumably a per-aggregate bitmask — confirm in group.c */
+    uint8_t  agg_is_last;     /* presumably a per-aggregate bitmask — confirm in group.c */
+    int8_t   agg_val_slot[8]; /* fixed capacity of 8 entries */
+    uint16_t off_sum;         /* off_*: presumably byte offsets of each accumulator area within a row */
+    uint16_t off_min;
+    uint16_t off_max;
+    uint16_t off_sumsq;
+    /* Wide-key support: bit k set iff key k does not fit in 8 bytes
+     * (e.g. RAY_GUID = 16 B).  For wide keys the 8-byte key slot
+     * stores a source-row index and the actual key bytes live in the
+     * original column, so probe/rehash/scatter must redirect through
+     * key_data[k].  wide_key_esz[k] is the per-element byte size of
+     * the source column. */
+    uint8_t  wide_key_mask;   /* one bit per key, so at most 8 keys can be tracked */
+    uint8_t  wide_key_esz[8];
+} ght_layout_t;
+
+typedef struct {              /* group-by hash table; set up by group_ht_init(), released by group_ht_free() */
+    uint32_t*    slots;       /* presumably HT_PACK-encoded slot words, HT_EMPTY when unused — confirm */
+    uint32_t     ht_cap;      /* presumably the number of entries in slots — confirm */
+    char*        rows;        /* row storage, addressed in bytes (see the ROW_RD_ / ROW_WR_ macros) */
+    uint32_t     grp_count;   /* presumably the number of groups stored so far — confirm */
+    uint32_t     grp_cap;     /* presumably the row capacity of rows — confirm */
+    ght_layout_t layout;      /* row layout, held by value */
+    /* Non-NULL only when layout.wide_key_mask != 0.  Pointers into
+     * the original key columns (slice-unaware raw data), used by
+     * group_probe_entry / group_ht_rehash to resolve row-indexed
+     * wide keys. */
+    void*        key_data[8];
+    ray_t*        _h_slots;   /* presumably the allocation header backing slots — confirm */
+    ray_t*        _h_rows;    /* presumably the allocation header backing rows — confirm */
+    uint8_t       oom;        /* set by group_probe_entry on grow failure */
+} group_ht_t;
+
+/* Slot-word helpers and row-level accessors for group HT rows */
+#define HT_SALT(h)  ((uint8_t)((h) >> 56))   /* bits 56..63 of h (the top byte when h is a 64-bit hash) */
+#define HT_EMPTY    UINT32_MAX               /* NOTE(review): equals HT_PACK(0xFF, 0xFFFFFF) — confirm that pair cannot occur */
+#define HT_PACK(salt, gid)  (((uint32_t)(uint8_t)(salt) << 24) | ((gid) & 0xFFFFFF))  /* salt in bits 24..31, gid in the low 24 bits */
+#define HT_GID(s)   ((s) & 0xFFFFFF)         /* low 24 bits of a slot word */
+#define HT_SALT_V(s) ((uint8_t)((s) >> 24))  /* bits 24..31 of a slot word */
+
+#define ROW_RD_F64(row, off, slot) (((const double*)((const void*)((row) + (off))))[(slot)])   /* read the slot-th double at row + off */
+#define ROW_RD_I64(row, off, slot) (((const int64_t*)((const void*)((row) + (off))))[(slot)])  /* read the slot-th int64_t at row + off */
+#define ROW_WR_F64(row, off, slot) (((double*)((void*)((row) + (off))))[(slot)])               /* writable lvalue: slot-th double at row + off */
+#define ROW_WR_I64(row, off, slot) (((int64_t*)((void*)((row) + (off))))[(slot)])              /* writable lvalue: slot-th int64_t at row + off */
+
+ght_layout_t ght_compute_layout(uint8_t n_keys, uint8_t n_aggs,
+                                ray_t** agg_vecs, uint8_t need_flags,
+                                const uint16_t* agg_ops,
+                                const int8_t* key_types);  /* returns the layout by value */
+bool group_ht_init(group_ht_t* ht, uint32_t cap, const ght_layout_t* ly);  /* presumably false on allocation failure — confirm */
+void group_ht_free(group_ht_t* ht);
+/* Hash-aggregate rows [start, end) into ht.
+ *
+ * When match_idx is non-NULL, the loop iterates `i` in [start, end)
+ * and reads `row = match_idx[i]` — start/end index the selection
+ * space (number of passing rows), not the source column length.
+ * When match_idx is NULL, `row = i` — iterating directly over source
+ * column rows (no selection). */
+void group_rows_range(group_ht_t* ht, void** key_data, int8_t* key_types,
+                      uint8_t* key_attrs, ray_t** key_vecs, ray_t** agg_vecs,
+                      int64_t start, int64_t end,
+                      const int64_t* match_idx);
+
+/* ══════════════════════════════════════════
+ * Pivot ingest — shared parallel hash-aggregate path.
+ *
+ * Runs the same radix pipeline exec_group uses (phases 1+2), leaving
+ * the result in a set of per-partition HTs with prefix offsets. Phase
+ * 3 is left to the caller so pivot can restructure the output. For
+ * small inputs or when no thread pool is available, falls back to a
+ * single sequential HT transparently — the caller iterates
+ * part_hts[0..n_parts) the same way either way.
+ * ══════════════════════════════════════════ */
+
+typedef struct {                /* result of pivot_ingest_run(); release with pivot_ingest_free() */
+    group_ht_t* part_hts;       /* n_parts entries */
+    uint32_t*   part_offsets;   /* n_parts+1 entries (prefix sums of grp_counts) */
+    uint32_t    n_parts;        /* 1 when sequential, RADIX_P when parallel */
+    uint32_t    total_grps;     /* presumably the sum of grp_count over all partitions — confirm */
+    uint16_t    row_stride;     /* presumably a copy of the layout's row_stride — confirm */
+
+    /* Internal cleanup state — do not touch from callers. */
+    ray_t*      _part_hts_hdr;  /* presumably the allocation header for part_hts */
+    ray_t*      _offsets_hdr;   /* presumably the allocation header for part_offsets */
+    void*       _radix_bufs;    /* radix_buf_t* — allocated only on parallel path */
+    ray_t*      _radix_bufs_hdr; /* presumably the allocation header for _radix_bufs */
+    size_t      _n_bufs;        /* presumably the number of entries in _radix_bufs */
+} pivot_ingest_t;
+
+/* Run parallel (or sequential-fallback) hash aggregation for pivot.
+ * Returns true on success, false on unrecoverable OOM. On true the
+ * caller must eventually call pivot_ingest_free(). Cancellation is
+ * propagated via ray_interrupted() — callers should check that too. */
+bool pivot_ingest_run(pivot_ingest_t* out,
+                      const ght_layout_t* ly,
+                      void** key_data, int8_t* key_types, uint8_t* key_attrs,
+                      ray_t** key_vecs, ray_t** agg_vecs,
+                      int64_t n_scan);  /* n_scan: presumably the number of input rows to aggregate — confirm */
+
+void pivot_ingest_free(pivot_ingest_t* out);  /* releases what pivot_ingest_run() allocated into *out */
+
+/* ── window.c ── */
+ray_t* exec_window(ray_graph_t* g, ray_op_t* op, ray_t* tbl);
+
+/* ── graph_exec.c: every executor below takes the graph and the op node; ownership of the returned ray_t* is not visible here ── */
+ray_t* exec_expand(ray_graph_t* g, ray_op_t* op, ray_t* src_vec);
+ray_t* exec_var_expand(ray_graph_t* g, ray_op_t* op, ray_t* start_vec);
+ray_t* exec_shortest_path(ray_graph_t* g, ray_op_t* op,
+                          ray_t* src_val, ray_t* dst_val);
+ray_t* exec_pagerank(ray_graph_t* g, ray_op_t* op);
+ray_t* exec_connected_comp(ray_graph_t* g, ray_op_t* op);
+ray_t* exec_dijkstra(ray_graph_t* g, ray_op_t* op,
+                     ray_t* src_val, ray_t* dst_val);
+ray_t* exec_wco_join(ray_graph_t* g, ray_op_t* op);
+ray_t* exec_louvain(ray_graph_t* g, ray_op_t* op);
+ray_t* exec_degree_cent(ray_graph_t* g, ray_op_t* op);
+ray_t* exec_topsort(ray_graph_t* g, ray_op_t* op);
+ray_t* exec_cluster_coeff(ray_graph_t* g, ray_op_t* op);
+ray_t* exec_betweenness(ray_graph_t* g, ray_op_t* op);
+ray_t* exec_closeness(ray_graph_t* g, ray_op_t* op);
+ray_t* exec_mst(ray_graph_t* g, ray_op_t* op);
+ray_t* exec_random_walk(ray_graph_t* g, ray_op_t* op, ray_t* src_val);
+ray_t* exec_dfs(ray_graph_t* g, ray_op_t* op, ray_t* src_val);
+ray_t* exec_astar(ray_graph_t* g, ray_op_t* op,
+                  ray_t* src_val, ray_t* dst_val);
+ray_t* exec_k_shortest(ray_graph_t* g, ray_op_t* op,
+                       ray_t* src_val, ray_t* dst_val);
+
+/* ── pivot_exec.c ── */
+ray_t* exec_if(ray_graph_t* g, ray_op_t* op);
+ray_t* exec_pivot(ray_graph_t* g, ray_op_t* op, ray_t* tbl);
+
+/* ── embedding_exec.c ── */
+ray_t* exec_cosine_sim(ray_graph_t* g, ray_op_t* op, ray_t* emb_vec);
+ray_t* exec_euclidean_dist(ray_graph_t* g, ray_op_t* op, ray_t* emb_vec);
+ray_t* exec_knn(ray_graph_t* g, ray_op_t* op, ray_t* emb_vec);
+ray_t* exec_hnsw_knn(ray_graph_t* g, ray_op_t* op);
+ray_t* exec_ann_rerank(ray_graph_t* g, ray_op_t* op, ray_t* src);
+ray_t* exec_knn_rerank(ray_graph_t* g, ray_op_t* op, ray_t* src);
+
+/* ── temporal_exec.c ── */
+ray_t* exec_extract(ray_graph_t* g, ray_op_t* op);
+ray_t* exec_date_trunc(ray_graph_t* g, ray_op_t* op);
+
+/* ── string_exec.c ── */
+ray_t* exec_like(ray_graph_t* g, ray_op_t* op);
+ray_t* exec_ilike(ray_graph_t* g, ray_op_t* op);
+ray_t* exec_string_unary(ray_graph_t* g, ray_op_t* op);
+ray_t* exec_strlen(ray_graph_t* g, ray_op_t* op);
+ray_t* exec_substr(ray_graph_t* g, ray_op_t* op);
+ray_t* exec_replace(ray_graph_t* g, ray_op_t* op);
+ray_t* exec_concat(ray_graph_t* g, ray_op_t* op);
+
+/* ── exec.c ── */
+ray_t* materialize_mapcommon(ray_t* mc);
+ray_t* materialize_mapcommon_head(ray_t* mc, int64_t n);
+ray_t* materialize_mapcommon_filter(ray_t* mc, ray_t* pred, int64_t pass_count);
+ray_t* broadcast_scalar(ray_t* atom, int64_t nrows);
+ray_t* exec_node(ray_graph_t* g, ray_op_t* op);  /* presumably the per-node dispatcher for the executors above — confirm in exec.c */
+
+/* ══════════════════════════════════════════
+ * Thread-safe null bitmap helpers (parallel group/window)
+ * ══════════════════════════════════════════ */
+
+/* Mark row idx null with a relaxed atomic OR on its bitmap byte. Without an ext nullmap,
+ * idx >= 128 goes through ray_vec_set_null (lazy alloc); safe because OOM forces sequential path. */
+static inline void par_set_null(ray_t* vec, int64_t idx) {
+    const bool has_ext = (vec->attrs & RAY_ATTR_NULLMAP_EXT) != 0;
+    if (!has_ext && idx >= 128) {
+        /* No external bitmap yet and the row lies past the inline one. */
+        ray_vec_set_null(vec, idx, true);
+        return;
+    }
+
+    /* Byte and bit position of the row inside whichever bitmap applies. */
+    const int byte_pos = (int)(idx / 8);
+    const uint8_t bit_mask = (uint8_t)(1u << (int)(idx % 8));
+    if (has_ext) {
+        uint8_t* ext_bits = (uint8_t*)ray_data(vec->ext_nullmap);
+        __atomic_fetch_or(&ext_bits[byte_pos], bit_mask, __ATOMIC_RELAXED);
+        return;
+    }
+    /* Inline bitmap embedded in the vector itself. */
+    __atomic_fetch_or(&vec->nullmap[byte_pos], bit_mask, __ATOMIC_RELAXED);
+}
+
+/* Pre-allocate external nullmap (set, then clear, bit 0) so parallel threads can set bits safely; a failed clear is reported too. */
+static inline ray_err_t par_prepare_nullmap(ray_t* vec) {
+    if (vec->len <= 128) return RAY_OK;  /* par_set_null handles idx < 128 without an ext nullmap */
+    ray_err_t err = ray_vec_set_null_checked(vec, 0, true);
+    if (err == RAY_OK) err = ray_vec_set_null_checked(vec, 0, false);  /* undo the probe bit; do not drop this result */
+    if (err != RAY_OK) return err;       /* on failure the attrs are left untouched */
+    vec->attrs &= (uint8_t)~RAY_ATTR_HAS_NULLS;
+    return RAY_OK;
+}
+
+/* After parallel execution: raise RAY_ATTR_HAS_NULLS when any scanned nullmap byte is non-zero. */
+static inline void par_finalize_nulls(ray_t* vec) {
+    int64_t n_bytes = (vec->len + 7) / 8;   /* bitmap bytes covering len rows */
+    bool found = false;
+    if (!(vec->attrs & RAY_ATTR_NULLMAP_EXT)) {
+        /* inline bitmap: never look past its first 16 bytes */
+        if (n_bytes > 16) n_bytes = 16;
+        for (int64_t b = 0; b < n_bytes && !found; b++)
+            found = vec->nullmap[b] != 0;
+    } else {
+        const uint8_t* ext_bits = (const uint8_t*)ray_data(vec->ext_nullmap);
+        for (int64_t b = 0; b < n_bytes && !found; b++)
+            found = ext_bits[b] != 0;
+    }
+    if (found)
+        vec->attrs |= RAY_ATTR_HAS_NULLS;
+}
+
+#endif /* RAY_EXEC_INTERNAL_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/join.c b/crates/rayforce-sys/vendor/rayforce/src/ops/join.c
new file mode 100644
index 0000000..21baa4a
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/join.c
@@ -0,0 +1,1972 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "ops/internal.h"
+
+/* ── Hash helper (shared by radix and chained HT join paths) ──────────── */
+
+static uint64_t hash_row_keys(ray_t** key_vecs, uint8_t n_keys, int64_t row) {
+    uint64_t acc = 0;
+    for (uint8_t ki = 0; ki < n_keys; ki++) {
+        ray_t* key_col = key_vecs[ki];
+        if (key_col == NULL) continue;   /* an absent key column contributes nothing */
+        if (ray_vec_is_null(key_col, row)) {
+            /* NULL key: fold in the row number so the hash is row-specific and stop */
+            return acc ^ ((uint64_t)row * 0x9E3779B97F4A7C15ULL);
+        }
+        uint64_t key_hash = (key_col->type == RAY_F64)
+            ? ray_hash_f64(((double*)ray_data(key_col))[row])
+            : ray_hash_i64(read_col_i64(ray_data(key_col), row, key_col->type, key_col->attrs));
+        /* key slot 0 replaces the seed; every later slot is combined into it */
+        acc = (ki != 0) ? ray_hash_combine(acc, key_hash) : key_hash;
+    }
+    return acc;
+}
+
+/* ============================================================================
+ * Radix-partitioned hash join
+ *
+ * Three-phase pipeline, plus a fallback for small inputs:
+ *   Phase 1: Partition both sides by radix bits of hash (parallel)
+ *   Phase 2: Per-partition build + probe with open-addressing HT (parallel)
+ *   Phase 3: Gather output columns from matched pairs (parallel)
+ *   Fallback: chained HT for small joins (< RAY_PARALLEL_THRESHOLD)
+ * ============================================================================ */
+
+/* Partition entry: row index + cached hash */
+typedef struct {
+    uint32_t row_idx;   /* source row number, narrowed to 32 bits when scattered */
+    uint32_t hash;      /* 32-bit hash copied from the pre-computed hashes[] array */
+} join_radix_entry_t;
+
+/* Per-partition descriptor */
+typedef struct {
+    join_radix_entry_t* entries;     /* partition buffer (from ray_alloc) */
+    ray_t*               entries_hdr; /* ray_alloc header for freeing */
+    uint32_t            count;       /* number of entries in partition */
+} join_radix_part_t;
+
+/* Pick the radix width so that one partition's hash table fits the L2 budget.
+ * Estimated HT footprint: 2x capacity × 8B slot = 16B per right-side row. */
+static uint8_t radix_join_bits(int64_t right_rows) {
+    const size_t ht_bytes = (size_t)right_rows * 16;
+    uint8_t bits = 0;
+
+    if (ht_bytes > RAY_JOIN_L2_TARGET) {
+        /* Double the budget until it covers ht_bytes, i.e.
+         * bits = ceil(log2(ht_bytes / L2_TARGET)), capped at MAX_RADIX. */
+        size_t budget = RAY_JOIN_L2_TARGET;
+        while (budget < ht_bytes && bits < RAY_JOIN_MAX_RADIX) {
+            budget *= 2;
+            bits++;
+        }
+    }
+    /* Never go below the configured minimum. */
+    return (bits < RAY_JOIN_MIN_RADIX) ? (uint8_t)RAY_JOIN_MIN_RADIX : bits;
+}
+
+/* Context for parallel hash pre-computation (the `raw` argument of join_radix_hash_fn) */
+typedef struct {
+    ray_t**    key_vecs;  /* join key columns, passed straight to hash_row_keys */
+    uint8_t   n_keys;     /* number of entries in key_vecs */
+    uint32_t* hashes;    /* output: hash[row] */
+} join_radix_hash_ctx_t;
+
+static void join_radix_hash_fn(void* raw, uint32_t wid, int64_t start, int64_t end) {
+    const join_radix_hash_ctx_t* job = (const join_radix_hash_ctx_t*)raw;
+    uint32_t* const dst = job->hashes;  /* receives the low 32 bits of each row's key hash */
+    (void)wid;
+    for (int64_t row = start; row < end; row++) dst[row] = (uint32_t)hash_row_keys(job->key_vecs, job->n_keys, row);
+}
+
+/* Context for the partition histogram pass (reads the pre-computed hashes).
+ * Uses fixed row assignment: task i processes rows [i*chunk, (i+1)*chunk),
+ * where chunk = ceil(nrows / n_workers). The scatter pass recomputes the same
+ * ranges, which is what allows non-atomic per-task scatter offsets. */
+typedef struct {
+    uint32_t* hashes;       /* one 32-bit hash per row (read only by the pass) */
+    uint32_t  radix_mask;   /* applied after the shift to obtain the partition number */
+    uint8_t   radix_shift;  /* right shift applied to the hash before masking */
+    uint32_t  n_parts;      /* number of partitions; row length of histograms */
+    uint32_t  n_workers;    /* number of tasks the rows are divided among */
+    int64_t   nrows;        /* total number of rows */
+    uint32_t* histograms;   /* [n_workers][n_parts] flat array */
+} join_radix_hist_ctx_t;
+
+static void join_radix_hist_fn(void* raw, uint32_t wid, int64_t task_start, int64_t task_end) {
+    join_radix_hist_ctx_t* ctx = (join_radix_hist_ctx_t*)raw;
+    (void)wid;
+    (void)task_end;
+
+    /* task_start is the task index; derive this task's fixed slice of rows */
+    const uint32_t task = (uint32_t)task_start;
+    const int64_t total = ctx->nrows;
+    const int64_t per_task = (total + (int64_t)ctx->n_workers - 1) / (int64_t)ctx->n_workers;
+    const int64_t first = (int64_t)task * per_task;
+    if (first >= total) return;   /* trailing task with no rows */
+    const int64_t last = (first + per_task > total) ? total : first + per_task;
+
+    /* count rows per partition into this task's private histogram row */
+    uint32_t* counts = ctx->histograms + task * ctx->n_parts;
+    for (int64_t row = first; row < last; row++) {
+        uint32_t bucket = (ctx->hashes[row] >> ctx->radix_shift) & ctx->radix_mask;
+        counts[bucket]++;
+    }
+}
+
+/* Context for parallel partition scatter with write-combining buffers.
+ * Each worker writes to small local buffers (one per partition). When
+ * a buffer fills, it flushes to the partition in a burst memcpy.
+ * This converts random writes into sequential bursts, dramatically
+ * improving cache utilization.
+ *
+ * Uses fixed per-worker row assignments (dispatch_n with n_workers tasks)
+ * to match histogram phase, eliminating atomic operations. */
+#define WCB_SIZE 64  /* entries per write-combine buffer */
+typedef struct {
+    uint32_t*           hashes;      /* one 32-bit hash per row (read only by the pass) */
+    uint32_t            radix_mask;  /* applied after the shift to obtain the partition number */
+    uint8_t             radix_shift; /* right shift applied to the hash before masking */
+    uint32_t            n_parts;     /* number of partitions */
+    join_radix_part_t*  parts;       /* destination partitions; entries must already be allocated */
+    uint32_t*           offsets;     /* [n_workers][n_parts] per-worker write positions */
+    int64_t             nrows;       /* total number of rows */
+    uint32_t            n_workers;   /* number of tasks the rows are divided among */
+    _Atomic(uint8_t)    had_error;   /* set by any worker on OOM */
+} join_radix_scatter_ctx_t;
+
+static void join_radix_scatter_fn(void* raw, uint32_t wid, int64_t task_start, int64_t task_end) {
+    (void)wid; (void)task_end;   /* only task_start, used as the task index, matters */
+    join_radix_scatter_ctx_t* c = (join_radix_scatter_ctx_t*)raw;
+    uint32_t mask = c->radix_mask;
+    uint8_t shift = c->radix_shift;
+    uint32_t n_parts = c->n_parts;
+
+    /* Fixed row range for this task (matches histogram) */
+    uint32_t tid = (uint32_t)task_start;
+    int64_t chunk = (c->nrows + (int64_t)c->n_workers - 1) / (int64_t)c->n_workers;  /* ceil(nrows / n_workers) */
+    int64_t ws = (int64_t)tid * chunk;
+    int64_t we = ws + chunk;
+    if (we > c->nrows) we = c->nrows;
+    if (ws >= c->nrows) return;   /* this task has no rows */
+
+    uint32_t* off = c->offsets + tid * n_parts;   /* this task's write cursor for each partition */
+
+    /* Write-combining: per-partition fill counters; on the stack up to 1024 partitions, heap beyond */
+    uint32_t wcb_cnt_stack[1024];
+    uint32_t* wcb_cnt_p = wcb_cnt_stack;
+    ray_t* wcb_cnt_hdr = NULL;
+    if (n_parts > 1024) {
+        wcb_cnt_p = (uint32_t*)scratch_calloc(&wcb_cnt_hdr, (size_t)n_parts * sizeof(uint32_t));
+        if (!wcb_cnt_p) {
+            atomic_store_explicit(&c->had_error, 1, memory_order_relaxed);   /* nothing scattered; caller discards the result */
+            return;
+        }
+    } else {
+        memset(wcb_cnt_stack, 0, (size_t)n_parts * sizeof(uint32_t));   /* only the first n_parts counters are used */
+    }
+
+    /* Allocate per-partition local buffers: n_parts buffers of WCB_SIZE entries each */
+    ray_t* local_hdr = NULL;
+    join_radix_entry_t* local_buf = (join_radix_entry_t*)scratch_alloc(&local_hdr,
+        (size_t)n_parts * WCB_SIZE * sizeof(join_radix_entry_t));
+    if (!local_buf) {
+        /* Fallback: direct write without buffering (not treated as an error) */
+        for (int64_t r = ws; r < we; r++) {
+            uint32_t h = c->hashes[r];
+            uint32_t part = (h >> shift) & mask;
+            uint32_t pos = off[part]++;
+            c->parts[part].entries[pos].row_idx = (uint32_t)r;
+            c->parts[part].entries[pos].hash = h;
+        }
+        if (wcb_cnt_hdr) scratch_free(wcb_cnt_hdr);
+        return;
+    }
+
+    for (int64_t r = ws; r < we; r++) {
+        uint32_t h = c->hashes[r];
+        uint32_t part = (h >> shift) & mask;
+        uint32_t idx = wcb_cnt_p[part];   /* fill level of this partition's local buffer */
+        local_buf[part * WCB_SIZE + idx].row_idx = (uint32_t)r;
+        local_buf[part * WCB_SIZE + idx].hash = h;
+        idx++;
+        if (idx == WCB_SIZE) {
+            /* Buffer full: flush all WCB_SIZE entries to the partition at this task's cursor */
+            memcpy(&c->parts[part].entries[off[part]],
+                   &local_buf[part * WCB_SIZE],
+                   WCB_SIZE * sizeof(join_radix_entry_t));
+            off[part] += WCB_SIZE;
+            idx = 0;
+        }
+        wcb_cnt_p[part] = idx;
+    }
+
+    /* Flush remaining entries (partially filled buffers) */
+    for (uint32_t p = 0; p < n_parts; p++) {
+        uint32_t cnt = wcb_cnt_p[p];
+        if (cnt > 0) {
+            memcpy(&c->parts[p].entries[off[p]],
+                   &local_buf[p * WCB_SIZE],
+                   (size_t)cnt * sizeof(join_radix_entry_t));
+            off[p] += cnt;
+        }
+    }
+
+    scratch_free(local_hdr);
+    if (wcb_cnt_hdr) scratch_free(wcb_cnt_hdr);   /* only set when the counters came from the heap */
+}
+
+/* Partition one side of the join. Returns array of join_radix_part_t[n_parts], or NULL on OOM
+ * (then *parts_hdr_out is NULL). Caller must free each partition's entries_hdr and the parts array itself. */
+static join_radix_part_t* join_radix_partition(ray_pool_t* pool, int64_t nrows,
+                                      uint8_t radix_bits,
+                                      uint32_t* hashes,
+                                      ray_t** parts_hdr_out) {
+    uint32_t n_parts = (uint32_t)1 << radix_bits;
+    uint32_t mask = n_parts - 1;
+    /* Use upper bits of hash for radix (lower bits used inside partition HT).
+     * NOTE(review): radix_bits == 0 gives shift == 32, and shifting a uint32_t by 32 is undefined — confirm RAY_JOIN_MIN_RADIX >= 1. */
+    uint8_t shift = 32 - radix_bits;
+
+    /* Allocate partition descriptor array */
+    ray_t* parts_hdr;
+    join_radix_part_t* parts = (join_radix_part_t*)scratch_calloc(&parts_hdr,
+                            (size_t)n_parts * sizeof(join_radix_part_t));
+    if (!parts) { *parts_hdr_out = NULL; return NULL; }
+    *parts_hdr_out = parts_hdr;
+
+    /* Step 1: Histogram — count rows per partition per worker.
+     * n_workers must match dispatch: 1 when running serially so that the
+     * single hist/scatter call covers all rows (chunk = nrows / 1). */
+    uint32_t n_workers = (pool && nrows > RAY_PARALLEL_THRESHOLD) ? pool->n_workers + 1 : 1;  /* +1: presumably the calling thread — confirm */
+    ray_t* hist_hdr;
+    uint32_t* histograms = (uint32_t*)scratch_calloc(&hist_hdr,
+                             (size_t)n_workers * n_parts * sizeof(uint32_t));
+    if (!histograms) { scratch_free(parts_hdr); *parts_hdr_out = NULL; return NULL; }
+
+    join_radix_hist_ctx_t hctx = {
+        .hashes = hashes,
+        .radix_mask = mask, .radix_shift = shift,
+        .n_parts = n_parts, .n_workers = n_workers,
+        .nrows = nrows,
+        .histograms = histograms,
+    };
+    if (pool && nrows > RAY_PARALLEL_THRESHOLD)
+        ray_pool_dispatch_n(pool, join_radix_hist_fn, &hctx, n_workers);
+    else
+        join_radix_hist_fn(&hctx, 0, 0, 1);   /* serial: a single task with index 0 */
+
+    /* Compute partition sizes (sum across workers) */
+    for (uint32_t p = 0; p < n_parts; p++) {
+        uint32_t total = 0;
+        for (uint32_t w = 0; w < n_workers; w++)
+            total += histograms[w * n_parts + p];
+        parts[p].count = total;
+    }
+
+    /* Allocate partition buffers; empty partitions keep entries == NULL from the calloc */
+    bool oom = false;
+    for (uint32_t p = 0; p < n_parts; p++) {
+        if (parts[p].count == 0) continue;
+        parts[p].entries = (join_radix_entry_t*)scratch_alloc(&parts[p].entries_hdr,
+                             (size_t)parts[p].count * sizeof(join_radix_entry_t));
+        if (!parts[p].entries) {
+            /* First attempt failed: run the heap GC, release pages, then retry once */
+            ray_heap_gc();
+            ray_heap_release_pages();
+            parts[p].entries = (join_radix_entry_t*)scratch_alloc(&parts[p].entries_hdr,
+                                 (size_t)parts[p].count * sizeof(join_radix_entry_t));
+            if (!parts[p].entries) { oom = true; break; }
+        }
+    }
+    if (oom) {
+        for (uint32_t p = 0; p < n_parts; p++)
+            if (parts[p].entries_hdr) scratch_free(parts[p].entries_hdr);
+        scratch_free(hist_hdr);
+        scratch_free(parts_hdr);
+        *parts_hdr_out = NULL;
+        return NULL;
+    }
+
+    /* Step 2: Compute per-worker write offsets (prefix sum of histograms).
+     * For each partition p, worker w's write offset =
+     *   sum(histograms[0..w-1][p]) = global prefix for workers before w. */
+    ray_t* off_hdr;
+    uint32_t* offsets = (uint32_t*)scratch_alloc(&off_hdr,
+                            (size_t)n_workers * n_parts * sizeof(uint32_t));
+    if (!offsets) {
+        for (uint32_t p = 0; p < n_parts; p++)
+            if (parts[p].entries_hdr) scratch_free(parts[p].entries_hdr);
+        scratch_free(hist_hdr);
+        scratch_free(parts_hdr);
+        *parts_hdr_out = NULL;
+        return NULL;
+    }
+    for (uint32_t p = 0; p < n_parts; p++) {
+        uint32_t running = 0;   /* entries of partition p claimed by workers before w */
+        for (uint32_t w = 0; w < n_workers; w++) {
+            offsets[w * n_parts + p] = running;
+            running += histograms[w * n_parts + p];
+        }
+    }
+
+    /* Step 3: Scatter rows into partition buffers (fixed row assignment, no atomics) */
+    join_radix_scatter_ctx_t sctx = {
+        .hashes = hashes,
+        .radix_mask = mask, .radix_shift = shift,
+        .n_parts = n_parts, .parts = parts,
+        .offsets = offsets,
+        .nrows = nrows, .n_workers = n_workers,
+        .had_error = 0,
+    };
+    if (pool && nrows > RAY_PARALLEL_THRESHOLD)
+        ray_pool_dispatch_n(pool, join_radix_scatter_fn, &sctx, n_workers);
+    else
+        join_radix_scatter_fn(&sctx, 0, 0, 1);   /* serial: a single task with index 0 */
+
+    scratch_free(off_hdr);
+    scratch_free(hist_hdr);
+
+    if (atomic_load_explicit(&sctx.had_error, memory_order_relaxed)) {   /* a scatter task hit OOM: discard everything */
+        for (uint32_t p = 0; p < n_parts; p++)
+            if (parts[p].entries_hdr) scratch_free(parts[p].entries_hdr);
+        scratch_free(parts_hdr);
+        *parts_hdr_out = NULL;
+        return NULL;
+    }
+
+    return parts;
+}
+
+/* ============================================================================
+ * Join execution (parallel hash join)
+ *
+ * Three-phase pipeline:
+ *   Phase 1 (sequential): Build chained hash table on right side
+ *   Phase 2 (parallel):   Two-pass probe — count matches, prefix-sum, fill
+ *   Phase 3 (parallel):   Column gather — assemble result columns
+ * ============================================================================ */
+
+/* Row-pair key comparison, shared by the count and fill phases. */
+static inline bool join_keys_eq(ray_t* const* l_vecs, ray_t* const* r_vecs, uint8_t n_keys,
+                                 int64_t l, int64_t r) {
+    for (uint8_t ki = 0; ki < n_keys; ki++) {
+        ray_t* lhs = l_vecs[ki];
+        ray_t* rhs = r_vecs[ki];
+        /* a missing column or a NULL on either side never matches (NULL != NULL) */
+        if (lhs == NULL || rhs == NULL) return false;
+        if (ray_vec_is_null(lhs, l) || ray_vec_is_null(rhs, r)) return false;
+        /* the left column's type selects the double or the integer comparison */
+        bool same = (lhs->type == RAY_F64)
+            ? ((double*)ray_data(lhs))[l] == ((double*)ray_data(rhs))[r]
+            : read_col_i64(ray_data(lhs), l, lhs->type, lhs->attrs) ==
+              read_col_i64(ray_data(rhs), r, rhs->type, rhs->attrs);
+        if (!same) return false;
+    }
+    return true;
+}
+
+/* ── Per-partition open-addressing build + probe ─────────────────────── */
+
+/* Sentinel stored in the row_idx half of a hash-table slot to mark it empty. */
+#define RADIX_HT_EMPTY UINT32_MAX
+
+/* Per-partition single-pass build+probe context.
+ * Each partition writes to its own local output buffer, then results
+ * are consolidated into contiguous arrays afterward.
+ * One instance is shared by all build+probe tasks; each task only touches
+ * the array elements at its own partition index p. */
+typedef struct {
+    join_radix_part_t*  l_parts;      /* left-side partitions, indexed by p */
+    join_radix_part_t*  r_parts;      /* right-side partitions, indexed by p */
+    ray_t**         l_key_vecs;       /* left key columns (passed to join_keys_eq) */
+    ray_t**         r_key_vecs;       /* right key columns (passed to join_keys_eq) */
+    uint8_t        n_keys;            /* number of key columns */
+    uint8_t        join_type;         /* >= 1 emits unmatched left rows (LEFT/FULL) */
+    /* Per-partition output: pp_l[p], pp_r[p] are local buffers */
+    int32_t**      pp_l;         /* per-partition left indices (int32_t) */
+    int32_t**      pp_r;         /* per-partition right indices (int32_t); -1 = no right match */
+    ray_t**         pp_l_hdr;     /* allocation headers for freeing */
+    ray_t**         pp_r_hdr;
+    int64_t*       part_counts;  /* actual output count per partition */
+    uint32_t*      pp_cap;       /* capacity per partition */
+    _Atomic(uint8_t)* matched_right; /* per-right-row "was matched" flags; NULL disables tracking */
+    _Atomic(uint8_t)  had_error;  /* set by any partition on OOM */
+} join_radix_bp_ctx_t;
+
+/* Grow per-partition output buffers (matched pair arrays).
+ *
+ * Ensures there is room for one more pair at index `cnt`.  When cnt < *cap
+ * nothing happens.  Otherwise both buffers are reallocated at twice the
+ * capacity, the first `cnt` entries are copied over, the old buffers are
+ * released and the partition's header slots in `c` are updated.
+ *
+ *   c    shared build+probe context (owns pp_l_hdr / pp_r_hdr / had_error)
+ *   p    partition index whose buffers are being grown
+ *   pl   in/out: left-index buffer pointer
+ *   pr   in/out: right-index buffer pointer
+ *   cap  in/out: current capacity in elements
+ *   cnt  number of valid entries currently stored
+ *
+ * Returns true on success, false on OOM (sets had_error).  On failure the
+ * existing buffers, headers and *cap are left untouched so the caller can
+ * still publish and later free them. */
+static inline bool bp_grow_bufs(join_radix_bp_ctx_t* c, uint32_t p,
+                                 int32_t** pl, int32_t** pr,
+                                 uint32_t* cap, uint32_t cnt) {
+    if (cnt < *cap) return true;
+    /* Doubling would overflow uint32_t — treat as an allocation failure. */
+    if (*cap > UINT32_MAX / 2) {
+        atomic_store_explicit(&c->had_error, 1, memory_order_relaxed);
+        return false;
+    }
+    uint32_t new_cap = *cap * 2;
+    /* Start both headers at NULL (as every other scratch_alloc call site in
+     * this file does) so the failure path below never tests or frees an
+     * indeterminate pointer if an allocation fails without writing its
+     * header. */
+    ray_t* nl_hdr = NULL;
+    ray_t* nr_hdr = NULL;
+    int32_t* nl = (int32_t*)scratch_alloc(&nl_hdr, (size_t)new_cap * sizeof(int32_t));
+    int32_t* nr = (int32_t*)scratch_alloc(&nr_hdr, (size_t)new_cap * sizeof(int32_t));
+    if (!nl || !nr) {
+        /* Release whichever half did succeed; the old buffers stay live. */
+        if (nl_hdr) scratch_free(nl_hdr);
+        if (nr_hdr) scratch_free(nr_hdr);
+        atomic_store_explicit(&c->had_error, 1, memory_order_relaxed);
+        return false;
+    }
+    memcpy(nl, *pl, (size_t)cnt * sizeof(int32_t));
+    memcpy(nr, *pr, (size_t)cnt * sizeof(int32_t));
+    /* Swap in the new buffers and drop the old ones. */
+    scratch_free(c->pp_l_hdr[p]); scratch_free(c->pp_r_hdr[p]);
+    *pl = nl; *pr = nr;
+    c->pp_l_hdr[p] = nl_hdr; c->pp_r_hdr[p] = nr_hdr;
+    *cap = new_cap;
+    return true;
+}
+
+/* Build+probe task for a single radix partition.
+ *
+ *   raw         join_radix_bp_ctx_t* shared by all partitions
+ *   wid         worker id (unused)
+ *   task_start  partition index p to process
+ *   task_end    unused
+ *
+ * Builds an open-addressing (linear probing) hash table over the right
+ * partition, probes it with every left row, and appends (left, right) row
+ * index pairs to this partition's private output buffers.  Unmatched left
+ * rows are emitted with right index -1 when join_type >= 1.  Results are
+ * published through c->pp_l[p], c->pp_r[p], c->part_counts[p], c->pp_cap[p];
+ * any allocation failure sets c->had_error. */
+static void join_radix_build_probe_fn(void* raw, uint32_t wid, int64_t task_start, int64_t task_end) {
+    (void)wid; (void)task_end;
+    join_radix_bp_ctx_t* c = (join_radix_bp_ctx_t*)raw;
+    uint32_t p = (uint32_t)task_start;
+
+    join_radix_part_t* rp = &c->r_parts[p];
+    join_radix_part_t* lp = &c->l_parts[p];
+
+    if (rp->count == 0) {
+        /* No right rows — emit unmatched left rows for LEFT/FULL */
+        if (c->join_type >= 1 && lp->count > 0) {
+            /* Exactly one output pair per left row, so capacity is known up front. */
+            uint32_t cap = lp->count;
+            int32_t* pl = (int32_t*)scratch_alloc(&c->pp_l_hdr[p], (size_t)cap * sizeof(int32_t));
+            int32_t* pr = (int32_t*)scratch_alloc(&c->pp_r_hdr[p], (size_t)cap * sizeof(int32_t));
+            if (pl && pr) {
+                for (uint32_t i = 0; i < lp->count; i++) {
+                    pl[i] = (int32_t)lp->entries[i].row_idx;
+                    pr[i] = -1;
+                }
+                c->pp_l[p] = pl; c->pp_r[p] = pr;
+                c->part_counts[p] = lp->count;
+                c->pp_cap[p] = cap;
+            } else {
+                /* Partial allocation: release whichever half succeeded and flag OOM. */
+                if (c->pp_l_hdr[p]) scratch_free(c->pp_l_hdr[p]);
+                if (c->pp_r_hdr[p]) scratch_free(c->pp_r_hdr[p]);
+                c->pp_l_hdr[p] = NULL; c->pp_r_hdr[p] = NULL;
+                atomic_store_explicit(&c->had_error, 1, memory_order_relaxed);
+            }
+        }
+        /* Otherwise this partition's output slots are left untouched. */
+        return;
+    }
+
+    /* Allocate per-partition output buffer.
+     * Capacity = max(left, right) handles 1:1 and 1:N joins.
+     * For N:M (overflow), we grow by re-allocating. */
+    uint32_t init_cap = lp->count > rp->count ? lp->count : rp->count;
+    if (init_cap < 64) init_cap = 64;
+    int32_t* pl = (int32_t*)scratch_alloc(&c->pp_l_hdr[p], (size_t)init_cap * sizeof(int32_t));
+    int32_t* pr = (int32_t*)scratch_alloc(&c->pp_r_hdr[p], (size_t)init_cap * sizeof(int32_t));
+    if (!pl || !pr) {
+        if (c->pp_l_hdr[p]) scratch_free(c->pp_l_hdr[p]);
+        if (c->pp_r_hdr[p]) scratch_free(c->pp_r_hdr[p]);
+        c->pp_l_hdr[p] = NULL; c->pp_r_hdr[p] = NULL;
+        c->part_counts[p] = 0;
+        atomic_store_explicit(&c->had_error, 1, memory_order_relaxed);
+        return;
+    }
+    uint32_t cap = init_cap;
+    uint32_t cnt = 0;
+
+    /* Build open-addressing HT for right partition */
+    /* Capacity is a power of two (so `& ht_mask` works), at least 256 and at
+     * least twice the right row count; doubling stops before uint32_t overflows. */
+    uint32_t ht_cap = 256;
+    uint64_t ht_target = (uint64_t)rp->count * 2;
+    while ((uint64_t)ht_cap < ht_target && ht_cap <= (UINT32_MAX >> 1)) ht_cap *= 2;
+    if ((uint64_t)ht_cap < ht_target) {
+        /* Partition too large for open-addressing HT — signal error */
+        atomic_store_explicit(&c->had_error, 1, memory_order_relaxed);
+        c->part_counts[p] = 0;
+        scratch_free(c->pp_l_hdr[p]); scratch_free(c->pp_r_hdr[p]);
+        c->pp_l_hdr[p] = NULL; c->pp_r_hdr[p] = NULL;
+        return;
+    }
+    uint32_t ht_mask = ht_cap - 1;
+
+    /* Each slot is a pair of uint32_t: ht[2*s] = hash, ht[2*s + 1] = right row index. */
+    ray_t* ht_hdr;
+    uint32_t* ht = (uint32_t*)scratch_calloc(&ht_hdr, (size_t)ht_cap * 2 * sizeof(uint32_t));
+    if (!ht) {
+        atomic_store_explicit(&c->had_error, 1, memory_order_relaxed);
+        scratch_free(c->pp_l_hdr[p]); scratch_free(c->pp_r_hdr[p]);
+        c->pp_l_hdr[p] = NULL; c->pp_r_hdr[p] = NULL;
+        c->part_counts[p] = 0;
+        return;
+    }
+    /* Mark every slot empty through its row-index half. */
+    for (uint32_t s = 0; s < ht_cap; s++)
+        ht[s * 2 + 1] = RADIX_HT_EMPTY;
+
+    /* Insert every right row with linear probing; rows with equal hashes
+     * simply occupy successive free slots. */
+    for (uint32_t i = 0; i < rp->count; i++) {
+        uint32_t h = rp->entries[i].hash;
+        uint32_t slot = h & ht_mask;
+        /* Prefetch (for write) the home slot of the entry four ahead. */
+        if (i + 4 < rp->count)
+            __builtin_prefetch(&ht[(rp->entries[i + 4].hash & ht_mask) * 2], 1, 1);
+        while (ht[slot * 2 + 1] != RADIX_HT_EMPTY)
+            slot = (slot + 1) & ht_mask;
+        ht[slot * 2] = h;
+        ht[slot * 2 + 1] = rp->entries[i].row_idx;
+    }
+
+    /* Single-pass probe + fill */
+    for (uint32_t i = 0; i < lp->count; i++) {
+        uint32_t h = lp->entries[i].hash;
+        uint32_t lr = lp->entries[i].row_idx;
+        uint32_t slot = h & ht_mask;
+        /* Prefetch (for read) the home slot of the entry four ahead. */
+        if (i + 4 < lp->count)
+            __builtin_prefetch(&ht[(lp->entries[i + 4].hash & ht_mask) * 2], 0, 1);
+        bool matched = false;
+        /* Walk the probe sequence up to the first empty slot so that every
+         * right row sharing this key is visited. */
+        while (ht[slot * 2 + 1] != RADIX_HT_EMPTY) {
+            /* Cheap hash comparison first; full key comparison only on a hash hit. */
+            if (ht[slot * 2] == h) {
+                uint32_t rr = ht[slot * 2 + 1];
+                if (join_keys_eq(c->l_key_vecs, c->r_key_vecs, c->n_keys,
+                                 (int64_t)lr, (int64_t)rr)) {
+                    if (!bp_grow_bufs(c, p, &pl, &pr, &cap, cnt))
+                        goto done;
+                    pl[cnt] = (int32_t)lr;
+                    pr[cnt] = (int32_t)rr;
+                    cnt++;
+                    matched = true;
+                    /* Record the match for the right row when tracking is enabled. */
+                    if (c->matched_right)
+                        atomic_store_explicit(&c->matched_right[rr], 1, memory_order_relaxed);
+                }
+            }
+            slot = (slot + 1) & ht_mask;
+        }
+        /* LEFT/FULL: emit the left row with no right partner. */
+        if (!matched && c->join_type >= 1) {
+            if (!bp_grow_bufs(c, p, &pl, &pr, &cap, cnt))
+                goto done;
+            pl[cnt] = (int32_t)lr;
+            pr[cnt] = -1;
+            cnt++;
+        }
+    }
+
+    /* Reached both on normal completion and after a failed grow (in which
+     * case bp_grow_bufs has already set had_error); the buffers are published
+     * either way so their headers remain reachable for freeing. */
+done:
+    scratch_free(ht_hdr);
+    c->pp_l[p] = pl; c->pp_r[p] = pr;
+    c->part_counts[p] = cnt;
+    c->pp_cap[p] = cap;
+}
+
+/* ── Parallel join HT build ─────────────────────────────────────────────
+ * Workers hash right-side rows in parallel and insert into the shared
+ * chain-linked hash table using atomic CAS on ht_heads[slot].
+ * ht_next[r] is per-row (no contention). Load factor ~0.3 → negligible
+ * CAS contention.
+ * ──────────────────────────────────────────────────────────────────── */
+
+/* ht_heads is accessed atomically from multiple workers during join build.
+ * Using _Atomic(uint32_t)* for C11-compliant atomic access. */
+#define JHT_EMPTY UINT32_MAX  /* sentinel for empty HT slot/chain end */
+
+/* Context handed to join_build_fn; shared read-only by all build workers
+ * apart from the ht_heads / ht_next arrays they populate. */
+typedef struct {
+    _Atomic(uint32_t)* ht_heads;  /* shared, protected by atomic CAS */
+    uint32_t* ht_next;            /* per-row, no contention */
+    uint32_t ht_mask;       /* ht_cap - 1 */
+    ray_t**   r_key_vecs;   /* right-side key columns, hashed via hash_row_keys */
+    uint8_t  n_keys;        /* number of key columns */
+    /* ASP-Join: semijoin filter from factorized left side (NULL if N/A) */
+    uint64_t* asp_bits;     /* bitmap tested with RAY_SEL_BIT_TEST, indexed by key value */
+    int64_t   asp_key_max;  /* largest key value the bitmap covers */
+} join_build_ctx_t;
+
+/* Build task for the chained hash table: inserts right rows [start, end).
+ *
+ *   raw    join_build_ctx_t* shared by all build workers
+ *   wid    worker id (unused)
+ *   start  first right row to insert (inclusive)
+ *   end    one past the last right row to insert
+ *
+ * Each row is pushed onto the front of its slot's chain: ht_next[row] takes
+ * the previous head and ht_heads[slot] is swung to the row with a CAS, so
+ * concurrent workers may insert into the same slot safely.
+ *
+ * When an ASP-Join bitmap is supplied, rows whose single I64 key lies inside
+ * the bitmap's range but is not set in it are left out of the table. */
+static void join_build_fn(void* raw, uint32_t wid, int64_t start, int64_t end) {
+    (void)wid;
+    join_build_ctx_t* c = (join_build_ctx_t*)raw;
+    _Atomic(uint32_t)* heads = c->ht_heads;
+    uint32_t* restrict next  = c->ht_next;
+    uint32_t mask  = c->ht_mask;
+
+    /* ASP-Join: precompute pointer for right-side build filtering.
+     * The key column is only reinterpreted as int64_t[] when it is present
+     * and really is RAY_I64; for any other right key the filter is skipped,
+     * which is always safe because it is purely an optimisation. */
+    uint64_t* asp_bits = c->asp_bits;
+    int64_t asp_max = c->asp_key_max;
+    int64_t* rk0 = NULL;
+    if (asp_bits && c->n_keys == 1 && c->r_key_vecs[0] &&
+        c->r_key_vecs[0]->type == RAY_I64)
+        rk0 = (int64_t*)ray_data(c->r_key_vecs[0]);
+
+    for (int64_t r = start; r < end; r++) {
+        /* ASP-Join skip: if right key not in left-side bitmap, skip insert.
+         * Keys outside [0, asp_max] are not covered by the bitmap and are
+         * inserted normally. */
+        if (rk0 && rk0[r] >= 0 && rk0[r] <= asp_max &&
+            !RAY_SEL_BIT_TEST(asp_bits, rk0[r])) {
+            next[(uint32_t)r] = JHT_EMPTY;  /* mark as unused */
+            continue;
+        }
+        /* Prefetch (for write) the head slot of the row eight ahead. */
+        if (r + 8 < end) {
+            uint64_t pf_h = hash_row_keys(c->r_key_vecs, c->n_keys, r + 8);
+            __builtin_prefetch(&heads[(uint32_t)(pf_h & mask)], 1, 1);
+        }
+        uint64_t h = hash_row_keys(c->r_key_vecs, c->n_keys, r);
+        uint32_t slot = (uint32_t)(h & mask);
+        uint32_t row32 = (uint32_t)r;
+        /* Lock-free push-front: on CAS failure `old` is refreshed with the
+         * current head, so next[row32] is rewritten before retrying. */
+        uint32_t old = atomic_load_explicit(&heads[slot], memory_order_relaxed);
+        do {
+            next[row32] = old;
+        } while (!atomic_compare_exchange_weak_explicit(&heads[slot], &old, row32,
+                    memory_order_release, memory_order_relaxed));
+    }
+}
+
+/* Number of left-side rows handled by one probe task (count or fill). */
+#define JOIN_MORSEL 8192
+
+/* Context shared by join_count_fn and join_fill_fn for the chained-HT probe. */
+typedef struct {
+    _Atomic(uint32_t)* ht_heads;   /* chain heads, one per hash slot */
+    uint32_t*    ht_next;          /* per-right-row link to the next row in its chain */
+    uint32_t     ht_cap;           /* slot count; probes mask with ht_cap - 1 */
+    ray_t**       l_key_vecs;      /* left key columns */
+    ray_t**       r_key_vecs;      /* right key columns */
+    uint8_t      n_keys;           /* number of key columns */
+    uint8_t      join_type;        /* >= 1 emits unmatched left rows (LEFT/FULL) */
+    int64_t      left_rows;        /* total left rows; clamps the last morsel */
+    /* Per-morsel counts/offsets (allocated by main thread) */
+    int64_t*     morsel_counts;    /* written by join_count_fn, one entry per morsel */
+    int64_t*     morsel_offsets;   /* read by join_fill_fn: first output index per morsel */
+    /* Shared output arrays (phase 2 fill) */
+    int64_t*     l_idx;            /* left row index of each output pair */
+    int64_t*     r_idx;            /* right row index of each output pair; -1 = no match */
+    /* FULL OUTER: track which right rows were matched (NULL if not full) */
+    _Atomic(uint8_t)* matched_right;
+    /* S-Join: semijoin filter bitmap (NULL if not applicable) */
+    uint64_t*    sjoin_bits;
+    int64_t      sjoin_key_max;    /* largest key value the bitmap covers */
+} join_probe_ctx_t;
+
+/* Phase 2a: count the output pairs one morsel of left rows will produce.
+ *
+ * task_start is the morsel number; wid and task_end are ignored.  The total
+ * is stored in morsel_counts[morsel] and must agree exactly with what
+ * join_fill_fn later writes for the same morsel. */
+static void join_count_fn(void* raw, uint32_t wid, int64_t task_start, int64_t task_end) {
+    (void)wid;
+    (void)task_end;
+    join_probe_ctx_t* ctx = (join_probe_ctx_t*)raw;
+    const uint32_t morsel = (uint32_t)task_start;
+
+    /* Row range of this morsel, clamped to the end of the left table. */
+    const int64_t first = (int64_t)morsel * JOIN_MORSEL;
+    const int64_t morsel_end = first + JOIN_MORSEL;
+    const int64_t limit = (morsel_end > ctx->left_rows) ? ctx->left_rows : morsel_end;
+
+    /* LEFT/FULL joins emit a pair even for left rows without a partner. */
+    const bool emit_unmatched = ctx->join_type >= 1;
+
+    /* S-Join: raw key pointer, only when a bitmap exists and there is one key. */
+    uint64_t* filter = ctx->sjoin_bits;
+    const int64_t filter_max = ctx->sjoin_key_max;
+    int64_t* left_keys = NULL;
+    if (filter && ctx->n_keys == 1)
+        left_keys = (int64_t*)ray_data(ctx->l_key_vecs[0]);
+
+    const uint32_t mask = ctx->ht_cap - 1;
+    int64_t total = 0;
+
+    for (int64_t row = first; row < limit; row++) {
+        if (left_keys) {
+            /* In-range key absent from the right-side bitmap: no probe needed. */
+            const int64_t key = left_keys[row];
+            if (key >= 0 && key <= filter_max && !RAY_SEL_BIT_TEST(filter, key)) {
+                if (emit_unmatched) total++;
+                continue;
+            }
+        }
+
+        /* Prefetch (for read) the head slot of the row eight ahead. */
+        if (row + 8 < limit) {
+            uint64_t ahead = hash_row_keys(ctx->l_key_vecs, ctx->n_keys, row + 8);
+            __builtin_prefetch(&ctx->ht_heads[(uint32_t)(ahead & mask)], 0, 1);
+        }
+
+        uint64_t hash = hash_row_keys(ctx->l_key_vecs, ctx->n_keys, row);
+        uint32_t slot = (uint32_t)(hash & mask);
+
+        /* Walk the slot's chain and count every right row with equal keys. */
+        int64_t hits = 0;
+        uint32_t cand = ctx->ht_heads[slot];
+        while (cand != JHT_EMPTY) {
+            if (join_keys_eq(ctx->l_key_vecs, ctx->r_key_vecs, ctx->n_keys, row, (int64_t)cand))
+                hits++;
+            cand = ctx->ht_next[cand];
+        }
+
+        if (hits > 0)
+            total += hits;
+        else if (emit_unmatched)
+            total++;
+    }
+
+    ctx->morsel_counts[morsel] = total;
+}
+
+/* Phase 2b: write the (left, right) index pairs for one morsel of left rows.
+ *
+ * task_start is the morsel number; wid and task_end are ignored.  Output
+ * starts at morsel_offsets[morsel] in the shared l_idx / r_idx arrays and
+ * follows the same row-by-row decisions as join_count_fn, so each morsel
+ * stays within the range reserved for it. */
+static void join_fill_fn(void* raw, uint32_t wid, int64_t task_start, int64_t task_end) {
+    (void)wid;
+    (void)task_end;
+    join_probe_ctx_t* ctx = (join_probe_ctx_t*)raw;
+    const uint32_t morsel = (uint32_t)task_start;
+
+    /* Row range of this morsel, clamped to the end of the left table. */
+    const int64_t first = (int64_t)morsel * JOIN_MORSEL;
+    const int64_t morsel_end = first + JOIN_MORSEL;
+    const int64_t limit = (morsel_end > ctx->left_rows) ? ctx->left_rows : morsel_end;
+
+    int64_t cursor = ctx->morsel_offsets[morsel];
+    int64_t* restrict out_l = ctx->l_idx;
+    int64_t* restrict out_r = ctx->r_idx;
+
+    /* LEFT/FULL joins emit a pair even for left rows without a partner. */
+    const bool emit_unmatched = ctx->join_type >= 1;
+
+    /* S-Join: raw key pointer, only when a bitmap exists and there is one key. */
+    uint64_t* filter = ctx->sjoin_bits;
+    const int64_t filter_max = ctx->sjoin_key_max;
+    int64_t* left_keys = NULL;
+    if (filter && ctx->n_keys == 1)
+        left_keys = (int64_t*)ray_data(ctx->l_key_vecs[0]);
+
+    const uint32_t mask = ctx->ht_cap - 1;
+
+    for (int64_t row = first; row < limit; row++) {
+        if (left_keys) {
+            /* In-range key absent from the right-side bitmap: no probe needed. */
+            const int64_t key = left_keys[row];
+            if (key >= 0 && key <= filter_max && !RAY_SEL_BIT_TEST(filter, key)) {
+                if (emit_unmatched) {
+                    out_l[cursor] = row;
+                    out_r[cursor] = -1;
+                    cursor++;
+                }
+                continue;
+            }
+        }
+
+        /* Prefetch (for read) the head slot of the row eight ahead. */
+        if (row + 8 < limit) {
+            uint64_t ahead = hash_row_keys(ctx->l_key_vecs, ctx->n_keys, row + 8);
+            __builtin_prefetch(&ctx->ht_heads[(uint32_t)(ahead & mask)], 0, 1);
+        }
+
+        uint64_t hash = hash_row_keys(ctx->l_key_vecs, ctx->n_keys, row);
+        uint32_t slot = (uint32_t)(hash & mask);
+
+        /* Walk the slot's chain, emitting one pair per right row with equal keys. */
+        const int64_t before = cursor;
+        uint32_t cand = ctx->ht_heads[slot];
+        while (cand != JHT_EMPTY) {
+            if (join_keys_eq(ctx->l_key_vecs, ctx->r_key_vecs, ctx->n_keys, row, (int64_t)cand)) {
+                out_l[cursor] = row;
+                out_r[cursor] = (int64_t)cand;
+                cursor++;
+                /* Monotonic 0→1 store from multiple workers. */
+                if (ctx->matched_right)
+                    atomic_store_explicit(&ctx->matched_right[cand], 1, memory_order_relaxed);
+            }
+            cand = ctx->ht_next[cand];
+        }
+
+        /* Nothing was written for this row: emit it with no right partner. */
+        if (cursor == before && emit_unmatched) {
+            out_l[cursor] = row;
+            out_r[cursor] = -1;
+            cursor++;
+        }
+    }
+}
+
+ray_t* exec_join(ray_graph_t* g, ray_op_t* op, ray_t* left_table, ray_t* right_table) {
+    if (!left_table || RAY_IS_ERR(left_table)) return left_table;
+    if (!right_table || RAY_IS_ERR(right_table)) return right_table;
+
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    int64_t left_rows = ray_table_nrows(left_table);
+    int64_t right_rows = ray_table_nrows(right_table);
+    /* Guard: radix path stores row indices as int32_t (widened to int64_t on gather).
+     * Chained HT path uses uint32_t.  Cap at INT32_MAX for correctness. */
+    if (right_rows > (int64_t)INT32_MAX || left_rows > (int64_t)INT32_MAX)
+        return ray_error("nyi", NULL);
+    uint8_t n_keys = ext->join.n_join_keys;
+    uint8_t join_type = ext->join.join_type;
+
+    /* VLA bound of zero is UB under -fsanitize=undefined.  Guarantee >=1
+     * slot; iterations below are bounded by n_keys so the extra slot is
+     * untouched when n_keys == 0. */
+    size_t key_slots = n_keys ? n_keys : 1;
+    ray_t* l_key_vecs[key_slots];
+    ray_t* r_key_vecs[key_slots];
+    memset(l_key_vecs, 0, key_slots * sizeof(ray_t*));
+    memset(r_key_vecs, 0, key_slots * sizeof(ray_t*));
+
+    for (uint8_t k = 0; k < n_keys; k++) {
+        ray_op_ext_t* lk = find_ext(g, ext->join.left_keys[k]->id);
+        ray_op_ext_t* rk = find_ext(g, ext->join.right_keys[k]->id);
+        if (lk && lk->base.opcode == OP_SCAN)
+            l_key_vecs[k] = ray_table_get_col(left_table, lk->sym);
+        if (rk && rk->base.opcode == OP_SCAN)
+            r_key_vecs[k] = ray_table_get_col(right_table, rk->sym);
+        if (rk && rk->base.opcode == OP_CONST && rk->literal)
+            r_key_vecs[k] = rk->literal;
+    }
+
+    /* RAY_STR keys not yet supported (16-byte elements vs 8-byte hash/eq slots) */
+    for (uint8_t k = 0; k < n_keys; k++) {
+        if ((l_key_vecs[k] && l_key_vecs[k]->type == RAY_STR) ||
+            (r_key_vecs[k] && r_key_vecs[k]->type == RAY_STR))
+            return ray_error("nyi", NULL);
+    }
+
+    ray_pool_t* pool = ray_pool_get();
+
+    /* Shared output state — used by both radix and chained HT paths */
+    ray_t* result = NULL;
+    ray_t* counts_hdr = NULL;
+    ray_t* l_idx_hdr = NULL;
+    ray_t* r_idx_hdr = NULL;
+    ray_t* matched_right_hdr = NULL;
+    ray_t* sjoin_sel = NULL;
+    ray_t* asp_sel = NULL;
+    ray_t* ht_next_hdr = NULL;
+    ray_t* ht_heads_hdr = NULL;
+    int64_t* l_idx = NULL;
+    int64_t* r_idx = NULL;
+    int64_t pair_count = 0;
+    _Atomic(uint8_t)* matched_right = NULL;
+
+    /* ── Radix-partitioned path (large joins) ──────────────────────── */
+    if (right_rows > RAY_PARALLEL_THRESHOLD) {
+        uint8_t radix_bits = radix_join_bits(right_rows);
+        uint32_t n_rparts = (uint32_t)1 << radix_bits;
+
+        /* Pre-compute hashes for both sides (once, reused by histogram+scatter) */
+        ray_t* r_hash_hdr = NULL;
+        uint32_t* r_hashes = (uint32_t*)scratch_alloc(&r_hash_hdr,
+                                (size_t)right_rows * sizeof(uint32_t));
+        ray_t* l_hash_hdr = NULL;
+        uint32_t* l_hashes = (uint32_t*)scratch_alloc(&l_hash_hdr,
+                                (size_t)left_rows * sizeof(uint32_t));
+        if (!r_hashes || !l_hashes) {
+            if (r_hash_hdr) scratch_free(r_hash_hdr);
+            if (l_hash_hdr) scratch_free(l_hash_hdr);
+            goto chained_ht_fallback;
+        }
+        join_radix_hash_ctx_t rhctx = { .key_vecs = r_key_vecs, .n_keys = n_keys, .hashes = r_hashes };
+        join_radix_hash_ctx_t lhctx = { .key_vecs = l_key_vecs, .n_keys = n_keys, .hashes = l_hashes };
+        if (pool) {
+            ray_pool_dispatch(pool, join_radix_hash_fn, &rhctx, right_rows);
+            ray_pool_dispatch(pool, join_radix_hash_fn, &lhctx, left_rows);
+        } else {
+            join_radix_hash_fn(&rhctx, 0, 0, right_rows);
+            join_radix_hash_fn(&lhctx, 0, 0, left_rows);
+        }
+
+        if (pool_cancelled(pool)) {
+            scratch_free(r_hash_hdr); scratch_free(l_hash_hdr);
+            return ray_error("cancel", NULL);
+        }
+
+        /* Partition both sides using cached hashes */
+        ray_t* r_parts_hdr = NULL;
+        join_radix_part_t* r_parts = join_radix_partition(pool, right_rows,
+                                                          radix_bits, r_hashes, &r_parts_hdr);
+        ray_t* l_parts_hdr = NULL;
+        join_radix_part_t* l_parts = join_radix_partition(pool, left_rows,
+                                                          radix_bits, l_hashes, &l_parts_hdr);
+        scratch_free(r_hash_hdr);
+        scratch_free(l_hash_hdr);
+        if (!r_parts || !l_parts) {
+            /* OOM during partitioning — fall through to chained HT path */
+            if (r_parts) {
+                for (uint32_t rp2 = 0; rp2 < n_rparts; rp2++)
+                    if (r_parts[rp2].entries_hdr) scratch_free(r_parts[rp2].entries_hdr);
+                scratch_free(r_parts_hdr);
+            }
+            if (l_parts) {
+                for (uint32_t rp2 = 0; rp2 < n_rparts; rp2++)
+                    if (l_parts[rp2].entries_hdr) scratch_free(l_parts[rp2].entries_hdr);
+                scratch_free(l_parts_hdr);
+            }
+            goto chained_ht_fallback;
+        }
+
+        if (pool_cancelled(pool)) {
+            for (uint32_t rp2 = 0; rp2 < n_rparts; rp2++) {
+                if (r_parts[rp2].entries_hdr) scratch_free(r_parts[rp2].entries_hdr);
+                if (l_parts[rp2].entries_hdr) scratch_free(l_parts[rp2].entries_hdr);
+            }
+            scratch_free(r_parts_hdr); scratch_free(l_parts_hdr);
+            return ray_error("cancel", NULL);
+        }
+
+        /* FULL OUTER: allocate matched_right tracker */
+        if (join_type == 2 && right_rows > 0) {
+            matched_right = (_Atomic(uint8_t)*)scratch_calloc(&matched_right_hdr,
+                                                               (size_t)right_rows);
+            if (!matched_right) {
+                for (uint32_t rp2 = 0; rp2 < n_rparts; rp2++) {
+                    if (r_parts[rp2].entries_hdr) scratch_free(r_parts[rp2].entries_hdr);
+                    if (l_parts[rp2].entries_hdr) scratch_free(l_parts[rp2].entries_hdr);
+                }
+                scratch_free(r_parts_hdr); scratch_free(l_parts_hdr);
+                matched_right_hdr = NULL;
+                goto chained_ht_fallback;
+            }
+        }
+
+        /* Single-pass per-partition build+probe with local output buffers */
+        ray_t* pcounts_hdr = NULL;
+        int64_t* part_counts = (int64_t*)scratch_calloc(&pcounts_hdr,
+                                  (size_t)n_rparts * sizeof(int64_t));
+        ray_t* pp_meta_hdr = NULL;
+        /* Allocate per-partition pointer arrays */
+        size_t pp_alloc_sz = (size_t)n_rparts * (2 * sizeof(int32_t*) + 2 * sizeof(ray_t*) + sizeof(uint32_t));
+        char* pp_mem = (char*)scratch_calloc(&pp_meta_hdr, pp_alloc_sz);
+        if (!part_counts || !pp_mem) {
+            if (pcounts_hdr) scratch_free(pcounts_hdr);
+            if (pp_meta_hdr) scratch_free(pp_meta_hdr);
+            for (uint32_t rp2 = 0; rp2 < n_rparts; rp2++) {
+                if (r_parts[rp2].entries_hdr) scratch_free(r_parts[rp2].entries_hdr);
+                if (l_parts[rp2].entries_hdr) scratch_free(l_parts[rp2].entries_hdr);
+            }
+            scratch_free(r_parts_hdr); scratch_free(l_parts_hdr);
+            if (matched_right_hdr) { scratch_free(matched_right_hdr); matched_right_hdr = NULL; }
+            matched_right = NULL;
+            goto chained_ht_fallback;
+        }
+        int32_t** pp_l = (int32_t**)pp_mem;
+        int32_t** pp_r = (int32_t**)(pp_mem + (size_t)n_rparts * sizeof(int32_t*));
+        ray_t** pp_l_hdr = (ray_t**)(pp_mem + (size_t)n_rparts * 2 * sizeof(int32_t*));
+        ray_t** pp_r_hdr = (ray_t**)(pp_mem + (size_t)n_rparts * (2 * sizeof(int32_t*) + sizeof(ray_t*)));
+        uint32_t* pp_cap = (uint32_t*)(pp_mem + (size_t)n_rparts * (2 * sizeof(int32_t*) + 2 * sizeof(ray_t*)));
+
+        join_radix_bp_ctx_t bp_ctx = {
+            .l_parts = l_parts, .r_parts = r_parts,
+            .l_key_vecs = l_key_vecs, .r_key_vecs = r_key_vecs,
+            .n_keys = n_keys, .join_type = join_type,
+            .pp_l = pp_l, .pp_r = pp_r,
+            .pp_l_hdr = pp_l_hdr, .pp_r_hdr = pp_r_hdr,
+            .part_counts = part_counts, .pp_cap = pp_cap,
+            .matched_right = matched_right,
+            .had_error = 0,
+        };
+        if (pool && n_rparts > 1)
+            ray_pool_dispatch_n(pool, join_radix_build_probe_fn, &bp_ctx, n_rparts);
+        else
+            for (uint32_t rp2 = 0; rp2 < n_rparts; rp2++)
+                join_radix_build_probe_fn(&bp_ctx, 0, rp2, rp2 + 1);
+
+        /* Check cancellation and errors during build+probe */
+        bool bp_cancelled = pool_cancelled(pool);
+        bool bp_error = atomic_load_explicit(&bp_ctx.had_error, memory_order_relaxed);
+        if (bp_cancelled || bp_error) {
+            /* Free all per-partition buffers */
+            for (uint32_t rp2 = 0; rp2 < n_rparts; rp2++) {
+                if (r_parts[rp2].entries_hdr) scratch_free(r_parts[rp2].entries_hdr);
+                if (l_parts[rp2].entries_hdr) scratch_free(l_parts[rp2].entries_hdr);
+                if (pp_l_hdr[rp2]) scratch_free(pp_l_hdr[rp2]);
+                if (pp_r_hdr[rp2]) scratch_free(pp_r_hdr[rp2]);
+            }
+            scratch_free(r_parts_hdr); scratch_free(l_parts_hdr);
+            scratch_free(pp_meta_hdr); scratch_free(pcounts_hdr);
+            if (matched_right_hdr) { scratch_free(matched_right_hdr); matched_right_hdr = NULL; }
+            matched_right = NULL;
+            if (bp_cancelled) return ray_error("cancel", NULL);
+            goto chained_ht_fallback;
+        }
+
+        /* Free partition buffers — no longer needed */
+        for (uint32_t rp2 = 0; rp2 < n_rparts; rp2++) {
+            if (r_parts[rp2].entries_hdr) scratch_free(r_parts[rp2].entries_hdr);
+            if (l_parts[rp2].entries_hdr) scratch_free(l_parts[rp2].entries_hdr);
+        }
+        scratch_free(r_parts_hdr);
+        scratch_free(l_parts_hdr);
+
+        /* Compute total output size and consolidate per-partition buffers */
+        for (uint32_t rp2 = 0; rp2 < n_rparts; rp2++)
+            pair_count += part_counts[rp2];
+
+        /* FULL OUTER: count unmatched right rows */
+        int64_t unmatched_right = 0;
+        if (join_type == 2 && matched_right) {
+            for (int64_t r = 0; r < right_rows; r++)
+                if (!matched_right[r]) unmatched_right++;
+        }
+        int64_t total_out = pair_count + unmatched_right;
+
+        if (total_out > 0) {
+            l_idx = (int64_t*)scratch_alloc(&l_idx_hdr, (size_t)total_out * sizeof(int64_t));
+            r_idx = (int64_t*)scratch_alloc(&r_idx_hdr, (size_t)total_out * sizeof(int64_t));
+            if (!l_idx || !r_idx) {
+                scratch_free(l_idx_hdr); scratch_free(r_idx_hdr);
+                l_idx_hdr = NULL; r_idx_hdr = NULL;
+                for (uint32_t rp2 = 0; rp2 < n_rparts; rp2++) {
+                    if (pp_l_hdr[rp2]) scratch_free(pp_l_hdr[rp2]);
+                    if (pp_r_hdr[rp2]) scratch_free(pp_r_hdr[rp2]);
+                }
+                scratch_free(pp_meta_hdr);
+                scratch_free(pcounts_hdr);
+                if (matched_right_hdr) scratch_free(matched_right_hdr);
+                matched_right_hdr = NULL;
+                return ray_error("oom", NULL);
+            }
+
+            /* Copy per-partition results into contiguous arrays (int32→int64 widen) */
+            int64_t off = 0;
+            for (uint32_t rp2 = 0; rp2 < n_rparts; rp2++) {
+                int64_t cnt = part_counts[rp2];
+                if (cnt > 0 && pp_l[rp2] && pp_r[rp2]) {
+                    for (int64_t j = 0; j < cnt; j++) {
+                        l_idx[off + j] = (int64_t)pp_l[rp2][j];
+                        r_idx[off + j] = (int64_t)pp_r[rp2][j];
+                    }
+                    off += cnt;
+                }
+            }
+
+            /* FULL OUTER: append unmatched right rows */
+            if (unmatched_right > 0) {
+                for (int64_t r = 0; r < right_rows; r++) {
+                    if (!matched_right[r]) {
+                        l_idx[off] = -1;
+                        r_idx[off] = r;
+                        off++;
+                    }
+                }
+            }
+            pair_count = total_out;
+        }
+
+        /* Free per-partition buffers allocated by worker threads.
+         * Safe: ray_pool_dispatch_n has completed (workers are back on semaphore),
+         * ray_parallel_flag is 0, and ray_free handles cross-heap deallocation
+         * via the foreign-block list flushed by ray_heap_gc at ray_parallel_end. */
+        for (uint32_t rp2 = 0; rp2 < n_rparts; rp2++) {
+            if (pp_l_hdr[rp2]) scratch_free(pp_l_hdr[rp2]);
+            if (pp_r_hdr[rp2]) scratch_free(pp_r_hdr[rp2]);
+        }
+        scratch_free(pp_meta_hdr);
+        scratch_free(pcounts_hdr);
+        goto join_gather;
+    }
+
+chained_ht_fallback:;
+    /* ── Chained HT path (small joins / radix OOM fallback) ────────── */
+    uint64_t ht_cap64 = 256;
+    uint64_t target = (uint64_t)right_rows * 2;
+    while (ht_cap64 < target) ht_cap64 *= 2;
+    if (ht_cap64 > UINT32_MAX) ht_cap64 = (uint64_t)1 << 31;
+    uint32_t ht_cap = (uint32_t)ht_cap64;
+
+    uint32_t* ht_next = (uint32_t*)scratch_alloc(&ht_next_hdr, (size_t)right_rows * sizeof(uint32_t));
+    // cppcheck-suppress internalAstError
+    // Valid C11/C17 _Atomic(T)* declaration; cppcheck parser may mis-handle this syntax.
+    _Atomic(uint32_t)* ht_heads = (_Atomic(uint32_t)*)scratch_alloc(&ht_heads_hdr, ht_cap * sizeof(uint32_t));
+    if (!ht_next || !ht_heads) {
+        scratch_free(ht_next_hdr); scratch_free(ht_heads_hdr);
+        return ray_error("oom", NULL);
+    }
+    memset(ht_heads, 0xFF, ht_cap * sizeof(uint32_t));  /* JHT_EMPTY = 0xFFFFFFFF */
+
+    /* Phase 0.5: ASP-Join — extract semijoin filter from factorized left side.
+     * When the left input comes from a factorized expand (_count column present),
+     * build a RAY_SEL bitmap of left-side key values to skip right-side rows
+     * during hash-build whose keys can't match any left-side row. */
+    uint64_t* asp_bits = NULL;
+    int64_t asp_key_max = 0;
+    if (n_keys == 1 && join_type == 0 && l_key_vecs[0] &&
+        l_key_vecs[0]->type == RAY_I64 && right_rows > left_rows * 2) {
+        int64_t cnt_sym = ray_sym_intern("_count", 6);
+        ray_t* cnt_col = ray_table_get_col(left_table, cnt_sym);
+        if (cnt_col) {  /* left is factorized */
+            int64_t* lk = (int64_t*)ray_data(l_key_vecs[0]);
+            int64_t lk_max = 0;
+            for (int64_t i = 0; i < left_rows; i++)
+                if (lk[i] > lk_max) lk_max = lk[i];
+
+            if (lk_max < (int64_t)1 << 24) {
+                asp_sel = ray_sel_new(lk_max + 1);
+                if (asp_sel && !RAY_IS_ERR(asp_sel)) {
+                    asp_bits = ray_sel_bits(asp_sel);
+                    asp_key_max = lk_max;
+                    for (int64_t i = 0; i < left_rows; i++) {
+                        int64_t k = lk[i];
+                        if (k >= 0 && k <= lk_max)
+                            RAY_SEL_BIT_SET(asp_bits, k);
+                    }
+                }
+            }
+        }
+    }
+
+    {
+        join_build_ctx_t bctx = {
+            .ht_heads   = ht_heads,
+            .ht_next    = ht_next,
+            .ht_mask    = ht_cap - 1,
+            .r_key_vecs = r_key_vecs,
+            .n_keys     = n_keys,
+            .asp_bits   = asp_bits,
+            .asp_key_max = asp_key_max,
+        };
+        if (pool && right_rows > RAY_PARALLEL_THRESHOLD)
+            ray_pool_dispatch(pool, join_build_fn, &bctx, right_rows);
+        else
+            join_build_fn(&bctx, 0, 0, right_rows);
+    }
+    CHECK_CANCEL_GOTO(pool, join_cleanup);
+
+    /* Phase 1.5: S-Join semijoin filter extraction.
+     * Build a RAY_SEL bitmap of all distinct right-side key values that
+     * appear in the hash table. This can be used to skip left-side rows
+     * whose key cannot match any right-side row.
+     *
+     * Applied when: single I64 key, inner join, left side is large enough
+     * to benefit from filtering (> 2x right side). */
+    if (n_keys == 1 && join_type == 0 && l_key_vecs[0] && r_key_vecs[0] &&
+        l_key_vecs[0]->type == RAY_I64 && r_key_vecs[0]->type == RAY_I64 &&
+        left_rows > right_rows * 2) {
+        /* Determine key range to size the bitmap */
+        int64_t* rk = (int64_t*)ray_data(r_key_vecs[0]);
+        int64_t key_max = 0;
+        for (int64_t i = 0; i < right_rows; i++)
+            if (rk[i] > key_max) key_max = rk[i];
+
+        if (key_max < (int64_t)1 << 24) {  /* only for reasonably bounded keys */
+            sjoin_sel = ray_sel_new(key_max + 1);
+            if (sjoin_sel && !RAY_IS_ERR(sjoin_sel)) {
+                uint64_t* bits = ray_sel_bits(sjoin_sel);
+                for (int64_t i = 0; i < right_rows; i++) {
+                    int64_t k = rk[i];
+                    if (k >= 0 && k <= key_max)
+                        RAY_SEL_BIT_SET(bits, k);
+                }
+            }
+        }
+    }
+
+    /* Phase 2: Parallel probe (two-pass: count → prefix-sum → fill) */
+    uint32_t n_tasks = (uint32_t)((left_rows + JOIN_MORSEL - 1) / JOIN_MORSEL);
+    if (n_tasks == 0) n_tasks = 1;
+
+    int64_t* morsel_counts = (int64_t*)scratch_calloc(&counts_hdr,
+                              (size_t)(n_tasks + 1) * sizeof(int64_t));
+    if (!morsel_counts) {
+        scratch_free(ht_next_hdr); scratch_free(ht_heads_hdr);
+        return ray_error("oom", NULL);
+    }
+
+    /* For FULL OUTER JOIN, allocate matched_right tracker */
+    if (join_type == 2 && right_rows > 0) {
+        matched_right = (_Atomic(uint8_t)*)scratch_calloc(&matched_right_hdr,
+                                                           (size_t)right_rows);
+        if (!matched_right) goto join_cleanup;
+    }
+
+    /* Prepare S-Join fields for probe context */
+    uint64_t* sjoin_bits = NULL;
+    int64_t sjoin_key_max = 0;
+    if (sjoin_sel && !RAY_IS_ERR(sjoin_sel)) {
+        sjoin_bits = ray_sel_bits(sjoin_sel);
+        sjoin_key_max = sjoin_sel->len - 1;
+    }
+
+    join_probe_ctx_t probe_ctx = {
+        .ht_heads    = ht_heads,
+        .ht_next     = ht_next,
+        .ht_cap      = ht_cap,
+        .l_key_vecs  = l_key_vecs,
+        .r_key_vecs  = r_key_vecs,
+        .n_keys      = n_keys,
+        .join_type   = join_type,
+        .left_rows   = left_rows,
+        .morsel_counts = morsel_counts,
+        .matched_right = matched_right,
+        .sjoin_bits  = sjoin_bits,
+        .sjoin_key_max = sjoin_key_max,
+    };
+
+    /* 2a: Count matches per morsel */
+    if (pool && n_tasks > 1)
+        ray_pool_dispatch_n(pool, join_count_fn, &probe_ctx, n_tasks);
+    else
+        for (uint32_t t = 0; t < n_tasks; t++)
+            join_count_fn(&probe_ctx, 0, t, t + 1);
+
+    /* Prefix sum → morsel_offsets (reuse counts array as offsets) */
+    pair_count = 0;
+    for (uint32_t t = 0; t < n_tasks; t++) {
+        int64_t cnt = morsel_counts[t];
+        morsel_counts[t] = pair_count;
+        pair_count += cnt;
+    }
+
+    /* Allocate output pair arrays */
+    if (pair_count > 0) {
+        l_idx = (int64_t*)scratch_alloc(&l_idx_hdr, (size_t)pair_count * sizeof(int64_t));
+        r_idx = (int64_t*)scratch_alloc(&r_idx_hdr, (size_t)pair_count * sizeof(int64_t));
+        if (!l_idx || !r_idx) goto join_cleanup;
+    }
+
+    /* 2b: Fill match pairs */
+    probe_ctx.morsel_offsets = morsel_counts;  /* now holds prefix sums */
+    probe_ctx.l_idx = l_idx;
+    probe_ctx.r_idx = r_idx;
+
+    if (pair_count > 0) {
+        if (pool && n_tasks > 1)
+            ray_pool_dispatch_n(pool, join_fill_fn, &probe_ctx, n_tasks);
+        else
+            for (uint32_t t = 0; t < n_tasks; t++)
+                join_fill_fn(&probe_ctx, 0, t, t + 1);
+    }
+
+    CHECK_CANCEL_GOTO(pool, join_cleanup);
+
+    /* FULL OUTER: append unmatched right rows (l_idx=-1, r_idx=r) */
+    if (join_type == 2 && matched_right) {
+        int64_t unmatched_right = 0;
+        for (int64_t r = 0; r < right_rows; r++)
+            if (!matched_right[r]) unmatched_right++;
+
+        if (unmatched_right > 0) {
+            int64_t total = pair_count + unmatched_right;
+            ray_t* new_l_hdr;
+            ray_t* new_r_hdr;
+            int64_t* new_l = (int64_t*)scratch_alloc(&new_l_hdr,
+                                (size_t)total * sizeof(int64_t));
+            int64_t* new_r = (int64_t*)scratch_alloc(&new_r_hdr,
+                                (size_t)total * sizeof(int64_t));
+            if (!new_l || !new_r) {
+                scratch_free(new_l_hdr); scratch_free(new_r_hdr);
+                goto join_cleanup;
+            }
+            if (pair_count > 0) {
+                memcpy(new_l, l_idx, (size_t)pair_count * sizeof(int64_t));
+                memcpy(new_r, r_idx, (size_t)pair_count * sizeof(int64_t));
+            }
+            scratch_free(l_idx_hdr);
+            scratch_free(r_idx_hdr);
+            int64_t off = pair_count;
+            for (int64_t r = 0; r < right_rows; r++) {
+                if (!matched_right[r]) {
+                    new_l[off] = -1;
+                    new_r[off] = r;
+                    off++;
+                }
+            }
+            l_idx = new_l;  r_idx = new_r;
+            l_idx_hdr = new_l_hdr;  r_idx_hdr = new_r_hdr;
+            pair_count = total;
+        }
+    }
+
+join_gather:;
+    /* Phase 3: Build result table with parallel column gather.
+     * Use multi_gather for batched column access when possible (non-nullable
+     * indices), falling back to per-column gather for nullable RIGHT columns. */
+    int64_t left_ncols = ray_table_ncols(left_table);
+    int64_t right_ncols = ray_table_ncols(right_table);
+    result = ray_table_new(left_ncols + right_ncols);
+    if (!result || RAY_IS_ERR(result)) goto join_cleanup;
+
+    /* Allocate all output columns upfront for batched gather */
+    ray_t* l_out_cols[MGATHER_MAX_COLS];
+    int64_t l_out_names[MGATHER_MAX_COLS];
+    int64_t l_out_count = 0;
+    for (int64_t c = 0; c < left_ncols && l_out_count < MGATHER_MAX_COLS; c++) {
+        ray_t* col = ray_table_get_col_idx(left_table, c);
+        if (!col) continue;
+        ray_t* new_col = col_vec_new(col, pair_count);
+        if (!new_col || RAY_IS_ERR(new_col)) continue;
+        new_col->len = pair_count;
+        l_out_cols[l_out_count] = new_col;
+        l_out_names[l_out_count] = ray_table_col_name(left_table, c);
+        l_out_count++;
+    }
+
+    ray_t* r_out_cols[MGATHER_MAX_COLS];
+    ray_t* r_src_cols[MGATHER_MAX_COLS];
+    int64_t r_out_names[MGATHER_MAX_COLS];
+    int64_t r_out_count = 0;
+    for (int64_t c = 0; c < right_ncols; c++) {
+        ray_t* col = ray_table_get_col_idx(right_table, c);
+        int64_t name_id = ray_table_col_name(right_table, c);
+        if (!col) continue;
+        bool is_key = false;
+        for (uint8_t k = 0; k < n_keys; k++) {
+            ray_op_ext_t* rk = find_ext(g, ext->join.right_keys[k]->id);
+            if (rk && rk->base.opcode == OP_SCAN && rk->sym == name_id) {
+                is_key = true; break;
+            }
+        }
+        if (is_key) continue;
+        if (r_out_count >= MGATHER_MAX_COLS) continue;
+        ray_t* new_col = col_vec_new(col, pair_count);
+        if (!new_col || RAY_IS_ERR(new_col)) continue;
+        new_col->len = pair_count;
+        r_out_cols[r_out_count] = new_col;
+        r_src_cols[r_out_count] = col;
+        r_out_names[r_out_count] = name_id;
+        r_out_count++;
+    }
+
+    if (pair_count > 0) {
+        /* Left columns: multi_gather (non-nullable for INNER/LEFT) */
+        bool l_nullable = (join_type == 2);  /* only FULL OUTER */
+        if (!l_nullable && l_out_count > 1 && l_out_count <= MGATHER_MAX_COLS) {
+            multi_gather_ctx_t mgctx = { .idx = l_idx, .ncols = l_out_count };
+            int64_t si = 0;
+            for (int64_t c = 0; c < left_ncols && si < l_out_count; c++) {
+                ray_t* col = ray_table_get_col_idx(left_table, c);
+                if (!col) continue;
+                mgctx.srcs[si] = (char*)ray_data(col);
+                mgctx.dsts[si] = (char*)ray_data(l_out_cols[si]);
+                mgctx.esz[si] = col_esz(col);
+                si++;
+            }
+            if (pool && pair_count > RAY_PARALLEL_THRESHOLD)
+                ray_pool_dispatch(pool, multi_gather_fn, &mgctx, pair_count);
+            else
+                multi_gather_fn(&mgctx, 0, 0, pair_count);
+        } else {
+            /* Fall back to per-column gather for nullable or single column */
+            int64_t si = 0;
+            for (int64_t c = 0; c < left_ncols && si < l_out_count; c++) {
+                ray_t* col = ray_table_get_col_idx(left_table, c);
+                if (!col) continue;
+                gather_ctx_t gctx = {
+                    .idx = l_idx, .src_col = col, .dst_col = l_out_cols[si],
+                    .esz = col_esz(col), .nullable = l_nullable,
+                };
+                if (pool && pair_count > RAY_PARALLEL_THRESHOLD)
+                    ray_pool_dispatch(pool, gather_fn, &gctx, pair_count);
+                else
+                    gather_fn(&gctx, 0, 0, pair_count);
+                si++;
+            }
+        }
+
+        /* Right columns: per-column gather (nullable for LEFT/FULL OUTER) */
+        bool r_nullable = (join_type >= 1);
+        if (!r_nullable && r_out_count > 1 && r_out_count <= MGATHER_MAX_COLS) {
+            multi_gather_ctx_t mgctx = { .idx = r_idx, .ncols = r_out_count };
+            for (int64_t i = 0; i < r_out_count; i++) {
+                mgctx.srcs[i] = (char*)ray_data(r_src_cols[i]);
+                mgctx.dsts[i] = (char*)ray_data(r_out_cols[i]);
+                mgctx.esz[i] = col_esz(r_out_cols[i]);
+            }
+            if (pool && pair_count > RAY_PARALLEL_THRESHOLD)
+                ray_pool_dispatch(pool, multi_gather_fn, &mgctx, pair_count);
+            else
+                multi_gather_fn(&mgctx, 0, 0, pair_count);
+        } else {
+            for (int64_t i = 0; i < r_out_count; i++) {
+                gather_ctx_t gctx = {
+                    .idx = r_idx, .src_col = r_src_cols[i], .dst_col = r_out_cols[i],
+                    .esz = col_esz(r_src_cols[i]), .nullable = r_nullable,
+                };
+                if (pool && pair_count > RAY_PARALLEL_THRESHOLD)
+                    ray_pool_dispatch(pool, gather_fn, &gctx, pair_count);
+                else
+                    gather_fn(&gctx, 0, 0, pair_count);
+            }
+        }
+    }
+
+    /* Propagate RAY_STR string pools and null bitmaps from source columns */
+    {
+        int64_t si = 0;
+        for (int64_t c = 0; c < left_ncols && si < l_out_count; c++) {
+            ray_t* col = ray_table_get_col_idx(left_table, c);
+            if (!col) continue;
+            col_propagate_str_pool(l_out_cols[si], col);
+            col_propagate_nulls_gather(l_out_cols[si], col, l_idx, pair_count);
+            si++;
+        }
+    }
+    for (int64_t i = 0; i < r_out_count; i++) {
+        col_propagate_str_pool(r_out_cols[i], r_src_cols[i]);
+        col_propagate_nulls_gather(r_out_cols[i], r_src_cols[i], r_idx, pair_count);
+    }
+
+    /* Add columns to result */
+    for (int64_t i = 0; i < l_out_count; i++) {
+        result = ray_table_add_col(result, l_out_names[i], l_out_cols[i]);
+        ray_release(l_out_cols[i]);
+    }
+    for (int64_t i = 0; i < r_out_count; i++) {
+        result = ray_table_add_col(result, r_out_names[i], r_out_cols[i]);
+        ray_release(r_out_cols[i]);
+    }
+
+join_cleanup:
+    if (ht_next_hdr) scratch_free(ht_next_hdr);
+    if (ht_heads_hdr) scratch_free(ht_heads_hdr);
+    scratch_free(l_idx_hdr);
+    scratch_free(r_idx_hdr);
+    if (counts_hdr) scratch_free(counts_hdr);
+    scratch_free(matched_right_hdr);
+    if (sjoin_sel) ray_release(sjoin_sel);
+    if (asp_sel) ray_release(asp_sel);
+
+    return result;
+}
+
+/* ============================================================================
+ * OP_ANTIJOIN: anti-semi-join — keep left rows with NO matching right row
+ * Build hash set from right keys, probe left, emit non-matching left rows.
+ *
+ * g / op:        graph and operator node; the join keys are read from the
+ *                operator's ext node (ext->join).
+ * left_table /
+ * right_table:   evaluated inputs; a NULL or error input is returned as-is.
+ *
+ * Returns a new table holding only the left columns, restricted to the left
+ * rows whose key tuple matches no right row.  When either input has zero
+ * rows the left table itself is retained and returned.  On failure returns
+ * ray_error: "nyi" (no ext node, more than INT32_MAX rows, more keys than
+ * the fixed key arrays hold, RAY_STR keys), "oom", or "cancel".
+ * ============================================================================ */
+
+ray_t* exec_antijoin(ray_graph_t* g, ray_op_t* op,
+                            ray_t* left_table, ray_t* right_table) {
+    if (!left_table || RAY_IS_ERR(left_table)) return left_table;
+    if (!right_table || RAY_IS_ERR(right_table)) return right_table;
+
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    int64_t left_rows  = ray_table_nrows(left_table);
+    int64_t right_rows = ray_table_nrows(right_table);
+
+    /* Right row ids travel through the uint32_t hash chains built below;
+     * inputs beyond INT32_MAX rows are rejected. */
+    if (right_rows > (int64_t)INT32_MAX || left_rows > (int64_t)INT32_MAX)
+        return ray_error("nyi", NULL);
+
+    uint8_t n_keys = ext->join.n_join_keys;
+
+    /* Trivial cases: empty right → all left rows pass; empty left → empty
+     * result.  Either way the (retained) left table is the answer. */
+    if (right_rows == 0 || left_rows == 0) {
+        ray_retain(left_table);
+        return left_table;
+    }
+
+    ray_t* l_key_vecs[16];
+    ray_t* r_key_vecs[16];
+    /* More keys than the fixed arrays hold would overrun the stack */
+    if (n_keys > sizeof(l_key_vecs) / sizeof(l_key_vecs[0]))
+        return ray_error("nyi", NULL);
+    memset(l_key_vecs, 0, n_keys * sizeof(ray_t*));
+    memset(r_key_vecs, 0, n_keys * sizeof(ray_t*));
+
+    /* Resolve key vectors: an OP_SCAN key names a table column; a right-side
+     * OP_CONST key uses its literal directly. */
+    for (uint8_t k = 0; k < n_keys; k++) {
+        ray_op_ext_t* lk = find_ext(g, ext->join.left_keys[k]->id);
+        ray_op_ext_t* rk = find_ext(g, ext->join.right_keys[k]->id);
+        if (lk && lk->base.opcode == OP_SCAN)
+            l_key_vecs[k] = ray_table_get_col(left_table, lk->sym);
+        if (rk && rk->base.opcode == OP_SCAN)
+            r_key_vecs[k] = ray_table_get_col(right_table, rk->sym);
+        if (rk && rk->base.opcode == OP_CONST && rk->literal)
+            r_key_vecs[k] = rk->literal;
+    }
+
+    /* RAY_STR keys not yet supported */
+    for (uint8_t k = 0; k < n_keys; k++) {
+        if ((l_key_vecs[k] && l_key_vecs[k]->type == RAY_STR) ||
+            (r_key_vecs[k] && r_key_vecs[k]->type == RAY_STR))
+            return ray_error("nyi", NULL);
+    }
+
+    /* Build chained hash table from right side */
+    ray_t* ht_next_hdr = NULL;
+    ray_t* ht_heads_hdr = NULL;
+
+    /* Capacity: smallest power of two >= max(256, 2 * right_rows), clamped
+     * so it fits uint32_t while staying a power of two (ht_mask = cap - 1). */
+    uint64_t ht_cap64 = 256;
+    uint64_t target = (uint64_t)right_rows * 2;
+    while (ht_cap64 < target) ht_cap64 *= 2;
+    if (ht_cap64 > UINT32_MAX) ht_cap64 = (uint64_t)1 << 31;
+    uint32_t ht_cap = (uint32_t)ht_cap64;
+
+    uint32_t* ht_next = (uint32_t*)scratch_alloc(&ht_next_hdr,
+                            (size_t)right_rows * sizeof(uint32_t));
+    _Atomic(uint32_t)* ht_heads = (_Atomic(uint32_t)*)scratch_alloc(&ht_heads_hdr,
+                            ht_cap * sizeof(uint32_t));
+    if (!ht_next || !ht_heads) {
+        if (ht_next_hdr) scratch_free(ht_next_hdr);
+        if (ht_heads_hdr) scratch_free(ht_heads_hdr);
+        return ray_error("oom", NULL);
+    }
+    memset(ht_heads, 0xFF, ht_cap * sizeof(uint32_t));  /* JHT_EMPTY */
+
+    /* Build: insert right rows into HT */
+    ray_pool_t* pool = ray_pool_get();
+    {
+        join_build_ctx_t bctx = {
+            .ht_heads   = ht_heads,
+            .ht_next    = ht_next,
+            .ht_mask    = ht_cap - 1,
+            .r_key_vecs = r_key_vecs,
+            .n_keys     = n_keys,
+            .asp_bits   = NULL,
+            .asp_key_max = 0,
+        };
+        if (pool && right_rows > RAY_PARALLEL_THRESHOLD)
+            ray_pool_dispatch(pool, join_build_fn, &bctx, right_rows);
+        else
+            join_build_fn(&bctx, 0, 0, right_rows);
+    }
+
+    if (pool_cancelled(pool)) {
+        scratch_free(ht_next_hdr);
+        scratch_free(ht_heads_hdr);
+        return ray_error("cancel", NULL);
+    }
+
+    /* Probe: scan left rows, collect indices of those with NO match.
+     * out_idx is sized for the worst case (no left row matches). */
+    ray_t* out_idx_hdr = NULL;
+    int64_t* out_idx = (int64_t*)scratch_alloc(&out_idx_hdr,
+                            (size_t)left_rows * sizeof(int64_t));
+    if (!out_idx) {
+        scratch_free(ht_next_hdr);
+        scratch_free(ht_heads_hdr);
+        return ray_error("oom", NULL);
+    }
+
+    uint32_t ht_mask = ht_cap - 1;
+    int64_t out_count = 0;
+    for (int64_t l = 0; l < left_rows; l++) {
+        uint64_t h = hash_row_keys(l_key_vecs, n_keys, l);
+        uint32_t slot = (uint32_t)(h & ht_mask);
+        bool matched = false;
+        /* Walk the bucket chain; slot hits still need a full key compare */
+        for (uint32_t r = ht_heads[slot]; r != JHT_EMPTY; r = ht_next[r]) {
+            if (join_keys_eq(l_key_vecs, r_key_vecs, n_keys, l, (int64_t)r)) {
+                matched = true;
+                break;  /* anti-join: one match is enough to exclude */
+            }
+        }
+        if (!matched) {
+            out_idx[out_count++] = l;
+        }
+    }
+
+    /* The hash table is only needed for probing */
+    scratch_free(ht_next_hdr);
+    scratch_free(ht_heads_hdr);
+
+    /* Gather: build result table with only left columns */
+    int64_t left_ncols = ray_table_ncols(left_table);
+    ray_t* result = ray_table_new(left_ncols);
+    if (!result || RAY_IS_ERR(result)) {
+        scratch_free(out_idx_hdr);
+        return result;
+    }
+
+    /* Output columns are created even when no row survives, so an empty
+     * result keeps the left schema — the hash-join gather phase earlier in
+     * this file likewise allocates its columns for a zero pair count. */
+    for (int64_t c = 0; c < left_ncols; c++) {
+        ray_t* col = ray_table_get_col_idx(left_table, c);
+        if (!col) continue;
+        ray_t* new_col = col_vec_new(col, out_count);
+        /* NOTE(review): a failed allocation drops the column silently,
+         * same as the hash-join gather phase — consider surfacing "oom". */
+        if (!new_col || RAY_IS_ERR(new_col)) continue;
+        new_col->len = out_count;
+
+        if (out_count > 0) {
+            gather_ctx_t gctx = {
+                .idx = out_idx, .src_col = col, .dst_col = new_col,
+                .esz = col_esz(col), .nullable = false,
+            };
+            if (pool && out_count > RAY_PARALLEL_THRESHOLD)
+                ray_pool_dispatch(pool, gather_fn, &gctx, out_count);
+            else
+                gather_fn(&gctx, 0, 0, out_count);
+        }
+
+        /* Propagate the RAY_STR string pool and the null bitmap from the
+         * source column, as the hash-join gather phase does; without the
+         * bitmap, nulls in surviving left rows would be lost. */
+        col_propagate_str_pool(new_col, col);
+        col_propagate_nulls_gather(new_col, col, out_idx, out_count);
+
+        int64_t name_id = ray_table_col_name(left_table, c);
+        result = ray_table_add_col(result, name_id, new_col);
+        ray_release(new_col);
+    }
+
+    scratch_free(out_idx_hdr);
+    return result;
+}
+
+/* ============================================================================
+ * OP_WINDOW_JOIN: ASOF join (sort-merge)
+ * For each left row, find the most recent right row where right.time <= left.time,
+ * optionally partitioned by equality keys. O(N+M) after sorting.
+ * ============================================================================ */
+
+ray_t* exec_window_join(ray_graph_t* g, ray_op_t* op,
+                               ray_t* left_table, ray_t* right_table) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    uint8_t n_eq      = ext->asof.n_eq_keys;
+    uint8_t join_type = ext->asof.join_type;
+
+    int64_t left_n  = ray_table_nrows(left_table);
+    int64_t right_n = ray_table_nrows(right_table);
+
+    /* Resolve time key */
+    ray_op_ext_t* time_ext = find_ext(g, ext->asof.time_key->id);
+    if (!time_ext || time_ext->base.opcode != OP_SCAN)
+        return ray_error("nyi", NULL);
+    int64_t time_sym = time_ext->sym;
+
+    /* Resolve equality keys */
+    int64_t eq_syms[256];
+    for (uint8_t k = 0; k < n_eq; k++) {
+        ray_op_ext_t* ek = find_ext(g, ext->asof.eq_keys[k]->id);
+        if (!ek || ek->base.opcode != OP_SCAN)
+            return ray_error("nyi", NULL);
+        eq_syms[k] = ek->sym;
+    }
+
+    /* Get time vectors — use int64 representation for comparison.
+     * TIME uses 4-byte i32 (ms), TIMESTAMP uses 8-byte i64 (ns).
+     * We expand to a temporary i64 array for uniform comparison. */
+    ray_t* lt_time_vec = ray_table_get_col(left_table, time_sym);
+    ray_t* rt_time_vec = ray_table_get_col(right_table, time_sym);
+    if (!lt_time_vec || !rt_time_vec) return ray_error("schema", NULL);
+    int8_t time_type = lt_time_vec->type;
+
+    /* Helper macro to read time value as int64_t regardless of storage type */
+    #define READ_TIME(vec, idx) \
+        ((time_type == RAY_TIME || time_type == RAY_DATE) \
+            ? (int64_t)((int32_t*)ray_data(vec))[(idx)] \
+            : ((int64_t*)ray_data(vec))[(idx)])
+
+    /* Build i64 time arrays for efficient comparison */
+    ray_t* lt_time_hdr = NULL, *rt_time_hdr = NULL;
+    int64_t* lt_time = (int64_t*)scratch_alloc(&lt_time_hdr, (size_t)left_n * sizeof(int64_t));
+    int64_t* rt_time = (int64_t*)scratch_alloc(&rt_time_hdr, (size_t)right_n * sizeof(int64_t));
+    if ((!lt_time && left_n > 0) || (!rt_time && right_n > 0)) {
+        if (lt_time_hdr) scratch_free(lt_time_hdr);
+        if (rt_time_hdr) scratch_free(rt_time_hdr);
+        return ray_error("oom", NULL);
+    }
+    for (int64_t i = 0; i < left_n; i++) lt_time[i] = READ_TIME(lt_time_vec, i);
+    for (int64_t i = 0; i < right_n; i++) rt_time[i] = READ_TIME(rt_time_vec, i);
+    #undef READ_TIME
+
+    /* Get eq key vectors — stored as ray_t* for type-safe access */
+    ray_t* lt_eq[256], *rt_eq[256];
+    for (uint8_t k = 0; k < n_eq; k++) {
+        ray_t* lv = ray_table_get_col(left_table, eq_syms[k]);
+        ray_t* rv = ray_table_get_col(right_table, eq_syms[k]);
+        if (!lv || !rv) {
+            if (lt_time_hdr) scratch_free(lt_time_hdr);
+            if (rt_time_hdr) scratch_free(rt_time_hdr);
+            return ray_error("schema", NULL);
+        }
+        lt_eq[k] = lv;
+        rt_eq[k] = rv;
+    }
+
+    /* Precompute per-row "any key is null" bitsets.  Null-keyed rows must
+     * not match — left rows fall through to the left-outer null fill,
+     * right rows are skipped entirely during the merge walk.  SQL-style
+     * NULLs-never-match semantics. */
+    ray_t* lt_null_hdr = NULL, *rt_null_hdr = NULL;
+    uint8_t* lt_null = left_n > 0
+        ? (uint8_t*)scratch_alloc(&lt_null_hdr, (size_t)left_n)
+        : NULL;
+    uint8_t* rt_null = right_n > 0
+        ? (uint8_t*)scratch_alloc(&rt_null_hdr, (size_t)right_n)
+        : NULL;
+    if ((!lt_null && left_n > 0) || (!rt_null && right_n > 0)) {
+        if (lt_null_hdr) scratch_free(lt_null_hdr);
+        if (rt_null_hdr) scratch_free(rt_null_hdr);
+        if (lt_time_hdr) scratch_free(lt_time_hdr);
+        if (rt_time_hdr) scratch_free(rt_time_hdr);
+        return ray_error("oom", NULL);
+    }
+    if (left_n > 0) memset(lt_null, 0, (size_t)left_n);
+    if (right_n > 0) memset(rt_null, 0, (size_t)right_n);
+    if (lt_time_vec->attrs & RAY_ATTR_HAS_NULLS)
+        for (int64_t i = 0; i < left_n; i++)
+            if (ray_vec_is_null(lt_time_vec, i)) lt_null[i] = 1;
+    if (rt_time_vec->attrs & RAY_ATTR_HAS_NULLS)
+        for (int64_t i = 0; i < right_n; i++)
+            if (ray_vec_is_null(rt_time_vec, i)) rt_null[i] = 1;
+    for (uint8_t k = 0; k < n_eq; k++) {
+        if (lt_eq[k]->attrs & RAY_ATTR_HAS_NULLS)
+            for (int64_t i = 0; i < left_n; i++)
+                if (ray_vec_is_null(lt_eq[k], i)) lt_null[i] = 1;
+        if (rt_eq[k]->attrs & RAY_ATTR_HAS_NULLS)
+            for (int64_t i = 0; i < right_n; i++)
+                if (ray_vec_is_null(rt_eq[k], i)) rt_null[i] = 1;
+    }
+
+    /* Sort both tables by (eq_keys, time_key) using index arrays.  Rows
+     * with any null key sort LAST (NULLS LAST) so the merge walk reaches
+     * them once all real candidates are consumed and can skip them
+     * cheaply. */
+    ray_t* li_hdr = NULL, *ri_hdr = NULL;
+    int64_t* li_idx = (int64_t*)scratch_alloc(&li_hdr, (size_t)left_n * sizeof(int64_t));
+    int64_t* ri_idx = (int64_t*)scratch_alloc(&ri_hdr, (size_t)right_n * sizeof(int64_t));
+    if ((!li_idx && left_n > 0) || (!ri_idx && right_n > 0)) {
+        if (li_hdr) scratch_free(li_hdr);
+        if (ri_hdr) scratch_free(ri_hdr);
+        if (lt_null_hdr) scratch_free(lt_null_hdr);
+        if (rt_null_hdr) scratch_free(rt_null_hdr);
+        if (lt_time_hdr) scratch_free(lt_time_hdr);
+        if (rt_time_hdr) scratch_free(rt_time_hdr);
+        return ray_error("oom", NULL);
+    }
+    for (int64_t i = 0; i < left_n; i++) li_idx[i] = i;
+    for (int64_t i = 0; i < right_n; i++) ri_idx[i] = i;
+
+    /* Bottom-up mergesort on index arrays — O(N log N) */
+    {
+        int64_t max_n = left_n > right_n ? left_n : right_n;
+        ray_t* tmp_hdr = NULL;
+        int64_t* tmp = max_n > 0
+            ? (int64_t*)scratch_alloc(&tmp_hdr, (size_t)max_n * sizeof(int64_t))
+            : NULL;
+        if (!tmp && max_n > 0) {
+            scratch_free(li_hdr); scratch_free(ri_hdr);
+            if (lt_null_hdr) scratch_free(lt_null_hdr);
+            if (rt_null_hdr) scratch_free(rt_null_hdr);
+            if (lt_time_hdr) scratch_free(lt_time_hdr);
+            if (rt_time_hdr) scratch_free(rt_time_hdr);
+            return ray_error("oom", NULL);
+        }
+
+        /* Sort left indices by (nulls-last, eq_keys, time) */
+        for (int64_t width = 1; width < left_n; width *= 2) {
+            for (int64_t lo = 0; lo < left_n; lo += 2 * width) {
+                int64_t mid = lo + width;
+                int64_t hi = lo + 2 * width;
+                if (mid > left_n) mid = left_n;
+                if (hi > left_n) hi = left_n;
+                int64_t a = lo, b = mid, t = lo;
+                while (a < mid && b < hi) {
+                    int64_t ai = li_idx[a], bi = li_idx[b];
+                    int cmp = 0;
+                    if (lt_null[ai] != lt_null[bi])
+                        cmp = lt_null[ai] - lt_null[bi]; /* 1 > 0 → nulls last */
+                    for (uint8_t k2 = 0; k2 < n_eq && cmp == 0; k2++) {
+                        int64_t va = read_col_i64(ray_data(lt_eq[k2]), ai, lt_eq[k2]->type, lt_eq[k2]->attrs);
+                        int64_t vb = read_col_i64(ray_data(lt_eq[k2]), bi, lt_eq[k2]->type, lt_eq[k2]->attrs);
+                        if (va < vb) cmp = -1;
+                        else if (va > vb) cmp = 1;
+                    }
+                    if (cmp == 0) {
+                        if (lt_time[ai] < lt_time[bi]) cmp = -1;
+                        else if (lt_time[ai] > lt_time[bi]) cmp = 1;
+                    }
+                    tmp[t++] = (cmp <= 0) ? li_idx[a++] : li_idx[b++];
+                }
+                while (a < mid) tmp[t++] = li_idx[a++];
+                while (b < hi) tmp[t++] = li_idx[b++];
+                for (int64_t c = lo; c < hi; c++) li_idx[c] = tmp[c];
+            }
+        }
+
+        /* Sort right indices by (nulls-last, eq_keys, time) */
+        for (int64_t width = 1; width < right_n; width *= 2) {
+            for (int64_t lo = 0; lo < right_n; lo += 2 * width) {
+                int64_t mid = lo + width;
+                int64_t hi = lo + 2 * width;
+                if (mid > right_n) mid = right_n;
+                if (hi > right_n) hi = right_n;
+                int64_t a = lo, b = mid, t = lo;
+                while (a < mid && b < hi) {
+                    int64_t ai = ri_idx[a], bi = ri_idx[b];
+                    int cmp = 0;
+                    if (rt_null[ai] != rt_null[bi])
+                        cmp = rt_null[ai] - rt_null[bi];
+                    for (uint8_t k2 = 0; k2 < n_eq && cmp == 0; k2++) {
+                        int64_t va = read_col_i64(ray_data(rt_eq[k2]), ai, rt_eq[k2]->type, rt_eq[k2]->attrs);
+                        int64_t vb = read_col_i64(ray_data(rt_eq[k2]), bi, rt_eq[k2]->type, rt_eq[k2]->attrs);
+                        if (va < vb) cmp = -1;
+                        else if (va > vb) cmp = 1;
+                    }
+                    if (cmp == 0) {
+                        if (rt_time[ai] < rt_time[bi]) cmp = -1;
+                        else if (rt_time[ai] > rt_time[bi]) cmp = 1;
+                    }
+                    tmp[t++] = (cmp <= 0) ? ri_idx[a++] : ri_idx[b++];
+                }
+                while (a < mid) tmp[t++] = ri_idx[a++];
+                while (b < hi) tmp[t++] = ri_idx[b++];
+                for (int64_t c = lo; c < hi; c++) ri_idx[c] = tmp[c];
+            }
+        }
+
+        if (tmp_hdr) scratch_free(tmp_hdr);
+    }
+
+    /* Build match array: for each left row (sorted), find best right match */
+    ray_t* match_hdr = NULL;
+    int64_t* match = (int64_t*)scratch_alloc(&match_hdr, (size_t)left_n * sizeof(int64_t));
+    if (!match && left_n > 0) {
+        scratch_free(li_hdr); scratch_free(ri_hdr);
+        if (lt_null_hdr) scratch_free(lt_null_hdr);
+        if (rt_null_hdr) scratch_free(rt_null_hdr);
+        if (lt_time_hdr) scratch_free(lt_time_hdr);
+        if (rt_time_hdr) scratch_free(rt_time_hdr);
+        return ray_error("oom", NULL);
+    }
+
+    /* Two-pointer merge with best-match carry-forward.  Because the sort
+     * pins null-keyed rows to the end, skipping them is just an early
+     * "no match" for left and a plain `rp++` for right. */
+    int64_t rp = 0;        /* right pointer (only advances) */
+    int64_t best_ri = -1;  /* best right match in current partition */
+    /* Track the previous *non-null* left row for partition-change detection
+     * so a null-keyed left row doesn't force an incorrect partition reset
+     * (and so its own null keys aren't read through read_col_i64). */
+    int64_t prev_non_null_li = -1;
+    for (int64_t lp = 0; lp < left_n; lp++) {
+        int64_t li = li_idx[lp];
+
+        if (lt_null[li]) {
+            /* Null-keyed left row cannot match; in left-outer mode it
+             * still appears in the result with all right cols null. */
+            match[lp] = -1;
+            continue;
+        }
+
+        /* Detect partition change — reset best match and rewind rp */
+        if (prev_non_null_li >= 0) {
+            int changed = 0;
+            for (uint8_t k = 0; k < n_eq; k++) {
+                int64_t cv = read_col_i64(ray_data(lt_eq[k]), li, lt_eq[k]->type, lt_eq[k]->attrs);
+                int64_t pv = read_col_i64(ray_data(lt_eq[k]), prev_non_null_li, lt_eq[k]->type, lt_eq[k]->attrs);
+                if (cv != pv) { changed = 1; break; }
+            }
+            if (changed) {
+                best_ri = -1;
+                /* Rewind rp to find start of new partition in right table */
+                while (rp > 0) {
+                    int64_t ri_prev = ri_idx[rp - 1];
+                    if (rt_null[ri_prev]) break;
+                    int eq_match = 1;
+                    for (uint8_t k = 0; k < n_eq; k++) {
+                        int64_t rv = read_col_i64(ray_data(rt_eq[k]), ri_prev, rt_eq[k]->type, rt_eq[k]->attrs);
+                        int64_t lv = read_col_i64(ray_data(lt_eq[k]), li, lt_eq[k]->type, lt_eq[k]->attrs);
+                        if (rv < lv) { eq_match = 0; break; }
+                    }
+                    if (!eq_match) break;
+                    rp--;
+                }
+            }
+        }
+
+        /* Advance right pointer, accumulating best match */
+        while (rp < right_n) {
+            int64_t ri = ri_idx[rp];
+            if (rt_null[ri]) { rp++; continue; }  /* null keys never match */
+            int eq_cmp = 0;
+            for (uint8_t k = 0; k < n_eq && eq_cmp == 0; k++) {
+                int64_t rv = read_col_i64(ray_data(rt_eq[k]), ri, rt_eq[k]->type, rt_eq[k]->attrs);
+                int64_t lv = read_col_i64(ray_data(lt_eq[k]), li, lt_eq[k]->type, lt_eq[k]->attrs);
+                if (rv < lv) eq_cmp = -1;
+                else if (rv > lv) eq_cmp = 1;
+            }
+            if (eq_cmp > 0) break;  /* right partition past left */
+            if (eq_cmp == 0) {
+                if (rt_time[ri] <= lt_time[li])
+                    best_ri = ri;  /* valid candidate */
+                else
+                    break;  /* right time past left time */
+            }
+            rp++;
+        }
+        match[lp] = best_ri;
+        prev_non_null_li = li;
+    }
+
+    /* Remap match[] from sorted order to original left-row order.
+     * match[lp] gives the best right row for sorted left position lp.
+     * We need match_orig[li] = best right row for original left row li. */
+    ray_t* mo_hdr = NULL;
+    int64_t* match_orig = (int64_t*)scratch_alloc(&mo_hdr, (size_t)left_n * sizeof(int64_t));
+    if (!match_orig && left_n > 0) {
+        scratch_free(match_hdr); scratch_free(li_hdr); scratch_free(ri_hdr);
+        return ray_error("oom", NULL);
+    }
+    for (int64_t lp = 0; lp < left_n; lp++)
+        match_orig[li_idx[lp]] = match[lp];
+
+    /* Count output rows */
+    int64_t out_n = 0;
+    if (join_type == 1) {
+        out_n = left_n;  /* left outer: all left rows */
+    } else {
+        for (int64_t i = 0; i < left_n; i++)
+            if (match_orig[i] >= 0) out_n++;
+    }
+
+    /* Build output table */
+    int64_t left_ncols  = ray_table_ncols(left_table);
+    int64_t right_ncols = ray_table_ncols(right_table);
+
+    /* Collect right column indices, excluding duplicate key columns */
+    int64_t right_out_idx[256];
+    int64_t right_out_count = 0;
+    for (int64_t c = 0; c < right_ncols; c++) {
+        int64_t rname = ray_table_col_name(right_table, c);
+        int skip = 0;
+        if (rname == time_sym) skip = 1;
+        for (uint8_t k = 0; k < n_eq && !skip; k++)
+            if (rname == eq_syms[k]) skip = 1;
+        if (!skip) right_out_idx[right_out_count++] = c;
+    }
+
+    ray_t* out = ray_table_new(left_ncols + right_out_count);
+
+    /* Build index arrays for gather so col_propagate_nulls_gather can
+     * copy the null bitmap correctly (null bit in source → null bit in
+     * output, plus explicit null for match_orig == -1 on the right side). */
+    ray_t* lidx_hdr = NULL, *ridx_hdr = NULL;
+    int64_t* lidx = out_n > 0
+        ? (int64_t*)scratch_alloc(&lidx_hdr, (size_t)out_n * sizeof(int64_t))
+        : NULL;
+    int64_t* ridx = out_n > 0
+        ? (int64_t*)scratch_alloc(&ridx_hdr, (size_t)out_n * sizeof(int64_t))
+        : NULL;
+    if (out_n > 0 && (!lidx || !ridx)) {
+        if (lidx_hdr) scratch_free(lidx_hdr);
+        if (ridx_hdr) scratch_free(ridx_hdr);
+        scratch_free(mo_hdr);
+        scratch_free(match_hdr);
+        scratch_free(li_hdr);
+        scratch_free(ri_hdr);
+        if (lt_null_hdr) scratch_free(lt_null_hdr);
+        if (rt_null_hdr) scratch_free(rt_null_hdr);
+        if (lt_time_hdr) scratch_free(lt_time_hdr);
+        if (rt_time_hdr) scratch_free(rt_time_hdr);
+        return ray_error("oom", NULL);
+    }
+    {
+        int64_t wi = 0;
+        for (int64_t li = 0; li < left_n; li++) {
+            if (join_type == 0 && match_orig[li] < 0) continue;
+            lidx[wi] = li;
+            ridx[wi] = match_orig[li];
+            wi++;
+        }
+    }
+
+    /* Gather left columns — iterate in original row order, preserve nulls */
+    for (int64_t c = 0; c < left_ncols; c++) {
+        int64_t col_name = ray_table_col_name(left_table, c);
+        ray_t* src_col = ray_table_get_col_idx(left_table, c);
+        int8_t ctype = src_col->type;
+        ray_t* dst_col = ray_vec_new(ctype, out_n);
+
+        uint8_t esz = ray_type_sizes[ctype];
+        char* src = (char*)ray_data(src_col);
+        char* dst = (char*)ray_data(dst_col);
+        for (int64_t wi = 0; wi < out_n; wi++)
+            memcpy(dst + wi * esz, src + lidx[wi] * esz, esz);
+        dst_col->len = out_n;
+        col_propagate_str_pool(dst_col, src_col);
+        col_propagate_nulls_gather(dst_col, src_col, lidx, out_n);
+        out = ray_table_add_col(out, col_name, dst_col);
+        ray_release(dst_col);
+    }
+
+    /* Gather right columns (excluding key duplicates) — original left-row order.
+     * For unmatched rows (ridx[wi] == -1) we memset 0 for the value and
+     * rely on col_propagate_nulls_gather to set the null bit; the zero
+     * bytes keep the vector well-formed when consumers ignore the null
+     * bit. */
+    for (int64_t rc = 0; rc < right_out_count; rc++) {
+        int64_t cidx = right_out_idx[rc];
+        int64_t col_name = ray_table_col_name(right_table, cidx);
+        ray_t* src_col = ray_table_get_col_idx(right_table, cidx);
+        int8_t ctype = src_col->type;
+        ray_t* dst_col = ray_vec_new(ctype, out_n);
+
+        uint8_t esz = ray_type_sizes[ctype];
+        char* src = (char*)ray_data(src_col);
+        char* dst = (char*)ray_data(dst_col);
+        for (int64_t wi = 0; wi < out_n; wi++) {
+            int64_t ri = ridx[wi];
+            if (ri >= 0) memcpy(dst + wi * esz, src + ri * esz, esz);
+            else         memset(dst + wi * esz, 0, esz);
+        }
+        dst_col->len = out_n;
+        col_propagate_str_pool(dst_col, src_col);
+        col_propagate_nulls_gather(dst_col, src_col, ridx, out_n);
+        out = ray_table_add_col(out, col_name, dst_col);
+        ray_release(dst_col);
+    }
+
+    if (lidx_hdr) scratch_free(lidx_hdr);
+    if (ridx_hdr) scratch_free(ridx_hdr);
+    scratch_free(mo_hdr);
+    scratch_free(match_hdr);
+    scratch_free(li_hdr);
+    scratch_free(ri_hdr);
+    if (lt_null_hdr) scratch_free(lt_null_hdr);
+    if (rt_null_hdr) scratch_free(rt_null_hdr);
+    if (lt_time_hdr) scratch_free(lt_time_hdr);
+    if (rt_time_hdr) scratch_free(rt_time_hdr);
+    return out;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/journal.c b/crates/rayforce-sys/vendor/rayforce/src/ops/journal.c
new file mode 100644
index 0000000..fa18294
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/journal.c
@@ -0,0 +1,191 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "journal.h"
+#include "store/journal.h"
+#include "store/serde.h"
+#include "core/ipc.h"
+#include "mem/sys.h"
+
+#include <string.h>
+
+/* Materialise a Rayfall string atom as a NUL-terminated C string in the
+ * caller's buffer.  Yields NULL when `s` is not a string atom or is too long. */
+static const char* str_to_cpath(ray_t* s, char* buf, size_t bufsz) {
+    if (s == NULL || s->type != -RAY_STR) return NULL;
+    const char* bytes = ray_str_ptr(s);
+    size_t      len   = ray_str_len(s);
+    if (len + 1 > bufsz) return NULL;   /* payload plus terminator must fit */
+    memcpy(buf, bytes, len);
+    buf[len] = '\0';
+    return buf;
+}
+
+/* Map a ray_err_t into a Rayfall value so callers can `try` it: RAY_OK -> null object, anything else -> error object. */
+static ray_t* err_to_ray(ray_err_t e, const char* fallback) {
+    if (e == RAY_OK) return RAY_NULL_OBJ;   /* success carries no payload */
+    const char* code = ray_err_code_str(e);
+    return ray_error(code ? code : (fallback ? fallback : "io"), NULL);   /* code name, else `fallback`, else "io" */
+}
+
+/* (.log.open args) — open the journal rooted at "base".  `args` must be the
+ * 2-tuple (`async; "base") or (`sync; "base"): taking the mode as a sym
+ * keyword keeps calls self-documenting without a second function or magic int. */
+ray_t* ray_log_open_fn(ray_t** args, int64_t n) {
+    if (n != 2)
+        return ray_error("rank", ".log.open expects (`async|`sync; \"base\")");
+    ray_t* mode_arg = args[0];
+    ray_t* base_arg = args[1];
+    if (mode_arg == NULL || mode_arg->type != -RAY_SYM)
+        return ray_error("type", ".log.open mode must be `async or `sync");
+    if (base_arg == NULL || base_arg->type != -RAY_STR)
+        return ray_error("type", ".log.open base must be a string");
+
+    /* Keywords are matched by interned symbol id, not by text. */
+    const int64_t id_async = ray_sym_intern("async", 5);
+    const int64_t id_sync  = ray_sym_intern("sync",  4);
+    const int64_t wanted   = mode_arg->i64;
+    if (wanted != id_async && wanted != id_sync)
+        return ray_error("domain", ".log.open mode must be `async or `sync");
+    const ray_journal_mode_t mode = (wanted == id_async) ? RAY_JOURNAL_ASYNC : RAY_JOURNAL_SYNC;
+
+    char base[1024];
+    if (str_to_cpath(base_arg, base, sizeof(base)) == NULL)
+        return ray_error("type", ".log.open base path too long or not a string");
+    return err_to_ray(ray_journal_open(base, mode), "io");
+}
+
+/* (.log.write expr) — serialize `expr` and append it to the journal as a
+ * synthetic entry, so REPL-driven mutations can sit in the log next to
+ * the IPC stream.
+ *
+ * A closed journal is an ERROR, never a silent no-op: quietly returning
+ * would tell the user their change was made durable when nothing was
+ * persisted. */
+ray_t* ray_log_write_fn(ray_t* expr) {
+    if (!ray_journal_is_open())
+        return ray_error("noopen", ".log.write: no journal open (start with -l/-L)");
+    if (expr == NULL) return ray_error("type", ".log.write expects an argument");
+
+    const int64_t nbytes = ray_serde_size(expr);
+    if (nbytes <= 0) return ray_error("domain", ".log.write: serde size 0");
+
+    uint8_t* blob = (uint8_t*)ray_sys_alloc((size_t)nbytes);
+    if (blob == NULL) return ray_error("oom", NULL);
+
+    /* Frame the payload as an async IPC message.  A short serialize skips
+     * the write; either way the scratch buffer is freed before returning. */
+    const int complete = (ray_ser_raw(blob, expr) == nbytes);
+    ray_err_t rc = RAY_OK;
+    if (complete) {
+        ray_ipc_header_t frame = {
+            .prefix  = RAY_SERDE_PREFIX,  .version = RAY_SERDE_WIRE_VERSION,
+            .flags   = 0,                 .endian  = 0,
+            .msgtype = RAY_IPC_MSG_ASYNC, .size    = nbytes,
+        };
+        rc = ray_journal_write_bytes(&frame, blob, nbytes);
+    }
+    ray_sys_free(blob);
+    return complete ? err_to_ray(rc, "io") : ray_error("io", NULL);
+}
+
+/* (.log.replay "path") — replay the journal file at `path`.  Returns the i64
+ * count of replayed chunks on success, otherwise an error named after the
+ * replay status (badtail / deser / decompress / oom / io). */
+ray_t* ray_log_replay_fn(ray_t* path) {
+    char p[1024];
+    if (str_to_cpath(path, p, sizeof(p)) == NULL)
+        return ray_error("type", ".log.replay expects a string path");
+
+    int64_t chunks = 0, errs = 0;
+    ray_jreplay_status_t status = RAY_JREPLAY_OK;
+    ray_journal_replay(p, &chunks, &errs, &status);
+    const long long done = (long long)chunks;   /* widened once for the %lld conversions */
+
+    switch (status) {
+    case RAY_JREPLAY_OK:
+        return ray_i64(chunks);
+    case RAY_JREPLAY_BADTAIL: {
+        /* Ask the validator for the valid-byte count quoted in the message. */
+        int64_t valid_chunks = 0, valid_bytes = 0;
+        ray_journal_validate(p, &valid_chunks, &valid_bytes);
+        return ray_error("badtail", "%s: framing broken after %lld entries (valid bytes = %lld)",
+                         p, done, (long long)valid_bytes);
+    }
+    case RAY_JREPLAY_DESER:
+        return ray_error("deser", "%s: deserialization failed at chunk %lld — framing intact, content/version skew",
+                         p, done);
+    case RAY_JREPLAY_DECOMP:
+        return ray_error("decompress", "%s: decompression failed at chunk %lld — framing intact, do not truncate",
+                         p, done);
+    case RAY_JREPLAY_OOM:
+        return ray_error("oom", "%s: out of memory mid-replay after %lld entries", p, done);
+    case RAY_JREPLAY_IO:
+        return ray_error("io", "%s: I/O failure after %lld entries", p, done);
+    }
+    return ray_error("internal", "unknown replay status");
+}
+
+/* (.log.validate "path") -> (chunks; valid_bytes) — a 2-list. */
+ray_t* ray_log_validate_fn(ray_t* path) {
+    char p[1024];
+    if (!str_to_cpath(path, p, sizeof(p)))
+        return ray_error("type", ".log.validate expects a string path");
+
+    int64_t chunks = 0, valid_bytes = 0;
+    ray_err_t e = ray_journal_validate(p, &chunks, &valid_bytes);
+    if (e != RAY_OK) return err_to_ray(e, "io");
+
+    ray_t* list = ray_list_new(2);
+    if (!list || RAY_IS_ERR(list)) return ray_error("oom", NULL);
+    ray_t* a = ray_i64(chunks);
+    ray_t* b = ray_i64(valid_bytes);
+    list = ray_list_append(list, a); ray_release(a);
+    if (RAY_IS_ERR(list)) { ray_release(b); return list; }
+    list = ray_list_append(list, b); ray_release(b);
+    return list;
+}
+
+ray_t* ray_log_roll_fn(ray_t** args, int64_t n) {
+    (void)args; (void)n;
+    if (!ray_journal_is_open())
+        return ray_error("domain", ".log.roll: no journal open");
+    return err_to_ray(ray_journal_roll(), "io");
+}
+
+ray_t* ray_log_snapshot_fn(ray_t** args, int64_t n) {
+    (void)args; (void)n;
+    if (!ray_journal_is_open())
+        return ray_error("domain", ".log.snapshot: no journal open");
+    return err_to_ray(ray_journal_snapshot(), "io");
+}
+
+ray_t* ray_log_sync_fn(ray_t** args, int64_t n) {   /* (.log.sync) — no open-journal check is made here */
+    (void)args; (void)n;   /* takes no arguments; silences unused-parameter warnings */
+    return err_to_ray(ray_journal_sync(), "io");   /* null object on RAY_OK, else an error object */
+}
+
+ray_t* ray_log_close_fn(ray_t** args, int64_t n) {   /* (.log.close) — no open-journal check is made here */
+    (void)args; (void)n;   /* takes no arguments; silences unused-parameter warnings */
+    return err_to_ray(ray_journal_close(), "io");   /* null object on RAY_OK, else an error object */
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/journal.h b/crates/rayforce-sys/vendor/rayforce/src/ops/journal.h
new file mode 100644
index 0000000..daea3df
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/journal.h
@@ -0,0 +1,64 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+/* Rayfall-facing thin wrappers over store/journal.{c,h}.  These are the
+ * functions registered under the `.log.*` namespace in eval.c. */
+#ifndef RAY_OPS_JOURNAL_H
+#define RAY_OPS_JOURNAL_H
+
+#include <rayforce.h>
+
+/* (.log.open args) — args = (`async; "base") or (`sync; "base").
+ * Loads <base>.qdb if present, replays <base>.log if present, then
+ * opens <base>.log for append. */
+ray_t* ray_log_open_fn(ray_t** args, int64_t n);
+
+/* (.log.write expr) — append a synthetic entry containing the
+ * serialized form of `expr`.  Errors with "noopen" if no journal is
+ * open; no-op (returns null) if a replay is currently in progress. */
+ray_t* ray_log_write_fn(ray_t* expr);
+
+/* (.log.replay "path") -> i64 chunks replayed.  Errors with "badtail"
+ * if the file ends mid-frame; the error message includes the byte
+ * offset of the last good entry. */
+ray_t* ray_log_replay_fn(ray_t* path);
+
+/* (.log.validate "path") -> (chunks; valid_bytes) pair.  Maps to
+ * q's `-11!(-2; file)` — count valid frames without evaluating. */
+ray_t* ray_log_validate_fn(ray_t* path);
+
+/* (.log.roll) — close and rename current log to <base>.<UTC>.log,
+ * open a fresh empty <base>.log. */
+ray_t* ray_log_roll_fn(ray_t** args, int64_t n);
+
+/* (.log.snapshot) — write the current global env to <base>.qdb,
+ * then roll the log. */
+ray_t* ray_log_snapshot_fn(ray_t** args, int64_t n);
+
+/* (.log.sync) — fflush + fsync the open log (no-op in -L mode). */
+ray_t* ray_log_sync_fn(ray_t** args, int64_t n);
+
+/* (.log.close) — close the active log. */
+ray_t* ray_log_close_fn(ray_t** args, int64_t n);
+
+#endif /* RAY_OPS_JOURNAL_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/lftj.c b/crates/rayforce-sys/vendor/rayforce/src/ops/lftj.c
new file mode 100644
index 0000000..c991ec9
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/lftj.c
@@ -0,0 +1,258 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "lftj.h"
+#include <string.h>
+
+/* Grow output buffers when full (64 entries, then doubling). Returns false on OOM or if the byte size would overflow. */
+static bool lftj_grow_output(lftj_enum_ctx_t* ctx) {
+    if (ctx->out_cap > (int64_t)(SIZE_MAX / sizeof(int64_t) / 2)) return false;  /* doubled cap * sizeof(int64_t) must fit size_t */
+    int64_t new_cap = ctx->out_cap < 64 ? 64 : ctx->out_cap * 2;
+    /* Allocate all new blocks first (atomic: no state change on failure) */
+    ray_t* new_hdrs[LFTJ_MAX_VARS];
+    for (uint8_t v = 0; v < ctx->n_vars; v++) {
+        new_hdrs[v] = ray_alloc((size_t)new_cap * sizeof(int64_t));
+        if (!new_hdrs[v]) {
+            for (uint8_t j = 0; j < v; j++) ray_free(new_hdrs[j]);
+            return false;
+        }
+        if (ctx->out_count > 0)  /* empty output: nothing to copy, so never hand memcpy a possibly-invalid source */
+            memcpy(ray_data(new_hdrs[v]), ctx->col_data[v], (size_t)ctx->out_count * sizeof(int64_t));
+    }
+    /* Commit: swap pointers (no allocation can fail past here) */
+    for (uint8_t v = 0; v < ctx->n_vars; v++) {
+        ray_free(ctx->buf_hdrs[v]);
+        ctx->buf_hdrs[v] = new_hdrs[v];
+        ctx->col_data[v] = (int64_t*)ray_data(new_hdrs[v]);
+    }
+    ctx->out_cap = new_cap;
+    return true;
+}
+
+/* --------------------------------------------------------------------------
+ * Leapfrog search: intersect k sorted iterators
+ * Returns true + sets *out to the next key common to all iterators;
+ * returns false as soon as any iterator is exhausted.
+ * -------------------------------------------------------------------------- */
+
+bool leapfrog_search(ray_lftj_iter_t** iters, int k, int64_t* out) {
+    if (k <= 0) return false;
+
+    /* Single pass over the inputs: an exhausted iterator makes the
+     * intersection empty, and the one holding the largest key becomes the
+     * initial leader (ties keep the lowest index). */
+    int lead = 0;
+    for (int i = 0; i < k; i++) {
+        if (lftj_at_end(iters[i])) return false;
+        if (lftj_key(iters[i]) > lftj_key(iters[lead])) lead = i;
+    }
+
+    for (;;) {
+        /* Round-robin: the iterator after the leader seeks to the leader's
+         * key and then takes over as leader for the next round. */
+        const int64_t target = lftj_key(iters[lead]);
+        lead = (lead + 1) % k;
+        ray_lftj_iter_t* chaser = iters[lead];
+
+        lftj_seek(chaser, target);
+        if (lftj_at_end(chaser)) return false;
+
+        /* Overshot the target: keep chasing with the new, larger key. */
+        if (lftj_key(chaser) != target) continue;
+
+        /* Landed on the target: it is a match only when every iterator
+         * currently sits on that same key. */
+        int agree = 0;
+        while (agree < k && lftj_key(iters[agree]) == target) agree++;
+        if (agree == k) {
+            *out = target;
+            return true;
+        }
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Binding plan construction
+ * -------------------------------------------------------------------------- */
+
+bool lftj_build_plan(lftj_enum_ctx_t* ctx,
+                     ray_rel_t** rels, uint8_t n_rels, uint8_t n_vars,
+                     const uint8_t* rel_src_var, const uint8_t* rel_dst_var) {
+    if (n_vars > LFTJ_MAX_VARS) return false;
+    ctx->n_vars = n_vars;
+
+    /* Every variable starts with an empty binding list. */
+    for (uint8_t v = 0; v < n_vars; v++)
+        ctx->var_plans[v].n_bindings = 0;
+
+    /* Variables are bound in index order, so a relationship src_var→dst_var
+     * constrains whichever endpoint has the higher index:
+     *   - src < dst: dst gets a binding on the fwd CSR, opened at src's value
+     *   - src > dst: src gets a binding on the rev CSR, opened at dst's value
+     *
+     * Variable 0 can never be the higher endpoint, so it ends up with no
+     * bindings; lftj_enumerate treats that binding-less depth-0 variable as
+     * the root and scans the whole node range for it.
+     *
+     * Returns false (leaving ctx partially filled) on an out-of-range
+     * variable index or when a variable exceeds LFTJ_MAX_ITERS_PER_VAR.
+     */
+    for (uint8_t r = 0; r < n_rels; r++) {
+        const uint8_t src = rel_src_var[r];
+        const uint8_t dst = rel_dst_var[r];
+        /* Self-loop (src == dst) is invalid — skip it */
+        if (src == dst) continue;
+        if (src >= n_vars || dst >= n_vars) return false;
+
+        /* Identify the later-bound endpoint, its already-bound peer, and
+         * the CSR direction that walks from the peer to it. */
+        const bool    forward = src < dst;
+        const uint8_t later   = forward ? dst : src;
+        const uint8_t earlier = forward ? src : dst;
+
+        lftj_var_plan_t* plan = &ctx->var_plans[later];
+        if (plan->n_bindings >= LFTJ_MAX_ITERS_PER_VAR) return false;
+
+        lftj_binding_t* slot = &plan->bindings[plan->n_bindings];
+        slot->csr       = forward ? &rels[r]->fwd : &rels[r]->rev;
+        slot->bound_var = earlier;
+        plan->n_bindings++;
+    }
+
+    return true;
+}
+
+/* Build the binding plan for a recognised query shape: triangle, 4-clique,
+ * two-variable multi-edge, or chain.  Returns false for any other shape or
+ * when lftj_build_plan rejects the generated mapping. */
+bool lftj_build_default_plan(lftj_enum_ctx_t* ctx,
+                             ray_rel_t** rels, uint8_t n_rels, uint8_t n_vars) {
+    /* Triangle: rels[0]=a→b, rels[1]=b→c, rels[2]=a→c */
+    if (n_vars == 3 && n_rels == 3) {
+        uint8_t from[3] = {0, 1, 0};
+        uint8_t to[3]   = {1, 2, 2};
+        return lftj_build_plan(ctx, rels, n_rels, n_vars, from, to);
+    }
+
+    /* 4-clique: rels[0]=a→b, rels[1]=a→c, rels[2]=a→d,
+     *           rels[3]=b→c, rels[4]=b→d, rels[5]=c→d */
+    if (n_vars == 4 && n_rels == 6) {
+        uint8_t from[6] = {0, 0, 0, 1, 1, 2};
+        uint8_t to[6]   = {1, 2, 3, 2, 3, 3};
+        return lftj_build_plan(ctx, rels, n_rels, n_vars, from, to);
+    }
+
+    /* The remaining shapes are generated rather than tabulated:
+     *   - two variables: every rel connects var 0→var 1
+     *   - chain (n_rels == n_vars - 1): rel[i] connects var i→var i+1
+     * Anything else is not a recognised pattern. */
+    const bool two_var = (n_vars == 2);
+    if (!two_var && n_rels != n_vars - 1) return false;
+
+    /* Scratch holds at most 16 rels; longer inputs are rejected. */
+    uint8_t from[16], to[16];
+    if (n_rels > 16) return false;
+    for (uint8_t r = 0; r < n_rels; r++) {
+        from[r] = two_var ? 0 : r;
+        to[r]   = two_var ? 1 : r + 1;
+    }
+
+    return lftj_build_plan(ctx, rels, n_rels, n_vars, from, to);
+}
+
+/* --------------------------------------------------------------------------
+ * Recursive backtracking enumeration
+ *
+ * At each depth d, open iterators for variable d's bindings using the
+ * currently bound values, then leapfrog-intersect to find valid bindings.
+ * Once depth == n_vars every variable is bound and the tuple is emitted.
+ * -------------------------------------------------------------------------- */
+
+void lftj_enumerate(lftj_enum_ctx_t* ctx, uint8_t depth) {
+    if (ctx->oom) return;  /* a failed output grow aborts the whole enumeration */
+
+    if (depth == ctx->n_vars) {
+        /* All variables bound — emit tuple (growing the columns first if full) */
+        if (ctx->out_count >= ctx->out_cap) {
+            if (!lftj_grow_output(ctx)) {
+                ctx->oom = true;
+                return;
+            }
+        }
+        for (uint8_t v = 0; v < ctx->n_vars; v++)
+            ctx->col_data[v][ctx->out_count] = ctx->bound[v];
+        ctx->out_count++;
+        return;
+    }
+
+    lftj_var_plan_t* vp = &ctx->var_plans[depth];
+
+    if (vp->n_bindings == 0) {
+        /* Root variable (depth 0 with no bindings): iterate all nodes.
+         * The node range is the largest n_nodes of any CSR in the plan. */
+        if (depth != 0) return;  /* non-root var must have bindings */
+
+        /* Find max n_nodes across all CSRs in the query */
+        int64_t n_nodes = 0;
+        for (uint8_t v = 0; v < ctx->n_vars; v++) {
+            for (uint8_t b = 0; b < ctx->var_plans[v].n_bindings; b++) {
+                if (ctx->var_plans[v].bindings[b].csr) {
+                    int64_t nn = ctx->var_plans[v].bindings[b].csr->n_nodes;
+                    if (nn > n_nodes) n_nodes = nn;
+                }
+            }
+        }
+        if (n_nodes == 0) return;  /* no CSR with nodes anywhere in the plan: nothing to enumerate */
+
+        for (int64_t a = 0; a < n_nodes; a++) {
+            ctx->bound[0] = a;  /* try every node id as the root binding */
+            lftj_enumerate(ctx, 1);
+            if (ctx->oom) return;
+        }
+        return;
+    }
+
+    /* Open iterators for this variable's bindings (per-call storage, one set per recursion level) */
+    ray_lftj_iter_t iter_buf[LFTJ_MAX_ITERS_PER_VAR];
+    ray_lftj_iter_t* iter_ptrs[LFTJ_MAX_ITERS_PER_VAR];
+
+    for (uint8_t b = 0; b < vp->n_bindings; b++) {
+        lftj_binding_t* bind = &vp->bindings[b];
+        if (!bind->csr) return;  /* binding without a CSR: abandon this branch */
+        int64_t parent = ctx->bound[bind->bound_var];
+        if (parent < 0 || parent >= bind->csr->n_nodes) return;  /* bound value is not a node of this CSR */
+        lftj_open(&iter_buf[b], bind->csr, parent);
+        iter_ptrs[b] = &iter_buf[b];
+    }
+
+    /* Leapfrog intersect: each hit is a value satisfying every binding of this variable */
+    int64_t val;
+    while (leapfrog_search(iter_ptrs, vp->n_bindings, &val)) {
+        ctx->bound[depth] = val;
+        lftj_enumerate(ctx, depth + 1);
+        if (ctx->oom) return;
+        /* Advance all iterators past current match */
+        for (uint8_t b = 0; b < vp->n_bindings; b++)
+            lftj_next(iter_ptrs[b]);
+    }
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/lftj.h b/crates/rayforce-sys/vendor/rayforce/src/ops/lftj.h
new file mode 100644
index 0000000..1ce4380
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/lftj.h
@@ -0,0 +1,136 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_LFTJ_H
+#define RAY_LFTJ_H
+
+#include "ops.h"
+#include "store/csr.h"
+
+/* Trie iterator over sorted CSR adjacency list: a cursor over targets[start, end) */
+typedef struct ray_lftj_iter {
+    int64_t* targets;        /* pointer into CSR targets data; NULL = unopened/empty */
+    int64_t  start;          /* current range start (index into targets) */
+    int64_t  end;            /* current range end (exclusive index into targets) */
+    int64_t  pos;            /* current position in [start, end) */
+} ray_lftj_iter_t;
+
+/* O(1): key under the cursor; INT64_MAX when unopened or past the end */
+static inline int64_t lftj_key(ray_lftj_iter_t* it) {
+    const bool live = it->targets && it->pos < it->end;
+    return live ? it->targets[it->pos] : INT64_MAX;
+}
+
+static inline bool lftj_at_end(ray_lftj_iter_t* it) {
+    return !(it->targets && it->pos < it->end);   /* unopened, or cursor has consumed the range */
+}
+
+static inline void lftj_next(ray_lftj_iter_t* it) {
+    if (it->pos < it->end) it->pos++;   /* step forward one entry; stays put once at end */
+}
+
+/* O(log degree): advance pos to the first entry in [pos, end) that is >= v (end if none) */
+static inline void lftj_seek(ray_lftj_iter_t* it, int64_t v) {
+    if (!it->targets) { it->pos = it->end; return; }
+    int64_t first = it->pos, remaining = it->end - it->pos;
+    while (remaining > 0) {
+        int64_t half = remaining / 2;
+        if (it->targets[first + half] < v) { first += half + 1; remaining -= half + 1; }
+        else remaining = half;
+    }
+    it->pos = first;
+}
+
+/* Open trie level: aim `it` at `parent`'s adjacency slice of `csr`; a missing CSR or out-of-range parent gives an empty iterator */
+static inline void lftj_open(ray_lftj_iter_t* it, ray_csr_t* csr, int64_t parent) {
+    const bool usable = csr && csr->offsets && csr->targets
+                        && parent >= 0 && parent < csr->n_nodes;
+    if (!usable) {
+        it->targets = NULL; it->start = it->end = it->pos = 0;
+        return;
+    }
+    const int64_t* bounds = (const int64_t*)ray_data(csr->offsets);
+    it->targets = (int64_t*)ray_data(csr->targets);
+    it->start   = it->pos = bounds[parent];
+    it->end     = bounds[parent + 1];
+}
+
+/* Leapfrog search: intersect k sorted iterators */
+bool leapfrog_search(ray_lftj_iter_t** iters, int k, int64_t* out);
+
+/* --------------------------------------------------------------------------
+ * General LFTJ enumeration
+ * -------------------------------------------------------------------------- */
+
+#define LFTJ_MAX_VARS 16
+#define LFTJ_MAX_ITERS_PER_VAR 8
+
+/* Binding entry: one iterator constraint on a variable.
+ * "Open CSR `csr` at the node bound to `bound_var`" (lftj_build_plan always picks the lower-indexed endpoint as bound_var) */
+typedef struct lftj_binding {
+    ray_csr_t* csr;           /* CSR to open (fwd or rev of some rel) */
+    uint8_t   bound_var;     /* index of already-bound variable providing the parent node */
+} lftj_binding_t;
+
+/* Per-variable binding plan: the constraints intersected when this variable is bound */
+typedef struct lftj_var_plan {
+    lftj_binding_t bindings[LFTJ_MAX_ITERS_PER_VAR];   /* only the first n_bindings entries are used */
+    uint8_t        n_bindings;                          /* 0 is treated as the root case by lftj_enumerate */
+} lftj_var_plan_t;
+
+/* Enumeration context: binding plan, current partial assignment, and output columns */
+typedef struct lftj_enum_ctx {
+    lftj_var_plan_t var_plans[LFTJ_MAX_VARS];   /* filled by lftj_build_plan; indexed by variable */
+    uint8_t         n_vars;                     /* number of variables in use (<= LFTJ_MAX_VARS) */
+    int64_t         bound[LFTJ_MAX_VARS];   /* currently bound values */
+
+    /* Output buffers (caller-owned, dynamically grown) */
+    int64_t**       col_data;    /* [n_vars] arrays of output values */
+    int64_t         out_count;   /* tuples emitted so far */
+    int64_t         out_cap;     /* tuples each column can hold before the next grow */
+    ray_t*           buf_hdrs[LFTJ_MAX_VARS]; /* scratch headers for realloc */
+    bool            oom;         /* set on allocation failure */
+} lftj_enum_ctx_t;
+
+/* Build binding plan from relationship array.
+ * Assumes variable ordering 0..n_vars-1.
+ * For each rel: rel[i] connects src_var→dst_var.
+ * The caller encodes this mapping as (src_var, dst_var) pairs.
+ * Returns true on success. */
+bool lftj_build_plan(lftj_enum_ctx_t* ctx,
+                     ray_rel_t** rels, uint8_t n_rels, uint8_t n_vars,
+                     const uint8_t* rel_src_var, const uint8_t* rel_dst_var);
+
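+/* Build default binding plan for simple patterns.
+ * Triangle (n_vars=3, n_rels=3): rels[0]=a→b, rels[1]=b→c, rels[2]=a→c; 2-var (n_vars=2): all rels connect var 0→var 1
+ * 4-clique (n_vars=4, n_rels=6): a→b, a→c, a→d, b→c, b→d, c→d; otherwise chain (n_rels=n_vars-1): rels[i] connects var i→var i+1
+ * Returns true on success, false if pattern not recognized. */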
+bool lftj_build_default_plan(lftj_enum_ctx_t* ctx,
+                             ray_rel_t** rels, uint8_t n_rels, uint8_t n_vars);
+
+/* Recursive backtracking enumeration.
+ * Caller must initialize ctx->col_data, out_cap, out_count=0, buf_hdrs.
+ * Populates ctx->col_data with matching tuples. */
+void lftj_enumerate(lftj_enum_ctx_t* ctx, uint8_t depth);
+
+#endif /* RAY_LFTJ_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/linkop.c b/crates/rayforce-sys/vendor/rayforce/src/ops/linkop.c
new file mode 100644
index 0000000..0d0aa11
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/linkop.c
@@ -0,0 +1,328 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "linkop.h"
+#include "idxop.h"
+#include "ops/internal.h"   /* col_propagate_str_pool */
+#include "ops/ops.h"        /* RAY_IS_PARTED */
+#include "mem/cow.h"
+#include "vec/vec.h"
+#include "table/table.h"
+#include "table/sym.h"
+#include "lang/eval.h"
+#include "lang/env.h"
+#include <string.h>
+
+/* --------------------------------------------------------------------------
+ * Promote inline nullmap to ext-nullmap before attaching a link.
+ *
+ * A linked column places its int64 target sym at nullmap-union bytes 8-15.
+ * If the column has inline nulls and >64 elements, those bytes hold real
+ * bitmap bits that would be clobbered.  Promote up front to keep nulls
+ * intact.  Mirrors the promotion logic in ray_vec_set_null_checked. */
+static ray_err_t promote_inline_to_ext(ray_t* vec) {
+    /* Only an inline bitmap needs moving: no nulls, or already ext, is a no-op. */
+    if (!(vec->attrs & RAY_ATTR_HAS_NULLS) || (vec->attrs & RAY_ATTR_NULLMAP_EXT))
+        return RAY_OK;
+
+    /* One bit per element, rounded up to whole bytes, never less than 1. */
+    int64_t nbytes = (vec->len + 7) / 8;
+    if (nbytes < 1) nbytes = 1;
+    ray_t* ext = ray_vec_new(RAY_U8, nbytes);
+    if (!ext || RAY_IS_ERR(ext)) return RAY_ERR_OOM;
+    ext->len = nbytes;
+
+    /* Seed ext from the inline bitmap (at most 16 bytes); zero any tail. */
+    char* dst = (char*)ray_data(ext);
+    const int64_t inline_bytes = nbytes > 16 ? 16 : nbytes;
+    memcpy(dst, vec->nullmap, (size_t)inline_bytes);
+    if (nbytes > 16) memset(dst + 16, 0, (size_t)(nbytes - 16));
+    /* Installing the pointer overwrites bytes 0-7; bytes 8-15 become don't-care. */
+    vec->ext_nullmap = ext;
+    vec->attrs |= RAY_ATTR_NULLMAP_EXT;
+    return RAY_OK;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_link_attach — validate *vp and the target, COW *vp, record the link
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_link_attach(ray_t** vp, int64_t target_sym_id) {
+    if (!vp || !*vp || RAY_IS_ERR(*vp))
+        return ray_error("type", "link: null/error vector");
+    ray_t* v = *vp;
+
+    if (!ray_is_vec(v) || (v->type != RAY_I32 && v->type != RAY_I64))
+        return ray_error("type", "link: column must be RAY_I32 or RAY_I64 (got %d)",
+                         (int)v->type);
+    if (v->attrs & RAY_ATTR_SLICE)
+        return ray_error("type", "link: cannot attach to a slice; materialize first");
+    if (target_sym_id < 0)
+        return ray_error("type", "link: invalid target sym ID");
+
+    /* Validate that target_sym_id resolves to a RAY_TABLE in the env. */
+    ray_t* target = ray_env_get(target_sym_id);
+    if (!target || target->type != RAY_TABLE)
+        return ray_error("name", "link: target sym does not name a table");
+
+    /* Reject parted dim tables — deref math (target_col[linkcol[i]]) is
+     * straight indexing, with no notion of which segment a global rowid
+     * lives in.  Pointing a link at a parted target would silently
+     * misbehave at deref time.  Better an explicit nyi here than a
+     * three-layers-deep wrong-answer bug.  See guide-indexes.html and
+     * queries-links.html for the supported shape (parted fact -> regular
+     * non-parted dim). */
+    int64_t tcols = ray_table_ncols(target);
+    for (int64_t c = 0; c < tcols; c++) {
+        ray_t* tcol = ray_table_get_col_idx(target, c);
+        if (tcol && RAY_IS_PARTED(tcol->type))
+            return ray_error("nyi",
+                "link: target table has a parted column (%d); "
+                "link targets must be non-parted (in-memory or splayed) tables",
+                (int)c);
+    }
+
+    /* COW so we own the bytes we're about to mutate. */
+    v = ray_cow(v);
+    if (!v || RAY_IS_ERR(v)) return v;   /* NOTE(review): a NULL from ray_cow is returned as-is and *vp is not updated on this path (detach does update it) — confirm intended */
+    *vp = v;
+
+    /* Promote nulls to ext if necessary so bytes 8-15 are free. */
+    ray_err_t err = promote_inline_to_ext(v);
+    if (err != RAY_OK) return ray_error(ray_err_code_str(err), "link: oom");
+
+    /* Replace any existing link (idempotent re-attach with new target). */
+    v->link_target = target_sym_id;
+    v->attrs |= RAY_ATTR_HAS_LINK;
+
+    /* If an accelerator index is also attached, the index's saved snapshot
+     * captured the pre-link bytes 8-15 (which were _idx_pad / NULL).  Update
+     * the snapshot so a future index-drop restores the link too. */
+    if (v->attrs & RAY_ATTR_HAS_INDEX) {
+        ray_index_t* ix = ray_index_payload(v->index);
+        memcpy(&ix->saved_nullmap[8], &target_sym_id, 8);
+    }
+    return v;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_link_detach — clear HAS_LINK on *vp; returns *vp, or NULL if vp is NULL
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_link_detach(ray_t** vp) {
+    if (!vp) return NULL;   /* no slot to read; the old `return *vp` dereferenced NULL here */
+    ray_t* v = *vp;
+    if (!v || RAY_IS_ERR(v) || !(v->attrs & RAY_ATTR_HAS_LINK)) return v;   /* nothing to detach */
+
+    v = ray_cow(v);   /* own the bytes before mutating them */
+    if (!v || RAY_IS_ERR(v)) { *vp = v; return v; }
+    *vp = v;
+
+    v->link_target = 0;
+    v->attrs &= (uint8_t)~RAY_ATTR_HAS_LINK;
+
+    if (v->attrs & RAY_ATTR_HAS_INDEX) {   /* also clear bytes 8-15 of the index's saved nullmap snapshot */
+        ray_index_t* ix = ray_index_payload(v->index);
+        memset(&ix->saved_nullmap[8], 0, 8);
+    }
+    return v;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_link_deref — produce target_col[link_col[i]] for each row i
+ *
+ * Result type matches the target column; length matches the link column.
+ * Null or out-of-range link rows and null target rows become null in the
+ * result.  Returns NULL when v is not a linked I32/I64 column or the target
+ * table / field column don't exist (probe miss); "nyi" error if parted.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_link_deref(ray_t* v, int64_t sym_id) {
+    if (!ray_link_has(v)) return NULL;
+    if (v->type != RAY_I32 && v->type != RAY_I64) return NULL;
+
+    /* Slice-through: a slice over a linked parent inherits the link.
+     * link_target lives on the parent; the slice's own bytes 8-15 are
+     * slice_offset, which would be garbage if we read it as a sym ID. */
+    int64_t target_sym = (v->attrs & RAY_ATTR_SLICE)
+                         ? v->slice_parent->link_target
+                         : v->link_target;
+    ray_t* target_tab = ray_env_get(target_sym);
+    if (!target_tab || target_tab->type != RAY_TABLE) return NULL;
+
+    /* Reject parted targets at deref time, mirroring the attach-time guard
+     * in ray_link_attach.  The attach-time check catches the obvious case
+     * (user calls (.col.link 'parted_dim ...)), but two paths bypass it:
+     *   1. Lazy rebind — attach saw a non-parted table; the sym was later
+     *      rebound to a parted one (env lookup is at deref time).
+     *   2. .link sidecar reload — try_load_link_sidecar (col.c) writes
+     *      link_target straight from the on-disk sym name without any
+     *      env-state check.
+     * Without a deref-time guard, both produce a silent wrong-answer bug
+     * (target_col[linkcol[i]] indexes into RAY_PARTED data, which is a
+     * list of segment pointers — straight-byte indexing is meaningless). */
+    int64_t tcols = ray_table_ncols(target_tab);
+    for (int64_t c = 0; c < tcols; c++) {
+        ray_t* tcol = ray_table_get_col_idx(target_tab, c);
+        if (tcol && RAY_IS_PARTED(tcol->type))
+            return ray_error("nyi",
+                "link deref: target table has a parted column (%d); "
+                "links to parted dim tables are not supported in v1",
+                (int)c);
+    }
+
+    ray_t* target_col = ray_table_get_col(target_tab, sym_id);
+    if (!target_col) return NULL;
+
+    int64_t n = v->len;
+    int64_t target_n = target_col->len;
+    int8_t  out_type = target_col->type;
+
+    /* Resolve through slices: SYM-width and (later) sym_dict / str_pool
+     * all live on the slice_parent's attrs/union, never on the slice
+     * itself.  The slice contributes only its [slice_offset, len) view.
+     * Compute the canonical width and base-pointer once here so the
+     * gather loop stays correct for narrow-width sliced sym columns. */
+    ray_t* col_owner = (target_col->attrs & RAY_ATTR_SLICE)
+                       ? target_col->slice_parent : target_col;
+    int64_t col_off  = (target_col->attrs & RAY_ATTR_SLICE)
+                       ? target_col->slice_offset : 0;
+    uint8_t target_width = col_owner->attrs & RAY_SYM_W_MASK;
+    uint8_t target_esz   = (out_type == RAY_SYM)
+                           ? (uint8_t)(1u << target_width)
+                           : ray_sym_elem_size(out_type, col_owner->attrs);
+
+    /* Allocate result.  For RAY_SYM mirror the parent's width so the
+     * subsequent memcpy is byte-correct; otherwise the canonical size
+     * for the type. */
+    ray_t* result;
+    if (out_type == RAY_SYM) {
+        result = ray_sym_vec_new(target_width, n);
+    } else {
+        result = ray_vec_new(out_type, n);
+    }
+    if (!result || RAY_IS_ERR(result)) return result;
+    result->len = n;
+
+    uint8_t out_esz = ray_sym_elem_size(out_type, result->attrs);
+    if (out_esz > 0) memset(ray_data(result), 0, (size_t)n * out_esz);
+    /* By construction, out_esz == target_esz: SYM widths match,
+     * STR is always 16, numeric types match because out_type == target. */
+
+    const uint8_t* link_base = (const uint8_t*)ray_data(v);
+    uint8_t link_esz = ray_sym_elem_size(v->type, v->attrs);
+    char* out_base = (char*)ray_data(result);
+    /* Compute the source-data base by hand (not via ray_data on the
+     * slice) because ray_data_fn assumes ray_type_sizes[RAY_SYM] = 8
+     * (W64), which mis-offsets narrow-width sliced sym columns. */
+    const char* col_data_base = (const char*)ray_data(col_owner);
+    const char* tgt_base      = col_data_base + (size_t)col_off * target_esz;
+
+    for (int64_t i = 0; i < n; i++) {
+        if (ray_vec_is_null(v, i)) {
+            ray_vec_set_null(result, i, true);
+            continue;
+        }
+        int64_t rid;
+        if (link_esz == 4) {
+            int32_t r;
+            memcpy(&r, link_base + i * 4, 4);
+            rid = (int64_t)r;
+        } else {
+            memcpy(&rid, link_base + i * 8, 8);
+        }
+        if (rid < 0 || rid >= target_n) {   /* dangling row id: surfaces as null, not an error */
+            ray_vec_set_null(result, i, true);
+            continue;
+        }
+        if (ray_vec_is_null(target_col, rid)) {
+            ray_vec_set_null(result, i, true);
+            continue;
+        }
+        if (target_esz > 0 && out_esz == target_esz) {   /* otherwise the slot keeps its memset zero */
+            memcpy(out_base + i * out_esz,
+                   tgt_base + rid * target_esz,
+                   target_esz);
+        }
+    }
+
+    /* Type-specific metadata propagation.
+     *   RAY_STR: share the source pool so ray_str_t pool_offs are valid.
+     *   RAY_SYM: if the source column carries a local sym_dict, share it.
+     *
+     * sym_dict aliases bytes 8-15 of the nullmap union.  It is only a
+     * real pointer when the column doesn't have inline nulls clobbering
+     * those bytes, i.e. either no nulls or NULLMAP_EXT.  Mirrors the
+     * guard pattern in src/ops/sort.c:3307 and src/ops/rerank.c:182. */
+    if (out_type == RAY_STR) {
+        col_propagate_str_pool(result, target_col);
+    } else if (out_type == RAY_SYM) {
+        if (col_owner && !(col_owner->attrs & RAY_ATTR_SLICE) &&
+            (!(col_owner->attrs & RAY_ATTR_HAS_NULLS) ||
+             (col_owner->attrs & RAY_ATTR_NULLMAP_EXT)) &&
+            col_owner->sym_dict) {
+            ray_retain(col_owner->sym_dict);
+            result->sym_dict = col_owner->sym_dict;   /* NOTE(review): if result still holds inline nulls this store overlaps bitmap bytes 8-15 — confirm ray_vec_set_null promotes to ext first */
+        }
+    }
+    return result;
+}
+
+/* --------------------------------------------------------------------------
+ * Rayfall builtin entry points
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_col_link_fn(ray_t* target_sym, ray_t* int_vec) {
+    if (!target_sym || target_sym->type != -RAY_SYM)
+        return ray_error("type", "(.col.link target v): target must be a sym");
+    if (!int_vec) return ray_error("type", "(.col.link target v): null v");
+    if (RAY_IS_ERR(int_vec)) return int_vec;   /* error input is handed back as-is */
+
+    /* Hold our own ref: attach may repoint int_vec at a COW'd copy. */
+    ray_retain(int_vec);
+    ray_t* res = ray_link_attach(&int_vec, target_sym->i64);
+    if (!RAY_IS_ERR(res)) return int_vec;
+    ray_release(int_vec);   /* failure: drop the reference taken above */
+    return res;
+}
+
+ray_t* ray_col_unlink_fn(ray_t* v) {
+    if (!v || RAY_IS_ERR(v)) return v;      /* NULL / error passes straight through */
+    ray_retain(v);                          /* hold our own reference across the detach */
+    ray_t* res = ray_link_detach(&v);       /* may repoint v at a COW'd copy */
+    if (!RAY_IS_ERR(res)) return v;
+    ray_release(v);
+    return res;
+}
+
+ray_t* ray_col_link_p_fn(ray_t* v) {
+    return ray_link_has(v) ? ray_bool(1) : ray_bool(0);   /* bool atom: linked (or slice of linked)? */
+}
+
+ray_t* ray_col_target_fn(ray_t* v) {
+    /* Unlinked input yields RAY_NULL_OBJ.  Otherwise go through
+     * ray_link_target_id rather than v->link_target: on a slice that slot
+     * aliases slice_offset, so the helper reads slice_parent instead. */
+    const bool linked = ray_link_has(v);
+    return linked ? ray_sym(ray_link_target_id(v)) : RAY_NULL_OBJ;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/linkop.h b/crates/rayforce-sys/vendor/rayforce/src/ops/linkop.h
new file mode 100644
index 0000000..4ef7477
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/linkop.h
@@ -0,0 +1,105 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_LINKOP_H
+#define RAY_LINKOP_H
+
+/*
+ * linkop.h -- Linked columns.
+ *
+ * A linked column is an integer vector (RAY_I32 / RAY_I64) where every
+ * value is a row index into a target table.  Querying linkcol.field
+ * dereferences as target_table[linkcol[i]][field] for each row i — a
+ * single array access, no hash probe.
+ *
+ * Storage: RAY_ATTR_HAS_LINK = 0x04 set on the column; the int64 sym ID
+ * naming the target lives at bytes 8-15 of the nullmap union (the
+ * `link_target` field).  See include/rayforce.h for the union layout
+ * and src/mem/heap.h for the attr-bit semantics.
+ *
+ * Resolution is lazy: link_target is just a sym, looked up against the
+ * global env at deref time.  If the target table has been rebound, the
+ * link follows automatically.
+ *
+ * HAS_LINK is a property of the column, not a transient accelerator —
+ * unlike HAS_INDEX it is preserved across in-place mutation and
+ * persisted to disk via a `.link` sidecar file.
+ */
+
+#include <rayforce.h>
+#include "mem/heap.h"
+
+/* ===== Attach / Detach ===== */
+
+/* Attach a link to *vp pointing at the target named by target_sym_id.
+ * Returns the (possibly COW'd) vector with HAS_LINK set, or a RAY_ERROR.
+ * Validates: target_sym_id >= 0 and resolves to a non-parted RAY_TABLE in
+ * the current env; *vp must be a RAY_I32 or RAY_I64 vector and not a slice. */
+ray_t* ray_link_attach(ray_t** vp, int64_t target_sym_id);
+
+/* Clear HAS_LINK from *vp (COW'ing it first if linked) and zero the
+ * link_target byte slot.  No-op if not linked.  Returns *vp. */
+ray_t* ray_link_detach(ray_t** vp);
+
+/* ===== Introspection ===== */
+
+/* Reports whether `v` carries a link, either directly (HAS_LINK in its own
+ * attrs) or by being a slice whose slice_parent has HAS_LINK.  NULL and
+ * error objects are never linked.  For a slice the target sym lives on the
+ * parent, so read it via slice_parent rather than the slice's own bytes. */
+static inline bool ray_link_has(const ray_t* v) {
+    if (!v || RAY_IS_ERR((ray_t*)v))
+        return false;
+    const bool direct   = (v->attrs & RAY_ATTR_HAS_LINK) != 0;
+    const bool is_slice = (v->attrs & RAY_ATTR_SLICE) != 0;
+    if (direct || !is_slice) return direct;
+    const ray_t* owner = v->slice_parent;   /* slice: the link lives on the parent */
+    return owner && (owner->attrs & RAY_ATTR_HAS_LINK) != 0;
+}
+
+/* Target sym ID (int64) of the link on `v`, or -1 when `v` carries none.
+ * For a slice the ID is read from slice_parent->link_target. */
+static inline int64_t ray_link_target_id(const ray_t* v) {
+    if (!ray_link_has(v)) return (int64_t)-1;
+    const ray_t* holder = (v->attrs & RAY_ATTR_SLICE) ? v->slice_parent : v;
+    return holder->link_target;
+}
+
+/* ===== Resolution ===== */
+
+/* Dereference linked column v at field sym_id of the target table.
+ * Returns a fresh owning ref to a column the same length as v, with
+ * the same type as the target's field column.  Null or out-of-range
+ * rows in v, and null rows in the target, become nulls in the result.
+ * Returns NULL if v is not linked, or the target table / column
+ * `sym_id` is missing (probe miss); a RAY_ERROR for a parted target. */
+ray_t* ray_link_deref(ray_t* v, int64_t sym_id);
+
+/* ===== Rayfall builtin entry points ===== */
+
+ray_t* ray_col_link_fn   (ray_t* target_sym, ray_t* int_vec);  /* (.col.link 'target v) */
+ray_t* ray_col_unlink_fn (ray_t* v);                            /* (.col.unlink v) */
+ray_t* ray_col_link_p_fn (ray_t* v);                            /* (.col.link? v) */
+ray_t* ray_col_target_fn (ray_t* v);                            /* (.col.target v) */
+
+#endif /* RAY_LINKOP_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/ops.h b/crates/rayforce-sys/vendor/rayforce/src/ops/ops.h
new file mode 100644
index 0000000..033a9aa
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/ops.h
@@ -0,0 +1,726 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_OPS_H
+#define RAY_OPS_H
+
+#include <rayforce.h>
+#include "store/hnsw.h"  /* ray_hnsw_metric_t, ray_hnsw_t */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* ===== Internal Type Constants ===== */
+
+#define RAY_SEL       14   /* selection bitmap (lazy filter) */
+
+/* Lazy DAG handle (atom-only; stored inline in nullmap region) */
+#define RAY_LAZY      104
+
+/* ===== Forward Declarations (internal types) ===== */
+
+typedef struct ray_pool      ray_pool_t;
+typedef struct ray_csr       ray_csr_t;
+typedef struct ray_rel       ray_rel_t;
+typedef struct ray_hnsw      ray_hnsw_t;
+
+/* ===== Lazy DAG Handle Accessors ===== */
+
+typedef struct ray_graph ray_graph_t;
+typedef struct ray_op    ray_op_t;
+
+static inline bool ray_is_lazy(ray_t* x) {
+    return x && !RAY_IS_ERR(x) && x->type == RAY_LAZY;
+}
+
+ray_t*    ray_lazy_materialize(ray_t* val);
+
+/* ===== Cancel API ===== */
+
+void     ray_cancel(void);
+
+/* ===== Parted Types ===== */
+
+#define RAY_PARTED_BASE   32   /* first type code that RAY_IS_PARTED accepts */
+#define RAY_MAPCOMMON     64   /* virtual partition column */
+
+/* MAPCOMMON inferred sub-types (stored in attrs field) */
+#define RAY_MC_SYM    0   /* opaque partition key strings */
+#define RAY_MC_DATE   1   /* YYYY.MM.DD partition directories */
+#define RAY_MC_I64    2   /* pure integer partition keys */
+
+#define RAY_IS_PARTED(t)       ((t) >= RAY_PARTED_BASE && (t) < RAY_MAPCOMMON)   /* 32..63; evaluates t twice */
+#define RAY_PARTED_BASETYPE(t) ((t) - RAY_PARTED_BASE)   /* presumably the underlying element type code — confirm */
+
+/* ===== Morsel Constants ===== */
+
+#define RAY_MORSEL_ELEMS  1024
+
+/* ===== Slab Cache Constants ===== */
+
+#define RAY_SLAB_CACHE_SIZE  64
+#define RAY_SLAB_ORDERS      5
+
+/* ===== Heap Allocator Constants ===== */
+
+#define RAY_ORDER_MIN  6
+#define RAY_ORDER_MAX  30
+
+/* ===== Parallel Threshold ===== */
+
+#define RAY_PARALLEL_THRESHOLD  (64 * RAY_MORSEL_ELEMS)
+#define RAY_DISPATCH_MORSELS    8
+
+/* Radix-partitioned hash join tuning.
+ * L2_TARGET: per-partition HT working set limit (tuned for L1d/L2).     */
+#define RAY_JOIN_L2_TARGET   (256 * 1024)   /* target partition HT size in bytes */
+#define RAY_JOIN_MIN_RADIX   2              /* min radix bits (4 partitions)   */
+#define RAY_JOIN_MAX_RADIX   14             /* max radix bits (16K partitions) */
+
+/* ===== Operation Graph ===== */
+
+/* Opcodes — Sources */
+#define OP_SCAN          1
+#define OP_CONST         2
+#define OP_TIL           3   /* generate 0..n-1 sequence (lazy source)  */
+
+/* Opcodes — Unary element-wise (fuseable) */
+#define OP_NEG          10
+#define OP_ABS          11
+#define OP_NOT          12
+#define OP_SQRT         13
+#define OP_LOG          14
+#define OP_EXP          15
+#define OP_CEIL         16
+#define OP_FLOOR        17
+#define OP_ISNULL       18
+#define OP_CAST         19
+#define OP_ROUND         9   /* unary element-wise round */
+
+/* Opcodes — Binary element-wise (fuseable) */
+#define OP_ADD          20
+#define OP_SUB          21
+#define OP_MUL          22
+#define OP_DIV          23
+#define OP_MOD          24
+#define OP_EQ           25
+#define OP_NE           26
+#define OP_LT           27
+#define OP_LE           28
+#define OP_GT           29
+#define OP_GE           30
+#define OP_AND          31
+#define OP_OR           32
+#define OP_MIN2         33
+#define OP_MAX2         34
+#define OP_IF           35
+#define OP_LIKE         36
+#define OP_UPPER        37
+#define OP_LOWER        38
+#define OP_STRLEN       39
+#define OP_SUBSTR       40
+#define OP_REPLACE      41
+#define OP_TRIM         42
+#define OP_CONCAT       43
+#define OP_EXTRACT      45
+#define OP_DATE_TRUNC   46
+#define OP_IN           47   /* binary: col in set_vec -> BOOL */
+#define OP_NOT_IN       48   /* binary: col not in set_vec -> BOOL */
+
+/* EXTRACT / DATE_TRUNC field identifiers */
+#define RAY_EXTRACT_YEAR    0
+#define RAY_EXTRACT_MONTH   1
+#define RAY_EXTRACT_DAY     2
+#define RAY_EXTRACT_HOUR    3
+#define RAY_EXTRACT_MINUTE  4
+#define RAY_EXTRACT_SECOND  5
+#define RAY_EXTRACT_DOW     6
+#define RAY_EXTRACT_DOY     7
+#define RAY_EXTRACT_EPOCH   8
+
+/* Opcodes — Reductions (pipeline breakers) */
+#define OP_SUM          50
+#define OP_PROD         51
+#define OP_MIN          52
+#define OP_MAX          53
+#define OP_COUNT        54
+#define OP_AVG          55
+#define OP_FIRST        56
+#define OP_LAST         57
+#define OP_COUNT_DISTINCT 58
+#define OP_STDDEV       59
+
+/* Opcodes — Structural (pipeline breakers) */
+#define OP_FILTER       60
+#define OP_SORT         61
+#define OP_GROUP        62
+#define OP_JOIN         63
+#define OP_WINDOW_JOIN  64
+#define OP_SELECT       66
+#define OP_HEAD         67
+#define OP_TAIL         68
+
+/* Opcodes — Window */
+#define OP_WINDOW       72
+
+/* Opcodes — Statistical aggregates */
+#define OP_STDDEV_POP   73
+#define OP_VAR          74
+#define OP_VAR_POP      75
+#define OP_ILIKE        76
+#define OP_PIVOT        77   /* single-pass pivot table            */
+#define OP_ANTIJOIN     78   /* anti-semi-join (left rows with no right match) */
+
+/* Opcodes — Graph */
+#define OP_EXPAND        80   /* 1-hop CSR neighbor expansion       */
+#define OP_VAR_EXPAND    81   /* variable-length BFS/DFS            */
+#define OP_SHORTEST_PATH 82   /* BFS shortest path                  */
+#define OP_WCO_JOIN      83   /* worst-case optimal join (LFTJ)     */
+#define OP_PAGERANK        84   /* iterative PageRank                 */
+#define OP_CONNECTED_COMP  85   /* connected components (label prop)  */
+#define OP_DIJKSTRA        86   /* weighted shortest path (Dijkstra)  */
+#define OP_LOUVAIN         87   /* community detection (Louvain)      */
+
+/* Opcodes — Graph algorithms (batch 1) */
+#define OP_DEGREE_CENT     92   /* degree centrality                  */
+#define OP_TOPSORT         93   /* topological sort (Kahn's)          */
+#define OP_DFS             94   /* depth-first search traversal       */
+
+/* Opcodes — Graph algorithms (batch 2) */
+#define OP_ASTAR           95   /* A* shortest path (coordinate heuristic) */
+#define OP_K_SHORTEST      96   /* Yen's k-shortest paths                 */
+#define OP_CLUSTER_COEFF   97   /* clustering coefficients                */
+#define OP_RANDOM_WALK     98   /* random walk traversal                  */
+#define OP_BETWEENNESS     99   /* betweenness centrality (Brandes)       */
+#define OP_CLOSENESS      100   /* closeness centrality                   */
+#define OP_MST            101   /* minimum spanning forest (Kruskal)      */
+
+/* Opcodes — Vector similarity */
+#define OP_COSINE_SIM      88   /* cosine similarity between embeddings   */
+#define OP_EUCLIDEAN_DIST  89   /* euclidean distance between embeddings  */
+#define OP_KNN             90   /* brute-force K nearest neighbors        */
+#define OP_HNSW_KNN        91   /* HNSW approximate K nearest neighbors   */
+#define OP_ANN_RERANK     102   /* index-backed ANN over filtered source  */
+#define OP_KNN_RERANK     103   /* brute-force KNN over filtered source   */
+
+/* Opcodes — Misc */
+#define OP_ALIAS        70
+#define OP_MATERIALIZE  71
+
+/* Window function kinds (stored in func_kinds[]) */
+#define RAY_WIN_ROW_NUMBER    0
+#define RAY_WIN_RANK          1
+#define RAY_WIN_DENSE_RANK    2
+#define RAY_WIN_NTILE         3
+#define RAY_WIN_SUM           4
+#define RAY_WIN_AVG           5
+#define RAY_WIN_MIN           6
+#define RAY_WIN_MAX           7
+#define RAY_WIN_COUNT         8
+#define RAY_WIN_LAG           9
+#define RAY_WIN_LEAD         10
+#define RAY_WIN_FIRST_VALUE  11
+#define RAY_WIN_LAST_VALUE   12
+#define RAY_WIN_NTH_VALUE   13
+
+/* Frame types */
+#define RAY_FRAME_ROWS    0
+#define RAY_FRAME_RANGE   1
+
+/* Frame bounds */
+#define RAY_BOUND_UNBOUNDED_PRECEDING  0
+#define RAY_BOUND_N_PRECEDING          1
+#define RAY_BOUND_CURRENT_ROW          2
+#define RAY_BOUND_N_FOLLOWING          3
+#define RAY_BOUND_UNBOUNDED_FOLLOWING  4
+
+/* Op flags */
+#define OP_FLAG_FUSED        0x01
+#define OP_FLAG_DEAD         0x02
+
+/* Operation node (32 bytes with 8-byte pointers, fits one cache line) */
+typedef struct ray_op {
+    uint16_t       opcode;     /* OP_ADD, OP_SCAN, OP_FILTER, etc. */
+    uint8_t        arity;      /* 0, 1, or 2 */
+    uint8_t        flags;      /* FUSED, DEAD */
+    int8_t         out_type;   /* inferred output type */
+    uint8_t        pad[3];     /* explicit padding: rounds the header up to 8 bytes */
+    uint32_t       id;         /* unique node ID */
+    uint32_t       est_rows;   /* estimated row count */
+    struct ray_op*  inputs[2];  /* NULL if unused */
+} ray_op_t;
+
+/* Extended operation node for N-ary ops (heap-allocated, variable size) */
+typedef struct ray_op_ext {
+    ray_op_t base;              /* 32 bytes standard node */
+    union {
+        ray_t*   literal;       /* OP_CONST: inline literal value */
+        int64_t sym;           /* OP_SCAN: column name symbol ID */
+        struct {               /* OP_GROUP: group-by specification */
+            ray_op_t**  keys;
+            uint8_t    n_keys;
+            uint8_t    n_aggs;
+            uint16_t*  agg_ops;
+            ray_op_t**  agg_ins;
+        };
+        struct {               /* OP_SORT: multi-column sort */
+            ray_op_t**  columns;
+            uint8_t*   desc;
+            uint8_t*   nulls_first; /* 1=nulls first, 0=nulls last */
+            uint8_t    n_cols;
+        } sort;
+        struct {               /* OP_JOIN: join specification */
+            ray_op_t**  left_keys;
+            ray_op_t**  right_keys;
+            uint8_t    n_join_keys;
+            uint8_t    join_type;  /* 0=inner, 1=left, 2=full, 3=anti */
+        } join;
+        struct {               /* OP_WINDOW_JOIN: ASOF join */
+            ray_op_t*   time_key;      /* time/ordered key column */
+            ray_op_t**  eq_keys;       /* equality partition keys */
+            uint8_t    n_eq_keys;     /* number of equality keys */
+            uint8_t    join_type;     /* 0=inner, 1=left outer */
+        } asof;
+        struct {               /* OP_WINDOW: window functions */
+            ray_op_t**  part_keys;
+            ray_op_t**  order_keys;
+            uint8_t*   order_descs;
+            ray_op_t**  func_inputs;
+            uint8_t*   func_kinds;    /* RAY_WIN_ROW_NUMBER etc. */
+            int64_t*   func_params;   /* NTILE(n), LAG offset, etc. */
+            uint8_t    n_part_keys;
+            uint8_t    n_order_keys;
+            uint8_t    n_funcs;
+            uint8_t    frame_type;    /* RAY_FRAME_ROWS / RAY_FRAME_RANGE */
+            uint8_t    frame_start;   /* RAY_BOUND_* */
+            uint8_t    frame_end;     /* RAY_BOUND_* */
+            int64_t    frame_start_n;
+            int64_t    frame_end_n;
+        } window;
+        struct {  /* OP_EXPAND / OP_VAR_EXPAND / OP_SHORTEST_PATH / graph algos */
+            void*     rel;            /* ray_rel_t* (opaque to public header) */
+            void*     sip_sel;        /* ray_t* RAY_SEL bitmap for SIP source-side skip */
+            uint8_t   direction;      /* 0=fwd, 1=rev, 2=both */
+            uint8_t   min_depth;
+            uint8_t   max_depth;
+            uint8_t   path_tracking;
+            uint8_t   factorized;     /* 1 = emit factorized output (fvec) */
+            uint16_t  max_iter;       /* PageRank/Louvain iterations  */
+            double    damping;        /* PageRank damping factor      */
+            int64_t   weight_col_sym; /* Dijkstra/Astar/Yen weight column   */
+            int64_t   coord_col_syms[2]; /* A*: lat/lon property column names */
+            void*     node_props;       /* ray_t* node property table (A*: coords) */
+        } graph;
+        struct {  /* OP_WCO_JOIN */
+            void**    rels;           /* ray_rel_t** array */
+            uint8_t   n_rels;
+            uint8_t   n_vars;
+        } wco;
+        struct {  /* OP_COSINE_SIM / OP_EUCLIDEAN_DIST / OP_INNER_PRODUCT / OP_KNN */
+            float*    query_vec;      /* query embedding (caller-owned, must outlive graph) */
+            int32_t   dim;            /* embedding dimension */
+            int64_t   k;              /* top-K for KNN */
+            int32_t   metric;         /* ray_hnsw_metric_t — used by OP_KNN only */
+        } vector;
+        struct {  /* OP_HNSW_KNN */
+            void*     hnsw_idx;       /* ray_hnsw_t* (opaque, must outlive graph) */
+            float*    query_vec;
+            int32_t   dim;
+            int64_t   k;
+            int32_t   ef_search;
+        } hnsw;
+        struct {  /* OP_ANN_RERANK / OP_KNN_RERANK */
+            void*     hnsw_idx;       /* ray_hnsw_t* for ANN; NULL for KNN */
+            int64_t   col_sym;        /* sym id of column for KNN; 0 for ANN */
+            float*    query_vec;      /* caller-owned */
+            int32_t   dim;
+            int32_t   metric;         /* ray_hnsw_metric_t — KNN variant only */
+            int64_t   k;              /* target result count from `take` */
+            int32_t   ef_search;      /* ANN only */
+        } rerank;
+        struct {  /* OP_PIVOT */
+            ray_op_t**  index_cols;   /* OP_SCAN nodes for index columns */
+            ray_op_t*   pivot_col;    /* OP_SCAN node for pivot column */
+            ray_op_t*   value_col;    /* OP_SCAN node for value column */
+            uint16_t    agg_op;       /* OP_SUM, OP_AVG, etc. */
+            uint8_t     n_index;      /* number of index columns */
+        } pivot;
+    };
+    uint64_t* seg_mask;   /* partition pruning bitmap (NULL = all active) */
+    int64_t   seg_mask_count; /* number of partitions the mask covers */
+} ray_op_ext_t;
+
+/* Operation graph */
+typedef struct ray_graph {
+    ray_op_t*       nodes;       /* array of op nodes (malloc'd); indexed by node id, may move when resized */
+    uint32_t       node_count;  /* number of nodes */
+    uint32_t       node_cap;    /* allocated capacity of the nodes array */
+    ray_t*          table;       /* bound table (provides columns for OP_SCAN) */
+    ray_t**         tables;      /* table registry (indexed by table_id) */
+    uint16_t       n_tables;    /* number of registered tables */
+    ray_op_ext_t**  ext_nodes;   /* tracked extended nodes for cleanup; matched to a node via base.id */
+    uint32_t       ext_count;   /* number of extended nodes */
+    uint32_t       ext_cap;     /* capacity of ext_nodes array */
+    ray_t*          selection;   /* RAY_SEL bitmap — lazy filter (NULL = all pass) */
+
+    /* Compile-time local env for lambda / let inlining in
+     * compile_expr_dag (src/ops/query.c).  Stack of
+     * {formal_sym_id → node_id}.  Pushed on lambda call / let
+     * entry, popped on exit.  Looked up BEFORE ray_scan so
+     * formals shadow column names naturally.
+     *
+     * Stores node IDs (uint32_t), not raw ray_op_t* — the
+     * g->nodes array is dynamically resized, so any realloc
+     * between push and lookup would dangle stored pointers.
+     * Lookup re-resolves &g->nodes[id] on every call. */
+    struct {
+        int64_t    sym;      /* formal parameter symbol id */
+        uint32_t   node_id;  /* id of the node bound to that formal */
+    } cexpr_env[32];         /* fixed capacity: at most 32 bindings */
+    int             cexpr_env_top;  /* NOTE(review): presumably the number of live cexpr_env entries — confirm */
+} ray_graph_t;
+
+/* ===== Morsel Iterator ===== */
+
+typedef struct {  /* cursor state passed to ray_morsel_init / ray_morsel_init_range / ray_morsel_next */
+    ray_t*    vec;          /* source vector */
+    int64_t  offset;       /* current position (element index) */
+    int64_t  len;          /* total length of vector */
+    uint32_t elem_size;    /* bytes per element */
+    int64_t  morsel_len;   /* elements in current morsel (<=RAY_MORSEL_ELEMS) */
+    void*    morsel_ptr;   /* pointer to current morsel data */
+    uint8_t* null_bits;    /* current morsel null bitmap (or NULL when there is none) */
+} ray_morsel_t;
+
+/* ===== Selection Bitmap (RAY_SEL) ===== */
+
+/* Segment flags — one per morsel (RAY_MORSEL_ELEMS rows) */
+#define RAY_SEL_NONE  0   /* all bits 0 — skip entire morsel           */
+#define RAY_SEL_ALL   1   /* all bits 1 — process without bitmap check */
+#define RAY_SEL_MIX   2   /* mixed bits — must check per-row           */
+
+/* Words per morsel segment: 1024 rows / 64 bits = 16 uint64_t */
+#define RAY_SEL_WORDS_PER_SEG  (RAY_MORSEL_ELEMS / 64)
+
+/* Inline metadata at ray_data(sel) */
+typedef struct {  /* fixed header at the start of a RAY_SEL payload */
+    int64_t   total_pass;  /* total passing rows                      */
+    uint32_t  n_segs;      /* ceil(nrows / RAY_MORSEL_ELEMS)           */
+    uint32_t  _pad;        /* explicit padding: 8 + 4 + 4 = 16 bytes   */
+} ray_sel_meta_t;
+
+/*
+ * RAY_SEL block layout (ray_data offset 0):
+ *
+ *   ray_sel_meta_t  meta        (16 bytes)
+ *   uint8_t        seg_flags[] (n_segs, padded to 8-byte alignment)
+ *   uint16_t       seg_popcnt[](n_segs, padded to 8-byte alignment)
+ *   uint64_t       bits[]      (ceil(nrows/64) words)
+ */
+
+static inline ray_sel_meta_t* ray_sel_meta(ray_t* s) {
+    return (ray_sel_meta_t*)ray_data(s);  /* the meta header sits at payload offset 0 */
+}
+static inline uint8_t* ray_sel_flags(ray_t* s) {
+    return (uint8_t*)((ray_sel_meta_t*)ray_data(s) + 1);  /* seg_flags[] begins right behind the meta header */
+}
+static inline uint16_t* ray_sel_popcnt(ray_t* s) {
+    uint32_t flag_bytes = (ray_sel_meta(s)->n_segs + 7u) & ~7u;  /* one flag byte per segment, rounded up to 8 bytes */
+    return (uint16_t*)&ray_sel_flags(s)[flag_bytes];
+}
+static inline uint64_t* ray_sel_bits(ray_t* s) {
+    uint32_t padded_slots = (ray_sel_meta(s)->n_segs + 3u) & ~3u;  /* uint16 slots rounded up to a multiple of 4 (8 bytes) */
+    uint16_t* counts = ray_sel_popcnt(s);
+    return (uint64_t*)&counts[padded_slots];  /* bit words follow the padded popcount array */
+}
+
+/* Bit ops on the bits[] words: row r lives in word r/64, bit r%64.  TEST yields exactly 0 or 1 so the result survives narrowing to int/uint8_t. */
+#define RAY_SEL_BIT_TEST(bits, r)  (((bits)[(r) >> 6] >> ((r) & 63)) & 1ULL)
+#define RAY_SEL_BIT_SET(bits, r)   ((bits)[(r) >> 6] |= (1ULL << ((r) & 63)))
+#define RAY_SEL_BIT_CLR(bits, r)   ((bits)[(r) >> 6] &= ~(1ULL << ((r) & 63)))
+
+/* ===== Executor Pipeline ===== */
+
+typedef struct ray_pipe {  /* one executor pipeline stage, linked to at most two upstream stages */
+    ray_op_t*          op;            /* operation node */
+    struct ray_pipe*   inputs[2];     /* upstream pipes */
+    ray_morsel_t       state;         /* current morsel state */
+    ray_t*             materialized;  /* materialized intermediate (or NULL) */
+    int               spill_fd;      /* file descriptor for spill (-1 if none) */
+} ray_pipe_t;
+
+/* ===== Selection API ===== */
+
+ray_t* ray_sel_new(int64_t nrows);              /* all-zero (no rows pass)       */
+ray_t* ray_sel_from_pred(ray_t* bool_vec);       /* convert RAY_BOOL vec -> RAY_SEL  */
+ray_t* ray_sel_and(ray_t* a, ray_t* b);           /* AND two selections            */
+void  ray_sel_recompute(ray_t* sel);             /* rebuild seg_flags + popcounts */
+
+/* ===== Morsel Iterator API ===== */
+
+void ray_morsel_init(ray_morsel_t* m, ray_t* vec);
+void ray_morsel_init_range(ray_morsel_t* m, ray_t* vec, int64_t start, int64_t end);
+bool ray_morsel_next(ray_morsel_t* m);
+
+/* ===== Operation Graph API ===== */
+
+ray_graph_t* ray_graph_new(ray_t* tbl);
+void        ray_graph_free(ray_graph_t* g);
+
+/* Source ops */
+ray_op_t* ray_scan(ray_graph_t* g, const char* col_name);
+ray_op_t* ray_const_f64(ray_graph_t* g, double val);
+ray_op_t* ray_const_i64(ray_graph_t* g, int64_t val);
+ray_op_t* ray_const_bool(ray_graph_t* g, bool val);
+ray_op_t* ray_const_str(ray_graph_t* g, const char* s, size_t len);
+ray_op_t* ray_const_vec(ray_graph_t* g, ray_t* vec);
+ray_op_t* ray_const_atom(ray_graph_t* g, ray_t* atom);
+ray_op_t* ray_const_table(ray_graph_t* g, ray_t* table);
+
+/* Unary element-wise ops */
+ray_op_t* ray_neg(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_abs(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_not(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_sqrt_op(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_log_op(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_exp_op(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_ceil_op(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_floor_op(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_round_op(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_isnull(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_cast(ray_graph_t* g, ray_op_t* a, int8_t target_type);
+
+/* Generic binary op — opcode-driven dispatch, no switch/case */
+ray_op_t* ray_binop(ray_graph_t* g, uint16_t opcode, ray_op_t* a, ray_op_t* b);
+
+/* Binary element-wise ops (this group also holds ternary IF, string ops and n-ary CONCAT) */
+ray_op_t* ray_add(ray_graph_t* g, ray_op_t* a, ray_op_t* b);
+ray_op_t* ray_sub(ray_graph_t* g, ray_op_t* a, ray_op_t* b);
+ray_op_t* ray_mul(ray_graph_t* g, ray_op_t* a, ray_op_t* b);
+ray_op_t* ray_div(ray_graph_t* g, ray_op_t* a, ray_op_t* b);
+ray_op_t* ray_mod(ray_graph_t* g, ray_op_t* a, ray_op_t* b);
+ray_op_t* ray_eq(ray_graph_t* g, ray_op_t* a, ray_op_t* b);
+ray_op_t* ray_ne(ray_graph_t* g, ray_op_t* a, ray_op_t* b);
+ray_op_t* ray_lt(ray_graph_t* g, ray_op_t* a, ray_op_t* b);
+ray_op_t* ray_le(ray_graph_t* g, ray_op_t* a, ray_op_t* b);
+ray_op_t* ray_gt(ray_graph_t* g, ray_op_t* a, ray_op_t* b);
+ray_op_t* ray_ge(ray_graph_t* g, ray_op_t* a, ray_op_t* b);
+ray_op_t* ray_and(ray_graph_t* g, ray_op_t* a, ray_op_t* b);
+ray_op_t* ray_or(ray_graph_t* g, ray_op_t* a, ray_op_t* b);
+ray_op_t* ray_min2(ray_graph_t* g, ray_op_t* a, ray_op_t* b);
+ray_op_t* ray_max2(ray_graph_t* g, ray_op_t* a, ray_op_t* b);
+ray_op_t* ray_in(ray_graph_t* g, ray_op_t* col, ray_op_t* set);
+ray_op_t* ray_not_in(ray_graph_t* g, ray_op_t* col, ray_op_t* set);
+ray_op_t* ray_if(ray_graph_t* g, ray_op_t* cond, ray_op_t* then_val, ray_op_t* else_val);
+ray_op_t* ray_like(ray_graph_t* g, ray_op_t* input, ray_op_t* pattern);
+ray_op_t* ray_ilike(ray_graph_t* g, ray_op_t* input, ray_op_t* pattern);
+ray_op_t* ray_upper(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_lower(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_strlen(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_substr(ray_graph_t* g, ray_op_t* str, ray_op_t* start, ray_op_t* len);
+ray_op_t* ray_replace(ray_graph_t* g, ray_op_t* str, ray_op_t* from, ray_op_t* to);
+ray_op_t* ray_trim_op(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_concat(ray_graph_t* g, ray_op_t** args, int n);
+
+/* Date/time extraction and truncation */
+ray_op_t* ray_extract(ray_graph_t* g, ray_op_t* col, int64_t field);
+ray_op_t* ray_date_trunc(ray_graph_t* g, ray_op_t* col, int64_t field);
+
+/* Source ops */
+ray_op_t* ray_til(ray_graph_t* g, int64_t n);
+
+/* Reduction ops */
+ray_op_t* ray_sum(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_prod(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_min_op(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_max_op(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_count(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_avg(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_first(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_last(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_count_distinct(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_stddev(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_stddev_pop(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_var(ray_graph_t* g, ray_op_t* a);
+ray_op_t* ray_var_pop(ray_graph_t* g, ray_op_t* a);
+
+/* Structural ops */
+ray_op_t* ray_filter(ray_graph_t* g, ray_op_t* input, ray_op_t* predicate);
+ray_op_t* ray_sort_op(ray_graph_t* g, ray_op_t* table_node,
+                     ray_op_t** keys, uint8_t* descs, uint8_t* nulls_first,
+                     uint8_t n_cols);
+ray_op_t* ray_group(ray_graph_t* g, ray_op_t** keys, uint8_t n_keys,
+                   uint16_t* agg_ops, ray_op_t** agg_ins, uint8_t n_aggs);
+ray_op_t* ray_distinct(ray_graph_t* g, ray_op_t** keys, uint8_t n_keys);
+ray_op_t* ray_pivot_op(ray_graph_t* g,
+                       ray_op_t** index_cols, uint8_t n_index,
+                       ray_op_t* pivot_col,
+                       ray_op_t* value_col,
+                       uint16_t agg_op);
+ray_op_t* ray_join(ray_graph_t* g,
+                  ray_op_t* left_table, ray_op_t** left_keys,
+                  ray_op_t* right_table, ray_op_t** right_keys,
+                  uint8_t n_keys, uint8_t join_type);
+ray_op_t* ray_antijoin(ray_graph_t* g,
+                      ray_op_t* left_table, ray_op_t** left_keys,
+                      ray_op_t* right_table, ray_op_t** right_keys,
+                      uint8_t n_keys);
+ray_op_t* ray_asof_join(ray_graph_t* g,
+                       ray_op_t* left_table, ray_op_t* right_table,
+                       ray_op_t* time_key,
+                       ray_op_t** eq_keys, uint8_t n_eq_keys,
+                       uint8_t join_type);
+ray_op_t* ray_window_op(ray_graph_t* g, ray_op_t* table_node,
+                       ray_op_t** part_keys, uint8_t n_part,
+                       ray_op_t** order_keys, uint8_t* order_descs, uint8_t n_order,
+                       uint8_t* func_kinds, ray_op_t** func_inputs,
+                       int64_t* func_params, uint8_t n_funcs,
+                       uint8_t frame_type, uint8_t frame_start, uint8_t frame_end,
+                       int64_t frame_start_n, int64_t frame_end_n);
+ray_op_t* ray_select(ray_graph_t* g, ray_op_t* input,
+                    ray_op_t** cols, uint8_t n_cols);
+ray_op_t* ray_head(ray_graph_t* g, ray_op_t* input, int64_t n);
+ray_op_t* ray_tail(ray_graph_t* g, ray_op_t* input, int64_t n);
+ray_op_t* ray_alias(ray_graph_t* g, ray_op_t* input, const char* name);
+ray_op_t* ray_materialize(ray_graph_t* g, ray_op_t* input);
+
+/* ===== Graph Ops ===== */
+
+/* Multi-table support */
+uint16_t ray_graph_add_table(ray_graph_t* g, ray_t* table);
+ray_op_t* ray_scan_table(ray_graph_t* g, uint16_t table_id, const char* col_name);
+
+/* Graph traversal */
+ray_op_t* ray_expand(ray_graph_t* g, ray_op_t* src_nodes,
+                    ray_rel_t* rel, uint8_t direction);
+ray_op_t* ray_var_expand(ray_graph_t* g, ray_op_t* start_nodes,
+                        ray_rel_t* rel, uint8_t direction,
+                        uint8_t min_depth, uint8_t max_depth,
+                        bool track_path);
+ray_op_t* ray_shortest_path(ray_graph_t* g, ray_op_t* src, ray_op_t* dst,
+                           ray_rel_t* rel, uint8_t max_depth);
+ray_op_t* ray_wco_join(ray_graph_t* g,
+                      ray_rel_t** rels, uint8_t n_rels,
+                      uint8_t n_vars);
+
+/* Graph algorithms */
+ray_op_t* ray_pagerank(ray_graph_t* g, ray_rel_t* rel,
+                      uint16_t max_iter, double damping);
+ray_op_t* ray_connected_comp(ray_graph_t* g, ray_rel_t* rel);
+ray_op_t* ray_dijkstra(ray_graph_t* g, ray_op_t* src, ray_op_t* dst,
+                      ray_rel_t* rel, const char* weight_col,
+                      uint8_t max_depth);
+ray_op_t* ray_louvain(ray_graph_t* g, ray_rel_t* rel,
+                     uint16_t max_iter);
+ray_op_t* ray_degree_cent(ray_graph_t* g, ray_rel_t* rel);
+ray_op_t* ray_topsort(ray_graph_t* g, ray_rel_t* rel);
+ray_op_t* ray_dfs(ray_graph_t* g, ray_op_t* src, ray_rel_t* rel, uint8_t max_depth);
+ray_op_t* ray_astar(ray_graph_t* g, ray_op_t* src, ray_op_t* dst,
+                  ray_rel_t* rel, const char* weight_col,
+                  const char* lat_col, const char* lon_col,
+                  ray_t* node_props, uint8_t max_depth);
+ray_op_t* ray_k_shortest(ray_graph_t* g, ray_op_t* src, ray_op_t* dst,
+                       ray_rel_t* rel, const char* weight_col, uint16_t k);
+ray_op_t* ray_cluster_coeff(ray_graph_t* g, ray_rel_t* rel);
+ray_op_t* ray_random_walk(ray_graph_t* g, ray_op_t* src, ray_rel_t* rel,
+                        uint16_t walk_length);
+ray_op_t* ray_betweenness(ray_graph_t* g, ray_rel_t* rel, uint16_t sample_size);
+ray_op_t* ray_closeness(ray_graph_t* g, ray_rel_t* rel, uint16_t sample_size);
+ray_op_t* ray_mst(ray_graph_t* g, ray_rel_t* rel, const char* weight_col);
+
+/* Vector similarity ops */
+ray_op_t* ray_cosine_sim(ray_graph_t* g, ray_op_t* emb_col,
+                        const float* query_vec, int32_t dim);
+ray_op_t* ray_euclidean_dist(ray_graph_t* g, ray_op_t* emb_col,
+                            const float* query_vec, int32_t dim);
+ray_op_t* ray_knn(ray_graph_t* g, ray_op_t* emb_col,
+                 const float* query_vec, int32_t dim, int64_t k,
+                 ray_hnsw_metric_t metric);
+
+/* HNSW-accelerated KNN (uses pre-built index instead of brute-force) */
+ray_op_t* ray_hnsw_knn(ray_graph_t* g, ray_hnsw_t* idx,
+                       const float* query_vec, int32_t dim,
+                       int64_t k, int32_t ef_search);
+
+/* Rerank ops: consume a filtered source table and return top-K nearest rows
+ * (source columns + _dist appended).  Used by `select ... nearest ... take`. */
+ray_op_t* ray_ann_rerank(ray_graph_t* g, ray_op_t* src,
+                         ray_hnsw_t* idx, const float* query_vec,
+                         int32_t dim, int64_t k, int32_t ef_search);
+ray_op_t* ray_knn_rerank(ray_graph_t* g, ray_op_t* src,
+                         int64_t col_sym, const float* query_vec,
+                         int32_t dim, int64_t k, ray_hnsw_metric_t metric);
+
+/* CSR / Relationship API */
+ray_rel_t* ray_rel_build(ray_t* from_table, const char* fk_col,
+                         int64_t n_target_nodes, bool sort_targets);
+ray_rel_t* ray_rel_from_edges(ray_t* edge_table,
+                             const char* src_col, const char* dst_col,
+                             int64_t n_src_nodes, int64_t n_dst_nodes,
+                             bool sort_targets);
+ray_err_t  ray_rel_save(ray_rel_t* rel, const char* dir);
+ray_rel_t* ray_rel_load(const char* dir);
+ray_rel_t* ray_rel_mmap(const char* dir);
+void      ray_rel_set_props(ray_rel_t* rel, ray_t* props);
+void      ray_rel_free(ray_rel_t* rel);
+const int64_t* ray_rel_neighbors(ray_rel_t* rel, int64_t node,
+                                uint8_t direction, int64_t* out_count);
+int64_t   ray_rel_n_nodes(ray_rel_t* rel, uint8_t direction);
+
+/* ===== Optimizer API ===== */
+
+ray_op_t* ray_optimize(ray_graph_t* g, ray_op_t* root);
+void     ray_fuse_pass(ray_graph_t* g, ray_op_t* root);
+
+/* ===== Plan Printer ===== */
+
+const char* ray_opcode_name(uint16_t op);
+void ray_graph_dump(ray_graph_t* g, ray_op_t* root, void* out);
+
+/* ===== Sort API ===== */
+
+/* Sort columns and return index array (I64 vector of sorted indices).
+ * Uses parallel radix sort for numerics, merge sort for strings/symbols.
+ * descs/nulls_first may be NULL (all-asc / nulls-last default). */
+ray_t* ray_sort_indices(ray_t** cols, uint8_t* descs, uint8_t* nulls_first,
+                        uint8_t n_cols, int64_t nrows);
+
+/* ===== Executor API ===== */
+
+ray_t* ray_execute(ray_graph_t* g, ray_op_t* root);
+
+/* ===== Lazy DAG Handle (Internal) ===== */
+
+#define RAY_LAZY_GRAPH(p) (*(ray_graph_t**)((p)->nullmap))        /* graph pointer stored at the start of nullmap */
+#define RAY_LAZY_OP(p)    (*(ray_op_t**)(((p)->nullmap) + 8))     /* op pointer stored 8 units past nullmap — NOTE(review): assumes byte-sized nullmap elements and pointers of at most 8 bytes; confirm */
+
+ray_op_t* ray_graph_input_vec(ray_graph_t* g, ray_t* vec);
+ray_t*    ray_lazy_wrap(ray_graph_t* g, ray_op_t* op);
+ray_t*    ray_lazy_append(ray_t* lazy, uint16_t opcode);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RAY_OPS_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/opt.c b/crates/rayforce-sys/vendor/rayforce/src/ops/opt.c
new file mode 100644
index 0000000..65c5b38
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/opt.c
@@ -0,0 +1,2031 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#if !defined(RAY_OS_WINDOWS) && !defined(_GNU_SOURCE)
+#define _GNU_SOURCE
+#endif
+
+#include "opt.h"
+#include "core/profile.h"
+#include "mem/sys.h"
+#include "mem/heap.h"
+#include <math.h>
+#include <string.h>
+
+/* Forward declaration — defined below, used by type inference and DCE passes. */
+static ray_op_ext_t* find_ext(ray_graph_t* g, uint32_t node_id);
+
+/* --------------------------------------------------------------------------
+ * Optimizer passes (v1): Type Inference + Constant Folding + Fusion + DCE
+ *
+ * Per the spec's staged rollout:
+ *   v1: Type Inference + Constant Folding + Fusion + DCE
+ *   v2: Predicate/Projection Pushdown + CSE (future)
+ *   v3: Op Reordering + Join Optimization (future)
+ * -------------------------------------------------------------------------- */
+
+/* --------------------------------------------------------------------------
+ * Pass 1: Type inference (bottom-up)
+ *
+ * Most type inference is done during graph construction (graph.c).
+ * This pass validates and propagates any missing types.
+ * -------------------------------------------------------------------------- */
+
+static int8_t promote_type(int8_t a, int8_t b) {
+    if (a == RAY_STR || b == RAY_STR) return RAY_STR;
+    if (a == RAY_F64 || b == RAY_F64) return RAY_F64;
+    /* Treat SYM/TIMESTAMP/DATE/TIME as integer-class types */
+    if (a == RAY_I64 || b == RAY_I64 || a == RAY_SYM || b == RAY_SYM ||
+        a == RAY_TIMESTAMP || b == RAY_TIMESTAMP) return RAY_I64;
+    if (a == RAY_I32 || b == RAY_I32 ||
+        a == RAY_DATE || b == RAY_DATE || a == RAY_TIME || b == RAY_TIME) return RAY_I32;
+    if (a == RAY_I16 || b == RAY_I16) return RAY_I16;
+    if (a == RAY_U8 || b == RAY_U8) return RAY_U8;
+    return RAY_BOOL;
+}
+
+static void infer_type_for_node(ray_op_t* node) {
+    /* Only fill in a type that is still unset (0); SCAN and CONST nodes are never touched. */
+    if (node->out_type != 0) return;
+    if (node->opcode == OP_SCAN || node->opcode == OP_CONST) return;
+
+    /* Opcodes in the OP_EQ..OP_GE range and AND/OR always produce BOOL. */
+    bool in_cmp_range = node->opcode >= OP_EQ && node->opcode <= OP_GE;
+    bool is_logical   = node->opcode == OP_AND || node->opcode == OP_OR;
+    if (in_cmp_range || is_logical) {
+        node->out_type = RAY_BOOL;
+        return;
+    }
+
+    /* Two present inputs: promote their types.  Otherwise inherit from inputs[0] if it exists. */
+    if (node->arity >= 2 && node->inputs[0] && node->inputs[1])
+        node->out_type = promote_type(node->inputs[0]->out_type, node->inputs[1]->out_type);
+    else if (node->arity >= 1 && node->inputs[0])
+        node->out_type = node->inputs[0]->out_type;
+}
+
+static void pass_type_inference(ray_graph_t* g, ray_op_t* root) {
+    if (!root || root->flags & OP_FLAG_DEAD) return;  /* nothing to infer for a missing or dead root */
+
+    /* Iterative DFS from root: nodes are recorded in order[] as they are popped, then processed in reverse so inputs are normally typed before consumers.
+       NOTE(review): a reversed pop order is not a strict children-first order when a node is shared by several parents — confirm this is acceptable. */
+    uint32_t nc = g->node_count;
+    uint32_t stack_cap = nc * 2 + 64;  /* extra space for high fan-out nodes */
+    uint32_t stack_local[256], order_local[256];  /* small graphs use these local buffers instead of ray_sys_alloc */
+    bool visited_stack[256];
+    uint32_t *stack = stack_cap <= 256 ? stack_local : (uint32_t*)ray_sys_alloc(stack_cap * sizeof(uint32_t));
+    uint32_t *order = nc <= 256 ? order_local : (uint32_t*)ray_sys_alloc(nc * sizeof(uint32_t));
+    bool* visited;
+    if (nc <= 256) {
+        visited = visited_stack;
+    } else {
+        visited = (bool*)ray_sys_alloc(nc * sizeof(bool));
+    }
+    if (!stack || !order || !visited) {  /* an allocation failed: release whatever was heap-allocated and skip the pass */
+        { if (stack_cap > 256) ray_sys_free(stack); if (nc > 256) { ray_sys_free(order); ray_sys_free(visited); } }
+        return;  /* NOTE(review): the cleanup above may pass NULL to ray_sys_free — confirm it tolerates NULL */
+    }
+    memset(visited, 0, nc * sizeof(bool));
+
+    int sp = 0, oc = 0;  /* sp: current stack depth, oc: number of node ids recorded in order[] */
+    stack[sp++] = root->id;
+    while (sp > 0 && oc < (int)nc) {
+        uint32_t nid = stack[--sp];
+        ray_op_t* n = &g->nodes[nid];
+        if (!n || n->flags & OP_FLAG_DEAD) continue;
+        if (visited[nid]) continue;  /* duplicates can be pushed; they are filtered here on pop */
+        visited[nid] = true;
+        order[oc++] = nid;
+        for (int i = 0; i < 2 && i < n->arity; i++) {
+            if (n->inputs[i] && sp < (int)stack_cap)  /* when the stack is full the push is silently skipped */
+                stack[sp++] = n->inputs[i]->id;
+        }
+        /* M3: Traverse ext node children so type inference reaches all
+           referenced nodes (GROUP keys/aggs, SORT/PROJECT/SELECT columns,
+           JOIN keys, WINDOW partition/order/func_inputs). */
+        ray_op_ext_t* ext = find_ext(g, nid);
+        if (ext) {
+            switch (n->opcode) {
+                case OP_GROUP:
+                    for (uint8_t k = 0; k < ext->n_keys; k++)
+                        if (ext->keys[k] && !visited[ext->keys[k]->id] && sp < (int)stack_cap)
+                            stack[sp++] = ext->keys[k]->id;
+                    for (uint8_t a = 0; a < ext->n_aggs; a++)
+                        if (ext->agg_ins[a] && !visited[ext->agg_ins[a]->id] && sp < (int)stack_cap)
+                            stack[sp++] = ext->agg_ins[a]->id;
+                    break;
+                case OP_SORT:
+                case OP_SELECT:  /* shares the sort.columns / sort.n_cols fields with OP_SORT */
+                    for (uint8_t k = 0; k < ext->sort.n_cols; k++)
+                        if (ext->sort.columns[k] && !visited[ext->sort.columns[k]->id] && sp < (int)stack_cap)
+                            stack[sp++] = ext->sort.columns[k]->id;
+                    break;
+                case OP_JOIN:
+                    for (uint8_t k = 0; k < ext->join.n_join_keys; k++) {
+                        if (ext->join.left_keys[k] && !visited[ext->join.left_keys[k]->id] && sp < (int)stack_cap)
+                            stack[sp++] = ext->join.left_keys[k]->id;
+                        if (ext->join.right_keys && ext->join.right_keys[k] &&  /* right_keys array itself may be NULL */
+                            !visited[ext->join.right_keys[k]->id] && sp < (int)stack_cap)
+                            stack[sp++] = ext->join.right_keys[k]->id;
+                    }
+                    break;
+                case OP_WINDOW_JOIN: {
+                    ray_op_ext_t* wj_ext = find_ext(g, n->id);  /* second lookup, keyed by n->id (ext above was keyed by nid) */
+                    if (wj_ext) {
+                        if (wj_ext->asof.time_key && !visited[wj_ext->asof.time_key->id] && sp < (int)stack_cap)
+                            stack[sp++] = wj_ext->asof.time_key->id;
+                        for (uint8_t k = 0; k < wj_ext->asof.n_eq_keys; k++) {
+                            if (wj_ext->asof.eq_keys[k] && !visited[wj_ext->asof.eq_keys[k]->id] && sp < (int)stack_cap)
+                                stack[sp++] = wj_ext->asof.eq_keys[k]->id;
+                        }
+                    }
+                    break;
+                }
+                case OP_WINDOW:
+                    for (uint8_t k = 0; k < ext->window.n_part_keys; k++)
+                        if (ext->window.part_keys[k] && !visited[ext->window.part_keys[k]->id] && sp < (int)stack_cap)
+                            stack[sp++] = ext->window.part_keys[k]->id;
+                    for (uint8_t k = 0; k < ext->window.n_order_keys; k++)
+                        if (ext->window.order_keys[k] && !visited[ext->window.order_keys[k]->id] && sp < (int)stack_cap)
+                            stack[sp++] = ext->window.order_keys[k]->id;
+                    for (uint8_t f = 0; f < ext->window.n_funcs; f++)
+                        if (ext->window.func_inputs[f] && !visited[ext->window.func_inputs[f]->id] && sp < (int)stack_cap)
+                            stack[sp++] = ext->window.func_inputs[f]->id;
+                    break;
+                /* M3b: 3-input ops store third operand node ID in ext->literal */
+                case OP_IF:
+                case OP_SUBSTR:
+                case OP_REPLACE: {
+                    uint32_t third_id = (uint32_t)(uintptr_t)ext->literal;  /* the pointer-typed slot is reinterpreted as an integer id */
+                    if (third_id < nc && !visited[third_id] && sp < (int)stack_cap)
+                        stack[sp++] = third_id;
+                    break;
+                }
+                /* M3c: OP_CONCAT trailing arg node IDs beyond inputs[0..1] */
+                case OP_CONCAT:
+                    if (ext->sym >= 2) {
+                        int n_args = (int)ext->sym;  /* ext->sym is read as the total argument count here */
+                        uint32_t* trail = (uint32_t*)((char*)(ext + 1));  /* ids are read from the memory directly after the ext struct */
+                        for (int j = 2; j < n_args; j++) {
+                            uint32_t arg_id = trail[j - 2];
+                            if (arg_id < nc && !visited[arg_id] && sp < (int)stack_cap)
+                                stack[sp++] = arg_id;
+                        }
+                    }
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    /* Walk order[] back to front so later-discovered nodes are inferred first */
+    for (int i = oc - 1; i >= 0; i--)
+        infer_type_for_node(&g->nodes[order[i]]);
+
+    { if (stack_cap > 256) ray_sys_free(stack); if (nc > 256) { ray_sys_free(order); ray_sys_free(visited); } }
+}
+
+/* --------------------------------------------------------------------------
+ * Pass 2: Constant folding
+ *
+ * If all inputs to an element-wise op are OP_CONST, evaluate immediately
+ * and replace the node with a new OP_CONST.
+ * -------------------------------------------------------------------------- */
+
+static bool is_const(ray_op_t* n) {
+    return n != NULL && OP_CONST == n->opcode;  /* NULL-safe: a missing operand is not a constant */
+}
+
+/* O(ext_count) per call; acceptable for typical graph sizes (tens to
+   hundreds of nodes).  L2: intentional duplication to keep files
+   self-contained — also present in fuse.c. */
+static ray_op_ext_t* find_ext(ray_graph_t* g, uint32_t node_id) {
+    for (uint32_t slot = 0; slot < g->ext_count; slot++) {
+        ray_op_ext_t* cand = g->ext_nodes[slot];  /* NULL slots are skipped */
+        if (cand != NULL && cand->base.id == node_id) return cand;  /* first match wins */
+    }
+    return NULL;  /* no tracked ext record carries this node id */
+}
+
+static bool track_ext_node(ray_graph_t* g, ray_op_ext_t* ext) {  /* append ext to g->ext_nodes, growing the array when full */
+    const bool full = g->ext_count >= g->ext_cap;
+    if (full && g->ext_cap > UINT32_MAX / 2) return false;  /* doubling would overflow uint32_t */
+    if (full) {
+        const uint32_t grown = g->ext_cap ? g->ext_cap * 2 : 16;  /* first growth goes to 16 slots, then doubles */
+        ray_op_ext_t** moved = (ray_op_ext_t**)ray_sys_realloc(g->ext_nodes, grown * sizeof(ray_op_ext_t*));
+        if (moved == NULL) return false;  /* g->ext_nodes and g->ext_cap are left as they were */
+        g->ext_nodes = moved;
+        g->ext_cap = grown;
+    }
+    g->ext_nodes[g->ext_count++] = ext;
+    return true;
+}
+
+static ray_op_ext_t* ensure_ext_node(ray_graph_t* g, uint32_t node_id) {
+    ray_op_ext_t* found = find_ext(g, node_id);
+    if (found != NULL) return found;  /* already tracked: hand back the existing record */
+
+    ray_op_ext_t* fresh = (ray_op_ext_t*)ray_sys_alloc(sizeof(ray_op_ext_t));
+    if (fresh == NULL) return NULL;
+    /* M1: start from an all-zero record so no field (literal, keys,
+       agg_ins, ...) holds garbage before the caller fills it in. */
+    memset(fresh, 0, sizeof(ray_op_ext_t));
+    fresh->base.id = node_id;
+    if (track_ext_node(g, fresh)) return fresh;
+
+    /* The graph could not register the record: release it and report failure. */
+    ray_sys_free(fresh);
+    return NULL;
+}
+
+/* Widen a non-null scalar atom into both a double (*out_f) and an int64 (*out_i) view; *is_f64 is true only for F64 atoms.
+ * Returns false, leaving the outputs untouched, for NULL, non-atoms, null atoms and any type not listed below. */
+static bool atom_to_numeric(ray_t* v, double* out_f, int64_t* out_i, bool* is_f64) {
+    if (!v || !ray_is_atom(v) || RAY_ATOM_IS_NULL(v)) return false;
+    switch (v->type) {
+        case -RAY_F64:
+            *out_f = v->f64;
+            *out_i = (v->f64 >= -9223372036854775808.0 && v->f64 < 9223372036854775808.0) ? (int64_t)v->f64 : 0;  /* NaN, ±Inf and |x| >= 2^63 would be an undefined cast, so they yield 0 */
+            *is_f64 = true;
+            return true;
+        case -RAY_I64:
+        case -RAY_SYM:
+        case -RAY_DATE:  /* NOTE(review): DATE/TIME are read through ->i64 although promote_type ranks them with I32 — confirm atom storage width */
+        case -RAY_TIME:
+        case -RAY_TIMESTAMP:
+            *out_i = v->i64;
+            *out_f = (double)v->i64;
+            *is_f64 = false;
+            return true;
+        case -RAY_I32:
+            *out_i = (int64_t)v->i32;
+            *out_f = (double)v->i32;
+            *is_f64 = false;
+            return true;
+        case -RAY_I16:
+            *out_i = (int64_t)v->i16;
+            *out_f = (double)v->i16;
+            *is_f64 = false;
+            return true;
+        case -RAY_U8:
+        case -RAY_BOOL:  /* both are read through the u8 member */
+            *out_i = (int64_t)v->u8;
+            *out_f = (double)v->u8;
+            *is_f64 = false;
+            return true;
+        default: return false;  /* non-numeric atom types are not folded */
+    }
+}
+
+/* Rewrite `node` in place as an OP_CONST carrying `literal`.
+ * Returns false (nothing changed) when the node already owns an ext node
+ * or when a new ext node cannot be created. */
+static bool replace_with_const(ray_graph_t* g, ray_op_t* node, ray_t* literal) {
+    /* H3: a node that already has an ext node (GROUP, SORT, JOIN, ...) is
+       structural; writing a literal into its ext union would clobber that
+       data, so such nodes are never constant-folded. */
+    if (find_ext(g, node->id) != NULL) return false;
+
+    ray_op_ext_t* ext = ensure_ext_node(g, node->id);
+    if (ext == NULL) return false;
+
+    /* Build the replacement header from the current node. */
+    ray_op_t as_const = *node;
+    as_const.opcode = OP_CONST;
+    as_const.arity = 0;
+    as_const.inputs[0] = NULL;
+    as_const.inputs[1] = NULL;
+    as_const.flags &= (uint8_t)~OP_FLAG_FUSED;
+    /* Atom type tags are negative; out_type stores the positive tag. */
+    if (literal->type < 0)
+        as_const.out_type = (int8_t)(-(int)literal->type);
+    else
+        as_const.out_type = literal->type;
+
+    ext->base = as_const;
+    ext->literal = literal;
+
+    /* Keep the caller's node and the graph's node array in sync with the
+       ext copy. */
+    *node = as_const;
+    g->nodes[as_const.id] = as_const;
+    return true;
+}
+
+/* Fold a unary element-wise op whose single input is a constant numeric
+ * atom.  On success the node is rewritten as OP_CONST and true is
+ * returned; otherwise the node is left alone and false is returned. */
+static bool fold_unary_const(ray_graph_t* g, ray_op_t* node) {
+    ray_op_t* arg = node->inputs[0];
+    if (!is_const(arg)) return false;
+
+    ray_op_ext_t* arg_ext = find_ext(g, arg->id);
+    if (arg_ext == NULL) return false;
+    if (arg_ext->literal == NULL || !ray_is_atom(arg_ext->literal)) return false;
+
+    double fval = 0.0;
+    int64_t ival = 0;
+    bool from_f64 = false;
+    if (!atom_to_numeric(arg_ext->literal, &fval, &ival, &from_f64)) return false;
+
+    /* Double view of the operand, used by sqrt/log/exp. */
+    const double as_double = from_f64 ? fval : (double)ival;
+
+    ray_t* result = NULL;
+    switch (node->opcode) {
+        case OP_NEG:
+            if (from_f64) {
+                result = ray_f64(-fval);
+            } else {
+                if (ival == INT64_MIN) return false;  /* -INT64_MIN overflows */
+                result = ray_i64(-ival);
+            }
+            break;
+        case OP_ABS:
+            if (from_f64) {
+                result = ray_f64(fabs(fval));
+            } else {
+                if (ival == INT64_MIN) return false;  /* -INT64_MIN overflows */
+                result = ray_i64(ival < 0 ? -ival : ival);
+            }
+            break;
+        case OP_NOT:
+            result = ray_bool(from_f64 ? fval == 0.0 : ival == 0);
+            break;
+        case OP_SQRT:
+            result = ray_f64(sqrt(as_double));
+            break;
+        case OP_LOG:
+            result = ray_f64(log(as_double));
+            break;
+        case OP_EXP:
+            result = ray_f64(exp(as_double));
+            break;
+        case OP_CEIL:
+            /* Integers are already whole: ceil is the identity. */
+            result = from_f64 ? ray_f64(ceil(fval)) : ray_i64(ival);
+            break;
+        case OP_FLOOR:
+            /* Integers are already whole: floor is the identity. */
+            result = from_f64 ? ray_f64(floor(fval)) : ray_i64(ival);
+            break;
+        default:
+            return false;
+    }
+
+    if (result == NULL || RAY_IS_ERR(result)) return false;
+    if (replace_with_const(g, node, result)) return true;
+
+    /* The node kept its old form, so the freshly built atom is unused. */
+    ray_release(result);
+    return false;
+}
+
+/* Convert `d` to int64_t only when it is inside the int64 range.
+ * Casting NaN, ±Inf or an out-of-range double to an integer is undefined
+ * behaviour in C, so callers give up on folding instead.  2^63 is exactly
+ * representable as a double, which makes both bounds exact; NaN fails
+ * both comparisons and is rejected. */
+static bool fold_f64_to_i64(double d, int64_t* out) {
+    if (!(d >= -9223372036854775808.0 && d < 9223372036854775808.0)) return false;
+    *out = (int64_t)d;
+    return true;
+}
+
+/* Fold a binary element-wise op whose two inputs are constant numeric
+ * atoms.  The arithmetic domain is selected by node->out_type:
+ *
+ *   RAY_F64                     IEEE double arithmetic
+ *   RAY_I64                     wrapping 64-bit integer arithmetic
+ *   RAY_I32/RAY_DATE/RAY_TIME   wrapping 32-bit integer arithmetic
+ *   RAY_BOOL                    comparisons and logical AND/OR
+ *
+ * Integer division/modulo by zero, and MIN / -1, fold to 0.
+ * Returns true when the node was rewritten as OP_CONST; returns false and
+ * leaves the node untouched when the op cannot be folded (unsupported
+ * opcode/type, an F64 operand that does not fit the integer domain, or
+ * the replacement failed). */
+static bool fold_binary_const(ray_graph_t* g, ray_op_t* node) {
+    ray_op_t* lhs = node->inputs[0];
+    ray_op_t* rhs = node->inputs[1];
+    if (!is_const(lhs) || !is_const(rhs)) return false;
+
+    ray_op_ext_t* le = find_ext(g, lhs->id);
+    ray_op_ext_t* re = find_ext(g, rhs->id);
+    if (!le || !re || !le->literal || !re->literal) return false;
+    if (!ray_is_atom(le->literal) || !ray_is_atom(re->literal)) return false;
+
+    double lf = 0.0, rf = 0.0;
+    int64_t li = 0, ri = 0;
+    bool l_is_f64 = false, r_is_f64 = false;
+    if (!atom_to_numeric(le->literal, &lf, &li, &l_is_f64)) return false;
+    if (!atom_to_numeric(re->literal, &rf, &ri, &r_is_f64)) return false;
+
+    ray_t* folded = NULL;
+    switch (node->out_type) {
+        case RAY_F64: {
+            double lv = l_is_f64 ? lf : (double)li;
+            double rv = r_is_f64 ? rf : (double)ri;
+            double r = 0.0;
+            switch (node->opcode) {
+                case OP_ADD: r = lv + rv; break;
+                case OP_SUB: r = lv - rv; break;
+                case OP_MUL: r = lv * rv; break;
+                case OP_DIV: r = lv / rv; break;  /* IEEE 754: ±Inf or NaN */
+                case OP_MOD: r = fmod(lv, rv); break;  /* IEEE 754: NaN for rv==0 */
+                /* fmin/fmax return the non-NaN operand when exactly one
+                   side is NaN; they only yield NaN when both are NaN.
+                   NOTE(review): confirm the executor treats NaN the same. */
+                case OP_MIN2: r = fmin(lv, rv); break;
+                case OP_MAX2: r = fmax(lv, rv); break;
+                default: return false;
+            }
+            folded = ray_f64(r);
+            break;
+        }
+        case RAY_I64: {
+            int64_t lv = li;
+            int64_t rv = ri;
+            /* An F64 operand that is NaN/Inf/out of range has no int64
+               value: leave the node for the executor. */
+            if (l_is_f64 && !fold_f64_to_i64(lf, &lv)) return false;
+            if (r_is_f64 && !fold_f64_to_i64(rf, &rv)) return false;
+            int64_t r = 0;
+            switch (node->opcode) {
+                /* Unsigned arithmetic gives defined wrap-around. */
+                case OP_ADD: r = (int64_t)((uint64_t)lv + (uint64_t)rv); break;
+                case OP_SUB: r = (int64_t)((uint64_t)lv - (uint64_t)rv); break;
+                case OP_MUL: r = (int64_t)((uint64_t)lv * (uint64_t)rv); break;
+                case OP_DIV:
+                    r = (rv != 0 && !(lv == INT64_MIN && rv == -1)) ? lv / rv : 0;
+                    break;
+                case OP_MOD:
+                    r = (rv != 0 && !(lv == INT64_MIN && rv == -1)) ? lv % rv : 0;
+                    break;
+                case OP_MIN2: r = lv < rv ? lv : rv; break;
+                case OP_MAX2: r = lv > rv ? lv : rv; break;
+                default: return false;
+            }
+            folded = ray_i64(r);
+            break;
+        }
+        case RAY_BOOL: {
+            bool r = false;
+            if (!l_is_f64 && !r_is_f64) {
+                /* Both operands are integers: compare them exactly.
+                   Going through double would merge distinct values above
+                   2^53 (e.g. 2^53 and 2^53+1 would compare equal). */
+                switch (node->opcode) {
+                    case OP_EQ:  r = li == ri; break;
+                    case OP_NE:  r = li != ri; break;
+                    case OP_LT:  r = li < ri; break;
+                    case OP_LE:  r = li <= ri; break;
+                    case OP_GT:  r = li > ri; break;
+                    case OP_GE:  r = li >= ri; break;
+                    case OP_AND: r = (li != 0) && (ri != 0); break;
+                    case OP_OR:  r = (li != 0) || (ri != 0); break;
+                    default: return false;
+                }
+            } else {
+                /* At least one F64 operand.  NaN comparison follows
+                   IEEE 754; SQL NULL handled separately in executor. */
+                double lv = l_is_f64 ? lf : (double)li;
+                double rv = r_is_f64 ? rf : (double)ri;
+                switch (node->opcode) {
+                    case OP_EQ:  r = lv == rv; break;
+                    case OP_NE:  r = lv != rv; break;
+                    case OP_LT:  r = lv < rv; break;
+                    case OP_LE:  r = lv <= rv; break;
+                    case OP_GT:  r = lv > rv; break;
+                    case OP_GE:  r = lv >= rv; break;
+                    case OP_AND: r = (lv != 0.0) && (rv != 0.0); break;
+                    case OP_OR:  r = (lv != 0.0) || (rv != 0.0); break;
+                    default: return false;
+                }
+            }
+            folded = ray_bool(r);
+            break;
+        }
+        case RAY_I32: case RAY_DATE: case RAY_TIME: {
+            int64_t lw = li;
+            int64_t rw = ri;
+            if (l_is_f64 && !fold_f64_to_i64(lf, &lw)) return false;
+            if (r_is_f64 && !fold_f64_to_i64(rf, &rw)) return false;
+            int32_t lv = (int32_t)lw;
+            int32_t rv = (int32_t)rw;
+            int32_t r = 0;
+            switch (node->opcode) {
+                case OP_ADD: r = (int32_t)((uint32_t)lv + (uint32_t)rv); break;
+                case OP_SUB: r = (int32_t)((uint32_t)lv - (uint32_t)rv); break;
+                case OP_MUL: r = (int32_t)((uint32_t)lv * (uint32_t)rv); break;
+                case OP_DIV:
+                    r = (rv != 0 && !(lv == INT32_MIN && rv == -1)) ? lv / rv : 0;
+                    break;
+                case OP_MOD:
+                    r = (rv != 0 && !(lv == INT32_MIN && rv == -1)) ? lv % rv : 0;
+                    break;
+                case OP_MIN2: r = lv < rv ? lv : rv; break;
+                case OP_MAX2: r = lv > rv ? lv : rv; break;
+                default: return false;
+            }
+            /* NOTE(review): the folded atom is always an I32, even when
+               out_type was RAY_DATE/RAY_TIME — confirm this is intended. */
+            folded = ray_i32(r);
+            break;
+        }
+        default:
+            return false;
+    }
+
+    if (!folded || RAY_IS_ERR(folded)) return false;
+    if (!replace_with_const(g, node, folded)) {
+        /* Node was not rewritten, so the new atom is unused. */
+        ray_release(folded);
+        return false;
+    }
+    return true;
+}
+
+/* Truthiness of a numeric atom: *out becomes true iff the value is
+ * non-zero.  Returns false, leaving *out untouched, when atom_to_numeric
+ * rejects the atom. */
+static bool atom_to_bool(ray_t* v, bool* out) {
+    double as_f = 0.0;
+    int64_t as_i = 0;
+    bool from_f64 = false;
+    if (!atom_to_numeric(v, &as_f, &as_i, &from_f64)) return false;
+
+    *out = from_f64 ? (as_f != 0.0) : (as_i != 0);
+    return true;
+}
+
+/* Simplify a two-input OP_FILTER whose predicate is a constant atom:
+ *   - truthy predicate: the node becomes a one-input OP_MATERIALIZE;
+ *   - falsy predicate:  the node becomes a one-input OP_HEAD whose ext
+ *     `sym` is 0 and whose est_rows is 0.
+ * Returns true when the node was rewritten, false otherwise. */
+static bool fold_filter_const_predicate(ray_graph_t* g, ray_op_t* node) {
+    if (node->opcode != OP_FILTER) return false;
+    if (node->arity != 2) return false;
+
+    ray_op_t* predicate = node->inputs[1];
+    if (!is_const(predicate)) return false;
+
+    ray_op_ext_t* predicate_ext = find_ext(g, predicate->id);
+    if (predicate_ext == NULL || predicate_ext->literal == NULL) return false;
+    if (!ray_is_atom(predicate_ext->literal)) return false;
+
+    bool always_true = false;
+    if (!atom_to_bool(predicate_ext->literal, &always_true)) return false;
+
+    if (!always_true) {
+        /* Nothing passes the filter: keep only the first input and cap
+           the output through OP_HEAD with sym = 0. */
+        ray_op_ext_t* ext = ensure_ext_node(g, node->id);
+        if (ext == NULL) return false;
+
+        ext->base = *node;
+        ext->base.opcode = OP_HEAD;
+        ext->base.arity = 1;
+        ext->base.inputs[1] = NULL;
+        ext->base.est_rows = 0;
+        ext->base.flags &= (uint8_t)~OP_FLAG_FUSED;
+        ext->sym = 0;
+
+        *node = ext->base;
+        g->nodes[node->id] = ext->base;
+        return true;
+    }
+
+    /* Everything passes: drop the predicate input. */
+    node->opcode = OP_MATERIALIZE;
+    node->arity = 1;
+    node->inputs[1] = NULL;
+    node->flags &= (uint8_t)~OP_FLAG_FUSED;
+    g->nodes[node->id] = *node;
+    return true;
+}
+
+/* Fold reduction(OP_TIL(n)) → closed-form result.
+ * sum(0..n-1) = n*(n-1)/2,  min(0..n-1) = 0,  max(0..n-1) = n-1,
+ * count(0..n-1) = n,  avg(0..n-1) = (n-1)/2.0,
+ * first(0..n-1) = 0,  last(0..n-1) = n-1.
+ *
+ * Only folds when n > 0.  Returns true when the node was rewritten as
+ * OP_CONST, false when it was left untouched (not a reduction over TIL,
+ * unsupported reduction, a sum that does not fit in int64, or the
+ * replacement failed). */
+static bool fold_reduction_til(ray_graph_t* g, ray_op_t* node) {
+    if (node->arity != 1) return false;
+    ray_op_t* input = node->inputs[0];
+    if (!input || input->opcode != OP_TIL) return false;
+    ray_op_ext_t* til_ext = find_ext(g, input->id);
+    if (!til_ext || !til_ext->literal) return false;
+    int64_t n = til_ext->literal->i64;
+    if (n <= 0) return false;
+
+    ray_t* folded = NULL;
+    switch (node->opcode) {
+        case OP_SUM: {
+            /* Exactly one of n and n-1 is even, so halve that factor
+               first: this avoids the signed overflow of n*(n-1) for
+               large n while the final sum still fits. */
+            int64_t a = (n % 2 == 0) ? n / 2 : n;
+            int64_t b = (n % 2 == 0) ? n - 1 : (n - 1) / 2;
+            /* If even the halved product overflows, let the executor
+               compute the sum instead of folding a bogus constant. */
+            if (b != 0 && a > INT64_MAX / b) return false;
+            folded = ray_i64(a * b);
+            break;
+        }
+        case OP_MIN:   folded = ray_i64(0); break;
+        case OP_MAX:   folded = ray_i64(n - 1); break;
+        case OP_COUNT: folded = ray_i64(n); break;
+        case OP_AVG:   folded = ray_f64((double)(n - 1) / 2.0); break;
+        case OP_FIRST: folded = ray_i64(0); break;
+        case OP_LAST:  folded = ray_i64(n - 1); break;
+        default: return false;
+    }
+    if (!folded || RAY_IS_ERR(folded)) return false;
+    if (!replace_with_const(g, node, folded)) { ray_release(folded); return false; }
+    return true;
+}
+
+/* Try every local folding rule on `node`.  Each guard re-reads
+ * node->arity and node->opcode because a successful fold rewrites the
+ * node in place. */
+static void fold_node(ray_graph_t* g, ray_op_t* node) {
+    /* Unary element-wise op over a constant input. */
+    bool unary_elementwise =
+        node->arity == 1 && OP_NEG <= node->opcode && node->opcode <= OP_FLOOR;
+    if (unary_elementwise)
+        (void)fold_unary_const(g, node);
+
+    /* Binary element-wise op over two constant inputs. */
+    bool binary_elementwise =
+        node->arity == 2 && OP_ADD <= node->opcode && node->opcode <= OP_MAX2;
+    if (binary_elementwise)
+        (void)fold_binary_const(g, node);
+
+    /* reduction(til(n)) has a closed form. */
+    bool reduction =
+        node->arity == 1 && OP_SUM <= node->opcode && node->opcode <= OP_LAST;
+    if (reduction)
+        (void)fold_reduction_til(g, node);
+
+    /* FILTER with a constant predicate collapses to pass-through/empty;
+       the callee checks the opcode itself. */
+    (void)fold_filter_const_predicate(g, node);
+}
+
+/* Constant-folding pass.
+ *
+ * Walks the graph from `root` with an explicit stack, recording every
+ * live node once in `order` in the order it is first popped, then calls
+ * fold_node() on the recorded nodes in reverse.
+ *
+ * Besides inputs[0..1], children referenced only through ext nodes
+ * (GROUP/SORT/SELECT/JOIN/WINDOW_JOIN/WINDOW operands, the third operand
+ * of IF/SUBSTR/REPLACE, CONCAT trailing args) are followed as well.
+ *
+ * Small graphs (<= 256 nodes / stack slots) use on-stack buffers; larger
+ * ones allocate.  If any allocation fails the pass is skipped.  Children
+ * that do not fit in the stack are silently dropped, which only means
+ * they are not folded.
+ *
+ * NOTE(review): `order` is a DFS visit order, so reversing it puts a node
+ * after the descendants discovered through it, but a child shared by two
+ * parents can still be processed after one of them — confirm that a
+ * missed fold in that case is acceptable.
+ * NOTE(review): unlike mark_live, OP_ANTIJOIN and OP_PIVOT operands are
+ * not traversed here — presumably an omission; verify. */
+static void pass_constant_fold(ray_graph_t* g, ray_op_t* root) {
+    if (!root || root->flags & OP_FLAG_DEAD) return;
+
+    /* Iterative post-order: collect nodes, then process in reverse
+       (children before parents). */
+    uint32_t nc = g->node_count;
+    uint32_t stack_cap = nc * 2 + 64;  /* extra space for high fan-out nodes */
+    uint32_t stack_local[256], order_local[256];
+    bool visited_stack[256];
+    /* Heap buffers are used only when the on-stack ones are too small. */
+    uint32_t *stack = stack_cap <= 256 ? stack_local : (uint32_t*)ray_sys_alloc(stack_cap * sizeof(uint32_t));
+    uint32_t *order = nc <= 256 ? order_local : (uint32_t*)ray_sys_alloc(nc * sizeof(uint32_t));
+    bool* visited;
+    if (nc <= 256) {
+        visited = visited_stack;
+    } else {
+        visited = (bool*)ray_sys_alloc(nc * sizeof(bool));
+    }
+    if (!stack || !order || !visited) {
+        /* Release whichever heap buffers exist and skip the pass.
+           NOTE(review): this may pass NULL to ray_sys_free — confirm that
+           is allowed. */
+        { if (stack_cap > 256) ray_sys_free(stack); if (nc > 256) { ray_sys_free(order); ray_sys_free(visited); } }
+        return;
+    }
+    memset(visited, 0, nc * sizeof(bool));
+
+    /* sp = stack depth, oc = number of nodes recorded in `order`. */
+    int sp = 0, oc = 0;
+    stack[sp++] = root->id;
+    while (sp > 0 && oc < (int)nc) {
+        uint32_t nid = stack[--sp];
+        ray_op_t* n = &g->nodes[nid];
+        if (!n || n->flags & OP_FLAG_DEAD) continue;
+        if (visited[nid]) continue;
+        visited[nid] = true;
+        order[oc++] = nid;
+        /* Direct inputs: pushed without a visited check; duplicates are
+           filtered when popped. */
+        for (int i = 0; i < 2 && i < n->arity; i++) {
+            if (n->inputs[i] && sp < (int)stack_cap)
+                stack[sp++] = n->inputs[i]->id;
+        }
+        /* H1: Traverse ext-node children so constant folding reaches all
+           referenced nodes (GROUP keys/aggs, SORT/PROJECT/SELECT columns,
+           JOIN keys, WINDOW partition/order/func_inputs). */
+        ray_op_ext_t* ext = find_ext(g, nid);
+        if (ext) {
+            switch (n->opcode) {
+                case OP_GROUP:
+                    for (uint8_t k = 0; k < ext->n_keys; k++)
+                        if (ext->keys[k] && !visited[ext->keys[k]->id] && sp < (int)stack_cap)
+                            stack[sp++] = ext->keys[k]->id;
+                    for (uint8_t a = 0; a < ext->n_aggs; a++)
+                        if (ext->agg_ins[a] && !visited[ext->agg_ins[a]->id] && sp < (int)stack_cap)
+                            stack[sp++] = ext->agg_ins[a]->id;
+                    break;
+                case OP_SORT:
+                case OP_SELECT:
+                    for (uint8_t k = 0; k < ext->sort.n_cols; k++)
+                        if (ext->sort.columns[k] && !visited[ext->sort.columns[k]->id] && sp < (int)stack_cap)
+                            stack[sp++] = ext->sort.columns[k]->id;
+                    break;
+                case OP_JOIN:
+                    for (uint8_t k = 0; k < ext->join.n_join_keys; k++) {
+                        if (ext->join.left_keys[k] && !visited[ext->join.left_keys[k]->id] && sp < (int)stack_cap)
+                            stack[sp++] = ext->join.left_keys[k]->id;
+                        if (ext->join.right_keys && ext->join.right_keys[k] &&
+                            !visited[ext->join.right_keys[k]->id] && sp < (int)stack_cap)
+                            stack[sp++] = ext->join.right_keys[k]->id;
+                    }
+                    break;
+                case OP_WINDOW_JOIN: {
+                    /* Same lookup as `ext` above (n->id == nid). */
+                    ray_op_ext_t* wj_ext = find_ext(g, n->id);
+                    if (wj_ext) {
+                        if (wj_ext->asof.time_key && !visited[wj_ext->asof.time_key->id] && sp < (int)stack_cap)
+                            stack[sp++] = wj_ext->asof.time_key->id;
+                        for (uint8_t k = 0; k < wj_ext->asof.n_eq_keys; k++) {
+                            if (wj_ext->asof.eq_keys[k] && !visited[wj_ext->asof.eq_keys[k]->id] && sp < (int)stack_cap)
+                                stack[sp++] = wj_ext->asof.eq_keys[k]->id;
+                        }
+                    }
+                    break;
+                }
+                case OP_WINDOW:
+                    for (uint8_t k = 0; k < ext->window.n_part_keys; k++)
+                        if (ext->window.part_keys[k] && !visited[ext->window.part_keys[k]->id] && sp < (int)stack_cap)
+                            stack[sp++] = ext->window.part_keys[k]->id;
+                    for (uint8_t k = 0; k < ext->window.n_order_keys; k++)
+                        if (ext->window.order_keys[k] && !visited[ext->window.order_keys[k]->id] && sp < (int)stack_cap)
+                            stack[sp++] = ext->window.order_keys[k]->id;
+                    for (uint8_t f = 0; f < ext->window.n_funcs; f++)
+                        if (ext->window.func_inputs[f] && !visited[ext->window.func_inputs[f]->id] && sp < (int)stack_cap)
+                            stack[sp++] = ext->window.func_inputs[f]->id;
+                    break;
+                /* H1b: 3-input ops store third operand node ID in ext->literal */
+                case OP_IF:
+                case OP_SUBSTR:
+                case OP_REPLACE: {
+                    /* The pointer-sized field is reinterpreted as a node
+                       ID; IDs outside the node array are ignored. */
+                    uint32_t third_id = (uint32_t)(uintptr_t)ext->literal;
+                    if (third_id < nc && !visited[third_id] && sp < (int)stack_cap)
+                        stack[sp++] = third_id;
+                    break;
+                }
+                /* H1c: OP_CONCAT trailing arg node IDs beyond inputs[0..1] */
+                case OP_CONCAT:
+                    if (ext->sym >= 2) {
+                        /* ext->sym is the total argument count; args 2..
+                           are read as uint32_t IDs stored right after the
+                           ext struct. */
+                        int n_args = (int)ext->sym;
+                        uint32_t* trail = (uint32_t*)((char*)(ext + 1));
+                        for (int j = 2; j < n_args; j++) {
+                            uint32_t arg_id = trail[j - 2];
+                            if (arg_id < nc && !visited[arg_id] && sp < (int)stack_cap)
+                                stack[sp++] = arg_id;
+                        }
+                    }
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    /* Process in reverse order (children before parents) */
+    for (int i = oc - 1; i >= 0; i--)
+        fold_node(g, &g->nodes[order[i]]);
+
+    /* Free only the buffers that were heap-allocated. */
+    { if (stack_cap > 256) ray_sys_free(stack); if (nc > 256) { ray_sys_free(order); ray_sys_free(visited); } }
+}
+
+/* --------------------------------------------------------------------------
+ * Pass 3: Dead code elimination
+ *
+ * Mark nodes unreachable from root as DEAD.
+ * -------------------------------------------------------------------------- */
+
+/* Work list used by mark_live: IDs of nodes that are already flagged
+ * live but whose children have not been scanned yet. */
+typedef struct {
+    uint32_t* ids;   /* pending node IDs */
+    uint32_t  len;   /* number of pending IDs */
+    uint32_t  cap;   /* capacity of `ids` */
+    uint32_t  nc;    /* node count; valid IDs are < nc */
+    bool*     live;  /* caller's liveness flags, nc entries */
+} live_worklist_t;
+
+/* Flag node `id` live and queue it for scanning.  Out-of-range IDs and
+ * nodes that are already live are ignored, so every node is queued at
+ * most once and the work list never needs more than `nc` slots. */
+static void live_push_id(live_worklist_t* wl, uint32_t id) {
+    if (id >= wl->nc || wl->live[id]) return;
+    if (wl->len >= wl->cap) return;  /* defensive; unreachable while cap >= nc */
+    wl->live[id] = true;
+    wl->ids[wl->len++] = id;
+}
+
+/* Same as live_push_id for an optional node pointer. */
+static void live_push(live_worklist_t* wl, const ray_op_t* op) {
+    if (op) live_push_id(wl, op->id);
+}
+
+/* Set live[id] = true for every node reachable from `root`.
+ *
+ *   g     graph whose nodes/ext nodes are traversed
+ *   root  traversal start; NULL leaves `live` untouched
+ *   live  caller-owned array of g->node_count flags
+ *
+ * Reachability follows inputs[0..1] plus the operands that only live in
+ * ext nodes: the third operand of IF/SUBSTR/REPLACE, CONCAT trailing
+ * args, and the key/column operands of GROUP, SORT, SELECT, JOIN,
+ * ANTIJOIN, PIVOT, WINDOW_JOIN and WINDOW.
+ *
+ * Nodes are flagged when they are queued, not when they are popped, so
+ * the work list is bounded by node_count and no child can be dropped.
+ * If the work list cannot be allocated, every node is flagged live: the
+ * caller marks non-live nodes DEAD, so failing "all live" keeps the
+ * graph intact instead of killing it. */
+static void mark_live(ray_graph_t* g, ray_op_t* root, bool* live) {
+    if (!root) return;
+
+    uint32_t nc = g->node_count;
+    uint32_t local_ids[256];
+    live_worklist_t wl;
+    wl.ids = local_ids;
+    wl.len = 0;
+    wl.cap = 256;
+    wl.nc = nc;
+    wl.live = live;
+
+    if (nc > 256) {
+        wl.ids = (uint32_t*)ray_sys_alloc((size_t)nc * sizeof(uint32_t));
+        wl.cap = nc;
+        if (!wl.ids) {
+            /* Cannot traverse: be conservative and keep everything. */
+            for (uint32_t i = 0; i < nc; i++) live[i] = true;
+            return;
+        }
+    }
+
+    live_push_id(&wl, root->id);
+    while (wl.len > 0) {
+        uint32_t nid = wl.ids[--wl.len];
+        ray_op_t* n = &g->nodes[nid];
+
+        live_push(&wl, n->inputs[0]);
+        live_push(&wl, n->inputs[1]);
+
+        /* Operands that are only reachable through the ext node.  The
+           ext lookup is a linear scan, so it is done only for opcodes
+           that actually keep operands there. */
+        ray_op_ext_t* ext = NULL;
+        switch (n->opcode) {
+            /* H4: 3-input ops store the third operand node ID as
+               (uintptr_t)ext->literal. */
+            case OP_IF:
+            case OP_SUBSTR:
+            case OP_REPLACE:
+                ext = find_ext(g, nid);
+                if (ext) live_push_id(&wl, (uint32_t)(uintptr_t)ext->literal);
+                break;
+            /* H5: OP_CONCAT stores extra arg IDs (beyond inputs[0..1]) as
+               uint32_t values in trailing bytes after the ext node;
+               ext->sym holds the total arg count. */
+            case OP_CONCAT:
+                ext = find_ext(g, nid);
+                /* M4: trailing values only exist for more than 2 args. */
+                if (ext && ext->sym >= 2) {
+                    int n_args = (int)ext->sym;
+                    uint32_t* trail = (uint32_t*)((char*)(ext + 1));
+                    for (int i = 2; i < n_args; i++)
+                        live_push_id(&wl, trail[i - 2]);
+                }
+                break;
+            /* H1: structural ops reference operands via the ext node. */
+            case OP_GROUP:
+                ext = find_ext(g, nid);
+                if (!ext) break;
+                for (uint8_t k = 0; k < ext->n_keys; k++)
+                    live_push(&wl, ext->keys[k]);
+                for (uint8_t a = 0; a < ext->n_aggs; a++)
+                    live_push(&wl, ext->agg_ins[a]);
+                break;
+            case OP_SORT:
+            case OP_SELECT:
+                ext = find_ext(g, nid);
+                if (!ext) break;
+                for (uint8_t k = 0; k < ext->sort.n_cols; k++)
+                    live_push(&wl, ext->sort.columns[k]);
+                break;
+            case OP_JOIN:
+            case OP_ANTIJOIN:
+                ext = find_ext(g, nid);
+                if (!ext) break;
+                for (uint8_t k = 0; k < ext->join.n_join_keys; k++) {
+                    live_push(&wl, ext->join.left_keys[k]);
+                    if (ext->join.right_keys)
+                        live_push(&wl, ext->join.right_keys[k]);
+                }
+                break;
+            case OP_PIVOT:
+                ext = find_ext(g, nid);
+                if (!ext) break;
+                for (uint8_t k = 0; k < ext->pivot.n_index; k++)
+                    live_push(&wl, ext->pivot.index_cols[k]);
+                live_push(&wl, ext->pivot.pivot_col);
+                live_push(&wl, ext->pivot.value_col);
+                break;
+            case OP_WINDOW_JOIN:
+                ext = find_ext(g, nid);
+                if (!ext) break;
+                live_push(&wl, ext->asof.time_key);
+                for (uint8_t k = 0; k < ext->asof.n_eq_keys; k++)
+                    live_push(&wl, ext->asof.eq_keys[k]);
+                break;
+            case OP_WINDOW:
+                ext = find_ext(g, nid);
+                if (!ext) break;
+                for (uint8_t k = 0; k < ext->window.n_part_keys; k++)
+                    live_push(&wl, ext->window.part_keys[k]);
+                for (uint8_t k = 0; k < ext->window.n_order_keys; k++)
+                    live_push(&wl, ext->window.order_keys[k]);
+                for (uint8_t f = 0; f < ext->window.n_funcs; f++)
+                    live_push(&wl, ext->window.func_inputs[f]);
+                break;
+            default:
+                break;
+        }
+    }
+
+    if (wl.ids != local_ids) ray_sys_free(wl.ids);
+}
+
+/* Dead-code elimination: compute liveness from `root` with mark_live and
+ * set OP_FLAG_DEAD on every node that was not flagged live.  Graphs of up
+ * to 256 nodes use an on-stack flag array; larger ones allocate, and the
+ * pass is skipped if that allocation fails. */
+static void pass_dce(ray_graph_t* g, ray_op_t* root) {
+    const uint32_t nc = g->node_count;
+    const bool on_heap = nc > 256;
+
+    bool inline_flags[256];
+    bool* live = inline_flags;
+    if (on_heap) {
+        live = (bool*)ray_sys_alloc(nc * sizeof(bool));
+        if (live == NULL) return;
+    }
+    memset(live, 0, nc * sizeof(bool));
+
+    mark_live(g, root, live);
+
+    for (uint32_t id = 0; id < nc; id++) {
+        if (live[id]) continue;
+        g->nodes[id].flags |= OP_FLAG_DEAD;
+    }
+
+    if (on_heap) ray_sys_free(live);
+}
+
+/* --------------------------------------------------------------------------
+ * Pass: SIP (Sideways Information Passing)
+ *
+ * Bottom-up DAG walk. For each OP_EXPAND:
+ *   1. Find downstream filter on target side
+ *   2. Reverse-CSR: mark source nodes that have any passing target -> RAY_SEL
+ *   3. Attach source_sel to upstream scan
+ *
+ * Currently a no-op placeholder — activated when graph ops are present.
+ * -------------------------------------------------------------------------- */
+
+/* Return the first non-dead node (in node-array order) that lists
+ * `node_id` among its first two inputs, or NULL when there is none. */
+static ray_op_t* find_consumer(ray_graph_t* g, uint32_t node_id) {
+    for (uint32_t idx = 0; idx < g->node_count; idx++) {
+        ray_op_t* cand = &g->nodes[idx];
+        if (cand->flags & OP_FLAG_DEAD) continue;
+
+        /* Only inputs[0..1] are inspected, and never more than arity. */
+        int n_inputs = cand->arity < 2 ? cand->arity : 2;
+        for (int slot = 0; slot < n_inputs; slot++) {
+            ray_op_t* in = cand->inputs[slot];
+            if (in != NULL && in->id == node_id)
+                return cand;
+        }
+    }
+    return NULL;
+}
+
+/* Follow the inputs[0] chain from `node` until an OP_SCAN is found.
+ * Returns that scan, or NULL when the chain ends first or the step budget
+ * (g->node_count, or 1024 when g is NULL) is exhausted. */
+static ray_op_t* find_upstream_scan(ray_graph_t* g, ray_op_t* node) {
+    uint32_t budget = 1024;
+    if (g) budget = g->node_count;
+
+    while (node != NULL && budget > 0) {
+        if (node->opcode == OP_SCAN) return node;
+        if (node->arity <= 0 || node->inputs[0] == NULL) return NULL;
+        node = node->inputs[0];
+        budget--;
+    }
+    return NULL;
+}
+
+/* SIP pass: flag graph-traversal nodes (EXPAND / VAR_EXPAND /
+ * SHORTEST_PATH) whose consumer is an OP_FILTER and whose first input
+ * leads back to an OP_SCAN, by setting pad[2] = 1 on their ext node.
+ * At most the first 64 such nodes (in node-array order) are considered. */
+static void sip_pass(ray_graph_t* g, ray_op_t* root) {
+    if (!g || !root) return;
+
+    uint32_t nc = g->node_count;
+
+    /* Gather candidate traversal nodes (bottom-up for chained SIP). */
+    uint32_t expand_ids[64];
+    uint32_t n_expands = 0;
+    for (uint32_t idx = 0; idx < nc && n_expands < 64; idx++) {
+        ray_op_t* op = &g->nodes[idx];
+        if (op->flags & OP_FLAG_DEAD) continue;
+        bool is_traversal = op->opcode == OP_EXPAND
+                         || op->opcode == OP_VAR_EXPAND
+                         || op->opcode == OP_SHORTEST_PATH;
+        if (is_traversal)
+            expand_ids[n_expands++] = idx;
+    }
+
+    /* Handle the deepest expand first: walk the collected IDs in reverse,
+     * since deeper pipeline nodes tend to have higher IDs. */
+    for (uint32_t left = n_expands; left > 0; left--) {
+        ray_op_t* expand = &g->nodes[expand_ids[left - 1]];
+        ray_op_ext_t* ext = find_ext(g, expand->id);
+        if (!ext || !ext->graph.rel) continue;
+
+        /* 1. The downstream consumer must be an OP_FILTER: its condition
+         * restricts which target nodes pass, which is what allows a
+         * semijoin to be reverse-propagated through the CSR onto the
+         * source side. */
+        ray_op_t* consumer = find_consumer(g, expand->id);
+        if (!consumer || consumer->opcode != OP_FILTER) continue;
+
+        /* 2. The source side must trace back to a scan. */
+        ray_op_t* src_scan = NULL;
+        if (expand->arity > 0 && expand->inputs[0])
+            src_scan = find_upstream_scan(g, expand->inputs[0]);
+        if (!src_scan) continue;
+
+        /* 3. Leave a hint for the executor, which builds the RAY_SEL
+         * bitmap at runtime (evaluate the filter, reverse-CSR propagate,
+         * apply the source-side selection).
+         *
+         * pad[2] = 1 signals the executor to build SIP bitmap at runtime.
+         * Note: pad is only 3 bytes (pad[0..2]) — do NOT write uint16_t
+         * at pad+2 as that overflows into the 'id' field at offset 8. */
+        ext->base.pad[2] = 1;
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Pass: Factorized detection
+ *
+ * Detect OP_EXPAND → OP_GROUP patterns where factorized execution
+ * avoids materializing the full cross-product.
+ * -------------------------------------------------------------------------- */
+/* For every live OP_EXPAND that is not yet marked factorized, set
+ * ext->graph.factorized = 1 when its consumer is an OP_GROUP with exactly
+ * one key and that key is an OP_SCAN of the "_src" symbol. */
+static void factorize_pass(ray_graph_t* g, ray_op_t* root) {
+    if (!g || !root) return;
+
+    const uint32_t nc = g->node_count;
+    for (uint32_t idx = 0; idx < nc; idx++) {
+        ray_op_t* op = &g->nodes[idx];
+        if ((op->flags & OP_FLAG_DEAD) || op->opcode != OP_EXPAND) continue;
+
+        ray_op_ext_t* expand_ext = find_ext(g, op->id);
+        if (expand_ext == NULL) continue;
+        if (expand_ext->graph.factorized) continue;  /* flag already set */
+
+        /* The immediate consumer has to be a GROUP ... */
+        ray_op_t* consumer = find_consumer(g, op->id);
+        if (consumer == NULL || consumer->opcode != OP_GROUP) continue;
+
+        /* ... grouping on exactly one non-NULL key ... */
+        ray_op_ext_t* group_ext = find_ext(g, consumer->id);
+        if (group_ext == NULL) continue;
+        if (group_ext->n_keys != 1 || group_ext->keys[0] == NULL) continue;
+
+        /* ... and that key has to be a scan of `_src`. */
+        ray_op_ext_t* key_ext = find_ext(g, group_ext->keys[0]->id);
+        if (key_ext == NULL || key_ext->base.opcode != OP_SCAN) continue;
+
+        int64_t src_sym = ray_sym_intern("_src", 4);
+        if (key_ext->sym != src_sym) continue;
+
+        expand_ext->graph.factorized = 1;
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Pass: Filter reordering
+ *
+ * Reorder chained OP_FILTER nodes so cheapest predicates execute first.
+ * Also splits AND trees into separate chained filters.
+ * -------------------------------------------------------------------------- */
+
+/* Allocate a zeroed node (id = its index) for optimizer passes; opt.c-local twin of graph_alloc_node
+ * in graph.c. Doubles g->nodes when full (NULL on overflow/OOM); growth may move nodes, so re-fetch by id. */
+static ray_op_t* graph_alloc_node_opt(ray_graph_t* g) {
+    if (g->node_count >= g->node_cap) {
+        if (g->node_cap > UINT32_MAX / 2) return NULL;  /* doubling would overflow uint32_t */
+        uint32_t new_cap = g->node_cap * 2;
+        uintptr_t old_base = (uintptr_t)g->nodes;  /* saved to measure how far the array moves */
+        ray_op_t* new_nodes = (ray_op_t*)ray_sys_realloc(g->nodes,
+                                                       new_cap * sizeof(ray_op_t));
+        if (!new_nodes) return NULL;  /* g is unchanged on allocation failure */
+        g->nodes = new_nodes;
+        g->node_cap = new_cap;
+        /* Stored ray_op_t* values still hold pre-realloc addresses: shift each by the move distance */
+        ptrdiff_t delta = (ptrdiff_t)((uintptr_t)g->nodes - old_base);
+        if (delta != 0) {  /* nothing to patch if the array stayed in place */
+            for (uint32_t i = 0; i < g->node_count; i++) {
+                if (g->nodes[i].inputs[0])
+                    g->nodes[i].inputs[0] = (ray_op_t*)((char*)g->nodes[i].inputs[0] + delta);
+                if (g->nodes[i].inputs[1])
+                    g->nodes[i].inputs[1] = (ray_op_t*)((char*)g->nodes[i].inputs[1] + delta);
+            }
+            /* Ext nodes hold their own base.inputs[]; relocate those as well */
+            for (uint32_t i = 0; i < g->ext_count; i++) {
+                if (g->ext_nodes[i]) {
+                    if (g->ext_nodes[i]->base.inputs[0])
+                        g->ext_nodes[i]->base.inputs[0] =
+                            (ray_op_t*)((char*)g->ext_nodes[i]->base.inputs[0] + delta);
+                    if (g->ext_nodes[i]->base.inputs[1])
+                        g->ext_nodes[i]->base.inputs[1] =
+                            (ray_op_t*)((char*)g->ext_nodes[i]->base.inputs[1] + delta);
+                    /* Relocate the opcode-specific operand pointers kept in the ext payload */
+                    switch (g->ext_nodes[i]->base.opcode) {
+                        case OP_GROUP:  /* grouping keys and aggregate inputs */
+                            for (uint8_t k = 0; k < g->ext_nodes[i]->n_keys; k++)
+                                if (g->ext_nodes[i]->keys[k])
+                                    g->ext_nodes[i]->keys[k] =
+                                        (ray_op_t*)((char*)g->ext_nodes[i]->keys[k] + delta);
+                            for (uint8_t a = 0; a < g->ext_nodes[i]->n_aggs; a++)
+                                if (g->ext_nodes[i]->agg_ins[a])
+                                    g->ext_nodes[i]->agg_ins[a] =
+                                        (ray_op_t*)((char*)g->ext_nodes[i]->agg_ins[a] + delta);
+                            break;
+                        case OP_SORT:
+                        case OP_SELECT:  /* handled through the same sort.columns payload as OP_SORT */
+                            for (uint8_t k = 0; k < g->ext_nodes[i]->sort.n_cols; k++)
+                                if (g->ext_nodes[i]->sort.columns[k])
+                                    g->ext_nodes[i]->sort.columns[k] =
+                                        (ray_op_t*)((char*)g->ext_nodes[i]->sort.columns[k] + delta);
+                            break;
+                        case OP_JOIN:  /* left_keys[] plus a right_keys[] array that may be NULL */
+                            for (uint8_t k = 0; k < g->ext_nodes[i]->join.n_join_keys; k++) {
+                                if (g->ext_nodes[i]->join.left_keys[k])
+                                    g->ext_nodes[i]->join.left_keys[k] =
+                                        (ray_op_t*)((char*)g->ext_nodes[i]->join.left_keys[k] + delta);
+                                if (g->ext_nodes[i]->join.right_keys &&
+                                    g->ext_nodes[i]->join.right_keys[k])
+                                    g->ext_nodes[i]->join.right_keys[k] =
+                                        (ray_op_t*)((char*)g->ext_nodes[i]->join.right_keys[k] + delta);
+                            }
+                            break;
+                        case OP_WINDOW_JOIN:  /* asof payload: time_key plus eq_keys[] */
+                            if (g->ext_nodes[i]->asof.time_key)
+                                g->ext_nodes[i]->asof.time_key = (ray_op_t*)((char*)g->ext_nodes[i]->asof.time_key + delta);
+                            for (uint8_t k = 0; k < g->ext_nodes[i]->asof.n_eq_keys; k++)
+                                if (g->ext_nodes[i]->asof.eq_keys[k])
+                                    g->ext_nodes[i]->asof.eq_keys[k] = (ray_op_t*)((char*)g->ext_nodes[i]->asof.eq_keys[k] + delta);
+                            break;
+                        case OP_ANTIJOIN:  /* uses the same join payload fields as OP_JOIN */
+                            for (uint8_t k = 0; k < g->ext_nodes[i]->join.n_join_keys; k++) {
+                                if (g->ext_nodes[i]->join.left_keys[k])
+                                    g->ext_nodes[i]->join.left_keys[k] =
+                                        (ray_op_t*)((char*)g->ext_nodes[i]->join.left_keys[k] + delta);
+                                if (g->ext_nodes[i]->join.right_keys &&
+                                    g->ext_nodes[i]->join.right_keys[k])
+                                    g->ext_nodes[i]->join.right_keys[k] =
+                                        (ray_op_t*)((char*)g->ext_nodes[i]->join.right_keys[k] + delta);
+                            }
+                            break;
+                        case OP_PIVOT:  /* index_cols[] plus the single pivot_col and value_col */
+                            for (uint8_t k = 0; k < g->ext_nodes[i]->pivot.n_index; k++)
+                                if (g->ext_nodes[i]->pivot.index_cols[k])
+                                    g->ext_nodes[i]->pivot.index_cols[k] =
+                                        (ray_op_t*)((char*)g->ext_nodes[i]->pivot.index_cols[k] + delta);
+                            if (g->ext_nodes[i]->pivot.pivot_col)
+                                g->ext_nodes[i]->pivot.pivot_col =
+                                    (ray_op_t*)((char*)g->ext_nodes[i]->pivot.pivot_col + delta);
+                            if (g->ext_nodes[i]->pivot.value_col)
+                                g->ext_nodes[i]->pivot.value_col =
+                                    (ray_op_t*)((char*)g->ext_nodes[i]->pivot.value_col + delta);
+                            break;
+                        case OP_WINDOW:  /* partition keys, order keys and function inputs */
+                            for (uint8_t k = 0; k < g->ext_nodes[i]->window.n_part_keys; k++)
+                                if (g->ext_nodes[i]->window.part_keys[k])
+                                    g->ext_nodes[i]->window.part_keys[k] =
+                                        (ray_op_t*)((char*)g->ext_nodes[i]->window.part_keys[k] + delta);
+                            for (uint8_t k = 0; k < g->ext_nodes[i]->window.n_order_keys; k++)
+                                if (g->ext_nodes[i]->window.order_keys[k])
+                                    g->ext_nodes[i]->window.order_keys[k] =
+                                        (ray_op_t*)((char*)g->ext_nodes[i]->window.order_keys[k] + delta);
+                            for (uint8_t f = 0; f < g->ext_nodes[i]->window.n_funcs; f++)
+                                if (g->ext_nodes[i]->window.func_inputs[f])
+                                    g->ext_nodes[i]->window.func_inputs[f] =
+                                        (ray_op_t*)((char*)g->ext_nodes[i]->window.func_inputs[f] + delta);
+                            break;
+                        default:  /* no payload pointers are relocated for other opcodes */
+                            break;
+                    }
+                }
+            }
+        }
+    }
+    ray_op_t* n = &g->nodes[g->node_count];  /* first unused slot */
+    memset(n, 0, sizeof(ray_op_t));
+    n->id = g->node_count;  /* a node's id is its index in g->nodes */
+    g->node_count++;
+    return n;
+}
+
+/* Number of live nodes that read node_id through inputs[0] or inputs[1].
+ * A node that feeds node_id into both slots is still counted once. */
+static int count_node_consumers(ray_graph_t* g, uint32_t node_id) {
+    const uint32_t n_nodes = g->node_count;
+    int total = 0;
+    /* Primary node array. */
+    for (uint32_t idx = 0; idx < n_nodes; idx++) {
+        const ray_op_t* op = &g->nodes[idx];
+        if (op->flags & OP_FLAG_DEAD) continue;
+        int slots = op->arity < 2 ? op->arity : 2;
+        bool hit = false;
+        for (int s = 0; s < slots && !hit; s++)
+            hit = op->inputs[s] && op->inputs[s]->id == node_id;
+        if (hit) total++;
+    }
+
+    /* Ext-node base copies: only those whose id lies beyond nodes[] are
+     * examined; the rest were handled by the loop above. */
+    for (uint32_t idx = 0; idx < g->ext_count; idx++) {
+        const ray_op_ext_t* ext = g->ext_nodes[idx];
+        if (!ext || (ext->base.flags & OP_FLAG_DEAD) || ext->base.id < n_nodes) continue;
+        int slots = ext->base.arity < 2 ? ext->base.arity : 2;
+        bool hit = false;
+        for (int s = 0; s < slots && !hit; s++)
+            hit = ext->base.inputs[s] && ext->base.inputs[s]->id == node_id;
+        if (hit) total++;
+    }
+    return total;
+}
+
+/* --------------------------------------------------------------------------
+ * Pass: Predicate pushdown
+ *
+ * Move OP_FILTER nodes below SELECT/ALIAS and below EXPAND (source-only
+ * predicates) to reduce rows flowing through expensive operators. GROUP
+ * pushdown is currently disabled and JOIN pushdown is not implemented here.
+ * -------------------------------------------------------------------------- */
+
+/* Collect into scan_ids[] (capacity max) the IDs of the live OP_SCAN nodes found under a predicate subtree.
+ * Returns the count (0 for a NULL pred or max <= 0), or -1 when the walk cannot complete (graph > 4096 nodes,
+ * 64-entry stack full, or scan_ids full) — caller must treat -1 as "unknown" and skip optimisation. */
+static int collect_pred_scans(ray_graph_t* g, ray_op_t* pred,
+                              uint32_t* scan_ids, int max) {
+    if (!pred || max <= 0) return 0;
+    int n = 0;  /* number of scan ids written so far */
+
+    uint32_t stack[64];  /* explicit DFS stack of node ids */
+    int sp = 0;
+    stack[sp++] = pred->id;
+
+    bool visited[4096];  /* indexed by node id; only the first nc entries are cleared and used */
+    uint32_t nc = g->node_count;
+    if (nc > 4096) return -1;  /* visited[] cannot cover the graph: report "unknown" */
+    memset(visited, 0, nc * sizeof(bool));
+
+    while (sp > 0) {
+        uint32_t nid = stack[--sp];
+        if (nid >= nc || visited[nid]) continue;  /* out-of-range id or already expanded */
+        visited[nid] = true;
+        ray_op_t* node = &g->nodes[nid];
+        if (node->flags & OP_FLAG_DEAD) continue;  /* dead nodes are neither reported nor expanded */
+
+        if (node->opcode == OP_SCAN) {
+            if (n >= max) return -1;  /* result overflow */
+            scan_ids[n++] = nid;
+            continue;  /* a scan ends this branch of the walk */
+        }
+        for (int i = 0; i < node->arity && i < 2; i++) {
+            if (node->inputs[i]) {
+                if (sp >= 64) return -1;  /* stack overflow */
+                stack[sp++] = node->inputs[i]->id;
+            }
+        }
+        /* Ops with more than two operands keep the extra ones in their ext node */
+        ray_op_ext_t* ext = find_ext(g, nid);
+        if (ext) {
+            switch (node->opcode) {
+                case OP_IF:
+                case OP_SUBSTR:
+                case OP_REPLACE: {
+                    uint32_t third_id = (uint32_t)(uintptr_t)ext->literal;  /* literal slot read as a node id */
+                    if (third_id < nc && !visited[third_id]) {
+                        if (sp >= 64) return -1;
+                        stack[sp++] = third_id;
+                    }
+                    break;
+                }
+                case OP_CONCAT:
+                    if (ext->sym >= 2) {  /* sym is read as the total argument count */
+                        int n_args = (int)ext->sym;
+                        uint32_t* trail = (uint32_t*)((char*)(ext + 1));  /* ids of args 2.. sit right after the ext struct */
+                        for (int j = 2; j < n_args; j++) {
+                            uint32_t arg_id = trail[j - 2];
+                            if (arg_id < nc && !visited[arg_id]) {
+                                if (sp >= 64) return -1;
+                                stack[sp++] = arg_id;
+                            }
+                        }
+                    }
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    return n;
+}
+
+/* Depth-first search over inputs[0..1] and ext-stored operands from start, looking for node target_id.
+ * Conservative: returns false for graphs over 4096 nodes, and branches are dropped when the 64-entry stack is full. */
+static bool is_reachable_from(ray_graph_t* g, ray_op_t* start, uint32_t target_id) {
+    if (!start) return false;
+    if (start->id == target_id) return true;
+
+    uint32_t nc = g->node_count;
+    if (nc > 4096) return false;  /* visited[] below cannot cover the graph: report "not reachable" */
+
+    bool visited[4096];  /* indexed by node id; only the first nc entries are cleared and used */
+    memset(visited, 0, nc * sizeof(bool));
+
+    uint32_t stack[64];  /* explicit DFS stack of node ids */
+    int sp = 0;
+    stack[sp++] = start->id;
+
+    while (sp > 0) {
+        uint32_t nid = stack[--sp];
+        if (nid >= nc || visited[nid]) continue;
+        visited[nid] = true;
+        if (nid == target_id) return true;  /* tested before the DEAD check, so a dead target still matches */
+        ray_op_t* node = &g->nodes[nid];
+        if (node->flags & OP_FLAG_DEAD) continue;  /* operands of dead nodes are not followed */
+        for (int i = 0; i < node->arity && i < 2; i++) {
+            if (node->inputs[i] && sp < 64)  /* silently skipped when the stack is full */
+                stack[sp++] = node->inputs[i]->id;
+        }
+        /* Ops with more than two operands keep the extra ones in their ext node */
+        ray_op_ext_t* ext = find_ext(g, nid);
+        if (ext) {
+            switch (node->opcode) {
+                case OP_IF:
+                case OP_SUBSTR:
+                case OP_REPLACE: {
+                    uint32_t third_id = (uint32_t)(uintptr_t)ext->literal;  /* literal slot read as a node id */
+                    if (third_id < nc && !visited[third_id] && sp < 64)
+                        stack[sp++] = third_id;
+                    break;
+                }
+                case OP_CONCAT:
+                    if (ext->sym >= 2) {  /* sym is read as the total argument count */
+                        int n_args = (int)ext->sym;
+                        uint32_t* trail = (uint32_t*)((char*)(ext + 1));  /* ids of args 2.. sit right after the ext struct */
+                        for (int j = 2; j < n_args; j++) {
+                            uint32_t arg_id = trail[j - 2];
+                            if (arg_id < nc && !visited[arg_id] && sp < 64)
+                                stack[sp++] = arg_id;
+                        }
+                    }
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    return false;
+}
+
+/* Rewire every live consumer of old_id so that the matching input slot refers
+ * to new_target. Nodes skip_a and skip_b (the pair being swapped) are left alone.
+ * Both g->nodes[] and the base.inputs[] held by g->ext_nodes[] are patched. */
+static void redirect_consumers(ray_graph_t* g, uint32_t old_id,
+                               ray_op_t* new_target,
+                               uint32_t skip_a, uint32_t skip_b) {
+    /* Primary node array: the skip test uses the array index. */
+    for (uint32_t idx = 0, total = g->node_count; idx < total; idx++) {
+        ray_op_t* op = &g->nodes[idx];
+        if (idx == skip_a || idx == skip_b || (op->flags & OP_FLAG_DEAD)) continue;
+        for (int slot = 0; slot < 2 && slot < op->arity; slot++) {
+            ray_op_t* in = op->inputs[slot];
+            if (in && in->id == old_id) op->inputs[slot] = new_target;
+        }
+    }
+    /* Ext-node base copies: the skip test uses the stored id. */
+    for (uint32_t idx = 0; idx < g->ext_count; idx++) {
+        ray_op_ext_t* ext = g->ext_nodes[idx];
+        if (!ext) continue;
+        ray_op_t* op = &ext->base;
+        if ((op->flags & OP_FLAG_DEAD) || op->id == skip_a || op->id == skip_b) continue;
+        for (int slot = 0; slot < 2 && slot < op->arity; slot++) {
+            ray_op_t* in = op->inputs[slot];
+            if (in && in->id == old_id) op->inputs[slot] = new_target;
+        }
+    }
+}
+
+static ray_op_t* pass_predicate_pushdown(ray_graph_t* g, ray_op_t* root) {  /* returns the plan root, which may change */
+    if (!g || !root) return root;
+
+    /* Up to 4 sweeps: a swap may expose another pushdown; stop as soon as a sweep changes nothing */
+    for (int iter = 0; iter < 4; iter++) {
+        bool changed = false;
+        uint32_t nc = g->node_count;
+
+        for (uint32_t i = 0; i < nc; i++) {
+            ray_op_t* n = &g->nodes[i];
+            if (n->flags & OP_FLAG_DEAD) continue;
+            if (n->opcode != OP_FILTER || n->arity != 2) continue;
+
+            ray_op_t* child = n->inputs[0];  /* node being filtered */
+            ray_op_t* pred  = n->inputs[1];  /* filter predicate */
+            if (!child || !pred) continue;
+
+            /* Push past SELECT/ALIAS (only if child is single-consumer, otherwise mutating
+             * child->inputs[0] would corrupt other branches). NOTE(review): pred is not checked against child's outputs. */
+            if (child->opcode == OP_SELECT ||
+                child->opcode == OP_ALIAS) {
+                if (count_node_consumers(g, child->id) > 1) continue;
+                /* Swap: FILTER(pred, SELECT(x)) -> SELECT(FILTER(pred, x)) */
+                ray_op_t* proj_input = child->inputs[0];
+                n->inputs[0] = proj_input;  /* the filter now reads what the SELECT/ALIAS used to read */
+                child->inputs[0] = n;  /* ...and the SELECT/ALIAS reads the filter */
+                redirect_consumers(g, n->id, child, child->id, n->id);  /* former readers of the filter now read child */
+                if (n->id == root->id) root = child;  /* the filter was the root: child takes over */
+                changed = true;
+                continue;
+            }
+
+            /* GROUP pushdown disabled: the executor's key/agg scans
+             * bypass the filter, producing wrong results. Needs executor
+             * support for filtered scan propagation before enabling. */
+
+            /* Push past EXPAND (source-side predicates, single-consumer only) */
+            if (child->opcode == OP_EXPAND) {
+                if (count_node_consumers(g, child->id) > 1) continue;
+                uint32_t scan_ids[64];
+                int n_scans = collect_pred_scans(g, pred, scan_ids, 64);
+                if (n_scans <= 0) continue;  /* 0 = no scans, -1 = truncated */
+
+                /* All predicate scans must be reachable from the expand's
+                 * source input (inputs[0]).  Walk the source subtree. */
+                ray_op_t* expand_src_tree = child->inputs[0];
+                bool all_source = true;
+                for (int s = 0; s < n_scans; s++) {
+                    if (!is_reachable_from(g, expand_src_tree, scan_ids[s])) {
+                        all_source = false;  /* also taken when the reachability walk gave up */
+                        break;
+                    }
+                }
+                if (!all_source) continue;
+
+                /* Swap: FILTER(pred, EXPAND(src, rel)) -> EXPAND(FILTER(pred, src), rel) */
+                ray_op_t* expand_src = child->inputs[0];
+                n->inputs[0] = expand_src;  /* the filter now reads the expand's source */
+                child->inputs[0] = n;  /* ...and the expand reads the filter */
+                redirect_consumers(g, n->id, child, child->id, n->id);  /* former readers of the filter now read child */
+                if (n->id == root->id) root = child;  /* the filter was the root: child takes over */
+                changed = true;
+                continue;
+            }
+        }
+        if (!changed) break;
+    }
+    return root;
+}
+
+/* Heuristic cost of evaluating a predicate: filters with lower scores should
+ * run earlier. A NULL predicate gets the fixed score 99. */
+static int filter_cost(ray_graph_t* g, ray_op_t* pred) {
+    (void)g;  /* not consulted by the heuristic */
+    if (!pred) return 99;
+
+    /* +4 unless one of the first two operands is a constant, i.e. for a
+     * column-vs-column comparison. */
+    int against_const = 0;
+    for (int i = 0; i < 2 && i < pred->arity; i++)
+        against_const |= (pred->inputs[i] && pred->inputs[i]->opcode == OP_CONST);
+    int score = against_const ? 0 : 4;
+
+    /* Operand width: judged on the first operand's type when there is one,
+     * otherwise on the predicate's own output type. */
+    int8_t ty = pred->out_type;
+    if (pred->arity >= 1 && pred->inputs[0]) ty = pred->inputs[0]->out_type;
+    if (ty == RAY_I16)
+        score += 1;
+    else if (ty == RAY_I32 || ty == RAY_DATE || ty == RAY_TIME)
+        score += 2;
+    else if (ty != RAY_BOOL && ty != RAY_U8)
+        score += 3;  /* I64, F64, SYM, STR */
+
+    /* Operator: equality +0, ordering +2, LIKE/ILIKE +4, anything else +1. */
+    switch (pred->opcode) {
+        case OP_LIKE: case OP_ILIKE:
+            return score + 4;
+        case OP_LT: case OP_LE: case OP_GT: case OP_GE:
+            return score + 2;
+        case OP_EQ: case OP_NE:
+            return score;
+        default:
+            return score + 1;
+    }
+}
+
+/* Split FILTER(AND(a, b), input) into FILTER(b, FILTER(a, input)): the existing node keeps `a`, a new outer node takes `b`.
+ * Returns the new outer filter node, or the original (re-fetched by id) if there is no split or allocation fails. */
+static ray_op_t* split_and_filter(ray_graph_t* g, ray_op_t* filter_node) {
+    if (!filter_node || filter_node->opcode != OP_FILTER) return filter_node;
+    if (filter_node->arity != 2) return filter_node;
+
+    ray_op_t* pred = filter_node->inputs[1];
+    if (!pred || pred->opcode != OP_AND || pred->arity != 2) return filter_node;
+
+    ray_op_t* pred_a = pred->inputs[0];
+    ray_op_t* pred_b = pred->inputs[1];
+    ray_op_t* input  = filter_node->inputs[0];  /* only NULL-checked; the inner filter keeps this input */
+    if (!pred_a || !pred_b || !input) return filter_node;
+
+    /* Save IDs before potential realloc */
+    uint32_t filter_id = filter_node->id;
+    uint32_t pred_a_id = pred_a->id;
+    uint32_t pred_b_id = pred_b->id;
+
+    /* Allocate new outer filter first, before mutating existing nodes */
+    ray_op_t* outer = graph_alloc_node_opt(g);
+    if (!outer) return &g->nodes[filter_id];  /* OOM: leave unsplit */
+
+    /* Re-fetch after potential realloc */
+    filter_node = &g->nodes[filter_id];
+    pred_a = &g->nodes[pred_a_id];
+    pred_b = &g->nodes[pred_b_id];
+
+    /* Rewrite: filter_node becomes FILTER(pred_a, input) */
+    filter_node->inputs[1] = pred_a;  /* inputs[0] is left as it was; the AND node is no longer referenced from here */
+
+    outer->opcode = OP_FILTER;
+    outer->arity = 2;
+    outer->inputs[0] = filter_node;  /* outer filter runs on the inner filter's output */
+    outer->inputs[1] = pred_b;
+    outer->out_type = filter_node->out_type;
+    outer->est_rows = filter_node->est_rows;
+
+    return outer;  /* consumers of filter_node are not rewired here */
+}
+
+/* Walk down inputs[0] from top, storing consecutive OP_FILTER nodes in chain[]
+ * (outermost first). Stops at the first non-filter or after max entries. */
+static int collect_filter_chain(ray_op_t* top, ray_op_t** chain, int max) {
+    int len = 0;
+    for (ray_op_t* node = top; len < max; node = node->inputs[0]) {
+        if (!node || node->opcode != OP_FILTER) break;
+        chain[len++] = node;
+    }
+    return len;
+}
+
+static ray_op_t* pass_filter_reorder(ray_graph_t* g, ray_op_t* root) {  /* returns the plan root, re-fetched by id */
+    if (!g || !root) return root;
+
+    uint32_t root_id = root->id;  /* tracked by id: the root pointer may go stale when nodes are reallocated */
+
+    /* First pass: split AND predicates in filters.
+     * Iterate until no more splits occur so nested ANDs like
+     * AND(AND(a,b), c) are fully decomposed into individual filters. */
+    for (int split_iter = 0; split_iter < 16; split_iter++) {  /* at most 16 sweeps */
+        bool split_changed = false;
+        uint32_t nc = g->node_count;  /* nodes appended during this sweep are picked up by the next one */
+        for (uint32_t i = 0; i < nc; i++) {
+            ray_op_t* n = &g->nodes[i];
+            if (n->flags & OP_FLAG_DEAD) continue;
+            if (n->opcode != OP_FILTER) continue;
+            if (n->arity != 2 || !n->inputs[1]) continue;
+            if (n->inputs[1]->opcode != OP_AND) continue;
+
+            /* Split AND and update consumers to point to new outer.
+             * split_and_filter may realloc g->nodes, so re-fetch n afterwards. */
+            uint32_t orig_id = i;
+            ray_op_t* new_outer = split_and_filter(g, n);
+            n = &g->nodes[orig_id];  /* re-fetch after potential realloc */
+            if (new_outer->id != orig_id) {  /* a different node came back, so a split happened */
+                redirect_consumers(g, orig_id, new_outer, new_outer->id, orig_id);
+                if (orig_id == root_id) root_id = new_outer->id;
+                split_changed = true;
+            }
+        }
+        if (!split_changed) break;
+    }
+
+    /* Second pass: reorder filter chains by cost.
+     * Use insertion sort on chain arrays (chains are typically short). */
+    uint32_t nc = g->node_count;  /* may have grown from splits */
+    bool* visited = NULL;
+    bool visited_stack[256];  /* stack scratch for graphs of up to 256 nodes */
+    if (nc <= 256) {
+        visited = visited_stack;
+    } else {
+        visited = (bool*)ray_sys_alloc(nc * sizeof(bool));
+        if (!visited) return &g->nodes[root_id];  /* OOM: keep the split plan, skip reordering */
+    }
+    memset(visited, 0, nc * sizeof(bool));
+
+    for (uint32_t i = 0; i < nc; i++) {
+        ray_op_t* n = &g->nodes[i];
+        if (n->flags & OP_FLAG_DEAD) continue;
+        if (n->opcode != OP_FILTER) continue;
+        if (visited[i]) continue;
+
+        /* Collect the filter chain starting at this node */
+        ray_op_t* chain[64];
+        int chain_len = collect_filter_chain(n, chain, 64);
+        if (chain_len < 2) {  /* a lone filter has nothing to reorder */
+            for (int c = 0; c < chain_len; c++) visited[chain[c]->id] = true;
+            continue;
+        }
+
+        /* Mark all as visited */
+        for (int c = 0; c < chain_len; c++) visited[chain[c]->id] = true;
+
+        /* Skip reordering if any filter in the chain has multiple consumers,
+         * since swapping predicates would change semantics for other branches */
+        bool has_shared = false;
+        for (int c = 0; c < chain_len; c++) {
+            if (count_node_consumers(g, chain[c]->id) > 1) {
+                has_shared = true;
+                break;
+            }
+        }
+        if (has_shared) continue;
+
+        /* Score each filter's predicate */
+        int costs[64];
+        for (int c = 0; c < chain_len; c++)
+            costs[c] = filter_cost(g, chain[c]->inputs[1]);
+
+        /* Insertion sort predicates by cost descending (stable: preserves
+         * original order for equal costs). Expensive predicates go to
+         * chain[0] (outer, runs last), cheap go to chain[N-1] (inner,
+         * runs first). We swap predicates, not filter nodes. */
+        for (int c = 1; c < chain_len; c++) {
+            ray_op_t* pred = chain[c]->inputs[1];  /* predicate being inserted */
+            int cost = costs[c];
+            int j = c - 1;
+            while (j >= 0 && costs[j] < cost) {  /* shift cheaper predicates one slot inward */
+                chain[j + 1]->inputs[1] = chain[j]->inputs[1];
+                costs[j + 1] = costs[j];
+                j--;
+            }
+            chain[j + 1]->inputs[1] = pred;
+            costs[j + 1] = cost;
+        }
+    }
+
+    if (nc > 256) ray_sys_free(visited);  /* only the heap-allocated scratch needs freeing */
+    return &g->nodes[root_id];
+}
+
+/* --------------------------------------------------------------------------
+ * Pass 7: Projection pushdown
+ *
+ * BFS from root collecting all reachable node IDs (following inputs and
+ * ext-node children).  Any node not reachable is marked DEAD so the DCE
+ * pass can clean it up.
+ * -------------------------------------------------------------------------- */
+
+/* State of the liveness walk: a per-node "reachable" flag array plus the BFS
+ * queue of node ids whose operands still have to be expanded. */
+typedef struct {
+    bool*     live;  /* live[id] is set once node id is known to be reachable */
+    uint32_t* q;     /* queue storage with room for nc ids */
+    int       qt;    /* queue tail: number of ids pushed so far */
+    uint32_t  nc;    /* node count the walk was sized for */
+} proj_walk_t;
+
+/* Mark node `id` live and queue it. Out-of-range ids and nodes that are
+ * already live are ignored, so every id is queued at most once. */
+static void proj_mark_id(proj_walk_t* w, uint32_t id) {
+    if (id >= w->nc || w->live[id]) return;
+    w->live[id] = true;
+    if (w->qt < (int)w->nc) w->q[w->qt++] = id;  /* bound kept as a guard */
+}
+
+/* Pointer flavour of proj_mark_id(); a NULL operand is skipped. */
+static void proj_mark(proj_walk_t* w, const ray_op_t* op) {
+    if (op) proj_mark_id(w, op->id);
+}
+
+/* Flag every node that root does not depend on as OP_FLAG_DEAD.
+ *
+ * Breadth-first walk from root over inputs[0..1] and over the operands that
+ * certain opcodes keep in their ext node:
+ *   - GROUP: keys and aggregate inputs
+ *   - SORT / SELECT: sort.columns
+ *   - JOIN / ANTIJOIN: left_keys and (when present) right_keys
+ *   - WINDOW_JOIN: asof.time_key and asof.eq_keys
+ *   - PIVOT: index_cols, pivot_col, value_col
+ *   - WINDOW: partition keys, order keys, function inputs
+ *   - IF / SUBSTR / REPLACE: third operand, stored as a node id in literal
+ *   - CONCAT: operands beyond the second, stored as ids after the ext struct
+ * ANTIJOIN and PIVOT are included so the walk covers the same ext pointers
+ * that graph_alloc_node_opt() relocates; without them, nodes referenced only
+ * as anti-join keys or pivot columns would be flagged DEAD.
+ *
+ * Returns false, leaving the graph untouched, when g or root is NULL, when
+ * root->id is outside the node array, or when scratch allocation fails;
+ * true once the flags have been updated. */
+static bool pass_projection_pushdown(ray_graph_t* g, ray_op_t* root) {
+    if (!g || !root) return false;
+    uint32_t nc = g->node_count;
+    if (root->id >= nc) return false;
+
+    bool live_stack[256];
+    uint32_t q_stack[256];
+    proj_walk_t w = { .qt = 0, .nc = nc };
+    w.live = nc <= 256 ? live_stack : (bool*)ray_sys_alloc(nc * sizeof(bool));
+    w.q = nc <= 256 ? q_stack : (uint32_t*)ray_sys_alloc(nc * sizeof(uint32_t));
+    if (!w.live || !w.q) {
+        if (nc > 256) { ray_sys_free(w.live); ray_sys_free(w.q); }
+        return false;
+    }
+    memset(w.live, 0, nc * sizeof(bool));
+
+    /* BFS from root */
+    int qh = 0;
+    proj_mark_id(&w, root->id);
+
+    while (qh < w.qt) {
+        uint32_t nid = w.q[qh++];
+        ray_op_t* n = &g->nodes[nid];
+
+        /* Follow standard inputs */
+        for (int i = 0; i < 2 && i < n->arity; i++)
+            proj_mark(&w, n->inputs[i]);
+
+        /* Follow operands held in the ext node, if the node has one */
+        ray_op_ext_t* ext = find_ext(g, nid);
+        if (!ext) continue;
+        switch (n->opcode) {
+            case OP_GROUP:
+                for (uint8_t k = 0; k < ext->n_keys; k++)
+                    proj_mark(&w, ext->keys[k]);
+                for (uint8_t a = 0; a < ext->n_aggs; a++)
+                    proj_mark(&w, ext->agg_ins[a]);
+                break;
+            case OP_SORT:
+            case OP_SELECT:
+                for (uint8_t k = 0; k < ext->sort.n_cols; k++)
+                    proj_mark(&w, ext->sort.columns[k]);
+                break;
+            case OP_JOIN:
+            case OP_ANTIJOIN:
+                for (uint8_t k = 0; k < ext->join.n_join_keys; k++) {
+                    proj_mark(&w, ext->join.left_keys[k]);
+                    if (ext->join.right_keys)
+                        proj_mark(&w, ext->join.right_keys[k]);
+                }
+                break;
+            case OP_WINDOW_JOIN:
+                proj_mark(&w, ext->asof.time_key);
+                for (uint8_t k = 0; k < ext->asof.n_eq_keys; k++)
+                    proj_mark(&w, ext->asof.eq_keys[k]);
+                break;
+            case OP_PIVOT:
+                for (uint8_t k = 0; k < ext->pivot.n_index; k++)
+                    proj_mark(&w, ext->pivot.index_cols[k]);
+                proj_mark(&w, ext->pivot.pivot_col);
+                proj_mark(&w, ext->pivot.value_col);
+                break;
+            case OP_WINDOW:
+                for (uint8_t k = 0; k < ext->window.n_part_keys; k++)
+                    proj_mark(&w, ext->window.part_keys[k]);
+                for (uint8_t k = 0; k < ext->window.n_order_keys; k++)
+                    proj_mark(&w, ext->window.order_keys[k]);
+                for (uint8_t f = 0; f < ext->window.n_funcs; f++)
+                    proj_mark(&w, ext->window.func_inputs[f]);
+                break;
+            case OP_IF:
+            case OP_SUBSTR:
+            case OP_REPLACE:
+                proj_mark_id(&w, (uint32_t)(uintptr_t)ext->literal);
+                break;
+            case OP_CONCAT:
+                if (ext->sym >= 2) {
+                    int n_args = (int)ext->sym;
+                    uint32_t* trail = (uint32_t*)((char*)(ext + 1));
+                    for (int j = 2; j < n_args; j++)
+                        proj_mark_id(&w, trail[j - 2]);
+                }
+                break;
+            default:
+                break;
+        }
+    }
+
+    /* Mark unreachable nodes DEAD */
+    for (uint32_t i = 0; i < nc; i++)
+        if (!w.live[i]) g->nodes[i].flags |= OP_FLAG_DEAD;
+
+    if (nc > 256) { ray_sys_free(w.live); ray_sys_free(w.q); }
+    return true;
+}
+
+/* --------------------------------------------------------------------------
+ * Pass 8: Partition pruning
+ *
+ * Recognize FILTER(cmp(SCAN(mapcommon_col), CONST)) where cmp is one of
+ * EQ/NE/LT/GT/LE/GE (operands in either order) or IN/NOT_IN (scan on the
+ * left).  The constant is tested against every partition key; the result
+ * is attached as a seg_mask bitmap to the parted-column scans of the same
+ * table, and est_rows=1 is set on the filter as a selectivity hint.
+ * -------------------------------------------------------------------------- */
+
+/* Walk every live FILTER node of `g` and attach partition bitmaps.
+ *
+ * For each matching filter a bitmap with one bit per partition is built
+ * (bit p set = partition p satisfies the predicate) and is either installed
+ * on, or ANDed into, every other live OP_SCAN of the same table that reads
+ * a parted column.  Any shape, type or allocation problem simply skips the
+ * filter: pruning is only an optimization, the executor filter still runs.
+ *
+ *   g    - operator graph; updated in place (seg_mask / seg_mask_count
+ *          on scan exts, est_rows on the filter node).
+ *   root - plan root; only null-checked, the pass iterates g->nodes
+ *          directly.
+ */
+static void pass_partition_pruning(ray_graph_t* g, ray_op_t* root) {
+    if (!g || !root) return;
+
+    for (uint32_t i = 0; i < g->node_count; i++) {
+        ray_op_t* n = &g->nodes[i];
+        if (n->flags & OP_FLAG_DEAD) continue;
+        if (n->opcode != OP_FILTER || n->arity != 2) continue;
+
+        ray_op_t* pred = n->inputs[1];
+        if (!pred || pred->arity != 2) continue;
+
+        uint16_t cmp_op = pred->opcode;
+        if (cmp_op != OP_EQ && cmp_op != OP_NE &&
+            cmp_op != OP_LT && cmp_op != OP_GT &&
+            cmp_op != OP_LE && cmp_op != OP_GE &&
+            cmp_op != OP_IN && cmp_op != OP_NOT_IN) continue;
+
+        ray_op_t* lhs = pred->inputs[0];
+        ray_op_t* rhs = pred->inputs[1];
+        if (!lhs || !rhs) continue;
+
+        ray_op_t* scan_node = NULL;
+        ray_op_t* const_node = NULL;
+        bool swapped = false;
+        if (lhs->opcode == OP_SCAN && rhs->opcode == OP_CONST) {
+            scan_node = lhs; const_node = rhs;
+        } else if (rhs->opcode == OP_SCAN && lhs->opcode == OP_CONST) {
+            scan_node = rhs; const_node = lhs; swapped = true;
+        } else continue;
+
+        /* OP_IN / OP_NOT_IN expect a literal vector on the RHS, so the
+         * scan must be the LHS (col IN set); `const IN col_set` is never
+         * pruned. */
+        bool is_in     = (cmp_op == OP_IN);
+        bool is_nin    = (cmp_op == OP_NOT_IN);
+        bool is_set_op = is_in || is_nin;
+        if (is_set_op && swapped) continue;
+
+        if (scan_node->out_type != RAY_MAPCOMMON) continue;
+
+        ray_op_ext_t* scan_ext = find_ext(g, scan_node->id);
+        if (!scan_ext) continue;
+
+        /* Resolve table: a non-zero id stored in the scan's pad bytes
+         * selects g->tables[id - 1]; otherwise use g->table. */
+        uint16_t stored_table_id = 0;
+        memcpy(&stored_table_id, scan_ext->base.pad, sizeof(uint16_t));
+        ray_t* tbl;
+        if (stored_table_id > 0 && g->tables && (stored_table_id - 1) < g->n_tables)
+            tbl = g->tables[stored_table_id - 1];
+        else
+            tbl = g->table;
+        if (!tbl) continue;
+
+        ray_t* mc_col = ray_table_get_col(tbl, scan_ext->sym);
+        if (!mc_col || mc_col->type != RAY_MAPCOMMON) continue;
+
+        /* Extract constant value */
+        ray_op_ext_t* const_ext = find_ext(g, const_node->id);
+        if (!const_ext || !const_ext->literal) continue;
+        ray_t* lit = const_ext->literal;
+
+        /* Read partition keys from MAPCOMMON: [key_values, row_counts] */
+        if (mc_col->len < 2) continue;
+        ray_t** mc_ptrs = (ray_t**)ray_data(mc_col);
+        ray_t* key_values = mc_ptrs[0];
+        if (!key_values) continue;
+        int64_t n_parts = key_values->len;
+        if (n_parts <= 0) continue;
+
+        /* Type-class check: partition keys and the literal must live in
+         * the same value namespace, otherwise comparisons are nonsense.
+         * SYM keys are interned IDs and only compare to SYM literals;
+         * integer-like keys only compare to integer-like literals. */
+        int8_t pkey_t = key_values->type;
+        int8_t lt = lit->type < 0 ? (int8_t)(-lit->type) : lit->type;
+        if ((pkey_t == RAY_SYM) != (lt == RAY_SYM)) continue;
+
+        /* Width check, for keys and literal alike: values are read below
+         * as 4-byte (I32/DATE/TIME) or 8-byte (I64/TIMESTAMP/SYM) elements
+         * and sign-extended to int64_t.  Any other key type (I16, BOOL,
+         * U8, F64, ...) would be read with the wrong stride and past the
+         * end of the key vector, so it is not pruned.
+         * NOTE(review): SYM vectors are read as 8-byte IDs here; confirm
+         * partition keys / IN sets never use a narrower SYM width. */
+        bool pkey32   = (pkey_t == RAY_I32 || pkey_t == RAY_DATE || pkey_t == RAY_TIME);
+        bool pkey64   = (pkey_t == RAY_I64 || pkey_t == RAY_TIMESTAMP || pkey_t == RAY_SYM);
+        bool narrow32 = (lt == RAY_I32 || lt == RAY_DATE || lt == RAY_TIME);
+        bool wide64   = (lt == RAY_I64 || lt == RAY_TIMESTAMP || lt == RAY_SYM);
+        if (!pkey32 && !pkey64) continue;
+        if (!narrow32 && !wide64) continue;
+
+        /* Literal shape: IN/NOT_IN need a vector (ray_const_vec carries
+         * the vec pointer unchanged in ext->literal); the scalar ops need
+         * an atom or a 1-element vector.  Reading element 0 of an empty
+         * vector would be out of bounds, and a longer vector is not a
+         * scalar comparison. */
+        if (is_set_op ? (lit->type <= 0) : (lit->type >= 0 && lit->len != 1))
+            continue;
+
+        /* Allocate the seg_mask bitmap only after all validation, so the
+         * early exits above have nothing to free. */
+        uint32_t n_words = (uint32_t)((n_parts + 63) / 64);
+        uint64_t* mask = (uint64_t*)ray_sys_alloc(n_words * sizeof(uint64_t));
+        if (!mask) continue;
+        memset(mask, 0, n_words * sizeof(uint64_t));
+
+        /* Extract constant(s) for comparison, normalized to int64_t.
+         * Scalar ops take one value; IN ops take the non-null elements
+         * of the vec literal.  Atoms store the value in the header;
+         * vectors store it in data. */
+        int64_t const_val = 0;                 /* for scalar ops */
+        int64_t set_stack[32];
+        int64_t* set_vals = set_stack;         /* for IN/NOT_IN */
+        int64_t set_len   = 0;
+        ray_t*  set_heap  = NULL;
+
+        if (is_set_op) {
+            /* Sets that do not fit the stack buffer spill to the heap. */
+            if (lit->len > 32) {
+                set_heap = ray_alloc((size_t)lit->len * sizeof(int64_t));
+                if (!set_heap) { ray_sys_free(mask); continue; }
+                set_vals = (int64_t*)ray_data(set_heap);
+            }
+            /* Read set elements — skip nulls in the literal so a null
+             * sentinel can never match a partition key.  An empty or
+             * all-null set leaves set_len == 0: the key loop below then
+             * passes no partition for IN and every partition for NOT_IN. */
+            bool set_has_nulls = (lit->attrs & RAY_ATTR_HAS_NULLS) != 0;
+            for (int64_t k = 0; k < lit->len; k++) {
+                if (set_has_nulls && ray_vec_is_null(lit, k)) continue;
+                if (narrow32) {
+                    int32_t v32;
+                    memcpy(&v32, (char*)ray_data(lit) + k * sizeof(int32_t), sizeof(int32_t));
+                    set_vals[set_len++] = v32;
+                } else {
+                    int64_t v64;
+                    memcpy(&v64, (char*)ray_data(lit) + k * sizeof(int64_t), sizeof(int64_t));
+                    set_vals[set_len++] = v64;
+                }
+            }
+        } else if (wide64) {
+            /* Scalar const path (EQ/NE/LT/GT/LE/GE), 8-byte literal. */
+            if (lit->type < 0) const_val = lit->i64;
+            else memcpy(&const_val, ray_data(lit), sizeof(int64_t));
+        } else {
+            /* Scalar const path, 4-byte literal (sign-extended). */
+            int32_t v32;
+            if (lit->type < 0) v32 = lit->i32;
+            else memcpy(&v32, ray_data(lit), sizeof(int32_t));
+            const_val = v32;
+        }
+
+        /* Effective comparison: if swapped, reverse direction
+         * (IN/NOT_IN are never swapped — gated above). */
+        uint16_t eff_op = cmp_op;
+        if (swapped) {
+            if (cmp_op == OP_LT) eff_op = OP_GT;
+            else if (cmp_op == OP_GT) eff_op = OP_LT;
+            else if (cmp_op == OP_LE) eff_op = OP_GE;
+            else if (cmp_op == OP_GE) eff_op = OP_LE;
+        }
+
+        /* Evaluate the predicate once per partition key. */
+        for (int64_t p = 0; p < n_parts; p++) {
+            int64_t pkey = 0;
+            if (pkey32) {
+                int32_t v32;
+                memcpy(&v32, (char*)ray_data(key_values) + p * sizeof(int32_t), sizeof(int32_t));
+                pkey = v32;
+            } else {
+                memcpy(&pkey, (char*)ray_data(key_values) + p * sizeof(int64_t), sizeof(int64_t));
+            }
+
+            bool pass = false;
+            if (is_set_op) {
+                bool found = false;
+                for (int64_t j = 0; j < set_len; j++) {
+                    if (pkey == set_vals[j]) { found = true; break; }
+                }
+                pass = is_in ? found : !found;
+            } else {
+                switch (eff_op) {
+                    case OP_EQ: pass = (pkey == const_val); break;
+                    case OP_NE: pass = (pkey != const_val); break;
+                    case OP_LT: pass = (pkey <  const_val); break;
+                    case OP_GT: pass = (pkey >  const_val); break;
+                    case OP_LE: pass = (pkey <= const_val); break;
+                    case OP_GE: pass = (pkey >= const_val); break;
+                    default: break;
+                }
+            }
+            if (pass)
+                mask[p / 64] |= (1ULL << (p % 64));
+        }
+        if (set_heap) ray_free(set_heap);
+
+        /* Attach seg_mask to OP_SCAN nodes reading parted columns from same table.
+         * When no partition passed the mask is all-zeros — attach it anyway so
+         * the segment loop in ray_execute skips all segments and hits the
+         * empty-table path instead of reading every partition.
+         * NOTE(review): the first scan without a mask takes `mask` itself, and
+         * so does every later such scan — confirm the code that releases
+         * seg_mask copes with one buffer shared by several scans. */
+        bool mask_owned = false;
+        for (uint32_t s = 0; s < g->node_count; s++) {
+            ray_op_t* sn = &g->nodes[s];
+            if ((sn->flags & OP_FLAG_DEAD) || sn->opcode != OP_SCAN) continue;
+            if (sn == scan_node) continue;   /* the MAPCOMMON scan itself */
+
+            ray_op_ext_t* sn_ext = find_ext(g, sn->id);
+            if (!sn_ext) continue;
+
+            uint16_t sn_tid = 0;
+            memcpy(&sn_tid, sn_ext->base.pad, sizeof(uint16_t));
+            if (sn_tid != stored_table_id) continue;
+
+            ray_t* sn_col = ray_table_get_col(tbl, sn_ext->sym);
+            if (!sn_col || !RAY_IS_PARTED(sn_col->type)) continue;
+
+            if (sn_ext->seg_mask) {
+                /* AND with existing mask (conjunctive filters) */
+                uint32_t exist_w = (uint32_t)((sn_ext->seg_mask_count + 63) / 64);
+                uint32_t min_w = n_words < exist_w ? n_words : exist_w;
+                for (uint32_t w = 0; w < min_w; w++)
+                    sn_ext->seg_mask[w] &= mask[w];
+                /* Zero out words beyond new mask (prune extra segments) */
+                for (uint32_t w = min_w; w < exist_w; w++)
+                    sn_ext->seg_mask[w] = 0;
+                /* Tighten count to the smaller partition set */
+                if (n_parts < sn_ext->seg_mask_count)
+                    sn_ext->seg_mask_count = n_parts;
+            } else {
+                sn_ext->seg_mask = mask;
+                sn_ext->seg_mask_count = n_parts;
+                mask_owned = true;
+            }
+        }
+        /* Nobody took the buffer (no mask-less parted scan): release it. */
+        if (!mask_owned) ray_sys_free(mask);
+
+        n->est_rows = 1;
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * ray_optimize — run all passes in order, return (possibly updated) root
+ * -------------------------------------------------------------------------- */
+
+ray_op_t* ray_optimize(ray_graph_t* g, ray_op_t* root) {
+    if (g == NULL || root == NULL) return root;
+
+    ray_profile_span_start("optimize");
+
+    /* 1. Infer types. */
+    pass_type_inference(g, root);
+    ray_profile_tick("type inference");
+
+    /* 2. Fold constants. */
+    pass_constant_fold(g, root);
+    ray_profile_tick("constant fold");
+
+    /* 3. SIP: graph-aware sideways information passing. */
+    sip_pass(g, root);
+    ray_profile_tick("SIP");
+
+    /* 4. Factorized detection (OP_EXPAND → OP_GROUP optimization). */
+    factorize_pass(g, root);
+    ray_profile_tick("factorize");
+
+    /* 5. Predicate pushdown; the pass may hand back a new root. */
+    root = pass_predicate_pushdown(g, root);
+    ray_profile_tick("predicate pushdown");
+
+    /* 6. Filter reordering (split ANDs + reorder by cost); may also
+     *    hand back a new root. */
+    root = pass_filter_reorder(g, root);
+    ray_profile_tick("filter reorder");
+
+    /* 7. Projection pushdown (marks unreachable nodes dead). */
+    bool reachability_known = pass_projection_pushdown(g, root);
+    ray_profile_tick("projection pushdown");
+
+    /* 8. Partition pruning — skipped unless pass 7 completed, because
+     *    pruning walks all nodes and would otherwise attach seg_masks
+     *    to disconnected branches.  The tick is recorded either way. */
+    if (reachability_known) pass_partition_pruning(g, root);
+    ray_profile_tick("partition pruning");
+
+    /* 9. Fusion. */
+    ray_fuse_pass(g, root);
+    ray_profile_tick("fusion");
+
+    /* 10. DCE. */
+    pass_dce(g, root);
+    ray_profile_tick("DCE");
+
+    ray_profile_span_end("optimize");
+
+    return root;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/opt.h b/crates/rayforce-sys/vendor/rayforce/src/ops/opt.h
new file mode 100644
index 0000000..af3956b
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/opt.h
@@ -0,0 +1,29 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_OPT_H
+#define RAY_OPT_H
+
+#include "ops.h"
+
+#endif /* RAY_OPT_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/pipe.c b/crates/rayforce-sys/vendor/rayforce/src/ops/pipe.c
new file mode 100644
index 0000000..1c04342
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/pipe.c
@@ -0,0 +1,63 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "pipe.h"
+#include "mem/sys.h"
+#include <string.h>
+#ifndef RAY_OS_WINDOWS
+#include <unistd.h>
+#endif
+
+/* --------------------------------------------------------------------------
+ * ray_pipe_new
+ *
+ * Allocate a new pipe structure with all fields zeroed and spill_fd = -1.
+ * -------------------------------------------------------------------------- */
+
+ray_pipe_t* ray_pipe_new(void) {
+    ray_pipe_t* pipe = (ray_pipe_t*)ray_sys_alloc(sizeof(ray_pipe_t));
+    if (pipe != NULL) {
+        /* Clear the whole struct first so no pointer or state field is
+           left uninitialized, then mark the spill descriptor as unset. */
+        memset(pipe, 0, sizeof(*pipe));
+        pipe->spill_fd = -1;
+    }
+    return pipe;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_pipe_free
+ *
+ * Free a pipe. Closes the spill file descriptor if it was opened.
+ * Does NOT recursively free upstream input pipes.
+ * -------------------------------------------------------------------------- */
+
+void ray_pipe_free(ray_pipe_t* pipe) {
+    if (pipe != NULL) {
+        /* Only a non-negative spill_fd is handed to close(). */
+        if (pipe->spill_fd >= 0)
+            close(pipe->spill_fd);
+
+        ray_sys_free(pipe);
+    }
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/pipe.h b/crates/rayforce-sys/vendor/rayforce/src/ops/pipe.h
new file mode 100644
index 0000000..088bcfe
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/pipe.h
@@ -0,0 +1,43 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_PIPE_H
+#define RAY_PIPE_H
+
+/*
+ * pipe.h -- Pipeline infrastructure.
+ *
+ * A pipe connects operation nodes in the executor pipeline. Each pipe
+ * holds a morsel iterator state, optional materialized intermediate,
+ * and upstream input pipe references.
+ */
+
+#include "ops.h"
+
+/* Allocate and initialize a new pipe (all fields zeroed, spill_fd = -1). */
+ray_pipe_t* ray_pipe_new(void);
+
+/* Free a pipe. Closes spill_fd if open. Does NOT free upstream pipes. */
+void ray_pipe_free(ray_pipe_t* pipe);
+
+#endif /* RAY_PIPE_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/pivot.c b/crates/rayforce-sys/vendor/rayforce/src/ops/pivot.c
new file mode 100644
index 0000000..778123c
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/pivot.c
@@ -0,0 +1,666 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "ops/internal.h"
+
+/* ============================================================================
+ * OP_IF: ternary select  result[i] = cond[i] ? then[i] : else[i]
+ * ============================================================================ */
+
+/* Resolve row `i` of one OP_IF branch to string bytes for a RAY_STR result.
+ * A `scalar` branch (atom or 1-element vector) always resolves element 0.
+ * Accepts -RAY_STR atoms, RAY_STR vectors and SYM atoms/vectors; never
+ * returns NULL — anything unresolvable yields "" with *len == 0. */
+static const char* if_branch_str(ray_t* v, bool scalar, int64_t i, size_t* len) {
+    int64_t at = scalar ? 0 : i;
+    const char* sp = "";
+    *len = 0;
+    if (v->type == -RAY_STR) {
+        sp = ray_str_ptr(v);
+        *len = ray_str_len(v);
+    } else if (v->type == RAY_STR) {
+        sp = ray_str_vec_get(v, at, len);
+        if (!sp) { sp = ""; *len = 0; }
+    } else if (!scalar || RAY_IS_SYM(v->type)) {
+        /* A SYM atom keeps its id in the header; SYM vectors may have
+         * narrow widths (W8/W16/W32), so read them via ray_read_sym. */
+        int64_t sid = ray_is_atom(v) ? v->i64
+                    : ray_read_sym(ray_data(v), at, v->type, v->attrs);
+        ray_t* s = ray_sym_str(sid);
+        if (s) { sp = ray_str_ptr(s); *len = ray_str_len(s); }
+    }
+    return sp;
+}
+
+/* Scalar branch value as an interned symbol id for a RAY_SYM result:
+ * string atoms are interned, SYM atoms carry the id in the header, and a
+ * 1-element SYM vector is read from its data. */
+static int64_t if_scalar_sym(ray_t* v) {
+    if (v->type == -RAY_STR)
+        return ray_sym_intern(ray_str_ptr(v), ray_str_len(v));
+    if (ray_is_atom(v)) return v->i64;
+    return ray_read_sym(ray_data(v), 0, v->type, v->attrs);
+}
+
+/* Execute an OP_IF node and return the selected vector.
+ *
+ * Evaluates cond (inputs[0]), then (inputs[1]) and else (the node whose id
+ * is stored in ext->literal), then builds a vector of op->out_type.  then
+ * and else may each be a full-length vector or a scalar (atom or 1-element
+ * vector) that is broadcast.  For every output type a scalar is read from
+ * the atom header when it is an atom and from element 0 of the vector data
+ * when it is a 1-element vector.
+ *
+ * Returns the result vector; or the first NULL/error among cond, then,
+ * else (in that order) after releasing the other evaluated inputs; or a
+ * "nyi" error when the node has no ext.
+ * NOTE(review): cond is indexed as one byte per row up to the then/else
+ * length — confirm callers never pass a shorter (or atom) cond. */
+ray_t* exec_if(ray_graph_t* g, ray_op_t* op) {
+    /* cond = inputs[0], then = inputs[1], else_id stored in ext->literal */
+    ray_t* cond_v = exec_node(g, op->inputs[0]);
+    ray_t* then_v = exec_node(g, op->inputs[1]);
+
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) {
+        /* Without the ext the else branch is unknown; do not leak inputs. */
+        if (cond_v && !RAY_IS_ERR(cond_v)) ray_release(cond_v);
+        if (then_v && !RAY_IS_ERR(then_v)) ray_release(then_v);
+        return ray_error("nyi", NULL);
+    }
+    uint32_t else_id = (uint32_t)(uintptr_t)ext->literal;
+    ray_t* else_v = exec_node(g, &g->nodes[else_id]);
+
+    if (!cond_v || RAY_IS_ERR(cond_v)) {
+        if (then_v && !RAY_IS_ERR(then_v)) ray_release(then_v);
+        if (else_v && !RAY_IS_ERR(else_v)) ray_release(else_v);
+        return cond_v;
+    }
+    if (!then_v || RAY_IS_ERR(then_v)) {
+        ray_release(cond_v);
+        if (else_v && !RAY_IS_ERR(else_v)) ray_release(else_v);
+        return then_v;
+    }
+    if (!else_v || RAY_IS_ERR(else_v)) {
+        ray_release(cond_v); ray_release(then_v);
+        return else_v;
+    }
+
+    int64_t len = cond_v->len;
+    bool then_scalar = ray_is_atom(then_v) || (then_v->type > 0 && then_v->len == 1);
+    bool else_scalar = ray_is_atom(else_v) || (else_v->type > 0 && else_v->len == 1);
+    if (then_scalar && !else_scalar) len = else_v->len;
+    if (!then_scalar) len = then_v->len;
+
+    int8_t out_type = op->out_type;
+    ray_t* result = ray_vec_new(out_type, len);
+    if (!result || RAY_IS_ERR(result)) {
+        ray_release(cond_v); ray_release(then_v); ray_release(else_v);
+        return result;
+    }
+    result->len = len;
+
+    uint8_t* cond_p = (uint8_t*)ray_data(cond_v);
+
+    if (out_type == RAY_F64) {
+        double t_scalar = then_scalar ? (ray_is_atom(then_v) ? then_v->f64 : ((double*)ray_data(then_v))[0]) : 0;
+        double e_scalar = else_scalar ? (ray_is_atom(else_v) ? else_v->f64 : ((double*)ray_data(else_v))[0]) : 0;
+        double* t_arr = then_scalar ? NULL : (double*)ray_data(then_v);
+        double* e_arr = else_scalar ? NULL : (double*)ray_data(else_v);
+        double* dst = (double*)ray_data(result);
+        for (int64_t i = 0; i < len; i++)
+            dst[i] = cond_p[i] ? (t_arr ? t_arr[i] : t_scalar)
+                               : (e_arr ? e_arr[i] : e_scalar);
+    } else if (out_type == RAY_I64) {
+        int64_t t_scalar = then_scalar ? (ray_is_atom(then_v) ? then_v->i64 : ((int64_t*)ray_data(then_v))[0]) : 0;
+        int64_t e_scalar = else_scalar ? (ray_is_atom(else_v) ? else_v->i64 : ((int64_t*)ray_data(else_v))[0]) : 0;
+        int64_t* t_arr = then_scalar ? NULL : (int64_t*)ray_data(then_v);
+        int64_t* e_arr = else_scalar ? NULL : (int64_t*)ray_data(else_v);
+        int64_t* dst = (int64_t*)ray_data(result);
+        for (int64_t i = 0; i < len; i++)
+            dst[i] = cond_p[i] ? (t_arr ? t_arr[i] : t_scalar)
+                               : (e_arr ? e_arr[i] : e_scalar);
+    } else if (out_type == RAY_I32) {
+        int32_t t_scalar = then_scalar ? (ray_is_atom(then_v) ? then_v->i32 : ((int32_t*)ray_data(then_v))[0]) : 0;
+        int32_t e_scalar = else_scalar ? (ray_is_atom(else_v) ? else_v->i32 : ((int32_t*)ray_data(else_v))[0]) : 0;
+        int32_t* t_arr = then_scalar ? NULL : (int32_t*)ray_data(then_v);
+        int32_t* e_arr = else_scalar ? NULL : (int32_t*)ray_data(else_v);
+        int32_t* dst = (int32_t*)ray_data(result);
+        for (int64_t i = 0; i < len; i++)
+            dst[i] = cond_p[i] ? (t_arr ? t_arr[i] : t_scalar)
+                               : (e_arr ? e_arr[i] : e_scalar);
+    } else if (out_type == RAY_STR) {
+        /* RAY_STR: resolve each side to string data and ray_str_vec_append.
+         * Scalars may be -RAY_STR or RAY_SYM atoms, or 1-element vectors. */
+        result->len = 0; /* ray_str_vec_append manages len */
+        for (int64_t i = 0; i < len; i++) {
+            size_t sl;
+            const char* sp = cond_p[i] ? if_branch_str(then_v, then_scalar, i, &sl)
+                                       : if_branch_str(else_v, else_scalar, i, &sl);
+            result = ray_str_vec_append(result, sp, sl);
+            if (RAY_IS_ERR(result)) break;
+        }
+    } else if (out_type == RAY_SYM) {
+        /* SYM columns may have narrow widths (W8/W16/W32) — use ray_read_sym.
+         * Scalars may be string atoms that need interning. Output is always W64. */
+        int64_t t_scalar = then_scalar ? if_scalar_sym(then_v) : 0;
+        int64_t e_scalar = else_scalar ? if_scalar_sym(else_v) : 0;
+        int64_t* dst = (int64_t*)ray_data(result);
+        for (int64_t i = 0; i < len; i++) {
+            int64_t tv = then_scalar ? t_scalar
+                : ray_read_sym(ray_data(then_v), i, then_v->type, then_v->attrs);
+            int64_t ev = else_scalar ? e_scalar
+                : ray_read_sym(ray_data(else_v), i, else_v->type, else_v->attrs);
+            dst[i] = cond_p[i] ? tv : ev;
+        }
+    } else if (out_type == RAY_BOOL || out_type == RAY_U8) {
+        uint8_t t_scalar = then_scalar ? (ray_is_atom(then_v) ? then_v->b8 : ((uint8_t*)ray_data(then_v))[0]) : 0;
+        uint8_t e_scalar = else_scalar ? (ray_is_atom(else_v) ? else_v->b8 : ((uint8_t*)ray_data(else_v))[0]) : 0;
+        uint8_t* t_arr = then_scalar ? NULL : (uint8_t*)ray_data(then_v);
+        uint8_t* e_arr = else_scalar ? NULL : (uint8_t*)ray_data(else_v);
+        uint8_t* dst = (uint8_t*)ray_data(result);
+        for (int64_t i = 0; i < len; i++)
+            dst[i] = cond_p[i] ? (t_arr ? t_arr[i] : t_scalar)
+                               : (e_arr ? e_arr[i] : e_scalar);
+    } else if (out_type == RAY_TIMESTAMP || out_type == RAY_TIME || out_type == RAY_DATE) {
+        /* TIMESTAMP is 8B like I64; DATE and TIME are 4B like I32 */
+        if (out_type == RAY_TIMESTAMP) {
+            int64_t t_scalar2 = then_scalar ? (ray_is_atom(then_v) ? then_v->i64 : ((int64_t*)ray_data(then_v))[0]) : 0;
+            int64_t e_scalar2 = else_scalar ? (ray_is_atom(else_v) ? else_v->i64 : ((int64_t*)ray_data(else_v))[0]) : 0;
+            int64_t* t_arr = then_scalar ? NULL : (int64_t*)ray_data(then_v);
+            int64_t* e_arr = else_scalar ? NULL : (int64_t*)ray_data(else_v);
+            int64_t* dst = (int64_t*)ray_data(result);
+            for (int64_t i = 0; i < len; i++)
+                dst[i] = cond_p[i] ? (t_arr ? t_arr[i] : t_scalar2)
+                                   : (e_arr ? e_arr[i] : e_scalar2);
+        } else {
+            int32_t t_scalar2 = then_scalar ? (ray_is_atom(then_v) ? then_v->i32 : ((int32_t*)ray_data(then_v))[0]) : 0;
+            int32_t e_scalar2 = else_scalar ? (ray_is_atom(else_v) ? else_v->i32 : ((int32_t*)ray_data(else_v))[0]) : 0;
+            int32_t* t_arr = then_scalar ? NULL : (int32_t*)ray_data(then_v);
+            int32_t* e_arr = else_scalar ? NULL : (int32_t*)ray_data(else_v);
+            int32_t* dst = (int32_t*)ray_data(result);
+            for (int64_t i = 0; i < len; i++)
+                dst[i] = cond_p[i] ? (t_arr ? t_arr[i] : t_scalar2)
+                                   : (e_arr ? e_arr[i] : e_scalar2);
+        }
+    } else if (out_type == RAY_I16) {
+        int16_t t_scalar = then_scalar ? (ray_is_atom(then_v) ? (int16_t)then_v->i32 : ((int16_t*)ray_data(then_v))[0]) : 0;
+        int16_t e_scalar = else_scalar ? (ray_is_atom(else_v) ? (int16_t)else_v->i32 : ((int16_t*)ray_data(else_v))[0]) : 0;
+        int16_t* t_arr = then_scalar ? NULL : (int16_t*)ray_data(then_v);
+        int16_t* e_arr = else_scalar ? NULL : (int16_t*)ray_data(else_v);
+        int16_t* dst = (int16_t*)ray_data(result);
+        for (int64_t i = 0; i < len; i++)
+            dst[i] = cond_p[i] ? (t_arr ? t_arr[i] : t_scalar)
+                               : (e_arr ? e_arr[i] : e_scalar);
+    }
+
+    /* NOTE(review): an out_type not handled above falls through with the
+     * result vector never written by this function — confirm type
+     * inference never produces such an OP_IF, or turn this case into
+     * an error. */
+    ray_release(cond_v); ray_release(then_v); ray_release(else_v);
+    return result;
+}
+
+/* ============================================================================
+ * exec_pivot — single-pass hash-aggregated pivot table
+ *
+ * Groups by (index_cols, pivot_col), aggregates value_col, then unstacks
+ * pivot values into separate output columns.
+ *
+ * Parameters:
+ *   g   — graph used to look up the ext records of `op` and of its index,
+ *         pivot and value column operands.
+ *   op  — the pivot op node.
+ *   tbl — input table; NULL or an error value is returned unchanged.
+ *
+ * Result: a table holding one column per index column followed by one
+ * column per distinct non-null pivot value, or one of these errors:
+ *   "nyi"    — `op` has no ext record
+ *   "limit"  — more than 7 index columns (8 keys including the pivot key)
+ *   "domain" — an operand is not an OP_SCAN of a column found in `tbl`
+ *   "oom"    — a scratch or column allocation failed
+ *   "cancel" — ray_interrupted() reported true after ingest
+ * Zero input rows or zero groups produce ray_table_new(0).
+ *
+ * Output cells with no matching (index, pivot) group stay zero-filled.
+ * ============================================================================ */
+
+ray_t* exec_pivot(ray_graph_t* g, ray_op_t* op, ray_t* tbl) {
+    if (!tbl || RAY_IS_ERR(tbl)) return tbl;
+
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    uint8_t n_idx   = ext->pivot.n_index;
+    uint16_t agg_op = ext->pivot.agg_op;
+    int64_t nrows   = ray_table_nrows(tbl);
+
+    /* Combined keys: index_cols + pivot_col, at most 8 in total.
+     * Checked before n_idx indexes any fixed-size stack array below, and
+     * on n_idx itself so that n_idx + 1 cannot wrap the uint8_t key count. */
+    if (n_idx > 7) return ray_error("limit", "pivot: too many index columns");
+    uint8_t n_keys = (uint8_t)(n_idx + 1);
+
+    /* Resolve input columns */
+    ray_t* idx_vecs[16];
+    for (uint8_t i = 0; i < n_idx; i++) {
+        ray_op_ext_t* ie = find_ext(g, ext->pivot.index_cols[i]->id);
+        idx_vecs[i] = (ie && ie->base.opcode == OP_SCAN)
+                     ? ray_table_get_col(tbl, ie->sym) : NULL;
+        if (!idx_vecs[i]) return ray_error("domain", "pivot: index column not found");
+    }
+
+    ray_op_ext_t* pe = find_ext(g, ext->pivot.pivot_col->id);
+    ray_t* pcol = (pe && pe->base.opcode == OP_SCAN)
+                ? ray_table_get_col(tbl, pe->sym) : NULL;
+    if (!pcol) return ray_error("domain", "pivot: pivot column not found");
+
+    ray_op_ext_t* ve = find_ext(g, ext->pivot.value_col->id);
+    ray_t* vcol = (ve && ve->base.opcode == OP_SCAN)
+                ? ray_table_get_col(tbl, ve->sym) : NULL;
+    if (!vcol) return ray_error("domain", "pivot: value column not found");
+
+    if (nrows == 0) return ray_table_new(0);
+
+    /* Wide-key resolution: for RAY_GUID the HT slot holds a source row
+     * index rather than the 16 raw bytes, so phase2 dedupe and emit
+     * route wide keys through the source column (key_data[k]). */
+    bool idx_wide[8] = {0};
+    for (uint8_t k = 0; k < n_idx; k++)
+        idx_wide[k] = (idx_vecs[k]->type == RAY_GUID);
+    bool pvt_wide = (pcol->type == RAY_GUID);
+
+    /* Per-key descriptors: index keys occupy [0, n_idx), the pivot key
+     * sits at position n_idx. */
+    void*   key_data[8];
+    int8_t  key_types[8];
+    uint8_t key_attrs[8];
+    ray_t*  key_vecs[8];
+    for (uint8_t k = 0; k < n_idx; k++) {
+        key_data[k]  = ray_data(idx_vecs[k]);
+        key_types[k] = idx_vecs[k]->type;
+        key_attrs[k] = idx_vecs[k]->attrs;
+        key_vecs[k]  = idx_vecs[k];
+    }
+    key_data[n_idx]  = ray_data(pcol);
+    key_types[n_idx] = pcol->type;
+    key_attrs[n_idx] = pcol->attrs;
+    key_vecs[n_idx]  = pcol;
+
+    /* Single agg input: value column */
+    ray_t* agg_vecs[1] = { vcol };
+    uint16_t agg_ops[1] = { agg_op };
+
+    /* Compute need_flags for the agg op */
+    uint8_t need_flags = GHT_NEED_SUM; /* always need sum (used for FIRST/LAST too) */
+    if (agg_op == OP_MIN) need_flags |= GHT_NEED_MIN;
+    if (agg_op == OP_MAX) need_flags |= GHT_NEED_MAX;
+
+    ght_layout_t ly = ght_compute_layout(n_keys, 1, agg_vecs, need_flags, agg_ops, key_types);
+
+    /* Hash-aggregate all rows via the shared radix pipeline — parallel
+     * across thread-pool workers for n_scan ≥ RAY_PARALLEL_THRESHOLD,
+     * sequential single-HT for smaller inputs. */
+    ray_progress_update("pivot", "hash-aggregate", 0, (uint64_t)nrows);
+    pivot_ingest_t pg;
+    if (!pivot_ingest_run(&pg, &ly, key_data, key_types, key_attrs,
+                          key_vecs, agg_vecs, nrows)) {
+        pivot_ingest_free(&pg);
+        return ray_error("oom", NULL);
+    }
+    ray_progress_update("pivot", "dedupe", 0, (uint64_t)pg.total_grps);
+    if (ray_interrupted()) { pivot_ingest_free(&pg); return ray_error("cancel", "interrupted"); }
+    uint32_t grp_count = pg.total_grps;
+    if (grp_count == 0) { pivot_ingest_free(&pg); return ray_table_new(0); }
+
+    /* Phase 2: Collect distinct pivot values and distinct index keys.
+     * Each group row layout: [hash:8][key0:8]...[keyN-1:8][null_mask:8][accum...]
+     * where the keys region holds n_idx index keys + 1 pivot key,
+     * followed by the key-null bitmap written by group_rows_range. */
+
+    /* SQL PIVOT treats a null pivot key as "no column" — drop those groups. */
+    const uint8_t pvt_null_bit = (uint8_t)(1u << n_idx);
+
+    /* Collect distinct pivot values (linear dedupe over pv_vals; for wide
+     * pivot keys the stored value is a source row index). */
+    uint32_t pv_cap = 64, pv_count = 0;
+    ray_t* pv_hdr = NULL;
+    int64_t* pv_vals = (int64_t*)scratch_alloc(&pv_hdr, pv_cap * sizeof(int64_t));
+    if (!pv_vals) { pivot_ingest_free(&pg); return ray_error("oom", NULL); }
+
+    const char* pvt_base = pvt_wide ? (const char*)key_data[n_idx] : NULL;
+    for (uint32_t _p = 0; _p < pg.n_parts; _p++) {
+        group_ht_t* ph = &pg.part_hts[_p];
+        uint32_t pcount = ph->grp_count;
+        for (uint32_t gi_local = 0; gi_local < pcount; gi_local++) {
+            const char* row = ph->rows + (size_t)gi_local * pg.row_stride;
+            const int64_t* rkeys = (const int64_t*)(row + 8);
+            int64_t nmask = rkeys[n_keys];
+            if (nmask & pvt_null_bit) continue;
+            int64_t pval = rkeys[n_idx];
+            bool found = false;
+            for (uint32_t p = 0; p < pv_count; p++) {
+                if (pvt_wide) {
+                    if (memcmp(pvt_base + (size_t)pv_vals[p] * 16,
+                               pvt_base + (size_t)pval * 16, 16) == 0) { found = true; break; }
+                } else {
+                    if (pv_vals[p] == pval) { found = true; break; }
+                }
+            }
+            if (!found) {
+                if (pv_count >= pv_cap) {
+                    uint32_t new_cap = pv_cap * 2;
+                    int64_t* new_pv = (int64_t*)scratch_realloc(&pv_hdr,
+                        pv_cap * sizeof(int64_t), new_cap * sizeof(int64_t));
+                    /* NOTE(review): pv_hdr is not released on this path —
+                     * confirm scratch_realloc disposes of the old block on
+                     * failure; otherwise the old buffer leaks here. */
+                    if (!new_pv) { pivot_ingest_free(&pg); return ray_error("oom", NULL); }
+                    pv_vals = new_pv;
+                    pv_cap = new_cap;
+                }
+                pv_vals[pv_count++] = pval;
+            }
+        }
+    }
+
+    /* Collect distinct index keys.
+     * Flat append-only entry array + secondary open-addressed HT keyed by
+     * the hash of (idx_keys + idx_null_mask). The HT makes phase2 dedupe
+     * O(grp_count) instead of the previous O(grp_count * ix_count)
+     * linear scan which hung on large pivots.
+     * Entry layout: [hash:8 | idx_keys:8*n_idx | idx_null_mask:8]. */
+    uint32_t ix_cap = 256, ix_count = 0;
+    ray_t* ix_hdr = NULL;
+    size_t ix_entry = 8 + (size_t)n_idx * 8 + 8;
+    const uint8_t idx_null_bits = (uint8_t)((1u << n_idx) - 1u);
+    char* ix_rows = (char*)scratch_alloc(&ix_hdr, ix_cap * ix_entry);
+    if (!ix_rows) { scratch_free(pv_hdr); pivot_ingest_free(&pg); return ray_error("oom", NULL); }
+
+    /* Secondary HT: hash slot -> ix_row index; empty = UINT32_MAX.
+     * The target size is compared in 64 bits so that grp_count * 2 cannot
+     * wrap and leave the table smaller than the number of entries. */
+    uint32_t ix_ht_cap = 256;
+    while ((uint64_t)ix_ht_cap < (uint64_t)grp_count * 2 && ix_ht_cap < (1u << 30)) ix_ht_cap <<= 1;
+    ray_t* ix_ht_hdr = NULL;
+    uint32_t* ix_ht = (uint32_t*)scratch_alloc(&ix_ht_hdr, ix_ht_cap * sizeof(uint32_t));
+    if (!ix_ht) {
+        scratch_free(ix_hdr); scratch_free(pv_hdr); pivot_ingest_free(&pg);
+        return ray_error("oom", NULL);
+    }
+    memset(ix_ht, 0xFF, ix_ht_cap * sizeof(uint32_t));
+    uint32_t ix_ht_mask = ix_ht_cap - 1;
+
+    /* Map: group_id -> (ix_row, pv_idx) for result cell placement.
+     * One allocation holds both halves: grp_ix first, grp_pv second. */
+    ray_t* map_hdr = NULL;
+    uint32_t* grp_ix  = (uint32_t*)scratch_alloc(&map_hdr, (size_t)grp_count * 2 * sizeof(uint32_t));
+    if (!grp_ix) { scratch_free(ix_ht_hdr); scratch_free(ix_hdr); scratch_free(pv_hdr); pivot_ingest_free(&pg); return ray_error("oom", NULL); }
+    uint32_t* grp_pv = grp_ix + grp_count;
+
+    for (uint32_t _p = 0; _p < pg.n_parts; _p++) {
+        group_ht_t* ph = &pg.part_hts[_p];
+        uint32_t pcount = ph->grp_count;
+        uint32_t gi_base = pg.part_offsets[_p];
+        /* Progress tick at each partition boundary — time-gated so
+         * 256 small partitions do not spam the callback. */
+        ray_progress_update(NULL, NULL, gi_base, (uint64_t)grp_count);
+        for (uint32_t gi_local = 0; gi_local < pcount; gi_local++) {
+            uint32_t gi = gi_base + gi_local;
+            const char* row = ph->rows + (size_t)gi_local * pg.row_stride;
+            const int64_t* keys = (const int64_t*)(row + 8);
+            int64_t nmask = keys[n_keys];
+            if (nmask & pvt_null_bit) {
+                /* Null pivot key: mark the group as unplaced. */
+                grp_ix[gi] = UINT32_MAX;
+                grp_pv[gi] = UINT32_MAX;
+                continue;
+            }
+            int64_t idx_nmask = nmask & idx_null_bits;
+
+            /* Hash index keys only (exclude pivot key) + null mask.
+             * Wide keys (GUID) resolve actual bytes via key_data[k]. */
+            uint64_t ih = 0;
+            for (uint8_t k = 0; k < n_idx; k++) {
+                uint64_t kh;
+                if (idx_wide[k]) {
+                    const char* base = (const char*)key_data[k];
+                    kh = ray_hash_bytes(base + (size_t)keys[k] * 16, 16);
+                } else if (key_types[k] == RAY_F64) {
+                    kh = ray_hash_f64(*(const double*)&keys[k]);
+                } else {
+                    kh = ray_hash_i64(keys[k]);
+                }
+                ih = (k == 0) ? kh : ray_hash_combine(ih, kh);
+            }
+            if (idx_nmask) ih = ray_hash_combine(ih, ray_hash_i64(idx_nmask));
+
+            /* Open-addressed HT probe. On match, reuse; else insert. */
+            uint32_t ix_row = UINT32_MAX;
+            uint32_t slot = (uint32_t)(ih & ix_ht_mask);
+            for (;;) {
+                uint32_t ent = ix_ht[slot];
+                if (ent == UINT32_MAX) break; /* empty → insert below */
+                const char* ix_entry_p = ix_rows + (size_t)ent * ix_entry;
+                if (*(const uint64_t*)ix_entry_p == ih) {
+                    const int64_t* ekeys = (const int64_t*)(ix_entry_p + 8);
+                    bool eq = true;
+                    for (uint8_t k = 0; k < n_idx && eq; k++) {
+                        if (idx_wide[k]) {
+                            const char* base = (const char*)key_data[k];
+                            eq = (memcmp(base + (size_t)ekeys[k] * 16,
+                                          base + (size_t)keys[k] * 16, 16) == 0);
+                        } else {
+                            eq = (ekeys[k] == keys[k]);
+                        }
+                    }
+                    int64_t ent_nmask;
+                    memcpy(&ent_nmask, ix_entry_p + 8 + (size_t)n_idx * 8, 8);
+                    if (eq && ent_nmask == idx_nmask) { ix_row = ent; break; }
+                }
+                slot = (slot + 1) & ix_ht_mask;
+            }
+            if (ix_row == UINT32_MAX) {
+                if (ix_count >= ix_cap) {
+                    uint32_t new_cap = ix_cap * 2;
+                    char* new_rows = (char*)scratch_realloc(&ix_hdr,
+                        ix_cap * ix_entry, new_cap * ix_entry);
+                    if (!new_rows) {
+                        /* NOTE(review): ix_hdr is not released on this path —
+                         * confirm scratch_realloc disposes of the old block
+                         * on failure; otherwise the old buffer leaks here. */
+                        scratch_free(map_hdr); scratch_free(ix_ht_hdr);
+                        scratch_free(pv_hdr); pivot_ingest_free(&pg);
+                        return ray_error("oom", NULL);
+                    }
+                    ix_rows = new_rows;
+                    ix_cap = new_cap;
+                }
+                ix_row = ix_count++;
+                char* dst = ix_rows + (size_t)ix_row * ix_entry;
+                *(uint64_t*)dst = ih;
+                memcpy(dst + 8, keys, (size_t)n_idx * 8);
+                memcpy(dst + 8 + (size_t)n_idx * 8, &idx_nmask, 8);
+                ix_ht[slot] = ix_row;
+            }
+
+            /* Find pivot column index. For wide pivot keys both slot values
+             * are source row indices — resolve to actual bytes for compare,
+             * otherwise duplicate GUID pivot values map to the wrong column. */
+            int64_t pval = keys[n_idx];
+            uint32_t pv_idx = UINT32_MAX;
+            for (uint32_t p = 0; p < pv_count; p++) {
+                if (pvt_wide) {
+                    if (memcmp(pvt_base + (size_t)pv_vals[p] * 16,
+                               pvt_base + (size_t)pval * 16, 16) == 0) { pv_idx = p; break; }
+                } else {
+                    if (pv_vals[p] == pval) { pv_idx = p; break; }
+                }
+            }
+
+            grp_ix[gi] = ix_row;
+            grp_pv[gi] = pv_idx;
+        }
+    }
+
+    /* Phase 3: Build output table */
+    ray_progress_update("pivot", "scatter", 0, (uint64_t)pv_count);
+    bool val_is_f64 = vcol->type == RAY_F64;
+    int8_t out_agg_type;
+    switch (agg_op) {
+        case OP_AVG:   out_agg_type = RAY_F64; break;
+        case OP_COUNT: out_agg_type = RAY_I64; break;
+        case OP_SUM:   out_agg_type = val_is_f64 ? RAY_F64 : RAY_I64; break;
+        default:       out_agg_type = vcol->type; break;
+    }
+
+    int64_t out_ncols = (int64_t)n_idx + (int64_t)pv_count;
+    ray_t* result = ray_table_new(out_ncols);
+    if (!result || RAY_IS_ERR(result)) goto pivot_cleanup;
+
+    /* Index columns */
+    for (uint8_t k = 0; k < n_idx; k++) {
+        ray_t* new_col = col_vec_new(idx_vecs[k], (int64_t)ix_count);
+        if (!new_col || RAY_IS_ERR(new_col)) { ray_release(result); result = ray_error("oom", NULL); goto pivot_cleanup; }
+        new_col->len = (int64_t)ix_count;
+        uint8_t esz = col_esz(idx_vecs[k]);
+        int8_t kt = idx_vecs[k]->type;
+        const char* src_base = idx_wide[k] ? (const char*)key_data[k] : NULL;
+        for (uint32_t r = 0; r < ix_count; r++) {
+            const char* ix_entry_p = ix_rows + r * ix_entry;
+            int64_t kv = ((const int64_t*)(ix_entry_p + 8))[k];
+            int64_t ent_nmask;
+            memcpy(&ent_nmask, ix_entry_p + 8 + (size_t)n_idx * 8, 8);
+            if (ent_nmask & (int64_t)(1u << k)) {
+                ray_vec_set_null(new_col, (int64_t)r, true);
+                continue;
+            }
+            if (idx_wide[k]) {
+                /* kv is a source row index; copy the 16 raw bytes. */
+                memcpy((char*)ray_data(new_col) + (size_t)r * esz,
+                       src_base + (size_t)kv * 16, 16);
+            } else if (kt == RAY_F64) {
+                memcpy((char*)ray_data(new_col) + (size_t)r * esz, &kv, 8);
+            } else {
+                write_col_i64(ray_data(new_col), (int64_t)r, kv, kt, new_col->attrs);
+            }
+        }
+        if (idx_vecs[k]->type == RAY_STR)
+            col_propagate_str_pool(new_col, idx_vecs[k]);
+
+        /* ie was validated as non-NULL when the input columns were resolved. */
+        ray_op_ext_t* ie = find_ext(g, ext->pivot.index_cols[k]->id);
+        result = ray_table_add_col(result, ie->sym, new_col);
+        ray_release(new_col);
+        if (RAY_IS_ERR(result)) goto pivot_cleanup;
+    }
+
+    /* Value columns — one per distinct pivot value */
+    {
+        int8_t s = ly.agg_val_slot[0]; /* single agg input -> slot 0 */
+        for (uint32_t p = 0; p < pv_count; p++) {
+            ray_t* new_col = (out_agg_type == vcol->type)
+                            ? col_vec_new(vcol, (int64_t)ix_count)
+                            : ray_vec_new(out_agg_type, (int64_t)ix_count);
+            if (!new_col || RAY_IS_ERR(new_col)) { ray_release(result); result = ray_error("oom", NULL); goto pivot_cleanup; }
+            new_col->len = (int64_t)ix_count;
+
+            /* Initialize with zero (missing cells get 0) */
+            memset(ray_data(new_col), 0, (size_t)ix_count * (out_agg_type == RAY_F64 ? 8 : (size_t)col_esz(new_col)));
+
+            /* Scatter every group whose pivot value maps to column p. */
+            for (uint32_t _pp = 0; _pp < pg.n_parts; _pp++) {
+                group_ht_t* ph = &pg.part_hts[_pp];
+                uint32_t pcount = ph->grp_count;
+                uint32_t gi_base = pg.part_offsets[_pp];
+                for (uint32_t gi_local = 0; gi_local < pcount; gi_local++) {
+                    uint32_t gi = gi_base + gi_local;
+                    if (grp_pv[gi] != p) continue;
+                    uint32_t r = grp_ix[gi];
+                    const char* row = ph->rows + (size_t)gi_local * pg.row_stride;
+                    /* The first 8 bytes of the row are consumed as the
+                     * group's row count (COUNT, and the AVG divisor).
+                     * NOTE(review): the phase-2 layout comment labels this
+                     * word "hash" — confirm which description is current. */
+                    int64_t cnt = *(const int64_t*)(const void*)row;
+
+                    if (out_agg_type == RAY_F64) {
+                        double v;
+                        switch (agg_op) {
+                            case OP_SUM:
+                                v = val_is_f64 ? ROW_RD_F64(row, ly.off_sum, s)
+                                               : (double)ROW_RD_I64(row, ly.off_sum, s);
+                                break;
+                            case OP_AVG:
+                                v = val_is_f64 ? ROW_RD_F64(row, ly.off_sum, s) / cnt
+                                               : (double)ROW_RD_I64(row, ly.off_sum, s) / cnt;
+                                break;
+                            case OP_MIN:
+                                v = val_is_f64 ? ROW_RD_F64(row, ly.off_min, s)
+                                               : (double)ROW_RD_I64(row, ly.off_min, s);
+                                break;
+                            case OP_MAX:
+                                v = val_is_f64 ? ROW_RD_F64(row, ly.off_max, s)
+                                               : (double)ROW_RD_I64(row, ly.off_max, s);
+                                break;
+                            case OP_FIRST: case OP_LAST:
+                                v = val_is_f64 ? ROW_RD_F64(row, ly.off_sum, s)
+                                               : (double)ROW_RD_I64(row, ly.off_sum, s);
+                                break;
+                            default: v = 0.0; break;
+                        }
+                        ((double*)ray_data(new_col))[r] = v;
+                    } else {
+                        int64_t v;
+                        switch (agg_op) {
+                            case OP_SUM:   v = ROW_RD_I64(row, ly.off_sum, s); break;
+                            case OP_COUNT: v = cnt; break;
+                            case OP_MIN:   v = ROW_RD_I64(row, ly.off_min, s); break;
+                            case OP_MAX:   v = ROW_RD_I64(row, ly.off_max, s); break;
+                            case OP_FIRST: case OP_LAST: v = ROW_RD_I64(row, ly.off_sum, s); break;
+                            default:       v = 0; break;
+                        }
+                        write_col_i64(ray_data(new_col), (int64_t)r, v, out_agg_type, new_col->attrs);
+                    }
+                }
+            }
+
+            /* Column name from pivot value — match pivot_val_to_sym semantics */
+            int64_t pval = pv_vals[p];
+            int64_t col_sym;
+            if (pcol->type == RAY_SYM) {
+                col_sym = pval;
+            } else if (pvt_wide) {
+                /* GUID: format 16 bytes as xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx.
+                 * pval is a source row index into pvt_base. */
+                static const char hex[] = "0123456789abcdef";
+                static const int groups[] = {4, 2, 2, 2, 6};
+                char buf[37];
+                const uint8_t* bytes = (const uint8_t*)pvt_base + (size_t)pval * 16;
+                int pos = 0, bpos = 0;
+                for (int seg = 0; seg < 5; seg++) {
+                    if (seg > 0) buf[bpos++] = '-';
+                    for (int j = 0; j < groups[seg]; j++) {
+                        buf[bpos++] = hex[bytes[pos] >> 4];
+                        buf[bpos++] = hex[bytes[pos] & 0x0F];
+                        pos++;
+                    }
+                }
+                col_sym = ray_sym_intern(buf, (size_t)bpos);
+            } else {
+                char buf[128];
+                int len = 0;
+                int8_t pt = key_types[n_idx];
+                if (pt == RAY_F64) {
+                    double fv;
+                    memcpy(&fv, &pval, 8);
+                    /* Normalize -0.0 so it shares a column name with 0.0. */
+                    if (fv == 0.0 && signbit(fv)) fv = 0.0;
+                    len = snprintf(buf, sizeof(buf), "%g", fv);
+                } else if (pt == RAY_BOOL) {
+                    len = snprintf(buf, sizeof(buf), "%s", pval ? "true" : "false");
+                } else if (pt == RAY_I64 || pt == RAY_I32 || pt == RAY_I16 ||
+                           pt == RAY_DATE || pt == RAY_TIME || pt == RAY_TIMESTAMP) {
+                    /* long long, not long: long is 32 bits on LLP64 targets
+                     * and would truncate 64-bit pivot values. */
+                    len = snprintf(buf, sizeof(buf), "%lld", (long long)pval);
+                } else {
+                    len = snprintf(buf, sizeof(buf), "col%lld", (long long)pval);
+                }
+                col_sym = ray_sym_intern(buf, (size_t)len);
+            }
+
+            result = ray_table_add_col(result, col_sym, new_col);
+            ray_release(new_col);
+            if (RAY_IS_ERR(result)) goto pivot_cleanup;
+        }
+    }
+
+pivot_cleanup:
+    scratch_free(map_hdr);
+    scratch_free(ix_ht_hdr);
+    scratch_free(ix_hdr);
+    scratch_free(pv_hdr);
+    pivot_ingest_free(&pg);
+    return result;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/plan.c b/crates/rayforce-sys/vendor/rayforce/src/ops/plan.c
new file mode 100644
index 0000000..ba92a5f
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/plan.c
@@ -0,0 +1,31 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "plan.h"
+
+/* --------------------------------------------------------------------------
+ * Plan: linearize DAG into execution order
+ *
+ * For now, the executor recursively evaluates nodes (exec.c). This file
+ * is a placeholder for future topological sort + pipeline planning.
+ * -------------------------------------------------------------------------- */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/plan.h b/crates/rayforce-sys/vendor/rayforce/src/ops/plan.h
new file mode 100644
index 0000000..0d0edfc
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/plan.h
@@ -0,0 +1,29 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_PLAN_H
+#define RAY_PLAN_H
+
+#include "ops.h"
+
+#endif /* RAY_PLAN_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/query.c b/crates/rayforce-sys/vendor/rayforce/src/ops/query.c
new file mode 100644
index 0000000..17aaf0f
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/query.c
@@ -0,0 +1,6329 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+/**   Query bridge: select, update, insert, upsert, join operations.
+ *   Extracted from eval.c.
+ */
+
+#include "lang/internal.h"
+#include "lang/eval.h"
+#include "lang/env.h"
+#include "ops/ops.h"
+#include "ops/internal.h"
+#include "ops/hash.h"
+#include "ops/temporal.h"
+#include "table/sym.h"
+#include "table/dict.h"
+#include "mem/sys.h"
+
+#include <string.h>
+#include <math.h>
+#include <inttypes.h>
+
+/* ══════════════════════════════════════════
+ * Select query — DAG bridge
+ * ══════════════════════════════════════════ */
+
+/* Fetch the unevaluated value expression stored under `key` in a
+ * select-clause dict.  Yields NULL when `dict` is NULL or is not a
+ * RAY_DICT; otherwise yields whatever ray_dict_probe_sym_borrowed
+ * reports for the interned key. */
+static ray_t* dict_get(ray_t* dict, const char* key) {
+    if (dict && dict->type == RAY_DICT) {
+        int64_t sym = ray_sym_intern(key, strlen(key));
+        return ray_dict_probe_sym_borrowed(dict, sym);
+    }
+    return NULL;
+}
+
+/* Present a RAY_DICT (SYM key vector + LIST of values) as a transient
+ * interleaved [k0,v0,k1,v1,...] pointer array, so the dict-walking loops
+ * in ray_select_fn and friends can iterate pairs directly.
+ *
+ * Buffers are supplied by the caller (stack-local, sized by
+ * DICT_VIEW_MAX): `key_atoms` needs room for DICT_VIEW_MAX entries and
+ * `out_elems` for 2 * DICT_VIEW_MAX.  Each key is synthesized in
+ * `key_atoms` as a -RAY_SYM atom; each value pointer is borrowed from
+ * the dict's vals list.
+ *
+ * `*out_n` receives:
+ *    2 * pair count  on success,
+ *    0               when `d` is not a dict of the expected shape,
+ *   -1               when the dict holds more than DICT_VIEW_MAX pairs.
+ * The -1 case exists so call sites can return a "domain" error instead
+ * of writing past the buffers; an earlier, unguarded version of this
+ * helper corrupted the stack on user-controlled dicts with > 64 pairs. */
+#define DICT_VIEW_MAX 256
+static void dict_pair_view(ray_t* d, ray_t* key_atoms, ray_t** out_elems, int64_t* out_n) {
+    *out_n = 0;
+    if (d == NULL || d->type != RAY_DICT) return;
+
+    ray_t* k_vec  = ray_dict_keys(d);
+    ray_t* v_list = ray_dict_vals(d);
+    bool keys_ok = k_vec  && k_vec->type  == RAY_SYM;
+    bool vals_ok = v_list && v_list->type == RAY_LIST;
+    if (!(keys_ok && vals_ok)) return;
+
+    int64_t pairs = k_vec->len;
+    if (pairs > DICT_VIEW_MAX) {
+        /* Too many pairs for the caller's buffers: flag overflow. */
+        *out_n = -1;
+        return;
+    }
+
+    void*   sym_base = ray_data(k_vec);
+    ray_t** val_ptrs = (ray_t**)ray_data(v_list);
+    ray_t*  atom = key_atoms;
+    ray_t** slot = out_elems;
+    for (int64_t i = 0; i < pairs; i++, atom++, slot += 2) {
+        memset(atom, 0, sizeof(ray_t));
+        atom->type = -RAY_SYM;
+        atom->i64  = ray_read_sym(sym_base, i, RAY_SYM, k_vec->attrs);
+        slot[0] = atom;          /* synthesized key atom */
+        slot[1] = val_ptrs[i];   /* borrowed value       */
+    }
+    *out_n = pairs * 2;
+}
+
+/* Declares the three locals a dict view named `name` needs:
+ *   name##_keybuf — DICT_VIEW_MAX ray_t slots for the synthesized key atoms,
+ *   name          — 2 * DICT_VIEW_MAX pointers (interleaved key/value view),
+ *   name##_n      — element count filled in by DICT_VIEW_OPEN.
+ * The expansion ends without a semicolon; the use site supplies it. */
+#define DICT_VIEW_DECL(name)                            \
+    ray_t   name##_keybuf[DICT_VIEW_MAX];               \
+    ray_t*  name[DICT_VIEW_MAX * 2];                    \
+    int64_t name##_n
+/* Fills the view declared by DICT_VIEW_DECL(name) from dict `d` by
+ * calling dict_pair_view with the matching buffers. */
+#define DICT_VIEW_OPEN(d, name)                          \
+    dict_pair_view((d), name##_keybuf, name, &name##_n)
+/* Returns true if the open exceeded DICT_VIEW_MAX — caller should
+ * `ray_release(tbl); return ray_error("domain", "clause too big");`. */
+#define DICT_VIEW_OVERFLOW(name) ((name##_n) < 0)
+
+/* Build an owned RAY_LIST (rc=1) holding the dict's entries interleaved
+ * as [k0_atom, v0, k1_atom, v1, …].  Select's group-by aggregation paths
+ * were written against the old in-place pair-array form of grouping
+ * output and consume this shape.
+ *
+ * Keys taken from a typed key vector (sym, i64, f64, str, guid, …) are
+ * boxed into fresh atoms; keys taken from a RAY_LIST are retained
+ * borrows.  Either way the local reference is dropped again after the
+ * append.  Key vector types without a boxing rule contribute a NULL key.
+ * Values come from the vals list when it is a RAY_LIST, else NULL.
+ *
+ * Returns a "type" error when `d` is not a RAY_DICT. */
+static ray_t* groups_to_pair_list(ray_t* d) {
+    if (!d || d->type != RAY_DICT) return ray_error("type", NULL);
+
+    ray_t* keys = ray_dict_keys(d);
+    ray_t* vals = ray_dict_vals(d);
+    int64_t n = keys ? keys->len : 0;
+
+    ray_t* out = ray_list_new(n * 2);
+    if (!out) return ray_error("oom", NULL);
+    if (RAY_IS_ERR(out)) return out;
+
+    ray_t** vptrs = NULL;
+    if (vals && vals->type == RAY_LIST) vptrs = (ray_t**)ray_data(vals);
+
+    /* n > 0 implies keys != NULL, so the loop body may dereference it. */
+    for (int64_t i = 0; i < n; i++) {
+        void* base = ray_data(keys);
+        ray_t* key_atom = NULL;
+        switch (keys->type) {
+            case RAY_LIST:
+                key_atom = ((ray_t**)base)[i];
+                if (key_atom) ray_retain(key_atom);
+                break;
+            case RAY_SYM:
+                key_atom = ray_sym(ray_read_sym(base, i, RAY_SYM, keys->attrs));
+                break;
+            case RAY_I64:
+            case RAY_TIMESTAMP:
+                key_atom = ray_i64(((int64_t*)base)[i]);
+                break;
+            case RAY_I32:
+            case RAY_DATE:
+            case RAY_TIME:
+                key_atom = ray_i32(((int32_t*)base)[i]);
+                break;
+            case RAY_I16:
+                key_atom = ray_i16(((int16_t*)base)[i]);
+                break;
+            case RAY_BOOL:
+            case RAY_U8:
+                key_atom = ray_u8(((uint8_t*)base)[i]);
+                break;
+            case RAY_F64:
+                key_atom = ray_f64(((double*)base)[i]);
+                break;
+            case RAY_STR: {
+                size_t slen = 0;
+                const char* sptr = ray_str_vec_get(keys, i, &slen);
+                if (sptr) key_atom = ray_str(sptr, slen);
+                else      key_atom = ray_str("", 0);
+                break;
+            }
+            case RAY_GUID:
+                key_atom = ray_guid(((uint8_t*)base) + i * 16);
+                break;
+            default:
+                break;
+        }
+
+        out = ray_list_append(out, key_atom);
+        if (key_atom) ray_release(key_atom);
+
+        out = ray_list_append(out, vptrs ? vptrs[i] : NULL);
+    }
+    return out;
+}
+
+/* Map a Rayfall builtin name to a DAG binary op constructor */
+typedef ray_op_t* (*dag_binary_ctor)(ray_graph_t*, ray_op_t*, ray_op_t*);
+typedef ray_op_t* (*dag_unary_ctor)(ray_graph_t*, ray_op_t*);
+
+/* Look up the DAG constructor for the binary builtin named by `sym_id`.
+ * The symbol text is compared byte-exact (length + memcmp) against the
+ * table below; returns NULL when the symbol has no string or names no
+ * binary op known here. */
+static dag_binary_ctor resolve_binary_dag(int64_t sym_id) {
+    static const struct {
+        const char*     text;
+        size_t          n;
+        dag_binary_ctor make;
+    } binary_ops[] = {
+        { "+",      1, ray_add    },
+        { "-",      1, ray_sub    },
+        { "*",      1, ray_mul    },
+        { "/",      1, ray_div    },
+        { "%",      1, ray_mod    },
+        { ">",      1, ray_gt     },
+        { "<",      1, ray_lt     },
+        { ">=",     2, ray_ge     },
+        { "<=",     2, ray_le     },
+        { "==",     2, ray_eq     },
+        { "!=",     2, ray_ne     },
+        { "or",     2, ray_or     },
+        { "in",     2, ray_in     },
+        { "and",    3, ray_and    },
+        { "like",   4, ray_like   },
+        { "ilike",  5, ray_ilike  },
+        { "not-in", 6, ray_not_in },
+    };
+
+    ray_t* sym_text = ray_sym_str(sym_id);
+    if (!sym_text) return NULL;
+
+    const char* name = ray_str_ptr(sym_text);
+    size_t len = ray_str_len(sym_text);
+    for (size_t i = 0; i < sizeof(binary_ops) / sizeof(binary_ops[0]); i++) {
+        if (binary_ops[i].n != len) continue;
+        if (memcmp(name, binary_ops[i].text, len) == 0) return binary_ops[i].make;
+    }
+    return NULL;
+}
+
+/* Look up the DAG constructor for the unary builtin named by `sym_id`.
+ * The symbol text is compared byte-exact (length + memcmp) against the
+ * table below; returns NULL when the symbol has no string or names no
+ * unary op known here. */
+static dag_unary_ctor resolve_unary_dag(int64_t sym_id) {
+    /* NOTE: no DAG wiring for nil?/isnull yet.  The eval-level
+     * builtin `nil?` (src/lang/eval.c:2029) is atom-only — it
+     * returns false when applied to a column vec.  OP_ISNULL in
+     * the DAG is per-element.  Wiring `nil?` here would diverge
+     * from the eval fallback.  A proper pass should first add an
+     * element-wise null-check builtin at eval level, then map it
+     * here. */
+    static const struct {
+        const char*    text;
+        size_t         n;
+        dag_unary_ctor make;
+    } unary_ops[] = {
+        { "neg",    3, ray_neg      },
+        { "not",    3, ray_not      },
+        { "abs",    3, ray_abs      },
+        { "exp",    3, ray_exp_op   },
+        { "log",    3, ray_log_op   },
+        { "ceil",   4, ray_ceil_op  },
+        { "sqrt",   4, ray_sqrt_op  },
+        { "trim",   4, ray_trim_op  },
+        { "floor",  5, ray_floor_op },
+        { "round",  5, ray_round_op },
+        { "upper",  5, ray_upper    },
+        { "lower",  5, ray_lower    },
+        { "strlen", 6, ray_strlen   },
+    };
+
+    ray_t* sym_text = ray_sym_str(sym_id);
+    if (!sym_text) return NULL;
+
+    const char* name = ray_str_ptr(sym_text);
+    size_t len = ray_str_len(sym_text);
+    for (size_t i = 0; i < sizeof(unary_ops) / sizeof(unary_ops[0]); i++) {
+        if (unary_ops[i].n != len) continue;
+        if (memcmp(name, unary_ops[i].text, len) == 0) return unary_ops[i].make;
+    }
+    return NULL;
+}
+
+/* Translate a Rayfall aggregation name into its DAG opcode.
+ * Several spellings share an opcode ("dev"/"stddev", "dev_pop"/
+ * "stddev_pop").  Returns 0 when the symbol has no string or is not a
+ * recognized aggregation name. */
+static uint16_t resolve_agg_opcode(int64_t sym_id) {
+    static const struct {
+        const char* text;
+        size_t      n;
+        uint16_t    opcode;
+    } agg_names[] = {
+        { "sum",         3, OP_SUM        },
+        { "avg",         3, OP_AVG        },
+        { "min",         3, OP_MIN        },
+        { "max",         3, OP_MAX        },
+        { "dev",         3, OP_STDDEV     },
+        { "var",         3, OP_VAR        },
+        { "prod",        4, OP_PROD       },
+        { "last",        4, OP_LAST       },
+        { "count",       5, OP_COUNT      },
+        { "first",       5, OP_FIRST      },
+        { "stddev",      6, OP_STDDEV     },
+        { "dev_pop",     7, OP_STDDEV_POP },
+        { "var_pop",     7, OP_VAR_POP    },
+        { "stddev_pop", 10, OP_STDDEV_POP },
+    };
+
+    ray_t* sym_text = ray_sym_str(sym_id);
+    if (!sym_text) return 0;
+
+    const char* name = ray_str_ptr(sym_text);
+    size_t len = ray_str_len(sym_text);
+    for (size_t i = 0; i < sizeof(agg_names) / sizeof(agg_names[0]); i++) {
+        if (agg_names[i].n != len) continue;
+        if (memcmp(name, agg_names[i].text, len) == 0) return agg_names[i].opcode;
+    }
+    return 0;
+}
+
+/* Apply sort (asc/desc) and take clauses to a materialized result table.
+ * Used by eval-level paths that bypass the DAG (e.g., LIST/STR group keys).
+ * Sorting runs through a temporary DAG (per-column direction flags); take
+ * goes through ray_take_fn.  `result` is consumed unless returned as-is. */
+static ray_t* apply_sort_take(ray_t* result, ray_t** dict_elems, int64_t dict_n,
+                              int64_t asc_id, int64_t desc_id, int64_t take_id) {
+    if (!result || RAY_IS_ERR(result)) return result;
+
+    /* Check for sort/take clauses (dict_elems holds key/value pairs) */
+    bool has_sort = false;
+    ray_t* take_val_expr = NULL;
+    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
+        int64_t kid = dict_elems[i]->i64;
+        if (kid == asc_id || kid == desc_id) has_sort = true;
+        if (kid == take_id) take_val_expr = dict_elems[i + 1];
+    }
+    if (!has_sort && !take_val_expr) return result;
+
+    /* Build temporary DAG on the materialized result */
+    ray_graph_t* g = ray_graph_new(result);
+    if (!g) return result;
+    ray_op_t* root = ray_const_table(g, result);
+    if (!root) { ray_graph_free(g); ray_release(result); return ray_error("oom", NULL); }
+
+    /* Sort.  Ops are tracked by node ID and re-resolved right before ray_sort_op:
+     * each ray_scan may grow g->nodes and move ops handed out earlier (root too). */
+    if (has_sort) {
+        uint32_t root_id = root->id, key_ids[16];
+        uint8_t  sort_descs[16], n_sort = 0;
+        bool     bad_key = false;   /* symbol without text, or scan failed */
+        for (int64_t i = 0; i + 1 < dict_n && n_sort < 16 && !bad_key; i += 2) {
+            int64_t kid = dict_elems[i]->i64;
+            if (kid != asc_id && kid != desc_id) continue;
+            uint8_t is_desc = (kid == asc_id) ? 0 : 1;
+            ray_t* val = dict_elems[i + 1];
+            bool    one = (val->type == -RAY_SYM);   /* single column symbol */
+            int64_t cnt = one ? 1 : ((ray_is_vec(val) && val->type == RAY_SYM) ? val->len : 0);
+            for (int64_t c = 0; c < cnt && n_sort < 16; c++) {
+                int64_t sid = one ? val->i64 : ray_read_sym(ray_data(val), c, val->type, val->attrs);
+                ray_t* s = ray_sym_str(sid);
+                ray_op_t* key = s ? ray_scan(g, ray_str_ptr(s)) : NULL;
+                if (!key) { bad_key = true; break; }
+                key_ids[n_sort] = key->id;
+                sort_descs[n_sort++] = is_desc;
+            }
+        }
+        if (bad_key) { ray_graph_free(g); ray_release(result); return ray_error("domain", NULL); }
+        if (n_sort > 0) {
+            ray_op_t* sort_keys[16];
+            for (uint8_t k = 0; k < n_sort; k++) sort_keys[k] = &g->nodes[key_ids[k]];
+            root = ray_sort_op(g, &g->nodes[root_id], sort_keys, sort_descs, NULL, n_sort);
+        }
+    }
+
+    /* Take: avoid the DAG ray_head/ray_tail op — it can't handle
+     * tables with LIST columns (from non-agg scatter).  Use
+     * ray_take_fn, but convert the atom form into a `[start amount]`
+     * range so we get CLAMP semantics (group-by take),
+     * not the wrap/pad behavior of atom-n take on a short table. */
+    ray_t* take_range   = NULL;    /* [start amount] literal form */
+    int    take_is_atom = 0;
+    int64_t atom_n      = 0;
+    if (take_val_expr) {
+        ray_t* tv = ray_eval(take_val_expr);
+        if (!tv || RAY_IS_ERR(tv)) {
+            ray_graph_free(g); ray_release(result);
+            return tv ? tv : ray_error("domain", NULL);
+        }
+        if (ray_is_atom(tv) && (tv->type == -RAY_I64 || tv->type == -RAY_I32)) {
+            atom_n = (tv->type == -RAY_I64) ? tv->i64 : tv->i32;
+            take_is_atom = 1;
+            ray_release(tv);
+        } else if (ray_is_vec(tv) && (tv->type == RAY_I64 || tv->type == RAY_I32) && tv->len == 2) {
+            take_range = tv;
+        } else {
+            ray_release(tv); ray_graph_free(g); ray_release(result);
+            return ray_error("domain", NULL);
+        }
+    }
+
+    root = ray_optimize(g, root);
+    ray_t* sorted = ray_execute(g, root);
+    ray_graph_free(g);
+    ray_release(result);
+
+    if (take_is_atom && sorted && !RAY_IS_ERR(sorted)) {
+        /* Build [start, amount] so ray_take_fn uses its range
+         * branch, which clamps to the available length. */
+        int64_t nrows = (sorted->type == RAY_TABLE)
+                      ? ray_table_nrows(sorted)
+                      : (ray_is_vec(sorted) ? sorted->len : 0);
+        int64_t start, amount;
+        if (atom_n >= 0) {
+            start  = 0;
+            amount = atom_n < nrows ? atom_n : nrows;
+        } else {
+            /* Tail take.  Compare first: negating INT64_MIN would overflow. */
+            amount = (atom_n > -nrows) ? -atom_n : nrows;
+            start  = nrows - amount;
+        }
+        ray_t* rng = ray_vec_new(RAY_I64, 2);
+        if (!rng || RAY_IS_ERR(rng)) {
+            ray_release(sorted);
+            return rng ? rng : ray_error("oom", NULL);
+        }
+        ((int64_t*)ray_data(rng))[0] = start;
+        ((int64_t*)ray_data(rng))[1] = amount;
+        rng->len = 2;
+        ray_t* sliced = ray_take_fn(sorted, rng);
+        ray_release(sorted);
+        ray_release(rng);
+        return sliced;
+    }
+    if (take_range && sorted && !RAY_IS_ERR(sorted)) {
+        ray_t* sliced = ray_take_fn(sorted, take_range);
+        ray_release(sorted);
+        ray_release(take_range);
+        return sliced;
+    }
+    if (take_range) ray_release(take_range);
+    return sorted;
+}
+
+/* --------------------------------------------------------------------------
+ * Compile-time local env helpers for lambda / let inlining.
+ *
+ * compile_expr_dag hangs a small stack of {formal_sym_id → node_id}
+ * bindings on the graph.  When the recursive walker encounters a
+ * name reference, it checks the env first; if the name is bound,
+ * return &g->nodes[node_id] — otherwise fall through to ray_scan.
+ *
+ * Store IDs, not pointers: g->nodes is a dynamically-resized array,
+ * and any realloc between push and lookup would dangle stored
+ * pointers.  IDs are stable across reallocs; we re-resolve
+ * &g->nodes[id] on every lookup.
+ *
+ * Shadowing is automatic: nested lambda / let pushes appear later in
+ * the stack, and cexpr_env_lookup walks top-down so the innermost
+ * binding wins.  Pops are counted — never partial rewinds.
+ *
+ * No retain/release: op nodes live in g->nodes and are freed
+ * uniformly by ray_graph_free.
+ * -------------------------------------------------------------------------- */
+/* Innermost binding of `sym`, re-resolved to a node pointer; NULL if unbound. */
+static ray_op_t* cexpr_env_lookup(ray_graph_t* g, int64_t sym) {
+    int slot = g->cexpr_env_top;
+    while (--slot >= 0 && g->cexpr_env[slot].sym != sym) { /* keep walking down */ }
+    return (slot >= 0) ? &g->nodes[g->cexpr_env[slot].node_id] : NULL;
+}
+
+static bool cexpr_env_push(ray_graph_t* g, int64_t sym, ray_op_t* node) {
+    if (!(g->cexpr_env_top < 32)) return false;   /* all 32 slots are taken */
+    const int slot = g->cexpr_env_top++;          /* claim the next free slot */
+    g->cexpr_env[slot].node_id = node->id;        /* keep the ID, not the pointer */
+    g->cexpr_env[slot].sym     = sym;
+    return true;
+}
+
+static void cexpr_env_pop(ray_graph_t* g, int n) {
+    int remaining = g->cexpr_env_top - n;
+    g->cexpr_env_top = (remaining > 0) ? remaining : 0;  /* defensive clamp at empty */
+}
+
+/* Pointer-stability convention: re-resolve a ray_op_t* from its stable
+ * node ID, as &g->nodes[id], whenever the pointer has been held across
+ * another DAG-building call (which may grow g->nodes via graph_alloc_node
+ * and invalidate all previously-returned pointers).  The ID is stable;
+ * only the backing address may change. */
+
+/* Compile a Rayfall AST expression into a DAG node; returns NULL when `expr` cannot be lowered. */
+static ray_op_t* compile_expr_dag(ray_graph_t* g, ray_t* expr) {
+    if (!expr) return NULL;
+
+    /* Atom literal → const node.  Handle non-null scalar literals
+     * via the dedicated ctors that carry just the raw value; typed
+     * null atoms (e.g. `0Nl`, `0Nf`) must go through ray_const_atom
+     * so the null flag in atom->nullmap rides along — otherwise
+     * downstream comparisons lose the null-ness and fall back to
+     * sentinel-value equality. */
+    if (expr->type == -RAY_I64 && !RAY_ATOM_IS_NULL(expr))
+        return ray_const_i64(g, expr->i64);
+    if (expr->type == -RAY_F64 && !RAY_ATOM_IS_NULL(expr))
+        return ray_const_f64(g, expr->f64);
+    if (expr->type == -RAY_BOOL && !RAY_ATOM_IS_NULL(expr))
+        return ray_const_bool(g, expr->b8);
+    if (expr->type == -RAY_STR && !RAY_ATOM_IS_NULL(expr)) {
+        const char *ptr = ray_str_ptr(expr);
+        size_t len = ray_str_len(expr);
+        return ray_const_str(g, ptr, len);
+    }
+
+    /* Name reference → local env first, then column scan, then
+     * global env (for set-bound constants).  Local env holds lambda
+     * / let bindings and takes precedence so formals shadow columns
+     * naturally.  Global env is a last resort — it catches cases
+     * like `(set threshold 50)` used inside a lambda body. */
+    if (expr->type == -RAY_SYM && (expr->attrs & RAY_ATTR_NAME)) {
+        ray_op_t* bound = cexpr_env_lookup(g, expr->i64);
+        if (bound) return bound;
+        ray_t* s = ray_sym_str(expr->i64);
+        if (!s) return NULL;
+
+        /* Dotted name — desugar at compile time by walking the
+         * segments: emit a scan for the head column, then for each
+         * subsequent segment look up the name's registered DAG-level
+         * emitter and chain it.  `col.ss` → scan(col) → extract(SS),
+         * `col.date` → scan(col) → date_trunc(DAY), etc.  Segment
+         * resolution uses the same name table as the runtime
+         * `(ss col)` form, so adding a new accessor means registering
+         * one unary builtin (temporal or otherwise) — no bespoke sym
+         * → field map here.  Anything we can't lower returns NULL
+         * (compile error), avoiding the old crash path where unknown
+         * dotted names became scans of non-existent columns. */
+        if (ray_sym_is_dotted(expr->i64)) {
+            const int64_t* segs;
+            int nsegs = ray_sym_segs(expr->i64, &segs);
+            if (nsegs < 2) return NULL;
+            if (!g->table || g->table->type != RAY_TABLE) return NULL;
+            if (!ray_table_get_col(g->table, segs[0])) return NULL;
+            ray_t* head_name = ray_sym_str(segs[0]);
+            if (!head_name) return NULL;
+            ray_op_t* op = ray_scan(g, ray_str_ptr(head_name));
+            if (!op) return NULL;
+            for (int i = 1; i < nsegs; i++) {
+                int field = ray_temporal_field_from_sym(segs[i]);
+                if (field >= 0) {
+                    op = ray_extract(g, op, field);
+                    if (!op) return NULL;
+                    continue;
+                }
+                int trunc_kind = ray_temporal_trunc_from_sym(segs[i]);
+                if (trunc_kind >= 0) {
+                    op = ray_date_trunc(g, op, trunc_kind);
+                    if (!op) return NULL;
+                    continue;
+                }
+                return NULL;
+            }
+            return op;
+        }
+
+        /* Column names on the bound table shadow global env —
+         * matches eval-level name-resolution order. */
+        if (g->table && g->table->type == RAY_TABLE &&
+            ray_table_get_col(g->table, expr->i64))
+            return ray_scan(g, ray_str_ptr(s));
+        /* Global env: atom literals / typed vectors compile as
+         * const nodes.  Lambdas only make sense as call heads
+         * and are handled in the list branch below. */
+        ray_t* gv = ray_env_get(expr->i64);
+        if (gv) {
+            if (ray_is_atom(gv)) return ray_const_atom(g, gv);
+            if (ray_is_vec(gv))  return ray_const_vec(g, gv);
+        }
+        /* Unknown name — let ray_scan produce a column-not-found
+         * error at exec time, matching prior behavior. */
+        return ray_scan(g, ray_str_ptr(s));
+    }
+
+    /* Symbol literal (no RAY_ATTR_NAME) → const atom node. */
+    if (expr->type == -RAY_SYM)
+        return ray_const_atom(g, expr);
+
+    /* Other atom literal types → const atom node.  Also falls
+     * through to here for typed null I64/F64/BOOL/STR atoms
+     * (which the fast-path branches above rejected via
+     * RAY_ATOM_IS_NULL). */
+    if (ray_is_atom(expr) && !(expr->attrs & RAY_ATTR_NAME))
+        return ray_const_atom(g, expr);
+
+    /* Typed-vector literal (e.g. [1 2 3], [AAPL MSFT], ["a" "b"]) →
+     * const vector node.  ray_const_vec already stores any ray_t*
+     * vec in ext->literal, and the OP_CONST executor returns it
+     * directly — so this unlocks every typed literal vector as a
+     * DAG operand (crucial for OP_IN set operands). */
+    if (ray_is_vec(expr) && !(expr->attrs & RAY_ATTR_NAME))
+        return ray_const_vec(g, expr);
+
+    /* List → function call: (fn arg1 arg2 ...) */
+    if (expr->type == RAY_LIST) {
+        int64_t n = ray_len(expr);
+        if (n == 0) return NULL;   /* empty list: no head to dispatch on */
+        ray_t** elems = (ray_t**)ray_data(expr);
+        ray_t* head = elems[0];
+
+        /* Lambda invocation: `((fn [formals] body) a1 a2 …)`.
+         * β-reduce at the DAG-node level — compile each actual
+         * arg into its own op (in the current env), push the
+         * {formal_i → actual_op_i} frame, recurse into the body
+         * (which reads the env via cexpr_env_lookup when it hits
+         * a name reference), then pop.  Sub-expression sharing is
+         * automatic: multiple uses of a formal all resolve to the
+         * single compiled actual op. */
+        if (head->type == RAY_LIST) {
+            int64_t hn = ray_len(head);
+            if (hn != 3) return NULL;   /* expect exactly (fn formals body) */
+            ray_t** hel = (ray_t**)ray_data(head);
+            if (hel[0]->type != -RAY_SYM) return NULL;
+            ray_t* hname_str = ray_sym_str(hel[0]->i64);
+            if (!hname_str || ray_str_len(hname_str) != 2 ||
+                memcmp(ray_str_ptr(hname_str), "fn", 2) != 0) return NULL;
+
+            ray_t* formals = hel[1];
+            ray_t* body    = hel[2];
+            if (!ray_is_vec(formals) || formals->type != RAY_SYM) return NULL;
+            int64_t nf = formals->len;
+            if (n - 1 != nf) return NULL;              /* arity mismatch */
+            if (nf > 16) return NULL;                  /* too many formals */
+            if (g->cexpr_env_top + (int)nf > 32) return NULL; /* env overflow */
+
+            /* Compile actuals in the CURRENT env, before pushing.
+             * Snapshot IDs, not pointers — g->nodes can realloc
+             * between successive compile_expr_dag calls so any
+             * raw ray_op_t* saved from an earlier iteration may
+             * dangle by the time we push it. */
+            uint32_t actual_ids[16];
+            for (int64_t i = 0; i < nf; i++) {
+                ray_op_t* a = compile_expr_dag(g, elems[i + 1]);
+                if (!a) return NULL;
+                actual_ids[i] = a->id;
+            }
+            int64_t* fids = (int64_t*)ray_data(formals);
+            int pushed = 0;
+            for (int64_t i = 0; i < nf; i++) {
+                if (g->cexpr_env_top >= 32) {   /* defensive: capacity was pre-checked before compiling the actuals */
+                    cexpr_env_pop(g, pushed);
+                    return NULL;
+                }
+                g->cexpr_env[g->cexpr_env_top].sym     = fids[i];
+                g->cexpr_env[g->cexpr_env_top].node_id = actual_ids[i];
+                g->cexpr_env_top++;
+                pushed++;
+            }
+            ray_op_t* result = compile_expr_dag(g, body);
+            cexpr_env_pop(g, pushed);
+            return result;
+        }
+
+        /* Named-lambda call: `(f a1 a2 …)` where `f` is globally
+         * bound to a RAY_LAMBDA with a single-expression body.
+         * Inline exactly like the literal `((fn …) …)` case.
+         * Shadowing order matches the value-position name-ref
+         * branch: local cexpr_env > table columns > globals.  A
+         * column named `f` isn't callable, but we still must honor
+         * shadowing so the exec-time error is consistent. */
+        if (head->type == -RAY_SYM && (head->attrs & RAY_ATTR_NAME) &&
+            cexpr_env_lookup(g, head->i64) == NULL &&
+            !(g->table && g->table->type == RAY_TABLE &&
+              ray_table_get_col(g->table, head->i64))) {
+            ray_t* gv = ray_env_get(head->i64);
+            if (gv && gv->type == RAY_LAMBDA) {
+                ray_t* formals  = LAMBDA_PARAMS(gv);
+                ray_t* body_lst = LAMBDA_BODY(gv);
+                if (formals && body_lst && body_lst->type == RAY_LIST &&
+                    ray_len(body_lst) == 1 &&
+                    ray_is_vec(formals) && formals->type == RAY_SYM) {
+                    int64_t nf = formals->len;
+                    if (n - 1 == nf && nf <= 16 &&
+                        g->cexpr_env_top + (int)nf <= 32) {
+                        ray_t* body = ((ray_t**)ray_data(body_lst))[0];
+                        uint32_t actual_ids[16];
+                        for (int64_t i = 0; i < nf; i++) {
+                            ray_op_t* a = compile_expr_dag(g, elems[i + 1]);
+                            if (!a) return NULL;
+                            actual_ids[i] = a->id;
+                        }
+                        int64_t* fids = (int64_t*)ray_data(formals);
+                        int pushed = 0;
+                        for (int64_t i = 0; i < nf; i++) {
+                            g->cexpr_env[g->cexpr_env_top].sym     = fids[i];
+                            g->cexpr_env[g->cexpr_env_top].node_id = actual_ids[i];
+                            g->cexpr_env_top++;
+                            pushed++;
+                        }
+                        ray_op_t* result = compile_expr_dag(g, body);
+                        cexpr_env_pop(g, pushed);
+                        return result;
+                    }
+                }
+            }
+        }
+
+        /* Head must be a name referencing a builtin */
+        if (head->type != -RAY_SYM) return NULL;
+        int64_t fn_sym = head->i64;
+
+        /* Head name text, matched against the special forms below (xbar first) */
+        ray_t* fn_name_str = ray_sym_str(fn_sym);
+        const char* fname = fn_name_str ? ray_str_ptr(fn_name_str) : NULL;
+        size_t fname_len = fn_name_str ? ray_str_len(fn_name_str) : 0;
+
+        if (fname_len == 4 && memcmp(fname, "xbar", 4) == 0) {
+            if (n != 3) return NULL;   /* expect (xbar col bucket) */
+            ray_op_t* col = compile_expr_dag(g, elems[1]);
+            if (!col) return NULL;
+            uint32_t col_id = col->id;
+            ray_op_t* bucket = compile_expr_dag(g, elems[2]);
+            if (!bucket) return NULL;
+            col = &g->nodes[col_id];
+            /* xbar(x, b) = x - (x % b)  (stays in integer domain) */
+            ray_op_t* m = ray_mod(g, col, bucket);
+            if (!m) return NULL;
+            col = &g->nodes[col_id];
+            return ray_sub(g, col, m);
+        }
+
+        /* (if cond then else) — 4 elements (fn + 3 args).  Compiles
+         * to OP_IF which is supported by the element-wise fusion
+         * pipeline. */
+        if (fname_len == 2 && memcmp(fname, "if", 2) == 0) {
+            if (n != 4) return NULL;
+            ray_op_t* c = compile_expr_dag(g, elems[1]);
+            if (!c) return NULL;
+            uint32_t c_id = c->id;
+            ray_op_t* t = compile_expr_dag(g, elems[2]);
+            if (!t) return NULL;
+            uint32_t t_id = t->id;
+            ray_op_t* e = compile_expr_dag(g, elems[3]);
+            if (!e) return NULL;
+            c = &g->nodes[c_id];
+            t = &g->nodes[t_id];
+            return ray_if(g, c, t, e);   /* e is the newest node, so only c and t were re-resolved */
+        }
+
+        /* (substr str start len) — 4 elements. */
+        if (fname_len == 6 && memcmp(fname, "substr", 6) == 0) {
+            if (n != 4) return NULL;
+            ray_op_t* str = compile_expr_dag(g, elems[1]);
+            if (!str) return NULL;
+            uint32_t str_id = str->id;
+            ray_op_t* start = compile_expr_dag(g, elems[2]);
+            if (!start) return NULL;
+            uint32_t start_id = start->id;
+            ray_op_t* ln = compile_expr_dag(g, elems[3]);
+            if (!ln) return NULL;
+            str = &g->nodes[str_id];
+            start = &g->nodes[start_id];
+            return ray_substr(g, str, start, ln);
+        }
+
+        /* (replace str from to) — 4 elements. */
+        if (fname_len == 7 && memcmp(fname, "replace", 7) == 0) {
+            if (n != 4) return NULL;
+            ray_op_t* str = compile_expr_dag(g, elems[1]);
+            if (!str) return NULL;
+            uint32_t str_id = str->id;
+            ray_op_t* from = compile_expr_dag(g, elems[2]);
+            if (!from) return NULL;
+            uint32_t from_id = from->id;
+            ray_op_t* to = compile_expr_dag(g, elems[3]);
+            if (!to) return NULL;
+            str = &g->nodes[str_id];
+            from = &g->nodes[from_id];
+            return ray_replace(g, str, from, to);
+        }
+
+        /* (concat a b ...) — variadic string concat. */
+        if (fname_len == 6 && memcmp(fname, "concat", 6) == 0) {
+            if (n < 2 || n - 1 > 16) return NULL;
+            uint32_t arg_ids[16];
+            for (int64_t i = 1; i < n; i++) {
+                ray_op_t* a = compile_expr_dag(g, elems[i]);
+                if (!a) return NULL;
+                arg_ids[i - 1] = a->id;
+            }
+            ray_op_t* args[16];
+            for (int64_t i = 0; i < n - 1; i++)
+                args[i] = &g->nodes[arg_ids[i]];
+            return ray_concat(g, args, (int)(n - 1));
+        }
+
+        /* (as 'TYPE col) — cast.  The type is a sym literal like 'I64 / 'F64. */
+        if (fname_len == 2 && memcmp(fname, "as", 2) == 0) {
+            if (n != 3) return NULL;
+            ray_t* type_expr = elems[1];
+            if (type_expr->type != -RAY_SYM) return NULL;
+            int8_t tgt = -1;
+            ray_t* ts = ray_sym_str(type_expr->i64);
+            if (ts) {
+                const char* tn = ray_str_ptr(ts);
+                size_t tl = ray_str_len(ts);
+                if (tl == 3 && memcmp(tn, "I64", 3) == 0)       tgt = RAY_I64;
+                else if (tl == 3 && memcmp(tn, "F64", 3) == 0)  tgt = RAY_F64;
+                else if (tl == 3 && memcmp(tn, "I32", 3) == 0)  tgt = RAY_I32;
+                else if (tl == 3 && memcmp(tn, "I16", 3) == 0)  tgt = RAY_I16;
+                else if (tl == 3 && memcmp(tn, "F32", 3) == 0)  tgt = RAY_F32;
+                else if (tl == 2 && memcmp(tn, "U8", 2) == 0)   tgt = RAY_U8;
+                else if (tl == 4 && memcmp(tn, "BOOL", 4) == 0) tgt = RAY_BOOL;
+            }
+            if (tgt < 0) return NULL;   /* type name missing or not in the list above */
+            ray_op_t* col = compile_expr_dag(g, elems[2]);
+            if (!col) return NULL;
+            return ray_cast(g, col, tgt);
+        }
+
+        /* Temporal extract: (year col), (month col), (day col), ... */
+        if (n == 2) {
+            int64_t field = -1;
+            if (fname_len == 4 && memcmp(fname, "year",  4) == 0) field = RAY_EXTRACT_YEAR;
+            else if (fname_len == 5 && memcmp(fname, "month", 5) == 0) field = RAY_EXTRACT_MONTH;
+            else if (fname_len == 3 && memcmp(fname, "day",   3) == 0) field = RAY_EXTRACT_DAY;
+            else if (fname_len == 4 && memcmp(fname, "hour",  4) == 0) field = RAY_EXTRACT_HOUR;
+            else if (fname_len == 6 && memcmp(fname, "minute",6) == 0) field = RAY_EXTRACT_MINUTE;
+            else if (fname_len == 6 && memcmp(fname, "second",6) == 0) field = RAY_EXTRACT_SECOND;
+            else if (fname_len == 9 && memcmp(fname, "dayofweek",9) == 0) field = RAY_EXTRACT_DOW;
+            else if (fname_len == 9 && memcmp(fname, "dayofyear",9) == 0) field = RAY_EXTRACT_DOY;
+            if (field >= 0) {
+                ray_op_t* col = compile_expr_dag(g, elems[1]);
+                if (!col) return NULL;
+                return ray_extract(g, col, field);
+            }
+        }
+
+        /* (do e1 e2 ... en) → compile only the last expression.
+         * Earlier expressions can't have side-effects in DAG context;
+         * if they do, they'll be silently dropped.  Use eval-level
+         * for side-effectful scripts. */
+        if (fname_len == 2 && memcmp(fname, "do", 2) == 0) {
+            if (n < 2) return NULL;
+            return compile_expr_dag(g, elems[n - 1]);
+        }
+
+        /* (let var val body) — compile `val` in the current env,
+         * bind var → val_op by ID (pointer-safe across reallocs),
+         * compile `body`, pop.  Same β-reduction mechanism as
+         * lambda inlining, just with a single binding. */
+        if (fname_len == 3 && memcmp(fname, "let", 3) == 0) {
+            if (n != 4) return NULL;
+            ray_t* var_expr = elems[1];
+            if (var_expr->type != -RAY_SYM) return NULL;
+            int64_t var_sym = var_expr->i64;
+            ray_op_t* val_op = compile_expr_dag(g, elems[2]);
+            if (!val_op) return NULL;
+            /* cexpr_env_push already snapshots node->id, which is
+             * stable across subsequent graph reallocations. */
+            if (!cexpr_env_push(g, var_sym, val_op)) return NULL;
+            ray_op_t* body_op = compile_expr_dag(g, elems[3]);
+            cexpr_env_pop(g, 1);
+            return body_op;
+        }
+
+        /* (cond (p1 e1) (p2 e2) ... (else en)) → nested OP_IF. */
+        if (fname_len == 4 && memcmp(fname, "cond", 4) == 0) {
+            if (n < 2) return NULL;
+            /* Walk right-to-left, building an OP_IF chain.  The last
+             * clause must be an `else` form. */
+            uint32_t chain_id = UINT32_MAX;
+            for (int64_t i = n - 1; i >= 1; i--) {
+                ray_t* clause = elems[i];
+                if (clause->type != RAY_LIST || ray_len(clause) != 2) return NULL;
+                ray_t** cpair = (ray_t**)ray_data(clause);
+                int is_else = 0;
+                if (cpair[0]->type == -RAY_SYM) {
+                    ray_t* ns = ray_sym_str(cpair[0]->i64);
+                    if (ns && ray_str_len(ns) == 4 &&
+                        memcmp(ray_str_ptr(ns), "else", 4) == 0)
+                        is_else = 1;
+                }
+                if (is_else) {
+                    if (i != n - 1) return NULL;
+                    ray_op_t* c = compile_expr_dag(g, cpair[1]);
+                    if (!c) return NULL;
+                    chain_id = c->id;
+                } else {
+                    if (chain_id == UINT32_MAX) return NULL;   /* last clause was not an `else` */
+                    ray_op_t* pred = compile_expr_dag(g, cpair[0]);
+                    if (!pred) return NULL;
+                    uint32_t pred_id = pred->id;
+                    ray_op_t* body = compile_expr_dag(g, cpair[1]);
+                    if (!body) return NULL;
+                    pred = &g->nodes[pred_id];
+                    ray_op_t* chain = &g->nodes[chain_id];
+                    ray_op_t* r = ray_if(g, pred, body, chain);
+                    if (!r) return NULL;
+                    chain_id = r->id;
+                }
+            }
+            if (chain_id == UINT32_MAX) return NULL;
+            return &g->nodes[chain_id];
+        }
+
+        /* Variadic `and`/`or`: fold into a balanced binary tree.
+         * `(and a b c d)` → `(and (and a b) (and c d))` — depth log2(N).
+         * Without this, n>=4 falls through `compile_expr_dag` and the
+         * caller (e.g. select WHERE) reports "WHERE predicate not
+         * supported by DAG compiler".  The fused-expr executor evaluates
+         * the resulting tree as a sequence of binary AND/OR instructions
+         * sharing scratch registers — no extra column allocations vs
+         * what hand-nested binary forms already do.
+         *
+         * Balanced tree (rather than left-fold) keeps the canonical
+         * shape symmetric and minimises dependency-chain depth, which
+         * future OoO / parallel-instruction executors can exploit. */
+        if (n >= 4) {
+            bool is_and = (fname_len == 3 && memcmp(fname, "and", 3) == 0);
+            bool is_or  = (fname_len == 2 && memcmp(fname, "or",  2) == 0);
+            if (is_and || is_or) {
+                int64_t k = n - 1;
+                if (k > 64) return NULL;          /* depth/space guard */
+                uint32_t arg_ids[64];
+                for (int64_t i = 0; i < k; i++) {
+                    ray_op_t* a = compile_expr_dag(g, elems[i + 1]);
+                    if (!a) return NULL;
+                    arg_ids[i] = a->id;
+                }
+                dag_binary_ctor ctor = is_and ? ray_and : ray_or;
+                /* Iterative pairwise reduction: at each round, fold
+                 * adjacent pairs into a single node, halving the count.
+                 * Equivalent to recursive bisect but avoids a helper. */
+                int64_t cnt = k;
+                while (cnt > 1) {
+                    int64_t out = 0;
+                    for (int64_t i = 0; i + 1 < cnt; i += 2) {
+                        /* make_binary re-resolves both inputs via stored
+                         * IDs after its own potential realloc, so the
+                         * pointers we pass here are safe to use. */
+                        ray_op_t* l = &g->nodes[arg_ids[i]];
+                        ray_op_t* r = &g->nodes[arg_ids[i + 1]];
+                        ray_op_t* combined = ctor(g, l, r);
+                        if (!combined) return NULL;
+                        arg_ids[out++] = combined->id;
+                    }
+                    if (cnt & 1)                    /* carry odd tail */
+                        arg_ids[out++] = arg_ids[cnt - 1];
+                    cnt = out;
+                }
+                return &g->nodes[arg_ids[0]];
+            }
+        }
+
+        /* Binary op? */
+        if (n == 3) {
+            dag_binary_ctor ctor = resolve_binary_dag(fn_sym);
+            if (ctor) {
+                ray_op_t* left = compile_expr_dag(g, elems[1]);
+                if (!left) return NULL;
+                uint32_t left_id = left->id;
+                ray_op_t* right = compile_expr_dag(g, elems[2]);
+                if (!right) return NULL;
+                left = &g->nodes[left_id];
+                return ctor(g, left, right);
+            }
+        }
+
+        /* Unary op or aggregation? */
+        if (n == 2) {
+            /* Check for unary DAG ops */
+            dag_unary_ctor uctor = resolve_unary_dag(fn_sym);
+            if (uctor) {
+                ray_op_t* arg = compile_expr_dag(g, elems[1]);
+                return arg ? uctor(g, arg) : NULL;
+            }
+            /* Aggregation functions return DAG agg nodes */
+            uint16_t agg_op = resolve_agg_opcode(fn_sym);
+            if (agg_op) {
+                ray_op_t* arg = compile_expr_dag(g, elems[1]);
+                if (!arg) return NULL;
+                switch (agg_op) {
+                    case OP_SUM:         return ray_sum(g, arg);
+                    case OP_AVG:         return ray_avg(g, arg);
+                    case OP_MIN:         return ray_min_op(g, arg);
+                    case OP_MAX:         return ray_max_op(g, arg);
+                    case OP_COUNT:       return ray_count(g, arg);
+                    case OP_FIRST:       return ray_first(g, arg);
+                    case OP_LAST:        return ray_last(g, arg);
+                    case OP_PROD:        return ray_prod(g, arg);
+                    case OP_STDDEV:      return ray_stddev(g, arg);
+                    case OP_STDDEV_POP:  return ray_stddev_pop(g, arg);
+                    case OP_VAR:         return ray_var(g, arg);
+                    case OP_VAR_POP:     return ray_var_pop(g, arg);
+                    default: return NULL;
+                }
+            }
+        }
+    }
+
+    return NULL;   /* no lowering rule matched */
+}
+
+/* Recursively scan `expr` and, for every name-symbol that refers to a column
+ * of `tbl`, install that column in the current local scope. */
+static void expr_bind_table_names(ray_t* expr, ray_t* tbl) {
+    if (!expr) return;
+    if (expr->type == RAY_LIST) {
+        /* Call form or list literal: visit every element in order. */
+        ray_t** items = (ray_t**)ray_data(expr);
+        int64_t count = ray_len(expr);
+        for (int64_t pos = 0; pos < count; pos++)
+            expr_bind_table_names(items[pos], tbl);
+        return;
+    }
+    /* Only name-flagged symbol atoms can denote a column. */
+    if (expr->type != -RAY_SYM || !(expr->attrs & RAY_ATTR_NAME)) return;
+    /* Exact match: the symbol itself names a column. */
+    ray_t* whole = ray_table_get_col(tbl, expr->i64);
+    if (whole) {
+        ray_env_set_local(expr->i64, whole);
+        return;
+    }
+    /* Dotted name such as `Timestamp.ss`: the full symbol is not a column,
+     * but its first segment may be.  Binding that head lets the dotted
+     * resolution performed during ray_eval start from the column; heads
+     * that are not columns are left to the normal scope-chain lookup. */
+    if (!ray_sym_is_dotted(expr->i64)) return;
+    const int64_t* parts;
+    int n_parts = ray_sym_segs(expr->i64, &parts);
+    if (n_parts < 1) return;
+    ray_t* head = ray_table_get_col(tbl, parts[0]);
+    if (head) ray_env_set_local(parts[0], head);
+}
+
+static int is_agg_expr(ray_t* expr);  /* defined below */
+
+/* Decide whether `expr` is expected to yield one value per input row.
+ * Yields 1 when a column of `tbl` is referenced somewhere that is NOT
+ * underneath an aggregation call, 0 otherwise.  Callers use the answer
+ * to tell row-aligned expressions apart from constants and from
+ * aggregation-reduced expressions such as `(+ 1 (sum p))`, whose
+ * scalar/short results are legitimately broadcast.
+ *
+ * Two kinds of subtree are never descended into:
+ *   - aggregation calls: the columns inside are reduced to a scalar,
+ *     so they say nothing about row alignment;
+ *   - lambda call forms `((fn ...) actuals)`: the body might reduce
+ *     the actuals through an aggregation, so the shape is unknown.
+ *     Answering 0 makes the scatter depend only on the runtime shape
+ *     check (row-aligned → gather, else broadcast) rather than raising
+ *     an error.  The price is that a row-preserving lambda returning a
+ *     wrong-length result is no longer caught here; that was judged
+ *     rarer than the "lambda wrapping an agg" pattern. */
+static int expr_refs_row_column(ray_t* expr, ray_t* tbl) {
+    if (!expr) return 0;
+
+    if (expr->type == RAY_LIST) {
+        /* Aggregation call: treat the whole subtree as a constant. */
+        if (is_agg_expr(expr)) return 0;
+        ray_t** items = (ray_t**)ray_data(expr);
+        int64_t count = ray_len(expr);
+        if (count == 0) return 0;
+        /* A head that is itself a LIST marks a lambda call: give up on
+         * row-alignment enforcement for it. */
+        if (items[0]->type == RAY_LIST) return 0;
+        /* items[0] is the function name, so start at the first argument. */
+        int64_t pos = 1;
+        while (pos < count) {
+            if (expr_refs_row_column(items[pos], tbl)) return 1;
+            pos++;
+        }
+        return 0;
+    }
+
+    /* Anything other than a name-flagged symbol cannot reference a column. */
+    if (expr->type != -RAY_SYM || !(expr->attrs & RAY_ATTR_NAME)) return 0;
+
+    /* Plain column name. */
+    if (ray_table_get_col(tbl, expr->i64)) return 1;
+
+    /* Dotted name (e.g. `Timestamp.ss`): row-aligned whenever its head
+     * segment is a column, exactly like the bare column would be. */
+    if (!ray_sym_is_dotted(expr->i64)) return 0;
+    const int64_t* parts;
+    int n_parts = ray_sym_segs(expr->i64, &parts);
+    if (n_parts >= 1 && ray_table_get_col(tbl, parts[0])) return 1;
+    return 0;
+}
+
+/* Return 1 when `expr` is a call whose head symbol names a function for
+ * which resolve_agg_opcode yields a non-zero opcode, 0 otherwise. */
+static int is_agg_expr(ray_t* expr) {
+    /* A DICT can never get past the LIST test, so no separate check. */
+    if (!expr || expr->type != RAY_LIST) return 0;
+    if (ray_len(expr) < 2) return 0;   /* need a head plus an argument */
+    ray_t* head = ((ray_t**)ray_data(expr))[0];
+    if (!head || head->type != -RAY_SYM) return 0;
+    return resolve_agg_opcode(head->i64) != 0;
+}
+
+/* Recognise `(fn arg ...)` where `fn` is bound in the environment to a
+ * RAY_UNARY carrying the RAY_FN_AGGR attribute, i.e. a builtin aggregator
+ * (sum/avg/min/max/count as well as med/dev/var/stddev and friends that
+ * are not on the opcode whitelist).  Such calls are routed to the
+ * streaming-style per-group AGG branch instead of the full per-group
+ * ray_eval fallback.  Intended as a superset of is_agg_expr: it also
+ * accepts AGGR builtins that have no streaming-engine opcode. */
+static int is_aggr_unary_call(ray_t* expr) {
+    if (!expr) return 0;
+    if (expr->type != RAY_LIST || ray_len(expr) < 2) return 0;
+    ray_t* head = ((ray_t**)ray_data(expr))[0];
+    if (head->type != -RAY_SYM) return 0;
+    ray_t* bound = ray_env_get(head->i64);
+    if (bound && bound->type == RAY_UNARY && (bound->attrs & RAY_FN_AGGR))
+        return 1;
+    return 0;
+}
+
+/* Single pass over `expr` that appends to `out_syms` the distinct symbol
+ * ids referring to columns of `tbl`; for a dotted ref (`Timestamp.ss`)
+ * the head segment is what gets recorded.  `n` is the number of entries
+ * already present and the updated count is returned.  Collection stops
+ * silently once `max_out` ids are held.  The per-group fallback uses the
+ * result to slice each referenced column once per group. */
+static int collect_col_refs(ray_t* expr, ray_t* tbl,
+                            int64_t* out_syms, int max_out, int n) {
+    if (!expr || n >= max_out) return n;
+    if (expr->type == RAY_LIST) {
+        ray_t** items = (ray_t**)ray_data(expr);
+        int64_t total = ray_len(expr);
+        for (int64_t pos = 0; pos < total && n < max_out; pos++)
+            n = collect_col_refs(items[pos], tbl, out_syms, max_out, n);
+        return n;
+    }
+    if (expr->type != -RAY_SYM || !(expr->attrs & RAY_ATTR_NAME)) return n;
+    int64_t found = -1;   /* id to record: the symbol or its dotted head */
+    if (ray_table_get_col(tbl, expr->i64)) {
+        found = expr->i64;
+    } else if (ray_sym_is_dotted(expr->i64)) {
+        const int64_t* parts;
+        int n_parts = ray_sym_segs(expr->i64, &parts);
+        if (n_parts >= 1 && ray_table_get_col(tbl, parts[0])) found = parts[0];
+    }
+    if (found < 0) return n;
+    for (int k = 0; k < n; k++)
+        if (out_syms[k] == found) return n;   /* already recorded */
+    out_syms[n++] = found;   /* room is guaranteed by the entry test */
+    return n;
+}
+
+/* Slice `col` by `idx_list` with ray_at_fn and bind the slice to `sym`
+ * via ray_env_set_local, then drop the local reference (this relies on
+ * the environment keeping its own reference to a bound value).  Used in
+ * the per-group loop.  Returns NULL on success or an error ray_t*:
+ * the slicing error itself, or "oom" when ray_at_fn returned NULL. */
+static ray_t* bind_col_slice(int64_t sym, ray_t* col, ray_t* idx_list) {
+    ray_t* part = ray_at_fn(col, idx_list);
+    if (!part) return ray_error("oom", NULL);
+    if (RAY_IS_ERR(part)) return part;   /* hand the error to the caller */
+    ray_env_set_local(sym, part);
+    ray_release(part);
+    return NULL;
+}
+
+/* Copy the first `fill` elements of the typed vec `tv` into a new LIST
+ * with room for `n_groups` items; the list length is `fill` and the
+ * remaining slots are left unset.  Called by the per-group eval fallback
+ * when its typed-direct path must demote to a LIST mid-loop. */
+static ray_t* typed_vec_to_list(ray_t* tv, int64_t fill, int64_t n_groups) {
+    ray_t* boxed = ray_alloc(n_groups * sizeof(ray_t*));
+    if (!boxed) return ray_error("oom", NULL);
+    boxed->type = RAY_LIST;
+    boxed->len = 0;
+    ray_t** slots = (ray_t**)ray_data(boxed);
+    for (int64_t pos = 0; pos < fill; ) {
+        int fresh = 0;
+        ray_t* item = collection_elem(tv, pos, &fresh);
+        if (item && !fresh) ray_retain(item);   /* retain unless reported fresh */
+        slots[pos++] = item;
+        boxed->len = pos;   /* len tracks the initialised prefix */
+    }
+    return boxed;
+}
+
+/* Per-group evaluation of `expr`, shared by the LIST-`groups` and the
+ * `idx_buf` variants.  The distinct column refs of `expr` are collected
+ * once and ONE local scope is pushed around the whole loop.  For every
+ * group, `feeder(gi, fstate)` supplies the row-index list (never released
+ * here — the caller owns it and may reuse one object), each collected
+ * column is re-bound to its slice and `expr` is evaluated with ray_eval.
+ * The first cell decides the output layout:
+ *   - atom other than SYM/STR/GUID → typed vec of n_groups elements,
+ *     filled in place through store_typed_elem;
+ *   - anything else → LIST column that takes ownership of every cell.
+ * A typed vec is demoted to a LIST once if a later cell does not fit.
+ * Returns the typed vec or LIST, an error, or NULL when n_groups is 0. */
+typedef ray_t* (*idx_feeder_fn)(int64_t gi, void* state);
+
+static ray_t* nonagg_eval_per_group_core(ray_t* expr, ray_t* tbl,
+                                         idx_feeder_fn feeder, void* fstate,
+                                         int64_t n_groups) {
+    int64_t col_syms[16];
+    int n_cols = collect_col_refs(expr, tbl, col_syms, 16, 0);  /* at most 16 distinct refs */
+    ray_t* cols[16];
+    for (int i = 0; i < n_cols; i++)
+        cols[i] = ray_table_get_col(tbl, col_syms[i]);  /* looked up once, sliced per group */
+
+    if (ray_env_push_scope() != RAY_OK) return ray_error("oom", NULL);
+
+    ray_t* result = NULL;       /* typed vec OR list col; stays NULL if no group runs */
+    int direct_typed = 0;       /* non-zero → result is a typed vec */
+    int8_t typed_t = 0;         /* atom type of the first cell on the typed path */
+
+    for (int64_t gi = 0; gi < n_groups; gi++) {
+        ray_t* idx_list = feeder(gi, fstate);
+        if (!idx_list) {
+            ray_env_pop_scope();
+            if (result) ray_release(result);
+            return ray_error("oom", NULL);
+        }
+        for (int i = 0; i < n_cols; i++) {
+            ray_t* err = bind_col_slice(col_syms[i], cols[i], idx_list);
+            if (err) {
+                ray_env_pop_scope();
+                if (result) ray_release(result);
+                return err;
+            }
+        }
+        ray_t* cell = ray_eval(expr);
+        if (!cell || RAY_IS_ERR(cell)) {
+            ray_env_pop_scope();
+            if (result) ray_release(result);
+            return cell ? cell : ray_error("domain", NULL);
+        }
+
+        if (gi == 0) {   /* first cell: choose typed vec vs LIST */
+            int8_t t = cell->type;
+            int collapsable = (t < 0 && t != -RAY_SYM && t != -RAY_STR && t != -RAY_GUID);
+            if (collapsable) {
+                int8_t vt = (int8_t)(-t);   /* vec type is the negated atom type */
+                result = ray_vec_new(vt, n_groups);
+                if (!result || RAY_IS_ERR(result)) {
+                    ray_env_pop_scope(); ray_release(cell);
+                    return result ? result : ray_error("oom", NULL);
+                }
+                result->len = n_groups;
+                if (store_typed_elem(result, 0, cell) == 0) {
+                    direct_typed = 1; typed_t = t;
+                    ray_release(cell);   /* value was stored into the vec */
+                } else {
+                    /* type unsupported by store_typed_elem → fall to list */
+                    ray_release(result); result = NULL;
+                    collapsable = 0;
+                }
+            }
+            if (!collapsable) {
+                result = ray_alloc(n_groups * sizeof(ray_t*));
+                if (!result) {
+                    ray_env_pop_scope(); ray_release(cell);
+                    return ray_error("oom", NULL);
+                }
+                result->type = RAY_LIST;
+                result->len = 0;
+                ((ray_t**)ray_data(result))[0] = cell;   /* list takes ownership */
+                result->len = 1;
+            }
+            continue;
+        }
+
+        if (direct_typed) {
+            if (cell->type == typed_t && store_typed_elem(result, gi, cell) == 0) {
+                ray_release(cell);
+            } else {
+                /* Demote: convert typed vec [0..gi-1] to list, append cell, continue as list. */
+                ray_t* list_col = typed_vec_to_list(result, gi, n_groups);
+                ray_release(result);   /* typed vec is dropped whether or not the conversion worked */
+                if (RAY_IS_ERR(list_col)) {
+                    ray_env_pop_scope(); ray_release(cell);
+                    return list_col;
+                }
+                result = list_col;
+                ((ray_t**)ray_data(result))[gi] = cell;  /* takes ownership */
+                result->len = gi + 1;
+                direct_typed = 0;
+            }
+        } else {
+            ((ray_t**)ray_data(result))[gi] = cell;  /* takes ownership */
+            result->len = gi + 1;   /* len always equals the number of filled slots */
+        }
+    }
+
+    ray_env_pop_scope();
+    return result;
+}
+
+/* idx_feeder over the eval-fallback's LIST `groups`: slot 2*gi + 1. */
+typedef struct { ray_t** items; } groups_state_t;
+static ray_t* groups_idx_feed(int64_t gi, void* st) {
+    ray_t** slots = ((groups_state_t*)st)->items;
+    return slots[2 * gi + 1];
+}
+
+static ray_t* nonagg_eval_per_group(ray_t* expr, ray_t* tbl,
+                                    ray_t* groups, int64_t n_groups) {
+    groups_state_t feed = { (ray_t**)ray_data(groups) };   /* feeder state */
+    return nonagg_eval_per_group_core(expr, tbl, groups_idx_feed, &feed, n_groups);
+}
+
+/* Feeder for the idx_buf+offsets+grp_cnt layout produced by the DAG
+ * fast path.  One RAY_I64 vec is recycled for every group: its `len` is
+ * set to the group's count and the group's indices are copied into its
+ * data area, avoiding a fresh vec per group. */
+typedef struct {
+    const int64_t* idx_buf;
+    const int64_t* offsets;
+    const int64_t* grp_cnt;
+    ray_t*         scratch;     /* RAY_I64 vec, sized to max grp_cnt */
+} buf_state_t;
+
+static ray_t* buf_idx_feed(int64_t gi, void* st) {
+    buf_state_t* ctx = (buf_state_t*)st;
+    ray_t* vec = ctx->scratch;
+    int64_t n_idx = ctx->grp_cnt[gi];
+    vec->len = n_idx;
+    if (n_idx > 0)   /* empty group: only the length is reset */
+        memcpy(ray_data(vec), ctx->idx_buf + ctx->offsets[gi],
+               (size_t)n_idx * sizeof(int64_t));
+    return vec;
+}
+
+static ray_t* nonagg_eval_per_group_buf(ray_t* expr, ray_t* tbl,
+                                        const int64_t* idx_buf,
+                                        const int64_t* offsets,
+                                        const int64_t* grp_cnt,
+                                        int64_t n_groups) {
+    int64_t widest = 1;   /* scratch capacity: largest group, at least 1 */
+    for (int64_t g = 0; g < n_groups; g++)
+        widest = grp_cnt[g] > widest ? grp_cnt[g] : widest;
+    ray_t* shared = ray_vec_new(RAY_I64, widest);
+    if (!shared) return ray_error("oom", NULL);
+    if (RAY_IS_ERR(shared)) return shared;
+    buf_state_t feed = { idx_buf, offsets, grp_cnt, shared };
+    ray_t* out = nonagg_eval_per_group_core(expr, tbl, buf_idx_feed, &feed, n_groups);
+    ray_release(shared);   /* the single index vec served every group */
+    return out;
+}
+
+/* Streaming-style per-group AGG body, DAG flavor.  `expr` is a call like
+ * `(med v)`: elems[0] names a RAY_UNARY function, elems[1] is a column
+ * ref or a sub-expression evaluated once over the whole table.  For each
+ * group of the idx_buf+offsets+grp_cnt layout the source is sliced via
+ * ray_at_fn, the fn is called directly and its atom result is stored in
+ * a typed vec of n_groups elements.  Returns that vec; the result of
+ * nonagg_eval_per_group_buf when the values do not fit one typed vec
+ * (non-atom or mixed types); or an error.  A failing slice or call
+ * aborts with its error, as nonagg_eval_per_group_core does, rather
+ * than skipping the group and leaving its result slot uninitialised. */
+static ray_t* aggr_unary_per_group_buf(ray_t* expr, ray_t* tbl,
+                                       const int64_t* idx_buf,
+                                       const int64_t* offsets,
+                                       const int64_t* grp_cnt,
+                                       int64_t n_groups) {
+    ray_t** elems = (ray_t**)ray_data(expr);
+    ray_t* col_expr = elems[1];
+    ray_t* fn_obj = ray_env_get(elems[0]->i64);
+    if (!fn_obj || fn_obj->type != RAY_UNARY)
+        return ray_error("type", NULL);
+    ray_unary_fn uf = (ray_unary_fn)(uintptr_t)fn_obj->i64;
+
+    /* Resolve the source column: either a direct column ref (no copy)
+     * or a full-table eval of the sub-expression in a temporary scope. */
+    ray_t* src = NULL;
+    if (col_expr->type == -RAY_SYM && (col_expr->attrs & RAY_ATTR_NAME)) {
+        src = ray_table_get_col(tbl, col_expr->i64);
+        if (src) ray_retain(src);
+    }
+    if (!src) {
+        if (ray_env_push_scope() != RAY_OK) return ray_error("oom", NULL);
+        expr_bind_table_names(col_expr, tbl);
+        src = ray_eval(col_expr);
+        ray_env_pop_scope();
+        if (!src || RAY_IS_ERR(src)) return src ? src : ray_error("domain", NULL);
+    }
+
+    /* Reusable I64 idx wrapper, sized for the largest group (min 1). */
+    int64_t max_cnt = 0;
+    for (int64_t gi = 0; gi < n_groups; gi++)
+        if (grp_cnt[gi] > max_cnt) max_cnt = grp_cnt[gi];
+    ray_t* idx_vec = ray_vec_new(RAY_I64, max_cnt > 0 ? max_cnt : 1);
+    if (!idx_vec || RAY_IS_ERR(idx_vec)) {
+        ray_release(src);
+        return idx_vec ? idx_vec : ray_error("oom", NULL);
+    }
+
+    ray_t* agg_vec = NULL;     /* typed result; allocated on the first group */
+    ray_t* err = NULL;         /* first failure; ends the loop */
+    int demote = 0;            /* set when a value can't go in the typed vec */
+    int8_t agg_atom_t = 0;     /* atom type every group must produce */
+
+    for (int64_t gi = 0; gi < n_groups && !err && !demote; gi++) {
+        idx_vec->len = grp_cnt[gi];
+        if (grp_cnt[gi] > 0)
+            memcpy(ray_data(idx_vec), &idx_buf[offsets[gi]],
+                   (size_t)grp_cnt[gi] * sizeof(int64_t));
+        ray_t* subset = ray_at_fn(src, idx_vec);
+        if (!subset) subset = ray_error("oom", NULL);
+        if (RAY_IS_ERR(subset)) { err = subset; break; }
+        ray_t* agg_val = uf(subset);
+        ray_release(subset);
+        if (!agg_val) agg_val = ray_error("domain", NULL);
+        if (RAY_IS_ERR(agg_val)) { err = agg_val; break; }
+        if (!agg_vec && agg_val->type < 0) {   /* first atom fixes the vec type */
+            agg_atom_t = agg_val->type;
+            agg_vec = ray_vec_new((int8_t)(-agg_atom_t), n_groups);
+            if (!agg_vec) err = ray_error("oom", NULL);
+            else if (RAY_IS_ERR(agg_vec)) { err = agg_vec; agg_vec = NULL; }
+            else agg_vec->len = n_groups;
+        }
+        /* A non-atom result (no vec yet), a type change or a failed
+         * store all mean the values cannot live in one typed vec. */
+        if (!err && (!agg_vec || agg_val->type != agg_atom_t ||
+                     store_typed_elem(agg_vec, gi, agg_val) != 0))
+            demote = 1;
+        ray_release(agg_val);
+    }
+
+    ray_release(idx_vec); ray_release(src);
+    if (agg_vec && (err || demote)) { ray_release(agg_vec); agg_vec = NULL; }
+    if (err) return err;
+    /* Redo the whole column through the generic per-group eval. */
+    if (demote)
+        return nonagg_eval_per_group_buf(expr, tbl, idx_buf, offsets, grp_cnt, n_groups);
+    if (!agg_vec) {
+        /* Only reachable with n_groups == 0: return an empty vec, typed
+         * I64 for lack of a better guess. */
+        agg_vec = ray_vec_new(RAY_I64, n_groups);
+        if (!agg_vec) return ray_error("oom", NULL);
+        if (!RAY_IS_ERR(agg_vec)) agg_vec->len = n_groups;
+    }
+    return agg_vec;
+}
+
+/* Forward declarations for eval-level groupby fallback */
+
+/* (select {from: t [where: pred] [by: key] [col: expr ...]})
+ * Special form — receives unevaluated dict arg. */
+ray_t* ray_select_fn(ray_t** args, int64_t n) {
+    if (n < 1) return ray_error("domain", NULL);
+    ray_t* dict = args[0];
+    if (!dict || dict->type != RAY_DICT)
+        return ray_error("type", NULL);
+
+    /* Evaluate 'from:' to get the source table */
+    ray_t* from_expr = dict_get(dict, "from");
+    if (!from_expr) return ray_error("domain", NULL);
+    ray_t* tbl = ray_eval(from_expr);
+    if (RAY_IS_ERR(tbl)) return tbl;
+    if (tbl->type != RAY_TABLE) { ray_release(tbl); return ray_error("type", NULL); }
+
+    ray_t* where_expr = dict_get(dict, "where");
+    ray_t* by_expr = dict_get(dict, "by");
+    ray_t* take_expr = dict_get(dict, "take");
+    ray_t* nearest_expr = dict_get(dict, "nearest");
+
+    /* Collect output columns (keys that are not reserved).  The dict's
+     * physical layout is [keys, vals] but the iteration loops below were
+     * written for the old interleaved [k0,v0,...] form — open a transient
+     * pair view so the existing code keeps working. */
+    DICT_VIEW_DECL(dv);
+    DICT_VIEW_OPEN(dict, dv);
+    if (DICT_VIEW_OVERFLOW(dv)) {
+        ray_release(tbl);
+        return ray_error("domain", "select clause has too many keys");
+    }
+    int64_t dict_n = dv_n;
+    ray_t** dict_elems = dv;
+    int64_t from_id    = ray_sym_intern("from",    4);
+    int64_t where_id   = ray_sym_intern("where",   5);
+    int64_t by_id      = ray_sym_intern("by",      2);
+    int64_t take_id    = ray_sym_intern("take",    4);
+    int64_t asc_id     = ray_sym_intern("asc",     3);
+    int64_t desc_id    = ray_sym_intern("desc",    4);
+    int64_t nearest_id = ray_sym_intern("nearest", 7);
+
+    /* Check for asc/desc presence */
+    bool has_sort = false;
+    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
+        int64_t kid = dict_elems[i]->i64;
+        if (kid == asc_id || kid == desc_id) { has_sort = true; break; }
+    }
+
+    /* `nearest` is mutually exclusive with `asc`/`desc`/`by` — ANN
+     * ordering is an index scan, not a column sort, and cannot be
+     * composed with group-by in this phase. */
+    if (nearest_expr) {
+        if (has_sort) {
+            ray_release(tbl);
+            return ray_error("domain",
+                "select: `nearest` cannot be combined with asc/desc");
+        }
+        if (by_expr) {
+            ray_release(tbl);
+            return ray_error("domain",
+                "select: `nearest` cannot be combined with `by`");
+        }
+    }
+
+    /* Count output columns */
+    int n_out = 0;
+    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
+        int64_t kid = dict_elems[i]->i64;
+        if (kid != from_id && kid != where_id && kid != by_id &&
+            kid != take_id && kid != asc_id && kid != desc_id &&
+            kid != nearest_id)
+            n_out++;
+    }
+
+    /* Simple case: no clauses at all → return table as-is */
+    if (n_out == 0 && !where_expr && !by_expr && !take_expr && !has_sort && !nearest_expr)
+        return tbl;
+
+    /* Dict-form by-clause pre-evaluation: MUST happen before we
+     * build the DAG, so the graph sees the augmented table with
+     * the materialised dict-val columns already present.
+     * (select {... by: {o: OrderId b: (xbar Ts 1000)} ...})
+     * Dict values can be any expression; we eval each against tbl
+     * with its columns bound as locals, add the result as a new
+     * column named after the dict key, then rewrite by_expr as a
+     * plain RAY_SYM vector of the dict keys so the rest of
+     * ray_select_fn sees a standard multi-key group-by. */
+    ray_t* by_sym_vec_owned = NULL;
+    DICT_VIEW_DECL(byv);
+    if (by_expr && by_expr->type == RAY_DICT) {
+        DICT_VIEW_OPEN(by_expr, byv);
+        if (DICT_VIEW_OVERFLOW(byv)) {
+            ray_release(tbl);
+            return ray_error("domain", "by-dict has too many keys");
+        }
+        int64_t dlen = byv_n;
+        int64_t nk = dlen / 2;
+        if (nk == 0 || nk > 16) {
+            ray_release(tbl);
+            return ray_error("domain", "by-dict must have 1..16 keys");
+        }
+        ray_t** d_elems = byv;
+
+        ray_env_push_scope();
+        int64_t in_ncols = ray_table_ncols(tbl);
+        for (int64_t c = 0; c < in_ncols; c++) {
+            int64_t cn = ray_table_col_name(tbl, c);
+            ray_t* cv = ray_table_get_col_idx(tbl, c);
+            if (cv) ray_env_set_local(cn, cv);
+        }
+
+        by_sym_vec_owned = ray_vec_new(RAY_SYM, nk);
+        if (!by_sym_vec_owned || RAY_IS_ERR(by_sym_vec_owned)) {
+            ray_env_pop_scope();
+            ray_release(tbl);
+            return ray_error("oom", NULL);
+        }
+        int64_t* sv_data = (int64_t*)ray_data(by_sym_vec_owned);
+        by_sym_vec_owned->len = nk;
+
+        bool failed = false;
+        ray_t* fail_err = NULL;
+        int64_t expected_len = ray_table_nrows(tbl);
+        for (int64_t i = 0; i < nk; i++) {
+            ray_t* k = d_elems[i * 2];
+            ray_t* v = d_elems[i * 2 + 1];
+            if (!k || k->type != -RAY_SYM) {
+                fail_err = ray_error("domain", "by-dict key must be a symbol name");
+                failed = true; break;
+            }
+            /* Duplicate key guard: {g: A g: B} would otherwise append
+             * two cols both named g, then group on the first g twice
+             * (silently dropping B).  Reject explicitly. */
+            bool duplicate_key = false;
+            for (int64_t j = 0; j < i && !duplicate_key; j++)
+                if (d_elems[j * 2]->i64 == k->i64) duplicate_key = true;
+            if (duplicate_key) {
+                fail_err = ray_error("domain", "by-dict has duplicate key");
+                failed = true; break;
+            }
+            /* Collision check: if the dict key already exists in the
+             * input table, ray_table_add_col would append a second
+             * column with the same name and ray_table_get_col finds
+             * the ORIGINAL, so the group-by would silently scan the
+             * user's existing column instead of our materialised
+             * one.  The one allowed exception is {x: x}, a trivial
+             * self-alias: the input column is already exactly what
+             * we want to group on. */
+            bool already_in_tbl = (ray_table_get_col(tbl, k->i64) != NULL);
+            bool trivial_self = (v->type == -RAY_SYM && v->i64 == k->i64);
+            if (already_in_tbl && !trivial_self) {
+                fail_err = ray_error("domain",
+                    "by-dict alias shadows an existing input column");
+                failed = true; break;
+            }
+            if (trivial_self) {
+                /* No eval / no add: just group on the existing col. */
+                sv_data[i] = k->i64;
+                continue;
+            }
+            ray_t* col_vec = ray_eval(v);
+            if (!col_vec || RAY_IS_ERR(col_vec)) {
+                fail_err = col_vec ? col_vec : ray_error("domain", "by-dict val eval");
+                failed = true; break;
+            }
+            if (!ray_is_vec(col_vec) || ray_len(col_vec) != expected_len) {
+                ray_release(col_vec);
+                fail_err = ray_error("length", "by-dict val must be a column vector");
+                failed = true; break;
+            }
+            ray_t* new_tbl = ray_table_add_col(tbl, k->i64, col_vec);
+            ray_release(col_vec);
+            if (!new_tbl || RAY_IS_ERR(new_tbl)) {
+                fail_err = new_tbl ? new_tbl : ray_error("oom", NULL);
+                failed = true; break;
+            }
+            tbl = new_tbl;
+            /* Re-bind the newly added column under its dict key so
+             * later dict vals can reference earlier keys. */
+            ray_env_set_local(k->i64, col_vec);
+            sv_data[i] = k->i64;
+        }
+        ray_env_pop_scope();
+        if (failed) {
+            ray_release(by_sym_vec_owned);
+            ray_release(tbl);
+            return fail_err;
+        }
+        by_expr = by_sym_vec_owned;
+    }
+
+    /* Build DAG */
+    ray_graph_t* g = ray_graph_new(tbl);
+    if (!g) {
+        if (by_sym_vec_owned) ray_release(by_sym_vec_owned);
+        ray_release(tbl); return ray_error("oom", NULL);
+    }
+
+    ray_op_t* root = ray_const_table(g, tbl);
+
+    /* Non-agg expression tracking for post-DAG scatter (used in GROUP BY) */
+    int64_t nonagg_names[16];
+    ray_t*  nonagg_exprs[16];
+    uint8_t n_nonaggs = 0;
+    int synth_count_col = 0;  /* 1 if we synthesized OP_COUNT for group boundaries */
+
+    /* Apply WHERE filter */
+    if (where_expr) {
+        ray_op_t* pred = compile_expr_dag(g, where_expr);
+        if (!pred) {
+            ray_graph_free(g); ray_release(tbl);
+            return ray_error("domain",
+                "WHERE predicate not supported by DAG compiler — "
+                "most common causes: arity mismatch "
+                "(e.g. `(in v)` instead of `(in col v)`), "
+                "unknown function name, unsupported special form, "
+                "or a sub-expression the compiler can't lower");
+        }
+        root = ray_filter(g, root, pred);
+    }
+
+    /* Apply NEAREST (ANN/KNN) re-ranking.  Mutually exclusive with
+     * asc/desc/by (already rejected above).  Runs after WHERE so the
+     * filter feeds the rerank executor directly.  `take k` becomes the
+     * target result count; the rerank executor handles the take internally
+     * so the bottom-of-function take block is skipped when nearest is set. */
+    float* nearest_query_owned = NULL;   /* freed after ray_execute below */
+    ray_t* nearest_handle_owned = NULL;  /* HNSW handle kept alive for the
+                                          * DAG's lifetime; released after
+                                          * ray_execute.  Without this, an
+                                          * inline `(ann (hnsw-build ...) ...)`
+                                          * drops the handle's rc to 0 before
+                                          * exec runs — the rc→0 hook frees
+                                          * the index and the ext's stored
+                                          * pointer dangles. */
+    if (nearest_expr) {
+        if (nearest_expr->type != RAY_LIST || ray_len(nearest_expr) < 3) {
+            ray_graph_free(g); ray_release(tbl);
+            return ray_error("domain",
+                "nearest: expected (ann <handle> <query> [ef]) or (knn <col> <query> [metric])");
+        }
+        int64_t nlen = ray_len(nearest_expr);
+        ray_t** nlist = (ray_t**)ray_data(nearest_expr);
+        ray_t* head = nlist[0];
+        if (head->type != -RAY_SYM) {
+            ray_graph_free(g); ray_release(tbl);
+            return ray_error("domain",
+                "nearest: first element must be the symbol `ann` or `knn`");
+        }
+        int64_t ann_sym_id = ray_sym_intern("ann", 3);
+        int64_t knn_sym_id = ray_sym_intern("knn", 3);
+
+        /* Resolve k from take (default 10). */
+        int64_t k_req = 10;
+        if (take_expr) {
+            ray_t* tv = ray_eval(take_expr);
+            if (!tv || RAY_IS_ERR(tv)) {
+                ray_graph_free(g); ray_release(tbl);
+                return tv ? tv : ray_error("domain", NULL);
+            }
+            if (tv->type == -RAY_I64)      k_req = tv->i64;
+            else if (tv->type == -RAY_I32) k_req = tv->i32;
+            else {
+                ray_release(tv);
+                ray_graph_free(g); ray_release(tbl);
+                return ray_error("type", "nearest: take must be an integer atom");
+            }
+            ray_release(tv);
+            if (k_req <= 0) {
+                ray_graph_free(g); ray_release(tbl);
+                return ray_error("domain", "nearest: take must be positive");
+            }
+        }
+
+        /* Evaluate the query vector (arg index 2). */
+        ray_t* qvec = ray_eval(nlist[2]);
+        if (!qvec || RAY_IS_ERR(qvec)) {
+            ray_graph_free(g); ray_release(tbl);
+            return qvec ? qvec : ray_error("domain", NULL);
+        }
+        if (!ray_is_vec(qvec) ||
+            (qvec->type != RAY_F32 && qvec->type != RAY_F64 &&
+             qvec->type != RAY_I32 && qvec->type != RAY_I64)) {
+            ray_release(qvec);
+            ray_graph_free(g); ray_release(tbl);
+            return ray_error("type", "nearest: query must be a numeric vector");
+        }
+        int32_t dim = (int32_t)qvec->len;
+        if (dim <= 0) {
+            ray_release(qvec);
+            ray_graph_free(g); ray_release(tbl);
+            return ray_error("length", "nearest: query vector is empty");
+        }
+
+        /* Copy query into a fresh float[] that the DAG op borrows; freed
+         * after ray_execute completes. */
+        nearest_query_owned = (float*)ray_sys_alloc((size_t)dim * sizeof(float));
+        if (!nearest_query_owned) {
+            ray_release(qvec);
+            ray_graph_free(g); ray_release(tbl);
+            return ray_error("oom", NULL);
+        }
+        switch (qvec->type) {
+            case RAY_F32:
+                memcpy(nearest_query_owned, ray_data(qvec), (size_t)dim * sizeof(float));
+                break;
+            case RAY_F64: {
+                double* s = (double*)ray_data(qvec);
+                for (int32_t j = 0; j < dim; j++) nearest_query_owned[j] = (float)s[j];
+                break;
+            }
+            case RAY_I32: {
+                int32_t* s = (int32_t*)ray_data(qvec);
+                for (int32_t j = 0; j < dim; j++) nearest_query_owned[j] = (float)s[j];
+                break;
+            }
+            case RAY_I64: {
+                int64_t* s = (int64_t*)ray_data(qvec);
+                for (int32_t j = 0; j < dim; j++) nearest_query_owned[j] = (float)s[j];
+                break;
+            }
+        }
+        ray_release(qvec);
+
+        if (head->i64 == ann_sym_id) {
+            ray_t* hobj = ray_eval(nlist[1]);
+            if (!hobj || RAY_IS_ERR(hobj)) {
+                ray_sys_free(nearest_query_owned);
+                ray_graph_free(g); ray_release(tbl);
+                return hobj ? hobj : ray_error("domain", NULL);
+            }
+            if (hobj->type != -RAY_I64 || !(hobj->attrs & RAY_ATTR_HNSW)) {
+                ray_release(hobj); ray_sys_free(nearest_query_owned);
+                ray_graph_free(g); ray_release(tbl);
+                return ray_error("type",
+                    "nearest (ann): first arg must be an HNSW handle (from hnsw-build)");
+            }
+            ray_hnsw_t* idx = (ray_hnsw_t*)(uintptr_t)hobj->i64;
+            if (!idx) {
+                /* Defensive: attr set but pointer cleared — treat as invalid. */
+                ray_release(hobj); ray_sys_free(nearest_query_owned);
+                ray_graph_free(g); ray_release(tbl);
+                return ray_error("type",
+                    "nearest (ann): HNSW handle has been freed");
+            }
+            if (idx->dim != dim) {
+                ray_release(hobj); ray_sys_free(nearest_query_owned);
+                ray_graph_free(g); ray_release(tbl);
+                return ray_error("length",
+                    "nearest (ann): query dim does not match index dim");
+            }
+            int32_t ef = HNSW_DEFAULT_EF_S;
+            if (nlen >= 4) {
+                ray_t* ev = ray_eval(nlist[3]);
+                if (!ev || RAY_IS_ERR(ev)) {
+                    ray_release(hobj); ray_sys_free(nearest_query_owned);
+                    ray_graph_free(g); ray_release(tbl);
+                    return ev ? ev : ray_error("domain",
+                        "nearest (ann): ef expression failed to evaluate");
+                }
+                if (ev->type == -RAY_I64)      ef = (int32_t)ev->i64;
+                else if (ev->type == -RAY_I32) ef = ev->i32;
+                else {
+                    ray_release(ev); ray_release(hobj);
+                    ray_sys_free(nearest_query_owned);
+                    ray_graph_free(g); ray_release(tbl);
+                    return ray_error("type",
+                        "nearest (ann): ef must be an integer atom");
+                }
+                ray_release(ev);
+            }
+            if ((int64_t)ef < k_req) ef = (int32_t)k_req;
+            root = ray_ann_rerank(g, root, idx, nearest_query_owned, dim, k_req, ef);
+            /* Steal the retain from ray_eval — the ext now borrows `idx`
+             * through hobj.  Released in the common exit path after
+             * ray_execute has completed. */
+            nearest_handle_owned = hobj;
+        } else if (head->i64 == knn_sym_id) {
+            ray_t* col_expr = nlist[1];
+            if (col_expr->type != -RAY_SYM) {
+                ray_sys_free(nearest_query_owned);
+                ray_graph_free(g); ray_release(tbl);
+                return ray_error("type",
+                    "nearest (knn): first arg must be an unquoted column name");
+            }
+            int64_t col_sym = col_expr->i64;
+            ray_hnsw_metric_t metric = RAY_HNSW_COSINE;
+            if (nlen >= 4) {
+                ray_t* mv = nlist[3];
+                if (mv && mv->type == -RAY_SYM) {
+                    int64_t mid = mv->i64;
+                    if (mid == ray_sym_find("l2", 2))          metric = RAY_HNSW_L2;
+                    else if (mid == ray_sym_find("ip", 2))     metric = RAY_HNSW_IP;
+                    else if (mid == ray_sym_find("cosine", 6)) metric = RAY_HNSW_COSINE;
+                    else {
+                        ray_sys_free(nearest_query_owned);
+                        ray_graph_free(g); ray_release(tbl);
+                        return ray_error("domain",
+                            "nearest (knn): metric must be 'cosine, 'l2, or 'ip");
+                    }
+                }
+            }
+            root = ray_knn_rerank(g, root, col_sym, nearest_query_owned, dim, k_req, metric);
+        } else {
+            ray_sys_free(nearest_query_owned);
+            ray_graph_free(g); ray_release(tbl);
+            return ray_error("domain",
+                "nearest: expected `ann` or `knn` as the first element");
+        }
+        if (!root) {
+            if (nearest_handle_owned) ray_release(nearest_handle_owned);
+            ray_sys_free(nearest_query_owned);
+            ray_graph_free(g); ray_release(tbl);
+            return ray_error("oom", NULL);
+        }
+
+        /* When the user didn't specify output columns, project only the
+         * source schema — NOT the rerank's synthetic `_dist`.  This keeps
+         * `(select {from: t nearest: ...})` shape-compatible with
+         * `(select {from: t})`; users who want `_dist` must name it
+         * explicitly (e.g. `{from: t d: _dist ...}`).
+         *
+         * Must handle arbitrarily wide tables (up to ray_select's uint8
+         * limit of 255 cols) — a silent 16-col cap would let `_dist`
+         * leak through for real-world tables. */
+        if (n_out == 0) {
+            int64_t src_ncols = ray_table_ncols(tbl);
+            if (src_ncols > 255) {
+                if (nearest_handle_owned) ray_release(nearest_handle_owned);
+                ray_sys_free(nearest_query_owned);
+                ray_graph_free(g); ray_release(tbl);
+                return ray_error("limit",
+                    "nearest: implicit projection exceeds 255 source columns — "
+                    "specify output columns explicitly");
+            }
+            if (src_ncols > 0) {
+                ray_op_t** col_ops = (ray_op_t**)ray_sys_alloc(
+                    (size_t)src_ncols * sizeof(ray_op_t*));
+                if (!col_ops) {
+                    if (nearest_handle_owned) ray_release(nearest_handle_owned);
+                    ray_sys_free(nearest_query_owned);
+                    ray_graph_free(g); ray_release(tbl);
+                    return ray_error("oom", NULL);
+                }
+                int nc = 0;
+                bool scan_err = false;
+                for (int64_t c = 0; c < src_ncols; c++) {
+                    int64_t name_id = ray_table_col_name(tbl, c);
+                    ray_t* s = ray_sym_str(name_id);
+                    if (!s) continue;
+                    ray_op_t* scan_op = ray_scan(g, ray_str_ptr(s));
+                    if (!scan_op) { scan_err = true; break; }
+                    col_ops[nc++] = scan_op;
+                }
+                if (scan_err) {
+                    ray_sys_free(col_ops);
+                    if (nearest_handle_owned) ray_release(nearest_handle_owned);
+                    ray_sys_free(nearest_query_owned);
+                    ray_graph_free(g); ray_release(tbl);
+                    return ray_error("oom", NULL);
+                }
+                root = ray_select(g, root, col_ops, (uint8_t)nc);
+                ray_sys_free(col_ops);
+                if (!root) {
+                    if (nearest_handle_owned) ray_release(nearest_handle_owned);
+                    ray_sys_free(nearest_query_owned);
+                    ray_graph_free(g); ray_release(tbl);
+                    return ray_error("oom", NULL);
+                }
+            }
+        }
+    }
+
+    /* GROUP BY */
+    if (by_expr) {
+        /* Resolve a "single key" sym id when by_expr is either a
+         * scalar -RAY_SYM name or a single-element RAY_SYM vector.
+         * The eval_group branch and several downstream sites used to
+         * read `by_expr->i64` directly, which is garbage when by_expr
+         * is a vector — use by_key_sym instead. */
+        int64_t by_key_sym = -1;
+        if (by_expr->type == -RAY_SYM && (by_expr->attrs & RAY_ATTR_NAME))
+            by_key_sym = by_expr->i64;
+        else if (by_expr->type == RAY_SYM && ray_len(by_expr) == 1)
+            by_key_sym = ((int64_t*)ray_data(by_expr))[0];
+
+        /* Detect non-aggregate expressions before routing so we can
+         * decide whether GUID keys go to the DAG HT path or fall back
+         * to eval-level. */
+        int any_nonagg = 0;
+        if (n_out > 0) {
+            for (int64_t i = 0; i + 1 < dict_n; i += 2) {
+                int64_t kid = dict_elems[i]->i64;
+                if (kid == from_id || kid == where_id || kid == by_id ||
+                    kid == take_id || kid == asc_id || kid == desc_id) continue;
+                if (!is_agg_expr(dict_elems[i + 1])) { any_nonagg = 1; break; }
+            }
+        }
+
+        /* Decide routing.  LIST/STR always fall to the eval-level
+         * grouping because the DAG HT path can't pack them into
+         * 8-byte key slots.  GUID is packed via row-indirection in
+         * the HT layout (wide_key_mask), so it uses the parallel DAG
+         * path *except* for queries with non-aggregate expressions
+         * (the non-agg scatter still requires 8-byte-packable key
+         * reads through its KEY_READ macro). */
+        int use_eval_group = 0;
+        if (by_key_sym >= 0) {
+            ray_t* key_col = ray_table_get_col(tbl, by_key_sym);
+            if (key_col) {
+                int8_t kct = key_col->type;
+                if (RAY_IS_PARTED(kct)) kct = (int8_t)RAY_PARTED_BASETYPE(kct);
+                if (kct == RAY_LIST || kct == RAY_STR)
+                    use_eval_group = 1;
+                else if (kct == RAY_GUID && (any_nonagg || n_out == 0))
+                    /* RAY_GUID routes to eval-level ray_group_fn only
+                     * for (a) non-agg expression queries (existing
+                     * behavior) and (b) the "no output columns" form
+                     * `(select {from: t by: guid})` which otherwise
+                     * lands in the DAG no-agg-no-nonagg branch whose
+                     * first-occurrence scanner is O(N × n_groups) and
+                     * truncates wide keys to 8 bytes via ray_read_sym.
+                     * Pure-agg group-bys with GUID keys still take the
+                     * DAG path (exec_group handles wide keys correctly
+                     * and stays parallel / segment-streamed on parted
+                     * tables). */
+                    use_eval_group = 1;
+            }
+        }
+        /* Non-aggregation expressions (arithmetic, lambda, etc.) are
+         * handled post-DAG: aggs go through the parallel GROUP pipeline,
+         * then non-agg results are evaluated on the full table and
+         * scattered per-group into LIST columns.  The scatter block
+         * only handles single scalar-key by-clauses — for multi-key
+         * or computed-key groupings, fall back to eval-level so the
+         * non-agg scatter has a well-defined row→group mapping. */
+        if (!use_eval_group && any_nonagg) {
+            /* Fast path requires a single scalar-named key column.
+             * Multi-key and computed-key by-clauses with non-agg
+             * expressions are not yet supported. */
+            int single_scalar_key = 0;
+            if (by_expr->type == -RAY_SYM && (by_expr->attrs & RAY_ATTR_NAME)) {
+                single_scalar_key = 1;
+            } else if (by_expr->type == RAY_SYM && ray_len(by_expr) == 1) {
+                single_scalar_key = 1;
+            }
+            if (!single_scalar_key) {
+                ray_graph_free(g); ray_release(tbl);
+                return ray_error("nyi", "non-agg expression with multi-key or computed group key");
+            }
+        }
+        if (use_eval_group) {
+            /* Apply WHERE filter first (if any), then eval-level groupby */
+            ray_t* eval_tbl = tbl;
+            if (where_expr) {
+                root = ray_optimize(g, root);
+                ray_t* fres = ray_execute(g, root);
+                ray_graph_free(g); g = NULL;
+                if (!fres || RAY_IS_ERR(fres)) { ray_release(tbl); return fres ? fres : ray_error("domain", NULL); }
+                if (ray_is_lazy(fres)) fres = ray_lazy_materialize(fres);
+                if (!fres || RAY_IS_ERR(fres)) { ray_release(tbl); return fres ? fres : ray_error("domain", NULL); }
+                eval_tbl = fres;
+            } else {
+                ray_graph_free(g); g = NULL;
+            }
+            /* eval_group path supports only simple scalar / [col] by-forms;
+             * multi-key and computed keys shouldn't land here. */
+            if (by_key_sym < 0) {
+                if (eval_tbl != tbl) ray_release(eval_tbl);
+                ray_release(tbl);
+                return ray_error("nyi", "eval-level groupby requires scalar key");
+            }
+            ray_t* key_col = ray_table_get_col(eval_tbl, by_key_sym);
+
+            /* Fast path: (select {from: t by: k}) with no aggs and
+             * no non-agg expressions — we only need first-of-group
+             * for each non-key column, not full per-group index
+             * lists. Scan the key column once, record the first
+             * row index of each distinct key in a hash table, then
+             * gather that index list from every other column. This
+             * avoids ray_group_fn's per-group ray_vec_append churn
+             * which dominated the cost on 10M-row / 1M-group
+             * workloads. */
+            if (n_out == 0 && key_col && key_col->type == RAY_GUID) {
+                int64_t n = key_col->len;
+                const uint8_t* kb = (const uint8_t*)ray_data(key_col);
+                uint32_t cap = 64;
+                while ((uint64_t)cap < (uint64_t)n * 2 && cap < (1u << 28)) cap <<= 1;
+                uint32_t mask = cap - 1;
+                ray_t* ht_hdr = ray_alloc((size_t)cap * sizeof(uint32_t));
+                if (!ht_hdr) { if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); return ray_error("oom", NULL); }
+                uint32_t* ht = (uint32_t*)ray_data(ht_hdr);
+                memset(ht, 0xFF, (size_t)cap * sizeof(uint32_t));
+
+                int64_t fi_cap = n < 1024 ? 1024 : (n < (1 << 20) ? n : (1 << 20));
+                if (fi_cap < 256) fi_cap = 256;
+                ray_t* fi_hdr = ray_alloc((size_t)fi_cap * sizeof(int64_t));
+                if (!fi_hdr) { ray_free(ht_hdr); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); return ray_error("oom", NULL); }
+                int64_t* fi = (int64_t*)ray_data(fi_hdr);
+                int64_t ngroups = 0;
+
+                for (int64_t i = 0; i < n; i++) {
+                    if ((i & 65535) == 0) {
+                        if (ray_interrupted()) {
+                            ray_free(fi_hdr);
+                            ray_free(ht_hdr);
+                            if (eval_tbl != tbl) ray_release(eval_tbl);
+                            ray_release(tbl);
+                            return ray_error("cancel", "interrupted");
+                        }
+                        ray_progress_update("select", "by: first-of-group",
+                                            (uint64_t)i, (uint64_t)n);
+                    }
+                    const uint8_t* cur = kb + (size_t)i * 16;
+                    uint64_t h; memcpy(&h, cur, 8); h ^= h >> 33; h *= 0xff51afd7ed558ccdULL;
+                    uint32_t slot = (uint32_t)(h & mask);
+                    uint32_t gi = UINT32_MAX;
+                    while (ht[slot] != UINT32_MAX) {
+                        uint32_t cand = ht[slot];
+                        if (memcmp(kb + (size_t)fi[cand] * 16, cur, 16) == 0) { gi = cand; break; }
+                        slot = (slot + 1) & mask;
+                    }
+                    if (gi == UINT32_MAX) {
+                        if (ngroups >= fi_cap) {
+                            int64_t new_cap = fi_cap * 2;
+                            ray_t* new_hdr = ray_alloc((size_t)new_cap * sizeof(int64_t));
+                            if (!new_hdr) { ray_free(fi_hdr); ray_free(ht_hdr); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); return ray_error("oom", NULL); }
+                            memcpy(ray_data(new_hdr), fi, (size_t)ngroups * sizeof(int64_t));
+                            ray_free(fi_hdr);
+                            fi_hdr = new_hdr;
+                            fi = (int64_t*)ray_data(fi_hdr);
+                            fi_cap = new_cap;
+                        }
+                        fi[ngroups] = i;
+                        ht[slot] = (uint32_t)ngroups;
+                        ngroups++;
+                    }
+                }
+                ray_free(ht_hdr);
+
+                /* Build result table: key column first (gathered from
+                 * the original at fi[]), then every other column the
+                 * same way. Allocation failures and width mismatches
+                 * must propagate — partial results silently dropping
+                 * columns would be a correctness bug. */
+                int64_t nc_src = ray_table_ncols(eval_tbl);
+                ray_t* res = ray_table_new(nc_src);
+                ray_t* first_err = NULL;
+                if (!res || RAY_IS_ERR(res)) {
+                    first_err = res && RAY_IS_ERR(res) ? res : ray_error("oom", NULL);
+                    res = NULL;
+                    goto fog_cleanup;
+                }
+
+                for (int64_t pass = 0; pass < nc_src + 1 && !first_err; pass++) {
+                    int64_t cn;
+                    if (pass == 0) cn = by_key_sym;
+                    else {
+                        cn = ray_table_col_name(eval_tbl, pass - 1);
+                        if (cn == by_key_sym) continue;
+                    }
+                    ray_t* sc = ray_table_get_col(eval_tbl, cn);
+                    if (!sc) continue;
+                    ray_t* dst = NULL;
+                    int8_t sct = sc->type;
+                    if (RAY_IS_PARTED(sct)) sct = (int8_t)RAY_PARTED_BASETYPE(sct);
+
+                    if (sct == RAY_STR) {
+                        dst = ray_vec_new(RAY_STR, ngroups);
+                        for (int64_t gi = 0; gi < ngroups && dst && !RAY_IS_ERR(dst); gi++) {
+                            size_t slen = 0;
+                            const char* sp = ray_str_vec_get(sc, fi[gi], &slen);
+                            dst = ray_str_vec_append(dst, sp ? sp : "", sp ? slen : 0);
+                        }
+                    } else if (sct == RAY_LIST) {
+                        dst = ray_list_new((int32_t)ngroups);
+                        if (dst && !RAY_IS_ERR(dst)) {
+                            ray_t** sitems = (ray_t**)ray_data(sc);
+                            ray_t** dout = (ray_t**)ray_data(dst);
+                            for (int64_t gi = 0; gi < ngroups; gi++) {
+                                dout[gi] = sitems[fi[gi]];
+                                ray_retain(dout[gi]);
+                            }
+                            dst->len = ngroups;
+                        }
+                    } else if (sct == RAY_SYM) {
+                        /* Preserve the source sym-width from attrs so
+                         * narrow sym columns (1/2/4-byte indices)
+                         * memcpy the same esz on both sides. */
+                        dst = ray_sym_vec_new(sc->attrs & RAY_SYM_W_MASK, ngroups);
+                        if (dst && !RAY_IS_ERR(dst)) {
+                            dst->len = ngroups;
+                            uint8_t esz = ray_sym_elem_size(sct, dst->attrs);
+                            const char* sb = (const char*)ray_data(sc);
+                            char* db = (char*)ray_data(dst);
+                            bool src_has_nulls = (sc->attrs & RAY_ATTR_HAS_NULLS) != 0;
+                            for (int64_t gi = 0; gi < ngroups; gi++) {
+                                memcpy(db + (size_t)gi * esz,
+                                       sb + (size_t)fi[gi] * esz, esz);
+                                if (src_has_nulls && ray_vec_is_null(sc, fi[gi]))
+                                    ray_vec_set_null(dst, gi, true);
+                            }
+                        }
+                    } else {
+                        dst = ray_vec_new(sct, ngroups);
+                        if (dst && !RAY_IS_ERR(dst)) {
+                            dst->len = ngroups;
+                            uint8_t esz = ray_sym_elem_size(sct, sc->attrs);
+                            const char* sb = (const char*)ray_data(sc);
+                            char* db = (char*)ray_data(dst);
+                            bool src_has_nulls = (sc->attrs & RAY_ATTR_HAS_NULLS) != 0;
+                            for (int64_t gi = 0; gi < ngroups; gi++) {
+                                memcpy(db + (size_t)gi * esz,
+                                       sb + (size_t)fi[gi] * esz, esz);
+                                if (src_has_nulls && ray_vec_is_null(sc, fi[gi]))
+                                    ray_vec_set_null(dst, gi, true);
+                            }
+                        }
+                    }
+
+                    if (!dst || RAY_IS_ERR(dst)) {
+                        first_err = (dst && RAY_IS_ERR(dst)) ? dst : ray_error("oom", NULL);
+                        if (dst && !RAY_IS_ERR(dst)) ray_release(dst);
+                        break;
+                    }
+                    res = ray_table_add_col(res, cn, dst);
+                    ray_release(dst);
+                    if (RAY_IS_ERR(res)) { first_err = res; res = NULL; break; }
+                }
+
+            fog_cleanup:
+                ray_free(fi_hdr);
+                if (eval_tbl != tbl) ray_release(eval_tbl);
+                ray_release(tbl);
+                if (first_err) {
+                    if (res) ray_release(res);
+                    return first_err;
+                }
+                return apply_sort_take(res, dict_elems, dict_n, asc_id, desc_id, take_id);
+            }
+
+            ray_t* groups_dict = ray_group_fn(key_col);
+            if (RAY_IS_ERR(groups_dict)) { if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); return groups_dict; }
+            /* Flatten the dict into the legacy [k0,v0,…] interleaved LIST
+             * representation that the rest of this branch was written for. */
+            ray_t* groups = groups_to_pair_list(groups_dict);
+            ray_release(groups_dict);
+            if (RAY_IS_ERR(groups)) { if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); return groups; }
+
+            int64_t gn = ray_len(groups);
+            int64_t n_groups = gn / 2;
+
+            /* Empty groups with no explicit aggs: return empty table with full schema */
+            if (n_groups == 0 && n_out == 0) {
+                ray_release(groups);
+                int64_t nc0 = ray_table_ncols(eval_tbl);
+                ray_t* empty = ray_table_new(nc0);
+                if (!RAY_IS_ERR(empty)) {
+                    /* Key column first */
+                    { ray_t* sc = ray_table_get_col(eval_tbl, by_key_sym);
+                      if (sc) {
+                        ray_t* ev = ray_vec_new(sc->type, 0);
+                        if (ev && !RAY_IS_ERR(ev)) { empty = ray_table_add_col(empty, by_key_sym, ev); ray_release(ev); }
+                      }
+                    }
+                    for (int64_t c = 0; c < nc0; c++) {
+                        int64_t cn = ray_table_col_name(eval_tbl, c);
+                        if (cn == by_key_sym) continue;
+                        ray_t* sc = ray_table_get_col_idx(eval_tbl, c);
+                        ray_t* ev = (sc->type == RAY_STR) ? ray_vec_new(RAY_STR, 0) :
+                                    (sc->type == RAY_LIST) ? ray_list_new(0) :
+                                    ray_vec_new(sc->type, 0);
+                        if (ev && !RAY_IS_ERR(ev)) { empty = ray_table_add_col(empty, cn, ev); ray_release(ev); }
+                    }
+                }
+                if (eval_tbl != tbl) ray_release(eval_tbl);
+                ray_release(tbl);
+                return empty;
+            }
+
+            /* Collect aggregation results */
+            int n_agg_out = 0;
+            int64_t agg_names[16];
+            ray_t* agg_results[16];
+            for (int64_t i = 0; i + 1 < dict_n && n_agg_out < 16; i += 2) {
+                int64_t kid = dict_elems[i]->i64;
+                if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id) continue;
+                ray_t* val_expr_item = dict_elems[i + 1];
+
+                if (is_aggr_unary_call(val_expr_item)) {
+                    /* Streaming-style per-group AGG branch.  Accepts both
+                     * the resolve_agg_opcode whitelist (sum/avg/min/max/...)
+                     * and the broader RAY_FN_AGGR + RAY_UNARY set
+                     * (med/dev/var/stddev/...) — for the eval-fallback path
+                     * the only thing the body needs is a unary fn pointer
+                     * to call directly with the per-group slice. */
+                    ray_t** agg_elems = (ray_t**)ray_data(val_expr_item);
+                    ray_t* agg_fn_name = agg_elems[0];
+                    ray_t* agg_col_expr = agg_elems[1];
+
+                    /* Resolve source column from filtered table */
+                    ray_t* src_col_val = NULL;
+                    if (agg_col_expr->type == -RAY_SYM && (agg_col_expr->attrs & RAY_ATTR_NAME)) {
+                        src_col_val = ray_table_get_col(eval_tbl, agg_col_expr->i64);
+                        if (src_col_val) ray_retain(src_col_val);
+                    }
+                    if (!src_col_val) {
+                        src_col_val = ray_eval(agg_col_expr);
+                        if (RAY_IS_ERR(src_col_val)) {
+                            for (int ai = 0; ai < n_agg_out; ai++) { if (agg_results[ai]) ray_release(agg_results[ai]); }
+                            ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); return src_col_val;
+                        }
+                    }
+
+                    /* For each group, compute aggregation */
+                    ray_t* agg_vec = NULL;
+                    ray_t** grp_items = (ray_t**)ray_data(groups);
+                    for (int64_t gi = 0; gi < n_groups; gi++) {
+                        ray_t* idx_list = grp_items[gi * 2 + 1];
+                        ray_t* subset = ray_at_fn(src_col_val, idx_list);
+                        if (RAY_IS_ERR(subset)) continue;
+                        ray_t* agg_val = NULL;
+                        ray_t* fn_obj = ray_env_get(agg_fn_name->i64);
+                        if (fn_obj && fn_obj->type == RAY_UNARY) {
+                            ray_unary_fn uf = (ray_unary_fn)(uintptr_t)fn_obj->i64;
+                            agg_val = uf(subset);
+                        }
+                        ray_release(subset);
+                        if (!agg_val || RAY_IS_ERR(agg_val)) continue;
+
+                        if (!agg_vec) {
+                            int8_t vt = -(agg_val->type);
+                            agg_vec = ray_vec_new(vt, n_groups);
+                            if (RAY_IS_ERR(agg_vec)) { ray_release(agg_val); break; }
+                            agg_vec->len = n_groups;
+                        }
+                        store_typed_elem(agg_vec, gi, agg_val);
+                        ray_release(agg_val);
+                    }
+                    ray_release(src_col_val);
+                    agg_names[n_agg_out] = kid;
+                    agg_results[n_agg_out] = agg_vec;
+                    n_agg_out++;
+                } else {
+                    /* Non-aggregation expression: evaluate on full table,
+                     * then gather per-group subsets into a LIST column
+                     * (non-agg produces list-of-vectors). */
+                    if (ray_env_push_scope() != RAY_OK) {
+                        for (int ai = 0; ai < n_agg_out; ai++) { if (agg_results[ai]) ray_release(agg_results[ai]); }
+                        ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl);
+                        return ray_error("oom", NULL);
+                    }
+                    expr_bind_table_names(val_expr_item, eval_tbl);
+                    ray_t* full_val = ray_eval(val_expr_item);
+                    ray_env_pop_scope();
+                    if (RAY_IS_ERR(full_val)) {
+                        for (int ai = 0; ai < n_agg_out; ai++) { if (agg_results[ai]) ray_release(agg_results[ai]); }
+                        ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); return full_val;
+                    }
+
+                    /* Build LIST column: pre-allocate, then gather per group.
+                     * Direct pointer assignment avoids ray_list_append overhead. */
+                    ray_t* list_col = ray_alloc(n_groups * sizeof(ray_t*));
+                    if (!list_col || RAY_IS_ERR(list_col)) {
+                        ray_release(full_val);
+                        for (int ai = 0; ai < n_agg_out; ai++) { if (agg_results[ai]) ray_release(agg_results[ai]); }
+                        ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl);
+                        return ray_error("oom", NULL);
+                    }
+                    list_col->type = RAY_LIST;
+                    /* Track filled length incrementally — see the DAG
+                     * scatter above for rationale (no memset, exact
+                     * cleanup via v->len walk in ray_release). */
+                    list_col->len = 0;
+                    ray_t** list_out = (ray_t**)ray_data(list_col);
+
+                    /* Decide per-group disposition of full_val:
+                     *   - expression references a column → result must
+                     *     be row-aligned; a typed-vec or LIST whose len
+                     *     matches eval_tbl's nrows → gather, otherwise
+                     *     that's a genuine bug and we error out.
+                     *   - expression is constant (no column refs) →
+                     *     broadcast as-is to every group cell. */
+                    int64_t eval_nrows = ray_table_nrows(eval_tbl);
+                    int refs_column = expr_refs_row_column(val_expr_item, eval_tbl);
+                    int is_indexable =
+                        ray_is_vec(full_val) || full_val->type == RAY_LIST;
+                    int full_is_row_aligned =
+                        is_indexable && full_val->len == eval_nrows;
+
+                    if (refs_column && !full_is_row_aligned) {
+                        /* Non-streaming aggregation fallback: the full-table
+                         * eval didn't produce a row-aligned shape (e.g. a
+                         * user lambda returned a scalar from a vector arg),
+                         * so collect per-group and post-apply the expression
+                         * to each group's slice.  Each cell can be any shape;
+                         * homogeneous-scalar cells collapse to a typed vec. */
+                        ray_release(full_val);
+                        ray_release(list_col);  /* len=0, walks nothing */
+                        ray_t* per_group = nonagg_eval_per_group(
+                            val_expr_item, eval_tbl, groups, n_groups);
+                        if (RAY_IS_ERR(per_group)) {
+                            for (int ai = 0; ai < n_agg_out; ai++) { if (agg_results[ai]) ray_release(agg_results[ai]); }
+                            ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl);
+                            return per_group;
+                        }
+                        /* core produces typed vec or list as appropriate */
+                        agg_names[n_agg_out] = kid;
+                        agg_results[n_agg_out] = per_group;
+                        n_agg_out++;
+                        continue;
+                    }
+
+                    ray_t** gi_items = (ray_t**)ray_data(groups);
+                    for (int64_t gi = 0; gi < n_groups; gi++) {
+                        ray_t* idx_list = gi_items[gi * 2 + 1];
+                        ray_t* cell;
+                        if (full_is_row_aligned) {
+                            cell = gather_by_idx(full_val,
+                                (int64_t*)ray_data(idx_list), idx_list->len);
+                        } else {
+                            /* Pure constant (no column refs) → broadcast */
+                            ray_retain(full_val);
+                            cell = full_val;
+                        }
+                        list_out[gi] = cell;
+                        list_col->len = gi + 1;  /* commit slot */
+                    }
+                    ray_release(full_val);
+                    agg_names[n_agg_out] = kid;
+                    agg_results[n_agg_out] = list_col;
+                    n_agg_out++;
+                }
+            }
+
+            /* Build result table: key column + aggregation columns */
+            ray_t* result = ray_table_new(1 + n_agg_out);
+            if (RAY_IS_ERR(result)) { ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); return result; }
+
+            /* Key column: build a typed vector matching the source column type */
+            ray_t** grp_items = (ray_t**)ray_data(groups);
+            ray_t* key_col_src = ray_table_get_col(eval_tbl, by_key_sym);
+            {
+                int8_t ktype = key_col_src ? key_col_src->type : RAY_I64;
+                if (RAY_IS_PARTED(ktype)) ktype = (int8_t)RAY_PARTED_BASETYPE(ktype);
+                ray_t* key_vec;
+                if (ktype == RAY_STR) {
+                    key_vec = ray_vec_new(RAY_STR, n_groups);
+                    for (int64_t gi = 0; gi < n_groups && key_vec && !RAY_IS_ERR(key_vec); gi++) {
+                        ray_t* k = grp_items[gi * 2];
+                        const char* sp = ray_str_ptr(k);
+                        size_t slen = ray_str_len(k);
+                        key_vec = ray_str_vec_append(key_vec, sp ? sp : "", sp ? slen : 0);
+                    }
+                } else {
+                    uint8_t kattrs = key_col_src ? key_col_src->attrs : 0;
+                    if (ktype == RAY_SYM)
+                        key_vec = ray_sym_vec_new(kattrs & RAY_SYM_W_MASK, n_groups);
+                    else
+                        key_vec = ray_vec_new(ktype, n_groups);
+                    if (key_vec && !RAY_IS_ERR(key_vec)) {
+                        key_vec->len = n_groups;
+                        /* Zero-fill data region so skipped GUID/null slots are safe */
+                        memset(ray_data(key_vec), 0, (size_t)n_groups * ray_sym_elem_size(ktype, key_vec->attrs));
+                        for (int64_t gi = 0; gi < n_groups; gi++)
+                            store_typed_elem(key_vec, gi, grp_items[gi * 2]);
+                    }
+                }
+                if (!key_vec || RAY_IS_ERR(key_vec)) {
+                    for (int i = 0; i < n_agg_out; i++) { if (agg_results[i]) ray_release(agg_results[i]); }
+                    ray_release(result); ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl);
+                    return key_vec ? key_vec : ray_error("oom", NULL);
+                }
+                result = ray_table_add_col(result, by_key_sym, key_vec);
+                ray_release(key_vec);
+            }
+
+            for (int i = 0; i < n_agg_out; i++) {
+                if (agg_results[i])
+                    result = ray_table_add_col(result, agg_names[i], agg_results[i]);
+                if (agg_results[i]) ray_release(agg_results[i]);
+            }
+
+            /* No explicit aggs: gather first-of-group for all non-key columns */
+            if (n_agg_out == 0 && n_groups > 0) {
+                ray_t** gi_items = (ray_t**)ray_data(groups);
+                /* Collect first index per group */
+                int64_t fi_stack[256];
+                ray_t* fi_hdr = NULL;
+                int64_t* fi = (n_groups <= 256) ? fi_stack : NULL;
+                if (!fi) {
+                    fi_hdr = ray_alloc((size_t)n_groups * sizeof(int64_t));
+                    if (!fi_hdr) { ray_release(result); ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); return ray_error("oom", NULL); }
+                    fi = (int64_t*)ray_data(fi_hdr);
+                }
+                for (int64_t gi = 0; gi < n_groups; gi++) {
+                    ray_t* il = gi_items[gi * 2 + 1];
+                    int a = 0; ray_t* i0 = collection_elem(il, 0, &a);
+                    fi[gi] = as_i64(i0);
+                    if (a) ray_release(i0);
+                }
+                int64_t nc = ray_table_ncols(eval_tbl);
+                for (int64_t c = 0; c < nc && !RAY_IS_ERR(result); c++) {
+                    int64_t cn = ray_table_col_name(eval_tbl, c);
+                    if (cn == by_key_sym) continue;
+                    ray_t* sc = ray_table_get_col_idx(eval_tbl, c);
+                    ray_t* dst = NULL;
+                    if (sc->type == RAY_STR) {
+                        dst = ray_vec_new(RAY_STR, n_groups);
+                        bool src_has_nulls = (sc->attrs & RAY_ATTR_HAS_NULLS) != 0;
+                        for (int64_t gi = 0; gi < n_groups && dst && !RAY_IS_ERR(dst); gi++) {
+                            if (src_has_nulls && ray_vec_is_null(sc, fi[gi])) {
+                                dst = ray_str_vec_append(dst, "", 0);
+                                if (dst && !RAY_IS_ERR(dst))
+                                    ray_vec_set_null(dst, dst->len - 1, true);
+                            } else {
+                                size_t slen = 0;
+                                const char* sp = ray_str_vec_get(sc, fi[gi], &slen);
+                                dst = ray_str_vec_append(dst, sp ? sp : "", sp ? slen : 0);
+                            }
+                        }
+                    } else if (sc->type == RAY_LIST) {
+                        dst = ray_alloc(n_groups * sizeof(ray_t*));
+                        if (dst) {
+                            dst->type = RAY_LIST; dst->len = n_groups;
+                            ray_t** dout = (ray_t**)ray_data(dst);
+                            ray_t** sitems = (ray_t**)ray_data(sc);
+                            for (int64_t gi = 0; gi < n_groups; gi++) { dout[gi] = sitems[fi[gi]]; ray_retain(dout[gi]); }
+                        }
+                    } else {
+                        dst = ray_vec_new(sc->type, n_groups);
+                        if (dst && !RAY_IS_ERR(dst)) {
+                            /* len BEFORE the loop: store_typed_elem's null
+                             * path routes through ray_vec_set_null which
+                             * silently drops out-of-range writes — post-
+                             * loop assignment would lose the null bit on
+                             * every nullable row in this gather. */
+                            dst->len = n_groups;
+                            for (int64_t gi = 0; gi < n_groups; gi++) {
+                                int a = 0; ray_t* v = collection_elem(sc, fi[gi], &a);
+                                store_typed_elem(dst, gi, v);
+                                if (a) ray_release(v);
+                            }
+                        }
+                    }
+                    if (!dst || RAY_IS_ERR(dst)) {
+                        if (dst) ray_release(dst);
+                        ray_release(result);
+                        result = ray_error("oom", NULL);
+                        break;
+                    }
+                    result = ray_table_add_col(result, cn, dst);
+                    ray_release(dst);
+                }
+                if (fi_hdr) ray_free(fi_hdr);
+            }
+
+            ray_release(groups);
+            if (eval_tbl != tbl) ray_release(eval_tbl);
+            ray_release(tbl);
+            return apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id);
+        }
+
+        /* Pre-scan: any non-aggregation expressions?  If so and there's a
+         * WHERE, we must materialize the filtered table first so the
+         * post-DAG scatter evaluates on filtered data (matching agg semantics). */
+        int has_nonagg = 0;
+        for (int64_t i = 0; i + 1 < dict_n; i += 2) {
+            int64_t kid = dict_elems[i]->i64;
+            if (kid == from_id || kid == where_id || kid == by_id ||
+                kid == take_id || kid == asc_id || kid == desc_id) continue;
+            if (!is_agg_expr(dict_elems[i + 1])) { has_nonagg = 1; break; }
+        }
+
+        /* The post-DAG scatter needs a flat single-segment table: it
+         * reads key columns directly and runs ray_eval over the whole
+         * input.  Detect parted tables up front — if the source is
+         * parted and there's no WHERE to materialize it, return nyi. */
+        int table_is_parted = 0;
+        if (has_nonagg) {
+            int64_t ncols = ray_table_ncols(tbl);
+            for (int64_t c = 0; c < ncols; c++) {
+                ray_t* col = ray_table_get_col_idx(tbl, c);
+                if (col && RAY_IS_PARTED(col->type)) { table_is_parted = 1; break; }
+            }
+            if (table_is_parted && !where_expr) {
+                ray_graph_free(g); ray_release(tbl);
+                return ray_error("nyi", "non-agg expression on parted table without WHERE");
+            }
+        }
+
+        /* WHERE + BY handling.  Two paths:
+         *
+         *   (A) Fused path — applicable when there are no non-agg
+         *       output expressions and the source table is flat
+         *       (not parted).  Execute the filter node in-place
+         *       via exec_node; OP_FILTER on a TABLE input installs
+         *       a lazy RAY_SEL bitmap on g->selection and returns
+         *       the original uncompacted table.  The subsequent
+         *       ray_group call builds its own key/agg scans over
+         *       g->table, and exec_group honours g->selection in
+         *       the radix / DA / sequential paths — so no rows are
+         *       materialized twice.  This is the fast path for
+         *       `select ... by ... where` queries.
+         *
+         *   (B) Materialize path — applicable when (A) is not.
+         *       Pre-execute the filter and flatten into a new
+         *       table, then rebuild the graph.  Needed because
+         *       the non-agg scatter runs ray_eval over a flat
+         *       single-segment table, and parted tables need
+         *       segment-level flattening before group anyway.
+         *
+         * (This also fixes a pre-existing WHERE-vs-by bug: any
+         * WHERE clause on a `select ... by` query was silently
+         * ignored before the filter was wired through the group
+         * pipeline.) */
+        if (where_expr) {
+            bool can_fuse = !has_nonagg && !table_is_parted;
+            if (can_fuse) {
+                root = ray_optimize(g, root);
+                /* exec_node populates g->selection as a side effect
+                 * of OP_FILTER on a table input, and returns the
+                 * uncompacted table (== g->table).  Discard the
+                 * result — we only needed the side effect. */
+                ray_t* fres = exec_node(g, root);
+                if (!fres || RAY_IS_ERR(fres)) {
+                    if (g->selection) {
+                        ray_release(g->selection);
+                        g->selection = NULL;
+                    }
+                    ray_graph_free(g); ray_release(tbl);
+                    return fres ? fres : ray_error("domain", NULL);
+                }
+                /* OP_CONST/OP_FILTER both retain, so the returned
+                 * table has an extra refcount we must release.
+                 * g->table still owns tbl via the graph, so this
+                 * only drops the exec-node-side retain. */
+                ray_release(fres);
+            } else {
+                root = ray_optimize(g, root);
+                ray_t* fres = ray_execute(g, root);
+                ray_graph_free(g); g = NULL;
+                if (!fres || RAY_IS_ERR(fres)) { ray_release(tbl); return fres ? fres : ray_error("domain", NULL); }
+                if (ray_is_lazy(fres)) fres = ray_lazy_materialize(fres);
+                if (!fres || RAY_IS_ERR(fres)) { ray_release(tbl); return fres ? fres : ray_error("domain", NULL); }
+                ray_release(tbl);
+                tbl = fres;
+                g = ray_graph_new(tbl);
+                if (!g) { ray_release(tbl); return ray_error("oom", NULL); }
+                root = ray_const_table(g, tbl);
+            }
+        }
+
+        /* Compile group key(s) */
+        ray_op_t* key_ops[16];
+        uint8_t n_keys = 0;
+
+        if (by_expr->type == RAY_SYM) {
+            /* Multiple keys as SYM vector: [col1 col2 ...] */
+            int64_t nk = ray_len(by_expr);
+            int64_t* sym_ids = (int64_t*)ray_data(by_expr);
+            for (int64_t i = 0; i < nk && n_keys < 16; i++) {
+                ray_t* name_str = ray_sym_str(sym_ids[i]);
+                if (!name_str) { ray_graph_free(g); ray_release(tbl); return ray_error("domain", NULL); }
+                key_ops[n_keys] = ray_scan(g, ray_str_ptr(name_str));
+                if (!key_ops[n_keys]) { ray_graph_free(g); ray_release(tbl); return ray_error("domain", NULL); }
+                n_keys++;
+            }
+        } else {
+            /* Single key expression */
+            key_ops[0] = compile_expr_dag(g, by_expr);
+            if (!key_ops[0]) { ray_graph_free(g); ray_release(tbl); return ray_error("domain", NULL); }
+            n_keys = 1;
+        }
+
+        /* Collect aggregation expressions from output columns.
+         * Non-agg expressions are tracked separately for post-DAG scatter. */
+        uint16_t agg_ops[16];
+        ray_op_t* agg_ins[16];
+        uint8_t n_aggs = 0;
+
+        for (int64_t i = 0; i + 1 < dict_n; i += 2) {
+            int64_t kid = dict_elems[i]->i64;
+            if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id) continue;
+
+            ray_t* val_expr = dict_elems[i + 1];
+            if (is_agg_expr(val_expr) && n_aggs < 16) {
+                ray_t** agg_elems = (ray_t**)ray_data(val_expr);
+                agg_ops[n_aggs] = resolve_agg_opcode(agg_elems[0]->i64);
+                /* Compile the aggregation input (the column reference) */
+                agg_ins[n_aggs] = compile_expr_dag(g, agg_elems[1]);
+                if (!agg_ins[n_aggs]) { ray_graph_free(g); ray_release(tbl); return ray_error("domain", NULL); }
+                n_aggs++;
+            } else if (!is_agg_expr(val_expr) && n_nonaggs < 16) {
+                nonagg_names[n_nonaggs] = kid;
+                nonagg_exprs[n_nonaggs] = val_expr;
+                n_nonaggs++;
+            }
+        }
+
+        if (n_aggs > 0 || n_nonaggs > 0) {
+            if (n_aggs > 0) {
+                root = ray_group(g, key_ops, n_keys, agg_ops, agg_ins, n_aggs);
+            } else {
+                /* No aggs but non-agg expressions exist — still need group
+                 * boundaries.  Use GROUP+COUNT on the key to get group keys.
+                 * The count column will be dropped after execution. */
+                uint16_t cnt_op = OP_COUNT;
+                ray_op_t* cnt_in = key_ops[0];
+                root = ray_group(g, key_ops, n_keys, &cnt_op, &cnt_in, 1);
+                synth_count_col = 1;
+            }
+        } else {
+            /* No explicit aggregations — apply WHERE filter first (if any),
+             * then use DAG GROUP+COUNT for fast hash-parallel group boundaries,
+             * then gather first-of-group from the filtered table. */
+            ray_t* filtered_tbl = tbl;
+            if (where_expr) {
+                root = ray_optimize(g, root);
+                ray_t* fres = ray_execute(g, root);
+                ray_graph_free(g); g = NULL;
+                if (!fres || RAY_IS_ERR(fres)) { ray_release(tbl); return fres ? fres : ray_error("domain", NULL); }
+                if (ray_is_lazy(fres)) fres = ray_lazy_materialize(fres);
+                if (!fres || RAY_IS_ERR(fres)) { ray_release(tbl); return fres ? fres : ray_error("domain", NULL); }
+                filtered_tbl = fres;
+                /* Rebuild graph on filtered table for GROUP+COUNT */
+                g = ray_graph_new(filtered_tbl);
+                if (!g) { if (filtered_tbl != tbl) ray_release(filtered_tbl); ray_release(tbl); return ray_error("oom", NULL); }
+                n_keys = 0;
+                if (by_expr->type == RAY_SYM) {
+                    int64_t nk = ray_len(by_expr);
+                    int64_t* sym_ids = (int64_t*)ray_data(by_expr);
+                    for (int64_t i = 0; i < nk && n_keys < 16; i++) {
+                        ray_t* ns = ray_sym_str(sym_ids[i]);
+                        if (ns) key_ops[n_keys++] = ray_scan(g, ray_str_ptr(ns));
+                    }
+                } else {
+                    key_ops[0] = compile_expr_dag(g, by_expr);
+                    if (key_ops[0]) n_keys = 1;
+                }
+            }
+
+            uint16_t cnt_op = OP_COUNT;
+            ray_op_t* cnt_in = key_ops[0];
+            root = ray_group(g, key_ops, n_keys, &cnt_op, &cnt_in, 1);
+            root = ray_optimize(g, root);
+            ray_t* grouped = ray_execute(g, root);
+            ray_graph_free(g); g = NULL;
+            if (!grouped || RAY_IS_ERR(grouped)) { if (filtered_tbl != tbl) ray_release(filtered_tbl); ray_release(tbl); return grouped; }
+            if (ray_is_lazy(grouped)) grouped = ray_lazy_materialize(grouped);
+
+            int64_t n_groups = ray_table_nrows(grouped);
+
+            /* Resolve key column sym early — needed for empty result schema.
+             * A dotted name like `Timestamp.date` compiles to a scan + trunc
+             * chain, not a direct column lookup, so it must land in the
+             * computed-key fallback path below (key_sym stays -1).  Otherwise
+             * downstream `ray_table_get_col(filtered_tbl, key_sym)` would
+             * return NULL for the non-existent "Timestamp.date" column and
+             * the subsequent deref would crash. */
+            int64_t key_sym = -1;
+            if (by_expr->type == -RAY_SYM && (by_expr->attrs & RAY_ATTR_NAME)
+                && !ray_sym_is_dotted(by_expr->i64))
+                key_sym = by_expr->i64;
+            else if (by_expr->type == RAY_SYM && ray_len(by_expr) == 1)
+                key_sym = ((int64_t*)ray_data(by_expr))[0];
+
+            if (n_groups == 0) {
+                ray_release(grouped);
+                int64_t nc0 = ray_table_ncols(filtered_tbl);
+                ray_t* empty = ray_table_new(nc0 + 1);
+                if (!RAY_IS_ERR(empty)) {
+                    /* Key column.  For a plain/column key, key_sym
+                     * names a real source column and we mirror its
+                     * type.  For a computed key (dotted, xbar, ...)
+                     * we evaluate by_expr against the filtered (empty)
+                     * table to learn the key's type and name without
+                     * duplicating schema derivation logic. */
+                    int64_t empty_key_name = key_sym;
+                    ray_t* empty_key_vec = NULL;
+                    if (key_sym >= 0) {
+                        ray_t* sc = ray_table_get_col(filtered_tbl, key_sym);
+                        if (sc) {
+                            empty_key_vec = (sc->type == RAY_STR)
+                                            ? ray_vec_new(RAY_STR, 0)
+                                            : ray_vec_new(sc->type, 0);
+                        }
+                    } else {
+                        /* Match the computed-key fallback's naming
+                         * rules (dotted tail / last name arg) and
+                         * collision handling. */
+                        int64_t ck_name = -1;
+                        int64_t ck_full = -1;
+                        int64_t ck_head = -1;
+                        if (by_expr->type == -RAY_SYM && (by_expr->attrs & RAY_ATTR_NAME)) {
+                            ck_full = by_expr->i64;
+                            if (ray_sym_is_dotted(by_expr->i64)) {
+                                const int64_t* segs;
+                                int nsegs = ray_sym_segs(by_expr->i64, &segs);
+                                if (nsegs > 0) { ck_name = segs[nsegs - 1]; ck_head = segs[0]; }
+                            } else {
+                                ck_name = by_expr->i64;
+                            }
+                        } else if (by_expr->type == RAY_LIST && by_expr->len >= 2) {
+                            ray_t** be = (ray_t**)ray_data(by_expr);
+                            for (int64_t i = by_expr->len - 1; i >= 1; i--) {
+                                if (be[i]->type == -RAY_SYM && (be[i]->attrs & RAY_ATTR_NAME)) {
+                                    ck_name = be[i]->i64;
+                                    break;
+                                }
+                            }
+                        }
+                        if (ck_name < 0) ck_name = ray_sym_intern("key", 3);
+                        if (ck_head >= 0 && ck_full >= 0 && ck_name != ck_full) {
+                            for (int64_t c = 0; c < nc0; c++) {
+                                int64_t cn = ray_table_col_name(filtered_tbl, c);
+                                if (cn == ck_name && cn != ck_head) {
+                                    ck_name = ck_full;
+                                    break;
+                                }
+                            }
+                        }
+                        empty_key_name = ck_name;
+
+                        /* Evaluate by_expr against the (empty) filtered table
+                         * to get a length-0 key vector typed like the
+                         * non-empty path would produce it. */
+                        ray_env_push_scope();
+                        for (int64_t c = 0; c < nc0; c++) {
+                            ray_env_set_local(ray_table_col_name(filtered_tbl, c),
+                                              ray_table_get_col_idx(filtered_tbl, c));
+                        }
+                        ray_t* ck_vec = ray_eval(by_expr);
+                        ray_env_pop_scope();
+                        if (ck_vec && !RAY_IS_ERR(ck_vec) && ray_is_vec(ck_vec)) {
+                            int8_t kt = ck_vec->type;
+                            empty_key_vec = (kt == RAY_STR)
+                                            ? ray_vec_new(RAY_STR, 0)
+                                            : (kt == RAY_LIST)
+                                              ? ray_list_new(0)
+                                              : ray_vec_new(kt, 0);
+                        }
+                        if (ck_vec && !RAY_IS_ERR(ck_vec)) ray_release(ck_vec);
+                    }
+                    if (empty_key_vec && !RAY_IS_ERR(empty_key_vec)) {
+                        empty = ray_table_add_col(empty, empty_key_name, empty_key_vec);
+                        ray_release(empty_key_vec);
+                    }
+
+                    for (int64_t c = 0; c < nc0; c++) {
+                        int64_t cn = ray_table_col_name(filtered_tbl, c);
+                        if (cn == empty_key_name) continue;
+                        ray_t* sc = ray_table_get_col_idx(filtered_tbl, c);
+                        ray_t* ev = (sc->type == RAY_STR) ? ray_vec_new(RAY_STR, 0) :
+                                    (sc->type == RAY_LIST) ? ray_list_new(0) :
+                                    ray_vec_new(sc->type, 0);
+                        if (!RAY_IS_ERR(ev)) { empty = ray_table_add_col(empty, cn, ev); ray_release(ev); }
+                    }
+                }
+                if (filtered_tbl != tbl) ray_release(filtered_tbl);
+                ray_release(tbl);
+                return empty;
+            }
+
+            /* Build first_idx: scan filtered key column once, record first
+             * occurrence of each group key value. */
+            if (key_sym < 0) {
+                /* Computed group key (e.g., xbar) — fall back to eval-level groupby */
+                ray_release(grouped);
+                int64_t tbl_ncols = ray_table_ncols(filtered_tbl);
+                ray_env_push_scope();
+                for (int64_t c = 0; c < tbl_ncols; c++) {
+                    int64_t cn = ray_table_col_name(filtered_tbl, c);
+                    ray_t* cv = ray_table_get_col_idx(filtered_tbl, c);
+                    ray_env_set_local(cn, cv);
+                }
+                ray_t* computed_key = ray_eval(by_expr);
+                ray_env_pop_scope();
+                if (!computed_key || RAY_IS_ERR(computed_key)) {
+                    if (filtered_tbl != tbl) ray_release(filtered_tbl);
+                    ray_release(tbl);
+                    return computed_key ? computed_key : ray_error("domain", NULL);
+                }
+                ray_t* groups2_dict = ray_group_fn(computed_key);
+                if (!groups2_dict || RAY_IS_ERR(groups2_dict)) {
+                    ray_release(computed_key);
+                    if (filtered_tbl != tbl) ray_release(filtered_tbl);
+                    ray_release(tbl);
+                    return groups2_dict ? groups2_dict : ray_error("domain", NULL);
+                }
+                ray_t* groups2 = groups_to_pair_list(groups2_dict);
+                ray_release(groups2_dict);
+                if (RAY_IS_ERR(groups2)) {
+                    ray_release(computed_key);
+                    if (filtered_tbl != tbl) ray_release(filtered_tbl);
+                    ray_release(tbl);
+                    return groups2;
+                }
+                int64_t ng2 = ray_len(groups2) / 2;
+                if (ng2 == 0) { ray_release(groups2); ray_release(computed_key); if (filtered_tbl != tbl) ray_release(filtered_tbl); ray_release(tbl); return ray_table_new(0); }
+                ray_t** gi2 = (ray_t**)ray_data(groups2);
+
+                /* fi2 must sweep EVERY group, not just the first 256 —
+                 * the downstream result-column loops iterate up to ng2
+                 * and indexed reads beyond a fixed-size stack slot would
+                 * pick up uninitialised bytes.  Stack-fast for small
+                 * group counts, heap-fallback once we need more. */
+                int64_t fi2_stack[256];
+                ray_t*  fi2_hdr = NULL;
+                int64_t* fi2 = fi2_stack;
+                if (ng2 > 256) {
+                    fi2_hdr = ray_alloc((size_t)ng2 * sizeof(int64_t));
+                    if (!fi2_hdr) {
+                        ray_release(groups2); ray_release(computed_key);
+                        if (filtered_tbl != tbl) ray_release(filtered_tbl);
+                        ray_release(tbl);
+                        return ray_error("oom", NULL);
+                    }
+                    fi2 = (int64_t*)ray_data(fi2_hdr);
+                }
+                for (int64_t g2 = 0; g2 < ng2; g2++) {
+                    int alloc2 = 0;
+                    ray_t* i02 = collection_elem(gi2[g2 * 2 + 1], 0, &alloc2);
+                    fi2[g2] = as_i64(i02);
+                    if (alloc2) ray_release(i02);
+                }
+                /* Name for the synthesized key column:
+                 *  - dotted sym `a.b.c` → tail segment (`c`) so `Timestamp.ss`
+                 *    surfaces as an `ss` column (pretty in the common case).
+                 *    If the tail collides with an *unrelated* source column
+                 *    (not the head of the dotted path), fall back to the
+                 *    full dotted name so we don't silently drop real data.
+                 *  - list expr `(xbar N col)` / `(+ col 1)` → last name-typed
+                 *    argument, so the transform's output deliberately
+                 *    replaces the source column (matches xbar convention).
+                 *  - fall back to an interned "key" if nothing more specific
+                 *    can be derived. */
+                int64_t ckey_name = -1;
+                int64_t ckey_full = -1;       /* full dotted sym, for collision fallback */
+                int64_t ckey_head = -1;       /* head segment of dotted expr (input column) */
+                if (by_expr->type == -RAY_SYM && (by_expr->attrs & RAY_ATTR_NAME)) {
+                    ckey_full = by_expr->i64;
+                    if (ray_sym_is_dotted(by_expr->i64)) {
+                        const int64_t* segs;
+                        int nsegs = ray_sym_segs(by_expr->i64, &segs);
+                        if (nsegs > 0) {
+                            ckey_name = segs[nsegs - 1];
+                            ckey_head = segs[0];
+                        }
+                    } else {
+                        ckey_name = by_expr->i64;
+                    }
+                } else if (by_expr->type == RAY_LIST && by_expr->len >= 2) {
+                    ray_t** be = (ray_t**)ray_data(by_expr);
+                    for (int64_t i = by_expr->len - 1; i >= 1; i--) {
+                        if (be[i]->type == -RAY_SYM && (be[i]->attrs & RAY_ATTR_NAME)) {
+                            ckey_name = be[i]->i64;
+                            break;
+                        }
+                    }
+                }
+                if (ckey_name < 0) ckey_name = ray_sym_intern("key", 3);
+
+                /* Collision check for dotted tail: if the tail name matches
+                 * a source column that isn't the head of the dotted expr,
+                 * the old code silently dropped that source column from the
+                 * result.  Promote to the full dotted sym so both stay. */
+                if (ckey_head >= 0 && ckey_full >= 0 && ckey_name != ckey_full) {
+                    for (int64_t c = 0; c < tbl_ncols; c++) {
+                        int64_t cn = ray_table_col_name(filtered_tbl, c);
+                        if (cn == ckey_name && cn != ckey_head) {
+                            ckey_name = ckey_full;
+                            break;
+                        }
+                    }
+                }
+
+                ray_t* res2 = ray_table_new(tbl_ncols + 1);
+                /* Key column: computed_key's first-of-group values, which
+                 * are the distinct grouping-key values surfaced to the
+                 * user.  Using the source column at fi2 indices would lose
+                 * the transform (e.g. raw Timestamp instead of its `.ss`). */
+                if (ray_is_vec(computed_key)) {
+                    ray_t* kv = ray_vec_new(computed_key->type, ng2);
+                    if (!RAY_IS_ERR(kv)) {
+                        /* len BEFORE store loop — ray_vec_set_null (called
+                         * by store_typed_elem for null atoms) range-checks
+                         * idx against vec->len and silently no-ops
+                         * otherwise. */
+                        kv->len = ng2;
+                        for (int64_t g2 = 0; g2 < ng2; g2++) {
+                            int a2 = 0;
+                            ray_t* v2 = collection_elem(computed_key, fi2[g2], &a2);
+                            store_typed_elem(kv, g2, v2);
+                            if (a2) ray_release(v2);
+                        }
+                        res2 = ray_table_add_col(res2, ckey_name, kv);
+                        ray_release(kv);
+                    }
+                }
+                for (int64_t c = 0; c < tbl_ncols; c++) {
+                    int64_t cn = ray_table_col_name(filtered_tbl, c);
+                    /* Avoid duplicating a column name already used by the
+                     * key: e.g. `by: Timestamp` (plain, non-dotted) would
+                     * collide with the source Timestamp column. */
+                    if (cn == ckey_name) continue;
+                    ray_t* sc = ray_table_get_col_idx(filtered_tbl, c);
+                    ray_t* dc = ray_vec_new(sc->type, ng2);
+                    dc->len = ng2;    /* see note above — hoisted for null bits */
+                    for (int64_t g2 = 0; g2 < ng2; g2++) { int a2 = 0; ray_t* v2 = collection_elem(sc, fi2[g2], &a2); store_typed_elem(dc, g2, v2); if (a2) ray_release(v2); }
+                    res2 = ray_table_add_col(res2, cn, dc); ray_release(dc);
+                }
+                if (fi2_hdr) ray_free(fi2_hdr);
+                ray_release(groups2); ray_release(computed_key);
+                if (filtered_tbl != tbl) ray_release(filtered_tbl);
+                ray_release(tbl);
+                return res2;
+            }
+
+            ray_t* orig_key_col = ray_table_get_col(filtered_tbl, key_sym);
+            int64_t nrows_orig = orig_key_col ? orig_key_col->len : 0;
+
+            /* Read group key values from grouped table BEFORE releasing it.
+             * grp_key_col points into grouped — must not access after release. */
+            ray_t* grp_key_col = ray_table_get_col(grouped, key_sym);
+            int8_t kt = orig_key_col ? orig_key_col->type : 0;
+
+            /* Heap-allocate gk_vals when n_groups > 256 */
+            int64_t gk_stack[256];
+            ray_t* gk_heap_hdr = NULL;
+            int64_t* gk_vals = gk_stack;
+            if (n_groups > 256) {
+                gk_heap_hdr = ray_alloc((size_t)n_groups * sizeof(int64_t));
+                if (!gk_heap_hdr) { ray_release(grouped); if (filtered_tbl != tbl) ray_release(filtered_tbl); ray_release(tbl); return ray_error("oom", NULL); }
+                gk_vals = (int64_t*)ray_data(gk_heap_hdr);
+            }
+
+            /* Copy group key values while grouped is still alive.
+             * STR/LIST/GUID keys are routed through eval-level fallback
+             * above, so only integer-like types reach here.  Use
+             * read_col_i64 for non-F64 types — it dispatches on the
+             * column type (I32/I16/I8/BOOL/SYM adaptive width etc.),
+             * whereas ray_read_sym interprets `attrs` as SYM width and
+             * silently truncates to 1 byte for plain integer columns
+             * where attrs doesn't carry width bits.
+             *
+             * We also record a per-group null flag.  The DAG GROUP path
+             * stores null keys with value=0 and differentiates via a
+             * null mask — if we hashed raw bits only, a null group would
+             * collide with non-null value 0 (for I64 / I32 / SYM / DATE
+             * / TIME etc.) or with +0.0 for F64 (ray_hash_f64 normalises
+             * -0.0 to +0.0, and F64's null bit pattern on this platform
+             * is the -0.0 pattern).  The null flag keeps those groups
+             * distinct. */
+            uint8_t gk_null_stack[256];
+            ray_t*  gk_null_hdr = NULL;
+            uint8_t* gk_null = gk_null_stack;
+            if (n_groups > 256) {
+                gk_null_hdr = ray_alloc((size_t)n_groups * sizeof(uint8_t));
+                if (!gk_null_hdr) {
+                    if (gk_heap_hdr) ray_free(gk_heap_hdr);
+                    ray_release(grouped);
+                    if (filtered_tbl != tbl) ray_release(filtered_tbl);
+                    ray_release(tbl);
+                    return ray_error("oom", NULL);
+                }
+                gk_null = (uint8_t*)ray_data(gk_null_hdr);
+            }
+            memset(gk_null, 0, (size_t)n_groups * sizeof(uint8_t));
+
+            if (grp_key_col) {
+                bool gk_has_nulls = (grp_key_col->attrs & RAY_ATTR_HAS_NULLS) != 0;
+                for (int64_t gi = 0; gi < n_groups; gi++) {
+                    if (kt == RAY_F64)
+                        memcpy(&gk_vals[gi], &((double*)ray_data(grp_key_col))[gi], 8);
+                    else
+                        gk_vals[gi] = read_col_i64(ray_data(grp_key_col), gi, kt, grp_key_col->attrs);
+                    if (gk_has_nulls && ray_vec_is_null(grp_key_col, gi))
+                        gk_null[gi] = 1;
+                }
+            }
+            ray_release(grouped); /* grp_key_col is now invalid */
+
+            /* Allocate first_idx */
+            int64_t first_idx_stack[256];
+            ray_t* fi_heap_hdr = NULL;
+            int64_t* first_idx = first_idx_stack;
+            if (n_groups > 256) {
+                fi_heap_hdr = ray_alloc((size_t)n_groups * sizeof(int64_t));
+                if (!fi_heap_hdr) { if (gk_heap_hdr) ray_free(gk_heap_hdr); if (filtered_tbl != tbl) ray_release(filtered_tbl); ray_release(tbl); return ray_error("oom", NULL); }
+                first_idx = (int64_t*)ray_data(fi_heap_hdr);
+            }
+
+            /* Build {key_bits -> group_index} hash table from gk_vals so the
+             * scan below is O(nrows_orig + n_groups) instead of
+             * O(nrows_orig * n_groups).  Without this a 1M-row / 1M-group
+             * float-key grouping hangs for tens of seconds — I64 has a
+             * low-cardinality direct-array fast path upstream, but F64
+             * and other non-GUID scalar keys fall through to this scan. */
+            for (int64_t gi = 0; gi < n_groups; gi++) first_idx[gi] = -1;
+            {
+                uint32_t fi_cap = 64;
+                while ((uint64_t)fi_cap < (uint64_t)n_groups * 2 && fi_cap < (1u << 30))
+                    fi_cap <<= 1;
+                uint32_t fi_mask = fi_cap - 1;
+                ray_t* fi_ht_hdr = ray_alloc((size_t)fi_cap * sizeof(uint32_t));
+                if (!fi_ht_hdr) {
+                    if (gk_heap_hdr) ray_free(gk_heap_hdr);
+                    if (fi_heap_hdr) ray_free(fi_heap_hdr);
+                    if (filtered_tbl != tbl) ray_release(filtered_tbl);
+                    ray_release(tbl);
+                    return ray_error("oom", NULL);
+                }
+                uint32_t* fi_ht = (uint32_t*)ray_data(fi_ht_hdr);
+                memset(fi_ht, 0xFF, (size_t)fi_cap * sizeof(uint32_t));
+
+                /* Insert every group key into the HT keyed by bit pattern.
+                 * For F64 keys, hash via the float path; memcpy bit pattern
+                 * out of gk_vals to dodge strict-aliasing.  Null groups
+                 * get a distinct hash so they don't collide with zero-valued
+                 * groups (F64 null has the -0.0 bit pattern, which
+                 * ray_hash_f64 normalises to +0.0; integer-flavoured
+                 * nulls are stored as value=0). */
+                for (int64_t gi = 0; gi < n_groups; gi++) {
+                    uint64_t h;
+                    if (gk_null[gi]) {
+                        h = ray_hash_i64((int64_t)0xDEADBEEFCAFEBABEULL);
+                    } else if (kt == RAY_F64) {
+                        double dv;
+                        memcpy(&dv, &gk_vals[gi], 8);
+                        h = ray_hash_f64(dv);
+                    } else {
+                        h = ray_hash_i64(gk_vals[gi]);
+                    }
+                    uint32_t slot = (uint32_t)(h & fi_mask);
+                    while (fi_ht[slot] != UINT32_MAX) slot = (slot + 1) & fi_mask;
+                    fi_ht[slot] = (uint32_t)gi;
+                }
+
+                /* Single linear scan of the source column; for each row
+                 * hash-lookup its group index and record the first row
+                 * that maps to it.  Terminate early once every group has
+                 * a first-row. */
+                bool orig_nulls_flag = orig_key_col
+                    && (orig_key_col->attrs & RAY_ATTR_HAS_NULLS) != 0;
+                int64_t found = 0;
+                for (int64_t r = 0; r < nrows_orig && found < n_groups; r++) {
+                    bool r_null = orig_nulls_flag && ray_vec_is_null(orig_key_col, r);
+                    int64_t ov;
+                    if (kt == RAY_F64) memcpy(&ov, &((double*)ray_data(orig_key_col))[r], 8);
+                    else ov = read_col_i64(ray_data(orig_key_col), r, kt, orig_key_col->attrs);
+                    uint64_t h;
+                    if (r_null) {
+                        h = ray_hash_i64((int64_t)0xDEADBEEFCAFEBABEULL);
+                    } else if (kt == RAY_F64) {
+                        double dv;
+                        memcpy(&dv, &ov, 8);
+                        h = ray_hash_f64(dv);
+                    } else {
+                        h = ray_hash_i64(ov);
+                    }
+                    uint32_t slot = (uint32_t)(h & fi_mask);
+                    while (fi_ht[slot] != UINT32_MAX) {
+                        uint32_t cand = fi_ht[slot];
+                        bool match = (r_null && gk_null[cand])
+                                     || (!r_null && !gk_null[cand] && gk_vals[cand] == ov);
+                        if (match) {
+                            if (first_idx[cand] < 0) {
+                                first_idx[cand] = r;
+                                found++;
+                            }
+                            break;
+                        }
+                        slot = (slot + 1) & fi_mask;
+                    }
+                }
+                ray_free(fi_ht_hdr);
+            }
+            if (gk_null_hdr) ray_free(gk_null_hdr);
+            if (gk_heap_hdr) ray_free(gk_heap_hdr);
+
+            /* Now build the result table using first_idx gathered above.
+             * key_sym and n_groups are already set. */
+
+            /* Build result table: key column first, then others */
+            int64_t ncols = ray_table_ncols(filtered_tbl);
+            ray_t* result = ray_table_new(ncols);
+            if (RAY_IS_ERR(result)) { if (fi_heap_hdr) ray_free(fi_heap_hdr); ray_release(tbl); return result; }
+
+            /* Add key column first */
+            ray_t* key_vec_src = ray_table_get_col(filtered_tbl, key_sym);
+            if (key_vec_src->type == RAY_STR) {
+                ray_t* key_vec_dst = ray_vec_new(RAY_STR, n_groups);
+                if (!key_vec_dst || RAY_IS_ERR(key_vec_dst)) { if (fi_heap_hdr) ray_free(fi_heap_hdr); ray_release(tbl); ray_release(result); return key_vec_dst ? key_vec_dst : ray_error("oom", NULL); }
+                for (int64_t gi = 0; gi < n_groups; gi++) {
+                    size_t slen = 0;
+                    const char* sp = ray_str_vec_get(key_vec_src, first_idx[gi], &slen);
+                    key_vec_dst = ray_str_vec_append(key_vec_dst, sp ? sp : "", sp ? slen : 0);
+                    if (RAY_IS_ERR(key_vec_dst)) { if (fi_heap_hdr) ray_free(fi_heap_hdr); ray_release(tbl); ray_release(result); return key_vec_dst; }
+                }
+                result = ray_table_add_col(result, key_sym, key_vec_dst);
+                ray_release(key_vec_dst);
+            } else {
+                ray_t* key_vec_dst = ray_vec_new(key_vec_src->type, n_groups);
+                if (RAY_IS_ERR(key_vec_dst)) { if (fi_heap_hdr) ray_free(fi_heap_hdr); ray_release(tbl); ray_release(result); return key_vec_dst; }
+                /* Set len BEFORE the store loop: store_typed_elem routes
+                 * null atoms through ray_vec_set_null, which range-checks
+                 * idx against vec->len and silently returns RAY_ERR_RANGE
+                 * otherwise.  Postponing len=n_groups until after the loop
+                 * therefore dropped the null bit on every nullable key row
+                 * — the result would read back the raw zero/-0.0 bits with
+                 * no HAS_NULLS flag, corrupting the grouped key column. */
+                key_vec_dst->len = n_groups;
+                for (int64_t gi = 0; gi < n_groups; gi++) {
+                    int alloc = 0;
+                    ray_t* val = collection_elem(key_vec_src, first_idx[gi], &alloc);
+                    store_typed_elem(key_vec_dst, gi, val);
+                    if (alloc) ray_release(val);
+                }
+                result = ray_table_add_col(result, key_sym, key_vec_dst);
+                ray_release(key_vec_dst);
+            }
+
+            /* Add non-key columns */
+            for (int64_t c = 0; c < ncols; c++) {
+                int64_t col_name = ray_table_col_name(filtered_tbl, c);
+                if (col_name == key_sym) continue;
+                ray_t* src_col = ray_table_get_col_idx(filtered_tbl, c);
+                int8_t ct = src_col->type;
+
+                if (ct == RAY_STR) {
+                    /* String column: build STR vector */
+                    ray_t* dst = ray_vec_new(RAY_STR, n_groups);
+                    if (!dst || RAY_IS_ERR(dst)) { if (fi_heap_hdr) ray_free(fi_heap_hdr); ray_release(tbl); ray_release(result); return dst ? dst : ray_error("oom", NULL); }
+                    for (int64_t gi = 0; gi < n_groups; gi++) {
+                        size_t slen = 0;
+                        const char* sp = ray_str_vec_get(src_col, first_idx[gi], &slen);
+                        dst = ray_str_vec_append(dst, sp ? sp : "", sp ? slen : 0);
+                        if (RAY_IS_ERR(dst)) { if (fi_heap_hdr) ray_free(fi_heap_hdr); ray_release(tbl); ray_release(result); return dst; }
+                    }
+                    result = ray_table_add_col(result, col_name, dst);
+                    ray_release(dst);
+                } else if (ct == RAY_LIST) {
+                    /* List column: pick items */
+                    ray_t* dst = ray_alloc(n_groups * sizeof(ray_t*));
+                    if (!dst) { if (fi_heap_hdr) ray_free(fi_heap_hdr); ray_release(tbl); ray_release(result); return ray_error("oom", NULL); }
+                    dst->type = RAY_LIST;
+                    dst->len = n_groups;
+                    ray_t** dout = (ray_t**)ray_data(dst);
+                    ray_t** src_items = (ray_t**)ray_data(src_col);
+                    for (int64_t gi = 0; gi < n_groups; gi++) {
+                        dout[gi] = src_items[first_idx[gi]];
+                        ray_retain(dout[gi]);
+                    }
+                    result = ray_table_add_col(result, col_name, dst);
+                    ray_release(dst);
+                } else {
+                    /* Typed vector: copy elements at first indices.
+                     * len must be set before the store loop so null bits
+                     * propagate through store_typed_elem → ray_vec_set_null
+                     * (same reason as the key column above). */
+                    ray_t* dst = ray_vec_new(ct, n_groups);
+                    if (RAY_IS_ERR(dst)) { if (fi_heap_hdr) ray_free(fi_heap_hdr); ray_release(tbl); ray_release(result); return dst; }
+                    dst->len = n_groups;
+                    for (int64_t gi = 0; gi < n_groups; gi++) {
+                        int alloc = 0;
+                        ray_t* val = collection_elem(src_col, first_idx[gi], &alloc);
+                        store_typed_elem(dst, gi, val);
+                        if (alloc) ray_release(val);
+                    }
+                    result = ray_table_add_col(result, col_name, dst);
+                    ray_release(dst);
+                }
+                if (RAY_IS_ERR(result)) { if (fi_heap_hdr) ray_free(fi_heap_hdr); ray_release(tbl); return result; }
+            }
+
+            if (fi_heap_hdr) ray_free(fi_heap_hdr);
+            if (filtered_tbl != tbl) ray_release(filtered_tbl);
+            ray_release(tbl);
+            return apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id);
+        }
+    } else if (n_out > 0) {
+        /* Projection only (no group by) — select specific columns */
+        ray_op_t* col_ops[16];
+        uint8_t nc = 0;
+        for (int64_t i = 0; i + 1 < dict_n; i += 2) {
+            int64_t kid = dict_elems[i]->i64;
+            if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id || kid == nearest_id) continue;
+            if (nc < 16) {
+                col_ops[nc] = compile_expr_dag(g, dict_elems[i + 1]);
+                if (!col_ops[nc]) {
+                    /* Nearest-path resources must be freed here too — the
+                     * rerank handle/query buffers are held across the whole
+                     * ray_select_fn body, not just inside the nearest block. */
+                    if (nearest_handle_owned) ray_release(nearest_handle_owned);
+                    if (nearest_query_owned)  ray_sys_free(nearest_query_owned);
+                    ray_graph_free(g); ray_release(tbl);
+                    return ray_error("domain", NULL);
+                }
+                nc++;
+            }
+        }
+        root = ray_select(g, root, col_ops, nc);
+    }
+
+    /* Sort: collect asc/desc columns in dict iteration order.
+     * Only add to the DAG when there's no group-by — group-by changes the
+     * output schema, so sort on output columns must happen post-execution.
+     * Values are unevaluated — a SYM atom is a column name, a SYM vector
+     * is multiple column names.  No ray_eval needed. */
+    if (has_sort && !by_expr) {
+        ray_op_t* sort_keys[16];
+        uint8_t   sort_descs[16];
+        uint8_t   n_sort = 0;
+        for (int64_t i = 0; i + 1 < dict_n && n_sort < 16; i += 2) {
+            int64_t kid = dict_elems[i]->i64;
+            uint8_t is_desc = 0;
+            if (kid == asc_id) is_desc = 0;
+            else if (kid == desc_id) is_desc = 1;
+            else continue;
+            ray_t* val = dict_elems[i + 1];
+            if (val->type == -RAY_SYM) {
+                /* Single column name */
+                ray_t* s = ray_sym_str(val->i64);
+                sort_keys[n_sort] = ray_scan(g, ray_str_ptr(s));
+                sort_descs[n_sort] = is_desc;
+                n_sort++;
+            } else if (ray_is_vec(val) && val->type == RAY_SYM) {
+                /* Multiple column names */
+                for (int64_t c = 0; c < val->len && n_sort < 16; c++) {
+                    int64_t sid = ray_read_sym(ray_data(val), c, val->type, val->attrs);
+                    ray_t* s = ray_sym_str(sid);
+                    sort_keys[n_sort] = ray_scan(g, ray_str_ptr(s));
+                    sort_descs[n_sort] = is_desc;
+                    n_sort++;
+                }
+            } else {
+                ray_graph_free(g); ray_release(tbl);
+                return ray_error("domain", NULL);
+            }
+        }
+        if (n_sort > 0)
+            root = ray_sort_op(g, root, sort_keys, sort_descs, NULL, n_sort);
+    }
+
+    /* Take: add to DAG only when no group-by and no nearest (rerank
+     * absorbs the take into its k parameter). */
+    ray_t* take_range = NULL;
+    if (take_expr && !by_expr && !nearest_expr) {
+        ray_t* tv = ray_eval(take_expr);
+        if (!tv || RAY_IS_ERR(tv)) { ray_graph_free(g); ray_release(tbl); return tv ? tv : ray_error("domain", NULL); }
+        if (ray_is_atom(tv) && (tv->type == -RAY_I64 || tv->type == -RAY_I32)) {
+            int64_t n_take = (tv->type == -RAY_I64) ? tv->i64 : tv->i32;
+            ray_release(tv);
+            if (n_take >= 0)
+                root = ray_head(g, root, n_take);
+            else
+                root = ray_tail(g, root, -n_take);
+        } else if (ray_is_vec(tv) && (tv->type == RAY_I64 || tv->type == RAY_I32) && tv->len == 2) {
+            take_range = tv;  /* apply after DAG execution */
+        } else {
+            ray_release(tv);
+            ray_graph_free(g); ray_release(tbl);
+            return ray_error("domain", NULL);
+        }
+    }
+
+    /* Optimize and execute */
+    root = ray_optimize(g, root);
+    ray_t* result = ray_execute(g, root);
+
+    ray_graph_free(g);
+    /* The nearest-query buffer was only referenced by ext->rerank.query_vec
+     * and is safe to free once the graph (and thus the op ext) is gone. */
+    if (nearest_query_owned) ray_sys_free(nearest_query_owned);
+    /* The HNSW handle was kept alive through ray_execute so the rerank
+     * ext's idx pointer stayed valid.  Safe to release now that the
+     * graph (and its ext nodes) has been freed. */
+    if (nearest_handle_owned) ray_release(nearest_handle_owned);
+
+    /* Post-process: range take [start count] applied after execution */
+    if (take_range && result && !RAY_IS_ERR(result)) {
+        ray_t* sliced = ray_take_fn(result, take_range);
+        ray_release(result);
+        ray_release(take_range);
+        result = sliced;
+    } else if (take_range) {
+        ray_release(take_range);
+    }
+
+    /* Post-process: reorder GROUP BY BOOL results to match first-occurrence
+     * order in the original table (exec.c radix sort puts false before true) */
+    if (by_expr && result && !RAY_IS_ERR(result) && result->type == RAY_TABLE) {
+        if (ray_is_lazy(result)) result = ray_lazy_materialize(result);
+        if (result && !RAY_IS_ERR(result) && result->type == RAY_TABLE) {
+            ray_t* key_col = ray_table_get_col_idx(result, 0);
+            if (key_col && key_col->type == RAY_BOOL && key_col->len >= 2) {
+                /* Find first-occurrence order of bool values in original
+                 * table.  Accept both scalar `-RAY_SYM` and single-element
+                 * `RAY_SYM` vector forms. */
+                int64_t by_sym = -1;
+                if (by_expr->type == -RAY_SYM)
+                    by_sym = by_expr->i64;
+                else if (by_expr->type == RAY_SYM && ray_len(by_expr) == 1)
+                    by_sym = ((int64_t*)ray_data(by_expr))[0];
+                ray_t* orig_key = (by_sym >= 0) ? ray_table_get_col(tbl, by_sym) : NULL;
+                if (orig_key && orig_key->type == RAY_BOOL && orig_key->len > 0) {
+                    bool first_val = ((bool*)ray_data(orig_key))[0];
+                    bool result_first = ((bool*)ray_data(key_col))[0];
+                    if (first_val != result_first) {
+                        /* Swap rows: reverse row order in all columns */
+                        int64_t nrows_r = ray_table_nrows(result);
+                        int64_t ncols_r = ray_table_ncols(result);
+                        ray_t* reordered = ray_table_new((int32_t)ncols_r);
+                        if (reordered && !RAY_IS_ERR(reordered)) {
+                            int ok = 1;
+                            for (int64_t c = 0; c < ncols_r && ok; c++) {
+                                int64_t cn = ray_table_col_name(result, c);
+                                ray_t* col = ray_table_get_col_idx(result, c);
+                                int esz = ray_elem_size(col->type);
+                                ray_t* new_col = ray_vec_new(col->type, nrows_r);
+                                if (RAY_IS_ERR(new_col)) { ok = 0; break; }
+                                new_col->len = nrows_r;
+                                char* src = (char*)ray_data(col);
+                                char* dst = (char*)ray_data(new_col);
+                                bool has_nulls = (col->attrs & RAY_ATTR_HAS_NULLS) != 0;
+                                for (int64_t r = 0; r < nrows_r; r++) {
+                                    memcpy(dst + r * esz, src + (nrows_r - 1 - r) * esz, esz);
+                                    if (has_nulls && ray_vec_is_null(col, nrows_r - 1 - r))
+                                        ray_vec_set_null(new_col, r, true);
+                                }
+                                reordered = ray_table_add_col(reordered, cn, new_col);
+                                ray_release(new_col);
+                                if (RAY_IS_ERR(reordered)) { ok = 0; break; }
+                            }
+                            if (ok) {
+                                ray_release(result);
+                                result = reordered;
+                            } else if (reordered && !RAY_IS_ERR(reordered)) {
+                                ray_release(reordered);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    /* Drop the synthesized COUNT column (used only to get group
+     * boundaries when n_aggs == 0 && n_nonaggs > 0).  Must happen
+     * before the rename/sort_take steps so they don't see a phantom
+     * column. */
+    if (synth_count_col && by_expr && result && !RAY_IS_ERR(result)) {
+        if (ray_is_lazy(result)) result = ray_lazy_materialize(result);
+        if (result && !RAY_IS_ERR(result) && result->type == RAY_TABLE) {
+            int64_t nc = ray_table_ncols(result);
+            if (nc >= 1) {
+                ray_t* rebuilt = ray_table_new(nc - 1);
+                if (rebuilt && !RAY_IS_ERR(rebuilt)) {
+                    for (int64_t c = 0; c < nc - 1; c++) {
+                        int64_t cn = ray_table_col_name(result, c);
+                        ray_t* col = ray_table_get_col_idx(result, c);
+                        rebuilt = ray_table_add_col(rebuilt, cn, col);
+                    }
+                    ray_release(result);
+                    result = rebuilt;
+                }
+            }
+        }
+    }
+
+    /* NOTE: tbl is released below AFTER the non-agg scatter, which
+     * runs post-rename and post-sort_take so LIST columns do not
+     * flow through the scalar-only apply_sort_take DAG. */
+
+    /* Rename output columns if user specified names */
+    if (result && !RAY_IS_ERR(result) && n_out > 0) {
+        /* Materialize lazy results if needed */
+        if (ray_is_lazy(result)) result = ray_lazy_materialize(result);
+    }
+    if (result && !RAY_IS_ERR(result) && result->type == RAY_TABLE && n_out > 0) {
+        ray_t* schema = ray_table_schema(result);
+        if (schema && !RAY_IS_ERR(schema) && schema->type > 0 && schema->type < RAY_TYPE_COUNT) {
+            int64_t ncols = schema->len;
+            /* Count key columns in by clause */
+            int n_key_cols = 0;
+            if (by_expr) {
+                if (ray_is_vec(by_expr) && by_expr->type == RAY_SYM) n_key_cols = (int)ray_len(by_expr);
+                else n_key_cols = 1;
+            }
+            /* Collect user-defined output column names.
+             * For group-by, the result layout is [keys, aggs..., nonaggs...].
+             * Non-agg columns were added by the post-DAG scatter block
+             * with correct names already — only agg columns need renaming,
+             * in dict-iteration order of the agg entries. */
+            int64_t agg_user_names[16];
+            int64_t all_user_names[16];
+            int n_agg_user = 0;
+            int n_all_user = 0;
+            for (int64_t i = 0; i + 1 < dict_n; i += 2) {
+                int64_t kid = dict_elems[i]->i64;
+                if (kid == from_id || kid == where_id || kid == by_id ||
+                    kid == take_id || kid == asc_id || kid == desc_id) continue;
+                if (n_all_user < 16) all_user_names[n_all_user++] = kid;
+                if (by_expr && !is_agg_expr(dict_elems[i + 1])) continue;
+                if (n_agg_user < 16) agg_user_names[n_agg_user++] = kid;
+            }
+            if (by_expr) {
+                /* Rename only the agg columns (positions after keys).
+                 * Non-agg LIST columns were named at scatter time. */
+                for (int j = 0; j < n_agg_user && n_key_cols + j < ncols; j++)
+                    ray_table_set_col_name(result, n_key_cols + j, agg_user_names[j]);
+            } else {
+                /* Projection-only: columns are in dict order */
+                for (int j = 0; j < n_all_user && n_key_cols + j < ncols; j++)
+                    ray_table_set_col_name(result, n_key_cols + j, all_user_names[j]);
+            }
+        }
+    }
+
+    /* Post-process: scatter non-agg expressions into LIST columns.
+     * Must run BEFORE apply_sort_take so the sort clause can
+     * reference non-agg output columns (and so the take clause
+     * slices the fully-populated result).  apply_sort_take handles
+     * LIST columns in the result table (same path used by the
+     * eval_group branch).
+     *
+     * Reads group keys from the DAG result and builds row→group_id
+     * against the original tbl. */
+    if (n_nonaggs > 0 && by_expr && result && !RAY_IS_ERR(result)) {
+        if (ray_is_lazy(result)) result = ray_lazy_materialize(result);
+        if (result && !RAY_IS_ERR(result) && result->type == RAY_TABLE) {
+            int64_t n_groups = ray_table_nrows(result);
+
+            /* Resolve key sym — gated to single scalar key above. */
+            int64_t ks = -1;
+            if (by_expr->type == -RAY_SYM && (by_expr->attrs & RAY_ATTR_NAME))
+                ks = by_expr->i64;
+            else if (by_expr->type == RAY_SYM && ray_len(by_expr) == 1)
+                ks = ((int64_t*)ray_data(by_expr))[0];
+
+            if (ks < 0) {
+                ray_release(result); ray_release(tbl);
+                return ray_error("domain", NULL);
+            }
+
+            ray_t* orig_key = ray_table_get_col(tbl, ks);
+            ray_t* grp_key  = ray_table_get_col(result, ks);
+            int64_t nrows = orig_key ? orig_key->len : 0;
+
+            if (!orig_key || !grp_key) {
+                ray_release(result); ray_release(tbl);
+                return ray_error("domain", NULL);
+            }
+
+            if (n_groups > 0 && nrows > 0) {
+                int8_t okt = orig_key->type;
+                int8_t gkt = grp_key->type;
+                if (RAY_IS_PARTED(okt)) okt = (int8_t)RAY_PARTED_BASETYPE(okt);
+                if (RAY_IS_PARTED(gkt)) gkt = (int8_t)RAY_PARTED_BASETYPE(gkt);
+
+                /* Type-aware key element reader.  Normalizes any
+                 * comparable scalar key into an int64_t so linear
+                 * scans can use equality.  For floats we bitcast so
+                 * NaN and -0/+0 match the DAG's hash-equality. */
+                #define KEY_READ(dst, vec, base_type, idx) do {                \
+                    const void* _d = ray_data(vec);                            \
+                    switch (base_type) {                                       \
+                    case RAY_BOOL:                                             \
+                    case RAY_U8:   (dst) = ((const uint8_t* )_d)[idx]; break;  \
+                    case RAY_I16:  (dst) = ((const int16_t* )_d)[idx]; break;  \
+                    case RAY_I32:  (dst) = ((const int32_t* )_d)[idx]; break;  \
+                    case RAY_I64:  (dst) = ((const int64_t* )_d)[idx]; break;  \
+                    case RAY_F32: { uint32_t _u;                               \
+                        memcpy(&_u, &((const float*)_d)[idx], 4);              \
+                        (dst) = (int64_t)_u; break; }                          \
+                    case RAY_F64: { int64_t _u;                                \
+                        memcpy(&_u, &((const double*)_d)[idx], 8);             \
+                        (dst) = _u; break; }                                   \
+                    case RAY_DATE: case RAY_TIME:                              \
+                        (dst) = ((const int32_t*)_d)[idx]; break;              \
+                    case RAY_TIMESTAMP:                                        \
+                        (dst) = ((const int64_t*)_d)[idx]; break;              \
+                    case RAY_SYM:                                              \
+                        (dst) = ray_read_sym(_d, (idx), (base_type),           \
+                                             (vec)->attrs); break;             \
+                    default: {                                                 \
+                        /* Unsupported key type: signal via sentinel so the    \
+                         * caller's type-mismatch guard catches it.  Should    \
+                         * not actually reach here because okt == gkt is       \
+                         * checked above and only known types pass. */         \
+                        (dst) = 0; break;                                      \
+                    }                                                          \
+                    }                                                          \
+                } while (0)
+
+                /* Whitelist of key types supported by KEY_READ.  Any
+                 * other type (LIST, STR, GUID, unknown) must error out —
+                 * otherwise KEY_READ silently returns 0 and collapses
+                 * all rows into a single (wrong) group.  LIST/STR/GUID
+                 * are already routed through use_eval_group earlier;
+                 * this is the last-line defense for future additions. */
+                int key_supported =
+                    (okt == RAY_BOOL || okt == RAY_U8   ||
+                     okt == RAY_I16  || okt == RAY_I32  || okt == RAY_I64 ||
+                     okt == RAY_F32  || okt == RAY_F64  ||
+                     okt == RAY_DATE || okt == RAY_TIME || okt == RAY_TIMESTAMP ||
+                     okt == RAY_SYM);
+                if (!key_supported) {
+                    ray_release(result); ray_release(tbl);
+                    return ray_error("nyi", "non-agg scatter: unsupported group key type");
+                }
+
+                /* The DAG group result key column must have a base
+                 * type comparable to the input.  If types differ
+                 * unexpectedly, fall back to error rather than mis-
+                 * compare. */
+                if (okt != gkt) {
+                    ray_release(result); ray_release(tbl);
+                    return ray_error("type", "group key type mismatch");
+                }
+
+                /* Allocations — any failure errors out rather than
+                 * silently returning partial results. */
+                ray_t* gk_hdr  = ray_alloc((size_t)n_groups * sizeof(int64_t));
+                ray_t* rg_hdr  = ray_alloc((size_t)nrows    * sizeof(int64_t));
+                ray_t* cnt_hdr = ray_alloc((size_t)n_groups * sizeof(int64_t));
+                ray_t* off_hdr = ray_alloc((size_t)n_groups * sizeof(int64_t));
+                ray_t* pos_hdr = ray_alloc((size_t)n_groups * sizeof(int64_t));
+                if (!gk_hdr || !rg_hdr || !cnt_hdr || !off_hdr || !pos_hdr) {
+                    if (gk_hdr)  ray_free(gk_hdr);
+                    if (rg_hdr)  ray_free(rg_hdr);
+                    if (cnt_hdr) ray_free(cnt_hdr);
+                    if (off_hdr) ray_free(off_hdr);
+                    if (pos_hdr) ray_free(pos_hdr);
+                    ray_release(result); ray_release(tbl);
+                    return ray_error("oom", NULL);
+                }
+                int64_t* gk      = (int64_t*)ray_data(gk_hdr);
+                int64_t* row_gid = (int64_t*)ray_data(rg_hdr);
+                int64_t* grp_cnt = (int64_t*)ray_data(cnt_hdr);
+                int64_t* offsets = (int64_t*)ray_data(off_hdr);
+                int64_t* pos     = (int64_t*)ray_data(pos_hdr);
+
+                /* Copy group key values from the (possibly sliced) result */
+                for (int64_t gi = 0; gi < n_groups; gi++)
+                    KEY_READ(gk[gi], grp_key, gkt, gi);
+
+                /* Build row→group_id map.  Rows whose key isn't in the
+                 * surviving group set get row_gid = -1 and are skipped. */
+                for (int64_t r = 0; r < nrows; r++) {
+                    int64_t rv;
+                    KEY_READ(rv, orig_key, okt, r);
+                    row_gid[r] = -1;
+                    for (int64_t gi = 0; gi < n_groups; gi++) {
+                        if (rv == gk[gi]) { row_gid[r] = gi; break; }
+                    }
+                }
+                #undef KEY_READ
+
+                memset(grp_cnt, 0, (size_t)n_groups * sizeof(int64_t));
+                for (int64_t r = 0; r < nrows; r++)
+                    if (row_gid[r] >= 0) grp_cnt[row_gid[r]]++;
+
+                int64_t total = 0;
+                for (int64_t gi = 0; gi < n_groups; gi++) total += grp_cnt[gi];
+                ray_t* idx_hdr = ray_alloc((size_t)total * sizeof(int64_t));
+                if (!idx_hdr) {
+                    ray_free(gk_hdr); ray_free(rg_hdr); ray_free(cnt_hdr);
+                    ray_free(off_hdr); ray_free(pos_hdr);
+                    ray_release(result); ray_release(tbl);
+                    return ray_error("oom", NULL);
+                }
+                int64_t* idx_buf = (int64_t*)ray_data(idx_hdr);
+
+                offsets[0] = 0;
+                for (int64_t gi = 1; gi < n_groups; gi++)
+                    offsets[gi] = offsets[gi - 1] + grp_cnt[gi - 1];
+
+                memcpy(pos, offsets, (size_t)n_groups * sizeof(int64_t));
+                for (int64_t r = 0; r < nrows; r++) {
+                    int64_t gi = row_gid[r];
+                    if (gi >= 0) idx_buf[pos[gi]++] = r;
+                }
+
+                ray_t* scatter_err = NULL;
+                for (uint8_t ni = 0; ni < n_nonaggs && !scatter_err; ni++) {
+                    /* Streaming-style fast path for `(aggr_fn col_or_expr)`
+                     * where aggr_fn is RAY_FN_AGGR + RAY_UNARY (sum/avg/...,
+                     * med/dev/var/stddev/...).  Bypasses the full-table eval
+                     * + non-row-aligned fallback by slicing the source per
+                     * group and calling the unary fn directly into a typed
+                     * vec.  Equivalent perf-class to the streaming AGG path
+                     * the eval-fallback uses for the same shapes. */
+                    if (is_aggr_unary_call(nonagg_exprs[ni])) {
+                        ray_t* col = aggr_unary_per_group_buf(
+                            nonagg_exprs[ni], tbl,
+                            idx_buf, offsets, grp_cnt, n_groups);
+                        if (RAY_IS_ERR(col)) { scatter_err = col; break; }
+                        result = ray_table_add_col(result, nonagg_names[ni], col);
+                        ray_release(col);
+                        if (RAY_IS_ERR(result)) {
+                            scatter_err = result; result = NULL; break;
+                        }
+                        continue;
+                    }
+
+                    if (ray_env_push_scope() != RAY_OK) {
+                        scatter_err = ray_error("oom", NULL); break;
+                    }
+                    expr_bind_table_names(nonagg_exprs[ni], tbl);
+                    ray_t* full_val = ray_eval(nonagg_exprs[ni]);
+                    ray_env_pop_scope();
+                    if (!full_val || RAY_IS_ERR(full_val)) {
+                        scatter_err = full_val ? full_val : ray_error("domain", NULL);
+                        break;
+                    }
+
+                    ray_t* list_col = ray_alloc(n_groups * sizeof(ray_t*));
+                    if (!list_col) {
+                        ray_release(full_val);
+                        scatter_err = ray_error("oom", NULL); break;
+                    }
+                    list_col->type = RAY_LIST;
+                    /* Track filled length incrementally: ray_release of
+                     * a RAY_LIST walks exactly v->len children, so
+                     * keeping len in sync with the number of initialized
+                     * slots lets error paths free without touching
+                     * uninitialized memory — and avoids a memset. */
+                    list_col->len = 0;
+                    ray_t** list_out = (ray_t**)ray_data(list_col);
+
+                    /* Decide per-group disposition of full_val:
+                     *   - expression references a column → result must
+                     *     be row-aligned; otherwise that's a bug and
+                     *     we error out rather than silently broadcast.
+                     *   - constant expression (no column refs) →
+                     *     broadcast the value into every group cell. */
+                    int refs_column = expr_refs_row_column(nonagg_exprs[ni], tbl);
+                    int is_indexable =
+                        ray_is_vec(full_val) || full_val->type == RAY_LIST;
+                    int full_is_row_aligned =
+                        is_indexable && full_val->len == nrows;
+
+                    if (refs_column && !full_is_row_aligned) {
+                        /* Non-streaming fallback: the expression didn't
+                         * produce a row-aligned full-table result (e.g. a
+                         * user lambda collapsed a vector to a scalar), so
+                         * collect per-group and post-apply.  Cells can be
+                         * any shape; homogeneous-scalar cells collapse to
+                         * a typed vec. */
+                        ray_release(full_val);
+                        ray_release(list_col);  /* len=0, walks nothing */
+                        ray_t* per_group = nonagg_eval_per_group_buf(
+                            nonagg_exprs[ni], tbl, idx_buf, offsets, grp_cnt, n_groups);
+                        if (RAY_IS_ERR(per_group)) {
+                            scatter_err = per_group; break;
+                        }
+                        /* core produces typed vec or list as appropriate */
+                        result = ray_table_add_col(result, nonagg_names[ni], per_group);
+                        ray_release(per_group);
+                        if (RAY_IS_ERR(result)) {
+                            scatter_err = result; result = NULL; break;
+                        }
+                        continue;
+                    }
+
+                    int gather_ok = 1;
+                    for (int64_t gi = 0; gi < n_groups; gi++) {
+                        ray_t* cell;
+                        if (full_is_row_aligned) {
+                            cell = gather_by_idx(full_val,
+                                &idx_buf[offsets[gi]], grp_cnt[gi]);
+                            if (!cell || RAY_IS_ERR(cell)) {
+                                gather_ok = 0;
+                                break;
+                            }
+                        } else {
+                            /* Constant (no column refs): broadcast */
+                            ray_retain(full_val);
+                            cell = full_val;
+                        }
+                        list_out[gi] = cell;
+                        list_col->len = gi + 1;  /* commit slot */
+                    }
+                    ray_release(full_val);
+
+                    if (!gather_ok) {
+                        ray_release(list_col);  /* releases exactly len filled slots */
+                        scatter_err = ray_error("oom", NULL); break;
+                    }
+
+                    result = ray_table_add_col(result, nonagg_names[ni], list_col);
+                    ray_release(list_col);
+                    if (RAY_IS_ERR(result)) {
+                        scatter_err = result; result = NULL; break;
+                    }
+                }
+
+                ray_free(gk_hdr); ray_free(rg_hdr); ray_free(cnt_hdr);
+                ray_free(off_hdr); ray_free(pos_hdr); ray_free(idx_hdr);
+
+                if (scatter_err) {
+                    if (result) ray_release(result);
+                    ray_release(tbl);
+                    return scatter_err;
+                }
+            } else {
+                /* Empty group set: add empty LIST columns so the
+                 * output schema still includes the user-declared
+                 * non-agg columns. */
+                for (uint8_t ni = 0; ni < n_nonaggs; ni++) {
+                    ray_t* empty_list = ray_list_new(0);
+                    if (!empty_list || RAY_IS_ERR(empty_list)) {
+                        ray_release(result); ray_release(tbl);
+                        return empty_list ? empty_list : ray_error("oom", NULL);
+                    }
+                    result = ray_table_add_col(result, nonagg_names[ni], empty_list);
+                    ray_release(empty_list);
+                    if (RAY_IS_ERR(result)) { ray_release(tbl); return result; }
+                }
+            }
+        }
+    }
+
+    ray_release(tbl);
+
+    /* Post-process: apply sort/take for group-by queries.  Runs
+     * last so non-agg LIST columns are already in the result,
+     * allowing sort clauses to reference non-agg output columns. */
+    if (by_expr && (has_sort || take_expr))
+        result = apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id);
+
+    if (by_sym_vec_owned) ray_release(by_sym_vec_owned);
+
+    return result;
+}
+
+/* Floor `a` toward negative infinity to a multiple of `b` and store it in
+ * *out.  Returns 1 on success, 0 when b == 0 or when the floored multiple
+ * does not fit in int64_t.
+ *
+ * Works from the remainder instead of q * b so that neither INT64_MIN / -1
+ * nor the final multiplication can hit signed-overflow UB. */
+static int xbar_floor_i64(int64_t a, int64_t b, int64_t* out) {
+    if (b == 0) return 0;
+    if (b == -1) { *out = a; return 1; }  /* every integer is a multiple of -1 */
+    int64_t r = a % b;                    /* truncated remainder: sign follows a */
+    if (r != 0 && (r ^ b) < 0) r += b;    /* floored remainder: sign follows b */
+    /* a - r is the floored multiple; reject it when it is out of range. */
+    if ((r > 0 && a < INT64_MIN + r) || (r < 0 && a > INT64_MAX + r)) return 0;
+    *out = a - r;
+    return 1;
+}
+
+/* (xbar col bucket) — time/value bucketing: floor(col/bucket)*bucket
+ *
+ * Collections are unwrapped element-wise through atomic_map_binary.  Atoms:
+ *   - integer col and bucket: i32 result when both are i32, i16 when both
+ *     are i16, i64 for every other combination;
+ *   - remaining numeric pairs (float path): f64 result;
+ *   - temporal col with a temporal or non-f64 numeric bucket: result has
+ *     col's temporal type (time, date, otherwise timestamp).
+ * Returns a "domain" error for a zero bucket, a null operand, or a floored
+ * value that does not fit in int64_t; a "type" error for any other operand
+ * combination. */
+ray_t* ray_xbar_fn(ray_t* col, ray_t* bucket) {
+    /* Recursive unwrap for nested collections (list of vectors) */
+    if (is_collection(col) || is_collection(bucket))
+        return atomic_map_binary(ray_xbar_fn, col, bucket);
+    /* Both are integer types (i64, i32, i16) → integer xbar */
+    if (is_numeric(col) && is_numeric(bucket) && !is_float_op(col, bucket)) {
+        int64_t a = as_i64(col), b = as_i64(bucket), result;
+        if (RAY_ATOM_IS_NULL(col) || RAY_ATOM_IS_NULL(bucket) ||
+            !xbar_floor_i64(a, b, &result))
+            return ray_error("domain", NULL);
+        /* Narrow result only when both operands share the narrow type;
+         * mixed widths fall through to i64. */
+        if (col->type == -RAY_I32 && bucket->type == -RAY_I32) return make_i32((int32_t)result);
+        if (col->type == -RAY_I16 && bucket->type == -RAY_I16) return make_i16((int16_t)result);
+        return make_i64(result);
+    }
+    /* Float path: either operand is f64 */
+    if (is_numeric(col) && is_numeric(bucket)) {
+        if (RAY_ATOM_IS_NULL(col) || RAY_ATOM_IS_NULL(bucket))
+            return ray_error("domain", NULL);
+        double c = as_f64(col), b = as_f64(bucket);
+        if (b == 0.0) return ray_error("domain", NULL);
+        double fq = floor(c / b);
+        return make_f64(fq * b);
+    }
+    /* Temporal xbar: col is temporal, bucket is integer or temporal (not float) */
+    if (is_temporal(col) && (is_temporal(bucket) ||
+        (is_numeric(bucket) && bucket->type != -RAY_F64))) {
+        int64_t a = col->i64, b, result;
+        if (is_temporal(bucket)) {
+            b = bucket->i64;
+            /* Cross-temporal conversion: TIME(ms) bucket on TIMESTAMP(ns) col */
+            if (col->type == -RAY_TIMESTAMP && bucket->type == -RAY_TIME)
+                b *= 1000000LL;
+        } else {
+            b = as_i64(bucket);
+        }
+        /* A null col is rejected here exactly as in the numeric paths, so a
+         * null is never bucketed as if it were an ordinary value. */
+        if (RAY_ATOM_IS_NULL(col) || RAY_ATOM_IS_NULL(bucket) ||
+            !xbar_floor_i64(a, b, &result))
+            return ray_error("domain", NULL);
+        if (col->type == -RAY_TIME) return ray_time(result);
+        if (col->type == -RAY_DATE) return ray_date(result);
+        return ray_timestamp(result);
+    }
+    return ray_error("type", NULL);
+}
+
+/* ══════════════════════════════════════════
+ * Update, Insert, Upsert
+ * ══════════════════════════════════════════ */
+
+/* Insert/upsert helper: append one atom to the typed column vector
+ * `col_vec` and return whatever the underlying append call returns (the
+ * vector on success, an error object otherwise).
+ *
+ * A null atom appends a zero-filled slot and then flags that row as null.
+ * Otherwise the atom type must match the column: I64, SYM, BOOL and STR
+ * columns take the same-typed atom; an F64 column additionally accepts an
+ * I64 atom, which is converted to double.  Any other column/atom pairing
+ * yields a "type" error. */
+static ray_t* append_atom_to_col(ray_t* col_vec, ray_t* atom) {
+    if (RAY_ATOM_IS_NULL(atom)) {
+        /* Capture the slot index before appending: that row gets the flag. */
+        int64_t null_row = col_vec->len;
+        /* NOTE(review): 16 bytes presumably covers the widest element — confirm. */
+        uint8_t zeroed[16] = {0};
+        ray_t* grown = ray_vec_append(col_vec, zeroed);
+        if (RAY_IS_ERR(grown)) return grown;
+        ray_vec_set_null(grown, null_row, true);
+        return grown;
+    }
+
+    /* Each case breaks out to the shared type error on an atom mismatch. */
+    switch (col_vec->type) {
+    case RAY_I64: {
+        if (atom->type != -RAY_I64) break;
+        int64_t payload = atom->i64;
+        return ray_vec_append(col_vec, &payload);
+    }
+    case RAY_SYM: {
+        if (atom->type != -RAY_SYM) break;
+        int64_t sym_id = atom->i64;
+        return ray_vec_append(col_vec, &sym_id);
+    }
+    case RAY_F64: {
+        double payload;
+        if (atom->type == -RAY_F64)      payload = atom->f64;
+        else if (atom->type == -RAY_I64) payload = (double)atom->i64;
+        else break;
+        return ray_vec_append(col_vec, &payload);
+    }
+    case RAY_BOOL: {
+        if (atom->type != -RAY_BOOL) break;
+        uint8_t flag = atom->b8;
+        return ray_vec_append(col_vec, &flag);
+    }
+    case RAY_STR:
+        if (atom->type != -RAY_STR) break;
+        return ray_str_vec_append(col_vec, ray_str_ptr(atom), ray_str_len(atom));
+    default:
+        break;
+    }
+    return ray_error("type", NULL);
+}
+
+/* (update {col: expr ... from: t [where: pred]})
+ * Special form — receives unevaluated dict arg.
+ * For rows matching where (or all if no where), evaluate column expressions
+ * and replace those column values. Returns a new table. */
+/* Forward declarations */
+
+ray_t* ray_update_fn(ray_t** args, int64_t n) {
+    if (n < 1) return ray_error("domain", NULL);
+    ray_t* dict = args[0];
+    if (!dict || dict->type != RAY_DICT)
+        return ray_error("type", NULL);
+
+    ray_t* from_expr = dict_get(dict, "from");
+    if (!from_expr) return ray_error("domain", NULL);
+    /* Detect in-place update: from: 't means quoted symbol */
+    int64_t inplace_sym = -1;
+    ray_t* tbl = ray_eval(from_expr);
+    if (RAY_IS_ERR(tbl)) return tbl;
+    if (tbl->type == -RAY_SYM) {
+        /* from: 't — resolve symbol to table variable */
+        inplace_sym = tbl->i64;
+        ray_release(tbl);
+        tbl = ray_env_get(inplace_sym);
+        if (!tbl || RAY_IS_ERR(tbl)) return ray_error("domain", NULL);
+        ray_retain(tbl);
+    }
+    if (tbl->type != RAY_TABLE) { ray_release(tbl); return ray_error("type", NULL); }
+
+    ray_t* where_expr = dict_get(dict, "where");
+    ray_t* by_expr = dict_get(dict, "by");
+
+    /* UPDATE WITH BY: group, compute aggregate, broadcast back */
+    if (by_expr && !where_expr) {
+        DICT_VIEW_DECL(updv);
+        DICT_VIEW_OPEN(dict, updv);
+        if (DICT_VIEW_OVERFLOW(updv)) {
+            ray_release(tbl);
+            return ray_error("domain", "update clause has too many keys");
+        }
+        int64_t dict_n = updv_n;
+        ray_t** dict_elems = updv;
+        int64_t from_id  = ray_sym_intern("from",  4);
+        int64_t where_id = ray_sym_intern("where", 5);
+        int64_t by_id    = ray_sym_intern("by",    2);
+
+        /* Resolve group key column name.
+         * by_expr is a name reference (not evaluated) — extract sym_id directly */
+        int64_t by_col_name = -1;
+        if (by_expr->type == -RAY_SYM) {
+            by_col_name = by_expr->i64;
+        }
+        if (by_col_name < 0) { ray_release(tbl); return ray_error("type", NULL); }
+
+        /* Find group column in table */
+        ray_t* grp_col = ray_table_get_col(tbl, by_col_name);
+        if (!grp_col) { ray_release(tbl); return ray_error("domain", NULL); }
+        int64_t nrows2 = ray_table_nrows(tbl);
+
+        /* Use ray_group_fn to get group indices: {key: [indices]}.
+         * Flatten the resulting RAY_DICT into the legacy interleaved
+         * [k0,v0,…] LIST shape this branch was written against. */
+        ray_t* groups = NULL;
+        {
+            ray_t* gd = ray_group_fn(grp_col);
+            if (!gd || RAY_IS_ERR(gd)) { ray_release(tbl); return gd ? gd : ray_error("oom", NULL); }
+            groups = groups_to_pair_list(gd);
+            ray_release(gd);
+            if (RAY_IS_ERR(groups)) { ray_release(tbl); return groups; }
+        }
+
+        /* Start with a copy of the original table */
+        int64_t ncols = ray_table_ncols(tbl);
+        ray_t* result = ray_table_new((int32_t)ncols);
+        if (RAY_IS_ERR(result)) { ray_release(groups); ray_release(tbl); return result; }
+        for (int64_t c = 0; c < ncols; c++) {
+            int64_t cn = ray_table_col_name(tbl, c);
+            ray_t* col = ray_table_get_col_idx(tbl, c);
+            ray_retain(col);
+            result = ray_table_add_col(result, cn, col);
+            ray_release(col);
+            if (RAY_IS_ERR(result)) { ray_release(groups); ray_release(tbl); return result; }
+        }
+
+        /* For each aggregate expression, compute per group and broadcast */
+        for (int64_t d = 0; d + 1 < dict_n; d += 2) {
+            int64_t kid = dict_elems[d]->i64;
+            if (kid == from_id || kid == where_id || kid == by_id) continue;
+            ray_t* agg_expr = dict_elems[d + 1];
+
+            /* Evaluate the aggregate for each group and broadcast */
+            ray_t* grp_items = (ray_t**)ray_data(groups) ? groups : NULL;
+            if (!grp_items) { ray_release(result); ray_release(groups); ray_release(tbl); return ray_error("oom", NULL); }
+            int64_t ngroups = groups->len / 2;
+            ray_t** gdata = (ray_t**)ray_data(groups);
+
+            /* We need to evaluate the aggregate per group.
+             * Build the result column by evaluating the expression on each group's subset. */
+            ray_t* out_col = ray_vec_new(RAY_I64, nrows2); /* will be resized to correct type */
+            if (RAY_IS_ERR(out_col)) { ray_release(result); ray_release(groups); ray_release(tbl); return out_col; }
+
+            int8_t out_type = RAY_I64;
+            int first_group = 1;
+
+            for (int64_t gi = 0; gi < ngroups; gi++) {
+                ray_t* idx_vec = gdata[gi * 2 + 1]; /* index vector for this group */
+                int64_t gsize = ray_len(idx_vec);
+
+                /* Build a sub-table for this group */
+                ray_t* sub_tbl = ray_table_new((int32_t)ncols);
+                if (RAY_IS_ERR(sub_tbl)) { ray_release(out_col); ray_release(result); ray_release(groups); ray_release(tbl); return sub_tbl; }
+                for (int64_t c = 0; c < ncols; c++) {
+                    int64_t cn = ray_table_col_name(tbl, c);
+                    ray_t* full_col = ray_table_get_col_idx(tbl, c);
+                    int8_t ct = full_col->type;
+                    ray_t* sub_col = ray_vec_new(ct, gsize);
+                    if (RAY_IS_ERR(sub_col)) { ray_release(sub_tbl); ray_release(out_col); ray_release(result); ray_release(groups); ray_release(tbl); return sub_col; }
+                    sub_col->len = gsize;
+                    int esz = ray_elem_size(ct);
+                    char* src = (char*)ray_data(full_col);
+                    char* dst = (char*)ray_data(sub_col);
+                    int64_t* idxs = (int64_t*)ray_data(idx_vec);
+                    for (int64_t r = 0; r < gsize; r++)
+                        memcpy(dst + r * esz, src + idxs[r] * esz, esz);
+                    sub_tbl = ray_table_add_col(sub_tbl, cn, sub_col);
+                    ray_release(sub_col);
+                    if (RAY_IS_ERR(sub_tbl)) { ray_release(out_col); ray_release(result); ray_release(groups); ray_release(tbl); return sub_tbl; }
+                }
+
+                /* Evaluate expression on sub-table via DAG */
+                ray_graph_t* ug = ray_graph_new(sub_tbl);
+                ray_op_t* expr_op = compile_expr_dag(ug, agg_expr);
+                if (!expr_op) { ray_graph_free(ug); ray_release(sub_tbl); ray_release(out_col); ray_release(result); ray_release(groups); ray_release(tbl); return ray_error("domain", NULL); }
+                expr_op = ray_optimize(ug, expr_op);
+                ray_t* agg_result = ray_execute(ug, expr_op);
+                ray_graph_free(ug);
+                ray_release(sub_tbl);
+
+                if (RAY_IS_ERR(agg_result)) { ray_release(out_col); ray_release(result); ray_release(groups); ray_release(tbl); return agg_result; }
+
+                /* Determine output type from first group */
+                if (first_group) {
+                    if (ray_is_atom(agg_result)) out_type = -agg_result->type;
+                    else if (ray_is_vec(agg_result)) out_type = agg_result->type;
+                    ray_release(out_col);
+                    out_col = ray_vec_new(out_type, nrows2);
+                    if (RAY_IS_ERR(out_col)) { ray_release(agg_result); ray_release(result); ray_release(groups); ray_release(tbl); return out_col; }
+                    out_col->len = nrows2;
+                    first_group = 0;
+                }
+
+                /* Broadcast aggregate value to all rows in this group */
+                int64_t* idxs = (int64_t*)ray_data(idx_vec);
+                if (ray_is_atom(agg_result)) {
+                    for (int64_t r = 0; r < gsize; r++)
+                        store_typed_elem(out_col, idxs[r], agg_result);
+                }
+                ray_release(agg_result);
+            }
+
+            /* Add the new column to the result table */
+            result = ray_table_add_col(result, kid, out_col);
+            ray_release(out_col);
+            if (RAY_IS_ERR(result)) { ray_release(groups); ray_release(tbl); return result; }
+        }
+
+        ray_release(groups);
+        /* Store in-place if needed */
+        if (inplace_sym >= 0) {
+            ray_env_set(inplace_sym, result);
+        }
+        ray_release(tbl);
+        return result;
+    }
+
+    /* Evaluate WHERE using the DAG to get a boolean mask */
+    int64_t nrows = ray_table_nrows(tbl);
+    uint8_t* mask = NULL;
+
+    if (where_expr) {
+        /* Try DAG compilation first, fall back to eval-level */
+        ray_t* mask_vec = NULL;
+        ray_graph_t* g = ray_graph_new(tbl);
+        if (g) {
+            ray_op_t* pred = compile_expr_dag(g, where_expr);
+            if (pred) {
+                pred = ray_optimize(g, pred);
+                mask_vec = ray_execute(g, pred);
+            }
+            ray_graph_free(g);
+        }
+        /* Fallback: eval-level predicate evaluation */
+        if (!mask_vec || RAY_IS_ERR(mask_vec)) {
+            /* Bind column names to column vectors in env, then eval */
+            int64_t ncols2 = ray_table_ncols(tbl);
+            ray_env_push_scope();
+            for (int64_t c = 0; c < ncols2; c++) {
+                int64_t cn = ray_table_col_name(tbl, c);
+                ray_t* col = ray_table_get_col_idx(tbl, c);
+                ray_env_set(cn, col);
+            }
+            mask_vec = ray_eval(where_expr);
+            ray_env_pop_scope();
+        }
+        if (!mask_vec || RAY_IS_ERR(mask_vec)) { ray_release(tbl); return mask_vec ? mask_vec : ray_error("type", NULL); }
+        if (mask_vec->type != RAY_BOOL || mask_vec->len != nrows) {
+            ray_release(mask_vec);
+            ray_release(tbl);
+            return ray_error("type", NULL);
+        }
+        mask = (uint8_t*)ray_data(mask_vec);
+        /* Keep mask_vec alive until we're done */
+
+        /* Build a new table with updated columns */
+        int64_t ncols = ray_table_ncols(tbl);
+        DICT_VIEW_DECL(updw);
+        DICT_VIEW_OPEN(dict, updw);
+        if (DICT_VIEW_OVERFLOW(updw)) {
+            ray_release(mask_vec); ray_release(tbl);
+            return ray_error("domain", "update clause has too many keys");
+        }
+        int64_t dict_n = updw_n;
+        ray_t** dict_elems = updw;
+        int64_t from_id = ray_sym_intern("from", 4);
+        int64_t where_id = ray_sym_intern("where", 5);
+
+        ray_t* result = ray_table_new(ncols);
+        if (RAY_IS_ERR(result)) { ray_release(mask_vec); ray_release(tbl); return result; }
+
+        for (int64_t c = 0; c < ncols; c++) {
+            int64_t col_name = ray_table_col_name(tbl, c);
+            ray_t* orig_col = ray_table_get_col_idx(tbl, c);
+
+            /* Check if this column has an update expression */
+            ray_t* update_expr = NULL;
+            for (int64_t d = 0; d + 1 < dict_n; d += 2) {
+                int64_t kid = dict_elems[d]->i64;
+                if (kid == from_id || kid == where_id) continue;
+                if (kid == col_name) { update_expr = dict_elems[d + 1]; break; }
+            }
+
+            if (!update_expr) {
+                /* No update for this column — copy as-is */
+                ray_retain(orig_col);
+                result = ray_table_add_col(result, col_name, orig_col);
+                ray_release(orig_col);
+            } else {
+                /* Evaluate the expression for each row and apply to matching rows */
+                int8_t ct = orig_col->type;
+                ray_t* new_col = ray_vec_new(ct, nrows);
+                if (RAY_IS_ERR(new_col)) { ray_release(result); ray_release(mask_vec); ray_release(tbl); return new_col; }
+
+                /* Evaluate expression via DAG, fallback to eval-level */
+                ray_t* expr_vec = NULL;
+                {
+                    ray_graph_t* ug = ray_graph_new(tbl);
+                    if (ug) {
+                        ray_op_t* expr_op = compile_expr_dag(ug, update_expr);
+                        if (expr_op) {
+                            expr_op = ray_optimize(ug, expr_op);
+                            expr_vec = ray_execute(ug, expr_op);
+                        }
+                        ray_graph_free(ug);
+                    }
+                }
+                if (!expr_vec || RAY_IS_ERR(expr_vec)) {
+                    /* Fallback: eval with column bindings */
+                    int64_t ncols_e = ray_table_ncols(tbl);
+                    ray_env_push_scope();
+                    for (int64_t c2 = 0; c2 < ncols_e; c2++) {
+                        int64_t cn = ray_table_col_name(tbl, c2);
+                        ray_t* col2 = ray_table_get_col_idx(tbl, c2);
+                        ray_env_set(cn, col2);
+                    }
+                    expr_vec = ray_eval(update_expr);
+                    ray_env_pop_scope();
+                }
+                if (!expr_vec || RAY_IS_ERR(expr_vec)) { ray_release(new_col); ray_release(result); ray_release(mask_vec); ray_release(tbl); return expr_vec ? expr_vec : ray_error("type", NULL); }
+
+                /* WHERE update: expression result replaces ONLY masked rows.
+                 * When type differs (e.g., I64 col, F64 expr from (* col 1.1)),
+                 * keep original column type and cast expr results.
+                 * Only numeric promotions are allowed — STR↔numeric is a type error. */
+                int8_t expr_type = (expr_vec->type < 0) ? -expr_vec->type : expr_vec->type;
+                if (expr_type != ct && expr_type > 0 && ray_is_vec(expr_vec)) {
+                    /* Only allow numeric promotions (I64↔F64, I32↔F64) */
+                    int is_numeric_promo = (ct == RAY_I64 || ct == RAY_I32 || ct == RAY_F64) &&
+                                           (expr_type == RAY_I64 || expr_type == RAY_I32 || expr_type == RAY_F64);
+                    if (!is_numeric_promo) {
+                        ray_release(expr_vec); ray_release(new_col); ray_release(result); ray_release(mask_vec); ray_release(tbl);
+                        return ray_error("type", NULL);
+                    }
+                    /* Copy original column values first */
+                    int esz = ray_elem_size(ct);
+                    memcpy(ray_data(new_col), ray_data(orig_col), (size_t)(nrows * esz));
+                    new_col->len = nrows;
+                    /* Overlay masked rows with type conversion */
+                    for (int64_t r = 0; r < nrows; r++) {
+                        if (!mask[r]) continue;
+                        if (ct == RAY_I64 && expr_type == RAY_F64)
+                            ((int64_t*)ray_data(new_col))[r] = (int64_t)((double*)ray_data(expr_vec))[r];
+                        else if (ct == RAY_I32 && expr_type == RAY_F64)
+                            ((int32_t*)ray_data(new_col))[r] = (int32_t)((double*)ray_data(expr_vec))[r];
+                        else if (ct == RAY_F64 && expr_type == RAY_I64)
+                            ((double*)ray_data(new_col))[r] = (double)((int64_t*)ray_data(expr_vec))[r];
+                    }
+                    /* Null-bit propagation: memcpy above only copies values,
+                     * not the nullmap.  Carry over orig_col's nulls for the
+                     * untouched rows, and pull expr_vec's nulls in for the
+                     * masked rows.  Without this, casting a null F64 expr
+                     * back to an I64 column silently produces 0. */
+                    for (int64_t r = 0; r < nrows; r++) {
+                        ray_t* src = mask[r] ? expr_vec : orig_col;
+                        if (ray_vec_is_null(src, r))
+                            ray_vec_set_null(new_col, r, true);
+                    }
+                    ray_release(expr_vec);
+                    result = ray_table_add_col(result, col_name, new_col);
+                    ray_release(new_col);
+                    if (RAY_IS_ERR(result)) { ray_release(mask_vec); ray_release(tbl); return result; }
+                    continue;
+                }
+
+                /* Broadcast scalar atom to full column vector if needed */
+                if (expr_vec->type < 0) {
+                    /* Type check atom against column type BEFORE broadcast */
+                    int ok = (expr_vec->type == -ct);
+                    if (!ok && ct == RAY_F64 && expr_vec->type == -RAY_I64) ok = 1;
+                    if (!ok && ct == RAY_LIST && expr_vec->type == -RAY_SYM) ok = 1;
+                    if (!ok && ct == RAY_SYM && expr_vec->type == -RAY_SYM) ok = 1;
+                    if (!ok) {
+                        ray_release(expr_vec); ray_release(new_col); ray_release(result); ray_release(mask_vec); ray_release(tbl);
+                        return ray_error("type", NULL);
+                    }
+                    /* SYM atom to LIST column: build boxed list, merge with mask */
+                    if (ct == RAY_LIST && expr_vec->type == -RAY_SYM) {
+                        ray_free(new_col);
+                        ray_t* new_list = ray_list_new((int32_t)nrows);
+                        if (RAY_IS_ERR(new_list)) { ray_release(expr_vec); ray_release(result); ray_release(mask_vec); ray_release(tbl); return new_list; }
+                        ray_t** orig_elems = (ray_t**)ray_data(orig_col);
+                        for (int64_t r = 0; r < nrows; r++) {
+                            ray_t* elem = mask[r] ? expr_vec : orig_elems[r];
+                            ray_retain(elem);
+                            new_list = ray_list_append(new_list, elem);
+                            ray_release(elem);
+                            if (RAY_IS_ERR(new_list)) { ray_release(expr_vec); ray_release(result); ray_release(mask_vec); ray_release(tbl); return new_list; }
+                        }
+                        ray_release(expr_vec);
+                        result = ray_table_add_col(result, col_name, new_list);
+                        ray_release(new_list);
+                        if (RAY_IS_ERR(result)) { ray_release(mask_vec); ray_release(tbl); return result; }
+                        continue;
+                    }
+                    ray_t* bcast = ray_vec_new(ct, nrows);
+                    if (RAY_IS_ERR(bcast)) { ray_release(expr_vec); ray_release(new_col); ray_release(result); ray_release(mask_vec); ray_release(tbl); return bcast; }
+                    if (ct == RAY_STR && expr_vec->type == -RAY_STR) {
+                        const char* sp = ray_str_ptr(expr_vec);
+                        size_t sl = ray_str_len(expr_vec);
+                        for (int64_t r = 0; r < nrows; r++) {
+                            bcast = ray_str_vec_append(bcast, sp, sl);
+                            if (RAY_IS_ERR(bcast)) { ray_release(expr_vec); ray_release(new_col); ray_release(result); ray_release(mask_vec); ray_release(tbl); return bcast; }
+                        }
+                    } else {
+                        size_t esz = (ct == RAY_BOOL) ? 1 : 8;
+                        uint8_t elem[8] = {0};
+                        if (ct == RAY_F64 && expr_vec->type == -RAY_I64) {
+                            double promoted = (double)expr_vec->i64;
+                            memcpy(elem, &promoted, 8);
+                        } else {
+                            memcpy(elem, &expr_vec->i64, esz);
+                        }
+                        for (int64_t r = 0; r < nrows; r++) {
+                            bcast = ray_vec_append(bcast, elem);
+                            if (RAY_IS_ERR(bcast)) { ray_release(expr_vec); ray_release(new_col); ray_release(result); ray_release(mask_vec); ray_release(tbl); return bcast; }
+                        }
+                    }
+                    /* Preserve typed-null markers across broadcast.  Without
+                     * this, (update {a: 0N from: t}) silently writes plain
+                     * zeros into the I64 column — the value bits get copied
+                     * but the null bitmap doesn't, so (nil? a) reports false
+                     * on what should be null cells. */
+                    if (RAY_ATOM_IS_NULL(expr_vec)) {
+                        for (int64_t r = 0; r < nrows; r++)
+                            ray_vec_set_null(bcast, r, true);
+                    }
+                    ray_release(expr_vec);
+                    expr_vec = bcast;
+                }
+
+                /* Promote I64 vector to F64 if column is F64 */
+                if (expr_vec->type == RAY_I64 && ct == RAY_F64) {
+                    int64_t nr = ray_len(expr_vec);
+                    ray_t* promoted = ray_vec_new(RAY_F64, nr);
+                    if (RAY_IS_ERR(promoted)) { ray_release(expr_vec); ray_release(new_col); ray_release(result); ray_release(mask_vec); ray_release(tbl); return promoted; }
+                    int64_t* src_data = (int64_t*)ray_data(expr_vec);
+                    for (int64_t r = 0; r < nr; r++) {
+                        double v = (double)src_data[r];
+                        promoted = ray_vec_append(promoted, &v);
+                        if (RAY_IS_ERR(promoted)) { ray_release(expr_vec); ray_release(new_col); ray_release(result); ray_release(mask_vec); ray_release(tbl); return promoted; }
+                    }
+                    /* Carry the nullmap across the I64→F64 promotion. */
+                    for (int64_t r = 0; r < nr; r++)
+                        if (ray_vec_is_null(expr_vec, r))
+                            ray_vec_set_null(promoted, r, true);
+                    ray_release(expr_vec);
+                    expr_vec = promoted;
+                }
+
+                /* Type check: expr_vec must match original column type */
+                if (expr_vec->type != ct) {
+                    ray_release(expr_vec); ray_release(new_col); ray_release(result); ray_release(mask_vec); ray_release(tbl);
+                    return ray_error("type", NULL);
+                }
+
+                /* Merge: use expr_vec for matching rows, orig_col for non-matching.
+                 * Null-bit propagation applies to STR/SYM as well — a null in
+                 * either the orig column (unmasked rows) or the expr (masked
+                 * rows) must travel into new_col's nullmap. */
+                if (ct == RAY_STR) {
+                    for (int64_t r = 0; r < nrows; r++) {
+                        ray_t* src_vec = mask[r] ? expr_vec : orig_col;
+                        size_t slen = 0;
+                        const char* sp = ray_str_vec_get(src_vec, r, &slen);
+                        new_col = ray_str_vec_append(new_col, sp ? sp : "", sp ? slen : 0);
+                        if (RAY_IS_ERR(new_col)) { ray_release(expr_vec); ray_release(result); ray_release(mask_vec); ray_release(tbl); return new_col; }
+                        if (ray_vec_is_null(src_vec, r))
+                            ray_vec_set_null(new_col, new_col->len - 1, true);
+                    }
+                } else if (ct == RAY_SYM) {
+                    for (int64_t r = 0; r < nrows; r++) {
+                        ray_t* src_vec = mask[r] ? expr_vec : orig_col;
+                        int64_t sym_val = ray_read_sym(ray_data(src_vec), r, src_vec->type, src_vec->attrs);
+                        new_col = ray_vec_append(new_col, &sym_val);
+                        if (RAY_IS_ERR(new_col)) { ray_release(expr_vec); ray_release(result); ray_release(mask_vec); ray_release(tbl); return new_col; }
+                        if (ray_vec_is_null(src_vec, r))
+                            ray_vec_set_null(new_col, new_col->len - 1, true);
+                    }
+                } else {
+                    size_t elem_sz = (ct == RAY_BOOL) ? 1 : 8;
+                    uint8_t* orig_data = (uint8_t*)ray_data(orig_col);
+                    uint8_t* expr_data = (uint8_t*)ray_data(expr_vec);
+                    for (int64_t r = 0; r < nrows; r++) {
+                        ray_t* src_vec = mask[r] ? expr_vec : orig_col;
+                        uint8_t* base  = mask[r] ? expr_data : orig_data;
+                        new_col = ray_vec_append(new_col, base + r * elem_sz);
+                        if (RAY_IS_ERR(new_col)) { ray_release(expr_vec); ray_release(result); ray_release(mask_vec); ray_release(tbl); return new_col; }
+                        /* Propagate null bit from whichever side supplied
+                         * the value.  Without this, masking in a typed-null
+                         * broadcast would copy zero bytes into the slot but
+                         * leave the destination's nullmap clear → silent
+                         * loss of null marker. */
+                        if (ray_vec_is_null(src_vec, r))
+                            ray_vec_set_null(new_col, new_col->len - 1, true);
+                    }
+                }
+                result = ray_table_add_col(result, col_name, new_col);
+                ray_release(new_col);
+                ray_release(expr_vec);
+            }
+            if (RAY_IS_ERR(result)) { ray_release(mask_vec); ray_release(tbl); return result; }
+        }
+
+        ray_release(mask_vec);
+        if (inplace_sym >= 0 && result && !RAY_IS_ERR(result)) {
+            ray_env_set(inplace_sym, result);
+        }
+        ray_release(tbl);
+        return result;
+    }
+
+    /* No WHERE — update all rows */
+    int64_t ncols = ray_table_ncols(tbl);
+    DICT_VIEW_DECL(upda);
+    DICT_VIEW_OPEN(dict, upda);
+    if (DICT_VIEW_OVERFLOW(upda)) {
+        ray_release(tbl);
+        return ray_error("domain", "update clause has too many keys");
+    }
+    int64_t dict_n = upda_n;
+    ray_t** dict_elems = upda;
+    int64_t from_id = ray_sym_intern("from", 4);
+
+    ray_t* result = ray_table_new(ncols);
+    if (RAY_IS_ERR(result)) { ray_release(tbl); return result; }
+
+    for (int64_t c = 0; c < ncols; c++) {
+        int64_t col_name = ray_table_col_name(tbl, c);
+        ray_t* orig_col = ray_table_get_col_idx(tbl, c);
+
+        ray_t* update_expr = NULL;
+        for (int64_t d = 0; d + 1 < dict_n; d += 2) {
+            int64_t kid = dict_elems[d]->i64;
+            if (kid == from_id) continue;
+            if (kid == col_name) { update_expr = dict_elems[d + 1]; break; }
+        }
+
+        if (!update_expr) {
+            ray_retain(orig_col);
+            result = ray_table_add_col(result, col_name, orig_col);
+            ray_release(orig_col);
+        } else {
+            ray_t* expr_vec = NULL;
+            {
+                ray_graph_t* ug = ray_graph_new(tbl);
+                if (ug) {
+                    ray_op_t* expr_op = compile_expr_dag(ug, update_expr);
+                    if (expr_op) {
+                        expr_op = ray_optimize(ug, expr_op);
+                        expr_vec = ray_execute(ug, expr_op);
+                    }
+                    ray_graph_free(ug);
+                }
+            }
+            if (!expr_vec || RAY_IS_ERR(expr_vec)) {
+                /* Fallback: eval with column bindings */
+                int64_t ncols_f = ray_table_ncols(tbl);
+                ray_env_push_scope();
+                for (int64_t cf = 0; cf < ncols_f; cf++) {
+                    int64_t cn = ray_table_col_name(tbl, cf);
+                    ray_t* colf = ray_table_get_col_idx(tbl, cf);
+                    ray_env_set(cn, colf);
+                }
+                expr_vec = ray_eval(update_expr);
+                ray_env_pop_scope();
+            }
+            if (!expr_vec || RAY_IS_ERR(expr_vec)) { ray_release(result); ray_release(tbl); return expr_vec ? expr_vec : ray_error("type", NULL); }
+
+            /* Broadcast scalar atom to full column vector if needed */
+            if (expr_vec->type < 0) {
+                int64_t nrows = ray_table_nrows(tbl);
+                int8_t ct = orig_col->type;
+                /* Type check atom against column type BEFORE broadcast */
+                int ok = (expr_vec->type == -ct);
+                if (!ok && ct == RAY_F64 && expr_vec->type == -RAY_I64) ok = 1;
+                /* SYM atom → LIST column (LIST of SYM atoms) */
+                if (!ok && ct == RAY_LIST && expr_vec->type == -RAY_SYM) ok = 1;
+                if (!ok) {
+                    ray_release(expr_vec); ray_release(result); ray_release(tbl);
+                    return ray_error("type", NULL);
+                }
+                /* SYM atom to LIST column: broadcast as boxed list */
+                if (ct == RAY_LIST && expr_vec->type == -RAY_SYM) {
+                    ray_t* bcast = ray_list_new((int32_t)nrows);
+                    if (RAY_IS_ERR(bcast)) { ray_release(expr_vec); ray_release(result); ray_release(tbl); return bcast; }
+                    for (int64_t r = 0; r < nrows; r++) {
+                        ray_retain(expr_vec);
+                        bcast = ray_list_append(bcast, expr_vec);
+                        ray_release(expr_vec);
+                        if (RAY_IS_ERR(bcast)) { ray_release(expr_vec); ray_release(result); ray_release(tbl); return bcast; }
+                    }
+                    ray_release(expr_vec);
+                    expr_vec = bcast;
+                    goto no_where_add_col;
+                }
+                ray_t* bcast = ray_vec_new(ct, nrows);
+                if (RAY_IS_ERR(bcast)) { ray_release(expr_vec); ray_release(result); ray_release(tbl); return bcast; }
+                if (ct == RAY_STR && expr_vec->type == -RAY_STR) {
+                    const char* sp = ray_str_ptr(expr_vec);
+                    size_t sl = ray_str_len(expr_vec);
+                    for (int64_t r = 0; r < nrows; r++) {
+                        bcast = ray_str_vec_append(bcast, sp, sl);
+                        if (RAY_IS_ERR(bcast)) { ray_release(expr_vec); ray_release(result); ray_release(tbl); return bcast; }
+                    }
+                } else {
+                    size_t esz = (ct == RAY_BOOL) ? 1 : 8;
+                    uint8_t elem[8] = {0};
+                    if (ct == RAY_F64 && expr_vec->type == -RAY_I64) {
+                        double promoted = (double)expr_vec->i64;
+                        memcpy(elem, &promoted, 8);
+                    } else {
+                        memcpy(elem, &expr_vec->i64, esz);
+                    }
+                    for (int64_t r = 0; r < nrows; r++) {
+                        bcast = ray_vec_append(bcast, elem);
+                        if (RAY_IS_ERR(bcast)) { ray_release(expr_vec); ray_release(result); ray_release(tbl); return bcast; }
+                    }
+                }
+                /* Preserve typed-null markers across broadcast (mirrors the
+                 * WHERE branch fix at the analogous site above). */
+                if (RAY_ATOM_IS_NULL(expr_vec)) {
+                    for (int64_t r = 0; r < nrows; r++)
+                        ray_vec_set_null(bcast, r, true);
+                }
+                ray_release(expr_vec);
+                expr_vec = bcast;
+            }
+
+            /* Promote I64 vector to F64 if column is F64 */
+            if (expr_vec->type == RAY_I64 && orig_col->type == RAY_F64) {
+                int64_t nr = ray_len(expr_vec);
+                ray_t* promoted = ray_vec_new(RAY_F64, nr);
+                if (RAY_IS_ERR(promoted)) { ray_release(expr_vec); ray_release(result); ray_release(tbl); return promoted; }
+                int64_t* src_data = (int64_t*)ray_data(expr_vec);
+                for (int64_t r = 0; r < nr; r++) {
+                    double v = (double)src_data[r];
+                    promoted = ray_vec_append(promoted, &v);
+                    if (RAY_IS_ERR(promoted)) { ray_release(expr_vec); ray_release(result); ray_release(tbl); return promoted; }
+                }
+                /* Carry the nullmap across the I64→F64 promotion. */
+                for (int64_t r = 0; r < nr; r++)
+                    if (ray_vec_is_null(expr_vec, r))
+                        ray_vec_set_null(promoted, r, true);
+                ray_release(expr_vec);
+                expr_vec = promoted;
+            }
+
+            /* No-WHERE update: allow type change for same-category types.
+             * Atoms (type<0) will be broadcast later, check after broadcast.
+             * For vectors, check now: only numeric promotions or same type.
+             * Also allow SYM/LIST interop (columns may be stored as LIST). */
+            if (expr_vec->type > 0 && expr_vec->type != orig_col->type) {
+                int is_ok = 0;
+                /* Numeric promotions */
+                if ((orig_col->type == RAY_I64 || orig_col->type == RAY_I32 || orig_col->type == RAY_F64) &&
+                    (expr_vec->type == RAY_I64 || expr_vec->type == RAY_I32 || expr_vec->type == RAY_F64))
+                    is_ok = 1;
+                /* SYM/LIST interop */
+                if ((orig_col->type == RAY_SYM || orig_col->type == RAY_LIST) &&
+                    (expr_vec->type == RAY_SYM || expr_vec->type == RAY_LIST))
+                    is_ok = 1;
+                if (!is_ok) {
+                    ray_release(expr_vec); ray_release(result); ray_release(tbl);
+                    return ray_error("type", NULL);
+                }
+            }
+
+no_where_add_col:
+            result = ray_table_add_col(result, col_name, expr_vec);
+            ray_release(expr_vec);
+        }
+        if (RAY_IS_ERR(result)) { ray_release(tbl); return result; }
+    }
+
+    /* Add NEW columns from dict (columns not already in the table) */
+    for (int64_t d = 0; d + 1 < dict_n; d += 2) {
+        int64_t kid = dict_elems[d]->i64;
+        if (kid == from_id) continue;
+        /* Check if this column already exists */
+        int exists = 0;
+        for (int64_t c = 0; c < ncols; c++) {
+            if (ray_table_col_name(tbl, c) == kid) { exists = 1; break; }
+        }
+        if (exists) continue;
+
+        /* New column: evaluate expression and add */
+        ray_t* update_expr = dict_elems[d + 1];
+        ray_graph_t* ug = ray_graph_new(tbl);
+        ray_op_t* expr_op = compile_expr_dag(ug, update_expr);
+        if (!expr_op) { ray_release(result); ray_release(tbl); ray_graph_free(ug); return ray_error("domain", NULL); }
+        expr_op = ray_optimize(ug, expr_op);
+        ray_t* expr_vec = ray_execute(ug, expr_op);
+        ray_graph_free(ug);
+        if (RAY_IS_ERR(expr_vec)) { ray_release(result); ray_release(tbl); return expr_vec; }
+
+        /* Broadcast scalar to column */
+        if (expr_vec->type < 0) {
+            int64_t nrows = ray_table_nrows(tbl);
+            int8_t ct = -expr_vec->type;
+            ray_t* bcast = ray_vec_new(ct, nrows);
+            if (RAY_IS_ERR(bcast)) { ray_release(expr_vec); ray_release(result); ray_release(tbl); return bcast; }
+            size_t esz = ray_elem_size(ct);
+            uint8_t elem[8] = {0};
+            memcpy(elem, &expr_vec->i64, esz > 8 ? 8 : esz);
+            for (int64_t r = 0; r < nrows; r++) {
+                bcast = ray_vec_append(bcast, elem);
+                if (RAY_IS_ERR(bcast)) { ray_release(expr_vec); ray_release(result); ray_release(tbl); return bcast; }
+            }
+            /* Preserve typed-null markers across broadcast (mirrors the
+             * existing-column branches above).  Without this,
+             * (update {c: 0N from: t}) would silently materialise a
+             * brand-new column of plain zeros. */
+            if (RAY_ATOM_IS_NULL(expr_vec)) {
+                for (int64_t r = 0; r < nrows; r++)
+                    ray_vec_set_null(bcast, r, true);
+            }
+            ray_release(expr_vec);
+            expr_vec = bcast;
+        }
+
+        result = ray_table_add_col(result, kid, expr_vec);
+        ray_release(expr_vec);
+        if (RAY_IS_ERR(result)) { ray_release(tbl); return result; }
+    }
+
+    /* Store in-place if from: 't */
+    if (inplace_sym >= 0 && result && !RAY_IS_ERR(result)) {
+        ray_env_set(inplace_sym, result);
+    }
+    ray_release(tbl);
+    return result;
+}
+
+/* (insert table (list val1 val2 ...)) — append a row to a table; also (insert vec-or-list val) to append and (insert vec-or-list idx val) to insert at a position */
+ray_t* ray_insert_fn(ray_t** args, int64_t n) {
+    if (n < 2) return ray_error("domain", NULL);
+
+    /* Special form: detect 'sym (quoted symbol for in-place insert) */
+    int64_t inplace_sym = -1;
+    ray_t* tbl_raw = args[0];
+    ray_t* tbl;
+
+    /* Detect calling convention: already-evaluated args (from upsert) vs raw parse tree */
+    int already_eval = (tbl_raw && tbl_raw->type == RAY_TABLE);
+
+    if (!already_eval && tbl_raw && tbl_raw->type == -RAY_SYM && !(tbl_raw->attrs & RAY_ATTR_NAME)) {
+        /* Quoted symbol 'sym (no ATTR_NAME) — in-place insert */
+        inplace_sym = tbl_raw->i64;
+        tbl = ray_env_get(inplace_sym);
+        if (!tbl || RAY_IS_ERR(tbl)) return ray_error("domain", NULL);
+        ray_retain(tbl);
+    } else if (already_eval) {
+        tbl = tbl_raw;
+        ray_retain(tbl);
+    } else {
+        tbl = ray_eval(tbl_raw);
+        if (!tbl || RAY_IS_ERR(tbl)) return tbl ? tbl : ray_error("type", NULL);
+    }
+
+    /* ====================================================================
+     * Vec/list dispatch — n==2 append, n==3 positional insert.
+     * Tables with n==2 fall through to the legacy table-row append below.
+     * ==================================================================== */
+    if (tbl->type != RAY_TABLE) {
+        if (already_eval) { ray_release(tbl); return ray_error("type", NULL); }
+        if (tbl->attrs & RAY_ATTR_ARENA) { ray_release(tbl); return ray_error("type", NULL); }
+
+        /* Slice → materialise so cow can mutate. Lists never slice. */
+        if (tbl->attrs & RAY_ATTR_SLICE) {
+            if (tbl->type == RAY_LIST) { ray_release(tbl); return ray_error("type", NULL); }
+            ray_t* empty = ray_vec_new(tbl->type, 0);
+            if (!empty || RAY_IS_ERR(empty)) {
+                ray_release(tbl);
+                return empty ? empty : ray_error("oom", NULL);
+            }
+            ray_t* mat = ray_vec_concat(tbl, empty);
+            ray_release(empty);
+            ray_release(tbl);
+            if (!mat || RAY_IS_ERR(mat)) return mat ? mat : ray_error("oom", NULL);
+            tbl = mat;
+        }
+
+        bool is_target_list = (tbl->type == RAY_LIST);
+        bool is_target_vec  = ray_is_vec(tbl);
+        if (!is_target_list && !is_target_vec) {
+            ray_release(tbl);
+            return ray_error("type", NULL);
+        }
+        if (n != 2 && n != 3) {
+            ray_release(tbl);
+            return ray_error("domain", NULL);
+        }
+
+        ray_t* result = NULL;
+        int8_t tt = tbl->type;
+
+        if (n == 2) {
+            /* APPEND */
+            ray_t* val = ray_eval(args[1]);
+            if (!val || RAY_IS_ERR(val)) {
+                ray_release(tbl);
+                return val ? val : ray_error("type", NULL);
+            }
+            if (is_target_list) {
+                /* Always one slot — never splice on append. */
+                tbl = ray_list_append(tbl, val);
+                result = tbl;
+            } else if (val->type == -tt) {
+                /* Atom of matching type → element append */
+                int64_t new_idx = tbl->len;
+                if (tt == RAY_STR) {
+                    tbl = ray_str_vec_append(tbl, ray_str_ptr(val), ray_str_len(val));
+                } else if (tt == RAY_SYM) {
+                    int64_t s = val->i64;
+                    tbl = ray_vec_append(tbl, &s);
+                } else if (tt == RAY_GUID) {
+                    /* GUID atom's 16-byte payload lives in val->obj; typed-null
+                     * atoms have obj==NULL — write zeros and let the post-call
+                     * RAY_ATOM_IS_NULL check mark the slot. */
+                    static const uint8_t zero_guid[16] = {0};
+                    const void* src = val->obj ? ray_data(val->obj) : zero_guid;
+                    tbl = ray_vec_append(tbl, src);
+                } else {
+                    tbl = ray_vec_append(tbl, &val->u8);
+                }
+                if (tbl && !RAY_IS_ERR(tbl) && RAY_ATOM_IS_NULL(val))
+                    ray_vec_set_null(tbl, new_idx, true);
+                result = tbl;
+            } else if (val->type == tt) {
+                /* Same-type vec → splice at end */
+                result = ray_vec_concat(tbl, val);
+                ray_release(tbl);
+            } else {
+                ray_release(tbl);
+                ray_release(val);
+                return ray_error("type", NULL);
+            }
+            ray_release(val);
+        } else {
+            /* n == 3 — POSITIONAL */
+            ray_t* idx_arg = ray_eval(args[1]);
+            if (!idx_arg || RAY_IS_ERR(idx_arg)) {
+                ray_release(tbl);
+                return idx_arg ? idx_arg : ray_error("type", NULL);
+            }
+            ray_t* val = ray_eval(args[2]);
+            if (!val || RAY_IS_ERR(val)) {
+                ray_release(tbl);
+                ray_release(idx_arg);
+                return val ? val : ray_error("type", NULL);
+            }
+
+            if (is_target_list) {
+                if (idx_arg->type == -RAY_I64) {
+                    tbl = ray_list_insert_at(tbl, idx_arg->i64, val);
+                    result = tbl;
+                } else if (idx_arg->type == RAY_I64) {
+                    if (val->type != RAY_LIST) {
+                        ray_release(tbl); ray_release(idx_arg); ray_release(val);
+                        return ray_error("type", NULL);
+                    }
+                    result = ray_list_insert_many(tbl, idx_arg, val);
+                    ray_release(tbl);
+                } else {
+                    ray_release(tbl); ray_release(idx_arg); ray_release(val);
+                    return ray_error("type", NULL);
+                }
+            } else {
+                /* vec target */
+                if (idx_arg->type == -RAY_I64) {
+                    int64_t i = idx_arg->i64;
+                    if (val->type == -tt) {
+                        if (tt == RAY_STR) {
+                            result = ray_str_vec_insert_at(tbl, i,
+                                        ray_str_ptr(val), ray_str_len(val));
+                            ray_release(tbl);
+                        } else if (tt == RAY_SYM) {
+                            int64_t s = val->i64;
+                            tbl = ray_vec_insert_at(tbl, i, &s);
+                            result = tbl;
+                        } else if (tt == RAY_GUID) {
+                            static const uint8_t zero_guid[16] = {0};
+                            const void* src = val->obj ? ray_data(val->obj) : zero_guid;
+                            tbl = ray_vec_insert_at(tbl, i, src);
+                            result = tbl;
+                        } else {
+                            tbl = ray_vec_insert_at(tbl, i, &val->u8);
+                            result = tbl;
+                        }
+                        if (result && !RAY_IS_ERR(result) && RAY_ATOM_IS_NULL(val))
+                            ray_vec_set_null(result, i, true);
+                    } else if (val->type == tt) {
+                        result = ray_vec_insert_vec_at(tbl, i, val);
+                        ray_release(tbl);
+                    } else {
+                        ray_release(tbl); ray_release(idx_arg); ray_release(val);
+                        return ray_error("type", NULL);
+                    }
+                } else if (idx_arg->type == RAY_I64) {
+                    if (tt == RAY_STR) {
+                        ray_release(tbl); ray_release(idx_arg); ray_release(val);
+                        return ray_error("type", NULL);
+                    }
+                    if (val->type != tt && val->type != -tt) {
+                        ray_release(tbl); ray_release(idx_arg); ray_release(val);
+                        return ray_error("type", NULL);
+                    }
+                    result = ray_vec_insert_many(tbl, idx_arg, val);
+                    ray_release(tbl);
+                } else {
+                    ray_release(tbl); ray_release(idx_arg); ray_release(val);
+                    return ray_error("type", NULL);
+                }
+            }
+            ray_release(idx_arg);
+            ray_release(val);
+        }
+
+        if (inplace_sym >= 0 && result && !RAY_IS_ERR(result)) {
+            ray_env_set(inplace_sym, result);
+            ray_retain(result);
+        }
+        return result;
+    }
+
+    /* Table target: arity-3 positional row insert is not implemented. */
+    if (n != 2) { ray_release(tbl); return ray_error("nyi", NULL); }
+
+    /* Evaluate the row argument (skip if already evaluated) */
+    ray_t* row = already_eval ? (ray_retain(args[1]), args[1]) : ray_eval(args[1]);
+    if (!row || RAY_IS_ERR(row)) { ray_release(tbl); return row ? row : ray_error("type", NULL); }
+    if (tbl->type != RAY_TABLE) { ray_release(tbl); ray_release(row); return ray_error("type", NULL); }
+
+    int64_t ncols = ray_table_ncols(tbl);
+    ray_t* row_orig = row; /* keep original eval result for cleanup */
+
+    if (!is_list(row) && row->type != RAY_TABLE && row->type != RAY_DICT) { ray_release(tbl); ray_release(row); return ray_error("type", NULL); }
+
+    /* Table row: convert to list of column vectors */
+    ray_t* tbl_row_list = NULL;
+    if (row->type == RAY_TABLE) {
+        int64_t src_ncols = ray_table_ncols(row);
+        if (src_ncols != ncols) { ray_release(tbl); ray_release(row); return ray_error("domain", NULL); }
+        tbl_row_list = ray_alloc(ncols * sizeof(ray_t*));
+        if (!tbl_row_list) { ray_release(tbl); ray_release(row_orig); return ray_error("oom", NULL); }
+        tbl_row_list->type = RAY_LIST;
+        tbl_row_list->len = ncols;
+        ray_t** trl = (ray_t**)ray_data(tbl_row_list);
+        for (int64_t c = 0; c < ncols; c++) {
+            int64_t col_name = ray_table_col_name(tbl, c);
+            ray_t* src_col = ray_table_get_col(row, col_name);
+            if (!src_col) src_col = ray_table_get_col_idx(row, c);
+            if (!src_col) {
+                tbl_row_list->len = 0;
+                ray_free(tbl_row_list);
+                ray_release(tbl); ray_release(row_orig);
+                return ray_error("domain", NULL);
+            }
+            trl[c] = src_col;
+            ray_retain(src_col);
+        }
+        row = tbl_row_list;
+    }
+
+    /* Dict row: extract values in table column order */
+    ray_t* dict_vals = NULL;
+    if (row->type == RAY_DICT) {
+        ray_t* dkeys = ray_dict_keys(row);
+        ray_t* dvals = ray_dict_vals(row);
+        if (!dkeys || dkeys->type != RAY_SYM || !dvals) {
+            ray_release(tbl); ray_release(row_orig);
+            return ray_error("type", NULL);
+        }
+        int64_t dict_len = dkeys->len;
+
+        dict_vals = ray_alloc(ncols * sizeof(ray_t*));
+        if (!dict_vals) { ray_release(tbl); ray_release(row_orig); return ray_error("oom", NULL); }
+        dict_vals->type = RAY_LIST;
+        dict_vals->len = ncols;
+        ray_t** dv = (ray_t**)ray_data(dict_vals);
+
+        for (int64_t c = 0; c < ncols; c++) {
+            int64_t col_name = ray_table_col_name(tbl, c);
+            dv[c] = NULL;
+            for (int64_t d = 0; d < dict_len; d++) {
+                int64_t dk = ray_read_sym(ray_data(dkeys), d, RAY_SYM, dkeys->attrs);
+                if (dk != col_name) continue;
+                if (dvals->type == RAY_LIST) {
+                    dv[c] = ((ray_t**)ray_data(dvals))[d];
+                    if (dv[c]) ray_retain(dv[c]);
+                } else {
+                    int alloc = 0;
+                    dv[c] = collection_elem(dvals, d, &alloc);
+                    if (!alloc && dv[c]) ray_retain(dv[c]);
+                }
+                break;
+            }
+        }
+        /* Verify all dict keys exist as table columns */
+        for (int64_t d = 0; d < dict_len; d++) {
+            int64_t dk = ray_read_sym(ray_data(dkeys), d, RAY_SYM, dkeys->attrs);
+            int found_in_tbl = 0;
+            for (int64_t c = 0; c < ncols; c++) {
+                if (ray_table_col_name(tbl, c) == dk) { found_in_tbl = 1; break; }
+            }
+            if (!found_in_tbl) {
+                for (int64_t c = 0; c < ncols; c++) if (dv[c]) ray_release(dv[c]);
+                dict_vals->len = 0;
+                ray_free(dict_vals);
+                ray_release(tbl); ray_release(row_orig);
+                return ray_error("value", NULL);
+            }
+        }
+        row = dict_vals;
+    }
+
+    if (ray_len(row) != ncols) {
+        if (dict_vals) {
+            for (int64_t c = 0; c < ncols; c++) ray_release(((ray_t**)ray_data(dict_vals))[c]);
+            dict_vals->len = 0;
+            ray_free(dict_vals);
+        }
+        ray_release(tbl); ray_release(row_orig);
+        return ray_error("domain", NULL);
+    }
+
+    ray_t** row_elems = (ray_t**)ray_data(row);
+    int64_t nrows = ray_table_nrows(tbl);
+
+    ray_t* result = ray_table_new(ncols);
+    if (RAY_IS_ERR(result)) return result;
+
+    for (int64_t c = 0; c < ncols; c++) {
+        int64_t col_name = ray_table_col_name(tbl, c);
+        ray_t* orig_col = ray_table_get_col_idx(tbl, c);
+        int8_t ct = orig_col->type;
+
+        ray_t* new_col = ray_vec_new(ct, nrows + 1);
+        if (RAY_IS_ERR(new_col)) { ray_release(result); return new_col; }
+
+        /* Copy existing data */
+        bool src_has_nulls = (orig_col->attrs & RAY_ATTR_HAS_NULLS) != 0;
+        if (ct == RAY_STR) {
+            for (int64_t r = 0; r < nrows; r++) {
+                if (src_has_nulls && ray_vec_is_null(orig_col, r)) {
+                    new_col = ray_str_vec_append(new_col, "", 0);
+                    if (!RAY_IS_ERR(new_col))
+                        ray_vec_set_null(new_col, new_col->len - 1, true);
+                } else {
+                    size_t slen = 0;
+                    const char* sp = ray_str_vec_get(orig_col, r, &slen);
+                    new_col = ray_str_vec_append(new_col, sp ? sp : "", sp ? slen : 0);
+                }
+                if (RAY_IS_ERR(new_col)) { ray_release(result); return new_col; }
+            }
+        } else if (ct == RAY_SYM) {
+            for (int64_t r = 0; r < nrows; r++) {
+                int64_t sym_val = ray_read_sym(ray_data(orig_col), r, orig_col->type, orig_col->attrs);
+                new_col = ray_vec_append(new_col, &sym_val);
+                if (RAY_IS_ERR(new_col)) { ray_release(result); return new_col; }
+                if (src_has_nulls && ray_vec_is_null(orig_col, r))
+                    ray_vec_set_null(new_col, new_col->len - 1, true);
+            }
+        } else {
+            size_t elem_sz = (ct == RAY_BOOL) ? 1 : 8;
+            uint8_t* src = (uint8_t*)ray_data(orig_col);
+            for (int64_t r = 0; r < nrows; r++) {
+                new_col = ray_vec_append(new_col, src + r * elem_sz);
+                if (RAY_IS_ERR(new_col)) { ray_release(result); return new_col; }
+                if (src_has_nulls && ray_vec_is_null(orig_col, r))
+                    ray_vec_set_null(new_col, new_col->len - 1, true);
+            }
+        }
+
+        /* Append new row value(s) — atom for single row, vector for multi-row */
+        if (!row_elems[c]) {
+            /* NULL = null value for this column type */
+            ray_t* null_atom = ray_typed_null(-ct);
+            new_col = append_atom_to_col(new_col, null_atom);
+            ray_release(null_atom);
+        } else if (ray_is_atom(row_elems[c])) {
+            new_col = append_atom_to_col(new_col, row_elems[c]);
+        } else if (ray_is_vec(row_elems[c]) || row_elems[c]->type == RAY_LIST) {
+            ray_t* merged = ray_concat_fn(new_col, row_elems[c]);
+            ray_release(new_col);
+            new_col = merged;
+        } else {
+            new_col = append_atom_to_col(new_col, row_elems[c]);
+        }
+        if (RAY_IS_ERR(new_col)) { ray_release(result); return new_col; }
+
+        result = ray_table_add_col(result, col_name, new_col);
+        ray_release(new_col);
+        if (RAY_IS_ERR(result)) return result;
+    }
+
+    /* Cleanup dict_vals, tbl_row_list, and original row */
+    if (dict_vals) {
+        ray_t** dv = (ray_t**)ray_data(dict_vals);
+        for (int64_t c = 0; c < ncols; c++) if (dv[c]) ray_release(dv[c]);
+        dict_vals->len = 0; /* prevent ray_free from double-releasing children */
+        ray_free(dict_vals);
+    }
+    if (tbl_row_list) {
+        ray_t** trl = (ray_t**)ray_data(tbl_row_list);
+        for (int64_t c = 0; c < ncols; c++) if (trl[c]) ray_release(trl[c]);
+        tbl_row_list->len = 0;
+        ray_free(tbl_row_list);
+    }
+    ray_release(tbl);
+    ray_release(row_orig);
+
+    /* In-place: update the variable in the env */
+    if (inplace_sym >= 0 && !RAY_IS_ERR(result)) {
+        ray_env_set(inplace_sym, result);
+        ray_retain(result);
+        return result;
+    }
+    return result;
+}
+
+/* (upsert table key row) — update the row whose key matches, else insert it.
+ * Special form: the first arg may be 'sym for in-place (the result is stored
+ * back into that variable); the other args are evaluated.  Also re-entered
+ * recursively with pre-evaluated args, detected by args[0] being a table.
+ *   key: column-name symbol, or integer N = "first N columns are keys" (1..16).
+ *   row: list with one element per column (vector elements are upserted one
+ *        row at a time), dict naming every column exactly once, or table
+ *        (partial, name-keyed payload; key columns must be present).
+ * Returns the resulting table, or an error object: "domain", "type",
+ * "value" (schema mismatch) or "oom". */
+ray_t* ray_upsert_fn(ray_t** args, int64_t n) {
+    if (n < 3) return ray_error("domain", NULL);
+
+    /* Detect calling convention: already-evaluated args (from recursive call) vs raw parse tree */
+    int64_t inplace_sym = -1;
+    ray_t* tbl_raw = args[0];
+    int already_eval = (tbl_raw && tbl_raw->type == RAY_TABLE);
+    ray_t* tbl;
+
+    if (!already_eval && tbl_raw && tbl_raw->type == -RAY_SYM && !(tbl_raw->attrs & RAY_ATTR_NAME)) {
+        inplace_sym = tbl_raw->i64;
+        tbl = ray_env_get(inplace_sym);
+        if (!tbl || RAY_IS_ERR(tbl)) return ray_error("domain", NULL);
+        ray_retain(tbl);
+    } else if (already_eval) {
+        tbl = tbl_raw;
+        ray_retain(tbl);
+    } else {
+        tbl = ray_eval(tbl_raw);
+        if (!tbl || RAY_IS_ERR(tbl)) return tbl ? tbl : ray_error("type", NULL);
+    }
+
+    ray_t* key_sym = already_eval ? (ray_retain(args[1]), args[1]) : ray_eval(args[1]);
+    if (!key_sym || RAY_IS_ERR(key_sym)) { ray_release(tbl); return key_sym ? key_sym : ray_error("type", NULL); }
+
+    ray_t* row = already_eval ? (ray_retain(args[2]), args[2]) : ray_eval(args[2]);
+    if (!row || RAY_IS_ERR(row)) { ray_release(tbl); ray_release(key_sym); return row ? row : ray_error("type", NULL); }
+
+    if (tbl->type != RAY_TABLE) { ray_release(tbl); ray_release(key_sym); ray_release(row); return ray_error("type", NULL); }
+    if (!is_list(row) && row->type != RAY_TABLE && row->type != RAY_DICT) { ray_release(tbl); ray_release(key_sym); ray_release(row); return ray_error("type", NULL); }
+
+    int64_t ncols = ray_table_ncols(tbl);
+
+    /* Table row: iterate row-by-row for proper upsert semantics */
+    if (row->type == RAY_TABLE) {
+        int64_t src_nrows = ray_table_nrows(row);
+        int64_t src_ncols = ray_table_ncols(row);
+
+        /* Zero-row payload → upsert is a no-op regardless of payload
+         * schema.  Skip all schema-strictness here: rejecting an empty
+         * partial payload (e.g. for missing key columns) regresses the
+         * pre-existing "empty input = do nothing" behavior.  No data
+         * flows, so neither silent-drop nor null-key crashes are
+         * possible below. */
+        if (src_nrows == 0) {
+            ray_release(key_sym); ray_release(row);
+            return tbl;
+        }
+
+        /* Schema-strictness (table payload is a PARTIAL view — columns
+         * in target but not in source are intentionally null-filled).
+         * We only need to reject:
+         *   (a) a source column whose name isn't in the target (extra
+         *       → silent drop of user data);
+         *   (b) a source column name that appears more than once in the
+         *       source (ambiguous);
+         *   (c) a source column name whose target column appears more
+         *       than once in `tbl` (name-keyed gather can't tell which
+         *       target slot the value belongs to → silent duplication).
+         * Duplicate target columns whose names don't appear in `row`
+         * are harmless — they get null-filled like any other missing
+         * column. */
+        for (int64_t sc = 0; sc < src_ncols; sc++) {
+            int64_t scn = ray_table_col_name(row, sc);
+            int64_t tbl_matches = 0, src_matches = 0;
+            for (int64_t i = 0; i < ncols;     i++) if (ray_table_col_name(tbl, i) == scn) tbl_matches++;
+            for (int64_t i = 0; i < src_ncols; i++) if (ray_table_col_name(row, i) == scn) src_matches++;
+            if (tbl_matches != 1 || src_matches != 1) {
+                ray_release(tbl); ray_release(key_sym); ray_release(row);
+                return ray_error("value", NULL);
+            }
+        }
+
+        /* Partial updates may null-fill ordinary columns, but the key
+         * column(s) MUST be present — otherwise the recursive upsert
+         * reads a NULL from row_elems[key_col] and segfaults.  Resolve
+         * key names from key_sym and require each to appear in row. */
+        int64_t key_names[16];
+        int64_t n_key = 0;
+        if (key_sym->type == -RAY_SYM) {
+            key_names[n_key++] = key_sym->i64;
+        } else if (key_sym->type == -RAY_I64) {
+            int64_t k = key_sym->i64;
+            if (k <= 0 || k > ncols || k > 16) {
+                ray_release(tbl); ray_release(key_sym); ray_release(row);
+                return ray_error("domain", NULL);
+            }
+            for (int64_t i = 0; i < k; i++)
+                key_names[n_key++] = ray_table_col_name(tbl, i);
+        } else {
+            ray_release(tbl); ray_release(key_sym); ray_release(row);
+            return ray_error("type", NULL);
+        }
+        for (int64_t k = 0; k < n_key; k++) {
+            int found = 0;
+            for (int64_t i = 0; i < src_ncols; i++)
+                if (ray_table_col_name(row, i) == key_names[k]) { found = 1; break; }
+            if (!found) {
+                ray_release(tbl); ray_release(key_sym); ray_release(row);
+                return ray_error("value", NULL);
+            }
+        }
+
+        /* Gather source columns in target order (now guaranteed 1-to-1).
+         * Only the first 64 are cached; wider tables look up by name per row. */
+        ray_t* src_cols[64];
+        for (int64_t c = 0; c < ncols && c < 64; c++)
+            src_cols[c] = ray_table_get_col(row, ray_table_col_name(tbl, c));
+        ray_t* cur_tbl = tbl;
+        ray_retain(cur_tbl);
+        for (int64_t r = 0; r < src_nrows; r++) {
+            ray_t* single = ray_alloc(ncols * sizeof(ray_t*));
+            if (!single) { ray_release(cur_tbl); ray_release(tbl); ray_release(key_sym); ray_release(row); return ray_error("oom", NULL); }
+            single->type = RAY_LIST;
+            single->len = ncols;
+            ray_t** sr = (ray_t**)ray_data(single);
+            for (int64_t c = 0; c < ncols; c++) {
+                int alloc = 0;
+                ray_t* src_col = (c < 64) ? src_cols[c] : ray_table_get_col(row, ray_table_col_name(tbl, c));
+                sr[c] = src_col ? collection_elem(src_col, r, &alloc) : NULL;
+                if (!alloc && sr[c]) ray_retain(sr[c]);
+            }
+            ray_t* upsert_args[3] = { cur_tbl, key_sym, single };
+            ray_t* new_tbl = ray_upsert_fn(upsert_args, 3);
+            for (int64_t c = 0; c < ncols; c++) if (sr[c]) ray_release(sr[c]);
+            single->len = 0;
+            ray_free(single);
+            ray_release(cur_tbl);
+            if (RAY_IS_ERR(new_tbl)) { ray_release(tbl); ray_release(key_sym); ray_release(row); return new_tbl; }
+            cur_tbl = new_tbl;
+        }
+        ray_release(tbl); ray_release(key_sym); ray_release(row);
+        if (inplace_sym >= 0 && !RAY_IS_ERR(cur_tbl)) {
+            ray_env_set(inplace_sym, cur_tbl);
+            ray_retain(cur_tbl);
+        }
+        return cur_tbl;
+    }
+
+    /* Dict row: extract values in column order to create a plain list */
+    ray_t* dict_row_list = NULL;
+    if (row->type == RAY_DICT) {
+        ray_t* dkeys = ray_dict_keys(row);
+        ray_t* dvals = ray_dict_vals(row);
+        if (!dkeys || dkeys->type != RAY_SYM || !dvals) {
+            ray_release(tbl); ray_release(key_sym); ray_release(row);
+            return ray_error("type", NULL);
+        }
+        int64_t n_pairs = dkeys->len;
+
+        /* Schema-strictness: every column name must appear exactly once
+         * on each side.  Mirrors the table-payload path. */
+        if (n_pairs != ncols) {
+            ray_release(tbl); ray_release(key_sym); ray_release(row);
+            return ray_error("value", NULL);
+        }
+        for (int64_t c = 0; c < ncols; c++) {
+            int64_t cn = ray_table_col_name(tbl, c);
+            int64_t tbl_matches = 0, dict_matches = 0;
+            for (int64_t i = 0; i < ncols; i++)
+                if (ray_table_col_name(tbl, i) == cn) tbl_matches++;
+            for (int64_t d = 0; d < n_pairs; d++) {
+                int64_t dk = ray_read_sym(ray_data(dkeys), d, RAY_SYM, dkeys->attrs);
+                if (dk == cn) dict_matches++;
+            }
+            if (tbl_matches != 1 || dict_matches != 1) {
+                ray_release(tbl); ray_release(key_sym); ray_release(row);
+                return ray_error("value", NULL);
+            }
+        }
+
+        dict_row_list = ray_alloc(ncols * sizeof(ray_t*));
+        if (!dict_row_list) { ray_release(tbl); ray_release(key_sym); ray_release(row); return ray_error("oom", NULL); }
+        dict_row_list->type = RAY_LIST;
+        dict_row_list->len = ncols;
+        ray_t** drl = (ray_t**)ray_data(dict_row_list);
+        for (int64_t c = 0; c < ncols; c++) {
+            int64_t col_name = ray_table_col_name(tbl, c);
+            drl[c] = NULL;
+            for (int64_t d = 0; d < n_pairs; d++) {
+                int64_t dk = ray_read_sym(ray_data(dkeys), d, RAY_SYM, dkeys->attrs);
+                if (dk != col_name) continue;
+                if (dvals->type == RAY_LIST) {
+                    drl[c] = ((ray_t**)ray_data(dvals))[d];
+                    if (drl[c]) ray_retain(drl[c]);
+                } else {
+                    int alloc = 0;
+                    drl[c] = collection_elem(dvals, d, &alloc);
+                    if (!alloc && drl[c]) ray_retain(drl[c]);
+                }
+                break;
+            }
+        }
+        ray_release(row);
+        row = dict_row_list;
+    }
+
+    if (ray_len(row) != ncols) { ray_release(tbl); ray_release(key_sym); ray_release(row); return ray_error("domain", NULL); }
+
+    ray_t** row_elems = (ray_t**)ray_data(row);
+    int64_t nrows = ray_table_nrows(tbl);
+
+    /* Determine key columns — integer N means "first N columns are keys" */
+    int64_t n_key_cols = 1;
+    int64_t key_col_indices[16];
+    if (key_sym->type == -RAY_SYM) {
+        key_col_indices[0] = -1;
+        for (int64_t c = 0; c < ncols; c++) {
+            if (ray_table_col_name(tbl, c) == key_sym->i64) {
+                key_col_indices[0] = c;
+                break;
+            }
+        }
+        if (key_col_indices[0] < 0) { ray_release(tbl); ray_release(key_sym); ray_release(row); return ray_error("domain", NULL); }
+    } else if (key_sym->type == -RAY_I64) {
+        n_key_cols = key_sym->i64;
+        if (n_key_cols <= 0 || n_key_cols > ncols || n_key_cols > 16) { ray_release(tbl); ray_release(key_sym); ray_release(row); return ray_error("domain", NULL); }
+        for (int64_t k = 0; k < n_key_cols; k++) key_col_indices[k] = k;
+    } else {
+        ray_release(tbl); ray_release(key_sym); ray_release(row);
+        return ray_error("type", NULL);
+    }
+
+    /* Multi-row upsert: if row values are vectors, iterate row-by-row */
+    ray_t* key_elem = row_elems[key_col_indices[0]]; /* may be NULL; rejected by the type check below */
+    if (key_elem && (ray_is_vec(key_elem) || key_elem->type == RAY_LIST)) {
+        int64_t new_nrows = ray_len(key_elem);
+        ray_t* cur_tbl = tbl;
+        ray_retain(cur_tbl);
+        for (int64_t r = 0; r < new_nrows; r++) {
+            /* Build single-row list from multi-row columns */
+            ray_t* single_row = ray_alloc(ncols * sizeof(ray_t*));
+            if (!single_row) { ray_release(cur_tbl); ray_release(tbl); ray_release(key_sym); ray_release(row); return ray_error("oom", NULL); }
+            single_row->type = RAY_LIST;
+            single_row->len = ncols;
+            ray_t** sr = (ray_t**)ray_data(single_row);
+            for (int64_t c = 0; c < ncols; c++) {
+                int alloc = 0;
+                sr[c] = row_elems[c] ? collection_elem(row_elems[c], r, &alloc) : NULL; /* NULL = missing column */
+                if (!alloc && sr[c]) ray_retain(sr[c]);
+            }
+            /* Upsert single row into current table */
+            ray_t* upsert_args[3] = { cur_tbl, key_sym, single_row };
+            ray_t* new_tbl = ray_upsert_fn(upsert_args, 3);
+            /* Clean up single_row */
+            for (int64_t c = 0; c < ncols; c++) if (sr[c]) ray_release(sr[c]);
+            single_row->len = 0;
+            ray_free(single_row);
+            ray_release(cur_tbl);
+            if (RAY_IS_ERR(new_tbl)) { ray_release(tbl); ray_release(key_sym); ray_release(row); return new_tbl; }
+            cur_tbl = new_tbl;
+        }
+        ray_release(tbl); ray_release(key_sym); ray_release(row);
+        if (inplace_sym >= 0 && !RAY_IS_ERR(cur_tbl)) {
+            ray_env_set(inplace_sym, cur_tbl);
+            ray_retain(cur_tbl);
+        }
+        return cur_tbl;
+    }
+
+    /* Type-check key columns before searching; a NULL key element is rejected here too */
+    for (int64_t k = 0; k < n_key_cols; k++) {
+        int64_t kci = key_col_indices[k];
+        ray_t* key_col = ray_table_get_col_idx(tbl, kci);
+        ray_t* key_atom = row_elems[kci];
+        int8_t kt = key_col->type;
+        if (!key_atom || (kt == RAY_STR && key_atom->type != -RAY_STR)) {
+            ray_release(tbl); ray_release(key_sym); ray_release(row);
+            return ray_error("type", NULL);
+        }
+        if (kt == RAY_SYM && key_atom->type != -RAY_SYM) {
+            ray_release(tbl); ray_release(key_sym); ray_release(row);
+            return ray_error("type", NULL);
+        }
+    }
+
+    /* Find the row to update by composite key match */
+    int64_t match_row = -1;
+    for (int64_t r = 0; r < nrows; r++) {
+        int match = 1;
+        for (int64_t k = 0; k < n_key_cols && match; k++) {
+            int64_t kci = key_col_indices[k];
+            ray_t* key_col = ray_table_get_col_idx(tbl, kci);
+            ray_t* key_atom = row_elems[kci];
+            int8_t kt = key_col->type;
+            if (kt == RAY_F64) {
+                double needle = (key_atom->type == -RAY_F64) ? key_atom->f64 : (double)key_atom->i64;
+                if (((double*)ray_data(key_col))[r] != needle) match = 0;
+            } else if (kt == RAY_SYM) {
+                if (ray_read_sym(ray_data(key_col), r, key_col->type, key_col->attrs) != key_atom->i64) match = 0;
+            } else if (kt == RAY_STR) {
+                const char* ns = ray_str_ptr(key_atom);
+                size_t nl = ray_str_len(key_atom);
+                size_t rl = 0;
+                const char* rs = ray_str_vec_get(key_col, r, &rl);
+                if (rl != nl || (nl > 0 && (!rs || !ns || memcmp(rs, ns, nl) != 0))) match = 0;
+            } else {
+                int64_t needle = elem_as_i64(key_atom);
+                int64_t existing = (kt == RAY_I64 || kt == RAY_TIMESTAMP) ?
+                    ((int64_t*)ray_data(key_col))[r] :
+                    (kt == RAY_I32 || kt == RAY_DATE || kt == RAY_TIME) ?
+                    (int64_t)((int32_t*)ray_data(key_col))[r] :
+                    (kt == RAY_BOOL) ? (int64_t)((uint8_t*)ray_data(key_col))[r] :
+                    ((int64_t*)ray_data(key_col))[r];
+                if (existing != needle) match = 0;
+            }
+        }
+        if (match) { match_row = r; break; }
+    }
+
+    if (match_row < 0) {
+        /* Key not found — insert: pass pre-evaluated args */
+        ray_t* insert_args[2] = { tbl, row };
+        ray_t* result = ray_insert_fn(insert_args, 2);
+        ray_release(tbl); ray_release(key_sym); ray_release(row);
+        if (inplace_sym >= 0 && !RAY_IS_ERR(result)) {
+            ray_env_set(inplace_sym, result);
+            ray_retain(result);
+        }
+        return result;
+    }
+
+    /* Key found — update that row */
+    ray_t* result = ray_table_new(ncols);
+    if (RAY_IS_ERR(result)) { ray_release(tbl); ray_release(key_sym); ray_release(row); return result; }
+
+    for (int64_t c = 0; c < ncols; c++) {
+        int64_t col_name = ray_table_col_name(tbl, c);
+        ray_t* orig_col = ray_table_get_col_idx(tbl, c);
+        int8_t ct = orig_col->type;
+
+        ray_t* new_col = ray_vec_new(ct, nrows);
+        if (RAY_IS_ERR(new_col)) { ray_release(result); ray_release(tbl); ray_release(key_sym); ray_release(row); return new_col; }
+
+        /* If row_elems[c] is NULL (missing column), keep original values */
+        int has_new_val = (row_elems[c] != NULL);
+
+        if (ct == RAY_STR) {
+            for (int64_t r = 0; r < nrows; r++) {
+                if (r == match_row && has_new_val) {
+                    new_col = append_atom_to_col(new_col, row_elems[c]);
+                } else {
+                    size_t slen = 0;
+                    const char* sp = ray_str_vec_get(orig_col, r, &slen);
+                    new_col = ray_str_vec_append(new_col, sp ? sp : "", sp ? slen : 0);
+                }
+                if (RAY_IS_ERR(new_col)) { ray_release(result); ray_release(tbl); ray_release(key_sym); ray_release(row); return new_col; }
+            }
+        } else if (ct == RAY_SYM) {
+            for (int64_t r = 0; r < nrows; r++) {
+                if (r == match_row && has_new_val) {
+                    new_col = append_atom_to_col(new_col, row_elems[c]);
+                } else {
+                    int64_t sym_val = ray_read_sym(ray_data(orig_col), r, orig_col->type, orig_col->attrs);
+                    new_col = ray_vec_append(new_col, &sym_val);
+                }
+                if (RAY_IS_ERR(new_col)) { ray_release(result); ray_release(tbl); ray_release(key_sym); ray_release(row); return new_col; }
+            }
+        } else {
+            size_t elem_sz = (ct == RAY_BOOL) ? 1 : (ct == RAY_I32 || ct == RAY_DATE || ct == RAY_TIME) ? 4 : 8; /* stride must match the widths the key search reads */
+            uint8_t* src = (uint8_t*)ray_data(orig_col);
+            for (int64_t r = 0; r < nrows; r++) {
+                if (r == match_row && has_new_val) {
+                    new_col = append_atom_to_col(new_col, row_elems[c]);
+                } else {
+                    new_col = ray_vec_append(new_col, src + r * elem_sz);
+                }
+                if (RAY_IS_ERR(new_col)) { ray_release(result); ray_release(tbl); ray_release(key_sym); ray_release(row); return new_col; }
+            }
+        }
+
+        result = ray_table_add_col(result, col_name, new_col);
+        ray_release(new_col);
+        if (RAY_IS_ERR(result)) { ray_release(tbl); ray_release(key_sym); ray_release(row); return result; }
+    }
+
+    ray_release(tbl); ray_release(key_sym); ray_release(row);
+
+    if (inplace_sym >= 0 && !RAY_IS_ERR(result)) {
+        ray_env_set(inplace_sym, result);
+        ray_retain(result);
+    }
+    return result;
+}
+
+/* ══════════════════════════════════════════
+ * Join operations
+ * ══════════════════════════════════════════ */
+
+/* Common driver behind left-join (join_type=1) and inner-join (join_type=0).
+ * Accepted forms: (left-join t1 t2 [key ...]) / (inner-join t1 t2 [key ...]) */
+static ray_t* join_impl(ray_t** args, int64_t n, uint8_t join_type) {
+    if (n < 3) return ray_error("domain", NULL);
+
+    ray_t* lhs      = args[0];
+    ray_t* rhs      = args[1];
+    ray_t* key_list = args[2];
+
+    /* Keys-first spelling is accepted as well: (join [keys] t1 t2) */
+    if (lhs->type != RAY_TABLE && rhs->type == RAY_TABLE && key_list->type == RAY_TABLE) {
+        key_list = args[0];
+        lhs      = args[1];
+        rhs      = args[2];
+    }
+
+    if (!(lhs->type == RAY_TABLE && rhs->type == RAY_TABLE))
+        return ray_error("type", NULL);
+    ray_t* boxed = NULL;
+    key_list = unbox_vec_arg(key_list, &boxed);
+    if (RAY_IS_ERR(key_list)) return key_list;
+    if (!is_list(key_list))
+        { if (boxed) ray_release(boxed); return ray_error("type", NULL); }
+
+    int64_t key_count = ray_len(key_list);
+    if (key_count == 0 || key_count > 16) { if (boxed) ray_release(boxed); return ray_error("domain", NULL); }
+    ray_t** key_syms = (ray_t**)ray_data(key_list);
+
+    ray_graph_t* graph = ray_graph_new(lhs);
+    if (!graph) { if (boxed) ray_release(boxed); return ray_error("oom", NULL); }
+
+    ray_op_t* lhs_op = ray_const_table(graph, lhs);
+    ray_op_t* rhs_op = ray_const_table(graph, rhs);
+
+    /* Every key must be a symbol atom; each yields one scan op per side. */
+    ray_op_t* lhs_keys[16], *rhs_keys[16];
+    for (int64_t k = 0; k < key_count; k++) {
+        ray_t* key = key_syms[k];
+        if (key->type != -RAY_SYM) { ray_graph_free(graph); if (boxed) ray_release(boxed); return ray_error("type", NULL); }
+        ray_t* col_name = ray_sym_str(key->i64);
+        if (!col_name) { ray_graph_free(graph); if (boxed) ray_release(boxed); return ray_error("domain", NULL); }
+        lhs_keys[k] = ray_scan(graph, ray_str_ptr(col_name));
+        rhs_keys[k] = ray_scan(graph, ray_str_ptr(col_name));
+        if (!(lhs_keys[k] && rhs_keys[k])) { ray_graph_free(graph); if (boxed) ray_release(boxed); return ray_error("domain", NULL); }
+    }
+
+    if (boxed) ray_release(boxed);
+
+    /* Build the join op, optimize it, execute, then drop the graph. */
+    ray_op_t* plan = ray_join(graph, lhs_op, lhs_keys, rhs_op, rhs_keys,
+                             (uint8_t)key_count, join_type);
+    if (!plan) { ray_graph_free(graph); return ray_error("oom", NULL); }
+
+    plan = ray_optimize(graph, plan);
+    ray_t* joined = ray_execute(graph, plan);
+    ray_graph_free(graph);
+    return joined;
+}
+
+ray_t* ray_left_join_fn(ray_t** args, int64_t n)  { return join_impl(args, n, 1); } /* left-join: forwards to join_impl with join_type=1 */
+ray_t* ray_inner_join_fn(ray_t** args, int64_t n) { return join_impl(args, n, 0); } /* inner-join: forwards to join_impl with join_type=0 */
+
+/* (antijoin left right [keys])
+ * Anti-semi-join: rows of left survive only when right has NO match on keys. */
+static ray_t* antijoin_impl(ray_t** args, int64_t n) {
+    if (n < 3) return ray_error("domain", NULL);
+
+    ray_t* lhs      = args[0];
+    ray_t* rhs      = args[1];
+    ray_t* key_list = args[2];
+
+    /* Keys-first spelling is accepted as well: (antijoin [keys] t1 t2) */
+    if (lhs->type != RAY_TABLE && rhs->type == RAY_TABLE && key_list->type == RAY_TABLE) {
+        key_list = args[0];
+        lhs      = args[1];
+        rhs      = args[2];
+    }
+
+    if (!(lhs->type == RAY_TABLE && rhs->type == RAY_TABLE))
+        return ray_error("type", NULL);
+    ray_t* boxed = NULL;
+    key_list = unbox_vec_arg(key_list, &boxed);
+    if (RAY_IS_ERR(key_list)) return key_list;
+    if (!is_list(key_list))
+        { if (boxed) ray_release(boxed); return ray_error("type", NULL); }
+
+    int64_t key_count = ray_len(key_list);
+    if (key_count == 0 || key_count > 16) { if (boxed) ray_release(boxed); return ray_error("domain", NULL); }
+    ray_t** key_syms = (ray_t**)ray_data(key_list);
+
+    ray_graph_t* graph = ray_graph_new(lhs);
+    if (!graph) { if (boxed) ray_release(boxed); return ray_error("oom", NULL); }
+
+    ray_op_t* lhs_op = ray_const_table(graph, lhs);
+    ray_op_t* rhs_op = ray_const_table(graph, rhs);
+
+    /* Every key must be a symbol atom; each yields one scan op per side. */
+    ray_op_t* lhs_keys[16], *rhs_keys[16];
+    for (int64_t k = 0; k < key_count; k++) {
+        ray_t* key = key_syms[k];
+        if (key->type != -RAY_SYM) { ray_graph_free(graph); if (boxed) ray_release(boxed); return ray_error("type", NULL); }
+        ray_t* col_name = ray_sym_str(key->i64);
+        if (!col_name) { ray_graph_free(graph); if (boxed) ray_release(boxed); return ray_error("domain", NULL); }
+        lhs_keys[k] = ray_scan(graph, ray_str_ptr(col_name));
+        rhs_keys[k] = ray_scan(graph, ray_str_ptr(col_name));
+        if (!(lhs_keys[k] && rhs_keys[k])) { ray_graph_free(graph); if (boxed) ray_release(boxed); return ray_error("domain", NULL); }
+    }
+
+    if (boxed) ray_release(boxed);
+
+    /* Build the antijoin op, optimize it, execute, then drop the graph. */
+    ray_op_t* plan = ray_antijoin(graph, lhs_op, lhs_keys, rhs_op, rhs_keys, (uint8_t)key_count);
+    if (!plan) { ray_graph_free(graph); return ray_error("oom", NULL); }
+
+    plan = ray_optimize(graph, plan);
+    ray_t* kept = ray_execute(graph, plan);
+    ray_graph_free(graph);
+    return kept;
+}
+
+ray_t* ray_anti_join_fn(ray_t** args, int64_t n) { return antijoin_impl(args, n); }  /* non-static wrapper: forwards args and n unchanged to antijoin_impl */
+
+/* ------------------------------------------------------------------------ */
+/* window-join parallel worker                                              */
+/* ------------------------------------------------------------------------ */
+
+#define WJ_MAX_AGG 16  /* capacity of the per-aggregation and per-equality-key arrays used by the window-join worker */
+/* Accumulator for one aggregation over one left row's window; wj_scan_fn clears its array of these per left row. */
+typedef struct {
+    int64_t cnt;        /* values folded in (non-null only when a null mask is present); for OP_COUNT: window length */
+    int64_t sum_i;      /* integer running sum (SUM, AVG, VAR*, STDDEV*) */
+    double  sum_f;      /* floating-point running sum (same ops) */
+    int64_t sum_sq_i;   /* integer sum of squares (VAR*, STDDEV*) */
+    double  sum_sq_f;   /* floating-point sum of squares (VAR*, STDDEV*) */
+    int64_t extreme_i;  /* integer result of MIN / MAX / FIRST / LAST */
+    double  extreme_f;  /* floating-point result of MIN / MAX / FIRST / LAST */
+    int64_t prod_i;     /* integer running product; seeded with 1 for OP_PROD */
+    double  prod_f;     /* floating-point running product; seeded with 1.0 for OP_PROD */
+} wj_acc_t;
+
+typedef struct {
+    int64_t  left_nrows;   /* left row count (not read by wj_scan_fn itself) */
+    int64_t  right_nrows;  /* length of right_sort; upper limit of the binary searches */
+    int64_t  n_eq;         /* equality keys in use (<= WJ_MAX_AGG) */
+    int64_t  n_agg;        /* aggregations in use (<= WJ_MAX_AGG) */
+
+    /* Left-row metadata — pre-extracted to int64 so workers can read
+     * without touching any ray_t objects (no locking, no allocation). */
+    const int64_t*  lo_arr;                   /* [lr] window lower time bound, inclusive */
+    const int64_t*  hi_arr;                   /* [lr] window upper time bound, inclusive */
+    const int64_t*  left_eq_arr[WJ_MAX_AGG];  /* [e][lr] left value of equality key e */
+
+    /* Right-side sort order and time column (sorted rank -> original idx) */
+    const int64_t*  right_sort;               /* [rank] -> original right row index */
+    const int64_t*  rt_time_i;                /* [original right row index] -> time as int64 */
+
+    /* Right equality columns (raw), kept for binary-search compares */
+    const void*     eq_data[WJ_MAX_AGG];      /* column payload handed to read_col_i64 */
+    int8_t          eq_type[WJ_MAX_AGG];      /* column type handed to read_col_i64 */
+    uint8_t         eq_attrs[WJ_MAX_AGG];     /* column attrs handed to read_col_i64 */
+
+    /* Per-agg metadata and preloaded sorted source vectors */
+    uint8_t         agg_raw[WJ_MAX_AGG];      /* nonzero: bare-column placeholder, never scanned, result always null */
+    uint16_t        agg_ops[WJ_MAX_AGG];      /* OP_* opcode of the aggregation */
+    int8_t          agg_result_types[WJ_MAX_AGG];  /* RAY_* element type of result_data[a] */
+    int             agg_is_float[WJ_MAX_AGG]; /* nonzero: read sorted_f / float accumulators, else sorted_i */
+    const int64_t*  sorted_i[WJ_MAX_AGG];     /* [rank] integer source values, indexed like right_sort */
+    const double*   sorted_f[WJ_MAX_AGG];     /* [rank] floating-point source values, indexed like right_sort */
+    const uint8_t*  sorted_nn[WJ_MAX_AGG];    /* [rank] 1 = value present, 0 = null; NULL pointer = no null mask */
+
+    /* Per-agg result output — writers index by lr directly */
+    void*           result_data[WJ_MAX_AGG];  /* typed per agg_result_types[a] */
+    uint8_t*        result_null[WJ_MAX_AGG];  /* 1 byte per row: 1 = null */
+} wj_scan_ctx_t;
+
+static void wj_scan_fn(void* ctx_, uint32_t worker_id, int64_t start, int64_t end) {  /* window-join worker: handles left rows [start, end) */
+    (void)worker_id;  /* unused: every result write below is indexed by the left row */
+    wj_scan_ctx_t* c = (wj_scan_ctx_t*)ctx_;
+    wj_acc_t acc[WJ_MAX_AGG];
+    int64_t  n_eq   = c->n_eq;
+    int64_t  n_agg  = c->n_agg;
+    int64_t  rn     = c->right_nrows;
+    const int64_t* right_sort = c->right_sort;
+    const int64_t* rt_time_i  = c->rt_time_i;
+    /* Per left row: binary-search the window [lb, ub) of sorted right ranks, aggregate it, store at slot lr. */
+    for (int64_t lr = start; lr < end; lr++) {
+        int64_t lo = c->lo_arr[lr];
+        int64_t hi = c->hi_arr[lr];
+        /* Equality-key values of this left row; the window must match all of them. */
+        int64_t target_eq[WJ_MAX_AGG];
+        for (int64_t e = 0; e < n_eq; e++)
+            target_eq[e] = c->left_eq_arr[e][lr];
+
+        /* lower_bound: first rank with (eq, time) >= (target_eq, lo) */
+        int64_t lb = 0, lb_hi = rn;
+        while (lb < lb_hi) {
+            int64_t m = (lb + lb_hi) >> 1;
+            int64_t ri = right_sort[m];
+            int cmp = 0;
+            for (int64_t e = 0; e < n_eq && cmp == 0; e++) {
+                int64_t rv = read_col_i64(c->eq_data[e], ri, c->eq_type[e], c->eq_attrs[e]);
+                if (rv < target_eq[e]) cmp = -1;
+                else if (rv > target_eq[e]) cmp = 1;
+            }
+            if (cmp == 0 && rt_time_i[ri] < lo) cmp = -1;
+            if (cmp < 0) lb = m + 1; else lb_hi = m;
+        }
+        int64_t ub = lb, ub_hi = rn;  /* upper_bound: first rank with (eq, time) > (target_eq, hi) */
+        while (ub < ub_hi) {
+            int64_t m = (ub + ub_hi) >> 1;
+            int64_t ri = right_sort[m];
+            int cmp = 0;
+            for (int64_t e = 0; e < n_eq && cmp == 0; e++) {
+                int64_t rv = read_col_i64(c->eq_data[e], ri, c->eq_type[e], c->eq_attrs[e]);
+                if (rv < target_eq[e]) cmp = -1;
+                else if (rv > target_eq[e]) cmp = 1;
+            }
+            if (cmp == 0 && rt_time_i[ri] <= hi) cmp = -1;
+            if (cmp < 0) ub = m + 1; else ub_hi = m;
+        }
+        /* Ranks [lb, ub) now hold matching keys with lo <= time <= hi; reset the accumulators. */
+        memset(acc, 0, sizeof(acc));
+        for (int64_t a = 0; a < n_agg; a++) {
+            if (c->agg_ops[a] == OP_PROD) { acc[a].prod_i = 1; acc[a].prod_f = 1.0; }
+        }
+
+        /* Per-agg tight scan (hoisted switch, sequential SIMD-friendly read) */
+        for (int64_t a = 0; a < n_agg; a++) {
+            if (c->agg_raw[a]) continue;  /* placeholder column: nothing to scan */
+            wj_acc_t* A = &acc[a];
+            uint16_t op = c->agg_ops[a];
+            if (op == OP_COUNT) { A->cnt += (ub - lb); continue; }  /* counts every window row, no null mask consulted */
+
+            const uint8_t* nn = c->sorted_nn[a];  /* NULL => no mask, every row is treated as present */
+            if (c->agg_is_float[a]) {
+                const double* ss = c->sorted_f[a];
+                switch (op) {
+                case OP_SUM: case OP_AVG: {
+                    double sum = 0; int64_t cnt = 0;
+                    if (nn) { for (int64_t k = lb; k < ub; k++) if (nn[k]) { sum += ss[k]; cnt++; } }
+                    else    { for (int64_t k = lb; k < ub; k++) sum += ss[k]; cnt = ub - lb; }
+                    A->sum_f = sum; A->cnt = cnt; break;
+                }
+                case OP_VAR: case OP_VAR_POP:
+                case OP_STDDEV: case OP_STDDEV_POP: {
+                    double sum = 0, sum2 = 0; int64_t cnt = 0;
+                    if (nn) {
+                        for (int64_t k = lb; k < ub; k++)
+                            if (nn[k]) { double v = ss[k]; sum += v; sum2 += v * v; cnt++; }
+                    } else {
+                        for (int64_t k = lb; k < ub; k++) { double v = ss[k]; sum += v; sum2 += v * v; }
+                        cnt = ub - lb;
+                    }
+                    A->sum_f = sum; A->sum_sq_f = sum2; A->cnt = cnt; break;
+                }
+                case OP_PROD: {
+                    double p = 1.0; int64_t cnt = 0;
+                    if (nn) { for (int64_t k = lb; k < ub; k++) if (nn[k]) { p *= ss[k]; cnt++; } }
+                    else    { for (int64_t k = lb; k < ub; k++) p *= ss[k]; cnt = ub - lb; }
+                    A->prod_f = p; A->cnt = cnt; break;
+                }
+                case OP_MIN: {
+                    int64_t k = lb;
+                    if (nn) {
+                        double best = 0; int64_t cnt = 0;
+                        for (; k < ub; k++) if (nn[k]) { best = ss[k]; cnt = 1; k++; break; }  /* seed with the first non-null */
+                        for (; k < ub; k++) if (nn[k]) { double v = ss[k]; if (v < best) best = v; cnt++; }
+                        A->extreme_f = best; A->cnt = cnt;
+                    } else if (k < ub) {
+                        double best = ss[k++];
+                        for (; k < ub; k++) { double v = ss[k]; if (v < best) best = v; }
+                        A->extreme_f = best; A->cnt = ub - lb;
+                    }
+                    break;
+                }
+                case OP_MAX: {
+                    int64_t k = lb;
+                    if (nn) {
+                        double best = 0; int64_t cnt = 0;
+                        for (; k < ub; k++) if (nn[k]) { best = ss[k]; cnt = 1; k++; break; }  /* seed with the first non-null */
+                        for (; k < ub; k++) if (nn[k]) { double v = ss[k]; if (v > best) best = v; cnt++; }
+                        A->extreme_f = best; A->cnt = cnt;
+                    } else if (k < ub) {
+                        double best = ss[k++];
+                        for (; k < ub; k++) { double v = ss[k]; if (v > best) best = v; }
+                        A->extreme_f = best; A->cnt = ub - lb;
+                    }
+                    break;
+                }
+                case OP_FIRST: {
+                    if (nn) {
+                        int64_t cnt = 0;
+                        for (int64_t k = lb; k < ub; k++) if (nn[k]) {
+                            if (cnt == 0) A->extreme_f = ss[k];
+                            cnt++;
+                        }
+                        A->cnt = cnt;
+                    } else if (lb < ub) {
+                        A->extreme_f = ss[lb]; A->cnt = ub - lb;
+                    }
+                    break;
+                }
+                case OP_LAST: {
+                    if (nn) {
+                        int64_t cnt = 0, last_k = -1;
+                        for (int64_t k = lb; k < ub; k++) if (nn[k]) { last_k = k; cnt++; }
+                        if (last_k >= 0) A->extreme_f = ss[last_k];
+                        A->cnt = cnt;
+                    } else if (lb < ub) {
+                        A->extreme_f = ss[ub - 1]; A->cnt = ub - lb;
+                    }
+                    break;
+                }
+                default: break;
+                }
+            } else {
+                const int64_t* ss = c->sorted_i[a];
+                switch (op) {
+                case OP_SUM: case OP_AVG: {
+                    int64_t sum = 0; int64_t cnt = 0;
+                    if (nn) { for (int64_t k = lb; k < ub; k++) if (nn[k]) { sum += ss[k]; cnt++; } }
+                    else    { for (int64_t k = lb; k < ub; k++) sum += ss[k]; cnt = ub - lb; }
+                    A->sum_i = sum; A->cnt = cnt; break;
+                }
+                case OP_VAR: case OP_VAR_POP:
+                case OP_STDDEV: case OP_STDDEV_POP: {
+                    int64_t sum = 0, sum2 = 0; int64_t cnt = 0;
+                    if (nn) {
+                        for (int64_t k = lb; k < ub; k++)
+                            if (nn[k]) { int64_t v = ss[k]; sum += v; sum2 += v * v; cnt++; }
+                    } else {
+                        for (int64_t k = lb; k < ub; k++) { int64_t v = ss[k]; sum += v; sum2 += v * v; }
+                        cnt = ub - lb;
+                    }
+                    A->sum_i = sum; A->sum_sq_i = sum2; A->cnt = cnt; break;
+                }
+                case OP_PROD: {
+                    int64_t p = 1; int64_t cnt = 0;
+                    if (nn) { for (int64_t k = lb; k < ub; k++) if (nn[k]) { p *= ss[k]; cnt++; } }
+                    else    { for (int64_t k = lb; k < ub; k++) p *= ss[k]; cnt = ub - lb; }
+                    A->prod_i = p; A->cnt = cnt; break;
+                }
+                case OP_MIN: {
+                    int64_t k = lb;
+                    if (nn) {
+                        int64_t best = 0, cnt = 0;
+                        for (; k < ub; k++) if (nn[k]) { best = ss[k]; cnt = 1; k++; break; }  /* seed with the first non-null */
+                        for (; k < ub; k++) if (nn[k]) { int64_t v = ss[k]; if (v < best) best = v; cnt++; }
+                        A->extreme_i = best; A->cnt = cnt;
+                    } else if (k < ub) {
+                        int64_t best = ss[k++];
+                        for (; k < ub; k++) { int64_t v = ss[k]; if (v < best) best = v; }
+                        A->extreme_i = best; A->cnt = ub - lb;
+                    }
+                    break;
+                }
+                case OP_MAX: {
+                    int64_t k = lb;
+                    if (nn) {
+                        int64_t best = 0, cnt = 0;
+                        for (; k < ub; k++) if (nn[k]) { best = ss[k]; cnt = 1; k++; break; }  /* seed with the first non-null */
+                        for (; k < ub; k++) if (nn[k]) { int64_t v = ss[k]; if (v > best) best = v; cnt++; }
+                        A->extreme_i = best; A->cnt = cnt;
+                    } else if (k < ub) {
+                        int64_t best = ss[k++];
+                        for (; k < ub; k++) { int64_t v = ss[k]; if (v > best) best = v; }
+                        A->extreme_i = best; A->cnt = ub - lb;
+                    }
+                    break;
+                }
+                case OP_FIRST: {
+                    if (nn) {
+                        int64_t cnt = 0;
+                        for (int64_t k = lb; k < ub; k++) if (nn[k]) {
+                            if (cnt == 0) A->extreme_i = ss[k];
+                            cnt++;
+                        }
+                        A->cnt = cnt;
+                    } else if (lb < ub) {
+                        A->extreme_i = ss[lb]; A->cnt = ub - lb;
+                    }
+                    break;
+                }
+                case OP_LAST: {
+                    if (nn) {
+                        int64_t cnt = 0, last_k = -1;
+                        for (int64_t k = lb; k < ub; k++) if (nn[k]) { last_k = k; cnt++; }
+                        if (last_k >= 0) A->extreme_i = ss[last_k];
+                        A->cnt = cnt;
+                    } else if (lb < ub) {
+                        A->extreme_i = ss[ub - 1]; A->cnt = ub - lb;
+                    }
+                    break;
+                }
+                default: break;
+                }
+            }
+        }
+
+        /* Finalize → indexed write at slot lr */
+        for (int64_t a = 0; a < n_agg; a++) {
+            wj_acc_t* A = &acc[a];
+            int8_t rty = c->agg_result_types[a];
+            bool null_out = false;
+            int64_t out_i = 0;
+            double  out_f = 0.0;
+
+            if (c->agg_raw[a]) {
+                null_out = true;  /* placeholder aggregations always yield null */
+            } else {
+                switch (c->agg_ops[a]) {
+                case OP_COUNT: out_i = A->cnt; break;
+                case OP_SUM:
+                    if (c->agg_is_float[a]) out_f = A->sum_f; else out_i = A->sum_i;
+                    break;
+                case OP_PROD:
+                    if (A->cnt == 0) null_out = true;
+                    else if (c->agg_is_float[a]) out_f = A->prod_f; else out_i = A->prod_i;
+                    break;
+                case OP_MIN: case OP_MAX: case OP_FIRST: case OP_LAST:
+                    if (A->cnt == 0) null_out = true;
+                    else if (c->agg_is_float[a]) out_f = A->extreme_f; else out_i = A->extreme_i;
+                    break;
+                case OP_AVG:
+                    if (A->cnt == 0) null_out = true;
+                    else out_f = c->agg_is_float[a]
+                        ? A->sum_f / (double)A->cnt
+                        : (double)A->sum_i / (double)A->cnt;
+                    break;
+                case OP_VAR: case OP_VAR_POP:
+                case OP_STDDEV: case OP_STDDEV_POP: {
+                    bool sample = (c->agg_ops[a] == OP_VAR || c->agg_ops[a] == OP_STDDEV);
+                    bool insuf = sample ? (A->cnt <= 1) : (A->cnt <= 0);  /* sample forms need at least two values */
+                    if (insuf) { null_out = true; break; }
+                    double mean, var_pop;
+                    if (c->agg_is_float[a]) {
+                        mean    = A->sum_f / (double)A->cnt;
+                        var_pop = A->sum_sq_f / (double)A->cnt - mean * mean;  /* E[x^2] - mean^2 */
+                    } else {
+                        mean    = (double)A->sum_i / (double)A->cnt;
+                        var_pop = (double)A->sum_sq_i / (double)A->cnt - mean * mean;  /* E[x^2] - mean^2 */
+                    }
+                    if (var_pop < 0) var_pop = 0;  /* clamp a negative difference to zero before scaling / sqrt */
+                    if      (c->agg_ops[a] == OP_VAR_POP)    out_f = var_pop;
+                    else if (c->agg_ops[a] == OP_VAR)        out_f = var_pop * A->cnt / (A->cnt - 1);  /* rescale by n / (n - 1) */
+                    else if (c->agg_ops[a] == OP_STDDEV_POP) out_f = sqrt(var_pop);
+                    else                                     out_f = sqrt(var_pop * A->cnt / (A->cnt - 1));
+                    break;
+                }
+                default: null_out = true; break;
+                }
+            }
+
+            c->result_null[a][lr] = null_out ? 1 : 0;
+            if (null_out) continue;
+            /* Store with the element width that matches the result column type. */
+            void* rd = c->result_data[a];
+            if (rty == RAY_F64)       ((double*)rd)[lr] = out_f;
+            else if (rty == RAY_F32)  ((float*)rd)[lr]  = (float)out_f;
+            else if (rty == RAY_I64 || rty == RAY_TIMESTAMP)
+                ((int64_t*)rd)[lr] = out_i;
+            else if (rty == RAY_I32 || rty == RAY_DATE || rty == RAY_TIME)
+                ((int32_t*)rd)[lr] = (int32_t)out_i;
+            else if (rty == RAY_I16)  ((int16_t*)rd)[lr] = (int16_t)out_i;
+            else                       ((uint8_t*)rd)[lr] = (uint8_t)out_i;  /* every remaining type is written as one byte */
+        }
+    }
+}
+
+/* (window-join t1 t2 [eq-keys] time-col)
+ * ASOF join: for each left row, find closest right row with time <= left.time
+ * within the same equality partition. */
+ray_t* ray_window_join_fn(ray_t** args, int64_t n) {
+    if (n < 4) return ray_error("domain", NULL);
+
+    /* Special form: evaluate first 4 args, keep agg dict (args[4]) unevaluated */
+    ray_t* eargs[5];
+    for (int i = 0; i < 4 && i < (int)n; i++) {
+        eargs[i] = ray_eval(args[i]);
+        if (!eargs[i] || RAY_IS_ERR(eargs[i])) {
+            for (int j = 0; j < i; j++) ray_release(eargs[j]);
+            return eargs[i] ? eargs[i] : ray_error("type", NULL);
+        }
+    }
+    eargs[4] = (n >= 5) ? args[4] : NULL; /* agg dict stays unevaluated */
+
+    /* Rayforce calling convention:
+     * (window-join [eq+time keys] intervals left right {agg}) */
+    if (n >= 5 && ray_is_vec(eargs[0]) && eargs[0]->type == RAY_SYM &&
+        eargs[2]->type == RAY_TABLE && eargs[3]->type == RAY_TABLE) {
+        /* Rayforce convention: implement at eval level.
+         * See file-scope wj_scan_fn / wj_scan_ctx_t for the parallel worker. */
+        ray_t* keys_vec = eargs[0];      /* [Sym Time] — equality + time keys */
+        ray_t* intervals = eargs[1];     /* list of [lo hi] time windows */
+        ray_t* left_tbl = eargs[2];      /* trades */
+        ray_t* right_tbl = eargs[3];     /* quotes */
+        ray_t* agg_dict = eargs[4];      /* unevaluated dict */
+
+        int64_t nkeys = ray_len(keys_vec);
+        if (nkeys < 2) return ray_error("domain", NULL);
+        int64_t* key_ids = (int64_t*)ray_data(keys_vec);
+
+        /* Last key is the time key, rest are equality keys */
+        int64_t time_key = key_ids[nkeys - 1];
+        int64_t n_eq = nkeys - 1;
+
+        int64_t left_nrows = ray_table_nrows(left_tbl);
+        int64_t right_nrows = ray_table_nrows(right_tbl);
+
+        /* Get left time column */
+        ray_t* left_time = ray_table_get_col(left_tbl, time_key);
+        ray_t* right_time = ray_table_get_col(right_tbl, time_key);
+        if (!left_time || !right_time) return ray_error("domain", NULL);
+
+        /* Get equality columns */
+        ray_t* left_eq[16], *right_eq[16];
+        for (int64_t e = 0; e < n_eq && e < 16; e++) {
+            left_eq[e] = ray_table_get_col(left_tbl, key_ids[e]);
+            right_eq[e] = ray_table_get_col(right_tbl, key_ids[e]);
+            if (!left_eq[e] || !right_eq[e]) return ray_error("domain", NULL);
+        }
+
+        /* Parse every (name, (op src)) pair from the agg dict.  The dict's
+         * physical layout is [keys (SYM vec), vals (LIST)] — read keys[i]
+         * via ray_read_sym and pair it with vals[i] from the LIST.
+         * WJ_MAX_AGG is defined at file scope (for wj_scan_ctx_t). */
+        int64_t  agg_names[WJ_MAX_AGG];
+        uint16_t agg_ops[WJ_MAX_AGG];
+        int64_t  agg_src_ids[WJ_MAX_AGG];
+        ray_t*   agg_src_vecs[WJ_MAX_AGG] = {0};
+        int8_t   agg_types[WJ_MAX_AGG];
+        int      agg_is_float[WJ_MAX_AGG];
+        ray_t*   agg_result_vecs[WJ_MAX_AGG] = {0};
+        int      agg_raw[WJ_MAX_AGG] = {0};  /* {name: Col} bare-column form — legacy placeholder */
+        int64_t  n_agg = 0;
+
+        if (agg_dict && agg_dict->type == RAY_DICT) {
+            ray_t* dkeys = ray_dict_keys(agg_dict);
+            ray_t* dvals = ray_dict_vals(agg_dict);
+            int64_t adn = (dkeys && dkeys->type == RAY_SYM) ? dkeys->len : 0;
+            ray_t** lvals = (dvals && dvals->type == RAY_LIST) ? (ray_t**)ray_data(dvals) : NULL;
+            for (int64_t di = 0; di < adn && n_agg < WJ_MAX_AGG; di++) {
+                int64_t kname_id = ray_read_sym(ray_data(dkeys), di, RAY_SYM, dkeys->attrs);
+                ray_t* expr = lvals ? lvals[di] : NULL;
+                if (!expr) continue;
+                /* (op col) aggregation form */
+                if (expr->type == RAY_LIST && expr->len >= 2) {
+                    ray_t** ae = (ray_t**)ray_data(expr);
+                    if (!(ae[0]->type == -RAY_SYM && (ae[0]->attrs & RAY_ATTR_NAME))) continue;
+                    if (!(ae[1]->type == -RAY_SYM && (ae[1]->attrs & RAY_ATTR_NAME))) continue;
+                    agg_names[n_agg]   = kname_id;
+                    agg_ops[n_agg]     = resolve_agg_opcode(ae[0]->i64);
+                    agg_src_ids[n_agg] = ae[1]->i64;
+                    agg_raw[n_agg]     = 0;
+                    n_agg++;
+                    continue;
+                }
+                /* Bare column reference — legacy map-group form, emitted as null column */
+                if (expr->type == -RAY_SYM && (expr->attrs & RAY_ATTR_NAME)) {
+                    agg_names[n_agg]   = kname_id;
+                    agg_ops[n_agg]     = OP_MIN;
+                    agg_src_ids[n_agg] = expr->i64;
+                    agg_raw[n_agg]     = 1;
+                    n_agg++;
+                    continue;
+                }
+            }
+        }
+
+        /* Resolve sources, pick result types, allocate result vectors.
+         * Raw bare-column form ({name: Col}) is a legacy placeholder — it
+         * accepts any column type (numeric or not) and always produces a
+         * nullable i64 column filled with nulls. All true aggregation ops
+         * require a numeric source column. */
+        int8_t agg_result_types[WJ_MAX_AGG];
+        for (int64_t a = 0; a < n_agg; a++) {
+            if (agg_raw[a]) {
+                agg_src_vecs[a]    = NULL;
+                agg_types[a]       = RAY_I64;
+                agg_is_float[a]    = 0;
+                agg_result_types[a] = RAY_I64;
+                agg_result_vecs[a] = ray_vec_new(RAY_I64, left_nrows);
+                if (RAY_IS_ERR(agg_result_vecs[a])) {
+                    ray_t* err = agg_result_vecs[a];
+                    for (int64_t b = 0; b < a; b++) ray_release(agg_result_vecs[b]);
+                    for (int i = 0; i < 4; i++) ray_release(eargs[i]);
+                    return err;
+                }
+                continue;
+            }
+            if (agg_ops[a] == 0) {
+                for (int64_t b = 0; b < a; b++) ray_release(agg_result_vecs[b]);
+                for (int i = 0; i < 4; i++) ray_release(eargs[i]);
+                return ray_error("domain", NULL);
+            }
+            ray_t* src = ray_table_get_col(right_tbl, agg_src_ids[a]);
+            if (!src) {
+                for (int64_t b = 0; b < a; b++) ray_release(agg_result_vecs[b]);
+                for (int i = 0; i < 4; i++) ray_release(eargs[i]);
+                return ray_error("domain", NULL);
+            }
+            int8_t t = src->type;
+            /* COUNT never reads source values — accept any column type. Every
+             * other aggregation reads v_i/v_f and requires a numeric source. */
+            if (agg_ops[a] != OP_COUNT) {
+                switch (t) {
+                case RAY_I64: case RAY_I32: case RAY_I16: case RAY_U8:
+                case RAY_F64: case RAY_F32: case RAY_BOOL:
+                case RAY_DATE: case RAY_TIME: case RAY_TIMESTAMP:
+                    break;
+                default:
+                    for (int64_t b = 0; b < a; b++) ray_release(agg_result_vecs[b]);
+                    for (int i = 0; i < 4; i++) ray_release(eargs[i]);
+                    return ray_error("type", NULL);
+                }
+            }
+            agg_src_vecs[a]  = src;
+            agg_types[a]     = t;
+            agg_is_float[a]  = (t == RAY_F64 || t == RAY_F32);
+
+            int8_t rt;
+            switch (agg_ops[a]) {
+            case OP_COUNT: rt = RAY_I64; break;
+            case OP_AVG:
+            case OP_VAR: case OP_VAR_POP:
+            case OP_STDDEV: case OP_STDDEV_POP: rt = RAY_F64; break;
+            case OP_SUM: case OP_PROD:
+                rt = agg_is_float[a] ? RAY_F64 : RAY_I64; break;
+            default: /* MIN/MAX/FIRST/LAST */ rt = t; break;
+            }
+            agg_result_types[a] = rt;
+            agg_result_vecs[a] = ray_vec_new(rt, left_nrows);
+            if (RAY_IS_ERR(agg_result_vecs[a])) {
+                ray_t* err = agg_result_vecs[a];
+                for (int64_t b = 0; b < a; b++) ray_release(agg_result_vecs[b]);
+                for (int i = 0; i < 4; i++) ray_release(eargs[i]);
+                return err;
+            }
+        }
+
+        /* wj_acc_t is defined at file scope now (used by wj_scan_fn). */
+
+        /* Sort right table by (eq_keys..., time) once so each left row only
+         * scans the quote rows whose (eq,time) fall inside its window.
+         * Per-row cost drops from O(right_nrows) to O(log right_nrows + window). */
+        ray_t*   rs_hdr = NULL, *rt_hdr = NULL, *tmp_hdr = NULL;
+        int64_t* right_sort = NULL;
+        int64_t* rt_time_i  = NULL;
+        int64_t* tmp_sort   = NULL;
+        const void* eq_data[16];
+        int8_t      eq_type[16];
+        uint8_t     eq_attrs[16];
+        for (int64_t e = 0; e < n_eq; e++) {
+            eq_data[e]  = ray_data(right_eq[e]);
+            eq_type[e]  = right_eq[e]->type;
+            eq_attrs[e] = right_eq[e]->attrs;
+        }
+        if (right_nrows > 0) {
+            right_sort = (int64_t*)scratch_alloc(&rs_hdr,  (size_t)right_nrows * sizeof(int64_t));
+            rt_time_i  = (int64_t*)scratch_alloc(&rt_hdr,  (size_t)right_nrows * sizeof(int64_t));
+            tmp_sort   = (int64_t*)scratch_alloc(&tmp_hdr, (size_t)right_nrows * sizeof(int64_t));
+            if (!right_sort || !rt_time_i || !tmp_sort) {
+                if (rs_hdr)  scratch_free(rs_hdr);
+                if (rt_hdr)  scratch_free(rt_hdr);
+                if (tmp_hdr) scratch_free(tmp_hdr);
+                for (int64_t a = 0; a < n_agg; a++) ray_release(agg_result_vecs[a]);
+                for (int i = 0; i < 4; i++) ray_release(eargs[i]);
+                return ray_error("oom", NULL);
+            }
+            /* Cache time column access so the sort compare avoids reloading them */
+            int8_t  rt_type  = right_time->type;
+            uint8_t rt_attrs = right_time->attrs;
+            const void* rt_data = ray_data(right_time);
+            for (int64_t rr = 0; rr < right_nrows; rr++) {
+                right_sort[rr] = rr;
+                rt_time_i[rr]  = read_col_i64(rt_data, rr, rt_type, rt_attrs);
+            }
+            /* Bottom-up merge sort on index array */
+            for (int64_t width = 1; width < right_nrows; width *= 2) {
+                for (int64_t lo = 0; lo < right_nrows; lo += 2 * width) {
+                    int64_t mid = lo + width;
+                    int64_t hi  = lo + 2 * width;
+                    if (mid > right_nrows) mid = right_nrows;
+                    if (hi  > right_nrows) hi  = right_nrows;
+                    int64_t a = lo, b = mid, t = lo;
+                    while (a < mid && b < hi) {
+                        int64_t ai = right_sort[a], bi = right_sort[b];
+                        int cmp = 0;
+                        for (int64_t e = 0; e < n_eq && cmp == 0; e++) {
+                            int64_t va = read_col_i64(eq_data[e], ai, eq_type[e], eq_attrs[e]);
+                            int64_t vb = read_col_i64(eq_data[e], bi, eq_type[e], eq_attrs[e]);
+                            if (va < vb) cmp = -1;
+                            else if (va > vb) cmp = 1;
+                        }
+                        if (cmp == 0) {
+                            if      (rt_time_i[ai] < rt_time_i[bi]) cmp = -1;
+                            else if (rt_time_i[ai] > rt_time_i[bi]) cmp = 1;
+                        }
+                        tmp_sort[t++] = (cmp <= 0) ? right_sort[a++] : right_sort[b++];
+                    }
+                    while (a < mid) tmp_sort[t++] = right_sort[a++];
+                    while (b < hi)  tmp_sort[t++] = right_sort[b++];
+                    for (int64_t c = lo; c < hi; c++) right_sort[c] = tmp_sort[c];
+                }
+            }
+            scratch_free(tmp_hdr);
+            tmp_hdr  = NULL;
+            tmp_sort = NULL;
+        }
+
+        /* Preload one sorted source column per aggregation.
+         * After sorting right_sort, the hot loop wants *sequential* access
+         * (SIMD + prefetch friendly) — not an indirect gather through
+         * right_sort[k]. We materialize sorted_src_i[a][k] = value at
+         * right_sort[k] once, then every left row's window scans are a
+         * plain array walk.
+         *
+         * COUNT / raw form carry no source; nothing to preload. PROD and
+         * ops on null-containing columns still go through the slow scan
+         * (see below), so the preload is gated on the easy numeric cases. */
+        int64_t* sorted_i[WJ_MAX_AGG]  = {0};
+        double*  sorted_f[WJ_MAX_AGG]  = {0};
+        uint8_t* sorted_nn[WJ_MAX_AGG] = {0};  /* 0 = null, 1 = value present */
+        ray_t*   sorted_i_hdr[WJ_MAX_AGG] = {0};
+        ray_t*   sorted_f_hdr[WJ_MAX_AGG] = {0};
+        ray_t*   sorted_nn_hdr[WJ_MAX_AGG] = {0};
+        for (int64_t a = 0; a < n_agg; a++) {
+            if (agg_raw[a] || agg_ops[a] == OP_COUNT) continue;
+            ray_t* src = agg_src_vecs[a];
+            if (!src || right_nrows == 0) continue;
+            bool has_nulls = (src->attrs & RAY_ATTR_HAS_NULLS) != 0;
+            if (has_nulls) {
+                sorted_nn[a] = (uint8_t*)scratch_alloc(&sorted_nn_hdr[a],
+                                                        (size_t)right_nrows);
+                if (!sorted_nn[a]) { goto wj_preload_oom; }
+            }
+            if (agg_is_float[a]) {
+                sorted_f[a] = (double*)scratch_alloc(&sorted_f_hdr[a],
+                                                      (size_t)right_nrows * sizeof(double));
+                if (!sorted_f[a]) { goto wj_preload_oom; }
+                int8_t t = agg_types[a];
+                const void* sd = ray_data(src);
+                for (int64_t k = 0; k < right_nrows; k++) {
+                    int64_t rr = right_sort[k];
+                    double v = (t == RAY_F32)
+                        ? (double)((const float*)sd)[rr]
+                        : ((const double*)sd)[rr];
+                    sorted_f[a][k] = v;
+                    if (has_nulls) sorted_nn[a][k] = ray_vec_is_null(src, rr) ? 0 : 1;
+                }
+            } else {
+                sorted_i[a] = (int64_t*)scratch_alloc(&sorted_i_hdr[a],
+                                                       (size_t)right_nrows * sizeof(int64_t));
+                if (!sorted_i[a]) { goto wj_preload_oom; }
+                const void* sd = ray_data(src);
+                int8_t t = agg_types[a];
+                uint8_t at = src->attrs;
+                for (int64_t k = 0; k < right_nrows; k++) {
+                    int64_t rr = right_sort[k];
+                    sorted_i[a][k] = read_col_i64(sd, rr, t, at);
+                    if (has_nulls) sorted_nn[a][k] = ray_vec_is_null(src, rr) ? 0 : 1;
+                }
+            }
+        }
+
+        #define WJ_CLEANUP_TEMP() do {                              \
+            if (rs_hdr) scratch_free(rs_hdr);                       \
+            if (rt_hdr) scratch_free(rt_hdr);                       \
+            if (tmp_hdr) scratch_free(tmp_hdr);                     \
+            for (int64_t _a = 0; _a < n_agg; _a++) {                \
+                if (sorted_i_hdr[_a])  scratch_free(sorted_i_hdr[_a]);  \
+                if (sorted_f_hdr[_a])  scratch_free(sorted_f_hdr[_a]);  \
+                if (sorted_nn_hdr[_a]) scratch_free(sorted_nn_hdr[_a]); \
+            }                                                       \
+        } while (0)
+
+        if (0) {
+        wj_preload_oom:
+            WJ_CLEANUP_TEMP();
+            for (int64_t a = 0; a < n_agg; a++) ray_release(agg_result_vecs[a]);
+            for (int i = 0; i < 4; i++) ray_release(eargs[i]);
+            return ray_error("oom", NULL);
+        }
+
+        /* Pre-extract left-row metadata (interval endpoints + eq-key tuples)
+         * into flat int64 arrays. This hoists all ray_t allocation and the
+         * width-aware reads out of the hot loop so the parallel worker can
+         * process rows without touching any ref-counted objects. */
+        ray_t* lo_hdr = NULL, *hi_hdr = NULL;
+        int64_t* lo_arr = (int64_t*)scratch_alloc(&lo_hdr, (size_t)left_nrows * sizeof(int64_t));
+        int64_t* hi_arr = (int64_t*)scratch_alloc(&hi_hdr, (size_t)left_nrows * sizeof(int64_t));
+        if ((!lo_arr || !hi_arr) && left_nrows > 0) {
+            if (lo_hdr) scratch_free(lo_hdr);
+            if (hi_hdr) scratch_free(hi_hdr);
+            WJ_CLEANUP_TEMP();
+            for (int64_t a = 0; a < n_agg; a++) ray_release(agg_result_vecs[a]);
+            for (int i = 0; i < 4; i++) ray_release(eargs[i]);
+            return ray_error("oom", NULL);
+        }
+        for (int64_t lr = 0; lr < left_nrows; lr++) {
+            int alloc_iv = 0;
+            ray_t* iv = collection_elem(intervals, lr, &alloc_iv);
+            if (!iv || RAY_IS_ERR(iv) || ray_len(iv) < 2) {
+                if (alloc_iv && iv) ray_release(iv);
+                if (lo_hdr) scratch_free(lo_hdr);
+                if (hi_hdr) scratch_free(hi_hdr);
+                WJ_CLEANUP_TEMP();
+                for (int64_t a = 0; a < n_agg; a++) ray_release(agg_result_vecs[a]);
+                for (int i = 0; i < 4; i++) ray_release(eargs[i]);
+                return ray_error("domain", NULL);
+            }
+            int alloc_lo = 0, alloc_hi = 0;
+            ray_t* lo_atom = collection_elem(iv, 0, &alloc_lo);
+            ray_t* hi_atom = collection_elem(iv, 1, &alloc_hi);
+            lo_arr[lr] = as_i64(lo_atom);
+            hi_arr[lr] = as_i64(hi_atom);
+            if (alloc_lo) ray_release(lo_atom);
+            if (alloc_hi) ray_release(hi_atom);
+            if (alloc_iv) ray_release(iv);
+        }
+
+        ray_t*   left_eq_hdr[WJ_MAX_AGG] = {0};
+        int64_t* left_eq_arr[WJ_MAX_AGG] = {0};
+        for (int64_t e = 0; e < n_eq; e++) {
+            left_eq_arr[e] = (int64_t*)scratch_alloc(&left_eq_hdr[e],
+                                                     (size_t)left_nrows * sizeof(int64_t));
+            if (!left_eq_arr[e] && left_nrows > 0) {
+                if (lo_hdr) scratch_free(lo_hdr);
+                if (hi_hdr) scratch_free(hi_hdr);
+                for (int64_t f = 0; f < e; f++)
+                    if (left_eq_hdr[f]) scratch_free(left_eq_hdr[f]);
+                WJ_CLEANUP_TEMP();
+                for (int64_t a = 0; a < n_agg; a++) ray_release(agg_result_vecs[a]);
+                for (int i = 0; i < 4; i++) ray_release(eargs[i]);
+                return ray_error("oom", NULL);
+            }
+            const void* sd = ray_data(left_eq[e]);
+            int8_t  t  = left_eq[e]->type;
+            uint8_t at = left_eq[e]->attrs;
+            for (int64_t lr = 0; lr < left_nrows; lr++)
+                left_eq_arr[e][lr] = read_col_i64(sd, lr, t, at);
+        }
+
+        /* Pre-size each result vector and allocate a 1-byte-per-row null
+         * staging array — writers index by lr without touching the nullmap. */
+        ray_t*   null_stage_hdr[WJ_MAX_AGG] = {0};
+        uint8_t* null_stage[WJ_MAX_AGG]     = {0};
+        for (int64_t a = 0; a < n_agg; a++) {
+            agg_result_vecs[a]->len = left_nrows;
+            null_stage[a] = (uint8_t*)scratch_alloc(&null_stage_hdr[a], (size_t)left_nrows);
+            if (!null_stage[a] && left_nrows > 0) {
+                if (lo_hdr) scratch_free(lo_hdr);
+                if (hi_hdr) scratch_free(hi_hdr);
+                for (int64_t f = 0; f < n_eq; f++) if (left_eq_hdr[f]) scratch_free(left_eq_hdr[f]);
+                for (int64_t b = 0; b < a; b++) if (null_stage_hdr[b]) scratch_free(null_stage_hdr[b]);
+                WJ_CLEANUP_TEMP();
+                for (int64_t b = 0; b < n_agg; b++) ray_release(agg_result_vecs[b]);
+                for (int i = 0; i < 4; i++) ray_release(eargs[i]);
+                return ray_error("oom", NULL);
+            }
+            memset(null_stage[a], 0, (size_t)left_nrows);
+        }
+
+        /* Build the scan context and dispatch. */
+        wj_scan_ctx_t wctx;
+        memset(&wctx, 0, sizeof(wctx));
+        wctx.left_nrows  = left_nrows;
+        wctx.right_nrows = right_nrows;
+        wctx.n_eq        = n_eq;
+        wctx.n_agg       = n_agg;
+        wctx.lo_arr      = lo_arr;
+        wctx.hi_arr      = hi_arr;
+        wctx.right_sort  = right_sort;
+        wctx.rt_time_i   = rt_time_i;
+        for (int64_t e = 0; e < n_eq; e++) {
+            wctx.left_eq_arr[e] = left_eq_arr[e];
+            wctx.eq_data[e]     = eq_data[e];
+            wctx.eq_type[e]     = eq_type[e];
+            wctx.eq_attrs[e]    = eq_attrs[e];
+        }
+        for (int64_t a = 0; a < n_agg; a++) {
+            wctx.agg_raw[a]          = (uint8_t)agg_raw[a];
+            wctx.agg_ops[a]          = agg_ops[a];
+            wctx.agg_result_types[a] = agg_result_types[a];
+            wctx.agg_is_float[a]     = agg_is_float[a];
+            wctx.sorted_i[a]         = sorted_i[a];
+            wctx.sorted_f[a]         = sorted_f[a];
+            wctx.sorted_nn[a]        = sorted_nn[a];
+            wctx.result_data[a]      = ray_data(agg_result_vecs[a]);
+            wctx.result_null[a]      = null_stage[a];
+        }
+
+        ray_pool_t* pool = ray_pool_get();
+        if (pool && left_nrows >= 2048) {
+            ray_pool_dispatch(pool, wj_scan_fn, &wctx, left_nrows);
+        } else {
+            wj_scan_fn(&wctx, 0, 0, left_nrows);
+        }
+
+        /* Apply staged null flags to each result vec's null bitmap sequentially. */
+        for (int64_t a = 0; a < n_agg; a++) {
+            ray_t* rv = agg_result_vecs[a];
+            const uint8_t* stage = null_stage[a];
+            for (int64_t lr = 0; lr < left_nrows; lr++)
+                if (stage[lr]) ray_vec_set_null(rv, lr, true);
+        }
+
+        /* Free pre-extract scratch */
+        if (lo_hdr) scratch_free(lo_hdr);
+        if (hi_hdr) scratch_free(hi_hdr);
+        for (int64_t e = 0; e < n_eq; e++)
+            if (left_eq_hdr[e]) scratch_free(left_eq_hdr[e]);
+        for (int64_t a = 0; a < n_agg; a++)
+            if (null_stage_hdr[a]) scratch_free(null_stage_hdr[a]);
+
+
+        WJ_CLEANUP_TEMP();
+        #undef WJ_CLEANUP_TEMP
+
+        /* Build result table: left columns + every aggregation column */
+        int64_t ncols = ray_table_ncols(left_tbl);
+        ray_t* result = ray_table_new(ncols + n_agg);
+        for (int64_t c = 0; c < ncols; c++) {
+            int64_t cn = ray_table_col_name(left_tbl, c);
+            ray_t* cv = ray_table_get_col_idx(left_tbl, c);
+            ray_retain(cv);
+            result = ray_table_add_col(result, cn, cv);
+            ray_release(cv);
+        }
+        for (int64_t a = 0; a < n_agg; a++) {
+            result = ray_table_add_col(result, agg_names[a], agg_result_vecs[a]);
+            ray_release(agg_result_vecs[a]);
+        }
+        for (int i = 0; i < 4; i++) ray_release(eargs[i]);
+        return result;
+        #undef WJ_MAX_AGG
+    }
+
+    ray_t* left_tbl  = eargs[0];
+    ray_t* right_tbl = eargs[1];
+    ray_t* eq_keys   = eargs[2];
+    ray_t* time_sym  = eargs[3];
+
+    if (left_tbl->type != RAY_TABLE || right_tbl->type != RAY_TABLE)
+        return ray_error("type", NULL);
+    if (time_sym->type != -RAY_SYM)
+        return ray_error("type", NULL);
+
+    uint8_t n_eq = 0;
+    ray_t** eq_elems = NULL;
+    ray_t* _bxeq = NULL;
+    eq_keys = unbox_vec_arg(eq_keys, &_bxeq);
+    if (is_list(eq_keys)) {
+        n_eq = (uint8_t)ray_len(eq_keys);
+        eq_elems = (ray_t**)ray_data(eq_keys);
+    }
+
+    ray_graph_t* g = ray_graph_new(left_tbl);
+    if (!g) return ray_error("oom", NULL);
+
+    ray_op_t* left_node  = ray_const_table(g, left_tbl);
+    ray_op_t* right_node = ray_const_table(g, right_tbl);
+
+    ray_t* tname = ray_sym_str(time_sym->i64);
+    if (!tname) { ray_graph_free(g); return ray_error("domain", NULL); }
+    ray_op_t* time_op = ray_scan(g, ray_str_ptr(tname));
+    if (!time_op) { ray_graph_free(g); return ray_error("domain", NULL); }
+
+    ray_op_t* eq_ops[16];
+    for (uint8_t i = 0; i < n_eq; i++) {
+        if (eq_elems[i]->type != -RAY_SYM) {
+            ray_graph_free(g);
+            return ray_error("type", NULL);
+        }
+        ray_t* nm = ray_sym_str(eq_elems[i]->i64);
+        if (!nm) { ray_graph_free(g); return ray_error("domain", NULL); }
+        eq_ops[i] = ray_scan(g, ray_str_ptr(nm));
+        if (!eq_ops[i]) { ray_graph_free(g); return ray_error("domain", NULL); }
+    }
+
+    if (_bxeq) ray_release(_bxeq);
+
+    ray_op_t* jn = ray_asof_join(g, left_node, right_node,
+                                time_op, eq_ops, n_eq, 1);
+    if (!jn) { ray_graph_free(g); return ray_error("oom", NULL); }
+
+    jn = ray_optimize(g, jn);
+    ray_t* result = ray_execute(g, jn);
+    ray_graph_free(g);
+    return result;
+}
+
+/* (asof-join [key1 key2 ... timeKey] leftTable rightTable) — builds and executes an as-of join graph.
+ * Last key is the time/asof column, rest (1..16) are equality keys.  Errors: "arity", "type", "domain", "oom". */
+ray_t* ray_asof_join_fn(ray_t** args, int64_t n) {
+    if (n < 3) return ray_error("arity", NULL);
+    ray_t* keys_vec   = args[0];
+    ray_t* left_tbl   = args[1];
+    ray_t* right_tbl  = args[2];
+
+    if (left_tbl->type != RAY_TABLE || right_tbl->type != RAY_TABLE)
+        return ray_error("type", NULL);
+
+    /* Keys: list of 2..17 entries = 1..16 eq keys (eq_ops[] capacity below) + time */
+    ray_t* _bxk = NULL;
+    keys_vec = unbox_vec_arg(keys_vec, &_bxk);
+    if (!is_list(keys_vec) || ray_len(keys_vec) < 2 || ray_len(keys_vec) > 16 + 1) {
+        if (_bxk) ray_release(_bxk);
+        return ray_error("domain", NULL);
+    }
+    ray_t** kelems = (ray_t**)ray_data(keys_vec);
+    int64_t nkeys = ray_len(keys_vec);
+
+    /* Last key is the time column */
+    ray_t* time_sym = kelems[nkeys - 1];
+    if (time_sym->type != -RAY_SYM) {
+        if (_bxk) ray_release(_bxk);
+        return ray_error("type", NULL);
+    }
+
+    /* Remaining keys are equality keys */
+    uint8_t n_eq = (uint8_t)(nkeys - 1);   /* 1..16 thanks to the length check above */
+    ray_t** eq_syms = kelems; /* first n_eq elements */
+
+    ray_graph_t* g = ray_graph_new(left_tbl);
+    if (!g) { if (_bxk) ray_release(_bxk); return ray_error("oom", NULL); }
+
+    ray_op_t* left_node  = ray_const_table(g, left_tbl);
+    ray_op_t* right_node = ray_const_table(g, right_tbl);
+
+    ray_t* tname = ray_sym_str(time_sym->i64);
+    if (!tname) { ray_graph_free(g); if (_bxk) ray_release(_bxk); return ray_error("domain", NULL); }
+    ray_op_t* time_op = ray_scan(g, ray_str_ptr(tname));
+    if (!time_op) { ray_graph_free(g); if (_bxk) ray_release(_bxk); return ray_error("domain", NULL); }
+
+    ray_op_t* eq_ops[16];   /* one scan op per equality key; n_eq never exceeds this */
+    for (uint8_t i = 0; i < n_eq; i++) {
+        if (eq_syms[i]->type != -RAY_SYM) {
+            ray_graph_free(g); if (_bxk) ray_release(_bxk);
+            return ray_error("type", NULL);
+        }
+        ray_t* nm = ray_sym_str(eq_syms[i]->i64);
+        if (!nm) { ray_graph_free(g); if (_bxk) ray_release(_bxk); return ray_error("domain", NULL); }
+        eq_ops[i] = ray_scan(g, ray_str_ptr(nm));
+        if (!eq_ops[i]) { ray_graph_free(g); if (_bxk) ray_release(_bxk); return ray_error("domain", NULL); }
+    }
+
+    if (_bxk) ray_release(_bxk);   /* key symbols are no longer read past this point */
+
+    ray_op_t* jn = ray_asof_join(g, left_node, right_node,
+                                time_op, eq_ops, n_eq, 1);
+    if (!jn) { ray_graph_free(g); return ray_error("oom", NULL); }
+
+    jn = ray_optimize(g, jn);
+    ray_t* result = ray_execute(g, jn);
+    ray_graph_free(g);
+    return result;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/rerank.c b/crates/rayforce-sys/vendor/rayforce/src/ops/rerank.c
new file mode 100644
index 0000000..a35b94b
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/rerank.c
@@ -0,0 +1,546 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+/*
+ * Rerank executors: combine a filtered source table with a top-K
+ * nearest-neighbour step (index-backed ANN or brute-force KNN) in
+ * one DAG op.  Used by `select ... where <p> nearest (ann|knn ...) take k`.
+ */
+
+#include "ops/internal.h"
+#include "ops/rowsel.h"
+#include "mem/sys.h"
+#include "store/hnsw.h"
+#include <math.h>
+#include <string.h>
+
+/* ==========================================================================
+ *  Helpers
+ * ========================================================================== */
+
+/* Read element `i` of a numeric vector (F32 / F64 / I32 / I64) as a double; any other type reads as 0.0. */
+static double rr_at_f64(ray_t* v, int64_t i) {
+    const void* base = ray_data(v);
+    const int8_t ty = v->type;
+    if (ty == RAY_F64) return ((const double*)base)[i];
+    if (ty == RAY_F32) return (double)((const float*)base)[i];
+    if (ty == RAY_I64) return (double)((const int64_t*)base)[i];
+    if (ty == RAY_I32) return (double)((const int32_t*)base)[i];
+    /* Unsupported element type. */
+    return 0.0;
+}
+
+static bool rr_is_numeric(ray_t* v) {   /* non-NULL vector of F32 / F64 / I32 / I64? */
+    if (v == NULL || !ray_is_vec(v)) return false;
+    switch (v->type) { case RAY_F32: case RAY_F64: case RAY_I32: case RAY_I64: return true; }
+    return false;
+}
+
+/* Distance metrics — mirror row_score in src/ops/embedding.c. */
+typedef enum { RR_COS_DIST, RR_IP_NEG, RR_L2_DIST } rr_metric_t;
+
+/* Translate an HNSW metric code into the local distance metric. */
+static rr_metric_t rr_metric_from_hnsw(int32_t m) {
+    const ray_hnsw_metric_t hm = (ray_hnsw_metric_t)m;
+    if (hm == RAY_HNSW_L2) return RR_L2_DIST;
+    if (hm == RAY_HNSW_IP) return RR_IP_NEG;
+    /* RAY_HNSW_COSINE and any unrecognised value both map to cosine. */
+    return RR_COS_DIST;
+}
+
+static double rr_row_dist(rr_metric_t m, ray_t* row,   /* distance of `row` to query `q` */
+                           const double* q, double q_norm, int32_t dim) {
+    double sum = 0.0, row_sq = 0.0;
+    if (m == RR_L2_DIST) {                      /* Euclidean distance */
+        for (int32_t j = 0; j < dim; j++) {
+            const double diff = rr_at_f64(row, j) - q[j];
+            sum += diff * diff;
+        }
+        return sqrt(sum);
+    }
+    const bool want_norm = (m == RR_COS_DIST);
+    for (int32_t j = 0; j < dim; j++) {         /* dot product, plus |row|^2 for cosine */
+        const double x = rr_at_f64(row, j);
+        sum += x * q[j];
+        if (want_norm) row_sq += x * x;
+    }
+    if (m == RR_IP_NEG) return -sum;            /* negated inner product */
+    const double denom = q_norm * sqrt(row_sq);
+    return 1.0 - ((denom > 0.0) ? sum / denom : 0.0);   /* cosine; similarity is 0 unless denom > 0 */
+}
+
+/* Build an empty-rows clone of the source schema plus a trailing _dist
+ * column (F64, len=0).  Used for both the "source is empty" and "filter
+ * rejected everything" cases so callers always get the source columns
+ * followed by _dist.  Returns NULL when building a column fails. */
+static ray_t* empty_result_with_dist(ray_t* src) {
+    int64_t ncols = ray_table_ncols(src);
+    ray_t* out = ray_table_new(ncols + 1);
+    if (!out || RAY_IS_ERR(out)) return NULL;
+    for (int64_t c = 0; c < ncols; c++) {
+        ray_t* sc = ray_table_get_col_idx(src, c);
+        if (!sc) continue;   /* a missing source column is skipped, not treated as an error */
+        int8_t ct = RAY_IS_PARTED(sc->type)   /* PARTED columns are cloned as their base type */
+                  ? (int8_t)RAY_PARTED_BASETYPE(sc->type) : sc->type;
+        ray_t* nc = (ct == RAY_LIST) ? ray_list_new(0) : ray_vec_new(ct, 0);
+        if (!nc || RAY_IS_ERR(nc)) { ray_release(out); return NULL; }
+        nc->len = 0;
+        out = ray_table_add_col(out, ray_table_col_name(src, c), nc);
+        ray_release(nc);   /* drop our local reference to the new column */
+        if (RAY_IS_ERR(out)) return NULL;
+    }
+    ray_t* dv = ray_vec_new(RAY_F64, 0);
+    if (!dv || RAY_IS_ERR(dv)) { ray_release(out); return NULL; }
+    dv->len = 0;
+    out = ray_table_add_col(out, sym_intern_safe("_dist", 5), dv);
+    ray_release(dv);
+    return out;   /* returned as-is: the result of the final add_col is not checked here */
+}
+
+/* Gather k rows of `tbl` at dense `rowids[]` and append an F64 `_dist`
+ * column holding the parallel `dists[]`.  Caller owns the returned table.
+ * Returns NULL on failure, or a "nyi" error object for PARTED columns. */
+static ray_t* gather_rows_with_dist(ray_t* tbl,
+                                     const int64_t* rowids, const double* dists,
+                                     int64_t k) {
+    int64_t ncols = ray_table_ncols(tbl);
+    ray_t* result = ray_table_new(ncols + 1);
+    if (!result || RAY_IS_ERR(result)) return NULL;
+
+    for (int64_t c = 0; c < ncols; c++) {
+        ray_t* src_col = ray_table_get_col_idx(tbl, c);
+        if (!src_col) { ray_release(result); return NULL; }
+
+        /* PARTED columns carry ray_t** segment pointers in their data
+         * region, not raw element bytes — the byte-wise gather below
+         * would read pointer values as column data.  Reject with a clear
+         * error rather than produce garbage; PARTED support is future work. */
+        if (RAY_IS_PARTED(src_col->type)) {
+            ray_release(result);
+            return ray_error("nyi",
+                "nearest: PARTED columns not supported in result projection");
+        }
+
+        int8_t ct = src_col->type;
+
+        /* Allocate the destination column with the right shape.  col_vec_new
+         * handles SYM width preservation; LIST uses its own constructor. */
+        ray_t* new_col = (ct == RAY_LIST) ? ray_list_new(k) : col_vec_new(src_col, k);
+        if (!new_col || RAY_IS_ERR(new_col)) { ray_release(result); return NULL; }
+        new_col->len = k;
+
+        if (ct == RAY_LIST) {
+            ray_t** d = (ray_t**)ray_data(new_col);
+            ray_t** s = (ray_t**)ray_data(src_col);
+            for (int64_t i = 0; i < k; i++) {
+                d[i] = s[rowids[i]];   /* rowids[] is trusted: no bounds check in this function */
+                if (d[i]) ray_retain(d[i]);   /* new list holds its own reference to each element */
+            }
+        } else {
+            /* All fixed-width types (including SYM at any width, RAY_STR's
+             * 16-byte inline cells, DATE/TIME/TIMESTAMP, GUID) go through
+             * byte-wise memcpy driven by the column's element size.
+             * Mirrors sel_compact's gather convention. */
+            uint8_t esz = col_esz(src_col);
+            char* dst = (char*)ray_data(new_col);
+            const char* src = (const char*)ray_data(src_col);
+            for (int64_t i = 0; i < k; i++)
+                memcpy(dst + i * esz, src + rowids[i] * esz, esz);
+
+            /* RAY_STR: share the source pool (inline bytes reference
+             * pooled long-string data). */
+            if (ct == RAY_STR) col_propagate_str_pool(new_col, src_col);
+
+            /* RAY_SYM: propagate the per-vector sym_dict so narrow-width
+             * local indices resolve against the same dictionary.  For
+             * sliced SYM columns the sym_dict lives on the slice_parent
+             * (the slice's own union slot holds slice_parent/offset).
+             * Guards against the inline-nullmap aliasing mirror sort.c:3307. */
+            if (ct == RAY_SYM) {
+                const ray_t* dict_owner = (src_col->attrs & RAY_ATTR_SLICE)
+                                        ? src_col->slice_parent : src_col;
+                if (dict_owner &&
+                    (!(dict_owner->attrs & RAY_ATTR_HAS_NULLS) ||
+                     (dict_owner->attrs & RAY_ATTR_NULLMAP_EXT)) &&
+                    dict_owner->sym_dict) {
+                    ray_retain(dict_owner->sym_dict);
+                    new_col->sym_dict = dict_owner->sym_dict;
+                }
+            }
+
+            /* Null bitmap: the shared col_propagate_nulls_gather only
+             * inspects src's own attrs — for a sliced src it misses
+             * HAS_NULLS on the parent.  Mirror sort.c:3315's slice-aware
+             * check so sliced source columns don't lose their nulls. */
+            bool src_has_nulls =
+                (src_col->attrs & RAY_ATTR_HAS_NULLS) ||
+                ((src_col->attrs & RAY_ATTR_SLICE) && src_col->slice_parent &&
+                 (src_col->slice_parent->attrs & RAY_ATTR_HAS_NULLS));
+            if (src_has_nulls) {
+                for (int64_t r = 0; r < k; r++) {
+                    if (ray_vec_is_null(src_col, rowids[r]))
+                        ray_vec_set_null(new_col, r, true);
+                }
+            }
+        }
+
+        ray_t* prev = result;
+        result = ray_table_add_col(result, ray_table_col_name(tbl, c), new_col);
+        ray_release(new_col);
+        if (!result || RAY_IS_ERR(result)) {
+            /* ray_table_add_col's error paths don't release the input
+             * table when they fail mid-way (cow may have returned the
+             * same pointer).  Release our prior accumulator to avoid
+             * leaking the partially-built table and its retained cols. */
+            if (prev && !RAY_IS_ERR(prev) && prev != result) ray_release(prev);
+            return NULL;
+        }
+    }
+
+    /* Append _dist column */
+    ray_t* dist_vec = ray_vec_new(RAY_F64, k);
+    if (!dist_vec || RAY_IS_ERR(dist_vec)) { ray_release(result); return NULL; }
+    dist_vec->len = k;
+    double* dd = (double*)ray_data(dist_vec);
+    for (int64_t i = 0; i < k; i++) dd[i] = dists[i];
+    ray_t* prev = result;
+    result = ray_table_add_col(result, sym_intern_safe("_dist", 5), dist_vec);
+    ray_release(dist_vec);
+    if (!result || RAY_IS_ERR(result)) {
+        if (prev && !RAY_IS_ERR(prev) && prev != result) ray_release(prev);   /* same cleanup as in the column loop */
+        return NULL;
+    }
+    return result;
+}
+
+/* Turn the graph's pending row selection (if any) into a dense rowid array.
+ *
+ * Outcomes, encoded as (return value, *count):
+ *   - (NULL, nrows)        — no selection present: every row is accepted.
+ *   - (NULL, 0)            — the selection passes no rows.
+ *   - (NULL, -1)           — allocation failed; the caller must report OOM.
+ *   - (buffer, total_pass) — explicit rowids; buffer comes from ray_sys_alloc.
+ *
+ * Whenever a selection was present it is detached from `g` and released
+ * before returning, on success and failure alike. */
+static int64_t* accepted_rowids(ray_graph_t* g, int64_t nrows, int64_t* count) {
+    ray_t* sel = g->selection;
+    if (sel == NULL) {
+        *count = nrows;
+        return NULL;
+    }
+
+    const int64_t n_pass = ray_rowsel_meta(sel)->total_pass;
+
+    /* Detach first: from here on every path ends at the single release
+     * below, so g->selection is never left pointing at a consumed block. */
+    g->selection = NULL;
+
+    int64_t* dense   = NULL;
+    int64_t  outcome = 0;          /* stays 0 when nothing passed */
+
+    if (n_pass != 0) {
+        outcome = -1;              /* assume OOM until the copy succeeds */
+        /* Materialise the passing rowids, then copy them into a plain
+         * buffer that the caller releases with ray_sys_free. */
+        ray_t* idx_blk = ray_rowsel_to_indices(sel);
+        if (idx_blk) {
+            const size_t bytes = (size_t)n_pass * sizeof(int64_t);
+            dense = (int64_t*)ray_sys_alloc(bytes);
+            if (dense) {
+                memcpy(dense, ray_data(idx_blk), bytes);
+                outcome = n_pass;
+            }
+            ray_release(idx_blk);
+        }
+    }
+
+    ray_release(sel);
+    *count = outcome;
+    return dense;
+}
+
+/* Bounded max-heap for top-K selection by distance (lower=closer): the root
+ * is the farthest kept entry.  Mirrors src/ops/embedding.c:ray_knn_fn. */
+typedef struct { double d; int64_t id; } rr_ent_t;
+
+static void rr_heap_insert(rr_ent_t* heap, int64_t k, int64_t* size,
+                            double d, int64_t id) {
+    const rr_ent_t fresh = { d, id };
+    if (*size < k) {
+        int64_t cur = (*size)++;            /* room left: append, then sift up */
+        heap[cur] = fresh;
+        for (int64_t up; cur > 0; cur = up) {
+            up = (cur - 1) / 2;
+            if (heap[up].d >= heap[cur].d) break;
+            rr_ent_t tmp = heap[up]; heap[up] = heap[cur]; heap[cur] = tmp;
+        }
+        return;
+    }
+    if (!(d < heap[0].d)) return;           /* full, and not closer than the root */
+    heap[0] = fresh;                        /* replace the root, then sift down */
+    for (int64_t cur = 0, top; ; cur = top) {
+        const int64_t lhs = 2 * cur + 1, rhs = 2 * cur + 2;
+        top = cur;
+        if (lhs < *size && heap[lhs].d > heap[top].d) top = lhs;
+        if (rhs < *size && heap[rhs].d > heap[top].d) top = rhs;
+        if (top == cur) break;
+        rr_ent_t tmp = heap[cur]; heap[cur] = heap[top]; heap[top] = tmp;
+    }
+}
+
+/* Order heap[0..size) ascending by distance using insertion sort; size is
+ * expected to be small. */
+static void rr_heap_sort(rr_ent_t* heap, int64_t size) {
+    for (int64_t next = 1; next < size; next++) {
+        const rr_ent_t moving = heap[next];
+        int64_t slot = next;
+        /* Shift strictly-farther entries one step right to open a slot. */
+        for (; slot > 0 && heap[slot - 1].d > moving.d; slot--)
+            heap[slot] = heap[slot - 1];
+        heap[slot] = moving;
+    }
+}
+
+/* ==========================================================================
+ *  exec_ann_rerank — index-backed, filter-aware iterative scan.
+ *
+ *  Pushes the filter's accepted-rowid bitmap into HNSW's beam search as
+ *  a predicate callback (`ray_hnsw_search_filter`).  Rejected nodes are
+ *  still traversed for graph connectivity; only accepted nodes enter the
+ *  result heap.  This replaces the prior oversample+refilter loop which
+ *  degraded to near-full-scan for highly selective filters with no
+ *  recall guarantee.
+ * ========================================================================== */
+
+/* Predicate context — membership bitmap over the index's row space. */
+typedef struct {
+    const uint8_t* member;
+    int64_t        n_nodes;
+} rr_member_ctx_t;
+
+static bool rr_member_accept(int64_t node_id, void* ctx) {   /* true iff node_id's bit is set */
+    const rr_member_ctx_t* set = (const rr_member_ctx_t*)ctx;
+    const bool in_range = node_id >= 0 && node_id < set->n_nodes;
+    return in_range && ((set->member[node_id >> 3] & (1u << (node_id & 7))) != 0);
+}
+
+ray_t* exec_ann_rerank(ray_graph_t* g, ray_op_t* op, ray_t* src) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+    if (!src || src->type != RAY_TABLE) return ray_error("type", NULL);
+
+    ray_hnsw_t* idx    = (ray_hnsw_t*)ext->rerank.hnsw_idx;
+    const float* query = ext->rerank.query_vec;
+    int32_t      dim   = ext->rerank.dim;
+    int64_t      k     = ext->rerank.k;
+    int32_t      ef    = ext->rerank.ef_search;
+    if (!idx || !query || dim <= 0 || k <= 0) return ray_error("schema", NULL);
+    if (dim != idx->dim) return ray_error("length", NULL);
+
+    int64_t src_rows = ray_table_nrows(src);
+
+    /* Special-case empty source: return a well-shaped empty result. */
+    if (src_rows == 0) {
+        if (g->selection) { ray_release(g->selection); g->selection = NULL; }
+        ray_t* r = empty_result_with_dist(src);
+        return r ? r : ray_error("oom", NULL);
+    }
+
+    int64_t accepted_count = 0;
+    int64_t* accepted = accepted_rowids(g, src_rows, &accepted_count);
+    if (accepted_count < 0) return ray_error("oom", NULL);
+    if (accepted_count == 0) {
+        ray_t* r = empty_result_with_dist(src);
+        return r ? r : ray_error("oom", NULL);
+    }
+
+    int64_t n_nodes = idx->n_nodes;
+    int32_t ef_search = ef;
+    if ((int64_t)ef_search < k) ef_search = (int32_t)k;
+
+    int64_t* out_ids = (int64_t*)ray_sys_alloc((size_t)k * sizeof(int64_t));
+    double*  out_ds  = (double*)ray_sys_alloc((size_t)k * sizeof(double));
+    if (!out_ids || !out_ds) {
+        if (out_ids) ray_sys_free(out_ids);
+        if (out_ds)  ray_sys_free(out_ds);
+        if (accepted) ray_sys_free(accepted);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t n_found;
+    if (!accepted) {
+        /* No filter — plain search with no per-candidate callback. */
+        n_found = ray_hnsw_search(idx, query, dim, k, ef_search, out_ids, out_ds);
+    } else {
+        /* Build membership bitmap over the index's row space and hand it
+         * to the filtered iterative scan as a predicate callback. */
+        size_t bm_size = ((size_t)n_nodes + 7) / 8;
+        uint8_t* member = (uint8_t*)ray_sys_alloc(bm_size);
+        if (!member) {
+            ray_sys_free(out_ids); ray_sys_free(out_ds); ray_sys_free(accepted);
+            return ray_error("oom", NULL);
+        }
+        memset(member, 0, bm_size);
+        for (int64_t i = 0; i < accepted_count; i++) {
+            int64_t rid = accepted[i];
+            if (rid >= 0 && rid < n_nodes) member[rid / 8] |= (uint8_t)(1u << (rid % 8));
+        }
+        ray_sys_free(accepted);
+        accepted = NULL;
+
+        rr_member_ctx_t cb_ctx = { .member = member, .n_nodes = n_nodes };
+        n_found = ray_hnsw_search_filter(idx, query, dim, k, ef_search,
+                                          rr_member_accept, &cb_ctx,
+                                          out_ids, out_ds);
+        ray_sys_free(member);
+    }
+    if (accepted) ray_sys_free(accepted);
+
+    /* ray_hnsw_search / _filter return -1 on internal OOM — surface it as
+     * an error rather than silently returning a zero-row table. */
+    if (n_found < 0) {
+        ray_sys_free(out_ids);
+        ray_sys_free(out_ds);
+        return ray_error("oom", NULL);
+    }
+
+    ray_t* result = gather_rows_with_dist(src, out_ids, out_ds, n_found);
+    ray_sys_free(out_ids);
+    ray_sys_free(out_ds);
+    if (!result) return ray_error("oom", NULL);
+    return result;
+}
+
+/* ==========================================================================
+ *  exec_knn_rerank — brute force over a filtered column: scores every accepted row of a RAY_LIST column, keeps the k closest
+ * ========================================================================== */
+
+ray_t* exec_knn_rerank(ray_graph_t* g, ray_op_t* op, ray_t* src) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+    if (!src || src->type != RAY_TABLE) return ray_error("type", NULL);
+
+    int64_t      col_sym = ext->rerank.col_sym;
+    const float* query   = ext->rerank.query_vec;
+    int32_t      dim     = ext->rerank.dim;
+    int64_t      k       = ext->rerank.k;
+    rr_metric_t  metric  = rr_metric_from_hnsw(ext->rerank.metric);
+    if (col_sym <= 0 || !query || dim <= 0 || k <= 0) return ray_error("schema", NULL);
+
+    /* Special-case empty source: return a well-shaped empty result rather
+     * than falling into the top-K code with k_eff=0. */
+    int64_t src_rows = ray_table_nrows(src);
+    if (src_rows == 0) {
+        /* Consume any dangling selection to keep downstream ops clean. */
+        if (g->selection) { ray_release(g->selection); g->selection = NULL; }
+        ray_t* r = empty_result_with_dist(src);
+        return r ? r : ray_error("oom", NULL);
+    }
+
+    /* We walk the ORIGINAL source table and skip non-accepted rows via
+     * an accepted-rowid list.  Avoids sel_compact, which currently
+     * doesn't correctly materialise RAY_LIST columns. */
+    ray_t* col = ray_table_get_col(src, col_sym);
+    if (!col) return ray_error("name", NULL);
+    if (col->type != RAY_LIST) return ray_error("type", NULL);   /* each row must be its own vector */
+
+    int64_t nrows = col->len;
+
+    int64_t  accepted_count = 0;
+    int64_t* accepted = accepted_rowids(g, nrows, &accepted_count);   /* NULL with count == nrows means "no filter" */
+    if (accepted_count < 0) return ray_error("oom", NULL);
+    if (accepted_count == 0) {
+        ray_t* r = empty_result_with_dist(src);
+        return r ? r : ray_error("oom", NULL);
+    }
+
+    /* Convert query float* → double[] + norm. */
+    double* q_buf = (double*)ray_sys_alloc((size_t)dim * sizeof(double));
+    if (!q_buf) { if (accepted) ray_sys_free(accepted); return ray_error("oom", NULL); }
+    double q_norm_sq = 0.0;
+    for (int32_t j = 0; j < dim; j++) {
+        q_buf[j] = (double)query[j];
+        q_norm_sq += q_buf[j] * q_buf[j];
+    }
+    double q_norm = sqrt(q_norm_sq);
+
+    int64_t k_eff = k;
+    if (k_eff > accepted_count) k_eff = accepted_count;   /* heap never needs more slots than candidate rows */
+
+    rr_ent_t* heap = (rr_ent_t*)ray_sys_alloc((size_t)k_eff * sizeof(rr_ent_t));
+    if (!heap) {
+        ray_sys_free(q_buf); if (accepted) ray_sys_free(accepted);
+        return ray_error("oom", NULL);
+    }
+    int64_t heap_size = 0;
+
+    /* Walk accepted rows — identity scan if no filter, dense rowid list otherwise. */
+    if (accepted) {
+        for (int64_t ai = 0; ai < accepted_count; ai++) {
+            int64_t i = accepted[ai];
+            if (i < 0 || i >= nrows) continue;   /* rowids outside the column are ignored */
+            ray_t* row = ray_list_get(col, i);
+            if (!rr_is_numeric(row) || row->len != dim) {   /* any malformed row aborts the whole query */
+                ray_sys_free(heap); ray_sys_free(q_buf); ray_sys_free(accepted);
+                return ray_error("type", NULL);
+            }
+            double d = rr_row_dist(metric, row, q_buf, q_norm, dim);
+            rr_heap_insert(heap, k_eff, &heap_size, d, i);
+        }
+    } else {
+        for (int64_t i = 0; i < nrows; i++) {
+            ray_t* row = ray_list_get(col, i);
+            if (!rr_is_numeric(row) || row->len != dim) {
+                ray_sys_free(heap); ray_sys_free(q_buf);
+                return ray_error("type", NULL);
+            }
+            double d = rr_row_dist(metric, row, q_buf, q_norm, dim);
+            rr_heap_insert(heap, k_eff, &heap_size, d, i);
+        }
+    }
+    ray_sys_free(q_buf);
+    if (accepted) ray_sys_free(accepted);
+
+    rr_heap_sort(heap, heap_size);   /* ascending by distance */
+
+    int64_t* out_ids = (int64_t*)ray_sys_alloc((size_t)heap_size * sizeof(int64_t));
+    double*  out_ds  = (double*)ray_sys_alloc((size_t)heap_size * sizeof(double));
+    if ((!out_ids || !out_ds) && heap_size > 0) {   /* a NULL from a zero-size request is not treated as OOM */
+        if (out_ids) ray_sys_free(out_ids);
+        if (out_ds)  ray_sys_free(out_ds);
+        ray_sys_free(heap);
+        return ray_error("oom", NULL);
+    }
+    for (int64_t i = 0; i < heap_size; i++) {   /* split the entries into parallel id / distance arrays */
+        out_ids[i] = heap[i].id;
+        out_ds[i]  = heap[i].d;
+    }
+    ray_sys_free(heap);
+
+    ray_t* result = gather_rows_with_dist(src, out_ids, out_ds, heap_size);
+    if (out_ids) ray_sys_free(out_ids);
+    if (out_ds)  ray_sys_free(out_ds);
+    if (!result) return ray_error("oom", NULL);   /* a non-NULL error object is passed through as-is */
+    return result;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/rowsel.c b/crates/rayforce-sys/vendor/rayforce/src/ops/rowsel.c
new file mode 100644
index 0000000..aa83b2d
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/rowsel.c
@@ -0,0 +1,445 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+/*
+ * ray_rowsel — implementation.  See src/ops/rowsel.h for the data
+ * layout and lifetime contract.
+ */
+
+#include "ops/rowsel.h"
+#include "ops/ops.h"
+#include "core/pool.h"
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+
+/* ──────────────────────────────────────────────────────────────────
+ * Allocation helpers
+ * ────────────────────────────────────────────────────────────────── */
+
+ray_t* ray_rowsel_new(int64_t nrows, int64_t total_pass, int64_t idx_count) {
+    /* Counts must satisfy 0 <= idx_count <= total_pass <= nrows. */
+    if (nrows < 0 || total_pass < 0 || idx_count < 0) return NULL;
+    if (total_pass > nrows || idx_count > total_pass) return NULL;
+    ray_t* blk = ray_alloc(ray_rowsel_payload_bytes(nrows, idx_count));
+    if (blk == NULL) return NULL;
+
+    /* The data area is not zeroed by ray_alloc (only its own header
+     * is), so each meta field is written explicitly.  The flag, offset
+     * and idx arrays are left for the producer to fill. */
+    uint32_t segs = 0;
+    if (nrows > 0)
+        segs = (uint32_t)((nrows + RAY_MORSEL_ELEMS - 1) / RAY_MORSEL_ELEMS);
+    ray_rowsel_t* meta = ray_rowsel_meta(blk);
+    meta->total_pass = total_pass;
+    meta->nrows      = nrows;
+    meta->n_segs     = segs;
+    meta->_pad       = 0;
+    return blk;
+}
+
+void ray_rowsel_release(ray_t* block) {
+    if (block != NULL) ray_release(block);   /* NULL is a no-op */
+}
+
+/* ──────────────────────────────────────────────────────────────────
+ * Producer — parallel two-pass build from a RAY_BOOL pred vec
+ * ────────────────────────────────────────────────────────────────── */
+
+/* Pass 1 worker context.  Each worker owns a disjoint segment range
+ * [start, end) and writes per-segment popcounts into popcount[]. */
+typedef struct {
+    const uint8_t* pred_data;    /* predicate bytes, indexed by global row */
+    int64_t        nrows;        /* row count; clips the final segment */
+    uint32_t*      popcount;     /* one entry per segment */
+} rowsel_pass1_ctx_t;
+
+static void rowsel_pass1_fn(void* vctx, uint32_t worker_id,
+                            int64_t start_seg, int64_t end_seg) {
+    (void)worker_id;
+    const rowsel_pass1_ctx_t* ctx = (const rowsel_pass1_ctx_t*)vctx;
+    const uint8_t* bytes  = ctx->pred_data;
+    const int64_t  limit  = ctx->nrows;
+    uint32_t*      counts = ctx->popcount;
+    /* Tally the non-zero predicate bytes of each segment in range. */
+    for (int64_t s = start_seg; s < end_seg; s++) {
+        const int64_t lo = s * RAY_MORSEL_ELEMS;
+        int64_t hi = lo + RAY_MORSEL_ELEMS;
+        if (hi > limit) hi = limit;   /* final segment may be short */
+        uint32_t hits = 0;
+        for (int64_t row = lo; row < hi; row++)
+            if (bytes[row]) hits++;
+        counts[s] = hits;
+    }
+}
+
+/* Pass 2 worker context.  Each worker owns a disjoint segment range
+ * and writes morsel-local indices into the (already-sized) idx[]
+ * array.  Workers never overlap because each segment's slice
+ * idx[seg_offsets[seg] .. seg_offsets[seg+1]) is exclusive. */
+typedef struct {
+    const uint8_t*  pred_data;    /* predicate bytes, indexed by global row */
+    int64_t         nrows;        /* row count; clips the final segment */
+    const uint8_t*  seg_flags;    /* per-segment flag; only RAY_SEL_MIX is written */
+    const uint32_t* seg_offsets;  /* start of each segment's slice in idx[] */
+    uint16_t*       idx;          /* output: morsel-local positions of passing rows */
+} rowsel_pass2_ctx_t;
+
+static void rowsel_pass2_fn(void* vctx, uint32_t worker_id,
+                            int64_t start_seg, int64_t end_seg) {
+    (void)worker_id;
+    const rowsel_pass2_ctx_t* ctx = (const rowsel_pass2_ctx_t*)vctx;
+    const uint8_t* bytes = ctx->pred_data;
+    const int64_t  limit = ctx->nrows;
+
+    for (int64_t s = start_seg; s < end_seg; s++) {
+        /* Only MIX segments store indices; NONE and ALL are skipped. */
+        if (ctx->seg_flags[s] == RAY_SEL_MIX) {
+            const int64_t lo = s * RAY_MORSEL_ELEMS;
+            int64_t hi = lo + RAY_MORSEL_ELEMS;
+            if (hi > limit) hi = limit;
+            /* Cursor into this segment's slice of idx[].  It is expected
+             * to stop exactly at idx + seg_offsets[s + 1]. */
+            uint16_t* cursor = ctx->idx + ctx->seg_offsets[s];
+            for (int64_t row = lo; row < hi; row++)
+                if (bytes[row]) *cursor++ = (uint16_t)(row - lo);
+        }
+    }
+}
+
+/* Build a rowsel from a RAY_BOOL pred vec; NULL means all-pass or failure. */
+ray_t* ray_rowsel_from_pred(ray_t* pred) {
+    if (!pred || pred->type != RAY_BOOL) return NULL;
+    int64_t nrows = pred->len;
+    if (nrows == 0) {
+        /* Empty source — empty selection.  n_segs is 0, so the single
+         * seg_offsets[0] sentinel is the only array slot to initialize. */
+        ray_t* empty = ray_rowsel_new(0, 0, 0);
+        if (empty) ray_rowsel_offsets(empty)[0] = 0;
+        return empty;
+    }
+
+    const uint8_t* pred_data = (const uint8_t*)ray_data(pred);
+    uint32_t n_segs = (uint32_t)((nrows + RAY_MORSEL_ELEMS - 1) / RAY_MORSEL_ELEMS);
+
+    /* Temporary popcount[seg] buffer, carved from a ray_alloc data area. */
+    ray_t* pop_block = ray_alloc((size_t)n_segs * sizeof(uint32_t));
+    if (!pop_block) return NULL;
+    uint32_t* popcount = (uint32_t*)ray_data(pop_block);
+
+    /* Pass 1 — parallel popcount per segment. */
+    rowsel_pass1_ctx_t p1 = {
+        .pred_data = pred_data,
+        .nrows     = nrows,
+        .popcount  = popcount,
+    };
+    ray_pool_t* pool = ray_pool_get();
+    if (pool && nrows >= RAY_PARALLEL_THRESHOLD)
+        ray_pool_dispatch(pool, rowsel_pass1_fn, &p1, (int64_t)n_segs);
+    else
+        rowsel_pass1_fn(&p1, 0, 0, (int64_t)n_segs);
+
+    /* Single sequential sweep over popcount[]: accumulate total_pass
+     * (ALL + MIX rows, for meta) and idx_count (MIX rows only, for
+     * sizing idx[]).  n_segs is ~10K for a 10M-row table — trivial. */
+    int64_t total_pass = 0;
+    int64_t idx_count  = 0;
+    for (uint32_t s = 0; s < n_segs; s++) {
+        int64_t seg_start = (int64_t)s * RAY_MORSEL_ELEMS;
+        int64_t seg_end   = seg_start + RAY_MORSEL_ELEMS;
+        if (seg_end > nrows) seg_end = nrows;
+        int64_t seg_len = seg_end - seg_start;
+        uint32_t pc = popcount[s];
+        total_pass += pc;
+        if (pc != 0 && (int64_t)pc != seg_len)
+            idx_count += pc;
+    }
+
+    if (total_pass == nrows) {
+        /* All rows pass — convention is "no selection". */
+        ray_release(pop_block);
+        return NULL;
+    }
+
+    /* Allocate the result block sized for the MIX-contributed
+     * indices only.  ALL and NONE segments add nothing to idx[]. */
+    ray_t* block = ray_rowsel_new(nrows, total_pass, idx_count);
+    if (!block) {
+        ray_release(pop_block);
+        return NULL;
+    }
+
+    /* Fill seg_flags + seg_offsets in a second sequential walk over
+     * popcount[]; cum is the running prefix sum of MIX indices. */
+    uint8_t*  seg_flags   = ray_rowsel_flags(block);
+    uint32_t* seg_offsets = ray_rowsel_offsets(block);
+    uint32_t cum = 0;
+    for (uint32_t s = 0; s < n_segs; s++) {
+        seg_offsets[s] = cum;
+        int64_t seg_start = (int64_t)s * RAY_MORSEL_ELEMS;
+        int64_t seg_end   = seg_start + RAY_MORSEL_ELEMS;
+        if (seg_end > nrows) seg_end = nrows;
+        int64_t seg_len = seg_end - seg_start;
+        uint32_t pc = popcount[s];
+        if (pc == 0) {
+            seg_flags[s] = RAY_SEL_NONE;
+        } else if ((int64_t)pc == seg_len) {
+            seg_flags[s] = RAY_SEL_ALL;   /* adds nothing to idx[]; cum unchanged */
+        } else {
+            seg_flags[s] = RAY_SEL_MIX;
+            cum += pc;
+        }
+    }
+    seg_offsets[n_segs] = cum;
+
+    /* Pass 2 — parallel index write into idx[]. */
+    if (cum > 0) {
+        rowsel_pass2_ctx_t p2 = {
+            .pred_data   = pred_data,
+            .nrows       = nrows,
+            .seg_flags   = seg_flags,
+            .seg_offsets = seg_offsets,
+            .idx         = ray_rowsel_idx(block),
+        };
+        if (pool && nrows >= RAY_PARALLEL_THRESHOLD)
+            ray_pool_dispatch(pool, rowsel_pass2_fn, &p2, (int64_t)n_segs);
+        else
+            rowsel_pass2_fn(&p2, 0, 0, (int64_t)n_segs);
+    }
+
+    ray_release(pop_block);
+    return block;
+}
+
+/* ──────────────────────────────────────────────────────────────────
+ * ray_rowsel_to_indices — flatten to a dense int64 array
+ * ────────────────────────────────────────────────────────────────── */
+
+/* Worker context for ray_rowsel_to_indices (flat offsets are 64-bit). */
+typedef struct {
+    const uint8_t*  flags;
+    const uint32_t* offsets;
+    const uint16_t* idx;
+    const int64_t*  flat_offsets;  /* per-segment offset into out[] */
+    int64_t*        out;
+    int64_t         nrows;
+} rowsel_to_idx_ctx_t;
+/* Worker: expand segments [start_seg, end_seg) into global row ids in out[]. */
+static void rowsel_to_idx_fn(void* vctx, uint32_t worker_id,
+                             int64_t start_seg, int64_t end_seg) {
+    (void)worker_id;
+    rowsel_to_idx_ctx_t* c = (rowsel_to_idx_ctx_t*)vctx;
+    int64_t nrows = c->nrows;
+    for (int64_t seg = start_seg; seg < end_seg; seg++) {
+        uint8_t f = c->flags[seg];
+        if (f == RAY_SEL_NONE) continue;
+        int64_t base = seg * RAY_MORSEL_ELEMS;
+        int64_t end  = base + RAY_MORSEL_ELEMS;
+        if (end > nrows) end = nrows;
+        int64_t j = c->flat_offsets[seg];
+        if (f == RAY_SEL_ALL) {
+            for (int64_t r = base; r < end; r++) c->out[j++] = r;
+        } else {
+            const uint16_t* slice = c->idx + c->offsets[seg];
+            uint32_t n = c->offsets[seg + 1] - c->offsets[seg];
+            for (uint32_t i = 0; i < n; i++) c->out[j++] = base + slice[i];
+        }
+    }
+}
+/* Flatten sel into an int64_t[total_pass] block; NULL if sel is NULL or OOM. */
+ray_t* ray_rowsel_to_indices(ray_t* sel) {
+    if (!sel) return NULL;
+    ray_rowsel_t*   m       = ray_rowsel_meta(sel);
+    const uint8_t*  flags   = ray_rowsel_flags(sel);
+    const uint32_t* offsets = ray_rowsel_offsets(sel);
+    const uint16_t* idx     = ray_rowsel_idx(sel);
+    int64_t nrows      = m->nrows;
+    int64_t total_pass = m->total_pass;
+    uint32_t n_segs    = m->n_segs;
+
+    ray_t* block = ray_alloc((size_t)total_pass * sizeof(int64_t));
+    if (!block) return NULL;
+    int64_t* out = (int64_t*)ray_data(block);
+
+    if (total_pass == 0 || n_segs == 0) return block;
+
+    /* Build per-segment flat offsets into out[] with a sequential prefix
+     * sum.  Kept in int64_t: total_pass may exceed the uint32_t range. */
+    ray_t* fo_block = ray_alloc((size_t)n_segs * sizeof(int64_t));
+    if (!fo_block) { ray_release(block); return NULL; }
+    int64_t* flat_offsets = (int64_t*)ray_data(fo_block);
+    int64_t cum = 0;
+    for (uint32_t s = 0; s < n_segs; s++) {
+        flat_offsets[s] = cum;
+        uint8_t f = flags[s];
+        if (f == RAY_SEL_NONE) continue;
+        if (f == RAY_SEL_ALL) {
+            int64_t base = (int64_t)s * RAY_MORSEL_ELEMS;
+            int64_t end  = base + RAY_MORSEL_ELEMS;
+            if (end > nrows) end = nrows;
+            cum += end - base;
+        } else {
+            cum += (int64_t)(offsets[s + 1] - offsets[s]);
+        }
+    }
+
+    /* Parallel write: each worker fills its own segment range into
+     * out[] using flat_offsets to find the start of each segment.
+     * Slices are non-overlapping by construction. */
+    rowsel_to_idx_ctx_t ctx = {
+        .flags        = flags,
+        .offsets      = offsets,
+        .idx          = idx,
+        .flat_offsets = flat_offsets,
+        .out          = out,
+        .nrows        = nrows,
+    };
+    ray_pool_t* pool = ray_pool_get();
+    if (pool && nrows >= RAY_PARALLEL_THRESHOLD)
+        ray_pool_dispatch(pool, rowsel_to_idx_fn, &ctx, (int64_t)n_segs);
+    else
+        rowsel_to_idx_fn(&ctx, 0, 0, (int64_t)n_segs);
+
+    ray_release(fo_block);
+    return block;
+}
+
+/* ──────────────────────────────────────────────────────────────────
+ * Refine — chained filter
+ * ────────────────────────────────────────────────────────────────── */
+
+/* refine: AND an existing selection with a fresh predicate.  Walks
+ * `existing`'s surviving rows, tests pred at each, emits a new
+ * selection.  Sequential — chained filters are typically applied to
+ * already-shrunk row sets where parallelism doesn't pay back the
+ * dispatch overhead.  Phase 2 will revisit if measurement disagrees. */
+ray_t* ray_rowsel_refine(ray_t* existing, ray_t* pred) {
+    if (!existing) return ray_rowsel_from_pred(pred);   /* no prior selection */
+    if (!pred || pred->type != RAY_BOOL) return NULL;
+
+    ray_rowsel_t*  em = ray_rowsel_meta(existing);
+    int64_t        nrows = em->nrows;
+    if (pred->len != nrows) return NULL;   /* pred must cover the same rows */
+
+    const uint8_t*  pred_data    = (const uint8_t*)ray_data(pred);
+    const uint8_t*  e_flags      = ray_rowsel_flags(existing);
+    const uint32_t* e_offsets    = ray_rowsel_offsets(existing);
+    const uint16_t* e_idx        = ray_rowsel_idx(existing);
+    uint32_t        n_segs       = em->n_segs;
+
+    /* Pass 1 — count survivors per segment.  The buffer is zeroed
+     * because NONE segments are skipped below and must read as 0. */
+    ray_t* pop_block = ray_alloc((size_t)n_segs * sizeof(uint32_t));
+    if (!pop_block) return NULL;
+    uint32_t* popcount = (uint32_t*)ray_data(pop_block);
+    memset(popcount, 0, (size_t)n_segs * sizeof(uint32_t));
+
+    int64_t total_pass = 0;
+    int64_t idx_count  = 0;
+    for (uint32_t s = 0; s < n_segs; s++) {
+        uint8_t f = e_flags[s];
+        if (f == RAY_SEL_NONE) continue;
+        int64_t base = (int64_t)s * RAY_MORSEL_ELEMS;
+        int64_t end  = base + RAY_MORSEL_ELEMS;
+        if (end > nrows) end = nrows;
+        int64_t seg_len = end - base;
+        uint32_t n = 0;
+        if (f == RAY_SEL_ALL) {
+            for (int64_t r = base; r < end; r++)
+                n += pred_data[r] != 0;
+        } else { /* MIX */
+            const uint16_t* src = e_idx + e_offsets[s];
+            uint32_t src_n = e_offsets[s + 1] - e_offsets[s];
+            for (uint32_t i = 0; i < src_n; i++) {
+                int64_t r = base + src[i];
+                n += pred_data[r] != 0;
+            }
+        }
+        popcount[s] = n;
+        total_pass += n;
+        /* This segment will be MIX in the output (and contribute to
+         * idx[]) iff some-but-not-all of its rows pass. */
+        if (n != 0 && (int64_t)n != seg_len)
+            idx_count += n;
+    }
+
+    if (total_pass == nrows) {
+        /* Refinement somehow ended up matching every source row.
+         * Should be impossible unless `existing` was already
+         * effectively all-pass and pred is all-true — but handle it. */
+        ray_release(pop_block);
+        return NULL;
+    }
+
+    ray_t* block = ray_rowsel_new(nrows, total_pass, idx_count);
+    if (!block) {
+        ray_release(pop_block);
+        return NULL;
+    }
+    uint8_t*  seg_flags   = ray_rowsel_flags(block);
+    uint32_t* seg_offsets = ray_rowsel_offsets(block);
+    uint16_t* idx_out     = ray_rowsel_idx(block);
+
+    uint32_t cum = 0;   /* running count of indices written to idx_out */
+    for (uint32_t s = 0; s < n_segs; s++) {
+        seg_offsets[s] = cum;
+        int64_t base = (int64_t)s * RAY_MORSEL_ELEMS;
+        int64_t end  = base + RAY_MORSEL_ELEMS;
+        if (end > nrows) end = nrows;
+        int64_t seg_len = end - base;
+        uint32_t pc = popcount[s];
+        if (pc == 0) {
+            seg_flags[s] = RAY_SEL_NONE;
+            continue;
+        }
+        if ((int64_t)pc == seg_len) {
+            seg_flags[s] = RAY_SEL_ALL;
+            continue;
+        }
+        seg_flags[s] = RAY_SEL_MIX;
+
+        /* Pass 2 (inlined, sequential) — write the surviving
+         * morsel-local indices for this segment. */
+        uint16_t* dst = idx_out + cum;
+        uint32_t  dn  = 0;
+        uint8_t f = e_flags[s];
+        if (f == RAY_SEL_ALL) {
+            for (int64_t r = base; r < end; r++)
+                if (pred_data[r])
+                    dst[dn++] = (uint16_t)(r - base);
+        } else { /* MIX in existing */
+            const uint16_t* src = e_idx + e_offsets[s];
+            uint32_t src_n = e_offsets[s + 1] - e_offsets[s];
+            for (uint32_t i = 0; i < src_n; i++) {
+                int64_t r = base + src[i];
+                if (pred_data[r])
+                    dst[dn++] = (uint16_t)(r - base);
+            }
+        }
+        cum += pc;   /* pc is the pass-1 count for this segment */
+    }
+    seg_offsets[n_segs] = cum;   /* closing entry of the prefix sum */
+
+    ray_release(pop_block);
+    return block;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/rowsel.h b/crates/rayforce-sys/vendor/rayforce/src/ops/rowsel.h
new file mode 100644
index 0000000..c28e593
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/rowsel.h
@@ -0,0 +1,187 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+/*
+ * ray_rowsel — morsel-local row-filter selection.
+ *
+ * Replacement for the bitmap (RAY_SEL) form of g->selection used by
+ * OP_FILTER on table inputs.  Stores the surviving rows of a filter
+ * as morsel-local uint16 indices instead of a per-row bitmap, so the
+ * downstream group / sort / agg hot loops iterate only the live rows
+ * with no per-row bitmap test.
+ *
+ * Layout — single ray_alloc block, contiguous payload at ray_data():
+ *
+ *   ray_rowsel_t  meta            (24 bytes; at ray_data(block))
+ *   uint8_t       seg_flags[]     (n_segs, padded to 8-byte boundary)
+ *   uint32_t      seg_offsets[]   (n_segs + 1, prefix sum into idx[])
+ *   uint16_t      idx[]           (idx_count entries: popcounts of
+ *                                  MIX segments only, not total_pass)
+ *
+ * Per-segment flag values are the same NONE / ALL / MIX constants the
+ * existing RAY_SEL bitmap uses (src/ops/ops.h):
+ *   - NONE: no rows in this morsel pass — consumer skips wholesale.
+ *   - ALL:  every row in this morsel passes — seg_offsets[seg+1]
+ *           equals seg_offsets[seg], no indices stored, consumer
+ *           iterates [seg_start, seg_end) densely.
+ *   - MIX:  partial pass — idx[seg_offsets[seg] .. seg_offsets[seg+1])
+ *           holds the morsel-local positions (0..1023) of passing
+ *           rows in segment order.
+ *
+ * Lifetime: single-owner.  Producer (ray_rowsel_from_pred / refine)
+ * returns a fresh ray_t* with rc=1.  Consumer calls ray_rowsel_release
+ * to free.  No COW semantics — selection data is never shared and
+ * never serialized.
+ *
+ * The block is allocated via ray_alloc and uses no specific type tag
+ * (zeroed by ray_alloc); nothing in the runtime dispatches on it.
+ * The accessors below are the only valid way to read its contents.
+ *
+ * Note: this is unrelated to the existing RAY_SEL type tag used by
+ * src/ops/join.c and src/ops/traverse.c as a generic key-bit set.
+ * Those continue to use ray_sel_* unchanged.
+ */
+
+#ifndef RAY_ROWSEL_H
+#define RAY_ROWSEL_H
+
+#include "rayforce.h"
+#include "ops/ops.h"   /* RAY_SEL_NONE/ALL/MIX, RAY_MORSEL_ELEMS */
+
+#include <stdint.h>
+
+/* Morsel-local indices run 0 .. RAY_MORSEL_ELEMS-1 and must fit uint16_t. */
+_Static_assert(RAY_MORSEL_ELEMS <= 65536,
+               "morsel size exceeds uint16_t index range");
+
+/* Inline header at ray_data(block).  Pointer fields are NOT stored
+ * here — the accessor inlines below recompute them from this header's
+ * n_segs.  The payload arrays live immediately after this struct in
+ * the same allocation. */
+typedef struct {
+    int64_t  total_pass;   /* number of passing rows                   */
+    int64_t  nrows;        /* source row count this selection covers   */
+    uint32_t n_segs;       /* ceil(nrows / RAY_MORSEL_ELEMS)            */
+    uint32_t _pad;         /* explicit padding; set to 0 by ray_rowsel_new */
+} ray_rowsel_t;
+
+/* Round n up to a multiple of 8 so the next array starts aligned. */
+static inline size_t ray_rowsel_pad8(size_t n) {
+    return ((n + 7u) >> 3) << 3;   /* bump past the boundary, drop low 3 bits */
+}
+
+static inline ray_rowsel_t* ray_rowsel_meta(ray_t* block) {
+    return (ray_rowsel_t*)ray_data(block);   /* header sits at offset 0 */
+}
+/* seg_flags[] starts right after the fixed-size meta header. */
+static inline uint8_t* ray_rowsel_flags(ray_t* block) {
+    return &((uint8_t*)ray_data(block))[sizeof(ray_rowsel_t)];
+}
+/* seg_offsets[] follows the flags, which are padded to a multiple of 8. */
+static inline uint32_t* ray_rowsel_offsets(ray_t* block) {
+    size_t flag_bytes = ray_rowsel_pad8(ray_rowsel_meta(block)->n_segs);
+    return (uint32_t*)(ray_rowsel_flags(block) + flag_bytes);
+}
+/* idx[] follows the n_segs + 1 entries of seg_offsets[]. */
+static inline uint16_t* ray_rowsel_idx(ray_t* block) {
+    uint32_t n_offsets = ray_rowsel_meta(block)->n_segs + 1;
+    return (uint16_t*)(ray_rowsel_offsets(block) + n_offsets);
+}
+
+/* Compute the total bytes needed for the inline payload.
+ * `idx_count` is the number of uint16_t entries the idx[] array
+ * needs to hold — this is the sum of popcounts over MIX segments
+ * only, NOT the total passing-row count.  ALL segments contribute
+ * zero to idx[]. */
+static inline size_t ray_rowsel_payload_bytes(int64_t nrows, int64_t idx_count) {
+    uint32_t segs = nrows > 0
+        ? (uint32_t)((nrows + RAY_MORSEL_ELEMS - 1) / RAY_MORSEL_ELEMS) : 0;
+    size_t bytes = sizeof(ray_rowsel_t) + ray_rowsel_pad8(segs);   /* meta + flags */
+    bytes += (size_t)(segs + 1) * sizeof(uint32_t);                /* seg_offsets[] */
+    bytes += (size_t)idx_count * sizeof(uint16_t);                 /* idx[] */
+    return bytes;
+}
+
+/* Allocate a rowsel block.
+ *
+ * `nrows`      — source row count this selection covers.
+ * `total_pass` — number of passing rows (ALL + MIX).  Stored in
+ *                meta; consumers read it for sizing decisions.
+ * `idx_count`  — number of uint16_t slots the idx[] array needs.
+ *                Equal to the sum of popcounts over segments
+ *                tagged MIX in the final layout.  ALL and NONE
+ *                segments contribute zero.
+ *
+ * Header fields are populated; arrays are uninitialized.  Caller
+ * fills seg_flags, seg_offsets, and idx, then hands the block off
+ * (g->selection, etc.) or releases via ray_rowsel_release.
+ * Returns NULL on OOM. */
+ray_t* ray_rowsel_new(int64_t nrows, int64_t total_pass, int64_t idx_count);
+
+/* Release a rowsel block.  Equivalent to ray_release / ray_free of
+ * the underlying allocation — exposed under its own name for clarity
+ * at call sites. */
+void ray_rowsel_release(ray_t* block);
+
+/* Build a rowsel from a RAY_BOOL predicate vector.
+ *
+ * pred must be a flat RAY_BOOL vec (byte-per-row).  Returns:
+ *   - NULL if all rows pass (the all-pass convention is "no selection",
+ *     same as g->selection == NULL); also NULL on a bad pred or OOM.
+ *   - A fresh rowsel block (rc=1) otherwise, including the
+ *     none-pass case (zero-length idx, all flags NONE).
+ *
+ * The build runs in two parallel passes when nrows is large enough
+ * to benefit (>= RAY_PARALLEL_THRESHOLD): pass 1 computes per-segment
+ * popcounts, a sequential walk derives seg_flags and the seg_offsets
+ * prefix sum, pass 2 writes the morsel-local indices into idx[] (each
+ * worker writes its own non-overlapping slice).  Smaller pred vecs run
+ * the same logic single-threaded. */
+ray_t* ray_rowsel_from_pred(ray_t* pred);
+
+/* Flatten a rowsel into a dense int64 array of global row indices,
+ * sorted ascending.  Length of the array is `meta->total_pass`.
+ *
+ * Returned block is a ray_t* byte buffer whose ray_data() points to
+ * an `int64_t[total_pass]`.  Consumer gets a raw pointer via
+ * ray_data() and releases the block when done via ray_release.
+ * Returns NULL on OOM.
+ *
+ * Used by exec_group and similar consumers that can't cheaply walk
+ * the morsel-local rowsel inline (yet) — they dispatch workers over
+ * [0, total_pass) using the flattened indices directly. */
+ray_t* ray_rowsel_to_indices(ray_t* sel);
+
+/* Refine an existing rowsel by AND-ing it with a fresh predicate vec.
+ *
+ * Used by chained OP_FILTER on a table input that already has a
+ * g->selection.  Walks `existing`'s surviving rows, tests pred at each,
+ * emits a new rowsel containing only the positions that pass both.
+ * Returns NULL if the result is all-pass (impossible here unless existing
+ * was already all-pass), on a bad pred, or on OOM; else a fresh block.
+ *
+ * Does not consume `existing` — caller is responsible for releasing
+ * the old selection after replacing it. */
+ray_t* ray_rowsel_refine(ray_t* existing, ray_t* pred);
+
+#endif /* RAY_ROWSEL_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/sort.c b/crates/rayforce-sys/vendor/rayforce/src/ops/sort.c
new file mode 100644
index 0000000..4b0b502
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/sort.c
@@ -0,0 +1,3682 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "ops/internal.h"
+#include "lang/internal.h"
+#include "ops/ops.h"
+#include "mem/sys.h"
+
+/* --------------------------------------------------------------------------
+ * Sort comparator: compare two row indices across all sort keys.
+ * Returns negative if a < b, positive if a > b, 0 if equal.
+ * -------------------------------------------------------------------------- */
+/* sort_cmp_ctx_t defined in exec_internal.h */
+
+int sort_cmp(const sort_cmp_ctx_t* ctx, int64_t a, int64_t b) {
+    for (uint8_t k = 0; k < ctx->n_sort; k++) {   /* first key that differs decides */
+        ray_t* col = ctx->vecs[k];
+        if (!col) continue;   /* missing key column: counts as a tie on this key */
+        int cmp = 0;          /* stays 0 for column types not handled below */
+        int null_cmp = 0;     /* set when cmp came from null ordering, not values */
+        int desc = ctx->desc ? ctx->desc[k] : 0;
+        int nf = ctx->nulls_first ? ctx->nulls_first[k] : desc;   /* default: nulls lead on DESC keys */
+
+        /* Check null bitmap for both elements */
+        int a_null = ray_vec_is_null(col, a);
+        int b_null = ray_vec_is_null(col, b);
+        if (a_null || b_null) {
+            null_cmp = 1;   /* null placement is absolute: exempt from the DESC flip below */
+            if (a_null && b_null) cmp = 0;
+            else if (a_null) cmp = nf ? -1 : 1;
+            else cmp = nf ? 1 : -1;
+        } else if (col->type == RAY_F64) {
+            double va = ((double*)ray_data(col))[a];
+            double vb = ((double*)ray_data(col))[b];
+            if (va < vb) cmp = -1;
+            else if (va > vb) cmp = 1;   /* a NaN operand fails both tests: cmp stays 0 */
+        } else if (col->type == RAY_I64 || col->type == RAY_TIMESTAMP) {
+            int64_t va = ((int64_t*)ray_data(col))[a];
+            int64_t vb = ((int64_t*)ray_data(col))[b];
+            if (va < vb) cmp = -1;
+            else if (va > vb) cmp = 1;
+        } else if (col->type == RAY_I32) {
+            int32_t va = ((int32_t*)ray_data(col))[a];
+            int32_t vb = ((int32_t*)ray_data(col))[b];
+            if (va < vb) cmp = -1;
+            else if (va > vb) cmp = 1;
+        } else if (RAY_IS_SYM(col->type)) {
+            int64_t va = ray_read_sym(ray_data(col), a, col->type, col->attrs);
+            int64_t vb = ray_read_sym(ray_data(col), b, col->type, col->attrs);
+            ray_t* sa = ray_sym_str(va);
+            ray_t* sb = ray_sym_str(vb);
+            if (sa && sb) cmp = ray_str_cmp(sa, sb);   /* either lookup NULL: cmp stays 0 */
+        } else if (col->type == RAY_I16) {
+            int16_t va = ((int16_t*)ray_data(col))[a];
+            int16_t vb = ((int16_t*)ray_data(col))[b];
+            if (va < vb) cmp = -1;
+            else if (va > vb) cmp = 1;
+        } else if (col->type == RAY_BOOL || col->type == RAY_U8) {
+            uint8_t va = ((uint8_t*)ray_data(col))[a];
+            uint8_t vb = ((uint8_t*)ray_data(col))[b];
+            if (va < vb) cmp = -1;
+            else if (va > vb) cmp = 1;
+        } else if (col->type == RAY_DATE || col->type == RAY_TIME) {
+            int32_t va = ((int32_t*)ray_data(col))[a];   /* read as 32-bit, like RAY_I32 */
+            int32_t vb = ((int32_t*)ray_data(col))[b];
+            if (va < vb) cmp = -1;
+            else if (va > vb) cmp = 1;
+        } else if (col->type == RAY_GUID) {
+            const uint8_t* base = (const uint8_t*)ray_data(col);
+            cmp = memcmp(base + a * 16, base + b * 16, 16);   /* 16-byte elements, bytewise order */
+        } else if (col->type == RAY_STR) {
+            const ray_str_t* elems;
+            const char* pool;
+            str_resolve(col, &elems, &pool);
+            cmp = ray_str_t_cmp(&elems[a], pool, &elems[b], pool);   /* both rows share one pool */
+        }
+
+        if (desc && !null_cmp) cmp = -cmp;   /* descending key: invert value order only */
+        if (cmp != 0) return cmp;
+    }
+    return 0;   /* rows tie on every key */
+}
+
+/* --------------------------------------------------------------------------
+ * Small-array sort: introsort on (key, idx) pairs.
+ *
+ * For arrays ≤ RADIX_SORT_THRESHOLD, a single-pass encode + comparison sort
+ * beats multi-pass radix sort.  Uses quicksort with median-of-3 pivot and
+ * heapsort fallback (introsort) to guarantee O(n log n) worst case.
+ * -------------------------------------------------------------------------- */
+
+/* RADIX_SORT_THRESHOLD, SMALL_POOL_THRESHOLD defined in exec_internal.h */
+
+/* Max-heap sift-down over parallel (keys, idx) arrays: sinks node i in [0, n). */
+static void key_sift_down(uint64_t* keys, int64_t* idx, int64_t n, int64_t i) {
+    uint64_t hk = keys[i];   /* element being sunk, held out of the array */
+    int64_t  hi = idx[i];
+    for (int64_t c = 2 * i + 1; c < n; c = 2 * i + 1) {
+        if (c + 1 < n && keys[c + 1] > keys[c]) c++;   /* larger child; ties pick left */
+        if (keys[c] <= hk) break;                      /* heap property holds here */
+        keys[i] = keys[c]; idx[i] = idx[c]; i = c;
+    }
+    keys[i] = hk; idx[i] = hi;
+}
+
+/* In-place heapsort of (keys, idx) pairs, ascending by key. */
+static void key_heapsort(uint64_t* keys, int64_t* idx, int64_t n) {
+    for (int64_t node = n / 2; node-- > 0; ) key_sift_down(keys, idx, n, node);
+    for (int64_t last = n; --last > 0; ) {   /* move the current max behind the heap */
+        uint64_t mk = keys[last]; keys[last] = keys[0]; keys[0] = mk;
+        int64_t  mi = idx[last];  idx[last]  = idx[0];  idx[0]  = mi;
+        key_sift_down(keys, idx, last, 0);
+    }
+}
+
+/* Insertion sort of (keys, idx) pairs, ascending by key; equal keys keep order. */
+static void key_insertion_sort(uint64_t* keys, int64_t* idx, int64_t n) {
+    for (int64_t next = 1; next < n; next++) {
+        const uint64_t held_key = keys[next];
+        const int64_t  held_idx = idx[next];
+        int64_t slot = next;   /* hole that moves left past strictly larger keys */
+        for (; slot > 0 && keys[slot - 1] > held_key; slot--) {
+            keys[slot] = keys[slot - 1];
+            idx[slot]  = idx[slot - 1];
+        }
+        keys[slot] = held_key;
+        idx[slot]  = held_idx;
+    }
+}
+
+static void key_introsort_impl(uint64_t* keys, int64_t* idx,
+                                 int64_t n, int depth) {   /* depth: quicksort levels left */
+    while (n > 32) {   /* ranges of <= 32 elements are left to the final insertion sort */
+        if (depth == 0) {   /* level budget spent: finish this range with heapsort */
+            key_heapsort(keys, idx, n);
+            return;
+        }
+        depth--;
+
+        /* Median-of-3 pivot */
+        int64_t mid = n / 2;
+        uint64_t a = keys[0], b = keys[mid], c = keys[n-1];
+        int64_t pi;   /* position of the median of (first, middle, last) */
+        if (a < b) pi = (b < c) ? mid : (a < c ? n-1 : 0);
+        else       pi = (a < c) ? 0   : (b < c ? n-1 : mid);
+
+        /* Move pivot to end */
+        uint64_t pk = keys[pi]; keys[pi] = keys[n-1]; keys[n-1] = pk;
+        int64_t  pv = idx[pi];  idx[pi]  = idx[n-1];  idx[n-1]  = pv;
+
+        /* Partition */
+        int64_t lo = 0;   /* [0, lo) holds the keys < pivot seen so far */
+        for (int64_t i = 0; i < n - 1; i++) {
+            if (keys[i] < pk) {
+                uint64_t tk = keys[i]; keys[i] = keys[lo]; keys[lo] = tk;
+                int64_t  ti = idx[i];  idx[i]  = idx[lo];  idx[lo]  = ti;
+                lo++;
+            }
+        }
+        keys[n-1] = keys[lo]; keys[lo] = pk;   /* pivot lands in its final slot lo */
+        idx[n-1]  = idx[lo];  idx[lo]  = pv;
+
+        /* Recurse on smaller partition, iterate on larger */
+        if (lo < n - 1 - lo) {
+            key_introsort_impl(keys, idx, lo, depth);
+            keys += lo + 1; idx += lo + 1; n -= lo + 1;   /* continue with the right part */
+        } else {
+            key_introsort_impl(keys + lo + 1, idx + lo + 1, n - lo - 1, depth);
+            n = lo;   /* continue with the left part */
+        }
+    }
+    key_insertion_sort(keys, idx, n);
+}
+
+/* Introsort entry point: sorts (key, idx) pairs in place, ascending by key. */
+void key_introsort(uint64_t* keys, int64_t* idx, int64_t n) {
+    if (n > 1) {
+        int budget = 0;   /* ends up as 2 * floor(log2(n)) quicksort levels */
+        for (int64_t rest = n; rest > 1; rest /= 2) budget += 2;
+        key_introsort_impl(keys, idx, n, budget);
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Adaptive pre-sort detection.
+ *
+ * Scans encoded keys to detect already-sorted and nearly-sorted data.
+ * Returns a sortedness metric: fraction of out-of-order pairs [0.0, 1.0].
+ *   0.0 = perfectly sorted → skip sort entirely
+ *   small = nearly sorted → prefer comparison-based sort (adaptive mergesort)
+ *   large = random → use radix sort
+ * -------------------------------------------------------------------------- */
+
+typedef struct {
+    const uint64_t* keys;
+    int64_t*        pw_unsorted; /* per-worker out-of-order count */
+} sortedness_ctx_t;
+
+/* Worker: count out-of-order adjacent pairs (keys[i-1], keys[i]) for every
+ * i in [start, end) with i >= 1.  The pair straddling `start` is included,
+ * so the per-range counts add up to the exact global count however the
+ * dispatcher splits [0, n).  keys is only read, so looking one element
+ * behind `start` is safe while other workers run. */
+static void sortedness_fn(void* arg, uint32_t wid, int64_t start, int64_t end) {
+    sortedness_ctx_t* c = (sortedness_ctx_t*)arg;
+    const uint64_t* keys = c->keys;
+    int64_t unsorted = 0;
+    for (int64_t i = (start > 0) ? start : 1; i < end; i++) {
+        if (keys[i] < keys[i - 1]) unsorted++;
+    }
+    c->pw_unsorted[wid] += unsorted;
+}
+
+/* Detect sortedness of encoded keys.
+ *
+ * pool: worker pool, or NULL to force the single-threaded scan.
+ * keys: n encoded sort keys; only read.
+ * Returns the fraction of adjacent pairs that are out of order, in
+ * [0.0, 1.0].  0.0 means already sorted, so the sort can be skipped; below
+ * a threshold (e.g. 0.05) comparison sort is faster than radix. */
+double detect_sortedness(ray_pool_t* pool, const uint64_t* keys, int64_t n) {
+    if (n <= 1) return 0.0;
+
+    int64_t total_unsorted = 0;
+    uint32_t nw = (pool && n > SMALL_POOL_THRESHOLD)
+                      ? ray_pool_total_workers(pool) : 0;
+    if (nw > 0) {
+        /* One counter slot per worker, indexed by wid in sortedness_fn;
+         * the slots are summed once the dispatch has returned. */
+        int64_t pw[nw];
+        memset(pw, 0, (size_t)nw * sizeof(int64_t));
+        sortedness_ctx_t ctx = { .keys = keys, .pw_unsorted = pw };
+        ray_pool_dispatch(pool, sortedness_fn, &ctx, n);
+        for (uint32_t t = 0; t < nw; t++)
+            total_unsorted += pw[t];
+    } else {
+        /* No pool, small input, or a pool reporting zero workers (which
+         * would otherwise request a zero-length VLA). */
+        for (int64_t i = 1; i < n; i++) {
+            if (keys[i] < keys[i - 1]) total_unsorted++;
+        }
+    }
+
+    return (double)total_unsorted / (double)(n - 1);
+}
+
+/* Threshold: if fewer than 5% of pairs are out of order, data is
+ * "nearly sorted" and adaptive comparison sort beats radix. */
+/* NEARLY_SORTED_FRAC, radix_key_bytes defined in exec_internal.h */
+
+/* Scan encoded keys to compute actual significant byte count from data range.
+ * Eliminates histogram passes for bytes that are uniform across all keys. */
+typedef struct {
+    const uint64_t* keys;
+    uint64_t*       pw_or;   /* per-worker XOR-diff accumulator */
+} key_range_ctx_t;
+
+/* Worker: OR together keys[i] ^ keys[0] over [start, end).  Every range is
+ * diffed against the same global reference keys[0], so the merged result
+ * does not depend on how the dispatcher splits [0, n) among workers. */
+static void key_range_fn(void* arg, uint32_t wid, int64_t start, int64_t end) {
+    key_range_ctx_t* c = (key_range_ctx_t*)arg;
+    const uint64_t* keys = c->keys;
+    const uint64_t ref = keys[0];
+    uint64_t local_or = c->pw_or[wid];
+    for (int64_t i = start; i < end; i++)
+        local_or |= keys[i] ^ ref;
+    c->pw_or[wid] = local_or;
+}
+
+/* Number of low-order key bytes a radix sort has to process: one past the
+ * highest byte in which any key differs from keys[0].
+ *
+ * pool may be NULL (serial scan).  type_max caps the result.  Returns 1
+ * when n <= 1; otherwise min(max(nb, 1), type_max). */
+uint8_t compute_key_nbytes(ray_pool_t* pool, const uint64_t* keys,
+                            int64_t n, uint8_t type_max) {
+    if (n <= 1) return 1;
+    uint64_t diff = 0;
+    uint32_t nw = (pool && n > SMALL_POOL_THRESHOLD)
+                      ? ray_pool_total_workers(pool) : 0;
+    if (nw > 0) {
+        /* pw_or[w] accumulates worker w's OR across all of its ranges. */
+        uint64_t pw_or[nw];
+        memset(pw_or, 0, nw * sizeof(uint64_t));
+        key_range_ctx_t ctx = { .keys = keys, .pw_or = pw_or };
+        ray_pool_dispatch(pool, key_range_fn, &ctx, n);
+        for (uint32_t w = 0; w < nw; w++) diff |= pw_or[w];
+    } else {
+        const uint64_t first = keys[0];
+        for (int64_t i = 1; i < n; i++)
+            diff |= keys[i] ^ first;
+    }
+    uint8_t nb = 0;
+    while (diff) { nb++; diff >>= 8; }   /* nb = index of highest differing byte + 1 */
+    if (nb < 1) nb = 1;
+    return nb < type_max ? nb : type_max;
+}
+
+/* --------------------------------------------------------------------------
+ * Parallel LSB radix sort (8-bit digits, 256 buckets)
+ *
+ * Used for single-key sorts on I64/F64/I32/SYM/TIMESTAMP columns,
+ * and composite-key sorts where all keys are integer types with total
+ * bit width <= 64.
+ *
+ * Three phases per byte:
+ *   1. Parallel histogram — each task counts byte occurrences in its range
+ *   2. Sequential prefix-sum — compute per-task scatter offsets
+ *   3. Parallel scatter — write elements to sorted positions
+ *
+ * Byte-skip: after histogram, if all elements share the same byte value,
+ * skip that pass entirely.  Critical for small-range integers where most
+ * upper bytes are identical.
+ * -------------------------------------------------------------------------- */
+
+/* radix_pass_ctx_t defined in exec_internal.h */
+
+/* Phase 1: histogram — task `start` counts one key byte (bits shift..shift+7) over its chunk */
+static void radix_hist_fn(void* arg, uint32_t wid, int64_t start, int64_t end) {
+    (void)wid; (void)end;   /* the slice is chosen by task number, not worker id */
+    radix_pass_ctx_t* c = (radix_pass_ctx_t*)arg;
+    int64_t task = start; /* dispatch_n: [task, task+1) */
+
+    /* Zero histogram slice BEFORE early return — empty tasks must still
+     * clear their slice so the prefix-sum sees zeros, not garbage. */
+    uint32_t* h = c->hist + task * 256;   /* this task's 256 counters */
+    memset(h, 0, 256 * sizeof(uint32_t));
+
+    int64_t chunk = (c->n + c->n_tasks - 1) / c->n_tasks;   /* ceil(n / n_tasks) */
+    int64_t lo = task * chunk;
+    int64_t hi = lo + chunk;
+    if (hi > c->n) hi = c->n;   /* last chunk may be short */
+    if (lo >= hi) return;       /* more tasks than chunks: nothing to count */
+
+    const uint64_t* keys = c->keys;
+    uint8_t shift = c->shift;
+    for (int64_t i = lo; i < hi; i++)
+        h[(keys[i] >> shift) & 0xFF]++;
+}
+
+/* Phase 3: scatter with software write-combining (SWC).
+ * Buffers entries per bucket before flushing, converting random writes
+ * into sequential bursts that are friendlier to the cache hierarchy. */
+#define SWC_N 8  /* entries per bucket buffer: 8*8B = 64B per bucket per array; 2 arrays * 256 buckets = 32KB */
+static void radix_scatter_fn(void* arg, uint32_t wid, int64_t start, int64_t end) {
+    (void)wid; (void)end;
+    radix_pass_ctx_t* c = (radix_pass_ctx_t*)arg;
+    int64_t task = start;   /* task number, as in radix_hist_fn */
+
+    int64_t chunk = (c->n + c->n_tasks - 1) / c->n_tasks;   /* same chunking as radix_hist_fn */
+    int64_t lo = task * chunk;
+    int64_t hi = lo + chunk;
+    if (hi > c->n) hi = c->n;
+    if (lo >= hi) return;
+
+    int64_t* off = c->offsets + task * 256;   /* next output slot per bucket for this task */
+    const uint64_t* k_in = c->keys;
+    const int64_t*  i_in = c->idx;
+    uint64_t* k_out = c->keys_out;
+    int64_t*  i_out = c->idx_out;
+    uint8_t shift = c->shift;
+
+    /* SWC buffers: separate key/idx arrays to match output layout */
+    uint64_t kbuf[256][SWC_N];
+    int64_t  ibuf[256][SWC_N];
+    uint8_t  bcnt[256];   /* entries currently buffered per bucket */
+    memset(bcnt, 0, 256);
+
+    for (int64_t i = lo; i < hi; i++) {
+        uint8_t byte = (k_in[i] >> shift) & 0xFF;
+        uint8_t bp = bcnt[byte];
+        kbuf[byte][bp] = k_in[i];
+        ibuf[byte][bp] = i_in[i];
+        if (++bp == SWC_N) {   /* buffer full: write it out as one contiguous run */
+            int64_t pos = off[byte];
+            memcpy(&k_out[pos], kbuf[byte], SWC_N * sizeof(uint64_t));
+            memcpy(&i_out[pos], ibuf[byte], SWC_N * sizeof(int64_t));
+            off[byte] = pos + SWC_N;
+            bp = 0;
+        }
+        bcnt[byte] = bp;
+    }
+
+    /* Flush remaining entries */
+    for (int b = 0; b < 256; b++) {
+        int64_t pos = off[b];
+        for (uint8_t j = 0; j < bcnt[b]; j++) {   /* in buffered order, so input order is kept */
+            k_out[pos + j] = kbuf[b][j];
+            i_out[pos + j] = ibuf[b][j];
+        }
+        off[b] = pos + bcnt[b];
+    }
+}
+#undef SWC_N
+
+/* Run LSB radix sort on pre-encoded uint64_t keys + int64_t indices.
+ * pool may be NULL (every phase then runs inline as a single task).
+ * n_bytes limits the number of byte passes (1..8) based on key width.
+ * Returns the final sorted index array: `indices`, or `idx_tmp` after an odd
+ * number of executed passes — caller must keep both alive while reading it.
+ * If sorted_keys_out is non-NULL, it receives the matching sorted keys
+ * buffer (`keys` or `keys_tmp`).
+ * Returns NULL if scratch allocation fails (no pass has run at that point). */
+int64_t* radix_sort_run(ray_pool_t* pool,
+                                uint64_t* keys, int64_t* indices,
+                                uint64_t* keys_tmp, int64_t* idx_tmp,
+                                int64_t n, uint8_t n_bytes,
+                                uint64_t** sorted_keys_out) {
+    uint32_t n_tasks = pool ? ray_pool_total_workers(pool) : 1;
+    if (n_tasks < 1) n_tasks = 1;
+
+    ray_t *hist_hdr = NULL, *off_hdr = NULL;
+    uint32_t* hist = (uint32_t*)scratch_alloc(&hist_hdr,
+                        (size_t)n_tasks * 256 * sizeof(uint32_t));   /* 256 counters per task */
+    int64_t* offsets = (int64_t*)scratch_alloc(&off_hdr,
+                        (size_t)n_tasks * 256 * sizeof(int64_t));    /* 256 write cursors per task */
+    if (!hist || !offsets) {
+        scratch_free(hist_hdr); scratch_free(off_hdr);
+        return NULL;
+    }
+
+    uint64_t* src_k = keys,     *dst_k = keys_tmp;
+    int64_t*  src_i = indices,   *dst_i = idx_tmp;
+
+    for (uint8_t bp = 0; bp < n_bytes; bp++) {   /* least significant byte first */
+        uint8_t shift = bp * 8;
+
+        radix_pass_ctx_t ctx = {
+            .keys = src_k, .idx = src_i,
+            .keys_out = dst_k, .idx_out = dst_i,
+            .n = n, .shift = shift, .n_tasks = n_tasks,
+            .hist = hist, .offsets = offsets,
+        };
+
+        /* Phase 1: parallel histogram */
+        if (pool && n_tasks > 1)
+            ray_pool_dispatch_n(pool, radix_hist_fn, &ctx, n_tasks);
+        else
+            radix_hist_fn(&ctx, 0, 0, 1);
+
+        /* Check uniformity via global histogram */
+        bool uniform = false;
+        for (int b = 0; b < 256; b++) {
+            uint32_t total = 0;
+            for (uint32_t t = 0; t < n_tasks; t++)
+                total += hist[t * 256 + b];
+            if (total == (uint32_t)n) { uniform = true; break; }   /* NOTE(review): 32-bit counts assume n < 2^32 — confirm */
+        }
+        if (uniform) continue; /* all same byte — skip this pass */
+
+        /* Phase 2: prefix sum → per-task scatter offsets */
+        int64_t running = 0;
+        for (int b = 0; b < 256; b++) {   /* bucket-major, task-minor: each bucket is laid out in task order */
+            for (uint32_t t = 0; t < n_tasks; t++) {
+                offsets[t * 256 + b] = running;
+                running += hist[t * 256 + b];
+            }
+        }
+
+        /* Phase 3: parallel scatter */
+        if (pool && n_tasks > 1)
+            ray_pool_dispatch_n(pool, radix_scatter_fn, &ctx, n_tasks);
+        else
+            radix_scatter_fn(&ctx, 0, 0, 1);
+
+        /* Swap double-buffer pointers */
+        uint64_t* tk = src_k; src_k = dst_k; dst_k = tk;
+        int64_t*  ti = src_i; src_i = dst_i; dst_i = ti;
+    }
+
+    scratch_free(hist_hdr);
+    scratch_free(off_hdr);
+    if (sorted_keys_out) *sorted_keys_out = src_k;   /* src_* hold the output of the last executed pass */
+    return src_i;  /* pointer to final sorted indices */
+}
+
+/* ============================================================================
+ * Packed radix sort — key+index in a single uint64_t
+ *
+ * When key_nbytes * 8 + index_bits ≤ 64, we pack the encoded key and the
+ * row index into one uint64_t:
+ *   packed[i] = encoded_key[i] | ((uint64_t)i << idx_shift)
+ *
+ * Radix sort then moves ONE 8-byte value per element per pass instead of
+ * TWO 8-byte values (key + index).  This halves all memory traffic:
+ *   - SWC buffer: 16KB instead of 32KB (fits better in L1)
+ *   - Scatter writes: 8B instead of 16B per element
+ *   - Total traffic per pass: n×8B instead of n×16B
+ *
+ * After sorting, indices are extracted: idx = packed >> idx_shift
+ * ============================================================================ */
+
+/* Packed scatter: single-array SWC scatter, no separate index array. */
+#define PSWC_N 8   /* entries buffered per bucket before a flush */
+static void packed_scatter_fn(void* arg, uint32_t wid, int64_t start, int64_t end) {
+    (void)wid; (void)end;
+    radix_pass_ctx_t* c = (radix_pass_ctx_t*)arg;
+    int64_t task = start;   /* task number, as in radix_hist_fn */
+
+    int64_t chunk = (c->n + c->n_tasks - 1) / c->n_tasks;   /* same chunking as radix_hist_fn */
+    int64_t lo = task * chunk;
+    int64_t hi = lo + chunk;
+    if (hi > c->n) hi = c->n;
+    if (lo >= hi) return;
+
+    int64_t* off = c->offsets + task * 256;   /* next output slot per bucket for this task */
+    const uint64_t* in  = c->keys;
+    uint64_t*       out = c->keys_out;
+    uint8_t shift = c->shift;
+
+    /* Single SWC buffer: 256 × 8 × 8B = 16KB — fits in L1 */
+    uint64_t buf[256][PSWC_N];
+    uint8_t  bcnt[256];   /* entries currently buffered per bucket */
+    memset(bcnt, 0, 256);
+
+    for (int64_t i = lo; i < hi; i++) {
+        uint8_t byte = (in[i] >> shift) & 0xFF;
+        uint8_t bp = bcnt[byte];
+        buf[byte][bp] = in[i];
+        if (++bp == PSWC_N) {   /* buffer full: write it out as one contiguous run */
+            int64_t pos = off[byte];
+            memcpy(&out[pos], buf[byte], PSWC_N * sizeof(uint64_t));
+            off[byte] = pos + PSWC_N;
+            bp = 0;
+        }
+        bcnt[byte] = bp;
+    }
+
+    /* Flush remaining entries */
+    for (int b = 0; b < 256; b++) {
+        int64_t pos = off[b];
+        for (uint8_t j = 0; j < bcnt[b]; j++)   /* in buffered order, so input order is kept */
+            out[pos + j] = buf[b][j];
+        off[b] = pos + bcnt[b];
+    }
+}
+#undef PSWC_N
+
+/* Packed radix sort: sorts an array of packed (key|index) uint64_t values
+ * by their low n_bytes bytes (bytes 0 .. n_bytes-1), least significant first.
+ * Returns the final sorted array (data or tmp), or NULL if scratch allocation fails. */
+uint64_t* packed_radix_sort_run(ray_pool_t* pool,
+                                         uint64_t* data, uint64_t* tmp,
+                                         int64_t n, uint8_t n_bytes) {
+    uint32_t n_tasks = pool ? ray_pool_total_workers(pool) : 1;   /* NULL pool: one inline task */
+    if (n_tasks < 1) n_tasks = 1;
+
+    ray_t *hist_hdr = NULL, *off_hdr = NULL;
+    uint32_t* hist = (uint32_t*)scratch_alloc(&hist_hdr,
+                        (size_t)n_tasks * 256 * sizeof(uint32_t));
+    int64_t* offsets = (int64_t*)scratch_alloc(&off_hdr,
+                        (size_t)n_tasks * 256 * sizeof(int64_t));
+    if (!hist || !offsets) {
+        scratch_free(hist_hdr); scratch_free(off_hdr);
+        return NULL;
+    }
+
+    uint64_t* src = data, *dst = tmp;
+
+    for (uint8_t bp = 0; bp < n_bytes; bp++) {
+        uint8_t shift = bp * 8;
+
+        /* Reuse radix_pass_ctx_t — .idx and .idx_out are left unset here;
+         * neither radix_hist_fn nor packed_scatter_fn reads them. */
+        radix_pass_ctx_t ctx = {
+            .keys = src, .keys_out = dst,
+            .n = n, .shift = shift, .n_tasks = n_tasks,
+            .hist = hist, .offsets = offsets,
+        };
+
+        /* Phase 1: parallel histogram (reuses existing radix_hist_fn) */
+        if (pool && n_tasks > 1)
+            ray_pool_dispatch_n(pool, radix_hist_fn, &ctx, n_tasks);
+        else
+            radix_hist_fn(&ctx, 0, 0, 1);
+
+        /* Check uniformity */
+        bool uniform = false;
+        for (int b = 0; b < 256; b++) {
+            uint32_t total = 0;
+            for (uint32_t t = 0; t < n_tasks; t++)
+                total += hist[t * 256 + b];
+            if (total == (uint32_t)n) { uniform = true; break; }   /* NOTE(review): 32-bit counts assume n < 2^32 — confirm */
+        }
+        if (uniform) continue;   /* every value shares this byte: pass would not reorder anything */
+
+        /* Phase 2: prefix sum */
+        int64_t running = 0;
+        for (int b = 0; b < 256; b++) {   /* bucket-major, task-minor: each bucket is laid out in task order */
+            for (uint32_t t = 0; t < n_tasks; t++) {
+                offsets[t * 256 + b] = running;
+                running += hist[t * 256 + b];
+            }
+        }
+
+        /* Phase 3: packed scatter (half the traffic of dual-array scatter) */
+        if (pool && n_tasks > 1)
+            ray_pool_dispatch_n(pool, packed_scatter_fn, &ctx, n_tasks);
+        else
+            packed_scatter_fn(&ctx, 0, 0, 1);
+
+        uint64_t* t2 = src; src = dst; dst = t2;   /* output of this pass feeds the next */
+    }
+
+    scratch_free(hist_hdr);
+    scratch_free(off_hdr);
+    return src;   /* buffer written by the last executed pass (data if none ran) */
+}
+
+/* Fused pack + sortedness detection for packed radix sort.
+ * Rewrites keys[i] = (keys[i] & key_mask) | (i << key_bits) in place and
+ * counts, over adjacent pairs inside each dispatched range: descending
+ * pairs (keys[i] < keys[i-1]) → unsorted, ascending pairs → not_reverse.
+ * If unsorted==0: already sorted. If not_reverse==0: reverse-sorted. */
+typedef struct {
+    uint64_t* keys;
+    uint8_t   key_bits;
+    uint64_t  key_mask;       /* mask for significant key bytes */
+    int64_t*  pw_unsorted;    /* count of forward inversions */
+    int64_t*  pw_not_reverse; /* count of strict ascending pairs */
+} packed_detect_ctx_t;
+
+/* Worker: mask + pack keys in [start, end) in place and count descending /
+ * ascending adjacent pairs inside that range into the wid-indexed counters. */
+static void packed_detect_fn(void* arg, uint32_t wid, int64_t start, int64_t end) {
+    packed_detect_ctx_t* c = (packed_detect_ctx_t*)arg;
+    uint64_t* k = c->keys;
+    uint8_t kb = c->key_bits;
+    uint64_t km = c->key_mask;
+    int64_t unsorted = 0, not_rev = 0;
+    /* k[start-1] belongs to the neighbouring range and may be rewritten by
+     * another worker concurrently, so it is never read here. */
+    uint64_t prev = 0;  /* assigned at i == start before any compare uses it */
+    for (int64_t i = start; i < end; i++) {
+        uint64_t cur = k[i] & km;  /* mask to significant bytes */
+        if (i > start) {  /* the pair (start-1, start) is not compared here */
+            if (cur < prev) unsorted++;
+            if (cur > prev) not_rev++;
+        }
+        k[i] = cur | ((uint64_t)i << kb);  /* pack: key bits | (index << key_bits) */
+        prev = cur;
+    }
+    c->pw_unsorted[wid] += unsorted; c->pw_not_reverse[wid] += not_rev;
+}
+
+/* Parallel unpack: extract indices (and optionally sorted keys) from
+ * packed values after packed radix sort. */
+typedef struct {
+    const uint64_t* sorted;
+    int64_t*        indices;
+    uint64_t*       keys_out;
+    uint8_t         key_bits;
+    uint64_t        idx_mask;
+    uint64_t        key_mask;
+    bool            extract_keys;
+} packed_unpack_ctx_t;
+
+/* Worker: split packed values in [start, end) back into row index (+ key). */
+static void packed_unpack_fn(void* arg, uint32_t wid,
+                              int64_t start, int64_t end) {
+    const packed_unpack_ctx_t* u = (const packed_unpack_ctx_t*)arg; (void)wid;
+    const uint8_t shift = u->key_bits;   /* index sits above the key bits */
+    for (int64_t row = start; row < end; row++) {
+        const uint64_t word = u->sorted[row];
+        u->indices[row] = (int64_t)((word >> shift) & u->idx_mask);
+        if (u->extract_keys) u->keys_out[row] = word & u->key_mask;
+    }
+}
+
+/* ============================================================================
+ * MSD+LSB hybrid radix sort
+ *
+ * First pass: MSD partition by the most significant non-uniform byte.
+ * Creates up to 256 buckets, each small enough to fit in L2 cache.
+ * Subsequent passes: LSB radix sort within each bucket (in-cache, fast).
+ *
+ * For 10M I64 values with 3 significant bytes:
+ *   LSB: 3 full passes over 160MB (keys+indices) = ~960MB random traffic
+ *   MSD+LSB: 1 full pass + 256 × 2 in-cache passes ≈ ~400MB random + ~5ms in-cache
+ *
+ * Cache behavior: after the first MSD partition, each bucket (10M/256 ≈ 39K
+ * elements ≈ 625KB) fits in L2.  Subsequent passes operate entirely within
+ * cache, making them effectively free compared to the first pass.
+ * ============================================================================ */
+
+/* Per-bucket LSB radix sort (non-parallel, no SWC; meant for cache-resident buckets).
+ * Returns the index buffer holding the result (idx or itmp); sorted keys sit in the matching key buffer. */
+static int64_t* bucket_lsb_sort(uint64_t* keys, int64_t* idx,
+                                  uint64_t* ktmp, int64_t* itmp,
+                                  int64_t n, uint8_t n_bytes) {
+    if (n <= 64) {   /* tiny bucket: comparison sort on the full 64-bit key, n_bytes unused */
+        key_introsort(keys, idx, n);   /* NOTE(review): tie order may differ from the radix path — confirm callers don't rely on it */
+        return idx;
+    }
+
+    uint64_t* src_k = keys, *dst_k = ktmp;
+    int64_t*  src_i = idx,  *dst_i = itmp;
+
+    for (uint8_t bp = 0; bp < n_bytes; bp++) {   /* least significant byte first */
+        uint8_t shift = bp * 8;
+
+        uint32_t hist[256];
+        memset(hist, 0, sizeof(hist));
+        for (int64_t i = 0; i < n; i++)
+            hist[(src_k[i] >> shift) & 0xFF]++;
+
+        /* Check uniformity — skip this byte if all values share the same digit */
+        bool uniform = false;
+        for (int b = 0; b < 256; b++) {
+            if (hist[b] == (uint32_t)n) { uniform = true; break; }
+        }
+        if (uniform) continue;
+
+        /* Prefix sum */
+        int64_t off[256];   /* off[b] = first output slot of bucket b */
+        off[0] = 0;
+        for (int b = 1; b < 256; b++)
+            off[b] = off[b-1] + (int64_t)hist[b-1];
+
+        /* Scatter (no SWC — data is cache-resident) */
+        for (int64_t i = 0; i < n; i++) {   /* forward scan keeps input order within a bucket */
+            uint8_t byte = (src_k[i] >> shift) & 0xFF;
+            int64_t pos = off[byte]++;
+            dst_k[pos] = src_k[i];
+            dst_i[pos] = src_i[i];
+        }
+
+        uint64_t* tk = src_k; src_k = dst_k; dst_k = tk;   /* output becomes next pass's input */
+        int64_t*  ti = src_i; src_i = dst_i; dst_i = ti;
+    }
+
+    return src_i;   /* idx after an even number of executed passes, itmp after an odd number */
+}
+
+/* Context for parallel per-bucket sorting after MSD partition */
+typedef struct {
+    uint64_t*  data_k;          /* MSD output: partitioned keys */
+    int64_t*   data_i;          /* MSD output: partitioned indices */
+    uint64_t*  tmp_k;           /* scratch (MSD input buffer, now free) */
+    int64_t*   tmp_i;
+    int64_t    bucket_offsets[257]; /* prefix-sum of bucket sizes */
+    uint8_t    n_bytes;            /* remaining bytes to sort per bucket */
+} msd_bucket_ctx_t;
+
+static void msd_bucket_sort_fn(void* arg, uint32_t wid,
+                                 int64_t start, int64_t end) {   /* [start, end) is a range of bucket numbers */
+    (void)wid;
+    msd_bucket_ctx_t* c = (msd_bucket_ctx_t*)arg;
+
+    for (int64_t b = start; b < end; b++) {
+        int64_t off = c->bucket_offsets[b];             /* first element of bucket b */
+        int64_t cnt = c->bucket_offsets[b + 1] - off;   /* bucket size */
+        if (cnt <= 1) continue;   /* empty or single-element bucket needs no sorting */
+
+        int64_t* sorted = bucket_lsb_sort(
+            c->data_k + off, c->data_i + off,
+            c->tmp_k  + off, c->tmp_i  + off,   /* scratch slice at the same offset as the bucket */
+            cnt, c->n_bytes);
+
+        /* Ensure result is in the canonical buffer (data_k/data_i).
+         * bucket_lsb_sort may leave result in the scratch buffer if an
+         * odd number of scatter passes executed. */
+        if (sorted != c->data_i + off) {
+            memcpy(c->data_k + off, c->tmp_k + off,
+                   (size_t)cnt * sizeof(uint64_t));
+            memcpy(c->data_i + off, c->tmp_i + off,
+                   (size_t)cnt * sizeof(int64_t));
+        }
+    }
+}
+
+/* MSD+LSB hybrid radix sort over (key, index) pairs.
+ *
+ * keys/indices hold the input; keys_tmp/idx_tmp are scratch arrays of the
+ * same length.  On the MSD path the rows are partitioned by their most
+ * significant key byte (parallel histogram + scatter into keys_tmp/idx_tmp)
+ * and each of the 256 buckets is then sorted on the remaining bytes.
+ *
+ * Returns the pointer to the sorted indices.  On the MSD path this is always
+ * idx_tmp, and *sorted_keys_out (when non-NULL) is set to keys_tmp.  On the
+ * fallback paths (n_bytes <= 5, n <= 1000000, or scratch allocation failure)
+ * the work is delegated to radix_sort_run and its result is returned as-is,
+ * so callers must use the returned pointer rather than assume idx_tmp. */
+int64_t* msd_radix_sort_run(ray_pool_t* pool,
+                                     uint64_t* keys, int64_t* indices,
+                                     uint64_t* keys_tmp, int64_t* idx_tmp,
+                                     int64_t n, uint8_t n_bytes,
+                                     uint64_t** sorted_keys_out) {
+    /* MSD adds partitioning + dispatch overhead that only pays off for
+     * very wide keys (>= 6 bytes) on large inputs, where saving multiple
+     * LSB passes matters.  For narrower keys (<= 5 bytes after range
+     * analysis) or n <= 1M rows, plain LSB radix is used instead. */
+    if (n_bytes <= 5 || n <= 1000000) {
+        return radix_sort_run(pool, keys, indices, keys_tmp, idx_tmp,
+                               n, n_bytes, sorted_keys_out);
+    }
+
+    uint32_t n_tasks = pool ? ray_pool_total_workers(pool) : 1;
+    if (n_tasks < 1) n_tasks = 1;
+
+    /* Allocate histogram and offsets for MSD pass */
+    ray_t *hist_hdr = NULL, *off_hdr = NULL;
+    uint32_t* hist = (uint32_t*)scratch_alloc(&hist_hdr,
+                        (size_t)n_tasks * 256 * sizeof(uint32_t));
+    int64_t* offsets = (int64_t*)scratch_alloc(&off_hdr,
+                        (size_t)n_tasks * 256 * sizeof(int64_t));
+    if (!hist || !offsets) {
+        /* Out of scratch memory: degrade to the LSB sort, which needs no
+         * per-task histogram from this function. */
+        scratch_free(hist_hdr); scratch_free(off_hdr);
+        return radix_sort_run(pool, keys, indices, keys_tmp, idx_tmp,
+                               n, n_bytes, sorted_keys_out);
+    }
+
+    /* MSD pass: partition by the most significant non-uniform byte */
+    uint8_t msd_byte = n_bytes - 1;
+    uint8_t shift = msd_byte * 8;
+
+    radix_pass_ctx_t ctx = {
+        .keys = keys, .idx = indices,
+        .keys_out = keys_tmp, .idx_out = idx_tmp,
+        .n = n, .shift = shift, .n_tasks = n_tasks,
+        .hist = hist, .offsets = offsets,
+    };
+
+    /* Phase 1: parallel histogram */
+    if (pool && n_tasks > 1)
+        ray_pool_dispatch_n(pool, radix_hist_fn, &ctx, n_tasks);
+    else
+        radix_hist_fn(&ctx, 0, 0, 1);
+
+    /* Check uniformity: does a single bucket hold every row?
+     * The per-bucket total is accumulated in 64 bits and compared against
+     * the full 64-bit row count; a 32-bit comparison would truncate n and
+     * could report an empty bucket as "uniform" when n is a multiple of
+     * 2^32. */
+    bool uniform = false;
+    for (int b = 0; b < 256; b++) {
+        uint64_t total = 0;
+        for (uint32_t t = 0; t < n_tasks; t++)
+            total += hist[t * 256 + b];
+        if (total == (uint64_t)n) { uniform = true; break; }
+    }
+
+    if (uniform) {
+        /* All keys share the same MSB — skip this byte, try next */
+        scratch_free(hist_hdr); scratch_free(off_hdr);
+        return msd_radix_sort_run(pool, keys, indices, keys_tmp, idx_tmp,
+                                    n, n_bytes - 1, sorted_keys_out);
+    }
+
+    /* Phase 2: prefix sum → per-task scatter offsets + bucket boundaries.
+     * Within a bucket, tasks are laid out in task order, so bucket b spans
+     * [bucket_offsets[b], bucket_offsets[b + 1]). */
+    int64_t bucket_offsets[257];
+    {
+        int64_t running = 0;
+        for (int b = 0; b < 256; b++) {
+            bucket_offsets[b] = running;
+            for (uint32_t t = 0; t < n_tasks; t++) {
+                offsets[t * 256 + b] = running;
+                running += hist[t * 256 + b];
+            }
+        }
+        bucket_offsets[256] = running;
+    }
+
+    /* Phase 3: parallel scatter with SWC */
+    if (pool && n_tasks > 1)
+        ray_pool_dispatch_n(pool, radix_scatter_fn, &ctx, n_tasks);
+    else
+        radix_scatter_fn(&ctx, 0, 0, 1);
+
+    scratch_free(hist_hdr);
+    scratch_free(off_hdr);
+
+    /* Data is now in keys_tmp/idx_tmp, partitioned by MSB.
+     * Sort each bucket independently using the remaining bytes.
+     * Use keys/indices as scratch (MSD input, now free to reuse). */
+    uint8_t remaining_bytes = msd_byte; /* bytes 0..msd_byte-1 */
+
+    msd_bucket_ctx_t bctx = {
+        .data_k = keys_tmp, .data_i = idx_tmp,
+        .tmp_k  = keys,     .tmp_i  = indices,
+        .n_bytes = remaining_bytes,
+    };
+    memcpy(bctx.bucket_offsets, bucket_offsets, sizeof(bucket_offsets));
+
+    if (pool)
+        ray_pool_dispatch_n(pool, msd_bucket_sort_fn, &bctx, 256);
+    else
+        msd_bucket_sort_fn(&bctx, 0, 0, 256);
+
+    /* On this path the result is always in keys_tmp/idx_tmp */
+    if (sorted_keys_out) *sorted_keys_out = keys_tmp;
+    return idx_tmp;
+}
+
+/* radix_encode_ctx_t defined in exec_internal.h */
+
+/* Pool worker: encode rows [start, end) of the sort key(s) into c->keys
+ * as uint64_t values.  Each branch builds the key so that plain unsigned
+ * comparison of the encoded values follows the requested order (sign-bit
+ * flip for signed integers, sign-dependent mask for doubles, bitwise NOT
+ * for descending).
+ *
+ *   arg    radix_encode_ctx_t* describing the column(s) and output arrays
+ *   wid    worker id (unused)
+ *   start  first row to encode (inclusive)
+ *   end    one past the last row to encode
+ *
+ * When c->indices is non-NULL the same range of it is filled with the
+ * identity permutation (idx[i] = i).
+ *
+ * NOTE(review): the single-key switch has no default case, so a column
+ * type not listed below leaves c->keys[start..end) unwritten — presumably
+ * callers only dispatch supported types; confirm. */
+void radix_encode_fn(void* arg, uint32_t wid, int64_t start, int64_t end) {
+    (void)wid;
+    radix_encode_ctx_t* c = (radix_encode_ctx_t*)arg;
+
+    /* Fused iota: initialize index array alongside key encoding */
+    if (c->indices) {
+        int64_t* idx = c->indices;
+        for (int64_t i = start; i < end; i++) idx[i] = i;
+    }
+
+    if (c->n_keys <= 1) {
+        /* Single-key fast path */
+        switch (c->type) {
+        case RAY_I64: case RAY_TIMESTAMP: {
+            const int64_t* d = (const int64_t*)c->data;
+            bool has_nulls = c->col && (c->col->attrs & RAY_ATTR_HAS_NULLS);
+            bool nf = c->nulls_first;
+            bool desc = c->desc;
+            /* Null key: nf=true→sort first, nf=false→sort last.
+             * For ASC  NULLS FIRST → e=0            (smallest)
+             * For ASC  NULLS LAST  → e=UINT64_MAX   (largest)
+             * For DESC NULLS FIRST → e=UINT64_MAX   (~e=0, smallest after flip)
+             * For DESC NULLS LAST  → e=0            (~e=UINT64_MAX, largest after flip) */
+            uint64_t null_e = (nf ^ desc) ? 0 : UINT64_MAX;
+            if (desc) {
+                for (int64_t i = start; i < end; i++) {
+                    if (has_nulls && ray_vec_is_null(c->col, i))
+                        c->keys[i] = ~null_e;
+                    else
+                        /* Flip the sign bit so signed order becomes unsigned
+                         * order, then invert for descending. */
+                        c->keys[i] = ~((uint64_t)d[i] ^ ((uint64_t)1 << 63));
+                }
+            } else {
+                for (int64_t i = start; i < end; i++) {
+                    if (has_nulls && ray_vec_is_null(c->col, i))
+                        c->keys[i] = null_e;
+                    else
+                        c->keys[i] = (uint64_t)d[i] ^ ((uint64_t)1 << 63);
+                }
+            }
+            break;
+        }
+        case RAY_F64: {
+            const double* d = (const double*)c->data;
+            bool nf   = c->nulls_first;
+            bool desc = c->desc;
+            /* NaN override: encode NaN so it sorts first or last.
+             * For ASC  NULLS FIRST → e=0            (smallest key)
+             * For ASC  NULLS LAST  → e=UINT64_MAX   (largest key)
+             * For DESC NULLS FIRST → e=UINT64_MAX   (~e=0, smallest)
+             * For DESC NULLS LAST  → e=0            (~e=UINT64_MAX, largest)
+             * Pattern: e = (nf ^ desc) ? 0 : UINT64_MAX */
+            uint64_t nan_e = (nf ^ desc) ? 0 : UINT64_MAX;
+            for (int64_t i = start; i < end; i++) {
+                uint64_t bits;
+                memcpy(&bits, &d[i], 8);
+                /* NaN: exponent all-1s (0x7FF) and mantissa non-zero */
+                if ((bits & 0x7FF0000000000000ULL) == 0x7FF0000000000000ULL &&
+                    (bits & 0x000FFFFFFFFFFFFFULL)) {
+                    c->keys[i] = desc ? ~nan_e : nan_e;
+                } else {
+                    /* Negative values: flip every bit.  Non-negative values:
+                     * flip only the sign bit. */
+                    uint64_t mask = -(bits >> 63) | ((uint64_t)1 << 63);
+                    uint64_t e = bits ^ mask;
+                    c->keys[i] = desc ? ~e : e;
+                }
+            }
+            break;
+        }
+        case RAY_I32: case RAY_DATE: case RAY_TIME: {
+            const int32_t* d = (const int32_t*)c->data;
+            bool has_nulls = c->col && (c->col->attrs & RAY_ATTR_HAS_NULLS);
+            bool nf = c->nulls_first;
+            bool desc = c->desc;
+            /* Same null placement rule as the 64-bit integer case above. */
+            uint64_t null_e = (nf ^ desc) ? 0 : UINT64_MAX;
+            if (desc) {
+                for (int64_t i = start; i < end; i++) {
+                    if (has_nulls && ray_vec_is_null(c->col, i))
+                        c->keys[i] = ~null_e;
+                    else
+                        /* The 32-bit biased value is widened first, so the
+                         * inversion also sets the upper 32 bits. */
+                        c->keys[i] = ~((uint64_t)((uint32_t)d[i] ^ ((uint32_t)1 << 31)));
+                }
+            } else {
+                for (int64_t i = start; i < end; i++) {
+                    if (has_nulls && ray_vec_is_null(c->col, i))
+                        c->keys[i] = null_e;
+                    else
+                        c->keys[i] = (uint64_t)((uint32_t)d[i] ^ ((uint32_t)1 << 31));
+                }
+            }
+            break;
+        }
+        case RAY_SYM: {
+            /* Symbols are keyed by the rank table entry for the raw symbol
+             * id rather than by the id itself. */
+            const uint32_t* rank = c->enum_rank;
+            if (c->desc) {
+                for (int64_t i = start; i < end; i++) {
+                    uint32_t raw = (uint32_t)ray_read_sym(c->data, i, c->type, c->col_attrs);
+                    c->keys[i] = ~(uint64_t)rank[raw];
+                }
+            } else {
+                for (int64_t i = start; i < end; i++) {
+                    uint32_t raw = (uint32_t)ray_read_sym(c->data, i, c->type, c->col_attrs);
+                    c->keys[i] = (uint64_t)rank[raw];
+                }
+            }
+            break;
+        }
+        case RAY_I16: {
+            /* No null handling in this branch. */
+            const int16_t* d = (const int16_t*)c->data;
+            if (c->desc) {
+                for (int64_t i = start; i < end; i++)
+                    c->keys[i] = ~((uint64_t)((uint16_t)d[i] ^ ((uint16_t)1 << 15)));
+            } else {
+                for (int64_t i = start; i < end; i++)
+                    c->keys[i] = (uint64_t)((uint16_t)d[i] ^ ((uint16_t)1 << 15));
+            }
+            break;
+        }
+        case RAY_BOOL: case RAY_U8: {
+            /* Unsigned bytes need no bias; no null handling in this branch. */
+            const uint8_t* d = (const uint8_t*)c->data;
+            if (c->desc) {
+                for (int64_t i = start; i < end; i++)
+                    c->keys[i] = ~(uint64_t)d[i];
+            } else {
+                for (int64_t i = start; i < end; i++)
+                    c->keys[i] = (uint64_t)d[i];
+            }
+            break;
+        }
+        }
+    } else {
+        /* Composite-key encoding: each column contributes (value - min),
+         * mirrored within its range when descending, shifted into its own
+         * bit field of the 64-bit key. */
+        for (int64_t i = start; i < end; i++) {
+            uint64_t composite = 0;
+            for (uint8_t k = 0; k < c->n_keys; k++) {
+                ray_t* col = c->vecs[k];
+                int64_t val;
+                if (c->enum_ranks[k]) {
+                    uint32_t raw = (uint32_t)ray_read_sym(ray_data(col), i, col->type, col->attrs);
+                    val = (int64_t)c->enum_ranks[k][raw];
+                } else if (col->type == RAY_I64 || col->type == RAY_TIMESTAMP) {
+                    val = ((const int64_t*)ray_data(col))[i];
+                } else if (col->type == RAY_F64) {
+                    uint64_t bits;
+                    memcpy(&bits, &((const double*)ray_data(col))[i], 8);
+                    uint64_t mask = -(bits >> 63) | ((uint64_t)1 << 63);
+                    val = (int64_t)(bits ^ mask);
+                } else if (col->type == RAY_I32 || col->type == RAY_DATE || col->type == RAY_TIME) {
+                    val = (int64_t)((const int32_t*)ray_data(col))[i];
+                } else if (col->type == RAY_I16) {
+                    val = (int64_t)((const int16_t*)ray_data(col))[i];
+                } else if (col->type == RAY_BOOL || col->type == RAY_U8) {
+                    val = (int64_t)((const uint8_t*)ray_data(col))[i];
+                } else {
+                    /* Unsupported column type contributes a constant. */
+                    val = 0;
+                }
+                /* NOTE(review): this path has no null/NaN override —
+                 * presumably handled by the caller; confirm. */
+                uint64_t part = (uint64_t)val - (uint64_t)c->mins[k];
+                if (c->descs[k]) part = (uint64_t)c->ranges[k] - part;
+                composite |= part << c->bit_shifts[k];
+            }
+            c->keys[i] = composite;
+        }
+    }
+}
+
+/* ============================================================================
+ * Adaptive string sort (single-key RAY_STR)
+ *
+ * Pipeline:
+ *   1. Null partition — move nulls to sorted_idx[n_live..nrows).
+ *   2. Probe — one linear pass over the non-null range computes
+ *        • max_len                     (→ key width)
+ *        • run_count / run_all_asc/desc (→ pre-sorted short-circuit)
+ *        • card_estimate on the first 1024 rows via an exact hashset
+ *                                      (future-facing; unused today)
+ *      Every downstream decision is taken from these runtime numbers —
+ *      nothing in this file branches on "we know the bench is str8".
+ *   3. Single-run short-circuit — if the probe reports one monotone
+ *      run across the entire non-null range, we're done: copy (or
+ *      reverse, for DESC × ASC mismatch) and skip sorting entirely.
+ *      This is the vergesort trivial case; the general multi-run
+ *      merge path is scoped for a follow-up.
+ *   4. Key materialization — pack each non-null string into a record
+ *        struct { uint64_t parts[parts]; uint32_t row; uint32_t len; }
+ *      where parts = min(4, ceil(max_len/8)) and each part holds 8
+ *      bytes of the string byte-swapped into big-endian u64 form, so
+ *      raw u64 comparison == lex comparison.  One sequential pass
+ *      over the input, zero per-byte function calls downstream.
+ *   5. American-Flag in-place MSD byte radix on the packed records.
+ *      Top-level byte histogram → 256 buckets → one in-place swap
+ *      pass → recurse.  Sub-base-case buckets (≤ 24) finish with
+ *      insertion sort using the full multi-u64 comparator.  When
+ *      recursion exhausts the packed prefix (depth == parts*8),
+ *      ties fall through to a tail comparator that walks the
+ *      original bytes via ray_str_t_cmp — the only place cold
+ *      pool memory is touched during the sort proper.
+ *   6. Scatter row indices back to sorted_idx.
+ *   7. DESC reverses the non-null range; nulls-first rotates nulls
+ *      to the front.
+ *
+ * Every threshold and resource allocation here is driven by runtime
+ * numbers (n, max_len, worker count) or machine geometry (cache line,
+ * pool workers) — never by assumptions about input shape.
+ * ============================================================================ */
+
+#define RAY_STRSORT_KEY_PARTS_MAX 4    /* 32-byte packed prefix cap */
+#define RAY_STRSORT_BASE_CASE     24   /* small-bucket insertion-sort threshold */
+#define RAY_STRSORT_PROBE_HEAD    1024 /* rows sampled for exact distinct count */
+
+/* Packed sort record for one string row: a fixed-width prefix of the
+ * string plus the row it came from and the string's length. */
+typedef struct {
+    uint64_t parts[RAY_STRSORT_KEY_PARTS_MAX]; /* 8-byte windows of the string in lex-comparable form, zero-padded */
+    uint32_t row;   /* source row index (narrowed to 32 bits by the key builder) */
+    uint32_t len;   /* string length copied from the row's ray_str_t */
+} ray_strkey_t;
+
+/* Convert a u64 that was loaded from memory in native byte order into
+ * big-endian numeric form, so raw u64 comparison yields lex order over the
+ * original byte layout.
+ *
+ * Little-endian targets with GCC/Clang byte-order macros use a single bswap;
+ * big-endian targets return the value unchanged.  When the compiler does not
+ * provide __BYTE_ORDER__ the value is reassembled from its in-memory bytes,
+ * which is correct on any byte order (previously such builds returned the
+ * value unswapped even on little-endian hosts, breaking the sort order). */
+static inline uint64_t strkey_lex_u64(uint64_t v) {
+#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+    return __builtin_bswap64(v);
+#elif defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    return v;
+#else
+    /* Unknown byte order: first byte in memory becomes the most
+     * significant byte of the result. */
+    unsigned char b[8];
+    memcpy(b, &v, sizeof(b));
+    return ((uint64_t)b[0] << 56) | ((uint64_t)b[1] << 48) |
+           ((uint64_t)b[2] << 40) | ((uint64_t)b[3] << 32) |
+           ((uint64_t)b[4] << 24) | ((uint64_t)b[5] << 16) |
+           ((uint64_t)b[6] <<  8) |  (uint64_t)b[7];
+#endif
+}
+
+/* Read up to 8 bytes of the string beginning at src[offset] and return
+ * them as a lex-comparable u64.  Bytes at or beyond `len` are treated as
+ * zero; when offset is at or past `len` the result is 0 and src is not
+ * touched. */
+static inline uint64_t strkey_load_part(const char* src, int64_t len, int offset) {
+    const int64_t avail = len - offset;
+    if (avail <= 0) return 0;
+    const size_t nbytes = avail >= 8 ? (size_t)8 : (size_t)avail;
+    uint64_t word = 0;
+    memcpy(&word, src + offset, nbytes);
+    return strkey_lex_u64(word);
+}
+
+/* Three-way comparator over full strings.
+ *
+ * The packed parts are compared first.  If they tie and both strings fit
+ * inside the packed window (parts * 8 bytes), the shorter string sorts
+ * first.  Otherwise the bytes past the window are compared through the
+ * original string storage (elems / pool), with the shorter tail winning
+ * when one tail is a prefix of the other. */
+static int strkey_cmp(const ray_strkey_t* a, const ray_strkey_t* b,
+                      int parts,
+                      const ray_str_t* elems, const char* pool) {
+    /* Locate the first packed word that differs, if any. */
+    int p = 0;
+    while (p < parts && a->parts[p] == b->parts[p]) p++;
+    if (p < parts) return a->parts[p] < b->parts[p] ? -1 : 1;
+
+    const int64_t window = (int64_t)parts * 8;
+    if ((int64_t)a->len <= window && (int64_t)b->len <= window) {
+        /* Everything was inside the window: a tie on the zero-padded
+         * words leaves only the length to decide. */
+        return (int)a->len - (int)b->len;
+    }
+
+    /* At least one string continues past the window: compare the bytes
+     * in [window, len) from the original storage. */
+    const ray_str_t* str_a = &elems[a->row];
+    const ray_str_t* str_b = &elems[b->row];
+    const char* bytes_a = ray_str_t_ptr(str_a, pool);
+    const char* bytes_b = ray_str_t_ptr(str_b, pool);
+    int64_t tail_a = (int64_t)str_a->len - window;
+    int64_t tail_b = (int64_t)str_b->len - window;
+    if (tail_a < 0) tail_a = 0;
+    if (tail_b < 0) tail_b = 0;
+    const int64_t common = tail_a < tail_b ? tail_a : tail_b;
+    if (common != 0) {
+        int r = memcmp(bytes_a + window, bytes_b + window, (size_t)common);
+        if (r != 0) return r;
+    }
+    return (tail_a > tail_b) - (tail_a < tail_b);
+}
+
+/* Insertion sort of a[0..n) using strkey_cmp as the ordering.  Records
+ * that compare equal keep their relative order. */
+static void strkey_insertion_sort(ray_strkey_t* a, int64_t n, int parts,
+                                   const ray_str_t* elems, const char* pool) {
+    for (int64_t next = 1; next < n; next++) {
+        ray_strkey_t held = a[next];
+        int64_t slot = next;
+        /* Shift larger records one step right until `held` fits. */
+        while (slot > 0 &&
+               strkey_cmp(&a[slot - 1], &held, parts, elems, pool) > 0) {
+            a[slot] = a[slot - 1];
+            slot--;
+        }
+        a[slot] = held;
+    }
+}
+
+/* Return byte number bp of the packed prefix, counting from the most
+ * significant byte of parts[0]. */
+static inline uint8_t strkey_byte_at(const ray_strkey_t* k, int bp) {
+    const int lane = bp & 7;                 /* byte within the word */
+    const uint64_t word = k->parts[bp >> 3]; /* word holding that byte */
+    return (uint8_t)(word >> (56 - (lane << 3)));
+}
+
+/* Decide how many 8-byte parts the packed key needs.  Scans the `len`
+ * field of the n_live rows named by `indices`, takes the maximum, and
+ * returns ceil(max_len / 8) clamped to [1, RAY_STRSORT_KEY_PARTS_MAX]. */
+static int strsort_probe_parts(const int64_t* indices, int64_t n_live,
+                                const ray_str_t* elems) {
+    int64_t longest = 0;
+    int64_t i = 0;
+    while (i < n_live) {
+        const int64_t cur = (int64_t)elems[indices[i]].len;
+        longest = cur > longest ? cur : longest;
+        i++;
+    }
+    const int64_t words = (longest + 7) / 8;
+    if (words < 1) return 1;
+    if (words > RAY_STRSORT_KEY_PARTS_MAX) return RAY_STRSORT_KEY_PARTS_MAX;
+    return (int)words;
+}
+
+/* Parallel key materialization (morsel range).  Shared, read-mostly
+ * arguments handed to strsort_build_fn. */
+typedef struct {
+    ray_strkey_t*    out;      /* destination records, written at out[i] */
+    const int64_t*   indices;  /* row number for each output slot i */
+    const ray_str_t* elems;    /* string elements, indexed by row */
+    const char*      pool;     /* passed to ray_str_t_ptr to resolve string bytes */
+    int              parts;    /* number of 8-byte parts to pack per record */
+} strsort_build_ctx_t;
+
+/* Pool worker: build packed records for slots [s, e).  For each slot the
+ * row and length are recorded, the first `parts` words are packed from
+ * the string bytes, and any remaining words up to
+ * RAY_STRSORT_KEY_PARTS_MAX are zeroed. */
+static void strsort_build_fn(void* vctx, uint32_t wid, int64_t s, int64_t e) {
+    (void)wid;
+    const strsort_build_ctx_t* ctx = (const strsort_build_ctx_t*)vctx;
+    for (int64_t i = s; i < e; i++) {
+        const int64_t row = ctx->indices[i];
+        const ray_str_t* str = &ctx->elems[row];
+        ray_strkey_t* rec = &ctx->out[i];
+        rec->row = (uint32_t)row;
+        rec->len = str->len;
+        const int64_t len = str->len;
+        /* Empty strings never have their bytes resolved. */
+        const char* bytes = NULL;
+        if (len) bytes = ray_str_t_ptr(str, ctx->pool);
+        int p = 0;
+        for (; p < RAY_STRSORT_KEY_PARTS_MAX && p < ctx->parts; p++)
+            rec->parts[p] = strkey_load_part(bytes, len, p * 8);
+        for (; p < RAY_STRSORT_KEY_PARTS_MAX; p++)
+            rec->parts[p] = 0;
+    }
+}
+
+/* Fill out[0..n_live) with packed records for the rows in `indices`.
+ * Runs on the worker pool when one is available and n_live reaches
+ * RAY_PARALLEL_THRESHOLD; otherwise runs inline on the calling thread. */
+static void strsort_build_keys(ray_strkey_t* out, int64_t n_live,
+                                const int64_t* indices,
+                                const ray_str_t* elems, const char* pool,
+                                int parts) {
+    strsort_build_ctx_t ctx = {
+        .out = out, .indices = indices,
+        .elems = elems, .pool = pool,
+        .parts = parts,
+    };
+    ray_pool_t* workers = ray_pool_get();
+    if (!workers || n_live < RAY_PARALLEL_THRESHOLD) {
+        strsort_build_fn(&ctx, 0, 0, n_live);
+        return;
+    }
+    ray_pool_dispatch(workers, strsort_build_fn, &ctx, n_live);
+}
+
+/* Emit sorted row indices back to sorted_idx (parallel).  Arguments for
+ * strsort_emit_fn. */
+typedef struct {
+    int64_t*             out;   /* destination: out[i] receives keys[i].row */
+    const ray_strkey_t*  keys;  /* sorted packed records */
+} strsort_emit_ctx_t;
+
+/* Pool worker: for slots [s, e), copy each record's row number into the
+ * output index array, widening it to int64_t. */
+static void strsort_emit_fn(void* vctx, uint32_t wid, int64_t s, int64_t e) {
+    (void)wid;
+    const strsort_emit_ctx_t* ctx = (const strsort_emit_ctx_t*)vctx;
+    int64_t* dst = ctx->out;
+    const ray_strkey_t* src = ctx->keys;
+    int64_t i = s;
+    while (i < e) {
+        dst[i] = (int64_t)src[i].row;
+        i++;
+    }
+}
+
+/* Packed-key lexicographic compare.  No pool access.
+ *
+ * Compares the first `parts` packed words; on a tie the shorter `len`
+ * sorts first.  Returns a negative, zero, or positive value (callers
+ * should rely on the sign only).
+ *
+ * The length tie-break is computed as a three-way comparison rather than
+ * a subtraction: `len` is a uint32_t, so `(int)a->len - (int)b->len` can
+ * overflow int and report the wrong sign for very large lengths. */
+static inline int strkey_cmp_packed(const ray_strkey_t* a,
+                                    const ray_strkey_t* b, int parts) {
+    for (int p = 0; p < parts; p++) {
+        if (a->parts[p] < b->parts[p]) return -1;
+        if (a->parts[p] > b->parts[p]) return  1;
+    }
+    return (a->len > b->len) - (a->len < b->len);
+}
+
+/* Sequential run detection over packed keys, with early abort.
+ * Stops at the first point where the sequence is known to be neither
+ * ascending nor descending; fully sorted input costs one linear pass
+ * over the packed key array.
+ *
+ * Returns the detected direction: -1 = all descending, +1 = all
+ * ascending, 0 = neither (or tail bytes remain to be sorted, or n < 2).
+ *
+ *   keys         packed records to scan
+ *   n            number of records
+ *   parts        number of packed words in use per record
+ *   parts_bytes  parts * 8, the width of the packed window in bytes
+ *
+ * IMPORTANT: when the packed words of two adjacent keys tie AND either
+ * string is longer than the packed window, we CANNOT declare a sorted
+ * run — the tail bytes may impose ordering we haven't examined.  This
+ * holds whether or not the two lengths are equal: two strings that share
+ * the whole window but differ in length can still be ordered either way
+ * by the bytes past the window, so the length must not be used as a
+ * tie-break in that case.  Length decides the order only when both
+ * strings fit entirely inside the window. */
+static int strsort_detect_runs(const ray_strkey_t* keys, int64_t n,
+                                int parts, int parts_bytes) {
+    if (n < 2) return 0;
+    bool asc = true, desc = true;
+    for (int64_t i = 1; i < n; i++) {
+        const ray_strkey_t* prev = &keys[i - 1];
+        const ray_strkey_t* cur  = &keys[i];
+
+        /* Compare the packed words only; length is handled below. */
+        int r = 0;
+        for (int p = 0; p < parts; p++) {
+            if (prev->parts[p] != cur->parts[p]) {
+                r = prev->parts[p] < cur->parts[p] ? -1 : 1;
+                break;
+            }
+        }
+
+        if (r == 0) {
+            if ((int64_t)prev->len > parts_bytes ||
+                (int64_t)cur->len  > parts_bytes) {
+                /* Tail bytes unresolved — fall through to the real sort. */
+                return 0;
+            }
+            /* Both fit in the window and the zero-padded words tie: the
+             * shorter string sorts first; equal lengths mean equal
+             * strings, compatible with both directions. */
+            r = (prev->len > cur->len) - (prev->len < cur->len);
+        }
+
+        if (r > 0) {
+            asc = false;
+        } else if (r < 0) {
+            desc = false;
+        }
+        if (!asc && !desc) return 0;
+    }
+    if (asc) return 1;
+    if (desc) return -1;
+    return 0;
+}
+
+/* Parallel top-level byte-0 partition: per-task histogram, global
+ * prefix-sum, parallel scatter into a second contiguous buffer.
+ * This is the same pattern as the numeric radix_sort_run up above,
+ * adapted for 40-byte packed string keys.  Each task handles one
+ * contiguous chunk of ceil(n / n_tasks) records. */
+typedef struct {
+    const ray_strkey_t* src;      /* input records */
+    ray_strkey_t*       dst;      /* scatter destination */
+    int64_t             n;        /* number of records in src */
+    uint32_t            n_tasks;  /* number of chunks / histogram rows */
+    uint32_t*           hist;     /* [n_tasks × 256] */
+    int64_t*            offsets;  /* [n_tasks × 256] */
+} strsort_top_ctx_t;
+
+/* Pool worker: count the leading packed byte of every record in this
+ * task's chunk.  `start` carries the task number; `end` is unused.  The
+ * task's 256-entry histogram row is cleared first, so a task with an
+ * empty chunk still leaves a zeroed row. */
+static void strsort_top_hist_fn(void* vctx, uint32_t wid,
+                                 int64_t start, int64_t end) {
+    (void)wid; (void)end;
+    strsort_top_ctx_t* ctx = (strsort_top_ctx_t*)vctx;
+    const int64_t task = start;
+    uint32_t* bins = ctx->hist + task * 256;
+    memset(bins, 0, 256 * sizeof(uint32_t));
+
+    /* Chunk bounds: ceil(n / n_tasks) records per task, clipped to n. */
+    const int64_t per_task = (ctx->n + ctx->n_tasks - 1) / ctx->n_tasks;
+    const int64_t first = task * per_task;
+    int64_t limit = first + per_task;
+    if (limit > ctx->n) limit = ctx->n;
+
+    for (int64_t i = first; i < limit; i++) {
+        const uint8_t lead = strkey_byte_at(&ctx->src[i], 0);
+        bins[lead]++;
+    }
+}
+
+/* Pool worker: copy every record in this task's chunk to dst at the
+ * position given by the task's offset row for the record's leading
+ * packed byte, advancing that offset after each write.  `start` carries
+ * the task number; `end` is unused. */
+static void strsort_top_scatter_fn(void* vctx, uint32_t wid,
+                                    int64_t start, int64_t end) {
+    (void)wid; (void)end;
+    strsort_top_ctx_t* ctx = (strsort_top_ctx_t*)vctx;
+    const int64_t task = start;
+
+    /* Same chunking as the histogram pass. */
+    const int64_t per_task = (ctx->n + ctx->n_tasks - 1) / ctx->n_tasks;
+    const int64_t first = task * per_task;
+    int64_t limit = first + per_task;
+    if (limit > ctx->n) limit = ctx->n;
+    if (first >= limit) return;
+
+    int64_t* cursor = ctx->offsets + task * 256;
+    for (int64_t i = first; i < limit; i++) {
+        const ray_strkey_t* rec = &ctx->src[i];
+        const uint8_t lead = strkey_byte_at(rec, 0);
+        const int64_t slot = cursor[lead]++;
+        ctx->dst[slot] = *rec;
+    }
+}
+
+/* Bucket dispatch context: each task sorts one top-level bucket.
+ * Everything except starts/counts is forwarded unchanged to
+ * strsort_aflag. */
+typedef struct {
+    ray_strkey_t*    keys;         /* record array containing all buckets */
+    const int64_t*   starts;       /* starts[b]: first record of bucket b within keys */
+    const int64_t*   counts;       /* counts[b]: number of records in bucket b */
+    int              parts_bytes;  /* width of the packed window in bytes */
+    int64_t          base_offset;  /* string byte offset of the current window */
+    const ray_str_t* elems;        /* string elements, indexed by row */
+    const char*      pool;         /* passed through for resolving string bytes */
+    int              parts;        /* number of packed words per record in use */
+    int              start_bp;  /* byte position to begin radix within bucket */
+} strsort_bucket_ctx_t;
+
+/* Forward declaration: the in-place radix sort is defined further down
+ * but is needed by the bucket worker. */
+static void strsort_aflag(ray_strkey_t* keys, int64_t n, int bp,
+                          int parts_bytes, int64_t base_offset,
+                          const ray_str_t* elems, const char* pool,
+                          int parts);
+
+/* Pool worker: sort buckets [s, e) independently.  Buckets holding zero
+ * or one record are skipped. */
+static void strsort_bucket_fn(void* vctx, uint32_t wid, int64_t s, int64_t e) {
+    (void)wid;
+    const strsort_bucket_ctx_t* ctx = (const strsort_bucket_ctx_t*)vctx;
+    int64_t b = s;
+    while (b < e) {
+        const int64_t size = ctx->counts[b];
+        if (size > 1) {
+            strsort_aflag(ctx->keys + ctx->starts[b], size, ctx->start_bp,
+                          ctx->parts_bytes, ctx->base_offset,
+                          ctx->elems, ctx->pool, ctx->parts);
+        }
+        b++;
+    }
+}
+
+/* In-place quicksort of a[lo..hi] (both bounds inclusive) by the packed
+ * key `len` field.  Used as the finalization step for buckets where every
+ * record's string ended at or before the current base_offset — such
+ * records tied on the packed prefix but still need to be ordered by
+ * length (shorter strings sort before longer ones that extend them, per
+ * ray_str_t_cmp).  Single-key integer quicksort with median-of-3
+ * pivot; the function recurses on the smaller partition and loops on
+ * the larger one.  Ranges of 17 records or fewer are finished with
+ * insertion sort. */
+static void strkey_qsort_by_len(ray_strkey_t* a, int64_t lo, int64_t hi) {
+    while (hi - lo > 16) {
+        int64_t mid = lo + (hi - lo) / 2;
+        /* Median-of-3: after these swaps a[lo].len <= a[mid].len <= a[hi].len,
+         * so both partition scans below stop inside [lo, hi]. */
+        if (a[lo].len  > a[hi].len)  { ray_strkey_t t=a[lo];  a[lo]=a[hi];  a[hi]=t;  }
+        if (a[mid].len > a[hi].len)  { ray_strkey_t t=a[mid]; a[mid]=a[hi]; a[hi]=t;  }
+        if (a[lo].len  > a[mid].len) { ray_strkey_t t=a[lo];  a[lo]=a[mid]; a[mid]=t; }
+        uint32_t pivot = a[mid].len;
+        /* Hoare partition: on exit every record in [lo, j] has
+         * len <= pivot and every record in [j + 1, hi] has len >= pivot. */
+        int64_t i = lo - 1, j = hi + 1;
+        for (;;) {
+            do { i++; } while (a[i].len < pivot);
+            do { j--; } while (a[j].len > pivot);
+            if (i >= j) break;
+            ray_strkey_t t = a[i]; a[i] = a[j]; a[j] = t;
+        }
+        /* Recurse on smaller half, loop on the larger. */
+        if (j - lo < hi - (j + 1)) {
+            strkey_qsort_by_len(a, lo, j);
+            lo = j + 1;
+        } else {
+            strkey_qsort_by_len(a, j + 1, hi);
+            hi = j;
+        }
+    }
+    /* Insertion sort base case over the remaining [lo, hi]. */
+    for (int64_t i = lo + 1; i <= hi; i++) {
+        ray_strkey_t cur = a[i];
+        int64_t j = i - 1;
+        while (j >= lo && a[j].len > cur.len) {
+            a[j + 1] = a[j];
+            j--;
+        }
+        a[j + 1] = cur;
+    }
+}
+
+/* Reload the packed words of keys[0..n) from a later window of each
+ * string.  `base_offset` is the byte position in the original string
+ * that becomes byte 0 of the new packed prefix; only the first `parts`
+ * words of each record are rewritten.
+ *
+ * Returns true when at least one string extends past base_offset, i.e.
+ * there is still data for the caller to radix on.
+ *
+ * Returns false when every string ended at or before base_offset.  In
+ * that case nothing is left to distinguish the records except their
+ * lengths (ray_str_t_cmp puts the shorter string first), so before
+ * returning the bucket is sorted in place by `len` — unless all lengths
+ * are the same, in which case the records are left untouched.  The
+ * caller can simply stop recursing. */
+static bool strsort_repack_window(ray_strkey_t* keys, int64_t n,
+                                   int64_t base_offset,
+                                   const ray_str_t* elems, const char* pool,
+                                   int parts) {
+    /* Length extremes are gathered during the same pass so the
+     * by-length finalization can be skipped for equal-length buckets. */
+    uint32_t shortest = UINT32_MAX;
+    uint32_t longest = 0;
+    bool any_tail = false;
+
+    for (int64_t i = 0; i < n; i++) {
+        ray_strkey_t* rec = &keys[i];
+        const ray_str_t* s = &elems[rec->row];
+        const int64_t len = s->len;
+        const uint32_t len32 = (uint32_t)len;
+
+        if (len > base_offset) any_tail = true;
+        if (len32 < shortest) shortest = len32;
+        if (len32 > longest) longest = len32;
+
+        /* Empty strings never have their bytes resolved. */
+        const char* bytes = NULL;
+        if (len > 0) bytes = ray_str_t_ptr(s, pool);
+
+        int64_t off = base_offset;
+        for (int p = 0; p < parts; p++, off += 8) {
+            if (bytes && len > off)
+                rec->parts[p] = strkey_load_part(bytes, len, (int)off);
+            else
+                rec->parts[p] = 0;
+        }
+    }
+
+    if (any_tail) return true;
+
+    /* No string reaches past base_offset.  Order by length when the
+     * lengths actually differ; equal lengths need no work. */
+    if (n > 1 && shortest != longest) {
+        strkey_qsort_by_len(keys, 0, n - 1);
+    }
+    return false;
+}
+
+/* American Flag in-place MSD byte radix on keys[0..n) at byte position bp
+ * within the current window.  All records share the same prefix from
+ * byte 0 up to `base_offset + bp` of the original string.  When the
+ * current window is exhausted (`bp >= parts_bytes`) we re-pack the next
+ * window and continue — keeps worst case at O(total_bytes) even when
+ * records share arbitrarily long common prefixes.
+ *
+ *   keys         records to sort in place
+ *   n            number of records
+ *   bp           byte position within the packed window to partition on
+ *   parts_bytes  parts * 8 (cached)
+ *   base_offset  how many bytes of the original string have already been
+ *                consumed by earlier windows
+ *   elems, pool  original string storage, used by the comparison sort
+ *                and by window re-packing
+ *   parts        number of packed words per record in use
+ *
+ * Each pass keeps four 256-entry int64_t tables on the stack (counts,
+ * starts, ends, cursors). */
+static void strsort_aflag(ray_strkey_t* keys, int64_t n, int bp,
+                          int parts_bytes, int64_t base_offset,
+                          const ray_str_t* elems, const char* pool,
+                          int parts) {
+    /* Tail-recursive inline loop on the largest bucket to bound stack
+     * depth independent of n. */
+    for (;;) {
+        if (n <= 1) return;
+        if (n <= RAY_STRSORT_BASE_CASE) {
+            /* Small bucket — finish with a bounded comparison sort.
+             * strkey_cmp walks the original string bytes past the
+             * current window when necessary, so long tails are fine
+             * at this size. */
+            strkey_insertion_sort(keys, n, parts, elems, pool);
+            return;
+        }
+        if (bp >= parts_bytes) {
+            /* Exhausted the packed prefix for this window with a big
+             * bucket still to resolve.  Re-pack the next window and
+             * restart the radix — keeps total work linear in string
+             * bytes, never quadratic. */
+            int64_t next_offset = base_offset + parts_bytes;
+            if (!strsort_repack_window(keys, n, next_offset,
+                                        elems, pool, parts)) {
+                /* Every record's string ends at or before next_offset,
+                 * so no bytes are left to partition on.
+                 * strsort_repack_window has already ordered the bucket
+                 * by length where lengths differ; nothing more to do. */
+                return;
+            }
+            base_offset = next_offset;
+            bp = 0;
+            continue;
+        }
+
+        /* Histogram of the byte at position bp. */
+        int64_t counts[256] = {0};
+        for (int64_t i = 0; i < n; i++) {
+            counts[strkey_byte_at(&keys[i], bp)]++;
+        }
+        /* Fast path: all records share the same byte at this position.
+         * Skip the partition pass and advance one byte deeper. */
+        int uniq_b = -1;
+        bool uniform = true;
+        for (int b = 0; b < 256; b++) {
+            if (counts[b] == 0) continue;
+            if (uniq_b < 0) uniq_b = b;
+            else { uniform = false; break; }
+        }
+        if (uniform) {
+            bp++;
+            continue;
+        }
+
+        /* Bucket b occupies keys[starts[b] .. ends[b]). */
+        int64_t starts[256];
+        int64_t ends[256];
+        {
+            int64_t sum = 0;
+            for (int b = 0; b < 256; b++) {
+                starts[b] = sum;
+                sum += counts[b];
+                ends[b] = sum;
+            }
+        }
+
+        /* In-place swap loop: classic American Flag.  For each bucket b,
+         * drain records out of its slice whose current byte != b into
+         * their correct destination, cycling until the bucket slice
+         * contains only records that belong in b. */
+        int64_t cursors[256];
+        memcpy(cursors, starts, sizeof(cursors));
+        for (int b = 0; b < 256; b++) {
+            while (cursors[b] < ends[b]) {
+                ray_strkey_t v = keys[cursors[b]];
+                int bb = strkey_byte_at(&v, bp);
+                /* Follow the displacement chain: drop v into its own
+                 * bucket's next free slot and pick up whatever was
+                 * there, until a record belonging to b is in hand. */
+                while (bb != b) {
+                    ray_strkey_t tmp = keys[cursors[bb]];
+                    keys[cursors[bb]] = v;
+                    cursors[bb]++;
+                    v = tmp;
+                    bb = strkey_byte_at(&v, bp);
+                }
+                keys[cursors[b]] = v;
+                cursors[b]++;
+            }
+        }
+
+        /* Find the largest bucket; recurse on the rest and loop on the
+         * largest to keep stack shallow. */
+        int big_b = 0;
+        int64_t big_cnt = counts[0];
+        for (int b = 1; b < 256; b++) {
+            if (counts[b] > big_cnt) { big_cnt = counts[b]; big_b = b; }
+        }
+        for (int b = 0; b < 256; b++) {
+            if (b == big_b) continue;
+            int64_t cnt = counts[b];
+            if (cnt > 1) {
+                strsort_aflag(keys + starts[b], cnt, bp + 1,
+                              parts_bytes, base_offset, elems, pool, parts);
+            }
+        }
+        /* Continue in this frame with the largest bucket, one byte deeper. */
+        keys += starts[big_b];
+        n = big_cnt;
+        bp++;
+    }
+}
+
+/* Top-level adaptive string sort over sorted_idx[0..nrows) for `col`.
+ * Nulls are partitioned out first; the non-null range then runs through
+ * probe → single-run short-circuit → key materialization → American-Flag
+ * MSD → emit.  Returns false on OOM (caller should fall back to comparison sort). */
+static bool sort_str_msd_inplace(int64_t* sorted_idx, int64_t nrows,
+                                 ray_t* col, bool desc, bool nulls_first) {
+    if (nrows <= 0) return true;  /* nothing to sort */
+
+    /* Initial iota — caller may or may not have already filled it. */
+    for (int64_t i = 0; i < nrows; i++) sorted_idx[i] = i;
+
+    /* Partition nulls to the tail, keeping row order inside each group.
+     * Slice vecs inherit the null bitmap from slice_parent, so check both
+     * attr slots — matches the exec_sort post-sort propagation pattern. */
+    int64_t null_count = 0;
+    bool has_nulls = (col->attrs & RAY_ATTR_HAS_NULLS) ||
+                     ((col->attrs & RAY_ATTR_SLICE) && col->slice_parent &&
+                      (col->slice_parent->attrs & RAY_ATTR_HAS_NULLS));
+    if (has_nulls) {
+        int64_t w = 0;
+        int64_t null_pos;
+        for (int64_t i = 0; i < nrows; i++) {  /* pass 1: non-null rows to the front */
+            if (!ray_vec_is_null(col, i)) sorted_idx[w++] = i;
+        }
+        null_count = nrows - w;
+        null_pos = w;
+        for (int64_t i = 0; i < nrows; i++) {  /* pass 2: null rows behind them */
+            if (ray_vec_is_null(col, i)) sorted_idx[null_pos++] = i;
+        }
+    }
+    int64_t n_live = nrows - null_count;  /* sorted_idx[0..n_live) are the non-null rows */
+
+    if (n_live > 1) {  /* zero or one live row is already in order */
+        const ray_str_t* elems;
+        const char* pool;
+        str_resolve(col, &elems, &pool);
+        ray_pool_t* pool_p = ray_pool_get();
+        bool go_parallel = (pool_p && n_live >= RAY_PARALLEL_THRESHOLD);
+
+        /* --- Cheap max-len probe (one pass over len fields). ---
+         * Chooses how many 8-byte parts to pack per key.  Everything
+         * else (monotonicity, cardinality sampling) is folded into the
+         * key-build / run-detection passes below. */
+        int parts = strsort_probe_parts(sorted_idx, n_live, elems);
+        int parts_bytes = parts * 8;
+
+        /* --- Parallel key materialization. --- */
+        ray_t* keys_hdr = NULL;
+        ray_strkey_t* keys = (ray_strkey_t*)scratch_alloc(&keys_hdr,
+                                (size_t)n_live * sizeof(ray_strkey_t));
+        if (!keys) return false;  /* the only failure exit of this function */
+        strsort_build_keys(keys, n_live, sorted_idx, elems, pool, parts);
+
+        /* --- Vergesort run detection on packed keys. ---
+         * Early-aborts on the first inversion (so random input pays O(1)).
+         * When the entire non-null range is a single monotone run we
+         * skip the sort proper and emit row indices directly. */
+        int run_dir = strsort_detect_runs(keys, n_live, parts, parts_bytes);
+        bool want_asc = !desc;
+        if (run_dir == 1 && want_asc) {
+            /* Already ascending — emit as-is. */
+            strsort_emit_ctx_t ectx = { sorted_idx, keys };
+            if (go_parallel)
+                ray_pool_dispatch(pool_p, strsort_emit_fn, &ectx, n_live);
+            else
+                strsort_emit_fn(&ectx, 0, 0, n_live);
+        } else if (run_dir == -1 && !want_asc) {
+            /* Already descending — emit as-is. */
+            strsort_emit_ctx_t ectx = { sorted_idx, keys };
+            if (go_parallel)
+                ray_pool_dispatch(pool_p, strsort_emit_fn, &ectx, n_live);
+            else
+                strsort_emit_fn(&ectx, 0, 0, n_live);
+        } else if (run_dir != 0) {
+            /* Single run but wrong direction — reverse the keys, then emit. */
+            for (int64_t i = 0, j = n_live - 1; i < j; i++, j--) {
+                ray_strkey_t t = keys[i]; keys[i] = keys[j]; keys[j] = t;
+            }
+            strsort_emit_ctx_t ectx = { sorted_idx, keys };
+            if (go_parallel)
+                ray_pool_dispatch(pool_p, strsort_emit_fn, &ectx, n_live);
+            else
+                strsort_emit_fn(&ectx, 0, 0, n_live);
+        } else {
+            /* --- Top-level byte-0 partition. ---
+             * When parallel: per-task histograms, prefix-sum, parallel
+             * scatter into a second contiguous buffer; `keys_sorted` then
+             * points at that buffer.  When sequential (or when any scratch
+             * allocation fails): American-Flag in place on `keys`. */
+            ray_t* tmp_hdr = NULL;
+            ray_strkey_t* keys_sorted = keys;  /* where the final data lands */
+
+            if (!go_parallel || parts_bytes == 0) {
+                strsort_aflag(keys, n_live, /*bp=*/0, parts_bytes,
+                              /*base_offset=*/0, elems, pool, parts);
+            } else {
+                ray_strkey_t* tmp = (ray_strkey_t*)scratch_alloc(&tmp_hdr,
+                                        (size_t)n_live * sizeof(ray_strkey_t));
+                if (!tmp) {
+                    /* Fall back to sequential sort on OOM. */
+                    strsort_aflag(keys, n_live, /*bp=*/0, parts_bytes,
+                                  /*base_offset=*/0, elems, pool, parts);
+                } else {
+                    uint32_t n_tasks = ray_pool_total_workers(pool_p);
+                    if (n_tasks < 1) n_tasks = 1;
+
+                    ray_t* hist_hdr = NULL;
+                    ray_t* off_hdr  = NULL;
+                    uint32_t* hist = (uint32_t*)scratch_alloc(&hist_hdr,
+                                        (size_t)n_tasks * 256 * sizeof(uint32_t));
+                    int64_t*  off  = (int64_t*)scratch_alloc(&off_hdr,
+                                        (size_t)n_tasks * 256 * sizeof(int64_t));
+                    if (!hist || !off) {
+                        /* Free only the hist/off scratch allocated in this
+                         * branch; tmp_hdr is released once, by the cleanup
+                         * that follows the emit step further down. */
+                        scratch_free(hist_hdr); scratch_free(off_hdr);
+                        strsort_aflag(keys, n_live, /*bp=*/0, parts_bytes,
+                                      /*base_offset=*/0, elems, pool, parts);
+                    } else {
+                        strsort_top_ctx_t tctx = {
+                            .src = keys, .dst = tmp, .n = n_live,
+                            .n_tasks = n_tasks, .hist = hist, .offsets = off,
+                        };
+
+                        /* Phase 1: parallel histogram. */
+                        ray_pool_dispatch_n(pool_p, strsort_top_hist_fn,
+                                            &tctx, n_tasks);
+
+                        /* Phase 2: sequential prefix-sum.  For each bucket
+                         * b, the starting offset is the sum of all counts
+                         * in earlier buckets plus all counts in earlier
+                         * tasks for this bucket. */
+                        int64_t bucket_counts[256];
+                        int64_t bucket_starts[256];
+                        int64_t sum = 0;
+                        for (int b = 0; b < 256; b++) {
+                            bucket_starts[b] = sum;
+                            int64_t bc = 0;
+                            for (uint32_t t = 0; t < n_tasks; t++) {
+                                off[t * 256 + b] = sum + bc;
+                                bc += hist[t * 256 + b];
+                            }
+                            bucket_counts[b] = bc;
+                            sum += bc;
+                        }
+
+                        /* Phase 3: parallel scatter into tmp. */
+                        ray_pool_dispatch_n(pool_p, strsort_top_scatter_fn,
+                                            &tctx, n_tasks);
+
+                        /* tmp now holds the records partitioned by byte 0. */
+                        scratch_free(hist_hdr);
+                        scratch_free(off_hdr);
+
+                        /* Phase 4: parallel per-bucket recursive sort. */
+                        strsort_bucket_ctx_t bctx = {
+                            .keys        = tmp,
+                            .starts      = bucket_starts,
+                            .counts      = bucket_counts,
+                            .parts_bytes = parts_bytes,
+                            .base_offset = 0,
+                            .elems       = elems,
+                            .pool        = pool,
+                            .parts       = parts,
+                            .start_bp    = 1,
+                        };
+                        ray_pool_dispatch_n(pool_p, strsort_bucket_fn,
+                                            &bctx, 256);
+
+                        keys_sorted = tmp;  /* the sorted records live in tmp, not keys */
+                    }
+                }
+            }
+
+            /* Scatter row indices back (ASC order; parallel when go_parallel). */
+            strsort_emit_ctx_t ectx = { sorted_idx, keys_sorted };
+            if (go_parallel)
+                ray_pool_dispatch(pool_p, strsort_emit_fn, &ectx, n_live);
+            else
+                strsort_emit_fn(&ectx, 0, 0, n_live);
+
+            if (tmp_hdr) scratch_free(tmp_hdr);  /* single release point for tmp */
+
+            /* DESC reverses the sorted non-null range (ties are reversed too). */
+            if (desc) {
+                for (int64_t i = 0, j = n_live - 1; i < j; i++, j--) {
+                    int64_t t = sorted_idx[i];
+                    sorted_idx[i] = sorted_idx[j];
+                    sorted_idx[j] = t;
+                }
+            }
+        }
+
+        scratch_free(keys_hdr);
+    }
+
+    /* If nulls should be first, rotate them to the front. */
+    if (null_count > 0 && nulls_first) {
+        /* Cheap rotation via three reverses:
+         *   reverse [0, n_live); reverse [n_live, nrows); reverse [0, nrows)
+         * Takes O(nrows) swaps, no extra memory. */
+        int64_t a = 0, b = n_live - 1;
+        while (a < b) { int64_t t = sorted_idx[a]; sorted_idx[a] = sorted_idx[b]; sorted_idx[b] = t; a++; b--; }
+        a = n_live; b = nrows - 1;
+        while (a < b) { int64_t t = sorted_idx[a]; sorted_idx[a] = sorted_idx[b]; sorted_idx[b] = t; a++; b--; }
+        a = 0; b = nrows - 1;
+        while (a < b) { int64_t t = sorted_idx[a]; sorted_idx[a] = sorted_idx[b]; sorted_idx[b] = t; a++; b--; }
+    }
+
+    return true;
+}
+
+/* build_enum_rank() below — builds the SYM rank mapping: intern_id → sorted
+ * rank by string value.  Caller must scratch_free(*hdr_out) when done.
+ * Returns pointer to rank array of size (max_id + 1), or NULL on error. */
+/* Parallel max_id scan context: shared arguments for enum_max_fn. */
+typedef struct {
+    const void* data;    /* column payload handed to ray_read_sym */
+    int8_t      type;    /* column type tag handed to ray_read_sym */
+    uint8_t     attrs;   /* column attrs handed to ray_read_sym */
+    uint32_t*   pw_max;  /* per-worker max, indexed by worker id */
+} enum_max_ctx_t;
+
+/* Pool task: fold the largest intern id of rows [start, end) into pw_max[wid]. */
+static void enum_max_fn(void* arg, uint32_t wid, int64_t start, int64_t end) {
+    const enum_max_ctx_t* scan = (const enum_max_ctx_t*)arg;
+    uint32_t best = scan->pw_max[wid];  /* carry over what earlier calls left here */
+    for (int64_t row = start; row < end; row++) {
+        uint32_t id = (uint32_t)ray_read_sym(scan->data, row, scan->type, scan->attrs);
+        if (best < id) best = id;
+    }
+    scan->pw_max[wid] = best;
+}
+
+/* Build the SYM rank table for `col`: rank[intern_id] = position of that
+ * id's string in ascending byte-wise order (on a shared prefix the shorter
+ * string ranks first).  The table has max_id + 1 entries, max_id being the
+ * largest id read from the first nrows rows.  On success *hdr_out receives
+ * the scratch header the caller must scratch_free(); on failure *hdr_out
+ * is set to NULL and NULL is returned. */
+uint32_t* build_enum_rank(ray_t* col, int64_t nrows, ray_t** hdr_out) {
+    const void* data = ray_data(col);
+    int8_t type = col->type;
+    uint8_t attrs = col->attrs;
+
+    /* Find max intern ID (parallel for large columns) */
+    uint32_t max_id = 0;
+    ray_pool_t* pool = ray_pool_get();
+    if (pool && nrows > 100000) {
+        uint32_t nw = ray_pool_total_workers(pool);
+        if (nw < 1) nw = 1;  /* a zero-length VLA is undefined behaviour */
+        uint32_t pw_max[nw];
+        memset(pw_max, 0, nw * sizeof(uint32_t));
+        enum_max_ctx_t ectx = { .data = data, .type = type, .attrs = attrs, .pw_max = pw_max };
+        ray_pool_dispatch(pool, enum_max_fn, &ectx, nrows);
+        for (uint32_t w = 0; w < nw; w++)
+            if (pw_max[w] > max_id) max_id = pw_max[w];
+    } else {
+        for (int64_t i = 0; i < nrows; i++) {
+            uint32_t v = (uint32_t)ray_read_sym(data, i, type, attrs);
+            if (v > max_id) max_id = v;
+        }
+    }
+
+    if (max_id >= UINT32_MAX - 1) { *hdr_out = NULL; return NULL; }
+    uint32_t n_ids = max_id + 1;
+
+    /* Arena for temporaries (ids, ptrs, lens, tmp) — single reset at end */
+    ray_scratch_arena_t arena;
+    ray_scratch_arena_init(&arena);
+
+    /* Allocate array of intern IDs to sort */
+    uint32_t* ids = (uint32_t*)ray_scratch_arena_push(&arena,
+                        (size_t)n_ids * sizeof(uint32_t));
+    if (!ids) { ray_scratch_arena_reset(&arena); *hdr_out = NULL; return NULL; }
+    for (uint32_t i = 0; i < n_ids; i++) ids[i] = i;
+
+    /* Pre-cache raw string pointers and lengths for fast comparison */
+    const char** ptrs = (const char**)ray_scratch_arena_push(&arena,
+                             (size_t)n_ids * sizeof(const char*));
+    uint32_t* lens = (uint32_t*)ray_scratch_arena_push(&arena,
+                         (size_t)n_ids * sizeof(uint32_t));
+    if (!ptrs || !lens) { ray_scratch_arena_reset(&arena); *hdr_out = NULL; return NULL; }
+    for (uint32_t i = 0; i < n_ids; i++) {
+        ray_t* s = ray_sym_str((int64_t)i);
+        /* Ids without an interned string compare as the empty string. */
+        ptrs[i] = s ? ray_str_ptr(s) : NULL;
+        lens[i] = s ? (uint32_t)ray_str_len(s) : 0;
+    }
+
+    /* Merge sort intern IDs by full string comparison.  For ≤100K SYM
+     * values this completes in <1ms and correctly handles strings that
+     * share long common prefixes (e.g. "id000000001"–"id000099999"). */
+    {
+        uint32_t* tmp = (uint32_t*)ray_scratch_arena_push(&arena,
+                             (size_t)n_ids * sizeof(uint32_t));
+        if (!tmp) { ray_scratch_arena_reset(&arena); *hdr_out = NULL; return NULL; }
+
+        /* Bottom-up merge sort.  Run bounds are kept in 64 bits: n_ids may
+         * exceed 2^31, where doubling a uint32_t width would wrap to 0 and
+         * the loops would never terminate. */
+        for (uint64_t width = 1; width < n_ids; width *= 2) {
+            for (uint64_t lo = 0; lo < n_ids; lo += 2 * width) {
+                uint64_t mid = lo + width;
+                if (mid > n_ids) mid = n_ids;
+                uint64_t hi = lo + 2 * width;
+                if (hi > n_ids) hi = n_ids;
+                /* Merge ids[lo..mid) and ids[mid..hi) into tmp[lo..hi) */
+                uint64_t a = lo, b = mid, k = lo;
+                while (a < mid && b < hi) {
+                    uint32_t ia = ids[a], ib = ids[b];
+                    uint32_t la = lens[ia], lb = lens[ib];
+                    uint32_t ml = la < lb ? la : lb;
+                    int cmp = 0;
+                    if (ml > 0) cmp = memcmp(ptrs[ia], ptrs[ib], ml);
+                    if (cmp == 0) cmp = (la > lb) - (la < lb);
+                    if (cmp <= 0) tmp[k++] = ids[a++];
+                    else          tmp[k++] = ids[b++];
+                }
+                while (a < mid) tmp[k++] = ids[a++];
+                while (b < hi)  tmp[k++] = ids[b++];
+            }
+            /* Swap ids and tmp */
+            uint32_t* s = ids; ids = tmp; tmp = s;
+        }
+    }
+
+    /* Build rank[intern_id] = sorted position (output — not arena'd) */
+    ray_t* rank_hdr;
+    uint32_t* rank = (uint32_t*)scratch_calloc(&rank_hdr,
+                        (size_t)n_ids * sizeof(uint32_t));
+    if (!rank) { ray_scratch_arena_reset(&arena); *hdr_out = NULL; return NULL; }
+    for (uint32_t i = 0; i < n_ids; i++)
+        rank[ids[i]] = i;
+
+    ray_scratch_arena_reset(&arena);  /* free all temporaries at once */
+    *hdr_out = rank_hdr;
+    return rank;
+}
+
+/* Stable in-place insertion sort of arr[0..n), ordered by sort_cmp; this is
+ * the small-run base case of sort_merge_recursive. */
+void sort_insertion(const sort_cmp_ctx_t* ctx, int64_t* arr, int64_t n) {
+    for (int64_t next = 1; next < n; next++) {
+        const int64_t pending = arr[next];
+        int64_t hole = next;
+        /* Shift strictly-greater entries right; equal ones stay put. */
+        for (; hole > 0 && sort_cmp(ctx, arr[hole - 1], pending) > 0; hole--)
+            arr[hole] = arr[hole - 1];
+        arr[hole] = pending;
+    }
+}
+
+/* Single-threaded top-down merge sort of arr[0..n) using sort_cmp.  tmp must
+ * offer n scratch slots.  Runs of at most 64 entries are delegated to
+ * sort_insertion; on ties the merge emits the left half's entry first. */
+void sort_merge_recursive(const sort_cmp_ctx_t* ctx,
+                                  int64_t* arr, int64_t* tmp, int64_t n) {
+    if (n > 64) {
+        const int64_t half = n / 2;
+        sort_merge_recursive(ctx, arr, tmp, half);
+        sort_merge_recursive(ctx, arr + half, tmp + half, n - half);
+
+        /* Two-finger merge of arr[0..half) and arr[half..n) into tmp. */
+        const int64_t *left = arr, *right = arr + half;
+        const int64_t *left_end = arr + half, *right_end = arr + n;
+        int64_t* out = tmp;
+        while (left < left_end && right < right_end)
+            *out++ = (sort_cmp(ctx, *left, *right) <= 0) ? *left++ : *right++;
+        while (left < left_end) *out++ = *left++;
+        while (right < right_end) *out++ = *right++;
+        memcpy(arr, tmp, (size_t)n * sizeof(int64_t));  /* merged run back into arr */
+    } else {
+        sort_insertion(ctx, arr, n);
+    }
+}
+
+/* sort_phase1_ctx_t defined in exec_internal.h */
+
+/* Pool task: independently merge-sort each chunk in [start, end) of ctx->indices. */
+void sort_phase1_fn(void* arg, uint32_t worker_id, int64_t start, int64_t end) {
+    sort_phase1_ctx_t* job = (sort_phase1_ctx_t*)arg;
+    (void)worker_id;
+    for (int64_t c = start; c < end; c++) {
+        int64_t chunk_size = (job->nrows + job->n_chunks - 1) / job->n_chunks;
+        int64_t first = c * chunk_size;
+        int64_t stop = (first + chunk_size > job->nrows) ? job->nrows : first + chunk_size;
+        if (first < stop)  /* chunks that start past the last row are empty */
+            sort_merge_recursive(job->cmp_ctx, job->indices + first, job->tmp + first, stop - first);
+    }
+}
+
+/* Stable merge of the adjacent sorted runs src[lo..mid) and src[mid..hi) into
+ * dst[lo..hi); on ties the entry from the left run is written first. */
+static void merge_runs(const sort_cmp_ctx_t* ctx,
+                        const int64_t* src, int64_t* dst,
+                        int64_t lo, int64_t mid, int64_t hi) {
+    /* Walk both runs with read cursors and one write cursor into dst. */
+    const int64_t *a = src + lo, *a_end = src + mid;
+    const int64_t *b = src + mid, *b_end = src + hi;
+    int64_t* out = dst + lo;
+    while (a < a_end && b < b_end)
+        *out++ = (sort_cmp(ctx, *a, *b) <= 0) ? *a++ : *b++;
+    while (a < a_end) *out++ = *a++;
+    while (b < b_end) *out++ = *b++;
+}
+
+/* sort_merge_ctx_t defined in exec_internal.h */
+
+/* Pool task for one merge pass: for every run pair p in [start, end), merge
+ * src rows [2p*run_size, (2p+2)*run_size), clipped to nrows, into dst. */
+void sort_merge_fn(void* arg, uint32_t worker_id, int64_t start, int64_t end) {
+    sort_merge_ctx_t* pass = (sort_merge_ctx_t*)arg;
+    (void)worker_id;
+    for (int64_t pair = start; pair < end; pair++) {
+        int64_t lo = pair * 2 * pass->run_size;
+        int64_t mid = lo + pass->run_size;
+        int64_t hi = mid + pass->run_size;
+        if (mid > pass->nrows) mid = pass->nrows;
+        if (hi > pass->nrows) hi = pass->nrows;
+        if (lo >= pass->nrows) continue;  /* pair lies wholly past the data */
+        if (mid < hi)
+            merge_runs(pass->cmp_ctx, pass->src, pass->dst, lo, mid, hi);
+        else  /* unpaired trailing run: carry it over unchanged */
+            memcpy(pass->dst + lo, pass->src + lo, (size_t)(hi - lo) * sizeof(int64_t));
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Parallel multi-key min/max prescan for composite radix sort.
+ * Each worker scans all n_keys columns over its row range, then the main
+ * thread merges per-worker results.
+ * -------------------------------------------------------------------------- */
+
+/* MK_PRESCAN_MAX_KEYS, mk_prescan_ctx_t defined in exec_internal.h */
+
+/* Pool task: widen this worker's per-key [min, max] over rows [start, end).
+ * Worker wid owns n_keys slots at pw_mins/pw_maxs + wid * n_keys; the slots
+ * accumulate across calls, so the caller must seed them before dispatch
+ * (presumably INT64_MAX / INT64_MIN — confirm at the call site).  Columns
+ * whose type matches no branch below leave their slots untouched.
+ * NOTE(review): F64 values are mapped to the order-preserving *unsigned*
+ * key and then compared as int64_t — confirm callers expect that ordering. */
+void mk_prescan_fn(void* arg, uint32_t wid,
+                           int64_t start, int64_t end) {
+    mk_prescan_ctx_t* c = (mk_prescan_ctx_t*)arg;
+    uint8_t nk = c->n_keys;
+    int64_t* my_mins = c->pw_mins + (int64_t)wid * nk;
+    int64_t* my_maxs = c->pw_maxs + (int64_t)wid * nk;
+
+    for (uint8_t k = 0; k < nk; k++) {
+        ray_t* col = c->vecs[k];
+        int64_t kmin = my_mins[k], kmax = my_maxs[k];
+
+        if (c->enum_ranks[k]) {
+            const void* cdata = ray_data(col);
+            int8_t ctype = col->type;
+            uint8_t cattrs = col->attrs;
+            const uint32_t* ranks = c->enum_ranks[k];
+            for (int64_t i = start; i < end; i++) {
+                uint32_t raw = (uint32_t)ray_read_sym(cdata, i, ctype, cattrs);
+                int64_t v = (int64_t)ranks[raw];  /* compare ranks, not raw ids */
+                if (v < kmin) kmin = v;
+                if (v > kmax) kmax = v;
+            }
+        } else if (col->type == RAY_I64 || col->type == RAY_TIMESTAMP) {
+            const int64_t* d = (const int64_t*)ray_data(col);
+            for (int64_t i = start; i < end; i++) {
+                if (d[i] < kmin) kmin = d[i];
+                if (d[i] > kmax) kmax = d[i];
+            }
+        } else if (col->type == RAY_F64) {
+            const double* d = (const double*)ray_data(col);
+            for (int64_t i = start; i < end; i++) {
+                uint64_t bits;
+                memcpy(&bits, &d[i], 8);
+                uint64_t mask = -(bits >> 63) | ((uint64_t)1 << 63);
+                int64_t v = (int64_t)(bits ^ mask);
+                if (v < kmin) kmin = v;
+                if (v > kmax) kmax = v;
+            }
+        } else if (col->type == RAY_I32 || col->type == RAY_DATE || col->type == RAY_TIME) {
+            const int32_t* d = (const int32_t*)ray_data(col);
+            for (int64_t i = start; i < end; i++) {
+                int64_t v = (int64_t)d[i];
+                if (v < kmin) kmin = v;
+                if (v > kmax) kmax = v;
+            }
+        } else if (col->type == RAY_I16) {
+            const int16_t* d = (const int16_t*)ray_data(col);
+            for (int64_t i = start; i < end; i++) {
+                int64_t v = (int64_t)d[i];
+                if (v < kmin) kmin = v;
+                if (v > kmax) kmax = v;
+            }
+        } else if (col->type == RAY_BOOL || col->type == RAY_U8) {
+            const uint8_t* d = (const uint8_t*)ray_data(col);
+            for (int64_t i = start; i < end; i++) {
+                int64_t v = (int64_t)d[i];
+                if (v < kmin) kmin = v;
+                if (v > kmax) kmax = v;
+            }
+        }
+
+        my_mins[k] = kmin;
+        my_maxs[k] = kmax;
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Top-N heap selection: for ORDER BY ... LIMIT N where N is small,
+ * a single-pass heap beats the 8-pass radix sort.
+ * -------------------------------------------------------------------------- */
+
+typedef struct { uint64_t key; int64_t idx; } topn_entry_t;  /* heap entry: sort key + row index */
+
+/* Max-heap sift-down: sink h[i] until no child inside h[0..n) has a larger key. */
+static inline void topn_sift_down(topn_entry_t* h, int64_t n, int64_t i) {
+    int64_t child;
+    while ((child = 2 * i + 1) < n) {
+        if (child + 1 < n && h[child + 1].key > h[child].key) child++;  /* larger child */
+        if (h[child].key <= h[i].key) return;
+        topn_entry_t moved = h[i]; h[i] = h[child]; h[child] = moved;
+        i = child;
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Fused encode + top-N: composite-key encode and heap insert in one pass,
+ * avoiding the 80MB intermediate keys array.
+ * -------------------------------------------------------------------------- */
+
+typedef struct {
+    int64_t         limit;   /* K: heap capacity per worker */
+    topn_entry_t*   heaps;   /* [n_workers][limit] */
+    int64_t*        counts;  /* entries currently held in each worker's heap */
+    /* Composite-key encode params (same as radix_encode_ctx_t fields): */
+    uint8_t         n_keys;  /* number of key columns read from vecs */
+    ray_t**          vecs;   /* key columns, indexed by key position */
+    int64_t         mins[16];        /* subtracted from each key value */
+    int64_t         ranges[16];      /* part becomes ranges[k] - part when descs[k] is set */
+    uint8_t         bit_shifts[16];  /* left shift placing key k inside the composite */
+    uint8_t         descs[16];       /* non-zero: mirror key k within its range */
+    const uint32_t* enum_ranks[16];  /* non-NULL: map the raw sym id to a rank first */
+} fused_topn_ctx_t;
+
+/* Pool task: encode the composite key of every row in [start, end) and keep the
+ * K = c->limit smallest keys seen by worker wid in its max-heap slice. */
+__attribute__((unused))
+static void fused_topn_fn(void* arg, uint32_t wid,
+                           int64_t start, int64_t end) {
+    fused_topn_ctx_t* c = (fused_topn_ctx_t*)arg;
+    int64_t K = c->limit;
+    if (K <= 0) return;  /* empty heap: heap[0] below would be out of bounds */
+    topn_entry_t* heap = c->heaps + (int64_t)wid * K;
+    int64_t cnt = c->counts[wid];
+    uint8_t nk = c->n_keys;
+
+    for (int64_t i = start; i < end; i++) {
+        /* Inline composite key encode */
+        uint64_t composite = 0;
+        for (uint8_t k = 0; k < nk; k++) {
+            ray_t* col = c->vecs[k];
+            int64_t val = 0;  /* column types not handled below contribute 0 */
+            if (c->enum_ranks[k]) {
+                uint32_t raw = (uint32_t)ray_read_sym(ray_data(col), i, col->type, col->attrs);
+                val = (int64_t)c->enum_ranks[k][raw];
+            } else if (col->type == RAY_I64 || col->type == RAY_TIMESTAMP) {
+                val = ((const int64_t*)ray_data(col))[i];
+            } else if (col->type == RAY_F64) {
+                uint64_t bits;
+                memcpy(&bits, &((const double*)ray_data(col))[i], 8);
+                uint64_t mask = -(bits >> 63) | ((uint64_t)1 << 63);
+                val = (int64_t)(bits ^ mask);
+            } else if (col->type == RAY_I32 || col->type == RAY_DATE || col->type == RAY_TIME) {
+                val = (int64_t)((const int32_t*)ray_data(col))[i];
+            } else if (col->type == RAY_I16) {
+                val = (int64_t)((const int16_t*)ray_data(col))[i];
+            } else if (col->type == RAY_BOOL || col->type == RAY_U8) {
+                val = (int64_t)((const uint8_t*)ray_data(col))[i];
+            }
+            uint64_t part = (uint64_t)val - (uint64_t)c->mins[k];
+            if (c->descs[k]) part = (uint64_t)c->ranges[k] - part;
+            composite |= part << c->bit_shifts[k];
+        }
+
+        /* Inline heap insert */
+        if (cnt < K) {
+            heap[cnt].key = composite;
+            heap[cnt].idx = i;
+            cnt++;
+            if (cnt == K)  /* slice just filled: establish the max-heap property */
+                for (int64_t j = K/2 - 1; j >= 0; j--)
+                    topn_sift_down(heap, K, j);
+        } else if (composite < heap[0].key) {
+            heap[0].key = composite;
+            heap[0].idx = i;
+            topn_sift_down(heap, K, 0);
+        }
+    }
+    c->counts[wid] = cnt;
+}
+
+typedef struct {
+    const uint64_t* keys;    /* sort key per row, indexed by row */
+    int64_t         limit;   /* K: heap capacity per worker */
+    topn_entry_t*   heaps;   /* [n_workers][limit] */
+    int64_t*        counts;  /* actual count per worker */
+} topn_ctx_t;
+
+/* Pool task: keep the K = c->limit smallest keys of rows [start, end) in worker
+ * wid's max-heap slice; c->counts[wid] carries the fill level across calls. */
+__attribute__((unused))
+static void topn_scan_fn(void* arg, uint32_t wid, int64_t start, int64_t end) {
+    topn_ctx_t* c = (topn_ctx_t*)arg;
+    int64_t K = c->limit;
+    if (K <= 0) return;  /* empty heap: heap[0] below would be out of bounds */
+    topn_entry_t* heap = c->heaps + (int64_t)wid * K;
+    int64_t cnt = c->counts[wid];   /* accumulate across morsels */
+    for (int64_t i = start; i < end; i++) {
+        uint64_t k = c->keys[i];
+        if (cnt < K) {
+            heap[cnt].key = k;
+            heap[cnt].idx = i;
+            cnt++;
+            if (cnt == K)  /* slice just filled: establish the max-heap property */
+                for (int64_t j = K/2 - 1; j >= 0; j--)
+                    topn_sift_down(heap, K, j);
+        } else if (k < heap[0].key) {
+            heap[0].key = k;
+            heap[0].idx = i;
+            topn_sift_down(heap, K, 0);
+        }
+    }
+    c->counts[wid] = cnt;
+}
+
+#define TOPN_MAX 8192  /* max limit for heap-based top-N; bounds the merge VLA (≤ 128KB) */
+
+/* Merge the per-worker heaps into out[0..ret): row indices of the (at most
+ * `limit`, capped at TOPN_MAX) smallest keys, ascending by key.  Returns ret. */
+__attribute__((unused))
+static int64_t topn_merge_fused(fused_topn_ctx_t* ctx, uint32_t n_workers,
+                                 int64_t* out, int64_t limit) {
+    if (limit <= 0) return 0;  /* a VLA of non-positive size is undefined */
+    if (limit > TOPN_MAX) limit = TOPN_MAX;  /* VLA stack safety (≤ 128KB) */
+    topn_entry_t merge[limit];
+    int64_t cnt = 0;
+    for (uint32_t w = 0; w < n_workers; w++) {
+        topn_entry_t* wh = ctx->heaps + (int64_t)w * ctx->limit;  /* stride the slices were written with */
+        int64_t wc = ctx->counts[w];
+        for (int64_t j = 0; j < wc; j++) {
+            if (cnt < limit) {
+                merge[cnt++] = wh[j];
+                if (cnt == limit)  /* buffer just filled: make it a max-heap */
+                    for (int64_t m = limit/2 - 1; m >= 0; m--)
+                        topn_sift_down(merge, limit, m);
+            } else if (wh[j].key < merge[0].key) {
+                merge[0] = wh[j];
+                topn_sift_down(merge, limit, 0);
+            }
+        }
+    }
+    if (cnt > 1) {  /* heapsort for ascending order */
+        for (int64_t m = cnt/2 - 1; m >= 0; m--)
+            topn_sift_down(merge, cnt, m);
+        for (int64_t i = cnt - 1; i > 0; i--) {
+            topn_entry_t t = merge[0]; merge[0] = merge[i]; merge[i] = t;
+            topn_sift_down(merge, i, 0);
+        }
+    }
+    for (int64_t i = 0; i < cnt; i++) out[i] = merge[i].idx;
+    return cnt;
+}
+
+/* Merge per-worker heaps → sorted indices in out[0..return_val-1], ascending
+ * by key.  At most `limit` (capped at TOPN_MAX) entries are produced; a
+ * non-positive limit produces none. */
+__attribute__((unused))
+static int64_t topn_merge(topn_ctx_t* ctx, uint32_t n_workers,
+                           int64_t* out, int64_t limit) {
+    if (limit <= 0) return 0;  /* a VLA of non-positive size is undefined */
+    if (limit > TOPN_MAX) limit = TOPN_MAX;  /* VLA stack safety (≤ 128KB) */
+    topn_entry_t merge[limit];
+    int64_t cnt = 0;
+
+    for (uint32_t w = 0; w < n_workers; w++) {
+        topn_entry_t* wh = ctx->heaps + (int64_t)w * ctx->limit;  /* stride the slices were written with */
+        int64_t wc = ctx->counts[w];
+        for (int64_t j = 0; j < wc; j++) {
+            if (cnt < limit) {
+                merge[cnt++] = wh[j];
+                if (cnt == limit)  /* buffer just filled: make it a max-heap */
+                    for (int64_t m = limit/2 - 1; m >= 0; m--)
+                        topn_sift_down(merge, limit, m);
+            } else if (wh[j].key < merge[0].key) {
+                merge[0] = wh[j];
+                topn_sift_down(merge, limit, 0);
+            }
+        }
+    }
+
+    /* Heapsort for ascending order */
+    if (cnt > 1) {
+        for (int64_t m = cnt/2 - 1; m >= 0; m--)
+            topn_sift_down(merge, cnt, m);
+        for (int64_t i = cnt - 1; i > 0; i--) {
+            topn_entry_t t = merge[0]; merge[0] = merge[i]; merge[i] = t;
+            topn_sift_down(merge, i, 0);
+        }
+    }
+
+    for (int64_t i = 0; i < cnt; i++) out[i] = merge[i].idx;
+    return cnt;
+}
+
+/* Decode sorted radix keys directly into the typed output vector `dst`,
+ * which must hold n elements of `type`.  Output is written front to back —
+ * sequential writes, no random access.  When desc is set each key is
+ * bit-complemented before it is decoded.  A `type` that matches none of
+ * the branches below leaves dst untouched. */
+static void radix_decode_into(void* dst, int8_t type, const uint64_t* sorted_keys,
+                               int64_t n, bool desc) {
+    /* key ^ all-ones == ~key and key ^ 0 == key, so a single XOR mask
+     * serves both sort directions and every branch shares one loop
+     * shape instead of separate asc/desc copies. */
+    const uint64_t flip = desc ? ~(uint64_t)0 : (uint64_t)0;
+    const uint64_t sign64 = (uint64_t)1 << 63;  /* MSB of a 64-bit key */
+
+    if (type == RAY_I64 || type == RAY_TIMESTAMP) {
+        /* 64-bit integers: toggle the top bit of the key. */
+        int64_t* out = (int64_t*)dst;
+        for (int64_t i = 0; i < n; i++)
+            out[i] = (int64_t)((sorted_keys[i] ^ flip) ^ sign64);
+    } else if (type == RAY_F64) {
+        double* out = (double*)dst;
+        for (int64_t i = 0; i < n; i++) {
+            uint64_t k = sorted_keys[i] ^ flip;
+            /* Inverse of encode: a key with MSB=1 came from a double whose
+             * sign bit was clear (toggle the sign bit back); a key with
+             * MSB=0 came from a double whose sign bit was set (toggle
+             * every bit back). */
+            uint64_t bits = (k & sign64) ? (k ^ sign64) : ~k;
+            memcpy(&out[i], &bits, 8);
+        }
+    } else if (type == RAY_I32 || type == RAY_DATE || type == RAY_TIME) {
+        /* 32-bit integers: keep the low 32 key bits, toggle bit 31. */
+        int32_t* out = (int32_t*)dst;
+        for (int64_t i = 0; i < n; i++)
+            out[i] = (int32_t)((uint32_t)(sorted_keys[i] ^ flip) ^ ((uint32_t)1 << 31));
+    } else if (type == RAY_I16) {
+        /* 16-bit integers: keep the low 16 key bits, toggle bit 15. */
+        int16_t* out = (int16_t*)dst;
+        for (int64_t i = 0; i < n; i++)
+            out[i] = (int16_t)((uint16_t)(sorted_keys[i] ^ flip) ^ ((uint16_t)1 << 15));
+    } else if (type == RAY_BOOL || type == RAY_U8) {
+        /* Bytes: the low 8 key bits are the value itself. */
+        uint8_t* out = (uint8_t*)dst;
+        for (int64_t i = 0; i < n; i++)
+            out[i] = (uint8_t)(sorted_keys[i] ^ flip);
+    }
+}
+
+/* Sort columns and return index array (extended: optionally returns sorted keys).
+ * cols:        array of n_cols vectors (sort keys, most significant first)
+ * descs:       array of n_cols flags (0=asc, 1=desc), or NULL for all-asc
+ * nulls_first: array of n_cols flags (0=nulls last, 1=nulls first), or NULL
+ *              for default convention (nulls first for asc, nulls last for desc)
+ * n_cols:      number of sort key columns (max 16)
+ * nrows:       number of rows in each column
+ * sorted_keys_out: if non-NULL, receives sorted radix keys (caller frees keys_hdr_out)
+ * keys_hdr_out:    if non-NULL, receives scratch header for sorted_keys_out
+ * Returns:     ray_t* I64 vector of sorted indices (caller owns), or RAY_ERROR */
+static ray_t* sort_indices_ex(ray_t** cols, uint8_t* descs, uint8_t* nulls_first,
+                               uint8_t n_cols, int64_t nrows,
+                               uint64_t** sorted_keys_out, ray_t** keys_hdr_out) {
+    if (n_cols == 0 || nrows <= 0)
+        return ray_vec_new(RAY_I64, 0);
+    if (n_cols > 16)
+        return ray_error("nyi", NULL);
+
+    /* Allocate index array */
+    ray_t* indices_hdr;
+    int64_t* indices = (int64_t*)scratch_alloc(&indices_hdr,
+                            (size_t)nrows * sizeof(int64_t));
+    if (!indices) return ray_error("oom", NULL);
+    bool iota_done = false;
+
+    /* --- Radix sort fast path ------------------------------------------------
+     * Try radix sort for integer/float/enum keys.  Falls back to merge sort
+     * for unsupported types (SYM with arbitrary strings, mixed types, etc.). */
+    bool radix_done = false;
+    int64_t* sorted_idx = indices;  /* may point to itmp after radix sort */
+    ray_t* radix_itmp_hdr = NULL;   /* kept alive until we copy out */
+    ray_t* enum_rank_hdrs[n_cols];
+    memset(enum_rank_hdrs, 0, n_cols * sizeof(ray_t*));
+
+    if (nrows > 64) {
+        /* RAY_STR single-key fast path — dedicated MSD byte-radix
+         * sort.  Handles variable-width strings, nulls, and DESC
+         * internally; skips the rest of sort_indices_ex on success. */
+        if (n_cols == 1 && cols[0]->type == RAY_STR) {
+            bool desc = descs ? descs[0] : 0;
+            bool nf   = nulls_first ? nulls_first[0] : !desc;
+            if (sort_str_msd_inplace(indices, nrows, cols[0], desc, nf)) {
+                sorted_idx = indices;
+                iota_done = true;
+                radix_done = true;
+                goto str_msd_done;
+            }
+            /* OOM — fall through to comparison merge sort. */
+        }
+
+        /* Check if all sort keys are radix-sortable types.
+         * RAY_STR and RAY_GUID are accepted for multi-key sorts only:
+         * they have no packed uint64 encoding, so the composite-radix
+         * path can't fit them, but the rank-then-compose fallback handles
+         * them via single-key sort_indices_ex recursion (which hits the
+         * RAY_STR MSD byte-radix path for strings, or the merge-sort
+         * path with the new RAY_GUID comparator for guids). */
+        bool can_radix = true;
+        bool has_wide_key = false;  /* RAY_STR or RAY_GUID — forces rank fallback */
+        for (uint8_t k = 0; k < n_cols; k++) {
+            if (!cols[k]) { can_radix = false; break; }
+            int8_t t = cols[k]->type;
+            if (t == RAY_STR || t == RAY_GUID) { has_wide_key = true; continue; }
+            if (t != RAY_I64 && t != RAY_F64 && t != RAY_I32 && t != RAY_I16 &&
+                t != RAY_BOOL && t != RAY_U8 && t != RAY_SYM &&
+                t != RAY_DATE && t != RAY_TIME && t != RAY_TIMESTAMP) {
+                can_radix = false; break;
+            }
+        }
+        /* Single-key wide types: RAY_STR has its own MSD fast path above;
+         * single-key RAY_GUID falls through to merge sort with the new
+         * comparator. In both cases the multi-key composite path is not
+         * applicable, so disable the radix branch. */
+        if (has_wide_key && n_cols == 1) can_radix = false;
+
+        if (can_radix) {
+            ray_pool_t* pool = ray_pool_get();
+
+            /* Build SYM rank mappings (intern_id -> sorted rank by string) */
+            uint32_t* enum_ranks[n_cols];
+            memset(enum_ranks, 0, n_cols * sizeof(uint32_t*));
+            for (uint8_t k = 0; k < n_cols; k++) {
+                if (RAY_IS_SYM(cols[k]->type)) {
+                    enum_ranks[k] = build_enum_rank(cols[k], nrows,
+                                                     &enum_rank_hdrs[k]);
+                    if (!enum_ranks[k]) { can_radix = false; break; }
+                }
+            }
+
+            if (can_radix && n_cols == 1) {
+                /* --- Single-key sort --- */
+                uint8_t key_nbytes_max = radix_key_bytes(cols[0]->type);
+
+                /* Skip pool for small arrays - dispatch overhead dominates */
+                ray_pool_t* sk_pool = (nrows >= SMALL_POOL_THRESHOLD) ? pool : NULL;
+
+                /* Encode keys (needed by all paths) */
+                ray_t *keys_hdr;
+                uint64_t* keys = (uint64_t*)scratch_alloc(&keys_hdr,
+                                    (size_t)nrows * sizeof(uint64_t));
+                if (keys) {
+                    bool desc = descs ? descs[0] : 0;
+                    /* Null = minimum value.
+                     * ASC → nulls first, DESC → nulls last. */
+                    bool nf = nulls_first ? nulls_first[0] : !desc;
+                    radix_encode_ctx_t enc = {
+                        .keys = keys, .indices = indices,
+                        .data = ray_data(cols[0]),
+                        .col = cols[0],
+                        .type = cols[0]->type,
+                        .col_attrs = cols[0]->attrs,
+                        .desc = desc,
+                        .nulls_first = nf,
+                        .enum_rank = enum_ranks[0], .n_keys = 1,
+                    };
+                    if (sk_pool)
+                        ray_pool_dispatch(sk_pool, radix_encode_fn, &enc, nrows);
+                    else
+                        radix_encode_fn(&enc, 0, 0, nrows);
+                    iota_done = true;
+
+                    if (nrows <= RADIX_SORT_THRESHOLD) {
+                        /* Introsort on encoded keys - faster than multi-pass
+                         * radix for small arrays (avoids scatter overhead). */
+                        key_introsort(keys, indices, nrows);
+                        sorted_idx = indices;
+                        radix_done = true;
+                    } else {
+                        /* Data-range-adaptive byte count: scan encoded keys
+                         * to skip bytes that are uniform across all values,
+                         * avoiding wasteful histogram passes. */
+                        uint8_t key_nbytes = compute_key_nbytes(
+                            sk_pool, keys, nrows, key_nbytes_max);
+
+                        /* Try packed radix sort: pack key + index into one
+                         * uint64_t to halve memory traffic per pass.
+                         * Feasible when key_nbytes*8 + index_bits <= 64. */
+                        uint8_t idx_bits = 0;
+                        { int64_t nn = nrows; while (nn > 0) { idx_bits++; nn >>= 1; } }
+                        bool use_packed = (key_nbytes <= 3
+                                           && key_nbytes * 8 + idx_bits <= 64);
+
+                        if (use_packed) {
+                            uint8_t key_bits = key_nbytes * 8;
+                            ray_t *ptmp_hdr;
+                            uint64_t* ptmp = (uint64_t*)scratch_alloc(&ptmp_hdr,
+                                                (size_t)nrows * sizeof(uint64_t));
+                            if (ptmp) {
+                                /* Fuse packing with sortedness + reverse detection */
+                                uint32_t pd_nw = sk_pool ? ray_pool_total_workers(sk_pool) : 1;
+                                int64_t pd_pw[pd_nw], pd_nr[pd_nw];
+                                memset(pd_pw, 0, (size_t)pd_nw * sizeof(int64_t));
+                                memset(pd_nr, 0, (size_t)pd_nw * sizeof(int64_t));
+                                uint64_t key_mask_pd =
+                                    (key_bits < 64) ? ((1ULL << key_bits) - 1) : ~0ULL;
+                                packed_detect_ctx_t pd_ctx = {
+                                    .keys = keys, .key_bits = key_bits,
+                                    .key_mask = key_mask_pd,
+                                    .pw_unsorted = pd_pw, .pw_not_reverse = pd_nr,
+                                };
+
+                                if (sk_pool)
+                                    ray_pool_dispatch(sk_pool, packed_detect_fn, &pd_ctx, nrows);
+                                else
+                                    packed_detect_fn(&pd_ctx, 0, 0, nrows);
+
+                                /* Aggregate sortedness results */
+                                int64_t total_unsorted = 0, total_not_rev = 0;
+                                for (uint32_t t = 0; t < pd_nw; t++) {
+                                    total_unsorted += pd_pw[t];
+                                    total_not_rev += pd_nr[t];
+                                }
+                                /* Check cross-task boundaries */
+                                int64_t grain = RAY_DISPATCH_MORSELS * RAY_MORSEL_ELEMS;
+                                uint64_t key_mask_s =
+                                    (key_bits < 64) ? ((1ULL << key_bits) - 1) : ~0ULL;
+                                for (int64_t b = grain; b < nrows; b += grain) {
+                                    uint64_t ka = keys[b-1] & key_mask_s;
+                                    uint64_t kb2 = keys[b] & key_mask_s;
+                                    if (kb2 < ka) total_unsorted++;
+                                    if (kb2 > ka) total_not_rev++;
+                                }
+
+                                if (total_unsorted == 0) {
+                                    /* Already sorted - identity permutation */
+                                    sorted_idx = indices;
+                                    radix_done = true;
+                                } else if (total_not_rev == 0 && nrows > 1) {
+                                    /* Reverse-sorted - reverse indices in O(n) */
+                                    for (int64_t i = 0; i < nrows; i++)
+                                        indices[i] = nrows - 1 - i;
+                                    sorted_idx = indices;
+                                    radix_done = true;
+                                } else {
+                                    /* Packed radix sort - half the memory traffic */
+                                    uint64_t* sorted = packed_radix_sort_run(
+                                        sk_pool, keys, ptmp, nrows, key_nbytes);
+
+                                    if (sorted) {
+                                        uint64_t idx_mask =
+                                            (idx_bits < 64) ? ((1ULL << idx_bits) - 1) : ~0ULL;
+
+                                        /* Packed path: keys are truncated to key_bits,
+                                         * not full 64-bit encoded keys — can't decode. */
+                                        packed_unpack_ctx_t up = {
+                                            .sorted = sorted, .indices = indices,
+                                            .keys_out = NULL,
+                                            .key_bits = key_bits,
+                                            .idx_mask = idx_mask, .key_mask = 0,
+                                            .extract_keys = false,
+                                        };
+                                        if (sk_pool)
+                                            ray_pool_dispatch(sk_pool, packed_unpack_fn, &up, nrows);
+                                        else
+                                            packed_unpack_fn(&up, 0, 0, nrows);
+
+                                        sorted_idx = indices;
+                                        radix_done = true;
+                                    }
+                                }
+                            }
+                            scratch_free(ptmp_hdr);
+                        } else {
+                            /* Non-packed path: detect sortedness first */
+                            double us_frac2 = detect_sortedness(sk_pool, keys, nrows);
+                            if (us_frac2 == 0.0) {
+                                sorted_idx = indices;
+                                radix_done = true;
+                            }
+                            /* Standard dual-array radix sort */
+                            if (!radix_done) {
+                                ray_t *ktmp_hdr, *itmp_hdr;
+                                uint64_t* ktmp = (uint64_t*)scratch_alloc(&ktmp_hdr,
+                                                    (size_t)nrows * sizeof(uint64_t));
+                                int64_t*  itmp = (int64_t*)scratch_alloc(&itmp_hdr,
+                                                    (size_t)nrows * sizeof(int64_t));
+                                if (ktmp && itmp) {
+                                    bool want_sk = sorted_keys_out
+                                        && !RAY_IS_SYM(cols[0]->type);
+                                    uint64_t* sk_out = NULL;
+                                    sorted_idx = msd_radix_sort_run(sk_pool, keys, indices,
+                                                                     ktmp, itmp, nrows,
+                                                                     key_nbytes,
+                                                                     want_sk ? &sk_out : NULL);
+                                    radix_done = (sorted_idx != NULL);
+                                    if (radix_done && want_sk && sk_out) {
+                                        *sorted_keys_out = sk_out;
+                                        if (sk_out == ktmp) {
+                                            *keys_hdr_out = ktmp_hdr;
+                                            ktmp_hdr = NULL;
+                                        } else {
+                                            /* Even number of radix passes:
+                                             * sorted keys ended up in the
+                                             * original keys buffer. */
+                                            *keys_hdr_out = keys_hdr;
+                                            keys_hdr = NULL;
+                                        }
+                                    }
+                                }
+                                if (ktmp_hdr) scratch_free(ktmp_hdr);
+                                if (sorted_idx != itmp) scratch_free(itmp_hdr);
+                                else radix_itmp_hdr = itmp_hdr;
+                            }
+                        }
+                    }
+                }
+                scratch_free(keys_hdr);
+
+            } else if (can_radix && n_cols > 1) {
+                /* --- Multi-key composite radix sort --- */
+                int64_t mins[n_cols], maxs[n_cols];
+                /* Wider accumulator: up to 16 keys * 63 bits = 1008,
+                 * which would wrap a uint8_t and let an oversized
+                 * budget falsely pass the <=64 fits check. */
+                uint16_t total_bits = 0;
+                bool fits = true;
+
+                ray_pool_t* mk_prescan_pool = (nrows >= SMALL_POOL_THRESHOLD) ? pool : NULL;
+                if (has_wide_key) {
+                    /* RAY_STR / RAY_GUID can't be packed into a composite
+                     * uint64 key. Force the rank-then-compose fallback. */
+                    total_bits = UINT16_MAX;
+                    fits = false;
+                } else if (n_cols <= MK_PRESCAN_MAX_KEYS && mk_prescan_pool) {
+                    uint32_t nw = ray_pool_total_workers(mk_prescan_pool);
+                    size_t pw_count = (size_t)nw * n_cols;
+                    int64_t pw_mins_stack[512], pw_maxs_stack[512];
+                    ray_t *pw_mins_hdr = NULL, *pw_maxs_hdr = NULL;
+                    int64_t* pw_mins = (pw_count <= 512)
+                        ? pw_mins_stack
+                        : (int64_t*)scratch_alloc(&pw_mins_hdr, pw_count * sizeof(int64_t));
+                    int64_t* pw_maxs = (pw_count <= 512)
+                        ? pw_maxs_stack
+                        : (int64_t*)scratch_alloc(&pw_maxs_hdr, pw_count * sizeof(int64_t));
+                    for (size_t i = 0; i < pw_count; i++) {
+                        pw_mins[i] = INT64_MAX;
+                        pw_maxs[i] = INT64_MIN;
+                    }
+                    mk_prescan_ctx_t pctx = {
+                        .vecs = cols, .enum_ranks = enum_ranks,
+                        .n_keys = n_cols, .nrows = nrows, .n_workers = nw,
+                        .pw_mins = pw_mins, .pw_maxs = pw_maxs,
+                    };
+                    ray_pool_dispatch(mk_prescan_pool, mk_prescan_fn, &pctx, nrows);
+
+                    /* Merge per-worker results */
+                    for (uint8_t k = 0; k < n_cols; k++) {
+                        int64_t kmin = INT64_MAX, kmax = INT64_MIN;
+                        for (uint32_t w = 0; w < nw; w++) {
+                            int64_t wmin = pw_mins[w * n_cols + k];
+                            int64_t wmax = pw_maxs[w * n_cols + k];
+                            if (wmin < kmin) kmin = wmin;
+                            if (wmax > kmax) kmax = wmax;
+                        }
+                        mins[k] = kmin;
+                        maxs[k] = kmax;
+                        uint64_t range = (uint64_t)(kmax - kmin);
+                        uint8_t bits = 1;
+                        while (((uint64_t)1 << bits) <= range && bits < 64)
+                            bits++;
+                        total_bits = (uint16_t)(total_bits + bits);
+                    }
+                    if (pw_mins_hdr) scratch_free(pw_mins_hdr);
+                    if (pw_maxs_hdr) scratch_free(pw_maxs_hdr);
+                } else {
+                    /* Sequential fallback (no pool or too many keys) */
+                    for (uint8_t k = 0; k < n_cols; k++) {
+                        ray_t* col = cols[k];
+                        int64_t kmin = INT64_MAX, kmax = INT64_MIN;
+
+                        if (enum_ranks[k]) {
+                            const void* cdata = ray_data(col);
+                            int8_t ctype = col->type;
+                            uint8_t cattrs = col->attrs;
+                            for (int64_t i = 0; i < nrows; i++) {
+                                uint32_t raw = (uint32_t)ray_read_sym(cdata, i, ctype, cattrs);
+                                int64_t v = (int64_t)enum_ranks[k][raw];
+                                if (v < kmin) kmin = v;
+                                if (v > kmax) kmax = v;
+                            }
+                        } else if (col->type == RAY_I64 || col->type == RAY_TIMESTAMP) {
+                            const int64_t* d = (const int64_t*)ray_data(col);
+                            for (int64_t i = 0; i < nrows; i++) {
+                                if (d[i] < kmin) kmin = d[i];
+                                if (d[i] > kmax) kmax = d[i];
+                            }
+                        } else if (col->type == RAY_F64) {
+                            const double* d = (const double*)ray_data(col);
+                            for (int64_t i = 0; i < nrows; i++) {
+                                uint64_t bits;
+                                memcpy(&bits, &d[i], 8);
+                                uint64_t mask = -(bits >> 63) | ((uint64_t)1 << 63);
+                                int64_t v = (int64_t)(bits ^ mask);
+                                if (v < kmin) kmin = v;
+                                if (v > kmax) kmax = v;
+                            }
+                        } else if (col->type == RAY_I32 || col->type == RAY_DATE || col->type == RAY_TIME) {
+                            const int32_t* d = (const int32_t*)ray_data(col);
+                            for (int64_t i = 0; i < nrows; i++) {
+                                if (d[i] < kmin) kmin = (int64_t)d[i];
+                                if (d[i] > kmax) kmax = (int64_t)d[i];
+                            }
+                        } else if (col->type == RAY_I16) {
+                            const int16_t* d = (const int16_t*)ray_data(col);
+                            for (int64_t i = 0; i < nrows; i++) {
+                                if (d[i] < kmin) kmin = (int64_t)d[i];
+                                if (d[i] > kmax) kmax = (int64_t)d[i];
+                            }
+                        } else if (col->type == RAY_BOOL || col->type == RAY_U8) {
+                            const uint8_t* d = (const uint8_t*)ray_data(col);
+                            for (int64_t i = 0; i < nrows; i++) {
+                                if (d[i] < kmin) kmin = (int64_t)d[i];
+                                if (d[i] > kmax) kmax = (int64_t)d[i];
+                            }
+                        }
+
+                        mins[k] = kmin;
+                        maxs[k] = kmax;
+                        uint64_t range = (uint64_t)(kmax - kmin);
+                        uint8_t bits = 1;
+                        while (((uint64_t)1 << bits) <= range && bits < 64)
+                            bits++;
+                        total_bits = (uint16_t)(total_bits + bits);
+                    }
+                }
+
+                if (total_bits > 64) {
+                    fits = false;
+                    /* --- Rank-then-compose fallback ---
+                     * The composite bit budget overflows because at least
+                     * one key has a value range that doesn't fit (typical:
+                     * F64 columns whose sign-flipped IEEE-754 encoding
+                     * spans most of the 64-bit space).  Fall back to a
+                     * rank-encoded composite: for each key, run a single-
+                     * key sort to produce a dense rank in [0..K_k), then
+                     * compose the ranks.  Bits per key shrinks from
+                     * "data range" to "ceil(log2 distinct_count)", which
+                     * always fits for n_cols * ceil(log2 nrows) <= 64. */
+                    ray_t* rank_hdrs[n_cols];
+                    uint32_t* ranks[n_cols];
+                    uint32_t rank_max[n_cols];
+                    bool rank_ok = true;
+                    for (uint8_t k = 0; k < n_cols; k++) {
+                        rank_hdrs[k] = NULL; ranks[k] = NULL; rank_max[k] = 0;
+                    }
+                    for (uint8_t k = 0; k < n_cols && rank_ok; k++) {
+                        uint8_t kdesc = descs ? descs[k] : 0;
+                        uint8_t knf   = nulls_first ? nulls_first[k] : !kdesc;
+                        ray_t* col_arg[1] = { cols[k] };
+                        uint8_t desc_arg[1] = { kdesc };
+                        uint8_t nf_arg[1]   = { knf };
+                        ray_t* sk_idx = sort_indices_ex(col_arg, desc_arg,
+                                                         nf_arg, 1, nrows,
+                                                         NULL, NULL);
+                        if (!sk_idx || RAY_IS_ERR(sk_idx)) { rank_ok = false; break; }
+                        int64_t* sk_idx_data = (int64_t*)ray_data(sk_idx);
+                        uint32_t* r = (uint32_t*)scratch_alloc(&rank_hdrs[k],
+                                          (size_t)nrows * sizeof(uint32_t));
+                        if (!r) { ray_release(sk_idx); rank_ok = false; break; }
+                        ranks[k] = r;
+                        /* Dense-rank tie detection must use the same null
+                         * ordering as the sub-sort so that null/non-null
+                         * pairs aren't treated as ties (and so that two
+                         * nulls do collapse to the same rank). */
+                        sort_cmp_ctx_t cctx = {
+                            .vecs = col_arg, .desc = desc_arg,
+                            .nulls_first = nf_arg, .n_sort = 1,
+                        };
+                        uint32_t cur = 0;
+                        r[sk_idx_data[0]] = 0;
+                        for (int64_t i = 1; i < nrows; i++) {
+                            if (sort_cmp(&cctx, sk_idx_data[i-1], sk_idx_data[i]) != 0)
+                                cur++;
+                            r[sk_idx_data[i]] = cur;
+                        }
+                        rank_max[k] = cur;
+                        ray_release(sk_idx);
+                    }
+                    if (rank_ok) {
+                        uint8_t rank_bits[n_cols];
+                        /* Accumulate in a wider type: up to 16 keys * 63
+                         * bits each = 1008, which would wrap a uint8_t. */
+                        uint16_t rank_total = 0;
+                        for (uint8_t k = 0; k < n_cols; k++) {
+                            uint8_t b = 1;
+                            while (((uint64_t)1 << b) <= rank_max[k] && b < 64) b++;
+                            rank_bits[k] = b;
+                            rank_total = (uint16_t)(rank_total + b);
+                        }
+                        if (rank_total <= 64) {
+                            uint8_t rshift[n_cols];
+                            uint16_t accum = 0;
+                            for (int k = n_cols - 1; k >= 0; k--) {
+                                rshift[k] = (uint8_t)accum;
+                                accum = (uint16_t)(accum + rank_bits[k]);
+                            }
+                            uint8_t rcomp_nbytes = (uint8_t)((rank_total + 7) / 8);
+                            if (rcomp_nbytes < 1) rcomp_nbytes = 1;
+                            ray_pool_t* rk_pool =
+                                (nrows >= SMALL_POOL_THRESHOLD) ? pool : NULL;
+                            ray_t* rkeys_hdr;
+                            uint64_t* rkeys = (uint64_t*)scratch_alloc(&rkeys_hdr,
+                                                  (size_t)nrows * sizeof(uint64_t));
+                            if (rkeys) {
+                                for (int64_t i = 0; i < nrows; i++) {
+                                    uint64_t composite = 0;
+                                    for (uint8_t k = 0; k < n_cols; k++)
+                                        composite |= ((uint64_t)ranks[k][i]) << rshift[k];
+                                    rkeys[i] = composite;
+                                    indices[i] = i;
+                                }
+                                iota_done = true;
+                                if (nrows <= RADIX_SORT_THRESHOLD) {
+                                    key_introsort(rkeys, indices, nrows);
+                                    sorted_idx = indices;
+                                    radix_done = true;
+                                } else {
+                                    ray_t *rktmp_hdr, *ritmp_hdr;
+                                    uint64_t* rktmp = (uint64_t*)scratch_alloc(&rktmp_hdr,
+                                                          (size_t)nrows * sizeof(uint64_t));
+                                    int64_t* ritmp = (int64_t*)scratch_alloc(&ritmp_hdr,
+                                                         (size_t)nrows * sizeof(int64_t));
+                                    if (rktmp && ritmp) {
+                                        sorted_idx = msd_radix_sort_run(
+                                            rk_pool, rkeys, indices,
+                                            rktmp, ritmp, nrows, rcomp_nbytes, NULL);
+                                        radix_done = (sorted_idx != NULL);
+                                    }
+                                    if (rktmp_hdr) scratch_free(rktmp_hdr);
+                                    if (sorted_idx != ritmp) {
+                                        if (ritmp_hdr) scratch_free(ritmp_hdr);
+                                    } else {
+                                        radix_itmp_hdr = ritmp_hdr;
+                                    }
+                                }
+                                scratch_free(rkeys_hdr);
+                            }
+                        }
+                    }
+                    for (uint8_t k = 0; k < n_cols; k++)
+                        if (rank_hdrs[k]) scratch_free(rank_hdrs[k]);
+                }
+
+                if (fits) {
+                    /* Compute bit-shift for each key: primary key in MSBs */
+                    uint8_t bit_shifts[n_cols];
+                    uint8_t accum = 0;
+                    for (int k = n_cols - 1; k >= 0; k--) {
+                        bit_shifts[k] = accum;
+                        uint64_t range = (uint64_t)(maxs[k] - mins[k]);
+                        uint8_t bits = 1;
+                        while (((uint64_t)1 << bits) <= range && bits < 64)
+                            bits++;
+                        accum += bits;
+                    }
+
+                    uint8_t comp_nbytes = (total_bits + 7) / 8;
+                    if (comp_nbytes < 1) comp_nbytes = 1;
+                    ray_pool_t* mk_pool = (nrows >= SMALL_POOL_THRESHOLD) ? pool : NULL;
+
+                    {
+                        /* Encode composite keys */
+                        ray_t *keys_hdr;
+                        uint64_t* keys = (uint64_t*)scratch_alloc(&keys_hdr,
+                                            (size_t)nrows * sizeof(uint64_t));
+                        if (keys) {
+                            radix_encode_ctx_t enc = {
+                                .keys = keys, .indices = indices,
+                                .n_keys = n_cols, .vecs = cols,
+                            };
+                            for (uint8_t k = 0; k < n_cols; k++) {
+                                enc.mins[k] = mins[k];
+                                enc.ranges[k] = maxs[k] - mins[k];
+                                enc.bit_shifts[k] = bit_shifts[k];
+                                enc.descs[k] = descs ? descs[k] : 0;
+                                enc.enum_ranks[k] = enum_ranks[k];
+                            }
+                            if (mk_pool)
+                                ray_pool_dispatch(mk_pool, radix_encode_fn, &enc, nrows);
+                            else
+                                radix_encode_fn(&enc, 0, 0, nrows);
+                            iota_done = true;
+
+                            /* Adaptive: detect sortedness */
+                            double unsorted_frac = detect_sortedness(mk_pool, keys, nrows);
+
+                            if (unsorted_frac == 0.0) {
+                                /* Already sorted */
+                                sorted_idx = indices;
+                                radix_done = true;
+                            } else if (nrows <= RADIX_SORT_THRESHOLD) {
+                                /* Small arrays - introsort */
+                                key_introsort(keys, indices, nrows);
+                                sorted_idx = indices;
+                                radix_done = true;
+                            } else {
+                                /* Radix sort with type-aware pass count */
+                                ray_t *ktmp_hdr, *itmp_hdr;
+                                uint64_t* ktmp = (uint64_t*)scratch_alloc(&ktmp_hdr,
+                                                    (size_t)nrows * sizeof(uint64_t));
+                                int64_t*  itmp = (int64_t*)scratch_alloc(&itmp_hdr,
+                                                    (size_t)nrows * sizeof(int64_t));
+                                if (ktmp && itmp) {
+                                    sorted_idx = msd_radix_sort_run(mk_pool, keys, indices,
+                                                                     ktmp, itmp, nrows,
+                                                                     comp_nbytes, NULL);
+                                    radix_done = (sorted_idx != NULL);
+                                }
+                                scratch_free(ktmp_hdr);
+                                if (sorted_idx != itmp) scratch_free(itmp_hdr);
+                                else radix_itmp_hdr = itmp_hdr;
+                            }
+                        }
+                        scratch_free(keys_hdr);
+                    }
+                }
+            }
+        }
+    }
+
+    /* --- Merge sort fallback ------------------------------------------------ */
+    if (!radix_done) {
+        if (!iota_done)
+            for (int64_t i = 0; i < nrows; i++) indices[i] = i;
+        /* Null = minimum value.
+         * ASC → nulls first (nf=1), DESC → nulls last (nf=0). */
+        uint8_t default_nf[n_cols > 0 ? n_cols : 1];
+        if (!nulls_first) {
+            for (uint8_t k = 0; k < n_cols; k++)
+                default_nf[k] = descs ? !descs[k] : 1;
+            nulls_first = default_nf;
+        }
+        sort_cmp_ctx_t cmp_ctx = {
+            .vecs = cols,
+            .desc = descs,
+            .nulls_first = nulls_first,
+            .n_sort = n_cols,
+        };
+
+        if (nrows <= 64) {
+            sort_insertion(&cmp_ctx, indices, nrows);
+        } else {
+            ray_pool_t* pool = ray_pool_get();
+            uint32_t n_workers = pool ? ray_pool_total_workers(pool) : 1;
+
+            ray_t* tmp_hdr;
+            int64_t* tmp = (int64_t*)scratch_alloc(&tmp_hdr,
+                                (size_t)nrows * sizeof(int64_t));
+            if (!tmp) {
+                for (uint8_t k = 0; k < n_cols; k++)
+                    scratch_free(enum_rank_hdrs[k]);
+                scratch_free(indices_hdr);
+                return ray_error("oom", NULL);
+            }
+
+            uint32_t n_chunks = n_workers;
+            if (pool && n_chunks > 1 && nrows > 1024) {
+                sort_phase1_ctx_t p1ctx = {
+                    .cmp_ctx = &cmp_ctx, .indices = indices, .tmp = tmp,
+                    .nrows = nrows, .n_chunks = n_chunks,
+                };
+                ray_pool_dispatch_n(pool, sort_phase1_fn, &p1ctx, n_chunks);
+            } else {
+                n_chunks = 1;
+                sort_merge_recursive(&cmp_ctx, indices, tmp, nrows);
+            }
+
+            if (n_chunks > 1) {
+                int64_t chunk_size = (nrows + n_chunks - 1) / n_chunks;
+                int64_t run_size = chunk_size;
+                int64_t* src = indices;
+                int64_t* dst = tmp;
+
+                while (run_size < nrows) {
+                    int64_t n_pairs = (nrows + 2 * run_size - 1) / (2 * run_size);
+                    sort_merge_ctx_t mctx = {
+                        .cmp_ctx = &cmp_ctx, .src = src, .dst = dst,
+                        .nrows = nrows, .run_size = run_size,
+                    };
+                    if (pool && n_pairs > 1)
+                        ray_pool_dispatch_n(pool, sort_merge_fn, &mctx,
+                                            (uint32_t)n_pairs);
+                    else
+                        sort_merge_fn(&mctx, 0, 0, n_pairs);
+                    int64_t* t = src; src = dst; dst = t;
+                    run_size *= 2;
+                }
+
+                if (src != indices)
+                    memcpy(indices, src, (size_t)nrows * sizeof(int64_t));
+            }
+
+            scratch_free(tmp_hdr);
+        }
+    }
+
+str_msd_done:;
+    /* If sorted_keys_out was requested but never set, null it out */
+    if (sorted_keys_out && !*sorted_keys_out) {
+        *sorted_keys_out = NULL;
+        if (keys_hdr_out) *keys_hdr_out = NULL;
+    }
+
+    /* Build result I64 vector containing sorted indices */
+    ray_t* result = ray_vec_new(RAY_I64, nrows);
+    if (!result || RAY_IS_ERR(result)) {
+        if (sorted_keys_out && *sorted_keys_out && keys_hdr_out)
+            scratch_free(*keys_hdr_out);
+        for (uint8_t k = 0; k < n_cols; k++)
+            scratch_free(enum_rank_hdrs[k]);
+        scratch_free(radix_itmp_hdr);
+        scratch_free(indices_hdr);
+        return result ? result : ray_error("oom", NULL);
+    }
+    result->len = nrows;
+
+    /* Copy final sorted indices into the result vector.
+     * sorted_idx may point to indices or itmp - either way, copy out. */
+    memcpy(ray_data(result), sorted_idx, (size_t)nrows * sizeof(int64_t));
+
+    /* Free all scratch allocations */
+    for (uint8_t k = 0; k < n_cols; k++)
+        scratch_free(enum_rank_hdrs[k]);
+    scratch_free(radix_itmp_hdr);
+    scratch_free(indices_hdr);
+    return result;
+}
+
+ray_t* ray_sort_indices(ray_t** cols, uint8_t* descs, uint8_t* nulls_first,
+                        uint8_t n_cols, int64_t nrows) { /* index-only sort entry point */
+    return sort_indices_ex(cols, descs, nulls_first, n_cols, nrows, NULL, NULL); /* NULL, NULL: no sorted-key outputs requested */
+}
+
+/* Return cols[0] reordered by the permutation that sorts the n_cols key columns
+ * (descs / nulls_first are forwarded to the index sort); NULL or an error on failure. */
+ray_t* ray_sort(ray_t** cols, uint8_t* descs, uint8_t* nulls_first,
+                uint8_t n_cols, int64_t nrows) {
+    if (n_cols == 1) {
+        ray_t* src = cols[0];
+        uint64_t* sorted_keys = NULL;
+        ray_t* keys_hdr = NULL;
+        ray_t* idx = sort_indices_ex(cols, descs, nulls_first, 1, nrows, &sorted_keys, &keys_hdr);
+        if (!idx || RAY_IS_ERR(idx)) return idx;
+        if (sorted_keys && !RAY_IS_SYM(src->type)) {
+            /* Decode path: sequential writes, no random access */
+            ray_t* result = ray_vec_new(src->type, nrows);
+            if (!result || RAY_IS_ERR(result)) {
+                ray_release(idx);
+                if (keys_hdr) scratch_free(keys_hdr);
+                return result ? result : ray_error("oom", NULL);
+            }
+            result->len = nrows;
+            radix_decode_into(ray_data(result), src->type, sorted_keys, nrows, descs ? descs[0] : 0);
+            /* Propagate nulls via the sorted indices; same test as exec_sort, which also checks a slice's parent */
+            bool src_has_nulls = (src->attrs & RAY_ATTR_HAS_NULLS) || ((src->attrs & RAY_ATTR_SLICE) &&
+                                 src->slice_parent && (src->slice_parent->attrs & RAY_ATTR_HAS_NULLS));
+            if (src_has_nulls) {
+                int64_t* idx_data = (int64_t*)ray_data(idx);
+                for (int64_t i = 0; i < nrows; i++)
+                    if (ray_vec_is_null(src, idx_data[i]))
+                        ray_vec_set_null(result, i, true);
+            }
+            ray_release(idx);
+            scratch_free(keys_hdr);
+            return result;
+        }
+        /* Fallback: gather by index */
+        if (keys_hdr) scratch_free(keys_hdr);
+        ray_t* result = gather_by_idx(src, (int64_t*)ray_data(idx), nrows);
+        ray_release(idx);
+        return result;
+    }
+    /* Multi-column: index sort + gather (decode only helps single-key) */
+    ray_t* idx = ray_sort_indices(cols, descs, nulls_first, n_cols, nrows);
+    if (!idx || RAY_IS_ERR(idx)) return idx;
+    ray_t* result = gather_by_idx(cols[0], (int64_t*)ray_data(idx), nrows);
+    ray_release(idx);
+    return result;
+}
+
+ray_t* exec_sort(ray_graph_t* g, ray_op_t* op, ray_t* tbl, int64_t limit) { /* sort op: returns tbl's rows reordered by the op's sort keys, or an error */
+    if (!tbl || RAY_IS_ERR(tbl)) return tbl; /* NULL / error input is passed straight back */
+    /* The sort spec (key ops, desc, nulls_first, n_cols) is read from the op's ext record. */
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    int64_t nrows = ray_table_nrows(tbl);
+    int64_t ncols = ray_table_ncols(tbl);
+    if (ncols > 4096) return ray_error("nyi", NULL); /* stack safety: new_cols / col_names below are VLAs sized by ncols */
+    uint8_t n_sort = ext->sort.n_cols;
+    if (n_sort > 16) return ray_error("nyi", NULL); /* radix_encode_ctx_t limit */
+
+    /* Resolve sort key vectors; sort_owned[k] marks the keys this function must release */
+    ray_t* sort_vecs[n_sort > 0 ? n_sort : 1];
+    uint8_t sort_owned[n_sort > 0 ? n_sort : 1];
+    memset(sort_vecs, 0, (n_sort > 0 ? n_sort : 1) * sizeof(ray_t*));
+    memset(sort_owned, 0, n_sort > 0 ? n_sort : 1);
+    /* SCAN keys borrow the table column; any other key op is evaluated with g->table temporarily set to tbl. */
+    for (uint8_t k = 0; k < n_sort; k++) {
+        ray_op_t* key_op = ext->sort.columns[k];
+        ray_op_ext_t* key_ext = find_ext(g, key_op->id);
+        if (key_ext && key_ext->base.opcode == OP_SCAN) {
+            sort_vecs[k] = ray_table_get_col(tbl, key_ext->sym);
+        } else {
+            ray_t* saved = g->table;
+            g->table = tbl;
+            sort_vecs[k] = exec_node(g, key_op);
+            g->table = saved;
+            sort_owned[k] = 1;
+        }
+        if (!sort_vecs[k] || RAY_IS_ERR(sort_vecs[k])) {
+            ray_t* err = sort_vecs[k] ? sort_vecs[k] : ray_error("nyi", NULL); /* a NULL key vector is reported as "nyi" */
+            for (uint8_t j = 0; j < k; j++) {
+                if (sort_owned[j] && sort_vecs[j] && !RAY_IS_ERR(sort_vecs[j]))
+                    ray_release(sort_vecs[j]);
+            }
+            return err;
+        }
+    }
+
+    /* Sort columns -> get index permutation (with optional sorted radix keys) */
+    uint64_t* sorted_keys = NULL;
+    ray_t* sorted_keys_hdr = NULL;
+    ray_t* idx_vec = sort_indices_ex(sort_vecs, ext->sort.desc,
+                                     ext->sort.nulls_first, n_sort, nrows,
+                                     &sorted_keys, &sorted_keys_hdr);
+    if (!idx_vec || RAY_IS_ERR(idx_vec)) {
+        if (sorted_keys_hdr) scratch_free(sorted_keys_hdr); /* NOTE(review): sort_indices_ex's result-allocation failure path appears to free *keys_hdr_out without clearing it - confirm this cannot double-free */
+        for (uint8_t k = 0; k < n_sort; k++) {
+            if (sort_owned[k] && sort_vecs[k] && !RAY_IS_ERR(sort_vecs[k]))
+                ray_release(sort_vecs[k]);
+        }
+        return idx_vec ? idx_vec : ray_error("oom", NULL);
+    }
+    int64_t* sorted_idx = (int64_t*)ray_data(idx_vec);
+
+    /* Check cancellation before expensive gather phase */
+    {
+        ray_pool_t* cp = ray_pool_get();
+        if (pool_cancelled(cp)) {
+            if (sorted_keys_hdr) scratch_free(sorted_keys_hdr);
+            for (uint8_t k = 0; k < n_sort; k++) {
+                if (sort_owned[k] && sort_vecs[k] && !RAY_IS_ERR(sort_vecs[k]))
+                    ray_release(sort_vecs[k]);
+            }
+            ray_release(idx_vec);
+            return ray_error("cancel", NULL);
+        }
+    }
+
+    /* Materialize sorted result - fused multi-column gather.
+     * When limit > 0, only gather the first `limit` rows (SORT+LIMIT fusion). */
+    int64_t gather_rows = nrows;
+    if (limit > 0 && limit < nrows) gather_rows = limit;
+
+    ray_t* result = ray_table_new(ncols);
+    if (!result || RAY_IS_ERR(result)) {
+        if (sorted_keys_hdr) scratch_free(sorted_keys_hdr);
+        for (uint8_t k = 0; k < n_sort; k++) {
+            if (sort_owned[k] && sort_vecs[k] && !RAY_IS_ERR(sort_vecs[k]))
+                ray_release(sort_vecs[k]);
+        }
+        ray_release(idx_vec);
+        return result; /* NOTE(review): may be NULL here; the other failure exits substitute an "oom" error */
+    }
+
+    /* Pre-allocate all output columns, then do a single fused gather pass */
+    ray_pool_t* gather_pool = (gather_rows > RAY_PARALLEL_THRESHOLD) ? ray_pool_get() : NULL;
+    ray_t* new_cols[ncols];
+    int64_t col_names[ncols];
+    for (int64_t c = 0; c < ncols; c++) {
+        ray_t* col = ray_table_get_col_idx(tbl, c);
+        col_names[c] = ray_table_col_name(tbl, c);
+        if (!col) { new_cols[c] = NULL; continue; }
+        ray_t* nc;
+        if (col->type == RAY_LIST) {
+            /* LIST: element-wise gather with retain (not memcpy-safe) */
+            nc = ray_list_new(gather_rows);
+        } else {
+            nc = col_vec_new(col, gather_rows);
+        }
+        if (!nc || RAY_IS_ERR(nc)) {
+            for (int64_t j = 0; j < c; j++)
+                if (new_cols[j]) ray_release(new_cols[j]);
+            ray_release(result);
+            if (sorted_keys_hdr) scratch_free(sorted_keys_hdr);
+            for (uint8_t k = 0; k < n_sort; k++)
+                if (sort_owned[k] && sort_vecs[k] && !RAY_IS_ERR(sort_vecs[k]))
+                    ray_release(sort_vecs[k]);
+            ray_release(idx_vec);
+            return nc ? nc : ray_error("oom", NULL);
+        }
+        if (col->type == RAY_LIST) {
+            ray_t** src_ptrs = (ray_t**)ray_data(col);
+            ray_t** dst_ptrs = (ray_t**)ray_data(nc);
+            for (int64_t r = 0; r < gather_rows; r++) {
+                dst_ptrs[r] = src_ptrs[sorted_idx[r]];
+                if (dst_ptrs[r]) ray_retain(dst_ptrs[r]);
+            }
+        }
+        nc->len = gather_rows;
+        new_cols[c] = nc;
+    }
+
+    /* Decode-gather optimisation: decode the sort key column directly from
+     * sorted radix keys (sequential writes) instead of random-access gather.
+     * Only for single-key, non-SYM sorts where radix keys are available. */
+    int64_t sort_key_sym = -1;
+    if (sorted_keys && n_sort == 1 && !RAY_IS_SYM(sort_vecs[0]->type)) {
+        ray_op_ext_t* key_ext = find_ext(g, ext->sort.columns[0]->id);
+        if (key_ext && key_ext->base.opcode == OP_SCAN)
+            sort_key_sym = key_ext->sym;
+    }
+    int64_t decode_col_idx = -1; /* output column filled by decode instead of gather; -1 = none */
+    if (sort_key_sym >= 0) {
+        for (int64_t c = 0; c < ncols; c++) {
+            if (col_names[c] == sort_key_sym && new_cols[c]) {
+                decode_col_idx = c;
+                break;
+            }
+        }
+    }
+
+    if (decode_col_idx >= 0) {
+        radix_decode_into(ray_data(new_cols[decode_col_idx]),
+                          sort_vecs[0]->type, sorted_keys,
+                          gather_rows, ext->sort.desc ? ext->sort.desc[0] : 0);
+    }
+
+    /* Gather all columns using sorted indices, in batches of MGATHER_MAX_COLS.
+     * LIST columns are skipped here — they were gathered with retain above. */
+    for (int64_t base = 0; base < ncols; ) {
+        char*   g_srcs[MGATHER_MAX_COLS];
+        char*   g_dsts[MGATHER_MAX_COLS];
+        uint8_t g_esz[MGATHER_MAX_COLS];
+        int64_t g_nc = 0;
+        for (; base < ncols && g_nc < MGATHER_MAX_COLS; base++) {
+            if (!new_cols[base] || base == decode_col_idx) continue; /* no output column, or already filled by the decode above */
+            ray_t* col = ray_table_get_col_idx(tbl, base);
+            if (col->type == RAY_LIST) continue;
+            g_srcs[g_nc] = (char*)ray_data(col);
+            g_dsts[g_nc] = (char*)ray_data(new_cols[base]);
+            g_esz[g_nc]  = col_esz(col);
+            g_nc++;
+        }
+        if (g_nc == 0) continue; /* batch held only skipped columns; base has already advanced */
+        if (n_sort == 1)
+            partitioned_gather(gather_pool, sorted_idx, gather_rows,
+                               nrows, g_srcs, g_dsts, g_esz, g_nc);
+        else {
+            multi_gather_ctx_t mg = { .idx = sorted_idx, .ncols = g_nc };
+            for (int64_t i = 0; i < g_nc; i++) {
+                mg.srcs[i] = g_srcs[i];
+                mg.dsts[i] = g_dsts[i];
+                mg.esz[i]  = g_esz[i];
+            }
+            if (gather_pool)
+                ray_pool_dispatch(gather_pool, multi_gather_fn, &mg,
+                                 gather_rows);
+            else
+                multi_gather_fn(&mg, 0, 0, gather_rows);
+        }
+    }
+
+    /* Propagate str_pool / sym_dict / null bitmaps from source columns */
+    for (int64_t c = 0; c < ncols; c++) {
+        if (!new_cols[c]) continue;
+        ray_t* col = ray_table_get_col_idx(tbl, c);
+        if (!col) continue;
+        col_propagate_str_pool(new_cols[c], col);
+        /* sym_dict lives in bytes 8-15 of the header union, which also
+         * hold inline-nullmap bits and slice_offset. Only read it when
+         * the header layout actually exposes the sym_dict/ext_nullmap
+         * interpretation: no slice, and either no nulls or external
+         * nullmap. Otherwise those bytes are bitmap payload / slice
+         * metadata and dereferencing them hands ray_retain garbage. */
+        if (col->type == RAY_SYM &&
+            !(col->attrs & RAY_ATTR_SLICE) &&
+            (!(col->attrs & RAY_ATTR_HAS_NULLS) || (col->attrs & RAY_ATTR_NULLMAP_EXT)) &&
+            col->sym_dict) {
+            ray_retain(col->sym_dict);
+            new_cols[c]->sym_dict = col->sym_dict;
+        }
+        /* Gather null bits in sorted order; a slice is also checked against its parent's HAS_NULLS flag */
+        bool src_has_nulls = (col->attrs & RAY_ATTR_HAS_NULLS) ||
+                             ((col->attrs & RAY_ATTR_SLICE) && col->slice_parent &&
+                              (col->slice_parent->attrs & RAY_ATTR_HAS_NULLS));
+        if (src_has_nulls) {
+            for (int64_t r = 0; r < gather_rows; r++)
+                if (ray_vec_is_null(col, sorted_idx[r]))
+                    ray_vec_set_null(new_cols[c], r, true);
+        }
+    }
+
+    for (int64_t c = 0; c < ncols; c++) {
+        if (!new_cols[c]) continue;
+        result = ray_table_add_col(result, col_names[c], new_cols[c]);
+        ray_release(new_cols[c]); /* drops the local reference; presumably the table keeps its own - confirm */
+    }
+
+    /* Free sorted radix keys scratch buffer */
+    if (sorted_keys_hdr) scratch_free(sorted_keys_hdr);
+
+    /* Free expression-evaluated sort keys */
+    for (uint8_t k = 0; k < n_sort; k++) {
+        if (sort_owned[k] && sort_vecs[k] && !RAY_IS_ERR(sort_vecs[k]))
+            ray_release(sort_vecs[k]);
+    }
+
+    ray_release(idx_vec);
+    return result;
+}
+
+/* ── Builtins ── */
+
+/* (asc v) — ascending sort; atoms and vectors of length <= 1 are handed back retained, untouched */
+ray_t* ray_asc_fn(ray_t* x) {
+    if (x == NULL || RAY_IS_ERR(x)) return x;
+    const bool atom = ray_is_atom(x);
+    if (!atom && !ray_is_vec(x)) return ray_error("type", "asc expects a vector");
+    const int64_t count = atom ? 1 : ray_len(x);
+    uint8_t order = 0; /* sole descs[] entry passed to ray_sort: 0 = ascending */
+    if (count > 1) return ray_sort(&x, &order, NULL, 1, count);
+    ray_retain(x); return x;
+}
+
+/* (desc v) — descending sort; atoms and vectors of length <= 1 are handed back retained, untouched */
+ray_t* ray_desc_fn(ray_t* x) {
+    if (x == NULL || RAY_IS_ERR(x)) return x;
+    const bool atom = ray_is_atom(x);
+    if (!atom && !ray_is_vec(x)) return ray_error("type", "desc expects a vector");
+    const int64_t count = atom ? 1 : ray_len(x);
+    uint8_t order = 1; /* sole descs[] entry passed to ray_sort: 1 = descending */
+    if (count > 1) return ray_sort(&x, &order, NULL, 1, count);
+    ray_retain(x); return x;
+}
+
+/* (iasc v) — ascending sort indices */
+ray_t* ray_iasc_fn(ray_t* x) {
+    if (!x || RAY_IS_ERR(x)) return x;
+    if (!ray_is_vec(x)) return ray_error("type", "iasc expects a vector");
+
+    int64_t n = ray_len(x);
+    uint8_t desc = 0;
+    return ray_sort_indices(&x, &desc, NULL, 1, n);
+}
+
+/* (idesc v) — descending sort indices */
+ray_t* ray_idesc_fn(ray_t* x) {
+    if (!x || RAY_IS_ERR(x)) return x;
+    if (!ray_is_vec(x)) return ray_error("type", "idesc expects a vector");
+
+    int64_t n = ray_len(x);
+    uint8_t desc = 1;
+    return ray_sort_indices(&x, &desc, NULL, 1, n);
+}
+
+/* (rank v) — rank positions (inverse permutation of iasc): with idx the ascending
+ * sort permutation, out[idx[i]] = i.  Returns a new I64 vector or an error. */
+ray_t* ray_rank_fn(ray_t* x) {
+    if (!x || RAY_IS_ERR(x)) return x;
+    if (!ray_is_vec(x)) return ray_error("type", "rank expects a vector");
+    int64_t n = ray_len(x);
+    uint8_t desc = 0;
+    ray_t* idx = ray_sort_indices(&x, &desc, NULL, 1, n);
+    /* Guard NULL before RAY_IS_ERR / ray_data, as ray_xrank_fn and ray_sort do */
+    if (!idx || RAY_IS_ERR(idx)) return idx ? idx : ray_error("oom", NULL);
+
+    ray_t* result = ray_vec_new(RAY_I64, n);
+    if (!result || RAY_IS_ERR(result)) { ray_release(idx); return result ? result : ray_error("oom", NULL); }
+    result->len = n;
+
+    const int64_t* idx_data = (const int64_t*)ray_data(idx);
+    int64_t* rank_data = (int64_t*)ray_data(result);
+    for (int64_t i = 0; i < n; i++) rank_data[idx_data[i]] = i; /* scatter: invert the permutation */
+
+    ray_release(idx);
+    return result;
+}
+
+/* Helper for xasc/xdesc: sort tbl by the key columns named in keys (a SYM atom,
+ * a SYM vector, or a list of SYM atoms; at most 16), all ascending or all
+ * descending per `descending`.  Returns a new table, tbl itself (retained) when
+ * there are no keys or at most one row, or an error object. */
+ray_t* sort_table_by_keys(ray_t* tbl, ray_t* keys, uint8_t descending) {
+    if (!tbl || tbl->type != RAY_TABLE)
+        return ray_error("type", "xasc/xdesc expects a table as first argument");
+    if (!keys) return ray_error("type", "xasc/xdesc key must be a symbol or list of symbols");
+    /* keys can be a SYM atom, a SYM vector, or a list of SYM atoms */
+    int64_t n_keys = 0;
+    int64_t key_ids[16];
+
+    if (keys->type == -RAY_SYM) {
+        /* Single symbol atom */
+        key_ids[0] = keys->i64;
+        n_keys = 1;
+    } else if (keys->type == RAY_SYM) {
+        /* SYM vector */
+        int64_t* syms = (int64_t*)ray_data(keys);
+        n_keys = ray_len(keys);
+        if (n_keys > 16) return ray_error("limit", "xasc/xdesc: max 16 key columns");
+        for (int64_t i = 0; i < n_keys; i++) key_ids[i] = syms[i];
+    } else if (is_list(keys)) {
+        /* List of symbol atoms */
+        ray_t** elems = (ray_t**)ray_data(keys);
+        n_keys = ray_len(keys);
+        if (n_keys > 16) return ray_error("limit", "xasc/xdesc: max 16 key columns");
+        for (int64_t i = 0; i < n_keys; i++) {
+            if (!elems[i] || elems[i]->type != -RAY_SYM) /* list slots may be NULL */
+                return ray_error("type", "xasc/xdesc key must be a symbol");
+            key_ids[i] = elems[i]->i64;
+        }
+    } else {
+        return ray_error("type", "xasc/xdesc key must be a symbol or list of symbols");
+    }
+
+    if (n_keys == 0) { ray_retain(tbl); return tbl; }
+
+    int64_t nrows = ray_table_nrows(tbl);
+    if (nrows <= 1) { ray_retain(tbl); return tbl; }
+
+    /* Resolve key columns */
+    ray_t* key_cols[16];
+    for (int64_t i = 0; i < n_keys; i++) {
+        key_cols[i] = ray_table_get_col(tbl, key_ids[i]);
+        if (!key_cols[i])
+            return ray_error("domain", "xasc/xdesc: key column not found in table");
+    }
+
+    /* Build descs array */
+    uint8_t descs[16];
+    for (int64_t i = 0; i < n_keys; i++) descs[i] = descending;
+
+    uint64_t* sorted_keys = NULL;
+    ray_t* sorted_keys_hdr = NULL;
+    ray_t* idx = sort_indices_ex(key_cols, descs, NULL, (uint8_t)n_keys, nrows,
+                                 &sorted_keys, &sorted_keys_hdr);
+    if (!idx || RAY_IS_ERR(idx)) {
+        if (sorted_keys_hdr) scratch_free(sorted_keys_hdr); /* NOTE(review): sort_indices_ex's result-allocation failure path appears to free this header too - confirm no double free */
+        return idx ? idx : ray_error("oom", NULL);
+    }
+
+    int64_t* idx_data = (int64_t*)ray_data(idx);
+    int64_t ncols = ray_table_ncols(tbl);
+
+    /* Pre-allocate all output columns, then do a parallel multi-column
+     * gather — same fast path exec_sort uses.  LIST columns are gathered
+     * element-wise with retain; all other columns go through the
+     * partitioned_gather / multi_gather_fn paths.  Null bits, str_pool,
+     * and sym_dict are propagated after the gather runs.  The per-column
+     * scratch arrays are heap-allocated so arbitrarily wide tables work
+     * without a VLA stack blow-up. */
+    ray_pool_t* gather_pool = (nrows > RAY_PARALLEL_THRESHOLD)
+                              ? ray_pool_get() : NULL;
+
+    ray_t* nc_hdr = NULL;
+    ray_t** new_cols = (ray_t**)scratch_alloc(&nc_hdr,
+                             (size_t)ncols * sizeof(ray_t*));
+    ray_t* cn_hdr = NULL;
+    int64_t* col_names = (int64_t*)scratch_alloc(&cn_hdr,
+                             (size_t)ncols * sizeof(int64_t));
+    if (!new_cols || !col_names) {
+        if (nc_hdr) scratch_free(nc_hdr);
+        if (cn_hdr) scratch_free(cn_hdr);
+        if (sorted_keys_hdr) scratch_free(sorted_keys_hdr);
+        ray_release(idx);
+        return ray_error("oom", NULL);
+    }
+    for (int64_t c = 0; c < ncols; c++) new_cols[c] = NULL;
+
+    for (int64_t c = 0; c < ncols; c++) {
+        ray_t* col = ray_table_get_col_idx(tbl, c);
+        col_names[c] = ray_table_col_name(tbl, c);
+        if (!col) continue;
+        ray_t* nc;
+        if (col->type == RAY_LIST)
+            nc = ray_list_new(nrows);
+        else
+            nc = col_vec_new(col, nrows);
+        if (!nc || RAY_IS_ERR(nc)) {
+            for (int64_t j = 0; j < c; j++)
+                if (new_cols[j]) ray_release(new_cols[j]);
+            scratch_free(nc_hdr);
+            scratch_free(cn_hdr);
+            if (sorted_keys_hdr) scratch_free(sorted_keys_hdr);
+            ray_release(idx);
+            return nc ? nc : ray_error("oom", NULL);
+        }
+        if (col->type == RAY_LIST) {
+            ray_t** src_ptrs = (ray_t**)ray_data(col);
+            ray_t** dst_ptrs = (ray_t**)ray_data(nc);
+            for (int64_t r = 0; r < nrows; r++) {
+                dst_ptrs[r] = src_ptrs[idx_data[r]];
+                if (dst_ptrs[r]) ray_retain(dst_ptrs[r]);
+            }
+        }
+        nc->len = nrows;
+        new_cols[c] = nc;
+    }
+
+    /* Decode sort key column directly from sorted radix keys when
+     * available — sequential write, much faster than random-access
+     * gather.  Only for single-key sorts where sort_indices_ex
+     * produced sorted_keys (non-packed path). */
+    int64_t decode_col_idx = -1;
+    if (sorted_keys && n_keys == 1 && !RAY_IS_SYM(key_cols[0]->type)) {
+        for (int64_t c = 0; c < ncols; c++) {
+            if (col_names[c] == key_ids[0] && new_cols[c]) {
+                decode_col_idx = c;
+                break;
+            }
+        }
+    }
+    if (decode_col_idx >= 0) {
+        radix_decode_into(ray_data(new_cols[decode_col_idx]),
+                          key_cols[0]->type, sorted_keys,
+                          nrows, descs[0]);
+    }
+
+    /* Gather remaining non-LIST, non-decode columns in batches.
+     * Single-key sorts use the radix-partitioned gather; multi-key
+     * fallback to the multi_gather pool dispatch. */
+    for (int64_t base = 0; base < ncols; ) {
+        char*   g_srcs[MGATHER_MAX_COLS];
+        char*   g_dsts[MGATHER_MAX_COLS];
+        uint8_t g_esz[MGATHER_MAX_COLS];
+        int64_t g_nc = 0;
+        for (; base < ncols && g_nc < MGATHER_MAX_COLS; base++) {
+            if (!new_cols[base] || base == decode_col_idx) continue;
+            ray_t* col = ray_table_get_col_idx(tbl, base);
+            if (col->type == RAY_LIST) continue;
+            g_srcs[g_nc] = (char*)ray_data(col);
+            g_dsts[g_nc] = (char*)ray_data(new_cols[base]);
+            g_esz[g_nc]  = col_esz(col);
+            g_nc++;
+        }
+        if (g_nc == 0) continue; /* batch held only skipped columns; base has already advanced */
+        if (n_keys == 1)
+            partitioned_gather(gather_pool, idx_data, nrows,
+                               nrows, g_srcs, g_dsts, g_esz, g_nc);
+        else {
+            multi_gather_ctx_t mg = { .idx = idx_data, .ncols = g_nc };
+            for (int64_t i = 0; i < g_nc; i++) {
+                mg.srcs[i] = g_srcs[i];
+                mg.dsts[i] = g_dsts[i];
+                mg.esz[i]  = g_esz[i];
+            }
+            if (gather_pool)
+                ray_pool_dispatch(gather_pool, multi_gather_fn, &mg, nrows);
+            else
+                multi_gather_fn(&mg, 0, 0, nrows);
+        }
+    }
+
+    /* Propagate str_pool / sym_dict / null bitmaps from source columns.
+     * Null propagation was the reason this function got rewritten in
+     * commit 87981c8; do it explicitly here instead of relying on
+     * gather_by_idx. */
+    for (int64_t c = 0; c < ncols; c++) {
+        if (!new_cols[c]) continue;
+        ray_t* col = ray_table_get_col_idx(tbl, c);
+        if (!col) continue;
+        col_propagate_str_pool(new_cols[c], col);
+        /* sym_dict lives in bytes 8-15 of the header union, which also
+         * hold inline-nullmap bits and slice_offset. Only read it when
+         * the header layout actually exposes the sym_dict/ext_nullmap
+         * interpretation: no slice, and either no nulls or external
+         * nullmap. Otherwise those bytes are bitmap payload / slice
+         * metadata and dereferencing them hands ray_retain garbage. */
+        if (col->type == RAY_SYM &&
+            !(col->attrs & RAY_ATTR_SLICE) &&
+            (!(col->attrs & RAY_ATTR_HAS_NULLS) || (col->attrs & RAY_ATTR_NULLMAP_EXT)) &&
+            col->sym_dict) {
+            ray_retain(col->sym_dict);
+            new_cols[c]->sym_dict = col->sym_dict;
+        }
+        bool src_has_nulls = (col->attrs & RAY_ATTR_HAS_NULLS) ||
+                             ((col->attrs & RAY_ATTR_SLICE) && col->slice_parent &&
+                              (col->slice_parent->attrs & RAY_ATTR_HAS_NULLS));
+        if (src_has_nulls) {
+            for (int64_t r = 0; r < nrows; r++)
+                if (ray_vec_is_null(col, idx_data[r]))
+                    ray_vec_set_null(new_cols[c], r, true);
+        }
+    }
+
+    /* Assemble result table */
+    ray_t* result = ray_table_new(ncols);
+    if (!result || RAY_IS_ERR(result)) {
+        for (int64_t c = 0; c < ncols; c++)
+            if (new_cols[c]) ray_release(new_cols[c]);
+        scratch_free(nc_hdr);
+        scratch_free(cn_hdr);
+        if (sorted_keys_hdr) scratch_free(sorted_keys_hdr);
+        ray_release(idx);
+        return result ? result : ray_error("oom", NULL);
+    }
+    for (int64_t c = 0; c < ncols; c++) {
+        if (!new_cols[c]) continue;
+        result = ray_table_add_col(result, col_names[c], new_cols[c]);
+        ray_release(new_cols[c]);
+    }
+
+    scratch_free(nc_hdr);
+    scratch_free(cn_hdr);
+    if (sorted_keys_hdr) scratch_free(sorted_keys_hdr);
+    ray_release(idx);
+    return result;
+}
+
+/* (xasc tbl keys) — sort table ascending by key columns; forwards to sort_table_by_keys with descending = 0 */
+ray_t* ray_xasc_fn(ray_t* tbl, ray_t* keys) {
+    return sort_table_by_keys(tbl, keys, 0);
+}
+
+/* (xdesc tbl keys) — sort table descending by key columns; forwards to sort_table_by_keys with descending = 1 */
+ray_t* ray_xdesc_fn(ray_t* tbl, ray_t* keys) {
+    return sort_table_by_keys(tbl, keys, 1);
+}
+
+/* (xrank n vec) — cross-rank: assign each element to one of n groups
+ * based on its sorted position: the element at ascending sort position
+ * i (of len) lands in group i * n / len.  The ordering comes from
+ * ray_sort_indices, the same call `rank` makes.  n <= 0 or an empty
+ * vec yields an empty I64 vector.  This replaced a per-element
+ * ray_vec_get-based insertion sort that was O(n^2) and put every
+ * element into group 0. */
+ray_t* ray_xrank_fn(ray_t* n_obj, ray_t* vec) {
+    if (!is_numeric(n_obj)) return ray_error("type", "xrank: first arg must be integer");
+    if (!ray_is_vec(vec)) return ray_error("type", "xrank: second arg must be a vector");
+
+    const int64_t buckets = as_i64(n_obj);
+    const int64_t count = ray_len(vec);
+    if (!(buckets > 0 && count != 0)) return ray_vec_new(RAY_I64, 0);
+    uint8_t ascending = 0; /* sole descs[] entry: 0 = ascending */
+    ray_t* perm = ray_sort_indices(&vec, &ascending, NULL, 1, count);
+    if (perm == NULL) return ray_error("oom", NULL);
+    if (RAY_IS_ERR(perm)) return perm;
+
+    ray_t* groups = ray_vec_new(RAY_I64, count);
+    if (groups == NULL || RAY_IS_ERR(groups)) {
+        ray_release(perm);
+        return groups != NULL ? groups : ray_error("oom", NULL);
+    }
+    groups->len = count;
+    const int64_t* order = (const int64_t*)ray_data(perm);
+    int64_t* bucket_of = (int64_t*)ray_data(groups);
+    for (int64_t pos = 0; pos < count; pos++)
+        bucket_of[order[pos]] = pos * buckets / count; /* sort position -> group number */
+    ray_release(perm);
+    return groups;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/string.c b/crates/rayforce-sys/vendor/rayforce/src/ops/string.c
new file mode 100644
index 0000000..e943034
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/string.c
@@ -0,0 +1,604 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "ops/internal.h"
+#include "ops/glob.h"
+
+/* ============================================================================
+ * OP_LIKE and its case-insensitive variant: glob pattern matching on
+ * STR / SYM columns.  See ops/glob.[ch].
+ * Syntax: * (any), ? (one char), [abc] / [a-z] / [!abc] (character class).
+ * ============================================================================ */
+
+/* Shared implementation behind exec_like and exec_ilike; the two entry
+ * points differ only in the matcher that is called.
+ *
+ * Evaluates op->inputs[0] (the column to test) and op->inputs[1] (the
+ * pattern) and returns a RAY_BOOL vector with one byte per input row:
+ * 1 where the row matches the pattern, 0 where it does not.
+ *
+ *   g          graph the op belongs to
+ *   op         the node being executed; inputs[0] = data, inputs[1] = pattern
+ *   fold_case  false → ray_glob_match, true → ray_glob_match_ci
+ *
+ * Row handling by input type:
+ *   RAY_STR    each element is matched directly against the pattern
+ *   SYM        each id is resolved with ray_sym_str(); an id that does
+ *              not resolve yields 0
+ *   other      every row yields 0
+ *
+ * Every successfully evaluated input is released before returning.  When
+ * an input fails to evaluate, that failed value (NULL or an error object)
+ * is returned as-is; a failed result allocation is returned the same way.
+ *
+ * NOTE(review): the pattern is read through ray_str_ptr / ray_str_len
+ * without a type check — presumably only string atoms reach this point;
+ * confirm.  The ray_t obtained from ray_sym_str() is not released here,
+ * while ray_split_fn (ops/strop.c) does release it — confirm which
+ * ownership convention is correct. */
+static ray_t* exec_like_common(ray_graph_t* g, ray_op_t* op, bool fold_case) {
+    ray_t* input = exec_node(g, op->inputs[0]);
+    ray_t* pat_v = exec_node(g, op->inputs[1]);
+    if (!input || RAY_IS_ERR(input)) { if (pat_v && !RAY_IS_ERR(pat_v)) ray_release(pat_v); return input; }
+    if (!pat_v || RAY_IS_ERR(pat_v)) { ray_release(input); return pat_v; }
+
+    /* Pattern bytes and length, shared by every row. */
+    const char* pat_str = ray_str_ptr(pat_v);
+    size_t pat_len = ray_str_len(pat_v);
+
+    /* One output byte per input row. */
+    int64_t len = input->len;
+    ray_t* result = ray_vec_new(RAY_BOOL, len);
+    if (!result || RAY_IS_ERR(result)) {
+        ray_release(input); ray_release(pat_v);
+        return result;
+    }
+    result->len = len;
+    uint8_t* dst = (uint8_t*)ray_data(result);
+
+    int8_t in_type = input->type;
+    if (in_type == RAY_STR) {
+        const ray_str_t* elems; const char* pool;
+        str_resolve(input, &elems, &pool);
+        for (int64_t i = 0; i < len; i++) {
+            const char* sp = ray_str_t_ptr(&elems[i], pool);
+            size_t sl = elems[i].len;
+            dst[i] = (fold_case ? ray_glob_match_ci(sp, sl, pat_str, pat_len)
+                                : ray_glob_match(sp, sl, pat_str, pat_len)) ? 1 : 0;
+        }
+    } else if (RAY_IS_SYM(in_type)) {
+        const void* base = ray_data(input);
+        for (int64_t i = 0; i < len; i++) {
+            /* Resolve the symbol id to its text; unresolved ids never match. */
+            int64_t sym_id = ray_read_sym(base, i, in_type, input->attrs);
+            ray_t* s = ray_sym_str(sym_id);
+            if (!s) { dst[i] = 0; continue; }
+            const char* sp = ray_str_ptr(s);
+            size_t sl = ray_str_len(s);
+            dst[i] = (fold_case ? ray_glob_match_ci(sp, sl, pat_str, pat_len)
+                                : ray_glob_match(sp, sl, pat_str, pat_len)) ? 1 : 0;
+        }
+    } else {
+        /* Neither STR nor SYM: no row can match. */
+        memset(dst, 0, (size_t)len);
+    }
+
+    ray_release(input); ray_release(pat_v);
+    return result;
+}
+
+/* LIKE — glob match of op->inputs[0] against the pattern in
+ * op->inputs[1], using ray_glob_match.  See exec_like_common for the
+ * result and error contract. */
+ray_t* exec_like(ray_graph_t* g, ray_op_t* op) {
+    return exec_like_common(g, op, false);
+}
+
+/* Case-insensitive LIKE — same syntax as `like`, ASCII-fold both sides
+ * (ray_glob_match_ci).  See exec_like_common for the contract. */
+ray_t* exec_ilike(ray_graph_t* g, ray_op_t* op) {
+    return exec_like_common(g, op, true);
+}
+
+/* ============================================================================
+ * String functions: UPPER, LOWER, TRIM, STRLEN, SUBSTR, REPLACE, CONCAT
+ *
+ * These functions call ray_sym_intern() per output row, which is
+ * O(n * sym_table_lookup) per string op.  Acceptable for current workloads;
+ * could be optimized with batch interning if profiling shows a bottleneck.
+ * ============================================================================ */
+
+/* UPPER / LOWER / TRIM — unary SYM/STR → SYM/STR.
+ * Evaluates op->inputs[0] and rewrites each row per op->opcode: OP_UPPER /
+ * OP_LOWER map every byte through toupper / tolower, any other opcode is
+ * treated as OP_TRIM (isspace() bytes dropped at both ends).  RAY_STR in →
+ * RAY_STR out; anything else is interned into RAY_SYM.  Nulls stay null. */
+ray_t* exec_string_unary(ray_graph_t* g, ray_op_t* op) {
+    ray_t* input = exec_node(g, op->inputs[0]);
+    if (!input || RAY_IS_ERR(input)) return input; /* failed operand is handed back unchanged */
+    int64_t len = input->len;
+    bool is_str = (input->type == RAY_STR);
+
+    ray_t* result;
+    if (is_str) {
+        result = ray_vec_new(RAY_STR, len);
+    } else {
+        result = ray_vec_new(RAY_SYM, len);
+    }
+    if (!result || RAY_IS_ERR(result)) { ray_release(input); return result; }
+    if (!is_str) result->len = len; /* STR output grows through ray_str_vec_append instead */
+    int64_t* sym_dst = is_str ? NULL : (int64_t*)ray_data(result);
+    const ray_str_t* str_elems = NULL;
+    const char* str_pool = NULL;
+    if (is_str) str_resolve(input, &str_elems, &str_pool);
+
+    uint16_t opc = op->opcode;
+    for (int64_t i = 0; i < len; i++) {
+        /* Propagate null: STR appends an empty placeholder row, SYM stores id 0; both are then flagged null */
+        if (ray_vec_is_null((ray_t*)input, i)) {
+            if (is_str) {
+                result = ray_str_vec_append(result, "", 0);
+                if (RAY_IS_ERR(result)) break; /* NOTE(review): unlike the append further down, the previous vector is not released here — confirm */
+                ray_vec_set_null(result, result->len - 1, true);
+            } else {
+                sym_dst[i] = 0;
+                ray_vec_set_null(result, i, true);
+            }
+            continue;
+        }
+        const char* sp; size_t sl;
+        if (is_str) {
+            sp = ray_str_t_ptr(&str_elems[i], str_pool);
+            sl = str_elems[i].len;
+        } else {
+            sym_elem(input, i, &sp, &sl);
+        }
+        char sbuf[8192]; /* rows shorter than this use the stack buffer, longer ones a scratch allocation */
+        char* buf = sbuf;
+        ray_t* dyn_hdr = NULL;
+        if (sl >= sizeof(sbuf)) {
+            buf = (char*)scratch_alloc(&dyn_hdr, sl + 1); /* +1 leaves room for the terminator written on the SYM path */
+            if (!buf) {
+                ray_release(result);
+                ray_release(input);
+                return ray_error("oom", NULL);
+            }
+        }
+        size_t out_len = sl;
+        if (opc == OP_UPPER) {
+            for (size_t j = 0; j < out_len; j++) buf[j] = (char)toupper((unsigned char)sp[j]);
+        } else if (opc == OP_LOWER) {
+            for (size_t j = 0; j < out_len; j++) buf[j] = (char)tolower((unsigned char)sp[j]);
+        } else { /* OP_TRIM */
+            size_t start = 0, end = sl;
+            while (start < sl && isspace((unsigned char)sp[start])) start++;
+            while (end > start && isspace((unsigned char)sp[end - 1])) end--;
+            out_len = end - start;
+            memcpy(buf, sp + start, out_len);
+        }
+        if (is_str) {
+            ray_t* prev = result; /* kept so the old vector can be released if the append fails */
+            result = ray_str_vec_append(result, buf, out_len);
+            if (RAY_IS_ERR(result)) { ray_release(prev); scratch_free(dyn_hdr); break; }
+        } else {
+            buf[out_len] = '\0';
+            sym_dst[i] = ray_sym_intern(buf, out_len);
+        }
+        scratch_free(dyn_hdr);
+    }
+    ray_release(input);
+    return result;
+}
+
+/* LENGTH — STR/SYM → I64.
+ * Evaluates op->inputs[0] and returns an I64 vector holding the length
+ * of every row (elems[i].len for RAY_STR, the length reported by
+ * sym_elem() for anything else).  Null rows stay null, stored as 0.
+ * A failed input or a failed allocation is returned unchanged. */
+ray_t* exec_strlen(ray_graph_t* g, ray_op_t* op) {
+    ray_t* src = exec_node(g, op->inputs[0]);
+    if (!src || RAY_IS_ERR(src)) return src;
+
+    const int64_t nrows = src->len;
+    ray_t* out = ray_vec_new(RAY_I64, nrows);
+    if (!out || RAY_IS_ERR(out)) { ray_release(src); return out; }
+    out->len = nrows;
+    int64_t* lens = (int64_t*)ray_data(out);
+
+    /* RAY_STR rows are read from the resolved element array; every other
+     * input type goes through sym_elem(). */
+    const bool from_str = (src->type == RAY_STR);
+    const ray_str_t* elems = NULL;
+    const char* pool = NULL;
+    if (from_str) str_resolve(src, &elems, &pool);
+
+    for (int64_t row = 0; row < nrows; row++) {
+        if (ray_vec_is_null((ray_t*)src, row)) {
+            lens[row] = 0;
+            ray_vec_set_null(out, row, true);
+        } else if (from_str) {
+            lens[row] = (int64_t)elems[row].len;
+        } else {
+            const char* text; size_t text_len;
+            sym_elem(src, row, &text, &text_len);
+            lens[row] = (int64_t)text_len;
+        }
+    }
+    ray_release(src);
+    return out;
+}
+
+/* SUBSTR(str, start, len) — 1-based start.  inputs[0] is the STR/SYM column,
+ * inputs[1] the start; the len node's id is kept in the ext node's literal.
+ * start / len may be atoms, one-element vectors or per-row vectors.  A row is
+ * null when the input or a per-row start / len is null there; start < 1 acts
+ * as 1, a start past the end gives "", a negative or oversized len runs to the end. */
+ray_t* exec_substr(ray_graph_t* g, ray_op_t* op) {
+    ray_t* input = exec_node(g, op->inputs[0]);
+    ray_t* start_v = exec_node(g, op->inputs[1]);
+    if (!input || RAY_IS_ERR(input)) { if (start_v && !RAY_IS_ERR(start_v)) ray_release(start_v); return input; }
+    if (!start_v || RAY_IS_ERR(start_v)) { ray_release(input); return start_v; }
+
+    /* Get len arg from ext node's literal field; a missing ext node is reported the way exec_concat does */
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) { ray_release(input); ray_release(start_v); return ray_error("nyi", NULL); }
+    uint32_t len_id = (uint32_t)(uintptr_t)ext->literal;
+    ray_t* len_v = exec_node(g, &g->nodes[len_id]);
+    if (!len_v || RAY_IS_ERR(len_v)) { ray_release(input); ray_release(start_v); return len_v; }
+
+    int64_t nrows = input->len;
+    bool is_str = (input->type == RAY_STR);
+    ray_t* result = ray_vec_new(is_str ? RAY_STR : RAY_SYM, nrows);
+    if (!result || RAY_IS_ERR(result)) { ray_release(input); ray_release(start_v); ray_release(len_v); return result; }
+    if (!is_str) result->len = nrows; /* STR output grows through ray_str_vec_append instead */
+    int64_t* sym_dst = is_str ? NULL : (int64_t*)ray_data(result);
+
+    const ray_str_t* str_elems = NULL;
+    const char* str_pool = NULL;
+    if (is_str) str_resolve(input, &str_elems, &str_pool);
+
+    /* start_v and len_v may be atom scalars or vectors.
+     * Handle RAY_I32 vectors correctly (read as int32_t, not int64_t). */
+    int64_t s_scalar = 0, l_scalar = 0;
+    const int64_t* s_data = NULL;
+    const int64_t* l_data = NULL;
+    const int32_t* s_data_i32 = NULL;
+    const int32_t* l_data_i32 = NULL;
+    if (start_v->type == -RAY_I64) s_scalar = start_v->i64;
+    else if (start_v->type == -RAY_F64) s_scalar = (int64_t)start_v->f64;
+    else if (start_v->len == 1) {
+        if (start_v->type == RAY_F64)
+            s_scalar = (int64_t)((double*)ray_data(start_v))[0];
+        else if (start_v->type == RAY_I32)
+            s_scalar = (int64_t)((int32_t*)ray_data(start_v))[0];
+        else
+            s_scalar = ((int64_t*)ray_data(start_v))[0];
+    }
+    else if (start_v->type == RAY_I32) s_data_i32 = (const int32_t*)ray_data(start_v);
+    else s_data = (const int64_t*)ray_data(start_v);
+    if (len_v->type == -RAY_I64) l_scalar = len_v->i64;
+    else if (len_v->type == -RAY_F64) l_scalar = (int64_t)len_v->f64;
+    else if (len_v->len == 1) {
+        if (len_v->type == RAY_F64)
+            l_scalar = (int64_t)((double*)ray_data(len_v))[0];
+        else if (len_v->type == RAY_I32)
+            l_scalar = (int64_t)((int32_t*)ray_data(len_v))[0];
+        else
+            l_scalar = ((int64_t*)ray_data(len_v))[0];
+    }
+    else if (len_v->type == RAY_I32) l_data_i32 = (const int32_t*)ray_data(len_v);
+    else l_data = (const int64_t*)ray_data(len_v);
+
+    for (int64_t i = 0; i < nrows; i++) {
+        /* Propagate null — from input, start, or length */
+        if (ray_vec_is_null((ray_t*)input, i) ||
+            ((s_data || s_data_i32) && ray_vec_is_null((ray_t*)start_v, i)) ||
+            ((l_data || l_data_i32) && ray_vec_is_null((ray_t*)len_v, i))) {
+            if (is_str) {
+                result = ray_str_vec_append(result, "", 0);
+                if (RAY_IS_ERR(result)) break;
+                ray_vec_set_null(result, result->len - 1, true);
+            } else {
+                sym_dst[i] = 0;
+                ray_vec_set_null(result, i, true);
+            }
+            continue;
+        }
+        const char* sp; size_t sl;
+        if (is_str) {
+            sp = ray_str_t_ptr(&str_elems[i], str_pool);
+            sl = str_elems[i].len;
+        } else {
+            sym_elem(input, i, &sp, &sl);
+        }
+        /* 1-based → 0-based; clamping before the subtraction keeps INT64_MIN from overflowing */
+        int64_t st = s_data ? s_data[i] : s_data_i32 ? (int64_t)s_data_i32[i] : s_scalar;
+        st = (st > 0) ? st - 1 : 0;
+        int64_t ln = l_data ? l_data[i] : l_data_i32 ? (int64_t)l_data_i32[i] : l_scalar;
+        if ((size_t)st >= sl) {
+            if (is_str) {
+                result = ray_str_vec_append(result, "", 0);
+                if (RAY_IS_ERR(result)) break;
+            }
+            else { sym_dst[i] = ray_sym_intern("", 0); }
+            continue;
+        }
+        if (ln < 0 || ln > (int64_t)(sl - (size_t)st)) ln = (int64_t)sl - st;
+        if (is_str) {
+            result = ray_str_vec_append(result, sp + st, (size_t)ln);
+            if (RAY_IS_ERR(result)) break;
+        } else {
+            sym_dst[i] = ray_sym_intern(sp + st, (size_t)ln);
+        }
+    }
+    ray_release(input); ray_release(start_v); ray_release(len_v);
+    return result;
+}
+
+/* REPLACE(str, from, to) — replaces every occurrence of `from` in each row
+ * with `to`, scanning left to right without re-reading replaced text.
+ * inputs[0] is the STR/SYM column, inputs[1] is `from`; the `to` node's id is
+ * kept in the ext node's literal.  Null rows stay null; empty `from` copies. */
+ray_t* exec_replace(ray_graph_t* g, ray_op_t* op) {
+    ray_t* input = exec_node(g, op->inputs[0]);
+    ray_t* from_v = exec_node(g, op->inputs[1]);
+    if (!input || RAY_IS_ERR(input)) { if (from_v && !RAY_IS_ERR(from_v)) ray_release(from_v); return input; }
+    if (!from_v || RAY_IS_ERR(from_v)) { ray_release(input); return from_v; }
+
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) { ray_release(input); ray_release(from_v); return ray_error("nyi", NULL); }
+    uint32_t to_id = (uint32_t)(uintptr_t)ext->literal;
+    ray_t* to_v = exec_node(g, &g->nodes[to_id]);
+    if (!to_v || RAY_IS_ERR(to_v)) { ray_release(input); ray_release(from_v); return to_v; }
+
+    /* from_v and to_v are read through ray_str_ptr / ray_str_len — presumably string atoms; TODO confirm */
+    const char* from_str = ray_str_ptr(from_v);
+    size_t from_len = ray_str_len(from_v);
+    const char* to_str = ray_str_ptr(to_v);
+    size_t to_len = ray_str_len(to_v);
+
+    int64_t nrows = input->len;
+    bool is_str = (input->type == RAY_STR);
+
+    ray_t* result = ray_vec_new(is_str ? RAY_STR : RAY_SYM, nrows);
+    if (!result || RAY_IS_ERR(result)) { ray_release(input); ray_release(from_v); ray_release(to_v); return result; }
+    if (!is_str) result->len = nrows; /* STR output grows through ray_str_vec_append instead */
+    int64_t* sym_dst = is_str ? NULL : (int64_t*)ray_data(result);
+
+    const ray_str_t* str_elems = NULL;
+    const char* str_pool = NULL;
+    if (is_str) str_resolve(input, &str_elems, &str_pool);
+
+    for (int64_t i = 0; i < nrows; i++) {
+        /* Propagate null */
+        if (ray_vec_is_null((ray_t*)input, i)) {
+            if (is_str) {
+                result = ray_str_vec_append(result, "", 0);
+                if (RAY_IS_ERR(result)) break; /* NOTE(review): the append below releases the previous vector on failure, this one does not — confirm */
+                ray_vec_set_null(result, result->len - 1, true);
+            } else {
+                sym_dst[i] = 0;
+                ray_vec_set_null(result, i, true);
+            }
+            continue;
+        }
+        const char* sp; size_t sl;
+        if (is_str) {
+            sp = ray_str_t_ptr(&str_elems[i], str_pool);
+            sl = str_elems[i].len;
+        } else {
+            sym_elem(input, i, &sp, &sl);
+        }
+        /* Simple find-and-replace-all */
+        /* Worst case: every char is a match, each replaced by to_len bytes.
+         * Guard against size_t overflow when to_len >> from_len. */
+        size_t n_matches = (from_len > 0) ? sl / from_len : 0;
+        size_t worst;
+        if (from_len > 0 && to_len > from_len && n_matches > SIZE_MAX / to_len) {
+            worst = SIZE_MAX; /* overflow → cap at max; scratch_alloc will OOM */
+        } else if (from_len > 0 && to_len >= from_len) {
+            /* Expanding or same-size: max output when every chunk matches */
+            worst = n_matches * to_len + (sl % from_len) + 1;
+        } else {
+            /* Shrinking or from_len==0: max output when nothing matches → sl */
+            worst = sl + 1;
+        }
+        char sbuf[8192];
+        char* buf = sbuf;
+        ray_t* dyn_hdr = NULL;
+        if (worst > sizeof(sbuf)) {
+            buf = (char*)scratch_alloc(&dyn_hdr, worst);
+            if (!buf) {
+                ray_release(result);
+                ray_release(input); ray_release(from_v); ray_release(to_v);
+                return ray_error("oom", NULL);
+            }
+        }
+        size_t buf_cap = dyn_hdr ? worst : sizeof(sbuf);
+        /* With `worst` sized as above the capacity tests below should never fail; they are a backstop. */
+        size_t bi = 0;
+        for (size_t j = 0; j < sl; ) {
+            if (from_len > 0 && j + from_len <= sl && memcmp(sp + j, from_str, from_len) == 0) {
+                if (bi + to_len < buf_cap) { memcpy(buf + bi, to_str, to_len); bi += to_len; }
+                j += from_len;
+            } else {
+                if (bi < buf_cap - 1) buf[bi++] = sp[j];
+                j++;
+            }
+        }
+        if (is_str) {
+            ray_t* prev = result;
+            result = ray_str_vec_append(result, buf, bi);
+            if (RAY_IS_ERR(result)) { ray_release(prev); scratch_free(dyn_hdr); break; }
+        } else {
+            buf[bi] = '\0';
+            sym_dst[i] = ray_sym_intern(buf, bi);
+        }
+        scratch_free(dyn_hdr);
+    }
+    ray_release(input); ray_release(from_v); ray_release(to_v);
+    return result;
+}
+
+/* CONCAT(a, b, ...) — row-wise concatenation of 2..255 operands.
+ * inputs[0] / inputs[1] are the first two operands; the ids of the rest
+ * trail the ext node.  The result is RAY_STR when any operand is a RAY_STR
+ * vector, otherwise RAY_SYM; a row with any null operand is null. */
+ray_t* exec_concat(ray_graph_t* g, ray_op_t* op) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+    int64_t raw_nargs = ext->sym; /* operand count is carried in the ext node's sym field */
+    if (raw_nargs < 2 || raw_nargs > 255) return ray_error("domain", NULL);
+    int n_args = (int)raw_nargs;
+    /* Evaluate all inputs (up to 16 operand slots live on the stack, more go to scratch memory) */
+    ray_t* args_stack[16];
+    ray_t** args = args_stack;
+    ray_t* args_hdr = NULL;
+    if (n_args > 16) {
+        args = (ray_t**)scratch_calloc(&args_hdr, (size_t)n_args * sizeof(ray_t*));
+        if (!args) return ray_error("oom", NULL);
+    }
+    args[0] = exec_node(g, op->inputs[0]);
+    args[1] = exec_node(g, op->inputs[1]);
+    uint32_t* trail = (uint32_t*)((char*)(ext + 1)); /* node ids of operands 3..n sit right after the ext struct */
+    for (int i = 2; i < n_args; i++) {
+        args[i] = exec_node(g, &g->nodes[trail[i - 2]]);
+    }
+    /* Error check: the first failed operand is returned; every other valid operand is released */
+    for (int i = 0; i < n_args; i++) {
+        if (!args[i] || RAY_IS_ERR(args[i])) {
+            ray_t* err = args[i];
+            for (int j = 0; j < n_args; j++) {
+                if (j != i && args[j] && !RAY_IS_ERR(args[j])) ray_release(args[j]);
+            }
+            scratch_free(args_hdr);
+            return err;
+        }
+    }
+    /* Derive nrows from first vector arg (scalar args have byte-length in len) */
+    int64_t nrows = 1;
+    bool out_str = false;
+    for (int a = 0; a < n_args; a++) {
+        int8_t at = args[a]->type;
+        if (at == RAY_STR) { out_str = true; if (nrows == 1) nrows = args[a]->len; }
+        if (RAY_IS_SYM(at)) { if (nrows == 1) nrows = args[a]->len; }
+        if (!ray_is_atom(args[a]) && nrows == 1) { nrows = args[a]->len; }
+    }
+    ray_t* result = ray_vec_new(out_str ? RAY_STR : RAY_SYM, nrows);
+    if (!result || RAY_IS_ERR(result)) {
+        for (int i = 0; i < n_args; i++) ray_release(args[i]);
+        scratch_free(args_hdr);
+        return result;
+    }
+    if (!out_str) result->len = nrows; /* STR output grows through ray_str_vec_append instead */
+    int64_t* dst = out_str ? NULL : (int64_t*)ray_data(result);
+
+    for (int64_t r = 0; r < nrows; r++) {
+        /* Check if any arg is null at this row; vectors shorter than r+1 are read at row 0 (NOTE(review): that also indexes row 0 of an empty vector — confirm) */
+        bool any_null = false;
+        for (int a = 0; a < n_args; a++) {
+            if (ray_is_atom(args[a])) {
+                if (RAY_ATOM_IS_NULL(args[a])) { any_null = true; break; }
+            } else if (ray_vec_is_null((ray_t*)args[a], r < args[a]->len ? r : 0)) {
+                any_null = true;
+                break;
+            }
+        }
+        if (any_null) {
+            if (out_str) {
+                result = ray_str_vec_append(result, "", 0);
+                if (RAY_IS_ERR(result)) break; /* NOTE(review): the append at the end of the loop releases the previous vector on failure, this one does not — confirm */
+                ray_vec_set_null(result, result->len - 1, true);
+            } else {
+                dst[r] = 0;
+                ray_vec_set_null(result, r, true);
+            }
+            continue;
+        }
+        /* Pre-scan to compute total concat length for this row (operands that are not STR / SYM / string atoms add nothing) */
+        size_t total = 0;
+        for (int a = 0; a < n_args; a++) {
+            int8_t t = args[a]->type;
+            if (t == RAY_STR) {
+                const ray_str_t* elems; const char* p;
+                str_resolve(args[a], &elems, &p);
+                int64_t ar = ray_is_atom(args[a]) ? 0 : (r < args[a]->len ? r : 0);
+                total += elems[ar].len;
+            } else if (RAY_IS_SYM(t)) {
+                const char* sp; size_t sl;
+                int64_t ar = ray_is_atom(args[a]) ? 0 : (r < args[a]->len ? r : 0);
+                sym_elem(args[a], ar, &sp, &sl);
+                total += sl;
+            } else if (t == -RAY_STR) {
+                total += ray_str_len(args[a]);
+            }
+        }
+        char sbuf[8192]; /* rows that fit use the stack buffer, longer ones a scratch allocation of total + 1 bytes */
+        char* buf = sbuf;
+        ray_t* dyn_hdr = NULL;
+        size_t buf_cap = sizeof(sbuf);
+        if (total >= sizeof(sbuf)) {
+            buf = (char*)scratch_alloc(&dyn_hdr, total + 1);
+            if (!buf) {
+                ray_release(result);
+                for (int i = 0; i < n_args; i++) ray_release(args[i]);
+                scratch_free(args_hdr);
+                return ray_error("oom", NULL);
+            }
+            buf_cap = total + 1;
+        }
+        size_t bi = 0; /* write cursor into buf */
+        for (int a = 0; a < n_args; a++) {
+            int8_t t = args[a]->type;
+            if (t == RAY_STR) {
+                const ray_str_t* elems; const char* pool;
+                str_resolve(args[a], &elems, &pool);
+                int64_t ar = ray_is_atom(args[a]) ? 0 : (r < args[a]->len ? r : 0);
+                const char* sp = ray_str_t_ptr(&elems[ar], pool);
+                size_t sl = elems[ar].len;
+                if (bi + sl < buf_cap) { memcpy(buf + bi, sp, sl); bi += sl; }
+            } else if (RAY_IS_SYM(t)) {
+                const char* sp; size_t sl;
+                int64_t ar = ray_is_atom(args[a]) ? 0 : (r < args[a]->len ? r : 0);
+                sym_elem(args[a], ar, &sp, &sl);
+                if (bi + sl < buf_cap) { memcpy(buf + bi, sp, sl); bi += sl; }
+            } else if (t == -RAY_STR) {
+                const char* sp = ray_str_ptr(args[a]);
+                size_t sl = ray_str_len(args[a]);
+                if (sp && bi + sl < buf_cap) { memcpy(buf + bi, sp, sl); bi += sl; }
+            }
+        }
+        if (out_str) {
+            ray_t* prev = result; /* kept so the old vector can be released if the append fails */
+            result = ray_str_vec_append(result, buf, bi);
+            if (RAY_IS_ERR(result)) { ray_release(prev); scratch_free(dyn_hdr); break; }
+        } else {
+            buf[bi] = '\0';
+            dst[r] = ray_sym_intern(buf, bi);
+        }
+        scratch_free(dyn_hdr);
+    }
+    for (int i = 0; i < n_args; i++) ray_release(args[i]);
+    scratch_free(args_hdr);
+    return result;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/strop.c b/crates/rayforce-sys/vendor/rayforce/src/ops/strop.c
new file mode 100644
index 0000000..9744398
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/strop.c
@@ -0,0 +1,281 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "lang/internal.h"
+#include "table/sym.h"
+#include "ops/glob.h"
+
+/* ══════════════════════════════════════════
+ * String builtins
+ * ══════════════════════════════════════════ */
+
+/* Copy delim's split points into out[] (the caller has checked they fit),
+ * clamping each to [0, limit] so no segment can leave the source. */
+static void split_load_indices(ray_t* delim, int64_t limit, int64_t* out) {
+    for (int64_t k = 0; k < delim->len; k++) {
+        int alloc = 0;
+        ray_t* ie = collection_elem(delim, k, &alloc);
+        int64_t v = as_i64(ie);
+        if (alloc) ray_release(ie);
+        out[k] = v < 0 ? 0 : (v > limit ? limit : v);
+    }
+}
+
+/* (split x y) — the form is picked from the argument types:
+ *   list + integer vector: list of segments (all-atom, single-type,
+ *     non-string segments become typed vectors, the rest sub-lists);
+ *   vector or string + integer vector: list of sub-vectors / substrings;
+ *   string or symbol + string delimiter: list of the pieces in between.
+ * Segment i spans [idx[i], idx[i+1]), the last runs to the end; indices are
+ * clamped to the source, so a bad index shrinks a segment instead of
+ * reading out of bounds.  Empty indices → NULL; over 256 → "limit" error. */
+ray_t* ray_split_fn(ray_t* str, ray_t* delim) {
+    /* List split: (split list indices) → list of sub-lists */
+    if (str->type == RAY_LIST &&
+        ray_is_vec(delim) && (delim->type == RAY_I64 || delim->type == RAY_I16 || delim->type == RAY_I32)) {
+        int64_t nidx = delim->len;
+        if (nidx == 0) return NULL; /* null for empty indices */
+        int64_t idx_buf[256];
+        if (nidx > 256) return ray_error("limit", NULL);
+        int64_t total = str->len;
+        split_load_indices(delim, total, idx_buf);
+        ray_t** items = (ray_t**)ray_data(str);
+        ray_t* result = ray_list_new(nidx + 1);
+        if (RAY_IS_ERR(result)) return result;
+        for (int64_t i = 0; i < nidx; i++) {
+            int64_t start = idx_buf[i];
+            int64_t end = (i + 1 < nidx) ? idx_buf[i + 1] : total;
+            int64_t seglen = (end > start) ? end - start : 0;
+            /* Try to make a typed vector if all elements are same type */
+            if (seglen > 0) {
+                int8_t first_type = items[start]->type;
+                int all_same = 1;
+                for (int64_t j = start + 1; j < end; j++) {
+                    if (items[j]->type != first_type) { all_same = 0; break; }
+                }
+                if (all_same && first_type < 0 && first_type != -RAY_STR) {
+                    int8_t vtype = -first_type;
+                    ray_t* vec = ray_vec_new(vtype, seglen);
+                    if (!RAY_IS_ERR(vec)) {
+                        vec->len = seglen;
+                        for (int64_t j = 0; j < seglen; j++)
+                            store_typed_elem(vec, j, items[start + j]);
+                        result = ray_list_append(result, vec);
+                        ray_release(vec);
+                        if (RAY_IS_ERR(result)) return result;
+                        continue;
+                    }
+                }
+            }
+            /* Heterogeneous or string segment: make a sub-list */
+            ray_t* seg = ray_list_new(seglen);
+            if (RAY_IS_ERR(seg)) { ray_release(result); return seg; }
+            for (int64_t j = 0; j < seglen; j++) {
+                ray_retain(items[start + j]);
+                seg = ray_list_append(seg, items[start + j]);
+                ray_release(items[start + j]);
+                if (RAY_IS_ERR(seg)) { ray_release(result); return seg; }
+            }
+            result = ray_list_append(result, seg);
+            ray_release(seg);
+            if (RAY_IS_ERR(result)) return result;
+        }
+        return result;
+    }
+    /* Vector/string split: (split vec/str indices) → list of sub-vectors/substrings */
+    if ((ray_is_vec(str) || (ray_is_atom(str) && (-str->type) == RAY_STR)) &&
+        ray_is_vec(delim) && (delim->type == RAY_I64 || delim->type == RAY_I16 || delim->type == RAY_I32)) {
+        int64_t nidx = delim->len;
+        if (nidx == 0) return NULL; /* null for empty indices */
+        int64_t idx_buf[256];
+        if (nidx > 256) return ray_error("limit", NULL);
+        int is_str_atom = ray_is_atom(str) && (-str->type) == RAY_STR;
+        int64_t total = is_str_atom ? (int64_t)ray_str_len(str) : str->len;
+        split_load_indices(delim, total, idx_buf);
+        /* String split by indices */
+        if (is_str_atom) {
+            const char* sp2 = ray_str_ptr(str);
+            ray_t* result = ray_list_new(nidx + 1);
+            if (RAY_IS_ERR(result)) return result;
+            for (int64_t i = 0; i < nidx; i++) {
+                int64_t start = idx_buf[i];
+                int64_t end = (i + 1 < nidx) ? idx_buf[i + 1] : total;
+                int64_t seglen = (end > start) ? end - start : 0;
+                ray_t* seg = ray_str(sp2 + start, (size_t)seglen);
+                if (RAY_IS_ERR(seg)) { ray_release(result); return seg; }
+                result = ray_list_append(result, seg);
+                ray_release(seg);
+                if (RAY_IS_ERR(result)) return result;
+            }
+            return result;
+        }
+        /* Vector split by indices */
+        int esz = ray_elem_size(str->type);
+        ray_t* result = ray_list_new(nidx + 1);
+        if (RAY_IS_ERR(result)) return result;
+        for (int64_t i = 0; i < nidx; i++) {
+            int64_t start = idx_buf[i];
+            int64_t end = (i + 1 < nidx) ? idx_buf[i + 1] : total;
+            int64_t seglen = (end > start) ? end - start : 0;
+            ray_t* seg = ray_vec_new(str->type, seglen);
+            if (RAY_IS_ERR(seg)) { ray_release(result); return seg; }
+            seg->len = seglen;
+            if (seglen > 0) memcpy(ray_data(seg), (char*)ray_data(str) + start * esz, seglen * esz);
+            result = ray_list_append(result, seg);
+            ray_release(seg);
+            if (RAY_IS_ERR(result)) return result;
+        }
+        return result;
+    }
+    /* Normalize str and delim to string pointers */
+    const char *sp, *dp; size_t slen, dlen;
+    ray_t* sym_str_s = NULL;
+    if (str->type == -RAY_STR) { sp = ray_str_ptr(str); slen = ray_str_len(str); }
+    else if (str->type == -RAY_SYM) { sym_str_s = ray_sym_str(str->i64); if (!sym_str_s) return ray_error("domain", NULL); sp = ray_str_ptr(sym_str_s); slen = ray_str_len(sym_str_s); }
+    else return ray_error("type", NULL); /* RAY_CHAR removed — all chars are now -RAY_STR */
+    if (delim->type == -RAY_STR) { dp = ray_str_ptr(delim); dlen = ray_str_len(delim); }
+    else { if (sym_str_s) ray_release(sym_str_s); return ray_error("type", NULL); } /* RAY_CHAR removed — all chars are now -RAY_STR */
+
+    ray_t* result = ray_list_new(8);
+    if (RAY_IS_ERR(result)) { if (sym_str_s) ray_release(sym_str_s); return result; }
+    /* Empty text/delimiter, or a delimiter longer than the text: the whole
+     * string is the only piece (and `slen - dlen` below cannot wrap). */
+    if (dlen == 0 || dlen > slen) {
+        ray_t* part = ray_str(sp, slen);
+        if (RAY_IS_ERR(part)) { ray_release(result); if (sym_str_s) ray_release(sym_str_s); return part; }
+        result = ray_list_append(result, part);
+        ray_release(part);
+        if (sym_str_s) ray_release(sym_str_s);
+        return result;
+    }
+
+    size_t start = 0;
+    for (size_t i = 0; i <= slen - dlen; ) {
+        if (memcmp(sp + i, dp, dlen) == 0) {
+            ray_t* part = ray_str(sp + start, i - start);
+            if (RAY_IS_ERR(part)) { ray_release(result); if (sym_str_s) ray_release(sym_str_s); return part; }
+            result = ray_list_append(result, part);
+            ray_release(part);
+            if (RAY_IS_ERR(result)) { if (sym_str_s) ray_release(sym_str_s); return result; }
+            i += dlen;
+            start = i;
+        } else {
+            i++;
+        }
+    }
+    /* Last part */
+    ray_t* part = ray_str(sp + start, slen - start);
+    if (RAY_IS_ERR(part)) { ray_release(result); if (sym_str_s) ray_release(sym_str_s); return part; }
+    result = ray_list_append(result, part);
+    ray_release(part);
+    if (sym_str_s) ray_release(sym_str_s);
+    return result;
+}
+
+/* (like str pattern) — glob-style pattern matching.
+ * Syntax: * (any), ? (one char), [abc] / [a-z] / [!abc] (char class).
+ * Implementation lives in src/ops/glob.[ch]; same matcher is used by
+ * the DAG executor (string.c::exec_like) for select-where contexts. */
+ray_t* ray_like_fn(ray_t* x, ray_t* pattern) {
+    /* The glob itself has to be a string atom. */
+    if (pattern->type != -RAY_STR) return ray_error("type", "like: pattern must be a string");
+    const char* glob = ray_str_ptr(pattern);
+    size_t glob_len = ray_str_len(pattern);
+
+    /* String atom: match its bytes directly. */
+    if (x->type == -RAY_STR)
+        return make_bool(ray_glob_match(ray_str_ptr(x), ray_str_len(x), glob, glob_len) ? 1 : 0);
+
+    /* Symbol atom: match the interned text; an ID that does not resolve
+     * is matched as the empty string. */
+    if (x->type == -RAY_SYM) {
+        ray_t* text = ray_sym_str(x->i64);
+        bool hit = text ? ray_glob_match(ray_str_ptr(text), ray_str_len(text), glob, glob_len)
+                        : ray_glob_match("", 0, glob, glob_len);
+        if (text) ray_release(text);
+        return make_bool(hit ? 1 : 0);
+    }
+
+    /* Everything else must be a SYM or STR vector. */
+    if (!ray_is_vec(x) || (x->type != RAY_SYM && x->type != RAY_STR))
+        return ray_error("type", "like: expects string or symbol");
+
+    /* Vector: one bool per element. */
+    int64_t count = ray_len(x);
+    ray_t* flags = ray_vec_new(RAY_BOOL, count);
+    if (RAY_IS_ERR(flags)) return flags;
+    flags->len = count;
+    uint8_t* dst = (uint8_t*)ray_data(flags);
+
+    if (x->type == RAY_STR) {
+        /* A NULL element pointer is reported as a non-match. */
+        for (int64_t k = 0; k < count; k++) {
+            size_t elen;
+            const char* e = ray_str_vec_get(x, k, &elen);
+            dst[k] = (e && ray_glob_match(e, elen, glob, glob_len)) ? 1 : 0;
+        }
+        return flags;
+    }
+
+    /* SYM vector: resolve each ID; as in the atom case, an unresolved ID
+     * is matched as the empty string. */
+    const int64_t* ids = (const int64_t*)ray_data(x);
+    for (int64_t k = 0; k < count; k++) {
+        ray_t* text = ray_sym_str(ids[k]);
+        const char* e = text ? ray_str_ptr(text) : "";
+        size_t elen = text ? ray_str_len(text) : 0;
+        dst[k] = ray_glob_match(e, elen, glob, glob_len) ? 1 : 0;
+        if (text) ray_release(text);
+    }
+    return flags;
+}
+
+/* (sym-name x) -- map interned sym IDs (i64 atom or vector) to symbols; syms pass through retained. */
+ray_t* ray_sym_name_fn(ray_t* x) {
+    if (x->type == -RAY_I64) {
+        ray_t* probe = x->i64 < 0 ? NULL : ray_sym_str(x->i64);
+        if (!probe) return ray_error("domain", "sym-name: invalid sym ID");
+        ray_release(probe);  /* only existence matters; drop the lookup's ref */
+        return ray_sym(x->i64);
+    }
+    if (x->type == RAY_I64) {
+        int64_t n = x->len;
+        const int64_t* data = (const int64_t*)ray_data(x);
+        /* Validate all IDs first so nothing is allocated on failure. */
+        for (int64_t i = 0; i < n; i++) {
+            ray_t* probe = data[i] < 0 ? NULL : ray_sym_str(data[i]);
+            if (!probe) return ray_error("domain", "sym-name: invalid sym ID in vector");
+            ray_release(probe);
+        }
+        ray_t* out = ray_vec_new(RAY_SYM, n);
+        if (RAY_IS_ERR(out)) return out;
+        for (int64_t i = 0; i < n; i++) {
+            out = ray_vec_append(out, &data[i]);
+            if (RAY_IS_ERR(out)) return out;
+        }
+        return out;
+    }
+    /* Already a sym atom/vector (empty I64 vectors are handled above): return it retained. */
+    if (x->type == -RAY_SYM || x->type == RAY_SYM) { ray_retain(x); return x; }
+    return ray_error("type", "sym-name: expected i64 or i64 vector");
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/system.c b/crates/rayforce-sys/vendor/rayforce/src/ops/system.c
new file mode 100644
index 0000000..43f5d92
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/system.c
@@ -0,0 +1,827 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "lang/internal.h"
+#include "lang/env.h"
+#include "lang/parse.h"
+#include "mem/heap.h"
+#include "store/serde.h"
+#include "store/splay.h"
+#include "store/part.h"
+#include "core/ipc.h"
+#include <time.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#if !defined(RAY_OS_WINDOWS)
+#include <unistd.h>
+#endif
+
+/* ══════════════════════════════════════════
+ * Serialization / storage
+ * ══════════════════════════════════════════ */
+
+/* (ser val) -> serialize to U8 vector with IPC header; delegates directly to ray_ser() */
+ray_t* ray_ser_fn(ray_t* val) {
+    return ray_ser(val);
+}
+
+/* (de bytes) -> deserialize from U8 vector; delegates directly to ray_de() */
+ray_t* ray_de_fn(ray_t* val) {
+    return ray_de(val);
+}
+
+/* Format "<dir>/sym" into buf.  Yields NULL if the path does not fit, or if
+ * must_exist is set and the file is not accessible; otherwise yields buf. */
+static const char* splay_default_sym(const char* dir, char* buf, size_t bufsz, bool must_exist) {
+    int written = snprintf(buf, bufsz, "%s/sym", dir);
+    bool fits = written >= 0 && (size_t)written < bufsz;
+    if (!fits) return NULL;
+    return (must_exist && access(buf, F_OK) != 0) ? NULL : buf;
+}
+
+/* Helper: copy a STR atom into buf as a NUL-terminated C path; returns buf, or NULL when s is
+ * not a STR atom, is empty, does not fit in bufsz, or holds an embedded NUL (silent truncation). */
+static const char* str_to_cpath(ray_t* s, char* buf, size_t bufsz) {
+    if (!s || s->type != -RAY_STR) return NULL;
+    const char* p = ray_str_ptr(s);
+    size_t len = ray_str_len(s);
+    if (!p || len == 0 || len >= bufsz || memchr(p, '\0', len)) return NULL;
+    memcpy(buf, p, len);
+    buf[len] = '\0';
+    return buf;
+}
+
+/* (.db.splayed.set "dir" table) or (.db.splayed.set "dir" table "sym_path")
+ * Saves `table` under `dir` and returns the table itself, retained. */
+ray_t* ray_set_splayed_fn(ray_t** args, int64_t n) {
+    if (n < 2 || n > 3) return ray_error("domain", NULL);
+
+    char dir_path[1024];
+    if (str_to_cpath(args[0], dir_path, sizeof(dir_path)) == NULL) return ray_error("type", NULL);
+
+    ray_t* table = args[1];
+    if (table == NULL || table->type != RAY_TABLE) return ray_error("type", NULL);
+
+    /* An explicit string third argument names the sym file; anything else
+     * falls back to "<dir>/sym", which need not exist yet when saving. */
+    char sym_buf[1024];
+    bool explicit_sym = n == 3 && args[2] && args[2]->type == -RAY_STR;
+    const char* sym_path = explicit_sym ? str_to_cpath(args[2], sym_buf, sizeof(sym_buf))
+                                        : splay_default_sym(dir_path, sym_buf, sizeof(sym_buf), false);
+
+    ray_err_t status = ray_splay_save(table, dir_path, sym_path);
+    if (status != RAY_OK) return ray_error(ray_err_code_str(status), NULL);
+    ray_retain(table);
+    return table;
+}
+
+/* (.db.splayed.get "dir") or (.db.splayed.get "dir" "sym_path")
+ * Without an explicit sym path, "<dir>/sym" is used only if it exists. */
+ray_t* ray_get_splayed_fn(ray_t** args, int64_t n) {
+    if (n < 1 || n > 2) return ray_error("domain", NULL);
+
+    char dir_path[1024];
+    if (str_to_cpath(args[0], dir_path, sizeof(dir_path)) == NULL) return ray_error("type", NULL);
+
+    char sym_buf[1024];
+    /* An explicit string second argument wins; otherwise probe the default. */
+    bool explicit_sym = n == 2 && args[1] && args[1]->type == -RAY_STR;
+    const char* sym_path = explicit_sym ? str_to_cpath(args[1], sym_buf, sizeof(sym_buf))
+                                        : splay_default_sym(dir_path, sym_buf, sizeof(sym_buf), true);
+
+    return ray_splay_load(dir_path, sym_path);
+}
+
+/* (.db.parted.get "db_root" `table_name) -- load partitioned table; name is a sym of 1..255 bytes */
+ray_t* ray_get_parted_fn(ray_t** args, int64_t n) {
+    if (n != 2) return ray_error("domain", NULL);
+
+    char root[1024];
+    if (!str_to_cpath(args[0], root, sizeof(root))) return ray_error("type", NULL);
+
+    /* Table name as symbol atom */
+    if (!args[1] || args[1]->type != -RAY_SYM) return ray_error("type", NULL);
+    ray_t* name_atom = ray_sym_str(args[1]->i64);
+    if (!name_atom) return ray_error("name", NULL);
+
+    /* Copy the name out so the lookup's reference can be released on every path. */
+    char name[256];
+    size_t nlen = ray_str_len(name_atom);
+    bool fits = nlen > 0 && nlen < sizeof(name);
+    if (fits) { memcpy(name, ray_str_ptr(name_atom), nlen); name[nlen] = '\0'; }
+    ray_release(name_atom);
+    return fits ? ray_read_parted(root, name) : ray_error("domain", NULL);
+}
+
+/* ══════════════════════════════════════════
+ * Mount helpers (.db.splayed.mount / .db.parted.mount).
+ *
+ * `mount` walks a root directory, identifies child tables, loads each,
+ * binds it as a global named after the directory entry, and returns
+ * a `name → table` dict so callers can introspect what was loaded
+ * without re-scanning the filesystem.  Mirrors kdb's `\l /tmp/db/`
+ * but split into format-specific entry points so the discovery
+ * heuristics can be tighter (splayed: presence of `.d` schema;
+ * parted: presence of partition directories matching digit/dot).
+ * ══════════════════════════════════════════ */
+
+#include <sys/stat.h>
+#include <dirent.h>
+
+/* Reports whether `dir` holds a splayed table, i.e. whether "<dir>/.d"
+ * (the schema file) is accessible.  Returns 1 or 0; only probes via access(). */
+static int dir_is_splayed(const char* dir) {
+    char schema_path[1024];
+    int len = snprintf(schema_path, sizeof(schema_path), "%s/.d", dir);
+    int fits = len > 0 && len < (int)sizeof(schema_path);
+    return fits && access(schema_path, F_OK) == 0;
+}
+
+/* Partition-directory name test: returns 1 when `name` is non-empty and
+ * consists solely of ASCII digits and `.` characters, else 0.  Matches the
+ * collect_part_dirs heuristic in store/part.c. */
+static int name_looks_partition(const char* name) {
+    if (name == NULL || *name == '\0') return 0;
+    size_t i = 0;
+    while (name[i] == '.' || (name[i] >= '0' && name[i] <= '9')) i++;
+    return name[i] == '\0';
+}
+
+/* Reports whether `dir` is a parted-table root: it must contain at least
+ * one subdirectory whose name passes name_looks_partition(). */
+static int dir_is_parted_root(const char* dir) {
+    DIR* handle = opendir(dir);
+    if (handle == NULL) return 0;
+    int is_root = 0;
+    struct dirent* e;
+    while (!is_root && (e = readdir(handle)) != NULL) {
+        const char* nm = e->d_name;
+        /* Dot-entries, the "sym" entry and non-partition names are ignored. */
+        if (nm[0] == '.' || strcmp(nm, "sym") == 0 || !name_looks_partition(nm)) continue;
+        char child_path[2048];
+        int len = snprintf(child_path, sizeof(child_path), "%s/%s", dir, nm);
+        if (len <= 0 || len >= (int)sizeof(child_path)) continue;
+        struct stat info;
+        is_root = stat(child_path, &info) == 0 && S_ISDIR(info.st_mode);
+    }
+    closedir(handle);
+    return is_root;
+}
+
+/* Bind `name` as a global pointing to `tbl` and record the (name, tbl) pair
+ * in the caller's buffers, retaining tbl for the buffer slot.  Once `max`
+ * pairs are recorded, further calls do nothing (no binding, no append). */
+static void mount_record(int64_t* names_buf, ray_t** vals_buf, int* count,
+                         int max, const char* name, size_t nlen, ray_t* tbl) {
+    int slot = *count;
+    if (slot >= max) return;
+    names_buf[slot] = ray_sym_intern(name, nlen);
+    ray_env_set(names_buf[slot], tbl);
+    ray_retain(tbl);
+    vals_buf[slot] = tbl;
+    *count = slot + 1;
+}
+
+/* Pack the collected pairs into a {sym -> table} dict, dropping vals_buf's refs on every path. */
+static ray_t* finalize_mount_dict(int64_t* names_buf, ray_t** vals_buf, int count) {
+    if (count == 0) return ray_dict_new(ray_list_new(0), ray_list_new(0));
+    ray_t* keys = ray_vec_new(RAY_SYM, count);
+    ray_t* vals = (keys && !RAY_IS_ERR(keys)) ? ray_list_new(count) : NULL;
+    for (int i = 0; i < count; i++) {
+        /* Stop appending after the first failure, but keep releasing our refs. */
+        if (vals && !RAY_IS_ERR(vals)) vals = ray_list_append(vals, vals_buf[i]);
+        ray_release(vals_buf[i]);
+    }
+    if (!keys || RAY_IS_ERR(keys)) return keys ? keys : ray_error("oom", NULL);
+    if (!vals || RAY_IS_ERR(vals)) { ray_release(keys); return vals ? vals : ray_error("oom", NULL); }
+    keys->len = count; memcpy(ray_data(keys), names_buf, (size_t)count * sizeof(int64_t));
+    return ray_dict_new(keys, vals);
+}
+
+/* (.db.splayed.mount "root") — for each immediate subdirectory of
+ * root that contains a `.d` schema file, load it as a splayed table
+ * and bind it as a global named after the subdirectory.  Returns a
+ * dict {name → table} of the bindings made. */
+ray_t* ray_db_splayed_mount_fn(ray_t** args, int64_t n) {
+    if (n != 1) return ray_error("domain", NULL);
+    char root[1024];
+    if (str_to_cpath(args[0], root, sizeof(root)) == NULL) return ray_error("type", NULL);
+
+    DIR* dirp = opendir(root);
+    if (dirp == NULL) return ray_error("io", "cannot open directory");
+
+    enum { MOUNT_MAX = 256 };  /* capacity of the pair buffers below */
+    int64_t names_buf[MOUNT_MAX];
+    ray_t*  vals_buf[MOUNT_MAX];
+    int     mounted = 0;
+
+    for (struct dirent* e = readdir(dirp); e != NULL; e = readdir(dirp)) {
+        if (e->d_name[0] == '.') continue;  /* skips ".", ".." and hidden entries */
+        char child[2048];
+        int len = snprintf(child, sizeof(child), "%s/%s", root, e->d_name);
+        if (len <= 0 || len >= (int)sizeof(child)) continue;
+        struct stat info;
+        int is_dir = stat(child, &info) == 0 && S_ISDIR(info.st_mode);
+        if (!is_dir || !dir_is_splayed(child)) continue;
+
+        /* Entries that fail to load are skipped rather than aborting the mount. */
+        ray_t* tbl = ray_splay_load(child, NULL);
+        if (tbl == NULL) continue;
+        if (!RAY_IS_ERR(tbl))
+            mount_record(names_buf, vals_buf, &mounted, MOUNT_MAX,
+                         e->d_name, strlen(e->d_name), tbl);
+        ray_release(tbl);  /* drop the load's ref (or the error object) */
+    }
+    closedir(dirp);
+    return finalize_mount_dict(names_buf, vals_buf, mounted);
+}
+
+/* (.db.parted.mount "root") — discover the table names under a
+ * partitioned root by inspecting the first partition directory, then
+ * load each name via ray_read_parted (zero-copy parted view) and
+ * bind it as a global.  Returns a dict {name → table}. */
+ray_t* ray_db_parted_mount_fn(ray_t** args, int64_t n) {
+    if (n != 1) return ray_error("domain", NULL);
+    char root[1024];
+    if (str_to_cpath(args[0], root, sizeof(root)) == NULL) return ray_error("type", NULL);
+
+    if (dir_is_parted_root(root) == 0)
+        return ray_error("domain", "not a parted-table root (no partition directories found)");
+
+    /* Pass 1: pick the first partition directory readdir() yields; its children name the tables. */
+    DIR* root_dir = opendir(root);
+    if (root_dir == NULL) return ray_error("io", "cannot open directory");
+    char first_part[2048] = {0};
+    int have_part = 0;
+    struct dirent* ent;
+    while (!have_part && (ent = readdir(root_dir)) != NULL) {
+        const char* nm = ent->d_name;
+        if (nm[0] == '.' || strcmp(nm, "sym") == 0 || !name_looks_partition(nm)) continue;
+        int len = snprintf(first_part, sizeof(first_part), "%s/%s", root, nm);
+        struct stat info;
+        have_part = len > 0 && len < (int)sizeof(first_part) &&
+                    stat(first_part, &info) == 0 && S_ISDIR(info.st_mode);
+        if (!have_part) first_part[0] = '\0';
+    }
+    closedir(root_dir);
+    if (first_part[0] == '\0') return ray_error("io", "parted root has no readable partition");
+
+    /* Pass 2: every subdirectory of that partition is taken as a table
+     * name and loaded via ray_read_parted(root, name). */
+    DIR* part_dir = opendir(first_part);
+    if (part_dir == NULL) return ray_error("io", "cannot scan partition");
+
+    enum { MOUNT_MAX = 256 };  /* capacity of the pair buffers below */
+    int64_t names_buf[MOUNT_MAX];
+    ray_t*  vals_buf[MOUNT_MAX];
+    int     mounted = 0;
+
+    while ((ent = readdir(part_dir)) != NULL) {
+        if (ent->d_name[0] == '.') continue;
+        char tbl_dir[3072];
+        int len = snprintf(tbl_dir, sizeof(tbl_dir), "%s/%s", first_part, ent->d_name);
+        if (len <= 0 || len >= (int)sizeof(tbl_dir)) continue;
+        struct stat info;
+        if (stat(tbl_dir, &info) != 0 || !S_ISDIR(info.st_mode)) continue;
+
+        /* Tables that fail to load are skipped rather than aborting the mount. */
+        ray_t* tbl = ray_read_parted(root, ent->d_name);
+        if (tbl == NULL) continue;
+        if (!RAY_IS_ERR(tbl)) mount_record(names_buf, vals_buf, &mounted, MOUNT_MAX,
+                                           ent->d_name, strlen(ent->d_name), tbl);
+        ray_release(tbl);  /* drop the load's ref (or the error object) */
+    }
+    closedir(part_dir);
+    return finalize_mount_dict(names_buf, vals_buf, mounted);
+}
+
+/* ══════════════════════════════════════════
+ * Filesystem metadata: .os.size / .os.list
+ *
+ * Issue #36 asked for size + existence + listing primitives.  We
+ * keep just two — `.os.size` and `.os.list` — because every other
+ * predicate (exists, is-file, is-dir) is reachable either via
+ * try-on-error against these or via the existing shell fallback
+ * (`(.sys.cmd "test -e p")` etc.).  Both errors are flagged "io"
+ * so a user wrapping the call in `try` can distinguish missing /
+ * wrong-kind from a domain mistake without introspecting the
+ * message.
+ * ══════════════════════════════════════════ */
+
+/* (.os.size "path") → i64 file size in bytes.  Errors with "io"
+ * when the path doesn't exist or names a directory — `try` it if
+ * the caller wants those treated as "not a file" rather than a
+ * hard error. */
+ray_t* ray_os_size_fn(ray_t* x) {
+    int is_path = ray_is_atom(x) && x->type == -RAY_STR;
+    if (!is_path) return ray_error("type", ".os.size expects a string path");
+    char path[1024];
+    if (str_to_cpath(x, path, sizeof(path)) == NULL) return ray_error("type", NULL);
+
+    /* stat() follows symlinks, so a link is sized by its target. */
+    struct stat info;
+    if (stat(path, &info) != 0)
+        return ray_error("io", "%s: %s", path, strerror(errno));
+    return S_ISDIR(info.st_mode) ? ray_error("io", "%s: is a directory", path)
+                                 : ray_i64((int64_t)info.st_size);
+}
+
+/* qsort comparator over an array of `char*`: orders entries by strcmp()
+ * of the strings they point to.  readdir order is implementation-defined,
+ * so sorting gives callers stable, predictable output. */
+static int dir_entry_cmp(const void* a, const void* b) {
+    const char* const* lhs = (const char* const*)a;
+    const char* const* rhs = (const char* const*)b;
+    return strcmp(*lhs, *rhs);
+}
+
+/* Free the first `count` strings of `names`, then the array itself. */
+static void os_list_free_names(char** names, int64_t count) {
+    for (int64_t i = 0; i < count; i++) free(names[i]);
+    free(names);
+}
+
+/* (.os.list "path") → sym vec of entries, sorted, with `.` and `..`
+ * filtered out.  Errors with "io" if the path isn't a directory or
+ * doesn't exist — caller can use that as a file/dir discriminator
+ * via `try` when they don't want to shell out for the predicate. */
+ray_t* ray_os_list_fn(ray_t* x) {
+    if (!ray_is_atom(x) || x->type != -RAY_STR)
+        return ray_error("type", ".os.list expects a string path");
+    char path[1024];
+    if (!str_to_cpath(x, path, sizeof(path))) return ray_error("type", NULL);
+
+    DIR* d = opendir(path);
+    if (!d) return ray_error("io", "%s: %s", path, strerror(errno));
+
+    /* Collect names into a heap array whose capacity doubles, so big directories stay linear. */
+    char** names = NULL;
+    int64_t count = 0, cap = 0;
+    struct dirent* ent;
+    while ((ent = readdir(d)) != NULL) {
+        const char* nm = ent->d_name;
+        if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0'))) continue;
+        if (count >= cap) {
+            int64_t new_cap = cap == 0 ? 16 : cap * 2;
+            char** tmp = (char**)realloc(names, (size_t)new_cap * sizeof(char*));
+            if (!tmp) { closedir(d); os_list_free_names(names, count); return ray_error("oom", NULL); }
+            names = tmp;
+            cap = new_cap;
+        }
+        size_t nbytes = strlen(nm) + 1;  /* include the terminator */
+        names[count] = (char*)malloc(nbytes);
+        if (!names[count]) { closedir(d); os_list_free_names(names, count); return ray_error("oom", NULL); }
+        memcpy(names[count], nm, nbytes);
+        count++;
+    }
+    closedir(d);
+
+    /* An empty directory leaves `names` NULL, which qsort must not be given. */
+    if (count > 1) qsort(names, (size_t)count, sizeof(char*), dir_entry_cmp);
+
+    ray_t* result = ray_vec_new(RAY_SYM, count);
+    if (!result || RAY_IS_ERR(result)) {
+        os_list_free_names(names, count);
+        return result ? result : ray_error("oom", NULL);
+    }
+    result->len = count;
+    int64_t* out = (int64_t*)ray_data(result);
+    for (int64_t i = 0; i < count; i++) out[i] = ray_sym_intern(names[i], strlen(names[i]));
+    os_list_free_names(names, count);
+    return result;
+}
+
+/* xorshift64* — ~1ns per 64-bit word, vs rand()'s ~10ns for 1 byte.
+ * Per-thread state, lazily seeded from rand() mixed with the address of
+ * the thread-local state itself, so threads that draw the same rand()
+ * values still start from distinct seeds (rand() only varies across runs
+ * if srand() is called).  v4 UUID quality only requires the version and
+ * variant nibbles to be correct; the remaining 122 bits are pseudo-random
+ * and xorshift64* is more than sufficient. */
+static __thread uint64_t guid_rng_state = 0;
+
+static inline uint64_t guid_rng_next(void) {
+    uint64_t x = guid_rng_state;
+    if (RAY_UNLIKELY(x == 0)) {
+        /* rand() returns ≤ 31 bits, so spread three calls across the word. */
+        uint64_t a = (uint64_t)rand();
+        uint64_t b = (uint64_t)rand();
+        uint64_t c = (uint64_t)rand();
+        x = (a << 33) ^ (b << 17) ^ c ^ 0x9E3779B97F4A7C15ULL;
+        x ^= (uint64_t)(uintptr_t)&guid_rng_state;  /* thread-distinct component */
+        if (x == 0) x = 0x9E3779B97F4A7C15ULL;  /* zero is xorshift's fixed point */
+    }
+    x ^= x >> 12;
+    x ^= x << 25;
+    x ^= x >> 27;
+    guid_rng_state = x;
+    return x * 0x2545F4914F6CDD1DULL;
+}
+
+/* (guid n) -> generate n random GUIDs as GUID vector.
+ * v4 UUID format: 122 random bits + 4 version-bits (0100) + 2 variant-bits (10). */
+ray_t* ray_guid_fn(ray_t* n_arg) {
+    if (n_arg == NULL || !is_numeric(n_arg)) return ray_error("type", NULL);
+    int64_t total = as_i64(n_arg);
+    if (total < 0) return ray_error("domain", NULL);
+    ray_t* guids = ray_vec_new(RAY_GUID, total);
+    if (RAY_IS_ERR(guids)) return guids;
+    guids->len = total;
+    /* Each GUID is 16 bytes: two 64-bit draws, after which the version and
+     * variant bits are stamped over the random ones. */
+    uint8_t* slot = (uint8_t*)ray_data(guids);
+    for (int64_t k = 0; k < total; k++, slot += 16) {
+        uint64_t words[2];
+        words[0] = guid_rng_next();
+        words[1] = guid_rng_next();
+        memcpy(slot, words, sizeof(words));
+        slot[6] = (slot[6] & 0x0F) | 0x40;  /* version 4 */
+        slot[8] = (slot[8] & 0x3F) | 0x80;  /* variant 10 */
+    }
+    return guids;
+}
+
+/* ══════════════════════════════════════════
+ * Eval, parse, print, system, env builtins
+ * ══════════════════════════════════════════ */
+
+/* (eval expr) -- evaluate a parsed expression; delegates directly to ray_eval() */
+ray_t* ray_eval_builtin_fn(ray_t* x) {
+    return ray_eval(x);
+}
+
+/* (parse str) -- parse a string into an AST */
+ray_t* ray_parse_builtin_fn(ray_t* x) {
+    if (x->type != -RAY_STR) return ray_error("type", "parse expects a string");
+    const char* src = ray_str_ptr(x);
+    if (!src) return ray_error("domain", NULL);
+    ray_t* parsed = ray_parse(src);
+    return parsed ? parsed : ray_error("parse", NULL);
+}
+
+/* (print val) -- print without newline, return the value */
+/* print moved to builtins.c alongside println/show */
+
+/* (meta x) -- describe an object as a dict: `type` (type name as a symbol), plus, for non-atoms,
+ * `len` (dict length, table column count, or x->len for everything else). */
+ray_t* ray_meta_fn(ray_t* x) {
+    if (x == NULL) return ray_error("type", NULL);
+
+    const char* tname = ray_type_name(x->type);
+    int64_t type_key = ray_sym_intern("type", 4);
+    int64_t type_val = ray_sym_intern(tname, strlen(tname));
+    int has_len = !ray_is_atom(x);
+
+    /* Parallel containers: SYM vector of keys, LIST of values. */
+    ray_t* keys = ray_sym_vec_new(RAY_SYM_W64, has_len ? 2 : 1);
+    if (RAY_IS_ERR(keys)) return keys;
+    ray_t* vals = ray_list_new(has_len ? 2 : 1);
+    if (RAY_IS_ERR(vals)) { ray_release(keys); return vals; }
+
+    /* Entry 1: type. */
+    keys = ray_vec_append(keys, &type_key);
+    if (RAY_IS_ERR(keys)) { ray_release(vals); return keys; }
+    ray_t* type_atom = ray_sym(type_val);
+    vals = ray_list_append(vals, type_atom);
+    ray_release(type_atom);
+    if (RAY_IS_ERR(vals)) { ray_release(keys); return vals; }
+    if (!has_len) return ray_dict_new(keys, vals);
+
+    /* Entry 2: len. */
+    int64_t len_key = ray_sym_intern("len", 3);
+    keys = ray_vec_append(keys, &len_key);
+    if (RAY_IS_ERR(keys)) { ray_release(vals); return keys; }
+    int64_t extent = x->type == RAY_DICT  ? ray_dict_len(x)
+                   : x->type == RAY_TABLE ? ray_table_ncols(x)
+                                          : x->len;
+    ray_t* len_atom = make_i64(extent);
+    vals = ray_list_append(vals, len_atom);
+    ray_release(len_atom);
+    if (RAY_IS_ERR(vals)) { ray_release(keys); return vals; }
+    return ray_dict_new(keys, vals);
+}
+
+/* (.sys.gc) -- no-op garbage collection trigger: ignores its arguments and returns
+ * ray_i64(0).  Variadic so the call site can be (.sys.gc) without a dummy argument. */
+ray_t* ray_gc_fn(ray_t** args, int64_t n) { (void)args; (void)n; return ray_i64(0); }
+
+/* (system cmd) -- run `cmd` through system() and return its raw int result
+ * (on POSIX that is a wait status, not the bare exit code). */
+ray_t* ray_system_fn(ray_t* x) {
+    if (x->type != -RAY_STR) return ray_error("type", "system expects a string");
+    const char* command = ray_str_ptr(x);
+    if (command == NULL) return ray_error("domain", NULL);
+    return make_i64(system(command));
+}
+
+/* (getenv name) -- value of an environment variable; an unset variable yields the empty string */
+ray_t* ray_getenv_fn(ray_t* x) {
+    if (x->type != -RAY_STR) return ray_error("type", "getenv expects a string");
+    const char* key = ray_str_ptr(x);
+    if (key == NULL) return ray_error("domain", NULL);
+    const char* value = getenv(key);
+    return ray_str(value ? value : "", value ? strlen(value) : 0);
+}
+
+/* (setenv name val) -- set (overwrite) an environment variable; returns val retained, "io" on failure */
+#if !defined(RAY_OS_WINDOWS)
+extern int setenv(const char*, const char*, int);
+#endif
+ray_t* ray_setenv_fn(ray_t* name, ray_t* val) {
+    if (name->type != -RAY_STR || val->type != -RAY_STR) return ray_error("type", "setenv expects two strings");
+    const char* n = ray_str_ptr(name);
+    const char* v = ray_str_ptr(val);
+    if (!n || !v) return ray_error("domain", NULL);
+#if defined(RAY_OS_WINDOWS)
+    if (_putenv_s(n, v) != 0) return ray_error("io", "setenv failed");
+#else
+    if (setenv(n, v, 1) != 0) return ray_error("io", "setenv: %s", strerror(errno));
+#endif
+    ray_retain(val);  /* the result is an owned reference, like every other returned argument */
+    return val;
+}
+
+/* ══════════════════════════════════════════
+ * Quote, return, args, rc, diverse, get, remove,
+ * timer, env, internals, memstat, sysinfo
+ * ══════════════════════════════════════════ */
+
+/* (quote expr) -- special form, returns argument unevaluated: args[0] is handed back retained */
+ray_t* ray_quote_fn(ray_t** args, int64_t n) {
+    if (n < 1) return ray_error("domain", "quote expects 1 argument");
+    ray_retain(args[0]);
+    return args[0];
+}
+
+/* (return x) -- early return from function (identity in Rayfall): x comes back retained */
+ray_t* ray_return_fn(ray_t* x) {
+    ray_retain(x);
+    return x;
+}
+
+/* (args) -- command-line arguments as a list of strings.  CLI args are
+ * not wired into the eval context, so this always yields an empty list
+ * (or an "oom" error when ray_list_new() returns NULL). */
+ray_t* ray_args_fn(ray_t* x) {
+    (void)x;
+    ray_t* empty = ray_list_new(0);
+    return empty ? empty : ray_error("oom", NULL);
+}
+
+/* (rc x) -- return reference count of object */
+ray_t* ray_rc_fn(ray_t* x) {
+    if (!x || RAY_IS_ERR(x)) return make_i64(0);
+    return make_i64((int64_t)x->rc);
+}
+
+/* (diverse x) -- true when no element of x repeats.  Atoms and collections of fewer than
+ * two elements are trivially diverse; otherwise len(distinct x) is compared with len(x). */
+ray_t* ray_diverse_fn(ray_t* x) {
+    if (ray_is_atom(x)) return make_bool(1);
+    if (!is_collection(x)) return ray_error("type", "diverse expects a collection");
+
+    int64_t total = ray_len(x);
+    if (total < 2) return make_bool(1);
+    ray_t* uniq = ray_distinct_fn(x);
+    if (RAY_IS_ERR(uniq)) return uniq;
+    int64_t kept = ray_len(uniq);
+    ray_release(uniq);
+    return make_bool(kept == total ? 1 : 0);
+}
+
+/* (get dict key) -- dictionary/table lookup (alias for at); forwards both arguments to ray_at_fn() */
+ray_t* ray_get_fn(ray_t* dict, ray_t* key) {
+    return ray_at_fn(dict, key);
+}
+
+/* (remove dict key) -- dict without `key`, as produced by ray_dict_remove().  The input is
+ * retained first; presumably ray_dict_remove() consumes one reference -- TODO confirm. */
+ray_t* ray_remove_fn(ray_t* dict, ray_t* key) {
+    if (dict == NULL || dict->type != RAY_DICT) return ray_error("type", "remove expects a dict");
+    ray_retain(dict);
+    return ray_dict_remove(dict, key);
+}
+
+/* (timer) -- processor time consumed so far, from clock(), scaled to
+ * nanoseconds; meant for benchmarking deltas, not wall-clock timestamps. */
+ray_t* ray_timer_fn(ray_t* x) {
+    (void)x;
+    double seconds = (double)clock() / (double)CLOCKS_PER_SEC;
+    return make_i64((int64_t)(seconds * 1e9));
+}
+
+/* (env) -- dict of global bindings; at most 1024 entries are requested from ray_env_list() */
+ray_t* ray_env_fn(ray_t* x) {
+    (void)x;
+    enum { ENV_SNAPSHOT_MAX = 1024 };
+    int64_t ids[ENV_SNAPSHOT_MAX];
+    ray_t* bound[ENV_SNAPSHOT_MAX];
+    int32_t total = ray_env_list(ids, bound, ENV_SNAPSHOT_MAX);
+
+    ray_t* keys = ray_sym_vec_new(RAY_SYM_W64, total);
+    if (RAY_IS_ERR(keys)) return keys;
+    ray_t* vals = ray_list_new(total);
+    if (RAY_IS_ERR(vals)) { ray_release(keys); return vals; }
+    for (int32_t k = 0; k < total; k++) {
+        keys = ray_vec_append(keys, ids + k);
+        if (RAY_IS_ERR(keys)) { ray_release(vals); return keys; }
+        vals = ray_list_append(vals, bound[k]);
+        if (RAY_IS_ERR(vals)) { ray_release(keys); return vals; }
+    }
+    return ray_dict_new(keys, vals);
+}
+
+/* (.sys.build) -- dict of build info: `version` and `build-date` strings, "unknown" if not compiled in */
+ray_t* ray_internals_fn(ray_t** args, int64_t n) {
+    (void)args; (void)n;
+#ifdef RAYFORCE_VERSION
+    const char* version = RAYFORCE_VERSION;
+#else
+    const char* version = "unknown";
+#endif
+#ifdef RAYFORCE_BUILD_DATE
+    const char* build_date = RAYFORCE_BUILD_DATE;
+#else
+    const char* build_date = "unknown";
+#endif
+    struct { const char* key; size_t klen; const char* text; } rows[] = {
+        { "version", 7, version }, { "build-date", 10, build_date },
+    };
+    ray_t* keys = ray_sym_vec_new(RAY_SYM_W64, 2);
+    if (RAY_IS_ERR(keys)) return keys;
+    ray_t* vals = ray_list_new(2);
+    if (RAY_IS_ERR(vals)) { ray_release(keys); return vals; }
+    for (size_t i = 0; i < sizeof(rows)/sizeof(rows[0]); i++) {
+        int64_t s = ray_sym_intern(rows[i].key, rows[i].klen);
+        keys = ray_vec_append(keys, &s);
+        ray_t* v = ray_str(rows[i].text, strlen(rows[i].text));
+        vals = ray_list_append(vals, v); ray_release(v);
+    }
+    return ray_dict_new(keys, vals);
+}
+
+/* (.sys.mem) -- return dict with memory allocator statistics (five i64 counters from ray_mem_stats) */
+ray_t* ray_memstat_fn(ray_t** args, int64_t n) {
+    (void)args; (void)n;
+    ray_mem_stats_t st;
+    ray_mem_stats(&st);
+
+    ray_t* keys = ray_sym_vec_new(RAY_SYM_W64, 5);
+    if (RAY_IS_ERR(keys)) return keys;
+    ray_t* vals = ray_list_new(5);
+    if (RAY_IS_ERR(vals)) { ray_release(keys); return vals; }
+    struct { const char* name; size_t nlen; int64_t v; } rows[] = {
+        { "alloc-count",     11, (int64_t)st.alloc_count     },
+        { "bytes-allocated", 15, (int64_t)st.bytes_allocated },
+        { "peak-bytes",      10, (int64_t)st.peak_bytes      },
+        { "slab-hits",        9, (int64_t)st.slab_hits       },
+        { "sys-current",     11, (int64_t)st.sys_current     },
+    };
+    for (size_t i = 0; i < sizeof(rows)/sizeof(rows[0]); i++) {
+        int64_t s = ray_sym_intern(rows[i].name, rows[i].nlen);
+        keys = ray_vec_append(keys, &s);
+        if (RAY_IS_ERR(keys)) { ray_release(vals); return keys; }  /* never append onto an error */
+        ray_t* v = make_i64(rows[i].v);
+        vals = ray_list_append(vals, v); ray_release(v);
+        if (RAY_IS_ERR(vals)) { ray_release(keys); return vals; }
+    }
+    return ray_dict_new(keys, vals);
+}
+
+ray_t* ray_sysinfo_fn(ray_t** args, int64_t n) {
+    (void)args; (void)n;
+    /* Rows reported for this platform: name, name length, value.  Windows
+     * builds report only a fixed core count of 1.  sysconf() results are
+     * not checked for -1 before being reported. */
+#if !defined(RAY_OS_WINDOWS)
+    long psize = sysconf(_SC_PAGESIZE);
+#endif
+    struct { const char* name; size_t nlen; int64_t v; } rows[] = {
+#if !defined(RAY_OS_WINDOWS)
+        { "cores",     5, (int64_t)sysconf(_SC_NPROCESSORS_ONLN) },
+        { "page-size", 9, (int64_t)psize },
+        { "total-mem", 9, (int64_t)sysconf(_SC_PHYS_PAGES) * (int64_t)psize },
+#else
+        { "cores",     5, 1 },
+#endif
+    };
+    ray_t* keys = ray_sym_vec_new(RAY_SYM_W64, 3);
+    if (RAY_IS_ERR(keys)) return keys;
+    ray_t* vals = ray_list_new(3);
+    if (RAY_IS_ERR(vals)) { ray_release(keys); return vals; }
+
+    /* Append each (key, value) pair, stopping at the first failed append. */
+    for (size_t i = 0; i < sizeof(rows)/sizeof(rows[0]); i++) {
+        int64_t s = ray_sym_intern(rows[i].name, rows[i].nlen);
+        keys = ray_vec_append(keys, &s);
+        if (RAY_IS_ERR(keys)) { ray_release(vals); return keys; }
+        ray_t* v = make_i64(rows[i].v);
+        vals = ray_list_append(vals, v); ray_release(v);
+        if (RAY_IS_ERR(vals)) { ray_release(keys); return vals; }
+    }
+    return ray_dict_new(keys, vals);
+}
+
+/* ══════════════════════════════════════════
+ * IPC builtins
+ * ══════════════════════════════════════════ */
+
+/* (hopen "host:port[:user:password]") → i64 handle */
+ray_t* ray_hopen_fn(ray_t* x) {
+    if (!ray_is_atom(x) || x->type != -RAY_STR)
+        return ray_error("type", NULL);
+    const char* s = ray_str_ptr(x);
+    size_t slen = ray_str_len(x);
+    /* Split on colons into at most 4 fields; anything after the 4th is ignored */
+    const char* parts[4] = {0};
+    size_t part_lens[4] = {0};
+    int n_parts = 0;
+    const char* start = s;
+    for (size_t i = 0; i <= slen && n_parts < 4; i++) {
+        if (i == slen || s[i] == ':') {
+            parts[n_parts] = start;
+            part_lens[n_parts] = (size_t)(&s[i] - start);
+            n_parts++;
+            start = &s[i + 1];
+        }
+    }
+    if (n_parts < 2) return ray_error("domain", NULL);
+
+    char host[256];
+    if (part_lens[0] >= sizeof(host)) return ray_error("domain", NULL);
+    memcpy(host, parts[0], part_lens[0]);
+    host[part_lens[0]] = '\0';
+
+    /* Port: 1..7 decimal digits only.  atoi() would also accept leading
+     * whitespace, a sign, and trailing junk such as "80abc". */
+    if (part_lens[1] == 0 || part_lens[1] >= 8) return ray_error("domain", NULL);
+    int port = 0;
+    for (size_t i = 0; i < part_lens[1]; i++) {
+        char c = parts[1][i];
+        if (c < '0' || c > '9') return ray_error("domain", NULL);
+        port = port * 10 + (c - '0');   /* at most 7 digits, cannot overflow int */
+    }
+    if (port <= 0 || port > 65535) return ray_error("domain", NULL);
+
+    /* Credentials are passed only when both user and password fields are
+     * present; oversized fields are rejected instead of being sent as "". */
+    char user[128] = "";
+    char password[128] = "";
+    if (n_parts >= 4) {
+        if (part_lens[2] >= sizeof(user) || part_lens[3] >= sizeof(password))
+            return ray_error("domain", NULL);
+        memcpy(user, parts[2], part_lens[2]);
+        user[part_lens[2]] = '\0';
+        memcpy(password, parts[3], part_lens[3]);
+        password[part_lens[3]] = '\0';
+    }
+    const char* pw_ptr = (n_parts >= 4) ? password : NULL;
+    const char* us_ptr = (n_parts >= 4) ? user : NULL;
+
+    int64_t h = ray_ipc_connect(host, (uint16_t)port, us_ptr, pw_ptr);
+    if (h == -2) return ray_error("access", "server requires authentication");
+    if (h == -3) return ray_error("access", "authentication failed");
+    if (h < 0) return ray_error("io", "connection refused: %s:%d", host, port);
+    return make_i64(h);
+}
+
+/* (hclose handle) → null; the handle must be an i64 or i32 atom */
+ray_t* ray_hclose_fn(ray_t* x) {
+    int wide   = ray_is_atom(x) && x->type == -RAY_I64;
+    int narrow = ray_is_atom(x) && x->type == -RAY_I32;
+    if (!wide && !narrow) return ray_error("type", NULL);
+    ray_ipc_close(wide ? x->i64 : x->i32);
+    return RAY_NULL_OBJ;
+}
+
+/* (hsend handle msg) → result; the handle must be an i64 or i32 atom */
+ray_t* ray_hsend_fn(ray_t* handle, ray_t* msg) {
+    int wide   = ray_is_atom(handle) && handle->type == -RAY_I64;
+    int narrow = ray_is_atom(handle) && handle->type == -RAY_I32;
+    if (!wide && !narrow) return ray_error("type", NULL);
+    /* A non-positive serialized size means the message cannot be sent. */
+    if (ray_serde_size(msg) <= 0) return ray_error("type", "message not serializable");
+    int64_t conn = wide ? handle->i64 : handle->i32;
+    return ray_ipc_send(conn, msg);
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/tblop.c b/crates/rayforce-sys/vendor/rayforce/src/ops/tblop.c
new file mode 100644
index 0000000..688abde
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/tblop.c
@@ -0,0 +1,948 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+/*  Table builtins — extracted from eval.c  */
+
+#include "lang/internal.h"
+#include "lang/env.h"
+#include "ops/ops.h"
+#include "ops/internal.h"
+#include "ops/hash.h"
+#include "ops/idxop.h"
+#include "table/sym.h"
+#include "mem/heap.h"
+#include <stdio.h>
+#include <inttypes.h>
+
+/* ══════════════════════════════════════════
+ * pivot_fn_to_agg_op
+ * ══════════════════════════════════════════ */
+
+/* Translate a RAY_UNARY aggregation builtin into its DAG opcode.
+ * Any other object, or an unrecognised unary function, yields 0. */
+uint16_t pivot_fn_to_agg_op(ray_t* fn) {
+    if (fn->type != RAY_UNARY) return 0;
+    const struct { ray_unary_fn impl; uint16_t op; } known[] = {
+        { ray_sum_fn,   OP_SUM   }, { ray_avg_fn,   OP_AVG   }, { ray_min_fn, OP_MIN },
+        { ray_max_fn,   OP_MAX   }, { ray_count_fn, OP_COUNT },
+        { ray_first_fn, OP_FIRST }, { ray_last_fn,  OP_LAST  },
+    };
+    ray_unary_fn target = (ray_unary_fn)(uintptr_t)fn->i64;
+    for (size_t k = 0; k < sizeof(known)/sizeof(known[0]); k++)
+        if (known[k].impl == target) return known[k].op;
+    return 0;
+}
+
+/* ══════════════════════════════════════════
+ * pivot
+ * ══════════════════════════════════════════ */
+
+/* (pivot table index_col pivot_col value_col agg_fn) — pivot table */
+ray_t* ray_pivot_fn(ray_t** args, int64_t n) {
+    if (n != 5) return ray_error("arity", "pivot expects 5 arguments: table, index, pivot-col, value-col, agg-fn");
+    ray_t* tbl            = args[0];
+    ray_t* index_arg      = args[1];   /* sym atom or list of syms */
+    ray_t* pivot_col_name = args[2];   /* sym atom */
+    ray_t* value_col_name = args[3];   /* sym atom */
+    ray_t* agg_fn         = args[4];   /* function */
+    /* Validate argument types before touching any column data. */
+    if (tbl->type != RAY_TABLE)
+        return ray_error("type", "pivot: first argument must be a table");
+    if (pivot_col_name->type != -RAY_SYM)
+        return ray_error("type", "pivot: pivot-col must be a symbol");
+    if (value_col_name->type != -RAY_SYM)
+        return ray_error("type", "pivot: value-col must be a symbol");
+    if (agg_fn->type != RAY_UNARY && agg_fn->type != RAY_LAMBDA &&
+        agg_fn->type != RAY_VARY)
+        return ray_error("type", "pivot: agg-fn must be a function");
+
+    /* Determine index columns (at most 16: the fixed-size arrays below) */
+    int64_t idx_syms[16];
+    int64_t n_idx = 0;
+    if (index_arg->type == -RAY_SYM) {
+        idx_syms[0] = index_arg->i64;
+        n_idx = 1;
+    } else if (index_arg->type == RAY_LIST || ray_is_vec(index_arg)) {
+        int64_t len = ray_len(index_arg);
+        if (len > 16) return ray_error("limit", "pivot: too many index columns");
+        for (int64_t i = 0; i < len; i++) {
+            int alloc = 0;
+            ray_t* elem = collection_elem(index_arg, i, &alloc);
+            if (RAY_IS_ERR(elem)) return elem;
+            if (elem->type != -RAY_SYM) {
+                if (alloc) ray_release(elem);
+                return ray_error("type", "pivot: index columns must be symbols");
+            }
+            idx_syms[i] = elem->i64;
+            if (alloc) ray_release(elem);
+        }
+        n_idx = len;
+    } else {
+        return ray_error("type", "pivot: index must be a symbol or list of symbols");
+    }
+
+    /* Get pivot column, value column */
+    ray_t* pcol = ray_table_get_col(tbl, pivot_col_name->i64);
+    if (!pcol) return ray_error("domain", "pivot: pivot column not found");
+    ray_t* vcol = ray_table_get_col(tbl, value_col_name->i64);
+    if (!vcol) return ray_error("domain", "pivot: value column not found");
+
+    /* Get index columns */
+    ray_t* icols[16];
+    for (int64_t i = 0; i < n_idx; i++) {
+        icols[i] = ray_table_get_col(tbl, idx_syms[i]);
+        if (!icols[i]) return ray_error("domain", "pivot: index column not found");
+    }
+
+    int64_t nrows = ray_table_nrows(tbl);
+    if (nrows == 0) return ray_table_new(0);   /* empty input → empty table */
+
+    /* DAG fast path: known agg builtins on hashable columns → OP_PIVOT */
+    uint16_t agg_op = pivot_fn_to_agg_op(agg_fn);
+    bool dag_ok = (agg_op != 0 && pcol->type != RAY_STR && vcol->type != RAY_STR);
+    for (int64_t i = 0; i < n_idx && dag_ok; i++)
+        if (icols[i]->type == RAY_STR) dag_ok = false;
+
+    if (dag_ok) {
+        ray_graph_t* g = ray_graph_new(tbl);
+        if (!g) return ray_error("oom", NULL);
+        ray_op_t* idx_ops[16];
+        bool ok = true;
+        for (int64_t i = 0; i < n_idx && ok; i++) {
+            ray_t* s = ray_sym_str(idx_syms[i]);
+            idx_ops[i] = s ? ray_scan(g, ray_str_ptr(s)) : NULL;
+            if (!idx_ops[i]) ok = false;
+        }
+        ray_t* ps = ray_sym_str(pivot_col_name->i64);
+        ray_t* vs = ray_sym_str(value_col_name->i64);
+        ray_op_t* p_op = (ps && ok) ? ray_scan(g, ray_str_ptr(ps)) : NULL;
+        ray_op_t* v_op = (vs && p_op) ? ray_scan(g, ray_str_ptr(vs)) : NULL;
+        if (v_op) {
+            ray_op_t* root = ray_pivot_op(g, idx_ops, (uint8_t)n_idx, p_op, v_op, agg_op);
+            if (root) {
+                ray_t* result = ray_execute(g, root);
+                ray_graph_free(g);
+                return result;
+            }
+        }
+        ray_graph_free(g);   /* could not build the fast path: use the fallback below */
+    }
+
+    /* Generic fallback: use OP_GROUP DAG to group by (index_cols, pivot_col),
+     * then apply agg_fn per group and unstack.  Single O(n) hash pass. */
+
+    /* Build GROUP BY (idx0, ..., idxN-1, pivot_col) with COUNT agg via DAG */
+    ray_graph_t* g = ray_graph_new(tbl);
+    if (!g) return ray_error("oom", NULL);
+
+    uint8_t n_keys = (uint8_t)(n_idx + 1);
+    ray_op_t* key_ops[17];   /* up to 16 index keys plus the pivot key at [n_idx] */
+    bool ok = true;
+    for (int64_t i = 0; i < n_idx && ok; i++) {
+        ray_t* s = ray_sym_str(idx_syms[i]);
+        key_ops[i] = s ? ray_scan(g, ray_str_ptr(s)) : NULL;
+        if (!key_ops[i]) ok = false;
+    }
+    {
+        ray_t* ps = ray_sym_str(pivot_col_name->i64);
+        key_ops[n_idx] = (ps && ok) ? ray_scan(g, ray_str_ptr(ps)) : NULL;
+        if (!key_ops[n_idx]) ok = false;
+    }
+    /* Value column scan for COUNT (just need a column ref for group) */
+    ray_t* vs = ray_sym_str(value_col_name->i64);
+    ray_op_t* val_scan = (vs && ok) ? ray_scan(g, ray_str_ptr(vs)) : NULL;
+    if (!val_scan) { ray_graph_free(g); return ray_error("domain", "pivot: failed to build DAG"); }
+
+    uint16_t grp_agg_ops[1] = { OP_COUNT };
+    ray_op_t* grp_agg_ins[1] = { val_scan };
+    ray_op_t* grp_root = ray_group(g, key_ops, n_keys, grp_agg_ops, grp_agg_ins, 1);
+    if (!grp_root) { ray_graph_free(g); return ray_error("oom", NULL); }
+
+    ray_t* grouped = ray_execute(g, grp_root);
+    ray_graph_free(g);
+    if (!grouped || RAY_IS_ERR(grouped)) return grouped;
+
+    /* `grouped` is a table: (idx0, ..., idxN-1, pivot_col, _count).
+     * Each row is one (index, pivot) combination.
+     * Now for each group, gather the value column subset and apply agg_fn. */
+    int64_t n_grps = ray_table_nrows(grouped);
+
+    /* Get grouped columns */
+    ray_t* g_icols[16];
+    for (int64_t i = 0; i < n_idx; i++)
+        g_icols[i] = ray_table_get_col(grouped, idx_syms[i]);
+    ray_t* g_pcol = ray_table_get_col(grouped, pivot_col_name->i64);
+
+    /* Collect distinct pivot values and index keys from grouped table */
+    ray_retain(g_pcol);
+    ray_t* dvals = ray_distinct_fn(g_pcol);
+    ray_release(g_pcol);
+    if (RAY_IS_ERR(dvals)) { ray_release(grouped); return dvals; }
+    int64_t n_pv = ray_len(dvals);
+
+    /* Re-scan original table to assign a grouped-row index to each
+     * input row, using an open-addressed hash table keyed by a cheap
+     * row hash of (idx_cols..., pivot_col); this avoids an
+     * O(nrows * n_grps) nested loop.  Probes re-verify every candidate
+     * via atom_eq, so under-hashed cells (strings, guids) still match.
+     *
+     * Hash helper (GNU statement expression): F64 cells hash by value,
+     * strings and guids hash to a fixed constant, and every other cell
+     * type hashes the integer read by read_col_i64.
+     * NOTE(review): null cells mix the row id into the hash, so a null
+     * key in an input row will generally not land on its grouped row and
+     * the row gets gid -1 (skipped below) — confirm this is intended. */
+    #define FB_ROW_HASH(cols, ncols, pv, rid)                                \
+        ({                                                                    \
+            uint64_t _h = 0;                                                  \
+            for (int64_t _k = 0; _k < (ncols); _k++) {                        \
+                ray_t* _c = (cols)[_k];                                       \
+                uint64_t _kh;                                                 \
+                if (ray_vec_is_null(_c, (rid)))                               \
+                    _kh = 0x9E3779B97F4A7C15ULL ^ (uint64_t)(rid);            \
+                else if (_c->type == RAY_F64)                                 \
+                    _kh = ray_hash_f64(((double*)ray_data(_c))[(rid)]);       \
+                else if (_c->type == RAY_STR || _c->type == RAY_GUID)         \
+                    _kh = 0xDEADBEEFCAFEBABEULL;                              \
+                else                                                           \
+                    _kh = ray_hash_i64(read_col_i64(ray_data(_c), (rid),      \
+                                                    _c->type, _c->attrs));    \
+                _h = (_k == 0) ? _kh : ray_hash_combine(_h, _kh);             \
+            }                                                                  \
+            ray_t* _pc = (pv);                                                 \
+            uint64_t _ph;                                                      \
+            if (ray_vec_is_null(_pc, (rid)))                                  \
+                _ph = 0x165667B19E3779F9ULL ^ (uint64_t)(rid);                \
+            else if (_pc->type == RAY_F64)                                     \
+                _ph = ray_hash_f64(((double*)ray_data(_pc))[(rid)]);          \
+            else if (_pc->type == RAY_STR || _pc->type == RAY_GUID)            \
+                _ph = 0xFEEDFACE12345678ULL;                                   \
+            else                                                                \
+                _ph = ray_hash_i64(read_col_i64(ray_data(_pc), (rid),         \
+                                                 _pc->type, _pc->attrs));      \
+            ray_hash_combine(_h, _ph);                                         \
+        })
+
+    uint32_t gid_cap = 256;   /* power-of-two capacity, grown towards 2 * n_grps */
+    while (gid_cap < (uint32_t)n_grps * 2 && gid_cap < (1u << 30)) gid_cap <<= 1;
+    ray_t* gid_ht_hdr = ray_alloc((size_t)gid_cap * sizeof(uint32_t));
+    if (!gid_ht_hdr) { ray_release(dvals); ray_release(grouped); return ray_error("oom", NULL); }
+    uint32_t* gid_ht = (uint32_t*)ray_data(gid_ht_hdr);
+    memset(gid_ht, 0xFF, gid_cap * sizeof(uint32_t));   /* UINT32_MAX marks an empty slot */
+    uint32_t gid_mask = gid_cap - 1;
+
+    /* Insert each grouped row into the HT (grouped rows are already
+     * distinct by construction — no equality check needed on insert). */
+    for (int64_t gi = 0; gi < n_grps; gi++) {
+        uint64_t h = FB_ROW_HASH(g_icols, n_idx, g_pcol, gi);
+        uint32_t slot = (uint32_t)(h & gid_mask);
+        while (gid_ht[slot] != UINT32_MAX) slot = (slot + 1) & gid_mask;
+        gid_ht[slot] = (uint32_t)gi;
+    }
+
+    ray_t* gid_vec = ray_vec_new(RAY_I64, nrows);
+    if (!gid_vec || RAY_IS_ERR(gid_vec)) {
+        ray_free(gid_ht_hdr); ray_release(dvals); ray_release(grouped);
+        return ray_error("oom", NULL);
+    }
+    gid_vec->len = nrows;
+    int64_t* gids = (int64_t*)ray_data(gid_vec);
+
+    /* Probe HT for each input row; on collision fall through to atom_eq. */
+    for (int64_t r = 0; r < nrows; r++) {
+        uint64_t h = FB_ROW_HASH(icols, n_idx, pcol, r);
+        uint32_t slot = (uint32_t)(h & gid_mask);
+        int64_t found = -1;
+        while (gid_ht[slot] != UINT32_MAX) {
+            int64_t gi = gid_ht[slot];
+            bool match = true;
+            for (int64_t ci = 0; ci < n_idx && match; ci++) {
+                int a1 = 0, a2 = 0;
+                ray_t* v1 = collection_elem(icols[ci], r, &a1);
+                ray_t* v2 = collection_elem(g_icols[ci], gi, &a2);
+                if (!atom_eq(v1, v2)) match = false;
+                if (a1) ray_release(v1);
+                if (a2) ray_release(v2);
+            }
+            if (match) {
+                int a1 = 0, a2 = 0;
+                ray_t* v1 = collection_elem(pcol, r, &a1);
+                ray_t* v2 = collection_elem(g_pcol, gi, &a2);
+                if (!atom_eq(v1, v2)) match = false;
+                if (a1) ray_release(v1);
+                if (a2) ray_release(v2);
+            }
+            if (match) { found = gi; break; }
+            slot = (slot + 1) & gid_mask;
+        }
+        gids[r] = found;   /* -1 when no grouped row matched this input row */
+    }
+    ray_free(gid_ht_hdr);
+    #undef FB_ROW_HASH
+    /* For each group, gather the value column subset and apply agg_fn */
+    ray_t* agg_results = ray_alloc(n_grps * sizeof(ray_t*));
+    if (!agg_results) { ray_release(gid_vec); ray_release(dvals); ray_release(grouped); return ray_error("oom", NULL); }
+    agg_results->type = RAY_LIST;
+    agg_results->len = n_grps;
+    ray_t** ar = (ray_t**)ray_data(agg_results);
+
+    /* Counting-sort rows by gid: O(nrows + n_grps) vs the previous
+     * O(nrows * n_grps) double-scan per group. */
+    ray_t* off_hdr = ray_alloc((size_t)(n_grps + 1) * sizeof(int64_t));
+    if (!off_hdr) {
+        ray_free(agg_results); ray_release(gid_vec); ray_release(dvals); ray_release(grouped);
+        return ray_error("oom", NULL);
+    }
+    int64_t* offs = (int64_t*)ray_data(off_hdr);
+    memset(offs, 0, (size_t)(n_grps + 1) * sizeof(int64_t));
+    for (int64_t r = 0; r < nrows; r++) {
+        int64_t g = gids[r];
+        if (g >= 0) offs[g + 1]++;
+    }
+    for (int64_t gi = 0; gi < n_grps; gi++) offs[gi + 1] += offs[gi];   /* prefix sums */
+
+    ray_t* sorted_hdr = ray_alloc((size_t)nrows * sizeof(int64_t));
+    if (!sorted_hdr) {
+        ray_free(off_hdr);
+        ray_free(agg_results); ray_release(gid_vec); ray_release(dvals); ray_release(grouped);
+        return ray_error("oom", NULL);
+    }
+    int64_t* sorted = (int64_t*)ray_data(sorted_hdr);
+    /* Write-cursor array derived from offs. */
+    ray_t* wcur_hdr = ray_alloc((size_t)n_grps * sizeof(int64_t));
+    if (!wcur_hdr) {
+        ray_free(sorted_hdr); ray_free(off_hdr);
+        ray_free(agg_results); ray_release(gid_vec); ray_release(dvals); ray_release(grouped);
+        return ray_error("oom", NULL);
+    }
+    int64_t* wcur = (int64_t*)ray_data(wcur_hdr);
+    memcpy(wcur, offs, (size_t)n_grps * sizeof(int64_t));
+    for (int64_t r = 0; r < nrows; r++) {
+        int64_t g = gids[r];
+        if (g >= 0) sorted[wcur[g]++] = r;
+    }
+    ray_free(wcur_hdr);
+
+    for (int64_t gi = 0; gi < n_grps; gi++) {
+        int64_t cnt = offs[gi + 1] - offs[gi];
+        ray_t* subset = gather_by_idx(vcol, sorted + offs[gi], cnt);
+        if (RAY_IS_ERR(subset)) {
+            for (int64_t j = 0; j < gi; j++) ray_release(ar[j]);
+            ray_free(sorted_hdr); ray_free(off_hdr);
+            ray_free(agg_results); ray_release(gid_vec); ray_release(dvals); ray_release(grouped);
+            return subset;
+        }
+        ray_t* agg_val = call_fn1(agg_fn, subset);
+        ray_release(subset);
+        if (RAY_IS_ERR(agg_val)) {
+            for (int64_t j = 0; j < gi; j++) ray_release(ar[j]);
+            ray_free(sorted_hdr); ray_free(off_hdr);
+            ray_free(agg_results); ray_release(gid_vec); ray_release(dvals); ray_release(grouped);
+            return agg_val;
+        }
+        ar[gi] = agg_val;
+    }
+    ray_free(sorted_hdr);
+    ray_free(off_hdr);
+    ray_release(gid_vec);
+
+    /* Unstack: collect distinct index keys, build wide result.
+     * Map each group to (ix_idx, pv_idx). */
+    ray_t* ix_list = ray_list_new(16);   /* NOTE(review): result is not checked for error */
+    ray_t* gmap = ray_alloc(n_grps * 2 * sizeof(int64_t));   /* NOTE(review): not checked for NULL */
+    int64_t* gm_ix = (int64_t*)ray_data(gmap);
+    int64_t* gm_pv = gm_ix + n_grps;
+
+    for (int64_t gi = 0; gi < n_grps; gi++) {
+        /* Find pivot index */
+        int a1 = 0;
+        ray_t* pv = collection_elem(g_pcol, gi, &a1);
+        gm_pv[gi] = -1;
+        for (int64_t p = 0; p < n_pv; p++) {
+            int a2 = 0;
+            ray_t* dv = collection_elem(dvals, p, &a2);
+            bool eq = atom_eq(pv, dv);
+            if (a2) ray_release(dv);
+            if (eq) { gm_pv[gi] = p; break; }
+        }
+        if (a1) ray_release(pv);
+
+        /* Find or insert index key */
+        gm_ix[gi] = -1;
+        int64_t n_ix = ray_len(ix_list);
+        ray_t** ix_items = (ray_t**)ray_data(ix_list);
+        for (int64_t j = 0; j < n_ix; j++) {
+            ray_t** ex = (ray_t**)ray_data(ix_items[j]);
+            bool match = true;
+            for (int64_t ci = 0; ci < n_idx && match; ci++) {
+                int a2 = 0;
+                ray_t* v = collection_elem(g_icols[ci], gi, &a2);
+                if (!atom_eq(ex[ci], v)) match = false;
+                if (a2) ray_release(v);
+            }
+            if (match) { gm_ix[gi] = j; break; }
+        }
+        if (gm_ix[gi] < 0) {
+            gm_ix[gi] = ray_len(ix_list);
+            ray_t* tup = ray_list_new((int32_t)n_idx);
+            for (int64_t ci = 0; ci < n_idx; ci++) {
+                int a2 = 0;
+                ray_t* v = collection_elem(g_icols[ci], gi, &a2);
+                if (!a2) ray_retain(v);
+                tup = ray_list_append(tup, v);
+                ray_release(v);
+            }
+            ix_list = ray_list_append(ix_list, tup);
+            ray_release(tup);
+        }
+    }
+
+    int64_t n_ix = ray_len(ix_list);
+
+    /* Build result table */
+    ray_t* result = ray_table_new(n_idx + n_pv);
+    if (RAY_IS_ERR(result)) goto fb_cleanup;
+
+    /* Index columns */
+    { ray_t** ix_items = (ray_t**)ray_data(ix_list);
+    for (int64_t ci = 0; ci < n_idx; ci++) {
+        ray_t* col_vals = ray_list_new((int32_t)n_ix);
+        for (int64_t r = 0; r < n_ix; r++) {
+            ray_t* v = ((ray_t**)ray_data(ix_items[r]))[ci];
+            ray_retain(v);
+            col_vals = ray_list_append(col_vals, v);
+            ray_release(v);
+        }
+        ray_t* col_vec = list_to_typed_vec(col_vals, icols[ci]->type);
+        if (RAY_IS_ERR(col_vec)) { ray_release(result); result = col_vec; goto fb_cleanup; }
+        result = ray_table_add_col(result, idx_syms[ci], col_vec);
+        ray_release(col_vec);
+        if (RAY_IS_ERR(result)) goto fb_cleanup;
+    }
+    }
+
+    /* Value columns: one per distinct pivot value; cells with no group stay i64 0 */
+    for (int64_t p = 0; p < n_pv; p++) {
+        ray_t* col_vals = ray_list_new((int32_t)n_ix);
+        for (int64_t r = 0; r < n_ix; r++) {
+            ray_t* zero = ray_i64(0);
+            col_vals = ray_list_append(col_vals, zero);
+            ray_release(zero);
+        }
+
+        for (int64_t gi = 0; gi < n_grps; gi++) {
+            if (gm_pv[gi] != p) continue;
+            ray_t** cv = (ray_t**)ray_data(col_vals);
+            ray_release(cv[gm_ix[gi]]);
+            ray_retain(ar[gi]);
+            cv[gm_ix[gi]] = ar[gi];
+        }
+
+        int8_t agg_type = RAY_I64;   /* switched to F64 if any cell is an f64 atom */
+        { ray_t** cv = (ray_t**)ray_data(col_vals);
+          for (int64_t r = 0; r < n_ix; r++)
+            if (cv[r]->type == -RAY_F64) { agg_type = RAY_F64; break; }
+        }
+        ray_t* agg_vec = list_to_typed_vec(col_vals, agg_type);
+        if (RAY_IS_ERR(agg_vec)) { ray_release(result); result = agg_vec; goto fb_cleanup; }
+
+        /* Column name: PRId64 keeps the full 64-bit value where long is 32-bit */
+        int a1 = 0;
+        ray_t* pval = collection_elem(dvals, p, &a1);
+        int64_t col_sym;
+        if (pval->type == -RAY_SYM) {
+            col_sym = pval->i64;
+        } else if (pval->type == -RAY_I64) {
+            char buf[64]; int len = snprintf(buf, sizeof(buf), "%" PRId64, (int64_t)pval->i64);
+            col_sym = ray_sym_intern(buf, (size_t)len);
+        } else if (pval->type == -RAY_F64) {
+            double fv = pval->f64; if (fv == 0.0 && signbit(fv)) fv = 0.0;
+            char buf[64]; int len = snprintf(buf, sizeof(buf), "%g", fv);
+            col_sym = ray_sym_intern(buf, (size_t)len);
+        } else if (pval->type == -RAY_BOOL) {
+            col_sym = ray_sym_intern(pval->b8 ? "true" : "false", pval->b8 ? 4 : 5);
+        } else {
+            char buf[64]; int len = snprintf(buf, sizeof(buf), "col%" PRId64, (int64_t)pval->i64);
+            col_sym = ray_sym_intern(buf, (size_t)len);
+        }
+        if (a1) ray_release(pval);
+
+        result = ray_table_add_col(result, col_sym, agg_vec);
+        ray_release(agg_vec);
+        if (RAY_IS_ERR(result)) goto fb_cleanup;
+    }
+
+fb_cleanup:
+    ray_free(gmap);
+    ray_release(ix_list);
+    for (int64_t gi = 0; gi < n_grps; gi++) ray_release(ar[gi]);
+    ray_free(agg_results);
+    ray_release(dvals);
+    ray_release(grouped);
+    return result;
+}
+
+/* ══════════════════════════════════════════
+ * modify
+ * ══════════════════════════════════════════ */
+
+/* (modify tbl col_name fn) — apply fn to the named column, return new table */
+ray_t* ray_modify_fn(ray_t** args, int64_t n) {
+    if (n < 3) return ray_error("arity", "modify expects 3 arguments: table, column, function");
+    ray_t* source = args[0];
+    ray_t* which  = args[1];
+    ray_t* mapper = args[2];
+
+    if (source->type != RAY_TABLE)
+        return ray_error("type", "modify: first arg must be a table");
+    if (which->type != -RAY_SYM)
+        return ray_error("type", "modify: second arg must be a symbol");
+
+    const int64_t wanted = which->i64;
+    ray_t* old_col = ray_table_get_col(source, wanted);
+    if (!old_col) return ray_error("domain", "modify: column not found");
+
+    /* The function receives the whole column vector in a single call. */
+    ray_t* replaced = call_fn1(mapper, old_col);
+    if (RAY_IS_ERR(replaced)) return replaced;
+
+    /* Assemble the output column by column, swapping in the new vector. */
+    int64_t width = ray_table_ncols(source);
+    ray_t* out = ray_table_new(width);
+    if (RAY_IS_ERR(out)) { ray_release(replaced); return out; }
+
+    for (int64_t pos = 0; pos < width && !RAY_IS_ERR(out); pos++) {
+        int64_t label = ray_table_col_name(source, pos);
+        ray_t* vec = (label != wanted) ? ray_table_get_col_idx(source, pos) : replaced;
+        out = ray_table_add_col(out, label, vec);
+    }
+    /* Reached on success and on a failed add-col alike: drop our column ref. */
+    ray_release(replaced);
+    return out;
+}
+
+/* ══════════════════════════════════════════
+ * alter
+ * ══════════════════════════════════════════ */
+
+/* Shared failure exit for the ray_cow step of alter's `set` operation.
+ * Drops one reference on `original_var`, then on each of `idx`, `val`
+ * and `name_sym` that is non-NULL, and hands back an error object:
+ * `cow_result` itself when it is non-NULL, otherwise a fresh "oom"
+ * error (a NULL cow_result is not an error object, so one is made here).
+ *
+ * Exposed (non-static) so test code can pin the contract directly;
+ * `original_var` is released exactly once. */
+ray_t* ray_alter_set_cow_fail(ray_t* original_var, ray_t* cow_result,
+                              ray_t* idx, ray_t* val, ray_t* name_sym) {
+    ray_t* extras[3] = { idx, val, name_sym };
+    ray_release(original_var);
+    for (int k = 0; k < 3; k++)
+        if (extras[k]) ray_release(extras[k]);
+    if (cow_result) return cow_result;
+    return ray_error("oom", NULL);
+}
+
+/* (alter 'name op ...) — special form.  Evaluates args[0] to a symbol, applies
+ * op (set | concat | remove; read unevaluated from args[1]) to the variable
+ * it names, rebinds the symbol in the env and returns the new value. */
+ray_t* ray_alter_fn(ray_t** args, int64_t n) {
+    if (n < 3) return ray_error("domain", NULL);
+    /* First arg: evaluate to get the symbol */
+    ray_t* name_sym = ray_eval(args[0]);
+    if (!name_sym || RAY_IS_ERR(name_sym)) return name_sym ? name_sym : ray_error("type", NULL);
+    if (name_sym->type != -RAY_SYM) { ray_release(name_sym); return ray_error("type", NULL); }
+
+    /* Resolve the variable */
+    ray_t* var = ray_env_get(name_sym->i64);
+    if (!var) { ray_release(name_sym); return ray_error("name", NULL); }
+
+    /* Second arg: operation name (unevaluated, must be a name) */
+    ray_t* op = args[1];
+    if (!op || op->type != -RAY_SYM) { ray_release(name_sym); return ray_error("type", NULL); }
+    ray_t* op_name = ray_sym_str(op->i64);
+    if (!op_name) { ray_release(name_sym); return ray_error("domain", NULL); }
+    const char* oname = ray_str_ptr(op_name);
+    size_t olen = ray_str_len(op_name);
+
+    if (olen == 3 && memcmp(oname, "set", 3) == 0) {
+        /* (alter 'v set idx val) — idx can be scalar or vector of indices */
+        ray_release(op_name);
+        if (n < 4) { ray_release(name_sym); return ray_error("domain", NULL); }
+        ray_t* idx = ray_eval(args[2]);
+        if (!idx || RAY_IS_ERR(idx)) { ray_release(name_sym); return idx ? idx : ray_error("type", NULL); }
+        ray_t* val = ray_eval(args[3]);
+        if (!val || RAY_IS_ERR(val)) { ray_release(idx); ray_release(name_sym); return val ? val : ray_error("type", NULL); }
+        if (!ray_is_vec(var) && var->type != RAY_LIST) { ray_release(idx); ray_release(val); ray_release(name_sym); return ray_error("type", NULL); }
+
+        /* For LIST types, build a new list with replaced elements */
+        if (var->type == RAY_LIST) {
+            int64_t vlen = ray_len(var);
+            ray_t** elems = (ray_t**)ray_data(var);
+            ray_t* new_list = ray_alloc(vlen * sizeof(ray_t*));
+            if (!new_list) { ray_release(idx); ray_release(val); ray_release(name_sym); return ray_error("oom", NULL); }
+            new_list->type = RAY_LIST;
+            new_list->len = vlen;
+            ray_t** out = (ray_t**)ray_data(new_list);
+            for (int64_t i = 0; i < vlen; i++) { ray_retain(elems[i]); out[i] = elems[i]; }
+
+            if (ray_is_atom(idx) && is_numeric(idx)) {
+                int64_t i = as_i64(idx);
+                if (i >= 0 && i < vlen) { ray_release(out[i]); ray_retain(val); out[i] = val; }
+            } else if (ray_is_vec(idx)) {
+                int64_t nidx = idx->len;
+                for (int64_t k = 0; k < nidx; k++) {
+                    int alloc = 0;
+                    ray_t* ie = collection_elem(idx, k, &alloc);
+                    int64_t i = as_i64(ie);
+                    if (alloc) ray_release(ie);
+                    if (i >= 0 && i < vlen) { ray_release(out[i]); ray_retain(val); out[i] = val; }
+                }
+            }
+            ray_release(idx); ray_release(val);
+            ray_env_set(name_sym->i64, new_list);
+            ray_release(name_sym);
+            ray_retain(new_list);  /* NOTE(review): the vector set path below says ray_env_set already retains — extra ref here? confirm */
+            return new_list;
+        }
+
+        /* `var` came from ray_env_get as a BORROWED ref.  ray_cow's
+         * contract is "I take your owning ref; I give you back a ref"
+         * — so calling it on a borrow over-decrements the env's
+         * binding when the rc>1 copy path fires (releasing v drops
+         * env's count from N to N-1; if some other env binding also
+         * pointed at v, that binding now sees an extra under-retain
+         * and risks UAF when later replaced).
+         *
+         * Retain up-front so the ref we hand to ray_cow is genuinely
+         * ours.  Track the original pointer so the cow-OOM path
+         * (alloc_copy fails before ray_cow's release would have run)
+         * can still release the retain — without that, OOM leaks the
+         * extra ref. */
+        ray_t* original_var = var;
+        ray_retain(var);
+        ray_t* cow_result = ray_cow(var);
+        /* ray_cow returns NULL when ray_alloc_copy returned NULL (heap
+         * exhaustion past RAY_HEAP_MAX_ORDER) and a RAY_ERROR pointer
+         * when alloc_copy hit its own len-overflow guard.  Both leave
+         * the input ref untouched, so the cleanup helper releases
+         * `original_var` and either propagates the error pointer or
+         * synthesizes an "oom" RAY_ERROR for the NULL case (test code
+         * pins both branches). */
+        if (!cow_result || RAY_IS_ERR(cow_result)) {
+            return ray_alter_set_cow_fail(original_var, cow_result, idx, val, name_sym);
+        }
+        var = cow_result;
+
+        /* Validate idx shape + (for the atom case) bounds BEFORE we
+         * touch any state.  The accelerator-index drop below would
+         * otherwise outlive a failed write. */
+        bool idx_is_atom_num = ray_is_atom(idx) && is_numeric(idx);
+        bool idx_is_vec      = ray_is_vec(idx);
+        if (!idx_is_atom_num && !idx_is_vec) {
+            ray_release(var);
+            ray_release(idx); ray_release(val); ray_release(name_sym);
+            return ray_error("type", NULL);
+        }
+        if (idx_is_atom_num) {
+            int64_t i_check = as_i64(idx);
+            if (i_check < 0 || i_check >= var->len) {
+                ray_release(var);
+                ray_release(idx); ray_release(val); ray_release(name_sym);
+                return ray_error("index", NULL);
+            }
+        }
+
+        /* alter's set path writes via store_typed_elem, which bypasses
+         * ray_vec_set's mutation guard.  Now that we know the write
+         * will reach the data array, drop any attached accelerator
+         * index so it can't outlive the mutation. */
+        if (var->attrs & RAY_ATTR_HAS_INDEX) {
+            ray_t* drop_r = ray_index_drop(&var);
+            if (RAY_IS_ERR(drop_r)) {
+                ray_release(var);
+                ray_release(idx); ray_release(val); ray_release(name_sym);
+                return drop_r;
+            }
+        }
+
+        if (idx_is_atom_num) {
+            /* Single index — bounds already validated above. */
+            int64_t i = as_i64(idx);
+            ray_release(idx);
+            store_typed_elem(var, i, val);
+        } else {
+            /* Vector of indices — set each to val.
+             * If val is a vector of same length, set pairwise.
+             * If val is scalar or shorter, broadcast. */
+            int64_t nidx = idx->len;
+            int val_is_vec = ray_is_vec(val) && val->len == nidx;
+            for (int64_t k = 0; k < nidx; k++) {
+                int alloc = 0;
+                ray_t* ie = collection_elem(idx, k, &alloc);
+                int64_t i = as_i64(ie);
+                if (alloc) ray_release(ie);
+                if (i < 0 || i >= var->len) continue;
+                if (val_is_vec) {
+                    int va = 0;
+                    ray_t* ve = collection_elem(val, k, &va);
+                    store_typed_elem(var, i, ve);
+                    if (va) ray_release(ve);
+                } else {
+                    store_typed_elem(var, i, val);
+                }
+            }
+            ray_release(idx);
+        }
+        ray_release(val);
+        ray_env_set(name_sym->i64, var);
+        ray_release(name_sym);
+        /* The retain-first at the top of the set path gave us an owning
+         * ref to var.  ray_env_set already retained for the env binding;
+         * transferring our existing ref to the caller via return is
+         * correct.  No additional ray_retain here. */
+        return var;
+    }
+    if (olen == 6 && memcmp(oname, "concat", 6) == 0) {
+        /* (alter 'v concat val) */
+        ray_release(op_name);
+        ray_t* val = ray_eval(args[2]);
+        if (!val || RAY_IS_ERR(val)) { ray_release(name_sym); return val ? val : ray_error("type", NULL); }
+        ray_t* new_vec = ray_concat_fn(var, val);
+        ray_release(val);
+        if (RAY_IS_ERR(new_vec)) { ray_release(name_sym); return new_vec; }
+        ray_env_set(name_sym->i64, new_vec);
+        ray_release(name_sym);
+        ray_retain(new_vec);  /* NOTE(review): the set path above says ray_env_set already retains — extra ref here? confirm */
+        return new_vec;
+    }
+    if (olen == 6 && memcmp(oname, "remove", 6) == 0) {
+        /* (alter 'v remove idx) — remove element(s) at index/indices */
+        ray_release(op_name);
+        ray_t* idx = ray_eval(args[2]);
+        if (!idx || RAY_IS_ERR(idx)) { ray_release(name_sym); return idx ? idx : ray_error("type", NULL); }
+
+        if (!var || var->type != RAY_LIST) {
+            ray_release(idx); ray_release(name_sym);
+            return ray_error("type", NULL);
+        }
+
+        int64_t vlen = ray_len(var);
+        ray_t** elems = (ray_t**)ray_data(var);
+
+        /* Build a set of indices to remove */
+        int64_t remove_idx[256];
+        int64_t nremove = 0;
+        if (ray_is_atom(idx) && is_numeric(idx)) {
+            remove_idx[0] = as_i64(idx);
+            nremove = 1;
+        } else if (ray_is_vec(idx)) {
+            nremove = idx->len;
+            if (nremove > 256) { ray_release(idx); ray_release(name_sym); return ray_error("limit", NULL); }
+            for (int64_t i = 0; i < nremove; i++) {
+                int alloc = 0;
+                ray_t* e = collection_elem(idx, i, &alloc);
+                remove_idx[i] = as_i64(e);
+                if (alloc) ray_release(e);
+            }
+        } else {
+            ray_release(idx); ray_release(name_sym);
+            return ray_error("type", NULL);
+        }
+        ray_release(idx);
+
+        /* Build new list without the removed indices.  The buffer is sized
+         * for vlen, an upper bound on the survivors: deriving the size from
+         * the count of in-range idx entries undercounts when a position
+         * repeats (e.g. 0 0), and the copy loop then overran the buffer. */
+        ray_t* new_list = ray_alloc(vlen * sizeof(ray_t*));
+        if (!new_list) { ray_release(name_sym); return ray_error("oom", NULL); }
+        new_list->type = RAY_LIST;
+        new_list->len = 0;
+        ray_t** out = (ray_t**)ray_data(new_list);
+        int64_t j = 0;
+        for (int64_t i = 0; i < vlen; i++) {
+            int skip = 0;
+            for (int64_t k = 0; k < nremove; k++)
+                if (remove_idx[k] == i) { skip = 1; break; }
+            if (!skip) {
+                ray_retain(elems[i]);
+                out[j++] = elems[i];
+            }
+        }
+        new_list->len = j;
+        ray_env_set(name_sym->i64, new_list);
+        ray_release(name_sym);
+        ray_retain(new_list);  /* NOTE(review): the set path above says ray_env_set already retains — extra ref here? confirm */
+        return new_list;
+    }
+    ray_release(op_name);
+    ray_release(name_sym);
+    return ray_error("domain", NULL);
+}
+
+/* ══════════════════════════════════════════
+ * del
+ * ══════════════════════════════════════════ */
+
+/* (del name) — delete variable from environment (special form, unevaluated
+ * arg).  Returns ray_i64(0) on success, a RAY_ERROR otherwise. */
+ray_t* ray_del_fn(ray_t** args, int64_t n) {
+    if (n < 1) return ray_error("arity", "del expects 1 argument");
+    ray_t* name = args[0];
+    if (!name || name->type != -RAY_SYM)
+        return ray_error("type", "del expects a symbol");
+    /* Propagate ray_env_set's failure: ignoring it would let `(del .sys.gc)`
+     * appear to succeed while leaving the builtin intact.  Report per error
+     * code rather than blaming every failure on the reserved-namespace guard
+     * (OOM on dotted-path upsert, for example, is not a reserve error). */
+    ray_err_t err = ray_env_set(name->i64, NULL);
+    if (err == RAY_OK) return ray_i64(0);
+    /* Other ray_sym_str callers treat NULL as possible; never pass it to "%s". */
+    ray_t* nm_str = ray_sym_str(name->i64);
+    const char* nm = nm_str ? ray_str_ptr(nm_str) : NULL;
+    if (!nm) nm = "?";
+    if (err == RAY_ERR_RESERVED)
+        return ray_error("reserve", "cannot delete reserved binding '%s'", nm);
+    return ray_error(ray_err_code_str(err), "del '%s' failed", nm);
+}
+
+/* ══════════════════════════════════════════
+ * row
+ * ══════════════════════════════════════════ */
+
+/* (row table idx) — single row of a table (a dict), via ray_at_fn's table indexing */
+ray_t* ray_row_fn(ray_t* tbl, ray_t* idx) {
+    if (tbl->type != RAY_TABLE)
+        return ray_error("type", "row expects a table");
+    return is_numeric(idx) ? ray_at_fn(tbl, idx)
+                           : ray_error("type", "row index must be integer");
+}
+
+/* ══════════════════════════════════════════
+ * union-all
+ * ══════════════════════════════════════════ */
+
+/* (union-all t1 t2) — stack the rows of t2 beneath those of t1.  Both args
+ * must be tables with the same column count and the same column names in
+ * the same order; each output column is ray_vec_concat of the input pair. */
+ray_t* ray_union_all_fn(ray_t* t1, ray_t* t2) {
+    if (t1->type != RAY_TABLE)
+        return ray_error("type", "union-all: first arg must be a table");
+    if (t2->type != RAY_TABLE)
+        return ray_error("type", "union-all: second arg must be a table");
+
+    const int64_t width = ray_table_ncols(t1);
+    if (width != ray_table_ncols(t2))
+        return ray_error("type", "union-all: tables must have same number of columns");
+
+    /* Schema check first, so nothing is allocated for mismatched inputs. */
+    int64_t col = 0;
+    while (col < width) {
+        if (ray_table_col_name(t1, col) != ray_table_col_name(t2, col))
+            return ray_error("type", "union-all: column names must match");
+        col++;
+    }
+
+    ray_t* out = ray_table_new(width);
+    if (!out || RAY_IS_ERR(out)) return out;
+
+    for (col = 0; col < width; col++) {
+        ray_t* top    = ray_table_get_col_idx(t1, col);
+        ray_t* bottom = ray_table_get_col_idx(t2, col);
+        if (!top || !bottom) {
+            ray_release(out);
+            return ray_error("type", "union-all: missing column");
+        }
+        ray_t* joined = ray_vec_concat(top, bottom);
+        if (!joined || RAY_IS_ERR(joined)) {
+            ray_release(out);
+            return joined ? joined : ray_error("oom", NULL);
+        }
+        /* ray_table_add_col's return value replaces our table handle. */
+        out = ray_table_add_col(out, ray_table_col_name(t1, col), joined);
+        ray_release(joined);
+        if (!out || RAY_IS_ERR(out)) return out;
+    }
+    return out;
+}
+
+/* ══════════════════════════════════════════
+ * table-distinct
+ * ══════════════════════════════════════════ */
+
+/* (table-distinct t) — remove duplicate rows via DAG group-by on all columns.
+ * ray_distinct takes the key count as uint8_t, so at most 255 columns. */
+ray_t* ray_table_distinct_fn(ray_t* tbl) {
+    if (tbl->type != RAY_TABLE)
+        return ray_error("type", "table-distinct expects a table");
+
+    int64_t ncols = ray_table_ncols(tbl);
+    if (ncols == 0) { ray_retain(tbl); return tbl; }
+    /* Checked before any allocation; 256 would wrap to 0 in (uint8_t)ncols. */
+    if (ncols > 255) return ray_error("range", "too many columns");
+
+    ray_graph_t* g = ray_graph_new(tbl);
+    if (!g) return ray_error("oom", NULL);
+    ray_op_t* keys[256];
+
+    for (int64_t c = 0; c < ncols; c++) {
+        int64_t name_id = ray_table_col_name(tbl, c);
+        ray_t* name_str = ray_sym_str(name_id);
+        if (!name_str) { ray_graph_free(g); return ray_error("type", "bad column name"); }
+        keys[c] = ray_scan(g, ray_str_ptr(name_str));
+        if (!keys[c]) { ray_graph_free(g); return ray_error("oom", NULL); }
+    }
+
+    ray_op_t* root = ray_distinct(g, keys, (uint8_t)ncols);
+    if (!root) { ray_graph_free(g); return ray_error("oom", NULL); }
+
+    ray_t* result = ray_execute(g, root);
+    ray_graph_free(g);
+    return result;
+}
+
+/* ══════════════════════════════════════════
+ * unify
+ * ══════════════════════════════════════════ */
+
+/* (unify a b) — return a 2-element list holding a and b.  Promotion to a
+ * common type is not implemented: same-type, atom and mixed-vector inputs
+ * are all wrapped without conversion, so there is a single code path. */
+ray_t* ray_unify_fn(ray_t* a, ray_t* b) {
+    ray_t* result = ray_list_new(2);
+    if (!result) return ray_error("oom", NULL);
+    if (RAY_IS_ERR(result)) return result;
+
+    /* Keep both operands alive across the appends; each temporary ref is
+     * dropped again right after its append. */
+    ray_retain(a); ray_retain(b);
+    result = ray_list_append(result, a); ray_release(a);
+    if (!result || RAY_IS_ERR(result)) {
+        /* A failed first append must not be fed into the second one. */
+        ray_release(b);
+        return result;
+    }
+
+    result = ray_list_append(result, b); ray_release(b);
+    return result;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/temporal.c b/crates/rayforce-sys/vendor/rayforce/src/ops/temporal.c
new file mode 100644
index 0000000..9f6065a
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/temporal.c
@@ -0,0 +1,665 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "ops/internal.h"
+#include "lang/internal.h"
+#include "ops/temporal.h"
+#include <time.h>
+
+/* ============================================================================
+ * ray_temporal_extract — standalone extract, usable outside the DAG.
+ *
+ * Mirrors exec_extract's scalar decomposition kernel but takes a ray_t*
+ * input directly.  Vector input → RAY_I64 vector; atom input → RAY_I64
+ * atom.  Returned ref is caller-owned.  Called from the env dotted-path
+ * resolver so `date.dd` / `ts.hh` etc. work at runtime without building
+ * a DAG.
+ * ============================================================================ */
+
+#define RTE_USEC_PER_SEC  1000000LL                    /* µs in one second */
+#define RTE_USEC_PER_MIN  (60LL  * RTE_USEC_PER_SEC)   /* µs in one minute */
+#define RTE_USEC_PER_HOUR (3600LL * RTE_USEC_PER_SEC)  /* µs in one hour */
+#define RTE_USEC_PER_DAY  (86400LL * RTE_USEC_PER_SEC) /* µs in one 86400-second day */
+
+/* Decompose one 'microseconds since 2000-01-01' instant into the requested
+ * RAY_EXTRACT_* field.  Unrecognised field codes yield 0. */
+static int64_t rte_extract_one(int64_t us, int field) {
+    if (field == RAY_EXTRACT_EPOCH) return us;
+
+    if (field == RAY_EXTRACT_HOUR || field == RAY_EXTRACT_MINUTE ||
+        field == RAY_EXTRACT_SECOND) {
+        /* Offset into the day, wrapped non-negative for pre-2000 instants. */
+        int64_t tod = us % RTE_USEC_PER_DAY;
+        if (tod < 0) tod += RTE_USEC_PER_DAY;
+        if (field == RAY_EXTRACT_HOUR)
+            return tod / RTE_USEC_PER_HOUR;
+        if (field == RAY_EXTRACT_MINUTE)
+            return (tod % RTE_USEC_PER_HOUR) / RTE_USEC_PER_MIN;
+        return (tod % RTE_USEC_PER_MIN) / RTE_USEC_PER_SEC;
+    }
+
+    /* Floor-divide to whole days (C division truncates toward zero). */
+    int64_t day_no = us / RTE_USEC_PER_DAY;
+    if (us % RTE_USEC_PER_DAY < 0) day_no--;
+
+    /* Hinnant civil_from_days; 10957 + 719468 rebases the 2000-01-01 day
+     * count onto the algorithm's 0000-03-01 origin. */
+    const int64_t shifted = day_no + 10957 + 719468;
+    const int64_t era = (shifted >= 0 ? shifted : shifted - 146096) / 146097;
+    const uint64_t day_of_era = (uint64_t)(shifted - era * 146097);
+    const uint64_t year_of_era =
+        (day_of_era - day_of_era/1460 + day_of_era/36524 - day_of_era/146096) / 365;
+    const uint64_t doy_mar = day_of_era - (365*year_of_era + year_of_era/4 - year_of_era/100);
+    const uint64_t mp = (5*doy_mar + 2) / 153;
+    const uint64_t dom = doy_mar - (153*mp + 2) / 5 + 1;
+    const uint64_t month = mp < 10 ? mp + 3 : mp - 9;
+    /* March-based years: January and February belong to the next civil year. */
+    const int64_t year = (int64_t)year_of_era + era * 400 + (month <= 2);
+
+    if (field == RAY_EXTRACT_YEAR)  return year;
+    if (field == RAY_EXTRACT_MONTH) return (int64_t)month;
+    if (field == RAY_EXTRACT_DAY)   return (int64_t)dom;
+    if (field == RAY_EXTRACT_DOW)   return ((day_no % 7) + 7 + 5) % 7 + 1;
+    if (field != RAY_EXTRACT_DOY)   return 0;
+
+    /* Days preceding each month in a non-leap year; index 0 is unused. */
+    static const int dbm[13] = {
+        0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334
+    };
+    if (month < 1 || month > 12) return 0;
+    int64_t doy_jan = dbm[month] + (int64_t)dom;
+    int leap = (year % 4 == 0 && (year % 100 != 0 || year % 400 == 0));
+    if (leap && month > 2) doy_jan++;
+    return doy_jan;
+}
+
+/* Convert a raw slot value of a temporal type into microseconds-since-2000,
+ * the unit rte_extract_one's Hinnant math works in.  DATE is int32 days,
+ * TIME is int32 ms, TIMESTAMP is int64 *nanoseconds* (matching io/csv.c's
+ * parse and the rest of the runtime).  An earlier version treated TIMESTAMP
+ * as µs, so (yyyy ts) decoded to absurd years (26204 on 2024-03-15) — a
+ * 1000× unit mismatch. */
+static inline int64_t rte_to_us(int8_t type, int64_t raw) {
+    if (type == RAY_DATE || type == -RAY_DATE) return raw * RTE_USEC_PER_DAY;
+    if (type == RAY_TIME || type == -RAY_TIME) return raw * 1000LL;
+    /* RAY_TIMESTAMP / -RAY_TIMESTAMP: ns → µs, floored toward -inf.  Built
+     * from / and % so no `-raw` is formed (that overflowed near INT64_MIN). */
+    const int64_t q = raw / 1000LL;
+    return (raw % 1000LL < 0) ? q - 1 : q;
+}
+
+/* µs → raw TIMESTAMP slot value (ns); the inverse of rte_to_us's TIMESTAMP branch for whole µs. */
+static inline int64_t rte_us_to_ts_raw(int64_t us) { return 1000LL * us; }
+
+/* Standalone field extraction (no DAG).  A DATE/TIME/TIMESTAMP atom yields a
+ * RAY_I64 atom, a vector yields a RAY_I64 vector of the same length; nulls
+ * stay null.  NULL or error inputs are returned unchanged; any other type
+ * gives a "type" error. */
+ray_t* ray_temporal_extract(ray_t* input, int field) {
+    if (!input || RAY_IS_ERR(input)) return input;
+
+    const int8_t t = input->type;
+
+    /* Atom input.  A null atom maps to a typed null (0Nl) instead of a
+     * garbage year/month/etc. decoded from the null-sentinel bit pattern,
+     * which would be deeply confusing in downstream arithmetic. */
+    if (t < 0) {
+        bool temporal_atom = (t == -RAY_DATE || t == -RAY_TIME || t == -RAY_TIMESTAMP);
+        if (!temporal_atom) return ray_error("type", NULL);
+        if (RAY_ATOM_IS_NULL(input)) return ray_typed_null(-RAY_I64);
+        int64_t atom_us = rte_to_us(t, input->i64);
+        return ray_i64(rte_extract_one(atom_us, field));
+    }
+
+    /* Vector input. */
+    bool temporal_vec = (t == RAY_DATE || t == RAY_TIME || t == RAY_TIMESTAMP);
+    if (!temporal_vec) return ray_error("type", NULL);
+
+    const int64_t count = input->len;
+    ray_t* result = ray_vec_new(RAY_I64, count);
+    if (!result || RAY_IS_ERR(result)) return result;
+    result->len = count;
+    int64_t* dst = (int64_t*)ray_data(result);
+
+    /* Slices don't carry HAS_NULLS themselves, so consult the parent's
+     * attrs when the input is a slice. */
+    bool maybe_null = (input->attrs & RAY_ATTR_HAS_NULLS) != 0;
+    if (!maybe_null && (input->attrs & RAY_ATTR_SLICE) && input->slice_parent)
+        maybe_null = (input->slice_parent->attrs & RAY_ATTR_HAS_NULLS) != 0;
+
+    /* DATE and TIME are read as int32 slots, TIMESTAMP as int64. */
+    const char* base = (const char*)ray_data(input);
+    const bool wide = (t == RAY_TIMESTAMP);
+
+    for (int64_t row = 0; row < count; row++) {
+        if (maybe_null && ray_vec_is_null(input, row)) {
+            /* Null rows: zero the slot and flag it so downstream ops see
+             * 0Nl rather than a decoded sentinel. */
+            dst[row] = 0;
+            ray_vec_set_null(result, row, true);
+        } else {
+            int64_t raw = wide ? ((const int64_t*)base)[row]
+                               : (int64_t)((const int32_t*)base)[row];
+            dst[row] = rte_extract_one(rte_to_us(t, raw), field);
+        }
+    }
+    return result;
+}
+
+/* Sym name → RAY_EXTRACT_* field code, or -1 when the sym has no string or
+ * the name is not one of the recognised segments (yyyy mm dd hh minute ss
+ * dow doy).  Used by the env dotted-path resolver so `date_col.dd` works
+ * without a DAG. */
+int ray_temporal_field_from_sym(int64_t sym_id) {
+    ray_t* sym_text = ray_sym_str(sym_id);
+    if (!sym_text) return -1;
+    const char* chars = ray_str_ptr(sym_text);
+    size_t count = ray_str_len(sym_text);
+    if (!chars) return -1;
+
+    const struct { const char* name; size_t len; int field; } segs[] = {
+        { "yyyy",   4, RAY_EXTRACT_YEAR },   { "mm",  2, RAY_EXTRACT_MONTH },
+        { "dd",     2, RAY_EXTRACT_DAY },    { "hh",  2, RAY_EXTRACT_HOUR },
+        { "minute", 6, RAY_EXTRACT_MINUTE }, { "ss",  2, RAY_EXTRACT_SECOND },
+        { "dow",    3, RAY_EXTRACT_DOW },    { "doy", 3, RAY_EXTRACT_DOY },
+    };
+    for (size_t i = 0; i < sizeof segs / sizeof segs[0]; i++)
+        if (count == segs[i].len && memcmp(chars, segs[i].name, count) == 0) return segs[i].field;
+    return -1;
+}
+
+/* Eval-level unary builtins: one thin wrapper per field, each forwarding to
+ * ray_temporal_extract with the field bound.  That makes `(ss ts)`,
+ * `(yyyy d)`, etc. ordinary unary calls, and lets `ts.ss` / `d.yyyy`
+ * resolve through env_resolve's standard container-then-callable dispatch.
+ * Grouped: calendar fields, clock fields, then day-of-week / day-of-year. */
+ray_t* ray_extract_yyyy_fn(ray_t* x)   { return ray_temporal_extract(x, RAY_EXTRACT_YEAR); }
+ray_t* ray_extract_mm_fn(ray_t* x)     { return ray_temporal_extract(x, RAY_EXTRACT_MONTH); }
+ray_t* ray_extract_dd_fn(ray_t* x)     { return ray_temporal_extract(x, RAY_EXTRACT_DAY); }
+ray_t* ray_extract_hh_fn(ray_t* x)     { return ray_temporal_extract(x, RAY_EXTRACT_HOUR); }
+ray_t* ray_extract_minute_fn(ray_t* x) { return ray_temporal_extract(x, RAY_EXTRACT_MINUTE); }
+ray_t* ray_extract_ss_fn(ray_t* x)     { return ray_temporal_extract(x, RAY_EXTRACT_SECOND); }
+ray_t* ray_extract_dow_fn(ray_t* x)    { return ray_temporal_extract(x, RAY_EXTRACT_DOW); }
+ray_t* ray_extract_doy_fn(ray_t* x)    { return ray_temporal_extract(x, RAY_EXTRACT_DOY); }
+
+/* Sym name → truncation kind: "date" → RAY_EXTRACT_DAY, "time" →
+ * RAY_EXTRACT_SECOND; -1 for anything else or an unresolvable sym. */
+int ray_temporal_trunc_from_sym(int64_t sym_id) {
+    ray_t* name = ray_sym_str(sym_id);
+    const char* text = name ? ray_str_ptr(name) : NULL;
+    if (!text || ray_str_len(name) != 4) return -1;
+    if (memcmp(text, "date", 4) == 0) return RAY_EXTRACT_DAY;
+    if (memcmp(text, "time", 4) == 0) return RAY_EXTRACT_SECOND;
+    return -1;
+}
+
+/* Floor a DATE/TIME/TIMESTAMP atom or vector to a bucket boundary and return
+ * it as RAY_TIMESTAMP; nulls stay null.  kind == RAY_EXTRACT_DAY selects whole
+ * days, any other kind whole seconds.  NULL/error inputs pass through. */
+ray_t* ray_temporal_truncate(ray_t* input, int kind) {
+    if (!input || RAY_IS_ERR(input)) return input;
+
+    const int8_t t = input->type;
+    const int64_t bucket = (kind == RAY_EXTRACT_DAY) ? RTE_USEC_PER_DAY : RTE_USEC_PER_SEC;
+
+    /* Atom input — produce a RAY_TIMESTAMP atom.  Null input → 0Np. */
+    if (t < 0) {
+        if (t != -RAY_DATE && t != -RAY_TIME && t != -RAY_TIMESTAMP)
+            return ray_error("type", NULL);
+        if (RAY_ATOM_IS_NULL(input)) return ray_typed_null(-RAY_TIMESTAMP);
+        int64_t us = rte_to_us(t, input->i64);
+        int64_t rem = us % bucket;
+        /* % keeps the dividend's sign; wrap so us - rem floors, not truncates. */
+        if (rem < 0) rem += bucket;
+        return ray_timestamp(rte_us_to_ts_raw(us - rem));
+    }
+
+    /* Vector input. */
+    if (t != RAY_DATE && t != RAY_TIME && t != RAY_TIMESTAMP)
+        return ray_error("type", NULL);
+
+    const int64_t count = input->len;
+    ray_t* result = ray_vec_new(RAY_TIMESTAMP, count);
+    if (!result || RAY_IS_ERR(result)) return result;
+    result->len = count;
+    int64_t* dst = (int64_t*)ray_data(result);
+
+    /* Slices don't carry HAS_NULLS themselves, so consult the parent's
+     * attrs when the input is a slice. */
+    bool maybe_null = (input->attrs & RAY_ATTR_HAS_NULLS) != 0;
+    if (!maybe_null && (input->attrs & RAY_ATTR_SLICE) && input->slice_parent)
+        maybe_null = (input->slice_parent->attrs & RAY_ATTR_HAS_NULLS) != 0;
+
+    /* DATE and TIME are read as int32 slots, TIMESTAMP as int64. */
+    const char* base = (const char*)ray_data(input);
+    const bool wide = (t == RAY_TIMESTAMP);
+
+    for (int64_t row = 0; row < count; row++) {
+        if (maybe_null && ray_vec_is_null(input, row)) {
+            dst[row] = 0;
+            ray_vec_set_null(result, row, true);
+            continue;
+        }
+        int64_t raw = wide ? ((const int64_t*)base)[row]
+                           : (int64_t)((const int32_t*)base)[row];
+        int64_t us = rte_to_us(t, raw);
+        int64_t rem = us % bucket;
+        if (rem < 0) rem += bucket;
+        dst[row] = rte_us_to_ts_raw(us - rem);
+    }
+    return result;
+}
+
+/* ============================================================================
+ * EXTRACT — date/time component extraction from temporal columns
+ *
+ * Input:  RAY_TIMESTAMP (i64 ns since 2000-01-01), RAY_DATE (i32 days since
+ *         2000-01-01), or RAY_TIME (i32 ms since midnight).
+ * Output: i64 vector of extracted field values; null rows stay null.
+ *
+ * Uses Howard Hinnant's civil_from_days algorithm (public domain) for
+ * Gregorian calendar decomposition.
+ * ============================================================================ */
+
+ray_t* exec_extract(ray_graph_t* g, ray_op_t* op) {
+    ray_t* input = exec_node(g, op->inputs[0]);
+    if (!input || RAY_IS_ERR(input)) return input;
+
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) { ray_release(input); return ray_error("nyi", NULL); }
+
+    int64_t field = ext->sym;
+    int64_t len = input->len;
+    int8_t in_type = input->type;
+
+    ray_t* result = ray_vec_new(RAY_I64, len);
+    if (!result || RAY_IS_ERR(result)) { ray_release(input); return result; }
+    result->len = len;
+
+    int64_t* out = (int64_t*)ray_data(result);
+
+    #undef  USEC_PER_SEC
+    #define USEC_PER_SEC  1000000LL
+    #define USEC_PER_MIN  (60LL  * USEC_PER_SEC)
+    #define USEC_PER_HOUR (3600LL * USEC_PER_SEC)
+    #define USEC_PER_DAY  (86400LL * USEC_PER_SEC)
+
+    /* Slice-aware HAS_NULLS check: slices don't carry HAS_NULLS on
+     * themselves, so inspect the parent when input is a slice. */
+    bool src_has_nulls =
+        (input->attrs & RAY_ATTR_HAS_NULLS) ||
+        ((input->attrs & RAY_ATTR_SLICE) && input->slice_parent &&
+         (input->slice_parent->attrs & RAY_ATTR_HAS_NULLS));
+
+    ray_morsel_t m;
+    ray_morsel_init(&m, input);
+    int64_t off = 0;
+
+    while (ray_morsel_next(&m)) {
+        int64_t n = m.morsel_len;
+
+        for (int64_t i = 0; i < n; i++) {
+            /* Propagate nulls: decomposing a null-sentinel's raw bytes
+             * would emit a bogus year / month / hour, so we zero the
+             * output slot and set its null bit instead. */
+            if (src_has_nulls && ray_vec_is_null(input, off + i)) {
+                out[off + i] = 0;
+                ray_vec_set_null(result, off + i, true);
+                continue;
+            }
+            int64_t us;
+            if (in_type == RAY_DATE) {
+                /* int32 days since 2000-01-01 -> microseconds */
+                int32_t d = ((const int32_t*)m.morsel_ptr)[i];
+                us = (int64_t)d * USEC_PER_DAY;
+            } else if (in_type == RAY_TIME) {
+                /* int32 milliseconds since midnight -> microseconds */
+                int32_t ms = ((const int32_t*)m.morsel_ptr)[i];
+                us = (int64_t)ms * 1000LL;
+            } else {
+                /* RAY_TIMESTAMP: int64 *nanoseconds* since 2000 (matches
+                 * io/csv parse and the rest of the runtime).  Convert to
+                 * µs for the calendar/time decomposition below.  RAY_I64
+                 * inputs flow through the same path; anything higher-
+                 * resolution than µs loses its low three digits, which
+                 * doesn't matter for calendar or clock field extraction. */
+                int64_t ns = ((const int64_t*)m.morsel_ptr)[i];
+                us = ns / 1000LL;           /* truncates toward zero ... */
+                if (ns % 1000LL < 0) us--;  /* ... so step down to floor; no -ns, which overflowed near INT64_MIN */
+            }
+
+            if (field == RAY_EXTRACT_EPOCH) {
+                out[off + i] = us;
+            } else if (field == RAY_EXTRACT_HOUR) {
+                int64_t day_us = us % USEC_PER_DAY;
+                if (day_us < 0) day_us += USEC_PER_DAY;
+                out[off + i] = day_us / USEC_PER_HOUR;
+            } else if (field == RAY_EXTRACT_MINUTE) {
+                int64_t day_us = us % USEC_PER_DAY;
+                if (day_us < 0) day_us += USEC_PER_DAY;
+                out[off + i] = (day_us % USEC_PER_HOUR) / USEC_PER_MIN;
+            } else if (field == RAY_EXTRACT_SECOND) {
+                int64_t day_us = us % USEC_PER_DAY;
+                if (day_us < 0) day_us += USEC_PER_DAY;
+                out[off + i] = (day_us % USEC_PER_MIN) / USEC_PER_SEC;
+            } else {
+                /* Calendar fields: YEAR, MONTH, DAY, DOW, DOY */
+                /* Floor-divide microseconds to get day count */
+                int64_t days_since_2000 = us / USEC_PER_DAY;
+                if (us < 0 && us % USEC_PER_DAY != 0) days_since_2000--;
+
+                /* Hinnant civil_from_days: shift to 0000-03-01 era-based epoch */
+                int64_t z = days_since_2000 + 10957 + 719468;
+                int64_t era = (z >= 0 ? z : z - 146096) / 146097;
+                uint64_t doe = (uint64_t)(z - era * 146097);
+                uint64_t yoe = (doe - doe/1460 + doe/36524 - doe/146096) / 365;
+                int64_t y = (int64_t)yoe + era * 400;
+                uint64_t doy_mar = doe - (365*yoe + yoe/4 - yoe/100);
+                uint64_t mp = (5*doy_mar + 2) / 153;
+                uint64_t d = doy_mar - (153*mp + 2) / 5 + 1;
+                uint64_t mo = mp < 10 ? mp + 3 : mp - 9;
+                y += (mo <= 2);
+
+                if (field == RAY_EXTRACT_YEAR) {
+                    out[off + i] = y;
+                } else if (field == RAY_EXTRACT_MONTH) {
+                    out[off + i] = (int64_t)mo;
+                } else if (field == RAY_EXTRACT_DAY) {
+                    out[off + i] = (int64_t)d;
+                } else if (field == RAY_EXTRACT_DOW) {
+                    /* ISO day of week: Mon=1 .. Sun=7
+                     * 2000-01-01 was Saturday (ISO 6).
+                     * Formula: ((days%7)+7+5)%7 + 1 */
+                    out[off + i] = ((days_since_2000 % 7) + 7 + 5) % 7 + 1;
+                } else if (field == RAY_EXTRACT_DOY) {
+                    /* Day of year [1..366], January-based */
+                    static const int dbm[13] = {
+                        0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334
+                    };
+                    if (mo < 1 || mo > 12) { out[off + i] = 0; continue; }
+                    int leap = (y % 4 == 0 && (y % 100 != 0 || y % 400 == 0));
+                    int64_t doy_jan = dbm[mo] + (int64_t)d;
+                    if (mo > 2 && leap) doy_jan++;
+                    out[off + i] = doy_jan;
+                } else {
+                    out[off + i] = 0;
+                }
+            }
+        }
+        off += n;
+    }
+
+    #undef USEC_PER_SEC
+    #undef USEC_PER_MIN
+    #undef USEC_PER_HOUR
+    #undef USEC_PER_DAY
+
+    ray_release(input);
+    return result;
+}
+
+/* ============================================================================
+ * DATE_TRUNC — truncate temporal value to specified precision
+ *
+ * Input:  RAY_TIMESTAMP (i64 ns since 2000-01-01), RAY_DATE (i32 days since
+ *         2000-01-01), or RAY_TIME (i32 ms since midnight).
+ * Output: RAY_TIMESTAMP (i64 ns since 2000-01-01); the math itself runs in µs.
+ * Sub-day: modular arithmetic. Month/year: calendar decompose + recompose.
+ * ============================================================================ */
+
+/* Inverse of Hinnant's civil_from_days: map a (year, month, day) triple to
+ * its day count relative to 2000-01-01 (negative for earlier dates). */
+static int64_t days_from_civil(int64_t y, int64_t m, int64_t d) {
+    const int64_t yr = (m <= 2) ? y - 1 : y;   /* the era math counts years from March */
+    const int64_t era = (yr >= 0 ? yr : yr - 399) / 400;
+    const uint64_t yoe = (uint64_t)(yr - era * 400);
+    const uint64_t mp = (m > 2) ? (uint64_t)m - 3 : (uint64_t)m + 9;
+    const uint64_t doe = 365 * yoe + yoe / 4 - yoe / 100 + (153 * mp + 2) / 5 + (uint64_t)d - 1;
+    return era * 146097 + (int64_t)doe - 719468 - 10957;
+}
+
+/* Round `v` down to a multiple of `unit` (unit > 0).  `%` truncates toward
+ * zero, so a negative remainder means one extra unit has to come off. */
+static int64_t date_trunc_floor_to(int64_t v, int64_t unit) {
+    int64_t r = v % unit;
+    return v - r - (r < 0 ? unit : 0);
+}
+
+/* Floor division `v / unit` (unit > 0).  Never negates `v`, so it stays
+ * well-defined for every int64_t input, INT64_MIN included. */
+static int64_t date_trunc_floor_div(int64_t v, int64_t unit) {
+    int64_t q = v / unit;
+    return (v % unit < 0) ? q - 1 : q;
+}
+
+/* Day count (since 2000-01-01) of the first day of the month containing
+ * `days2k`, or of the year when `year_only`; via Hinnant civil_from_days. */
+static int64_t date_trunc_period_start(int64_t days2k, bool year_only) {
+    int64_t z = days2k + 10957 + 719468;   /* shift to the 0000-03-01 epoch */
+    int64_t era = (z >= 0 ? z : z - 146096) / 146097;
+    uint64_t doe = (uint64_t)(z - era * 146097);
+    uint64_t yoe = (doe - doe/1460 + doe/36524 - doe/146096) / 365;
+    int64_t y = (int64_t)yoe + era * 400;
+    uint64_t doy_mar = doe - (365*yoe + yoe/4 - yoe/100);
+    uint64_t mp = (5*doy_mar + 2) / 153;
+    uint64_t mo = mp < 10 ? mp + 3 : mp - 9;
+    y += (mo <= 2);   /* Jan/Feb close out the March-based year */
+    return days_from_civil(y, year_only ? 1 : (int64_t)mo, 1);
+}
+
+/* Execute a DATE_TRUNC node: evaluate op->inputs[0] and floor each element
+ * to the precision named by ext->sym (a RAY_EXTRACT_* code).  SECOND through
+ * DAY use modular arithmetic on µs, MONTH / YEAR go through a civil date,
+ * and any other code passes the value through at µs precision.
+ *
+ * RAY_DATE is read as int32 days, RAY_TIME as int32 ms, anything else as
+ * int64 ns.  Returns a new RAY_TIMESTAMP vector (int64 ns) of equal length
+ * with input nulls carried over, and releases the evaluated input.  A NULL
+ * or error input is returned as-is; a missing ext node yields "nyi". */
+ray_t* exec_date_trunc(ray_graph_t* g, ray_op_t* op) {
+    ray_t* input = exec_node(g, op->inputs[0]);
+    if (!input || RAY_IS_ERR(input)) return input;
+
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) { ray_release(input); return ray_error("nyi", NULL); }
+
+    int64_t field = ext->sym;
+    int64_t len = input->len;
+    int8_t in_type = input->type;
+
+    ray_t* result = ray_vec_new(RAY_TIMESTAMP, len);
+    if (!result || RAY_IS_ERR(result)) { ray_release(input); return result; }
+    result->len = len;
+    int64_t* out = (int64_t*)ray_data(result);
+
+    #define DT_USEC_PER_SEC  1000000LL
+    #define DT_USEC_PER_MIN  (60LL  * DT_USEC_PER_SEC)
+    #define DT_USEC_PER_HOUR (3600LL * DT_USEC_PER_SEC)
+    #define DT_USEC_PER_DAY  (86400LL * DT_USEC_PER_SEC)
+
+    /* Slice-aware HAS_NULLS check: slices don't carry HAS_NULLS on
+     * themselves, so inspect the parent when input is a slice. */
+    bool src_has_nulls =
+        (input->attrs & RAY_ATTR_HAS_NULLS) ||
+        ((input->attrs & RAY_ATTR_SLICE) && input->slice_parent &&
+         (input->slice_parent->attrs & RAY_ATTR_HAS_NULLS));
+
+    ray_morsel_t m;
+    ray_morsel_init(&m, input);
+    int64_t off = 0;
+
+    while (ray_morsel_next(&m)) {
+        int64_t n = m.morsel_len;
+        for (int64_t i = 0; i < n; i++) {
+            /* Null sentinels decode to garbage times; propagate the
+             * null bit instead of emitting a bogus truncated value. */
+            if (src_has_nulls && ray_vec_is_null(input, off + i)) {
+                out[off + i] = 0;
+                ray_vec_set_null(result, off + i, true);
+                continue;
+            }
+
+            int64_t us;
+            if (in_type == RAY_DATE) {
+                int32_t d = ((const int32_t*)m.morsel_ptr)[i];
+                us = (int64_t)d * DT_USEC_PER_DAY;
+            } else if (in_type == RAY_TIME) {
+                int32_t ms = ((const int32_t*)m.morsel_ptr)[i];
+                us = (int64_t)ms * 1000LL;
+            } else {
+                /* RAY_TIMESTAMP: nanoseconds since 2000 → microseconds,
+                 * rounded toward -infinity.  Sub-µs precision is dropped on
+                 * purpose — every field truncates at a second or coarser. */
+                int64_t ns = ((const int64_t*)m.morsel_ptr)[i];
+                us = date_trunc_floor_div(ns, 1000LL);
+            }
+
+            /* Truncate in µs; the store below rescales to ns, the unit the
+             * rest of the runtime expects in a RAY_TIMESTAMP vector. */
+            int64_t out_us;
+            switch (field) {
+                case RAY_EXTRACT_SECOND:
+                    out_us = date_trunc_floor_to(us, DT_USEC_PER_SEC);
+                    break;
+                case RAY_EXTRACT_MINUTE:
+                    out_us = date_trunc_floor_to(us, DT_USEC_PER_MIN);
+                    break;
+                case RAY_EXTRACT_HOUR:
+                    out_us = date_trunc_floor_to(us, DT_USEC_PER_HOUR);
+                    break;
+                case RAY_EXTRACT_DAY:
+                    out_us = date_trunc_floor_to(us, DT_USEC_PER_DAY);
+                    break;
+                case RAY_EXTRACT_MONTH:
+                case RAY_EXTRACT_YEAR: {
+                    int64_t days2k = date_trunc_floor_div(us, DT_USEC_PER_DAY);
+                    bool year_only = (field == RAY_EXTRACT_YEAR);
+                    out_us = date_trunc_period_start(days2k, year_only) * DT_USEC_PER_DAY;
+                    break;
+                }
+                default:
+                    out_us = us;
+                    break;
+            }
+            out[off + i] = out_us * 1000LL;  /* µs → ns for RAY_TIMESTAMP */
+        }
+        off += n;
+    }
+
+    #undef DT_USEC_PER_SEC
+    #undef DT_USEC_PER_MIN
+    #undef DT_USEC_PER_HOUR
+    #undef DT_USEC_PER_DAY
+    ray_release(input);
+    return result;
+}
+
+/* ── Builtins ── */
+
+/* True when `arg` is a symbol atom whose interned text is exactly "global". */
+static bool is_global_arg(ray_t* arg) {
+    if (!arg || arg->type != -RAY_SYM)
+        return false;
+    ray_t* name = ray_sym_str(arg->i64);
+    if (!name || ray_str_len(name) != 6)
+        return false;
+    return memcmp(ray_str_ptr(name), "global", 6) == 0;
+}
+
+/* Rayforce epoch (2000-01-01 00:00:00 UTC) expressed in seconds after 1970-01-01 */
+static time_t ray_epoch_offset(void) {
+    /* 30 years with 7 leap days = 10957 days; 10957 * 86400 = 946684800 */
+    return (time_t)(10957LL * 86400LL);
+}
+
+/* (date 'local) or (date 'global) — returns current date as DATE atom.
+ * Overloaded: if arg is a DATE / TIME / TIMESTAMP value or vector,
+ * returns `arg` truncated to the day boundary (RAY_TIMESTAMP result).
+ * This lets `(date ts)` and `ts.date` both flow through the registered
+ * unary builtin with no special-case detour.
+ *
+ * Clock mode: any argument other than the symbol 'global (including
+ * NULL) selects local time.  The result is the number of days since
+ * 2000-01-01 of the *calendar* date in the selected zone, or a "domain"
+ * error if the C library cannot break down the current time. */
+ray_t* ray_date_clock_fn(ray_t* arg) {
+    if (arg) {
+        int8_t t = arg->type < 0 ? (int8_t)-arg->type : arg->type;
+        if (t == RAY_DATE || t == RAY_TIME || t == RAY_TIMESTAMP)
+            return ray_temporal_truncate(arg, RAY_EXTRACT_DAY);
+    }
+    bool local = !is_global_arg(arg);
+    time_t now = time(NULL);
+    struct tm* t = local ? localtime(&now) : gmtime(&now);
+    if (!t) return ray_error("domain", "date: failed to get current time");
+
+    /* Derive the day number from the broken-down calendar fields rather
+     * than from a time_t.  mktime() of local midnight yields a UTC
+     * instant; east of Greenwich that instant still lies in the previous
+     * UTC day, so dividing it by 86400 reported yesterday's date.  The
+     * civil fields are already expressed in the requested zone, which
+     * makes the local and global paths identical from here on. */
+    int64_t days = days_from_civil((int64_t)t->tm_year + 1900,
+                                   (int64_t)t->tm_mon + 1,
+                                   (int64_t)t->tm_mday);
+    return ray_date(days);
+}
+
+/* (time 'local) or (time 'global) — current time of day as a TIME atom
+ * (milliseconds since midnight, whole-second granularity).  A DATE / TIME /
+ * TIMESTAMP argument is truncated to the second instead (RAY_TIMESTAMP). */
+ray_t* ray_time_clock_fn(ray_t* arg) {
+    if (arg) {
+        int8_t base = (arg->type < 0) ? (int8_t)-arg->type : arg->type;
+        bool temporal = (base == RAY_DATE) || (base == RAY_TIME) || (base == RAY_TIMESTAMP);
+        if (temporal) return ray_temporal_truncate(arg, RAY_EXTRACT_SECOND);
+    }
+    bool use_utc = is_global_arg(arg);
+    time_t now = time(NULL);
+    struct tm* parts = use_utc ? gmtime(&now) : localtime(&now);
+    if (!parts) return ray_error("domain", "time: failed to get current time");
+
+    int32_t sec_of_day = (parts->tm_hour * 60 + parts->tm_min) * 60 + parts->tm_sec;
+    return ray_time((int64_t)(sec_of_day * 1000));
+}
+
+/* (timestamp 'local) or (timestamp 'global) — returns current timestamp (ns since 2000.01.01).
+ * Whole-second resolution (from time()); any non-'global argument means local wall-clock. */
+ray_t* ray_timestamp_clock_fn(ray_t* arg) {
+    bool local = !is_global_arg(arg);
+    time_t now = time(NULL);
+    struct tm* t = local ? localtime(&now) : gmtime(&now);
+    if (!t) return ray_error("domain", "timestamp: failed to get current time");
+
+    int64_t secs;
+    if (!local) {
+        secs = (int64_t)now - (int64_t)ray_epoch_offset();
+    } else {
+        /* Rebuild local wall-clock seconds from the broken-down fields:
+         * mktime(localtime(now)) merely round-trips to `now`, i.e. UTC. */
+        int64_t days = days_from_civil((int64_t)t->tm_year + 1900, t->tm_mon + 1, t->tm_mday);
+        secs = days * 86400LL + t->tm_hour * 3600LL + t->tm_min * 60LL + t->tm_sec;
+    }
+
+    return ray_timestamp(secs * 1000000000LL);
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/temporal.h b/crates/rayforce-sys/vendor/rayforce/src/ops/temporal.h
new file mode 100644
index 0000000..91016c2
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/temporal.h
@@ -0,0 +1,84 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_OPS_TEMPORAL_H
+#define RAY_OPS_TEMPORAL_H
+
+#include <rayforce.h>
+#include "ops/ops.h"
+
+/* Extract a calendar / clock field from a RAY_DATE, RAY_TIME, or
+ * RAY_TIMESTAMP input (vector or atom).  `field` is one of the
+ * RAY_EXTRACT_* codes from ops/ops.h.
+ *
+ * Vector input → RAY_I64 vector of the same length, each slot holding the
+ * extracted value.  Atom input (type < 0) → RAY_I64 atom.  Returns an
+ * error ray_t* if the input isn't a supported temporal type.  The
+ * returned value is caller-owned (rc >= 1); caller must ray_release.
+ * Does NOT consume the input's refcount. */
+ray_t* ray_temporal_extract(ray_t* input, int field);
+
+/* Map a sym_id (used as a dotted-name segment, e.g. `.dd`, `.yyyy`, `.mm`)
+ * to a RAY_EXTRACT_* field code.  Returns -1 if the sym isn't a known
+ * temporal field name.  Recognised segments:
+ *     yyyy      → RAY_EXTRACT_YEAR
+ *     mm        → RAY_EXTRACT_MONTH
+ *     dd        → RAY_EXTRACT_DAY
+ *     hh        → RAY_EXTRACT_HOUR
+ *     minute    → RAY_EXTRACT_MINUTE
+ *     ss        → RAY_EXTRACT_SECOND
+ *     dow       → RAY_EXTRACT_DOW (ISO day-of-week 1..7, Mon=1)
+ *     doy       → RAY_EXTRACT_DOY (day-of-year 1..366)
+ *
+ * `mm` is unambiguously MONTH — MINUTE spelling stays long-form
+ * because a two-letter token can't serve both meanings in a uniform
+ * dotted walk that has no container-type-dependent dispatch. */
+int ray_temporal_field_from_sym(int64_t sym_id);
+
+/* Truncate a temporal value/vector to day boundary (`kind == 0`) or to
+ * time-of-day (`kind == 1`, i.e. microseconds within the current day).
+ * Returns a freshly-allocated RAY_TIMESTAMP-typed ray_t* (caller-owned);
+ * nulls in the input propagate to nulls in the output.  `kind` uses
+ * RAY_EXTRACT_DAY (for `.date`) or RAY_EXTRACT_SECOND (for `.time`) so
+ * the set of codes stays consistent with exec_date_trunc.  Returns
+ * ray_error("type", ...) if input isn't a supported temporal type. */
+ray_t* ray_temporal_truncate(ray_t* input, int kind);
+
+/* Dotted-segment sym → truncate kind (see above).  Returns -1 if the
+ * segment isn't one of the truncate-flavoured names (`date` / `time`). */
+int ray_temporal_trunc_from_sym(int64_t sym_id);
+
+/* Unary builtins: thin wrappers over ray_temporal_extract with the
+ * field pre-bound.  Exposed so eval.c can register them alongside the
+ * rest of the language's unary functions — `(ss ts)` and `ts.ss` then
+ * dispatch through the normal call machinery. */
+ray_t* ray_extract_ss_fn(ray_t* x);
+ray_t* ray_extract_hh_fn(ray_t* x);
+ray_t* ray_extract_minute_fn(ray_t* x);
+ray_t* ray_extract_yyyy_fn(ray_t* x);
+ray_t* ray_extract_mm_fn(ray_t* x);
+ray_t* ray_extract_dd_fn(ray_t* x);
+ray_t* ray_extract_dow_fn(ray_t* x);
+ray_t* ray_extract_doy_fn(ray_t* x);
+
+#endif /* RAY_OPS_TEMPORAL_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/traverse.c b/crates/rayforce-sys/vendor/rayforce/src/ops/traverse.c
new file mode 100644
index 0000000..3e12201
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/traverse.c
@@ -0,0 +1,2641 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "ops/internal.h"
+
+/* ============================================================================
+ * Graph execution functions
+ * ============================================================================ */
+
+/* exec_expand_factorized: factorized output for expand+group fusion.
+ * Builds a table of _src (each input node of non-zero degree, in input
+ * order) and _count (its degree) without materializing (src, dst) pairs. */
+static ray_t* exec_expand_factorized(ray_rel_t* rel, uint8_t direction, ray_t* src_vec) {
+    const int64_t n_in = src_vec->len;
+    const int64_t* in_nodes = (const int64_t*)ray_data(src_vec);
+    const int64_t cap = n_in > 0 ? n_in : 1;
+    const bool use_fwd = (direction == 0 || direction == 2);
+    const bool use_rev = (direction == 1 || direction == 2);
+
+    ray_t* node_col = ray_vec_new(RAY_I64, cap);
+    ray_t* deg_col = ray_vec_new(RAY_I64, cap);
+    if (!node_col || RAY_IS_ERR(node_col) || !deg_col || RAY_IS_ERR(deg_col)) {
+        if (node_col && !RAY_IS_ERR(node_col)) ray_release(node_col);
+        if (deg_col && !RAY_IS_ERR(deg_col)) ray_release(deg_col);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t* node_out = (int64_t*)ray_data(node_col);
+    int64_t* deg_out = (int64_t*)ray_data(deg_col);
+    int64_t kept = 0;
+    for (int64_t i = 0; i < n_in; i++) {
+        const int64_t node = in_nodes[i];
+        if (node < 0) continue;   /* negative ids never pass the CSR range checks */
+        int64_t degree = 0;
+        if (use_fwd && node < rel->fwd.n_nodes) degree += ray_csr_degree(&rel->fwd, node);
+        if (use_rev && node < rel->rev.n_nodes) degree += ray_csr_degree(&rel->rev, node);
+        if (degree <= 0) continue;
+        node_out[kept] = node;
+        deg_out[kept] = degree;
+        kept++;
+    }
+    node_col->len = kept;
+    deg_col->len = kept;
+
+    int64_t src_sym = ray_sym_intern("_src", 4);
+    int64_t cnt_sym = ray_sym_intern("_count", 6);
+    ray_t* table = ray_table_new(2);
+    if (!table || RAY_IS_ERR(table)) {
+        ray_release(node_col); ray_release(deg_col);
+        return ray_error("oom", NULL);
+    }
+    /* Attach both columns, adopting whatever table pointer add_col returns. */
+    ray_t* const cols[2] = { node_col, deg_col };
+    const int64_t syms[2] = { src_sym, cnt_sym };
+    for (int c = 0; c < 2; c++) {
+        ray_t* grown = ray_table_add_col(table, syms[c], cols[c]);
+        if (!grown || RAY_IS_ERR(grown)) {
+            ray_release(node_col); ray_release(deg_col); ray_release(table);
+            return ray_error("oom", NULL);
+        }
+        table = grown;
+    }
+    ray_release(node_col); ray_release(deg_col);
+    return table;
+}
+
+/* exec_expand: 1-hop CSR neighbor expansion of the node ids in src_vec.
+ * Count-then-fill pattern (same as exec_join); result is a 2-column table. */
+ray_t* exec_expand(ray_graph_t* g, ray_op_t* op, ray_t* src_vec) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    ray_rel_t* rel = (ray_rel_t*)ext->graph.rel;
+    if (!rel) return ray_error("schema", NULL);
+
+    /* Factorized mode: emit (_src, _count) degree rows instead of pairs */
+    if (ext->graph.factorized)
+        return exec_expand_factorized(rel, ext->graph.direction, src_vec);
+
+    uint8_t direction = ext->graph.direction;
+    int64_t n_src = src_vec->len;
+    int64_t* src_data = (int64_t*)ray_data(src_vec);
+
+    /* SIP runtime: check for source-side selection bitmap stored on the
+     * expand ext node (set by optimizer sip_pass or manually for testing).
+     *
+     * If sip_sel is not pre-built but pad[2] carries a non-zero filter
+     * hint and there are more than 64 sources, build the bitmap here by
+     * marking every node with degree > 0 in the active CSR direction(s). */
+    uint64_t* src_sel_bits = NULL;
+    int64_t src_sel_len = 0;
+    ray_t* sip_sel = (ray_t*)ext->graph.sip_sel;
+    if (!sip_sel) {
+        uint8_t filter_hint = ext->base.pad[2];
+        if (filter_hint > 0 && n_src > 64) {
+            /* Build SIP bitmap sized to the larger CSR; set bits mark nodes
+             * with degree > 0.  direction==2 (both) scans fwd and rev CSRs. */
+            int64_t nn = rel->fwd.n_nodes;
+            if (rel->rev.n_nodes > nn) nn = rel->rev.n_nodes;
+            ray_t* built_sel = ray_sel_new(nn);
+            if (built_sel && !RAY_IS_ERR(built_sel)) {
+                uint64_t* bits = ray_sel_bits(built_sel);
+                if (direction == 0 || direction == 2) {
+                    for (int64_t nd = 0; nd < rel->fwd.n_nodes; nd++)
+                        if (ray_csr_degree(&rel->fwd, nd) > 0)
+                            RAY_SEL_BIT_SET(bits, nd);
+                }
+                if (direction == 1 || direction == 2) {
+                    for (int64_t nd = 0; nd < rel->rev.n_nodes; nd++)
+                        if (ray_csr_degree(&rel->rev, nd) > 0)
+                            RAY_SEL_BIT_SET(bits, nd);
+                }
+                ext->graph.sip_sel = built_sel;  /* kept on ext; read back at entry above */
+                sip_sel = built_sel;
+            }
+        }
+    }
+    if (sip_sel && !RAY_IS_ERR(sip_sel) && sip_sel->type == RAY_SEL) {
+        src_sel_bits = ray_sel_bits(sip_sel);
+        src_sel_len = sip_sel->len;
+    }
+
+    /* Helper to expand one CSR direction; every path through it returns. */
+    #define EXPAND_DIR(csr_ptr) do { \
+        ray_csr_t* csr = (csr_ptr); \
+        /* Phase 1: count total output pairs */ \
+        int64_t total = 0; \
+        for (int64_t i = 0; i < n_src; i++) { \
+            int64_t node = src_data[i]; \
+            /* SIP skip: in-range node whose bitmap bit is clear */ \
+            if (src_sel_bits && node >= 0 && node < src_sel_len \
+                && !RAY_SEL_BIT_TEST(src_sel_bits, node)) continue; \
+            if (node >= 0 && node < csr->n_nodes) \
+                total += ray_csr_degree(csr, node); \
+        } \
+        /* Phase 2: allocate (at least one slot) and fill */ \
+        ray_t* d_src = ray_vec_new(RAY_I64, total > 0 ? total : 1); \
+        ray_t* d_dst = ray_vec_new(RAY_I64, total > 0 ? total : 1); \
+        if (!d_src || RAY_IS_ERR(d_src) || !d_dst || RAY_IS_ERR(d_dst)) { \
+            if (d_src && !RAY_IS_ERR(d_src)) ray_release(d_src); \
+            if (d_dst && !RAY_IS_ERR(d_dst)) ray_release(d_dst); \
+            return ray_error("oom", NULL); \
+        } \
+        d_src->len = total; d_dst->len = total; \
+        int64_t* sd = (int64_t*)ray_data(d_src); \
+        int64_t* dd = (int64_t*)ray_data(d_dst); \
+        int64_t pos = 0; \
+        for (int64_t i = 0; i < n_src; i++) { \
+            int64_t node = src_data[i]; \
+            if (node < 0 || node >= csr->n_nodes) continue; \
+            /* SIP skip: must match count phase */ \
+            if (src_sel_bits && node < src_sel_len \
+                && !RAY_SEL_BIT_TEST(src_sel_bits, node)) continue; \
+            int64_t cnt; \
+            int64_t* nbrs = ray_csr_neighbors(csr, node, &cnt); \
+            for (int64_t j = 0; j < cnt; j++) { \
+                sd[pos] = node; \
+                dd[pos] = nbrs[j]; \
+                pos++; \
+            } \
+        } \
+        /* Build result table with columns _src and _dst */ \
+        int64_t src_sym = ray_sym_intern("_src", 4); \
+        int64_t dst_sym = ray_sym_intern("_dst", 4); \
+        ray_t* result = ray_table_new(2); \
+        if (!result || RAY_IS_ERR(result)) { \
+            ray_release(d_src); ray_release(d_dst); \
+            return ray_error("oom", NULL); \
+        } \
+        ray_t* _tmp = ray_table_add_col(result, src_sym, d_src); \
+        if (!_tmp || RAY_IS_ERR(_tmp)) { ray_release(d_src); ray_release(d_dst); ray_release(result); return ray_error("oom", NULL); } \
+        result = _tmp; \
+        _tmp = ray_table_add_col(result, dst_sym, d_dst); \
+        if (!_tmp || RAY_IS_ERR(_tmp)) { ray_release(d_src); ray_release(d_dst); ray_release(result); return ray_error("oom", NULL); } \
+        result = _tmp; \
+        ray_release(d_src); ray_release(d_dst); \
+        return result; \
+    } while (0)
+
+    if (direction == 0) {
+        EXPAND_DIR(&rel->fwd);
+    } else if (direction == 1) {
+        EXPAND_DIR(&rel->rev);
+    } else {
+        /* direction == 2: both — all fwd pairs first, then all rev pairs */
+        ray_csr_t* fwd = &rel->fwd;
+        ray_csr_t* rev = &rel->rev;
+
+        /* Count forward (same SIP skip rule as EXPAND_DIR) */
+        int64_t fwd_total = 0;
+        for (int64_t i = 0; i < n_src; i++) {
+            int64_t node = src_data[i];
+            if (src_sel_bits && node >= 0 && node < src_sel_len
+                && !RAY_SEL_BIT_TEST(src_sel_bits, node)) continue;
+            if (node >= 0 && node < fwd->n_nodes)
+                fwd_total += ray_csr_degree(fwd, node);
+        }
+        /* Count reverse */
+        int64_t rev_total = 0;
+        for (int64_t i = 0; i < n_src; i++) {
+            int64_t node = src_data[i];
+            if (src_sel_bits && node >= 0 && node < src_sel_len
+                && !RAY_SEL_BIT_TEST(src_sel_bits, node)) continue;
+            if (node >= 0 && node < rev->n_nodes)
+                rev_total += ray_csr_degree(rev, node);
+        }
+
+        int64_t total = fwd_total + rev_total;
+        ray_t* d_src = ray_vec_new(RAY_I64, total > 0 ? total : 1);
+        ray_t* d_dst = ray_vec_new(RAY_I64, total > 0 ? total : 1);
+        if (!d_src || RAY_IS_ERR(d_src) || !d_dst || RAY_IS_ERR(d_dst)) {
+            if (d_src && !RAY_IS_ERR(d_src)) ray_release(d_src);
+            if (d_dst && !RAY_IS_ERR(d_dst)) ray_release(d_dst);
+            return ray_error("oom", NULL);
+        }
+        d_src->len = total; d_dst->len = total;
+        int64_t* sd = (int64_t*)ray_data(d_src);
+        int64_t* dd = (int64_t*)ray_data(d_dst);
+        int64_t pos = 0;
+
+        /* Fill forward: rows [0, fwd_total) */
+        for (int64_t i = 0; i < n_src; i++) {
+            int64_t node = src_data[i];
+            if (node < 0 || node >= fwd->n_nodes) continue;
+            if (src_sel_bits && node < src_sel_len
+                && !RAY_SEL_BIT_TEST(src_sel_bits, node)) continue;
+            int64_t cnt;
+            int64_t* nbrs = ray_csr_neighbors(fwd, node, &cnt);
+            for (int64_t j = 0; j < cnt; j++) {
+                sd[pos] = node; dd[pos] = nbrs[j]; pos++;
+            }
+        }
+        /* Fill reverse: continues at pos, right after the forward rows */
+        for (int64_t i = 0; i < n_src; i++) {
+            int64_t node = src_data[i];
+            if (node < 0 || node >= rev->n_nodes) continue;
+            if (src_sel_bits && node < src_sel_len
+                && !RAY_SEL_BIT_TEST(src_sel_bits, node)) continue;
+            int64_t cnt;
+            int64_t* nbrs = ray_csr_neighbors(rev, node, &cnt);
+            for (int64_t j = 0; j < cnt; j++) {
+                sd[pos] = node; dd[pos] = nbrs[j]; pos++;
+            }
+        }
+
+        int64_t src_sym = ray_sym_intern("_src", 4);
+        int64_t dst_sym = ray_sym_intern("_dst", 4);
+        ray_t* result = ray_table_new(2);
+        if (!result || RAY_IS_ERR(result)) {
+            ray_release(d_src); ray_release(d_dst);
+            return ray_error("oom", NULL);
+        }
+        ray_t* tmp = ray_table_add_col(result, src_sym, d_src);
+        if (!tmp || RAY_IS_ERR(tmp)) { ray_release(d_src); ray_release(d_dst); ray_release(result); return ray_error("oom", NULL); }
+        result = tmp;
+        tmp = ray_table_add_col(result, dst_sym, d_dst);
+        if (!tmp || RAY_IS_ERR(tmp)) { ray_release(d_src); ray_release(d_dst); ray_release(result); return ray_error("oom", NULL); }
+        result = tmp;
+        ray_release(d_src); ray_release(d_dst);
+        return result;
+    }
+    #undef EXPAND_DIR
+}
+
+/* exec_var_expand: iterative BFS with depth limit and cycle detection */
+ray_t* exec_var_expand(ray_graph_t* g, ray_op_t* op, ray_t* start_vec) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    ray_rel_t* rel = (ray_rel_t*)ext->graph.rel;
+    if (!rel) return ray_error("schema", NULL);
+
+    uint8_t direction = ext->graph.direction;
+    uint8_t min_depth = ext->graph.min_depth;
+    uint8_t max_depth = ext->graph.max_depth;
+    ray_csr_t* csr_fwd = &rel->fwd;
+    ray_csr_t* csr_rev = &rel->rev;
+    /* For direction==2 (both), use fwd for n_nodes bound but expand both */
+    ray_csr_t* csr = (direction == 1) ? csr_rev : csr_fwd;
+
+    int64_t n_start = start_vec->len;
+    int64_t* start_data = (int64_t*)ray_data(start_vec);
+
+    /* Pre-allocate output buffers (grow as needed) */
+    int64_t out_cap = 1024;
+    ray_t *start_hdr, *end_hdr, *depth_hdr;
+    int64_t* out_start = (int64_t*)scratch_alloc(&start_hdr, (size_t)out_cap * sizeof(int64_t));
+    int64_t* out_end   = (int64_t*)scratch_alloc(&end_hdr,   (size_t)out_cap * sizeof(int64_t));
+    int64_t* out_depth = (int64_t*)scratch_alloc(&depth_hdr, (size_t)out_cap * sizeof(int64_t));
+    if (!out_start || !out_end || !out_depth) {
+        scratch_free(start_hdr); scratch_free(end_hdr); scratch_free(depth_hdr);
+        return ray_error("oom", NULL);
+    }
+    int64_t out_count = 0;
+
+    /* For direction==2, use the larger n_nodes bound */
+    int64_t bfs_n_nodes = csr->n_nodes;
+    if (direction == 2 && csr_rev->n_nodes > bfs_n_nodes)
+        bfs_n_nodes = csr_rev->n_nodes;
+
+    /* BFS per start node */
+    for (int64_t s = 0; s < n_start; s++) {
+        int64_t start_node = start_data[s];
+        if (start_node < 0 || start_node >= bfs_n_nodes) continue;
+
+        /* Visited bitmap via RAY_SEL */
+        ray_t* visited_sel = ray_sel_new(bfs_n_nodes);
+        if (!visited_sel || RAY_IS_ERR(visited_sel)) continue;
+        uint64_t* visited = ray_sel_bits(visited_sel);
+        RAY_SEL_BIT_SET(visited, start_node);
+
+        /* Frontier */
+        ray_t* front_hdr;
+        int64_t front_cap = 256;
+        int64_t* frontier = (int64_t*)scratch_alloc(&front_hdr, (size_t)front_cap * sizeof(int64_t));
+        if (!frontier) { ray_release(visited_sel); continue; }
+        frontier[0] = start_node;
+        int64_t front_len = 1;
+
+        for (uint8_t depth = 1; depth <= max_depth && front_len > 0; depth++) {
+            ray_t* next_hdr;
+            int64_t next_cap = (front_len > INT64_MAX / 4) ? INT64_MAX : front_len * 4;
+            if (next_cap < 64) next_cap = 64;
+            int64_t* next_front = (int64_t*)scratch_alloc(&next_hdr, (size_t)next_cap * sizeof(int64_t));
+            if (!next_front) { scratch_free(front_hdr); ray_release(visited_sel); goto cleanup; }
+            int64_t next_len = 0;
+
+            for (int64_t f = 0; f < front_len; f++) {
+                int64_t node = frontier[f];
+                /* Expand neighbors from active CSR(s).
+                 * For direction==2 (both), expand fwd then rev. */
+                int n_csrs = (direction == 2) ? 2 : 1;
+                ray_csr_t* csrs[2] = { csr, csr_rev };
+                for (int ci = 0; ci < n_csrs; ci++) {
+                    ray_csr_t* cur_csr = csrs[ci];
+                    if (node < 0 || node >= cur_csr->n_nodes) continue;
+                int64_t cnt;
+                int64_t* nbrs = ray_csr_neighbors(cur_csr, node, &cnt);
+                for (int64_t j = 0; j < cnt; j++) {
+                    int64_t nbr = nbrs[j];
+                    if (nbr < 0 || nbr >= bfs_n_nodes) continue;
+                    if (RAY_SEL_BIT_TEST(visited, nbr)) continue;
+                    RAY_SEL_BIT_SET(visited, nbr);
+
+                    /* Grow next_front if needed */
+                    if (next_len >= next_cap) {
+                        if (next_cap > INT64_MAX / 2) break;
+                        int64_t new_cap = next_cap * 2;
+                        int64_t* new_nf = (int64_t*)scratch_realloc(&next_hdr,
+                            (size_t)next_cap * sizeof(int64_t),
+                            (size_t)new_cap * sizeof(int64_t));
+                        if (!new_nf) break;
+                        next_front = new_nf;
+                        next_cap = new_cap;
+                    }
+                    next_front[next_len++] = nbr;
+
+                    /* Emit if within depth range */
+                    if (depth >= min_depth) {
+                        if (out_count >= out_cap) {
+                            if (out_cap > INT64_MAX / 2) break;
+                            int64_t new_oc = out_cap * 2;
+                            /* Grow all three buffers atomically — alloc new
+                             * copies first, commit only if all succeed. */
+                            ray_t *ns_h = NULL, *ne_h = NULL, *nd_h = NULL;
+                            size_t old_sz = (size_t)out_cap * sizeof(int64_t);
+                            size_t new_sz = (size_t)new_oc * sizeof(int64_t);
+                            int64_t* ns = (int64_t*)scratch_alloc(&ns_h, new_sz);
+                            int64_t* ne = (int64_t*)scratch_alloc(&ne_h, new_sz);
+                            int64_t* nd_buf = (int64_t*)scratch_alloc(&nd_h, new_sz);
+                            if (!ns || !ne || !nd_buf) {
+                                scratch_free(ns_h); scratch_free(ne_h); scratch_free(nd_h);
+                                break;
+                            }
+                            memcpy(ns, out_start, old_sz);
+                            memcpy(ne, out_end, old_sz);
+                            memcpy(nd_buf, out_depth, old_sz);
+                            scratch_free(start_hdr); scratch_free(end_hdr); scratch_free(depth_hdr);
+                            start_hdr = ns_h; end_hdr = ne_h; depth_hdr = nd_h;
+                            out_start = ns; out_end = ne; out_depth = nd_buf;
+                            out_cap = new_oc;
+                        }
+                        out_start[out_count] = start_node;
+                        out_end[out_count] = nbr;
+                        out_depth[out_count] = depth;
+                        out_count++;
+                    }
+                }
+                } /* end for ci (CSR directions) */
+            }
+
+            scratch_free(front_hdr);
+            front_hdr = next_hdr;
+            frontier = next_front;
+            front_len = next_len;
+        }
+
+        scratch_free(front_hdr);
+        ray_release(visited_sel);
+    }
+
+cleanup:;
+    /* Build output table */
+    ray_t* v_start = ray_vec_from_raw(RAY_I64, out_start, out_count);
+    ray_t* v_end   = ray_vec_from_raw(RAY_I64, out_end,   out_count);
+    ray_t* v_depth = ray_vec_from_raw(RAY_I64, out_depth, out_count);
+    scratch_free(start_hdr); scratch_free(end_hdr); scratch_free(depth_hdr);
+
+    if (!v_start || RAY_IS_ERR(v_start) || !v_end || RAY_IS_ERR(v_end) ||
+        !v_depth || RAY_IS_ERR(v_depth)) {
+        if (v_start && !RAY_IS_ERR(v_start)) ray_release(v_start);
+        if (v_end && !RAY_IS_ERR(v_end)) ray_release(v_end);
+        if (v_depth && !RAY_IS_ERR(v_depth)) ray_release(v_depth);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t start_sym = ray_sym_intern("_start", 6);
+    int64_t end_sym   = ray_sym_intern("_end", 4);
+    int64_t depth_sym = ray_sym_intern("_depth", 6);
+
+    ray_t* result = ray_table_new(3);
+    if (!result || RAY_IS_ERR(result)) {
+        ray_release(v_start); ray_release(v_end); ray_release(v_depth);
+        return ray_error("oom", NULL);
+    }
+    ray_t* tmp = ray_table_add_col(result, start_sym, v_start);
+    if (!tmp || RAY_IS_ERR(tmp)) { ray_release(v_start); ray_release(v_end); ray_release(v_depth); ray_release(result); return ray_error("oom", NULL); }
+    result = tmp;
+    tmp = ray_table_add_col(result, end_sym, v_end);
+    if (!tmp || RAY_IS_ERR(tmp)) { ray_release(v_start); ray_release(v_end); ray_release(v_depth); ray_release(result); return ray_error("oom", NULL); }
+    result = tmp;
+    tmp = ray_table_add_col(result, depth_sym, v_depth);
+    if (!tmp || RAY_IS_ERR(tmp)) { ray_release(v_start); ray_release(v_end); ray_release(v_depth); ray_release(result); return ray_error("oom", NULL); }
+    result = tmp;
+    ray_release(v_start); ray_release(v_end); ray_release(v_depth);
+    return result;
+}
+
+/* exec_shortest_path: BFS from src to dst with parent tracking.
+ *
+ * Traverses rel->fwd (direction 0), rel->rev (direction 1) or both
+ * (direction 2), expanding at most ext->graph.max_depth BFS levels.
+ * src_val / dst_val may be atoms or vectors; only element 0 of a vector is
+ * used.
+ *
+ * Returns a 2-column table (_node, _depth) listing the path from src to dst,
+ * where _depth is the hop index starting at 0.  Errors:
+ *   "nyi"    - no extension record for this op
+ *   "schema" - the op carries no relation
+ *   "range"  - empty src/dst vector, node id outside the CSR bounds, or dst
+ *              not reached within max_depth levels
+ *   "oom"    - an allocation failed (including growth of the BFS queue)
+ */
+ray_t* exec_shortest_path(ray_graph_t* g, ray_op_t* op,
+                                 ray_t* src_val, ray_t* dst_val) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    ray_rel_t* rel = (ray_rel_t*)ext->graph.rel;
+    if (!rel) return ray_error("schema", NULL);
+    uint8_t direction = ext->graph.direction;
+    ray_csr_t* csr = (direction == 1) ? &rel->rev : &rel->fwd;
+    ray_csr_t* csr_rev = &rel->rev;
+    int n_csrs = (direction == 2) ? 2 : 1;
+    ray_csr_t* csrs[2] = { csr, csr_rev };
+    /* For direction==2, use the larger n_nodes bound of the two CSRs */
+    int64_t bfs_n_nodes = csr->n_nodes;
+    if (direction == 2 && csr_rev->n_nodes > bfs_n_nodes)
+        bfs_n_nodes = csr_rev->n_nodes;
+    uint8_t max_depth = ext->graph.max_depth;
+
+    /* Extract single I64 values */
+    int64_t src_node, dst_node;
+    if (ray_is_atom(src_val)) {
+        src_node = src_val->i64;
+    } else {
+        if (src_val->len == 0) return ray_error("range", NULL);
+        src_node = ((int64_t*)ray_data(src_val))[0];
+    }
+    if (ray_is_atom(dst_val)) {
+        dst_node = dst_val->i64;
+    } else {
+        if (dst_val->len == 0) return ray_error("range", NULL);
+        dst_node = ((int64_t*)ray_data(dst_val))[0];
+    }
+
+    if (src_node < 0 || src_node >= bfs_n_nodes ||
+        dst_node < 0 || dst_node >= bfs_n_nodes)
+        return ray_error("range", NULL);
+
+    /* Special case: src == dst -> single-row path at depth 0 */
+    if (src_node == dst_node) {
+        ray_t* v_node = ray_vec_from_raw(RAY_I64, &src_node, 1);
+        int64_t zero = 0;
+        ray_t* v_depth = ray_vec_from_raw(RAY_I64, &zero, 1);
+        if (!v_node || RAY_IS_ERR(v_node) || !v_depth || RAY_IS_ERR(v_depth)) {
+            if (v_node && !RAY_IS_ERR(v_node)) ray_release(v_node);
+            if (v_depth && !RAY_IS_ERR(v_depth)) ray_release(v_depth);
+            return ray_error("oom", NULL);
+        }
+        ray_t* result = ray_table_new(2);
+        if (!result || RAY_IS_ERR(result)) { ray_release(v_node); ray_release(v_depth); return ray_error("oom", NULL); }
+        ray_t* tmp = ray_table_add_col(result, sym_intern_safe("_node", 5), v_node);
+        if (!tmp || RAY_IS_ERR(tmp)) { ray_release(v_node); ray_release(v_depth); ray_release(result); return ray_error("oom", NULL); }
+        result = tmp;
+        tmp = ray_table_add_col(result, sym_intern_safe("_depth", 6), v_depth);
+        if (!tmp || RAY_IS_ERR(tmp)) { ray_release(v_node); ray_release(v_depth); ray_release(result); return ray_error("oom", NULL); }
+        result = tmp;
+        ray_release(v_node); ray_release(v_depth);
+        return result;
+    }
+
+    /* Allocate parent array (-1 = unvisited) */
+    ray_t* parent_hdr;
+    int64_t* parent = (int64_t*)scratch_alloc(&parent_hdr,
+                                               (size_t)bfs_n_nodes * sizeof(int64_t));
+    if (!parent) return ray_error("oom", NULL);
+    memset(parent, 0xFF, (size_t)bfs_n_nodes * sizeof(int64_t)); /* -1 */
+    parent[src_node] = src_node;
+
+    /* BFS queue: [q_start, q_end) is the frontier of the current level */
+    ray_t* queue_hdr;
+    int64_t q_cap = 1024;
+    int64_t* queue = (int64_t*)scratch_alloc(&queue_hdr, (size_t)q_cap * sizeof(int64_t));
+    if (!queue) { scratch_free(parent_hdr); return ray_error("oom", NULL); }
+    queue[0] = src_node;
+    int64_t q_start = 0, q_end = 1;
+    bool found = false;
+    bool oom = false;
+
+    /* The level counter is wider than max_depth: a uint8_t counter would wrap
+     * from 255 back to 0 and never fail `depth <= max_depth` when
+     * max_depth == 255.  The loop also stops as soon as the frontier is
+     * empty, since no further level can discover new nodes. */
+    for (unsigned depth = 1; depth <= max_depth && !found && q_start < q_end; depth++) {
+        int64_t level_end = q_end;
+        for (int64_t qi = q_start; qi < level_end && !found; qi++) {
+            int64_t node = queue[qi];
+            for (int ci = 0; ci < n_csrs && !found; ci++) {
+                ray_csr_t* cur_csr = csrs[ci];
+                if (node < 0 || node >= cur_csr->n_nodes) continue;
+                int64_t cnt;
+                int64_t* nbrs = ray_csr_neighbors(cur_csr, node, &cnt);
+                for (int64_t j = 0; j < cnt; j++) {
+                    int64_t nbr = nbrs[j];
+                    if (nbr < 0 || nbr >= bfs_n_nodes) continue;
+                    if (parent[nbr] != -1) continue;
+                    parent[nbr] = node;
+
+                    if (nbr == dst_node) { found = true; break; }
+
+                    /* Grow queue if needed */
+                    if (q_end >= q_cap) {
+                        if (q_cap > INT64_MAX / 2) { oom = true; goto bfs_done; }
+                        int64_t new_cap = q_cap * 2;
+                        int64_t* new_q = (int64_t*)scratch_realloc(&queue_hdr,
+                            (size_t)q_cap * sizeof(int64_t),
+                            (size_t)new_cap * sizeof(int64_t));
+                        if (!new_q) { oom = true; goto bfs_done; }
+                        queue = new_q;
+                        q_cap = new_cap;
+                    }
+                    queue[q_end++] = nbr;
+                }
+            } /* end for ci (CSR directions) */
+        }
+        q_start = level_end;
+    }
+
+bfs_done:
+    scratch_free(queue_hdr);
+
+    /* A search aborted by allocation failure says nothing about
+     * reachability, so report it as "oom" rather than "range". */
+    if (oom) {
+        scratch_free(parent_hdr);
+        return ray_error("oom", NULL);
+    }
+
+    if (!found) {
+        scratch_free(parent_hdr);
+        return ray_error("range", NULL);
+    }
+
+    /* Reconstruct path by walking parent links from dst back to src.
+     * path_buf holds up to 256 nodes; max_depth is a uint8_t, so a found
+     * path has at most 255 hops and the guard below is purely defensive. */
+    int64_t path_buf[256];
+    int64_t path_len = 0;
+    int64_t cur = dst_node;
+    while (cur != src_node && path_len < 255) {
+        path_buf[path_len++] = cur;
+        cur = parent[cur];
+    }
+    if (cur != src_node) {
+        scratch_free(parent_hdr);
+        return ray_error("range", "path exceeds 254 hops");
+    }
+    path_buf[path_len++] = src_node;
+    scratch_free(parent_hdr);
+
+    /* Reverse path so it runs src -> dst */
+    for (int64_t i = 0; i < path_len / 2; i++) {
+        int64_t tmp = path_buf[i];
+        path_buf[i] = path_buf[path_len - 1 - i];
+        path_buf[path_len - 1 - i] = tmp;
+    }
+
+    /* Build output table */
+    ray_t* v_node = ray_vec_from_raw(RAY_I64, path_buf, path_len);
+    ray_t* v_depth = ray_vec_new(RAY_I64, path_len);
+    if (!v_node || RAY_IS_ERR(v_node) || !v_depth || RAY_IS_ERR(v_depth)) {
+        if (v_node && !RAY_IS_ERR(v_node)) ray_release(v_node);
+        if (v_depth && !RAY_IS_ERR(v_depth)) ray_release(v_depth);
+        return ray_error("oom", NULL);
+    }
+    v_depth->len = path_len;
+    int64_t* dep_data = (int64_t*)ray_data(v_depth);
+    for (int64_t i = 0; i < path_len; i++) dep_data[i] = i;
+
+    int64_t node_sym  = ray_sym_intern("_node", 5);
+    int64_t depth_sym = ray_sym_intern("_depth", 6);
+    ray_t* result = ray_table_new(2);
+    if (!result || RAY_IS_ERR(result)) { ray_release(v_node); ray_release(v_depth); return ray_error("oom", NULL); }
+    ray_t* tmp = ray_table_add_col(result, node_sym, v_node);
+    if (!tmp || RAY_IS_ERR(tmp)) { ray_release(v_node); ray_release(v_depth); ray_release(result); return ray_error("oom", NULL); }
+    result = tmp;
+    tmp = ray_table_add_col(result, depth_sym, v_depth);
+    if (!tmp || RAY_IS_ERR(tmp)) { ray_release(v_node); ray_release(v_depth); ray_release(result); return ray_error("oom", NULL); }
+    result = tmp;
+    ray_release(v_node); ray_release(v_depth);
+    return result;
+}
+
+/* --------------------------------------------------------------------------
+ * exec_pagerank: iterative PageRank over CSR adjacency.
+ *
+ * rank[v] = (1 - d)/N + d * SUM(rank[u] / out_degree[u]) for u in in_neighbors(v)
+ *
+ * Uses reverse CSR for in-neighbors, forward CSR for out-degree.
+ * The rank held by zero-out-degree nodes is spread evenly over all N nodes
+ * on every iteration.  Runs exactly ext->graph.max_iter iterations (there is
+ * no convergence test).
+ *
+ * Returns a 2-column table: _node (I64, 0..N-1) and _rank (F64).  Errors:
+ *   "nyi"    - no extension record for this op
+ *   "schema" - the op carries no relation
+ *   "length" - the forward CSR has no nodes
+ *   "oom"    - an allocation or a column insertion failed
+ * -------------------------------------------------------------------------- */
+ray_t* exec_pagerank(ray_graph_t* g, ray_op_t* op) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    ray_rel_t* rel = (ray_rel_t*)ext->graph.rel;
+    if (!rel) return ray_error("schema", NULL);
+
+    int64_t n       = rel->fwd.n_nodes;
+    uint16_t iters  = ext->graph.max_iter;
+    double damping  = ext->graph.damping;
+
+    if (n <= 0) return ray_error("length", NULL);
+
+    /* Arena for all scratch memory — freed in one shot */
+    ray_scratch_arena_t arena;
+    ray_scratch_arena_init(&arena);
+
+    double* rank     = (double*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(double));
+    double* rank_new = (double*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(double));
+    if (!rank || !rank_new) {
+        ray_scratch_arena_reset(&arena);
+        return ray_error("oom", NULL);
+    }
+
+    /* Start from the uniform distribution */
+    double init = 1.0 / (double)n;
+    for (int64_t i = 0; i < n; i++) rank[i] = init;
+
+    /* Get raw CSR arrays for direct access */
+    int64_t* fwd_off = (int64_t*)ray_data(rel->fwd.offsets);
+    int64_t* rev_off = (int64_t*)ray_data(rel->rev.offsets);
+    int64_t* rev_tgt = (int64_t*)ray_data(rel->rev.targets);
+
+    double base = (1.0 - damping) / (double)n;
+
+    for (uint16_t iter = 0; iter < iters; iter++) {
+        /* Dangling node correction: redistribute rank of zero-out-degree nodes */
+        double dangling_sum = 0.0;
+        for (int64_t u = 0; u < n; u++) {
+            if (fwd_off[u + 1] == fwd_off[u]) dangling_sum += rank[u];
+        }
+        double adjusted_base = base + damping * dangling_sum / (double)n;
+
+        for (int64_t v = 0; v < n; v++) {
+            double sum = 0.0;
+            /* Iterate over in-neighbors of v using reverse CSR */
+            int64_t rev_start = rev_off[v];
+            int64_t rev_end   = rev_off[v + 1];
+            for (int64_t j = rev_start; j < rev_end; j++) {
+                int64_t u = rev_tgt[j];
+                /* out_degree of u from forward CSR */
+                int64_t out_deg = fwd_off[u + 1] - fwd_off[u];
+                if (out_deg > 0) {
+                    sum += rank[u] / (double)out_deg;
+                }
+            }
+            rank_new[v] = adjusted_base + damping * sum;
+        }
+        /* Swap buffers: rank always points at the latest iteration */
+        double* tmp = rank;
+        rank = rank_new;
+        rank_new = tmp;
+    }
+
+    /* Build output table: _node (I64), _rank (F64) */
+    ray_t* node_vec = ray_vec_new(RAY_I64, n);
+    ray_t* rank_vec = ray_vec_new(RAY_F64, n);
+    if (!node_vec || RAY_IS_ERR(node_vec) || !rank_vec || RAY_IS_ERR(rank_vec)) {
+        ray_scratch_arena_reset(&arena);
+        if (node_vec && !RAY_IS_ERR(node_vec)) ray_release(node_vec);
+        if (rank_vec && !RAY_IS_ERR(rank_vec)) ray_release(rank_vec);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t* ndata = (int64_t*)ray_data(node_vec);
+    double*  rdata = (double*)ray_data(rank_vec);
+    for (int64_t i = 0; i < n; i++) {
+        ndata[i] = i;
+        rdata[i] = rank[i];
+    }
+    node_vec->len = n;
+    rank_vec->len = n;
+
+    ray_scratch_arena_reset(&arena);
+
+    /* Package as table with named columns.  Each insertion is checked so a
+     * failure neither leaks the table nor feeds an error object into the
+     * next ray_table_add_col call. */
+    ray_t* result = ray_table_new(2);
+    if (!result || RAY_IS_ERR(result)) {
+        ray_release(node_vec);
+        ray_release(rank_vec);
+        return ray_error("oom", NULL);
+    }
+    ray_t* added = ray_table_add_col(result, sym_intern_safe("_node", 5), node_vec);
+    if (!added || RAY_IS_ERR(added)) {
+        ray_release(node_vec);
+        ray_release(rank_vec);
+        ray_release(result);
+        return ray_error("oom", NULL);
+    }
+    result = added;
+    added = ray_table_add_col(result, sym_intern_safe("_rank", 5), rank_vec);
+    if (!added || RAY_IS_ERR(added)) {
+        ray_release(node_vec);
+        ray_release(rank_vec);
+        ray_release(result);
+        return ray_error("oom", NULL);
+    }
+    result = added;
+    ray_release(node_vec);
+    ray_release(rank_vec);
+
+    return result;
+}
+
+/* --------------------------------------------------------------------------
+ * exec_connected_comp: connected components via label propagation.
+ * Treats graph as undirected (uses both forward and reverse CSR).
+ * O(diameter * |E|) time.
+ *
+ * Every node starts with its own id as label and repeatedly adopts the
+ * smallest label among itself and its neighbours until a full sweep changes
+ * nothing.
+ *
+ * Returns a 2-column table: _node (I64, 0..N-1) and _component (I64).
+ * Errors:
+ *   "nyi"    - no extension record for this op
+ *   "schema" - the op carries no relation
+ *   "length" - the forward CSR has no nodes
+ *   "oom"    - an allocation or a column insertion failed
+ * -------------------------------------------------------------------------- */
+ray_t* exec_connected_comp(ray_graph_t* g, ray_op_t* op) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    ray_rel_t* rel = (ray_rel_t*)ext->graph.rel;
+    if (!rel) return ray_error("schema", NULL);
+
+    int64_t n = rel->fwd.n_nodes;
+    if (n <= 0) return ray_error("length", NULL);
+
+    /* Arena for all scratch memory — freed in one shot */
+    ray_scratch_arena_t arena;
+    ray_scratch_arena_init(&arena);
+
+    int64_t* label = (int64_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(int64_t));
+    if (!label) {
+        ray_scratch_arena_reset(&arena);
+        return ray_error("oom", NULL);
+    }
+
+    /* Initialize: each node is its own component */
+    for (int64_t i = 0; i < n; i++) label[i] = i;
+
+    int64_t* fwd_off = (int64_t*)ray_data(rel->fwd.offsets);
+    int64_t* fwd_tgt = (int64_t*)ray_data(rel->fwd.targets);
+    int64_t* rev_off = (int64_t*)ray_data(rel->rev.offsets);
+    int64_t* rev_tgt = (int64_t*)ray_data(rel->rev.targets);
+
+    /* Iterate until convergence */
+    bool changed = true;
+    while (changed) {
+        changed = false;
+        for (int64_t v = 0; v < n; v++) {
+            int64_t min_label = label[v];
+            /* Forward neighbors */
+            for (int64_t j = fwd_off[v]; j < fwd_off[v + 1]; j++) {
+                int64_t u = fwd_tgt[j];
+                if (label[u] < min_label) min_label = label[u];
+            }
+            /* Reverse neighbors */
+            for (int64_t j = rev_off[v]; j < rev_off[v + 1]; j++) {
+                int64_t u = rev_tgt[j];
+                if (label[u] < min_label) min_label = label[u];
+            }
+            if (min_label < label[v]) {
+                label[v] = min_label;
+                changed = true;
+            }
+        }
+    }
+
+    /* Build output table */
+    ray_t* node_vec = ray_vec_new(RAY_I64, n);
+    ray_t* comp_vec = ray_vec_new(RAY_I64, n);
+    if (!node_vec || RAY_IS_ERR(node_vec) || !comp_vec || RAY_IS_ERR(comp_vec)) {
+        ray_scratch_arena_reset(&arena);
+        if (node_vec && !RAY_IS_ERR(node_vec)) ray_release(node_vec);
+        if (comp_vec && !RAY_IS_ERR(comp_vec)) ray_release(comp_vec);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t* ndata = (int64_t*)ray_data(node_vec);
+    int64_t* cdata = (int64_t*)ray_data(comp_vec);
+    for (int64_t i = 0; i < n; i++) {
+        ndata[i] = i;
+        cdata[i] = label[i];
+    }
+    node_vec->len = n;
+    comp_vec->len = n;
+
+    ray_scratch_arena_reset(&arena);
+
+    /* Each insertion is checked so a failure neither leaks the table nor
+     * feeds an error object into the next ray_table_add_col call. */
+    ray_t* result = ray_table_new(2);
+    if (!result || RAY_IS_ERR(result)) {
+        ray_release(node_vec);
+        ray_release(comp_vec);
+        return ray_error("oom", NULL);
+    }
+    ray_t* added = ray_table_add_col(result, sym_intern_safe("_node", 5), node_vec);
+    if (!added || RAY_IS_ERR(added)) {
+        ray_release(node_vec);
+        ray_release(comp_vec);
+        ray_release(result);
+        return ray_error("oom", NULL);
+    }
+    result = added;
+    added = ray_table_add_col(result, sym_intern_safe("_component", 10), comp_vec);
+    if (!added || RAY_IS_ERR(added)) {
+        ray_release(node_vec);
+        ray_release(comp_vec);
+        ray_release(result);
+        return ray_error("oom", NULL);
+    }
+    result = added;
+    ray_release(node_vec);
+    ray_release(comp_vec);
+
+    return result;
+}
+
+/* --------------------------------------------------------------------------
+ * exec_dijkstra: weighted shortest path via Dijkstra's algorithm.
+ * Uses a binary min-heap. Reads edge weights from CSR property table.
+ * Returns table with _node (I64), _dist (F64), _depth (I64).
+ * -------------------------------------------------------------------------- */
+
+/* Min-heap entry for Dijkstra: a (distance, node) pair stored by value in
+ * the heap array managed by dijk_heap_push / dijk_heap_pop. */
+typedef struct {
+    double   dist;   /* ordering key: the heap compares entries on this field only */
+    int64_t  node;   /* node id carried alongside the key */
+} dijk_entry_t;
+
+/* Insert (dist, node) into the binary min-heap stored in heap[0 .. *size)
+ * and increment *size.  The caller must have room for one more entry; no
+ * capacity check is done here. */
+static void dijk_heap_push(dijk_entry_t* heap, int64_t* size,
+                            double dist, int64_t node) {
+    dijk_entry_t fresh;
+    fresh.dist = dist;
+    fresh.node = node;
+
+    /* Open a hole at the end and bubble it up: ancestors with a larger key
+     * slide down one level until the new entry's position is found. */
+    int64_t hole = *size;
+    *size = hole + 1;
+    while (hole > 0) {
+        int64_t up = (hole - 1) / 2;
+        if (heap[up].dist <= fresh.dist) break;
+        heap[hole] = heap[up];
+        hole = up;
+    }
+    heap[hole] = fresh;
+}
+
+/* Remove and return the entry with the smallest dist from the binary
+ * min-heap in heap[0 .. *size), decrementing *size.  The heap must be
+ * non-empty on entry. */
+static dijk_entry_t dijk_heap_pop(dijk_entry_t* heap, int64_t* size) {
+    dijk_entry_t root = heap[0];
+    int64_t remaining = *size - 1;
+    *size = remaining;
+    if (remaining <= 0) return root;
+
+    /* Re-seat the former last entry: walk a hole down from the root,
+     * pulling up the smaller child until the entry fits. */
+    dijk_entry_t moved = heap[remaining];
+    int64_t hole = 0;
+    for (;;) {
+        int64_t lhs = 2 * hole + 1;
+        int64_t rhs = lhs + 1;
+        int64_t pick = hole;
+        double pick_dist = moved.dist;
+        if (lhs < remaining && heap[lhs].dist < pick_dist) {
+            pick = lhs;
+            pick_dist = heap[lhs].dist;
+        }
+        if (rhs < remaining && heap[rhs].dist < pick_dist) {
+            pick = rhs;
+        }
+        if (pick == hole) break;
+        heap[hole] = heap[pick];
+        hole = pick;
+    }
+    heap[hole] = moved;
+    return root;
+}
+
+/* Reusable Dijkstra with optional node/edge masks (for Yen's k-shortest).
+ *
+ * Runs over the forward CSR (fwd_off / fwd_tgt) with per-edge weights looked
+ * up through fwd_row, skipping any edge or target node flagged in the masks.
+ * All working arrays are supplied by the caller and fully re-initialised
+ * here.  On return dist[] and parent[] describe the search tree (parent is
+ * -1 for nodes never relaxed); the return value is dist[dst_id], which stays
+ * at 1e308 when dst_id was not reached. */
+static double dijkstra_masked(
+    int64_t* fwd_off, int64_t* fwd_tgt, int64_t* fwd_row,
+    double* weights, int64_t n,
+    int64_t src_id, int64_t dst_id,
+    bool* node_mask,    /* NULL or bool[n]: true = blocked */
+    bool* edge_mask,    /* NULL or bool[m]: true = blocked */
+    double* dist,       /* pre-allocated double[n] */
+    int64_t* parent,    /* pre-allocated int64_t[n] */
+    dijk_entry_t* heap, /* pre-allocated */
+    bool* visited)      /* pre-allocated bool[n] */
+{
+    /* Reset per-run state: 1e308 stands in for "infinitely far" */
+    for (int64_t k = n - 1; k >= 0; k--) {
+        visited[k] = false;
+        parent[k] = -1;
+        dist[k] = 1e308;
+    }
+
+    int64_t pending = 0;
+    dist[src_id] = 0.0;
+    dijk_heap_push(heap, &pending, 0.0, src_id);
+
+    while (pending > 0) {
+        const int64_t u = dijk_heap_pop(heap, &pending).node;
+
+        /* Lazy deletion: stale heap entries for settled nodes are dropped */
+        if (visited[u]) continue;
+        visited[u] = true;
+
+        if (u == dst_id) break;
+
+        const int64_t edge_end = fwd_off[u + 1];
+        for (int64_t e = fwd_off[u]; e < edge_end; e++) {
+            if (edge_mask != NULL && edge_mask[e]) continue;
+
+            const int64_t v = fwd_tgt[e];
+            if (node_mask != NULL && node_mask[v]) continue;
+
+            const double candidate = dist[u] + weights[fwd_row[e]];
+            if (!(candidate < dist[v])) continue;
+
+            dist[v] = candidate;
+            parent[v] = u;
+            dijk_heap_push(heap, &pending, candidate, v);
+        }
+    }
+
+    return dist[dst_id];
+}
+
+/* exec_dijkstra: single-source weighted shortest paths over the forward CSR.
+ *
+ * src_val is an atom or a vector (element 0 is used).  dst_val may be NULL,
+ * meaning "no destination": the search then runs until the heap is empty.
+ * With a destination the search stops as soon as it is settled, so the
+ * output can also contain nodes that were relaxed but not yet settled.
+ * Edge weights come from the F64 column ext->graph.weight_col_sym of
+ * rel->fwd.props, indexed through rel->fwd.rowmap.
+ *
+ * Returns a 3-column table: _node (I64), _dist (F64), _depth (I64 hop count
+ * along the recorded path), one row per node with dist < 1e308.  Errors:
+ *   "nyi"    - no extension record for this op
+ *   "schema" - no relation, no edge properties, or weight column missing /
+ *              not F64
+ *   "range"  - empty src/dst vector or node id outside [0, n_nodes)
+ *   "domain" - a negative edge weight was found
+ *   "oom"    - an allocation or a column insertion failed
+ */
+ray_t* exec_dijkstra(ray_graph_t* g, ray_op_t* op,
+                             ray_t* src_val, ray_t* dst_val) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    ray_rel_t* rel = (ray_rel_t*)ext->graph.rel;
+    if (!rel) return ray_error("schema", NULL);
+    if (!rel->fwd.props) return ray_error("schema", NULL); /* need edge properties */
+
+    int64_t n = rel->fwd.n_nodes;
+    int64_t m = rel->fwd.n_edges;
+
+    /* Only element 0 of a vector argument is used; an empty vector has no
+     * element 0, so reject it instead of reading past its data (same policy
+     * as exec_shortest_path). */
+    if (!ray_is_atom(src_val) && src_val->len == 0) return ray_error("range", NULL);
+    if (dst_val && !ray_is_atom(dst_val) && dst_val->len == 0) return ray_error("range", NULL);
+
+    int64_t src_id = ray_is_atom(src_val) ? src_val->i64 : ((int64_t*)ray_data(src_val))[0];
+    /* -1 is the "no destination" sentinel */
+    int64_t dst_id = !dst_val ? -1 : ray_is_atom(dst_val) ? dst_val->i64 : ((int64_t*)ray_data(dst_val))[0];
+
+    if (src_id < 0 || src_id >= n) return ray_error("range", NULL);
+    if (dst_id != -1 && (dst_id < 0 || dst_id >= n)) return ray_error("range", NULL);
+
+    /* Find weight column in edge properties */
+    int64_t weight_sym = ext->graph.weight_col_sym;
+    ray_t* props = rel->fwd.props;
+    ray_t* weight_vec = ray_table_get_col(props, weight_sym);
+    if (!weight_vec || RAY_IS_ERR(weight_vec)) return ray_error("schema", NULL);
+    if (weight_vec->type != RAY_F64) return ray_error("schema", NULL);
+    double* weights = (double*)ray_data(weight_vec);
+
+    /* Dijkstra requires non-negative edge weights */
+    for (int64_t i = 0; i < m; i++) {
+        if (weights[i] < 0.0)
+            return ray_error("domain", "Dijkstra requires non-negative edge weights");
+    }
+
+    /* Allocate working arrays.
+     * Heap capacity = max(n, m) + 1: each edge relaxation can push one entry,
+     * and with lazy deletion (visited check on pop) the heap can grow up to m. */
+    int64_t heap_cap = (m > n ? m : n) + 1;
+
+    /* Arena for all scratch memory — freed in one shot */
+    ray_scratch_arena_t arena;
+    ray_scratch_arena_init(&arena);
+
+    double*  dist    = (double*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(double));
+    bool*    visited = (bool*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(bool));
+    int64_t* depth   = (int64_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(int64_t));
+    dijk_entry_t* heap = (dijk_entry_t*)ray_scratch_arena_push(&arena,
+                              (size_t)heap_cap * sizeof(dijk_entry_t));
+    if (!dist || !visited || !depth || !heap) {
+        ray_scratch_arena_reset(&arena);
+        return ray_error("oom", NULL);
+    }
+    memset(visited, 0, (size_t)n * sizeof(bool));
+    memset(depth, 0, (size_t)n * sizeof(int64_t));
+
+    for (int64_t i = 0; i < n; i++) {
+        dist[i] = 1e308;  /* infinity */
+    }
+    dist[src_id] = 0.0;
+
+    int64_t heap_size = 0;
+    dijk_heap_push(heap, &heap_size, 0.0, src_id);
+
+    int64_t* fwd_off = (int64_t*)ray_data(rel->fwd.offsets);
+    int64_t* fwd_tgt = (int64_t*)ray_data(rel->fwd.targets);
+    int64_t* fwd_row = (int64_t*)ray_data(rel->fwd.rowmap);
+
+    while (heap_size > 0) {
+        dijk_entry_t top = dijk_heap_pop(heap, &heap_size);
+        int64_t u = top.node;
+        if (visited[u]) continue;  /* stale entry for an already-settled node */
+        visited[u] = true;
+
+        if (u == dst_id) break;  /* early exit if destination reached */
+
+        for (int64_t j = fwd_off[u]; j < fwd_off[u + 1]; j++) {
+            int64_t v = fwd_tgt[j];
+            int64_t edge_row = fwd_row[j];
+            double w = weights[edge_row];
+            double new_dist = dist[u] + w;
+            if (new_dist < dist[v]) {
+                dist[v] = new_dist;
+                depth[v] = depth[u] + 1;
+                dijk_heap_push(heap, &heap_size, new_dist, v);
+            }
+        }
+    }
+
+    /* Collect reachable nodes */
+    int64_t count = 0;
+    for (int64_t i = 0; i < n; i++) {
+        if (dist[i] < 1e308) count++;
+    }
+
+    ray_t* node_vec  = ray_vec_new(RAY_I64, count);
+    ray_t* dist_vec  = ray_vec_new(RAY_F64, count);
+    ray_t* depth_vec = ray_vec_new(RAY_I64, count);
+    if (!node_vec || RAY_IS_ERR(node_vec) ||
+        !dist_vec || RAY_IS_ERR(dist_vec) ||
+        !depth_vec || RAY_IS_ERR(depth_vec)) {
+        ray_scratch_arena_reset(&arena);
+        if (node_vec && !RAY_IS_ERR(node_vec)) ray_release(node_vec);
+        if (dist_vec && !RAY_IS_ERR(dist_vec)) ray_release(dist_vec);
+        if (depth_vec && !RAY_IS_ERR(depth_vec)) ray_release(depth_vec);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t* ndata = (int64_t*)ray_data(node_vec);
+    double*  ddata = (double*)ray_data(dist_vec);
+    int64_t* hdata = (int64_t*)ray_data(depth_vec);
+    int64_t idx = 0;
+    for (int64_t i = 0; i < n; i++) {
+        if (dist[i] < 1e308) {
+            ndata[idx] = i;
+            ddata[idx] = dist[i];
+            hdata[idx] = depth[i];
+            idx++;
+        }
+    }
+    node_vec->len = count;
+    dist_vec->len = count;
+    depth_vec->len = count;
+
+    ray_scratch_arena_reset(&arena);
+
+    /* Each insertion is checked so a failure neither leaks the table nor
+     * feeds an error object into the next ray_table_add_col call. */
+    ray_t* result = ray_table_new(3);
+    if (!result || RAY_IS_ERR(result)) {
+        ray_release(node_vec);
+        ray_release(dist_vec);
+        ray_release(depth_vec);
+        return ray_error("oom", NULL);
+    }
+    ray_t* added = ray_table_add_col(result, sym_intern_safe("_node", 5), node_vec);
+    if (!added || RAY_IS_ERR(added)) {
+        ray_release(node_vec); ray_release(dist_vec); ray_release(depth_vec);
+        ray_release(result);
+        return ray_error("oom", NULL);
+    }
+    result = added;
+    added = ray_table_add_col(result, sym_intern_safe("_dist", 5), dist_vec);
+    if (!added || RAY_IS_ERR(added)) {
+        ray_release(node_vec); ray_release(dist_vec); ray_release(depth_vec);
+        ray_release(result);
+        return ray_error("oom", NULL);
+    }
+    result = added;
+    added = ray_table_add_col(result, sym_intern_safe("_depth", 6), depth_vec);
+    if (!added || RAY_IS_ERR(added)) {
+        ray_release(node_vec); ray_release(dist_vec); ray_release(depth_vec);
+        ray_release(result);
+        return ray_error("oom", NULL);
+    }
+    result = added;
+    ray_release(node_vec);
+    ray_release(dist_vec);
+    ray_release(depth_vec);
+
+    return result;
+}
+
+/* exec_wco_join: Worst-Case Optimal Join via general Leapfrog Triejoin.
+ *
+ * Joins the ext->wco.n_rels relations over ext->wco.n_vars variables and
+ * returns a table with one I64 column per variable, named _v0, _v1, ...
+ * Errors:
+ *   "nyi"    - no extension record, too many variables, or no plan built
+ *   "schema" - no relations supplied
+ *   "domain" - a relation is missing or has an unsorted fwd/rev CSR
+ *   "oom"    - an allocation, the enumeration, or a column insertion failed
+ */
+ray_t* exec_wco_join(ray_graph_t* g, ray_op_t* op) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (ext == NULL) return ray_error("nyi", NULL);
+
+    ray_rel_t** rels = (ray_rel_t**)ext->wco.rels;
+    const uint8_t n_rels = ext->wco.n_rels;
+    const uint8_t n_vars = ext->wco.n_vars;
+
+    if (rels == NULL || n_rels == 0) return ray_error("schema", NULL);
+    if (n_vars > LFTJ_MAX_VARS) return ray_error("nyi", NULL);
+
+    /* LFTJ may walk either direction, so both CSRs of every relation must
+     * be sorted. */
+    for (int r = 0; r < n_rels; r++) {
+        ray_rel_t* rel = rels[r];
+        bool usable = rel != NULL && rel->fwd.sorted && rel->rev.sorted;
+        if (!usable) return ray_error("domain", NULL);
+    }
+
+    /* Binding plan */
+    lftj_enum_ctx_t ctx;
+    memset(&ctx, 0, sizeof(ctx));
+    if (!lftj_build_default_plan(&ctx, rels, n_rels, n_vars))
+        return ray_error("nyi", NULL);
+
+    /* Output buffers.  ctx.buf_hdrs[live_lo .. live_hi) are the per-variable
+     * buffers that are currently allocated and not yet handed back; the
+     * shared failure path below releases exactly that range. */
+    const int64_t initial_cap = 4096;
+    ray_t* col_data_block;
+    ray_t* result = NULL;
+    int live_lo = 0;
+    int live_hi = 0;
+
+    int64_t** col_data = (int64_t**)scratch_alloc(&col_data_block,
+                              (size_t)n_vars * sizeof(int64_t*));
+    if (col_data == NULL) goto fail;
+
+    while (live_hi < n_vars) {
+        ray_t* buf = ray_alloc((size_t)initial_cap * sizeof(int64_t));
+        if (buf == NULL) goto fail;
+        ctx.buf_hdrs[live_hi] = buf;
+        col_data[live_hi] = (int64_t*)ray_data(buf);
+        live_hi++;
+    }
+
+    ctx.col_data = col_data;
+    ctx.out_count = 0;
+    ctx.out_cap = initial_cap;
+    ctx.oom = false;
+
+    /* Run general LFTJ enumeration */
+    lftj_enumerate(&ctx, 0);
+    if (ctx.oom) goto fail;
+
+    /* Build output table */
+    result = ray_table_new(n_vars);
+    if (result == NULL || RAY_IS_ERR(result)) {
+        result = NULL;  /* nothing to release on the failure path */
+        goto fail;
+    }
+
+    for (int v = 0; v < n_vars; v++) {
+        /* Copy the column out, then drop its raw buffer straight away */
+        ray_t* vec = ray_vec_from_raw(RAY_I64, ctx.col_data[v], ctx.out_count);
+        ray_free(ctx.buf_hdrs[v]);
+        live_lo = v + 1;
+        if (vec == NULL || RAY_IS_ERR(vec)) goto fail;
+
+        char name_buf[12];
+        int name_len = snprintf(name_buf, sizeof(name_buf), "_v%d", v);
+        int64_t name_id = ray_sym_intern(name_buf, (size_t)name_len);
+
+        ray_t* grown = ray_table_add_col(result, name_id, vec);
+        ray_release(vec);
+        if (grown == NULL || RAY_IS_ERR(grown)) goto fail;
+        result = grown;
+    }
+
+    scratch_free(col_data_block);
+    return result;
+
+fail:
+    for (int j = live_lo; j < live_hi; j++) ray_free(ctx.buf_hdrs[j]);
+    scratch_free(col_data_block);
+    if (result != NULL) ray_release(result);
+    return ray_error("oom", NULL);
+}
+
+/* --------------------------------------------------------------------------
+ * exec_louvain: community detection via Louvain modularity optimization.
+ * Phase 1 only (local moving; no graph contraction).
+ * Maximizes modularity Q = (1/2m) * SUM[(A_ij - k_i*k_j/2m) * delta(c_i, c_j)]
+ * Treats graph as undirected. Uses forward+reverse CSR.
+ *
+ * The relation and the iteration cap (max_iter) are read from the op's
+ * extension record.
+ *
+ * Returns a 2-column table (_node, _community) where community ids are
+ * renumbered to 0..k-1 in order of first appearance, or an error:
+ *   "nyi"    - no extension record found for the op
+ *   "schema" - extension has no relation
+ *   "length" - relation has no nodes
+ *   "oom"    - a scratch/vector allocation or a table operation failed
+ * -------------------------------------------------------------------------- */
+ray_t* exec_louvain(ray_graph_t* g, ray_op_t* op) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    ray_rel_t* rel = (ray_rel_t*)ext->graph.rel;
+    if (!rel) return ray_error("schema", NULL);
+
+    int64_t n = rel->fwd.n_nodes;
+    int64_t m = rel->fwd.n_edges;
+    uint16_t max_iter = ext->graph.max_iter;
+
+    if (n <= 0) return ray_error("length", NULL);
+
+    /* Arena for all scratch memory — freed in one shot */
+    ray_scratch_arena_t arena;
+    ray_scratch_arena_init(&arena);
+
+    int64_t* community = (int64_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(int64_t));
+    int64_t* degree    = (int64_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(int64_t));
+    int64_t* comm_tot  = (int64_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(int64_t));
+    if (!community || !degree || !comm_tot) {
+        ray_scratch_arena_reset(&arena);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t* fwd_off = (int64_t*)ray_data(rel->fwd.offsets);
+    int64_t* fwd_tgt = (int64_t*)ray_data(rel->fwd.targets);
+    int64_t* rev_off = (int64_t*)ray_data(rel->rev.offsets);
+    int64_t* rev_tgt = (int64_t*)ray_data(rel->rev.targets);
+
+    /* Initialize: each node in its own community.
+     * degree[i] is out-degree + in-degree (undirected view); comm_tot[c]
+     * tracks the summed degree of the nodes currently assigned to c. */
+    for (int64_t i = 0; i < n; i++) {
+        community[i] = i;
+        degree[i] = (fwd_off[i+1] - fwd_off[i]) + (rev_off[i+1] - rev_off[i]);
+        comm_tot[i] = degree[i];
+    }
+
+    /* Guard the divisions below when the graph has no edges. */
+    double two_m = (double)(2 * m);
+    if (two_m == 0) two_m = 1;
+
+    /* Scratch space for per-community edge counts (reused across iterations).
+     * k_i_in[c] = number of edges from node v to community c. */
+    int64_t* k_i_in = (int64_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(int64_t));
+    /* Track which communities were touched so we can reset k_i_in efficiently */
+    int64_t* touched = (int64_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(int64_t));
+    if (!k_i_in || !touched) {
+        ray_scratch_arena_reset(&arena);
+        return ray_error("oom", NULL);
+    }
+    memset(k_i_in, 0, (size_t)n * sizeof(int64_t));
+
+    /* Local-moving sweeps: stop after max_iter passes or when a full pass
+     * moves no node. */
+    for (uint16_t iter = 0; iter < max_iter; iter++) {
+        bool moved = false;
+        for (int64_t v = 0; v < n; v++) {
+            int64_t old_comm = community[v];
+            int64_t n_touched = 0;
+
+            /* Aggregate edges per neighbor community (forward + reverse).
+             * Neighbors already in v's own community are skipped. */
+            for (int64_t j = fwd_off[v]; j < fwd_off[v + 1]; j++) {
+                int64_t c = community[fwd_tgt[j]];
+                if (c == old_comm) continue;
+                if (k_i_in[c] == 0) touched[n_touched++] = c;
+                k_i_in[c]++;
+            }
+            for (int64_t j = rev_off[v]; j < rev_off[v + 1]; j++) {
+                int64_t c = community[rev_tgt[j]];
+                if (c == old_comm) continue;
+                if (k_i_in[c] == 0) touched[n_touched++] = c;
+                k_i_in[c]++;
+            }
+
+            /* Evaluate modularity gain for each candidate community.
+             * delta_Q = k_i_in[c] / two_m - (sigma_tot[c] * k_v) / (two_m * two_m)
+             *
+             * NOTE(review): the baseline is a gain of 0 (v treated as isolated);
+             * the gain of staying in old_comm is never evaluated. Textbook
+             * Louvain compares against re-inserting v into old_comm — confirm
+             * whether this simplification is intended. */
+            int64_t best_comm = old_comm;
+            double best_gain = 0.0;
+            double k_v = (double)degree[v];
+
+            for (int64_t t = 0; t < n_touched; t++) {
+                int64_t c = touched[t];
+                double sigma_tot = (double)comm_tot[c];
+                double gain = (double)k_i_in[c] / two_m
+                            - (sigma_tot * k_v) / (two_m * two_m);
+                if (gain > best_gain) {
+                    best_gain = gain;
+                    best_comm = c;
+                }
+            }
+
+            /* Reset k_i_in for touched communities */
+            for (int64_t t = 0; t < n_touched; t++) {
+                k_i_in[touched[t]] = 0;
+            }
+
+            if (best_comm != old_comm) {
+                comm_tot[old_comm] -= degree[v];
+                comm_tot[best_comm] += degree[v];
+                community[v] = best_comm;
+                moved = true;
+            }
+        }
+        if (!moved) break;
+    }
+
+    /* Normalize community IDs to 0..k-1 */
+    int64_t* remap = (int64_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(int64_t));
+    if (!remap) {
+        ray_scratch_arena_reset(&arena);
+        return ray_error("oom", NULL);
+    }
+    for (int64_t i = 0; i < n; i++) remap[i] = -1;
+    int64_t next_id = 0;
+    for (int64_t i = 0; i < n; i++) {
+        int64_t c = community[i];
+        if (remap[c] < 0) remap[c] = next_id++;
+        community[i] = remap[c];
+    }
+
+    /* Build output table */
+    ray_t* node_vec = ray_vec_new(RAY_I64, n);
+    ray_t* comm_vec = ray_vec_new(RAY_I64, n);
+    if (!node_vec || RAY_IS_ERR(node_vec) || !comm_vec || RAY_IS_ERR(comm_vec)) {
+        ray_scratch_arena_reset(&arena);
+        if (node_vec && !RAY_IS_ERR(node_vec)) ray_release(node_vec);
+        if (comm_vec && !RAY_IS_ERR(comm_vec)) ray_release(comm_vec);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t* ndata = (int64_t*)ray_data(node_vec);
+    int64_t* cdata = (int64_t*)ray_data(comm_vec);
+    for (int64_t i = 0; i < n; i++) {
+        ndata[i] = i;
+        cdata[i] = community[i];
+    }
+    node_vec->len = n;
+    comm_vec->len = n;
+
+    /* Results are copied out; scratch memory is no longer needed. */
+    ray_scratch_arena_reset(&arena);
+
+    ray_t* result = ray_table_new(2);
+    if (!result || RAY_IS_ERR(result)) {
+        ray_release(node_vec);
+        ray_release(comm_vec);
+        return ray_error("oom", NULL);
+    }
+
+    /* Check every ray_table_add_col result: on failure release the table we
+     * still hold instead of leaking it and feeding an error value into the
+     * next call (same checked pattern as exec_betweenness). */
+    ray_t* tmp = ray_table_add_col(result, sym_intern_safe("_node", 5), node_vec);
+    ray_release(node_vec);
+    if (!tmp || RAY_IS_ERR(tmp)) {
+        ray_release(result);
+        ray_release(comm_vec);
+        return ray_error("oom", NULL);
+    }
+    result = tmp;
+
+    tmp = ray_table_add_col(result, sym_intern_safe("_community", 10), comm_vec);
+    ray_release(comm_vec);
+    if (!tmp || RAY_IS_ERR(tmp)) {
+        ray_release(result);
+        return ray_error("oom", NULL);
+    }
+    result = tmp;
+
+    return result;
+}
+
+/* --------------------------------------------------------------------------
+ * exec_degree_cent: in/out/total degree from CSR offsets. O(n).
+ *
+ * Out-degree is read from the forward CSR offsets, in-degree from the
+ * reverse CSR offsets; total degree is their sum.
+ *
+ * Returns a 4-column table (_node, _in_degree, _out_degree, _degree), or an
+ * error:
+ *   "nyi"    - no extension record found for the op
+ *   "schema" - extension has no relation
+ *   "length" - relation has no nodes
+ *   "oom"    - a vector allocation or a table operation failed
+ * -------------------------------------------------------------------------- */
+ray_t* exec_degree_cent(ray_graph_t* g, ray_op_t* op) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    ray_rel_t* rel = (ray_rel_t*)ext->graph.rel;
+    if (!rel) return ray_error("schema", NULL);
+
+    int64_t n = rel->fwd.n_nodes;
+    if (n <= 0) return ray_error("length", NULL);
+
+    int64_t* fwd_off = (int64_t*)ray_data(rel->fwd.offsets);
+    int64_t* rev_off = (int64_t*)ray_data(rel->rev.offsets);
+
+    ray_t* node_vec = ray_vec_new(RAY_I64, n);
+    ray_t* in_vec   = ray_vec_new(RAY_I64, n);
+    ray_t* out_vec  = ray_vec_new(RAY_I64, n);
+    ray_t* deg_vec  = ray_vec_new(RAY_I64, n);
+    if (!node_vec || RAY_IS_ERR(node_vec) ||
+        !in_vec   || RAY_IS_ERR(in_vec)   ||
+        !out_vec  || RAY_IS_ERR(out_vec)  ||
+        !deg_vec  || RAY_IS_ERR(deg_vec)) {
+        /* Release only the vectors that were actually created. */
+        if (node_vec && !RAY_IS_ERR(node_vec)) ray_release(node_vec);
+        if (in_vec   && !RAY_IS_ERR(in_vec))   ray_release(in_vec);
+        if (out_vec  && !RAY_IS_ERR(out_vec))  ray_release(out_vec);
+        if (deg_vec  && !RAY_IS_ERR(deg_vec))  ray_release(deg_vec);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t* ndata   = (int64_t*)ray_data(node_vec);
+    int64_t* in_data = (int64_t*)ray_data(in_vec);
+    int64_t* out_data= (int64_t*)ray_data(out_vec);
+    int64_t* deg_data= (int64_t*)ray_data(deg_vec);
+
+    /* A node's degree is the width of its CSR offset slice. */
+    for (int64_t i = 0; i < n; i++) {
+        ndata[i]    = i;
+        out_data[i] = fwd_off[i + 1] - fwd_off[i];
+        in_data[i]  = rev_off[i + 1] - rev_off[i];
+        deg_data[i] = out_data[i] + in_data[i];
+    }
+    node_vec->len = n;
+    in_vec->len   = n;
+    out_vec->len  = n;
+    deg_vec->len  = n;
+
+    ray_t* result = ray_table_new(4);
+    if (!result || RAY_IS_ERR(result)) {
+        ray_release(node_vec); ray_release(in_vec);
+        ray_release(out_vec);  ray_release(deg_vec);
+        return ray_error("oom", NULL);
+    }
+
+    /* Attach the columns in output order. Every ray_table_add_col result is
+     * checked: on failure the columns not yet attached and the table we still
+     * hold are released (same checked pattern as exec_betweenness) rather
+     * than leaking them and passing an error value to the next call. */
+    ray_t* cols[4] = { node_vec, in_vec, out_vec, deg_vec };
+    static const char* const col_names[4] = {
+        "_node", "_in_degree", "_out_degree", "_degree"
+    };
+    static const size_t col_lens[4] = { 5, 10, 11, 7 };
+
+    for (int c = 0; c < 4; c++) {
+        ray_t* next = ray_table_add_col(result,
+                                        sym_intern_safe(col_names[c], col_lens[c]),
+                                        cols[c]);
+        ray_release(cols[c]);
+        if (!next || RAY_IS_ERR(next)) {
+            for (int r = c + 1; r < 4; r++) ray_release(cols[r]);
+            ray_release(result);
+            return ray_error("oom", NULL);
+        }
+        result = next;
+    }
+
+    return result;
+}
+
+/* --------------------------------------------------------------------------
+ * exec_topsort: topological sort via Kahn's algorithm. O(n+m).
+ * Returns error if graph contains a cycle.
+ *
+ * In-degrees come from the reverse CSR offsets; successors are walked via
+ * the forward CSR.
+ *
+ * Returns a 2-column table (_node, _order) where _order[i] is the position
+ * of node i in the topological order, or an error:
+ *   "nyi"    - no extension record found for the op
+ *   "schema" - extension has no relation
+ *   "length" - relation has no nodes
+ *   "domain" - not every node could be ordered (cycle)
+ *   "oom"    - a scratch/vector allocation or a table operation failed
+ * -------------------------------------------------------------------------- */
+ray_t* exec_topsort(ray_graph_t* g, ray_op_t* op) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    ray_rel_t* rel = (ray_rel_t*)ext->graph.rel;
+    if (!rel) return ray_error("schema", NULL);
+
+    int64_t n = rel->fwd.n_nodes;
+    if (n <= 0) return ray_error("length", NULL);
+
+    int64_t* fwd_off = (int64_t*)ray_data(rel->fwd.offsets);
+    int64_t* fwd_tgt = (int64_t*)ray_data(rel->fwd.targets);
+    int64_t* rev_off = (int64_t*)ray_data(rel->rev.offsets);
+
+    ray_scratch_arena_t arena;
+    ray_scratch_arena_init(&arena);
+
+    int64_t* in_deg = (int64_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(int64_t));
+    int64_t* queue  = (int64_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(int64_t));
+    int64_t* order  = (int64_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(int64_t));
+    if (!in_deg || !queue || !order) {
+        ray_scratch_arena_reset(&arena);
+        return ray_error("oom", NULL);
+    }
+
+    /* Compute in-degrees from reverse CSR */
+    for (int64_t i = 0; i < n; i++)
+        in_deg[i] = rev_off[i + 1] - rev_off[i];
+
+    /* Enqueue zero-degree nodes */
+    int64_t head = 0, tail = 0;
+    for (int64_t i = 0; i < n; i++) {
+        if (in_deg[i] == 0) queue[tail++] = i;
+    }
+
+    /* BFS — decrement in-degrees, enqueue new zeros.
+     * order[v] records the rank at which v was dequeued. */
+    int64_t count = 0;
+    while (head < tail) {
+        int64_t v = queue[head++];
+        order[v] = count++;
+
+        int64_t start = fwd_off[v];
+        int64_t end   = fwd_off[v + 1];
+        for (int64_t j = start; j < end; j++) {
+            int64_t u = fwd_tgt[j];
+            if (--in_deg[u] == 0) queue[tail++] = u;
+        }
+    }
+
+    /* Cycle detection: not all nodes processed */
+    if (count < n) {
+        ray_scratch_arena_reset(&arena);
+        return ray_error("domain", NULL);  /* cycle detected */
+    }
+
+    /* Build result */
+    ray_t* node_vec  = ray_vec_new(RAY_I64, n);
+    ray_t* order_vec = ray_vec_new(RAY_I64, n);
+    if (!node_vec || RAY_IS_ERR(node_vec) || !order_vec || RAY_IS_ERR(order_vec)) {
+        ray_scratch_arena_reset(&arena);
+        if (node_vec && !RAY_IS_ERR(node_vec)) ray_release(node_vec);
+        if (order_vec && !RAY_IS_ERR(order_vec)) ray_release(order_vec);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t* ndata = (int64_t*)ray_data(node_vec);
+    int64_t* odata = (int64_t*)ray_data(order_vec);
+    for (int64_t i = 0; i < n; i++) {
+        ndata[i] = i;
+        odata[i] = order[i];
+    }
+    node_vec->len  = n;
+    order_vec->len = n;
+
+    /* Results are copied out; scratch memory is no longer needed. */
+    ray_scratch_arena_reset(&arena);
+
+    ray_t* result = ray_table_new(2);
+    if (!result || RAY_IS_ERR(result)) {
+        ray_release(node_vec); ray_release(order_vec);
+        return ray_error("oom", NULL);
+    }
+
+    /* Check every ray_table_add_col result: on failure release the table we
+     * still hold instead of leaking it and feeding an error value into the
+     * next call (same checked pattern as exec_betweenness). */
+    ray_t* tmp = ray_table_add_col(result, sym_intern_safe("_node", 5), node_vec);
+    ray_release(node_vec);
+    if (!tmp || RAY_IS_ERR(tmp)) {
+        ray_release(result);
+        ray_release(order_vec);
+        return ray_error("oom", NULL);
+    }
+    result = tmp;
+
+    tmp = ray_table_add_col(result, sym_intern_safe("_order", 6), order_vec);
+    ray_release(order_vec);
+    if (!tmp || RAY_IS_ERR(tmp)) {
+        ray_release(result);
+        return ray_error("oom", NULL);
+    }
+    result = tmp;
+
+    return result;
+}
+
+/* --------------------------------------------------------------------------
+ * exec_cluster_coeff: clustering coefficient via triangle counting. O(n*d^2).
+ * For each node v, collect its deduplicated undirected neighbor set
+ * (forward + reverse CSR) using a byte-per-node membership array, then count
+ * forward edges that run between two members of that set.
+ *
+ * coefficient[v] = edges_between_neighbors / (deg * (deg - 1)), or 0.0 when
+ * v has fewer than two distinct neighbors.
+ *
+ * Returns a 2-column table (_node, _coefficient), or an error:
+ *   "nyi"    - no extension record found for the op
+ *   "schema" - extension has no relation
+ *   "length" - relation has no nodes
+ *   "oom"    - a scratch/vector allocation or a table operation failed
+ * -------------------------------------------------------------------------- */
+ray_t* exec_cluster_coeff(ray_graph_t* g, ray_op_t* op) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    ray_rel_t* rel = (ray_rel_t*)ext->graph.rel;
+    if (!rel) return ray_error("schema", NULL);
+
+    int64_t n = rel->fwd.n_nodes;
+    if (n <= 0) return ray_error("length", NULL);
+
+    ray_scratch_arena_t arena;
+    ray_scratch_arena_init(&arena);
+
+    /* Scratch: merged neighbor list per node (max possible size = n) */
+    int64_t* nbrs = (int64_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(int64_t));
+    /* Scratch: quick-lookup set for neighbor checking */
+    uint8_t* in_nbr = (uint8_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(uint8_t));
+    if (!nbrs || !in_nbr) {
+        ray_scratch_arena_reset(&arena);
+        return ray_error("oom", NULL);
+    }
+    memset(in_nbr, 0, (size_t)n * sizeof(uint8_t));
+
+    int64_t* fwd_off = (int64_t*)ray_data(rel->fwd.offsets);
+    int64_t* fwd_tgt = (int64_t*)ray_data(rel->fwd.targets);
+    int64_t* rev_off = (int64_t*)ray_data(rel->rev.offsets);
+    int64_t* rev_tgt = (int64_t*)ray_data(rel->rev.targets);
+
+    /* Allocate result vectors */
+    ray_t* node_vec = ray_vec_new(RAY_I64, n);
+    ray_t* lcc_vec  = ray_vec_new(RAY_F64, n);
+    if (!node_vec || RAY_IS_ERR(node_vec) || !lcc_vec || RAY_IS_ERR(lcc_vec)) {
+        ray_scratch_arena_reset(&arena);
+        if (node_vec && !RAY_IS_ERR(node_vec)) ray_release(node_vec);
+        if (lcc_vec  && !RAY_IS_ERR(lcc_vec))  ray_release(lcc_vec);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t* ndata = (int64_t*)ray_data(node_vec);
+    double*  ldata = (double*)ray_data(lcc_vec);
+
+    for (int64_t v = 0; v < n; v++) {
+        ndata[v] = v;
+
+        /* Merge forward and reverse neighbors into deduplicated list.
+         * Out-of-range targets are ignored. */
+        int64_t deg = 0;
+        for (int64_t j = fwd_off[v]; j < fwd_off[v + 1]; j++) {
+            int64_t u = fwd_tgt[j];
+            if (u >= 0 && u < n && !in_nbr[u]) {
+                in_nbr[u] = 1;
+                nbrs[deg++] = u;
+            }
+        }
+        for (int64_t j = rev_off[v]; j < rev_off[v + 1]; j++) {
+            int64_t u = rev_tgt[j];
+            if (u >= 0 && u < n && !in_nbr[u]) {
+                in_nbr[u] = 1;
+                nbrs[deg++] = u;
+            }
+        }
+
+        if (deg < 2) {
+            ldata[v] = 0.0;
+        } else {
+            /* Count directed fwd edges between neighbors of v.
+             * NOTE(review): a self-loop on v or on a neighbor is counted like
+             * any other edge here — confirm that is intended. */
+            int64_t triangles = 0;
+            for (int64_t i = 0; i < deg; i++) {
+                int64_t u = nbrs[i];
+                /* Check fwd edges of u against neighbor set. Apply the same
+                 * range guard as when the set was built so an out-of-range
+                 * target cannot index past in_nbr. */
+                for (int64_t j = fwd_off[u]; j < fwd_off[u + 1]; j++) {
+                    int64_t w = fwd_tgt[j];
+                    if (w >= 0 && w < n && in_nbr[w]) triangles++;
+                }
+            }
+            ldata[v] = (double)triangles / ((double)deg * (double)(deg - 1));
+        }
+
+        /* Reset in_nbr for next node */
+        for (int64_t i = 0; i < deg; i++) in_nbr[nbrs[i]] = 0;
+    }
+
+    node_vec->len = n;
+    lcc_vec->len  = n;
+
+    ray_scratch_arena_reset(&arena);
+
+    ray_t* result = ray_table_new(2);
+    if (!result || RAY_IS_ERR(result)) {
+        ray_release(node_vec);
+        ray_release(lcc_vec);
+        return ray_error("oom", NULL);
+    }
+
+    /* Check every ray_table_add_col result: on failure release the table we
+     * still hold instead of leaking it and feeding an error value into the
+     * next call (same checked pattern as exec_betweenness). */
+    ray_t* tmp = ray_table_add_col(result, sym_intern_safe("_node", 5), node_vec);
+    ray_release(node_vec);
+    if (!tmp || RAY_IS_ERR(tmp)) {
+        ray_release(result);
+        ray_release(lcc_vec);
+        return ray_error("oom", NULL);
+    }
+    result = tmp;
+
+    tmp = ray_table_add_col(result, sym_intern_safe("_coefficient", 12), lcc_vec);
+    ray_release(lcc_vec);
+    if (!tmp || RAY_IS_ERR(tmp)) {
+        ray_release(result);
+        return ray_error("oom", NULL);
+    }
+    result = tmp;
+
+    return result;
+}
+
+/* --------------------------------------------------------------------------
+ * exec_betweenness: Brandes betweenness centrality. O(n*m) exact,
+ * O(sample*m) approximate when sample_size > 0.
+ *
+ * The graph is traversed as undirected (forward + reverse CSR). The sample
+ * size is read from ext->graph.max_iter; when it is 0 or >= n every node is
+ * used as a BFS source, otherwise `sample` evenly spaced sources are used
+ * and the scores are scaled by n / sample.
+ *
+ * Returns a 2-column table (_node, _centrality), or an error:
+ *   "nyi"    - no extension record found for the op
+ *   "schema" - extension has no relation
+ *   "length" - relation has no nodes
+ *   "oom"    - a scratch/vector allocation or a table operation failed
+ * -------------------------------------------------------------------------- */
+ray_t* exec_betweenness(ray_graph_t* g, ray_op_t* op) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+    ray_rel_t* rel = (ray_rel_t*)ext->graph.rel;
+    if (!rel) return ray_error("schema", NULL);
+
+    int64_t n = rel->fwd.n_nodes;
+    if (n <= 0) return ray_error("length", NULL);
+    /* The max_iter field carries the sample size for this op. */
+    uint16_t sample = ext->graph.max_iter;
+    int64_t n_sources = (sample > 0 && (int64_t)sample < n) ? (int64_t)sample : n;
+
+    int64_t* fwd_off = (int64_t*)ray_data(rel->fwd.offsets);
+    int64_t* fwd_tgt = (int64_t*)ray_data(rel->fwd.targets);
+    int64_t* rev_off = (int64_t*)ray_data(rel->rev.offsets);
+    int64_t* rev_tgt = (int64_t*)ray_data(rel->rev.targets);
+
+    ray_scratch_arena_t arena;
+    ray_scratch_arena_init(&arena);
+
+    /* cb:    accumulated centrality per node (across all sources)
+     * sigma: number of shortest paths from the current source
+     * delta: dependency of the current source on each node
+     * dist:  BFS distance from the current source (-1 = unvisited)
+     * queue: BFS frontier; stack: nodes in BFS visit order */
+    double*  cb      = (double*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(double));
+    double*  sigma   = (double*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(double));
+    double*  delta   = (double*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(double));
+    int64_t* dist    = (int64_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(int64_t));
+    int64_t* queue   = (int64_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(int64_t));
+    int64_t* stack   = (int64_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(int64_t));
+
+    /* Predecessor storage: flat CSR-style array with per-node offsets.
+     * Two-pass approach: BFS counts predecessors per node, prefix-sum builds
+     * offsets, then a second pass over the stack fills pred_data in grouped order. */
+    int64_t m_total = rel->fwd.n_edges + rel->rev.n_edges;
+    /* Avoid a zero-byte allocation request for an edgeless graph. */
+    if (m_total == 0) m_total = 1;
+    int64_t* pred_data   = (int64_t*)ray_scratch_arena_push(&arena, (size_t)m_total * sizeof(int64_t));
+    int64_t* pred_off    = (int64_t*)ray_scratch_arena_push(&arena, (size_t)(n + 1) * sizeof(int64_t));
+    int64_t* pred_cursor = (int64_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(int64_t));
+    /* Per-v dedup marker: tracks which neighbors were already counted via fwd edges
+     * to avoid double-counting sigma/predecessors for bidirectional edges. */
+    int64_t* seen_epoch  = (int64_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(int64_t));
+
+    if (!cb || !sigma || !delta || !dist || !queue || !stack ||
+        !pred_data || !pred_off || !pred_cursor || !seen_epoch) {
+        ray_scratch_arena_reset(&arena);
+        return ray_error("oom", NULL);
+    }
+
+    memset(cb, 0, (size_t)n * sizeof(double));
+
+    /* When sampling, pick sources spaced n / n_sources apart; otherwise visit
+     * every node in order. */
+    int64_t stride = (sample > 0 && (int64_t)sample < n) ? (n / n_sources) : 1;
+
+    for (int64_t si = 0; si < n_sources; si++) {
+        int64_t s = (si * stride) % n;
+
+        /* Initialize */
+        for (int64_t i = 0; i < n; i++) {
+            sigma[i] = 0.0;
+            delta[i] = 0.0;
+            dist[i]  = -1;
+        }
+        sigma[s] = 1.0;
+        dist[s]  = 0;
+        memset(pred_off, 0, (size_t)(n + 1) * sizeof(int64_t));
+        memset(seen_epoch, 0, (size_t)n * sizeof(int64_t));
+
+        /* BFS pass 1: discover nodes, compute sigma, count predecessors */
+        int64_t q_head = 0, q_tail = 0;
+        int64_t stack_top = 0;
+        queue[q_tail++] = s;
+
+        /* Use epoch counter to deduplicate: for each v popped from queue,
+         * mark forward neighbors with epoch, then skip reverse neighbors
+         * already marked (bidirectional edges). Epoch increments per v. */
+        int64_t epoch = 0;
+        while (q_head < q_tail) {
+            int64_t v = queue[q_head++];
+            stack[stack_top++] = v;
+            epoch++;
+
+            /* Forward neighbors */
+            for (int64_t j = fwd_off[v]; j < fwd_off[v + 1]; j++) {
+                int64_t w = fwd_tgt[j];
+                if (dist[w] < 0) {
+                    dist[w] = dist[v] + 1;
+                    queue[q_tail++] = w;
+                }
+                /* v lies on a shortest path to w: count it as a predecessor.
+                 * The count is stored at w + 1 so the prefix sum below yields
+                 * start offsets. */
+                if (dist[w] == dist[v] + 1) {
+                    sigma[w] += sigma[v];
+                    pred_off[w + 1]++;
+                    seen_epoch[w] = epoch;  /* mark w as counted for this v */
+                }
+            }
+            /* Reverse neighbors (undirected), skip if already counted via fwd */
+            for (int64_t j = rev_off[v]; j < rev_off[v + 1]; j++) {
+                int64_t w = rev_tgt[j];
+                if (dist[w] < 0) {
+                    dist[w] = dist[v] + 1;
+                    queue[q_tail++] = w;
+                }
+                if (dist[w] == dist[v] + 1 && seen_epoch[w] != epoch) {
+                    sigma[w] += sigma[v];
+                    pred_off[w + 1]++;
+                }
+            }
+        }
+
+        /* Convert pred_off counts to cumulative offsets */
+        for (int64_t i = 1; i <= n; i++)
+            pred_off[i] += pred_off[i - 1];
+
+        /* BFS pass 2: fill pred_data grouped by target node using write cursors.
+         * Same dedup logic as pass 1 to avoid duplicate predecessor entries.
+         * The stack is replayed in visit order with epoch restarted at 0, so
+         * each v gets the same epoch value it had in pass 1. */
+        for (int64_t i = 0; i < n; i++) pred_cursor[i] = pred_off[i];
+        epoch = 0;
+        for (int64_t si2 = 0; si2 < stack_top; si2++) {
+            int64_t v = stack[si2];
+            epoch++;
+            for (int64_t j = fwd_off[v]; j < fwd_off[v + 1]; j++) {
+                int64_t w = fwd_tgt[j];
+                if (dist[w] == dist[v] + 1) {
+                    pred_data[pred_cursor[w]++] = v;
+                    seen_epoch[w] = epoch;
+                }
+            }
+            for (int64_t j = rev_off[v]; j < rev_off[v + 1]; j++) {
+                int64_t w = rev_tgt[j];
+                if (dist[w] == dist[v] + 1 && seen_epoch[w] != epoch)
+                    pred_data[pred_cursor[w]++] = v;
+            }
+        }
+
+        /* Back-propagation of dependencies: pop nodes in reverse visit order
+         * (farthest first) and push each node's dependency onto its
+         * predecessors. The source itself receives no credit. */
+        while (stack_top > 0) {
+            int64_t w = stack[--stack_top];
+            for (int64_t pi = pred_off[w]; pi < pred_off[w + 1]; pi++) {
+                int64_t v = pred_data[pi];
+                delta[v] += (sigma[v] / sigma[w]) * (1.0 + delta[w]);
+            }
+            if (w != s) cb[w] += delta[w];
+        }
+    }
+
+    /* Undirected normalization: BFS from each source counts every unordered
+     * pair {s,t} twice (once as source=s, once as source=t), so halve. */
+    for (int64_t i = 0; i < n; i++) cb[i] /= 2.0;
+
+    /* Normalize if sampled */
+    if (sample > 0 && (int64_t)sample < n) {
+        double scale = (double)n / (double)sample;
+        for (int64_t i = 0; i < n; i++) cb[i] *= scale;
+    }
+
+    /* Build result table */
+    ray_t* node_vec = ray_vec_new(RAY_I64, n);
+    ray_t* cent_vec = ray_vec_new(RAY_F64, n);
+    if (!node_vec || RAY_IS_ERR(node_vec) || !cent_vec || RAY_IS_ERR(cent_vec)) {
+        ray_scratch_arena_reset(&arena);
+        if (node_vec && !RAY_IS_ERR(node_vec)) ray_release(node_vec);
+        if (cent_vec && !RAY_IS_ERR(cent_vec)) ray_release(cent_vec);
+        return ray_error("oom", NULL);
+    }
+    int64_t* ndata = (int64_t*)ray_data(node_vec);
+    double*  cdata = (double*)ray_data(cent_vec);
+    for (int64_t i = 0; i < n; i++) { ndata[i] = i; cdata[i] = cb[i]; }
+    node_vec->len = n;
+    cent_vec->len = n;
+    /* Scores are copied out; scratch memory is no longer needed. */
+    ray_scratch_arena_reset(&arena);
+
+    ray_t* result = ray_table_new(2);
+    if (!result || RAY_IS_ERR(result)) {
+        ray_release(node_vec); ray_release(cent_vec);
+        return ray_error("oom", NULL);
+    }
+    /* Each add_col result is checked; on failure the table still held in
+     * `result` and any column not yet attached are released. */
+    ray_t* tmp = ray_table_add_col(result, sym_intern_safe("_node", 5), node_vec);
+    ray_release(node_vec);
+    if (!tmp || RAY_IS_ERR(tmp)) { ray_release(result); ray_release(cent_vec); return ray_error("oom", NULL); }
+    result = tmp;
+    tmp = ray_table_add_col(result, sym_intern_safe("_centrality", 11), cent_vec);
+    ray_release(cent_vec);
+    if (!tmp || RAY_IS_ERR(tmp)) { ray_release(result); return ray_error("oom", NULL); }
+    result = tmp;
+    return result;
+}
+
+/* --------------------------------------------------------------------------
+ * exec_closeness: closeness centrality from BFS distance sums.
+ *
+ * For every source s, closeness[s] = reachable / sum_dist, where `reachable`
+ * counts nodes at positive BFS distance from s and `sum_dist` totals those
+ * distances. Edges are followed in both directions (forward + reverse CSR).
+ * A source that reaches nothing keeps a score of 0.
+ *
+ * The sample size comes from ext->graph.max_iter. With 0 < sample < n only
+ * `sample` evenly spaced sources are evaluated and only those rows are
+ * emitted; otherwise every node is a source. O(n*m) exact, O(sample*m)
+ * sampled.
+ *
+ * Returns a 2-column table (_node, _centrality), or an error:
+ * "nyi" (no extension), "schema" (no relation), "length" (no nodes),
+ * "oom" (allocation or table operation failed).
+ * -------------------------------------------------------------------------- */
+ray_t* exec_closeness(ray_graph_t* g, ray_op_t* op) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (ext == NULL) return ray_error("nyi", NULL);
+    ray_rel_t* rel = (ray_rel_t*)ext->graph.rel;
+    if (rel == NULL) return ray_error("schema", NULL);
+
+    const int64_t n = rel->fwd.n_nodes;
+    if (n <= 0) return ray_error("length", NULL);
+
+    /* Sampling is active only for a non-zero sample smaller than n. */
+    const uint16_t sample = ext->graph.max_iter;
+    const bool sampled = (sample > 0) && ((int64_t)sample < n);
+    const int64_t n_sources = sampled ? (int64_t)sample : n;
+    const int64_t stride = sampled ? (n / n_sources) : 1;
+
+    /* Index 0 = forward CSR, index 1 = reverse CSR; walked in that order. */
+    const int64_t* csr_off[2];
+    const int64_t* csr_tgt[2];
+    csr_off[0] = (int64_t*)ray_data(rel->fwd.offsets);
+    csr_tgt[0] = (int64_t*)ray_data(rel->fwd.targets);
+    csr_off[1] = (int64_t*)ray_data(rel->rev.offsets);
+    csr_tgt[1] = (int64_t*)ray_data(rel->rev.targets);
+
+    ray_scratch_arena_t arena;
+    ray_scratch_arena_init(&arena);
+
+    const size_t f64_bytes = (size_t)n * sizeof(double);
+    const size_t i64_bytes = (size_t)n * sizeof(int64_t);
+    double*  closeness = (double*)ray_scratch_arena_push(&arena, f64_bytes);
+    int64_t* dist      = (int64_t*)ray_scratch_arena_push(&arena, i64_bytes);
+    int64_t* queue     = (int64_t*)ray_scratch_arena_push(&arena, i64_bytes);
+    if (closeness == NULL || dist == NULL || queue == NULL) {
+        ray_scratch_arena_reset(&arena);
+        return ray_error("oom", NULL);
+    }
+
+    memset(closeness, 0, f64_bytes);
+
+    for (int64_t k = 0; k < n_sources; k++) {
+        const int64_t src = (k * stride) % n;
+
+        /* Mark everything unvisited, then seed the BFS with the source. */
+        for (int64_t i = 0; i < n; i++) dist[i] = -1;
+        dist[src] = 0;
+
+        int64_t head = 0;
+        int64_t tail = 0;
+        queue[tail++] = src;
+
+        while (head < tail) {
+            const int64_t cur = queue[head++];
+            const int64_t next_dist = dist[cur] + 1;
+
+            /* Forward neighbors first, then reverse (undirected view). */
+            for (int dir = 0; dir < 2; dir++) {
+                const int64_t* off = csr_off[dir];
+                const int64_t* tgt = csr_tgt[dir];
+                for (int64_t e = off[cur]; e < off[cur + 1]; e++) {
+                    const int64_t nb = tgt[e];
+                    if (dist[nb] >= 0) continue;   /* already discovered */
+                    dist[nb] = next_dist;
+                    queue[tail++] = nb;
+                }
+            }
+        }
+
+        /* Total the distances to every node actually reached. */
+        int64_t total = 0;
+        int64_t reached = 0;
+        for (int64_t i = 0; i < n; i++) {
+            if (dist[i] <= 0) continue;
+            total += dist[i];
+            reached += 1;
+        }
+
+        if (reached > 0 && total > 0)
+            closeness[src] = (double)reached / (double)total;
+    }
+
+    /* Build result table: one row per evaluated source. */
+    const int64_t n_out = n_sources;
+    ray_t* node_vec = ray_vec_new(RAY_I64, n_out);
+    ray_t* cent_vec = ray_vec_new(RAY_F64, n_out);
+    if (!node_vec || RAY_IS_ERR(node_vec) || !cent_vec || RAY_IS_ERR(cent_vec)) {
+        ray_scratch_arena_reset(&arena);
+        if (node_vec && !RAY_IS_ERR(node_vec)) ray_release(node_vec);
+        if (cent_vec && !RAY_IS_ERR(cent_vec)) ray_release(cent_vec);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t* ndata = (int64_t*)ray_data(node_vec);
+    double*  cdata = (double*)ray_data(cent_vec);
+    /* Without sampling stride is 1, so this enumerates 0..n-1 in order;
+     * with sampling it replays the same sources the BFS loop used. */
+    for (int64_t k = 0; k < n_sources; k++) {
+        const int64_t src = (k * stride) % n;
+        ndata[k] = src;
+        cdata[k] = closeness[src];
+    }
+    node_vec->len = n_out;
+    cent_vec->len = n_out;
+    ray_scratch_arena_reset(&arena);
+
+    ray_t* result = ray_table_new(2);
+    if (!result || RAY_IS_ERR(result)) {
+        ray_release(node_vec);
+        ray_release(cent_vec);
+        return ray_error("oom", NULL);
+    }
+
+    ray_t* grown = ray_table_add_col(result, sym_intern_safe("_node", 5), node_vec);
+    ray_release(node_vec);
+    if (!grown || RAY_IS_ERR(grown)) {
+        ray_release(result);
+        ray_release(cent_vec);
+        return ray_error("oom", NULL);
+    }
+    result = grown;
+
+    grown = ray_table_add_col(result, sym_intern_safe("_centrality", 11), cent_vec);
+    ray_release(cent_vec);
+    if (!grown || RAY_IS_ERR(grown)) {
+        ray_release(result);
+        return ray_error("oom", NULL);
+    }
+    return grown;
+}
+
+/* --------------------------------------------------------------------------
+ * exec_mst: Minimum Spanning Tree / Forest via Kruskal's algorithm.
+ * Collects weighted edges from forward CSR, sorts by weight, builds MST
+ * using union-find with path compression and union by rank.
+ * -------------------------------------------------------------------------- */
+/* Weighted edge record sorted by exec_mst: weight plus both endpoints. */
+typedef struct { double w; int64_t src; int64_t dst; } mst_edge_t;
+
+/* qsort comparator for mst_edge_t: ascending by weight.
+ * NaN weights sort after every numeric weight (and compare equal to each
+ * other) so the ordering stays consistent, as qsort requires; a plain
+ * (a > b) - (a < b) would report NaN as "equal" to everything.
+ * Returns <0, 0 or >0 in the usual comparator convention. */
+static int mst_edge_cmp(const void* a, const void* b) {
+    double da = ((const mst_edge_t*)a)->w;
+    double db = ((const mst_edge_t*)b)->w;
+    /* x != x holds only for NaN */
+    int a_nan = (da != da);
+    int b_nan = (db != db);
+    if (a_nan || b_nan) return a_nan - b_nan;
+    return (da > db) - (da < db);
+}
+
+/* Union-find lookup: returns the root of x's set.
+ * While walking up, each visited node is re-pointed at its grandparent,
+ * shortening the chain for later lookups. */
+static int64_t uf_find(int64_t* parent, int64_t x) {
+    for (;;) {
+        int64_t up = parent[x];
+        if (up == x) return x;      /* x is its own parent: root reached */
+        parent[x] = parent[up];     /* skip one level */
+        x = parent[x];
+    }
+}
+
+/* Union-find merge by rank: joins the sets containing a and b.
+ * Returns false when both already share a root, true after a merge.
+ * The lower-rank root is attached under the higher-rank one; on a rank tie
+ * a's root stays on top and its rank grows by one. */
+static bool uf_union(int64_t* parent, int64_t* rank_arr, int64_t a, int64_t b) {
+    int64_t root_a = uf_find(parent, a);
+    int64_t root_b = uf_find(parent, b);
+    if (root_a == root_b) return false;
+
+    int64_t winner = root_a;
+    int64_t loser  = root_b;
+    if (rank_arr[root_a] < rank_arr[root_b]) {
+        winner = root_b;
+        loser  = root_a;
+    }
+
+    parent[loser] = winner;
+    if (rank_arr[winner] == rank_arr[loser]) rank_arr[winner] += 1;
+    return true;
+}
+
+ray_t* exec_mst(ray_graph_t* g, ray_op_t* op) {
+    /* Kruskal over the forward CSR of the op's relation. Returns a table with
+     * columns _src, _dst, _weight holding the accepted edges, or an error: "nyi"
+     * (no ext node), "schema" (no relation/props, or weights not F64), "length"
+     * (no nodes), "oom" (allocation failure). */
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+    ray_rel_t* rel = (ray_rel_t*)ext->graph.rel;
+    if (!rel || !rel->fwd.props) return ray_error("schema", NULL);
+
+    const int64_t n_nodes = rel->fwd.n_nodes;
+    const int64_t n_edges = rel->fwd.n_edges;
+    if (n_nodes <= 0) return ray_error("length", NULL);
+
+    const int64_t wsym = ext->graph.weight_col_sym;
+    ray_t* wcol = ray_table_get_col(rel->fwd.props, wsym);
+    if (!wcol || wcol->type != RAY_F64) return ray_error("schema", NULL);
+    double* edge_w = (double*)ray_data(wcol);
+
+    int64_t* csr_off = (int64_t*)ray_data(rel->fwd.offsets);
+    int64_t* csr_tgt = (int64_t*)ray_data(rel->fwd.targets);
+    int64_t* csr_row = (int64_t*)ray_data(rel->fwd.rowmap);
+
+    ray_scratch_arena_t scratch;
+    ray_scratch_arena_init(&scratch);
+
+    /* Scratch: one record per CSR edge plus the union-find parent/rank arrays. */
+    mst_edge_t* edges = (mst_edge_t*)ray_scratch_arena_push(&scratch,
+                            (size_t)n_edges * sizeof(mst_edge_t));
+    int64_t* parent = (int64_t*)ray_scratch_arena_push(&scratch, (size_t)n_nodes * sizeof(int64_t));
+    int64_t* rank   = (int64_t*)ray_scratch_arena_push(&scratch, (size_t)n_nodes * sizeof(int64_t));
+    if (edges == NULL || parent == NULL || rank == NULL) {
+        ray_scratch_arena_reset(&scratch);
+        return ray_error("oom", NULL);
+    }
+
+    /* Flatten the CSR adjacency into (src, dst, weight) records. */
+    int64_t n_filled = 0;
+    for (int64_t src = 0; src < n_nodes; src++) {
+        const int64_t row_end = csr_off[src + 1];
+        for (int64_t pos = csr_off[src]; pos < row_end; pos++, n_filled++) {
+            mst_edge_t* rec = &edges[n_filled];
+            rec->src = src;
+            rec->dst = csr_tgt[pos];
+            rec->w   = edge_w[csr_row[pos]];
+        }
+    }
+
+    qsort(edges, (size_t)n_filled, sizeof(mst_edge_t), mst_edge_cmp);  /* ascending weight */
+    for (int64_t v = 0; v < n_nodes; v++) { parent[v] = v; rank[v] = 0; }  /* singleton sets */
+
+    /* The accepted edge set can never exceed n - 1 edges; size outputs for that. */
+    const int64_t cap = n_nodes - 1;
+    ray_t* out_src = ray_vec_new(RAY_I64, cap);
+    ray_t* out_dst = ray_vec_new(RAY_I64, cap);
+    ray_t* out_w   = ray_vec_new(RAY_F64, cap);
+    const bool src_ok = out_src && !RAY_IS_ERR(out_src);
+    const bool dst_ok = out_dst && !RAY_IS_ERR(out_dst);
+    const bool w_ok   = out_w   && !RAY_IS_ERR(out_w);
+    if (!(src_ok && dst_ok && w_ok)) {
+        ray_scratch_arena_reset(&scratch);
+        if (src_ok) ray_release(out_src);
+        if (dst_ok) ray_release(out_dst);
+        if (w_ok)   ray_release(out_w);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t* src_out = (int64_t*)ray_data(out_src);
+    int64_t* dst_out = (int64_t*)ray_data(out_dst);
+    double*  w_out   = (double*)ray_data(out_w);
+
+    /* Greedily accept the lightest remaining edge that joins two components. */
+    int64_t taken = 0;
+    for (const mst_edge_t* e = edges; e < edges + n_filled && taken < cap; e++) {
+        if (!uf_union(parent, rank, e->src, e->dst)) continue;
+        src_out[taken] = e->src;
+        dst_out[taken] = e->dst;
+        w_out[taken]   = e->w;
+        taken++;
+    }
+
+    out_src->len = out_dst->len = out_w->len = taken;
+
+    ray_scratch_arena_reset(&scratch);
+
+    ray_t* table = ray_table_new(3);
+    if (!table || RAY_IS_ERR(table)) {
+        ray_release(out_src); ray_release(out_dst); ray_release(out_w);
+        return ray_error("oom", NULL);
+    }
+    /* Append each column, then drop this function's reference to it. */
+    table = ray_table_add_col(table, sym_intern_safe("_src", 4), out_src);  ray_release(out_src);
+    table = ray_table_add_col(table, sym_intern_safe("_dst", 4), out_dst);  ray_release(out_dst);
+    table = ray_table_add_col(table, sym_intern_safe("_weight", 7), out_w); ray_release(out_w);
+    return table;
+}
+
+/* --------------------------------------------------------------------------
+ * exec_random_walk: random walk of up to ext->graph.max_iter hops from a
+ * source node, picking successors with a xorshift64 PRNG seeded from that
+ * node's id. Emits (_step, _node); stops early at a node with no out-edges.
+ * -------------------------------------------------------------------------- */
+ray_t* exec_random_walk(ray_graph_t* g, ray_op_t* op, ray_t* src_val) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+    ray_rel_t* rel = (ray_rel_t*)ext->graph.rel;
+    if (!rel) return ray_error("schema", NULL);
+
+    const int64_t n_nodes = rel->fwd.n_nodes;
+    const uint16_t max_hops = ext->graph.max_iter;
+    if (n_nodes <= 0) return ray_error("length", NULL);
+
+    /* The source arrives either as an atom or as a vector (first element used). */
+    const int64_t origin = ray_is_atom(src_val) ? src_val->i64
+                                                : ((int64_t*)ray_data(src_val))[0];
+    if (origin < 0 || origin >= n_nodes) return ray_error("range", NULL);
+
+    int64_t* adj_off = (int64_t*)ray_data(rel->fwd.offsets);
+    int64_t* adj_tgt = (int64_t*)ray_data(rel->fwd.targets);
+
+    /* One row per visited node: the start plus at most max_hops successors. */
+    const int64_t cap = (int64_t)max_hops + 1;
+    ray_t* steps = ray_vec_new(RAY_I64, cap);
+    ray_t* nodes = ray_vec_new(RAY_I64, cap);
+    const bool steps_ok = steps && !RAY_IS_ERR(steps);
+    const bool nodes_ok = nodes && !RAY_IS_ERR(nodes);
+    if (!steps_ok || !nodes_ok) {
+        if (steps_ok) ray_release(steps);
+        if (nodes_ok) ray_release(nodes);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t* step_out = (int64_t*)ray_data(steps);
+    int64_t* node_out = (int64_t*)ray_data(nodes);
+
+    /* xorshift64 state seeded from the source id; a zero state is bumped to 1. */
+    uint64_t state = (uint64_t)origin * 6364136223846793005ULL + 1442695040888963407ULL;
+    if (state == 0) state = 1;
+
+    int64_t here = origin;
+    int64_t rows = 0;
+    for (;;) {
+        step_out[rows] = rows;
+        node_out[rows] = here;
+        rows++;
+        if (rows >= cap) break;  /* hop budget spent */
+        const int64_t first  = adj_off[here];
+        const int64_t degree = adj_off[here + 1] - first;
+        if (degree == 0) break;  /* dead end */
+        state ^= state << 13; state ^= state >> 7; state ^= state << 17;
+        here = adj_tgt[first + (int64_t)(state % (uint64_t)degree)];
+    }
+
+    steps->len = nodes->len = rows;
+
+    ray_t* table = ray_table_new(2);
+    if (!table || RAY_IS_ERR(table)) {
+        ray_release(steps); ray_release(nodes);
+        return ray_error("oom", NULL);
+    }
+    table = ray_table_add_col(table, sym_intern_safe("_step", 5), steps);
+    ray_release(steps);
+    table = ray_table_add_col(table, sym_intern_safe("_node", 5), nodes);
+    ray_release(nodes);
+    return table;
+}
+
+/* --------------------------------------------------------------------------
+ * exec_dfs: iterative depth-first traversal from a source node, expanding
+ * nodes only while their depth is below ext->graph.max_depth. O(n+m).
+ * Emits (_node, _depth, _parent) in visit order; the source has parent -1.
+ * Errors: "nyi" (no ext node), "schema" (no relation), "length" (no nodes),
+ * "range" (source id outside [0, n)), "oom" (allocation failure).
+ * -------------------------------------------------------------------------- */
+ray_t* exec_dfs(ray_graph_t* g, ray_op_t* op, ray_t* src_val) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    ray_rel_t* rel = (ray_rel_t*)ext->graph.rel;
+    if (!rel) return ray_error("schema", NULL);
+
+    const int64_t n_nodes = rel->fwd.n_nodes;
+    const uint8_t depth_limit = ext->graph.max_depth;
+    if (n_nodes <= 0) return ray_error("length", NULL);
+
+    /* Source id: the atom's value, or the first element of a vector. */
+    const int64_t root = ray_is_atom(src_val) ? src_val->i64
+                                              : ((int64_t*)ray_data(src_val))[0];
+    if (root < 0 || root >= n_nodes) return ray_error("range", NULL);
+
+    int64_t* adj_off = (int64_t*)ray_data(rel->fwd.offsets);
+    int64_t* adj_tgt = (int64_t*)ray_data(rel->fwd.targets);
+
+    ray_scratch_arena_t scratch;
+    ray_scratch_arena_init(&scratch);
+
+    /* A node is expanded once, so each edge is pushed at most once:
+     * max(m, n) + 1 stack slots are always enough. */
+    const int64_t n_edges = rel->fwd.n_edges;
+    const int64_t cap = (n_edges > n_nodes ? n_edges : n_nodes) + 1;
+    const size_t stack_bytes = (size_t)cap * sizeof(int64_t);
+    const size_t out_bytes   = (size_t)n_nodes * sizeof(int64_t);
+
+    int64_t* stk_node   = (int64_t*)ray_scratch_arena_push(&scratch, stack_bytes);
+    int64_t* stk_depth  = (int64_t*)ray_scratch_arena_push(&scratch, stack_bytes);
+    int64_t* stk_parent = (int64_t*)ray_scratch_arena_push(&scratch, stack_bytes);
+    uint8_t* seen       = (uint8_t*)ray_scratch_arena_push(&scratch, (size_t)n_nodes);
+    int64_t* out_node   = (int64_t*)ray_scratch_arena_push(&scratch, out_bytes);
+    int64_t* out_depth  = (int64_t*)ray_scratch_arena_push(&scratch, out_bytes);
+    int64_t* out_parent = (int64_t*)ray_scratch_arena_push(&scratch, out_bytes);
+    if (!(stk_node && stk_depth && stk_parent && seen &&
+          out_node && out_depth && out_parent)) {
+        ray_scratch_arena_reset(&scratch);
+        return ray_error("oom", NULL);
+    }
+
+    memset(seen, 0, (size_t)n_nodes);
+
+    /* Seed the stack with the source: depth 0, no parent. */
+    stk_node[0]   = root;
+    stk_depth[0]  = 0;
+    stk_parent[0] = -1;
+    int64_t top = 1;
+    int64_t n_out = 0;
+
+    while (top != 0) {
+        --top;
+        const int64_t v     = stk_node[top];
+        const int64_t depth = stk_depth[top];
+        if (seen[v]) continue;  /* stale entry: v was already reached through another edge */
+        seen[v] = 1;
+
+        /* First visit: record the node with the depth and parent it was pushed with. */
+        out_node[n_out]   = v;
+        out_depth[n_out]  = depth;
+        out_parent[n_out] = stk_parent[top];
+        n_out++;
+
+        if (depth >= depth_limit) continue;
+
+        /* Push successors back-to-front so the first neighbour is popped first. */
+        const int64_t lo = adj_off[v];
+        for (int64_t j = adj_off[v + 1]; j-- > lo; ) {
+            const int64_t u = adj_tgt[j];
+            if (seen[u]) continue;
+            stk_node[top]   = u;
+            stk_depth[top]  = depth + 1;
+            stk_parent[top] = v;
+            top++;
+        }
+    }
+
+    /* Copy the scratch results into freshly allocated output vectors. */
+    ray_t* node_col   = ray_vec_new(RAY_I64, n_out);
+    ray_t* depth_col  = ray_vec_new(RAY_I64, n_out);
+    ray_t* parent_col = ray_vec_new(RAY_I64, n_out);
+    const bool node_ok   = node_col   && !RAY_IS_ERR(node_col);
+    const bool depth_ok  = depth_col  && !RAY_IS_ERR(depth_col);
+    const bool parent_ok = parent_col && !RAY_IS_ERR(parent_col);
+    if (!(node_ok && depth_ok && parent_ok)) {
+        ray_scratch_arena_reset(&scratch);
+        if (node_ok)   ray_release(node_col);
+        if (depth_ok)  ray_release(depth_col);
+        if (parent_ok) ray_release(parent_col);
+        return ray_error("oom", NULL);
+    }
+
+    const size_t row_bytes = (size_t)n_out * sizeof(int64_t);
+    memcpy(ray_data(node_col),   out_node,   row_bytes);
+    memcpy(ray_data(depth_col),  out_depth,  row_bytes);
+    memcpy(ray_data(parent_col), out_parent, row_bytes);
+    node_col->len   = n_out;
+    depth_col->len  = n_out;
+    parent_col->len = n_out;
+
+    ray_scratch_arena_reset(&scratch);
+
+    ray_t* table = ray_table_new(3);
+    if (!table || RAY_IS_ERR(table)) {
+        ray_release(node_col); ray_release(depth_col); ray_release(parent_col);
+        return ray_error("oom", NULL);
+    }
+    table = ray_table_add_col(table, sym_intern_safe("_node", 5), node_col);
+    ray_release(node_col);
+    table = ray_table_add_col(table, sym_intern_safe("_depth", 6), depth_col);
+    ray_release(depth_col);
+    table = ray_table_add_col(table, sym_intern_safe("_parent", 7), parent_col);
+    ray_release(parent_col);
+
+    return table;
+}
+
+/* exec_astar: A* shortest path with Euclidean coordinate heuristic.
+ * Expands nodes in order of f = g + h over the forward CSR, where g is the
+ * accumulated edge weight and h the straight-line distance between a node's
+ * two coordinate columns and the target's. Stops once the target is settled.
+ * Returns a (_node, _dist, _depth) table with one row per node that was
+ * assigned a finite tentative distance, or an error: "nyi", "schema" (missing
+ * relation/props or non-F64 weight/coordinate columns), "range", "oom". */
+ray_t* exec_astar(ray_graph_t* g, ray_op_t* op,
+                         ray_t* src_val, ray_t* dst_val) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    ray_rel_t* rel = (ray_rel_t*)ext->graph.rel;
+    if (!rel || !rel->fwd.props) return ray_error("schema", NULL);
+
+    ray_t* np = (ray_t*)ext->graph.node_props;
+    if (!np) return ray_error("schema", NULL);
+
+    int64_t n = rel->fwd.n_nodes;
+    int64_t m = rel->fwd.n_edges;
+    /* Endpoints may be atoms or vectors (first element used), as in exec_dfs. */
+    int64_t src_id = ray_is_atom(src_val) ? src_val->i64 : ((int64_t*)ray_data(src_val))[0];
+    int64_t dst_id = ray_is_atom(dst_val) ? dst_val->i64 : ((int64_t*)ray_data(dst_val))[0];
+
+    if (src_id < 0 || src_id >= n || dst_id < 0 || dst_id >= n)
+        return ray_error("range", NULL);
+
+    /* Resolve the weight column from edge properties. It is read as double*
+     * below, so any other element type is rejected (exec_mst does the same). */
+    ray_t* weight_vec = ray_table_get_col(rel->fwd.props, ext->graph.weight_col_sym);
+    if (!weight_vec || RAY_IS_ERR(weight_vec) || weight_vec->type != RAY_F64)
+        return ray_error("schema", NULL);
+    double* weights_arr = (double*)ray_data(weight_vec);
+
+    /* Resolve coordinate columns from node properties (same F64 requirement). */
+    ray_t* lat_vec = ray_table_get_col(np, ext->graph.coord_col_syms[0]);
+    ray_t* lon_vec = ray_table_get_col(np, ext->graph.coord_col_syms[1]);
+    if (!lat_vec || RAY_IS_ERR(lat_vec) || lat_vec->type != RAY_F64 ||
+        !lon_vec || RAY_IS_ERR(lon_vec) || lon_vec->type != RAY_F64)
+        return ray_error("schema", NULL);
+    double* lat = (double*)ray_data(lat_vec);
+    double* lon = (double*)ray_data(lon_vec);
+
+    int64_t heap_cap = (m > n ? m : n) + 1;  /* source push + at most one push per edge */
+
+    ray_scratch_arena_t arena;
+    ray_scratch_arena_init(&arena);
+
+    double*  dist_a    = (double*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(double));
+    bool*    visited = (bool*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(bool));
+    int64_t* depth_a   = (int64_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(int64_t));
+    dijk_entry_t* heap = (dijk_entry_t*)ray_scratch_arena_push(&arena,
+                              (size_t)heap_cap * sizeof(dijk_entry_t));
+    if (!dist_a || !visited || !depth_a || !heap) {
+        ray_scratch_arena_reset(&arena);
+        return ray_error("oom", NULL);
+    }
+    memset(visited, 0, (size_t)n * sizeof(bool));
+    memset(depth_a, 0, (size_t)n * sizeof(int64_t));
+
+    for (int64_t i = 0; i < n; i++) dist_a[i] = 1e308;  /* 1e308 marks "not reached" */
+    dist_a[src_id] = 0.0;
+
+    /* A* uses f = g + h; heap stores f-cost for priority ordering */
+    double dx = lat[src_id] - lat[dst_id];
+    double dy = lon[src_id] - lon[dst_id];
+    double h0 = sqrt(dx * dx + dy * dy);
+    int64_t heap_size = 0;
+    dijk_heap_push(heap, &heap_size, h0, src_id);
+
+    int64_t* fwd_off = (int64_t*)ray_data(rel->fwd.offsets);
+    int64_t* fwd_tgt = (int64_t*)ray_data(rel->fwd.targets);
+    int64_t* fwd_row = (int64_t*)ray_data(rel->fwd.rowmap);
+
+    while (heap_size > 0) {
+        dijk_entry_t top = dijk_heap_pop(heap, &heap_size);
+        int64_t u = top.node;
+        if (visited[u]) continue;  /* stale heap entry: u was settled earlier */
+        visited[u] = true;
+        if (u == dst_id) break;    /* target settled: stop expanding */
+
+        for (int64_t j = fwd_off[u]; j < fwd_off[u + 1]; j++) {
+            int64_t v = fwd_tgt[j];
+            double w = weights_arr[fwd_row[j]];  /* rowmap: CSR slot -> edge property row */
+            double new_dist = dist_a[u] + w;
+            if (new_dist < dist_a[v]) {
+                dist_a[v] = new_dist;
+                depth_a[v] = depth_a[u] + 1;
+                /* f = g + h (Euclidean heuristic) */
+                double hdx = lat[v] - lat[dst_id];
+                double hdy = lon[v] - lon[dst_id];
+                double hv = sqrt(hdx * hdx + hdy * hdy);
+                dijk_heap_push(heap, &heap_size, new_dist + hv, v);
+            }
+        }
+    }
+
+    /* Collect every node that received a finite tentative distance. */
+    int64_t acount = 0;
+    for (int64_t i = 0; i < n; i++) acount += (dist_a[i] < 1e308);
+
+    ray_t* node_vec  = ray_vec_new(RAY_I64, acount);
+    ray_t* dist_vec  = ray_vec_new(RAY_F64, acount);
+    ray_t* depth_vec = ray_vec_new(RAY_I64, acount);
+    if (!node_vec || RAY_IS_ERR(node_vec) ||
+        !dist_vec || RAY_IS_ERR(dist_vec) ||
+        !depth_vec || RAY_IS_ERR(depth_vec)) {
+        ray_scratch_arena_reset(&arena);
+        if (node_vec && !RAY_IS_ERR(node_vec)) ray_release(node_vec);
+        if (dist_vec && !RAY_IS_ERR(dist_vec)) ray_release(dist_vec);
+        if (depth_vec && !RAY_IS_ERR(depth_vec)) ray_release(depth_vec);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t* ndata_a = (int64_t*)ray_data(node_vec);
+    double*  ddata_a = (double*)ray_data(dist_vec);
+    int64_t* hdata_a = (int64_t*)ray_data(depth_vec);
+    int64_t idx = 0;
+    for (int64_t i = 0; i < n; i++) {
+        if (dist_a[i] < 1e308) {
+            ndata_a[idx] = i;
+            ddata_a[idx] = dist_a[i];
+            hdata_a[idx] = depth_a[i];
+            idx++;
+        }
+    }
+    node_vec->len = dist_vec->len = depth_vec->len = acount;
+
+    ray_scratch_arena_reset(&arena);
+
+    ray_t* result = ray_table_new(3);
+    if (!result || RAY_IS_ERR(result)) {
+        ray_release(node_vec); ray_release(dist_vec); ray_release(depth_vec);
+        return ray_error("oom", NULL);
+    }
+    result = ray_table_add_col(result, sym_intern_safe("_node", 5), node_vec);
+    ray_release(node_vec);
+    result = ray_table_add_col(result, sym_intern_safe("_dist", 5), dist_vec);
+    ray_release(dist_vec);
+    result = ray_table_add_col(result, sym_intern_safe("_depth", 6), depth_vec);
+    ray_release(depth_vec);
+    return result;
+}
+
+/* exec_k_shortest: Yen's k-shortest paths via iterative masked Dijkstra.
+ * Finds up to K = ext->graph.max_iter paths from src_val to dst_val. P[0] is
+ * the plain shortest path; each later path is the cheapest candidate built by
+ * keeping a prefix ("root") of the previous path and re-running Dijkstra from
+ * the next node on it ("spur") with the root's nodes and used edges masked.
+ * Returns a (_path_id, _node, _dist) table with one row per path node, _dist
+ * being the running cost along that path; the table is empty when K is 0 or
+ * the target is unreachable. Errors: "nyi", "schema", "range", "oom". */
+ray_t* exec_k_shortest(ray_graph_t* g, ray_op_t* op,
+                               ray_t* src_val, ray_t* dst_val) {
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    ray_rel_t* rel = (ray_rel_t*)ext->graph.rel;
+    if (!rel || !rel->fwd.props) return ray_error("schema", NULL);
+
+    int64_t n = rel->fwd.n_nodes;
+    int64_t m = rel->fwd.n_edges;
+    /* Endpoints may be atoms or vectors (first element used), as in exec_dfs. */
+    int64_t src_id = ray_is_atom(src_val) ? src_val->i64 : ((int64_t*)ray_data(src_val))[0];
+    int64_t dst_id = ray_is_atom(dst_val) ? dst_val->i64 : ((int64_t*)ray_data(dst_val))[0];
+    uint16_t K = ext->graph.max_iter;
+    size_t k_slots = K > 0 ? K : 1;  /* K == 0 still gets one slot: no zero-byte scratch requests */
+
+    if (src_id < 0 || src_id >= n || dst_id < 0 || dst_id >= n)
+        return ray_error("range", NULL);
+
+    /* The weight column is read as double* below, so require F64 (as exec_mst does). */
+    ray_t* weight_vec = ray_table_get_col(rel->fwd.props, ext->graph.weight_col_sym);
+    if (!weight_vec || RAY_IS_ERR(weight_vec) || weight_vec->type != RAY_F64)
+        return ray_error("schema", NULL);
+    double* weights_k = (double*)ray_data(weight_vec);
+
+    int64_t* fwd_off = (int64_t*)ray_data(rel->fwd.offsets);
+    int64_t* fwd_tgt = (int64_t*)ray_data(rel->fwd.targets);
+    int64_t* fwd_row = (int64_t*)ray_data(rel->fwd.rowmap);
+
+    int64_t heap_cap = (m > n ? m : n) + 1;
+
+    ray_scratch_arena_t arena;
+    ray_scratch_arena_init(&arena);
+
+    /* Dijkstra working arrays */
+    double*       dist_arr  = (double*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(double));
+    int64_t*      parent    = (int64_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(int64_t));
+    bool*         vis       = (bool*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(bool));
+    dijk_entry_t* heap      = (dijk_entry_t*)ray_scratch_arena_push(&arena,
+                                    (size_t)heap_cap * sizeof(dijk_entry_t));
+    bool*         node_mask = (bool*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(bool));
+    bool*         edge_mask = (bool*)ray_scratch_arena_push(&arena, (size_t)m * sizeof(bool));
+
+    /* Path storage: K paths, each up to n nodes (path k starts at index k * n) */
+    int64_t* paths_data = (int64_t*)ray_scratch_arena_push(&arena, k_slots * (size_t)n * sizeof(int64_t));
+    int64_t* path_lens  = (int64_t*)ray_scratch_arena_push(&arena, k_slots * sizeof(int64_t));
+    double*  path_costs = (double*)ray_scratch_arena_push(&arena, k_slots * sizeof(double));
+
+    /* Candidate storage (capped at 4096 entries; further candidates are dropped) */
+    int64_t max_cand = (int64_t)k_slots * n;
+    if (max_cand > 4096) max_cand = 4096;
+    int64_t* cand_data  = (int64_t*)ray_scratch_arena_push(&arena, (size_t)max_cand * (size_t)n * sizeof(int64_t));
+    int64_t* cand_lens  = (int64_t*)ray_scratch_arena_push(&arena, (size_t)max_cand * sizeof(int64_t));
+    double*  cand_costs = (double*)ray_scratch_arena_push(&arena, (size_t)max_cand * sizeof(double));
+
+    /* Temp buffer for path reconstruction (n entries) */
+    int64_t* tmp_path = (int64_t*)ray_scratch_arena_push(&arena, (size_t)n * sizeof(int64_t));
+
+    if (!dist_arr || !parent || !vis || !heap || !node_mask || !edge_mask ||
+        !paths_data || !path_lens || !path_costs ||
+        !cand_data || !cand_lens || !cand_costs || !tmp_path) {
+        ray_scratch_arena_reset(&arena);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t num_found = 0, num_cand = 0;
+
+    /* Step 1: Find shortest path P[0] (skipped when no path was requested) */
+    double d = K == 0 ? 1e308
+                      : dijkstra_masked(fwd_off, fwd_tgt, fwd_row, weights_k, n,
+                                        src_id, dst_id, NULL, NULL,
+                                        dist_arr, parent, heap, vis);
+
+    /* Reconstruct P[0] from parent array (reverse then flip); unreachable -> no paths */
+    if (!(d >= 1e308)) {
+        int64_t plen = 0;
+        for (int64_t v = dst_id; v != -1; v = parent[v]) {
+            tmp_path[plen++] = v;
+            if (plen >= n) break;  /* safety: tmp_path holds n entries; stop on corrupt parent */
+        }
+        for (int64_t i = 0; i < plen / 2; i++) {
+            int64_t tmp = tmp_path[i];
+            tmp_path[i] = tmp_path[plen - 1 - i];
+            tmp_path[plen - 1 - i] = tmp;
+        }
+
+        memcpy(&paths_data[0], tmp_path, (size_t)plen * sizeof(int64_t));
+        path_lens[0] = plen;
+        path_costs[0] = d;
+        num_found = 1;
+    }
+
+    /* Step 2: Iteratively find paths P[1]..P[K-1] */
+    for (uint16_t k = 1; k < K && num_found > 0; k++) {
+        int64_t* prev_path = &paths_data[(int64_t)(k - 1) * n];
+        int64_t prev_len = path_lens[k - 1];
+
+        for (int64_t i = 0; i < prev_len - 1; i++) {
+            int64_t spur_node = prev_path[i];
+
+            /* Compute root path cost */
+            double root_cost = 0.0;
+            for (int64_t r = 0; r < i; r++) {
+                int64_t from = prev_path[r];
+                int64_t to   = prev_path[r + 1];
+                for (int64_t e = fwd_off[from]; e < fwd_off[from + 1]; e++) {
+                    if (fwd_tgt[e] == to) {
+                        root_cost += weights_k[fwd_row[e]];
+                        break;
+                    }
+                }
+            }
+
+            /* Mask edges used by found paths sharing the root prefix */
+            memset(edge_mask, 0, (size_t)m * sizeof(bool));
+            memset(node_mask, 0, (size_t)n * sizeof(bool));
+
+            for (int64_t j = 0; j < num_found; j++) {
+                int64_t* pj = &paths_data[j * n];
+                int64_t pj_len = path_lens[j];
+                if (pj_len <= i) continue;
+
+                bool same_prefix = true;
+                for (int64_t r = 0; r <= i; r++) {
+                    if (pj[r] != prev_path[r]) { same_prefix = false; break; }
+                }
+                if (!same_prefix) continue;
+
+                int64_t from = pj[i];
+                int64_t to   = pj[i + 1];
+                for (int64_t e = fwd_off[from]; e < fwd_off[from + 1]; e++) {
+                    if (fwd_tgt[e] == to) { edge_mask[e] = true; break; }
+                }
+            }
+
+            /* Mask root path nodes except spur node */
+            for (int64_t r = 0; r < i; r++) {
+                node_mask[prev_path[r]] = true;
+            }
+
+            /* Dijkstra from spur to dst with masks */
+            double spur_dist = dijkstra_masked(fwd_off, fwd_tgt, fwd_row, weights_k, n,
+                                                spur_node, dst_id, node_mask, edge_mask,
+                                                dist_arr, parent, heap, vis);
+            if (spur_dist >= 1e308) continue;
+
+            /* Reconstruct spur path (same n-entry bound as for P[0]) */
+            int64_t spur_len = 0;
+            for (int64_t v = dst_id; v != -1; v = parent[v]) {
+                tmp_path[spur_len++] = v;
+                if (spur_len >= n) break;
+            }
+            for (int64_t a = 0; a < spur_len / 2; a++) {
+                int64_t tmp = tmp_path[a];
+                tmp_path[a] = tmp_path[spur_len - 1 - a];
+                tmp_path[spur_len - 1 - a] = tmp;
+            }
+
+            double total_cost = root_cost + spur_dist;
+            int64_t total_len = i + spur_len;
+            if (total_len > n || num_cand >= max_cand) continue;
+
+            /* Check for duplicate candidates */
+            bool dup = false;
+            for (int64_t c = 0; c < num_cand && !dup; c++) {
+                if (cand_lens[c] != total_len) continue;
+                bool same = true;
+                int64_t* cp = &cand_data[c * n];
+                for (int64_t r = 0; r < i && same; r++) {
+                    if (cp[r] != prev_path[r]) same = false;
+                }
+                for (int64_t r = 0; r < spur_len && same; r++) {
+                    if (cp[i + r] != tmp_path[r]) same = false;
+                }
+                if (same) dup = true;
+            }
+            /* Check against already-found paths */
+            for (int64_t f = 0; f < num_found && !dup; f++) {
+                if (path_lens[f] != total_len) continue;
+                bool same = true;
+                int64_t* fp = &paths_data[f * n];
+                for (int64_t r = 0; r < i && same; r++) {
+                    if (fp[r] != prev_path[r]) same = false;
+                }
+                for (int64_t r = 0; r < spur_len && same; r++) {
+                    if (fp[i + r] != tmp_path[r]) same = false;
+                }
+                if (same) dup = true;
+            }
+            if (dup) continue;
+
+            /* Store candidate: root_path[0..i-1] + spur_path */
+            int64_t* cp = &cand_data[num_cand * n];
+            memcpy(cp, prev_path, (size_t)i * sizeof(int64_t));
+            memcpy(cp + i, tmp_path, (size_t)spur_len * sizeof(int64_t));
+            cand_lens[num_cand] = total_len;
+            cand_costs[num_cand] = total_cost;
+            num_cand++;
+        }
+
+        if (num_cand == 0) break;
+
+        /* Pick cheapest candidate */
+        int64_t best = 0;
+        for (int64_t c = 1; c < num_cand; c++) {
+            if (cand_costs[c] < cand_costs[best]) best = c;
+        }
+
+        memcpy(&paths_data[(int64_t)k * n], &cand_data[best * n],
+               (size_t)cand_lens[best] * sizeof(int64_t));
+        path_lens[k] = cand_lens[best];
+        path_costs[k] = cand_costs[best];
+        num_found++;
+
+        /* Remove used candidate (swap with last) */
+        if (best < num_cand - 1) {
+            memcpy(&cand_data[best * n], &cand_data[(num_cand - 1) * n],
+                   (size_t)cand_lens[num_cand - 1] * sizeof(int64_t));
+            cand_lens[best] = cand_lens[num_cand - 1];
+            cand_costs[best] = cand_costs[num_cand - 1];
+        }
+        num_cand--;
+    }
+
+    /* Build output: _path_id, _node, _dist (running dist along each path) */
+    int64_t total_rows = 0;
+    for (int64_t k = 0; k < num_found; k++) total_rows += path_lens[k];
+
+    ray_t* pid_vec  = ray_vec_new(RAY_I64, total_rows);
+    ray_t* node_vec = ray_vec_new(RAY_I64, total_rows);
+    ray_t* dist_vec = ray_vec_new(RAY_F64, total_rows);
+    if (!pid_vec  || RAY_IS_ERR(pid_vec) ||
+        !node_vec || RAY_IS_ERR(node_vec) ||
+        !dist_vec || RAY_IS_ERR(dist_vec)) {
+        ray_scratch_arena_reset(&arena);
+        if (pid_vec  && !RAY_IS_ERR(pid_vec))  ray_release(pid_vec);
+        if (node_vec && !RAY_IS_ERR(node_vec)) ray_release(node_vec);
+        if (dist_vec && !RAY_IS_ERR(dist_vec)) ray_release(dist_vec);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t* pids  = (int64_t*)ray_data(pid_vec);
+    int64_t* nodes_k = (int64_t*)ray_data(node_vec);
+    double*  dists = (double*)ray_data(dist_vec);
+
+    int64_t row = 0;
+    for (int64_t k = 0; k < num_found; k++) {
+        int64_t* path = &paths_data[k * n];
+        int64_t pk_len = path_lens[k];
+        double running = 0.0;
+        for (int64_t j = 0; j < pk_len; j++) {
+            pids[row]  = k;
+            nodes_k[row] = path[j];
+            if (j > 0) {
+                int64_t from = path[j - 1];
+                int64_t to   = path[j];
+                for (int64_t e = fwd_off[from]; e < fwd_off[from + 1]; e++) {
+                    if (fwd_tgt[e] == to) {
+                        running += weights_k[fwd_row[e]];
+                        break;
+                    }
+                }
+            }
+            dists[row] = running;
+            row++;
+        }
+    }
+
+    pid_vec->len  = total_rows;
+    node_vec->len = total_rows;
+    dist_vec->len = total_rows;
+
+    ray_scratch_arena_reset(&arena);
+
+    ray_t* result = ray_table_new(3);
+    if (!result || RAY_IS_ERR(result)) {
+        ray_release(pid_vec); ray_release(node_vec); ray_release(dist_vec);
+        return ray_error("oom", NULL);
+    }
+    result = ray_table_add_col(result, sym_intern_safe("_path_id", 8), pid_vec);
+    ray_release(pid_vec);
+    result = ray_table_add_col(result, sym_intern_safe("_node", 5), node_vec);
+    ray_release(node_vec);
+    result = ray_table_add_col(result, sym_intern_safe("_dist", 5), dist_vec);
+    ray_release(dist_vec);
+    return result;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/window.c b/crates/rayforce-sys/vendor/rayforce/src/ops/window.c
new file mode 100644
index 0000000..75c8d94
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/ops/window.c
@@ -0,0 +1,1223 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "ops/internal.h"
+
+/* ============================================================================
+ * Window function execution
+ * ============================================================================ */
+
+/* Report whether rows ra and rb disagree on at least one of the n_keys key
+ * columns in vecs. NULL column slots and column types not listed in the
+ * switch are skipped, so they can never make two rows differ. */
+static inline bool win_keys_differ(ray_t* const* vecs, uint8_t n_keys,
+                                    int64_t ra, int64_t rb) {
+    for (uint8_t ki = 0; ki < n_keys; ki++) {
+        ray_t* kc = vecs[ki];
+        if (kc == NULL) continue;
+
+        bool ne = false;
+        switch (kc->type) {
+        case RAY_I64:
+        case RAY_TIMESTAMP: {
+            const int64_t* d = (const int64_t*)ray_data(kc);
+            ne = (d[ra] != d[rb]);
+            break;
+        }
+        case RAY_F64: {
+            /* Plain floating-point inequality on the two cells. */
+            const double* d = (const double*)ray_data(kc);
+            ne = (d[ra] != d[rb]);
+            break;
+        }
+        case RAY_I32:
+        case RAY_DATE:
+        case RAY_TIME: {
+            const int32_t* d = (const int32_t*)ray_data(kc);
+            ne = (d[ra] != d[rb]);
+            break;
+        }
+        case RAY_SYM:
+            ne = (ray_read_sym(ray_data(kc), ra, kc->type, kc->attrs) !=
+                  ray_read_sym(ray_data(kc), rb, kc->type, kc->attrs));
+            break;
+        case RAY_I16: {
+            const int16_t* d = (const int16_t*)ray_data(kc);
+            ne = (d[ra] != d[rb]);
+            break;
+        }
+        case RAY_BOOL:
+        case RAY_U8: {
+            const uint8_t* d = (const uint8_t*)ray_data(kc);
+            ne = (d[ra] != d[rb]);
+            break;
+        }
+        case RAY_STR: {
+            /* Strings are compared through their resolved element array and
+             * character pool rather than by raw cell bytes. */
+            const ray_str_t* cells;
+            const char* chars;
+            str_resolve(kc, &cells, &chars);
+            ne = !ray_str_t_eq(&cells[ra], chars, &cells[rb], chars);
+            break;
+        }
+        default:
+            break;
+        }
+
+        if (ne) return true;
+    }
+    return false;
+}
+
+/* Fetch element `row` of `col` converted to double. Element types not listed
+ * in the switch read as 0.0. */
+static inline double win_read_f64(ray_t* col, int64_t row) {
+    double val = 0.0;
+    switch (col->type) {
+    case RAY_F64:
+        val = ((const double*)ray_data(col))[row];
+        break;
+    case RAY_I64:
+    case RAY_TIMESTAMP:
+        val = (double)((const int64_t*)ray_data(col))[row];
+        break;
+    case RAY_I32:
+    case RAY_DATE:
+    case RAY_TIME:
+        val = (double)((const int32_t*)ray_data(col))[row];
+        break;
+    case RAY_SYM:
+        val = (double)ray_read_sym(ray_data(col), row, col->type, col->attrs);
+        break;
+    case RAY_I16:
+        val = (double)((const int16_t*)ray_data(col))[row];
+        break;
+    case RAY_BOOL:
+    case RAY_U8:
+        val = (double)((const uint8_t*)ray_data(col))[row];
+        break;
+    default:
+        break;
+    }
+    return val;
+}
+
+/* Fetch element `row` of `col` converted to int64_t (doubles are converted
+ * with a C cast). Element types not listed in the switch read as 0. */
+static inline int64_t win_read_i64(ray_t* col, int64_t row) {
+    int64_t val = 0;
+    switch (col->type) {
+    case RAY_I64:
+    case RAY_TIMESTAMP:
+        val = ((const int64_t*)ray_data(col))[row];
+        break;
+    case RAY_I32:
+    case RAY_DATE:
+    case RAY_TIME:
+        val = (int64_t)((const int32_t*)ray_data(col))[row];
+        break;
+    case RAY_SYM:
+        val = ray_read_sym(ray_data(col), row, col->type, col->attrs);
+        break;
+    case RAY_F64:
+        val = (int64_t)((const double*)ray_data(col))[row];
+        break;
+    case RAY_I16:
+        val = (int64_t)((const int16_t*)ray_data(col))[row];
+        break;
+    case RAY_BOOL:
+    case RAY_U8:
+        val = (int64_t)((const uint8_t*)ray_data(col))[row];
+        break;
+    default:
+        break;
+    }
+    return val;
+}
+
+/* Aliases for shared parallel null helpers from internal.h.
+ * win_set_null(vec, row) is what win_compute_partition calls on a result
+ * vector when a result row has no value (presumably marking it NULL).
+ * NOTE(review): the exact contracts of par_prepare_nullmap and
+ * par_finalize_nulls live in internal.h — check there before relying on them. */
+#define win_set_null       par_set_null
+#define win_prepare_nullmap par_prepare_nullmap
+#define win_finalize_nulls par_finalize_nulls
+
+/* Produce the column vector that graph node key_op denotes, evaluated
+ * against tbl.
+ *
+ * A node whose extension record is an OP_SCAN is answered straight from tbl
+ * via ray_table_get_col and *owned is set to 0. Any other node is executed
+ * with exec_node while tbl is temporarily installed as g->table, and *owned
+ * is set to 1. The exec_node result is returned unchecked, so it may be NULL
+ * or an error value. */
+static ray_t* win_resolve_vec(ray_graph_t* g, ray_op_t* key_op, ray_t* tbl,
+                              uint8_t* owned) {
+    ray_op_ext_t* node_ext = find_ext(g, key_op->id);
+    bool is_scan = (node_ext != NULL) && (node_ext->base.opcode == OP_SCAN);
+
+    if (!is_scan) {
+        /* Swap tbl in for the duration of the evaluation, then restore the
+         * table the graph had before. */
+        ray_t* prev_tbl = g->table;
+        *owned = 1;
+        g->table = tbl;
+        ray_t* vec = exec_node(g, key_op);
+        g->table = prev_tbl;
+        return vec;
+    }
+
+    *owned = 0;
+    return ray_table_get_col(tbl, node_ext->sym);
+}
+
+/* Compute every window function for one partition.
+ *
+ * The partition is the run of positions [ps, pe) in sorted_idx. Each entry of
+ * sorted_idx is a row index that is used both to read the input column and to
+ * write the result, i.e. results are stored at out[sorted_idx[i]].
+ *
+ * For function f:
+ *   func_kinds[f]   selects the RAY_WIN_* operation;
+ *   func_vecs[f]    is the input column (may be NULL, in which case the
+ *                   value-based functions leave the result untouched);
+ *   func_params[f]  is the integer argument: bucket count for NTILE, offset
+ *                   for LAG/LEAD, 1-based position for NTH_VALUE;
+ *   result_vecs[f]  receives the output;
+ *   is_f64[f]       selects double vs int64_t result storage for SUM, MIN,
+ *                   MAX, LAG, LEAD, FIRST_VALUE, LAST_VALUE and NTH_VALUE.
+ * ROW_NUMBER, RANK, DENSE_RANK, NTILE and COUNT always write int64_t; AVG
+ * always writes double.
+ *
+ * order_vecs / n_order are the ORDER BY key columns; they are only consulted
+ * by RANK and DENSE_RANK to detect ties between adjacent sorted rows.
+ *
+ * Frames: only UNBOUNDED PRECEDING .. UNBOUNDED FOLLOWING is recognised as a
+ * whole-partition frame. Every other frame_start/frame_end combination is
+ * evaluated as a running result from the partition start through the current
+ * position. Kinds not listed in the switch are ignored. */
+static void win_compute_partition(
+    ray_t* const* order_vecs, uint8_t n_order,
+    ray_t* const* func_vecs, const uint8_t* func_kinds, const int64_t* func_params,
+    uint8_t n_funcs,
+    uint8_t frame_start, uint8_t frame_end,
+    const int64_t* sorted_idx, int64_t ps, int64_t pe,
+    ray_t* const* result_vecs, const bool* is_f64)
+{
+    if (ps >= pe) return; /* empty partition — nothing to compute */
+    int64_t part_len = pe - ps;
+
+    for (uint8_t f = 0; f < n_funcs; f++) {
+        uint8_t kind = func_kinds[f];
+        ray_t* fvec = func_vecs[f];
+        ray_t* rvec = result_vecs[f];
+        /* true when the frame spans the entire partition */
+        bool whole = (frame_start == RAY_BOUND_UNBOUNDED_PRECEDING &&
+                      frame_end == RAY_BOUND_UNBOUNDED_FOLLOWING);
+
+        switch (kind) {
+        /* ROW_NUMBER: 1-based position inside the partition. */
+        case RAY_WIN_ROW_NUMBER: {
+            int64_t* out = (int64_t*)ray_data(rvec);
+            for (int64_t i = ps; i < pe; i++)
+                out[sorted_idx[i]] = i - ps + 1;
+            break;
+        }
+        /* RANK: rows equal on all order keys share a rank; when the keys
+         * change the rank jumps to the 1-based position (gaps after ties).
+         * With no order keys every row gets rank 1. */
+        case RAY_WIN_RANK: {
+            int64_t* out = (int64_t*)ray_data(rvec);
+            int64_t rank = 1;
+            out[sorted_idx[ps]] = 1;
+            for (int64_t i = ps + 1; i < pe; i++) {
+                if (n_order > 0 && win_keys_differ(order_vecs, n_order,
+                        sorted_idx[i-1], sorted_idx[i]))
+                    rank = i - ps + 1;
+                out[sorted_idx[i]] = rank;
+            }
+            break;
+        }
+        /* DENSE_RANK: like RANK but the rank advances by exactly one each
+         * time the order keys change (no gaps). */
+        case RAY_WIN_DENSE_RANK: {
+            int64_t* out = (int64_t*)ray_data(rvec);
+            int64_t rank = 1;
+            out[sorted_idx[ps]] = 1;
+            for (int64_t i = ps + 1; i < pe; i++) {
+                if (n_order > 0 && win_keys_differ(order_vecs, n_order,
+                        sorted_idx[i-1], sorted_idx[i]))
+                    rank++;
+                out[sorted_idx[i]] = rank;
+            }
+            break;
+        }
+        /* NTILE(n): bucket = floor(pos * n / part_len) + 1 with pos 0-based;
+         * a non-positive n is treated as 1.
+         * NOTE(review): (i - ps) * n is computed in int64_t and is not
+         * guarded against overflow for very large n. */
+        case RAY_WIN_NTILE: {
+            int64_t n = func_params[f];
+            if (n <= 0) n = 1;
+            int64_t* out = (int64_t*)ray_data(rvec);
+            for (int64_t i = ps; i < pe; i++)
+                out[sorted_idx[i]] = ((i - ps) * n) / part_len + 1;
+            break;
+        }
+        /* COUNT: partition size for a whole frame, otherwise the running
+         * row count.
+         * NOTE(review): fvec is not consulted here, so rows whose input is
+         * NULL are counted too — confirm this is intended for COUNT(expr). */
+        case RAY_WIN_COUNT: {
+            int64_t* out = (int64_t*)ray_data(rvec);
+            if (whole) {
+                for (int64_t i = ps; i < pe; i++)
+                    out[sorted_idx[i]] = part_len;
+            } else {
+                for (int64_t i = ps; i < pe; i++)
+                    out[sorted_idx[i]] = i - ps + 1;
+            }
+            break;
+        }
+        /* SUM: NULL inputs are skipped. The result itself is never marked
+         * NULL, so a partition (or prefix) with only NULL inputs yields 0.
+         * NOTE(review): confirm 0 rather than NULL is the intended result. */
+        case RAY_WIN_SUM: {
+            if (!fvec) break;
+            if (is_f64[f]) {
+                double* out = (double*)ray_data(rvec);
+                if (whole) {
+                    double t = 0.0;
+                    for (int64_t i = ps; i < pe; i++)
+                        if (!ray_vec_is_null(fvec, sorted_idx[i]))
+                            t += win_read_f64(fvec, sorted_idx[i]);
+                    for (int64_t i = ps; i < pe; i++)
+                        out[sorted_idx[i]] = t;
+                } else {
+                    double acc = 0.0;
+                    for (int64_t i = ps; i < pe; i++) {
+                        if (!ray_vec_is_null(fvec, sorted_idx[i]))
+                            acc += win_read_f64(fvec, sorted_idx[i]);
+                        out[sorted_idx[i]] = acc;
+                    }
+                }
+            } else {
+                int64_t* out = (int64_t*)ray_data(rvec);
+                if (whole) {
+                    int64_t t = 0;
+                    for (int64_t i = ps; i < pe; i++)
+                        if (!ray_vec_is_null(fvec, sorted_idx[i]))
+                            t += win_read_i64(fvec, sorted_idx[i]);
+                    for (int64_t i = ps; i < pe; i++)
+                        out[sorted_idx[i]] = t;
+                } else {
+                    int64_t acc = 0;
+                    for (int64_t i = ps; i < pe; i++) {
+                        if (!ray_vec_is_null(fvec, sorted_idx[i]))
+                            acc += win_read_i64(fvec, sorted_idx[i]);
+                        out[sorted_idx[i]] = acc;
+                    }
+                }
+            }
+            break;
+        }
+        /* AVG: always double; NULL inputs are skipped. While no non-NULL
+         * input has been seen the result row is marked NULL. */
+        case RAY_WIN_AVG: {
+            if (!fvec) break;
+            double* out = (double*)ray_data(rvec);
+            if (whole) {
+                double t = 0.0;
+                int64_t cnt = 0;
+                for (int64_t i = ps; i < pe; i++)
+                    if (!ray_vec_is_null(fvec, sorted_idx[i])) {
+                        t += win_read_f64(fvec, sorted_idx[i]); cnt++;
+                    }
+                if (cnt > 0) {
+                    double avg = t / (double)cnt;
+                    for (int64_t i = ps; i < pe; i++)
+                        out[sorted_idx[i]] = avg;
+                } else {
+                    for (int64_t i = ps; i < pe; i++)
+                        win_set_null(rvec, sorted_idx[i]);
+                }
+            } else {
+                double acc = 0.0;
+                int64_t cnt = 0;
+                for (int64_t i = ps; i < pe; i++) {
+                    if (!ray_vec_is_null(fvec, sorted_idx[i])) {
+                        acc += win_read_f64(fvec, sorted_idx[i]); cnt++;
+                    }
+                    if (cnt > 0)
+                        out[sorted_idx[i]] = acc / (double)cnt;
+                    else
+                        win_set_null(rvec, sorted_idx[i]);
+                }
+            }
+            break;
+        }
+        /* MIN: NULL inputs are skipped; `found` records whether any non-NULL
+         * input has been seen, and rows with none so far are marked NULL. */
+        case RAY_WIN_MIN: {
+            if (!fvec) break;
+            if (is_f64[f]) {
+                double* out = (double*)ray_data(rvec);
+                if (whole) {
+                    double mn = DBL_MAX; int found = 0;
+                    for (int64_t i = ps; i < pe; i++) {
+                        if (ray_vec_is_null(fvec, sorted_idx[i])) continue;
+                        double v = win_read_f64(fvec, sorted_idx[i]);
+                        if (!found || v < mn) { mn = v; found = 1; }
+                    }
+                    if (found) {
+                        for (int64_t i = ps; i < pe; i++)
+                            out[sorted_idx[i]] = mn;
+                    } else {
+                        for (int64_t i = ps; i < pe; i++)
+                            win_set_null(rvec, sorted_idx[i]);
+                    }
+                } else {
+                    double mn = DBL_MAX; int found = 0;
+                    for (int64_t i = ps; i < pe; i++) {
+                        if (!ray_vec_is_null(fvec, sorted_idx[i])) {
+                            double v = win_read_f64(fvec, sorted_idx[i]);
+                            if (!found || v < mn) { mn = v; found = 1; }
+                        }
+                        if (found)
+                            out[sorted_idx[i]] = mn;
+                        else
+                            win_set_null(rvec, sorted_idx[i]);
+                    }
+                }
+            } else {
+                int64_t* out = (int64_t*)ray_data(rvec);
+                if (whole) {
+                    int64_t mn = INT64_MAX; int found = 0;
+                    for (int64_t i = ps; i < pe; i++) {
+                        if (ray_vec_is_null(fvec, sorted_idx[i])) continue;
+                        int64_t v = win_read_i64(fvec, sorted_idx[i]);
+                        if (!found || v < mn) { mn = v; found = 1; }
+                    }
+                    if (found) {
+                        for (int64_t i = ps; i < pe; i++)
+                            out[sorted_idx[i]] = mn;
+                    } else {
+                        for (int64_t i = ps; i < pe; i++)
+                            win_set_null(rvec, sorted_idx[i]);
+                    }
+                } else {
+                    int64_t mn = INT64_MAX; int found = 0;
+                    for (int64_t i = ps; i < pe; i++) {
+                        if (!ray_vec_is_null(fvec, sorted_idx[i])) {
+                            int64_t v = win_read_i64(fvec, sorted_idx[i]);
+                            if (!found || v < mn) { mn = v; found = 1; }
+                        }
+                        if (found)
+                            out[sorted_idx[i]] = mn;
+                        else
+                            win_set_null(rvec, sorted_idx[i]);
+                    }
+                }
+            }
+            break;
+        }
+        /* MAX: mirror image of MIN. */
+        case RAY_WIN_MAX: {
+            if (!fvec) break;
+            if (is_f64[f]) {
+                double* out = (double*)ray_data(rvec);
+                if (whole) {
+                    double mx = -DBL_MAX; int found = 0;
+                    for (int64_t i = ps; i < pe; i++) {
+                        if (ray_vec_is_null(fvec, sorted_idx[i])) continue;
+                        double v = win_read_f64(fvec, sorted_idx[i]);
+                        if (!found || v > mx) { mx = v; found = 1; }
+                    }
+                    if (found) {
+                        for (int64_t i = ps; i < pe; i++)
+                            out[sorted_idx[i]] = mx;
+                    } else {
+                        for (int64_t i = ps; i < pe; i++)
+                            win_set_null(rvec, sorted_idx[i]);
+                    }
+                } else {
+                    double mx = -DBL_MAX; int found = 0;
+                    for (int64_t i = ps; i < pe; i++) {
+                        if (!ray_vec_is_null(fvec, sorted_idx[i])) {
+                            double v = win_read_f64(fvec, sorted_idx[i]);
+                            if (!found || v > mx) { mx = v; found = 1; }
+                        }
+                        if (found)
+                            out[sorted_idx[i]] = mx;
+                        else
+                            win_set_null(rvec, sorted_idx[i]);
+                    }
+                }
+            } else {
+                int64_t* out = (int64_t*)ray_data(rvec);
+                if (whole) {
+                    int64_t mx = INT64_MIN; int found = 0;
+                    for (int64_t i = ps; i < pe; i++) {
+                        if (ray_vec_is_null(fvec, sorted_idx[i])) continue;
+                        int64_t v = win_read_i64(fvec, sorted_idx[i]);
+                        if (!found || v > mx) { mx = v; found = 1; }
+                    }
+                    if (found) {
+                        for (int64_t i = ps; i < pe; i++)
+                            out[sorted_idx[i]] = mx;
+                    } else {
+                        for (int64_t i = ps; i < pe; i++)
+                            win_set_null(rvec, sorted_idx[i]);
+                    }
+                } else {
+                    int64_t mx = INT64_MIN; int found = 0;
+                    for (int64_t i = ps; i < pe; i++) {
+                        if (!ray_vec_is_null(fvec, sorted_idx[i])) {
+                            int64_t v = win_read_i64(fvec, sorted_idx[i]);
+                            if (!found || v > mx) { mx = v; found = 1; }
+                        }
+                        if (found)
+                            out[sorted_idx[i]] = mx;
+                        else
+                            win_set_null(rvec, sorted_idx[i]);
+                    }
+                }
+            }
+            break;
+        }
+        /* LAG(offset): value from `offset` positions earlier in the
+         * partition; a non-positive offset is treated as 1. Rows whose source
+         * falls before the partition start, or whose source is NULL, are
+         * marked NULL. The frame is not consulted. */
+        case RAY_WIN_LAG: {
+            if (!fvec) break;
+            int64_t offset = func_params[f];
+            if (offset <= 0) offset = 1;
+            if (is_f64[f]) {
+                double* out = (double*)ray_data(rvec);
+                for (int64_t i = ps; i < pe; i++) {
+                    int64_t src = i - offset;
+                    if (src >= ps) {
+                        out[sorted_idx[i]] = win_read_f64(fvec, sorted_idx[src]);
+                        if (ray_vec_is_null(fvec, sorted_idx[src]))
+                            win_set_null(rvec, sorted_idx[i]);
+                    } else {
+                        out[sorted_idx[i]] = 0.0;
+                        win_set_null(rvec, sorted_idx[i]);
+                    }
+                }
+            } else {
+                int64_t* out = (int64_t*)ray_data(rvec);
+                for (int64_t i = ps; i < pe; i++) {
+                    int64_t src = i - offset;
+                    if (src >= ps) {
+                        out[sorted_idx[i]] = win_read_i64(fvec, sorted_idx[src]);
+                        if (ray_vec_is_null(fvec, sorted_idx[src]))
+                            win_set_null(rvec, sorted_idx[i]);
+                    } else {
+                        out[sorted_idx[i]] = 0;
+                        win_set_null(rvec, sorted_idx[i]);
+                    }
+                }
+            }
+            break;
+        }
+        /* LEAD(offset): value from `offset` positions later in the partition;
+         * same offset defaulting and NULL handling as LAG, with the bound
+         * check against the partition end. */
+        case RAY_WIN_LEAD: {
+            if (!fvec) break;
+            int64_t offset = func_params[f];
+            if (offset <= 0) offset = 1;
+            if (is_f64[f]) {
+                double* out = (double*)ray_data(rvec);
+                for (int64_t i = ps; i < pe; i++) {
+                    int64_t src = i + offset;
+                    if (src < pe) {
+                        out[sorted_idx[i]] = win_read_f64(fvec, sorted_idx[src]);
+                        if (ray_vec_is_null(fvec, sorted_idx[src]))
+                            win_set_null(rvec, sorted_idx[i]);
+                    } else {
+                        out[sorted_idx[i]] = 0.0;
+                        win_set_null(rvec, sorted_idx[i]);
+                    }
+                }
+            } else {
+                int64_t* out = (int64_t*)ray_data(rvec);
+                for (int64_t i = ps; i < pe; i++) {
+                    int64_t src = i + offset;
+                    if (src < pe) {
+                        out[sorted_idx[i]] = win_read_i64(fvec, sorted_idx[src]);
+                        if (ray_vec_is_null(fvec, sorted_idx[src]))
+                            win_set_null(rvec, sorted_idx[i]);
+                    } else {
+                        out[sorted_idx[i]] = 0;
+                        win_set_null(rvec, sorted_idx[i]);
+                    }
+                }
+            }
+            break;
+        }
+        /* FIRST_VALUE: the input at the first sorted position, broadcast to
+         * every row; if that input is NULL every result row is marked NULL.
+         * The frame is not consulted. */
+        case RAY_WIN_FIRST_VALUE: {
+            if (!fvec) break;
+            bool first_null = ray_vec_is_null(fvec, sorted_idx[ps]);
+            if (is_f64[f]) {
+                double* out = (double*)ray_data(rvec);
+                double first = first_null ? 0.0 : win_read_f64(fvec, sorted_idx[ps]);
+                for (int64_t i = ps; i < pe; i++) {
+                    out[sorted_idx[i]] = first;
+                    if (first_null) win_set_null(rvec, sorted_idx[i]);
+                }
+            } else {
+                int64_t* out = (int64_t*)ray_data(rvec);
+                int64_t first = first_null ? 0 : win_read_i64(fvec, sorted_idx[ps]);
+                for (int64_t i = ps; i < pe; i++) {
+                    out[sorted_idx[i]] = first;
+                    if (first_null) win_set_null(rvec, sorted_idx[i]);
+                }
+            }
+            break;
+        }
+        /* LAST_VALUE: for a whole frame, the input at the last sorted
+         * position broadcast to every row; otherwise each row's own input
+         * (the last row of a frame ending at the current position). */
+        case RAY_WIN_LAST_VALUE: {
+            if (!fvec) break;
+            if (is_f64[f]) {
+                double* out = (double*)ray_data(rvec);
+                if (whole) {
+                    bool lnull = ray_vec_is_null(fvec, sorted_idx[pe - 1]);
+                    double last = lnull ? 0.0 : win_read_f64(fvec, sorted_idx[pe - 1]);
+                    for (int64_t i = ps; i < pe; i++) {
+                        out[sorted_idx[i]] = last;
+                        if (lnull) win_set_null(rvec, sorted_idx[i]);
+                    }
+                } else {
+                    for (int64_t i = ps; i < pe; i++) {
+                        out[sorted_idx[i]] = win_read_f64(fvec, sorted_idx[i]);
+                        if (ray_vec_is_null(fvec, sorted_idx[i]))
+                            win_set_null(rvec, sorted_idx[i]);
+                    }
+                }
+            } else {
+                int64_t* out = (int64_t*)ray_data(rvec);
+                if (whole) {
+                    bool lnull = ray_vec_is_null(fvec, sorted_idx[pe - 1]);
+                    int64_t last = lnull ? 0 : win_read_i64(fvec, sorted_idx[pe - 1]);
+                    for (int64_t i = ps; i < pe; i++) {
+                        out[sorted_idx[i]] = last;
+                        if (lnull) win_set_null(rvec, sorted_idx[i]);
+                    }
+                } else {
+                    for (int64_t i = ps; i < pe; i++) {
+                        out[sorted_idx[i]] = win_read_i64(fvec, sorted_idx[i]);
+                        if (ray_vec_is_null(fvec, sorted_idx[i]))
+                            win_set_null(rvec, sorted_idx[i]);
+                    }
+                }
+            }
+            break;
+        }
+        /* NTH_VALUE(n): the input at 1-based position n of the partition,
+         * broadcast to every row; n < 1 is treated as 1. The result is NULL
+         * for all rows when n exceeds the partition length or that input is
+         * NULL. The `nth > part_len` test must stay first: it short-circuits
+         * the sorted_idx access that would otherwise be out of range. */
+        case RAY_WIN_NTH_VALUE: {
+            if (!fvec) break;
+            int64_t nth = func_params[f];
+            if (nth < 1) nth = 1;
+            bool nth_null = (nth > part_len) ||
+                            ray_vec_is_null(fvec, sorted_idx[ps + nth - 1]);
+            if (is_f64[f]) {
+                double* out = (double*)ray_data(rvec);
+                double val = nth_null ? 0.0 : win_read_f64(fvec, sorted_idx[ps + nth - 1]);
+                for (int64_t i = ps; i < pe; i++) {
+                    out[sorted_idx[i]] = val;
+                    if (nth_null) win_set_null(rvec, sorted_idx[i]);
+                }
+            } else {
+                int64_t* out = (int64_t*)ray_data(rvec);
+                int64_t val = nth_null ? 0 : win_read_i64(fvec, sorted_idx[ps + nth - 1]);
+                for (int64_t i = ps; i < pe; i++) {
+                    out[sorted_idx[i]] = val;
+                    if (nth_null) win_set_null(rvec, sorted_idx[i]);
+                }
+            }
+            break;
+        }
+        } /* switch */
+    } /* for each func */
+}
+
+/* Parallel per-partition window compute context.
+ * Bundles the arguments win_par_fn forwards to win_compute_partition for
+ * each partition index it is given. */
+typedef struct {
+    ray_t** order_vecs;     /* ORDER BY key columns (n_order entries) */
+    uint8_t n_order;        /* number of entries in order_vecs */
+    ray_t** func_vecs;      /* per-function input column; entries may be NULL */
+    uint8_t* func_kinds;    /* per-function RAY_WIN_* kind */
+    int64_t* func_params;   /* per-function integer argument */
+    uint8_t n_funcs;        /* number of window functions */
+    uint8_t frame_start;    /* frame start bound (RAY_BOUND_* value) */
+    uint8_t frame_end;      /* frame end bound (RAY_BOUND_* value) */
+    int64_t* sorted_idx;    /* row indices in sorted order */
+    int64_t* part_offsets;  /* partition p spans sorted_idx[part_offsets[p] .. part_offsets[p + 1]) */
+    ray_t** result_vecs;    /* per-function output vector */
+    bool* is_f64;           /* per-function flag: result stored as double */
+} win_par_ctx_t;
+
+/* Range callback: run win_compute_partition for every partition index in
+ * [start, end). arg is a win_par_ctx_t; partition p covers sorted_idx
+ * positions [part_offsets[p], part_offsets[p + 1]). worker_id is unused. */
+static void win_par_fn(void* arg, uint32_t worker_id,
+                       int64_t start, int64_t end) {
+    win_par_ctx_t* c = (win_par_ctx_t*)arg;
+    (void)worker_id;
+
+    int64_t part = start;
+    while (part < end) {
+        int64_t lo = c->part_offsets[part];
+        int64_t hi = c->part_offsets[part + 1];
+        win_compute_partition(c->order_vecs, c->n_order,
+                              c->func_vecs, c->func_kinds, c->func_params,
+                              c->n_funcs, c->frame_start, c->frame_end,
+                              c->sorted_idx, lo, hi,
+                              c->result_vecs, c->is_f64);
+        part++;
+    }
+}
+
+/* Parallel gather of partition key values into contiguous array.
+ * Eliminates random-access reads during Phase 2 boundary detection.
+ * This struct is the argument block read by pkey_gather_fn. */
+typedef struct {
+    const int64_t* sorted_idx;  /* row indices in sorted order (input) */
+    uint64_t*      pkey_sorted; /* output: encoded key for each sorted position */
+    ray_t**         sort_vecs;  /* key columns; the first n_part entries are read */
+    uint8_t        n_part;      /* number of partition key columns */
+} pkey_gather_ctx_t;
+
+/* Gather callback: for every sorted position i in [start, end), encode the
+ * partition key of row sorted_idx[i] into pkey_sorted[i]. arg is a
+ * pkey_gather_ctx_t; wid is unused.
+ *
+ * Per-column encoding:
+ *   SYM               value returned by ray_read_sym
+ *   I32 / DATE / TIME value with the sign bit flipped (same result as the
+ *                     former `v - INT32_MIN`, but computed in unsigned
+ *                     arithmetic so it cannot overflow a signed int)
+ *   I16               value with the sign bit flipped, zero-extended
+ *   BOOL / U8         the raw byte, zero-extended
+ *   anything else     the raw 64-bit cell (assumed to be an 8-byte type)
+ *
+ * With a single key the encoded value is stored as is. With several keys
+ * each encoded value is truncated to 32 bits and shifted into a 64-bit word.
+ * NOTE(review): in the multi-key form only the last two keys survive the
+ * shifts and 64-bit cells keep only their low 32 bits, so distinct keys can
+ * collide — presumably the caller restricts when this path is used; confirm. */
+static void pkey_gather_fn(void* arg, uint32_t wid,
+                            int64_t start, int64_t end) {
+    (void)wid;
+    pkey_gather_ctx_t* ctx = (pkey_gather_ctx_t*)arg;
+    const int64_t* sidx = ctx->sorted_idx;
+    uint64_t* out = ctx->pkey_sorted;
+
+    if (ctx->n_part == 1) {
+        /* Single key: dispatch on the type once, outside the row loop. */
+        ray_t* pk = ctx->sort_vecs[0];
+        const void* pkd = ray_data(pk);
+        if (RAY_IS_SYM(pk->type)) {
+            for (int64_t i = start; i < end; i++)
+                out[i] = (uint64_t)ray_read_sym(pkd, sidx[i], pk->type, pk->attrs);
+        } else if (pk->type == RAY_I32 || pk->type == RAY_DATE || pk->type == RAY_TIME) {
+            const int32_t* src = (const int32_t*)pkd;
+            for (int64_t i = start; i < end; i++)
+                out[i] = (uint64_t)((uint32_t)src[sidx[i]] ^ 0x80000000u);
+        } else if (pk->type == RAY_I16) {
+            /* 2-byte cells: must not be read through a 64-bit pointer. */
+            const int16_t* src = (const int16_t*)pkd;
+            for (int64_t i = start; i < end; i++)
+                out[i] = (uint64_t)(uint16_t)((uint16_t)src[sidx[i]] ^ 0x8000u);
+        } else if (pk->type == RAY_BOOL || pk->type == RAY_U8) {
+            /* 1-byte cells: must not be read through a 64-bit pointer. */
+            const uint8_t* src = (const uint8_t*)pkd;
+            for (int64_t i = start; i < end; i++)
+                out[i] = (uint64_t)src[sidx[i]];
+        } else {
+            const uint64_t* src = (const uint64_t*)pkd;
+            for (int64_t i = start; i < end; i++)
+                out[i] = src[sidx[i]];
+        }
+    } else {
+        for (int64_t i = start; i < end; i++) {
+            int64_t r = sidx[i];
+            uint64_t key = 0;
+            for (uint8_t k = 0; k < ctx->n_part; k++) {
+                ray_t* col = ctx->sort_vecs[k];
+                const void* d = ray_data(col);
+                uint32_t part;
+                if (RAY_IS_SYM(col->type))
+                    part = (uint32_t)ray_read_sym(d, r, col->type, col->attrs);
+                else if (col->type == RAY_I32 || col->type == RAY_DATE || col->type == RAY_TIME)
+                    part = (uint32_t)((const int32_t*)d)[r] ^ 0x80000000u;
+                else if (col->type == RAY_I16)
+                    part = (uint32_t)(uint16_t)((uint16_t)((const int16_t*)d)[r] ^ 0x8000u);
+                else if (col->type == RAY_BOOL || col->type == RAY_U8)
+                    part = (uint32_t)((const uint8_t*)d)[r];
+                else
+                    part = (uint32_t)((const uint64_t*)d)[r];
+                key = (key << 32) | part;
+            }
+            out[i] = key;
+        }
+    }
+}
+
+ray_t* exec_window(ray_graph_t* g, ray_op_t* op, ray_t* tbl) {
+    if (!tbl || RAY_IS_ERR(tbl)) return tbl;
+
+    ray_op_ext_t* ext = find_ext(g, op->id);
+    if (!ext) return ray_error("nyi", NULL);
+
+    int64_t nrows = ray_table_nrows(tbl);
+    int64_t ncols = ray_table_ncols(tbl);
+    uint8_t n_part  = ext->window.n_part_keys;
+    uint8_t n_order = ext->window.n_order_keys;
+    uint8_t n_funcs = ext->window.n_funcs;
+    /* Guard against uint8_t overflow on n_part + n_order */
+    if ((uint16_t)n_part + n_order > 255)
+        return ray_error("nyi", NULL);
+    uint8_t n_sort  = n_part + n_order;
+
+    if (nrows == 0 || n_funcs == 0) {
+        ray_retain(tbl);
+        return tbl;
+    }
+
+    /* --- Phase 0: Resolve key and func_input vectors --- */
+    /* VLAs below are bounded by uint8_t limits (max 255 each),
+     * so max ~10KB on stack; bounded by uint8_t limits. */
+    ray_t* sort_vecs[n_sort > 0 ? n_sort : 1];
+    uint8_t sort_owned[n_sort > 0 ? n_sort : 1];
+    uint8_t sort_descs[n_sort > 0 ? n_sort : 1];
+    memset(sort_owned, 0, sizeof(sort_owned));
+    memset(sort_descs, 0, sizeof(sort_descs));
+
+    for (uint8_t k = 0; k < n_part; k++) {
+        sort_vecs[k] = win_resolve_vec(g, ext->window.part_keys[k], tbl,
+                                        &sort_owned[k]);
+        sort_descs[k] = 0;  /* partition keys always ASC */
+        if (!sort_vecs[k] || RAY_IS_ERR(sort_vecs[k])) {
+            ray_t* err = sort_vecs[k] ? sort_vecs[k] : ray_error("nyi", NULL);
+            for (uint8_t j = 0; j < k; j++)
+                if (sort_owned[j] && sort_vecs[j] && !RAY_IS_ERR(sort_vecs[j]))
+                    ray_release(sort_vecs[j]);
+            return err;
+        }
+    }
+    for (uint8_t k = 0; k < n_order; k++) {
+        sort_vecs[n_part + k] = win_resolve_vec(g, ext->window.order_keys[k],
+                                                 tbl, &sort_owned[n_part + k]);
+        sort_descs[n_part + k] = ext->window.order_descs[k];
+        if (!sort_vecs[n_part + k] || RAY_IS_ERR(sort_vecs[n_part + k])) {
+            ray_t* err = sort_vecs[n_part + k] ? sort_vecs[n_part + k]
+                                               : ray_error("nyi", NULL);
+            for (uint8_t j = 0; j < n_part + k; j++)
+                if (sort_owned[j] && sort_vecs[j] && !RAY_IS_ERR(sort_vecs[j]))
+                    ray_release(sort_vecs[j]);
+            return err;
+        }
+    }
+
+    ray_t* func_vecs[n_funcs];
+    uint8_t func_owned[n_funcs];
+    ray_t* result_vecs[n_funcs];
+    bool is_f64[n_funcs];
+    memset(func_owned, 0, sizeof(func_owned));
+    memset(result_vecs, 0, sizeof(result_vecs));
+    for (uint8_t f = 0; f < n_funcs; f++) {
+        ray_op_t* fi = ext->window.func_inputs[f];
+        if (fi) {
+            func_vecs[f] = win_resolve_vec(g, fi, tbl, &func_owned[f]);
+            if (!func_vecs[f] || RAY_IS_ERR(func_vecs[f])) {
+                ray_t* err = func_vecs[f] ? func_vecs[f] : ray_error("nyi", NULL);
+                for (uint8_t j = 0; j < f; j++)
+                    if (func_owned[j] && func_vecs[j] && !RAY_IS_ERR(func_vecs[j]))
+                        ray_release(func_vecs[j]);
+                for (uint8_t j = 0; j < n_sort; j++)
+                    if (sort_owned[j] && sort_vecs[j] && !RAY_IS_ERR(sort_vecs[j]))
+                        ray_release(sort_vecs[j]);
+                return err;
+            }
+        } else {
+            func_vecs[f] = NULL;
+        }
+    }
+
+    /* --- Phase 1: Sort by (partition_keys ++ order_keys) --- */
+    ray_t* radix_itmp_hdr = NULL;
+    ray_t* win_enum_rank_hdrs[n_sort > 0 ? n_sort : 1];
+    memset(win_enum_rank_hdrs, 0, sizeof(win_enum_rank_hdrs));
+
+    ray_t* indices_hdr = NULL;
+    int64_t* indices = (int64_t*)scratch_alloc(&indices_hdr,
+                                (size_t)nrows * sizeof(int64_t));
+    if (!indices) goto oom;
+    for (int64_t i = 0; i < nrows; i++) indices[i] = i;
+
+    int64_t* sorted_idx = indices;
+
+    if (n_sort > 0 && nrows <= 64) {
+        sort_cmp_ctx_t cmp_ctx = {
+            .vecs = sort_vecs, .desc = sort_descs,
+            .nulls_first = NULL, .n_sort = n_sort,
+        };
+        sort_insertion(&cmp_ctx, indices, nrows);
+    } else if (n_sort > 0) {
+        /* --- Radix sort fast path --- */
+        bool can_radix = true;
+        for (uint8_t k = 0; k < n_sort; k++) {
+            if (!sort_vecs[k]) { can_radix = false; break; }
+            int8_t t = sort_vecs[k]->type;
+            if (t != RAY_I64 && t != RAY_F64 && t != RAY_I32 && t != RAY_I16 &&
+                t != RAY_BOOL && t != RAY_U8 && t != RAY_SYM &&
+                t != RAY_DATE && t != RAY_TIME && t != RAY_TIMESTAMP) {
+                can_radix = false; break;
+            }
+        }
+        bool radix_done = false;
+
+        if (can_radix) {
+            ray_pool_t* pool = ray_pool_get();
+
+            /* Build SYM rank mappings */
+            uint32_t* enum_ranks[n_sort];
+            memset(enum_ranks, 0, n_sort * sizeof(uint32_t*));
+            for (uint8_t k = 0; k < n_sort; k++) {
+                if (RAY_IS_SYM(sort_vecs[k]->type)) {
+                    enum_ranks[k] = build_enum_rank(sort_vecs[k], nrows,
+                                                     &win_enum_rank_hdrs[k]);
+                    if (!enum_ranks[k]) { can_radix = false; break; }
+                }
+            }
+
+            if (can_radix && n_sort == 1) {
+                /* Single-key sort */
+                uint8_t key_nbytes = radix_key_bytes(sort_vecs[0]->type);
+                ray_pool_t* sk_pool = (nrows >= SMALL_POOL_THRESHOLD) ? pool : NULL;
+                ray_t *keys_hdr;
+                uint64_t* keys = (uint64_t*)scratch_alloc(&keys_hdr,
+                                    (size_t)nrows * sizeof(uint64_t));
+                if (keys) {
+                    radix_encode_ctx_t enc = {
+                        .keys = keys, .data = ray_data(sort_vecs[0]),
+                        .col = sort_vecs[0],
+                        .type = sort_vecs[0]->type,
+                        .col_attrs = sort_vecs[0]->attrs,
+                        .desc = sort_descs[0],
+                        .nulls_first = sort_descs[0], /* default: NULLS FIRST for DESC */
+                        .enum_rank = enum_ranks[0], .n_keys = 1,
+                    };
+                    if (sk_pool)
+                        ray_pool_dispatch(sk_pool, radix_encode_fn, &enc, nrows);
+                    else
+                        radix_encode_fn(&enc, 0, 0, nrows);
+
+                    if (nrows <= RADIX_SORT_THRESHOLD) {
+                        key_introsort(keys, indices, nrows);
+                        sorted_idx = indices;
+                        radix_done = true;
+                    } else {
+                        ray_t *ktmp_hdr, *itmp_hdr;
+                        uint64_t* ktmp = (uint64_t*)scratch_alloc(&ktmp_hdr,
+                                            (size_t)nrows * sizeof(uint64_t));
+                        int64_t*  itmp = (int64_t*)scratch_alloc(&itmp_hdr,
+                                            (size_t)nrows * sizeof(int64_t));
+                        if (ktmp && itmp) {
+                            sorted_idx = radix_sort_run(sk_pool, keys, indices,
+                                                         ktmp, itmp, nrows,
+                                                         key_nbytes, NULL);
+                            radix_done = (sorted_idx != NULL);
+                        }
+                        scratch_free(ktmp_hdr);
+                        if (sorted_idx != itmp) scratch_free(itmp_hdr);
+                        else radix_itmp_hdr = itmp_hdr;
+                    }
+                }
+                scratch_free(keys_hdr);
+            } else if (can_radix && n_sort > 1) {
+                /* Multi-key composite radix sort */
+                ray_pool_t* pool2 = pool;
+                int64_t mins[n_sort], maxs[n_sort];
+                uint8_t total_bits = 0;
+                bool fits = true;
+
+                ray_pool_t* mk_prescan_pool2 = (nrows >= SMALL_POOL_THRESHOLD) ? pool2 : NULL;
+                if (n_sort <= MK_PRESCAN_MAX_KEYS && mk_prescan_pool2) {
+                    uint32_t nw = ray_pool_total_workers(mk_prescan_pool2);
+                    size_t pw_count = (size_t)nw * n_sort;
+                    int64_t pw_mins_stack[512], pw_maxs_stack[512];
+                    ray_t *pw_mins_hdr = NULL, *pw_maxs_hdr = NULL;
+                    int64_t* pw_mins = (pw_count <= 512)
+                        ? pw_mins_stack
+                        : (int64_t*)scratch_alloc(&pw_mins_hdr, pw_count * sizeof(int64_t));
+                    int64_t* pw_maxs = (pw_count <= 512)
+                        ? pw_maxs_stack
+                        : (int64_t*)scratch_alloc(&pw_maxs_hdr, pw_count * sizeof(int64_t));
+                    for (size_t i = 0; i < pw_count; i++) {
+                        pw_mins[i] = INT64_MAX;
+                        pw_maxs[i] = INT64_MIN;
+                    }
+                    mk_prescan_ctx_t pctx = {
+                        .vecs = sort_vecs, .enum_ranks = enum_ranks,
+                        .n_keys = n_sort, .nrows = nrows, .n_workers = nw,
+                        .pw_mins = pw_mins, .pw_maxs = pw_maxs,
+                    };
+                    ray_pool_dispatch(mk_prescan_pool2, mk_prescan_fn, &pctx, nrows);
+
+                    for (uint8_t k = 0; k < n_sort; k++) {
+                        int64_t kmin = INT64_MAX, kmax = INT64_MIN;
+                        for (uint32_t w = 0; w < nw; w++) {
+                            int64_t wmin = pw_mins[w * n_sort + k];
+                            int64_t wmax = pw_maxs[w * n_sort + k];
+                            if (wmin < kmin) kmin = wmin;
+                            if (wmax > kmax) kmax = wmax;
+                        }
+                        mins[k] = kmin;
+                        maxs[k] = kmax;
+                        uint64_t range = (uint64_t)(kmax - kmin);
+                        uint8_t bits = 1;
+                        while (((uint64_t)1 << bits) <= range && bits < 64)
+                            bits++;
+                        total_bits += bits;
+                    }
+                    if (pw_mins_hdr) scratch_free(pw_mins_hdr);
+                    if (pw_maxs_hdr) scratch_free(pw_maxs_hdr);
+                } else {
+                    for (uint8_t k = 0; k < n_sort; k++) {
+                        ray_t* col = sort_vecs[k];
+                        int64_t kmin = INT64_MAX, kmax = INT64_MIN;
+                        if (enum_ranks[k]) {
+                            const void* cdata = ray_data(col);
+                            int8_t ctype = col->type;
+                            uint8_t cattrs = col->attrs;
+                            for (int64_t i = 0; i < nrows; i++) {
+                                uint32_t raw = (uint32_t)ray_read_sym(cdata, i, ctype, cattrs);
+                                int64_t v = (int64_t)enum_ranks[k][raw];
+                                if (v < kmin) kmin = v;
+                                if (v > kmax) kmax = v;
+                            }
+                        } else if (col->type == RAY_I64 || col->type == RAY_TIMESTAMP) {
+                            const int64_t* d = (const int64_t*)ray_data(col);
+                            for (int64_t i = 0; i < nrows; i++) {
+                                if (d[i] < kmin) kmin = d[i];
+                                if (d[i] > kmax) kmax = d[i];
+                            }
+                        } else if (col->type == RAY_I32 || col->type == RAY_DATE || col->type == RAY_TIME) {
+                            const int32_t* d = (const int32_t*)ray_data(col);
+                            for (int64_t i = 0; i < nrows; i++) {
+                                if (d[i] < kmin) kmin = (int64_t)d[i];
+                                if (d[i] > kmax) kmax = (int64_t)d[i];
+                            }
+                        } else if (col->type == RAY_I16) {
+                            const int16_t* d = (const int16_t*)ray_data(col);
+                            for (int64_t i = 0; i < nrows; i++) {
+                                if (d[i] < kmin) kmin = (int64_t)d[i];
+                                if (d[i] > kmax) kmax = (int64_t)d[i];
+                            }
+                        } else if (col->type == RAY_BOOL || col->type == RAY_U8) {
+                            const uint8_t* d = (const uint8_t*)ray_data(col);
+                            for (int64_t i = 0; i < nrows; i++) {
+                                if (d[i] < kmin) kmin = (int64_t)d[i];
+                                if (d[i] > kmax) kmax = (int64_t)d[i];
+                            }
+                        }
+                        mins[k] = kmin;
+                        maxs[k] = kmax;
+                        uint64_t range = (uint64_t)(kmax - kmin);
+                        uint8_t bits = 1;
+                        while (((uint64_t)1 << bits) <= range && bits < 64)
+                            bits++;
+                        total_bits += bits;
+                    }
+                }
+
+                if (total_bits > 64) fits = false;
+
+                if (fits) {
+                    uint8_t bit_shifts[n_sort];
+                    uint8_t accum = 0;
+                    for (int k = n_sort - 1; k >= 0; k--) {
+                        bit_shifts[k] = accum;
+                        uint64_t range = (uint64_t)(maxs[k] - mins[k]);
+                        uint8_t bits = 1;
+                        while (((uint64_t)1 << bits) <= range && bits < 64)
+                            bits++;
+                        accum += bits;
+                    }
+
+                    uint8_t comp_nbytes = (total_bits + 7) / 8;
+                    if (comp_nbytes < 1) comp_nbytes = 1;
+                    ray_pool_t* mk_pool = (nrows >= SMALL_POOL_THRESHOLD) ? pool2 : NULL;
+
+                    ray_t *keys_hdr;
+                    uint64_t* keys = (uint64_t*)scratch_alloc(&keys_hdr,
+                                        (size_t)nrows * sizeof(uint64_t));
+                    if (keys) {
+                        radix_encode_ctx_t enc = {
+                            .keys = keys, .n_keys = n_sort, .vecs = sort_vecs,
+                        };
+                        for (uint8_t k = 0; k < n_sort; k++) {
+                            enc.mins[k] = mins[k];
+                            enc.ranges[k] = maxs[k] - mins[k];
+                            enc.bit_shifts[k] = bit_shifts[k];
+                            enc.descs[k] = sort_descs[k];
+                            enc.enum_ranks[k] = enum_ranks[k];
+                        }
+                        if (mk_pool)
+                            ray_pool_dispatch(mk_pool, radix_encode_fn, &enc, nrows);
+                        else
+                            radix_encode_fn(&enc, 0, 0, nrows);
+
+                        if (nrows <= RADIX_SORT_THRESHOLD) {
+                            key_introsort(keys, indices, nrows);
+                            sorted_idx = indices;
+                            radix_done = true;
+                        } else {
+                            ray_t *ktmp_hdr, *itmp_hdr;
+                            uint64_t* ktmp = (uint64_t*)scratch_alloc(&ktmp_hdr,
+                                                (size_t)nrows * sizeof(uint64_t));
+                            int64_t*  itmp = (int64_t*)scratch_alloc(&itmp_hdr,
+                                                (size_t)nrows * sizeof(int64_t));
+                            if (ktmp && itmp) {
+                                sorted_idx = radix_sort_run(mk_pool, keys, indices,
+                                                             ktmp, itmp, nrows,
+                                                             comp_nbytes, NULL);
+                                radix_done = (sorted_idx != NULL);
+                            }
+                            scratch_free(ktmp_hdr);
+                            if (sorted_idx != itmp) scratch_free(itmp_hdr);
+                            else radix_itmp_hdr = itmp_hdr;
+                        }
+                    }
+                    scratch_free(keys_hdr);
+                }
+            }
+        }
+
+        /* --- Merge sort fallback --- */
+        if (!radix_done) {
+            sort_cmp_ctx_t cmp_ctx = {
+                .vecs = sort_vecs, .desc = sort_descs,
+                .nulls_first = NULL, .n_sort = n_sort,
+            };
+            ray_t* tmp_hdr;
+            int64_t* tmp = (int64_t*)scratch_alloc(&tmp_hdr,
+                                (size_t)nrows * sizeof(int64_t));
+            if (!tmp) { scratch_free(indices_hdr); indices_hdr = NULL; goto oom; }
+
+            ray_pool_t* pool = ray_pool_get();
+            uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
+            if (pool && nw > 1 && nrows > 1024) {
+                sort_phase1_ctx_t p1ctx = {
+                    .cmp_ctx = &cmp_ctx, .indices = indices, .tmp = tmp,
+                    .nrows = nrows, .n_chunks = nw,
+                };
+                ray_pool_dispatch_n(pool, sort_phase1_fn, &p1ctx, nw);
+
+                int64_t chunk_size = (nrows + nw - 1) / nw;
+                int64_t run_size = chunk_size;
+                int64_t* src = indices;
+                int64_t* dst = tmp;
+                while (run_size < nrows) {
+                    int64_t n_pairs = (nrows + 2 * run_size - 1) / (2 * run_size);
+                    sort_merge_ctx_t mctx = {
+                        .cmp_ctx = &cmp_ctx, .src = src, .dst = dst,
+                        .nrows = nrows, .run_size = run_size,
+                    };
+                    if (n_pairs > 1)
+                        ray_pool_dispatch_n(pool, sort_merge_fn, &mctx,
+                                            (uint32_t)n_pairs);
+                    else
+                        sort_merge_fn(&mctx, 0, 0, n_pairs);
+                    int64_t* t = src; src = dst; dst = t;
+                    run_size *= 2;
+                }
+                if (src != indices)
+                    memcpy(indices, src, (size_t)nrows * sizeof(int64_t));
+            } else {
+                sort_merge_recursive(&cmp_ctx, indices, tmp, nrows);
+            }
+            scratch_free(tmp_hdr);
+            sorted_idx = indices;
+        }
+    }
+
+    /* --- Phase 2: Find partition boundaries --- */
+    /* Overallocate part_offsets to worst case (single-pass, no counting pass) */
+    ray_t* poff_hdr = NULL;
+    int64_t* part_offsets = (int64_t*)scratch_alloc(&poff_hdr,
+                                (size_t)(nrows + 1) * sizeof(int64_t));
+    if (!part_offsets) { scratch_free(indices_hdr); goto oom; }
+
+    part_offsets[0] = 0;
+    int64_t n_parts = 0;
+
+    if (n_part > 0) {
+        /* Check if we can pack partition keys into uint64 for fast gather.
+         * Multi-key packing shifts each key by 32 bits, so any key requiring
+         * >32 bits in a multi-key scenario would be truncated.  Force fallback
+         * when any 64-bit key appears alongside other keys. */
+        uint8_t pk_bits = 0;
+        bool can_pack = true;
+        bool has_64bit_key = false;
+        for (uint8_t k = 0; k < n_part; k++) {
+            int8_t t = sort_vecs[k]->type;
+            if (RAY_IS_SYM(t) || t == RAY_I32 || t == RAY_DATE || t == RAY_TIME) pk_bits += 32;
+            else if (t == RAY_I64 || t == RAY_SYM || t == RAY_TIMESTAMP ||
+                     t == RAY_F64) { pk_bits += 64; has_64bit_key = true; }
+            else { can_pack = false; break; }
+            if (pk_bits > 64) { can_pack = false; break; }
+        }
+        /* If multi-key with any 64-bit type, the <<32 packing truncates.
+         * Force sequential fallback for correctness. */
+        if (can_pack && n_part > 1 && has_64bit_key) can_pack = false;
+
+        ray_t* pkey_hdr = NULL;
+        uint64_t* pkey_sorted = can_pack ?
+            (uint64_t*)scratch_alloc(&pkey_hdr, (size_t)nrows * sizeof(uint64_t))
+            : NULL;
+
+        if (pkey_sorted) {
+            /* Parallel gather partition keys into contiguous array */
+            pkey_gather_ctx_t gctx = {
+                .sorted_idx = sorted_idx, .pkey_sorted = pkey_sorted,
+                .sort_vecs = sort_vecs, .n_part = n_part,
+            };
+            ray_pool_t* gpool = ray_pool_get();
+            if (gpool)
+                ray_pool_dispatch(gpool, pkey_gather_fn, &gctx, nrows);
+            else
+                pkey_gather_fn(&gctx, 0, 0, nrows);
+
+            /* Sequential scan on contiguous data (no random access) */
+            for (int64_t i = 1; i < nrows; i++)
+                if (pkey_sorted[i] != pkey_sorted[i - 1])
+                    part_offsets[++n_parts] = i;
+
+            scratch_free(pkey_hdr);
+        } else {
+            /* Fallback: single-pass random-access comparison */
+            for (int64_t i = 1; i < nrows; i++)
+                if (win_keys_differ(sort_vecs, n_part,
+                                    sorted_idx[i - 1], sorted_idx[i]))
+                    part_offsets[++n_parts] = i;
+        }
+        part_offsets[++n_parts] = nrows;
+    } else {
+        /* No partition keys: entire table is one partition.
+         * Minor memory waste (part_offsets sized for nrows+1) but no
+         * correctness issue — only indices 0 and 1 are used. */
+        part_offsets[1] = nrows;
+        n_parts = 1;
+    }
+
+    /* Check cancellation before expensive per-partition compute */
+    {
+        ray_pool_t* cpool = ray_pool_get();
+        if (pool_cancelled(cpool)) {
+            scratch_free(poff_hdr);
+            scratch_free(indices_hdr);
+            if (radix_itmp_hdr) scratch_free(radix_itmp_hdr);
+            for (uint8_t k = 0; k < n_sort; k++)
+                if (win_enum_rank_hdrs[k]) scratch_free(win_enum_rank_hdrs[k]);
+            for (uint8_t k = 0; k < n_sort; k++)
+                if (sort_owned[k] && sort_vecs[k] && !RAY_IS_ERR(sort_vecs[k]))
+                    ray_release(sort_vecs[k]);
+            for (uint8_t f = 0; f < n_funcs; f++)
+                if (func_owned[f] && func_vecs[f] && !RAY_IS_ERR(func_vecs[f]))
+                    ray_release(func_vecs[f]);
+            return ray_error("cancel", NULL);
+        }
+    }
+
+    /* --- Phase 3: Allocate result vectors and compute per-partition --- */
+    for (uint8_t f = 0; f < n_funcs; f++) {
+        uint8_t kind = ext->window.func_kinds[f];
+        ray_t* fvec = func_vecs[f];
+
+        bool out_f64 = false;
+        if (kind == RAY_WIN_AVG) {
+            out_f64 = true;
+        } else if (kind == RAY_WIN_SUM || kind == RAY_WIN_MIN ||
+                   kind == RAY_WIN_MAX || kind == RAY_WIN_LAG ||
+                   kind == RAY_WIN_LEAD || kind == RAY_WIN_FIRST_VALUE ||
+                   kind == RAY_WIN_LAST_VALUE || kind == RAY_WIN_NTH_VALUE) {
+            out_f64 = fvec && fvec->type == RAY_F64;
+        }
+
+        is_f64[f] = out_f64;
+        result_vecs[f] = ray_vec_new(out_f64 ? RAY_F64 : RAY_I64, nrows);
+        if (!result_vecs[f] || RAY_IS_ERR(result_vecs[f])) {
+            for (uint8_t j = 0; j < f; j++) ray_release(result_vecs[j]);
+            scratch_free(poff_hdr);
+            scratch_free(indices_hdr);
+            goto oom;
+        }
+        result_vecs[f]->len = nrows;
+        memset(ray_data(result_vecs[f]), 0, (size_t)nrows * 8);
+    }
+
+    /* Order key vectors start at sort_vecs[n_part] */
+    ray_t** order_vecs = n_order > 0 ? &sort_vecs[n_part] : NULL;
+
+    {
+        /* Pre-allocate nullmaps so win_set_null works in both paths.
+         * On OOM, force sequential path where win_set_null falls back
+         * to single-threaded ray_vec_set_null. */
+        bool nullmaps_ok = true;
+        for (uint8_t f = 0; f < n_funcs; f++) {
+            if (win_prepare_nullmap(result_vecs[f]) != RAY_OK)
+                nullmaps_ok = false;
+        }
+
+        ray_pool_t* p3pool = ray_pool_get();
+        if (p3pool && n_parts > 1 && nullmaps_ok) {
+            win_par_ctx_t pctx = {
+                .order_vecs = order_vecs, .n_order = n_order,
+                .func_vecs = func_vecs, .func_kinds = ext->window.func_kinds,
+                .func_params = ext->window.func_params, .n_funcs = n_funcs,
+                .frame_start = ext->window.frame_start,
+                .frame_end = ext->window.frame_end,
+                .sorted_idx = sorted_idx, .part_offsets = part_offsets,
+                .result_vecs = result_vecs, .is_f64 = is_f64,
+            };
+            ray_pool_dispatch_n(p3pool, win_par_fn, &pctx, (uint32_t)n_parts);
+        } else {
+            for (int64_t p = 0; p < n_parts; p++) {
+                win_compute_partition(
+                    order_vecs, n_order,
+                    func_vecs, ext->window.func_kinds, ext->window.func_params,
+                    n_funcs, ext->window.frame_start, ext->window.frame_end,
+                    sorted_idx, part_offsets[p], part_offsets[p + 1],
+                    result_vecs, is_f64);
+            }
+        }
+
+        /* Set RAY_ATTR_HAS_NULLS on vectors that actually received nulls */
+        for (uint8_t f = 0; f < n_funcs; f++)
+            win_finalize_nulls(result_vecs[f]);
+    }
+
+    /* --- Phase 4: Build result table --- */
+    ray_t* result = ray_table_new(ncols + n_funcs);
+    if (!result || RAY_IS_ERR(result)) {
+        for (uint8_t f = 0; f < n_funcs; f++) ray_release(result_vecs[f]);
+        scratch_free(poff_hdr);
+        scratch_free(indices_hdr);
+        goto oom;
+    }
+
+    /* Pass-through original columns */
+    for (int64_t c = 0; c < ncols; c++) {
+        ray_t* col = ray_table_get_col_idx(tbl, c);
+        if (!col) continue;
+        int64_t name_id = ray_table_col_name(tbl, c);
+        ray_retain(col);
+        result = ray_table_add_col(result, name_id, col);
+        ray_release(col);
+    }
+
+    /* Add window result columns with auto-generated names */
+    for (uint8_t f = 0; f < n_funcs; f++) {
+        char buf[16] = "_w";
+        int pos = 2;
+        if (f >= 100) buf[pos++] = '0' + (f / 100);
+        if (f >= 10)  buf[pos++] = '0' + ((f / 10) % 10);
+        buf[pos++] = '0' + (f % 10);
+        buf[pos] = '\0';
+        int64_t name_id = ray_sym_intern(buf, (size_t)pos);
+        result = ray_table_add_col(result, name_id, result_vecs[f]);
+        ray_release(result_vecs[f]);
+    }
+
+    scratch_free(poff_hdr);
+    if (radix_itmp_hdr) scratch_free(radix_itmp_hdr);
+    scratch_free(indices_hdr);
+    for (uint8_t k = 0; k < n_sort; k++)
+        if (win_enum_rank_hdrs[k]) scratch_free(win_enum_rank_hdrs[k]);
+
+    /* Free owned key/func vectors */
+    for (uint8_t k = 0; k < n_sort; k++)
+        if (sort_owned[k] && sort_vecs[k] && !RAY_IS_ERR(sort_vecs[k]))
+            ray_release(sort_vecs[k]);
+    for (uint8_t f = 0; f < n_funcs; f++)
+        if (func_owned[f] && func_vecs[f] && !RAY_IS_ERR(func_vecs[f]))
+            ray_release(func_vecs[f]);
+
+    return result;
+
+oom:
+    if (radix_itmp_hdr) scratch_free(radix_itmp_hdr);
+    for (uint8_t k = 0; k < n_sort; k++)
+        if (win_enum_rank_hdrs[k]) scratch_free(win_enum_rank_hdrs[k]);
+    for (uint8_t k = 0; k < n_sort; k++)
+        if (sort_owned[k] && sort_vecs[k] && !RAY_IS_ERR(sort_vecs[k]))
+            ray_release(sort_vecs[k]);
+    for (uint8_t f = 0; f < n_funcs; f++) {
+        if (func_owned[f] && func_vecs[f] && !RAY_IS_ERR(func_vecs[f]))
+            ray_release(func_vecs[f]);
+        if (result_vecs[f] && !RAY_IS_ERR(result_vecs[f]))
+            ray_release(result_vecs[f]);
+    }
+    return ray_error("oom", NULL);
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/col.c b/crates/rayforce-sys/vendor/rayforce/src/store/col.c
new file mode 100644
index 0000000..e590ee7
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/store/col.c
@@ -0,0 +1,954 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "col.h"
+#include "core/platform.h"
+#include "mem/heap.h"
+#include "store/serde.h"
+#include "store/fileio.h"
+#include "table/sym.h"
+#include "ops/idxop.h"
+#include <string.h>
+#include <stdio.h>
+#include <stdatomic.h>
+
+/* --------------------------------------------------------------------------
+ * validate_sym_bounds -- reject RAY_SYM data holding an out-of-range index
+ *
+ * data/len   raw index payload and its element count
+ * attrs      column attrs; the RAY_SYM_W_MASK bits select the index width
+ * sym_count  exclusive upper limit for a valid index
+ *
+ * Returns RAY_OK when len is 0, when sym_count is 0 (check skipped so raw
+ * columns can load in tests without a sym file), or when every index is
+ * < sym_count. Returns RAY_ERR_CORRUPT for an index >= sym_count, a negative
+ * 64-bit index, or an unrecognised width.
+ * -------------------------------------------------------------------------- */
+
+static ray_err_t validate_sym_bounds(const void* data, int64_t len,
+                                     uint8_t attrs, uint32_t sym_count) {
+    if (len == 0 || sym_count == 0) return RAY_OK;
+
+    const int width = attrs & RAY_SYM_W_MASK;
+    uint64_t largest = 0;  /* running maximum over every index scanned */
+
+    if (width == RAY_SYM_W8) {
+        const uint8_t* ids = (const uint8_t*)data;
+        for (int64_t k = 0; k < len; k++)
+            largest = ids[k] > largest ? ids[k] : largest;
+    } else if (width == RAY_SYM_W16) {
+        const uint16_t* ids = (const uint16_t*)data;
+        for (int64_t k = 0; k < len; k++)
+            largest = ids[k] > largest ? ids[k] : largest;
+    } else if (width == RAY_SYM_W32) {
+        const uint32_t* ids = (const uint32_t*)data;
+        for (int64_t k = 0; k < len; k++)
+            largest = ids[k] > largest ? ids[k] : largest;
+    } else if (width == RAY_SYM_W64) {
+        const int64_t* ids = (const int64_t*)data;
+        for (int64_t k = 0; k < len; k++) {
+            /* Reject negatives before the cast turns them into huge ids. */
+            if (ids[k] < 0) return RAY_ERR_CORRUPT;
+            uint64_t id = (uint64_t)ids[k];
+            largest = id > largest ? id : largest;
+        }
+    } else {
+        return RAY_ERR_CORRUPT;
+    }
+
+    /* One comparison against the maximum covers every element at once. */
+    return largest < sym_count ? RAY_OK : RAY_ERR_CORRUPT;
+}
+
+/* Magic numbers for extended column formats */
+#define STR_LIST_MAGIC  0x4C525453U  /* "STRL" */
+#define STR_VEC_MAGIC   0x56525453U  /* "STRV" */
+#define LIST_MAGIC      0x4754534CU  /* "LSTG" */
+#define TABLE_MAGIC     0x4C425454U  /* "TTBL" */
+
+/* --------------------------------------------------------------------------
+ * Column file format:
+ *   Bytes 0-15:  nullmap (inline) or zeroed (ext_nullmap / no nulls)
+ *   Bytes 16-31: mmod=0, order=0, type, attrs, rc=0, len
+ *   Bytes 32+:   raw element data
+ *   (if RAY_ATTR_NULLMAP_EXT): appended (len+7)/8 bitmap bytes
+ *
+ * On-disk format IS the in-memory format (zero deserialization on load).
+ * -------------------------------------------------------------------------- */
+
+/* Allowlist of types whose vector payload can be written to disk verbatim.
+ * Each type code compared against below yields true; every other code --
+ * the pointer-bearing STR, LIST and TABLE included -- yields false. */
+static bool is_serializable_type(int8_t t) {
+    /* Boolean, integer and float element types. */
+    if (t == RAY_BOOL || t == RAY_U8 || t == RAY_I16 ||
+        t == RAY_I32 || t == RAY_I64 || t == RAY_F64)
+        return true;
+    /* Temporal values and GUIDs. */
+    if (t == RAY_DATE || t == RAY_TIME || t == RAY_TIMESTAMP || t == RAY_GUID)
+        return true;
+    /* Symbol columns. */
+    return t == RAY_SYM;
+}
+
+/* --------------------------------------------------------------------------
+ * is_str_list -- true iff v is a RAY_LIST whose every slot is a -RAY_STR atom
+ * (a NULL/error v or slot gives false; a list with no slots gives true)
+ * -------------------------------------------------------------------------- */
+
+static bool is_str_list(ray_t* v) {
+    if (!v || RAY_IS_ERR(v) || v->type != RAY_LIST) return false;
+
+    ray_t** cursor = (ray_t**)ray_data(v);
+    for (int64_t left = v->len; left > 0; left--, cursor++) {
+        ray_t* item = *cursor;
+        if (!item || RAY_IS_ERR(item) || item->type != -RAY_STR) return false;
+    }
+    return true;
+}
+
+/* --------------------------------------------------------------------------
+ * col_save_str_list -- write a list of string atoms to f. Returns RAY_OK, or
+ * RAY_ERR_IO when a write fails or a string exceeds the 4B length field.
+ * Format: [4B magic "STRL"][8B count][for each: 4B len + data bytes]
+ * -------------------------------------------------------------------------- */
+
+static ray_err_t col_save_str_list(ray_t* list, FILE* f) {
+    uint32_t magic = STR_LIST_MAGIC;
+    int64_t count = list->len;
+    if (fwrite(&magic, 4, 1, f) != 1) return RAY_ERR_IO;
+    if (fwrite(&count, 8, 1, f) != 1) return RAY_ERR_IO;
+
+    ray_t** slots = (ray_t**)ray_data(list);
+    for (int64_t i = 0; i < count; i++) {
+        const char* sp = ray_str_ptr(slots[i]);
+        size_t slen = ray_str_len(slots[i]);
+        /* Narrowing a longer string would write a len that desyncs the loader. */
+        if (slen > UINT32_MAX) return RAY_ERR_IO;
+        uint32_t len32 = (uint32_t)slen;
+        if (fwrite(&len32, 4, 1, f) != 1) return RAY_ERR_IO;
+        if (slen > 0 && fwrite(sp, 1, slen, f) != slen) return RAY_ERR_IO;
+    }
+    return RAY_OK;
+}
+
+/* --------------------------------------------------------------------------
+ * col_load_str_list -- rebuild a string list from the bytes past the 4B magic
+ * (ptr); remaining = bytes available there. Malformed input gives "corrupt".
+ * -------------------------------------------------------------------------- */
+
+static ray_t* col_load_str_list(const uint8_t* ptr, size_t remaining) {
+    int64_t count;
+    if (remaining < 8) return ray_error("corrupt", NULL);
+    memcpy(&count, ptr, 8);
+    ptr += 8; remaining -= 8;
+    /* Every entry costs at least its 4B length, which bounds a sane count. */
+    if (count < 0 || (uint64_t)count > remaining / 4)
+        return ray_error("corrupt", NULL);
+
+    ray_t* list = ray_list_new(count);
+    if (!list || RAY_IS_ERR(list)) return list;
+    for (int64_t n = 0; n < count; n++) {
+        uint32_t nbytes;
+        if (remaining < 4) goto corrupt;
+        memcpy(&nbytes, ptr, 4);
+        ptr += 4; remaining -= 4;
+        if (nbytes > remaining) goto corrupt;
+        ray_t* str = ray_str((const char*)ptr, (size_t)nbytes);
+        if (!str || RAY_IS_ERR(str)) { ray_release(list); return str; }
+        ptr += nbytes; remaining -= nbytes;
+        list = ray_list_append(list, str);
+        ray_release(str);  /* list_append retains */
+        if (!list || RAY_IS_ERR(list)) return list;
+    }
+    return list;
+
+corrupt:  /* truncated length field or payload */
+    ray_release(list);
+    return ray_error("corrupt", NULL);
+}
+
+/* --------------------------------------------------------------------------
+ * col_save_str_vec -- write a RAY_STR vector as STRV magic + serde bytes
+ * Returns RAY_OK on success, RAY_ERR_OOM or RAY_ERR_IO on failure.
+ * A RAY_STR column reaches its string pool through the header union, so the
+ * raw 32-byte column layout cannot hold it; the object wire format is reused
+ * since it already preserves pooled strings and external null bitmaps.
+ * -------------------------------------------------------------------------- */
+
+static ray_err_t col_save_str_vec(ray_t* vec, FILE* f) {
+    const uint32_t tag = STR_VEC_MAGIC;
+    if (fwrite(&tag, sizeof tag, 1, f) != 1) return RAY_ERR_IO;
+
+    int64_t need = ray_serde_size(vec);
+    if (need <= 0) return RAY_ERR_IO;
+
+    ray_t* buf = ray_vec_new(RAY_U8, need);
+    if (!buf || RAY_IS_ERR(buf)) return RAY_ERR_OOM;
+
+    /* Write the payload only if the serializer produced exactly need bytes. */
+    ray_err_t status = RAY_ERR_IO;
+    if (ray_ser_raw((uint8_t*)ray_data(buf), vec) == need &&
+        fwrite(ray_data(buf), 1, (size_t)need, f) == (size_t)need)
+        status = RAY_OK;
+
+    ray_release(buf);
+    return status;
+}
+
+/* Decode serde bytes at ptr via ray_de_raw; a non-RAY_STR result is "type". */
+static ray_t* col_load_str_vec(const uint8_t* ptr, size_t remaining) {
+    if (remaining > (size_t)INT64_MAX) return ray_error("range", NULL);
+    int64_t avail = (int64_t)remaining;
+    ray_t* vec = ray_de_raw((uint8_t*)ptr, &avail);
+    if (vec && !RAY_IS_ERR(vec) && vec->type != RAY_STR) {
+        ray_release(vec);
+        vec = ray_error("type", NULL);
+    }
+    return vec;
+}
+
+/* --------------------------------------------------------------------------
+ * Recursive element serialization for generic lists and tables
+ *
+ * Recursive element format:
+ *   [1B type]
+ *   atoms (type < 0):
+ *     -RAY_STR: [4B len][data bytes]
+ *     other:       [8B raw value]
+ *   vectors with is_serializable_type: [8B len][RAY_SYM only: 1B attrs][raw data]
+ *   RAY_LIST: [8B count][recursive elements...]
+ *   RAY_TABLE: [8B ncols][8B nrows][for each col: 8B name_sym + recursive col]
+ * -------------------------------------------------------------------------- */
+
+static ray_err_t col_write_recursive(ray_t* obj, FILE* f);
+
+/* Serialize `obj` to `f` in the recursive element format.
+ *
+ * Every element starts with its 1-byte type tag.  Atoms (type < 0) follow
+ * with either a 32-bit length + bytes (-RAY_STR) or the 8-byte value union.
+ * Fixed-width vectors write an 8-byte length (plus the attrs byte for
+ * RAY_SYM) and the raw element data.  RAY_LIST and RAY_TABLE recurse into
+ * their children.
+ *
+ * Returns RAY_OK on success; RAY_ERR_TYPE for a NULL/error object;
+ * RAY_ERR_IO for a short write or a size that cannot be represented in the
+ * format; RAY_ERR_CORRUPT for a negative length; RAY_ERR_NYI for a type
+ * the format does not cover. */
+static ray_err_t col_write_recursive(ray_t* obj, FILE* f) {
+    if (!obj || RAY_IS_ERR(obj)) return RAY_ERR_TYPE;
+
+    int8_t type = obj->type;
+    if (fwrite(&type, 1, 1, f) != 1) return RAY_ERR_IO;
+
+    if (type < 0) {
+        /* Atom */
+        if (type == -RAY_STR) {
+            const char* sp = ray_str_ptr(obj);
+            size_t slen = ray_str_len(obj);
+            /* The length prefix is only 32 bits wide.  Truncating it while
+             * still writing all `slen` bytes would desynchronize the reader,
+             * so refuse instead of producing an unreadable file. */
+            if ((uint64_t)slen > UINT32_MAX) return RAY_ERR_IO;
+            uint32_t len32 = (uint32_t)slen;
+            if (fwrite(&len32, 4, 1, f) != 1) return RAY_ERR_IO;
+            if (slen > 0 && fwrite(sp, 1, slen, f) != slen) return RAY_ERR_IO;
+        } else {
+            /* Fixed-size atom: write 8 bytes of the value union */
+            if (fwrite(&obj->i64, 8, 1, f) != 1) return RAY_ERR_IO;
+        }
+        return RAY_OK;
+    }
+
+    if (is_serializable_type(type)) {
+        /* Fixed-size vector: write len + raw data.
+         * RAY_SYM: also write attrs byte (adaptive width W8/W16/W32/W64). */
+        int64_t len = obj->len;
+        /* A negative length would become a huge size_t below. */
+        if (len < 0) return RAY_ERR_CORRUPT;
+        if (fwrite(&len, 8, 1, f) != 1) return RAY_ERR_IO;
+        if (type == RAY_SYM) {
+            uint8_t attrs = obj->attrs;
+            if (fwrite(&attrs, 1, 1, f) != 1) return RAY_ERR_IO;
+        }
+        uint8_t esz = ray_sym_elem_size(type, obj->attrs);
+        /* Guard len * esz against size_t wrap-around. */
+        if (esz > 0 && (uint64_t)len > SIZE_MAX / esz) return RAY_ERR_IO;
+        size_t data_size = (size_t)len * esz;
+        if (data_size > 0 && fwrite(ray_data(obj), 1, data_size, f) != data_size)
+            return RAY_ERR_IO;
+        return RAY_OK;
+    }
+
+    if (type == RAY_LIST) {
+        int64_t count = obj->len;
+        /* The reader rejects negative counts; do not write one. */
+        if (count < 0) return RAY_ERR_CORRUPT;
+        if (fwrite(&count, 8, 1, f) != 1) return RAY_ERR_IO;
+        ray_t** slots = (ray_t**)ray_data(obj);
+        for (int64_t i = 0; i < count; i++) {
+            ray_err_t err = col_write_recursive(slots[i], f);
+            if (err != RAY_OK) return err;
+        }
+        return RAY_OK;
+    }
+
+    if (type == RAY_TABLE) {
+        int64_t ncols = ray_table_ncols(obj);
+        int64_t nrows = ray_table_nrows(obj);
+        if (fwrite(&ncols, 8, 1, f) != 1) return RAY_ERR_IO;
+        if (fwrite(&nrows, 8, 1, f) != 1) return RAY_ERR_IO;
+        for (int64_t c = 0; c < ncols; c++) {
+            /* Each column: 8-byte name sym ID, then the column itself. */
+            int64_t name_sym = ray_table_col_name(obj, c);
+            if (fwrite(&name_sym, 8, 1, f) != 1) return RAY_ERR_IO;
+            ray_t* col = ray_table_get_col_idx(obj, c);
+            ray_err_t err = col_write_recursive(col, f);
+            if (err != RAY_OK) return err;
+        }
+        return RAY_OK;
+    }
+
+    return RAY_ERR_NYI;
+}
+
+/* Read recursive element from mapped buffer */
+static ray_t* col_read_recursive(const uint8_t** pp, size_t* remaining);
+
+/* Decode one recursive element from an in-memory buffer.
+ *
+ * `*pp` is the read cursor and `*remaining` the number of bytes left; both
+ * are advanced past whatever was consumed.  The buffer is treated as
+ * untrusted: every length is checked against `*remaining` before it is used
+ * for a copy or an allocation.
+ *
+ * Returns the decoded object, or an error object ("corrupt" for truncated or
+ * inconsistent input, "nyi" for an unknown type tag, or whatever a
+ * constructor / sym validation reported). */
+static ray_t* col_read_recursive(const uint8_t** pp, size_t* remaining) {
+    if (*remaining < 1) return ray_error("corrupt", NULL);
+    int8_t type;
+    memcpy(&type, *pp, 1);
+    *pp += 1; *remaining -= 1;
+
+    if (type < 0) {
+        /* Atom */
+        if (type == -RAY_STR) {
+            if (*remaining < 4) return ray_error("corrupt", NULL);
+            uint32_t slen;
+            memcpy(&slen, *pp, 4);
+            *pp += 4; *remaining -= 4;
+            if (slen > *remaining) return ray_error("corrupt", NULL);
+            ray_t* s = ray_str((const char*)*pp, (size_t)slen);
+            *pp += slen; *remaining -= slen;
+            return s;
+        } else {
+            /* Fixed atom: 8 bytes */
+            if (*remaining < 8) return ray_error("corrupt", NULL);
+            int64_t val;
+            memcpy(&val, *pp, 8);
+            *pp += 8; *remaining -= 8;
+
+            ray_t* atom = ray_alloc(0);
+            if (!atom || RAY_IS_ERR(atom)) return atom;
+            atom->type = type;
+            atom->i64 = val;
+            return atom;
+        }
+    }
+
+    if (is_serializable_type(type)) {
+        /* Fixed-size vector */
+        if (*remaining < 8) return ray_error("corrupt", NULL);
+        int64_t len;
+        memcpy(&len, *pp, 8);
+        *pp += 8; *remaining -= 8;
+        if (len < 0) return ray_error("corrupt", NULL);
+
+        /* RAY_SYM: read attrs byte for adaptive width */
+        uint8_t attrs = 0;
+        if (type == RAY_SYM) {
+            if (*remaining < 1) return ray_error("corrupt", NULL);
+            memcpy(&attrs, *pp, 1);
+            *pp += 1; *remaining -= 1;
+        }
+
+        uint8_t esz = ray_sym_elem_size(type, attrs);
+        if (esz > 0 && (uint64_t)len > SIZE_MAX / esz)
+            return ray_error("corrupt", NULL);
+        size_t data_size = (size_t)len * esz;
+        if (data_size > *remaining) return ray_error("corrupt", NULL);
+
+        ray_t* vec = (type == RAY_SYM)
+            ? ray_sym_vec_new(attrs & RAY_SYM_W_MASK, len)
+            : ray_vec_new(type, len);
+        if (!vec || RAY_IS_ERR(vec)) return vec;
+        vec->len = len;
+        if (data_size > 0)
+            memcpy(ray_data(vec), *pp, data_size);
+        *pp += data_size; *remaining -= data_size;
+
+        if (type == RAY_SYM) {
+            /* Sym IDs read from disk must exist in the current sym table. */
+            uint32_t sc = ray_sym_count();
+            ray_err_t ve = validate_sym_bounds(ray_data(vec), len, attrs, sc);
+            if (ve != RAY_OK) { ray_release(vec); return ray_error(ray_err_code_str(ve), NULL); }
+        }
+        return vec;
+    }
+
+    if (type == RAY_LIST) {
+        if (*remaining < 8) return ray_error("corrupt", NULL);
+        int64_t count;
+        memcpy(&count, *pp, 8);
+        *pp += 8; *remaining -= 8;
+        if (count < 0) return ray_error("corrupt", NULL);
+        /* Every element occupies at least its 1-byte type tag, so a count
+         * larger than the bytes left can never be satisfied.  Reject it
+         * before it is used to size an allocation. */
+        if ((uint64_t)count > (uint64_t)*remaining)
+            return ray_error("corrupt", NULL);
+
+        ray_t* list = ray_list_new(count);
+        if (!list || RAY_IS_ERR(list)) return list;
+        for (int64_t i = 0; i < count; i++) {
+            ray_t* elem = col_read_recursive(pp, remaining);
+            if (!elem || RAY_IS_ERR(elem)) { ray_release(list); return elem; }
+            list = ray_list_append(list, elem);
+            ray_release(elem);
+            if (!list || RAY_IS_ERR(list)) return list;
+        }
+        return list;
+    }
+
+    if (type == RAY_TABLE) {
+        if (*remaining < 16) return ray_error("corrupt", NULL);
+        int64_t ncols, nrows;
+        memcpy(&ncols, *pp, 8);
+        *pp += 8; *remaining -= 8;
+        memcpy(&nrows, *pp, 8);
+        *pp += 8; *remaining -= 8;
+        (void)nrows;  /* nrows is reconstructed from columns */
+
+        if (ncols < 0) return ray_error("corrupt", NULL);
+        /* Each column needs an 8-byte name plus at least a 1-byte type tag;
+         * bound ncols by the bytes left before allocating the table. */
+        if ((uint64_t)ncols > (uint64_t)(*remaining / 9))
+            return ray_error("corrupt", NULL);
+        ray_t* tbl = ray_table_new(ncols);
+        if (!tbl || RAY_IS_ERR(tbl)) return tbl;
+
+        for (int64_t c = 0; c < ncols; c++) {
+            if (*remaining < 8) { ray_release(tbl); return ray_error("corrupt", NULL); }
+            int64_t name_sym;
+            memcpy(&name_sym, *pp, 8);
+            *pp += 8; *remaining -= 8;
+
+            ray_t* col = col_read_recursive(pp, remaining);
+            if (!col || RAY_IS_ERR(col)) { ray_release(tbl); return col; }
+            tbl = ray_table_add_col(tbl, name_sym, col);
+            ray_release(col);  /* table_add_col retains */
+            if (!tbl || RAY_IS_ERR(tbl)) return tbl;
+        }
+        return tbl;
+    }
+
+    return ray_error("nyi", NULL);
+}
+
+/* --------------------------------------------------------------------------
+ * col_save_list -- serialize a generic RAY_LIST
+ * -------------------------------------------------------------------------- */
+
+/* Write LIST_MAGIC, then the list itself in the recursive element format.
+ * Returns RAY_ERR_IO if the magic cannot be written, otherwise whatever
+ * col_write_recursive reports. */
+static ray_err_t col_save_list(ray_t* list, FILE* f) {
+    const uint32_t tag = LIST_MAGIC;
+    const size_t put = fwrite(&tag, 4, 1, f);
+    return (put == 1) ? col_write_recursive(list, f) : RAY_ERR_IO;
+}
+
+/* --------------------------------------------------------------------------
+ * col_save_table -- serialize a RAY_TABLE
+ * -------------------------------------------------------------------------- */
+
+/* Write TABLE_MAGIC, then the table in the recursive element format.
+ * Returns RAY_ERR_IO if the magic cannot be written, otherwise whatever
+ * col_write_recursive reports. */
+static ray_err_t col_save_table(ray_t* tbl, FILE* f) {
+    const uint32_t tag = TABLE_MAGIC;
+    const size_t put = fwrite(&tag, 4, 1, f);
+    return (put == 1) ? col_write_recursive(tbl, f) : RAY_ERR_IO;
+}
+
+/* --------------------------------------------------------------------------
+ * try_load_link_sidecar -- attach HAS_LINK to vec from `<path>.link`
+ *
+ * Best-effort: missing sidecar, unreadable file, or empty contents leave
+ * vec as a plain int column.  Only RAY_I32 / RAY_I64 columns are eligible.
+ * The sidecar holds the target table sym name in plain text; we intern it
+ * into the local sym table and write the resulting sym ID + HAS_LINK bit.
+ * Used by both ray_col_load (buddy-copy path) and ray_col_mmap (zero-copy
+ * path) so linked columns survive both load styles.
+ * -------------------------------------------------------------------------- */
+static void try_load_link_sidecar(ray_t* vec, const char* path) {
+    static const char suffix[] = ".link";   /* sizeof == 6, NUL included */
+    char sidecar[1024];
+    char name[256];
+
+    /* Only integer columns can carry a link. */
+    if (!vec) return;
+    if (!(vec->type == RAY_I32 || vec->type == RAY_I64)) return;
+
+    /* Build "<path>.link"; give up silently if it does not fit. */
+    const size_t base_len = strlen(path);
+    if (base_len + sizeof(suffix) >= sizeof(sidecar)) return;
+    memcpy(sidecar, path, base_len);
+    memcpy(sidecar + base_len, suffix, sizeof(suffix));
+
+    FILE* in = fopen(sidecar, "rb");
+    if (!in) return;
+    size_t used = fread(name, 1, sizeof(name) - 1, in);
+    fclose(in);
+
+    /* Drop trailing newline / whitespace / NUL bytes. */
+    for (; used > 0; used--) {
+        const char tail = name[used - 1];
+        if (tail != '\n' && tail != '\r' && tail != ' ' &&
+            tail != '\t' && tail != '\0')
+            break;
+    }
+    if (used == 0) return;
+
+    /* Intern the target name locally and mark the column as linked. */
+    const int64_t sym = ray_sym_intern(name, used);
+    if (sym >= 0) {
+        vec->link_target = sym;
+        vec->attrs |= RAY_ATTR_HAS_LINK;
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * ray_col_save -- write a vector to a column file
+ * -------------------------------------------------------------------------- */
+
+/* Write a fixed-width vector to `f` in the raw column layout:
+ * a rebased 32-byte header, the element data, and (when present) the
+ * external null bitmap.  The caller owns `f` and the temp file; this helper
+ * only reports what went wrong. */
+static ray_err_t col_save_fixed_vec(ray_t* vec, FILE* f) {
+    /* Write a clean header (mmod=0, order=0) */
+    ray_t header;
+    memcpy(&header, vec, 32);
+    header.mmod = 0;
+    header.order = 0;
+    /* For RAY_SYM: store sym count in rc field (always 0 on disk otherwise).
+     * This serves as O(1) fast-reject metadata on load. */
+    header.rc = (vec->type == RAY_SYM) ? ray_sym_count() : 0;
+
+    /* HAS_INDEX rebase: an attached accelerator index displaces the
+     * 16-byte nullmap union with an index pointer.  Persist the
+     * pre-attach state instead -- strip HAS_INDEX, restore the saved
+     * NULLMAP_EXT bit, and copy the saved bitmap bytes back into the
+     * on-disk header.  ext_for_append captures the saved ext-nullmap
+     * pointer so the bitmap append at end-of-write reads from the
+     * right place. */
+    ray_t* ext_for_append = (vec->attrs & RAY_ATTR_NULLMAP_EXT)
+                            ? vec->ext_nullmap : NULL;
+    if (vec->attrs & RAY_ATTR_HAS_INDEX) {
+        ray_index_t* ix = ray_index_payload(vec->index);
+        header.attrs &= ~RAY_ATTR_HAS_INDEX;
+        if (ix->saved_attrs & RAY_ATTR_NULLMAP_EXT) {
+            header.attrs |= RAY_ATTR_NULLMAP_EXT;
+            memcpy(&ext_for_append, &ix->saved_nullmap[0],
+                   sizeof(ext_for_append));
+        } else {
+            header.attrs &= ~RAY_ATTR_NULLMAP_EXT;
+            ext_for_append = NULL;
+        }
+        memcpy(header.nullmap, ix->saved_nullmap, 16);
+    }
+
+    /* HAS_LINK rebase: target sym ID lives at header.nullmap[8..15],
+     * but sym IDs are process-local, so the value is meaningless on disk.
+     * Strip the bit and zero the slot; the `.link` sidecar carries the
+     * target table name in text form instead. */
+    if (vec->attrs & RAY_ATTR_HAS_LINK) {
+        header.attrs &= (uint8_t)~RAY_ATTR_HAS_LINK;
+        memset(header.nullmap + 8, 0, 8);
+    }
+
+    /* Clear slice field; preserve ext_nullmap flag for bitmap append */
+    header.attrs &= ~RAY_ATTR_SLICE;
+    if (!(header.attrs & RAY_ATTR_HAS_NULLS)) {
+        memset(header.nullmap, 0, 16);
+        header.attrs &= ~RAY_ATTR_NULLMAP_EXT;
+    } else if (header.attrs & RAY_ATTR_NULLMAP_EXT) {
+        /* Ext bitmap appended after data; zero pointer bytes in header */
+        memset(header.nullmap, 0, 16);
+    }
+
+    if (fwrite(&header, 1, 32, f) != 32) return RAY_ERR_IO;
+
+    /* Write data */
+    if (vec->len < 0) return RAY_ERR_CORRUPT;
+    uint8_t esz = ray_sym_elem_size(vec->type, vec->attrs);
+    if (esz == 0 && vec->len > 0) return RAY_ERR_TYPE;
+    /* Overflow check: ensure len*esz fits in size_t with 32-byte header room */
+    if ((uint64_t)vec->len > (SIZE_MAX - 32) / (esz ? esz : 1))
+        return RAY_ERR_IO;
+    size_t data_size = (size_t)vec->len * esz;
+
+    void* data;
+    if (vec->attrs & RAY_ATTR_SLICE) {
+        /* Validate slice bounds before computing data pointer */
+        ray_t* parent = vec->slice_parent;
+        if (!parent || vec->slice_offset < 0 ||
+            vec->slice_offset + vec->len > parent->len)
+            return RAY_ERR_IO;
+        data = (char*)ray_data(parent) + vec->slice_offset * esz;
+    } else {
+        data = ray_data(vec);
+    }
+
+    if (data_size > 0 && fwrite(data, 1, data_size, f) != data_size)
+        return RAY_ERR_IO;
+
+    /* Append external nullmap bitmap after data.  Use header.attrs
+     * (rebased above for HAS_INDEX) and ext_for_append (the effective
+     * ext_nullmap pointer, possibly taken from the index's snapshot). */
+    if ((vec->attrs & RAY_ATTR_HAS_NULLS) &&
+        (header.attrs & RAY_ATTR_NULLMAP_EXT) && ext_for_append) {
+        size_t bitmap_len = ((size_t)vec->len + 7) / 8;
+        if (fwrite(ray_data(ext_for_append), 1, bitmap_len, f) != bitmap_len)
+            return RAY_ERR_IO;
+    }
+
+    return RAY_OK;
+}
+
+/* Best-effort maintenance of the `<path>.link` sidecar.
+ * A linked column gets its target table's sym name written (via a
+ * `.link.tmp` file that is renamed into place); an unlinked column has any
+ * stale sidecar removed.  Failures are ignored by design, but a partially
+ * written or un-renamed temp file is cleaned up. */
+static void col_save_link_sidecar(ray_t* vec, const char* path) {
+    char link_path[1024];
+    size_t plen = strlen(path);
+    if (plen + 6 >= sizeof(link_path)) return;
+    memcpy(link_path, path, plen);
+    memcpy(link_path + plen, ".link", 6);
+
+    if (!(vec->attrs & RAY_ATTR_HAS_LINK)) {
+        /* No link on this column -- clean stale sidecar if any. */
+        remove(link_path);
+        return;
+    }
+
+    ray_t* sym_str = ray_sym_str(vec->link_target);
+    const char* sp = sym_str ? ray_str_ptr(sym_str) : NULL;
+    size_t slen = sym_str ? ray_str_len(sym_str) : 0;
+    if (!sp || slen == 0) return;
+
+    char tmp_link[1024];
+    if (plen + 10 >= sizeof(tmp_link)) return;
+    memcpy(tmp_link, link_path, plen + 5);
+    memcpy(tmp_link + plen + 5, ".tmp", 5);
+
+    FILE* lf = fopen(tmp_link, "wb");
+    if (!lf) return;
+    size_t wrote = fwrite(sp, 1, slen, lf);
+    /* fclose flushes the stdio buffer; a failure here means the name may
+     * not be fully on disk, so treat it like a short write. */
+    int close_rc = fclose(lf);
+    if (wrote != slen || close_rc != 0 ||
+        ray_file_rename(tmp_link, link_path) != RAY_OK) {
+        remove(tmp_link);
+    }
+}
+
+/* Save `vec` to `path` as a column file.
+ *
+ * The payload is written to `<path>.tmp`, synced, and renamed over `path`.
+ * The on-disk format is chosen from the value: string list, RAY_STR vector,
+ * generic RAY_LIST, RAY_TABLE, or the raw 32-byte-header layout for
+ * fixed-width vectors.  After a successful rename the `.link` sidecar is
+ * updated on a best-effort basis.
+ *
+ * Returns RAY_OK on success; RAY_ERR_TYPE for a NULL/error `vec`;
+ * RAY_ERR_NYI for a type with no on-disk format; RAY_ERR_IO for a NULL or
+ * over-long path and for any file-system failure, including a failed final
+ * flush; or the error reported by the format-specific writer. */
+ray_err_t ray_col_save(ray_t* vec, const char* path) {
+    if (!vec || RAY_IS_ERR(vec)) return RAY_ERR_TYPE;
+    if (!path) return RAY_ERR_IO;
+
+    /* Build temp path for crash-safe write: write tmp, fsync, atomic rename */
+    char tmp_path[1024];
+    int tmp_len = snprintf(tmp_path, sizeof(tmp_path), "%s.tmp", path);
+    if (tmp_len < 0 || tmp_len >= (int)sizeof(tmp_path))
+        return RAY_ERR_IO;
+
+    /* Pick the format before touching the file system, so an unsupported
+     * type never creates a temp file.  The string-list test comes first
+     * because such a value is also a RAY_LIST. */
+    const bool str_list = is_str_list(vec);
+    if (!str_list &&
+        vec->type != RAY_STR && vec->type != RAY_LIST &&
+        vec->type != RAY_TABLE && !is_serializable_type(vec->type))
+        return RAY_ERR_NYI;
+
+    FILE* f = fopen(tmp_path, "wb");
+    if (!f) return RAY_ERR_IO;
+
+    ray_err_t err;
+    if (str_list)                    err = col_save_str_list(vec, f);
+    else if (vec->type == RAY_STR)   err = col_save_str_vec(vec, f);
+    else if (vec->type == RAY_LIST)  err = col_save_list(vec, f);
+    else if (vec->type == RAY_TABLE) err = col_save_table(vec, f);
+    else                             err = col_save_fixed_vec(vec, f);
+
+    /* fclose performs the final flush of buffered data.  If it fails the
+     * temp file is incomplete and must not be renamed over `path`. */
+    if (fclose(f) != 0 && err == RAY_OK) err = RAY_ERR_IO;
+    if (err != RAY_OK) { remove(tmp_path); return err; }
+
+    /* Fsync temp file for durability */
+    ray_fd_t tmp_fd = ray_file_open(tmp_path, RAY_OPEN_READ | RAY_OPEN_WRITE);
+    if (tmp_fd == RAY_FD_INVALID) { remove(tmp_path); return RAY_ERR_IO; }
+    err = ray_file_sync(tmp_fd);
+    ray_file_close(tmp_fd);
+    if (err != RAY_OK) { remove(tmp_path); return err; }
+
+    /* Atomic rename: tmp -> final path */
+    err = ray_file_rename(tmp_path, path);
+    if (err != RAY_OK) { remove(tmp_path); return err; }
+
+    /* Linked-column sidecar: keep `<path>.link` in step with the column. */
+    col_save_link_sidecar(vec, path);
+
+    return RAY_OK;
+}
+
+/* --------------------------------------------------------------------------
+ * col_validate_mapped -- shared validation for ray_col_load / ray_col_mmap
+ *
+ * Maps the file, validates header/type/bounds, and returns parsed metadata.
+ * On success, the mapping remains open (caller must unmap on error paths).
+ * Returns NULL on success, or an error ray_t* on failure (mapping already
+ * cleaned up in that case).
+ * -------------------------------------------------------------------------- */
+
+typedef struct {
+    void*   mapped;          /* base address returned by ray_vm_map_file */
+    size_t  mapped_size;     /* size of the mapping in bytes */
+    ray_t*  header;       /* pointer into mapped region */
+    uint8_t esz;             /* element size from ray_sym_elem_size(type, attrs) */
+    size_t  data_size;       /* len * esz: payload bytes after the 32-byte header */
+    bool    has_ext_nullmap; /* header has both HAS_NULLS and NULLMAP_EXT set */
+    size_t  bitmap_len;      /* (len + 7) / 8 when has_ext_nullmap, else 0 */
+} col_mapped_t;
+
+static ray_t* col_validate_mapped(const char* path, col_mapped_t* out) {
+    size_t file_len = 0;
+    void* base = ray_vm_map_file(path, &file_len);
+    if (!base) return ray_error("io", NULL);
+
+    /* Every rejection below funnels through `reject`, which unmaps and
+     * reports `why`.  "corrupt" is the default; other codes are set right
+     * before the jump. */
+    const char* why = "corrupt";
+    ray_t* hdr = (ray_t*)base;      /* not dereferenced until size is checked */
+    uint8_t width = 0;
+    size_t payload = 0;
+    size_t null_bytes = 0;
+    bool ext_nulls = false;
+
+    /* Need at least a full 32-byte header. */
+    if (file_len < 32) goto reject;
+
+    /* Type comes from untrusted file data -- allowlist only. */
+    if (!is_serializable_type(hdr->type)) { why = "nyi"; goto reject; }
+    if (hdr->len < 0) goto reject;
+
+    width = ray_sym_elem_size(hdr->type, hdr->attrs);
+    if (width == 0 && hdr->len > 0) { why = "type"; goto reject; }
+
+    /* len * width must fit in size_t with room for the 32-byte header. */
+    if ((uint64_t)hdr->len > (SIZE_MAX - 32) / (width ? width : 1)) {
+        why = "io";
+        goto reject;
+    }
+    payload = (size_t)hdr->len * width;
+    if (32 + payload > file_len) goto reject;
+
+    /* An external null bitmap, when flagged, is appended after the data. */
+    ext_nulls = (hdr->attrs & RAY_ATTR_HAS_NULLS) &&
+                (hdr->attrs & RAY_ATTR_NULLMAP_EXT);
+    null_bytes = ext_nulls ? ((size_t)hdr->len + 7) / 8 : 0;
+    if (ext_nulls && 32 + payload + null_bytes > file_len) goto reject;
+
+    /* RAY_SYM: fast-reject via the sym count stored in the header rc field.
+     * Read with memcpy (not an atomic load): file data is not atomic storage. */
+    if (hdr->type == RAY_SYM) {
+        uint32_t saved_sc;
+        memcpy(&saved_sc, (const char*)base + offsetof(ray_t, rc), sizeof(saved_sc));
+        uint32_t cur_sc = ray_sym_count();
+        if (saved_sc > 0 && cur_sc > 0 && cur_sc < saved_sc) goto reject;
+    }
+
+    out->mapped          = base;
+    out->mapped_size     = file_len;
+    out->header          = hdr;
+    out->esz             = width;
+    out->data_size       = payload;
+    out->has_ext_nullmap = ext_nulls;
+    out->bitmap_len      = null_bytes;
+    return NULL;  /* success */
+
+reject:
+    ray_vm_unmap_file(base, file_len);
+    return ray_error(why, NULL);
+}
+
+/* --------------------------------------------------------------------------
+ * col_restore_ext_nullmap -- allocate buddy-backed copy of ext nullmap
+ *
+ * Shared by ray_col_load and ray_col_mmap. On success, sets vec->ext_nullmap.
+ * Returns NULL on success, or an error ray_t* ("oom") on failure.
+ * -------------------------------------------------------------------------- */
+
+static ray_t* col_restore_ext_nullmap(ray_t* vec, const col_mapped_t* cm) {
+    /* The bitmap sits right after the 32-byte header and the element data. */
+    const size_t nbytes = cm->bitmap_len;
+    const char* src = (const char*)cm->mapped + 32 + cm->data_size;
+
+    ray_t* bitmap = ray_vec_new(RAY_U8, (int64_t)nbytes);
+    if (!bitmap || RAY_IS_ERR(bitmap))
+        return ray_error("oom", NULL);
+
+    bitmap->len = (int64_t)nbytes;
+    memcpy(ray_data(bitmap), src, nbytes);
+
+    vec->ext_nullmap = bitmap;
+    return NULL;  /* success */
+}
+
+/* --------------------------------------------------------------------------
+ * ray_col_load -- load a column file via mmap (zero deserialization)
+ * -------------------------------------------------------------------------- */
+
+/* Load a column file into a freshly allocated (non-mmap) block.
+ *
+ * The file is mapped only temporarily.  Files that start with one of the
+ * extended-format magics (string list, string vector, generic list, table)
+ * are decoded by the matching reader.  Anything else is treated as the raw
+ * layout: validated by col_validate_mapped, copied into a ray_alloc block,
+ * and given a fresh header (mmod=0, rc=1, the allocator's own order).  An
+ * external null bitmap and the `.link` sidecar are restored when present.
+ *
+ * Returns the loaded object, or an error object ("io" for a NULL path or an
+ * unmappable file, plus whatever validation or decoding reported). */
+ray_t* ray_col_load(const char* path) {
+    if (!path) return ray_error("io", NULL);
+
+    /* Read file into temp mmap for validation, then copy to buddy block.
+     * This avoids the mmap lifecycle problem (mmod=1 blocks are never freed). */
+    size_t mapped_size = 0;
+    void* ptr = ray_vm_map_file(path, &mapped_size);
+    if (!ptr) return ray_error("io", NULL);
+
+    /* Check for extended format magic numbers (first 4 bytes) */
+    if (mapped_size >= 4) {
+        uint32_t magic;
+        memcpy(&magic, ptr, 4);
+
+        if (magic == STR_LIST_MAGIC) {
+            ray_t* result = col_load_str_list((const uint8_t*)ptr + 4, mapped_size - 4);
+            ray_vm_unmap_file(ptr, mapped_size);
+            return result;
+        }
+        if (magic == STR_VEC_MAGIC) {
+            ray_t* result = col_load_str_vec((const uint8_t*)ptr + 4, mapped_size - 4);
+            ray_vm_unmap_file(ptr, mapped_size);
+            return result;
+        }
+        if (magic == LIST_MAGIC || magic == TABLE_MAGIC) {
+            const uint8_t* p = (const uint8_t*)ptr + 4;
+            size_t rem = mapped_size - 4;
+            ray_t* result = col_read_recursive(&p, &rem);
+            ray_vm_unmap_file(ptr, mapped_size);
+            return result;
+        }
+    }
+    /* Unmap the initial mapping; col_validate_mapped will re-map for validation */
+    ray_vm_unmap_file(ptr, mapped_size);
+
+    col_mapped_t cm = {0};
+    ray_t* err = col_validate_mapped(path, &cm);
+    if (err) return err;
+
+    /* Allocate buddy block and copy file data */
+    ray_t* vec = ray_alloc(cm.data_size);
+    if (!vec || RAY_IS_ERR(vec)) {
+        ray_vm_unmap_file(cm.mapped, cm.mapped_size);
+        return vec ? vec : ray_error("oom", NULL);
+    }
+    uint8_t saved_order = vec->order;  /* preserve buddy order */
+    memcpy(vec, cm.mapped, 32 + cm.data_size);
+
+    /* The memcpy just replaced the allocator's bookkeeping bytes with values
+     * from the file.  Put `order` and `mmod` back right away, before any
+     * path that can free the block.  Otherwise the failure branch below
+     * would hand ray_free a block described by file-controlled fields.
+     * NOTE(review): assumes ray_free consults order/mmod -- confirm. */
+    vec->mmod = 0;
+    vec->order = saved_order;
+
+    /* Restore external nullmap if present */
+    if (cm.has_ext_nullmap) {
+        ray_t* ext_err = col_restore_ext_nullmap(vec, &cm);
+        if (ext_err) {
+            ray_vm_unmap_file(cm.mapped, cm.mapped_size);
+            ray_free(vec);
+            return ext_err;
+        }
+    }
+
+    ray_vm_unmap_file(cm.mapped, cm.mapped_size);
+
+    /* Finish fixing up the header for a buddy-allocated block */
+    vec->attrs &= ~RAY_ATTR_SLICE;
+    if (!cm.has_ext_nullmap)
+        vec->attrs &= ~RAY_ATTR_NULLMAP_EXT;
+    ray_atomic_store(&vec->rc, 1);
+
+    /* RAY_SYM: every stored sym ID must exist in the current sym table */
+    if (vec->type == RAY_SYM) {
+        ray_err_t sym_err = validate_sym_bounds(ray_data(vec), vec->len,
+                                                vec->attrs, ray_sym_count());
+        if (sym_err != RAY_OK) {
+            ray_release(vec);
+            return ray_error(ray_err_code_str(sym_err), NULL);
+        }
+    }
+
+    try_load_link_sidecar(vec, path);
+
+    return vec;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_col_mmap -- zero-copy column load via mmap (mmod=1)
+ *
+ * Returns a ray_t* backed directly by the file's mmap region.
+ * MAP_PRIVATE gives COW semantics -- only the header page gets a private
+ * copy when we write mmod/rc. All data pages stay shared with page cache.
+ * ray_release -> ray_free -> munmap.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_col_mmap(const char* path) {
+    if (!path) return ray_error("io", NULL);
+
+    col_mapped_t view = {0};
+    ray_t* failure = col_validate_mapped(path, &view);
+    if (failure) return failure;
+
+    /* From here on the mapping is live; every failure leaves through `bail`,
+     * which unmaps and returns either `failure` or an error built from `code`. */
+    ray_t* col = view.header;
+    const char* code = NULL;
+
+    /* The file must be exactly header + data + bitmap: ray_free() rebuilds
+     * the munmap size from the same formula. */
+    if (32 + view.data_size + view.bitmap_len != view.mapped_size) {
+        code = "io";
+        goto bail;
+    }
+
+    /* RAY_SYM: bounds check on data */
+    if (col->type == RAY_SYM) {
+        ray_err_t sym_err = validate_sym_bounds(
+            (const char*)view.mapped + 32, col->len, col->attrs, ray_sym_count());
+        if (sym_err != RAY_OK) {
+            code = ray_err_code_str(sym_err);
+            goto bail;
+        }
+    }
+
+    /* The external nullmap needs to be a real ray_t for ref counting, so it
+     * is copied out of the mapping into a buddy-backed vector. */
+    if (view.has_ext_nullmap) {
+        failure = col_restore_ext_nullmap(col, &view);
+        if (failure) goto bail;
+    }
+
+    /* Patch header -- MAP_PRIVATE COW: only the header page gets copied */
+    col->mmod = 1;
+    col->order = 0;
+    col->attrs &= ~RAY_ATTR_SLICE;
+    if (!view.has_ext_nullmap)
+        col->attrs &= ~RAY_ATTR_NULLMAP_EXT;
+    ray_atomic_store(&col->rc, 1);
+
+    /* Reattach the link sidecar so linked columns keep HAS_LINK on this
+     * load path as well as on ray_col_load. */
+    try_load_link_sidecar(col, path);
+
+    return col;
+
+bail:
+    ray_vm_unmap_file(view.mapped, view.mapped_size);
+    return failure ? failure : ray_error(code, NULL);
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/col.h b/crates/rayforce-sys/vendor/rayforce/src/store/col.h
new file mode 100644
index 0000000..55f492b
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/store/col.h
@@ -0,0 +1,34 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_COL_H
+#define RAY_COL_H
+
+#include <rayforce.h>
+
+/* Column file I/O
+ *
+ * ray_col_save  writes `vec` to `path` through a `<path>.tmp` file that is
+ *               synced and then renamed into place; returns RAY_OK or an
+ *               error code.
+ * ray_col_load  reads a column file into a newly allocated block (mmod=0);
+ *               returns the object or an error object.
+ * ray_col_mmap  returns a vector backed directly by the file mapping
+ *               (mmod=1); the file size must match the raw column layout
+ *               exactly.  Returns the object or an error object.
+ */
+ray_err_t ray_col_save(ray_t* vec, const char* path);
+ray_t*    ray_col_load(const char* path);
+ray_t*    ray_col_mmap(const char* path);
+
+#endif /* RAY_COL_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/csr.c b/crates/rayforce-sys/vendor/rayforce/src/store/csr.c
new file mode 100644
index 0000000..a978cdd
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/store/csr.c
@@ -0,0 +1,529 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "csr.h"
+#include "store/col.h"
+#include "mem/sys.h"
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <errno.h>
+
+/* Forward declaration */
+static void csr_free(ray_csr_t* csr);
+
+/* --------------------------------------------------------------------------
+ * CSR construction helpers
+ * -------------------------------------------------------------------------- */
+
+/* Pair for sorting edges: one input edge plus the row it came from */
+typedef struct {
+    int64_t src;  /* source node id */
+    int64_t dst;  /* destination node id */
+    int64_t row;  /* original row index for rowmap */
+} edge_pair_t;
+
+/* qsort comparator: ascending by src, ties broken by ascending dst.
+ * Each key is compared with (x > y) - (x < y), which yields -1, 0 or 1. */
+static int cmp_edge_by_src(const void* a, const void* b) {
+    const edge_pair_t* lhs = (const edge_pair_t*)a;
+    const edge_pair_t* rhs = (const edge_pair_t*)b;
+    int primary = (lhs->src > rhs->src) - (lhs->src < rhs->src);
+    if (primary != 0) return primary;
+    /* Equal sources: fall back to the destination. */
+    return (lhs->dst > rhs->dst) - (lhs->dst < rhs->dst);
+}
+
+/* qsort comparator for the reverse CSR: ascending by dst, ties broken by
+ * ascending src.  (x > y) - (x < y) yields -1, 0 or 1 for each key. */
+static int cmp_edge_by_dst(const void* a, const void* b) {
+    const edge_pair_t* lhs = (const edge_pair_t*)a;
+    const edge_pair_t* rhs = (const edge_pair_t*)b;
+    int primary = (lhs->dst > rhs->dst) - (lhs->dst < rhs->dst);
+    if (primary != 0) return primary;
+    /* Equal destinations: fall back to the source. */
+    return (lhs->src > rhs->src) - (lhs->src < rhs->src);
+}
+
+/* Sort targets ascending within each adjacency list (for LFTJ); rowmap, when present, is permuted in lockstep */
+static void csr_sort_adjacency_lists(ray_csr_t* csr) {
+    int64_t* offsets = (int64_t*)ray_data(csr->offsets);
+    int64_t* targets = (int64_t*)ray_data(csr->targets);
+    int64_t* rowmap = csr->rowmap ? (int64_t*)ray_data(csr->rowmap) : NULL;
+
+    for (int64_t node = 0; node < csr->n_nodes; node++) {
+        int64_t start = offsets[node];      /* first slot of this node's list */
+        int64_t end = offsets[node + 1];    /* one past its last slot */
+        int64_t deg = end - start;
+        if (deg <= 1) continue;             /* zero or one neighbor: nothing to sort */
+
+        /* Simple insertion sort — adjacency lists are typically small */
+        for (int64_t i = start + 1; i < end; i++) {
+            int64_t key = targets[i];
+            int64_t row_key = rowmap ? rowmap[i] : 0;  /* travels with key; unused without a rowmap */
+            int64_t j = i - 1;
+            while (j >= start && targets[j] > key) {   /* strict '>' leaves equal targets in place */
+                targets[j + 1] = targets[j];
+                if (rowmap) rowmap[j + 1] = rowmap[j];
+                j--;
+            }
+            targets[j + 1] = key;
+            if (rowmap) rowmap[j + 1] = row_key;
+        }
+    }
+}
+
+/* Build CSR from sorted edge pairs into *out; returns RAY_OK, or RAY_ERR_OOM after releasing anything it allocated.
+ * pairs must be sorted by the 'key' field (src for fwd, dst for rev); the other endpoint is stored as the target. */
+static ray_err_t csr_build_from_pairs(edge_pair_t* pairs, int64_t n_edges,
+                                      int64_t n_nodes, bool is_reverse,
+                                      bool sort_targets, ray_csr_t* out) {
+    out->n_nodes = n_nodes;
+    out->props = NULL;
+
+    /* Count valid edges (those within [0, n_nodes) range); the rest are dropped */
+    int64_t valid_edges = 0;
+    for (int64_t i = 0; i < n_edges; i++) {
+        int64_t key = is_reverse ? pairs[i].dst : pairs[i].src;
+        if (key >= 0 && key < n_nodes) valid_edges++;
+    }
+    out->n_edges = valid_edges;
+
+    /* Allocate offsets (n_nodes + 1), zero-filled so they can serve as degree counters */
+    out->offsets = ray_vec_new(RAY_I64, n_nodes + 1);
+    if (!out->offsets || RAY_IS_ERR(out->offsets)) return RAY_ERR_OOM;
+    out->offsets->len = n_nodes + 1;
+    int64_t* off = (int64_t*)ray_data(out->offsets);
+    memset(off, 0, (size_t)(n_nodes + 1) * sizeof(int64_t));
+
+    /* Allocate targets (capacity of at least 1 is requested even when there are no edges) */
+    out->targets = ray_vec_new(RAY_I64, valid_edges > 0 ? valid_edges : 1);
+    if (!out->targets || RAY_IS_ERR(out->targets)) {
+        ray_release(out->offsets); out->offsets = NULL;
+        return RAY_ERR_OOM;
+    }
+    out->targets->len = valid_edges;
+    int64_t* tgt = (int64_t*)ray_data(out->targets);
+
+    /* Allocate rowmap (same sizing as targets: one entry per kept edge) */
+    out->rowmap = ray_vec_new(RAY_I64, valid_edges > 0 ? valid_edges : 1);
+    if (!out->rowmap || RAY_IS_ERR(out->rowmap)) {
+        ray_release(out->offsets); out->offsets = NULL;
+        ray_release(out->targets); out->targets = NULL;
+        return RAY_ERR_OOM;
+    }
+    out->rowmap->len = valid_edges;
+    int64_t* rmap = (int64_t*)ray_data(out->rowmap);
+
+    /* Count degrees: off[k + 1] accumulates the number of edges keyed by k */
+    for (int64_t i = 0; i < n_edges; i++) {
+        int64_t key = is_reverse ? pairs[i].dst : pairs[i].src;
+        if (key >= 0 && key < n_nodes) off[key + 1]++;
+    }
+
+    /* Prefix sum: off[k] becomes the first targets[] slot of node k */
+    for (int64_t i = 1; i <= n_nodes; i++)
+        off[i] += off[i - 1];
+
+    /* Fill targets + rowmap using a position array (pos[k] = next free slot of node k) */
+    ray_t* pos_hdr = ray_alloc((size_t)(n_nodes > 0 ? n_nodes : 1) * sizeof(int64_t));
+    if (!pos_hdr) {
+        ray_release(out->offsets); out->offsets = NULL;
+        ray_release(out->targets); out->targets = NULL;
+        ray_release(out->rowmap); out->rowmap = NULL;
+        return RAY_ERR_OOM;
+    }
+    int64_t* pos = (int64_t*)ray_data(pos_hdr);
+    if (n_nodes > 0)
+        memcpy(pos, off, (size_t)n_nodes * sizeof(int64_t));
+
+    for (int64_t i = 0; i < n_edges; i++) {
+        int64_t key = is_reverse ? pairs[i].dst : pairs[i].src;
+        int64_t val = is_reverse ? pairs[i].src : pairs[i].dst;
+        if (key >= 0 && key < n_nodes) {
+            int64_t p = pos[key]++;
+            tgt[p] = val;
+            rmap[p] = pairs[i].row;  /* CSR position -> original row of the edge */
+        }
+    }
+    ray_free(pos_hdr);
+
+    /* Sort within adjacency lists if requested; `sorted` records which case applies */
+    if (sort_targets) {
+        csr_sort_adjacency_lists(out);
+        out->sorted = true;
+    } else {
+        out->sorted = false;
+    }
+
+    return RAY_OK;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_rel_from_edges — build from explicit edge table
+ * -------------------------------------------------------------------------- */
+
+ray_rel_t* ray_rel_from_edges(ray_t* edge_table,
+                             const char* src_col, const char* dst_col,
+                             int64_t n_src_nodes, int64_t n_dst_nodes,
+                             bool sort_targets) {
+    if (!edge_table || RAY_IS_ERR(edge_table) || edge_table->type != RAY_TABLE ||
+        !src_col || !dst_col) return NULL;  /* column names feed strlen() below */
+
+    int64_t src_sym = ray_sym_intern(src_col, strlen(src_col));
+    int64_t dst_sym = ray_sym_intern(dst_col, strlen(dst_col));
+    if (src_sym < 0 || dst_sym < 0) return NULL;  /* sym intern OOM */
+
+    ray_t* src_vec = ray_table_get_col(edge_table, src_sym);
+    ray_t* dst_vec = ray_table_get_col(edge_table, dst_sym);
+    if (!src_vec || !dst_vec) return NULL;
+    if (src_vec->type != RAY_I64 || dst_vec->type != RAY_I64) return NULL;
+
+    int64_t n_edges = src_vec->len;
+    if (n_edges != dst_vec->len) return NULL;
+    if (n_src_nodes < 0 || n_dst_nodes < 0) return NULL;
+
+    /* Build edge pairs (>= 1 slot requested, as csr_build_from_pairs does, so an empty table never asks for 0 bytes) */
+    ray_t* pairs_hdr = ray_alloc((size_t)(n_edges > 0 ? n_edges : 1) * sizeof(edge_pair_t));
+    if (!pairs_hdr) return NULL;
+    edge_pair_t* pairs = (edge_pair_t*)ray_data(pairs_hdr);
+
+    int64_t* src_data = (int64_t*)ray_data(src_vec);
+    int64_t* dst_data = (int64_t*)ray_data(dst_vec);
+    for (int64_t i = 0; i < n_edges; i++) {
+        pairs[i].src = src_data[i];
+        pairs[i].dst = dst_data[i];
+        pairs[i].row = i;  /* remembered so rowmap can point back at the edge row */
+    }
+
+    /* Allocate rel (zeroed, name_sym = -1) */
+    ray_rel_t* rel = (ray_rel_t*)ray_sys_alloc(sizeof(ray_rel_t));
+    if (!rel) { ray_free(pairs_hdr); return NULL; }
+    memset(rel, 0, sizeof(ray_rel_t));
+    rel->name_sym = -1;
+
+    /* Build forward CSR (sorted by src), n_src_nodes rows */
+    /* qsort is from libc, not an external dep */
+    qsort(pairs, (size_t)n_edges, sizeof(edge_pair_t), cmp_edge_by_src);
+    ray_err_t err = csr_build_from_pairs(pairs, n_edges, n_src_nodes, false,
+                                         sort_targets, &rel->fwd);
+    if (err != RAY_OK) {
+        ray_free(pairs_hdr);
+        ray_sys_free(rel);
+        return NULL;
+    }
+
+    /* Build reverse CSR (sorted by dst), n_dst_nodes rows */
+    qsort(pairs, (size_t)n_edges, sizeof(edge_pair_t), cmp_edge_by_dst);
+    err = csr_build_from_pairs(pairs, n_edges, n_dst_nodes, true,
+                                sort_targets, &rel->rev);
+    if (err != RAY_OK) {
+        ray_free(pairs_hdr);
+        csr_free(&rel->fwd);  /* forward index was already built */
+        ray_sys_free(rel);
+        return NULL;
+    }
+
+    ray_free(pairs_hdr);
+    return rel;  /* released by the caller with ray_rel_free() */
+}
+
+/* --------------------------------------------------------------------------
+ * ray_rel_build — build from FK column in source table
+ * -------------------------------------------------------------------------- */
+
+ray_rel_t* ray_rel_build(ray_t* from_table, const char* fk_col,
+                         int64_t n_target_nodes, bool sort_targets) {
+    if (!from_table || RAY_IS_ERR(from_table) || from_table->type != RAY_TABLE ||
+        !fk_col) return NULL;  /* fk_col feeds strlen() below */
+
+    int64_t fk_sym = ray_sym_intern(fk_col, strlen(fk_col));
+    if (fk_sym < 0 || n_target_nodes < 0) return NULL;  /* sym intern OOM, or invalid node count */
+    ray_t* fk_vec = ray_table_get_col(from_table, fk_sym);
+    if (!fk_vec || fk_vec->type != RAY_I64) return NULL;
+
+    int64_t n_edges = fk_vec->len;
+    int64_t n_src_nodes = ray_table_nrows(from_table);
+
+    /* Build edge pairs: src = row index, dst = fk value (>= 1 slot requested so an empty table never asks for 0 bytes) */
+    ray_t* pairs_hdr = ray_alloc((size_t)(n_edges > 0 ? n_edges : 1) * sizeof(edge_pair_t));
+    if (!pairs_hdr) return NULL;
+    edge_pair_t* pairs = (edge_pair_t*)ray_data(pairs_hdr);
+
+    int64_t* fk_data = (int64_t*)ray_data(fk_vec);
+    for (int64_t i = 0; i < n_edges; i++) {
+        pairs[i].src = i;
+        pairs[i].dst = fk_data[i];
+        pairs[i].row = i;
+    }
+
+    ray_rel_t* rel = (ray_rel_t*)ray_sys_alloc(sizeof(ray_rel_t));
+    if (!rel) { ray_free(pairs_hdr); return NULL; }
+    memset(rel, 0, sizeof(ray_rel_t));
+    rel->name_sym = -1;
+
+    /* Build forward CSR: one node per row of from_table */
+    qsort(pairs, (size_t)n_edges, sizeof(edge_pair_t), cmp_edge_by_src);
+    ray_err_t err = csr_build_from_pairs(pairs, n_edges, n_src_nodes, false,
+                                         sort_targets, &rel->fwd);
+    if (err != RAY_OK) {
+        ray_free(pairs_hdr);
+        ray_sys_free(rel);
+        return NULL;
+    }
+
+    /* Build reverse CSR: one node per target row, keyed by the fk value */
+    qsort(pairs, (size_t)n_edges, sizeof(edge_pair_t), cmp_edge_by_dst);
+    err = csr_build_from_pairs(pairs, n_edges, n_target_nodes, true,
+                                sort_targets, &rel->rev);
+    if (err != RAY_OK) {
+        ray_free(pairs_hdr);
+        csr_free(&rel->fwd);  /* forward index was already built */
+        ray_sys_free(rel);
+        return NULL;
+    }
+
+    ray_free(pairs_hdr);
+    return rel;  /* released by the caller with ray_rel_free() */
+}
+
+/* --------------------------------------------------------------------------
+ * CSR free
+ * -------------------------------------------------------------------------- */
+
+/* Release every vector the CSR holds, then NULL the fields so that a
+ * second call on the same struct finds nothing left to release. */
+static void csr_free(ray_csr_t* csr) {
+    ray_t** held[] = { &csr->offsets, &csr->targets, &csr->rowmap, &csr->props };
+    const size_t n_held = sizeof(held) / sizeof(held[0]);
+    for (size_t i = 0; i < n_held; i++) {
+        if (*held[i]) ray_release(*held[i]);
+    }
+    for (size_t i = 0; i < n_held; i++) *held[i] = NULL;
+}
+
+void ray_rel_set_props(ray_rel_t* rel, ray_t* props) {
+    if (!rel || !props) return;
+    /* fwd.props and rev.props alias one table but csr_free() releases each
+     * side independently, so take one reference per side before dropping
+     * whatever was installed previously. */
+    ray_csr_t* sides[] = { &rel->fwd, &rel->rev };
+    for (int i = 0; i < 2; i++) ray_retain(props);
+    for (int i = 0; i < 2; i++)
+        if (sides[i]->props) ray_release(sides[i]->props);
+    for (int i = 0; i < 2; i++) sides[i]->props = props;
+}
+
+void ray_rel_free(ray_rel_t* rel) {
+    if (!rel) return;        /* NULL is accepted and ignored */
+    csr_free(&rel->fwd);     /* forward index: offsets, targets, rowmap, props */
+    csr_free(&rel->rev);     /* reverse index: same four fields */
+    ray_sys_free(rel);       /* the struct itself was obtained from ray_sys_alloc() */
+}
+
+/* --- Public CSR neighbor access ------------------------------------------- */
+
+const int64_t* ray_rel_neighbors(ray_rel_t* rel, int64_t node,
+                                 uint8_t direction, int64_t* out_count) {
+    if (!rel) { if (out_count) *out_count = 0; return NULL; }  /* no rel: empty result */
+    ray_csr_t* csr = (direction == 1) ? &rel->rev : &rel->fwd;  /* 1 = reverse (dst -> src); any other value = forward */
+    return ray_csr_neighbors(csr, node, out_count);  /* points into the CSR's targets data, not a copy */
+}
+
+int64_t ray_rel_n_nodes(ray_rel_t* rel, uint8_t direction) {
+    /* Node count of the reverse index when direction == 1, otherwise of the forward index; 0 without a rel. */
+    if (!rel) return 0;
+    return (direction == 1) ? rel->rev.n_nodes : rel->fwd.n_nodes;
+}
+
+/* --------------------------------------------------------------------------
+ * CSR persistence — save/load/mmap using existing column file format
+ * -------------------------------------------------------------------------- */
+
+/* Write one CSR as <dir>/<prefix>_offsets, _targets and (when present) _rowmap; stops at the first error. */
+static ray_err_t csr_save(ray_csr_t* csr, const char* dir, const char* prefix) {
+    char path[1024];  /* scratch buffer reused for each file name */
+    int len;
+
+    len = snprintf(path, sizeof(path), "%s/%s_offsets", dir, prefix);
+    if (len < 0 || (size_t)len >= sizeof(path)) return RAY_ERR_IO;  /* formatting failed or name did not fit */
+    ray_err_t err = ray_col_save(csr->offsets, path);
+    if (err != RAY_OK) return err;
+
+    len = snprintf(path, sizeof(path), "%s/%s_targets", dir, prefix);
+    if (len < 0 || (size_t)len >= sizeof(path)) return RAY_ERR_IO;
+    err = ray_col_save(csr->targets, path);
+    if (err != RAY_OK) return err;
+
+    if (csr->rowmap) {  /* rowmap is optional: no file is written when it is absent */
+        len = snprintf(path, sizeof(path), "%s/%s_rowmap", dir, prefix);
+        if (len < 0 || (size_t)len >= sizeof(path)) return RAY_ERR_IO;
+        err = ray_col_save(csr->rowmap, path);
+        if (err != RAY_OK) return err;
+    }
+
+    return RAY_OK;
+}
+
+/* Loads <dir>/<prefix>_{offsets,targets,rowmap} (mmap'd when use_mmap) and validates them.
+ * Any failure releases what was loaded, NULLs the vector fields and returns RAY_ERR_IO. */
+static ray_err_t csr_load_impl(ray_csr_t* csr, const char* dir, const char* prefix,
+                                bool use_mmap) {
+    char path[1024];
+    int len;
+    int64_t* off_data;
+
+    /* Known state for the shared cleanup at `fail`. */
+    csr->offsets = csr->targets = csr->rowmap = NULL;
+
+    len = snprintf(path, sizeof(path), "%s/%s_offsets", dir, prefix);
+    if (len < 0 || (size_t)len >= sizeof(path)) goto fail;
+    csr->offsets = use_mmap ? ray_col_mmap(path) : ray_col_load(path);
+    if (!csr->offsets || RAY_IS_ERR(csr->offsets)) {
+        csr->offsets = NULL;  /* not a real vector: nothing to release */
+        goto fail;
+    }
+
+    len = snprintf(path, sizeof(path), "%s/%s_targets", dir, prefix);
+    if (len < 0 || (size_t)len >= sizeof(path)) goto fail;
+    csr->targets = use_mmap ? ray_col_mmap(path) : ray_col_load(path);
+    if (!csr->targets || RAY_IS_ERR(csr->targets)) {
+        csr->targets = NULL;
+        goto fail;
+    }
+
+    len = snprintf(path, sizeof(path), "%s/%s_rowmap", dir, prefix);
+    if (len < 0 || (size_t)len >= sizeof(path)) goto fail;
+    csr->rowmap = use_mmap ? ray_col_mmap(path) : ray_col_load(path);
+    if (!csr->rowmap || RAY_IS_ERR(csr->rowmap)) {
+        /* rowmap is optional — ignore error */
+        csr->rowmap = NULL;
+    }
+
+    if (csr->offsets->len < 1) goto fail;
+    csr->n_nodes = csr->offsets->len - 1;
+    csr->n_edges = csr->targets->len;
+
+    /* A rowmap that did load must have one entry per edge, otherwise
+     * indexing it by CSR position would run past its end. */
+    if (csr->rowmap && csr->rowmap->len != csr->n_edges) goto fail;
+
+    /* Consistency: offsets[n_nodes] must equal targets->len */
+    off_data = (int64_t*)ray_data(csr->offsets);
+    if (off_data[csr->n_nodes] != csr->n_edges) goto fail;
+
+    /* Validate offset monotonicity: 0 <= offsets[i] <= offsets[i+1] */
+    for (int64_t i = 0; i < csr->n_nodes; i++) {
+        if (off_data[i] < 0 || off_data[i] > off_data[i + 1]) goto fail;  /* corrupt */
+    }
+
+    csr->sorted = false;  /* caller sets if known */
+    csr->props = NULL;
+
+    return RAY_OK;
+
+fail:
+    if (csr->offsets) { ray_release(csr->offsets); csr->offsets = NULL; }
+    if (csr->targets) { ray_release(csr->targets); csr->targets = NULL; }
+    if (csr->rowmap)  { ray_release(csr->rowmap);  csr->rowmap = NULL; }
+    return RAY_ERR_IO;
+}
+
+ray_err_t ray_rel_save(ray_rel_t* rel, const char* dir) {
+    if (!rel || !dir) return RAY_ERR_IO;
+
+    /* Create directory (this level only; an already-existing one is fine) */
+    if (mkdir(dir, 0755) != 0 && errno != EEXIST) return RAY_ERR_IO;
+
+    ray_err_t err = csr_save(&rel->fwd, dir, "fwd");  /* <dir>/fwd_offsets, fwd_targets, fwd_rowmap */
+    if (err != RAY_OK) return err;
+
+    err = csr_save(&rel->rev, dir, "rev");  /* <dir>/rev_offsets, rev_targets, rev_rowmap */
+    if (err != RAY_OK) return err;
+
+    /* Save metadata (from_table, to_table, name_sym, sorted flags) */
+    char path[1024];
+    int len = snprintf(path, sizeof(path), "%s/meta", dir);
+    if (len < 0 || (size_t)len >= sizeof(path)) return RAY_ERR_IO;
+
+    /* Pack metadata into an I64 vector: [from_table, to_table, name_sym, fwd_sorted, rev_sorted] */
+    int64_t meta_data[5];
+    meta_data[0] = (int64_t)rel->from_table;
+    meta_data[1] = (int64_t)rel->to_table;
+    meta_data[2] = rel->name_sym;
+    meta_data[3] = rel->fwd.sorted ? 1 : 0;
+    meta_data[4] = rel->rev.sorted ? 1 : 0;
+    ray_t* meta_vec = ray_vec_from_raw(RAY_I64, meta_data, 5);
+    if (!meta_vec || RAY_IS_ERR(meta_vec)) return RAY_ERR_OOM;
+    err = ray_col_save(meta_vec, path);
+    ray_release(meta_vec);  /* temporary vector, dropped whether or not the save succeeded */
+
+    return err;  /* result of writing the meta column */
+}
+
+/* Shared body of ray_rel_load()/ray_rel_mmap(): loads the fwd and rev CSRs from
+ * `dir`, then the optional "meta" column.  Returns NULL if either CSR fails to load. */
+static ray_rel_t* rel_load_impl(const char* dir, bool use_mmap) {
+    if (!dir) return NULL;
+
+    ray_rel_t* rel = (ray_rel_t*)ray_sys_alloc(sizeof(ray_rel_t));
+    if (!rel) return NULL;
+    memset(rel, 0, sizeof(ray_rel_t));
+    /* Same default the builders use; kept when meta is missing or too short. */
+    rel->name_sym = -1;
+
+    ray_err_t err = csr_load_impl(&rel->fwd, dir, "fwd", use_mmap);
+    if (err != RAY_OK) { ray_sys_free(rel); return NULL; }
+
+    err = csr_load_impl(&rel->rev, dir, "rev", use_mmap);
+    if (err != RAY_OK) { csr_free(&rel->fwd); ray_sys_free(rel); return NULL; }
+
+    /* Load metadata: [from_table, to_table, name_sym, fwd_sorted, rev_sorted] */
+    char path[1024];
+    int len = snprintf(path, sizeof(path), "%s/meta", dir);
+    if (len >= 0 && (size_t)len < sizeof(path)) {
+        ray_t* meta = use_mmap ? ray_col_mmap(path) : ray_col_load(path);
+        if (meta && !RAY_IS_ERR(meta)) {
+            if (meta->len >= 5) {
+                int64_t* md = (int64_t*)ray_data(meta);
+                rel->from_table = (uint16_t)md[0];
+                rel->to_table = (uint16_t)md[1];
+                rel->name_sym = md[2];
+                rel->fwd.sorted = md[3] != 0;
+                rel->rev.sorted = md[4] != 0;
+            }
+            ray_release(meta);  /* the values needed were copied out above */
+        }
+    }
+
+    return rel;
+}
+
+ray_rel_t* ray_rel_load(const char* dir) {
+    return rel_load_impl(dir, false);  /* use_mmap = false: columns are read with ray_col_load() */
+}
+
+ray_rel_t* ray_rel_mmap(const char* dir) {
+    return rel_load_impl(dir, true);  /* use_mmap = true: columns are opened with ray_col_mmap() */
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/csr.h b/crates/rayforce-sys/vendor/rayforce/src/store/csr.h
new file mode 100644
index 0000000..ece3a53
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/store/csr.h
@@ -0,0 +1,79 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_CSR_H
+#define RAY_CSR_H
+
+#include <rayforce.h>
+
+/* Compressed Sparse Row edge index.
+ *
+ * offsets[i]..offsets[i+1] gives the range in targets[] for node i's neighbors.
+ * Stored as ray_t I64 vectors — same allocator, mmap, COW as everything else.
+ *
+ * If sorted == true, targets within each adjacency list are sorted ascending.
+ * Required for OP_WCO_JOIN (Leapfrog Triejoin).
+ */
+typedef struct ray_csr {
+    ray_t*    offsets;      /* I64 vec, length = n_nodes + 1                 */
+    ray_t*    targets;      /* I64 vec, length = n_edges                     */
+    ray_t*    rowmap;       /* I64 vec, length = n_edges (CSR pos -> prop row); NULL when a loaded CSR has no rowmap file */
+    ray_t*    props;        /* optional edge property table (ray_t RAY_TABLE); NULL until set */
+    int64_t  n_nodes;      /* number of nodes, i.e. offsets length - 1      */
+    int64_t  n_edges;      /* number of entries in targets                  */
+    bool     sorted;       /* targets sorted per adjacency list             */
+} ray_csr_t;
+
+/* Relationship: double-indexed CSR (forward + reverse).
+ *
+ * from_table/to_table are opaque IDs assigned by the caller (planner).
+ * librayforce does not manage a table registry -- it just stores the IDs
+ * so the caller can identify which tables this rel connects.
+ */
+typedef struct ray_rel {
+    uint16_t    from_table;   /* caller-assigned ID of the source table */
+    uint16_t    to_table;     /* caller-assigned ID of the target table */
+    int64_t     name_sym;     /* relationship name as symbol ID; the builders in csr.c start it at -1 */
+    ray_csr_t    fwd;          /* src -> dst */
+    ray_csr_t    rev;          /* dst -> src */
+} ray_rel_t;
+
+/* O(1) degree lookup: number of neighbors of node; 0 when csr or its offsets is NULL, or node is outside [0, n_nodes). */
+static inline int64_t ray_csr_degree(ray_csr_t* csr, int64_t node) {
+    if (!csr || !csr->offsets || node < 0 || node >= csr->n_nodes) return 0;
+    int64_t* o = (int64_t*)ray_data(csr->offsets);
+    return o[node + 1] - o[node];  /* width of node's slice of targets[] */
+}
+
+/* Pointer to node's neighbor ids inside targets; their count is stored in *out_count when
+ * out_count is non-NULL.  Returns NULL (count 0) if csr is unusable or node is out of range. */
+static inline int64_t* ray_csr_neighbors(ray_csr_t* csr, int64_t node, int64_t* out_count) {
+    if (out_count) *out_count = 0;
+    if (!csr || !csr->offsets || !csr->targets || node < 0 || node >= csr->n_nodes) return NULL;
+    int64_t* o = (int64_t*)ray_data(csr->offsets);
+    int64_t* t = (int64_t*)ray_data(csr->targets);
+    if (out_count) *out_count = o[node + 1] - o[node];  /* out_count is optional on every path */
+    return &t[o[node]];
+}
+
+#endif /* RAY_CSR_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/fileio.c b/crates/rayforce-sys/vendor/rayforce/src/store/fileio.c
new file mode 100644
index 0000000..8586c13
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/store/fileio.c
@@ -0,0 +1,270 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "fileio.h"
+
+#include <limits.h>
+
+/* Size of the scratch buffers used for paths.  POSIX builds use PATH_MAX
+ * when <limits.h> defines it (typically 4096 on Linux) and 4096 otherwise;
+ * Windows builds use 4096 rather than MAX_PATH (260) so that deep splayed
+ * paths (e.g. /db/yyyy.mm.dd/table/) are not rejected for being too long. */
+#ifdef RAY_OS_WINDOWS
+#  define RAY_PATH_MAX 4096
+#elif defined(PATH_MAX)
+#  define RAY_PATH_MAX PATH_MAX
+#else
+#  define RAY_PATH_MAX 4096
+#endif
+
+#ifdef RAY_OS_WINDOWS
+
+#include <errno.h>
+
+/* ===== Windows implementation ===== */
+
+/* Translate GetLastError() into errno so callers can use errno portably; codes without a case below become EIO. */
+static void win_set_errno(void) {
+    DWORD e = GetLastError();
+    switch (e) {
+    case ERROR_FILE_NOT_FOUND:
+    case ERROR_PATH_NOT_FOUND:    errno = ENOENT;  break;
+    case ERROR_ACCESS_DENIED:     errno = EACCES;  break;
+    case ERROR_WRITE_PROTECT:     errno = EROFS;   break;
+    case ERROR_TOO_MANY_OPEN_FILES: errno = EMFILE; break;
+    case ERROR_FILE_EXISTS:
+    case ERROR_ALREADY_EXISTS:    errno = EEXIST;  break;
+    default:                      errno = EIO;     break;
+    }
+}
+
+ray_fd_t ray_file_open(const char* path, int flags) {
+    if (!path) return RAY_FD_INVALID;
+
+    DWORD access = 0;
+    DWORD creation = OPEN_EXISTING;  /* a missing file is an error unless RAY_OPEN_CREATE is given */
+
+    if (flags & RAY_OPEN_READ)  access |= GENERIC_READ;
+    if (flags & RAY_OPEN_WRITE) access |= GENERIC_WRITE;
+    if (flags & RAY_OPEN_CREATE) creation = OPEN_ALWAYS;  /* open if present, create otherwise */
+
+    HANDLE h = CreateFileA(path, access, FILE_SHARE_READ | FILE_SHARE_WRITE,
+                           NULL, creation, FILE_ATTRIBUTE_NORMAL, NULL);
+    if (h == INVALID_HANDLE_VALUE) win_set_errno();  /* surface the failure through errno */
+    return h;  /* INVALID_HANDLE_VALUE is RAY_FD_INVALID on this platform */
+}
+
+void ray_file_close(ray_fd_t fd) {
+    if (fd != RAY_FD_INVALID) CloseHandle(fd);  /* invalid handles are ignored; the CloseHandle result is not reported */
+}
+
+ray_err_t ray_file_lock_ex(ray_fd_t fd) {
+    /* Exclusive lock on the byte range starting at offset 0 (zeroed OVERLAPPED). */
+    OVERLAPPED range_start = {0};
+    if (fd == RAY_FD_INVALID) return RAY_ERR_IO;
+    BOOL locked = LockFileEx(fd, LOCKFILE_EXCLUSIVE_LOCK, 0, MAXDWORD, MAXDWORD, &range_start);
+    return locked ? RAY_OK : RAY_ERR_IO;
+}
+
+ray_err_t ray_file_lock_sh(ray_fd_t fd) {
+    /* Shared lock (flags = 0) on the byte range starting at offset 0 (zeroed OVERLAPPED). */
+    OVERLAPPED range_start = {0};
+    if (fd == RAY_FD_INVALID) return RAY_ERR_IO;
+    BOOL locked = LockFileEx(fd, 0, 0, MAXDWORD, MAXDWORD, &range_start);
+    return locked ? RAY_OK : RAY_ERR_IO;
+}
+
+ray_err_t ray_file_unlock(ray_fd_t fd) {
+    /* Unlocking an invalid handle is reported as success; otherwise release the range the lock calls take. */
+    OVERLAPPED range_start = {0};
+    if (fd == RAY_FD_INVALID) return RAY_OK;
+    BOOL released = UnlockFileEx(fd, 0, MAXDWORD, MAXDWORD, &range_start);
+    return released ? RAY_OK : RAY_ERR_IO;
+}
+
+ray_err_t ray_file_sync(ray_fd_t fd) {
+    /* Flush the handle's buffered writes with FlushFileBuffers(). */
+    if (fd == RAY_FD_INVALID) return RAY_ERR_IO;
+    return FlushFileBuffers(fd) ? RAY_OK : RAY_ERR_IO;
+}
+
+ray_err_t ray_file_sync_dir(const char* path) {
+    /* Windows: rename durability is handled by MOVEFILE_WRITE_THROUGH in
+     * ray_file_rename; no separate directory fsync needed. */
+    (void)path;  /* unused on this platform */
+    return RAY_OK;  /* always succeeds, even for a NULL path (the POSIX build rejects NULL) */
+}
+
+ray_err_t ray_file_rename(const char* old_path, const char* new_path) {
+    /* MOVEFILE_REPLACE_EXISTING overwrites an existing destination;
+     * MOVEFILE_WRITE_THROUGH flushes the rename to disk before returning,
+     * providing crash-safe durability equivalent to POSIX fsync-after-rename. */
+    const DWORD move_flags = MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH;
+    if (!old_path || !new_path) return RAY_ERR_IO;
+    if (MoveFileExA(old_path, new_path, move_flags)) return RAY_OK;
+    return RAY_ERR_IO;
+}
+
+ray_err_t ray_mkdir(const char* path) {
+    /* Create one directory level; an already-existing path counts as success. */
+    if (!path) return RAY_ERR_IO;
+    if (CreateDirectoryA(path, NULL)) return RAY_OK;
+    DWORD why = GetLastError();
+    return (why == ERROR_ALREADY_EXISTS) ? RAY_OK : RAY_ERR_IO;
+}
+
+ray_err_t ray_mkdir_p(const char* path) {
+    if (!path || !*path) return RAY_ERR_IO;  /* NULL and "" are both rejected */
+    char buf[RAY_PATH_MAX];  /* private copy, cut at each separator in turn below */
+    size_t len = strlen(path);
+    if (len >= sizeof(buf)) return RAY_ERR_IO;  /* too long for the buffer: rejected, not truncated */
+    memcpy(buf, path, len + 1);
+    /* Normalize trailing separator: trim it so the loop creates `buf` itself. */
+    while (len > 1 && (buf[len - 1] == '/' || buf[len - 1] == '\\')) buf[--len] = '\0';
+    for (size_t i = 1; i < len; i++) {  /* starts at 1, so a leading separator is never cut */
+        if (buf[i] == '/' || buf[i] == '\\') {
+            char saved = buf[i];  /* either separator style; restored exactly as found */
+            buf[i] = '\0';
+            if (!CreateDirectoryA(buf, NULL) && GetLastError() != ERROR_ALREADY_EXISTS) {
+                buf[i] = saved;
+                return RAY_ERR_IO;
+            }
+            buf[i] = saved;
+        }
+    }
+    if (!CreateDirectoryA(buf, NULL) && GetLastError() != ERROR_ALREADY_EXISTS) return RAY_ERR_IO;  /* the full path itself */
+    return RAY_OK;
+}
+
+#else
+
+/* ===== POSIX implementation ===== */
+
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+
+ray_fd_t ray_file_open(const char* path, int flags) {
+    if (!path) return RAY_FD_INVALID;
+
+    /* Access mode: write-only only when WRITE is requested without READ;
+     * READ alone, or neither flag, opens read-only. */
+    int oflags;
+    switch (flags & (RAY_OPEN_READ | RAY_OPEN_WRITE)) {
+    case RAY_OPEN_READ | RAY_OPEN_WRITE: oflags = O_RDWR;   break;
+    case RAY_OPEN_WRITE:                 oflags = O_WRONLY; break;
+    default:                             oflags = O_RDONLY; break;
+    }
+    if (flags & RAY_OPEN_CREATE) oflags |= O_CREAT;
+
+    return open(path, oflags, 0644);  /* the mode argument only matters when O_CREAT creates the file */
+}
+
+void ray_file_close(ray_fd_t fd) {
+    if (fd != RAY_FD_INVALID) close(fd);  /* invalid descriptors are ignored; the close() result is not reported */
+}
+
+ray_err_t ray_file_lock_ex(ray_fd_t fd) {
+    /* Exclusive flock() lock; LOCK_NB is not passed. */
+    if (fd == RAY_FD_INVALID) return RAY_ERR_IO;
+    return flock(fd, LOCK_EX) == 0 ? RAY_OK : RAY_ERR_IO;
+}
+
+ray_err_t ray_file_lock_sh(ray_fd_t fd) {
+    /* Shared flock() lock; LOCK_NB is not passed. */
+    if (fd == RAY_FD_INVALID) return RAY_ERR_IO;
+    return flock(fd, LOCK_SH) == 0 ? RAY_OK : RAY_ERR_IO;
+}
+
+ray_err_t ray_file_unlock(ray_fd_t fd) {
+    /* Unlocking an invalid descriptor is reported as success; otherwise drop the flock() lock. */
+    if (fd == RAY_FD_INVALID) return RAY_OK;
+    return flock(fd, LOCK_UN) == 0 ? RAY_OK : RAY_ERR_IO;
+}
+
+ray_err_t ray_file_sync(ray_fd_t fd) {
+    /* fsync() the descriptor; any failure is reported as RAY_ERR_IO. */
+    if (fd == RAY_FD_INVALID) return RAY_ERR_IO;
+    return fsync(fd) == 0 ? RAY_OK : RAY_ERR_IO;
+}
+
+ray_err_t ray_file_sync_dir(const char* path) {
+    if (!path) return RAY_ERR_IO;
+    /* fsync the directory that contains `path`.  The buffer is RAY_PATH_MAX,
+     * like ray_mkdir_p, so a path that could be created can also be synced. */
+    char dir[RAY_PATH_MAX];
+    size_t len = strlen(path);
+    if (len >= sizeof(dir)) return RAY_ERR_IO;
+    memcpy(dir, path, len + 1);
+    /* Find last '/' and cut there to obtain the parent directory */
+    char* slash = strrchr(dir, '/');
+    if (!slash) {
+        dir[0] = '.'; dir[1] = '\0';  /* no separator: current directory */
+    } else if (slash == dir) {
+        dir[1] = '\0';  /* root directory */
+    } else {
+        *slash = '\0';  /* keep everything before the last separator */
+    }
+    int fd = open(dir, O_RDONLY);
+    if (fd < 0) return RAY_ERR_IO;
+    int rc = fsync(fd);
+    close(fd);
+    return (rc == 0) ? RAY_OK : RAY_ERR_IO;
+}
+
+ray_err_t ray_file_rename(const char* old_path, const char* new_path) {
+    /* Thin wrapper over rename(); NULL arguments and any failure are reported as RAY_ERR_IO. */
+    if (!old_path || !new_path) return RAY_ERR_IO;
+    return rename(old_path, new_path) == 0 ? RAY_OK : RAY_ERR_IO;
+}
+
+ray_err_t ray_mkdir(const char* path) {
+    /* Create one directory level (mode 0755); EEXIST counts as success. */
+    if (!path) return RAY_ERR_IO;
+    return (mkdir(path, 0755) == 0 || errno == EEXIST) ? RAY_OK : RAY_ERR_IO;
+}
+
+ray_err_t ray_mkdir_p(const char* path) {
+    /* Create every prefix that ends at a '/', then the full path; EEXIST is not an error. */
+    if (!path || !*path) return RAY_ERR_IO;
+    char work[RAY_PATH_MAX];
+    size_t n = strlen(path);
+    if (n >= sizeof(work)) return RAY_ERR_IO;
+    memcpy(work, path, n + 1);
+    /* Drop trailing slashes (keeping a lone "/") so the last mkdir targets the path itself. */
+    while (n > 1 && work[n - 1] == '/') work[--n] = '\0';
+    for (char* sep = work + 1; sep < work + n; sep++) {
+        if (*sep != '/') continue;
+        *sep = '\0';  /* temporarily end the string at this separator */
+        int rc = mkdir(work, 0755);
+        *sep = '/';
+        if (rc != 0 && errno != EEXIST) return RAY_ERR_IO;
+    }
+    return (mkdir(work, 0755) != 0 && errno != EEXIST) ? RAY_ERR_IO : RAY_OK;
+}
+
+#endif
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/fileio.h b/crates/rayforce-sys/vendor/rayforce/src/store/fileio.h
new file mode 100644
index 0000000..658e560
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/store/fileio.h
@@ -0,0 +1,54 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_FILEIO_H
+#define RAY_FILEIO_H
+
+#include <rayforce.h>
+
+/* Cross-platform file I/O (locking, sync, atomic rename).
+ *
+ * `ray_fd_t` is the native handle type: a Win32 HANDLE when
+ * RAY_OS_WINDOWS is defined, a plain int descriptor otherwise.
+ * RAY_FD_INVALID is the matching "no handle" sentinel. */
+#ifdef RAY_OS_WINDOWS
+  #include <windows.h>
+  typedef HANDLE ray_fd_t;
+  #define RAY_FD_INVALID INVALID_HANDLE_VALUE
+#else
+  typedef int ray_fd_t;
+  #define RAY_FD_INVALID (-1)
+#endif
+
+/* Bit flags for the `flags` argument of ray_file_open(); distinct bits,
+ * so presumably OR-combinable — confirm against the implementation. */
+#define RAY_OPEN_READ   0x01
+#define RAY_OPEN_WRITE  0x02
+#define RAY_OPEN_CREATE 0x04
+
+/* Open/close and advisory locking.  NOTE(review): semantics are defined
+ * in fileio.c; the names suggest exclusive (_ex) / shared (_sh) locks and
+ * a data flush for ray_file_sync — confirm there. */
+ray_fd_t  ray_file_open(const char* path, int flags);
+void     ray_file_close(ray_fd_t fd);
+ray_err_t ray_file_lock_ex(ray_fd_t fd);
+ray_err_t ray_file_lock_sh(ray_fd_t fd);
+ray_err_t ray_file_unlock(ray_fd_t fd);
+ray_err_t ray_file_sync(ray_fd_t fd);
+/* Sync the directory containing `path` (POSIX build: fsync on the parent
+ * directory).  Returns RAY_ERR_IO on failure. */
+ray_err_t ray_file_sync_dir(const char* path);
+/* Rename old_path -> new_path; RAY_ERR_IO on NULL arguments or failure. */
+ray_err_t ray_file_rename(const char* old_path, const char* new_path);
+/* Create one directory; an already-existing entry is not an error
+ * (POSIX build tolerates EEXIST). */
+ray_err_t ray_mkdir(const char* path);
+ray_err_t ray_mkdir_p(const char* path);  /* like `mkdir -p` */
+
+#endif /* RAY_FILEIO_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/hnsw.c b/crates/rayforce-sys/vendor/rayforce/src/store/hnsw.c
new file mode 100644
index 0000000..dc939a4
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/store/hnsw.c
@@ -0,0 +1,972 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "hnsw.h"
+#include "mem/sys.h"
+#include <math.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <errno.h>
+
+/* --------------------------------------------------------------------------
+ * Distance dispatch — each metric maps to a scalar where lower = closer,
+ * as required by the HNSW beam search.
+ *
+ *   COSINE  → 1 - cos(a, b)          (range [0, 2])
+ *   L2      → sqrt(Σ (a_i - b_i)^2)  (Euclidean)
+ *   IP      → -dot(a, b)             (negated so lower=closer)
+ *
+ * Note on L2: we keep the sqrt (true Euclidean), even
+ * though omitting it preserves ordering.  The sqrt cost is dominated by
+ * the inner loop on modern cores, and returning true distances avoids
+ * surprising callers who compare against thresholds.
+ * -------------------------------------------------------------------------- */
+
+/* Cosine distance 1 - cos(a, b) over `dim` components, accumulated in
+ * double precision.  Returns 1.0 when either vector has zero norm (the
+ * angle is undefined), which also covers dim <= 0. */
+static double hnsw_cosine_dist(const float* a, const float* b, int32_t dim) {
+    double ab = 0.0;
+    double aa = 0.0;
+    double bb = 0.0;
+    int32_t k = 0;
+    while (k < dim) {
+        const double x = (double)a[k];
+        const double y = (double)b[k];
+        ab += x * y;
+        aa += x * x;
+        bb += y * y;
+        k++;
+    }
+    const double norm = sqrt(aa) * sqrt(bb);
+    if (norm > 0.0) return 1.0 - ab / norm;
+    return 1.0;
+}
+
+/* Euclidean (L2) distance between a and b over `dim` components.
+ * The square root is applied, so the value is a true distance. */
+static double hnsw_l2_dist(const float* a, const float* b, int32_t dim) {
+    double sum_sq = 0.0;
+    int32_t k = 0;
+    while (k < dim) {
+        const double diff = (double)a[k] - (double)b[k];
+        sum_sq += diff * diff;
+        k++;
+    }
+    return sqrt(sum_sq);
+}
+
+/* Inner-product "distance": the negated dot product, so that a larger
+ * similarity maps to a smaller (closer) value. */
+static double hnsw_ip_dist(const float* a, const float* b, int32_t dim) {
+    double acc = 0.0;
+    int32_t k = 0;
+    while (k < dim) {
+        acc += (double)a[k] * (double)b[k];
+        k++;
+    }
+    return -acc;
+}
+
+/* Dispatch to the distance function selected by idx->metric.
+ * L2 and IP are matched explicitly; COSINE and any unrecognised value
+ * fall back to cosine distance. */
+static double hnsw_dist(const ray_hnsw_t* idx, const float* a, const float* b) {
+    const ray_hnsw_metric_t metric = (ray_hnsw_metric_t)idx->metric;
+    if (metric == RAY_HNSW_L2) return hnsw_l2_dist(a, b, idx->dim);
+    if (metric == RAY_HNSW_IP) return hnsw_ip_dist(a, b, idx->dim);
+    return hnsw_cosine_dist(a, b, idx->dim);
+}
+
+/* --------------------------------------------------------------------------
+ * Random level assignment (HNSW paper, Section 3.1)
+ * -------------------------------------------------------------------------- */
+
+/* Per-thread generator state.  Seeded with a fixed constant, so each
+ * thread produces the same deterministic sequence. */
+static _Thread_local uint32_t hnsw_rng_state = 42;
+
+/* xorshift32 step (shifts 13 / 17 / 5): advances the thread-local state
+ * and returns the new value. */
+static uint32_t hnsw_rand(void) {
+    const uint32_t s0 = hnsw_rng_state;
+    const uint32_t s1 = s0 ^ (s0 << 13);
+    const uint32_t s2 = s1 ^ (s1 >> 17);
+    const uint32_t s3 = s2 ^ (s2 << 5);
+    hnsw_rng_state = s3;
+    return s3;
+}
+
+/* Draw a random top layer for a new node: floor(-ln(U) / ln(M)) with U
+ * uniform in (0, 1], clamped to [0, HNSW_MAX_LAYERS - 1].
+ *
+ * M < 2 is handled explicitly: ln(1) == 0 would make the multiplier
+ * infinite, and converting the resulting inf/NaN to int32_t is undefined
+ * behaviour.  Such an index gets a single layer (level 0). */
+static int32_t hnsw_random_level(int32_t M) {
+    if (M < 2) return 0;
+
+    double ml = 1.0 / log((double)M);
+    double r = (double)hnsw_rand() / (double)UINT32_MAX;
+    /* Keep r away from 0 so -log(r) stays finite. */
+    if (r < 1e-10) r = 1e-10;
+    int32_t level = (int32_t)floor(-log(r) * ml);
+    if (level >= HNSW_MAX_LAYERS) level = HNSW_MAX_LAYERS - 1;
+    return level;
+}
+
+/* --------------------------------------------------------------------------
+ * Candidate heap (min-heap by distance for beam search)
+ * -------------------------------------------------------------------------- */
+
+/* One search candidate: a node paired with its distance to the query.
+ * Element type of both the min-heap (candidate queue) and the max-heap
+ * (result set) used by the beam search. */
+typedef struct {
+    int64_t id;     /* global node id */
+    double  dist;   /* hnsw_dist() value for the index metric; lower = closer */
+} hnsw_cand_t;
+
+/* Min-heap sift-up: move the element at index i toward the root until
+ * its parent's distance is <= its own. */
+static void heap_sift_up(hnsw_cand_t* h, int64_t i) {
+    if (i <= 0) return;
+
+    /* Hold the rising element aside and slide larger parents down into
+     * the hole instead of swapping at every level. */
+    const hnsw_cand_t moving = h[i];
+    while (i > 0) {
+        const int64_t parent = (i - 1) / 2;
+        if (h[parent].dist <= moving.dist) break;
+        h[i] = h[parent];
+        i = parent;
+    }
+    h[i] = moving;
+}
+
+/* Min-heap sift-down: move the element at index i toward the leaves of
+ * the n-element heap until neither child is strictly smaller. */
+static void heap_sift_down(hnsw_cand_t* h, int64_t n, int64_t i) {
+    if (i < 0 || i >= n) return;
+
+    const hnsw_cand_t moving = h[i];
+    for (;;) {
+        const int64_t left = 2 * i + 1;
+        const int64_t right = left + 1;
+        int64_t pick = i;
+        double pick_dist = moving.dist;
+        /* Left child is compared first; the right child must then beat
+         * whichever of the two is currently smallest. */
+        if (left < n && h[left].dist < pick_dist) {
+            pick = left;
+            pick_dist = h[left].dist;
+        }
+        if (right < n && h[right].dist < pick_dist) {
+            pick = right;
+        }
+        if (pick == i) break;
+        h[i] = h[pick];
+        i = pick;
+    }
+    h[i] = moving;
+}
+
+/* Max-heap sift-up: move the element at index i toward the root until
+ * its parent's distance is >= its own (largest distance stays on top). */
+static void maxheap_sift_up(hnsw_cand_t* h, int64_t i) {
+    if (i <= 0) return;
+
+    const hnsw_cand_t moving = h[i];
+    while (i > 0) {
+        const int64_t parent = (i - 1) / 2;
+        if (h[parent].dist >= moving.dist) break;
+        h[i] = h[parent];
+        i = parent;
+    }
+    h[i] = moving;
+}
+
+/* Max-heap sift-down: move the element at index i toward the leaves of
+ * the n-element heap until neither child is strictly larger. */
+static void maxheap_sift_down(hnsw_cand_t* h, int64_t n, int64_t i) {
+    if (i < 0 || i >= n) return;
+
+    const hnsw_cand_t moving = h[i];
+    for (;;) {
+        const int64_t left = 2 * i + 1;
+        const int64_t right = left + 1;
+        int64_t pick = i;
+        double pick_dist = moving.dist;
+        if (left < n && h[left].dist > pick_dist) {
+            pick = left;
+            pick_dist = h[left].dist;
+        }
+        if (right < n && h[right].dist > pick_dist) {
+            pick = right;
+        }
+        if (pick == i) break;
+        h[i] = h[pick];
+        i = pick;
+    }
+    h[i] = moving;
+}
+
+/* --------------------------------------------------------------------------
+ * Visited set (bitset)
+ * -------------------------------------------------------------------------- */
+
+/* Visited-node bitset for one beam search: one bit per global node id. */
+typedef struct {
+    uint8_t* bits;      /* ceil(n_nodes / 8) bytes; NULL if allocation failed */
+    int64_t  n_nodes;   /* number of valid ids; ids outside [0, n_nodes) are rejected */
+} hnsw_visited_t;
+
+/* Allocate a zeroed bitset covering ids [0, n_nodes).
+ * On allocation failure the returned set has bits == NULL; callers must
+ * check that before using it. */
+static hnsw_visited_t visited_new(int64_t n_nodes) {
+    const size_t n_bytes = ((size_t)n_nodes + 7) / 8;
+    hnsw_visited_t set;
+    set.bits = (uint8_t*)ray_sys_alloc(n_bytes);
+    set.n_nodes = n_nodes;
+    if (set.bits != NULL) memset(set.bits, 0, n_bytes);
+    return set;
+}
+
+/* Release the bitset storage (if any) and leave the set with bits == NULL. */
+static void visited_free(hnsw_visited_t* v) {
+    uint8_t* storage = v->bits;
+    v->bits = NULL;
+    if (storage != NULL) ray_sys_free(storage);
+}
+
+/* Return true if `id` has been marked.  Ids outside [0, n_nodes) report
+ * true as well, so callers skip them instead of indexing out of bounds. */
+static bool visited_test(const hnsw_visited_t* v, int64_t id) {
+    const bool in_range = (id >= 0) && (id < v->n_nodes);
+    if (!in_range) return true;
+    const uint8_t mask = (uint8_t)(1u << (id & 7));
+    return (v->bits[id >> 3] & mask) != 0;
+}
+
+/* Mark `id` as visited; ids outside [0, n_nodes) are ignored. */
+static void visited_set(hnsw_visited_t* v, int64_t id) {
+    if (id < 0 || id >= v->n_nodes) return;
+    v->bits[id >> 3] |= (uint8_t)(1u << (id & 7));
+}
+
+/* --------------------------------------------------------------------------
+ * Layer helper: find index of global node id within a layer
+ * -------------------------------------------------------------------------- */
+
+/* Map a global node id to its row index inside `layer`.
+ * Returns -1 when the node is not present in the layer.
+ *
+ * Fast path: if row `global_id` already holds `global_id`, that row is the
+ * answer.  This is always the case on layer 0, where every node is present
+ * and local == global, turning the hot layer-0 lookup from an O(n) scan
+ * into O(1).  Assumes ids are unique within a layer, as the build
+ * produces them.
+ *
+ * Slow path: linear scan, used for upper layers, which hold only a
+ * fraction of the nodes. */
+static int64_t layer_local_idx(const ray_hnsw_layer_t* layer, int64_t global_id) {
+    if (global_id >= 0 && global_id < layer->n_nodes &&
+        layer->node_ids[global_id] == global_id) {
+        return global_id;
+    }
+    for (int64_t i = 0; i < layer->n_nodes; i++) {
+        if (layer->node_ids[i] == global_id) return i;
+    }
+    return -1;
+}
+
+/* Look up the neighbour row of `global_id` in `layer`.
+ * On success returns a pointer to the row and stores its capacity in
+ * *out_M_max.  If the node is not in the layer, returns NULL and stores 0. */
+static int64_t* layer_neighbors(const ray_hnsw_layer_t* layer, int64_t global_id,
+                                  int64_t* out_M_max) {
+    const int64_t row = layer_local_idx(layer, global_id);
+    if (row >= 0) {
+        *out_M_max = layer->M_max;
+        return layer->neighbors + row * layer->M_max;
+    }
+    *out_M_max = 0;
+    return NULL;
+}
+
+/* Number of occupied slots in a neighbour row: the length of the leading
+ * run of non-negative ids (the first negative entry ends the list). */
+static int64_t count_neighbors(const int64_t* nb, int64_t M_max) {
+    int64_t used = 0;
+    while (used < M_max && nb[used] >= 0) used++;
+    return used;
+}
+
+/* Insert `new_id` into a neighbour row.
+ * Returns true if the id was stored in the first free slot or was already
+ * present; false if every slot is occupied by a different id. */
+static bool add_neighbor(int64_t* nb, int64_t M_max, int64_t new_id) {
+    int64_t slot = 0;
+    while (slot < M_max) {
+        const int64_t cur = nb[slot];
+        if (cur < 0) {
+            /* First empty slot: append here. */
+            nb[slot] = new_id;
+            return true;
+        }
+        if (cur == new_id) return true;
+        slot++;
+    }
+    return false;
+}
+
+/* --------------------------------------------------------------------------
+ * Search layer: beam search on a single layer.
+ *
+ * When `accept` is non-NULL, behaves as a filtered iterative scan:
+ *   - Candidate-queue expansion still walks through rejected neighbours so
+ *     accepted descendants remain reachable (preserves graph connectivity).
+ *   - Only nodes passing `accept(node_id, ctx)` enter the result heap.
+ *   - Candidate capacity is widened to n_nodes so pathologically selective
+ *     filters don't silently drop unexplored regions.
+ *
+ * When `accept` is NULL, behaviour is identical to the original ef-bounded
+ * beam search.
+ * -------------------------------------------------------------------------- */
+
+/* Beam search on one layer.
+ *
+ *   idx, layer_idx  index and layer to search
+ *   query           query vector (idx->dim floats)
+ *   entry_points    n_entries global ids to seed the search
+ *   ef              beam width / maximum number of results
+ *   results         caller-allocated buffer of at least `ef` entries; on
+ *                   return holds the hits sorted by ascending distance
+ *   accept          optional filter; NULL selects the plain ef-bounded search
+ *
+ * Return value convention: non-negative = number of results written.
+ * -1 = allocation failure (OOM) — callers must surface a distinct error
+ * rather than treat it as "no matches".  ef <= 0 yields 0 results. */
+static int64_t hnsw_search_layer(
+    const ray_hnsw_t* idx,
+    const float* query,
+    const int64_t* entry_points, int64_t n_entries,
+    int32_t layer_idx,
+    int32_t ef,
+    hnsw_cand_t* results /* pre-allocated, ef entries */,
+    ray_hnsw_accept_fn accept, void* accept_ctx)
+{
+    /* Nothing can be returned, and results[0] must never be inspected
+     * while the result heap is empty. */
+    if (ef <= 0) return 0;
+
+    const ray_hnsw_layer_t* layer = &idx->layers[layer_idx];
+
+    hnsw_visited_t vis = visited_new(idx->n_nodes);
+    if (!vis.bits) return -1;
+
+    /* Candidate capacity.  Unfiltered: tight bound, standard HNSW.
+     * Filtered: worst case is a full-graph scan, so budget n_nodes.
+     * Memory: n_nodes * sizeof(hnsw_cand_t) = n_nodes * 16 bytes.
+     * The arithmetic is widened to 64 bits first so a large ef cannot
+     * overflow int32_t. */
+    int64_t cand_cap = (int64_t)ef * 2 + n_entries + 1;
+    if (accept && idx->n_nodes > cand_cap) cand_cap = idx->n_nodes;
+    hnsw_cand_t* candidates = (hnsw_cand_t*)ray_sys_alloc((size_t)cand_cap * sizeof(hnsw_cand_t));
+    if (!candidates) { visited_free(&vis); return -1; }
+    int64_t cand_sz = 0;
+    int64_t res_sz = 0;
+
+    /* Initialize with entry points.  visited_test() reports out-of-range
+     * ids as already visited, so invalid entries are skipped. */
+    for (int64_t i = 0; i < n_entries; i++) {
+        int64_t ep = entry_points[i];
+        if (visited_test(&vis, ep)) continue;
+        visited_set(&vis, ep);
+
+        double d = hnsw_dist(idx, query, idx->vectors + ep * idx->dim);
+
+        /* Always add to candidate queue. */
+        candidates[cand_sz] = (hnsw_cand_t){ ep, d };
+        heap_sift_up(candidates, cand_sz);
+        cand_sz++;
+
+        /* Add to results only if no filter, or filter accepts.  Bounded
+         * by ef exactly like the beam loop, so more entry points than ef
+         * cannot overrun the caller's buffer. */
+        if (!accept || accept(ep, accept_ctx)) {
+            if (res_sz < ef) {
+                results[res_sz] = (hnsw_cand_t){ ep, d };
+                maxheap_sift_up(results, res_sz);
+                res_sz++;
+            } else if (d < results[0].dist) {
+                results[0] = (hnsw_cand_t){ ep, d };
+                maxheap_sift_down(results, res_sz, 0);
+            }
+        }
+    }
+
+    /* Beam loop. */
+    while (cand_sz > 0) {
+        /* Pop the closest unexpanded candidate from the min-heap. */
+        hnsw_cand_t closest = candidates[0];
+        candidates[0] = candidates[cand_sz - 1];
+        cand_sz--;
+        if (cand_sz > 0) heap_sift_down(candidates, cand_sz, 0);
+
+        /* Termination: closest unexpanded is worse than farthest accepted
+         * AND we already have ef accepted.  When filtering, `res_sz`
+         * counts only accepted nodes, so this naturally delays stopping
+         * until we've collected ef accepted results. */
+        if (res_sz >= ef && closest.dist > results[0].dist) break;
+
+        int64_t M_max;
+        int64_t* nb = layer_neighbors(layer, closest.id, &M_max);
+        if (!nb) continue;
+
+        for (int64_t i = 0; i < M_max; i++) {
+            int64_t nid = nb[i];
+            if (nid < 0) break;
+            if (visited_test(&vis, nid)) continue;
+            visited_set(&vis, nid);
+
+            double d = hnsw_dist(idx, query, idx->vectors + nid * idx->dim);
+
+            /* Candidate-queue gate.  Unfiltered: only push if the neighbour
+             * could improve the top-ef.  Filtered: always push so rejected
+             * nodes remain pathways to accepted descendants. */
+            bool should_explore = accept != NULL ||
+                                  res_sz < ef ||
+                                  d < results[0].dist;
+            if (should_explore && cand_sz < cand_cap) {
+                candidates[cand_sz] = (hnsw_cand_t){ nid, d };
+                heap_sift_up(candidates, cand_sz);
+                cand_sz++;
+            }
+
+            /* Result gate: only accepted nodes enter the top-K. */
+            if (accept && !accept(nid, accept_ctx)) continue;
+
+            if (res_sz < ef) {
+                results[res_sz] = (hnsw_cand_t){ nid, d };
+                maxheap_sift_up(results, res_sz);
+                res_sz++;
+            } else if (d < results[0].dist) {
+                /* Replace the current farthest result. */
+                results[0] = (hnsw_cand_t){ nid, d };
+                maxheap_sift_down(results, res_sz, 0);
+            }
+        }
+    }
+
+    ray_sys_free(candidates);
+    visited_free(&vis);
+
+    /* Sort results by distance ascending (insertion sort, ef is small) */
+    for (int64_t i = 1; i < res_sz; i++) {
+        hnsw_cand_t key = results[i];
+        int64_t j = i - 1;
+        while (j >= 0 && results[j].dist > key.dist) {
+            results[j + 1] = results[j];
+            j--;
+        }
+        results[j + 1] = key;
+    }
+
+    return res_sz;
+}
+
+/* --------------------------------------------------------------------------
+ * Greedy closest: find single nearest neighbor in a layer (used during descent)
+ * -------------------------------------------------------------------------- */
+
+/* Greedy walk on one layer: starting from `ep`, repeatedly scan the
+ * current node's neighbour row and move to the closest neighbour found,
+ * until a scan brings no improvement or the current node has no row in
+ * this layer.  Returns the global id where the walk stopped. */
+static int64_t hnsw_greedy_closest(const ray_hnsw_t* idx, const float* query,
+                                     int64_t ep, int32_t layer_idx) {
+    const ray_hnsw_layer_t* layer = &idx->layers[layer_idx];
+    int64_t cur = ep;
+    double cur_dist = hnsw_dist(idx, query, idx->vectors + cur * idx->dim);
+
+    for (;;) {
+        int64_t width;
+        const int64_t* row = layer_neighbors(layer, cur, &width);
+        if (row == NULL) break;
+
+        /* Scan the whole row of `cur`, tracking the best improvement. */
+        int64_t next = cur;
+        for (int64_t k = 0; k < width; k++) {
+            const int64_t nid = row[k];
+            if (nid < 0) break;
+            const double d = hnsw_dist(idx, query, idx->vectors + nid * idx->dim);
+            if (d < cur_dist) {
+                cur_dist = d;
+                next = nid;
+            }
+        }
+
+        if (next == cur) break;  /* local minimum reached */
+        cur = next;
+    }
+    return cur;
+}
+
+/* --------------------------------------------------------------------------
+ * Neighbor pruning: keep M closest neighbors (simple selection)
+ * -------------------------------------------------------------------------- */
+
+/* Shrink the neighbour row `nb` of `node_id` to its M_keep closest
+ * entries (by hnsw_dist to node_id), stored in ascending-distance order;
+ * the remaining slots are reset to -1.
+ *
+ * No-op when the row already holds M_keep entries or fewer, or when the
+ * scratch allocation fails. */
+static void prune_neighbors(const ray_hnsw_t* idx, int64_t node_id,
+                              int64_t* nb, int64_t M_max, int64_t M_keep) {
+    const int64_t n_used = count_neighbors(nb, M_max);
+    if (n_used <= M_keep) return;
+
+    const float* origin = idx->vectors + node_id * idx->dim;
+    hnsw_cand_t* order = (hnsw_cand_t*)ray_sys_alloc((size_t)n_used * sizeof(hnsw_cand_t));
+    if (order == NULL) return;
+
+    /* Measure every current neighbour. */
+    for (int64_t i = 0; i < n_used; i++) {
+        order[i].id = nb[i];
+        order[i].dist = hnsw_dist(idx, origin, idx->vectors + nb[i] * idx->dim);
+    }
+
+    /* Stable insertion sort, closest first. */
+    for (int64_t i = 1; i < n_used; i++) {
+        const hnsw_cand_t item = order[i];
+        int64_t pos = i;
+        while (pos > 0 && order[pos - 1].dist > item.dist) {
+            order[pos] = order[pos - 1];
+            pos--;
+        }
+        order[pos] = item;
+    }
+
+    /* Rewrite the row: survivors first, then empty markers. */
+    for (int64_t i = 0; i < M_max; i++) {
+        if (i < M_keep) {
+            nb[i] = order[i].id;
+        } else {
+            nb[i] = -1;
+        }
+    }
+
+    ray_sys_free(order);
+}
+
+/* --------------------------------------------------------------------------
+ * HNSW Build (Algorithm 1 from HNSW paper)
+ * -------------------------------------------------------------------------- */
+
+/* Add the back-edge node_id -> new_id to the neighbour row `nb`.
+ *
+ * If the row has a free slot (or already contains new_id) nothing else
+ * happens.  If it is full, the farthest existing neighbour is evicted, but
+ * only when new_id is strictly closer to node_id than at least one
+ * current neighbour; otherwise the row is left untouched. */
+static void hnsw_link_back(const ray_hnsw_t* idx, int64_t node_id,
+                           int64_t* nb, int64_t M_max, int64_t new_id) {
+    if (add_neighbor(nb, M_max, new_id)) return;
+    if (M_max <= 0) return;
+
+    /* Row is full, so every slot holds a valid id. */
+    const float* base = idx->vectors + node_id * idx->dim;
+    double d_new = hnsw_dist(idx, base, idx->vectors + new_id * idx->dim);
+    bool closer = false;
+    for (int64_t k = 0; k < M_max; k++) {
+        double d = hnsw_dist(idx, base, idx->vectors + nb[k] * idx->dim);
+        if (d > d_new) { closer = true; break; }
+    }
+    if (!closer) return;
+
+    /* Drop the single farthest neighbour, then take the freed slot.  If
+     * the prune cannot allocate, the row stays full and the add fails
+     * harmlessly. */
+    prune_neighbors(idx, node_id, nb, M_max, M_max - 1);
+    add_neighbor(nb, M_max, new_id);
+}
+
+/* Build an HNSW index over `n_nodes` vectors of `dim` floats each
+ * (row-major in `vectors`).
+ *
+ *   metric           distance metric; out-of-range values fall back to COSINE
+ *   M                max neighbours per node on upper layers (layer 0 uses
+ *                    2 * M); <= 0 selects HNSW_DEFAULT_M
+ *   ef_construction  beam width used while inserting; <= 0 selects
+ *                    HNSW_DEFAULT_EF_C
+ *
+ * The index copies the vectors and owns the copy.  Returns NULL on invalid
+ * arguments or allocation failure; release the result with ray_hnsw_free(). */
+ray_hnsw_t* ray_hnsw_build(const float* vectors, int64_t n_nodes, int32_t dim,
+                           ray_hnsw_metric_t metric,
+                           int32_t M, int32_t ef_construction) {
+    if (!vectors || n_nodes <= 0 || dim <= 0) return NULL;
+    if (M <= 0) M = HNSW_DEFAULT_M;
+    if (ef_construction <= 0) ef_construction = HNSW_DEFAULT_EF_C;
+    if (metric < RAY_HNSW_COSINE || metric > RAY_HNSW_IP) metric = RAY_HNSW_COSINE;
+
+    ray_hnsw_t* idx = (ray_hnsw_t*)ray_sys_alloc(sizeof(ray_hnsw_t));
+    if (!idx) return NULL;
+    memset(idx, 0, sizeof(ray_hnsw_t));
+
+    idx->n_nodes = n_nodes;
+    idx->dim = dim;
+    idx->M = M;
+    idx->M_max0 = 2 * M;
+    idx->ef_construction = ef_construction;
+    idx->metric = (int32_t)metric;
+    idx->entry_point = 0;
+    /* Copy vectors so the index owns its data — prevents use-after-free
+     * if the caller frees the original buffer. */
+    size_t vec_bytes = (size_t)n_nodes * (size_t)dim * sizeof(float);
+    float* vec_copy = (float*)ray_sys_alloc(vec_bytes);
+    if (!vec_copy) { ray_sys_free(idx); return NULL; }
+    memcpy(vec_copy, vectors, vec_bytes);
+    idx->vectors = vec_copy;
+    idx->owns_data = true;
+
+    /* Allocate node levels */
+    idx->node_level = (int8_t*)ray_sys_alloc((size_t)n_nodes * sizeof(int8_t));
+    if (!idx->node_level) { ray_hnsw_free(idx); return NULL; }
+
+    /* Assign random levels to all nodes */
+    int32_t max_level = 0;
+    for (int64_t i = 0; i < n_nodes; i++) {
+        int32_t level = hnsw_random_level(M);
+        idx->node_level[i] = (int8_t)level;
+        if (level > max_level) max_level = level;
+    }
+    idx->n_layers = max_level + 1;
+
+    /* Allocate layers */
+    for (int32_t l = 0; l < idx->n_layers; l++) {
+        ray_hnsw_layer_t* layer = &idx->layers[l];
+
+        /* Count nodes at this layer */
+        int64_t count = 0;
+        for (int64_t i = 0; i < n_nodes; i++) {
+            if (idx->node_level[i] >= l) count++;
+        }
+        layer->n_nodes = count;
+        layer->M_max = (l == 0) ? idx->M_max0 : M;
+
+        /* Allocate neighbor array and node_ids mapping */
+        size_t nb_size = (size_t)count * (size_t)layer->M_max * sizeof(int64_t);
+        layer->neighbors = (int64_t*)ray_sys_alloc(nb_size);
+        layer->node_ids  = (int64_t*)ray_sys_alloc((size_t)count * sizeof(int64_t));
+        if (!layer->neighbors || !layer->node_ids) {
+            ray_hnsw_free(idx);
+            return NULL;
+        }
+
+        /* Initialize neighbors to -1 (empty): all-ones bytes */
+        memset(layer->neighbors, 0xFF, nb_size);
+
+        /* Fill node_ids mapping (ascending global id) */
+        int64_t j = 0;
+        for (int64_t i = 0; i < n_nodes; i++) {
+            if (idx->node_level[i] >= l) {
+                layer->node_ids[j++] = i;
+            }
+        }
+    }
+
+    /* Temp buffer for search results during construction */
+    int64_t max_ef = ef_construction > idx->M_max0 ? ef_construction : idx->M_max0;
+    hnsw_cand_t* search_buf = (hnsw_cand_t*)ray_sys_alloc((size_t)(max_ef + 1) * sizeof(hnsw_cand_t));
+    if (!search_buf) { ray_hnsw_free(idx); return NULL; }
+
+    /* Insert nodes one by one (node 0 is the initial entry point) */
+    for (int64_t i = 1; i < n_nodes; i++) {
+        const float* vec = vectors + i * dim;
+        int32_t node_level = idx->node_level[i];
+
+        /* Phase 1: Greedy descent from top layer to node_level+1 */
+        int64_t ep = idx->entry_point;
+        for (int32_t l = idx->n_layers - 1; l > node_level; l--) {
+            ep = hnsw_greedy_closest(idx, vec, ep, l);
+        }
+
+        /* Phase 2: Insert into layers [node_level ... 0] */
+        for (int32_t l = node_level; l >= 0; l--) {
+            ray_hnsw_layer_t* layer = &idx->layers[l];
+            int64_t M_max_l = layer->M_max;
+            int64_t M_keep = (l == 0) ? idx->M_max0 : M;
+
+            /* Search for ef_construction nearest neighbors at this layer */
+            int64_t n_found = hnsw_search_layer(idx, vec, &ep, 1, l,
+                                                  ef_construction, search_buf,
+                                                  NULL, NULL);
+            if (n_found < 0) {
+                /* Allocation failed inside the beam — abort the build
+                 * rather than producing a half-connected index. */
+                ray_sys_free(search_buf);
+                ray_hnsw_free(idx);
+                return NULL;
+            }
+
+            /* Connect node i to the M nearest found */
+            int64_t local_i = layer_local_idx(layer, i);
+            if (local_i < 0) continue;
+
+            int64_t* my_nb = &layer->neighbors[local_i * M_max_l];
+            int64_t n_connect = (n_found < M_keep) ? n_found : M_keep;
+            for (int64_t j = 0; j < n_connect; j++) {
+                my_nb[j] = search_buf[j].id;
+            }
+
+            /* Add bidirectional edges: each neighbor also gets i.  A full
+             * row gives up its farthest entry when i is closer; the old
+             * code "pruned" to the row's full capacity, which removed
+             * nothing and silently lost the back-edge. */
+            for (int64_t j = 0; j < n_connect; j++) {
+                int64_t nb_id = search_buf[j].id;
+                int64_t nb_local = layer_local_idx(layer, nb_id);
+                if (nb_local < 0) continue;
+
+                int64_t* their_nb = &layer->neighbors[nb_local * M_max_l];
+                hnsw_link_back(idx, nb_id, their_nb, M_max_l, i);
+            }
+
+            /* Update ep for next lower layer */
+            if (n_found > 0) ep = search_buf[0].id;
+        }
+
+        /* Update entry point if this node has higher level */
+        if (node_level > idx->node_level[idx->entry_point]) {
+            idx->entry_point = i;
+        }
+    }
+
+    ray_sys_free(search_buf);
+    return idx;
+}
+
+/* --------------------------------------------------------------------------
+ * Free
+ * -------------------------------------------------------------------------- */
+
+/* Destroy an index: frees each layer's neighbour and node-id arrays, the
+ * level table, the vector storage (only when the index owns it), and the
+ * index struct itself.  NULL is accepted and ignored. */
+void ray_hnsw_free(ray_hnsw_t* idx) {
+    if (idx == NULL) return;
+
+    for (int32_t l = 0; l < idx->n_layers; l++) {
+        ray_hnsw_layer_t* layer = &idx->layers[l];
+        if (layer->neighbors != NULL) ray_sys_free(layer->neighbors);
+        if (layer->node_ids != NULL) ray_sys_free(layer->node_ids);
+    }
+
+    if (idx->node_level != NULL) ray_sys_free(idx->node_level);
+    if (idx->owns_data && idx->vectors != NULL) {
+        ray_sys_free((void*)idx->vectors);
+    }
+    ray_sys_free(idx);
+}
+
+/* Allocate n_bytes and copy them from src; returns NULL on allocation
+ * failure. */
+static void* hnsw_clone_bytes(const void* src, size_t n_bytes) {
+    void* copy = ray_sys_alloc(n_bytes);
+    if (copy != NULL) memcpy(copy, src, n_bytes);
+    return copy;
+}
+
+/* Deep-copy an index.  The clone owns private copies of the level table,
+ * the vectors and every layer's arrays (owns_data is always true), so it
+ * stays valid after `src` is freed.  Returns NULL if `src` is NULL or any
+ * allocation fails; a partially built clone is released first. */
+ray_hnsw_t* ray_hnsw_clone(const ray_hnsw_t* src) {
+    if (src == NULL) return NULL;
+
+    ray_hnsw_t* dst = (ray_hnsw_t*)ray_sys_alloc(sizeof(ray_hnsw_t));
+    if (dst == NULL) return NULL;
+    memset(dst, 0, sizeof(ray_hnsw_t));
+
+    /* Plain scalar fields.  n_layers is set up front so ray_hnsw_free()
+     * walks the (zero-initialised) layer table correctly on failure. */
+    dst->owns_data       = true;
+    dst->entry_point     = src->entry_point;
+    dst->metric          = src->metric;
+    dst->ef_construction = src->ef_construction;
+    dst->M_max0          = src->M_max0;
+    dst->M               = src->M;
+    dst->n_layers        = src->n_layers;
+    dst->dim             = src->dim;
+    dst->n_nodes         = src->n_nodes;
+
+    const bool has_nodes = src->n_nodes > 0;
+
+    /* Per-node level table. */
+    if (has_nodes && src->node_level) {
+        const size_t level_bytes = (size_t)src->n_nodes * sizeof(int8_t);
+        dst->node_level = (int8_t*)hnsw_clone_bytes(src->node_level, level_bytes);
+        if (dst->node_level == NULL) goto fail;
+    }
+
+    /* Vector storage. */
+    if (has_nodes && src->dim > 0 && src->vectors) {
+        const size_t vec_bytes = (size_t)src->n_nodes * (size_t)src->dim * sizeof(float);
+        float* vec_copy = (float*)hnsw_clone_bytes(src->vectors, vec_bytes);
+        if (vec_copy == NULL) goto fail;
+        dst->vectors = vec_copy;
+    }
+
+    /* Layer tables: neighbour rows, then the node-id mapping. */
+    for (int32_t l = 0; l < src->n_layers; l++) {
+        const ray_hnsw_layer_t* from = &src->layers[l];
+        ray_hnsw_layer_t*       to   = &dst->layers[l];
+        to->n_nodes = from->n_nodes;
+        to->M_max   = from->M_max;
+        if (from->n_nodes <= 0) continue;
+
+        if (from->M_max > 0 && from->neighbors) {
+            const size_t nb_bytes = (size_t)from->n_nodes * (size_t)from->M_max * sizeof(int64_t);
+            to->neighbors = (int64_t*)hnsw_clone_bytes(from->neighbors, nb_bytes);
+            if (to->neighbors == NULL) goto fail;
+        }
+        if (from->node_ids) {
+            const size_t id_bytes = (size_t)from->n_nodes * sizeof(int64_t);
+            to->node_ids = (int64_t*)hnsw_clone_bytes(from->node_ids, id_bytes);
+            if (to->node_ids == NULL) goto fail;
+        }
+    }
+
+    return dst;
+
+fail:
+    ray_hnsw_free(dst);
+    return NULL;
+}
+
+/* --------------------------------------------------------------------------
+ * Search: find K approximate nearest neighbors
+ * -------------------------------------------------------------------------- */
+
+/* Find up to `k` approximate nearest neighbours of `query`.
+ *
+ *   dim        must equal the index dimension
+ *   ef_search  beam width on layer 0; raised to k when smaller
+ *   out_ids    receives the ids, closest first (at least k entries)
+ *   out_dists  receives the matching distances (at least k entries)
+ *
+ * Returns the number of results written (possibly fewer than k), 0 for
+ * invalid arguments (including NULL output buffers) or an empty index,
+ * and -1 on allocation failure. */
+int64_t ray_hnsw_search(const ray_hnsw_t* idx,
+                         const float* query, int32_t dim,
+                         int64_t k, int32_t ef_search,
+                         int64_t* out_ids, double* out_dists) {
+    if (!idx || !query || dim != idx->dim || k <= 0) return 0;
+    if (!out_ids || !out_dists) return 0;
+    if (idx->n_nodes <= 0) return 0;
+
+    /* The index cannot return more than n_nodes hits, so a larger k only
+     * inflates the result buffer.  Clamp it, and saturate rather than
+     * truncate when narrowing to the 32-bit beam width. */
+    if (k > idx->n_nodes) k = idx->n_nodes;
+    if (ef_search < k) ef_search = (k > INT32_MAX) ? INT32_MAX : (int32_t)k;
+
+    /* Phase 1: Greedy descent from top layer to layer 1 */
+    int64_t ep = idx->entry_point;
+    for (int32_t l = idx->n_layers - 1; l >= 1; l--) {
+        ep = hnsw_greedy_closest(idx, query, ep, l);
+    }
+
+    /* Phase 2: Beam search on layer 0 with ef_search width */
+    hnsw_cand_t* results = (hnsw_cand_t*)ray_sys_alloc(
+        (size_t)ef_search * sizeof(hnsw_cand_t));
+    if (!results) return -1;  /* OOM — caller must propagate error. */
+
+    int64_t n_found = hnsw_search_layer(idx, query, &ep, 1, 0, ef_search, results,
+                                         NULL, NULL);
+    if (n_found < 0) {
+        ray_sys_free(results);
+        return -1;  /* OOM — caller must propagate error. */
+    }
+
+    /* Extract top-K from results (already sorted by distance ascending) */
+    int64_t result_count = (n_found < k) ? n_found : k;
+    for (int64_t i = 0; i < result_count; i++) {
+        out_ids[i]   = results[i].id;
+        out_dists[i] = results[i].dist;
+    }
+
+    ray_sys_free(results);
+    return result_count;
+}
+
+/* --------------------------------------------------------------------------
+ * Filtered iterative-scan search: only returns nodes passing `accept(node_id, ctx)`.
+ *
+ * The beam search explores the graph normally (including through rejected
+ * nodes, preserving connectivity to accepted descendants); only accepted
+ * nodes enter the result heap.  Falls back to a full-graph walk for
+ * pathologically selective filters — bounded by n_nodes memory.
+ * -------------------------------------------------------------------------- */
+
+/* Filtered variant of ray_hnsw_search(): only nodes for which
+ * accept(node_id, ctx) is true are returned.
+ *
+ *   dim        must equal the index dimension
+ *   ef_search  beam width on layer 0; raised to k when smaller
+ *   accept     predicate; NULL delegates to the unfiltered search
+ *   out_ids    receives the ids, closest first (at least k entries)
+ *   out_dists  receives the matching distances (at least k entries)
+ *
+ * Returns the number of results written, 0 for invalid arguments
+ * (including NULL output buffers) or an empty index, and -1 on
+ * allocation failure. */
+int64_t ray_hnsw_search_filter(const ray_hnsw_t* idx,
+                               const float* query, int32_t dim,
+                               int64_t k, int32_t ef_search,
+                               ray_hnsw_accept_fn accept, void* ctx,
+                               int64_t* out_ids, double* out_dists) {
+    if (!idx || !query || dim != idx->dim || k <= 0) return 0;
+    if (!out_ids || !out_dists) return 0;
+    if (!accept) {
+        /* No predicate — fall through to the plain search so callers get
+         * zero overhead. */
+        return ray_hnsw_search(idx, query, dim, k, ef_search, out_ids, out_dists);
+    }
+    if (idx->n_nodes <= 0) return 0;
+
+    /* At most n_nodes hits exist; clamp k and saturate (not truncate)
+     * when narrowing it to the 32-bit beam width. */
+    if (k > idx->n_nodes) k = idx->n_nodes;
+    if (ef_search < k) ef_search = (k > INT32_MAX) ? INT32_MAX : (int32_t)k;
+
+    /* Descent through upper layers is filter-unaware — we only use those
+     * layers to pick the layer-0 entry point.  The filter applies on
+     * layer 0 where the result set is collected. */
+    int64_t ep = idx->entry_point;
+    for (int32_t l = idx->n_layers - 1; l >= 1; l--) {
+        ep = hnsw_greedy_closest(idx, query, ep, l);
+    }
+
+    hnsw_cand_t* results = (hnsw_cand_t*)ray_sys_alloc(
+        (size_t)ef_search * sizeof(hnsw_cand_t));
+    if (!results) return -1;  /* OOM */
+
+    int64_t n_found = hnsw_search_layer(idx, query, &ep, 1, 0, ef_search, results,
+                                         accept, ctx);
+    if (n_found < 0) {
+        ray_sys_free(results);
+        return -1;
+    }
+
+    /* Results arrive sorted by ascending distance; copy the first k. */
+    int64_t result_count = (n_found < k) ? n_found : k;
+    for (int64_t i = 0; i < result_count; i++) {
+        out_ids[i]   = results[i].id;
+        out_dists[i] = results[i].dist;
+    }
+
+    ray_sys_free(results);
+    return result_count;
+}
+
+/* --------------------------------------------------------------------------
+ * Accessors
+ * -------------------------------------------------------------------------- */
+
+/* Vector dimension of the index, or 0 when `idx` is NULL. */
+int32_t ray_hnsw_dim(const ray_hnsw_t* idx) {
+    if (idx == NULL) return 0;
+    return idx->dim;
+}
+
+/* --------------------------------------------------------------------------
+ * Persistence: save/load/mmap
+ *
+ * File layout in directory:
+ *   hnsw_header.bin  — fixed-size header
+ *   hnsw_levels.bin  — node_level[n_nodes]
+ *   hnsw_layer_N.bin — per-layer: neighbors + node_ids
+ * -------------------------------------------------------------------------- */
+
+/* On-disk header stored in hnsw_header.bin.
+ * ray_hnsw_save() writes this struct verbatim with a single fwrite(), so
+ * the file format is the in-memory layout in host byte order: field
+ * order, types and sizes must not change without a format migration.
+ * NOTE(review): 8 + 6*4 + 8 = 40 bytes, presumably with no padding on
+ * common ABIs — confirm before relying on cross-platform files. */
+typedef struct {
+    int64_t n_nodes;          /* number of indexed vectors */
+    int32_t dim;              /* floats per vector */
+    int32_t n_layers;         /* number of graph layers */
+    int32_t M;                /* neighbours per node on upper layers */
+    int32_t M_max0;           /* neighbours per node on layer 0 */
+    int32_t ef_construction;  /* beam width used at build time */
+    int32_t metric;        /* ray_hnsw_metric_t (was _pad; old files saved 0 = COSINE) */
+    int64_t entry_point;      /* global id of the search entry node */
+} hnsw_file_header_t;
+
+ray_err_t ray_hnsw_save(const ray_hnsw_t* idx, const char* dir) {
+    /* Writes header, levels, one file per layer, and vectors under dir; returns RAY_OK or RAY_ERR_IO. */
+    if (!idx || !dir) return RAY_ERR_IO;
+    if (mkdir(dir, 0755) != 0 && errno != EEXIST) return RAY_ERR_IO;  /* an existing directory is reused */
+
+    char path[1024];
+    FILE* f;
+
+    /* Write header.  A path that does not fit in path[] is an error, never silently truncated. */
+    if ((size_t)snprintf(path, sizeof(path), "%s/hnsw_header.bin", dir) >= sizeof(path)) return RAY_ERR_IO;
+    f = fopen(path, "wb");
+    if (!f) return RAY_ERR_IO;
+    hnsw_file_header_t hdr = {
+        .n_nodes = idx->n_nodes,
+        .dim = idx->dim,
+        .n_layers = idx->n_layers,
+        .M = idx->M,
+        .M_max0 = idx->M_max0,
+        .ef_construction = idx->ef_construction,
+        .metric = idx->metric,
+        .entry_point = idx->entry_point
+    };
+    if (fwrite(&hdr, sizeof(hdr), 1, f) != 1) { fclose(f); return RAY_ERR_IO; }
+    if (fclose(f) != 0) return RAY_ERR_IO;  /* buffered bytes are flushed here; a failed flush is a failed save */
+
+    /* Write node levels */
+    if ((size_t)snprintf(path, sizeof(path), "%s/hnsw_levels.bin", dir) >= sizeof(path)) return RAY_ERR_IO;
+    f = fopen(path, "wb");
+    if (!f) return RAY_ERR_IO;
+    if (fwrite(idx->node_level, sizeof(int8_t), (size_t)idx->n_nodes, f) !=
+        (size_t)idx->n_nodes) {
+        fclose(f); return RAY_ERR_IO;
+    }
+    if (fclose(f) != 0) return RAY_ERR_IO;
+
+    /* Write each layer */
+    for (int32_t l = 0; l < idx->n_layers; l++) {
+        const ray_hnsw_layer_t* layer = &idx->layers[l];
+        if ((size_t)snprintf(path, sizeof(path), "%s/hnsw_layer_%d.bin", dir, l) >= sizeof(path)) return RAY_ERR_IO;
+        f = fopen(path, "wb");
+        if (!f) return RAY_ERR_IO;
+
+        /* Write layer metadata: n_nodes, M_max */
+        if (fwrite(&layer->n_nodes, sizeof(int64_t), 1, f) != 1) { fclose(f); return RAY_ERR_IO; }
+        if (fwrite(&layer->M_max, sizeof(int64_t), 1, f) != 1) { fclose(f); return RAY_ERR_IO; }
+
+        /* Write neighbors */
+        size_t nb_count = (size_t)layer->n_nodes * (size_t)layer->M_max;
+        if (nb_count > 0) {
+            if (fwrite(layer->neighbors, sizeof(int64_t), nb_count, f) != nb_count) {
+                fclose(f); return RAY_ERR_IO;
+            }
+        }
+
+        /* Write node_ids */
+        if (layer->n_nodes > 0) {
+            if (fwrite(layer->node_ids, sizeof(int64_t), (size_t)layer->n_nodes, f) !=
+                (size_t)layer->n_nodes) {
+                fclose(f); return RAY_ERR_IO;
+            }
+        }
+
+        if (fclose(f) != 0) return RAY_ERR_IO;
+    }
+
+    /* Write vectors */
+    if ((size_t)snprintf(path, sizeof(path), "%s/hnsw_vectors.bin", dir) >= sizeof(path)) return RAY_ERR_IO;
+    f = fopen(path, "wb");
+    if (!f) return RAY_ERR_IO;
+    size_t vec_count = (size_t)idx->n_nodes * (size_t)idx->dim;
+    if (vec_count > 0) {
+        if (fwrite(idx->vectors, sizeof(float), vec_count, f) != vec_count) {
+            fclose(f); return RAY_ERR_IO;
+        }
+    }
+    if (fclose(f) != 0) return RAY_ERR_IO;
+
+    return RAY_OK;
+}
+
+static ray_hnsw_t* hnsw_load_impl(const char* dir, bool use_mmap) {
+    /* Rebuilds an index from the hnsw_*.bin files under dir; NULL on any I/O, validation, or allocation failure. */
+    if (!dir) return NULL;
+    (void)use_mmap; /* mmap optimization deferred — both paths read into memory */
+    char path[1024];
+    FILE* f;
+
+    /* Read header */
+    snprintf(path, sizeof(path), "%s/hnsw_header.bin", dir);
+    f = fopen(path, "rb");
+    if (!f) return NULL;
+    hnsw_file_header_t hdr;
+    if (fread(&hdr, sizeof(hdr), 1, f) != 1) { fclose(f); return NULL; }
+    fclose(f);
+
+    if (hdr.n_nodes <= 0 || hdr.dim <= 0 || hdr.n_layers <= 0 ||
+        hdr.n_layers > HNSW_MAX_LAYERS || hdr.M <= 0 || hdr.M_max0 <= 0 ||
+        hdr.entry_point < 0 || hdr.entry_point >= hdr.n_nodes) return NULL;
+    /* n_nodes * dim * sizeof(float) must fit in size_t before any buffer is sized from the header. */
+    if ((uint64_t)hdr.n_nodes > SIZE_MAX / sizeof(float) / (uint64_t)hdr.dim) return NULL;
+    ray_hnsw_t* idx = (ray_hnsw_t*)ray_sys_alloc(sizeof(ray_hnsw_t));
+    if (!idx) return NULL;
+    memset(idx, 0, sizeof(ray_hnsw_t));
+
+    idx->n_nodes = hdr.n_nodes;
+    idx->dim = hdr.dim;
+    idx->n_layers = hdr.n_layers;
+    idx->M = hdr.M;
+    idx->M_max0 = hdr.M_max0;
+    idx->ef_construction = hdr.ef_construction;
+    idx->metric = (hdr.metric >= RAY_HNSW_COSINE && hdr.metric <= RAY_HNSW_IP)
+                  ? hdr.metric : RAY_HNSW_COSINE;
+    idx->entry_point = hdr.entry_point;
+    idx->vectors = NULL;
+    idx->owns_data = true;
+
+    /* Read node levels */
+    snprintf(path, sizeof(path), "%s/hnsw_levels.bin", dir);
+    f = fopen(path, "rb");
+    if (!f) { ray_hnsw_free(idx); return NULL; }
+    idx->node_level = (int8_t*)ray_sys_alloc((size_t)hdr.n_nodes * sizeof(int8_t));
+    if (!idx->node_level) { fclose(f); ray_hnsw_free(idx); return NULL; }
+    if (fread(idx->node_level, sizeof(int8_t), (size_t)hdr.n_nodes, f) !=
+        (size_t)hdr.n_nodes) {
+        fclose(f); ray_hnsw_free(idx); return NULL;
+    }
+    fclose(f);
+
+    /* Read each layer */
+    for (int32_t l = 0; l < hdr.n_layers; l++) {
+        ray_hnsw_layer_t* layer = &idx->layers[l];
+        snprintf(path, sizeof(path), "%s/hnsw_layer_%d.bin", dir, l);
+        f = fopen(path, "rb");
+        if (!f) { ray_hnsw_free(idx); return NULL; }
+
+        /* Read layer metadata */
+        if (fread(&layer->n_nodes, sizeof(int64_t), 1, f) != 1) { fclose(f); ray_hnsw_free(idx); return NULL; }
+        if (fread(&layer->M_max, sizeof(int64_t), 1, f) != 1) { fclose(f); ray_hnsw_free(idx); return NULL; }
+
+        /* Validate layer metadata against header */
+        if (layer->n_nodes <= 0 || layer->n_nodes > hdr.n_nodes) { fclose(f); ray_hnsw_free(idx); return NULL; }
+        if (layer->M_max <= 0 || layer->M_max > 4096) { fclose(f); ray_hnsw_free(idx); return NULL; }
+        if ((uint64_t)layer->n_nodes > SIZE_MAX / sizeof(int64_t) / (uint64_t)layer->M_max) {
+            fclose(f); ray_hnsw_free(idx); return NULL;
+        }
+
+        /* Allocate and read neighbors */
+        size_t nb_count = (size_t)layer->n_nodes * (size_t)layer->M_max;
+        if (nb_count > 0) {
+            layer->neighbors = (int64_t*)ray_sys_alloc(nb_count * sizeof(int64_t));
+            if (!layer->neighbors) { fclose(f); ray_hnsw_free(idx); return NULL; }
+            if (fread(layer->neighbors, sizeof(int64_t), nb_count, f) != nb_count) {
+                fclose(f); ray_hnsw_free(idx); return NULL;
+            }
+        }
+
+        /* Allocate and read node_ids */
+        if (layer->n_nodes > 0) {
+            layer->node_ids = (int64_t*)ray_sys_alloc((size_t)layer->n_nodes * sizeof(int64_t));
+            if (!layer->node_ids) { fclose(f); ray_hnsw_free(idx); return NULL; }
+            if (fread(layer->node_ids, sizeof(int64_t), (size_t)layer->n_nodes, f) !=
+                (size_t)layer->n_nodes) {
+                fclose(f); ray_hnsw_free(idx); return NULL;
+            }
+        }
+
+        fclose(f);
+    }
+
+    /* Read vectors */
+    snprintf(path, sizeof(path), "%s/hnsw_vectors.bin", dir);
+    f = fopen(path, "rb");
+    if (!f) { ray_hnsw_free(idx); return NULL; }
+    size_t vec_count = (size_t)hdr.n_nodes * (size_t)hdr.dim;
+    if (vec_count > 0) {
+        float* vecs = (float*)ray_sys_alloc(vec_count * sizeof(float));
+        if (!vecs) { fclose(f); ray_hnsw_free(idx); return NULL; }
+        if (fread(vecs, sizeof(float), vec_count, f) != vec_count) {
+            fclose(f); ray_sys_free(vecs); ray_hnsw_free(idx); return NULL;
+        }
+        idx->vectors = vecs;
+    }
+    fclose(f);
+
+    return idx;
+}
+
+ray_hnsw_t* ray_hnsw_load(const char* dir) {
+    return hnsw_load_impl(dir, false);  /* NULL on any read, validation, or allocation failure */
+}
+
+ray_hnsw_t* ray_hnsw_mmap(const char* dir) {
+    return hnsw_load_impl(dir, true);   /* flag is currently ignored: data is read into memory, same as ray_hnsw_load */
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/hnsw.h b/crates/rayforce-sys/vendor/rayforce/src/store/hnsw.h
new file mode 100644
index 0000000..055a3c7
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/store/hnsw.h
@@ -0,0 +1,133 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_HNSW_H
+#define RAY_HNSW_H
+
+#include <rayforce.h>
+
+/* ---------- HNSW Index ----------
+ *
+ * Multi-layer proximity graph for approximate nearest neighbor search.
+ *
+ * Memory layout per node:
+ *   - Layer 0: up to M_max0 neighbors (default 2*M)
+ *   - Layers 1+: up to M neighbors each
+ *
+ * Neighbor lists stored as flat arrays:
+ *   neighbors[node * M_max + i] = neighbor_id  (or -1 if unused)
+ *
+ * Each layer stores its own neighbor array for all nodes at that layer.
+ */
+
+#define HNSW_MAX_LAYERS    16
+#define HNSW_DEFAULT_M     16
+#define HNSW_DEFAULT_EF_C  200
+#define HNSW_DEFAULT_EF_S  50
+
+/* Distance metric driving beam search.  HNSW requires lower-is-closer;
+ * we choose the encoding so each metric sorts ascending:
+ *   COSINE → 1 - cos(a, b)     range [0, 2]
+ *   L2     → sqrt(sum(sq diff)) range [0, ∞)
+ *   IP     → -dot(a, b)         range (-∞, ∞)   (negated so lower=closer) */
+typedef enum {
+    RAY_HNSW_COSINE = 0,
+    RAY_HNSW_L2     = 1,
+    RAY_HNSW_IP     = 2
+} ray_hnsw_metric_t;
+
+typedef struct ray_hnsw_layer {
+    int64_t*  neighbors;     /* flat array: n_nodes_in_layer * M_max entries */
+    int64_t   n_nodes;       /* number of nodes in this layer */
+    int64_t   M_max;         /* max neighbors per node in this layer */
+    int64_t*  node_ids;      /* mapping: layer_idx -> global node id */
+} ray_hnsw_layer_t;
+
+typedef struct ray_hnsw {
+    int64_t          n_nodes;         /* total number of vectors */
+    int32_t          dim;             /* embedding dimension */
+    int32_t          n_layers;        /* number of layers (including layer 0) */
+    int32_t          M;               /* max neighbors per node (layers 1+) */
+    int32_t          M_max0;          /* max neighbors per node (layer 0) */
+    int32_t          ef_construction;  /* beam width during construction */
+    int32_t          metric;          /* ray_hnsw_metric_t */
+    int64_t          entry_point;     /* entry point node (highest layer) */
+    int8_t*          node_level;      /* max layer for each node (n_nodes entries) */
+    ray_hnsw_layer_t  layers[HNSW_MAX_LAYERS];
+    const float*     vectors;         /* embedding data: not owned when built; the disk load path stores a heap copy here */
+    bool             owns_data;       /* true if loaded from disk (owns neighbor arrays etc.) */
+} ray_hnsw_t;
+
+/* --- Build / Free / Clone --- */
+ray_hnsw_t* ray_hnsw_build(const float* vectors, int64_t n_nodes, int32_t dim,
+                           ray_hnsw_metric_t metric,
+                           int32_t M, int32_t ef_construction);
+void ray_hnsw_free(ray_hnsw_t* idx);
+/* Deep-copy an index: duplicates vectors, node levels, and every layer's
+ * neighbor + node_id arrays.  Returns a new fully-owned index with the
+ * same semantics as the source.  Returns NULL on OOM. */
+ray_hnsw_t* ray_hnsw_clone(const ray_hnsw_t* src);
+
+/* --- Search --- */
+/* Returns top-K nearest neighbors as (node_id, distance) pairs.
+ * out_ids and out_dists must be pre-allocated with k entries.
+ *
+ * Return value:
+ *   >= 0 : number of results written (may be < k).
+ *   -1   : allocation failure (OOM) — callers must surface a distinct
+ *          error rather than treat the 0-return as "no matches".
+ */
+int64_t ray_hnsw_search(const ray_hnsw_t* idx,
+                         const float* query, int32_t dim,
+                         int64_t k, int32_t ef_search,
+                         int64_t* out_ids, double* out_dists);
+
+/* Predicate callback used by the filtered iterative-scan variant below.
+ * Return true to accept `node_id` into the result set, false to reject.
+ * Rejected nodes still participate in candidate-graph exploration so
+ * connectivity through them is preserved — this is the standard
+ * "iterative scan" shape. */
+typedef bool (*ray_hnsw_accept_fn)(int64_t node_id, void* ctx);
+
+/* Like ray_hnsw_search, but only nodes passing `accept(node_id, ctx)`
+ * enter the top-K result set.  Candidate-queue expansion still traverses
+ * rejected nodes so their accepted descendants remain reachable.
+ * Falls back to exhaustive graph exploration for pathologically selective
+ * filters (bounded by idx->n_nodes).
+ *
+ * Return value matches ray_hnsw_search: >= 0 = result count, -1 = OOM. */
+int64_t ray_hnsw_search_filter(const ray_hnsw_t* idx,
+                               const float* query, int32_t dim,
+                               int64_t k, int32_t ef_search,
+                               ray_hnsw_accept_fn accept, void* ctx,
+                               int64_t* out_ids, double* out_dists);
+
+/* --- Accessors --- */
+int32_t ray_hnsw_dim(const ray_hnsw_t* idx);
+
+/* --- Persistence --- */
+ray_err_t ray_hnsw_save(const ray_hnsw_t* idx, const char* dir);  /* writes the hnsw_*.bin files under dir */
+ray_hnsw_t* ray_hnsw_load(const char* dir);                       /* NULL on failure */
+ray_hnsw_t* ray_hnsw_mmap(const char* dir);                       /* currently reads into memory like ray_hnsw_load */
+
+#endif /* RAY_HNSW_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/journal.c b/crates/rayforce-sys/vendor/rayforce/src/store/journal.c
new file mode 100644
index 0000000..d16da06
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/store/journal.c
@@ -0,0 +1,656 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_OS_WINDOWS
+#  define _GNU_SOURCE   /* fileno(), gmtime_r() */
+#endif
+
+#include "journal.h"
+#include "fileio.h"
+#include "serde.h"
+#include "core/ipc.h"
+#include "lang/eval.h"
+#include "lang/env.h"
+#include "mem/sys.h"
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#ifndef RAY_OS_WINDOWS
+#  include <unistd.h>
+#  include <sys/stat.h>
+#endif
+
+#define RAY_JOURNAL_PATH_MAX 1024
+
+/* Module-private state.  Single-threaded by construction (the IPC
+ * dispatch loop is single-threaded for eval_payload, and replay runs
+ * from main before any worker thread spins up). */
+static struct {
+    ray_journal_mode_t mode;                            /* RAY_JOURNAL_SYNC adds fflush + fsync after every write */
+    FILE*              fp;                              /* append handle for the log; NULL while closed */
+    char               base[RAY_JOURNAL_PATH_MAX];      /* base path given to ray_journal_open */
+    char               log_path[RAY_JOURNAL_PATH_MAX];  /* base + ".log" */
+    bool               in_replay;                       /* set during snapshot load / replay; suppresses journal writes */
+} g_journal = {
+    .mode      = RAY_JOURNAL_OFF,
+    .fp        = NULL,
+    .base      = {0},
+    .log_path  = {0},
+    .in_replay = false,
+};
+
+/* ── helpers ──────────────────────────────────────────────────────── */
+
+static bool path_join_ext(char* dst, size_t dstsz, const char* base, const char* ext) {
+    int n = snprintf(dst, dstsz, "%s%s", base, ext);   /* dst = base immediately followed by ext */
+    return n > 0 && (size_t)n < dstsz;                 /* false on error, empty result, or truncation */
+}
+
+static bool file_exists(const char* path) {   /* true when path names an existing file; directories do not count */
+#ifdef RAY_OS_WINDOWS
+    DWORD attrs = GetFileAttributesA(path);
+    return attrs != INVALID_FILE_ATTRIBUTES && !(attrs & FILE_ATTRIBUTE_DIRECTORY);
+#else
+    struct stat st;
+    return stat(path, &st) == 0 && S_ISREG(st.st_mode);   /* POSIX branch accepts regular files only */
+#endif
+}
+
+/* Read fixed-size buffer in a loop — fread can short-read on signals.
+ * Returns SIZE_MAX on a real I/O error (vs. clean EOF) so the caller
+ * can distinguish "log ended cleanly" from "torn read mid-frame".  The
+ * difference matters: clean EOF after N entries means N replayed; an
+ * error mid-frame must abort with RAY_JREPLAY_IO so we don't open the
+ * log for append on top of a partially-replayed state. */
+static size_t read_full(FILE* f, void* buf, size_t want) {
+    uint8_t* p = (uint8_t*)buf;
+    size_t   got = 0;
+    while (got < want) {
+        size_t n = fread(p + got, 1, want - got, f);
+        if (n == 0) {
+            if (ferror(f)) return SIZE_MAX;   /* stream error: reported instead of a byte count */
+            break;                            /* EOF: return the (possibly short) count read so far */
+        }
+        got += n;
+    }
+    return got;
+}
+
+/* Produce the uncompressed bytes of an IPC payload.  Returns true on success,
+ * false on a bad size prefix, OOM, or a decompressed-length mismatch.
+ * COMPRESSED flag set: *out_buf is a new buffer (caller frees with ray_sys_free)
+ * and *out_owned is true.  Otherwise *out_buf aliases payload, *out_owned is false. */
+static bool decompress_if_needed(const ray_ipc_header_t* hdr,
+                                 const uint8_t* payload, int64_t payload_len,
+                                 uint8_t** out_buf, int64_t* out_len,
+                                 bool* out_owned)
+{
+    if (!(hdr->flags & RAY_IPC_FLAG_COMPRESSED)) {
+        *out_buf   = (uint8_t*)payload;
+        *out_len   = payload_len;
+        *out_owned = false;
+        return true;
+    }
+    if (payload_len < 4) return false;   /* too short to hold the 4-byte size prefix */
+    uint32_t uncomp_size;
+    memcpy(&uncomp_size, payload, 4);    /* first 4 bytes carry the uncompressed size */
+    if (uncomp_size == 0 || uncomp_size > 256u * 1024u * 1024u) return false;
+
+    uint8_t* tmp = (uint8_t*)ray_sys_alloc(uncomp_size);
+    if (!tmp) return false;
+    size_t dlen = ray_ipc_decompress(payload + 4, (size_t)payload_len - 4,
+                                     tmp, uncomp_size);
+    if (dlen != uncomp_size) { ray_sys_free(tmp); return false; }
+    *out_buf   = tmp;
+    *out_len   = (int64_t)uncomp_size;
+    *out_owned = true;
+    return true;
+}
+
+/* Evaluate a deserialized message exactly as eval_payload would: string
+ * payloads run through ray_eval_str, everything else through ray_eval. */
+static ray_t* eval_one(ray_t* msg) {
+    if (!msg || RAY_IS_ERR(msg)) return msg;   /* NULL and error values pass through unevaluated */
+    if (msg->type == -RAY_STR) {
+        const char* s = ray_str_ptr(msg);
+        size_t      n = ray_str_len(msg);
+        if (!s || n == 0) return RAY_NULL_OBJ;  /* empty string: nothing to evaluate */
+        char* tmp = (char*)ray_sys_alloc(n + 1);  /* NUL-terminated copy handed to ray_eval_str */
+        if (!tmp) return ray_error("oom", NULL);
+        memcpy(tmp, s, n);
+        tmp[n] = '\0';
+        ray_t* r = ray_eval_str(tmp);
+        ray_sys_free(tmp);
+        return r;
+    }
+    return ray_eval(msg);
+}
+
+/* ── public API ───────────────────────────────────────────────────── */
+
+bool ray_journal_is_open(void) { return g_journal.fp != NULL; }  /* true while the append handle is open */
+
+ray_err_t ray_journal_write_bytes(const ray_ipc_header_t* hdr,
+                                  const uint8_t*          payload,
+                                  int64_t                 payload_len)
+{   /* Appends one frame to the open journal.  Returns RAY_OK, RAY_ERR_DOMAIN (bad args), or RAY_ERR_IO. */
+    if (!g_journal.fp || g_journal.in_replay) return RAY_OK;   /* closed or replaying: nothing to record */
+    if (!hdr || !payload || payload_len < 0) return RAY_ERR_DOMAIN;
+    /* One frame = the raw header followed by payload_len payload bytes. */
+    if (fwrite(hdr, 1, sizeof(*hdr), g_journal.fp) != sizeof(*hdr))
+        return RAY_ERR_IO;
+    if (payload_len > 0 &&
+        fwrite(payload, 1, (size_t)payload_len, g_journal.fp) != (size_t)payload_len)
+        return RAY_ERR_IO;
+    /* SYNC mode: the frame must reach stable storage before success is reported, on every platform. */
+    if (g_journal.mode == RAY_JOURNAL_SYNC) {
+        if (fflush(g_journal.fp) != 0) return RAY_ERR_IO;
+#ifndef RAY_OS_WINDOWS
+        if (fsync(fileno(g_journal.fp)) != 0) return RAY_ERR_IO;
+#else
+        if (!FlushFileBuffers((HANDLE)_get_osfhandle(_fileno(g_journal.fp)))) return RAY_ERR_IO;
+#endif
+    }
+    return RAY_OK;
+}
+
+ray_err_t ray_journal_replay(const char*           path,
+                             int64_t*              out_chunks,
+                             int64_t*              out_eval_errors,
+                             ray_jreplay_status_t* out_status)
+{   /* Evaluates every frame of the log at path in order; the out_* pointers are optional. */
+    if (out_chunks)      *out_chunks      = 0;
+    if (out_eval_errors) *out_eval_errors = 0;
+    if (out_status)      *out_status      = RAY_JREPLAY_OK;
+
+    FILE* f = fopen(path, "rb");
+    if (!f) {
+        if (out_status) *out_status = RAY_JREPLAY_IO;
+        return RAY_ERR_IO;
+    }
+
+    bool prev_in_replay = g_journal.in_replay;
+    g_journal.in_replay = true;   /* keeps ray_journal_write_bytes from re-logging replayed frames */
+
+    int64_t chunks = 0;
+    int64_t errs   = 0;
+    ray_jreplay_status_t status = RAY_JREPLAY_OK;
+
+    for (;;) {
+        ray_ipc_header_t hdr;
+        size_t r = read_full(f, &hdr, sizeof(hdr));
+        if (r == 0) break;                          /* clean EOF */
+        if (r == SIZE_MAX)                          { status = RAY_JREPLAY_IO;      break; }
+        if (r != sizeof(hdr))                       { status = RAY_JREPLAY_BADTAIL; break; }
+        if (hdr.prefix  != RAY_SERDE_PREFIX)        { status = RAY_JREPLAY_BADTAIL; break; }
+        if (hdr.version != RAY_SERDE_WIRE_VERSION)  { status = RAY_JREPLAY_BADTAIL; break; }
+        if (hdr.size <= 0 || hdr.size > 256LL*1024*1024)
+                                                    { status = RAY_JREPLAY_BADTAIL; break; }
+
+        uint8_t* buf = (uint8_t*)ray_sys_alloc((size_t)hdr.size);
+        if (!buf)                                   { status = RAY_JREPLAY_OOM;     break; }
+        size_t pr = read_full(f, buf, (size_t)hdr.size);
+        if (pr == SIZE_MAX) { ray_sys_free(buf); status = RAY_JREPLAY_IO;      break; }
+        if (pr != (size_t)hdr.size) {
+            ray_sys_free(buf);
+            status = RAY_JREPLAY_BADTAIL;
+            break;
+        }
+
+        uint8_t* payload  = NULL;
+        int64_t  pay_len  = 0;
+        bool     owned    = false;
+        if (!decompress_if_needed(&hdr, buf, hdr.size,
+                                  &payload, &pay_len, &owned)) {
+            ray_sys_free(buf);
+            /* Framing was intact (header parsed OK, payload size matched);
+             * "decode failed" is a content/code bug, NOT a tail truncation,
+             * so do not point the operator at `truncate to recover`. */
+            status = RAY_JREPLAY_DECOMP;
+            break;
+        }
+
+        int64_t consumed = pay_len;
+        ray_t*  msg      = ray_de_raw(payload, &consumed);
+        if (owned) ray_sys_free(payload);   /* payload aliases buf unless decompression allocated it */
+        ray_sys_free(buf);
+
+        if (!msg || RAY_IS_ERR(msg)) {
+            if (msg) ray_error_free(msg);   /* ray_release is a no-op on errors */
+            status = RAY_JREPLAY_DESER;
+            break;
+        }
+
+        /* Re-impose the sender's restricted state for THIS frame.  Without
+         * this a `-U` client's writes silently elevate to full privilege
+         * across crash-restart, since replay runs in the main thread with
+         * no IPC connection context. */
+        bool prev_restricted = ray_eval_get_restricted();
+        ray_eval_set_restricted(hdr.flags & RAY_IPC_FLAG_RESTRICTED);
+        ray_t* result = eval_one(msg);
+        ray_eval_set_restricted(prev_restricted);
+        ray_release(msg);
+
+        if (result && RAY_IS_ERR(result)) {   /* eval errors are logged and counted; replay continues */
+            const char* code = ray_err_code(result);
+            fprintf(stderr, "log: WARN  chunk %lld raised: %s (during replay)\n",
+                    (long long)chunks, code ? code : "?");
+            errs++;
+            ray_error_free(result);
+        } else if (result && result != RAY_NULL_OBJ) {
+            ray_release(result);
+        }
+
+        chunks++;
+    }
+
+    fclose(f);
+    g_journal.in_replay = prev_in_replay;
+
+    if (out_chunks)      *out_chunks      = chunks;
+    if (out_eval_errors) *out_eval_errors = errs;
+    if (out_status)      *out_status      = status;
+
+    if (status == RAY_JREPLAY_OK)  return RAY_OK;
+    if (status == RAY_JREPLAY_IO ||
+        status == RAY_JREPLAY_OOM) return RAY_ERR_IO;   /* OOM shares the I/O error code */
+    return RAY_ERR_DOMAIN;                              /* BADTAIL, DECOMP, DESER */
+}
+
+ray_err_t ray_journal_validate(const char* path,
+                               int64_t*    out_chunks,
+                               int64_t*    out_valid_bytes)
+{   /* Counts intact frames and the byte length of the valid prefix; checks framing only, evaluates nothing. */
+    if (out_chunks)      *out_chunks      = 0;
+    if (out_valid_bytes) *out_valid_bytes = 0;
+
+    FILE* f = fopen(path, "rb");
+    if (!f) return RAY_ERR_IO;
+
+    int64_t chunks = 0, valid_off = 0;
+    ray_err_t rc = RAY_OK;   /* RAY_ERR_IO when the scan itself failed, so the counts are only a lower bound */
+    /* Reuse one growing buffer for payload reads — most logs hold
+     * many small entries plus the occasional large one, so growing on
+     * demand is simpler than trying to size up front. */
+    uint8_t* buf = NULL;
+    int64_t  cap = 0;
+
+    for (;;) {
+        ray_ipc_header_t hdr;
+        size_t r = read_full(f, &hdr, sizeof(hdr));
+        if (r == 0) break;                                /* clean EOF */
+        if (r == SIZE_MAX) { rc = RAY_ERR_IO; break; }    /* read error, not a torn tail */
+        if (r != sizeof(hdr)) break;                      /* truncated header */
+        if (hdr.prefix != RAY_SERDE_PREFIX || hdr.version != RAY_SERDE_WIRE_VERSION) break;
+        if (hdr.size <= 0 || hdr.size > 256LL*1024*1024) break;
+
+        if (hdr.size > cap) {
+            uint8_t* tmp = (uint8_t*)ray_sys_alloc((size_t)hdr.size);
+            if (!tmp) { rc = RAY_ERR_IO; break; }         /* OOM: the rest is unchecked, not invalid */
+            if (buf) ray_sys_free(buf);
+            buf = tmp;
+            cap = hdr.size;
+        }
+        /* Actually consume the payload bytes — fseek would silently succeed
+         * past EOF and we'd over-count valid frames on a truncated log. */
+        size_t pr = read_full(f, buf, (size_t)hdr.size);
+        if (pr == SIZE_MAX) rc = RAY_ERR_IO;              /* read error; the check below ends the scan */
+        if (pr != (size_t)hdr.size) break;
+        valid_off += (int64_t)sizeof(hdr) + hdr.size;
+        chunks++;
+    }
+    if (buf) ray_sys_free(buf);
+    fclose(f);
+
+    if (out_chunks)      *out_chunks      = chunks;
+    if (out_valid_bytes) *out_valid_bytes = valid_off;
+    return rc;
+}
+
+/* Open g_journal.log_path (<base>.log) in append mode; ray_journal_open calls this last. */
+static ray_err_t open_log_for_append(void) {
+    g_journal.fp = fopen(g_journal.log_path, "ab");
+    if (!g_journal.fp) return RAY_ERR_IO;
+    /* Disable stdio buffering: every fwrite must reach the OS buffer
+     * immediately so a SIGTERM (or any non-clean shutdown) still leaves
+     * the entry on disk.  Without this, the default block-buffered FILE*
+     * keeps recent writes in user-space until 4 KB accumulates — a
+     * silent data-loss window that defeats the whole point of -l mode.
+     * In RAY_JOURNAL_SYNC mode we additionally fsync per write; here
+     * we just need the bytes to leave the process. */
+    setvbuf(g_journal.fp, NULL, _IONBF, 0);
+    return RAY_OK;
+}
+
+ray_err_t ray_journal_open(const char* base, ray_journal_mode_t mode) {
+    if (!base || !*base) return RAY_ERR_DOMAIN;
+    if (g_journal.fp) return RAY_ERR_DOMAIN;   /* already open */
+    /* Sequence: load the <base>.qdb snapshot, replay <base>.log, then open the log for append. */
+    size_t blen = strlen(base);
+    if (blen + 5 >= sizeof(g_journal.base)) return RAY_ERR_DOMAIN;
+
+    memcpy(g_journal.base, base, blen + 1);
+    g_journal.mode = mode;
+    if (!path_join_ext(g_journal.log_path, sizeof(g_journal.log_path), base, ".log"))
+        return RAY_ERR_DOMAIN;
+
+    char qdb_path[RAY_JOURNAL_PATH_MAX];
+    if (!path_join_ext(qdb_path, sizeof(qdb_path), base, ".qdb"))
+        return RAY_ERR_DOMAIN;
+
+    /* 1. Snapshot — load <base>.qdb if present. */
+    if (file_exists(qdb_path)) {
+        bool prev_in_replay = g_journal.in_replay;
+        g_journal.in_replay = true;
+        ray_t* snap = ray_obj_load(qdb_path);
+        g_journal.in_replay = prev_in_replay;
+
+        if (!snap || RAY_IS_ERR(snap)) {
+            const char* code = (snap && RAY_IS_ERR(snap)) ? ray_err_code(snap) : "io";
+            fprintf(stderr, "log: ERROR  failed to load snapshot %s (%s)\n",
+                    qdb_path, code ? code : "io");
+            if (snap) ray_error_free(snap);
+            return RAY_ERR_IO;
+        }
+        if (snap->type != RAY_DICT) {
+            fprintf(stderr, "log: ERROR  snapshot %s is not a dict\n", qdb_path);
+            ray_release(snap);
+            return RAY_ERR_DOMAIN;
+        }
+        ray_t* keys = ray_dict_keys(snap);
+        ray_t* vals = ray_dict_vals(snap);
+        int64_t n = keys ? keys->len : 0;
+        int64_t bound = 0, skipped = 0, bind_errs = 0;
+        for (int64_t i = 0; i < n; i++) {
+            if (!keys || keys->type != RAY_SYM) {
+                fprintf(stderr, "log: WARN  snapshot key vector has type %d, expected RAY_SYM — dropping %lld bindings\n",
+                        keys ? (int)keys->type : -1, (long long)(n - i));
+                skipped += n - i;
+                break;
+            }
+            int64_t sym_id = ((int64_t*)ray_data(keys))[i];
+            ray_t* v = ray_list_get(vals, i);
+            if (!v) {
+                fprintf(stderr, "log: WARN  snapshot value missing for sym %lld — skipping\n",
+                        (long long)sym_id);
+                skipped++;
+                continue;
+            }
+            /* MUST go through ray_env_set, NOT ray_env_bind: the former
+             * flips the slot's user flag, the latter installs as builtin
+             * and the value silently drops out of the next snapshot's
+             * ray_env_list_user filter — silent corruption across two
+             * restarts.  ray_env_set handles its own retain via
+             * env_bind_global_impl, so do NOT explicitly retain here
+             * (would leak one ref per binding). */
+            ray_err_t e = ray_env_set(sym_id, v);
+            if (e != RAY_OK) {
+                fprintf(stderr, "log: WARN  snapshot bind for sym %lld failed (%d)\n",
+                        (long long)sym_id, (int)e);
+                bind_errs++;
+                continue;
+            }
+            bound++;
+        }
+        ray_release(snap);
+        fprintf(stderr, "log: loaded snapshot %s (%lld bound, %lld skipped, %lld errors)\n",
+                qdb_path, (long long)bound, (long long)skipped, (long long)bind_errs);
+        if (bind_errs > 0) {
+            /* Partial state is a footgun: the caller should treat this as fatal and restore from backup or skip the snapshot. */
+            fprintf(stderr, "log: ERROR  snapshot load left env in a partially-applied state\n");
+            return RAY_ERR_DOMAIN;
+        }
+    }
+
+    /* 2. Log — replay <base>.log if present. */
+    if (file_exists(g_journal.log_path)) {
+        int64_t chunks = 0, errs = 0;
+        ray_jreplay_status_t status = RAY_JREPLAY_OK;
+        ray_journal_replay(g_journal.log_path, &chunks, &errs, &status);
+        switch (status) {
+        case RAY_JREPLAY_OK: {
+            fprintf(stderr, "log: replayed %lld entries (%lld eval errors) from %s\n",
+                    (long long)chunks, (long long)errs, g_journal.log_path);
+            break;
+        }
+        case RAY_JREPLAY_BADTAIL: {
+            int64_t valid_chunks = 0, valid_bytes = 0;
+            if (ray_journal_validate(g_journal.log_path, &valid_chunks, &valid_bytes) != RAY_OK) {
+                fprintf(stderr, "log: ERROR badtail in %s after %lld entries; re-scan for the valid prefix failed\n",
+                        g_journal.log_path, (long long)chunks);
+                return RAY_ERR_IO;  /* no trusted offset, so no truncate hint */
+            }
+            fprintf(stderr,
+                    "log: ERROR badtail in %s after %lld entries (valid bytes = %lld)\n"
+                    "log: hint: truncate the file at offset %lld to recover the\n"
+                    "log:       valid prefix, then restart\n",
+                    g_journal.log_path, (long long)chunks, (long long)valid_bytes, (long long)valid_bytes);
+            return RAY_ERR_DOMAIN;
+        }
+        case RAY_JREPLAY_DESER:
+        case RAY_JREPLAY_DECOMP: {
+            fprintf(stderr,
+                    "log: ERROR replay failed at chunk %lld in %s: %s — framing\n"
+                    "log:       was intact so this is content/code mismatch, NOT\n"
+                    "log:       tail truncation.  Do NOT truncate the log; either\n"
+                    "log:       fix the version skew or restore from .qdb backup.\n",
+                    (long long)chunks, g_journal.log_path,
+                    status == RAY_JREPLAY_DECOMP ? "decompression failed" : "deserialization failed");
+            return RAY_ERR_DOMAIN;
+        }
+        case RAY_JREPLAY_OOM:
+        case RAY_JREPLAY_IO: {
+            fprintf(stderr, "log: ERROR replay aborted at chunk %lld in %s (%s)\n",
+                    (long long)chunks, g_journal.log_path,
+                    status == RAY_JREPLAY_OOM ? "out of memory" : "I/O failure");
+            return RAY_ERR_IO;
+        }
+        }
+    }
+
+    /* 3. Open log for append. */
+    return open_log_for_append();
+}
+
+ray_err_t ray_journal_close(void) {
+    FILE* log_fp = g_journal.fp;
+    if (log_fp == NULL) return RAY_OK;   /* nothing open: closing is a no-op */
+    /* Detach the handle first so the journal never lingers half-open,
+     * whatever happens below.  Both results are inspected because a
+     * buffered write error (e.g. ENOSPC) only surfaces at flush/close
+     * time and would otherwise be lost at clean shutdown. */
+    g_journal.fp = NULL;
+    int rc_flush = fflush(log_fp);
+    int rc_close = fclose(log_fp);
+    if (rc_flush == 0 && rc_close == 0) return RAY_OK;
+
+    fprintf(stderr, "log: ERROR  journal close (flush rc=%d, close rc=%d)\n",
+            rc_flush, rc_close);
+    return RAY_ERR_IO;
+}
+
+ray_err_t ray_journal_sync(void) {
+    if (!g_journal.fp) return RAY_OK;                        /* no journal open: nothing to sync */
+    if (g_journal.mode == RAY_JOURNAL_SYNC) return RAY_OK;   /* SYNC mode: explicit sync is skipped */
+    if (fflush(g_journal.fp) != 0) return RAY_ERR_IO;        /* stdio buffer -> OS */
+#ifndef RAY_OS_WINDOWS
+    if (fsync(fileno(g_journal.fp)) != 0) return RAY_ERR_IO; /* OS cache -> storage */
+#else
+    if (!FlushFileBuffers((HANDLE)_get_osfhandle(_fileno(g_journal.fp)))) return RAY_ERR_IO; /* 0 == failure */
+#endif
+    return RAY_OK;
+}
+
+/* Format the current UTC time as "YYYY.MM.DDTHH.MM.SSZ" (filename-safe: no ':'). */
+static void utc_stamp(char* buf, size_t bufsz) {
+    struct tm utc;
+    time_t now = time(NULL);
+#ifndef RAY_OS_WINDOWS
+    gmtime_r(&now, &utc);
+#else
+    gmtime_s(&utc, &now);
+#endif
+    strftime(buf, bufsz, "%Y.%m.%dT%H.%M.%SZ", &utc);
+}
+
+ray_err_t ray_journal_roll(void) {
+    if (!g_journal.fp) return RAY_ERR_DOMAIN;
+
+    /* Build the archive name BEFORE closing the fp — if path build
+     * fails we can return without leaving the journal in a closed
+     * state that ray_journal_write_bytes silently no-ops on. */
+    char stamp[64];
+    utc_stamp(stamp, sizeof(stamp));
+    char archive[RAY_JOURNAL_PATH_MAX];
+    int  n = snprintf(archive, sizeof(archive), "%s.%s.log", g_journal.base, stamp);
+    if (n <= 0 || (size_t)n >= sizeof(archive)) return RAY_ERR_DOMAIN;
+    /* The stamp has one-second resolution, so a second roll inside the
+     * same second would rename over (and destroy) the earlier archive.
+     * Probe for a free name: <base>.<stamp>.<seq>.log. */
+    for (int seq = 1; file_exists(archive); seq++) {
+        n = snprintf(archive, sizeof(archive), "%s.%s.%d.log", g_journal.base, stamp, seq);
+        if (n <= 0 || (size_t)n >= sizeof(archive)) return RAY_ERR_DOMAIN;
+    }
+
+    int flush_rc = fflush(g_journal.fp);
+    int close_rc = fclose(g_journal.fp);
+    g_journal.fp = NULL;
+    if (flush_rc != 0 || close_rc != 0) {
+        /* Don't rename a possibly-corrupt log; reopen so writes still land. */
+        fprintf(stderr, "log: ERROR  roll: pre-rename flush/close failed (flush=%d close=%d)\n",
+                flush_rc, close_rc);
+        (void)open_log_for_append();   /* best-effort restore; fp may stay NULL */
+        return RAY_ERR_IO;
+    }
+    if (ray_file_rename(g_journal.log_path, archive) != RAY_OK) {
+        /* The log is still on disk under its original name: reopen it so
+         * journaling is not silently disabled for the rest of the process. */
+        fprintf(stderr, "log: ERROR  roll: rename %s -> %s failed\n",
+                g_journal.log_path, archive);
+        (void)open_log_for_append();
+        return RAY_ERR_IO;
+    }
+    /* Best-effort parent-directory fsync so the rename survives power
+     * loss; a failure is ignored because the rename itself succeeded. */
+    (void)ray_file_sync_dir(archive);
+    return open_log_for_append();
+}
+
+ray_err_t ray_journal_snapshot(void) {
+    if (!g_journal.fp) return RAY_ERR_DOMAIN;   /* a snapshot requires an open journal */
+
+    /* Enumerate ONLY user-defined globals (slots last written via
+     * ray_env_set).  Builtin function objects must NOT enter the
+     * snapshot — they hold absolute pointers from the prior process
+     * and would dangle on reload.  ray_env_list_user is the one-bit-
+     * per-slot filter maintained by env.c. */
+    int32_t cap = ray_env_global_count();
+    if (cap <= 0) cap = 1;                      /* keep the two allocations below non-zero-sized */
+    int64_t* sym_ids = (int64_t*)ray_sys_alloc((size_t)cap * sizeof(int64_t));
+    ray_t**  vals_buf = (ray_t**) ray_sys_alloc((size_t)cap * sizeof(ray_t*));
+    if (!sym_ids || !vals_buf) {
+        if (sym_ids)  ray_sys_free(sym_ids);
+        if (vals_buf) ray_sys_free(vals_buf);
+        return RAY_ERR_OOM;
+    }
+    int32_t kept = ray_env_list_user(sym_ids, vals_buf, cap);   /* presumably the number of slots filled — confirm */
+
+    ray_t* keys = ray_sym_vec_new(RAY_SYM_W64, kept);
+    if (!keys || RAY_IS_ERR(keys)) {
+        if (keys && RAY_IS_ERR(keys)) ray_error_free(keys);
+        ray_sys_free(sym_ids); ray_sys_free(vals_buf);
+        return RAY_ERR_OOM;
+    }
+    ray_t* vals = ray_list_new(kept);
+    if (!vals || RAY_IS_ERR(vals)) {
+        if (vals && RAY_IS_ERR(vals)) ray_error_free(vals);
+        ray_release(keys);
+        ray_sys_free(sym_ids); ray_sys_free(vals_buf);
+        return RAY_ERR_OOM;
+    }
+    for (int32_t i = 0; i < kept; i++) {
+        /* ray_vec_append returns an error sentinel on failure but the
+         * input `keys` was either mutated in place (rc==1, no cow) or
+         * cow'd and released internally — either way the caller still
+         * owns the original pointer which is now stale.  Take the
+         * pre-call pointer so we can release whichever survived. */
+        ray_t* prev_keys = keys;
+        keys = ray_vec_append(keys, &sym_ids[i]);
+        if (RAY_IS_ERR(keys)) {
+            ray_error_free(keys);
+            ray_release(prev_keys);
+            ray_release(vals);
+            ray_sys_free(sym_ids); ray_sys_free(vals_buf);
+            return RAY_ERR_OOM;
+        }
+        ray_t* prev_vals = vals;                    /* same pre-call-pointer pattern as for keys */
+        vals = ray_list_append(vals, vals_buf[i]);
+        if (RAY_IS_ERR(vals)) {
+            ray_error_free(vals);
+            ray_release(prev_vals);
+            ray_release(keys);
+            ray_sys_free(sym_ids); ray_sys_free(vals_buf);
+            return RAY_ERR_OOM;
+        }
+    }
+    ray_sys_free(sym_ids);                          /* scratch arrays are no longer needed */
+    ray_sys_free(vals_buf);
+
+    ray_t* snap = ray_dict_new(keys, vals);
+    if (!snap || RAY_IS_ERR(snap)) {
+        /* NOTE(review): keys/vals are not released here — confirm ray_dict_new consumes them on failure. */
+        if (snap && RAY_IS_ERR(snap)) ray_error_free(snap);
+        return RAY_ERR_OOM;
+    }
+
+    char qdb_path[RAY_JOURNAL_PATH_MAX];
+    char qdb_tmp[RAY_JOURNAL_PATH_MAX];
+    if (!path_join_ext(qdb_path, sizeof(qdb_path), g_journal.base, ".qdb") ||
+        !path_join_ext(qdb_tmp,  sizeof(qdb_tmp),  g_journal.base, ".qdb.tmp")) {
+        ray_release(snap);
+        return RAY_ERR_DOMAIN;
+    }
+
+    /* ray_obj_save writes prefix-headered bytes (same wire framing). */
+    ray_err_t e = ray_obj_save(snap, qdb_tmp);
+    ray_release(snap);                              /* released whether or not the save succeeded */
+    if (e != RAY_OK) {
+        /* Don't leave a half-written .qdb.tmp behind to confuse the
+         * next snapshot or the operator. */
+        remove(qdb_tmp);
+        return e;
+    }
+
+    if (ray_file_rename(qdb_tmp, qdb_path) != RAY_OK) {
+        remove(qdb_tmp);
+        return RAY_ERR_IO;
+    }
+    /* Parent-dir fsync: rename(2) is atomic but the directory entry
+     * isn't durable across a power loss without it.  Best-effort. */
+    (void)ray_file_sync_dir(qdb_path);
+
+    return ray_journal_roll();                      /* archive the old log and start a fresh one */
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/journal.h b/crates/rayforce-sys/vendor/rayforce/src/store/journal.h
new file mode 100644
index 0000000..1f336c5
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/store/journal.h
@@ -0,0 +1,123 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+/* Transaction-log journaling — q/kdb's `-l` / `-L` ported to Rayforce.
+ *
+ * Wire format: every entry is a complete IPC message (16-byte
+ * ray_ipc_header_t followed by serialized payload), so log frames
+ * share parser code with the live IPC path.  Concatenation by `cat`
+ * is valid by construction.
+ *
+ * Open with ray_journal_open(base, mode):
+ *   1. If <base>.qdb exists, load it and bind every key into the global env.
+ *   2. If <base>.log exists, replay it (badtail is fatal, eval errors warn).
+ *   3. Open <base>.log for append.
+ *
+ * After open, the IPC dispatch hook (eval_payload in core/ipc.c) calls
+ * ray_journal_write_bytes() for every inbound sync message before
+ * evaluating it.  Async messages and responses are not journaled,
+ * matching q's policy of journaling only the .z.ps stream.
+ *
+ * Replay is single-threaded by construction (it runs from main, before
+ * the poll loop starts) so the module is intentionally not thread-safe;
+ * the IPC dispatch loop is also single-threaded for eval_payload, so the
+ * shared file handle does not need a mutex either.
+ */
+#ifndef RAY_JOURNAL_H
+#define RAY_JOURNAL_H
+
+#include <rayforce.h>
+#include "store/serde.h"
+
+typedef enum {
+    RAY_JOURNAL_OFF   = 0,   /* presumably: journaling disabled — TODO confirm */
+    RAY_JOURNAL_ASYNC = 1,   /* -l: write, no per-message fsync          */
+    RAY_JOURNAL_SYNC  = 2,   /* -L: write + fsync per message            */
+} ray_journal_mode_t;
+
+typedef enum {
+    RAY_JREPLAY_OK      = 0, /* clean tail — ray_journal_open prints the replay summary and continues   */
+    RAY_JREPLAY_BADTAIL = 1, /* truncated frame / bad magic / version mismatch — framing broken */
+    RAY_JREPLAY_IO      = 2, /* file open / read I/O failure                                    */
+    RAY_JREPLAY_OOM     = 3, /* allocation failed mid-replay — transient, retryable             */
+    RAY_JREPLAY_DESER   = 4, /* header valid but ray_de_raw rejected the payload                */
+    RAY_JREPLAY_DECOMP  = 5, /* compressed payload, but decompression failed                    */
+} ray_jreplay_status_t;
+
+/* Open the journal: load <base>.qdb, replay <base>.log, open log for append.
+ * Returns RAY_OK on success and prints a one-line replay summary to stderr.
+ * Returns RAY_ERR_DOMAIN on a partially-applied snapshot, a badtail (a
+ * truncate hint is printed), or a deserialize/decompress failure, and
+ * RAY_ERR_IO on replay OOM / I/O failure.  Callers should exit non-zero. */
+ray_err_t ray_journal_open(const char* base, ray_journal_mode_t mode);
+
+/* True iff a journal is currently open for append. */
+bool ray_journal_is_open(void);
+
+/* Append one entry to the active journal.  No-op (returns RAY_OK) if
+ * no journal is open or if a replay is currently in progress (we do
+ * NOT recursively log replayed messages even if .log.write is called
+ * from a replayed entry).  In RAY_JOURNAL_SYNC mode, fflush + fsync
+ * before returning. */
+ray_err_t ray_journal_write_bytes(const ray_ipc_header_t* hdr,
+                                  const uint8_t*          payload,
+                                  int64_t                 payload_len);
+
+/* Replay a log file, evaluating each entry in order.  Sets *out_chunks
+ * to entries successfully replayed and *out_eval_errors to entries that
+ * deserialized cleanly but raised an error during ray_eval (those are
+ * skipped with a stderr warning, not fatal — framing was intact).
+ * *out_status is RAY_JREPLAY_OK on a clean tail, RAY_JREPLAY_BADTAIL for a
+ * truncated/corrupt frame, or one of IO / OOM / DESER / DECOMP (see enum). */
+ray_err_t ray_journal_replay(const char*           path,
+                             int64_t*              out_chunks,
+                             int64_t*              out_eval_errors,
+                             ray_jreplay_status_t* out_status);
+
+/* Validate (parse but don't eval) — q's `-11!(-2; file)` analogue.
+ * *out_chunks counts valid entries; *out_valid_bytes is the byte
+ * offset of the first bad header (== file size on a clean log). */
+ray_err_t ray_journal_validate(const char* path,
+                               int64_t*    out_chunks,
+                               int64_t*    out_valid_bytes);
+
+/* Close the active log, rename it to <base>.<UTC-ISO8601>.log, open a
+ * fresh empty <base>.log for append.  Errors if no journal is open. */
+ray_err_t ray_journal_roll(void);
+
+/* Serialize every user (non-reserved) global env binding into a dict and
+ * write it as a single entry to <base>.qdb.tmp, then atomic-rename to
+ * <base>.qdb, then call ray_journal_roll.  After this, the .log file
+ * is fresh and a future restart loads .qdb instead of replaying the
+ * old (now archived) log. */
+ray_err_t ray_journal_snapshot(void);
+
+/* Force fflush + fsync on the active journal.  No-op (RAY_OK) when no
+ * journal is open or when in RAY_JOURNAL_SYNC mode (where every write
+ * already syncs). */
+ray_err_t ray_journal_sync(void);
+
+/* Close the active journal.  No-op if none is open. */
+ray_err_t ray_journal_close(void);
+
+#endif /* RAY_JOURNAL_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/meta.c b/crates/rayforce-sys/vendor/rayforce/src/store/meta.c
new file mode 100644
index 0000000..d4889f3
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/store/meta.c
@@ -0,0 +1,43 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "meta.h"
+#include "store/col.h"
+#include <string.h>
+#include <stdio.h>
+
+/* --------------------------------------------------------------------------
+ * .d file: serialized I64 vector of column name symbol IDs
+ *
+ * ray_meta_save_d: write schema vector to .d file
+ * ray_meta_load_d: read .d file back as I64 vector
+ * -------------------------------------------------------------------------- */
+
+ray_err_t ray_meta_save_d(ray_t* schema, const char* path) {
+    int usable = schema && !RAY_IS_ERR(schema);   /* reject NULL and error values up front */
+    return usable ? ray_col_save(schema, path) : RAY_ERR_TYPE;
+}
+
+ray_t* ray_meta_load_d(const char* path) {
+    return ray_col_load(path);   /* thin wrapper: the .d file is read with the generic column loader */
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/meta.h b/crates/rayforce-sys/vendor/rayforce/src/store/meta.h
new file mode 100644
index 0000000..f76065c
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/store/meta.h
@@ -0,0 +1,33 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_META_H
+#define RAY_META_H
+
+#include <rayforce.h>
+
+/* Metadata: .d schema file helpers; both delegate to ray_col_save / ray_col_load (see meta.c). */
+ray_err_t ray_meta_save_d(ray_t* schema, const char* path);   /* RAY_ERR_TYPE if schema is NULL or an error value */
+ray_t*    ray_meta_load_d(const char* path);
+
+#endif /* RAY_META_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/part.c b/crates/rayforce-sys/vendor/rayforce/src/store/part.c
new file mode 100644
index 0000000..0646ddb
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/store/part.c
@@ -0,0 +1,503 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#if defined(__APPLE__)
+#define _DARWIN_C_SOURCE
+#elif !defined(RAY_OS_WINDOWS)
+#define _GNU_SOURCE
+#endif
+#include "part.h"
+#include "core/platform.h"
+#include "mem/sys.h"
+#include "ops/ops.h"
+#include "store/splay.h"
+#include "table/sym.h"
+#include <string.h>
+#include <stdio.h>
+#include <dirent.h>
+#include <sys/stat.h>
+
+/* True when `name` is a well-formed YYYY.MM.DD partition name: exactly 10
+ * chars, '.' at offsets 4 and 7, digits elsewhere, month 01-12, day 01-31. */
+static bool is_date_dir(const char* name) {
+    static const char shape[] = "dddd.dd.dd";   /* d = digit, . = literal dot */
+    if (strlen(name) != sizeof(shape) - 1) return false;
+    for (size_t pos = 0; shape[pos] != '\0'; pos++) {
+        char ch = name[pos];
+        if (shape[pos] == '.' ? ch != '.' : (ch < '0' || ch > '9')) return false;
+    }
+    int mm = 10 * (name[5] - '0') + (name[6] - '0');
+    int dd = 10 * (name[8] - '0') + (name[9] - '0');
+    return (mm >= 1 && mm <= 12) && (dd >= 1 && dd <= 31);
+}
+
+/* True when `s` is a non-empty run of decimal digits, optionally preceded
+ * by a single leading minus sign.  The empty string and a lone "-" are
+ * rejected. */
+static bool is_integer_str(const char* s) {
+    const char* p = (*s == '-') ? s + 1 : s;   /* step over the optional sign */
+    if (*p == '\0') return false;              /* no digits at all */
+    while (*p >= '0' && *p <= '9') p++;
+    return *p == '\0';                         /* anything left over is a non-digit */
+}
+
+/* Infer the MAPCOMMON sub-type from partition directory names: DATE when
+ * every name is YYYY.MM.DD, else I64 when every name is an integer, else SYM. */
+static uint8_t infer_mc_type(char** part_dirs, int64_t part_count) {
+    bool dates = true, ints = true;
+    for (int64_t i = 0; i < part_count && (dates || ints); i++) {
+        const char* dir = part_dirs[i];
+        dates = dates && is_date_dir(dir);
+        ints  = ints  && is_integer_str(dir);
+    }
+    if (dates) return RAY_MC_DATE;
+    return ints ? RAY_MC_I64 : RAY_MC_SYM;
+}
+
+/* Convert a validated "YYYY.MM.DD" name to days since 2000-01-01 (Rayforce
+ * epoch) via the inverse of Hinnant's civil_from_days (same as exec.c). */
+static int32_t parse_date_dir(const char* name) {
+    int64_t year  = 1000*(name[0]-'0') + 100*(name[1]-'0') + 10*(name[2]-'0') + (name[3]-'0');
+    int64_t month = 10*(name[5]-'0') + (name[6]-'0');
+    int64_t day   = 10*(name[8]-'0') + (name[9]-'0');
+    if (month <= 2) year -= 1;                       /* Jan/Feb count toward the previous March-based year */
+    int64_t  era     = (year >= 0 ? year : year - 399) / 400;
+    uint64_t yr_era  = (uint64_t)(year - era * 400);
+    uint64_t mon_idx = (uint64_t)(month > 2 ? month - 3 : month + 9);   /* March = 0 */
+    uint64_t day_yr  = (153 * mon_idx + 2) / 5 + (uint64_t)day - 1;
+    uint64_t day_era = yr_era * 365 + yr_era / 4 - yr_era / 100 + day_yr;
+    return (int32_t)(era * 146097 + (int64_t)day_era - 719468 - 10957);
+}
+
+/* Parse integer string → int64_t, saturating on overflow. Caller guarantees is_integer_str(). */
+static int64_t parse_int_dir(const char* s) {
+    bool neg = (*s == '-');
+    uint64_t v = 0, lim = (uint64_t)INT64_MAX + (neg ? 1u : 0u);   /* largest magnitude that fits */
+    for (s += neg; *s; s++)   /* clamp at lim: overflowing a signed accumulator would be UB */
+        v = (v > (lim - (uint64_t)(*s - '0')) / 10) ? lim : v * 10 + (uint64_t)(*s - '0');
+    return !neg ? (int64_t)v : (v == lim) ? INT64_MIN : -(int64_t)v;
+}
+
+/* --------------------------------------------------------------------------
+ * Partitioned table: date-partitioned directory of splayed tables
+ *
+ * Format:
+ *   db_root/sym              — global symbol intern table
+ *   db_root/YYYY.MM.DD/      — partition directories
+ *   db_root/YYYY.MM.DD/table — splayed table per partition
+ *
+ * No symlink check: local-trust file format; path traversal checks
+ * cover main attack vector.
+ * -------------------------------------------------------------------------- */
+
+/* --------------------------------------------------------------------------
+ * collect_part_dirs — scan db_root for partition directories
+ *
+ * Collects names made only of digits/dots, sorted ascending by strcmp ("sym"
+ * is skipped when skip_sym is set).  RAY_ERR_IO: unreadable or empty root;
+ * RAY_ERR_OOM: allocation failed.  Caller frees entries + array (ray_sys_free).
+ * -------------------------------------------------------------------------- */
+
+static ray_err_t collect_part_dirs(const char* db_root, char*** out_dirs,
+                                   int64_t* out_count, bool skip_sym) {
+    DIR* d = opendir(db_root);
+    if (!d) return RAY_ERR_IO;
+
+    char** part_dirs = NULL;
+    int64_t part_count = 0, part_cap = 0;
+    bool oom = false;
+
+    struct dirent* ent;
+    while ((ent = readdir(d)) != NULL) {
+        if (ent->d_name[0] == '.') continue;
+        if (skip_sym && strcmp(ent->d_name, "sym") == 0) continue;
+
+        /* Partition directory name format validation is intentionally loose:
+         * accepts any sequence of digits and dots (e.g. "2024.01.15").
+         * Invalid entries fail during splay load and are caught there. */
+        bool valid = (ent->d_name[0] != '\0');
+        for (const char* c = ent->d_name; *c; c++) {
+            if (*c == '.' || (*c >= '0' && *c <= '9')) continue;
+            valid = false; break;
+        }
+        if (!valid) continue;
+
+        if (part_count >= part_cap) {
+            int64_t new_cap = part_cap == 0 ? 16 : part_cap * 2;
+            char** tmp = (char**)ray_sys_realloc(part_dirs, (size_t)new_cap * sizeof(char*));
+            if (!tmp) { oom = true; break; }   /* never hand back a silently truncated list */
+            part_dirs = tmp; part_cap = new_cap;   /* commit the capacity only once realloc succeeded */
+        }
+        char* dup = ray_sys_strdup(ent->d_name);
+        if (!dup) { oom = true; break; }
+        part_dirs[part_count++] = dup;
+    }
+    closedir(d);
+
+    if (oom || part_count == 0) {
+        for (int64_t i = 0; i < part_count; i++) ray_sys_free(part_dirs[i]);
+        ray_sys_free(part_dirs);
+        return oom ? RAY_ERR_OOM : RAY_ERR_IO;
+    }
+
+    /* Exchange sort for deterministic order; O(n^2), fine for typically < 1000 partitions. */
+    for (int64_t i = 0; i < part_count - 1; i++) {
+        for (int64_t j = i + 1; j < part_count; j++) {
+            if (strcmp(part_dirs[i], part_dirs[j]) > 0) {
+                char* tmp = part_dirs[i];
+                part_dirs[i] = part_dirs[j];
+                part_dirs[j] = tmp;
+            }
+        }
+    }
+
+    *out_dirs = part_dirs;
+    *out_count = part_count;
+    return RAY_OK;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_part_load — load a partitioned table
+ *
+ * Loads <db_root>/<partition>/<table_name> for every partition directory and
+ * concatenates each column in partition order; one failed partition aborts.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_part_load(const char* db_root, const char* table_name) {
+    if (!db_root || !table_name) return ray_error("io", NULL);
+
+    /* Validate table_name: no path separators or traversal */
+    if (strchr(table_name, '/') || strchr(table_name, '\\') ||
+        strstr(table_name, "..") || table_name[0] == '.')
+        return ray_error("io", NULL);
+
+    /* Scan db_root for partition directories */
+    char** part_dirs = NULL;
+    int64_t part_count = 0;
+    ray_err_t collect_err = collect_part_dirs(db_root, &part_dirs, &part_count, false);
+    if (collect_err != RAY_OK) return ray_error("io", NULL);
+
+    /* Build sym_path for this db_root */
+    char sym_path[1024];
+    int sn = snprintf(sym_path, sizeof(sym_path), "%s/sym", db_root);
+    if (sn < 0 || (size_t)sn >= sizeof(sym_path)) {
+        for (int64_t i = 0; i < part_count; i++) ray_sys_free(part_dirs[i]);
+        ray_sys_free(part_dirs);
+        return ray_error("io", NULL);
+    }
+
+    /* Load first partition to get schema. */
+    char path[1024];
+    int n = snprintf(path, sizeof(path), "%s/%s/%s", db_root, part_dirs[0], table_name);
+    if (n < 0 || (size_t)n >= sizeof(path)) {
+        for (int64_t i = 0; i < part_count; i++) ray_sys_free(part_dirs[i]);
+        ray_sys_free(part_dirs);
+        return ray_error("io", NULL);
+    }
+    ray_t* first = ray_splay_load(path, sym_path);
+    if (!first || RAY_IS_ERR(first)) {
+        for (int64_t i = 0; i < part_count; i++) ray_sys_free(part_dirs[i]);
+        ray_sys_free(part_dirs);
+        return first;   /* hand the loader's own error (or NULL) straight back */
+    }
+
+    if (part_count == 1) {   /* single partition: nothing to concatenate */
+        for (int64_t i = 0; i < part_count; i++) ray_sys_free(part_dirs[i]);
+        ray_sys_free(part_dirs);
+        return first;
+    }
+
+    /* Load remaining partitions and concatenate */
+    int64_t ncols = ray_table_ncols(first);
+    /* Accumulate rows from all partitions */
+    ray_t** all_dfs = (ray_t**)ray_sys_alloc((size_t)part_count * sizeof(ray_t*));
+    if (!all_dfs) {
+        ray_release(first);
+        for (int64_t i = 0; i < part_count; i++) ray_sys_free(part_dirs[i]);
+        ray_sys_free(part_dirs);
+        return ray_error("oom", NULL);
+    }
+    all_dfs[0] = first;
+
+    int64_t fail_count = 0;
+    for (int64_t p = 1; p < part_count; p++) {
+        n = snprintf(path, sizeof(path), "%s/%s/%s", db_root, part_dirs[p], table_name);
+        if (n < 0 || (size_t)n >= sizeof(path)) { all_dfs[p] = NULL; fail_count++; continue; }
+        all_dfs[p] = ray_splay_load(path, NULL);   /* sym_path is only passed for the first partition */
+        if (!all_dfs[p] || RAY_IS_ERR(all_dfs[p])) {
+            all_dfs[p] = NULL;
+            fail_count++;
+        }
+    }
+    if (fail_count > 0) {
+        /* One or more partition splay loads failed -- abort entire load */
+        for (int64_t p = 0; p < part_count; p++) {
+            if (all_dfs[p] && !RAY_IS_ERR(all_dfs[p]))
+                ray_release(all_dfs[p]);
+            ray_sys_free(part_dirs[p]);
+        }
+        ray_sys_free(all_dfs);
+        ray_sys_free(part_dirs);
+        return ray_error("io", NULL);
+    }
+
+    /* Build combined table by concatenating columns */
+    ray_t* result = ray_table_new(ncols);
+    for (int64_t c = 0; c < ncols && result && !RAY_IS_ERR(result); c++) {   /* skip the loop if table creation failed */
+        int64_t name_id = ray_table_col_name(first, c);
+        ray_t* combined = ray_table_get_col_idx(first, c);
+        if (!combined) continue;
+        ray_retain(combined);
+
+        for (int64_t p = 1; p < part_count; p++) {
+            if (!all_dfs[p] || RAY_IS_ERR(all_dfs[p])) continue;
+            ray_t* part_col = ray_table_get_col_idx(all_dfs[p], c);
+            if (part_col) {
+                ray_t* new_combined = ray_vec_concat(combined, part_col);
+                ray_release(combined);
+                if (!new_combined || RAY_IS_ERR(new_combined)) {
+                    combined = NULL;
+                    break;
+                }
+                combined = new_combined;
+            }
+        }
+
+        if (!combined) {   /* a concat failed: drop the partial table; "oom" is reported below */
+            ray_release(result);
+            result = NULL;
+            break;
+        }
+        result = ray_table_add_col(result, name_id, combined);
+        ray_release(combined);
+        if (!result || RAY_IS_ERR(result)) break;
+    }
+
+    /* Cleanup */
+    for (int64_t p = 0; p < part_count; p++) {
+        if (all_dfs[p] && !RAY_IS_ERR(all_dfs[p]))
+            ray_release(all_dfs[p]);
+        ray_sys_free(part_dirs[p]);
+    }
+    ray_sys_free(all_dfs);
+    ray_sys_free(part_dirs);
+
+    return result ? result : ray_error("oom", NULL);
+}
+
+/* --------------------------------------------------------------------------
+ * ray_read_parted — zero-copy open of db_root/<partition>/table_name
+ *
+ * Result: a MAPCOMMON column (partition keys + row counts) followed by one
+ * parted column (RAY_PARTED_BASE + base_type) per data column, whose segments
+ * are the mmap'd vectors from ray_read_splayed.  Any failure returns an error object.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_read_parted(const char* db_root, const char* table_name) {
+    if (!db_root || !table_name) return ray_error("io", NULL);
+
+    /* Validate table_name: no path separators or traversal */
+    if (strchr(table_name, '/') || strchr(table_name, '\\') ||
+        strstr(table_name, "..") || table_name[0] == '.')
+        return ray_error("io", NULL);
+
+    /* Build sym_path. */
+    char sym_path[1024];
+    int sn = snprintf(sym_path, sizeof(sym_path), "%s/sym", db_root);
+    if (sn < 0 || (size_t)sn >= sizeof(sym_path))
+        return ray_error("io", NULL);
+
+    /* Load global symfile if present.  Tables without RAY_SYM columns
+     * never produce a global symfile (.db.splayed.set only writes per-table
+     * sym files inside the leaf splayed dir), so a missing root-level
+     * symfile is normal — not an error. */
+    struct stat sym_st;
+    if (stat(sym_path, &sym_st) == 0) {
+        ray_err_t sym_err = ray_sym_load(sym_path);
+        if (sym_err != RAY_OK) return ray_error(ray_err_code_str(sym_err), NULL);
+    }
+
+    /* Scan db_root for partition directories (skip "sym" entry) */
+    char** part_dirs = NULL;
+    int64_t part_count = 0;
+    ray_err_t collect_err = collect_part_dirs(db_root, &part_dirs, &part_count, true);
+    if (collect_err != RAY_OK) return ray_error("io", NULL);
+
+    /* Open each partition via ray_read_splayed.  Zero partitions is an error: part_tables[0] supplies the schema below. */
+    ray_t** part_tables = (part_count > 0) ? (ray_t**)ray_sys_alloc((size_t)part_count * sizeof(ray_t*)) : NULL;
+    if (!part_tables) goto fail_dirs;
+    memset(part_tables, 0, (size_t)part_count * sizeof(ray_t*));
+
+    char path[1024];
+    for (int64_t p = 0; p < part_count; p++) {
+        int pn = snprintf(path, sizeof(path), "%s/%s/%s", db_root, part_dirs[p], table_name);
+        if (pn < 0 || (size_t)pn >= sizeof(path)) {   /* encoding error or truncated path */
+            part_tables[p] = NULL;
+            goto fail_tables;
+        }
+        part_tables[p] = ray_read_splayed(path, NULL);
+        if (!part_tables[p] || RAY_IS_ERR(part_tables[p])) {
+            part_tables[p] = NULL;   /* keep the cleanup loop from releasing an error object */
+            goto fail_tables;
+        }
+    }
+
+    /* Get schema from first partition */
+    int64_t ncols = ray_table_ncols(part_tables[0]);
+    if (ncols <= 0) goto fail_tables;
+
+    /* Infer MAPCOMMON sub-type from partition directory names */
+    uint8_t mc_type = infer_mc_type(part_dirs, part_count);
+
+    /* Build result table: 1 MAPCOMMON + ncols data columns */
+    ray_t* result = ray_table_new(ncols + 2);
+    if (!result || RAY_IS_ERR(result)) goto fail_tables;
+
+    /* ---- MAPCOMMON column (first) ---- */
+    {
+        /* key_values type matches inferred partition key type */
+        int8_t kv_type = (mc_type == RAY_MC_DATE) ? RAY_DATE
+                       : (mc_type == RAY_MC_I64)  ? RAY_I64
+                       :                           RAY_SYM;
+        ray_t* key_values = ray_vec_new(kv_type, part_count);
+        ray_t* row_counts = ray_vec_new(RAY_I64, part_count);
+        if (!key_values || RAY_IS_ERR(key_values) ||
+            !row_counts || RAY_IS_ERR(row_counts)) {
+            if (key_values && !RAY_IS_ERR(key_values)) ray_release(key_values);
+            if (row_counts && !RAY_IS_ERR(row_counts)) ray_release(row_counts);
+            ray_release(result);
+            goto fail_tables;
+        }
+
+        int64_t* rc_data = (int64_t*)ray_data(row_counts);
+        if (mc_type == RAY_MC_DATE) {
+            int32_t* kv_data = (int32_t*)ray_data(key_values);   /* date keys are written as 32-bit values */
+            for (int64_t p = 0; p < part_count; p++) {
+                kv_data[p] = parse_date_dir(part_dirs[p]);
+                rc_data[p] = ray_table_nrows(part_tables[p]);
+            }
+        } else if (mc_type == RAY_MC_I64) {
+            int64_t* kv_data = (int64_t*)ray_data(key_values);
+            for (int64_t p = 0; p < part_count; p++) {
+                kv_data[p] = parse_int_dir(part_dirs[p]);
+                rc_data[p] = ray_table_nrows(part_tables[p]);
+            }
+        } else {
+            int64_t* kv_data = (int64_t*)ray_data(key_values);   /* symbol keys: interned directory names */
+            for (int64_t p = 0; p < part_count; p++) {
+                kv_data[p] = ray_sym_intern(part_dirs[p], strlen(part_dirs[p]));
+                rc_data[p] = ray_table_nrows(part_tables[p]);
+            }
+        }
+        key_values->len = part_count;
+        row_counts->len = part_count;
+
+        ray_t* mapcommon = ray_alloc(2 * sizeof(ray_t*));   /* payload: [key_values, row_counts] */
+        if (!mapcommon || RAY_IS_ERR(mapcommon)) {
+            ray_release(key_values);
+            ray_release(row_counts);
+            ray_release(result);
+            goto fail_tables;
+        }
+        mapcommon->type = RAY_MAPCOMMON;
+        mapcommon->len = 2;
+        mapcommon->attrs = mc_type;
+        memset(mapcommon->nullmap, 0, 16);
+
+        ray_t** mc_ptrs = (ray_t**)ray_data(mapcommon);
+        mc_ptrs[0] = key_values;  ray_retain(key_values);   /* mapcommon takes its own reference; */
+        mc_ptrs[1] = row_counts;  ray_retain(row_counts);   /* the local references are dropped below */
+
+        const char* mc_name = (mc_type == RAY_MC_DATE) ? "date" : "part";
+        int64_t part_name_id = ray_sym_intern(mc_name, strlen(mc_name));
+        result = ray_table_add_col(result, part_name_id, mapcommon);
+        if (!result || RAY_IS_ERR(result)) {
+            ray_release(mapcommon);
+            ray_release(key_values);
+            ray_release(row_counts);
+            goto fail_tables;
+        }
+
+        ray_release(mapcommon);
+        ray_release(key_values);
+        ray_release(row_counts);
+    }
+
+    /* ---- Data columns (after MAPCOMMON) ---- */
+    for (int64_t c = 0; c < ncols; c++) {
+        int64_t name_id = ray_table_col_name(part_tables[0], c);
+        ray_t* first_seg = ray_table_get_col_idx(part_tables[0], c);
+        if (!first_seg) continue;   /* first partition has no column here: skip it entirely */
+
+        ray_t* parted = ray_alloc((size_t)part_count * sizeof(ray_t*));
+        if (!parted || RAY_IS_ERR(parted)) {
+            ray_release(result);
+            goto fail_tables;
+        }
+        parted->type = RAY_PARTED_BASE + first_seg->type;   /* element type taken from the first partition */
+        parted->len = part_count;
+        parted->attrs = 0;
+        memset(parted->nullmap, 0, 16);
+
+        ray_t** segs = (ray_t**)ray_data(parted);
+        for (int64_t p = 0; p < part_count; p++) {
+            ray_t* seg = ray_table_get_col_idx(part_tables[p], c);
+            if (!seg) {
+                segs[p] = NULL;   /* partition lacks this column: leave an empty slot */
+                continue;
+            }
+            ray_retain(seg);
+            segs[p] = seg;
+            ray_vm_advise_willneed(ray_data(seg),
+                                  (size_t)seg->len * ray_sym_elem_size(seg->type, seg->attrs));
+        }
+
+        result = ray_table_add_col(result, name_id, parted);
+        ray_release(parted);
+        if (!result || RAY_IS_ERR(result)) goto fail_tables;
+    }
+
+    /* Release partition sub-tables (segment vectors survive via retain) */
+    for (int64_t p = 0; p < part_count; p++) {
+        if (part_tables[p]) ray_release(part_tables[p]);
+        ray_sys_free(part_dirs[p]);
+    }
+    ray_sys_free(part_tables);
+    ray_sys_free(part_dirs);
+
+    return result;
+
+fail_tables:
+    for (int64_t p = 0; p < part_count; p++) {
+        if (part_tables[p] && !RAY_IS_ERR(part_tables[p]))
+            ray_release(part_tables[p]);
+    }
+    ray_sys_free(part_tables);
+
+fail_dirs:
+    for (int64_t p = 0; p < part_count; p++)
+        ray_sys_free(part_dirs[p]);
+    ray_sys_free(part_dirs);
+
+    return ray_error("io", NULL);
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/part.h b/crates/rayforce-sys/vendor/rayforce/src/store/part.h
new file mode 100644
index 0000000..aef3b46
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/store/part.h
@@ -0,0 +1,33 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_PART_H
+#define RAY_PART_H
+
+#include <rayforce.h>
+
+/* Partitioned table: db_root holds one directory per partition, each containing table_name */
+ray_t*    ray_part_load(const char* db_root, const char* table_name);   /* NOTE(review): presumably materialises all partitions into one table — confirm */
+ray_t*    ray_read_parted(const char* db_root, const char* table_name); /* zero-copy open: MAPCOMMON key column + parted data columns */
+
+#endif /* RAY_PART_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/serde.c b/crates/rayforce-sys/vendor/rayforce/src/store/serde.c
new file mode 100644
index 0000000..0c27da1
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/store/serde.c
@@ -0,0 +1,984 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_OS_WINDOWS
+#  define _GNU_SOURCE   /* fileno() for fsync-after-fwrite below */
+#endif
+
+#include "serde.h"
+#include "store/col.h"
+#include "store/fileio.h"
+#include "core/types.h"
+#include "mem/heap.h"
+#include "vec/str.h"
+#include "vec/vec.h"
+
+#ifndef RAY_OS_WINDOWS
+#  include <unistd.h>
+#endif
+#include "table/sym.h"
+#include "lang/env.h"
+#include <string.h>
+#include <stdio.h>
+
+/* --------------------------------------------------------------------------
+ * Wire format:
+ *
+ *   byte 0:   type tag (int8_t — negative = atom, positive = vector/compound)
+ *
+ *   Atoms (type < 0): one flags byte (bit 0 = typed null), then the value:
+ *     BOOL/U8:        1 byte value
+ *     I16:            2 bytes
+ *     I32/DATE/TIME:  4 bytes
+ *     F32:            4 bytes (deserialized as an F64 atom)
+ *     I64/TIMESTAMP:  8 bytes
+ *     F64:            8 bytes
+ *     SYM:            null-terminated string (interned on deserialize)
+ *     GUID:           16 bytes
+ *     STR:            i64 length + raw bytes (no null terminator)
+ *
+ *   Vectors (type > 0):
+ *     attrs byte + i64 length + element data + (len+7)/8 null-bitmap bytes if HAS_NULLS
+ *     SYM vector: each element as null-terminated string
+ *     STR vector: each element as i64 length + raw bytes
+ *     LIST: each element recursively serialized (no null bitmap)
+ *
+ *   TABLE/DICT: attrs byte + keys(recursive) + values(recursive)
+ *   LAMBDA:     attrs byte + params(recursive) + body(recursive)
+ *   UNARY/BINARY/VARY: function name (at most 15 bytes) as null-terminated string
+ *   ERROR:      7 bytes of sdata (packed error code) + one zero byte
+ *   NULL (type=0 with len=0): just the type byte
+ * -------------------------------------------------------------------------- */
+
+/* Bounded strlen: index of the first 0 byte within buf[0..max), or max when there is none */
+static size_t safe_strlen(const uint8_t* buf, int64_t max) {
+    int64_t pos = 0;
+    while (pos < max && buf[pos] != 0) pos++;
+    return (size_t)(pos < max ? pos : max);
+}
+
+/* Wire size in bytes of a vector's null bitmap: one bit per element,
+ * rounded up to whole bytes; 0 when the vector's attrs lack HAS_NULLS. */
+static int64_t null_bitmap_size(ray_t* v) {
+    return (v->attrs & RAY_ATTR_HAS_NULLS) ? (v->len + 7) / 8 : 0;
+}
+
+/* Emit the null bitmap of v at buf; returns the number of bytes written
+ * (always null_bitmap_size(v), i.e. 0 when v carries no HAS_NULLS attr).
+ * The bits are obtained through ray_vec_nullmap_bytes, which hides the
+ * storage form (inline, external, HAS_INDEX, slice).  A missing map or a
+ * non-zero bit offset is written as all-zero bytes; per the original note
+ * only slices have an offset, and they are sized as 0 so never get here. */
+static int64_t ser_null_bitmap(uint8_t* buf, ray_t* v) {
+    const int64_t total = null_bitmap_size(v);
+    if (total <= 0) return 0;
+
+    int64_t first_bit = 0, nbits = 0;
+    const uint8_t* src = ray_vec_nullmap_bytes(v, &first_bit, &nbits);
+    if (src && first_bit == 0) {
+        int64_t have = (nbits + 7) / 8;
+        int64_t ncopy = have < total ? have : total;
+        memcpy(buf, src, (size_t)ncopy);
+        if (ncopy < total) memset(buf + ncopy, 0, (size_t)(total - ncopy));
+    } else {
+        memset(buf, 0, (size_t)total);
+    }
+    return total;
+}
+
+/* Inverse of ser_null_bitmap: load v's null bits from buf.  Returns bytes consumed, or -1 (short input / alloc failure). */
+static int64_t de_null_bitmap(const uint8_t* buf, int64_t avail, ray_t* v) {
+    const int64_t nbytes = (v->len + 7) / 8;
+    if (nbytes > avail) return -1;
+
+    v->attrs |= RAY_ATTR_HAS_NULLS;
+    if (v->type != RAY_STR && v->len <= 128) {
+        /* Short non-STR vectors keep their bits in the inline nullmap field. */
+        memcpy(v->nullmap, buf, (size_t)nbytes);
+        return nbytes;
+    }
+
+    /* STR vectors and anything longer than 128 elements use an external U8 vector. */
+    ray_t* map = ray_vec_new(RAY_U8, nbytes);
+    if (!map || RAY_IS_ERR(map)) return -1;
+    map->len = nbytes;
+    memcpy(ray_data(map), buf, (size_t)nbytes);
+    v->attrs |= RAY_ATTR_NULLMAP_EXT;
+    v->ext_nullmap = map;
+    return nbytes;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_serde_size — byte count of obj's serialized form (excluding IPC header)
+ * -------------------------------------------------------------------------- */
+
+int64_t ray_serde_size(ray_t* obj) {
+    if (!obj) return 1; /* RAY_SERDE_NULL marker */
+    if (RAY_IS_ERR(obj)) return 1 + 8; /* type + sdata */
+    if (RAY_IS_NULL(obj)) return 1; /* just the null type byte */
+
+    int8_t type = obj->type;
+
+    /* Atoms (negative type).  Format: type(1) + flags(1) + value-bytes.
+     * `flags` carries the typed-null bit so a deserialize round-trip
+     * restores 0Nl/0Nf/0Nd/0Nt etc. instead of decoding the zero-value
+     * payload as a plain atom (see ray_typed_null / RAY_ATOM_IS_NULL). */
+    if (type < 0) {
+        int8_t base = -type;
+        switch (base) {
+        case RAY_BOOL:
+        case RAY_U8:        return 1 + 1 + 1;
+        case RAY_I16:       return 1 + 1 + 2;
+        case RAY_I32:
+        case RAY_DATE:
+        case RAY_TIME:
+        case RAY_F32:       return 1 + 1 + 4;
+        case RAY_I64:
+        case RAY_TIMESTAMP:
+        case RAY_F64:       return 1 + 1 + 8;
+        case RAY_GUID:      return 1 + 1 + 16;
+        case RAY_SYM: {
+            ray_t* s = ray_sym_str(obj->i64); /* symbol text; a failed lookup is sized as an empty string */
+            return 1 + 1 + (s ? (int64_t)ray_str_len(s) : 0) + 1; /* +1 for null terminator */
+        }
+        case RAY_STR: {
+            return 1 + 1 + 8 + (int64_t)ray_str_len(obj); /* type + flags + i64 length + bytes */
+        }
+        default: return 0; /* atom type with no wire encoding */
+        }
+    }
+
+    /* Null objects were already handled by the RAY_IS_NULL check above. */
+
+    /* Vectors — format: type(1) + attrs(1) + len(8) + data + nullmap */
+    int64_t nbm = null_bitmap_size(obj);
+
+    /* Overflow guard: worst case is GUID at 16 bytes/elem */
+    if (obj->len > (INT64_MAX - 32) / 16) return -1;
+
+    switch (type) {
+    case RAY_BOOL:
+    case RAY_U8:        return 1 + 1 + 8 + obj->len + nbm;
+    case RAY_I16:       return 1 + 1 + 8 + obj->len * 2 + nbm;
+    case RAY_I32:
+    case RAY_DATE:
+    case RAY_TIME:
+    case RAY_F32:       return 1 + 1 + 8 + obj->len * 4 + nbm;
+    case RAY_I64:
+    case RAY_TIMESTAMP:
+    case RAY_F64:       return 1 + 1 + 8 + obj->len * 8 + nbm;
+    case RAY_GUID:      return 1 + 1 + 8 + obj->len * 16 + nbm;
+    case RAY_SYM: {
+        int64_t size = 1 + 1 + 8;
+        int64_t* ids = (int64_t*)ray_data(obj);
+        for (int64_t i = 0; i < obj->len; i++) {
+            ray_t* s = ray_sym_str(ids[i]);
+            size += (s ? (int64_t)ray_str_len(s) : 0) + 1; /* text + null terminator per element */
+        }
+        return size + nbm;
+    }
+    case RAY_STR: {
+        int64_t size = 1 + 1 + 8;
+        ray_str_t* elems = (ray_str_t*)ray_data(obj);
+        for (int64_t i = 0; i < obj->len; i++)
+            size += 8 + elems[i].len; /* i64 length + raw bytes */
+        return size + nbm;
+    }
+    case RAY_LIST: {
+        int64_t size = 1 + 1 + 8;
+        ray_t** elems = (ray_t**)ray_data(obj);
+        for (int64_t i = 0; i < obj->len; i++)
+            size += ray_serde_size(elems[i]); /* child results (including 0 / -1) are summed as-is */
+        return size;
+    }
+    case RAY_TABLE: {
+        /* type + attrs + schema(recursive) + cols(recursive RAY_LIST) */
+        ray_t** slots = (ray_t**)ray_data(obj);
+        return 1 + 1 + ray_serde_size(slots[0]) + ray_serde_size(slots[1]);
+    }
+    case RAY_DICT: {
+        /* type + attrs + keys(recursive) + vals(recursive) */
+        ray_t** slots = (ray_t**)ray_data(obj);
+        return 1 + 1 + ray_serde_size(slots[0]) + ray_serde_size(slots[1]);
+    }
+    case RAY_LAMBDA: {
+        ray_t** slots = (ray_t**)ray_data(obj); /* slot 0 and slot 1 are sized recursively */
+        return 1 + 1 + ray_serde_size(slots[0]) + ray_serde_size(slots[1]);
+    }
+    case RAY_UNARY:
+    case RAY_BINARY:
+    case RAY_VARY: {
+        /* Serialize by name (null-terminated string in nullmap) */
+        const char* name = ray_fn_name(obj);
+        size_t nlen = strlen(name); if (nlen > 15) nlen = 15; /* name is capped at 15 bytes */
+        return 1 + (int64_t)nlen + 1; /* type + name + null terminator */
+    }
+    case RAY_ERROR:
+        return 1 + 8; /* sdata */
+    default:
+        return 0; /* type with no wire encoding */
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * ray_ser_raw — serialize obj into buf (never bounds-checked), returns bytes written; 0 = type with no encoding
+ * -------------------------------------------------------------------------- */
+
+int64_t ray_ser_raw(uint8_t* buf, ray_t* obj) {
+    if (!obj) {                     /* NULL pointer: a single marker byte */
+        buf[0] = RAY_SERDE_NULL;
+        return 1;
+    }
+    if (RAY_IS_ERR(obj)) {          /* error object: tag + 7 sdata bytes + zero byte */
+        buf[0] = (uint8_t)RAY_ERROR;
+        memcpy(buf + 1, obj->sdata, 7);
+        buf[8] = 0;
+        return 1 + 8;
+    }
+
+    int8_t type = obj->type;
+    buf[0] = (uint8_t)type;
+    buf++;                          /* buf now points just past the type tag */
+
+    /* Atoms — format: type(1) + flags(1) + value-bytes.  `flags` bit 0
+     * carries the typed-null marker (nullmap[0] & 1 on the source atom)
+     * so (de (ser 0Nl)) roundtrips instead of decoding as plain 0. */
+    if (type < 0) {
+        uint8_t aflags = (uint8_t)(obj->nullmap[0] & 1);
+        buf[0] = aflags;
+        buf++;
+        int8_t base = -type;
+        switch (base) {
+        case RAY_BOOL:
+        case RAY_U8:
+            buf[0] = obj->u8;
+            return 1 + 1 + 1;
+        case RAY_I16:
+            memcpy(buf, &obj->i16, 2);
+            return 1 + 1 + 2;
+        case RAY_I32:
+        case RAY_DATE:
+        case RAY_TIME:
+            memcpy(buf, &obj->i32, 4);
+            return 1 + 1 + 4;
+        case RAY_F32:
+            memcpy(buf, &obj->i32, 4); /* same 4-byte slot */
+            return 1 + 1 + 4;
+        case RAY_I64:
+        case RAY_TIMESTAMP:
+            memcpy(buf, &obj->i64, 8);
+            return 1 + 1 + 8;
+        case RAY_F64:
+            memcpy(buf, &obj->f64, 8);
+            return 1 + 1 + 8;
+        case RAY_GUID: {
+            /* GUID atom stored via obj pointer to 16-byte data */
+            ray_t* gv = obj->obj;
+            if (gv) memcpy(buf, ray_data(gv), 16);
+            else    memset(buf, 0, 16);   /* no backing object: emit 16 zero bytes */
+            return 1 + 1 + 16;
+        }
+        case RAY_SYM: {
+            ray_t* s = ray_sym_str(obj->i64);
+            if (s) {
+                size_t slen = ray_str_len(s);
+                memcpy(buf, ray_str_ptr(s), slen);
+                buf[slen] = '\0';
+                return 1 + 1 + (int64_t)slen + 1;
+            }
+            buf[0] = '\0';                /* failed lookup: written as the empty string */
+            return 1 + 1 + 1;
+        }
+        case RAY_STR: {
+            size_t slen = ray_str_len(obj);
+            const char* p = ray_str_ptr(obj);
+            if (!p) { p = ""; slen = 0; } /* no character data: write length 0 */
+            int64_t n = (int64_t)slen;
+            memcpy(buf, &n, 8);
+            memcpy(buf + 8, p, slen);
+            return 1 + 1 + 8 + (int64_t)slen;
+        }
+        default: return 0;                /* type and flags bytes were written, but 0 is reported */
+        }
+    }
+
+    /* Vectors and compound types */
+    int64_t c;                            /* running byte count for the current case */
+
+    /* Attrs byte: preserve HAS_NULLS, clear SLICE/NULLMAP_EXT/ARENA (internal flags) */
+    uint8_t wire_attrs = obj->attrs & (RAY_ATTR_HAS_NULLS);
+
+    switch (type) {
+    case RAY_BOOL:
+    case RAY_U8: {
+        buf[0] = wire_attrs; buf++;
+        memcpy(buf, &obj->len, 8); buf += 8;
+        memcpy(buf, ray_data(obj), obj->len);
+        c = 1 + 1 + 8 + obj->len;
+        c += ser_null_bitmap(buf + obj->len, obj);   /* bitmap (if any) follows the element data */
+        return c;
+    }
+    case RAY_I16: {
+        buf[0] = wire_attrs; buf++;
+        memcpy(buf, &obj->len, 8); buf += 8;
+        int64_t dsz = obj->len * 2;
+        memcpy(buf, ray_data(obj), dsz);
+        c = 1 + 1 + 8 + dsz;
+        c += ser_null_bitmap(buf + dsz, obj);
+        return c;
+    }
+    case RAY_I32:
+    case RAY_DATE:
+    case RAY_TIME:
+    case RAY_F32: {
+        buf[0] = wire_attrs; buf++;
+        memcpy(buf, &obj->len, 8); buf += 8;
+        int64_t dsz = obj->len * 4;
+        memcpy(buf, ray_data(obj), dsz);
+        c = 1 + 1 + 8 + dsz;
+        c += ser_null_bitmap(buf + dsz, obj);
+        return c;
+    }
+    case RAY_I64:
+    case RAY_TIMESTAMP:
+    case RAY_F64: {
+        buf[0] = wire_attrs; buf++;
+        memcpy(buf, &obj->len, 8); buf += 8;
+        int64_t dsz = obj->len * 8;
+        memcpy(buf, ray_data(obj), dsz);
+        c = 1 + 1 + 8 + dsz;
+        c += ser_null_bitmap(buf + dsz, obj);
+        return c;
+    }
+    case RAY_GUID: {
+        buf[0] = wire_attrs; buf++;
+        memcpy(buf, &obj->len, 8); buf += 8;
+        int64_t dsz = obj->len * 16;
+        memcpy(buf, ray_data(obj), dsz);
+        c = 1 + 1 + 8 + dsz;
+        c += ser_null_bitmap(buf + dsz, obj);
+        return c;
+    }
+    case RAY_SYM: {
+        buf[0] = wire_attrs; buf++;
+        memcpy(buf, &obj->len, 8); buf += 8;
+        int64_t* ids = (int64_t*)ray_data(obj);      /* NOTE(review): assumes 64-bit sym ids — confirm for narrower SYM widths */
+        c = 0;
+        for (int64_t i = 0; i < obj->len; i++) {
+            ray_t* s = ray_sym_str(ids[i]);
+            if (s) {
+                size_t slen = ray_str_len(s);
+                memcpy(buf + c, ray_str_ptr(s), slen);
+                c += (int64_t)slen;
+            }
+            buf[c] = '\0';                           /* terminator is written even when the lookup failed */
+            c++;
+        }
+        c += ser_null_bitmap(buf + c, obj);
+        return 1 + 1 + 8 + c;
+    }
+
+    case RAY_STR: {
+        buf[0] = wire_attrs; buf++;
+        memcpy(buf, &obj->len, 8); buf += 8;
+        ray_str_t* elems = (ray_str_t*)ray_data(obj);
+        const char* pool = obj->str_pool ? (const char*)ray_data(obj->str_pool) : NULL;
+        c = 0;
+        for (int64_t i = 0; i < obj->len; i++) {
+            int64_t slen = (int64_t)elems[i].len;
+            memcpy(buf + c, &slen, 8);               /* per-element i64 length prefix */
+            c += 8;
+            const char* p = ray_str_t_ptr(&elems[i], pool);
+            memcpy(buf + c, p, (size_t)slen);
+            c += slen;
+        }
+        c += ser_null_bitmap(buf + c, obj);
+        return 1 + 1 + 8 + c;
+    }
+
+    case RAY_LIST: {
+        buf[0] = obj->attrs;                         /* NOTE(review): written unmasked, unlike wire_attrs — confirm intended */
+        buf++;
+        memcpy(buf, &obj->len, 8);
+        buf += 8;
+        ray_t** elems = (ray_t**)ray_data(obj);
+        c = 0;
+        for (int64_t i = 0; i < obj->len; i++)
+            c += ray_ser_raw(buf + c, elems[i]);     /* each element is a complete nested encoding */
+        return 1 + 1 + 8 + c;
+    }
+
+    case RAY_TABLE: {
+        /* Layout: type + attrs + schema(recursive) + cols(recursive RAY_LIST) */
+        buf[0] = obj->attrs;
+        buf++;
+        ray_t** slots = (ray_t**)ray_data(obj);
+        c = ray_ser_raw(buf, slots[0]);          /* schema (RAY_I64 vector) */
+        c += ray_ser_raw(buf + c, slots[1]);     /* cols (RAY_LIST) */
+        return 1 + 1 + c;
+    }
+
+    case RAY_DICT: {
+        buf[0] = obj->attrs;
+        buf++;
+        ray_t** slots = (ray_t**)ray_data(obj);
+        c = ray_ser_raw(buf, slots[0]);          /* keys */
+        c += ray_ser_raw(buf + c, slots[1]);     /* values */
+        return 1 + 1 + c;
+    }
+
+    case RAY_LAMBDA: {
+        buf[0] = obj->attrs;
+        buf++;
+        ray_t** slots = (ray_t**)ray_data(obj);
+        c = ray_ser_raw(buf, slots[0]);     /* params */
+        c += ray_ser_raw(buf + c, slots[1]); /* body */
+        return 1 + 1 + c;
+    }
+
+    case RAY_UNARY:
+    case RAY_BINARY:
+    case RAY_VARY: {
+        /* Serialize builtin by name (null-terminated) */
+        const char* name = ray_fn_name(obj);
+        size_t nlen = strlen(name); if (nlen > 15) nlen = 15;   /* longer names are truncated to 15 bytes */
+        memcpy(buf, name, nlen);
+        buf[nlen] = 0;
+        return 1 + (int64_t)nlen + 1;
+    }
+
+    case RAY_ERROR:
+        memcpy(buf, obj->sdata, 7);          /* same payload as the RAY_IS_ERR path above */
+        buf[7] = 0;
+        return 1 + 8;
+
+    default:
+        return 0;                            /* type byte was written, but 0 is reported */
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * ray_de_raw — deserialize from buffer
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_de_raw(uint8_t* buf, int64_t* len) {
+    if (*len < 1) return NULL;
+
+    int8_t type = (int8_t)buf[0];
+    buf++;
+    (*len)--;
+
+    /* Null */
+    if ((uint8_t)type == RAY_SERDE_NULL) return NULL;
+
+    /* Atoms — read 1-byte flags (typed-null bit) before the value.  If
+     * the null bit is set we always return ray_typed_null(type) regardless
+     * of the value bytes, which are still read/skipped to keep the buffer
+     * position in sync with the serialized length. */
+    if (type < 0) {
+        if (*len < 1) return ray_error("domain", NULL);
+        uint8_t aflags = buf[0];
+        buf++; (*len)--;
+        bool is_null = (aflags & 1) != 0;
+        int8_t base = -type;
+        switch (base) {
+        case RAY_BOOL:
+            if (*len < 1) return ray_error("domain", NULL);
+            (*len)--;
+            return is_null ? ray_typed_null(type) : ray_bool(buf[0]);
+        case RAY_U8:
+            if (*len < 1) return ray_error("domain", NULL);
+            (*len)--;
+            return is_null ? ray_typed_null(type) : ray_u8(buf[0]);
+        case RAY_I16:
+            if (*len < 2) return ray_error("domain", NULL);
+            { int16_t v; memcpy(&v, buf, 2); *len -= 2;
+              return is_null ? ray_typed_null(type) : ray_i16(v); }
+        case RAY_I32:
+            if (*len < 4) return ray_error("domain", NULL);
+            { int32_t v; memcpy(&v, buf, 4); *len -= 4;
+              return is_null ? ray_typed_null(type) : ray_i32(v); }
+        case RAY_DATE:
+            if (*len < 4) return ray_error("domain", NULL);
+            { int32_t v; memcpy(&v, buf, 4); *len -= 4;
+              return is_null ? ray_typed_null(type) : ray_date((int64_t)v); }
+        case RAY_TIME:
+            if (*len < 4) return ray_error("domain", NULL);
+            { int32_t v; memcpy(&v, buf, 4); *len -= 4;
+              return is_null ? ray_typed_null(type) : ray_time((int64_t)v); }
+        case RAY_F32:
+            if (*len < 4) return ray_error("domain", NULL);
+            { float v; memcpy(&v, buf, 4); *len -= 4;
+              return is_null ? ray_typed_null(-RAY_F64)
+                             : ray_f64((double)v); /* promote to f64 atom */ }
+        case RAY_I64:
+            if (*len < 8) return ray_error("domain", NULL);
+            { int64_t v; memcpy(&v, buf, 8); *len -= 8;
+              return is_null ? ray_typed_null(type) : ray_i64(v); }
+        case RAY_TIMESTAMP:
+            if (*len < 8) return ray_error("domain", NULL);
+            { int64_t v; memcpy(&v, buf, 8); *len -= 8;
+              return is_null ? ray_typed_null(type) : ray_timestamp(v); }
+        case RAY_F64:
+            if (*len < 8) return ray_error("domain", NULL);
+            { double v; memcpy(&v, buf, 8); *len -= 8;
+              return is_null ? ray_typed_null(type) : ray_f64(v); }
+        case RAY_GUID:
+            if (*len < 16) return ray_error("domain", NULL);
+            *len -= 16;
+            return is_null ? ray_typed_null(type) : ray_guid(buf);
+        case RAY_SYM: {
+            size_t slen = safe_strlen(buf, *len);
+            if ((int64_t)slen >= *len) return ray_error("domain", NULL);
+            *len -= (int64_t)slen + 1;
+            if (is_null) return ray_typed_null(type);
+            int64_t id = ray_sym_intern((const char*)buf, slen);
+            return ray_sym(id);
+        }
+        case RAY_STR: {
+            if (*len < 8) return ray_error("domain", NULL);
+            int64_t slen; memcpy(&slen, buf, 8);
+            buf += 8; *len -= 8;
+            if (*len < slen || slen < 0) return ray_error("domain", NULL);
+            *len -= slen;
+            if (is_null) return ray_typed_null(type);
+            return ray_str((const char*)buf, (size_t)slen);
+        }
+        default:
+            return ray_error("type", NULL);
+        }
+    }
+
+    /* Vectors and compounds */
+    int64_t l;
+
+    switch (type) {
+    case RAY_BOOL:
+    case RAY_U8:
+    case RAY_I16:
+    case RAY_I32:
+    case RAY_DATE:
+    case RAY_TIME:
+    case RAY_F32:
+    case RAY_I64:
+    case RAY_TIMESTAMP:
+    case RAY_F64:
+    case RAY_GUID: {
+        if (*len < 9) return ray_error("domain", NULL);
+        uint8_t attrs = buf[0];
+        buf++;
+        memcpy(&l, buf, 8);
+        buf += 8;
+        *len -= 9;
+
+        if (l < 0 || l > 1000000000) return ray_error("domain", NULL);
+
+        uint8_t esz = ray_type_sizes[type];
+        int64_t data_bytes = l * esz;
+        if (*len < data_bytes) return ray_error("domain", NULL);
+
+        ray_t* vec = ray_vec_from_raw(type, buf, l);
+        if (!vec || RAY_IS_ERR(vec)) return vec;
+        buf += data_bytes;
+        *len -= data_bytes;
+
+        /* Restore null bitmap if present */
+        if (attrs & RAY_ATTR_HAS_NULLS) {
+            int64_t consumed = de_null_bitmap(buf, *len, vec);
+            if (consumed < 0) { ray_release(vec); return ray_error("domain", NULL); }
+            buf += consumed;
+            *len -= consumed;
+        }
+        return vec;
+    }
+
+    case RAY_SYM: {
+        if (*len < 9) return ray_error("domain", NULL);
+        uint8_t attrs = buf[0];
+        buf++;
+        memcpy(&l, buf, 8);
+        buf += 8;
+        *len -= 9;
+
+        if (l < 0 || l > 1000000000) return ray_error("domain", NULL);
+
+        ray_t* vec = ray_vec_new(RAY_SYM, l);
+        if (!vec || RAY_IS_ERR(vec)) return vec;
+        vec->len = l;
+        int64_t* ids = (int64_t*)ray_data(vec);
+        for (int64_t i = 0; i < l; i++) {
+            size_t slen = safe_strlen(buf, *len);
+            if ((int64_t)slen >= *len) {
+                vec->len = i;
+                ray_release(vec);
+                return ray_error("domain", NULL);
+            }
+            ids[i] = ray_sym_intern((const char*)buf, slen);
+            buf += slen + 1;
+            *len -= (int64_t)slen + 1;
+        }
+
+        if (attrs & RAY_ATTR_HAS_NULLS) {
+            int64_t consumed = de_null_bitmap(buf, *len, vec);
+            if (consumed < 0) { ray_release(vec); return ray_error("domain", NULL); }
+            buf += consumed;
+            *len -= consumed;
+        }
+        return vec;
+    }
+
+    case RAY_STR: {
+        if (*len < 9) return ray_error("domain", NULL);
+        uint8_t attrs = buf[0];
+        buf++;
+        memcpy(&l, buf, 8);
+        buf += 8;
+        *len -= 9;
+
+        if (l < 0 || l > 1000000000) return ray_error("domain", NULL);
+
+        /* Build STR vector by appending each string via ray_str_vec_append */
+        ray_t* vec = ray_vec_new(RAY_STR, l);
+        if (!vec || RAY_IS_ERR(vec)) return vec;
+        vec->len = 0;
+        for (int64_t i = 0; i < l; i++) {
+            if (*len < 8) { ray_release(vec); return ray_error("domain", NULL); }
+            int64_t slen; memcpy(&slen, buf, 8);
+            buf += 8; *len -= 8;
+            if (*len < slen || slen < 0) { ray_release(vec); return ray_error("domain", NULL); }
+            ray_t* nv = ray_str_vec_append(vec, (const char*)buf, (size_t)slen);
+            if (!nv || RAY_IS_ERR(nv)) { ray_release(vec); return nv ? nv : ray_error("oom", NULL); }
+            vec = nv;
+            buf += slen;
+            *len -= slen;
+        }
+
+        if (attrs & RAY_ATTR_HAS_NULLS) {
+            int64_t consumed = de_null_bitmap(buf, *len, vec);
+            if (consumed < 0) { ray_release(vec); return ray_error("domain", NULL); }
+            buf += consumed;
+            *len -= consumed;
+        }
+        return vec;
+    }
+
+    case RAY_LIST: {
+        if (*len < 9) return ray_error("domain", NULL);
+        uint8_t list_attrs = buf[0];
+        buf++;
+        memcpy(&l, buf, 8);
+        buf += 8;
+        *len -= 9;
+
+        if (l < 0 || l > 1000000000) return ray_error("domain", NULL);
+
+        ray_t* list = ray_alloc(l * sizeof(ray_t*));
+        if (!list || RAY_IS_ERR(list)) return list;
+        list->type = RAY_LIST;
+        list->attrs = list_attrs;
+        list->len = l;
+        ray_t** elems = (ray_t**)ray_data(list);
+
+        int64_t saved = *len;
+        for (int64_t i = 0; i < l; i++) {
+            elems[i] = ray_de_raw(buf + (saved - *len), len);
+            if (!elems[i] || RAY_IS_ERR(elems[i])) {
+                /* Clean up already-deserialized elements */
+                for (int64_t j = 0; j < i; j++) ray_release(elems[j]);
+                list->len = 0;
+                ray_release(list);
+                return elems[i] ? elems[i] : ray_error("domain", NULL);
+            }
+        }
+        return list;
+    }
+
+    case RAY_TABLE: {
+        if (*len < 1) return ray_error("domain", NULL);
+        /* uint8_t tbl_attrs = buf[0]; — tables rebuild attrs via ray_table_add_col */
+        buf++;
+        *len -= 1;
+
+        int64_t saved = *len;
+        /* Deserialize schema (I64 vector of sym IDs) */
+        ray_t* schema = ray_de_raw(buf, len);
+        if (!schema || RAY_IS_ERR(schema)) return schema;
+
+        /* Deserialize columns (as LIST) */
+        ray_t* cols = ray_de_raw(buf + (saved - *len), len);
+        if (!cols || RAY_IS_ERR(cols)) {
+            ray_release(schema);
+            return cols;
+        }
+
+        /* Reconstruct table */
+        if (cols->type != RAY_LIST || schema->type != RAY_I64) {
+            ray_release(schema);
+            ray_release(cols);
+            return ray_error("domain", NULL);
+        }
+
+        int64_t ncols = cols->len;
+        ray_t* tbl = ray_table_new(ncols);
+        if (!tbl || RAY_IS_ERR(tbl)) {
+            ray_release(schema);
+            ray_release(cols);
+            return tbl;
+        }
+
+        int64_t* name_ids = (int64_t*)ray_data(schema);
+        ray_t** col_ptrs = (ray_t**)ray_data(cols);
+        for (int64_t i = 0; i < ncols && i < schema->len; i++) {
+            ray_t* new_tbl = ray_table_add_col(tbl, name_ids[i], col_ptrs[i]);
+            if (!new_tbl || RAY_IS_ERR(new_tbl)) {
+                ray_release(tbl);
+                ray_release(schema);
+                ray_release(cols);
+                return new_tbl;
+            }
+            tbl = new_tbl;
+        }
+
+        ray_release(schema);
+        ray_release(cols);
+        return tbl;
+    }
+
+    case RAY_DICT: {
+        if (*len < 1) return ray_error("domain", NULL);
+        uint8_t dict_attrs = buf[0];
+        buf++;
+        *len -= 1;
+
+        int64_t saved = *len;
+        ray_t* keys = ray_de_raw(buf, len);
+        if (!keys || RAY_IS_ERR(keys)) return keys;
+
+        ray_t* vals = ray_de_raw(buf + (saved - *len), len);
+        if (!vals || RAY_IS_ERR(vals)) {
+            ray_release(keys);
+            return vals;
+        }
+
+        /* Build dict: alloc with 2 slots */
+        ray_t* dict = ray_alloc(2 * sizeof(ray_t*));
+        if (!dict || RAY_IS_ERR(dict)) {
+            ray_release(keys);
+            ray_release(vals);
+            return dict;
+        }
+        dict->type = RAY_DICT;
+        dict->attrs = dict_attrs;
+        dict->len = 2;
+        ((ray_t**)ray_data(dict))[0] = keys;
+        ((ray_t**)ray_data(dict))[1] = vals;
+        return dict;
+    }
+
+    case RAY_LAMBDA: {
+        if (*len < 1) return ray_error("domain", NULL);
+        uint8_t lam_attrs = buf[0];
+        buf++;
+        *len -= 1;
+
+        int64_t saved = *len;
+        ray_t* params = ray_de_raw(buf, len);
+        if (!params || RAY_IS_ERR(params)) return params;
+
+        ray_t* body = ray_de_raw(buf + (saved - *len), len);
+        if (!body || RAY_IS_ERR(body)) {
+            ray_release(params);
+            return body;
+        }
+
+        /* Build lambda: allocate with 7 slots (same as eval.c) */
+        ray_t* lambda = ray_alloc(7 * sizeof(ray_t*));
+        if (!lambda || RAY_IS_ERR(lambda)) {
+            ray_release(params);
+            ray_release(body);
+            return lambda;
+        }
+        lambda->type = RAY_LAMBDA;
+        lambda->attrs = lam_attrs;
+        lambda->len = 0;
+        memset(ray_data(lambda), 0, 7 * sizeof(ray_t*));
+        ((ray_t**)ray_data(lambda))[0] = params;
+        ((ray_t**)ray_data(lambda))[1] = body;
+        return lambda;
+    }
+
+    case RAY_UNARY:
+    case RAY_BINARY:
+    case RAY_VARY: {
+        /* Deserialize builtin by name: read null-terminated string,
+         * look up in the global environment. */
+        size_t nlen = safe_strlen(buf, *len);
+        if ((int64_t)nlen >= *len) return ray_error("domain", NULL);
+        int64_t sym = ray_sym_intern((const char*)buf, nlen);
+        *len -= (int64_t)nlen + 1;
+        ray_t* fn = ray_env_get(sym);
+        if (!fn) return ray_error("name", NULL);
+        ray_retain(fn);
+        return fn;
+    }
+
+    case RAY_ERROR: {
+        if (*len < 8) return ray_error("domain", NULL);
+        ray_t* err = ray_error((const char*)buf, NULL);
+        *len -= 8;
+        return err;
+    }
+
+    default:
+        return ray_error("type", NULL);
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * ray_ser — encode `obj` as one U8 vector: 16-byte IPC header + payload.
+ * Yields a "domain" error when ray_serde_size() is non-positive or when
+ * ray_ser_raw() writes nothing; a failed ray_vec_new() result is returned
+ * to the caller unchanged.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_ser(ray_t* obj) {
+    const int64_t hdr_bytes = (int64_t)sizeof(ray_ipc_header_t);
+    int64_t body_bytes = ray_serde_size(obj);
+    if (body_bytes < 0) return ray_error("domain", "serialization overflow");
+    if (body_bytes == 0) return ray_error("domain", NULL);
+
+    ray_t* out = ray_vec_new(RAY_U8, hdr_bytes + body_bytes);
+    if (!out || RAY_IS_ERR(out)) return out;
+    out->len = hdr_bytes + body_bytes;
+
+    /* flags/endian/msgtype are all written as 0 (little-endian, async). */
+    uint8_t* raw = (uint8_t*)ray_data(out);
+    *(ray_ipc_header_t*)raw = (ray_ipc_header_t){
+        .prefix = RAY_SERDE_PREFIX, .version = RAY_SERDE_WIRE_VERSION,
+        .flags = 0, .endian = 0, .msgtype = 0, .size = body_bytes,
+    };
+    if (ray_ser_raw(raw + hdr_bytes, obj) == 0) {
+        ray_release(out);
+        return ray_error("domain", NULL);
+    }
+    return out;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_de — decode a byte vector laid out as IPC header + payload.
+ *
+ * Input must have type RAY_U8 or -RAY_U8 (else "type").  The header is
+ * checked before the payload is touched: prefix, wire version ("version"
+ * on mismatch), 0 <= size <= 1e9, and size + header == vector length.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_de(ray_t* bytes) {
+    const int64_t hdr_bytes = (int64_t)sizeof(ray_ipc_header_t);
+
+    if (!bytes || RAY_IS_ERR(bytes)) return ray_error("type", NULL);
+    if (!(bytes->type == RAY_U8 || bytes->type == -RAY_U8))
+        return ray_error("type", NULL);
+
+    uint8_t* raw = (uint8_t*)ray_data(bytes);
+    if (bytes->len < hdr_bytes) return ray_error("domain", NULL);
+
+    const ray_ipc_header_t* hdr = (const ray_ipc_header_t*)raw;
+    if (hdr->prefix != RAY_SERDE_PREFIX) return ray_error("domain", NULL);
+    if (hdr->version != RAY_SERDE_WIRE_VERSION)
+        return ray_error("version", "serde wire version mismatch");
+
+    int64_t remaining = hdr->size;
+    if (remaining < 0 || remaining > 1000000000 || remaining + hdr_bytes != bytes->len)
+        return ray_error("domain", NULL);
+    return ray_de_raw(raw + hdr_bytes, &remaining);
+}
+
+/* --------------------------------------------------------------------------
+ * File I/O: save/load any object in binary format
+ * -------------------------------------------------------------------------- */
+
+/* Serialize `obj` with ray_ser() and write the bytes to `path` ("wb").
+ *
+ * Returns RAY_OK on success, RAY_ERR_DOMAIN when serialization fails, and
+ * RAY_ERR_IO when fopen, fwrite, fflush, fsync or fclose reports failure.
+ * The serialized buffer is released on every path, and a stream that was
+ * opened is always closed; a partially written file is left in place. */
+ray_err_t ray_obj_save(ray_t* obj, const char* path) {
+    ray_t* blob = ray_ser(obj);
+    if (!blob) return RAY_ERR_DOMAIN;
+    if (RAY_IS_ERR(blob)) {
+        ray_error_free(blob);
+        return RAY_ERR_DOMAIN;
+    }
+
+    FILE* fp = fopen(path, "wb");
+    if (!fp) {
+        ray_release(blob);
+        return RAY_ERR_IO;
+    }
+
+    /* Durability: fflush + fsync run BEFORE fclose so a buffered write
+     * hitting ENOSPC inside fclose doesn't slip through silently.
+     * Callers (esp. ray_journal_snapshot) write to a .tmp then rename
+     * — without this fsync the .tmp may be empty/partial on disk
+     * when the rename atomically swaps it in.  Each step only runs if
+     * the previous one succeeded. */
+    size_t want = (size_t)blob->len;
+    int ok = fwrite(ray_data(blob), 1, want, fp) == want;
+    if (ok) ok = fflush(fp) == 0;
+#ifndef RAY_OS_WINDOWS
+    if (ok) ok = fsync(fileno(fp)) == 0;
+#endif
+
+    /* fclose itself can fail (final flush of any platform-level buffer). */
+    if (fclose(fp) != 0) ok = 0;
+    ray_release(blob);
+    return ok ? RAY_OK : RAY_ERR_IO;
+}
+
+/* Read the whole file at `path` into a U8 vector and decode it with ray_de().
+ * Any stdio failure, an empty file, or a short read yields an "io" error. */
+ray_t* ray_obj_load(const char* path) {
+    FILE* fp = fopen(path, "rb");
+    if (!fp) return ray_error("io", NULL);
+
+    /* Size probe: every stdio return value is checked, first failure wins. */
+    const char* why = NULL;
+    long nbytes = -1;
+    if (fseek(fp, 0, SEEK_END) != 0)      why = "fseek end";
+    else if ((nbytes = ftell(fp)) < 0)    why = "ftell";
+    else if (fseek(fp, 0, SEEK_SET) != 0) why = "fseek set";
+    else if (nbytes == 0)                 why = "empty file";
+    if (why) { fclose(fp); return ray_error("io", why); }
+
+    ray_t* raw = ray_vec_new(RAY_U8, nbytes);
+    if (!raw || RAY_IS_ERR(raw)) { fclose(fp); return raw; }
+    raw->len = nbytes;
+
+    size_t got = fread(ray_data(raw), 1, (size_t)nbytes, fp);
+    fclose(fp);
+    if ((long)got != nbytes) { ray_release(raw); return ray_error("io", "short read"); }
+    ray_t* decoded = ray_de(raw);
+    ray_release(raw);
+    return decoded;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/serde.h b/crates/rayforce-sys/vendor/rayforce/src/store/serde.h
new file mode 100644
index 0000000..d0d6a11
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/store/serde.h
@@ -0,0 +1,81 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+ *
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_SERDE_H
+#define RAY_SERDE_H
+
+#include <rayforce.h>
+
+/* Wire format prefix */
+#define RAY_SERDE_PREFIX 0xcefadefa
+
+/* Wire format version.  Bumped whenever the on-the-wire layout of any
+ * serialized value changes (e.g. a new field is added to the atom
+ * record) so a peer running older code detects the mismatch and
+ * rejects the payload instead of silently mis-parsing.  Decoupled from
+ * RAY_VERSION_MAJOR on purpose: API version and wire version evolve
+ * independently.
+ *
+ *   Version 2 — atoms: type(1) + value-bytes.
+ *   Version 3 — atoms: type(1) + flags(1) + value-bytes.  `flags` bit 0
+ *               carries the typed-null marker so (de (ser 0Nl)) round-
+ *               trips (previously decoded as ray_i64(0) and dropped the
+ *               null bit). */
+#define RAY_SERDE_WIRE_VERSION 3
+
+/* Wire-only null marker (not a valid ray_t type) */
+#define RAY_SERDE_NULL 126
+
+typedef struct ray_ipc_header_t {
+    uint32_t prefix;     /* RAY_SERDE_PREFIX */
+    uint8_t  version;    /* RAY_SERDE_WIRE_VERSION (not RAY_VERSION_MAJOR) */
+    uint8_t  flags;      /* 0 */
+    uint8_t  endian;     /* 0 = little */
+    uint8_t  msgtype;    /* 0 = async, 1 = sync, 2 = response */
+    int64_t  size;       /* payload size in bytes */
+} ray_ipc_header_t;
+
+_Static_assert(sizeof(ray_ipc_header_t) == 16, "ipc header must be 16 bytes");
+
+/* Calculate serialized size of an object (excluding IPC header) */
+int64_t ray_serde_size(ray_t* obj);
+
+/* Serialize object into buffer. Returns bytes written, 0 on error.
+ * Buffer must have at least ray_serde_size(obj) bytes. */
+int64_t ray_ser_raw(uint8_t* buf, ray_t* obj);
+
+/* Deserialize object from buffer. Returns reconstructed ray_t* or an error.
+ * On entry *len is the bytes available; consumed bytes are subtracted. */
+ray_t*  ray_de_raw(uint8_t* buf, int64_t* len);
+
+/* Top-level: serialize to U8 vector with IPC header */
+ray_t*  ray_ser(ray_t* obj);
+
+/* Top-level: deserialize from U8 vector (validates IPC header) */
+ray_t*  ray_de(ray_t* bytes);
+
+/* File I/O: save/load any object in binary format */
+ray_err_t ray_obj_save(ray_t* obj, const char* path);
+ray_t*    ray_obj_load(const char* path);
+
+#endif /* RAY_SERDE_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/splay.c b/crates/rayforce-sys/vendor/rayforce/src/store/splay.c
new file mode 100644
index 0000000..32ce082
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/store/splay.c
@@ -0,0 +1,229 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "splay.h"
+#include "store/col.h"
+#include "store/fileio.h"
+#include <string.h>
+#include <stdio.h>
+
+/* --------------------------------------------------------------------------
+ * Splayed table: directory of column files + .d schema file
+ *
+ * Format:
+ *   dir/.d        — I64 vector of column name symbol IDs
+ *   dir/<colname> — column file per column
+ *
+ * No symlink check: local-trust file format; path traversal checks
+ * (rejecting '/', '\\', '..', leading '.') cover main attack vector.
+ * -------------------------------------------------------------------------- */
+
+/* Post-load check, active only while ray_sym_count() is zero: a table that
+ * has a RAY_SYM column, or that loaded no columns although the schema
+ * listed some, is reported as RAY_ERR_CORRUPT. */
+static ray_err_t validate_sym_columns(ray_t* tbl, int64_t schema_ncols) {
+    if (ray_sym_count() == 0) {
+        int64_t loaded = ray_table_ncols(tbl);
+        if (loaded == 0 && schema_ncols > 0) return RAY_ERR_CORRUPT;
+        for (int64_t i = 0; i < loaded; i++) {
+            ray_t* column = ray_table_get_col_idx(tbl, i);
+            if (column != NULL && column->type == RAY_SYM) return RAY_ERR_CORRUPT;
+        }
+    }
+    return RAY_OK;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_splay_save — save a table to a splayed table directory
+ *
+ * Steps, in order: create `dir` and missing parents, save the symbol table
+ * to `sym_path` when one is given, write the schema to "<dir>/.d", then
+ * write one file per column.  The first failing step's code is returned
+ * and files written before it stay on disk.  A path that does not fit the
+ * 1024-byte buffer gives RAY_ERR_RANGE.
+ * -------------------------------------------------------------------------- */
+
+ray_err_t ray_splay_save(ray_t* tbl, const char* dir, const char* sym_path) {
+    char path[1024];
+    int n;
+    ray_err_t rc;
+
+    if (!tbl || RAY_IS_ERR(tbl)) return RAY_ERR_TYPE;
+    if (!dir) return RAY_ERR_IO;
+
+    /* mkdir -p: e.g. "/db/2024.01.01/t/" before the partition dir exists. */
+    rc = ray_mkdir_p(dir);
+    if (rc != RAY_OK) return rc;
+
+    if (sym_path != NULL) {
+        rc = ray_sym_save(sym_path);
+        if (rc != RAY_OK) return rc;
+    }
+
+    int64_t ncols = ray_table_ncols(tbl);
+    ray_t* schema = ray_table_schema(tbl);
+    if (schema != NULL) {   /* a NULL schema is skipped without error */
+        n = snprintf(path, sizeof(path), "%s/.d", dir);
+        if (n < 0 || (size_t)n >= sizeof(path)) return RAY_ERR_RANGE;
+        rc = ray_col_save(schema, path);
+        if (rc != RAY_OK) return rc;
+    }
+
+    for (int64_t i = 0; i < ncols; i++) {
+        ray_t* col = ray_table_get_col_idx(tbl, i);
+        int64_t name_id = ray_table_col_name(tbl, i);
+        if (!col) continue;
+
+        ray_t* name_atom = ray_sym_str(name_id);
+        if (!name_atom) continue;
+        const char* name = ray_str_ptr(name_atom);
+        size_t name_len = ray_str_len(name_atom);
+
+        /* Unusable names (empty, leading '.', or containing '/', '\\', NUL)
+         * are skipped, not reported.  NOTE(review): the .d schema was already
+         * written — confirm a skipped column cannot break a later load. */
+        bool bad_name = name_len == 0 || name[0] == '.'
+            || memchr(name, '/', name_len) || memchr(name, '\\', name_len)
+            || memchr(name, '\0', name_len);
+        if (bad_name) continue;
+
+        n = snprintf(path, sizeof(path), "%s/%.*s", dir, (int)name_len, name);
+        if (n < 0 || (size_t)n >= sizeof(path)) return RAY_ERR_RANGE;
+
+        /* On failure, columns 0..i-1 remain on disk; cleanup is the caller's. */
+        rc = ray_col_save(col, path);
+        if (rc != RAY_OK) return rc;
+    }
+    return RAY_OK;
+}
+
+/* --------------------------------------------------------------------------
+ * splay_load_impl — shared implementation for ray_splay_load / ray_read_splayed
+ * When use_mmap is false, columns are loaded via ray_col_load (buddy copy).
+ * When use_mmap is true, columns are loaded via ray_col_mmap (zero-copy),
+ * with a ray_col_load retry for a column whose mmap reports "nyi".
+ * The .d schema is always loaded via ray_col_load and must be RAY_I64.
+ * -------------------------------------------------------------------------- */
+
+static ray_t* splay_load_impl(const char* dir, const char* sym_path, bool use_mmap) {
+    if (!dir) return ray_error("io", NULL);
+
+    /* Load symbol table if sym_path provided */
+    if (sym_path) {
+        ray_err_t sym_err = ray_sym_load(sym_path);
+        if (sym_err != RAY_OK) return ray_error(ray_err_code_str(sym_err), NULL);
+    }
+
+    /* Load .d schema */
+    char path[1024];
+    int path_len = snprintf(path, sizeof(path), "%s/.d", dir);
+    if (path_len < 0 || (size_t)path_len >= sizeof(path)) return ray_error("range", NULL);
+    ray_t* schema = ray_col_load(path);
+    if (!schema || RAY_IS_ERR(schema)) return schema;
+    /* The payload is read below as int64_t sym IDs; a .d file holding any
+     * other vector type would be walked out of bounds, so reject it. */
+    if (schema->type != RAY_I64) { ray_release(schema); return ray_error("corrupt", NULL); }
+
+    int64_t ncols = schema->len;
+    int64_t* name_ids = (int64_t*)ray_data(schema);
+
+    ray_t* tbl = ray_table_new(ncols);
+    if (!tbl || RAY_IS_ERR(tbl)) {
+        ray_release(schema);
+        return tbl;
+    }
+
+    /* Load each column */
+    for (int64_t c = 0; c < ncols; c++) {
+        int64_t name_id = name_ids[c];
+        ray_t* name_atom = ray_sym_str(name_id);
+        if (!name_atom) {
+            /* Schema references a sym ID that doesn't exist: stale/wrong sym table. */
+            ray_release(schema);
+            ray_release(tbl);
+            return ray_error("corrupt", NULL);
+        }
+
+        const char* name = ray_str_ptr(name_atom);
+        size_t name_len = ray_str_len(name_atom);
+
+        /* Reject names with path separators, traversal, or starting with '.'
+         * — these indicate a stale/wrong sym file, not a column to skip. */
+        if (name_len == 0 || name[0] == '.' ||
+            memchr(name, '/', name_len) || memchr(name, '\\', name_len) ||
+            memchr(name, '\0', name_len)) {
+            ray_release(schema);
+            ray_release(tbl);
+            return ray_error("corrupt", NULL);
+        }
+
+        path_len = snprintf(path, sizeof(path), "%s/%.*s", dir, (int)name_len, name);
+        if (path_len < 0 || (size_t)path_len >= sizeof(path)) {
+            ray_release(schema);
+            ray_release(tbl);
+            return ray_error("range", NULL);
+        }
+
+        ray_t* col = use_mmap ? ray_col_mmap(path) : ray_col_load(path);
+        if (use_mmap && col && RAY_IS_ERR(col) &&
+            strcmp(ray_err_code(col), "nyi") == 0) {
+            /* ray_release on an error object is a no-op (rayforce.h:180), so
+             * ray_error_free reclaims it before the non-mmap retry. */
+            ray_error_free(col);
+            col = ray_col_load(path);
+        }
+        if (!col || RAY_IS_ERR(col)) {
+            ray_release(schema);
+            ray_release(tbl);
+            return col ? col : ray_error("io", NULL);
+        }
+
+        ray_t* new_df = ray_table_add_col(tbl, name_id, col);
+        if (!new_df || RAY_IS_ERR(new_df)) {
+            ray_release(col);
+            ray_release(schema);
+            ray_release(tbl);
+            return new_df ? new_df : ray_error("oom", NULL);
+        }
+        ray_release(col); /* table_add_col retains; drop our ref */
+        tbl = new_df;
+    }
+
+    ray_release(schema);
+
+    ray_err_t sym_check = validate_sym_columns(tbl, ncols);
+    if (sym_check != RAY_OK) {
+        ray_release(tbl);
+        return ray_error(ray_err_code_str(sym_check), NULL);
+    }
+
+    return tbl;
+}
+
+ray_t* ray_splay_load(const char* dir, const char* sym_path) {
+    return splay_load_impl(dir, sym_path, false);  /* use_mmap off: ray_col_load */
+}
+
+ray_t* ray_read_splayed(const char* dir, const char* sym_path) {
+    return splay_load_impl(dir, sym_path, true);  /* use_mmap on: ray_col_mmap */
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/splay.h b/crates/rayforce-sys/vendor/rayforce/src/store/splay.h
new file mode 100644
index 0000000..8648bf1
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/store/splay.h
@@ -0,0 +1,34 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_SPLAY_H
+#define RAY_SPLAY_H
+
+#include <rayforce.h>
+
+/* Splayed table I/O */
+ray_err_t ray_splay_save(ray_t* tbl, const char* dir, const char* sym_path);
+ray_t*    ray_splay_load(const char* dir, const char* sym_path);
+ray_t*    ray_read_splayed(const char* dir, const char* sym_path);
+
+#endif /* RAY_SPLAY_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/table/dict.c b/crates/rayforce-sys/vendor/rayforce/src/table/dict.c
new file mode 100644
index 0000000..9d58412
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/table/dict.c
@@ -0,0 +1,609 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "dict.h"
+#include "table.h"
+#include "table/sym.h"
+#include "lang/internal.h"   /* atom_eq for RAY_LIST key compares */
+#include <string.h>
+
+/* --------------------------------------------------------------------------
+ * Layout
+ *
+ *   Block header (32B) | slot[0] = keys (ray_t*) | slot[1] = vals (ray_t*)
+ *
+ *   d->type   = RAY_DICT
+ *   d->len    = 2 (slot count, kept consistent with table block convention)
+ *   keys: any vector type; pair count = keys->len
+ *   vals: typed vector when homogeneous, RAY_LIST otherwise
+ * -------------------------------------------------------------------------- */
+
+#define DICT_DATA_SIZE  (2 * sizeof(ray_t*))
+
+/* Allocate a two-slot RAY_DICT block storing keys/vals as given (no retain). */
+static ray_t* dict_alloc_block(ray_t* keys, ray_t* vals) {
+    ray_t* blk = ray_alloc(DICT_DATA_SIZE);
+    if (blk == NULL || RAY_IS_ERR(blk)) return blk;
+    blk->type  = RAY_DICT;
+    blk->attrs = 0;
+    blk->len   = 2;
+    memset(blk->nullmap, 0, 16);
+    ray_dict_slots(blk)[0] = keys;
+    ray_dict_slots(blk)[1] = vals;
+    return blk;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_dict_new — wrap two refs into a fresh RAY_DICT block.
+ * Ownership: consumes one ref each of `keys` and `vals` (transferred into
+ * the dict).  On error, both refs are released; an error-object `vals` is
+ * reclaimed with ray_error_free unless it is the value returned.  rc=1 dict.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_dict_new(ray_t* keys, ray_t* vals) {
+    if (!keys || RAY_IS_ERR(keys)) {
+        /* ray_release is a no-op on error objects; free an error `vals`. */
+        if (vals && !RAY_IS_ERR(vals)) ray_release(vals);
+        else if (vals && vals != keys) ray_error_free(vals);
+        return keys ? keys : ray_error("type", NULL);
+    }
+    if (!vals || RAY_IS_ERR(vals)) {
+        ray_release(keys);
+        return vals ? vals : ray_error("type", NULL);
+    }
+    ray_t* d = dict_alloc_block(keys, vals);
+    if (d && !RAY_IS_ERR(d)) return d;
+    ray_release(keys);
+    ray_release(vals);
+    return d ? d : ray_error("oom", NULL);
+}
+
+/* --------------------------------------------------------------------------
+ * ray_dict_keys / ray_dict_vals — borrowed pointers; do not release.
+ * NULL is returned when `d` is NULL, an error object, or not a RAY_DICT.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_dict_keys(ray_t* d) {
+    return (d && !RAY_IS_ERR(d) && d->type == RAY_DICT) ? ray_dict_slots(d)[0] : NULL;
+}
+
+ray_t* ray_dict_vals(ray_t* d) {
+    /* Slot 1 holds the values; NULL for NULL, error, or non-dict input. */
+    return (d && !RAY_IS_ERR(d) && d->type == RAY_DICT) ? ray_dict_slots(d)[1] : NULL;
+}
+
+int64_t ray_dict_len(ray_t* d) {
+    ray_t* key_vec = ray_dict_keys(d);   /* NULL for anything but a dict */
+    return key_vec == NULL ? 0 : key_vec->len;   /* pair count */
+}
+
+/* --------------------------------------------------------------------------
+ * ray_dict_find_sym — fast sym-only probe (no atom boxing).
+ * -------------------------------------------------------------------------- */
+
+int64_t ray_dict_find_sym(ray_t* d, int64_t sym_id) {
+    if (!d || RAY_IS_ERR(d) || d->type != RAY_DICT) return -1;
+    ray_t* keys = ray_dict_slots(d)[0];
+    if (!keys || RAY_IS_ERR(keys) || keys->type != RAY_SYM) return -1;
+    void* base = ray_data(keys);
+    int64_t n = keys->len;
+    uint8_t aw = keys->attrs & RAY_SYM_W_MASK;
+    switch (aw) {
+        case RAY_SYM_W8: {
+            const uint8_t* a = (const uint8_t*)base;
+            for (int64_t i = 0; i < n; i++) if ((int64_t)a[i] == sym_id) return i;
+            return -1;
+        }
+        case RAY_SYM_W16: {
+            const uint16_t* a = (const uint16_t*)base;
+            for (int64_t i = 0; i < n; i++) if ((int64_t)a[i] == sym_id) return i;
+            return -1;
+        }
+        case RAY_SYM_W32: {
+            const uint32_t* a = (const uint32_t*)base;
+            for (int64_t i = 0; i < n; i++) if ((int64_t)a[i] == sym_id) return i;
+            return -1;
+        }
+        default: {
+            const int64_t* a = (const int64_t*)base;
+            for (int64_t i = 0; i < n; i++) if (a[i] == sym_id) return i;
+            return -1;
+        }
+    }
+}
+
+ray_t* ray_dict_probe_sym_borrowed(ray_t* d, int64_t sym_id) {
+    /* Borrowed value; the index found in keys is bounds-checked against vals. */
+    int64_t idx = ray_dict_find_sym(d, sym_id);
+    ray_t* vals = idx < 0 ? NULL : ray_dict_slots(d)[1];
+    if (!vals || RAY_IS_ERR(vals) || vals->type != RAY_LIST || idx >= vals->len) return NULL;
+    return ((ray_t**)ray_data(vals))[idx];
+}
+
+ray_t* ray_container_probe_sym(ray_t* v, int64_t sym_id) {
+    /* Dicts and tables can be probed by sym; every other input gives NULL. */
+    if (v == NULL || RAY_IS_ERR(v)) return NULL;
+    return v->type == RAY_DICT  ? ray_dict_probe_sym_borrowed(v, sym_id)
+         : v->type == RAY_TABLE ? ray_table_get_col(v, sym_id) : NULL;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_dict_find_idx — index of the first key equal to `key_atom`; -1 if none.
+ *
+ * Dispatches on keys->type.  RAY_LIST keys compare via atom_eq; typed key
+ * vectors require atom type -keys->type (e.g. -RAY_SYM for RAY_SYM).  Bad
+ * d/key_atom, empty keys or a type mismatch return -1; the caller surfaces it.
+ * -------------------------------------------------------------------------- */
+
+int64_t ray_dict_find_idx(ray_t* d, ray_t* key_atom) {
+    if (!d || RAY_IS_ERR(d) || d->type != RAY_DICT) return -1;
+    if (!key_atom || RAY_IS_ERR(key_atom)) return -1;
+
+    ray_t* keys = ray_dict_slots(d)[0];
+    if (!keys || RAY_IS_ERR(keys)) return -1;
+    int8_t kt = keys->type;
+    int64_t n = keys->len;
+    if (n <= 0) return -1;
+
+    /* RAY_LIST keys: heterogeneous, compare via atom_eq. */
+    if (kt == RAY_LIST) {
+        ray_t** ks = (ray_t**)ray_data(keys);
+        for (int64_t i = 0; i < n; i++)
+            if (atom_eq(ks[i], key_atom)) return i;
+        return -1;
+    }
+
+    /* Typed-vector keys: atom type must match. */
+    if (key_atom->type != -kt) return -1;
+
+    /* Null-aware probe: a null key atom matches only null slots; a non-null
+     * key atom must match a non-null slot of equal value.  Without this, a
+     * group dict containing both `0Nl` and `0` keys (now produced as
+     * distinct buckets by ray_group_fn) would still resolve `(at d 0Nl)`
+     * to the first non-null zero — re-introducing the conflation we just
+     * fixed in grouping. */
+    bool key_is_null = RAY_ATOM_IS_NULL(key_atom);
+    bool keys_have_nulls = (keys->attrs & RAY_ATTR_HAS_NULLS) != 0
+                            || (keys->attrs & RAY_ATTR_SLICE);  /* slices are always probed null-aware (NOTE(review): confirm why) */
+    if (key_is_null) {
+        if (!keys_have_nulls) return -1;
+        for (int64_t i = 0; i < n; i++)
+            if (ray_vec_is_null(keys, i)) return i;
+        return -1;
+    }
+
+    void* base = ray_data(keys);  /* raw key storage; each case below casts it to its element type */
+#define DICT_FIND_LOOP(EQ_EXPR) do {                              \
+        if (keys_have_nulls) {                                    \
+            for (int64_t i = 0; i < n; i++) {                     \
+                if (ray_vec_is_null(keys, i)) continue;           \
+                if (EQ_EXPR) return i;                            \
+            }                                                     \
+        } else {                                                  \
+            for (int64_t i = 0; i < n; i++)                       \
+                if (EQ_EXPR) return i;                            \
+        }                                                         \
+        return -1;                                                \
+    } while (0)
+
+    switch (kt) {  /* every DICT_FIND_LOOP expansion returns, so no case falls through */
+        case RAY_SYM: {
+            int64_t key_id = key_atom->i64;
+            uint8_t aw = keys->attrs;
+            switch (aw & RAY_SYM_W_MASK) {
+                case RAY_SYM_W8: {
+                    const uint8_t* a = (const uint8_t*)base;
+                    DICT_FIND_LOOP((int64_t)a[i] == key_id);
+                }
+                case RAY_SYM_W16: {
+                    const uint16_t* a = (const uint16_t*)base;
+                    DICT_FIND_LOOP((int64_t)a[i] == key_id);
+                }
+                case RAY_SYM_W32: {
+                    const uint32_t* a = (const uint32_t*)base;
+                    DICT_FIND_LOOP((int64_t)a[i] == key_id);
+                }
+                default: {
+                    const int64_t* a = (const int64_t*)base;
+                    DICT_FIND_LOOP(a[i] == key_id);
+                }
+            }
+        }
+        case RAY_I64:
+        case RAY_TIMESTAMP: {
+            const int64_t* a = (const int64_t*)base;
+            int64_t v = key_atom->i64;
+            DICT_FIND_LOOP(a[i] == v);
+        }
+        case RAY_I32:
+        case RAY_DATE:
+        case RAY_TIME: {
+            const int32_t* a = (const int32_t*)base;
+            int32_t v = key_atom->i32;
+            DICT_FIND_LOOP(a[i] == v);
+        }
+        case RAY_I16: {
+            const int16_t* a = (const int16_t*)base;
+            int16_t v = key_atom->i16;
+            DICT_FIND_LOOP(a[i] == v);
+        }
+        case RAY_BOOL:
+        case RAY_U8: {
+            const uint8_t* a = (const uint8_t*)base;
+            uint8_t v = key_atom->u8;
+            DICT_FIND_LOOP(a[i] == v);
+        }
+        case RAY_F32: {
+            const float* a = (const float*)base;
+            float v = (float)key_atom->f64;  /* F32 atoms keep their value in the f64 slot; narrow to compare */
+            DICT_FIND_LOOP(a[i] == v);
+        }
+        case RAY_F64: {
+            const double* a = (const double*)base;
+            double v = key_atom->f64;
+            DICT_FIND_LOOP(a[i] == v);
+        }
+        case RAY_STR: {
+            const char* kp = ray_str_ptr(key_atom);
+            size_t klen = ray_str_len(key_atom);
+            for (int64_t i = 0; i < n; i++) {
+                if (keys_have_nulls && ray_vec_is_null(keys, i)) continue;
+                size_t vlen = 0;
+                const char* vp = ray_str_vec_get(keys, i, &vlen);
+                if (!vp) continue;
+                if (vlen == klen && (klen == 0 || memcmp(vp, kp, klen) == 0))
+                    return i;
+            }
+            return -1;
+        }
+        case RAY_GUID: {
+            const uint8_t* a = (const uint8_t*)base;
+            const uint8_t* kp = key_atom->obj ? (const uint8_t*)ray_data(key_atom->obj) : NULL;
+            if (!kp) return -1;
+            for (int64_t i = 0; i < n; i++) {
+                if (keys_have_nulls && ray_vec_is_null(keys, i)) continue;
+                if (memcmp(a + i * 16, kp, 16) == 0) return i;  /* GUIDs compare as 16 raw bytes */
+            }
+            return -1;
+        }
+        default:
+            return -1;
+    }
+#undef DICT_FIND_LOOP
+}
+
+/* --------------------------------------------------------------------------
+ * Internal: read element `idx` of a vals container as a ray_t*.  RAY_LIST
+ * yields the stored slot pointer (borrowed); typed vectors are boxed into a
+ * fresh atom (rc=1) owned by the caller.  NULL on bad vals/idx/element type.
+ *
+ * `*owned_out` is true only when a boxed atom was created successfully (the
+ * caller must release it); false means borrowed or failed — do NOT release.
+ * -------------------------------------------------------------------------- */
+
+static ray_t* dict_vals_at(ray_t* vals, int64_t idx, bool* owned_out) {
+    *owned_out = false;
+    if (!vals || RAY_IS_ERR(vals)) return NULL;
+    if (idx < 0 || idx >= vals->len) return NULL;
+
+    if (vals->type == RAY_LIST) {
+        ray_t** slots = (ray_t**)ray_data(vals);
+        return slots[idx];
+    }
+
+    /* Typed vector — box element into a fresh atom so the caller has a
+     * uniform ray_t* contract.  Mark as owned so the caller releases. */
+    ray_t* atom = NULL;
+    void* base = ray_data(vals);
+    switch (vals->type) {
+        case RAY_BOOL:      atom = ray_bool(((uint8_t*)base)[idx]);                  break;
+        case RAY_U8:        atom = ray_u8(((uint8_t*)base)[idx]);                    break;
+        case RAY_I16:       atom = ray_i16(((int16_t*)base)[idx]);                   break;
+        case RAY_I32:       atom = ray_i32(((int32_t*)base)[idx]);                   break;
+        case RAY_I64:       atom = ray_i64(((int64_t*)base)[idx]);                   break;
+        case RAY_F32:       atom = ray_f32(((float*)base)[idx]);                     break;
+        case RAY_F64:       atom = ray_f64(((double*)base)[idx]);                    break;
+        case RAY_DATE:      atom = ray_date(((int32_t*)base)[idx]);                  break;
+        case RAY_TIME:      atom = ray_time(((int32_t*)base)[idx]);                  break;
+        case RAY_TIMESTAMP: atom = ray_timestamp(((int64_t*)base)[idx]);              break;
+        case RAY_SYM: {
+            int64_t id = ray_read_sym(base, idx, vals->type, vals->attrs);
+            atom = ray_sym(id);
+            break;
+        }
+        case RAY_STR: {
+            size_t slen = 0;
+            const char* sp = ray_str_vec_get(vals, idx, &slen);
+            atom = sp ? ray_str(sp, slen) : ray_str("", 0);  /* a NULL element is boxed as an empty string */
+            break;
+        }
+        case RAY_GUID:
+            atom = ray_guid(((uint8_t*)base) + idx * 16);  /* GUID elements are 16 bytes apart */
+            break;
+        default:
+            return NULL;
+    }
+    if (atom && !RAY_IS_ERR(atom)) *owned_out = true;  /* NULL/error results stay un-owned */
+    return atom;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_dict_get — look up `key_atom` in `d`; NULL when the key is absent.
+ *
+ * A non-NULL, non-error result always carries one reference owned by the
+ * caller, who must `ray_release` it.  Boxed atoms from typed-vector vals
+ * arrive owned; slots borrowed from a RAY_LIST are retained here.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_dict_get(ray_t* d, ray_t* key_atom) {
+    int64_t pos = ray_dict_find_idx(d, key_atom);
+    if (pos < 0) return NULL;
+    bool caller_owns = false;
+    ray_t* hit = dict_vals_at(ray_dict_slots(d)[1], pos, &caller_owns);
+    bool usable = hit != NULL && !RAY_IS_ERR(hit);
+    /* Borrowed LIST slot: take a reference so the contract stays uniform. */
+    if (usable && !caller_owns) ray_retain(hit);
+    return hit;
+}
+
+/* --------------------------------------------------------------------------
+ * promote_vals_to_list — return a RAY_LIST equivalent to `vals`.
+ *
+ * If `vals` is already a RAY_LIST it is retained and returned as-is.  If
+ * `vals` is a typed vector we materialize each element into a fresh atom
+ * inside a new RAY_LIST.  Either way the caller owns the returned ref and
+ * must release it (and the original `vals` separately if it owns that).
+ *
+ * Used by upsert/remove to keep mutation paths simple regardless of the
+ * incoming vals shape.  NULL/error `vals` is passed straight through.
+ * -------------------------------------------------------------------------- */
+
+static ray_t* promote_vals_to_list(ray_t* vals) {
+    if (!vals || RAY_IS_ERR(vals)) return vals;
+    if (vals->type == RAY_LIST) {
+        ray_retain(vals);
+        return vals;
+    }
+    int64_t n = vals->len;
+    ray_t* lst = ray_list_new(n);
+    if (!lst || RAY_IS_ERR(lst)) return lst ? lst : ray_error("oom", NULL);
+    for (int64_t i = 0; i < n; i++) {
+        bool owned = false;
+        ray_t* a = dict_vals_at(vals, i, &owned);
+        if (!a || RAY_IS_ERR(a)) {
+            ray_release(lst);
+            return a ? a : ray_error("oom", NULL);  /* NULL also covers an unsupported element type */
+        }
+        ray_t* lst2 = ray_list_append(lst, a);
+        if (owned) ray_release(a);  /* drop our boxing ref; presumably the list holds its own — confirm */
+        if (!lst2 || RAY_IS_ERR(lst2)) {
+            if (lst2 == NULL) ray_release(lst);  /* NOTE(review): lst is released only on NULL, not on an error object — confirm helper contract */
+            return lst2 ? lst2 : ray_error("oom", NULL);
+        }
+        lst = lst2;
+    }
+    return lst;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_dict_upsert — set d[key_atom] = val.
+ *
+ * Ownership: consumes `d`; on success the ref is transferred into the
+ * returned dict (rc=1).  On error `d` is released.  Does NOT consume
+ * `key_atom` or `val` — both are retained internally as needed.
+ *
+ * Existing-key path: COW the dict, then replace the val slot in place when
+ * vals is a RAY_LIST; typed-vector vals are always promoted to RAY_LIST
+ * before the slot is replaced.  Missing-key path: append key & val (always
+ * promoting vals to RAY_LIST so the homogeneous typed-vec invariant is not
+ * silently broken).  A NULL/error or wrongly typed key yields a "type" error.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_dict_upsert(ray_t* d, ray_t* key_atom, ray_t* val) {
+    if (!d || RAY_IS_ERR(d)) return d ? d : ray_error("type", NULL);
+    if (!val || RAY_IS_ERR(val)) { ray_release(d); return val ? val : ray_error("type", NULL); }
+    /* Validate the key once for every path: the missing-key append below
+     * dereferences it, and ray_dict_find_idx only reports a bad key as -1. */
+    if (!key_atom || RAY_IS_ERR(key_atom)) { ray_release(d); return ray_error("type", NULL); }
+
+    /* Empty-target special case: build a fresh dict.  Keys vector type
+     * mirrors the key atom's atom type. */
+    if (d->type != RAY_DICT) {
+        ray_release(d);
+        int8_t kt = (int8_t)-key_atom->type;
+        ray_t* keys = (kt == RAY_SYM)
+            ? ray_sym_vec_new(RAY_SYM_W64, 1)
+            : ray_vec_new(kt, 1);
+        if (!keys || RAY_IS_ERR(keys)) return keys ? keys : ray_error("oom", NULL);
+        ray_t* vals = ray_list_new(1);
+        if (!vals || RAY_IS_ERR(vals)) { ray_release(keys); return vals ? vals : ray_error("oom", NULL); }
+        ray_t* d2 = ray_dict_new(keys, vals);
+        if (!d2 || RAY_IS_ERR(d2)) return d2 ? d2 : ray_error("oom", NULL);
+        return ray_dict_upsert(d2, key_atom, val);
+    }
+
+    int64_t idx = ray_dict_find_idx(d, key_atom);
+
+    /* COW the dict; the slots remain shared until we COW them too. */
+    d = ray_cow(d);
+    if (!d || RAY_IS_ERR(d)) return d ? d : ray_error("oom", NULL);
+
+    ray_t** slots = ray_dict_slots(d);
+    ray_t* keys = slots[0];
+    ray_t* vals = slots[1];
+
+    /* The append/set helpers consume the input ref and return an owned ref
+     * (possibly the same pointer, or a fresh one after grow — in which case
+     * the OLD block is already freed inside the helper).  So we always
+     * overwrite slots[*] with the helper return and never release the old
+     * pointer ourselves — that would double-free on grow. */
+    if (idx >= 0) {
+        /* Replace existing slot. */
+        if (vals->type == RAY_LIST) {
+            ray_t* new_vals = ray_list_set(vals, idx, val);
+            if (!new_vals || RAY_IS_ERR(new_vals)) { ray_release(d); return new_vals ? new_vals : ray_error("oom", NULL); }
+            slots[1] = new_vals;
+        } else {
+            /* Typed vector path: promote to LIST first, then update. */
+            ray_t* lst = promote_vals_to_list(vals);
+            if (!lst || RAY_IS_ERR(lst)) { ray_release(d); return lst ? lst : ray_error("oom", NULL); }
+            ray_release(vals);
+            slots[1] = lst;
+            ray_t* new_lst = ray_list_set(lst, idx, val);
+            if (!new_lst || RAY_IS_ERR(new_lst)) { ray_release(d); return new_lst ? new_lst : ray_error("oom", NULL); }
+            slots[1] = new_lst;
+        }
+        return d;
+    }
+
+    /* Missing key — append to both vectors.  Promote vals to LIST first. */
+    if (vals->type != RAY_LIST) {
+        ray_t* lst = promote_vals_to_list(vals);
+        if (!lst || RAY_IS_ERR(lst)) { ray_release(d); return lst ? lst : ray_error("oom", NULL); }
+        ray_release(vals);
+        slots[1] = lst;
+        vals = lst;
+    }
+
+    /* Append key — helper consumes `keys`, returns owned (possibly new) ref. */
+    ray_t* new_keys = NULL;
+    if (keys->type == RAY_SYM && key_atom->type == -RAY_SYM) {
+        /* Type-checked like the branches below: a non-sym atom must never be stored as a sym id. */
+        int64_t kid = key_atom->i64;
+        new_keys = ray_vec_append(keys, &kid);
+    } else if (keys->type == RAY_STR && key_atom->type == -RAY_STR) {
+        new_keys = ray_str_vec_append(keys, ray_str_ptr(key_atom), ray_str_len(key_atom));
+    } else if (keys->type == RAY_GUID && key_atom->type == -RAY_GUID) {
+        const void* src = key_atom->obj ? ray_data(key_atom->obj) : NULL;
+        if (!src) { ray_release(d); return ray_error("type", NULL); }
+        new_keys = ray_vec_append(keys, src);
+    } else if (keys->type == RAY_F32 && key_atom->type == -RAY_F32) {
+        /* F32 atoms keep their value in the f64 union slot; the keys vec
+         * stores narrower 4-byte floats, so narrow before append (the
+         * generic &u8 fallback below would copy the wrong half of the
+         * double bit pattern). */
+        float f = (float)key_atom->f64;
+        new_keys = ray_vec_append(keys, &f);
+    } else if (keys->type == -key_atom->type) {
+        new_keys = ray_vec_append(keys, &key_atom->u8);
+    } else {
+        ray_release(d);
+        return ray_error("type", NULL);
+    }
+    if (!new_keys || RAY_IS_ERR(new_keys)) { ray_release(d); return new_keys ? new_keys : ray_error("oom", NULL); }
+    slots[0] = new_keys;
+
+    /* Append val — list_append consumes vals, returns owned (possibly new). */
+    ray_t* new_vals = ray_list_append(vals, val);
+    if (!new_vals || RAY_IS_ERR(new_vals)) { ray_release(d); return new_vals ? new_vals : ray_error("oom", NULL); }
+    slots[1] = new_vals;
+
+    return d;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_dict_remove — drop the (key, val) pair if present.
+ *
+ * Ownership: consumes `d`; transferred into the returned dict.  A missing
+ * key returns the input unchanged; on failure `d` is released (error out).
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_dict_remove(ray_t* d, ray_t* key_atom) {
+    if (!d || RAY_IS_ERR(d)) return d ? d : ray_error("type", NULL);
+    if (d->type != RAY_DICT) { ray_release(d); return ray_error("type", NULL); }
+
+    int64_t idx = ray_dict_find_idx(d, key_atom);
+    if (idx < 0) return d;
+
+    d = ray_cow(d);
+    if (!d || RAY_IS_ERR(d)) return d ? d : ray_error("oom", NULL);
+
+    ray_t** slots = ray_dict_slots(d);
+    ray_t* keys = slots[0];
+    ray_t* vals = slots[1];
+
+    /* Promote typed-vector vals to LIST so we can remove uniformly. */
+    if (vals->type != RAY_LIST) {
+        ray_t* lst = promote_vals_to_list(vals);
+        if (!lst || RAY_IS_ERR(lst)) { ray_release(d); return lst ? lst : ray_error("oom", NULL); }
+        ray_release(vals);
+        slots[1] = lst;
+        vals = lst;
+    }
+
+    /* Drop key element by slicing (build a smaller vec without idx).
+     * Sym vectors keep their element width; every other key type (RAY_STR
+     * included) goes through the generic constructor. */
+    int64_t n = keys->len;
+    ray_t* new_keys = NULL;
+    if (keys->type == RAY_SYM) {
+        new_keys = ray_sym_vec_new(keys->attrs & RAY_SYM_W_MASK, n - 1);
+    } else {
+        new_keys = ray_vec_new(keys->type, n - 1);
+    }
+    if (!new_keys || RAY_IS_ERR(new_keys)) { ray_release(d); return new_keys ? new_keys : ray_error("oom", NULL); }
+
+    /* Copy keys[0..idx-1] then keys[idx+1..n-1] into new_keys. */
+    if (keys->type == RAY_STR) {
+        for (int64_t i = 0; i < n; i++) {
+            if (i == idx) continue;
+            size_t slen = 0;
+            const char* sp = ray_str_vec_get(keys, i, &slen);
+            ray_t* nk = ray_str_vec_append(new_keys, sp ? sp : "", sp ? slen : 0);
+            if (!nk || RAY_IS_ERR(nk)) { ray_release(new_keys); ray_release(d); return nk ? nk : ray_error("oom", NULL); }
+            new_keys = nk;
+        }
+    } else if (keys->type == RAY_SYM) {
+        uint8_t aw = keys->attrs & RAY_SYM_W_MASK;
+        void* sb = ray_data(keys);
+        for (int64_t i = 0; i < n; i++) {
+            if (i == idx) continue;
+            int64_t sid = ray_read_sym(sb, i, RAY_SYM, aw);
+            ray_t* nk = ray_vec_append(new_keys, &sid);
+            if (!nk || RAY_IS_ERR(nk)) { ray_release(new_keys); ray_release(d); return nk ? nk : ray_error("oom", NULL); }
+            new_keys = nk;
+        }
+    } else {
+        uint8_t esz = ray_sym_elem_size(keys->type, keys->attrs);
+        const uint8_t* base = (const uint8_t*)ray_data(keys);
+        for (int64_t i = 0; i < n; i++) {
+            if (i == idx) continue;
+            ray_t* nk = ray_vec_append(new_keys, base + (size_t)i * esz);
+            if (!nk || RAY_IS_ERR(nk)) { ray_release(new_keys); ray_release(d); return nk ? nk : ray_error("oom", NULL); }
+            new_keys = nk;
+        }
+    }
+    slots[0] = new_keys;
+    ray_release(keys);
+
+    /* Drop vals[idx] from the LIST. */
+    vals = ray_cow(vals);
+    if (!vals || RAY_IS_ERR(vals)) { ray_release(d); return vals ? vals : ray_error("oom", NULL); }
+    slots[1] = vals;
+    ray_t** vslots = (ray_t**)ray_data(vals);
+    if (vslots[idx]) ray_release(vslots[idx]);
+    for (int64_t i = idx; i + 1 < vals->len; i++) vslots[i] = vslots[i + 1];  /* close the gap left by idx */
+    vals->len -= 1;
+
+    return d;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/table/dict.h b/crates/rayforce-sys/vendor/rayforce/src/table/dict.h
new file mode 100644
index 0000000..a6e1d0d
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/table/dict.h
@@ -0,0 +1,68 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_DICT_H
+#define RAY_DICT_H
+
+/*
+ * dict.h -- Dict operations.
+ *
+ * A dict has type = RAY_DICT (99), len = 2.  Data region holds two
+ * ray_t* slots:
+ *   slot[0] = keys vector (any vector type)
+ *   slot[1] = vals — typed vector when homogeneous, RAY_LIST otherwise
+ *
+ * Layout mirrors RAY_TABLE's (keys, vals) shape: entry i is the pair
+ * (keys[i], vals[i]).
+ *
+ * Lookup dispatches on keys->type so polymorphic keys (sym, i64, str, …)
+ * all use the same probe path.  Pair count == keys->len.
+ */
+
+#include <rayforce.h>
+#include "mem/heap.h"
+
+/* Internal slot accessor — returns the dict's 2-pointer block: [0] = keys, [1] = vals. */
+static inline ray_t** ray_dict_slots(ray_t* d) {
+    return (ray_t**)ray_data(d);
+}
+
+/* Index of `key_atom` within the keys vector of dict `d`, or -1 if absent.
+ * `key_atom` may be of any atom type matching the keys vector. */
+int64_t ray_dict_find_idx(ray_t* d, ray_t* key_atom);
+
+/* Find sym key index without atom boxing.  Returns -1 if d is not a
+ * RAY_DICT, keys is not RAY_SYM, or sym_id is missing. */
+int64_t ray_dict_find_sym(ray_t* d, int64_t sym_id);
+
+/* Borrowed-ref probe for sym key.  Returns the slot pointer when vals is
+ * RAY_LIST; returns NULL otherwise (typed-vec dicts require boxing — use
+ * ray_dict_get for those).  Used by env-path resolution where dict vals
+ * are always callables/atoms/sub-dicts, never typed columns. */
+ray_t* ray_dict_probe_sym_borrowed(ray_t* d, int64_t sym_id);
+
+/* Borrowed sym-key probe for either RAY_DICT or RAY_TABLE — returns the
+ * value slot for dicts and the column vector for tables.  NULL on miss. */
+ray_t* ray_container_probe_sym(ray_t* v, int64_t sym_id);
+
+#endif /* RAY_DICT_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/table/sym.c b/crates/rayforce-sys/vendor/rayforce/src/table/sym.c
new file mode 100644
index 0000000..02d1e1a
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/table/sym.c
@@ -0,0 +1,1251 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "sym.h"
+#include "core/platform.h"
+#include "store/col.h"
+#include "store/fileio.h"
+#include "mem/heap.h"
+#include "mem/sys.h"
+#include "mem/arena.h"
+#include <string.h>
+#include <stdio.h>
+#include <stdatomic.h>
+#include <errno.h>
+#include "ops/hash.h"
+
+/* --------------------------------------------------------------------------
+ * Symbol table structure (static global, sequential mode only).
+ * NOT thread-safe: all interning must happen before ray_parallel_begin().
+ * -------------------------------------------------------------------------- */
+
+#define SYM_INIT_CAP     256
+#define SYM_LOAD_FACTOR  0.7
+
+/* Cached segment list for a dotted sym: nsegs sym_ids that together make up
+ * the dotted path.  segs is arena-allocated (same lifetime as sym table). */
+typedef struct {
+    uint8_t  nsegs;  /* number of entries in segs (uint8_t, so at most 255) */
+    int64_t* segs;   /* length nsegs; NULL for non-dotted entries */
+} sym_segs_t;
+
+typedef struct {
+    /* Hash table: each bucket stores (hash32 << 32) | (id + 1), 0 = empty */
+    uint64_t*  buckets;
+    uint32_t   bucket_cap;   /* always power of 2 */
+
+    /* String array: strings[id] = ray_t* string atom */
+    ray_t**     strings;
+    uint32_t   str_count;    /* starts at 0 in ray_sym_init; presumably the number of interned syms — confirm */
+    uint32_t   str_cap;      /* entries allocated in strings[] (and segments[]) */
+
+    /* Per-sym dotted-path metadata, parallel to strings[].
+     * `dotted` is a bitmap (1 bit per sym_id); bit set = name is dotted
+     *   and segment sym_ids are cached in `segments`.
+     * `scanned` is a bitmap; bit set = sym_cache_segments has settled this
+     *   sym (either cached successfully, or decided it is a plain name).
+     *   Unset = needs to be (re-)scanned on the next intern call, which is
+     *   how we recover from a transient cache OOM on first intern: the
+     *   bit stays clear, so future interns of the same name retry.
+     * `segments` holds cached segment sym_ids; segs == NULL when dotted
+     *   bit is clear. */
+    uint64_t*   dotted;       /* (str_cap + 63) / 64 words */
+    uint64_t*   scanned;      /* (str_cap + 63) / 64 words */
+    sym_segs_t* segments;     /* length str_cap */
+
+    /* Persistence: entries [0..persisted_count-1] are known on disk */
+    uint32_t   persisted_count;
+
+    /* Arena for string atoms — avoids per-string buddy allocator calls */
+    ray_arena_t*  arena;      /* created with 1MB chunks in ray_sym_init */
+} sym_table_t;
+
+static sym_table_t g_sym;
+static _Atomic(bool) g_sym_inited = false;
+
+/* Test-and-set spinlock guarding g_sym mutations in ray_sym_intern.
+ * g_sym_lock is 1 while held and 0 while free. */
+static _Atomic(int) g_sym_lock = 0;
+static inline void sym_lock(void) {
+    for (;;) {
+        if (atomic_exchange_explicit(&g_sym_lock, 1, memory_order_acquire) == 0) return;  /* was free: acquired */
+#if defined(__x86_64__) || defined(__i386__)
+        __builtin_ia32_pause();
+#endif
+    }
+}
+static inline void sym_unlock(void) { atomic_store_explicit(&g_sym_lock, 0, memory_order_release); }
+
+/* Arena-backed ray_str equivalent: same logic as ray_str() in atom.c, but
+ * allocated from the sym arena.  Returns NULL if the arena allocation fails. */
+static ray_t* sym_str_arena(ray_arena_t* arena, const char* s, size_t len) {
+    if (len < 7) {
+        /* SSO path: up to 6 bytes plus the NUL are stored inline in the header */
+        ray_t* v = ray_arena_alloc(arena, 0);
+        if (!v) return NULL;
+        v->type = -RAY_STR;
+        v->slen = (uint8_t)len;
+        if (len > 0) memcpy(v->sdata, s, len);
+        v->sdata[len] = '\0';
+        return v;
+    }
+    /* Long string: fused single allocation for U8 vector + STR header.
+     * Layout: [CHAR ray_t header (32B) | string data (len+1) | padding | STR ray_t header (32B)]
+     * This halves arena_alloc calls for long strings. */
+    size_t data_size = len + 1;
+    size_t chars_block = ((32 + data_size) + 31) & ~(size_t)31;  /* align up to 32 */
+    ray_t* chars = ray_arena_alloc(arena, chars_block + 32 - 32);  /* payload: (chars_block - 32 header) + 32 for the trailing STR header */
+    if (!chars) return NULL;
+    chars->type = RAY_U8;
+    chars->len = (int64_t)len;
+    memcpy(ray_data(chars), s, len);
+    ((char*)ray_data(chars))[len] = '\0';
+
+    /* STR header sits right after the CHAR block */
+    ray_t* v = (ray_t*)((char*)chars + chars_block);
+    memset(v, 0, 32);  /* header is carved by hand, so every field is set explicitly below */
+    v->attrs = RAY_ATTR_ARENA;
+    ray_atomic_store(&v->rc, 1);
+    v->type = -RAY_STR;
+    v->obj = chars;
+    return v;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_sym_init — allocate the global sym table; RAY_OK or RAY_ERR_OOM.
+ * -------------------------------------------------------------------------- */
+
+ray_err_t ray_sym_init(void) {
+    bool expected = false;
+    if (!atomic_compare_exchange_strong_explicit(&g_sym_inited, &expected, true,
+            memory_order_acq_rel, memory_order_acquire))
+        return RAY_OK; /* flag already set: initialized, or another caller is still initializing */
+
+    g_sym.bucket_cap = SYM_INIT_CAP;
+    /* ray_sys_alloc uses mmap(MAP_ANONYMOUS) which zero-initializes. */
+    g_sym.buckets = (uint64_t*)ray_sys_alloc(g_sym.bucket_cap * sizeof(uint64_t));
+    if (!g_sym.buckets) {
+        atomic_store_explicit(&g_sym_inited, false, memory_order_release);
+        return RAY_ERR_OOM;
+    }
+
+    g_sym.str_cap = SYM_INIT_CAP;
+    g_sym.str_count = 0;
+    g_sym.strings = (ray_t**)ray_sys_alloc(g_sym.str_cap * sizeof(ray_t*));
+    if (!g_sym.strings) {
+        ray_sys_free(g_sym.buckets);
+        g_sym.buckets = NULL;
+        atomic_store_explicit(&g_sym_inited, false, memory_order_release);
+        return RAY_ERR_OOM;
+    }
+
+    g_sym.arena = ray_arena_new(1024 * 1024);  /* 1MB chunks */
+    if (!g_sym.arena) {
+        ray_sys_free(g_sym.strings);
+        ray_sys_free(g_sym.buckets);
+        g_sym.strings = NULL;
+        g_sym.buckets = NULL;
+        atomic_store_explicit(&g_sym_inited, false, memory_order_release);
+        return RAY_ERR_OOM;
+    }
+
+    /* Dotted-path sidecars sized to str_cap.  ray_sys_alloc is MAP_ANONYMOUS
+     * so memory is zero-initialised — bitmaps start all-zero, segments[i]
+     * structs start {nsegs:0, segs:NULL}.  Failures free prior allocations
+     * and roll the sym table back to uninitialised. */
+    uint32_t bm_words = (g_sym.str_cap + 63) / 64;  /* one bit per sym id, rounded up to whole words */
+    g_sym.dotted = (uint64_t*)ray_sys_alloc((size_t)bm_words * sizeof(uint64_t));
+    g_sym.scanned = (uint64_t*)ray_sys_alloc((size_t)bm_words * sizeof(uint64_t));
+    g_sym.segments = (sym_segs_t*)ray_sys_alloc((size_t)g_sym.str_cap * sizeof(sym_segs_t));
+    if (!g_sym.dotted || !g_sym.scanned || !g_sym.segments) {
+        if (g_sym.dotted) ray_sys_free(g_sym.dotted);
+        if (g_sym.scanned) ray_sys_free(g_sym.scanned);
+        if (g_sym.segments) ray_sys_free(g_sym.segments);
+        g_sym.dotted = NULL;
+        g_sym.scanned = NULL;
+        g_sym.segments = NULL;
+        ray_arena_destroy(g_sym.arena);
+        g_sym.arena = NULL;
+        ray_sys_free(g_sym.strings);
+        ray_sys_free(g_sym.buckets);
+        g_sym.strings = NULL;
+        g_sym.buckets = NULL;
+        atomic_store_explicit(&g_sym_inited, false, memory_order_release);
+        return RAY_ERR_OOM;
+    }
+
+    /* g_sym_inited already set to true by CAS above */
+    return RAY_OK;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_sym_destroy — release the global sym table; no-op when not initialised.
+ * -------------------------------------------------------------------------- */
+
+void ray_sym_destroy(void) {
+    bool live = atomic_load_explicit(&g_sym_inited, memory_order_acquire);
+    if (!live) return;
+
+    /* One arena teardown frees every string atom and every cached
+     * segments[i].segs array; per-string ray_release is a no-op for
+     * RAY_ATTR_ARENA blocks, so there is nothing to walk. */
+    if (g_sym.arena != NULL) ray_arena_destroy(g_sym.arena);
+    g_sym.arena = NULL;
+
+    if (g_sym.segments != NULL) ray_sys_free(g_sym.segments);
+    if (g_sym.scanned != NULL)  ray_sys_free(g_sym.scanned);
+    if (g_sym.dotted != NULL)   ray_sys_free(g_sym.dotted);
+    ray_sys_free(g_sym.strings);
+    ray_sys_free(g_sym.buckets);
+
+    /* Wipe every field before publishing the uninitialised state. */
+    memset(&g_sym, 0, sizeof(g_sym));
+    atomic_store_explicit(&g_sym_inited, false, memory_order_release);
+}
+
+/* --------------------------------------------------------------------------
+ * Hash table helpers
+ * -------------------------------------------------------------------------- */
+
+static void ht_insert(uint64_t* buckets, uint32_t cap, uint32_t hash, uint32_t id) {
+    /* Entry layout: hash in the high 32 bits, id+1 in the low 32 bits, so an
+     * all-zero word always means "empty slot".  The mask arithmetic needs a
+     * power-of-two cap, and the probe only ends once it finds a free slot. */
+    const uint32_t wrap = cap - 1;
+    const uint64_t packed = ((uint64_t)hash << 32) | ((uint64_t)(id + 1));
+    uint32_t pos = hash & wrap;
+    /* Linear probing: step forward (with wrap-around) to the first empty slot. */
+    while (buckets[pos] != 0) {
+        pos = (pos + 1) & wrap;
+    }
+    buckets[pos] = packed;
+}
+
+/* Rehash into a fresh table of new_cap buckets (must be power of 2 and > current cap); false on OOM, old table untouched. */
+static bool ht_grow_to(uint32_t new_cap) {
+    uint64_t* new_buckets = (uint64_t*)ray_sys_alloc((size_t)new_cap * sizeof(uint64_t));
+    if (!new_buckets) return false;
+    /* NOTE(review): no memset here — ht_insert treats 0 as "empty", so this assumes ray_sys_alloc returns zero-filled memory; confirm. */
+    /* Re-insert all existing entries */
+    for (uint32_t i = 0; i < g_sym.bucket_cap; i++) {
+        uint64_t e = g_sym.buckets[i];
+        if (e == 0) continue;                           /* empty slot */
+        uint32_t h = (uint32_t)(e >> 32);               /* cached hash: no string rehash needed */
+        uint32_t id = (uint32_t)(e & 0xFFFFFFFF) - 1;   /* entries store id+1 */
+        ht_insert(new_buckets, new_cap, h, id);
+    }
+    /* Swap in the new table only after every entry has been moved. */
+    ray_sys_free(g_sym.buckets);
+    g_sym.buckets = new_buckets;
+    g_sym.bucket_cap = new_cap;
+    return true;
+}
+
+static bool ht_grow(void) {
+    /* Double the bucket count.  bucket_cap is always a power of 2; at 2^31
+     * doubling would overflow uint32_t, so growth is refused from there. */
+    const uint32_t cur = g_sym.bucket_cap;
+    return cur < (UINT32_MAX / 2 + 1) && ht_grow_to(cur * 2);
+}
+
+/* --------------------------------------------------------------------------
+ * sym_grow_str_cap — grow strings[], the dotted[]/scanned[] bitmaps, and
+ * segments[] to hold new_cap entries (no-op if already that large).  Must be
+ * called with sym_lock held (or from the single-threaded prehashed intern).
+ * New bitmap words and segments[] slots are zero-filled explicitly instead of
+ * relying on untouched realloc pages.  Returns false on OOM; str_cap unchanged.
+ * -------------------------------------------------------------------------- */
+static bool sym_grow_str_cap(uint32_t new_cap) {
+    uint32_t old_cap = g_sym.str_cap;
+    if (new_cap <= old_cap) return true;
+    /* Each successful realloc is stored back into g_sym right away, so a later failure leaves no stale pointer. */
+    ray_t** new_strings = (ray_t**)ray_sys_realloc(g_sym.strings,
+                                                   (size_t)new_cap * sizeof(ray_t*));
+    if (!new_strings) return false;
+    g_sym.strings = new_strings;
+    /* Bitmaps hold one bit per sym, so they only need to grow when the 64-bit word count does. */
+    uint32_t old_bm_words = (old_cap + 63) / 64;
+    uint32_t new_bm_words = (new_cap + 63) / 64;
+    if (new_bm_words > old_bm_words) {
+        uint64_t* new_dotted = (uint64_t*)ray_sys_realloc(g_sym.dotted,
+                                                          (size_t)new_bm_words * sizeof(uint64_t));
+        if (!new_dotted) return false;
+        memset(new_dotted + old_bm_words, 0,
+               (size_t)(new_bm_words - old_bm_words) * sizeof(uint64_t));
+        g_sym.dotted = new_dotted;
+
+        uint64_t* new_scanned = (uint64_t*)ray_sys_realloc(g_sym.scanned,
+                                                           (size_t)new_bm_words * sizeof(uint64_t));
+        if (!new_scanned) return false;
+        memset(new_scanned + old_bm_words, 0,
+               (size_t)(new_bm_words - old_bm_words) * sizeof(uint64_t));
+        g_sym.scanned = new_scanned;
+    }
+
+    sym_segs_t* new_segments = (sym_segs_t*)ray_sys_realloc(g_sym.segments,
+                                                            (size_t)new_cap * sizeof(sym_segs_t));
+    if (!new_segments) return false;
+    memset(new_segments + old_cap, 0,
+           (size_t)(new_cap - old_cap) * sizeof(sym_segs_t));
+    g_sym.segments = new_segments;
+    /* Publish the new capacity only after every array has been grown. */
+    g_sym.str_cap = new_cap;
+    return true;
+}
+
+/* Forward declarations — sym_cache_segments (below) needs these helpers
+ * that are defined further down in the file.  ray_sym_bytes_upper is
+ * declared in sym.h as a public inline so both the intern path and the
+ * test suite can refer to the same formula. */
+static int64_t sym_intern_nolock(uint32_t hash, const char* str, size_t len);
+static int64_t sym_probe(uint32_t hash, const char* str, size_t len);
+static int64_t sym_commit_new(uint32_t hash, const char* str, size_t len);
+static bool    sym_reserve_capacity(uint32_t new_sym_count, size_t arena_bytes);
+
+/* --------------------------------------------------------------------------
+ * sym_cache_segments — idempotent cache-and-apply for an EXISTING sym.
+ * Used by ray_sym_rebuild_segments (after bulk persistence loads) and by
+ * the probe-found branch of sym_intern_nolock (a prior intern via
+ * ray_sym_intern_no_split may have committed the sym without ever
+ * running the cache prep).
+ *
+ * Atomic: same inspect + reserve + commit pattern as sym_intern_nolock,
+ * so a failure here leaves no orphan segment syms and no half-applied
+ * cache state.  Returns false only on real OOM — scanned stays clear
+ * in that case so future retries pick up where we left off.
+ * -------------------------------------------------------------------------- */
+static bool sym_cache_segments(uint32_t new_id, const char* str, size_t len) {
+    uint64_t bit = (uint64_t)1 << (new_id & 63);
+    uint32_t word = new_id >> 6;
+    if (g_sym.scanned[word] & bit) return true;
+
+    const char* first_dot = (const char*)memchr(str, '.', len);
+    if (!first_dot) {
+        /* Plain — mark settled. */
+        g_sym.scanned[word] |= bit;
+        return true;
+    }
+
+    /* Validate structure.  Trailing dot → not dotted.  Leading `.` is
+     * allowed ONLY when followed by another dot (e.g. `.sys.gc`) —
+     * in that case segment 0 includes the leading dot (`.sys`), so
+     * reserved-namespace names resolve against their root dict via
+     * the regular segment walk. */
+    if (str[len - 1] == '.') {
+        g_sym.scanned[word] |= bit;
+        return true;
+    }
+    bool leading_dot = (str[0] == '.');
+    if (leading_dot) {
+        /* `.sys` alone (no second dot) is a plain name. */
+        const char* second = (const char*)memchr(str + 1, '.', len - 1);
+        if (!second) { g_sym.scanned[word] |= bit; return true; }
+    }
+    size_t sep_dots = 0;
+    for (size_t i = (leading_dot ? 1 : 0); i < len; i++)
+        if (str[i] == '.') sep_dots++;
+    if (sep_dots + 1 > 255) {
+        g_sym.scanned[word] |= bit;
+        return true;
+    }
+    uint8_t nsegs = (uint8_t)(sep_dots + 1);
+
+    struct { const char* p; size_t len; uint32_t hash; int64_t id; } descs[256];
+    uint32_t new_seg_count = 0;
+    size_t   new_seg_bytes = 0;
+    {
+        const char* p = str;
+        size_t remaining = len;
+        uint8_t i = 0;
+        while (remaining && i < nsegs) {
+            /* Segment 0 starts at str[0] but skips the leading `.` when
+             * searching for the segment-terminating dot — so seg 0 of
+             * `.sys.gc` is `.sys`, not `` (empty). */
+            size_t skip = (i == 0 && leading_dot) ? 1 : 0;
+            const char* dot = remaining > skip
+                ? (const char*)memchr(p + skip, '.', remaining - skip)
+                : NULL;
+            size_t seg_len = dot ? (size_t)(dot - p) : remaining;
+            if (seg_len == 0) { g_sym.scanned[word] |= bit; return true; }
+            uint32_t h = (uint32_t)ray_hash_bytes(p, seg_len);
+            descs[i].p    = p;
+            descs[i].len  = seg_len;
+            descs[i].hash = h;
+            descs[i].id   = sym_probe(h, p, seg_len);
+            if (descs[i].id < 0) {
+                new_seg_count++;
+                new_seg_bytes += ray_sym_bytes_upper(seg_len);
+            }
+            i++;
+            if (!dot) break;
+            remaining -= (seg_len + 1);
+            p = dot + 1;
+        }
+    }
+
+    /* Reserve capacity for new segments + segs array. */
+    size_t segs_payload = (size_t)nsegs * sizeof(int64_t);
+    size_t arena_bytes  = new_seg_bytes +
+                          (((size_t)32 + segs_payload + 31) & ~(size_t)31);
+    if (!sym_reserve_capacity(new_seg_count, arena_bytes)) return false;
+
+    /* Commit.  Re-probe first so a repeated new segment (`a.a`) reuses one id. */
+    for (uint8_t i = 0; i < nsegs; i++) {
+        if (descs[i].id >= 0) continue;
+        int64_t sid = sym_probe(descs[i].hash, descs[i].p, descs[i].len);
+        if (sid < 0) sid = sym_commit_new(descs[i].hash, descs[i].p, descs[i].len);
+        if (sid < 0) return false;       /* reservation should have prevented */
+        descs[i].id = sid;
+        g_sym.scanned[sid >> 6] |= ((uint64_t)1 << (sid & 63));
+    }
+
+    int64_t* segs = (int64_t*)ray_arena_alloc(g_sym.arena, segs_payload);
+    if (!segs) return false;             /* reservation should have prevented */
+    for (uint8_t i = 0; i < nsegs; i++) segs[i] = descs[i].id;
+
+    g_sym.segments[new_id].nsegs = nsegs;
+    g_sym.segments[new_id].segs  = segs;
+    g_sym.dotted[word]  |= bit;
+    g_sym.scanned[word] |= bit;
+    return true;
+}
+
+/* --------------------------------------------------------------------------
+ * sym_probe — hash-table lookup only.  Returns sym_id for an existing
+ * entry or -1 if not present.  No side effects.
+ * -------------------------------------------------------------------------- */
+static int64_t sym_probe(uint32_t hash, const char* str, size_t len) {
+    const uint32_t wrap = g_sym.bucket_cap - 1;
+
+    /* Linear probe from the home slot; an empty (zero) bucket ends the chain. */
+    for (uint32_t pos = hash & wrap; ; pos = (pos + 1) & wrap) {
+        const uint64_t cell = g_sym.buckets[pos];
+        if (cell == 0) return -1;
+
+        /* High 32 bits cache the hash: skip the string compare on mismatch. */
+        if ((uint32_t)(cell >> 32) != hash) continue;
+
+        /* Low 32 bits store id+1 so that a zero entry can mean "empty". */
+        const uint32_t cand = (uint32_t)(cell & 0xFFFFFFFF) - 1;
+        ray_t* atom = g_sym.strings[cand];
+        if (ray_str_len(atom) != len) continue;
+        if (memcmp(ray_str_ptr(atom), str, len) == 0) return (int64_t)cand;
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * sym_commit_new — insert a NEW sym (caller must have confirmed it does
+ * not already exist).  Grows the hash/strings tables as needed, allocates
+ * the string atom from the arena, inserts into the hash table.  Returns
+ * new sym_id or -1 on OOM.  No cache side effect.
+ * -------------------------------------------------------------------------- */
+static int64_t sym_commit_new(uint32_t hash, const char* str, size_t len) {
+    /* Grow hash table if load factor exceeds threshold, or if critically
+     * full.  Attempt grow before refusing insert.
+     * Cast to uint64_t to prevent overflow when bucket_cap >= 2^26. */
+    if ((uint64_t)g_sym.str_count * 100 >= (uint64_t)g_sym.bucket_cap * 70) {   /* load >= 70% */
+        if (!ht_grow()) {
+            /* If critically full even after failed grow, refuse insert
+             * to prevent infinite probe loops. */
+            if ((uint64_t)g_sym.str_count * 100 >= (uint64_t)g_sym.bucket_cap * 95) {   /* load >= 95% */
+                return -1;
+            }
+        }
+    }
+    /* Below 95% a failed grow is tolerated: the table still has empty slots for ht_insert. */
+    uint32_t new_id = g_sym.str_count;   /* ids are dense: the next id equals the current count */
+    /* Double strings[] and its sidecars once the next id would fall outside them. */
+    if (new_id >= g_sym.str_cap) {
+        if (g_sym.str_cap >= UINT32_MAX / 2) return -1;   /* doubling would overflow uint32_t */
+        if (!sym_grow_str_cap(g_sym.str_cap * 2)) return -1;
+    }
+
+    /* Create string atom from arena — avoids buddy allocator overhead.
+     * Arena blocks have rc=1 and RAY_ATTR_ARENA set. */
+    ray_t* s = sym_str_arena(g_sym.arena, str, len);
+    if (!s) return -1;                   /* nothing committed yet: count and hash table unchanged */
+    g_sym.strings[new_id] = s;
+    g_sym.str_count++;
+
+    /* Insert into hash table.
+     * Note: ht_insert probes from hash & mask to find an empty slot,
+     * so it works correctly even if ht_grow changed the bucket array. */
+    ht_insert(g_sym.buckets, g_sym.bucket_cap, hash, new_id);
+
+    return (int64_t)new_id;
+}
+
+/* --------------------------------------------------------------------------
+ * sym_intern_nolock_noseg — intern WITHOUT the segment-caching side
+ * effect.  Persistence paths (ray_sym_load, ray_sym_save's merge phase)
+ * use this variant because segment sub-interning during load would
+ * append new ids mid-sequence and break the disk-position==sym_id
+ * invariant.  After the bulk op, call ray_sym_rebuild_segments to
+ * populate the dotted bitmap + segments cache.  Assumes caller holds
+ * sym_lock (or is in the single-threaded prehashed caller contract).
+ * -------------------------------------------------------------------------- */
+static int64_t sym_intern_nolock_noseg(uint32_t hash, const char* str, size_t len) {
+    /* Reuse the id when the name is already interned; otherwise append it. */
+    const int64_t id = sym_probe(hash, str, len);
+    return id >= 0 ? id : sym_commit_new(hash, str, len);
+}
+
+/* Reserve hash-table, strings-array, and arena capacity for `new_sym_count`
+ * new syms plus `arena_bytes` of additional arena usage (for the segs array
+ * if we're interning a dotted name).  Returns true on success; on failure
+ * returns false with no commits made. */
+static bool sym_reserve_capacity(uint32_t new_sym_count, size_t arena_bytes) {
+    /* Hash table — grow if adding new_sym_count entries would exceed 70%. */
+    uint64_t new_count = (uint64_t)g_sym.str_count + new_sym_count;   /* 64-bit: the sum cannot wrap */
+    uint32_t target = g_sym.bucket_cap;
+    while (new_count * 100 >= (uint64_t)target * 70) {
+        if (target >= (UINT32_MAX / 2 + 1)) return false;   /* at 2^31 buckets: cannot double */
+        target *= 2;
+    }
+    if (target > g_sym.bucket_cap) {
+        if (!ht_grow_to(target)) return false;
+    }
+    /* Both thresholds mirror the growth tests in sym_commit_new, so the commits that follow find room already. */
+    /* Strings and sidecars. */
+    if (new_count > g_sym.str_cap) {
+        uint32_t str_target = g_sym.str_cap;
+        while (str_target < new_count) {
+            if (str_target >= UINT32_MAX / 2) return false;   /* doubling would overflow uint32_t */
+            str_target *= 2;
+        }
+        if (!sym_grow_str_cap(str_target)) return false;
+    }
+
+    /* Arena — reserve one chunk large enough for every forthcoming alloc. */
+    if (arena_bytes && !ray_arena_reserve(g_sym.arena, arena_bytes)) return false;
+    /* Tables grown before a failure stay grown; no sym has been committed at any exit from this function. */
+    return true;
+}
+
+/* --------------------------------------------------------------------------
+ * sym_intern_nolock — fully atomic intern.
+ *
+ * Three phases:
+ *   A. Inspect: probe the main name, validate its dotted shape, probe
+ *      every segment.  No side effects.
+ *   B. Reserve: pre-grow hash/strings/arena to accommodate everything
+ *      we might need to commit.  Can fail → return -1 with no state
+ *      change (no orphan segment syms, no cache fragments).
+ *   C. Commit: all allocations in this phase are guaranteed by the
+ *      reservations above, so they cannot fail.  Creates any new
+ *      segment syms, creates the main sym, fills the segs cache, sets
+ *      scanned + dotted bits.
+ *
+ * This closes two prior traps:
+ *  - A committed main sym whose dotted bit disagrees with its name's
+ *    structure (env silently routing dotted-path writes/reads through
+ *    the flat path).
+ *  - Orphan segment syms persisting when the main-sym commit fails.
+ *
+ * For an existing sym found in phase A, we still opportunistically try
+ * the cache — that path is the lazy fallback for ray_sym_intern_no_split,
+ * which commits the main sym without a cache on purpose.  A cache-OOM
+ * there is tolerated (scanned bit stays clear → future interns retry).
+ * -------------------------------------------------------------------------- */
+static int64_t sym_intern_nolock(uint32_t hash, const char* str, size_t len) {
+    /* Phase A.1: probe main. */
+    int64_t existing = sym_probe(hash, str, len);
+    if (existing >= 0) {
+        (void)sym_cache_segments((uint32_t)existing, str, len);
+        return existing;
+    }
+
+    /* Phase A.2: structural validation + per-segment probe. */
+    struct { const char* p; size_t len; uint32_t hash; int64_t id; } descs[256];
+    uint8_t  nsegs = 0;
+    uint32_t new_seg_count = 0;
+    size_t   new_seg_bytes = 0;
+    bool     is_dotted = false;
+
+    const char* first_dot = (const char*)memchr(str, '.', len);
+    if (first_dot) {
+        /* Dotted-name rules (parallel to sym_cache_segments):
+         *   - Trailing dot            → plain (not dotted).
+         *   - Leading dot alone       → plain (`.sys` with no inner dot).
+         *   - Leading dot + inner dot → segment 0 is `.<head>` including
+         *                                the leading dot.  This is how
+         *                                reserved-namespace names like
+         *                                `.sys.gc` resolve against the
+         *                                `.sys` root dict. */
+        bool valid = str[len - 1] != '.';
+        bool leading_dot = (str[0] == '.');
+        if (valid && leading_dot) {
+            const char* second = (const char*)memchr(str + 1, '.', len - 1);
+            if (!second) valid = false;
+        }
+        size_t sep_dots = 0;
+        if (valid) {
+            for (size_t i = (leading_dot ? 1 : 0); i < len; i++)
+                if (str[i] == '.') sep_dots++;
+            if (sep_dots + 1 > 255) valid = false;
+        }
+        if (valid) {
+            nsegs = (uint8_t)(sep_dots + 1);
+            const char* p = str;
+            size_t remaining = len;
+            uint8_t i = 0;
+            while (remaining && i < nsegs) {
+                size_t skip = (i == 0 && leading_dot) ? 1 : 0;
+                const char* dot = remaining > skip
+                    ? (const char*)memchr(p + skip, '.', remaining - skip)
+                    : NULL;
+                size_t seg_len = dot ? (size_t)(dot - p) : remaining;
+                if (seg_len == 0) { valid = false; break; }
+                uint32_t seg_hash = (uint32_t)ray_hash_bytes(p, seg_len);
+                descs[i].p    = p;
+                descs[i].len  = seg_len;
+                descs[i].hash = seg_hash;
+                descs[i].id   = sym_probe(seg_hash, p, seg_len);
+                if (descs[i].id < 0) {
+                    new_seg_count++;
+                    new_seg_bytes += ray_sym_bytes_upper(seg_len);
+                }
+                i++;
+                if (!dot) break;
+                remaining -= (seg_len + 1);
+                p = dot + 1;
+            }
+            if (valid) is_dotted = true;
+        }
+    }
+
+    /* Phase B: reserve capacity for main + new segments + segs array. */
+    size_t arena_bytes = ray_sym_bytes_upper(len);
+    if (is_dotted) {
+        arena_bytes += new_seg_bytes;
+        /* segs array is arena-allocated via ray_arena_alloc(_, nsegs*8). */
+        size_t segs_payload = (size_t)nsegs * sizeof(int64_t);
+        arena_bytes += ((size_t)32 + segs_payload + 31) & ~(size_t)31;
+    }
+    if (!sym_reserve_capacity(1 + new_seg_count, arena_bytes)) return -1;
+
+    /* Phase C: commit.  Every allocation below is covered by the
+     * reservation above, so nothing here can fail. */
+    if (is_dotted) {
+        for (uint8_t i = 0; i < nsegs; i++) {
+            if (descs[i].id < 0) {
+                /* Probe-then-commit: a repeated new segment (`a.a`) must reuse
+                 * the id committed by an earlier iteration, not duplicate it. */
+                int64_t sid = sym_intern_nolock_noseg(descs[i].hash, descs[i].p, descs[i].len);
+                if (sid < 0) return -1;  /* reservation should have prevented */
+                descs[i].id = sid;
+                g_sym.scanned[sid >> 6] |= ((uint64_t)1 << (sid & 63));
+            }
+        }
+    }
+
+    int64_t main_id = sym_commit_new(hash, str, len);
+    if (main_id < 0) return -1;
+
+    if (is_dotted) {
+        int64_t* segs = (int64_t*)ray_arena_alloc(g_sym.arena,
+                                                  (size_t)nsegs * sizeof(int64_t));
+        if (!segs) return main_id;   /* reservation should have prevented this */
+        for (uint8_t i = 0; i < nsegs; i++) segs[i] = descs[i].id;
+        g_sym.segments[main_id].nsegs = nsegs;
+        g_sym.segments[main_id].segs  = segs;
+        g_sym.dotted[main_id >> 6] |= ((uint64_t)1 << (main_id & 63));
+    }
+    g_sym.scanned[main_id >> 6] |= ((uint64_t)1 << (main_id & 63));
+
+    return main_id;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_sym_intern — locked public API; returns the sym id, or -1 on failure
+ * -------------------------------------------------------------------------- */
+int64_t ray_sym_intern(const char* str, size_t len) {
+    if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return -1;
+    /* The hash is computed before sym_lock is taken; only the intern runs locked. */
+    const uint32_t h = (uint32_t)ray_hash_bytes(str, len);
+    sym_lock();
+    const int64_t sym = sym_intern_nolock(h, str, len);
+    sym_unlock();
+    return sym;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_sym_intern_prehashed -- intern with pre-computed hash, no lock.
+ *
+ * CALLER CONTRACT: must only be called when no other thread is interning
+ * (e.g., after ray_pool_dispatch returns during CSV merge).
+ * -------------------------------------------------------------------------- */
+
+int64_t ray_sym_intern_prehashed(uint32_t hash, const char* str, size_t len) {
+    const bool ready = atomic_load_explicit(&g_sym_inited, memory_order_acquire);
+    return ready ? sym_intern_nolock(hash, str, len) : -1;   /* no lock: see caller contract */
+}
+
+/* --------------------------------------------------------------------------
+ * ray_sym_intern_no_split — persistence-only bulk intern (no segment caching)
+ * -------------------------------------------------------------------------- */
+int64_t ray_sym_intern_no_split(const char* str, size_t len) {
+    if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return -1;
+    /* Probe-or-append under sym_lock; dotted names are stored without a segs cache. */
+    const uint32_t h = (uint32_t)ray_hash_bytes(str, len);
+    sym_lock();
+    const int64_t sym = sym_intern_nolock_noseg(h, str, len);
+    sym_unlock();
+    return sym;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_sym_rebuild_segments — populate dotted cache for any not-yet-cached
+ * entries.  Must follow a batch of ray_sym_intern_no_split calls.
+ *
+ * Propagates the first allocation/sub-intern failure as RAY_ERR_OOM so
+ * persistence callers (ray_sym_load / ray_sym_save merge) can abort
+ * cleanly rather than silently leaving dotted names un-cached — that
+ * would degrade them to flat-sym semantics and break env lookup for any
+ * name the user wrote with a '.' in it.
+ * -------------------------------------------------------------------------- */
+
+ray_err_t ray_sym_rebuild_segments(void) {
+    if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return RAY_ERR_IO;
+    ray_err_t status = RAY_OK;
+    sym_lock();
+    /* Walk only the ids that existed on entry.  sym_cache_segments may append
+     * segment syms past this bound, but it marks them scanned itself, so they
+     * need no visit.  Ids whose scanned bit is already set are skipped here,
+     * before any string inspection. */
+    const uint32_t limit = g_sym.str_count;
+    for (uint32_t id = 0; id < limit; id++) {
+        if (g_sym.scanned[id >> 6] & ((uint64_t)1 << (id & 63))) continue;
+        ray_t* atom = g_sym.strings[id];
+        if (!atom) continue;
+        if (!sym_cache_segments(id, ray_str_ptr(atom), ray_str_len(atom))) {
+            status = RAY_ERR_OOM;   /* first failure aborts the rebuild */
+            break;
+        }
+    }
+    sym_unlock();
+    return status;
+}
+
+/* --------------------------------------------------------------------------
+ * Dotted-name accessors
+ * -------------------------------------------------------------------------- */
+
+bool ray_sym_is_dotted(int64_t sym_id) {
+    if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return false;
+    /* Range-check at 64-bit width: a uint32_t cast would let ids >= 2^32 alias a valid low id. */
+    if (sym_id < 0 || (uint64_t)sym_id >= g_sym.str_count) return false;
+    return (g_sym.dotted[sym_id >> 6] >> (sym_id & 63)) & 1;   /* one bit per sym id */
+}
+
+int ray_sym_segs(int64_t sym_id, const int64_t** out_segs) {
+    if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return 0;
+    if (sym_id < 0 || (uint64_t)sym_id >= g_sym.str_count) return 0;   /* 64-bit compare: ids >= 2^32 must not pass and index OOB */
+    sym_segs_t s = g_sym.segments[sym_id];
+    if (s.nsegs == 0 || !s.segs) return 0;   /* no segment cache recorded for this id */
+    if (out_segs) *out_segs = s.segs;        /* out_segs is optional */
+    return (int)s.nsegs;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_sym_find
+ * -------------------------------------------------------------------------- */
+
+int64_t ray_sym_find(const char* str, size_t len) {
+    if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return -1;
+
+    /* Hold sym_lock for the whole probe: a concurrent ray_sym_intern can
+     * call ht_grow, which frees and replaces g_sym.buckets, so an unlocked
+     * read could touch freed memory. */
+    sym_lock();
+    const uint32_t hash = (uint32_t)ray_hash_bytes(str, len);
+    const uint32_t wrap = g_sym.bucket_cap - 1;
+    int64_t found = -1;                    /* stays -1 when the chain ends empty */
+
+    for (uint32_t pos = hash & wrap; ; pos = (pos + 1) & wrap) {
+        const uint64_t cell = g_sym.buckets[pos];
+        if (cell == 0) break;              /* empty -- not found */
+        /* Cheap reject on the cached hash before touching the string. */
+        if ((uint32_t)(cell >> 32) != hash) continue;
+        const uint32_t cand = (uint32_t)(cell & 0xFFFFFFFF) - 1;
+        ray_t* atom = g_sym.strings[cand];
+        if (ray_str_len(atom) == len &&
+            memcmp(ray_str_ptr(atom), str, len) == 0) {
+            found = (int64_t)cand;
+            break;
+        }
+    }
+
+    sym_unlock();
+    return found;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_sym_str
+ * -------------------------------------------------------------------------- */
+
+/* Returned pointer is valid only while no concurrent ray_sym_intern occurs.
+ * Safe during read-only execution phase (after all interning is complete).
+ * Caller must not store the pointer across sym table mutations (ht_grow
+ * or strings realloc). */
+ray_t* ray_sym_str(int64_t id) {
+    if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return NULL;
+    /* Lock required: concurrent ray_sym_intern may realloc g_sym.strings.
+     * 64-bit range check: a uint32_t cast would let ids >= 2^32 index OOB. */
+    ray_t* s = NULL;                     /* NULL for any out-of-range id */
+    sym_lock();
+    if (id >= 0 && (uint64_t)id < g_sym.str_count) s = g_sym.strings[id];
+    sym_unlock();
+    return s;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_sym_count
+ * -------------------------------------------------------------------------- */
+
+uint32_t ray_sym_count(void) {
+    uint32_t total = 0;   /* reported when the table is not initialised */
+    if (atomic_load_explicit(&g_sym_inited, memory_order_acquire)) {
+        sym_lock();       /* concurrent ray_sym_intern may modify str_count */
+        total = g_sym.str_count;
+        sym_unlock();
+    }
+    return total;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_sym_ensure_cap -- pre-grow hash table and strings array
+ *
+ * Ensures the symbol table can hold at least `needed` total symbols without
+ * rehashing.  Call before bulk interning (e.g., CSV merge) to prevent
+ * mid-insert OOM that silently drops symbols.
+ * -------------------------------------------------------------------------- */
+
+bool ray_sym_ensure_cap(uint32_t needed) {
+    if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return false;
+
+    sym_lock();
+
+    /* Grow strings array (and sidecars) if needed */
+    while (g_sym.str_cap < needed) {
+        if (g_sym.str_cap >= UINT32_MAX / 2) { sym_unlock(); return false; }
+        uint32_t new_str_cap = g_sym.str_cap * 2;
+        if (new_str_cap < needed) { /* jump directly to needed */
+            new_str_cap = needed;
+            /* Round up to power of 2 */
+            new_str_cap--;
+            new_str_cap |= new_str_cap >> 1;
+            new_str_cap |= new_str_cap >> 2;
+            new_str_cap |= new_str_cap >> 4;
+            new_str_cap |= new_str_cap >> 8;
+            new_str_cap |= new_str_cap >> 16;
+            new_str_cap++;
+            if (new_str_cap == 0) { sym_unlock(); return false; }
+        }
+        if (!sym_grow_str_cap(new_str_cap)) { sym_unlock(); return false; }
+    }
+
+    /* Grow hash table so load factor stays below threshold after filling.
+     * Cap the request at 2^31 buckets (the same limit ht_grow enforces):
+     * anything larger would make the power-of-2 round-up below wrap to 0
+     * and this function would report success without growing anything. */
+    double raw_buckets = (double)needed / SYM_LOAD_FACTOR + 1.0;
+    if (raw_buckets > (double)(UINT32_MAX / 2 + 1)) { sym_unlock(); return false; }
+    uint32_t needed_buckets = (uint32_t)raw_buckets;
+    needed_buckets--;                        /* round up to power of 2 */
+    needed_buckets |= needed_buckets >> 1;
+    needed_buckets |= needed_buckets >> 2;
+    needed_buckets |= needed_buckets >> 4;
+    needed_buckets |= needed_buckets >> 8;
+    needed_buckets |= needed_buckets >> 16;
+    needed_buckets++;
+    if (needed_buckets > g_sym.bucket_cap) {
+        if (!ht_grow_to(needed_buckets)) { sym_unlock(); return false; }
+    }
+    sym_unlock();
+    return true;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_sym_save -- serialize symbol table as RAY_LIST of -RAY_STR
+ *
+ * Uses ray_col_save (STRL format), file locking for concurrent writers,
+ * and fsync + atomic rename for crash safety.  Append-only: skips save
+ * when persisted_count == str_count.
+ * -------------------------------------------------------------------------- */
+
+ray_err_t ray_sym_save(const char* path) {
+    if (!path) return RAY_ERR_IO;
+    if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return RAY_ERR_IO;
+
+    /* Quick check: nothing new to persist? */
+    sym_lock();
+    if (g_sym.persisted_count == g_sym.str_count) {
+        sym_unlock();
+        return RAY_OK;
+    }
+    sym_unlock();
+
+    /* Build lock and temp paths */
+    char lock_path[1024];
+    char tmp_path[1024];
+    if (snprintf(lock_path, sizeof(lock_path), "%s.lk", path) >= (int)sizeof(lock_path))
+        return RAY_ERR_IO;
+    if (snprintf(tmp_path, sizeof(tmp_path), "%s.tmp", path) >= (int)sizeof(tmp_path))
+        return RAY_ERR_IO;
+
+    /* Acquire cross-process exclusive lock */
+    ray_fd_t lock_fd = ray_file_open(lock_path, RAY_OPEN_READ | RAY_OPEN_WRITE | RAY_OPEN_CREATE);
+    if (lock_fd == RAY_FD_INVALID) return RAY_ERR_IO;
+    ray_err_t err = ray_file_lock_ex(lock_fd);
+    if (err != RAY_OK) { ray_file_close(lock_fd); return err; }
+
+    /* If file exists, load and merge (pick up entries from other writers).
+     * Distinguish "file not found" (proceed with full save) from real I/O
+     * errors (abort to avoid overwriting a file we couldn't read). */
+    {
+        ray_t* existing = ray_col_load(path);
+        if (existing && !RAY_IS_ERR(existing)) {
+            if (existing->type != RAY_LIST) {
+                ray_release(existing);
+                ray_file_unlock(lock_fd);
+                ray_file_close(lock_fd);
+                return RAY_ERR_CORRUPT;
+            }
+            /* Intern any new entries from disk (idempotent).
+             * Verify each entry's in-memory ID matches its disk position:
+             * if a local symbol already occupies a slot that disk expects,
+             * the tables have diverged and merging would silently reorder
+             * symbol IDs, corrupting previously written RAY_SYM columns. */
+            ray_t** slots = (ray_t**)ray_data(existing);
+            for (int64_t i = 0; i < existing->len; i++) {
+                ray_t* s = slots[i];
+                if (!s || RAY_IS_ERR(s) || s->type != -RAY_STR) {
+                    ray_release(existing);
+                    ray_file_unlock(lock_fd);
+                    ray_file_close(lock_fd);
+                    return RAY_ERR_CORRUPT;
+                }
+                /* Use the no-split variant: sub-interning segments mid-loop
+                 * would shift subsequent disk positions and spuriously trip
+                 * the id==i check below. */
+                int64_t id = ray_sym_intern_no_split(ray_str_ptr(s), ray_str_len(s));
+                if (id < 0) {
+                    ray_release(existing);
+                    ray_file_unlock(lock_fd);
+                    ray_file_close(lock_fd);
+                    return RAY_ERR_OOM;
+                }
+                if (id != i) {
+                    /* Divergent symbol tables: disk position i maps to
+                     * in-memory id != i.  A local symbol occupies the
+                     * slot, so merging would reorder IDs and corrupt
+                     * any RAY_SYM columns written by the other writer. */
+                    ray_release(existing);
+                    ray_file_unlock(lock_fd);
+                    ray_file_close(lock_fd);
+                    return RAY_ERR_CORRUPT;
+                }
+            }
+            ray_release(existing);
+            /* Populate dotted cache for names just merged in.  An OOM
+             * here would leave some loaded dotted names without a segment
+             * cache, silently degrading their env-lookup semantics — we
+             * must not proceed to write the file as if the merge fully
+             * succeeded. */
+            ray_err_t rebuild_err = ray_sym_rebuild_segments();
+            if (rebuild_err != RAY_OK) {
+                ray_file_unlock(lock_fd);
+                ray_file_close(lock_fd);
+                return rebuild_err;
+            }
+        } else {
+            /* ray_col_load failed — check if the file actually exists.
+             * If it does, the failure is a real I/O/corruption error;
+             * do not overwrite the file with a potentially incomplete
+             * in-memory snapshot. */
+            ray_fd_t probe_fd = ray_file_open(path, RAY_OPEN_READ);
+            if (probe_fd != RAY_FD_INVALID) {
+                /* File exists and is readable but ray_col_load failed —
+                 * corruption or format error; do not overwrite. */
+                ray_file_close(probe_fd);
+                ray_file_unlock(lock_fd);
+                ray_file_close(lock_fd);
+                return RAY_IS_ERR(existing) ? ray_err_from_obj(existing) : RAY_ERR_IO;
+            }
+            if (errno != ENOENT) {
+                /* File may exist but we can't open it (EACCES, EMFILE,
+                 * EIO, etc.) — do not overwrite, report I/O error. */
+                ray_file_unlock(lock_fd);
+                ray_file_close(lock_fd);
+                return RAY_ERR_IO;
+            }
+            /* File does not exist (ENOENT) — proceed with full save */
+        }
+    }
+
+    /* Snapshot string pointers under sym_lock, then build list without it.
+     * Strings are append-only and never freed, so pointers remain valid. */
+    sym_lock();
+    uint32_t count = g_sym.str_count;
+    size_t snap_sz = count * sizeof(ray_t*);
+    ray_t* snap_block = ray_alloc(snap_sz);
+    if (!snap_block || RAY_IS_ERR(snap_block)) {
+        sym_unlock();
+        ray_file_unlock(lock_fd);
+        ray_file_close(lock_fd);
+        return RAY_ERR_OOM;
+    }
+    ray_t** snap = (ray_t**)ray_data(snap_block);
+    memcpy(snap, g_sym.strings, snap_sz);
+    sym_unlock();
+
+    /* Build RAY_LIST of -RAY_STR from snapshot */
+    ray_t* list = ray_list_new((int64_t)count);
+    if (!list || RAY_IS_ERR(list)) {
+        ray_free(snap_block);
+        ray_file_unlock(lock_fd);
+        ray_file_close(lock_fd);
+        return RAY_ERR_OOM;
+    }
+
+    for (uint32_t i = 0; i < count; i++) {
+        list = ray_list_append(list, snap[i]);
+        if (!list || RAY_IS_ERR(list)) {
+            ray_free(snap_block);
+            ray_file_unlock(lock_fd);
+            ray_file_close(lock_fd);
+            return RAY_ERR_OOM;
+        }
+    }
+    ray_free(snap_block);
+
+    /* Save to temp file via ray_col_save (writes STRL format) */
+    err = ray_col_save(list, tmp_path);
+    ray_release(list);
+    if (err != RAY_OK) {
+        remove(tmp_path);
+        ray_file_unlock(lock_fd);
+        ray_file_close(lock_fd);
+        return err;
+    }
+
+    /* Fsync temp file for durability */
+    ray_fd_t tmp_fd = ray_file_open(tmp_path, RAY_OPEN_READ | RAY_OPEN_WRITE);
+    if (tmp_fd == RAY_FD_INVALID) {
+        remove(tmp_path);
+        ray_file_unlock(lock_fd);
+        ray_file_close(lock_fd);
+        return RAY_ERR_IO;
+    }
+    err = ray_file_sync(tmp_fd);
+    ray_file_close(tmp_fd);
+    if (err != RAY_OK) {
+        remove(tmp_path);
+        ray_file_unlock(lock_fd);
+        ray_file_close(lock_fd);
+        return err;
+    }
+
+    /* Atomic rename: tmp -> final path */
+    err = ray_file_rename(tmp_path, path);
+    if (err != RAY_OK) {
+        remove(tmp_path);
+        ray_file_unlock(lock_fd);
+        ray_file_close(lock_fd);
+        return err;
+    }
+
+    /* Fsync parent directory so the new directory entry is durable.
+     * Without this, a crash after rename can lose the new file. */
+    err = ray_file_sync_dir(path);
+    if (err != RAY_OK) {
+        ray_file_unlock(lock_fd);
+        ray_file_close(lock_fd);
+        return err;
+    }
+
+    /* Update persisted count */
+    sym_lock();
+    g_sym.persisted_count = count;
+    sym_unlock();
+
+    ray_file_unlock(lock_fd);
+    ray_file_close(lock_fd);
+    return RAY_OK;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_sym_load -- load symbol table from RAY_LIST file (STRL format)
+ *
+ * Uses ray_col_load to read the list, then interns entries beyond what's
+ * already in memory.  File locking prevents reading a partial write.
+ * -------------------------------------------------------------------------- */
+
+ray_err_t ray_sym_load(const char* path) {
+    if (!path) return RAY_ERR_IO;  /* no path given */
+    if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return RAY_ERR_IO;  /* g_sym_inited not set */
+
+    /* Acquire cross-process shared lock.
+     * Try read-only open first so that read-only users (snapshots, read-only
+     * mounts) can load without write permission on the directory.  Fall back
+     * to read-write+create if the lock file doesn't exist yet.  If both fail,
+     * only proceed without locking on read-only filesystem (EROFS) — other
+     * errors (EMFILE, ENFILE, EACCES on writable fs, etc.) are real failures
+     * that would silently drop the shared-lock guarantee. */
+    char lock_path[1024];
+    if (snprintf(lock_path, sizeof(lock_path), "%s.lk", path) >= (int)sizeof(lock_path))
+        return RAY_ERR_IO;  /* "<path>.lk" does not fit in lock_path */
+    ray_fd_t lock_fd = ray_file_open(lock_path, RAY_OPEN_READ);
+    if (lock_fd == RAY_FD_INVALID) {
+        int saved_errno = errno;  /* errno from the read-only attempt, before the retry can change it */
+        lock_fd = ray_file_open(lock_path, RAY_OPEN_READ | RAY_OPEN_WRITE | RAY_OPEN_CREATE);
+        if (lock_fd == RAY_FD_INVALID) {
+            /* Only proceed unlocked on read-only filesystem (EROFS) where
+             * concurrent writes are impossible.  All other failures are
+             * real errors that should not be silently ignored. */
+            if (saved_errno != EROFS && errno != EROFS)
+                return RAY_ERR_IO;
+        }
+    }
+    if (lock_fd != RAY_FD_INVALID) {  /* false only on the EROFS fallback above */
+        ray_err_t err = ray_file_lock_sh(lock_fd);
+        if (err != RAY_OK) { ray_file_close(lock_fd); return err; }
+    }
+
+    /* Load the sym file as a RAY_LIST of -RAY_STR */
+    ray_t* list = ray_col_load(path);
+    if (!list || RAY_IS_ERR(list)) {
+        ray_err_t code = RAY_IS_ERR(list) ? ray_err_from_obj(list) : RAY_ERR_IO;
+        ray_file_unlock(lock_fd);  /* NOTE(review): here and in every later cleanup, lock_fd may be RAY_FD_INVALID (EROFS path) — confirm unlock/close tolerate it */
+        ray_file_close(lock_fd);
+        return code;
+    }
+
+    if (list->type != RAY_LIST || list->len > UINT32_MAX) {  /* len must fit the uint32_t persisted_count */
+        ray_release(list);
+        ray_file_unlock(lock_fd);
+        ray_file_close(lock_fd);
+        return RAY_ERR_CORRUPT;
+    }
+
+    /* Validate existing entries match, then intern remaining.
+     * Use persisted_count (not str_count) as the already-loaded prefix:
+     * runtime code may ray_sym_intern transient names that were never
+     * persisted, and those must not participate in prefix validation
+     * or affect the intern start offset. */
+    sym_lock();
+    uint32_t already = g_sym.persisted_count;
+    sym_unlock();
+    ray_t** slots = (ray_t**)ray_data(list);
+
+    /* Reject stale/truncated sym file: if disk has fewer entries than what
+     * we previously loaded from disk, the file is outdated or truncated. */
+    if (already > 0 && list->len < (int64_t)already) {
+        ray_release(list);
+        ray_file_unlock(lock_fd);
+        ray_file_close(lock_fd);
+        return RAY_ERR_CORRUPT;
+    }
+
+    /* Validate entries [0..already-1] match the persisted prefix */
+    for (int64_t i = 0; i < (int64_t)already && i < list->len; i++) {  /* the len bound is implied by the check above */
+        ray_t* s = slots[i];
+        if (!s || RAY_IS_ERR(s) || s->type != -RAY_STR) {
+            ray_release(list);
+            ray_file_unlock(lock_fd);
+            ray_file_close(lock_fd);
+            return RAY_ERR_CORRUPT;
+        }
+        ray_t* mem_s = ray_sym_str(i);  /* compared byte-for-byte against disk entry i below */
+        if (!mem_s || ray_str_len(mem_s) != ray_str_len(s) ||
+            memcmp(ray_str_ptr(mem_s), ray_str_ptr(s), ray_str_len(s)) != 0) {
+            ray_release(list);
+            ray_file_unlock(lock_fd);
+            ray_file_close(lock_fd);
+            return RAY_ERR_CORRUPT;
+        }
+    }
+
+    /* Intern entries beyond the already-persisted prefix.
+     * Verify each entry's in-memory ID matches its disk position:
+     * if transient runtime-interned symbols already occupy these
+     * slots, the disk entries would get wrong IDs, causing RAY_SYM
+     * columns to resolve the wrong strings. */
+    for (int64_t i = (int64_t)already; i < list->len; i++) {
+        ray_t* s = slots[i];
+        if (!s || RAY_IS_ERR(s) || s->type != -RAY_STR) {
+            ray_release(list);
+            ray_file_unlock(lock_fd);
+            ray_file_close(lock_fd);
+            return RAY_ERR_CORRUPT;
+        }
+        /* Bulk load MUST use the no-split variant so that loading a disk
+         * entry like "user.name" doesn't recursively intern "user" + "name"
+         * mid-loop and shift subsequent disk positions — that would break
+         * the id==i contract below.  Segment cache is populated in one
+         * pass after the loop finishes. */
+        int64_t id = ray_sym_intern_no_split(ray_str_ptr(s), ray_str_len(s));
+        if (id < 0) {  /* every negative id is reported as out-of-memory */
+            ray_release(list);
+            ray_file_unlock(lock_fd);
+            ray_file_close(lock_fd);
+            return RAY_ERR_OOM;
+        }
+        if (id != i) {
+            /* ID mismatch: disk position i was assigned in-memory
+             * id != i, meaning a transient symbol occupies the slot.
+             * The sym table has diverged from disk; continuing would
+             * cause RAY_SYM columns to resolve wrong strings. */
+            ray_release(list);
+            ray_file_unlock(lock_fd);
+            ray_file_close(lock_fd);
+            return RAY_ERR_CORRUPT;
+        }
+    }
+
+    /* Populate dotted cache for every loaded (and previously-loaded) sym.
+     * Idempotent — already-cached entries are skipped.  Runs once per load.
+     * An OOM here must surface: leaving dotted names un-cached would make
+     * env lookup silently resolve them as flat syms, quietly losing
+     * namespace semantics on anything the user stored with a '.' in it. */
+    ray_err_t rebuild_err = ray_sym_rebuild_segments();
+    if (rebuild_err != RAY_OK) {
+        ray_release(list);
+        ray_file_unlock(lock_fd);
+        ray_file_close(lock_fd);
+        return rebuild_err;
+    }
+
+    /* Update persisted count to reflect what is actually on disk.
+     * Use list->len (not str_count) because transient runtime-interned
+     * symbols may exist beyond the persisted prefix. */
+    sym_lock();
+    g_sym.persisted_count = (uint32_t)list->len;  /* cast is safe: len <= UINT32_MAX was checked after load */
+    sym_unlock();
+
+    ray_release(list);
+    ray_file_unlock(lock_fd);
+    ray_file_close(lock_fd);
+    return RAY_OK;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/table/sym.h b/crates/rayforce-sys/vendor/rayforce/src/table/sym.h
new file mode 100644
index 0000000..e55734c
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/table/sym.h
@@ -0,0 +1,139 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_SYM_H
+#define RAY_SYM_H
+
+/*
+ * sym.h -- Global symbol intern table.
+ *
+ * Sequential mode: simple hash map + array. wyhash (truncated to 32-bit),
+ * open addressing with linear probing. Stores (hash32 << 32) | (id + 1)
+ * so that 0 means empty bucket.
+ */
+
+#include <rayforce.h>
+#include "core/types.h"
+#include <stdbool.h>
+
+/* Symbol width encoding (lower 2 bits of attrs when type == RAY_SYM).
+ * RAY_SYM_W{8,16,32,64} are now declared in <rayforce.h> for embedders. */
+#define RAY_SYM_W_MASK   0x03
+#ifndef RAY_SYM_W8
+#define RAY_SYM_W8       0x00
+#define RAY_SYM_W16      0x01
+#define RAY_SYM_W32      0x02
+#define RAY_SYM_W64      0x03
+#endif
+
+/* Helper macros */
+#define RAY_IS_SYM(t)         ((t) == RAY_SYM)
+#define RAY_SYM_ELEM(attrs)   (1u << ((attrs) & RAY_SYM_W_MASK))  /* 1,2,4,8 */
+
+/* Determine optimal SYM width for a given dictionary size */
+static inline uint8_t ray_sym_dict_width(int64_t dict_size) {
+    /* Narrowest width code whose unsigned maximum is >= dict_size. */
+    return dict_size <= 255        ? RAY_SYM_W8
+         : dict_size <= 65535      ? RAY_SYM_W16
+         : dict_size <= 4294967295 ? RAY_SYM_W32 : RAY_SYM_W64;
+}
+
+/* SYM-aware element size: returns adaptive width for RAY_SYM columns */
+static inline uint8_t ray_sym_elem_size(int8_t type, uint8_t attrs) {
+    if (!RAY_IS_SYM(type)) return ray_elem_size(type);  /* non-SYM: generic per-type size */
+    return (uint8_t)(1u << (attrs & RAY_SYM_W_MASK));   /* SYM: 1, 2, 4 or 8 bytes */
+}
+
+/* Read a dictionary index from a RAY_SYM column (adaptive width) */
+static inline int64_t ray_read_sym(const void* data, int64_t row, int8_t type, uint8_t attrs) {
+    (void)type; /* unused: the element width is taken from attrs alone */
+    const uint8_t width = attrs & RAY_SYM_W_MASK;
+    if (width == RAY_SYM_W8)  return *((const uint8_t*)data + row);
+    if (width == RAY_SYM_W16) return *((const uint16_t*)data + row);
+    if (width == RAY_SYM_W32) return *((const uint32_t*)data + row);
+    if (width == RAY_SYM_W64) return *((const int64_t*)data + row);
+    /* Defensive fallback for a width code that matches none of the above. */
+    return 0;
+}
+
+/* Write a dictionary index into a RAY_SYM column (adaptive width) */
+static inline void ray_write_sym(void* data, int64_t row, uint64_t val, int8_t type, uint8_t attrs) {
+    (void)type; /* unused: the element width is taken from attrs alone */
+    const uint8_t width = attrs & RAY_SYM_W_MASK;
+    /* val is narrowed to the selected element width when stored. */
+    if      (width == RAY_SYM_W8)  *((uint8_t*)data + row)  = (uint8_t)val;
+    else if (width == RAY_SYM_W16) *((uint16_t*)data + row) = (uint16_t)val;
+    else if (width == RAY_SYM_W32) *((uint32_t*)data + row) = (uint32_t)val;
+    else if (width == RAY_SYM_W64) *((int64_t*)data + row)  = (int64_t)val;
+}
+
+/* Intern with pre-computed wyhash, no lock.
+ * Caller must guarantee single-threaded access. */
+int64_t ray_sym_intern_prehashed(uint32_t hash, const char* str, size_t len);
+
+/* ---- Dotted name resolution (namespace paths) ---------------------------
+ * A symbol whose name contains one or more '.' is a *dotted* sym.  At intern
+ * time we memchr once, split the name on '.', intern each segment, and cache
+ * the resulting segment sym_ids.  `ray_sym_is_dotted` is cheap (one bitmap
+ * load) and gates the slow path in env lookup/set. */
+bool ray_sym_is_dotted(int64_t sym_id);
+
+/* Returns segment count (>=2 if dotted, 0 otherwise).  *out_segs is set to
+ * an interned sym_id array of length `nsegs` (valid for the lifetime of the
+ * sym table). */
+int ray_sym_segs(int64_t sym_id, const int64_t** out_segs);
+
+/* Bulk-intern variant that does NOT sub-intern segments.  Used only by
+ * persistence paths (ray_sym_load, ray_sym_save merge phase) where the
+ * disk-position==sym_id invariant would be broken by segment sub-interning
+ * appending entries mid-sequence.  Callers MUST follow a batch of these
+ * with ray_sym_rebuild_segments to populate the dotted cache. */
+int64_t ray_sym_intern_no_split(const char* str, size_t len);
+
+/* Walk the intern table and cache segment sym_ids for any dotted name
+ * that hasn't been cached yet.  Idempotent — safe to call multiple times.
+ * Needed after bulk loads that used ray_sym_intern_no_split.  Returns
+ * RAY_ERR_OOM on the first allocation/sub-intern failure so persistence
+ * paths can abort instead of leaving dotted names silently un-cached. */
+ray_err_t ray_sym_rebuild_segments(void);
+
+/* Upper bound on the arena bytes that sym_str_arena consumes for a name
+ * of the given length.  Used by the three-phase atomic intern to pre-
+ * reserve arena capacity, so the commit phase cannot fail partway.
+ *
+ * Short path (<7 bytes): ray_arena_alloc(_, 0) charges ARENA_ALIGN_UP(32)
+ * = 32 bytes.  Long path: sym_str_arena computes chars_block =
+ * ALIGN(32 + len + 1) and calls ray_arena_alloc(_, chars_block), which
+ * charges ARENA_ALIGN_UP(32 + chars_block) = 32 + chars_block because
+ * chars_block is 32-aligned.  The +32 term is the crucial one — omitting
+ * it under-reserves by exactly 32 bytes per long sym.
+ *
+ * Exposed as inline so tests can verify the bound against actual arena
+ * consumption for every length in a range. */
+static inline size_t ray_sym_bytes_upper(size_t len) {
+    const size_t unit = 32;                    /* fixed charge and rounding granule */
+    if (len <= 6) return unit;                 /* short names cost one unit only */
+    return unit + ((unit + len + 1 + (unit - 1)) & ~(unit - 1));  /* unit + rounded (unit + len + 1) */
+}
+
+#endif /* RAY_SYM_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/table/table.c b/crates/rayforce-sys/vendor/rayforce/src/table/table.c
new file mode 100644
index 0000000..8d393b6
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/table/table.c
@@ -0,0 +1,238 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "table.h"
+#include "mem/heap.h"
+#include "ops/ops.h"
+#include <string.h>
+
+/* --------------------------------------------------------------------------
+ * Data layout — same shape as RAY_DICT.
+ *
+ * Block header (32 B):  type = RAY_TABLE, len = 2
+ *   slot[0] = ray_t* schema    — RAY_I64 vector of column name sym IDs
+ *   slot[1] = ray_t* cols      — RAY_LIST of column vectors
+ *
+ * `tbl->len` is the slot count (always 2).  Use ray_table_ncols() to get
+ * the column count, ray_table_nrows() for the row count.
+ *
+ * The schema vector stays RAY_I64 (rather than RAY_SYM) because the rest
+ * of the codebase reads it as `int64_t* ids = ray_data(schema)` in
+ * dozens of hot loops; RAY_SYM's adaptive widths (W8/W16/W32/W64) would
+ * silently truncate those reads.  RAY_DICT is free to use any keys type;
+ * TABLE deliberately pins schema to I64 for that interop.
+ * -------------------------------------------------------------------------- */
+
+#define TBL_DATA_SIZE  (2 * sizeof(ray_t*))
+
+static inline ray_t** tbl_slots(ray_t* tbl) {   /* data region viewed as a ray_t* array */
+    return (ray_t**)ray_data(tbl);
+}
+
+static inline ray_t* tbl_schema(ray_t* tbl) {   /* slot 0 */
+    return *tbl_slots(tbl);
+}
+
+static inline ray_t* tbl_cols(ray_t* tbl) {     /* slot 1 */
+    return *(tbl_slots(tbl) + 1);
+}
+
+/* --------------------------------------------------------------------------
+ * ray_table_new — allocates an empty table with capacity for `ncols`.
+ *
+ * The schema vector and cols list are pre-sized to avoid early grows.
+ * Callers append columns via ray_table_add_col.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_table_new(int64_t ncols) {
+    if (ncols < 0) return ray_error("range", NULL);
+
+    ray_t* tbl = ray_alloc(TBL_DATA_SIZE);
+    if (!tbl) return ray_error("oom", "table_new(ncols=%lld)", (long long)ncols);
+    if (RAY_IS_ERR(tbl)) return tbl;
+    /* Header: two-slot container, cleared nullmap, both slots NULL for now. */
+    tbl->type = RAY_TABLE; tbl->attrs = 0; tbl->len = 2;
+    memset(tbl->nullmap, 0, 16);
+    ray_t** slots = tbl_slots(tbl);
+    memset(slots, 0, TBL_DATA_SIZE);
+
+    ray_t* names = ray_vec_new(RAY_I64, ncols);
+    if (!names || RAY_IS_ERR(names)) {
+        ray_free(tbl);
+        if (names) return names;
+        return ray_error("oom", NULL);
+    }
+    ray_t* columns = ray_list_new(ncols);
+    if (!columns || RAY_IS_ERR(columns)) {
+        ray_release(names); ray_free(tbl);
+        if (columns) return columns;
+        return ray_error("oom", NULL);
+    }
+    slots[0] = names; slots[1] = columns;
+    return tbl;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_table_add_col — append `col_vec` under name `name_id`.
+ *
+ * Consumes one ref of the input table; on success returns an owned ref to
+ * the (possibly COW'd) result.  Retains `col_vec` internally so the caller
+ * keeps its own ref.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_table_add_col(ray_t* tbl, int64_t name_id, ray_t* col_vec) {
+    if (!tbl || RAY_IS_ERR(tbl)) return tbl;  /* NULL or error table is handed straight back */
+    if (!col_vec || RAY_IS_ERR(col_vec)) return ray_error("type", NULL);  /* NULL or error column is rejected as a "type" error */
+
+    tbl = ray_cow(tbl);  /* from here on tbl is whatever ray_cow returned, possibly a different pointer */
+    if (!tbl || RAY_IS_ERR(tbl)) return tbl;
+
+    ray_t** slots = tbl_slots(tbl);
+    ray_t* schema = slots[0];
+    ray_t* cols   = slots[1];
+
+    /* schema and cols may themselves be shared after cow — append helpers
+     * COW them as needed and return the (possibly new) owned ref. */
+    ray_t* new_schema = ray_vec_append(schema, &name_id);
+    if (!new_schema || RAY_IS_ERR(new_schema)) { ray_release(tbl); return new_schema ? new_schema : ray_error("oom", NULL); }
+    slots[0] = new_schema;  /* schema is stored first; cols is appended next */
+
+    ray_retain(col_vec);  /* extra ref held across the append, dropped right after it */
+    ray_t* new_cols = ray_list_append(cols, col_vec);
+    ray_release(col_vec);
+    if (!new_cols || RAY_IS_ERR(new_cols)) { ray_release(tbl); return new_cols ? new_cols : ray_error("oom", NULL); }
+    slots[1] = new_cols;
+
+    return tbl;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_table_get_col — lookup column by sym id; borrowed pointer or NULL.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_table_get_col(ray_t* tbl, int64_t name_id) {
+    if (!tbl || RAY_IS_ERR(tbl)) return NULL;
+    ray_t* schema = tbl_schema(tbl);
+    ray_t* cols   = tbl_cols(tbl);
+    if (!schema || !cols) return NULL;
+    /* Clamp to cols->len: a schema longer than cols must not index past the list. */
+    int64_t ncols = schema->len < cols->len ? schema->len : cols->len;
+    const int64_t* ids = (const int64_t*)ray_data(schema);
+    ray_t** col_ptrs = (ray_t**)ray_data(cols);
+    for (int64_t i = 0; i < ncols; i++) if (ids[i] == name_id) return col_ptrs[i];  /* first match wins; pointer is not retained */
+    return NULL;  /* no slot in range carries name_id */
+}
+
+/* --------------------------------------------------------------------------
+ * ray_table_get_col_idx — borrowed pointer at slot `idx`, or NULL.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_table_get_col_idx(ray_t* tbl, int64_t idx) {
+    if (!tbl || RAY_IS_ERR(tbl)) return NULL;
+    ray_t* list = tbl_cols(tbl);
+    /* A column is returned only when the list exists and idx lies inside it. */
+    int in_range = list && idx >= 0 && idx < list->len;
+    return in_range ? *((ray_t**)ray_data(list) + idx) : NULL;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_table_col_name — sym id at slot `idx`, -1 on out-of-range.
+ * -------------------------------------------------------------------------- */
+
+int64_t ray_table_col_name(ray_t* tbl, int64_t idx) {
+    if (!tbl || RAY_IS_ERR(tbl)) return -1;
+    ray_t* names = tbl_schema(tbl);
+    /* -1 covers both a missing schema and an out-of-range idx. */
+    if (!names || idx < 0 || idx >= names->len) return -1;
+    return *((int64_t*)ray_data(names) + idx);
+}
+
+/* --------------------------------------------------------------------------
+ * ray_table_set_col_name — overwrite name at `idx`.  Caller must ensure
+ * exclusive ownership (rc==1) before calling.
+ * -------------------------------------------------------------------------- */
+
+void ray_table_set_col_name(ray_t* tbl, int64_t idx, int64_t name_id) {
+    if (!tbl || RAY_IS_ERR(tbl)) return;
+    ray_t** slot0 = &tbl_slots(tbl)[0];
+    ray_t* names = *slot0;
+    if (!names || RAY_IS_ERR(names) || idx < 0 || idx >= names->len) return;
+    /* Write through the ray_cow result; the slot is left as-is if ray_cow fails. */
+    ray_t* own = ray_cow(names);
+    if (!own || RAY_IS_ERR(own)) return;
+    *slot0 = own;
+    *((int64_t*)ray_data(own) + idx) = name_id;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_table_ncols / ray_table_nrows / ray_table_schema
+ * -------------------------------------------------------------------------- */
+
+int64_t ray_table_ncols(ray_t* tbl) {
+    if (!tbl || RAY_IS_ERR(tbl)) return 0;   /* NULL or error table reports zero columns */
+    ray_t* names = tbl_schema(tbl);          /* column count is the schema length */
+    return !names ? 0 : names->len;
+}
+
+int64_t ray_table_nrows(ray_t* tbl) {
+    if (!tbl || RAY_IS_ERR(tbl)) return 0;
+    ray_t* list = tbl_cols(tbl);
+    if (!list || list->len <= 0) return 0;
+
+    /* The first column alone determines the reported row count. */
+    ray_t* head = *(ray_t**)ray_data(list);
+    if (!head || RAY_IS_ERR(head)) return 0;
+    int composite = RAY_IS_PARTED(head->type) || head->type == RAY_MAPCOMMON;
+    /* Parted / MAPCOMMON columns are totalled by ray_parted_nrows; others use len. */
+    return composite ? ray_parted_nrows(head) : head->len;
+}
+
+int64_t ray_parted_nrows(ray_t* v) {
+    if (!v || RAY_IS_ERR(v)) return 0;
+
+    const int is_mapcommon = (v->type == RAY_MAPCOMMON);
+    if (!is_mapcommon && !RAY_IS_PARTED(v->type)) return v->len;
+
+    ray_t** children = (ray_t**)ray_data(v);
+    int64_t rows = 0;
+    if (is_mapcommon) {
+        /* Slot 1 is read as an int64 vector of counts; their sum is the total. */
+        ray_t* counts = children[1];
+        if (!counts || RAY_IS_ERR(counts)) return 0;
+        const int64_t* per_part = (const int64_t*)ray_data(counts);
+        for (int64_t k = 0; k < counts->len; k++) rows += per_part[k];
+        return rows;
+    }
+
+    /* Parted column: add up segment lengths, skipping NULL and error segments. */
+    for (int64_t k = 0; k < v->len; k++) {
+        ray_t* seg = children[k];
+        if (seg && !RAY_IS_ERR(seg)) rows += seg->len;
+    }
+    return rows;
+}
+
+ray_t* ray_table_schema(ray_t* tbl) {
+    /* Slot-0 schema pointer (not retained), or NULL for a NULL/error table. */
+    return (tbl && !RAY_IS_ERR(tbl)) ? tbl_schema(tbl) : NULL;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/table/table.h b/crates/rayforce-sys/vendor/rayforce/src/table/table.h
new file mode 100644
index 0000000..16eff4e
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/table/table.h
@@ -0,0 +1,40 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_TABLE_H
+#define RAY_TABLE_H
+
+/*
+ * table.h -- Table operations.
+ *
+ * A table has type = RAY_TABLE (13), len = 2 (slot count, not column count).
+ * Data region: slot[0] = pointer to schema (RAY_I64 vector of column name
+ * symbol IDs), slot[1] = pointer to a RAY_LIST of column vectors.  Use
+ * ray_table_ncols() / ray_table_nrows() for the column and row counts.
+ */
+
+#include <rayforce.h>
+
+int64_t ray_parted_nrows(ray_t* parted_col);
+
+#endif /* RAY_TABLE_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/vec/atom.c b/crates/rayforce-sys/vendor/rayforce/src/vec/atom.c
new file mode 100644
index 0000000..2d4b487
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/vec/atom.c
@@ -0,0 +1,208 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "atom.h"
+#include <string.h>
+
+/* --------------------------------------------------------------------------
+ * Simple atom constructors
+ *
+ * Pattern: allocate 0-byte data block (just the 32B header), set type to
+ * negative tag, store value in header union field.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_bool(bool val) {
+    ray_t* atom = ray_alloc(0);            /* header-only block: zero data bytes requested */
+    if (RAY_IS_ERR(atom)) return atom;     /* allocation error goes straight to the caller */
+    atom->type = -RAY_BOOL;                /* negated tag marks the block as an atom */
+    atom->b8 = val ? 1 : 0;                /* stored as exactly 0 or 1 */
+    return atom;
+}
+
+ray_t* ray_u8(uint8_t val) {
+    ray_t* atom = ray_alloc(0);            /* header-only block: zero data bytes requested */
+    if (RAY_IS_ERR(atom)) return atom;     /* allocation error goes straight to the caller */
+    atom->type = -RAY_U8;                  /* negated tag marks the block as an atom */
+    atom->u8 = val;                        /* value lives in the header's u8 slot */
+    return atom;
+}
+
+ray_t* ray_i16(int16_t val) {
+    ray_t* atom = ray_alloc(0);            /* header-only block: zero data bytes requested */
+    if (RAY_IS_ERR(atom)) return atom;     /* allocation error goes straight to the caller */
+    atom->type = -RAY_I16;                 /* negated tag marks the block as an atom */
+    atom->i16 = val;                       /* value lives in the header's i16 slot */
+    return atom;
+}
+
+ray_t* ray_i32(int32_t val) {
+    ray_t* atom = ray_alloc(0);            /* header-only block: zero data bytes requested */
+    if (RAY_IS_ERR(atom)) return atom;     /* allocation error goes straight to the caller */
+    atom->type = -RAY_I32;                 /* negated tag marks the block as an atom */
+    atom->i32 = val;                       /* value lives in the header's i32 slot */
+    return atom;
+}
+
+ray_t* ray_i64(int64_t val) {
+    ray_t* atom = ray_alloc(0);            /* header-only block: zero data bytes requested */
+    if (RAY_IS_ERR(atom)) return atom;     /* allocation error goes straight to the caller */
+    atom->type = -RAY_I64;                 /* negated tag marks the block as an atom */
+    atom->i64 = val;                       /* value lives in the header's i64 slot */
+    return atom;
+}
+
+ray_t* ray_f64(double val) {
+    ray_t* atom = ray_alloc(0);            /* header-only block: zero data bytes requested */
+    if (RAY_IS_ERR(atom)) return atom;     /* allocation error goes straight to the caller */
+    atom->type = -RAY_F64;                 /* negated tag marks the block as an atom */
+    atom->f64 = val;                       /* value lives in the header's f64 slot */
+    return atom;
+}
+
+/* Float atom constructor.  The value is widened and kept in the f64 union
+ * slot; fmt_obj's RAY_F32 branch is stated to narrow it back again with
+ * `(float)obj->f64`.  Apart from the -RAY_F32 tag this mirrors ray_f64, so
+ * RAY_F32 vectors can box elements the same way I32/F64 vectors do. */
+ray_t* ray_f32(float val) {
+    ray_t* atom = ray_alloc(0);
+    if (RAY_IS_ERR(atom)) return atom;
+    atom->type = -RAY_F32;
+    atom->f64 = (double)val;
+    return atom;
+}
+
+/* --------------------------------------------------------------------------
+ * String atom: SSO for <= 6 bytes, long string via U8 vector for >= 7
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_str(const char* s, size_t len) {
+    if (len < 7) {
+        /* SSO path: bytes live inline in the header.  sdata[7] must also hold
+         * the NUL terminator, so exactly-7-byte strings take the long path.
+         * Zero all of sdata first so no stale bytes are left in the union. */
+        ray_t* v = ray_alloc(0);
+        if (RAY_IS_ERR(v)) return v;
+        v->type = -RAY_STR;
+        v->slen = (uint8_t)len;
+        memset(v->sdata, 0, sizeof(v->sdata));
+        if (len > 0) memcpy(v->sdata, s, len);
+        return v;
+    }
+    /* Long string: the bytes go into a separate U8 vector of len + 1 bytes,
+     * NUL-terminated so callers that read up to '\0' (e.g. ctypes c_char_p)
+     * stay in bounds.  A NULL/error allocation result is returned as-is. */
+    size_t data_size = len + 1;
+    ray_t* chars = ray_alloc(data_size);
+    if (!chars || RAY_IS_ERR(chars)) return chars;
+    chars->type = RAY_U8;
+    chars->len = (int64_t)len;
+    memcpy(ray_data(chars), s, len);
+    ((char*)ray_data(chars))[len] = '\0';
+
+    ray_t* v = ray_alloc(0);
+    if (RAY_IS_ERR(v)) {
+        ray_free(chars);   /* do not leak the byte vector when the atom fails */
+        return v;
+    }
+    v->type = -RAY_STR;
+    v->obj = chars;        /* atom header points at the byte vector */
+    return v;
+}
+
+/* --------------------------------------------------------------------------
+ * Symbol atom: intern ID stored as i64
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_sym(int64_t id) {
+    ray_t* atom = ray_alloc(0);            /* header-only block: zero data bytes requested */
+    if (RAY_IS_ERR(atom)) return atom;     /* allocation error goes straight to the caller */
+    atom->type = -RAY_SYM;                 /* negated tag marks the block as an atom */
+    atom->i64 = id;                        /* intern ID kept in the i64 slot */
+    return atom;
+}
+
+/* --------------------------------------------------------------------------
+ * Date/Time/Timestamp atoms
+ *
+ * All atom constructors accept int64_t and store in the i64 union field
+ * (atoms are scalar wrappers — always 8 bytes in the union). The vector
+ * element sizes differ: DATE=4, TIME=4, TIMESTAMP=8. When broadcasting
+ * an atom to a vector (materialize_broadcast_input), the value must be
+ * narrowed to the correct element width.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_date(int64_t val) {
+    ray_t* atom = ray_alloc(0);            /* header-only block: zero data bytes requested */
+    if (RAY_IS_ERR(atom)) return atom;     /* allocation error goes straight to the caller */
+    atom->type = -RAY_DATE;                /* negated tag marks the block as an atom */
+    atom->i64 = val;                       /* atoms keep the full 64-bit value */
+    return atom;
+}
+
+ray_t* ray_time(int64_t val) {
+    ray_t* atom = ray_alloc(0);            /* header-only block: zero data bytes requested */
+    if (RAY_IS_ERR(atom)) return atom;     /* allocation error goes straight to the caller */
+    atom->type = -RAY_TIME;                /* negated tag marks the block as an atom */
+    atom->i64 = val;                       /* atoms keep the full 64-bit value */
+    return atom;
+}
+
+ray_t* ray_timestamp(int64_t val) {
+    ray_t* atom = ray_alloc(0);            /* header-only block: zero data bytes requested */
+    if (RAY_IS_ERR(atom)) return atom;     /* allocation error goes straight to the caller */
+    atom->type = -RAY_TIMESTAMP;           /* negated tag marks the block as an atom */
+    atom->i64 = val;                       /* atoms keep the full 64-bit value */
+    return atom;
+}
+
+ray_t* ray_typed_null(int8_t type) {
+    if (type >= 0) return ray_error("type", NULL);   /* only negative (atom) tags are accepted */
+    ray_t* atom = ray_alloc(0);
+    if (RAY_IS_ERR(atom)) return atom;
+    atom->type = type;                               /* caller-supplied atom tag, kept verbatim */
+    atom->i64 = 0;                                   /* zero the value slot */
+    atom->nullmap[0] |= 1;                           /* set bit 0 of nullmap; presumably the null flag */
+    return atom;
+}
+
+/* --------------------------------------------------------------------------
+ * GUID atom: 16 bytes stored in a U8 vector, pointer in obj field
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_guid(const uint8_t* bytes) {
+    /* Payload: a 16-element U8 vector holding a copy of the caller's bytes */
+    ray_t* payload = ray_alloc(16);
+    if (!payload || RAY_IS_ERR(payload)) return payload;
+    payload->type = RAY_U8;
+    payload->len = 16;
+    memcpy(ray_data(payload), bytes, 16);
+
+    ray_t* atom = ray_alloc(0);
+    if (RAY_IS_ERR(atom)) {
+        ray_free(payload);   /* do not leak the payload when the atom fails */
+        return atom;
+    }
+    atom->type = -RAY_GUID;
+    atom->obj = payload;     /* atom header points at the byte vector */
+    return atom;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/vec/atom.h b/crates/rayforce-sys/vendor/rayforce/src/vec/atom.h
new file mode 100644
index 0000000..0c495bc
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/vec/atom.h
@@ -0,0 +1,36 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_ATOM_H
+#define RAY_ATOM_H
+
+/*
+ * atom.h -- Atom constructors.
+ *
+ * Each atom is a 32-byte block (header only, no data region) with a
+ * negative type tag and the value stored in the header union.
+ */
+
+#include <rayforce.h>
+
+#endif /* RAY_ATOM_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/vec/embedding.h b/crates/rayforce-sys/vendor/rayforce/src/vec/embedding.h
new file mode 100644
index 0000000..487c7d3
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/vec/embedding.h
@@ -0,0 +1,38 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_EMBEDDING_H
+#define RAY_EMBEDDING_H
+
+#include <rayforce.h>
+
+/* ===== Embedding Column Helpers ===== */
+
+/* An embedding column is a RAY_F32 vector of length N*D where D is the
+ * embedding dimension.  D is stored in a separate I32 atom that the
+ * caller keeps alongside the column.  Access helpers: */
+
+/* Create an embedding column for N rows of D-dimensional vectors. */
+ray_t* ray_embedding_new(int64_t nrows, int32_t dim);
+
+#endif /* RAY_EMBEDDING_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/vec/list.c b/crates/rayforce-sys/vendor/rayforce/src/vec/list.c
new file mode 100644
index 0000000..c4b33c3
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/vec/list.c
@@ -0,0 +1,299 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "list.h"
+#include "mem/heap.h"
+#include <string.h>
+
+/* --------------------------------------------------------------------------
+ * Capacity helpers (same pattern as vec.c)
+ * -------------------------------------------------------------------------- */
+
+static int64_t list_capacity(ray_t* list) {
+    /* The block spans 2^order bytes; the first 32 are the ray_t header. */
+    size_t payload_bytes = ((size_t)1 << list->order) - 32;
+    return (int64_t)(payload_bytes / sizeof(ray_t*));   /* whole pointer slots only */
+}
+
+/* --------------------------------------------------------------------------
+ * ray_list_new
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_list_new(int64_t capacity) {
+    /* Reject negative capacities and byte sizes that would overflow size_t */
+    if (capacity < 0) return ray_error("range", NULL);
+    if ((uint64_t)capacity > SIZE_MAX / sizeof(ray_t*)) return ray_error("oom", NULL);
+
+    size_t nbytes = sizeof(ray_t*) * (size_t)capacity;
+    ray_t* fresh = ray_alloc(nbytes);
+    if (!fresh) return ray_error("oom", "list_new(cap=%lld): %zu bytes",
+                                 (long long)capacity, nbytes);
+    if (RAY_IS_ERR(fresh)) return fresh;
+
+    /* Room for `capacity` pointers is reserved, but the list starts empty */
+    fresh->type = RAY_LIST;
+    fresh->len = 0;
+    fresh->attrs = 0;
+    memset(fresh->nullmap, 0, 16);
+    return fresh;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_list_append
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_list_append(ray_t* list, ray_t* item) {
+    if (!list || RAY_IS_ERR(list)) return list;
+
+    /* Work on a private copy when the block is shared.  `shared_in` keeps
+     * the caller's pointer so a copy made here can be told apart from it. */
+    ray_t* shared_in = list;
+    list = ray_cow(list);
+    if (!list || RAY_IS_ERR(list)) return list;
+
+    /* Out of slots: grow the data region to the smallest power of two
+     * (never below 32 bytes) that fits len + 1 pointers. */
+    if (list->len >= list_capacity(list)) {
+        size_t need = (size_t)(list->len + 1) * sizeof(ray_t*);
+        size_t grown = 32;
+        while (grown < need) {
+            if (grown > SIZE_MAX / 2) {
+                /* another doubling would overflow size_t */
+                if (list != shared_in) ray_release(list);
+                return ray_error("oom", NULL);
+            }
+            grown *= 2;
+        }
+
+        ray_t* bigger = ray_scratch_realloc(list, grown);
+        if (!bigger || RAY_IS_ERR(bigger)) {
+            /* release only a copy created above, never the caller's block */
+            if (list != shared_in) ray_release(list);
+            return bigger ? bigger : ray_error("oom", NULL);
+        }
+        list = bigger;
+    }
+
+    /* The list takes its own reference to the appended item */
+    ray_t** slots = (ray_t**)ray_data(list);
+    int64_t tail = list->len;
+    slots[tail] = item;
+    if (item) ray_retain(item);
+    list->len = tail + 1;
+
+    return list;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_list_get
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_list_get(ray_t* list, int64_t idx) {
+    /* NULL for a missing/error list or an out-of-range index */
+    if (!list || RAY_IS_ERR(list) || idx < 0 || idx >= list->len) return NULL;
+
+    /* The stored slot value is returned directly; no ray_retain happens here */
+    return ((ray_t**)ray_data(list))[idx];
+}
+
+/* --------------------------------------------------------------------------
+ * ray_list_set
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_list_set(ray_t* list, int64_t idx, ray_t* item) {
+    if (!list || RAY_IS_ERR(list)) return list;
+    if (idx < 0 || idx >= list->len)
+        return ray_error("range", NULL);
+
+    /* COW if shared; a NULL/error result from ray_cow is returned unchanged */
+    list = ray_cow(list);
+    if (!list || RAY_IS_ERR(list)) return list;
+
+    ray_t** slots = (ray_t**)ray_data(list);
+    ray_t* old = slots[idx];
+
+    /* Retain the new item BEFORE releasing the old one: if both are the same
+     * object and the slot held its last reference, release-first would drop
+     * it to zero and the following retain would touch a freed block. */
+    if (item) ray_retain(item);
+    slots[idx] = item;
+    if (old) ray_release(old);
+
+    return list;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_list_insert_at — insert one item at pre-insertion position idx.
+ *
+ * idx ∈ [0, list->len]; idx == len is equivalent to ray_list_append.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_list_insert_at(ray_t* list, int64_t idx, ray_t* item) {
+    if (!list || RAY_IS_ERR(list)) return list;
+    if (list->type != RAY_LIST) return ray_error("type", NULL);
+    if (idx < 0 || idx > list->len) return ray_error("range", NULL);
+
+    /* Copy-on-write; remember the caller's pointer so that only a copy made
+     * here is released on the failure paths below. */
+    ray_t* caller_list = list;
+    list = ray_cow(list);
+    if (!list || RAY_IS_ERR(list)) return list;
+
+    /* Full: grow the data region to the smallest power of two (at least
+     * 32 bytes) able to hold len + 1 pointers. */
+    if (list->len >= list_capacity(list)) {
+        size_t wanted = (size_t)(list->len + 1) * sizeof(ray_t*);
+        size_t bytes = 32;
+        while (bytes < wanted) {
+            if (bytes > SIZE_MAX / 2) {
+                /* another doubling would overflow size_t */
+                if (list != caller_list) ray_release(list);
+                return ray_error("oom", NULL);
+            }
+            bytes *= 2;
+        }
+        ray_t* resized = ray_scratch_realloc(list, bytes);
+        if (!resized || RAY_IS_ERR(resized)) {
+            if (list != caller_list) ray_release(list);
+            return resized ? resized : ray_error("oom", NULL);
+        }
+        list = resized;
+    }
+
+    ray_t** slots = (ray_t**)ray_data(list);
+    int64_t tail_count = list->len - idx;
+
+    /* Open a gap at idx by shifting the tail up one slot; idx == len
+     * has no tail, so it degenerates to an append. */
+    if (tail_count > 0)
+        memmove(slots + idx + 1, slots + idx, (size_t)tail_count * sizeof(ray_t*));
+
+    slots[idx] = item;
+    if (item) ray_retain(item);   /* the list holds its own reference */
+    list->len += 1;
+
+    return list;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_list_insert_many — insert N items at N pre-insertion positions.
+ *
+ * idxs : RAY_I64 vec of length N, each idx in [0, list->len].
+ * vals : RAY_LIST. Length 1 broadcasts the single ptr; length N is parallel.
+ *
+ * Stable on duplicate indices. Returns a fresh block; broadcast retains the
+ * same pointer once per insertion site.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_list_insert_many(ray_t* list, ray_t* idxs, ray_t* vals) {
+    if (!list || RAY_IS_ERR(list)) return list;   /* NULL/error inputs are handed back as-is */
+    if (!idxs || RAY_IS_ERR(idxs)) return idxs;
+    if (!vals || RAY_IS_ERR(vals)) return vals;
+    if (list->type != RAY_LIST) return ray_error("type", NULL);
+    if (idxs->type != RAY_I64) return ray_error("type", NULL);
+    if (vals->type != RAY_LIST) return ray_error("type", NULL);
+
+    int64_t N = idxs->len;            /* number of insertions requested */
+    int64_t old_len = list->len;
+
+    if (N == 0) { ray_retain(list); return list; }   /* nothing to insert: same block, one extra reference */
+
+    const int64_t* idx_arr = (const int64_t*)ray_data(idxs);
+    for (int64_t k = 0; k < N; k++) {
+        if (idx_arr[k] < 0 || idx_arr[k] > old_len)   /* idx == old_len inserts after the last item */
+            return ray_error("range", NULL);
+    }
+
+    int broadcast;                             /* 1: vals[0] is used at every site; 0: vals[k] pairs with idxs[k] */
+    if (vals->len == 1) broadcast = 1;
+    else if (vals->len == N) broadcast = 0;
+    else return ray_error("range", NULL);
+
+    /* Scratch I64 vector of N (idx, src_pos) pairs, flattened: [2k] = idx, [2k+1] = k */
+    ray_t* pair_vec = ray_vec_new(RAY_I64, 2 * N);
+    if (!pair_vec || RAY_IS_ERR(pair_vec)) return ray_error("oom", NULL);
+    pair_vec->len = 2 * N;
+    int64_t* pairs = (int64_t*)ray_data(pair_vec);
+    for (int64_t k = 0; k < N; k++) {
+        pairs[2 * k]     = idx_arr[k];
+        pairs[2 * k + 1] = k;
+    }
+
+    /* Stable insertion sort by idx: the strict '>' never moves a pair past an equal idx */
+    for (int64_t i = 1; i < N; i++) {
+        int64_t ki = pairs[2 * i];
+        int64_t ks = pairs[2 * i + 1];
+        int64_t j = i - 1;
+        while (j >= 0 && pairs[2 * j] > ki) {
+            pairs[2 * (j + 1)]     = pairs[2 * j];
+            pairs[2 * (j + 1) + 1] = pairs[2 * j + 1];
+            j--;
+        }
+        pairs[2 * (j + 1)]     = ki;
+        pairs[2 * (j + 1) + 1] = ks;
+    }
+
+    int64_t new_len = old_len + N;
+    if (new_len < old_len) { ray_release(pair_vec); return ray_error("oom", NULL); }   /* length wrapped around */
+    if ((uint64_t)new_len > SIZE_MAX / sizeof(ray_t*)) {                               /* byte size would overflow */
+        ray_release(pair_vec);
+        return ray_error("oom", NULL);
+    }
+    size_t data_size = (size_t)new_len * sizeof(ray_t*);
+
+    ray_t* result = ray_alloc(data_size);
+    if (!result || RAY_IS_ERR(result)) {
+        ray_release(pair_vec);
+        return result ? result : ray_error("oom", NULL);
+    }
+    result->type = RAY_LIST;
+    result->len = new_len;
+    result->attrs = 0;
+    memset(result->nullmap, 0, 16);
+
+    ray_t** src_slots = (ray_t**)ray_data(list);
+    ray_t** val_slots = (ray_t**)ray_data(vals);
+    ray_t** dst_slots = (ray_t**)ray_data(result);
+
+    int64_t w = 0;   /* next write position in the result */
+    int64_t p = 0;   /* next unconsumed sorted pair */
+    for (int64_t r = 0; r <= old_len; r++) {   /* r == old_len handles insertions after the last item */
+        while (p < N && pairs[2 * p] == r) {   /* emit every insertion targeting position r, in sorted order */
+            int64_t src_pos = pairs[2 * p + 1];
+            ray_t* item = broadcast ? val_slots[0] : val_slots[src_pos];
+            dst_slots[w] = item;
+            if (item) ray_retain(item);        /* result holds its own reference per insertion site */
+            w++;
+            p++;
+        }
+        if (r < old_len) {                     /* then carry over the original item at r */
+            ray_t* item = src_slots[r];
+            dst_slots[w] = item;
+            if (item) ray_retain(item);        /* input list is left untouched, so take a new reference */
+            w++;
+        }
+    }
+
+    ray_release(pair_vec);   /* scratch pairs are no longer needed */
+    return result;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/vec/list.h b/crates/rayforce-sys/vendor/rayforce/src/vec/list.h
new file mode 100644
index 0000000..20ad19c
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/vec/list.h
@@ -0,0 +1,36 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_LIST_H
+#define RAY_LIST_H
+
+/*
+ * list.h -- LIST type operations.
+ *
+ * A LIST has type = RAY_LIST (0) and stores an array of ray_t* pointers
+ * in the data region. Items are reference-counted via ray_retain/ray_release.
+ */
+
+#include <rayforce.h>
+
+#endif /* RAY_LIST_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/vec/sel.c b/crates/rayforce-sys/vendor/rayforce/src/vec/sel.c
new file mode 100644
index 0000000..e651296
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/vec/sel.c
@@ -0,0 +1,190 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include <rayforce.h>
+#include "ops/ops.h"
+#include <string.h>
+
+/* --------------------------------------------------------------------------
+ * Layout size computation
+ *
+ * Data payload after 32-byte ray_t header:
+ *   ray_sel_meta_t          16 bytes
+ *   seg_flags[n_segs]      align8(n_segs) bytes
+ *   seg_popcnt[n_segs]     align8(n_segs * 2) bytes
+ *   bits[n_words]          n_words * 8 bytes
+ * -------------------------------------------------------------------------- */
+
+static size_t sel_data_size(int64_t nrows) {
+    /* One segment per morsel of rows, one 64-bit word per 64 rows (both rounded up) */
+    uint32_t seg_count  = (uint32_t)((nrows + RAY_MORSEL_ELEMS - 1) / RAY_MORSEL_ELEMS);
+    uint32_t word_count = (uint32_t)((nrows + 63) / 64);
+
+    size_t flags_bytes  = (seg_count + 7u) & ~(size_t)7;             /* 1 byte per segment, 8-aligned */
+    size_t popcnt_bytes = ((size_t)seg_count * 2 + 7u) & ~(size_t)7; /* 2 bytes per segment, 8-aligned */
+    size_t bits_bytes   = (size_t)word_count * 8;                    /* bitmap words */
+    return sizeof(ray_sel_meta_t) + flags_bytes + popcnt_bytes + bits_bytes;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_sel_new — allocate a selection with all bits zero (no rows pass)
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_sel_new(int64_t nrows) {
+    if (nrows < 0) return ray_error("range", NULL);
+
+    const size_t payload = sel_data_size(nrows);
+    ray_t* sel = ray_alloc(payload);
+    if (!sel || RAY_IS_ERR(sel)) return sel;
+
+    sel->type = RAY_SEL;
+    sel->len  = nrows;
+
+    /* Zeroing the whole payload leaves every segment flag, popcount and
+     * bitmap word at 0 (flag 0 = RAY_SEL_NONE), i.e. no row selected. */
+    memset(ray_data(sel), 0, payload);
+    ray_sel_meta_t* meta = ray_sel_meta(sel);
+    meta->n_segs = (uint32_t)((nrows + RAY_MORSEL_ELEMS - 1) / RAY_MORSEL_ELEMS);
+    meta->total_pass = 0;
+    return sel;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_sel_recompute — rebuild seg_flags + seg_popcnt from bits[]
+ *
+ * Called after direct writes into bits[] (e.g., fused predicate evaluation).
+ * -------------------------------------------------------------------------- */
+
+void ray_sel_recompute(ray_t* sel) {
+    if (!sel || sel->type != RAY_SEL) return;   /* silently ignore anything that is not a selection */
+
+    ray_sel_meta_t* meta      = ray_sel_meta(sel);
+    uint8_t*        seg_flag  = ray_sel_flags(sel);
+    uint16_t*       seg_count = ray_sel_popcnt(sel);
+    uint64_t*       words     = ray_sel_bits(sel);
+
+    const int64_t  nrows     = sel->len;
+    const uint32_t n_segs    = meta->n_segs;
+    const uint32_t tail_bits = (uint32_t)(nrows & 63);   /* rows used in the final bitmap word */
+    int64_t passing = 0;
+
+    for (uint32_t seg = 0; seg < n_segs; seg++) {
+        /* Rows covered by this segment; it is capped at one morsel */
+        int64_t first_row = (int64_t)seg * RAY_MORSEL_ELEMS;
+        int64_t rows_here = nrows - first_row;
+        if (rows_here > RAY_MORSEL_ELEMS) rows_here = RAY_MORSEL_ELEMS;
+
+        /* Popcount every bitmap word overlapping the segment */
+        uint32_t w_begin = (uint32_t)(first_row / 64);
+        uint32_t w_end   = (uint32_t)((first_row + rows_here + 63) / 64);
+        int64_t set_bits = 0;
+        for (uint32_t w = w_begin; w < w_end; w++)
+            set_bits += __builtin_popcountll(words[w]);
+
+        /* The final word may carry stray bits past nrows: take them back out */
+        if (seg == n_segs - 1 && tail_bits) {
+            uint64_t in_range = (1ULL << tail_bits) - 1;
+            set_bits -= __builtin_popcountll(words[w_end - 1] & ~in_range);
+        }
+
+        seg_count[seg] = (uint16_t)set_bits;
+        passing += set_bits;
+
+        /* Classify: nothing passes, everything passes, or a mixture */
+        if (set_bits == 0)
+            seg_flag[seg] = RAY_SEL_NONE;
+        else if (set_bits == rows_here)
+            seg_flag[seg] = RAY_SEL_ALL;
+        else
+            seg_flag[seg] = RAY_SEL_MIX;
+    }
+
+    meta->total_pass = passing;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_sel_from_pred — convert a RAY_BOOL byte-per-row vector to RAY_SEL
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_sel_from_pred(ray_t* pred) {
+    if (!pred || RAY_IS_ERR(pred)) return pred;
+    if (pred->type != RAY_BOOL) return ray_error("type", NULL);
+
+    /* Fresh all-zero selection with one bit per predicate row */
+    int64_t nrows = pred->len;
+    ray_t* sel = ray_sel_new(nrows);
+    if (!sel || RAY_IS_ERR(sel)) return sel;
+
+    uint64_t* words = ray_sel_bits(sel);
+    const uint8_t* rows = (const uint8_t*)ray_data(pred);
+
+    /* Pack the byte-per-row predicate into 64-bit words: row i lands in
+     * bit (i % 64) of word (i / 64), and any nonzero byte counts as true.
+     * Every word takes 64 rows except possibly the last, which takes only
+     * the rows that remain, so its upper bits stay clear. */
+    int64_t n_words = (nrows + 63) / 64;
+    for (int64_t w = 0; w < n_words; w++) {
+        int64_t base  = w * 64;
+        int64_t lanes = nrows - base;
+        if (lanes > 64) lanes = 64;
+
+        uint64_t packed = 0;
+        for (int64_t b = 0; b < lanes; b++)
+            packed |= (uint64_t)(rows[base + b] != 0) << b;
+
+        words[w] = packed;
+    }
+
+    /* Derive per-segment flags and popcounts plus total_pass from the bitmap */
+    ray_sel_recompute(sel);
+
+    return sel;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_sel_and — AND two selections of equal length, returns new RAY_SEL
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_sel_and(ray_t* a, ray_t* b) {
+    /* A NULL or error operand is returned as-is (a is checked first) */
+    if (!a || RAY_IS_ERR(a)) return a;
+    if (!b || RAY_IS_ERR(b)) return b;
+    if (a->type != RAY_SEL || b->type != RAY_SEL) return ray_error("type", NULL);
+    if (a->len != b->len) return ray_error("range", NULL);
+
+    ray_t* out = ray_sel_new(a->len);
+    if (!out || RAY_IS_ERR(out)) return out;
+
+    /* Intersect the bitmaps one 64-bit word at a time */
+    uint64_t* dst = ray_sel_bits(out);
+    const uint64_t* lhs = ray_sel_bits(a);
+    const uint64_t* rhs = ray_sel_bits(b);
+    uint32_t remaining = (uint32_t)((a->len + 63) / 64);
+    while (remaining--) {
+        *dst++ = *lhs++ & *rhs++;
+    }
+
+    /* Rebuild segment flags, popcounts and total_pass for the result */
+    ray_sel_recompute(out);
+    return out;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/vec/str.c b/crates/rayforce-sys/vendor/rayforce/src/vec/str.c
new file mode 100644
index 0000000..ca76b92
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/vec/str.c
@@ -0,0 +1,90 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "str.h"
+#include <string.h>
+
+/* --------------------------------------------------------------------------
+ * SSO vs long-string detection
+ *
+ * The slen/sdata and obj fields share the same 8-byte union in ray_t.
+ * SSO: slen is 0..7, sdata contains the string bytes.
+ * Long: obj is a non-NULL pointer to a U8 vector.
+ *
+ * Distinction:
+ *   - slen 1..7 → always SSO (a 32B-aligned pointer's low byte is a
+ *     multiple of 32, never 1..7)
+ *   - slen 0 with obj == NULL → empty SSO (all 8 union bytes are zero)
+ *   - slen 0 with obj != NULL → long string (pointer's low byte is 0)
+ *   - slen > 7 → long string (pointer's low byte is 32, 64, ... or higher)
+ * -------------------------------------------------------------------------- */
+
+static bool is_sso(ray_t* s) {
+    /* An empty string is inline only when the shared obj slot is NULL too. */
+    if (s->slen == 0) return s->obj == NULL;
+    /* Lengths 1..7 are always inline; anything else is a long string. */
+    return s->slen >= 1 && s->slen <= 7;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_str_ptr
+ * -------------------------------------------------------------------------- */
+
+const char* ray_str_ptr(ray_t* s) {
+    /* NULL and error inputs have no bytes to expose. */
+    if (s == NULL || RAY_IS_ERR(s)) return NULL;
+    return is_sso(s) ? (const char*)s->sdata : (const char*)ray_data(s->obj);
+}
+
+/* --------------------------------------------------------------------------
+ * ray_str_len
+ * -------------------------------------------------------------------------- */
+
+size_t ray_str_len(ray_t* s) {
+    /* NULL and error inputs are reported as length zero. */
+    if (s == NULL || RAY_IS_ERR(s)) return 0;
+    return is_sso(s) ? (size_t)s->slen : (size_t)s->obj->len;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_str_cmp -- Compare two string atoms.
+ *
+ * Compare by memcmp of the min length, then by length difference.
+ * -------------------------------------------------------------------------- */
+
+int ray_str_cmp(ray_t* a, ray_t* b) {
+    /* NULL or error operands on either side compare as equal (0). */
+    if (!a || RAY_IS_ERR(a)) return 0;
+    if (!b || RAY_IS_ERR(b)) return 0;
+
+    const char* lhs = ray_str_ptr(a);
+    const char* rhs = ray_str_ptr(b);
+    size_t lhs_len = ray_str_len(a);
+    size_t rhs_len = ray_str_len(b);
+    size_t shared = lhs_len < rhs_len ? lhs_len : rhs_len;
+
+    if (shared > 0) {  /* memcmp is skipped for an empty shared prefix */
+        int order = memcmp(lhs, rhs, shared);
+        if (order != 0) return order;
+    }
+    return (lhs_len > rhs_len) - (lhs_len < rhs_len);  /* shorter sorts first */
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/vec/str.h b/crates/rayforce-sys/vendor/rayforce/src/vec/str.h
new file mode 100644
index 0000000..192f8e2
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/vec/str.h
@@ -0,0 +1,103 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_STR_H
+#define RAY_STR_H
+
+/*
+ * str.h -- String helper functions.
+ *
+ * String atoms use SSO for <= 7 bytes (stored in sdata[7] with slen).
+ * Long strings store data in a U8 vector pointed to by obj.
+ */
+
+#include <rayforce.h>
+#include <string.h>
+
+/* ===== Inline String Element (16 bytes): two overlaid views of one slot ===== */
+
+typedef union {
+    struct { uint32_t len; char     data[12]; };      /* inline: len <= 12, bytes held in data[] */
+    struct { uint32_t len_; char    prefix[4];        /* pooled: len > 12, bytes at pool base + pool_off */
+             uint32_t pool_off; uint32_t _pad; };     /* len_ overlays len; prefix overlays data[0..3] */
+} ray_str_t;
+
+#define RAY_STR_INLINE_MAX 12
+
+static inline bool ray_str_is_inline(const ray_str_t* s) {
+    return !(s->len > RAY_STR_INLINE_MAX);  /* up to 12 bytes fit in data[] */
+}
+
+/* Return the bytes of element s: "" when empty, s->data when inline,
+ * otherwise pool_base + pool_off (pool_base must then be non-NULL). */
+static inline const char* ray_str_t_ptr(const ray_str_t* s, const char* pool_base) {
+    if (s->len == 0) return "";
+    if (s->len <= RAY_STR_INLINE_MAX) return s->data;
+    assert(pool_base != NULL && "ray_str_t_ptr: pooled string requires non-NULL pool_base");
+    return pool_base + s->pool_off;
+}
+
+/* Equality: reject on len, then compare inline data[] or pooled prefix[] +
+ * pool bytes.  pool_a/pool_b: pool bases of a and b (unused when inline). */
+static inline bool ray_str_t_eq(const ray_str_t* a, const char* pool_a,
+                               const ray_str_t* b, const char* pool_b) {
+    const uint32_t n = a->len;
+    if (n != b->len) return false;
+    if (n == 0) return true;
+    if (n <= RAY_STR_INLINE_MAX) return memcmp(a->data, b->data, n) == 0;
+    if (memcmp(a->prefix, b->prefix, 4) != 0) return false;  /* cheap early reject */
+    const char* bytes_a = pool_a + a->pool_off;
+    const char* bytes_b = pool_b + b->pool_off;
+    return memcmp(bytes_a, bytes_b, n) == 0;
+}
+
+/* Ordering: memcmp over the shared length, then the shorter element sorts
+ * first.  pool_a/pool_b: pool bases of a and b (unused when inline). */
+static inline int ray_str_t_cmp(const ray_str_t* a, const char* pool_a,
+                               const ray_str_t* b, const char* pool_b) {
+    const char* lhs = ray_str_t_ptr(a, pool_a);
+    const char* rhs = ray_str_t_ptr(b, pool_b);
+    uint32_t shared = b->len < a->len ? b->len : a->len;
+    int order = memcmp(lhs, rhs, shared);
+    if (order == 0) order = (a->len > b->len) - (a->len < b->len);
+    return order;
+}
+
+/* FNV-1a hash over the bytes of element s; an empty element hashes to a
+ * fixed constant.  pool_base: pool base pointer, asserted non-NULL for
+ * pooled elements and not read for inline ones. */
+static inline uint64_t ray_str_t_hash(const ray_str_t* s, const char* pool_base) {
+    if (s->len == 0) return 0x9E3779B97F4A7C15ULL; /* golden ratio constant for empty */
+    const char* bytes = s->data;
+    if (!ray_str_is_inline(s)) {
+        assert(pool_base != NULL && "ray_str_t_hash: pooled string requires non-NULL pool_base");
+        bytes = pool_base + s->pool_off;
+    }
+    uint64_t acc = 0xcbf29ce484222325ULL;  /* FNV-1a 64-bit offset basis */
+    const char* end = bytes + s->len;
+    for (const char* c = bytes; c != end; c++)
+        acc = (acc ^ (uint64_t)(unsigned char)*c) * 0x100000001b3ULL;  /* xor, then prime */
+    return acc;
+}
+
+#endif /* RAY_STR_H */
diff --git a/crates/rayforce-sys/vendor/rayforce/src/vec/vec.c b/crates/rayforce-sys/vendor/rayforce/src/vec/vec.c
new file mode 100644
index 0000000..110a6e3
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/vec/vec.c
@@ -0,0 +1,1361 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#include "vec.h"
+#include "core/platform.h"
+#include "mem/heap.h"
+#include "table/sym.h"
+#include "vec/embedding.h"
+#include "vec/str.h"
+#include "ops/idxop.h"
+#include <string.h>
+#include <stdlib.h>
+
+/* qsort comparator over int64_t pairs laid out as {idx, original_k}, used by
+ * ray_vec_insert_many.  Orders by idx ascending and falls back to
+ * original_k on equal idx, which keeps the sort stable. */
+static int pair_cmp_idx_then_k(const void* a, const void* b) {
+    const int64_t* lhs = a;
+    const int64_t* rhs = b;
+    int field = (lhs[0] == rhs[0]) ? 1 : 0;  /* compare k only when idx ties */
+    return (lhs[field] > rhs[field]) - (lhs[field] < rhs[field]);
+}
+
+/* Public bitmap accessor — resolves slice / ext / inline / HAS_INDEX storage
+ * to one byte pointer; returns NULL when no bitmap applies.  See vec.h. */
+const uint8_t* ray_vec_nullmap_bytes(const ray_t* v,
+                                     int64_t* bit_offset_out,
+                                     int64_t* len_bits_out) {
+    if (bit_offset_out) *bit_offset_out = 0;
+    if (len_bits_out)   *len_bits_out   = 0;
+    if (!v) return NULL;
+
+    /* Slice: HAS_NULLS / HAS_INDEX live on the parent — redirect first,
+     * THEN test for nulls.  Reading v->attrs & HAS_NULLS here would
+     * incorrectly drop a sliced view of a nullable column. */
+    const ray_t* target = v;
+    int64_t off = 0;
+    if (v->attrs & RAY_ATTR_SLICE) {
+        target = v->slice_parent;
+        off = v->slice_offset;
+        if (!target) return NULL;
+    }
+    if (!(target->attrs & RAY_ATTR_HAS_NULLS)) return NULL;
+
+    if (bit_offset_out) *bit_offset_out = off;  /* slice_offset for slices, else 0 */
+
+    if (target->attrs & RAY_ATTR_HAS_INDEX) {
+        const ray_index_t* ix = ray_index_payload(target->index);
+        if (ix->saved_attrs & RAY_ATTR_NULLMAP_EXT) {
+            ray_t* ext;
+            memcpy(&ext, &ix->saved_nullmap[0], sizeof(ext));  /* saved bytes hold the ext pointer */
+            if (len_bits_out) *len_bits_out = ext->len * 8;
+            return (const uint8_t*)ray_data(ext);
+        }
+        if (len_bits_out) *len_bits_out = 128;  /* 16 saved nullmap bytes */
+        return ix->saved_nullmap;
+    }
+    if (target->attrs & RAY_ATTR_NULLMAP_EXT) {
+        if (len_bits_out) *len_bits_out = target->ext_nullmap->len * 8;
+        return (const uint8_t*)ray_data(target->ext_nullmap);
+    }
+    /* Inline path: RAY_STR's bytes 0-15 hold str_pool/str_ext_null, not
+     * bits — so RAY_STR with HAS_NULLS must always have NULLMAP_EXT. */
+    if (target->type == RAY_STR) return NULL;
+    if (len_bits_out) *len_bits_out = 128;  /* 16 inline nullmap bytes */
+    return target->nullmap;
+}
+
+/* Two-out-param nullmap lookup used inside vec.c.  Returns the 16-byte
+ * buffer when the bits live inline; otherwise returns NULL and stores the
+ * external bitmap vector in *ext_nullmap_ref (left NULL on the inline path). */
+static inline const uint8_t* vec_inline_nullmap(const ray_t* v, ray_t** ext_nullmap_ref) {
+    *ext_nullmap_ref = NULL;
+
+    if (!(v->attrs & RAY_ATTR_HAS_INDEX)) {
+        if (!(v->attrs & RAY_ATTR_NULLMAP_EXT)) return v->nullmap;
+        *ext_nullmap_ref = v->ext_nullmap;
+        return NULL;
+    }
+
+    /* Indexed: the saved nullmap bytes and attrs are read from the index payload. */
+    const ray_index_t* ix = ray_index_payload(v->index);
+    if (!(ix->saved_attrs & RAY_ATTR_NULLMAP_EXT)) return ix->saved_nullmap;
+    /* The first sizeof(ray_t*) saved bytes are read back as the ext pointer. */
+    ray_t* ext;
+    memcpy(&ext, &ix->saved_nullmap[0], sizeof(ext));
+    *ext_nullmap_ref = ext;
+    return NULL;
+}
+
+/* Reports whether v is flagged as containing nulls.  HAS_NULLS stays on the
+ * parent across index attach/detach (see attach_finalize), so one flag test
+ * covers indexed and non-indexed vectors alike. */
+static inline bool vec_any_nulls(const ray_t* v) {
+    return (v->attrs & RAY_ATTR_HAS_NULLS) ? true : false;
+}
+
+/* Detach v's attached index in place and restore the 16 nullmap bytes saved
+ * in it; no-op without HAS_INDEX.  Caller must hold a unique ref (rc==1) on
+ * `v` itself.  Mutation paths call this to invalidate the (now stale) index
+ * before writing.  HAS_NULLS is preserved through attachment, so only
+ * NULLMAP_EXT (cleared at attach time) is reinstated from saved_attrs.
+ *
+ * Shared-index case: `v` may share its index ray_t with another vec
+ * (e.g. after ray_cow followed by ray_retain_owned_refs, both copies
+ * point at the same RAY_INDEX with rc==2).  We must NOT clobber the
+ * saved-nullmap bytes inside a shared index — the other holder still
+ * reads them.  Detect rc>1 and copy the saved pointers via
+ * ray_index_retain_saved instead of moving them out. */
+static inline void vec_drop_index_inplace(ray_t* v) {
+    if (!(v->attrs & RAY_ATTR_HAS_INDEX)) return;
+    ray_t* idx = v->index;
+    ray_index_t* ix = ray_index_payload(idx);
+    uint8_t saved = ix->saved_attrs;  /* read now: the sole-owner path zeroes it below */
+    bool shared = ray_atomic_load(&idx->rc) > 1;
+
+    if (shared) {
+        /* Take our own retained references to the saved-pointer slots
+         * (ext_nullmap / str_pool / sym_dict etc.) so the bytes we copy
+         * into v->nullmap are validly owned by v.  Leave the index's
+         * snapshot intact for the other holder. */
+        ray_index_retain_saved(ix);
+    }
+    memcpy(v->nullmap, ix->saved_nullmap, 16);  /* restore v's 16 nullmap bytes */
+    if (!shared) {
+        /* Sole owner: about to release idx, so neutralize its snapshot
+         * to prevent ray_index_release_saved from double-releasing the
+         * pointers we just transferred to v. */
+        memset(ix->saved_nullmap, 0, 16);
+        ix->saved_attrs = 0;
+    }
+    v->attrs &= (uint8_t)~RAY_ATTR_HAS_INDEX;
+    if (saved & RAY_ATTR_NULLMAP_EXT) v->attrs |= RAY_ATTR_NULLMAP_EXT;
+    ray_release(idx);  /* drop v's reference to the index */
+}
+
+/* --------------------------------------------------------------------------
+ * Capacity helpers
+ *
+ * A vector's capacity is determined by its buddy order:
+ *   capacity = (2^order - 32) / elem_size
+ * When len reaches capacity, realloc to next power-of-2 data size.
+ * -------------------------------------------------------------------------- */
+
+static int64_t vec_capacity(ray_t* vec) {
+    uint8_t elem_bytes = ray_sym_elem_size(vec->type, vec->attrs);
+    if (elem_bytes == 0) return 0;  /* avoid dividing by zero */
+    /* The block is 2^order bytes, of which the first 32 are the ray_t header. */
+    size_t payload_bytes = ((size_t)1 << vec->order) - 32;
+    return (int64_t)(payload_bytes / elem_bytes);
+}
+
+/* --------------------------------------------------------------------------
+ * ray_vec_new
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_vec_new(int8_t type, int64_t capacity) {
+    if (type <= 0 || type >= RAY_TYPE_COUNT) return ray_error("type", NULL);
+    if (type == RAY_SYM) return ray_sym_vec_new(RAY_SYM_W64, capacity);  /* default: global sym IDs */
+    if (capacity < 0) return ray_error("range", NULL);
+
+    /* Payload size in bytes, with a divide-back check for size_t overflow. */
+    uint8_t elem_bytes = ray_elem_size(type);
+    size_t payload_bytes = (size_t)capacity * elem_bytes;
+    if (elem_bytes > 1 && payload_bytes / elem_bytes != (size_t)capacity)
+        return ray_error("oom", NULL);
+
+    ray_t* out = ray_alloc(payload_bytes);
+    if (out == NULL)
+        return ray_error("oom", "vec_new(type=%d, cap=%lld): %zu bytes",
+                         (int)type, (long long)capacity, payload_bytes);
+    if (RAY_IS_ERR(out)) return out;
+
+    /* Start empty: no elements, no attribute flags, zeroed 16-byte nullmap. */
+    out->type = type;
+    out->len = 0;
+    out->attrs = 0;
+    memset(out->nullmap, 0, 16);
+    if (type == RAY_STR) out->str_pool = NULL;
+    return out;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_sym_vec_new — create a RAY_SYM vector with adaptive index width
+ *
+ * sym_width: RAY_SYM_W8, RAY_SYM_W16, RAY_SYM_W32, or RAY_SYM_W64
+ * capacity:  number of elements (rows)
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_sym_vec_new(uint8_t sym_width, int64_t capacity) {
+    if (sym_width & ~RAY_SYM_W_MASK) return ray_error("type", NULL);  /* width bits only */
+    if (capacity < 0) return ray_error("range", NULL);
+
+    /* Payload size in bytes, with a divide-back check for size_t overflow. */
+    uint8_t elem_bytes = (uint8_t)RAY_SYM_ELEM(sym_width);
+    size_t payload_bytes = (size_t)capacity * elem_bytes;
+    if (elem_bytes > 1 && payload_bytes / elem_bytes != (size_t)capacity)
+        return ray_error("oom", NULL);
+
+    ray_t* out = ray_alloc(payload_bytes);
+    if (out == NULL)
+        return ray_error("oom", "sym_vec_new(width=%u, cap=%lld): %zu bytes",
+                         (unsigned)sym_width, (long long)capacity, payload_bytes);
+    if (RAY_IS_ERR(out)) return out;
+
+    out->type = RAY_SYM;
+    out->len = 0;
+    out->attrs = sym_width;  /* lower 2 bits encode width */
+    memset(out->nullmap, 0, 16);
+    return out;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_vec_append — append a copy of *elem; returns the resulting vector
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_vec_append(ray_t* vec, const void* elem) {
+    if (!vec || RAY_IS_ERR(vec)) return vec;  /* NULL / error inputs pass through */
+    if (vec->type <= 0 || vec->type >= RAY_TYPE_COUNT)
+        return ray_error("type", NULL);
+    if (vec->type == RAY_STR) return ray_error("type", NULL);  /* RAY_STR is rejected here */
+
+    /* COW: if shared, copy first; `original` is kept for the failure paths */
+    ray_t* original = vec;
+    vec = ray_cow(vec);
+    if (!vec || RAY_IS_ERR(vec)) return vec;
+
+    /* Append changes len + writes data; any attached index is now stale. */
+    vec_drop_index_inplace(vec);
+
+    uint8_t esz = ray_sym_elem_size(vec->type, vec->attrs);
+    int64_t cap = vec_capacity(vec);
+
+    /* Grow if needed */
+    if (vec->len >= cap) {
+        size_t new_data_size = (size_t)(vec->len + 1) * esz;
+        /* Round up to next power of 2 block (never below 32 bytes) */
+        if (new_data_size < 32) new_data_size = 32;
+        else {
+            size_t s = 32;
+            while (s < new_data_size) {
+                if (s > SIZE_MAX / 2) goto fail;  /* doubling would overflow size_t */
+                s *= 2;
+            }
+            new_data_size = s;
+        }
+        ray_t* new_vec = ray_scratch_realloc(vec, new_data_size);
+        if (!new_vec || RAY_IS_ERR(new_vec)) {
+            if (vec != original) ray_release(vec);  /* release only a block ray_cow returned */
+            return new_vec ? new_vec : ray_error("oom", NULL);
+        }
+        vec = new_vec;
+    }
+
+    /* Append element: copy esz bytes from elem into slot len, then bump len */
+    char* dst = (char*)ray_data(vec) + vec->len * esz;
+    memcpy(dst, elem, esz);
+    vec->len++;
+
+    return vec;
+
+fail:
+    if (vec != original) ray_release(vec);  /* release only a block ray_cow returned */
+    return ray_error("oom", NULL);
+}
+
+/* --------------------------------------------------------------------------
+ * ray_vec_set
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_vec_set(ray_t* vec, int64_t idx, const void* elem) {
+    if (vec == NULL || RAY_IS_ERR(vec)) return vec;
+    if (vec->type == RAY_STR) return ray_error("type", NULL);
+    if (idx < 0 || idx >= vec->len) return ray_error("range", NULL);
+
+    /* Shared vectors go through ray_cow before any write. */
+    vec = ray_cow(vec);
+    if (vec == NULL || RAY_IS_ERR(vec)) return vec;
+
+    /* Overwriting a slot makes any attached accelerator index stale,
+     * so it is dropped before the write. */
+    vec_drop_index_inplace(vec);
+
+    /* Copy one element's worth of bytes from elem into slot idx. */
+    uint8_t width = ray_sym_elem_size(vec->type, vec->attrs);
+    memcpy((char*)ray_data(vec) + idx * width, elem, width);
+
+    return vec;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_vec_get
+ * -------------------------------------------------------------------------- */
+
+void* ray_vec_get(ray_t* vec, int64_t idx) {
+    if (vec == NULL || RAY_IS_ERR(vec)) return NULL;
+    if (vec->type == RAY_STR) return NULL;
+    /* Bounds are always checked against the vector (or view) being indexed. */
+    if (idx < 0 || idx >= vec->len) return NULL;
+
+    /* A slice holds no payload: read from its parent, shifted by slice_offset. */
+    ray_t* owner = vec;
+    int64_t pos = idx;
+    if (vec->attrs & RAY_ATTR_SLICE) {
+        owner = vec->slice_parent;
+        pos += vec->slice_offset;
+    }
+
+    uint8_t width = ray_sym_elem_size(owner->type, owner->attrs);
+    return (char*)ray_data(owner) + pos * width;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_vec_slice  (zero-copy view)
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_vec_slice(ray_t* vec, int64_t offset, int64_t len) {
+    if (vec == NULL || RAY_IS_ERR(vec)) return vec;
+    if (offset < 0 || len < 0 || offset > vec->len || len > vec->len - offset)
+        return ray_error("range", NULL);
+
+    /* Views never chain: slicing a slice targets that slice's own parent,
+     * with the two offsets added together. */
+    const bool nested = (vec->attrs & RAY_ATTR_SLICE) != 0;
+    ray_t* owner = nested ? vec->slice_parent : vec;
+    int64_t start = nested ? vec->slice_offset + offset : offset;
+
+    /* The view itself is a header-only block (zero payload bytes). */
+    ray_t* view = ray_alloc(0);
+    if (view == NULL || RAY_IS_ERR(view)) return view;
+
+    /* Mirror the owner's type and SYM width bits; flag the block as a slice. */
+    view->type = owner->type;
+    view->attrs = RAY_ATTR_SLICE | (owner->attrs & RAY_SYM_W_MASK);
+    view->len = len;
+    view->slice_parent = owner;
+    view->slice_offset = start;
+
+    /* The view holds a reference on its owner for as long as it exists. */
+    ray_retain(owner);
+
+    /* Handed back with len elements starting at `start` in the owner. */
+    return view;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_vec_concat — build a new vector holding a's elements followed by b's
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_vec_concat(ray_t* a, ray_t* b) {
+    if (!a || RAY_IS_ERR(a)) return a;  /* NULL / error inputs pass through */
+    if (!b || RAY_IS_ERR(b)) return b;
+    if (a->type != b->type)
+        return ray_error("type", NULL);
+
+    if (a->type == RAY_STR) {
+        int64_t total_len = a->len + b->len;
+        if (total_len < a->len) return ray_error("oom", NULL);
+
+        ray_t* result = ray_vec_new(RAY_STR, total_len);
+        if (!result || RAY_IS_ERR(result)) return result;
+        result->len = total_len;
+
+        ray_str_t* dst = (ray_str_t*)ray_data(result);
+
+        /* Resolve a's data (may be a slice) */
+        const ray_str_t* a_elems = (a->attrs & RAY_ATTR_SLICE)
+            ? &((const ray_str_t*)ray_data(a->slice_parent))[a->slice_offset]
+            : (const ray_str_t*)ray_data(a);
+        ray_t* a_pool_owner = (a->attrs & RAY_ATTR_SLICE) ? a->slice_parent : a;
+
+        /* Resolve b's data (may be a slice) */
+        const ray_str_t* b_elems = (b->attrs & RAY_ATTR_SLICE)
+            ? &((const ray_str_t*)ray_data(b->slice_parent))[b->slice_offset]
+            : (const ray_str_t*)ray_data(b);
+        ray_t* b_pool_owner = (b->attrs & RAY_ATTR_SLICE) ? b->slice_parent : b;
+
+        /* Copy a's elements as-is */
+        memcpy(dst, a_elems, (size_t)a->len * sizeof(ray_str_t));
+
+        /* Merge pools: a's pool + b's pool */
+        int64_t a_pool_size = (a_pool_owner->str_pool) ? a_pool_owner->str_pool->len : 0;
+        int64_t b_pool_size = (b_pool_owner->str_pool) ? b_pool_owner->str_pool->len : 0;
+        int64_t total_pool = a_pool_size + b_pool_size;
+
+        /* Guard: total pool must fit in uint32_t for pool_off rebasing */
+        if (total_pool > (int64_t)UINT32_MAX) {
+            ray_release(result);
+            return ray_error("range", NULL);
+        }
+
+        if (total_pool > 0) {
+            result->str_pool = ray_alloc((size_t)total_pool);
+            if (!result->str_pool || RAY_IS_ERR(result->str_pool)) {
+                result->str_pool = NULL;
+                ray_release(result);
+                return ray_error("oom", NULL);
+            }
+            result->str_pool->type = RAY_U8;
+            result->str_pool->len = total_pool;
+            char* pool_dst = (char*)ray_data(result->str_pool);
+            if (a_pool_size > 0)
+                memcpy(pool_dst, ray_data(a_pool_owner->str_pool), (size_t)a_pool_size);
+            if (b_pool_size > 0)
+                memcpy(pool_dst + a_pool_size, ray_data(b_pool_owner->str_pool), (size_t)b_pool_size);
+        }
+
+        /* Copy b's elements, rebasing pool offsets */
+        for (int64_t i = 0; i < b->len; i++) {
+            dst[a->len + i] = b_elems[i];
+            if (!ray_str_is_inline(&b_elems[i]) && b_elems[i].len > 0) {
+                dst[a->len + i].pool_off += (uint32_t)a_pool_size;
+            }
+        }
+
+        /* Propagate null bitmaps from a and b.
+         * Slices don't carry RAY_ATTR_HAS_NULLS — check RAY_ATTR_SLICE too. */
+        if ((a->attrs & (RAY_ATTR_HAS_NULLS | RAY_ATTR_SLICE)) ||
+            (b->attrs & (RAY_ATTR_HAS_NULLS | RAY_ATTR_SLICE))) {
+            for (int64_t i = 0; i < a->len; i++) {
+                if (ray_vec_is_null((ray_t*)a, i)) {
+                    ray_err_t err = ray_vec_set_null_checked(result, i, true);
+                    if (err != RAY_OK) { ray_release(result); return ray_error(ray_err_code_str(err), NULL); }
+                }
+            }
+            for (int64_t i = 0; i < b->len; i++) {
+                if (ray_vec_is_null((ray_t*)b, i)) {
+                    ray_err_t err = ray_vec_set_null_checked(result, a->len + i, true);
+                    if (err != RAY_OK) { ray_release(result); return ray_error(ray_err_code_str(err), NULL); }
+                }
+            }
+        }
+
+        return result;
+    }
+
+    uint8_t a_esz = ray_sym_elem_size(a->type, a->attrs);
+    uint8_t b_esz = ray_sym_elem_size(b->type, b->attrs);
+    /* Use the wider of the two widths for SYM columns — carry only width bits,
+     * not flags like RAY_ATTR_SLICE or RAY_ATTR_HAS_NULLS from inputs. */
+    uint8_t out_attrs = (a_esz >= b_esz) ? (a->attrs & RAY_SYM_W_MASK) : (b->attrs & RAY_SYM_W_MASK);
+    uint8_t esz = (a_esz >= b_esz) ? a_esz : b_esz;
+
+    int64_t total_len = a->len + b->len;
+    if (total_len < a->len) return ray_error("oom", NULL); /* overflow */
+    size_t data_size = (size_t)total_len * esz;
+    if (esz > 1 && data_size / esz != (size_t)total_len)
+        return ray_error("oom", NULL); /* multiplication overflow */
+
+    ray_t* result = ray_alloc(data_size);
+    if (!result || RAY_IS_ERR(result)) return result;
+
+    result->type = a->type;
+    result->len = total_len;
+    result->attrs = out_attrs;
+    memset(result->nullmap, 0, 16);
+
+    /* Payload bases: a slice's elements live in slice_parent at slice_offset. */
+    void* a_data = (a->attrs & RAY_ATTR_SLICE) ?
+        ((char*)ray_data(a->slice_parent) + a->slice_offset * a_esz) :
+        ray_data(a);
+    void* b_data = (b->attrs & RAY_ATTR_SLICE) ?
+        ((char*)ray_data(b->slice_parent) + b->slice_offset * b_esz) :
+        ray_data(b);
+
+    if (a->type == RAY_SYM && a_esz != b_esz) {
+        /* SYM with mismatched widths: widen element-by-element */
+        void* dst = ray_data(result);
+        for (int64_t i = 0; i < a->len; i++) {
+            int64_t val = ray_read_sym(a_data, i, a->type, a->attrs);
+            ray_write_sym(dst, i, (uint64_t)val, result->type, result->attrs);
+        }
+        for (int64_t i = 0; i < b->len; i++) {
+            int64_t val = ray_read_sym(b_data, i, b->type, b->attrs);
+            ray_write_sym(dst, a->len + i, (uint64_t)val, result->type, result->attrs);
+        }
+    } else {
+        /* Same width (a_esz == b_esz == esz): fast memcpy path */
+        memcpy(ray_data(result), a_data, (size_t)a->len * esz);
+        memcpy((char*)ray_data(result) + (size_t)a->len * esz, b_data, (size_t)b->len * esz);
+    }
+
+    /* Propagate null bitmaps from a and b.
+     * Slices don't carry RAY_ATTR_HAS_NULLS — check RAY_ATTR_SLICE too. */
+    if ((a->attrs & (RAY_ATTR_HAS_NULLS | RAY_ATTR_SLICE)) ||
+        (b->attrs & (RAY_ATTR_HAS_NULLS | RAY_ATTR_SLICE))) {
+        for (int64_t i = 0; i < a->len; i++) {
+            if (ray_vec_is_null((ray_t*)a, i)) {
+                ray_err_t err = ray_vec_set_null_checked(result, i, true);
+                if (err != RAY_OK) { ray_release(result); return ray_error(ray_err_code_str(err), NULL); }
+            }
+        }
+        for (int64_t i = 0; i < b->len; i++) {
+            if (ray_vec_is_null((ray_t*)b, i)) {
+                ray_err_t err = ray_vec_set_null_checked(result, a->len + i, true);
+                if (err != RAY_OK) { ray_release(result); return ray_error(ray_err_code_str(err), NULL); }
+            }
+        }
+    }
+
+    /* LIST/TABLE columns hold child pointers — retain them */
+    if (a->type == RAY_LIST || a->type == RAY_TABLE) {
+        ray_t** ptrs = (ray_t**)ray_data(result);
+        for (int64_t i = 0; i < total_len; i++) {
+            if (ptrs[i]) ray_retain(ptrs[i]);
+        }
+    }
+
+    return result;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_vec_insert_at — insert a single element at position idx.
+ *
+ * idx is a pre-insertion position in [0, vec->len]. idx == vec->len is
+ * equivalent to append. Does not support RAY_STR (use ray_str_vec_insert_at).
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_vec_insert_at(ray_t* vec, int64_t idx, const void* elem) {
+    /* Inserts a copy of *elem before position idx (idx == len appends) and
+     * returns the resulting vector, which may be a different block than the
+     * one passed in.  NULL / error inputs are returned unchanged; an
+     * invalid type tag or RAY_STR yields a "type" error, an out-of-range
+     * idx a "range" error, and growth or bitmap failures an "oom"
+     * error. */
+    if (!vec || RAY_IS_ERR(vec)) return vec;
+    if (vec->type <= 0 || vec->type >= RAY_TYPE_COUNT)
+        return ray_error("type", NULL);
+    if (vec->type == RAY_STR) return ray_error("type", NULL);
+    if (idx < 0 || idx > vec->len) return ray_error("range", NULL);
+
+    /* Shared inputs go through ray_cow first.  `input` is kept so the
+     * failure paths release vec only when it differs from the input. */
+    ray_t* input = vec;
+    vec = ray_cow(vec);
+    if (!vec || RAY_IS_ERR(vec)) return vec;
+
+    /* len, data and nullmap are all about to change, so an attached
+     * accelerator index would be stale: drop it first. */
+    vec_drop_index_inplace(vec);
+
+    uint8_t width = ray_sym_elem_size(vec->type, vec->attrs);
+    int64_t room = vec_capacity(vec);
+
+    /* Full block: grow to the smallest power-of-two size (at least 32)
+     * that holds len + 1 elements. */
+    if (vec->len >= room) {
+        size_t needed = (size_t)(vec->len + 1) * width;
+        size_t grown = 32;
+        while (grown < needed) {
+            if (grown > SIZE_MAX / 2) goto fail_oom;
+            grown *= 2;
+        }
+        ray_t* moved = ray_scratch_realloc(vec, grown);
+        if (!moved || RAY_IS_ERR(moved)) {
+            if (vec != input) ray_release(vec);
+            return moved ? moved : ray_error("oom", NULL);
+        }
+        vec = moved;
+    }
+
+    int64_t count = vec->len;
+    char* slot = (char*)ray_data(vec) + (size_t)idx * width;
+
+    /* Open a gap: elements [idx..count) move up by one slot. */
+    if (idx < count) {
+        memmove(slot + width,
+                slot,
+                (size_t)(count - idx) * width);
+    }
+
+    /* Drop the new element into the gap. */
+    memcpy(slot, elem, width);
+
+    vec->len = count + 1;
+
+    /* Mirror the shift in the null bitmap: bit i moves to i + 1, walking
+     * from the tail so no bit is overwritten before it has been read. */
+    if (vec->attrs & RAY_ATTR_HAS_NULLS) {
+        for (int64_t i = count - 1; i >= idx; i--) {
+            bool bit = ray_vec_is_null(vec, i);
+            ray_err_t rc = ray_vec_set_null_checked(vec, i + 1, bit);
+            if (rc != RAY_OK) goto fail_oom;
+        }
+        /* The inserted element itself is never null. */
+        ray_err_t rc = ray_vec_set_null_checked(vec, idx, false);
+        if (rc != RAY_OK) goto fail_oom;
+    }
+
+    return vec;
+
+fail_oom:
+    /* Release vec only when it is not the block the caller passed in. */
+    if (vec != input) ray_release(vec);
+    return ray_error("oom", NULL);
+}
+
+/* --------------------------------------------------------------------------
+ * ray_vec_insert_vec_at — splice src into vec at position idx.
+ *
+ * Built from slice + concat so SYM-width widening, RAY_STR pool merging and
+ * null-bit propagation are inherited from ray_vec_concat. The result is
+ * always a fresh block; the caller releases vec when done with it.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_vec_insert_vec_at(ray_t* vec, int64_t idx, ray_t* src) {
+    if (!vec || RAY_IS_ERR(vec)) return vec;
+    if (!src || RAY_IS_ERR(src)) return src;
+    if (vec->type != src->type) return ray_error("type", NULL);
+    if (idx < 0 || idx > vec->len) return ray_error("range", NULL);
+
+    /* Either end: no split needed, a single concat does it. */
+    if (idx == vec->len) return ray_vec_concat(vec, src);
+    if (idx == 0) return ray_vec_concat(src, vec);
+
+    /* Interior: vec[0, idx) ++ src ++ vec[idx, len). */
+    ray_t* front = ray_vec_slice(vec, 0, idx);
+    if (!front || RAY_IS_ERR(front)) return front;
+    ray_t* back = ray_vec_slice(vec, idx, vec->len - idx);
+    if (!back || RAY_IS_ERR(back)) { ray_release(front); return back; }
+
+    ray_t* joined = ray_vec_concat(front, src);
+    ray_release(front);
+    ray_t* result = joined;
+    if (joined && !RAY_IS_ERR(joined)) {
+        result = ray_vec_concat(joined, back);
+        ray_release(joined);
+    }
+    ray_release(back);
+    return result;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_vec_insert_many — insert N values at N pre-insertion positions.
+ *
+ * idxs: I64 vec (plain or slice) of length N, each idx in [0, vec->len].
+ * vals: either a matching atom (broadcast) or same-type vec of length N
+ *       (parallel) or length 1 (broadcast).
+ *
+ * For ties in idxs, the original input order is preserved (stable sort).
+ * Returns a fresh block; caller releases vec if no longer needed.
+ * RAY_STR targets are rejected — use ray_vec_insert_vec_at in a loop instead.
+ * For RAY_SYM, the source width must match the destination width.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_vec_insert_many(ray_t* vec, ray_t* idxs, ray_t* vals) {
+    if (!vec || RAY_IS_ERR(vec)) return vec;
+    if (!idxs || RAY_IS_ERR(idxs)) return idxs;
+    if (!vals || RAY_IS_ERR(vals)) return vals;
+    if (vec->type <= 0 || vec->type >= RAY_TYPE_COUNT) return ray_error("type", NULL);
+    if (vec->type == RAY_STR) return ray_error("type", NULL);
+    if (idxs->type != RAY_I64) return ray_error("type", NULL);
+
+    int64_t N = idxs->len;
+    int64_t old_len = vec->len;
+    uint8_t esz = ray_sym_elem_size(vec->type, vec->attrs);
+
+    /* Fast path: N == 0 returns a fresh retain */
+    if (N == 0) { ray_retain(vec); return vec; }
+
+    /* Validate indices (a sliced idxs keeps its data in the parent block) */
+    const int64_t* idx_arr = (idxs->attrs & RAY_ATTR_SLICE)
+        ? (const int64_t*)ray_data(idxs->slice_parent) + idxs->slice_offset
+        : (const int64_t*)ray_data(idxs);
+    for (int64_t k = 0; k < N; k++)
+        if (idx_arr[k] < 0 || idx_arr[k] > old_len) return ray_error("range", NULL);
+
+    /* Classify vals: atom (broadcast) vs vec (parallel or singleton broadcast) */
+    int broadcast;
+    if (vals->type < 0) {
+        if (vals->type != -vec->type) return ray_error("type", NULL);
+        broadcast = 1;
+    } else if (vals->type == vec->type) {
+        /* SYM width must match — dispatcher should widen upstream */
+        if (vec->type == RAY_SYM &&
+            (vals->attrs & RAY_SYM_W_MASK) != (vec->attrs & RAY_SYM_W_MASK))
+            return ray_error("type", NULL);
+        if (vals->len == 1) broadcast = 1;
+        else if (vals->len == N) broadcast = 0;
+        else return ray_error("range", NULL);
+    } else {
+        return ray_error("type", NULL);
+    }
+
+    /* Build sort buffer as I64 vec of 2*N slots: [idx0, src0, idx1, src1, ...] */
+    ray_t* pair_vec = ray_vec_new(RAY_I64, 2 * N);
+    if (!pair_vec || RAY_IS_ERR(pair_vec)) return ray_error("oom", NULL);
+    pair_vec->len = 2 * N;
+    int64_t* pairs = (int64_t*)ray_data(pair_vec);
+    for (int64_t k = 0; k < N; k++) {
+        pairs[2 * k]     = idx_arr[k];
+        pairs[2 * k + 1] = k;
+    }
+
+    /* Stable sort the (idx, original_k) pairs by idx.  qsort isn't
+     * inherently stable, but a compound comparator on (idx, k) — where
+     * k is the original position — gives the same total order as a
+     * stable sort by idx alone.  Replaces an O(N^2) insertion sort
+     * that hangs for bulk-set updates with thousands+ of indices. */
+    qsort(pairs, (size_t)N, 2 * sizeof(int64_t), pair_cmp_idx_then_k);
+
+    /* Allocate result */
+    int64_t new_len = old_len + N;
+    if (new_len < old_len) { ray_release(pair_vec); return ray_error("oom", NULL); }
+    size_t data_size = (size_t)new_len * esz;
+    if (esz > 1 && data_size / esz != (size_t)new_len) {
+        ray_release(pair_vec);
+        return ray_error("oom", NULL);
+    }
+
+    ray_t* result = ray_alloc(data_size);
+    if (!result || RAY_IS_ERR(result)) { ray_release(pair_vec); return result ? result : ray_error("oom", NULL); }
+    result->type = vec->type;
+    result->len = new_len;
+    result->attrs = vec->attrs & RAY_SYM_W_MASK;
+    memset(result->nullmap, 0, 16);
+
+    /* Source pointers */
+    const char* src_base = (vec->attrs & RAY_ATTR_SLICE)
+        ? ((const char*)ray_data(vec->slice_parent) + (size_t)vec->slice_offset * esz)
+        : (const char*)ray_data(vec);
+
+    /* Value source: atom bytes or vec row bytes.
+     * GUID atoms keep their 16-byte payload in vals->obj, not inline; typed
+     * nulls carry obj==NULL and fall through to a zero buffer (null bit is
+     * then set below via RAY_ATOM_IS_NULL). */
+    static const uint8_t zero_guid[16] = {0};
+    const char* val_atom_bytes = NULL;
+    if (vals->type < 0) {
+        if (vec->type == RAY_GUID) {
+            val_atom_bytes = vals->obj
+                ? (const char*)ray_data(vals->obj)
+                : (const char*)zero_guid;
+        } else {
+            val_atom_bytes = (const char*)&vals->u8;
+        }
+    }
+    const char* val_vec_base = NULL;
+    if (val_atom_bytes == NULL) {
+        val_vec_base = (vals->attrs & RAY_ATTR_SLICE)
+            ? ((const char*)ray_data(vals->slice_parent) + (size_t)vals->slice_offset * esz)
+            : (const char*)ray_data(vals);
+    }
+
+    char* dst_base = (char*)ray_data(result);
+
+    /* Walk: merge sorted inserts with original */
+    int64_t w = 0;   /* write cursor */
+    int64_t p = 0;   /* pair cursor */
+    for (int64_t r = 0; r <= old_len; r++) {
+        while (p < N && pairs[2 * p] == r) {
+            int64_t src_pos = pairs[2 * p + 1];
+            if (val_atom_bytes) {
+                /* Broadcast atom */
+                memcpy(dst_base + (size_t)w * esz, val_atom_bytes, esz);
+                /* Atom-level null propagation */
+                if (RAY_ATOM_IS_NULL(vals)) {
+                    ray_err_t e = ray_vec_set_null_checked(result, w, true);
+                    if (e != RAY_OK) { ray_release(result); ray_release(pair_vec); return ray_error("oom", NULL); }
+                }
+            } else if (broadcast) {
+                /* Single-element vec broadcast — always row 0 */
+                memcpy(dst_base + (size_t)w * esz, val_vec_base, esz);
+                if (ray_vec_is_null(vals, 0)) {
+                    ray_err_t e = ray_vec_set_null_checked(result, w, true);
+                    if (e != RAY_OK) { ray_release(result); ray_release(pair_vec); return ray_error("oom", NULL); }
+                }
+            } else {
+                /* Parallel: use src_pos into vals */
+                memcpy(dst_base + (size_t)w * esz,
+                       val_vec_base + (size_t)src_pos * esz, esz);
+                if (ray_vec_is_null(vals, src_pos)) {
+                    ray_err_t e = ray_vec_set_null_checked(result, w, true);
+                    if (e != RAY_OK) { ray_release(result); ray_release(pair_vec); return ray_error("oom", NULL); }
+                }
+            }
+            w++;
+            p++;
+        }
+        if (r < old_len) {
+            memcpy(dst_base + (size_t)w * esz, src_base + (size_t)r * esz, esz);
+            if (ray_vec_is_null(vec, r)) {
+                ray_err_t e = ray_vec_set_null_checked(result, w, true);
+                if (e != RAY_OK) { ray_release(result); ray_release(pair_vec); return ray_error("oom", NULL); }
+            }
+            w++;
+        }
+    }
+
+    ray_release(pair_vec);
+    return result;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_vec_from_raw — build a vector by copying `count` raw elements from `data`
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_vec_from_raw(int8_t type, const void* data, int64_t count) {
+    if (type <= 0 || type >= RAY_TYPE_COUNT) return ray_error("type", NULL);
+    if (type == RAY_STR) return ray_error("type", NULL);
+    if (count < 0) return ray_error("range", NULL);
+
+    /* RAY_SYM defaults to W64 (global sym IDs) */
+    uint8_t sym_w = (type == RAY_SYM) ? RAY_SYM_W64 : 0;
+    uint8_t esz = ray_sym_elem_size(type, sym_w);
+    /* count * esz must not wrap size_t, or the block would be under-allocated */
+    if (esz > 1 && (size_t)count > SIZE_MAX / esz) return ray_error("oom", NULL);
+    size_t data_size = (size_t)count * esz;
+
+    ray_t* v = ray_alloc(data_size);
+    if (!v || RAY_IS_ERR(v)) return v;
+
+    v->type = type;
+    v->len = count;
+    v->attrs = sym_w;
+    memset(v->nullmap, 0, 16);
+
+    if (data_size > 0) memcpy(ray_data(v), data, data_size);  /* empty input may pass data == NULL */
+
+    /* LIST/TABLE elements are child pointers — retain them */
+    if (type == RAY_LIST || type == RAY_TABLE) {
+        ray_t** ptrs = (ray_t**)ray_data(v);
+        for (int64_t i = 0; i < count; i++)
+            if (ptrs[i]) ray_retain(ptrs[i]);
+    }
+
+    return v;
+}
+
+/* --------------------------------------------------------------------------
+ * Null bitmap operations
+ *
+ * Inline: vectors of <=128 elements keep their bits in nullmap[16] (128 bits).
+ * External: above that (and always for RAY_STR / linked vecs) the bits live in
+ * a U8 vector at ext_nullmap. ray_vec_set_null_checked sets/clears one bit.
+ * -------------------------------------------------------------------------- */
+
+ray_err_t ray_vec_set_null_checked(ray_t* vec, int64_t idx, bool is_null) {
+    if (!vec || RAY_IS_ERR(vec)) return RAY_ERR_TYPE;
+    if (vec->attrs & RAY_ATTR_SLICE) return RAY_ERR_TYPE; /* cannot set null on slice — COW first */
+    if (idx < 0 || idx >= vec->len) return RAY_ERR_RANGE;
+
+    /* Mutation invalidates any attached accelerator index — drop it inline.
+     * Caller must already hold a unique ref (set-null on a shared vec is a
+     * bug regardless of indexing). */
+    vec_drop_index_inplace(vec);
+
+    /* Mark HAS_NULLS if setting a null (defer for RAY_STR until ext alloc succeeds) */
+    if (is_null && vec->type != RAY_STR) vec->attrs |= RAY_ATTR_HAS_NULLS;
+
+    if (!(vec->attrs & RAY_ATTR_NULLMAP_EXT)) {
+        /* RAY_STR uses bytes 8-15 for str_pool, HAS_LINK uses bytes 8-15 for
+         * link_target — both must skip the inline-128 path to avoid
+         * aliasing corruption.  Otherwise <=128 elements go inline. */
+        bool can_inline = (vec->type != RAY_STR) && idx < 128 &&
+                          !(vec->attrs & RAY_ATTR_HAS_LINK);
+        if (can_inline) {
+            /* Inline nullmap path (<=128 elements, non-STR, non-linked) */
+            int byte_idx = (int)(idx / 8);
+            int bit_idx = (int)(idx % 8);
+            if (is_null)
+                vec->nullmap[byte_idx] |= (uint8_t)(1u << bit_idx);
+            else
+                vec->nullmap[byte_idx] &= (uint8_t)~(1u << bit_idx);
+            return RAY_OK;
+        }
+        /* Need to promote to external nullmap */
+        int64_t bitmap_len = (vec->len + 7) / 8;
+        ray_t* ext = ray_vec_new(RAY_U8, bitmap_len);
+        if (!ext || RAY_IS_ERR(ext)) return RAY_ERR_OOM;
+        ext->len = bitmap_len;
+        if (vec->type == RAY_STR || (vec->attrs & RAY_ATTR_HAS_LINK)) {
+            /* Bytes 0-15 contain pointers/sym, not bits — start ext zeroed.
+             * (Linked vecs reach here only when adding their first null,
+             *  since promote_inline_to_ext in linkop.c covers the
+             *  pre-existing-nulls case at attach time.) */
+            memset(ray_data(ext), 0, (size_t)bitmap_len);
+        } else {
+            /* Copy existing inline bits */
+            memcpy(ray_data(ext), vec->nullmap, 16);
+            /* Zero remaining bytes */
+            if (bitmap_len > 16)
+                memset((char*)ray_data(ext) + 16, 0, (size_t)(bitmap_len - 16));
+        }
+        vec->attrs |= RAY_ATTR_NULLMAP_EXT;
+        if (is_null) vec->attrs |= RAY_ATTR_HAS_NULLS;
+        vec->ext_nullmap = ext;
+    }
+
+    /* External nullmap path */
+    ray_t* ext = vec->ext_nullmap;
+    /* Grow external bitmap if needed */
+    int64_t needed_bytes = (idx / 8) + 1;
+    if (needed_bytes > ext->len) {
+        int64_t new_len = (vec->len + 7) / 8;
+        if (new_len < needed_bytes) new_len = needed_bytes;
+        size_t new_data_size = (size_t)new_len;
+        int64_t old_len = ext->len;
+        ray_t* new_ext = ray_scratch_realloc(ext, new_data_size);
+        if (!new_ext || RAY_IS_ERR(new_ext)) return RAY_ERR_OOM;
+        /* Zero new bytes */
+        if (new_len > old_len)
+            memset((char*)ray_data(new_ext) + old_len, 0,
+                   (size_t)(new_len - old_len));
+        new_ext->len = new_len;
+        vec->ext_nullmap = new_ext;
+        ext = new_ext;
+    }
+
+    uint8_t* bits = (uint8_t*)ray_data(ext);
+    int64_t byte_idx = idx / 8;      /* 64-bit: idx / 8 can exceed INT_MAX here */
+    int bit_idx = (int)(idx % 8);
+    if (is_null)
+        bits[byte_idx] |= (uint8_t)(1u << bit_idx);
+    else
+        bits[byte_idx] &= (uint8_t)~(1u << bit_idx);
+    return RAY_OK;
+}
+
+void ray_vec_set_null(ray_t* vec, int64_t idx, bool is_null) {
+    (void)ray_vec_set_null_checked(vec, idx, is_null);  /* unchecked variant: status is discarded */
+}
+
+/* --------------------------------------------------------------------------
+ * str_pool_cow — give vec a private string pool after ray_cow()
+ *
+ * A copy made by ray_cow() still points at the original's str_pool (only
+ * the pool's rc was bumped), so writing to it, or letting
+ * ray_scratch_realloc free it, would corrupt the other owner.  When rc > 1
+ * the pool is cloned (header + used bytes); returns NULL if that fails.
+ * -------------------------------------------------------------------------- */
+
+static ray_t* str_pool_cow(ray_t* vec) {
+    ray_t* shared = vec->str_pool;
+    if (!shared || RAY_IS_ERR(shared)) return vec;
+    uint32_t holders = ray_atomic_load(&shared->rc);
+    if (holders <= 1) return vec;   /* sole owner: nothing to copy */
+
+    const size_t capacity = ((size_t)1 << shared->order) - 32;
+    ray_t* clone = ray_alloc(capacity);
+    if (!clone || RAY_IS_ERR(clone)) return NULL;
+
+    size_t used = (size_t)shared->len;
+    if (used > capacity) used = capacity;
+
+    const uint8_t own_order = clone->order;
+    const uint8_t own_mmod  = clone->mmod;
+    memcpy(clone, shared, 32 + used);   /* 32-byte header + used pool bytes */
+    clone->order = own_order;           /* keep the clone's own allocation fields */
+    clone->mmod  = own_mmod;
+    ray_atomic_store(&clone->rc, 1);
+    ray_release(shared);
+    vec->str_pool = clone;
+    return vec;
+}
+
+/* --------------------------------------------------------------------------
+ * String pool dead-byte tracking
+ *
+ * Dead bytes are stored as a uint32_t in the pool block's nullmap[0..3],
+ * which is otherwise unused (the pool is a raw CHAR vector).
+ * -------------------------------------------------------------------------- */
+
+static inline uint32_t str_pool_dead(ray_t* vec) {
+    uint32_t dead = 0;                       /* no pool means no dead bytes */
+    if (vec->str_pool)
+        memcpy(&dead, vec->str_pool->nullmap, sizeof dead);
+    return dead;
+}
+
+static inline void str_pool_add_dead(ray_t* vec, uint32_t bytes) {
+    const uint32_t before = str_pool_dead(vec);
+    const uint32_t after = (bytes > UINT32_MAX - before) ? UINT32_MAX : before + bytes;  /* saturating add */
+    memcpy(vec->str_pool->nullmap, &after, sizeof after);
+}
+
+/* --------------------------------------------------------------------------
+ * ray_str_vec_append — append a string to a RAY_STR vector
+ *
+ * Strings <= 12 bytes are inlined in the ray_str_t element; longer ones store
+ * a 4-byte prefix + offset into a growable pool. Returns the vector or an error.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_str_vec_append(ray_t* vec, const char* s, size_t len) {
+    if (!vec || RAY_IS_ERR(vec)) return vec;
+    if (vec->type != RAY_STR) return ray_error("type", NULL);
+    if (len > UINT32_MAX) return ray_error("range", NULL);
+
+    ray_t* original = vec;
+    vec = ray_cow(vec);
+    if (!vec || RAY_IS_ERR(vec)) return vec;
+    if (!str_pool_cow(vec)) goto fail_oom;
+
+    int64_t pool_off = 0;
+    if (len > RAY_STR_INLINE_MAX) {
+        if (!vec->str_pool) {
+            size_t init_pool = len < 256 ? 256 : len * 2;
+            ray_t* pool = ray_alloc(init_pool);
+            if (!pool || RAY_IS_ERR(pool)) goto fail_oom;
+            pool->type = RAY_U8;
+            pool->len = 0;
+            /* nullmap[0..3] is the dead-byte counter: start it at zero */
+            memset(pool->nullmap, 0, 16);
+            vec->str_pool = pool;
+        }
+
+        int64_t pool_used = vec->str_pool->len;
+        size_t pool_cap = ((size_t)1 << vec->str_pool->order) - 32;
+        if ((size_t)pool_used + len > pool_cap) {
+            size_t need = (size_t)pool_used + len;
+            size_t new_cap = pool_cap;
+            if (new_cap == 0) new_cap = 256;
+            while (new_cap < need) {
+                if (new_cap > SIZE_MAX / 2) goto fail_oom;
+                new_cap *= 2;
+            }
+            ray_t* np = ray_scratch_realloc(vec->str_pool, new_cap);
+            if (!np || RAY_IS_ERR(np)) goto fail_oom;
+            vec->str_pool = np;
+        }
+
+        if ((uint64_t)pool_used > UINT32_MAX) goto fail_range;
+        pool_off = pool_used;
+    }
+
+    /* Grow element array if needed — pool is already ready */
+    int64_t cap = vec_capacity(vec);
+    if (vec->len >= cap) {
+        size_t new_data_size = (size_t)(vec->len + 1) * sizeof(ray_str_t);
+        if (new_data_size < 32) new_data_size = 32;
+        else {
+            size_t s2 = 32;
+            while (s2 < new_data_size) {
+                if (s2 > SIZE_MAX / 2) goto fail_oom;
+                s2 *= 2;
+            }
+            new_data_size = s2;
+        }
+        ray_t* nv = ray_scratch_realloc(vec, new_data_size);
+        if (!nv || RAY_IS_ERR(nv)) goto fail_oom;
+        vec = nv;
+    }
+
+    ray_str_t* elem = &((ray_str_t*)ray_data(vec))[vec->len];
+    memset(elem, 0, sizeof(ray_str_t));
+    elem->len = (uint32_t)len;
+
+    if (len <= RAY_STR_INLINE_MAX) {
+        if (len > 0) memcpy(elem->data, s, len);
+    } else {
+        /* Copy string into pool (already allocated above) */
+        char* pool_base = (char*)ray_data(vec->str_pool);
+        memcpy(pool_base + pool_off, s, len);
+
+        memcpy(elem->prefix, s, 4);
+        elem->pool_off = (uint32_t)pool_off;
+        vec->str_pool->len = pool_off + (int64_t)len;
+    }
+
+    vec->len++;
+    return vec;
+
+fail_oom:
+    if (vec != original) ray_release(vec);
+    return ray_error("oom", NULL);
+fail_range:
+    if (vec != original) ray_release(vec);
+    return ray_error("range", NULL);
+}
+
+/* --------------------------------------------------------------------------
+ * ray_str_vec_get — read a string from a RAY_STR vector by index
+ *
+ * Returns a pointer to the string bytes (inline or pool) and sets *out_len.
+ * Returns NULL with *out_len == 0 on invalid input, bad index or missing pool.
+ * -------------------------------------------------------------------------- */
+
+const char* ray_str_vec_get(ray_t* vec, int64_t idx, size_t* out_len) {
+    if (out_len) *out_len = 0;
+    if (!vec || RAY_IS_ERR(vec) || vec->type != RAY_STR) return NULL;
+    if (idx < 0 || idx >= vec->len) return NULL;
+
+    /* Slice: redirect to parent */
+    ray_t* data_owner = vec;
+    int64_t data_idx = idx;
+    if (vec->attrs & RAY_ATTR_SLICE) {
+        data_owner = vec->slice_parent;
+        data_idx = vec->slice_offset + idx;
+    }
+
+    const ray_str_t* elem = &((const ray_str_t*)ray_data(data_owner))[data_idx];
+    if (out_len) *out_len = elem->len;
+
+    if (elem->len == 0) return "";
+    if (ray_str_is_inline(elem)) return elem->data;
+
+    /* Pooled: resolve via pool; without one, report failure with length 0 */
+    if (!data_owner->str_pool) { if (out_len) *out_len = 0; return NULL; }
+    return (const char*)ray_data(data_owner->str_pool) + elem->pool_off;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_str_vec_set — update string at index in a RAY_STR vector
+ *
+ * Overwrites element at idx. Old pooled bytes are counted as dead space
+ * (see ray_str_vec_compact); new pooled strings are appended to the pool.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_str_vec_set(ray_t* vec, int64_t idx, const char* s, size_t len) {
+    if (!vec || RAY_IS_ERR(vec)) return vec;
+    if (vec->type != RAY_STR) return ray_error("type", NULL);
+    if (idx < 0 || idx >= vec->len) return ray_error("range", NULL);
+    if (len > UINT32_MAX) return ray_error("range", NULL);
+
+    ray_t* original = vec;
+    vec = ray_cow(vec);
+    if (!vec || RAY_IS_ERR(vec)) return vec;
+    if (!str_pool_cow(vec)) goto fail_oom;
+
+    ray_str_t* elem = &((ray_str_t*)ray_data(vec))[idx];
+
+    if (len <= RAY_STR_INLINE_MAX) {
+        /* Track dead bytes if old string was pooled */
+        if (!ray_str_is_inline(elem) && elem->len > 0 && vec->str_pool) {
+            str_pool_add_dead(vec, elem->len);
+        }
+        memset(elem, 0, sizeof(ray_str_t));
+        elem->len = (uint32_t)len;
+        if (len > 0) memcpy(elem->data, s, len);
+    } else {
+        if (!vec->str_pool) {
+            size_t init_pool = len < 256 ? 256 : len * 2;
+            ray_t* pool = ray_alloc(init_pool);
+            if (!pool || RAY_IS_ERR(pool)) goto fail_oom;
+            pool->type = RAY_U8;
+            pool->len = 0;
+            /* nullmap[0..3] is the dead-byte counter: start it at zero */
+            memset(pool->nullmap, 0, 16);
+            vec->str_pool = pool;
+        }
+
+        /* Grow pool if needed */
+        int64_t pool_used = vec->str_pool->len;
+        size_t pool_cap = ((size_t)1 << vec->str_pool->order) - 32;
+        if ((size_t)pool_used + len > pool_cap) {
+            size_t need = (size_t)pool_used + len;
+            size_t new_cap = pool_cap;
+            if (new_cap == 0) new_cap = 256;
+            while (new_cap < need) {
+                if (new_cap > SIZE_MAX / 2) goto fail_oom;
+                new_cap *= 2;
+            }
+            ray_t* np = ray_scratch_realloc(vec->str_pool, new_cap);
+            if (!np || RAY_IS_ERR(np)) goto fail_oom;
+            vec->str_pool = np;
+        }
+
+        if ((uint64_t)pool_used > UINT32_MAX) goto fail_range;
+
+        /* Pool alloc succeeded — now safe to modify the element */
+        if (!ray_str_is_inline(elem) && elem->len > 0 && vec->str_pool) {
+            str_pool_add_dead(vec, elem->len);
+        }
+
+        char* pool_base = (char*)ray_data(vec->str_pool);
+        memcpy(pool_base + pool_used, s, len);
+        memset(elem, 0, sizeof(ray_str_t));
+        elem->len = (uint32_t)len;
+        memcpy(elem->prefix, s, 4);
+        elem->pool_off = (uint32_t)pool_used;
+        vec->str_pool->len = pool_used + (int64_t)len;
+    }
+
+    return vec;
+
+fail_oom:
+    if (vec != original) ray_release(vec);
+    return ray_error("oom", NULL);
+fail_range:
+    if (vec != original) ray_release(vec);
+    return ray_error("range", NULL);
+}
+
+/* --------------------------------------------------------------------------
+ * ray_str_vec_insert_at — insert a single string at position idx.
+ *
+ * The string (s, len) is first appended to a temporary 1-element RAY_STR
+ * vector, which is then spliced in with ray_vec_insert_vec_at.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_str_vec_insert_at(ray_t* vec, int64_t idx, const char* s, size_t len) {
+    if (!vec || RAY_IS_ERR(vec)) return vec;
+    if (vec->type != RAY_STR) return ray_error("type", NULL);
+    if (idx < 0 || idx > vec->len) return ray_error("range", NULL);
+
+    ray_t* seed = ray_vec_new(RAY_STR, 1);
+    if (!seed) return ray_error("oom", NULL);
+    if (RAY_IS_ERR(seed)) return seed;
+
+    ray_t* single = ray_str_vec_append(seed, s, len);
+    if (!single || RAY_IS_ERR(single)) { ray_release(seed); return single ? single : ray_error("oom", NULL); }
+    ray_t* spliced = ray_vec_insert_vec_at(vec, idx, single);   /* does the actual splice */
+    ray_release(single);
+    return spliced;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_str_vec_compact — reclaim dead pool space
+ * Copies the live pooled strings into a fresh pool, rewrites the element
+ * offsets and releases the old pool.  Best-effort: if the new pool cannot be
+ * allocated, or the live bytes exceed 32-bit offsets, vec stays uncompacted.
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_str_vec_compact(ray_t* vec) {
+    if (!vec || RAY_IS_ERR(vec)) return vec;
+    if (vec->type != RAY_STR) return ray_error("type", NULL);
+    if (!vec->str_pool || str_pool_dead(vec) == 0) return vec;
+
+    ray_t* original = vec;
+    vec = ray_cow(vec);
+    if (!vec || RAY_IS_ERR(vec)) return vec;
+    if (!str_pool_cow(vec)) {
+        if (vec != original) ray_release(vec);
+        return ray_error("oom", NULL);
+    }
+
+    /* Compute true live size by scanning elements — avoids overflow when
+     * the dead-byte counter (uint32_t) has saturated at UINT32_MAX. */
+    ray_str_t* elems = (ray_str_t*)ray_data(vec);
+    size_t live_size = 0;
+    for (int64_t i = 0; i < vec->len; i++) {
+        if (ray_vec_is_null(vec, i) || ray_str_is_inline(&elems[i]) || elems[i].len == 0) continue;
+        live_size += elems[i].len;
+    }
+
+    if (live_size == 0) {
+        ray_release(vec->str_pool);
+        vec->str_pool = NULL;
+        return vec;
+    }
+
+    /* pool_off / write_off are 32-bit: skip compaction rather than let them wrap */
+    ray_t* new_pool = (live_size > UINT32_MAX) ? NULL : ray_alloc(live_size);
+    if (!new_pool || RAY_IS_ERR(new_pool)) return vec;
+    new_pool->type = RAY_U8;
+    new_pool->len = 0;
+    memset(new_pool->nullmap, 0, 16);
+    char* old_base = (char*)ray_data(vec->str_pool);
+    char* new_base = (char*)ray_data(new_pool);
+    uint32_t write_off = 0;
+
+    for (int64_t i = 0; i < vec->len; i++) {
+        if (ray_vec_is_null(vec, i) || ray_str_is_inline(&elems[i]) || elems[i].len == 0) continue;
+
+        uint32_t slen = elems[i].len;
+        memcpy(new_base + write_off, old_base + elems[i].pool_off, slen);
+        elems[i].pool_off = write_off;
+        write_off += slen;
+    }
+
+    new_pool->len = (int64_t)write_off;
+    ray_release(vec->str_pool);
+    vec->str_pool = new_pool;
+
+    return vec;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_embedding_new — flat F32 vector of nrows*dim floats ("range" on negative/overflowing sizes)
+ * -------------------------------------------------------------------------- */
+
+ray_t* ray_embedding_new(int64_t nrows, int32_t dim) {
+    if (nrows < 0 || dim < 0 || (dim > 0 && nrows > INT64_MAX / dim)) return ray_error("range", NULL);
+    int64_t total = nrows * (int64_t)dim;   /* cannot overflow after the check above */
+    ray_t* v = ray_vec_new(RAY_F32, total);
+    if (v && !RAY_IS_ERR(v)) v->len = total;
+    return v;
+}
+
+/* ray_vec_is_null — report whether element idx of vec is flagged null.
+ * NULL/error vectors and out-of-range indices are reported as not-null. */
+bool ray_vec_is_null(ray_t* vec, int64_t idx) {
+    /* Follow slices down to the block that owns the bitmap, translating
+     * the index at every hop and re-validating it against each block. */
+    for (;;) {
+        if (!vec || RAY_IS_ERR(vec)) return false;
+        if (idx < 0 || idx >= vec->len) return false;
+        if (!(vec->attrs & RAY_ATTR_SLICE)) break;
+        idx += vec->slice_offset;
+        vec = vec->slice_parent;
+    }
+
+    if (!vec_any_nulls(vec)) return false;
+
+    ray_t* external = NULL;
+    const uint8_t* inline_bits = vec_inline_nullmap(vec, &external);
+    const int64_t byte_no = idx / 8;
+    const int bit_no = (int)(idx % 8);
+
+    if (external) {
+        /* Bytes past the end of the external bitmap count as not-null. */
+        if (byte_no >= external->len) return false;
+        const uint8_t* ext_bits = (const uint8_t*)ray_data(external);
+        return (ext_bits[byte_no] >> bit_no) & 1;
+    }
+
+    /* Inline bitmap: RAY_STR keeps other data in those 16 bytes and stores
+     * its null bits externally only, so reaching this point for a string
+     * vector means no nulls; indices >= 128 do not fit inline either. */
+    if (vec->type == RAY_STR || idx >= 128) return false;
+    return (inline_bits[byte_no] >> bit_no) & 1;
+}
+
+/* --------------------------------------------------------------------------
+ * ray_vec_copy_nulls — bulk-copy null bitmap from src to dst
+ *
+ * Every element that is null in src is marked null in dst, for the first
+ * min(dst->len, src->len) positions. src may be inline, external or a slice.
+ * -------------------------------------------------------------------------- */
+
+ray_err_t ray_vec_copy_nulls(ray_t* dst, const ray_t* src) {
+    if (!dst || !src) return RAY_ERR_TYPE;
+
+    /* For a slice the HAS_NULLS flag consulted is the parent's; a slice
+     * without a parent is treated as null-free. */
+    const ray_t* flag_owner = src;
+    if (src->attrs & RAY_ATTR_SLICE)
+        flag_owner = src->slice_parent;
+    if (!flag_owner) return RAY_OK;
+    if (!(flag_owner->attrs & RAY_ATTR_HAS_NULLS)) return RAY_OK;
+
+    /* ray_vec_is_null resolves slice / inline / external storage for us;
+     * only the overlapping prefix of the two vectors is visited. */
+    const int64_t shared_len = (dst->len < src->len) ? dst->len : src->len;
+    for (int64_t i = 0; i < shared_len; i++) {
+        if (!ray_vec_is_null((ray_t*)src, i)) continue;
+        ray_err_t status = ray_vec_set_null_checked(dst, i, true);
+        if (status != RAY_OK) return status;
+    }
+
+    return RAY_OK;
+}
diff --git a/crates/rayforce-sys/vendor/rayforce/src/vec/vec.h b/crates/rayforce-sys/vendor/rayforce/src/vec/vec.h
new file mode 100644
index 0000000..15d670e
--- /dev/null
+++ b/crates/rayforce-sys/vendor/rayforce/src/vec/vec.h
@@ -0,0 +1,58 @@
+/*
+ *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
+ *   All rights reserved.
+
+ *   Permission is hereby granted, free of charge, to any person obtaining a copy
+ *   of this software and associated documentation files (the "Software"), to deal
+ *   in the Software without restriction, including without limitation the rights
+ *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *   copies of the Software, and to permit persons to whom the Software is
+ *   furnished to do so, subject to the following conditions:
+
+ *   The above copyright notice and this permission notice shall be included in all
+ *   copies or substantial portions of the Software.
+
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *   SOFTWARE.
+ */
+
+#ifndef RAY_VEC_H
+#define RAY_VEC_H
+
+/*
+ * vec.h -- Vector operations.
+ *
+ * Vectors are ray_t blocks with positive type tags. Data follows the 32-byte
+ * header. Supports append, get, set, slice (zero-copy), concat, and nullable
+ * bitmap (inline for <=128 elements, external for >128).
+ */
+
+#include <rayforce.h>
+
+/* Copy null bitmap from src to dst (handles slices, inline, external).
+ * dst should be at least as long as src (extra src rows are skipped). Internal helper.
+ray_err_t ray_vec_copy_nulls(ray_t* dst, const ray_t* src);
+
+/* Return a pointer to the effective null bitmap bytes for `v`, accounting
+ * for slice / external / inline / HAS_INDEX storage forms.  Returns NULL
+ * when `v` has no nulls (caller should gate on `v->attrs & RAY_ATTR_HAS_NULLS`
+ * before calling for the cheap fast-path).
+ *
+ * On return:
+ *   *bit_offset_out (if non-NULL): bit-offset within the returned buffer
+ *      that corresponds to v's row 0.  Non-zero only for slices.
+ *   *len_bits_out  (if non-NULL): total bits addressable in the buffer.
+ *      For inline, this is 128.  For external, it's the ext->len * 8.
+ *
+ * The returned pointer is valid as long as `v` (and its ext_nullmap /
+ * attached index ray_t, if any) are not released or mutated. */
+const uint8_t* ray_vec_nullmap_bytes(const ray_t* v,
+                                     int64_t* bit_offset_out,
+                                     int64_t* len_bits_out);
+
+#endif /* RAY_VEC_H */
diff --git a/crates/raysense-cli/Cargo.toml b/crates/raysense-cli/Cargo.toml
index d92e2de..1ed1bc8 100644
--- a/crates/raysense-cli/Cargo.toml
+++ b/crates/raysense-cli/Cargo.toml
@@ -33,10 +33,14 @@ path = "src/main.rs"
 
 [dependencies]
 anyhow.workspace = true
+axum = "0.7"
 clap.workspace = true
 rayforce-sys = { path = "../rayforce-sys", version = "0.1.0" }
 raysense-core = { path = "../raysense-core", version = "0.1.0" }
 raysense-memory = { path = "../raysense-memory", version = "0.1.0" }
 serde.workspace = true
 serde_json.workspace = true
+sha2 = "0.10"
+tokio = { version = "1", features = ["rt-multi-thread", "macros", "sync", "time", "signal"] }
+tokio-stream = { version = "0.1", features = ["sync"] }
 toml = "1.1.2"
diff --git a/crates/raysense-cli/src/lib.rs b/crates/raysense-cli/src/lib.rs
index 6522996..0c54602 100644
--- a/crates/raysense-cli/src/lib.rs
+++ b/crates/raysense-cli/src/lib.rs
@@ -113,17 +113,18 @@ enum Command {
         #[arg(long)]
         config: Option<PathBuf>,
     },
+    /// Start a live UI server. The page subscribes to server-sent events and
+    /// reloads when the scan content hash changes — never on a fixed timer.
+    /// Single source of UI; no static HTML export.
     Visualize {
         #[arg(default_value = ".")]
         path: PathBuf,
-        #[arg(long)]
-        output: Option<PathBuf>,
-        #[arg(long)]
-        watch: bool,
         #[arg(long, default_value_t = 2)]
         interval: u64,
         #[arg(long)]
         config: Option<PathBuf>,
+        #[arg(long, default_value_t = 7000)]
+        port: u16,
     },
     Plugin {
         #[command(subcommand)]
@@ -374,11 +375,10 @@ pub fn run() -> Result<()> {
         } => watch_project(&path, config.as_deref(), interval)?,
         Command::Visualize {
             path,
-            output,
-            watch,
             interval,
             config,
-        } => visualize_project(&path, output, config.as_deref(), watch, interval)?,
+            port,
+        } => serve_visualization(&path, config.as_deref(), interval, port)?,
         Command::Plugin { command } => match command {
             PluginCommand::List { path, config } => list_plugins(&path, config.as_deref())?,
             PluginCommand::Add {
@@ -731,36 +731,167 @@ fn watch_project(root: &Path, config_path: Option<&Path>, interval: u64) -> Resu
     }
 }
 
-fn visualize_project(
+/// Run a tokio HTTP server that hosts the live visualization. The server
+/// re-scans on a fixed interval, only emits an SSE `data-changed` event when
+/// the new snapshot's content hash differs from the previous one, and serves
+/// the HTML page without any meta-refresh. Browsers connected to `/events`
+/// reload the page on each change; other state (filter selections, scroll,
+/// expanded panels) survives whenever data didn't actually change.
+fn serve_visualization(
     root: &Path,
-    output: Option<PathBuf>,
     config_path: Option<&Path>,
-    watch: bool,
     interval: u64,
+    port: u16,
 ) -> Result<()> {
-    let output = output.unwrap_or_else(|| root.join(".raysense/visualization.html"));
-    if let Some(parent) = output.parent() {
-        fs::create_dir_all(parent)
-            .with_context(|| format!("failed to create {}", parent.display()))?;
-    }
-    loop {
-        let config = config_for_root(root, config_path)?;
-        let report = scan_path_with_config(root, &config)?;
-        let health = compute_health_with_config(&report, &config);
-        fs::write(&output, visualization_html(&report, &health))
-            .with_context(|| format!("failed to write {}", output.display()))?;
+    let root = root.to_path_buf();
+    let config_path = config_path.map(Path::to_path_buf);
+    let interval = interval.max(1);
+
+    let runtime = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()
+        .context("failed to start tokio runtime")?;
+
+    runtime.block_on(async move {
+        use axum::{
+            response::sse::{Event, KeepAlive, Sse},
+            response::{Html, IntoResponse},
+            routing::get,
+            Json, Router,
+        };
+        use std::sync::Arc;
+        use tokio::sync::{broadcast, RwLock};
+        use tokio_stream::wrappers::BroadcastStream;
+        use tokio_stream::StreamExt;
+
+        let initial = scan_now(&root, config_path.as_deref())?;
+        let state = Arc::new(LiveState {
+            inner: RwLock::new(initial),
+            tx: broadcast::channel::<()>(16).0,
+        });
+
+        let scanner_state = state.clone();
+        let scanner_root = root.clone();
+        let scanner_config = config_path.clone();
+        tokio::spawn(async move {
+            let mut ticker = tokio::time::interval(std::time::Duration::from_secs(interval));
+            ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
+            ticker.tick().await; // first tick fires immediately; we already scanned.
+            loop {
+                ticker.tick().await;
+                let scan = match tokio::task::spawn_blocking({
+                    let root = scanner_root.clone();
+                    let cfg = scanner_config.clone();
+                    move || scan_now(&root, cfg.as_deref())
+                })
+                .await
+                {
+                    Ok(Ok(snap)) => snap,
+                    Ok(Err(err)) => {
+                        eprintln!("rescan failed: {err}");
+                        continue;
+                    }
+                    Err(err) => {
+                        eprintln!("rescan task panicked: {err}");
+                        continue;
+                    }
+                };
+                let mut current = scanner_state.inner.write().await;
+                if current.hash != scan.hash {
+                    *current = scan;
+                    let _ = scanner_state.tx.send(());
+                }
+            }
+        });
+
+        let html_state = state.clone();
+        let data_state = state.clone();
+        let events_state = state.clone();
+
+        let app = Router::new()
+            .route(
+                "/",
+                get(move || async move {
+                    let snap = html_state.inner.read().await;
+                    Html(snap.html.clone()).into_response()
+                }),
+            )
+            .route(
+                "/data",
+                get(move || async move {
+                    let snap = data_state.inner.read().await;
+                    Json(snap.payload.clone()).into_response()
+                }),
+            )
+            .route(
+                "/events",
+                get(move || async move {
+                    let rx = events_state.tx.subscribe();
+                    let stream = BroadcastStream::new(rx).map(|item| match item {
+                        Ok(()) => Ok(Event::default().event("data-changed")),
+                        Err(_) => Ok::<_, std::convert::Infallible>(
+                            Event::default().event("data-changed"),
+                        ),
+                    });
+                    Sse::new(stream).keep_alive(KeepAlive::default())
+                }),
+            );
+
+        let addr = std::net::SocketAddr::from(([127, 0, 0, 1], port));
+        let listener = tokio::net::TcpListener::bind(addr)
+            .await
+            .with_context(|| format!("failed to bind {addr}"))?;
         println!(
-            "visualization {} snapshot={} quality_signal={}",
-            output.display(),
-            report.snapshot.snapshot_id,
-            health.quality_signal
+            "visualization http://{addr} interval={interval}s — Ctrl+C to stop",
+            addr = addr,
+            interval = interval,
         );
-        if !watch {
-            break;
-        }
-        thread::sleep(Duration::from_secs(interval.max(1)));
-    }
-    Ok(())
+
+        axum::serve(listener, app)
+            .with_graceful_shutdown(async {
+                let _ = tokio::signal::ctrl_c().await;
+            })
+            .await
+            .context("server error")?;
+
+        Ok::<(), anyhow::Error>(())
+    })
+}
+
+struct LiveState {
+    inner: tokio::sync::RwLock<LiveSnapshot>,
+    tx: tokio::sync::broadcast::Sender<()>,
+}
+
+struct LiveSnapshot {
+    hash: String,
+    html: String,
+    payload: serde_json::Value,
+}
+
+fn scan_now(root: &Path, config_path: Option<&Path>) -> Result<LiveSnapshot> {
+    use sha2::{Digest, Sha256};
+    let config = config_for_root(root, config_path)?;
+    let report = scan_path_with_config(root, &config)?;
+    let health = compute_health_with_config(&report, &config);
+    let html = visualization_html(&report, &health);
+    let payload = serde_json::json!({
+        "snapshot_id": report.snapshot.snapshot_id,
+        "score": health.score,
+        "quality_signal": health.quality_signal,
+        "files": report.files.len(),
+        "functions": report.functions.len(),
+        "rules": health.rules.len(),
+    });
+    let mut hasher = Sha256::new();
+    hasher.update(report.snapshot.snapshot_id.as_bytes());
+    hasher.update(serde_json::to_vec(&payload).unwrap_or_default());
+    let hash = format!("{:x}", hasher.finalize());
+    Ok(LiveSnapshot {
+        hash,
+        html,
+        payload,
+    })
 }
 
 fn visualization_html(
@@ -1089,7 +1220,7 @@ fn visualization_html(
     .unwrap_or_else(|_| "{}".to_string());
     format!(
         r#"<!doctype html>
-<html><head><meta charset="utf-8"><meta http-equiv="refresh" content="10"><title>Raysense</title>
+<html><head><meta charset="utf-8"><title>Raysense</title>
 <style>
 body{{font-family:system-ui,sans-serif;margin:24px;background:#111;color:#eee;line-height:1.4}}
 .top{{display:flex;gap:24px;align-items:flex-end;flex-wrap:wrap}}
@@ -1432,6 +1563,15 @@ table{{border-collapse:collapse;width:100%;margin-top:16px}}td,th{{border-bottom
   }}
 }})();
 </script>
+<script>
+(function() {{
+  if (typeof EventSource !== 'function') return;
+  try {{
+    var es = new EventSource('/events');
+    es.addEventListener('data-changed', function() {{ location.reload(); }});
+  }} catch (_) {{}}
+}})();
+</script>
 </body></html>"#,
         health.quality_signal,
         health.score,

From cbdbef7988e30835713aeb2abefa5f305b8c1459 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Fri, 1 May 2026 12:37:00 +0200
Subject: [PATCH 2/5] refactor: collapse 5-crate workspace into a single crate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Layout becomes:
- Cargo.toml      one [package], all deps merged, single [[bin]]
- build.rs        moved from crates/rayforce-sys/ — vendor path adjusted
- src/lib.rs      module declarations + public re-exports
- src/main.rs     calls raysense::cli::run()
- src/sys.rs      was crates/rayforce-sys/src/lib.rs
- src/scanner.rs, health.rs, facts.rs, graph.rs, profile.rs, simulate.rs,
  baseline.rs    were under crates/raysense-core/src/
- src/memory.rs   was crates/raysense-memory/src/lib.rs
- src/cli.rs      was crates/raysense-cli/src/lib.rs
- src/mcp.rs      was crates/raysense-cli/src/mcp.rs
- vendor/rayforce was crates/rayforce-sys/vendor/rayforce

Cross-module references rewritten (raysense_core::, raysense_memory::,
raysense_cli::, rayforce_sys:: -> crate:: paths). Functions previously
private under the cli sub-crate that mcp.rs reaches via super:: are now
pub(crate). The duplicate [[bin]] declaration that warned on every
build is gone. CI and publish workflows trimmed to a single crate path
and no longer depend on an external rayforce checkout. Tests preserved:
123 inline tests, all green.
---
 .github/workflows/ci.yml                      | 11 ---
 .github/workflows/publish.yml                 | 29 +-----
 Cargo.toml                                    | 36 ++++---
 crates/rayforce-sys/build.rs => build.rs      |  0
 crates/rayforce-sys/Cargo.toml                | 32 -------
 crates/raysense-cli/Cargo.toml                | 46 ---------
 crates/raysense-cli/src/main.rs               | 26 -----
 crates/raysense-core/Cargo.toml               | 45 ---------
 crates/raysense-core/README.md                | 32 -------
 crates/raysense-memory/Cargo.toml             | 36 -------
 crates/raysense/Cargo.toml                    | 41 --------
 crates/raysense/README.md                     | 35 -------
 crates/raysense/src/lib.rs                    | 51 ----------
 {crates/raysense-core/src => src}/baseline.rs |  0
 crates/raysense-cli/src/lib.rs => src/cli.rs  | 78 ++++++++-------
 {crates/raysense-core/src => src}/facts.rs    |  0
 {crates/raysense-core/src => src}/graph.rs    |  0
 {crates/raysense-core/src => src}/health.rs   |  0
 {crates/raysense-core/src => src}/lib.rs      |  6 ++
 {crates/raysense/src => src}/main.rs          |  2 +-
 {crates/raysense-cli/src => src}/mcp.rs       | 68 ++++++-------
 .../src/lib.rs => src/memory.rs               | 96 +++++++++----------
 {crates/raysense-core/src => src}/profile.rs  |  0
 {crates/raysense-core/src => src}/scanner.rs  |  0
 {crates/raysense-core/src => src}/simulate.rs |  0
 crates/rayforce-sys/src/lib.rs => src/sys.rs  |  0
 .../vendor => vendor}/rayforce/LICENSE        |  0
 .../rayforce/include/rayforce.h               |  0
 .../rayforce/src/core/block.c                 |  0
 .../rayforce/src/core/block.h                 |  0
 .../rayforce/src/core/epoll.c                 |  0
 .../rayforce/src/core/iocp.c                  |  0
 .../vendor => vendor}/rayforce/src/core/ipc.c |  0
 .../vendor => vendor}/rayforce/src/core/ipc.h |  0
 .../rayforce/src/core/kqueue.c                |  0
 .../rayforce/src/core/morsel.c                |  0
 .../rayforce/src/core/morsel.h                |  0
 .../rayforce/src/core/numparse.c              |  0
 .../rayforce/src/core/numparse.h              |  0
 .../rayforce/src/core/platform.c              |  0
 .../rayforce/src/core/platform.h              |  0
 .../rayforce/src/core/poll.c                  |  0
 .../rayforce/src/core/poll.h                  |  0
 .../rayforce/src/core/pool.c                  |  0
 .../rayforce/src/core/pool.h                  |  0
 .../rayforce/src/core/profile.h               |  0
 .../rayforce/src/core/progress.c              |  0
 .../rayforce/src/core/runtime.c               |  0
 .../rayforce/src/core/runtime.h               |  0
 .../rayforce/src/core/sock.c                  |  0
 .../rayforce/src/core/sock.h                  |  0
 .../rayforce/src/core/types.c                 |  0
 .../rayforce/src/core/types.h                 |  0
 .../vendor => vendor}/rayforce/src/io/csv.c   |  0
 .../vendor => vendor}/rayforce/src/io/csv.h   |  0
 .../vendor => vendor}/rayforce/src/lang/cal.h |  0
 .../rayforce/src/lang/compile.c               |  0
 .../vendor => vendor}/rayforce/src/lang/env.c |  0
 .../vendor => vendor}/rayforce/src/lang/env.h |  0
 .../rayforce/src/lang/eval.c                  |  0
 .../rayforce/src/lang/eval.h                  |  0
 .../rayforce/src/lang/format.c                |  0
 .../rayforce/src/lang/format.h                |  0
 .../rayforce/src/lang/internal.h              |  0
 .../vendor => vendor}/rayforce/src/lang/nfo.c |  0
 .../vendor => vendor}/rayforce/src/lang/nfo.h |  0
 .../rayforce/src/lang/parse.c                 |  0
 .../rayforce/src/lang/parse.h                 |  0
 .../rayforce/src/lang/syscmd.c                |  0
 .../rayforce/src/lang/syscmd.h                |  0
 .../rayforce/src/mem/arena.c                  |  0
 .../rayforce/src/mem/arena.h                  |  0
 .../vendor => vendor}/rayforce/src/mem/cow.c  |  0
 .../vendor => vendor}/rayforce/src/mem/cow.h  |  0
 .../vendor => vendor}/rayforce/src/mem/heap.c |  0
 .../vendor => vendor}/rayforce/src/mem/heap.h |  0
 .../vendor => vendor}/rayforce/src/mem/sys.c  |  0
 .../vendor => vendor}/rayforce/src/mem/sys.h  |  0
 .../vendor => vendor}/rayforce/src/ops/agg.c  |  0
 .../rayforce/src/ops/arith.c                  |  0
 .../rayforce/src/ops/builtins.c               |  0
 .../vendor => vendor}/rayforce/src/ops/cmp.c  |  0
 .../rayforce/src/ops/collection.c             |  0
 .../rayforce/src/ops/datalog.c                |  0
 .../rayforce/src/ops/datalog.h                |  0
 .../vendor => vendor}/rayforce/src/ops/dump.c |  0
 .../rayforce/src/ops/embedding.c              |  0
 .../vendor => vendor}/rayforce/src/ops/exec.c |  0
 .../vendor => vendor}/rayforce/src/ops/exec.h |  0
 .../vendor => vendor}/rayforce/src/ops/expr.c |  0
 .../rayforce/src/ops/filter.c                 |  0
 .../vendor => vendor}/rayforce/src/ops/fuse.c |  0
 .../vendor => vendor}/rayforce/src/ops/fuse.h |  0
 .../vendor => vendor}/rayforce/src/ops/fvec.c |  0
 .../vendor => vendor}/rayforce/src/ops/fvec.h |  0
 .../vendor => vendor}/rayforce/src/ops/glob.c |  0
 .../vendor => vendor}/rayforce/src/ops/glob.h |  0
 .../rayforce/src/ops/graph.c                  |  0
 .../rayforce/src/ops/graph.h                  |  0
 .../rayforce/src/ops/group.c                  |  0
 .../vendor => vendor}/rayforce/src/ops/hash.h |  0
 .../rayforce/src/ops/idxop.c                  |  0
 .../rayforce/src/ops/idxop.h                  |  0
 .../rayforce/src/ops/internal.h               |  0
 .../vendor => vendor}/rayforce/src/ops/join.c |  0
 .../rayforce/src/ops/journal.c                |  0
 .../rayforce/src/ops/journal.h                |  0
 .../vendor => vendor}/rayforce/src/ops/lftj.c |  0
 .../vendor => vendor}/rayforce/src/ops/lftj.h |  0
 .../rayforce/src/ops/linkop.c                 |  0
 .../rayforce/src/ops/linkop.h                 |  0
 .../vendor => vendor}/rayforce/src/ops/ops.h  |  0
 .../vendor => vendor}/rayforce/src/ops/opt.c  |  0
 .../vendor => vendor}/rayforce/src/ops/opt.h  |  0
 .../vendor => vendor}/rayforce/src/ops/pipe.c |  0
 .../vendor => vendor}/rayforce/src/ops/pipe.h |  0
 .../rayforce/src/ops/pivot.c                  |  0
 .../vendor => vendor}/rayforce/src/ops/plan.c |  0
 .../vendor => vendor}/rayforce/src/ops/plan.h |  0
 .../rayforce/src/ops/query.c                  |  0
 .../rayforce/src/ops/rerank.c                 |  0
 .../rayforce/src/ops/rowsel.c                 |  0
 .../rayforce/src/ops/rowsel.h                 |  0
 .../vendor => vendor}/rayforce/src/ops/sort.c |  0
 .../rayforce/src/ops/string.c                 |  0
 .../rayforce/src/ops/strop.c                  |  0
 .../rayforce/src/ops/system.c                 |  0
 .../rayforce/src/ops/tblop.c                  |  0
 .../rayforce/src/ops/temporal.c               |  0
 .../rayforce/src/ops/temporal.h               |  0
 .../rayforce/src/ops/traverse.c               |  0
 .../rayforce/src/ops/window.c                 |  0
 .../rayforce/src/store/col.c                  |  0
 .../rayforce/src/store/col.h                  |  0
 .../rayforce/src/store/csr.c                  |  0
 .../rayforce/src/store/csr.h                  |  0
 .../rayforce/src/store/fileio.c               |  0
 .../rayforce/src/store/fileio.h               |  0
 .../rayforce/src/store/hnsw.c                 |  0
 .../rayforce/src/store/hnsw.h                 |  0
 .../rayforce/src/store/journal.c              |  0
 .../rayforce/src/store/journal.h              |  0
 .../rayforce/src/store/meta.c                 |  0
 .../rayforce/src/store/meta.h                 |  0
 .../rayforce/src/store/part.c                 |  0
 .../rayforce/src/store/part.h                 |  0
 .../rayforce/src/store/serde.c                |  0
 .../rayforce/src/store/serde.h                |  0
 .../rayforce/src/store/splay.c                |  0
 .../rayforce/src/store/splay.h                |  0
 .../rayforce/src/table/dict.c                 |  0
 .../rayforce/src/table/dict.h                 |  0
 .../rayforce/src/table/sym.c                  |  0
 .../rayforce/src/table/sym.h                  |  0
 .../rayforce/src/table/table.c                |  0
 .../rayforce/src/table/table.h                |  0
 .../vendor => vendor}/rayforce/src/vec/atom.c |  0
 .../vendor => vendor}/rayforce/src/vec/atom.h |  0
 .../rayforce/src/vec/embedding.h              |  0
 .../vendor => vendor}/rayforce/src/vec/list.c |  0
 .../vendor => vendor}/rayforce/src/vec/list.h |  0
 .../vendor => vendor}/rayforce/src/vec/sel.c  |  0
 .../vendor => vendor}/rayforce/src/vec/str.c  |  0
 .../vendor => vendor}/rayforce/src/vec/str.h  |  0
 .../vendor => vendor}/rayforce/src/vec/vec.c  |  0
 .../vendor => vendor}/rayforce/src/vec/vec.h  |  0
 166 files changed, 156 insertions(+), 514 deletions(-)
 rename crates/rayforce-sys/build.rs => build.rs (100%)
 delete mode 100644 crates/rayforce-sys/Cargo.toml
 delete mode 100644 crates/raysense-cli/Cargo.toml
 delete mode 100644 crates/raysense-cli/src/main.rs
 delete mode 100644 crates/raysense-core/Cargo.toml
 delete mode 100644 crates/raysense-core/README.md
 delete mode 100644 crates/raysense-memory/Cargo.toml
 delete mode 100644 crates/raysense/Cargo.toml
 delete mode 100644 crates/raysense/README.md
 delete mode 100644 crates/raysense/src/lib.rs
 rename {crates/raysense-core/src => src}/baseline.rs (100%)
 rename crates/raysense-cli/src/lib.rs => src/cli.rs (97%)
 rename {crates/raysense-core/src => src}/facts.rs (100%)
 rename {crates/raysense-core/src => src}/graph.rs (100%)
 rename {crates/raysense-core/src => src}/health.rs (100%)
 rename {crates/raysense-core/src => src}/lib.rs (96%)
 rename {crates/raysense/src => src}/main.rs (98%)
 rename {crates/raysense-cli/src => src}/mcp.rs (97%)
 rename crates/raysense-memory/src/lib.rs => src/memory.rs (95%)
 rename {crates/raysense-core/src => src}/profile.rs (100%)
 rename {crates/raysense-core/src => src}/scanner.rs (100%)
 rename {crates/raysense-core/src => src}/simulate.rs (100%)
 rename crates/rayforce-sys/src/lib.rs => src/sys.rs (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/LICENSE (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/include/rayforce.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/block.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/block.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/epoll.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/iocp.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/ipc.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/ipc.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/kqueue.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/morsel.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/morsel.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/numparse.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/numparse.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/platform.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/platform.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/poll.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/poll.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/pool.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/pool.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/profile.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/progress.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/runtime.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/runtime.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/sock.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/sock.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/types.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/core/types.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/io/csv.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/io/csv.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/lang/cal.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/lang/compile.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/lang/env.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/lang/env.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/lang/eval.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/lang/eval.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/lang/format.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/lang/format.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/lang/internal.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/lang/nfo.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/lang/nfo.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/lang/parse.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/lang/parse.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/lang/syscmd.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/lang/syscmd.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/mem/arena.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/mem/arena.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/mem/cow.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/mem/cow.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/mem/heap.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/mem/heap.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/mem/sys.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/mem/sys.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/agg.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/arith.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/builtins.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/cmp.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/collection.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/datalog.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/datalog.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/dump.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/embedding.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/exec.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/exec.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/expr.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/filter.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/fuse.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/fuse.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/fvec.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/fvec.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/glob.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/glob.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/graph.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/graph.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/group.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/hash.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/idxop.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/idxop.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/internal.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/join.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/journal.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/journal.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/lftj.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/lftj.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/linkop.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/linkop.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/ops.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/opt.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/opt.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/pipe.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/pipe.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/pivot.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/plan.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/plan.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/query.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/rerank.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/rowsel.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/rowsel.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/sort.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/string.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/strop.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/system.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/tblop.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/temporal.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/temporal.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/traverse.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/ops/window.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/store/col.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/store/col.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/store/csr.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/store/csr.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/store/fileio.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/store/fileio.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/store/hnsw.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/store/hnsw.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/store/journal.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/store/journal.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/store/meta.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/store/meta.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/store/part.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/store/part.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/store/serde.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/store/serde.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/store/splay.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/store/splay.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/table/dict.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/table/dict.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/table/sym.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/table/sym.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/table/table.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/table/table.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/vec/atom.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/vec/atom.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/vec/embedding.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/vec/list.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/vec/list.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/vec/sel.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/vec/str.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/vec/str.h (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/vec/vec.c (100%)
 rename {crates/rayforce-sys/vendor => vendor}/rayforce/src/vec/vec.h (100%)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 80f41cc..38744fd 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -40,19 +40,8 @@ jobs:
       - name: Checkout raysense
         uses: actions/checkout@v6
 
-      - name: Checkout rayforce
-        uses: actions/checkout@v6
-        with:
-          repository: RayforceDB/rayforce
-          path: deps/rayforce
-
-      - name: Build rayforce library
-        run: make -C deps/rayforce lib
-
       - name: Check formatting
         run: cargo fmt --check
 
       - name: Run tests
         run: cargo test
-        env:
-          RAYFORCE_DIR: ${{ github.workspace }}/deps/rayforce
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index c1aca36..80bbb3a 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -45,19 +45,8 @@ jobs:
       - name: Checkout raysense
         uses: actions/checkout@v6
 
-      - name: Checkout rayforce
-        uses: actions/checkout@v6
-        with:
-          repository: RayforceDB/rayforce
-          path: deps/rayforce
-
-      - name: Build rayforce library
-        run: make -C deps/rayforce lib
-
-      - name: Test workspace
+      - name: Test
         run: cargo test
-        env:
-          RAYFORCE_DIR: ${{ github.workspace }}/deps/rayforce
 
       - name: Package and publish crates
         shell: bash
@@ -119,30 +108,22 @@ jobs:
             wait_for_crate "$package" "$version"
           }
 
-          publish_crate rayforce-sys
-          publish_crate raysense-core
-          publish_crate raysense-memory
-          publish_crate raysense-cli
           publish_crate raysense
         env:
           CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
-          RAYFORCE_DIR: ${{ github.workspace }}/deps/rayforce
 
       - name: Post-release smoke
         if: ${{ github.event_name != 'workflow_dispatch' || inputs.dry_run != true }}
         shell: bash
         run: |
           set -euo pipefail
-          version="$(sed -n 's/^version = "\(.*\)"/\1/p' crates/raysense/Cargo.toml | head -n 1)"
+          version="$(sed -n 's/^version = "\(.*\)"/\1/p' Cargo.toml | head -n 1)"
           smoke_dir="$(mktemp -d)"
 
-          RAYFORCE_DIR="${{ github.workspace }}/deps/rayforce" \
-            cargo install raysense --version "$version" --root "$smoke_dir/install"
-          "$smoke_dir/install/bin/raysense" rayforce-version
+          cargo install raysense --version "$version" --root "$smoke_dir/install"
+          "$smoke_dir/install/bin/raysense" --version
 
           cargo new "$smoke_dir/library-smoke"
           cd "$smoke_dir/library-smoke"
           cargo add "raysense@$version"
-          RAYFORCE_DIR="${{ github.workspace }}/deps/rayforce" cargo check
-        env:
-          RAYFORCE_DIR: ${{ github.workspace }}/deps/rayforce
+          cargo check
diff --git a/Cargo.toml b/Cargo.toml
index afb972e..3f5d72a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,23 +19,23 @@
 #   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 #   SOFTWARE.
 
-[workspace]
-members = [
-    "crates/rayforce-sys",
-    "crates/raysense",
-    "crates/raysense-cli",
-    "crates/raysense-core",
-    "crates/raysense-memory",
-]
-resolver = "2"
-
-[workspace.package]
+[package]
+name = "raysense"
+version = "0.2.0"
 edition = "2021"
 license = "MIT"
 repository = "https://github.com/RayforceDB/raysense"
+description = "Architectural X-ray for your codebase. Live, local, agent-ready."
+readme = "README.md"
+links = "rayforce"
+
+[[bin]]
+name = "raysense"
+path = "src/main.rs"
 
-[workspace.dependencies]
+[dependencies]
 anyhow = "1"
+axum = "0.7"
 clap = { version = "4", features = ["derive"] }
 ignore = "0.4"
 libloading = "0.8"
@@ -44,4 +44,16 @@ serde = { version = "1", features = ["derive"] }
 serde_json = "1"
 sha2 = "0.10"
 thiserror = "2"
+tokio = { version = "1", features = ["rt-multi-thread", "macros", "sync", "time", "signal"] }
+tokio-stream = { version = "0.1", features = ["sync"] }
+toml = "1.1.2"
+tree-sitter = "0.26.8"
+tree-sitter-c = "0.24.2"
+tree-sitter-cpp = "0.23.4"
 tree-sitter-language = "0.1"
+tree-sitter-python = "0.25.0"
+tree-sitter-rust = "0.24.2"
+tree-sitter-typescript = "0.23.2"
+
+[build-dependencies]
+cc = "1"
diff --git a/crates/rayforce-sys/build.rs b/build.rs
similarity index 100%
rename from crates/rayforce-sys/build.rs
rename to build.rs
diff --git a/crates/rayforce-sys/Cargo.toml b/crates/rayforce-sys/Cargo.toml
deleted file mode 100644
index 2e82ffe..0000000
--- a/crates/rayforce-sys/Cargo.toml
+++ /dev/null
@@ -1,32 +0,0 @@
-#   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
-#   All rights reserved.
-#
-#   Permission is hereby granted, free of charge, to any person obtaining a copy
-#   of this software and associated documentation files (the "Software"), to deal
-#   in the Software without restriction, including without limitation the rights
-#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-#   copies of the Software, and to permit persons to whom the Software is
-#   furnished to do so, subject to the following conditions:
-#
-#   The above copyright notice and this permission notice shall be included in all
-#   copies or substantial portions of the Software.
-#
-#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-#   SOFTWARE.
-
-[package]
-name = "rayforce-sys"
-version = "0.1.0"
-edition.workspace = true
-license.workspace = true
-repository.workspace = true
-description = "Rust FFI bindings for Rayforce used by Raysense"
-links = "rayforce"
-
-[build-dependencies]
-cc = "1"
diff --git a/crates/raysense-cli/Cargo.toml b/crates/raysense-cli/Cargo.toml
deleted file mode 100644
index 1ed1bc8..0000000
--- a/crates/raysense-cli/Cargo.toml
+++ /dev/null
@@ -1,46 +0,0 @@
-#   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
-#   All rights reserved.
-#
-#   Permission is hereby granted, free of charge, to any person obtaining a copy
-#   of this software and associated documentation files (the "Software"), to deal
-#   in the Software without restriction, including without limitation the rights
-#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-#   copies of the Software, and to permit persons to whom the Software is
-#   furnished to do so, subject to the following conditions:
-#
-#   The above copyright notice and this permission notice shall be included in all
-#   copies or substantial portions of the Software.
-#
-#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-#   SOFTWARE.
-
-[package]
-name = "raysense-cli"
-version = "0.1.0"
-edition.workspace = true
-license.workspace = true
-repository.workspace = true
-description = "Command line interface for Raysense"
-
-[[bin]]
-name = "raysense"
-path = "src/main.rs"
-
-[dependencies]
-anyhow.workspace = true
-axum = "0.7"
-clap.workspace = true
-rayforce-sys = { path = "../rayforce-sys", version = "0.1.0" }
-raysense-core = { path = "../raysense-core", version = "0.1.0" }
-raysense-memory = { path = "../raysense-memory", version = "0.1.0" }
-serde.workspace = true
-serde_json.workspace = true
-sha2 = "0.10"
-tokio = { version = "1", features = ["rt-multi-thread", "macros", "sync", "time", "signal"] }
-tokio-stream = { version = "0.1", features = ["sync"] }
-toml = "1.1.2"
diff --git a/crates/raysense-cli/src/main.rs b/crates/raysense-cli/src/main.rs
deleted file mode 100644
index a45e083..0000000
--- a/crates/raysense-cli/src/main.rs
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
- *   All rights reserved.
- *
- *   Permission is hereby granted, free of charge, to any person obtaining a copy
- *   of this software and associated documentation files (the "Software"), to deal
- *   in the Software without restriction, including without limitation the rights
- *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- *   copies of the Software, and to permit persons to whom the Software is
- *   furnished to do so, subject to the following conditions:
- *
- *   The above copyright notice and this permission notice shall be included in all
- *   copies or substantial portions of the Software.
- *
- *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- *   SOFTWARE.
- */
-
-fn main() -> anyhow::Result<()> {
-    raysense_cli::run()
-}
diff --git a/crates/raysense-core/Cargo.toml b/crates/raysense-core/Cargo.toml
deleted file mode 100644
index 7fe02f2..0000000
--- a/crates/raysense-core/Cargo.toml
+++ /dev/null
@@ -1,45 +0,0 @@
-#   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
-#   All rights reserved.
-#
-#   Permission is hereby granted, free of charge, to any person obtaining a copy
-#   of this software and associated documentation files (the "Software"), to deal
-#   in the Software without restriction, including without limitation the rights
-#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-#   copies of the Software, and to permit persons to whom the Software is
-#   furnished to do so, subject to the following conditions:
-#
-#   The above copyright notice and this permission notice shall be included in all
-#   copies or substantial portions of the Software.
-#
-#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-#   SOFTWARE.
-
-[package]
-name = "raysense-core"
-version = "0.1.0"
-edition.workspace = true
-license.workspace = true
-repository.workspace = true
-description = "Core scanner and architectural fact model for Raysense"
-readme = "README.md"
-
-[dependencies]
-ignore.workspace = true
-libloading.workspace = true
-serde.workspace = true
-serde_json.workspace = true
-sha2.workspace = true
-thiserror.workspace = true
-tree-sitter-language.workspace = true
-toml = "1.1.2"
-tree-sitter = "0.26.8"
-tree-sitter-c = "0.24.2"
-tree-sitter-cpp = "0.23.4"
-tree-sitter-python = "0.25.0"
-tree-sitter-rust = "0.24.2"
-tree-sitter-typescript = "0.23.2"
diff --git a/crates/raysense-core/README.md b/crates/raysense-core/README.md
deleted file mode 100644
index 178a018..0000000
--- a/crates/raysense-core/README.md
+++ /dev/null
@@ -1,32 +0,0 @@
-<!--
-  Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
-  All rights reserved.
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in all
-  copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-  SOFTWARE.
--->
-
-# Raysense Core
-
-Core scanner and architectural fact model for Raysense.
-
-```rust
-let report = raysense_core::scan_path(".")?;
-println!("imports: {}", report.imports.len());
-# Ok::<(), raysense_core::ScanError>(())
-```
diff --git a/crates/raysense-memory/Cargo.toml b/crates/raysense-memory/Cargo.toml
deleted file mode 100644
index 061a06e..0000000
--- a/crates/raysense-memory/Cargo.toml
+++ /dev/null
@@ -1,36 +0,0 @@
-#   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
-#   All rights reserved.
-#
-#   Permission is hereby granted, free of charge, to any person obtaining a copy
-#   of this software and associated documentation files (the "Software"), to deal
-#   in the Software without restriction, including without limitation the rights
-#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-#   copies of the Software, and to permit persons to whom the Software is
-#   furnished to do so, subject to the following conditions:
-#
-#   The above copyright notice and this permission notice shall be included in all
-#   copies or substantial portions of the Software.
-#
-#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-#   SOFTWARE.
-
-[package]
-name = "raysense-memory"
-version = "0.1.0"
-edition.workspace = true
-license.workspace = true
-repository.workspace = true
-description = "Rayforce-backed memory tables for Raysense"
-
-[dependencies]
-rayforce-sys = { path = "../rayforce-sys", version = "0.1.0" }
-raysense-core = { path = "../raysense-core", version = "0.1.0" }
-regex.workspace = true
-serde.workspace = true
-serde_json.workspace = true
-thiserror.workspace = true
diff --git a/crates/raysense/Cargo.toml b/crates/raysense/Cargo.toml
deleted file mode 100644
index 5f5a6c2..0000000
--- a/crates/raysense/Cargo.toml
+++ /dev/null
@@ -1,41 +0,0 @@
-#   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
-#   All rights reserved.
-#
-#   Permission is hereby granted, free of charge, to any person obtaining a copy
-#   of this software and associated documentation files (the "Software"), to deal
-#   in the Software without restriction, including without limitation the rights
-#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-#   copies of the Software, and to permit persons to whom the Software is
-#   furnished to do so, subject to the following conditions:
-#
-#   The above copyright notice and this permission notice shall be included in all
-#   copies or substantial portions of the Software.
-#
-#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-#   SOFTWARE.
-
-[package]
-name = "raysense"
-version = "0.1.1"
-edition.workspace = true
-license.workspace = true
-repository.workspace = true
-description = "Local architectural telemetry for AI coding agents"
-readme = "README.md"
-
-[lib]
-path = "src/lib.rs"
-
-[[bin]]
-name = "raysense"
-path = "src/main.rs"
-
-[dependencies]
-anyhow.workspace = true
-raysense-core = { path = "../raysense-core", version = "0.1.0" }
-raysense-cli = { path = "../raysense-cli", version = "0.1.0" }
diff --git a/crates/raysense/README.md b/crates/raysense/README.md
deleted file mode 100644
index 0121df0..0000000
--- a/crates/raysense/README.md
+++ /dev/null
@@ -1,35 +0,0 @@
-<!--
-  Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
-  All rights reserved.
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in all
-  copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-  SOFTWARE.
--->
-
-# Raysense
-
-Raysense is local architectural telemetry for AI coding agents.
-
-The crate exposes the owned scanner and architectural fact model, and installs
-the `raysense` command line tool.
-
-```rust
-let report = raysense::scan_path(".")?;
-println!("files: {}", report.files.len());
-# Ok::<(), raysense::ScanError>(())
-```
diff --git a/crates/raysense/src/lib.rs b/crates/raysense/src/lib.rs
deleted file mode 100644
index 665019d..0000000
--- a/crates/raysense/src/lib.rs
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- *   Copyright (c) 2025-2026 Anton Kundenko <singaraiona@gmail.com>
- *   All rights reserved.
- *
- *   Permission is hereby granted, free of charge, to any person obtaining a copy
- *   of this software and associated documentation files (the "Software"), to deal
- *   in the Software without restriction, including without limitation the rights
- *   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- *   copies of the Software, and to permit persons to whom the Software is
- *   furnished to do so, subject to the following conditions:
- *
- *   The above copyright notice and this permission notice shall be included in all
- *   copies or substantial portions of the Software.
- *
- *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- *   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- *   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- *   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- *   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- *   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- *   SOFTWARE.
- */
-
-pub const NAME: &str = "raysense";
-pub const VERSION: &str = env!("CARGO_PKG_VERSION");
-
-pub use raysense_core::{
-    compute_health, scan_path, EntryPointFact, EntryPointKind, FileFact, FileHotspot, FunctionFact,
-    GraphMetrics, HealthSummary, ImportFact, ImportResolution, Language, Remediation,
-    ResolutionBreakdown, RuleFinding, RuleSeverity, ScanError, ScanReport, ScoreConfig,
-    SnapshotFact, TestGapCandidate, TrendMetrics,
-};
-
-pub fn package_name() -> &'static str {
-    NAME
-}
-
-pub fn package_version() -> &'static str {
-    VERSION
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn exposes_package_identity() {
-        assert_eq!(package_name(), "raysense");
-        assert_eq!(package_version(), env!("CARGO_PKG_VERSION"));
-    }
-}
diff --git a/crates/raysense-core/src/baseline.rs b/src/baseline.rs
similarity index 100%
rename from crates/raysense-core/src/baseline.rs
rename to src/baseline.rs
diff --git a/crates/raysense-cli/src/lib.rs b/src/cli.rs
similarity index 97%
rename from crates/raysense-cli/src/lib.rs
rename to src/cli.rs
index 0c54602..b84ac5a 100644
--- a/crates/raysense-cli/src/lib.rs
+++ b/src/cli.rs
@@ -21,15 +21,13 @@
  *   SOFTWARE.
  */
 
-#![recursion_limit = "256"]
-
 use anyhow::{anyhow, Context, Result};
 use clap::{Parser, Subcommand};
-use raysense_core::{
+use crate::{
     build_baseline, compute_health_with_config, diff_baselines, scan_path_with_config,
     BaselineDiff, ImportResolution, ProjectBaseline, RaysenseConfig,
 };
-use raysense_memory::{
+use crate::memory::{
     BaselineFilterMode, BaselineFilterOp, BaselineSortDirection, BaselineTableFilter,
     BaselineTableQuery, BaselineTableSort,
 };
@@ -42,7 +40,7 @@ use std::process;
 use std::thread;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 
-mod mcp;
+use crate::mcp;
 
 #[derive(Debug, Parser)]
 #[command(name = "raysense")]
@@ -319,7 +317,7 @@ pub fn run() -> Result<()> {
             if json {
                 println!("{}", serde_json::to_string_pretty(&report)?);
             } else if memory {
-                let memory = raysense_memory::RayMemory::from_report_with_config(&report, &config)?;
+                let memory = crate::memory::RayMemory::from_report_with_config(&report, &config)?;
                 print_memory_summary(&memory.summary());
             } else {
                 print_summary(&report, &config);
@@ -341,12 +339,12 @@ pub fn run() -> Result<()> {
             print_edges(&report, all)?;
         }
         Command::RayforceVersion => {
-            println!("{}", rayforce_sys::version_string());
+            println!("{}", crate::sys::version_string());
         }
         Command::Memory { path, config } => {
             let config = config_for_root(&path, config.as_deref())?;
             let report = scan_path_with_config(path, &config)?;
-            let memory = raysense_memory::RayMemory::from_report_with_config(&report, &config)?;
+            let memory = crate::memory::RayMemory::from_report_with_config(&report, &config)?;
             print_memory_summary(&memory.summary());
         }
         Command::Check {
@@ -492,7 +490,7 @@ pub fn run() -> Result<()> {
                 let baseline = baseline.unwrap_or_else(default_baseline_dir);
                 let tables_dir = baseline.join("tables");
                 let tables =
-                    raysense_memory::list_baseline_tables(&tables_dir).with_context(|| {
+                    crate::memory::list_baseline_tables(&tables_dir).with_context(|| {
                         format!("failed to list baseline tables {}", tables_dir.display())
                     })?;
                 if json {
@@ -523,7 +521,7 @@ pub fn run() -> Result<()> {
                     filter_mode: parse_filter_mode(&filter_mode)?,
                     sort: parse_sort(&sort, desc)?,
                 };
-                let rows = raysense_memory::query_baseline_table(&tables_dir, &table, query)
+                let rows = crate::memory::query_baseline_table(&tables_dir, &table, query)
                     .with_context(|| {
                         format!("failed to read baseline table {}", tables_dir.display())
                     })?;
@@ -588,13 +586,13 @@ fn check_project(
     let has_errors = health
         .rules
         .iter()
-        .any(|rule| matches!(rule.severity, raysense_core::RuleSeverity::Error));
+        .any(|rule| matches!(rule.severity, crate::RuleSeverity::Error));
     Ok(if has_errors { 1 } else { 0 })
 }
 
-fn sarif_report(
-    report: &raysense_core::ScanReport,
-    health: &raysense_core::HealthSummary,
+pub(crate) fn sarif_report(
+    report: &crate::ScanReport,
+    health: &crate::HealthSummary,
 ) -> Value {
     let mut seen_rules = BTreeSet::new();
     let rules = health
@@ -666,11 +664,11 @@ fn sarif_report(
     })
 }
 
-fn sarif_level(severity: raysense_core::RuleSeverity) -> &'static str {
+fn sarif_level(severity: crate::RuleSeverity) -> &'static str {
     match severity {
-        raysense_core::RuleSeverity::Error => "error",
-        raysense_core::RuleSeverity::Warning => "warning",
-        raysense_core::RuleSeverity::Info => "note",
+        crate::RuleSeverity::Error => "error",
+        crate::RuleSeverity::Warning => "warning",
+        crate::RuleSeverity::Info => "note",
     }
 }
 
@@ -894,9 +892,9 @@ fn scan_now(root: &Path, config_path: Option<&Path>) -> Result<LiveSnapshot> {
     })
 }
 
-fn visualization_html(
-    report: &raysense_core::ScanReport,
-    health: &raysense_core::HealthSummary,
+pub(crate) fn visualization_html(
+    report: &crate::ScanReport,
+    health: &crate::HealthSummary,
 ) -> String {
     let max_lines = report
         .files
@@ -1671,11 +1669,11 @@ fn add_plugin(
     config
         .scan
         .plugins
-        .push(raysense_core::LanguagePluginConfig {
+        .push(crate::LanguagePluginConfig {
             name: name.to_string(),
             extensions,
             file_names,
-            ..raysense_core::LanguagePluginConfig::default()
+            ..crate::LanguagePluginConfig::default()
         });
     let toml = toml::to_string_pretty(&config).context("failed to encode config")?;
     fs::write(&path, toml).with_context(|| format!("failed to write {}", path.display()))?;
@@ -1693,7 +1691,7 @@ fn add_standard_plugins(root: &Path, config_path: Option<&Path>) -> Result<()> {
     } else {
         RaysenseConfig::default()
     };
-    let standard = raysense_core::standard_language_plugins();
+    let standard = crate::standard_language_plugins();
     for plugin in &standard {
         config
             .scan
@@ -1740,7 +1738,7 @@ pub(crate) fn validate_plugin_dir(dir: &Path) -> Result<Value> {
         .with_context(|| format!("failed to read {}", manifest_path.display()))?;
     let mut errors = Vec::new();
     let mut warnings = Vec::new();
-    let plugin: raysense_core::LanguagePluginConfig =
+    let plugin: crate::LanguagePluginConfig =
         match toml::from_str(&content).context("failed to parse plugin manifest") {
             Ok(plugin) => plugin,
             Err(error) => {
@@ -1828,7 +1826,7 @@ fn has_supported_query_capture(query: &str) -> bool {
     .any(|capture| query.contains(capture))
 }
 
-fn plugin_has_query_language(plugin: &raysense_core::LanguagePluginConfig) -> bool {
+fn plugin_has_query_language(plugin: &crate::LanguagePluginConfig) -> bool {
     plugin.grammar_path.is_some()
         || matches!(
             plugin.grammar.as_deref().unwrap_or(plugin.name.as_str()),
@@ -1863,7 +1861,7 @@ pub(crate) fn sync_standard_plugins(
     names: &[String],
     force: bool,
 ) -> Result<PluginSyncSummary> {
-    let plugins = raysense_core::standard_language_plugins();
+    let plugins = crate::standard_language_plugins();
     let filter: std::collections::HashSet<&str> = names.iter().map(String::as_str).collect();
     let mut summary = PluginSyncSummary::default();
     for plugin in plugins {
@@ -1959,7 +1957,7 @@ fn init_policy(root: &Path, config_path: Option<&Path>, preset: &str) -> Result<
     Ok(())
 }
 
-fn apply_policy_preset(config: &mut RaysenseConfig, preset: &str) -> Result<()> {
+pub(crate) fn apply_policy_preset(config: &mut RaysenseConfig, preset: &str) -> Result<()> {
     match preset {
         "rust-crate" => {
             config.scan.ignored_paths = vec!["target".to_string()];
@@ -1986,17 +1984,17 @@ fn apply_policy_preset(config: &mut RaysenseConfig, preset: &str) -> Result<()>
                 vec!["src".to_string(), "internal".to_string(), "pkg".to_string()];
             config.rules.max_function_complexity = 18;
             config.boundaries.layers = vec![
-                raysense_core::LayerConfig {
+                crate::LayerConfig {
                     name: "api".to_string(),
                     path: "src/api/*".to_string(),
                     order: 2,
                 },
-                raysense_core::LayerConfig {
+                crate::LayerConfig {
                     name: "domain".to_string(),
                     path: "src/domain/*".to_string(),
                     order: 1,
                 },
-                raysense_core::LayerConfig {
+                crate::LayerConfig {
                     name: "infra".to_string(),
                     path: "src/infra/*".to_string(),
                     order: 0,
@@ -2151,7 +2149,7 @@ fn save_baseline(root: &Path, output: &Path, config_path: Option<&Path>) -> Resu
     let report = scan_path_with_config(root, &config)?;
     let health = compute_health_with_config(&report, &config);
     let baseline = build_baseline(&report, &health);
-    let memory = raysense_memory::RayMemory::from_report_with_config(&report, &config)?;
+    let memory = crate::memory::RayMemory::from_report_with_config(&report, &config)?;
     let tables_dir = output.join("tables");
 
     fs::create_dir_all(output)
@@ -2298,7 +2296,7 @@ fn parse_sort_spec(sort: &str, desc: bool) -> Result<(String, BaselineSortDirect
     Ok((column.to_string(), direction))
 }
 
-fn print_memory_summary(summary: &raysense_memory::MemorySummary) {
+fn print_memory_summary(summary: &crate::memory::MemorySummary) {
     println!(
         "files rows={} cols={}",
         summary.files.rows, summary.files.columns
@@ -2345,14 +2343,14 @@ fn print_memory_summary(summary: &raysense_memory::MemorySummary) {
     );
 }
 
-fn print_baseline_tables(tables: &[raysense_memory::BaselineTableInfo]) {
+fn print_baseline_tables(tables: &[crate::memory::BaselineTableInfo]) {
     println!("name\trows\tcolumns");
     for table in tables {
         println!("{}\t{}\t{}", table.name, table.rows, table.columns);
     }
 }
 
-fn print_baseline_rows(rows: &raysense_memory::BaselineTableRows) {
+fn print_baseline_rows(rows: &crate::memory::BaselineTableRows) {
     println!(
         "table {} rows={} matched={} offset={} limit={}",
         rows.name, rows.total_rows, rows.matched_rows, rows.offset, rows.limit
@@ -2428,7 +2426,7 @@ fn print_baseline_diff(diff: &BaselineDiff) {
     }
 }
 
-fn print_summary(report: &raysense_core::ScanReport, config: &RaysenseConfig) {
+fn print_summary(report: &crate::ScanReport, config: &RaysenseConfig) {
     let health = compute_health_with_config(report, config);
     println!("snapshot {}", report.snapshot.snapshot_id);
     println!("root {}", report.snapshot.root.display());
@@ -2458,7 +2456,7 @@ fn print_summary(report: &raysense_core::ScanReport, config: &RaysenseConfig) {
     println!("max_fan_out {}", report.graph.max_fan_out);
 }
 
-fn print_health(report: &raysense_core::ScanReport, health: &raysense_core::HealthSummary) {
+fn print_health(report: &crate::ScanReport, health: &crate::HealthSummary) {
     println!("score {}", health.score);
     println!("quality_signal {}", health.quality_signal);
     println!("coverage_score {}", health.coverage_score);
@@ -2696,7 +2694,7 @@ fn print_health(report: &raysense_core::ScanReport, health: &raysense_core::Heal
     }
 }
 
-fn print_edges(report: &raysense_core::ScanReport, all: bool) -> io::Result<()> {
+fn print_edges(report: &crate::ScanReport, all: bool) -> io::Result<()> {
     let stdout = io::stdout();
     let mut stdout = stdout.lock();
 
@@ -2767,8 +2765,8 @@ mod tests {
 
     #[test]
     fn visualization_html_includes_color_mode_and_detail_panel() {
-        let report = raysense_core::scan_path(env!("CARGO_MANIFEST_DIR")).unwrap();
-        let health = raysense_core::compute_health(&report);
+        let report = crate::scan_path(env!("CARGO_MANIFEST_DIR")).unwrap();
+        let health = crate::compute_health(&report);
         let html = visualization_html(&report, &health);
         assert!(html.contains("id=\"color-mode\""));
         assert!(html.contains("data-churn"));
diff --git a/crates/raysense-core/src/facts.rs b/src/facts.rs
similarity index 100%
rename from crates/raysense-core/src/facts.rs
rename to src/facts.rs
diff --git a/crates/raysense-core/src/graph.rs b/src/graph.rs
similarity index 100%
rename from crates/raysense-core/src/graph.rs
rename to src/graph.rs
diff --git a/crates/raysense-core/src/health.rs b/src/health.rs
similarity index 100%
rename from crates/raysense-core/src/health.rs
rename to src/health.rs
diff --git a/crates/raysense-core/src/lib.rs b/src/lib.rs
similarity index 96%
rename from crates/raysense-core/src/lib.rs
rename to src/lib.rs
index 6e68b7d..a44bd15 100644
--- a/crates/raysense-core/src/lib.rs
+++ b/src/lib.rs
@@ -21,13 +21,19 @@
  *   SOFTWARE.
  */
 
+#![recursion_limit = "256"]
+
 pub mod baseline;
+pub mod cli;
 pub mod facts;
 pub mod graph;
 pub mod health;
+pub mod mcp;
+pub mod memory;
 pub mod profile;
 pub mod scanner;
 pub mod simulate;
+pub mod sys;
 
 pub use baseline::{
     build_baseline, diff_baselines, BaselineDiff, BaselineModuleEdge, ModuleEdgeDelta,
diff --git a/crates/raysense/src/main.rs b/src/main.rs
similarity index 98%
rename from crates/raysense/src/main.rs
rename to src/main.rs
index a45e083..62ab40c 100644
--- a/crates/raysense/src/main.rs
+++ b/src/main.rs
@@ -22,5 +22,5 @@
  */
 
 fn main() -> anyhow::Result<()> {
-    raysense_cli::run()
+    raysense::cli::run()
 }
diff --git a/crates/raysense-cli/src/mcp.rs b/src/mcp.rs
similarity index 97%
rename from crates/raysense-cli/src/mcp.rs
rename to src/mcp.rs
index 44d5eef..a506023 100644
--- a/crates/raysense-cli/src/mcp.rs
+++ b/src/mcp.rs
@@ -22,11 +22,11 @@
  */
 
 use anyhow::{anyhow, Context, Result};
-use raysense_core::{
+use crate::{
     build_baseline, compute_health_with_config, diff_baselines, is_foundation_file,
     scan_path_with_config, ImportResolution, ProjectBaseline, RaysenseConfig,
 };
-use raysense_memory::{
+use crate::memory::{
     BaselineFilterMode, BaselineFilterOp, BaselineSortDirection, BaselineTableFilter,
     BaselineTableQuery, BaselineTableSort,
 };
@@ -50,7 +50,7 @@ struct HealthCache {
     root: PathBuf,
     signature: String,
     report_root: PathBuf,
-    health: raysense_core::HealthSummary,
+    health: crate::HealthSummary,
 }
 
 /// Tools that mutate scan inputs (config, baselines, plugins, sessions) or
@@ -830,7 +830,7 @@ fn check_rules_tool(args: &Value) -> Result<Value> {
     let pass = !health
         .rules
         .iter()
-        .any(|rule| matches!(rule.severity, raysense_core::RuleSeverity::Error));
+        .any(|rule| matches!(rule.severity, crate::RuleSeverity::Error));
     Ok(json!({
         "root": root,
         "pass": pass,
@@ -905,7 +905,7 @@ fn visualize_tool(args: &Value) -> Result<Value> {
     }
     let report = scan_path_with_config(&root, &config)?;
     let health = compute_health_with_config(&report, &config);
-    let html = super::visualization_html(&report, &health);
+    let html = crate::cli::visualization_html(&report, &health);
     fs::write(&output, &html).with_context(|| format!("failed to write {}", output.display()))?;
 
     Ok(json!({
@@ -928,7 +928,7 @@ fn sarif_tool(args: &Value) -> Result<Value> {
     let include_sarif = bool_arg(args, "include_sarif", false)?;
     let report = scan_path_with_config(&root, &config)?;
     let health = compute_health_with_config(&report, &config);
-    let sarif = super::sarif_report(&report, &health);
+    let sarif = crate::cli::sarif_report(&report, &health);
 
     if let Some(path) = output.as_ref() {
         if let Some(parent) = path.parent() {
@@ -966,7 +966,7 @@ fn standard_plugins_tool(args: &Value) -> Result<Value> {
         .map(|_| limit_arg(args, usize::MAX))
         .transpose()?
         .unwrap_or(usize::MAX);
-    let plugins = raysense_core::standard_language_plugins();
+    let plugins = crate::standard_language_plugins();
     Ok(json!({
         "plugins": limited(&plugins, limit),
         "total": plugins.len()
@@ -990,11 +990,11 @@ fn plugin_add_tool(args: &Value) -> Result<Value> {
     config
         .scan
         .plugins
-        .push(raysense_core::LanguagePluginConfig {
+        .push(crate::LanguagePluginConfig {
             name: name.to_string(),
             extensions,
             file_names,
-            ..raysense_core::LanguagePluginConfig::default()
+            ..crate::LanguagePluginConfig::default()
         });
     write_config_path(&path, &config)?;
 
@@ -1009,7 +1009,7 @@ fn plugin_sync_tool(args: &Value) -> Result<Value> {
     let root = root_arg(args)?;
     let names = string_array_arg(args, "names")?;
     let force = args.get("force").and_then(Value::as_bool).unwrap_or(false);
-    let summary = super::sync_standard_plugins(&root, &names, force)?;
+    let summary = crate::cli::sync_standard_plugins(&root, &names, force)?;
     Ok(json!({
         "root": root,
         "wrote": summary.written.len(),
@@ -1023,7 +1023,7 @@ fn plugin_add_standard_tool(args: &Value) -> Result<Value> {
     let root = root_arg(args)?;
     let path = config_path_arg(args)?.unwrap_or_else(|| root.join(".raysense.toml"));
     let mut config = load_or_default_config(&path)?;
-    let standard = raysense_core::standard_language_plugins();
+    let standard = crate::standard_language_plugins();
     for plugin in &standard {
         config
             .scan
@@ -1075,7 +1075,7 @@ fn plugin_validate_tool(args: &Value) -> Result<Value> {
         .or_else(|| args.get("plugin_dir"))
         .and_then(Value::as_str)
         .ok_or_else(|| anyhow!("missing plugin dir"))?;
-    super::validate_plugin_dir(Path::new(dir))
+    crate::cli::validate_plugin_dir(Path::new(dir))
 }
 
 fn plugin_scaffold_tool(args: &Value) -> Result<Value> {
@@ -1088,8 +1088,8 @@ fn plugin_scaffold_tool(args: &Value) -> Result<Value> {
         .get("extension")
         .and_then(Value::as_str)
         .ok_or_else(|| anyhow!("missing plugin extension"))?;
-    let dir = super::scaffold_plugin(&root, name, extension)?;
-    let validation = super::validate_plugin_dir(&dir)?;
+    let dir = crate::cli::scaffold_plugin(&root, name, extension)?;
+    let validation = crate::cli::validate_plugin_dir(&dir)?;
     Ok(json!({
         "root": root,
         "dir": dir,
@@ -1182,8 +1182,8 @@ fn what_if_edge_tool(
     let before = build_baseline(&before_report, &before_health);
 
     let after_report = match action {
-        "remove_edge" => raysense_core::simulate::remove_edge(&before_report, from, to),
-        "add_edge" => raysense_core::simulate::add_edge(&before_report, from, to),
+        "remove_edge" => crate::simulate::remove_edge(&before_report, from, to),
+        "add_edge" => crate::simulate::add_edge(&before_report, from, to),
         _ => unreachable!("validated what-if action"),
     }
     .map_err(|err| anyhow!(err.to_string()))?;
@@ -1223,7 +1223,7 @@ fn what_if_remove_file_tool(args: &Value, root: &Path, config: &RaysenseConfig)
     let before_report = scan_path_with_config(root, config)?;
     let before_health = compute_health_with_config(&before_report, config);
     let before = build_baseline(&before_report, &before_health);
-    let after_report = raysense_core::simulate_remove_file(&before_report, file)
+    let after_report = crate::simulate_remove_file(&before_report, file)
         .map_err(|err| anyhow!(err.to_string()))?;
     let after_health = compute_health_with_config(&after_report, config);
     let after = build_baseline(&after_report, &after_health);
@@ -1250,7 +1250,7 @@ fn what_if_move_file_tool(args: &Value, root: &Path, config: &RaysenseConfig) ->
     let before_report = scan_path_with_config(root, config)?;
     let before_health = compute_health_with_config(&before_report, config);
     let before = build_baseline(&before_report, &before_health);
-    let after_report = raysense_core::simulate_move_file(&before_report, config, from, to)
+    let after_report = crate::simulate_move_file(&before_report, config, from, to)
         .map_err(|err| anyhow!(err.to_string()))?;
     let after_health = compute_health_with_config(&after_report, config);
     let after = build_baseline(&after_report, &after_health);
@@ -1278,7 +1278,7 @@ fn what_if_break_cycle_tool(args: &Value, root: &Path, config: &RaysenseConfig)
     let before_report = scan_path_with_config(root, config)?;
     let before_health = compute_health_with_config(&before_report, config);
     let before = build_baseline(&before_report, &before_health);
-    let after_report = raysense_core::simulate_break_cycle(&before_report, from, to)
+    let after_report = crate::simulate_break_cycle(&before_report, from, to)
         .map_err(|err| anyhow!(err.to_string()))?;
     let after_health = compute_health_with_config(&after_report, config);
     let after = build_baseline(&after_report, &after_health);
@@ -1309,7 +1309,7 @@ fn break_cycle_recommendations_tool(args: &Value) -> Result<Value> {
         .unwrap_or(500);
     let report = scan_path_with_config(&root, &config)?;
     let recommendations =
-        raysense_core::break_cycle_recommendations(&report, limit, max_candidates);
+        crate::break_cycle_recommendations(&report, limit, max_candidates);
     Ok(json!({
         "root": report.snapshot.root,
         "cycle_count_before": report.graph.cycle_count,
@@ -1319,7 +1319,7 @@ fn break_cycle_recommendations_tool(args: &Value) -> Result<Value> {
 }
 
 fn what_if_sequence_tool(actions: &[Value], root: &Path, config: &RaysenseConfig) -> Result<Value> {
-    let parsed_actions: Vec<raysense_core::simulate::Action> = actions
+    let parsed_actions: Vec<crate::simulate::Action> = actions
         .iter()
         .enumerate()
         .map(|(idx, step)| {
@@ -1332,7 +1332,7 @@ fn what_if_sequence_tool(actions: &[Value], root: &Path, config: &RaysenseConfig
     let before = build_baseline(&before_report, &before_health);
 
     let after_report =
-        raysense_core::simulate::simulate_sequence(&before_report, config, &parsed_actions)
+        crate::simulate::simulate_sequence(&before_report, config, &parsed_actions)
             .map_err(|err| anyhow!(err.to_string()))?;
 
     let after_health = compute_health_with_config(&after_report, config);
@@ -1346,7 +1346,7 @@ fn what_if_sequence_tool(actions: &[Value], root: &Path, config: &RaysenseConfig
     }))
 }
 
-fn what_if_health_summary(health: &raysense_core::HealthSummary) -> Value {
+fn what_if_health_summary(health: &crate::HealthSummary) -> Value {
     json!({
         "score": health.score,
         "quality_signal": health.quality_signal,
@@ -1382,7 +1382,7 @@ fn policy_init_tool(args: &Value) -> Result<Value> {
         .ok_or_else(|| anyhow!("missing preset"))?;
     let path = config_path_arg(args)?.unwrap_or_else(|| root.join(".raysense.toml"));
     let mut config = load_or_default_config(&path)?;
-    super::apply_policy_preset(&mut config, preset)?;
+    crate::cli::apply_policy_preset(&mut config, preset)?;
     write_config_path(&path, &config)?;
 
     Ok(json!({
@@ -1397,7 +1397,7 @@ fn memory_summary_tool(args: &Value) -> Result<Value> {
     let root = root_arg(args)?;
     let config = effective_config(args, &root)?;
     let report = scan_path_with_config(&root, &config)?;
-    let memory = raysense_memory::RayMemory::from_report_with_config(&report, &config)?;
+    let memory = crate::memory::RayMemory::from_report_with_config(&report, &config)?;
 
     Ok(json!({
         "root": report.snapshot.root,
@@ -1412,7 +1412,7 @@ fn baseline_save_tool(args: &Value) -> Result<Value> {
     let report = scan_path_with_config(&root, &config)?;
     let health = compute_health_with_config(&report, &config);
     let baseline = build_baseline(&report, &health);
-    let memory = raysense_memory::RayMemory::from_report_with_config(&report, &config)?;
+    let memory = crate::memory::RayMemory::from_report_with_config(&report, &config)?;
     let tables_dir = output.join("tables");
 
     fs::create_dir_all(&output)
@@ -1460,7 +1460,7 @@ fn baseline_tables_tool(args: &Value) -> Result<Value> {
     let root = root_arg(args)?;
     let baseline_dir = baseline_dir_arg(args, &root)?;
     let tables_dir = baseline_dir.join("tables");
-    let tables = raysense_memory::list_baseline_tables(&tables_dir)
+    let tables = crate::memory::list_baseline_tables(&tables_dir)
         .with_context(|| format!("failed to list baseline tables {}", tables_dir.display()))?;
 
     Ok(json!({
@@ -1497,7 +1497,7 @@ fn baseline_table_read_tool(args: &Value) -> Result<Value> {
         filter_mode: filter_mode_arg(args)?,
         sort: sort_arg(args)?,
     };
-    let table_rows = raysense_memory::query_baseline_table(&tables_dir, table, query)
+    let table_rows = crate::memory::query_baseline_table(&tables_dir, table, query)
         .with_context(|| format!("failed to read baseline table {}", tables_dir.display()))?;
 
     Ok(json!({
@@ -1507,7 +1507,7 @@ fn baseline_table_read_tool(args: &Value) -> Result<Value> {
     }))
 }
 
-fn health_from_args(args: &Value) -> Result<(PathBuf, raysense_core::HealthSummary)> {
+fn health_from_args(args: &Value) -> Result<(PathBuf, crate::HealthSummary)> {
     let root = root_arg(args)?;
     let config = effective_config(args, &root)?;
     let report = scan_path_with_config(&root, &config)?;
@@ -1521,7 +1521,7 @@ fn health_from_args(args: &Value) -> Result<(PathBuf, raysense_core::HealthSumma
 fn health_from_args_cached(
     args: &Value,
     state: &mut McpState,
-) -> Result<(PathBuf, raysense_core::HealthSummary)> {
+) -> Result<(PathBuf, crate::HealthSummary)> {
     let root = root_arg(args)?;
     let config = effective_config(args, &root)?;
     let signature = config_signature(&root, &config);
@@ -1570,7 +1570,7 @@ fn baseline_dir_arg(args: &Value, root: &Path) -> Result<PathBuf> {
         .map(|path| path.unwrap_or_else(|| root.join(".raysense/baseline")))
 }
 
-fn find_file_id(report: &raysense_core::ScanReport, requested: &str) -> Option<usize> {
+fn find_file_id(report: &crate::ScanReport, requested: &str) -> Option<usize> {
     let requested = requested.replace('\\', "/");
     report
         .files
@@ -1585,7 +1585,7 @@ fn find_file_id(report: &raysense_core::ScanReport, requested: &str) -> Option<u
         .map(|file| file.file_id)
 }
 
-fn reachable_files(report: &raysense_core::ScanReport, start: usize, limit: usize) -> Vec<Value> {
+fn reachable_files(report: &crate::ScanReport, start: usize, limit: usize) -> Vec<Value> {
     let adjacency = local_adjacency(report);
     let mut seen = HashSet::new();
     let mut queue = VecDeque::new();
@@ -1618,7 +1618,7 @@ fn reachable_files(report: &raysense_core::ScanReport, start: usize, limit: usiz
     out
 }
 
-fn reachable_count(report: &raysense_core::ScanReport, start: usize) -> usize {
+fn reachable_count(report: &crate::ScanReport, start: usize) -> usize {
     let adjacency = local_adjacency(report);
     let mut seen = HashSet::new();
     let mut queue = VecDeque::new();
@@ -1639,7 +1639,7 @@ fn reachable_count(report: &raysense_core::ScanReport, start: usize) -> usize {
     seen.len().saturating_sub(1)
 }
 
-fn local_adjacency(report: &raysense_core::ScanReport) -> HashMap<usize, Vec<usize>> {
+fn local_adjacency(report: &crate::ScanReport) -> HashMap<usize, Vec<usize>> {
     let mut adjacency: HashMap<usize, Vec<usize>> = HashMap::new();
     for import in &report.imports {
         let Some(to_file) = import.resolved_file else {
diff --git a/crates/raysense-memory/src/lib.rs b/src/memory.rs
similarity index 95%
rename from crates/raysense-memory/src/lib.rs
rename to src/memory.rs
index 6a804a9..07f6bcf 100644
--- a/crates/raysense-memory/src/lib.rs
+++ b/src/memory.rs
@@ -21,7 +21,7 @@
  *   SOFTWARE.
  */
 
-use raysense_core::{compute_health_with_config, HealthSummary, RaysenseConfig, ScanReport};
+use crate::{compute_health_with_config, HealthSummary, RaysenseConfig, ScanReport};
 use serde::Serialize;
 use std::ffi::CString;
 use std::fs;
@@ -300,14 +300,14 @@ impl RayMemory {
     fn save_table(
         &self,
         name: &'static str,
-        table: *mut rayforce_sys::ray_t,
+        table: *mut crate::sys::ray_t,
         base: &Path,
         sym_path: &Path,
     ) -> Result<(), MemoryError> {
         let path = CString::new(base.join(name).to_string_lossy().into_owned())?;
         let sym_path = CString::new(sym_path.to_string_lossy().into_owned())?;
-        let err = unsafe { rayforce_sys::ray_splay_save(table, path.as_ptr(), sym_path.as_ptr()) };
-        if err == rayforce_sys::RAY_OK {
+        let err = unsafe { crate::sys::ray_splay_save(table, path.as_ptr(), sym_path.as_ptr()) };
+        if err == crate::sys::RAY_OK {
             Ok(())
         } else {
             Err(MemoryError::SplaySave {
@@ -345,8 +345,8 @@ pub fn list_baseline_tables(dir: impl AsRef<Path>) -> Result<Vec<BaselineTableIn
         let table = read_table_object(dir, &name)?;
         tables.push(BaselineTableInfo {
             name,
-            columns: unsafe { rayforce_sys::ray_table_ncols(table.as_ptr()) },
-            rows: unsafe { rayforce_sys::ray_table_nrows(table.as_ptr()) },
+            columns: unsafe { crate::sys::ray_table_ncols(table.as_ptr()) },
+            rows: unsafe { crate::sys::ray_table_nrows(table.as_ptr()) },
         });
     }
 
@@ -400,7 +400,7 @@ fn read_table_object(dir: &Path, name: &str) -> Result<RayObject, MemoryError> {
         None
     };
     let ptr = unsafe {
-        rayforce_sys::ray_read_splayed(
+        crate::sys::ray_read_splayed(
             path.as_ptr(),
             sym_path
                 .as_ref()
@@ -413,9 +413,9 @@ fn read_table_object(dir: &Path, name: &str) -> Result<RayObject, MemoryError> {
             table: name.to_string(),
         });
     }
-    if unsafe { (*ptr).type_ } == rayforce_sys::RAY_ERROR {
+    if unsafe { (*ptr).type_ } == crate::sys::RAY_ERROR {
         let code = unsafe {
-            let code = rayforce_sys::ray_err_code(ptr);
+            let code = crate::sys::ray_err_code(ptr);
             if code.is_null() {
                 "unknown".to_string()
             } else {
@@ -434,18 +434,18 @@ fn read_table_object(dir: &Path, name: &str) -> Result<RayObject, MemoryError> {
 
 fn table_rows(
     name: &str,
-    table: *mut rayforce_sys::ray_t,
+    table: *mut crate::sys::ray_t,
     query: BaselineTableQuery,
 ) -> Result<BaselineTableRows, MemoryError> {
-    let total_rows = unsafe { rayforce_sys::ray_table_nrows(table) };
-    let ncols = unsafe { rayforce_sys::ray_table_ncols(table) };
+    let total_rows = unsafe { crate::sys::ray_table_nrows(table) };
+    let ncols = unsafe { crate::sys::ray_table_ncols(table) };
     let mut columns = Vec::new();
     let mut col_ptrs = Vec::new();
 
     for idx in 0..ncols {
-        let name_id = unsafe { rayforce_sys::ray_table_col_name(table, idx) };
+        let name_id = unsafe { crate::sys::ray_table_col_name(table, idx) };
         columns.push(symbol_text(name_id));
-        col_ptrs.push(unsafe { rayforce_sys::ray_table_get_col_idx(table, idx) });
+        col_ptrs.push(unsafe { crate::sys::ray_table_get_col_idx(table, idx) });
     }
 
     let projected = project_columns(&columns, query.columns.as_deref())?;
@@ -572,7 +572,7 @@ fn column_index(columns: &[String], name: &str) -> Result<usize, MemoryError> {
 }
 
 fn row_matches(
-    col_ptrs: &[*mut rayforce_sys::ray_t],
+    col_ptrs: &[*mut crate::sys::ray_t],
     row_idx: usize,
     filters: &[CompiledFilter],
     filter_mode: BaselineFilterMode,
@@ -676,14 +676,14 @@ fn value_sort_key(value: &serde_json::Value) -> String {
 }
 
 fn symbol_text(name_id: i64) -> String {
-    let atom = unsafe { rayforce_sys::ray_sym_str(name_id) };
+    let atom = unsafe { crate::sys::ray_sym_str(name_id) };
     if atom.is_null() {
         return format!("#{name_id}");
     }
     string_atom(atom).unwrap_or_else(|| format!("#{name_id}"))
 }
 
-fn cell_value(col: *mut rayforce_sys::ray_t, row_idx: i64) -> serde_json::Value {
+fn cell_value(col: *mut crate::sys::ray_t, row_idx: i64) -> serde_json::Value {
     if col.is_null() {
         return serde_json::Value::Null;
     }
@@ -693,37 +693,37 @@ fn cell_value(col: *mut rayforce_sys::ray_t, row_idx: i64) -> serde_json::Value
     }
 
     match unsafe { (*col).type_ } {
-        rayforce_sys::RAY_I32 => {
+        crate::sys::RAY_I32 => {
             let data = ray_data(col).cast::<i32>();
             serde_json::Value::from(unsafe { *data.add(row_idx as usize) })
         }
-        rayforce_sys::RAY_I64 => {
+        crate::sys::RAY_I64 => {
             let data = ray_data(col).cast::<i64>();
             serde_json::Value::from(unsafe { *data.add(row_idx as usize) })
         }
-        rayforce_sys::RAY_F64 => {
+        crate::sys::RAY_F64 => {
             let data = ray_data(col).cast::<f64>();
             serde_json::Number::from_f64(unsafe { *data.add(row_idx as usize) })
                 .map(serde_json::Value::Number)
                 .unwrap_or(serde_json::Value::Null)
         }
-        rayforce_sys::RAY_STR => string_vec_value(col, row_idx)
+        crate::sys::RAY_STR => string_vec_value(col, row_idx)
             .map(serde_json::Value::String)
             .unwrap_or(serde_json::Value::Null),
         other => serde_json::Value::String(format!("<unsupported type {other}>")),
     }
 }
 
-fn ray_data(obj: *mut rayforce_sys::ray_t) -> *const u8 {
+fn ray_data(obj: *mut crate::sys::ray_t) -> *const u8 {
     unsafe {
         obj.cast::<u8>()
-            .add(std::mem::size_of::<rayforce_sys::ray_t>())
+            .add(std::mem::size_of::<crate::sys::ray_t>())
     }
 }
 
-fn string_vec_value(col: *mut rayforce_sys::ray_t, row_idx: i64) -> Option<String> {
+fn string_vec_value(col: *mut crate::sys::ray_t, row_idx: i64) -> Option<String> {
     let mut len = 0usize;
-    let ptr = unsafe { rayforce_sys::ray_str_vec_get(col, row_idx, &mut len) };
+    let ptr = unsafe { crate::sys::ray_str_vec_get(col, row_idx, &mut len) };
     if ptr.is_null() {
         return None;
     }
@@ -733,9 +733,9 @@ fn string_vec_value(col: *mut rayforce_sys::ray_t, row_idx: i64) -> Option<Strin
     )
 }
 
-fn string_atom(atom: *mut rayforce_sys::ray_t) -> Option<String> {
-    let len = unsafe { rayforce_sys::ray_str_len(atom) };
-    let ptr = unsafe { rayforce_sys::ray_str_ptr(atom) };
+fn string_atom(atom: *mut crate::sys::ray_t) -> Option<String> {
+    let len = unsafe { crate::sys::ray_str_len(atom) };
+    let ptr = unsafe { crate::sys::ray_str_ptr(atom) };
     if ptr.is_null() {
         return None;
     }
@@ -746,21 +746,21 @@ fn string_atom(atom: *mut rayforce_sys::ray_t) -> Option<String> {
 }
 
 struct RayObject {
-    ptr: NonNull<rayforce_sys::ray_t>,
+    ptr: NonNull<crate::sys::ray_t>,
 }
 
 impl RayObject {
-    fn new(ptr: *mut rayforce_sys::ray_t, context: &'static str) -> Result<Self, MemoryError> {
+    fn new(ptr: *mut crate::sys::ray_t, context: &'static str) -> Result<Self, MemoryError> {
         NonNull::new(ptr)
             .map(|ptr| Self { ptr })
             .ok_or(MemoryError::Null(context))
     }
 
-    fn as_ptr(&self) -> *mut rayforce_sys::ray_t {
+    fn as_ptr(&self) -> *mut crate::sys::ray_t {
         self.ptr.as_ptr()
     }
 
-    fn into_raw(self) -> *mut rayforce_sys::ray_t {
+    fn into_raw(self) -> *mut crate::sys::ray_t {
         let ptr = self.ptr.as_ptr();
         std::mem::forget(self);
         ptr
@@ -770,14 +770,14 @@ impl RayObject {
 impl Drop for RayObject {
     fn drop(&mut self) {
         unsafe {
-            rayforce_sys::ray_release(self.ptr.as_ptr());
+            crate::sys::ray_release(self.ptr.as_ptr());
         }
     }
 }
 
 fn init_symbols() -> Result<(), MemoryError> {
-    let err = unsafe { rayforce_sys::ray_sym_init() };
-    if err == rayforce_sys::RAY_OK {
+    let err = unsafe { crate::sys::ray_sym_init() };
+    if err == crate::sys::RAY_OK {
         Ok(())
     } else {
         Err(MemoryError::SymbolInit(err))
@@ -1643,13 +1643,13 @@ fn i64_vec(
     values: impl IntoIterator<Item = i64>,
 ) -> Result<RayObject, MemoryError> {
     let mut vec = RayObject::new(
-        unsafe { rayforce_sys::ray_vec_new(rayforce_sys::RAY_I64, capacity as i64) },
+        unsafe { crate::sys::ray_vec_new(crate::sys::RAY_I64, capacity as i64) },
         "i64 vector",
     )?;
 
     for value in values {
         let next = unsafe {
-            rayforce_sys::ray_vec_append(
+            crate::sys::ray_vec_append(
                 vec.into_raw(),
                 (&value as *const i64).cast::<std::ffi::c_void>(),
             )
@@ -1665,14 +1665,14 @@ fn str_vec(
     values: impl IntoIterator<Item = String>,
 ) -> Result<RayObject, MemoryError> {
     let mut vec = RayObject::new(
-        unsafe { rayforce_sys::ray_vec_new(rayforce_sys::RAY_STR, capacity as i64) },
+        unsafe { crate::sys::ray_vec_new(crate::sys::RAY_STR, capacity as i64) },
         "string vector",
     )?;
 
     for value in values {
         let value = CString::new(value)?;
         let next = unsafe {
-            rayforce_sys::ray_str_vec_append(vec.into_raw(), value.as_ptr(), value.as_bytes().len())
+            crate::sys::ray_str_vec_append(vec.into_raw(), value.as_ptr(), value.as_bytes().len())
         };
         vec = RayObject::new(next, "string vector append")?;
     }
@@ -1685,32 +1685,32 @@ fn table<const N: usize>(
     columns: [(&'static str, RayObject); N],
 ) -> Result<RayObject, MemoryError> {
     let mut table = RayObject::new(
-        unsafe { rayforce_sys::ray_table_new(capacity) },
+        unsafe { crate::sys::ray_table_new(capacity) },
         "rayforce table",
     )?;
 
     for (name, col) in columns {
         let name = CString::new(name)?;
-        let name_id = unsafe { rayforce_sys::ray_sym_intern(name.as_ptr(), name.as_bytes().len()) };
+        let name_id = unsafe { crate::sys::ray_sym_intern(name.as_ptr(), name.as_bytes().len()) };
         let next =
-            unsafe { rayforce_sys::ray_table_add_col(table.into_raw(), name_id, col.as_ptr()) };
+            unsafe { crate::sys::ray_table_add_col(table.into_raw(), name_id, col.as_ptr()) };
         table = RayObject::new(next, "rayforce table column")?;
     }
 
     Ok(table)
 }
 
-fn table_summary(table: *mut rayforce_sys::ray_t) -> TableSummary {
+fn table_summary(table: *mut crate::sys::ray_t) -> TableSummary {
     TableSummary {
-        columns: unsafe { rayforce_sys::ray_table_ncols(table) },
-        rows: unsafe { rayforce_sys::ray_table_nrows(table) },
+        columns: unsafe { crate::sys::ray_table_ncols(table) },
+        rows: unsafe { crate::sys::ray_table_nrows(table) },
     }
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
-    use raysense_core::{scan_path, FileFact, Language, SnapshotFact};
+    use crate::{scan_path, FileFact, Language, SnapshotFact};
     use serde_json::json;
     use std::path::PathBuf;
     use std::time::{SystemTime, UNIX_EPOCH};
@@ -1740,7 +1740,7 @@ mod tests {
         assert_eq!(summary.rules.columns, 4);
         assert_eq!(summary.module_edges.columns, 3);
         assert_eq!(summary.changed_files.columns, 2);
-        let health = raysense_core::compute_health(&report);
+        let health = crate::compute_health(&report);
         assert_eq!(summary.temporal_hotspots.columns, 4);
         assert_eq!(
             summary.temporal_hotspots.rows as usize,
@@ -2038,7 +2038,7 @@ mod tests {
             calls: Vec::new(),
             call_edges: Vec::new(),
             types: Vec::new(),
-            graph: raysense_core::GraphMetrics::default(),
+            graph: crate::GraphMetrics::default(),
         }
     }
 
diff --git a/crates/raysense-core/src/profile.rs b/src/profile.rs
similarity index 100%
rename from crates/raysense-core/src/profile.rs
rename to src/profile.rs
diff --git a/crates/raysense-core/src/scanner.rs b/src/scanner.rs
similarity index 100%
rename from crates/raysense-core/src/scanner.rs
rename to src/scanner.rs
diff --git a/crates/raysense-core/src/simulate.rs b/src/simulate.rs
similarity index 100%
rename from crates/raysense-core/src/simulate.rs
rename to src/simulate.rs
diff --git a/crates/rayforce-sys/src/lib.rs b/src/sys.rs
similarity index 100%
rename from crates/rayforce-sys/src/lib.rs
rename to src/sys.rs
diff --git a/crates/rayforce-sys/vendor/rayforce/LICENSE b/vendor/rayforce/LICENSE
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/LICENSE
rename to vendor/rayforce/LICENSE
diff --git a/crates/rayforce-sys/vendor/rayforce/include/rayforce.h b/vendor/rayforce/include/rayforce.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/include/rayforce.h
rename to vendor/rayforce/include/rayforce.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/block.c b/vendor/rayforce/src/core/block.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/block.c
rename to vendor/rayforce/src/core/block.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/block.h b/vendor/rayforce/src/core/block.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/block.h
rename to vendor/rayforce/src/core/block.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/epoll.c b/vendor/rayforce/src/core/epoll.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/epoll.c
rename to vendor/rayforce/src/core/epoll.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/iocp.c b/vendor/rayforce/src/core/iocp.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/iocp.c
rename to vendor/rayforce/src/core/iocp.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/ipc.c b/vendor/rayforce/src/core/ipc.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/ipc.c
rename to vendor/rayforce/src/core/ipc.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/ipc.h b/vendor/rayforce/src/core/ipc.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/ipc.h
rename to vendor/rayforce/src/core/ipc.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/kqueue.c b/vendor/rayforce/src/core/kqueue.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/kqueue.c
rename to vendor/rayforce/src/core/kqueue.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/morsel.c b/vendor/rayforce/src/core/morsel.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/morsel.c
rename to vendor/rayforce/src/core/morsel.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/morsel.h b/vendor/rayforce/src/core/morsel.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/morsel.h
rename to vendor/rayforce/src/core/morsel.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/numparse.c b/vendor/rayforce/src/core/numparse.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/numparse.c
rename to vendor/rayforce/src/core/numparse.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/numparse.h b/vendor/rayforce/src/core/numparse.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/numparse.h
rename to vendor/rayforce/src/core/numparse.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/platform.c b/vendor/rayforce/src/core/platform.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/platform.c
rename to vendor/rayforce/src/core/platform.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/platform.h b/vendor/rayforce/src/core/platform.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/platform.h
rename to vendor/rayforce/src/core/platform.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/poll.c b/vendor/rayforce/src/core/poll.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/poll.c
rename to vendor/rayforce/src/core/poll.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/poll.h b/vendor/rayforce/src/core/poll.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/poll.h
rename to vendor/rayforce/src/core/poll.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/pool.c b/vendor/rayforce/src/core/pool.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/pool.c
rename to vendor/rayforce/src/core/pool.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/pool.h b/vendor/rayforce/src/core/pool.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/pool.h
rename to vendor/rayforce/src/core/pool.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/profile.h b/vendor/rayforce/src/core/profile.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/profile.h
rename to vendor/rayforce/src/core/profile.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/progress.c b/vendor/rayforce/src/core/progress.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/progress.c
rename to vendor/rayforce/src/core/progress.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/runtime.c b/vendor/rayforce/src/core/runtime.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/runtime.c
rename to vendor/rayforce/src/core/runtime.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/runtime.h b/vendor/rayforce/src/core/runtime.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/runtime.h
rename to vendor/rayforce/src/core/runtime.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/sock.c b/vendor/rayforce/src/core/sock.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/sock.c
rename to vendor/rayforce/src/core/sock.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/sock.h b/vendor/rayforce/src/core/sock.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/sock.h
rename to vendor/rayforce/src/core/sock.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/types.c b/vendor/rayforce/src/core/types.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/types.c
rename to vendor/rayforce/src/core/types.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/core/types.h b/vendor/rayforce/src/core/types.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/core/types.h
rename to vendor/rayforce/src/core/types.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/io/csv.c b/vendor/rayforce/src/io/csv.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/io/csv.c
rename to vendor/rayforce/src/io/csv.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/io/csv.h b/vendor/rayforce/src/io/csv.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/io/csv.h
rename to vendor/rayforce/src/io/csv.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/cal.h b/vendor/rayforce/src/lang/cal.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/lang/cal.h
rename to vendor/rayforce/src/lang/cal.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/compile.c b/vendor/rayforce/src/lang/compile.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/lang/compile.c
rename to vendor/rayforce/src/lang/compile.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/env.c b/vendor/rayforce/src/lang/env.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/lang/env.c
rename to vendor/rayforce/src/lang/env.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/env.h b/vendor/rayforce/src/lang/env.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/lang/env.h
rename to vendor/rayforce/src/lang/env.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/eval.c b/vendor/rayforce/src/lang/eval.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/lang/eval.c
rename to vendor/rayforce/src/lang/eval.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/eval.h b/vendor/rayforce/src/lang/eval.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/lang/eval.h
rename to vendor/rayforce/src/lang/eval.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/format.c b/vendor/rayforce/src/lang/format.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/lang/format.c
rename to vendor/rayforce/src/lang/format.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/format.h b/vendor/rayforce/src/lang/format.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/lang/format.h
rename to vendor/rayforce/src/lang/format.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/internal.h b/vendor/rayforce/src/lang/internal.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/lang/internal.h
rename to vendor/rayforce/src/lang/internal.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/nfo.c b/vendor/rayforce/src/lang/nfo.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/lang/nfo.c
rename to vendor/rayforce/src/lang/nfo.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/nfo.h b/vendor/rayforce/src/lang/nfo.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/lang/nfo.h
rename to vendor/rayforce/src/lang/nfo.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/parse.c b/vendor/rayforce/src/lang/parse.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/lang/parse.c
rename to vendor/rayforce/src/lang/parse.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/parse.h b/vendor/rayforce/src/lang/parse.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/lang/parse.h
rename to vendor/rayforce/src/lang/parse.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/syscmd.c b/vendor/rayforce/src/lang/syscmd.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/lang/syscmd.c
rename to vendor/rayforce/src/lang/syscmd.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/lang/syscmd.h b/vendor/rayforce/src/lang/syscmd.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/lang/syscmd.h
rename to vendor/rayforce/src/lang/syscmd.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/mem/arena.c b/vendor/rayforce/src/mem/arena.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/mem/arena.c
rename to vendor/rayforce/src/mem/arena.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/mem/arena.h b/vendor/rayforce/src/mem/arena.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/mem/arena.h
rename to vendor/rayforce/src/mem/arena.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/mem/cow.c b/vendor/rayforce/src/mem/cow.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/mem/cow.c
rename to vendor/rayforce/src/mem/cow.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/mem/cow.h b/vendor/rayforce/src/mem/cow.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/mem/cow.h
rename to vendor/rayforce/src/mem/cow.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/mem/heap.c b/vendor/rayforce/src/mem/heap.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/mem/heap.c
rename to vendor/rayforce/src/mem/heap.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/mem/heap.h b/vendor/rayforce/src/mem/heap.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/mem/heap.h
rename to vendor/rayforce/src/mem/heap.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/mem/sys.c b/vendor/rayforce/src/mem/sys.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/mem/sys.c
rename to vendor/rayforce/src/mem/sys.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/mem/sys.h b/vendor/rayforce/src/mem/sys.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/mem/sys.h
rename to vendor/rayforce/src/mem/sys.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/agg.c b/vendor/rayforce/src/ops/agg.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/agg.c
rename to vendor/rayforce/src/ops/agg.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/arith.c b/vendor/rayforce/src/ops/arith.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/arith.c
rename to vendor/rayforce/src/ops/arith.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/builtins.c b/vendor/rayforce/src/ops/builtins.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/builtins.c
rename to vendor/rayforce/src/ops/builtins.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/cmp.c b/vendor/rayforce/src/ops/cmp.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/cmp.c
rename to vendor/rayforce/src/ops/cmp.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/collection.c b/vendor/rayforce/src/ops/collection.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/collection.c
rename to vendor/rayforce/src/ops/collection.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/datalog.c b/vendor/rayforce/src/ops/datalog.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/datalog.c
rename to vendor/rayforce/src/ops/datalog.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/datalog.h b/vendor/rayforce/src/ops/datalog.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/datalog.h
rename to vendor/rayforce/src/ops/datalog.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/dump.c b/vendor/rayforce/src/ops/dump.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/dump.c
rename to vendor/rayforce/src/ops/dump.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/embedding.c b/vendor/rayforce/src/ops/embedding.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/embedding.c
rename to vendor/rayforce/src/ops/embedding.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/exec.c b/vendor/rayforce/src/ops/exec.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/exec.c
rename to vendor/rayforce/src/ops/exec.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/exec.h b/vendor/rayforce/src/ops/exec.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/exec.h
rename to vendor/rayforce/src/ops/exec.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/expr.c b/vendor/rayforce/src/ops/expr.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/expr.c
rename to vendor/rayforce/src/ops/expr.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/filter.c b/vendor/rayforce/src/ops/filter.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/filter.c
rename to vendor/rayforce/src/ops/filter.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/fuse.c b/vendor/rayforce/src/ops/fuse.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/fuse.c
rename to vendor/rayforce/src/ops/fuse.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/fuse.h b/vendor/rayforce/src/ops/fuse.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/fuse.h
rename to vendor/rayforce/src/ops/fuse.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/fvec.c b/vendor/rayforce/src/ops/fvec.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/fvec.c
rename to vendor/rayforce/src/ops/fvec.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/fvec.h b/vendor/rayforce/src/ops/fvec.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/fvec.h
rename to vendor/rayforce/src/ops/fvec.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/glob.c b/vendor/rayforce/src/ops/glob.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/glob.c
rename to vendor/rayforce/src/ops/glob.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/glob.h b/vendor/rayforce/src/ops/glob.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/glob.h
rename to vendor/rayforce/src/ops/glob.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/graph.c b/vendor/rayforce/src/ops/graph.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/graph.c
rename to vendor/rayforce/src/ops/graph.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/graph.h b/vendor/rayforce/src/ops/graph.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/graph.h
rename to vendor/rayforce/src/ops/graph.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/group.c b/vendor/rayforce/src/ops/group.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/group.c
rename to vendor/rayforce/src/ops/group.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/hash.h b/vendor/rayforce/src/ops/hash.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/hash.h
rename to vendor/rayforce/src/ops/hash.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/idxop.c b/vendor/rayforce/src/ops/idxop.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/idxop.c
rename to vendor/rayforce/src/ops/idxop.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/idxop.h b/vendor/rayforce/src/ops/idxop.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/idxop.h
rename to vendor/rayforce/src/ops/idxop.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/internal.h b/vendor/rayforce/src/ops/internal.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/internal.h
rename to vendor/rayforce/src/ops/internal.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/join.c b/vendor/rayforce/src/ops/join.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/join.c
rename to vendor/rayforce/src/ops/join.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/journal.c b/vendor/rayforce/src/ops/journal.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/journal.c
rename to vendor/rayforce/src/ops/journal.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/journal.h b/vendor/rayforce/src/ops/journal.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/journal.h
rename to vendor/rayforce/src/ops/journal.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/lftj.c b/vendor/rayforce/src/ops/lftj.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/lftj.c
rename to vendor/rayforce/src/ops/lftj.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/lftj.h b/vendor/rayforce/src/ops/lftj.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/lftj.h
rename to vendor/rayforce/src/ops/lftj.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/linkop.c b/vendor/rayforce/src/ops/linkop.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/linkop.c
rename to vendor/rayforce/src/ops/linkop.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/linkop.h b/vendor/rayforce/src/ops/linkop.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/linkop.h
rename to vendor/rayforce/src/ops/linkop.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/ops.h b/vendor/rayforce/src/ops/ops.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/ops.h
rename to vendor/rayforce/src/ops/ops.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/opt.c b/vendor/rayforce/src/ops/opt.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/opt.c
rename to vendor/rayforce/src/ops/opt.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/opt.h b/vendor/rayforce/src/ops/opt.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/opt.h
rename to vendor/rayforce/src/ops/opt.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/pipe.c b/vendor/rayforce/src/ops/pipe.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/pipe.c
rename to vendor/rayforce/src/ops/pipe.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/pipe.h b/vendor/rayforce/src/ops/pipe.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/pipe.h
rename to vendor/rayforce/src/ops/pipe.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/pivot.c b/vendor/rayforce/src/ops/pivot.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/pivot.c
rename to vendor/rayforce/src/ops/pivot.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/plan.c b/vendor/rayforce/src/ops/plan.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/plan.c
rename to vendor/rayforce/src/ops/plan.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/plan.h b/vendor/rayforce/src/ops/plan.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/plan.h
rename to vendor/rayforce/src/ops/plan.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/query.c b/vendor/rayforce/src/ops/query.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/query.c
rename to vendor/rayforce/src/ops/query.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/rerank.c b/vendor/rayforce/src/ops/rerank.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/rerank.c
rename to vendor/rayforce/src/ops/rerank.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/rowsel.c b/vendor/rayforce/src/ops/rowsel.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/rowsel.c
rename to vendor/rayforce/src/ops/rowsel.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/rowsel.h b/vendor/rayforce/src/ops/rowsel.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/rowsel.h
rename to vendor/rayforce/src/ops/rowsel.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/sort.c b/vendor/rayforce/src/ops/sort.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/sort.c
rename to vendor/rayforce/src/ops/sort.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/string.c b/vendor/rayforce/src/ops/string.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/string.c
rename to vendor/rayforce/src/ops/string.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/strop.c b/vendor/rayforce/src/ops/strop.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/strop.c
rename to vendor/rayforce/src/ops/strop.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/system.c b/vendor/rayforce/src/ops/system.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/system.c
rename to vendor/rayforce/src/ops/system.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/tblop.c b/vendor/rayforce/src/ops/tblop.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/tblop.c
rename to vendor/rayforce/src/ops/tblop.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/temporal.c b/vendor/rayforce/src/ops/temporal.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/temporal.c
rename to vendor/rayforce/src/ops/temporal.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/temporal.h b/vendor/rayforce/src/ops/temporal.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/temporal.h
rename to vendor/rayforce/src/ops/temporal.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/traverse.c b/vendor/rayforce/src/ops/traverse.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/traverse.c
rename to vendor/rayforce/src/ops/traverse.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/ops/window.c b/vendor/rayforce/src/ops/window.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/ops/window.c
rename to vendor/rayforce/src/ops/window.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/col.c b/vendor/rayforce/src/store/col.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/store/col.c
rename to vendor/rayforce/src/store/col.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/col.h b/vendor/rayforce/src/store/col.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/store/col.h
rename to vendor/rayforce/src/store/col.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/csr.c b/vendor/rayforce/src/store/csr.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/store/csr.c
rename to vendor/rayforce/src/store/csr.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/csr.h b/vendor/rayforce/src/store/csr.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/store/csr.h
rename to vendor/rayforce/src/store/csr.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/fileio.c b/vendor/rayforce/src/store/fileio.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/store/fileio.c
rename to vendor/rayforce/src/store/fileio.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/fileio.h b/vendor/rayforce/src/store/fileio.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/store/fileio.h
rename to vendor/rayforce/src/store/fileio.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/hnsw.c b/vendor/rayforce/src/store/hnsw.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/store/hnsw.c
rename to vendor/rayforce/src/store/hnsw.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/hnsw.h b/vendor/rayforce/src/store/hnsw.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/store/hnsw.h
rename to vendor/rayforce/src/store/hnsw.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/journal.c b/vendor/rayforce/src/store/journal.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/store/journal.c
rename to vendor/rayforce/src/store/journal.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/journal.h b/vendor/rayforce/src/store/journal.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/store/journal.h
rename to vendor/rayforce/src/store/journal.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/meta.c b/vendor/rayforce/src/store/meta.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/store/meta.c
rename to vendor/rayforce/src/store/meta.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/meta.h b/vendor/rayforce/src/store/meta.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/store/meta.h
rename to vendor/rayforce/src/store/meta.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/part.c b/vendor/rayforce/src/store/part.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/store/part.c
rename to vendor/rayforce/src/store/part.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/part.h b/vendor/rayforce/src/store/part.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/store/part.h
rename to vendor/rayforce/src/store/part.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/serde.c b/vendor/rayforce/src/store/serde.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/store/serde.c
rename to vendor/rayforce/src/store/serde.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/serde.h b/vendor/rayforce/src/store/serde.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/store/serde.h
rename to vendor/rayforce/src/store/serde.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/splay.c b/vendor/rayforce/src/store/splay.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/store/splay.c
rename to vendor/rayforce/src/store/splay.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/store/splay.h b/vendor/rayforce/src/store/splay.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/store/splay.h
rename to vendor/rayforce/src/store/splay.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/table/dict.c b/vendor/rayforce/src/table/dict.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/table/dict.c
rename to vendor/rayforce/src/table/dict.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/table/dict.h b/vendor/rayforce/src/table/dict.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/table/dict.h
rename to vendor/rayforce/src/table/dict.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/table/sym.c b/vendor/rayforce/src/table/sym.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/table/sym.c
rename to vendor/rayforce/src/table/sym.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/table/sym.h b/vendor/rayforce/src/table/sym.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/table/sym.h
rename to vendor/rayforce/src/table/sym.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/table/table.c b/vendor/rayforce/src/table/table.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/table/table.c
rename to vendor/rayforce/src/table/table.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/table/table.h b/vendor/rayforce/src/table/table.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/table/table.h
rename to vendor/rayforce/src/table/table.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/vec/atom.c b/vendor/rayforce/src/vec/atom.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/vec/atom.c
rename to vendor/rayforce/src/vec/atom.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/vec/atom.h b/vendor/rayforce/src/vec/atom.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/vec/atom.h
rename to vendor/rayforce/src/vec/atom.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/vec/embedding.h b/vendor/rayforce/src/vec/embedding.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/vec/embedding.h
rename to vendor/rayforce/src/vec/embedding.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/vec/list.c b/vendor/rayforce/src/vec/list.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/vec/list.c
rename to vendor/rayforce/src/vec/list.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/vec/list.h b/vendor/rayforce/src/vec/list.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/vec/list.h
rename to vendor/rayforce/src/vec/list.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/vec/sel.c b/vendor/rayforce/src/vec/sel.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/vec/sel.c
rename to vendor/rayforce/src/vec/sel.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/vec/str.c b/vendor/rayforce/src/vec/str.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/vec/str.c
rename to vendor/rayforce/src/vec/str.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/vec/str.h b/vendor/rayforce/src/vec/str.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/vec/str.h
rename to vendor/rayforce/src/vec/str.h
diff --git a/crates/rayforce-sys/vendor/rayforce/src/vec/vec.c b/vendor/rayforce/src/vec/vec.c
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/vec/vec.c
rename to vendor/rayforce/src/vec/vec.c
diff --git a/crates/rayforce-sys/vendor/rayforce/src/vec/vec.h b/vendor/rayforce/src/vec/vec.h
similarity index 100%
rename from crates/rayforce-sys/vendor/rayforce/src/vec/vec.h
rename to vendor/rayforce/src/vec/vec.h

From 0235ae06c384b75d6a987cb232e0f5502e14cda7 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Fri, 1 May 2026 12:42:25 +0200
Subject: [PATCH 3/5] =?UTF-8?q?refactor:=20flag-based=20CLI=20=E2=80=94=20?=
 =?UTF-8?q?`raysense=20[PATH]`=20default,=20top-level=20mode=20flags?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the Command enum's 16 subcommands with a top-level Args struct
that takes the scan path as an optional positional and exposes the
common modes as flags:

  raysense .              -> health report (default, was `health`)
  raysense . --json       -> JSON of the health summary
  raysense . --check      -> rule gate, was `check`
  raysense . --watch      -> watch mode, was `watch`
  raysense . --ui [PORT]  -> live UI server, was `visualize`
  raysense --mcp          -> stdio MCP server, was `mcp`

Advanced operations (baseline / plugin / policy / trend / whatif) stay
as subcommands so their multi-arg shapes don't pollute the simple path.
Drops `observe` / `health` (now default), `edges` / `memory` (rolled
into the default report or accessible via baseline tables), `gate`
(use `--check` plus `baseline diff`), `remediate` (out of scope here),
and `rayforce-version` (use `--rayforce-version` flag or `--version`).

README rewritten with the new shape: the user-facing surface is now six
flags plus five power-user subcommands.
---
 README.md  |  36 ++---
 src/cli.rs | 452 ++++++++++++++---------------------------------------
 2 files changed, 123 insertions(+), 365 deletions(-)

diff --git a/README.md b/README.md
index 4017b72..f7c6420 100644
--- a/README.md
+++ b/README.md
@@ -49,36 +49,20 @@ Or build from source — see [Building](#building) below.
 
 ## Use
 
-Three things, one binary.
-
-**Live dashboard.** Open it once, leave it open. Updates the moment your
-code does, never on a fixed timer.
-
-```bash
-raysense visualize .
-```
-
-**Health report.** A single number out of 100, plus A–F grades on six
-dimensions, plus the rules currently failing.
-
-```bash
-raysense health .
-```
-
-**CI gate.** Exit non-zero if any rule fails or scores drop against a
-saved baseline.
+One command, a few flags. The default is a health report.
 
 ```bash
-raysense check .
+raysense .                  # health report
+raysense . --json           # machine-readable JSON
+raysense . --check          # CI gate, exits non-zero on rule failures
+raysense . --watch          # rescan + reprint on a 2s loop
+raysense . --ui             # live dashboard at http://localhost:7000
+raysense --mcp              # stdio MCP server for agents
 ```
 
-**Agent connector.** Hook Raysense into Claude, Cursor, or any MCP-capable
-client. 40+ tools — scan, edges, hotspots, what-if simulation, baseline
-diff, evolution metrics — all queryable.
-
-```bash
-raysense mcp
-```
+Power-user operations live as subcommands: `baseline save|diff`,
+`plugin sync`, `policy init`, `trend record|show`, `whatif`. See
+`raysense --help` for the full surface.
 
 ## What it measures
 
diff --git a/src/cli.rs b/src/cli.rs
index b84ac5a..a89a61a 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -21,20 +21,19 @@
  *   SOFTWARE.
  */
 
-use anyhow::{anyhow, Context, Result};
-use clap::{Parser, Subcommand};
-use crate::{
-    build_baseline, compute_health_with_config, diff_baselines, scan_path_with_config,
-    BaselineDiff, ImportResolution, ProjectBaseline, RaysenseConfig,
-};
 use crate::memory::{
     BaselineFilterMode, BaselineFilterOp, BaselineSortDirection, BaselineTableFilter,
     BaselineTableQuery, BaselineTableSort,
 };
+use crate::{
+    build_baseline, compute_health_with_config, diff_baselines, scan_path_with_config,
+    BaselineDiff, ProjectBaseline, RaysenseConfig,
+};
+use anyhow::{anyhow, Context, Result};
+use clap::{Parser, Subcommand};
 use serde_json::{json, Value};
 use std::collections::{BTreeMap, BTreeSet};
 use std::fs;
-use std::io::{self, Write};
 use std::path::{Path, PathBuf};
 use std::process;
 use std::thread;
@@ -42,109 +41,85 @@ use std::time::{Duration, SystemTime, UNIX_EPOCH};
 
 use crate::mcp;
 
+/// One-tool CLI: `raysense [path]` runs a health report by default.
+/// Top-level flags pick a different mode (json, ui, watch, check, mcp).
+/// Advanced operations (baseline / plugin / policy / trend / whatif) live as
+/// subcommands so their multi-arg shapes don't pollute the simple path.
 #[derive(Debug, Parser)]
 #[command(name = "raysense")]
-#[command(about = "Local architectural telemetry for AI coding agents")]
+#[command(version)]
+#[command(about = "Architectural X-ray for your codebase. Live, local, agent-ready.")]
 struct Args {
+    /// Path to scan. Default: current directory.
+    #[arg(default_value = ".")]
+    path: PathBuf,
+
+    /// Emit machine-readable JSON instead of human-readable text.
+    #[arg(long)]
+    json: bool,
+
+    /// Run the rule gate. Exits non-zero if any rule fails.
+    #[arg(long)]
+    check: bool,
+
+    /// With `--check`: also write a SARIF code-scanning report here.
+    #[arg(long, value_name = "PATH")]
+    sarif: Option<PathBuf>,
+
+    /// Watch mode: rescan on a fixed interval and reprint.
+    #[arg(long)]
+    watch: bool,
+
+    /// Start the live UI HTTP server. Optional port (default 7000).
+    #[arg(long, value_name = "PORT", num_args = 0..=1, default_missing_value = "7000")]
+    ui: Option<u16>,
+
+    /// Re-scan interval in seconds (used by `--watch` and `--ui`).
+    #[arg(long, default_value_t = 2)]
+    interval: u64,
+
+    /// Run as a stdio MCP server. Path is ignored.
+    #[arg(long)]
+    mcp: bool,
+
+    /// Print the linked C library version and exit.
+    #[arg(long)]
+    rayforce_version: bool,
+
+    /// Optional explicit `.raysense.toml` path.
+    #[arg(long, value_name = "FILE")]
+    config: Option<PathBuf>,
+
     #[command(subcommand)]
-    command: Command,
+    advanced: Option<Command>,
 }
 
+/// Advanced subcommands. Most users never need these — the top-level flags
+/// cover the common 90 %.
 #[derive(Debug, Subcommand)]
 enum Command {
-    Observe {
-        path: PathBuf,
-        #[arg(long)]
-        json: bool,
-        #[arg(long)]
-        memory: bool,
-        #[arg(long)]
-        config: Option<PathBuf>,
-    },
-    Health {
-        path: PathBuf,
-        #[arg(long)]
-        json: bool,
-        #[arg(long)]
-        config: Option<PathBuf>,
-    },
-    Edges {
-        path: PathBuf,
-        #[arg(long)]
-        all: bool,
-        #[arg(long)]
-        config: Option<PathBuf>,
-    },
-    RayforceVersion,
-    Memory {
-        path: PathBuf,
-        #[arg(long)]
-        config: Option<PathBuf>,
-    },
-    Check {
-        #[arg(default_value = ".")]
-        path: PathBuf,
-        #[arg(long)]
-        config: Option<PathBuf>,
-        #[arg(long)]
-        json: bool,
-        #[arg(long)]
-        sarif: Option<PathBuf>,
-    },
-    Gate {
-        #[arg(default_value = ".")]
-        path: PathBuf,
-        #[arg(long)]
-        save: bool,
-        #[arg(long)]
-        baseline: Option<PathBuf>,
-        #[arg(long)]
-        config: Option<PathBuf>,
-        #[arg(long)]
-        json: bool,
-    },
-    Watch {
-        #[arg(default_value = ".")]
-        path: PathBuf,
-        #[arg(long, default_value_t = 2)]
-        interval: u64,
-        #[arg(long)]
-        config: Option<PathBuf>,
-    },
-    /// Start a live UI server. The page subscribes to server-sent events and
-    /// reloads when the scan content hash changes — never on a fixed timer.
-    /// Single source of UI; no static HTML export.
-    Visualize {
-        #[arg(default_value = ".")]
-        path: PathBuf,
-        #[arg(long, default_value_t = 2)]
-        interval: u64,
-        #[arg(long)]
-        config: Option<PathBuf>,
-        #[arg(long, default_value_t = 7000)]
-        port: u16,
+    /// Save / diff / query a baseline of the current scan.
+    Baseline {
+        #[command(subcommand)]
+        command: BaselineCommand,
     },
+    /// Manage language plugins (list / add / sync / validate / scaffold).
     Plugin {
         #[command(subcommand)]
         command: PluginCommand,
     },
+    /// Apply or list rule policy presets.
     Policy {
         #[command(subcommand)]
         command: PolicyCommand,
     },
+    /// Record / show health-score trend snapshots.
     Trend {
         #[command(subcommand)]
         command: TrendCommand,
     },
-    Remediate {
-        #[arg(default_value = ".")]
-        path: PathBuf,
-        #[arg(long)]
-        config: Option<PathBuf>,
-        #[arg(long)]
-        json: bool,
-    },
-    WhatIf {
+    /// What-if simulation: rescan with extra ignored / generated paths.
+    Whatif {
         #[arg(default_value = ".")]
         path: PathBuf,
         #[arg(long)]
@@ -156,11 +131,6 @@ enum Command {
         #[arg(long)]
         json: bool,
     },
-    Baseline {
-        #[command(subcommand)]
-        command: BaselineCommand,
-    },
-    Mcp,
 }
 
 #[derive(Debug, Subcommand)]
@@ -305,78 +275,47 @@ enum BaselineCommand {
 pub fn run() -> Result<()> {
     let args = Args::parse();
 
-    match args.command {
-        Command::Observe {
-            path,
-            json,
-            memory,
-            config,
-        } => {
-            let config = config_for_root(&path, config.as_deref())?;
-            let report = scan_path_with_config(path, &config)?;
-            if json {
-                println!("{}", serde_json::to_string_pretty(&report)?);
-            } else if memory {
-                let memory = crate::memory::RayMemory::from_report_with_config(&report, &config)?;
-                print_memory_summary(&memory.summary());
-            } else {
-                print_summary(&report, &config);
-            }
-        }
-        Command::Health { path, json, config } => {
-            let config = config_for_root(&path, config.as_deref())?;
-            let report = scan_path_with_config(path, &config)?;
-            let health = compute_health_with_config(&report, &config);
-            if json {
-                println!("{}", serde_json::to_string_pretty(&health)?);
-            } else {
-                print_health(&report, &health);
-            }
-        }
-        Command::Edges { path, all, config } => {
-            let config = config_for_root(&path, config.as_deref())?;
-            let report = scan_path_with_config(path, &config)?;
-            print_edges(&report, all)?;
-        }
-        Command::RayforceVersion => {
-            println!("{}", crate::sys::version_string());
-        }
-        Command::Memory { path, config } => {
-            let config = config_for_root(&path, config.as_deref())?;
-            let report = scan_path_with_config(path, &config)?;
-            let memory = crate::memory::RayMemory::from_report_with_config(&report, &config)?;
-            print_memory_summary(&memory.summary());
-        }
-        Command::Check {
-            path,
-            config,
-            json,
-            sarif,
-        } => {
-            let exit = check_project(&path, config.as_deref(), json, sarif.as_deref())?;
-            process::exit(exit);
-        }
-        Command::Gate {
-            path,
-            save,
-            baseline,
-            config,
-            json,
-        } => {
-            let exit = gate_project(&path, baseline, config.as_deref(), save, json)?;
-            process::exit(exit);
-        }
-        Command::Watch {
-            path,
-            interval,
-            config,
-        } => watch_project(&path, config.as_deref(), interval)?,
-        Command::Visualize {
-            path,
-            interval,
-            config,
-            port,
-        } => serve_visualization(&path, config.as_deref(), interval, port)?,
+    if let Some(command) = args.advanced {
+        return run_advanced(command);
+    }
+
+    if args.rayforce_version {
+        println!("{}", crate::sys::version_string());
+        return Ok(());
+    }
+    if args.mcp {
+        return mcp::run();
+    }
+    if let Some(port) = args.ui {
+        return serve_visualization(&args.path, args.config.as_deref(), args.interval, port);
+    }
+    if args.watch {
+        return watch_project(&args.path, args.config.as_deref(), args.interval);
+    }
+    if args.check {
+        let exit = check_project(
+            &args.path,
+            args.config.as_deref(),
+            args.json,
+            args.sarif.as_deref(),
+        )?;
+        process::exit(exit);
+    }
+
+    // Default mode: health report.
+    let config = config_for_root(&args.path, args.config.as_deref())?;
+    let report = scan_path_with_config(&args.path, &config)?;
+    let health = compute_health_with_config(&report, &config);
+    if args.json {
+        println!("{}", serde_json::to_string_pretty(&health)?);
+    } else {
+        print_health(&report, &health);
+    }
+    Ok(())
+}
+
+fn run_advanced(command: Command) -> Result<()> {
+    match command {
         Command::Plugin { command } => match command {
             PluginCommand::List { path, config } => list_plugins(&path, config.as_deref())?,
             PluginCommand::Add {
@@ -446,10 +385,7 @@ pub fn run() -> Result<()> {
                 show_trend(&path, config.as_deref(), json)?
             }
         },
-        Command::Remediate { path, config, json } => {
-            print_remediations(&path, config.as_deref(), json)?
-        }
-        Command::WhatIf {
+        Command::Whatif {
             path,
             config,
             ignore_paths,
@@ -532,9 +468,6 @@ pub fn run() -> Result<()> {
                 }
             }
         },
-        Command::Mcp => {
-            mcp::run()?;
-        }
     }
 
     Ok(())
@@ -590,10 +523,7 @@ fn check_project(
     Ok(if has_errors { 1 } else { 0 })
 }
 
-pub(crate) fn sarif_report(
-    report: &crate::ScanReport,
-    health: &crate::HealthSummary,
-) -> Value {
+pub(crate) fn sarif_report(report: &crate::ScanReport, health: &crate::HealthSummary) -> Value {
     let mut seen_rules = BTreeSet::new();
     let rules = health
         .rules
@@ -682,32 +612,6 @@ fn sarif_uri(root: &Path, path: &str) -> String {
     }
 }
 
-fn gate_project(
-    root: &Path,
-    baseline: Option<PathBuf>,
-    config_path: Option<&Path>,
-    save: bool,
-    json: bool,
-) -> Result<i32> {
-    let baseline = baseline.unwrap_or_else(|| root.join(".raysense/baseline"));
-    if save {
-        save_baseline(root, &baseline, config_path)?;
-        println!("baseline {}", baseline.display());
-        return Ok(0);
-    }
-    let diff = diff_baseline(root, &baseline, config_path)?;
-    if json {
-        println!("{}", serde_json::to_string_pretty(&diff)?);
-    } else {
-        print_baseline_diff(&diff);
-    }
-    Ok(if diff.score_delta < 0 || !diff.added_rules.is_empty() {
-        1
-    } else {
-        0
-    })
-}
-
 fn watch_project(root: &Path, config_path: Option<&Path>, interval: u64) -> Result<()> {
     let mut last_snapshot = String::new();
     loop {
@@ -1666,15 +1570,12 @@ fn add_plugin(
         RaysenseConfig::default()
     };
     config.scan.plugins.retain(|plugin| plugin.name != name);
-    config
-        .scan
-        .plugins
-        .push(crate::LanguagePluginConfig {
-            name: name.to_string(),
-            extensions,
-            file_names,
-            ..crate::LanguagePluginConfig::default()
-        });
+    config.scan.plugins.push(crate::LanguagePluginConfig {
+        name: name.to_string(),
+        extensions,
+        file_names,
+        ..crate::LanguagePluginConfig::default()
+    });
     let toml = toml::to_string_pretty(&config).context("failed to encode config")?;
     fs::write(&path, toml).with_context(|| format!("failed to write {}", path.display()))?;
     println!("plugin {} {}", name, path.display());
@@ -2062,21 +1963,6 @@ fn show_trend(root: &Path, config_path: Option<&Path>, json: bool) -> Result<()>
     Ok(())
 }
 
-fn print_remediations(root: &Path, config_path: Option<&Path>, json: bool) -> Result<()> {
-    let config = config_for_root(root, config_path)?;
-    let report = scan_path_with_config(root, &config)?;
-    let health = compute_health_with_config(&report, &config);
-    if json {
-        println!("{}", serde_json::to_string_pretty(&health.remediations)?);
-    } else {
-        for item in health.remediations {
-            println!("{} {} - {}", item.code, item.path, item.action);
-            println!("  {}", item.command);
-        }
-    }
-    Ok(())
-}
-
 fn print_what_if(
     root: &Path,
     config_path: Option<&Path>,
@@ -2296,53 +2182,6 @@ fn parse_sort_spec(sort: &str, desc: bool) -> Result<(String, BaselineSortDirect
     Ok((column.to_string(), direction))
 }
 
-fn print_memory_summary(summary: &crate::memory::MemorySummary) {
-    println!(
-        "files rows={} cols={}",
-        summary.files.rows, summary.files.columns
-    );
-    println!(
-        "functions rows={} cols={}",
-        summary.functions.rows, summary.functions.columns
-    );
-    println!(
-        "entry_points rows={} cols={}",
-        summary.entry_points.rows, summary.entry_points.columns
-    );
-    println!(
-        "imports rows={} cols={}",
-        summary.imports.rows, summary.imports.columns
-    );
-    println!(
-        "calls rows={} cols={}",
-        summary.calls.rows, summary.calls.columns
-    );
-    println!(
-        "call_edges rows={} cols={}",
-        summary.call_edges.rows, summary.call_edges.columns
-    );
-    println!(
-        "health rows={} cols={}",
-        summary.health.rows, summary.health.columns
-    );
-    println!(
-        "hotspots rows={} cols={}",
-        summary.hotspots.rows, summary.hotspots.columns
-    );
-    println!(
-        "rules rows={} cols={}",
-        summary.rules.rows, summary.rules.columns
-    );
-    println!(
-        "module_edges rows={} cols={}",
-        summary.module_edges.rows, summary.module_edges.columns
-    );
-    println!(
-        "changed_files rows={} cols={}",
-        summary.changed_files.rows, summary.changed_files.columns
-    );
-}
-
 fn print_baseline_tables(tables: &[crate::memory::BaselineTableInfo]) {
     println!("name\trows\tcolumns");
     for table in tables {
@@ -2426,36 +2265,6 @@ fn print_baseline_diff(diff: &BaselineDiff) {
     }
 }
 
-fn print_summary(report: &crate::ScanReport, config: &RaysenseConfig) {
-    let health = compute_health_with_config(report, config);
-    println!("snapshot {}", report.snapshot.snapshot_id);
-    println!("root {}", report.snapshot.root.display());
-    println!("score {}", health.score);
-    println!("quality_signal {}", health.quality_signal);
-    println!("coverage_score {}", health.coverage_score);
-    println!("structural_score {}", health.structural_score);
-    println!("files {}", report.snapshot.file_count);
-    println!("functions {}", report.snapshot.function_count);
-    println!("calls {}", report.snapshot.call_count);
-    println!("call_edges {}", report.call_edges.len());
-    println!(
-        "entry_points total={} binaries={} examples={} tests={}",
-        report.entry_points.len(),
-        health.metrics.entry_points.binaries,
-        health.metrics.entry_points.examples,
-        health.metrics.entry_points.tests
-    );
-    println!("imports {}", report.snapshot.import_count);
-    println!("local_imports {}", health.resolution.local);
-    println!("external_imports {}", health.resolution.external);
-    println!("system_imports {}", health.resolution.system);
-    println!("unresolved_imports {}", health.resolution.unresolved);
-    println!("resolved_edges {}", report.graph.resolved_edge_count);
-    println!("cycles {}", report.graph.cycle_count);
-    println!("max_fan_in {}", report.graph.max_fan_in);
-    println!("max_fan_out {}", report.graph.max_fan_out);
-}
-
 fn print_health(report: &crate::ScanReport, health: &crate::HealthSummary) {
     println!("score {}", health.score);
     println!("quality_signal {}", health.quality_signal);
@@ -2694,41 +2503,6 @@ fn print_health(report: &crate::ScanReport, health: &crate::HealthSummary) {
     }
 }
 
-fn print_edges(report: &crate::ScanReport, all: bool) -> io::Result<()> {
-    let stdout = io::stdout();
-    let mut stdout = stdout.lock();
-
-    for import in &report.imports {
-        if !all && import.resolution != ImportResolution::Local {
-            continue;
-        }
-
-        let from = report
-            .files
-            .get(import.from_file)
-            .map(|file| file.path.to_string_lossy().into_owned())
-            .unwrap_or_else(|| format!("#{}", import.from_file));
-        let to = import
-            .resolved_file
-            .and_then(|file_id| report.files.get(file_id))
-            .map(|file| file.path.to_string_lossy().into_owned())
-            .unwrap_or_else(|| import.target.clone());
-
-        if let Err(err) = writeln!(
-            stdout,
-            "{:?} {} -> {} ({})",
-            import.resolution, from, to, import.kind
-        ) {
-            if err.kind() == io::ErrorKind::BrokenPipe {
-                return Ok(());
-            }
-            return Err(err);
-        }
-    }
-
-    Ok(())
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;

From 33a8c16c74ee688eb2d6f813a4d6af370e3b0840 Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Fri, 1 May 2026 12:43:35 +0200
Subject: [PATCH 4/5] style: cargo fmt after collapse

---
 src/mcp.rs | 33 ++++++++++++++-------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/src/mcp.rs b/src/mcp.rs
index a506023..9fc0f9f 100644
--- a/src/mcp.rs
+++ b/src/mcp.rs
@@ -21,15 +21,15 @@
  *   SOFTWARE.
  */
 
-use anyhow::{anyhow, Context, Result};
-use crate::{
-    build_baseline, compute_health_with_config, diff_baselines, is_foundation_file,
-    scan_path_with_config, ImportResolution, ProjectBaseline, RaysenseConfig,
-};
 use crate::memory::{
     BaselineFilterMode, BaselineFilterOp, BaselineSortDirection, BaselineTableFilter,
     BaselineTableQuery, BaselineTableSort,
 };
+use crate::{
+    build_baseline, compute_health_with_config, diff_baselines, is_foundation_file,
+    scan_path_with_config, ImportResolution, ProjectBaseline, RaysenseConfig,
+};
+use anyhow::{anyhow, Context, Result};
 use serde_json::{json, Value};
 use std::collections::{HashMap, HashSet, VecDeque};
 use std::fs;
@@ -987,15 +987,12 @@ fn plugin_add_tool(args: &Value) -> Result<Value> {
     let path = config_path_arg(args)?.unwrap_or_else(|| root.join(".raysense.toml"));
     let mut config = load_or_default_config(&path)?;
     config.scan.plugins.retain(|plugin| plugin.name != name);
-    config
-        .scan
-        .plugins
-        .push(crate::LanguagePluginConfig {
-            name: name.to_string(),
-            extensions,
-            file_names,
-            ..crate::LanguagePluginConfig::default()
-        });
+    config.scan.plugins.push(crate::LanguagePluginConfig {
+        name: name.to_string(),
+        extensions,
+        file_names,
+        ..crate::LanguagePluginConfig::default()
+    });
     write_config_path(&path, &config)?;
 
     Ok(json!({
@@ -1308,8 +1305,7 @@ fn break_cycle_recommendations_tool(args: &Value) -> Result<Value> {
         .map(|n| n as usize)
         .unwrap_or(500);
     let report = scan_path_with_config(&root, &config)?;
-    let recommendations =
-        crate::break_cycle_recommendations(&report, limit, max_candidates);
+    let recommendations = crate::break_cycle_recommendations(&report, limit, max_candidates);
     Ok(json!({
         "root": report.snapshot.root,
         "cycle_count_before": report.graph.cycle_count,
@@ -1331,9 +1327,8 @@ fn what_if_sequence_tool(actions: &[Value], root: &Path, config: &RaysenseConfig
     let before_health = compute_health_with_config(&before_report, config);
     let before = build_baseline(&before_report, &before_health);
 
-    let after_report =
-        crate::simulate::simulate_sequence(&before_report, config, &parsed_actions)
-            .map_err(|err| anyhow!(err.to_string()))?;
+    let after_report = crate::simulate::simulate_sequence(&before_report, config, &parsed_actions)
+        .map_err(|err| anyhow!(err.to_string()))?;
 
     let after_health = compute_health_with_config(&after_report, config);
     let after = build_baseline(&after_report, &after_health);

From 7fbf610306078364f826828c976a7a3365359deb Mon Sep 17 00:00:00 2001
From: Anton <singaraiona@gmail.com>
Date: Fri, 1 May 2026 12:50:24 +0200
Subject: [PATCH 5/5] style: cargo fmt build.rs link_external

---
 build.rs | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/build.rs b/build.rs
index 3ecc531..fb3183d 100644
--- a/build.rs
+++ b/build.rs
@@ -111,10 +111,7 @@ fn link_external(rayforce_dir: PathBuf) {
         );
     }
     println!("cargo:include={}", include_dir.display());
-    println!(
-        "cargo:rustc-link-search=native={}",
-        rayforce_dir.display()
-    );
+    println!("cargo:rustc-link-search=native={}", rayforce_dir.display());
     println!("cargo:rustc-link-lib=static=rayforce");
     println!("cargo:rerun-if-changed={}", lib_path.display());
     println!(