diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2adecef42..588455f06 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -65,6 +65,7 @@ jobs: ../configure --tooldir=$TOOLDIR ci/toolchain_install.sh --all ci/sst_install.sh + ci/gem5_install.sh - name: Setup Third Party if: steps.cache-thirdparty.outputs.cache-hit != 'true' @@ -78,6 +79,11 @@ jobs: echo "SST_CORE_HOME=$PWD/tools/sst-install/sst-core" >> $GITHUB_ENV echo "SST_ELEMENTS_HOME=$PWD/tools/sst-install/sst-elements" >> $GITHUB_ENV + - name: Export gem5 paths + run: | + echo "GEM5_HOME=$PWD/tools/gem5" >> $GITHUB_ENV + echo "$PWD/tools/gem5/build/X86" >> $GITHUB_PATH + build: needs: setup strategy: @@ -137,15 +143,23 @@ jobs: matrix: os: [ubuntu-24.04] # dxa + tensor_wg disabled: features not yet complete (see regression{32,64}_failures.md) - name: [regression, amo, mpi, dtm, opencl, cache, config1, config2, debug, scope, stress, synthesis, vm, rvc, cupbop, hip, tensor, tensor_sp, tensor_mx] + name: [regression, amo, mpi, dtm, opencl, cache, config1, config2, debug, scope, stress, synthesis, vm, rvc, cupbop, hip, tensor, tensor_sp, tensor_mx, gem5] xlen: [32, 64] # chipStar's hipcc emits Physical64 SPIR-V; POCL refuses it on # rv32 Vortex (CL_INVALID_OPERATION). hip is rv64-only until # either chipStar grows --offload=spirv32 or the native # HIPVortex toolchain lands (see hip_support_proposal.md). + # + # gem5 only runs against the rv32 build; the device library + # is XLEN-locked by the gem5 install (build/X86/gem5.opt + # links against the libvortex-gem5.so the runner builds, and + # we only build it once). XLEN=64 entry would just duplicate + # the run against an identical setup. 
exclude: - name: hip xlen: 32 + - name: gem5 + xlen: 64 runs-on: ${{ matrix.os }} timeout-minutes: 120 @@ -190,6 +204,11 @@ jobs: echo "SST_CORE_HOME=$PWD/tools/sst-install/sst-core" >> $GITHUB_ENV echo "SST_ELEMENTS_HOME=$PWD/tools/sst-install/sst-elements" >> $GITHUB_ENV + - name: Export gem5 paths + run: | + echo "GEM5_HOME=$PWD/tools/gem5" >> $GITHUB_ENV + echo "$PWD/tools/gem5/build/X86" >> $GITHUB_PATH + - name: Run tests run: | cd build${{ matrix.xlen }} diff --git a/VERSION b/VERSION index af5ac4633..590f872b1 100644 --- a/VERSION +++ b/VERSION @@ -1,2 +1,3 @@ VORTEX_VERSION=3.0 TOOLCHAIN_REV=v3.0 +GEM5_REV=v25.0.0.1 diff --git a/ci/gem5_install.sh.in b/ci/gem5_install.sh.in new file mode 100644 index 000000000..0eb610e40 --- /dev/null +++ b/ci/gem5_install.sh.in @@ -0,0 +1,119 @@ +#!/bin/bash + +# Copyright © 2019-2023 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# gem5 install for the Vortex integration — see +# docs/proposals/gem5_v2_cp_migration_proposal.md for the design, +# docs/gem5_integration.md for the operator manual. +# +# Fetches a pinned gem5 release, installs build deps + the AArch64 +# cross-toolchain (used by the ARM regression matrix), and builds +# gem5.opt for the selected ISA targets (X86 + ARM by default). The +# Vortex SimObject is installed separately via +# `sim/simx/gem5/install.sh` — that step has to re-run whenever the +# SimObject sources change, but it does NOT need a fresh gem5 clone. 
+#
+# Idempotent: re-running with the same GEM5_REV is a no-op once
+# $GEM5_HOME/build/<target>/gem5.opt exists for every selected target.
+
+# exit when any command fails
+set -e
+
+GEM5_REV=${GEM5_REV:=@GEM5_REV@}
+TOOLDIR=${TOOLDIR:=@TOOLDIR@}
+GEM5_HOME=$TOOLDIR/gem5
+GEM5_REPO=https://github.com/gem5/gem5.git
+
+# Build deps. gem5 documents these at https://www.gem5.org/documentation/general_docs/building
+# The AArch64 cross-toolchain (gcc/g++-aarch64-linux-gnu) serves the ARM
+# regression matrix (libvortex-gem5-aarch64.so, vecadd-/sgemm-aarch64).
+# DEBIAN_FRONTEND is passed through sudo: a variable set in front of
+# sudo is normally dropped by its env_reset policy.
+sudo DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    scons \
+    python3 python3-dev python3-pip python3-venv \
+    libprotobuf-dev protobuf-compiler libprotoc-dev \
+    libgoogle-perftools-dev \
+    m4 \
+    libboost-all-dev \
+    libhdf5-serial-dev \
+    libpng-dev \
+    pkg-config \
+    gcc-aarch64-linux-gnu g++-aarch64-linux-gnu \
+    build-essential git wget
+
+mkdir -p "$TOOLDIR"
+
+# Fetch (or update) gem5 working tree at the pinned revision.
+if [ -d "$GEM5_HOME/.git" ]; then
+    echo "gem5 working tree exists at $GEM5_HOME"
+    pushd "$GEM5_HOME" > /dev/null
+    current_rev=$(git describe --tags --always 2>/dev/null || echo "unknown")
+    if [ "$current_rev" != "$GEM5_REV" ]; then
+        echo "checked-out rev $current_rev != pinned $GEM5_REV; refetching"
+        git fetch --depth=1 origin "tag" "$GEM5_REV"
+        git checkout "$GEM5_REV"
+        # Stale binaries from the old rev must not satisfy the check below.
+        rm -f build/*/gem5.opt
+    fi
+    popd > /dev/null
+else
+    echo "cloning gem5 $GEM5_REV into $GEM5_HOME"
+    git clone --depth=1 --branch "$GEM5_REV" "$GEM5_REPO" "$GEM5_HOME"
+fi
+
+# Parallelism: -j$(nproc) on the self-hosted runner; cap at 4 on hosted
+# runners to avoid OOM (gem5 link uses ~4 GB peak).
+JOBS=$(nproc)
+if [ -n "$GITHUB_ACTIONS" ] && [ -z "$VORTEX_SELF_HOSTED" ]; then
+    JOBS=4
+fi
+
+# Targets: X86 (native host ISA, no cross-compile) and ARM (cross-arch
+# regression matrix). Override with GEM5_TARGETS="X86", "ARM" or "X86 ARM";
+# tests pick one via GEM5_BIN=$GEM5_HOME/build/{X86,ARM}/gem5.opt.
+GEM5_TARGETS=${GEM5_TARGETS:-"X86 ARM"}
+
+cd "$GEM5_HOME"
+for target in $GEM5_TARGETS; do
+    if [ ! -x "$GEM5_HOME/build/$target/gem5.opt" ]; then
+        echo "building gem5.opt ($target) with -j$JOBS"
+        scons "build/$target/gem5.opt" -j"$JOBS"
+    else
+        echo "gem5.opt ($target) already built at $GEM5_HOME/build/$target/gem5.opt"
+    fi
+done
+
+# Persist GEM5_HOME for subsequent shells (idempotent).
+if ! grep -q "^export GEM5_HOME=" ~/.bashrc 2>/dev/null; then
+    echo "export GEM5_HOME=$GEM5_HOME" >> ~/.bashrc
+fi
+export GEM5_HOME
+
+# GitHub Actions: propagate to subsequent steps.
+if [ -n "$GITHUB_ENV" ]; then
+    echo "GEM5_HOME=$GEM5_HOME" >> "$GITHUB_ENV"
+fi
+if [ -n "$GITHUB_PATH" ]; then
+    for target in $GEM5_TARGETS; do
+        echo "$GEM5_HOME/build/$target" >> "$GITHUB_PATH"
+    done
+fi
+
+echo ""
+echo "gem5 $GEM5_REV installed at $GEM5_HOME"
+for target in $GEM5_TARGETS; do
+    echo "  binary: $GEM5_HOME/build/$target/gem5.opt"
+done
+echo "  GEM5_HOME exported (re-source ~/.bashrc to pick up in new shells)"
diff --git a/ci/gem5_run_app.py b/ci/gem5_run_app.py
new file mode 100644
index 000000000..9eb008e1b
--- /dev/null
+++ b/ci/gem5_run_app.py
@@ -0,0 +1,260 @@
+# Copyright © 2019-2023
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# End-to-end gem5 integration test for vortex.VortexGPGPU.
+#
+# Generic application runner — any Vortex regression test that
+# follows the standard shape (host binary + kernel.vxbin in the same
+# directory, links against libvortex.so) can run here.
+#
+# Wires (gem5_v2_cp_migration_proposal §3):
+#   - SE-mode CPU(s) running an unmodified Vortex regression test
+#     (same binary the SimX backend uses).
+#   - VortexGPGPU device on the system membus, claiming two ranges:
+#     CP regfile at PIO_BASE (32-bit MMIO) and BAR-mapped VRAM at
+#     PIN_BASE (host memcpy lands in in-process simx::RAM).
+#   - Identity-mapped via Process.map() — the same mechanism gem5's
+#     AMD GPU integration uses at apu_se.py:1055.
+#
+# The simulated process loads libvortex.so (the upstream dispatcher),
+# which dlopens libvortex-gem5-x86_64.so based on VORTEX_DRIVER. The
+# dispatcher's CP submission path then:
+#   1. mem_alloc + mem_upload → ring buffer / head / cmpl slots in VRAM
+#   2. cp_mmio_write(Q_*, ...) → program CP regfile, enable Q0 + CP
+#   3. vx_enqueue_launch / vx_enqueue_write / etc. → CMD_* descriptors
+#      written into the ring (mem_upload), Q_TAIL_HI doorbell (cp_mmio_write),
+#      Q_SEQNUM polled to wait (cp_mmio_read).
+# The host runtime is a thin platform shim — no per-command logic.
+#
+# Configurable via env vars:
+#   VORTEX_GEM5_DEV_LIB     — path to sim/simx/libvortex-gem5.so
+#                             (device-side; dlopened by the gem5 SimObject)
+#   VORTEX_GEM5_HOST_RT_DIR — directory containing libvortex.so (the stub)
+#                             AND libvortex-gem5-x86_64.so (the host
+#                             runtime backend). Both are added to the
+#                             simulated process's LD_LIBRARY_PATH.
+#   VORTEX_TEST_DIR         — directory containing the test binary +
+#                             kernel.vxbin
+#   VORTEX_TEST_BIN         — name of the test binary inside that dir
+#                             (default: vecadd)
+#   VORTEX_TEST_ARGS        — args passed to the binary (default: -n16)
+#   VORTEX_DRIVER           — backend selector for the stub library
+#                             (default: gem5-x86_64; use gem5-aarch64
+#                             when running the ARM matrix)
+
+import os
+import shlex
+
+import m5
+from m5.objects import (
+    AddrRange,
+    DDR3_1600_8x8,
+    MemCtrl,
+    Process,
+    RedirectPath,
+    Root,
+    SEWorkload,
+    SrcClockDomain,
+    System,
+    SystemXBar,
+    AtomicSimpleCPU,
+    VoltageDomain,
+    VortexGPGPU,
+)
+
+DEV_LIB     = os.environ.get("VORTEX_GEM5_DEV_LIB")
+HOST_RT_DIR = os.environ.get("VORTEX_GEM5_HOST_RT_DIR")
+TEST_DIR    = os.environ.get("VORTEX_TEST_DIR")
+TEST_BIN    = os.environ.get("VORTEX_TEST_BIN", "vecadd")
+TEST_ARGS   = os.environ.get("VORTEX_TEST_ARGS", "-n16")
+DRIVER      = os.environ.get("VORTEX_DRIVER", "gem5-x86_64")
+
+# Number of CPU thread contexts. The upstream dispatcher spawns a
+# per-Queue worker thread (commit 157e7a1) and the legacy_runtime
+# helpers may spawn additional internal threads. Each thread needs a
+# free HW context — we provision 4 (one main + headroom). Each is a
+# separate AtomicSimpleCPU instance per the gem5 SE-mode pthread
+# pattern (deprecated/example/se.py:188-189): clone() in
+# syscall_emul finds the next idle context across all CPUs.
+NUM_CPUS = 4
+
+for name, val in [
+    ("VORTEX_GEM5_DEV_LIB", DEV_LIB),
+    ("VORTEX_GEM5_HOST_RT_DIR", HOST_RT_DIR),
+    ("VORTEX_TEST_DIR", TEST_DIR),
+]:
+    if not val:
+        raise RuntimeError(f"{name} env var is required")
+
+APP_BIN = f"{TEST_DIR}/{TEST_BIN}"
+
+# Fixed mappings used by the gem5 host runtime (see
+# sw/runtime/gem5/driver.h). The Python config and the C runtime
+# share these constants by convention; if you change one, change
+# both.
+PIO_BASE = 0x20000000
+PIO_SIZE = 0x0200        # CP regfile (0x40 globals + 4 × 0x40 queues + pad)
+PIN_BASE = 0x100000000   # BAR-mapped VRAM, above 4 GiB to clear the
+                         # simulated process's natural low-VA layout
+PIN_SIZE = 0x100000000   # 4 GB — full XLEN=32 device address space
+
+# ---------------------------------------------------------------------------
+# System construction
+# ---------------------------------------------------------------------------
+system = System()
+system.clk_domain = SrcClockDomain(clock="3GHz",
+                                   voltage_domain=VoltageDomain())
+system.mem_mode = "atomic"
+system.mem_ranges = [AddrRange(0, PIO_BASE)]  # advisory; routing is by
+                                              # per-SimObject ranges (DRAM
+                                              # owns [0, PIO_BASE) = 512 MiB)
+
+# Cross-arch interp + runtime library redirection.
+# Two separate gem5 mechanisms are at play:
+#   (1) `setInterpDir(prefix)` prepends `prefix` to PT_INTERP when
+#       gem5 loads the dynamic linker (e.g. /lib/ld-linux-aarch64.so.1
+#       → /usr/aarch64-linux-gnu/lib/ld-linux-aarch64.so.1). The
+#       linker is opened directly by gem5's loader, NOT via SE-mode
+#       syscall, so RedirectPath doesn't help here.
+#   (2) `system.redirect_paths` redirects open()/stat()/etc syscalls
+#       the GUEST process makes — used when the dynamic linker
+#       later looks up libc.so.6, libstdc++.so.6, libvortex.so, etc.
+# Both are no-ops for native x86.
+if DRIVER == "gem5-aarch64":
+    from m5.core import setInterpDir
+    setInterpDir("/usr/aarch64-linux-gnu")
+    system.redirect_paths = [
+        RedirectPath(app_path="/lib/aarch64-linux-gnu",
+                     host_paths=["/usr/aarch64-linux-gnu/lib"]),
+        RedirectPath(app_path="/usr/lib/aarch64-linux-gnu",
+                     host_paths=["/usr/aarch64-linux-gnu/lib"]),
+    ]
+
+# Membus connects CPU ↔ memory ↔ VortexGPGPU.
+system.membus = SystemXBar()
+system.system_port = system.membus.cpu_side_ports
+
+# CPUs. Atomic — the cycle counts inside the Vortex device are
+# driven by the device's own clock; timing CPU adds wall time without
+# changing the kernel result. We provision NUM_CPUS instances so the
+# dispatcher's per-Queue worker thread (commit 157e7a1) and any
+# transient helper threads have free HW contexts to clone() into.
+system.cpu = [AtomicSimpleCPU(cpu_id=i) for i in range(NUM_CPUS)]
+system.multi_thread = True
+for cpu in system.cpu:
+    cpu.createInterruptController()
+    cpu.icache_port = system.membus.cpu_side_ports
+    cpu.dcache_port = system.membus.cpu_side_ports
+    # X86's InterruptController has explicit pio/int_requestor/
+    # int_responder ports that must be wired to the membus (per
+    # learning_gem5/part1/two_level.py:111-114). ARM's interrupt model
+    # doesn't expose these — skip on ARM. Tested via the DRIVER env
+    # var (the same one that selects the simulated host ISA).
+    if DRIVER == "gem5-x86_64":
+        cpu.interrupts[0].pio = system.membus.mem_side_ports
+        cpu.interrupts[0].int_requestor = system.membus.cpu_side_ports
+        cpu.interrupts[0].int_responder = system.membus.mem_side_ports
+
+# Memory controller. DRAM serves the simulated process's normal
+# low-VA address space ([0, 512 MiB) is plenty for ELF code + heap +
+# stack of any in-tree regression test). The VortexGPGPU device owns
+# disjoint ranges starting at the end of DRAM:
+#   - [PIO_BASE, PIO_BASE+PIO_SIZE) — CP regfile (32-bit MMIO)
+#   - [PIN_BASE, PIN_BASE+PIN_SIZE) — BAR-mapped VRAM; host CPU
+#     writes land in the same bytes the CP and Vortex see via in-process
+#     simx::RAM (gem5_v2_cp_migration §2.2 single data plane).
+# Placing PIN_BASE above 4 GiB keeps it well clear of both the DRAM
+# range and the simulated process's natural VA layout.
+system.mem_ctrl = MemCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = AddrRange(0, PIO_BASE)
+system.mem_ctrl.port = system.membus.mem_side_ports
+
+# The Vortex device. The `library` parameter points at the
+# device-side libvortex-gem5.so (no arch suffix; gem5 itself is
+# always x86-host). The host-side runtime is loaded separately by
+# the simulated process via VORTEX_DRIVER below.
+system.vortex = VortexGPGPU(
+    library = DEV_LIB,
+    kernel  = "",   # NO preload — the host binary uploads the kernel
+                    # via the dispatcher's CP submission path, the way
+                    # a real accelerator runtime works.
+)
+system.vortex.pio_addr = PIO_BASE
+system.vortex.pio_size = PIO_SIZE
+system.vortex.pin_addr = PIN_BASE
+system.vortex.pin_size = PIN_SIZE
+system.vortex.pio = system.membus.mem_side_ports
+system.vortex.dma = system.membus.cpu_side_ports
+
+# ---------------------------------------------------------------------------
+# Workload (the host test binary)
+# ---------------------------------------------------------------------------
+argv = [APP_BIN] + shlex.split(TEST_ARGS)
+process = Process(
+    pid=100,
+    cwd=TEST_DIR,
+    cmd=argv,
+    executable=argv[0],
+    env=[
+        # Tells the stub to dlopen our backend
+        # (libvortex.so does dlopen("libvortex-${VORTEX_DRIVER}.so")).
+        f"VORTEX_DRIVER={DRIVER}",
+        # Library search path inside the simulated process. Must
+        # contain libvortex.so AND libvortex-gem5-$ARCH.so (both
+        # are in HOST_RT_DIR by construction).
+        f"LD_LIBRARY_PATH={HOST_RT_DIR}",
+    ],
+)
+
+system.workload = SEWorkload.init_compatible(APP_BIN)
+# gem5 SE-mode requires each CPU to have an assigned workload; the
+# secondary CPUs are halted at boot and wake when clone() finds them
+# (deprecated/example/se.py:294). Assign the same Process to all
+# four CPUs — only CPU[0] starts running; the rest sit idle until
+# pthread spawn.
+for cpu in system.cpu:
+    cpu.workload = process
+    cpu.createThreads()
+
+# ---------------------------------------------------------------------------
+# Run
+# ---------------------------------------------------------------------------
+root = Root(full_system=False, system=system)
+m5.instantiate()
+
+# Identity-map both device-owned ranges into the simulated process's
+# address space. Must happen AFTER m5.instantiate(). Mirrors
+# apu_se.py:1055 (gem5's AMD GPU pattern). The CPU's userspace then
+# touches PIO_BASE / PIN_BASE as ordinary memory; the membus routes
+# both ranges to the VortexGPGPU SimObject (PIN range = BAR-mapped
+# VRAM, PIO range = CP regfile).
+#
+# cacheable=False on PIN ensures host stores to VRAM are immediately
+# visible to the CP — otherwise a cache line could hold the new ring
+# entry while Q_TAIL_HI is observed by the device.
+system.cpu[0].workload[0].map(PIO_BASE, PIO_BASE, PIO_SIZE, cacheable=False)
+system.cpu[0].workload[0].map(PIN_BASE, PIN_BASE, PIN_SIZE, cacheable=False)
+
+print(f"E2E: app={APP_BIN} {TEST_ARGS}")
+print(f"E2E: VortexGPGPU.library={DEV_LIB}")
+print(f"E2E: VORTEX_DRIVER={DRIVER}")
+print(f"E2E: LD_LIBRARY_PATH={HOST_RT_DIR}")
+print(f"E2E: PIO @0x{PIO_BASE:x}+0x{PIO_SIZE:x}, PIN @0x{PIN_BASE:x}+0x{PIN_SIZE:x}")
+print("E2E: starting simulation...")
+
+exit_event = m5.simulate()
+print(f"E2E: exit_event.cause = {exit_event.getCause()!r}")
+print(f"E2E: tick = {m5.curTick()}")
+# Hand the simulated app's exit status to gem5 so CI sees test failures.
+raise SystemExit(exit_event.getCode())
diff --git a/ci/gem5_run_hostless_app.py b/ci/gem5_run_hostless_app.py
new file mode 100644
index 000000000..65c92602c
--- /dev/null
+++ b/ci/gem5_run_hostless_app.py
@@ -0,0 +1,108 @@
+# Copyright © 2019-2023
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Hostless gem5 integration test for vortex.VortexGPGPU. +# +# The SimObject loads a .vxbin kernel directly via its `kernel=` +# parameter and runs it via its internal vortexTickEvent_ chain — no +# host CPU, no Command Processor, no PIO/DMA. Smoke-tests the +# gem5↔libvortex-gem5.so wiring: dlopen succeeds, SimObject +# constructs, Processor::cycle() drives from the gem5 event loop, sim +# exits cleanly. +# +# Hosted counterpart: [gem5_run_app.py](gem5_run_app.py) wires up the +# host CPU + CP regfile + BAR-mapped VRAM on top. +# +# Configurable via env vars (parallel to gem5_run_app.py): +# VORTEX_GEM5_DEV_LIB — path to libvortex-gem5.so (no default) +# VORTEX_TEST_DIR — directory containing the kernel .vxbin +# VORTEX_TEST_KERNEL — kernel filename inside that dir +# (default: kernel.vxbin, matching the +# regression-test convention) +# +# Run from the Vortex build dir as: +# VORTEX_GEM5_DEV_LIB=$PWD/sim/simx/libvortex-gem5.so \ +# VORTEX_TEST_DIR=$PWD/tests/kernel/hello \ +# VORTEX_TEST_KERNEL=hello.vxbin \ +# $GEM5_HOME/build/X86/gem5.opt ci/gem5_run_hostless_app.py + +import os +import m5 +from m5.objects import ( + AddrRange, + DDR3_1600_8x8, + MemCtrl, + Root, + SrcClockDomain, + System, + SystemXBar, + VoltageDomain, + VortexGPGPU, +) + +DEV_LIB = os.environ.get("VORTEX_GEM5_DEV_LIB") +TEST_DIR = os.environ.get("VORTEX_TEST_DIR") +TEST_KERNEL = os.environ.get("VORTEX_TEST_KERNEL", "kernel.vxbin") + +for name, val in [("VORTEX_GEM5_DEV_LIB", DEV_LIB), + ("VORTEX_TEST_DIR", TEST_DIR)]: + if not val: + raise RuntimeError(f"{name} env var is required") 
+ +KERNEL = f"{TEST_DIR}/{TEST_KERNEL}" + +# Minimal system: just enough to hang the VortexGPGPU off a membus +# so gem5 considers it a properly-wired SimObject. No CPU in this +# test — the kernel runs entirely inside the SimObject's internal +# vortexTickEvent_ chain. +system = System() +system.clk_domain = SrcClockDomain(clock="1GHz", + voltage_domain=VoltageDomain()) +system.mem_mode = "atomic" +system.mem_ranges = [AddrRange("512MiB")] + +# Membus + a small backing memory so PIO ranges have somewhere to bind. +system.membus = SystemXBar() + +# Memory controller (unused at runtime in hostless mode but required +# for the system to instantiate cleanly). +system.mem_ctrl = MemCtrl() +system.mem_ctrl.dram = DDR3_1600_8x8() +system.mem_ctrl.dram.range = system.mem_ranges[0] +system.mem_ctrl.port = system.membus.mem_side_ports + +# The Vortex device. It inherits clock from the system clock domain +# (set above to 1GHz) via ClockedObject; no explicit `clock=` param. +system.vortex = VortexGPGPU( + library = DEV_LIB, + kernel = KERNEL, + # Explicitly disable the BAR-mapped VRAM range — the hostless + # path loads the kernel via the device library's load_kernel() + # entry, never via host memcpy through PIN. Leaving it enabled + # here would conflict with this test's DRAM range. + pin_size = 0, +) +system.vortex.pio = system.membus.mem_side_ports +system.vortex.dma = system.membus.cpu_side_ports + +# Root wires the system into the simulator. 
+root = Root(full_system=False, system=system) +m5.instantiate() + +print(f"Hostless: VortexGPGPU.library={DEV_LIB}") +print(f"Hostless: kernel={KERNEL}") +print("Hostless: running until VortexGPGPU exits the sim loop...") + +exit_event = m5.simulate() +print(f"Hostless: exit_event.cause = {exit_event.getCause()!r}") +print(f"Hostless: tick = {m5.curTick()}") diff --git a/ci/regression.sh.in b/ci/regression.sh.in index a24aba709..5d1ddf82a 100755 --- a/ci/regression.sh.in +++ b/ci/regression.sh.in @@ -95,14 +95,145 @@ sst() cp sim/simx/libvortex.so $SST_ELEMENTS_HOME/lib/sst-elements-library/ # alternatively - $ sst --add-lib-path `pwd` myConfig.py - sst ci/sst_test_vortex_hello.py - sst ci/sst_test_vortex_fibonacci.py - sst ci/sst_test_vortex_vecadd.py - sst ci/sst_test_vortex_conform.py + BUILD_DIR=$(pwd) + + # Hostless SST runner (ci/sst_run_hostless_app.py) parameterized + # by VORTEX_TEST_DIR + VORTEX_TEST_KERNEL — same shape as + # ci/gem5_run_hostless_app.py. SST is hostless-only today (no + # CPU component wired to Vortex); the ci/sst_run_app.py name + # slot is reserved for a future host-CPU SST integration. + for spec in "hello:hello.vxbin" "fibonacci:fibonacci.vxbin" \ + "vecadd:vecadd.vxbin" "conform:conform.vxbin"; do + kern="${spec%%:*}" + vxbin="${spec#*:}" + echo "=== sst: $kern ===" + VORTEX_TEST_DIR=$BUILD_DIR/tests/kernel/$kern \ + VORTEX_TEST_KERNEL=$vxbin \ + sst ci/sst_run_hostless_app.py + done echo "sst tests done!" } +# gem5 integration tests — see docs/proposals/gem5_v2_cp_migration_proposal.md +# for the v2 CP-first design and docs/gem5_integration.md for the operator +# manual. Two layers: +# +# 1. Phase 3 standalone: kernel preloaded via the SimObject's +# `kernel=` Python param; runs entirely inside the gem5 event +# loop, no host CPU needed. Fast smoke test for the device +# library wiring. +# +# 2. 
Phase 5 e2e: an x86 (or aarch64) SE-mode workload drives the +# device via the CP regfile (cp_mmio_*) + BAR-mapped VRAM +# (mem_upload through identity-mapped PIN region). The +# dispatcher's CP submission path handles all command building; +# the host runtime is a thin platform shim. +# +# ARM matrix is opt-in via VORTEX_GEM5_ARM=1 (needs gcc-aarch64-linux-gnu +# installed; not part of the default hosted-runner image). +gem5() +{ + echo "begin gem5 tests..." + + if [ -z "$GEM5_HOME" ]; then + GEM5_HOME=$HOME/tools/gem5 + fi + if [ ! -x "$GEM5_HOME/build/X86/gem5.opt" ]; then + echo "error: $GEM5_HOME/build/X86/gem5.opt not found — run ci/gem5_install.sh first" + exit 1 + fi + + # Build prerequisites. The host runtime is gated on HOST_ARCH; + # default x86 needs no cross-toolchain. + make -C sim/simx USE_GEM5=1 + make -C sw/runtime/stub + make -C sw/runtime/gem5 HOST_ARCH=x86_64 + make -C sw/kernel + make -C tests/kernel/hello + make -C tests/regression/vecadd + make -C tests/regression/sgemm + + BUILD_DIR=$(pwd) + LIB_GEM5_DEV=$BUILD_DIR/sim/simx/libvortex-gem5.so + HOST_RT_DIR=$BUILD_DIR/sw/runtime + + # Hostless smoke — no host CPU, kernel preloaded via SimObject param. + # env-vars MUST precede the binary (gem5.opt would otherwise treat + # them as positional args). + echo "=== gem5 hostless: hello ===" + VORTEX_GEM5_DEV_LIB=$LIB_GEM5_DEV \ + VORTEX_TEST_DIR=$BUILD_DIR/tests/kernel/hello \ + VORTEX_TEST_KERNEL=hello.vxbin \ + timeout 120 $GEM5_HOME/build/X86/gem5.opt \ + ci/gem5_run_hostless_app.py + + # E2E — CP-driven path through the host runtime. Generic runner + # (ci/gem5_run_app.py) parameterized by VORTEX_TEST_BIN + + # VORTEX_TEST_ARGS. Sizes fit the 120s per-test budget + # (feedback_test_timeout_120s): + # - vecadd -n16 small vector add + # - sgemm -n4 4x4 matrix multiply + # Larger sizes overrun because the simulated host CPU's CP poll + # loop burns gem5 wall time proportional to kernel runtime. 
+ # Run on local dev box for larger sizes by overriding VORTEX_TEST_ARGS. + for spec in "vecadd:-n16" "sgemm:-n4"; do + app="${spec%%:*}" + args="${spec#*:}" + echo "=== gem5 e2e: $app $args ===" + VORTEX_GEM5_DEV_LIB=$LIB_GEM5_DEV \ + VORTEX_GEM5_HOST_RT_DIR=$HOST_RT_DIR \ + VORTEX_TEST_DIR=$BUILD_DIR/tests/regression/$app \ + VORTEX_TEST_BIN=$app \ + VORTEX_TEST_ARGS=$args \ + timeout 120 $GEM5_HOME/build/X86/gem5.opt \ + ci/gem5_run_app.py + done + + # ARM matrix (opt-in). The device library (libvortex-gem5.so) is + # always x86 — gem5.opt is an x86 binary regardless of which + # simulated ISA it models. Only the simulated host's ISA changes. + if [ -n "$VORTEX_GEM5_ARM" ]; then + if [ ! -x "$GEM5_HOME/build/ARM/gem5.opt" ]; then + echo "error: $GEM5_HOME/build/ARM/gem5.opt not found" + exit 1 + fi + + # Cross-compile the host runtime, stub, and test binaries for + # aarch64. All outputs land in $arch/ subdirs alongside the + # native x86 builds so they coexist cleanly. + make -C sw/runtime/stub HOST_ARCH=aarch64 + make -C sw/runtime/gem5 HOST_ARCH=aarch64 + make -C tests/regression/vecadd HOST_ARCH=aarch64 + make -C tests/regression/sgemm HOST_ARCH=aarch64 + + ARM_HOST_RT_DIR=$BUILD_DIR/sw/runtime/aarch64 + + echo "=== gem5 ARM hostless: hello ===" + VORTEX_GEM5_DEV_LIB=$LIB_GEM5_DEV \ + VORTEX_TEST_DIR=$BUILD_DIR/tests/kernel/hello \ + VORTEX_TEST_KERNEL=hello.vxbin \ + timeout 120 $GEM5_HOME/build/ARM/gem5.opt \ + ci/gem5_run_hostless_app.py + + for spec in "vecadd:-n16" "sgemm:-n4"; do + app="${spec%%:*}" + args="${spec#*:}" + echo "=== gem5 ARM e2e: $app $args ===" + VORTEX_GEM5_DEV_LIB=$LIB_GEM5_DEV \ + VORTEX_GEM5_HOST_RT_DIR=$ARM_HOST_RT_DIR \ + VORTEX_TEST_DIR=$BUILD_DIR/tests/regression/$app \ + VORTEX_TEST_BIN=$app-aarch64 \ + VORTEX_TEST_ARGS=$args \ + VORTEX_DRIVER=gem5-aarch64 \ + timeout 120 $GEM5_HOME/build/ARM/gem5.opt \ + ci/gem5_run_app.py + done + fi + + echo "gem5 tests done!" +} + mpi() { echo "begin mpi tests..." 
@@ -1047,7 +1178,7 @@ hip() show_usage() { echo "Vortex Regression Test" - echo "Usage: $0 [--clean] [--unittest] [--riscv] [--kernel] [--regression] [--amo] [--dxa] [--opencl] [--cache] [--vm] [--rvc] [--config1] [--config2] [--debug] [--scope] [--stress] [--synthesis] [--vector] [--graphics] [--tensor] [--tensor_sp] [--tensor_mx] [--tensor_wg] [--cupbop] [--hip] [--all] [--h|--help]" + echo "Usage: $0 [--clean] [--unittest] [--riscv] [--kernel] [--regression] [--amo] [--dxa] [--opencl] [--cache] [--vm] [--rvc] [--config1] [--config2] [--debug] [--scope] [--stress] [--synthesis] [--vector] [--graphics] [--tensor] [--tensor_sp] [--tensor_mx] [--tensor_wg] [--cupbop] [--hip] [--sst] [--gem5] [--dtm] [--mpi] [--all] [--h|--help]" } declare -a tests=() @@ -1139,6 +1270,9 @@ while [ "$1" != "" ]; do --sst ) tests+=("sst") ;; + --gem5 ) + tests+=("gem5") + ;; --dtm ) tests+=("dtm") ;; diff --git a/ci/sst_run_hostless_app.py b/ci/sst_run_hostless_app.py new file mode 100644 index 000000000..3f8618808 --- /dev/null +++ b/ci/sst_run_hostless_app.py @@ -0,0 +1,53 @@ +# Copyright © 2019-2023 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Hostless SST runner: instantiate a single vortex.VortexGPGPU +# component and run the given kernel. SST runs Vortex co-resident in +# one process, primes the KMU DCRs directly via proc_->dcr_write +# inside sim/simx/sst/vortex_simulator.cpp, and ticks the simulation +# to completion. No host CPU, no CP, no PIO/DMA. 
+# +# Hostless is the only mode the SST integration currently supports: +# there is no SST CPU component (e.g. Ariel/Vanadis) wired to a +# Vortex regression test binary today. A future ci/sst_run_app.py +# could add that path; the name slot is reserved. +# +# For memHierarchy timing modeling, the VortexGPGPU component exposes +# an optional `memIface` SubComponent slot — see +# docs/proposals/sst_simx_v3_proposal.md for the wiring recipe. +# +# Configurable via env vars (parallel to ci/gem5_run_hostless_app.py): +# VORTEX_TEST_DIR — directory containing the kernel .vxbin +# VORTEX_TEST_KERNEL — kernel filename inside that dir +# (default: kernel.vxbin, matching the +# regression-test convention) +# +# Run via: +# VORTEX_TEST_DIR=tests/kernel/hello VORTEX_TEST_KERNEL=hello.vxbin \ +# sst ci/sst_run_hostless_app.py + +import os +import sst + +TEST_DIR = os.environ.get("VORTEX_TEST_DIR") +TEST_KERNEL = os.environ.get("VORTEX_TEST_KERNEL", "kernel.vxbin") +if not TEST_DIR: + raise RuntimeError("VORTEX_TEST_DIR env var is required") + +PROGRAM = f"{TEST_DIR}/{TEST_KERNEL}" + +gpu = sst.Component("gpu0", "vortex.VortexGPGPU") +gpu.addParams({ + "clock": "1GHz", + "program": PROGRAM, +}) diff --git a/ci/sst_test_vortex_conform.py b/ci/sst_test_vortex_conform.py deleted file mode 100644 index 25681dc6d..000000000 --- a/ci/sst_test_vortex_conform.py +++ /dev/null @@ -1,7 +0,0 @@ -import sst - -gpu = sst.Component("gpu0", "vortex.VortexGPGPU") -gpu.addParams({ - "clock": "1GHz", - "program": "tests/kernel/conform/conform.vxbin" -}) diff --git a/ci/sst_test_vortex_fibonacci.py b/ci/sst_test_vortex_fibonacci.py deleted file mode 100644 index b174543db..000000000 --- a/ci/sst_test_vortex_fibonacci.py +++ /dev/null @@ -1,7 +0,0 @@ -import sst - -gpu = sst.Component("gpu0", "vortex.VortexGPGPU") -gpu.addParams({ - "clock": "1GHz", - "program": "tests/kernel/fibonacci/fibonacci.vxbin" -}) diff --git a/ci/sst_test_vortex_hello.py b/ci/sst_test_vortex_hello.py deleted file mode 
100644 index ca4fc0199..000000000 --- a/ci/sst_test_vortex_hello.py +++ /dev/null @@ -1,7 +0,0 @@ -import sst - -gpu = sst.Component("gpu0", "vortex.VortexGPGPU") -gpu.addParams({ - "clock": "1GHz", - "program": "tests/kernel/hello/hello.vxbin" -}) diff --git a/ci/sst_test_vortex_memHierarchy.py b/ci/sst_test_vortex_memHierarchy.py deleted file mode 100644 index 2193985fb..000000000 --- a/ci/sst_test_vortex_memHierarchy.py +++ /dev/null @@ -1,63 +0,0 @@ -# SST Phase 3 integration test for vortex.VortexGPGPU. -# -# Wires the VortexGPGPU component's optional `memIface` SubComponent slot -# through an L1 cache to a memHierarchy.MemController. Every memory request -# accepted by Vortex's local DRAM model is mirrored to the SST memHierarchy -# as a StandardMem::Read or Write event, so memHierarchy can model timing / -# capacity / contention alongside Vortex's own simulation. -# -# This is the Phase 3 demonstrator from docs/proposals/sst_simx_v3_proposal.md. -# The local data path stays in Vortex (RAM is authoritative); SST sees -# every transaction but doesn't have to serve data back. That gives us -# meaningful integration without forcing v3's TLM data path through SST. - -import sst - -# --- Vortex GPGPU component (single-warp hello kernel) ----------------------- -gpu = sst.Component("gpu0", "vortex.VortexGPGPU") -gpu.addParams({ - "clock": "1GHz", - "program": "tests/kernel/hello/hello.vxbin", -}) - -# Vortex's StandardMem-side adapter -gpu_mem_iface = gpu.setSubComponent("memIface", "memHierarchy.standardInterface") - -# --- L1 cache between Vortex and memory -------------------------------------- -# A cache is required because memHierarchy.MemController routes via MemLink -# and only registers its address range when there's an upstream cache that -# advertises destinations. 
-l1 = sst.Component("l1cache", "memHierarchy.Cache") -l1.addParams({ - "access_latency_cycles": "2", - "cache_frequency": "1GHz", - "replacement_policy": "lru", - "coherence_protocol": "MESI", - "associativity": "4", - "cache_line_size": "64", - "L1": "1", - "cache_size": "8KiB", -}) - -# --- Memory controller + simple backend (host RAM-backed) -------------------- -memctrl = sst.Component("memctrl0", "memHierarchy.MemController") -memctrl.addParams({ - "clock": "1GHz", - "addr_range_end": 0x100000000 - 1, # 4 GB -}) -memory = memctrl.setSubComponent("backend", "memHierarchy.simpleMem") -memory.addParams({ - "access_time": "10ns", - "mem_size": "4GiB", -}) - -# --- Wiring ------------------------------------------------------------------ -# Vortex GPGPU → L1 cache -link_gpu_l1 = sst.Link("link_gpu_l1") -link_gpu_l1.connect((gpu_mem_iface, "lowlink", "1ns"), - (l1, "highlink", "1ns")) - -# L1 cache → MemController -link_l1_mem = sst.Link("link_l1_mem") -link_l1_mem.connect((l1, "lowlink", "1ns"), - (memctrl, "highlink", "1ns")) diff --git a/ci/sst_test_vortex_vecadd.py b/ci/sst_test_vortex_vecadd.py deleted file mode 100644 index 8a156cf81..000000000 --- a/ci/sst_test_vortex_vecadd.py +++ /dev/null @@ -1,7 +0,0 @@ -import sst - -gpu = sst.Component("gpu0", "vortex.VortexGPGPU") -gpu.addParams({ - "clock": "1GHz", - "program": "tests/kernel/vecadd/vecadd.vxbin" -}) diff --git a/configure b/configure index 14c0880d1..ea1abb5eb 100755 --- a/configure +++ b/configure @@ -69,7 +69,7 @@ copy_files() { continue fi mkdir -p "$dest_dir" - sed "s|@VORTEX_HOME@|$SOURCE_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@INSTALLDIR@|$PREFIX|g; s|@BUILDDIR@|$CURRENT_DIR|g; s|@TOOLCHAIN_REV@|$TOOLCHAIN_REV|g; s|@VORTEX_VERSION@|$VORTEX_VERSION|g" "$file" > "$dest_file" + sed "s|@VORTEX_HOME@|$SOURCE_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@INSTALLDIR@|$PREFIX|g; s|@BUILDDIR@|$CURRENT_DIR|g; 
s|@TOOLCHAIN_REV@|$TOOLCHAIN_REV|g; s|@VORTEX_VERSION@|$VORTEX_VERSION|g; s|@GEM5_REV@|$GEM5_REV|g" "$file" > "$dest_file" # apply permissions to bash scripts read -r firstline < "$dest_file" if [[ "$firstline" =~ ^#!.*bash ]]; then diff --git a/docs/gem5_integration.md b/docs/gem5_integration.md new file mode 100644 index 000000000..5b2e0f1af --- /dev/null +++ b/docs/gem5_integration.md @@ -0,0 +1,441 @@ +# gem5 Integration + +Vortex runs inside the [gem5](https://www.gem5.org/) simulator as a +`DmaDevice` SimObject, exposing the Vortex GPGPU to a simulated host +CPU (x86 or ARM) through a Command Processor regfile + BAR-mapped +VRAM. Use this when you want to model heterogeneous host-CPU + +accelerator workloads with realistic cross-ISA cache and DMA timing, +or to validate the v2 Command Processor architecture against a real +host/device split. + +For the redesigned architecture, see +[docs/proposals/gem5_v2_cp_migration_proposal.md](proposals/gem5_v2_cp_migration_proposal.md). +The earlier [gem5_simx_v3_proposal.md](proposals/gem5_simx_v3_proposal.md) +covers the original OPAE-protocol design; its §3 (host/device protocol) +and §4 (SimObject design) are superseded by the v2 migration. This +document is the operator manual for the current (v2 CP-first) design. 
+ +## At a glance + +Three parts live in this repo: + +| Part | Source | Built artifact | Loaded by | +|---|---|---|---| +| Device library | `sim/simx/gem5/vortex_gpgpu.{cpp,h}` + `dev_mem.{cpp,h}` | `build/sim/simx/libvortex-gem5.so` | gem5 SimObject via `dlopen` | +| gem5 SimObject | `sim/simx/gem5/vortex_gpgpu_dev.{cc,hh}` + `VortexGPGPU.py` + `SConscript` | Linked into `gem5.opt` after install | gem5 itself | +| Host runtime | `sw/runtime/gem5/{vortex.cpp,driver.{cpp,h},Makefile}` | `build/sw/runtime/libvortex-gem5-{x86_64,aarch64}.so` | The simulated process inside gem5 | + +Plus `ci/gem5_install.sh` which fetches gem5 v25.0.0.1, drops the +SimObject sources into `$GEM5_HOME/src/dev/vortex/`, and builds +`build/{X86,ARM}/gem5.opt`. + +## Architecture in one paragraph + +The simulated host process loads the upstream dispatcher +(`libvortex.so`) which dlopens the gem5 backend +(`libvortex-gem5-x86_64.so`). The backend's only platform primitives +are `mem_upload/download/copy` (regular memcpy through a host-visible +BAR mapped to device VRAM) and `cp_mmio_{read,write}` (32-bit PIO to +the device's CP regfile). All kernel launches, DCR programming, and +fences flow through the dispatcher's Command Processor submission +path: it writes `CMD_*` descriptors into a ring buffer in device VRAM +(via mem_upload), commits via `cp_mmio_write(Q_TAIL_HI, ...)`, and +polls completion via `cp_mmio_read(Q_SEQNUM, ...)`. The CP itself is +the upstream `vortex::CommandProcessor` C++ class embedded in the +device library; the SimObject ticks it on its own gem5 event chain +and ticks the Vortex Processor on a parallel chain. Both event chains +self-schedule only while they have work — the device is genuinely +idle between commands. 
+ +## One-time setup + +Vortex install / build as usual ([docs/install_vortex.md](install_vortex.md)), +then add gem5: + +```bash +cd build/ # standard Vortex out-of-tree build directory +./ci/gem5_install.sh +``` + +This runs `sudo apt install` for gem5's build dependencies (scons, +libprotobuf, m4, libboost, **gcc-aarch64-linux-gnu**, …), clones gem5 +v25.0.0.1 into `$TOOLDIR/gem5`, copies the Vortex SimObject sources +into `$GEM5_HOME/src/dev/vortex/`, and builds `gem5.opt` for both X86 +and ARM (~15 min on a 64-core machine, ~30-45 min on a typical CI +runner). The script is idempotent — re-running with the same +`GEM5_REV` is a no-op. + +To choose which ISA targets are built: + +```bash +GEM5_TARGETS="X86" ./ci/gem5_install.sh # X86 only +GEM5_TARGETS="ARM" ./ci/gem5_install.sh # ARM only +GEM5_TARGETS="X86 ARM" ./ci/gem5_install.sh # both (default) +``` + +The pinned gem5 revision lives in `VERSION` (`GEM5_REV=v25.0.0.1`); +bumping it requires re-running `ci/gem5_install.sh` and verifying +both `gem5.opt` builds still load `VortexGPGPU` cleanly. + +## Building Vortex with gem5 support + +The device library is gated behind `USE_GEM5=1`. The default +`make -C sim/simx` is **unchanged** — no gem5 dep, no `libvortex-gem5.so` +produced. + +```bash +make -C sim/simx # default; no gem5 artifacts +make -C sim/simx USE_GEM5=1 # produces libvortex-gem5.so + gem5_smoke +``` + +`USE_SST=1` and `USE_GEM5=1` are mutually exclusive (the Makefile +errors out if both are set). + +### Host runtime + tests (cross-compile) + +The simulated process inside gem5 loads the **host runtime** +`libvortex-gem5-$HOST_ARCH.so`, which exposes the pure-v2 `callbacks_t` +to the dispatcher.
The `HOST_ARCH` knob is consistent across three +Makefiles — runtime backend, stub, and regression tests: + +```bash +# Native x86 (default) +make -C sw/runtime/stub # → build/sw/runtime/libvortex.so +make -C sw/runtime/gem5 # → build/sw/runtime/libvortex-gem5-x86_64.so +make -C tests/regression/vecadd # → build/tests/regression/vecadd/vecadd + +# Cross-compiled aarch64 — outputs land in $arch/ subdirs so x86 +# and ARM artifacts coexist: +make -C sw/runtime/stub HOST_ARCH=aarch64 # → build/sw/runtime/aarch64/libvortex.so +make -C sw/runtime/gem5 HOST_ARCH=aarch64 # → build/sw/runtime/aarch64/libvortex-gem5-aarch64.so +make -C tests/regression/vecadd HOST_ARCH=aarch64 # → build/tests/regression/vecadd/vecadd-aarch64 + +# armhf works the same way (note: armhf is 32-bit so the BAR +# mapping above 4 GiB is out of reach — only standalone tests work): +make -C sw/runtime/stub HOST_ARCH=armhf +make -C sw/runtime/gem5 HOST_ARCH=armhf +``` + +The ARM targets require `gcc-aarch64-linux-gnu` / +`gcc-arm-linux-gnueabihf` respectively — `ci/gem5_install.sh` +installs these. + +## Running tests + +### From the regression harness + +```bash +cd build/ +./ci/regression.sh --gem5 +``` + +Runs both the standalone Phase-3 smoke test (kernel preloaded on the +SimObject, no host CPU) and the Phase-5 end-to-end test (real SE-mode +host program drives the device through CP submissions). + +To also run the ARM matrix entry (needs `gcc-aarch64-linux-gnu`): + +```bash +VORTEX_GEM5_ARM=1 ./ci/regression.sh --gem5 +``` + +Runs 6 tests: +- X86 standalone hello (no host CPU; SimObject preloads kernel) +- X86 e2e vecadd `-n16` (host CPU drives device via CP regfile) +- X86 e2e sgemm `-n4` +- ARM standalone hello +- ARM e2e vecadd `-n16` +- ARM e2e sgemm `-n4` + +Cross-arch e2e relies on two gem5 mechanisms working together: + +1. 
**`setInterpDir(prefix)`** prepends a sysroot to the dynamic + linker path embedded in the cross-compiled ELF + (`/lib/ld-linux-aarch64.so.1` → `/usr/aarch64-linux-gnu/lib/...`). + The Python config calls this when `VORTEX_DRIVER=gem5-aarch64`. +2. **`system.redirect_paths`** redirects the *guest process's* + open()/stat() syscalls for `/lib/aarch64-linux-gnu/*` → + `/usr/aarch64-linux-gnu/lib/*` so the dynamic linker can resolve + libc, libstdc++, etc. + +Both paths point at the Ubuntu `gcc-aarch64-linux-gnu` package's +install location — no extra setup needed. + +### By hand + +**Hostless** (no host CPU; kernel preloaded via SimObject parameter): + +```bash +VORTEX_GEM5_DEV_LIB=$(pwd)/sim/simx/libvortex-gem5.so \ +VORTEX_TEST_DIR=$(pwd)/tests/kernel/hello \ +VORTEX_TEST_KERNEL=hello.vxbin \ + $GEM5_HOME/build/X86/gem5.opt ci/gem5_run_hostless_app.py +``` + +`VORTEX_TEST_KERNEL` defaults to `kernel.vxbin`, so any standard +regression test's kernel can be driven hostless without the host +binary — e.g. `VORTEX_TEST_DIR=$(pwd)/tests/regression/vecadd +ci/gem5_run_hostless_app.py`. + +**End-to-end** — any standard Vortex regression test (host binary + +kernel.vxbin) runs through the generic +[`ci/gem5_run_app.py`](../ci/gem5_run_app.py) runner. + +```bash +# vecadd +VORTEX_GEM5_DEV_LIB=$(pwd)/sim/simx/libvortex-gem5.so \ +VORTEX_GEM5_HOST_RT_DIR=$(pwd)/sw/runtime \ +VORTEX_TEST_DIR=$(pwd)/tests/regression/vecadd \ +VORTEX_TEST_BIN=vecadd \ +VORTEX_TEST_ARGS="-n16" \ + $GEM5_HOME/build/X86/gem5.opt ci/gem5_run_app.py + +# sgemm +VORTEX_GEM5_DEV_LIB=$(pwd)/sim/simx/libvortex-gem5.so \ +VORTEX_GEM5_HOST_RT_DIR=$(pwd)/sw/runtime \ +VORTEX_TEST_DIR=$(pwd)/tests/regression/sgemm \ +VORTEX_TEST_BIN=sgemm \ +VORTEX_TEST_ARGS="-n4" \ + $GEM5_HOME/build/X86/gem5.opt ci/gem5_run_app.py +``` + +Expected output ends with: +``` +PASSED! 
+``` + +### Sizing tests for the 120 s budget + +The per-test `timeout 120` bound is the project's 120 s test budget +(internal note `feedback_test_timeout_120s`). +gem5 SE-mode runs the host CPU's CP poll loop in simulated time too, +so **kernel runtime + dispatcher poll budget translate directly into +gem5 wall time**. The regression script's default sizes fit; larger +sizes are fine when run by hand outside the budget cap. + +## Address space layout + +``` +Host process VA (simulated, gem5 SE-mode) | Simulated PA | Backed by +------------------------------------------+--------------+---------------------- +[0x0000_0000_0000, 0x0000_1000_0000) | same | gem5 DDR3 (process + | | heap/stack/code) +[0x0000_2000_0000, 0x0000_2000_0200) | same | VortexGPGPU CP regfile + | | (32-bit PIO) +[0x0001_0000_0000, 0x0002_0000_0000) | same | VortexGPGPU VRAM + | | (BAR-mapped to + | | in-process simx::RAM) +``` + +PIN_BASE_ADDR = `0x100000000` is identity-mapped via `Process.map()` +so host stores at PIN_BASE+dev_addr land in the same in-process +simx::RAM bytes the CP and Vortex read. PIO_BASE_ADDR = `0x20000000` +is identity-mapped (cacheable=False) so the dispatcher's PIO MMIO +reaches the SimObject's regfile decoder. + +These constants are duplicated in two places — `sw/runtime/gem5/driver.h` +and `ci/gem5_run_app.py`. If you change one, change the other. + +## Writing your own gem5 Python script + +The minimal recipe for hosting Vortex inside a custom gem5 system: + +```python +from m5.objects import ( + AddrRange, AtomicSimpleCPU, DDR3_1600_8x8, MemCtrl, Process, + Root, SEWorkload, SrcClockDomain, System, SystemXBar, + VoltageDomain, VortexGPGPU, +) + +# Mappings expected by sw/runtime/gem5/driver.h.
+PIO_BASE, PIO_SIZE = 0x20000000, 0x0200 # CP regfile (32-bit) +PIN_BASE, PIN_SIZE = 0x100000000, 0x100000000 # BAR-mapped VRAM +NUM_CPUS = 4 # >=2 required for the dispatcher's per-Queue worker thread + +system = System() +system.clk_domain = SrcClockDomain(clock="3GHz", + voltage_domain=VoltageDomain()) +system.mem_mode = "atomic" +system.mem_ranges = [AddrRange("1GiB")] +system.membus = SystemXBar() +system.system_port = system.membus.cpu_side_ports + +# Multiple CPU contexts — the upstream dispatcher spawns a per-Queue +# worker thread; clone() in SE-mode needs a free HW context to land on. +system.cpu = [AtomicSimpleCPU(cpu_id=i) for i in range(NUM_CPUS)] +system.multi_thread = True +for cpu in system.cpu: + cpu.createInterruptController() + cpu.icache_port = system.membus.cpu_side_ports + cpu.dcache_port = system.membus.cpu_side_ports + # X86 needs explicit interrupt port wiring; ARM does not. + cpu.interrupts[0].pio = system.membus.mem_side_ports + cpu.interrupts[0].int_requestor = system.membus.cpu_side_ports + cpu.interrupts[0].int_responder = system.membus.mem_side_ports + +# DRAM serves the process's address space below PIO_BASE. +system.mem_ctrl = MemCtrl() +system.mem_ctrl.dram = DDR3_1600_8x8() +system.mem_ctrl.dram.range = AddrRange(0, PIO_BASE) +system.mem_ctrl.port = system.membus.mem_side_ports + +# The Vortex device — claims both the CP regfile PIO range and the +# BAR-mapped VRAM range (gem5_v2_cp_migration §3). +system.vortex = VortexGPGPU( + library = "/path/to/build/sim/simx/libvortex-gem5.so", + kernel = "", # NO preload — the host binary uploads via CP +) +system.vortex.pio_addr = PIO_BASE +system.vortex.pio_size = PIO_SIZE +system.vortex.pin_addr = PIN_BASE +system.vortex.pin_size = PIN_SIZE +system.vortex.pio = system.membus.mem_side_ports +system.vortex.dma = system.membus.cpu_side_ports + +# Workload — the host binary loads libvortex.so + libvortex-gem5-x86_64.so. 
+process = Process( + pid=100, + cwd="/path/to/your/test", + cmd=["/path/to/your/test/binary"], + executable="/path/to/your/test/binary", + env=[ + "VORTEX_DRIVER=gem5-x86_64", + "LD_LIBRARY_PATH=/path/to/build/sw/runtime", + ], +) + +system.workload = SEWorkload.init_compatible(process.executable) +for cpu in system.cpu: + cpu.workload = process # required: workload size must equal numThreads + cpu.createThreads() + +import m5 +root = Root(full_system=False, system=system) +m5.instantiate() + +# CRITICAL: Process.map() must come AFTER m5.instantiate(). +# Identity-mapping PIO + PIN gives the runtime direct CPU access to +# the device's CP regfile and to BAR-mapped VRAM. +system.cpu[0].workload[0].map(PIO_BASE, PIO_BASE, PIO_SIZE, cacheable=False) +system.cpu[0].workload[0].map(PIN_BASE, PIN_BASE, PIN_SIZE, cacheable=False) + +m5.simulate() +``` + +Reference implementations: +- [ci/gem5_run_hostless_app.py](../ci/gem5_run_hostless_app.py) — hostless variant (preload via `kernel=` param; no host CPU) +- [ci/gem5_run_app.py](../ci/gem5_run_app.py) — e2e variant (any regression test via `VORTEX_TEST_BIN`) + +## Load-bearing invariants — do not violate + +### 1. Process.map() goes AFTER m5.instantiate() + +`Process.map(vaddr, paddr, size)` is a C++ method on the underlying +`gem5::Process` object; that object only exists after +`m5.instantiate()` builds the SimObject tree. Calling `.map()` +before instantiate raises `RuntimeError: Attempt to instantiate +orphan node `. Confirmed by gem5's own AMD GPU +integration at `$GEM5_HOME/configs/example/apu_se.py:1055`. + +### 2. PIO and PIN regions must be identity-mapped — and PIN must be cacheable=False + +`sw/runtime/gem5/driver.h` hard-codes: +- `PIO_BASE_ADDR = 0x20000000` (CP regfile; 0x200 bytes) +- `PIN_BASE_ADDR = 0x100000000` (BAR-mapped VRAM; 4 GB) + +The Python config must `process.map()` both at the same physical +addresses, with `cacheable=False` on PIN. 
With caching enabled the +host CPU's L1 could hold the new ring entry while `Q_TAIL_HI` is +observed by the CP — the CP fetches a stale CL and the dispatcher +hangs polling `Q_SEQNUM`. + +Changing either constant requires updating both the Python config +**and** `sw/runtime/gem5/driver.h` (they are not auto-synced). + +### 3. CPU thread context count must be >= 2 + +The upstream dispatcher (commit `157e7a1`) spawns a per-Queue worker +thread at `vx_queue_create`. SE-mode `clone()` returns EAGAIN if +there is no free HW context, which surfaces as +`std::system_error: Resource temporarily unavailable` at the +dispatcher constructor. + +Use multiple CPU instances (one per thread) and +`system.multi_thread = True`. Assigning the same Process to every +CPU is required because gem5 fatals if +`workload.size() != numThreads`. + +### 4. PIO accesses to the CP regfile are 32-bit + +The CP regfile is 32-bit-wide; `cp_mmio_write/read` in the host +runtime are explicitly 32-bit (`mmio_write32` / `mmio_read32` in +`driver.cpp`). Don't issue 64-bit accesses — gem5 will deliver a +single packet of the wrong width and the SimObject will route the +extra bytes into the next regfile slot. + +### 5. The Vortex `Processor` and `CommandProcessor` are independent gem5 event chains + +`cpTickEvent_` advances the CP one functional cycle; `vortexTickEvent_` +advances the Vortex `Processor::cycle()`. Both self-schedule only +while their respective busy flag is true. When the CP fires +`CMD_LAUNCH`, the `vortex_start` hook schedules `vortexTickEvent_` +via the registered start handler (set at `VortexGPGPU` construction). +Don't try to combine them into a single tick — that breaks +"concurrent host + CP + GPU progress" which is the whole point of +the simulation model. + +### 6. USE_SST=1 and USE_GEM5=1 are mutually exclusive + +The Makefile rejects both at once. Different external simulators, +different LDFLAGS, different `libvortex.so` shapes. Pick one per +build. 
+ +## Architectural choices you may want to revisit + +These are documented in the +[v2 CP migration proposal](proposals/gem5_v2_cp_migration_proposal.md) +but worth surfacing: + +- **In-process VRAM with DevMemAccessor seam** (proposal §2.5). v1 + uses `InProcessDevMem` (wraps simx::RAM directly). The accessor + interface is designed to be swappable to a gem5 `SimpleMemory` + + DMA-port path in v2 without touching CP hook code or Vortex memory + code. +- **Single ClockDomain for CP + Vortex in v1** (proposal §2.4, D2). + Real silicon has separate clocks; v2 would add a second + `ClockDomain` and rate-match the tick events. +- **Raw PIO range, not a PCIe BAR / config space** + (proposal §2.1). Swap base class from `DmaDevice` to `PciDevice` + for a more realistic FS-mode integration. +- **Polling completion, not MSI-X interrupts** (proposal §8). The + host runtime spins on `Q_SEQNUM` PIO reads. v2 work would let the + CP raise an interrupt and let the dispatcher sleep until it fires. +- **Multi-queue PIO map reserves 4 slots; v1 host runtime exercises + Q0 only** (proposal §2.6, D4). Q1–Q3 hardware is ready for future + vortex2.h multi-queue work. + +## CI + +`./ci/regression.sh --gem5` runs the gem5 tests. They are intentionally +**not** part of `--all`: the gem5 install is heavy and gated like SST. The +`.github/workflows/ci.yml` matrix includes a `gem5` entry that runs +on hosted runners; the ARM matrix is gated on `VORTEX_GEM5_ARM=1`. + +Apptainer integration (the `apptainer-ci.yml` pipeline) does **not** +include gem5 — adding it to `miscs/apptainer/vortex.def` is out of +scope. Use the hosted CI for gem5.
+ +## Troubleshooting + +| Symptom | Cause | Fix | +|---|---|---| +| `dlopen('libvortex-gem5.so') failed: cannot open shared object file` | gem5 SimObject can't find the device library | Set `VortexGPGPU(library="/abs/path/to/libvortex-gem5.so", ...)` to absolute path | +| `Cannot open library: libvortex-gem5-x86_64.so: cannot open shared object file` | Stub can't find the host runtime backend | Set `LD_LIBRARY_PATH=/path/to/sw/runtime` in the `env=[...]` list passed to `Process()` | +| `terminate called after throwing an instance of 'std::system_error': Resource temporarily unavailable` | Dispatcher's per-Queue worker `std::thread` can't `clone()` into a free HW context | Use multiple CPU instances + `system.multi_thread = True`; assign the same Process to every CPU (invariant §3) | +| `system.membus has two ports responding within range [...]` | DRAM `mem_ctrl.dram.range` overlaps with VortexGPGPU's PIO or PIN range | Shrink `dram.range = AddrRange(0, PIO_BASE)` so the device-owned ranges have exclusive routing | +| `Tried to write unmapped address 0xXXX` | Host runtime is using stale PIN_BASE_ADDR (mismatch with Python config), or `Process.map()` was skipped | Confirm both `sw/runtime/gem5/driver.h` and the Python config use the same `PIN_BASE_ADDR`; ensure `Process.map(PIN_BASE, PIN_BASE, PIN_SIZE)` runs after `m5.instantiate()` | +| `Attempt to instantiate orphan node ` | `Process.map()` called before `m5.instantiate()` | Move all `.map()` calls AFTER `m5.instantiate()` — see invariant §1 above | +| `fatal: VortexGPGPU: dlsym(vortex_gem5_cp_mmio_write) failed` | Device library is missing the C ABI symbol — usually means the `library=` parameter points at the wrong .so | `library=` is the **device** library `build/sim/simx/libvortex-gem5.so` (no arch suffix), NOT the host runtime `libvortex-gem5-x86_64.so` | +| `fatal: system.membus has two ports responding within range [0x10000000:0x20000000]` (standalone hello) | `pin_size` defaulted to non-zero in an old 
gem5.opt; standalone test doesn't need the BAR | Re-install + rebuild gem5.opt OR explicitly set `pin_size = 0` on the VortexGPGPU instance | +| Test hangs polling `Q_SEQNUM` after first launch | Cacheable PIN region — host's L1 holds the ring entry; CP sees stale bytes | Set `cacheable=False` on the PIN `Process.map()` call (invariant §2) | +| `ccache g++ ... undefined reference to fmt::v8::detail::error_handler::on_error` | ccache served a stale object compiled against a different `fmt` version | `CCACHE_DISABLE=1 make -C sim/simx clean && CCACHE_DISABLE=1 make ...` | diff --git a/docs/index.md b/docs/index.md index a7b9000d4..0c3504d72 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,3 +8,4 @@ - [Contributing](contributing.md): Process for contributing your own features including repo semantics and testing - [Debugging](debugging.md): Debugging configurations for each Vortex driver - [Building the Toolchain from Source](building_toolchain.md): Maintainer-facing build recipes for Verilator, RISC-V GNU, LLVM (with X86 + lld + SPIR-V), compiler-rt, musl, and POCL +- [gem5 Integration](gem5_integration.md): Running Vortex inside the gem5 simulator (x86/ARM SE-mode host CPU + Vortex device over a Command Processor regfile and BAR-mapped VRAM) diff --git a/docs/proposals/gem5_simx_v3_proposal.md b/docs/proposals/gem5_simx_v3_proposal.md new file mode 100644 index 000000000..3bce6ff95 --- /dev/null +++ b/docs/proposals/gem5_simx_v3_proposal.md @@ -0,0 +1,1040 @@ +# gem5 Integration for SimX v3 — Proposal + +**Date:** 2026-05-16 +**Status:** ✅ Original Phases 0–7 complete (OPAE-protocol design). **§3 (host/device protocol) and §4 (SimObject design) SUPERSEDED** by [gem5_v2_cp_migration_proposal.md](gem5_v2_cp_migration_proposal.md) after upstream's pure-v2 `callbacks_t` + CommandProcessor landed (commits `086d26b`, `8bc2564`, `16aa1ca`). The current operator manual is [docs/gem5_integration.md](../gem5_integration.md).
§0–§2 (motivation, source-tree layout) and §5+ (testing, install, cross-arch) remain accurate. +**Author:** Blaise Tine +**Related:** +[simx_v3_proposal.md](simx_v3_proposal.md) (Phase 5: TLM data path), +[sst_simx_v3_proposal.md](sst_simx_v3_proposal.md) (the sister integration whose patterns this proposal follows), +[master_merge_v3_proposal.md](master_merge_v3_proposal.md) §10.2 (the precedent for cross-simulator integrations on this line), +[`~/dev/vortex_gem5`](https://github.com/sij814/vortex_gem5) on branch `gem5`, commit `91dcf17` ("working Vortex with gem5", 2025-05-22 — Injae Shin, UCLA capstone), +[Injae Shin, "gem5-Vortex: Heterogeneous Cross-ISA Integration of Vortex GPGPU in gem5"](#) (capstone report, 2025). + +--- + +## 1. Constraints (load-bearing) + +Any design that breaks one of these is wrong. + +1. **One source of truth for memory state.** Per + [simx_v3_proposal.md §3.3](simx_v3_proposal.md), data lives in the + channel hierarchy: `MemReq`/`MemRsp` packets carry actual bytes + between `MemCoalescer` → `Cache` → `Memory`, and the `RAM` image + attached to `Memory` is authoritative. There is no shadow backing + store and no parallel `MemBackend`. The gem5 integration plugs in at + exactly one boundary (the device's DMA port maps to `RAM` + read/write); it does **not** introduce a second data path. +2. **Single clock owner per simulation.** Under gem5, gem5 drives the + clock: `VortexGPGPU::tick()` (a gem5 `EventFunctionWrapper` that + reschedules itself every cycle at the device clock) calls + `Processor::cycle()`. SimX does not advance on its own and there is + no worker thread doing async `Processor::run()` in the background. + (This is a deliberate departure from the legacy `vortex_gem5` design + — see §2.2 — which is the source of most of that branch's bugs.) +3. 
**gem5 plugs in at one boundary, not many.** Vortex → gem5 traffic + crosses two well-defined interfaces: + - **PIO** for MMIO command/status registers (the OPAE AFU image + layout, unchanged from `sw/runtime/opae`). + - **DMA** for staging-buffer host↔device transfers, and for any + future host-visible memory window. + The cache hierarchy, scheduler, ALU/FPU, KMU, and the new + `Processor::cycle()` entry point do not know gem5 exists. +4. **No regression for non-gem5 builds.** `make -C sim/simx` (no + `USE_GEM5=1`) continues to produce a self-contained `simx` binary + identical to today's. gem5 is opt-in compile-time, not a runtime + probe, and ships as a separate shared library (`libvortex-gem5.so`) + that the gem5 SimObject loads. Per §1.4 of + [sst_simx_v3_proposal.md](sst_simx_v3_proposal.md). +5. **The Vortex tree owns the integration code.** All gem5-facing C++ + (the `DmaDevice` SimObject) and Python (SimObject config + test + scripts) live under `sim/simx/gem5/` and `ci/gem5_test_vortex_*.py` + in this repo. `ci/gem5_install.sh` fetches a pinned upstream gem5 + release and copies/symlinks our SimObject into its source tree + before building. Versioning the integration alongside Vortex is what + makes it possible to review API-breaking changes in a single PR; + the legacy split across two repos is what froze `vortex_gem5` at a + two-year-old SimX. +6. **Author attribution.** The legacy `vortex_gem5` design (DMA-bouncing + through a pinned staging buffer, OPAE-shaped MMIO command set, ARM + SE-mode runtime) is Injae Shin's capstone work. The + re-implementation is a rewrite, not a port (§2), but each new file's + commit body cites the capstone report and the legacy commit + (`vortex_gem5@91dcf17`). + +--- + +## 2. 
Why the legacy `vortex_gem5` cannot be ported as-is + +### 2.1 The architectural mismatch + +`vortex_gem5` was built on pre-v3 SimX (`Arch`, `Processor*`, +single-step `run()`, `set_running(true)`, `VX_DCR_BASE_*` startup DCRs +broadcast to all cores). v3 explicitly retired all of those: + +| Concern | Legacy SimX (vortex_gem5) | SimX v3 (this branch) | +|---|---|---| +| Sizing | `Arch arch(NUM_THREADS, NUM_WARPS, NUM_CORES)` object | Macros (`NUM_THREADS`, etc.) — no `Arch` class | +| Top-level | `Processor(arch)` ctor with arg | `Processor()` no-arg ctor | +| Run model | `processor->run()` is one cycle | `processor.run()` blocks to completion | +| Single-cycle step | `processor->run()` per cycle from `proc_tick()` | does not exist — must be added (`Processor::cycle()`) | +| Kernel dispatch | `set_running(true)` + `VX_DCR_BASE_STARTUP_*` | `KMU::start()` + `VX_DCR_KMU_*` (startup + grid/block dims) | +| Cache flush | implicit in `run()` finish | explicit: `dcr_read(VX_DCR_BASE_CACHE_FLUSH, cid, &dummy)` per core before host read-back | +| Memory hierarchy | `MemSim` + `CacheSim` are timing-only, data sits in `MemBackend` (`Emulator`-side) | `Memory` + `Cache` carry data through `MemReq`/`MemRsp`; backing image is in `RAM` attached to `Memory` | +| Runtime layout | top-level `runtime/{stubarm,opaesimx}/` | reorganized under `sw/runtime/` per [master_merge §3](master_merge_v3_proposal.md) | + +So the **shape of the gem5 plug-in changes**: not "tick the legacy +single-cycle Processor" but "add a `cycle()` entry point to the v3 +Processor and call it from the gem5 SimObject," with KMU-style dispatch +and an explicit cache-flush before host read-back. + +### 2.2 Specific bugs in the legacy code + +A walk-through of `vortex_gem5/sim/{simx,opaesimx}/` and +`vortex_gem5/runtime/{stubarm,opaesimx}/` found the following defects. +Each is called out so the redesign does not re-introduce it. 
+ +| # | File | Defect | Why it matters | +|---|---|---|---| +| B1 | `sim/simx/simx_device.cpp:122` (`proc_tick`) | Calls `processor_->run()` directly. On legacy SimX this was a single step; on v3 it would block until program completion. | The "tick per gem5 cycle" pattern simply won't work. We must add a real single-cycle `Processor::cycle()` (already required for SST). | +| B2 | `sim/simx/simx_device.cpp:111` (`start`) | `processor_->set_running(true)` — that API does not exist in v3. The KMU now drives execution and requires `VX_DCR_KMU_GRID_DIM_*` / `VX_DCR_KMU_BLOCK_DIM_*` to be written before the first cycle. | Even after re-plumbing, kernels won't launch without the KMU DCR setup (see `sim/simx/main.cpp:101–116`). | +| B3 | `sim/opaesimx/opae_simx.cpp:185, 199` (`read_mmio64`/`write_mmio64`) | Implementation is `*(uint64_t*)(GEM5_BASE_ADDR + offset)` — a raw host-pointer dereference into a fixed virtual address. | Only works when the host runtime and the gem5 device share an address space (i.e., when the host runtime is *not* actually inside gem5). It is a stand-in for the real path, not the real path. Cross-ISA simulation defeats the assumption: an ARM userspace process inside gem5 cannot dereference `0x20000000` and reach the device. The legacy code papers over this with a co-resident driver hack; v3 needs a real PIO/DMA path. | +| B4 | `sim/opaesimx/opae_simx.cpp:204–399` | Several hundred lines of commented-out CCI/AVS bus + Verilator (`device_->…`) plumbing left in place, referencing fields and types that do not exist in this file. | Dead code that obscures what the module actually does. Drop it; the new gem5 wrapper has no CCI bus to model. | +| B5 | `sim/opaesimx/opae_simx.cpp:71` (`dram_sim_` field) | DRAM model is constructed but never ticked or consulted after the gem5 hack landed. | Dead state. | +| B6 | `sim/opaesimx/opae_simx.cpp:103` (`pinned_alloc_`) | Uses `PIN_BASE_ADDR = 0x10000000` with `PINNED_MEM_SIZE = 0xFFFFFF` (16 MB), hardcoded.
No bounds check beyond `MemoryAllocator::allocate` failure. | Tiny by design — large kernel inputs would silently fail. The v3 design should size from `GLOBAL_MEM_SIZE`/`ALLOC_BASE_ADDR` and surface OOM errors. | +| B7 | `runtime/opaesimx/vortex.cpp:324, 367` | `auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE);` — uses float `log2` for an integer constant, then discards the result. | Cosmetic / dead, but a smell. Use `log2ceil(CACHE_BLOCK_SIZE)` from `sw/common/util.h`. | +| B8 | `runtime/opaesimx/vortex.cpp:418–474` (`ready_wait`) | `nanosleep` call is **commented out**; the busy loop only decrements `timeout_ms` and never sleeps. On a long-running kernel inside gem5 SE-mode this saturates the simulated ARM core. | Either use the gem5 device's interrupt path (preferred — implementable as an MMIO doorbell) or restore the `nanosleep` so the ARM CPU is idle while the GPU runs. | +| B9 | `runtime/opaesimx/vortex.cpp:349–390` (`download`) | No cache-flush step before reading back results from device memory. | On v3, dirty lines must be drained via `dcr_read(VX_DCR_BASE_CACHE_FLUSH, cid, &dummy)` per core (see `sim/simx/main.cpp:194–197`, `sw/runtime/simx/vortex.cpp:191–197`) or the host sees stale data. | +| B10 | `runtime/opaesimx/vortex.cpp:478–489` (`dcr_write`) | OPAE protocol has `CMD_DCR_WRITE` but no `CMD_DCR_READ`. | The cache-flush fix above requires a `dcr_read` path. Current `sw/runtime/opae` already adds `CMD_DCR_READ` + `MMIO_DCR_RSP` — adopt the same shape on the gem5 device. | +| B11 | `runtime/stubarm/vortex.cpp:54` | `static callbacks_t g_callbacks;` global with `vx_dev_init(&g_callbacks)` resolved at link time. | Works for a single-device test but breaks `vx_dev_open` from being called concurrently from two host processes. Less critical for the gem5 use case (single device per simulation) but worth flagging. 
| +| B12 | `sim/simx/simx_device.cpp` (`Impl`) | Uses `std::future future_` for shutdown synchronization but `proc_tick()` calls `processor_->run()` directly on the caller thread. The mutex / future plumbing implies an async model that isn't actually used. | Confused concurrency contract. The v3 design must pick one: synchronous tick from the gem5 event loop (this proposal) **or** async run with a doorbell — not both. | +| B13 | `runtime/stubarm/Makefile:7` + `runtime/opaesimx/Makefile:9` | Cross-compiler hardcoded to `arm-linux-gnueabihf-g++` (32-bit ARM hard-float). | gem5 also models AArch64 ARMv8 and x86_64, and most contemporary ARM ports are 64-bit. The v3 build selects compiler from a `HOST_ARCH` make variable (`x86_64`, `aarch64`, `armhf`); see Phase 4. | +| B14 | `runtime/opaesimx/vortex.cpp:489` (`dcr_write`) and `stubarm/vortex.cpp:139` | Both runtimes write to DCR via the OPAE protocol but no MMIO ordering / fence is established between DCR writes and the `CMD_RUN` MMIO. | Inside gem5 the host CPU model may reorder MMIO. Need an explicit barrier before `CMD_RUN` (per `HOST_ARCH`: `mfence` for x86, `dmb sy` for ARM). Phase 4 provides a `vortex_gem5_mmio_fence()` inline helper. | +| B15 | `sim/opaesimx/opae_simx.cpp:138–157` (`prepare_buffer`) | Returns `*buf_addr = (void*)buffer.ioaddr;` — casts an integer device IO address back to a `void*`. | The runtime then dereferences this pointer to do `memcpy(staging_ptr_, host_ptr, size)` (line 322 of `runtime/opaesimx/vortex.cpp`). Same root cause as B3 — only works when host runtime and device share an address space. Under real gem5 the runtime must `mmap` the pinned region via a syscall the gem5 device intercepts, or the gem5 device must expose the pinned region as a PIO/DMA window. | + +Together B1, B2, B3, B6, B9, B14 and B15 mean the legacy integration as +literally written does not run a kernel correctly under v3 even after +the path renames are applied; it requires architectural rework, not +porting. 
+ +### 2.3 What still ports as design intent + +The legacy paper's design intent — and these are what we keep: + +- **OPAE-shaped MMIO command set.** `CMD_RUN`, `CMD_MEM_READ`, + `CMD_MEM_WRITE`, `CMD_DCR_WRITE`, `MMIO_CMD_TYPE`, `MMIO_CMD_ARG0..2`, + `MMIO_STATUS`. Add `CMD_DCR_READ` + `MMIO_DCR_RSP` per the v3 OPAE + runtime (B10). The kernel runtime under `sw/runtime/gem5/` reuses + this layout so the same `vortex.h` shim layer that drives `opae` + also drives `gem5`. +- **Pinned staging buffer pattern** for host↔device transfers. A + fixed device-visible region of host address space; runtime + `memcpy`'s into it, device DMAs out of it. Sizing is dynamic + (allocate-on-demand) rather than the legacy fixed-16-MB chunk (B6). +- **Single-PIO-range device** registered to gem5 with the OPAE MMIO + offsets. The runtime issues 64-bit MMIO writes; the SimObject + decodes them in `write()` / `read()`. +- **The host SE-mode runtime** (`sw/runtime/gem5/`, native x86 or cross-compiled ARM) + shipped into gem5's SE-mode app, **NOT** a full-system Linux on the + guest. The paper makes this point explicitly and it is the + differentiator vs. NoMali (FS-only) and AMD GPU (FS-only). See + `capstone §IIC`. + +### 2.4 What needs a v3 redesign + +- **`sim/simx/simx_device.{cpp,h}`** — replace with + `sim/simx/gem5/vortex_gpgpu.{cpp,h}` (the SimObject wrapper) + plus reuse of the new `Processor::cycle()` API. The legacy file's + `Impl` class is the wrong shape (B1, B2, B12). +- **`sim/opaesimx/opae_simx.{cpp,h}`** — delete entirely. The legacy + module is a host-side OPAE stub whose `read_mmio64`/`write_mmio64` + do raw pointer arithmetic (B3, B15). The v3 design routes MMIO + through gem5's PIO port; there is no host-side stub. +- **`runtime/opaesimx/`** — delete. The OPAE-stub path was a + pre-gem5 debugging convenience; under v3 we test the gem5 device + end-to-end via a gem5 Python script (§4, Phase 5), not via a + co-resident driver. 
+- **`runtime/stubarm/`** — replace with `sw/runtime/gem5/`, + re-implemented against the same `callbacks.h` ABI as + `sw/runtime/simx`/`opae`/`rtlsim`, with cache-flush plumbed in + (B9), MMIO fences before `CMD_RUN` (B14), and a configurable ARM + cross-compiler target (B13). + +--- + +## 3. Target architecture + +``` + ┌───────────────────────────────────────────────┐ + │ gem5 simulation │ + │ ───────────────── │ + │ ./ci/gem5_test_vortex_hello.py │ + │ (gem5.opt is build/X86/gem5.opt or │ + │ build/ARM/gem5.opt; both supported) │ + │ │ + │ ┌─────────────┐ ┌─────────────────┐ │ + │ │ Host CPU │ ──PIO─▶ │ VortexGPGPU │ │ + │ │ (X86 or ARM,│ ◀─PIO── │ (DmaDevice ↓ │ │ + │ │ SE mode) │ │ PioDevice) │ │ + │ │ user │ │ ┌───────────┐ │ │ + │ │ binary: │ │ │ MMIO regs │ │ │ + │ │ hello + │ │ └───────────┘ │ │ + │ │ libvortex- │ │ ┌───────────┐ │ │ + │ │ gem5.so │ ──DMA─▶ │ │ Pinned │ │ │ + │ │ (native │ ◀─DMA── │ │ staging │ │ │ + │ │ for X86, │ │ │ buffer │ │ │ + │ │ cross- │ │ │ window │ │ │ + │ │ compiled │ │ └───────────┘ │ │ + │ │ for ARM) │ │ │ │ │ + │ └─────────────┘ │ ▼ │ │ + │ │ │ ┌───────────┐ │ │ + │ │ MemPort │ │ vortex:: │ │ │ + │ ▼ │ │ Processor │ │ │ + │ ┌─────────────┐ │ │ (SimX v3) │ │ │ + │ └─────────────┘ │ │ │ │ │ + │ │ │ Cluster[]│ │ │ + │ │ │ Cache │ │ │ + │ │ │ Memory ─┼──┼──┼─▶ RAM (Vortex VRAM, + │ │ └───────────┘ │ │ held inside the + │ │ ▲ │ │ device — separate + │ │ │ cycle() │ │ address space from + │ │ ┌┴──────────┐ │ │ gem5 DRAM) + │ │ │ tick │ │ │ + │ │ │ (gem5 │ │ │ + │ │ │ event) │ │ │ + │ │ └───────────┘ │ │ + │ └─────────────────┘ │ + └───────────────────────────────────────────────┘ +``` + +### 3.1 The plug-in boundary + +The Vortex side exposes **one** plug-in unit: `libvortex-gem5.so`. 
It +is built from the same `sim/simx/*.{cpp,h}` sources as the default +`simx` binary, plus a single new wrapper file +(`sim/simx/gem5/vortex_gpgpu.{cpp,h}`) that holds: + +- A `vortex::Gem5Wrapper` C++ class that owns a `vortex::Processor`, + a `vortex::RAM` (the device VRAM), and a thin `cycle()` entry + point — exactly mirroring `vortex::VortexSimulator` in + `sim/simx/sst/`. +- A C-ABI shim (`vortex_gem5_create()`, `vortex_gem5_tick()`, + `vortex_gem5_mmio_write64()`, `vortex_gem5_mmio_read64()`, + `vortex_gem5_dma_read()`, `vortex_gem5_dma_write()`, …) so the + gem5-side SimObject is decoupled from C++ ABI changes in + `vortex::Processor`. **The C ABI is the contract;** changing it + requires a coordinated update of the gem5-side SimObject. + +The gem5 side is **one** SimObject + **one** Python file, both shipped +in this repo at `sim/simx/gem5/`: + +- `vortex_gpgpu_dev.{cc,hh}` — subclasses `gem5::DmaDevice` (which + itself subclasses `PioDevice`). Holds an opaque + `vortex_gem5_handle_t`; on `tick()`, calls `vortex_gem5_tick()`. PIO + reads/writes decode the OPAE MMIO offsets and forward to + `vortex_gem5_mmio_*`. DMA reads/writes triggered by + `CMD_MEM_{READ,WRITE}` use gem5's `DmaPort` and copy bytes into the + device VRAM via `vortex_gem5_dma_*`. +- `VortexGPGPU.py` — `gem5.SimObject` definition with `pio_addr`, + `pio_size`, `pio_latency`, `dma_latency`, `clock`, `library` + (path to `libvortex-gem5.so`), and `kernel` (path to `*.vxbin` — + loaded into VRAM at boot, in lieu of the runtime upload path, for + smoke tests). + +`ci/gem5_install.sh.in` fetches a pinned gem5 release +(see §3.7 for version), copies the two files into +`$GEM5_HOME/src/dev/vortex/`, drops a one-line `SConscript`, and runs +`scons build/X86/gem5.opt` and `scons build/ARM/gem5.opt` (both ISAs, +per §3.6). + +**Nothing upstream of `vortex_gem5_create()` knows gem5 exists.** This +satisfies §1.3. + +### 3.2 The cycle interface + +`Processor::cycle()` does **not exist** in v3 today.
It is a direct +prerequisite of both the SST integration (per +[sst_simx_v3_proposal.md §3.2](sst_simx_v3_proposal.md)) and this +proposal. The signature and shape are identical to what SST needs: + +```cpp +// processor.h — public additions +bool cycle(); // advance one cycle; returns false when nothing is running +Memory* memsim(); // for optional gem5/SST memory-mirroring hooks +``` + +```cpp +// processor.cpp — implementation +bool ProcessorImpl::cycle() { + if (!is_cycle_initialized_) { + SimPlatform::instance().reset(); + this->reset(); + kmu_->start(); // dispatch CTAs into the cluster + is_cycle_initialized_ = true; + } + SimPlatform::instance().tick(); + return this->any_running(); +} + +Memory* ProcessorImpl::memsim() { return memsim_.get(); } +``` + +The two pieces (`SimPlatform::reset()` → `start_kmu()` → +`SimPlatform::tick()` and `any_running()`) are already factored on +`Processor` from Round 6 DTM work. `cycle()` just packages them into a +single-cycle step. + +**Reuse from DTM work:** `start_kmu()` and `any_running()` are already +public on `Processor`. We add `cycle()` and `memsim()` and that is the +entire SimX-side API surface required by both SST and gem5. + +### 3.3 The MMIO command protocol + +Identical to `sw/runtime/opae` v3 (the OPAE driver), reusing +`hw/syn/altera/opae/vortex_afu.h`: + +| Offset | Name | Direction | Purpose | +|---|---|---|---| +| `MMIO_CMD_TYPE` | `CMD_*` | W64 | Dispatch one of: `MEM_READ`, `MEM_WRITE`, `RUN`, `DCR_WRITE`, `DCR_READ` | +| `MMIO_CMD_ARG0..2` | command-specific | W64 | DCR addr / device addr / size / value | +| `MMIO_STATUS` | bit0=busy | R64 | Polled by runtime's `ready_wait` | +| `MMIO_DCR_RSP` | response | R64 | Result of `CMD_DCR_READ` (used for cache-flush) | +| `MMIO_DEV_CAPS` / `MMIO_ISA_CAPS` | caps bitfield | R64 | Encoded device capabilities | + +The runtime issues commands by writing args first, then `CMD_TYPE` +(B14 fix: emit a memory barrier before the type write — `mfence` on +x86, `dmb sy` on ARM).
The device latches +on `CMD_TYPE`, performs the action synchronously (PIO write returns +when the operation is enqueued, or completes synchronously for +fast ones like `DCR_WRITE`), and clears the status busy bit when done. + +`CMD_MEM_{READ,WRITE}` use the staging-buffer protocol from the +capstone paper Fig. 5 (§3.4 below). + +### 3.4 The staging-buffer protocol + +The gem5 device exposes a PIO-addressable register `MMIO_PINNED_BASE` +that returns the base address of a pinned region inside gem5's host +address space. The runtime, on `vx_mem_alloc`, lazily picks a slice of +that region as a staging buffer. + +For a `vx_copy_to_dev(host_ptr, dev_addr, size)`: +1. Runtime `memcpy(staging_buf, host_ptr, size)`. +2. Runtime writes `staging_buf_addr`, `dev_addr`, `size` to + `MMIO_CMD_ARG{0,1,2}`. +3. Runtime writes `CMD_MEM_WRITE` to `MMIO_CMD_TYPE`. +4. Device's PIO handler enqueues a `gem5::DmaPort::dmaAction()` read + from `staging_buf_addr` into a local scratch. +5. On DMA completion, the device copies the scratch bytes into Vortex's + `RAM` at `dev_addr` (via `RAM::write`). +6. Device clears the status busy bit. +7. Runtime polls `MMIO_STATUS` until busy=0. + +`vx_copy_from_dev` is the reverse, with **cache flush first** (B9): +the runtime issues `CMD_DCR_READ(VX_DCR_BASE_CACHE_FLUSH, cid)` for +every core before the `CMD_MEM_READ`. The device's DCR-read handler +plumbs through to `Processor::dcr_read`, which already invokes +`flush_caches()` for the cache-flush DCR +([processor.cpp:251–258](../../sim/simx/processor.cpp#L251)). + +This is the same protocol the v3 OPAE runtime already uses, so the +runtime under `sw/runtime/gem5/` differs from `sw/runtime/opae/` only +in: +- The `driver.{cpp,h}` backend (gem5 mmaps a `/dev/vortex_gem5` + character device path **OR**, in SE-mode, gem5 sets up the device's + PIO/DMA windows directly in the simulated process's address space — + see §3.6). 
+- The lack of an `fpgaPrepareBuffer` API (the device exposes the + pinned region itself; no per-call buffer allocation by an OPAE + layer). + +### 3.5 Build-time gating + +`USE_GEM5=1` make variable controls compilation of: +- `sim/simx/gem5/vortex_gpgpu.{cpp,h}` (the C ABI wrapper). +- Link target `libvortex-gem5.so` produced alongside `libsimx.so` + (mirrors the SST `libvortex.so` pattern in `sim/simx/Makefile`). + +`USE_GEM5=1` does **not** affect the default build: +`make -C sim/simx` (no flag) still produces a stand-alone `simx` +binary with no gem5 dep. Per §1.4. + +The host-side runtime supports both x86 (native) and ARM (cross- +compiled) targets via a `HOST_ARCH` switch: +``` +make -C sw/runtime/gem5 # x86 default +make -C sw/runtime/gem5 HOST_ARCH=x86_64 # explicit x86 +make -C sw/runtime/gem5 HOST_ARCH=aarch64 # AArch64 cross +make -C sw/runtime/gem5 HOST_ARCH=armhf # ARMv7 cross +``` +producing `libvortex-gem5-{x86_64,aarch64,armhf}.so`. Test scripts +select the matching `(gem5.opt, libvortex-gem5-*.so)` pair via the +`HOST_ARCH` make variable. Native x86 needs no toolchain install; ARM +requires `gcc/g++-aarch64-linux-gnu` (or `-arm-linux-gnueabihf` for +ARMv7), which `ci/gem5_install.sh` installs as part of Phase 0. + +### 3.6 gem5 SE-mode wiring + ISA selection + +**Host ISA: both x86 and ARM, equally first-class** (decision recorded +2026-05-16 after Phase 0 prototyping). Phase 0's `ci/gem5_install.sh` +builds `build/X86/gem5.opt` *and* `build/ARM/gem5.opt`; phases 4–6 +test both. Rationale: + +- **x86** is the path of least resistance for users — no + cross-toolchain, native `g++` builds `sw/runtime/gem5/`, faster + gem5 CPU model, and PCIe is canonical on x86 (relevant to the + Phase 5+ upgrade path below). +- **ARM** is the research-narrative path matching the capstone paper + (Injae Shin 2025) and actually-deployed ARM+accelerator HPC + platforms (Grace Hopper, Fugaku, Graviton, Apple Silicon). 
Kept + as a first-class matrix variant; not a stretch goal. + +Three MMIO/DMA paths exist; this proposal picks one for the initial +work and notes the others as future upgrades: + +| Path | Description | Status in this proposal | +|---|---|---| +| **1. SE-mode + custom PIO+DMA wiring** | The device is a `DmaDevice` subclass attached to `system.membus` at a configurable `pio_addr` (default `0x20000000`, matching the legacy paper). Host binary touches the address via `mmap`/inline asm. Works in both x86 SE-mode and ARM SE-mode. | **Phase 2–6: this is the design.** Matches legacy paper, lightweight, fast iteration. | +| **2. FS-mode + PCIe device** | Subclass `PciDevice` (which already inherits `DmaDevice`); BARs expose MMIO, DMA for staging. Full Linux boot inside gem5 with a tiny PCI kernel module to bind the device. | **Phase 5+ upgrade.** Realistic accelerator-modeling story expected by x86 users. The C ABI committed in Phase 2 is shape-compatible — `PciDevice` and the custom `DmaDevice` both use the same `vortex_gem5_dma_*` callbacks; only the gem5-side wrapper class differs. | +| **3. `/dev/vortex_gem5` pseudo-file** | The gem5 device implements `SyscallReturn open(...)` + `mmap` for a synthetic device path. Runtime `open("/dev/vortex_gem5", O_RDWR)` + `mmap`. | Out of scope. Closest to how real OPAE drivers work but requires a custom syscall handler in gem5; cost outweighs the benefit when Path 1 already works. | + +**Doorbell queues** are a Phase 7+ realism upgrade orthogonal to the +transport choice above. AMD GPU (gem5 `src/dev/amdgpu/`, derived +from `PciEndpoint`) and NVIDIA-style modern accelerators use a ring +buffer in host DRAM plus a single MMIO "doorbell" write per dispatch: +the host appends commands to the ring, then writes the new tail +offset to the doorbell register; the device asynchronously walks the +ring and processes commands. 
The Phase 2-6 design instead uses +**status polling** — the host writes args + `CMD_TYPE`, then polls +`MMIO_STATUS` until done — which matches the legacy OPAE FPGA driver. +Polling is fine for the capstone-paper scope (small kernels, one at +a time) but burns simulated cycles on the spin. If later research +wants batched-dispatch realism comparable to AMD GPU, the upgrade +swaps the OPAE MMIO command set for a ring + doorbell protocol; the +C ABI in Phase 2 stays compatible (a new `vortex_gem5_doorbell_ring(handle, tail)` +entry point alongside the existing `vortex_gem5_mmio_*`). + +### 3.7 gem5 version pinning + +`ci/gem5_install.sh.in` pins gem5 to v25.0.0.1 (a patch release on top +of v25.0.0, the most recent stable release as of 2026-05). The pinned +tag goes in `VERSION` alongside +`TOOLCHAIN_REV` and `SST_VER` — bumps require a CI re-run on the +self-hosted runner first (small risk of API drift on gem5's +`DmaDevice`/`PioDevice` between major releases). **Picking and +validating this pin is the first deliverable of Phase 0** — every +other phase is a no-op if Phase 0 reveals that the pinned release no longer +supports SE-mode PIO mapping or the SimObject install path we depend +on. + +### 3.8 Why this is not just a copy of the SST pattern + +SST and gem5 are similar in shape (external simulator drives the +Vortex clock through a C++ wrapper around `Processor::cycle()`) but +differ in three load-bearing ways: + +1. **The host process is simulated under gem5.** Under SST the host + "process" is the SST Python script itself, running natively on the + developer's machine. Under gem5 the host is a userspace process + (x86 or ARM, per §3.6) running inside the gem5 model. So the gem5 + integration also needs a host-side runtime under `sw/runtime/gem5/` + (native compile for x86, cross-compile for ARM); SST does not. + (This is the bulk of the work that makes gem5 the bigger project — + see §9 effort estimate.) +2.
**Memory is in two address spaces.** Under SST, the SimX `Processor` + and any optional SST memHierarchy share the same simulator. Under + gem5, the host CPU's DRAM is a gem5 `AddrRange`, the Vortex VRAM is + a `RAM` inside the device, and the only way bytes cross between + them is via DMA through the device. The staging-buffer protocol + (§3.4) implements this; SST has no equivalent. +3. **PIO bus integration.** SST's `StandardMem` interface is the + only one we plug into; gem5 has separate `PioPort` and `DmaPort` + with different timing models. The wrapper must manage both. + +--- + +## 4. Phasing + +Each phase is independently shippable and validated. The work follows +the same shape as the SST integration in +[sst_simx_v3_proposal.md §4](sst_simx_v3_proposal.md): **environment +first**, API + library second, gem5-side wiring third, ARM runtime +fourth, CI last. + +### Phase 0 — gem5 environment + API survey *(derisking; nothing else can start until this is done)* + +The legacy `vortex_gem5` was built against a forked gem5 that no +longer exists publicly. Before we design the C ABI in Phase 2 or +write a single line of `DmaDevice` glue in Phase 3, we need a +known-good gem5 build on the bench so the API surface we are about +to commit to is **real**, not assumed-from-headers-we-haven't-read. +This is the "solve gem5 setup first" phase. + +Concretely: + +- **Pick and pin the gem5 version.** Default target: v25.0.0.1 + (patch release on top of v25.0.0, most recent stable as of 2026-05). + Pin the tag in `VERSION` alongside `TOOLCHAIN_REV` and `SST_VER`: + ``` + GEM5_REV=v25.0.0.1 + ``` +- **Write `ci/gem5_install.sh.in`** (no Vortex integration yet — just + the install). 
Mirrors the structure of `ci/sst_install.sh.in`: + - `apt install scons python3-dev python3-pip libprotobuf-dev + protobuf-compiler libprotoc-dev libgoogle-perftools-dev m4 + libboost-all-dev gcc-aarch64-linux-gnu g++-aarch64-linux-gnu` + (gem5's documented build deps + ARM cross-toolchain for the ARM + matrix variant). + - Fetch gem5 working tree at `$GEM5_REV` into `$TOOLDIR/gem5`. + - `scons build/X86/gem5.opt -j$(nproc)` and + `scons build/ARM/gem5.opt -j$(nproc)` — **both ISAs by default** + per the dual-ISA decision in §3.6. Targets selectable via + `GEM5_TARGETS="X86"` / `"ARM"` / `"X86 ARM"`. + - Export `GEM5_HOME=$TOOLDIR/gem5` to `~/.bashrc`. +- **Validate the X86 native compiler produces SE-mode binaries.** + Trivial — `gcc -static -o /tmp/hello-x86 sim/simx/gem5/hello.c`, + then run it under `gem5.opt` with a config modeled on + `configs/example/gem5_library/arm-hello.py` (substituting + `ISA.X86`). Confirm exit code 0 and + the expected stdout. +- **Validate the ARM cross-toolchain produces SE-mode binaries.** + Cross-compile `hello.c` with `aarch64-linux-gnu-gcc -static -o + /tmp/hello-arm`, run under + `build/ARM/gem5.opt configs/example/gem5_library/arm-hello.py` + (or the deprecated SE script). Confirms the cross-toolchain + produces something gem5 ARM-mode can load. +- **Read the gem5 source for the API surface we are about to use** + and record findings in a short scratch file + `sim/simx/gem5/gem5_api_notes.md` (not committed to docs/, just a + Phase 0 deliverable): + - `src/dev/io_device.hh` — `PioDevice::read`/`write` signatures + in v25.0.0. Compare to what the legacy paper assumed. + - `src/dev/dma_device.hh` — `DmaDevice::dmaAction`, `DmaPort` + timing model. Confirm 64-bit address support, async completion + callback shape. + - `src/python/m5/objects/Device.py` — SimObject Python bindings. + Confirm that out-of-tree `src/dev/vortex/SConscript` is + picked up by `scons build/ARM/gem5.opt` (this is the install + mechanism we rely on in Phase 3).
+ - `configs/example/se.py` — how SE-mode wires a CPU to a + `Workload`. Confirm that we can attach a `PioDevice` and have + the SE-mode loader map its PIO range into the workload's address + space (the legacy paper's `0x20000000` magic). If this is no + longer supported, the design changes — better to know now than + in Phase 3. +- **Smoke-build a trivial out-of-tree SimObject** to prove the + install mechanism end-to-end. Three files + (`Dummy.{cc,hh,py}` + `SConscript`) under `sim/simx/gem5/dummy/`, + installed by `sim/simx/gem5/install.sh` (Phase 0 only ships the + installer; the real SimObject lands in Phase 3). After + `ci/gem5_install.sh` re-runs, `gem5.opt --list-sim-objects` shows + `Dummy`. Delete `dummy/` once verified — it was scaffolding. + +**Validation:** +- `ci/gem5_install.sh` finishes successfully on the self-hosted + runner. Wall time recorded in `gem5_api_notes.md` (drives CI + caching strategy in Phase 6). +- `$GEM5_HOME/build/ARM/gem5.opt configs/example/se.py + --cmd ./hello-arm` exits 0. +- `gem5.opt --list-sim-objects` lists the dummy SimObject installed + via `sim/simx/gem5/install.sh`. +- `gem5_api_notes.md` documents the `DmaDevice` / `PioDevice` / + `EventFunctionWrapper` signatures we will commit to in Phase 2's + C ABI design. + +**Why this is its own phase:** if any of those validations fails +(e.g. gem5 v25 has dropped SE-mode PIO mapping, or the SimObject +install mechanism has changed), the rest of the proposal needs +redesign before code lands. Phase 0 is a ~1-day gate, not a tracked +deliverable; everything downstream depends on its outputs. + +### Phase 1 — `Processor::cycle()` + `Memory*` accessor + +Prerequisite shared with SST. Can run in parallel with Phase 0 +(no gem5 dependency) and lands first into the SimX-side codebase. + +- Add `Processor::cycle()` and `Memory* Processor::memsim()` as in + §3.2. This is a ~50-line patch to `processor.{cpp,h}` and + `processor_impl.h` plus an `is_cycle_initialized_` bool. 
+- Add `Memory::set_pre_send_hook()` (already in v3 per + `sim/simx/mem/memory.h:42` — verify still there; if so, this part + of Phase 1 is a no-op). +- Update SST's `vortex_simulator.cpp` to use the new public + `Processor::cycle()` API (currently calls `proc_->cycle()` which + does not compile against `processor.h` HEAD — see + `sim/simx/sst/vortex_simulator.cpp:64`). **This is a pre-existing + bug that Phase 1 fixes for both integrations.** + +**Validation:** `make -C sim/simx` (default), `make -C sim/simx +USE_SST=1`, and `make -C sim/simx USE_GEM5=1` all build. SST tests +that previously failed to link now link and run (`sst +ci/sst_test_vortex_hello.py` passes). + +### Phase 2 — `libvortex-gem5.so` + C ABI + +**Prerequisite: Phase 0 complete.** The C ABI is designed *against* +the `DmaDevice`/`PioDevice` shapes recorded in +`gem5_api_notes.md`, not from headers we haven't read. + +- Create `sim/simx/gem5/vortex_gpgpu.{cpp,h}` mirroring + `sim/simx/sst/vortex_simulator.{cpp,h}` shape: + - Owns a `Processor`, a `RAM` (device VRAM at `MEM_PAGE_SIZE`). + - Exposes a C ABI (`vortex_gem5_*`) sufficient for the gem5 device + to MMIO/DMA/tick it. ABI signatures match what gem5's + `DmaDevice::dmaAction` and `PioDevice::read`/`write` need to + call into (per Phase 0 survey). +- Add `USE_GEM5=1` build target to `sim/simx/Makefile` producing + `libvortex-gem5.so` (no SST symbols; no `sst-core` link). Pattern: + duplicate the `ifeq ($(USE_SST),1)` block. +- Add a tiny in-process smoke driver + `sim/simx/gem5/gem5_smoke_main.cpp` (built with the lib) that: + 1. Loads a `.vxbin` via the C ABI. + 2. Ticks until `cycle()` returns false. + 3. Reads the MPM exit code via DCR_READ. + + This is the "library compiles and a kernel runs through it without + gem5 installed" smoke test (§6.2). + +**Validation:** +- `make -C sim/simx USE_GEM5=1` builds. +- `LD_LIBRARY_PATH=. ./gem5_smoke hello.vxbin` returns 0. 
+- `make -C sim/simx` (no flag) still builds and `./simx hello.vxbin` + returns 0 (no regression on default). + +### Phase 3 — gem5 SimObject + Python config + +**Prerequisite: Phases 0 + 2 complete.** The install mechanism is +already proven by Phase 0's dummy SimObject; this phase replaces +the dummy with the real device. + +- `sim/simx/gem5/vortex_gpgpu_dev.{cc,hh}` — the gem5 `DmaDevice` + subclass. PIO `read`/`write` decode MMIO offsets and call + `vortex_gem5_mmio_*`. DMA actions are triggered by `CMD_MEM_*`. A + registered `EventFunctionWrapper` re-schedules itself every + `clock_period_ticks()` and calls `vortex_gem5_tick()`. +- `sim/simx/gem5/VortexGPGPU.py` — Python SimObject definition. +- `sim/simx/gem5/SConscript` — for gem5's scons build. +- `sim/simx/gem5/install.sh` — copies the four files above into + `$GEM5_HOME/src/dev/vortex/`. (Phase 0 already wrote this for the + dummy SimObject; just extend it.) +- Update `ci/gem5_install.sh.in` to re-run `install.sh` and rebuild + `build/X86/gem5.opt` and `build/ARM/gem5.opt` after the Vortex + SimObject lands. + +**Validation:** `ci/gem5_install.sh` succeeds with the real +SimObject installed. `gem5.opt --list-sim-objects` shows +`VortexGPGPU`. `gem5.opt configs/example/se.py --help` accepts +`VortexGPGPU` parameters. + +### Phase 4 — Host runtime (`sw/runtime/gem5/`, x86 + ARM) + +- New backend mirroring `sw/runtime/opae/` shape: + - `vortex.cpp` — implements the `vx_*` callbacks against the OPAE + MMIO protocol (§3.3), but the `driver.{cpp,h}` underneath does + raw `mmap`/MMIO writes to the PIO address rather than calling + `libopae`. + - `Makefile` — selects compiler from `HOST_ARCH`: + - `x86_64` (default): native `g++` + - `aarch64`: `aarch64-linux-gnu-g++` + - `armhf`: `arm-linux-gnueabihf-g++` +- Cache-flush integration (B9): the v3 `download` path issues + `CMD_DCR_READ(VX_DCR_BASE_CACHE_FLUSH, cid)` per core before + `CMD_MEM_READ`.
+- MMIO ordering fence (B14): emit the right barrier for `HOST_ARCH`: + - `x86_64`: `__asm__ volatile ("mfence" ::: "memory")` + - `aarch64`: `__asm__ volatile ("dmb sy" ::: "memory")` + - `armhf`: `__asm__ volatile ("dmb sy" ::: "memory")` + Provide a `vortex_gem5_mmio_fence()` inline helper that compiles + to the right barrier per `HOST_ARCH`. +- Multi-target build (B13 obsolete; replaced by clean multi-target + support): `HOST_ARCH` make variable. + +**Validation:** +- `make -C sw/runtime/gem5` (default `HOST_ARCH=x86_64`) builds. + `file build/sw/runtime/libvortex-gem5-x86_64.so` confirms x86-64 + ELF. +- `make -C sw/runtime/gem5 HOST_ARCH=aarch64` builds (requires + cross-toolchain, installed by Phase 0's `ci/gem5_install.sh`). + `file build/sw/runtime/libvortex-gem5-aarch64.so` confirms + AArch64 ELF. + +### Phase 5 — End-to-end gem5 test + +- `ci/gem5_test_vortex_hello.py` — gem5 Python config that wires: + - A `System` with one `TimingSimpleCPU` core in SE mode (host ISA + selected at runtime via `--host-arch=x86|arm`). + - A `VortexGPGPU` device on `system.membus` at + `pio_addr=0x20000000`, mapped into the process's address space. + - The native-or-cross-compiled test binary + (`tests/kernel/hello/hello` re-linked against the matching + `libvortex-gem5-{x86_64,aarch64}.so`) as the SE-mode workload. +- `ci/gem5_test_vortex_vecadd.py` — same with a vecadd kernel that + actually exercises DMA in both directions and the cache-flush path. +- Add a top-level wrapper test in `tests/regression/gem5/` (mirrors + `tests/regression/dxa/`) that builds the kernels and invokes the + Python scripts for both `HOST_ARCH=x86_64` and `HOST_ARCH=aarch64`. + +**Validation:** +- `build/X86/gem5.opt ci/gem5_test_vortex_hello.py --host-arch=x86` + exits with code 0 and the expected `Hello World` on stdout. +- `build/ARM/gem5.opt ci/gem5_test_vortex_hello.py --host-arch=arm` + exits with code 0 and the expected `Hello World` on stdout. 
+- Both `ci/gem5_test_vortex_vecadd.py` variants exit 0 with the + vecadd result buffer matching the CPU-computed reference (checked + by the test binary itself). + +### Phase 6 — CI integration + +- Add `gem5()` function to `ci/regression.sh.in` (mirroring `sst()` + on line ~80): + ```bash + gem5() + { + echo "begin gem5 tests..." + + make -C sim/simx USE_GEM5=1 + make -C tests/kernel + + # X86 default: native compile, no cross-toolchain needed. + make -C sw/runtime/gem5 HOST_ARCH=x86_64 + cp sim/simx/libvortex-gem5.so $GEM5_HOME/build/X86/ + + timeout 120 $GEM5_HOME/build/X86/gem5.opt \ + ci/gem5_test_vortex_hello.py --host-arch=x86 + timeout 120 $GEM5_HOME/build/X86/gem5.opt \ + ci/gem5_test_vortex_vecadd.py --host-arch=x86 + + # ARM matrix entry — requires gcc-aarch64-linux-gnu (installed + # by ci/gem5_install.sh in Phase 0). + if [ -n "$VORTEX_GEM5_ARM" ]; then + make -C sw/runtime/gem5 HOST_ARCH=aarch64 + cp sim/simx/libvortex-gem5.so $GEM5_HOME/build/ARM/ + + timeout 120 $GEM5_HOME/build/ARM/gem5.opt \ + ci/gem5_test_vortex_hello.py --host-arch=arm + timeout 120 $GEM5_HOME/build/ARM/gem5.opt \ + ci/gem5_test_vortex_vecadd.py --host-arch=arm + fi + + echo "gem5 tests done!" + } + ``` + Per `feedback_test_timeout_120s.md`, every test invocation is + `timeout 120`-capped. ARM is opt-in via `VORTEX_GEM5_ARM=1` so + hosted CI without the ARM toolchain still passes; the self-hosted + runner sets the env var. +- Add `gem5-x86` and `gem5-arm` matrix entries to + `.github/workflows/ci.yml` (both run on the self-hosted runner + only, per + [`project_ci_machine.md`](../../../../.claude/projects/-home-blaisetine-dev/memory/project_ci_machine.md); + the hosted runners do not have enough resources for a full + gem5 build). +- Add `ci/gem5_install.sh` to the Apptainer recipe + ([`miscs/apptainer/vortex.def`](../../miscs/apptainer/vortex.def)) + so the .sif has gem5 pre-installed. 
**Out of scope for Phase 6; + see §8.** + +**Validation:** `./ci/regression.sh --gem5` runs both +`gem5_test_vortex_*.py` cleanly on the self-hosted runner. + +### Phase 7 — Documentation + +- `docs/gem5_integration.md`: + - How to install gem5 v25.0.0 (point at `ci/gem5_install.sh`). + - How to build with `USE_GEM5=1`. + - How to cross-compile the ARM runtime + kernels. + - How to write a gem5 Python script that drives `VortexGPGPU`. + - The single-source-of-truth invariant (§1.1) and the cache-flush + contract (§3.4) for future hackers who might be tempted to skip + the flush "because it's fast". + +--- + +## 5. Authorship / history mechanics + +- `sim/simx/gem5/vortex_gpgpu.{cpp,h}` and the gem5-side + `vortex_gpgpu_dev.{cc,hh}` + `VortexGPGPU.py`: **new files**, no + upstream equivalent. Commit body cites: + > Replaces legacy `vortex_gem5/sim/simx/simx_device.{cpp,h}` + > (Injae Shin, UCLA 2025-05-22 commit 91dcf17) and the gem5-side + > SimObject described in his capstone report. + > Re-implemented for SimX v3 Processor::cycle() API. Original + > design intent (OPAE MMIO + pinned staging buffer + ARM SE-mode + > runtime) preserved. + +- `sw/runtime/gem5/`: **new files** mirroring `sw/runtime/opae/`'s + shape. Same authorship attribution as above; the file-level + similarity is to `sw/runtime/opae`, not to `runtime/opaesimx` from + the legacy tree (which has the bugs catalogued in §2.2). + +- `ci/gem5_install.sh.in` and `ci/gem5_test_vortex_*.py`: new files; + follow the structure of `ci/sst_install.sh.in` and + `ci/sst_test_vortex_*.py`. `ci/gem5_install.sh.in` lands in + Phase 0 (initially installing the dummy SimObject); the test + scripts land in Phase 5. + +- `Processor::cycle()` / `Processor::memsim()`: new public API on + `Processor`, lands in Phase 1. Single commit on the simx_v3 line; + mentioned as a prerequisite of both SST and gem5 integrations in + the commit body. 
+ +- `sim/simx/gem5/gem5_api_notes.md`: Phase 0 deliverable, scratch + notes only — **not** committed to `docs/`. Captures the gem5 + v25.0.0 API surface our C ABI design depends on; deleted once + Phase 2 commits the C ABI itself. + +This is consistent with the rule established in +[`feedback_keep_ours_in_merge.md`](../../../../.claude/projects/-home-blaisetine-dev/memory/feedback_keep_ours_in_merge.md): +the legacy code is not a "theirs" we apply; it is a prior design that +informs our redesign. Credit the designer in the body; do not pretend +the bits are a port. + +--- + +## 6. Validation + +Each phase ends with the validation listed in §4. Across phases the +acceptance criteria are: + +1. **No-gem5 build identical.** `make -C sim/simx` (default flags) + produces a binary identical in behavior to today's on the + regression suite (io_addr, arith, vecadd, mpi_vecadd, tensor*, + dxa, dtm). The Phase 0 `Processor::cycle()` addition must not + change `Processor::run()` semantics — verify by trace-diffing + `vecadd` before and after Phase 0. + +2. **In-process smoke (no gem5 needed).** `gem5_smoke hello.vxbin`, + the Phase 2 driver, runs the same kernels the `simx` binary runs + and produces matching output. This is the unit-test layer that + shakes out C-ABI breakage without requiring gem5 to be installed + beyond what Phase 0 already set up. + +3. **End-to-end gem5 PASS.** Both `gem5_test_vortex_hello.py` and + `gem5_test_vortex_vecadd.py` exit 0 under the pinned gem5 v25.0.0.1, + on *both* `build/X86/gem5.opt` and `build/ARM/gem5.opt`, timed out + at 120 s (each). The pin and the install path are both already + validated by Phase 0; this validation just exercises the real + `VortexGPGPU` SimObject end-to-end. + +4. **No `core->mem_read` / `core->mem_write` regressions.** Phase 5 + of v3 forbids those + ([simx_v3_proposal.md §3.3](simx_v3_proposal.md)). 
The grep gate + from + [master_merge_v3_proposal.md §8 R1](master_merge_v3_proposal.md) + applies here: every commit must pass + `git diff
<base>..<head> -- sim/simx/ | grep -E 'core->mem_(read|write)' | wc -l == 0`.
+
+5. **Single source of truth check.** The gem5 device's pinned region
+   is `RAM`-backed (i.e., a slice of host memory exposed to gem5's
+   DRAM AddrRange via `mmap`); Vortex's VRAM is the `RAM` attached to
+   `Memory` inside `vortex::Processor`. **There is no shadow image.**
+   `vortex_gem5_dma_{read,write}` copies bytes between the two via
+   `RAM::read`/`RAM::write` — no additional buffer level. Mistakes
+   here re-introduce the §1.1 violation.
+
+---
+
+## 7. Risks
+
+| # | Risk | Mitigation |
+|---|---|---|
+| R1 | gem5 v25.0.0 `DmaDevice` API drifts in v26+. | Pin in `ci/gem5_install.sh.in` (Phase 0). Document the pin in `docs/gem5_integration.md`. CI catches regressions on bump. |
+| R2 | ARM cross-compiler not available in the Apptainer recipe. | Phase 6 says gem5 CI is on the self-hosted runner only, which already has the ARM toolchain per [`project_ci_machine.md`](../../../../.claude/projects/-home-blaisetine-dev/memory/project_ci_machine.md). Apptainer absorption is out of scope (§8). |
+| R3 | `MMIO_PINNED_BASE` PIO range collides with another gem5 device's PIO range. | Pick a default (`0x20000000`, matching the legacy paper) but make it a Python-configurable parameter (`pio_addr`). Phase 0 confirms the default is reachable from SE-mode in v25.0.0; document collisions in the integration guide. |
+| R4 | The gem5 ARM CPU model reorders MMIO writes, breaking the args-then-CMD_TYPE protocol (B14). | `DMB SY` (AArch64) or `dmb sy` (ARMv7) before `CMD_TYPE` write in the runtime. Add a regression test that issues a back-to-back `CMD_MEM_WRITE` + `CMD_RUN` and verifies the kernel observed the correct args. |
+| R5 | Future contributor re-introduces the host-pointer-MMIO hack (B3) "for convenience". | This proposal explicitly deletes that abstraction (§2.4). The follow-up `docs/gem5_integration.md` (Phase 7) should call this out. |
+| R6 | `Processor::cycle()` for a never-launched kernel hangs (no `kmu_->start()` because `is_cycle_initialized_` was never reset). | Reset is implicit on first `cycle()`. If a second kernel is launched in the same device lifetime (rare; supported by gem5 only for back-to-back tests), the gem5 device's `CMD_RUN` handler must call a new `Processor::reset_cycle()` that clears `is_cycle_initialized_`. Add this in Phase 2. |
+| R7 | The cross-compiled ARM `libvortex-gem5.so` and the gem5-loaded `libvortex-gem5.so` (x86) have the same SONAME and get confused at install time. | Give the cross-compiled ARM build an architecture suffix (`libvortex-gem5-aarch64.so`) and keep the gem5-loaded build unsuffixed (`libvortex-gem5.so`). Document in Phase 2+4. |
+| R8 | gem5's `DmaPort` request size is unbounded; a 1 GB `CMD_MEM_WRITE` would burn simulated time. | Cap per-transaction size at 1 MB in the device's `CMD_MEM_*` handler; chunk larger requests into multiple DMA actions. Mirrors how the OPAE `fpgaPrepareBuffer` page-aligns transfers. |
+| R9 | Cache flush via `CMD_DCR_READ` returns synchronously per core; for `NUM_CORES * NUM_CLUSTERS = 16` that is 16 PIO round-trips per download. | Acceptable for Phase 5; can be batched into a single `CMD_FLUSH_ALL` MMIO later if measured to hurt. |
+| R10 | The gem5 SimObject install (`sim/simx/gem5/install.sh`) modifies the gem5 source tree in place; rebuilds can leave stale artifacts. | `install.sh` is idempotent (copies, doesn't patch); `ci/gem5_install.sh` does a clean `scons -c` before re-build on toolchain version mismatch. Phase 0 proves the install path end-to-end with a dummy SimObject before any real code depends on it. |
+| R11 | Phase 0 reveals gem5 v25.0.0 has dropped SE-mode PIO mapping (the legacy `0x20000000` magic). | Switch design to the `/dev/vortex_gem5` pseudo-file path (§3.6 option 2) before Phase 2 commits the C ABI. Cost: ~1 week added to Phase 0 redesign window. Acceptable because Phase 0 is explicitly a gate — no downstream phase has shipped code yet. |
+| R12 | Phase 0 install takes hours on first run; blocks parallel work. | Cache the `$TOOLDIR/gem5-src/build` directory in CI the same way SST and toolchain caches work. Self-hosted runner's local toolchain dir survives across runs. |
+
+---
+
+## 8. Out of scope
+
+- **Apptainer integration.** Adding gem5 + the ARM cross-toolchain
+  to `miscs/apptainer/vortex.def` is a separate concern. Until that
+  is done, `apptainer-ci.yml`'s matrix should not include `gem5`. The
+  self-hosted runner runs the gem5 matrix entry on hosted ci.yml; the
+  Apptainer pipeline skips it. See
+  [`apptainer-ci.yml` policy notes](../../.github/workflows/apptainer-ci.yml).
+
+- **Full-system Linux on gem5.** The capstone paper restricts itself
+  to SE-mode (per the paper's §IIC: "gem5-Vortex's implementation
+  allows users to use gem5's system call emulation (SE) mode"). This
+  proposal does the same. FS-mode requires booting a Linux kernel
+  inside gem5 with a Vortex device driver — possible, but a separate
+  redesign that intersects with kernel-mode driver work the project
+  has not started.
+
+- **Multi-device simulation.** One `VortexGPGPU` per gem5 system.
+  Multi-device support requires per-instance PIO ranges and a runtime
+  side that supports `vx_dev_open` returning >1 handle — the legacy
+  `g_callbacks` global (B11) blocks this on the runtime side, and
+  the device side needs per-instance state isolation. Defer.
+
+- **AMD GPU / NoMali comparison.** The capstone paper compares
+  gem5-Vortex to NoMali (stub GPU) and AMD GPU (full-system). Those
+  comparisons live in the paper; reproducing them as benchmarks is
+  out of scope. Comparing performance to SimX standalone or to the
+  SST integration is also out of scope — separate analysis work.
+
+- **DMA performance modeling.** The capstone paper §V measures DMA
+  delay variation per kernel size. Replicating that as a CI
+  performance gate is out of scope; could be a follow-up perf
+  proposal once the integration is stable.
+
+- **SST + gem5 simultaneous.** Both integrations replace different
+  parts of the harness; running them together is not a use case
+  anyone has asked for. Build flags are mutually exclusive:
+  `USE_SST=1` and `USE_GEM5=1` together is rejected by `sim/simx/Makefile`.
+
+- **gem5 fork branch.** We do not maintain a long-lived fork of gem5.
+  `ci/gem5_install.sh` fetches a clean release tarball and applies
+  our SimObject; if the user wants a persistent gem5 working tree,
+  that is their setup. Avoids the "fork rot" that froze
+  `vortex_gem5`.
+
+- **Runtime gem5/non-gem5 switching.** Keep `USE_GEM5=1` as a
+  build-time switch. A runtime switch would require both `Processor`
+  and a gem5 wrapper in every binary plus a factory; not worth the
+  maintenance cost for a single-device research integration.
+
+---
+
+## 9. Estimated effort
+
+Based on the SST integration in
+[sst_simx_v3_proposal.md §9](sst_simx_v3_proposal.md) (~15–28 h):
+
+- **Phase 0** (gem5 env + API survey + dummy SimObject install):
+  **6–10 h estimated; ✅ COMPLETE 2026-05-16** in ~3 h of
+  attended work + ~25 min of unattended scons build. The wall time to
+  install gem5 was 13 min (ARM) + 11 min (X86) parallel on the
+  self-hosted 64-core runner. All six validations
+  (see `sim/simx/gem5/gem5_api_notes.md`) pass on both ISAs.
+  Key discoveries committed: (1) SE-mode PIO attachment is
+  possible but requires bypassing the `SimpleBoard` high-level
+  API; (2) out-of-tree SimObject install needs **no** top-level
+  SConstruct patch — pure `cp -r`; (3) PCIe (Path 2 in §3.6) is
+  a clean Phase 5+ upgrade because `PciDevice` inherits
+  `DmaDevice` and shares the same C ABI surface.
+- **Phase 1** (`Processor::cycle()` + `memsim()`): **1–2 h estimated;
+  ✅ COMPLETE 2026-05-16** in ~1 h. ~50-line patch to
+  `processor.{cpp,h}` + `processor_impl.h`. Default `make -C
+  sim/simx` and `USE_SST=1` both build clean; `simx hello.vxbin`
+  prints `#0: Hello World!`. **Bonus:** the SST integration was
+  previously broken at the `proc_->cycle()` call site
+  (`sim/simx/sst/vortex_simulator.cpp:64`) and would not link; with
+  Phase 1 in place, `sst ci/sst_test_vortex_hello.py` runs
+  end-to-end and exits cleanly at 4.643 µs simulated time.
+- **Phase 2** (`libvortex-gem5.so` + C ABI + in-process smoke):
+  **4–6 h estimated; ✅ COMPLETE 2026-05-16** in ~1.5 h. Files added:
+  `sim/simx/gem5/vortex_gpgpu.{h,cpp}` (the C ABI library) and
+  `sim/simx/gem5/gem5_smoke_main.cpp` (the in-process smoke driver).
+  `sim/simx/Makefile` extended with a `USE_GEM5=1` gate that
+  produces `libvortex-gem5.so` (1.5 MB) + `gem5_smoke` (16 KB
+  driver linking against the lib). `gem5_smoke hello.vxbin` →
+  `#0: Hello World!`, 4642 cycles, exit_code=0 (correctly read back
+  via `vortex_gem5_vram_read` after the cache-flush DCR path —
+  validating B9 from §2.2 is fixed). Default `make -C sim/simx`
+  unchanged (only `simx` produced; gem5 sources fully gated).
+  `USE_SST=1 USE_GEM5=1` correctly rejected by the Makefile per
+  §8 (mutual exclusion). Side fix: `sw/common/bitmanip.h` was
+  missing two `#include <...>` directives — header
+  hygiene fix benefits any caller (per
+  [feedback_always_correct_fix_not_patch](../../../../.claude/projects/-home-blaisetine-dev/memory/feedback_always_correct_fix_not_patch.md)).
+- **Phase 3** (gem5 SimObject + Python + install.sh): **6–10 h
+  estimated; ✅ COMPLETE 2026-05-16** in ~1.5 h. Files added:
+  `sim/simx/gem5/vortex_gpgpu_dev.{cc,hh}` (gem5 `DmaDevice` subclass
+  with `dlopen` + `EventFunctionWrapper` tick scheduling),
+  `sim/simx/gem5/VortexGPGPU.py` (Python binding with `library=` +
+  `kernel=` parameters), `sim/simx/gem5/SConscript`. Updated
+  `install.sh` to install the real device and remove the Phase 0
+  dummy scaffolding from `$GEM5_HOME` cleanly. New test:
+  `ci/gem5_test_vortex_hello.py` (standalone-device variant, no
+  host CPU needed). Validation: both `build/X86/gem5.opt` and
+  `build/ARM/gem5.opt` import `VortexGPGPU` and run hello.vxbin to
+  completion at tick 4,643,000 (1 GHz clock → 4643 cycles, matching
+  Phase 1 SST + Phase 2 in-process within 1 cycle). **Three
+  harnesses now validated through the same `Processor::cycle()` API:
+  SST, in-process C ABI, and gem5 SimObject.**
+- **Phase 4** (host runtime, x86 + ARM): **6–10 h estimated; ✅ x86
+  PATH COMPLETE 2026-05-16** in ~1 h; aarch64 cross-build gated on
+  the user's `sudo apt install gcc-aarch64-linux-gnu`. Files added:
+  `sw/runtime/gem5/driver.{cpp,h}` (direct MMIO + mmio_fence helper
+  with per-arch barrier; bump-allocator for the pinned region),
+  `sw/runtime/gem5/vortex.cpp` (OPAE-shaped `vx_device` with the
+  full callback table — compile-time caps from VX_config.h since
+  the host runtime and the device library are built from the same
+  source tree), `sw/runtime/gem5/Makefile` (HOST_ARCH ∈
+  {x86_64,aarch64,armhf} → matching cross-compiler; produces
+  `libvortex-gem5-$ARCH.so`). All three B-bugs addressed: B9 (cache
+  flush before download via per-core `dcr_read(VX_DCR_BASE_CACHE_FLUSH,
+  cid)`), B13 (per-arch compiler via `HOST_ARCH`), B14 (mmio_fence()
+  centralised in `issue_cmd()` so every CMD_TYPE write is fenced
+  by construction). Validation: `make -C sw/runtime/gem5 HOST_ARCH=x86_64`
+  → `libvortex-gem5-x86_64.so` (43 KB, ELF 64-bit x86-64, SONAME
+  correct, exports `vx_dev_init` matching the OPAE/SimX backend
+  pattern).
+- **Phase 5** (end-to-end gem5 tests): **4–6 h estimated; ✅ x86
+  PATH COMPLETE 2026-05-17** in ~3 h. The bulk of the work turned
+  out to be the OPAE state machine on the device side (cmd_args
+  latching, busy bit, dcr_rsp register) plus the dmaAction
+  dispatch in the SimObject — the test scripts themselves were
+  small. Files added:
+  `ci/gem5_test_vortex_vecadd.py` (full e2e: x86 CPU + identity-mapped
+  PIO+PIN regions + Process.map() + Vortex device). The Phase 3
+  standalone `ci/gem5_test_vortex_hello.py` continues to pass as a
+  fast smoke test. Phase 5 also extended Phase 2's
+  `sim/simx/gem5/vortex_gpgpu.{cpp,h}` with the full OPAE protocol
+  state machine and Phase 3's `sim/simx/gem5/vortex_gpgpu_dev.cc`
+  with `pop_pending_cmd` → `dmaRead`/`dmaWrite` dispatch.
+  Validation: `vecadd -n16` PASSED!, kernel ran 454 cycles at
+  IPC 0.247 on 4×4 threads/warps. Side fix: glibc's `nanosleep()`
+  routes through `clock_nanosleep` (#230) which gem5 SE-mode
+  doesn't implement — switched the host runtime's poll-loop back-off
+  to `sched_yield()` (in gem5's syscall table). ARM e2e gated on
+  user `sudo apt install gcc-aarch64-linux-gnu g++-aarch64-linux-gnu`
+  (same gate as Phase 4's aarch64 build).
+- **Phase 6** (CI): **2–3 h estimated; ✅ COMPLETE 2026-05-17** in
+  ~30 min. Added `gem5()` function to `ci/regression.sh.in`
+  (mirrors `sst()` shape; builds prerequisites + runs both Phase 3
+  standalone and Phase 5 e2e tests via `timeout 120` per
+  [feedback_test_timeout_120s](../../../../.claude/projects/-home-blaisetine-dev/memory/feedback_test_timeout_120s.md);
+  ARM matrix opt-in via `VORTEX_GEM5_ARM=1`). Added `--gem5` case
+  dispatch + `--gem5` to the show_usage line. Updated
+  `.github/workflows/ci.yml`: appended `ci/gem5_install.sh` to the
+  `Setup Toolchain` step (gated on `cache-toolchain.outputs.cache-hit`
+  like SST), added `Export gem5 paths` step (GEM5_HOME + PATH for
+  `build/X86`), added `gem5` to the `tests.matrix.name` list with
+  `exclude: name=gem5 xlen=64` (the device library is XLEN-locked
+  by the gem5 install; one entry is sufficient). Validation:
+  `./ci/regression.sh --gem5` PASSED end-to-end in **5 seconds**
+  (Phase 3 hello standalone + Phase 5 vecadd e2e, both clean).
+- **Phase 7** (docs): **1–2 h estimated; ✅ COMPLETE 2026-05-17** in
+  ~45 min. Added `docs/gem5_integration.md` covering: install
+  (`ci/gem5_install.sh`), Vortex+gem5 build (`USE_GEM5=1`), host
+  runtime cross-compile (`HOST_ARCH`), running tests
+  (`./ci/regression.sh --gem5` and standalone hand commands),
+  a complete minimal Python recipe for hosting Vortex in a custom
+  gem5 system, **six load-bearing invariants** (Process.map order,
+  identity-mapped PIO+PIN, cache flush before download, MMIO
+  fence, single source of truth for memory, USE_SST/GEM5 mutex),
+  architectural choices worth revisiting (doorbells vs. polling,
+  PCIe upgrade path, C ABI rationale), CI integration, and a
+  troubleshooting table covering the 7 most common error modes
+  (wrong library path, missing LD_LIBRARY_PATH, clock_nanosleep
+  syscall, orphan Process, wrong `library=` param, busy-bit hang,
+  ccache stale objects). Added to `docs/index.md`.
+
+Total: **~30–49 hours** of focused work (was ~26–41 h before Phase 0
+was added as a separate phase; the actual work has not grown — the
+gem5 install was implicit in the old Phase 2 estimate and is now
+explicit in Phase 0). Substantial enough to warrant its own branch
+(`gem5_simx_v3` or similar).
+
+**Sequencing with SST:** Phase 1 (`Processor::cycle()`) is shared;
+do it once and both integrations benefit. If SST lands first, gem5
+reuses `Processor::cycle()` unchanged. If gem5 lands first, the SST
+integration's broken `proc_->cycle()` reference
+(`sim/simx/sst/vortex_simulator.cpp:64`) gets fixed as a side effect
+of Phase 1 — net win for both. Phase 0 is gem5-only; SST integration
+does not benefit from it.
diff --git a/docs/proposals/gem5_v2_cp_migration_proposal.md b/docs/proposals/gem5_v2_cp_migration_proposal.md
new file mode 100644
index 000000000..035d0805b
--- /dev/null
+++ b/docs/proposals/gem5_v2_cp_migration_proposal.md
@@ -0,0 +1,758 @@
+# gem5 v2 Backend Redesign — CP-First, Event-Driven Architecture
+
+**Date:** 2026-05-18
+**Status:** Draft for review (supersedes the v1 draft of this file)
+**Author:** Blaise Tine
+
+**Related:**
+- [gem5_simx_v3_proposal.md](gem5_simx_v3_proposal.md) — the prior gem5 integration design (OPAE-style MMIO command FSM). This proposal supersedes its §3 (host/device protocol) and §4 (gem5 SimObject design).
+- Upstream proposals on `origin/tinebp-patch-2`:
+  - [command_processor_proposal.md](command_processor_proposal.md) — CP RTL architecture, vortex2.h API, OpenCL 1.2 mapping.
+  - [cp_pure_v2_callbacks_proposal.md](cp_pure_v2_callbacks_proposal.md) — pure-v2 `callbacks_t` + `vortex::CommandProcessor` C++ class for simx/rtlsim.
+- Upstream commits this proposal targets:
+  - `086d26b` runtime: strip legacy launch_*/dcr_* from callbacks_t (Phase E — pure v2)
+  - `8bc2564` runtime: add cp_mmio_write/read callbacks; wire all 4 backends
+  - `16aa1ca` sim/common: software CommandProcessor C++ class + unit test
+  - `04971a2` tests/regression: rewrite vecadd + sgemm from scratch on vortex2.h
+  - `00aa42f` docs: pure-v2 callbacks_t + software CP for simx/rtlsim
+
+**Decisions ratified before this draft (recorded for traceability):**
+- D1 — **Data plane unified through CP**. All ordered host↔device transfers go through `CMD_MEM_*` in a CP queue. `callbacks_t::mem_upload/download/copy` are reserved for the dispatcher's cold-start writes (ring buffer seeding, kernel ELF preload). No second data path.
+- D2 — **Single clock domain for CP + Vortex in v1**. CP and Vortex tick at the same rate; separate `ClockDomain`s are a follow-on.
+- D3 — **In-process VRAM with DMA-port seam designed in**. CP and Vortex memory accesses go through a single accessor interface backed by the in-process `RAM` in v1; v2 swaps in a gem5 `SimpleMemory` via the SimObject's DMA port behind the same interface.
+- D4 — **MAX_QUEUES = 4** in the gem5 PIO map (matches upstream `VX_CP_NUM_QUEUES` default). v1 host runtime exercises Q0 only; Q1–Q3 hardware is ready for future v2.h multi-queue work.
+
+---
+
+## 0. Purpose
+
+The original gem5 backend ([gem5_simx_v3_proposal.md](gem5_simx_v3_proposal.md))
+shipped an OPAE-style MMIO command FSM on the device and a synchronous
+`vx_start`/`vx_ready_wait` runtime on the host. That was a deliberate
+bring-up choice — it reused the OPAE protocol so we could validate the
+gem5 SE-mode integration (PIO, DMA, cross-arch, ELF interp redirection)
+in isolation from the broader v2 runtime work.
+
+That bring-up is done. With upstream's pure-v2 `callbacks_t` landed,
+keeping the OPAE FSM means:
+- Two control planes coexist on the device (legacy CMD_* state machine
+  AND the CP regfile), doubling the device-side surface.
+- The host runtime carries dead code: `start()`, `ready_wait()`,
+  `dcr_write/read`, and their MMIO poll loops, none of which the
+  dispatcher will call again.
+- The SimObject's polled-tick model misuses gem5's event scheduler:
+  it ticks every clock period even when there's no work, and the host
+  has to spin-wait on `Q_SEQNUM` between unsynchronized tick events.
+
+This proposal is a **redesign**, not a port. It deletes the OPAE
+control plane entirely, makes the CP a first-class event-driven gem5
+device block, runs the Vortex `Processor` as a parallel gem5 event
+chain, and rebuilds the host runtime as a thin shim over the CP
+regfile. The end-state is structurally identical to how a real PCIe
+GPU is modeled in gem5: a SimObject hosting an FSM that fetches
+commands, dispatches DMAs, and kicks an asynchronous compute engine.
+
+---
+
+## 1. What changed upstream (verbatim summary)
+
+The new pure-v2 `callbacks_t` ([sw/runtime/common/callbacks.h](../../sw/runtime/common/callbacks.h)
+on `origin/tinebp-patch-2`) contains:
+
+```
+dev_open, dev_close
+query_caps, memory_info
+mem_alloc, mem_reserve, mem_free, mem_access
+mem_upload, mem_download, mem_copy
+cp_mmio_write, cp_mmio_read       // NEW — sole control plane
+```
+
+It no longer contains `start`, `ready_wait`, `dcr_write`, `dcr_read`.
+
+The dispatcher in [sw/runtime/common/vx_device.cpp](../../sw/runtime/common/vx_device.cpp)
+is now the single source of truth for CP command building. Every
+kernel launch, every DCR program, every fence, every event becomes a
+`CMD_*` descriptor written into a ring buffer in device memory, with
+`cp_mmio_write(Q_TAIL_HI)` as the publish doorbell.
+
+The CP itself ([sim/common/CommandProcessor.h](../../sim/common/CommandProcessor.h))
+is a clock-ticked FSM with 5 hooks (note: not 6 — `vortex_dcr_read` is
+handled by the CP's `dram_write` path back to the requesting `CMD_DCR_READ`'s
+writeback address, not a dedicated hook):
+
+```cpp
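+struct Hooks {
+    // Template arguments elided in this summary; see
+    // sim/common/CommandProcessor.h for the exact signatures.
+    std::function<...> dram_read;
+    std::function<...> dram_write;
+    std::function<...> vortex_dcr_write;
+    std::function<...> vortex_start;
+    std::function<...> vortex_busy;
+};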
+```
+
+The CP regfile lives at offset `0x1000` on opae/xrt (the AFU shim adds
+the base); the simulator-internal contract per `cp_pure_v2 §6.3` is
+that `cp_mmio_write(off, val)` takes a **CP-internal** offset and each
+backend wrapper adds its own base.
+
+---
+
+## 2. Design pillars
+
+Six pillars define the redesign. Each is a deliberate departure from
+the v1 OPAE-style design.
+
+### 2.1 Single control plane: CP regfile MMIO only
+
+The PIO range is sized for the CP regfile, period. No more legacy
+OPAE CMD_TYPE / CMD_ARG / STATUS registers, no reserved 4 KiB hole,
+no CMD_* state machine on the SimObject.
+
+PIO layout:
+
+```
+PIO_BASE + 0x0000 .. 0x003F   CP global header (CTRL, STATUS, CAPS, IRQ)
+PIO_BASE + 0x0040 .. 0x004F   CP profiling block (CYCLE_LO/HI, FREQ_HZ)
+PIO_BASE + 0x0100 .. 0x01FF   CP per-queue regfile (4 × 0x40)
+                              Q0: 0x0100..0x013F
+                              Q1: 0x0140..0x017F
+                              Q2: 0x0180..0x01BF
+                              Q3: 0x01C0..0x01FF
+PIO_BASE + 0x0200 ..          outside the v1 PIO range (reserved for future per-queue profiling, IRQ, …)
+```
+
+Total PIO size: **`0x0200`** (was `0x1000`).
+
+The host wrapper `cp_mmio_write(off, val)` is:
+
+```cpp
+// sw/runtime/gem5/vortex.cpp
+int cp_mmio_write(uint32_t off, uint32_t value) {
+    driver_.mmio_write32(PIO_BASE_ADDR + off, value);
+    return 0;
+}
+```
+
+No `+0x1000` adjustment — gem5 doesn't need to match the AFU's `bit[12]`
+control/data split because there is no AFU. CP regfile starts at
+`PIO_BASE + 0x0`.
+
+### 2.2 Single data plane: CP commands via `CMD_MEM_*`
+
+`vx_enqueue_write/read/copy` (vortex2.h) emit `CMD_MEM_*` descriptors
+into a queue's ring buffer. The CP executes them via its DMA hooks
+against device VRAM. The path that serves user buffer transfers also
+serves CP descriptor fetches — one accessor interface.
+
+`callbacks_t::mem_upload/download/copy` are reserved for the
+dispatcher's **cold-start** writes only: seeding ring buffers at queue
+create, preloading kernel ELFs into device VRAM before they are
+referenced by a `vx_launch_info_t`. The dispatcher does not use them
+on the user-facing data plane.
+
+In our gem5 setup this is essentially free: PIN_BASE_ADDR is
+identity-mapped into the host process VA via `Process::map`, so
+`mem_upload(dev_va, host_src, size)` is `memcpy(host_va_of_PIN_BASE +
+dev_va, host_src, size)` — a regular store sequence that gem5
+translates through the page table to the same physical bytes the
+SimObject's `ram_` sees. No PIO trigger, no command descriptor, no
+state machine.
+
+### 2.3 Event-driven CP, not polled tick
+
+The CP is a self-scheduling gem5 event:
+
+```cpp
+// sim/simx/gem5/vortex_gpgpu_dev.hh — sketch
+class VortexGPGPU : public DmaDevice {
+    EventFunctionWrapper cpTickEvent_;
+    EventFunctionWrapper vortexTickEvent_;
+
+    void cpTick();      // calls cp_.tick(); reschedules if cp_ has work
+    void vortexTick();  // calls processor_.cycle(); reschedules if !is_done()
+};
+```
+
+`cpTick()` calls `cp_.tick()` once and reschedules itself at
+`clockEdge(Cycles(1))` **only if** the CP reports it still has work
+(queue non-empty, command in flight, completion writeback pending).
+Otherwise it returns and the CP is dormant.
+
+CP wake-up paths:
+- Host writes `Q_TAIL_HI` (queue doorbell) → `cp_mmio_write` schedules
+  `cpTickEvent_` at the next clock edge if not already scheduled.
+- Host writes `CP_CTRL.enable` → same.
+- Vortex `tickEvent` observes `is_done()` and signals CP → CP wakes
+  to retire `CMD_LAUNCH`.
+- CP DMA completion → CP self-reschedules until the DMA retires.
+
+**No bounded-tick-burst around `cp_mmio_*`.** No `VORTEX_USE_CP=0`
+transparent-mode escape hatch. The CP is always real, always
+event-driven, and the gem5 event queue arbitrates between CP, Vortex,
+host CPU, and any other SimObjects in the system the way gem5
+expects.
+
+### 2.4 Vortex `Processor` as a parallel event chain
+
+The Vortex GPU runs in its own gem5 event chain, scheduled by the
+CP's `vortex_start` hook and torn down when `processor_.is_done()`:
+
+```cpp
+auto vortex_start = [this]() {
+    if (!vortexTickEvent_.scheduled())
+        schedule(vortexTickEvent_, clockEdge(Cycles(1)));
+};
+
+void VortexGPGPU::vortexTick() {
+    processor_->cycle();
+    if (processor_->any_running())
+        schedule(vortexTickEvent_, clockEdge(Cycles(1)));
+    // CP polls processor_->any_running() via the vortex_busy hook
+    // from its own tick; no notification needed.
+}
+```
+
+Both `cpTickEvent_` and `vortexTickEvent_` use the same `ClockDomain`
+(D2). The gem5 event queue interleaves them with whatever simulated
+host CPU work is happening at the same simulated time — that's where
+the concurrency-realism win comes from. It is also what makes the
+simulation faster overall: idle blocks (CP between commands, Vortex
+between launches) do not consume tick events.
+
+### 2.5 Single VRAM accessor — in-process for v1, DMA-port seam for v2
+
+All device memory access — CP ring fetches, completion writebacks,
+DMA payload reads/writes, Vortex's `MemSim` accesses — goes through
+one accessor interface:
+
+```cpp
+// sim/simx/gem5/dev_mem.h
+class DevMemAccessor {
+public:
+    virtual void read (uint64_t addr, void* dst, size_t bytes) = 0;
+    virtual void write(uint64_t addr, const void* src, size_t bytes) = 0;
+};
+
+class InProcessDevMem : public DevMemAccessor { /* wraps simx::RAM */ };
+class DmaPortDevMem   : public DevMemAccessor { /* wraps DmaDevice port */ };
+```
+
+v1: `Gem5Device` constructs an `InProcessDevMem` wrapping the existing
+`simx::RAM`. CP `dram_read/write` hooks call through it. Vortex's
+`MemSim::read/write` calls through it. PIN_BASE_ADDR's identity
+mapping makes the host process see the same bytes.
+
+v2 seam: replace `InProcessDevMem` with `DmaPortDevMem` (and back VRAM
+with a gem5 `SimpleMemory` connected to the SimObject's DMA port).
+**Zero changes to CP hooks, zero changes to Vortex memory code, zero
+changes to host runtime.** That's the entire point of the abstraction
+— the v2 path is a localized swap, not a rewrite.
+
+This pillar is the reason "in-process for v1" is the right answer
+(per D3): the accessor seam captures the design intent of v2 without
+paying its cost upfront.
+
+### 2.6 Multi-queue PIO map from day one
+
+PIO map reserves 4 queue regfile slots (D4). v1 host runtime enables
+Q0 only and the CP runs Q0 only. The 3 unused queue slots cost 192
+bytes (3 × 0x40) of PIO range and let the hardware grow into v2.h multi-queue
+without re-versioning the PIO layout (and without bumping the host
+process's mmap size).
+
+Picking 4 now means the gem5 device's regfile shape **matches
+upstream `VX_CP_NUM_QUEUES = 4`**. The OPAE/XRT AFUs will instantiate
+the same 4-queue CP; gem5 should not be the odd one out.
+
+---
+
+## 3. Address space layout
+
+The full memory map after the redesign:
+
+```
+Host process VA (simulated, gem5 SE-mode)
+  0x0000_0000_0000 .. 0x0000_0FFF_FFFF   normal heap / stack / mmap
+  0x0000_1000_0000 .. 0x0000_1FFF_FFFF   PIN_BASE_ADDR (device VRAM,
+                                          identity-mapped via Process::map)
+  0x0000_2000_0000 .. 0x0000_2000_01FF   PIO_BASE_ADDR (CP regfile)
+  0x0000_2000_0200 .. 0x0000_2FFF_FFFF   reserved (future PIO blocks)
+
+gem5 SimObject PA
+  PIN_BASE_ADDR .. PIN_BASE_ADDR + ram_size   device VRAM backing store
+  PIO_BASE_ADDR .. PIO_BASE_ADDR + 0x0200      CP regfile (PIO range)
+```
+
+`PIN_BASE_ADDR` is the same VA on both sides because `Process::map`
+identity-maps it into the simulated host process. The CP and Vortex
+see the same physical bytes; the host process writes to them as
+ordinary memory.
+
+`PIO_BASE_ADDR` is **only** the CP regfile after this redesign. The
+4 KiB OPAE legacy reserved block is gone.
+
+---
+
+## 4. Data flow walkthroughs
+
+### 4.1 Cold start (queue create)
+
+```
+host runtime                                 gem5 SimObject + CP
+─────────────────────────────────────────    ────────────────────────────
+vx_device_open                                — (handle alloc; no IO)
+  └─ callbacks->dev_open()
+       └─ open libvortex-gem5-x86_64.so
+       └─ vortex_gem5_dev_open(...)          construct Gem5Device:
+                                               - new simx::RAM
+                                               - new InProcessDevMem
+                                               - new simx::Processor (wired to InProcessDevMem)
+                                               - new vortex::CommandProcessor
+                                                     (hooks: dram_read/write
+                                                      → InProcessDevMem,
+                                                      vortex_dcr_write → proc_.dcr_write,
+                                                      vortex_start → schedule(vortexTickEvent_),
+                                                      vortex_busy  → proc_.any_running())
+                                               cpTickEvent_ descheduled (no work yet)
+                                               vortexTickEvent_ descheduled
+
+dispatcher: vx_queue_create
+  └─ mem_alloc(ring_size, &ring_va)          allocate from device VRAM bump allocator
+  └─ mem_alloc(8, &head_va)
+  └─ mem_alloc(8, &cmpl_va)
+  └─ mem_upload(ring_va, zeros, ring_size)   memcpy through PIN_BASE mapping
+  └─ cp_mmio_write(Q0_RING_BASE_LO, ring_va lo)
+  └─ cp_mmio_write(Q0_RING_BASE_HI, ring_va hi)
+  └─ cp_mmio_write(Q0_HEAD_ADDR_LO/HI, head_va)
+  └─ cp_mmio_write(Q0_CMPL_ADDR_LO/HI, cmpl_va)
+  └─ cp_mmio_write(Q0_RING_SIZE_LOG2, log2)
+  └─ cp_mmio_write(Q0_CONTROL, enable=1)     → SimObject PIO write handler:
+                                                   cp_.mmio_write(off, val);
+                                                   if cp_.has_work() and
+                                                      !cpTickEvent_.scheduled():
+                                                     schedule(cpTickEvent_,
+                                                              clockEdge(Cycles(1)))
+                                                   (CP has work because Q0 is now enabled
+                                                    and may have a non-empty ring)
+  └─ cp_mmio_write(CP_CTRL, enable=1)        — already enabled (idempotent)
+```
+
+### 4.2 Kernel launch (`vx_enqueue_launch`)
+
+```
+dispatcher                                   CP (in SimObject)
+─────────────────────────────────────────    ────────────────────────────
+mem_upload(ring_va + tail, CMD_DCR_WRITE     (rings now non-empty;
+            for KMU PC, grid, block, args)   CP will fetch when scheduled)
+mem_upload(ring_va + tail, CMD_LAUNCH)
+cp_mmio_write(Q0_TAIL_LO, tail_lo)
+cp_mmio_write(Q0_TAIL_HI, tail_hi)           → cpTickEvent_ schedule check fires;
+                                              schedule for next clock edge
+
+(host returns immediately — async by design;
+ dispatcher does not block here. Polling
+ happens later via cp_mmio_read(Q0_SEQNUM)
+ from vx_event_wait_all.)
+```
+
+At next clock edge:
+
+```
+cpTick():
+  cp_.tick()
+    [CPE0 FSM: fetch ring head cache line
+     via dram_read(ring_va, &cl, 64) → InProcessDevMem.read → ram_.read]
+    [decode CMD_DCR_WRITE; route through vortex_dcr_write hook
+     → proc_.dcr_write(addr, value); retire; bump seqnum]
+    [dram_write(cmpl_va, &seqnum, 8); dram_write(head_va, &head, 8)]
+  reschedule cpTickEvent_ (still has CMD_LAUNCH pending)
+
+cpTick() (next):
+  cp_.tick()
+    [CPE0 FSM: fetch next CL, decode CMD_LAUNCH]
+    [vortex_start() → schedule(vortexTickEvent_, clockEdge(Cycles(1)))]
+    [CPE0 enters WAIT_FOR_BUSY state — polls vortex_busy() each tick]
+  reschedule cpTickEvent_ (CMD_LAUNCH in flight)
+
+… concurrent vortexTick() advances processor_.cycle() …
+… until processor_.is_done(); on next CP tick:
+
+cpTick():
+  cp_.tick()
+    [CPE0 sees !vortex_busy(); retire CMD_LAUNCH; bump seqnum]
+    [dram_write(cmpl_va, &seqnum, 8)]
+  CP has no more work; do NOT reschedule cpTickEvent_.
+  vortexTickEvent_ stopped scheduling itself when is_done() became true.
+  System is dormant.
+
+Host poll:
+  cp_mmio_read(Q0_SEQNUM_LO)                 → SimObject PIO read handler:
+                                                   return cp_.mmio_read(off);
+                                              (returns the retired seqnum;
+                                               no tick burst needed because the
+                                               cmpl_va writeback already happened
+                                               in earlier cpTick())
+```
+
+The host never spins. The CP never idle-ticks. Vortex never runs past
+`is_done()`. This is the win.
+
+### 4.3 `vx_enqueue_write` (data plane through CP)
+
+```
+dispatcher                                   CP
+─────────────────────────────────────────    ────────────────────────────
+(host_src is in regular heap, not PIN_BASE   — note: dispatcher copies
+ — so the dispatcher copies it into a       payloads into PIN_BASE first
+ pinned device buffer it owns OR the         on backends that require it.
+ caller used vx_buffer_map to write          For gem5 + Process::map this is
+ directly into a host-mapped device          a memcpy through the mapped
+ buffer)                                     pages.)
+
+mem_upload(ring_va + tail, CMD_MEM_WRITE
+            { src=pinned_host_va,            (pinned_host_va is in PIN_BASE
+              dst=dev_va,                     so it's also a device PA)
+              size=N })
+cp_mmio_write(Q0_TAIL_HI, ...)               → CP schedules
+
+cpTick():
+  cp_.tick()
+    [decode CMD_MEM_WRITE]
+    [CP DMA FSM: dram_read(src, &buf, chunk)
+                  dram_write(dst, &buf, chunk)
+     looping over the transfer in 64 B steps]
+    [retire; bump seqnum; cmpl writeback]
+```
+
+Both endpoints (`src`, `dst`) are in the same flat physical space
+(PIN_BASE region). The CP's DMA FSM doesn't distinguish host vs.
+device addresses — they're the same accessor.
+
+---
+
+## 5. Component design
+
+### 5.1 `sim/simx/gem5/vortex_gpgpu.{cpp,h}` — device library
+
+**Responsibilities:**
+- Construct `RAM`, `Processor`, `CommandProcessor`, `InProcessDevMem`.
+- Provide C ABI: `vortex_gem5_dev_open/close`, `vortex_gem5_cp_mmio_{read,write}`,
+  `vortex_gem5_dram_access` (for SimObject DMA path → backing store),
+  `vortex_gem5_cp_tick`, `vortex_gem5_vortex_tick`,
+  `vortex_gem5_cp_has_work`, `vortex_gem5_vortex_busy`.
+- Provide kernel preload for the Phase 3 standalone test (unchanged).
+
+**Removed (all OPAE state machine carry-over):**
+- `pending_cmd_`, `cmd_args_`, `dcr_rsp_`, `busy_` fields
+- `mmio_write64`/`mmio_read64` and the CMD_TYPE dispatch
+- `pop_pending_cmd`, `get_cmd_arg`, `set_busy`, `load_args`
+- `process_cmd` and the `CMD_RUN`/`CMD_DCR_*`/`CMD_MEM_*` handlers
+  (last one re-emerges inside the CP, not here)
+
+**Added:**
+- `cp_` member (`vortex::CommandProcessor`) with hooks bound in ctor.
+- `dev_mem_` member (`std::unique_ptr<DevMemAccessor>`) — `InProcessDevMem`
+  for v1.
+- C-ABI surface for the SimObject (below).
+
+### 5.2 `sim/simx/gem5/vortex_gpgpu_dev.{cc,hh}` — gem5 SimObject
+
+**Class:** `VortexGPGPU : public DmaDevice` (unchanged from current).
+
+**Members:**
+- `pioAddr_, pioSize_ = 0x0200` (was `0x1000`).
+- `EventFunctionWrapper cpTickEvent_;`
+- `EventFunctionWrapper vortexTickEvent_;`
+- `deviceHandle_` — opaque from device library.
+
+**PIO `read(PacketPtr)`:**
+```cpp
+const Addr off = pkt->getAddr() - pioAddr_;
+uint32_t value = 0;
+abi_.cp_mmio_read(deviceHandle_, uint32_t(off), &value);
+pkt->setLE(value);
+pkt->makeAtomicResponse();
+return pioLatency_;
+```
+
+**PIO `write(PacketPtr)`:**
+```cpp
+const Addr off = pkt->getAddr() - pioAddr_;
+const uint32_t value = pkt->getLE<uint32_t>();
+abi_.cp_mmio_write(deviceHandle_, uint32_t(off), value);
+maybeWakeCp();
+pkt->makeAtomicResponse();
+return pioLatency_;
+```
+
+**`maybeWakeCp()`:**
+```cpp
+if (abi_.cp_has_work(deviceHandle_) && !cpTickEvent_.scheduled())
+    schedule(cpTickEvent_, clockEdge(Cycles(1)));
+```
+
+**`cpTick()`:**
+```cpp
+abi_.cp_tick(deviceHandle_);
+if (abi_.cp_has_work(deviceHandle_))
+    schedule(cpTickEvent_, clockEdge(Cycles(1)));
+```
+
+**`vortexTick()`:**
+```cpp
+abi_.vortex_tick(deviceHandle_);
+if (abi_.vortex_busy(deviceHandle_))
+    schedule(vortexTickEvent_, clockEdge(Cycles(1)));
+```
+
+**`vortex_start` hook callback (from device library into the SimObject):**
+schedules `vortexTickEvent_` at next clock edge if not scheduled.
+Implemented as a small C ABI: `vortex_gem5_set_start_handler(handle,
+fn, ctx)` registered in `VortexGPGPU::init()`; the device library
+calls it from the CP's `vortex_start` lambda.
+
+### 5.3 `sw/runtime/gem5/vortex.cpp` — host runtime
+
+**Responsibilities (shrunken):**
+- `init` / `get_caps` / `mem_info` (unchanged)
+- `mem_alloc` / `mem_reserve` / `mem_free` / `mem_access` (unchanged
+  bump allocator + `PIN_BASE_ADDR` math)
+- `mem_upload` / `mem_download` / `mem_copy` → `memcpy` through the
+  PIN_BASE identity mapping (renamed from `upload`/`download`/`copy`
+  in the v1 backend)
+- `cp_mmio_write` → `driver_.mmio_write32(PIO_BASE_ADDR + off, val)`
+- `cp_mmio_read` → `driver_.mmio_read32(PIO_BASE_ADDR + off, &val)`
+
+**Removed:**
+- `start()`, `ready_wait()`, `dcr_write()`, `dcr_read()` methods
+- `MMIO_CMD_TYPE` / `MMIO_STATUS` constants and their poll loop
+- `<sched.h>` and the `sched_yield()` back-off (no host poll loop —
+  the dispatcher's `vx_event_wait_all` does its own polling against
+  `Q_SEQNUM`)
+
+**Kept:**
+- The pinned-region setup, `PIN_BASE_ADDR`, `PIO_BASE_ADDR`,
+  `mmio_fence()`, the bump allocator state.
+
+### 5.4 `sw/runtime/gem5/driver.{cpp,h}` — pinned region + MMIO helpers
+
+**Added:**
+- `mmio_write32(uint64_t pa, uint32_t value)` — 4-byte store with
+  fence. Implemented as `*reinterpret_cast<volatile uint32_t*>(pa) =
+  value; mmio_fence();`.
+- `mmio_read32(uint64_t pa, uint32_t* value)` — symmetric.
+
+**Removed:**
+- `mmio_write64` / `mmio_read64` — no caller after the redesign.
+- The 64-bit MMIO path was a v1 choice for OPAE-style 8-byte argument
+  registers. The CP regfile is 32-bit.
+
+### 5.5 `sim/simx/gem5/VortexGPGPU.py` — SimObject Python binding
+
+**Params:**
+- `library = Param.String(...)` (unchanged)
+- `kernel = Param.String("")` (Phase 3 standalone preload — unchanged)
+- `pio_addr = Param.Addr(0x20000000)` (unchanged)
+- `pio_size = Param.Addr(0x0200)` — **changed from 0x1000** to match
+  the redesigned PIO map
+- `pio_latency = Param.Latency("100ns")` (unchanged)
+- `dma_latency = Param.Latency("100ns")` (unchanged)
+- (new) `max_queues = Param.Unsigned(4)` — for forward compatibility;
+  v1 enforces == 4
+
+### 5.6 `sim/simx/Makefile` — build wiring
+
+- Add `$(SIM_COMMON_DIR)/CommandProcessor.cpp` to the `USE_GEM5=1`
+  source list (the device library links it; the SimObject indirects
+  via the C ABI).
+
+### 5.7 `sw/runtime/gem5/Makefile` — build wiring
+
+- No source-list changes (the CommandProcessor lives in the device
+  library, not the host runtime).
+- `<sched.h>` include and any sched-related CFLAGS go away with the
+  `sched_yield` poll loop.
+
+---
+
+## 6. Migration phasing
+
+The whole redesign lands as **one commit** per the "substantial,
+testable feature" rule. The internal phasing below is for validation
+checkpoints during implementation, not for separate commits.
+
+### Phase M1 — Merge upstream
+
+- `git merge --no-commit --no-ff origin/tinebp-patch-2`
+- Conflicts (all expected):
+  - `sw/runtime/stub/Makefile` — keep HOST_ARCH; take new v2 dispatcher SRCS
+  - Possibly `sw/runtime/common/callbacks.{h,inc}` — defer to upstream version
+- Build will not compile until M2 + M3 complete. That is acceptable
+  inside one commit; the commit is only created when M3 builds and
+  passes regression.
+
+### Phase M2 — Device-side redesign
+
+- Add `sim/simx/gem5/dev_mem.{h,cpp}` (`DevMemAccessor` + `InProcessDevMem`).
+- Rewrite `sim/simx/gem5/vortex_gpgpu.{cpp,h}`:
+  - Delete OPAE state machine (per §5.1).
+  - Embed `cp_` with hooks bound to `InProcessDevMem` + `proc_`.
+  - Export the new C ABI.
+- Rewrite `sim/simx/gem5/vortex_gpgpu_dev.{cc,hh}`:
+  - PIO range shrinks to 0x0200.
+  - `read`/`write` route 32-bit packets to `cp_mmio_{read,write}`.
+  - `cpTickEvent_` + `vortexTickEvent_` self-scheduling per §2.3, §2.4.
+  - `vortex_start` callback registration.
+- Update `VortexGPGPU.py` (`pio_size = 0x0200`, `max_queues = 4`).
+- `sim/simx/Makefile`: add `CommandProcessor.cpp`.
+
+**Validation:** `make -C build/sim/simx USE_GEM5=1` builds.
+`./hw/unittest/cp_sim/` unit test passes (smoke-tests the
+CommandProcessor wiring; runnable without gem5 itself).
+
+### Phase M3 — Host runtime redesign
+
+- Rewrite `sw/runtime/gem5/vortex.cpp`:
+  - Drop `start`/`ready_wait`/`dcr_*`.
+  - Rename `upload`/`download`/`copy` → `mem_upload`/`mem_download`/`mem_copy`.
+  - Add `cp_mmio_{read,write}` (3-line MMIO wrappers).
+  - Drop `<sched.h>` and the poll loop.
+- Add `mmio_{read,write}32` to `driver.{cpp,h}`; drop the 64-bit helpers.
+- Build for x86_64 (default) and aarch64 (cross-compile via existing
+  HOST_ARCH switch).
+
+**Validation:**
+- Hostless test (`ci/gem5_run_hostless_app.py`): PASSES.
+  (No host runtime involvement.)
+- `./ci/regression.sh --gem5`: PASSES — hello + vecadd + sgemm e2e on x86.
+- `VORTEX_GEM5_ARM=1 ./ci/regression.sh --gem5`: PASSES — same 3 tests
+  on aarch64. Total 6/6 PASS matches pre-redesign baseline.
+
+### Phase M4 — Documentation
+
+Update [docs/gem5_integration.md](../gem5_integration.md):
+- Replace the OPAE protocol description with the CP regfile + ring
+  buffer architecture.
+- Update the 6 load-bearing invariants list:
+  - Drop OPAE CMD_* invariants.
+  - Add: "CP regfile is at `PIO_BASE + 0x0`, 0x200 bytes, 32-bit
+    register stride."
+  - Add: "PIN_BASE is identity-mapped via Process::map; host
+    runtime's `mem_upload` is a direct memcpy."
+  - Add: "CP and Vortex tick events self-schedule only while work is
+    pending; idle is observable as cpTickEvent_ unscheduled."
+
+Update [docs/proposals/gem5_simx_v3_proposal.md](gem5_simx_v3_proposal.md):
+- Add a "Status: Superseded by gem5_v2_cp_migration_proposal" header
+  on §3 (host/device protocol) and §4 (SimObject design).
+- Keep §0–§2 (motivation, source-tree layout) and §5+ (testing,
+  install, cross-arch) — those parts remain accurate.
+
+---
+
+## 7. Validation criteria
+
+The redesign is complete when all of the following hold:
+
+1. **`./ci/regression.sh --gem5`** PASSES on x86 (hello + vecadd +
+   sgemm e2e). Total wall time ≤ 30 s (was 16 s pre-redesign; the
+   event-driven design should be at least as fast because idle blocks
+   no longer tick).
+2. **`VORTEX_GEM5_ARM=1 ./ci/regression.sh --gem5`** PASSES on
+   aarch64 (same 3 tests).
+3. **No regression on non-gem5 builds.** `make -C build/sim/simx`
+   (default), `USE_SST=1` still build and pass.
+4. **No OPAE leftovers grep-detectable.** `grep -rE
+   'CMD_TYPE|CMD_RUN|pending_cmd_|get_cmd_arg' sim/simx/gem5/
+   sw/runtime/gem5/` returns zero hits.
+5. **Event-driven invariants hold.** Run a sim with a 100 ms idle gap
+   between two enqueues; verify (via debug log) that `cpTickEvent_`
+   is unscheduled during the gap and that the host CPU advances
+   unhindered.
+6. **PIO map size matches design.** `pio_size = 0x0200` exposed in
+   `VortexGPGPU.py`; host runtime never writes outside that range.
+
+---
+
+## 8. Risks
+
+| # | Risk | Mitigation |
+|---|---|---|
+| R1 | CP `vortex_start` hook needs to schedule a gem5 event from inside a hook called during PIO write handling. gem5 SimObjects can `schedule()` from anywhere, but only from the gem5 thread. Verify the C ABI doesn't route the hook from a different thread. | Hooks are bound at construction; called from `cp_.tick()` which is called from `cpTick()` which is itself a gem5 event handler — same thread. No issue. |
+| R2 | `vortexTick()` advancing `processor_.cycle()` per Vortex clock period is slow if the cycle()-per-tick ratio is high (a Vortex clock period is shorter than a CPU-host instruction time). | Match Vortex's `ClockDomain` to a realistic Vortex frequency (1 GHz). gem5 only schedules events at actual clock edges; the per-tick cost is one C++ function call. Acceptable. |
+| R3 | New vecadd/sgemm tests (rewritten on vortex2.h upstream) may use features (events, queue priority) we don't validate end-to-end on gem5. | M3 validation surfaces this. If a test uses an unsupported vortex2.h primitive, file a follow-up; M3 acceptance is contingent on the existing 3-test matrix passing. |
+| R4 | `DevMemAccessor` interface change forces a Vortex `MemSim` rewrite. | `MemSim` already takes a memory backend. v1 wires it to `InProcessDevMem` which delegates to `simx::RAM` — same backing buffer as today. Zero code change in Vortex itself. |
+| R5 | The Phase 3 standalone test loads a kernel via `kernel=` SimObject param, then primes KMU DCRs directly. After the redesign, KMU DCR programming must route through the CP, which means the standalone test needs a tiny one-shot ring submission instead of direct `proc_.dcr_write` calls. | Add a `vortex_gem5_run_standalone_kernel(handle, kernel_path)` C ABI in the device library that builds a synthetic CMD_DCR_WRITE+CMD_LAUNCH ring and runs the CP to completion. ~30 LoC. Keeps the standalone test path real instead of a back-door. |
+| R6 | Vortex's `cycle()` does not handle being called only when scheduled; e.g. internal counters reset assuming consecutive ticks. | Audit during M2. Vortex's existing implementation already supports being suspended (simx uses it both ways). |
+
+---
+
+## 9. Out of scope
+
+- **XLEN=64 device library.** Current setup is XLEN=32 only.
+  Orthogonal.
+- **Separate `ClockDomain` for Vortex vs. CP.** D2 ratifies single
+  domain for v1.
+- **Gem5 `SimpleMemory` backing VRAM via DMA port.** D3 ratifies
+  in-process for v1 with the accessor seam in place. v2 is a
+  follow-on commit that swaps the accessor.
+- **PCIe BAR mapping** instead of raw PIO range. Original gem5_simx_v3
+  §3.6 commits to this; orthogonal to the CP redesign.
+- **Multi-queue host runtime.** Q1–Q3 hardware is there but the host
+  runtime exercises Q0 only. Multi-queue runtime work follows
+  upstream vortex2.h.
+- **Profiling timestamp writeback path.** The upstream CP supports
+  `F_PROFILE` flag + `VX_cp_profiling`; gem5 backend will get it for
+  free once the CP implementation lands. No gem5-specific work.
+
+---
+
+## 10. Estimated effort
+
+Calibrated against the v1 OPAE backend (~3 days from scratch) and the
+recent rejected inline-adaptation attempt (Option A reached ~50%
+completion in ~30 min before being stopped):
+
+- **Phase M1 (merge):** 10 min. Three known conflicts, all mechanical.
+- **Phase M2 (device redesign):** 8–10 h. Bulk of the work:
+  - `dev_mem.{h,cpp}` — 1 h
+  - `vortex_gpgpu.{cpp,h}` rewrite — 4 h (mostly subtraction)
+  - `vortex_gpgpu_dev.{cc,hh}` rewrite — 3 h (event-driven scheduling)
+  - `VortexGPGPU.py` + Makefile + standalone test ABI — 1 h
+- **Phase M3 (host runtime):** 2–3 h. Mostly subtraction; new code is
+  small.
+- **Phase M4 (docs):** 1 h.
+
+**Total: 11–14 h focused work, single commit on `feature_gem5`.**
+
+Calibration vs. the v1 draft of this proposal (which claimed 7–11 h):
+the redesign is longer because (a) event-driven scheduling needs more
+care than a polled tick, (b) the OPAE deletion is comprehensive
+(M4 was "optional" in v1), and (c) we now have to wire the standalone
+test path through a real CP ring submission instead of direct DCR
+writes.
+
+---
+
+## 11. Why not a smaller change?
+
+For the record — the alternatives that were considered and rejected:
+
+- **Adapt-only (the rejected Option A from v1 of this doc).** Embed
+  `vortex::CommandProcessor` in the host runtime; translate each CP
+  hook back into the existing OPAE MMIO protocol. **Rejected:**
+  CP runs on the wrong side of the host/device boundary, every ring
+  fetch costs an MMIO+DMA round trip across the simulated bus,
+  device-side OPAE state machine stays as permanent dead code, no
+  alignment with how opae/xrt do it on real silicon.
+- **Device-side CP, keep OPAE for `mem_upload` data plane.** The v1
+  draft of this proposal. **Rejected:** Two control planes coexist,
+  two protocols to keep in sync, no clean line between "what goes
+  through CP" and "what doesn't."
+- **`VORTEX_USE_CP=0` transparent mode** as a permanent bring-up
+  escape hatch. **Rejected:** defeats the purpose of a cycle-accurate
+  simulator; the gem5 backend's job is to model the hardware, not to
+  emulate around it.
+
+The redesign in this proposal is the minimum that does not leave dead
+code, dead protocols, or bring-up hacks in the final state.
diff --git a/docs/proposals/sst_simx_v3_proposal.md b/docs/proposals/sst_simx_v3_proposal.md
index 3dbe0a00e..65db9ebbb 100644
--- a/docs/proposals/sst_simx_v3_proposal.md
+++ b/docs/proposals/sst_simx_v3_proposal.md
@@ -1,7 +1,7 @@
 # SST Integration for SimX v3 — Proposal
 
 **Date:** 2026-05-03
-**Status:** Draft
+**Status:** Implemented — note that `ci/sst_test_vortex_*.py` have been consolidated into a single generic runner [ci/sst_run_hostless_app.py](../../ci/sst_run_hostless_app.py) (parameterized by `VORTEX_TEST_DIR` + `VORTEX_TEST_KERNEL`, parallel to [ci/gem5_run_hostless_app.py](../../ci/gem5_run_hostless_app.py)). The naming reserves the `ci/sst_run_app.py` slot for a future host-CPU-driven SST integration (none today — see §3). The memHierarchy wiring described in §6 is no longer kept as a standalone test runner; the recipe stays here as documentation. References to specific `sst_test_vortex_*.py` filenames below are historical.
 **Author:** Blaise Tine
 **Related:**
 [simx_v3_proposal.md](simx_v3_proposal.md) (Phase 5: TLM data path),
diff --git a/sim/simx/Makefile b/sim/simx/Makefile
index 059484eff..6a743bbca 100644
--- a/sim/simx/Makefile
+++ b/sim/simx/Makefile
@@ -2,8 +2,17 @@ include ../common.mk
 
 DESTDIR ?= $(CURDIR)
 USE_SST ?= 0
+USE_GEM5 ?= 0
 #SST_PKG ?= SST-14.1 # default SST package name
 
+# USE_SST and USE_GEM5 are mutually exclusive — different external
+# simulator wrappers with different LDFLAGS; building both into one
+# binary makes no sense and the proposal docs/proposals/gem5_simx_v3_proposal.md
+# §8 calls this out explicitly.
+ifeq ($(USE_SST)$(USE_GEM5),11)
+$(error USE_SST=1 and USE_GEM5=1 are mutually exclusive)
+endif
+
 OBJ_DIR = $(DESTDIR)/obj
 CONFIG_FILE = $(DESTDIR)/simx_config.stamp
 SRC_DIR = $(VORTEX_HOME)/sim/simx
@@ -24,7 +33,7 @@ LDFLAGS += -Wl,-rpath,$(THIRD_PARTY_DIR)/ramulator -L$(THIRD_PARTY_DIR)/ramulato
 XCONFIGS := $(shell python3 $(ROOT_DIR)/ci/gen_config.py --config=$(VORTEX_HOME)/VX_config.toml --cflags='$(CONFIGS) -DXLEN_$(XLEN)')
 
 # Source files definition
-SRCS = $(SW_COMMON_DIR)/util.cpp $(SIM_COMMON_DIR)/mem.cpp $(SW_COMMON_DIR)/softfloat_ext.cpp $(SW_COMMON_DIR)/rvfloats.cpp $(SIM_COMMON_DIR)/dram_sim.cpp
+SRCS = $(SW_COMMON_DIR)/util.cpp $(SIM_COMMON_DIR)/mem.cpp $(SW_COMMON_DIR)/softfloat_ext.cpp $(SW_COMMON_DIR)/rvfloats.cpp $(SIM_COMMON_DIR)/dram_sim.cpp $(SIM_COMMON_DIR)/CommandProcessor.cpp
 SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(SRC_DIR)/core.cpp $(SRC_DIR)/scheduler.cpp $(SRC_DIR)/cta_dispatcher.cpp $(SRC_DIR)/barrier_unit.cpp
 SRCS += $(SRC_DIR)/kmu/kmu.cpp
 SRCS += $(SRC_DIR)/decode.cpp $(SRC_DIR)/decompressor.cpp $(SRC_DIR)/scoreboard.cpp $(SRC_DIR)/sequencer.cpp $(SRC_DIR)/opc_unit.cpp $(SRC_DIR)/dispatcher.cpp
@@ -96,6 +105,12 @@ ifeq ($(USE_SST),1)
 	SRCS     += $(SRC_DIR)/sst/vortex_simulator.cpp $(SRC_DIR)/sst/vortex_gpgpu.cpp
 endif
 
+# gem5 integration: build libvortex-gem5.so (the C ABI library loaded
+# by the gem5 VortexGPGPU SimObject). The gem5 wrapper source is kept
+# out of the default SRCS list and pulled into VORTEX_GEM5_SRCS so the
+# default simx binary does not carry it.
+VORTEX_GEM5_SRCS := $(SRC_DIR)/gem5/vortex_gpgpu.cpp $(SRC_DIR)/gem5/dev_mem.cpp
+
 # Debugging
 ifdef DEBUG
 	CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG)
@@ -128,17 +143,25 @@ VORTEX_SST_OBJS := $(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/%.o,$(VORTEX_SST_SRCS)
 DEPS += $(VORTEX_SST_OBJS:.o=.d)
 endif
 
+ifeq ($(USE_GEM5), 1)
+VORTEX_GEM5_OBJS := $(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/%.o,$(VORTEX_GEM5_SRCS))
+DEPS             += $(VORTEX_GEM5_OBJS:.o=.d)
+endif
+
 
 # optional: pipe through ccache if you have it
 CXX := $(if $(shell which ccache),ccache $(CXX),$(CXX))
 
 PROJECT := simx
 VORTEX_LIB := libvortex.so
+VORTEX_GEM5_LIB := libvortex-gem5.so
 
-.PHONY: all force clean clean-lib clean-exe clean-obj libvortex clean-libvortex
+.PHONY: all force clean clean-lib clean-exe clean-obj libvortex clean-libvortex libvortex-gem5 clean-libvortex-gem5
 
 ifeq ($(USE_SST), 1)
 all: $(DESTDIR)/$(PROJECT) $(DESTDIR)/$(VORTEX_LIB)
+else ifeq ($(USE_GEM5), 1)
+all: $(DESTDIR)/$(PROJECT) $(DESTDIR)/$(VORTEX_GEM5_LIB)
 else
 all: $(DESTDIR)/$(PROJECT)
 endif
@@ -186,6 +209,13 @@ $(DESTDIR)/$(VORTEX_LIB): $(OBJS) $(VORTEX_SST_OBJS)
 	-shared -o $@ \
 	$(LDFLAGS) $(SST_LFLAGS)
 
+# Vortex gem5 device shared library — the gem5 SimObject dlopens this
+# and calls the C ABI declared in sim/simx/gem5/vortex_gpgpu.h.
+libvortex-gem5: $(DESTDIR)/$(VORTEX_GEM5_LIB)
+
+$(DESTDIR)/$(VORTEX_GEM5_LIB): $(OBJS) $(VORTEX_GEM5_OBJS)
+	$(CXX) $(CXXFLAGS) $^ -shared $(LDFLAGS) -Wl,-soname,$(VORTEX_GEM5_LIB) -o $@
+
 # updates the timestamp when flags changed.
 $(CONFIG_FILE): force
 	@mkdir -p $(@D)
@@ -205,10 +235,13 @@ clean-lib:
 clean-libvortex:
 	rm -f $(DESTDIR)/libvortex.so
 
+clean-libvortex-gem5:
+	rm -f $(DESTDIR)/$(VORTEX_GEM5_LIB)
+
 clean-exe:
 	rm -f $(DESTDIR)/$(PROJECT)
 
 clean-obj:
 	rm -rf $(OBJ_DIR)
 
-clean: clean-lib clean-exe clean-obj
+clean: clean-lib clean-libvortex clean-libvortex-gem5 clean-exe clean-obj
diff --git a/sim/simx/gem5/SConscript b/sim/simx/gem5/SConscript
new file mode 100644
index 000000000..535ada56f
--- /dev/null
+++ b/sim/simx/gem5/SConscript
@@ -0,0 +1,18 @@
+# -*- mode:python -*-
+#
+# Vortex SimObjects for gem5. Installed into $GEM5_HOME/src/dev/vortex/
+# by sim/simx/gem5/install.sh. Picked up automatically by gem5's
+# top-level SConstruct via the SConscript-recursion rule at
+# SConstruct:1000.
+#
+# This file's source of truth lives in the Vortex tree
+# (sim/simx/gem5/SConscript); the installer just copies it.
+
+Import('*')
+
+SimObject('VortexGPGPU.py', sim_objects=['VortexGPGPU'])
+Source('vortex_gpgpu_dev.cc')
+
+# DebugFlag for VortexGPGPU traces. Enable with:
+#   gem5.opt --debug-flags=VortexGPGPU ...
+DebugFlag('VortexGPGPU')
diff --git a/sim/simx/gem5/VortexGPGPU.py b/sim/simx/gem5/VortexGPGPU.py
new file mode 100644
index 000000000..e89168c8b
--- /dev/null
+++ b/sim/simx/gem5/VortexGPGPU.py
@@ -0,0 +1,68 @@
+# Copyright © 2019-2023
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Python SimObject binding for the gem5-side VortexGPGPU device.
+# Mirrors the inheritance graph of the C++ side: DmaDevice → PioDevice
+# → ClockedObject.
+
+from m5.objects.Device import DmaDevice
+from m5.params import *
+
+
+class VortexGPGPU(DmaDevice):
+    # gem5 SimObject description of the Vortex GPGPU device. `type`,
+    # `cxx_header` and `cxx_class` bind this Python class to the C++
+    # gem5::VortexGPGPU implementation; the Param.* entries below are
+    # the knobs a gem5 config script can set.
+    type = "VortexGPGPU"
+    cxx_header = "dev/vortex/vortex_gpgpu_dev.hh"
+    cxx_class = "gem5::VortexGPGPU"
+
+    # Path to libvortex-gem5.so produced by `make -C sim/simx
+    # USE_GEM5=1` in the Vortex build dir. Required; the C++ ctor
+    # fatals if empty. The ctor passes this string straight to dlopen().
+    library = Param.String("Absolute path to libvortex-gem5.so")
+
+    # Optional kernel image preloaded at startup() via
+    # vortex_gem5_load_kernel. When set, the device runs the kernel to
+    # completion via its own vortexTickEvent_ scheduler and exits the
+    # sim loop on done — no host CPU or MMIO traffic required. This is
+    # the Phase 3 standalone smoke test. Hosted mode (kernel="" or
+    # unset) starts idle; the host runtime drives the CP via MMIO and
+    # the CP schedules its own ticks.
+    kernel = Param.String("", "Optional .vxbin/.bin/.hex to preload at boot")
+
+    # PIO range. After the gem5_v2_cp_migration redesign the PIO range
+    # is exactly the CP regfile: 0x40 of globals + 4 × 0x40 per-queue
+    # slots = 0x140 used today, 0x200 reserved for headroom. Per
+    # gem5_v2_cp_migration_proposal §3 the legacy OPAE register window
+    # is gone.
+    pio_addr    = Param.Addr(0x20000000, "PIO base address (CP regfile)")
+    pio_size    = Param.Addr(0x0200, "PIO region size (CP regfile, bytes)")
+    pio_latency = Param.Latency("1ns", "PIO access latency")
+
+    # BAR-mapped VRAM. The device exposes its in-process simx::RAM
+    # over the same physical-address range the host's PIN_BASE_ADDR
+    # identity-maps to via Process::map(). Host CPU writes land in
+    # the same bytes the CP's dram_read hook and Vortex's MemSim see
+    # — single source of truth for device memory (gem5_v2_cp_migration
+    # §2.2 single data plane).
+    #
+    # Disabled by default (pin_size=0) since the standalone smoke test
+    # uses load_kernel(), not host memcpy through PIN. Hosted (e2e)
+    # tests opt in by setting both pin_addr and pin_size to match the
+    # host runtime's PIN_BASE_ADDR / PIN_REGION_SIZE.
+    pin_addr    = Param.Addr(0x100000000, "VRAM base address (BAR-mapped)")
+    pin_size    = Param.Addr(0, "VRAM region size (bytes); 0 disables")
+
+    # Compile-time CP capacity that this PIO map can address. v1 host
+    # runtime exercises Q0 only; Q1–Q3 hardware is provisioned for
+    # future v2.h multi-queue work (gem5_v2_cp_migration_proposal §2.6,
+    # D4). Matches upstream VX_CP_NUM_QUEUES default.
+    # NOTE(review): the VortexGPGPU constructor in vortex_gpgpu_dev.cc
+    # does not read this parameter — confirm where (or whether) it is
+    # consumed.
+    max_queues  = Param.Unsigned(4, "Number of CP queues the PIO map covers")
diff --git a/sim/simx/gem5/dev_mem.cpp b/sim/simx/gem5/dev_mem.cpp
new file mode 100644
index 000000000..16943bd06
--- /dev/null
+++ b/sim/simx/gem5/dev_mem.cpp
@@ -0,0 +1,32 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dev_mem.h"
+
+#include 
+
+namespace vortex_gem5 {
+
+namespace {
+
+// Scope guard for the ACL bypass: turns the RAM's access-control checks
+// off on construction and back on in the destructor, so the checks are
+// restored even when the guarded access throws.
+// As before, the checks are unconditionally re-enabled afterwards; any
+// previous "disabled" state is not preserved.
+class AclBypass {
+public:
+    explicit AclBypass(vortex::RAM& ram) : ram_(ram) { ram_.enable_acl(false); }
+    ~AclBypass() { ram_.enable_acl(true); }
+
+    AclBypass(const AclBypass&) = delete;
+    AclBypass& operator=(const AclBypass&) = delete;
+
+private:
+    vortex::RAM& ram_;
+};
+
+} // namespace
+
+// Copies `bytes` bytes of device memory starting at `addr` into `dst`,
+// with the RAM's ACL checks suspended for the duration of the copy.
+void InProcessDevMem::read(uint64_t addr, void* dst, std::size_t bytes) {
+    AclBypass bypass(ram_);
+    ram_.read(static_cast<uint8_t*>(dst), addr, bytes);
+}
+
+// Copies `bytes` bytes from `src` into device memory starting at `addr`,
+// with the RAM's ACL checks suspended for the duration of the copy.
+void InProcessDevMem::write(uint64_t addr, const void* src, std::size_t bytes) {
+    AclBypass bypass(ram_);
+    ram_.write(static_cast<const uint8_t*>(src), addr, bytes);
+}
+
+} // namespace vortex_gem5
diff --git a/sim/simx/gem5/dev_mem.h b/sim/simx/gem5/dev_mem.h
new file mode 100644
index 000000000..cabe7587c
--- /dev/null
+++ b/sim/simx/gem5/dev_mem.h
@@ -0,0 +1,61 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Device-memory accessor seam for the gem5 backend.
+//
+// Per gem5_v2_cp_migration_proposal §2.5: every CP-side device-memory
+// access — CP ring fetches, completion writebacks, CMD_MEM_* DMA
+// payload — funnels through this interface. Vortex MemSim loads/stores
+// do not: in v1 the Processor is attached to the RAM directly. In v1
+// the only implementation is InProcessDevMem (wraps simx::RAM). In v2
+// a DmaPortDevMem will replace it; CP hooks and Vortex memory code are
+// untouched.
+//
+// Layered on top of simx::RAM rather than replacing it because Vortex's
+// existing MemSim already knows how to talk to RAM; we only need the
+// accessor seam for the CP side.
+
+#pragma once
+
+#include 
+#include 
+
+namespace vortex {
+class RAM;
+} // namespace vortex
+
+namespace vortex_gem5 {
+
+// Abstract byte-level accessor for device memory. Implementations
+// decide where the bytes live; callers only supply a device address,
+// a host buffer and a length.
+class DevMemAccessor {
+public:
+    // Virtual so implementations can be destroyed through a base pointer.
+    virtual ~DevMemAccessor() = default;
+
+    // read:  copy `bytes` bytes from device address `addr` into `dst`.
+    // write: copy `bytes` bytes from `src` to device address `addr`.
+    virtual void read (uint64_t addr, void* dst,       std::size_t bytes) = 0;
+    virtual void write(uint64_t addr, const void* src, std::size_t bytes) = 0;
+};
+
+// v1 backing: the simx::RAM the Processor already uses. ACL bypass is
+// the same pattern the simx/rtlsim CP hooks apply (sw/runtime/simx/
+// vortex.cpp:271-280) — the CP/DMA is a peer of the host runtime, not
+// a userspace caller subject to per-region page protections.
+class InProcessDevMem final : public DevMemAccessor {
+public:
+    // Stores a reference only — `ram` is not owned and must outlive
+    // this accessor.
+    explicit InProcessDevMem(vortex::RAM& ram) : ram_(ram) {}
+
+    // Both accessors are defined in dev_mem.cpp; each disables the
+    // RAM's ACL checks around the access and enables them afterwards.
+    void read (uint64_t addr, void* dst,       std::size_t bytes) override;
+    void write(uint64_t addr, const void* src, std::size_t bytes) override;
+
+private:
+    vortex::RAM& ram_;  // backing store, shared with the Processor
+};
+
+} // namespace vortex_gem5
diff --git a/sim/simx/gem5/install.sh b/sim/simx/gem5/install.sh
new file mode 100755
index 000000000..0d171f247
--- /dev/null
+++ b/sim/simx/gem5/install.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Install the Vortex gem5 SimObject sources into a pinned gem5 tree.
+#
+# Copies vortex_gpgpu_dev.{cc,hh}, VortexGPGPU.py, and SConscript into
+# $GEM5_HOME/src/dev/vortex/ so gem5's scons can build them. The
+# source-of-truth lives in the Vortex tree (this directory); any
+# change has to re-run this script before `scons build/<ISA>/gem5.opt`
+# picks it up.
+#
+# Idempotent: re-running just refreshes the files.
+#
+# Usage:
+#   GEM5_HOME=$HOME/tools/gem5 sim/simx/gem5/install.sh
+# or
+#   sim/simx/gem5/install.sh           # uses $GEM5_HOME from env
+
+set -e
+
+GEM5_HOME=${GEM5_HOME:-$HOME/tools/gem5}
+SOURCE_DIR=$(dirname "$(readlink -f "$0")")
+
+if [ ! -d "$GEM5_HOME/src/dev" ]; then
+    echo "ERROR: GEM5_HOME=$GEM5_HOME does not look like a gem5 tree" >&2
+    echo "       (expected $GEM5_HOME/src/dev/)" >&2
+    exit 1
+fi
+
+DEST_DIR="$GEM5_HOME/src/dev/vortex"
+mkdir -p "$DEST_DIR"
+
+install -m 0644 "$SOURCE_DIR/vortex_gpgpu_dev.hh" "$DEST_DIR/"
+install -m 0644 "$SOURCE_DIR/vortex_gpgpu_dev.cc" "$DEST_DIR/"
+install -m 0644 "$SOURCE_DIR/VortexGPGPU.py"      "$DEST_DIR/"
+install -m 0644 "$SOURCE_DIR/SConscript"          "$DEST_DIR/"
+
+echo "Vortex SimObjects installed at $DEST_DIR"
+echo "Files:"
+ls -1 "$DEST_DIR" | sed 's/^/  /'
+echo ""
+echo "Re-build gem5 with one or both of:"
+echo "  scons -C $GEM5_HOME build/X86/gem5.opt -j\$(nproc)"
+echo "  scons -C $GEM5_HOME build/ARM/gem5.opt -j\$(nproc)"
diff --git a/sim/simx/gem5/vortex_gpgpu.cpp b/sim/simx/gem5/vortex_gpgpu.cpp
new file mode 100644
index 000000000..fc2562cfd
--- /dev/null
+++ b/sim/simx/gem5/vortex_gpgpu.cpp
@@ -0,0 +1,299 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "vortex_gpgpu.h"
+
+#include "constants.h"
+#include "dev_mem.h"
+#include "processor.h"
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+using namespace vortex;
+
+// Mirrors sw/runtime/common/common.h's GLOBAL_MEM_SIZE so the bounds
+// check in vram_{read,write} matches what the host runtime enforces.
+// Inlined rather than including common.h because that header drags in
+// the full runtime ABI which a device library has no business touching.
+#if (XLEN == 64)
+static constexpr uint64_t GEM5_GLOBAL_MEM_SIZE = 0x200000000ull;  // 8 GB
+#else
+static constexpr uint64_t GEM5_GLOBAL_MEM_SIZE = 0x100000000ull;  // 4 GB
+#endif
+
+namespace {
+
+// Gem5Device — owns the Vortex Processor + RAM + CommandProcessor
+// triplet. The CP's hooks call back into proc_/dev_mem_, and the
+// SimObject drives cp_tick / vortex_tick on independent gem5 events.
+class Gem5Device {
+public:
+    // Members are initialized in declaration order (ram_, proc_,
+    // dev_mem_, cp_). make_cp_hooks() runs while cp_ is being built;
+    // the hooks only capture `this`, so they touch no member until the
+    // CP invokes them later.
+    Gem5Device()
+        : ram_(0, MEM_PAGE_SIZE),
+          proc_(std::make_unique<Processor>()),
+          dev_mem_(std::make_unique<vortex_gem5::InProcessDevMem>(ram_)),
+          cp_(make_cp_hooks()) {
+        proc_->attach_ram(&ram_);
+    }
+
+    ~Gem5Device() = default;
+
+    // ---------------- Standalone (Phase 3) kernel preload ---------------
+    // Primes the KMU DCRs for a 1×1×1 CTA at STARTUP_ADDR and loads the
+    // ELF/bin/hex into VRAM. After this, calling vortex_tick repeatedly
+    // dispatches the kernel to completion (ProcessorImpl::cycle's lazy
+    // init resets SimPlatform and calls kmu_->start() on first tick).
+    // The hosted (CP-driven) path never calls this — kernel ELFs land
+    // in VRAM via mem_upload, and KMU programming goes through CMD_DCR_*.
+    //
+    // Returns false (after printing a diagnostic) when the file
+    // extension is not one of vxbin/bin/hex; true otherwise. Note the
+    // DCRs are written before the extension is checked.
+    bool load_kernel(const std::string& path) {
+        const uint64_t startup_addr(STARTUP_ADDR);
+        proc_->dcr_write(VX_DCR_KMU_STARTUP_ADDR0, startup_addr & 0xffffffff);
+    #if (XLEN == 64)
+        proc_->dcr_write(VX_DCR_KMU_STARTUP_ADDR1, startup_addr >> 32);
+    #endif
+        proc_->dcr_write(VX_DCR_KMU_STARTUP_ARG0, 0);
+        proc_->dcr_write(VX_DCR_KMU_STARTUP_ARG1, 0);
+        proc_->dcr_write(VX_DCR_KMU_GRID_DIM_X,   1);
+        proc_->dcr_write(VX_DCR_KMU_GRID_DIM_Y,   1);
+        proc_->dcr_write(VX_DCR_KMU_GRID_DIM_Z,   1);
+        proc_->dcr_write(VX_DCR_KMU_BLOCK_DIM_X,  1);
+        proc_->dcr_write(VX_DCR_KMU_BLOCK_DIM_Y,  1);
+        proc_->dcr_write(VX_DCR_KMU_BLOCK_DIM_Z,  1);
+        proc_->dcr_write(VX_DCR_KMU_LMEM_SIZE,    0);
+        proc_->dcr_write(VX_DCR_KMU_BLOCK_SIZE,   1);
+        proc_->dcr_write(VX_DCR_KMU_WARP_STEP_X,  NUM_THREADS);
+        proc_->dcr_write(VX_DCR_KMU_WARP_STEP_Y,  0);
+        proc_->dcr_write(VX_DCR_KMU_WARP_STEP_Z,  0);
+
+        std::string ext(fileExtension(path.c_str()));
+        if (ext == "vxbin") {
+            ram_.loadVxImage(path.c_str());
+        } else if (ext == "bin") {
+            ram_.loadBinImage(path.c_str(), startup_addr);
+        } else if (ext == "hex") {
+            ram_.loadHexImage(path.c_str());
+        } else {
+            std::cerr << "vortex_gem5: unsupported kernel extension '" << ext
+                      << "' (need .vxbin, .bin, or .hex)" << std::endl;
+            return false;
+        }
+        // Mark the device as "running" so the SimObject's standalone
+        // path advances vortexTickEvent_ until ProcessorImpl::cycle()
+        // reports done. Hosted launches set this via vortex_start.
+        vortex_running_ = true;
+        return true;
+    }
+
+    // ---------------- VRAM direct access --------------------------------
+    // Both accessors silently drop requests that fall outside
+    // [0, GEM5_GLOBAL_MEM_SIZE); a diagnostic is printed in debug builds
+    // only. On a rejected read `dst` is left untouched.
+    void vram_write(uint64_t addr, const uint8_t* src, uint32_t size) {
+        if (!in_vram_range(addr, size)) {
+        #ifndef NDEBUG
+            std::cerr << "vortex_gem5: vram_write overflow addr=0x"
+                      << std::hex << addr << " size=" << std::dec << size
+                      << std::endl;
+        #endif
+            return;
+        }
+        dev_mem_->write(addr, src, size);
+    }
+    void vram_read(uint64_t addr, uint8_t* dst, uint32_t size) {
+        if (!in_vram_range(addr, size)) {
+        #ifndef NDEBUG
+            std::cerr << "vortex_gem5: vram_read overflow addr=0x"
+                      << std::hex << addr << " size=" << std::dec << size
+                      << std::endl;
+        #endif
+            return;
+        }
+        dev_mem_->read(addr, dst, size);
+    }
+
+    // ---------------- CP regfile MMIO -----------------------------------
+    // The SimObject's PIO handlers translate `cp_mmio_write(off,v)` to
+    // a single call here. The CommandProcessor's regfile is 32-bit and
+    // its address map is documented in sim/common/CommandProcessor.h.
+    void cp_mmio_write(uint32_t off, uint32_t value) { cp_.mmio_write(off, value); }
+    uint32_t cp_mmio_read (uint32_t off) const       { return cp_.mmio_read(off); }
+
+    // ---------------- CP tick / introspection ---------------------------
+    // tick() advances the CP one functional cycle and returns true iff
+    // the CP still has work to do. The SimObject reschedules
+    // cpTickEvent_ while true and sleeps otherwise — proposal §2.3.
+    // NOTE(review): cp_tick reports busy() alone while cp_has_work also
+    // requires enabled(); confirm a disabled-but-busy CP cannot keep
+    // the tick event rescheduling forever.
+    bool cp_tick() {
+        cp_.tick();
+        return cp_.busy();
+    }
+    bool cp_has_work() const { return cp_.enabled() && cp_.busy(); }
+
+    // ---------------- Vortex tick / introspection -----------------------
+    // vortex_tick advances ProcessorImpl::cycle() one step. cycle() does
+    // lazy init (resets SimPlatform + calls kmu_->start()) on first call.
+    // For back-to-back launches the CP's vortex_start hook calls
+    // processor_.start_kmu() explicitly to re-arm the KMU for the next
+    // kernel (kmu_->start is idempotent — first launch redundantly
+    // re-starts inside the lazy init, no harm).
+    bool vortex_tick() {
+        bool still_running = proc_->cycle();
+        if (!still_running) {
+            vortex_running_ = false;
+        }
+        return vortex_running_;
+    }
+    bool vortex_busy() const { return vortex_running_; }
+
+    // ---------------- vortex_start handler registration -----------------
+    // The SimObject registers a callback the CP fires when retiring a
+    // CMD_LAUNCH. The callback schedules vortexTickEvent_ at the next
+    // clock edge, decoupling CP and Vortex tick chains (proposal §2.4).
+    void set_start_handler(vortex_gem5_start_handler_t fn, void* ctx) {
+        start_fn_  = fn;
+        start_ctx_ = ctx;
+    }
+
+private:
+    // True iff [addr, addr + size) lies inside the device address
+    // space. Written without forming addr + size: that sum wraps for
+    // addresses near UINT64_MAX and would let an out-of-range access
+    // through the check.
+    static bool in_vram_range(uint64_t addr, uint32_t size) {
+        return addr <= GEM5_GLOBAL_MEM_SIZE
+            && size <= GEM5_GLOBAL_MEM_SIZE - addr;
+    }
+
+    // Builds the callback table handed to the CommandProcessor. Every
+    // hook captures `this` and forwards to dev_mem_ / proc_.
+    vortex::CommandProcessor::Hooks make_cp_hooks() {
+        vortex::CommandProcessor::Hooks h;
+        h.dram_read = [this](uint64_t addr, void* dst, std::size_t bytes) {
+            dev_mem_->read(addr, dst, bytes);
+        };
+        h.dram_write = [this](uint64_t addr, const void* src, std::size_t bytes) {
+            dev_mem_->write(addr, src, bytes);
+        };
+        h.vortex_dcr_write = [this](uint32_t addr, uint32_t value) {
+            proc_->dcr_write(addr, value);
+        };
+        h.vortex_dcr_read = [this](uint32_t addr, uint32_t tag) -> uint32_t {
+            uint32_t v = 0;
+            proc_->dcr_read(addr, tag, &v);
+            return v;
+        };
+        h.vortex_start = [this]() {
+            // Mark Vortex as in-flight so vortex_busy returns true on
+            // the very next CP poll (before the first cycle() runs).
+            // Then re-arm the KMU for the (possibly back-to-back)
+            // kernel and ask the SimObject to begin ticking Vortex.
+            vortex_running_ = true;
+            proc_->start_kmu();
+            if (start_fn_) start_fn_(start_ctx_);
+        };
+        h.vortex_busy = [this]() -> bool { return vortex_running_; };
+        return h;
+    }
+
+    RAM ram_;
+    std::unique_ptr<Processor> proc_;
+    std::unique_ptr<vortex_gem5::DevMemAccessor> dev_mem_;
+    vortex::CommandProcessor cp_;
+    bool vortex_running_ = false;
+    vortex_gem5_start_handler_t start_fn_  = nullptr;
+    void* start_ctx_ = nullptr;
+};
+
+} // namespace
+
+// ----- C ABI -----------------------------------------------------------------
+
+namespace {
+
+// Recovers the Gem5Device behind an opaque C handle. Callers are
+// expected to have rejected NULL already.
+inline Gem5Device* device_of(vortex_gem5_handle_t h) {
+    return reinterpret_cast<Gem5Device*>(h);
+}
+
+} // namespace
+
+extern "C" {
+
+// Formats the compile-time configuration into a function-local static
+// buffer and returns it; the caller must not free the pointer.
+const char* vortex_gem5_build_info(void) {
+    static char info[256];
+    std::snprintf(info, sizeof(info),
+                  "vortex-gem5 (XLEN=%d, threads=%d, warps=%d, cores=%d, clusters=%d)",
+                  XLEN, NUM_THREADS, NUM_WARPS, NUM_CORES, NUM_CLUSTERS);
+    return info;
+}
+
+// Allocates a device. Any exception thrown during construction is
+// reported on stderr and turned into a NULL return so it never crosses
+// the C boundary.
+vortex_gem5_handle_t vortex_gem5_create(void) {
+    try {
+        return reinterpret_cast<vortex_gem5_handle_t>(new Gem5Device());
+    } catch (const std::exception& e) {
+        std::cerr << "vortex_gem5_create: " << e.what() << std::endl;
+        return nullptr;
+    } catch (...) {
+        std::cerr << "vortex_gem5_create: unknown exception" << std::endl;
+        return nullptr;
+    }
+}
+
+// Destroys a device created by vortex_gem5_create. NULL is a no-op.
+void vortex_gem5_destroy(vortex_gem5_handle_t h) {
+    if (h == nullptr) return;
+    delete device_of(h);
+}
+
+// Stores the (fn, ctx) pair the device invokes from its vortex_start hook.
+void vortex_gem5_set_start_handler(vortex_gem5_handle_t h,
+                                   vortex_gem5_start_handler_t fn,
+                                   void* ctx) {
+    if (h == nullptr) return;
+    device_of(h)->set_start_handler(fn, ctx);
+}
+
+// Returns 0 on success and -1 on a NULL argument, an unsupported file
+// extension, or an exception raised while loading. Exceptions are
+// caught here, as in vortex_gem5_create, so they never unwind through
+// the C caller.
+int vortex_gem5_load_kernel(vortex_gem5_handle_t h, const char* path) {
+    if (h == nullptr || path == nullptr) return -1;
+    try {
+        return device_of(h)->load_kernel(path) ? 0 : -1;
+    } catch (const std::exception& e) {
+        std::cerr << "vortex_gem5_load_kernel: " << e.what() << std::endl;
+        return -1;
+    } catch (...) {
+        std::cerr << "vortex_gem5_load_kernel: unknown exception" << std::endl;
+        return -1;
+    }
+}
+
+// The remaining entry points are thin forwarders: a NULL handle (or
+// NULL buffer) yields a no-op / false / 0, otherwise the call is passed
+// to the matching Gem5Device method.
+
+void vortex_gem5_cp_mmio_write(vortex_gem5_handle_t h,
+                               uint32_t off, uint32_t value) {
+    if (h == nullptr) return;
+    device_of(h)->cp_mmio_write(off, value);
+}
+
+uint32_t vortex_gem5_cp_mmio_read(vortex_gem5_handle_t h, uint32_t off) {
+    if (h == nullptr) return 0;
+    return device_of(h)->cp_mmio_read(off);
+}
+
+bool vortex_gem5_cp_tick(vortex_gem5_handle_t h) {
+    if (h == nullptr) return false;
+    return device_of(h)->cp_tick();
+}
+
+bool vortex_gem5_cp_has_work(vortex_gem5_handle_t h) {
+    if (h == nullptr) return false;
+    return device_of(h)->cp_has_work();
+}
+
+bool vortex_gem5_vortex_tick(vortex_gem5_handle_t h) {
+    if (h == nullptr) return false;
+    return device_of(h)->vortex_tick();
+}
+
+bool vortex_gem5_vortex_busy(vortex_gem5_handle_t h) {
+    if (h == nullptr) return false;
+    return device_of(h)->vortex_busy();
+}
+
+void vortex_gem5_vram_write(vortex_gem5_handle_t h,
+                            uint64_t dev_addr, const uint8_t* src,
+                            uint32_t size) {
+    if (h == nullptr || src == nullptr) return;
+    device_of(h)->vram_write(dev_addr, src, size);
+}
+
+void vortex_gem5_vram_read(vortex_gem5_handle_t h,
+                           uint64_t dev_addr, uint8_t* dst,
+                           uint32_t size) {
+    if (h == nullptr || dst == nullptr) return;
+    device_of(h)->vram_read(dev_addr, dst, size);
+}
+
+} // extern "C"
diff --git a/sim/simx/gem5/vortex_gpgpu.h b/sim/simx/gem5/vortex_gpgpu.h
new file mode 100644
index 000000000..09cca06be
--- /dev/null
+++ b/sim/simx/gem5/vortex_gpgpu.h
@@ -0,0 +1,129 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// libvortex-gem5 — C ABI for the gem5 VortexGPGPU SimObject.
+//
+// Per gem5_v2_cp_migration_proposal §5.1 the device library hosts a
+// vortex::Processor + vortex::CommandProcessor pair, exposes a 32-bit
+// CP MMIO regfile (PIO_BASE_ADDR + 0x0 .. + 0x1FF), and provides two
+// independently-tickable engines so the SimObject can drive CP and
+// Vortex as separate gem5 event chains:
+//
+//     cpTickEvent_      -> vortex_gem5_cp_tick()
+//     vortexTickEvent_  -> vortex_gem5_vortex_tick()
+//
+// Both engines self-report whether they still have work via
+// vortex_gem5_cp_has_work() / vortex_gem5_vortex_busy(); the SimObject
+// uses those to decide whether to reschedule. The CP's vortex_start
+// hook calls back into the SimObject via the start-handler registered
+// at construction so a CMD_LAUNCH retirement schedules vortexTickEvent_
+// from inside cpTickEvent_'s execution.
+//
+// The ABI is C — not C++ — so the gem5 side does not depend on SimX's
+// internal types and can be rebuilt against a new gem5 release without
+// touching anything Vortex-side.
+//
+// Concurrency: all calls are serialized on the gem5 event-loop thread.
+// No internal locking. No re-entrancy.
+
+#pragma once
+
+#include 
+#include 
+#include 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Opaque handle. Owns a vortex::Processor, its RAM, a DevMemAccessor
+// over that RAM, and a vortex::CommandProcessor.
+typedef struct vortex_gem5_device_s* vortex_gem5_handle_t;
+
+// Returns a printable description of the build config (cores, warps,
+// threads, XLEN). Returned pointer is static; do not free.
+const char* vortex_gem5_build_info(void);
+
+// Construct a Vortex device instance. Returns NULL on failure.
+// VRAM is allocated lazily; no kernel is loaded until
+// vortex_gem5_load_kernel is called.
+vortex_gem5_handle_t vortex_gem5_create(void);
+
+// Destroy the device. Safe to call with NULL.
+void vortex_gem5_destroy(vortex_gem5_handle_t h);
+
+// Register a callback the device library invokes from inside its CP
+// vortex_start hook. The SimObject uses this to schedule its Vortex
+// tick event when the CP launches a kernel. Pass NULL to clear.
+// `ctx` is forwarded back unchanged.
+typedef void (*vortex_gem5_start_handler_t)(void* ctx);
+void vortex_gem5_set_start_handler(vortex_gem5_handle_t h,
+                                   vortex_gem5_start_handler_t fn,
+                                   void* ctx);
+
+// Load a kernel image into VRAM. Accepts .vxbin / .bin / .hex (same
+// shape as sim/simx/main.cpp). Primes the KMU DCRs for a 1×1×1 CTA
+// at STARTUP_ADDR for the Phase 3 standalone test path (in hosted
+// mode the dispatcher uploads kernels via mem_upload + programs KMU
+// DCRs via CMD_DCR_WRITE through the CP).
+//
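+// Returns 0 on success, -1 on a NULL argument or an unsupported file
+// extension. NOTE(review): a missing or unreadable file is handled
+// inside the RAM image loaders — confirm it surfaces as -1 here.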
+int vortex_gem5_load_kernel(vortex_gem5_handle_t h, const char* path);
+
+// CP regfile MMIO. `off` is the CP-internal byte offset (0..0x13F in
+// use today; see sim/common/CommandProcessor.h §address map). All
+// accesses are 32-bit. The host runtime's cp_mmio_{write,read}(off, v)
+// issues a 32-bit PIO access at `PIO_BASE_ADDR + off`, and the
+// SimObject translates that PIO packet into one of these calls (no AFU
+// bit-12 split — the gem5 device's PIO range IS the CP regfile).
+void     vortex_gem5_cp_mmio_write(vortex_gem5_handle_t h,
+                                   uint32_t off, uint32_t value);
+uint32_t vortex_gem5_cp_mmio_read (vortex_gem5_handle_t h, uint32_t off);
+
+// Advance the embedded CommandProcessor by one functional cycle.
+// Returns true if the CP has more work (ring non-empty, command in
+// flight) and should be ticked again.
+bool vortex_gem5_cp_tick(vortex_gem5_handle_t h);
+
+// True iff the CP would benefit from being ticked: enabled and busy.
+// The SimObject uses this from PIO write handlers (after a CP regfile
+// update may have armed work) to decide whether to schedule
+// cpTickEvent_.
+bool vortex_gem5_cp_has_work(vortex_gem5_handle_t h);
+
+// Advance the Vortex Processor by one cycle. Returns true while the
+// processor is still running (clusters active or channels carrying
+// packets); the SimObject's vortexTickEvent_ reschedules itself while
+// this returns true and stops otherwise.
+bool vortex_gem5_vortex_tick(vortex_gem5_handle_t h);
+
+// True iff Vortex is currently executing a kernel (any cluster
+// running, any in-flight memory transactions). Used by the CP's
+// vortex_busy hook to know when to retire a CMD_LAUNCH.
+bool vortex_gem5_vortex_busy(vortex_gem5_handle_t h);
+
+// Direct device-VRAM access used by the SimObject's DMA-path scratch
+// buffers in v1 (a peer of the host runtime, ACL-bypassed). v2 will
+// route both Vortex memory and CP DMA through gem5's memory hierarchy
+// via the same DevMemAccessor interface.
+void vortex_gem5_vram_write(vortex_gem5_handle_t h,
+                            uint64_t dev_addr, const uint8_t* src,
+                            uint32_t size);
+void vortex_gem5_vram_read (vortex_gem5_handle_t h,
+                            uint64_t dev_addr, uint8_t* dst,
+                            uint32_t size);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
diff --git a/sim/simx/gem5/vortex_gpgpu_dev.cc b/sim/simx/gem5/vortex_gpgpu_dev.cc
new file mode 100644
index 000000000..8bc54899c
--- /dev/null
+++ b/sim/simx/gem5/vortex_gpgpu_dev.cc
@@ -0,0 +1,277 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dev/vortex/vortex_gpgpu_dev.hh"
+
+#include "base/logging.hh"
+#include "base/trace.hh"
+#include "mem/packet_access.hh"
+#include "sim/sim_exit.hh"
+
+#include 
+
+namespace gem5
+{
+
+namespace {
+
+// Looks up `symbol` in the library behind `handle` and returns it cast
+// to the function-pointer type T. Terminates via fatal() when the
+// lookup yields NULL; `libpath` is used only in the error message.
+template <typename T>
+T dlsym_or_fatal(void* handle, const char* symbol, const char* libpath)
+{
+    // Clear any stale error so the message below describes this lookup.
+    dlerror();
+    void* p = dlsym(handle, symbol);
+    if (p == nullptr) {
+        // dlerror() returns NULL when the symbol exists but its value
+        // is NULL; never hand a NULL pointer to the %s formatter.
+        const char* err = dlerror();
+        fatal("VortexGPGPU: dlsym(%s) failed in %s: %s",
+              symbol, libpath,
+              err != nullptr ? err : "symbol resolved to NULL");
+    }
+    return reinterpret_cast<T>(p);
+}
+
+} // namespace
+
+// Loads libvortex-gem5.so, resolves the whole C ABI, creates the device
+// instance and registers the vortex_start trampoline. Every failure is
+// fatal(): a device without its library cannot simulate anything.
+VortexGPGPU::VortexGPGPU(const Params &p)
+  : DmaDevice(p),
+    libHandle_(nullptr),
+    deviceHandle_(nullptr),
+    abi_{},
+    libraryPath_(p.library),
+    kernelPath_(p.kernel),
+    pioAddr_(p.pio_addr),
+    pioSize_(p.pio_size),
+    pinAddr_(p.pin_addr),
+    pinSize_(p.pin_size),
+    pioLatency_(p.pio_latency),
+    cpTickEvent_([this]{ this->cpTick(); }, name() + ".cpTickEvent"),
+    vortexTickEvent_([this]{ this->vortexTick(); }, name() + ".vortexTickEvent"),
+    standalone_(false)
+{
+    if (libraryPath_.empty()) {
+        fatal("VortexGPGPU: 'library' parameter is required "
+              "(path to libvortex-gem5.so)");
+    }
+
+    // RTLD_NOW: resolve every undefined symbol of the library at load
+    // time. With RTLD_LAZY an unresolved function reference inside the
+    // library would only surface at its first call, mid-simulation —
+    // the opposite of the fail-at-construction policy stated below.
+    libHandle_ = dlopen(libraryPath_.c_str(), RTLD_NOW | RTLD_LOCAL);
+    if (libHandle_ == nullptr) {
+        fatal("VortexGPGPU: dlopen('%s') failed: %s",
+              libraryPath_, dlerror());
+    }
+
+    // Resolve the v2 ABI surface. Any missing symbol is a hard build
+    // mismatch — fatal at construction rather than mid-simulation.
+    // decltype ties each lookup to the type of the slot it fills.
+    const char* lib = libraryPath_.c_str();
+    abi_.build_info        = dlsym_or_fatal<decltype(abi_.build_info)>
+                              (libHandle_, "vortex_gem5_build_info",        lib);
+    abi_.create            = dlsym_or_fatal<decltype(abi_.create)>
+                              (libHandle_, "vortex_gem5_create",            lib);
+    abi_.destroy           = dlsym_or_fatal<decltype(abi_.destroy)>
+                              (libHandle_, "vortex_gem5_destroy",           lib);
+    abi_.set_start_handler = dlsym_or_fatal<decltype(abi_.set_start_handler)>
+                              (libHandle_, "vortex_gem5_set_start_handler", lib);
+    abi_.load_kernel       = dlsym_or_fatal<decltype(abi_.load_kernel)>
+                              (libHandle_, "vortex_gem5_load_kernel",       lib);
+    abi_.cp_mmio_write     = dlsym_or_fatal<decltype(abi_.cp_mmio_write)>
+                              (libHandle_, "vortex_gem5_cp_mmio_write",     lib);
+    abi_.cp_mmio_read      = dlsym_or_fatal<decltype(abi_.cp_mmio_read)>
+                              (libHandle_, "vortex_gem5_cp_mmio_read",      lib);
+    abi_.cp_tick           = dlsym_or_fatal<decltype(abi_.cp_tick)>
+                              (libHandle_, "vortex_gem5_cp_tick",           lib);
+    abi_.cp_has_work       = dlsym_or_fatal<decltype(abi_.cp_has_work)>
+                              (libHandle_, "vortex_gem5_cp_has_work",       lib);
+    abi_.vortex_tick       = dlsym_or_fatal<decltype(abi_.vortex_tick)>
+                              (libHandle_, "vortex_gem5_vortex_tick",       lib);
+    abi_.vortex_busy       = dlsym_or_fatal<decltype(abi_.vortex_busy)>
+                              (libHandle_, "vortex_gem5_vortex_busy",       lib);
+    abi_.vram_write        = dlsym_or_fatal<decltype(abi_.vram_write)>
+                              (libHandle_, "vortex_gem5_vram_write",        lib);
+    abi_.vram_read         = dlsym_or_fatal<decltype(abi_.vram_read)>
+                              (libHandle_, "vortex_gem5_vram_read",         lib);
+
+    // %llx expects unsigned long long, hence the explicit casts.
+    inform("VortexGPGPU: %s", abi_.build_info());
+    inform("VortexGPGPU: library=%s", libraryPath_);
+    inform("VortexGPGPU: pio[CP regfile]=[0x%llx,+0x%llx)",
+           static_cast<unsigned long long>(pioAddr_),
+           static_cast<unsigned long long>(pioSize_));
+    if (pinSize_ != 0) {
+        inform("VortexGPGPU: pin[BAR-mapped VRAM]=[0x%llx,+0x%llx)",
+               static_cast<unsigned long long>(pinAddr_),
+               static_cast<unsigned long long>(pinSize_));
+    }
+
+    deviceHandle_ = abi_.create();
+    if (deviceHandle_ == nullptr) {
+        fatal("VortexGPGPU: vortex_gem5_create returned NULL");
+    }
+
+    // Register the vortex_start trampoline so the CP can schedule
+    // Vortex ticks from inside cp_tick (proposal §2.4).
+    abi_.set_start_handler(deviceHandle_, &VortexGPGPU::onVortexStartTrampoline, this);
+}
+
+// Tears down in reverse order of construction.
+VortexGPGPU::~VortexGPGPU()
+{
+    // Destroy the device first: abi_.destroy was resolved from the
+    // library, so it has to run before the library is unloaded.
+    if (deviceHandle_ != nullptr && abi_.destroy != nullptr) {
+        abi_.destroy(deviceHandle_);
+    }
+    // Release the handle obtained from dlopen() in the constructor.
+    if (libHandle_ != nullptr) {
+        dlclose(libHandle_);
+    }
+}
+
+// No device-specific initialization; forwards to the base class.
+void
+VortexGPGPU::init()
+{
+    DmaDevice::init();
+}
+
+// Picks the operating mode from the `kernel` parameter: an empty path
+// means hosted mode (idle until the host drives the CP), a non-empty
+// path means standalone mode (preload the image and self-tick).
+void
+VortexGPGPU::startup()
+{
+    DmaDevice::startup();
+
+    if (kernelPath_.empty()) {
+        // Hosted mode (proposal §4): the host runtime configures queues
+        // and commits commands through CP MMIO writes; CP ticks are
+        // woken via maybeWakeCp() and Vortex ticks via the start
+        // handler. Nothing is scheduled at boot.
+        inform("VortexGPGPU: hosted mode (waiting for CP enable)");
+        standalone_ = false;
+        return;
+    }
+
+    // Standalone mode (Phase 3): smoke test for the device library.
+    // The kernel is preloaded and driven to completion without a host
+    // CPU or the CP.
+    inform("VortexGPGPU: standalone mode (preload + auto-tick)");
+    inform("VortexGPGPU: preloading kernel=%s", kernelPath_);
+    const int rc = abi_.load_kernel(deviceHandle_, kernelPath_.c_str());
+    if (rc != 0) {
+        fatal("VortexGPGPU: vortex_gem5_load_kernel('%s') failed",
+              kernelPath_);
+    }
+    standalone_ = true;
+    schedule(vortexTickEvent_, clockEdge(Cycles(1)));
+}
+
+// Runs one CP step in the device library and re-arms cpTickEvent_ for
+// the next clock edge only while the library reports the CP as busy.
+void
+VortexGPGPU::cpTick()
+{
+    if (!abi_.cp_tick(deviceHandle_)) {
+        // Idle drop-out: no reschedule. PIO writes that arm new work
+        // call maybeWakeCp(), which schedules the event again.
+        return;
+    }
+    schedule(cpTickEvent_, clockEdge(Cycles(1)));
+}
+
+// Runs one Vortex step in the device library. While the library reports
+// it as still running the event is re-armed for the next clock edge;
+// once it stops, standalone mode ends the simulation and hosted mode
+// hands control back to the CP.
+void
+VortexGPGPU::vortexTick()
+{
+    if (abi_.vortex_tick(deviceHandle_)) {
+        schedule(vortexTickEvent_, clockEdge(Cycles(1)));
+        return;
+    }
+
+    if (!standalone_) {
+        // Hosted mode: Vortex finished. The CP's launch FSM observes
+        // vortex_busy() == false on its next tick and retires the
+        // CMD_LAUNCH. If the CP is already idle (no scheduled tick) it
+        // has to be woken so the retirement actually happens.
+        maybeWakeCp();
+        return;
+    }
+
+    // Standalone mode: the preloaded kernel is done, so leave the loop.
+    inform("VortexGPGPU: standalone kernel complete — exiting sim loop");
+    exitSimLoop("VortexGPGPU: kernel complete");
+}
+
+// Schedules cpTickEvent_ for the next clock edge when the library says
+// the CP has work and no CP tick is already pending.
+void
+VortexGPGPU::maybeWakeCp()
+{
+    // The library is queried first; the pending-event check only runs
+    // when there is work to do.
+    const bool cp_pending = abi_.cp_has_work(deviceHandle_);
+    if (!cp_pending || cpTickEvent_.scheduled()) {
+        return;
+    }
+    schedule(cpTickEvent_, clockEdge(Cycles(1)));
+}
+
+// Plain-function entry point registered with the device library via
+// set_start_handler(). `ctx` is the VortexGPGPU instance that was passed
+// at registration time; the call is forwarded to its onVortexStart().
+void
+VortexGPGPU::onVortexStartTrampoline(void* ctx)
+{
+    // static_cast needs an explicit target type to recover the object
+    // from the opaque context pointer.
+    static_cast<VortexGPGPU*>(ctx)->onVortexStart();
+}
+
+// Start-handler body: arms vortexTickEvent_ for the next clock edge
+// unless a Vortex tick is already queued.
+void
+VortexGPGPU::onVortexStart()
+{
+    if (vortexTickEvent_.scheduled()) {
+        return;
+    }
+    schedule(vortexTickEvent_, clockEdge(Cycles(1)));
+}
+
+// Services a read packet addressed to this device.
+//
+// Addresses inside [pioAddr_, pioAddr_ + pioSize_) are CP regfile reads
+// answered through cp_mmio_read(); every other address is treated as a
+// BAR-mapped VRAM read relative to pinAddr_ and answered through
+// vram_read(). Both paths turn the packet into a response and return
+// pioLatency_.
+Tick
+VortexGPGPU::read(PacketPtr pkt)
+{
+    const Addr a = pkt->getAddr();
+    if (a >= pioAddr_ && a < pioAddr_ + pioSize_) {
+        // CP regfile access — 32-bit only. The register value is
+        // widened and stored little-endian into the packet.
+        const uint32_t off = uint32_t(a - pioAddr_);
+        const uint32_t value = abi_.cp_mmio_read(deviceHandle_, off);
+        pkt->setUintX(static_cast<uint64_t>(value), ByteOrder::little);
+        pkt->makeAtomicResponse();
+        return pioLatency_;
+    }
+    // BAR-mapped VRAM access (CPU is reading device memory directly).
+    // Variable-width packet (host load / cache-line fill): the library
+    // fills the packet's data buffer in place.
+    const uint64_t dev_addr = a - pinAddr_;
+    abi_.vram_read(deviceHandle_,
+                   dev_addr,
+                   pkt->getPtr<uint8_t>(),
+                   uint32_t(pkt->getSize()));
+    pkt->makeAtomicResponse();
+    return pioLatency_;
+}
+
+// Services a write packet addressed to this device.
+//
+// Addresses inside [pioAddr_, pioAddr_ + pioSize_) are CP regfile writes
+// forwarded through cp_mmio_write(), followed by maybeWakeCp(). Every
+// other address is treated as a BAR-mapped VRAM write relative to
+// pinAddr_ and forwarded through vram_write(). Both paths turn the
+// packet into a response and return pioLatency_.
+Tick
+VortexGPGPU::write(PacketPtr pkt)
+{
+    const Addr a = pkt->getAddr();
+    if (a >= pioAddr_ && a < pioAddr_ + pioSize_) {
+        // CP regfile write — 32-bit only. The packet payload is read
+        // little-endian and truncated to the low 32 bits.
+        const uint32_t off = uint32_t(a - pioAddr_);
+        const uint64_t raw = pkt->getUintX(ByteOrder::little);
+        abi_.cp_mmio_write(deviceHandle_, off, uint32_t(raw));
+        maybeWakeCp();
+        pkt->makeAtomicResponse();
+        return pioLatency_;
+    }
+    // BAR-mapped VRAM write — variable-width packet (host store /
+    // cache writeback). Forwards into in-process simx::RAM via
+    // dev_mem_, so subsequent CP dram_read / Vortex MemSim reads at
+    // the same dev_addr see the bytes the CPU just wrote.
+    const uint64_t dev_addr = a - pinAddr_;
+    abi_.vram_write(deviceHandle_,
+                    dev_addr,
+                    pkt->getConstPtr<uint8_t>(),
+                    uint32_t(pkt->getSize()));
+    // Writes to device VRAM may seed CP ring entries; if the CP is
+    // dormant, leave it dormant (the CP only wakes on a doorbell PIO
+    // write, not on a ring-fill).
+    pkt->makeAtomicResponse();
+    return pioLatency_;
+}
+
+// Reports the address windows this device answers to: the CP regfile
+// window always, and the VRAM window only when its size is non-zero.
+AddrRangeList
+VortexGPGPU::getAddrRanges() const
+{
+    AddrRangeList advertised;
+    advertised.push_back(RangeSize(pioAddr_, pioSize_));
+
+    const bool has_vram_window = (pinSize_ != 0);
+    if (has_vram_window)
+        advertised.push_back(RangeSize(pinAddr_, pinSize_));
+
+    return advertised;
+}
+
+} // namespace gem5
diff --git a/sim/simx/gem5/vortex_gpgpu_dev.hh b/sim/simx/gem5/vortex_gpgpu_dev.hh
new file mode 100644
index 000000000..54aab326f
--- /dev/null
+++ b/sim/simx/gem5/vortex_gpgpu_dev.hh
@@ -0,0 +1,133 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// VortexGPGPU — gem5 SimObject wrapper for libvortex-gem5.so.
+//
+// Lives at $GEM5_HOME/src/dev/vortex/vortex_gpgpu_dev.{cc,hh} after
+// sim/simx/gem5/install.sh runs. The host-side source of truth is the
+// Vortex tree (sim/simx/gem5/) so API drift between gem5 and the Vortex
+// C ABI surfaces as a build error in Vortex CI, not as a gem5
+// integration mystery.
+//
+// Design (gem5_v2_cp_migration_proposal §2.3, §2.4):
+//   - dlopen the Vortex library at construction; resolve all
+//     vortex_gem5_* symbols up-front so the hot paths (cpTick,
+//     vortexTick, PIO read/write) are direct indirect calls.
+//   - PIO range is exactly the CP regfile (PIO_BASE_ADDR + 0..+0x1FF,
+//     proposal §3); no legacy OPAE register window.
+//   - cpTickEvent_ self-schedules only while the CP has work; goes
+//     dormant otherwise (proposal §2.3). PIO writes that may have
+//     armed work re-arm the schedule.
+//   - vortexTickEvent_ self-schedules only while Vortex is running;
+//     scheduled by the CP's vortex_start hook via the registered
+//     start handler (proposal §2.4). Standalone mode skips the CP
+//     and schedules vortexTickEvent_ directly at startup.
+//   - DmaDevice base class kept for forward compatibility with the
+//     v2 DMA-port seam (proposal §2.5) and for the standalone smoke
+//     test path that still uses gem5's pio interface.
+
+#ifndef __DEV_VORTEX_VORTEX_GPGPU_DEV_HH__
+#define __DEV_VORTEX_VORTEX_GPGPU_DEV_HH__
+
+#include "dev/dma_device.hh"
+#include "dev/io_device.hh"
+#include "params/VortexGPGPU.hh"
+#include "sim/eventq.hh"
+
+#include 
+#include 
+
+namespace gem5
+{
+
+// gem5 device model that forwards register and VRAM accesses to the
+// Vortex device library and drives its CP and Vortex clocks from two
+// gem5 events.
+class VortexGPGPU : public DmaDevice
+{
+public:
+    using Params = VortexGPGPUParams;
+
+    // The destructor destroys the device instance and closes the
+    // library handle.
+    VortexGPGPU(const Params &p);
+    ~VortexGPGPU() override;
+
+    // PioDevice interface
+    // read()/write() serve both the CP regfile window and the VRAM
+    // window; getAddrRanges() advertises the VRAM window only when its
+    // size is non-zero.
+    Tick read(PacketPtr pkt) override;
+    Tick write(PacketPtr pkt) override;
+    AddrRangeList getAddrRanges() const override;
+
+    // SimObject lifecycle
+    // startup() picks standalone vs. hosted mode.
+    void init() override;
+    void startup() override;
+
+private:
+    // CP tick — advances the embedded CommandProcessor one functional
+    // cycle. Self-reschedules iff cp_tick reported still-busy.
+    void cpTick();
+
+    // Vortex tick — advances the Vortex Processor one cycle.
+    // Self-reschedules iff vortex_tick reported still-running.
+    // Standalone mode exits the sim loop when vortex_tick returns false.
+    void vortexTick();
+
+    // Called from a PIO write to schedule cpTickEvent_ if (a) the CP
+    // reports new work and (b) the event isn't already pending.
+    // Also called from vortexTick() in hosted mode once Vortex stops.
+    void maybeWakeCp();
+
+    // Static trampoline registered with the device library so the CP's
+    // vortex_start hook can schedule vortexTickEvent_ via the gem5
+    // event scheduler. Passing `this` via the void* ctx avoids any
+    // dependency on gem5 types in the library.
+    static void onVortexStartTrampoline(void* ctx);
+    void onVortexStart();
+
+    // Library binding ------------------------------------------------
+    // libHandle_ is the handle the vortex_gem5_* symbols are resolved
+    // from and that the destructor passes to dlclose(). deviceHandle_
+    // is the opaque instance returned by abi_.create() and passed as
+    // `h` to every other ABI call.
+    void* libHandle_;
+    void* deviceHandle_;
+
+    // Function-pointer table for the library's C ABI. The members seen
+    // being bound are looked up by name as "vortex_gem5_<member>".
+    struct AbiV2 {
+        const char* (*build_info)(void);
+        void*       (*create)(void);
+        void        (*destroy)(void* h);
+        void        (*set_start_handler)(void* h, void (*fn)(void*), void* ctx);
+        // Returns 0 on success; startup() treats anything else as fatal.
+        int         (*load_kernel)(void* h, const char* path);
+        void        (*cp_mmio_write)(void* h, uint32_t off, uint32_t value);
+        uint32_t    (*cp_mmio_read)(void* h, uint32_t off);
+        // The two tick calls return true while further ticks are needed.
+        bool        (*cp_tick)(void* h);
+        bool        (*cp_has_work)(void* h);
+        bool        (*vortex_tick)(void* h);
+        bool        (*vortex_busy)(void* h);
+        void        (*vram_write)(void* h, uint64_t addr,
+                                  const uint8_t* src, uint32_t size);
+        void        (*vram_read)(void* h, uint64_t addr,
+                                 uint8_t* dst, uint32_t size);
+    } abi_;
+
+    // Configuration --------------------------------------------------
+    // pioAddr_/pioSize_ bound the CP regfile window; pioLatency_ is the
+    // latency returned from every read() and write().
+    const std::string libraryPath_;
+    const std::string kernelPath_;
+    const Addr        pioAddr_;
+    const Addr        pioSize_;
+    const Addr        pinAddr_;   // device VRAM, host-visible as BAR
+    const Addr        pinSize_;
+    const Tick        pioLatency_;
+
+    // Event scheduling
+    EventFunctionWrapper cpTickEvent_;
+    EventFunctionWrapper vortexTickEvent_;
+
+    // Standalone (Phase 3) vs. hosted mode. Set by startup() based on
+    // whether the `kernel=` Python param was provided.
+    bool standalone_;
+};
+
+} // namespace gem5
+
+#endif // __DEV_VORTEX_VORTEX_GPGPU_DEV_HH__
diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp
index b173e4195..40dc9226a 100644
--- a/sim/simx/processor.cpp
+++ b/sim/simx/processor.cpp
@@ -231,6 +231,22 @@ void ProcessorImpl::reset() {
   perf_mem_writes_ = 0;
   perf_mem_latency_ = 0;
   perf_mem_pending_reads_ = 0;
+  is_cycle_initialized_ = false;
+}
+
+// Advances the simulation by one platform tick and returns the value of
+// any_running() afterwards, so a caller can keep stepping while it is
+// true.
+bool ProcessorImpl::cycle() {
+  // Lazy first-call init mirrors run()'s top-of-loop sequence so the
+  // external driver doesn't need to choreograph reset + kmu start
+  // separately. reset() clears is_cycle_initialized_ so a back-to-back
+  // kernel launch re-dispatches.
+  if (!is_cycle_initialized_) {
+    this->reset();
+    kmu_->start();
+    // Must come after reset(), which sets the flag back to false.
+    is_cycle_initialized_ = true;
+  }
+  SimPlatform::instance().tick();
+  // Add the current pending-read count to the running latency total.
+  perf_mem_latency_ += perf_mem_pending_reads_;
+  return this->any_running();
 }
 
 int ProcessorImpl::dcr_write(uint32_t addr, uint32_t value) {
@@ -333,6 +349,14 @@ int Processor::run() {
   return -1;
 }
 
+// Public single-step entry point; forwards to the implementation object.
+bool Processor::cycle() {
+  return impl_->cycle();
+}
+
+// Returns the memory module pointer held by the implementation object.
+Memory* Processor::memsim() {
+  return impl_->memsim();
+}
+
 int Processor::dcr_write(uint32_t addr, uint32_t value) {
   return impl_->dcr_write(addr, value);
 }
diff --git a/sim/simx/processor.h b/sim/simx/processor.h
index 129cfdc46..04b57f037 100644
--- a/sim/simx/processor.h
+++ b/sim/simx/processor.h
@@ -20,6 +20,7 @@
 namespace vortex {
 
 class RAM;
+class Memory;
 class ProcessorImpl;
 
 class Processor {
@@ -33,12 +34,29 @@ class Processor {
 
   int run();
 
+  // Advance the simulator by one cycle. On the first call after a
+  // reset() (or on the very first call), the KMU is started so warps
+  // dispatch into the cluster. Returns true while work remains
+  // (clusters running or channels carrying packets); false once the
+  // program has finished and the channels have drained.
+  //
+  // Used by external simulators that drive Vortex's clock from their
+  // own event loop (SST in sim/simx/sst/, gem5 in sim/simx/gem5/).
+  bool cycle();
+
   void start_kmu();
 
   bool any_running() const;
 
   class Core* get_first_core() const;
 
+  // Returns the processor's memory module. Used by external simulators
+  // (SST, gem5) to install a pre-send hook on Memory::tick that mirrors
+  // accepted requests to their own memory hierarchy for timing
+  // observability. The local data path stays in Vortex's RAM — this is
+  // a peek, not a substitute.
+  Memory* memsim();
+
   int dcr_write(uint32_t addr, uint32_t value);
 
   int dcr_read(uint32_t addr, uint32_t tag, uint32_t* value);
diff --git a/sim/simx/processor_impl.h b/sim/simx/processor_impl.h
index 0f66471b6..4d2b6fef4 100644
--- a/sim/simx/processor_impl.h
+++ b/sim/simx/processor_impl.h
@@ -40,6 +40,11 @@ class ProcessorImpl {
 
   int run();
 
+  // Single-cycle step; see Processor::cycle() doc. Lazily initializes
+  // (resets + starts KMU) on the first call after construction or
+  // after reset() has been invoked.
+  bool cycle();
+
   int dcr_write(uint32_t addr, uint32_t value);
 
   int dcr_read(uint32_t addr, uint32_t tag, uint32_t* value);
@@ -48,6 +53,8 @@ class ProcessorImpl {
 
   Kmu& kmu()       { return *kmu_; }
 
+  Memory* memsim() { return memsim_.get(); }
+
   bool any_running() const;
 
   class Core* get_first_core() const;
@@ -67,6 +74,10 @@ class ProcessorImpl {
   uint64_t perf_mem_writes_;
   uint64_t perf_mem_latency_;
   uint64_t perf_mem_pending_reads_;
+  // Tracks whether cycle() has done its first-call init (reset +
+  // kmu_->start()). reset() clears it so a back-to-back kernel launch
+  // via cycle() re-dispatches the KMU.
+  bool is_cycle_initialized_;
 };
 
 }
diff --git a/sw/common/bitmanip.h b/sw/common/bitmanip.h
index c4fe9e8da..5c7268385 100644
--- a/sw/common/bitmanip.h
+++ b/sw/common/bitmanip.h
@@ -14,6 +14,8 @@
 #pragma once
 
 #include 
+#include 
+#include 
 #include 
 
 namespace vortex {
diff --git a/sw/runtime/gem5/Makefile b/sw/runtime/gem5/Makefile
new file mode 100644
index 000000000..259bda5d9
--- /dev/null
+++ b/sw/runtime/gem5/Makefile
@@ -0,0 +1,66 @@
+include ../common.mk
+
+# HOST_ARCH selects the cross-compiler for the simulated host ISA
+# inside gem5 (see docs/proposals/gem5_simx_v3_proposal.md §3.5).
+# Default x86_64 has no toolchain install requirement; aarch64/armhf
+# need ci/gem5_install.sh to have run sudo-apt for the cross-compilers.
+HOST_ARCH ?= x86_64
+
+# Root output directory. Native (x86_64) builds are written here;
+# cross builds go into a per-architecture subdirectory (OUT_DIR below).
+DESTDIR ?= $(CURDIR)/..
+
+# Location of this backend's sources inside the Vortex tree.
+SRC_DIR := $(VORTEX_HOME)/sw/runtime/gem5
+
+# Strict warnings are fatal (-Werror), and the library is built as
+# position-independent code because it is linked with -shared.
+CXXFLAGS += -std=c++17 -Wall -Wextra -pedantic -Wfatal-errors -Werror
+CXXFLAGS += -I$(INC_DIR) -I$(ROOT_DIR)/sw -I$(ROOT_DIR)/hw -I$(DESTDIR) -I$(SW_COMMON_DIR) -I$(RT_COMMON_DIR)
+CXXFLAGS += -DXLEN_$(XLEN)
+CXXFLAGS += -fPIC
+CXXFLAGS += $(CONFIGS)
+
+# Per-arch compiler selection. The cross-compilers are sysroot-aware
+# (Ubuntu's gcc-aarch64-linux-gnu ships the matching libstdc++); no
+# extra --sysroot flags needed.
+#
+# Cross-compiled outputs land in $(DESTDIR)/$(HOST_ARCH)/ alongside
+# the stub's libvortex.so (also cross-compiled). The simulated ARM
+# process's LD_LIBRARY_PATH points at that one dir to find both.
+#
+# Any HOST_ARCH value other than the three listed stops the build with
+# an error at parse time.
+ifeq ($(HOST_ARCH),x86_64)
+    CXX := g++
+    ARCH_SUFFIX := x86_64
+    OUT_DIR := $(DESTDIR)
+else ifeq ($(HOST_ARCH),aarch64)
+    CXX := aarch64-linux-gnu-g++
+    ARCH_SUFFIX := aarch64
+    OUT_DIR := $(DESTDIR)/aarch64
+else ifeq ($(HOST_ARCH),armhf)
+    CXX := arm-linux-gnueabihf-g++
+    ARCH_SUFFIX := armhf
+    OUT_DIR := $(DESTDIR)/armhf
+else
+    $(error HOST_ARCH must be one of: x86_64, aarch64, armhf (got $(HOST_ARCH)))
+endif
+
+# Produce a shared object and link with pthread support.
+LDFLAGS += -shared -pthread
+
+# Backend sources plus the utility source from the common runtime dir.
+SRCS = $(SRC_DIR)/vortex.cpp $(SRC_DIR)/driver.cpp $(RT_COMMON_DIR)/utils.cpp
+
+# Debug / release
+# Defining DEBUG builds unoptimized with debug info; otherwise the build
+# is optimized (-O2) with NDEBUG defined.
+ifdef DEBUG
+    CXXFLAGS += -g -O0
+else
+    CXXFLAGS += -O2 -DNDEBUG
+endif
+
+# Library file name carries the architecture suffix, e.g.
+# libvortex-gem5-aarch64.so.
+PROJECT := libvortex-gem5-$(ARCH_SUFFIX).so
+
+.PHONY: all force clean
+
+# Default goal: the shared library for the selected HOST_ARCH.
+all: $(OUT_DIR)/$(PROJECT)
+
+# Compile and link all sources in a single compiler invocation. The
+# output directory is created on demand; the soname equals the file
+# name, so it carries the architecture suffix too.
+$(OUT_DIR)/$(PROJECT): $(SRCS)
+	@mkdir -p $(@D)
+	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -Wl,-soname,$(PROJECT) -o $@
+
+# Remove the libraries of every supported architecture, not only the
+# one currently selected by HOST_ARCH.
+clean:
+	rm -f $(DESTDIR)/libvortex-gem5-*.so
+	rm -f $(DESTDIR)/aarch64/libvortex-gem5-*.so
+	rm -f $(DESTDIR)/armhf/libvortex-gem5-*.so
diff --git a/sw/runtime/gem5/driver.cpp b/sw/runtime/gem5/driver.cpp
new file mode 100644
index 000000000..e00f72f77
--- /dev/null
+++ b/sw/runtime/gem5/driver.cpp
@@ -0,0 +1,62 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "driver.h"
+
+namespace vortex {
+
+namespace {
+// Set by drv_init() and cleared by drv_close(). Nothing in this file
+// reads it back.
+bool g_inited = false;
+}
+
+// Marks the driver as initialized. Always returns 0.
+int drv_init() {
+    // The two fixed regions (PIO and PIN) are mapped by the gem5
+    // SE-mode setup before this binary runs. No mmap() here because
+    // SE-mode has no /dev/vortex; the Python config arranges the
+    // address space directly. If this runtime is ever ported to a
+    // real OS with a kernel driver, drv_init() becomes
+    // open("/dev/vortex_gem5") + mmap() for both regions.
+    g_inited = true;
+    return 0;
+}
+
+// Marks the driver as shut down. No resources are released here.
+void drv_close() {
+    g_inited = false;
+}
+
+// Reads the 32-bit CP register at byte offset `offset` from
+// PIO_BASE_ADDR. The pointer is volatile-qualified so the compiler
+// performs the load exactly as written instead of caching or eliding it.
+uint32_t mmio_read32(uint32_t offset) {
+    auto* p = reinterpret_cast<volatile uint32_t*>(PIO_BASE_ADDR + offset);
+    return *p;
+}
+
+// Writes `value` to the 32-bit CP register at byte offset `offset` from
+// PIO_BASE_ADDR. Volatile for the same reason as mmio_read32(): the
+// store must not be merged, reordered by the compiler, or dropped.
+void mmio_write32(uint32_t offset, uint32_t value) {
+    auto* p = reinterpret_cast<volatile uint32_t*>(PIO_BASE_ADDR + offset);
+    *p = value;
+}
+
+// Publish prior stores before the next MMIO write. The host CPU model
+// in gem5 (especially out-of-order variants like O3CPU) can reorder
+// MMIO writes and surrounding stores; the dispatcher must guarantee
+// that ring-buffer payloads land in device memory before Q_TAIL_HI is
+// observed by the CP. The barrier is per-HOST_ARCH.
+//
+// On targets other than x86 and ARM only a compiler barrier is emitted
+// (the empty asm with a "memory" clobber); no hardware fence is issued.
+void mmio_fence() {
+#if defined(__x86_64__) || defined(__i386__)
+    __asm__ __volatile__ ("mfence" ::: "memory");
+#elif defined(__aarch64__) || defined(__arm__)
+    __asm__ __volatile__ ("dmb sy" ::: "memory");
+#else
+    __asm__ __volatile__ ("" ::: "memory");
+#endif
+}
+
+} // namespace vortex
diff --git a/sw/runtime/gem5/driver.h b/sw/runtime/gem5/driver.h
new file mode 100644
index 000000000..45bef33a6
--- /dev/null
+++ b/sw/runtime/gem5/driver.h
@@ -0,0 +1,70 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Direct-MMIO + pinned-region driver for the gem5 VortexGPGPU device.
+//
+// Inside a gem5 SE-mode process the device is reached by:
+//
+//   1. MMIO accesses to the CP regfile via a fixed virtual address that
+//      the gem5 Python config maps to the SimObject's PIO range
+//      (PIO_BASE_ADDR below; default 0x20000000 — gem5_v2_cp_migration
+//      §3). The CP regfile is 32-bit; only 32-bit accesses are used.
+//
+//   2. Direct memory access to device VRAM via a fixed pinned region
+//      that the gem5 Python config identity-maps virtual→physical
+//      (PIN_BASE_ADDR; default 0x100000000). The runtime treats it as
+//      ordinary memory: regular stores from the host process land in
+//      the same physical bytes the SimObject sees as device VRAM.
+//      Eliminates the need for a separate "DMA staging buffer" path —
+//      gem5_v2_cp_migration §2.2.
+
+#pragma once
+
+#include 
+#include 
+
+namespace vortex {
+
+// Fixed virtual addresses the runtime expects to find mapped by the
+// gem5 Python config. PIN_BASE..PIN_BASE+PIN_REGION_SIZE is the
+// host-visible window onto device VRAM — `memcpy(PIN_BASE+dev_addr,
+// host_src, sz)` lands in the same in-process simx::RAM bytes the CP
+// and Vortex see. Sized to cover the full XLEN device address space
+// so any address mem_alloc / mem_reserve can hand out is reachable
+// via the host BAR; placed at the 4 GiB mark, i.e. above the 32-bit
+// range, so it doesn't collide with the simulated process's natural
+// low-VA layout (heap/stack/code).
+//
+// NOTE(review): both PIN_* values need 33 bits. On a host where
+// uintptr_t / size_t are 32-bit (presumably HOST_ARCH=armhf) they would
+// not fit — confirm whether that configuration is meant to build.
+constexpr uintptr_t PIN_BASE_ADDR    = 0x100000000ull;
+constexpr size_t    PIN_REGION_SIZE  = 0x100000000ull;  // 4 GB (= XLEN=32 device VRAM)
+constexpr uintptr_t PIO_BASE_ADDR    = 0x20000000ull;
+constexpr size_t    PIO_REGION_SIZE  = 0x00000200ull;   // 0x200 — CP regfile
+
+// Init / shutdown. Both are idempotent in practice but should be
+// paired 1:1. drv_init() returns 0 on success.
+int  drv_init();
+void drv_close();
+
+// CP regfile MMIO. `offset` is the CP-internal byte offset
+// (sim/common/CommandProcessor.h §address map). All accesses are 32-bit
+// — the CP regfile is 32-bit wide, and gem5's PIO model honors the
+// packet width verbatim.
+//
+// mmio_fence() emits the right barrier for HOST_ARCH (mfence on x86,
+// dmb sy on AArch64/ARMv7). The host runtime issues a fence between
+// any non-MMIO publication (e.g. seeding a ring buffer through
+// PIN_BASE_ADDR) and the doorbell write (Q_TAIL_HI) so the device
+// sees the new ring entries before the tail advance.
+uint32_t mmio_read32 (uint32_t offset);
+void     mmio_write32(uint32_t offset, uint32_t value);
+void     mmio_fence();
+
+} // namespace vortex
diff --git a/sw/runtime/gem5/vortex.cpp b/sw/runtime/gem5/vortex.cpp
new file mode 100644
index 000000000..7bd5c53b6
--- /dev/null
+++ b/sw/runtime/gem5/vortex.cpp
@@ -0,0 +1,192 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// gem5 host runtime backend (pure-v2 callbacks_t).
+//
+// Implements vx_device with the platform primitives expected by
+// sw/runtime/common/callbacks.inc: init / get_caps / mem_info /
+// mem_{alloc,reserve,free,access} / upload / download / copy /
+// cp_mmio_{write,read}. All kernel launches and DCR ops flow through
+// the upstream dispatcher (sw/runtime/common/vx_device.cpp) which
+// builds CMD_* descriptors into the CP ring buffer and bumps Q_TAIL
+// via cp_mmio_write.
+//
+// gem5-specific shape (vs. xrt/opae):
+//   - mem_upload/download/copy are direct memcpy through PIN_BASE_ADDR
+//     which the gem5 SE-mode process has identity-mapped to device VRAM
+//     via Process::map. No DMA descriptor; no PIO trigger.
+//   - cp_mmio_{write,read} are 32-bit PIO accesses at PIO_BASE_ADDR + off
+//     (no CP_BASE 0x1000 offset because the gem5 device's PIO range IS
+//     the CP regfile; there is no AFU bit-12 split).
+//
+// See docs/proposals/gem5_v2_cp_migration_proposal.md for the full
+// design rationale.
+
+#include 
+#include           // log2floor / log2ceil / is_aligned / aligned_size
+#include "driver.h"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+using namespace vortex;
+
+// gem5 backend device object. Provides the platform primitives
+// (init, capability query, device-memory bookkeeping, bulk transfers
+// through the pinned window, and CP register access) on top of the
+// driver.h helpers.
+class vx_device {
+public:
+    // The allocator manages device addresses from ALLOC_BASE_ADDR up to
+    // GLOBAL_MEM_SIZE.
+    vx_device()
+        : global_mem_(ALLOC_BASE_ADDR,
+                      GLOBAL_MEM_SIZE - ALLOC_BASE_ADDR,
+                      RAM_PAGE_SIZE,
+                      CACHE_BLOCK_SIZE) {}
+
+    ~vx_device() {
+        drv_close();
+    }
+
+    // Initializes the driver layer. Returns 0 on success, -1 (after
+    // logging to stderr) when drv_init() reports a failure.
+    int init() {
+        if (drv_init() != 0) {
+            std::fprintf(stderr, "[VXDRV] drv_init failed\n");
+            return -1;
+        }
+        return 0;
+    }
+
+    // Compile-time capability table — host runtime and SimX-side device
+    // library share the build tree so VX_config.h macros agree on both
+    // sides by construction.
+    //
+    // Writes the requested capability into *value and returns 0; an
+    // unknown caps_id is logged and yields -1 with *value untouched.
+    int get_caps(uint32_t caps_id, uint64_t* value) {
+        switch (caps_id) {
+        case VX_CAPS_VERSION:         *value = IMPLEMENTATION_ID; break;
+        case VX_CAPS_NUM_THREADS:     *value = NUM_THREADS; break;
+        case VX_CAPS_NUM_WARPS:       *value = NUM_WARPS; break;
+        case VX_CAPS_NUM_CORES:       *value = NUM_CORES * NUM_CLUSTERS; break;
+        case VX_CAPS_NUM_CLUSTERS:    *value = NUM_CLUSTERS; break;
+        case VX_CAPS_SOCKET_SIZE:     *value = SOCKET_SIZE; break;
+        case VX_CAPS_ISSUE_WIDTH:     *value = ISSUE_WIDTH; break;
+        case VX_CAPS_CACHE_LINE_SIZE: *value = CACHE_BLOCK_SIZE; break;
+        case VX_CAPS_GLOBAL_MEM_SIZE: *value = GLOBAL_MEM_SIZE; break;
+        case VX_CAPS_LOCAL_MEM_SIZE:  *value = (1 << LMEM_LOG_SIZE); break;
+        case VX_CAPS_ISA_FLAGS:
+            *value = ((uint64_t(MISA_EXT)) << 32)
+                   | ((log2floor(XLEN) - 4) << 30)
+                   |   MISA_STD;
+            break;
+        case VX_CAPS_NUM_MEM_BANKS:   *value = PLATFORM_MEMORY_NUM_BANKS; break;
+        // NOTE(review): this divides the address width by the bank
+        // count — confirm it matches the other runtime backends.
+        case VX_CAPS_MEM_BANK_SIZE:   *value = 1ull << (MEM_ADDR_WIDTH / PLATFORM_MEMORY_NUM_BANKS); break;
+        case VX_CAPS_CLOCK_RATE:      *value = 0; break;
+        case VX_CAPS_PEAK_MEM_BW:     *value = PLATFORM_MEMORY_PEAK_BW; break;
+        default:
+            std::fprintf(stderr, "[VXDRV] invalid caps id: %u\n", caps_id);
+            return -1;
+        }
+        return 0;
+    }
+
+    // Allocates `size` bytes of device memory and stores the address in
+    // *dev_addr. If applying the access flags fails the allocation is
+    // released again and the error is returned.
+    int mem_alloc(uint64_t size, int flags, uint64_t* dev_addr) {
+        uint64_t addr;
+        CHECK_ERR(global_mem_.allocate(size, &addr), { return err; });
+        CHECK_ERR(this->mem_access(addr, size, flags), {
+            global_mem_.release(addr);
+            return err;
+        });
+        *dev_addr = addr;
+        return 0;
+    }
+
+    // Reserves the caller-chosen range [dev_addr, dev_addr + size). The
+    // reservation is rolled back if applying the access flags fails.
+    int mem_reserve(uint64_t dev_addr, uint64_t size, int flags) {
+        CHECK_ERR(global_mem_.reserve(dev_addr, size), { return err; });
+        CHECK_ERR(this->mem_access(dev_addr, size, flags), {
+            global_mem_.release(dev_addr);
+            return err;
+        });
+        return 0;
+    }
+
+    // Releases the allocation or reservation that starts at dev_addr.
+    int mem_free(uint64_t dev_addr) {
+        return global_mem_.release(dev_addr);
+    }
+
+    int mem_access(uint64_t /*dev_addr*/, uint64_t /*size*/, int /*flags*/) {
+        // Access control is enforced by the device's RAM ACL inside
+        // libvortex-gem5.so. The host runtime has nothing to do here.
+        return 0;
+    }
+
+    // Reports allocator statistics; either output pointer may be null.
+    int mem_info(uint64_t* mem_free, uint64_t* mem_used) const {
+        if (mem_free) *mem_free = global_mem_.free();
+        if (mem_used) *mem_used = global_mem_.allocated();
+        return 0;
+    }
+
+    // ---- Data plane (cold-start only) ----
+    // PIN_BASE_ADDR is identity-mapped into the host process's VA via
+    // Process::map (driver.h §"identity v→p"), and into the SimObject's
+    // PA view of device VRAM. A memcpy through PIN_BASE_ADDR is the
+    // same physical bytes the CP's DMA engine and Vortex's MemSim see —
+    // zero PIO bounce, zero DMA descriptor, zero command. The dispatcher
+    // uses these to seed CP ring buffers and to preload kernel ELFs;
+    // ordered host↔device transfers from user code go through CMD_MEM_*
+    // in the CP queue.
+    //
+    // All three transfers return 0 on success (including size == 0) and
+    // -1 when the range does not fit inside GLOBAL_MEM_SIZE.
+
+    // Copies `size` bytes from host memory to device address dev_addr,
+    // then fences so the bytes are published before any later MMIO.
+    int upload(uint64_t dev_addr, const void* host_ptr, uint64_t size) {
+        if (size == 0) return 0;
+        if (!range_in_device(dev_addr, size)) return -1;
+        std::memcpy(reinterpret_cast<void*>(PIN_BASE_ADDR + dev_addr),
+                    host_ptr, size);
+        mmio_fence();
+        return 0;
+    }
+
+    // Copies `size` bytes from device address dev_addr to host memory.
+    // The fence is issued before the copy.
+    int download(void* host_ptr, uint64_t dev_addr, uint64_t size) {
+        if (size == 0) return 0;
+        if (!range_in_device(dev_addr, size)) return -1;
+        mmio_fence();
+        std::memcpy(host_ptr,
+                    reinterpret_cast<const void*>(PIN_BASE_ADDR + dev_addr),
+                    size);
+        return 0;
+    }
+
+    // Device-to-device copy. memmove is used, so overlapping source and
+    // destination ranges are handled.
+    int copy(uint64_t dest_addr, uint64_t src_addr, uint64_t size) {
+        if (size == 0) return 0;
+        if (!range_in_device(dest_addr, size)
+         || !range_in_device(src_addr, size)) return -1;
+        std::memmove(reinterpret_cast<void*>(PIN_BASE_ADDR + dest_addr),
+                     reinterpret_cast<const void*>(PIN_BASE_ADDR + src_addr),
+                     size);
+        mmio_fence();
+        return 0;
+    }
+
+    // ---- Control plane (sole) ----
+    // `off` is the CP-internal regfile offset (sim/common/CommandProcessor.h
+    // §address map). The gem5 device exposes the CP regfile starting at
+    // PIO_BASE_ADDR + 0 — no AFU bit-12 split — so the wrapper is a
+    // straight PIO access.
+    int cp_mmio_write(uint32_t off, uint32_t value) {
+        mmio_write32(off, value);
+        return 0;
+    }
+    int cp_mmio_read(uint32_t off, uint32_t* value) {
+        *value = mmio_read32(off);
+        return 0;
+    }
+
+private:
+    // True when [addr, addr + size) lies entirely inside the device
+    // address space. Written without computing addr + size, because
+    // that sum can wrap around in uint64_t and make an out-of-range
+    // request look valid.
+    static bool range_in_device(uint64_t addr, uint64_t size) {
+        const uint64_t limit = GLOBAL_MEM_SIZE;
+        return size <= limit && addr <= limit - size;
+    }
+
+    MemoryAllocator global_mem_;
+};
+
+#include 
diff --git a/sw/runtime/stub/Makefile b/sw/runtime/stub/Makefile
index 14f88f02b..ed566fac6 100644
--- a/sw/runtime/stub/Makefile
+++ b/sw/runtime/stub/Makefile
@@ -1,5 +1,13 @@
 include ../common.mk
 
+# HOST_ARCH switch — when building for a non-native simulated host
+# (e.g. running x86 gem5 with an aarch64 simulated CPU), select the
+# matching cross-compiler. Aligns with sw/runtime/gem5/Makefile's
+# HOST_ARCH knob; cross-arch builds land in $(DESTDIR)/$(HOST_ARCH)/
+# so the same dlopen target name (libvortex.so) can coexist with the
+# native build in $(DESTDIR)/.
+HOST_ARCH ?= x86_64
+
 DESTDIR ?= $(CURDIR)/..
 
 SRC_DIR := $(VORTEX_HOME)/sw/runtime/stub
@@ -13,8 +21,21 @@ LDFLAGS += -shared -pthread -ldl -Wl,-soname,libvortex.so
 # itself lives in (so the dlopen at vx_device_open time finds them).
 LDFLAGS += -Wl,-rpath,'$$ORIGIN'
 
+ifeq ($(HOST_ARCH),x86_64)
+    CXX := g++
+    OUT_DIR := $(DESTDIR)
+else ifeq ($(HOST_ARCH),aarch64)
+    CXX := aarch64-linux-gnu-g++
+    OUT_DIR := $(DESTDIR)/aarch64
+else ifeq ($(HOST_ARCH),armhf)
+    CXX := arm-linux-gnueabihf-g++
+    OUT_DIR := $(DESTDIR)/armhf
+else
+    $(error HOST_ARCH must be one of: x86_64, aarch64, armhf (got $(HOST_ARCH)))
+endif
+
 # Dispatcher library = vortex2.h runtime (C++ classes) +
-#                      vortex_legacy.cpp wrappers (vortex.h -> vortex2.h) +
+#                      legacy_runtime.cpp wrappers (vortex.h -> vortex2.h) +
 #                      legacy utility helpers +
 #                      thin stub/vortex.cpp glue (currently just for the
 #                      build target — the real entry points live in
@@ -41,12 +62,13 @@ endif
 
 PROJECT := libvortex.so
 
-all: $(DESTDIR)/$(PROJECT)
+all: $(OUT_DIR)/$(PROJECT)
 
-$(DESTDIR)/$(PROJECT): $(SRCS)
+$(OUT_DIR)/$(PROJECT): $(SRCS)
+	@mkdir -p $(OUT_DIR)
 	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
 
 clean:
-	rm -f $(DESTDIR)/$(PROJECT)
+	rm -f $(DESTDIR)/$(PROJECT) $(DESTDIR)/aarch64/$(PROJECT) $(DESTDIR)/armhf/$(PROJECT)
 
 .PHONY: all clean
diff --git a/tests/regression/common.mk b/tests/regression/common.mk
index 536fcd6f8..6484ed1e0 100644
--- a/tests/regression/common.mk
+++ b/tests/regression/common.mk
@@ -83,7 +83,39 @@ CXXFLAGS += -std=c++17 -Wall -Wextra -pedantic -Wfatal-errors -Werror
 CXXFLAGS += -I$(VORTEX_HOME)/sw/runtime/include -I$(ROOT_DIR)/sw -I$(ROOT_DIR)/hw -I$(SW_COMMON_DIR)
 CXXFLAGS += $(CONFIGS)
 
-LDFLAGS += -L$(VORTEX_RT_LIB) -lvortex
+# HOST_ARCH selects the simulated-host compiler for the test binary
+# (the .vxbin always builds with the RISC-V toolchain regardless).
+# When non-native, the binary is suffixed (e.g. vecadd-aarch64) and
+# we link against the cross-compiled stub in $(VORTEX_RT_LIB)/$(HOST_ARCH)/.
+# Aligns with sw/runtime/{stub,gem5}/Makefile's HOST_ARCH knob; the
+# gem5 ARM e2e test path uses this to produce aarch64 binaries that
+# the simulated ARM CPU inside gem5 can execute.
+#
+# Cross-compiled ELFs embed `/lib/ld-linux-$arch.so.1` as the dynamic
+# linker (PT_INTERP). gem5 doesn't have that path on the host, but
+# it has a setInterpDir() API that prepends a sysroot to the
+# interpreter lookup — the gem5 Python config calls that when
+# DRIVER=gem5-aarch64. Keep the default INTERP here so that mechanism
+# can do the redirection cleanly. (Earlier versions used
+# `-Wl,--dynamic-linker=` to rewrite PT_INTERP, but that interacts
+# badly with setInterpDir's prepend logic.)
+HOST_ARCH ?= x86_64
+ifeq ($(HOST_ARCH),x86_64)
+    RT_LIB_DIR := $(VORTEX_RT_LIB)
+    PROJECT_SUFFIX :=
+else ifeq ($(HOST_ARCH),aarch64)
+    RT_LIB_DIR := $(VORTEX_RT_LIB)/$(HOST_ARCH)
+    PROJECT_SUFFIX := -$(HOST_ARCH)
+    CXX := aarch64-linux-gnu-g++
+else ifeq ($(HOST_ARCH),armhf)
+    RT_LIB_DIR := $(VORTEX_RT_LIB)/$(HOST_ARCH)
+    PROJECT_SUFFIX := -$(HOST_ARCH)
+    CXX := arm-linux-gnueabihf-g++
+else
+    $(error HOST_ARCH must be one of: x86_64, aarch64, armhf (got $(HOST_ARCH)))
+endif
+
+LDFLAGS += -L$(RT_LIB_DIR) -lvortex
 
 # Debugging
 ifdef DEBUG
@@ -106,7 +138,11 @@ endif
 
 CONFIG_STAMP = config.stamp
 
-all: $(PROJECT) kernel.vxbin kernel.dump
+# HOST_ARCH-suffixed binary name (vecadd, vecadd-aarch64, …) so
+# x86 and cross-compiled variants coexist in the same dir.
+APP := $(PROJECT)$(PROJECT_SUFFIX)
+
+all: $(APP) kernel.vxbin kernel.dump
 
 # Force rebuild when CONFIGS (defines) change between runs.
 $(CONFIG_STAMP): FORCE
@@ -146,9 +182,16 @@ kernel.elf: vx_start.o $(VX_SRCS) $(VORTEX_KN_PATH)/lib$(KERNEL_LIB).a $(CONFIG_
 	$(VX_CXX) $(VX_CFLAGS) vx_start.o $(VX_APP_OBJS) $(VX_LDFLAGS) -o $@
 endif
 
-$(PROJECT): $(SRCS) $(VORTEX_RT_LIB)/libvortex.so $(CONFIG_STAMP)
+$(APP): $(SRCS) $(RT_LIB_DIR)/libvortex.so $(CONFIG_STAMP)
 	$(CXX) $(CXXFLAGS) $(filter-out $(CONFIG_STAMP),$^) $(LDFLAGS) -o $@
 
+# Cross-compiled stub for non-native HOST_ARCH; native (x86_64) uses the
+# $(VORTEX_RT_LIB)/libvortex.so rule below. FORCE lets the sub-make decide staleness.
+ifneq ($(HOST_ARCH),x86_64)
+$(RT_LIB_DIR)/libvortex.so: FORCE
+	$(RUNTIME_ARGS) $(MAKE) -C $(VORTEX_RT_SRC)/stub HOST_ARCH=$(HOST_ARCH) DESTDIR=$(VORTEX_RT_LIB)
+endif
+
 run-simx: $(PROJECT) kernel.vxbin
 	$(RUNTIME_ARGS) $(MAKE) -C $(VORTEX_RT_SRC)/simx DESTDIR=$(VORTEX_RT_LIB)
 	LD_LIBRARY_PATH=$(VORTEX_RT_LIB):$(LD_LIBRARY_PATH) VORTEX_DRIVER=simx ./$(PROJECT) $(OPTS)