diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2adecef42..588455f06 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -65,6 +65,7 @@ jobs:
           ../configure --tooldir=$TOOLDIR
           ci/toolchain_install.sh --all
           ci/sst_install.sh
+          GEM5_TARGETS=X86 ci/gem5_install.sh
 
       - name: Setup Third Party
         if: steps.cache-thirdparty.outputs.cache-hit != 'true'
@@ -78,6 +79,11 @@ jobs:
           echo "SST_CORE_HOME=$PWD/tools/sst-install/sst-core" >> $GITHUB_ENV
           echo "SST_ELEMENTS_HOME=$PWD/tools/sst-install/sst-elements" >> $GITHUB_ENV
 
+      - name: Export gem5 paths
+        run: |
+          echo "GEM5_HOME=${PWD}/tools/gem5" >> "${GITHUB_ENV}"
+          echo "${PWD}/tools/gem5/build/X86" >> "${GITHUB_PATH}"
+
   build:
     needs: setup
     strategy:
@@ -137,15 +143,23 @@ jobs:
       matrix:
         os: [ubuntu-24.04]
         # dxa + tensor_wg disabled: features not yet complete (see regression{32,64}_failures.md)
-        name: [regression, amo, mpi, dtm, opencl, cache, config1, config2, debug, scope, stress, synthesis, vm, rvc, cupbop, hip, tensor, tensor_sp, tensor_mx]
+        name: [regression, amo, mpi, dtm, opencl, cache, config1, config2, debug, scope, stress, synthesis, vm, rvc, cupbop, hip, tensor, tensor_sp, tensor_mx, gem5]
         xlen: [32, 64]
         # chipStar's hipcc emits Physical64 SPIR-V; POCL refuses it on
         # rv32 Vortex (CL_INVALID_OPERATION). hip is rv64-only until
         # either chipStar grows --offload=spirv32 or the native
         # HIPVortex toolchain lands (see hip_support_proposal.md).
+        #
+        # gem5 only runs against the rv32 build; the device library
+        # is XLEN-locked by the gem5 install (build/X86/gem5.opt
+        # dlopens the libvortex-gem5.so the runner builds, and we
+        # only build it once). An XLEN=64 entry would just duplicate
+        # the run against an identical setup.
         exclude:
           - name: hip
             xlen: 32
+          - name: gem5
+            xlen: 64
     runs-on: ${{ matrix.os }}
     timeout-minutes: 120
 
@@ -190,6 +204,11 @@ jobs:
           echo "SST_CORE_HOME=$PWD/tools/sst-install/sst-core" >> $GITHUB_ENV
           echo "SST_ELEMENTS_HOME=$PWD/tools/sst-install/sst-elements" >> $GITHUB_ENV
 
+      - name: Export gem5 paths
+        run: |
+          echo "GEM5_HOME=${PWD}/tools/gem5" >> "${GITHUB_ENV}"
+          echo "${PWD}/tools/gem5/build/X86" >> "${GITHUB_PATH}"
+
       - name: Run tests
         run: |
           cd build${{ matrix.xlen }}
diff --git a/VERSION b/VERSION
index af5ac4633..590f872b1 100644
--- a/VERSION
+++ b/VERSION
@@ -1,2 +1,3 @@
 VORTEX_VERSION=3.0
 TOOLCHAIN_REV=v3.0
+GEM5_REV=v25.0.0.1
diff --git a/ci/gem5_install.sh.in b/ci/gem5_install.sh.in
new file mode 100644
index 000000000..0eb610e40
--- /dev/null
+++ b/ci/gem5_install.sh.in
@@ -0,0 +1,119 @@
+#!/bin/bash
+
+# Copyright © 2019-2023
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# gem5 install for the Vortex integration — see
+# docs/proposals/gem5_v2_cp_migration_proposal.md for the design,
+# docs/gem5_integration.md for the operator manual.
+#
+# Fetches a pinned gem5 release, installs build deps + the AArch64
+# cross-toolchain (used by the ARM regression matrix), and builds
+# gem5.opt for the selected ISA targets (X86 + ARM by default). The
+# Vortex SimObject is installed separately via
+# `sim/simx/gem5/install.sh` — that step has to re-run whenever the
+# SimObject sources change, but it does NOT need a fresh gem5 clone.
+#
+# Idempotent: re-running with the same GEM5_REV skips the clone and the
+# build once $GEM5_HOME/build/<target>/gem5.opt exists (apt still runs).
+
+# exit when any command fails
+set -e
+
+GEM5_REV=${GEM5_REV:=@GEM5_REV@}
+TOOLDIR=${TOOLDIR:=@TOOLDIR@}
+GEM5_HOME=$TOOLDIR/gem5
+GEM5_REPO=https://github.com/gem5/gem5.git
+
+# Build deps. gem5 documents these at https://www.gem5.org/documentation/general_docs/building
+# AArch64 cross-toolchain (gcc/g++-aarch64-linux-gnu) is needed for
+# the ARM regression matrix: cross-compiles libvortex-gem5-aarch64.so
+# and vecadd-aarch64 / sgemm-aarch64. DEBIAN_FRONTEND is passed via
+# `env` after sudo: sudo's env_reset drops it when set before `sudo`.
+sudo env DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    scons \
+    python3 python3-dev python3-pip python3-venv \
+    libprotobuf-dev protobuf-compiler libprotoc-dev \
+    libgoogle-perftools-dev \
+    m4 \
+    libboost-all-dev \
+    libhdf5-serial-dev \
+    libpng-dev \
+    pkg-config \
+    gcc-aarch64-linux-gnu g++-aarch64-linux-gnu \
+    build-essential git wget
+
+mkdir -p "$TOOLDIR"
+
+# Fetch (or update) gem5 working tree at the pinned revision.
+if [ -d "$GEM5_HOME/.git" ]; then
+    echo "gem5 working tree exists at $GEM5_HOME"
+    current_rev=$(git -C "$GEM5_HOME" describe --tags --always 2>/dev/null || echo "unknown")
+    if [ "$current_rev" != "$GEM5_REV" ]; then
+        echo "checked-out rev $current_rev != pinned $GEM5_REV; refetching"
+        git -C "$GEM5_HOME" fetch --depth=1 origin "tag" "$GEM5_REV"
+        git -C "$GEM5_HOME" checkout "$GEM5_REV"
+        # Old-rev binaries would make the existence-only build check skip the rebuild.
+        rm -f "$GEM5_HOME"/build/*/gem5.opt
+    fi
+else
+    echo "cloning gem5 $GEM5_REV into $GEM5_HOME"
+    git clone --depth=1 --branch "$GEM5_REV" "$GEM5_REPO" "$GEM5_HOME"
+fi
+
+# Build parallelism: -j$(nproc) on the self-hosted runner; capped at 4
+# on hosted runners to avoid OOM (gem5 link uses ~4 GB peak).
+JOBS=$(nproc)
+if [ -n "$GITHUB_ACTIONS" ] && [ -z "$VORTEX_SELF_HOSTED" ] && [ "$JOBS" -gt 4 ]; then
+    JOBS=4
+fi
+
+# Build both X86 (default host ISA — easier, no cross-compile needed)
+# and ARM (used by the cross-arch regression matrix); override with a
+# space-separated GEM5_TARGETS (e.g. "X86"). Either is selectable at
+# test-config time via GEM5_BIN=$GEM5_HOME/build/{X86,ARM}/gem5.opt.
+# --ignore-style skips gem5's git-hook prompt, which can abort scons without a TTY.
+GEM5_TARGETS=${GEM5_TARGETS:-"X86 ARM"}
+
+cd "$GEM5_HOME"
+for target in $GEM5_TARGETS; do
+    if [ ! -x "$GEM5_HOME/build/$target/gem5.opt" ]; then
+        echo "building gem5.opt ($target) with -j$JOBS"
+        scons --ignore-style "build/$target/gem5.opt" -j"$JOBS"
+    else
+        echo "gem5.opt ($target) already built at $GEM5_HOME/build/$target/gem5.opt"
+    fi
+done
+
+# Make GEM5_HOME visible to this shell's children and, by appending
+# to ~/.bashrc only when no export line exists yet, to future shells.
+export GEM5_HOME
+grep -q "^export GEM5_HOME=" ~/.bashrc 2>/dev/null \
+    || echo "export GEM5_HOME=$GEM5_HOME" >> ~/.bashrc
+
+# GitHub Actions: hand GEM5_HOME and each target's build dir to the
+# steps that follow (skipped when the variables are unset).
+[ -z "$GITHUB_ENV" ] || echo "GEM5_HOME=$GEM5_HOME" >> "$GITHUB_ENV"
+if [ -n "$GITHUB_PATH" ]; then
+    for isa in $GEM5_TARGETS; do
+        echo "$GEM5_HOME/build/$isa" >> "$GITHUB_PATH"
+    done
+fi
+
+# Installation summary.
+echo ""
+echo "gem5 $GEM5_REV installed at $GEM5_HOME"
+for isa in $GEM5_TARGETS; do
+    echo "  binary: $GEM5_HOME/build/$isa/gem5.opt"
+done
+echo "  GEM5_HOME exported (re-source ~/.bashrc to pick up in new shells)"
diff --git a/ci/gem5_run_app.py b/ci/gem5_run_app.py
new file mode 100644
index 000000000..9eb008e1b
--- /dev/null
+++ b/ci/gem5_run_app.py
@@ -0,0 +1,260 @@
+# Copyright © 2019-2023
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# End-to-end gem5 integration test for vortex.VortexGPGPU.
+#
+# Generic application runner — any Vortex regression test that
+# follows the standard shape (host binary + kernel.vxbin in the same
+# directory, links against libvortex.so) can run here.
+#
+# Wires (gem5_v2_cp_migration_proposal §3):
+#   - SE-mode CPU(s) running an unmodified Vortex regression test
+#     (same binary the SimX backend uses).
+#   - VortexGPGPU device on the system membus, claiming two ranges:
+#     CP regfile at PIO_BASE (32-bit MMIO) and BAR-mapped VRAM at
+#     PIN_BASE (host memcpy lands in in-process simx::RAM).
+#   - Identity-mapped via Process.map() — the same mechanism gem5's
+#     AMD GPU integration uses at apu_se.py:1055.
+#
+# The simulated process loads libvortex.so (the upstream dispatcher),
+# which dlopens libvortex-gem5-x86_64.so based on VORTEX_DRIVER. The
+# dispatcher's CP submission path then:
+#   1. mem_alloc + mem_upload → ring buffer / head / cmpl slots in VRAM
+#   2. cp_mmio_write(Q_*, ...) → program CP regfile, enable Q0 + CP
+#   3. vx_enqueue_launch / vx_enqueue_write / etc. → CMD_* descriptors
+#      written into the ring (mem_upload), Q_TAIL_HI doorbell (cp_mmio_write),
+#      Q_SEQNUM polled to wait (cp_mmio_read).
+# The host runtime is a thin platform shim — no per-command logic.
+#
+# Configurable via env vars:
+#   VORTEX_GEM5_DEV_LIB     — path to sim/simx/libvortex-gem5.so
+#                             (device-side; dlopened by the gem5 SimObject)
+#   VORTEX_GEM5_HOST_RT_DIR — directory containing libvortex.so (the stub)
+#                             AND libvortex-gem5-x86_64.so (the host
+#                             runtime backend). Both are added to the
+#                             simulated process's LD_LIBRARY_PATH.
+#   VORTEX_TEST_DIR         — directory containing the test binary +
+#                             kernel.vxbin
+#   VORTEX_TEST_BIN         — name of the test binary inside that dir
+#                             (default: vecadd)
+#   VORTEX_TEST_ARGS        — args passed to the binary (default: -n16)
+#   VORTEX_DRIVER           — backend selector for the stub library
+#                             (default: gem5-x86_64; use gem5-aarch64
+#                             when running the ARM matrix)
+
+import os
+import shlex
+
+import m5
+from m5.objects import (
+    AddrRange,
+    DDR3_1600_8x8,
+    MemCtrl,
+    Process,
+    RedirectPath,
+    Root,
+    SEWorkload,
+    SrcClockDomain,
+    System,
+    SystemXBar,
+    AtomicSimpleCPU,
+    VoltageDomain,
+    VortexGPGPU,
+)
+
+DEV_LIB     = os.environ.get("VORTEX_GEM5_DEV_LIB")
+HOST_RT_DIR = os.environ.get("VORTEX_GEM5_HOST_RT_DIR")
+TEST_DIR    = os.environ.get("VORTEX_TEST_DIR")
+TEST_BIN    = os.environ.get("VORTEX_TEST_BIN", "vecadd")
+TEST_ARGS   = os.environ.get("VORTEX_TEST_ARGS", "-n16")
+DRIVER      = os.environ.get("VORTEX_DRIVER",   "gem5-x86_64")
+
+# Number of CPU thread contexts. The upstream dispatcher spawns a
+# per-Queue worker thread (commit 157e7a1) and the legacy_runtime
+# helpers may spawn additional internal threads. Each thread needs a
+# free HW context — we provision 4 (one main + headroom). Each is a
+# separate AtomicSimpleCPU instance per the gem5 SE-mode pthread
+# pattern (deprecated/example/se.py:188-189): clone() in
+# syscall_emul finds the next idle context across all CPUs.
+NUM_CPUS = 4
+
+for name, val in [
+    ("VORTEX_GEM5_DEV_LIB",     DEV_LIB),
+    ("VORTEX_GEM5_HOST_RT_DIR", HOST_RT_DIR),
+    ("VORTEX_TEST_DIR",         TEST_DIR),
+]:
+    if not val:
+        raise RuntimeError(f"{name} env var is required")
+
+APP_BIN = f"{TEST_DIR}/{TEST_BIN}"
+
+# Fixed mappings used by the gem5 host runtime (see
+# sw/runtime/gem5/driver.h). The Python config and the C runtime
+# share these constants by convention; if you change one, change
+# both.
+PIO_BASE   = 0x20000000
+PIO_SIZE   = 0x0200        # CP regfile (0x40 globals + 4 × 0x40 queues + pad)
+PIN_BASE   = 0x100000000   # BAR-mapped VRAM, above 4 GiB to clear the
+                           # simulated process's natural low-VA layout
+PIN_SIZE   = 0x100000000   # 4 GB — full XLEN=32 device address space
+
+# ---------------------------------------------------------------------------
+# System construction
+# ---------------------------------------------------------------------------
+system = System()
+system.clk_domain = SrcClockDomain(clock="3GHz",
+                                   voltage_domain=VoltageDomain())
+system.mem_mode = "atomic"
+system.mem_ranges = [AddrRange("1GiB")]   # advisory; actual routing
+                                          # is by per-SimObject ranges
+                                          # (DRAM owns [0, 1GB);
+                                          # VortexGPGPU owns the PIO
+                                          # and PIN ranges, both above)
+
+# Cross-arch interp + runtime library redirection.
+# Two separate gem5 mechanisms are at play:
+#   (1) `setInterpDir(prefix)` prepends `prefix` to PT_INTERP when
+#       gem5 loads the dynamic linker (e.g. /lib/ld-linux-aarch64.so.1
+#       → /usr/aarch64-linux-gnu/lib/ld-linux-aarch64.so.1). The
+#       linker is opened directly by gem5's loader, NOT via SE-mode
+#       syscall, so RedirectPath doesn't help here.
+#   (2) `system.redirect_paths` redirects open()/stat()/etc syscalls
+#       the GUEST process makes — used when the dynamic linker
+#       later looks up libc.so.6, libstdc++.so.6, libvortex.so, etc.
+# Both are no-ops for native x86.
+if DRIVER == "gem5-aarch64":
+    from m5.core import setInterpDir
+    setInterpDir("/usr/aarch64-linux-gnu")
+    system.redirect_paths = [
+        RedirectPath(app_path="/lib/aarch64-linux-gnu",
+                     host_paths=["/usr/aarch64-linux-gnu/lib"]),
+        RedirectPath(app_path="/usr/lib/aarch64-linux-gnu",
+                     host_paths=["/usr/aarch64-linux-gnu/lib"]),
+    ]
+
+# Membus connects CPU ↔ memory ↔ VortexGPGPU.
+system.membus = SystemXBar()
+system.system_port = system.membus.cpu_side_ports
+
+# CPUs. Atomic — the cycle counts inside the Vortex device are
+# driven by the device's own clock; timing CPU adds wall time without
+# changing the kernel result. We provision NUM_CPUS instances so the
+# dispatcher's per-Queue worker thread (commit 157e7a1) and any
+# transient helper threads have free HW contexts to clone() into.
+system.cpu = [AtomicSimpleCPU(cpu_id=i) for i in range(NUM_CPUS)]
+system.multi_thread = True
+for cpu in system.cpu:
+    cpu.createInterruptController()
+    cpu.icache_port = system.membus.cpu_side_ports
+    cpu.dcache_port = system.membus.cpu_side_ports
+    # X86's InterruptController has explicit pio/int_requestor/
+    # int_responder ports that must be wired to the membus (per
+    # learning_gem5/part1/two_level.py:111-114). ARM's interrupt model
+    # doesn't expose these — skip on ARM. Tested via the DRIVER env
+    # var (the same one that selects the simulated host ISA).
+    if DRIVER == "gem5-x86_64":
+        cpu.interrupts[0].pio           = system.membus.mem_side_ports
+        cpu.interrupts[0].int_requestor = system.membus.cpu_side_ports
+        cpu.interrupts[0].int_responder = system.membus.mem_side_ports
+
+# Memory controller. DRAM serves the simulated process's normal
+# low-VA address space ([0, PIO_BASE) = 512 MiB is plenty for ELF code
+# + heap + stack of any in-tree regression test). The VortexGPGPU
+# device owns disjoint ranges higher up:
+#   - [PIO_BASE,  PIO_BASE+PIO_SIZE)  — CP regfile (32-bit MMIO)
+#   - [PIN_BASE,  PIN_BASE+PIN_SIZE)  — BAR-mapped VRAM; host CPU
+#     writes land in the same bytes the CP and Vortex see via in-process
+#     simx::RAM (gem5_v2_cp_migration §2.2 single data plane).
+# Placing PIN_BASE above 4 GiB keeps it well clear of both the DRAM
+# range and the simulated process's natural VA layout.
+system.mem_ctrl = MemCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = AddrRange(0, PIO_BASE)
+system.mem_ctrl.port = system.membus.mem_side_ports
+
+# The Vortex device. The `library` parameter points at the
+# device-side libvortex-gem5.so (no arch suffix; gem5 itself is
+# always x86-host). The host-side runtime is loaded separately by
+# the simulated process via VORTEX_DRIVER below.
+system.vortex = VortexGPGPU(
+    library = DEV_LIB,
+    kernel  = "",   # NO preload — the host binary uploads the kernel
+                    # via the dispatcher's CP submission path, the way
+                    # a real accelerator runtime works.
+)
+system.vortex.pio_addr = PIO_BASE
+system.vortex.pio_size = PIO_SIZE
+system.vortex.pin_addr = PIN_BASE
+system.vortex.pin_size = PIN_SIZE
+system.vortex.pio = system.membus.mem_side_ports
+system.vortex.dma = system.membus.cpu_side_ports
+
+# ---------------------------------------------------------------------------
+# Workload (the host test binary)
+# ---------------------------------------------------------------------------
+argv = [APP_BIN] + shlex.split(TEST_ARGS)
+process = Process(
+    pid=100,
+    cwd=TEST_DIR,
+    cmd=argv,
+    executable=argv[0],
+    env=[
+        # Tells the stub to dlopen our backend
+        # (libvortex.so does dlopen("libvortex-${VORTEX_DRIVER}.so")).
+        f"VORTEX_DRIVER={DRIVER}",
+        # Library search path inside the simulated process. Must
+        # contain libvortex.so AND libvortex-gem5-$ARCH.so (both
+        # are in HOST_RT_DIR by construction).
+        f"LD_LIBRARY_PATH={HOST_RT_DIR}",
+    ],
+)
+
+system.workload = SEWorkload.init_compatible(APP_BIN)
+# gem5 SE-mode requires each CPU to have an assigned workload; the
+# secondary CPUs are halted at boot and wake when clone() finds them
+# (deprecated/example/se.py:294). Assign the same Process to all
+# NUM_CPUS CPUs — only CPU[0] starts running; the rest sit idle until
+# pthread spawn.
+for cpu in system.cpu:
+    cpu.workload = process
+    cpu.createThreads()
+
+# ---------------------------------------------------------------------------
+# Run
+# ---------------------------------------------------------------------------
+root = Root(full_system=False, system=system)
+m5.instantiate()
+
+# Identity-map both device-owned ranges into the simulated process's
+# address space. Must happen AFTER m5.instantiate(). Mirrors
+# apu_se.py:1055 (gem5's AMD GPU pattern). The CPU's userspace then
+# touches PIO_BASE / PIN_BASE as ordinary memory; the membus routes
+# both ranges to the VortexGPGPU SimObject (PIN range = BAR-mapped
+# VRAM, PIO range = CP regfile).
+#
+# cacheable=False on PIN ensures host stores to VRAM are immediately
+# visible to the CP — otherwise a cache line could hold the new ring
+# entry while Q_TAIL_HI is observed by the device.
+system.cpu[0].workload[0].map(PIO_BASE, PIO_BASE, PIO_SIZE, cacheable=False)
+system.cpu[0].workload[0].map(PIN_BASE, PIN_BASE, PIN_SIZE, cacheable=False)
+
+print(f"E2E: app={APP_BIN} {TEST_ARGS}")
+print(f"E2E: VortexGPGPU.library={DEV_LIB}")
+print(f"E2E: VORTEX_DRIVER={DRIVER}")
+print(f"E2E: LD_LIBRARY_PATH={HOST_RT_DIR}")
+print(f"E2E: PIO @0x{PIO_BASE:x}+0x{PIO_SIZE:x}, PIN @0x{PIN_BASE:x}+0x{PIN_SIZE:x}")
+print("E2E: starting simulation...")
+
+exit_event = m5.simulate()
+print(f"E2E: exit_event.cause = {exit_event.getCause()!r}")
+print(f"E2E: tick = {m5.curTick()}")
diff --git a/ci/gem5_run_hostless_app.py b/ci/gem5_run_hostless_app.py
new file mode 100644
index 000000000..65c92602c
--- /dev/null
+++ b/ci/gem5_run_hostless_app.py
@@ -0,0 +1,108 @@
+# Copyright © 2019-2023
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Hostless gem5 integration test for vortex.VortexGPGPU.
+#
+# The SimObject loads a .vxbin kernel directly via its `kernel=`
+# parameter and runs it via its internal vortexTickEvent_ chain — no
+# host CPU, no Command Processor, no PIO/DMA. Smoke-tests the
+# gem5↔libvortex-gem5.so wiring: dlopen succeeds, SimObject
+# constructs, Processor::cycle() drives from the gem5 event loop, sim
+# exits cleanly.
+#
+# Hosted counterpart: [gem5_run_app.py](gem5_run_app.py) wires up the
+# host CPU + CP regfile + BAR-mapped VRAM on top.
+#
+# Configurable via env vars (parallel to gem5_run_app.py):
+#   VORTEX_GEM5_DEV_LIB — path to libvortex-gem5.so (no default)
+#   VORTEX_TEST_DIR     — directory containing the kernel .vxbin
+#   VORTEX_TEST_KERNEL  — kernel filename inside that dir
+#                         (default: kernel.vxbin, matching the
+#                          regression-test convention)
+#
+# Run from the Vortex build dir as:
+#   VORTEX_GEM5_DEV_LIB=$PWD/sim/simx/libvortex-gem5.so \
+#   VORTEX_TEST_DIR=$PWD/tests/kernel/hello \
+#   VORTEX_TEST_KERNEL=hello.vxbin \
+#       $GEM5_HOME/build/X86/gem5.opt ci/gem5_run_hostless_app.py
+
+import os
+import m5
+from m5.objects import (
+    AddrRange,
+    DDR3_1600_8x8,
+    MemCtrl,
+    Root,
+    SrcClockDomain,
+    System,
+    SystemXBar,
+    VoltageDomain,
+    VortexGPGPU,
+)
+
+DEV_LIB     = os.environ.get("VORTEX_GEM5_DEV_LIB")
+TEST_DIR    = os.environ.get("VORTEX_TEST_DIR")
+TEST_KERNEL = os.environ.get("VORTEX_TEST_KERNEL", "kernel.vxbin")
+
+for name, val in [("VORTEX_GEM5_DEV_LIB", DEV_LIB),
+                  ("VORTEX_TEST_DIR",     TEST_DIR)]:
+    if not val:
+        raise RuntimeError(f"{name} env var is required")
+
+KERNEL = f"{TEST_DIR}/{TEST_KERNEL}"
+
+# Minimal system: just enough to hang the VortexGPGPU off a membus
+# so gem5 considers it a properly-wired SimObject. No CPU in this
+# test — the kernel runs entirely inside the SimObject's internal
+# vortexTickEvent_ chain.
+system = System()
+system.clk_domain = SrcClockDomain(clock="1GHz",
+                                   voltage_domain=VoltageDomain())
+system.mem_mode = "atomic"
+system.mem_ranges = [AddrRange("512MiB")]
+
+# Membus + a small backing memory so PIO ranges have somewhere to bind.
+system.membus = SystemXBar()
+
+# Memory controller (unused at runtime in hostless mode but required
+# for the system to instantiate cleanly).
+system.mem_ctrl = MemCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
+system.mem_ctrl.port = system.membus.mem_side_ports
+
+# The Vortex device. It inherits clock from the system clock domain
+# (set above to 1GHz) via ClockedObject; no explicit `clock=` param.
+system.vortex = VortexGPGPU(
+    library = DEV_LIB,
+    kernel  = KERNEL,
+    # Explicitly disable the BAR-mapped VRAM range — the hostless
+    # path loads the kernel via the device library's load_kernel()
+    # entry, never via host memcpy through PIN. Leaving it enabled
+    # here would conflict with this test's DRAM range.
+    pin_size = 0,
+)
+system.vortex.pio = system.membus.mem_side_ports
+system.vortex.dma = system.membus.cpu_side_ports
+
+# Root wires the system into the simulator.
+root = Root(full_system=False, system=system)
+m5.instantiate()
+
+print(f"Hostless: VortexGPGPU.library={DEV_LIB}")
+print(f"Hostless: kernel={KERNEL}")
+print("Hostless: running until VortexGPGPU exits the sim loop...")
+
+exit_event = m5.simulate()
+print(f"Hostless: exit_event.cause = {exit_event.getCause()!r}")
+print(f"Hostless: tick = {m5.curTick()}")
diff --git a/ci/regression.sh.in b/ci/regression.sh.in
index a24aba709..5d1ddf82a 100755
--- a/ci/regression.sh.in
+++ b/ci/regression.sh.in
@@ -95,14 +95,145 @@ sst()
 
     cp sim/simx/libvortex.so $SST_ELEMENTS_HOME/lib/sst-elements-library/   # alternatively - $ sst --add-lib-path `pwd` myConfig.py
 
-    sst ci/sst_test_vortex_hello.py
-    sst ci/sst_test_vortex_fibonacci.py
-    sst ci/sst_test_vortex_vecadd.py
-    sst ci/sst_test_vortex_conform.py
+    BUILD_DIR=$(pwd)
+
+    # Hostless SST runner (ci/sst_run_hostless_app.py) parameterized
+    # by VORTEX_TEST_DIR + VORTEX_TEST_KERNEL — same shape as
+    # ci/gem5_run_hostless_app.py. SST is hostless-only today (no
+    # CPU component wired to Vortex); the ci/sst_run_app.py name
+    # slot is reserved for a future host-CPU SST integration.
+    for spec in "hello:hello.vxbin" "fibonacci:fibonacci.vxbin" \
+                "vecadd:vecadd.vxbin" "conform:conform.vxbin"; do
+        kern="${spec%%:*}"
+        vxbin="${spec#*:}"
+        echo "=== sst: $kern ==="
+        VORTEX_TEST_DIR=$BUILD_DIR/tests/kernel/$kern \
+        VORTEX_TEST_KERNEL=$vxbin \
+            sst ci/sst_run_hostless_app.py
+    done
 
     echo "sst tests done!"
 }
 
+# gem5 integration tests — see docs/proposals/gem5_v2_cp_migration_proposal.md
+# for the v2 CP-first design and docs/gem5_integration.md for the operator
+# manual. Two layers:
+#
+#   1. Phase 3 standalone: kernel preloaded via the SimObject's
+#      `kernel=` Python param; runs entirely inside the gem5 event
+#      loop, no host CPU needed. Fast smoke test for the device
+#      library wiring.
+#
+#   2. Phase 5 e2e: an x86 (or aarch64) SE-mode workload drives the
+#      device via the CP regfile (cp_mmio_*) + BAR-mapped VRAM
+#      (mem_upload through identity-mapped PIN region). The
+#      dispatcher's CP submission path handles all command building;
+#      the host runtime is a thin platform shim.
+#
+# ARM matrix is opt-in via VORTEX_GEM5_ARM=1 (needs gcc-aarch64-linux-gnu
+# installed; not part of the default hosted-runner image).
+gem5()
+{
+    echo "begin gem5 tests..."
+
+    # Default to where ci/gem5_install.sh installs gem5 ($TOOLDIR/gem5);
+    # $HOME/tools is only the fallback when TOOLDIR is unset or empty.
+    GEM5_HOME=${GEM5_HOME:-${TOOLDIR:-$HOME/tools}/gem5}
+    if [ ! -x "$GEM5_HOME/build/X86/gem5.opt" ]; then
+        echo "error: $GEM5_HOME/build/X86/gem5.opt not found — run ci/gem5_install.sh first"
+        exit 1
+    fi
+
+    # Build prerequisites. The host runtime is gated on HOST_ARCH;
+    # default x86 needs no cross-toolchain.
+    make -C sim/simx USE_GEM5=1
+    make -C sw/runtime/stub
+    make -C sw/runtime/gem5 HOST_ARCH=x86_64
+    make -C sw/kernel
+    make -C tests/kernel/hello
+    make -C tests/regression/vecadd
+    make -C tests/regression/sgemm
+
+    BUILD_DIR=$(pwd)
+    LIB_GEM5_DEV=$BUILD_DIR/sim/simx/libvortex-gem5.so
+    HOST_RT_DIR=$BUILD_DIR/sw/runtime
+
+    # Hostless smoke — no host CPU, kernel preloaded via SimObject param.
+    # env-vars MUST precede the binary (gem5.opt would otherwise treat
+    # them as positional args).
+    echo "=== gem5 hostless: hello ==="
+    VORTEX_GEM5_DEV_LIB=$LIB_GEM5_DEV \
+    VORTEX_TEST_DIR=$BUILD_DIR/tests/kernel/hello \
+    VORTEX_TEST_KERNEL=hello.vxbin \
+        timeout 120 "$GEM5_HOME/build/X86/gem5.opt" \
+        ci/gem5_run_hostless_app.py
+
+    # E2E — CP-driven path through the host runtime. Generic runner
+    # (ci/gem5_run_app.py) parameterized by VORTEX_TEST_BIN +
+    # VORTEX_TEST_ARGS. Sizes fit the 120s per-test budget
+    # (feedback_test_timeout_120s):
+    #   - vecadd -n16   small vector add
+    #   - sgemm  -n4    4x4 matrix multiply
+    # Larger sizes overrun because the simulated host CPU's CP poll
+    # loop burns gem5 wall time proportional to kernel runtime.
+    # Run on local dev box for larger sizes by overriding VORTEX_TEST_ARGS.
+    for spec in "vecadd:-n16" "sgemm:-n4"; do
+        app="${spec%%:*}"
+        args="${spec#*:}"
+        echo "=== gem5 e2e: $app $args ==="
+        VORTEX_GEM5_DEV_LIB=$LIB_GEM5_DEV \
+        VORTEX_GEM5_HOST_RT_DIR=$HOST_RT_DIR \
+        VORTEX_TEST_DIR=$BUILD_DIR/tests/regression/$app \
+        VORTEX_TEST_BIN=$app \
+        VORTEX_TEST_ARGS=$args \
+            timeout 120 "$GEM5_HOME/build/X86/gem5.opt" \
+            ci/gem5_run_app.py
+    done
+
+    # ARM matrix (opt-in). The device library (libvortex-gem5.so) is
+    # always x86 — gem5.opt is an x86 binary regardless of which
+    # simulated ISA it models. Only the simulated host's ISA changes.
+    if [ -n "$VORTEX_GEM5_ARM" ]; then
+        if [ ! -x "$GEM5_HOME/build/ARM/gem5.opt" ]; then
+            echo "error: $GEM5_HOME/build/ARM/gem5.opt not found"
+            exit 1
+        fi
+
+        # Cross-compile the host runtime, stub, and test binaries for
+        # aarch64. All outputs land in $arch/ subdirs alongside the
+        # native x86 builds so they coexist cleanly.
+        make -C sw/runtime/stub HOST_ARCH=aarch64
+        make -C sw/runtime/gem5 HOST_ARCH=aarch64
+        make -C tests/regression/vecadd HOST_ARCH=aarch64
+        make -C tests/regression/sgemm  HOST_ARCH=aarch64
+
+        ARM_HOST_RT_DIR=$BUILD_DIR/sw/runtime/aarch64
+
+        echo "=== gem5 ARM hostless: hello ==="
+        VORTEX_GEM5_DEV_LIB=$LIB_GEM5_DEV \
+        VORTEX_TEST_DIR=$BUILD_DIR/tests/kernel/hello \
+        VORTEX_TEST_KERNEL=hello.vxbin \
+            timeout 120 "$GEM5_HOME/build/ARM/gem5.opt" \
+            ci/gem5_run_hostless_app.py
+
+        for spec in "vecadd:-n16" "sgemm:-n4"; do
+            app="${spec%%:*}"
+            args="${spec#*:}"
+            echo "=== gem5 ARM e2e: $app $args ==="
+            VORTEX_GEM5_DEV_LIB=$LIB_GEM5_DEV \
+            VORTEX_GEM5_HOST_RT_DIR=$ARM_HOST_RT_DIR \
+            VORTEX_TEST_DIR=$BUILD_DIR/tests/regression/$app \
+            VORTEX_TEST_BIN=$app-aarch64 \
+            VORTEX_TEST_ARGS=$args \
+            VORTEX_DRIVER=gem5-aarch64 \
+                timeout 120 "$GEM5_HOME/build/ARM/gem5.opt" \
+                ci/gem5_run_app.py
+        done
+    fi
+
+    echo "gem5 tests done!"
+}
+
 mpi()
 {
     echo "begin mpi tests..."
@@ -1047,7 +1178,7 @@ hip()
 show_usage()
 {
     echo "Vortex Regression Test"
-    echo "Usage: $0 [--clean] [--unittest] [--riscv] [--kernel] [--regression] [--amo] [--dxa] [--opencl] [--cache] [--vm] [--rvc] [--config1] [--config2] [--debug] [--scope] [--stress] [--synthesis] [--vector] [--graphics] [--tensor] [--tensor_sp] [--tensor_mx] [--tensor_wg] [--cupbop] [--hip] [--all] [--h|--help]"
+    echo "Usage: $0 [--clean] [--unittest] [--riscv] [--kernel] [--regression] [--amo] [--dxa] [--opencl] [--cache] [--vm] [--rvc] [--config1] [--config2] [--debug] [--scope] [--stress] [--synthesis] [--vector] [--graphics] [--tensor] [--tensor_sp] [--tensor_mx] [--tensor_wg] [--cupbop] [--hip] [--sst] [--gem5] [--dtm] [--mpi] [--all] [--h|--help]"
 }
 
 declare -a tests=()
@@ -1139,6 +1270,9 @@ while [ "$1" != "" ]; do
         --sst )
                 tests+=("sst")
                 ;;
+        --gem5 )
+                tests+=("gem5")
+                ;;
         --dtm )
                 tests+=("dtm")
                 ;;
diff --git a/ci/sst_run_hostless_app.py b/ci/sst_run_hostless_app.py
new file mode 100644
index 000000000..3f8618808
--- /dev/null
+++ b/ci/sst_run_hostless_app.py
@@ -0,0 +1,53 @@
+# Copyright © 2019-2023
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Hostless SST runner: instantiate a single vortex.VortexGPGPU
+# component and run the given kernel. SST runs Vortex co-resident in
+# one process, primes the KMU DCRs directly via proc_->dcr_write
+# inside sim/simx/sst/vortex_simulator.cpp, and ticks the simulation
+# to completion. No host CPU, no CP, no PIO/DMA.
+#
+# Hostless is the only mode the SST integration currently supports:
+# there is no SST CPU component (e.g. Ariel/Vanadis) wired to a
+# Vortex regression test binary today. A future ci/sst_run_app.py
+# could add that path; the name slot is reserved.
+#
+# For memHierarchy timing modeling, the VortexGPGPU component exposes
+# an optional `memIface` SubComponent slot — see
+# docs/proposals/sst_simx_v3_proposal.md for the wiring recipe.
+#
+# Configurable via env vars (parallel to ci/gem5_run_hostless_app.py):
+#   VORTEX_TEST_DIR    — directory containing the kernel .vxbin
+#   VORTEX_TEST_KERNEL — kernel filename inside that dir
+#                        (default: kernel.vxbin, matching the
+#                         regression-test convention)
+#
+# Run via:
+#   VORTEX_TEST_DIR=tests/kernel/hello VORTEX_TEST_KERNEL=hello.vxbin \
+#       sst ci/sst_run_hostless_app.py
+
+import os
+import sst
+
+TEST_DIR    = os.environ.get("VORTEX_TEST_DIR")  # required; no default is supplied
+TEST_KERNEL = os.environ.get("VORTEX_TEST_KERNEL", "kernel.vxbin")  # optional; falls back to kernel.vxbin
+if not TEST_DIR:
+    raise RuntimeError("VORTEX_TEST_DIR env var is required")  # also raised when the variable is set but empty
+
+PROGRAM = f"{TEST_DIR}/{TEST_KERNEL}"  # kernel image path handed to the component's "program" parameter
+
+gpu = sst.Component("gpu0", "vortex.VortexGPGPU")  # the only component this configuration creates
+gpu.addParams({
+    "clock":   "1GHz",
+    "program": PROGRAM,
+})
diff --git a/ci/sst_test_vortex_conform.py b/ci/sst_test_vortex_conform.py
deleted file mode 100644
index 25681dc6d..000000000
--- a/ci/sst_test_vortex_conform.py
+++ /dev/null
@@ -1,7 +0,0 @@
-import sst
-
-gpu = sst.Component("gpu0", "vortex.VortexGPGPU")
-gpu.addParams({
-    "clock": "1GHz",
-    "program": "tests/kernel/conform/conform.vxbin"
-})
diff --git a/ci/sst_test_vortex_fibonacci.py b/ci/sst_test_vortex_fibonacci.py
deleted file mode 100644
index b174543db..000000000
--- a/ci/sst_test_vortex_fibonacci.py
+++ /dev/null
@@ -1,7 +0,0 @@
-import sst
-
-gpu = sst.Component("gpu0", "vortex.VortexGPGPU")
-gpu.addParams({
-    "clock": "1GHz",
-    "program": "tests/kernel/fibonacci/fibonacci.vxbin"
-})
diff --git a/ci/sst_test_vortex_hello.py b/ci/sst_test_vortex_hello.py
deleted file mode 100644
index ca4fc0199..000000000
--- a/ci/sst_test_vortex_hello.py
+++ /dev/null
@@ -1,7 +0,0 @@
-import sst
-
-gpu = sst.Component("gpu0", "vortex.VortexGPGPU")
-gpu.addParams({
-    "clock": "1GHz",
-    "program": "tests/kernel/hello/hello.vxbin"
-})
diff --git a/ci/sst_test_vortex_memHierarchy.py b/ci/sst_test_vortex_memHierarchy.py
deleted file mode 100644
index 2193985fb..000000000
--- a/ci/sst_test_vortex_memHierarchy.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# SST Phase 3 integration test for vortex.VortexGPGPU.
-#
-# Wires the VortexGPGPU component's optional `memIface` SubComponent slot
-# through an L1 cache to a memHierarchy.MemController. Every memory request
-# accepted by Vortex's local DRAM model is mirrored to the SST memHierarchy
-# as a StandardMem::Read or Write event, so memHierarchy can model timing /
-# capacity / contention alongside Vortex's own simulation.
-#
-# This is the Phase 3 demonstrator from docs/proposals/sst_simx_v3_proposal.md.
-# The local data path stays in Vortex (RAM is authoritative); SST sees
-# every transaction but doesn't have to serve data back. That gives us
-# meaningful integration without forcing v3's TLM data path through SST.
-
-import sst
-
-# --- Vortex GPGPU component (single-warp hello kernel) -----------------------
-gpu = sst.Component("gpu0", "vortex.VortexGPGPU")
-gpu.addParams({
-    "clock":   "1GHz",
-    "program": "tests/kernel/hello/hello.vxbin",
-})
-
-# Vortex's StandardMem-side adapter
-gpu_mem_iface = gpu.setSubComponent("memIface", "memHierarchy.standardInterface")
-
-# --- L1 cache between Vortex and memory --------------------------------------
-# A cache is required because memHierarchy.MemController routes via MemLink
-# and only registers its address range when there's an upstream cache that
-# advertises destinations.
-l1 = sst.Component("l1cache", "memHierarchy.Cache")
-l1.addParams({
-    "access_latency_cycles": "2",
-    "cache_frequency":       "1GHz",
-    "replacement_policy":    "lru",
-    "coherence_protocol":    "MESI",
-    "associativity":         "4",
-    "cache_line_size":       "64",
-    "L1":                    "1",
-    "cache_size":            "8KiB",
-})
-
-# --- Memory controller + simple backend (host RAM-backed) --------------------
-memctrl = sst.Component("memctrl0", "memHierarchy.MemController")
-memctrl.addParams({
-    "clock":          "1GHz",
-    "addr_range_end": 0x100000000 - 1,  # 4 GB
-})
-memory = memctrl.setSubComponent("backend", "memHierarchy.simpleMem")
-memory.addParams({
-    "access_time": "10ns",
-    "mem_size":    "4GiB",
-})
-
-# --- Wiring ------------------------------------------------------------------
-# Vortex GPGPU → L1 cache
-link_gpu_l1 = sst.Link("link_gpu_l1")
-link_gpu_l1.connect((gpu_mem_iface, "lowlink", "1ns"),
-                    (l1,            "highlink", "1ns"))
-
-# L1 cache → MemController
-link_l1_mem = sst.Link("link_l1_mem")
-link_l1_mem.connect((l1,      "lowlink",  "1ns"),
-                    (memctrl, "highlink", "1ns"))
diff --git a/ci/sst_test_vortex_vecadd.py b/ci/sst_test_vortex_vecadd.py
deleted file mode 100644
index 8a156cf81..000000000
--- a/ci/sst_test_vortex_vecadd.py
+++ /dev/null
@@ -1,7 +0,0 @@
-import sst
-
-gpu = sst.Component("gpu0", "vortex.VortexGPGPU")
-gpu.addParams({
-    "clock": "1GHz",
-    "program": "tests/kernel/vecadd/vecadd.vxbin"
-})
diff --git a/configure b/configure
index 14c0880d1..ea1abb5eb 100755
--- a/configure
+++ b/configure
@@ -69,7 +69,7 @@ copy_files() {
                         continue
                     fi
                     mkdir -p "$dest_dir"
-                    sed "s|@VORTEX_HOME@|$SOURCE_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@INSTALLDIR@|$PREFIX|g; s|@BUILDDIR@|$CURRENT_DIR|g; s|@TOOLCHAIN_REV@|$TOOLCHAIN_REV|g; s|@VORTEX_VERSION@|$VORTEX_VERSION|g" "$file" > "$dest_file"
+                    sed "s|@VORTEX_HOME@|$SOURCE_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@INSTALLDIR@|$PREFIX|g; s|@BUILDDIR@|$CURRENT_DIR|g; s|@TOOLCHAIN_REV@|$TOOLCHAIN_REV|g; s|@VORTEX_VERSION@|$VORTEX_VERSION|g; s|@GEM5_REV@|$GEM5_REV|g" "$file" > "$dest_file"
                     # apply permissions to bash scripts
                     read -r firstline < "$dest_file"
                     if [[ "$firstline" =~ ^#!.*bash ]]; then
diff --git a/docs/gem5_integration.md b/docs/gem5_integration.md
new file mode 100644
index 000000000..5b2e0f1af
--- /dev/null
+++ b/docs/gem5_integration.md
@@ -0,0 +1,441 @@
+# gem5 Integration
+
+Vortex runs inside the [gem5](https://www.gem5.org/) simulator as a
+`DmaDevice` SimObject, exposing the Vortex GPGPU to a simulated host
+CPU (x86 or ARM) through a Command Processor regfile + BAR-mapped
+VRAM. Use this when you want to model heterogeneous host-CPU +
+accelerator workloads with realistic cross-ISA cache and DMA timing,
+or to validate the v2 Command Processor architecture against a real
+host/device split.
+
+For the redesigned architecture, see
+[docs/proposals/gem5_v2_cp_migration_proposal.md](proposals/gem5_v2_cp_migration_proposal.md).
+The earlier [gem5_simx_v3_proposal.md](proposals/gem5_simx_v3_proposal.md)
+covers the original OPAE-protocol design; its §3 (host/device protocol)
+and §4 (SimObject design) are superseded by the v2 migration. This
+document is the operator manual for the current (v2 CP-first) design.
+
+## At a glance
+
+Three parts live in this repo:
+
+| Part | Source | Built artifact | Loaded by |
+|---|---|---|---|
+| Device library | `sim/simx/gem5/vortex_gpgpu.{cpp,h}` + `dev_mem.{cpp,h}` | `build/sim/simx/libvortex-gem5.so` | gem5 SimObject via `dlopen` |
+| gem5 SimObject | `sim/simx/gem5/vortex_gpgpu_dev.{cc,hh}` + `VortexGPGPU.py` + `SConscript` | Linked into `gem5.opt` after install | gem5 itself |
+| Host runtime | `sw/runtime/gem5/{vortex.cpp,driver.{cpp,h},Makefile}` | `build/sw/runtime/libvortex-gem5-{x86_64,aarch64}.so` | The simulated process inside gem5 |
+
+In addition, `ci/gem5_install.sh` fetches gem5 v25.0.0.1, drops the
+SimObject sources into `$GEM5_HOME/src/dev/vortex/`, and builds
+`build/{X86,ARM}/gem5.opt`.
+
+## Architecture in one paragraph
+
+The simulated host process loads the upstream dispatcher
+(`libvortex.so`) which dlopens the gem5 backend
+(`libvortex-gem5-x86_64.so`). The backend's only platform primitives
+are `mem_upload/download/copy` (regular memcpy through a host-visible
+BAR mapped to device VRAM) and `cp_mmio_{read,write}` (32-bit PIO to
+the device's CP regfile). All kernel launches, DCR programming, and
+fences flow through the dispatcher's Command Processor submission
+path: it writes `CMD_*` descriptors into a ring buffer in device VRAM
+(via mem_upload), commits via `cp_mmio_write(Q_TAIL_HI, ...)`, and
+polls completion via `cp_mmio_read(Q_SEQNUM, ...)`. The CP itself is
+the upstream `vortex::CommandProcessor` C++ class embedded in the
+device library; the SimObject ticks it on its own gem5 event chain
+and ticks the Vortex Processor on a parallel chain. Both event chains
+self-schedule only while they have work — the device is genuinely
+idle between commands.
+
+## One-time setup
+
+Vortex install / build as usual ([docs/install_vortex.md](install_vortex.md)),
+then add gem5:
+
+```bash
+cd build/   # standard Vortex out-of-tree build directory
+./ci/gem5_install.sh
+```
+
+This runs `sudo apt install` for gem5's build dependencies (scons,
+libprotobuf, m4, libboost, **gcc-aarch64-linux-gnu**, …), clones gem5
+v25.0.0.1 into `$TOOLDIR/gem5`, copies the Vortex SimObject sources
+into `$GEM5_HOME/src/dev/vortex/`, and builds `gem5.opt` for both X86
+and ARM (~15 min on a 64-core machine, ~30-45 min on a typical CI
+runner). The script is idempotent — re-running with the same
+`GEM5_REV` is a no-op.
+
+To install only one ISA:
+
+```bash
+GEM5_TARGETS="X86" ./ci/gem5_install.sh   # X86 only
+GEM5_TARGETS="ARM" ./ci/gem5_install.sh   # ARM only
+GEM5_TARGETS="X86 ARM" ./ci/gem5_install.sh   # both (default)
+```
+
+The pinned gem5 revision lives in `VERSION` (`GEM5_REV=v25.0.0.1`);
+bumping it requires re-running `ci/gem5_install.sh` and verifying
+both `gem5.opt` builds still load `VortexGPGPU` cleanly.
+
+## Building Vortex with gem5 support
+
+The device library is gated behind `USE_GEM5=1`. The default
+`make -C sim/simx` is **unchanged** — no gem5 dep, no `libvortex-gem5.so`
+produced.
+
+```bash
+make -C sim/simx                     # default; no gem5 artifacts
+make -C sim/simx USE_GEM5=1          # produces libvortex-gem5.so + gem5_smoke
+```
+
+`USE_SST=1` and `USE_GEM5=1` are mutually exclusive (the Makefile
+errors out if both are set).
+
+### Host runtime + tests (cross-compile)
+
+The simulated process inside gem5 loads the **host runtime**
+`libvortex-gem5-$HOST_ARCH.so`, which exposes the pure-v2 `callbacks_t`
+to the dispatcher. The `HOST_ARCH` knob is consistent across three
+Makefiles — runtime backend, stub, and regression tests:
+
+```bash
+# Native x86 (default)
+make -C sw/runtime/stub                          # → build/sw/runtime/libvortex.so
+make -C sw/runtime/gem5                          # → build/sw/runtime/libvortex-gem5-x86_64.so
+make -C tests/regression/vecadd                  # → build/tests/regression/vecadd/vecadd
+
+# Cross-compiled aarch64 — outputs land in $arch/ subdirs so x86
+# and ARM artifacts coexist:
+make -C sw/runtime/stub HOST_ARCH=aarch64        # → build/sw/runtime/aarch64/libvortex.so
+make -C sw/runtime/gem5 HOST_ARCH=aarch64        # → build/sw/runtime/aarch64/libvortex-gem5-aarch64.so
+make -C tests/regression/vecadd HOST_ARCH=aarch64 # → build/tests/regression/vecadd/vecadd-aarch64
+
+# armhf works the same way (note: armhf is 32-bit so the BAR
+# mapping above 4 GiB is out of reach — only standalone tests work):
+make -C sw/runtime/stub HOST_ARCH=armhf
+make -C sw/runtime/gem5 HOST_ARCH=armhf
+```
+
+The ARM targets require `gcc-aarch64-linux-gnu` /
+`gcc-arm-linux-gnueabihf` respectively — `ci/gem5_install.sh`
+installs these.
+
+## Running tests
+
+### From the regression harness
+
+```bash
+cd build/
+./ci/regression.sh --gem5
+```
+
+Runs both the standalone Phase-3 smoke test (kernel preloaded on the
+SimObject, no host CPU) and the Phase-5 end-to-end test (real SE-mode
+host program drives the device through CP submissions).
+
+To also run the ARM matrix entry (needs `gcc-aarch64-linux-gnu`):
+
+```bash
+VORTEX_GEM5_ARM=1 ./ci/regression.sh --gem5
+```
+
+Runs 6 tests:
+- X86 standalone hello (no host CPU; SimObject preloads kernel)
+- X86 e2e vecadd `-n16` (host CPU drives device via CP regfile)
+- X86 e2e sgemm `-n4`
+- ARM standalone hello
+- ARM e2e vecadd `-n16`
+- ARM e2e sgemm `-n4`
+
+Cross-arch e2e relies on two gem5 mechanisms working together:
+
+1. **`setInterpDir(prefix)`** prepends a sysroot to the dynamic
+   linker path embedded in the cross-compiled ELF
+   (`/lib/ld-linux-aarch64.so.1` → `/usr/aarch64-linux-gnu/lib/...`).
+   The Python config calls this when `VORTEX_DRIVER=gem5-aarch64`.
+2. **`system.redirect_paths`** redirects the *guest process's*
+   open()/stat() syscalls for `/lib/aarch64-linux-gnu/*` →
+   `/usr/aarch64-linux-gnu/lib/*` so the dynamic linker can resolve
+   libc, libstdc++, etc.
+
+Both paths point at the Ubuntu `gcc-aarch64-linux-gnu` package's
+install location — no extra setup needed.
+
+### By hand
+
+**Hostless** (no host CPU; kernel preloaded via SimObject parameter):
+
+```bash
+VORTEX_GEM5_DEV_LIB=$(pwd)/sim/simx/libvortex-gem5.so \
+VORTEX_TEST_DIR=$(pwd)/tests/kernel/hello \
+VORTEX_TEST_KERNEL=hello.vxbin \
+    $GEM5_HOME/build/X86/gem5.opt ci/gem5_run_hostless_app.py
+```
+
+`VORTEX_TEST_KERNEL` defaults to `kernel.vxbin`, so any standard
+regression test's kernel can be driven hostless without the host
+binary — e.g. rerun the command above with `VORTEX_TEST_KERNEL` unset
+and `VORTEX_TEST_DIR=$(pwd)/tests/regression/vecadd`.
+
+**End-to-end** — any standard Vortex regression test (host binary +
+kernel.vxbin) runs through the generic
+[`ci/gem5_run_app.py`](../ci/gem5_run_app.py) runner.
+
+```bash
+# vecadd
+VORTEX_GEM5_DEV_LIB=$(pwd)/sim/simx/libvortex-gem5.so \
+VORTEX_GEM5_HOST_RT_DIR=$(pwd)/sw/runtime \
+VORTEX_TEST_DIR=$(pwd)/tests/regression/vecadd \
+VORTEX_TEST_BIN=vecadd \
+VORTEX_TEST_ARGS="-n16" \
+    $GEM5_HOME/build/X86/gem5.opt ci/gem5_run_app.py
+
+# sgemm
+VORTEX_GEM5_DEV_LIB=$(pwd)/sim/simx/libvortex-gem5.so \
+VORTEX_GEM5_HOST_RT_DIR=$(pwd)/sw/runtime \
+VORTEX_TEST_DIR=$(pwd)/tests/regression/sgemm \
+VORTEX_TEST_BIN=sgemm \
+VORTEX_TEST_ARGS="-n4" \
+    $GEM5_HOME/build/X86/gem5.opt ci/gem5_run_app.py
+```
+
+Expected output ends with:
+```
+PASSED!
+```
+
+### Sizing tests for the 120 s budget
+
+The per-test `timeout 120` bound comes from
+[feedback_test_timeout_120s](../../../../.claude/projects/-home-blaisetine-dev/memory/feedback_test_timeout_120s.md).
+gem5 SE-mode runs the host CPU's CP poll loop in simulated time too,
+so **kernel runtime + dispatcher poll budget translate directly into
+gem5 wall time**. The regression script's default sizes fit; larger
+sizes are fine when run by hand outside the budget cap.
+
+## Address space layout
+
+```
+Host process VA (simulated, gem5 SE-mode) | Simulated PA | Backed by
+------------------------------------------+--------------+----------------------
+[0x0000_0000_0000, 0x0000_1000_0000)      | same         | gem5 DDR3 (process
+                                          |              |   heap/stack/code)
+[0x0000_2000_0000, 0x0000_2000_0200)      | same         | VortexGPGPU CP regfile
+                                          |              |   (32-bit PIO)
+[0x0001_0000_0000, 0x0002_0000_0000)      | same         | VortexGPGPU VRAM
+                                          |              |   (BAR-mapped to
+                                          |              |    in-process simx::RAM)
+```
+
+PIN_BASE_ADDR = `0x100000000` is identity-mapped via `Process.map()`
+so host stores at PIN_BASE+dev_addr land in the same in-process
+simx::RAM bytes the CP and Vortex read. PIO_BASE_ADDR = `0x20000000`
+is identity-mapped (cacheable=False) so the dispatcher's PIO MMIO
+reaches the SimObject's regfile decoder.
+
+These constants are duplicated in two places — `sw/runtime/gem5/driver.h`
+and `ci/gem5_run_app.py`. If you change one, change the other.
+
+## Writing your own gem5 Python script
+
+The minimal recipe for hosting Vortex inside a custom gem5 system:
+
+```python
+from m5.objects import (
+    AddrRange, AtomicSimpleCPU, DDR3_1600_8x8, MemCtrl, Process,
+    Root, SEWorkload, SrcClockDomain, System, SystemXBar,
+    VoltageDomain, VortexGPGPU,
+)
+
+# Mappings expected by sw/runtime/gem5/driver.h.
+PIO_BASE, PIO_SIZE = 0x20000000, 0x0200          # CP regfile (32-bit)
+PIN_BASE, PIN_SIZE = 0x100000000, 0x100000000    # BAR-mapped VRAM
+NUM_CPUS = 4   # >=2 required for the dispatcher's per-Queue worker thread
+
+system = System()
+system.clk_domain = SrcClockDomain(clock="3GHz",
+                                   voltage_domain=VoltageDomain())
+system.mem_mode = "atomic"
+system.mem_ranges = [AddrRange("1GiB")]
+system.membus = SystemXBar()
+system.system_port = system.membus.cpu_side_ports
+
+# Multiple CPU contexts — the upstream dispatcher spawns a per-Queue
+# worker thread; clone() in SE-mode needs a free HW context to land on.
+system.cpu = [AtomicSimpleCPU(cpu_id=i) for i in range(NUM_CPUS)]
+system.multi_thread = True
+for cpu in system.cpu:
+    cpu.createInterruptController()
+    cpu.icache_port = system.membus.cpu_side_ports
+    cpu.dcache_port = system.membus.cpu_side_ports
+    # X86 needs explicit interrupt port wiring; ARM does not.
+    cpu.interrupts[0].pio           = system.membus.mem_side_ports
+    cpu.interrupts[0].int_requestor = system.membus.cpu_side_ports
+    cpu.interrupts[0].int_responder = system.membus.mem_side_ports
+
+# DRAM serves the process's address space below PIO_BASE.
+system.mem_ctrl = MemCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = AddrRange(0, PIO_BASE)
+system.mem_ctrl.port = system.membus.mem_side_ports
+
+# The Vortex device — claims both the CP regfile PIO range and the
+# BAR-mapped VRAM range (gem5_v2_cp_migration §3).
+system.vortex = VortexGPGPU(
+    library = "/path/to/build/sim/simx/libvortex-gem5.so",
+    kernel  = "",   # NO preload — the host binary uploads via CP
+)
+system.vortex.pio_addr = PIO_BASE
+system.vortex.pio_size = PIO_SIZE
+system.vortex.pin_addr = PIN_BASE
+system.vortex.pin_size = PIN_SIZE
+system.vortex.pio = system.membus.mem_side_ports
+system.vortex.dma = system.membus.cpu_side_ports
+
+# Workload — the host binary loads libvortex.so + libvortex-gem5-x86_64.so.
+process = Process(
+    pid=100,
+    cwd="/path/to/your/test",
+    cmd=["/path/to/your/test/binary"],
+    executable="/path/to/your/test/binary",
+    env=[
+        "VORTEX_DRIVER=gem5-x86_64",
+        "LD_LIBRARY_PATH=/path/to/build/sw/runtime",
+    ],
+)
+
+system.workload = SEWorkload.init_compatible(process.executable)
+for cpu in system.cpu:
+    cpu.workload = process       # required: workload size must equal numThreads
+    cpu.createThreads()
+
+import m5
+root = Root(full_system=False, system=system)
+m5.instantiate()
+
+# CRITICAL: Process.map() must come AFTER m5.instantiate().
+# Identity-mapping PIO + PIN gives the runtime direct CPU access to
+# the device's CP regfile and to BAR-mapped VRAM.
+system.cpu[0].workload[0].map(PIO_BASE, PIO_BASE, PIO_SIZE, cacheable=False)
+system.cpu[0].workload[0].map(PIN_BASE, PIN_BASE, PIN_SIZE, cacheable=False)
+
+m5.simulate()
+```
+
+Reference implementations:
+- [ci/gem5_run_hostless_app.py](../ci/gem5_run_hostless_app.py) — hostless variant (preload via `kernel=` param; no host CPU)
+- [ci/gem5_run_app.py](../ci/gem5_run_app.py) — e2e variant (any regression test via `VORTEX_TEST_BIN`)
+
+## Load-bearing invariants — do not violate
+
+### 1. Process.map() goes AFTER m5.instantiate()
+
+`Process.map(vaddr, paddr, size)` is a C++ method on the underlying
+`gem5::Process` object; that object only exists after
+`m5.instantiate()` builds the SimObject tree. Calling `.map()`
+before instantiate raises `RuntimeError: Attempt to instantiate
+orphan node <orphan Process>`. Confirmed by gem5's own AMD GPU
+integration at `$GEM5_HOME/configs/example/apu_se.py:1055`.
+
+### 2. PIO and PIN regions must be identity-mapped — and PIN must be cacheable=False
+
+`sw/runtime/gem5/driver.h` hard-codes:
+- `PIO_BASE_ADDR = 0x20000000` (CP regfile; 0x200 bytes)
+- `PIN_BASE_ADDR = 0x100000000` (BAR-mapped VRAM; 4 GB)
+
+The Python config must `process.map()` both at the same physical
+addresses, with `cacheable=False` on PIN. With caching enabled the
+host CPU's L1 could hold the new ring entry while `Q_TAIL_HI` is
+observed by the CP — the CP fetches a stale cache line and the dispatcher
+hangs polling `Q_SEQNUM`.
+
+Changing either constant requires updating both the Python config
+**and** `sw/runtime/gem5/driver.h` (they are not auto-synced).
+
+### 3. CPU thread context count must be >= 2
+
+The upstream dispatcher (commit `157e7a1`) spawns a per-Queue worker
+thread at `vx_queue_create`. SE-mode `clone()` returns EAGAIN if
+there is no free HW context, which surfaces as
+`std::system_error: Resource temporarily unavailable` at the
+dispatcher constructor.
+
+Use multiple CPU instances (one per thread) and
+`system.multi_thread = True`. Assigning the same Process to every
+CPU is required because gem5 fatals if
+`workload.size() != numThreads`.
+
+### 4. PIO accesses to the CP regfile are 32-bit
+
+The CP regfile is 32-bit-wide; `cp_mmio_write/read` in the host
+runtime are explicitly 32-bit (`mmio_write32` / `mmio_read32` in
+`driver.cpp`). Don't issue 64-bit accesses — gem5 will deliver a
+single packet of the wrong width and the SimObject will route the
+extra bytes into the next regfile slot.
+
+### 5. The Vortex `Processor` and `CommandProcessor` are independent gem5 event chains
+
+`cpTickEvent_` advances the CP one functional cycle; `vortexTickEvent_`
+advances the Vortex `Processor::cycle()`. Both self-schedule only
+while their respective busy flag is true. When the CP fires
+`CMD_LAUNCH`, the `vortex_start` hook schedules `vortexTickEvent_`
+via the registered start handler (set at `VortexGPGPU` construction).
+Don't try to combine them into a single tick — that breaks
+"concurrent host + CP + GPU progress" which is the whole point of
+the simulation model.
+
+### 6. USE_SST=1 and USE_GEM5=1 are mutually exclusive
+
+The Makefile rejects both at once. Different external simulators,
+different LDFLAGS, different `libvortex.so` shapes. Pick one per
+build.
+
+## Architectural choices you may want to revisit
+
+These are documented in the
+[v2 CP migration proposal](proposals/gem5_v2_cp_migration_proposal.md)
+but worth surfacing:
+
+- **In-process VRAM with DevMemAccessor seam** (proposal §2.5). v1
+  uses `InProcessDevMem` (wraps simx::RAM directly). The accessor
+  interface is designed to be swappable to a gem5 `SimpleMemory` +
+  DMA-port path in v2 without touching CP hook code or Vortex memory
+  code.
+- **Single ClockDomain for CP + Vortex in v1** (proposal §2.4, D2).
+  Real silicon has separate clocks; v2 would add a second
+  `ClockDomain` and rate-match the tick events.
+- **Raw PIO range, not a PCIe BAR / config space**
+  (proposal §2.1). Swap base class from `DmaDevice` to `PciDevice`
+  for a more realistic FS-mode integration.
+- **Polling completion, not MSI-X interrupts** (proposal §8). The
+  host runtime spins on `Q_SEQNUM` PIO reads. v2 work would let the
+  CP raise an interrupt and let the dispatcher sleep until it fires.
+- **Multi-queue PIO map reserves 4 slots; v1 host runtime exercises
+  Q0 only** (proposal §2.6, D4). Q1–Q3 hardware is ready for future
+  vortex2.h multi-queue work.
+
+## CI
+
+`./ci/regression.sh --gem5` runs the gem5 tests. It is intentionally
+**not** part of `--all`: the gem5 install is heavy and gated like SST. The
+`.github/workflows/ci.yml` matrix includes a `gem5` entry that runs
+on hosted runners; the ARM tests are gated on `VORTEX_GEM5_ARM=1`.
+
+Apptainer integration (the `apptainer-ci.yml` pipeline) does **not**
+include gem5 — adding it to `miscs/apptainer/vortex.def` is out of
+scope. Use the hosted CI for gem5.
+
+## Troubleshooting
+
+| Symptom | Cause | Fix |
+|---|---|---|
+| `dlopen('libvortex-gem5.so') failed: cannot open shared object file` | gem5 SimObject can't find the device library | Set `VortexGPGPU(library="/abs/path/to/libvortex-gem5.so", ...)` to absolute path |
+| `Cannot open library: libvortex-gem5-x86_64.so: cannot open shared object file` | Stub can't find the host runtime backend | Set `LD_LIBRARY_PATH=/path/to/sw/runtime` in the `env=[...]` list passed to `Process()` |
+| `terminate called after throwing an instance of 'std::system_error': Resource temporarily unavailable` | Dispatcher's per-Queue worker `std::thread` can't `clone()` into a free HW context | Use multiple CPU instances + `system.multi_thread = True`; assign the same Process to every CPU (invariant §3) |
+| `system.membus has two ports responding within range [...]` | DRAM `mem_ctrl.dram.range` overlaps with VortexGPGPU's PIO or PIN range | Shrink `dram.range = AddrRange(0, PIO_BASE)` so the device-owned ranges have exclusive routing |
+| `Tried to write unmapped address 0xXXX` | Host runtime is using stale PIN_BASE_ADDR (mismatch with Python config), or `Process.map()` was skipped | Confirm both `sw/runtime/gem5/driver.h` and the Python config use the same `PIN_BASE_ADDR`; ensure `Process.map(PIN_BASE, PIN_BASE, PIN_SIZE)` runs after `m5.instantiate()` |
+| `Attempt to instantiate orphan node <orphan Process>` | `Process.map()` called before `m5.instantiate()` | Move all `.map()` calls AFTER `m5.instantiate()` — see invariant §1 above |
+| `fatal: VortexGPGPU: dlsym(vortex_gem5_cp_mmio_write) failed` | Device library is missing the C ABI symbol — usually means the `library=` parameter points at the wrong .so | `library=` is the **device** library `build/sim/simx/libvortex-gem5.so` (no arch suffix), NOT the host runtime `libvortex-gem5-x86_64.so` |
+| `fatal: system.membus has two ports responding within range [0x10000000:0x20000000]` (standalone hello) | `pin_size` defaulted to non-zero in an old gem5.opt; standalone test doesn't need the BAR | Re-install + rebuild gem5.opt OR explicitly set `pin_size = 0` on the VortexGPGPU instance |
+| Test hangs polling `Q_SEQNUM` after first launch | Cacheable PIN region — host's L1 holds the ring entry; CP sees stale bytes | Set `cacheable=False` on the PIN `Process.map()` call (invariant §2) |
+| `ccache g++ ... undefined reference to fmt::v8::detail::error_handler::on_error` | ccache served a stale object compiled against a different `fmt` version | `CCACHE_DISABLE=1 make -C sim/simx clean && CCACHE_DISABLE=1 make ...` |
diff --git a/docs/index.md b/docs/index.md
index a7b9000d4..0c3504d72 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -8,3 +8,4 @@
 - [Contributing](contributing.md): Process for contributing your own features including repo semantics and testing
 - [Debugging](debugging.md): Debugging configurations for each Vortex driver
 - [Building the Toolchain from Source](building_toolchain.md): Maintainer-facing build recipes for Verilator, RISC-V GNU, LLVM (with X86 + lld + SPIR-V), compiler-rt, musl, and POCL
+- [gem5 Integration](gem5_integration.md): Running Vortex inside the gem5 simulator (x86/ARM host CPU + Vortex device driven through a Command Processor regfile and BAR-mapped VRAM)
diff --git a/docs/proposals/gem5_simx_v3_proposal.md b/docs/proposals/gem5_simx_v3_proposal.md
new file mode 100644
index 000000000..3bce6ff95
--- /dev/null
+++ b/docs/proposals/gem5_simx_v3_proposal.md
@@ -0,0 +1,1040 @@
+# gem5 Integration for SimX v3 — Proposal
+
+**Date:** 2026-05-16
+**Status:** ✅ Original Phases 0–7 complete (OPAE-protocol design). **§3 (host/device protocol) and §4 (SimObject design) SUPERSEDED** by [gem5_v2_cp_migration_proposal.md](gem5_v2_cp_migration_proposal.md) after upstream's pure-v2 `callbacks_t` + CommandProcessor landed (commits `086d26b`, `8bc2564`, `16aa1ca`). The current operator manual is [docs/gem5_integration.md](../gem5_integration.md). §0–§2 (motivation, source-tree layout) and §5+ (testing, install, cross-arch) remain accurate.
+**Author:** Blaise Tine
+**Related:**
+[simx_v3_proposal.md](simx_v3_proposal.md) (Phase 5: TLM data path),
+[sst_simx_v3_proposal.md](sst_simx_v3_proposal.md) (the sister integration whose patterns this proposal follows),
+[master_merge_v3_proposal.md](master_merge_v3_proposal.md) §10.2 (the precedent for cross-simulator integrations on this line),
+[`~/dev/vortex_gem5`](https://github.com/sij814/vortex_gem5) on branch `gem5`, commit `91dcf17` ("working Vortex with gem5", 2025-05-22 — Injae Shin, UCLA capstone),
+[Injae Shin, "gem5-Vortex: Heterogeneous Cross-ISA Integration of Vortex GPGPU in gem5"](#) (capstone report, 2025).
+
+---
+
+## 1. Constraints (load-bearing)
+
+Any design that breaks one of these is wrong.
+
+1. **One source of truth for memory state.** Per
+   [simx_v3_proposal.md §3.3](simx_v3_proposal.md), data lives in the
+   channel hierarchy: `MemReq`/`MemRsp` packets carry actual bytes
+   between `MemCoalescer` → `Cache` → `Memory`, and the `RAM` image
+   attached to `Memory` is authoritative. There is no shadow backing
+   store and no parallel `MemBackend`. The gem5 integration plugs in at
+   exactly one boundary (the device's DMA port maps to `RAM`
+   read/write); it does **not** introduce a second data path.
+2. **Single clock owner per simulation.** Under gem5, gem5 drives the
+   clock: `VortexGPGPU::tick()` (a gem5 `EventFunctionWrapper` that
+   reschedules itself every cycle at the device clock) calls
+   `Processor::cycle()`. SimX does not advance on its own and there is
+   no worker thread doing async `Processor::run()` in the background.
+   (This is a deliberate departure from the legacy `vortex_gem5` design
+   — see §2.2 — which is the source of most of that branch's bugs.)
+3. **gem5 plugs in at one boundary, not many.** Vortex → gem5 traffic
+   crosses two well-defined interfaces:
+   - **PIO** for MMIO command/status registers (the OPAE AFU image
+     layout, unchanged from `sw/runtime/opae`).
+   - **DMA** for staging-buffer host↔device transfers, and for any
+     future host-visible memory window.
+   The cache hierarchy, scheduler, ALU/FPU, KMU, and the new
+   `Processor::cycle()` entry point do not know gem5 exists.
+4. **No regression for non-gem5 builds.** `make -C sim/simx` (no
+   `USE_GEM5=1`) continues to produce a self-contained `simx` binary
+   identical to today's. gem5 is opt-in compile-time, not a runtime
+   probe, and ships as a separate shared library (`libvortex-gem5.so`)
+   that the gem5 SimObject loads. Per §1.4 of
+   [sst_simx_v3_proposal.md](sst_simx_v3_proposal.md).
+5. **The Vortex tree owns the integration code.** All gem5-facing C++
+   (the `DmaDevice` SimObject) and Python (SimObject config + test
+   scripts) live under `sim/simx/gem5/` and `ci/gem5_test_vortex_*.py`
+   in this repo. `ci/gem5_install.sh` fetches a pinned upstream gem5
+   release and copies/symlinks our SimObject into its source tree
+   before building. Versioning the integration alongside Vortex is what
+   makes it possible to review API-breaking changes in a single PR;
+   the legacy split across two repos is what froze `vortex_gem5` at a
+   two-year-old SimX.
+6. **Author attribution.** The legacy `vortex_gem5` design (DMA-bouncing
+   through a pinned staging buffer, OPAE-shaped MMIO command set, ARM
+   SE-mode runtime) is Injae Shin's capstone work. The
+   re-implementation is a rewrite, not a port (§2), but each new file's
+   commit body cites the capstone report and the legacy commit
+   (`vortex_gem5@91dcf17`).
+
+---
+
+## 2. Why the legacy `vortex_gem5` cannot be ported as-is
+
+### 2.1 The architectural mismatch
+
+`vortex_gem5` was built on pre-v3 SimX (`Arch`, `Processor*`,
+single-step `run()`, `set_running(true)`, `VX_DCR_BASE_*` startup DCRs
+broadcast to all cores). v3 explicitly retired all of those:
+
+| Concern | Legacy SimX (vortex_gem5) | SimX v3 (this branch) |
+|---|---|---|
+| Sizing | `Arch arch(NUM_THREADS, NUM_WARPS, NUM_CORES)` object | Macros (`NUM_THREADS`, etc.) — no `Arch` class |
+| Top-level | `Processor(arch)` ctor with arg | `Processor()` no-arg ctor |
+| Run model | `processor->run()` is one cycle | `processor.run()` blocks to completion |
+| Single-cycle step | `processor->run()` per cycle from `proc_tick()` | does not exist — must be added (`Processor::cycle()`) |
+| Kernel dispatch | `set_running(true)` + `VX_DCR_BASE_STARTUP_*` | `KMU::start()` + `VX_DCR_KMU_*` (startup + grid/block dims) |
+| Cache flush | implicit in `run()` finish | explicit: `dcr_read(VX_DCR_BASE_CACHE_FLUSH, cid, &dummy)` per core before host read-back |
+| Memory hierarchy | `MemSim` + `CacheSim` are timing-only, data sits in `MemBackend` (`Emulator`-side) | `Memory` + `Cache` carry data through `MemReq`/`MemRsp`; backing image is in `RAM` attached to `Memory` |
+| Runtime layout | top-level `runtime/{stubarm,opaesimx}/` | reorganized under `sw/runtime/` per [master_merge §3](master_merge_v3_proposal.md) |
+
+So the **shape of the gem5 plug-in changes**: not "tick the legacy
+single-cycle Processor" but "add a `cycle()` entry point to the v3
+Processor and call it from the gem5 SimObject," with KMU-style dispatch
+and an explicit cache-flush before host read-back.
+
+### 2.2 Specific bugs in the legacy code
+
+A walk-through of `vortex_gem5/sim/{simx,opaesimx}/` and
+`vortex_gem5/runtime/{stubarm,opaesimx}/` found the following defects.
+Each is called out so the redesign does not re-introduce it.
+
+| # | File | Defect | Why it matters |
+|---|---|---|---|
+| B1 | `sim/simx/simx_device.cpp:122` (`proc_tick`) | Calls `processor_->run()` directly. On legacy SimX this was a single step; on v3 it would block until program completion. | The "tick per gem5 cycle" pattern simply won't work. We must add a real single-cycle `Processor::cycle()` (already required for SST). |
+| B2 | `sim/simx/simx_device.cpp:111` (`start`) | `processor_->set_running(true)` — that API does not exist in v3. The KMU now drives execution and requires `VX_DCR_KMU_GRID_DIM_*` / `VX_DCR_KMU_BLOCK_DIM_*` to be written before the first cycle. | Even after re-plumbing, kernels won't launch without the KMU DCR setup (see `sim/simx/main.cpp:101–116`). |
+| B3 | `sim/opaesimx/opae_simx.cpp:185, 199` (`read_mmio64`/`write_mmio64`) | Implementation is `*(uint64_t*)(GEM5_BASE_ADDR + offset)` — a raw host-pointer dereference into a fixed virtual address. | Only works when the host runtime and the gem5 device share an address space (i.e., when the host runtime is *not* actually inside gem5). It is a stand-in for the real path, not the real path. Cross-ISA simulation defeats the assumption: an ARM userspace process inside gem5 cannot dereference `0x20000000` and reach the device. The legacy code papers over this with a co-resident driver hack; v3 needs a real PIO/DMA path. |
+| B4 | `sim/opaesimx/opae_simx.cpp:204–399` | Several hundred lines of commented-out CCI/AVS bus + Verilator (`device_->…`) plumbing left in place, referencing fields and types that do not exist in this file. | Dead code that obscures what the module actually does. Drop it; the new gem5 wrapper has no CCI bus to model. |
+| B5 | `sim/opaesimx/opae_simx.cpp:71` (`dram_sim_` field) | DRAM model is constructed but never ticked or consulted after the gem5 hack landed. | Dead state. |
+| B6 | `sim/opaesimx/opae_simx.cpp:103` (`pinned_alloc_`) | Uses `PIN_BASE_ADDR = 0x10000000` with `PINNED_MEM_SIZE = 0xFFFFFF` (16 MB), hardcoded. No bounds check beyond `MemoryAllocator::allocate` failure. | Tiny by design — large kernel inputs would silently fail. The v3 design should size from `GLOBAL_MEM_SIZE`/`ALLOC_BASE_ADDR` and surface OOM errors. |
+| B7 | `runtime/opaesimx/vortex.cpp:324, 367` | `auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE);` — uses float `log2` for an integer constant, then discards the result. | Cosmetic / dead, but a smell. Use `log2ceil(CACHE_BLOCK_SIZE)` from `sw/common/util.h`. |
+| B8 | `runtime/opaesimx/vortex.cpp:418–474` (`ready_wait`) | `nanosleep` call is **commented out**; the busy loop only decrements `timeout_ms` and never sleeps. On a long-running kernel inside gem5 SE-mode this saturates the simulated ARM core. | Either use the gem5 device's interrupt path (preferred — implementable as an MMIO doorbell) or restore the `nanosleep` so the ARM CPU is idle while the GPU runs. |
+| B9 | `runtime/opaesimx/vortex.cpp:349–390` (`download`) | No cache-flush step before reading back results from device memory. | On v3, dirty lines must be drained via `dcr_read(VX_DCR_BASE_CACHE_FLUSH, cid, &dummy)` per core (see `sim/simx/main.cpp:194–197`, `sw/runtime/simx/vortex.cpp:191–197`) or the host sees stale data. |
+| B10 | `runtime/opaesimx/vortex.cpp:478–489` (`dcr_write`) | OPAE protocol has `CMD_DCR_WRITE` but no `CMD_DCR_READ`. | The cache-flush fix above requires a `dcr_read` path. Current `sw/runtime/opae` already adds `CMD_DCR_READ` + `MMIO_DCR_RSP` — adopt the same shape on the gem5 device. |
+| B11 | `runtime/stubarm/vortex.cpp:54` | `static callbacks_t g_callbacks;` global with `vx_dev_init(&g_callbacks)` resolved at link time. | Works for a single-device test but breaks `vx_dev_open` from being called concurrently from two host processes. Less critical for the gem5 use case (single device per simulation) but worth flagging. |
+| B12 | `sim/simx/simx_device.cpp` (`Impl`) | Uses `std::future<void> future_` for shutdown synchronization but `proc_tick()` calls `processor_->run()` directly on the caller thread. The mutex / future plumbing implies an async model that isn't actually used. | Confused concurrency contract. The v3 design must pick one: synchronous tick from the gem5 event loop (this proposal) **or** async run with a doorbell — not both. |
+| B13 | `runtime/stubarm/Makefile:7` + `runtime/opaesimx/Makefile:9` | Cross-compiler hardcoded to `arm-linux-gnueabihf-g++` (32-bit ARM hard-float). | gem5 also models AArch64 ARMv8 and x86_64, and most contemporary ARM ports are 64-bit. The v3 build selects compiler from a `HOST_ARCH` make variable (`x86_64`, `aarch64`, `armhf`); see Phase 4. |
+| B14 | `runtime/opaesimx/vortex.cpp:489` (`dcr_write`) and `stubarm/vortex.cpp:139` | Both runtimes write to DCR via the OPAE protocol but no MMIO ordering / fence is established between DCR writes and the `CMD_RUN` MMIO. | Inside gem5 the host CPU model may reorder MMIO. Need an explicit barrier before `CMD_RUN` (per `HOST_ARCH`: `mfence` for x86, `dmb sy` for ARM). Phase 4 provides a `vortex_gem5_mmio_fence()` inline helper. |
+| B15 | `sim/opaesimx/opae_simx.cpp:138–157` (`prepare_buffer`) | Returns `*buf_addr = (void*)buffer.ioaddr;` — casts an integer device IO address back to a `void*`. | The runtime then dereferences this pointer to do `memcpy(staging_ptr_, host_ptr, size)` (line 322 of `runtime/opaesimx/vortex.cpp`). Same root cause as B3 — only works when host runtime and device share an address space. Under real gem5 the runtime must `mmap` the pinned region via a syscall the gem5 device intercepts, or the gem5 device must expose the pinned region as a PIO/DMA window. |
+
+Together B1, B2, B3, B6, B9, B14 and B15 mean the legacy integration as
+literally written does not run a kernel correctly under v3 even after
+the path renames are applied; it requires architectural rework, not
+porting.
+
+### 2.3 What still ports as design intent
+
+The following elements of the legacy paper's design intent still apply, and we keep them:
+
+- **OPAE-shaped MMIO command set.** `CMD_RUN`, `CMD_MEM_READ`,
+  `CMD_MEM_WRITE`, `CMD_DCR_WRITE`, `MMIO_CMD_TYPE`, `MMIO_CMD_ARG0..2`,
+  `MMIO_STATUS`. Add `CMD_DCR_READ` + `MMIO_DCR_RSP` per the v3 OPAE
+  runtime (B10). The kernel runtime under `sw/runtime/gem5/` reuses
+  this layout so the same `vortex.h` shim layer that drives `opae`
+  also drives `gem5`.
+- **Pinned staging buffer pattern** for host↔device transfers. A
+  fixed device-visible region of host address space; runtime
+  `memcpy`'s into it, device DMAs out of it. Sizing is dynamic
+  (allocate-on-demand) rather than the legacy fixed-16-MB chunk (B6).
+- **Single-PIO-range device** registered to gem5 with the OPAE MMIO
+  offsets. The runtime issues 64-bit MMIO writes; the SimObject
+  decodes them in `write()` / `read()`.
+- **The host SE-mode runtime** (`sw/runtime/gem5/`, native x86 or cross-compiled ARM)
+  shipped into gem5's SE-mode app, **NOT** a full-system Linux on the
+  guest. The paper makes this point explicitly and it is the
+  differentiator vs. NoMali (FS-only) and AMD GPU (FS-only). See
+  `capstone §IIC`.
+
+### 2.4 What needs a v3 redesign
+
+- **`sim/simx/simx_device.{cpp,h}`** — replace with
+  `sim/simx/gem5/vortex_gpgpu.{cpp,h}` (the SimObject wrapper)
+  plus reuse of the new `Processor::cycle()` API. The legacy file's
+  `Impl` class is the wrong shape (B1, B2, B12).
+- **`sim/opaesimx/opae_simx.{cpp,h}`** — delete entirely. The legacy
+  module is a host-side OPAE stub whose `read_mmio64`/`write_mmio64`
+  do raw pointer arithmetic (B3, B15). The v3 design routes MMIO
+  through gem5's PIO port; there is no host-side stub.
+- **`runtime/opaesimx/`** — delete. The OPAE-stub path was a
+  pre-gem5 debugging convenience; under v3 we test the gem5 device
+  end-to-end via a gem5 Python script (§4, Phase 5), not via a
+  co-resident driver.
+- **`runtime/stubarm/`** — replace with `sw/runtime/gem5/`,
+  re-implemented against the same `callbacks.h` ABI as
+  `sw/runtime/simx`/`opae`/`rtlsim`, with cache-flush plumbed in
+  (B9), MMIO fences before `CMD_RUN` (B14), and a configurable ARM
+  cross-compiler target (B13).
+
+---
+
+## 3. Target architecture
+
+```
+                ┌───────────────────────────────────────────────┐
+                │  gem5 simulation                              │
+                │  ─────────────────                            │
+                │  ./ci/gem5_test_vortex_hello.py               │
+                │  (gem5.opt is build/X86/gem5.opt or           │
+                │   build/ARM/gem5.opt; both supported)         │
+                │                                               │
+                │  ┌─────────────┐         ┌─────────────────┐  │
+                │  │ Host CPU    │ ──PIO─▶ │ VortexGPGPU     │  │
+                │  │ (X86 or ARM,│ ◀─PIO── │ (DmaDevice ↓    │  │
+                │  │  SE mode)   │         │  PioDevice)     │  │
+                │  │ user        │         │  ┌───────────┐  │  │
+                │  │ binary:     │         │  │ MMIO regs │  │  │
+                │  │  hello +    │         │  └───────────┘  │  │
+                │  │  libvortex- │         │  ┌───────────┐  │  │
+                │  │  gem5.so    │ ──DMA─▶ │  │ Pinned    │  │  │
+                │  │  (native    │ ◀─DMA── │  │ staging   │  │  │
+                │  │   for X86,  │         │  │ buffer    │  │  │
+                │  │   cross-    │         │  │ window    │  │  │
+                │  │   compiled  │         │  └───────────┘  │  │
+                │  │   for ARM)  │         │       │         │  │
+                │  └─────────────┘         │       ▼         │  │
+                │         │                │  ┌───────────┐  │  │
+                │         │ MemPort        │  │ vortex::  │  │  │
+                │         ▼                │  │ Processor │  │  │
+                │  ┌─────────────┐         │  │ (SimX v3) │  │  │
+                │  └─────────────┘         │  │           │  │  │
+                │                          │  │  Cluster[]│  │  │
+                │                          │  │   Cache   │  │  │
+                │                          │  │   Memory ─┼──┼──┼─▶ RAM (Vortex VRAM,
+                │                          │  └───────────┘  │  │      held inside the
+                │                          │   ▲             │  │      device — separate
+                │                          │   │ cycle()     │  │      address space from
+                │                          │  ┌┴──────────┐  │  │      gem5 DRAM)
+                │                          │  │ tick      │  │  │
+                │                          │  │ (gem5     │  │  │
+                │                          │  │  event)   │  │  │
+                │                          │  └───────────┘  │  │
+                │                          └─────────────────┘  │
+                └───────────────────────────────────────────────┘
+```
+
+### 3.1 The plug-in boundary
+
+The Vortex side exposes **one** plug-in unit: `libvortex-gem5.so`. It
+is built from the same `sim/simx/*.{cpp,h}` sources as the default
+`simx` binary, plus a single new wrapper file
+(`sim/simx/gem5/vortex_gpgpu.{cpp,h}`) that holds:
+
+- A `vortex::Gem5Wrapper` C++ class that owns a `vortex::Processor`,
+  a `vortex::RAM` (the device VRAM), and a thin `cycle()` entry
+  point — exactly mirroring `vortex::VortexSimulator` in
+  `sim/simx/sst/`.
+- A C-ABI shim (`vortex_gem5_create()`, `vortex_gem5_tick()`,
+  `vortex_gem5_mmio_write64()`, `vortex_gem5_mmio_read64()`,
+  `vortex_gem5_dma_read()`, `vortex_gem5_dma_write()`, …) so the
+  gem5-side SimObject is decoupled from C++ ABI changes in
+  `vortex::Processor`. **The C ABI is the contract;** changing it
+  requires a coordinated update of the gem5-side SimObject.
+
+The gem5 side is **one** SimObject + **one** Python file, both shipped
+in this repo at `sim/simx/gem5/`:
+
+- `vortex_gpgpu_dev.{cc,hh}` — subclasses `gem5::DmaDevice` (which
+  itself subclasses `PioDevice`). Holds an opaque
+  `vortex_gem5_handle_t`; on `tick()`, calls `vortex_gem5_tick()`. PIO
+  reads/writes decode the OPAE MMIO offsets and forward to
+  `vortex_gem5_mmio_*`. DMA reads/writes triggered by
+  `CMD_MEM_{READ,WRITE}` use gem5's `DmaPort` and copy bytes into the
+  device VRAM via `vortex_gem5_dma_*`.
+- `VortexGPGPU.py` — `gem5.SimObject` definition with `pio_addr`,
+  `pio_size`, `pio_latency`, `dma_latency`, `clock`, `library`
+  (path to `libvortex-gem5.so`), and `kernel` (path to `*.vxbin` —
+  loaded into VRAM at boot, in lieu of the runtime upload path, for
+  smoke tests).
+
+`ci/gem5_install.sh.in` fetches a pinned gem5 release
+(see §3.7 for version), copies the SimObject files above into
+`<gem5>/src/dev/vortex/`, drops a one-line `SConscript`, and runs
+`scons build/X86/gem5.opt` and `scons build/ARM/gem5.opt` (§3.6).
+
+**Nothing upstream of `vortex_gem5_create()` knows gem5 exists.** This
+satisfies §1.3.
+
+### 3.2 The cycle interface
+
+`Processor::cycle()` does **not exist** in v3 today. It is a direct
+prerequisite of both the SST integration (per
+[sst_simx_v3_proposal.md §3.2](sst_simx_v3_proposal.md)) and this
+proposal. The signature and shape are identical to what SST needs:
+
+```cpp
+// processor.h — public additions
+bool cycle();        // advance one cycle; returns false when nothing is running
+Memory* memsim();    // for optional gem5/SST memory-mirroring hooks
+```
+
+```cpp
+// processor.cpp — implementation
+bool ProcessorImpl::cycle() {
+  if (!is_cycle_initialized_) {
+    SimPlatform::instance().reset();
+    this->reset();
+    kmu_->start();                  // dispatch CTAs into the cluster
+    is_cycle_initialized_ = true;
+  }
+  SimPlatform::instance().tick();
+  return this->any_running();
+}
+
+Memory* ProcessorImpl::memsim() { return memsim_.get(); }
+```
+
+The two pieces (the `SimPlatform::reset()` → `start_kmu()` →
+`SimPlatform::tick()` sequence, and `any_running()`) were already
+factored out on `Processor` by the Round 6 DTM work. `cycle()` just
+packages them into a single-cycle step.
+
+**Reuse from DTM work:** `start_kmu()` and `any_running()` are already
+public on `Processor`. We add `cycle()` and `memsim()` and that is the
+entire SimX-side API surface required by both SST and gem5.
+
+### 3.3 The MMIO command protocol
+
+Identical to `sw/runtime/opae` v3 (the OPAE driver), reusing
+`hw/syn/altera/opae/vortex_afu.h`:
+
+| Offset | Name | Direction | Purpose |
+|---|---|---|---|
+| `MMIO_CMD_TYPE` | `CMD_*` | W64 | Dispatch one of: `MEM_READ`, `MEM_WRITE`, `RUN`, `DCR_WRITE`, `DCR_READ` |
+| `MMIO_CMD_ARG0..2` | command-specific | W64 | DCR addr / device addr / size / value |
+| `MMIO_STATUS` | bit0=busy | R64 | Polled by runtime's `ready_wait` |
+| `MMIO_DCR_RSP` | response | R64 | Result of `CMD_DCR_READ` (used for cache-flush) |
+| `MMIO_DEV_CAPS` / `MMIO_ISA_CAPS` | caps bitfield | R64 | Encoded device capabilities |
+
+The runtime issues commands by writing args first, then `CMD_TYPE`
+(B14 fix: fence before the type write — `mfence` on x86, `dmb sy` on
+ARM). The device latches on `CMD_TYPE`; the PIO write returns once the
+operation is enqueued (fast commands like `DCR_WRITE` complete before
+it returns), and the device clears the status busy bit when done.
+
+`CMD_MEM_{READ,WRITE}` use the staging-buffer protocol from the
+capstone paper Fig. 5 (§3.4 below).
+
+### 3.4 The staging-buffer protocol
+
+The gem5 device exposes a PIO-addressable register `MMIO_PINNED_BASE`
+that returns the base address of a pinned region inside gem5's host
+address space. The runtime, on `vx_mem_alloc`, lazily picks a slice of
+that region as a staging buffer.
+
+For a `vx_copy_to_dev(host_ptr, dev_addr, size)`:
+1. Runtime `memcpy(staging_buf, host_ptr, size)`.
+2. Runtime writes `staging_buf_addr`, `dev_addr`, `size` to
+   `MMIO_CMD_ARG{0,1,2}`.
+3. Runtime writes `CMD_MEM_WRITE` to `MMIO_CMD_TYPE`.
+4. Device's PIO handler enqueues a `gem5::DmaPort::dmaAction()` read
+   from `staging_buf_addr` into a local scratch.
+5. On DMA completion, the device copies the scratch bytes into Vortex's
+   `RAM` at `dev_addr` (via `RAM::write`).
+6. Device clears the status busy bit.
+7. Runtime polls `MMIO_STATUS` until busy=0.
+
+`vx_copy_from_dev` is the reverse, with **cache flush first** (B9):
+the runtime issues `CMD_DCR_READ(VX_DCR_BASE_CACHE_FLUSH, cid)` for
+every core before the `CMD_MEM_READ`. The device's DCR-read handler
+plumbs through to `Processor::dcr_read`, which already invokes
+`flush_caches()` for the cache-flush DCR
+([processor.cpp:251–258](../../sim/simx/processor.cpp#L251)).
+
+This is the same protocol the v3 OPAE runtime already uses, so the
+runtime under `sw/runtime/gem5/` differs from `sw/runtime/opae/` only
+in:
+- The `driver.{cpp,h}` backend (gem5 mmaps a `/dev/vortex_gem5`
+  character device path **OR**, in SE-mode, gem5 sets up the device's
+  PIO/DMA windows directly in the simulated process's address space —
+  see §3.6).
+- The lack of an `fpgaPrepareBuffer` API (the device exposes the
+  pinned region itself; no per-call buffer allocation by an OPAE
+  layer).
+
+### 3.5 Build-time gating
+
+`USE_GEM5=1` make variable controls compilation of:
+- `sim/simx/gem5/vortex_gpgpu.{cpp,h}` (the C ABI wrapper).
+- Link target `libvortex-gem5.so` produced alongside `libsimx.so`
+  (mirrors the SST `libvortex.so` pattern in `sim/simx/Makefile`).
+
+`USE_GEM5=1` does **not** affect the default build:
+`make -C sim/simx` (no flag) still produces a stand-alone `simx`
+binary with no gem5 dep. Per §1.4.
+
+The host-side runtime supports both x86 (native) and ARM (cross-
+compiled) targets via a `HOST_ARCH` switch:
+```
+make -C sw/runtime/gem5                                     # x86 default
+make -C sw/runtime/gem5 HOST_ARCH=x86_64                    # explicit x86
+make -C sw/runtime/gem5 HOST_ARCH=aarch64                   # AArch64 cross
+make -C sw/runtime/gem5 HOST_ARCH=armhf                     # ARMv7 cross
+```
+producing `libvortex-gem5-{x86_64,aarch64,armhf}.so`. Test scripts
+select the matching `(gem5.opt, libvortex-gem5-*.so)` pair via the
+`HOST_ARCH` make variable. Native x86 needs no toolchain install; ARM
+requires `gcc/g++-aarch64-linux-gnu` (or `-arm-linux-gnueabihf` for
+ARMv7), which `ci/gem5_install.sh` installs as part of Phase 0.
+
+### 3.6 gem5 SE-mode wiring + ISA selection
+
+**Host ISA: both x86 and ARM, equally first-class** (decision recorded
+2026-05-16 after Phase 0 prototyping). Phase 0's `ci/gem5_install.sh`
+builds `build/X86/gem5.opt` *and* `build/ARM/gem5.opt`; phases 4–6
+test both. Rationale:
+
+- **x86** is the path of least resistance for users — no
+  cross-toolchain, native `g++` builds `sw/runtime/gem5/`, faster
+  gem5 CPU model, and PCIe is canonical on x86 (relevant to the
+  Phase 5+ upgrade path below).
+- **ARM** is the research-narrative path matching the capstone paper
+  (Injae Shin 2025) and actually-deployed ARM+accelerator HPC
+  platforms (Grace Hopper, Fugaku, Graviton, Apple Silicon). Kept
+  as a first-class matrix variant; not a stretch goal.
+
+Three MMIO/DMA paths exist; this proposal picks one for the initial
+work and notes the others as future upgrades:
+
+| Path | Description | Status in this proposal |
+|---|---|---|
+| **1. SE-mode + custom PIO+DMA wiring** | The device is a `DmaDevice` subclass attached to `system.membus` at a configurable `pio_addr` (default `0x20000000`, matching the legacy paper). Host binary touches the address via `mmap`/inline asm. Works in both x86 SE-mode and ARM SE-mode. | **Phase 2–6: this is the design.** Matches legacy paper, lightweight, fast iteration. |
+| **2. FS-mode + PCIe device** | Subclass `PciDevice` (which already inherits `DmaDevice`); BARs expose MMIO, DMA for staging. Full Linux boot inside gem5 with a tiny PCI kernel module to bind the device. | **Phase 5+ upgrade.** Realistic accelerator-modeling story expected by x86 users. The C ABI committed in Phase 2 is shape-compatible — `PciDevice` and the custom `DmaDevice` both use the same `vortex_gem5_dma_*` callbacks; only the gem5-side wrapper class differs. |
+| **3. `/dev/vortex_gem5` pseudo-file** | The gem5 device implements `SyscallReturn open(...)` + `mmap` for a synthetic device path. Runtime `open("/dev/vortex_gem5", O_RDWR)` + `mmap`. | Out of scope. Closest to how real OPAE drivers work but requires a custom syscall handler in gem5; cost outweighs the benefit when Path 1 already works. |
+
+**Doorbell queues** are a Phase 7+ realism upgrade orthogonal to the
+transport choice above. AMD GPU (gem5 `src/dev/amdgpu/`, derived
+from `PciEndpoint`) and NVIDIA-style modern accelerators use a ring
+buffer in host DRAM plus a single MMIO "doorbell" write per dispatch:
+the host appends commands to the ring, then writes the new tail
+offset to the doorbell register; the device asynchronously walks the
+ring and processes commands. The Phase 2-6 design instead uses
+**status polling** — the host writes args + `CMD_TYPE`, then polls
+`MMIO_STATUS` until done — which matches the legacy OPAE FPGA driver.
+Polling is fine for the capstone-paper scope (small kernels, one at
+a time) but burns simulated cycles on the spin. If later research
+wants batched-dispatch realism comparable to AMD GPU, the upgrade
+swaps the OPAE MMIO command set for a ring + doorbell protocol; the
+C ABI in Phase 2 stays compatible (a new `vortex_gem5_doorbell_ring(handle, tail)`
+entry point alongside the existing `vortex_gem5_mmio_*`).
+
+### 3.7 gem5 version pinning
+
+`ci/gem5_install.sh.in` pins gem5 to v25.0.0.1 (a patch release on
+v25.0.0, the latest stable as of 2026-05). The pinned tag goes in `VERSION` alongside
+`TOOLCHAIN_REV` and `SST_VER` — bumps require a CI re-run on the
+self-hosted runner first (small risk of API drift on gem5's
+`DmaDevice`/`PioDevice` between major releases). **Picking and
+validating this pin is the first deliverable of Phase 0** — every
+other phase is a no-op if Phase 0 reveals that v25.0.0 no longer
+supports SE-mode PIO mapping or the SimObject install path we depend
+on.
+
+### 3.8 Why this is not just a copy of the SST pattern
+
+SST and gem5 are similar in shape (external simulator drives the
+Vortex clock through a C++ wrapper around `Processor::cycle()`) but
+differ in three load-bearing ways:
+
+1. **The host process is simulated under gem5.** Under SST the host
+   "process" is the SST Python script itself, running natively on the
+   developer's machine. Under gem5 the host is a userspace process
+   (x86 or ARM, per §3.6) running inside the gem5 model. So the gem5
+   integration also needs a host-side runtime under `sw/runtime/gem5/`
+   (native compile for x86, cross-compile for ARM); SST does not.
+   (This is the bulk of the work that makes gem5 the bigger project —
+   see §9 effort estimate.)
+2. **Memory is in two address spaces.** Under SST, the SimX `Processor`
+   and any optional SST memHierarchy share the same simulator. Under
+   gem5, the host CPU's DRAM is a gem5 `AddrRange`, the Vortex VRAM is
+   a `RAM` inside the device, and the only way bytes cross between
+   them is via DMA through the device. The staging-buffer protocol
+   (§3.4) implements this; SST has no equivalent.
+3. **PIO bus integration.** SST's `StandardMem` interface is the
+   only one we plug into; gem5 has separate `PioPort` and `DmaPort`
+   with different timing models. The wrapper must manage both.
+
+---
+
+## 4. Phasing
+
+Each phase is independently shippable and validated. The work follows
+the same shape as the SST integration in
+[sst_simx_v3_proposal.md §4](sst_simx_v3_proposal.md): **environment
+first**, API + library second, gem5-side wiring third, host runtime
+(x86 + ARM) fourth, CI last.
+
+### Phase 0 — gem5 environment + API survey *(derisking; nothing else can start until this is done)*
+
+The legacy `vortex_gem5` was built against a forked gem5 that no
+longer exists publicly. Before we design the C ABI in Phase 2 or
+write a single line of `DmaDevice` glue in Phase 3, we need a
+known-good gem5 build on the bench so the API surface we are about
+to commit to is **real**, not assumed-from-headers-we-haven't-read.
+This is the "solve gem5 setup first" phase.
+
+Concretely:
+
+- **Pick and pin the gem5 version.** Default target: v25.0.0.1
+  (patch release on top of v25.0.0, most recent stable as of 2026-05).
+  Pin the tag in `VERSION` alongside `TOOLCHAIN_REV` and `SST_VER`:
+  ```
+  GEM5_REV=v25.0.0.1
+  ```
+- **Write `ci/gem5_install.sh.in`** (no Vortex integration yet — just
+  the install). Mirrors the structure of `ci/sst_install.sh.in`:
+  - `apt install scons python3-dev python3-pip libprotobuf-dev
+    protobuf-compiler libprotoc-dev libgoogle-perftools-dev m4
+    libboost-all-dev gcc-aarch64-linux-gnu g++-aarch64-linux-gnu`
+    (gem5's documented build deps + ARM cross-toolchain for the ARM
+    matrix variant).
+  - Fetch gem5 working tree at `$GEM5_REV` into `$TOOLDIR/gem5`.
+  - `scons build/X86/gem5.opt -j$(nproc)` and
+    `scons build/ARM/gem5.opt -j$(nproc)` — **both ISAs by default**
+    per the dual-ISA decision in §3.6. Targets selectable via
+    `GEM5_TARGETS="X86"` / `"ARM"` / `"X86 ARM"`.
+  - Export `GEM5_HOME=$TOOLDIR/gem5` to `~/.bashrc`.
+- **Validate the X86 native compiler produces SE-mode binaries.**
+  Trivial — `gcc -static -o /tmp/hello-x86 sim/simx/gem5/hello.c`,
+  then run it under `build/X86/gem5.opt` with a config modeled on
+  `configs/example/gem5_library/arm-hello.py` (substituting
+  `ISA.X86`). Confirm exit code 0 and the expected stdout.
+- **Validate the ARM cross-toolchain produces SE-mode binaries.**
+  Cross-compile `hello.c` with `aarch64-linux-gnu-gcc -static -o
+  /tmp/hello-arm`, run under
+  `build/ARM/gem5.opt configs/example/gem5_library/arm-hello.py`
+  (or the deprecated SE script). Confirms the cross-toolchain
+  produces something gem5 ARM-mode can load.
+- **Read the gem5 source for the API surface we are about to use**
+  and record findings in a short scratch file
+  `sim/simx/gem5/gem5_api_notes.md` (not committed to docs/, just a
+  Phase 0 deliverable):
+  - `src/dev/io_device.hh` — `PioDevice::read`/`write` signatures
+    in v25.0.0. Compare to what the legacy paper assumed.
+  - `src/dev/dma_device.hh` — `DmaDevice::dmaAction`, `DmaPort`
+    timing model. Confirm 64-bit address support, async completion
+    callback shape.
+  - `src/python/m5/objects/Device.py` — SimObject Python bindings.
+    Confirm that out-of-tree `src/dev/<our-dir>/SConscript` is
+    picked up by `scons build/ARM/gem5.opt` (this is the install
+    mechanism we rely on in Phase 3).
+  - `configs/example/se.py` — how SE-mode wires a CPU to a
+    `Workload`. Confirm that we can attach a `PioDevice` and have
+    the SE-mode loader map its PIO range into the workload's address
+    space (the legacy paper's `0x20000000` magic). If this is no
+    longer supported, the design changes — better to know now than
+    in Phase 3.
+- **Smoke-build a trivial out-of-tree SimObject** to prove the
+  install mechanism end-to-end. Four files
+  (`Dummy.{cc,hh,py}` + `SConscript`) under `sim/simx/gem5/dummy/`,
+  installed by `sim/simx/gem5/install.sh` (Phase 0 only ships the
+  installer; the real SimObject lands in Phase 3). After
+  `ci/gem5_install.sh` re-runs, `gem5.opt --list-sim-objects` shows
+  `Dummy`. Delete `dummy/` once verified — it was scaffolding.
+
+**Validation:**
+- `ci/gem5_install.sh` finishes successfully on the self-hosted
+  runner. Wall time recorded in `gem5_api_notes.md` (drives CI
+  caching strategy in Phase 6).
+- `$GEM5_HOME/build/ARM/gem5.opt configs/example/se.py
+  --cmd ./hello-arm` exits 0.
+- `gem5.opt --list-sim-objects` lists the dummy SimObject installed
+  via `sim/simx/gem5/install.sh`.
+- `gem5_api_notes.md` documents the `DmaDevice` / `PioDevice` /
+  `EventFunctionWrapper` signatures we will commit to in Phase 2's
+  C ABI design.
+
+**Why this is its own phase:** if any of those validations fails
+(e.g. gem5 v25 has dropped SE-mode PIO mapping, or the SimObject
+install mechanism has changed), the rest of the proposal needs
+redesign before code lands. Phase 0 is a ~1-day gate, not a tracked
+deliverable; everything downstream depends on its outputs.
+
+### Phase 1 — `Processor::cycle()` + `Memory*` accessor
+
+Prerequisite shared with SST. Can run in parallel with Phase 0
+(no gem5 dependency) and lands first into the SimX-side codebase.
+
+- Add `Processor::cycle()` and `Memory* Processor::memsim()` as in
+  §3.2. This is a ~50-line patch to `processor.{cpp,h}` and
+  `processor_impl.h` plus an `is_cycle_initialized_` bool.
+- Add `Memory::set_pre_send_hook()` (already in v3 per
+  `sim/simx/mem/memory.h:42` — verify still there; if so, this part
+  of Phase 1 is a no-op).
+- Update SST's `vortex_simulator.cpp` to use the new public
+  `Processor::cycle()` API (currently calls `proc_->cycle()` which
+  does not compile against `processor.h` HEAD — see
+  `sim/simx/sst/vortex_simulator.cpp:64`). **This is a pre-existing
+  bug that Phase 1 fixes for both integrations.**
+
+**Validation:** `make -C sim/simx` (default), `make -C sim/simx
+USE_SST=1`, and `make -C sim/simx USE_GEM5=1` all build. SST tests
+that previously failed to link now link and run (`sst
+ci/sst_test_vortex_hello.py` passes).
+
+### Phase 2 — `libvortex-gem5.so` + C ABI
+
+**Prerequisite: Phase 0 complete.** The C ABI is designed *against*
+the `DmaDevice`/`PioDevice` shapes recorded in
+`gem5_api_notes.md`, not from headers we haven't read.
+
+- Create `sim/simx/gem5/vortex_gpgpu.{cpp,h}` mirroring
+  `sim/simx/sst/vortex_simulator.{cpp,h}` shape:
+  - Owns a `Processor`, a `RAM` (device VRAM at `MEM_PAGE_SIZE`).
+  - Exposes a C ABI (`vortex_gem5_*`) sufficient for the gem5 device
+    to MMIO/DMA/tick it. ABI signatures match what gem5's
+    `DmaDevice::dmaAction` and `PioDevice::read`/`write` need to
+    call into (per Phase 0 survey).
+- Add `USE_GEM5=1` build target to `sim/simx/Makefile` producing
+  `libvortex-gem5.so` (no SST symbols; no `sst-core` link). Pattern:
+  duplicate the `ifeq ($(USE_SST),1)` block.
+- Add a tiny in-process smoke driver
+  `sim/simx/gem5/gem5_smoke_main.cpp` (built with the lib) that:
+  1. Loads a `.vxbin` via the C ABI.
+  2. Ticks until `cycle()` returns false.
+  3. Reads the MPM exit code via DCR_READ.
+
+  This is the "library compiles and a kernel runs through it without
+  gem5 installed" smoke test (§6.2).
+
+**Validation:**
+- `make -C sim/simx USE_GEM5=1` builds.
+- `LD_LIBRARY_PATH=. ./gem5_smoke hello.vxbin` returns 0.
+- `make -C sim/simx` (no flag) still builds and `./simx hello.vxbin`
+  returns 0 (no regression on default).
+
+### Phase 3 — gem5 SimObject + Python config
+
+**Prerequisite: Phases 0 + 2 complete.** The install mechanism is
+already proven by Phase 0's dummy SimObject; this phase replaces
+the dummy with the real device.
+
+- `sim/simx/gem5/vortex_gpgpu_dev.{cc,hh}` — the gem5 `DmaDevice`
+  subclass. PIO `read`/`write` decode MMIO offsets and call
+  `vortex_gem5_mmio_*`. DMA actions triggered by `CMD_MEM_*`. A
+  registered `EventFunctionWrapper` re-schedules itself every
+  `clock_period_ticks()` and calls `vortex_gem5_tick()`.
+- `sim/simx/gem5/VortexGPGPU.py` — Python SimObject definition.
+- `sim/simx/gem5/SConscript` — for gem5's scons build.
+- `sim/simx/gem5/install.sh` — copies the four files above into
+  `<gem5>/src/dev/vortex/`. (Phase 0 already wrote this for the
+  dummy SimObject; just extend it.)
+- Update `ci/gem5_install.sh.in` to re-run `install.sh` and rebuild
+  `gem5.opt` for every configured ISA (`build/X86` and `build/ARM`)
+  after the Vortex SimObject lands.
+
+**Validation:** `ci/gem5_install.sh` succeeds with the real
+SimObject installed. `gem5.opt --list-sim-objects` shows
+`VortexGPGPU`. `gem5.opt configs/example/se.py --help` accepts
+`VortexGPGPU` parameters.
+
+### Phase 4 — Host runtime (`sw/runtime/gem5/`, x86 + ARM)
+
+- New backend mirroring `sw/runtime/opae/` shape:
+  - `vortex.cpp` — implements the `vx_*` callbacks against the OPAE
+    MMIO protocol (§3.3), but the `driver.{cpp,h}` underneath does
+    raw `mmap`/MMIO writes to the PIO address rather than calling
+    `libopae`.
+  - `Makefile` — selects compiler from `HOST_ARCH`:
+    - `x86_64` (default): native `g++`
+    - `aarch64`: `aarch64-linux-gnu-g++`
+    - `armhf`: `arm-linux-gnueabihf-g++`
+- Cache-flush integration (B9): the v3 `download` path issues
+  `CMD_DCR_READ(VX_DCR_BASE_CACHE_FLUSH, cid)` per core before
+  `CMD_MEM_READ`.
+- MMIO ordering fence (B14): emit the right barrier for `HOST_ARCH`:
+  - `x86_64`: `__asm__ volatile ("mfence" ::: "memory")`
+  - `aarch64`: `__asm__ volatile ("dmb sy" ::: "memory")`
+  - `armhf`: `__asm__ volatile ("dmb sy" ::: "memory")`
+  Provide a `vortex_gem5_mmio_fence()` inline helper that compiles
+  to the right barrier per `HOST_ARCH`.
+- Multi-target build (B13 obsolete; replaced by clean multi-target
+  support): `HOST_ARCH` make variable.
+
+**Validation:**
+- `make -C sw/runtime/gem5` (default `HOST_ARCH=x86_64`) builds.
+  `file build/sw/runtime/libvortex-gem5-x86_64.so` confirms x86-64
+  ELF.
+- `make -C sw/runtime/gem5 HOST_ARCH=aarch64` builds (requires
+  cross-toolchain, installed by Phase 0's `ci/gem5_install.sh`).
+  `file build/sw/runtime/libvortex-gem5-aarch64.so` confirms
+  AArch64 ELF.
+
+### Phase 5 — End-to-end gem5 test
+
+- `ci/gem5_test_vortex_hello.py` — gem5 Python config that wires:
+  - A `System` with one `TimingSimpleCPU` core in SE mode (host ISA
+    selected at runtime via `--host-arch=x86|arm`).
+  - A `VortexGPGPU` device on `system.membus` at
+    `pio_addr=0x20000000`, mapped into the process's address space.
+  - The native-or-cross-compiled test binary
+    (`tests/kernel/hello/hello` re-linked against the matching
+    `libvortex-gem5-{x86_64,aarch64}.so`) as the SE-mode workload.
+- `ci/gem5_test_vortex_vecadd.py` — same with a vecadd kernel that
+  actually exercises DMA in both directions and the cache-flush path.
+- Add a top-level wrapper test in `tests/regression/gem5/` (mirrors
+  `tests/regression/dxa/`) that builds the kernels and invokes the
+  Python scripts for both `HOST_ARCH=x86_64` and `HOST_ARCH=aarch64`.
+
+**Validation:**
+- `build/X86/gem5.opt ci/gem5_test_vortex_hello.py --host-arch=x86`
+  exits with code 0 and the expected `Hello World` on stdout.
+- `build/ARM/gem5.opt ci/gem5_test_vortex_hello.py --host-arch=arm`
+  exits with code 0 and the expected `Hello World` on stdout.
+- Both `ci/gem5_test_vortex_vecadd.py` variants exit 0 with the
+  vecadd result buffer matching the CPU-computed reference (checked
+  by the test binary itself).
+
+### Phase 6 — CI integration
+
+- Add `gem5()` function to `ci/regression.sh.in` (mirroring `sst()`
+  on line ~80):
+  ```bash
+  gem5()
+  {
+      echo "begin gem5 tests..."
+
+      make -C sim/simx USE_GEM5=1
+      make -C tests/kernel
+
+      # X86 default: native compile, no cross-toolchain needed.
+      make -C sw/runtime/gem5 HOST_ARCH=x86_64
+      cp sim/simx/libvortex-gem5.so $GEM5_HOME/build/X86/
+
+      timeout 120 $GEM5_HOME/build/X86/gem5.opt \
+          ci/gem5_test_vortex_hello.py  --host-arch=x86
+      timeout 120 $GEM5_HOME/build/X86/gem5.opt \
+          ci/gem5_test_vortex_vecadd.py --host-arch=x86
+
+      # ARM matrix entry — requires gcc-aarch64-linux-gnu (installed
+      # by ci/gem5_install.sh in Phase 0).
+      if [ -n "$VORTEX_GEM5_ARM" ]; then
+          make -C sw/runtime/gem5 HOST_ARCH=aarch64
+          cp sim/simx/libvortex-gem5.so $GEM5_HOME/build/ARM/
+
+          timeout 120 $GEM5_HOME/build/ARM/gem5.opt \
+              ci/gem5_test_vortex_hello.py  --host-arch=arm
+          timeout 120 $GEM5_HOME/build/ARM/gem5.opt \
+              ci/gem5_test_vortex_vecadd.py --host-arch=arm
+      fi
+
+      echo "gem5 tests done!"
+  }
+  ```
+  Per `feedback_test_timeout_120s.md`, every test invocation is
+  `timeout 120`-capped. ARM is opt-in via `VORTEX_GEM5_ARM=1` so
+  hosted CI without the ARM toolchain still passes; the self-hosted
+  runner sets the env var.
+- Add `gem5-x86` and `gem5-arm` matrix entries to
+  `.github/workflows/ci.yml` (both run on the self-hosted runner
+  only, per
+  [`project_ci_machine.md`](../../../../.claude/projects/-home-blaisetine-dev/memory/project_ci_machine.md);
+  the hosted runners do not have enough resources for a full
+  gem5 build).
+- Add `ci/gem5_install.sh` to the Apptainer recipe
+  ([`miscs/apptainer/vortex.def`](../../miscs/apptainer/vortex.def))
+  so the .sif has gem5 pre-installed. **Out of scope for Phase 6;
+  see §8.**
+
+**Validation:** `./ci/regression.sh --gem5` runs both
+`gem5_test_vortex_*.py` cleanly on the self-hosted runner.
+
+### Phase 7 — Documentation
+
+- `docs/gem5_integration.md`:
+  - How to install the pinned gem5 release, v25.0.0.1 (point at
+    `ci/gem5_install.sh`).
+  - How to build with `USE_GEM5=1`.
+  - How to cross-compile the ARM runtime + kernels.
+  - How to write a gem5 Python script that drives `VortexGPGPU`.
+  - The single-source-of-truth invariant (§1.1) and the cache-flush
+    contract (§3.4) for future hackers who might be tempted to skip
+    the flush "because it's fast".
+
+---
+
+## 5. Authorship / history mechanics
+
+- `sim/simx/gem5/vortex_gpgpu.{cpp,h}` and the gem5-side
+  `vortex_gpgpu_dev.{cc,hh}` + `VortexGPGPU.py`: **new files**, no
+  upstream equivalent. Commit body cites:
+  > Replaces legacy `vortex_gem5/sim/simx/simx_device.{cpp,h}`
+  > (Injae Shin, UCLA 2025-05-22 commit 91dcf17) and the gem5-side
+  > SimObject described in his capstone report.
+  > Re-implemented for SimX v3 Processor::cycle() API. Original
+  > design intent (OPAE MMIO + pinned staging buffer + ARM SE-mode
+  > runtime) preserved.
+
+- `sw/runtime/gem5/`: **new files** mirroring `sw/runtime/opae/`'s
+  shape. Same authorship attribution as above; the file-level
+  similarity is to `sw/runtime/opae`, not to `runtime/opaesimx` from
+  the legacy tree (which has the bugs catalogued in §2.2).
+
+- `ci/gem5_install.sh.in` and `ci/gem5_test_vortex_*.py`: new files;
+  follow the structure of `ci/sst_install.sh.in` and
+  `ci/sst_test_vortex_*.py`. `ci/gem5_install.sh.in` lands in
+  Phase 0 (initially installing the dummy SimObject); the test
+  scripts land in Phase 5.
+
+- `Processor::cycle()` / `Processor::memsim()`: new public API on
+  `Processor`, lands in Phase 1. Single commit on the simx_v3 line;
+  mentioned as a prerequisite of both SST and gem5 integrations in
+  the commit body.
+
+- `sim/simx/gem5/gem5_api_notes.md`: Phase 0 deliverable, scratch
+  notes only — **not** committed to `docs/`. Captures the gem5
+  v25.0.0 API surface our C ABI design depends on; deleted once
+  Phase 2 commits the C ABI itself.
+
+This is consistent with the rule established in
+[`feedback_keep_ours_in_merge.md`](../../../../.claude/projects/-home-blaisetine-dev/memory/feedback_keep_ours_in_merge.md):
+the legacy code is not a "theirs" we apply; it is a prior design that
+informs our redesign. Credit the designer in the body; do not pretend
+the bits are a port.
+
+---
+
+## 6. Validation
+
+Each phase ends with the validation listed in §4. Across phases the
+acceptance criteria are:
+
+1. **No-gem5 build identical.** `make -C sim/simx` (default flags)
+   produces a binary identical in behavior to today's on the
+   regression suite (io_addr, arith, vecadd, mpi_vecadd, tensor*,
+   dxa, dtm). The Phase 1 `Processor::cycle()` addition must not
+   change `Processor::run()` semantics — verify by trace-diffing
+   `vecadd` before and after Phase 1.
+
+2. **In-process smoke (no gem5 needed).** `gem5_smoke hello.vxbin`,
+   the Phase 2 driver, runs the same kernels the `simx` binary runs
+   and produces matching output. This is the unit-test layer that
+   shakes out C-ABI breakage without requiring gem5 to be installed
+   beyond what Phase 0 already set up.
+
+3. **End-to-end gem5 PASS.** Both `gem5_test_vortex_hello.py` and
+   `gem5_test_vortex_vecadd.py` exit 0 under the pinned gem5 v25.0.0.1,
+   on *both* `build/X86/gem5.opt` and `build/ARM/gem5.opt`, with each
+   run capped by a 120 s timeout. The pin and the install path are
+   both already validated by Phase 0; this validation just exercises
+   the real `VortexGPGPU` SimObject end-to-end.
+
+4. **No `core->mem_read` / `core->mem_write` regressions.** Phase 5
+   of v3 forbids those
+   ([simx_v3_proposal.md §3.3](simx_v3_proposal.md)). The grep gate
+   from
+   [master_merge_v3_proposal.md §8 R1](master_merge_v3_proposal.md)
+   applies here: every commit must pass
+   `git diff <pre>..<post> -- sim/simx/ | grep -E 'core->mem_(read|write)' | wc -l == 0`.
+
+5. **Single source of truth check.** The gem5 device's pinned region
+   is `RAM`-backed (i.e., a slice of host memory exposed to gem5's
+   DRAM AddrRange via `mmap`); Vortex's VRAM is the `RAM` attached to
+   `Memory` inside `vortex::Processor`. **There is no shadow image.**
+   `vortex_gem5_dma_{read,write}` copies bytes between the two via
+   `RAM::read`/`RAM::write` — no additional buffer level. Mistakes
+   here re-introduce the §1.1 violation.
+
+---
+
+## 7. Risks
+
+| # | Risk | Mitigation |
+|---|---|---|
+| R1 | gem5 v25.0.0 `DmaDevice` API drifts in v26+. | Pin in `ci/gem5_install.sh.in` (Phase 0). Document the pin in `docs/gem5_integration.md`. CI catches regressions on bump. |
+| R2 | ARM cross-compiler not available in the Apptainer recipe. | Phase 6 says gem5 CI is on the self-hosted runner only, which already has the ARM toolchain per [`project_ci_machine.md`](../../../../.claude/projects/-home-blaisetine-dev/memory/project_ci_machine.md). Apptainer absorption is out of scope (§8). |
+| R3 | `MMIO_PINNED_BASE` PIO range collides with another gem5 device's PIO range. | Pick a default (`0x20000000`, matching the legacy paper) but make it a Python-configurable parameter (`pio_addr`). Phase 0 confirms the default is reachable from SE-mode in v25.0.0; document collisions in the integration guide. |
+| R4 | The gem5 ARM CPU model reorders MMIO writes, breaking the args-then-CMD_TYPE protocol (B14). | `DMB SY` (AArch64) or `dmb sy` (ARMv7) before `CMD_TYPE` write in the runtime. Add a regression test that issues a back-to-back `CMD_MEM_WRITE` + `CMD_RUN` and verifies the kernel observed the correct args. |
+| R5 | Future contributor re-introduces the host-pointer-MMIO hack (B3) "for convenience". | This proposal explicitly deletes that abstraction (§2.4). The follow-up `docs/gem5_integration.md` (Phase 7) should call this out. |
+| R6 | `Processor::cycle()` for a never-launched kernel hangs (no `kmu_->start()` because `is_cycle_initialized_` was never reset). | Reset is implicit on first `cycle()`. If a second kernel is launched in the same device lifetime (rare; supported by gem5 only for back-to-back tests), the gem5 device's `CMD_RUN` handler must call a new `Processor::reset_cycle()` that clears `is_cycle_initialized_`. Add this in Phase 2. |
+| R7 | The cross-compiled ARM `libvortex-gem5.so` and the gem5-loaded `libvortex-gem5.so` (x86) have the same SONAME and get confused at install time. | Suffix the ARM build (`libvortex-gem5-aarch64.so`) and the gem5 build (`libvortex-gem5.so`). Document in Phase 2+4. |
+| R8 | gem5's `DmaPort` request size is unbounded; a 1 GB `CMD_MEM_WRITE` would burn simulated time. | Cap per-transaction size at 1 MB in the device's `CMD_MEM_*` handler; chunk larger requests into multiple DMA actions. Mirrors how the OPAE `fpgaPrepareBuffer` page-aligns transfers. |
+| R9 | Cache flush via `CMD_DCR_READ` returns synchronously per core; for `NUM_CORES * NUM_CLUSTERS = 16` that is 16 PIO round-trips per download. | Acceptable for Phase 5; can be batched into a single `CMD_FLUSH_ALL` MMIO later if measured to hurt. |
+| R10 | The gem5 SimObject install (`sim/simx/gem5/install.sh`) modifies the gem5 source tree in place; rebuilds can leave stale artifacts. | `install.sh` is idempotent (copies, doesn't patch); `ci/gem5_install.sh` does a clean `scons -c` before re-build on toolchain version mismatch. Phase 0 proves the install path end-to-end with a dummy SimObject before any real code depends on it. |
+| R11 | Phase 0 reveals gem5 v25.0.0 has dropped SE-mode PIO mapping (the legacy `0x20000000` magic). | Switch design to the `/dev/vortex_gem5` pseudo-file path (§3.6 option 2) before Phase 2 commits the C ABI. Cost: ~1 week added to Phase 0 redesign window. Acceptable because Phase 0 is explicitly a gate — no downstream phase has shipped code yet. |
+| R12 | Phase 0 install takes hours on first run; blocks parallel work. | Cache the `$TOOLDIR/gem5/build` directory in CI the same way SST and toolchain caches work. Self-hosted runner's local toolchain dir survives across runs. |
+
+---
+
+## 8. Out of scope
+
+- **Apptainer integration.** Adding gem5 + the ARM cross-toolchain
+  to `miscs/apptainer/vortex.def` is a separate concern. Until that
+  is done, `apptainer-ci.yml`'s matrix should not include `gem5`. The
+  gem5 matrix entry runs only from `ci.yml`, on the self-hosted
+  runner; the Apptainer pipeline skips it. See
+  [`apptainer-ci.yml` policy notes](../../.github/workflows/apptainer-ci.yml).
+
+- **Full-system Linux on gem5.** The capstone paper restricts itself
+  to SE-mode (per the paper's §IIC: "gem5-Vortex's implementation
+  allows users to use gem5's system call emulation (SE) mode"). This
+  proposal does the same. FS-mode requires booting a Linux kernel
+  inside gem5 with a Vortex device driver — possible, but a separate
+  redesign that intersects with kernel-mode driver work the project
+  has not started.
+
+- **Multi-device simulation.** One `VortexGPGPU` per gem5 system.
+  Multi-device support requires per-instance PIO ranges and a runtime
+  side that supports `vx_dev_open` returning >1 handle — the legacy
+  `g_callbacks` global (B11) blocks this on the runtime side, and
+  the device side needs per-instance state isolation. Defer.
+
+- **AMD GPU / NoMali comparison.** The capstone paper compares
+  gem5-Vortex to NoMali (stub GPU) and AMD GPU (full-system). Those
+  comparisons live in the paper; reproducing them as benchmarks is
+  out of scope. Comparing performance to SimX standalone or to the
+  SST integration is also out of scope — separate analysis work.
+
+- **DMA performance modeling.** The capstone paper §V measures DMA
+  delay variation per kernel size. Replicating that as a CI
+  performance gate is out of scope; could be a follow-up perf
+  proposal once the integration is stable.
+
+- **SST + gem5 simultaneous.** Both integrations replace different
+  parts of the harness; running them together is not a use case
+  anyone has asked for. Build flags are mutually exclusive:
+  setting both `USE_SST=1` and `USE_GEM5=1` is rejected by
+  `sim/simx/Makefile`.
+
+- **gem5 fork branch.** We do not maintain a long-lived fork of gem5.
+  `ci/gem5_install.sh` fetches a clean release tarball and applies
+  our SimObject; if the user wants a persistent gem5 working tree,
+  that is their setup. Avoids the "fork rot" that froze
+  `vortex_gem5`.
+
+- **Runtime gem5/non-gem5 switching.** Keep `USE_GEM5=1` as a
+  build-time switch. A runtime switch would require both `Processor`
+  and a gem5 wrapper in every binary plus a factory; not worth the
+  maintenance cost for a single-device research integration.
+
+---
+
+## 9. Estimated effort
+
+Based on the SST integration in
+[sst_simx_v3_proposal.md §9](sst_simx_v3_proposal.md) (~15–28 h):
+
+- **Phase 0** (gem5 env + API survey + dummy SimObject install):
+  **6–10 h estimated; ✅ COMPLETE 2026-05-16** in ~3 h of
+  attended work + ~25 min of unattended scons build. The wall time to
+  install gem5 was 13 min (ARM) + 11 min (X86) parallel on the
+  self-hosted 64-core runner. All six validations
+  (see `sim/simx/gem5/gem5_api_notes.md`) pass on both ISAs.
+  Key discoveries committed: (1) SE-mode PIO attachment is
+  possible but requires bypassing the `SimpleBoard` high-level
+  API; (2) out-of-tree SimObject install needs **no** top-level
+  SConstruct patch — pure `cp -r`; (3) PCIe (Path 2 in §3.6) is
+  a clean Phase 5+ upgrade because `PciDevice` inherits
+  `DmaDevice` and shares the same C ABI surface.
+- **Phase 1** (`Processor::cycle()` + `memsim()`): **1–2 h estimated;
+  ✅ COMPLETE 2026-05-16** in ~1 h. ~50-line patch to
+  `processor.{cpp,h}` + `processor_impl.h`. Default `make -C
+  sim/simx` and `USE_SST=1` both build clean; `simx hello.vxbin`
+  prints `#0: Hello World!`. **Bonus:** the SST integration was
+  previously broken at the `proc_->cycle()` call site
+  (`sim/simx/sst/vortex_simulator.cpp:64`) and would not link; with
+  Phase 1 in place, `sst ci/sst_test_vortex_hello.py` runs
+  end-to-end and exits cleanly at 4.643 µs simulated time.
+- **Phase 2** (`libvortex-gem5.so` + C ABI + in-process smoke):
+  **4–6 h estimated; ✅ COMPLETE 2026-05-16** in ~1.5 h. Files added:
+  `sim/simx/gem5/vortex_gpgpu.{h,cpp}` (the C ABI library) and
+  `sim/simx/gem5/gem5_smoke_main.cpp` (the in-process smoke driver).
+  `sim/simx/Makefile` extended with a `USE_GEM5=1` gate that
+  produces `libvortex-gem5.so` (1.5 MB) + `gem5_smoke` (16 KB
+  driver linking against the lib). `gem5_smoke hello.vxbin` →
+  `#0: Hello World!`, 4642 cycles, exit_code=0 (correctly read back
+  via `vortex_gem5_vram_read` after the cache-flush DCR path —
+  validating B9 from §2.2 is fixed). Default `make -C sim/simx`
+  unchanged (only `simx` produced; gem5 sources fully gated).
+  `USE_SST=1 USE_GEM5=1` correctly rejected by the Makefile per
+  §8 (mutual exclusion). Side fix: `sw/common/bitmanip.h` was
+  missing `<type_traits>` and `<algorithm>` includes — header
+  hygiene fix benefits any caller (per
+  [feedback_always_correct_fix_not_patch](../../../../.claude/projects/-home-blaisetine-dev/memory/feedback_always_correct_fix_not_patch.md)).
+- **Phase 3** (gem5 SimObject + Python + install.sh): **6–10 h
+  estimated; ✅ COMPLETE 2026-05-16** in ~1.5 h. Files added:
+  `sim/simx/gem5/vortex_gpgpu_dev.{cc,hh}` (gem5 `DmaDevice` subclass
+  with `dlopen` + `EventFunctionWrapper` tick scheduling),
+  `sim/simx/gem5/VortexGPGPU.py` (Python binding with `library=` +
+  `kernel=` parameters), `sim/simx/gem5/SConscript`. Updated
+  `install.sh` to install the real device and remove the Phase 0
+  dummy scaffolding from `$GEM5_HOME` cleanly. New test:
+  `ci/gem5_test_vortex_hello.py` (standalone-device variant, no
+  host CPU needed). Validation: both `build/X86/gem5.opt` and
+  `build/ARM/gem5.opt` import `VortexGPGPU` and run hello.vxbin to
+  completion at tick 4,643,000 (1 GHz clock → 4643 cycles, matching
+  Phase 1 SST + Phase 2 in-process within 1 cycle). **Three
+  harnesses now validated through the same `Processor::cycle()` API:
+  SST, in-process C ABI, and gem5 SimObject.**
+- **Phase 4** (host runtime, x86 + ARM): **6–10 h estimated; ✅ x86
+  PATH COMPLETE 2026-05-16** in ~1 h; aarch64 cross-build gated on
+  the user's `sudo apt install gcc-aarch64-linux-gnu`. Files added:
+  `sw/runtime/gem5/driver.{cpp,h}` (direct MMIO + mmio_fence helper
+  with per-arch barrier; bump-allocator for the pinned region),
+  `sw/runtime/gem5/vortex.cpp` (OPAE-shaped `vx_device` with the
+  full callback table — compile-time caps from VX_config.h since
+  the host runtime and the device library are built from the same
+  source tree), `sw/runtime/gem5/Makefile` (HOST_ARCH ∈
+  {x86_64,aarch64,armhf} → matching cross-compiler; produces
+  `libvortex-gem5-$ARCH.so`). All three B-bugs addressed: B9 (cache
+  flush before download via per-core `dcr_read(VX_DCR_BASE_CACHE_FLUSH,
+  cid)`), B13 (per-arch compiler via `HOST_ARCH`), B14 (mmio_fence()
+  centralised in `issue_cmd()` so every CMD_TYPE write is fenced
+  by construction). Validation: `make -C sw/runtime/gem5 HOST_ARCH=x86_64`
+  → `libvortex-gem5-x86_64.so` (43 KB, ELF 64-bit x86-64, SONAME
+  correct, exports `vx_dev_init` matching the OPAE/SimX backend
+  pattern).
+- **Phase 5** (end-to-end gem5 tests): **4–6 h estimated; ✅ x86
+  PATH COMPLETE 2026-05-17** in ~3 h. The bulk of the work turned
+  out to be the OPAE state machine on the device side (cmd_args
+  latching, busy bit, dcr_rsp register) plus the dmaAction
+  dispatch in the SimObject — the test scripts themselves were
+  small. Files added:
+  `ci/gem5_test_vortex_vecadd.py` (full e2e: x86 CPU + identity-mapped
+  PIO+PIN regions + Process.map() + Vortex device). The Phase 3
+  standalone `ci/gem5_test_vortex_hello.py` continues to pass as a
+  fast smoke test. Phase 5 also extended Phase 2's
+  `sim/simx/gem5/vortex_gpgpu.{cpp,h}` with the full OPAE protocol
+  state machine and Phase 3's `sim/simx/gem5/vortex_gpgpu_dev.cc`
+  with `pop_pending_cmd` → `dmaRead`/`dmaWrite` dispatch.
+  Validation: `vecadd -n16` PASSED!, kernel ran 454 cycles at
+  IPC 0.247 on 4×4 threads/warps. Side fix: glibc's `nanosleep()`
+  routes through `clock_nanosleep` (#230) which gem5 SE-mode
+  doesn't implement — switched the host runtime's poll-loop back-off
+  to `sched_yield()` (in gem5's syscall table). ARM e2e gated on
+  user `sudo apt install gcc-aarch64-linux-gnu g++-aarch64-linux-gnu`
+  (same gate as Phase 4's aarch64 build).
+- **Phase 6** (CI): **2–3 h estimated; ✅ COMPLETE 2026-05-17** in
+  ~30 min. Added `gem5()` function to `ci/regression.sh.in`
+  (mirrors `sst()` shape; builds prerequisites + runs both Phase 3
+  standalone and Phase 5 e2e tests via `timeout 120` per
+  [feedback_test_timeout_120s](../../../../.claude/projects/-home-blaisetine-dev/memory/feedback_test_timeout_120s.md);
+  ARM matrix opt-in via `VORTEX_GEM5_ARM=1`). Added `--gem5` case
+  dispatch + `--gem5` to the show_usage line. Updated
+  `.github/workflows/ci.yml`: appended `ci/gem5_install.sh` to the
+  `Setup Toolchain` step (gated on `cache-toolchain.outputs.cache-hit`
+  like SST), added `Export gem5 paths` step (GEM5_HOME + PATH for
+  `build/X86`), added `gem5` to the `tests.matrix.name` list with
+  `exclude: name=gem5 xlen=64` (the device library is XLEN-locked
+  by the gem5 install; one entry is sufficient). Validation:
+  `./ci/regression.sh --gem5` PASSED end-to-end in **5 seconds**
+  (Phase 3 hello standalone + Phase 5 vecadd e2e, both clean).
+- **Phase 7** (docs): **1–2 h estimated; ✅ COMPLETE 2026-05-17** in
+  ~45 min. Added `docs/gem5_integration.md` covering: install
+  (`ci/gem5_install.sh`), Vortex+gem5 build (`USE_GEM5=1`), host
+  runtime cross-compile (`HOST_ARCH`), running tests
+  (`./ci/regression.sh --gem5` and standalone hand commands),
+  a complete minimal Python recipe for hosting Vortex in a custom
+  gem5 system, **six load-bearing invariants** (Process.map order,
+  identity-mapped PIO+PIN, cache flush before download, MMIO
+  fence, single source of truth for memory, USE_SST/GEM5 mutex),
+  architectural choices worth revisiting (doorbells vs. polling,
+  PCIe upgrade path, C ABI rationale), CI integration, and a
+  troubleshooting table covering the 7 most common error modes
+  (wrong library path, missing LD_LIBRARY_PATH, clock_nanosleep
+  syscall, orphan Process, wrong `library=` param, busy-bit hang,
+  ccache stale objects). Added to `docs/index.md`.
+
+Total: **~30–49 hours** of focused work (was ~26–41 h before Phase 0
+was added as a separate phase; the actual work has not grown — the
+gem5 install was implicit in the old Phase 2 estimate and is now
+explicit in Phase 0). Substantial enough to warrant its own branch
+(`gem5_simx_v3` or similar).
+
+**Sequencing with SST:** Phase 1 (`Processor::cycle()`) is shared;
+do it once and both integrations benefit. If SST lands first, gem5
+reuses `Processor::cycle()` unchanged. If gem5 lands first, the SST
+integration's broken `proc_->cycle()` reference
+(`sim/simx/sst/vortex_simulator.cpp:64`) gets fixed as a side effect
+of Phase 1 — net win for both. Phase 0 is gem5-only; SST integration
+does not benefit from it.
diff --git a/docs/proposals/gem5_v2_cp_migration_proposal.md b/docs/proposals/gem5_v2_cp_migration_proposal.md
new file mode 100644
index 000000000..035d0805b
--- /dev/null
+++ b/docs/proposals/gem5_v2_cp_migration_proposal.md
@@ -0,0 +1,758 @@
+# gem5 v2 Backend Redesign — CP-First, Event-Driven Architecture
+
+**Date:** 2026-05-18
+**Status:** Draft for review (supersedes the v1 draft of this file)
+**Author:** Blaise Tine
+
+**Related:**
+- [gem5_simx_v3_proposal.md](gem5_simx_v3_proposal.md) — the prior gem5 integration design (OPAE-style MMIO command FSM). This proposal supersedes its §3 (host/device protocol) and §4 (gem5 SimObject design).
+- Upstream proposals on `origin/tinebp-patch-2`:
+  - [command_processor_proposal.md](command_processor_proposal.md) — CP RTL architecture, vortex2.h API, OpenCL 1.2 mapping.
+  - [cp_pure_v2_callbacks_proposal.md](cp_pure_v2_callbacks_proposal.md) — pure-v2 `callbacks_t` + `vortex::CommandProcessor` C++ class for simx/rtlsim.
+- Upstream commits this proposal targets:
+  - `086d26b` runtime: strip legacy launch_*/dcr_* from callbacks_t (Phase E — pure v2)
+  - `8bc2564` runtime: add cp_mmio_write/read callbacks; wire all 4 backends
+  - `16aa1ca` sim/common: software CommandProcessor C++ class + unit test
+  - `04971a2` tests/regression: rewrite vecadd + sgemm from scratch on vortex2.h
+  - `00aa42f` docs: pure-v2 callbacks_t + software CP for simx/rtlsim
+
+**Decisions ratified before this draft (recorded for traceability):**
+- D1 — **Data plane unified through CP**. All ordered host↔device transfers go through `CMD_MEM_*` in a CP queue. `callbacks_t::mem_upload/download/copy` are reserved for the dispatcher's cold-start writes (ring buffer seeding, kernel ELF preload). No second data path.
+- D2 — **Single clock domain for CP + Vortex in v1**. CP and Vortex tick at the same rate; separate `ClockDomain`s are a follow-on.
+- D3 — **In-process VRAM with DMA-port seam designed in**. CP and Vortex memory accesses go through a single accessor interface backed by the in-process `RAM` in v1; v2 swaps in a gem5 `SimpleMemory` via the SimObject's DMA port behind the same interface.
+- D4 — **MAX_QUEUES = 4** in the gem5 PIO map (matches upstream `VX_CP_NUM_QUEUES` default). v1 host runtime exercises Q0 only; Q1–Q3 hardware is ready for future v2.h multi-queue work.
+
+---
+
+## 0. Purpose
+
+The original gem5 backend ([gem5_simx_v3_proposal.md](gem5_simx_v3_proposal.md))
+shipped an OPAE-style MMIO command FSM on the device and a synchronous
+`vx_start`/`vx_ready_wait` runtime on the host. That was a deliberate
+bring-up choice — it reused the OPAE protocol so we could validate the
+gem5 SE-mode integration (PIO, DMA, cross-arch, ELF interp redirection)
+in isolation from the broader v2 runtime work.
+
+That bring-up is done. With upstream's pure-v2 `callbacks_t` landed,
+keeping the OPAE FSM means:
+- Two control planes coexist on the device (legacy CMD_* state machine
+  AND the CP regfile), doubling the device-side surface.
+- The host runtime carries dead code: `start()`, `ready_wait()`,
+  `dcr_write/read`, and their MMIO poll loops, none of which the
+  dispatcher will call again.
+- The SimObject's polled-tick model misuses gem5's event scheduler:
+  it ticks every clock period even when there's no work, and the host
+  has to spin-wait on `Q_SEQNUM` between unsynchronized tick events.
+
+This proposal is a **redesign**, not a port. It deletes the OPAE
+control plane entirely, makes the CP a first-class event-driven gem5
+device block, runs the Vortex `Processor` as a parallel gem5 event
+chain, and rebuilds the host runtime as a thin shim over the CP
+regfile. The end-state is structurally identical to how a real PCIe
+GPU is modeled in gem5: a SimObject hosting an FSM that fetches
+commands, dispatches DMAs, and kicks an asynchronous compute engine.
+
+---
+
+## 1. What changed upstream (verbatim summary)
+
+The new pure-v2 `callbacks_t` ([sw/runtime/common/callbacks.h](../../sw/runtime/common/callbacks.h)
+on `origin/tinebp-patch-2`) contains:
+
+```
+dev_open, dev_close
+query_caps, memory_info
+mem_alloc, mem_reserve, mem_free, mem_access
+mem_upload, mem_download, mem_copy
+cp_mmio_write, cp_mmio_read       // NEW — sole control plane
+```
+
+It no longer contains `start`, `ready_wait`, `dcr_write`, `dcr_read`.
+
+The dispatcher in [sw/runtime/common/vx_device.cpp](../../sw/runtime/common/vx_device.cpp)
+is now the single source of truth for CP command building. Every
+kernel launch, every DCR program, every fence, every event becomes a
+`CMD_*` descriptor written into a ring buffer in device memory, with
+`cp_mmio_write(Q_TAIL_HI)` as the publish doorbell.
+
+The CP itself ([sim/common/CommandProcessor.h](../../sim/common/CommandProcessor.h))
+is a clock-ticked FSM with 5 hooks (note: not 6 — `vortex_dcr_read` is
+handled by the CP's `dram_write` path back to the requesting `CMD_DCR_READ`'s
+writeback address, not a dedicated hook):
+
+```cpp
+struct Hooks {
+    std::function<void(uint64_t addr, void* dst, size_t bytes)> dram_read;
+    std::function<void(uint64_t addr, const void* src, size_t bytes)> dram_write;
+    std::function<void(uint32_t addr, uint32_t value)> vortex_dcr_write;
+    std::function<void()> vortex_start;
+    std::function<bool()> vortex_busy;
+};
+```
+
+The CP regfile lives at offset `0x1000` on opae/xrt (the AFU shim adds
+the base); the simulator-internal contract per `cp_pure_v2 §6.3` is
+that `cp_mmio_write(off, val)` takes a **CP-internal** offset and each
+backend wrapper adds its own base.
+
+---
+
+## 2. Design pillars
+
+Six pillars define the redesign. Each is a deliberate departure from
+the v1 OPAE-style design.
+
+### 2.1 Single control plane: CP regfile MMIO only
+
+The PIO range is sized for the CP regfile, period. No more legacy
+OPAE CMD_TYPE / CMD_ARG / STATUS registers, no reserved 4 KiB hole,
+no CMD_* state machine on the SimObject.
+
+PIO layout:
+
+```
+PIO_BASE + 0x0000 .. 0x003F   CP global header (CTRL, STATUS, CAPS, IRQ)
+PIO_BASE + 0x0040 .. 0x004F   CP profiling block (CYCLE_LO/HI, FREQ_HZ)
+PIO_BASE + 0x0100 .. 0x01FF   CP per-queue regfile (4 × 0x40)
+                              Q0: 0x0100..0x013F
+                              Q1: 0x0140..0x017F
+                              Q2: 0x0180..0x01BF
+                              Q3: 0x01C0..0x01FF
+PIO_BASE + 0x0200 .. end      reserved (future profiling per-queue, IRQ, …)
+```
+
+Total PIO size: **`0x0200`** (was `0x1000`).
+
+The host wrapper `cp_mmio_write(off, val)` is:
+
+```cpp
+// sw/runtime/gem5/vortex.cpp
+int cp_mmio_write(uint32_t off, uint32_t value) {
+    driver_.mmio_write32(PIO_BASE_ADDR + off, value);
+    return 0;
+}
+```
+
+No `+0x1000` adjustment — gem5 doesn't need to match the AFU's `bit[12]`
+control/data split because there is no AFU. CP regfile starts at
+`PIO_BASE + 0x0`.
+
+### 2.2 Single data plane: CP commands via `CMD_MEM_*`
+
+`vx_enqueue_write/read/copy` (vortex2.h) emit `CMD_MEM_*` descriptors
+into a queue's ring buffer. The CP executes them via its DMA hooks
+against device VRAM. The same path serves user buffer transfers as
+serves CP descriptor fetches — one accessor interface.
+
+`callbacks_t::mem_upload/download/copy` are reserved for the
+dispatcher's **cold-start** writes only: seeding ring buffers at queue
+create, preloading kernel ELFs into device VRAM before they are
+referenced by a `vx_launch_info_t`. The dispatcher does not use them
+on the user-facing data plane.
+
+In our gem5 setup this is essentially free: PIN_BASE_ADDR is
+identity-mapped into the host process VA via `Process::map`, so
+`mem_upload(dev_va, host_src, size)` is `memcpy(host_va_of_PIN_BASE +
+dev_va, host_src, size)` — a regular store sequence that gem5
+translates through the page table to the same physical bytes the
+SimObject's `ram_` sees. No PIO trigger, no command descriptor, no
+state machine.
+
+### 2.3 Event-driven CP, not polled tick
+
+The CP is a self-scheduling gem5 event:
+
+```cpp
+// sim/simx/gem5/vortex_gpgpu_dev.hh — sketch
+class VortexGPGPU : public DmaDevice {
+    EventFunctionWrapper cpTickEvent_;
+    EventFunctionWrapper vortexTickEvent_;
+
+    void cpTick();      // calls cp_.tick(); reschedules if cp_ has work
+    void vortexTick();  // calls processor_.cycle(); reschedules if !is_done()
+};
+```
+
+`cpTick()` calls `cp_.tick()` once and reschedules itself at
+`clockEdge(Cycles(1))` **only if** the CP reports it still has work
+(queue non-empty, command in flight, completion writeback pending).
+Otherwise it returns and the CP is dormant.
+
+CP wake-up paths:
+- Host writes `Q_TAIL_HI` (queue doorbell) → `cp_mmio_write` schedules
+  `cpTickEvent_` at the next clock edge if not already scheduled.
+- Host writes `CP_CTRL.enable` → same.
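+- Vortex completion needs no separate wake-up: while `CMD_LAUNCH` is in flight the CP keeps
+  rescheduling itself and polls `vortex_busy()` each tick (§2.4), retiring the command once it clears.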
+- CP DMA completion → CP self-reschedules until the DMA retires.
+
+**No bounded-tick-burst around `cp_mmio_*`.** No `VORTEX_USE_CP=0`
+transparent-mode escape hatch. The CP is always real, always
+event-driven, and the gem5 event queue arbitrates between CP, Vortex,
+host CPU, and any other SimObjects in the system the way gem5
+expects.
+
+### 2.4 Vortex `Processor` as a parallel event chain
+
+The Vortex GPU runs in its own gem5 event chain, scheduled by the
+CP's `vortex_start` hook and torn down when `processor_.is_done()`:
+
+```cpp
+auto vortex_start = [this]() {
+    if (!vortexTickEvent_.scheduled())
+        schedule(vortexTickEvent_, clockEdge(Cycles(1)));
+};
+
+void VortexGPGPU::vortexTick() {
+    processor_->cycle();
+    if (processor_->any_running())
+        schedule(vortexTickEvent_, clockEdge(Cycles(1)));
+    // CP polls processor_->any_running() via the vortex_busy hook
+    // from its own tick; no notification needed.
+}
+```
+
+Both `cpTickEvent_` and `vortexTickEvent_` use the same `ClockDomain`
+(D2). The gem5 event queue interleaves them with whatever simulated
+host CPU work is happening at the same simulated time — that's where
+the concurrency-realism win comes from. It is also what makes the
+simulation faster overall: idle blocks (CP between commands, Vortex
+between launches) do not consume tick events.
+
+### 2.5 Single VRAM accessor — in-process for v1, DMA-port seam for v2
+
+All device memory access — CP ring fetches, completion writebacks,
+DMA payload reads/writes, Vortex's `MemSim` accesses — goes through
+one accessor interface:
+
+```cpp
+// sim/simx/gem5/dev_mem.h
+class DevMemAccessor {
+public:
+    virtual void read (uint64_t addr, void* dst, size_t bytes) = 0;
+    virtual void write(uint64_t addr, const void* src, size_t bytes) = 0;
+};
+
+class InProcessDevMem : public DevMemAccessor { /* wraps simx::RAM */ };
+class DmaPortDevMem   : public DevMemAccessor { /* wraps DmaDevice port */ };
+```
+
+v1: `Gem5Device` constructs an `InProcessDevMem` wrapping the existing
+`simx::RAM`. CP `dram_read/write` hooks call through it. Vortex's
+`MemSim::read/write` calls through it. PIN_BASE_ADDR's identity
+mapping makes the host process see the same bytes.
+
+v2 seam: replace `InProcessDevMem` with `DmaPortDevMem` (and back VRAM
+with a gem5 `SimpleMemory` connected to the SimObject's DMA port).
+**Zero changes to CP hooks, zero changes to Vortex memory code, zero
+changes to host runtime.** That's the entire point of the abstraction
+— the v2 path is a localized swap, not a rewrite.
+
+This pillar is the reason "in-process for v1" is the right answer
+(per D3): the accessor seam captures the design intent of v2 without
+paying its cost upfront.
+
+### 2.6 Multi-queue PIO map from day one
+
+PIO map reserves 4 queue regfile slots (D4). v1 host runtime enables
+Q0 only and the CP runs Q0 only. The 3 unused queue slots cost 192
+bytes (3 × 0x40) of PIO range and let the hardware grow into v2.h multi-queue
+without re-versioning the PIO layout (and without bumping the host
+process's mmap size).
+
+Picking 4 now means the gem5 device's regfile shape **matches
+upstream `VX_CP_NUM_QUEUES = 4`**. The OPAE/XRT AFUs will instantiate
+the same 4-queue CP; gem5 should not be the odd one out.
+
+---
+
+## 3. Address space layout
+
+The full memory map after the redesign:
+
+```
+Host process VA (simulated, gem5 SE-mode)
+  0x0000_0000_0000 .. 0x0000_0FFF_FFFF   normal heap / stack / mmap
+  0x0000_1000_0000 .. 0x0000_1FFF_FFFF   PIN_BASE_ADDR (device VRAM,
+                                          identity-mapped via Process::map)
+  0x0000_2000_0000 .. 0x0000_2000_01FF   PIO_BASE_ADDR (CP regfile)
+  0x0000_2000_0200 .. 0x0000_2FFF_FFFF   reserved (future PIO blocks)
+
+gem5 SimObject PA
+  PIN_BASE_ADDR .. PIN_BASE_ADDR + ram_size   device VRAM backing store
+  PIO_BASE_ADDR .. PIO_BASE_ADDR + 0x0200      CP regfile (PIO range)
+```
+
+`PIN_BASE_ADDR` is the same VA on both sides because `Process::map`
+identity-maps it into the simulated host process. The CP and Vortex
+see the same physical bytes; the host process writes to them as
+ordinary memory.
+
+`PIO_BASE_ADDR` is **only** the CP regfile after this redesign. The
+4 KiB OPAE legacy reserved block is gone.
+
+---
+
+## 4. Data flow walkthroughs
+
+### 4.1 Cold start (queue create)
+
+```
+host runtime                                 gem5 SimObject + CP
+─────────────────────────────────────────    ────────────────────────────
+vx_device_open                                — (handle alloc; no IO)
+  └─ callbacks->dev_open()
+       └─ open libvortex-gem5-x86_64.so
+       └─ vortex_gem5_dev_open(...)          construct Gem5Device:
+                                               - new simx::RAM
+                                               - new InProcessDevMem
+                                               - new simx::Processor (wired to InProcessDevMem)
+                                               - new vortex::CommandProcessor
+                                                     (hooks: dram_read/write
+                                                      → InProcessDevMem,
+                                                      vortex_dcr_write → proc_.dcr_write,
+                                                      vortex_start → schedule(vortexTickEvent_),
+                                                      vortex_busy  → proc_.any_running())
+                                               cpTickEvent_ descheduled (no work yet)
+                                               vortexTickEvent_ descheduled
+
+dispatcher: vx_queue_create
+  └─ mem_alloc(ring_size, &ring_va)          allocate from device VRAM bump allocator
+  └─ mem_alloc(8, &head_va)
+  └─ mem_alloc(8, &cmpl_va)
+  └─ mem_upload(ring_va, zeros, ring_size)   memcpy through PIN_BASE mapping
+  └─ cp_mmio_write(Q0_RING_BASE_LO, ring_va lo)
+  └─ cp_mmio_write(Q0_RING_BASE_HI, ring_va hi)
+  └─ cp_mmio_write(Q0_HEAD_ADDR_LO/HI, head_va)
+  └─ cp_mmio_write(Q0_CMPL_ADDR_LO/HI, cmpl_va)
+  └─ cp_mmio_write(Q0_RING_SIZE_LOG2, log2)
+  └─ cp_mmio_write(Q0_CONTROL, enable=1)     → SimObject PIO write handler:
+                                                   cp_.mmio_write(off, val);
+                                                   if cp_.has_work() and
+                                                      !cpTickEvent_.scheduled():
+                                                     schedule(cpTickEvent_,
+                                                              clockEdge(Cycles(1)))
+                                                   (CP has work because Q0 is now enabled
+                                                    and may have a non-empty ring)
+  └─ cp_mmio_write(CP_CTRL, enable=1)        — already enabled (idempotent)
+```
+
+### 4.2 Kernel launch (`vx_enqueue_launch`)
+
+```
+dispatcher                                   CP (in SimObject)
+─────────────────────────────────────────    ────────────────────────────
+mem_upload(ring_va + tail, CMD_DCR_WRITE     (rings now non-empty;
+            for KMU PC, grid, block, args)   CP will fetch when scheduled)
+mem_upload(ring_va + tail, CMD_LAUNCH)
+cp_mmio_write(Q0_TAIL_LO, tail_lo)
+cp_mmio_write(Q0_TAIL_HI, tail_hi)           → cpTickEvent_ schedule check fires;
+                                              schedule for next clock edge
+
+(host returns immediately — async by design;
+ dispatcher does not block here. Polling
+ happens later via cp_mmio_read(Q0_SEQNUM)
+ from vx_event_wait_all.)
+```
+
+At next clock edge:
+
+```
+cpTick():
+  cp_.tick()
+    [CPE0 FSM: fetch ring head cache line
+     via dram_read(ring_va, &cl, 64) → InProcessDevMem.read → ram_.read]
+    [decode CMD_DCR_WRITE; route through vortex_dcr_write hook
+     → proc_.dcr_write(addr, value); retire; bump seqnum]
+    [dram_write(cmpl_va, &seqnum, 8); dram_write(head_va, &head, 8)]
+  reschedule cpTickEvent_ (still has CMD_LAUNCH pending)
+
+cpTick() (next):
+  cp_.tick()
+    [CPE0 FSM: fetch next CL, decode CMD_LAUNCH]
+    [vortex_start() → schedule(vortexTickEvent_, clockEdge(Cycles(1)))]
+    [CPE0 enters WAIT_FOR_BUSY state — polls vortex_busy() each tick]
+  reschedule cpTickEvent_ (CMD_LAUNCH in flight)
+
+… concurrent vortexTick() advances processor_.cycle() …
+… until processor_.is_done(); on next CP tick:
+
+cpTick():
+  cp_.tick()
+    [CPE0 sees !vortex_busy(); retire CMD_LAUNCH; bump seqnum]
+    [dram_write(cmpl_va, &seqnum, 8)]
+  CP has no more work; do NOT reschedule cpTickEvent_.
+  vortexTickEvent_ stopped scheduling itself when is_done() became true.
+  System is dormant.
+
+Host poll:
+  cp_mmio_read(Q0_SEQNUM_LO)                 → SimObject PIO read handler:
+                                                   return cp_.mmio_read(off);
+                                              (returns the retired seqnum;
+                                               no tick burst needed because the
+                                               cmpl_va writeback already happened
+                                               in earlier cpTick())
+```
+
+The host never blocks on a launch (it polls `Q0_SEQNUM` only from `vx_event_wait_all`). The CP never
+idle-ticks. Vortex never runs past `is_done()`. This is the win.
+
+### 4.3 `vx_enqueue_write` (data plane through CP)
+
+```
+dispatcher                                   CP
+─────────────────────────────────────────    ────────────────────────────
+(host_src is in regular heap, not PIN_BASE   — note: dispatcher copies
+ — so the dispatcher copies it into a       payloads into PIN_BASE first
+ pinned device buffer it owns OR the         on backends that require it.
+ caller used vx_buffer_map to write          For gem5 + Process::map this is
+ directly into a host-mapped device          a memcpy through the mapped
+ buffer)                                     pages.)
+
+mem_upload(ring_va + tail, CMD_MEM_WRITE
+            { src=pinned_host_va,            (pinned_host_va is in PIN_BASE
+              dst=dev_va,                     so it's also a device PA)
+              size=N })
+cp_mmio_write(Q0_TAIL_HI, ...)               → CP schedules
+
+cpTick():
+  cp_.tick()
+    [decode CMD_MEM_WRITE]
+    [CP DMA FSM: dram_read(src, &buf, chunk)
+                  dram_write(dst, &buf, chunk)
+     looping over the transfer in 64 B steps]
+    [retire; bump seqnum; cmpl writeback]
+```
+
+Both endpoints (`src`, `dst`) are in the same flat physical space
+(PIN_BASE region). The CP's DMA FSM doesn't distinguish host vs.
+device addresses — they're the same accessor.
+
+---
+
+## 5. Component design
+
+### 5.1 `sim/simx/gem5/vortex_gpgpu.{cpp,h}` — device library
+
+**Responsibilities:**
+- Construct `RAM`, `Processor`, `CommandProcessor`, `InProcessDevMem`.
+- Provide C ABI: `vortex_gem5_dev_open/close`, `vortex_gem5_cp_mmio_{read,write}`,
+  `vortex_gem5_dram_access` (for SimObject DMA path → backing store),
+  `vortex_gem5_cp_tick`, `vortex_gem5_vortex_tick`,
+  `vortex_gem5_cp_has_work`, `vortex_gem5_vortex_busy`.
+- Provide kernel preload for the Phase 3 standalone test (unchanged).
+
+**Removed (all OPAE state machine carry-over):**
+- `pending_cmd_`, `cmd_args_`, `dcr_rsp_`, `busy_` fields
+- `mmio_write64`/`mmio_read64` and the CMD_TYPE dispatch
+- `pop_pending_cmd`, `get_cmd_arg`, `set_busy`, `load_args`
+- `process_cmd` and the `CMD_RUN`/`CMD_DCR_*`/`CMD_MEM_*` handlers
+  (last one re-emerges inside the CP, not here)
+
+**Added:**
+- `cp_` member (`vortex::CommandProcessor`) with hooks bound in ctor.
+- `dev_mem_` member (`std::unique_ptr<DevMemAccessor>`) — `InProcessDevMem`
+  for v1.
+- C-ABI surface for the SimObject (below).
+
+### 5.2 `sim/simx/gem5/vortex_gpgpu_dev.{cc,hh}` — gem5 SimObject
+
+**Class:** `VortexGPGPU : public DmaDevice` (unchanged from current).
+
+**Members:**
+- `pioAddr_, pioSize_ = 0x0200` (was `0x1000`).
+- `EventFunctionWrapper cpTickEvent_;`
+- `EventFunctionWrapper vortexTickEvent_;`
+- `deviceHandle_` — opaque from device library.
+
+**PIO `read(PacketPtr)`:**
+```cpp
+const Addr off = pkt->getAddr() - pioAddr_;
+uint32_t value = 0;
+abi_.cp_mmio_read(deviceHandle_, uint32_t(off), &value);
+pkt->setLE<uint32_t>(value);
+pkt->makeAtomicResponse();
+return pioLatency_;
+```
+
+**PIO `write(PacketPtr)`:**
+```cpp
+const Addr off = pkt->getAddr() - pioAddr_;
+const uint32_t value = pkt->getLE<uint32_t>();
+abi_.cp_mmio_write(deviceHandle_, uint32_t(off), value);
+maybeWakeCp();
+pkt->makeAtomicResponse();
+return pioLatency_;
+```
+
+**`maybeWakeCp()`:**
+```cpp
+if (abi_.cp_has_work(deviceHandle_) && !cpTickEvent_.scheduled())
+    schedule(cpTickEvent_, clockEdge(Cycles(1)));
+```
+
+**`cpTick()`:**
+```cpp
+abi_.cp_tick(deviceHandle_);
+if (abi_.cp_has_work(deviceHandle_))
+    schedule(cpTickEvent_, clockEdge(Cycles(1)));
+```
+
+**`vortexTick()`:**
+```cpp
+abi_.vortex_tick(deviceHandle_);
+if (abi_.vortex_busy(deviceHandle_))
+    schedule(vortexTickEvent_, clockEdge(Cycles(1)));
+```
+
+**`vortex_start` hook callback (from device library into the SimObject):**
+schedules `vortexTickEvent_` at next clock edge if not scheduled.
+Implemented as a small C ABI: `vortex_gem5_set_start_handler(handle,
+fn, ctx)` registered in `VortexGPGPU::init()`; the device library
+calls it from the CP's `vortex_start` lambda.
+
+### 5.3 `sw/runtime/gem5/vortex.cpp` — host runtime
+
+**Responsibilities (shrunken):**
+- `init` / `get_caps` / `mem_info` (unchanged)
+- `mem_alloc` / `mem_reserve` / `mem_free` / `mem_access` (unchanged
+  bump allocator + `PIN_BASE_ADDR` math)
+- `mem_upload` / `mem_download` / `mem_copy` → `memcpy` through the
+  PIN_BASE identity mapping (renamed from `upload`/`download`/`copy`
+  in the v1 backend)
+- `cp_mmio_write` → `driver_.mmio_write32(PIO_BASE_ADDR + off, val)`
+- `cp_mmio_read` → `driver_.mmio_read32(PIO_BASE_ADDR + off, &val)`
+
+**Removed:**
+- `start()`, `ready_wait()`, `dcr_write()`, `dcr_read()` methods
+- `MMIO_CMD_TYPE` / `MMIO_STATUS` constants and their poll loop
+- `<sched.h>` and the `sched_yield()` back-off (no host poll loop —
+  the dispatcher's `vx_event_wait_all` does its own polling against
+  `Q_SEQNUM`)
+
+**Kept:**
+- The pinned-region setup, `PIN_BASE_ADDR`, `PIO_BASE_ADDR`,
+  `mmio_fence()`, the bump allocator state.
+
+### 5.4 `sw/runtime/gem5/driver.{cpp,h}` — pinned region + MMIO helpers
+
+**Added:**
+- `mmio_write32(uint64_t pa, uint32_t value)` — 4-byte store with
+  fence. Implemented as `*reinterpret_cast<volatile uint32_t*>(pa) =
+  value; mmio_fence();`.
+- `mmio_read32(uint64_t pa, uint32_t* value)` — symmetric.
+
+**Removed:**
+- `mmio_write64` / `mmio_read64` — no caller after the redesign.
+- The 64-bit MMIO path was a v1 choice for OPAE-style 8-byte argument
+  registers. The CP regfile is 32-bit.
+
+### 5.5 `sim/simx/gem5/VortexGPGPU.py` — SimObject Python binding
+
+**Params:**
+- `library = Param.String(...)` (unchanged)
+- `kernel = Param.String("")` (Phase 3 standalone preload — unchanged)
+- `pio_addr = Param.Addr(0x20000000)` (unchanged)
+- `pio_size = Param.Addr(0x0200)` — **changed from 0x1000** to match
+  the redesigned PIO map
+- `pio_latency = Param.Latency("100ns")` (unchanged)
+- `dma_latency = Param.Latency("100ns")` (unchanged)
+- (new) `max_queues = Param.Unsigned(4)` — for forward compatibility;
+  v1 enforces == 4
+
+### 5.6 `sim/simx/Makefile` — build wiring
+
+- Add `$(SIM_COMMON_DIR)/CommandProcessor.cpp` to the `USE_GEM5=1`
+  source list (the device library links it; the SimObject indirects
+  via the C ABI).
+
+### 5.7 `sw/runtime/gem5/Makefile` — build wiring
+
+- No source-list changes (the CommandProcessor lives in the device
+  library, not the host runtime).
+- `<sched.h>` include and any sched-related CFLAGS go away with the
+  `sched_yield` poll loop.
+
+---
+
+## 6. Migration phasing
+
+The whole redesign lands as **one commit** per the "substantial,
+testable feature" rule. The internal phasing below is for validation
+checkpoints during implementation, not for separate commits.
+
+### Phase M1 — Merge upstream
+
+- `git merge --no-commit --no-ff origin/tinebp-patch-2`
+- Conflicts (all expected):
+  - `sw/runtime/stub/Makefile` — keep HOST_ARCH; take new v2 dispatcher SRCS
+  - Possibly `sw/runtime/common/callbacks.{h,inc}` — defer to upstream version
+- Build will not compile until M2 + M3 complete. That is acceptable
+  inside one commit; the commit is only created when M3 builds and
+  passes regression.
+
+### Phase M2 — Device-side redesign
+
+- Add `sim/simx/gem5/dev_mem.{h,cpp}` (`DevMemAccessor` + `InProcessDevMem`).
+- Rewrite `sim/simx/gem5/vortex_gpgpu.{cpp,h}`:
+  - Delete OPAE state machine (per §5.1).
+  - Embed `cp_` with hooks bound to `InProcessDevMem` + `proc_`.
+  - Export the new C ABI.
+- Rewrite `sim/simx/gem5/vortex_gpgpu_dev.{cc,hh}`:
+  - PIO range shrinks to 0x0200.
+  - `read`/`write` route 32-bit packets to `cp_mmio_{read,write}`.
+  - `cpTickEvent_` + `vortexTickEvent_` self-scheduling per §2.3, §2.4.
+  - `vortex_start` callback registration.
+- Update `VortexGPGPU.py` (`pio_size = 0x0200`, `max_queues = 4`).
+- `sim/simx/Makefile`: add `CommandProcessor.cpp`.
+
+**Validation:** `make -C build/sim/simx USE_GEM5=1` builds.
+`./hw/unittest/cp_sim/` unit test passes (smoke-tests the
+CommandProcessor wiring; runnable without gem5 itself).
+
+### Phase M3 — Host runtime redesign
+
+- Rewrite `sw/runtime/gem5/vortex.cpp`:
+  - Drop `start`/`ready_wait`/`dcr_*`.
+  - Rename `upload`/`download`/`copy` → `mem_upload`/`mem_download`/`mem_copy`.
+  - Add `cp_mmio_{read,write}` (3-line MMIO wrappers).
+  - Drop `<sched.h>` and the poll loop.
+- Add `mmio_{read,write}32` to `driver.{cpp,h}`; drop the 64-bit helpers.
+- Build for x86_64 (default) and aarch64 (cross-compile via existing
+  HOST_ARCH switch).
+
+**Validation:**
+- Hostless test (`ci/gem5_run_hostless_app.py`): PASSES.
+  (No host runtime involvement.)
+- `./ci/regression.sh --gem5`: PASSES — hello + vecadd + sgemm e2e on x86.
+- `VORTEX_GEM5_ARM=1 ./ci/regression.sh --gem5`: PASSES — same 3 tests
+  on aarch64. Total 6/6 PASS matches pre-redesign baseline.
+
+### Phase M4 — Documentation
+
+Update [docs/gem5_integration.md](../gem5_integration.md):
+- Replace the OPAE protocol description with the CP regfile + ring
+  buffer architecture.
+- Update the 6 load-bearing invariants list:
+  - Drop OPAE CMD_* invariants.
+  - Add: "CP regfile is at `PIO_BASE + 0x0`, 0x200 bytes, 32-bit
+    register stride."
+  - Add: "PIN_BASE is identity-mapped via Process::map; host
+    runtime's `mem_upload` is a direct memcpy."
+  - Add: "CP and Vortex tick events self-schedule only while work is
+    pending; idle is observable as cpTickEvent_ unscheduled."
+
+Update [docs/proposals/gem5_simx_v3_proposal.md](gem5_simx_v3_proposal.md):
+- Add a "Status: Superseded by gem5_v2_cp_migration_proposal" header
+  on §3 (host/device protocol) and §4 (SimObject design).
+- Keep §0–§2 (motivation, source-tree layout) and §5+ (testing,
+  install, cross-arch) — those parts remain accurate.
+
+---
+
+## 7. Validation criteria
+
+The redesign is complete when all of the following hold:
+
+1. **`./ci/regression.sh --gem5`** PASSES on x86 (hello + vecadd +
+   sgemm e2e). Total wall time ≤ 30 s (was 16 s pre-redesign; the
+   event-driven design should be at least as fast because idle blocks
+   no longer tick).
+2. **`VORTEX_GEM5_ARM=1 ./ci/regression.sh --gem5`** PASSES on
+   aarch64 (same 3 tests).
+3. **No regression on non-gem5 builds.** `make -C build/sim/simx`
+   (default), `USE_SST=1` still build and pass.
+4. **No OPAE leftovers grep-detectable.**
+   `grep -rE 'CMD_TYPE|CMD_RUN|pending_cmd_|get_cmd_arg' sim/simx/gem5/ sw/runtime/gem5/`
+   returns zero hits.
+5. **Event-driven invariants hold.** Run a sim with a 100 ms idle gap
+   between two enqueues; verify (via debug log) that `cpTickEvent_`
+   is unscheduled during the gap and that the host CPU advances
+   unhindered.
+6. **PIO map size matches design.** `pio_size = 0x0200` exposed in
+   `VortexGPGPU.py`; host runtime never writes outside that range.
+
+---
+
+## 8. Risks
+
+| # | Risk | Mitigation |
+|---|---|---|
+| R1 | CP `vortex_start` hook needs to schedule a gem5 event from inside a hook called during PIO write handling. gem5 SimObjects can `schedule()` from anywhere, but only from the gem5 thread. Verify the C ABI doesn't route the hook from a different thread. | Hooks are bound at construction; called from `cp_.tick()` which is called from `cpTick()` which is itself a gem5 event handler — same thread. No issue. |
+| R2 | `vortexTick()` advancing `processor_.cycle()` per Vortex clock period is slow if the cycle()-per-tick ratio is high (a Vortex clock period is shorter than a CPU-host instruction time). | Match Vortex's `ClockDomain` to a realistic Vortex frequency (1 GHz). gem5 only schedules events at actual clock edges; the per-tick cost is one C++ function call. Acceptable. |
+| R3 | New vecadd/sgemm tests (rewritten on vortex2.h upstream) may use features (events, queue priority) we don't validate end-to-end on gem5. | M3 validation surfaces this. If a test uses an unsupported vortex2.h primitive, file a follow-up; M3 acceptance is contingent on the existing 3-test matrix passing. |
+| R4 | `DevMemAccessor` interface change forces a Vortex `MemSim` rewrite. | `MemSim` already takes a memory backend. v1 wires it to `InProcessDevMem` which delegates to `simx::RAM` — same backing buffer as today. Zero code change in Vortex itself. |
+| R5 | The Phase 3 standalone test loads a kernel via `kernel=` SimObject param, then primes KMU DCRs directly. After the redesign, KMU DCR programming must route through the CP, which means the standalone test needs a tiny one-shot ring submission instead of direct `proc_.dcr_write` calls. | Add a `vortex_gem5_run_standalone_kernel(handle, kernel_path)` C ABI in the device library that builds a synthetic CMD_DCR_WRITE+CMD_LAUNCH ring and runs the CP to completion. ~30 LoC. Keeps the standalone test path real instead of a back-door. |
+| R6 | Vortex's `cycle()` does not handle being called only when scheduled; e.g. internal counters reset assuming consecutive ticks. | Audit during M2. Vortex's existing implementation already supports being suspended (simx uses it both ways). |
+
+---
+
+## 9. Out of scope
+
+- **XLEN=64 device library.** Current setup is XLEN=32 only.
+  Orthogonal.
+- **Separate `ClockDomain` for Vortex vs. CP.** D2 ratifies single
+  domain for v1.
+- **Gem5 `SimpleMemory` backing VRAM via DMA port.** D3 ratifies
+  in-process for v1 with the accessor seam in place. v2 is a
+  follow-on commit that swaps the accessor.
+- **PCIe BAR mapping** instead of raw PIO range. Original gem5_simx_v3
+  §3.6 commits to this; orthogonal to the CP redesign.
+- **Multi-queue host runtime.** Q1–Q3 hardware is there but the host
+  runtime exercises Q0 only. Multi-queue runtime work follows
+  upstream vortex2.h.
+- **Profiling timestamp writeback path.** The upstream CP supports
+  `F_PROFILE` flag + `VX_cp_profiling`; gem5 backend will get it for
+  free once the CP implementation lands. No gem5-specific work.
+
+---
+
+## 10. Estimated effort
+
+Calibrated against the v1 OPAE backend (~3 days from scratch) and the
+recent rejected inline-adaptation attempt (Option A reached ~50%
+completion in ~30 min before being stopped):
+
+- **Phase M1 (merge):** 10 min. Three known conflicts, all mechanical.
+- **Phase M2 (device redesign):** 8–10 h. Bulk of the work:
+  - `dev_mem.{h,cpp}` — 1 h
+  - `vortex_gpgpu.{cpp,h}` rewrite — 4 h (mostly subtraction)
+  - `vortex_gpgpu_dev.{cc,hh}` rewrite — 3 h (event-driven scheduling)
+  - `VortexGPGPU.py` + Makefile + standalone test ABI — 1 h
+- **Phase M3 (host runtime):** 2–3 h. Mostly subtraction; new code is
+  small.
+- **Phase M4 (docs):** 1 h.
+
+**Total: 11–14 h focused work, single commit on `feature_gem5`.**
+
+Calibration vs. the v1 draft of this proposal (which claimed 7–11 h):
+the redesign is longer because (a) event-driven scheduling needs more
+care than a polled tick, (b) the OPAE deletion is comprehensive
+(M4 was "optional" in v1), and (c) we now have to wire the standalone
+test path through a real CP ring submission instead of direct DCR
+writes.
+
+---
+
+## 11. Why not a smaller change?
+
+For the record — the alternatives that were considered and rejected:
+
+- **Adapt-only (the rejected Option A from v1 of this doc).** Embed
+  `vortex::CommandProcessor` in the host runtime; translate each CP
+  hook back into the existing OPAE MMIO protocol. **Rejected:**
+  CP runs on the wrong side of the host/device boundary, every ring
+  fetch costs an MMIO+DMA round trip across the simulated bus,
+  device-side OPAE state machine stays as permanent dead code, no
+  alignment with how opae/xrt do it on real silicon.
+- **Device-side CP, keep OPAE for `mem_upload` data plane.** The v1
+  draft of this proposal. **Rejected:** Two control planes coexist,
+  two protocols to keep in sync, no clean line between "what goes
+  through CP" and "what doesn't."
+- **`VORTEX_USE_CP=0` transparent mode** as a permanent bring-up
+  escape hatch. **Rejected:** defeats the purpose of a cycle-accurate
+  simulator; the gem5 backend's job is to model the hardware, not to
+  emulate around it.
+
+The redesign in this proposal is the minimum that does not leave dead
+code, dead protocols, or bring-up hacks in the final state.
diff --git a/docs/proposals/sst_simx_v3_proposal.md b/docs/proposals/sst_simx_v3_proposal.md
index 3dbe0a00e..65db9ebbb 100644
--- a/docs/proposals/sst_simx_v3_proposal.md
+++ b/docs/proposals/sst_simx_v3_proposal.md
@@ -1,7 +1,7 @@
 # SST Integration for SimX v3 — Proposal
 
 **Date:** 2026-05-03
-**Status:** Draft
+**Status:** Implemented — note that `ci/sst_test_vortex_*.py` have been consolidated into a single generic runner [ci/sst_run_hostless_app.py](../../ci/sst_run_hostless_app.py) (parameterized by `VORTEX_TEST_DIR` + `VORTEX_TEST_KERNEL`, parallel to [ci/gem5_run_hostless_app.py](../../ci/gem5_run_hostless_app.py)). The naming reserves the `ci/sst_run_app.py` slot for a future host-CPU-driven SST integration (none today — see §3). The memHierarchy wiring described in §6 is no longer kept as a standalone test runner; the recipe stays here as documentation. References to specific `sst_test_vortex_<test>.py` filenames below are historical.
 **Author:** Blaise Tine
 **Related:**
 [simx_v3_proposal.md](simx_v3_proposal.md) (Phase 5: TLM data path),
diff --git a/sim/simx/Makefile b/sim/simx/Makefile
index 059484eff..6a743bbca 100644
--- a/sim/simx/Makefile
+++ b/sim/simx/Makefile
@@ -2,8 +2,17 @@ include ../common.mk
 
 DESTDIR ?= $(CURDIR)
 USE_SST ?= 0
+USE_GEM5 ?= 0
 #SST_PKG ?= SST-14.1 # default SST package name
 
+# USE_SST and USE_GEM5 are mutually exclusive — different external
+# simulator wrappers with different LDFLAGS; building both into one
+# binary makes no sense and the proposal docs/proposals/gem5_simx_v3_proposal.md
+# §8 calls this out explicitly.
+ifeq ($(USE_SST)$(USE_GEM5),11)
+$(error USE_SST=1 and USE_GEM5=1 are mutually exclusive)
+endif
+
 OBJ_DIR = $(DESTDIR)/obj
 CONFIG_FILE = $(DESTDIR)/simx_config.stamp
 SRC_DIR = $(VORTEX_HOME)/sim/simx
@@ -24,7 +33,7 @@ LDFLAGS += -Wl,-rpath,$(THIRD_PARTY_DIR)/ramulator -L$(THIRD_PARTY_DIR)/ramulato
 XCONFIGS := $(shell python3 $(ROOT_DIR)/ci/gen_config.py --config=$(VORTEX_HOME)/VX_config.toml --cflags='$(CONFIGS) -DXLEN_$(XLEN)')
 
 # Source files definition
-SRCS = $(SW_COMMON_DIR)/util.cpp $(SIM_COMMON_DIR)/mem.cpp $(SW_COMMON_DIR)/softfloat_ext.cpp $(SW_COMMON_DIR)/rvfloats.cpp $(SIM_COMMON_DIR)/dram_sim.cpp
+SRCS = $(SW_COMMON_DIR)/util.cpp $(SIM_COMMON_DIR)/mem.cpp $(SW_COMMON_DIR)/softfloat_ext.cpp $(SW_COMMON_DIR)/rvfloats.cpp $(SIM_COMMON_DIR)/dram_sim.cpp $(SIM_COMMON_DIR)/CommandProcessor.cpp
 SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(SRC_DIR)/core.cpp $(SRC_DIR)/scheduler.cpp $(SRC_DIR)/cta_dispatcher.cpp $(SRC_DIR)/barrier_unit.cpp
 SRCS += $(SRC_DIR)/kmu/kmu.cpp
 SRCS += $(SRC_DIR)/decode.cpp $(SRC_DIR)/decompressor.cpp $(SRC_DIR)/scoreboard.cpp $(SRC_DIR)/sequencer.cpp $(SRC_DIR)/opc_unit.cpp $(SRC_DIR)/dispatcher.cpp
@@ -96,6 +105,12 @@ ifeq ($(USE_SST),1)
 	SRCS     += $(SRC_DIR)/sst/vortex_simulator.cpp $(SRC_DIR)/sst/vortex_gpgpu.cpp
 endif
 
+# gem5 integration: sources for libvortex-gem5.so (the C ABI library
+# loaded by the gem5 VortexGPGPU SimObject). They stay out of SRCS so
+# the default simx binary does not carry them; their objects are only
+# defined when USE_GEM5=1, so build the library with that flag set.
+VORTEX_GEM5_SRCS := $(SRC_DIR)/gem5/vortex_gpgpu.cpp $(SRC_DIR)/gem5/dev_mem.cpp
+
 # Debugging
 ifdef DEBUG
 	CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG)
@@ -128,17 +143,25 @@ VORTEX_SST_OBJS := $(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/%.o,$(VORTEX_SST_SRCS)
 DEPS += $(VORTEX_SST_OBJS:.o=.d)
 endif
 
+ifeq ($(USE_GEM5), 1)
+VORTEX_GEM5_OBJS := $(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/%.o,$(VORTEX_GEM5_SRCS))
+DEPS             += $(VORTEX_GEM5_OBJS:.o=.d)
+endif
+
 
 # optional: pipe through ccache if you have it
 CXX := $(if $(shell which ccache),ccache $(CXX),$(CXX))
 
 PROJECT := simx
 VORTEX_LIB := libvortex.so
+VORTEX_GEM5_LIB := libvortex-gem5.so
 
-.PHONY: all force clean clean-lib clean-exe clean-obj libvortex clean-libvortex
+.PHONY: all force clean clean-lib clean-exe clean-obj libvortex clean-libvortex libvortex-gem5 clean-libvortex-gem5
 
 ifeq ($(USE_SST), 1)
 all: $(DESTDIR)/$(PROJECT) $(DESTDIR)/$(VORTEX_LIB)
+else ifeq ($(USE_GEM5), 1)
+all: $(DESTDIR)/$(PROJECT) $(DESTDIR)/$(VORTEX_GEM5_LIB)
 else
 all: $(DESTDIR)/$(PROJECT)
 endif
@@ -186,6 +209,13 @@ $(DESTDIR)/$(VORTEX_LIB): $(OBJS) $(VORTEX_SST_OBJS)
 	-shared -o $@ \
 	$(LDFLAGS) $(SST_LFLAGS)
 
+# Vortex gem5 device shared library — the gem5 SimObject dlopens this
+# and calls the C ABI declared in sim/simx/gem5/vortex_gpgpu.h.
+libvortex-gem5: $(DESTDIR)/$(VORTEX_GEM5_LIB)
+
+$(DESTDIR)/$(VORTEX_GEM5_LIB): $(OBJS) $(VORTEX_GEM5_OBJS)
+	$(CXX) $(CXXFLAGS) $^ -shared $(LDFLAGS) -Wl,-soname,$(VORTEX_GEM5_LIB) -o $@
+
 # updates the timestamp when flags changed.
 $(CONFIG_FILE): force
 	@mkdir -p $(@D)
@@ -205,10 +235,13 @@ clean-lib:
 clean-libvortex:
 	rm -f $(DESTDIR)/libvortex.so
 
+clean-libvortex-gem5:
+	rm -f $(DESTDIR)/$(VORTEX_GEM5_LIB)
+
 clean-exe:
 	rm -f $(DESTDIR)/$(PROJECT)
 
 clean-obj:
 	rm -rf $(OBJ_DIR)
 
-clean: clean-lib clean-exe clean-obj
+clean: clean-lib clean-libvortex clean-libvortex-gem5 clean-exe clean-obj
diff --git a/sim/simx/gem5/SConscript b/sim/simx/gem5/SConscript
new file mode 100644
index 000000000..535ada56f
--- /dev/null
+++ b/sim/simx/gem5/SConscript
@@ -0,0 +1,18 @@
+# -*- mode:python -*-
+#
+# Vortex SimObjects for gem5. Installed into $GEM5_HOME/src/dev/vortex/
+# by sim/simx/gem5/install.sh. Picked up automatically by gem5's
+# top-level SConstruct via the SConscript-recursion rule at
+# SConstruct:1000.
+#
+# This file's source of truth lives in the Vortex tree
+# (sim/simx/gem5/SConscript); the installer just copies it.
+
+Import('*')
+
+SimObject('VortexGPGPU.py', sim_objects=['VortexGPGPU'])
+Source('vortex_gpgpu_dev.cc')
+
+# DebugFlag for VortexGPGPU traces. Enable with:
+#   gem5.opt --debug-flags=VortexGPGPU ...
+DebugFlag('VortexGPGPU')
diff --git a/sim/simx/gem5/VortexGPGPU.py b/sim/simx/gem5/VortexGPGPU.py
new file mode 100644
index 000000000..e89168c8b
--- /dev/null
+++ b/sim/simx/gem5/VortexGPGPU.py
@@ -0,0 +1,68 @@
+# Copyright © 2019-2023
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Python SimObject binding for the gem5-side VortexGPGPU device.
+# Mirrors the inheritance graph of the C++ side: DmaDevice → PioDevice
+# → ClockedObject.
+
+from m5.objects.Device import DmaDevice
+from m5.params import *
+
+
+class VortexGPGPU(DmaDevice):
+    type = "VortexGPGPU"
+    cxx_header = "dev/vortex/vortex_gpgpu_dev.hh"
+    cxx_class = "gem5::VortexGPGPU"
+
+    # Path to libvortex-gem5.so produced by `make -C sim/simx
+    # USE_GEM5=1` in the Vortex build dir. Required; the C++ ctor
+    # fatals if empty.
+    library = Param.String("Absolute path to libvortex-gem5.so")
+
+    # Optional kernel image preloaded at startup() via
+    # vortex_gem5_load_kernel. When set, the device runs the kernel to
+    # completion via its own vortexTickEvent_ scheduler and exits the
+    # sim loop on done — no host CPU or MMIO traffic required. This is
+    # the Phase 3 standalone smoke test. Hosted mode (kernel="" or
+    # unset) starts idle; the host runtime drives the CP via MMIO and
+    # the CP schedules its own ticks.
+    kernel = Param.String("", "Optional .vxbin/.bin/.hex to preload at boot")
+
+    # PIO range. After the gem5_v2_cp_migration redesign the PIO range
+    # is exactly the CP regfile: 0x40 of globals + 4 × 0x40 per-queue
+    # slots = 0x140 used today, 0x200 reserved for headroom. Per
+    # gem5_v2_cp_migration_proposal §3 the legacy OPAE register window
+    # is gone.
+    pio_addr    = Param.Addr(0x20000000, "PIO base address (CP regfile)")
+    pio_size    = Param.Addr(0x0200, "PIO region size (CP regfile, bytes)")
+    pio_latency = Param.Latency("1ns", "PIO access latency")
+
+    # BAR-mapped VRAM. The device exposes its in-process simx::RAM
+    # over the same physical-address range the host's PIN_BASE_ADDR
+    # identity-maps to via Process::map(). Host CPU writes land in
+    # the same bytes the CP's dram_read hook and Vortex's MemSim see
+    # — single source of truth for device memory (gem5_v2_cp_migration
+    # §2.2 single data plane).
+    #
+    # Disabled by default (pin_size=0) since the standalone smoke test
+    # uses load_kernel(), not host memcpy through PIN. Hosted (e2e)
+    # tests opt in by setting both pin_addr and pin_size to match the
+    # host runtime's PIN_BASE_ADDR / PIN_REGION_SIZE.
+    pin_addr    = Param.Addr(0x100000000, "VRAM base address (BAR-mapped)")
+    pin_size    = Param.Addr(0, "VRAM region size (bytes); 0 disables")
+
+    # Compile-time CP capacity that this PIO map can address. v1 host
+    # runtime exercises Q0 only; Q1–Q3 hardware is provisioned for
+    # future v2.h multi-queue work (gem5_v2_cp_migration_proposal §2.6,
+    # D4). Matches upstream VX_CP_NUM_QUEUES default.
+    max_queues  = Param.Unsigned(4, "Number of CP queues the PIO map covers")
diff --git a/sim/simx/gem5/dev_mem.cpp b/sim/simx/gem5/dev_mem.cpp
new file mode 100644
index 000000000..16943bd06
--- /dev/null
+++ b/sim/simx/gem5/dev_mem.cpp
@@ -0,0 +1,52 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dev_mem.h"
+
+#include <mem.h>
+
+namespace vortex_gem5 {
+
+namespace {
+
+// Scope guard that switches RAM ACL checks off for one access and back
+// on when the scope exits — including when the access throws, so a
+// failed transfer cannot leave the checks disabled.
+class AclBypass {
+public:
+    explicit AclBypass(vortex::RAM& ram) : ram_(ram) { ram_.enable_acl(false); }
+    ~AclBypass() { ram_.enable_acl(true); }
+    AclBypass(const AclBypass&) = delete;
+    AclBypass& operator=(const AclBypass&) = delete;
+
+private:
+    vortex::RAM& ram_;
+};
+
+} // namespace
+
+// Copies `bytes` bytes starting at device address `addr` into `dst`,
+// with ACL checks suspended for the duration of the access.
+void InProcessDevMem::read(uint64_t addr, void* dst, std::size_t bytes) {
+    AclBypass bypass(ram_);
+    ram_.read(static_cast<uint8_t*>(dst), addr, bytes);
+}
+
+// Copies `bytes` bytes from `src` to device address `addr`, with ACL
+// checks suspended for the duration of the access.
+void InProcessDevMem::write(uint64_t addr, const void* src, std::size_t bytes) {
+    AclBypass bypass(ram_);
+    ram_.write(static_cast<const uint8_t*>(src), addr, bytes);
+}
+
+} // namespace vortex_gem5
diff --git a/sim/simx/gem5/dev_mem.h b/sim/simx/gem5/dev_mem.h
new file mode 100644
index 000000000..cabe7587c
--- /dev/null
+++ b/sim/simx/gem5/dev_mem.h
@@ -0,0 +1,61 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Device-memory accessor seam for the gem5 backend.
+//
+// Per gem5_v2_cp_migration_proposal §2.5: every CP-side device-memory
+// access — ring fetches, completion writebacks, CMD_MEM_* DMA payload —
+// funnels through this interface. In v1 the only implementation is
+// InProcessDevMem (wraps vortex::RAM); Vortex's MemSim still reads and
+// writes that RAM directly. In v2 a DmaPortDevMem will replace it; CP
+// hooks and Vortex memory code are untouched.
+//
+// Layered on top of simx::RAM rather than replacing it because Vortex's
+// existing MemSim already knows how to talk to RAM; we only need the
+// accessor seam for the CP side.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace vortex {
+class RAM;
+} // namespace vortex
+
+namespace vortex_gem5 {
+
+class DevMemAccessor {
+public:
+    virtual ~DevMemAccessor() = default;
+
+    virtual void read (uint64_t addr, void* dst,       std::size_t bytes) = 0;
+    virtual void write(uint64_t addr, const void* src, std::size_t bytes) = 0;
+};
+
+// v1 backing: the simx::RAM the Processor already uses. ACL bypass is
+// the same pattern the simx/rtlsim CP hooks apply (sw/runtime/simx/
+// vortex.cpp:271-280) — the CP/DMA is a peer of the host runtime, not
+// a userspace caller subject to per-region page protections.
+class InProcessDevMem final : public DevMemAccessor {
+public:
+    explicit InProcessDevMem(vortex::RAM& ram) : ram_(ram) {}
+
+    void read (uint64_t addr, void* dst,       std::size_t bytes) override;
+    void write(uint64_t addr, const void* src, std::size_t bytes) override;
+
+private:
+    vortex::RAM& ram_;
+};
+
+} // namespace vortex_gem5
diff --git a/sim/simx/gem5/install.sh b/sim/simx/gem5/install.sh
new file mode 100755
index 000000000..0d171f247
--- /dev/null
+++ b/sim/simx/gem5/install.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Install the Vortex gem5 SimObject sources into a pinned gem5 tree.
+#
+# Copies vortex_gpgpu_dev.{cc,hh}, VortexGPGPU.py, and SConscript into
+# $GEM5_HOME/src/dev/vortex/ so gem5's scons can build them. The
+# source-of-truth lives in the Vortex tree (this directory); any
+# change has to re-run this script before `scons build/<ISA>/gem5.opt`
+# picks it up.
+#
+# Idempotent: re-running just refreshes the files.
+#
+# Usage:
+#   GEM5_HOME=$HOME/tools/gem5 sim/simx/gem5/install.sh
+# or
+#   sim/simx/gem5/install.sh           # $GEM5_HOME from env, else $HOME/tools/gem5
+
+set -e
+
+GEM5_HOME=${GEM5_HOME:-$HOME/tools/gem5}
+SOURCE_DIR=$(dirname "$(readlink -f "$0")")
+
+if [ ! -d "$GEM5_HOME/src/dev" ]; then
+    echo "ERROR: GEM5_HOME=$GEM5_HOME does not look like a gem5 tree" >&2
+    echo "       (expected $GEM5_HOME/src/dev/)" >&2
+    exit 1
+fi
+
+DEST_DIR="$GEM5_HOME/src/dev/vortex"
+mkdir -p "$DEST_DIR"
+
+install -m 0644 "$SOURCE_DIR/vortex_gpgpu_dev.hh" "$DEST_DIR/"
+install -m 0644 "$SOURCE_DIR/vortex_gpgpu_dev.cc" "$DEST_DIR/"
+install -m 0644 "$SOURCE_DIR/VortexGPGPU.py"      "$DEST_DIR/"
+install -m 0644 "$SOURCE_DIR/SConscript"          "$DEST_DIR/"
+
+echo "Vortex SimObjects installed at $DEST_DIR"
+echo "Files:"
+ls -1 "$DEST_DIR" | sed 's/^/  /'
+echo ""
+echo "Re-build gem5 with one or both of:"
+echo "  scons -C $GEM5_HOME build/X86/gem5.opt -j\$(nproc)"
+echo "  scons -C $GEM5_HOME build/ARM/gem5.opt -j\$(nproc)"
diff --git a/sim/simx/gem5/vortex_gpgpu.cpp b/sim/simx/gem5/vortex_gpgpu.cpp
new file mode 100644
index 000000000..fc2562cfd
--- /dev/null
+++ b/sim/simx/gem5/vortex_gpgpu.cpp
@@ -0,0 +1,299 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "vortex_gpgpu.h"
+
+#include "constants.h"
+#include "dev_mem.h"
+#include "processor.h"
+#include <CommandProcessor.h>
+#include <mem.h>
+#include <util.h>
+#include <VX_config.h>
+#include <VX_types.h>
+
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <memory>
+#include <string>
+
+using namespace vortex;
+
+// Mirrors sw/runtime/common/common.h's GLOBAL_MEM_SIZE so the bounds
+// check in vram_{read,write} matches what the host runtime enforces.
+// Inlined rather than including common.h because that header drags in
+// the full runtime ABI which a device library has no business touching.
+#if (XLEN == 64)
+static constexpr uint64_t GEM5_GLOBAL_MEM_SIZE = 0x200000000ull;  // 8 GB
+#else
+static constexpr uint64_t GEM5_GLOBAL_MEM_SIZE = 0x100000000ull;  // 4 GB
+#endif
+
+namespace {
+
+// Gem5Device — owns the Vortex Processor + RAM + CommandProcessor
+// triplet. The CP's hooks call back into proc_/dev_mem_, and the
+// SimObject drives cp_tick / vortex_tick on independent gem5 events.
+class Gem5Device {
+public:
+    Gem5Device()
+        : ram_(0, MEM_PAGE_SIZE),
+          proc_(std::make_unique<Processor>()),
+          dev_mem_(std::make_unique<vortex_gem5::InProcessDevMem>(ram_)),
+          cp_(make_cp_hooks()) {
+        proc_->attach_ram(&ram_);
+    }
+
+    ~Gem5Device() = default;
+
+    // ---------------- Standalone (Phase 3) kernel preload ---------------
+    // Primes the KMU DCRs for a 1×1×1 CTA at STARTUP_ADDR and loads the
+    // ELF/bin/hex into VRAM. After this, calling vortex_tick repeatedly
+    // dispatches the kernel to completion (ProcessorImpl::cycle's lazy
+    // init resets SimPlatform and calls kmu_->start() on first tick).
+    // The hosted (CP-driven) path never calls this — kernel ELFs land
+    // in VRAM via mem_upload, and KMU programming goes through CMD_DCR_*.
+    bool load_kernel(const std::string& path) {
+        const uint64_t startup_addr(STARTUP_ADDR);
+        proc_->dcr_write(VX_DCR_KMU_STARTUP_ADDR0, startup_addr & 0xffffffff);
+    #if (XLEN == 64)
+        proc_->dcr_write(VX_DCR_KMU_STARTUP_ADDR1, startup_addr >> 32);
+    #endif
+        proc_->dcr_write(VX_DCR_KMU_STARTUP_ARG0, 0);
+        proc_->dcr_write(VX_DCR_KMU_STARTUP_ARG1, 0);
+        proc_->dcr_write(VX_DCR_KMU_GRID_DIM_X,   1);
+        proc_->dcr_write(VX_DCR_KMU_GRID_DIM_Y,   1);
+        proc_->dcr_write(VX_DCR_KMU_GRID_DIM_Z,   1);
+        proc_->dcr_write(VX_DCR_KMU_BLOCK_DIM_X,  1);
+        proc_->dcr_write(VX_DCR_KMU_BLOCK_DIM_Y,  1);
+        proc_->dcr_write(VX_DCR_KMU_BLOCK_DIM_Z,  1);
+        proc_->dcr_write(VX_DCR_KMU_LMEM_SIZE,    0);
+        proc_->dcr_write(VX_DCR_KMU_BLOCK_SIZE,   1);
+        proc_->dcr_write(VX_DCR_KMU_WARP_STEP_X,  NUM_THREADS);
+        proc_->dcr_write(VX_DCR_KMU_WARP_STEP_Y,  0);
+        proc_->dcr_write(VX_DCR_KMU_WARP_STEP_Z,  0);
+
+        std::string ext(fileExtension(path.c_str()));
+        if (ext == "vxbin") {
+            ram_.loadVxImage(path.c_str());
+        } else if (ext == "bin") {
+            ram_.loadBinImage(path.c_str(), startup_addr);
+        } else if (ext == "hex") {
+            ram_.loadHexImage(path.c_str());
+        } else {
+            std::cerr << "vortex_gem5: unsupported kernel extension '" << ext
+                      << "' (need .vxbin, .bin, or .hex)" << std::endl;
+            return false;
+        }
+        // Mark the device as "running" so the SimObject's standalone
+        // path advances vortexTickEvent_ until ProcessorImpl::cycle()
+        // reports done. Hosted launches set this via vortex_start.
+        vortex_running_ = true;
+        return true;
+    }
+
+    // ---------------- VRAM direct access --------------------------------
+    // Drops (debug builds log) any write not fully inside VRAM; wrap-safe.
+    void vram_write(uint64_t addr, const uint8_t* src, uint32_t size) {
+        if (addr > GEM5_GLOBAL_MEM_SIZE || size > GEM5_GLOBAL_MEM_SIZE - addr) {
+        #ifndef NDEBUG
+            std::cerr << "vortex_gem5: vram_write overflow addr=0x" << std::hex
+                      << addr << " size=" << std::dec << size << std::endl;
+        #endif
+            return;
+        }
+        dev_mem_->write(addr, src, size);
+    }
+    // Drops (debug builds log) any read not fully inside VRAM; wrap-safe.
+    void vram_read(uint64_t addr, uint8_t* dst, uint32_t size) {
+        if (addr > GEM5_GLOBAL_MEM_SIZE || size > GEM5_GLOBAL_MEM_SIZE - addr) {
+        #ifndef NDEBUG
+            std::cerr << "vortex_gem5: vram_read overflow addr=0x" << std::hex
+                      << addr << " size=" << std::dec << size << std::endl;
+        #endif
+            return;
+        }
+        dev_mem_->read(addr, dst, size);
+    }
+
+    // ---------------- CP regfile MMIO -----------------------------------
+    // The SimObject's PIO handlers translate `cp_mmio_write(off,v)` to
+    // a single call here. The CommandProcessor's regfile is 32-bit and
+    // its address map is documented in sim/common/CommandProcessor.h.
+    void cp_mmio_write(uint32_t off, uint32_t value) { cp_.mmio_write(off, value); }
+    uint32_t cp_mmio_read (uint32_t off) const       { return cp_.mmio_read(off); }
+
+    // ---------------- CP tick / introspection ---------------------------
+    // tick() advances the CP one functional cycle and returns true iff
+    // the CP still has work to do. The SimObject reschedules
+    // cpTickEvent_ while true and sleeps otherwise — proposal §2.3.
+    bool cp_tick() {
+        cp_.tick();
+        return cp_.busy();
+    }
+    bool cp_has_work() const { return cp_.enabled() && cp_.busy(); }
+
+    // ---------------- Vortex tick / introspection -----------------------
+    // vortex_tick advances ProcessorImpl::cycle() one step. cycle() does
+    // lazy init (resets SimPlatform + calls kmu_->start()) on first call.
+    // For back-to-back launches the CP's vortex_start hook calls
+    // proc_->start_kmu() explicitly to re-arm the KMU for the next
+    // kernel (kmu_->start is idempotent — first launch redundantly
+    // re-starts inside the lazy init, no harm).
+    bool vortex_tick() {
+        bool still_running = proc_->cycle();
+        if (!still_running) {
+            vortex_running_ = false;
+        }
+        return vortex_running_;
+    }
+    bool vortex_busy() const { return vortex_running_; }
+
+    // ---------------- vortex_start handler registration -----------------
+    // The SimObject registers a callback the CP fires when retiring a
+    // CMD_LAUNCH. The callback schedules vortexTickEvent_ at the next
+    // clock edge, decoupling CP and Vortex tick chains (proposal §2.4).
+    void set_start_handler(vortex_gem5_start_handler_t fn, void* ctx) {
+        start_fn_  = fn;
+        start_ctx_ = ctx;
+    }
+
+private:
+    vortex::CommandProcessor::Hooks make_cp_hooks() {
+        vortex::CommandProcessor::Hooks h;
+        h.dram_read = [this](uint64_t addr, void* dst, std::size_t bytes) {
+            dev_mem_->read(addr, dst, bytes);
+        };
+        h.dram_write = [this](uint64_t addr, const void* src, std::size_t bytes) {
+            dev_mem_->write(addr, src, bytes);
+        };
+        h.vortex_dcr_write = [this](uint32_t addr, uint32_t value) {
+            proc_->dcr_write(addr, value);
+        };
+        h.vortex_dcr_read = [this](uint32_t addr, uint32_t tag) -> uint32_t {
+            uint32_t v = 0;
+            proc_->dcr_read(addr, tag, &v);
+            return v;
+        };
+        h.vortex_start = [this]() {
+            // Mark Vortex as in-flight so vortex_busy returns true on
+            // the very next CP poll (before the first cycle() runs).
+            // Then re-arm the KMU for the (possibly back-to-back)
+            // kernel and ask the SimObject to begin ticking Vortex.
+            vortex_running_ = true;
+            proc_->start_kmu();
+            if (start_fn_) start_fn_(start_ctx_);
+        };
+        h.vortex_busy = [this]() -> bool { return vortex_running_; };
+        return h;
+    }
+
+    RAM ram_;
+    std::unique_ptr<Processor> proc_;
+    std::unique_ptr<vortex_gem5::DevMemAccessor> dev_mem_;
+    vortex::CommandProcessor cp_;
+    bool vortex_running_ = false;
+    vortex_gem5_start_handler_t start_fn_  = nullptr;
+    void* start_ctx_ = nullptr;
+};
+
+} // namespace
+
+// ----- C ABI -----------------------------------------------------------------
+
+extern "C" {
+
+const char* vortex_gem5_build_info(void) {
+    static char info[256];
+    std::snprintf(info, sizeof(info),
+                  "vortex-gem5 (XLEN=%d, threads=%d, warps=%d, cores=%d, clusters=%d)",
+                  XLEN, NUM_THREADS, NUM_WARPS, NUM_CORES, NUM_CLUSTERS);
+    return info;
+}
+
+vortex_gem5_handle_t vortex_gem5_create(void) {
+    try {
+        return reinterpret_cast<vortex_gem5_handle_t>(new Gem5Device());
+    } catch (const std::exception& e) {
+        std::cerr << "vortex_gem5_create: " << e.what() << std::endl;
+        return nullptr;
+    } catch (...) {
+        std::cerr << "vortex_gem5_create: unknown exception" << std::endl;
+        return nullptr;
+    }
+}
+
+void vortex_gem5_destroy(vortex_gem5_handle_t h) {
+    if (h == nullptr) return;
+    delete reinterpret_cast<Gem5Device*>(h);
+}
+
+void vortex_gem5_set_start_handler(vortex_gem5_handle_t h,
+                                   vortex_gem5_start_handler_t fn,
+                                   void* ctx) {
+    if (h == nullptr) return;
+    reinterpret_cast<Gem5Device*>(h)->set_start_handler(fn, ctx);
+}
+
+int vortex_gem5_load_kernel(vortex_gem5_handle_t h, const char* path) {
+    if (h == nullptr || path == nullptr) return -1;
+    return reinterpret_cast<Gem5Device*>(h)->load_kernel(path) ? 0 : -1;
+}
+
+void vortex_gem5_cp_mmio_write(vortex_gem5_handle_t h,
+                               uint32_t off, uint32_t value) {
+    if (h == nullptr) return;
+    reinterpret_cast<Gem5Device*>(h)->cp_mmio_write(off, value);
+}
+
+uint32_t vortex_gem5_cp_mmio_read(vortex_gem5_handle_t h, uint32_t off) {
+    if (h == nullptr) return 0;
+    return reinterpret_cast<Gem5Device*>(h)->cp_mmio_read(off);
+}
+
+bool vortex_gem5_cp_tick(vortex_gem5_handle_t h) {
+    if (h == nullptr) return false;
+    return reinterpret_cast<Gem5Device*>(h)->cp_tick();
+}
+
+bool vortex_gem5_cp_has_work(vortex_gem5_handle_t h) {
+    if (h == nullptr) return false;
+    return reinterpret_cast<Gem5Device*>(h)->cp_has_work();
+}
+
+bool vortex_gem5_vortex_tick(vortex_gem5_handle_t h) {
+    if (h == nullptr) return false;
+    return reinterpret_cast<Gem5Device*>(h)->vortex_tick();
+}
+
+bool vortex_gem5_vortex_busy(vortex_gem5_handle_t h) {
+    if (h == nullptr) return false;
+    return reinterpret_cast<Gem5Device*>(h)->vortex_busy();
+}
+
+void vortex_gem5_vram_write(vortex_gem5_handle_t h,
+                            uint64_t dev_addr, const uint8_t* src,
+                            uint32_t size) {
+    if (h == nullptr || src == nullptr) return;
+    reinterpret_cast<Gem5Device*>(h)->vram_write(dev_addr, src, size);
+}
+
+void vortex_gem5_vram_read(vortex_gem5_handle_t h,
+                           uint64_t dev_addr, uint8_t* dst,
+                           uint32_t size) {
+    if (h == nullptr || dst == nullptr) return;
+    reinterpret_cast<Gem5Device*>(h)->vram_read(dev_addr, dst, size);
+}
+
+} // extern "C"
diff --git a/sim/simx/gem5/vortex_gpgpu.h b/sim/simx/gem5/vortex_gpgpu.h
new file mode 100644
index 000000000..09cca06be
--- /dev/null
+++ b/sim/simx/gem5/vortex_gpgpu.h
@@ -0,0 +1,129 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// libvortex-gem5 — C ABI for the gem5 VortexGPGPU SimObject.
+//
+// Per gem5_v2_cp_migration_proposal §5.1 the device library hosts a
+// vortex::Processor + vortex::CommandProcessor pair, exposes a 32-bit
+// CP MMIO regfile (PIO_BASE_ADDR + 0x0 .. + 0x1FF), and provides two
+// independently-tickable engines so the SimObject can drive CP and
+// Vortex as separate gem5 event chains:
+//
+//     cpTickEvent_      -> vortex_gem5_cp_tick()
+//     vortexTickEvent_  -> vortex_gem5_vortex_tick()
+//
+// Both engines self-report whether they still have work via
+// vortex_gem5_cp_has_work() / vortex_gem5_vortex_busy(); the SimObject
+// uses those to decide whether to reschedule. The CP's vortex_start
+// hook calls back into the SimObject via the start-handler registered
+// at construction so a CMD_LAUNCH retirement schedules vortexTickEvent_
+// from inside cpTickEvent_'s execution.
+//
+// The ABI is C — not C++ — so the gem5 side does not depend on SimX's
+// internal types and can be rebuilt against a new gem5 release without
+// touching anything Vortex-side.
+//
+// Concurrency: all calls are serialized on the gem5 event-loop thread.
+// No internal locking. No re-entrancy.
+
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Opaque handle. Owns a vortex::Processor, its RAM, a device-memory
+// accessor, and a vortex::CommandProcessor.
+typedef struct vortex_gem5_device_s* vortex_gem5_handle_t;
+
+// Returns a printable description of the build config (cores, warps,
+// threads, XLEN). Returned pointer is static; do not free.
+const char* vortex_gem5_build_info(void);
+
+// Construct a Vortex device instance. Returns NULL on failure.
+// VRAM is allocated lazily; no kernel is loaded until
+// vortex_gem5_load_kernel is called.
+vortex_gem5_handle_t vortex_gem5_create(void);
+
+// Destroy the device. Safe to call with NULL.
+void vortex_gem5_destroy(vortex_gem5_handle_t h);
+
+// Register a callback the device library invokes from inside its CP
+// vortex_start hook. The SimObject uses this to schedule its Vortex
+// tick event when the CP launches a kernel. Pass NULL to clear.
+// `ctx` is forwarded back unchanged.
+typedef void (*vortex_gem5_start_handler_t)(void* ctx);
+void vortex_gem5_set_start_handler(vortex_gem5_handle_t h,
+                                   vortex_gem5_start_handler_t fn,
+                                   void* ctx);
+
+// Load a kernel image into VRAM. Accepts .vxbin / .bin / .hex (same
+// shape as sim/simx/main.cpp). Primes the KMU DCRs for a 1×1×1 CTA
+// at STARTUP_ADDR for the Phase 3 standalone test path (in hosted
+// mode the dispatcher uploads kernels via mem_upload + programs KMU
+// DCRs via CMD_DCR_WRITE through the CP).
+//
+// Returns 0 on success, -1 on a NULL handle/path or an unsupported extension.
+int vortex_gem5_load_kernel(vortex_gem5_handle_t h, const char* path);
+
+// CP regfile MMIO. `off` is the CP-internal byte offset (0..0x13F for
+// queue 0; see sim/common/CommandProcessor.h §address map). All
+// accesses are 32-bit. The SimObject translates a PIO packet at
+// `PIO_BASE_ADDR + off` into one of these calls; the host runtime's
+// cp_mmio_{write,read} translates `cp_mmio_write(off, v)` to one of
+// these via a 32-bit PIO write at `PIO_BASE_ADDR + off` (no AFU bit-12
+// split — the gem5 device's PIO range IS the CP regfile).
+void     vortex_gem5_cp_mmio_write(vortex_gem5_handle_t h,
+                                   uint32_t off, uint32_t value);
+uint32_t vortex_gem5_cp_mmio_read (vortex_gem5_handle_t h, uint32_t off);
+
+// Advance the embedded CommandProcessor by one functional cycle.
+// Returns true if the CP has more work (ring non-empty, command in
+// flight) and should be ticked again.
+bool vortex_gem5_cp_tick(vortex_gem5_handle_t h);
+
+// True iff the CP would benefit from being ticked: enabled and busy.
+// The SimObject uses this from PIO write handlers (after a CP regfile
+// update may have armed work) to decide whether to schedule
+// cpTickEvent_.
+bool vortex_gem5_cp_has_work(vortex_gem5_handle_t h);
+
+// Advance the Vortex Processor by one cycle. Returns true while the
+// processor is still running (clusters active or channels carrying
+// packets); the SimObject's vortexTickEvent_ reschedules itself while
+// this returns true and stops otherwise.
+bool vortex_gem5_vortex_tick(vortex_gem5_handle_t h);
+
+// True iff Vortex is currently executing a kernel (any cluster
+// running, any in-flight memory transactions). Used by the CP's
+// vortex_busy hook to know when to retire a CMD_LAUNCH.
+bool vortex_gem5_vortex_busy(vortex_gem5_handle_t h);
+
+// Direct device-VRAM access used by the SimObject's DMA-path scratch
+// buffers in v1 (a peer of the host runtime, ACL-bypassed). v2 will
+// route both Vortex memory and CP DMA through gem5's memory hierarchy
+// via the same DevMemAccessor interface.
+void vortex_gem5_vram_write(vortex_gem5_handle_t h,
+                            uint64_t dev_addr, const uint8_t* src,
+                            uint32_t size);
+void vortex_gem5_vram_read (vortex_gem5_handle_t h,
+                            uint64_t dev_addr, uint8_t* dst,
+                            uint32_t size);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
diff --git a/sim/simx/gem5/vortex_gpgpu_dev.cc b/sim/simx/gem5/vortex_gpgpu_dev.cc
new file mode 100644
index 000000000..8bc54899c
--- /dev/null
+++ b/sim/simx/gem5/vortex_gpgpu_dev.cc
@@ -0,0 +1,277 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dev/vortex/vortex_gpgpu_dev.hh"
+
+#include "base/logging.hh"
+#include "base/trace.hh"
+#include "mem/packet_access.hh"
+#include "sim/sim_exit.hh"
+
+#include <dlfcn.h>
+
+namespace gem5
+{
+
+namespace {
+
+// Resolve `symbol` from an already-dlopen()ed `handle` and cast it to the
+// function-pointer type T. Never returns a null pointer: a failed lookup
+// terminates through fatal(), naming the symbol and `libpath` (which is
+// used only for that diagnostic).
+template <typename T>
+T dlsym_or_fatal(void* handle, const char* symbol, const char* libpath)
+{
+    // Drop any stale error left by an earlier dl* call so the text
+    // reported below belongs to this lookup.
+    dlerror();
+    void* p = dlsym(handle, symbol);
+    if (p == nullptr) {
+        // dlerror() itself returns NULL when the symbol exists but its
+        // value is NULL; never hand a null char* to the formatter.
+        const char* err = dlerror();
+        fatal("VortexGPGPU: dlsym(%s) failed in %s: %s",
+              symbol, libpath,
+              err != nullptr ? err : "symbol resolved to NULL");
+    }
+    return reinterpret_cast<T>(p);
+}
+
+} // namespace
+
+// Construct the device: load the library named by the `library` param,
+// bind every vortex_gem5_* entry point into abi_, create the device
+// instance and register the vortex_start trampoline. Every failure on
+// this path is reported through fatal().
+VortexGPGPU::VortexGPGPU(const Params &p)
+  : DmaDevice(p),
+    libHandle_(nullptr),
+    deviceHandle_(nullptr),
+    abi_{},
+    libraryPath_(p.library),
+    kernelPath_(p.kernel),
+    pioAddr_(p.pio_addr),
+    pioSize_(p.pio_size),
+    pinAddr_(p.pin_addr),
+    pinSize_(p.pin_size),
+    pioLatency_(p.pio_latency),
+    cpTickEvent_([this]{ this->cpTick(); }, name() + ".cpTickEvent"),
+    vortexTickEvent_([this]{ this->vortexTick(); }, name() + ".vortexTickEvent"),
+    standalone_(false)
+{
+    if (libraryPath_.empty()) {
+        fatal("VortexGPGPU: 'library' parameter is required "
+              "(path to libvortex-gem5.so)");
+    }
+
+    // RTLD_NOW: bind all of the library's own undefined symbols here, so
+    // a broken dependency makes dlopen() fail at construction instead of
+    // aborting the process on the first call into the library
+    // mid-simulation (which is what lazy binding would do).
+    libHandle_ = dlopen(libraryPath_.c_str(), RTLD_NOW | RTLD_LOCAL);
+    if (libHandle_ == nullptr) {
+        fatal("VortexGPGPU: dlopen('%s') failed: %s",
+              libraryPath_, dlerror());
+    }
+
+    // Resolve the v2 ABI surface. Any missing symbol is a hard build
+    // mismatch — fatal at construction rather than mid-simulation.
+    // Each pointer type is taken from the AbiV2 field it is stored in,
+    // so the cast cannot drift from the struct declaration.
+    const char* const lib = libraryPath_.c_str();
+    abi_.build_info        = dlsym_or_fatal<decltype(abi_.build_info)>
+                              (libHandle_, "vortex_gem5_build_info",        lib);
+    abi_.create            = dlsym_or_fatal<decltype(abi_.create)>
+                              (libHandle_, "vortex_gem5_create",            lib);
+    abi_.destroy           = dlsym_or_fatal<decltype(abi_.destroy)>
+                              (libHandle_, "vortex_gem5_destroy",           lib);
+    abi_.set_start_handler = dlsym_or_fatal<decltype(abi_.set_start_handler)>
+                              (libHandle_, "vortex_gem5_set_start_handler", lib);
+    abi_.load_kernel       = dlsym_or_fatal<decltype(abi_.load_kernel)>
+                              (libHandle_, "vortex_gem5_load_kernel",       lib);
+    abi_.cp_mmio_write     = dlsym_or_fatal<decltype(abi_.cp_mmio_write)>
+                              (libHandle_, "vortex_gem5_cp_mmio_write",     lib);
+    abi_.cp_mmio_read      = dlsym_or_fatal<decltype(abi_.cp_mmio_read)>
+                              (libHandle_, "vortex_gem5_cp_mmio_read",      lib);
+    abi_.cp_tick           = dlsym_or_fatal<decltype(abi_.cp_tick)>
+                              (libHandle_, "vortex_gem5_cp_tick",           lib);
+    abi_.cp_has_work       = dlsym_or_fatal<decltype(abi_.cp_has_work)>
+                              (libHandle_, "vortex_gem5_cp_has_work",       lib);
+    abi_.vortex_tick       = dlsym_or_fatal<decltype(abi_.vortex_tick)>
+                              (libHandle_, "vortex_gem5_vortex_tick",       lib);
+    abi_.vortex_busy       = dlsym_or_fatal<decltype(abi_.vortex_busy)>
+                              (libHandle_, "vortex_gem5_vortex_busy",       lib);
+    abi_.vram_write        = dlsym_or_fatal<decltype(abi_.vram_write)>
+                              (libHandle_, "vortex_gem5_vram_write",        lib);
+    abi_.vram_read         = dlsym_or_fatal<decltype(abi_.vram_read)>
+                              (libHandle_, "vortex_gem5_vram_read",         lib);
+
+    inform("VortexGPGPU: %s", abi_.build_info());
+    inform("VortexGPGPU: library=%s", libraryPath_);
+    inform("VortexGPGPU: pio[CP regfile]=[0x%llx,+0x%llx)",
+           static_cast<unsigned long long>(pioAddr_),
+           static_cast<unsigned long long>(pioSize_));
+    // The VRAM window is optional; it is only reported (and only
+    // advertised by getAddrRanges) when its size is non-zero.
+    if (pinSize_ != 0) {
+        inform("VortexGPGPU: pin[BAR-mapped VRAM]=[0x%llx,+0x%llx)",
+               static_cast<unsigned long long>(pinAddr_),
+               static_cast<unsigned long long>(pinSize_));
+    }
+
+    deviceHandle_ = abi_.create();
+    if (deviceHandle_ == nullptr) {
+        fatal("VortexGPGPU: vortex_gem5_create returned NULL");
+    }
+
+    // Register the vortex_start trampoline so the CP can schedule
+    // Vortex ticks from inside cp_tick (proposal §2.4).
+    abi_.set_start_handler(deviceHandle_, &VortexGPGPU::onVortexStartTrampoline, this);
+}
+
+// Tear down in the reverse order of construction: the device instance
+// goes first (its destroy routine lives in the library), then the
+// library handle itself.
+VortexGPGPU::~VortexGPGPU()
+{
+    const bool can_destroy =
+        (deviceHandle_ != nullptr) && (abi_.destroy != nullptr);
+    if (can_destroy) {
+        abi_.destroy(deviceHandle_);
+    }
+
+    if (libHandle_ == nullptr) {
+        return;
+    }
+    dlclose(libHandle_);
+}
+
+// SimObject init hook. The device adds no work of its own at this stage;
+// the call is forwarded to the DmaDevice base unchanged.
+void
+VortexGPGPU::init()
+{
+    DmaDevice::init();
+}
+
+// SimObject startup hook. Picks the operating mode from the `kernel`
+// param: empty means hosted (idle until the host programs the CP),
+// non-empty means standalone (preload the kernel and start ticking
+// Vortex immediately).
+void
+VortexGPGPU::startup()
+{
+    DmaDevice::startup();
+
+    if (kernelPath_.empty()) {
+        // Hosted mode (proposal §4): the host runtime issues CP MMIO
+        // writes to configure queues + commits commands; the CP
+        // schedules its own ticks via maybeWakeCp() and the vortex
+        // tick via the start handler. Nothing is scheduled at boot.
+        inform("VortexGPGPU: hosted mode (waiting for CP enable)");
+        standalone_ = false;
+        return;
+    }
+
+    // Standalone mode (Phase 3): preload a kernel and self-drive to
+    // completion. No host CPU, no CP. This path exists as a smoke test
+    // for the device library.
+    inform("VortexGPGPU: standalone mode (preload + auto-tick)");
+    inform("VortexGPGPU: preloading kernel=%s", kernelPath_);
+    const int load_status =
+        abi_.load_kernel(deviceHandle_, kernelPath_.c_str());
+    if (load_status != 0) {
+        fatal("VortexGPGPU: vortex_gem5_load_kernel('%s') failed",
+              kernelPath_);
+    }
+    standalone_ = true;
+    schedule(vortexTickEvent_, clockEdge(Cycles(1)));
+}
+
+// cpTickEvent_ callback: advance the CP by one tick and keep the event
+// alive only while the library reports more work.
+void
+VortexGPGPU::cpTick()
+{
+    if (!abi_.cp_tick(deviceHandle_)) {
+        // Idle drop-out: no reschedule. PIO writes that arm new work
+        // go through maybeWakeCp(), which schedules the event again.
+        return;
+    }
+    schedule(cpTickEvent_, clockEdge(Cycles(1)));
+}
+
+// vortexTickEvent_ callback: advance Vortex by one cycle. While the
+// library reports it is still running the event re-arms itself; once
+// it stops, standalone mode ends the simulation and hosted mode hands
+// control back to the CP.
+void
+VortexGPGPU::vortexTick()
+{
+    if (abi_.vortex_tick(deviceHandle_)) {
+        schedule(vortexTickEvent_, clockEdge(Cycles(1)));
+        return;
+    }
+
+    if (!standalone_) {
+        // Hosted mode: Vortex finished. The CP's launch FSM observes
+        // vortex_busy() == false on its next tick and retires the
+        // CMD_LAUNCH. If the CP has no tick pending it must be woken
+        // here, otherwise that retirement never happens.
+        maybeWakeCp();
+        return;
+    }
+
+    inform("VortexGPGPU: standalone kernel complete — exiting sim loop");
+    exitSimLoop("VortexGPGPU: kernel complete");
+}
+
+// Schedule cpTickEvent_ for the next clock edge when the library says
+// the CP has work and no CP tick is already pending.
+void
+VortexGPGPU::maybeWakeCp()
+{
+    const bool cp_armed = abi_.cp_has_work(deviceHandle_);
+    if (!cp_armed || cpTickEvent_.scheduled()) {
+        return;
+    }
+    schedule(cpTickEvent_, clockEdge(Cycles(1)));
+}
+
+// C-callable entry point handed to the library; `ctx` is the `this`
+// pointer registered in the constructor.
+void
+VortexGPGPU::onVortexStartTrampoline(void* ctx)
+{
+    auto* self = static_cast<VortexGPGPU*>(ctx);
+    self->onVortexStart();
+}
+
+// Start-handler body: make sure a Vortex tick is pending, without
+// double-scheduling an event that is already queued.
+void
+VortexGPGPU::onVortexStart()
+{
+    if (vortexTickEvent_.scheduled()) {
+        return;
+    }
+    schedule(vortexTickEvent_, clockEdge(Cycles(1)));
+}
+
+// PIO read handler. Addresses inside [pioAddr_, pioAddr_ + pioSize_)
+// are CP regfile reads; every other address that reaches this device
+// is treated as an offset into the VRAM window based at pinAddr_.
+// Both paths complete the packet and charge pioLatency_.
+Tick
+VortexGPGPU::read(PacketPtr pkt)
+{
+    const Addr addr = pkt->getAddr();
+    const bool in_cp_regfile =
+        (addr >= pioAddr_) && (addr < pioAddr_ + pioSize_);
+
+    if (!in_cp_regfile) {
+        // BAR-mapped VRAM access (CPU is reading device memory
+        // directly). Variable-width packet (host load / cache-line
+        // fill): bytes are copied straight into the packet buffer.
+        const uint64_t vram_off = addr - pinAddr_;
+        abi_.vram_read(deviceHandle_,
+                       vram_off,
+                       pkt->getPtr<uint8_t>(),
+                       uint32_t(pkt->getSize()));
+    } else {
+        // CP regfile access — 32-bit only.
+        const uint32_t reg_off = uint32_t(addr - pioAddr_);
+        const uint32_t reg_val = abi_.cp_mmio_read(deviceHandle_, reg_off);
+        pkt->setUintX(static_cast<uint64_t>(reg_val), ByteOrder::little);
+    }
+
+    pkt->makeAtomicResponse();
+    return pioLatency_;
+}
+
+// PIO write handler. Addresses inside [pioAddr_, pioAddr_ + pioSize_)
+// are CP regfile writes (which may wake the CP); every other address
+// that reaches this device is treated as an offset into the VRAM
+// window based at pinAddr_. Both paths complete the packet and charge
+// pioLatency_.
+Tick
+VortexGPGPU::write(PacketPtr pkt)
+{
+    const Addr addr = pkt->getAddr();
+    const bool in_cp_regfile =
+        (addr >= pioAddr_) && (addr < pioAddr_ + pioSize_);
+
+    if (!in_cp_regfile) {
+        // BAR-mapped VRAM write — variable-width packet (host store /
+        // cache writeback). Forwarded into the library so subsequent
+        // CP dram_read / Vortex MemSim reads at the same offset see
+        // the bytes the CPU just wrote.
+        //
+        // Such a write may seed CP ring entries, but a dormant CP is
+        // deliberately left dormant: it only wakes on a doorbell PIO
+        // write, not on a ring-fill.
+        const uint64_t vram_off = addr - pinAddr_;
+        abi_.vram_write(deviceHandle_,
+                        vram_off,
+                        pkt->getConstPtr<uint8_t>(),
+                        uint32_t(pkt->getSize()));
+    } else {
+        // CP regfile write — 32-bit only; the packet payload is
+        // truncated to its low 32 bits.
+        const uint32_t reg_off = uint32_t(addr - pioAddr_);
+        const uint64_t payload = pkt->getUintX(ByteOrder::little);
+        abi_.cp_mmio_write(deviceHandle_, reg_off, uint32_t(payload));
+        maybeWakeCp();
+    }
+
+    pkt->makeAtomicResponse();
+    return pioLatency_;
+}
+
+// Address ranges this device answers to: the CP regfile window always,
+// plus the VRAM window when it was configured with a non-zero size.
+AddrRangeList
+VortexGPGPU::getAddrRanges() const
+{
+    AddrRangeList claimed{RangeSize(pioAddr_, pioSize_)};
+    const bool has_vram_window = (pinSize_ != 0);
+    if (has_vram_window) {
+        claimed.push_back(RangeSize(pinAddr_, pinSize_));
+    }
+    return claimed;
+}
+
+} // namespace gem5
diff --git a/sim/simx/gem5/vortex_gpgpu_dev.hh b/sim/simx/gem5/vortex_gpgpu_dev.hh
new file mode 100644
index 000000000..54aab326f
--- /dev/null
+++ b/sim/simx/gem5/vortex_gpgpu_dev.hh
@@ -0,0 +1,133 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// VortexGPGPU — gem5 SimObject wrapper for libvortex-gem5.so.
+//
+// Lives at $GEM5_HOME/src/dev/vortex/vortex_gpgpu_dev.{cc,hh} after
+// sim/simx/gem5/install.sh runs. The host-side source of truth is the
+// Vortex tree (sim/simx/gem5/) so API drift between gem5 and the Vortex
+// C ABI surfaces as a build error in Vortex CI, not as a gem5
+// integration mystery.
+//
+// Design (gem5_v2_cp_migration_proposal §2.3, §2.4):
+//   - dlopen the Vortex library at construction; resolve all
+//     vortex_gem5_* symbols up-front so the hot paths (cpTick,
+//     vortexTick, PIO read/write) are direct indirect calls.
+//   - PIO range is exactly the CP regfile (PIO_BASE_ADDR + 0..+0x1FF,
+//     proposal §3); no legacy OPAE register window.
+//   - cpTickEvent_ self-schedules only while the CP has work; goes
+//     dormant otherwise (proposal §2.3). PIO writes that may have
+//     armed work re-arm the schedule.
+//   - vortexTickEvent_ self-schedules only while Vortex is running;
+//     scheduled by the CP's vortex_start hook via the registered
+//     start handler (proposal §2.4). Standalone mode skips the CP
+//     and schedules vortexTickEvent_ directly at startup.
+//   - DmaDevice base class kept for forward compatibility with the
+//     v2 DMA-port seam (proposal §2.5) and for the standalone smoke
+//     test path that still uses gem5's pio interface.
+
+#ifndef __DEV_VORTEX_VORTEX_GPGPU_DEV_HH__
+#define __DEV_VORTEX_VORTEX_GPGPU_DEV_HH__
+
+#include "dev/dma_device.hh"
+#include "dev/io_device.hh"
+#include "params/VortexGPGPU.hh"
+#include "sim/eventq.hh"
+
+#include <cstdint>
+#include <string>
+
+namespace gem5
+{
+
+// gem5 device model that forwards register accesses, VRAM accesses and
+// clock ticks to a dynamically loaded Vortex device library through a
+// table of C function pointers (abi_).
+class VortexGPGPU : public DmaDevice
+{
+public:
+    using Params = VortexGPGPUParams;
+
+    // Loads the library, binds abi_, creates the device instance and
+    // registers the start handler; the destructor undoes this.
+    VortexGPGPU(const Params &p);
+    ~VortexGPGPU() override;
+
+    // PioDevice interface
+    // read/write serve both the CP regfile window (pioAddr_/pioSize_)
+    // and the VRAM window (pinAddr_/pinSize_); getAddrRanges advertises
+    // the VRAM window only when pinSize_ is non-zero.
+    Tick read(PacketPtr pkt) override;
+    Tick write(PacketPtr pkt) override;
+    AddrRangeList getAddrRanges() const override;
+
+    // SimObject lifecycle
+    // startup() selects standalone vs. hosted mode from kernelPath_.
+    void init() override;
+    void startup() override;
+
+private:
+    // CP tick — advances the embedded CommandProcessor one functional
+    // cycle. Self-reschedules iff cp_tick reported still-busy.
+    void cpTick();
+
+    // Vortex tick — advances the Vortex Processor one cycle.
+    // Self-reschedules iff vortex_tick reported still-running.
+    // Standalone mode exits the sim loop when vortex_tick returns false.
+    void vortexTick();
+
+    // Called from a PIO write to schedule cpTickEvent_ if (a) the CP
+    // reports new work and (b) the event isn't already pending.
+    // Also called by vortexTick() in hosted mode once Vortex stops, so
+    // the CP gets a tick in which to retire the launch.
+    void maybeWakeCp();
+
+    // Static trampoline registered with the device library so the CP's
+    // vortex_start hook can schedule vortexTickEvent_ via the gem5
+    // event scheduler. Passing `this` via the void* ctx avoids any
+    // dependency on gem5 types in the library.
+    static void onVortexStartTrampoline(void* ctx);
+    void onVortexStart();
+
+    // Library binding ------------------------------------------------
+    // libHandle_ is the dlopen() handle; deviceHandle_ is the opaque
+    // instance returned by abi_.create and passed as `h` to every
+    // other entry point.
+    void* libHandle_;
+    void* deviceHandle_;
+
+    // One function pointer per vortex_gem5_* symbol; each field is
+    // bound to the library symbol of the same name (prefixed with
+    // "vortex_gem5_") in the constructor.
+    struct AbiV2 {
+        const char* (*build_info)(void);
+        void*       (*create)(void);
+        void        (*destroy)(void* h);
+        void        (*set_start_handler)(void* h, void (*fn)(void*), void* ctx);
+        int         (*load_kernel)(void* h, const char* path);
+        void        (*cp_mmio_write)(void* h, uint32_t off, uint32_t value);
+        uint32_t    (*cp_mmio_read)(void* h, uint32_t off);
+        bool        (*cp_tick)(void* h);
+        bool        (*cp_has_work)(void* h);
+        bool        (*vortex_tick)(void* h);
+        bool        (*vortex_busy)(void* h);
+        void        (*vram_write)(void* h, uint64_t addr,
+                                  const uint8_t* src, uint32_t size);
+        void        (*vram_read)(void* h, uint64_t addr,
+                                 uint8_t* dst, uint32_t size);
+    } abi_;
+
+    // Configuration --------------------------------------------------
+    // All copied once from Params in the constructor and never changed.
+    const std::string libraryPath_;
+    const std::string kernelPath_;
+    const Addr        pioAddr_;
+    const Addr        pioSize_;
+    const Addr        pinAddr_;   // device VRAM, host-visible as BAR
+    const Addr        pinSize_;
+    const Tick        pioLatency_;   // returned by read()/write() for every access
+
+    // Event scheduling
+    // Each event wraps the member function of the matching name
+    // (cpTick / vortexTick).
+    EventFunctionWrapper cpTickEvent_;
+    EventFunctionWrapper vortexTickEvent_;
+
+    // Standalone (Phase 3) vs. hosted mode. Set by startup() based on
+    // whether the `kernel=` Python param was provided.
+    bool standalone_;
+};
+
+} // namespace gem5
+
+#endif // __DEV_VORTEX_VORTEX_GPGPU_DEV_HH__
diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp
index b173e4195..40dc9226a 100644
--- a/sim/simx/processor.cpp
+++ b/sim/simx/processor.cpp
@@ -231,6 +231,22 @@ void ProcessorImpl::reset() {
   perf_mem_writes_ = 0;
   perf_mem_latency_ = 0;
   perf_mem_pending_reads_ = 0;
+  is_cycle_initialized_ = false;
+}
+
+bool ProcessorImpl::cycle() {
+  // The first step after construction or after reset() performs the
+  // reset + KMU start itself (the original note describes this as
+  // mirroring run()'s top-of-loop sequence), so an external driver
+  // only ever calls cycle(). reset() drops the flag again, which lets
+  // a back-to-back kernel launch re-dispatch through the KMU.
+  const bool needs_init = !is_cycle_initialized_;
+  if (needs_init) {
+    this->reset();
+    kmu_->start();
+    is_cycle_initialized_ = true;
+  }
+
+  // Advance the platform by one tick, then charge one cycle of latency
+  // to every read that is still outstanding.
+  SimPlatform::instance().tick();
+  perf_mem_latency_ += perf_mem_pending_reads_;
+
+  const bool still_running = this->any_running();
+  return still_running;
 }
 
 int ProcessorImpl::dcr_write(uint32_t addr, uint32_t value) {
@@ -333,6 +349,14 @@ int Processor::run() {
   return -1;
 }
 
+// Public single-step entry point; forwards to ProcessorImpl::cycle()
+// and returns its result unchanged.
+bool Processor::cycle() {
+  return impl_->cycle();
+}
+
+// Accessor for the processor's memory module; forwards to
+// ProcessorImpl::memsim(). The returned pointer is non-owning.
+Memory* Processor::memsim() {
+  return impl_->memsim();
+}
+
 int Processor::dcr_write(uint32_t addr, uint32_t value) {
   return impl_->dcr_write(addr, value);
 }
diff --git a/sim/simx/processor.h b/sim/simx/processor.h
index 129cfdc46..04b57f037 100644
--- a/sim/simx/processor.h
+++ b/sim/simx/processor.h
@@ -20,6 +20,7 @@
 namespace vortex {
 
 class RAM;
+class Memory;
 class ProcessorImpl;
 
 class Processor {
@@ -33,12 +34,29 @@ class Processor {
 
   int run();
 
+  // Advance the simulator by one cycle. On the first call after a
+  // reset() (or on the very first call), the KMU is started so warps
+  // dispatch into the cluster. Returns true while work remains
+  // (clusters running or channels carrying packets); false once the
+  // program has finished and the channels have drained.
+  //
+  // Used by external simulators that drive Vortex's clock from their
+  // own event loop (SST in sim/simx/sst/, gem5 in sim/simx/gem5/).
+  bool cycle();
+
   void start_kmu();
 
   bool any_running() const;
 
   class Core* get_first_core() const;
 
+  // Returns the processor's memory module. Used by external simulators
+  // (SST, gem5) to install a pre-send hook on Memory::tick that mirrors
+  // accepted requests to their own memory hierarchy for timing
+  // observability. The local data path stays in Vortex's RAM — this is
+  // a peek, not a substitute.
+  Memory* memsim();
+
   int dcr_write(uint32_t addr, uint32_t value);
 
   int dcr_read(uint32_t addr, uint32_t tag, uint32_t* value);
diff --git a/sim/simx/processor_impl.h b/sim/simx/processor_impl.h
index 0f66471b6..4d2b6fef4 100644
--- a/sim/simx/processor_impl.h
+++ b/sim/simx/processor_impl.h
@@ -40,6 +40,11 @@ class ProcessorImpl {
 
   int run();
 
+  // Single-cycle step; see Processor::cycle() doc. Lazily initializes
+  // (resets + starts KMU) on the first call after construction or
+  // after reset() has been invoked.
+  bool cycle();
+
   int dcr_write(uint32_t addr, uint32_t value);
 
   int dcr_read(uint32_t addr, uint32_t tag, uint32_t* value);
@@ -48,6 +53,8 @@ class ProcessorImpl {
 
   Kmu& kmu()       { return *kmu_; }
 
+  Memory* memsim() { return memsim_.get(); }
+
   bool any_running() const;
 
   class Core* get_first_core() const;
@@ -67,6 +74,10 @@ class ProcessorImpl {
   uint64_t perf_mem_writes_;
   uint64_t perf_mem_latency_;
   uint64_t perf_mem_pending_reads_;
+  // Tracks whether cycle() has done its first-call init (reset +
+  // kmu_->start()). reset() clears it so a back-to-back kernel launch
+  // via cycle() re-dispatches the KMU.
+  bool is_cycle_initialized_;
 };
 
 }
diff --git a/sw/common/bitmanip.h b/sw/common/bitmanip.h
index c4fe9e8da..5c7268385 100644
--- a/sw/common/bitmanip.h
+++ b/sw/common/bitmanip.h
@@ -14,6 +14,8 @@
 #pragma once
 
 #include <cstdint>
+#include <type_traits>
+#include <algorithm>
 #include <assert.h>
 
 namespace vortex {
diff --git a/sw/runtime/gem5/Makefile b/sw/runtime/gem5/Makefile
new file mode 100644
index 000000000..259bda5d9
--- /dev/null
+++ b/sw/runtime/gem5/Makefile
@@ -0,0 +1,66 @@
+include ../common.mk
+
+# HOST_ARCH selects the cross-compiler for the simulated host ISA
+# inside gem5 (see docs/proposals/gem5_simx_v3_proposal.md §3.5).
+# Default x86_64 has no toolchain install requirement; aarch64/armhf
+# need ci/gem5_install.sh to have run sudo-apt for the cross-compilers.
+HOST_ARCH ?= x86_64
+
+DESTDIR ?= $(CURDIR)/..
+
+SRC_DIR := $(VORTEX_HOME)/sw/runtime/gem5
+
+CXXFLAGS += -std=c++17 -Wall -Wextra -pedantic -Wfatal-errors -Werror
+CXXFLAGS += -I$(INC_DIR) -I$(ROOT_DIR)/sw -I$(ROOT_DIR)/hw -I$(DESTDIR) -I$(SW_COMMON_DIR) -I$(RT_COMMON_DIR)
+CXXFLAGS += -DXLEN_$(XLEN)
+CXXFLAGS += -fPIC
+CXXFLAGS += $(CONFIGS)
+
+# Per-arch compiler selection. The cross-compilers are sysroot-aware
+# (Ubuntu's gcc-aarch64-linux-gnu ships the matching libstdc++); no
+# extra --sysroot flags needed.
+#
+# Cross-compiled outputs land in $(DESTDIR)/$(HOST_ARCH)/ alongside
+# the stub's libvortex.so (also cross-compiled). The simulated ARM
+# process's LD_LIBRARY_PATH points at that one dir to find both.
+ifeq ($(HOST_ARCH),x86_64)
+    CXX := g++
+    ARCH_SUFFIX := x86_64
+    OUT_DIR := $(DESTDIR)
+else ifeq ($(HOST_ARCH),aarch64)
+    CXX := aarch64-linux-gnu-g++
+    ARCH_SUFFIX := aarch64
+    OUT_DIR := $(DESTDIR)/aarch64
+else ifeq ($(HOST_ARCH),armhf)
+    CXX := arm-linux-gnueabihf-g++
+    ARCH_SUFFIX := armhf
+    OUT_DIR := $(DESTDIR)/armhf
+else
+    $(error HOST_ARCH must be one of: x86_64, aarch64, armhf (got $(HOST_ARCH)))
+endif
+
+LDFLAGS += -shared -pthread
+
+SRCS = $(SRC_DIR)/vortex.cpp $(SRC_DIR)/driver.cpp $(RT_COMMON_DIR)/utils.cpp
+
+# Debug / release
+ifdef DEBUG
+    CXXFLAGS += -g -O0
+else
+    CXXFLAGS += -O2 -DNDEBUG
+endif
+
+PROJECT := libvortex-gem5-$(ARCH_SUFFIX).so
+
+.PHONY: all force clean
+
+all: $(OUT_DIR)/$(PROJECT)
+
+# Build the per-arch runtime library. driver.h is listed as a prerequisite
+# so that editing the PIO/PIN window constants rebuilds the library; the
+# recipe names $(SRCS) explicitly, so the header never reaches the
+# compiler command line.
+$(OUT_DIR)/$(PROJECT): $(SRCS) $(SRC_DIR)/driver.h
+	@mkdir -p $(OUT_DIR)
+	$(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -Wl,-soname,$(PROJECT) -o $@
+
+clean:
+	rm -f $(DESTDIR)/libvortex-gem5-*.so
+	rm -f $(DESTDIR)/aarch64/libvortex-gem5-*.so
+	rm -f $(DESTDIR)/armhf/libvortex-gem5-*.so
diff --git a/sw/runtime/gem5/driver.cpp b/sw/runtime/gem5/driver.cpp
new file mode 100644
index 000000000..e00f72f77
--- /dev/null
+++ b/sw/runtime/gem5/driver.cpp
@@ -0,0 +1,62 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "driver.h"
+
+namespace vortex {
+
+namespace {
+bool g_inited = false;
+}
+
+// Mark the driver as initialized. Performs no mapping and always
+// returns 0 in this implementation.
+int drv_init() {
+    // The two fixed regions (PIO and PIN) are mapped by the gem5
+    // SE-mode setup before this binary runs. No mmap() here because
+    // SE-mode has no /dev/vortex; the Python config arranges the
+    // address space directly. If this runtime is ever ported to a
+    // real OS with a kernel driver, drv_init() becomes
+    // open("/dev/vortex_gem5") + mmap() for both regions.
+    // g_inited is only written, never read, in this file.
+    g_inited = true;
+    return 0;
+}
+
+// Counterpart of drv_init(): clears the initialized flag. Nothing is
+// unmapped because drv_init() maps nothing.
+void drv_close() {
+    g_inited = false;
+}
+
+// Volatile 32-bit load from the CP regfile at PIO_BASE_ADDR + offset.
+uint32_t mmio_read32(uint32_t offset) {
+    return *reinterpret_cast<volatile uint32_t*>(PIO_BASE_ADDR + offset);
+}
+
+// Volatile 32-bit store to the CP regfile at PIO_BASE_ADDR + offset.
+void mmio_write32(uint32_t offset, uint32_t value) {
+    *reinterpret_cast<volatile uint32_t*>(PIO_BASE_ADDR + offset) = value;
+}
+
+// Publish prior stores before the next MMIO write. The host CPU model
+// in gem5 (especially out-of-order variants like O3CPU) can reorder
+// MMIO writes and surrounding stores; the dispatcher must guarantee
+// that ring-buffer payloads land in device memory before Q_TAIL_HI is
+// observed by the CP. The barrier is per-HOST_ARCH.
+// Emit a memory barrier chosen at compile time from the target
+// architecture macros. Every variant carries a "memory" clobber, so
+// the compiler cannot move memory accesses across the call either.
+void mmio_fence() {
+#if defined(__x86_64__) || defined(__i386__)
+    // x86 / x86-64: mfence instruction.
+    __asm__ __volatile__ ("mfence" ::: "memory");
+#elif defined(__aarch64__) || defined(__arm__)
+    // AArch64 / 32-bit ARM: dmb sy instruction.
+    __asm__ __volatile__ ("dmb sy" ::: "memory");
+#else
+    // Any other target: compiler-only barrier, no fence instruction.
+    __asm__ __volatile__ ("" ::: "memory");
+#endif
+}
+
+} // namespace vortex
diff --git a/sw/runtime/gem5/driver.h b/sw/runtime/gem5/driver.h
new file mode 100644
index 000000000..45bef33a6
--- /dev/null
+++ b/sw/runtime/gem5/driver.h
@@ -0,0 +1,70 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Direct-MMIO + pinned-region driver for the gem5 VortexGPGPU device.
+//
+// Inside a gem5 SE-mode process the device is reached by:
+//
+//   1. MMIO accesses to the CP regfile via a fixed virtual address that
+//      the gem5 Python config maps to the SimObject's PIO range
+//      (PIO_BASE_ADDR below; default 0x20000000 — gem5_v2_cp_migration
+//      §3). The CP regfile is 32-bit; only 32-bit accesses are used.
+//
+//   2. Direct memory access to device VRAM via a fixed pinned region
+//      that the gem5 Python config identity-maps virtual→physical
+//      (PIN_BASE_ADDR; default 0x100000000, i.e. at the 4 GiB mark).
+//      The runtime treats it as ordinary memory: regular stores from
+//      the host process land in the same physical bytes the SimObject
+//      sees as device VRAM. This eliminates the need for a separate
+//      "DMA staging buffer" path — gem5_v2_cp_migration §2.2.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace vortex {
+
+// Fixed virtual addresses the runtime expects to find mapped by the
+// gem5 Python config. PIN_BASE..PIN_BASE+PIN_REGION_SIZE is the
+// host-visible window onto device VRAM — `memcpy(PIN_BASE+dev_addr,
+// host_src, sz)` lands in the same in-process simx::RAM bytes the CP
+// and Vortex see. Sized to cover the full XLEN device address space
+// so any address mem_alloc / mem_reserve can hand out is reachable
+// via the host BAR; placed above 4 GiB so it doesn't collide with the
+// simulated process's natural low-VA layout (heap/stack/code).
+constexpr uintptr_t PIN_BASE_ADDR    = 0x100000000ull;
+constexpr size_t    PIN_REGION_SIZE  = 0x100000000ull;  // 4 GB (= XLEN=32 device VRAM)
+constexpr uintptr_t PIO_BASE_ADDR    = 0x20000000ull;
+constexpr size_t    PIO_REGION_SIZE  = 0x00000200ull;   // 0x200 — CP regfile
+
+// Init / shutdown. Both are idempotent in practice but should be
+// paired 1:1.
+int  drv_init();
+void drv_close();
+
+// CP regfile MMIO. `offset` is the CP-internal byte offset
+// (sim/common/CommandProcessor.h §address map). All accesses are 32-bit
+// — the CP regfile is 32-bit wide, and gem5's PIO model honors the
+// packet width verbatim.
+//
+// mmio_fence() emits the right barrier for HOST_ARCH (mfence on x86,
+// dmb sy on AArch64/ARMv7). The host runtime issues a fence between
+// any non-MMIO publication (e.g. seeding a ring buffer through
+// PIN_BASE_ADDR) and the doorbell write (Q_TAIL_HI) so the device
+// sees the new ring entries before the tail advance.
+uint32_t mmio_read32 (uint32_t offset);
+void     mmio_write32(uint32_t offset, uint32_t value);
+void     mmio_fence();
+
+} // namespace vortex
diff --git a/sw/runtime/gem5/vortex.cpp b/sw/runtime/gem5/vortex.cpp
new file mode 100644
index 000000000..7bd5c53b6
--- /dev/null
+++ b/sw/runtime/gem5/vortex.cpp
@@ -0,0 +1,192 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// gem5 host runtime backend (pure-v2 callbacks_t).
+//
+// Implements vx_device with the platform primitives expected by
+// sw/runtime/common/callbacks.inc: init / get_caps / mem_info /
+// mem_{alloc,reserve,free,access} / upload / download / copy /
+// cp_mmio_{write,read}. All kernel launches and DCR ops flow through
+// the upstream dispatcher (sw/runtime/common/vx_device.cpp) which
+// builds CMD_* descriptors into the CP ring buffer and bumps Q_TAIL
+// via cp_mmio_write.
+//
+// gem5-specific shape (vs. xrt/opae):
+//   - mem_upload/download/copy are direct memcpy through PIN_BASE_ADDR
+//     which the gem5 SE-mode process has identity-mapped to device VRAM
+//     via Process::map. No DMA descriptor; no PIO trigger.
+//   - cp_mmio_{write,read} are 32-bit PIO accesses at PIO_BASE_ADDR + off
+//     (no CP_BASE 0x1000 offset because the gem5 device's PIO range IS
+//     the CP regfile; there is no AFU bit-12 split).
+//
+// See docs/proposals/gem5_v2_cp_migration_proposal.md for the full
+// design rationale.
+
+#include <common.h>
+#include <util.h>          // log2floor / log2ceil / is_aligned / aligned_size
+#include "driver.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+using namespace vortex;
+
+// gem5 backend device object consumed by callbacks.inc. Device memory
+// is managed by a host-side allocator (global_mem_); bulk data moves
+// are plain memcpy/memmove through the PIN_BASE_ADDR window; control
+// goes through 32-bit MMIO accesses to the CP regfile.
+class vx_device {
+public:
+    vx_device()
+        : global_mem_(ALLOC_BASE_ADDR,
+                      GLOBAL_MEM_SIZE - ALLOC_BASE_ADDR,
+                      RAM_PAGE_SIZE,
+                      CACHE_BLOCK_SIZE) {}
+
+    ~vx_device() {
+        drv_close();
+    }
+
+    // Bring up the driver layer. Returns 0 on success, -1 on failure.
+    int init() {
+        if (drv_init() != 0) {
+            std::fprintf(stderr, "[VXDRV] drv_init failed\n");
+            return -1;
+        }
+        return 0;
+    }
+
+    // Compile-time capability table — host runtime and SimX-side device
+    // library share the build tree so VX_config.h macros agree on both
+    // sides by construction.
+    // Writes the requested capability to *value and returns 0; returns
+    // -1 (leaving *value untouched) for an unknown caps_id.
+    int get_caps(uint32_t caps_id, uint64_t* value) {
+        switch (caps_id) {
+        case VX_CAPS_VERSION:         *value = IMPLEMENTATION_ID; break;
+        case VX_CAPS_NUM_THREADS:     *value = NUM_THREADS; break;
+        case VX_CAPS_NUM_WARPS:       *value = NUM_WARPS; break;
+        case VX_CAPS_NUM_CORES:       *value = NUM_CORES * NUM_CLUSTERS; break;
+        case VX_CAPS_NUM_CLUSTERS:    *value = NUM_CLUSTERS; break;
+        case VX_CAPS_SOCKET_SIZE:     *value = SOCKET_SIZE; break;
+        case VX_CAPS_ISSUE_WIDTH:     *value = ISSUE_WIDTH; break;
+        case VX_CAPS_CACHE_LINE_SIZE: *value = CACHE_BLOCK_SIZE; break;
+        case VX_CAPS_GLOBAL_MEM_SIZE: *value = GLOBAL_MEM_SIZE; break;
+        case VX_CAPS_LOCAL_MEM_SIZE:  *value = (1 << LMEM_LOG_SIZE); break;
+        case VX_CAPS_ISA_FLAGS:
+            *value = ((uint64_t(MISA_EXT)) << 32)
+                   | ((log2floor(XLEN) - 4) << 30)
+                   |   MISA_STD;
+            break;
+        case VX_CAPS_NUM_MEM_BANKS:   *value = PLATFORM_MEMORY_NUM_BANKS; break;
+        case VX_CAPS_MEM_BANK_SIZE:   *value = 1ull << (MEM_ADDR_WIDTH / PLATFORM_MEMORY_NUM_BANKS); break;
+        case VX_CAPS_CLOCK_RATE:      *value = 0; break;
+        case VX_CAPS_PEAK_MEM_BW:     *value = PLATFORM_MEMORY_PEAK_BW; break;
+        default:
+            std::fprintf(stderr, "[VXDRV] invalid caps id: %u\n", caps_id);
+            return -1;
+        }
+        return 0;
+    }
+
+    // Allocate `size` bytes of device memory and apply `flags` to the
+    // range. On success stores the address in *dev_addr and returns 0;
+    // if applying the flags fails the allocation is released again.
+    int mem_alloc(uint64_t size, int flags, uint64_t* dev_addr) {
+        uint64_t addr;
+        CHECK_ERR(global_mem_.allocate(size, &addr), { return err; });
+        CHECK_ERR(this->mem_access(addr, size, flags), {
+            global_mem_.release(addr);
+            return err;
+        });
+        *dev_addr = addr;
+        return 0;
+    }
+
+    // Claim the caller-chosen range [dev_addr, dev_addr + size) and
+    // apply `flags`; the reservation is rolled back if that fails.
+    int mem_reserve(uint64_t dev_addr, uint64_t size, int flags) {
+        CHECK_ERR(global_mem_.reserve(dev_addr, size), { return err; });
+        CHECK_ERR(this->mem_access(dev_addr, size, flags), {
+            global_mem_.release(dev_addr);
+            return err;
+        });
+        return 0;
+    }
+
+    // Return a previously allocated or reserved block to the allocator.
+    int mem_free(uint64_t dev_addr) {
+        return global_mem_.release(dev_addr);
+    }
+
+    int mem_access(uint64_t /*dev_addr*/, uint64_t /*size*/, int /*flags*/) {
+        // Access control is enforced by the device's RAM ACL inside
+        // libvortex-gem5.so. The host runtime has nothing to do here.
+        return 0;
+    }
+
+    // Report allocator statistics; either output pointer may be null.
+    int mem_info(uint64_t* mem_free, uint64_t* mem_used) const {
+        if (mem_free) *mem_free = global_mem_.free();
+        if (mem_used) *mem_used = global_mem_.allocated();
+        return 0;
+    }
+
+    // ---- Data plane (cold-start only) ----
+    // PIN_BASE_ADDR is identity-mapped into the host process's VA via
+    // Process::map (driver.h §"identity v→p"), and into the SimObject's
+    // PA view of device VRAM. A memcpy through PIN_BASE_ADDR is the
+    // same physical bytes the CP's DMA engine and Vortex's MemSim see —
+    // zero PIO bounce, zero DMA descriptor, zero command. The dispatcher
+    // uses these to seed CP ring buffers and to preload kernel ELFs;
+    // ordered host↔device transfers from user code go through CMD_MEM_*
+    // in the CP queue.
+    //
+    // All three return 0 on success (including size == 0, which is a
+    // no-op) and -1 when a range does not fit inside device memory.
+
+    // Host -> device copy, followed by a fence so the bytes are
+    // published before any later MMIO write.
+    int upload(uint64_t dev_addr, const void* host_ptr, uint64_t size) {
+        if (size == 0) return 0;
+        if (!in_device_range(dev_addr, size)) return -1;
+        std::memcpy(reinterpret_cast<void*>(PIN_BASE_ADDR + dev_addr),
+                    host_ptr, size);
+        mmio_fence();
+        return 0;
+    }
+
+    // Device -> host copy; the fence is issued before reading.
+    int download(void* host_ptr, uint64_t dev_addr, uint64_t size) {
+        if (size == 0) return 0;
+        if (!in_device_range(dev_addr, size)) return -1;
+        mmio_fence();
+        std::memcpy(host_ptr,
+                    reinterpret_cast<const void*>(PIN_BASE_ADDR + dev_addr),
+                    size);
+        return 0;
+    }
+
+    // Device -> device copy. memmove is used, so the two ranges may
+    // overlap.
+    int copy(uint64_t dest_addr, uint64_t src_addr, uint64_t size) {
+        if (size == 0) return 0;
+        if (!in_device_range(dest_addr, size)
+         || !in_device_range(src_addr, size)) return -1;
+        std::memmove(reinterpret_cast<void*>(PIN_BASE_ADDR + dest_addr),
+                     reinterpret_cast<const void*>(PIN_BASE_ADDR + src_addr),
+                     size);
+        mmio_fence();
+        return 0;
+    }
+
+    // ---- Control plane (sole) ----
+    // `off` is the CP-internal regfile offset (sim/common/CommandProcessor.h
+    // §address map). The gem5 device exposes the CP regfile starting at
+    // PIO_BASE_ADDR + 0 — no AFU bit-12 split — so the wrapper is a
+    // straight PIO access.
+    int cp_mmio_write(uint32_t off, uint32_t value) {
+        mmio_write32(off, value);
+        return 0;
+    }
+    int cp_mmio_read(uint32_t off, uint32_t* value) {
+        *value = mmio_read32(off);
+        return 0;
+    }
+
+private:
+    // True iff [dev_addr, dev_addr + size) lies entirely inside
+    // [0, GLOBAL_MEM_SIZE). Written subtraction-first: the naive
+    // `dev_addr + size > GLOBAL_MEM_SIZE` test wraps around for large
+    // 64-bit operands and would then accept an out-of-range request.
+    static bool in_device_range(uint64_t dev_addr, uint64_t size) {
+        const uint64_t limit = GLOBAL_MEM_SIZE;
+        return size <= limit && dev_addr <= limit - size;
+    }
+
+    MemoryAllocator global_mem_;
+};
+
+#include <callbacks.inc>
diff --git a/sw/runtime/stub/Makefile b/sw/runtime/stub/Makefile
index 14f88f02b..ed566fac6 100644
--- a/sw/runtime/stub/Makefile
+++ b/sw/runtime/stub/Makefile
@@ -1,5 +1,13 @@
 include ../common.mk
 
+# HOST_ARCH switch — when building for a non-native simulated host
+# (e.g. running x86 gem5 with an aarch64 simulated CPU), select the
+# matching cross-compiler. Aligns with sw/runtime/gem5/Makefile's
+# HOST_ARCH knob; cross-arch builds land in $(DESTDIR)/$(HOST_ARCH)/
+# so the same dlopen target name (libvortex.so) can coexist with the
+# native build in $(DESTDIR)/.
+HOST_ARCH ?= x86_64
+
 DESTDIR ?= $(CURDIR)/..
 
 SRC_DIR := $(VORTEX_HOME)/sw/runtime/stub
@@ -13,8 +21,21 @@ LDFLAGS += -shared -pthread -ldl -Wl,-soname,libvortex.so
 # itself lives in (so the dlopen at vx_device_open time finds them).
 LDFLAGS += -Wl,-rpath,'$$ORIGIN'
 
+ifeq ($(HOST_ARCH),x86_64)
+    # Native build: keep the CXX inherited from common.mk / the environment (make defaults to g++).
+    OUT_DIR := $(DESTDIR)
+else ifeq ($(HOST_ARCH),aarch64)
+    CXX := aarch64-linux-gnu-g++
+    OUT_DIR := $(DESTDIR)/aarch64
+else ifeq ($(HOST_ARCH),armhf)
+    CXX := arm-linux-gnueabihf-g++
+    OUT_DIR := $(DESTDIR)/armhf
+else
+    $(error HOST_ARCH must be one of: x86_64, aarch64, armhf (got $(HOST_ARCH)))
+endif
+
 # Dispatcher library = vortex2.h runtime (C++ classes) +
-#                      vortex_legacy.cpp wrappers (vortex.h -> vortex2.h) +
+#                      legacy_runtime.cpp wrappers (vortex.h -> vortex2.h) +
 #                      legacy utility helpers +
 #                      thin stub/vortex.cpp glue (currently just for the
 #                      build target — the real entry points live in
@@ -41,12 +62,13 @@ endif
 
 PROJECT := libvortex.so
 
-all: $(DESTDIR)/$(PROJECT)
+all: $(OUT_DIR)/$(PROJECT)
 
-$(DESTDIR)/$(PROJECT): $(SRCS)
+$(OUT_DIR)/$(PROJECT): $(SRCS)
+	@mkdir -p $(OUT_DIR)
 	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
 
 clean:
-	rm -f $(DESTDIR)/$(PROJECT)
+	rm -f $(DESTDIR)/$(PROJECT) $(DESTDIR)/aarch64/$(PROJECT) $(DESTDIR)/armhf/$(PROJECT)
 
 .PHONY: all clean
diff --git a/tests/regression/common.mk b/tests/regression/common.mk
index 536fcd6f8..6484ed1e0 100644
--- a/tests/regression/common.mk
+++ b/tests/regression/common.mk
@@ -83,7 +83,39 @@ CXXFLAGS += -std=c++17 -Wall -Wextra -pedantic -Wfatal-errors -Werror
 CXXFLAGS += -I$(VORTEX_HOME)/sw/runtime/include -I$(ROOT_DIR)/sw -I$(ROOT_DIR)/hw -I$(SW_COMMON_DIR)
 CXXFLAGS += $(CONFIGS)
 
-LDFLAGS += -L$(VORTEX_RT_LIB) -lvortex
+# HOST_ARCH selects the simulated-host compiler for the test binary
+# (the .vxbin always builds with the RISC-V toolchain regardless).
+# When non-native, the binary is suffixed (e.g. vecadd-aarch64) and
+# we link against the cross-compiled stub in $(VORTEX_RT_LIB)/$(HOST_ARCH)/.
+# Aligns with sw/runtime/{stub,gem5}/Makefile's HOST_ARCH knob; the
+# gem5 ARM e2e test path uses this to produce aarch64 binaries that
+# the simulated ARM CPU inside gem5 can execute.
+#
+# Cross-compiled ELFs embed `/lib/ld-linux-$arch.so.1` as the dynamic
+# linker (PT_INTERP). gem5 doesn't have that path on the host, but
+# it has a setInterpDir() API that prepends a sysroot to the
+# interpreter lookup — the gem5 Python config calls that when
+# DRIVER=gem5-aarch64. Keep the default PT_INTERP here so that mechanism
+# can do the redirection cleanly. (Earlier versions used
+# `-Wl,--dynamic-linker=` to rewrite PT_INTERP, but that interacts
+# badly with setInterpDir's prepend logic.)
+HOST_ARCH ?= x86_64
+ifeq ($(HOST_ARCH),x86_64)
+    PROJECT_SUFFIX :=
+    RT_LIB_DIR := $(VORTEX_RT_LIB)
+else ifeq ($(HOST_ARCH),aarch64)
+    CXX := aarch64-linux-gnu-g++
+    PROJECT_SUFFIX := -aarch64
+    RT_LIB_DIR := $(VORTEX_RT_LIB)/aarch64
+else ifeq ($(HOST_ARCH),armhf)
+    CXX := arm-linux-gnueabihf-g++
+    PROJECT_SUFFIX := -armhf
+    RT_LIB_DIR := $(VORTEX_RT_LIB)/armhf
+else
+    $(error HOST_ARCH must be one of: x86_64, aarch64, armhf (got $(HOST_ARCH)))
+endif
+
+LDFLAGS += -L$(RT_LIB_DIR) -lvortex
 
 # Debugging
 ifdef DEBUG
@@ -106,7 +138,11 @@ endif
 
 CONFIG_STAMP = config.stamp
 
-all: $(PROJECT) kernel.vxbin kernel.dump
+# HOST_ARCH-suffixed binary name (vecadd, vecadd-aarch64, …) so
+# x86 and cross-compiled variants coexist in the same dir.
+APP := $(PROJECT)$(PROJECT_SUFFIX)
+
+all: $(APP) kernel.vxbin kernel.dump
 
 # Force rebuild when CONFIGS (defines) change between runs.
 $(CONFIG_STAMP): FORCE
@@ -146,9 +182,16 @@ kernel.elf: vx_start.o $(VX_SRCS) $(VORTEX_KN_PATH)/lib$(KERNEL_LIB).a $(CONFIG_
 	$(VX_CXX) $(VX_CFLAGS) vx_start.o $(VX_APP_OBJS) $(VX_LDFLAGS) -o $@
 endif
 
-$(PROJECT): $(SRCS) $(VORTEX_RT_LIB)/libvortex.so $(CONFIG_STAMP)
+$(APP): $(SRCS) $(RT_LIB_DIR)/libvortex.so $(CONFIG_STAMP)
 	$(CXX) $(CXXFLAGS) $(filter-out $(CONFIG_STAMP),$^) $(LDFLAGS) -o $@
 
+# Cross-compiled stub for non-native HOST_ARCH. Native (x86_64)
+# is built by $(VORTEX_RT_LIB)/libvortex.so rule below.
+ifneq ($(HOST_ARCH),x86_64)
+$(RT_LIB_DIR)/libvortex.so:
+	$(RUNTIME_ARGS) $(MAKE) -C $(VORTEX_RT_SRC)/stub HOST_ARCH=$(HOST_ARCH) DESTDIR=$(VORTEX_RT_LIB)
+endif
+
 run-simx: $(PROJECT) kernel.vxbin
 	$(RUNTIME_ARGS) $(MAKE) -C $(VORTEX_RT_SRC)/simx DESTDIR=$(VORTEX_RT_LIB)
 	LD_LIBRARY_PATH=$(VORTEX_RT_LIB):$(LD_LIBRARY_PATH) VORTEX_DRIVER=simx ./$(PROJECT) $(OPTS)