From 2da15192b3a0d16f180b206025786c03fc4d6d64 Mon Sep 17 00:00:00 2001
From: maobaolong <baoloongmao@tencent.com>
Date: Sun, 7 Jun 2026 16:42:58 +0800
Subject: [PATCH 01/57] feat: add POSIX SHM infra for CPU KV-cache IPC (#3563)

* feat: add POSIX SHM infra for CPU KV-cache IPC

- lmcache/v1/multiprocess/posix_shm.py: thin POSIX-SHM facade
  (shm_create_readwrite / shm_map_readwrite / shm_munmap / shm_unlink /
  shm_open_pool_as_mmap) routing through CPython's _posixshmem to
  avoid macOS EACCES and shutdown BufferError issues
- lmcache/v1/platform/cpu/shm.py: CpuShmTensorWrapper + migrate_to_shm_and_wrap
  for zero-copy CPU KV-cache IPC mirroring CUDA-IPC semantics
- lmcache/v1/platform/cpu/__init__.py: self-register cpu factory with
  platform registry
- tests/v1/multiprocess/test_posix_shm.py: unit tests for posix_shm
- tests/v1/platform/test_cpu_shm.py: unit tests for CpuShmTensorWrapper

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* address comment

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* address comment

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* assert zero storage_offset before SHM migration

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* add warning logs to swallowed exceptions in posix_shm

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

---------

Signed-off-by: baoloongmao <baoloongmao@tencent.com>
---
 .../vllm/vllm_multi_process_adapter.py        |   6 +-
 lmcache/v1/multiprocess/posix_shm.py          | 273 ++++++++++++++++++
 lmcache/v1/platform/cpu/__init__.py           |  34 ++-
 lmcache/v1/platform/cpu/shm.py                | 267 +++++++++++++++++
 tests/v1/multiprocess/test_posix_shm.py       |  78 +++++
 tests/v1/platform/__init__.py                 |   1 +
 tests/v1/platform/test_cpu_shm.py             | 234 +++++++++++++++
 7 files changed, 887 insertions(+), 6 deletions(-)
 create mode 100644 lmcache/v1/multiprocess/posix_shm.py
 create mode 100644 lmcache/v1/platform/cpu/shm.py
 create mode 100644 tests/v1/multiprocess/test_posix_shm.py
 create mode 100644 tests/v1/platform/__init__.py
 create mode 100644 tests/v1/platform/test_cpu_shm.py

diff --git a/lmcache/integration/vllm/vllm_multi_process_adapter.py b/lmcache/integration/vllm/vllm_multi_process_adapter.py
index 86578b22db..80dfde3717 100644
--- a/lmcache/integration/vllm/vllm_multi_process_adapter.py
+++ b/lmcache/integration/vllm/vllm_multi_process_adapter.py
@@ -149,7 +149,7 @@ def wrap_kv_caches(kv_caches: dict[str, torch.Tensor]) -> KVCache:
     wrappers: KVCache = []
     try:
         for tensor in kv_caches.values():
-            wrappers.append(_wrap_one_kv_cache(tensor))
+            wrappers.append(wrap_one_kv_cache(tensor))
     except BaseException:
         _release_partial_kv_wrappers(wrappers)
         raise
@@ -165,7 +165,7 @@ def _release_partial_kv_wrappers(wrappers: list[Any]) -> None:
     are silently skipped.
     """
     # First Party
-    from lmcache.v1.platform.cpu.shm import shm_unlink
+    from lmcache.v1.multiprocess.posix_shm import shm_unlink
 
     for w in wrappers:
         name = getattr(w, "shm_name", None)
@@ -177,7 +177,7 @@ def _release_partial_kv_wrappers(wrappers: list[Any]) -> None:
             logger.debug("shm_unlink failed during rollback", exc_info=True)
 
 
-def _wrap_one_kv_cache(tensor: torch.Tensor) -> Any:
+def wrap_one_kv_cache(tensor: torch.Tensor) -> Any:
     """Dispatch by ``tensor.device.type`` via the platform registry.
 
     Concrete factories self-register at import time (CUDA in
diff --git a/lmcache/v1/multiprocess/posix_shm.py b/lmcache/v1/multiprocess/posix_shm.py
new file mode 100644
index 0000000000..9445021666
--- /dev/null
+++ b/lmcache/v1/multiprocess/posix_shm.py
@@ -0,0 +1,273 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Shared-memory primitives shared by SHM-based transports.
+
+Thin POSIX-SHM facade exposing the legacy
+``shm_create_readwrite`` / ``shm_map_readwrite`` / ``shm_munmap`` /
+``shm_unlink`` / ``shm_open_pool_as_mmap`` quartet, so the CPU
+KV-cache wrapper, the MP non-GPU SHM transport, and the existing
+tests keep working unchanged.
+
+We deliberately route through CPython's bundled ``_posixshmem`` C
+extension (used internally by :mod:`multiprocessing.shared_memory`)
+rather than the higher-level :class:`SharedMemory` wrapper. The
+wrapper keeps an internal ``memoryview`` over its own ``mmap``;
+when callers also export a buffer (via
+``ctypes.c_uint8.from_buffer(shm.buf)`` / ``torch.frombuffer(...)``),
+:meth:`SharedMemory.close` invoked from ``__del__`` at interpreter
+shutdown raises ``BufferError: cannot close exported pointers
+exist``. Owning the ``mmap`` ourselves and pairing every alloc with
+an explicit ``shm_munmap`` keeps shutdown silent on macOS and Linux
+alike.
+
+The previous hand-rolled libc/librt implementation tripped over
+macOS' shm_open MAC label propagation when certain native
+extensions (torch + a few others) were already loaded in the
+parent process, producing spurious ``errno=13 / EACCES`` failures
+on the child side. Routing through ``_posixshmem.shm_open`` -- the
+same underlying entry point CPython's stdlib uses -- fixes that
+and is identical on Linux.
+"""
+
+# Future
+from __future__ import annotations
+
+# Standard
+import atexit
+import ctypes
+import logging
+import mmap as _mmap
+import os
+import threading
+
+# Third Party
+import _posixshmem  # type: ignore[import-not-found]
+
+logger = logging.getLogger(__name__)
+
+
+def _strip_leading_slash(name: str) -> str:
+    """Normalise a name to the bare form (no leading ``/``).
+
+    Callers historically embed the POSIX leading slash; we keep the
+    on-wire name slash-prefixed but feed the bare form to
+    ``_posixshmem.shm_open``-derived helpers that prepend it again.
+    """
+    return name[1:] if name.startswith("/") else name
+
+
+def _slashed(name: str) -> str:
+    """Inverse of :func:`_strip_leading_slash` for shm_open calls."""
+    return name if name.startswith("/") else "/" + name
+
+
+# Per-process registry mapping the public ``int`` address back to the
+# ``mmap`` object that owns the mapping, so a later ``shm_munmap`` can
+# call ``mmap.close()`` exactly once and avoid leaking pages. Owners
+# (creators) also remember the name so ``shm_unlink`` can find it
+# without a re-open round-trip.
+_REGISTRY_LOCK = threading.Lock()
+_ADDR_TO_MMAP: dict[int, _mmap.mmap] = {}
+_OWNED_NAMES: set[str] = set()
+
+
+def _open_and_mmap(name: str, nbytes: int, *, create: bool) -> tuple[_mmap.mmap, int]:
+    """Open (or create) a POSIX SHM segment and ``mmap`` it.
+
+    Returns a ``(mmap_obj, base_addr)`` pair. The fd is always closed
+    before returning so we don't leak descriptors; the kernel keeps
+    the mapping alive as long as ``mmap_obj`` stays alive.
+    """
+    flags = os.O_RDWR | (os.O_CREAT | os.O_EXCL if create else 0)
+    fd = _posixshmem.shm_open(_slashed(name), flags, mode=0o600)
+    mm: _mmap.mmap | None = None
+    try:
+        if create:
+            os.ftruncate(fd, nbytes)
+        mm = _mmap.mmap(fd, nbytes, access=_mmap.ACCESS_WRITE)
+        addr = _addr_of_mmap(mm)
+    except BaseException:
+        if mm is not None:
+            mm.close()
+        if create:
+            try:
+                _posixshmem.shm_unlink(_slashed(name))
+            except OSError:
+                logger.warning(
+                    "shm_unlink failed during cleanup of %s",
+                    name,
+                    exc_info=True,
+                )
+        raise
+    finally:
+        os.close(fd)
+    return mm, addr
+
+
+def _addr_of_mmap(mm: _mmap.mmap) -> int:
+    """Return the base address of an ``mmap`` without leaking a buffer view.
+
+    A single-byte ctypes view is created just long enough to read the
+    base address, then dropped before this function returns; once it is
+    out of scope the mmap has no exported pointers, so a later
+    ``mm.close()`` can complete cleanly.  A 1-byte view is sufficient
+    -- ``ctypes.addressof`` returns the start of the buffer regardless
+    of its declared length.
+    """
+    view = (ctypes.c_uint8 * 1).from_buffer(mm)
+    addr = ctypes.addressof(view)
+    del view
+    return addr
+
+
+def shm_create_readwrite(name: str, nbytes: int) -> int:
+    """Create a new shared-memory segment and return its mmap address.
+
+    Mirrors the previous ``shm_open(O_CREAT|O_EXCL) + ftruncate +
+    mmap`` sequence: collisions raise ``OSError`` (``FileExistsError``
+    is a subclass), and a failure mid-way fully tears down what was
+    allocated.
+
+    Args:
+        name: The name of the shared-memory segment.
+        nbytes: The size of the segment in bytes.
+
+    Returns:
+        The virtual address of the mapped segment.
+
+    Raises:
+        OSError: If the segment already exists or creation fails.
+    """
+    sm_name = _strip_leading_slash(name)
+    mm, addr = _open_and_mmap(sm_name, nbytes, create=True)
+    with _REGISTRY_LOCK:
+        _ADDR_TO_MMAP[addr] = mm
+        _OWNED_NAMES.add(sm_name)
+    return addr
+
+
+def shm_map_readwrite(name: str, nbytes: int) -> int:
+    """Open an existing shared-memory segment and return its address.
+
+    ``nbytes`` must match the segment's actual size; ``mmap`` will
+    raise on a mismatch.
+
+    Args:
+        name: The name of the shared-memory segment.
+        nbytes: The size of the segment in bytes.
+
+    Returns:
+        The virtual address of the mapped segment.
+
+    Raises:
+        OSError: If the segment cannot be opened or mapped.
+    """
+    sm_name = _strip_leading_slash(name)
+    mm, addr = _open_and_mmap(sm_name, nbytes, create=False)
+    with _REGISTRY_LOCK:
+        _ADDR_TO_MMAP[addr] = mm
+    return addr
+
+
+def shm_munmap(addr: int, nbytes: int = 0) -> None:
+    """Best-effort release of a previously mapped segment by address.
+
+    The underlying mmap is closed exactly once; subsequent calls with
+    the same address are no-ops.
+
+    Args:
+        addr: The virtual address of the mapped segment.
+        nbytes: Unused; kept for API compatibility so callers that
+            already pass the size do not need to be updated.
+    """
+    if not addr:
+        return
+    with _REGISTRY_LOCK:
+        mm = _ADDR_TO_MMAP.pop(addr, None)
+    if mm is None:
+        return
+    try:
+        mm.close()
+    except (BufferError, ValueError) as exc:
+        # ``BufferError`` means callers still hold an exported view
+        # (e.g. a torch tensor backed by this mmap); they will release
+        # the mapping themselves on GC. ``ValueError`` means already
+        # closed -- treat both as best-effort no-ops.
+        logger.warning(
+            "shm_munmap: mmap.close() skipped for addr=%#x: %s",
+            addr,
+            exc,
+        )
+
+
+def shm_unlink(name: str) -> None:
+    """Best-effort segment removal.
+
+    Idempotent: a missing segment is treated as a successful
+    no-op so callers can blindly call this on shutdown.
+
+    Args:
+        name: The name of the shared-memory segment to unlink.
+    """
+    sm_name = _strip_leading_slash(name)
+    with _REGISTRY_LOCK:
+        _OWNED_NAMES.discard(sm_name)
+    try:
+        _posixshmem.shm_unlink(_slashed(sm_name))
+    except FileNotFoundError:
+        logger.debug("shm_unlink: segment %s already removed", sm_name)
+    except OSError:
+        # Mirrors the historical "best effort" contract -- e.g.
+        # double-unlink on shutdown should never raise.
+        logger.warning(
+            "shm_unlink: failed to unlink %s",
+            sm_name,
+            exc_info=True,
+        )
+
+
+def _atexit_cleanup() -> None:
+    """Unlink and munmap any SHM segments still owned by this process."""
+    with _REGISTRY_LOCK:
+        names = list(_OWNED_NAMES)
+        mmaps = list(_ADDR_TO_MMAP.values())
+        _ADDR_TO_MMAP.clear()
+        _OWNED_NAMES.clear()
+    for mm in mmaps:
+        try:
+            mm.close()
+        except (BufferError, OSError) as exc:
+            logger.warning("atexit: mmap.close() failed: %s", exc)
+    for n in names:
+        try:
+            _posixshmem.shm_unlink(_slashed(n))
+        except OSError as exc:
+            logger.warning("atexit: shm_unlink(%s) failed: %s", n, exc)
+
+
+atexit.register(_atexit_cleanup)
+
+
+def shm_open_pool_as_mmap(name: str, nbytes: int) -> _mmap.mmap:
+    """Open an existing segment as an independent ``mmap.mmap`` object.
+
+    Convenience helper for non-GPU SHM transports that consume the
+    segment via ``torch.frombuffer(mmap_obj, ...)`` rather than a raw
+    address. The returned mmap is independent of any registry entry,
+    so the caller takes ownership and is responsible for closing it.
+
+    Args:
+        name: The name of the shared-memory segment.
+        nbytes: The size of the segment in bytes.
+
+    Returns:
+        An independent ``mmap.mmap`` object backed by the segment.
+
+    Raises:
+        OSError: If the segment cannot be opened or mapped.
+    """
+    sm_name = _strip_leading_slash(name)
+    fd = _posixshmem.shm_open(_slashed(sm_name), os.O_RDWR, mode=0o600)
+    try:
+        return _mmap.mmap(fd, nbytes, access=_mmap.ACCESS_WRITE)
+    finally:
+        os.close(fd)
diff --git a/lmcache/v1/platform/cpu/__init__.py b/lmcache/v1/platform/cpu/__init__.py
index ea5769408b..bb2edd378a 100644
--- a/lmcache/v1/platform/cpu/__init__.py
+++ b/lmcache/v1/platform/cpu/__init__.py
@@ -1,7 +1,35 @@
 # SPDX-License-Identifier: Apache-2.0
 """CPU-specific platform primitives.
 
-This package will register a CPU KV-cache wrapper factory with
-:mod:`lmcache.v1.platform._registry` once the POSIX-SHM backend
-is available.
+Importing this package self-registers the POSIX-SHM KV-cache wrapper
+factory with :mod:`lmcache.v1.platform._registry`, so the dispatch
+in :mod:`lmcache.integration.vllm.vllm_multi_process_adapter` can
+pick the right wrapper based on ``tensor.device.type`` without any
+if/elif chain.
 """
+
+# Standard
+from typing import Any
+
+# Third Party
+import torch
+
+# First Party
+from lmcache.v1.platform._registry import register_kv_wrapper
+
+
+def _kv_wrapper_factory(tensor: torch.Tensor) -> Any:
+    """Indirect-dispatch wrapper.
+
+    Defers loading :mod:`lmcache.v1.platform.cpu.shm` (which pulls in
+    ``multiprocess.custom_types``) until first use, so importing this
+    package during ``lmcache/__init__.py``'s bootstrap does not race
+    other imports that touch ``torch_dev``.
+    """
+    # First Party
+    from lmcache.v1.platform.cpu.shm import migrate_to_shm_and_wrap
+
+    return migrate_to_shm_and_wrap(tensor)
+
+
+register_kv_wrapper("cpu", _kv_wrapper_factory)
diff --git a/lmcache/v1/platform/cpu/shm.py b/lmcache/v1/platform/cpu/shm.py
new file mode 100644
index 0000000000..7bbb702d5b
--- /dev/null
+++ b/lmcache/v1/platform/cpu/shm.py
@@ -0,0 +1,267 @@
+# SPDX-License-Identifier: Apache-2.0
+"""CPU-only KV-cache IPC wrapper backed by POSIX shared memory.
+
+Mirrors the GPU-mode CUDA-IPC zero-copy semantics for hosts without an
+accelerator: client and LMCache mp server map the **same** physical
+pages so transfers are pointer-shuffles rather than memcpys.
+
+Self-registers a ``"cpu"`` factory with
+:mod:`lmcache.v1.platform._registry` at import time, so the
+multiprocess adapter can dispatch by ``tensor.device.type`` without
+any if/elif chain.
+"""
+
+# Future
+from __future__ import annotations
+
+# Standard
+import ctypes
+import itertools
+import os
+import threading
+import weakref
+
+# Third Party
+import torch
+
+# First Party
+from lmcache.logging import init_logger
+from lmcache.v1.multiprocess.custom_types import CudaIPCWrapper
+from lmcache.v1.multiprocess.posix_shm import (
+    shm_create_readwrite,
+    shm_map_readwrite,
+    shm_munmap,
+    shm_unlink,
+)
+
+logger = init_logger(__name__)
+
+# Re-export POSIX-SHM primitives so existing callers keep working.
+# The canonical home is :mod:`lmcache.v1.multiprocess.posix_shm`; new
+# code (e.g. the MP non-GPU SHM transport) should import from there.
+__all__ = [
+    "CpuShmTensorWrapper",
+    "inject_stale_cache_entry_for_test",
+    "migrate_to_shm_and_wrap",
+    "shm_create_readwrite",
+    "shm_map_readwrite",
+    "shm_munmap",
+    "shm_unlink",
+]
+
+# ---------------------------------------------------------------------------
+# Wrapper class                                                             #
+# ---------------------------------------------------------------------------
+
+
+class CpuShmTensorWrapper(CudaIPCWrapper):
+    """IPC wrapper for CPU tensors backed by POSIX shared memory.
+
+    Used by the ``lmcache bench kvcache --mode cpu`` path and the
+    vLLM CPU integration so that the client and the LMCache mp server
+    map the **same** physical pages for the KV cache, mirroring the
+    GPU-mode CUDA-IPC zero-copy semantics.
+
+    Subclassing :class:`CudaIPCWrapper` is load-bearing for the same
+    reason :class:`RawCudaIPCWrapper` does it: msgspec does not
+    support unions of custom ext-encoded types, so all wire-level
+    KV-cache wrappers must share the single ext code (1) registered
+    for ``CudaIPCWrapper``. Pickle preserves the subclass identity
+    so ``to_tensor`` dispatches correctly on both sides.
+    """
+
+    # POSIX shared-memory name (``/lmcache_...``) -- leading ``/`` is
+    # required by ``shm_open(3)`` on both Linux and macOS.
+    SHM_NAME_PREFIX = "/lmcache_kv_"
+
+    def __init__(self, tensor: torch.Tensor, shm_name: str) -> None:
+        if tensor.device.type != "cpu":
+            raise ValueError(
+                "CpuShmTensorWrapper requires a CPU tensor, got %s" % tensor.device
+            )
+        if not tensor.is_contiguous():
+            raise ValueError("CpuShmTensorWrapper requires a contiguous tensor")
+
+        self.shm_name = shm_name
+        # ``numel * element_size`` is the correct logical byte size; the
+        # underlying storage may be larger when the tensor is a view.
+        self.nbytes = tensor.numel() * tensor.element_size()
+
+        # CudaIPCWrapper interface fields. ``handle`` / ``device_uuid``
+        # are unused on the CPU path but kept to satisfy the parent
+        # contract used by equality checks.
+        self.handle = None
+        self.dtype = tensor.dtype
+        self.shape = tuple(tensor.shape)
+        self.stride = tuple(tensor.stride())
+        self.storage_offset = int(tensor.storage_offset())
+        self.device_uuid = "cpu"
+
+    def to_tensor(self) -> torch.Tensor:
+        """Reconstruct the tensor by mapping the same SHM segment.
+
+        The returned tensor owns the mmap: a ``weakref.finalize`` hook
+        runs ``munmap`` once the tensor (and any views derived from it)
+        is garbage-collected, so the per-process virtual address space
+        does not leak across repeated ``to_tensor`` calls.
+
+        We rebuild the view through ``as_strided`` so the original
+        memory layout (stride / storage_offset / memory_format) is
+        replayed faithfully on the receiving side; reshape would
+        silently re-coalesce strides and lose, e.g., channels_last.
+        """
+        # Empty tensors carry no SHM segment (mmap with length 0 is
+        # undefined / EINVAL on POSIX); rebuild the empty view in-process.
+        if self.nbytes == 0:
+            return torch.empty(self.shape, dtype=self.dtype)
+        addr = shm_map_readwrite(self.shm_name, self.nbytes)
+        # ``torch.frombuffer`` requires a writable buffer; build one
+        # via ctypes so the resulting torch tensor shares storage
+        # with the SHM mapping (zero copy across processes).
+        buf_type = ctypes.c_uint8 * self.nbytes
+        buf = buf_type.from_address(addr)
+        flat = torch.frombuffer(buf, dtype=torch.uint8)
+        typed = flat.view(self.dtype)
+        out = torch.as_strided(typed, self.shape, self.stride, self.storage_offset)
+        # Keep ``flat`` alive for the lifetime of ``out`` so its mmap
+        # is not released while still in use, then munmap on cleanup.
+        out._lmcache_shm_buf = flat  # type: ignore[attr-defined]
+        weakref.finalize(out, shm_munmap, addr, self.nbytes)
+        return out
+
+
+# ---------------------------------------------------------------------------
+# Migrate-and-wrap factory (used by the multiprocess adapter)              #
+# ---------------------------------------------------------------------------
+
+# Per-process registry of SHM segments we have created, so the same
+# tensor object is only migrated to SHM once even if the factory is
+# called multiple times.
+#
+# Keyed by ``id(tensor)`` for cheap O(1) lookup, but each entry also
+# holds a ``weakref.ref`` to the original tensor and we *verify the
+# referent is still that exact object* before reusing the cached SHM
+# name. CPython recycles object IDs, so a fresh tensor allocated at
+# the same address as a previously migrated (now garbage-collected)
+# one would otherwise inherit a stale name -- and because
+# :func:`shm_create_readwrite` uses ``O_EXCL``, the next migration
+# would crash with ``EEXIST`` ("File exists"). The weakref-validated
+# lookup below makes that race impossible: a stale entry can only
+# point at a dead referent, which we treat as a miss.
+_CPU_SHM_NAMES: dict[int, tuple["weakref.ReferenceType[torch.Tensor]", str]] = {}
+_CPU_SHM_LOCK = threading.Lock()
+_CPU_SHM_COUNTER = itertools.count()
+
+
+def _cleanup_shm_segment(tid: int, shm_name: str, addr: int, nbytes: int) -> None:
+    """Release the mmap, unlink, and forget the cached SHM name."""
+    with _CPU_SHM_LOCK:
+        # Only drop the entry if it still points at *this* segment;
+        # a future tensor reusing ``tid`` may already have replaced it.
+        cached = _CPU_SHM_NAMES.get(tid)
+        if cached is not None and cached[1] == shm_name:
+            _CPU_SHM_NAMES.pop(tid, None)
+    shm_munmap(addr, nbytes)
+    shm_unlink(shm_name)
+
+
+def migrate_to_shm_and_wrap(tensor: torch.Tensor) -> CpuShmTensorWrapper:
+    """Re-point ``tensor``'s storage at a POSIX SHM segment, then wrap.
+
+    Used as the registered ``"cpu"`` KV-wrapper factory: the LMCache mp
+    server can mmap the same physical pages on the receiving side.
+    Idempotent per tensor identity (validated via a stored weakref so
+    Python's id-recycling cannot produce a stale-name hit). The SHM
+    segment is released (``munmap`` + ``shm_unlink``) automatically
+    when the migrated tensor is garbage-collected.
+    """
+    # First Party
+    from lmcache.v1.gpu_connector.utils import attempt_permute_to_contiguous_view
+
+    # Validate and normalise the tensor *before* touching the registry
+    # or mutating storage, so a bad input never leaves things half-done.
+    tensor = attempt_permute_to_contiguous_view(tensor)
+    if tensor.device.type != "cpu":
+        raise ValueError(
+            "migrate_to_shm_and_wrap requires a CPU tensor, got %s" % tensor.device
+        )
+    if not tensor.is_contiguous():
+        raise ValueError("migrate_to_shm_and_wrap requires a contiguous tensor")
+
+    tid = id(tensor)
+
+    # Fast path: check the registry under the lock, return early if the
+    # tensor has already been migrated.
+    with _CPU_SHM_LOCK:
+        cached = _CPU_SHM_NAMES.get(tid)
+        if cached is not None:
+            ref, cached_name = cached
+            if ref() is tensor:
+                return CpuShmTensorWrapper(tensor, cached_name)
+            # Stale entry from a GC'd tensor whose id has been
+            # reused; drop it and fall through to allocate fresh.
+        _CPU_SHM_NAMES.pop(tid, None)
+
+    nbytes = tensor.numel() * tensor.element_size()
+    assert tensor.storage_offset() == 0, (
+        "migrate_to_shm_and_wrap: SHM segment is sized to "
+        "numel*elem_size; a nonzero storage_offset would cause "
+        "OOB access. Got offset=%d" % tensor.storage_offset()
+    )
+    if nbytes == 0:
+        # No SHM segment for empty tensors: ``mmap`` with length 0
+        # is undefined / EINVAL on POSIX. ``to_tensor`` rebuilds an
+        # empty view directly when ``shm_name`` is empty.
+        return CpuShmTensorWrapper(tensor, "")
+
+    shm_name = "%s%d_%d" % (
+        CpuShmTensorWrapper.SHM_NAME_PREFIX,
+        os.getpid(),
+        next(_CPU_SHM_COUNTER),
+    )
+    # Perform the heavy work (syscall + tensor mutation) outside the lock
+    # to keep the critical section small.
+    addr = shm_create_readwrite(shm_name, nbytes)
+    try:
+        buf_type = ctypes.c_uint8 * nbytes
+        buf = buf_type.from_address(addr)
+        shm_storage = torch.frombuffer(buf, dtype=torch.uint8).untyped_storage()
+        tensor.set_(
+            shm_storage,
+            tensor.storage_offset(),
+            tensor.shape,
+            tensor.stride(),
+        )
+    except Exception:
+        # Make sure the SHM resources don't leak if migration fails
+        # part-way (e.g. ``set_`` rejects an unusual stride).
+        shm_munmap(addr, nbytes)
+        shm_unlink(shm_name)
+        raise
+
+    with _CPU_SHM_LOCK:
+        _CPU_SHM_NAMES[tid] = (weakref.ref(tensor), shm_name)
+    weakref.finalize(tensor, _cleanup_shm_segment, tid, shm_name, addr, nbytes)
+    logger.info(
+        "Migrated CPU KV cache tensor (nbytes=%d) to SHM %s",
+        nbytes,
+        shm_name,
+    )
+    return CpuShmTensorWrapper(tensor, shm_name)
+
+
+def inject_stale_cache_entry_for_test(
+    tensor: torch.Tensor,
+    dead_ref: "weakref.ReferenceType[torch.Tensor]",
+    stale_shm_name: str,
+) -> None:
+    """Test-only hook: pre-seed the registry with a stale entry.
+
+    Lets unit tests reproduce the CPython id-reuse race -- where a
+    fresh tensor lands on the same id as a previously migrated and
+    garbage-collected one -- without the per-test global-state
+    surgery that would otherwise have to reach into the module's
+    private dict / lock.
+    """
+    with _CPU_SHM_LOCK:
+        _CPU_SHM_NAMES[id(tensor)] = (dead_ref, stale_shm_name)
diff --git a/tests/v1/multiprocess/test_posix_shm.py b/tests/v1/multiprocess/test_posix_shm.py
new file mode 100644
index 0000000000..a04497df50
--- /dev/null
+++ b/tests/v1/multiprocess/test_posix_shm.py
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for ``lmcache.v1.multiprocess.posix_shm``.
+
+Validates the POSIX-SHM primitives and the ``mmap``-based pool helper
+shared by SHM-based transports.
+"""
+
+# Standard
+import os
+
+# Third Party
+import pytest
+
+# First Party
+from lmcache.v1.multiprocess.posix_shm import (
+    shm_create_readwrite,
+    shm_map_readwrite,
+    shm_munmap,
+    shm_open_pool_as_mmap,
+    shm_unlink,
+)
+
+
+def _unique_name(tag: str) -> str:
+    # macOS shm_open caps names at 31 bytes incl. leading '/'.
+    return "/lmc_pshm_%s_%d" % (tag, os.getpid())
+
+
+def test_create_map_munmap_unlink_roundtrip():
+    name = _unique_name("rt")
+    addr = shm_create_readwrite(name, 4096)
+    try:
+        assert addr not in (0, None)
+        # Map again from a fresh address: same segment, different vaddr.
+        addr2 = shm_map_readwrite(name, 4096)
+        try:
+            assert addr2 not in (0, None)
+        finally:
+            shm_munmap(addr2, 4096)
+    finally:
+        shm_munmap(addr, 4096)
+        shm_unlink(name)
+
+
+def test_create_excl_collision():
+    name = _unique_name("excl")
+    addr = shm_create_readwrite(name, 4096)
+    try:
+        with pytest.raises(OSError):
+            shm_create_readwrite(name, 4096)
+    finally:
+        shm_munmap(addr, 4096)
+        shm_unlink(name)
+
+
+def test_open_pool_as_mmap_zero_copy_view():
+    name = _unique_name("pool")
+    nbytes = 4096
+    addr = shm_create_readwrite(name, nbytes)
+    try:
+        mm = shm_open_pool_as_mmap(name, nbytes)
+        try:
+            mm[0:4] = b"\x01\x02\x03\x04"
+            mm2 = shm_open_pool_as_mmap(name, nbytes)
+            try:
+                assert bytes(mm2[0:4]) == b"\x01\x02\x03\x04"
+            finally:
+                mm2.close()
+        finally:
+            mm.close()
+    finally:
+        shm_munmap(addr, nbytes)
+        shm_unlink(name)
+
+
+def test_munmap_no_op_on_zero_addr():
+    # Should not crash; best-effort no-op.
+    shm_munmap(0, 4096)
diff --git a/tests/v1/platform/__init__.py b/tests/v1/platform/__init__.py
new file mode 100644
index 0000000000..9881313609
--- /dev/null
+++ b/tests/v1/platform/__init__.py
@@ -0,0 +1 @@
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/v1/platform/test_cpu_shm.py b/tests/v1/platform/test_cpu_shm.py
new file mode 100644
index 0000000000..65a52cedae
--- /dev/null
+++ b/tests/v1/platform/test_cpu_shm.py
@@ -0,0 +1,234 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for ``lmcache.v1.platform.cpu.shm``.
+
+Validates that the POSIX-SHM-backed wrapper can round-trip a CPU
+tensor in-process: the constructed wrapper's ``to_tensor()`` view
+sees writes made through the original tensor.
+"""
+
+# Standard
+import os
+
+# Third Party
+import pytest
+import torch
+
+# First Party
+from lmcache.v1.multiprocess.posix_shm import shm_unlink
+from lmcache.v1.platform.cpu.shm import (
+    CpuShmTensorWrapper,
+    migrate_to_shm_and_wrap,
+    shm_create_readwrite,
+)
+
+
+def test_shm_create_unlink_roundtrip():
+    """``shm_create_readwrite`` succeeds and ``shm_unlink`` cleans up."""
+    name = "/lmcache_test_%d" % os.getpid()
+    addr = shm_create_readwrite(name, 4096)
+    try:
+        assert addr not in (0, None)
+    finally:
+        shm_unlink(name)
+
+
+def test_migrate_to_shm_and_wrap_zero_copy_view():
+    """After migrate, writes via the original tensor are visible via wrapper."""
+    src = torch.zeros((2, 4, 4), dtype=torch.float32)
+    wrapper = migrate_to_shm_and_wrap(src)
+    try:
+        assert isinstance(wrapper, CpuShmTensorWrapper)
+        assert wrapper.shape == (2, 4, 4)
+        assert wrapper.dtype == torch.float32
+        # Mutate via the migrated source tensor; its storage is now the
+        # SHM segment, so the wrapper's reconstructed view must see it.
+        src.add_(7.0)
+        view = wrapper.to_tensor()
+        assert torch.equal(view, src)
+    finally:
+        shm_unlink(wrapper.shm_name)
+
+
+def test_migrate_handles_empty_tensor():
+    """Empty tensors must not call ``mmap`` (length 0 is EINVAL).
+
+    Regression for the case where ``nbytes == 0``: the wrapper carries
+    an empty ``shm_name`` and ``to_tensor`` rebuilds the empty view in
+    process without touching POSIX shared memory.
+    """
+    src = torch.empty((0, 4), dtype=torch.float32)
+    wrapper = migrate_to_shm_and_wrap(src)
+    assert isinstance(wrapper, CpuShmTensorWrapper)
+    assert wrapper.shm_name == ""
+    assert wrapper.nbytes == 0
+    view = wrapper.to_tensor()
+    assert view.shape == (0, 4)
+    assert view.dtype == torch.float32
+
+
+def test_migrate_is_idempotent_on_same_tensor():
+    """Re-wrapping the same tensor reuses the existing SHM segment."""
+    src = torch.zeros((3, 5), dtype=torch.float32)
+    w1 = migrate_to_shm_and_wrap(src)
+    try:
+        w2 = migrate_to_shm_and_wrap(src)
+        assert w1.shm_name == w2.shm_name
+    finally:
+        shm_unlink(w1.shm_name)
+
+
+def test_rejects_non_cpu_tensor():
+    """Construction rejects tensors that are not on CPU."""
+    if not torch.backends.mps.is_available():
+        pytest.skip("MPS not available; cannot synthesize a non-cpu tensor")
+    src = torch.zeros((2, 2), device="mps")
+    with pytest.raises(ValueError, match="CPU tensor"):
+        CpuShmTensorWrapper(src, "/lmcache_test_should_not_exist")
+
+
+def test_migrate_finalizer_unlinks_on_gc():
+    """Once the migrated tensor is GC-ed, its SHM segment is unlinked."""
+    # Standard
+    import gc
+
+    # First Party
+    from lmcache.v1.platform.cpu.shm import shm_map_readwrite
+
+    src = torch.zeros((2, 2), dtype=torch.float32)
+    w = migrate_to_shm_and_wrap(src)
+    name = w.shm_name
+    nbytes = w.nbytes
+    # Drop both references; the weakref.finalize hook should unlink.
+    del src, w
+    gc.collect()
+    with pytest.raises(OSError):
+        shm_map_readwrite(name, nbytes)
+
+
+def test_shm_create_cleans_up_on_existing_name():
+    """If ``shm_open(O_EXCL)`` fails the helper must not leave the fd open.
+
+    We exercise the failure path by creating a segment, then asking
+    ``shm_create_readwrite`` to recreate the same name -- it must
+    raise without leaking the file descriptor it briefly held.
+    """
+    name = "/lmcache_test_excl_%d" % os.getpid()
+    addr = shm_create_readwrite(name, 4096)
+    try:
+        with pytest.raises(OSError):
+            shm_create_readwrite(name, 4096)
+    finally:
+        shm_unlink(name)
+    # And after unlink, the name is reusable again.
+    addr2 = shm_create_readwrite(name, 4096)
+    assert addr2 not in (0, None)
+    shm_unlink(name)
+    _ = addr  # silence unused-variable hint
+
+
+def test_to_tensor_view_carries_munmap_finalizer():
+    """``to_tensor`` returns a tensor that releases its mmap on GC."""
+    # Standard
+    import gc
+    import weakref
+
+    src = torch.zeros((2, 2), dtype=torch.float32)
+    w = migrate_to_shm_and_wrap(src)
+    try:
+        view = w.to_tensor()
+        # The view must keep ``flat`` alive so its mmap stays valid.
+        assert hasattr(view, "_lmcache_shm_buf")
+        ref = weakref.ref(view)
+        del view
+        gc.collect()
+        assert ref() is None
+    finally:
+        del src
+        gc.collect()
+        shm_unlink(w.shm_name)
+
+
+def test_to_tensor_replays_stride_and_storage_offset():
+    """``to_tensor`` rebuilds the view via stride+offset (not reshape)."""
+    src = torch.arange(24, dtype=torch.float32).reshape(2, 3, 4).contiguous()
+    w = migrate_to_shm_and_wrap(src)
+    try:
+        view = w.to_tensor()
+        assert tuple(view.stride()) == w.stride
+        assert int(view.storage_offset()) == w.storage_offset
+        assert torch.equal(view, src)
+    finally:
+        del src, view
+        shm_unlink(w.shm_name)
+
+
+def test_wrap_kv_caches_unlinks_partial_batch_on_failure(monkeypatch):
+    """If wrapping the N-th tensor raises, earlier SHM names are unlinked.
+
+    Drives :func:`wrap_kv_caches` with two CPU tensors and forces the
+    second factory call to raise; the first iteration's SHM segment
+    must be ``shm_unlink``-ed so the named segment does not outlive
+    the failed batch.
+    """
+    # First Party
+    from lmcache.integration.vllm import vllm_multi_process_adapter as adapter
+    from lmcache.v1.platform.cpu.shm import shm_map_readwrite
+
+    real_wrap = adapter.wrap_one_kv_cache
+    state = {"n": 0, "first_name": None}
+
+    def flaky_wrap(tensor):
+        state["n"] += 1
+        if state["n"] == 2:
+            raise RuntimeError("simulated migration failure")
+        w = real_wrap(tensor)
+        state["first_name"] = w.shm_name
+        return w
+
+    monkeypatch.setattr(adapter, "wrap_one_kv_cache", flaky_wrap)
+
+    t1 = torch.zeros((2, 2), dtype=torch.float32)
+    t2 = torch.zeros((2, 2), dtype=torch.float32)
+    with pytest.raises(RuntimeError, match="simulated migration failure"):
+        adapter.wrap_kv_caches({"a": t1, "b": t2})
+
+    # The first iteration's SHM segment must no longer be openable.
+    nbytes = t1.numel() * t1.element_size()
+    with pytest.raises(OSError):
+        shm_map_readwrite(state["first_name"], nbytes)
+
+
+def test_migrate_ignores_stale_entry_from_id_reuse():
+    """A cached entry whose weakref is dead must not be reused.
+
+    Simulates CPython recycling an object id by injecting a stale
+    ``(dead_ref, old_name)`` tuple keyed by the live tensor's id,
+    then calling :func:`migrate_to_shm_and_wrap`. The factory must
+    treat the dead entry as a miss and allocate a fresh SHM segment
+    -- if it blindly reused the cached name, ``shm_create_readwrite``
+    would crash with ``EEXIST`` (and even worse, the fresh tensor
+    would be silently bound to the wrong SHM name).
+    """
+    # Standard
+    import gc
+    import weakref as _wr
+
+    # First Party
+    from lmcache.v1.platform.cpu.shm import inject_stale_cache_entry_for_test
+
+    # Build a tensor we will let die so we have a guaranteed-dead ref.
+    ghost = torch.zeros((1,), dtype=torch.float32)
+    dead_ref = _wr.ref(ghost)
+    del ghost
+    gc.collect()
+    assert dead_ref() is None
+
+    live = torch.zeros((2, 2), dtype=torch.float32)
+    stale_name = "/lmcache_test_stale_%d" % os.getpid()
+    inject_stale_cache_entry_for_test(live, dead_ref, stale_name)
+
+    w = migrate_to_shm_and_wrap(live)
+    try:
+        assert w.shm_name != stale_name
+    finally:
+        shm_unlink(w.shm_name)

From 954abb4aea8397589544e5f72f21116b98d712f0 Mon Sep 17 00:00:00 2001
From: Tony Lin <tony.lin@intel.com>
Date: Mon, 8 Jun 2026 15:36:34 +0800
Subject: [PATCH 02/57] [Refactor]: Normalize flat/nested block_ids in
 flat_block_ids and connector __str__ (#3577)

Normalize flat/nested block_ids in flat_block_ids and connector __str__

Older vLLM connectors emit a flat list[int] for the single non-hybrid
group, while newer ones use nested list[list[int]]. Make flat_block_ids
and the three LMCacheMPConnectorMetadata.__str__ paths tolerate both,
matching the normalization already done in expand_block_ids_to_views().

Signed-off-by: Tony Lin <tony.lin@intel.com>
---
 lmcache/integration/vllm/lmcache_mp_connector.py   |  2 +-
 .../integration/vllm/lmcache_mp_connector_0180.py  |  2 +-
 .../integration/vllm/lmcache_mp_connector_0201.py  |  2 +-
 .../integration/vllm/vllm_multi_process_adapter.py | 14 +++++++++++++-
 4 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/lmcache/integration/vllm/lmcache_mp_connector.py b/lmcache/integration/vllm/lmcache_mp_connector.py
index 27fa2de489..012e96fd2c 100644
--- a/lmcache/integration/vllm/lmcache_mp_connector.py
+++ b/lmcache/integration/vllm/lmcache_mp_connector.py
@@ -510,7 +510,7 @@ def __str__(self):
             request_strs.append(
                 f"RequestMetadata(request_id={req_meta.request_id}, "
                 f"direction={req_meta.direction}, "
-                f"num_blocks={len(req_meta.op.block_ids[0])}, "
+                f"num_blocks={len(req_meta.op.flat_block_ids)}, "
                 f"block_ids={req_meta.op.block_ids})"
             )
         return "[" + "\n".join(request_strs) + "]"
diff --git a/lmcache/integration/vllm/lmcache_mp_connector_0180.py b/lmcache/integration/vllm/lmcache_mp_connector_0180.py
index 61fe117a87..7fa46db945 100644
--- a/lmcache/integration/vllm/lmcache_mp_connector_0180.py
+++ b/lmcache/integration/vllm/lmcache_mp_connector_0180.py
@@ -433,7 +433,7 @@ def __str__(self):
             request_strs.append(
                 f"RequestMetadata(request_id={req_meta.request_id}, "
                 f"direction={req_meta.direction}, "
-                f"num_blocks={len(req_meta.op)}, "
+                f"num_blocks={len(req_meta.op.flat_block_ids)}, "
                 f"block_ids={req_meta.op.block_ids})"
             )
         return "[" + "\n".join(request_strs) + "]"
diff --git a/lmcache/integration/vllm/lmcache_mp_connector_0201.py b/lmcache/integration/vllm/lmcache_mp_connector_0201.py
index 1adc873587..6db28f5412 100644
--- a/lmcache/integration/vllm/lmcache_mp_connector_0201.py
+++ b/lmcache/integration/vllm/lmcache_mp_connector_0201.py
@@ -454,7 +454,7 @@ def __str__(self):
             request_strs.append(
                 f"RequestMetadata(request_id={req_meta.request_id}, "
                 f"direction={req_meta.direction}, "
-                f"num_blocks={len(req_meta.op)}, "
+                f"num_blocks={len(req_meta.op.flat_block_ids)}, "
                 f"block_ids={req_meta.op.block_ids})"
             )
         return "[" + "\n".join(request_strs) + "]"
diff --git a/lmcache/integration/vllm/vllm_multi_process_adapter.py b/lmcache/integration/vllm/vllm_multi_process_adapter.py
index 80dfde3717..c2fcb083a9 100644
--- a/lmcache/integration/vllm/vllm_multi_process_adapter.py
+++ b/lmcache/integration/vllm/vllm_multi_process_adapter.py
@@ -475,7 +475,19 @@ class LoadStoreOp:
 
     @property
     def flat_block_ids(self) -> list[int]:
-        """Return all block IDs flattened for group-blind error paths."""
+        """Return all block IDs flattened for group-blind error paths.
+
+        Handles both the normal ``list[list[int]]`` format and the
+        IPC-flattened ``list[int]`` format that vLLM v0.19.0 produces when
+        ``SchedulerOutput`` serializes single-element nested lists across
+        process boundaries (e.g. ``[[20, 21]]`` → ``[20, 21]``).
+        Returns an empty list when ``block_ids`` is empty.
+        """
+        if not self.block_ids:
+            return []
+        # Defend against IPC serialization flattening [[20, 21, …]] → [20, 21, …]
+        if isinstance(self.block_ids[0], int):
+            return list(self.block_ids)
         return [
             block_id
             for group_block_ids in self.block_ids

From 3a45d0f2fadaa590d4dc450d60d6d93be44691c2 Mon Sep 17 00:00:00 2001
From: feixiangpeng <155504520+feixiangpeng@users.noreply.github.com>
Date: Mon, 8 Jun 2026 13:28:45 -0500
Subject: [PATCH 03/57] Added HFbucket MP (#3263)

Signed-off-by: feixiangpeng <155504520+feixiangpeng@users.noreply.github.com>
---
 .../kv_cache/storage_backends/hfbucket.rst    |  84 ++
 docs/source/mp/l2_storage.rst                 |  59 ++
 .../l2_adapters/hfbucket_l2_adapter.py        | 898 ++++++++++++++++++
 .../distributed/test_hfbucket_l2_adapter.py   | 570 +++++++++++
 4 files changed, 1611 insertions(+)
 create mode 100644 lmcache/v1/distributed/l2_adapters/hfbucket_l2_adapter.py
 create mode 100644 tests/v1/distributed/test_hfbucket_l2_adapter.py

diff --git a/docs/source/kv_cache/storage_backends/hfbucket.rst b/docs/source/kv_cache/storage_backends/hfbucket.rst
index cd05342b51..2f5dc46da4 100644
--- a/docs/source/kv_cache/storage_backends/hfbucket.rst
+++ b/docs/source/kv_cache/storage_backends/hfbucket.rst
@@ -91,6 +91,90 @@ either ``hfbucket`` or an instance-qualified name such as ``hfbucket.prod``.
   existence and size metadata.
 
 
+MP Mode Configuration
+---------------------
+
+In multi-process (MP) mode, Hugging Face Buckets are configured as an L2
+adapter through a JSON spec passed to the LMCache server. This is separate from
+the non-MP ``remote_storage_plugins`` configuration above. Each
+``--l2-adapter`` argument takes a JSON object whose ``"type": "hfbucket"``
+field selects the HFBucket adapter.
+
+.. code-block:: json
+
+   {
+     "type": "hfbucket",
+     "bucket_handle": "hf://buckets/my-org/lmcache-kv/prod",
+     "token_env": "HF_TOKEN",
+     "create_bucket_if_missing": false,
+     "download_tmp_dir": "/tmp/lmcache-hfbucket-mp",
+     "metadata_cache_ttl_secs": 30,
+     "num_workers": 4,
+     "max_capacity_gb": 500,
+     "eviction": {
+       "eviction_policy": "LRU",
+       "trigger_watermark": 0.85,
+       "eviction_ratio": 0.2
+     }
+   }
+
+HFBucket L2 Adapter Fields
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* **type** (required): must be ``"hfbucket"``.
+* **bucket_handle** (required): Hugging Face Bucket handle in
+  ``hf://buckets/<namespace>/<bucket>[/<prefix>]`` format.
+* **token_env**: environment variable used to resolve the Hugging Face access
+  token (default ``"HF_TOKEN"``).
+* **token**: optional direct token fallback. ``token_env`` takes precedence
+  when the environment variable is set. Prefer ``token_env`` for production
+  deployments so secrets do not live in adapter JSON.
+* **create_bucket_if_missing**: lazily create the bucket on the first store
+  operation (default ``false``). This only helps when the bucket is missing and
+  the token has permission to create it; it does not fix invalid credentials,
+  invalid handles, or network failures.
+* **download_tmp_dir**: root directory for temporary load downloads (default
+  ``/tmp/lmcache-hfbucket-mp``). The MP adapter downloads bucket files into
+  per-task temporary files and then copies their bytes into the destination
+  ``MemoryObj`` buffers supplied by the MP controller.
+* **metadata_cache_ttl_secs**: TTL for cached exact path-size metadata (default
+  ``30``). Set this lower when another process may modify the same bucket
+  prefix outside LMCache and fresher metadata is more important than reducing
+  Hugging Face metadata calls.
+* **num_workers**: number of worker threads used for blocking Hugging Face Hub
+  bucket API calls (default ``4``). The HFBucket Python APIs are synchronous,
+  so MP mode runs upload, lookup, load, and delete work on a bounded thread
+  pool behind the adapter's eventfd-based completion interface.
+* **max_capacity_gb**: capacity used by ``get_usage()`` for watermark-based L2
+  eviction. Set to ``0`` (default) to disable aggregate capacity tracking;
+  ``get_usage()`` then reports the adapter as not providing an eviction signal.
+* **eviction**: optional sub-dict enabling the L2 eviction controller for this
+  adapter. When present, keys that are currently being loaded are protected by
+  the lookup-and-lock path and skipped by ``delete()`` until they are unlocked.
+
+Differences vs Non-MP HFBucket
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* Hugging Face bucket operations are synchronous but the adapter makes submission
+  non-blocking by running the blocking calls on worker threads.
+* MP loads do not allocate and return new memory. The MP controller provides
+  destination ``MemoryObj`` buffers, and the adapter copies downloaded bytes
+  into those buffers.
+* Keys are identified by ``ObjectKey`` (``model_name`` + ``kv_rank`` +
+  ``chunk_hash`` + optional ``cache_salt``) rather than ``CacheEngineKey``.
+  The serialized MP object name is
+  ``<model>@<kv_rank_hex>@<chunk_hash_hex>[@<cache_salt>]`` and is then
+  encoded for the bucket path. This naming is not compatible with the non-MP
+  HFBucket connector's ``CacheEngineKey`` object names, so a bucket prefix
+  populated by non-MP LMCache cannot be read directly by MP LMCache and vice
+  versa.
+* Full object writes are batch based. Hugging Face batch writes are not
+  transactional, so a failed store task may still leave some objects in the
+  bucket. The MP adapter reconciles backend metadata after such failures so
+  any objects that actually landed are counted for usage and later deletion 
+  (submitted store task is still reported as failed).
+
+
 Notes
 -----
 
diff --git a/docs/source/mp/l2_storage.rst b/docs/source/mp/l2_storage.rst
index d5f0f08d2e..0f5138c251 100644
--- a/docs/source/mp/l2_storage.rst
+++ b/docs/source/mp/l2_storage.rst
@@ -543,6 +543,59 @@ S3-compatible endpoint (MinIO, Ceph RGW, etc.).
     # Local MinIO over plain HTTP
     --l2-adapter '{"type": "s3", "s3_endpoint": "minio.local:9000", "s3_region": "us-east-1", "disable_tls": true, "aws_access_key_id": "minio", "aws_secret_access_key": "minio123"}'
 
+``hfbucket`` -- Hugging Face Buckets
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+An L2 adapter that stores KV cache objects in a `Hugging Face Bucket
+<https://huggingface.co/docs/hub/storage-backends>`_ using the
+``huggingface_hub`` bucket APIs.  Blocking Hub calls run on a bounded thread
+pool driven by an asyncio loop on a daemon thread, so the L2 controller thread
+is never blocked on network I/O.
+
+Object names are derived from the MP ``ObjectKey`` as
+``<model>@<kv_rank_hex>@<chunk_hash_hex>[@<cache_salt>]`` and then encoded with
+the standard HFBucket object-name encoding plus the optional bucket prefix.
+Because Hugging Face batch writes are not transactional, a store task that
+partially fails reconciles backend metadata so that any objects that actually
+landed are still counted for usage accounting and later deletion.
+
+This is a persistent remote backend best suited to warm and cold KV cache
+tiers; prefer a lower-latency local adapter for the hottest cache tier.
+
+**Required fields:**
+
+- ``bucket_handle``: Bucket location in the form
+  ``hf://buckets/<namespace>/<bucket>[/<prefix>]``.
+
+**Optional fields:**
+
+- ``token_env`` (string, default ``"HF_TOKEN"``): Environment variable used to
+  resolve the Hugging Face access token.
+- ``token`` (string): Direct token fallback used when ``token_env`` is unset.
+- ``create_bucket_if_missing`` (bool, default ``false``): Create the bucket
+  lazily on the first store instead of requiring it to exist.
+- ``download_tmp_dir`` (string): Root directory for temporary load downloads.
+- ``metadata_cache_ttl_secs`` (float, default ``30.0``): TTL for the
+  path-size metadata cache that backs lookups and usage accounting.
+- ``num_workers`` (int, default ``4``): Number of worker threads for blocking
+  Hugging Face Hub API calls.
+- ``max_capacity_gb`` (float, default ``0.0``): Aggregate capacity used by
+  ``get_usage()``.  A value of ``0`` disables aggregate eviction.
+- ``eviction`` (dict): Optional eviction policy, see ``L2AdapterConfigBase``.
+
+**Configuration examples:**
+
+.. code-block:: bash
+
+    # Minimal: use an existing bucket with a token from $HF_TOKEN
+    --l2-adapter '{"type": "hfbucket", "bucket_handle": "hf://buckets/my-org/lmcache-kv/prod"}'
+
+    # Create the bucket on first store and bound the worker pool
+    --l2-adapter '{"type": "hfbucket", "bucket_handle": "hf://buckets/my-org/lmcache-kv/prod", "create_bucket_if_missing": true, "num_workers": 8}'
+
+    # Enable aggregate eviction with a capacity cap
+    --l2-adapter '{"type": "hfbucket", "bucket_handle": "hf://buckets/my-org/lmcache-kv/prod", "max_capacity_gb": 50, "eviction": {"eviction_policy": "LRU", "trigger_watermark": 0.9, "eviction_ratio": 0.1}}'
+
 ``mock`` -- Mock adapter for testing
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -777,6 +830,12 @@ drops by ``eviction_ratio``.
        when ``max_capacity_gb`` is ``0`` (disabled); set a non-zero
        ``max_capacity_gb`` to enable the watermark-triggered eviction
        controller.
+   * - ``hfbucket``
+     - ``delete`` removes objects from the bucket and frees aggregate
+       byte accounting. ``get_usage`` reports ``usage_fraction == -1.0``
+       when ``max_capacity_gb`` is ``0`` (disabled); set a non-zero
+       ``max_capacity_gb`` to enable the watermark-triggered eviction
+       controller. Locked keys (in-flight loads) are skipped.
    * - ``dax``
      - Full support. ``delete`` removes unlocked keys from the in-memory
        index immediately and recycles fixed slots once active read borrows
diff --git a/lmcache/v1/distributed/l2_adapters/hfbucket_l2_adapter.py b/lmcache/v1/distributed/l2_adapters/hfbucket_l2_adapter.py
new file mode 100644
index 0000000000..b1cf97f576
--- /dev/null
+++ b/lmcache/v1/distributed/l2_adapters/hfbucket_l2_adapter.py
@@ -0,0 +1,898 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+Hugging Face Buckets L2 adapter for LMCache MP mode.
+"""
+
+# Future
+from __future__ import annotations
+
+# Standard
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Optional
+import asyncio
+import os
+import shutil
+import tempfile
+import threading
+import time
+
+if TYPE_CHECKING:
+    from lmcache.v1.distributed.internal_api import L1MemoryDesc
+
+# First Party
+from lmcache.logging import init_logger
+from lmcache.native_storage_ops import Bitmap
+from lmcache.v1.distributed.api import ObjectKey
+from lmcache.v1.distributed.internal_api import L2StoreResult
+from lmcache.v1.distributed.l2_adapters.base import (
+    L2AdapterInterface,
+    L2TaskId,
+)
+from lmcache.v1.distributed.l2_adapters.config import (
+    L2AdapterConfigBase,
+    register_l2_adapter_type,
+)
+from lmcache.v1.distributed.l2_adapters.factory import (
+    register_l2_adapter_factory,
+)
+from lmcache.v1.memory_management import MemoryObj
+from lmcache.v1.platform import create_event_notifier
+from lmcache.v1.storage_backend.connector.hfbucket_connector import (
+    HFBucketClient,
+    HFBucketClientInterface,
+    HFBucketLocation,
+    encode_hfbucket_object_name,
+    parse_hfbucket_handle,
+)
+
+logger = init_logger(__name__)
+
+# Use a separate temp root from non-MP HFBucket to avoid collisions.
+_DEFAULT_DOWNLOAD_TMP_DIR = Path(tempfile.gettempdir()) / "lmcache-hfbucket-mp"
+_METADATA_CACHE_PRUNE_INTERVAL = 128
+
+
+@dataclass(frozen=True)
+class _CachedObjectMetadata:
+    """Cached object size entry with expiration metadata."""
+
+    size_bytes: int
+    expires_at: float
+
+
+class _PartialStoreFailure(RuntimeError):
+    """Raised when a failed HFBucket batch store still wrote some objects."""
+
+    def __init__(
+        self,
+        message: str,
+        stored_keys: list[ObjectKey],
+        stored_sizes: list[int],
+    ) -> None:
+        super().__init__(message)
+        self.stored_keys = stored_keys
+        self.stored_sizes = stored_sizes
+
+
+def _object_key_to_string(key: ObjectKey) -> str:
+    """Serialize an MP ``ObjectKey`` to the shared L2 object-name format.
+
+    Unsalted keys use ``<model_name>@<kv_rank_hex>@<chunk_hash_hex>``. Salted
+    keys append ``@<cache_salt>`` so tenants/users with identical token chunks
+    do not collide in the backing bucket.
+    """
+    base = f"{key.model_name}@{key.kv_rank:08x}@{key.chunk_hash.hex()}"
+    if key.cache_salt:
+        return f"{base}@{key.cache_salt}"
+    return base
+
+
+def _object_key_to_bucket_path(key: ObjectKey, location: HFBucketLocation) -> str:
+    """Return the HFBucket object path for an MP object key."""
+    encoded = encode_hfbucket_object_name(_object_key_to_string(key))
+    if location.object_prefix:
+        return f"{location.object_prefix}/{encoded}"
+    return encoded
+
+
+def _resolve_hf_token(token_env: str, token: str | None) -> str | None:
+    """Resolve Hugging Face token from env-first adapter config."""
+    env_token = os.environ.get(token_env, "") if token_env else ""
+    if env_token:
+        return env_token
+    return token
+
+
+def _get_path_info_path(path_info: object) -> str:
+    """Read a Hugging Face path-info object's path field defensively."""
+    path = getattr(path_info, "path", "")
+    return path if isinstance(path, str) else ""
+
+
+def _get_path_info_type(path_info: object) -> str:
+    """Read a Hugging Face path-info object's type field defensively."""
+    obj_type = getattr(path_info, "type", "")
+    return obj_type if isinstance(obj_type, str) else ""
+
+
+def _get_path_info_size(path_info: object) -> int:
+    """Read a Hugging Face path-info object's size field defensively."""
+    size = getattr(path_info, "size", 0)
+    return size if isinstance(size, int) else 0
+
+
+def _is_not_found_error(exc: Exception) -> bool:
+    """Return whether an exception represents a missing bucket/object."""
+    response = getattr(exc, "response", None)
+    status_code = getattr(response, "status_code", None)
+    if isinstance(status_code, int):
+        return status_code == 404
+
+    direct_status_code = getattr(exc, "status_code", None)
+    if isinstance(direct_status_code, int):
+        return direct_status_code == 404
+
+    return "404" in str(exc)
+
+
+class HFBucketL2AdapterConfig(L2AdapterConfigBase):
+    """Configuration for the HFBucket MP L2 adapter.
+
+    Fields:
+    - ``bucket_handle``: ``hf://buckets/<namespace>/<bucket>[/<prefix>]``.
+    - ``token_env``: environment variable used to resolve the HF token.
+    - ``token``: optional direct token fallback.
+    - ``create_bucket_if_missing``: create the bucket lazily on first store.
+    - ``download_tmp_dir``: root directory for temporary load downloads.
+    - ``metadata_cache_ttl_secs``: TTL for path-size metadata cache.
+    - ``num_workers``: worker threads for blocking Hugging Face API calls.
+    - ``max_capacity_gb``: capacity used by inherited L2 usage accounting.
+    """
+
+    def __init__(
+        self,
+        bucket_handle: str,
+        token_env: str = "HF_TOKEN",
+        token: Optional[str] = None,
+        create_bucket_if_missing: bool = False,
+        download_tmp_dir: str = str(_DEFAULT_DOWNLOAD_TMP_DIR),
+        metadata_cache_ttl_secs: float = 30.0,
+        num_workers: int = 4,
+        max_capacity_gb: float = 0.0,
+    ) -> None:
+        self.bucket_handle = bucket_handle
+        self.bucket_location = parse_hfbucket_handle(bucket_handle)
+        self.token_env = token_env
+        self.token = token
+        self.create_bucket_if_missing = create_bucket_if_missing
+        self.download_tmp_dir = Path(download_tmp_dir)
+        self.metadata_cache_ttl_secs = metadata_cache_ttl_secs
+        self.num_workers = num_workers
+        self.max_capacity_gb = max_capacity_gb
+
+    @classmethod
+    def from_dict(cls, d: dict) -> "HFBucketL2AdapterConfig":
+        """Parse a config object from ``--l2-adapter`` JSON."""
+        bucket_handle = d.get("bucket_handle")
+        if not isinstance(bucket_handle, str) or not bucket_handle:
+            raise ValueError("bucket_handle must be a non-empty string")
+
+        token_env = d.get("token_env", "HF_TOKEN")
+        if not isinstance(token_env, str):
+            raise ValueError("token_env must be a string")
+
+        token = d.get("token")
+        if token is not None and not isinstance(token, str):
+            raise ValueError("token must be a string")
+
+        download_tmp_dir = d.get("download_tmp_dir", str(_DEFAULT_DOWNLOAD_TMP_DIR))
+        if not isinstance(download_tmp_dir, str) or not download_tmp_dir:
+            raise ValueError("download_tmp_dir must be a non-empty string")
+
+        metadata_cache_ttl_secs = d.get("metadata_cache_ttl_secs", 30.0)
+        if (
+            not isinstance(metadata_cache_ttl_secs, (int, float))
+            or isinstance(metadata_cache_ttl_secs, bool)
+            or metadata_cache_ttl_secs < 0
+        ):
+            raise ValueError("metadata_cache_ttl_secs must be a non-negative number")
+
+        num_workers = d.get("num_workers", 4)
+        if not isinstance(num_workers, int) or isinstance(num_workers, bool):
+            raise ValueError("num_workers must be a positive integer")
+        if num_workers <= 0:
+            raise ValueError("num_workers must be a positive integer")
+
+        max_capacity_gb = d.get("max_capacity_gb", 0.0)
+        if (
+            not isinstance(max_capacity_gb, (int, float))
+            or isinstance(max_capacity_gb, bool)
+            or max_capacity_gb < 0
+        ):
+            raise ValueError("max_capacity_gb must be a non-negative number")
+
+        create_bucket_if_missing = d.get("create_bucket_if_missing", False)
+        if not isinstance(create_bucket_if_missing, bool):
+            raise ValueError("create_bucket_if_missing must be a boolean")
+
+        cfg = cls(
+            bucket_handle=bucket_handle,
+            token_env=token_env,
+            token=token,
+            create_bucket_if_missing=create_bucket_if_missing,
+            download_tmp_dir=download_tmp_dir,
+            metadata_cache_ttl_secs=float(metadata_cache_ttl_secs),
+            num_workers=num_workers,
+            max_capacity_gb=float(max_capacity_gb),
+        )
+        cfg.eviction_config = cls._parse_eviction_config(d)
+        return cfg
+
+    @classmethod
+    def help(cls) -> str:
+        """Return CLI help text for this adapter type."""
+        return (
+            "HFBucket L2 adapter config fields:\n"
+            "- bucket_handle (str, required): "
+            "hf://buckets/<namespace>/<bucket>[/<prefix>]\n"
+            "- token_env (str): env var for HF token (default HF_TOKEN)\n"
+            "- token (str): direct token fallback\n"
+            "- create_bucket_if_missing (bool): create bucket on first store\n"
+            "- download_tmp_dir (str): temporary download root\n"
+            "- metadata_cache_ttl_secs (float): metadata cache TTL\n"
+            "- num_workers (int): blocking HF API worker threads\n"
+            "- max_capacity_gb (float): capacity for get_usage (0 = disabled)\n"
+            "- eviction (dict): optional, see L2AdapterConfigBase"
+        )
+
+
+class HFBucketL2Adapter(L2AdapterInterface):
+    """Hugging Face Buckets backed MP L2 adapter."""
+
+    def __init__(
+        self,
+        config: HFBucketL2AdapterConfig,
+        bucket_client: HFBucketClientInterface | None = None,
+    ) -> None:
+        super().__init__(max_capacity_bytes=int(config.max_capacity_gb * (1024**3)))
+        self._config = config
+        self._bucket_location = config.bucket_location
+        self._bucket_id = config.bucket_location.bucket_id
+        self._object_prefix = config.bucket_location.object_prefix
+        self._create_bucket_if_missing = config.create_bucket_if_missing
+        self._metadata_cache_ttl_secs = config.metadata_cache_ttl_secs
+
+        if bucket_client is None:
+            token = _resolve_hf_token(config.token_env, config.token)
+            self._bucket_client: HFBucketClientInterface = HFBucketClient(token=token)
+        else:
+            self._bucket_client = bucket_client
+
+        self._store_efd = create_event_notifier()
+        self._lookup_efd = create_event_notifier()
+        self._load_efd = create_event_notifier()
+
+        self._next_task_id: L2TaskId = 0
+        self._completed_store_tasks: dict[L2TaskId, L2StoreResult] = {}
+        self._completed_lookup_tasks: dict[L2TaskId, Bitmap] = {}
+        self._completed_load_tasks: dict[L2TaskId, Bitmap] = {}
+
+        self._locked_keys: dict[ObjectKey, int] = defaultdict(int)
+        self._key_sizes: dict[ObjectKey, int] = {}
+        self._metadata_cache: dict[str, _CachedObjectMetadata] = {}
+        self._metadata_cache_updates = 0
+
+        self._bucket_create_checked = False
+        self._bucket_create_lock = threading.Lock()
+
+        self._lock = threading.Lock()
+        self._closed = False
+
+        self._download_tmp_root = config.download_tmp_dir.expanduser()
+        self._download_tmp_root.mkdir(parents=True, exist_ok=True)
+        self._download_session_dir = Path(
+            tempfile.mkdtemp(
+                prefix="hfbucket-mp-",
+                dir=self._download_tmp_root,
+            )
+        )
+
+        self._executor = ThreadPoolExecutor(
+            max_workers=config.num_workers,
+            thread_name_prefix="hfbucket-l2",
+        )
+        self._loop = asyncio.new_event_loop()
+        self._loop_thread = threading.Thread(
+            target=self._run_event_loop,
+            daemon=True,
+            name="hfbucket-l2-adapter-loop",
+        )
+        self._loop_thread.start()
+
+        logger.info(
+            "Initialized HFBucketL2Adapter (bucket_id=%s prefix=%r "
+            "workers=%d max_capacity_gb=%.2f)",
+            self._bucket_id,
+            self._object_prefix,
+            config.num_workers,
+            config.max_capacity_gb,
+        )
+
+    def get_store_event_fd(self) -> int:
+        return self._store_efd.fileno()
+
+    def get_lookup_and_lock_event_fd(self) -> int:
+        return self._lookup_efd.fileno()
+
+    def get_load_event_fd(self) -> int:
+        return self._load_efd.fileno()
+
+    def submit_store_task(
+        self,
+        keys: list[ObjectKey],
+        objects: list[MemoryObj],
+    ) -> L2TaskId:
+        with self._lock:
+            task_id = self._get_next_task_id_locked()
+            if self._closed:
+                self._completed_store_tasks[task_id] = L2StoreResult(False, 0)
+                closed = True
+            else:
+                closed = False
+
+        if closed:
+            self._store_efd.notify()
+            return task_id
+
+        asyncio.run_coroutine_threadsafe(
+            self._execute_store(list(keys), list(objects), task_id),
+            self._loop,
+        )
+        return task_id
+
+    def pop_completed_store_tasks(self) -> dict[L2TaskId, L2StoreResult]:
+        with self._lock:
+            completed = self._completed_store_tasks
+            self._completed_store_tasks = {}
+        return completed
+
+    def submit_lookup_and_lock_task(self, keys: list[ObjectKey]) -> L2TaskId:
+        with self._lock:
+            task_id = self._get_next_task_id_locked()
+            if self._closed:
+                self._completed_lookup_tasks[task_id] = Bitmap(len(keys))
+                closed = True
+            else:
+                closed = False
+
+        if closed:
+            self._lookup_efd.notify()
+            return task_id
+
+        asyncio.run_coroutine_threadsafe(
+            self._execute_lookup(list(keys), task_id),
+            self._loop,
+        )
+        return task_id
+
+    def query_lookup_and_lock_result(self, task_id: L2TaskId) -> Optional[Bitmap]:
+        with self._lock:
+            return self._completed_lookup_tasks.pop(task_id, None)
+
+    def submit_unlock(self, keys: list[ObjectKey]) -> None:
+        with self._lock:
+            for key in keys:
+                if key not in self._locked_keys:
+                    continue
+                if self._locked_keys[key] <= 1:
+                    del self._locked_keys[key]
+                else:
+                    self._locked_keys[key] -= 1
+
+    def submit_load_task(
+        self,
+        keys: list[ObjectKey],
+        objects: list[MemoryObj],
+    ) -> L2TaskId:
+        with self._lock:
+            task_id = self._get_next_task_id_locked()
+            if self._closed:
+                self._completed_load_tasks[task_id] = Bitmap(len(keys))
+                closed = True
+            else:
+                closed = False
+
+        if closed:
+            self._load_efd.notify()
+            return task_id
+
+        asyncio.run_coroutine_threadsafe(
+            self._execute_load(list(keys), list(objects), task_id),
+            self._loop,
+        )
+        return task_id
+
+    def query_load_result(self, task_id: L2TaskId) -> Optional[Bitmap]:
+        with self._lock:
+            return self._completed_load_tasks.pop(task_id, None)
+
+    def delete(self, keys: list[ObjectKey]) -> None:
+        if not keys:
+            return
+
+        with self._lock:
+            if self._closed:
+                return
+            deletable = [key for key in keys if self._locked_keys.get(key, 0) == 0]
+
+        if not deletable:
+            return
+
+        future = asyncio.run_coroutine_threadsafe(
+            self._execute_delete(deletable),
+            self._loop,
+        )
+        try:
+            deleted_keys, deleted_sizes = future.result(timeout=30.0)
+        except Exception as exc:
+            logger.warning("HFBucketL2Adapter delete failed: %s", exc)
+            return
+
+        if deleted_keys:
+            self._notify_keys_deleted(deleted_keys, deleted_sizes)
+
+    def report_status(self) -> dict:
+        usage = self.get_usage()
+        with self._lock:
+            object_count = len(self._key_sizes)
+            locked_key_count = len(self._locked_keys)
+            closed = self._closed
+        return {
+            "is_healthy": self._loop_thread.is_alive() and not closed,
+            "type": "HFBucketL2Adapter",
+            "bucket_id": self._bucket_id,
+            "object_prefix": self._object_prefix,
+            "stored_object_count": object_count,
+            "locked_key_count": locked_key_count,
+            "current_size_bytes": usage.total_bytes_used,
+            "max_capacity_bytes": usage.total_capacity_bytes,
+        }
+
+    def close(self) -> None:
+        if self._closed:
+            return
+        self._closed = True
+
+        async def _stop_tasks() -> None:
+            tasks = [
+                task
+                for task in asyncio.all_tasks(self._loop)
+                if task is not asyncio.current_task()
+            ]
+            for task in tasks:
+                task.cancel()
+            if tasks:
+                await asyncio.gather(*tasks, return_exceptions=True)
+
+        if self._loop.is_running():
+            try:
+                asyncio.run_coroutine_threadsafe(_stop_tasks(), self._loop).result(
+                    timeout=5
+                )
+            except Exception:
+                pass
+            self._loop.call_soon_threadsafe(self._loop.stop)
+
+        self._loop_thread.join(timeout=5)
+        try:
+            self._loop.close()
+        except Exception:
+            pass
+
+        self._executor.shutdown(wait=True, cancel_futures=True)
+
+        self._store_efd.close()
+        self._lookup_efd.close()
+        self._load_efd.close()
+
+        with self._lock:
+            self._metadata_cache.clear()
+            self._key_sizes.clear()
+            self._locked_keys.clear()
+
+        shutil.rmtree(self._download_session_dir, ignore_errors=True)
+        logger.info("HFBucketL2Adapter closed")
+
+    def _run_event_loop(self) -> None:
+        asyncio.set_event_loop(self._loop)
+        self._loop.run_forever()
+
+    def _get_next_task_id_locked(self) -> L2TaskId:
+        task_id = self._next_task_id
+        self._next_task_id += 1
+        return task_id
+
+    async def _execute_store(
+        self,
+        keys: list[ObjectKey],
+        objects: list[MemoryObj],
+        task_id: L2TaskId,
+    ) -> None:
+        try:
+            stored_keys, stored_sizes = await self._loop.run_in_executor(
+                self._executor,
+                self._store_batch_sync,
+                keys,
+                objects,
+            )
+            success = True
+        except _PartialStoreFailure as exc:
+            logger.exception("HFBucketL2Adapter store task partially failed")
+            stored_keys = exc.stored_keys
+            stored_sizes = exc.stored_sizes
+            success = False
+        except Exception:
+            logger.exception("HFBucketL2Adapter store task failed")
+            stored_keys = []
+            stored_sizes = []
+            success = False
+
+        bytes_transferred = sum(stored_sizes)
+        with self._lock:
+            self._completed_store_tasks[task_id] = L2StoreResult(
+                success,
+                bytes_transferred,
+            )
+
+        if stored_keys:
+            self._notify_keys_stored(stored_keys, stored_sizes)
+        self._store_efd.notify()
+
+    async def _execute_lookup(
+        self,
+        keys: list[ObjectKey],
+        task_id: L2TaskId,
+    ) -> None:
+        bitmap = Bitmap(len(keys))
+        try:
+            sizes = await self._loop.run_in_executor(
+                self._executor,
+                self._resolve_object_sizes_sync,
+                keys,
+            )
+        except Exception:
+            logger.exception("HFBucketL2Adapter lookup task failed")
+            sizes = [0] * len(keys)
+
+        accessed: list[ObjectKey] = []
+        with self._lock:
+            for i, (key, size) in enumerate(zip(keys, sizes, strict=True)):
+                if size <= 0:
+                    continue
+                bitmap.set(i)
+                self._locked_keys[key] += 1
+                accessed.append(key)
+            self._completed_lookup_tasks[task_id] = bitmap
+
+        self._lookup_efd.notify()
+        if accessed:
+            self._notify_keys_accessed(accessed)
+
+    async def _execute_load(
+        self,
+        keys: list[ObjectKey],
+        objects: list[MemoryObj],
+        task_id: L2TaskId,
+    ) -> None:
+        try:
+            bitmap = await self._loop.run_in_executor(
+                self._executor,
+                self._load_batch_sync,
+                keys,
+                objects,
+            )
+        except Exception:
+            logger.exception("HFBucketL2Adapter load task failed")
+            bitmap = Bitmap(len(keys))
+
+        with self._lock:
+            self._completed_load_tasks[task_id] = bitmap
+        self._load_efd.notify()
+
+    async def _execute_delete(
+        self,
+        keys: list[ObjectKey],
+    ) -> tuple[list[ObjectKey], list[int]]:
+        return await self._loop.run_in_executor(
+            self._executor,
+            self._delete_batch_sync,
+            keys,
+        )
+
+    def _store_batch_sync(
+        self,
+        keys: list[ObjectKey],
+        objects: list[MemoryObj],
+    ) -> tuple[list[ObjectKey], list[int]]:
+        self._ensure_bucket_for_writes()
+
+        additions: list[tuple[bytes, str]] = []
+        indexed: list[tuple[ObjectKey, str, int]] = []
+        for key, obj in zip(keys, objects, strict=True):
+            object_path = _object_key_to_bucket_path(key, self._bucket_location)
+            data = memoryview(obj.byte_array).cast("B").tobytes()
+            additions.append((data, object_path))
+            indexed.append((key, object_path, len(data)))
+
+        if not additions:
+            return [], []
+
+        try:
+            self._bucket_client.upload_files(self._bucket_id, additions)
+        except Exception as exc:
+            # Hugging Face batch writes are not transactional: a request can
+            # write part of the batch and then fail. Fetch fresh backend
+            # metadata, update accounting for objects that really landed, and
+            # still report the submitted store task as failed.
+            reconciled_keys, reconciled_sizes = self._reconcile_failed_store(indexed)
+            raise _PartialStoreFailure(
+                "HFBucket batch upload failed after partial reconciliation",
+                reconciled_keys,
+                reconciled_sizes,
+            ) from exc
+
+        stored_keys: list[ObjectKey] = []
+        stored_sizes: list[int] = []
+        with self._lock:
+            for key, object_path, size in indexed:
+                was_new = key not in self._key_sizes
+                self._key_sizes[key] = size
+                self._set_cached_object_size_locked(object_path, size)
+                if was_new:
+                    stored_keys.append(key)
+                    stored_sizes.append(size)
+
+        return stored_keys, stored_sizes
+
+    def _resolve_object_sizes_sync(self, keys: list[ObjectKey]) -> list[int]:
+        object_paths = [
+            _object_key_to_bucket_path(key, self._bucket_location) for key in keys
+        ]
+
+        cached: dict[str, int] = {}
+        unresolved_paths: list[str] = []
+        with self._lock:
+            for object_path in object_paths:
+                cached_size = self._get_cached_object_size_locked(object_path)
+                if cached_size is None:
+                    unresolved_paths.append(object_path)
+                else:
+                    cached[object_path] = cached_size
+
+        if unresolved_paths:
+            fetched = self._fetch_object_sizes_sync(unresolved_paths)
+            with self._lock:
+                for object_path, size in fetched.items():
+                    self._set_cached_object_size_locked(object_path, size)
+            cached.update(fetched)
+
+        return [cached.get(object_path, 0) for object_path in object_paths]
+
+    def _fetch_object_sizes_sync(self, object_paths: list[str]) -> dict[str, int]:
+        if not object_paths:
+            return {}
+
+        try:
+            path_infos = self._bucket_client.get_paths_info(
+                self._bucket_id,
+                object_paths,
+            )
+        except Exception as exc:
+            if _is_not_found_error(exc):
+                return {object_path: 0 for object_path in object_paths}
+            raise
+
+        size_by_path: dict[str, int] = {}
+        for path_info in path_infos:
+            if _get_path_info_type(path_info) != "file":
+                continue
+            path = _get_path_info_path(path_info)
+            if path:
+                size_by_path[path] = _get_path_info_size(path_info)
+
+        return {
+            object_path: size_by_path.get(object_path, 0)
+            for object_path in object_paths
+        }
+
+    def _load_batch_sync(
+        self,
+        keys: list[ObjectKey],
+        objects: list[MemoryObj],
+    ) -> Bitmap:
+        bitmap = Bitmap(len(keys))
+        object_paths = [
+            _object_key_to_bucket_path(key, self._bucket_location) for key in keys
+        ]
+
+        batch_dir = Path(
+            tempfile.mkdtemp(prefix="load-", dir=self._download_session_dir)
+        )
+        local_paths: list[tuple[int, Path]] = []
+        files: list[tuple[str, str]] = []
+        for index, object_path in enumerate(object_paths):
+            local_path = batch_dir / f"{index}.bin"
+            local_paths.append((index, local_path))
+            files.append((object_path, str(local_path)))
+
+        try:
+            try:
+                self._bucket_client.download_files(self._bucket_id, files)
+            except Exception as exc:
+                if not _is_not_found_error(exc):
+                    logger.warning("Batch download from hfbucket raised: %s", exc)
+
+            for index, local_path in local_paths:
+                if not local_path.exists():
+                    continue
+
+                dst = memoryview(objects[index].byte_array).cast("B")
+                file_size = local_path.stat().st_size
+                if file_size != len(dst):
+                    logger.error(
+                        "Downloaded object %s has %d bytes, expected %d bytes; "
+                        "rejecting load",
+                        object_paths[index],
+                        file_size,
+                        len(dst),
+                    )
+                    with self._lock:
+                        self._set_cached_object_size_locked(
+                            object_paths[index],
+                            file_size,
+                        )
+                    continue
+
+                with local_path.open("rb") as f:
+                    bytes_read = f.readinto(dst)
+                if bytes_read != len(dst):
+                    logger.error(
+                        "Downloaded object %s read %d bytes, expected %d bytes; "
+                        "rejecting load",
+                        object_paths[index],
+                        bytes_read,
+                        len(dst),
+                    )
+                    with self._lock:
+                        self._set_cached_object_size_locked(
+                            object_paths[index],
+                            bytes_read,
+                        )
+                    continue
+
+                bitmap.set(index)
+                with self._lock:
+                    self._set_cached_object_size_locked(
+                        object_paths[index],
+                        file_size,
+                    )
+
+            return bitmap
+        finally:
+            shutil.rmtree(batch_dir, ignore_errors=True)
+
+    def _delete_batch_sync(
+        self,
+        keys: list[ObjectKey],
+    ) -> tuple[list[ObjectKey], list[int]]:
+        object_paths = [
+            _object_key_to_bucket_path(key, self._bucket_location) for key in keys
+        ]
+
+        try:
+            self._bucket_client.delete_files(self._bucket_id, object_paths)
+        except Exception as exc:
+            if not _is_not_found_error(exc):
+                raise
+
+        deleted_keys: list[ObjectKey] = []
+        deleted_sizes: list[int] = []
+        with self._lock:
+            for key, object_path in zip(keys, object_paths, strict=True):
+                size = self._key_sizes.pop(key, None)
+                self._set_cached_object_size_locked(object_path, 0)
+                deleted_keys.append(key)
+                deleted_sizes.append(size if size is not None else 0)
+
+        return deleted_keys, deleted_sizes
+
+    def _ensure_bucket_for_writes(self) -> None:
+        if not self._create_bucket_if_missing or self._bucket_create_checked:
+            return
+
+        with self._bucket_create_lock:
+            if self._bucket_create_checked:
+                return
+            self._bucket_client.create_bucket(self._bucket_id)
+            self._bucket_create_checked = True
+
+    def _refresh_cached_sizes(self, keys: list[ObjectKey]) -> None:
+        try:
+            self._resolve_object_sizes_sync(keys)
+        except Exception:
+            logger.debug("Failed to refresh hfbucket object sizes", exc_info=True)
+
+    def _reconcile_failed_store(
+        self,
+        indexed: list[tuple[ObjectKey, str, int]],
+    ) -> tuple[list[ObjectKey], list[int]]:
+        object_paths = [object_path for _, object_path, _ in indexed]
+        try:
+            sizes_by_path = self._fetch_object_sizes_sync(object_paths)
+        except Exception:
+            logger.debug("Failed to reconcile partial hfbucket store", exc_info=True)
+            return [], []
+
+        stored_keys: list[ObjectKey] = []
+        stored_sizes: list[int] = []
+        with self._lock:
+            for key, object_path, _expected_size in indexed:
+                size = sizes_by_path.get(object_path, 0)
+                self._set_cached_object_size_locked(object_path, size)
+                if size <= 0:
+                    continue
+
+                # Only notify net-new keys. Existing keys already contributed
+                # to byte accounting, and cache objects should be fixed size.
+                was_new = key not in self._key_sizes
+                self._key_sizes[key] = size
+                if was_new:
+                    stored_keys.append(key)
+                    stored_sizes.append(size)
+
+        return stored_keys, stored_sizes
+
+    def _get_cached_object_size_locked(self, object_path: str) -> int | None:
+        entry = self._metadata_cache.get(object_path)
+        if entry is None:
+            return None
+        if entry.expires_at <= time.monotonic():
+            self._metadata_cache.pop(object_path, None)
+            return None
+        return entry.size_bytes
+
+    def _set_cached_object_size_locked(self, object_path: str, size: int) -> None:
+        expires_at = time.monotonic() + self._metadata_cache_ttl_secs
+        self._metadata_cache[object_path] = _CachedObjectMetadata(
+            size_bytes=size,
+            expires_at=expires_at,
+        )
+        self._metadata_cache_updates += 1
+        if self._metadata_cache_updates % _METADATA_CACHE_PRUNE_INTERVAL == 0:
+            self._prune_expired_cache_entries_locked(time.monotonic())
+
+    def _prune_expired_cache_entries_locked(self, now: float) -> None:
+        expired = [
+            object_path
+            for object_path, entry in self._metadata_cache.items()
+            if entry.expires_at <= now
+        ]
+        for object_path in expired:
+            self._metadata_cache.pop(object_path, None)
+
+
+register_l2_adapter_type("hfbucket", HFBucketL2AdapterConfig)
+
+
+def _create_hfbucket_l2_adapter(
+    config: L2AdapterConfigBase,
+    l1_memory_desc: "Optional[L1MemoryDesc]" = None,
+) -> L2AdapterInterface:
+    """Create an HFBucket L2 adapter from registry config."""
+    return HFBucketL2Adapter(config)  # type: ignore[arg-type]
+
+
+register_l2_adapter_factory("hfbucket", _create_hfbucket_l2_adapter)
diff --git a/tests/v1/distributed/test_hfbucket_l2_adapter.py b/tests/v1/distributed/test_hfbucket_l2_adapter.py
new file mode 100644
index 0000000000..2e69c6bb74
--- /dev/null
+++ b/tests/v1/distributed/test_hfbucket_l2_adapter.py
@@ -0,0 +1,570 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Unit tests for the HFBucket MP L2 adapter."""
+
+# Standard
+from collections.abc import Sequence
+from dataclasses import dataclass
+from pathlib import Path
+import select
+import threading
+import time
+
+# Third Party
+import pytest
+import torch
+
+# First Party
+from lmcache.v1.distributed.api import ObjectKey
+from lmcache.v1.distributed.internal_api import L2AdapterListener
+from lmcache.v1.distributed.l2_adapters import hfbucket_l2_adapter as hfmod
+from lmcache.v1.distributed.l2_adapters.hfbucket_l2_adapter import (
+    HFBucketL2Adapter,
+    HFBucketL2AdapterConfig,
+    _object_key_to_bucket_path,
+    _object_key_to_string,
+)
+from lmcache.v1.memory_management import (
+    MemoryFormat,
+    MemoryObj,
+    MemoryObjMetadata,
+    TensorMemoryObj,
+)
+from lmcache.v1.platform import consume_fd
+from lmcache.v1.storage_backend.connector.hfbucket_connector import (
+    parse_hfbucket_handle,
+)
+
+_TEST_BUCKET_HANDLE = "hf://buckets/test-org/test-bucket/prod"
+_TEST_BUCKET_LOCATION = parse_hfbucket_handle(_TEST_BUCKET_HANDLE)
+
+
+@dataclass(frozen=True)
+class _FakePathInfo:
+    path: str
+    type: str
+    size: int
+
+
+class _FakeBucketClient:
+    """In-memory HFBucket client used by adapter unit tests."""
+
+    def __init__(self) -> None:
+        self.storage: dict[str, bytes] = {}
+        self.created_buckets: list[str] = []
+        self.deleted_paths: list[str] = []
+        self.fail_upload_after: int | None = None
+        self._lock = threading.Lock()
+
+    def create_bucket(self, bucket_id: str) -> None:
+        with self._lock:
+            self.created_buckets.append(bucket_id)
+
+    def bucket_info(self, bucket_id: str) -> object:
+        return {"bucket_id": bucket_id}
+
+    def get_paths_info(
+        self,
+        bucket_id: str,
+        paths: Sequence[str],
+    ) -> list[object]:
+        del bucket_id
+        with self._lock:
+            return [
+                _FakePathInfo(path=path, type="file", size=len(self.storage[path]))
+                for path in paths
+                if path in self.storage
+            ]
+
+    def list_tree(self, bucket_id: str, prefix: str) -> list[object]:
+        del bucket_id
+        with self._lock:
+            return [
+                _FakePathInfo(path=path, type="file", size=len(data))
+                for path, data in self.storage.items()
+                if not prefix or path.startswith(prefix)
+            ]
+
+    def upload_files(
+        self,
+        bucket_id: str,
+        add: Sequence[tuple[bytes, str]],
+    ) -> None:
+        del bucket_id
+        with self._lock:
+            for index, (data, path) in enumerate(add, start=1):
+                self.storage[path] = bytes(data)
+                if (
+                    self.fail_upload_after is not None
+                    and index >= self.fail_upload_after
+                ):
+                    raise RuntimeError("injected partial upload failure")
+
+    def download_files(
+        self,
+        bucket_id: str,
+        files: Sequence[tuple[str, str]],
+    ) -> None:
+        del bucket_id
+        with self._lock:
+            items = [
+                (remote, local, self.storage.get(remote)) for remote, local in files
+            ]
+
+        for _remote, local, data in items:
+            if data is None:
+                continue
+            Path(local).write_bytes(data)
+
+    def delete_files(
+        self,
+        bucket_id: str,
+        delete: Sequence[str],
+    ) -> None:
+        del bucket_id
+        with self._lock:
+            for path in delete:
+                self.deleted_paths.append(path)
+                self.storage.pop(path, None)
+
+    def contains(self, path: str) -> bool:
+        with self._lock:
+            return path in self.storage
+
+
+def create_object_key(chunk_id: int, model_name: str = "test/model") -> ObjectKey:
+    return ObjectKey(
+        chunk_hash=ObjectKey.IntHash2Bytes(chunk_id),
+        model_name=model_name,
+        kv_rank=0,
+    )
+
+
+def create_memory_obj(size: int = 16, fill_value: float = 1.0) -> MemoryObj:
+    raw_data = torch.empty(size, dtype=torch.float32)
+    raw_data.fill_(fill_value)
+    metadata = MemoryObjMetadata(
+        shape=torch.Size([size]),
+        dtype=torch.float32,
+        address=0,
+        phy_size=size * 4,
+        fmt=MemoryFormat.KV_2LTD,
+        ref_count=1,
+    )
+    return TensorMemoryObj(raw_data, metadata, parent_allocator=None)
+
+
+def bucket_path_for_key(key: ObjectKey) -> str:
+    return _object_key_to_bucket_path(key, _TEST_BUCKET_LOCATION)
+
+
+def wait_for_event_fd(event_fd: int, timeout: float = 5.0) -> bool:
+    poll = select.poll()
+    poll.register(event_fd, select.POLLIN)
+    events = poll.poll(timeout * 1000)
+    if events:
+        try:
+            consume_fd(event_fd)
+        except BlockingIOError:
+            pass
+        return True
+    return False
+
+
+@pytest.fixture
+def fake_client() -> _FakeBucketClient:
+    return _FakeBucketClient()
+
+
+@pytest.fixture
+def adapter(tmp_path: Path, fake_client: _FakeBucketClient):
+    cfg = HFBucketL2AdapterConfig(
+        bucket_handle=_TEST_BUCKET_HANDLE,
+        download_tmp_dir=str(tmp_path),
+        metadata_cache_ttl_secs=30,
+        num_workers=2,
+        max_capacity_gb=0.001,
+    )
+    adapter = HFBucketL2Adapter(cfg, bucket_client=fake_client)
+    yield adapter
+    adapter.close()
+
+
+class _RecordingListener(L2AdapterListener):
+    def __init__(self) -> None:
+        self.stored: list[list[ObjectKey]] = []
+        self.accessed: list[list[ObjectKey]] = []
+        self.deleted: list[list[ObjectKey]] = []
+
+    def on_l2_keys_stored(self, keys):
+        self.stored.append(list(keys))
+
+    def on_l2_keys_accessed(self, keys):
+        self.accessed.append(list(keys))
+
+    def on_l2_keys_deleted(self, keys):
+        self.deleted.append(list(keys))
+
+
+class TestObjectKeySerialization:
+    def test_format(self) -> None:
+        key = ObjectKey(
+            chunk_hash=b"\x00\x01\x02\x03",
+            model_name="llama",
+            kv_rank=255,
+        )
+        assert _object_key_to_string(key) == "llama@000000ff@00010203"
+
+    def test_cache_salt_appended(self) -> None:
+        base_key = ObjectKey(
+            chunk_hash=b"\x00\x01\x02\x03",
+            model_name="llama",
+            kv_rank=255,
+        )
+        salted = ObjectKey(
+            chunk_hash=b"\x00\x01\x02\x03",
+            model_name="llama",
+            kv_rank=255,
+            cache_salt="user-42",
+        )
+        assert _object_key_to_string(base_key) == "llama@000000ff@00010203"
+        assert _object_key_to_string(salted) == "llama@000000ff@00010203@user-42"
+        assert _object_key_to_string(base_key) != _object_key_to_string(salted)
+
+    def test_bucket_path_uses_prefix_and_encoding(self) -> None:
+        cfg = HFBucketL2AdapterConfig(bucket_handle=_TEST_BUCKET_HANDLE)
+        key = create_object_key(1)
+        path = _object_key_to_bucket_path(key, cfg.bucket_location)
+        assert path.startswith("prod/")
+        assert "/" not in path.removeprefix("prod/")
+
+
+class TestEventFdInterface:
+    def test_three_distinct_fds(self, adapter: HFBucketL2Adapter) -> None:
+        a = adapter.get_store_event_fd()
+        b = adapter.get_lookup_and_lock_event_fd()
+        c = adapter.get_load_event_fd()
+        assert a >= 0 and b >= 0 and c >= 0
+        assert len({a, b, c}) == 3
+
+
+class TestStoreLookupLoad:
+    def test_roundtrip_single_key(self, adapter: HFBucketL2Adapter) -> None:
+        key = create_object_key(1)
+        obj = create_memory_obj(fill_value=3.14)
+
+        tid = adapter.submit_store_task([key], [obj])
+        assert wait_for_event_fd(adapter.get_store_event_fd())
+        assert adapter.pop_completed_store_tasks()[tid].is_successful()
+
+        tid = adapter.submit_lookup_and_lock_task([key])
+        assert wait_for_event_fd(adapter.get_lookup_and_lock_event_fd())
+        bm = adapter.query_lookup_and_lock_result(tid)
+        assert bm is not None and bm.test(0) is True
+
+        dst = create_memory_obj(fill_value=0.0)
+        tid = adapter.submit_load_task([key], [dst])
+        assert wait_for_event_fd(adapter.get_load_event_fd())
+        bm = adapter.query_load_result(tid)
+        assert bm is not None and bm.test(0) is True
+        assert torch.allclose(dst.tensor, torch.full((16,), 3.14))
+
+    def test_partial_hits(self, adapter: HFBucketL2Adapter) -> None:
+        stored = [create_object_key(0), create_object_key(2)]
+        objs = [create_memory_obj(fill_value=float(i)) for i in range(2)]
+        adapter.submit_store_task(stored, objs)
+        wait_for_event_fd(adapter.get_store_event_fd())
+        adapter.pop_completed_store_tasks()
+
+        keys = [create_object_key(i) for i in range(4)]
+        tid = adapter.submit_lookup_and_lock_task(keys)
+        wait_for_event_fd(adapter.get_lookup_and_lock_event_fd())
+        bm = adapter.query_lookup_and_lock_result(tid)
+        assert bm is not None
+        assert bm.test(0) is True
+        assert bm.test(1) is False
+        assert bm.test(2) is True
+        assert bm.test(3) is False
+
+    def test_load_miss_returns_zero_bit(self, adapter: HFBucketL2Adapter) -> None:
+        key = create_object_key(99)
+        dst = create_memory_obj()
+        tid = adapter.submit_load_task([key], [dst])
+        wait_for_event_fd(adapter.get_load_event_fd())
+        bm = adapter.query_load_result(tid)
+        assert bm is not None and bm.test(0) is False
+
+    def test_load_size_mismatch_returns_zero_bit(
+        self,
+        adapter: HFBucketL2Adapter,
+        fake_client: _FakeBucketClient,
+    ) -> None:
+        key = create_object_key(7)
+        object_path = bucket_path_for_key(key)
+        fake_client.storage[object_path] = b"too-small"
+
+        dst = create_memory_obj()
+        tid = adapter.submit_load_task([key], [dst])
+        wait_for_event_fd(adapter.get_load_event_fd())
+        bm = adapter.query_load_result(tid)
+        assert bm is not None and bm.test(0) is False
+
+    def test_query_lookup_returns_none_after_pop(
+        self,
+        adapter: HFBucketL2Adapter,
+    ) -> None:
+        key = create_object_key(1)
+        tid = adapter.submit_lookup_and_lock_task([key])
+        wait_for_event_fd(adapter.get_lookup_and_lock_event_fd())
+        assert adapter.query_lookup_and_lock_result(tid) is not None
+        assert adapter.query_lookup_and_lock_result(tid) is None
+
+    def test_partial_store_failure_accounts_written_keys(
+        self,
+        adapter: HFBucketL2Adapter,
+        fake_client: _FakeBucketClient,
+    ) -> None:
+        fake_client.fail_upload_after = 1
+        keys = [create_object_key(0), create_object_key(1)]
+        objs = [create_memory_obj(), create_memory_obj()]
+
+        tid = adapter.submit_store_task(keys, objs)
+        assert wait_for_event_fd(adapter.get_store_event_fd())
+        assert not adapter.pop_completed_store_tasks()[tid].is_successful()
+
+        assert fake_client.contains(bucket_path_for_key(keys[0]))
+        assert not fake_client.contains(bucket_path_for_key(keys[1]))
+        assert adapter.get_usage().total_bytes_used == 64
+
+
+class TestEviction:
+    def _store(self, adapter: HFBucketL2Adapter, key: ObjectKey) -> None:
+        adapter.submit_store_task([key], [create_memory_obj()])
+        wait_for_event_fd(adapter.get_store_event_fd())
+        adapter.pop_completed_store_tasks()
+
+    def _lookup(self, adapter: HFBucketL2Adapter, key: ObjectKey):
+        tid = adapter.submit_lookup_and_lock_task([key])
+        wait_for_event_fd(adapter.get_lookup_and_lock_event_fd())
+        return adapter.query_lookup_and_lock_result(tid)
+
+    def test_delete_removes_key(
+        self,
+        adapter: HFBucketL2Adapter,
+        fake_client: _FakeBucketClient,
+    ) -> None:
+        key = create_object_key(1)
+        self._store(adapter, key)
+        object_path = bucket_path_for_key(key)
+        assert fake_client.contains(object_path)
+
+        adapter.delete([key])
+        assert not fake_client.contains(object_path)
+
+    def test_lock_blocks_delete(
+        self,
+        adapter: HFBucketL2Adapter,
+        fake_client: _FakeBucketClient,
+    ) -> None:
+        key = create_object_key(1)
+        self._store(adapter, key)
+        bm = self._lookup(adapter, key)
+        assert bm is not None and bm.test(0) is True
+
+        deletes_before = len(fake_client.deleted_paths)
+        adapter.delete([key])
+        assert len(fake_client.deleted_paths) == deletes_before
+
+        adapter.submit_unlock([key])
+        adapter.delete([key])
+        object_path = bucket_path_for_key(key)
+        assert not fake_client.contains(object_path)
+
+    def test_refcount_unlock(
+        self,
+        adapter: HFBucketL2Adapter,
+        fake_client: _FakeBucketClient,
+    ) -> None:
+        key = create_object_key(1)
+        self._store(adapter, key)
+        self._lookup(adapter, key)
+        self._lookup(adapter, key)
+
+        adapter.submit_unlock([key])
+        adapter.delete([key])
+        object_path = bucket_path_for_key(key)
+        assert fake_client.contains(object_path)
+
+        adapter.submit_unlock([key])
+        adapter.delete([key])
+        assert not fake_client.contains(object_path)
+
+    def test_delete_on_unknown_key(self, adapter: HFBucketL2Adapter) -> None:
+        adapter.delete([create_object_key(42)])
+
+
+class TestGetUsage:
+    def test_disabled_returns_minus_one(self, tmp_path: Path) -> None:
+        cfg = HFBucketL2AdapterConfig(
+            bucket_handle="hf://buckets/test-org/test-bucket",
+            download_tmp_dir=str(tmp_path),
+            max_capacity_gb=0.0,
+        )
+        adapter = HFBucketL2Adapter(cfg, bucket_client=_FakeBucketClient())
+        try:
+            usage = adapter.get_usage()
+            # 0/0 is defined as -1.0 to indicate disabled
+            assert usage.usage_fraction == -1.0
+            assert usage.total_bytes_used == 0
+            assert usage.total_capacity_bytes == 0
+        finally:
+            adapter.close()
+
+    def test_usage_grows_on_store_and_shrinks_on_delete(
+        self,
+        adapter: HFBucketL2Adapter,
+    ) -> None:
+        keys = [create_object_key(i) for i in range(4)]
+        objs = [create_memory_obj() for _ in range(4)]
+
+        adapter.submit_store_task(keys, objs)
+        wait_for_event_fd(adapter.get_store_event_fd())
+        adapter.pop_completed_store_tasks()
+
+        total = 4 * 64
+        capacity = int(0.001 * 1024**3)
+        usage = adapter.get_usage()
+        assert usage.total_bytes_used == total
+        assert usage.total_capacity_bytes == capacity
+        assert usage.usage_fraction == pytest.approx(total / capacity)
+
+        adapter.delete(keys)
+        usage = adapter.get_usage()
+        assert usage.total_bytes_used == 0
+        assert usage.usage_fraction == 0.0
+
+
+class TestListener:
+    def test_stored_accessed_and_deleted_fire(
+        self,
+        adapter: HFBucketL2Adapter,
+    ) -> None:
+        listener = _RecordingListener()
+        adapter.register_listener(listener)
+
+        key = create_object_key(1)
+        adapter.submit_store_task([key], [create_memory_obj()])
+        wait_for_event_fd(adapter.get_store_event_fd())
+        adapter.pop_completed_store_tasks()
+        time.sleep(0.05)
+        assert any(key in batch for batch in listener.stored)
+
+        tid = adapter.submit_lookup_and_lock_task([key])
+        wait_for_event_fd(adapter.get_lookup_and_lock_event_fd())
+        adapter.query_lookup_and_lock_result(tid)
+        time.sleep(0.05)
+        assert any(key in batch for batch in listener.accessed)
+        accessed_count = len(listener.accessed)
+
+        dst = create_memory_obj(fill_value=0.0)
+        tid = adapter.submit_load_task([key], [dst])
+        wait_for_event_fd(adapter.get_load_event_fd())
+        adapter.query_load_result(tid)
+        time.sleep(0.05)
+        assert len(listener.accessed) == accessed_count
+
+        adapter.submit_unlock([key])
+        adapter.delete([key])
+        assert any(key in batch for batch in listener.deleted)
+
+
+class TestConfig:
+    def test_from_dict_requires_bucket_handle(self) -> None:
+        with pytest.raises(ValueError):
+            HFBucketL2AdapterConfig.from_dict({"type": "hfbucket"})
+
+    def test_from_dict_parses_all_fields(self) -> None:
+        cfg = HFBucketL2AdapterConfig.from_dict(
+            {
+                "type": "hfbucket",
+                "bucket_handle": _TEST_BUCKET_HANDLE,
+                "token_env": "HF_TEST_TOKEN",
+                "token": "direct-token",
+                "create_bucket_if_missing": True,
+                "download_tmp_dir": "/tmp/hf",
+                "metadata_cache_ttl_secs": 12.5,
+                "num_workers": 8,
+                "max_capacity_gb": 2.5,
+            }
+        )
+        assert cfg.bucket_handle == _TEST_BUCKET_HANDLE
+        assert cfg.bucket_location.bucket_id == "test-org/test-bucket"
+        assert cfg.bucket_location.object_prefix == "prod"
+        assert cfg.token_env == "HF_TEST_TOKEN"
+        assert cfg.token == "direct-token"
+        assert cfg.create_bucket_if_missing is True
+        assert cfg.download_tmp_dir == Path("/tmp/hf")
+        assert cfg.metadata_cache_ttl_secs == 12.5
+        assert cfg.num_workers == 8
+        assert cfg.max_capacity_gb == 2.5
+
+    # strict boolean parsing
+    def test_from_dict_rejects_string_boolean(self) -> None:
+        with pytest.raises(ValueError, match="create_bucket_if_missing"):
+            HFBucketL2AdapterConfig.from_dict(
+                {
+                    "type": "hfbucket",
+                    "bucket_handle": _TEST_BUCKET_HANDLE,
+                    "create_bucket_if_missing": "false",
+                }
+            )
+
+    def test_help_nonempty(self) -> None:
+        assert isinstance(HFBucketL2AdapterConfig.help(), str)
+        assert "bucket_handle" in HFBucketL2AdapterConfig.help()
+
+
+class TestFactoryRegistration:
+    def test_create_l2_adapter_registers_hfbucket(
+        self,
+        monkeypatch,
+        tmp_path: Path,
+    ) -> None:
+        # First Party
+        from lmcache.v1.distributed.l2_adapters import create_l2_adapter
+
+        monkeypatch.setattr(
+            hfmod,
+            "HFBucketClient",
+            lambda token=None: _FakeBucketClient(),
+        )
+        cfg = HFBucketL2AdapterConfig.from_dict(
+            {
+                "type": "hfbucket",
+                "bucket_handle": _TEST_BUCKET_HANDLE,
+                "download_tmp_dir": str(tmp_path),
+                "num_workers": 1,
+            }
+        )
+        adapter = create_l2_adapter(cfg)
+        try:
+            assert isinstance(adapter, HFBucketL2Adapter)
+        finally:
+            adapter.close()
+
+
+class TestCleanup:
+    def test_close_cleans_temp_dir(
+        self,
+        tmp_path: Path,
+        fake_client: _FakeBucketClient,
+    ) -> None:
+        cfg = HFBucketL2AdapterConfig(
+            bucket_handle=_TEST_BUCKET_HANDLE,
+            download_tmp_dir=str(tmp_path),
+        )
+        adapter = HFBucketL2Adapter(cfg, bucket_client=fake_client)
+        assert list(tmp_path.iterdir())
+
+        adapter.close()
+
+        assert list(tmp_path.iterdir()) == []

From 07f68b27322ecaa884dd165db5ba99290ceb537b Mon Sep 17 00:00:00 2001
From: deng451e <57919305+deng451e@users.noreply.github.com>
Date: Mon, 8 Jun 2026 12:55:21 -0700
Subject: [PATCH 04/57]  [CB] Token-level matching + per-token slot scatter for
 non-block-aligned KV reuse (#3582)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [blend-v3] Token-level matching + per-token slot scatter for CB reuse

Match fingerprints at token stride (probe_stride=1) and scatter reused
KV with the per-token slot kernel (multi_layer_kv_transfer) instead of
matching/scattering at vLLM block granularity. This lets CacheBlend
reuse non-block-aligned matches, the common case for real workloads
where the shared body starts at an arbitrary token offset (a partial
vLLM block) rather than a chunk/block boundary.

- register_rope: probe_stride = 1 (find matches at any token offset)
- cb_unified_lookup: accept non-prefix matches at any cur_st (drop the
  chunk-alignment filter)
- cb_retrieve_pre_computed: per-token slot scatter of the full matched
  range. Partial vLLM blocks are written per slot, so matched and
  recomputed tokens sharing a block don't conflict. Removes the
  block-aligned drop checks and the now-dead whole-block scatter path.

Validated on prefix-suffix-tuner (non-block-aligned by construction):
~99% suffix hit, 3.91x TTFT vs full recompute, output matches the
full-recompute baseline. The slot kernel is bandwidth-bound and matches
the whole-block kernel's throughput (~700 GB/s), so no scatter overhead.

Signed-off-by: deng451e <838677410@qq.com>

* [blend-v3] Vectorize V3 matcher probe; drop obsolete probe stride

Token-level matching (probe_stride=1) had turned match_sub_sequence into
an O(tokens) pure-Python probe loop — ~5.7 ms at 32K context, ~7x the old
block-stride cost. Replace it with a vectorized direct-address probe
(numpy gather over all positions) plus a verify loop over only the
surviving hits; the table is sparse (TABLE_SIZE = 2^20 >> registered
chunks) so the hit set is tiny. This restores the base class's
vectorization that the V3 override had dropped, keeping full-hash
collision rejection.

Probe stride is now obsolete (we always scan every position), so the
_probe_stride field, ctor arg, and register_rope assignment are removed.

Matcher microbench (CPU, per lookup): 32K ctx 5.66 -> 0.83 ms (~7x),
20K 3.43 -> 0.52, 8K 1.39 -> 0.23 — back to the pre-token-scatter
block-stride baseline with full token-level matching. All 20
test_optimized_lookup_v3 tests pass.

Signed-off-by: deng451e <838677410@qq.com>

* update

Signed-off-by: deng451e <838677410@qq.com>

* update stale docstring

Signed-off-by: deng451e <838677410@qq.com>

---------

Signed-off-by: deng451e <838677410@qq.com>
---
 lmcache/v1/multiprocess/custom_types.py       |  13 +-
 lmcache/v1/multiprocess/modules/blend_v3.py   | 263 +++++++++---------
 lmcache/v1/multiprocess/protocols/blend_v3.py |  13 +-
 3 files changed, 141 insertions(+), 148 deletions(-)

diff --git a/lmcache/v1/multiprocess/custom_types.py b/lmcache/v1/multiprocess/custom_types.py
index af991f05fb..a177c705f2 100644
--- a/lmcache/v1/multiprocess/custom_types.py
+++ b/lmcache/v1/multiprocess/custom_types.py
@@ -376,16 +376,19 @@ class CBMatchResult:
 
 @dataclass
 class CBUnifiedLookupResult:
-    """Result of ``CB_UNIFIED_LOOKUP``: prefix lookup + non-prefix fingerprint
-    match, reconciled in one RPC.
+    """Resolved payload of ``CB_UNIFIED_LOOKUP``: prefix lookup + non-prefix
+    fingerprint match, reconciled in one RPC. The RPC returns ``None`` (not this)
+    while either leg's KV is still loading into L1; this type is sent only once
+    both are resident.
 
     Attributes:
         prefix_coverage_tokens: Contiguous prefix-cache coverage (L1+L2) in
             tokens — what the standard LOOKUP would report.
-        non_prefix_segments: Block-aligned matches outside the prefix coverage
+        non_prefix_segments: Fingerprint matches outside the prefix coverage
             (cur_st order), each carrying ``(old_st, old_ed, cur_st, cur_ed,
-            hash)``. Already sparse-prefetched, so the retrieve set equals the
-            prefetched set.
+            hash)``. Token-aligned (any offset, not block-aligned): the per-token
+            slot scatter handles them. Already resident in L1, so the retrieve
+            set equals the prefetched set.
     """
 
     prefix_coverage_tokens: int
diff --git a/lmcache/v1/multiprocess/modules/blend_v3.py b/lmcache/v1/multiprocess/modules/blend_v3.py
index 6905460026..20db4908a9 100644
--- a/lmcache/v1/multiprocess/modules/blend_v3.py
+++ b/lmcache/v1/multiprocess/modules/blend_v3.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Blend V3: paged-aware CacheBlend as an :class:`EngineModule`.
+"""Blend V3: paged-aware CacheBlend as an EngineModule.
 
-Plugs into the unified :class:`MPCacheEngine`; standard ``REGISTER_KV_CACHE``
-+ ``CB_REGISTER_ROPE_V3`` for setup; STORE wrapper registers fingerprints;
+Plugs into the unified MPCacheEngine; standard REGISTER_KV_CACHE +
+CB_REGISTER_ROPE_V3 for setup; STORE wrapper registers fingerprints;
 retrieve scatters into the request's paged blocks.
 """
 
@@ -39,7 +39,6 @@
 from lmcache.v1.multiprocess.engine_context import MPCacheEngineContext
 from lmcache.v1.multiprocess.engine_module import HandlerSpec, ThreadPoolType
 from lmcache.v1.multiprocess.gpu_context import GPUCacheContext
-from lmcache.v1.multiprocess.modules.blend import BlendTokenRangeMatcher
 from lmcache.v1.multiprocess.modules.gpu_transfer import GPUTransferModule
 from lmcache.v1.multiprocess.modules.lookup import LookupModule
 from lmcache.v1.multiprocess.protocol import RequestType
@@ -65,7 +64,7 @@ class _CBRopeState:
 
 @dataclass
 class _CBUnifiedJob:
-    """Per-request poll state for non-blocking ``cb_unified_lookup``.
+    """Per-request poll state for non-blocking cb_unified_lookup.
 
     Stashed across polls because the underlying status/found polls are
     consume-once.
@@ -82,17 +81,35 @@ class _CBUnifiedJob:
     found_uidx: set[int] | None = None  # stashed when the sparse poll completes
 
 
-class BlendTokenRangeMatcherV3(BlendTokenRangeMatcher):
-    """V3 matcher: full-hash collision rejection + block-aligned probe stride.
+class BlendTokenRangeMatcherV3:
+    """V3 matcher: token-level probe (any offset) + full-hash collision
+    rejection. Self-contained (does not inherit a base matcher)."""
 
-    Probes every ``probe_stride`` positions; lossless because retrieve drops
-    non-block-aligned ``cur_st`` anyway.
-    """
+    _TABLE_BITS: int = 20  # 2^20 ~ 1 M entries
+    _TABLE_SIZE: int = 1 << _TABLE_BITS
+    _BASE: np.uint64 = np.uint64(0x9E3779B97F4A7C15)  # Fibonacci-hashing const
+
+    def __init__(self, chunk_size: int = 256):
+        """Initialize the V3 matcher.
 
-    def __init__(self, chunk_size: int = 256, probe_stride: int = 16):
-        super().__init__(chunk_size)
+        Args:
+            chunk_size (int): Tokens per non-overlapping fingerprint chunk.
+        """
+        self.chunk_size = chunk_size
+        # poly_chunk_hash -> compact_chunk_id; -1 = empty
+        self._table_id = np.full(self._TABLE_SIZE, -1, dtype=np.int64)
+        self._mask = np.uint64(self._TABLE_SIZE - 1)
+        # compact_chunk_id -> caller token_hash (full bytes); None once evicted
+        self._chunk_token_hash: list[bytes | None] = []
+        # token_hash -> start position in its registered sequence
+        self._token_hash_to_start: dict[bytes, int] = {}
+        # compact_chunk_id -> table slot (reverse lookup for eviction)
+        self._compact_id_to_slot = np.full(self._TABLE_SIZE, -1, dtype=np.int64)
+        # token_hash -> compact_chunk_id (for eviction lookup)
+        self._token_hash_to_compact_id: dict[bytes, int] = {}
+        self._lock = threading.Lock()
+        # V3 addition: compact_chunk_id -> full poly hash, for collision reject.
         self._chunk_poly_hash: list[int] = []
-        self._probe_stride: int = probe_stride
 
     def on_new_token_hashes(
         self,
@@ -101,9 +118,23 @@ def on_new_token_hashes(
         start_chunk_idx: int = 0,
         position_offset: int = 0,
     ) -> None:
-        """Index non-overlapping chunks; ``start_chunk_idx=1`` skips pos-0
-        (handled by the standard prefix lookup); ``position_offset`` is
-        added to recorded positions for tail-slices."""
+        """Index a stored sequence's non-overlapping chunks into the matcher.
+
+        Records each new chunk's poly hash + start position so a later
+        match_sub_sequence can find it. Thread-safe (holds the matcher lock).
+
+        Args:
+            token_ids (list[int]): The stored sequence's token IDs.
+            token_hashes (list[bytes]): Per-chunk content hashes (one per
+                chunk), used as the dedup/eviction key.
+            start_chunk_idx (int): First chunk to index; 1 skips chunk 0 (the
+                standard prefix lookup owns it).
+            position_offset (int): Added to each recorded start position (for
+                indexing a tail-slice of a larger sequence).
+
+        Returns:
+            None.
+        """
         arr = np.array(token_ids, dtype=np.uint64)
         chunk_hashes = chunk_hash_windows_numba(arr, self.chunk_size, self._BASE)
         n = int(chunk_hashes.shape[0])
@@ -160,74 +191,66 @@ def match_sub_sequence(
         self,
         token_ids: list[int],
     ) -> list[CBMatchResult]:
-        """Probe rolling-hash array every ``probe_stride`` positions; skips
-        bucket-only collisions and evicted entries. One result per unique
-        match; ``cur_st`` is the first block-aligned hit."""
+        """Find every registered chunk reused anywhere in a query sequence.
+
+        Vectorized direct-address probe over all token positions, then a small
+        verify loop over the surviving hits (a full poly-hash check rejects
+        bucket collisions; evicted/unknown chunks are skipped). Thread-safe.
+
+        Args:
+            token_ids (list[int]): The query sequence's token IDs.
+
+        Returns:
+            list[CBMatchResult]: One result per unique reused chunk (cur_st
+            = its first query position, old_st = its stored position).
+            Empty if the query is shorter than one chunk or nothing matched.
+        """
         if len(token_ids) < self.chunk_size:
             return []
 
         arr = np.array(token_ids, dtype=np.uint64)
         rolling = rolling_hash_windows_numba(arr, self.chunk_size, self._BASE)
-        n_positions = int(rolling.shape[0])
 
         with self._lock:
             if not self._chunk_token_hash:
-                logger.info(
-                    "[match_probe] empty fingerprint table; n_tok=%d", len(token_ids)
-                )
                 return []
 
-            mask = int(self._mask)
-            stride = self._probe_stride
+            # Vectorized direct-address probe over all positions. The table is
+            # sparse (TABLE_SIZE >> registered chunks), so only true matches and
+            # a few bucket collisions reach the Python verify loop below.
+            cids_at_pos = self._table_id[rolling & self._mask]
+            hit_positions = np.nonzero(cids_at_pos >= 0)[0]
+
             seen_cids: set[int] = set()
             results: list[CBMatchResult] = []
-            n_probes = 0
-            n_table_hit = 0
-            n_collision = 0
-            n_evicted = 0
-            n_no_old_st = 0
-            for q_pos in range(0, n_positions, stride):
-                n_probes += 1
-                r = int(rolling[q_pos])
-                cid = int(self._table_id[r & mask])
-                if cid < 0 or cid in seen_cids:
-                    continue
-                n_table_hit += 1
-                if r != self._chunk_poly_hash[cid]:
-                    n_collision += 1
+            for pos in hit_positions:
+                pos = int(pos)
+                cid = int(cids_at_pos[pos])
+                if cid in seen_cids:
                     continue
+                if int(rolling[pos]) != self._chunk_poly_hash[cid]:
+                    continue  # bucket-only collision
                 th = self._chunk_token_hash[cid]
                 if th is None:
-                    n_evicted += 1
-                    continue
+                    continue  # evicted
                 old_st = self._token_hash_to_start.get(th)
                 if old_st is None:
-                    n_no_old_st += 1
                     continue
                 seen_cids.add(cid)
                 results.append(
                     CBMatchResult(
                         old_st=old_st,
                         old_ed=old_st + self.chunk_size,
-                        cur_st=q_pos,
-                        cur_ed=q_pos + self.chunk_size,
+                        cur_st=pos,
+                        cur_ed=pos + self.chunk_size,
                         hash=th,
                     )
                 )
             logger.info(
-                "[match_probe] n_tok=%d stride=%d n_probes=%d "
-                "table_hit=%d collisions=%d evicted=%d no_old_st=%d "
-                "→ matches=%d (sample old_st=%s cur_st=%s)",
+                "[match_probe] n_tok=%d table_hits=%d matches=%d",
                 len(token_ids),
-                stride,
-                n_probes,
-                n_table_hit,
-                n_collision,
-                n_evicted,
-                n_no_old_st,
+                len(hit_positions),
                 len(results),
-                [r.old_st for r in results[:3]],
-                [r.cur_st for r in results[:3]],
             )
             return results
 
@@ -386,8 +409,8 @@ def cb_register_rope(
         head_size: int,
         is_neox_style: bool,
     ) -> None:
-        """Bolt rope state onto an already-registered ``cache_contexts`` entry;
-        idempotent. ``REGISTER_KV_CACHE`` must precede this."""
+        """Bolt rope state onto an already-registered cache_contexts entry;
+        idempotent. REGISTER_KV_CACHE must precede this."""
         cache_contexts = self._gpu_transfer.cache_contexts
         if instance_id not in cache_contexts:
             raise ValueError(
@@ -429,31 +452,14 @@ def cb_register_rope(
         self._cb_gpu_contexts[instance_id] = gpu_context
         self._cb_gpu_context_meta[instance_id] = (entry.model_name, entry.world_size)
 
-        # Probe stride = ie block size; must divide chunk_size.
-        ie_logical_block_size = (
-            gpu_context.kv_layer_groups_manager.inference_engine_logical_block_size
-        )
-        if self._ctx.chunk_size % ie_logical_block_size == 0:
-            self._token_range_matcher._probe_stride = ie_logical_block_size
-        else:
-            logger.warning(
-                "CB matcher probe stride unchanged (%d): chunk_size %d is not "
-                "a multiple of inference_engine_logical_block_size %d.",
-                self._token_range_matcher._probe_stride,
-                self._ctx.chunk_size,
-                ie_logical_block_size,
-            )
-
         logger.info(
             "Registered CB rope state for instance %d "
-            "(cos_sin_cache shape=%s dtype=%s, head_size=%d, is_neox=%s, "
-            "matcher_probe_stride=%d)",
+            "(cos_sin_cache shape=%s dtype=%s, head_size=%d, is_neox=%s)",
             instance_id,
             tuple(cos_sin_cache.shape),
             cos_sin_cache.dtype,
             head_size,
             is_neox_style,
-            self._token_range_matcher._probe_stride,
         )
 
     def cb_unregister_rope(self, instance_id: int) -> None:
@@ -494,7 +500,7 @@ def _drain_fingerprints_sync(self) -> None:
     def _match_fingerprints(self, key: IPCCacheEngineKey) -> list[CBMatchResult]:
         """Drain pending registrations, fingerprint-match sub-sequences, then
         leftmost-greedy dedup over overlapping ranges. Returns matches sorted
-        by ``cur_st`` (empty if none)."""
+        by cur_st (empty if none)."""
         self._drain_fingerprints_sync()
         matches = self._token_range_matcher.match_sub_sequence(list(key.token_ids))
         if not matches:
@@ -527,8 +533,8 @@ def _sparse_prefetch_submit(
         layout_desc: "MemoryLayoutDesc",
         matches: list[CBMatchResult],
     ) -> "tuple[PrefetchHandle, dict[bytes, list], list[int]]":
-        """Coalesce all ``matches`` into one sparse prefetch and submit it
-        (non-blocking). The caller polls ``query_prefetch_status(handle)`` then
+        """Coalesce all matches into one sparse prefetch and submit it
+        (non-blocking). The caller polls query_prefetch_status(handle) then
         calls :meth:`_sparse_classify` with the found set."""
         world_size = key.world_size
         per_hash_obj_keys: dict[bytes, list] = {}
@@ -625,7 +631,7 @@ def cb_unified_lookup(
         """Non-blocking single-RPC CB lookup (submit-once, poll-on-recall).
 
         First call submits the prefix lookup + fingerprint match; later calls
-        poll both legs, returning ``None`` until the prefix and the sparse
+        poll both legs, returning None until the prefix and the sparse
         complement are both resident in L1 (so a worker thread never blocks on
         the L2->L1 loads). The prefix job's L1 read locks persist for the
         retrieve.
@@ -667,11 +673,9 @@ def cb_unified_lookup(
         # enter the sparse prefetch, so they cannot leak a read lock.
         if not job.sparse_started:
             prefix_tokens = job.prefix_chunks * chunk_size
-            job.non_prefix = [
-                r
-                for r in job.matches
-                if r.cur_st >= prefix_tokens and r.cur_st % chunk_size == 0
-            ]
+            # Any offset is fine: the per-token slot scatter writes
+            # non-block-aligned matches.
+            job.non_prefix = [r for r in job.matches if r.cur_st >= prefix_tokens]
             if job.non_prefix:
                 layout_desc = self._resolve_cb_layout_desc(
                     key.model_name, key.world_size
@@ -786,7 +790,7 @@ def store(
         return result
 
     def _drain_fingerprint_queue(self) -> None:
-        """Best-effort background drainer for ``_fingerprint_queue``."""
+        """Best-effort background drainer for _fingerprint_queue."""
         while not self._fingerprint_stop.is_set():
             try:
                 job = self._fingerprint_queue.get(timeout=0.1)
@@ -816,7 +820,7 @@ def _apply_cb_rope_batched(
         slots_to_rope: list[tuple[int, int, int]],
     ) -> None:
         """Re-RoPE tmp-pool slots in-place (K-only, per group); list of
-        ``(slot_idx, old_st, cur_st)``."""
+        (slot_idx, old_st, cur_st)."""
         if not slots_to_rope:
             return
         num_groups = gpu_context.kv_layer_groups_manager.num_groups
@@ -891,7 +895,14 @@ def cb_retrieve_pre_computed(
         with self._lookup_obj_keys_lock:
             cached = self._lookup_obj_keys_cache.pop(key.request_id, None)
         if cached is not None and all(r.hash in cached for r in cb_match_result):
-            all_obj_keys = [k for r in cb_match_result for k in cached[r.hash]]
+            # The lookup cached all-ranks obj keys (world_size per hash). This
+            # retrieve is per-worker, so select THIS rank's key -> M objects, not
+            # M*world_size (else the zip below silently truncates and mispairs
+            # ranks at TP>1). Mirrors the non-cached path's per-worker resolve.
+            if key.worker_id is not None and key.world_size > 1:
+                all_obj_keys = [cached[r.hash][key.worker_id] for r in cb_match_result]
+            else:
+                all_obj_keys = [k for r in cb_match_result for k in cached[r.hash]]
         else:
             all_obj_keys = ipc_key_to_object_keys(
                 key, [r.hash for r in cb_match_result]
@@ -937,7 +948,6 @@ def cb_retrieve_pre_computed(
                 f"chunk_size {chunk_size} must be a multiple of "
                 f"inference_engine_logical_block_size {ie_logical_block_size}"
             )
-        blocks_per_chunk = chunk_size // ie_logical_block_size
         num_groups = gpu_context.kv_layer_groups_manager.num_groups
 
         with (
@@ -982,30 +992,18 @@ def cb_retrieve_pre_computed(
                     if memory_objs is None:
                         return event_ipc_handle, False
 
-                    # Drop malformed matches up front.
+                    # Per-token scatter handles any cur_st; just bound the
+                    # matched range to the allocated slots.
                     pairs: list[tuple[CBMatchResult, Any]] = []
-                    for r, memory_obj in zip(
-                        cb_match_result, memory_objs, strict=False
-                    ):
-                        if r.cur_st % ie_logical_block_size != 0:
+                    num_slots = int(all_block_ids_gpu.numel()) * ie_logical_block_size
+                    for r, memory_obj in zip(cb_match_result, memory_objs, strict=True):
+                        if r.cur_ed > num_slots:
                             logger.warning(
-                                "Dropping CB match cur_st=%d: not aligned to "
-                                "ie_logical_block_size=%d.",
+                                "Dropping CB match cur_st=%d cur_ed=%d: exceeds "
+                                "%d slots. Request %s.",
                                 r.cur_st,
-                                ie_logical_block_size,
-                            )
-                            continue
-                        cbs = r.cur_st // ie_logical_block_size
-                        if cbs + blocks_per_chunk > int(all_block_ids_gpu.numel()):
-                            logger.warning(
-                                "Dropping CB match cur_st=%d old_st=%d: needs "
-                                "blocks [%d:%d) but gpu_block_ids has %d. "
-                                "Request %s.",
-                                r.cur_st,
-                                r.old_st,
-                                cbs,
-                                cbs + blocks_per_chunk,
-                                int(all_block_ids_gpu.numel()),
+                                r.cur_ed,
+                                num_slots,
                                 key.request_id,
                             )
                             continue
@@ -1025,7 +1023,6 @@ def cb_retrieve_pre_computed(
                         for batch_start in range(0, len(run), max_batch):
                             batch = run[batch_start : batch_start + max_batch]
                             batch_len = len(batch)
-                            first_cur_st = batch[0][0].cur_st
 
                             # (a) H2D fill into per-chunk tmp slots.
                             for slot_idx, (_, memory_obj) in enumerate(batch):
@@ -1044,14 +1041,24 @@ def cb_retrieve_pre_computed(
                                 gpu_context, rope_state, batch_len, slots_to_rope
                             )
 
-                            # (c) One batched scatter per group.
-                            chunk_block_start = first_cur_st // ie_logical_block_size
-                            chunk_block_end = (
-                                chunk_block_start + batch_len * blocks_per_chunk
+                            # (c) Per-token slot scatter: partial vLLM blocks
+                            # shared with recomputed tokens stay disjoint.
+                            bs = ie_logical_block_size
+                            pos = torch.cat(
+                                [
+                                    torch.arange(
+                                        r.cur_st,
+                                        r.cur_ed,
+                                        device=gpu_context.device,
+                                        dtype=torch.long,
+                                    )
+                                    for (r, _) in batch
+                                ]
                             )
-                            chunk_block_ids_gpu = all_block_ids_gpu[
-                                chunk_block_start:chunk_block_end
-                            ]
+                            slot_mapping = all_block_ids_gpu[pos // bs] * bs + (
+                                pos % bs
+                            )
+                            page_buffer_size = gpu_context.num_blocks * bs
                             for group_idx in range(num_groups):
                                 tmp_buffers = (
                                     gpu_context.get_tmp_chunk_gpu_buffer_batched(
@@ -1059,23 +1066,17 @@ def cb_retrieve_pre_computed(
                                         group_idx=group_idx,
                                     )
                                 )
-                                group_kv_pointers = gpu_context.get_group_kv_pointers(
-                                    group_idx
-                                )
-                                group_lmcache_chunk_size = (
-                                    gpu_context.get_physical_chunk_size(group_idx)
-                                )
-
-                                lmc_ops.multi_layer_block_kv_transfer(
-                                    group_kv_pointers,
-                                    [tb.data_ptr() for tb in tmp_buffers],
-                                    chunk_block_ids_gpu,
+                                key_value = torch.cat(tmp_buffers, dim=2)
+                                lmc_ops.multi_layer_kv_transfer(
+                                    key_value,
+                                    gpu_context.get_group_kv_pointers(group_idx),
+                                    slot_mapping,
                                     gpu_context.device,
+                                    page_buffer_size,
                                     lmc_ops.TransferDirection.H2D,
-                                    gpu_context.get_shape_desc(group_idx),
-                                    group_lmcache_chunk_size,
                                     gpu_context.gpu_kv_format_,
-                                    0,  # skip_blocks_in_chunk
+                                    block_size=bs,
+                                    head_size=rope_state.head_size,
                                 )
             except Exception:
                 logger.exception("Error during retrieving prefetched results")
diff --git a/lmcache/v1/multiprocess/protocols/blend_v3.py b/lmcache/v1/multiprocess/protocols/blend_v3.py
index 69a5114bf1..337c619768 100644
--- a/lmcache/v1/multiprocess/protocols/blend_v3.py
+++ b/lmcache/v1/multiprocess/protocols/blend_v3.py
@@ -1,16 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Blend V3 protocol — paged-aware CB pipeline.
-
-RPCs:
-* ``CB_REGISTER_ROPE_V3`` / ``CB_UNREGISTER_ROPE_V3`` — share / release the rope
-  cos/sin cache onto a context already registered via ``REGISTER_KV_CACHE``.
-* ``CB_RETRIEVE_PRE_COMPUTED_V3`` — scatter all matched chunks (prefix- and
-  non-prefix-hit) into paged KV by per-token block ID; re-RoPE only the shifted
-  (``old_st != cur_st``) subset.
-* ``CB_UNIFIED_LOOKUP`` — the sole live lookup path: one RPC runs prefix +
-  non-prefix match, reconcile, one sparse-coalesced prefetch, and per-TP-rank
-  classify. ``(IPCCacheEngineKey, tp_size)`` → ``CBUnifiedLookupResult``.
-"""
+"""Blend V3 protocol definitions."""
 
 # First Party
 from lmcache.v1.multiprocess.custom_types import (

From 20cf3cdb99f26f86029a7524414cea647016c761 Mon Sep 17 00:00:00 2001
From: Yihua Cheng <yihua98@uchicago.edu>
Date: Mon, 8 Jun 2026 15:37:19 -0700
Subject: [PATCH 05/57] [Core][MP] refactor the LMCache layer group for better
 compat with hybrid models (#3557)

Signed-off-by: ApostaC <yihua@tensormesh.ai>
---
 lmcache/utils.py                              |  41 +-
 lmcache/v1/kv_layer_groups.py                 | 268 ++++--
 lmcache/v1/multiprocess/gpu_context.py        | 602 ++++++++-----
 lmcache/v1/multiprocess/modules/blend_v3.py   |  30 +-
 .../v1/multiprocess/modules/gpu_transfer.py   |  94 +-
 tests/v1/distributed/serde/test_serde_e2e.py  |  14 +-
 .../test_blend_v3_load_store_opts.py          |  31 +-
 tests/v1/multiprocess/test_gpu_context.py     | 818 ++++++++++--------
 .../test_gpu_transfer_layout_registry.py      |   1 +
 tests/v1/test_kv_layer_groups_manager.py      | 132 ++-
 10 files changed, 1306 insertions(+), 725 deletions(-)

diff --git a/lmcache/utils.py b/lmcache/utils.py
index 2e7193ebf2..3145795678 100644
--- a/lmcache/utils.py
+++ b/lmcache/utils.py
@@ -5,13 +5,15 @@
 # Standard
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, List, Optional, Tuple, TypeVar, Union
 import asyncio
+import functools
 import hashlib
 import inspect
 import re
 import threading
 import traceback
+import warnings
 
 try:
     # Third Party
@@ -700,6 +702,43 @@ def wrapper(*args, **kwargs):
     return wrapper
 
 
+##### Deprecation #####
+F = TypeVar("F", bound=Callable[..., Any])
+
+
+def lmcache_deprecate(reason: str) -> Callable[[F], F]:
+    """Mark a function or method as deprecated.
+
+    Calling the wrapped callable emits a ``DeprecationWarning`` and logs a
+    warning the first time it is invoked, including the supplied reason.
+
+    Args:
+        reason: Human-readable explanation of why the callable is deprecated
+            and, ideally, what to use instead.
+
+    Returns:
+        A decorator that wraps the target callable while preserving its
+        signature and metadata.
+    """
+
+    def decorator(func: F) -> F:
+        warned = False
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            nonlocal warned
+            if not warned:
+                message = f"{func.__qualname__} is deprecated: {reason}"
+                warnings.warn(message, DeprecationWarning, stacklevel=2)
+                logger.warning(message)
+                warned = True
+            return func(*args, **kwargs)
+
+        return wrapper  # type: ignore[return-value]
+
+    return decorator
+
+
 #### Thread/asyncio-related utilities ####
 def handle_thread_exception(args):
     """Handle an uncaught exception reported by ``threading``.
diff --git a/lmcache/v1/kv_layer_groups.py b/lmcache/v1/kv_layer_groups.py
index bef741085d..2127bcb573 100644
--- a/lmcache/v1/kv_layer_groups.py
+++ b/lmcache/v1/kv_layer_groups.py
@@ -6,7 +6,7 @@
 from collections import defaultdict
 from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, NamedTuple
 
 # Third Party
 import torch
@@ -14,6 +14,7 @@
 # First Party
 from lmcache.logging import init_logger
 from lmcache.python_ops_fallback import set_shape_desc_dtype
+from lmcache.utils import lmcache_deprecate
 import lmcache.c_ops as lmc_ops
 
 if TYPE_CHECKING:
@@ -48,7 +49,16 @@
 # block address space). Block IDs are only meaningful within one such group, so
 # layers from different groups must not share one LMCache group (and thus one
 # transfer-kernel launch) even if their tensor shape and dtype match.
-LayerGroupIdentity = tuple[int, int, int, int, int, torch.dtype]
+class KernelGroupIdentity(NamedTuple):
+    kv_size: int
+    num_heads: int
+    head_size: int
+    block_size: int
+    engine_group_idx: int
+    dtype: torch.dtype
+
+
+LayerGroupIdentity = KernelGroupIdentity  # Alias for compatibility
 
 
 # Sentinel ``per_layer_engine_group_idx`` value: a KV tensor tagged with it is
@@ -66,7 +76,7 @@ def group_layers_by_identity(
     """Partition layer indices by :data:`LayerGroupIdentity`.
 
     This helper is shared by vLLM-side LMCache group inflation and server-side
-    ``KVLayerGroupInfo`` construction so both sides agree on group order.
+    ``KernelGroupInfo`` construction so both sides agree on group order.
 
     Args:
         kv_caches: Registered KV cache structure inspected for per-layer shape
@@ -113,12 +123,21 @@ def group_layers_by_identity(
         hs = get_head_size(kv_caches, gpu_kv_format, idx)
         dt = get_dtype(kv_caches, gpu_kv_format, idx)
         bs = get_block_size(kv_caches, gpu_kv_format, idx)
-        groups_dict[(kv_size, nh, hs, bs, engine_group_idx, dt)].append(idx)
+
+        identity = LayerGroupIdentity(
+            kv_size=kv_size,
+            num_heads=nh,
+            head_size=hs,
+            block_size=bs,
+            engine_group_idx=engine_group_idx,
+            dtype=dt,
+        )
+        groups_dict[identity].append(idx)
     return sorted(groups_dict.items(), key=lambda kv: kv[1][0])
 
 
 @dataclass
-class KVLayerGroupInfo:
+class KernelGroupInfo:
     """A single transfer-kernel dispatch unit: a set of KV layers that can
     ride one kernel launch with one ``PageBufferShapeDesc``.
 
@@ -180,7 +199,7 @@ def __repr__(self) -> str:
             indices_repr = f"{self.layer_indices[0]}-{self.layer_indices[-1]}"
         sd = self.shape_desc
         return (
-            f"KVLayerGroupInfo(layers={len(self.layer_indices)}, "
+            f"KernelGroupInfo(layers={len(self.layer_indices)}, "
             f"indices={indices_repr}, "
             f"shape_desc=(kv={sd.kv_size}, nl={sd.nl}, nb={sd.nb}, "
             f"bs={sd.bs}, nh={sd.nh}, hs={sd.hs}, "
@@ -203,18 +222,41 @@ def hidden_dim_size(self) -> int:
         return self.shape_desc.nh * self.shape_desc.hs
 
 
+KVLayerGroupInfo = KernelGroupInfo  # Alias for compatibility
+
+
+@dataclass
+class ObjectGroupInfo:
+    """Metadata for an 'object group'.
+
+    An object group contains one or more kernel groups whose
+    KV caches will be stored in the same memory object.
+
+    This will be useful for dealing with sliding window or mamba
+    KV caches that needs a different prefix matching logic from
+    the full attention KV caches.
+    """
+
+    kernel_group_indices: list[int]
+    """Indices of the kernel groups belonging to this object group, in the
+    order they should be laid out in memory."""
+
+    # NOTE: will add fields to indicate the "kv cache type" of this
+    # object group in the follow-up PRs
+
+
 class KVLayerGroupsManager:
     """Partition a model's KV layers into transfer-kernel dispatch units.
 
     At construction time, every layer in ``kv_caches`` is bucketed by its
     :data:`LayerGroupIdentity` (``(kv_size, num_heads, head_size,
     block_size, engine_group_idx, dtype)``). Each bucket becomes one
-    :class:`KVLayerGroupInfo` holding the layer indices, a shared
+    :class:`KernelGroupInfo` holding the layer indices, a shared
     :class:`PageBufferShapeDesc`, and the group's torch dtype.
 
     Downstream consumers (``VLLMPagedMemGPUConnectorV3``,
     ``GPUCacheContext``, the multiprocess server) iterate
-    ``self.kv_layer_groups`` and issue one transfer-kernel launch per
+    ``self._kernel_groups`` and issue one transfer-kernel launch per
     group. The manager itself is a pure metadata object — it does not
     own any GPU buffers or perform any transfers.
 
@@ -239,7 +281,7 @@ def __init__(
         ``(kv_size, num_heads, head_size, dtype)`` via the format-aware
         accessors in ``utils.py``. Layers with identical identities are
         bucketed together; each bucket becomes one
-        :class:`KVLayerGroupInfo`.
+        :class:`KernelGroupInfo`.
 
         Groups are emitted in the order of their first-appearing layer,
         so group indices are deterministic across runs.
@@ -292,7 +334,8 @@ def __init__(
             if layout_hints
             else None
         )
-        self.kv_layer_groups: list[KVLayerGroupInfo] = []
+        self._kernel_groups: list[KernelGroupInfo] = []
+        self._object_groups: list[ObjectGroupInfo] = []
 
         num_layers = get_num_layers(kv_caches, gpu_kv_format)
         if num_layers == 0:
@@ -334,6 +377,10 @@ def __init__(
             group_logical_block_size = (
                 max(global_logical, bs) if global_logical is not None else None
             )
+
+            # TODO (ApostaC): the code here is not very good.
+            # Conceptually, KV Layer Group should not be aware of lmcache logical
+            # chunk size at all.
             compress_ratio, physical_chunk_size = self._derive_compression_metadata(
                 group_idx=group_idx,
                 bs=bs,
@@ -341,8 +388,8 @@ def __init__(
                 lmcache_logical_chunk_size=lmcache_logical_chunk_size,
             )
 
-            self.kv_layer_groups.append(
-                KVLayerGroupInfo(
+            self._kernel_groups.append(
+                KernelGroupInfo(
                     layer_indices=indices,
                     shape_desc=shape_desc,
                     dtype=dt,
@@ -354,10 +401,139 @@ def __init__(
 
         self.inference_engine_logical_block_size_ = (
             self.inference_engine_logical_block_size_
-            or self.kv_layer_groups[0].shape_desc.bs
+            or self._kernel_groups[0].shape_desc.bs
+        )
+
+        logger.info(
+            "KV layer groups: ---\n%s\n---",
+            "\n".join(repr(g) for g in self._kernel_groups),
+        )
+
+        # Detect the object groups
+        self._object_groups = self._detect_object_groups(group_views)
+
+    @property
+    def kernel_groups(self) -> list[KernelGroupInfo]:
+        """List of :class:`KernelGroupInfo`, one per kernel group."""
+        return self._kernel_groups
+
+    @property
+    @lmcache_deprecate("`kv_layer_groups` is an outdated alias for `kernel_groups`")
+    def kv_layer_groups(self) -> list[KernelGroupInfo]:
+        """List of :class:`KernelGroupInfo`, one per kernel group."""
+        return self._kernel_groups
+
+    @property
+    def num_kernel_groups(self) -> int:
+        """Number of :class:`KernelGroupInfo` entries.
+
+        Zero if ``kv_caches`` had no layers at construction time.
+        """
+        return len(self._kernel_groups)
+
+    @property
+    def object_groups(self) -> list[ObjectGroupInfo]:
+        """List of :class:`ObjectGroupInfo`, one per object group."""
+        return self._object_groups
+
+    @property
+    def num_object_groups(self) -> int:
+        """Number of :class:`ObjectGroupInfo` entries."""
+        return len(self._object_groups)
+
+    @property
+    @lmcache_deprecate("`num_groups` is an outdated alias for `num_kernel_groups`")
+    def num_groups(self) -> int:
+        """Number of :class:`KernelGroupInfo` entries.
+
+        Zero if ``kv_caches`` had no layers at construction time.
+        """
+        return len(self._kernel_groups)
+
+    @property
+    def inference_engine_logical_block_size(self) -> int:
+        """Inference-engine-side logical block size.
+
+        Taken from ``layout_hints`` at construction time, or falls back
+        to the first group's physical ``bs`` when no hint is provided
+        (non-vLLM engines, or vLLM without mixed-compression KV groups),
+        in which case every group is treated as non-compressed.
+        """
+        return (
+            self.inference_engine_logical_block_size_
+            or self._kernel_groups[0].shape_desc.bs
         )
 
-        logger.info("KV layer groups: %s", self.kv_layer_groups)
+    def get_shape_desc(self, kernel_group_idx: int) -> "lmc_ops.PageBufferShapeDesc":
+        """Return the :class:`PageBufferShapeDesc` for *kernel_group_idx*.
+
+        Equivalent to ``self._kernel_groups[kernel_group_idx].shape_desc``.
+
+        Args:
+            kernel_group_idx: 0-based kernel group index.
+
+        Raises:
+            IndexError: If *kernel_group_idx* is out of range.
+        """
+        return self._kernel_groups[kernel_group_idx].shape_desc
+
+    def get_physical_chunk_size(self, kernel_group_idx: int) -> int:
+        """Return the per-chunk *physical* slot count for *kernel_group_idx*.
+
+        Equivalent to
+        ``self._kernel_groups[kernel_group_idx].physical_chunk_size``.
+        For non-compressed groups this equals
+        ``lmcache_logical_chunk_size``; for compressed groups it equals
+        ``lmcache_logical_chunk_size // compress_ratio`` and is what the
+        block-level transfer kernel must be told (the logical chunk size
+        in *vLLM tokens* is not what the kernel addresses).
+
+        Args:
+            kernel_group_idx: 0-based kernel group index.
+
+        Raises:
+            IndexError: If *kernel_group_idx* is out of range.
+        """
+        return self._kernel_groups[kernel_group_idx].physical_chunk_size
+
+    def calculate_num_blocks(self, kernel_group_idx: int, num_tokens: int) -> int:
+        """Calculate the number of blocks for a given number of tokens in a
+        specified kernel group.
+
+        Args:
+            kernel_group_idx: 0-based index of the kernel group.
+            num_tokens: The total number of tokens to be processed for the group.
+
+        Returns:
+            The number of blocks.
+
+        Raises:
+            IndexError: If *kernel_group_idx* is out of range.
+        """
+        group = self._kernel_groups[kernel_group_idx]
+        num_physical_slots = num_tokens // group.compress_ratio
+        return num_physical_slots // group.shape_desc.bs
+
+    ### Helper methods
+    def _detect_object_groups(
+        self, group_views: "Sequence[LMCacheGroupView]"
+    ) -> list[ObjectGroupInfo]:
+        """Detect object groups based on the provided group views.
+
+        Args:
+            group_views: LMCache-owned engine KV cache group metadata.
+
+        Returns:
+            A list of ObjectGroupInfo instances representing the detected object groups.
+        """
+        # TODO: add the real object group detection logic based on
+        # the attention type metadata in the group views once it's
+        # available.
+        # Now, we are using a single object group, which means
+        # all kernel groups' KV caches will be stored in the same memory object.
+        return [
+            ObjectGroupInfo(kernel_group_indices=list(range(len(self._kernel_groups))))
+        ]
 
     @staticmethod
     def _derive_compression_metadata(
@@ -406,60 +582,6 @@ def _derive_compression_metadata(
             )
         return compress_ratio, physical_chunk_size
 
-    @property
-    def num_groups(self) -> int:
-        """Number of :class:`KVLayerGroupInfo` entries.
-
-        Zero if ``kv_caches`` had no layers at construction time.
-        """
-        return len(self.kv_layer_groups)
-
-    @property
-    def inference_engine_logical_block_size(self):
-        """Inference-engine-side logical block size.
-
-        Taken from ``layout_hints`` at construction time, or falls back
-        to the first group's physical ``bs`` when no hint is provided
-        (non-vLLM engines, or vLLM without mixed-compression KV groups),
-        in which case every group is treated as non-compressed.
-        """
-        return (
-            self.inference_engine_logical_block_size_
-            or self.kv_layer_groups[0].shape_desc.bs
-        )
-
-    def get_shape_desc(self, group_idx: int) -> "lmc_ops.PageBufferShapeDesc":
-        """Return the :class:`PageBufferShapeDesc` for *group_idx*.
-
-        Equivalent to ``self.kv_layer_groups[group_idx].shape_desc``.
-
-        Args:
-            group_idx: 0-based group index.
-
-        Raises:
-            IndexError: If *group_idx* is out of range.
-        """
-        return self.kv_layer_groups[group_idx].shape_desc
-
-    def get_physical_chunk_size(self, group_idx: int) -> int:
-        """Return the per-chunk *physical* slot count for *group_idx*.
-
-        Equivalent to
-        ``self.kv_layer_groups[group_idx].physical_chunk_size``.
-        For non-compressed groups this equals
-        ``lmcache_logical_chunk_size``; for compressed groups it equals
-        ``lmcache_logical_chunk_size // compress_ratio`` and is what the
-        block-level transfer kernel must be told (the logical chunk size
-        in *vLLM tokens* is not what the kernel addresses).
-
-        Args:
-            group_idx: 0-based group index.
-
-        Raises:
-            IndexError: If *group_idx* is out of range.
-        """
-        return self.kv_layer_groups[group_idx].physical_chunk_size
-
 
 # ------------------------------------------------------------------ #
 #  CLI shape-spec parser                                               #
@@ -468,7 +590,7 @@ def get_physical_chunk_size(self, group_idx: int) -> int:
 
 def parse_kvcache_shape_spec(
     spec_str: str,
-) -> list[KVLayerGroupInfo]:
+) -> list[KernelGroupInfo]:
     """Parse a ``--kvcache-shape-spec`` string into layer groups.
 
     **Grammar** (EBNF-ish)::
@@ -513,7 +635,7 @@ def parse_kvcache_shape_spec(
     (handy for CLI echo-back / debug logging).
 
     Returns:
-        A list of :class:`KVLayerGroupInfo`, one per group.
+        A list of :class:`KernelGroupInfo`, one per group.
 
     Raises:
         ValueError: Malformed spec, unknown dtype, or a shape with a
@@ -522,7 +644,7 @@ def parse_kvcache_shape_spec(
     if not spec_str:
         raise ValueError("KV shape specification cannot be empty")
 
-    groups: list[KVLayerGroupInfo] = []
+    groups: list[KernelGroupInfo] = []
     layer_offset = 0
 
     for group_spec in spec_str.split(";"):
@@ -574,7 +696,7 @@ def parse_kvcache_shape_spec(
 
         indices = list(range(layer_offset, layer_offset + layer_count))
         groups.append(
-            KVLayerGroupInfo(
+            KernelGroupInfo(
                 layer_indices=indices,
                 shape_desc=shape_desc,
                 dtype=dtype,
@@ -588,7 +710,7 @@ def parse_kvcache_shape_spec(
     return groups
 
 
-def format_kvcache_shape_spec(groups: list[KVLayerGroupInfo]) -> str:
+def format_kvcache_shape_spec(groups: list[KernelGroupInfo]) -> str:
     """Format layer groups back into a ``--kvcache-shape-spec`` string.
 
     This is the inverse of :func:`parse_kvcache_shape_spec`; the
diff --git a/lmcache/v1/multiprocess/gpu_context.py b/lmcache/v1/multiprocess/gpu_context.py
index bc885467ad..182d55af15 100644
--- a/lmcache/v1/multiprocess/gpu_context.py
+++ b/lmcache/v1/multiprocess/gpu_context.py
@@ -22,7 +22,7 @@
 # First Party
 from lmcache import torch_dev
 from lmcache.logging import init_logger
-from lmcache.utils import EngineType
+from lmcache.utils import EngineType, lmcache_deprecate
 from lmcache.v1.gpu_connector.utils import (
     LayoutHints,
     get_attention_backend,
@@ -63,6 +63,275 @@ def list_to_gpu_tensor(lis: list[int], device: torch.device) -> torch.Tensor:
     )
 
 
+class _TempGPUBuffer:
+    """
+    Manages the temporary GPU buffer for GPUCacheContext
+
+    The logical layout of the temp GPU buffer is (batch size,
+    object group, kernel group).
+
+    Here is an example of batch size = 4, with 2 object groups,
+    and 2 kernel groups per object group:
+    [
+        batch 0:
+            - object group 0: kernel group 0 | kernel group 1 | ...
+            - object group 1: kernel group 2 | kernel group 3 | ...
+
+        batch 1:
+            - object group 0: kernel group 0 | kernel group 1 | ...
+            - object group 1: kernel group 2 | kernel group 3 | ...
+
+        batch 2:
+            - object group 0: kernel group 0 | kernel group 1 | ...
+            - object group 1: kernel group 2 | kernel group 3 | ...
+
+        batch 3:
+            - object group 0: kernel group 0 | kernel group 1 | ...
+            - object group 1: kernel group 2 | kernel group 3 | ...
+    ]
+
+    During the multi-layer copy kernel launch, we will do it at kernel
+    group level, which means we will have:
+    ```
+    gpu_buffers = [
+        get_temp_kernel_group_buffer(batch_idx, kernel_group_idx)
+        for batch_idx in range(batch_size)
+    ]
+    ```
+
+    During the lmcache_memcpy_async launch, we will do it at the object group
+    level, which will be:
+    ```
+    for i in range(batch_size):
+        gpu_buffer = get_temp_object_group_buffer(batch_idx, object_group_idx)
+        lmcache_memcpy_async(...)
+    ```
+    """
+
+    def __init__(
+        self,
+        kv_layer_groups_manager: KVLayerGroupsManager,
+        lmcache_logical_chunk_size: int,
+        device: torch.device,
+        max_batch_size: int = 4,
+    ) -> None:
+        self._kv_groups_manager = kv_layer_groups_manager
+        self._lmcache_chunk_size = lmcache_logical_chunk_size
+        self._max_batch_size = max_batch_size
+
+        self._temp_buffer = torch.empty(
+            self._get_size_for_single_batch() * max_batch_size,
+            dtype=torch.uint8,
+            device=device,
+        )
+
+        # Offset map: (batch_idx, object_group_idx, kernel_group_idx) ->
+        # (byte offset in the temp buffer, size of the buffer in bytes)
+        self._offset_map: dict[tuple[int, int, int], tuple[int, int]] = {}
+
+        # (batch_idx, kernel_group_idx) -> (byte offset for the kernel group,
+        # size of the buffer in bytes).
+        self._offset_map_kernel_group_only: dict[tuple[int, int], tuple[int, int]] = {}
+
+        # (batch_idx, object_group_idx) -> (byte offset for the object group,
+        # size of the buffer in bytes)
+        self._offset_map_object_group_only: dict[tuple[int, int], tuple[int, int]] = {}
+
+        offset = 0
+        for batch_idx in range(max_batch_size):
+            for object_group_idx in range(self._kv_groups_manager.num_object_groups):
+                object_group_size = 0
+                object_group_start_offset = offset
+
+                for kernel_group_idx in self._kv_groups_manager.object_groups[
+                    object_group_idx
+                ].kernel_group_indices:
+                    key = (batch_idx, object_group_idx, kernel_group_idx)
+                    key2 = (batch_idx, kernel_group_idx)
+
+                    size = self._get_size_for_kernel_group(kernel_group_idx)
+                    self._offset_map[key] = (offset, size)
+                    self._offset_map_kernel_group_only[key2] = (offset, size)
+
+                    offset += size
+                    object_group_size += size
+
+                key3 = (batch_idx, object_group_idx)
+                self._offset_map_object_group_only[key3] = (
+                    object_group_start_offset,
+                    object_group_size,
+                )
+
+        # Shape/dtype cache for kernel groups
+        self._shape_cache_kernel_group: dict[int, tuple[torch.Size, torch.dtype]] = {}
+        for kernel_group_idx in range(self._kv_groups_manager.num_kernel_groups):
+            shape = self._get_shape_for_kernel_group(
+                self._lmcache_chunk_size, kernel_group_idx
+            )
+            group = self._kv_groups_manager.kernel_groups[kernel_group_idx]
+            dtype = group.dtype
+            self._shape_cache_kernel_group[kernel_group_idx] = (shape, dtype)
+
+    # Public APIs
+    @property
+    def max_batch_size(self) -> int:
+        """Maximum number of chunks (batch slots) the buffer holds."""
+        return self._max_batch_size
+
+    def get_temp_kernel_group_buffer(
+        self, batch_idx: int, kernel_group_idx: int
+    ) -> torch.Tensor:
+        """
+        Returns the temp GPU buffer for the given batch index and kernel group index.
+        The returned buffer is with the correct shape and dtype for the kernel group.
+
+        Args:
+            batch_idx: Index of the batch (0 <= batch_idx < max_batch_size)
+            kernel_group_idx: Index of the kernel group.
+
+        Returns:
+            The temp GPU buffer for the given batch index and kernel group index.
+
+        Raises:
+            ValueError: If the batch_idx or kernel_group_idx is out of range.
+        """
+        key = (batch_idx, kernel_group_idx)
+        if key not in self._offset_map_kernel_group_only:
+            raise ValueError(
+                f"Invalid batch_idx {batch_idx} or kernel_group_idx {kernel_group_idx}"
+            )
+
+        offset, size = self._offset_map_kernel_group_only[key]
+        shape, dtype = self._shape_cache_kernel_group[kernel_group_idx]
+        return self._temp_buffer[offset : offset + size].view(dtype).view(shape)
+
+    def get_temp_object_group_buffer(
+        self, batch_idx: int, object_group_idx: int
+    ) -> torch.Tensor:
+        """
+        Returns the temp GPU buffer for the given batch index and object group index
+        The returned buffer is a flat uint8 raw tensor.
+
+        Args:
+            batch_idx: Index of the batch (0 <= batch_idx < max_batch_size)
+            object_group_idx: Index of the object group.
+
+        Returns:
+            The temp GPU buffer for the given batch index and object group index.
+        """
+        key = (batch_idx, object_group_idx)
+        if key not in self._offset_map_object_group_only:
+            raise ValueError(
+                f"Invalid batch_idx {batch_idx} or object_group_idx {object_group_idx}"
+            )
+
+        offset, size = self._offset_map_object_group_only[key]
+        return self._temp_buffer[offset : offset + size]
+
+    def get_kernel_group_shape_dtype(
+        self,
+        num_tokens: int,
+        kernel_group_idx: int,
+    ) -> tuple[torch.Size, torch.dtype]:
+        """
+        Returns the shape and dtype for the given kernel group index and
+        number of tokens.
+
+        Will be exported by GPUCacheContext and used to construct the
+        MemoryLayoutDesc
+
+        Args:
+            num_tokens: Number of tokens
+            kernel_group_idx: Index of the kernel group.
+
+        Returns:
+            The shape and dtype for the given kernel group index and
+            number of tokens.
+        """
+        _, dtype = self._shape_cache_kernel_group[kernel_group_idx]
+        shape = self._get_shape_for_kernel_group(num_tokens, kernel_group_idx)
+
+        return shape, dtype
+
+    def get_cache_size_per_token(self) -> int:
+        """
+        Returns the cache size per token (in bytes), summed across all kernel groups.
+        """
+        return self._get_size_for_single_batch() // self._lmcache_chunk_size
+
+    # Helper functions
+    def _get_shape_for_kernel_group(
+        self,
+        num_tokens: int,
+        kernel_group_idx: int,
+    ) -> torch.Size:
+        """
+        Returns the shape of the temp GPU buffer for the given kernel group index
+
+        Args:
+            num_tokens: Number of tokens
+            kernel_group_idx: Index of the kernel group.
+
+        Returns:
+            The shape of the temp GPU buffer for the given kernel group index.
+        """
+        group = self._kv_groups_manager.kernel_groups[kernel_group_idx]
+        compress_ratio = group.compress_ratio
+        sd = group.shape_desc
+
+        if num_tokens % compress_ratio != 0:
+            raise ValueError(
+                f"logical_num_tokens ({num_tokens}) is not a multiple of "
+                f"compress_ratio ({compress_ratio}) for group {kernel_group_idx}"
+            )
+        num_slots = num_tokens // compress_ratio
+        return torch.Size(
+            (sd.kv_size, group.num_layers, num_slots, group.hidden_dim_size)
+        )
+
+    def _get_size_for_kernel_group(self, kernel_group_idx: int) -> int:
+        """
+        Returns the size in bytes of the temp GPU buffer for the given kernel group
+        index
+
+        **Assumes the size is lmcache_chunk_size
+
+        Will only be called during initialization
+        """
+        shape = self._get_shape_for_kernel_group(
+            self._lmcache_chunk_size, kernel_group_idx
+        )
+        kernel_group = self._kv_groups_manager.kernel_groups[kernel_group_idx]
+        dtype = kernel_group.dtype
+        return shape.numel() * dtype.itemsize
+
+    def _get_size_for_object_group(self, object_group_idx: int) -> int:
+        """
+        Returns the size in bytes of the temp GPU buffer for the given object group
+
+        **Assumes the size is lmcache_chunk_size
+
+        Will only be called during initialization
+        """
+        object_group = self._kv_groups_manager.object_groups[object_group_idx]
+        return sum(
+            self._get_size_for_kernel_group(kernel_group_idx)
+            for kernel_group_idx in object_group.kernel_group_indices
+        )
+
+    def _get_size_for_single_batch(self) -> int:
+        """
+        Returns the size in bytes of the temp GPU buffer for a single batch
+        (i.e., a single chunk)
+
+        **Assumes the size is lmcache_chunk_size
+        """
+        return sum(
+            self._get_size_for_object_group(object_group_idx)
+            for object_group_idx in range(self._kv_groups_manager.num_object_groups)
+        )
+
+
 class GPUCacheContext:
     """
     Manages the shape and pointers to vLLM GPU KV cache tensors.
@@ -107,44 +376,17 @@ def __init__(
         # Pre-allocated GPU buffer for block IDs (up to 1M elements).
         # The caller copies block_ids into this buffer before launching the
         # block-level kernel. Single-thread assumption: no lock needed.
-        _MAX_BLOCK_IDS = 1_000_000
+        _MAX_BLOCK_IDS = 1 << 20
         self.block_ids_buffer_ = torch.empty(
             _MAX_BLOCK_IDS, dtype=torch.long, device=self.device_
         )
 
         # Temporary GPU buffer for transfers — a single flat uint8 buffer
-        # laid out in chunk-major order so that each chunk's data matches
-        # the layout of a MemoryObj.raw_data (all groups concatenated):
-        #
-        #   [ chunk_0: group_0_bytes | group_1_bytes | ... ]
-        #   [ chunk_1: group_0_bytes | group_1_bytes | ... ]
-        #   ...
-        #
-        # This lets callers copy an entire chunk to/from a MemoryObj with a
-        # single memcpy, without needing to know the per-group layout.
-        # max_batch_size is the max number of chunks processed concurrently.
-        self.max_batch_size = 4
-        # Byte size of one chunk entry (= one chunk across all groups).
-        # tmp_chunk_group_offsets_[g] is the byte offset of group g within
-        # a single chunk; tmp_chunk_group_offsets_[num_groups] ==
-        # tmp_chunk_bytes_.
-        self.tmp_chunk_group_offsets_: list[int] = [0]
-        for group_idx, group in enumerate(
-            self.kv_layer_groups_manager_.kv_layer_groups
-        ):
-            # ``get_kv_buffer_shape`` takes *logical* tokens; for
-            # compressed groups it folds ``compress_ratio`` logical
-            # tokens into one physical slot internally.
-            shape = self.get_kv_buffer_shape(lmcache_logical_chunk_size, group_idx)
-            byte_size = shape.numel() * group.dtype.itemsize
-            self.tmp_chunk_group_offsets_.append(
-                self.tmp_chunk_group_offsets_[-1] + byte_size
-            )
-        self.tmp_chunk_bytes_ = self.tmp_chunk_group_offsets_[-1]
-        self.tmp_gpu_buffer_ = torch.empty(
-            self.tmp_chunk_bytes_ * self.max_batch_size,
-            dtype=torch.uint8,
+        self._temp_buffer = _TempGPUBuffer(
+            kv_layer_groups_manager=self.kv_layer_groups_manager_,
+            lmcache_logical_chunk_size=lmcache_logical_chunk_size,
             device=self.device_,
+            max_batch_size=4,
         )
 
         # GPU streams
@@ -156,14 +398,6 @@ def __init__(
             self.cuda_stream_.cuda_stream, self.device_.index
         )
 
-        _, high_priority = torch_dev.Stream.priority_range()
-        self.high_priority_cuda_stream_ = torch_dev.Stream(
-            device=self.device_, priority=high_priority
-        )
-        self.high_priority_cupy_stream_ = cupy.cuda.ExternalStream(
-            self.high_priority_cuda_stream_.cuda_stream, self.device_.index
-        )
-
         # Extra initialization
         self.cupy_stream_.launch_host_func(
             lambda logger: logger.info(
@@ -195,38 +429,6 @@ def stream(self) -> Any:
     def cupy_stream(self) -> "cupy.cuda.Stream":
         return self.cupy_stream_
 
-    @property
-    def high_priority_stream(self) -> Any:
-        return self.high_priority_cuda_stream_
-
-    @property
-    def high_priority_cupy_stream(self) -> "cupy.cuda.Stream":
-        return self.high_priority_cupy_stream_
-
-    @property
-    def group_physical_block_sizes(self) -> list[int]:
-        """Per-group physical slot count (``shape_desc.bs``) in group
-        order. For non-compressed groups this equals
-        ``inference_engine_logical_block_size``; for compressed groups
-        it equals
-        ``inference_engine_logical_block_size // compress_ratio``.
-        """
-        return [
-            group.shape_desc.bs
-            for group in self.kv_layer_groups_manager_.kv_layer_groups
-        ]
-
-    @property
-    def group_compress_ratios(self) -> list[int]:
-        """Per-group compression ratio
-        (= ``inference_engine_logical_block_size // shape_desc.bs``)
-        in group order. ``1`` for non-compressed groups.
-        """
-        return [
-            group.compress_ratio
-            for group in self.kv_layer_groups_manager_.kv_layer_groups
-        ]
-
     @property
     def num_layers(self) -> int:
         """
@@ -256,6 +458,11 @@ def hidden_dim_sizes(self) -> list[int]:
             for group in self.kv_layer_groups_manager_.kv_layer_groups
         ]
 
+    @property
+    def kv_layer_groups_manager(self) -> KVLayerGroupsManager:
+        """Returns the KV layer groups manager."""
+        return self.kv_layer_groups_manager_
+
     def get_shape_desc(self, group_idx: int) -> "lmc_ops.PageBufferShapeDesc":
         """Returns the PageBufferShapeDesc for the given KV layer group."""
         return self.kv_layer_groups_manager_.get_shape_desc(group_idx)
@@ -269,116 +476,70 @@ def get_physical_chunk_size(self, group_idx: int) -> int:
         """
         return self.kv_layer_groups_manager_.get_physical_chunk_size(group_idx)
 
-    def blocks_for_tokens(self, num_logical_tokens: int, group_idx: int) -> int:
-        """Number of group ``group_idx`` blocks that span ``num_logical_tokens``.
+    def get_kernel_group_kv_pointers(self, kernel_group_idx: int) -> torch.Tensor:
+        """Returns the pre-computed GPU tensor of KV cache pointers for the
+        given kernel group index.
+        """
+        return self.group_kv_pointers_[kernel_group_idx]
 
-        Each group counts blocks in its own ``block_size`` (``shape_desc.bs``),
-        which can differ across groups. For compressed groups, ``compress_ratio``
-        logical tokens share one physical slot, so it is divided out first.
+    def get_temp_kernel_group_buffer(
+        self, batch_idx: int, kernel_group_idx: int
+    ) -> torch.Tensor:
+        """Returns the temporary GPU buffer for the given batch index and kernel
+        group index, with the correct shape and dtype for the kernel group.
 
         Args:
-            num_logical_tokens: Number of logical (engine-side) tokens.
-            group_idx: Index of the KV layer group.
+            batch_idx: Index of the batch (0 <= batch_idx < max_batch_size)
+            kernel_group_idx: Index of the kernel group.
 
         Returns:
-            The number of this group's blocks spanning those tokens.
+            The temp GPU buffer for the given batch index and kernel group index.
         """
-        group = self.kv_layer_groups_manager_.kv_layer_groups[group_idx]
-        physical_slots = num_logical_tokens // group.compress_ratio
-        return physical_slots // group.shape_desc.bs
-
-    @property
-    def kv_layer_groups_manager(self) -> KVLayerGroupsManager:
-        """Returns the KV layer groups manager."""
-        return self.kv_layer_groups_manager_
-
-    @property
-    def gpu_kv_format_name(self) -> str:
-        """Returns the GPU KV format enum name (e.g. ``'NL_X_TWO_NB_BS_NH_HS'``)."""
-        return self.gpu_kv_format_.name
-
-    @property
-    def gpu_kv_shape(self) -> str:
-        """Returns a human-readable shape description of the GPU KV cache layout."""
-        return get_gpu_kv_shape_description(self.gpu_kv_format_)
-
-    @property
-    def attention_backend(self) -> str:
-        """Returns the attention backend name."""
-        return get_attention_backend(self.gpu_kv_format_)
+        return self._temp_buffer.get_temp_kernel_group_buffer(
+            batch_idx, kernel_group_idx
+        )
 
     @property
-    def concrete_gpu_kv_shape(self) -> str:
-        """Returns the GPU KV shape with actual numeric values substituted."""
-        return get_concrete_gpu_kv_shape(self.kv_caches_, self.gpu_kv_format_)
+    def max_batch_size(self) -> int:
+        """Maximum number of chunks processed concurrently in one batch."""
+        return self._temp_buffer.max_batch_size
 
-    def get_group_kv_pointers(self, group_idx: int) -> torch.Tensor:
-        """Returns the pre-computed GPU tensor of KV cache pointers for the
-        given group."""
-        return self.group_kv_pointers_[group_idx]
-
-    def get_tmp_gpu_buffer_flat(self, chunk_idx: int) -> torch.Tensor:
-        """Returns the flat uint8 view of the temporary GPU buffer for the
-        given chunk index, covering all KV layer groups.
-
-        The returned tensor will fit a memory full object corresponding
-        ``self.chunk_size`` tokens, so it can be copied to/from a MemoryObj
-        with a single memcpy.
+    def get_temp_object_group_buffer(
+        self, batch_idx: int, object_group_idx: int
+    ) -> torch.Tensor:
+        """Returns the temporary GPU buffer for the given batch index and object
+        group index, as a flat uint8 tensor.
 
         Args:
-            chunk_idx: Chunk index (0 <= chunk_idx < max_batch_size).
-        """
-        if chunk_idx >= self.max_batch_size:
-            raise ValueError(
-                f"chunk_idx {chunk_idx} exceeds max_batch_size {self.max_batch_size}"
-            )
-        start = chunk_idx * self.tmp_chunk_bytes_
-        return self.tmp_gpu_buffer_[start : start + self.tmp_chunk_bytes_]
-
-    def get_tmp_chunk_gpu_buffer(self, group_idx: int = 0) -> torch.Tensor:
-        """
-        Returns a view of the temporary GPU buffer for the given group,
-        sized for a single chunk. The chunk holds
-        ``lmcache_logical_chunk_size`` logical tokens which, for a
-        compressed group, correspond to ``group.physical_chunk_size``
-        physical slots.
+            batch_idx: Index of the batch (0 <= batch_idx < max_batch_size)
+            object_group_idx: Index of the object group.
 
-        Args:
-            group_idx: Index of the KV layer group (default 0).
+        Returns:
+            The temp GPU buffer for the given batch index and object group index.
         """
-        group = self.kv_layer_groups_manager_.kv_layer_groups[group_idx]
-        shape = self.get_kv_buffer_shape(self.lmcache_logical_chunk_size, group_idx)
-        start = self.tmp_chunk_group_offsets_[group_idx]
-        end = self.tmp_chunk_group_offsets_[group_idx + 1]
-        return self.tmp_gpu_buffer_[start:end].view(group.dtype).view(shape)
+        return self._temp_buffer.get_temp_object_group_buffer(
+            batch_idx, object_group_idx
+        )
 
-    def get_tmp_chunk_gpu_buffer_batched(
-        self, batch_size: int, group_idx: int = 0
-    ) -> list[torch.Tensor]:
-        """
-        Returns a list of ``batch_size`` non-overlapping views into the
-        pre-allocated temporary GPU buffer for the given group, each
-        sized for ``lmcache_logical_chunk_size`` tokens.
+    def get_kernel_group_shape_dtype(
+        self,
+        num_tokens: int,
+        kernel_group_idx: int,
+    ) -> tuple[torch.Size, torch.dtype]:
+        """Returns the shape and dtype for the given kernel group index and number
+        of tokens.
+        Will be exported by GPUCacheContext and used to construct the MemoryLayoutDesc
 
         Args:
-            batch_size: Number of concurrent requests (must be <= max_batch_size).
-            group_idx: Index of the KV layer group (default 0).
+            num_tokens: Number of tokens
+            kernel_group_idx: Index of the kernel group.
+
+        Returns:
+            The shape and dtype for the given kernel group index and number of tokens.
         """
-        if batch_size > self.max_batch_size:
-            raise ValueError(
-                f"batch_size {batch_size} exceeds max_batch_size {self.max_batch_size}"
-            )
-        group = self.kv_layer_groups_manager_.kv_layer_groups[group_idx]
-        shape = self.get_kv_buffer_shape(self.lmcache_logical_chunk_size, group_idx)
-        g_start = self.tmp_chunk_group_offsets_[group_idx]
-        g_end = self.tmp_chunk_group_offsets_[group_idx + 1]
-        chunk = self.tmp_chunk_bytes_
-        return [
-            self.tmp_gpu_buffer_[i * chunk + g_start : i * chunk + g_end]
-            .view(group.dtype)
-            .view(shape)
-            for i in range(batch_size)
-        ]
+        return self._temp_buffer.get_kernel_group_shape_dtype(
+            num_tokens, kernel_group_idx
+        )
 
     def copy_view_block_ids_to_gpu(
         self, block_ids_per_group: list[list[int]]
@@ -410,6 +571,7 @@ def copy_view_block_ids_to_gpu(
             for i in range(len(block_ids_per_group))
         ]
 
+    @lmcache_deprecate("will be refactored")
     def get_kv_buffer_shape(
         self, logical_num_tokens: int, group_idx: int = 0
     ) -> torch.Size:
@@ -429,6 +591,7 @@ def get_kv_buffer_shape(
                 of the group's ``compress_ratio``.
             group_idx: Index of the KV layer group (default 0).
         """
+        # TODO: remove this!
         group = self.kv_layer_groups_manager_.kv_layer_groups[group_idx]
         compress_ratio = group.compress_ratio
         if logical_num_tokens % compress_ratio != 0:
@@ -442,6 +605,24 @@ def get_kv_buffer_shape(
             (sd.kv_size, group.num_layers, num_slots, group.hidden_dim_size)
         )
 
+    def calculate_num_blocks(self, num_tokens: int, kernel_group_idx: int) -> int:
+        """Calculate the number of blocks for a given number of tokens in a
+        specified kernel group.
+
+        Args:
+            kernel_group_idx: 0-based index of the kernel group.
+            num_tokens: The total number of tokens to be processed for the group.
+
+        Returns:
+            The number of blocks.
+
+        Raises:
+            IndexError: If *kernel_group_idx* is out of range.
+        """
+        return self.kv_layer_groups_manager.calculate_num_blocks(
+            kernel_group_idx, num_tokens
+        )
+
     def cache_size_per_token(self) -> int:
         """
         Returns the cache size per *logical* token (in bytes), summed
@@ -453,20 +634,65 @@ def cache_size_per_token(self) -> int:
         endpoint and the ``lmcache describe`` CLI); sub-byte truncation
         from integer division is acceptable.
         """
-        total = 0
-        for group_idx, group in enumerate(
-            self.kv_layer_groups_manager_.kv_layer_groups
-        ):
-            # ``get_kv_buffer_shape`` now takes *logical* tokens, so
-            # query ``compress_ratio`` logical tokens (= 1 physical
-            # slot) and then divide the resulting bytes back by
-            # ``compress_ratio`` to recover the per-logical-token
-            # contribution. Equivalent to the old
-            # ``physical_slot_bytes // compress_ratio`` formulation.
-            numels = self.get_kv_buffer_shape(group.compress_ratio, group_idx).numel()
-            slot_bytes = numels * group.dtype.itemsize
-            total += slot_bytes // group.compress_ratio
-        return total
+        return self._temp_buffer.get_cache_size_per_token()
+
+    def report_status(self) -> dict:
+        """Return this context's KV cache layout metadata for ``/status``.
+
+        Builds the ``kv_cache_layout`` sub-dict surfaced by the ``/status``
+        HTTP endpoint (see ``GPUTransferModule.report_status``) and consumed by
+        the ``lmcache`` CLI (``lmcache describe kvcache`` and
+        ``lmcache bench engine``). It describes only the KV cache geometry; the
+        owning module wraps it with ``model_name``/``world_size``, which this
+        context does not track.
+
+        Returns:
+            A dict with one entry per documented layout field:
+
+            - ``num_layers`` (int)
+            - ``inference_engine_logical_block_size`` (int)
+            - ``group_physical_block_sizes`` (list[int]): per-group
+              ``shape_desc.bs``
+            - ``group_compress_ratios`` (list[int]): per-group compress ratio
+            - ``hidden_dim_sizes`` (str): stringified per-group hidden-dim list
+            - ``dtype`` (str): stringified torch dtype
+            - ``is_mla`` (bool)
+            - ``num_blocks`` (int)
+            - ``gpu_kv_format`` (str): GPU KV format enum name
+            - ``gpu_kv_shape`` (str): symbolic shape description
+            - ``gpu_kv_concrete_shape`` (str): shape with numeric values
+            - ``attention_backend`` (str)
+            - ``cache_size_per_token`` (int): bytes per logical token
+        """
+        # TODO(compat): the key names and value *formatting* below are a
+        # contract with the `/status` endpoint and the `lmcache` CLI
+        # (`lmcache/cli/commands/describe.py`, `bench/engine_bench/config.py`).
+        # Renaming a key breaks `lmcache describe kvcache`; dropping
+        # `cache_size_per_token` breaks `lmcache bench engine`. `hidden_dim_sizes`
+        # and `dtype` are stringified only for back-compat with those consumers
+        # and should become a real list / structured value once the CLI is
+        # updated to parse them.
+        manager = self.kv_layer_groups_manager
+        kernel_groups = manager.kernel_groups
+        return {
+            "num_layers": self.num_layers,
+            "inference_engine_logical_block_size": (
+                manager.inference_engine_logical_block_size
+            ),
+            "group_physical_block_sizes": [g.shape_desc.bs for g in kernel_groups],
+            "group_compress_ratios": [g.compress_ratio for g in kernel_groups],
+            "hidden_dim_sizes": str([g.hidden_dim_size for g in kernel_groups]),
+            "dtype": str(self.dtype),
+            "is_mla": self.is_mla,
+            "num_blocks": self.num_blocks,
+            "gpu_kv_format": self.gpu_kv_format_.name,
+            "gpu_kv_shape": get_gpu_kv_shape_description(self.gpu_kv_format_),
+            "gpu_kv_concrete_shape": get_concrete_gpu_kv_shape(
+                self.kv_caches_, self.gpu_kv_format_
+            ),
+            "attention_backend": get_attention_backend(self.gpu_kv_format_),
+            "cache_size_per_token": self.cache_size_per_token(),
+        }
 
 
 class PlainGPUCacheContext:
@@ -506,14 +732,6 @@ def __init__(self, kv_caches: KVCache, lmcache_chunk_size: int = 256):
             self._cuda_stream.cuda_stream, self._device.index
         )
 
-        _, high_priority = torch_dev.Stream.priority_range()
-        self._high_priority_cuda_stream = torch_dev.Stream(
-            device=self._device, priority=high_priority
-        )
-        self._high_priority_cupy_stream = cupy.cuda.ExternalStream(
-            self._high_priority_cuda_stream.cuda_stream, self._device.index
-        )
-
         # Extra initialization
         self._cupy_stream.launch_host_func(
             lambda logger: logger.info(
@@ -557,14 +775,6 @@ def stream(self) -> Any:
     def cupy_stream(self) -> "cupy.cuda.Stream":
         return self._cupy_stream
 
-    @property
-    def high_priority_stream(self) -> Any:
-        return self._high_priority_cuda_stream
-
-    @property
-    def high_priority_cupy_stream(self) -> "cupy.cuda.Stream":
-        return self._high_priority_cupy_stream
-
     @property
     def num_layers(self) -> int:
         return self._num_layers
diff --git a/lmcache/v1/multiprocess/modules/blend_v3.py b/lmcache/v1/multiprocess/modules/blend_v3.py
index 20db4908a9..7f0dba6889 100644
--- a/lmcache/v1/multiprocess/modules/blend_v3.py
+++ b/lmcache/v1/multiprocess/modules/blend_v3.py
@@ -823,17 +823,18 @@ def _apply_cb_rope_batched(
         (slot_idx, old_st, cur_st)."""
         if not slots_to_rope:
             return
-        num_groups = gpu_context.kv_layer_groups_manager.num_groups
+        num_groups = gpu_context.kv_layer_groups_manager.num_kernel_groups
         for group_idx in range(num_groups):
-            group = gpu_context.kv_layer_groups_manager.kv_layer_groups[group_idx]
+            group = gpu_context.kv_layer_groups_manager.kernel_groups[group_idx]
             if group.compress_ratio != 1:
                 raise RuntimeError(
                     f"CB v3: group {group_idx} has compress_ratio="
                     f"{group.compress_ratio}; compressed layouts unsupported."
                 )
-            all_slots = gpu_context.get_tmp_chunk_gpu_buffer_batched(
-                batch_size=batch_len, group_idx=group_idx
-            )
+            all_slots = [
+                gpu_context.get_temp_kernel_group_buffer(slot_idx, group_idx)
+                for slot_idx in range(batch_len)
+            ]
             if all_slots[0].shape[0] != 2:
                 raise RuntimeError(
                     f"CB v3: group {group_idx} has kv_size={all_slots[0].shape[0]}; "
@@ -948,7 +949,7 @@ def cb_retrieve_pre_computed(
                 f"chunk_size {chunk_size} must be a multiple of "
                 f"inference_engine_logical_block_size {ie_logical_block_size}"
             )
-        num_groups = gpu_context.kv_layer_groups_manager.num_groups
+        num_groups = gpu_context.kv_layer_groups_manager.num_kernel_groups
 
         with (
             torch_dev.device(gpu_context.device),
@@ -1026,8 +1027,9 @@ def cb_retrieve_pre_computed(
 
                             # (a) H2D fill into per-chunk tmp slots.
                             for slot_idx, (_, memory_obj) in enumerate(batch):
-                                flat_slot = gpu_context.get_tmp_gpu_buffer_flat(
-                                    chunk_idx=slot_idx
+                                # Single object group => object_group_idx=0.
+                                flat_slot = gpu_context.get_temp_object_group_buffer(
+                                    slot_idx, 0
                                 )
                                 lmcache_memcpy_async_h2d(memory_obj, flat_slot)
 
@@ -1060,16 +1062,16 @@ def cb_retrieve_pre_computed(
                             )
                             page_buffer_size = gpu_context.num_blocks * bs
                             for group_idx in range(num_groups):
-                                tmp_buffers = (
-                                    gpu_context.get_tmp_chunk_gpu_buffer_batched(
-                                        batch_size=batch_len,
-                                        group_idx=group_idx,
+                                tmp_buffers = [
+                                    gpu_context.get_temp_kernel_group_buffer(
+                                        slot_idx, group_idx
                                     )
-                                )
+                                    for slot_idx in range(batch_len)
+                                ]
                                 key_value = torch.cat(tmp_buffers, dim=2)
                                 lmc_ops.multi_layer_kv_transfer(
                                     key_value,
-                                    gpu_context.get_group_kv_pointers(group_idx),
+                                    gpu_context.get_kernel_group_kv_pointers(group_idx),
                                     slot_mapping,
                                     gpu_context.device,
                                     page_buffer_size,
diff --git a/lmcache/v1/multiprocess/modules/gpu_transfer.py b/lmcache/v1/multiprocess/modules/gpu_transfer.py
index 25c0bd0ab6..8b012af0c3 100644
--- a/lmcache/v1/multiprocess/modules/gpu_transfer.py
+++ b/lmcache/v1/multiprocess/modules/gpu_transfer.py
@@ -49,29 +49,35 @@
 
 
 def get_layout_desc(
-    cache_context: GPUCacheContext, num_tokens: int
+    gpu_context: GPUCacheContext,
+    num_tokens: int,
+    object_group_id: int = 0,
 ) -> MemoryLayoutDesc:
-    """Get the memory layout description for a given GPU context and number of tokens.
+    """Get the memory layout description for a specific object group.
 
-    Supports multiple KV layer groups with different shapes and dtypes.
+    The returned layout describes the single memory object that backs
+    ``object_group_id``: one (shape, dtype) entry per kernel group in that
+    object group, in the kernel groups' declared layout order. Kernel groups
+    may have different shapes and dtypes.
 
     Args:
         cache_context: The GPU cache context containing the KV cache information.
         num_tokens: The number of tokens to determine the layout for.
+        object_group_id: Index of the object group whose layout to build.
+            Defaults to 0; under the current single-object-group assumption this
+            covers every kernel group.
 
     Returns:
-        MemoryLayoutDesc: The memory layout description containing shapes and dtypes.
+        MemoryLayoutDesc: The memory layout description containing shapes and
+        dtypes, one entry per kernel group in the object group.
     """
-    num_groups = cache_context.kv_layer_groups_manager.num_groups
-    shapes = [
-        cache_context.get_kv_buffer_shape(num_tokens, group_idx)
-        for group_idx in range(num_groups)
+    object_group = gpu_context.kv_layer_groups_manager.object_groups[object_group_id]
+    shapes_and_dtypes = [
+        gpu_context.get_kernel_group_shape_dtype(num_tokens, kernel_group_idx)
+        for kernel_group_idx in object_group.kernel_group_indices
     ]
-    dtypes = [
-        cache_context.kv_layer_groups_manager.kv_layer_groups[group_idx].dtype
-        for group_idx in range(num_groups)
-    ]
-    return MemoryLayoutDesc(shapes=shapes, dtypes=dtypes)
+    shapes, dtypes = zip(*shapes_and_dtypes, strict=False)
+    return MemoryLayoutDesc(shapes=list(shapes), dtypes=list(dtypes))
 
 
 def batched_iteration(lst: list, batch_size: int) -> Generator[tuple, None, None]:
@@ -198,23 +204,7 @@ def report_status(self) -> dict:
             cache_context_meta[str(instance_id)] = {
                 "model_name": entry.model_name,
                 "world_size": entry.world_size,
-                "kv_cache_layout": {
-                    "num_layers": ctx.num_layers,
-                    "inference_engine_logical_block_size": (
-                        ctx.kv_layer_groups_manager.inference_engine_logical_block_size
-                    ),
-                    "group_physical_block_sizes": ctx.group_physical_block_sizes,
-                    "group_compress_ratios": ctx.group_compress_ratios,
-                    "hidden_dim_sizes": str(ctx.hidden_dim_sizes),
-                    "dtype": str(ctx.dtype),
-                    "is_mla": ctx.is_mla,
-                    "num_blocks": ctx.num_blocks,
-                    "gpu_kv_format": ctx.gpu_kv_format_name,
-                    "gpu_kv_shape": ctx.gpu_kv_shape,
-                    "gpu_kv_concrete_shape": ctx.concrete_gpu_kv_shape,
-                    "attention_backend": ctx.attention_backend,
-                    "cache_size_per_token": ctx.cache_size_per_token(),
-                },
+                "kv_cache_layout": ctx.report_status(),
             }
 
         return {
@@ -279,7 +269,9 @@ def register_kv_cache(
             world_size=world_size,
         )
 
-        layout_desc = get_layout_desc(cache_context, self._ctx.chunk_size)
+        layout_desc = get_layout_desc(
+            cache_context, self._ctx.chunk_size, object_group_id=0
+        )
         self._ctx.layout_desc_registry.register(model_name, world_size, layout_desc)
 
         logger.info(
@@ -351,11 +343,14 @@ def store(
         cache_context = entry.cache_context
         model_name = entry.model_name
 
+        # TODO(refactor): only single-object-group transfers are wired up so far.
+        assert cache_context.kv_layer_groups_manager.num_object_groups == 1
+
         # NOTE: different engine groups may have different block sizes, so
         # ``blocks_per_chunk[i]`` is the number of blocks in one chunk for
         # group ``i``.
         blocks_per_chunk = [
-            cache_context.blocks_for_tokens(self._ctx.chunk_size, group_idx)
+            cache_context.calculate_num_blocks(self._ctx.chunk_size, group_idx)
             for group_idx in range(cache_context.kv_layer_groups_manager.num_groups)
         ]
 
@@ -431,7 +426,9 @@ def store(
             reserved_dict: dict[ObjectKey, MemoryObj] = {}
             store_succeeded = False
             try:
-                layout_desc = get_layout_desc(cache_context, self._ctx.chunk_size)
+                layout_desc = get_layout_desc(
+                    cache_context, self._ctx.chunk_size, object_group_id=0
+                )
                 reserved_dict = self._ctx.storage_manager.reserve_write(
                     obj_keys, layout_desc, "new"
                 )
@@ -454,8 +451,11 @@ def store(
                         chunk_block_ids_gpu = block_ids_per_group_gpu[group_idx][
                             idx * bpc : (idx + 1) * bpc
                         ]
-                        tmp_buffer = cache_context.get_tmp_chunk_gpu_buffer(group_idx)
-                        group_kv_pointers = cache_context.get_group_kv_pointers(
+                        # Store is not batched, so we always use batch_idx=0.
+                        tmp_buffer = cache_context.get_temp_kernel_group_buffer(
+                            0, group_idx
+                        )
+                        group_kv_pointers = cache_context.get_kernel_group_kv_pointers(
                             group_idx
                         )
                         # Kernel contract: ``group_lmcache_chunk_size`` here is the
@@ -475,9 +475,10 @@ def store(
                             cache_context.gpu_kv_format_,
                             0,
                         )
-                    # Store is not batched, so we always use chunk_idx=0 (single slot)
+                    # Store is not batched, so we always use batch_idx=0 (single
+                    # slot). Single object group => object_group_idx=0.
                     lmcache_memcpy_async_d2h(
-                        cache_context.get_tmp_gpu_buffer_flat(chunk_idx=0), memory_obj
+                        cache_context.get_temp_object_group_buffer(0, 0), memory_obj
                     )
                 store_succeeded = True
             except Exception:
@@ -565,6 +566,9 @@ def retrieve(
         cache_context = entry.cache_context
         model_name = entry.model_name
 
+        # TODO(refactor): only single-object-group transfers are wired up so far.
+        assert cache_context.kv_layer_groups_manager.num_object_groups == 1
+
         # CPU-synchronous sentinel: a GPU retrieve is about to be enqueued.
         # Must be published via publish() (not publish_on_stream) so the
         # drain thread sees it before MP_REQUEST_END can race MP_RETRIEVE_END.
@@ -634,12 +638,13 @@ def _retrieve_loop(keys: list[ObjectKey], memory_objs: list[MemoryObj]) -> None:
                 # Copy from CPU to GPU tmp buffers, then scatter to paged KV — per group
                 # H2D copy: each memory_obj maps to its own batch slot
                 for chunk_idx, memory_obj in enumerate(memory_obj_batch):
+                    # Single object group => object_group_idx=0.
                     lmcache_memcpy_async_h2d(
                         memory_obj,
-                        cache_context.get_tmp_gpu_buffer_flat(chunk_idx=chunk_idx),
+                        cache_context.get_temp_object_group_buffer(chunk_idx, 0),
                     )
                 for group_idx, group in enumerate(groups):
-                    bpc = cache_context.blocks_for_tokens(
+                    bpc = cache_context.calculate_num_blocks(
                         self._ctx.chunk_size, group_idx
                     )
                     chunk_block_ids_gpu = block_ids_per_group_gpu[group_idx][
@@ -656,13 +661,16 @@ def _retrieve_loop(keys: list[ObjectKey], memory_objs: list[MemoryObj]) -> None:
                             f"expected={batch_len * bpc} "
                             f"got={chunk_block_ids_gpu.shape[0]}"
                         )
-                    group_skip_blocks = cache_context.blocks_for_tokens(
+                    group_skip_blocks = cache_context.calculate_num_blocks(
                         skip_tokens_in_chunk, group_idx
                     )
-                    tmp_buffers = cache_context.get_tmp_chunk_gpu_buffer_batched(
-                        batch_len, group_idx
+                    tmp_buffers = [
+                        cache_context.get_temp_kernel_group_buffer(i, group_idx)
+                        for i in range(batch_len)
+                    ]
+                    group_kv_pointers = cache_context.get_kernel_group_kv_pointers(
+                        group_idx
                     )
-                    group_kv_pointers = cache_context.get_group_kv_pointers(group_idx)
                     group_lmcache_chunk_size = cache_context.get_physical_chunk_size(
                         group_idx
                     )
diff --git a/tests/v1/distributed/serde/test_serde_e2e.py b/tests/v1/distributed/serde/test_serde_e2e.py
index b88a18fb57..1b192df05e 100644
--- a/tests/v1/distributed/serde/test_serde_e2e.py
+++ b/tests/v1/distributed/serde/test_serde_e2e.py
@@ -197,7 +197,7 @@ def test_store_and_prefetch_with_serde(self) -> None:
         write_and_wait_for_l2(sm, keys, layout)
 
         # Brief sleep so StoreController releases read locks after L2 store
-        time.sleep(0.1)
+        time.sleep(1)
         sm.clear()
         assert get_l1_object_count(sm) == 0
 
@@ -222,7 +222,7 @@ def test_no_memory_leak_after_full_cycle(self) -> None:
         keys = [make_object_key(i) for i in range(3)]
 
         write_and_wait_for_l2(sm, keys, layout)
-        time.sleep(0.1)
+        time.sleep(1)
         sm.clear()
 
         # Prefetch
@@ -263,7 +263,7 @@ def test_store_and_prefetch_without_serde(self) -> None:
         keys = [make_object_key(i) for i in range(5)]
 
         write_and_wait_for_l2(sm, keys, layout)
-        time.sleep(0.1)
+        time.sleep(1)
         sm.clear()
 
         handle = sm.submit_prefetch_task(keys, layout)
@@ -285,7 +285,7 @@ def test_no_memory_leak_without_serde(self) -> None:
         keys = [make_object_key(i) for i in range(3)]
 
         write_and_wait_for_l2(sm, keys, layout)
-        time.sleep(0.1)
+        time.sleep(1)
         sm.clear()
 
         handle = sm.submit_prefetch_task(keys, layout)
@@ -318,7 +318,7 @@ def test_partial_prefix_with_serde(self) -> None:
         # Write only keys 0, 1, 3, 4 (skip 2)
         keys_to_write = [make_object_key(i) for i in [0, 1, 3, 4]]
         write_and_wait_for_l2(sm, keys_to_write, layout)
-        time.sleep(0.1)
+        time.sleep(1)
         sm.clear()
 
         # Request all 5 keys — prefix should be 2 (gap at index 2)
@@ -354,7 +354,7 @@ def test_repeated_cycles_no_leak(self) -> None:
         for cycle in range(5):
             keys = [make_object_key(cycle * 10 + i) for i in range(3)]
             write_and_wait_for_l2(sm, keys, layout)
-            time.sleep(0.1)
+            time.sleep(1)
             sm.clear()
 
             handle = sm.submit_prefetch_task(keys, layout)
@@ -441,7 +441,7 @@ def _run_roundtrip(
         keys = [make_object_key(i) for i in range(num_keys)]
 
         write_and_wait_for_l2(sm, keys, layout)
-        time.sleep(0.1)
+        time.sleep(1)
         sm.clear()
         assert get_l1_object_count(sm) == 0
 
diff --git a/tests/v1/multiprocess/test_blend_v3_load_store_opts.py b/tests/v1/multiprocess/test_blend_v3_load_store_opts.py
index bcf7a73820..fd8047ed6a 100644
--- a/tests/v1/multiprocess/test_blend_v3_load_store_opts.py
+++ b/tests/v1/multiprocess/test_blend_v3_load_store_opts.py
@@ -230,25 +230,20 @@ def _build_fake_gpu_context(batch_size: int, num_groups: int):
     """Returns a MagicMock matching the minimal GPUCacheContext surface
     used by _apply_cb_rope_batched."""
     gpu_context = MagicMock()
-    gpu_context.kv_layer_groups_manager.num_groups = num_groups
+    gpu_context.kv_layer_groups_manager.num_kernel_groups = num_groups
     # All groups: compress_ratio=1, kv_size=2.
     groups = [SimpleNamespace(compress_ratio=1) for _ in range(num_groups)]
-    gpu_context.kv_layer_groups_manager.kv_layer_groups = groups
+    gpu_context.kv_layer_groups_manager.kernel_groups = groups
 
-    # all_slots = [tmp_for_slot_0, ..., tmp_for_slot_{batch-1}]
-    # Each tmp shape: (2 kv, num_layers, slots_per_chunk, hidden_dim).
+    # Each per-(slot, group) buffer has shape
+    # (2 kv, num_layers, slots_per_chunk, hidden_dim).
     num_layers, slots_per_chunk, hidden_dim = 2, 4, 64
     head_size = 32
 
-    def _get_tmp_chunk_gpu_buffer_batched(batch_size, group_idx):
-        return [
-            _FakeTensor((2, num_layers, slots_per_chunk, hidden_dim))
-            for _ in range(batch_size)
-        ]
+    def _get_temp_kernel_group_buffer(batch_idx, kernel_group_idx):
+        return _FakeTensor((2, num_layers, slots_per_chunk, hidden_dim))
 
-    gpu_context.get_tmp_chunk_gpu_buffer_batched.side_effect = (
-        _get_tmp_chunk_gpu_buffer_batched
-    )
+    gpu_context.get_temp_kernel_group_buffer.side_effect = _get_temp_kernel_group_buffer
     return gpu_context, head_size
 
 
@@ -292,8 +287,10 @@ def repeat(self, n):
 
         eng._apply_cb_rope_batched(gpu_context, rope_state, 4, slots_to_rope)
 
-    # Per-group setup (get_tmp_chunk_gpu_buffer_batched) called once per group.
-    assert gpu_context.get_tmp_chunk_gpu_buffer_batched.call_count == 2
+    # all_slots is built once per group (G=2), each fetching the full batch
+    # of slot buffers => batch_len(4) × G(2) = 8 buffer fetches, independent
+    # of how many slots are actually re-RoPE'd.
+    assert gpu_context.get_temp_kernel_group_buffer.call_count == 8
     # Kernel called N=2 slots × G=2 groups = 4 times.
     assert ops.rotary_embedding_k_fused.call_count == 4
 
@@ -315,7 +312,7 @@ def test_batched_rope_noop_on_empty_slots():
     with patch.object(v3_mod, "lmc_ops") as ops:
         eng._apply_cb_rope_batched(gpu_context, rope_state, 2, [])
 
-    assert gpu_context.get_tmp_chunk_gpu_buffer_batched.call_count == 0
+    assert gpu_context.get_temp_kernel_group_buffer.call_count == 0
     assert ops.rotary_embedding_k_fused.call_count == 0
 
 
@@ -325,8 +322,8 @@ def test_batched_rope_raises_on_compressed_layout():
     from lmcache.v1.multiprocess.modules import blend_v3 as v3_mod
 
     gpu_context = MagicMock()
-    gpu_context.kv_layer_groups_manager.num_groups = 1
-    gpu_context.kv_layer_groups_manager.kv_layer_groups = [
+    gpu_context.kv_layer_groups_manager.num_kernel_groups = 1
+    gpu_context.kv_layer_groups_manager.kernel_groups = [
         SimpleNamespace(compress_ratio=2)
     ]
     rope_state = SimpleNamespace(
diff --git a/tests/v1/multiprocess/test_gpu_context.py b/tests/v1/multiprocess/test_gpu_context.py
index e7be624cc8..535cdf4ef3 100644
--- a/tests/v1/multiprocess/test_gpu_context.py
+++ b/tests/v1/multiprocess/test_gpu_context.py
@@ -1,14 +1,26 @@
 # SPDX-License-Identifier: Apache-2.0
 
-"""Unit tests for GPUCacheContext.get_tmp_chunk_gpu_buffer,
-get_tmp_chunk_gpu_buffer_batched and get_tmp_gpu_buffer_flat — verifying
-contiguity, shape, non-overlapping guarantees, and multi-group layout.
-
-These tests construct a minimal GPUCacheContext-like object that has
-just the fields the buffer methods need, avoiding the full KVCache /
-CudaIPCWrapper construction.
+"""Unit tests for the temp-GPU-buffer machinery in
+``lmcache.v1.multiprocess.gpu_context``.
+
+Two layers are exercised:
+
+* ``_TempGPUBuffer`` -- the standalone buffer manager. It is built directly
+  from a real :class:`KVLayerGroupsManager` (its constructor is fully public),
+  so the layout invariants (per-kernel-group shape/dtype, per-object-group flat
+  views, non-overlap, write isolation, byte sizing) are tested in isolation.
+
+* ``GPUCacheContext`` -- the higher-level context that owns a ``_TempGPUBuffer``
+  and exposes the per-kernel-group / per-object-group buffer accessors plus
+  ``get_kernel_group_kv_pointers``, ``calculate_num_blocks``,
+  ``kv_layer_groups_manager`` and ``report_status``. It is built through its
+  real public constructor using a lightweight ``to_tensor`` test double in place
+  of ``CudaIPCWrapper`` (same-process CUDA IPC cannot reimport its own handle).
 """
 
+# Standard
+from collections.abc import Sequence
+
 # Third Party
 import pytest
 import torch
@@ -18,402 +30,486 @@
 )
 
 # First Party
+from lmcache.v1.gpu_connector.utils import LayoutHints  # noqa: E402
 from lmcache.v1.kv_layer_groups import KVLayerGroupsManager  # noqa: E402
-from lmcache.v1.multiprocess.gpu_context import GPUCacheContext  # noqa: E402
+from lmcache.v1.multiprocess.gpu_context import (  # noqa: E402
+    GPUCacheContext,
+    _TempGPUBuffer,
+)
 import lmcache.c_ops as lmc_ops  # noqa: E402
 
+_DEVICE = torch.device("cuda")
+
+
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
 
 
-def _make_context(
-    num_layers: int = 4,
-    num_heads: int = 8,
-    head_size: int = 128,
-    is_mla: bool = False,
-    chunk_size: int = 256,
-    dtype: torch.dtype = torch.bfloat16,
-) -> GPUCacheContext:
-    """Build a GPUCacheContext with a single KV layer group by directly
-    setting internal fields, bypassing the KVCache/IPC wrapper construction."""
-    ctx = object.__new__(GPUCacheContext)
-    ctx.is_mla_ = is_mla
-    ctx.num_layers_ = num_layers
-    ctx.max_batch_size = 4
-
-    # Build a real KVLayerGroupsManager from synthetic tensors shaped to
-    # match the grouping signature the tests care about.
-    if is_mla:
-        kv_caches = [
-            torch.empty(1, 1, head_size, dtype=dtype) for _ in range(num_layers)
-        ]
-        fmt = lmc_ops.GPUKVFormat.NL_X_NB_BS_HS
-    else:
-        kv_caches = [
-            torch.empty(2, 1, 1, num_heads, head_size, dtype=dtype)
-            for _ in range(num_layers)
-        ]
-        fmt = lmc_ops.GPUKVFormat.NL_X_TWO_NB_BS_NH_HS
-    manager = KVLayerGroupsManager(
-        kv_caches, fmt, num_blocks=1, lmcache_logical_chunk_size=chunk_size
-    )
-    ctx.kv_layer_groups_manager_ = manager
-
-    # Build flat tmp_gpu_buffer_ with prefix-sum offsets (new layout)
-    ctx.tmp_chunk_group_offsets_ = [0]
-    for gidx, grp in enumerate(manager.kv_layer_groups):
-        shape = ctx.get_kv_buffer_shape(chunk_size, gidx)
-        byte_size = shape.numel() * grp.dtype.itemsize
-        ctx.tmp_chunk_group_offsets_.append(
-            ctx.tmp_chunk_group_offsets_[-1] + byte_size
-        )
-    ctx.tmp_chunk_bytes_ = ctx.tmp_chunk_group_offsets_[-1]
-    ctx.lmcache_logical_chunk_size = chunk_size
-    ctx.tmp_gpu_buffer_ = torch.empty(
-        ctx.tmp_chunk_bytes_ * ctx.max_batch_size,
-        dtype=torch.uint8,
-        device="cuda",
+class _GroupSpec:
+    """Description of one homogeneous block of KV layers used to build the
+    synthetic ``[2, NB, BS, NH, HS]`` (non-MLA) tensors fed to the manager."""
+
+    def __init__(
+        self,
+        num_layers: int,
+        num_heads: int = 8,
+        head_size: int = 64,
+        block_size: int = 16,
+        dtype: torch.dtype = torch.bfloat16,
+    ) -> None:
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.head_size = head_size
+        self.block_size = block_size
+        self.dtype = dtype
+
+
+def _make_kv_tensors(
+    specs: Sequence[_GroupSpec],
+    num_blocks: int = 4,
+) -> list[torch.Tensor]:
+    """Build non-MLA per-layer KV tensors shaped ``[2, NB, BS, NH, HS]``."""
+    tensors: list[torch.Tensor] = []
+    for spec in specs:
+        for _ in range(spec.num_layers):
+            tensors.append(
+                torch.empty(
+                    2,
+                    num_blocks,
+                    spec.block_size,
+                    spec.num_heads,
+                    spec.head_size,
+                    dtype=spec.dtype,
+                    device=_DEVICE,
+                )
+            )
+    return tensors
+
+
+def _build_manager(
+    tensors: list[torch.Tensor],
+    num_blocks: int = 4,
+    gpu_kv_format: "lmc_ops.GPUKVFormat" = lmc_ops.GPUKVFormat.NL_X_TWO_NB_BS_NH_HS,
+    layout_hints: LayoutHints | None = None,
+) -> KVLayerGroupsManager:
+    """Build a real :class:`KVLayerGroupsManager` from synthetic tensors."""
+    return KVLayerGroupsManager(
+        tensors,
+        gpu_kv_format=gpu_kv_format,
+        num_blocks=num_blocks,
+        layout_hints=layout_hints,
     )
-    return ctx
 
 
-def _make_context_multi_group(
-    groups: list[dict],
+def _make_temp_buffer(
+    specs: Sequence[_GroupSpec],
     chunk_size: int = 256,
-    is_mla: bool = False,
-) -> GPUCacheContext:
-    """Build a GPUCacheContext with multiple KV layer groups.
-
-    Args:
-        groups: List of dicts, each with keys:
-            - num_layers (int)
-            - num_heads  (int)
-            - head_size  (int)
-            - dtype      (torch.dtype, optional, default bfloat16)
-        chunk_size: Tokens per chunk.
-        is_mla: Whether to use MLA (kv_dim=1) layout.
-    """
-    assert not is_mla, "multi-group helper only exercises the non-MLA path"
-    ctx = object.__new__(GPUCacheContext)
-    ctx.is_mla_ = is_mla
-    ctx.max_batch_size = 4
-
-    kv_caches: list[torch.Tensor] = []
-    for g in groups:
-        nl = g["num_layers"]
-        nh = g["num_heads"]
-        hs = g["head_size"]
-        dt = g.get("dtype", torch.bfloat16)
-        kv_caches.extend(torch.empty(2, 1, 1, nh, hs, dtype=dt) for _ in range(nl))
-
-    ctx.num_layers_ = len(kv_caches)
-    manager = KVLayerGroupsManager(
-        kv_caches,
-        lmc_ops.GPUKVFormat.NL_X_TWO_NB_BS_NH_HS,
-        num_blocks=1,
+    max_batch_size: int = 4,
+    num_blocks: int = 4,
+    layout_hints: LayoutHints | None = None,
+) -> _TempGPUBuffer:
+    """Build a ``_TempGPUBuffer`` backed by a real manager."""
+    tensors = _make_kv_tensors(specs, num_blocks=num_blocks)
+    manager = _build_manager(tensors, num_blocks=num_blocks, layout_hints=layout_hints)
+    return _TempGPUBuffer(
+        kv_layer_groups_manager=manager,
         lmcache_logical_chunk_size=chunk_size,
+        device=_DEVICE,
+        max_batch_size=max_batch_size,
     )
-    ctx.kv_layer_groups_manager_ = manager
-
-    # Build flat tmp_gpu_buffer_ with prefix-sum offsets
-    ctx.tmp_chunk_group_offsets_ = [0]
-    for gidx, grp in enumerate(manager.kv_layer_groups):
-        shape = ctx.get_kv_buffer_shape(chunk_size, gidx)
-        byte_size = shape.numel() * grp.dtype.itemsize
-        ctx.tmp_chunk_group_offsets_.append(
-            ctx.tmp_chunk_group_offsets_[-1] + byte_size
+
+
+def _expected_kernel_group_shape(
+    manager: KVLayerGroupsManager, num_tokens: int, kernel_group_idx: int
+) -> torch.Size:
+    """Compute the expected kernel-group buffer shape from the manager's
+    public metadata (kv_size, num_layers, slots, hidden_dim)."""
+    group = manager.kernel_groups[kernel_group_idx]
+    num_slots = num_tokens // group.compress_ratio
+    return torch.Size(
+        (
+            group.shape_desc.kv_size,
+            group.num_layers,
+            num_slots,
+            group.hidden_dim_size,
         )
-    ctx.tmp_chunk_bytes_ = ctx.tmp_chunk_group_offsets_[-1]
-    ctx.lmcache_logical_chunk_size = chunk_size
-    ctx.tmp_gpu_buffer_ = torch.empty(
-        ctx.tmp_chunk_bytes_ * ctx.max_batch_size,
-        dtype=torch.uint8,
-        device="cuda",
     )
-    return ctx
 
 
-# ---------------------------------------------------------------------------
-# get_tmp_chunk_gpu_buffer tests
-# ---------------------------------------------------------------------------
+def _expected_kernel_group_bytes(
+    manager: KVLayerGroupsManager, chunk_size: int, kernel_group_idx: int
+) -> int:
+    """Byte size of one kernel group's per-chunk buffer."""
+    group = manager.kernel_groups[kernel_group_idx]
+    shape = _expected_kernel_group_shape(manager, chunk_size, kernel_group_idx)
+    return shape.numel() * group.dtype.itemsize
 
 
-class TestGetTmpChunkGpuBuffer:
-    def test_contiguity(self) -> None:
-        ctx = _make_context(chunk_size=256)
-        buf = ctx.get_tmp_chunk_gpu_buffer()
-        assert buf.is_contiguous(), "Buffer not contiguous"
+def _byte_region(buf: torch.Tensor) -> tuple[int, int]:
+    """Return ``(start_ptr, end_ptr)`` covering a tensor's bytes."""
+    start = buf.data_ptr()
+    return start, start + buf.nelement() * buf.element_size()
 
-    def test_shape(self) -> None:
-        ctx = _make_context(chunk_size=256)
-        buf = ctx.get_tmp_chunk_gpu_buffer()
-        expected = ctx.get_kv_buffer_shape(256)
-        assert buf.shape == expected
 
-    def test_shape_mla(self) -> None:
-        ctx = _make_context(is_mla=True, num_heads=1, head_size=576, chunk_size=256)
-        buf = ctx.get_tmp_chunk_gpu_buffer()
-        expected = ctx.get_kv_buffer_shape(256)
-        assert buf.shape == expected
-        assert buf.shape[0] == 1  # kv_dim=1 for MLA
+def _assert_disjoint(regions: list[tuple[int, int, str]]) -> None:
+    """Assert that no two ``(start, end, label)`` byte ranges overlap."""
+    for i in range(len(regions)):
+        for j in range(i + 1, len(regions)):
+            s_i, e_i, label_i = regions[i]
+            s_j, e_j, label_j = regions[j]
+            assert e_i <= s_j or e_j <= s_i, f"Overlap between {label_i} and {label_j}"
 
-    def test_repeated_calls_same_ptr(self) -> None:
-        """Two calls should return the same base pointer (same pre-allocated slot)."""
-        ctx = _make_context(chunk_size=256)
-        buf1 = ctx.get_tmp_chunk_gpu_buffer()
-        buf2 = ctx.get_tmp_chunk_gpu_buffer()
-        assert buf1.data_ptr() == buf2.data_ptr()
-
-    def test_write_read_roundtrip(self) -> None:
-        """Write a pattern, read it back to verify the view is correct."""
-        ctx = _make_context(num_layers=2, num_heads=2, head_size=16, chunk_size=32)
-        buf = ctx.get_tmp_chunk_gpu_buffer()
-        buf.fill_(42.0)
-        assert buf.to(torch.float32).sum().item() == pytest.approx(
-            42.0 * buf.numel(), rel=1e-3
-        )
 
+class _FakeIPCWrapper:
+    """Test-only stand-in for ``CudaIPCWrapper``.
 
-# ---------------------------------------------------------------------------
-# get_tmp_chunk_gpu_buffer_batched tests
-# ---------------------------------------------------------------------------
+    ``GPUCacheContext`` only needs ``to_tensor()`` from each entry of its
+    ``kv_caches`` argument. Same-process CUDA IPC cannot reopen its own handle,
+    so this test double simply hands back a locally allocated CUDA tensor,
+    letting the real ``GPUCacheContext`` constructor run end to end.
+    """
 
+    def __init__(self, tensor: torch.Tensor) -> None:
+        self._tensor = tensor
 
-class TestGetTmpChunkGpuBufferBatched:
-    @pytest.mark.parametrize("batch_size", [1, 2, 3, 4])
-    def test_contiguity(self, batch_size: int) -> None:
-        ctx = _make_context(chunk_size=256)
-        buffers = ctx.get_tmp_chunk_gpu_buffer_batched(batch_size)
-        assert len(buffers) == batch_size
-        for i, buf in enumerate(buffers):
-            assert buf.is_contiguous(), f"Buffer {i} not contiguous"
-
-    @pytest.mark.parametrize("batch_size", [1, 2, 3, 4])
-    def test_shapes(self, batch_size: int) -> None:
-        ctx = _make_context(chunk_size=256)
-        buffers = ctx.get_tmp_chunk_gpu_buffer_batched(batch_size)
-        expected_shape = ctx.get_kv_buffer_shape(256)
-        for buf in buffers:
-            assert buf.shape == expected_shape
-
-    @pytest.mark.parametrize("batch_size", [2, 3, 4])
-    def test_non_overlapping(self, batch_size: int) -> None:
-        """Buffers in a batch must not overlap in memory."""
-        ctx = _make_context(chunk_size=256)
-        buffers = ctx.get_tmp_chunk_gpu_buffer_batched(batch_size)
-        for i in range(len(buffers)):
-            for j in range(i + 1, len(buffers)):
-                start_i = buffers[i].data_ptr()
-                end_i = start_i + buffers[i].nelement() * buffers[i].element_size()
-                start_j = buffers[j].data_ptr()
-                end_j = start_j + buffers[j].nelement() * buffers[j].element_size()
-                assert end_i <= start_j or end_j <= start_i, (
-                    f"Buffers {i} and {j} overlap"
-                )
+    def to_tensor(self) -> torch.Tensor:
+        """Return the wrapped local CUDA tensor (test-only)."""
+        return self._tensor
 
-    def test_write_isolation(self) -> None:
-        """Writing to one buffer must not affect another."""
-        ctx = _make_context(num_layers=2, num_heads=2, head_size=16, chunk_size=32)
-        buffers = ctx.get_tmp_chunk_gpu_buffer_batched(4)
-
-        # Write distinct values to each buffer
-        for i, buf in enumerate(buffers):
-            buf.fill_(float(i + 1))
-
-        # Verify each buffer has its own value
-        for i, buf in enumerate(buffers):
-            expected = float(i + 1)
-            assert buf.to(torch.float32).min().item() == pytest.approx(
-                expected, rel=1e-3
-            )
-            assert buf.to(torch.float32).max().item() == pytest.approx(
-                expected, rel=1e-3
-            )
 
-    def test_batch_exceeds_max_raises(self) -> None:
-        ctx = _make_context(chunk_size=256)
-        with pytest.raises(ValueError, match="exceeds max"):
-            ctx.get_tmp_chunk_gpu_buffer_batched(5)
-
-    @pytest.mark.parametrize("batch_size", [1, 2, 3, 4])
-    def test_mla(self, batch_size: int) -> None:
-        ctx = _make_context(is_mla=True, num_heads=1, head_size=576, chunk_size=256)
-        buffers = ctx.get_tmp_chunk_gpu_buffer_batched(batch_size)
-        for buf in buffers:
-            assert buf.is_contiguous()
-            assert buf.shape[0] == 1  # kv_dim=1 for MLA
-
-    def test_consistent_with_single(self) -> None:
-        """get_tmp_chunk_gpu_buffer_batched(1)[0] should have the same data_ptr
-        and shape as get_tmp_chunk_gpu_buffer()."""
-        ctx = _make_context(chunk_size=256)
-        single = ctx.get_tmp_chunk_gpu_buffer()
-        batched = ctx.get_tmp_chunk_gpu_buffer_batched(1)
-        assert len(batched) == 1
-        assert batched[0].data_ptr() == single.data_ptr()
-        assert batched[0].shape == single.shape
+def _make_context(
+    specs: Sequence[_GroupSpec],
+    chunk_size: int = 256,
+    num_blocks: int = 4,
+    layout_hints: LayoutHints | None = None,
+) -> GPUCacheContext:
+    """Build a real ``GPUCacheContext`` via its public constructor."""
+    tensors = _make_kv_tensors(specs, num_blocks=num_blocks)
+    kv_caches = [_FakeIPCWrapper(t) for t in tensors]
+    return GPUCacheContext(
+        kv_caches,  # type: ignore
+        lmcache_logical_chunk_size=chunk_size,
+        layout_hints=layout_hints,
+    )
+
+
+# Common group layouts reused across tests.
+_SINGLE_GROUP = [_GroupSpec(num_layers=4, num_heads=8, head_size=64)]
+_MULTI_GROUP = [
+    _GroupSpec(num_layers=4, num_heads=8, head_size=64, dtype=torch.bfloat16),
+    _GroupSpec(num_layers=2, num_heads=16, head_size=64, dtype=torch.float16),
+]
 
 
 # ---------------------------------------------------------------------------
-# Multi-group tests
+# _TempGPUBuffer tests
 # ---------------------------------------------------------------------------
 
 
-class TestMultiGroup:
-    """Tests for multi-group flat buffer layout."""
-
-    GROUPS_SAME_DTYPE = [
-        {"num_layers": 4, "num_heads": 8, "head_size": 128, "dtype": torch.bfloat16},
-        {"num_layers": 4, "num_heads": 8, "head_size": 128, "dtype": torch.bfloat16},
-    ]
-    GROUPS_DIFF_DTYPE = [
-        {"num_layers": 4, "num_heads": 8, "head_size": 128, "dtype": torch.bfloat16},
-        {"num_layers": 2, "num_heads": 4, "head_size": 64, "dtype": torch.float16},
-    ]
-
-    def test_prefix_sum_length(self) -> None:
-        """tmp_chunk_group_offsets_ should have num_groups+1 entries."""
-        ctx = _make_context_multi_group(self.GROUPS_SAME_DTYPE)
-        num_groups = len(ctx.kv_layer_groups_manager_.kv_layer_groups)
-        assert len(ctx.tmp_chunk_group_offsets_) == num_groups + 1
-
-    def test_prefix_sum_monotone(self) -> None:
-        """Offsets must be strictly increasing."""
-        ctx = _make_context_multi_group(self.GROUPS_DIFF_DTYPE)
-        offsets = ctx.tmp_chunk_group_offsets_
-        for i in range(1, len(offsets)):
-            assert offsets[i] > offsets[i - 1], (
-                f"Offset not increasing at index {i}: {offsets}"
-            )
+class TestTempGPUBufferConstruction:
+    def test_max_batch_size_property(self) -> None:
+        buf = _make_temp_buffer(_SINGLE_GROUP, max_batch_size=3)
+        assert buf.max_batch_size == 3
 
-    def test_flat_buffer_total_size(self) -> None:
-        """tmp_gpu_buffer_ byte count == tmp_chunk_bytes_ * max_batch_size."""
-        ctx = _make_context_multi_group(self.GROUPS_SAME_DTYPE)
-        assert ctx.tmp_gpu_buffer_.numel() == ctx.tmp_chunk_bytes_ * ctx.max_batch_size
-
-    def test_groups_non_overlapping_in_chunk(self) -> None:
-        """Within a single chunk, different groups must occupy disjoint byte ranges."""
-        ctx = _make_context_multi_group(self.GROUPS_DIFF_DTYPE)
-        offsets = ctx.tmp_chunk_group_offsets_
-        num_groups = len(ctx.kv_layer_groups_manager_.kv_layer_groups)
-        for i in range(num_groups):
-            for j in range(i + 1, num_groups):
-                # [offsets[i], offsets[i+1]) vs [offsets[j], offsets[j+1])
-                assert offsets[i + 1] <= offsets[j] or offsets[j + 1] <= offsets[i], (
-                    f"Groups {i} and {j} overlap in chunk layout"
-                )
 
-    def test_get_tmp_chunk_gpu_buffer_shape_per_group(self) -> None:
-        """get_tmp_chunk_gpu_buffer returns the correct shape for each group."""
-        ctx = _make_context_multi_group(self.GROUPS_DIFF_DTYPE, chunk_size=256)
-        num_groups = len(ctx.kv_layer_groups_manager_.kv_layer_groups)
-        for gidx in range(num_groups):
-            buf = ctx.get_tmp_chunk_gpu_buffer(group_idx=gidx)
-            expected = ctx.get_kv_buffer_shape(256, gidx)
-            assert buf.shape == expected, (
-                f"Group {gidx}: expected {expected}, got {buf.shape}"
-            )
+class TestTempGPUBufferKernelGroupBuffer:
+    def test_shape_and_dtype(self) -> None:
+        tensors = _make_kv_tensors(_MULTI_GROUP)
+        manager = _build_manager(tensors)
+        buf = _TempGPUBuffer(manager, 256, _DEVICE)
+        for kg in range(manager.num_kernel_groups):
+            tensor = buf.get_temp_kernel_group_buffer(0, kg)
+            assert tensor.shape == _expected_kernel_group_shape(manager, 256, kg)
+            assert tensor.dtype == manager.kernel_groups[kg].dtype
 
-    def test_get_tmp_chunk_gpu_buffer_dtype_per_group(self) -> None:
-        """get_tmp_chunk_gpu_buffer returns the correct dtype for each group."""
-        ctx = _make_context_multi_group(self.GROUPS_DIFF_DTYPE, chunk_size=256)
-        groups = ctx.kv_layer_groups_manager_.kv_layer_groups
-        for gidx, grp in enumerate(groups):
-            buf = ctx.get_tmp_chunk_gpu_buffer(group_idx=gidx)
-            assert buf.dtype == grp.dtype, (
-                f"Group {gidx}: expected dtype {grp.dtype}, got {buf.dtype}"
+    def test_contiguous(self) -> None:
+        buf = _make_temp_buffer(_SINGLE_GROUP)
+        assert buf.get_temp_kernel_group_buffer(0, 0).is_contiguous()
+
+    def test_repeated_calls_same_ptr(self) -> None:
+        buf = _make_temp_buffer(_SINGLE_GROUP)
+        first = buf.get_temp_kernel_group_buffer(1, 0)
+        second = buf.get_temp_kernel_group_buffer(1, 0)
+        assert first.data_ptr() == second.data_ptr()
+
+    def test_invalid_batch_idx_raises(self) -> None:
+        buf = _make_temp_buffer(_SINGLE_GROUP, max_batch_size=4)
+        with pytest.raises(ValueError, match="Invalid batch_idx"):
+            buf.get_temp_kernel_group_buffer(4, 0)
+
+    def test_invalid_kernel_group_idx_raises(self) -> None:
+        buf = _make_temp_buffer(_SINGLE_GROUP)
+        with pytest.raises(ValueError, match="kernel_group_idx"):
+            buf.get_temp_kernel_group_buffer(0, 99)
+
+    def test_buffers_non_overlapping(self) -> None:
+        """Every (batch, kernel_group) buffer occupies disjoint memory."""
+        tensors = _make_kv_tensors(_MULTI_GROUP)
+        manager = _build_manager(tensors)
+        max_batch_size = 4
+        buf = _TempGPUBuffer(manager, 256, _DEVICE, max_batch_size=max_batch_size)
+        regions: list[tuple[int, int, str]] = []
+        for batch in range(max_batch_size):
+            for kg in range(manager.num_kernel_groups):
+                tensor = buf.get_temp_kernel_group_buffer(batch, kg)
+                start, end = _byte_region(tensor)
+                regions.append((start, end, f"batch={batch},kg={kg}"))
+        _assert_disjoint(regions)
+
+    def test_write_isolation(self) -> None:
+        """Writing to one batch slot must not corrupt another."""
+        buf = _make_temp_buffer(
+            [_GroupSpec(num_layers=2, num_heads=2, head_size=16)],
+            chunk_size=32,
+            max_batch_size=4,
+        )
+        for batch in range(4):
+            buf.get_temp_kernel_group_buffer(batch, 0).fill_(float(batch + 1))
+        for batch in range(4):
+            tensor = buf.get_temp_kernel_group_buffer(batch, 0).to(torch.float32)
+            assert tensor.min().item() == pytest.approx(batch + 1, rel=1e-3)
+            assert tensor.max().item() == pytest.approx(batch + 1, rel=1e-3)
+
+
+class TestTempGPUBufferObjectGroupBuffer:
+    def test_flat_uint8(self) -> None:
+        buf = _make_temp_buffer(_MULTI_GROUP)
+        tensor = buf.get_temp_object_group_buffer(0, 0)
+        assert tensor.dtype == torch.uint8
+        assert tensor.dim() == 1
+        assert tensor.is_contiguous()
+
+    def test_size_covers_all_kernel_groups(self) -> None:
+        """The single object group's flat buffer spans every kernel group's
+        bytes for one chunk."""
+        tensors = _make_kv_tensors(_MULTI_GROUP)
+        manager = _build_manager(tensors)
+        chunk_size = 256
+        buf = _TempGPUBuffer(manager, chunk_size, _DEVICE)
+        obj_group = manager.object_groups[0]
+        expected_bytes = sum(
+            _expected_kernel_group_bytes(manager, chunk_size, kg)
+            for kg in obj_group.kernel_group_indices
+        )
+        assert buf.get_temp_object_group_buffer(0, 0).numel() == expected_bytes
+
+    def test_starts_at_first_kernel_group(self) -> None:
+        """The object-group flat view aliases the same memory as its first
+        kernel group's buffer."""
+        tensors = _make_kv_tensors(_MULTI_GROUP)
+        manager = _build_manager(tensors)
+        buf = _TempGPUBuffer(manager, 256, _DEVICE)
+        first_kg = manager.object_groups[0].kernel_group_indices[0]
+        obj_buf = buf.get_temp_object_group_buffer(0, 0)
+        kg_buf = buf.get_temp_kernel_group_buffer(0, first_kg)
+        assert obj_buf.data_ptr() == kg_buf.data_ptr()
+
+    def test_invalid_indices_raise(self) -> None:
+        buf = _make_temp_buffer(_SINGLE_GROUP, max_batch_size=4)
+        with pytest.raises(ValueError, match="object_group_idx"):
+            buf.get_temp_object_group_buffer(0, 99)
+        with pytest.raises(ValueError, match="batch_idx"):
+            buf.get_temp_object_group_buffer(4, 0)
+
+    def test_contains_kernel_group_data(self) -> None:
+        """Bytes written through kernel-group views are visible through the
+        object-group flat view at matching offsets."""
+        tensors = _make_kv_tensors(_MULTI_GROUP)
+        manager = _build_manager(tensors)
+        chunk_size = 64
+        buf = _TempGPUBuffer(manager, chunk_size, _DEVICE)
+        obj_group = manager.object_groups[0]
+
+        for offset_kg, kg in enumerate(obj_group.kernel_group_indices):
+            buf.get_temp_kernel_group_buffer(0, kg).view(torch.uint8).fill_(
+                offset_kg + 1
             )
 
-    def test_groups_data_ptr_matches_offsets(self) -> None:
-        """data_ptr of each group's buffer should equal base + group offset."""
-        ctx = _make_context_multi_group(self.GROUPS_DIFF_DTYPE, chunk_size=256)
-        base_ptr = ctx.tmp_gpu_buffer_.data_ptr()
-        num_groups = len(ctx.kv_layer_groups_manager_.kv_layer_groups)
-        for gidx in range(num_groups):
-            buf = ctx.get_tmp_chunk_gpu_buffer(group_idx=gidx)
-            expected_ptr = base_ptr + ctx.tmp_chunk_group_offsets_[gidx]
-            assert buf.data_ptr() == expected_ptr, (
-                f"Group {gidx}: expected ptr offset "
-                f"{ctx.tmp_chunk_group_offsets_[gidx]}, "
-                f"got {buf.data_ptr() - base_ptr}"
+        flat = buf.get_temp_object_group_buffer(0, 0)
+        cursor = 0
+        for offset_kg, kg in enumerate(obj_group.kernel_group_indices):
+            size = _expected_kernel_group_bytes(manager, chunk_size, kg)
+            region = flat[cursor : cursor + size]
+            assert region.min().item() == offset_kg + 1
+            assert region.max().item() == offset_kg + 1
+            cursor += size
+
+    def test_object_groups_non_overlapping(self) -> None:
+        """Object-group buffers across batch slots occupy disjoint memory."""
+        tensors = _make_kv_tensors(_MULTI_GROUP)
+        manager = _build_manager(tensors)
+        max_batch_size = 4
+        buf = _TempGPUBuffer(manager, 256, _DEVICE, max_batch_size=max_batch_size)
+        regions: list[tuple[int, int, str]] = []
+        for batch in range(max_batch_size):
+            for og in range(manager.num_object_groups):
+                start, end = _byte_region(buf.get_temp_object_group_buffer(batch, og))
+                regions.append((start, end, f"batch={batch},og={og}"))
+        _assert_disjoint(regions)
+
+
+class TestTempGPUBufferShapeDtype:
+    def test_shape_scales_with_num_tokens(self) -> None:
+        tensors = _make_kv_tensors(_SINGLE_GROUP)
+        manager = _build_manager(tensors)
+        buf = _TempGPUBuffer(manager, 256, _DEVICE)
+        for num_tokens in (16, 128, 256):
+            shape, dtype = buf.get_kernel_group_shape_dtype(num_tokens, 0)
+            assert shape == _expected_kernel_group_shape(manager, num_tokens, 0)
+            assert dtype == manager.kernel_groups[0].dtype
+
+    def test_shape_compressed_group(self) -> None:
+        """For a compressed group, the token dim is divided by compress_ratio."""
+        tensors = _make_kv_tensors([_GroupSpec(num_layers=2, block_size=8)])
+        manager = _build_manager(
+            tensors, layout_hints={"inference_engine_logical_block_size": 16}
+        )
+        assert manager.kernel_groups[0].compress_ratio == 2
+        buf = _TempGPUBuffer(manager, 256, _DEVICE)
+        shape, _ = buf.get_kernel_group_shape_dtype(256, 0)
+        assert shape[2] == 256 // 2
+
+    def test_not_divisible_by_compress_ratio_raises(self) -> None:
+        tensors = _make_kv_tensors([_GroupSpec(num_layers=2, block_size=8)])
+        manager = _build_manager(
+            tensors, layout_hints={"inference_engine_logical_block_size": 16}
+        )
+        buf = _TempGPUBuffer(manager, 256, _DEVICE)
+        with pytest.raises(ValueError, match="not a multiple of"):
+            buf.get_kernel_group_shape_dtype(255, 0)
+
+
+class TestTempGPUBufferCacheSize:
+    def test_cache_size_per_token(self) -> None:
+        tensors = _make_kv_tensors(_MULTI_GROUP)
+        manager = _build_manager(tensors)
+        chunk_size = 256
+        buf = _TempGPUBuffer(manager, chunk_size, _DEVICE)
+        expected = (
+            sum(
+                _expected_kernel_group_bytes(manager, chunk_size, kg)
+                for kg in range(manager.num_kernel_groups)
             )
+            // chunk_size
+        )
+        assert buf.get_cache_size_per_token() == expected
+
+    def test_cache_size_per_token_compressed(self) -> None:
+        """Compression halves per-physical-slot bytes, so the per-logical-token
+        size of a 2x-compressed group is half its uncompressed counterpart."""
+        uncompressed = _make_temp_buffer([_GroupSpec(num_layers=2, block_size=16)])
+        compressed = _make_temp_buffer(
+            [_GroupSpec(num_layers=2, block_size=8)],
+            layout_hints={"inference_engine_logical_block_size": 16},
+        )
+        assert (
+            compressed.get_cache_size_per_token() * 2
+            == uncompressed.get_cache_size_per_token()
+        )
 
-    def test_write_isolation_across_groups(self) -> None:
-        """Writing to one group's buffer must not corrupt another group."""
-        ctx = _make_context_multi_group(self.GROUPS_SAME_DTYPE, chunk_size=64)
-        num_groups = len(ctx.kv_layer_groups_manager_.kv_layer_groups)
-        buffers = [ctx.get_tmp_chunk_gpu_buffer(group_idx=g) for g in range(num_groups)]
-
-        for i, buf in enumerate(buffers):
-            buf.fill_(float(i + 1))
-
-        for i, buf in enumerate(buffers):
-            expected = float(i + 1)
-            assert buf.to(torch.float32).min().item() == pytest.approx(
-                expected, rel=1e-3
-            ), f"Group {i} was corrupted"
-            assert buf.to(torch.float32).max().item() == pytest.approx(
-                expected, rel=1e-3
-            ), f"Group {i} was corrupted"
-
-    @pytest.mark.parametrize("batch_size", [1, 2, 4])
-    def test_batched_non_overlapping_across_groups_and_chunks(
-        self, batch_size: int
-    ) -> None:
-        """All (group, chunk_idx) combinations must occupy disjoint memory."""
-        ctx = _make_context_multi_group(self.GROUPS_DIFF_DTYPE, chunk_size=256)
-        num_groups = len(ctx.kv_layer_groups_manager_.kv_layer_groups)
 
-        # Collect (data_ptr, end_ptr) for every (group, chunk) combination
-        regions: list[tuple[int, int, str]] = []
-        for gidx in range(num_groups):
-            bufs = ctx.get_tmp_chunk_gpu_buffer_batched(batch_size, group_idx=gidx)
-            for cidx, buf in enumerate(bufs):
-                start = buf.data_ptr()
-                end = start + buf.nelement() * buf.element_size()
-                regions.append((start, end, f"group={gidx},chunk={cidx}"))
-
-        for i in range(len(regions)):
-            for j in range(i + 1, len(regions)):
-                s_i, e_i, label_i = regions[i]
-                s_j, e_j, label_j = regions[j]
-                assert e_i <= s_j or e_j <= s_i, (
-                    f"Overlap between {label_i} and {label_j}"
-                )
+# ---------------------------------------------------------------------------
+# GPUCacheContext tests
+# ---------------------------------------------------------------------------
 
-    def test_flat_buffer_covers_all_groups(self) -> None:
-        """get_tmp_gpu_buffer_flat covers the full chunk (all groups)."""
-        ctx = _make_context_multi_group(self.GROUPS_DIFF_DTYPE, chunk_size=256)
-        flat = ctx.get_tmp_gpu_buffer_flat(chunk_idx=0)
-        assert flat.numel() == ctx.tmp_chunk_bytes_
-        assert flat.dtype == torch.uint8
-
-    def test_flat_buffer_chunk_idx_raises(self) -> None:
-        """chunk_idx >= max_batch_size should raise ValueError."""
-        ctx = _make_context_multi_group(self.GROUPS_SAME_DTYPE)
-        with pytest.raises(ValueError, match="exceeds max_batch_size"):
-            ctx.get_tmp_gpu_buffer_flat(chunk_idx=ctx.max_batch_size)
-
-    def test_flat_buffer_contains_group_data(self) -> None:
-        """Data written via get_tmp_chunk_gpu_buffer should be visible in flat view."""
-        ctx = _make_context_multi_group(self.GROUPS_SAME_DTYPE, chunk_size=64)
-        num_groups = len(ctx.kv_layer_groups_manager_.kv_layer_groups)
-
-        # Fill each group with a distinct byte value
-        for gidx in range(num_groups):
-            buf = ctx.get_tmp_chunk_gpu_buffer(group_idx=gidx)
-            # Use view(torch.uint8) to fill raw bytes
-            buf.view(torch.uint8).fill_(gidx + 1)
-
-        flat = ctx.get_tmp_gpu_buffer_flat(chunk_idx=0)
-        for gidx in range(num_groups):
-            g_start = ctx.tmp_chunk_group_offsets_[gidx]
-            g_end = ctx.tmp_chunk_group_offsets_[gidx + 1]
-            region = flat[g_start:g_end]
-            assert region.min().item() == gidx + 1, (
-                f"Group {gidx} flat region has wrong min value"
-            )
-            assert region.max().item() == gidx + 1, (
-                f"Group {gidx} flat region has wrong max value"
+
+class TestGPUCacheContextBuffers:
+    def test_max_batch_size(self) -> None:
+        ctx = _make_context(_SINGLE_GROUP)
+        assert ctx.max_batch_size == 4
+
+    def test_kv_layer_groups_manager(self) -> None:
+        ctx = _make_context(_MULTI_GROUP)
+        manager = ctx.kv_layer_groups_manager
+        assert isinstance(manager, KVLayerGroupsManager)
+        assert manager.num_kernel_groups == 2
+
+    def test_get_temp_kernel_group_buffer(self) -> None:
+        ctx = _make_context(_MULTI_GROUP)
+        manager = ctx.kv_layer_groups_manager
+        for kg in range(manager.num_kernel_groups):
+            tensor = ctx.get_temp_kernel_group_buffer(0, kg)
+            assert tensor.shape == _expected_kernel_group_shape(manager, 256, kg)
+            assert tensor.dtype == manager.kernel_groups[kg].dtype
+
+    def test_get_temp_object_group_buffer(self) -> None:
+        ctx = _make_context(_MULTI_GROUP)
+        tensor = ctx.get_temp_object_group_buffer(0, 0)
+        assert tensor.dtype == torch.uint8
+        assert tensor.dim() == 1
+
+    def test_get_kernel_group_shape_dtype(self) -> None:
+        ctx = _make_context(_SINGLE_GROUP)
+        manager = ctx.kv_layer_groups_manager
+        shape, dtype = ctx.get_kernel_group_shape_dtype(128, 0)
+        assert shape == _expected_kernel_group_shape(manager, 128, 0)
+        assert dtype == manager.kernel_groups[0].dtype
+
+
+class TestGPUCacheContextPointers:
+    def test_get_kernel_group_kv_pointers(self) -> None:
+        ctx = _make_context(_MULTI_GROUP)
+        manager = ctx.kv_layer_groups_manager
+        for kg in range(manager.num_kernel_groups):
+            pointers = ctx.get_kernel_group_kv_pointers(kg)
+            assert pointers.dtype == torch.long
+            # One pointer per layer in the group.
+            assert pointers.numel() == manager.kernel_groups[kg].num_layers
+
+
+class TestGPUCacheContextBlocks:
+    def test_calculate_num_blocks_uncompressed(self) -> None:
+        # block_size=16, compress_ratio=1 -> 256 tokens span 16 blocks.
+        ctx = _make_context([_GroupSpec(num_layers=2, block_size=16)])
+        assert ctx.calculate_num_blocks(256, 0) == 16
+
+    def test_calculate_num_blocks_matches_manager(self) -> None:
+        ctx = _make_context(_MULTI_GROUP)
+        manager = ctx.kv_layer_groups_manager
+        for kg in range(manager.num_kernel_groups):
+            assert ctx.calculate_num_blocks(256, kg) == manager.calculate_num_blocks(
+                kg, 256
             )
+
+
+class TestGPUCacheContextReportStatus:
+    def test_report_status_fields(self) -> None:
+        ctx = _make_context(_SINGLE_GROUP)
+        status = ctx.report_status()
+
+        expected_keys = {
+            "num_layers",
+            "inference_engine_logical_block_size",
+            "group_physical_block_sizes",
+            "group_compress_ratios",
+            "hidden_dim_sizes",
+            "dtype",
+            "is_mla",
+            "num_blocks",
+            "gpu_kv_format",
+            "gpu_kv_shape",
+            "gpu_kv_concrete_shape",
+            "attention_backend",
+            "cache_size_per_token",
+        }
+        assert set(status.keys()) == expected_keys
+
+        assert status["num_layers"] == 4
+        assert status["is_mla"] is False
+        assert status["group_compress_ratios"] == [1]
+        assert status["gpu_kv_format"] == "NL_X_TWO_NB_BS_NH_HS"
+        assert status["dtype"] == str(ctx.dtype)
+        assert status["cache_size_per_token"] == ctx.cache_size_per_token()
+
+    def test_report_status_multi_group(self) -> None:
+        ctx = _make_context(_MULTI_GROUP)
+        manager = ctx.kv_layer_groups_manager
+        status = ctx.report_status()
+        assert status["num_layers"] == 6
+        assert len(status["group_physical_block_sizes"]) == manager.num_kernel_groups
+        assert len(status["group_compress_ratios"]) == manager.num_kernel_groups
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/tests/v1/multiprocess/test_gpu_transfer_layout_registry.py b/tests/v1/multiprocess/test_gpu_transfer_layout_registry.py
index f2853f6260..8ab2470bef 100644
--- a/tests/v1/multiprocess/test_gpu_transfer_layout_registry.py
+++ b/tests/v1/multiprocess/test_gpu_transfer_layout_registry.py
@@ -81,6 +81,7 @@ def fake_create_cache_context(
     def fake_layout_desc(
         gpu_context: _FakeGPUContext,
         num_tokens: int,
+        object_group_id: int = 0,
     ) -> MemoryLayoutDesc:
         """Return the shared layout descriptor used by both registrations."""
         return layout_desc
diff --git a/tests/v1/test_kv_layer_groups_manager.py b/tests/v1/test_kv_layer_groups_manager.py
index 8dd259a562..e8cc99bf0d 100644
--- a/tests/v1/test_kv_layer_groups_manager.py
+++ b/tests/v1/test_kv_layer_groups_manager.py
@@ -9,8 +9,13 @@
 # First Party
 from lmcache.v1.gpu_connector.utils import LayoutHints
 from lmcache.v1.kv_layer_groups import (
+    EXCLUDED_ENGINE_GROUP,
+    KernelGroupIdentity,
+    KernelGroupInfo,
     KVLayerGroupInfo,
     KVLayerGroupsManager,
+    LayerGroupIdentity,
+    ObjectGroupInfo,
     format_kvcache_shape_spec,
     parse_kvcache_shape_spec,
 )
@@ -52,14 +57,14 @@ class TestKVLayerGroupsManager:
 
     def test_build_empty(self):
         manager = _build_manager([], num_blocks=32)
-        assert manager.kv_layer_groups == []
+        assert manager.kernel_groups == []
 
     def test_build_single_layer(self):
         tensors = [torch.randn(2, 32, 256, 8, 64, dtype=torch.float16)]
         manager = _build_manager(tensors, num_blocks=32)
 
-        assert len(manager.kv_layer_groups) == 1
-        group = manager.kv_layer_groups[0]
+        assert len(manager.kernel_groups) == 1
+        group = manager.kernel_groups[0]
         assert isinstance(group, KVLayerGroupInfo)
         assert group.layer_indices == [0]
         assert group.shape_desc.kv_size == 2
@@ -76,8 +81,8 @@ def test_build_multiple_layers_same_shape(self):
         ]
         manager = _build_manager(tensors, num_blocks=32)
 
-        assert len(manager.kv_layer_groups) == 1
-        group = manager.kv_layer_groups[0]
+        assert len(manager.kernel_groups) == 1
+        group = manager.kernel_groups[0]
         assert group.layer_indices == [0, 1, 2]
         assert group.shape_desc.nl == 3
         assert group.shape_desc.nh == 8
@@ -96,9 +101,9 @@ def test_build_splits_same_shape_by_engine_group_idx(self):
             ],
         )
 
-        assert len(manager.kv_layer_groups) == 2
+        assert len(manager.kernel_groups) == 2
         groups_by_engine_group_idx = {
-            group.engine_group_idx: group for group in manager.kv_layer_groups
+            group.engine_group_idx: group for group in manager.kernel_groups
         }
         assert groups_by_engine_group_idx[0].layer_indices == [0, 2]
         assert groups_by_engine_group_idx[1].layer_indices == [1, 3]
@@ -121,8 +126,8 @@ def test_build_different_shapes(self):
             torch.randn(2, 32, 256, 8, 64, dtype=torch.float16),
         ]
         manager = _build_manager(tensors, num_blocks=32)
-        assert len(manager.kv_layer_groups) == 2
-        group1, group2 = manager.kv_layer_groups
+        assert len(manager.kernel_groups) == 2
+        group1, group2 = manager.kernel_groups
         assert group1.layer_indices == [0, 2]
         assert group1.shape_desc.nh == 8
         assert group2.layer_indices == [1]
@@ -135,8 +140,8 @@ def test_build_different_dtypes(self):
             torch.randn(2, 32, 256, 8, 64, dtype=torch.float16),
         ]
         manager = _build_manager(tensors, num_blocks=32)
-        assert len(manager.kv_layer_groups) == 2
-        group1, group2 = manager.kv_layer_groups
+        assert len(manager.kernel_groups) == 2
+        group1, group2 = manager.kernel_groups
         assert group1.layer_indices == [0, 2]
         assert group1.dtype == torch.float16
         assert group2.layer_indices == [1]
@@ -151,9 +156,9 @@ def test_build_mixed_differences(self):
             torch.randn(2, 32, 256, 16, 64, dtype=torch.float32),  # nh=16, f32
         ]
         manager = _build_manager(tensors, num_blocks=32)
-        assert len(manager.kv_layer_groups) == 4
+        assert len(manager.kernel_groups) == 4
 
-        groups_by_key = {(g.shape_desc.nh, g.dtype): g for g in manager.kv_layer_groups}
+        groups_by_key = {(g.shape_desc.nh, g.dtype): g for g in manager.kernel_groups}
         assert groups_by_key[(8, torch.float16)].layer_indices == [0, 3]
         assert groups_by_key[(8, torch.float32)].layer_indices == [1]
         assert groups_by_key[(16, torch.float16)].layer_indices == [2]
@@ -306,5 +311,106 @@ def test_not_divisible_raises(self):
             self._derive(bs=6, logical=16)
 
 
+class TestKernelGroupIdentity:
+    """The grouping key is a named tuple; ``LayerGroupIdentity`` is its alias."""
+
+    def test_fields_and_alias(self):
+        ident = KernelGroupIdentity(
+            kv_size=2,
+            num_heads=8,
+            head_size=64,
+            block_size=16,
+            engine_group_idx=0,
+            dtype=torch.float16,
+        )
+        assert ident.kv_size == 2
+        assert ident.num_heads == 8
+        assert ident.head_size == 64
+        assert ident.block_size == 16
+        assert ident.engine_group_idx == 0
+        assert ident.dtype == torch.float16
+        assert LayerGroupIdentity is KernelGroupIdentity
+
+    def test_hashable_as_dict_key(self):
+        ident = KernelGroupIdentity(2, 8, 64, 16, 0, torch.float16)
+        assert {ident: "x"}[ident] == "x"
+
+    def test_excluded_engine_group_sentinel(self):
+        assert EXCLUDED_ENGINE_GROUP == -1
+
+
+class TestKernelAndObjectGroups:
+    """Kernel-group accessors, deprecated aliases, and the (currently single)
+    object-group layout."""
+
+    def test_kernel_groups_match_deprecated_alias(self):
+        tensors = [
+            torch.randn(2, 32, 256, 8, 64, dtype=torch.float16) for _ in range(3)
+        ]
+        manager = _build_manager(tensors, num_blocks=32)
+        # The deprecated alias must still return the live list, not a bound
+        # method (regression guard for the @property/@deprecate ordering).
+        assert isinstance(manager.kv_layer_groups, list)
+        assert manager.kernel_groups is manager.kv_layer_groups
+        assert manager.num_kernel_groups == manager.num_groups
+        assert manager.num_kernel_groups == len(manager.kernel_groups)
+        assert all(isinstance(g, KernelGroupInfo) for g in manager.kernel_groups)
+
+    def test_single_object_group_covers_all_kernel_groups(self):
+        # Two distinct kernel groups (different num_heads) still share one
+        # object group under the current single-object-group assumption.
+        tensors = [
+            torch.randn(2, 32, 256, 8, 64, dtype=torch.float16),
+            torch.randn(2, 32, 256, 16, 64, dtype=torch.float16),
+        ]
+        manager = _build_manager(tensors, num_blocks=32)
+        assert manager.num_kernel_groups == 2
+        assert manager.num_object_groups == 1
+        obj = manager.object_groups[0]
+        assert isinstance(obj, ObjectGroupInfo)
+        assert obj.kernel_group_indices == list(range(manager.num_kernel_groups))
+
+    def test_empty_manager_has_no_groups(self):
+        # Empty registration returns early in __init__; both group lists must
+        # still be initialized (regression guard for missing _object_groups).
+        manager = _build_manager([], num_blocks=32)
+        assert manager.kernel_groups == []
+        assert manager.num_kernel_groups == 0
+        assert manager.object_groups == []
+        assert manager.num_object_groups == 0
+
+    def test_excluded_layer_left_out_of_all_groups(self):
+        # Layer 2 is referenced by no group view, so it is excluded entirely.
+        tensors = [
+            torch.randn(2, 32, 256, 8, 64, dtype=torch.float16) for _ in range(3)
+        ]
+        manager = _build_manager(
+            tensors,
+            num_blocks=32,
+            group_views=[LMCacheGroupView(0, (0, 1))],
+        )
+        grouped = sorted(
+            idx for group in manager.kernel_groups for idx in group.layer_indices
+        )
+        assert grouped == [0, 1]
+
+    def test_calculate_num_blocks_uncompressed(self):
+        # bs=16, compress_ratio=1 -> 256 tokens span 16 blocks.
+        tensors = [torch.randn(2, 32, 16, 8, 64, dtype=torch.float16) for _ in range(2)]
+        manager = _build_manager(tensors, num_blocks=32)
+        assert manager.calculate_num_blocks(0, 256) == 16
+
+    def test_calculate_num_blocks_compressed(self):
+        # bs=8, ie_logical_block_size=16 -> compress_ratio=2;
+        # 256 logical tokens -> 128 physical slots -> 128 // 8 = 16 blocks.
+        tensors = [torch.randn(2, 32, 8, 8, 64, dtype=torch.float16) for _ in range(2)]
+        manager = _build_manager(
+            tensors,
+            num_blocks=32,
+            layout_hints={"inference_engine_logical_block_size": 16},
+        )
+        assert manager.calculate_num_blocks(0, 256) == 16
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])

From 874f81b83fc88dd4a6500b4059a3dfc5e5988758 Mon Sep 17 00:00:00 2001
From: Samuel Shen <slshen@tensormesh.ai>
Date: Mon, 8 Jun 2026 17:05:00 -0700
Subject: [PATCH 06/57] [GPUKVFormat]: support vLLM CPU 2-fused KV layout
 (#3567)

Signed-off-by: Samuel Shen <slshen@tensormesh.ai>
---
 .../scripts/run-cpu-e2e-validation.sh         |   5 +-
 csrc/mem_kernels.cuh                          |  10 ++
 csrc/pybind.cpp                               |   1 +
 csrc/sycl/pybind_sycl.cpp                     |   1 +
 lmcache/python_ops_fallback.py                |   6 ++
 lmcache/v1/gpu_connector/utils.py             |  58 +++++++++-
 .../v1/multiprocess/transfer_context/base.py  |  10 ++
 .../test_blocks_first_fused_kv_format.py      | 100 ++++++++++++++++++
 tests/v1/utils.py                             |   4 +
 9 files changed, 191 insertions(+), 4 deletions(-)
 create mode 100644 tests/v1/gpu_connector/test_blocks_first_fused_kv_format.py

diff --git a/.buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh b/.buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh
index d67f06f17b..6c3448199c 100755
--- a/.buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh
+++ b/.buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh
@@ -230,7 +230,10 @@ uv pip install -r requirements/common.txt
 echo "✅ Installed requirements/common.txt"
 
 echo "Installing vLLM CPU build"
-uv pip install vllm --extra-index-url https://wheels.vllm.ai/71df063c494c111ab60f6a33c54aafe7b9ae1d02/cpu --index-strategy first-index --torch-backend cpu
+# Un-pinned from 71df063c (LMCache #3538) now that LMCache handles the
+# blocks-first fused KV layout. Running against nightly means a passing CPU
+# e2e proves the new GPUKVFormat path works.
+uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly/cpu --index-strategy first-index --torch-backend cpu
 echo "✅ vLLM CPU install completed"
 
 echo "Installing LMCache in editable mode with NO_GPU_EXT=1"
diff --git a/csrc/mem_kernels.cuh b/csrc/mem_kernels.cuh
index ac2e22adaa..8c00fad5f0 100644
--- a/csrc/mem_kernels.cuh
+++ b/csrc/mem_kernels.cuh
@@ -99,6 +99,16 @@ enum class GPUKVFormat : int {
   - SGLang MHA via the MP daemon path
   physical shape per layer: [num_blocks, block_size, num_heads, head_size]
   */
+
+  NL_X_NB_NH_BS_TWO_HS = 10,
+  /*
+  used by:
+  - vLLM non-MLA blocks-first attention with K/V fused into the trailing dim
+  physical shape per layer: [num_blocks, num_heads, block_size, 2, head_size]
+  (recovered by splitting the fused trailing [block_size, 2 * head_size]).
+  Currently only reached via the host gather/scatter path, not the CUDA
+  transfer kernels.
+  */
 };
 
 void multi_layer_kv_transfer(
diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp
index e2e6eae68b..adc7ed23ab 100644
--- a/csrc/pybind.cpp
+++ b/csrc/pybind.cpp
@@ -32,6 +32,7 @@ PYBIND11_MODULE(c_ops, m) {
       .value("NL_X_NB_TWO_NH_BS_HS", GPUKVFormat::NL_X_NB_TWO_NH_BS_HS)
       .value("NB_NL_TWO_NH_BS_HS", GPUKVFormat::NB_NL_TWO_NH_BS_HS)
       .value("TWO_X_NL_X_NB_BS_NH_HS", GPUKVFormat::TWO_X_NL_X_NB_BS_NH_HS)
+      .value("NL_X_NB_NH_BS_TWO_HS", GPUKVFormat::NL_X_NB_NH_BS_TWO_HS)
       .export_values();
   m.def("multi_layer_kv_transfer", &multi_layer_kv_transfer,
         py::arg("key_value"), py::arg("key_value_ptrs"),
diff --git a/csrc/sycl/pybind_sycl.cpp b/csrc/sycl/pybind_sycl.cpp
index 6ed9d58f3a..430e08dc2d 100644
--- a/csrc/sycl/pybind_sycl.cpp
+++ b/csrc/sycl/pybind_sycl.cpp
@@ -27,6 +27,7 @@ PYBIND11_MODULE(xpu_ops, m) {
       .value("NL_X_NB_TWO_NH_BS_HS", GPUKVFormat::NL_X_NB_TWO_NH_BS_HS)
       .value("NB_NL_TWO_NH_BS_HS", GPUKVFormat::NB_NL_TWO_NH_BS_HS)
       .value("TWO_X_NL_X_NB_BS_NH_HS", GPUKVFormat::TWO_X_NL_X_NB_BS_NH_HS)
+      .value("NL_X_NB_NH_BS_TWO_HS", GPUKVFormat::NL_X_NB_NH_BS_TWO_HS)
       .export_values();
   m.def("multi_layer_kv_transfer", &multi_layer_kv_transfer,
         py::arg("key_value"), py::arg("key_value_ptrs"),
diff --git a/lmcache/python_ops_fallback.py b/lmcache/python_ops_fallback.py
index f1b8e15593..a95bf24493 100644
--- a/lmcache/python_ops_fallback.py
+++ b/lmcache/python_ops_fallback.py
@@ -290,6 +290,12 @@ class GPUKVFormat(IntEnum):
     # used by: SGLang MHA via the MP daemon path
     TWO_X_NL_X_NB_BS_NH_HS = 9
 
+    # used by: vLLM non-MLA blocks-first attention with K/V fused into the
+    # trailing dim. Per-layer physical shape
+    # [num_blocks, num_heads, block_size, 2, head_size] -- the K/V "2" axis is
+    # second-to-last, recovered by splitting the fused [..., 2 * head_size].
+    NL_X_NB_NH_BS_TWO_HS = 10
+
 
 class PageBufferShapeDesc:
     """Python stand-in for the C++ ``PageBufferShapeDesc`` struct.
diff --git a/lmcache/v1/gpu_connector/utils.py b/lmcache/v1/gpu_connector/utils.py
index beff9f0f56..8fdf21387d 100644
--- a/lmcache/v1/gpu_connector/utils.py
+++ b/lmcache/v1/gpu_connector/utils.py
@@ -315,6 +315,7 @@ def get_gpu_kv_shape_description(gpu_kv_format: "lmc_ops.GPUKVFormat") -> str:
         lmc_ops.GPUKVFormat.NL_X_TWO_NB_NH_BS_HS: "NL x [2, NB, NH, BS, HS]",
         lmc_ops.GPUKVFormat.NL_X_NB_TWO_NH_BS_HS: "NL x [NB, 2, NH, BS, HS]",
         lmc_ops.GPUKVFormat.NB_NL_TWO_NH_BS_HS: "[NB, NL, 2, NH, BS, HS]",
+        lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS: "NL x [NB, NH, BS, 2, HS]",
     }
     return _SHAPE_DESCRIPTIONS.get(gpu_kv_format, f"Unknown ({gpu_kv_format})")
 
@@ -340,6 +341,9 @@ def get_attention_backend(gpu_kv_format: "lmc_ops.GPUKVFormat") -> str:
             "vLLM non-MLA flash infer (HND layout)"
         ),
         lmc_ops.GPUKVFormat.NB_NL_TWO_NH_BS_HS: "TRT-LLM cross-layer (HND layout)",
+        lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS: (
+            "vLLM non-MLA blocks-first, fused K/V"
+        ),
     }
     return _ATTENTION_BACKENDS.get(gpu_kv_format, f"Unknown ({gpu_kv_format})")
 
@@ -414,6 +418,12 @@ def get_concrete_gpu_kv_shape(
         bs = get_block_size(kv_caches, fmt)
         return f"[{nb}, {nl}, 2, {nh}, {bs}, {hs}]"
 
+    if fmt == F.NL_X_NB_NH_BS_TWO_HS:
+        nb = get_num_blocks(kv_caches, fmt)
+        nh = get_num_heads(kv_caches, fmt)
+        bs = get_block_size(kv_caches, fmt)
+        return f"{nl} x [{nb}, {nh}, {bs}, 2, {hs}]"
+
     return f"Unknown ({gpu_kv_format})"
 
 
@@ -615,6 +625,22 @@ def normalize_kv_and_discover_format(
                         detected_format = lmc_ops.GPUKVFormat.NL_X_NB_TWO_NH_BS_HS
                     else:
                         detected_format = lmc_ops.GPUKVFormat.NL_X_NB_TWO_BS_NH_HS
+            elif tensor_dim == 4:
+                # vLLM non-MLA blocks-first attention: K/V fused into the
+                # trailing dim -> [NB, NH, BS, 2*head_size].
+                # Split the fused axis so downstream sees the canonical 5D
+                # [NB, NH, BS, 2, HS].
+                last_dim = probe.shape[3]
+                if last_dim % 2 != 0:
+                    raise ValueError(
+                        "blocks-first fused KV cache trailing dim "
+                        f"{last_dim} is not 2 * head_size"
+                    )
+                kv_caches = [
+                    layer.reshape(*layer.shape[:3], 2, last_dim // 2)
+                    for layer in kv_caches
+                ]
+                detected_format = lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS
             elif tensor_dim == 3:
                 # vllm MLA
                 detected_format = lmc_ops.GPUKVFormat.NL_X_NB_BS_HS
@@ -659,6 +685,7 @@ def get_num_layers(
         lmc_ops.GPUKVFormat.NL_X_NB_BS_HS,
         lmc_ops.GPUKVFormat.NL_X_TWO_NB_NH_BS_HS,
         lmc_ops.GPUKVFormat.NL_X_NB_TWO_NH_BS_HS,
+        lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS,
     ):
         return len(kv_caches)
     elif gpu_kv_format in (
@@ -692,8 +719,9 @@ def get_num_blocks(
     elif gpu_kv_format in (
         lmc_ops.GPUKVFormat.NL_X_NB_TWO_BS_NH_HS,
         lmc_ops.GPUKVFormat.NL_X_NB_TWO_NH_BS_HS,
+        lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS,
     ):
-        # [num_blocks, 2, ...] — shape[0] is num_blocks
+        # [num_blocks, ...] — shape[0] is num_blocks
         return kv_caches[0].shape[0]
     elif gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_NB_BS_HS:
         return kv_caches[0].shape[0]
@@ -731,8 +759,10 @@ def get_block_size(
     elif gpu_kv_format in (
         lmc_ops.GPUKVFormat.NL_X_TWO_NB_BS_NH_HS,
         lmc_ops.GPUKVFormat.NL_X_NB_TWO_BS_NH_HS,
+        lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS,
     ):
-        # NHD: [..., BS, NH, HS] — block_size at shape[2]
+        # block_size at shape[2]: NHD [..., BS, NH, HS] and the CPU fused
+        # layout [NB, NH, BS, 2, HS] both carry block_size at shape[2].
         return kv_caches[layer_idx].shape[2]
     elif gpu_kv_format in (
         lmc_ops.GPUKVFormat.NL_X_TWO_NB_NH_BS_HS,
@@ -780,6 +810,10 @@ def get_page_buffer_size(
         # list[num_layers] of [num_blocks, 2, num_heads, block_size, head_size]
         # num_blocks=shape[0], block_size=shape[3]
         return kv_caches[0].shape[0] * kv_caches[0].shape[3]
+    elif gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS:
+        # list[num_layers] of [num_blocks, num_heads, block_size, 2, head_size]
+        # num_blocks=shape[0], block_size=shape[2]
+        return kv_caches[0].shape[0] * kv_caches[0].shape[2]
     elif gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_NB_BS_HS:
         # list[num_layers] of [num_blocks, block_size, head_size]
         return kv_caches[0].shape[0] * kv_caches[0].shape[1]
@@ -821,6 +855,9 @@ def get_num_heads(
     ):
         # HND: [..., NH, BS, HS] — num_heads at shape[2]
         return kv_caches[layer_idx].shape[2]
+    elif gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS:
+        # CPU fused: [NB, NH, BS, 2, HS] — num_heads at shape[1]
+        return kv_caches[layer_idx].shape[1]
     elif gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_NB_BS_HS:
         # MLA: heads are absorbed into hidden dim, so num_heads = 1
         return 1
@@ -861,6 +898,9 @@ def get_hidden_dim_size(
     ):
         # HND: [..., NH, BS, HS] — hidden_dim = NH * HS = shape[2] * shape[4]
         return kv_caches[layer_idx].shape[2] * kv_caches[layer_idx].shape[4]
+    elif gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS:
+        # CPU fused: [NB, NH, BS, 2, HS] — hidden_dim = NH * HS = shape[1] * shape[4]
+        return kv_caches[layer_idx].shape[1] * kv_caches[layer_idx].shape[4]
     elif gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_NB_BS_HS:
         return kv_caches[layer_idx].shape[2]
     elif gpu_kv_format == lmc_ops.GPUKVFormat.TWO_X_NL_X_NBBS_NH_HS:
@@ -895,8 +935,9 @@ def get_head_size(
         lmc_ops.GPUKVFormat.NL_X_NB_TWO_BS_NH_HS,
         lmc_ops.GPUKVFormat.NL_X_TWO_NB_NH_BS_HS,
         lmc_ops.GPUKVFormat.NL_X_NB_TWO_NH_BS_HS,
+        lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS,
     ):
-        # Both NHD [..., NH, HS] and HND [..., BS, HS] have head_size last
+        # All these per-layer non-MLA layouts carry head_size as the last dim
         return kv_caches[layer_idx].shape[4]
     elif gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_NB_BS_HS:
         return kv_caches[layer_idx].shape[2]
@@ -943,6 +984,10 @@ def get_tokens_per_layer(
         # k_cache = kv_caches[0][:, 0] → (NB, NH, BS, HS); tokens = NB * BS
         k_cache_shape = kv_caches[0][:, 0].shape
         return k_cache_shape[0] * k_cache_shape[2]
+    elif gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS:
+        # list[num_layers] of [num_blocks, num_heads, block_size, 2, head_size]
+        # tokens = NB * BS = shape[0] * shape[2]
+        return kv_caches[0].shape[0] * kv_caches[0].shape[2]
     elif gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_NB_BS_HS:
         # list[num_layers] of [num_blocks, block_size, head_size]
         return kv_caches[0].shape[0] * kv_caches[0].shape[1]
@@ -995,6 +1040,10 @@ def get_elements_per_layer(
         # [num_blocks, 2, ...] — k_cache is kv_caches[0][:, 0]
         k_cache_shape = kv_caches[0][:, 0].shape
         return k_cache_shape.numel() * 2
+    elif gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS:
+        # [NB, NH, BS, 2, HS] — K/V at dim 3; k_cache is kv_caches[0][:, :, :, 0]
+        k_cache_shape = kv_caches[0][:, :, :, 0].shape
+        return k_cache_shape.numel() * 2
     elif gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_NB_BS_HS:
         # list[num_layers] of [num_blocks, block_size, head_size] (MLA)
         return kv_caches[0].numel()
@@ -1022,6 +1071,7 @@ def assert_is_vllm_flash_attn_or_flash_infer(gpu_kv_format: "lmc_ops.GPUKVFormat
         lmc_ops.GPUKVFormat.NL_X_NB_TWO_BS_NH_HS,
         lmc_ops.GPUKVFormat.NL_X_TWO_NB_NH_BS_HS,
         lmc_ops.GPUKVFormat.NL_X_NB_TWO_NH_BS_HS,
+        lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS,
     )
 
 
@@ -1033,6 +1083,7 @@ def is_hnd(gpu_kv_format: "lmc_ops.GPUKVFormat") -> bool:
         lmc_ops.GPUKVFormat.NL_X_TWO_NB_NH_BS_HS,
         lmc_ops.GPUKVFormat.NL_X_NB_TWO_NH_BS_HS,
         lmc_ops.GPUKVFormat.NB_NL_TWO_NH_BS_HS,
+        lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS,
     )
 
 
@@ -1092,6 +1143,7 @@ def get_dtype(
         lmc_ops.GPUKVFormat.NL_X_TWO_NB_NH_BS_HS,
         lmc_ops.GPUKVFormat.NL_X_NB_TWO_NH_BS_HS,
         lmc_ops.GPUKVFormat.NL_X_NBBS_ONE_HS,
+        lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS,
     ):
         return kv_caches[layer_idx].dtype
     elif gpu_kv_format in (
diff --git a/lmcache/v1/multiprocess/transfer_context/base.py b/lmcache/v1/multiprocess/transfer_context/base.py
index 8857ed13d8..ef43dd6121 100644
--- a/lmcache/v1/multiprocess/transfer_context/base.py
+++ b/lmcache/v1/multiprocess/transfer_context/base.py
@@ -282,6 +282,7 @@ def gather_paged_kv_to_cpu(
     is_hnd = gpu_kv_format in (
         lmc_ops.GPUKVFormat.NL_X_TWO_NB_NH_BS_HS,
         lmc_ops.GPUKVFormat.NL_X_NB_TWO_NH_BS_HS,
+        lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS,
     )
 
     block_size = get_block_size(normalized, gpu_kv_format)
@@ -326,6 +327,10 @@ def gather_paged_kv_to_cpu(
                     if gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_TWO_NB_NH_BS_HS:
                         k_t = layer[0]
                         v_t = layer[1]
+                    elif gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS:
+                        # [NB, NH, BS, 2, HS] — K/V fused at dim 3
+                        k_t = layer[:, :, :, 0]
+                        v_t = layer[:, :, :, 1]
                     else:
                         k_t = layer[:, 0]
                         v_t = layer[:, 1]
@@ -419,6 +424,7 @@ def scatter_cpu_to_paged_kv(
     is_hnd = gpu_kv_format in (
         lmc_ops.GPUKVFormat.NL_X_TWO_NB_NH_BS_HS,
         lmc_ops.GPUKVFormat.NL_X_NB_TWO_NH_BS_HS,
+        lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS,
     )
 
     # After normalization the structure is always a list of per-layer
@@ -462,6 +468,10 @@ def scatter_cpu_to_paged_kv(
                 if gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_TWO_NB_NH_BS_HS:
                     k_t = layer[0]
                     v_t = layer[1]
+                elif gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS:
+                    # [NB, NH, BS, 2, HS] — K/V fused at dim 3
+                    k_t = layer[:, :, :, 0]
+                    v_t = layer[:, :, :, 1]
                 else:
                     k_t = layer[:, 0]
                     v_t = layer[:, 1]
diff --git a/tests/v1/gpu_connector/test_blocks_first_fused_kv_format.py b/tests/v1/gpu_connector/test_blocks_first_fused_kv_format.py
new file mode 100644
index 0000000000..72ce1ecd3e
--- /dev/null
+++ b/tests/v1/gpu_connector/test_blocks_first_fused_kv_format.py
@@ -0,0 +1,100 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Blocks-first, fused-K/V KV layout (GPUKVFormat.NL_X_NB_NH_BS_TWO_HS).
+
+A non-MLA blocks-first attention backend registers its KV cache as the 4D
+``[NB, NH, BS, 2 * HS]`` with K/V fused into the trailing dim (as opposed to
+the 5D K/V-major ``[2, NB, NH, BS, HS]``). Discovery splits the fused axis into
+the canonical 5D ``[NB, NH, BS, 2, HS]`` and classifies it as
+``NL_X_NB_NH_BS_TWO_HS``.
+
+These tests pin discovery, the format-aware accessors, and the multiprocess
+gather/scatter round-trip for that layout.
+"""
+
+# Third Party
+import pytest
+import torch
+
+# First Party
+from lmcache.utils import EngineType
+from lmcache.v1.gpu_connector import utils as U
+from lmcache.v1.multiprocess.transfer_context.base import (
+    gather_paged_kv_to_cpu,
+    scatter_cpu_to_paged_kv,
+)
+import lmcache.c_ops as lmc_ops
+
+NB, NH, BS, HS, NL = 16, 4, 128, 64, 3
+HINTS = {"kv_layout": "HND"}
+
+
+def _raw_blocks_first_caches() -> list[torch.Tensor]:
+    """Per-layer blocks-first tensors as registered: [NB, NH, BS, 2 * HS]."""
+    torch.manual_seed(0)
+    return [torch.randn(NB, NH, BS, 2 * HS) for _ in range(NL)]
+
+
+def test_discovery_splits_fused_axis():
+    fmt, norm = U.normalize_kv_and_discover_format(
+        _raw_blocks_first_caches(), EngineType.VLLM, HINTS
+    )
+    assert fmt == lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS
+    # 4D [NB, NH, BS, 2*HS] -> canonical 5D [NB, NH, BS, 2, HS]
+    assert tuple(norm[0].shape) == (NB, NH, BS, 2, HS)
+
+
+def test_discovery_rejects_odd_trailing_dim():
+    bad = [torch.randn(NB, NH, BS, 2 * HS + 1) for _ in range(NL)]
+    with pytest.raises(ValueError):
+        U.normalize_kv_and_discover_format(bad, EngineType.VLLM, HINTS)
+
+
+def test_accessors():
+    fmt, norm = U.normalize_kv_and_discover_format(
+        _raw_blocks_first_caches(), EngineType.VLLM, HINTS
+    )
+    assert U.get_num_layers(norm, fmt) == NL
+    assert U.get_num_blocks(norm, fmt) == NB
+    assert U.get_block_size(norm, fmt) == BS
+    assert U.get_num_heads(norm, fmt) == NH
+    assert U.get_head_size(norm, fmt) == HS
+    assert U.get_hidden_dim_size(norm, fmt) == NH * HS
+    assert U.get_page_buffer_size(norm, fmt) == NB * BS
+    assert U.get_tokens_per_layer(norm, fmt) == NB * BS
+    assert U.get_elements_per_layer(norm, fmt) == NB * NH * BS * HS * 2
+    # get_dtype is on the register_kv_caches -> group_layers_by_identity path,
+    # so it must recognize this format too.
+    assert U.get_dtype(norm, fmt) == _raw_blocks_first_caches()[0].dtype
+    assert U.is_hnd(fmt) is True
+    assert not U.is_mla(fmt)
+
+
+def test_mp_gather_scatter_roundtrip():
+    blocks_per_chunk = 2
+    block_ids = [0, 3, 5, 6]  # 2 chunks
+    raw = _raw_blocks_first_caches()
+    src = {f"layer_{i}": t for i, t in enumerate(raw)}
+    ref = {k: v.clone() for k, v in src.items()}
+    idx = torch.tensor(block_ids)
+
+    chunks = gather_paged_kv_to_cpu(
+        src, block_ids, blocks_per_chunk, layout_hints=HINTS
+    )
+    # [K/V, NL, chunk_tokens, NH*HS]
+    assert tuple(chunks[0].shape) == (2, NL, blocks_per_chunk * BS, NH * HS)
+
+    # Wipe the gathered blocks, scatter back, and confirm exact recovery.
+    dst = {k: v.clone() for k, v in src.items()}
+    for k in dst:
+        dst[k][idx] = 0.0
+    scatter_cpu_to_paged_kv(
+        dst, block_ids, chunks, blocks_per_chunk, layout_hints=HINTS
+    )
+
+    for k in dst:
+        assert torch.equal(dst[k][idx], ref[k][idx])
+
+    # Untouched blocks must be left alone.
+    untouched = torch.tensor([b for b in range(NB) if b not in block_ids])
+    for k in dst:
+        assert torch.equal(dst[k][untouched], ref[k][untouched])
diff --git a/tests/v1/utils.py b/tests/v1/utils.py
index 94826b20f8..a4c98d601f 100644
--- a/tests/v1/utils.py
+++ b/tests/v1/utils.py
@@ -46,6 +46,7 @@ class MockGPUKVFormat:
         NL_X_NB_BS_HS = 2
         NL_X_TWO_NB_NH_BS_HS = 3
         NL_X_NB_TWO_NH_BS_HS = 4
+        NL_X_NB_NH_BS_TWO_HS = 5
 
     class MockCOps:
         GPUKVFormat = MockGPUKVFormat
@@ -311,6 +312,9 @@ def generate_kv_cache_paged_list_tensors(
             shape = [2, num_blocks, num_heads, block_size, head_size]
         elif gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_NB_TWO_NH_BS_HS:
             shape = [num_blocks, 2, num_heads, block_size, head_size]
+        elif gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS:
+            # blocks-first, K/V fused into the trailing dim
+            shape = [num_blocks, num_heads, block_size, 2, head_size]
         else:
             raise ValueError(f"Unsupported gpu_kv_format: {gpu_kv_format}")
 

From 936bb94744e36ff58c09ac8af06a57ecd7fea83c Mon Sep 17 00:00:00 2001
From: Yihua Cheng <yihua98@uchicago.edu>
Date: Mon, 8 Jun 2026 18:07:08 -0700
Subject: [PATCH 07/57] [Refactor] Rename LMCacheGroupView to EngineGroupInfo
 (#3598)

Signed-off-by: ApostaC <yihua@tensormesh.ai>
---
 .../vllm/hybrid-kv-cache-groups.md            | 58 +++++++++----------
 .../commands/bench/server_bench/command.py    |  6 +-
 .../commands/bench/server_bench/helpers.py    | 14 ++---
 .../sglang/multi_process_adapter.py           |  4 +-
 lmcache/integration/vllm/kv_cache_groups.py   | 14 ++---
 .../integration/vllm/lmcache_mp_connector.py  |  8 ++-
 .../vllm/vllm_multi_process_adapter.py        | 16 ++---
 lmcache/v1/kv_layer_groups.py                 | 22 +++----
 lmcache/v1/multiprocess/gpu_context.py        |  6 +-
 lmcache/v1/multiprocess/group_view.py         | 30 +++++-----
 .../v1/multiprocess/modules/gpu_transfer.py   |  8 +--
 lmcache/v1/multiprocess/protocols/engine.py   |  6 +-
 .../transfer_context/worker_transfer.py       | 14 ++---
 lmcache/v1/platform/cache_context.py          |  8 +--
 .../test_gpu_transfer_layout_registry.py      |  2 +-
 .../multiprocess/test_mq_handler_helpers.py   | 10 ++--
 tests/v1/test_kv_cache_groups.py              | 46 +++++++--------
 tests/v1/test_kv_layer_groups_manager.py      | 20 +++----
 tests/v1/test_vllm_kv_cache_groups.py         | 12 ++--
 tests/v1/test_vllm_mp_adapter.py              | 10 ++--
 20 files changed, 159 insertions(+), 155 deletions(-)

diff --git a/docs/design/integration/vllm/hybrid-kv-cache-groups.md b/docs/design/integration/vllm/hybrid-kv-cache-groups.md
index 919b66afcc..95f2f05ad8 100644
--- a/docs/design/integration/vllm/hybrid-kv-cache-groups.md
+++ b/docs/design/integration/vllm/hybrid-kv-cache-groups.md
@@ -8,16 +8,16 @@ connector. It separates three concepts:
 - **Engine KV cache group** — a group defined by the serving engine (vLLM's
   `KVCacheConfig.kv_cache_groups`). Each is one distinct paged-block address
   space; block IDs are only meaningful within one group.
-- **`LMCacheGroupView`** — LMCache's engine-neutral, `msgspec`-encoded view of
-  one such group (`group_view.py`). A `list[LMCacheGroupView]` is the
+- **`EngineGroupInfo`** — LMCache's engine-neutral, `msgspec`-encoded record of
+  one such group (`group_view.py`). A `list[EngineGroupInfo]` is the
   registration contract.
 - **`KVLayerGroupInfo`** — the server's runtime transfer-kernel dispatch unit,
-  built from the views + the real tensors (`kv_layer_groups.py`).
+  built from the engine group infos + the real tensors (`kv_layer_groups.py`).
 
 vLLM groups layers by cache behavior; LMCache must transfer by physical layout
 (kv_size, num_heads, head_size, block_size, dtype) *and* keep distinct engine
-block-id spaces separate. So at registration we build group views, and
-store/retrieve address those views directly.
+block-id spaces separate. So at registration we build engine group infos, and
+store/retrieve address those infos directly.
 
 ## Goals / Non-Goals
 
@@ -36,13 +36,13 @@ store/retrieve address those views directly.
 
 ## Types
 
-- **`LMCacheGroupView`** (`msgspec.Struct`): `engine_group_id` (which engine
-  block group its layers live in; dense from 0) + `layer_indices`. Several views
+- **`EngineGroupInfo`** (`msgspec.Struct`): `engine_group_id` (which engine
+  block group its layers live in; dense from 0) + `layer_indices`. Several infos
   may share an `engine_group_id` when one engine group is split by physical
   transfer identity. The list order is the protocol-visible group order; an
   empty list means a single non-hybrid group.
-- Helpers in `group_view.py` operate on `Sequence[LMCacheGroupView]`:
-  `num_engine_groups`, `num_group_views`, `expand_block_ids_to_views`,
+- Helpers in `group_view.py` operate on `Sequence[EngineGroupInfo]`:
+  `num_engine_groups`, `num_engine_group_infos`, `expand_engine_block_ids`,
   `get_engine_group_indices`.
 - **`KVLayerGroupInfo`** (runtime, server-only): layer indices,
   `PageBufferShapeDesc`, dtype, compress ratio, physical chunk size,
@@ -52,17 +52,17 @@ store/retrieve address those views directly.
 
 ```text
 vLLM KVCacheConfig + registered kv_caches
-  | integration.vllm.kv_cache_groups.create_group_views_from_vllm
+  | integration.vllm.kv_cache_groups.create_engine_group_infos_from_vllm
   v
-list[LMCacheGroupView]  --REGISTER_KV_CACHE (msgspec)-->  server msgspec-decode
+list[EngineGroupInfo]  --REGISTER_KV_CACHE (msgspec)-->  server msgspec-decode
   | KVLayerGroupsManager validates against real tensors
   v
-KVLayerGroupInfo list   --STORE/RETRIEVE block_ids per view-->  transfer kernels
+KVLayerGroupInfo list   --STORE/RETRIEVE block_ids per info-->  transfer kernels
 ```
 
 ## Registration
 
-`create_group_views_from_vllm` (the only place that reads vLLM `KVCacheConfig`):
+`create_engine_group_infos_from_vllm` (the only place that reads vLLM `KVCacheConfig`):
 
 1. Inspect registered tensors for physical layout/dtype.
 2. Map each registered layer to its engine group index; layers absent from
@@ -71,17 +71,17 @@ KVLayerGroupInfo list   --STORE/RETRIEVE block_ids per view-->  transfer kernels
 3. `group_layers_by_identity` splits layers by transfer identity
    `(kv_size, num_heads, head_size, block_size, engine_group_idx, dtype)` — the
    `engine_group_idx` term keeps identically-shaped layers from different engine
-   groups in separate views.
-4. Emit one `LMCacheGroupView` per identity; send the list in the
+   groups in separate infos.
+4. Emit one `EngineGroupInfo` per identity; send the list in the
    `REGISTER_KV_CACHE` payload (the message queue encodes it).
 
 ## Store and retrieve
 
 vLLM reports block IDs per engine group. The worker adapter re-indexes them to
-group-view order with `expand_block_ids_to_views(group_views, block_ids)` (each
-view reuses its source engine group's block IDs), so `STORE`/`RETRIEVE` receive
-`list[list[int]]` indexed by view order. The server loop is then trivial: for
-view `i`, use `gpu_block_ids[i]`.
+engine-group-info order with `expand_engine_block_ids(engine_group_infos, block_ids)` (each
+info reuses its source engine group's block IDs), so `STORE`/`RETRIEVE` receive
+`list[list[int]]` indexed by info order. The server loop is then trivial: for
+info `i`, use `gpu_block_ids[i]`.
 
 ### Per-group block sizes
 
@@ -108,7 +108,7 @@ layers in `kv_cache_groups`; a sharing layer is absent from every group's
 `layer_names`. Such a layer's KV physically lives in its target owner's blocks,
 so storing/retrieving the owner already covers it. Registration therefore tags
 unlisted layers with `EXCLUDED_ENGINE_GROUP` and `group_layers_by_identity`
-skips them — they never form their own view. (Placing them in a group would
+skips them — they never form their own info. (Placing them in a group would
 duplicate work and, when their block size differs from the group they default
 into, corrupt the per-group block-id counts.)
 
@@ -124,19 +124,19 @@ vLLM exposes two engine groups — group 0: layers [0,2,4], group 1: [1,3]. If
 layers 0–3 share a shape but layer 4 differs, registration produces:
 
 ```text
-view 0: engine group 0, layers [0, 2]
-view 1: engine group 1, layers [1, 3]
-view 2: engine group 0, layers [4]
+info 0: engine group 0, layers [0, 2]
+info 1: engine group 1, layers [1, 3]
+info 2: engine group 0, layers [4]
 ```
 
 Block IDs `{group 0: [10,11], group 1: [20,21]}` are sent as
-`[[10,11], [20,21], [10,11]]` (views 0 and 2 share group 0's IDs).
+`[[10,11], [20,21], [10,11]]` (infos 0 and 2 share group 0's IDs).
 
 ## Invariants
 
-- The `list[LMCacheGroupView]` order is the protocol-visible group order; callers
-  send one block-id list per view.
-- vLLM-specific access stays in `lmcache.integration.vllm`; views carry neutral
+- The `list[EngineGroupInfo]` order is the protocol-visible group order; callers
+  send one block-id list per info.
+- vLLM-specific access stays in `lmcache.integration.vllm`; infos carry neutral
   metadata only.
 - The server reproduces grouping with the same `group_layers_by_identity`; real
   tensors remain the source of truth for shape/dtype/stride.
@@ -151,9 +151,9 @@ but LMCache cannot store/retrieve those layers.
 
 | Area | File |
 |---|---|
-| Group view (IPC type) + helpers | `lmcache/v1/multiprocess/group_view.py` |
+| Engine group info (IPC type) + helpers | `lmcache/v1/multiprocess/group_view.py` |
 | Shared grouping primitive | `lmcache/v1/kv_layer_groups.py` |
-| vLLM → `list[LMCacheGroupView]` | `lmcache/integration/vllm/kv_cache_groups.py` |
+| vLLM → `list[EngineGroupInfo]` | `lmcache/integration/vllm/kv_cache_groups.py` |
 | Register / store / retrieve | `lmcache/integration/vllm/{lmcache_mp_connector,vllm_multi_process_adapter}.py` |
 | Server GPU context / transfer | `lmcache/v1/multiprocess/{gpu_context,modules/gpu_transfer}.py` |
 | ZMQ protocol | `lmcache/v1/multiprocess/protocols/engine.py` |
diff --git a/lmcache/cli/commands/bench/server_bench/command.py b/lmcache/cli/commands/bench/server_bench/command.py
index 8ade408a1d..269f01a22e 100644
--- a/lmcache/cli/commands/bench/server_bench/command.py
+++ b/lmcache/cli/commands/bench/server_bench/command.py
@@ -294,7 +294,7 @@ def run_server_bench(  # noqa: ARG001  (command kept for symmetry with siblings)
         layer_groups = parse_kvcache_shape_spec(args.kvcache_shape_spec)
         # One block-id list is sent per LMCache KV group; each shape-spec
         # group becomes its own group server-side.
-        num_group_views = len(layer_groups) or 1
+        num_engine_group_infos = len(layer_groups) or 1
         # Echo the resolved spec so operators can verify that their
         # input was interpreted as intended. The echoed string is a
         # valid ``--kvcache-shape-spec`` itself.
@@ -473,7 +473,7 @@ def run_server_bench(  # noqa: ARG001  (command kept for symmetry with siblings)
                 http_base=http_base,
                 block_size=block_size,
                 total_blocks=num_blocks,
-                num_group_views=num_group_views,
+                num_engine_group_infos=num_engine_group_infos,
                 use_gpu=use_gpu,
                 use_handle=use_handle,
                 client_tensors=client_tensors,
@@ -492,7 +492,7 @@ def run_server_bench(  # noqa: ARG001  (command kept for symmetry with siblings)
                 http_base=http_base,
                 block_size=block_size,
                 total_blocks=num_blocks,
-                num_group_views=num_group_views,
+                num_engine_group_infos=num_engine_group_infos,
                 use_gpu=use_gpu,
                 use_handle=use_handle,
                 client_tensors=client_tensors,
diff --git a/lmcache/cli/commands/bench/server_bench/helpers.py b/lmcache/cli/commands/bench/server_bench/helpers.py
index dc7b99046b..1ed0be6d00 100644
--- a/lmcache/cli/commands/bench/server_bench/helpers.py
+++ b/lmcache/cli/commands/bench/server_bench/helpers.py
@@ -554,7 +554,7 @@ def _send_store(
     key: IPCCacheEngineKey,
     block_offset: int = 0,
     block_size: int = 16,
-    num_group_views: int = 1,
+    num_engine_group_infos: int = 1,
     use_gpu: bool = True,
     use_handle: bool | None = None,
     client_tensors: list["torch.Tensor"] | None = None,
@@ -582,7 +582,7 @@ def _send_store(
         payloads = [
             key,
             _INSTANCE_ID,
-            [block_ids] * num_group_views,
+            [block_ids] * num_engine_group_infos,
             _make_event_handle(use_gpu),
         ]
         result = _call(client, RequestType.STORE, payloads)
@@ -627,7 +627,7 @@ def _send_retrieve(
     hit_chunks: int,
     block_offset: int = 0,
     block_size: int = 16,
-    num_group_views: int = 1,
+    num_engine_group_infos: int = 1,
     use_gpu: bool = True,
     use_handle: bool | None = None,
     client_tensors: list["torch.Tensor"] | None = None,
@@ -654,7 +654,7 @@ def _send_retrieve(
         payloads = [
             key,
             _INSTANCE_ID,
-            [block_ids] * num_group_views,
+            [block_ids] * num_engine_group_infos,
             _make_event_handle(use_gpu),
             0,  # skip_first_n_tokens
         ]
@@ -784,7 +784,7 @@ def _process_request(
     http_base: str = "",
     block_size: int = 16,
     total_blocks: int = 1024,
-    num_group_views: int = 1,
+    num_engine_group_infos: int = 1,
     use_gpu: bool = True,
     use_handle: bool | None = None,
     client_tensors: list["torch.Tensor"] | None = None,
@@ -901,7 +901,7 @@ def _process_request(
             hit_chunks,
             block_offset=block_offset,
             block_size=block_size,
-            num_group_views=num_group_views,
+            num_engine_group_infos=num_engine_group_infos,
             use_gpu=use_gpu,
             use_handle=use_handle,
             client_tensors=client_tensors,
@@ -938,7 +938,7 @@ def _process_request(
             store_key,
             block_offset=store_block_off,
             block_size=block_size,
-            num_group_views=num_group_views,
+            num_engine_group_infos=num_engine_group_infos,
             use_gpu=use_gpu,
             use_handle=use_handle,
             client_tensors=client_tensors,
diff --git a/lmcache/integration/sglang/multi_process_adapter.py b/lmcache/integration/sglang/multi_process_adapter.py
index 16de0e2955..3bf940abde 100644
--- a/lmcache/integration/sglang/multi_process_adapter.py
+++ b/lmcache/integration/sglang/multi_process_adapter.py
@@ -135,12 +135,12 @@ def __init__(
 
         # Upstream's REGISTER_KV_CACHE protocol takes flat positional args:
         # (instance_id, kv_cache, model_name, world_size, engine_type,
-        # layout_hints, group_views). SGLang's natural KV layout is depth-2
+        # layout_hints, engine_group_infos). SGLang's natural KV layout is depth-2
         # ([K_layers, V_layers]); we flatten it on the wire to fit
         # ``KVCache = list[CudaIPCWrapper]``. The daemon recognizes the
         # SGLang-MHA flat-of-2NL pattern from ``EngineType.SGLANG`` plus the
         # ``tokens_per_block`` hint and un-flattens + reshapes per layer.
-        # SGLang is non-hybrid (a single KV cache group), so group_views is the
+        # SGLang is non-hybrid (a single KV cache group), so engine_group_infos is the
         # empty list -- which the server treats as one group spanning all layers
         # (matching the vLLM non-hybrid and TensorRT-LLM register paths).
         send_lmcache_request(
diff --git a/lmcache/integration/vllm/kv_cache_groups.py b/lmcache/integration/vllm/kv_cache_groups.py
index 5167d74dd8..fdc9459410 100644
--- a/lmcache/integration/vllm/kv_cache_groups.py
+++ b/lmcache/integration/vllm/kv_cache_groups.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Build LMCache group views from vLLM KV cache group metadata."""
+"""Build LMCache engine group infos from vLLM KV cache group metadata."""
 
 # Future
 from __future__ import annotations
@@ -13,15 +13,15 @@
     from lmcache.v1.gpu_connector.utils import LayoutHints
 
 # First Party
-from lmcache.v1.multiprocess.group_view import LMCacheGroupView
+from lmcache.v1.multiprocess.group_view import EngineGroupInfo
 
 
-def create_group_views_from_vllm(
+def create_engine_group_infos_from_vllm(
     kv_cache_config: Any,
     kv_caches: Mapping[str, Any],
     layout_hints: "LayoutHints | None" = None,
-) -> list[LMCacheGroupView]:
-    """Build the LMCache group views from vLLM metadata and registered tensors.
+) -> list[EngineGroupInfo]:
+    """Build the LMCache engine group infos from vLLM metadata and registered tensors.
 
     This is the single entry point for the vLLM -> LMCache conversion. It reads
     the vLLM-specific fields (``KVCacheConfig.kv_cache_groups`` and
@@ -41,7 +41,7 @@ def create_group_views_from_vllm(
             detection (e.g. ``NHD``/``HND`` and compression metadata).
 
     Returns:
-        The list of ``LMCacheGroupView`` in protocol order, i.e. the LMCache group
+        The list of ``EngineGroupInfo`` in protocol order, i.e. the LMCache group
         order used by store/retrieve block IDs.
     """
     # First Party
@@ -95,7 +95,7 @@ def create_group_views_from_vllm(
     # the shared, engine-neutral primitive the server reuses to reproduce the
     # same grouping from the registered tensors.
     return [
-        LMCacheGroupView(
+        EngineGroupInfo(
             engine_group_id=identity[4],
             layer_indices=tuple(indices),
         )
diff --git a/lmcache/integration/vllm/lmcache_mp_connector.py b/lmcache/integration/vllm/lmcache_mp_connector.py
index 012e96fd2c..3ca1c9086d 100644
--- a/lmcache/integration/vllm/lmcache_mp_connector.py
+++ b/lmcache/integration/vllm/lmcache_mp_connector.py
@@ -37,7 +37,7 @@ class SupportsHMA:  # type: ignore[no-redef]
 # First Party
 from lmcache import torch_dev
 from lmcache.integration.vllm.kv_cache_groups import (
-    create_group_views_from_vllm,
+    create_engine_group_infos_from_vllm,
 )
 from lmcache.integration.vllm.utils import mla_enabled, vllm_layout_hints
 from lmcache.utils import init_logger as lmcache_init_logger
@@ -619,12 +619,14 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
         """
         logger.info("Registering kv caches!")
         kv_cache_config = getattr(self, "_kv_cache_config", None)
-        group_views = create_group_views_from_vllm(
+        engine_group_infos = create_engine_group_infos_from_vllm(
             kv_cache_config,
             kv_caches,
             layout_hints=vllm_layout_hints(),
         )
-        self.worker_adapter.register_kv_caches(kv_caches, group_views=group_views)
+        self.worker_adapter.register_kv_caches(
+            kv_caches, engine_group_infos=engine_group_infos
+        )
         return
 
     def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None:
diff --git a/lmcache/integration/vllm/vllm_multi_process_adapter.py b/lmcache/integration/vllm/vllm_multi_process_adapter.py
index c2fcb083a9..3fcc0054e6 100644
--- a/lmcache/integration/vllm/vllm_multi_process_adapter.py
+++ b/lmcache/integration/vllm/vllm_multi_process_adapter.py
@@ -22,8 +22,8 @@
     KVCache,
 )
 from lmcache.v1.multiprocess.group_view import (
-    LMCacheGroupView,
-    expand_block_ids_to_views,
+    EngineGroupInfo,
+    expand_engine_block_ids,
 )
 from lmcache.v1.multiprocess.mq import MessageQueueClient, MessagingFuture
 from lmcache.v1.multiprocess.protocol import RequestType, get_response_class
@@ -920,7 +920,7 @@ def __init__(
 
         # Registered kv caches from vLLM
         self.kv_caches: dict[str, torch.Tensor] = {}
-        self.group_views: list[LMCacheGroupView] = []
+        self.engine_group_infos: list[EngineGroupInfo] = []
 
         # Transport context for transfer operations.
         self.transfer_ctx: TransferContext | None = None
@@ -1034,7 +1034,7 @@ def is_first_rank_of_pp_group(self) -> bool:
     def register_kv_caches(
         self,
         kv_caches: dict[str, torch.Tensor],
-        group_views: Sequence[LMCacheGroupView] = (),
+        engine_group_infos: Sequence[EngineGroupInfo] = (),
     ) -> None:
         """
         Register the kv caches with LMCache server.
@@ -1042,7 +1042,7 @@ def register_kv_caches(
         Args:
             kv_caches: A dict of kv caches to register. The keys are the
                 layer names and the values are the corresponding tensors.
-            group_views: LMCache-owned engine KV cache group metadata.
+            engine_group_infos: LMCache-owned engine KV cache group metadata.
 
         Raises:
             ConnectionError: if the server does not respond within
@@ -1050,11 +1050,11 @@ def register_kv_caches(
         """
         logger.info("Registering kv caches")
         self.kv_caches = kv_caches
-        self.group_views = list(group_views)
+        self.engine_group_infos = list(engine_group_infos)
         self._send_register_kv_caches_request(kv_caches)
 
     def _block_ids_per_group(self, op: LoadStoreOp) -> list[list[int]]:
-        return expand_block_ids_to_views(self.group_views, op.block_ids)
+        return expand_engine_block_ids(self.engine_group_infos, op.block_ids)
 
     def _send_register_kv_caches_request(
         self, kv_caches: dict[str, torch.Tensor]
@@ -1090,7 +1090,7 @@ def _send_register_kv_caches_request(
                 self._mq_timeout,
                 send_request=send_lmcache_request,
                 layout_hints=layout_hints,
-                group_views=self.group_views,
+                engine_group_infos=self.engine_group_infos,
             )
         except TimeoutError:
             raise ConnectionError(
diff --git a/lmcache/v1/kv_layer_groups.py b/lmcache/v1/kv_layer_groups.py
index 2127bcb573..78ea6ba055 100644
--- a/lmcache/v1/kv_layer_groups.py
+++ b/lmcache/v1/kv_layer_groups.py
@@ -20,7 +20,7 @@
 if TYPE_CHECKING:
     # First Party
     from lmcache.v1.gpu_connector.utils import DiscoverableKVCache, LayoutHints
-    from lmcache.v1.multiprocess.group_view import LMCacheGroupView
+    from lmcache.v1.multiprocess.group_view import EngineGroupInfo
 
 logger = init_logger(__name__)
 
@@ -63,7 +63,7 @@ class KernelGroupIdentity(NamedTuple):
 
 # Sentinel ``per_layer_engine_group_idx`` value: a KV tensor tagged with it is
 # excluded from every LMCache group (used for cross-layer KV-sharing layers; see
-# ``create_group_views_from_vllm``).
+# ``create_engine_group_infos_from_vllm``).
 EXCLUDED_ENGINE_GROUP = -1
 
 
@@ -271,7 +271,7 @@ def __init__(
         gpu_kv_format: "lmc_ops.GPUKVFormat",
         num_blocks: int,
         layout_hints: "LayoutHints | None" = None,
-        group_views: "Sequence[LMCacheGroupView]" = (),
+        engine_group_infos: "Sequence[EngineGroupInfo]" = (),
         lmcache_logical_chunk_size: int = 256,
     ) -> None:
         """Partition layers into groups keyed by
@@ -301,7 +301,7 @@ def __init__(
                 group's ``compress_ratio`` and ``physical_chunk_size``.
                 ``None`` means every group is treated as non-compressed
                 (``compress_ratio == 1``).
-            group_views: LMCache-owned engine KV cache group
+            engine_group_infos: LMCache-owned engine KV cache group
                 metadata. When present, it is used to keep layers from
                 different engine block-ID spaces in separate LMCache
                 transfer groups.
@@ -342,7 +342,9 @@ def __init__(
             logger.debug("No KV caches available, skipping KV layer groups building")
             return
 
-        per_layer_engine_group_idx = get_engine_group_indices(group_views, num_layers)
+        per_layer_engine_group_idx = get_engine_group_indices(
+            engine_group_infos, num_layers
+        )
 
         groups_by_identity = group_layers_by_identity(
             kv_caches, gpu_kv_format, num_layers, per_layer_engine_group_idx
@@ -410,7 +412,7 @@ def __init__(
         )
 
         # Detect the object groups
-        self._object_groups = self._detect_object_groups(group_views)
+        self._object_groups = self._detect_object_groups(engine_group_infos)
 
     @property
     def kernel_groups(self) -> list[KernelGroupInfo]:
@@ -516,18 +518,18 @@ def calculate_num_blocks(self, kernel_group_idx: int, num_tokens: int) -> int:
 
     ### Helper methods
     def _detect_object_groups(
-        self, group_views: "Sequence[LMCacheGroupView]"
+        self, engine_group_infos: "Sequence[EngineGroupInfo]"
     ) -> list[ObjectGroupInfo]:
-        """Detect object groups based on the provided group views.
+        """Detect object groups based on the provided engine group infos.
 
         Args:
-            group_views: LMCache-owned engine KV cache group metadata.
+            engine_group_infos: LMCache-owned engine KV cache group metadata.
 
         Returns:
             A list of ObjectGroupInfo instances representing the detected object groups.
         """
         # TODO: add the real object group detection logic based on
-        # the attention type metadata in the group views once it's
+        # the attention type metadata in the engine group infos once it's
         # available.
         # Now, we are using a single object group, which means
         # all kernel groups' KV caches will be stored in the same memory object.
diff --git a/lmcache/v1/multiprocess/gpu_context.py b/lmcache/v1/multiprocess/gpu_context.py
index 182d55af15..b3eef93296 100644
--- a/lmcache/v1/multiprocess/gpu_context.py
+++ b/lmcache/v1/multiprocess/gpu_context.py
@@ -38,7 +38,7 @@
 )
 from lmcache.v1.kv_layer_groups import KVLayerGroupsManager
 from lmcache.v1.multiprocess.custom_types import KVCache
-from lmcache.v1.multiprocess.group_view import LMCacheGroupView
+from lmcache.v1.multiprocess.group_view import EngineGroupInfo
 
 # Backend selection (c_ops when CUDA is available, otherwise a pure-Python
 # fallback) is handled once in ``lmcache/__init__.py`` via ``_get_backend``,
@@ -342,7 +342,7 @@ def __init__(
         kv_caches: KVCache,
         lmcache_logical_chunk_size: int = 256,
         layout_hints: LayoutHints | None = None,
-        group_views: Sequence[LMCacheGroupView] = (),
+        engine_group_infos: Sequence[EngineGroupInfo] = (),
         engine_type: EngineType = EngineType.VLLM,
     ):
         unwrapped = unwrap_kv_cache_tensors(kv_caches)
@@ -362,7 +362,7 @@ def __init__(
             gpu_kv_format=self.gpu_kv_format_,
             num_blocks=self.num_blocks_,
             layout_hints=layout_hints,
-            group_views=group_views,
+            engine_group_infos=engine_group_infos,
             lmcache_logical_chunk_size=lmcache_logical_chunk_size,
         )
 
diff --git a/lmcache/v1/multiprocess/group_view.py b/lmcache/v1/multiprocess/group_view.py
index 7155791926..5c95cef30d 100644
--- a/lmcache/v1/multiprocess/group_view.py
+++ b/lmcache/v1/multiprocess/group_view.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""LMCache's engine-neutral view of a serving engine's KV cache groups.
+"""LMCache's engine-neutral description of a serving engine's KV cache groups.
 
 An *engine group* is one distinct paged-block address space exposed by the
 serving engine (e.g. one of vLLM's hybrid KV cache groups): block IDs are only
@@ -7,8 +7,8 @@
 merged into one LMCache KV group. Engine group ids are assumed dense and
 consecutive starting from 0.
 
-LMCache's neutral KV cache spec is simply a ``list[LMCacheGroupView]`` (passed as
-a ``Sequence[LMCacheGroupView]`` where only order matters). The group order is
+LMCache's neutral KV cache spec is simply a ``list[EngineGroupInfo]`` (passed as
+a ``Sequence[EngineGroupInfo]`` where only order matters). The group order is
 the protocol-visible LMCache group order used by store/retrieve block IDs. An
 empty list means a single non-hybrid group (the default for engines that do not
 report KV cache group metadata). Engine-specific conversion belongs in the
@@ -23,26 +23,26 @@
 import msgspec
 
 
-class LMCacheGroupView(msgspec.Struct, frozen=True):
+class EngineGroupInfo(msgspec.Struct, frozen=True):
     """One LMCache KV group: layers of one engine group that share a copy kernel.
 
     Carries the layer indices and which engine group they belong to. Several
-    ``LMCacheGroupView`` instances may share the same ``engine_group_id`` when
+    ``EngineGroupInfo`` instances may share the same ``engine_group_id`` when
     one engine group is split by physical transfer identity (e.g. differing
-    hidden dims). A ``list[LMCacheGroupView]`` is carried verbatim in the
+    hidden dims). A ``list[EngineGroupInfo]`` is carried verbatim in the
     ``REGISTER_KV_CACHE`` IPC payload; the message queue handles
     encoding/decoding.
     """
 
     engine_group_id: int
-    """Engine group this view's layers live in (one distinct paged-block address
+    """Engine group these layers live in (one distinct paged-block address
     space). Selects which request block-id list applies. Dense from 0."""
 
     layer_indices: tuple[int, ...] = ()
     """Registered KV tensor indices assigned to this group."""
 
 
-def num_engine_groups(groups: Sequence[LMCacheGroupView]) -> int:
+def num_engine_groups(groups: Sequence[EngineGroupInfo]) -> int:
     """Return the number of engine groups (block-id lists per transfer request).
 
     Engine group ids are assumed dense and consecutive from 0.
@@ -59,7 +59,7 @@ def num_engine_groups(groups: Sequence[LMCacheGroupView]) -> int:
     return max(group.engine_group_id for group in groups) + 1
 
 
-def num_group_views(groups: Sequence[LMCacheGroupView]) -> int:
+def num_engine_group_infos(groups: Sequence[EngineGroupInfo]) -> int:
     """Return the number of LMCache KV groups visible to transfer requests.
 
     Args:
@@ -75,7 +75,7 @@ def num_group_views(groups: Sequence[LMCacheGroupView]) -> int:
 
 
 def _engine_group_id_per_view(
-    groups: Sequence[LMCacheGroupView],
+    groups: Sequence[EngineGroupInfo],
 ) -> tuple[int, ...]:
     """Return, per LMCache group, the engine group it draws block IDs from.
 
@@ -84,7 +84,7 @@ def _engine_group_id_per_view(
 
     Returns:
         A tuple whose length equals the number of LMCache groups (i.e.
-        :func:`num_group_views`); element ``i`` is the engine group id
+        :func:`num_engine_group_infos`); element ``i`` is the engine group id
         that LMCache group ``i`` reads block IDs from. ``(0,)`` for an empty
         ``groups`` (single non-hybrid group).
     """
@@ -93,11 +93,11 @@ def _engine_group_id_per_view(
     return tuple(group.engine_group_id for group in groups)
 
 
-def expand_block_ids_to_views(
-    groups: Sequence[LMCacheGroupView],
+def expand_engine_block_ids(
+    groups: Sequence[EngineGroupInfo],
     engine_side_block_ids: Sequence[Sequence[int]] | Sequence[int],
 ) -> list[list[int]]:
-    """Re-index engine-side block IDs to one list per LMCache group.
+    """Expand the engine-side block id list to the list per LMCache kernel group.
 
     The serving engine reports block IDs per engine group. LMCache transfer
     requests are indexed by LMCache KV group, so each LMCache group reuses the
@@ -174,7 +174,7 @@ def slice_block_ids_per_group(
 
 
 def get_engine_group_indices(
-    groups: Sequence[LMCacheGroupView],
+    groups: Sequence[EngineGroupInfo],
     num_registered_layers: int,
 ) -> list[int] | None:
     """Return the engine group index for each registered KV tensor.
diff --git a/lmcache/v1/multiprocess/modules/gpu_transfer.py b/lmcache/v1/multiprocess/modules/gpu_transfer.py
index 8b012af0c3..eaf4d1ff83 100644
--- a/lmcache/v1/multiprocess/modules/gpu_transfer.py
+++ b/lmcache/v1/multiprocess/modules/gpu_transfer.py
@@ -36,7 +36,7 @@
     ThreadPoolType,
 )
 from lmcache.v1.multiprocess.gpu_context import GPUCacheContext
-from lmcache.v1.multiprocess.group_view import LMCacheGroupView
+from lmcache.v1.multiprocess.group_view import EngineGroupInfo
 from lmcache.v1.multiprocess.native_completion import (
     DeviceHostFuncDispatcher,
     submit_callback_to_stream,
@@ -231,7 +231,7 @@ def register_kv_cache(
         world_size: int,
         engine_type: EngineType,
         layout_hints: LayoutHints,
-        group_views: list[LMCacheGroupView],
+        engine_group_infos: list[EngineGroupInfo],
     ) -> None:
         """Register the KV cache tensors for a given GPU instance ID.
 
@@ -245,7 +245,7 @@ def register_kv_cache(
                 Forwarded to GPUCacheContext for format detection.
             layout_hints: See LayoutHints.  Forwarded to
                 GPUCacheContext for GPU KV format detection.
-            group_views: Engine-neutral KV cache group metadata
+            engine_group_infos: Engine-neutral KV cache group metadata
                 (already msgspec-decoded by the message queue).
         """
         if instance_id in self._cache_contexts:
@@ -260,7 +260,7 @@ def register_kv_cache(
             kv_caches,
             self._ctx.chunk_size,
             layout_hints=layout_hints or None,
-            group_views=group_views,
+            engine_group_infos=engine_group_infos,
             engine_type=engine_type,
         )
         self._cache_contexts[instance_id] = ContextEntry(
diff --git a/lmcache/v1/multiprocess/protocols/engine.py b/lmcache/v1/multiprocess/protocols/engine.py
index bc61f7f2b1..b2aa603c66 100644
--- a/lmcache/v1/multiprocess/protocols/engine.py
+++ b/lmcache/v1/multiprocess/protocols/engine.py
@@ -24,7 +24,7 @@
     KVCache,
     RegisterNonGpuContextPayload,
 )
-from lmcache.v1.multiprocess.group_view import LMCacheGroupView
+from lmcache.v1.multiprocess.group_view import EngineGroupInfo
 from lmcache.v1.multiprocess.protocols.base import HandlerType, ProtocolDefinition
 
 
@@ -96,7 +96,7 @@ def get_protocol_definitions() -> dict[str, ProtocolDefinition]:
         #   - engine_type: EngineType - Which serving engine produced the
         #     caches (vLLM, SGLang, ...). Drives format detection.
         #   - layout_hints: LayoutHints - See custom_types.LayoutHints.
-        #   - group_views: list[LMCacheGroupView] - Engine-neutral KV cache
+        #   - engine_group_infos: list[EngineGroupInfo] - Engine-neutral KV cache
         #     group metadata (msgspec-encoded by the message queue).
         # Returns: None
         "REGISTER_KV_CACHE": ProtocolDefinition(
@@ -107,7 +107,7 @@ def get_protocol_definitions() -> dict[str, ProtocolDefinition]:
                 int,
                 EngineType,
                 LayoutHints,
-                list[LMCacheGroupView],
+                list[EngineGroupInfo],
             ],
             response_class=None,
             handler_type=HandlerType.SYNC,
diff --git a/lmcache/v1/multiprocess/transfer_context/worker_transfer.py b/lmcache/v1/multiprocess/transfer_context/worker_transfer.py
index 41c72ca7d2..7ab3dc3cdc 100644
--- a/lmcache/v1/multiprocess/transfer_context/worker_transfer.py
+++ b/lmcache/v1/multiprocess/transfer_context/worker_transfer.py
@@ -18,7 +18,7 @@
 from lmcache.v1.gpu_connector.utils import LayoutHints, is_mla
 from lmcache.v1.multiprocess.custom_types import RegisterNonGpuContextPayload
 from lmcache.v1.multiprocess.futures import MessagingFuture
-from lmcache.v1.multiprocess.group_view import LMCacheGroupView
+from lmcache.v1.multiprocess.group_view import EngineGroupInfo
 from lmcache.v1.multiprocess.mq import MessageQueueClient
 from lmcache.v1.multiprocess.protocol import RequestType
 from lmcache.v1.multiprocess.protocols.engine import RegisterNonGpuContextResponse
@@ -126,7 +126,7 @@ def register(
         mq_timeout: float,
         send_request: SendRequest,
         layout_hints: LayoutHints | None = None,
-        group_views: Sequence[LMCacheGroupView] = (),
+        engine_group_infos: Sequence[EngineGroupInfo] = (),
     ) -> None:
         """Register KV caches with the server and wait for ACK.
 
@@ -140,7 +140,7 @@ def register(
             mq_timeout: Timeout in seconds for synchronous request wait.
             send_request: Request sender callable used to issue MQ requests.
             layout_hints: Optional inference-engine-provided layout hints.
-            group_views: LMCache-owned engine KV cache group metadata.
+            engine_group_infos: LMCache-owned engine KV cache group metadata.
 
         Raises:
             TimeoutError: If server registration does not complete before
@@ -232,7 +232,7 @@ def register(
         mq_timeout: float,
         send_request: SendRequest,
         layout_hints: LayoutHints | None = None,
-        group_views: Sequence[LMCacheGroupView] = (),
+        engine_group_infos: Sequence[EngineGroupInfo] = (),
     ) -> None:
         # First Party
         from lmcache.integration.vllm.vllm_multi_process_adapter import wrap_kv_caches
@@ -249,7 +249,7 @@ def register(
                 world_size,
                 EngineType.VLLM,
                 layout_hints,
-                list(group_views),
+                list(engine_group_infos),
             ],
         )
         future.result(timeout=mq_timeout)
@@ -321,11 +321,11 @@ def register(
         mq_timeout: float,
         send_request: SendRequest,
         layout_hints: LayoutHints | None = None,
-        group_views: Sequence[LMCacheGroupView] = (),
+        engine_group_infos: Sequence[EngineGroupInfo] = (),
     ) -> None:
         """Register KV caches with the non-GPU context server.
 
-        ``group_views`` is accepted to satisfy the base interface but
+        ``engine_group_infos`` is accepted to satisfy the base interface but
         is currently a no-op: the non-GPU transfer path does not support
         hybrid KV cache groups and rejects multi-group transfers at store /
         retrieve time (see ``_single_group_block_ids``).
diff --git a/lmcache/v1/platform/cache_context.py b/lmcache/v1/platform/cache_context.py
index 3c723cb222..b3f983e74f 100644
--- a/lmcache/v1/platform/cache_context.py
+++ b/lmcache/v1/platform/cache_context.py
@@ -25,14 +25,14 @@
 
 if TYPE_CHECKING:
     # First Party
-    from lmcache.v1.multiprocess.group_view import LMCacheGroupView
+    from lmcache.v1.multiprocess.group_view import EngineGroupInfo
 
 
 def create_cache_context(
     kv_caches: KVCache,
     lmcache_logical_chunk_size: int = 256,
     layout_hints: LayoutHints | None = None,
-    group_views: "Sequence[LMCacheGroupView]" = (),
+    engine_group_infos: "Sequence[EngineGroupInfo]" = (),
     engine_type: EngineType = EngineType.VLLM,
 ) -> Any:
     """Create the appropriate cache context.
@@ -50,7 +50,7 @@ def create_cache_context(
         lmcache_logical_chunk_size: Number of tokens per LMCache chunk.
         layout_hints: Optional hints for GPU KV format detection.
             Forwarded verbatim to the concrete context constructor.
-        group_views: Engine-neutral KV cache group metadata.
+        engine_group_infos: Engine-neutral KV cache group metadata.
         engine_type: Which serving engine produced the caches.
 
     Returns:
@@ -70,6 +70,6 @@ def create_cache_context(
         kv_caches,
         lmcache_logical_chunk_size,
         layout_hints,
-        group_views,
+        engine_group_infos,
         engine_type,
     )
diff --git a/tests/v1/multiprocess/test_gpu_transfer_layout_registry.py b/tests/v1/multiprocess/test_gpu_transfer_layout_registry.py
index 8ab2470bef..09abf56228 100644
--- a/tests/v1/multiprocess/test_gpu_transfer_layout_registry.py
+++ b/tests/v1/multiprocess/test_gpu_transfer_layout_registry.py
@@ -72,7 +72,7 @@ def fake_create_cache_context(
         kv_caches: object,
         lmcache_logical_chunk_size: int,
         layout_hints: object = None,
-        group_views: object = (),
+        engine_group_infos: object = (),
         engine_type: object = None,
     ) -> _FakeGPUContext:
         """Return a fake cache context without touching CUDA or wrappers."""
diff --git a/tests/v1/multiprocess/test_mq_handler_helpers.py b/tests/v1/multiprocess/test_mq_handler_helpers.py
index 43610b07bc..60b1b72345 100644
--- a/tests/v1/multiprocess/test_mq_handler_helpers.py
+++ b/tests/v1/multiprocess/test_mq_handler_helpers.py
@@ -13,7 +13,7 @@
     BlockAllocationRecord,
     KVCache,
 )
-from lmcache.v1.multiprocess.group_view import LMCacheGroupView
+from lmcache.v1.multiprocess.group_view import EngineGroupInfo
 from lmcache.v1.multiprocess.protocol import KeyType
 
 # ==============================================================================
@@ -41,7 +41,7 @@ def register_kv_cache_handler(
     world_size: int,
     engine_type: EngineType,
     layout_hints: LayoutHints,
-    group_views: list[LMCacheGroupView],
+    engine_group_infos: list[EngineGroupInfo],
 ) -> None:
     """
     Dummy handler for REGISTER_KV_CACHE requests.
@@ -56,7 +56,7 @@ def register_kv_cache_handler(
             ``layout_hints["inference_engine_logical_block_size"]``
             carries the logical tokens-per-engine-block (previously a
             standalone argument).
-        group_views: Engine-neutral KV cache group metadata,
+        engine_group_infos: Engine-neutral KV cache group metadata,
             msgspec-decoded from the request payload.
 
     Returns:
@@ -86,8 +86,8 @@ def register_kv_cache_handler(
         "Expected layout_hints['inference_engine_logical_block_size'] to be int, got "
         f"{type(ie_logical_block_size)}"
     )
-    assert isinstance(group_views, list), (
-        f"Expected group_views to be a list, got {type(group_views)}"
+    assert isinstance(engine_group_infos, list), (
+        f"Expected engine_group_infos to be a list, got {type(engine_group_infos)}"
     )
     # No return value (returns None implicitly)
 
diff --git a/tests/v1/test_kv_cache_groups.py b/tests/v1/test_kv_cache_groups.py
index cb04ff45f1..b554f1f699 100644
--- a/tests/v1/test_kv_cache_groups.py
+++ b/tests/v1/test_kv_cache_groups.py
@@ -4,61 +4,61 @@
 
 # First Party
 from lmcache.v1.multiprocess.group_view import (
-    LMCacheGroupView,
-    expand_block_ids_to_views,
+    EngineGroupInfo,
+    expand_engine_block_ids,
     get_engine_group_indices,
+    num_engine_group_infos,
     num_engine_groups,
-    num_group_views,
     slice_block_ids_per_group,
 )
 
 
-def test_group_views_default_to_one_engine_group():
+def test_engine_group_infos_default_to_one_engine_group():
     assert num_engine_groups([]) == 1
-    assert num_group_views([]) == 1
+    assert num_engine_group_infos([]) == 1
     assert get_engine_group_indices([], 1) is None
 
 
-def test_group_views_build_per_layer_engine_group_indices():
+def test_engine_group_infos_build_per_layer_engine_group_indices():
     groups = [
-        LMCacheGroupView(0, (0, 2)),
-        LMCacheGroupView(1, (1, 3)),
+        EngineGroupInfo(0, (0, 2)),
+        EngineGroupInfo(1, (1, 3)),
     ]
 
     assert num_engine_groups(groups) == 2
-    assert num_group_views(groups) == 2
+    assert num_engine_group_infos(groups) == 2
     assert get_engine_group_indices(groups, 4) == [0, 1, 0, 1]
 
 
-def test_group_views_expand_block_ids_to_views():
+def test_engine_group_infos_expand_engine_block_ids():
     groups = [
-        LMCacheGroupView(0, (0, 2)),
-        LMCacheGroupView(0, (4,)),
-        LMCacheGroupView(1, (1, 3)),
+        EngineGroupInfo(0, (0, 2)),
+        EngineGroupInfo(0, (4,)),
+        EngineGroupInfo(1, (1, 3)),
     ]
 
-    assert expand_block_ids_to_views(groups, [[10, 11], [20, 21]]) == [
+    assert expand_engine_block_ids(groups, [[10, 11], [20, 21]]) == [
         [10, 11],
         [10, 11],
         [20, 21],
     ]
 
 
-def test_group_views_msgspec_round_trip():
+def test_engine_group_infos_msgspec_round_trip():
     """The groups encode/decode losslessly via msgspec (the IPC path)."""
     groups = [
-        LMCacheGroupView(0, (0, 2)),
-        LMCacheGroupView(1, (1, 3)),
+        EngineGroupInfo(0, (0, 2)),
+        EngineGroupInfo(1, (1, 3)),
     ]
 
     decoded = msgspec.msgpack.decode(
-        msgspec.msgpack.encode(groups), type=list[LMCacheGroupView]
+        msgspec.msgpack.encode(groups), type=list[EngineGroupInfo]
     )
 
     assert decoded == groups
 
 
-def test_group_views_exclude_uncovered_layers():
+def test_engine_group_infos_exclude_uncovered_layers():
     """Layers not referenced by any group are tagged EXCLUDED_ENGINE_GROUP.
 
     Cross-layer KV-sharing layers (e.g. google/gemma-4-E4B-it) alias a target
@@ -69,16 +69,16 @@ def test_group_views_exclude_uncovered_layers():
     from lmcache.v1.kv_layer_groups import EXCLUDED_ENGINE_GROUP
 
     groups = [
-        LMCacheGroupView(0, (0,)),
-        LMCacheGroupView(1, (1,)),
+        EngineGroupInfo(0, (0,)),
+        EngineGroupInfo(1, (1,)),
     ]
 
     # Layer 2 is not covered by any group -> excluded, not an error.
     assert get_engine_group_indices(groups, 3) == [0, 1, EXCLUDED_ENGINE_GROUP]
 
 
-def test_group_views_reject_out_of_range_layer():
-    groups = [LMCacheGroupView(0, (0, 5))]
+def test_engine_group_infos_reject_out_of_range_layer():
+    groups = [EngineGroupInfo(0, (0, 5))]
 
     try:
         get_engine_group_indices(groups, 3)
diff --git a/tests/v1/test_kv_layer_groups_manager.py b/tests/v1/test_kv_layer_groups_manager.py
index e8cc99bf0d..58aae45fcc 100644
--- a/tests/v1/test_kv_layer_groups_manager.py
+++ b/tests/v1/test_kv_layer_groups_manager.py
@@ -19,7 +19,7 @@
     format_kvcache_shape_spec,
     parse_kvcache_shape_spec,
 )
-from lmcache.v1.multiprocess.group_view import LMCacheGroupView
+from lmcache.v1.multiprocess.group_view import EngineGroupInfo
 
 pytestmark = pytest.mark.skipif(
     not torch.cuda.is_available(), reason="PageBufferShapeDesc requires CUDA build"
@@ -31,7 +31,7 @@ def _build_manager(
     *,
     num_blocks: int,
     layout_hints: LayoutHints | None = None,
-    group_views: Sequence[LMCacheGroupView] = (),
+    engine_group_infos: Sequence[EngineGroupInfo] = (),
 ) -> KVLayerGroupsManager:
     """Build a manager using the per-layer NHD format.
 
@@ -48,7 +48,7 @@ def _build_manager(
         gpu_kv_format=lmc_ops.GPUKVFormat.NL_X_TWO_NB_BS_NH_HS,
         num_blocks=num_blocks,
         layout_hints=layout_hints,
-        group_views=group_views,
+        engine_group_infos=engine_group_infos,
     )
 
 
@@ -95,9 +95,9 @@ def test_build_splits_same_shape_by_engine_group_idx(self):
         manager = _build_manager(
             tensors,
             num_blocks=32,
-            group_views=[
-                LMCacheGroupView(0, (0, 2)),
-                LMCacheGroupView(1, (1, 3)),
+            engine_group_infos=[
+                EngineGroupInfo(0, (0, 2)),
+                EngineGroupInfo(1, (1, 3)),
             ],
         )
 
@@ -108,7 +108,7 @@ def test_build_splits_same_shape_by_engine_group_idx(self):
         assert groups_by_engine_group_idx[0].layer_indices == [0, 2]
         assert groups_by_engine_group_idx[1].layer_indices == [1, 3]
 
-    def test_build_rejects_bad_group_views(self):
+    def test_build_rejects_bad_engine_group_infos(self):
         tensors = [
             torch.randn(2, 32, 256, 8, 64, dtype=torch.float16) for _ in range(2)
         ]
@@ -116,7 +116,7 @@ def test_build_rejects_bad_group_views(self):
             _build_manager(
                 tensors,
                 num_blocks=32,
-                group_views=[LMCacheGroupView(0, (2,))],
+                engine_group_infos=[EngineGroupInfo(0, (2,))],
             )
 
     def test_build_different_shapes(self):
@@ -380,14 +380,14 @@ def test_empty_manager_has_no_groups(self):
         assert manager.num_object_groups == 0
 
     def test_excluded_layer_left_out_of_all_groups(self):
-        # Layer 2 is referenced by no group view, so it is excluded entirely.
+        # Layer 2 is referenced by no engine group info, so it is excluded entirely.
         tensors = [
             torch.randn(2, 32, 256, 8, 64, dtype=torch.float16) for _ in range(3)
         ]
         manager = _build_manager(
             tensors,
             num_blocks=32,
-            group_views=[LMCacheGroupView(0, (0, 1))],
+            engine_group_infos=[EngineGroupInfo(0, (0, 1))],
         )
         grouped = sorted(
             idx for group in manager.kernel_groups for idx in group.layer_indices
diff --git a/tests/v1/test_vllm_kv_cache_groups.py b/tests/v1/test_vllm_kv_cache_groups.py
index aec934ada9..d6e990c40a 100644
--- a/tests/v1/test_vllm_kv_cache_groups.py
+++ b/tests/v1/test_vllm_kv_cache_groups.py
@@ -7,10 +7,10 @@
 
 # First Party
 from lmcache.integration.vllm.kv_cache_groups import (
-    create_group_views_from_vllm,
+    create_engine_group_infos_from_vllm,
 )
 from lmcache.v1.multiprocess.group_view import (
-    expand_block_ids_to_views,
+    expand_engine_block_ids,
     get_engine_group_indices,
     num_engine_groups,
 )
@@ -32,7 +32,7 @@ def _same_shape_caches(names: list[str]) -> dict[str, torch.Tensor]:
 
 def test_conversion_defaults_to_single_group_without_config():
     """No vLLM KV cache groups -> all layers fall into a single engine group."""
-    spec = create_group_views_from_vllm(
+    spec = create_engine_group_infos_from_vllm(
         None, _same_shape_caches(["layer.0", "layer.1"])
     )
 
@@ -43,7 +43,7 @@ def test_conversion_defaults_to_single_group_without_config():
 
 def test_conversion_preserves_engine_group_layers():
     """Two engine groups with identical tensor shape stay separate by group."""
-    spec = create_group_views_from_vllm(
+    spec = create_engine_group_infos_from_vllm(
         MockKVCacheConfig(
             kv_cache_groups=[
                 MockKVCacheGroup(["layer.0", "layer.2"]),
@@ -62,7 +62,7 @@ def test_conversion_splits_by_lmcache_layer_identity():
     caches = _same_shape_caches(["layer.0", "layer.1", "layer.2", "layer.3"])
     # layer.4 has a different head count -> distinct transfer identity.
     caches["layer.4"] = torch.randn(2, 32, 16, 16, 64, dtype=torch.float16)
-    spec = create_group_views_from_vllm(
+    spec = create_engine_group_infos_from_vllm(
         MockKVCacheConfig(
             kv_cache_groups=[
                 MockKVCacheGroup(["layer.0", "layer.2", "layer.4"]),
@@ -74,7 +74,7 @@ def test_conversion_splits_by_lmcache_layer_identity():
 
     assert [group.engine_group_id for group in spec] == [0, 1, 0]
     assert [group.layer_indices for group in spec] == [(0, 2), (1, 3), (4,)]
-    assert expand_block_ids_to_views(spec, [[10], [20]]) == [
+    assert expand_engine_block_ids(spec, [[10], [20]]) == [
         [10],
         [20],
         [10],
diff --git a/tests/v1/test_vllm_mp_adapter.py b/tests/v1/test_vllm_mp_adapter.py
index 635e7ea09d..54f5e36e23 100644
--- a/tests/v1/test_vllm_mp_adapter.py
+++ b/tests/v1/test_vllm_mp_adapter.py
@@ -25,7 +25,7 @@
     LoadStoreOp,
     ParallelStrategy,
 )
-from lmcache.v1.multiprocess.group_view import LMCacheGroupView
+from lmcache.v1.multiprocess.group_view import EngineGroupInfo
 from lmcache.v1.multiprocess.protocol import RequestType
 
 
@@ -174,10 +174,10 @@ def test_submit_store_request_expands_block_ids_to_views(fake_adapter, monkeypat
     fake_tensor = MagicMock()
     fake_tensor.device.type = "cuda"
     adapter.kv_caches = {"layer.0": fake_tensor}
-    adapter.group_views = [
-        LMCacheGroupView(0, (0, 2)),
-        LMCacheGroupView(0, (4,)),
-        LMCacheGroupView(1, (1, 3)),
+    adapter.engine_group_infos = [
+        EngineGroupInfo(0, (0, 2)),
+        EngineGroupInfo(0, (4,)),
+        EngineGroupInfo(1, (1, 3)),
     ]
     transfer_ctx = MagicMock()
     fake_future = MagicMock()

From bf1a215ec9072c11c5fd390fb6d5c8ab19fb17cb Mon Sep 17 00:00:00 2001
From: Yihua Cheng <yihua98@uchicago.edu>
Date: Mon, 8 Jun 2026 19:10:36 -0700
Subject: [PATCH 08/57] [Refactor] Change the report_status to be
 per-kernel-group in LMCache (#3599)

Signed-off-by: ApostaC <yihua@tensormesh.ai>
---
 docs/design/cli/commands/describe.md          | 103 +++++++++++++-----
 docs/source/cli/describe.rst                  |  49 ++++++---
 docs/source/mp/http_api.rst                   |  28 +++--
 lmcache/cli/commands/describe.py              |  69 ++++++++----
 lmcache/v1/gpu_connector/utils.py             |  70 ++++++++++++
 lmcache/v1/multiprocess/gpu_context.py        |  97 ++++++++++-------
 tests/cli/test_describe.py                    |  43 ++++++--
 tests/v1/gpu_connector/test_concrete_shape.py |  68 ++++++++++++
 tests/v1/multiprocess/test_gpu_context.py     |  72 ++++++++----
 9 files changed, 451 insertions(+), 148 deletions(-)
 create mode 100644 tests/v1/gpu_connector/test_concrete_shape.py

diff --git a/docs/design/cli/commands/describe.md b/docs/design/cli/commands/describe.md
index 3aa2f729c7..0613a80620 100644
--- a/docs/design/cli/commands/describe.md
+++ b/docs/design/cli/commands/describe.md
@@ -33,15 +33,21 @@ Uptime:                                  2h 14m 32s
 ------ Model: meta-llama/Llama-3.1-70B-Instruct ---
 World size:                              4
 GPU IDs:                                 0, 1, 2, 3
-Attention backend:         vLLM non-MLA flash attention
-GPU KV shape:              NL x [2, NB, BS, NH, HS]
-GPU KV tensor shape:       80 x [2, 2048, 128, 8, 128]
 Num layers:                              80
-Block size:                              128
-Hidden dim size:                         1024
+Num blocks:                              2048
+Cache size per token (bytes):            327680
+--- Kernel group 0 (meta-llama/Llama-3.1-70B-Instruct) ---
+Kernel group index:                      0
+Engine group index:                      0
+Object group index:                      0
+Num layers:                              80
+Physical block size:                     128
+Compress ratio:                          1
 Dtype:                                   torch.float16
 MLA:                                     False
-Num blocks:                              2048
+Attention backend:         vLLM non-MLA flash attention
+GPU KV shape:              NL x [2, NB, BS, NH, HS]
+GPU KV tensor shape:       80 x [2, 2048, 128, 8, 128]
 ----------- L2: NixlStoreL2Adapter ------------
 Type:                          NixlStoreL2Adapter
 Health:                                  OK
@@ -67,15 +73,25 @@ programmatic access:
         "model": "meta-llama/Llama-3.1-70B-Instruct",
         "world_size": 4,
         "gpu_ids": "0, 1, 2, 3",
-        "attention_backend": "vLLM non-MLA flash attention",
-        "gpu_kv_shape": "NL x [2, NB, BS, NH, HS]",
-        "gpu_kv_concrete_shape": "80 x [2, 2048, 128, 8, 128]",
         "num_layers": 80,
-        "block_size": 128,
-        "hidden_dim_size": 1024,
+        "num_blocks": 2048,
+        "cache_size_per_token": 327680
+      }
+    ],
+    "kernel_groups": [
+      {
+        "model": "meta-llama/Llama-3.1-70B-Instruct",
+        "kernel_group_idx": 0,
+        "engine_group_idx": 0,
+        "object_group_idx": 0,
+        "num_layers": 80,
+        "physical_block_size": 128,
+        "compress_ratio": 1,
         "dtype": "torch.float16",
         "is_mla": false,
-        "num_blocks": 2048
+        "attention_backend": "vLLM non-MLA flash attention",
+        "gpu_kv_shape": "NL x [2, NB, BS, NH, HS]",
+        "gpu_kv_concrete_shape": "80 x [2, 2048, 128, 8, 128]"
       }
     ],
     "l2_adapters": [
@@ -92,8 +108,21 @@ programmatic access:
 ```
 
 Per-model sections are generated for each unique `(model_name, world_size)` pair
-registered with the engine. The section includes:
-
+registered with the engine. The model section carries the context-wide fields —
+`num_layers`, `num_blocks`, and `cache_size_per_token` — and is followed by one
+**kernel group** section per kernel group, since a hybrid model's groups can
+differ in geometry.
+
+Each kernel group section includes:
+
+- **Kernel / engine / object group index** — the group's identity:
+  `kernel_group_idx` enumerates the manager's kernel groups, `engine_group_idx`
+  is the paged-block address space (0 for non-hybrid), and `object_group_idx` is
+  the owning object group.
+- **Num layers** and **Physical block size** — the group's layer count and
+  `shape_desc.bs`.
+- **Compress ratio** — logical tokens per physical slot (1 for non-compressed).
+- **Dtype** and **MLA** — the group's torch dtype and MLA flag.
 - **Attention backend** — which attention implementation is active (e.g.,
   `vLLM non-MLA flash attention`, `vLLM MLA`, `SGLang MHA`), derived from the
   `GPUKVFormat` enum.
@@ -101,9 +130,8 @@ registered with the engine. The section includes:
   `GPUKVFormat` enum (NB=num_blocks, NL=num_layers, BS=block_size, NH=num_heads,
   HS=head_size, PBS=page_buffer_size). E.g., `NL x [2, NB, BS, NH, HS]`.
 - **GPU KV tensor shape** — the same layout with actual numeric values substituted
-  (e.g., `80 x [2, 2048, 128, 8, 128]`).
-- **Layout details** — num_layers, block_size, hidden_dim_size, dtype, MLA flag,
-  num_blocks.
+  from the group's `shape_desc` (e.g., `80 x [2, 2048, 128, 8, 128]`), so it is
+  group-accurate.
 
 L2 adapter sections are generated for each adapter in
 `storage_manager.l2_adapters`. Fields shown depend on the adapter type:
@@ -258,22 +286,41 @@ Mirror the same `start_time`, `zmq_endpoint`, and `http_endpoint` additions if
 
 **Files:** `lmcache/v1/gpu_connector/utils.py`, `lmcache/v1/multiprocess/gpu_context.py`, `lmcache/v1/multiprocess/server.py`
 
-Three new helper functions in `utils.py` (derived from `legible_print_gpu_kv_format()`):
-- `get_gpu_kv_shape_description(gpu_kv_format)` — symbolic shape (e.g., `List[num_layers] of [2, num_blocks, ...]`)
+Helper functions in `utils.py` (derived from `legible_print_gpu_kv_format()`):
+- `get_gpu_kv_shape_description(gpu_kv_format)` — symbolic shape (e.g., `NL x [2, NB, BS, NH, HS]`)
 - `get_attention_backend(gpu_kv_format)` — backend name (e.g., `vLLM non-MLA flash attention`)
-- `get_concrete_gpu_kv_shape(kv_caches, gpu_kv_format)` — shape with actual values (e.g., `List[80] of [2, 2048, 128, 8, 128]`)
+- `get_concrete_gpu_kv_shape(kv_caches, gpu_kv_format)` — whole-context shape with actual values
+- `get_concrete_gpu_kv_shape_from_shape_desc(shape_desc, gpu_kv_format)` — **group-accurate** shape with actual values, read from a single kernel group's `PageBufferShapeDesc` (used by `report_status`)
 
-`GPUCacheContext` exposes these as properties: `gpu_kv_format_name`, `gpu_kv_shape`, `concrete_gpu_kv_shape`, `attention_backend`.
-
-`report_status()` includes them in the per-GPU `kv_cache_layout` dict:
+`report_status()` is organised **per kernel group**: a small set of context-wide
+fields at the top level, plus a `kernel_groups` list where each entry is
+self-describing. The format-derived fields (`gpu_kv_format`, `gpu_kv_shape`,
+`attention_backend`, `is_mla`) and the group-accurate `gpu_kv_concrete_shape`
+live inside each group:
 
 ```python
 "kv_cache_layout": {
-    ...,
-    "gpu_kv_format": "NL_X_TWO_NB_BS_NH_HS",
-    "gpu_kv_shape": "NL x [2, NB, BS, NH, HS]",
-    "gpu_kv_concrete_shape": "80 x [2, 2048, 128, 8, 128]",
-    "attention_backend": "vLLM non-MLA flash attention",
+    "num_layers": 80,
+    "inference_engine_logical_block_size": 128,
+    "num_blocks": 2048,
+    "cache_size_per_token": 327680,
+    "kernel_groups": [
+        {
+            "kernel_group_idx": 0,
+            "engine_group_idx": 0,
+            "object_group_idx": 0,
+            "num_layers": 80,
+            "layer_indices": [0, 1, ...],
+            "physical_block_size": 128,
+            "compress_ratio": 1,
+            "dtype": "torch.float16",
+            "gpu_kv_concrete_shape": "80 x [2, 2048, 128, 8, 128]",
+            "is_mla": false,
+            "gpu_kv_format": "NL_X_TWO_NB_BS_NH_HS",
+            "gpu_kv_shape": "NL x [2, NB, BS, NH, HS]",
+            "attention_backend": "vLLM non-MLA flash attention",
+        },
+    ],
 }
 ```
 
diff --git a/docs/source/cli/describe.rst b/docs/source/cli/describe.rst
index 15b7e46e90..1b07751210 100644
--- a/docs/source/cli/describe.rst
+++ b/docs/source/cli/describe.rst
@@ -25,15 +25,21 @@ L2 adapters.
    Model:           meta-llama/Llama-3.1-70B-Instruct
    World size:                                      4
    GPU IDs:                                0, 1, 2, 3
-   Attention backend:    vLLM non-MLA flash attention
-   GPU KV shape:             NL x [2, NB, BS, NH, HS]
-   GPU KV tensor shape:   80 x [2, 2048, 128, 8, 128]
    Num layers:                                     80
-   Block size:                                    128
-   Hidden dim sizes:                             1024
+   Num blocks:                                   2048
+   Cache size per token (bytes):               327680
+   --- Kernel group 0 (meta-llama/Llama-3.1-70B-Instruct) ---
+   Kernel group index:                              0
+   Engine group index:                              0
+   Object group index:                              0
+   Num layers:                                     80
+   Physical block size:                           128
+   Compress ratio:                                  1
    Dtype:                               torch.float16
    MLA:                                         False
-   Num blocks:                                   2048
+   Attention backend:    vLLM non-MLA flash attention
+   GPU KV shape:             NL x [2, NB, BS, NH, HS]
+   GPU KV tensor shape:   80 x [2, 2048, 128, 8, 128]
    ------------- L2: NixlStoreL2Adapter -------------
    Type:                           NixlStoreL2Adapter
    Health:                                         OK
@@ -46,8 +52,9 @@ The output shows:
 
 - **Overview** — health status, engine type, chunk size.
 - **L1 storage** — capacity, usage, eviction policy, cached object count.
-- **Registered models** — per-model KV cache layout including the GPU KV
-  tensor shape (symbolic and concrete), attention backend, and layer details.
+- **Registered models** — per-model KV cache layout: a context-wide summary
+  followed by one kernel group section per kernel group, each with the GPU KV
+  tensor shape (symbolic and concrete), attention backend, and group geometry.
 - **L2 adapters** — type, health, backend, stored objects, and utilization.
 
 Options
@@ -74,8 +81,8 @@ Options
 JSON Output
 -----------
 
-Use ``--format json`` for machine-readable output. Models and L2 adapters
-are collected into lists for easy programmatic access:
+Use ``--format json`` for machine-readable output. Models, kernel groups, and
+L2 adapters are collected into lists for easy programmatic access:
 
 .. code-block:: bash
 
@@ -100,15 +107,25 @@ are collected into lists for easy programmatic access:
            "model": "meta-llama/Llama-3.1-70B-Instruct",
            "world_size": 4,
            "gpu_ids": "0, 1, 2, 3",
-           "attention_backend": "vLLM non-MLA flash attention",
-           "gpu_kv_shape": "NL x [2, NB, BS, NH, HS]",
-           "gpu_kv_concrete_shape": "80 x [2, 2048, 128, 8, 128]",
            "num_layers": 80,
-           "block_size": 128,
-           "hidden_dim_sizes": [1024],
+           "num_blocks": 2048,
+           "cache_size_per_token": 327680
+         }
+       ],
+       "kernel_groups": [
+         {
+           "model": "meta-llama/Llama-3.1-70B-Instruct",
+           "kernel_group_idx": 0,
+           "engine_group_idx": 0,
+           "object_group_idx": 0,
+           "num_layers": 80,
+           "physical_block_size": 128,
+           "compress_ratio": 1,
            "dtype": "torch.float16",
            "is_mla": false,
-           "num_blocks": 2048
+           "attention_backend": "vLLM non-MLA flash attention",
+           "gpu_kv_shape": "NL x [2, NB, BS, NH, HS]",
+           "gpu_kv_concrete_shape": "80 x [2, 2048, 128, 8, 128]"
          }
        ],
        "l2_adapters": [
diff --git a/docs/source/mp/http_api.rst b/docs/source/mp/http_api.rst
index 1ac69b9a17..06086b76de 100644
--- a/docs/source/mp/http_api.rst
+++ b/docs/source/mp/http_api.rst
@@ -237,16 +237,26 @@ prefetch jobs. Intended for operators and debugging, not for monitoring
           "world_size": 1,
           "kv_cache_layout": {
             "num_layers": 32,
-            "block_size": 16,
-            "hidden_dim_sizes": "...",
-            "dtype": "torch.bfloat16",
-            "is_mla": false,
+            "inference_engine_logical_block_size": 16,
             "num_blocks": 12345,
-            "gpu_kv_format": "...",
-            "gpu_kv_shape": "...",
-            "gpu_kv_concrete_shape": "...",
-            "attention_backend": "...",
-            "cache_size_per_token": 131072
+            "cache_size_per_token": 131072,
+            "kernel_groups": [
+              {
+                "kernel_group_idx": 0,
+                "engine_group_idx": 0,
+                "object_group_idx": 0,
+                "num_layers": 32,
+                "layer_indices": [0, 1, "..."],
+                "physical_block_size": 16,
+                "compress_ratio": 1,
+                "dtype": "torch.bfloat16",
+                "gpu_kv_concrete_shape": "...",
+                "is_mla": false,
+                "gpu_kv_format": "...",
+                "gpu_kv_shape": "...",
+                "attention_backend": "..."
+              }
+            ]
           }
         }
       },
diff --git a/lmcache/cli/commands/describe.py b/lmcache/cli/commands/describe.py
index ad98d4d8e7..d23350821b 100644
--- a/lmcache/cli/commands/describe.py
+++ b/lmcache/cli/commands/describe.py
@@ -171,11 +171,13 @@ def add_l1_storage(self) -> None:
         )
 
     def add_models(self) -> None:
-        """Per-model KV cache layout sections."""
+        """Per-model KV cache layout sections.
+
+        Each model gets one section with context-wide fields, followed by
+        one ``kernel_groups`` list entry per kernel group carrying that
+        group's identity and geometry.
+        """
         gpu_meta = self.data.get("cache_context_meta", {})
-        if not gpu_meta:
-            # CB-only deployments populate cb_gpu_context_meta instead.
-            gpu_meta = self.data.get("cb_gpu_context_meta", {})
         if not gpu_meta:
             return
 
@@ -202,33 +204,54 @@ def add_models(self) -> None:
             layout = info.get("layout")
             if not layout:
                 continue
-            sec.add(
-                "attention_backend",
-                "Attention backend",
-                layout.get("attention_backend"),
-            )
-            sec.add("gpu_kv_shape", "GPU KV shape", layout.get("gpu_kv_shape"))
-            sec.add(
-                "gpu_kv_concrete_shape",
-                "GPU KV tensor shape",
-                layout.get("gpu_kv_concrete_shape"),
-            )
-            # CB-only contexts ship a singular ``hidden_dim_size``; wrap to
-            # match the plural list-shape used by the regular path.
-            if "hidden_dim_sizes" not in layout and "hidden_dim_size" in layout:
-                layout = dict(layout, hidden_dim_sizes=[layout["hidden_dim_size"]])
             for _key, _label in (
                 ("num_layers", "Num layers"),
-                ("block_size", "Block size"),
-                ("hidden_dim_sizes", "Hidden dim sizes"),
-                ("dtype", "Dtype"),
-                ("is_mla", "MLA"),
                 ("num_blocks", "Num blocks"),
                 ("cache_size_per_token", "Cache size per token (bytes)"),
             ):
                 if _key in layout:
                     sec.add(_key, _label, layout[_key])
 
+            self._add_kernel_groups(idx, model_name, layout.get("kernel_groups", []))
+
+    def _add_kernel_groups(
+        self, model_idx: int, model_name: str, kernel_groups: list
+    ) -> None:
+        """Emit one ``kernel_groups`` list section per kernel group.
+
+        Args:
+            model_idx: Index of the owning model section (keeps section keys
+                unique across models).
+            model_name: Human-readable model name, shown in each group header.
+            kernel_groups: The model layout's ``kernel_groups`` list (each a
+                dict produced by ``GPUCacheContext.report_status``).
+        """
+        for group in kernel_groups:
+            kg_idx = group.get("kernel_group_idx")
+            section_key = f"model_{model_idx}_kg_{kg_idx}"
+            self.metrics.add_list_section(
+                "kernel_groups",
+                section_key,
+                f"Kernel group {kg_idx} ({model_name})",
+            )
+            sec = self.metrics[section_key]
+            sec.add("model", "Model", model_name)
+            for _key, _label in (
+                ("kernel_group_idx", "Kernel group index"),
+                ("engine_group_idx", "Engine group index"),
+                ("object_group_idx", "Object group index"),
+                ("num_layers", "Num layers"),
+                ("physical_block_size", "Physical block size"),
+                ("compress_ratio", "Compress ratio"),
+                ("dtype", "Dtype"),
+                ("is_mla", "MLA"),
+                ("attention_backend", "Attention backend"),
+                ("gpu_kv_shape", "GPU KV shape"),
+                ("gpu_kv_concrete_shape", "GPU KV tensor shape"),
+            ):
+                if _key in group:
+                    sec.add(_key, _label, group[_key])
+
     def add_l2_adapters(self) -> None:
         """L2 adapter sections."""
         l2_adapters = safe_get(self.data, "storage_manager", "l2_adapters") or []
diff --git a/lmcache/v1/gpu_connector/utils.py b/lmcache/v1/gpu_connector/utils.py
index 8fdf21387d..8e39bbf672 100644
--- a/lmcache/v1/gpu_connector/utils.py
+++ b/lmcache/v1/gpu_connector/utils.py
@@ -427,6 +427,76 @@ def get_concrete_gpu_kv_shape(
     return f"Unknown ({gpu_kv_format})"
 
 
+def get_concrete_gpu_kv_shape_from_shape_desc(
+    shape_desc: "lmc_ops.PageBufferShapeDesc",
+    gpu_kv_format: "lmc_ops.GPUKVFormat",
+) -> str:
+    """Return the concrete shape for a single kernel group's ``shape_desc``.
+
+    Like :func:`get_concrete_gpu_kv_shape`, but the numeric values are
+    read from a per-group :class:`PageBufferShapeDesc` instead of from
+    the whole ``kv_caches`` structure. This makes the result
+    *group-accurate*: ``shape_desc.nl`` is the number of layers in the
+    group (not the model total), so for hybrid models each kernel group
+    reports its own shape.
+
+    For example, instead of ``NL x [2, NB, BS, NH, HS]`` this returns
+    ``80 x [2, 2048, 128, 8, 128]``.
+
+    Args:
+        shape_desc: The kernel group's shape descriptor. Numeric values
+            are pulled from its ``nl``/``nb``/``bs``/``nh``/``hs`` fields;
+            the page-buffer-size (``PBS``) formats use ``nb * bs``.
+        gpu_kv_format: The GPU KV format that determines the symbolic
+            shape template.
+
+    Returns:
+        The shape string with numeric values substituted, or
+        ``"Unknown (<format>)"`` for an unrecognised format.
+    """
+    nl = shape_desc.nl
+    nb = shape_desc.nb
+    bs = shape_desc.bs
+    nh = shape_desc.nh
+    hs = shape_desc.hs
+    pbs = nb * bs
+
+    fmt = gpu_kv_format
+    F = lmc_ops.GPUKVFormat
+
+    if fmt == F.NB_NL_TWO_BS_NH_HS:
+        return f"[{nb}, {nl}, 2, {bs}, {nh}, {hs}]"
+
+    if fmt == F.NL_X_TWO_NB_BS_NH_HS:
+        return f"{nl} x [2, {nb}, {bs}, {nh}, {hs}]"
+
+    if fmt == F.NL_X_NB_TWO_BS_NH_HS:
+        return f"{nl} x [{nb}, 2, {bs}, {nh}, {hs}]"
+
+    if fmt == F.NL_X_NB_BS_HS:
+        return f"{nl} x [{nb}, {bs}, {hs}]"
+
+    if fmt == F.TWO_X_NL_X_NBBS_NH_HS:
+        return f"2 x {nl} x [{pbs}, {nh}, {hs}]"
+
+    if fmt == F.TWO_X_NL_X_NB_BS_NH_HS:
+        return f"2 x {nl} x [{nb}, {bs}, {nh}, {hs}]"
+
+    if fmt == F.NL_X_NBBS_ONE_HS:
+        return f"{nl} x [{pbs}, 1, {hs}]"
+
+    if fmt == F.NL_X_TWO_NB_NH_BS_HS:
+        return f"{nl} x [2, {nb}, {nh}, {bs}, {hs}]"
+
+    if fmt == F.NL_X_NB_TWO_NH_BS_HS:
+        return f"{nl} x [{nb}, 2, {nh}, {bs}, {hs}]"
+
+    if fmt == F.NB_NL_TWO_NH_BS_HS:
+        return f"[{nb}, {nl}, 2, {nh}, {bs}, {hs}]"
+
+    return f"Unknown ({gpu_kv_format})"
+
+
 def legible_print_gpu_kv_format(gpu_kv_format: "lmc_ops.GPUKVFormat"):
     """
     Print the GPU KV Format in a legible way
diff --git a/lmcache/v1/multiprocess/gpu_context.py b/lmcache/v1/multiprocess/gpu_context.py
index b3eef93296..ac20961721 100644
--- a/lmcache/v1/multiprocess/gpu_context.py
+++ b/lmcache/v1/multiprocess/gpu_context.py
@@ -26,7 +26,7 @@
 from lmcache.v1.gpu_connector.utils import (
     LayoutHints,
     get_attention_backend,
-    get_concrete_gpu_kv_shape,
+    get_concrete_gpu_kv_shape_from_shape_desc,
     get_device,
     get_dtype,
     get_gpu_kv_shape_description,
@@ -639,59 +639,74 @@ def cache_size_per_token(self) -> int:
     def report_status(self) -> dict:
         """Return this context's KV cache layout metadata for ``/status``.
 
-        Builds the ``kv_cache_layout`` sub-dict surfaced by the ``/status``
-        HTTP endpoint (see ``GPUTransferModule.report_status``) and consumed by
-        the ``lmcache`` CLI (``lmcache describe kvcache`` and
-        ``lmcache bench engine``). It describes only the KV cache geometry; the
-        owning module wraps it with ``model_name``/``world_size``, which this
-        context does not track.
-
         Returns:
-            A dict with one entry per documented layout field:
+            A dict with these top-level fields:
 
-            - ``num_layers`` (int)
+            - ``num_layers`` (int): total layers in the model.
             - ``inference_engine_logical_block_size`` (int)
-            - ``group_physical_block_sizes`` (list[int]): per-group
-              ``shape_desc.bs``
-            - ``group_compress_ratios`` (list[int]): per-group compress ratio
-            - ``hidden_dim_sizes`` (str): stringified per-group hidden-dim list
-            - ``dtype`` (str): stringified torch dtype
-            - ``is_mla`` (bool)
             - ``num_blocks`` (int)
-            - ``gpu_kv_format`` (str): GPU KV format enum name
-            - ``gpu_kv_shape`` (str): symbolic shape description
-            - ``gpu_kv_concrete_shape`` (str): shape with numeric values
-            - ``attention_backend`` (str)
-            - ``cache_size_per_token`` (int): bytes per logical token
-        """
-        # TODO(compat): the key names and value *formatting* below are a
-        # contract with the `/status` endpoint and the `lmcache` CLI
-        # (`lmcache/cli/commands/describe.py`, `bench/engine_bench/config.py`).
-        # Renaming a key breaks `lmcache describe kvcache`; dropping
-        # `cache_size_per_token` breaks `lmcache bench engine`. `hidden_dim_sizes`
-        # and `dtype` are stringified only for back-compat with those consumers
-        # and should become a real list / structured value once the CLI is
-        # updated to parse them.
+            - ``cache_size_per_token`` (int): bytes per logical token,
+              summed across groups.
+            - ``kernel_groups`` (list[dict]): one entry per kernel group,
+              each with:
+
+              - ``kernel_group_idx`` (int): index into ``manager.kernel_groups``.
+              - ``engine_group_idx`` (int): paged-block address space.
+              - ``object_group_idx`` (int): owning object group.
+              - ``num_layers`` (int): layers in this group.
+              - ``layer_indices`` (list[int]): the group's layer indices.
+              - ``physical_block_size`` (int): ``shape_desc.bs``.
+              - ``compress_ratio`` (int)
+              - ``dtype`` (str): stringified torch dtype.
+              - ``gpu_kv_concrete_shape`` (str): group-accurate numeric shape.
+              - ``is_mla`` (bool)
+              - ``gpu_kv_format`` (str): GPU KV format enum name.
+              - ``gpu_kv_shape`` (str): symbolic shape description.
+              - ``attention_backend`` (str)
+        """
         manager = self.kv_layer_groups_manager
         kernel_groups = manager.kernel_groups
+
+        # Reverse-map each kernel group to its owning object group.
+        kernel_group_to_object_group: dict[int, int] = {
+            kg_idx: og_idx
+            for og_idx, og in enumerate(manager.object_groups)
+            for kg_idx in og.kernel_group_indices
+        }
+
+        gpu_kv_format = self.gpu_kv_format_
+        group_reports: list[dict] = []
+        for kernel_group_idx, group in enumerate(kernel_groups):
+            group_reports.append(
+                {
+                    "kernel_group_idx": kernel_group_idx,
+                    "engine_group_idx": group.engine_group_idx,
+                    "object_group_idx": kernel_group_to_object_group.get(
+                        kernel_group_idx, 0
+                    ),
+                    "num_layers": group.num_layers,
+                    "layer_indices": list(group.layer_indices),
+                    "physical_block_size": group.shape_desc.bs,
+                    "compress_ratio": group.compress_ratio,
+                    "dtype": str(group.dtype),
+                    "gpu_kv_concrete_shape": get_concrete_gpu_kv_shape_from_shape_desc(
+                        group.shape_desc, gpu_kv_format
+                    ),
+                    "is_mla": is_mla(gpu_kv_format),
+                    "gpu_kv_format": gpu_kv_format.name,
+                    "gpu_kv_shape": get_gpu_kv_shape_description(gpu_kv_format),
+                    "attention_backend": get_attention_backend(gpu_kv_format),
+                }
+            )
+
         return {
             "num_layers": self.num_layers,
             "inference_engine_logical_block_size": (
                 manager.inference_engine_logical_block_size
             ),
-            "group_physical_block_sizes": [g.shape_desc.bs for g in kernel_groups],
-            "group_compress_ratios": [g.compress_ratio for g in kernel_groups],
-            "hidden_dim_sizes": str([g.hidden_dim_size for g in kernel_groups]),
-            "dtype": str(self.dtype),
-            "is_mla": self.is_mla,
             "num_blocks": self.num_blocks,
-            "gpu_kv_format": self.gpu_kv_format_.name,
-            "gpu_kv_shape": get_gpu_kv_shape_description(self.gpu_kv_format_),
-            "gpu_kv_concrete_shape": get_concrete_gpu_kv_shape(
-                self.kv_caches_, self.gpu_kv_format_
-            ),
-            "attention_backend": get_attention_backend(self.gpu_kv_format_),
             "cache_size_per_token": self.cache_size_per_token(),
+            "kernel_groups": group_reports,
         }
 
 
diff --git a/tests/cli/test_describe.py b/tests/cli/test_describe.py
index e493c86cd2..c99aea8458 100644
--- a/tests/cli/test_describe.py
+++ b/tests/cli/test_describe.py
@@ -35,12 +35,26 @@
             "world_size": 1,
             "kv_cache_layout": {
                 "num_layers": 32,
-                "block_size": 16,
-                "hidden_dim_sizes": 128,
-                "dtype": "torch.float16",
-                "is_mla": False,
+                "inference_engine_logical_block_size": 16,
                 "num_blocks": 2048,
                 "cache_size_per_token": 163840,
+                "kernel_groups": [
+                    {
+                        "kernel_group_idx": 0,
+                        "engine_group_idx": 0,
+                        "object_group_idx": 0,
+                        "num_layers": 32,
+                        "layer_indices": list(range(32)),
+                        "physical_block_size": 16,
+                        "compress_ratio": 1,
+                        "dtype": "torch.float16",
+                        "gpu_kv_concrete_shape": "32 x [2, 2048, 16, 8, 128]",
+                        "is_mla": False,
+                        "gpu_kv_format": "NL_X_TWO_NB_BS_NH_HS",
+                        "gpu_kv_shape": "NL x [2, NB, BS, NH, HS]",
+                        "attention_backend": "vLLM non-MLA flash attention",
+                    },
+                ],
             },
         },
     },
@@ -180,11 +194,24 @@ class FakeArgs:
         assert model["world_size"] == 1
         assert model["gpu_ids"] == "0"
         assert model["num_layers"] == 32
-        assert model["block_size"] == 16
-        assert model["hidden_dim_sizes"] == 128
-        assert model["dtype"] == "torch.float16"
-        assert model["is_mla"] is False
         assert model["num_blocks"] == 2048
+        assert model["cache_size_per_token"] == 163840
+
+        # Per-kernel-group section (list)
+        assert "kernel_groups" in m
+        kg = m["kernel_groups"][0]
+        assert kg["model"] == "llama"
+        assert kg["kernel_group_idx"] == 0
+        assert kg["engine_group_idx"] == 0
+        assert kg["object_group_idx"] == 0
+        assert kg["num_layers"] == 32
+        assert kg["physical_block_size"] == 16
+        assert kg["compress_ratio"] == 1
+        assert kg["dtype"] == "torch.float16"
+        assert kg["is_mla"] is False
+        assert kg["attention_backend"] == "vLLM non-MLA flash attention"
+        assert kg["gpu_kv_shape"] == "NL x [2, NB, BS, NH, HS]"
+        assert kg["gpu_kv_concrete_shape"] == "32 x [2, 2048, 16, 8, 128]"
 
     def test_unhealthy(self):
         """Verify health shows UNHEALTHY when is_healthy is False."""
diff --git a/tests/v1/gpu_connector/test_concrete_shape.py b/tests/v1/gpu_connector/test_concrete_shape.py
new file mode 100644
index 0000000000..2abc5fc7d8
--- /dev/null
+++ b/tests/v1/gpu_connector/test_concrete_shape.py
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Unit tests for :func:`get_concrete_gpu_kv_shape_from_shape_desc`.
+
+These run without a CUDA build: ``lmcache.c_ops`` resolves to the
+pure-Python fallback, which provides both ``PageBufferShapeDesc`` and
+``GPUKVFormat``.
+"""
+
+# First Party
+from lmcache.v1.gpu_connector.utils import (
+    get_concrete_gpu_kv_shape_from_shape_desc,
+)
+import lmcache.c_ops as lmc_ops
+
+
+def _make_shape_desc(
+    *, kv_size: int, nl: int, nb: int, bs: int, nh: int, hs: int
+) -> "lmc_ops.PageBufferShapeDesc":
+    """Build a ``PageBufferShapeDesc`` with the given geometry."""
+    sd = lmc_ops.PageBufferShapeDesc()
+    sd.kv_size = kv_size
+    sd.nl = nl
+    sd.nb = nb
+    sd.bs = bs
+    sd.nh = nh
+    sd.hs = hs
+    sd.element_size = 2
+    sd.block_stride_elems = 0
+    return sd
+
+
+def test_concrete_shape_vllm_flash_attn():
+    sd = _make_shape_desc(kv_size=2, nl=32, nb=2048, bs=16, nh=8, hs=128)
+    out = get_concrete_gpu_kv_shape_from_shape_desc(
+        sd, lmc_ops.GPUKVFormat.NL_X_TWO_NB_BS_NH_HS
+    )
+    assert out == "32 x [2, 2048, 16, 8, 128]"
+
+
+def test_concrete_shape_vllm_mla():
+    sd = _make_shape_desc(kv_size=1, nl=61, nb=1024, bs=64, nh=1, hs=512)
+    out = get_concrete_gpu_kv_shape_from_shape_desc(
+        sd, lmc_ops.GPUKVFormat.NL_X_NB_BS_HS
+    )
+    assert out == "61 x [1024, 64, 512]"
+
+
+def test_concrete_shape_uses_pbs_for_folded_formats():
+    # NL_X_NBBS_ONE_HS folds num_blocks * block_size into one PBS dim.
+    sd = _make_shape_desc(kv_size=1, nl=2, nb=32, bs=16, nh=1, hs=128)
+    out = get_concrete_gpu_kv_shape_from_shape_desc(
+        sd, lmc_ops.GPUKVFormat.NL_X_NBBS_ONE_HS
+    )
+    assert out == "2 x [512, 1, 128]"  # 512 == 32 * 16
+
+
+def test_concrete_shape_is_group_accurate():
+    # Two groups with different layer counts produce different shapes for
+    # the same format — the whole-context helper could not do this.
+    fmt = lmc_ops.GPUKVFormat.NL_X_TWO_NB_BS_NH_HS
+    g0 = _make_shape_desc(kv_size=2, nl=4, nb=128, bs=16, nh=8, hs=64)
+    g1 = _make_shape_desc(kv_size=2, nl=2, nb=128, bs=16, nh=16, hs=64)
+    assert get_concrete_gpu_kv_shape_from_shape_desc(g0, fmt) == (
+        "4 x [2, 128, 16, 8, 64]"
+    )
+    assert get_concrete_gpu_kv_shape_from_shape_desc(g1, fmt) == (
+        "2 x [2, 128, 16, 16, 64]"
+    )
diff --git a/tests/v1/multiprocess/test_gpu_context.py b/tests/v1/multiprocess/test_gpu_context.py
index 535cdf4ef3..75790bed51 100644
--- a/tests/v1/multiprocess/test_gpu_context.py
+++ b/tests/v1/multiprocess/test_gpu_context.py
@@ -474,41 +474,67 @@ def test_calculate_num_blocks_matches_manager(self) -> None:
 
 
 class TestGPUCacheContextReportStatus:
+    _TOP_LEVEL_KEYS = {
+        "num_layers",
+        "inference_engine_logical_block_size",
+        "num_blocks",
+        "cache_size_per_token",
+        "kernel_groups",
+    }
+    _GROUP_KEYS = {
+        "kernel_group_idx",
+        "engine_group_idx",
+        "object_group_idx",
+        "num_layers",
+        "layer_indices",
+        "physical_block_size",
+        "compress_ratio",
+        "dtype",
+        "gpu_kv_concrete_shape",
+        "is_mla",
+        "gpu_kv_format",
+        "gpu_kv_shape",
+        "attention_backend",
+    }
+
     def test_report_status_fields(self) -> None:
         ctx = _make_context(_SINGLE_GROUP)
         status = ctx.report_status()
 
-        expected_keys = {
-            "num_layers",
-            "inference_engine_logical_block_size",
-            "group_physical_block_sizes",
-            "group_compress_ratios",
-            "hidden_dim_sizes",
-            "dtype",
-            "is_mla",
-            "num_blocks",
-            "gpu_kv_format",
-            "gpu_kv_shape",
-            "gpu_kv_concrete_shape",
-            "attention_backend",
-            "cache_size_per_token",
-        }
-        assert set(status.keys()) == expected_keys
-
+        assert set(status.keys()) == self._TOP_LEVEL_KEYS
         assert status["num_layers"] == 4
-        assert status["is_mla"] is False
-        assert status["group_compress_ratios"] == [1]
-        assert status["gpu_kv_format"] == "NL_X_TWO_NB_BS_NH_HS"
-        assert status["dtype"] == str(ctx.dtype)
         assert status["cache_size_per_token"] == ctx.cache_size_per_token()
 
+        assert len(status["kernel_groups"]) == 1
+        group = status["kernel_groups"][0]
+        assert set(group.keys()) == self._GROUP_KEYS
+        assert group["kernel_group_idx"] == 0
+        assert group["num_layers"] == 4
+        assert group["layer_indices"] == [0, 1, 2, 3]
+        assert group["is_mla"] is False
+        assert group["compress_ratio"] == 1
+        assert group["gpu_kv_format"] == "NL_X_TWO_NB_BS_NH_HS"
+        assert group["dtype"] == str(ctx.dtype)
+
     def test_report_status_multi_group(self) -> None:
         ctx = _make_context(_MULTI_GROUP)
         manager = ctx.kv_layer_groups_manager
         status = ctx.report_status()
         assert status["num_layers"] == 6
-        assert len(status["group_physical_block_sizes"]) == manager.num_kernel_groups
-        assert len(status["group_compress_ratios"]) == manager.num_kernel_groups
+        assert len(status["kernel_groups"]) == manager.num_kernel_groups
+
+        # Group reports enumerate in order and stay self-consistent with the
+        # manager's kernel groups.
+        for kg_idx, (group, kernel_group) in enumerate(
+            zip(status["kernel_groups"], manager.kernel_groups, strict=False)
+        ):
+            assert set(group.keys()) == self._GROUP_KEYS
+            assert group["kernel_group_idx"] == kg_idx
+            assert group["engine_group_idx"] == kernel_group.engine_group_idx
+            assert group["num_layers"] == kernel_group.num_layers
+            assert group["physical_block_size"] == kernel_group.shape_desc.bs
+            assert group["compress_ratio"] == kernel_group.compress_ratio
+            assert 0 <= group["object_group_idx"] < manager.num_object_groups
 
 
 if __name__ == "__main__":

From cb193c741be965178f8228ae6ce85d72ce7c8c44 Mon Sep 17 00:00:00 2001
From: sonimwang <17816198144@163.com>
Date: Tue, 9 Jun 2026 10:43:34 +0800
Subject: [PATCH 09/57] fix(zh_CN): correct machine translation errors in
 documentation (#3592)

Signed-off-by: sonimwang <17816198144@163.com>
---
 docs/source/locale/zh_CN/LC_MESSAGES/cli/index.po  |  4 ++--
 .../LC_MESSAGES/developer_guide/contributing.po    |  8 ++++----
 docs/source/locale/zh_CN/LC_MESSAGES/index.po      |  2 +-
 .../zh_CN/LC_MESSAGES/kv_cache_management/index.po |  4 ++--
 docs/source/locale/zh_CN/LC_MESSAGES/mp/index.po   |  4 ++--
 .../locale/zh_CN/LC_MESSAGES/recipes/index.po      | 14 +++++++-------
 .../locale/zh_CN/LC_MESSAGES/recipes/minimax_m2.po |  2 +-
 7 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/cli/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/cli/index.po
index b189725888..1bb99fa3d3 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/cli/index.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/cli/index.po
@@ -77,7 +77,7 @@ msgstr "``lmcache-cli``"
 
 #: ../../source/cli/index.rst:34
 msgid "``pip install lmcache-cli``"
-msgstr "``pip install kvcache``"
+msgstr "``pip install lmcache-cli``"
 
 #: ../../source/cli/index.rst:35
 msgid ""
@@ -147,7 +147,7 @@ msgstr "对推理引擎（``engine``）、LMCache MP 服务器（``server``）
 
 #: ../../source/cli/index.rst:64
 msgid ":doc:`kvcache`"
-msgstr ":kvcache:"
+msgstr ":doc:`kvcache`"
 
 #: ../../source/cli/index.rst:65
 msgid "Manage KV cache state (e.g. clear L1 cache) on a running server."
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contributing.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contributing.po
index 8dff10e725..366ba73c1c 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contributing.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contributing.po
@@ -28,7 +28,7 @@ msgid ""
 "Thank you for your interest in contributing to LMCache! We welcome and "
 "accept all kinds of contributions, no matter how small or large. There "
 "are several ways you can contribute to the project:"
-msgstr "感谢您对为 LMCache 贡献的兴趣！我们欢迎并接受各种形式的贡献，无论大小。您可以通过几种方式为项目做出贡献："
+msgstr "感谢您有兴趣为 LMCache 做出贡献！我们欢迎并接受各种形式的贡献，无论大小。您可以通过以下几种方式为项目做出贡献："
 
 #: ../../source/developer_guide/contributing.rst:6
 msgid "Identify and report any issues or bugs"
@@ -51,7 +51,7 @@ msgid ""
 "A comprehensive list of good first issues can be found in the issue "
 "`[Onboarding][Q4] Welcoming contributors with good first issues! "
 "<https://github.com/LMCache/LMCache/issues/1882>`_."
-msgstr "可以在问题 `[Onboarding][Q4] 欢迎贡献者的好第一问题！ <https://github.com/LMCache/LMCache/issues/1882>`_ 中找到一个全面的好第一问题列表。"
+msgstr "可以在 Issue `[Onboarding][Q4] Welcoming contributors with good first issues! <https://github.com/LMCache/LMCache/issues/1882>`_ 中找到完整的适合新手的 Issue 列表。"
 
 #: ../../source/developer_guide/contributing.rst:13
 msgid ""
@@ -103,7 +103,7 @@ msgid ""
 "and there is always a need for more test coverage. If you see something "
 "that you think should be fixed, take ownership! Here is how you get "
 "started."
-msgstr "对开源项目的帮助总是受欢迎的，总有一些可以改进的地方。例如，文档（就像您现在正在阅读的文本）总是可以改进，代码总是可以更清晰，变量或函数总是可以重命名或添加注释，并且总是需要更多的测试覆盖率。如果您看到认为应该修复的内容，请主动承担责任！以下是您如何开始的指南。"
+msgstr "对开源项目的帮助总是受欢迎的，总有一些可以改进的地方。例如，文档（就像您现在正在阅读的文本）总是可以改进，代码总是可以更清晰，变量或函数总是可以重命名或添加注释，并且总是需要更多的测试覆盖率。如果您看到认为应该修复的内容，请主动承担责任！以下是入门指南。"
 
 #: ../../source/developer_guide/contributing.rst:33
 msgid "How Can I Contribute?"
@@ -487,7 +487,7 @@ msgstr "在 http://localhost:8000 本地服务文档页面： :code:`python -m h
 
 #: ../../source/developer_guide/contributing.rst:201
 msgid "Thank You"
-msgstr "谢谢你"
+msgstr "感谢"
 
 #: ../../source/developer_guide/contributing.rst:203
 msgid ""
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/index.po
index d896b3313d..d83dc9acf8 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/index.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/index.po
@@ -29,7 +29,7 @@ msgstr "入门指南"
 
 #: ../../source/index.rst:86
 msgid "Recipes"
-msgstr "食谱"
+msgstr "使用指南"
 
 #: ../../source/index.rst:94
 msgid "KV Cache offloading and sharing"
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/kv_cache_management/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/kv_cache_management/index.po
index 63d3d35f23..afd0ff958b 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/kv_cache_management/index.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/kv_cache_management/index.po
@@ -120,7 +120,7 @@ msgstr ":ref:`压缩 <compress>`: 压缩 KV Cache。"
 
 #: ../../source/kv_cache_management/index.rst:41
 msgid ":ref:`Health <health>`: Check the health status of cache workers."
-msgstr "`:ref:`Health <health>`: 检查缓存工作线程的健康状态。`"
+msgstr ":ref:`Health <health>`: 检查缓存工作线程的健康状态。"
 
 #: ../../source/kv_cache_management/index.rst:42
 msgid ":ref:`Lookup <lookup>`: Lookup the KV cache for a given list of tokens."
@@ -138,7 +138,7 @@ msgstr ":ref:`Pin <pin>`: 持久化 KV Cache 以防止其被逐出。"
 msgid ""
 ":ref:`CheckFinish <check_finish>`: Check whether a (non-blocking) control"
 " event has finished or not."
-msgstr "`:ref:`CheckFinish <check_finish>`: 检查一个（非阻塞）控制事件是否已经完成。`"
+msgstr ":ref:`CheckFinish <check_finish>`: 检查一个（非阻塞）控制事件是否已经完成。"
 
 #: ../../source/kv_cache_management/index.rst:46
 msgid ":ref:`QueryWorkerInfo <query_worker_info>`: Query the worker info."
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/mp/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/mp/index.po
index 31459db2e0..63182bf8d1 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/mp/index.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/mp/index.po
@@ -37,7 +37,7 @@ msgstr "LMCache 多进程 (MP) 模式将 LMCache 作为一个 **独立服务** 
 
 #: ../../source/mp/index.rst:10
 msgid "Key Benefits"
-msgstr "关键好处"
+msgstr "主要优势"
 
 #: ../../source/mp/index.rst:12
 msgid ""
@@ -126,7 +126,7 @@ msgstr "``python3 -m lmcache.v1.multiprocess.server``"
 msgid ""
 "(Legacy) ZMQ-only server using MPCacheEngine (no HTTP endpoints). Prefer "
 "``lmcache server``."
-msgstr "(遗留) 仅使用 MPCacheEngine 的 ZMQ 服务器（没有 HTTP 端点）。请使用 ``lmcache server``。"
+msgstr "（遗留）仅使用 MPCacheEngine 的 ZMQ 服务器（没有 HTTP 端点）。请使用 ``lmcache server``。"
 
 #: ../../source/mp/index.rst:51
 msgid "``python3 -m lmcache.v1.multiprocess.blend_server_v2``"
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/recipes/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/recipes/index.po
index 87474a844c..ced9f6bde5 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/recipes/index.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/recipes/index.po
@@ -21,7 +21,7 @@ msgstr ""
 
 #: ../../source/recipes/index.rst:4
 msgid "Recipes"
-msgstr "食谱"
+msgstr "使用指南"
 
 #: ../../source/recipes/index.rst:6
 msgid ""
@@ -34,15 +34,15 @@ msgstr "本节列出了经过 LMCache 端到端验证的模型架构，每个架
 msgid ""
 "Engine-side documentation (how to serve the model itself) lives with the "
 "serving engine. Recipe pages link out rather than duplicate."
-msgstr "引擎端文档（如何服务模型本身）与服务引擎一起存在。食谱页面链接而不是重复。"
+msgstr "引擎端文档（如何服务模型本身）随服务引擎一起维护。使用指南页面提供外部链接，不重复已有内容。"
 
 #: ../../source/recipes/index.rst:14
 msgid "Recipe page contents"
-msgstr "食谱页面内容"
+msgstr "使用指南页面内容"
 
 #: ../../source/recipes/index.rst:16
 msgid "Each recipe page is intentionally minimal:"
-msgstr "每个食谱页面都故意保持简约："
+msgstr "每个使用指南页面都故意保持简约："
 
 #: ../../source/recipes/index.rst:18
 msgid "**Validated models** -- exact HF repo IDs that have been tested."
@@ -81,7 +81,7 @@ msgid ""
 msgstr ""
 "有关通用 LMCache + 引擎连接（端口、远程主机、进程内模式、发送第一个请求），请参阅 "
 ":doc:`../getting_started/quickstart` 和 "
-":doc:`../mp/quickstart`。食谱假设这些页面是先决条件。"
+":doc:`../mp/quickstart`。使用指南假设这些页面是先决条件。"
 
 #: ../../source/recipes/index.rst:33
 msgid "Supported architectures"
@@ -109,7 +109,7 @@ msgstr "TRT-LLM"
 
 #: ../../source/recipes/index.rst:44
 msgid "Recipe"
-msgstr "食谱"
+msgstr "使用指南"
 
 #: ../../source/recipes/index.rst:46
 msgid "``MiniMaxM2ForCausalLM``"
@@ -231,7 +231,7 @@ msgstr "图例：``✓`` 已验证，``—`` 未验证。"
 
 #: ../../source/recipes/index.rst:105
 msgid "Contributing a recipe"
-msgstr "贡献一个食谱"
+msgstr "贡献一个使用指南"
 
 #: ../../source/recipes/index.rst:107
 msgid "To add a new architecture:"
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/recipes/minimax_m2.po b/docs/source/locale/zh_CN/LC_MESSAGES/recipes/minimax_m2.po
index 5d7ad47c9d..b90f18d9fd 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/recipes/minimax_m2.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/recipes/minimax_m2.po
@@ -98,7 +98,7 @@ msgid ""
 "<https://docs.sglang.io/cookbook/autoregressive/MiniMax/MiniMax-M2>`_, "
 "`MiniMax M2.5/M2.1/M2 usage guide "
 "<https://docs.sglang.io/docs/basic_usage/minimax_m2>`_."
-msgstr "**引擎文档：** `MiniMax-M2 SGLang 食谱 <https://docs.sglang.io/cookbook/autoregressive/MiniMax/MiniMax-M2>`_，`MiniMax M2.5/M2.1/M2 使用指南 <https://docs.sglang.io/docs/basic_usage/minimax_m2>`_。"
+msgstr "**引擎文档：** `MiniMax-M2 SGLang 实战指南 <https://docs.sglang.io/cookbook/autoregressive/MiniMax/MiniMax-M2>`_，`MiniMax M2.5/M2.1/M2 使用指南 <https://docs.sglang.io/docs/basic_usage/minimax_m2>`_。"
 
 #: ../../source/recipes/minimax_m2.rst:93
 msgid "**Status:** Not validated with LMCache."

From ae328a66b0d9bc2c5e5f09c1a686e37766a03199 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 9 Jun 2026 11:53:54 +0800
Subject: [PATCH 10/57] [CI] Improve CI stability: gemma-4 test & serde test
 (#3556)

Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
---
 .buildkite/k3_tests/multiprocess/pipeline.yml | 58 +++++-------
 .../multiprocess/scripts/run-hma-lm-eval.sh   | 92 ++++++++-----------
 .../multiprocess/scripts/run-single-test.sh   |  9 +-
 docs/source/recipes/gemma4.rst                | 17 ++--
 docs/source/recipes/index.rst                 |  7 ++
 tests/v1/distributed/serde/test_serde_e2e.py  | 45 ++++++---
 6 files changed, 110 insertions(+), 118 deletions(-)

diff --git a/.buildkite/k3_tests/multiprocess/pipeline.yml b/.buildkite/k3_tests/multiprocess/pipeline.yml
index b9fa979b86..fa47e37108 100644
--- a/.buildkite/k3_tests/multiprocess/pipeline.yml
+++ b/.buildkite/k3_tests/multiprocess/pipeline.yml
@@ -26,40 +26,6 @@ steps:
                   - { name: hf-cache, hostPath: { path: /data/huggingface, type: DirectoryOrCreate } }
         artifact_paths: ["*.log"]
 
-      # HMA (hybrid memory allocator) correctness check on google/gemma-4-31B-it.
-      # It interleaves sliding + full attention whose full layers use a larger
-      # head_dim (512 vs 256), so vLLM gives its KV cache groups different block
-      # sizes -- exercising LMCache's per-group block-size handling. Runs gsm8k,
-      # resets vLLM's local prefix cache (LMCache preserved), reruns, and asserts
-      # the scores match (run1 == run2 == no-LMCache baseline). Needs 2 GPUs
-      # (LMCache+vLLM + baseline). It is public (no HF_TOKEN), forces TRITON_ATTN
-      # (so ATTENTION_BACKEND=auto and a non-zero SCORE_TOLERANCE, since
-      # TRITON_ATTN is not bit-exact under batch invariance), and its ~63GB of
-      # weights need a higher GPU_MEMORY_UTILIZATION than the 0.5 default.
-      - label: ":compression: hma_lm_eval_gemma4"
-        command: .buildkite/k3_tests/multiprocess/run.sh hma_lm_eval_gemma4
-        timeout_in_minutes: 60
-        env:
-          MODEL: "google/gemma-4-31B-it"
-          SCORE_TOLERANCE: "0.05"
-          ATTENTION_BACKEND: "auto"
-          GPU_MEMORY_UTILIZATION: "0.85"
-          # Skip CUDA-graph capture so the large model doesn't time out at launch
-          # (safe here: this test uses a tolerance, not the bit-exact check).
-          ENFORCE_EAGER: "1"
-          # 31B weights are large; allow longer for download + load before the
-          # readiness probe gives up (other models keep the 300s default).
-          MAX_WAIT_SECONDS: "400"
-          # LIMIT = number of gsm8k samples. 31B's large per-token KV makes the
-          # full 200-sample working set overflow the CPU pool and thrash, so run
-          # 2 misses LMCache; cap the samples and enlarge the pool to keep run 2
-          # cache-served. CPU_BUFFER_SIZE (GB) is bounded by node RAM.
-          LIMIT: "100"
-          CPU_BUFFER_SIZE: "200"
-        agents: { queue: "k8s" }
-        plugins: [{ kubernetes: { podSpec: *pod-2gpu } }]
-        artifact_paths: ["*.log"]
-
       - label: ":compression: long_doc_qa"
         command: .buildkite/k3_tests/multiprocess/run.sh long_doc_qa
         timeout_in_minutes: 30
@@ -99,6 +65,30 @@ steps:
                 volumes: *vols
         artifact_paths: ["*.log"]
 
+      # HMA correctness check on google/gemma-4-31B-it (a hybrid model whose KV
+      # cache groups get different block sizes). Runs gsm8k, resets vLLM's prefix
+      # cache (LMCache preserved), reruns served by LMCache, and asserts the two
+      # runs' scores match. Single GPU, no baseline.
+      - label: ":compression: hma_lm_eval_gemma4"
+        command: .buildkite/k3_tests/multiprocess/run.sh hma_lm_eval_gemma4
+        timeout_in_minutes: 60
+        env:
+          MODEL: "google/gemma-4-31B-it"
+          # Require an exact score match between the two runs.
+          SCORE_TOLERANCE: "0"
+          ATTENTION_BACKEND: "auto"
+          GPU_MEMORY_UTILIZATION: "0.85"
+          # 31B load + CUDA-graph capture is slow; raise the readiness timeout
+          # above the 300s default.
+          MAX_WAIT_SECONDS: "600"
+          # Cap samples and enlarge the CPU pool so the retrieve run stays
+          # cache-served (31B's per-token KV is large).
+          LIMIT: "100"
+          CPU_BUFFER_SIZE: "200"
+        agents: { queue: "k8s" }
+        plugins: [{ kubernetes: { podSpec: *pod-1gpu } }]
+        artifact_paths: ["*.log"]
+
       - label: ":compression: fault_tolerance"
         command: .buildkite/k3_tests/multiprocess/run.sh fault_tolerance
         timeout_in_minutes: 30
diff --git a/.buildkite/k3_tests/multiprocess/scripts/run-hma-lm-eval.sh b/.buildkite/k3_tests/multiprocess/scripts/run-hma-lm-eval.sh
index f650a0a7db..5d2f33e9c5 100755
--- a/.buildkite/k3_tests/multiprocess/scripts/run-hma-lm-eval.sh
+++ b/.buildkite/k3_tests/multiprocess/scripts/run-hma-lm-eval.sh
@@ -11,17 +11,16 @@
 #     Qwen3.5/Qwen3-Next, whose state caches LMCache cannot yet transfer).
 #   - Public, so no HF_TOKEN is required.
 #
-# Flow:
-#   1. Run lm_eval (gsm8k) against vLLM+LMCache       -> populates LMCache (STORE).
+# Flow (single GPU, no baseline server):
+#   1. vLLM run: lm_eval (gsm8k) against vLLM+LMCache, populating LMCache.
 #   2. Reset vLLM's *local* prefix cache (APC) only, leaving LMCache intact, via
 #      the dev-mode endpoint POST /reset_prefix_cache (reset_external defaults to
 #      false, so the LMCache-managed cache is preserved).
-#   3. Re-run lm_eval                                  -> vLLM APC misses, so the
-#      prefix KV is served by LMCache (RETRIEVE), exercising the HMA retrieve path.
-#   4. Assert the three gsm8k scores agree within SCORE_TOLERANCE (run 1 store ==
-#      run 2 retrieve == no-LMCache baseline); a broken retrieve corrupts the KV
-#      and the score diverges.
-#   5. Assert LMCache actually served retrieves during run 2 (non-vacuous).
+#   3. LMCache retrieve run: re-run lm_eval; vLLM's APC misses, so the prefix KV
+#      is served by LMCache.
+#   4. Assert the two runs' gsm8k scores match -- a broken LMCache would skew the
+#      retrieved KV and make them diverge.
+#   5. Assert LMCache actually served retrieves in the retrieve run (non-vacuous).
 #
 # The reset endpoint requires VLLM_SERVER_DEV_MODE=1 (set by launch-processes.sh).
 set -e
@@ -34,40 +33,35 @@ source "${REPO_ROOT}/.buildkite/k3_tests/common_scripts/helpers.sh"
 
 # Configuration
 VLLM_PORT="${VLLM_PORT:-8000}"
-VLLM_BASELINE_PORT="${VLLM_BASELINE_PORT:-9000}"
 MODEL="${MODEL:-google/gemma-4-31B-it}"
 NUM_CONCURRENT="${NUM_CONCURRENT:-50}"
 # 31B has a large per-token KV footprint; cap the sample count so the working
-# set fits the CPU pool (a too-large set thrashes and run 2 misses LMCache).
+# set fits the CPU pool (a too-large set thrashes and the retrieve run misses).
 LIMIT="${LIMIT:-100}"
-# Max allowed absolute difference in the gsm8k exact_match score across runs.
-# gemma-4 forces the Triton backend, which is not bit-exact under vLLM's
-# batch-invariant mode, so a correct retrieve can differ from a fresh compute by
-# a small margin; the default allows a small tolerance instead of an exact match.
-SCORE_TOLERANCE="${SCORE_TOLERANCE:-0.05}"
-# Seconds to wait after run 1 so async LMCache stores drain before run 2.
+# Max abs difference allowed between the two runs' gsm8k scores; 0 requires an
+# exact match.
+SCORE_TOLERANCE="${SCORE_TOLERANCE:-0}"
+# Seconds to let async LMCache stores drain before the retrieve run.
 STORE_DRAIN_SECONDS="${STORE_DRAIN_SECONDS:-20}"
 BUILD_ID="${BUILD_ID:-local_$$}"
 RESULTS_DIR="${RESULTS_DIR:-/tmp/lmcache_ci_results_${BUILD_ID}}"
-# LMCache MP server log, scanned to confirm run 2 was served by LMCache retrieves.
+# LMCache MP server log, scanned to confirm the retrieve run hit LMCache.
 LMCACHE_LOG="${LMCACHE_LOG:-/tmp/build_${BUILD_ID}_lmcache.log}"
 
 HMA_DIR="$RESULTS_DIR/hma_lm_eval"
-RUN1_DIR="$HMA_DIR/run1_store"
-RUN2_DIR="$HMA_DIR/run2_retrieve"
-BASELINE_DIR="$HMA_DIR/baseline"
+VLLM_RUN_DIR="$HMA_DIR/vllm_run"
+RETRIEVE_RUN_DIR="$HMA_DIR/retrieve_run"
 
 echo "=== HMA lm_eval correctness test ==="
 echo "Model: $MODEL"
 echo "vLLM (LMCache) port: $VLLM_PORT"
-echo "vLLM baseline port: $VLLM_BASELINE_PORT"
 echo "Concurrent requests: $NUM_CONCURRENT"
 echo "Limit: $LIMIT"
 echo "Score tolerance: $SCORE_TOLERANCE"
 echo "Results dir: $HMA_DIR"
 echo ""
 
-mkdir -p "$RUN1_DIR" "$RUN2_DIR" "$BASELINE_DIR"
+mkdir -p "$VLLM_RUN_DIR" "$RETRIEVE_RUN_DIR"
 
 # Run one lm_eval gsm8k pass against a vLLM OpenAI-compatible server.
 #
@@ -147,8 +141,8 @@ count_retrieves() {
     grep -c "Retrieved" "$LMCACHE_LOG" 2>/dev/null || true
 }
 
-# ── 1. Cold run: compute + STORE into LMCache ───────────────
-run_lm_eval "$VLLM_PORT" "$RUN1_DIR" "run1 LMCache STORE"
+# ── 1. vLLM run: compute from scratch, populating LMCache ───
+run_lm_eval "$VLLM_PORT" "$VLLM_RUN_DIR" "vLLM run"
 
 # Let async stores drain to the LMCache server before invalidating the APC.
 echo "Waiting ${STORE_DRAIN_SECONDS}s for LMCache stores to drain..."
@@ -159,28 +153,25 @@ retrieves_before=$(count_retrieves)
 # ── 2. Invalidate vLLM's local prefix cache (keep LMCache) ──
 reset_vllm_prefix_cache "$VLLM_PORT"
 
-# ── 3. Warm run: vLLM APC misses -> LMCache RETRIEVE ────────
-run_lm_eval "$VLLM_PORT" "$RUN2_DIR" "run2 LMCache RETRIEVE"
+# ── 3. Retrieve run: vLLM APC misses -> LMCache serves the KV ─
+run_lm_eval "$VLLM_PORT" "$RETRIEVE_RUN_DIR" "LMCache retrieve run"
 
 retrieves_after=$(count_retrieves)
 
-# ── 4. Baseline run: no LMCache, ground truth ──────────────
-run_lm_eval "$VLLM_BASELINE_PORT" "$BASELINE_DIR" "baseline no LMCache"
-
-# ── 5. Compare scores and verify LMCache was actually used ──
+# ── 4. Compare scores and verify LMCache was actually used ──
 echo "============================================"
 echo "=== Verifying HMA store/retrieve correctness ==="
 echo "============================================"
-echo "LMCache retrieves logged: before run2=${retrieves_before}, after run2=${retrieves_after}"
+echo "LMCache retrieves logged: before=${retrieves_before}, after=${retrieves_after}"
 
-python3 - "$RUN1_DIR" "$RUN2_DIR" "$BASELINE_DIR" \
+python3 - "$VLLM_RUN_DIR" "$RETRIEVE_RUN_DIR" \
     "$SCORE_TOLERANCE" "$retrieves_before" "$retrieves_after" <<'PYEOF'
 import glob
 import json
 import os
 import sys
 
-run1_dir, run2_dir, baseline_dir, tol_s, before_s, after_s = sys.argv[1:7]
+vllm_run_dir, retrieve_run_dir, tol_s, before_s, after_s = sys.argv[1:6]
 tol = float(tol_s)
 retrieves_before = int(before_s)
 retrieves_after = int(after_s)
@@ -221,34 +212,25 @@ def gsm8k_exact_match(results_dir: str) -> float:
     raise SystemExit(f"No exact_match metric in {latest}: {sorted(metrics)}")
 
 
-s1 = gsm8k_exact_match(run1_dir)
-s2 = gsm8k_exact_match(run2_dir)
-sb = gsm8k_exact_match(baseline_dir)
+s_vllm = gsm8k_exact_match(vllm_run_dir)
+s_retrieve = gsm8k_exact_match(retrieve_run_dir)
 
-print(f"  run1 (LMCache STORE)    gsm8k exact_match = {s1:.4f}")
-print(f"  run2 (LMCache RETRIEVE) gsm8k exact_match = {s2:.4f}")
-print(f"  baseline (no LMCache)   gsm8k exact_match = {sb:.4f}")
+print(f"  vLLM run             gsm8k exact_match = {s_vllm:.4f}")
+print(f"  LMCache retrieve run gsm8k exact_match = {s_retrieve:.4f}")
 print(f"  tolerance = {tol}")
 
 failures = []
-# run1 (store) vs run2 (retrieve): same server, the core store/retrieve check.
-if abs(s1 - s2) > tol:
-    failures.append(
-        f"LMCache store-vs-retrieve score drift: |{s1:.4f} - {s2:.4f}| = "
-        f"{abs(s1 - s2):.4f} > {tol}"
-    )
-# run2 (retrieve) vs baseline (no LMCache): retrieve must match ground truth.
-if abs(s2 - sb) > tol:
+# The two runs must match -- a broken LMCache would skew the retrieved KV.
+if abs(s_vllm - s_retrieve) > tol:
     failures.append(
-        f"Retrieve-vs-baseline score drift: |{s2:.4f} - {sb:.4f}| = "
-        f"{abs(s2 - sb):.4f} > {tol}"
+        f"score drift between runs: |{s_vllm:.4f} - {s_retrieve:.4f}| = "
+        f"{abs(s_vllm - s_retrieve):.4f} > {tol}"
     )
-# Non-vacuous: run 2 must have been served by LMCache retrieves, not recompute.
+# Non-vacuous: the retrieve run must have been served by LMCache, not recompute.
 if retrieves_after <= retrieves_before:
     failures.append(
-        "LMCache served no retrieves during run 2 "
-        f"(before={retrieves_before}, after={retrieves_after}); "
-        "the retrieve path was not exercised"
+        "LMCache served no retrieves during the retrieve run "
+        f"(before={retrieves_before}, after={retrieves_after})"
     )
 
 if failures:
@@ -258,8 +240,8 @@ if failures:
     sys.exit(1)
 
 print(
-    f"\nPASS: store, retrieve, and baseline gsm8k scores match (tol={tol}); "
-    f"LMCache served {retrieves_after - retrieves_before} retrieves during run 2."
+    f"\nPASS: vLLM and LMCache-retrieve gsm8k scores match (tol={tol}); "
+    f"LMCache served {retrieves_after - retrieves_before} retrieves."
 )
 PYEOF
 
diff --git a/.buildkite/k3_tests/multiprocess/scripts/run-single-test.sh b/.buildkite/k3_tests/multiprocess/scripts/run-single-test.sh
index 71dd68f762..4df0e2ad95 100755
--- a/.buildkite/k3_tests/multiprocess/scripts/run-single-test.sh
+++ b/.buildkite/k3_tests/multiprocess/scripts/run-single-test.sh
@@ -28,10 +28,9 @@ if [ "$TEST_NAME" = "hma_lm_eval_gemma4" ]; then
     # gemma-4-31B-it is public (no gating, so no HF token check) and has
     # heterogeneous head dims (head_dim 256 / global_head_dim 512), so vLLM
     # gives its KV cache groups different block sizes -- this is what exercises
-    # LMCache's per-group block-size handling. It forces TRITON_ATTN, which is
-    # not bit-exact under batch invariance, so the pipeline sets a small
-    # SCORE_TOLERANCE and ATTENTION_BACKEND=auto; its ~63GB of weights also need
-    # a higher GPU_MEMORY_UTILIZATION than the default (all set in pipeline.yml).
+    # LMCache's per-group block-size handling. It forces TRITON_ATTN, so the
+    # pipeline sets ATTENTION_BACKEND=auto; its ~63GB of weights also need a
+    # higher GPU_MEMORY_UTILIZATION than the default (all set in pipeline.yml).
     export MODEL="${MODEL:-google/gemma-4-31B-it}"
 else
     export MODEL="${MODEL:-Qwen/Qwen3-14B}"
@@ -63,7 +62,7 @@ SELF_CONTAINED_TESTS=" deadlock "
 # Tests that compare against a baseline vLLM (no LMCache) on a second GPU.
 # Only these need the baseline server (and thus a 2-GPU pod); everything
 # else runs on GPU 0 alone, so launch-processes.sh skips the baseline.
-BASELINE_TESTS=" vllm_bench long_doc_qa long_doc_qa_l2 hma_lm_eval_gemma4 "
+BASELINE_TESTS=" vllm_bench long_doc_qa long_doc_qa_l2 "
 if [[ "$BASELINE_TESTS" == *" $TEST_NAME "* ]]; then
     export LAUNCH_BASELINE=true
 else
diff --git a/docs/source/recipes/gemma4.rst b/docs/source/recipes/gemma4.rst
index e4e1acc607..a2ed89894e 100644
--- a/docs/source/recipes/gemma4.rst
+++ b/docs/source/recipes/gemma4.rst
@@ -1,12 +1,13 @@
 .. _recipe_gemma4:
 
-Gemma4ForConditionalGeneration
-===============================
+Gemma 4
+=======
 
 Validated models
 ----------------
 
 - `google/gemma-4-31B-it <https://huggingface.co/google/gemma-4-31B-it>`_
+- `google/gemma-4-12B-it <https://huggingface.co/google/gemma-4-12B-it>`_
 - `google/gemma-4-E4B-it <https://huggingface.co/google/gemma-4-E4B-it>`_
 
 .. tab-set::
@@ -17,7 +18,8 @@ Validated models
       **Engine documentation:**
       `Gemma 4 in vLLM supported models
       <https://docs.vllm.ai/en/latest/models/supported_models.html#multimodal-language-models>`_
-      (architecture ``Gemma4ForConditionalGeneration``).
+      (architectures ``Gemma4ForConditionalGeneration`` for 31B/E4B and
+      ``Gemma4UnifiedForConditionalGeneration`` for 12B).
 
       **Status:** Validated with LMCache.
 
@@ -40,11 +42,12 @@ Validated models
 
       |
 
-      The smaller ``google/gemma-4-E4B-it`` runs on a single GPU:
+      The smaller ``google/gemma-4-12B-it`` and ``google/gemma-4-E4B-it`` run on
+      a single GPU:
 
       .. code-block:: bash
 
-         vllm serve google/gemma-4-E4B-it \
+         vllm serve google/gemma-4-12B-it \
              --kv-transfer-config \
              '{"kv_connector":"LMCacheMPConnector", "kv_role":"kv_both"}'
 
@@ -95,7 +98,3 @@ Caveats
 - **Cross-layer KV sharing.** ``google/gemma-4-E4B-it`` reuses some layers' KV
   caches across layers. LMCache stores the cache-owning layers only; the sharing
   layers' KV lives in the same blocks and is restored automatically.
-- **Determinism.** Gemma 4 runs on the Triton attention backend, which is not
-  bit-exact under vLLM's batch-invariant mode, so a retrieved result may differ
-  from a freshly computed one by a small numerical margin rather than being
-  byte-identical.
diff --git a/docs/source/recipes/index.rst b/docs/source/recipes/index.rst
index d02b991808..8dd0fdd370 100644
--- a/docs/source/recipes/index.rst
+++ b/docs/source/recipes/index.rst
@@ -58,6 +58,13 @@ Supported architectures
      - —
      - :doc:`gemma4`
 
+   * - ``Gemma4UnifiedForConditionalGeneration``
+     - ``google/gemma-4-12B-it``
+     - ✓
+     - —
+     - —
+     - :doc:`gemma4`
+
    * - ``Gemma3ForConditionalGeneration``
      - ``google/gemma-3-4b-it``
      - ✓
diff --git a/tests/v1/distributed/serde/test_serde_e2e.py b/tests/v1/distributed/serde/test_serde_e2e.py
index 1b192df05e..2aa42f7bbb 100644
--- a/tests/v1/distributed/serde/test_serde_e2e.py
+++ b/tests/v1/distributed/serde/test_serde_e2e.py
@@ -175,6 +175,29 @@ def get_l1_object_count(sm: StorageManager) -> int:
     return sm.report_status()["l1_manager"]["total_object_count"]
 
 
+def clear_and_wait_drained(sm: StorageManager, timeout: float = 10.0) -> None:
+    """Clear L1 and poll until every object is evicted.
+
+    After an L2 store the StoreController holds read locks on the stored objects
+    for a short window, and ``StorageManager.clear`` keeps locked objects intact.
+    A single clear right after the store therefore races the lock release and can
+    leave objects behind. Retry clear() until the locks drop and L1 drains rather
+    than relying on a fixed sleep.
+
+    Raises:
+        AssertionError: If L1 still holds objects after ``timeout`` seconds.
+    """
+
+    def drained() -> bool:
+        sm.clear()
+        return get_l1_object_count(sm) == 0
+
+    if not wait_for_condition(drained, timeout=timeout):
+        raise AssertionError(
+            f"L1 did not drain after clear: {get_l1_object_count(sm)} objects remain"
+        )
+
+
 # =============================================================================
 # Tests: Full round-trip through serde
 # =============================================================================
@@ -196,9 +219,7 @@ def test_store_and_prefetch_with_serde(self) -> None:
 
         write_and_wait_for_l2(sm, keys, layout)
 
-        # Brief sleep so StoreController releases read locks after L2 store
-        time.sleep(1)
-        sm.clear()
+        clear_and_wait_drained(sm)
         assert get_l1_object_count(sm) == 0
 
         # Prefetch from L2
@@ -222,8 +243,7 @@ def test_no_memory_leak_after_full_cycle(self) -> None:
         keys = [make_object_key(i) for i in range(3)]
 
         write_and_wait_for_l2(sm, keys, layout)
-        time.sleep(1)
-        sm.clear()
+        clear_and_wait_drained(sm)
 
         # Prefetch
         handle = sm.submit_prefetch_task(keys, layout)
@@ -263,8 +283,7 @@ def test_store_and_prefetch_without_serde(self) -> None:
         keys = [make_object_key(i) for i in range(5)]
 
         write_and_wait_for_l2(sm, keys, layout)
-        time.sleep(1)
-        sm.clear()
+        clear_and_wait_drained(sm)
 
         handle = sm.submit_prefetch_task(keys, layout)
         hits = wait_for_prefetch_status(sm, handle)
@@ -285,8 +304,7 @@ def test_no_memory_leak_without_serde(self) -> None:
         keys = [make_object_key(i) for i in range(3)]
 
         write_and_wait_for_l2(sm, keys, layout)
-        time.sleep(1)
-        sm.clear()
+        clear_and_wait_drained(sm)
 
         handle = sm.submit_prefetch_task(keys, layout)
         hits = wait_for_prefetch_status(sm, handle)
@@ -318,8 +336,7 @@ def test_partial_prefix_with_serde(self) -> None:
         # Write only keys 0, 1, 3, 4 (skip 2)
         keys_to_write = [make_object_key(i) for i in [0, 1, 3, 4]]
         write_and_wait_for_l2(sm, keys_to_write, layout)
-        time.sleep(1)
-        sm.clear()
+        clear_and_wait_drained(sm)
 
         # Request all 5 keys — prefix should be 2 (gap at index 2)
         all_keys = [make_object_key(i) for i in range(5)]
@@ -354,8 +371,7 @@ def test_repeated_cycles_no_leak(self) -> None:
         for cycle in range(5):
             keys = [make_object_key(cycle * 10 + i) for i in range(3)]
             write_and_wait_for_l2(sm, keys, layout)
-            time.sleep(1)
-            sm.clear()
+            clear_and_wait_drained(sm)
 
             handle = sm.submit_prefetch_task(keys, layout)
             hits = wait_for_prefetch_status(sm, handle)
@@ -441,8 +457,7 @@ def _run_roundtrip(
         keys = [make_object_key(i) for i in range(num_keys)]
 
         write_and_wait_for_l2(sm, keys, layout)
-        time.sleep(1)
-        sm.clear()
+        clear_and_wait_drained(sm)
         assert get_l1_object_count(sm) == 0
 
         handle = sm.submit_prefetch_task(keys, layout)

From 996f03bbb68c16d7993eb02d45d1d5d1c8e06249 Mon Sep 17 00:00:00 2001
From: ruicheng <95903923+KimmoZAG@users.noreply.github.com>
Date: Tue, 9 Jun 2026 12:15:52 +0800
Subject: [PATCH 11/57] examples(kv_cache_calculator): add Hunyuan & DeepSeek
 models, fix head_dim/CLA calculation, add i18n UI (#2834)

* examples(kv_cache_calculator): add Hunyuan & DeepSeek models, UI i18n, prefer local modelconfig

Signed-off-by: KimmoZAG <995496585@qq.com>

* fix(kv_cache_calculator): use prefix match for DeepSeek V3 variants; consolidate head_dim logic

Signed-off-by: KimmoZAG <995496585@qq.com>

---------

Signed-off-by: KimmoZAG <995496585@qq.com>
---
 .../kv_cache_calculator/generate_config.py    |  20 +-
 .../kv_cache_calculator.html                  | 538 +++++++++++++-----
 examples/kv_cache_calculator/modelconfig.json |  46 +-
 .../benchmarks/test_xpu_kernels_microbench.py |   2 +-
 4 files changed, 451 insertions(+), 155 deletions(-)

diff --git a/examples/kv_cache_calculator/generate_config.py b/examples/kv_cache_calculator/generate_config.py
index c7724170aa..dac665298a 100644
--- a/examples/kv_cache_calculator/generate_config.py
+++ b/examples/kv_cache_calculator/generate_config.py
@@ -34,17 +34,33 @@ def main():
             "num_key_value_heads": getattr(config, "num_key_value_heads", None),
         }
 
-        if args.model == "deepseek-ai/DeepSeek-V3":
+        # DeepSeek MLA models (V3, V3.1, V3.2, … and R1) store
+        # KV in latent space
+        if (
+            args.model.lower().startswith("deepseek-ai/deepseek-v3")
+            or args.model == "deepseek-ai/DeepSeek-R1"
+        ):
             config_data["kv_lora_rank"] = getattr(config, "kv_lora_rank", None)
             config_data["qk_rope_head_dim"] = getattr(config, "qk_rope_head_dim", None)
 
-        # Check for Qwen3 models (fuzzy matching) or GLM4 models
+        # Models whose head_dim is explicit in config and may
+        # differ from hidden_size / num_heads:
+        # Qwen3, GLM4, and Hunyuan dense variants.
         if (
             "qwen/qwen3-" in args.model.lower()
             or "zai-org/glm-4." in args.model.lower()
+            or (
+                args.model.lower().startswith("tencent/hunyuan-")
+                and args.model.lower() != "tencent/hunyuan-large"
+            )
         ):
             config_data["head_dim"] = getattr(config, "head_dim", None)
 
+        # Hunyuan-Large uses CLA (Cross-Layer Attention):
+        # KV layers = num_hidden_layers / cla_share_factor
+        if args.model.lower() == "tencent/hunyuan-large":
+            config_data["cla_share_factor"] = getattr(config, "cla_share_factor", None)
+
         # Convert to JSON and print
         string = json.dumps(config_data, indent=4)
 
diff --git a/examples/kv_cache_calculator/kv_cache_calculator.html b/examples/kv_cache_calculator/kv_cache_calculator.html
index 0dfe1e6116..92ed0c7053 100644
--- a/examples/kv_cache_calculator/kv_cache_calculator.html
+++ b/examples/kv_cache_calculator/kv_cache_calculator.html
@@ -5,141 +5,353 @@
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
     <title>KV Cache Size Calculator</title>
     <style>
-        .container {
-            font-family: Arial, sans-serif;
-            max-width: 400px;
-            margin: 0 auto;
-            padding: 20px;
-            border: 1px solid #ccc;
-            border-radius: 8px;
+        *, *::before, *::after { box-sizing: border-box; }
+
+        body {
+            margin: 0;
+            min-height: 100vh;
+            display: flex;
+            flex-direction: column;
+            align-items: center;
+            justify-content: flex-start;
             background-color: #f9f9f9;
+            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Arial, sans-serif;
+            padding: 24px 12px 48px;
         }
 
-        label, select, input, button {
-            display: block;
+        .lang-bar {
             width: 100%;
-            margin-bottom: 15px;
+            max-width: 480px;
+            display: flex;
+            justify-content: flex-end;
+            margin-bottom: 8px;
+            gap: 6px;
         }
 
-        label {
-            font-weight: bold;
-            margin-bottom: 5px;
+        .lang-btn {
+            padding: 4px 12px;
+            font-size: 13px;
+            border: 1px solid #b0c4de;
+            border-radius: 20px;
+            background: #fff;
+            color: #3898ec;
+            cursor: pointer;
+            transition: background 0.15s, color 0.15s;
+        }
+        .lang-btn.active, .lang-btn:hover {
+            background: #3898ec;
+            color: #fff;
+            border-color: #3898ec;
         }
 
-        select, input, button {
-            padding: 10px;
-            font-size: 16px;
-            border: 1px solid #ccc;
-            border-radius: 4px;
-            box-sizing: border-box;
+        .card {
+            width: 100%;
+            max-width: 480px;
+            background: #fff;
+            border-radius: 14px;
+            box-shadow: 0 4px 24px rgba(56, 152, 236, 0.10), 0 1px 4px rgba(0,0,0,0.06);
+            padding: 32px 28px 24px;
         }
 
-        button {
-            background-color: #3898ec;
-            color: white;
-            cursor: pointer;
+        .card h1 {
+            margin: 0 0 24px;
+            font-size: 22px;
+            font-weight: 700;
+            color: #1a2e4a;
+            letter-spacing: -0.3px;
         }
 
-        button:hover {
-            background-color: #0056b3;
+        .field {
+            margin-bottom: 18px;
         }
 
-        #result {
-            margin-top: 20px;
-            font-size: 16px;
-            font-weight: bold;
+        .field label {
+            display: block;
+            font-size: 13px;
+            font-weight: 600;
+            color: #4a5568;
+            margin-bottom: 6px;
+            letter-spacing: 0.1px;
         }
 
-        /* New CSS for calculation steps */
-        #calculation-steps {
-            font-size: 12px;
-            margin-top: 10px;
-            color: #555;
+        .field select, .field input {
+            width: 100%;
+            padding: 9px 12px;
+            font-size: 15px;
+            border: 1.5px solid #d0d9e8;
+            border-radius: 8px;
+            background: #f7faff;
+            color: #1a2e4a;
+            outline: none;
+            transition: border-color 0.15s, box-shadow 0.15s;
+            appearance: none;
+            -webkit-appearance: none;
+        }
+        .select-wrap {
+            position: relative;
+        }
+        .select-wrap::after {
+            content: "▾";
+            position: absolute;
+            right: 12px;
+            top: 50%;
+            transform: translateY(-50%);
+            pointer-events: none;
+            color: #8aa4c8;
+            font-size: 14px;
+        }
+        .field select:focus, .field input:focus {
+            border-color: #3898ec;
+            box-shadow: 0 0 0 3px rgba(56,152,236,0.12);
+            background: #fff;
         }
 
-        /* New button for GitHub repo */
-        #githubButton {
-            background-color: #d3d3d3; /* Light grey color */
-            color: black; /* Black text color */
-            text-align: center;
+        .calc-btn {
+            display: block;
+            width: 100%;
+            padding: 11px;
+            font-size: 15px;
+            font-weight: 600;
+            background: #3898ec;
+            color: #fff;
+            border: none;
+            border-radius: 8px;
             cursor: pointer;
+            transition: background 0.15s, transform 0.1s;
+            margin-top: 4px;
+        }
+        .calc-btn:hover { background: #1a7fd4; }
+        .calc-btn:active { transform: scale(0.98); }
+
+        #result {
             margin-top: 20px;
-            border: 1px solid #ccc;
+            padding: 14px 16px;
+            background: linear-gradient(90deg, #e8f4fd, #f0f8ff);
+            border-left: 4px solid #3898ec;
+            border-radius: 6px;
+            font-size: 20px;
+            font-weight: 700;
+            color: #1a5fa8;
+            display: none;
         }
 
-        #githubButton:hover {
-            background-color: #b3b3b3; /* Darker grey when hovered */
+        #calculation-steps {
+            margin-top: 14px;
+            padding: 14px 16px;
+            background: #f7faff;
+            border: 1px solid #d8e8f8;
+            border-radius: 8px;
+            font-size: 12.5px;
+            line-height: 1.8;
+            color: #4a5568;
+            display: none;
         }
 
-        footer {
+        .github-btn {
+            display: block;
+            width: 100%;
+            padding: 9px;
+            margin-top: 18px;
+            font-size: 13px;
+            font-weight: 500;
+            background: #f0f4fa;
+            color: #4a5568;
+            border: 1.5px solid #d0d9e8;
+            border-radius: 8px;
+            cursor: pointer;
             text-align: center;
-            margin-top: 20px;
-            font-size: 12px;
-            color: #555;
+            transition: background 0.15s;
         }
+        .github-btn:hover { background: #e2eaf6; }
 
+        footer {
+            margin-top: 24px;
+            font-size: 12px;
+            color: #8aa4c8;
+        }
     </style>
 </head>
 <body>
-    <div class="container">
-        <h1>KV Cache Size Calculator</h1>
-        <label for="model">Select LLM Model:</label>
-        <select id="model">
-            <!-- Options will be dynamically generated -->
-        </select>
-        <label for="dtype">Select data type:</label>
-        <select id="dtype">
-            <option value="float16">float16</option>
-            <option value="bfloat16">bfloat16</option>
-            <option value="float32">float32</option>
-            <option value="int8">int8</option>
-        </select>
-        <label for="tokens">Enter Number of Tokens:</label>
-        <input type="number" id="tokens" placeholder="Enter number of tokens">
-        <button onclick="calculateKVCache()">Calculate KV Cache Size</button>
+    <div class="lang-bar">
+        <button class="lang-btn active" onclick="setLang('en')" id="btn-en">English</button>
+        <button class="lang-btn" onclick="setLang('zh')" id="btn-zh">中文</button>
+    </div>
+    <div class="card">
+        <h1 id="title">KV Cache Size Calculator</h1>
+
+        <div class="field">
+            <label id="lbl-model" for="model">Select LLM Model</label>
+            <div class="select-wrap">
+                <select id="model"></select>
+            </div>
+        </div>
+
+        <div class="field">
+            <label id="lbl-dtype" for="dtype">Data Type</label>
+            <div class="select-wrap">
+                <select id="dtype">
+                    <option value="float16">float16</option>
+                    <option value="bfloat16">bfloat16</option>
+                    <option value="float32">float32</option>
+                    <option value="int8">int8</option>
+                </select>
+            </div>
+        </div>
+
+        <div class="field">
+            <label id="lbl-tokens" for="tokens">Number of Tokens</label>
+            <input type="number" id="tokens" placeholder="e.g. 4096">
+        </div>
+
+        <button class="calc-btn" onclick="calculateKVCache()" id="btn-calc">Calculate</button>
+
         <div id="result"></div>
-        <!-- New div for calculation steps -->
         <div id="calculation-steps"></div>
-        <button id="githubButton" onclick="openGitHubRepo()">Contribute new models on GitHub</button>
+
+        <button class="github-btn" onclick="openGitHubRepo()" id="btn-github">
+            ➕ Contribute new models on GitHub
+        </button>
     </div>
-    <footer>
-        Developed by Zhuohan Gu @ LMCache team
-    </footer>
+    <footer id="footer">Developed by Zhuohan Gu @ LMCache team</footer>
+
     <script>
         let modelConfigs = {};
+        let currentLang = 'en';
+
+        const i18n = {
+            en: {
+                title: 'KV Cache Size Calculator',
+                lblModel: 'Select LLM Model',
+                lblDtype: 'Data Type',
+                lblTokens: 'Number of Tokens',
+                tokenPlaceholder: 'e.g. 4096',
+                btnCalc: 'Calculate',
+                btnGithub: '➕ Contribute new models on GitHub',
+                footer: 'Developed by Zhuohan Gu @ LMCache team',
+                errTokens: 'Please enter a valid number of tokens.',
+                errModel: 'Model not recognized.',
+                errDtype: 'Invalid data type selected.',
+                errLoad: 'Failed to load model configurations. Please try again later.',
+                detailsTitle: 'Calculation Details',
+                fldModel: 'Selected Model',
+                fldLayers: 'Number of Hidden Layers',
+                fldKvLoraRank: 'KV-LoRA Rank (latent dim)',
+                fldQkRopeHeadDim: 'QK-Rope Head Dim',
+                fldDtypeSize: 'Data Type Size',
+                fldTotalElem: 'Total Elements',
+                fldTotalBytes: 'Total Bytes',
+                fldKvSize: 'KV Cache Size',
+                fldKvHeads: 'Number of Key-Value Heads',
+                fldHeadDim: 'Head Dim',
+                fldClaFactor: 'CLA Share Factor',
+                fldEffLayers: 'Effective KV Layers',
+                fldHeadSize: 'Head Size',
+                fldHiddenSize: 'Hidden Size',
+                fldAttnHeads: 'Number of Attention Heads',
+                claNote: (f) => `every ${f} layers share one KV cache`,
+                headSizeNote: 'Hidden Size / Attention Heads',
+                bytes: 'bytes',
+                result: (gb) => `KV Cache Size: ${gb} GB`,
+            },
+            zh: {
+                title: 'KV Cache 大小计算器',
+                lblModel: '选择 LLM 模型',
+                lblDtype: '数据类型',
+                lblTokens: 'Token 数量',
+                tokenPlaceholder: '例如：4096',
+                btnCalc: '开始计算',
+                btnGithub: '➕ 在 GitHub 上贡献新模型',
+                footer: '由 LMCache 团队 Zhuohan Gu 开发',
+                errTokens: '请输入有效的 Token 数量。',
+                errModel: '未识别的模型。',
+                errDtype: '无效的数据类型。',
+                errLoad: '模型配置加载失败，请稍后重试。',
+                detailsTitle: '计算详情',
+                fldModel: '所选模型',
+                fldLayers: '隐藏层数',
+                fldKvLoraRank: 'KV-LoRA 秩（隐空间维度）',
+                fldQkRopeHeadDim: 'QK-RoPE Head Dim',
+                fldDtypeSize: '数据类型大小',
+                fldTotalElem: '总元素数',
+                fldTotalBytes: '总字节数',
+                fldKvSize: 'KV Cache 大小',
+                fldKvHeads: 'KV 头数',
+                fldHeadDim: 'Head Dim',
+                fldClaFactor: 'CLA 共享因子',
+                fldEffLayers: '有效 KV 层数',
+                fldHeadSize: 'Head Size',
+                fldHiddenSize: '隐藏层维度',
+                fldAttnHeads: '注意力头数',
+                claNote: (f) => `每 ${f} 层共享一份 KV Cache`,
+                headSizeNote: '隐藏层维度 / 注意力头数',
+                bytes: '字节',
+                result: (gb) => `KV Cache 大小：${gb} GB`,
+            }
+        };
+
+        function t(key, ...args) {
+            const v = i18n[currentLang][key];
+            return typeof v === 'function' ? v(...args) : v;
+        }
+
+        function setLang(lang) {
+            currentLang = lang;
+            document.getElementById('btn-en').classList.toggle('active', lang === 'en');
+            document.getElementById('btn-zh').classList.toggle('active', lang === 'zh');
+            document.getElementById('title').textContent = t('title');
+            document.getElementById('lbl-model').textContent = t('lblModel');
+            document.getElementById('lbl-dtype').textContent = t('lblDtype');
+            document.getElementById('lbl-tokens').textContent = t('lblTokens');
+            document.getElementById('tokens').placeholder = t('tokenPlaceholder');
+            document.getElementById('btn-calc').textContent = t('btnCalc');
+            document.getElementById('btn-github').textContent = t('btnGithub');
+            document.getElementById('footer').textContent = t('footer');
+            // Re-render result if visible
+            if (document.getElementById('result').style.display !== 'none') {
+                calculateKVCache();
+            }
+        }
 
         function openGitHubRepo() {
             const githubUrl = 'https://github.com/LMCache/LMCache/issues/244#:~:text=https%3A//github.com/LMCache/LMCache/tree/dev/examples/kv_cache_calculator';
             window.open(githubUrl, '_blank');
         }
 
-
-        // Load model configurations from GitHub
+        // Load model configurations: prefer local file, fallback to GitHub
         async function loadModelConfigs() {
-            const url = 'https://raw.githubusercontent.com/LMCache/LMCache/refs/heads/dev/examples/kv_cache_calculator/modelconfig.json';
+            const localUrl = './modelconfig.json';
+            const remoteUrl = 'https://raw.githubusercontent.com/LMCache/LMCache/refs/heads/dev/examples/kv_cache_calculator/modelconfig.json';
+            // Try local first
             try {
-                const response = await fetch(url);
-                if (!response.ok) {
-                    throw new Error(`HTTP error! Status: ${response.status}`);
+                let response = await fetch(localUrl);
+                if (response.ok) {
+                    modelConfigs = await response.json();
+                    populateModelDropdown();
+                    return;
                 }
+            } catch (e) {
+                // ignore local fetch errors and fallback
+                console.debug('Local modelconfig.json not available locally, falling back to remote.');
+            }
+
+            // Fallback to remote
+            try {
+                const response = await fetch(remoteUrl);
+                if (!response.ok) throw new Error(`HTTP error! Status: ${response.status}`);
                 modelConfigs = await response.json();
-                console.log('Model configurations loaded successfully:', modelConfigs);
                 populateModelDropdown();
             } catch (error) {
                 console.error('Failed to load model configurations:', error);
-                document.getElementById('result').textContent = "Failed to load model configurations. Please try again later.";
+                showResult(t('errLoad'), true);
             }
         }
 
         // Populate the model dropdown dynamically
         function populateModelDropdown() {
             const modelSelect = document.getElementById('model');
-            modelSelect.innerHTML = ""; // Clear existing options
-
-            // Sort model names using natural/numeric ordering
-            const collator = new Intl.Collator(undefined, { numeric: true, sensitivity: 'base'});
+            modelSelect.innerHTML = "";
+            const collator = new Intl.Collator(undefined, { numeric: true, sensitivity: 'base' });
             const sortedModelNames = Object.keys(modelConfigs).sort(collator.compare);
-            
             for (const modelName of sortedModelNames) {
                 const option = document.createElement('option');
                 option.value = modelName;
@@ -148,35 +360,40 @@ <h1>KV Cache Size Calculator</h1>
             }
         }
 
+        function showResult(html, isError = false) {
+            const el = document.getElementById('result');
+            el.innerHTML = html;
+            el.style.display = 'block';
+            el.style.borderLeftColor = isError ? '#e05252' : '#3898ec';
+            el.style.color = isError ? '#a02020' : '#1a5fa8';
+        }
+
         async function calculateKVCache() {
-            // Ensure model configs are loaded before running calculations
-            if (Object.keys(modelConfigs).length === 0) {
-                await loadModelConfigs();
-            }
+            if (Object.keys(modelConfigs).length === 0) await loadModelConfigs();
 
             const model = document.getElementById('model').value;
             const tokens = parseInt(document.getElementById('tokens').value);
             const dtype = document.getElementById('dtype').value;
 
             if (isNaN(tokens) || tokens <= 0) {
-                document.getElementById('result').textContent = "Please enter a valid number of tokens.";
-                document.getElementById('calculation-steps').innerHTML = "";
+                showResult(t('errTokens'), true);
+                document.getElementById('calculation-steps').style.display = 'none';
                 return;
             }
 
             const config = modelConfigs[model];
             if (!config) {
-                document.getElementById('result').textContent = "Model not recognized.";
-                document.getElementById('calculation-steps').innerHTML = "";
+                showResult(t('errModel'), true);
+                document.getElementById('calculation-steps').style.display = 'none';
                 return;
             }
 
             let hidden_size, num_attention_heads, num_hidden_layers, num_key_value_heads;
-            let kv_lora_rank, qk_rope_head_dim; // for deepseek-ai/DeepSeek-V3
-            let head_size;
+            let kv_lora_rank, qk_rope_head_dim;
+            let head_dim, head_size;
 
-            // Check for DeepSeek models (exact matching)
-            const isDeepSeekModel = model === "deepseek-ai/DeepSeek-V3" || model === "deepseek-ai/DeepSeek-R1";
+            // Check for DeepSeek MLA models (prefix match covers V3, V3.1, V3.2, … ; plus R1)
+            const isDeepSeekModel = model.startsWith("deepseek-ai/DeepSeek-V3") || model === "deepseek-ai/DeepSeek-R1";
 
             // Check for Qwen3 models (fuzzy matching)
             const isQwen3Model = model.toLowerCase().includes("qwen/qwen3-");
@@ -184,14 +401,23 @@ <h1>KV Cache Size Calculator</h1>
             // Check for GLM4 models (prefix matching)
             const isGLM4Model = model.startsWith("zai-org/GLM-4.");
 
-            const isGQAWithHeadDimModel = isQwen3Model || isGLM4Model;
+            // Check for Hunyuan dense models (explicit head_dim, may differ from hidden/heads)
+            const isHunyuanDenseModel = model.toLowerCase().startsWith("tencent/hunyuan-") && model.toLowerCase() !== "tencent/hunyuan-large";
+
+            // Check for Hunyuan-Large (CLA: cross-layer attention sharing)
+            const isHunyuanLargeModel = model.toLowerCase() === "tencent/hunyuan-large";
+
+            const isGQAWithHeadDimModel = isQwen3Model || isGLM4Model || isHunyuanDenseModel;
 
             if (isDeepSeekModel) {
                 ({ hidden_size, num_attention_heads, num_hidden_layers, num_key_value_heads, kv_lora_rank, qk_rope_head_dim } = config);
+            } else if (isHunyuanLargeModel) {
+                // Hunyuan-Large uses CLA (Cross-Layer Attention): every cla_share_factor layers share one KV cache.
+                ({ hidden_size, num_attention_heads, num_hidden_layers, num_key_value_heads } = config);
+                head_size = hidden_size / num_attention_heads;
             } else if (isGQAWithHeadDimModel) {
-                // The Qwen3 series and GLM use GQA, and `head_dim` needs to be read from config file.
+                // Qwen3, GLM4, and Hunyuan dense models all have an explicit head_dim in their configs.
                 ({ hidden_size, num_attention_heads, num_hidden_layers, num_key_value_heads, head_dim } = config);
-                console.log(config);
             } else {
                 ({ hidden_size, num_attention_heads, num_hidden_layers, num_key_value_heads } = config);
                 head_size = hidden_size / num_attention_heads;
@@ -199,86 +425,98 @@ <h1>KV Cache Size Calculator</h1>
 
             // Determine dtype size in bytes
             let dtype_size;
-            if (dtype === 'float32') {
-                dtype_size = 4;
-            } else if (dtype === 'float16' || dtype === 'bfloat16') {
-                dtype_size = 2;
-            } else if (dtype === 'int8') {
-                dtype_size = 1;
-            } else {
-                document.getElementById('result').textContent = "Invalid data type selected.";
-                document.getElementById('calculation-steps').innerHTML = "";
+            if (dtype === 'float32') dtype_size = 4;
+            else if (dtype === 'float16' || dtype === 'bfloat16') dtype_size = 2;
+            else if (dtype === 'int8') dtype_size = 1;
+            else {
+                showResult(t('errDtype'), true);
+                document.getElementById('calculation-steps').style.display = 'none';
                 return;
             }
 
             // Calculate KV cache size
             let total_elements;
+            let effective_layers;
             if (isDeepSeekModel) {
                 total_elements = num_hidden_layers * tokens * (kv_lora_rank + qk_rope_head_dim);
+            } else if (isHunyuanLargeModel) {
+                const cla_share_factor = config.cla_share_factor;
+                effective_layers = num_hidden_layers / cla_share_factor;
+                total_elements = 2 * effective_layers * tokens * num_key_value_heads * head_size;
             } else if (isGQAWithHeadDimModel) {
                 total_elements = 2 * num_hidden_layers * tokens * num_key_value_heads * head_dim;
             } else {
                 total_elements = 2 * num_hidden_layers * tokens * num_key_value_heads * head_size;
             }
             const total_bytes = total_elements * dtype_size;
-            const kvCacheSizeGB = total_bytes / (1024 ** 3); // Convert bytes to GB
+            const kvCacheSizeGB = total_bytes / (1024 ** 3);
 
-            document.getElementById('result').innerHTML =
-                `KV Cache Size: ${kvCacheSizeGB.toFixed(4)} GB`;
+            showResult(t('result', kvCacheSizeGB.toFixed(4)));
 
             // Prepare calculation steps
-            let steps;
+            const B = (s) => `<b>${s}</b>`;
+            let rows;
             if (isDeepSeekModel) {
-                steps = `
-                <strong>Calculation Details:</strong><br><br>
-                <b>Selected Model:</b> ${model}<br>
-                <b>Number of Hidden Layers:</b> ${num_hidden_layers}<br>
-                <b>KV-LoRA Rank(dimension of latent space):</b> ${kv_lora_rank}<br>
-                <b>QK-Rope Head Dim:</b> ${qk_rope_head_dim}<br>
-                <b>Data Type Size:</b> ${dtype_size} bytes<br>
-                <b>Total Elements:</b> ${num_hidden_layers} × ${tokens} × (${kv_lora_rank} + ${qk_rope_head_dim}) = ${total_elements}<br>
-                <b>Total Bytes:</b> ${total_elements} × ${dtype_size} = ${total_bytes} bytes<br>
-                <b>KV Cache Size:</b> ${total_bytes} / (1024³) ≈ ${kvCacheSizeGB.toFixed(4)} GB
-                `;
+                rows = [
+                    [B(t('fldModel')), model],
+                    [B(t('fldLayers')), num_hidden_layers],
+                    [B(t('fldKvLoraRank')), kv_lora_rank],
+                    [B(t('fldQkRopeHeadDim')), qk_rope_head_dim],
+                    [B(t('fldDtypeSize')), `${dtype_size} ${t('bytes')}`],
+                    [B(t('fldTotalElem')), `${num_hidden_layers} × ${tokens} × (${kv_lora_rank} + ${qk_rope_head_dim}) = ${total_elements}`],
+                    [B(t('fldTotalBytes')), `${total_elements} × ${dtype_size} = ${total_bytes} ${t('bytes')}`],
+                    [B(t('fldKvSize')), `${total_bytes} / 1024³ ≈ ${kvCacheSizeGB.toFixed(4)} GB`],
+                ];
+            } else if (isHunyuanLargeModel) {
+                const cla_share_factor = config.cla_share_factor;
+                rows = [
+                    [B(t('fldModel')), model],
+                    [B(t('fldLayers')), num_hidden_layers],
+                    [B(t('fldClaFactor')), `${cla_share_factor} (${t('claNote', cla_share_factor)})`],
+                    [B(t('fldEffLayers')), `${num_hidden_layers} / ${cla_share_factor} = ${effective_layers}`],
+                    [B(t('fldKvHeads')), num_key_value_heads],
+                    [B(t('fldHeadSize')), `${head_size} (${t('headSizeNote')})`],
+                    [B(t('fldDtypeSize')), `${dtype_size} ${t('bytes')}`],
+                    [B(t('fldTotalElem')), `2 × ${effective_layers} × ${tokens} × ${num_key_value_heads} × ${head_size} = ${total_elements}`],
+                    [B(t('fldTotalBytes')), `${total_elements} × ${dtype_size} = ${total_bytes} ${t('bytes')}`],
+                    [B(t('fldKvSize')), `${total_bytes} / 1024³ ≈ ${kvCacheSizeGB.toFixed(4)} GB`],
+                ];
             } else if (isGQAWithHeadDimModel) {
-                steps = `
-                <strong>Calculation Details:</strong><br><br>
-                <b>Selected Model:</b> ${model}<br>
-                <b>Number of Hidden Layers:</b> ${num_hidden_layers}<br>
-                <b>Number of Key-Value Heads:</b> ${num_key_value_heads}<br>
-                <b>Head dim:</b> ${head_dim}<br>
-                <b>Data Type Size:</b> ${dtype_size} bytes<br>
-                <b>Total Elements:</b> 2 × ${num_hidden_layers} × ${tokens} × ${num_key_value_heads} × ${head_dim} = ${total_elements}<br>
-                <b>Total Bytes:</b> ${total_elements} × ${dtype_size} = ${total_bytes} bytes<br>
-                <b>KV Cache Size:</b> ${total_bytes} / (1024³) ≈ ${kvCacheSizeGB.toFixed(4)} GB
-                `;
+                rows = [
+                    [B(t('fldModel')), model],
+                    [B(t('fldLayers')), num_hidden_layers],
+                    [B(t('fldKvHeads')), num_key_value_heads],
+                    [B(t('fldHeadDim')), head_dim],
+                    [B(t('fldDtypeSize')), `${dtype_size} ${t('bytes')}`],
+                    [B(t('fldTotalElem')), `2 × ${num_hidden_layers} × ${tokens} × ${num_key_value_heads} × ${head_dim} = ${total_elements}`],
+                    [B(t('fldTotalBytes')), `${total_elements} × ${dtype_size} = ${total_bytes} ${t('bytes')}`],
+                    [B(t('fldKvSize')), `${total_bytes} / 1024³ ≈ ${kvCacheSizeGB.toFixed(4)} GB`],
+                ];
             } else {
-                steps = `
-                <strong>Calculation Details:</strong><br><br>
-                <b>Selected Model:</b> ${model}<br>
-                <b>Hidden Size:</b> ${hidden_size}<br>
-                <b>Number of Attention Heads:</b> ${num_attention_heads}<br>
-                <b>Number of Hidden Layers:</b> ${num_hidden_layers}<br>
-                <b>Number of Key-Value Heads:</b> ${num_key_value_heads}<br>
-                <b>Head Size:</b> ${head_size} (Hidden Size / Attention Heads)<br>
-                <b>Data Type Size:</b> ${dtype_size} bytes<br>
-                <b>Total Elements:</b> 2 × ${num_hidden_layers} × ${tokens} × ${num_key_value_heads} × ${head_size} = ${total_elements}<br>
-                <b>Total Bytes:</b> ${total_elements} × ${dtype_size} = ${total_bytes} bytes<br>
-                <b>KV Cache Size:</b> ${total_bytes} / (1024³) ≈ ${kvCacheSizeGB.toFixed(4)} GB
-            `;
+                rows = [
+                    [B(t('fldModel')), model],
+                    [B(t('fldHiddenSize')), hidden_size],
+                    [B(t('fldAttnHeads')), num_attention_heads],
+                    [B(t('fldLayers')), num_hidden_layers],
+                    [B(t('fldKvHeads')), num_key_value_heads],
+                    [B(t('fldHeadSize')), `${head_size} (${t('headSizeNote')})`],
+                    [B(t('fldDtypeSize')), `${dtype_size} ${t('bytes')}`],
+                    [B(t('fldTotalElem')), `2 × ${num_hidden_layers} × ${tokens} × ${num_key_value_heads} × ${head_size} = ${total_elements}`],
+                    [B(t('fldTotalBytes')), `${total_elements} × ${dtype_size} = ${total_bytes} ${t('bytes')}`],
+                    [B(t('fldKvSize')), `${total_bytes} / 1024³ ≈ ${kvCacheSizeGB.toFixed(4)} GB`],
+                ];
             }
-            // Display calculation steps
-            document.getElementById('calculation-steps').innerHTML = steps;
+
+            const stepsEl = document.getElementById('calculation-steps');
+            stepsEl.innerHTML = `<b>${t('detailsTitle')}</b><br><br>` +
+                rows.map(([k, v]) => `${k}: ${v}`).join('<br>');
+            stepsEl.style.display = 'block';
         }
 
-        // Add event listener for Enter key
         document.getElementById('tokens').addEventListener('keydown', function(event) {
-            if (event.key === 'Enter') {
-                calculateKVCache();
-            }
+            if (event.key === 'Enter') calculateKVCache();
         });
 
-        // Load model configurations when the page loads
         window.onload = function() {
             loadModelConfigs();
         };
diff --git a/examples/kv_cache_calculator/modelconfig.json b/examples/kv_cache_calculator/modelconfig.json
index 35d1d03756..6892175da4 100644
--- a/examples/kv_cache_calculator/modelconfig.json
+++ b/examples/kv_cache_calculator/modelconfig.json
@@ -128,14 +128,14 @@
         "num_hidden_layers": 32,
         "num_key_value_heads": 32
     },
-    "zai-org/GLM-4.5":{
+    "zai-org/GLM-4.5": {
         "hidden_size": 5120,
         "num_attention_heads": 96,
         "num_hidden_layers": 92,
         "num_key_value_heads": 8,
         "head_dim": 128
     },
-    "zai-org/GLM-4.6":{
+    "zai-org/GLM-4.6": {
         "hidden_size": 5120,
         "num_attention_heads": 96,
         "num_hidden_layers": 92,
@@ -155,5 +155,47 @@
         "num_hidden_layers": 94,
         "num_key_value_heads": 4,
         "head_dim": 128
+    },
+    "tencent/Hunyuan-0.5B-Instruct": {
+        "hidden_size": 1024,
+        "num_attention_heads": 16,
+        "num_hidden_layers": 24,
+        "num_key_value_heads": 8,
+        "head_dim": 128
+    },
+    "tencent/Hunyuan-1.8B-Instruct": {
+        "hidden_size": 2048,
+        "num_attention_heads": 16,
+        "num_hidden_layers": 32,
+        "num_key_value_heads": 4,
+        "head_dim": 128
+    },
+    "tencent/Hunyuan-4B-Instruct": {
+        "hidden_size": 3072,
+        "num_attention_heads": 32,
+        "num_hidden_layers": 36,
+        "num_key_value_heads": 8,
+        "head_dim": 128
+    },
+    "tencent/Hunyuan-7B-Instruct": {
+        "hidden_size": 4096,
+        "num_attention_heads": 32,
+        "num_hidden_layers": 32,
+        "num_key_value_heads": 8,
+        "head_dim": 128
+    },
+    "tencent/Hunyuan-A13B-Instruct": {
+        "hidden_size": 4096,
+        "num_attention_heads": 32,
+        "num_hidden_layers": 32,
+        "num_key_value_heads": 8,
+        "head_dim": 128
+    },
+    "tencent/Hunyuan-Large": {
+        "hidden_size": 6400,
+        "num_attention_heads": 80,
+        "num_hidden_layers": 64,
+        "num_key_value_heads": 8,
+        "cla_share_factor": 2
     }
 }
diff --git a/tests/benchmarks/test_xpu_kernels_microbench.py b/tests/benchmarks/test_xpu_kernels_microbench.py
index 7395389a06..1d94058896 100644
--- a/tests/benchmarks/test_xpu_kernels_microbench.py
+++ b/tests/benchmarks/test_xpu_kernels_microbench.py
@@ -31,7 +31,7 @@ def _xpu_sync():
 @pytest.fixture(scope="module")
 def xops():
     # First Party
-    import lmcache.xpu_ops as XOPS  # noqa: WPS433
+    import lmcache.xpu_ops as XOPS  # noqa: F401
 
     return XOPS
 

From fe8fb9d8b526f76e6f3fdb1ad4d145e1f2c9e3ad Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Tue, 9 Jun 2026 05:01:22 +0000
Subject: [PATCH 12/57] Update Chinese documentation translations (#3588)

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
---
 .../api_reference/configurations.po           |  25 +-
 .../locale/zh_CN/LC_MESSAGES/cli/bench.po     | 408 +++++++-----
 .../zh_CN/LC_MESSAGES/cli/coordinator.po      | 111 ++++
 .../locale/zh_CN/LC_MESSAGES/cli/index.po     |  72 ++-
 .../locale/zh_CN/LC_MESSAGES/cli/server.po    |  54 +-
 .../zh_CN/LC_MESSAGES/community/meetings.po   |  32 +-
 docs/source/locale/zh_CN/LC_MESSAGES/index.po |  56 +-
 .../kv_cache/storage_backends/nixl.po         | 195 ++++--
 .../zh_CN/LC_MESSAGES/mp/architecture.po      | 518 ++++++++++------
 .../zh_CN/LC_MESSAGES/mp/configuration.po     | 438 +++++++------
 .../zh_CN/LC_MESSAGES/mp/coordinator.po       | 222 +++++++
 .../locale/zh_CN/LC_MESSAGES/mp/deployment.po |  63 +-
 .../LC_MESSAGES/mp/frontend_dashboard.po      | 354 +++++++++++
 .../locale/zh_CN/LC_MESSAGES/mp/http_api.po   | 579 ++++++++++++------
 .../zh_CN/LC_MESSAGES/mp/hybrid_models.po     | 180 ++++++
 .../locale/zh_CN/LC_MESSAGES/mp/index.po      |  72 ++-
 .../zh_CN/LC_MESSAGES/mp/observability.po     | 521 ++++++++--------
 .../zh_CN/LC_MESSAGES/recipes/gemma3.po       | 151 +++++
 .../zh_CN/LC_MESSAGES/recipes/gemma4.po       |  87 ++-
 .../locale/zh_CN/LC_MESSAGES/recipes/index.po |  94 +--
 20 files changed, 3012 insertions(+), 1220 deletions(-)
 create mode 100644 docs/source/locale/zh_CN/LC_MESSAGES/cli/coordinator.po
 create mode 100644 docs/source/locale/zh_CN/LC_MESSAGES/mp/coordinator.po
 create mode 100644 docs/source/locale/zh_CN/LC_MESSAGES/mp/frontend_dashboard.po
 create mode 100644 docs/source/locale/zh_CN/LC_MESSAGES/mp/hybrid_models.po
 create mode 100644 docs/source/locale/zh_CN/LC_MESSAGES/recipes/gemma3.po

diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/api_reference/configurations.po b/docs/source/locale/zh_CN/LC_MESSAGES/api_reference/configurations.po
index 7b66731ee4..05f520586f 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/api_reference/configurations.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/api_reference/configurations.po
@@ -8,7 +8,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: LMCache \n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2026-05-29 22:44+0000\n"
+"POT-Creation-Date: 2026-06-08 10:45+0000\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language: zh_CN\n"
@@ -151,7 +151,9 @@ msgid ""
 " compatible with P2P mode or shared memory (multiprocess). Requires pre-"
 "allocated hugepages (``sysctl vm.nr_hugepages``). Values: true/false. "
 "Default: false"
-msgstr "是否使用 Linux 大页（2 MB）作为 CPU 固定的 KV Cache 内存。不兼容 P2P 模式或共享内存（多进程）。需要预分配大页（``sysctl vm.nr_hugepages``）。值：true/false。默认值：false"
+msgstr ""
+"是否使用 Linux 大页（2 MB）作为 CPU 固定的 KV Cache 内存。不兼容 P2P "
+"模式或共享内存（多进程）。需要预分配大页（``sysctl vm.nr_hugepages``）。值：true/false。默认值：false"
 
 #: ../../source/api_reference/configurations.rst:40
 msgid "local_disk"
@@ -1188,10 +1190,11 @@ msgstr "nixl_use_hugepages"
 
 #: ../../source/api_reference/configurations.rst:361
 msgid ""
-"Whether to use Linux hugepages (2 MiB) for the NIXL CPU buffer. Requires "
-"pre-allocated hugepages (``sysctl vm.nr_hugepages``). Values: true/false."
-" Default: false"
-msgstr "是否为 NIXL CPU 缓冲区使用 Linux 大页（2 MiB）。需要预分配大页（``sysctl vm.nr_hugepages``）。值：true/false。默认值：false"
+"**Deprecated.** Use ``local_cpu_use_hugepages`` instead. When set, the "
+"value is copied into ``local_cpu_use_hugepages`` (a warning is logged) "
+"and the key is dropped. Hugepages have never applied to GPU buffers; in "
+"CPU mode the NIXL pool is now owned by ``LocalCPUBackend``."
+msgstr "**已弃用。** 请改用 ``local_cpu_use_hugepages``。设置后，该值将被复制到 ``local_cpu_use_hugepages``（会记录警告），并且该键将被丢弃。大页从未应用于 GPU 缓冲区；在 CPU 模式下，NIXL 池现在由 ``LocalCPUBackend`` 拥有。"
 
 #: ../../source/api_reference/configurations.rst:365
 msgid "Additional Storage Configurations"
@@ -1626,3 +1629,13 @@ msgstr ""
 #~ "每个工作节点分发的对象存储端点 URL 列表。当设置时，会覆盖 "
 #~ "``nixl_backend_params.endpoint_override``。"
 
+#~ msgid ""
+#~ "Whether to use Linux hugepages (2 "
+#~ "MiB) for the NIXL CPU buffer. "
+#~ "Requires pre-allocated hugepages (``sysctl "
+#~ "vm.nr_hugepages``). Values: true/false. Default: "
+#~ "false"
+#~ msgstr ""
+#~ "是否为 NIXL CPU 缓冲区使用 Linux 大页（2 "
+#~ "MiB）。需要预分配大页（``sysctl vm.nr_hugepages``）。值：true/false。默认值：false"
+
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/cli/bench.po b/docs/source/locale/zh_CN/LC_MESSAGES/cli/bench.po
index a98b6ac246..70301d05ac 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/cli/bench.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/cli/bench.po
@@ -8,7 +8,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: LMCache \n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2026-06-01 10:55+0000\n"
+"POT-Creation-Date: 2026-06-08 10:45+0000\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language: zh_CN\n"
@@ -36,8 +36,8 @@ msgstr "子命令"
 #: ../../source/cli/bench.rst:14 ../../source/cli/bench.rst:126
 #: ../../source/cli/bench.rst:219 ../../source/cli/bench.rst:260
 #: ../../source/cli/bench.rst:311 ../../source/cli/bench.rst:398
-#: ../../source/cli/bench.rst:462 ../../source/cli/bench.rst:735
-#: ../../source/cli/bench.rst:988
+#: ../../source/cli/bench.rst:462 ../../source/cli/bench.rst:736
+#: ../../source/cli/bench.rst:1002
 msgid "Description"
 msgstr "描述"
 
@@ -173,7 +173,7 @@ msgstr "常规选项"
 #: ../../source/cli/bench.rst:124 ../../source/cli/bench.rst:217
 #: ../../source/cli/bench.rst:258 ../../source/cli/bench.rst:309
 #: ../../source/cli/bench.rst:396 ../../source/cli/bench.rst:460
-#: ../../source/cli/bench.rst:733 ../../source/cli/bench.rst:986
+#: ../../source/cli/bench.rst:734 ../../source/cli/bench.rst:1000
 msgid "Flag"
 msgstr "标志"
 
@@ -372,8 +372,8 @@ msgstr "模拟对长文档的重复问答。热身阶段将每个文档发送一
 
 #: ../../source/cli/bench.rst:218 ../../source/cli/bench.rst:259
 #: ../../source/cli/bench.rst:310 ../../source/cli/bench.rst:397
-#: ../../source/cli/bench.rst:461 ../../source/cli/bench.rst:734
-#: ../../source/cli/bench.rst:987
+#: ../../source/cli/bench.rst:461 ../../source/cli/bench.rst:735
+#: ../../source/cli/bench.rst:1001
 msgid "Default"
 msgstr "默认"
 
@@ -939,18 +939,18 @@ msgstr "两个文件都写入到 ``--output-dir``（默认：当前目录）。"
 msgid "Exit Codes"
 msgstr "退出代码"
 
-#: ../../source/cli/bench.rst:650 ../../source/cli/bench.rst:836
-#: ../../source/cli/bench.rst:1167
+#: ../../source/cli/bench.rst:650 ../../source/cli/bench.rst:845
+#: ../../source/cli/bench.rst:1181
 msgid "Code"
 msgstr "代码"
 
-#: ../../source/cli/bench.rst:651 ../../source/cli/bench.rst:837
-#: ../../source/cli/bench.rst:1168
+#: ../../source/cli/bench.rst:651 ../../source/cli/bench.rst:846
+#: ../../source/cli/bench.rst:1182
 msgid "Meaning"
 msgstr "含义"
 
-#: ../../source/cli/bench.rst:652 ../../source/cli/bench.rst:757
-#: ../../source/cli/bench.rst:838 ../../source/cli/bench.rst:1169
+#: ../../source/cli/bench.rst:652 ../../source/cli/bench.rst:766
+#: ../../source/cli/bench.rst:847 ../../source/cli/bench.rst:1183
 msgid "``0``"
 msgstr "``0``"
 
@@ -958,9 +958,9 @@ msgstr "``0``"
 msgid "All requests succeeded."
 msgstr "所有请求均成功。"
 
-#: ../../source/cli/bench.rst:654 ../../source/cli/bench.rst:841
-#: ../../source/cli/bench.rst:1001 ../../source/cli/bench.rst:1009
-#: ../../source/cli/bench.rst:1012 ../../source/cli/bench.rst:1172
+#: ../../source/cli/bench.rst:654 ../../source/cli/bench.rst:850
+#: ../../source/cli/bench.rst:1015 ../../source/cli/bench.rst:1023
+#: ../../source/cli/bench.rst:1026 ../../source/cli/bench.rst:1186
 msgid "``1``"
 msgstr "``1``"
 
@@ -979,290 +979,326 @@ msgid ""
 " over ZMQ and exercises the full KV-cache data path for a sequence of "
 "synthetic requests, then optionally verifies per-chunk checksums through "
 "the HTTP API."
-msgstr "``lmcache bench server`` 命令是对 LMCache 多进程 (MP) 缓存服务器的端到端完整性测试。它通过 ZMQ 连接到正在运行的服务器，并对一系列合成请求执行完整的 KV Cache 数据路径，然后可选地通过 HTTP API 验证每个块的校验和。"
+msgstr ""
+"``lmcache bench server`` 命令是对 LMCache 多进程 (MP) 缓存服务器的端到端完整性测试。它通过 ZMQ "
+"连接到正在运行的服务器，并对一系列合成请求执行完整的 KV Cache 数据路径，然后可选地通过 HTTP API 验证每个块的校验和。"
 
 #: ../../source/cli/bench.rst:673
 msgid ""
 "Unlike :ref:`lmcache bench engine <lmcache-bench-engine>`, this command "
 "does **not** require an inference engine. It only needs a running LMCache"
-" MP server (ZMQ + HTTP) and a GPU. It also requires the full ``lmcache`` "
-"install (not the lightweight ``lmcache-cli`` package)."
-msgstr "与 :ref:`lmcache bench engine <lmcache-bench-engine>` 不同，此命令**不**需要推理引擎。它只需要一个运行中的 LMCache MP 服务器（ZMQ + HTTP）和一个 GPU。它还需要完整的 ``lmcache`` 安装（而不是轻量级的 ``lmcache-cli`` 包）。"
+" MP server (ZMQ + HTTP). GPU mode additionally requires a CUDA-capable "
+"device. It also requires the full ``lmcache`` install (not the "
+"lightweight ``lmcache-cli`` package)."
+msgstr "与 :ref:`lmcache bench engine <lmcache-bench-engine>` 不同，此命令**不**需要推理引擎。它只需要一个正在运行的 LMCache MP 服务器 (ZMQ + HTTP)。GPU 模式还额外需要一个支持 CUDA 的设备。它还需要完整的 ``lmcache`` 安装（而不是轻量级的 ``lmcache-cli`` 包）。"
 
-#: ../../source/cli/bench.rst:680 ../../source/cli/bench.rst:869
+#: ../../source/cli/bench.rst:681 ../../source/cli/bench.rst:883
 msgid "What it does"
 msgstr "它的功能"
 
-#: ../../source/cli/bench.rst:682
+#: ../../source/cli/bench.rst:683
 msgid "For each sequence in ``[--start, --end)``, the tool runs two passes:"
 msgstr "对于 ``[--start, --end)`` 中的每个序列，该工具运行两个阶段："
 
-#: ../../source/cli/bench.rst:684
+#: ../../source/cli/bench.rst:685
 msgid ""
 "**Cold pass** -- ``LOOKUP`` is expected to miss, so the generated KV "
 "tensors are ``STORE``\\ d on the server."
 msgstr "**冷启动** -- ``LOOKUP`` 预计会未命中，因此生成的 KV 张量会被 ``STORE``\\\\ 存储在服务器上。"
 
-#: ../../source/cli/bench.rst:686
+#: ../../source/cli/bench.rst:687
 msgid ""
 "**Warm pass** -- ``LOOKUP`` is expected to hit; the tool issues "
 "``RETRIEVE`` and compares the retrieved KV chunks' checksums to the "
 "originals."
 msgstr "**热通道** -- ``LOOKUP`` 预计会命中；工具发出 ``RETRIEVE`` 并将检索到的 KV 块的校验和与原始值进行比较。"
 
-#: ../../source/cli/bench.rst:690
+#: ../../source/cli/bench.rst:691
 msgid "The full RPC path exercised is::"
 msgstr "完整的 RPC 路径是::"
 
-#: ../../source/cli/bench.rst:696
+#: ../../source/cli/bench.rst:697
 msgid ""
 "When ``--url`` points to the server's HTTP endpoint, per-chunk checksums "
 "are additionally cross-checked against the server-side computation, so a "
 "mismatch between producer and consumer surfaces as a loud ``CHECKSUM "
 "MISMATCH`` log line."
-msgstr "当 ``--url`` 指向服务器的 HTTP 端点时，逐块的校验和会额外与服务器端的计算进行交叉检查，因此生产者和消费者之间的不匹配会以显眼的 ``CHECKSUM MISMATCH`` 日志行显示出来。"
+msgstr ""
+"当 ``--url`` 指向服务器的 HTTP 端点时，逐块的校验和会额外与服务器端的计算进行交叉检查，因此生产者和消费者之间的不匹配会以显眼的 "
+"``CHECKSUM MISMATCH`` 日志行显示出来。"
 
-#: ../../source/cli/bench.rst:703 ../../source/cli/bench.rst:920
+#: ../../source/cli/bench.rst:704 ../../source/cli/bench.rst:934
 msgid "Quick start"
 msgstr "快速开始"
 
-#: ../../source/cli/bench.rst:705
+#: ../../source/cli/bench.rst:706
 msgid "Start the MP server in one terminal:"
 msgstr "在一个终端中启动 MP 服务器："
 
-#: ../../source/cli/bench.rst:714
+#: ../../source/cli/bench.rst:715
 msgid "Then in another terminal:"
 msgstr "然后在另一个终端中："
 
-#: ../../source/cli/bench.rst:722
+#: ../../source/cli/bench.rst:723
 msgid ""
 "By default the tool runs forever (``--end`` unset); stop it with "
 "``Ctrl-C`` at any time. Pass ``--end N`` for a bounded run."
-msgstr "默认情况下，该工具会一直运行（``--end`` 未设置）；您可以随时使用 ``Ctrl-C`` 停止它。传递 ``--end N`` 以进行有限运行。"
+msgstr ""
+"默认情况下，该工具会一直运行（``--end`` 未设置）；您可以随时使用 ``Ctrl-C`` 停止它。传递 ``--end N`` "
+"以进行有限运行。"
 
-#: ../../source/cli/bench.rst:727 ../../source/cli/bench.rst:980
+#: ../../source/cli/bench.rst:728 ../../source/cli/bench.rst:994
 msgid "Options"
 msgstr "选项"
 
-#: ../../source/cli/bench.rst:736
+#: ../../source/cli/bench.rst:737
 msgid "``--rpc-url URL``"
 msgstr "``--rpc-url URL``"
 
-#: ../../source/cli/bench.rst:737
+#: ../../source/cli/bench.rst:738
 msgid "``tcp://localhost:5555``"
 msgstr "``tcp://localhost:5555``"
 
-#: ../../source/cli/bench.rst:738
+#: ../../source/cli/bench.rst:739
 msgid "ZMQ endpoint of the MP cache server."
 msgstr "MP 缓存服务器的 ZMQ 端点。"
 
-#: ../../source/cli/bench.rst:739
+#: ../../source/cli/bench.rst:740
 msgid "``--url URL``"
 msgstr "``--url URL``"
 
-#: ../../source/cli/bench.rst:740
+#: ../../source/cli/bench.rst:741
 msgid "``http://localhost:8080``"
 msgstr "``http://localhost:8080``"
 
-#: ../../source/cli/bench.rst:741
+#: ../../source/cli/bench.rst:742
 msgid ""
 "HTTP base URL of the server's checksum API. Used to verify per-chunk "
 "checksums end-to-end."
 msgstr "服务器的校验和 API 的 HTTP 基础 URL。用于端到端验证每个块的校验和。"
 
-#: ../../source/cli/bench.rst:743
+#: ../../source/cli/bench.rst:744
 #, python-brace-format
-msgid "``--mode {gpu}``"
-msgstr "``--mode {gpu}``"
+msgid "``--mode {gpu,cpu}``"
+msgstr "``--mode {gpu,cpu}``"
 
-#: ../../source/cli/bench.rst:744
+#: ../../source/cli/bench.rst:745
 msgid "``gpu``"
 msgstr "``gpu``"
 
-#: ../../source/cli/bench.rst:745
+#: ../../source/cli/bench.rst:746
+msgid ""
+"Run mode. ``gpu`` allocates real CUDA tensors and uses CUDA IPC (handle "
+"path). ``cpu`` allocates POSIX-SHM-backed tensors and uses the data-"
+"transfer path (gather/scatter via slot descriptors)."
+msgstr "运行模式。``gpu`` 分配真实的 CUDA 张量并使用 CUDA IPC（句柄路径）。``cpu`` 分配基于 POSIX-SHM 的张量并使用数据传输路径（通过槽描述符进行聚集/分散）。"
+
+#: ../../source/cli/bench.rst:749
+#, python-brace-format
+msgid "``--transfer-mode {auto,handle,data}``"
+msgstr "``--transfer-mode {auto,handle,data}``"
+
+#: ../../source/cli/bench.rst:750
+msgid "``auto``"
+msgstr "``auto``"
+
+#: ../../source/cli/bench.rst:751
 msgid ""
-"Run mode. Only ``gpu`` is supported today; CPU mode is a planned follow-"
-"up."
-msgstr "运行模式。目前仅支持 ``gpu``；CPU 模式是计划中的后续功能。"
+"Transport routing for STORE/RETRIEVE. ``handle`` forces the single-shot "
+"path (``REGISTER_KV_CACHE`` + ``STORE``/``RETRIEVE``). ``data`` forces "
+"the two-phase gather/scatter path (``REGISTER_KV_CACHE_NON_GPU_CONTEXT`` "
+"+ ``PREPARE``/``COMMIT``). ``auto`` maps gpu→handle and cpu→data."
+msgstr "STORE/RETRIEVE 的传输路由。``handle`` 强制执行单次路径 (``REGISTER_KV_CACHE`` + ``STORE``/``RETRIEVE``)。``data`` 强制执行两阶段的收集/散布路径 (``REGISTER_KV_CACHE_NON_GPU_CONTEXT`` + ``PREPARE``/``COMMIT``)。``auto`` 将 gpu 映射到 handle，将 cpu 映射到 data。"
 
-#: ../../source/cli/bench.rst:747
+#: ../../source/cli/bench.rst:756
 msgid "``--num-tokens N``"
 msgstr "``--num-tokens N``"
 
-#: ../../source/cli/bench.rst:748
+#: ../../source/cli/bench.rst:757
 msgid "``512``"
 msgstr "``512``"
 
-#: ../../source/cli/bench.rst:749
+#: ../../source/cli/bench.rst:758
 msgid "Tokens per synthetic request."
 msgstr "每个合成请求的令牌数。"
 
-#: ../../source/cli/bench.rst:750
+#: ../../source/cli/bench.rst:759
 msgid "``--num-blocks N``"
 msgstr "``--num-blocks N``"
 
-#: ../../source/cli/bench.rst:751
+#: ../../source/cli/bench.rst:760
 msgid "``1024``"
 msgstr "``1024``"
 
-#: ../../source/cli/bench.rst:752
+#: ../../source/cli/bench.rst:761
 msgid "Number of paged blocks allocated on the GPU."
 msgstr "在 GPU 上分配的分页块数量。"
 
-#: ../../source/cli/bench.rst:753
+#: ../../source/cli/bench.rst:762
 msgid "``--block-size N``"
 msgstr "``--block-size N``"
 
-#: ../../source/cli/bench.rst:754
+#: ../../source/cli/bench.rst:763
 msgid "``16``"
 msgstr "``16``"
 
-#: ../../source/cli/bench.rst:755
+#: ../../source/cli/bench.rst:764
 msgid "Tokens per paged block."
 msgstr "每个分页块的令牌数。"
 
-#: ../../source/cli/bench.rst:756
+#: ../../source/cli/bench.rst:765
 msgid "``--start N``"
 msgstr "``--start N``"
 
-#: ../../source/cli/bench.rst:758
+#: ../../source/cli/bench.rst:767
 msgid "First sequence number to run."
 msgstr "运行的第一个序列号。"
 
-#: ../../source/cli/bench.rst:759
+#: ../../source/cli/bench.rst:768
 msgid "``--end N``"
 msgstr "``--end N``"
 
-#: ../../source/cli/bench.rst:760 ../../source/cli/bench.rst:990
-#: ../../source/cli/bench.rst:1030
+#: ../../source/cli/bench.rst:769 ../../source/cli/bench.rst:1004
+#: ../../source/cli/bench.rst:1044
 msgid "*(unset)*"
 msgstr "*(未设置)*"
 
-#: ../../source/cli/bench.rst:761
+#: ../../source/cli/bench.rst:770
 msgid ""
 "Exclusive upper bound on sequence numbers. When omitted the loop runs "
 "forever."
 msgstr "序列号的独占上限。当省略时，循环将无限运行。"
 
-#: ../../source/cli/bench.rst:763
+#: ../../source/cli/bench.rst:772
 msgid "``--interval SECS``"
 msgstr "``--interval SECS``"
 
-#: ../../source/cli/bench.rst:764
+#: ../../source/cli/bench.rst:773
 msgid "``0.5``"
 msgstr "``0.5``"
 
-#: ../../source/cli/bench.rst:765
+#: ../../source/cli/bench.rst:774
 msgid "Delay between successive sub-passes."
 msgstr "连续子通道之间的延迟。"
 
-#: ../../source/cli/bench.rst:766
+#: ../../source/cli/bench.rst:775
 msgid "``--kvcache-shape-spec SPEC``"
 msgstr "``--kvcache-shape-spec SPEC``"
 
-#: ../../source/cli/bench.rst:767
+#: ../../source/cli/bench.rst:776
 msgid "``(2,1024,16,8,128):float16:32``"
 msgstr "``(2,1024,16,8,128):float16:32``"
 
-#: ../../source/cli/bench.rst:768
+#: ../../source/cli/bench.rst:777
 msgid "KV cache shape spec (see below)."
 msgstr "KV Cache 形状规格（见下文）。"
 
-#: ../../source/cli/bench.rst:772
+#: ../../source/cli/bench.rst:781
 msgid "KV cache shape spec"
 msgstr "KV 缓存形状规格"
 
-#: ../../source/cli/bench.rst:774
+#: ../../source/cli/bench.rst:783
 msgid ""
 "The ``--kvcache-shape-spec`` flag describes how KV tensors are laid out "
 "on the GPU. A spec is one or more groups separated by ``;``:"
 msgstr "``--kvcache-shape-spec`` 标志描述了 KV 张量在 GPU 上的布局。规格是一个或多个用 ``;`` 分隔的组："
 
-#: ../../source/cli/bench.rst:781
+#: ../../source/cli/bench.rst:790
 msgid "Fields:"
 msgstr "字段："
 
-#: ../../source/cli/bench.rst:783
+#: ../../source/cli/bench.rst:792
 msgid "``kv_size`` -- 2 for classical attention (separate K/V), 1 for MLA."
 msgstr "``kv_size`` -- 经典注意力（分开的 K/V）为 2，MLA 为 1。"
 
-#: ../../source/cli/bench.rst:784
+#: ../../source/cli/bench.rst:793
 msgid "``NB`` -- number of paged blocks."
 msgstr "``NB`` -- 页块数量。"
 
-#: ../../source/cli/bench.rst:785
+#: ../../source/cli/bench.rst:794
 msgid "``BS`` -- block size (tokens per block)."
 msgstr "``BS`` -- 块大小（每块的令牌数）。"
 
-#: ../../source/cli/bench.rst:786
+#: ../../source/cli/bench.rst:795
 msgid "``NH`` -- number of attention heads per layer."
 msgstr "``NH`` -- 每层的注意力头数量。"
 
-#: ../../source/cli/bench.rst:787
+#: ../../source/cli/bench.rst:796
 msgid "``HS`` -- head size (in elements)."
 msgstr "``HS`` -- 头大小（以元素为单位）。"
 
-#: ../../source/cli/bench.rst:788
+#: ../../source/cli/bench.rst:797
 msgid ""
 "``dtype`` -- element dtype (e.g. ``float16``, ``bfloat16``, ``float32``, "
 "``uint8``). The full set matches the keys of ``DTYPE_MAP`` in "
 "``lmcache/v1/kv_layer_groups.py``."
-msgstr "``dtype`` -- 元素数据类型（例如 ``float16``、``bfloat16``、``float32``、``uint8``）。完整的集合与 ``lmcache/v1/kv_layer_groups.py`` 中的 ``DTYPE_MAP`` 键匹配。"
+msgstr ""
+"``dtype`` -- 元素数据类型（例如 "
+"``float16``、``bfloat16``、``float32``、``uint8``）。完整的集合与 "
+"``lmcache/v1/kv_layer_groups.py`` 中的 ``DTYPE_MAP`` 键匹配。"
 
-#: ../../source/cli/bench.rst:791
+#: ../../source/cli/bench.rst:800
 msgid "``layers`` -- number of layers in this group."
 msgstr "``layers`` -- 该组中的层数。"
 
-#: ../../source/cli/bench.rst:793
+#: ../../source/cli/bench.rst:802
 msgid ""
 "Multi-group specs let you model heterogeneous layers (for example, MLA "
 "layers + classical attention layers in the same model):"
 msgstr "多组规格允许您建模异构层（例如，在同一模型中结合 MLA 层和经典注意力层）："
 
-#: ../../source/cli/bench.rst:802
+#: ../../source/cli/bench.rst:811
 msgid ""
 "All groups must share the same ``NB`` and ``BS`` (this is a physical "
 "constraint of paged KV). Layer counts across groups sum to the total "
 "layer count registered with the server."
 msgstr "所有组必须共享相同的 ``NB`` 和 ``BS``（这是分页 KV 的物理限制）。各组的层数总和等于注册到服务器的总层数。"
 
-#: ../../source/cli/bench.rst:806
+#: ../../source/cli/bench.rst:815
 msgid ""
 "See ``parse_kvcache_shape_spec`` in ``lmcache/v1/kv_layer_groups.py`` for"
 " the authoritative parsing rules and validation errors."
-msgstr "请参阅 ``parse_kvcache_shape_spec`` 在 ``lmcache/v1/kv_layer_groups.py`` 中的权威解析规则和验证错误。"
+msgstr ""
+"请参阅 ``parse_kvcache_shape_spec`` 在 ``lmcache/v1/kv_layer_groups.py`` "
+"中的权威解析规则和验证错误。"
 
-#: ../../source/cli/bench.rst:811 ../../source/cli/bench.rst:1061
+#: ../../source/cli/bench.rst:820 ../../source/cli/bench.rst:1075
 msgid "Example output"
 msgstr "示例输出"
 
-#: ../../source/cli/bench.rst:825
+#: ../../source/cli/bench.rst:834
 msgid ""
 "Any ``CHECKSUM MISMATCH``, ``ERROR``, or Python traceback in the log "
 "indicates a real problem worth investigating."
 msgstr "日志中的任何 ``CHECKSUM MISMATCH``、``ERROR`` 或 Python 回溯都表明存在值得调查的实际问题。"
 
-#: ../../source/cli/bench.rst:830 ../../source/cli/bench.rst:1161
+#: ../../source/cli/bench.rst:839 ../../source/cli/bench.rst:1175
 msgid "Exit codes"
 msgstr "退出代码"
 
-#: ../../source/cli/bench.rst:839
+#: ../../source/cli/bench.rst:848
 msgid ""
 "Test loop completed (or was interrupted cleanly with Ctrl-C) with no "
 "checksum mismatches."
 msgstr "测试循环完成（或通过 Ctrl-C 干净地中断），没有校验和不匹配。"
 
-#: ../../source/cli/bench.rst:842
+#: ../../source/cli/bench.rst:851
 msgid ""
 "Fatal error (for example, CUDA unavailable in ``--mode gpu``, server "
 "unreachable, or a checksum mismatch)."
 msgstr "致命错误（例如，在 ``--mode gpu`` 中 CUDA 不可用、服务器无法访问或校验和不匹配）。"
 
-#: ../../source/cli/bench.rst:849
+#: ../../source/cli/bench.rst:856
+msgid ""
+"``--transfer-mode handle`` on CPU mode is not yet implemented and will be"
+" added in a future release."
+msgstr "``--transfer-mode handle`` 在 CPU 模式下尚未实现，将在未来的版本中添加。"
+
+#: ../../source/cli/bench.rst:863
 msgid "l2"
 msgstr "l2"
 
-#: ../../source/cli/bench.rst:851
+#: ../../source/cli/bench.rst:865
 msgid ""
 "The ``lmcache bench l2`` command benchmarks an L2 cache adapter (e.g. the"
 " local-filesystem adapter) end-to-end through the same "
@@ -1270,59 +1306,74 @@ msgid ""
 "that LMCache uses in production. Any registered adapter type can be "
 "tested without code changes: you describe the adapter with a single JSON "
 "spec and pick the operations to exercise."
-msgstr "``lmcache bench l2`` 命令通过 LMCache 在生产中使用的相同 ``parse_args_to_l2_adapters_config`` + ``create_l2_adapter`` 流水线，对 L2 缓存适配器（例如本地文件系统适配器）进行端到端基准测试。任何注册的适配器类型都可以在不更改代码的情况下进行测试：您只需用一个 JSON 规范描述适配器，并选择要执行的操作。"
+msgstr ""
+"``lmcache bench l2`` 命令通过 LMCache 在生产中使用的相同 "
+"``parse_args_to_l2_adapters_config`` + ``create_l2_adapter`` 流水线，对 L2 "
+"缓存适配器（例如本地文件系统适配器）进行端到端基准测试。任何注册的适配器类型都可以在不更改代码的情况下进行测试：您只需用一个 JSON "
+"规范描述适配器，并选择要执行的操作。"
 
-#: ../../source/cli/bench.rst:862
+#: ../../source/cli/bench.rst:876
 msgid ""
 "Unlike :ref:`lmcache bench engine <lmcache-bench-engine>`, this command "
 "does **not** require an inference engine or an LMCache MP server. It only"
 " needs the adapter's own backing storage to be reachable (for the ``fs`` "
 "adapter, that simply means a writable directory)."
-msgstr "与 :ref:`lmcache bench engine <lmcache-bench-engine>` 不同，此命令 **不** 需要推理引擎或 LMCache MP 服务器。它只需要适配器自身的后端存储可访问（对于 ``fs`` 适配器，这仅意味着一个可写的目录）。"
+msgstr ""
+"与 :ref:`lmcache bench engine <lmcache-bench-engine>` 不同，此命令 **不** 需要推理引擎或"
+" LMCache MP 服务器。它只需要适配器自身的后端存储可访问（对于 ``fs`` 适配器，这仅意味着一个可写的目录）。"
 
-#: ../../source/cli/bench.rst:871
+#: ../../source/cli/bench.rst:885
 msgid ""
 "For each measured operation the tool drives the adapter directly via its "
 "public submit/wait API:"
 msgstr "对于每个测量的操作，该工具通过适配器的公共提交/等待 API 直接驱动适配器："
 
-#: ../../source/cli/bench.rst:874
+#: ../../source/cli/bench.rst:888
 msgid ""
 "``Store``  -- ``submit_store_task`` writes ``num_keys`` MemoryObjs per "
 "submit and waits for the store eventfd."
-msgstr "``Store``  -- ``submit_store_task`` 写入 ``num_keys`` 个 MemoryObjs 每次提交，并等待存储 eventfd。"
+msgstr ""
+"``Store``  -- ``submit_store_task`` 写入 ``num_keys`` 个 MemoryObjs "
+"每次提交，并等待存储 eventfd。"
 
-#: ../../source/cli/bench.rst:876
+#: ../../source/cli/bench.rst:890
 msgid ""
 "``Lookup`` -- ``submit_lookup_and_lock_task`` checks key existence (no "
 "payload transfer) and waits for the lookup eventfd."
 msgstr "``查找`` -- ``submit_lookup_and_lock_task`` 检查键的存在性（不传输有效负载）并等待查找 eventfd。"
 
-#: ../../source/cli/bench.rst:878
+#: ../../source/cli/bench.rst:892
 msgid ""
 "``Load``   -- ``submit_load_task`` reads ``num_keys`` MemoryObjs per "
 "submit and waits for the load eventfd."
-msgstr "``Load``   -- ``submit_load_task`` 读取 ``num_keys`` MemoryObjs 每次提交，并等待加载 eventfd。"
+msgstr ""
+"``Load``   -- ``submit_load_task`` 读取 ``num_keys`` MemoryObjs 每次提交，并等待加载 "
+"eventfd。"
 
-#: ../../source/cli/bench.rst:881
+#: ../../source/cli/bench.rst:895
 msgid ""
 "Each measured **round** issues ``--in-flight`` submits sequentially from "
 "a single producer thread and then waits for all of them to complete; the "
 "round duration is the wall-clock time from the first submit until the "
 "last completion. Warmup rounds run before measurement and their results "
 "are discarded from the final summary."
-msgstr "每个测量的 **轮次** 从单个生产者线程顺序发出 ``--in-flight`` 提交，然后等待所有提交完成；轮次持续时间是从第一次提交到最后一次完成的墙钟时间。热身轮次在测量之前运行，其结果会从最终摘要中丢弃。"
+msgstr ""
+"每个测量的 **轮次** 从单个生产者线程顺序发出 ``--in-flight`` "
+"提交，然后等待所有提交完成；轮次持续时间是从第一次提交到最后一次完成的墙钟时间。热身轮次在测量之前运行，其结果会从最终摘要中丢弃。"
 
-#: ../../source/cli/bench.rst:887
+#: ../../source/cli/bench.rst:901
 msgid ""
 "All three operations share the same key idx universe, so running ``--only"
 " store`` followed by ``--only load`` (or ``--only lookup``) with "
 "identical other flags hits exactly the same keys. This makes the "
 "benchmark useful as a quick regression test for adapters that should "
 "support a clean store -> load round-trip."
-msgstr "这三种操作共享相同的键 idx 宇宙，因此运行 ``--only store`` 然后是 ``--only load``（或 ``--only lookup``）并使用相同的其他标志时，正好命中相同的键。这使得基准测试作为适配器的快速回归测试变得有用，适配器应该支持干净的存储 -> 加载往返。"
+msgstr ""
+"这三种操作共享相同的键 idx 宇宙，因此运行 ``--only store`` 然后是 ``--only load``（或 ``--only "
+"lookup``）并使用相同的其他标志时，正好命中相同的键。这使得基准测试作为适配器的快速回归测试变得有用，适配器应该支持干净的存储 -> "
+"加载往返。"
 
-#: ../../source/cli/bench.rst:895
+#: ../../source/cli/bench.rst:909
 msgid ""
 "When ``--only`` is not given, the three operations are run **in a single "
 "process in the order** ``store -> lookup -> load``. For adapters whose "
@@ -1331,64 +1382,72 @@ msgid ""
 " -- this means ``lookup`` and ``load`` will almost always observe the "
 "data that ``store`` just wrote still hot in RAM, and the reported numbers"
 " reflect page-cache throughput rather than the underlying device."
-msgstr "当未给出 ``--only`` 时，这三个操作将在 **单个进程中按顺序** 执行 ``store -> lookup -> load``。对于其后端存储位于操作系统级缓存之后的适配器——尤其是本地文件系统（``fs``）适配器，它受 Linux **页面缓存** 的影响——这意味着 ``lookup`` 和 ``load`` 几乎总是会观察到 ``store`` 刚刚写入的数据仍然在 RAM 中热存储，报告的数字反映的是页面缓存的吞吐量，而不是底层设备的吞吐量。"
+msgstr ""
+"当未给出 ``--only`` 时，这三个操作将在 **单个进程中按顺序** 执行 ``store -> lookup -> "
+"load``。对于其后端存储位于操作系统级缓存之后的适配器——尤其是本地文件系统（``fs``）适配器，它受 Linux **页面缓存** "
+"的影响——这意味着 ``lookup`` 和 ``load`` 几乎总是会观察到 ``store`` 刚刚写入的数据仍然在 RAM "
+"中热存储，报告的数字反映的是页面缓存的吞吐量，而不是底层设备的吞吐量。"
 
-#: ../../source/cli/bench.rst:904
+#: ../../source/cli/bench.rst:918
 msgid ""
 "To benchmark each operation against a cold cache, run them separately "
 "with ``--only`` and drop the OS caches in between, for example::"
 msgstr "要对每个操作在冷缓存下进行基准测试，请使用 ``--only`` 单独运行它们，并在之间清除操作系统缓存，例如::"
 
-#: ../../source/cli/bench.rst:914
+#: ../../source/cli/bench.rst:928
 msgid ""
 "For adapters that bypass the page cache (e.g. ``fs`` with "
 "``\"use_odirect\": true``) or that talk to a remote service without a "
 "local cache, the default combined run is usually fine."
-msgstr "对于绕过页面缓存的适配器（例如，使用 ``\"use_odirect\": true`` 的 ``fs``）或与没有本地缓存的远程服务通信的适配器，默认的组合运行通常是可以的。"
+msgstr ""
+"对于绕过页面缓存的适配器（例如，使用 ``\"use_odirect\": true`` 的 "
+"``fs``）或与没有本地缓存的远程服务通信的适配器，默认的组合运行通常是可以的。"
 
-#: ../../source/cli/bench.rst:922
+#: ../../source/cli/bench.rst:936
 msgid "Benchmark the local filesystem adapter with default parameters:"
 msgstr "使用默认参数对本地文件系统适配器进行基准测试："
 
-#: ../../source/cli/bench.rst:929
+#: ../../source/cli/bench.rst:943
 msgid ""
 "This runs all three operations (store, lookup, load) with one warmup "
 "round and one measurement round."
 msgstr "这将运行所有三个操作（存储、查找、加载），并进行一次预热轮次和一次测量轮次。"
 
-#: ../../source/cli/bench.rst:932
+#: ../../source/cli/bench.rst:946
 msgid "Stress the adapter with more in-flight submits and larger payloads:"
 msgstr "对适配器施加更多的并发提交和更大的负载："
 
-#: ../../source/cli/bench.rst:942
+#: ../../source/cli/bench.rst:956
 msgid "Run only one operation (useful to isolate store vs. load throughput):"
 msgstr "仅运行一个操作（有助于隔离存储与加载的吞吐量）："
 
-#: ../../source/cli/bench.rst:950
+#: ../../source/cli/bench.rst:964
 msgid ""
 "Lookup with a controlled hit rate (the benchmark splits the lookup keys "
 "between a potentially-existing range and a guaranteed-non-existent "
 "range):"
 msgstr "使用受控命中率的查找（基准测试将查找键分为一个潜在存在的范围和一个保证不存在的范围）："
 
-#: ../../source/cli/bench.rst:960
+#: ../../source/cli/bench.rst:974
 msgid ""
 "Enable a store -> load round-trip data integrity check on the last "
 "measured round:"
 msgstr "在最后一次测量的轮次上启用存储 -> 加载往返数据完整性检查："
 
-#: ../../source/cli/bench.rst:969
+#: ../../source/cli/bench.rst:983
 msgid ""
 "If you prefer to keep the JSON spec out of the command line, set the "
 "``L2_ADAPTER_JSON`` environment variable instead of passing "
 "``--l2-adapter``:"
-msgstr "如果您希望将 JSON 规范保留在命令行之外，请设置 ``L2_ADAPTER_JSON`` 环境变量，而不是传递 ``--l2-adapter``："
+msgstr ""
+"如果您希望将 JSON 规范保留在命令行之外，请设置 ``L2_ADAPTER_JSON`` 环境变量，而不是传递 "
+"``--l2-adapter``："
 
-#: ../../source/cli/bench.rst:989
+#: ../../source/cli/bench.rst:1003
 msgid "``--l2-adapter JSON``"
 msgstr "``--l2-adapter JSON``"
 
-#: ../../source/cli/bench.rst:991
+#: ../../source/cli/bench.rst:1005
 #, python-brace-format
 msgid ""
 "L2 adapter spec as JSON with a ``\"type\"`` field plus adapter-specific "
@@ -1396,200 +1455,225 @@ msgid ""
 "be passed multiple times; only the first spec is benchmarked. If not "
 "provided, falls back to the ``L2_ADAPTER_JSON`` environment variable. "
 "Either the flag or the env var is **required**."
-msgstr "L2 适配器规范以 JSON 格式提供，包含 ``\"type\"`` 字段和适配器特定的配置，例如 ``'{\\\"type\\\":\\\"fs\\\",\\\"base_path\\\":\\\"/tmp/bench\\\"}'``。可以多次传递；只有第一个规范会被基准测试。如果未提供，则回退到 ``L2_ADAPTER_JSON`` 环境变量。标志或环境变量 **必需**。"
+msgstr ""
+"L2 适配器规范以 JSON 格式提供，包含 ``\"type\"`` 字段和适配器特定的配置，例如 "
+"``'{\\\"type\\\":\\\"fs\\\",\\\"base_path\\\":\\\"/tmp/bench\\\"}'``。可以多次传递；只有第一个规范会被基准测试。如果未提供，则回退到"
+" ``L2_ADAPTER_JSON`` 环境变量。标志或环境变量 **必需**。"
 
-#: ../../source/cli/bench.rst:997
+#: ../../source/cli/bench.rst:1011
 msgid "``--num-keys N``"
 msgstr "``--num-keys N``"
 
-#: ../../source/cli/bench.rst:998
+#: ../../source/cli/bench.rst:1012
 msgid "``32``"
 msgstr "``32``"
 
-#: ../../source/cli/bench.rst:999
+#: ../../source/cli/bench.rst:1013
 msgid "Number of keys per submit."
 msgstr "每次提交的键数量。"
 
-#: ../../source/cli/bench.rst:1000
+#: ../../source/cli/bench.rst:1014
 msgid "``--in-flight N``"
 msgstr "``--in-flight N``"
 
-#: ../../source/cli/bench.rst:1002
+#: ../../source/cli/bench.rst:1016
 msgid ""
 "In-flight submits per round. Each round issues this many submits "
 "sequentially from a single producer thread, then waits for all of them."
 msgstr "每轮的在途提交数量。每轮从单个生产者线程顺序发出这么多提交，然后等待所有提交完成。"
 
-#: ../../source/cli/bench.rst:1005
+#: ../../source/cli/bench.rst:1019
 msgid "``--data-size-kb N``"
 msgstr "``--data-size-kb N``"
 
-#: ../../source/cli/bench.rst:1006
+#: ../../source/cli/bench.rst:1020
 msgid "``256``"
 msgstr "``256``"
 
-#: ../../source/cli/bench.rst:1007
+#: ../../source/cli/bench.rst:1021
 msgid "Data size per key, in KiB."
 msgstr "每个键的数据大小，以 KiB 为单位。"
 
-#: ../../source/cli/bench.rst:1008
+#: ../../source/cli/bench.rst:1022
 msgid "``--rounds N``"
 msgstr "``--rounds N``"
 
-#: ../../source/cli/bench.rst:1010
+#: ../../source/cli/bench.rst:1024
 msgid "Measurement rounds per operation."
 msgstr "每次操作的测量轮数。"
 
-#: ../../source/cli/bench.rst:1011
+#: ../../source/cli/bench.rst:1025
 msgid "``--warmup-rounds N``"
 msgstr "``--warmup-rounds N``"
 
-#: ../../source/cli/bench.rst:1013
+#: ../../source/cli/bench.rst:1027
 msgid "Warmup rounds run before measurement; their results are discarded."
 msgstr "在测量之前运行的预热轮次；它们的结果会被丢弃。"
 
-#: ../../source/cli/bench.rst:1015
+#: ../../source/cli/bench.rst:1029
 msgid "``--lookup-max-hit-rate F``"
 msgstr "``--lookup-max-hit-rate F``"
 
-#: ../../source/cli/bench.rst:1016
+#: ../../source/cli/bench.rst:1030
 msgid "``0.0``"
 msgstr "``0.0``"
 
-#: ../../source/cli/bench.rst:1017
+#: ../../source/cli/bench.rst:1031
 msgid ""
 "Upper bound on the lookup hit rate, in ``[0, 1]``. The benchmark requests"
 " ``floor(N * rate)`` keys from the potentially-existing range and ``N - "
 "hit`` keys from a guaranteed-non-existent range, where ``N`` is the total"
 " number of lookup keys. The actual hit rate may be lower if those keys "
 "were never stored in this run."
-msgstr "查找命中率的上限，范围在 ``[0, 1]`` 之间。基准测试请求 ``floor(N * rate)`` 个来自潜在存在范围的键，以及 ``N - hit`` 个来自保证不存在范围的键，其中 ``N`` 是查找键的总数。如果这些键在此运行中从未存储，则实际命中率可能会更低。"
+msgstr ""
+"查找命中率的上限，范围在 ``[0, 1]`` 之间。基准测试请求 ``floor(N * rate)`` 个来自潜在存在范围的键，以及 ``N "
+"- hit`` 个来自保证不存在范围的键，其中 ``N`` 是查找键的总数。如果这些键在此运行中从未存储，则实际命中率可能会更低。"
 
-#: ../../source/cli/bench.rst:1023
+#: ../../source/cli/bench.rst:1037
 msgid "``--skip-verify`` / ``--no-skip-verify``"
 msgstr "``--skip-verify`` / ``--no-skip-verify``"
 
-#: ../../source/cli/bench.rst:1024
+#: ../../source/cli/bench.rst:1038
 msgid "``--skip-verify``"
 msgstr "``--skip-verify``"
 
-#: ../../source/cli/bench.rst:1025
+#: ../../source/cli/bench.rst:1039
 msgid ""
 "Skip the store -> load round-trip data integrity check (the default). "
 "Pass ``--no-skip-verify`` to enable verification on the last measured "
 "round; this requires both ``store`` and ``load`` to be exercised."
-msgstr "跳过存储 -> 加载往返数据完整性检查（默认）。传递 ``--no-skip-verify`` 以在最后一次测量的轮次上启用验证；这要求同时执行 ``store`` 和 ``load``。"
+msgstr ""
+"跳过存储 -> 加载往返数据完整性检查（默认）。传递 ``--no-skip-verify`` 以在最后一次测量的轮次上启用验证；这要求同时执行 "
+"``store`` 和 ``load``。"
 
-#: ../../source/cli/bench.rst:1029
+#: ../../source/cli/bench.rst:1043
 #, python-brace-format
 msgid "``--only {lookup,store,load}``"
 msgstr "``--仅 {查找,存储,加载}``"
 
-#: ../../source/cli/bench.rst:1031
+#: ../../source/cli/bench.rst:1045
 msgid ""
 "Run only the specified operation. When omitted, all three operations are "
 "run in the order ``store -> lookup -> load``."
 msgstr "仅运行指定的操作。省略时，将按顺序运行所有三个操作 ``store -> lookup -> load``。"
 
-#: ../../source/cli/bench.rst:1036
+#: ../../source/cli/bench.rst:1050
 msgid "Adapter JSON spec"
 msgstr "适配器 JSON 规范"
 
-#: ../../source/cli/bench.rst:1038
+#: ../../source/cli/bench.rst:1052
 msgid ""
 "The ``--l2-adapter`` JSON is parsed by "
 "``lmcache.v1.distributed.l2_adapters.config.parse_args_to_l2_adapters_config``,"
 " the same entry point LMCache uses everywhere else. The minimum required "
 "field is ``type``; all remaining fields are forwarded to the adapter "
 "implementation as keyword arguments."
-msgstr "``--l2-adapter`` JSON 由 ``lmcache.v1.distributed.l2_adapters.config.parse_args_to_l2_adapters_config`` 解析，这是 LMCache 在其他地方使用的相同入口点。最小必需字段是 ``type``；所有剩余字段作为关键字参数转发给适配器实现。"
+msgstr ""
+"``--l2-adapter`` JSON 由 "
+"``lmcache.v1.distributed.l2_adapters.config.parse_args_to_l2_adapters_config``"
+" 解析，这是 LMCache 在其他地方使用的相同入口点。最小必需字段是 ``type``；所有剩余字段作为关键字参数转发给适配器实现。"
 
-#: ../../source/cli/bench.rst:1044
+#: ../../source/cli/bench.rst:1058
 msgid "Example for the local-filesystem adapter:"
 msgstr "本地文件系统适配器示例："
 
-#: ../../source/cli/bench.rst:1056
+#: ../../source/cli/bench.rst:1070
 msgid ""
 "See the source under ``lmcache/v1/distributed/l2_adapters/`` for the full"
 " list of adapter types and their accepted fields."
 msgstr "请参阅 ``lmcache/v1/distributed/l2_adapters/`` 下的源代码，以获取适配器类型及其接受字段的完整列表。"
 
-#: ../../source/cli/bench.rst:1063
+#: ../../source/cli/bench.rst:1077
 msgid "Per-round progress (suppressed by ``-q``):"
 msgstr "每轮进度（通过 ``-q`` 被抑制）："
 
-#: ../../source/cli/bench.rst:1096
+#: ../../source/cli/bench.rst:1110
 msgid "Final summary (one section per exercised operation):"
 msgstr "最终总结（每个操作一个部分）："
 
-#: ../../source/cli/bench.rst:1126
+#: ../../source/cli/bench.rst:1140
 msgid ""
 "Each operation section reports per-round duration statistics (avg / min /"
 " max / p50 / p99 / std), aggregate throughput (``avg_throughput_mbps`` --"
 " 0 for ``Lookup`` since it has no payload), average key-rate "
 "(``avg_ops_per_sec``), and a per-key latency."
-msgstr "每个操作部分报告每轮持续时间统计（平均 / 最小 / 最大 / p50 / p99 / 标准差）、汇总吞吐量（``avg_throughput_mbps`` -- 对于``Lookup``来说为0，因为它没有有效负载）、平均键速率（``avg_ops_per_sec``）以及每个键的延迟。"
+msgstr ""
+"每个操作部分报告每轮持续时间统计（平均 / 最小 / 最大 / p50 / p99 / "
+"标准差）、汇总吞吐量（``avg_throughput_mbps`` -- "
+"对于``Lookup``来说为0，因为它没有有效负载）、平均键速率（``avg_ops_per_sec``）以及每个键的延迟。"
 
-#: ../../source/cli/bench.rst:1131
+#: ../../source/cli/bench.rst:1145
 msgid ""
 "For ``Lookup``, three additional fields are reported when ``--lookup-max-"
 "hit-rate`` is non-zero or some keys were found:"
 msgstr "对于 ``Lookup``，当 ``--lookup-max-hit-rate`` 非零或找到了一些键时，会报告三个额外字段："
 
-#: ../../source/cli/bench.rst:1134
+#: ../../source/cli/bench.rst:1148
 msgid "``Expected max hit rate`` -- the configured upper bound."
 msgstr "``预期最大命中率`` -- 配置的上限。"
 
-#: ../../source/cli/bench.rst:1135
+#: ../../source/cli/bench.rst:1149
 msgid ""
 "``Expected hit keys`` -- ``floor(total_keys * rate)``, scaled for the "
 "measured rounds only."
 msgstr "``期望命中键`` -- ``floor(total_keys * rate)``, 仅针对测量的轮次进行缩放。"
 
-#: ../../source/cli/bench.rst:1137
+#: ../../source/cli/bench.rst:1151
 msgid "``Actual hit rate`` -- the measured hit rate over the kept rounds."
 msgstr "``实际命中率`` -- 在保留的轮次中测量的命中率。"
 
-#: ../../source/cli/bench.rst:1141
+#: ../../source/cli/bench.rst:1155
 msgid "Round-trip verification"
 msgstr "往返验证"
 
-#: ../../source/cli/bench.rst:1143
+#: ../../source/cli/bench.rst:1157
 msgid ""
 "When ``--no-skip-verify`` is passed and both ``store`` and ``load`` were "
 "run, the benchmark compares the load buffers from the last measured round"
 " against the byte pattern that ``store`` wrote (see "
 "``make_memory_objects`` in "
 "``lmcache/cli/commands/bench/l2_adapter_bench/data.py``):"
-msgstr "当传递 ``--no-skip-verify`` 并且同时运行了 ``store`` 和 ``load`` 时，基准测试将最后测量轮次的加载缓冲区与 ``store`` 写入的字节模式进行比较（请参见 ``lmcache/cli/commands/bench/l2_adapter_bench/data.py`` 中的 ``make_memory_objects``）："
+msgstr ""
+"当传递 ``--no-skip-verify`` 并且同时运行了 ``store`` 和 ``load`` "
+"时，基准测试将最后测量轮次的加载缓冲区与 ``store`` 写入的字节模式进行比较（请参见 "
+"``lmcache/cli/commands/bench/l2_adapter_bench/data.py`` 中的 "
+"``make_memory_objects``）："
 
-#: ../../source/cli/bench.rst:1154
+#: ../../source/cli/bench.rst:1168
 msgid ""
 "Verification is **off** by default because the stricter byte pattern also"
 " forces every key to allocate its own ``data_size`` buffer (otherwise the"
 " runner is free to reuse a single shared buffer across keys to keep the "
 "memory footprint small)."
-msgstr "默认情况下，验证是 **关闭** 的，因为更严格的字节模式也迫使每个键分配其自己的 ``data_size`` 缓冲区（否则运行程序可以自由地在键之间重用一个共享缓冲区，以保持内存占用小）。"
+msgstr ""
+"默认情况下，验证是 **关闭** 的，因为更严格的字节模式也迫使每个键分配其自己的 ``data_size`` "
+"缓冲区（否则运行程序可以自由地在键之间重用一个共享缓冲区，以保持内存占用小）。"
 
-#: ../../source/cli/bench.rst:1170
+#: ../../source/cli/bench.rst:1184
 msgid ""
 "All requested operations completed and (when enabled) the round-trip "
 "verification passed."
 msgstr "所有请求的操作已完成，并且（在启用时）往返验证通过。"
 
-#: ../../source/cli/bench.rst:1173
+#: ../../source/cli/bench.rst:1187
 msgid ""
 "Adapter creation failed, round-trip verification failed, or an operation "
 "hit a fatal error (e.g. all rounds timed out)."
 msgstr "适配器创建失败，往返验证失败，或某个操作遇到致命错误（例如，所有轮次超时）。"
 
-#: ../../source/cli/bench.rst:1175
+#: ../../source/cli/bench.rst:1189
 msgid "``2``"
 msgstr "``2``"
 
-#: ../../source/cli/bench.rst:1176
+#: ../../source/cli/bench.rst:1190
 msgid ""
 "The ``--l2-adapter`` JSON / ``L2_ADAPTER_JSON`` env var was missing or "
 "could not be parsed."
 msgstr "缺少或无法解析 ``--l2-adapter`` JSON / ``L2_ADAPTER_JSON`` 环境变量。"
 
+#~ msgid ""
+#~ "Run mode. Only ``gpu`` is supported "
+#~ "today; CPU mode is a planned "
+#~ "follow-up."
+#~ msgstr "运行模式。目前仅支持 ``gpu``；CPU 模式是计划中的后续功能。"
+
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/cli/coordinator.po b/docs/source/locale/zh_CN/LC_MESSAGES/cli/coordinator.po
new file mode 100644
index 0000000000..74c18709f0
--- /dev/null
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/cli/coordinator.po
@@ -0,0 +1,111 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2024, The LMCache Team
+# This file is distributed under the same license as the LMCache package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2026.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: LMCache \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2026-06-08 10:45+0000\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: zh_CN\n"
+"Language-Team: zh_CN <LL@li.org>\n"
+"Plural-Forms: nplurals=1; plural=0;\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.18.0\n"
+
+#: ../../source/cli/coordinator.rst:2
+msgid "lmcache coordinator"
+msgstr "lmcache coordinator"
+
+#: ../../source/cli/coordinator.rst:4
+msgid ""
+"The ``lmcache coordinator`` command launches the LMCache MP "
+"**coordinator**, a standalone HTTP service that tracks the MP server "
+"instances in a deployment. MP servers register with it and send periodic "
+"heartbeats; the coordinator evicts any instance whose heartbeat lapses "
+"past ``--instance-timeout``."
+msgstr "``lmcache coordinator`` 命令启动 LMCache MP **协调器**，这是一个独立的 HTTP 服务，用于跟踪部署中的 MP 服务器实例。MP 服务器向其注册并发送定期心跳；协调器会逐出任何心跳超时的实例，该超时时间由 ``--instance-timeout`` 指定。"
+
+#: ../../source/cli/coordinator.rst:9
+msgid ""
+"It replaces ``python -m lmcache.v1.mp_coordinator``. The process runs in "
+"the foreground; stop it with ``Ctrl-C``."
+msgstr "它替代了 ``python -m lmcache.v1.mp_coordinator``。该进程在前台运行；使用 ``Ctrl-C`` 停止它。"
+
+#: ../../source/cli/coordinator.rst:17
+msgid "Quick start"
+msgstr "快速开始"
+
+#: ../../source/cli/coordinator.rst:27
+msgid "Options"
+msgstr "选项"
+
+#: ../../source/cli/coordinator.rst:33
+msgid "Flag"
+msgstr "标志"
+
+#: ../../source/cli/coordinator.rst:34
+msgid "Description"
+msgstr "描述"
+
+#: ../../source/cli/coordinator.rst:35
+msgid "``--host HOST``"
+msgstr "``--host HOST``"
+
+#: ../../source/cli/coordinator.rst:36
+msgid "Bind address for the coordinator's HTTP server (default: ``0.0.0.0``)."
+msgstr "协调器 HTTP 服务器的绑定地址（默认：``0.0.0.0``）。"
+
+#: ../../source/cli/coordinator.rst:37
+msgid "``--port PORT``"
+msgstr "``--port PORT``"
+
+#: ../../source/cli/coordinator.rst:38
+msgid "HTTP port (default: ``9300``)."
+msgstr "HTTP 端口（默认值：``9300``）。"
+
+#: ../../source/cli/coordinator.rst:39
+msgid "``--instance-timeout SECS``"
+msgstr "``--instance-timeout SECS``"
+
+#: ../../source/cli/coordinator.rst:40
+msgid ""
+"Seconds without a heartbeat after which an instance is evicted (default: "
+"``30``)."
+msgstr "在没有心跳的情况下，经过多少秒后实例被逐出（默认值：``30``）。"
+
+#: ../../source/cli/coordinator.rst:42
+msgid "``--health-check-interval SECS``"
+msgstr "``--health-check-interval SECS``"
+
+#: ../../source/cli/coordinator.rst:43
+msgid ""
+"Seconds between eviction sweeps; ``0`` disables the loop (default: "
+"``10``)."
+msgstr "逐出清理之间的秒数；``0`` 禁用循环（默认值：``10``）。"
+
+#: ../../source/cli/coordinator.rst:47
+msgid "Configuration"
+msgstr "配置"
+
+#: ../../source/cli/coordinator.rst:49
+msgid ""
+"Every flag is optional. Unset flags fall back to the "
+"``LMCACHE_MP_COORDINATOR_*`` environment variables (``HOST``, ``PORT``, "
+"``INSTANCE_TIMEOUT``, ``HEALTH_CHECK_INTERVAL``), and then to the built-"
+"in defaults. A supplied flag always overrides the matching env-derived "
+"value, so env-only deployments keep working unchanged."
+msgstr "每个标志都是可选的。未设置的标志会回退到 ``LMCACHE_MP_COORDINATOR_*`` 环境变量（``HOST``、``PORT``、``INSTANCE_TIMEOUT``、``HEALTH_CHECK_INTERVAL``），然后再回退到内置默认值。提供的标志始终会覆盖匹配的环境派生值，因此仅使用环境的部署将保持不变。"
+
+#: ../../source/cli/coordinator.rst:55
+msgid ""
+"See :doc:`/mp/coordinator` for the coordinator's architecture, "
+"registration protocol, and HTTP API."
+msgstr "请参阅 :doc:`/mp/coordinator` 以了解协调器的架构、注册协议和 HTTP API。"
+
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/cli/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/cli/index.po
index 1bb99fa3d3..87a9001184 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/cli/index.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/cli/index.po
@@ -8,7 +8,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: LMCache \n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2026-06-01 10:55+0000\n"
+"POT-Creation-Date: 2026-06-08 10:45+0000\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language: zh_CN\n"
@@ -35,7 +35,9 @@ msgid ""
 "After installing LMCache, the ``lmcache`` command is available globally. "
 "Run ``lmcache -h`` to see all commands, or ``lmcache <command> -h`` for a"
 " specific command."
-msgstr "安装 LMCache 后，``lmcache`` 命令可以全局使用。运行 ``lmcache -h`` 查看所有命令，或运行 ``lmcache <command> -h`` 查看特定命令的帮助。"
+msgstr ""
+"安装 LMCache 后，``lmcache`` 命令可以全局使用。运行 ``lmcache -h`` 查看所有命令，或运行 ``lmcache "
+"<command> -h`` 查看特定命令的帮助。"
 
 #: ../../source/cli/index.rst:17
 msgid "Installation"
@@ -69,7 +71,9 @@ msgstr "``pip install lmcache``"
 msgid ""
 "Full install: server, CLI, and CUDA extensions. Required for ``server``, "
 "``bench server``, ``bench l2``, and ``trace``. Linux + GPU."
-msgstr "完整安装：服务器、CLI 和 CUDA 扩展。``server``、``bench server``、``bench l2`` 和 ``trace`` 所需。Linux + GPU。"
+msgstr ""
+"完整安装：服务器、CLI 和 CUDA 扩展。``server``、``bench server``、``bench l2`` 和 "
+"``trace`` 所需。Linux + GPU。"
 
 #: ../../source/cli/index.rst:33
 msgid "``lmcache-cli``"
@@ -83,7 +87,9 @@ msgstr "``pip install lmcache-cli``"
 msgid ""
 "CLI only: ``ping``, ``query``, ``describe``, ``kvcache``, ``bench "
 "engine``. No GPU required, any OS."
-msgstr "仅限 CLI：``ping``、``query``、``describe``、``kvcache``、``bench engine``。不需要 GPU，任何操作系统均可。"
+msgstr ""
+"仅限 CLI：``ping``、``query``、``describe``、``kvcache``、``bench engine``。不需要 "
+"GPU，任何操作系统均可。"
 
 #: ../../source/cli/index.rst:40
 msgid ""
@@ -112,101 +118,113 @@ msgid "Launch the LMCache MP server (ZMQ + HTTP). Requires the full install."
 msgstr "启动 LMCache MP 服务器（ZMQ + HTTP）。需要完整安装。"
 
 #: ../../source/cli/index.rst:54
+msgid ":doc:`coordinator`"
+msgstr ":doc:`协调器`"
+
+#: ../../source/cli/index.rst:55
+msgid "Launch the LMCache MP coordinator (HTTP instance registry)."
+msgstr "启动 LMCache MP 协调器（HTTP 实例注册表）。"
+
+#: ../../source/cli/index.rst:56
 msgid ":doc:`describe`"
 msgstr ":doc:`描述`"
 
-#: ../../source/cli/index.rst:55
+#: ../../source/cli/index.rst:57
 msgid "Show detailed status of a running LMCache service."
 msgstr "显示正在运行的 LMCache 服务的详细状态。"
 
-#: ../../source/cli/index.rst:56
+#: ../../source/cli/index.rst:58
 msgid ":doc:`ping`"
 msgstr ":doc:`ping`"
 
-#: ../../source/cli/index.rst:57
+#: ../../source/cli/index.rst:59
 msgid "Liveness check for LMCache or vLLM servers."
 msgstr "对 LMCache 或 vLLM 服务器的存活检查。"
 
-#: ../../source/cli/index.rst:58
+#: ../../source/cli/index.rst:60
 msgid ":doc:`query`"
 msgstr ":doc:`查询`"
 
-#: ../../source/cli/index.rst:59
+#: ../../source/cli/index.rst:61
 msgid "Single-shot query interface for the serving engine."
 msgstr "服务引擎的单次查询接口。"
 
-#: ../../source/cli/index.rst:60
+#: ../../source/cli/index.rst:62
 msgid ":doc:`bench`"
 msgstr ":doc:`bench`"
 
-#: ../../source/cli/index.rst:61
+#: ../../source/cli/index.rst:63
 msgid ""
 "Run sustained benchmarks against an inference engine (``engine``), an "
 "LMCache MP server (``server``), or an L2 cache adapter (``l2``)."
 msgstr "对推理引擎（``engine``）、LMCache MP 服务器（``server``）或 L2 缓存适配器（``l2``）运行持续基准测试。"
 
-#: ../../source/cli/index.rst:64
+#: ../../source/cli/index.rst:66
 msgid ":doc:`kvcache`"
 msgstr ":doc:`kvcache`"
 
-#: ../../source/cli/index.rst:65
+#: ../../source/cli/index.rst:67
 msgid "Manage KV cache state (e.g. clear L1 cache) on a running server."
 msgstr "在运行的服务器上管理 KV Cache 状态（例如，清除 L1 缓存）。"
 
-#: ../../source/cli/index.rst:66
+#: ../../source/cli/index.rst:68
 msgid ":doc:`trace`"
 msgstr ":doc:`trace`"
 
-#: ../../source/cli/index.rst:67
+#: ../../source/cli/index.rst:69
 msgid "Inspect and replay storage-level trace files."
 msgstr "检查和重放存储级跟踪文件。"
 
-#: ../../source/cli/index.rst:68
+#: ../../source/cli/index.rst:70
 msgid ":doc:`tool`"
 msgstr ":doc:`工具`"
 
-#: ../../source/cli/index.rst:69
+#: ../../source/cli/index.rst:71
 msgid "Run offline analysis tools (e.g. the cache simulator)."
 msgstr "运行离线分析工具（例如缓存模拟器）。"
 
-#: ../../source/cli/index.rst:72
+#: ../../source/cli/index.rst:74
 msgid "Output Formats"
 msgstr "输出格式"
 
-#: ../../source/cli/index.rst:74
+#: ../../source/cli/index.rst:76
 msgid "Commands that produce metrics share three common flags:"
 msgstr "产生指标的命令共享三个常见标志："
 
-#: ../../source/cli/index.rst:76
+#: ../../source/cli/index.rst:78
 #, python-brace-format
 msgid "``--format {terminal,json}`` — stdout format (default: ``terminal``)."
 msgstr "``--format {terminal,json}`` — 标准输出格式（默认：``terminal``）。"
 
-#: ../../source/cli/index.rst:77
+#: ../../source/cli/index.rst:79
 msgid "``--output PATH`` — also write metrics to a file (uses ``--format``)."
 msgstr "``--output PATH`` — 还将指标写入文件（使用 ``--format``）。"
 
-#: ../../source/cli/index.rst:78
+#: ../../source/cli/index.rst:80
 msgid "``-q`` / ``--quiet`` — suppress stdout; rely on the exit code."
 msgstr "``-q`` / ``--quiet`` — 抑制 stdout；依赖退出代码。"
 
-#: ../../source/cli/index.rst:80
+#: ../../source/cli/index.rst:82
 msgid ""
 "The terminal output uses human-readable labels (e.g. ``\"Round trip time "
 "(ms)\"``), while JSON uses machine-readable keys (e.g. "
 "``\"round_trip_time_ms\"``)."
-msgstr "终端输出使用人类可读的标签（例如 ``\"Round trip time (ms)\"``），而 JSON 使用机器可读的键（例如 ``\"round_trip_time_ms\"``）。"
+msgstr ""
+"终端输出使用人类可读的标签（例如 ``\"Round trip time (ms)\"``），而 JSON 使用机器可读的键（例如 "
+"``\"round_trip_time_ms\"``）。"
 
-#: ../../source/cli/index.rst:85
+#: ../../source/cli/index.rst:87
 msgid "Adding New Commands"
 msgstr "添加新命令"
 
-#: ../../source/cli/index.rst:87
+#: ../../source/cli/index.rst:89
 msgid ""
 "New CLI subcommands are added by creating a ``BaseCommand`` subclass "
 "under ``lmcache/cli/commands/``; they are discovered and registered "
 "automatically. See :doc:`/developer_guide/cli` for details."
-msgstr "通过在 ``lmcache/cli/commands/`` 下创建 ``BaseCommand`` 子类来添加新的 CLI 子命令；它们会被自动发现和注册。有关详细信息，请参见 :doc:`/developer_guide/cli`。"
+msgstr ""
+"通过在 ``lmcache/cli/commands/`` 下创建 ``BaseCommand`` 子类来添加新的 CLI "
+"子命令；它们会被自动发现和注册。有关详细信息，请参见 :doc:`/developer_guide/cli`。"
 
 #~ msgid ""
 #~ "For a comprehensive guide with examples,"
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/cli/server.po b/docs/source/locale/zh_CN/LC_MESSAGES/cli/server.po
index 954a4099d3..8e99bc3809 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/cli/server.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/cli/server.po
@@ -8,7 +8,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: LMCache \n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2026-06-01 10:55+0000\n"
+"POT-Creation-Date: 2026-06-08 10:45+0000\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language: zh_CN\n"
@@ -30,7 +30,10 @@ msgid ""
 "frontend (status, healthcheck, cache-clear, checksum APIs). It is the "
 "server that ``lmcache describe``, ``lmcache ping kvcache``, ``lmcache "
 "kvcache``, and ``lmcache bench server`` talk to."
-msgstr "``lmcache server`` 命令启动独立的 LMCache 多进程 (MP) 服务器，该服务器暴露 ZMQ 控制平面和 HTTP 前端 (状态、健康检查、清除缓存、校验和 API)。这是 ``lmcache describe``、``lmcache ping kvcache``、``lmcache kvcache`` 和 ``lmcache bench server`` 进行交互的服务器。"
+msgstr ""
+"``lmcache server`` 命令启动独立的 LMCache 多进程 (MP) 服务器，该服务器暴露 ZMQ 控制平面和 HTTP 前端 "
+"(状态、健康检查、清除缓存、校验和 API)。这是 ``lmcache describe``、``lmcache ping "
+"kvcache``、``lmcache kvcache`` 和 ``lmcache bench server`` 进行交互的服务器。"
 
 #: ../../source/cli/server.rst:12
 msgid ""
@@ -54,7 +57,9 @@ msgid ""
 "eviction), the HTTP frontend, and the Prometheus / telemetry "
 "observability layer. The full, authoritative list is large and evolves "
 "with the runtime, so consult:"
-msgstr "服务器从多个配置模块组合其参数——多进程服务器、存储管理器（L1 / L2 适配器 / 逐出）、HTTP 前端以及 Prometheus / 监控可观察性层。完整的权威列表很大，并随着运行时而变化，因此请参考："
+msgstr ""
+"服务器从多个配置模块组合其参数——多进程服务器、存储管理器（L1 / L2 适配器 / 逐出）、HTTP 前端以及 Prometheus / "
+"监控可观察性层。完整的权威列表很大，并随着运行时而变化，因此请参考："
 
 #: ../../source/cli/server.rst:42
 msgid "Commonly used flags include:"
@@ -133,26 +138,59 @@ msgid "Number of server worker processes."
 msgstr "服务器工作进程的数量。"
 
 #: ../../source/cli/server.rst:66
+msgid "``--coordinator-url URL``"
+msgstr "``--coordinator-url URL``"
+
+#: ../../source/cli/server.rst:67
+msgid ""
+"Register with an MP coordinator at this base URL (e.g. "
+"``http://coordinator:9300``). Opt-in; enables fleet registration. See "
+":doc:`/mp/coordinator`."
+msgstr "在此基础 URL（例如 ``http://coordinator:9300``）注册 MP 协调器。可选；启用车队注册。请参见 :doc:`/mp/coordinator`。"
+
+#: ../../source/cli/server.rst:70
+msgid "``--coordinator-advertise-ip IP``"
+msgstr "``--coordinator-advertise-ip IP``"
+
+#: ../../source/cli/server.rst:71
+msgid ""
+"IP the coordinator should reach this server at (defaults to the outbound "
+"IP)."
+msgstr "协调器应通过此服务器访问的 IP（默认为出站 IP）。"
+
+#: ../../source/cli/server.rst:73
+msgid "``--coordinator-heartbeat-interval SECONDS``"
+msgstr "``--coordinator-heartbeat-interval SECONDS``"
+
+#: ../../source/cli/server.rst:74
+msgid ""
+"Seconds between heartbeats (``> 0``, default ``5``). Keep well below the "
+"coordinator's instance timeout."
+msgstr "心跳之间的秒数（``> 0``，默认值为 ``5``）。应远低于协调器的实例超时。"
+
+#: ../../source/cli/server.rst:76
 #, python-brace-format
 msgid "``--trace-level {storage}``"
 msgstr "``--trace-level {storage}``"
 
-#: ../../source/cli/server.rst:67
+#: ../../source/cli/server.rst:77
 msgid "Enable storage-level trace recording (see :doc:`trace`)."
 msgstr "启用存储级别的追踪记录（请参见 :doc:`trace`）。"
 
-#: ../../source/cli/server.rst:68
+#: ../../source/cli/server.rst:78
 msgid "``--trace-output PATH``"
 msgstr "``--trace-output PATH``"
 
-#: ../../source/cli/server.rst:69
+#: ../../source/cli/server.rst:79
 msgid "Destination for recorded ``.lct`` trace files."
 msgstr "记录的 ``.lct`` 跟踪文件的目标。"
 
-#: ../../source/cli/server.rst:71
+#: ../../source/cli/server.rst:81
 msgid ""
 "L2 adapters, observability, and Prometheus exporters are configured "
 "through their own flag groups; see ``lmcache server --help`` for the "
 "complete set."
-msgstr "L2 适配器、可观察性和 Prometheus 导出器通过各自的标志组进行配置；有关完整的设置，请参见 ``lmcache server --help``。"
+msgstr ""
+"L2 适配器、可观察性和 Prometheus 导出器通过各自的标志组进行配置；有关完整的设置，请参见 ``lmcache server "
+"--help``。"
 
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/community/meetings.po b/docs/source/locale/zh_CN/LC_MESSAGES/community/meetings.po
index fc4f545b6b..341a84cb79 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/community/meetings.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/community/meetings.po
@@ -8,7 +8,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: LMCache \n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2026-05-18 17:25+0000\n"
+"POT-Creation-Date: 2026-06-08 10:45+0000\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language: zh_CN\n"
@@ -30,7 +30,9 @@ msgid ""
 "in contributing to the LMCache projects (core LMCache or Production "
 "Stack), we encourage you to join the meetings. We also host a monthly "
 "\"Office Hour\" on select topics."
-msgstr "LMCache 定期举办社区会议，以讨论更新、处理新功能请求以及收集社区反馈。如果您有兴趣为 LMCache 项目（核心 LMCache 或生产堆栈）做出贡献，我们鼓励您参加会议。我们还会就特定主题举办每月的“办公时间”。"
+msgstr ""
+"LMCache 定期举办社区会议，以讨论更新、处理新功能请求以及收集社区反馈。如果您有兴趣为 LMCache 项目（核心 LMCache "
+"或生产堆栈）做出贡献，我们鼓励您参加会议。我们还会就特定主题举办每月的“办公时间”。"
 
 #: ../../source/community/meetings.rst:10
 msgid "Meeting schedule"
@@ -69,20 +71,24 @@ msgstr "请在下面找到会议邀请链接："
 msgid ""
 "**Meeting link**: `Zoom link "
 "<https://uchicago.zoom.us/j/6603596916?pwd=Z1E5MDRWUSt2am5XbEt4dTFkNGx6QT09>`_"
-msgstr "**会议链接**: `Zoom 链接 <https://uchicago.zoom.us/j/6603596916?pwd=Z1E5MDRWUSt2am5XbEt4dTFkNGx6QT09>`_"
+msgstr ""
+"**会议链接**: `Zoom 链接 "
+"<https://uchicago.zoom.us/j/6603596916?pwd=Z1E5MDRWUSt2am5XbEt4dTFkNGx6QT09>`_"
 
 #: ../../source/community/meetings.rst:24
 msgid ""
 "**Calendar Invite**: `Google Calendar "
 "<https://drive.usercontent.google.com/u/0/uc?id=15Xz8-LtpBQ5QgR7KrorOOyfuohCFQmwn&export=download>`__"
-msgstr "**日历邀请**: `Google 日历 <https://drive.usercontent.google.com/u/0/uc?id=15Xz8-LtpBQ5QgR7KrorOOyfuohCFQmwn&export=download>`__"
+msgstr ""
+"**日历邀请**: `Google 日历 "
+"<https://drive.usercontent.google.com/u/0/uc?id=15Xz8-LtpBQ5QgR7KrorOOyfuohCFQmwn&export=download>`__"
 
 #: ../../source/community/meetings.rst:25
 msgid ""
 "**Slack Channel**: `#lmcache "
-"<https://lmcacheworkspace.slack.com/join/shared_invite/zt-3h7ohnf5t-"
-"ZZ0JBuYCIh1eUwHPTqSNCQ#/shared-invite/email>`_"
-msgstr "**Slack 频道**: `#lmcache <https://lmcacheworkspace.slack.com/join/shared_invite/zt-3h7ohnf5t-ZZ0JBuYCIh1eUwHPTqSNCQ#/shared-invite/email>`_"
+"<https://join.slack.com/t/lmcacheworkspace/shared_invite/zt-"
+"3zxjao8h0-lRfBfnLqbALOtLsWn2ITxA>`_"
+msgstr "**Slack 频道**: `#lmcache <https://join.slack.com/t/lmcacheworkspace/shared_invite/zt-3zxjao8h0-lRfBfnLqbALOtLsWn2ITxA>`_"
 
 #: ../../source/community/meetings.rst:28
 msgid "vLLM Production Stack Project"
@@ -98,20 +104,26 @@ msgstr "生产堆栈社区会议每两周在**星期二**的**下午5:30-6:00（
 msgid ""
 "**Calendar Invite**: `Google Calendar "
 "<https://drive.usercontent.google.com/u/0/uc?id=1I3WuivUVAq1vZ2XSW4rmqgD5c0bQcxE0&export=download>`__"
-msgstr "**日历邀请**： `Google 日历 <https://drive.usercontent.google.com/u/0/uc?id=1I3WuivUVAq1vZ2XSW4rmqgD5c0bQcxE0&export=download>`__"
+msgstr ""
+"**日历邀请**： `Google 日历 "
+"<https://drive.usercontent.google.com/u/0/uc?id=1I3WuivUVAq1vZ2XSW4rmqgD5c0bQcxE0&export=download>`__"
 
 #: ../../source/community/meetings.rst:35
 msgid ""
 "**Slack Channel**: `#production-stack <https://vllm-"
 "dev.slack.com/archives/C089SMEAKRA>`_"
-msgstr "**Slack 频道**: `#production-stack <https://vllm-dev.slack.com/archives/C089SMEAKRA>`_"
+msgstr ""
+"**Slack 频道**: `#production-stack <https://vllm-"
+"dev.slack.com/archives/C089SMEAKRA>`_"
 
 #: ../../source/community/meetings.rst:39
 msgid ""
 "The Zoom meeting link is the same for both LMCache and Production Stack "
 "community meetings. Meeting notes are available here: `Meeting notes "
 "<https://docs.google.com/document/d/1vX0g2q3j4x5m7J6z8Q9Gk4Z5l7f3K8h0nqYwW1a2c4o/edit?usp=sharing>`_."
-msgstr "Zoom 会议链接对于 LMCache 和 Production Stack 社区会议是相同的。会议记录可以在这里查看：`会议记录 <https://docs.google.com/document/d/1vX0g2q3j4x5m7J6z8Q9Gk4Z5l7f3K8h0nqYwW1a2c4o/edit?usp=sharing>`_。"
+msgstr ""
+"Zoom 会议链接对于 LMCache 和 Production Stack 社区会议是相同的。会议记录可以在这里查看：`会议记录 "
+"<https://docs.google.com/document/d/1vX0g2q3j4x5m7J6z8Q9Gk4Z5l7f3K8h0nqYwW1a2c4o/edit?usp=sharing>`_。"
 
 #: ../../source/community/meetings.rst:43
 msgid "LMCache Office Hours"
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/index.po
index d83dc9acf8..1c73f27e2a 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/index.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/index.po
@@ -8,7 +8,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: LMCache \n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2026-05-29 22:44+0000\n"
+"POT-Creation-Date: 2026-06-08 10:45+0000\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language: zh_CN\n"
@@ -27,59 +27,59 @@ msgstr "欢迎使用 LMCache"
 msgid "Getting Started"
 msgstr "入门指南"
 
-#: ../../source/index.rst:86
+#: ../../source/index.rst:85
 msgid "Recipes"
 msgstr "使用指南"
 
-#: ../../source/index.rst:94
+#: ../../source/index.rst:93
 msgid "KV Cache offloading and sharing"
 msgstr "KV Cache 卸载和共享"
 
-#: ../../source/index.rst:105
+#: ../../source/index.rst:104
 msgid "Non-KV caching"
 msgstr "非 KV 缓存"
 
-#: ../../source/index.rst:113
+#: ../../source/index.rst:112
 msgid "Multiprocess Mode"
 msgstr "多进程模式"
 
-#: ../../source/index.rst:121
+#: ../../source/index.rst:120
 msgid "Disaggregated prefill"
 msgstr "分离式 Prefill"
 
-#: ../../source/index.rst:130
+#: ../../source/index.rst:129
 msgid "KV Cache management"
 msgstr "KV Cache 管理"
 
-#: ../../source/index.rst:138
+#: ../../source/index.rst:137
 msgid "KV Cache Optimizations"
 msgstr "KV Cache 优化"
 
-#: ../../source/index.rst:148
+#: ../../source/index.rst:147
 msgid "Use LMCache in production"
 msgstr "在生产中使用 LMCache"
 
-#: ../../source/index.rst:160
+#: ../../source/index.rst:159
 msgid "CLI"
 msgstr "命令行界面"
 
-#: ../../source/index.rst:168
+#: ../../source/index.rst:167
 msgid "Internal API Server"
 msgstr "内部 API 服务器"
 
-#: ../../source/index.rst:176
+#: ../../source/index.rst:175
 msgid "Controller WebUI"
 msgstr "控制器 WebUI"
 
-#: ../../source/index.rst:184
+#: ../../source/index.rst:183
 msgid "Developer Guide"
 msgstr "开发者指南"
 
-#: ../../source/index.rst:197
+#: ../../source/index.rst:196
 msgid "API Reference"
 msgstr "API 参考"
 
-#: ../../source/index.rst:208
+#: ../../source/index.rst:207
 msgid "Community"
 msgstr "社区"
 
@@ -109,9 +109,9 @@ msgstr "`LMCache 博客 <https://lmcache.github.io>`_"
 #: ../../source/index.rst:49
 msgid ""
 "`Join LMCache slack workspace "
-"<https://join.slack.com/t/lmcacheworkspace/shared_invite/zt-3eck2v8ub-"
-"j2hPn0AdzcfyykatTBXSTg>`_"
-msgstr "`加入 LMCache Slack 工作区 <https://join.slack.com/t/lmcacheworkspace/shared_invite/zt-3eck2v8ub-j2hPn0AdzcfyykatTBXSTg>`_"
+"<https://join.slack.com/t/lmcacheworkspace/shared_invite/zt-"
+"3zxjao8h0-lRfBfnLqbALOtLsWn2ITxA>`_"
+msgstr "`加入 LMCache Slack 工作区 <https://join.slack.com/t/lmcacheworkspace/shared_invite/zt-3zxjao8h0-lRfBfnLqbALOtLsWn2ITxA>`_"
 
 #: ../../source/index.rst:50
 msgid "Our papers:"
@@ -121,7 +121,9 @@ msgstr "我们的论文："
 msgid ""
 "`CacheGen: KV Cache Compression and Streaming for Fast Large Language "
 "Model Serving <https://dl.acm.org/doi/10.1145/3651890.3672274>`_"
-msgstr "`CacheGen: KV Cache 压缩与流式传输以快速服务大型语言模型 <https://dl.acm.org/doi/10.1145/3651890.3672274>`_"
+msgstr ""
+"`CacheGen: KV Cache 压缩与流式传输以快速服务大型语言模型 "
+"<https://dl.acm.org/doi/10.1145/3651890.3672274>`_"
 
 #: ../../source/index.rst:53
 msgid ""
@@ -135,14 +137,14 @@ msgid ""
 "<https://arxiv.org/abs/2409.13761>`_"
 msgstr "`大型语言模型需要内容分发网络吗？ <https://arxiv.org/abs/2409.13761>`_"
 
-#: ../../source/index.rst:56 ../../source/index.rst:84
-#: ../../source/index.rst:92 ../../source/index.rst:103
-#: ../../source/index.rst:111 ../../source/index.rst:119
-#: ../../source/index.rst:128 ../../source/index.rst:136
-#: ../../source/index.rst:146 ../../source/index.rst:158
-#: ../../source/index.rst:166 ../../source/index.rst:174
-#: ../../source/index.rst:182 ../../source/index.rst:195
-#: ../../source/index.rst:206 ../../source/index.rst:215
+#: ../../source/index.rst:56 ../../source/index.rst:83
+#: ../../source/index.rst:91 ../../source/index.rst:102
+#: ../../source/index.rst:110 ../../source/index.rst:118
+#: ../../source/index.rst:127 ../../source/index.rst:135
+#: ../../source/index.rst:145 ../../source/index.rst:157
+#: ../../source/index.rst:165 ../../source/index.rst:173
+#: ../../source/index.rst:181 ../../source/index.rst:194
+#: ../../source/index.rst:205 ../../source/index.rst:214
 msgid ":raw-html:`<br />`"
 msgstr ":raw-html:`<br />`"
 
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/kv_cache/storage_backends/nixl.po b/docs/source/locale/zh_CN/LC_MESSAGES/kv_cache/storage_backends/nixl.po
index 54729907f7..ffef7095ee 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/kv_cache/storage_backends/nixl.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/kv_cache/storage_backends/nixl.po
@@ -8,7 +8,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: LMCache \n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2026-05-29 22:44+0000\n"
+"POT-Creation-Date: 2026-06-08 10:45+0000\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language: zh_CN\n"
@@ -83,78 +83,115 @@ msgstr "通过 ``LMCACHE_CONFIG_FILE=lmcache-config.yaml`` 传入"
 msgid "Example ``lmcache-config.yaml`` for POSIX backend:"
 msgstr "示例 ``lmcache-config.yaml`` 用于 POSIX 后端："
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:43
+#: ../../source/kv_cache/storage_backends/nixl.rst:42
 msgid "Key settings:"
 msgstr "关键设置："
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:45
-msgid "``nixl_buffer_size``: buffer size for NIXL transfers."
-msgstr "``nixl_buffer_size``: NIXL 传输的缓冲区大小。"
+#: ../../source/kv_cache/storage_backends/nixl.rst:44
+msgid ""
+"``nixl_buffer_size``: buffer size for NIXL transfers. **GPU mode only** "
+"(``nixl_buffer_device: cuda``). Setting this with ``nixl_buffer_device: "
+"cpu`` is a configuration error and will be rejected — in CPU mode NIXL "
+"shares ``LocalCPUBackend``'s pinned pool, which is sized by "
+"``max_local_cpu_size``."
+msgstr "``nixl_buffer_size``：NIXL 传输的缓冲区大小。**仅限 GPU 模式**（``nixl_buffer_device: cuda``）。将其设置为 ``nixl_buffer_device: cpu`` 是配置错误，将被拒绝——在 CPU 模式下，NIXL 共享 ``LocalCPUBackend`` 的固定池，该池的大小由 ``max_local_cpu_size`` 决定。"
+
+#: ../../source/kv_cache/storage_backends/nixl.rst:46
+msgid ""
+"``max_local_cpu_size``: size of ``LocalCPUBackend``'s pinned pool in GiB."
+" In CPU mode, this pool is shared with NIXL and must accommodate both the"
+" hot cache and concurrent NIXL I/O in flight. Must be > 0 when "
+"``nixl_buffer_device: cpu``. Default: ``5.0``."
+msgstr "``max_local_cpu_size``: ``LocalCPUBackend`` 的固定池大小（以 GiB 为单位）。在 CPU 模式下，该池与 NIXL 共享，必须能够容纳热缓存和并发的 NIXL I/O。设置为 ``nixl_buffer_device: cpu`` 时必须大于 0。默认值：``5.0``。"
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:47
+#: ../../source/kv_cache/storage_backends/nixl.rst:48
 msgid ""
 "``nixl_pool_size``: number of descriptors opened at init time for nixl "
 "backend. Set to 0 for dynamic mode."
 msgstr "``nixl_pool_size``: 初始化时为 nixl 后端打开的描述符数量。设置为 0 以启用动态模式。"
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:49
+#: ../../source/kv_cache/storage_backends/nixl.rst:50
 msgid ""
 "``nixl_path``: directory under which the storage files will be saved "
 "(e.g. /mnt/nixl/). Needed for NIXL backends that store to file."
 msgstr "``nixl_path``: 存储文件将保存的目录（例如 /mnt/nixl/）。对于将数据存储到文件的 NIXL 后端，这是必需的。"
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:51
+#: ../../source/kv_cache/storage_backends/nixl.rst:52
 msgid ""
 "``nixl_buffer_device``: dictates where the memory managed by NIXL should "
 "be on. \"cpu\" or \"cuda\" is supported for \"GDS\", \"GDS_MT\", and "
-"\"OBJ\" backends - for \"POSIX\", \"HF3FS\" & \"AZURE_BLOB\", must be "
-"\"cpu\"."
-msgstr ""
-"``nixl_buffer_device``: 指定 NIXL 管理的内存应该位于何处。对于 \\\"GDS\\\"、\\\"GDS_MT\\\""
-" 和 \\\"OBJ\\\" 后端，支持 \\\"cpu\\\" 或 \\\"cuda\\\"；对于 "
-"\\\"POSIX\\\"、\\\"HF3FS\\\" 和 \\\"AZURE_BLOB\\\"，必须为 \\\"cpu\\\"。"
-
-#: ../../source/kv_cache/storage_backends/nixl.rst:53
+"\"OBJ\" backends - for \"POSIX\", \"HF3FS\", \"AZURE_BLOB\" & "
+"\"DOCA_MEMOS\", must be \"cpu\". In CPU mode, NIXL shares "
+"``LocalCPUBackend``'s pinned buffer; ``LocalCPUBackend`` is always "
+"created when ``nixl_buffer_device: cpu``, regardless of the ``local_cpu``"
+" setting. ``local_cpu: false`` still suppresses hot-cache promotions — "
+"the backend acts as a staging buffer only, mirroring how ``local_disk`` "
+"already uses ``LocalCPUBackend``."
+msgstr "``nixl_buffer_device``: 指定 NIXL 管理的内存应位于何处。对于 ``GDS``、``GDS_MT`` 和 ``OBJ`` 后端，支持 ``cpu`` 或 ``cuda``；对于 ``POSIX``、``HF3FS``、``AZURE_BLOB`` 和 ``DOCA_MEMOS``，必须为 ``cpu``。在 CPU 模式下，NIXL 共享 ``LocalCPUBackend`` 的固定缓冲区；当 ``nixl_buffer_device: cpu`` 时，``LocalCPUBackend`` 始终会被创建，无论 ``local_cpu`` 设置如何。``local_cpu: false`` 仍然会抑制热缓存提升——后端仅作为暂存缓冲区，镜像 ``local_disk`` 已经如何使用 ``LocalCPUBackend``。"
+
+#: ../../source/kv_cache/storage_backends/nixl.rst:54
 msgid "``nixl_backend``: configuration of which nixl backend to use for storage."
 msgstr "``nixl_backend``: 配置用于存储的 nixl 后端。"
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:55
+#: ../../source/kv_cache/storage_backends/nixl.rst:56
 msgid ""
-"``nixl_use_hugepages``: whether to use Linux hugepages (2 MiB) for the "
-"NIXL CPU buffer. Not supported for GPU buffers. Requires pre-allocated "
-"hugepages (``sysctl vm.nr_hugepages``). Default: ``false``."
-msgstr "``nixl_use_hugepages``: 是否为 NIXL CPU 缓冲区使用 Linux 大页（2 MiB）。不支持 GPU 缓冲区。需要预分配的大页（``sysctl vm.nr_hugepages``）。默认值：``false``。"
-
-#: ../../source/kv_cache/storage_backends/nixl.rst:59
+"``local_cpu_use_hugepages``: whether to use Linux hugepages (2 MiB) for "
+"``LocalCPUBackend``'s pinned pool (which NIXL shares in CPU mode). "
+"Requires pre-allocated hugepages (``sysctl vm.nr_hugepages``). Default: "
+"``false``. **Deprecated alias:** ``extra_config.nixl_use_hugepages`` — "
+"accepted with a warning and copied into this field; will be removed in a "
+"future release."
+msgstr "``local_cpu_use_hugepages``: 是否为 ``LocalCPUBackend`` 的固定池使用 Linux 大页（2 MiB）（NIXL 在 CPU 模式下共享）。需要预分配的大页（``sysctl vm.nr_hugepages``）。默认值：``false``。**已弃用的别名：** ``extra_config.nixl_use_hugepages`` — 接受时会发出警告并复制到此字段；将在未来的版本中移除。"
+
+#: ../../source/kv_cache/storage_backends/nixl.rst:60
+msgid ""
+"In CPU mode, the shared paged allocator consumes one full page per "
+"object. With ``save_unfull_chunk: true`` (only valid in static mode — "
+"dynamic mode rejects it; see \"Dynamic Mode\" → \"Restrictions\" below), "
+"partial chunks still occupy a full page each, so effective capacity "
+"degrades proportionally to the fraction of unfull last chunks across "
+"active sequences."
+msgstr "在 CPU 模式下，共享分页分配器每个对象消耗一个完整的页面。使用 ``save_unfull_chunk: true``（仅在静态模式下有效——动态模式会拒绝它；请参见“动态模式”→“限制”），部分块仍然占用一个完整的页面，因此有效容量与活动序列中未满最后块的比例成正比地下降。"
+
+#: ../../source/kv_cache/storage_backends/nixl.rst:64
+msgid ""
+"``enable_p2p: true`` is rejected together with ``nixl_buffer_device: "
+"cpu``. The combination is structurally supported — both backends share "
+"``LocalCPUBackend``'s pinned pool, each runs its own NIXL agent over it, "
+"and allocations route through ``LocalCPUBackend.allocate()`` — but it has"
+" not been exercised end-to-end and has no CI coverage. Use ``enable_p2p: "
+"true`` with ``nixl_buffer_device: cuda`` instead, or disable "
+"``enable_p2p`` when running the NIXL CPU shared pool."
+msgstr "``enable_p2p: true`` 与 ``nixl_buffer_device: cpu`` 一起被拒绝。这个组合在结构上是支持的——两个后端共享 ``LocalCPUBackend`` 的固定池，各自运行自己的 NIXL 代理，并且分配通过 ``LocalCPUBackend.allocate()`` 路由——但尚未进行端到端的测试，也没有 CI 覆盖。在使用 NIXL CPU 共享池时，请改用 ``nixl_buffer_device: cuda`` 的 ``enable_p2p: true``，或者在运行时禁用 ``enable_p2p``。"
+
+#: ../../source/kv_cache/storage_backends/nixl.rst:68
 msgid ""
 "Supported backends are: [\"GDS\", \"GDS_MT\", \"POSIX\", \"HF3FS\", "
-"\"OBJ\", \"AZURE_BLOB\"]."
-msgstr ""
-"支持的后端包括：[\\\"GDS\\\", \\\"GDS_MT\\\", \\\"POSIX\\\", \\\"HF3FS\\\", "
-"\\\"OBJ\\\", \\\"AZURE_BLOB\\\"]。"
+"\"OBJ\", \"AZURE_BLOB\", \"DOCA_MEMOS\"]."
+msgstr "支持的后端包括：[\\\"GDS\\\", \\\"GDS_MT\\\", \\\"POSIX\\\", \\\"HF3FS\\\", \\\"OBJ\\\", \\\"AZURE_BLOB\\\", \\\"DOCA_MEMOS\\\"]."
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:61
+#: ../../source/kv_cache/storage_backends/nixl.rst:70
 msgid ""
 "Backend specific params should be provided via "
 "``extra_config.nixl_backend_params``. Please refer to NIXL documentation "
 "for specifics."
 msgstr "后端特定参数应通过 ``extra_config.nixl_backend_params`` 提供。有关具体信息，请参阅 NIXL 文档。"
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:63
+#: ../../source/kv_cache/storage_backends/nixl.rst:72
 msgid "Example ``lmcache-config.yaml`` for OBJ backend using S3 API:"
 msgstr "示例 ``lmcache-config.yaml`` 用于使用 S3 API 的 OBJ 后端："
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:81
+#: ../../source/kv_cache/storage_backends/nixl.rst:90
 msgid "Example ``lmcache-config.yaml`` for POSIX backend using liburing:"
 msgstr "示例 ``lmcache-config.yaml`` 用于使用 liburing 的 POSIX 后端："
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:85
+#: ../../source/kv_cache/storage_backends/nixl.rst:94
 msgid ""
 "using POSIX backend with liburing requires NIXL to be built with liburing"
 " support."
 msgstr "使用带有 liburing 支持的 POSIX 后端需要将 NIXL 构建为支持 liburing。"
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:101
+#: ../../source/kv_cache/storage_backends/nixl.rst:110
 msgid ""
 "Example ``lmcache-config.yaml`` for AZURE_BLOB backend to offload using "
 "Azure Blob Storage API:"
@@ -162,11 +199,11 @@ msgstr ""
 "示例 ``lmcache-config.yaml`` 用于 AZURE_BLOB 后端，通过 Azure Blob Storage API "
 "进行卸载："
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:118
+#: ../../source/kv_cache/storage_backends/nixl.rst:127
 msgid "Per-Worker Endpoint Distribution"
 msgstr "每个工作节点的端点分配"
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:120
+#: ../../source/kv_cache/storage_backends/nixl.rst:129
 msgid ""
 "When using the OBJ backend with multiple tensor-parallel (TP) workers, "
 "you can distribute workers across multiple object-storage endpoints by "
@@ -178,7 +215,7 @@ msgstr ""
 "``nixl_endpoint_list``。每个工作节点根据其 ``local_worker_id``（其主机内的工作节点 "
 "ID）以轮询方式选择一个端点。"
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:144
+#: ../../source/kv_cache/storage_backends/nixl.rst:153
 msgid ""
 "When ``nixl_endpoint_list`` is set, any ``endpoint_override`` value in "
 "``nixl_backend_params`` is ignored (a warning is logged)."
@@ -186,41 +223,107 @@ msgstr ""
 "当设置 ``nixl_endpoint_list`` 时，``nixl_backend_params`` 中的任何 "
 "``endpoint_override`` 值将被忽略（会记录警告）。"
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:148
+#: ../../source/kv_cache/storage_backends/nixl.rst:156
+msgid ""
+"``nixl_endpoint_list`` is only honored for the OBJ backend; it is ignored"
+" for all other backends (including DOCA_MEMOS, AZURE_BLOB, and the file "
+"backends)."
+msgstr "``nixl_endpoint_list`` 仅对 OBJ 后端有效；对于所有其他后端（包括 DOCA_MEMOS、AZURE_BLOB 和文件后端）将被忽略。"
+
+#: ../../source/kv_cache/storage_backends/nixl.rst:161
 msgid "Dynamic Mode"
 msgstr "动态模式"
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:150
+#: ../../source/kv_cache/storage_backends/nixl.rst:163
 msgid ""
 "Nixl Storage Backend also supports a dynamic mode, which creates nixl "
 "storage descriptors on demand instead of at init time."
 msgstr "Nixl 存储后端还支持动态模式，该模式按需创建 nixl 存储描述符，而不是在初始化时创建。"
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:152
+#: ../../source/kv_cache/storage_backends/nixl.rst:165
 msgid ""
 "In order to use dynamic mode, extra_config.nixl_pool_size should be set "
 "to 0."
 msgstr "为了使用动态模式，extra_config.nixl_pool_size 应设置为 0。"
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:155
+#: ../../source/kv_cache/storage_backends/nixl.rst:168
 msgid "Restrictions"
 msgstr "限制"
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:157
+#: ../../source/kv_cache/storage_backends/nixl.rst:170
 msgid ""
-"Dynamic mode is currently only supported for nixl OBJ and AZURE_BLOB "
-"backends."
-msgstr "动态模式目前仅支持 nixl OBJ 和 AZURE_BLOB 后端。"
+"Dynamic mode is supported for object backends (\"OBJ\", \"AZURE_BLOB\", "
+"\"DOCA_MEMOS\") and file backends (\"POSIX\", \"GDS\", \"GDS_MT\", "
+"\"HF3FS\")."
+msgstr "动态模式支持对象后端（“OBJ”，“AZURE_BLOB”，“DOCA_MEMOS”）和文件后端（“POSIX”，“GDS”，“GDS_MT”，“HF3FS”）。"
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:158
+#: ../../source/kv_cache/storage_backends/nixl.rst:171
 msgid "save_unfull_chunk must be set to False."
 msgstr "save_unfull_chunk 必须设置为 False。"
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:160
+#: ../../source/kv_cache/storage_backends/nixl.rst:173
 msgid "Example ``lmcache-config.yaml`` for OBJ backend with dynamic mode:"
 msgstr "示例 ``lmcache-config.yaml`` 用于动态模式的 OBJ 后端："
 
-#: ../../source/kv_cache/storage_backends/nixl.rst:187
+#: ../../source/kv_cache/storage_backends/nixl.rst:198
 msgid "Example ``lmcache-config.yaml`` for AZURE_BLOB backend with dynamic mode:"
 msgstr "示例 ``lmcache-config.yaml`` 用于动态模式的 AZURE_BLOB 后端："
 
+#: ../../source/kv_cache/storage_backends/nixl.rst:219
+msgid "DOCA_MEMOS Backend (NVIDIA CMX)"
+msgstr "DOCA_MEMOS 后端 (NVIDIA CMX)"
+
+#: ../../source/kv_cache/storage_backends/nixl.rst:221
+msgid ""
+"``DOCA_MEMOS`` stores KV cache on NVIDIA CMX (Context Memory Storage), a "
+"BlueField-4 context-memory tier accessed through NIXL. It is an object-"
+"style backend (like ``OBJ``), supported in both static "
+"(``nixl_pool_size`` > 0) and dynamic (``nixl_pool_size`` = 0) mode. "
+"``nixl_buffer_device`` must be ``cpu``. ``nixl_endpoint_list`` is not "
+"supported for DOCA_MEMOS."
+msgstr "``DOCA_MEMOS`` 将 KV Cache 存储在 NVIDIA CMX（上下文内存存储）上，这是一个通过 NIXL 访问的 BlueField-4 上下文内存层。它是一种对象风格的后端（类似于 ``OBJ``），支持静态模式（``nixl_pool_size`` > 0）和动态模式（``nixl_pool_size`` = 0）。``nixl_buffer_device`` 必须为 ``cpu``。``nixl_endpoint_list`` 不支持 DOCA_MEMOS。"
+
+#: ../../source/kv_cache/storage_backends/nixl.rst:227
+msgid ""
+"Object names are 128-bit lowercase-hex strings: the NIXL DOCA_MEMOS "
+"plugin passes object names as strings and hex-decodes them on the device "
+"side, so each name is exactly 32 hex characters. In dynamic mode this "
+"name is a truncated SHA-256 of the cache key, so names are opaque (they "
+"carry no model/chunk debug information) and uniqueness is probabilistic "
+"at 128 bits."
+msgstr "对象名称是 128 位小写十六进制字符串：NIXL DOCA_MEMOS 插件将对象名称作为字符串传递，并在设备端进行十六进制解码，因此每个名称恰好是 32 个十六进制字符。在动态模式下，此名称是缓存键的截断 SHA-256，因此名称是不透明的（它们不携带任何模型/块调试信息），并且唯一性在 128 位上是概率性的。"
+
+#~ msgid "``nixl_buffer_size``: buffer size for NIXL transfers."
+#~ msgstr "``nixl_buffer_size``: NIXL 传输的缓冲区大小。"
+
+#~ msgid ""
+#~ "``nixl_buffer_device``: dictates where the "
+#~ "memory managed by NIXL should be "
+#~ "on. \"cpu\" or \"cuda\" is supported "
+#~ "for \"GDS\", \"GDS_MT\", and \"OBJ\" "
+#~ "backends - for \"POSIX\", \"HF3FS\" &"
+#~ " \"AZURE_BLOB\", must be \"cpu\"."
+#~ msgstr ""
+#~ "``nixl_buffer_device``: 指定 NIXL 管理的内存应该位于何处。对于 "
+#~ "\\\"GDS\\\"、\\\"GDS_MT\\\" 和 \\\"OBJ\\\" 后端，支持 "
+#~ "\\\"cpu\\\" 或 \\\"cuda\\\"；对于 "
+#~ "\\\"POSIX\\\"、\\\"HF3FS\\\" 和 \\\"AZURE_BLOB\\\"，必须为 "
+#~ "\\\"cpu\\\"。"
+
+#~ msgid ""
+#~ "``nixl_use_hugepages``: whether to use Linux"
+#~ " hugepages (2 MiB) for the NIXL "
+#~ "CPU buffer. Not supported for GPU "
+#~ "buffers. Requires pre-allocated hugepages "
+#~ "(``sysctl vm.nr_hugepages``). Default: ``false``."
+#~ msgstr ""
+#~ "``nixl_use_hugepages``: 是否为 NIXL CPU 缓冲区使用 "
+#~ "Linux 大页（2 MiB）。不支持 GPU 缓冲区。需要预分配的大页（``sysctl"
+#~ " vm.nr_hugepages``）。默认值：``false``。"
+
+#~ msgid ""
+#~ "Dynamic mode is currently only supported"
+#~ " for nixl OBJ and AZURE_BLOB "
+#~ "backends."
+#~ msgstr "动态模式目前仅支持 nixl OBJ 和 AZURE_BLOB 后端。"
+
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/mp/architecture.po b/docs/source/locale/zh_CN/LC_MESSAGES/mp/architecture.po
index d345fdb064..f644c7ecd3 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/mp/architecture.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/mp/architecture.po
@@ -8,7 +8,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: LMCache \n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2026-05-18 17:25+0000\n"
+"POT-Creation-Date: 2026-06-08 10:45+0000\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language: zh_CN\n"
@@ -40,28 +40,39 @@ msgstr "服务器变体"
 
 #: ../../source/mp/architecture.rst:45
 msgid ""
-"All three server entry points share the same ``MPCacheEngine`` and "
-"``StorageManager`` core."
-msgstr "所有三个服务器入口点共享相同的 ``MPCacheEngine`` 和 ``StorageManager`` 核心。"
+"All server entry points share the same ``MPCacheEngine`` and "
+"``StorageManager`` core. ``MPCacheEngine`` is now a thin compositor: it "
+"holds an ``MPCacheEngineContext`` and a list of ``EngineModule`` "
+"instances assembled by ``build_engine_modules()`` (in ``server.py``) "
+"based on ``--engine-type`` and ``--supported-transfer-mode``."
+msgstr "所有服务器入口点共享相同的 ``MPCacheEngine`` 和 ``StorageManager`` 核心。 ``MPCacheEngine`` 现在是一个轻量级合成器：它持有一个 ``MPCacheEngineContext`` 和一个由 ``build_engine_modules()``（在 ``server.py`` 中）基于 ``--engine-type`` 和 ``--supported-transfer-mode`` 组装的 ``EngineModule`` 实例列表。"
 
-#: ../../source/mp/architecture.rst:48
+#: ../../source/mp/architecture.rst:51
 msgid ""
 "**``server.py``** -- The default ZMQ-only server.  Creates an "
-"``MPCacheEngine`` and a ``MessageQueueServer``, registers handlers for "
-"all core ``RequestType`` values, and blocks in a keep-alive loop."
-msgstr "**``server.py``** -- 默认的仅 ZMQ 服务器。创建一个 ``MPCacheEngine`` 和一个 ``MessageQueueServer``，为所有核心 ``RequestType`` 值注册处理程序，并在保持活动循环中阻塞。"
-
-#: ../../source/mp/architecture.rst:52
-msgid ""
-"**``blend_server_v2.py``** -- Extends ``MPCacheEngine`` with "
-"``BlendEngineV2``, which adds CacheBlend operations "
+"``MPCacheEngine``, assembles the engine modules (``LookupModule`` + "
+"``ManagementModule`` + ``GPUTransferModule`` and/or "
+"``NonGPUTransferModule`` depending on ``--supported-transfer-mode`` — "
+"``gpu`` or ``non_gpu`` loads just one, ``auto`` (default) loads both — "
+"plus ``BlendModule`` when ``--engine-type blend``), starts a "
+"``MessageQueueServer``, registers handlers for every ``RequestType`` "
+"exposed by the loaded modules, and blocks in a keep-alive loop."
+msgstr "**``server.py``** -- 默认的仅 ZMQ 服务器。创建一个 ``MPCacheEngine``，组装引擎模块（``LookupModule`` + ``ManagementModule`` + ``GPUTransferModule`` 和/或 ``NonGPUTransferModule``，具体取决于 ``--supported-transfer-mode`` — ``gpu`` 或 ``non_gpu`` 仅加载一个，``auto``（默认）加载两个 — 当 ``--engine-type blend`` 时还会加载 ``BlendModule``），启动 ``MessageQueueServer``，为加载的模块暴露的每个 ``RequestType`` 注册处理程序，并在保持活动的循环中阻塞。"
+
+#: ../../source/mp/architecture.rst:61
+msgid ""
+"**``modules/blend.py``** -- Defines ``BlendModule`` and "
+"``BlendEngineV2``, which add CacheBlend operations "
 "(``CB_REGISTER_KV_CACHE``, ``CB_LOOKUP_PRE_COMPUTED``, "
 "``CB_STORE_PRE_COMPUTED``, ``CB_RETRIEVE_PRE_COMPUTED``, "
-"``CB_STORE_FINAL``).  Enables non-prefix KV cache reuse across document "
-"paragraphs."
-msgstr "**``blend_server_v2.py``** -- 扩展了 ``MPCacheEngine``，增加了 ``BlendEngineV2``，该引擎添加了 CacheBlend 操作 (``CB_REGISTER_KV_CACHE``, ``CB_LOOKUP_PRE_COMPUTED``, ``CB_STORE_PRE_COMPUTED``, ``CB_RETRIEVE_PRE_COMPUTED``, ``CB_STORE_FINAL``)。 使得在文档段落之间能够重用非前缀 KV Cache。"
+"``CB_STORE_FINAL`` and their V2 variants). Enables non-prefix KV cache "
+"reuse across document paragraphs. Selected by passing ``--engine-type "
+"blend`` to ``lmcache server``; ``BlendModule`` requires ``--supported-"
+"transfer-mode`` to be ``gpu`` or ``auto`` and will refuse to load when it"
+" is ``non_gpu``."
+msgstr "**``modules/blend.py``** -- 定义了 ``BlendModule`` 和 ``BlendEngineV2``，它们添加了 CacheBlend 操作（``CB_REGISTER_KV_CACHE``、``CB_LOOKUP_PRE_COMPUTED``、``CB_STORE_PRE_COMPUTED``、``CB_RETRIEVE_PRE_COMPUTED``、``CB_STORE_FINAL`` 及其 V2 变体）。启用跨文档段落的非前缀 KV 缓存重用。通过将 ``--engine-type blend`` 传递给 ``lmcache server`` 进行选择；``BlendModule`` 需要 ``--supported-transfer-mode`` 为 ``gpu`` 或 ``auto``，当为 ``non_gpu`` 时将拒绝加载。"
 
-#: ../../source/mp/architecture.rst:58
+#: ../../source/mp/architecture.rst:71
 msgid ""
 "**``http_server.py``** -- Wraps ``run_cache_server()`` (from "
 "``server.py``) inside a FastAPI application.  Endpoints are contributed "
@@ -72,380 +83,397 @@ msgid ""
 "state.  The ZMQ server runs as part of the same process, and any "
 "configured runtime plugins are spawned by ``MPRuntimePluginLauncher`` "
 "during FastAPI startup."
-msgstr "**``http_server.py``** -- 在 FastAPI 应用程序中封装 ``run_cache_server()`` (来自 ``server.py``)。 端点由 ``http_apis/`` 下的模块贡献，并通过 ``HTTPAPIRegistry`` 自动注册：``GET /`` (基本存活检查)、``GET /healthcheck`` 用于 Kubernetes 探针、``POST /clear-cache`` 用于清除 L1 (CPU) 内存中的所有 KV 缓存数据，以及 ``GET /status`` 用于检查详细的内部状态。 ZMQ 服务器作为同一进程的一部分运行，任何配置的运行时插件在 FastAPI 启动期间由 ``MPRuntimePluginLauncher`` 生成。"
+msgstr ""
+"**``http_server.py``** -- 在 FastAPI 应用程序中封装 ``run_cache_server()`` (来自 "
+"``server.py``)。 端点由 ``http_apis/`` 下的模块贡献，并通过 ``HTTPAPIRegistry`` "
+"自动注册：``GET /`` (基本存活检查)、``GET /healthcheck`` 用于 Kubernetes 探针、``POST "
+"/clear-cache`` 用于清除 L1 (CPU) 内存中的所有 KV 缓存数据，以及 ``GET /status`` "
+"用于检查详细的内部状态。 ZMQ 服务器作为同一进程的一部分运行，任何配置的运行时插件在 FastAPI 启动期间由 "
+"``MPRuntimePluginLauncher`` 生成。"
 
-#: ../../source/mp/architecture.rst:68
+#: ../../source/mp/architecture.rst:81
 msgid "ZMQ Protocol"
 msgstr "ZMQ 协议"
 
-#: ../../source/mp/architecture.rst:70
+#: ../../source/mp/architecture.rst:83
 msgid "Communication between vLLM and LMCache uses ZMQ (DEALER/ROUTER pattern)."
 msgstr "vLLM 和 LMCache 之间的通信使用 ZMQ（DEALER/ROUTER 模式）。"
 
-#: ../../source/mp/architecture.rst:72
+#: ../../source/mp/architecture.rst:85
 msgid "**RequestType enum** (defined in ``protocols/base.py``):"
 msgstr "**RequestType 枚举**（定义在 ``protocols/base.py``）："
 
-#: ../../source/mp/architecture.rst:78
+#: ../../source/mp/architecture.rst:91
 msgid "Request Type"
 msgstr "请求类型"
 
-#: ../../source/mp/architecture.rst:79
+#: ../../source/mp/architecture.rst:92
 msgid "Handler Type"
 msgstr "处理程序类型"
 
-#: ../../source/mp/architecture.rst:80
+#: ../../source/mp/architecture.rst:93
 msgid "Description"
 msgstr "描述"
 
-#: ../../source/mp/architecture.rst:81
+#: ../../source/mp/architecture.rst:94
 msgid "``REGISTER_KV_CACHE``"
 msgstr "``REGISTER_KV_CACHE``"
 
-#: ../../source/mp/architecture.rst:82 ../../source/mp/architecture.rst:85
-#: ../../source/mp/architecture.rst:117 ../../source/mp/architecture.rst:127
-#: ../../source/mp/architecture.rst:130 ../../source/mp/architecture.rst:133
+#: ../../source/mp/architecture.rst:95 ../../source/mp/architecture.rst:98
+#: ../../source/mp/architecture.rst:130 ../../source/mp/architecture.rst:140
+#: ../../source/mp/architecture.rst:143 ../../source/mp/architecture.rst:146
 msgid "SYNC"
 msgstr "同步"
 
-#: ../../source/mp/architecture.rst:83
+#: ../../source/mp/architecture.rst:96
 msgid "Register GPU KV cache tensors for a vLLM instance."
 msgstr "为 vLLM 实例注册 GPU KV Cache 张量。"
 
-#: ../../source/mp/architecture.rst:84
+#: ../../source/mp/architecture.rst:97
 msgid "``UNREGISTER_KV_CACHE``"
 msgstr "``UNREGISTER_KV_CACHE``"
 
-#: ../../source/mp/architecture.rst:86
+#: ../../source/mp/architecture.rst:99
 msgid "Unregister KV cache tensors."
 msgstr "注销 KV Cache 张量。"
 
-#: ../../source/mp/architecture.rst:87
+#: ../../source/mp/architecture.rst:100
 msgid "``STORE``"
 msgstr "``STORE``"
 
-#: ../../source/mp/architecture.rst:88 ../../source/mp/architecture.rst:91
-#: ../../source/mp/architecture.rst:94 ../../source/mp/architecture.rst:98
-#: ../../source/mp/architecture.rst:102 ../../source/mp/architecture.rst:107
-#: ../../source/mp/architecture.rst:111 ../../source/mp/architecture.rst:114
-#: ../../source/mp/architecture.rst:120 ../../source/mp/architecture.rst:123
-#: ../../source/mp/architecture.rst:136 ../../source/mp/architecture.rst:139
-#: ../../source/mp/architecture.rst:142 ../../source/mp/architecture.rst:145
-#: ../../source/mp/architecture.rst:148 ../../source/mp/architecture.rst:153
+#: ../../source/mp/architecture.rst:101 ../../source/mp/architecture.rst:104
+#: ../../source/mp/architecture.rst:107 ../../source/mp/architecture.rst:111
+#: ../../source/mp/architecture.rst:115 ../../source/mp/architecture.rst:120
+#: ../../source/mp/architecture.rst:124 ../../source/mp/architecture.rst:127
+#: ../../source/mp/architecture.rst:133 ../../source/mp/architecture.rst:136
+#: ../../source/mp/architecture.rst:149 ../../source/mp/architecture.rst:152
+#: ../../source/mp/architecture.rst:155 ../../source/mp/architecture.rst:158
+#: ../../source/mp/architecture.rst:161 ../../source/mp/architecture.rst:166
 msgid "BLOCKING"
 msgstr "阻塞"
 
-#: ../../source/mp/architecture.rst:89
+#: ../../source/mp/architecture.rst:102
 msgid "Store KV cache chunks from GPU to L1 (CPU)."
 msgstr "将 KV Cache 块从 GPU 存储到 L1 (CPU)。"
 
-#: ../../source/mp/architecture.rst:90
+#: ../../source/mp/architecture.rst:103
 msgid "``RETRIEVE``"
 msgstr "``RETRIEVE``"
 
-#: ../../source/mp/architecture.rst:92
+#: ../../source/mp/architecture.rst:105
 msgid "Copy KV cache chunks from L1 (CPU) back to GPU."
 msgstr "将 KV Cache 块从 L1 (CPU) 复制回 GPU。"
 
-#: ../../source/mp/architecture.rst:93
+#: ../../source/mp/architecture.rst:106
 msgid "``LOOKUP``"
 msgstr "``LOOKUP``"
 
-#: ../../source/mp/architecture.rst:95
+#: ../../source/mp/architecture.rst:108
 msgid ""
 "Submit a prefix lookup; the prefetch job is tracked server-side by "
 "request_id."
 msgstr "提交前缀查找；预取作业由 request_id 在服务器端进行跟踪。"
 
-#: ../../source/mp/architecture.rst:97
+#: ../../source/mp/architecture.rst:110
 msgid "``QUERY_PREFETCH_STATUS``"
 msgstr "``QUERY_PREFETCH_STATUS``"
 
-#: ../../source/mp/architecture.rst:99
+#: ../../source/mp/architecture.rst:112
 msgid ""
 "Poll a prefetch job by request_id. Returns the loaded chunk count when "
 "done, or ``None`` while the prefetch is still in progress."
 msgstr "通过 request_id 轮询预取作业。完成时返回加载的块数，预取仍在进行时返回 ``None``。"
 
-#: ../../source/mp/architecture.rst:101
+#: ../../source/mp/architecture.rst:114
 msgid "``QUERY_PREFETCH_LOOKUP_HITS``"
 msgstr "``QUERY_PREFETCH_LOOKUP_HITS``"
 
-#: ../../source/mp/architecture.rst:103
+#: ../../source/mp/architecture.rst:116
 msgid ""
 "Query the lookup-phase hit chunk count by request_id, before the prefetch"
 " finishes. Returns ``None`` while the lookup is still running."
 msgstr "在预取完成之前，通过 request_id 查询查找阶段的命中块计数。当查找仍在运行时返回 ``None``。"
 
-#: ../../source/mp/architecture.rst:106
+#: ../../source/mp/architecture.rst:119
 msgid "``FREE_LOOKUP_LOCKS``"
 msgstr "``FREE_LOOKUP_LOCKS``"
 
-#: ../../source/mp/architecture.rst:108
+#: ../../source/mp/architecture.rst:121
 msgid "Release read locks from a cancelled lookup without doing a full RETRIEVE."
 msgstr "从取消的查找中释放读取锁，而无需执行完整的 RETRIEVE。"
 
-#: ../../source/mp/architecture.rst:110
+#: ../../source/mp/architecture.rst:123
 msgid "``END_SESSION``"
 msgstr "``END_SESSION``"
 
-#: ../../source/mp/architecture.rst:112
+#: ../../source/mp/architecture.rst:125
 msgid "Remove session state for a finished request."
 msgstr "移除已完成请求的会话状态。"
 
-#: ../../source/mp/architecture.rst:113
+#: ../../source/mp/architecture.rst:126
 msgid "``CLEAR``"
 msgstr "``CLEAR``"
 
-#: ../../source/mp/architecture.rst:115
+#: ../../source/mp/architecture.rst:128
 msgid "Clear all cached data."
 msgstr "清除所有缓存数据。"
 
-#: ../../source/mp/architecture.rst:116
+#: ../../source/mp/architecture.rst:129
 msgid "``GET_CHUNK_SIZE``"
 msgstr "``GET_CHUNK_SIZE``"
 
-#: ../../source/mp/architecture.rst:118
+#: ../../source/mp/architecture.rst:131
 msgid "Return the server's chunk size."
 msgstr "返回服务器的块大小。"
 
-#: ../../source/mp/architecture.rst:119
+#: ../../source/mp/architecture.rst:132
 msgid "``PING``"
 msgstr "``PING``"
 
-#: ../../source/mp/architecture.rst:121
+#: ../../source/mp/architecture.rst:134
 msgid "Liveness ping; the handler always returns ``True``."
 msgstr "存活探测；处理程序始终返回 ``True``。"
 
-#: ../../source/mp/architecture.rst:122
+#: ../../source/mp/architecture.rst:135
 msgid "``REPORT_BLOCK_ALLOCATION``"
 msgstr "``REPORT_BLOCK_ALLOCATION``"
 
-#: ../../source/mp/architecture.rst:124
+#: ../../source/mp/architecture.rst:137
 msgid ""
 "Fire-and-forget channel for the vLLM scheduler to report GPU block "
 "allocation events to the observability subsystem."
 msgstr "vLLM 调度器的火忘通道，用于向可观察性子系统报告 GPU 块分配事件。"
 
-#: ../../source/mp/architecture.rst:126
+#: ../../source/mp/architecture.rst:139
 msgid "``NOOP``"
 msgstr "``NOOP``"
 
-#: ../../source/mp/architecture.rst:128
+#: ../../source/mp/architecture.rst:141
 msgid "Debug heartbeat -- returns a confirmation string."
 msgstr "调试心跳 -- 返回确认字符串。"
 
-#: ../../source/mp/architecture.rst:129
+#: ../../source/mp/architecture.rst:142
 msgid "``CB_REGISTER_KV_CACHE``"
 msgstr "``CB_REGISTER_KV_CACHE``"
 
-#: ../../source/mp/architecture.rst:131
+#: ../../source/mp/architecture.rst:144
 msgid "(Blend) Register CacheBlend KV buffer."
 msgstr "(Blend) 注册 CacheBlend KV 缓冲区。"
 
-#: ../../source/mp/architecture.rst:132
+#: ../../source/mp/architecture.rst:145
 msgid "``CB_UNREGISTER_KV_CACHE``"
 msgstr "``CB_UNREGISTER_KV_CACHE``"
 
-#: ../../source/mp/architecture.rst:134
+#: ../../source/mp/architecture.rst:147
 msgid "(Blend) Unregister CacheBlend KV buffer."
 msgstr "(Blend) 取消注册 CacheBlend KV 缓冲区。"
 
-#: ../../source/mp/architecture.rst:135
+#: ../../source/mp/architecture.rst:148
 msgid "``CB_STORE_PRE_COMPUTED``"
 msgstr "``CB_STORE_PRE_COMPUTED``"
 
-#: ../../source/mp/architecture.rst:137
+#: ../../source/mp/architecture.rst:150
 msgid "(Blend) Store pre-computed paragraph chunks."
 msgstr "(Blend) 存储预计算的段落块。"
 
-#: ../../source/mp/architecture.rst:138
+#: ../../source/mp/architecture.rst:151
 msgid "``CB_LOOKUP_PRE_COMPUTED``"
 msgstr "``CB_LOOKUP_PRE_COMPUTED``"
 
-#: ../../source/mp/architecture.rst:140
+#: ../../source/mp/architecture.rst:153
 msgid "(Blend) Lookup pre-computed paragraph chunks."
 msgstr "(Blend) 查找预计算的段落块。"
 
-#: ../../source/mp/architecture.rst:141
+#: ../../source/mp/architecture.rst:154
 msgid "``CB_RETRIEVE_PRE_COMPUTED``"
 msgstr "``CB_RETRIEVE_PRE_COMPUTED``"
 
-#: ../../source/mp/architecture.rst:143
+#: ../../source/mp/architecture.rst:156
 msgid "(Blend) Retrieve pre-computed paragraph chunks to GPU."
 msgstr "(Blend) 将预计算的段落块检索到 GPU。"
 
-#: ../../source/mp/architecture.rst:144
+#: ../../source/mp/architecture.rst:157
 msgid "``CB_STORE_FINAL``"
 msgstr "``CB_STORE_FINAL``"
 
-#: ../../source/mp/architecture.rst:146
+#: ../../source/mp/architecture.rst:159
 msgid "(Blend) Store final blended chunks."
 msgstr "(Blend) 存储最终混合块。"
 
-#: ../../source/mp/architecture.rst:147
+#: ../../source/mp/architecture.rst:160
 msgid "``CB_LOOKUP_PRE_COMPUTED_V2``"
 msgstr "``CB_LOOKUP_PRE_COMPUTED_V2``"
 
-#: ../../source/mp/architecture.rst:149
+#: ../../source/mp/architecture.rst:162
 msgid ""
 "(Blend V2) Lookup pre-computed chunks; returns ``CBMatchResult`` entries "
 "(with old/cur ranges and per-chunk hashes) so the retrieve step can skip "
 "re-hashing."
 msgstr "（Blend V2）查找预计算的块；返回 ``CBMatchResult`` 条目（包含旧范围/当前范围和每块哈希），以便检索步骤可以跳过重新哈希。"
 
-#: ../../source/mp/architecture.rst:152
+#: ../../source/mp/architecture.rst:165
 msgid "``CB_RETRIEVE_PRE_COMPUTED_V2``"
 msgstr "``CB_RETRIEVE_PRE_COMPUTED_V2``"
 
-#: ../../source/mp/architecture.rst:154
+#: ../../source/mp/architecture.rst:167
 msgid ""
 "(Blend V2) Retrieve pre-computed chunks using the ``CBMatchResult`` list "
 "returned by ``CB_LOOKUP_PRE_COMPUTED_V2``."
 msgstr "（Blend V2）使用 ``CB_LOOKUP_PRE_COMPUTED_V2`` 返回的 ``CBMatchResult`` 列表检索预计算块。"
 
-#: ../../source/mp/architecture.rst:157
+#: ../../source/mp/architecture.rst:170
 msgid "**Handler types:**"
 msgstr "**处理程序类型:**"
 
-#: ../../source/mp/architecture.rst:159
+#: ../../source/mp/architecture.rst:172
 msgid "**SYNC** -- Runs directly in the ZMQ main loop (fast, non-blocking)."
 msgstr "**同步** -- 直接在 ZMQ 主循环中运行（快速，非阻塞）。"
 
-#: ../../source/mp/architecture.rst:160
+#: ../../source/mp/architecture.rst:173
 msgid ""
 "**BLOCKING** -- Dispatched to a thread pool (may involve GPU copies or "
 "I/O)."
 msgstr "**阻塞** -- 分配到线程池（可能涉及 GPU 复制或 I/O）。"
 
-#: ../../source/mp/architecture.rst:163
+#: ../../source/mp/architecture.rst:176
 msgid "Config System"
 msgstr "配置系统"
 
-#: ../../source/mp/architecture.rst:165
+#: ../../source/mp/architecture.rst:178
 msgid "Each config module exposes a composable triple:"
 msgstr "每个配置模块都暴露一个可组合的三元组："
 
-#: ../../source/mp/architecture.rst:171
+#: ../../source/mp/architecture.rst:184
 msgid "``server.py:parse_args()`` composes them:"
 msgstr "``server.py:parse_args()`` 组合它们："
 
-#: ../../source/mp/architecture.rst:184
+#: ../../source/mp/architecture.rst:197
 msgid ""
-"Both ``blend_server_v2.py`` and ``http_server.py`` reuse this pattern, "
-"adding ``add_http_frontend_args()`` for the HTTP variant."
-msgstr "``blend_server_v2.py`` 和 ``http_server.py`` 都重用了这个模式，为 HTTP 变体添加了 ``add_http_frontend_args()``。"
+"``http_server.py`` reuses this pattern, adding "
+"``add_http_frontend_args()`` for the HTTP variant. CacheBlend is no "
+"longer a separate entry point — it is opted into at runtime by passing "
+"``--engine-type blend`` to ``server.py`` (or ``lmcache server``), which "
+"appends ``BlendModule`` to the engine module list."
+msgstr "``http_server.py`` 重用此模式，为 HTTP 变体添加了 ``add_http_frontend_args()``。CacheBlend 不再是一个单独的入口点——它通过在运行时将 ``--engine-type blend`` 传递给 ``server.py``（或 ``lmcache server``）来选择，这会将 ``BlendModule`` 附加到引擎模块列表中。"
 
-#: ../../source/mp/architecture.rst:188
+#: ../../source/mp/architecture.rst:204
 msgid "Distributed Storage"
 msgstr "分布式存储"
 
-#: ../../source/mp/architecture.rst:191
+#: ../../source/mp/architecture.rst:207
 msgid "StorageManager"
 msgstr "StorageManager"
 
-#: ../../source/mp/architecture.rst:193 ../../source/mp/architecture.rst:423
+#: ../../source/mp/architecture.rst:209 ../../source/mp/architecture.rst:448
 msgid "``lmcache/v1/distributed/storage_manager.py``"
 msgstr "``lmcache/v1/distributed/storage_manager.py``"
 
-#: ../../source/mp/architecture.rst:195
+#: ../../source/mp/architecture.rst:211
 msgid ""
 "The top-level manager that wires together L1, L2, and all controllers.  "
 "Key methods:"
 msgstr "将 L1、L2 和所有控制器连接在一起的顶级管理器。关键方法："
 
-#: ../../source/mp/architecture.rst:198
+#: ../../source/mp/architecture.rst:214
 msgid "``reserve_write()`` / ``finish_write()`` -- Two-phase write into L1."
 msgstr "``reserve_write()`` / ``finish_write()`` -- L1 的两阶段写入。"
 
-#: ../../source/mp/architecture.rst:199
+#: ../../source/mp/architecture.rst:215
 msgid ""
 "``submit_prefetch_task()`` / ``query_prefetch_status()`` -- Async lookup "
 "+ L2 prefetch."
 msgstr "``submit_prefetch_task()`` / ``query_prefetch_status()`` -- 异步查找 + L2 预取。"
 
-#: ../../source/mp/architecture.rst:201
+#: ../../source/mp/architecture.rst:217
 msgid ""
 "``read_prefetched_results()`` / ``finish_read_prefetched()`` -- Read "
 "prefetched data from L1 with automatic lock management."
-msgstr "``read_prefetched_results()`` / ``finish_read_prefetched()`` -- 从 L1 读取预取的数据，并自动管理锁。"
+msgstr ""
+"``read_prefetched_results()`` / ``finish_read_prefetched()`` -- 从 L1 "
+"读取预取的数据，并自动管理锁。"
 
-#: ../../source/mp/architecture.rst:205
+#: ../../source/mp/architecture.rst:221
 msgid "L1Manager"
 msgstr "L1Manager"
 
-#: ../../source/mp/architecture.rst:207 ../../source/mp/architecture.rst:427
+#: ../../source/mp/architecture.rst:223 ../../source/mp/architecture.rst:452
 msgid "``lmcache/v1/distributed/l1_manager.py``"
 msgstr "``lmcache/v1/distributed/l1_manager.py``"
 
-#: ../../source/mp/architecture.rst:209
+#: ../../source/mp/architecture.rst:225
 msgid "Manages objects in CPU memory with a state machine:"
 msgstr "在 CPU 内存中使用状态机管理对象："
 
-#: ../../source/mp/architecture.rst:219
+#: ../../source/mp/architecture.rst:235
 msgid ""
 "Each object has two ``TTLLock`` instances (read and write) with "
 "configurable timeouts to prevent deadlocks from crashed clients."
 msgstr "每个对象都有两个 ``TTLLock`` 实例（读和写），并具有可配置的超时，以防止因客户端崩溃而导致的死锁。"
 
-#: ../../source/mp/architecture.rst:222
+#: ../../source/mp/architecture.rst:238
 msgid ""
 "The ``L1MemoryManager`` handles the underlying memory allocation (lazy "
 "growth up to ``--l1-size-gb``)."
 msgstr "``L1MemoryManager`` 处理底层内存分配（懒惰增长至 ``--l1-size-gb``）。"
 
-#: ../../source/mp/architecture.rst:226
+#: ../../source/mp/architecture.rst:242
 msgid "L2 Adapters"
 msgstr "L2 适配器"
 
-#: ../../source/mp/architecture.rst:228
+#: ../../source/mp/architecture.rst:244
 msgid "``lmcache/v1/distributed/l2_adapters/``"
 msgstr "``lmcache/v1/distributed/l2_adapters/``"
 
-#: ../../source/mp/architecture.rst:230
+#: ../../source/mp/architecture.rst:246
 msgid ""
 "The ``L2AdapterInterface`` (in ``base.py``) defines three async task "
 "methods:"
 msgstr "``L2AdapterInterface``（在 ``base.py`` 中）定义了三个异步任务方法："
 
-#: ../../source/mp/architecture.rst:232
+#: ../../source/mp/architecture.rst:248
 msgid "``submit_store_task(key, data)`` -- Push data to L2."
 msgstr "``submit_store_task(key, data)`` -- 将数据推送到 L2."
 
-#: ../../source/mp/architecture.rst:233
+#: ../../source/mp/architecture.rst:249
 msgid "``submit_lookup_and_lock_task(keys)`` -- Check if keys exist in L2."
 msgstr "``submit_lookup_and_lock_task(keys)`` -- 检查 keys 是否存在于 L2 中。"
 
-#: ../../source/mp/architecture.rst:234
+#: ../../source/mp/architecture.rst:250
 msgid "``submit_load_task(keys, layout_desc)`` -- Load data from L2 into L1."
 msgstr "``submit_load_task(keys, layout_desc)`` -- 从 L2 加载数据到 L1。"
 
-#: ../../source/mp/architecture.rst:236
+#: ../../source/mp/architecture.rst:252
 msgid ""
 "The factory function ``create_l2_adapter()`` (in ``__init__.py``) uses "
 "``isinstance()`` on the config type to instantiate the correct adapter."
-msgstr "工厂函数 ``create_l2_adapter()`` （在 ``__init__.py`` 中）使用 ``isinstance()`` 对配置类型进行检查，以实例化正确的适配器。"
+msgstr ""
+"工厂函数 ``create_l2_adapter()`` （在 ``__init__.py`` 中）使用 ``isinstance()`` "
+"对配置类型进行检查，以实例化正确的适配器。"
 
-#: ../../source/mp/architecture.rst:239
+#: ../../source/mp/architecture.rst:255
 msgid ""
 "New adapter types are registered via ``register_l2_adapter_type()`` in "
 "``config.py``."
 msgstr "新的适配器类型通过 ``register_l2_adapter_type()`` 在 ``config.py`` 中注册。"
 
-#: ../../source/mp/architecture.rst:243
+#: ../../source/mp/architecture.rst:259
 msgid "Controllers"
 msgstr "控制器"
 
-#: ../../source/mp/architecture.rst:245
+#: ../../source/mp/architecture.rst:261
 msgid ""
 "**StoreController** (``storage_controllers/store_controller.py``): Event-"
 "driven background thread that uses ``select.poll()`` on listener eventfd "
 "and adapter store eventfds.  When new objects appear in L1 (signaled via "
 "``StoreListener``), it submits async store tasks to each L2 adapter based"
 " on the ``StorePolicy``."
-msgstr "**StoreController** (``storage_controllers/store_controller.py``)：事件驱动的后台线程，使用 ``select.poll()`` 监听事件文件描述符和适配器存储事件文件描述符。当 L1 中出现新对象时（通过 ``StoreListener`` 发出信号），它根据 ``StorePolicy`` 向每个 L2 适配器提交异步存储任务。"
+msgstr ""
+"**StoreController** "
+"(``storage_controllers/store_controller.py``)：事件驱动的后台线程，使用 "
+"``select.poll()`` 监听事件文件描述符和适配器存储事件文件描述符。当 L1 中出现新对象时（通过 "
+"``StoreListener`` 发出信号），它根据 ``StorePolicy`` 向每个 L2 适配器提交异步存储任务。"
 
-#: ../../source/mp/architecture.rst:251
+#: ../../source/mp/architecture.rst:267
 msgid ""
 "**EvictionController** (``storage_controllers/eviction_controller.py``): "
 "Periodically checks L1 memory usage against the watermark threshold.  "
@@ -453,37 +481,44 @@ msgid ""
 "``IsolatedLRU``, or ``noop``) until usage drops below the target. "
 "``IsolatedLRU`` evicts per ``cache_salt`` against limits registered "
 "through the ``/quota`` HTTP endpoints; see :ref:`mp-http-quota-api`."
-msgstr "**逐出控制器** (``storage_controllers/eviction_controller.py``)：定期检查 L1 内存使用情况与水位线阈值的关系。当触发时，使用配置的策略（``LRU``、``IsolatedLRU`` 或 ``noop``）逐出对象，直到使用量降到目标以下。``IsolatedLRU`` 根据通过 ``/quota`` HTTP 端点注册的限制，针对 ``cache_salt`` 进行逐出；请参见 :ref:`mp-http-quota-api`。"
+msgstr ""
+"**逐出控制器** (``storage_controllers/eviction_controller.py``)：定期检查 L1 "
+"内存使用情况与水位线阈值的关系。当触发时，使用配置的策略（``LRU``、``IsolatedLRU`` 或 "
+"``noop``）逐出对象，直到使用量降到目标以下。``IsolatedLRU`` 根据通过 ``/quota`` HTTP 端点注册的限制，针对"
+" ``cache_salt`` 进行逐出；请参见 :ref:`mp-http-quota-api`。"
 
-#: ../../source/mp/architecture.rst:258
+#: ../../source/mp/architecture.rst:274
 msgid ""
 "**PrefetchController** (``storage_controllers/prefetch_controller.py``): "
 "Handles L2 lookup and load requests submitted by ``StorageManager`` "
 "during ``LOOKUP`` RPCs.  When keys are not in L1, it queries L2 adapters "
 "and loads found data back into L1."
-msgstr "**预取控制器** (``storage_controllers/prefetch_controller.py``): 处理 ``StorageManager`` 在 ``LOOKUP`` RPC 中提交的 L2 查找和加载请求。当键不在 L1 中时，它会查询 L2 适配器并将找到的数据加载回 L1。"
+msgstr ""
+"**预取控制器** (``storage_controllers/prefetch_controller.py``): 处理 "
+"``StorageManager`` 在 ``LOOKUP`` RPC 中提交的 L2 查找和加载请求。当键不在 L1 中时，它会查询 L2 "
+"适配器并将找到的数据加载回 L1。"
 
-#: ../../source/mp/architecture.rst:264
+#: ../../source/mp/architecture.rst:280
 msgid "Request Flows"
 msgstr "请求流程"
 
-#: ../../source/mp/architecture.rst:267
+#: ../../source/mp/architecture.rst:283
 msgid "LOOKUP Flow"
 msgstr "查找流程"
 
-#: ../../source/mp/architecture.rst:285
+#: ../../source/mp/architecture.rst:301
 msgid "STORE Flow"
 msgstr "存储流程"
 
-#: ../../source/mp/architecture.rst:304
+#: ../../source/mp/architecture.rst:320
 msgid "RETRIEVE Flow"
 msgstr "获取流程"
 
-#: ../../source/mp/architecture.rst:320
+#: ../../source/mp/architecture.rst:336
 msgid "Observability Internals"
 msgstr "可观察性内部实现"
 
-#: ../../source/mp/architecture.rst:322
+#: ../../source/mp/architecture.rst:338
 msgid ""
 "**EventBus** (``lmcache/v1/mp_observability/event_bus.py``) is a global "
 "singleton initialized at server startup by ``init_observability()``. "
@@ -491,9 +526,14 @@ msgid ""
 "objects to a bounded queue (``--event-bus-queue-size``, default 10000, "
 "tail-drop on overflow).  A background drain thread dispatches each event "
 "to all registered subscribers."
-msgstr "**EventBus** (``lmcache/v1/mp_observability/event_bus.py``) 是一个在服务器启动时由 ``init_observability()`` 初始化的全局单例。生产者（L1Manager、StorageManager、MPCacheEngine）将 ``Event`` 对象发布到一个有界队列中 (``--event-bus-queue-size``, 默认 10000，溢出时尾部丢弃)。一个后台排空线程将每个事件分发给所有注册的订阅者。"
+msgstr ""
+"**EventBus** (``lmcache/v1/mp_observability/event_bus.py``) 是一个在服务器启动时由 "
+"``init_observability()`` "
+"初始化的全局单例。生产者（L1Manager、StorageManager、MPCacheEngine）将 ``Event`` "
+"对象发布到一个有界队列中 (``--event-bus-queue-size``, 默认 "
+"10000，溢出时尾部丢弃)。一个后台排空线程将每个事件分发给所有注册的订阅者。"
 
-#: ../../source/mp/architecture.rst:329
+#: ../../source/mp/architecture.rst:345
 msgid ""
 "**Subscribers** live under ``lmcache/v1/mp_observability/subscribers/`` "
 "and are grouped by concern: ``metrics/`` (OTel counters and lifecycle "
@@ -501,69 +541,86 @@ msgid ""
 "and ``tracing/`` (OTel spans built from START/END event pairs). "
 "``init_observability()`` registers the set selected by CLI flags "
 "(``--disable-metrics``, ``--disable-logging``, ``--enable-tracing``)."
-msgstr "**订阅者** 位于 ``lmcache/v1/mp_observability/subscribers/`` 目录下，按关注点分组：``metrics/``（OTel 计数器和生命周期直方图）、``logging/``（Python 日志处理程序、查找哈希 JSONL）和 ``tracing/``（由 START/END 事件对构建的 OTel 跨度）。``init_observability()`` 根据 CLI 标志（``--disable-metrics``、``--disable-logging``、``--enable-tracing``）注册所选的集合。"
+msgstr ""
+"**订阅者** 位于 ``lmcache/v1/mp_observability/subscribers/`` "
+"目录下，按关注点分组：``metrics/``（OTel 计数器和生命周期直方图）、``logging/``（Python 日志处理程序、查找哈希"
+" JSONL）和 ``tracing/``（由 START/END 事件对构建的 OTel "
+"跨度）。``init_observability()`` 根据 CLI 标志（``--disable-metrics``、``--disable-"
+"logging``、``--enable-tracing``）注册所选的集合。"
 
-#: ../../source/mp/architecture.rst:336
+#: ../../source/mp/architecture.rst:352
 msgid ""
 "**OTel providers** are set up via ``otel_init.py`` before subscribers are"
 " constructed, so module-level ``get_meter()`` / ``get_tracer()`` calls "
 "bind to the real provider. Metrics are exported both to an in-process "
 "Prometheus ``/metrics`` endpoint (``--prometheus-port``, default 9090) "
 "and, when ``--otlp-endpoint`` is set, pushed to an OTel collector."
-msgstr "**OTel 提供者**在构造订阅者之前通过 ``otel_init.py`` 进行设置，因此模块级的 ``get_meter()`` / ``get_tracer()`` 调用绑定到真实的提供者。指标同时导出到进程内的 Prometheus ``/metrics`` 端点（``--prometheus-port``, 默认 9090），并且在设置了 ``--otlp-endpoint`` 时，推送到 OTel 收集器。"
+msgstr ""
+"**OTel 提供者**在构造订阅者之前通过 ``otel_init.py`` 进行设置，因此模块级的 ``get_meter()`` / "
+"``get_tracer()`` 调用绑定到真实的提供者。指标同时导出到进程内的 Prometheus ``/metrics`` 端点"
+"（``--prometheus-port``, 默认 9090），并且在设置了 ``--otlp-endpoint`` 时，推送到 OTel "
+"收集器。"
 
-#: ../../source/mp/architecture.rst:344
+#: ../../source/mp/architecture.rst:360
 msgid "How to Extend"
 msgstr "如何扩展"
 
-#: ../../source/mp/architecture.rst:347
+#: ../../source/mp/architecture.rst:363
 msgid "Adding a new L2 adapter"
 msgstr "添加新的 L2 适配器"
 
-#: ../../source/mp/architecture.rst:349
+#: ../../source/mp/architecture.rst:365
 msgid ""
 "Create a new ``*_l2_adapter.py`` module under "
 "``lmcache/v1/distributed/l2_adapters/`` — ``__init__.py`` auto-discovers "
 "modules matching that suffix via ``pkgutil`` and imports them lazily on "
 "first use, so no other files need to be modified."
-msgstr "在 ``lmcache/v1/distributed/l2_adapters/`` 下创建一个新的 ``*_l2_adapter.py`` 模块 — ``__init__.py`` 通过 ``pkgutil`` 自动发现匹配该后缀的模块，并在首次使用时懒加载导入，因此无需修改其他文件。"
+msgstr ""
+"在 ``lmcache/v1/distributed/l2_adapters/`` 下创建一个新的 ``*_l2_adapter.py`` 模块 "
+"— ``__init__.py`` 通过 ``pkgutil`` 自动发现匹配该后缀的模块，并在首次使用时懒加载导入，因此无需修改其他文件。"
 
-#: ../../source/mp/architecture.rst:354
+#: ../../source/mp/architecture.rst:370
 msgid ""
 "Create a config class subclassing ``L2AdapterConfigBase`` with "
 "``from_dict()`` and ``help()`` methods."
 msgstr "创建一个配置类，继承自 ``L2AdapterConfigBase``，并实现 ``from_dict()`` 和 ``help()`` 方法。"
 
-#: ../../source/mp/architecture.rst:356
+#: ../../source/mp/architecture.rst:372
 msgid ""
 "Create an adapter class implementing ``L2AdapterInterface``, and a small "
 "factory function ``(config, l1_memory_desc) -> L2AdapterInterface``."
-msgstr "创建一个实现 ``L2AdapterInterface`` 的适配器类，以及一个小型工厂函数 ``(config, l1_memory_desc) -> L2AdapterInterface``。"
+msgstr ""
+"创建一个实现 ``L2AdapterInterface`` 的适配器类，以及一个小型工厂函数 ``(config, l1_memory_desc)"
+" -> L2AdapterInterface``。"
 
-#: ../../source/mp/architecture.rst:359
+#: ../../source/mp/architecture.rst:375
 msgid "At module level, self-register both the config and the factory:"
 msgstr "在模块级别，自我注册配置和工厂："
 
-#: ../../source/mp/architecture.rst:366
+#: ../../source/mp/architecture.rst:382
 msgid ""
 "See ``mock_l2_adapter.py`` or ``s3_l2_adapter.py`` for reference "
 "implementations."
 msgstr "请参阅 ``mock_l2_adapter.py`` 或 ``s3_l2_adapter.py`` 以获取参考实现。"
 
-#: ../../source/mp/architecture.rst:370
+#: ../../source/mp/architecture.rst:386
 msgid "Adding an observability subscriber"
 msgstr "添加可观察性订阅者"
 
-#: ../../source/mp/architecture.rst:372
+#: ../../source/mp/architecture.rst:388
 #, python-brace-format
 msgid ""
 "Create a subscriber class subclassing ``EventSubscriber`` (defined in "
 "``lmcache/v1/mp_observability/event_bus.py``): implement "
 "``get_subscriptions()`` to return an ``{EventType: callback}`` mapping; "
 "optionally override ``shutdown()`` for cleanup."
-msgstr "创建一个继承自 ``EventSubscriber`` 的订阅者类（定义在 ``lmcache/v1/mp_observability/event_bus.py`` 中）：实现 ``get_subscriptions()`` 返回一个 ``{EventType: callback}`` 映射；可选地重写 ``shutdown()`` 进行清理。"
+msgstr ""
+"创建一个继承自 ``EventSubscriber`` 的订阅者类（定义在 "
+"``lmcache/v1/mp_observability/event_bus.py`` 中）：实现 "
+"``get_subscriptions()`` 返回一个 ``{EventType: callback}`` 映射；可选地重写 "
+"``shutdown()`` 进行清理。"
 
-#: ../../source/mp/architecture.rst:376
+#: ../../source/mp/architecture.rst:392
 msgid ""
 "Place the class under the appropriate concern group "
 "(``subscribers/metrics/``, ``subscribers/logging/``, or "
@@ -571,222 +628,285 @@ msgid ""
 "``__init__.py``."
 msgstr "将类放置在适当的关注组（``subscribers/metrics/``、``subscribers/logging/``或``subscribers/tracing/``）下，并从该包的``__init__.py``中导出。"
 
-#: ../../source/mp/architecture.rst:380
+#: ../../source/mp/architecture.rst:396
 msgid ""
 "Register the subscriber in ``init_observability()`` "
 "(``lmcache/v1/mp_observability/config.py``) via "
 "``bus.register_subscriber(...)`` inside the branch matching its concern "
 "(metrics / logging / tracing), gated on the corresponding CLI flag if "
 "needed."
-msgstr "在 ``init_observability()`` 中注册订阅者 (``lmcache/v1/mp_observability/config.py``)，通过 ``bus.register_subscriber(...)`` 在与其关注点 (metrics / logging / tracing) 匹配的分支中进行注册，如有需要，受相应 CLI 标志的限制。"
+msgstr ""
+"在 ``init_observability()`` 中注册订阅者 "
+"(``lmcache/v1/mp_observability/config.py``)，通过 "
+"``bus.register_subscriber(...)`` 在与其关注点 (metrics / logging / tracing) "
+"匹配的分支中进行注册，如有需要，受相应 CLI 标志的限制。"
 
-#: ../../source/mp/architecture.rst:387
+#: ../../source/mp/architecture.rst:403
 msgid "Adding a new request type"
 msgstr "添加新的请求类型"
 
-#: ../../source/mp/architecture.rst:389
+#: ../../source/mp/architecture.rst:405
 msgid "Add a new member to ``RequestType`` in ``protocols/base.py``."
 msgstr "在 ``protocols/base.py`` 中向 ``RequestType`` 添加一个新成员。"
 
-#: ../../source/mp/architecture.rst:390
+#: ../../source/mp/architecture.rst:406
 msgid ""
 "Create a ``ProtocolDefinition`` in the appropriate ``protocols/*.py`` "
 "file (``engine``, ``controller``, ``observability``, ``debug``, "
 "``blend``, or ``blend_v2``) and add the request name to that module's "
 "``REQUEST_NAMES``."
-msgstr "在适当的 ``protocols/*.py`` 文件中创建一个 ``ProtocolDefinition``（``engine``、``controller``、``observability``、``debug``、``blend`` 或 ``blend_v2``），并将请求名称添加到该模块的 ``REQUEST_NAMES`` 中。"
+msgstr ""
+"在适当的 ``protocols/*.py`` 文件中创建一个 "
+"``ProtocolDefinition``（``engine``、``controller``、``observability``、``debug``、``blend``"
+" 或 ``blend_v2``），并将请求名称添加到该模块的 ``REQUEST_NAMES`` 中。"
 
-#: ../../source/mp/architecture.rst:393
+#: ../../source/mp/architecture.rst:409
 msgid "Implement the handler method on ``MPCacheEngine`` (or ``BlendEngineV2``)."
 msgstr "在 ``MPCacheEngine`` （或 ``BlendEngineV2``）上实现处理程序方法。"
 
-#: ../../source/mp/architecture.rst:394
+#: ../../source/mp/architecture.rst:410
 msgid ""
 "Register the handler in ``run_cache_server()`` via "
 "``add_handler_helper()``."
 msgstr "在 ``run_cache_server()`` 中通过 ``add_handler_helper()`` 注册处理程序。"
 
-#: ../../source/mp/architecture.rst:397
+#: ../../source/mp/architecture.rst:413
 msgid "Key Source Files"
 msgstr "关键源文件"
 
-#: ../../source/mp/architecture.rst:403
+#: ../../source/mp/architecture.rst:419
 msgid "File"
 msgstr "文件"
 
-#: ../../source/mp/architecture.rst:404
+#: ../../source/mp/architecture.rst:420
 msgid "Purpose"
 msgstr "目的"
 
-#: ../../source/mp/architecture.rst:405
+#: ../../source/mp/architecture.rst:421
 msgid "``lmcache/v1/multiprocess/server.py``"
 msgstr "``lmcache/v1/multiprocess/server.py``"
 
-#: ../../source/mp/architecture.rst:406
+#: ../../source/mp/architecture.rst:422
 msgid "MPCacheEngine + ZMQ server entry point"
 msgstr "MPCacheEngine + ZMQ 服务器入口点"
 
-#: ../../source/mp/architecture.rst:407
+#: ../../source/mp/architecture.rst:423
 msgid "``lmcache/v1/multiprocess/config.py``"
 msgstr "``lmcache/v1/multiprocess/config.py``"
 
-#: ../../source/mp/architecture.rst:408
+#: ../../source/mp/architecture.rst:424
 msgid "MPServerConfig, HTTPFrontendConfig"
 msgstr "MPServerConfig, HTTPFrontendConfig"
 
-#: ../../source/mp/architecture.rst:409
-msgid "``lmcache/v1/multiprocess/blend_server_v2.py``"
-msgstr "``lmcache/v1/multiprocess/blend_server_v2.py``"
+#: ../../source/mp/architecture.rst:425
+msgid "``lmcache/v1/multiprocess/engine_context.py``"
+msgstr "``lmcache/v1/multiprocess/engine_context.py``"
 
-#: ../../source/mp/architecture.rst:410
-msgid "BlendEngineV2 (extends MPCacheEngine)"
-msgstr "BlendEngineV2 (extends MPCacheEngine)"
+#: ../../source/mp/architecture.rst:426
+msgid "MPCacheEngineContext (shared state passed to every EngineModule)"
+msgstr "MPCacheEngineContext（传递给每个 EngineModule 的共享状态）"
+
+#: ../../source/mp/architecture.rst:427
+msgid "``lmcache/v1/multiprocess/engine_module.py``"
+msgstr "``lmcache/v1/multiprocess/engine_module.py``"
+
+#: ../../source/mp/architecture.rst:428
+msgid ""
+"``EngineModule`` protocol, ``HandlerSpec``, ``ThreadPoolType`` (per-"
+"module handler registration)"
+msgstr "``EngineModule`` 协议, ``HandlerSpec``, ``ThreadPoolType`` (每模块处理程序注册)"
+
+#: ../../source/mp/architecture.rst:430
+msgid "``lmcache/v1/multiprocess/modules/``"
+msgstr "``lmcache/v1/multiprocess/modules/``"
+
+#: ../../source/mp/architecture.rst:431
+msgid ""
+"Engine module implementations: ``lookup.py`` (``LookupModule``), "
+"``management.py`` (``ManagementModule``), ``gpu_transfer.py`` "
+"(``GPUTransferModule``), ``non_gpu_transfer.py`` "
+"(``NonGPUTransferModule``), and ``blend.py`` (``BlendModule`` / "
+"``BlendEngineV2``)."
+msgstr "引擎模块实现：``lookup.py`` (``LookupModule``)、``management.py`` (``ManagementModule``)、``gpu_transfer.py`` (``GPUTransferModule``)、``non_gpu_transfer.py`` (``NonGPUTransferModule``) 和 ``blend.py`` (``BlendModule`` / ``BlendEngineV2``)。"
 
-#: ../../source/mp/architecture.rst:411
+#: ../../source/mp/architecture.rst:436
 msgid "``lmcache/v1/multiprocess/http_server.py``"
 msgstr "``lmcache/v1/multiprocess/http_server.py``"
 
-#: ../../source/mp/architecture.rst:412
+#: ../../source/mp/architecture.rst:437
 msgid "FastAPI wrapper with health check and many other useful APIs"
 msgstr "带健康检查和许多其他有用 API 的 FastAPI 包装器"
 
-#: ../../source/mp/architecture.rst:413
+#: ../../source/mp/architecture.rst:438
 msgid "``lmcache/v1/multiprocess/http_api_registry.py``"
 msgstr "``lmcache/v1/multiprocess/http_api_registry.py``"
 
-#: ../../source/mp/architecture.rst:414
+#: ../../source/mp/architecture.rst:439
 msgid "``HTTPAPIRegistry`` that auto-discovers routers in ``http_apis/``"
 msgstr "``HTTPAPIRegistry`` 自动发现 ``http_apis/`` 中的路由器"
 
-#: ../../source/mp/architecture.rst:415
+#: ../../source/mp/architecture.rst:440
 msgid "``lmcache/v1/multiprocess/http_apis/``"
 msgstr "``lmcache/v1/multiprocess/http_apis/``"
 
-#: ../../source/mp/architecture.rst:416
+#: ../../source/mp/architecture.rst:441
 msgid ""
 "Extensible HTTP endpoints (``/``, ``/healthcheck``, ``/clear-cache``, "
 "``/status``)"
 msgstr "可扩展的 HTTP 端点 (``/``, ``/healthcheck``, ``/clear-cache``, ``/status``)"
 
-#: ../../source/mp/architecture.rst:418
+#: ../../source/mp/architecture.rst:443
 msgid "``lmcache/v1/multiprocess/mp_runtime_plugin_launcher.py``"
 msgstr "``lmcache/v1/multiprocess/mp_runtime_plugin_launcher.py``"
 
-#: ../../source/mp/architecture.rst:419
+#: ../../source/mp/architecture.rst:444
 msgid ""
 "``MPRuntimePluginLauncher`` that spawns runtime plugins with the full "
 "server config serialized into environment variables"
 msgstr "``MPRuntimePluginLauncher`` 通过将完整的服务器配置序列化为环境变量来生成运行时插件"
 
-#: ../../source/mp/architecture.rst:421
+#: ../../source/mp/architecture.rst:446
 msgid "``lmcache/v1/multiprocess/protocols/base.py``"
 msgstr "``lmcache/v1/multiprocess/protocols/base.py``"
 
-#: ../../source/mp/architecture.rst:422
+#: ../../source/mp/architecture.rst:447
 msgid "RequestType, HandlerType, ProtocolDefinition"
 msgstr "请求类型、处理程序类型、协议定义"
 
-#: ../../source/mp/architecture.rst:424
+#: ../../source/mp/architecture.rst:449
 msgid "StorageManager (top-level manager)"
 msgstr "存储管理器（顶层管理器）"
 
-#: ../../source/mp/architecture.rst:425
+#: ../../source/mp/architecture.rst:450
 msgid "``lmcache/v1/distributed/config.py``"
 msgstr "``lmcache/v1/distributed/config.py``"
 
-#: ../../source/mp/architecture.rst:426
+#: ../../source/mp/architecture.rst:451
 msgid "StorageManagerConfig hierarchy"
 msgstr "StorageManagerConfig 层次结构"
 
-#: ../../source/mp/architecture.rst:428
+#: ../../source/mp/architecture.rst:453
 msgid "L1Manager (object state machine)"
 msgstr "L1Manager（对象状态机）"
 
-#: ../../source/mp/architecture.rst:429
+#: ../../source/mp/architecture.rst:454
 msgid "``lmcache/v1/distributed/l2_adapters/config.py``"
 msgstr "``lmcache/v1/distributed/l2_adapters/config.py``"
 
-#: ../../source/mp/architecture.rst:430
+#: ../../source/mp/architecture.rst:455
 msgid "L2 adapter config registry"
 msgstr "L2 适配器配置注册表"
 
-#: ../../source/mp/architecture.rst:431
+#: ../../source/mp/architecture.rst:456
 msgid "``lmcache/v1/distributed/l2_adapters/base.py``"
 msgstr "``lmcache/v1/distributed/l2_adapters/base.py``"
 
-#: ../../source/mp/architecture.rst:432
+#: ../../source/mp/architecture.rst:457
 msgid "L2AdapterInterface"
 msgstr "L2AdapterInterface"
 
-#: ../../source/mp/architecture.rst:433
+#: ../../source/mp/architecture.rst:458
 msgid "``lmcache/v1/distributed/storage_controllers/store_controller.py``"
 msgstr "``lmcache/v1/distributed/storage_controllers/store_controller.py``"
 
-#: ../../source/mp/architecture.rst:434
+#: ../../source/mp/architecture.rst:459
 msgid "StoreController (event-driven L1->L2)"
 msgstr "StoreController（事件驱动 L1->L2）"
 
-#: ../../source/mp/architecture.rst:435
+#: ../../source/mp/architecture.rst:460
 msgid "``lmcache/v1/distributed/storage_controllers/eviction_controller.py``"
 msgstr "``lmcache/v1/distributed/storage_controllers/eviction_controller.py``"
 
-#: ../../source/mp/architecture.rst:436
+#: ../../source/mp/architecture.rst:461
 msgid "EvictionController (watermark-triggered)"
 msgstr "逐出控制器（基于水印触发）"
 
-#: ../../source/mp/architecture.rst:437
+#: ../../source/mp/architecture.rst:462
 msgid "``lmcache/v1/distributed/storage_controllers/prefetch_controller.py``"
 msgstr "``lmcache/v1/distributed/storage_controllers/prefetch_controller.py``"
 
-#: ../../source/mp/architecture.rst:438
+#: ../../source/mp/architecture.rst:463
 msgid "PrefetchController (L2->L1 on miss)"
 msgstr "预取控制器 (未命中时从 L2->L1)"
 
-#: ../../source/mp/architecture.rst:439
+#: ../../source/mp/architecture.rst:464
 msgid "``lmcache/v1/mp_observability/config.py``"
 msgstr "``lmcache/v1/mp_observability/config.py``"
 
-#: ../../source/mp/architecture.rst:440
+#: ../../source/mp/architecture.rst:465
 msgid "ObservabilityConfig + ``init_observability()`` entry point"
 msgstr "可观察性配置 + ``init_observability()`` 入口点"
 
-#: ../../source/mp/architecture.rst:441
+#: ../../source/mp/architecture.rst:466
 msgid "``lmcache/v1/mp_observability/event_bus.py``"
 msgstr "``lmcache/v1/mp_observability/event_bus.py``"
 
-#: ../../source/mp/architecture.rst:442
+#: ../../source/mp/architecture.rst:467
 msgid "EventBus singleton and ``EventSubscriber`` base class"
 msgstr "事件总线单例和 ``EventSubscriber`` 基类"
 
-#: ../../source/mp/architecture.rst:443
+#: ../../source/mp/architecture.rst:468
 msgid "``lmcache/v1/mp_observability/event.py``"
 msgstr "``lmcache/v1/mp_observability/event.py``"
 
-#: ../../source/mp/architecture.rst:444
+#: ../../source/mp/architecture.rst:469
 msgid "``Event`` / ``EventType`` definitions"
 msgstr "``Event`` / ``EventType`` 定义"
 
-#: ../../source/mp/architecture.rst:445
+#: ../../source/mp/architecture.rst:470
 msgid "``lmcache/v1/mp_observability/otel_init.py``"
 msgstr "``lmcache/v1/mp_observability/otel_init.py``"
 
-#: ../../source/mp/architecture.rst:446
+#: ../../source/mp/architecture.rst:471
 msgid "OTel metrics / tracing provider setup"
 msgstr "OTel 指标 / 跟踪提供程序设置"
 
-#: ../../source/mp/architecture.rst:447
+#: ../../source/mp/architecture.rst:472
 msgid "``lmcache/v1/mp_observability/subscribers/``"
 msgstr "``lmcache/v1/mp_observability/subscribers/``"
 
-#: ../../source/mp/architecture.rst:448
+#: ../../source/mp/architecture.rst:473
 msgid "Metrics, logging, and tracing subscribers"
 msgstr "指标、日志和追踪订阅者"
 
-#: ../../source/mp/architecture.rst:449
+#: ../../source/mp/architecture.rst:474
 msgid "``lmcache/v1/mp_observability/trace/``"
 msgstr "``lmcache/v1/mp_observability/trace/``"
 
-#: ../../source/mp/architecture.rst:450
+#: ../../source/mp/architecture.rst:475
 msgid "Trace recording (``--trace-level storage``) capture stack"
 msgstr "跟踪记录 (``--trace-level storage``) 捕获堆栈"
 
+#~ msgid ""
+#~ "All three server entry points share "
+#~ "the same ``MPCacheEngine`` and "
+#~ "``StorageManager`` core."
+#~ msgstr "所有三个服务器入口点共享相同的 ``MPCacheEngine`` 和 ``StorageManager`` 核心。"
+
+#~ msgid ""
+#~ "**``server.py``** -- The default ZMQ-"
+#~ "only server.  Creates an ``MPCacheEngine`` "
+#~ "and a ``MessageQueueServer``, registers "
+#~ "handlers for all core ``RequestType`` "
+#~ "values, and blocks in a keep-alive"
+#~ " loop."
+#~ msgstr ""
+#~ "**``server.py``** -- 默认的仅 ZMQ 服务器。创建一个 "
+#~ "``MPCacheEngine`` 和一个 ``MessageQueueServer``，为所有核心 "
+#~ "``RequestType`` 值注册处理程序，并在保持活动循环中阻塞。"
+
+#~ msgid ""
+#~ "Both ``blend_server_v2.py`` and ``http_server.py``"
+#~ " reuse this pattern, adding "
+#~ "``add_http_frontend_args()`` for the HTTP "
+#~ "variant."
+#~ msgstr ""
+#~ "``blend_server_v2.py`` 和 ``http_server.py`` "
+#~ "都重用了这个模式，为 HTTP 变体添加了 ``add_http_frontend_args()``。"
+
+#~ msgid "``lmcache/v1/multiprocess/blend_server_v2.py``"
+#~ msgstr "``lmcache/v1/multiprocess/blend_server_v2.py``"
+
+#~ msgid "BlendEngineV2 (extends MPCacheEngine)"
+#~ msgstr "BlendEngineV2 (extends MPCacheEngine)"
+
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/mp/configuration.po b/docs/source/locale/zh_CN/LC_MESSAGES/mp/configuration.po
index 690b9ed125..0e9b8aaff9 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/mp/configuration.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/mp/configuration.po
@@ -8,7 +8,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: LMCache \n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2026-06-01 10:55+0000\n"
+"POT-Creation-Date: 2026-06-08 10:45+0000\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language: zh_CN\n"
@@ -34,30 +34,30 @@ msgstr "本页面记录了 LMCache 多进程服务器接受的每个 CLI 参数
 msgid "MP Server"
 msgstr "MP 服务器"
 
-#: ../../source/mp/configuration.rst:14 ../../source/mp/configuration.rst:111
+#: ../../source/mp/configuration.rst:14 ../../source/mp/configuration.rst:127
 msgid "Source: ``lmcache/v1/multiprocess/config.py``"
 msgstr "源: ``lmcache/v1/multiprocess/config.py``"
 
-#: ../../source/mp/configuration.rst:20 ../../source/mp/configuration.rst:89
-#: ../../source/mp/configuration.rst:119 ../../source/mp/configuration.rst:138
-#: ../../source/mp/configuration.rst:169 ../../source/mp/configuration.rst:188
-#: ../../source/mp/configuration.rst:220 ../../source/mp/configuration.rst:379
+#: ../../source/mp/configuration.rst:20 ../../source/mp/configuration.rst:105
+#: ../../source/mp/configuration.rst:135 ../../source/mp/configuration.rst:154
+#: ../../source/mp/configuration.rst:185 ../../source/mp/configuration.rst:204
+#: ../../source/mp/configuration.rst:236 ../../source/mp/configuration.rst:402
 msgid "Argument"
 msgstr "参数"
 
-#: ../../source/mp/configuration.rst:21 ../../source/mp/configuration.rst:90
-#: ../../source/mp/configuration.rst:120 ../../source/mp/configuration.rst:139
-#: ../../source/mp/configuration.rst:170 ../../source/mp/configuration.rst:189
-#: ../../source/mp/configuration.rst:221 ../../source/mp/configuration.rst:380
-#: ../../source/mp/configuration.rst:430
+#: ../../source/mp/configuration.rst:21 ../../source/mp/configuration.rst:106
+#: ../../source/mp/configuration.rst:136 ../../source/mp/configuration.rst:155
+#: ../../source/mp/configuration.rst:186 ../../source/mp/configuration.rst:205
+#: ../../source/mp/configuration.rst:237 ../../source/mp/configuration.rst:403
+#: ../../source/mp/configuration.rst:459
 msgid "Default"
 msgstr "默认"
 
-#: ../../source/mp/configuration.rst:22 ../../source/mp/configuration.rst:91
-#: ../../source/mp/configuration.rst:121 ../../source/mp/configuration.rst:140
-#: ../../source/mp/configuration.rst:171 ../../source/mp/configuration.rst:190
-#: ../../source/mp/configuration.rst:222 ../../source/mp/configuration.rst:381
-#: ../../source/mp/configuration.rst:431 ../../source/mp/configuration.rst:457
+#: ../../source/mp/configuration.rst:22 ../../source/mp/configuration.rst:107
+#: ../../source/mp/configuration.rst:137 ../../source/mp/configuration.rst:156
+#: ../../source/mp/configuration.rst:187 ../../source/mp/configuration.rst:206
+#: ../../source/mp/configuration.rst:238 ../../source/mp/configuration.rst:404
+#: ../../source/mp/configuration.rst:460 ../../source/mp/configuration.rst:492
 msgid "Description"
 msgstr "描述"
 
@@ -77,7 +77,7 @@ msgstr "绑定 ZMQ 服务器的主机地址。"
 msgid "``--port``"
 msgstr "``--port``"
 
-#: ../../source/mp/configuration.rst:27 ../../source/mp/configuration.rst:436
+#: ../../source/mp/configuration.rst:27 ../../source/mp/configuration.rst:465
 msgid "``5555``"
 msgstr "``5555``"
 
@@ -155,29 +155,45 @@ msgstr "基于令牌的操作的哈希算法。可选项：``builtin``、``sha25
 msgid "``--engine-type``"
 msgstr "``--engine-type``"
 
-#: ../../source/mp/configuration.rst:50 ../../source/mp/configuration.rst:224
-#: ../../source/mp/configuration.rst:232
+#: ../../source/mp/configuration.rst:50 ../../source/mp/configuration.rst:240
+#: ../../source/mp/configuration.rst:248
 msgid "``default``"
 msgstr "``default``"
 
 #: ../../source/mp/configuration.rst:51
 msgid ""
-"Cache engine backend type. ``default`` uses MPCacheEngine; ``blend`` uses"
-" BlendEngineV2 for cross-request KV reuse. Choices: ``default``, "
-"``blend``."
-msgstr ""
-"缓存引擎后端类型。``default`` 使用 MPCacheEngine；``blend`` 使用 BlendEngineV2 进行跨请求的 "
-"KV 重用。可选项：``default``，``blend``。"
+"Cache engine backend type. ``default`` uses standard prefix caching; "
+"``blend`` enables CacheBlend non-prefix KV reuse (composes a "
+"``BlendModule`` into the engine, which requires ``--supported-transfer-"
+"mode`` to be ``gpu`` or ``auto``). Choices: ``default``, ``blend``."
+msgstr "缓存引擎后端类型。``default`` 使用标准前缀缓存；``blend`` 启用 CacheBlend 非前缀 KV 重用（将 ``BlendModule`` 组合到引擎中，这需要 ``--supported-transfer-mode`` 为 ``gpu`` 或 ``auto``）。可选项：``default``，``blend``。"
+
+#: ../../source/mp/configuration.rst:56
+msgid "``--supported-transfer-mode``"
+msgstr "``--supported-transfer-mode``"
+
+#: ../../source/mp/configuration.rst:57 ../../source/mp/configuration.rst:478
+msgid "``auto``"
+msgstr "``auto``"
+
+#: ../../source/mp/configuration.rst:58
+msgid ""
+"Which worker → server transfer paths the server loads. ``gpu`` enables "
+"only GPU-based IPC transfer (STORE/RETRIEVE); ``non_gpu`` enables only "
+"the non-GPU (PREPARE/COMMIT) transfer path; ``auto`` (default) loads both"
+" so workers of either device type can connect without manual "
+"configuration. Choices: ``gpu``, ``non_gpu``, ``auto``."
+msgstr "服务器加载哪些工作进程 → 服务器传输路径。``gpu`` 仅启用基于 GPU 的 IPC 传输（STORE/RETRIEVE）；``non_gpu`` 仅启用非 GPU（PREPARE/COMMIT）传输路径；``auto``（默认）加载两者，以便任一设备类型的工作进程可以在无需手动配置的情况下连接。选择：``gpu``、``non_gpu``、``auto``。"
 
-#: ../../source/mp/configuration.rst:55
+#: ../../source/mp/configuration.rst:64
 msgid "``--runtime-plugin-locations``"
 msgstr "``--runtime-plugin-locations``"
 
-#: ../../source/mp/configuration.rst:56
+#: ../../source/mp/configuration.rst:65 ../../source/mp/configuration.rst:76
 msgid "``[]``"
 msgstr "``[]``"
 
-#: ../../source/mp/configuration.rst:57
+#: ../../source/mp/configuration.rst:66
 msgid ""
 "Zero or more paths to runtime plugin scripts or directories to launch "
 "alongside the server. Plugins are spawned by ``MPRuntimePluginLauncher`` "
@@ -187,16 +203,16 @@ msgstr ""
 "零个或多个路径，用于运行时插件脚本或目录，以便与服务器一起启动。插件由 ``MPRuntimePluginLauncher`` 生成，并通过 "
 "``LMCACHE_RUNTIME_PLUGIN_CONFIG`` 环境变量接收完整的服务器配置。"
 
-#: ../../source/mp/configuration.rst:61
+#: ../../source/mp/configuration.rst:70
 msgid "``--runtime-plugin-config``"
 msgstr "``--runtime-plugin-config``"
 
-#: ../../source/mp/configuration.rst:62
+#: ../../source/mp/configuration.rst:71
 #, python-brace-format
 msgid "``\"{}\"``"
 msgstr "``\"{}\"``"
 
-#: ../../source/mp/configuration.rst:63
+#: ../../source/mp/configuration.rst:72
 #, python-brace-format
 msgid ""
 "JSON string of extra key-value config forwarded to runtime plugins via "
@@ -208,31 +224,43 @@ msgstr ""
 "字符串。示例：``'{\\\"plugin.frontend.heartbeat_url\\\": "
 "\\\"http://localhost:5000/heartbeat\\\"}'``。"
 
-#: ../../source/mp/configuration.rst:66
+#: ../../source/mp/configuration.rst:75
+msgid "``--script-allowed-imports``"
+msgstr "``--script-allowed-imports``"
+
+#: ../../source/mp/configuration.rst:77
+msgid ""
+"Space-separated list of Python module names that scripts posted to the "
+"HTTP ``/run_script`` endpoint are allowed to import. Example: ``--script-"
+"allowed-imports numpy pandas``."
+msgstr "允许在 HTTP ``/run_script`` 端点发布的脚本导入的以空格分隔的 Python 模块名称列表。示例：``--script-allowed-imports numpy pandas``。"
+
+#: ../../source/mp/configuration.rst:80
 msgid "``--shm-name``"
 msgstr "``--shm-name``"
 
-#: ../../source/mp/configuration.rst:67
+#: ../../source/mp/configuration.rst:81
 msgid "*(not set)*"
 msgstr "*(未设置)*"
 
-#: ../../source/mp/configuration.rst:68
+#: ../../source/mp/configuration.rst:82
 msgid ""
-"SHM segment name for non-GPU KV transfer. Not set (default): auto-"
-"allocate a shared-memory pool. ``\"\"`` (empty string): disable SHM and "
-"force the pickle transfer path.  Any other value: use that exact name for"
-" the SHM pool segment."
-msgstr "非 GPU KV 传输的 SHM 段名称。未设置（默认）：自动分配共享内存池。``\"\"``（空字符串）：禁用 SHM 并强制使用 pickle 传输路径。任何其他值：使用该确切名称作为 SHM 池段的名称。"
+"SHM segment name for non-GPU KV transfer (only used when the non-GPU path"
+" is loaded, i.e. ``--supported-transfer-mode`` is ``auto`` or "
+"``non_gpu``). Not set (default): auto-allocate a shared-memory pool. "
+"``\"\"`` (empty string): disable SHM and force the pickle transfer path."
+"  Any other value: use that exact name for the SHM pool segment."
+msgstr "非 GPU KV 传输的 SHM 段名称（仅在加载非 GPU 路径时使用，即 ``--supported-transfer-mode`` 为 ``auto`` 或 ``non_gpu``）。未设置（默认）：自动分配共享内存池。``\\\"\\\"``（空字符串）：禁用 SHM 并强制使用 pickle 传输路径。任何其他值：将该确切名称用于 SHM 池段。"
 
-#: ../../source/mp/configuration.rst:75
+#: ../../source/mp/configuration.rst:91
 msgid "Lookup Hash Logging"
 msgstr "查找哈希日志记录"
 
-#: ../../source/mp/configuration.rst:77
+#: ../../source/mp/configuration.rst:93
 msgid "Source: ``lmcache/v1/mp_observability/subscribers/logging/lookup_hash.py``"
 msgstr "源: ``lmcache/v1/mp_observability/subscribers/logging/lookup_hash.py``"
 
-#: ../../source/mp/configuration.rst:79
+#: ../../source/mp/configuration.rst:95
 msgid ""
 "When enabled, the server publishes chunk hashes computed during "
 "``lookup()`` as ``MP_LOOKUP`` events on the EventBus.  The "
@@ -244,122 +272,122 @@ msgstr ""
 "``LookupHashLoggingSubscriber`` 将这些写入旋转的 JSONL 文件以供离线分析。 默认情况下禁用。 "
 "这些参数是可观察性组的一部分。"
 
-#: ../../source/mp/configuration.rst:92
+#: ../../source/mp/configuration.rst:108
 msgid "``--lookup-hash-log-dir``"
 msgstr "``--lookup-hash-log-dir``"
 
-#: ../../source/mp/configuration.rst:93
+#: ../../source/mp/configuration.rst:109
 msgid "``\"\"`` (disabled)"
 msgstr "``\"\"`` (禁用)"
 
-#: ../../source/mp/configuration.rst:94
+#: ../../source/mp/configuration.rst:110
 msgid ""
 "Directory to write lookup hash JSONL files. An empty string disables "
 "logging."
 msgstr "写入查找哈希 JSONL 文件的目录。空字符串将禁用日志记录。"
 
-#: ../../source/mp/configuration.rst:96
+#: ../../source/mp/configuration.rst:112
 msgid "``--lookup-hash-log-rotation-interval``"
 msgstr "``--lookup-hash-log-rotation-interval``"
 
-#: ../../source/mp/configuration.rst:97
+#: ../../source/mp/configuration.rst:113
 msgid "``21600`` (6 h)"
 msgstr "``21600`` (6 小时)"
 
-#: ../../source/mp/configuration.rst:98
+#: ../../source/mp/configuration.rst:114
 msgid "Time interval in seconds before rotating to a new log file."
 msgstr "在切换到新日志文件之前的时间间隔（以秒为单位）。"
 
-#: ../../source/mp/configuration.rst:99
+#: ../../source/mp/configuration.rst:115
 msgid "``--lookup-hash-log-rotation-max-size``"
 msgstr "``--lookup-hash-log-rotation-max-size``"
 
-#: ../../source/mp/configuration.rst:100
+#: ../../source/mp/configuration.rst:116
 msgid "``104857600`` (100 MB)"
 msgstr "``104857600`` (100 MB)"
 
-#: ../../source/mp/configuration.rst:101
+#: ../../source/mp/configuration.rst:117
 msgid ""
 "Max file size in bytes before rotating even if the time interval has not "
 "elapsed."
 msgstr "在时间间隔尚未到达之前，旋转前的最大文件大小（以字节为单位）。"
 
-#: ../../source/mp/configuration.rst:103
+#: ../../source/mp/configuration.rst:119
 msgid "``--lookup-hash-log-max-files``"
 msgstr "``--lookup-hash-log-max-files``"
 
-#: ../../source/mp/configuration.rst:104
+#: ../../source/mp/configuration.rst:120
 msgid "``100``"
 msgstr "``100``"
 
-#: ../../source/mp/configuration.rst:105
+#: ../../source/mp/configuration.rst:121
 msgid ""
 "Max number of log files to keep.  Oldest files are deleted when this "
 "limit is exceeded."
 msgstr "保留的最大日志文件数量。当超过此限制时，最旧的文件将被删除。"
 
-#: ../../source/mp/configuration.rst:109
+#: ../../source/mp/configuration.rst:125
 msgid "HTTP Frontend"
 msgstr "HTTP 前端"
 
-#: ../../source/mp/configuration.rst:113
+#: ../../source/mp/configuration.rst:129
 msgid "The HTTP frontend is included when running ``lmcache server``."
 msgstr "HTTP 前端在运行 ``lmcache server`` 时包含在内。"
 
-#: ../../source/mp/configuration.rst:122
+#: ../../source/mp/configuration.rst:138
 msgid "``--http-host``"
 msgstr "``--http-host``"
 
-#: ../../source/mp/configuration.rst:123
+#: ../../source/mp/configuration.rst:139
 msgid "``0.0.0.0``"
 msgstr "``0.0.0.0``"
 
-#: ../../source/mp/configuration.rst:124
+#: ../../source/mp/configuration.rst:140
 msgid "Host to bind the HTTP (FastAPI/uvicorn) server."
 msgstr "绑定 HTTP (FastAPI/uvicorn) 服务器的主机。"
 
-#: ../../source/mp/configuration.rst:125
+#: ../../source/mp/configuration.rst:141
 msgid "``--http-port``"
 msgstr "``--http-port``"
 
-#: ../../source/mp/configuration.rst:126
+#: ../../source/mp/configuration.rst:142
 msgid "``8080``"
 msgstr "``8080``"
 
-#: ../../source/mp/configuration.rst:127
+#: ../../source/mp/configuration.rst:143
 msgid "Port to bind the HTTP server."
 msgstr "绑定 HTTP 服务器的端口。"
 
-#: ../../source/mp/configuration.rst:130
+#: ../../source/mp/configuration.rst:146
 msgid "L1 Memory Manager"
 msgstr "L1 内存管理器"
 
-#: ../../source/mp/configuration.rst:132 ../../source/mp/configuration.rst:163
-#: ../../source/mp/configuration.rst:182 ../../source/mp/configuration.rst:214
+#: ../../source/mp/configuration.rst:148 ../../source/mp/configuration.rst:179
+#: ../../source/mp/configuration.rst:198 ../../source/mp/configuration.rst:230
 msgid "Source: ``lmcache/v1/distributed/config.py``"
 msgstr "来源: ``lmcache/v1/distributed/config.py``"
 
-#: ../../source/mp/configuration.rst:141
+#: ../../source/mp/configuration.rst:157
 msgid "``--l1-size-gb``"
 msgstr "``--l1-size-gb``"
 
-#: ../../source/mp/configuration.rst:142 ../../source/mp/configuration.rst:192
+#: ../../source/mp/configuration.rst:158 ../../source/mp/configuration.rst:208
 msgid "*required*"
 msgstr "*必需*"
 
-#: ../../source/mp/configuration.rst:143
+#: ../../source/mp/configuration.rst:159
 msgid "Size of L1 memory in GB."
 msgstr "L1 内存的大小（以 GB 为单位）。"
 
-#: ../../source/mp/configuration.rst:144
+#: ../../source/mp/configuration.rst:160
 msgid "``--l1-use-lazy`` / ``--no-l1-use-lazy``"
 msgstr "``--l1-use-lazy`` / ``--no-l1-use-lazy``"
 
-#: ../../source/mp/configuration.rst:145
+#: ../../source/mp/configuration.rst:161
 msgid "``True``"
 msgstr "``True``"
 
-#: ../../source/mp/configuration.rst:146
+#: ../../source/mp/configuration.rst:162
 msgid ""
 "Enable or disable lazy allocation for L1 memory. Pass ``--l1-use-lazy`` "
 "to enable (default) or ``--no-l1-use-lazy`` to explicitly disable. Lazy "
@@ -372,67 +400,67 @@ msgstr ""
 "以显式禁用。延迟分配依赖于 ``cudart`` 主机固定内存，因此在非 CUDA 后端（其中 ``lmcache.torch_dev`` 不暴露"
 " ``cudart`` 属性）时，它会自动降级为急切分配，并记录警告，无论标志值如何。"
 
-#: ../../source/mp/configuration.rst:153
+#: ../../source/mp/configuration.rst:169
 msgid "``--l1-init-size-gb``"
 msgstr "``--l1-init-size-gb``"
 
-#: ../../source/mp/configuration.rst:154
+#: ../../source/mp/configuration.rst:170
 msgid "``20``"
 msgstr "``20``"
 
-#: ../../source/mp/configuration.rst:155
+#: ../../source/mp/configuration.rst:171
 msgid "Initial allocation size (GB) when using lazy allocation."
 msgstr "使用延迟分配时的初始分配大小（GB）。"
 
-#: ../../source/mp/configuration.rst:156
+#: ../../source/mp/configuration.rst:172
 msgid "``--l1-align-bytes``"
 msgstr "``--l1-align-bytes``"
 
-#: ../../source/mp/configuration.rst:157
+#: ../../source/mp/configuration.rst:173
 msgid "``4096``"
 msgstr "``4096``"
 
-#: ../../source/mp/configuration.rst:158
+#: ../../source/mp/configuration.rst:174
 msgid "Alignment size in bytes (default 4 KB)."
 msgstr "对齐大小（以字节为单位，默认 4 KB）。"
 
-#: ../../source/mp/configuration.rst:161
+#: ../../source/mp/configuration.rst:177
 msgid "L1 Manager TTLs"
 msgstr "L1 管理器 TTLs"
 
-#: ../../source/mp/configuration.rst:172
+#: ../../source/mp/configuration.rst:188
 msgid "``--l1-write-ttl-seconds``"
 msgstr "``--l1-write-ttl-seconds``"
 
-#: ../../source/mp/configuration.rst:173
+#: ../../source/mp/configuration.rst:189
 msgid "``600``"
 msgstr "``600``"
 
-#: ../../source/mp/configuration.rst:174
+#: ../../source/mp/configuration.rst:190
 msgid "Time-to-live for each object's write lock (seconds)."
 msgstr "每个对象的写锁的生存时间（秒）。"
 
-#: ../../source/mp/configuration.rst:175
+#: ../../source/mp/configuration.rst:191
 msgid "``--l1-read-ttl-seconds``"
 msgstr "``--l1-read-ttl-seconds``"
 
-#: ../../source/mp/configuration.rst:176
+#: ../../source/mp/configuration.rst:192
 msgid "``300``"
 msgstr "``300``"
 
-#: ../../source/mp/configuration.rst:177
+#: ../../source/mp/configuration.rst:193
 msgid "Time-to-live for each object's read lock (seconds)."
 msgstr "每个对象的读取锁的生存时间（秒）。"
 
-#: ../../source/mp/configuration.rst:180
+#: ../../source/mp/configuration.rst:196
 msgid "Eviction Policy"
 msgstr "逐出策略"
 
-#: ../../source/mp/configuration.rst:191
+#: ../../source/mp/configuration.rst:207
 msgid "``--eviction-policy``"
 msgstr "``--eviction-policy``"
 
-#: ../../source/mp/configuration.rst:193
+#: ../../source/mp/configuration.rst:209
 msgid ""
 "Eviction policy. Choices: ``LRU``, ``IsolatedLRU``, ``noop``. Use "
 "``noop`` for buffer-only mode where L1 acts as a pure write buffer (data "
@@ -449,39 +477,39 @@ msgstr ""
 "quota-api`）；没有注册配额的 ``cache_salt`` 的有效限制为 ``0`` "
 "字节，因此其数据将在下一个逐出周期被逐出（白名单语义）。"
 
-#: ../../source/mp/configuration.rst:204
+#: ../../source/mp/configuration.rst:220
 msgid "``--eviction-trigger-watermark``"
 msgstr "``--eviction-trigger-watermark``"
 
-#: ../../source/mp/configuration.rst:205
+#: ../../source/mp/configuration.rst:221
 msgid "``0.8``"
 msgstr "``0.8``"
 
-#: ../../source/mp/configuration.rst:206
+#: ../../source/mp/configuration.rst:222
 msgid "Memory usage ratio (0.0--1.0) that triggers eviction."
 msgstr "触发逐出的内存使用比例 (0.0--1.0)。"
 
-#: ../../source/mp/configuration.rst:207
+#: ../../source/mp/configuration.rst:223
 msgid "``--eviction-ratio``"
 msgstr "``--eviction-ratio``"
 
-#: ../../source/mp/configuration.rst:208
+#: ../../source/mp/configuration.rst:224
 msgid "``0.2``"
 msgstr "``0.2``"
 
-#: ../../source/mp/configuration.rst:209
+#: ../../source/mp/configuration.rst:225
 msgid "Fraction of allocated memory to evict when triggered (0.0--1.0)."
 msgstr "触发时逐出的已分配内存比例 (0.0--1.0)。"
 
-#: ../../source/mp/configuration.rst:212
+#: ../../source/mp/configuration.rst:228
 msgid "L2 Policies"
 msgstr "L2 策略"
 
-#: ../../source/mp/configuration.rst:223
+#: ../../source/mp/configuration.rst:239
 msgid "``--l2-store-policy``"
 msgstr "``--l2-store-policy``"
 
-#: ../../source/mp/configuration.rst:225
+#: ../../source/mp/configuration.rst:241
 msgid ""
 "L2 store policy.  Determines which adapters receive each key and whether "
 "keys are deleted from L1 after L2 store. The ``default`` policy stores "
@@ -493,11 +521,11 @@ msgstr ""
 "L1。``skip_l1`` 策略将所有键存储到所有适配器，然后从 L1 "
 "中删除它们（仅缓冲区模式）。可选项：``default``，``skip_l1``。"
 
-#: ../../source/mp/configuration.rst:231
+#: ../../source/mp/configuration.rst:247
 msgid "``--l2-prefetch-policy``"
 msgstr "``--l2-prefetch-policy``"
 
-#: ../../source/mp/configuration.rst:233
+#: ../../source/mp/configuration.rst:249
 msgid ""
 "L2 prefetch policy.  Determines which adapter loads each key when "
 "multiple adapters have it. The ``default`` policy picks the first adapter"
@@ -509,30 +537,46 @@ msgstr ""
 "策略选择第一个适配器（最低索引）。预取的键是临时的（在读取器完成后删除）。``retain`` 策略使用相同的加载计划，但将预取的键永久保留在 "
 "L1 中。选择：``default``，``retain``。"
 
-#: ../../source/mp/configuration.rst:240
+#: ../../source/mp/configuration.rst:256
 msgid "``--l2-prefetch-max-in-flight``"
 msgstr "``--l2-prefetch-max-in-flight``"
 
-#: ../../source/mp/configuration.rst:241
+#: ../../source/mp/configuration.rst:257
 msgid "``8``"
 msgstr "``8``"
 
-#: ../../source/mp/configuration.rst:242
+#: ../../source/mp/configuration.rst:258
 msgid ""
 "Maximum number of concurrent prefetch (L2 load) requests. Limits how many"
 " in-flight loads the PrefetchController may issue at once, preventing "
 "excessive L1 memory pressure."
 msgstr "最大并发预取（L2 加载）请求的数量。限制 PrefetchController 同时发出的在途加载数量，以防止过度的 L1 内存压力。"
 
-#: ../../source/mp/configuration.rst:247
+#: ../../source/mp/configuration.rst:261
+msgid "``--periodic-notifier-interval-ms``"
+msgstr "``--periodic-notifier-interval-ms``"
+
+#: ../../source/mp/configuration.rst:262
+msgid "``5``"
+msgstr "``5``"
+
+#: ../../source/mp/configuration.rst:263
+msgid ""
+"Interval in milliseconds for the periodic event notifier heartbeat.  A "
+"native C++ background thread writes to all registered file descriptors at"
+" this interval, waking controller poll loops for L2 adapters that lack "
+"native async completion callbacks."
+msgstr "定期事件通知器心跳的间隔（以毫秒为单位）。一个本地 C++ 后台线程在此间隔内写入所有注册的文件描述符，唤醒缺少本地异步完成回调的 L2 适配器的控制器轮询循环。"
+
+#: ../../source/mp/configuration.rst:270
 msgid "L2 Adapters"
 msgstr "L2 适配器"
 
-#: ../../source/mp/configuration.rst:249
+#: ../../source/mp/configuration.rst:272
 msgid "Source: ``lmcache/v1/distributed/l2_adapters/config.py``"
 msgstr "源: ``lmcache/v1/distributed/l2_adapters/config.py``"
 
-#: ../../source/mp/configuration.rst:251
+#: ../../source/mp/configuration.rst:274
 msgid ""
 "L2 adapters are configured via repeatable ``--l2-adapter <JSON>`` "
 "arguments. Each JSON object must include a ``\"type\"`` field that "
@@ -542,23 +586,23 @@ msgstr ""
 "L2 适配器通过可重复的 ``--l2-adapter <JSON>`` 参数进行配置。每个 JSON 对象必须包含一个 ``\"type\"``"
 " 字段，用于选择适配器类型。``--l2-adapter`` 参数的顺序决定了适配器的顺序（级联）。"
 
-#: ../../source/mp/configuration.rst:255
+#: ../../source/mp/configuration.rst:278
 msgid ""
 "Registered adapter types: ``nixl_store``, ``nixl_store_dynamic``, ``fs``,"
 " ``fs_native``, ``mock``, ``mooncake_store``, ``s3``, ``resp``, "
 "``plugin``, ``native_plugin``, ``raw_block``, ``dax``."
 msgstr "注册的适配器类型：``nixl_store``、``nixl_store_dynamic``、``fs``、``fs_native``、``mock``、``mooncake_store``、``s3``、``resp``、``plugin``、``native_plugin``、``raw_block``、``dax``。"
 
-#: ../../source/mp/configuration.rst:260
+#: ../../source/mp/configuration.rst:283
 msgid "``nixl_store`` -- NIXL-based persistent storage"
 msgstr "``nixl_store`` -- 基于 NIXL 的持久存储"
 
-#: ../../source/mp/configuration.rst:262 ../../source/mp/configuration.rst:298
-#: ../../source/mp/configuration.rst:318 ../../source/mp/configuration.rst:336
+#: ../../source/mp/configuration.rst:285 ../../source/mp/configuration.rst:321
+#: ../../source/mp/configuration.rst:341 ../../source/mp/configuration.rst:359
 msgid "Fields:"
 msgstr "字段："
 
-#: ../../source/mp/configuration.rst:264
+#: ../../source/mp/configuration.rst:287
 msgid ""
 "``backend`` *(required)*: One of ``POSIX``, ``GDS``, ``GDS_MT``, "
 "``HF3FS``, ``OBJ``, ``AZURE_BLOB``."
@@ -566,7 +610,7 @@ msgstr ""
 "``backend`` *(必需)*: 选项包括 ``POSIX``, ``GDS``, ``GDS_MT``, ``HF3FS``, "
 "``OBJ``, ``AZURE_BLOB``。"
 
-#: ../../source/mp/configuration.rst:265
+#: ../../source/mp/configuration.rst:288
 msgid ""
 "``backend_params`` *(required for file-based backends)*: Dict of string "
 "key-value pairs.  File-based backends (``GDS``, ``GDS_MT``, ``POSIX``, "
@@ -575,65 +619,65 @@ msgstr ""
 "``backend_params`` *(文件基础后端必需)*：字符串键值对的字典。文件基础后端（``GDS``, ``GDS_MT``, "
 "``POSIX``, ``HF3FS``）需要 ``file_path`` 和 ``use_direct_io``。"
 
-#: ../../source/mp/configuration.rst:268
+#: ../../source/mp/configuration.rst:291
 msgid ""
 "``pool_size`` *(required)*: Number of storage descriptors to pre-allocate"
 " (> 0)."
 msgstr "``pool_size`` *(必需)*: 预分配的存储描述符数量 (> 0)。"
 
-#: ../../source/mp/configuration.rst:270 ../../source/mp/configuration.rst:305
+#: ../../source/mp/configuration.rst:293 ../../source/mp/configuration.rst:328
 msgid "Examples:"
 msgstr "示例："
 
-#: ../../source/mp/configuration.rst:294
+#: ../../source/mp/configuration.rst:317
 msgid "``fs`` -- File-system backed storage"
 msgstr "``fs`` -- 文件系统支持的存储"
 
-#: ../../source/mp/configuration.rst:296
+#: ../../source/mp/configuration.rst:319
 msgid "A pure file-system L2 adapter using async I/O."
 msgstr "一个使用异步 I/O 的纯文件系统 L2 适配器。"
 
-#: ../../source/mp/configuration.rst:300
+#: ../../source/mp/configuration.rst:323
 msgid "``base_path`` *(required)*: Directory for storing KV cache files."
 msgstr "``base_path`` *(必需)*: 存储 KV Cache 文件的目录。"
 
-#: ../../source/mp/configuration.rst:301
+#: ../../source/mp/configuration.rst:324
 msgid "``relative_tmp_dir`` *(optional)*: Relative sub-dir for temp files."
 msgstr "``relative_tmp_dir`` *(可选)*: 临时文件的相对子目录。"
 
-#: ../../source/mp/configuration.rst:302
+#: ../../source/mp/configuration.rst:325
 msgid ""
 "``read_ahead_size`` *(optional)*: Trigger read-ahead by reading this many"
 " bytes first."
 msgstr "``read_ahead_size`` *(可选)*: 通过首先读取这么多字节来触发预读。"
 
-#: ../../source/mp/configuration.rst:303
+#: ../../source/mp/configuration.rst:326
 msgid ""
 "``use_odirect`` *(optional)*: Bypass page cache via ``O_DIRECT`` (default"
 " ``false``)."
 msgstr "``use_odirect`` *(可选)*：通过 ``O_DIRECT`` 跳过页面缓存（默认 ``false``）。"
 
-#: ../../source/mp/configuration.rst:316
+#: ../../source/mp/configuration.rst:339
 msgid "``mock`` -- Mock adapter for testing"
 msgstr "``mock`` -- 测试用的模拟适配器"
 
-#: ../../source/mp/configuration.rst:320
+#: ../../source/mp/configuration.rst:343
 msgid "``max_size_gb`` *(required)*: Maximum size of the adapter in GB (> 0)."
 msgstr "``max_size_gb`` *(必需)*：适配器的最大大小（以 GB 为单位，> 0）。"
 
-#: ../../source/mp/configuration.rst:321
+#: ../../source/mp/configuration.rst:344
 msgid "``mock_bandwidth_gb`` *(required)*: Simulated bandwidth in GB/sec (> 0)."
 msgstr "``mock_bandwidth_gb`` *(必需)*: 模拟带宽，单位为 GB/秒 (> 0)。"
 
-#: ../../source/mp/configuration.rst:323 ../../source/mp/configuration.rst:351
+#: ../../source/mp/configuration.rst:346 ../../source/mp/configuration.rst:374
 msgid "Example:"
 msgstr "示例："
 
-#: ../../source/mp/configuration.rst:330
+#: ../../source/mp/configuration.rst:353
 msgid "``s3`` -- S3-compatible object store"
 msgstr "``s3`` -- 兼容 S3 的对象存储"
 
-#: ../../source/mp/configuration.rst:332
+#: ../../source/mp/configuration.rst:355
 msgid ""
 "S3-backed L2 adapter using the AWS CRT (Common Runtime) for high-"
 "throughput transfers to AWS S3 or any S3-compatible endpoint. See "
@@ -642,39 +686,39 @@ msgstr ""
 "使用 AWS CRT（通用运行时）的 S3 后端 L2 适配器，以实现高吞吐量传输到 AWS S3 或任何 S3 兼容的端点。有关详细信息，请参见"
 " :doc:`l2_storage`。"
 
-#: ../../source/mp/configuration.rst:338
+#: ../../source/mp/configuration.rst:361
 msgid ""
 "``s3_endpoint`` *(required)*: Bucket URL, either ``\"s3://<bucket>\"`` or"
 " the bare host form."
 msgstr "``s3_endpoint`` *(必需)*: 存储桶 URL，可以是 ``\"s3://<bucket>\"`` 或裸主机形式。"
 
-#: ../../source/mp/configuration.rst:340
+#: ../../source/mp/configuration.rst:363
 msgid "``s3_region`` *(required)*: AWS region string."
 msgstr "``s3_region`` *(必需)*: AWS 区域字符串。"
 
-#: ../../source/mp/configuration.rst:341
+#: ../../source/mp/configuration.rst:364
 msgid "``s3_num_io_threads`` *(optional, default ``64``)*: CRT I/O threads."
 msgstr "``s3_num_io_threads`` *(可选，默认 ``64``)*: CRT I/O 线程。"
 
-#: ../../source/mp/configuration.rst:342
+#: ../../source/mp/configuration.rst:365
 msgid ""
 "``s3_prefer_http2`` *(optional, default ``true``)*: Negotiate HTTP/2 via "
 "ALPN."
 msgstr "``s3_prefer_http2`` *(可选，默认 ``true``)*: 通过 ALPN 协商 HTTP/2。"
 
-#: ../../source/mp/configuration.rst:343
+#: ../../source/mp/configuration.rst:366
 msgid ""
 "``s3_enable_s3express`` *(optional, default ``false``)*: Enable S3 "
 "Express signing."
 msgstr "``s3_enable_s3express`` *(可选，默认 ``false``)*: 启用 S3 Express 签名。"
 
-#: ../../source/mp/configuration.rst:344
+#: ../../source/mp/configuration.rst:367
 msgid ""
 "``disable_tls`` *(optional, default ``false``)*: Bypass TLS (for non-AWS "
 "HTTP endpoints)."
 msgstr "``disable_tls`` *(可选，默认 ``false``)*: 跳过 TLS（用于非 AWS HTTP 端点）。"
 
-#: ../../source/mp/configuration.rst:346
+#: ../../source/mp/configuration.rst:369
 msgid ""
 "``aws_access_key_id`` / ``aws_secret_access_key`` *(optional)*: Static "
 "credentials; omit to use the default credential provider chain."
@@ -682,7 +726,7 @@ msgstr ""
 "``aws_access_key_id`` / ``aws_secret_access_key`` *(可选)*: "
 "静态凭证；省略以使用默认凭证提供程序链。"
 
-#: ../../source/mp/configuration.rst:348
+#: ../../source/mp/configuration.rst:371
 msgid ""
 "``max_capacity_gb`` *(optional, default ``0.0``)*: Aggregate capacity "
 "used by ``get_usage()``. A value of ``0`` disables aggregate eviction."
@@ -690,112 +734,112 @@ msgstr ""
 "``max_capacity_gb`` *(可选，默认 ``0.0``)*: ``get_usage()`` 使用的总容量。值为 ``0`` "
 "将禁用总的逐出。"
 
-#: ../../source/mp/configuration.rst:358
+#: ../../source/mp/configuration.rst:381
 msgid "Multiple adapters (cascade)"
 msgstr "多个适配器（级联）"
 
-#: ../../source/mp/configuration.rst:360
+#: ../../source/mp/configuration.rst:383
 msgid ""
 "Pass ``--l2-adapter`` multiple times.  Adapters are used in the order "
 "given:"
 msgstr "多次传递 ``--l2-adapter``。适配器按给定顺序使用："
 
-#: ../../source/mp/configuration.rst:368
+#: ../../source/mp/configuration.rst:391
 msgid "Observability"
 msgstr "可观察性"
 
-#: ../../source/mp/configuration.rst:370
+#: ../../source/mp/configuration.rst:393
 msgid "Source: ``lmcache/v1/mp_observability/config.py``"
 msgstr "源: ``lmcache/v1/mp_observability/config.py``"
 
-#: ../../source/mp/configuration.rst:372
+#: ../../source/mp/configuration.rst:395
 msgid ""
 "See :doc:`observability` for full details on the three modes (metrics, "
 "logging, tracing)."
 msgstr "有关三种模式（指标、日志、跟踪）的完整详细信息，请参见 :doc:`observability`。"
 
-#: ../../source/mp/configuration.rst:382
+#: ../../source/mp/configuration.rst:405
 msgid "``--disable-observability``"
 msgstr "``--disable-observability``"
 
-#: ../../source/mp/configuration.rst:383 ../../source/mp/configuration.rst:386
-#: ../../source/mp/configuration.rst:389 ../../source/mp/configuration.rst:392
+#: ../../source/mp/configuration.rst:406 ../../source/mp/configuration.rst:409
+#: ../../source/mp/configuration.rst:412 ../../source/mp/configuration.rst:415
 msgid "off"
 msgstr "关闭"
 
-#: ../../source/mp/configuration.rst:384
+#: ../../source/mp/configuration.rst:407
 msgid "Master switch: disable the EventBus entirely."
 msgstr "主开关：完全禁用 EventBus。"
 
-#: ../../source/mp/configuration.rst:385
+#: ../../source/mp/configuration.rst:408
 msgid "``--disable-metrics``"
 msgstr "``--disable-metrics``"
 
-#: ../../source/mp/configuration.rst:387
+#: ../../source/mp/configuration.rst:410
 msgid "Skip metrics subscribers (no Prometheus endpoint)."
 msgstr "跳过指标订阅者（没有 Prometheus 端点）。"
 
-#: ../../source/mp/configuration.rst:388
+#: ../../source/mp/configuration.rst:411
 msgid "``--disable-logging``"
 msgstr "``--disable-logging``"
 
-#: ../../source/mp/configuration.rst:390
+#: ../../source/mp/configuration.rst:413
 msgid "Skip logging subscribers."
 msgstr "跳过日志订阅者。"
 
-#: ../../source/mp/configuration.rst:391
+#: ../../source/mp/configuration.rst:414
 msgid "``--enable-tracing``"
 msgstr "``--enable-tracing``"
 
-#: ../../source/mp/configuration.rst:393
+#: ../../source/mp/configuration.rst:416
 msgid "Register tracing subscribers. Requires ``--otlp-endpoint``."
 msgstr "注册追踪订阅者。需要 ``--otlp-endpoint``。"
 
-#: ../../source/mp/configuration.rst:394
+#: ../../source/mp/configuration.rst:417
 msgid "``--event-bus-queue-size``"
 msgstr "``--event-bus-queue-size``"
 
-#: ../../source/mp/configuration.rst:395
+#: ../../source/mp/configuration.rst:418
 msgid "``10000``"
 msgstr "``10000``"
 
-#: ../../source/mp/configuration.rst:396
+#: ../../source/mp/configuration.rst:419
 msgid "Max events in the EventBus queue before tail-drop."
 msgstr "事件总线队列中最大事件数，超过后将进行尾部丢弃。"
 
-#: ../../source/mp/configuration.rst:397
+#: ../../source/mp/configuration.rst:420
 msgid "``--otlp-endpoint``"
 msgstr "``--otlp-endpoint``"
 
-#: ../../source/mp/configuration.rst:398
+#: ../../source/mp/configuration.rst:421
 msgid "*(none)*"
 msgstr "*(无)*"
 
-#: ../../source/mp/configuration.rst:399
+#: ../../source/mp/configuration.rst:422
 msgid "OTLP gRPC endpoint for exporting metrics and traces."
 msgstr "用于导出指标和跟踪的 OTLP gRPC 端点。"
 
-#: ../../source/mp/configuration.rst:400
+#: ../../source/mp/configuration.rst:423
 msgid "``--prometheus-port``"
 msgstr "``--prometheus-port``"
 
-#: ../../source/mp/configuration.rst:401
+#: ../../source/mp/configuration.rst:424
 msgid "``9090``"
 msgstr "``9090``"
 
-#: ../../source/mp/configuration.rst:402
+#: ../../source/mp/configuration.rst:425
 msgid "Port for the Prometheus ``/metrics`` endpoint."
 msgstr "Prometheus ``/metrics`` 端点的端口。"
 
-#: ../../source/mp/configuration.rst:403
+#: ../../source/mp/configuration.rst:426
 msgid "``--service-instance-id``"
 msgstr "``--service-instance-id``"
 
-#: ../../source/mp/configuration.rst:404
+#: ../../source/mp/configuration.rst:427
 msgid "*(unset, default UUID v4)*"
 msgstr "*(未设置，默认 UUID v4)*"
 
-#: ../../source/mp/configuration.rst:405
+#: ../../source/mp/configuration.rst:428
 msgid ""
 "Identifier for this MP server instance, attached as the OTel Resource "
 "attribute ``service.instance.id`` on every metric and span. When the flag"
@@ -806,55 +850,65 @@ msgstr ""
 "附加在每个指标和跨度上。当未传递该标志时，默认为随机 UUID v4。传递 ``--service-instance-id=\\\"\\\"`` "
 "以强制设置为空值。"
 
-#: ../../source/mp/configuration.rst:411
+#: ../../source/mp/configuration.rst:434
 msgid "vLLM Client Configuration"
 msgstr "vLLM 客户端配置"
 
-#: ../../source/mp/configuration.rst:413
+#: ../../source/mp/configuration.rst:436
 msgid ""
 "On the vLLM side, specify the LMCache server host and port via the "
 "``kv_connector_extra_config`` parameter:"
 msgstr "在 vLLM 端，通过 ``kv_connector_extra_config`` 参数指定 LMCache 服务器的主机和端口："
 
-#: ../../source/mp/configuration.rst:422
+#: ../../source/mp/configuration.rst:445
 msgid ""
 "``LMCacheMPConnector`` reads the following keys from "
 "``kv_connector_extra_config``:"
 msgstr "``LMCacheMPConnector`` 从 ``kv_connector_extra_config`` 中读取以下键："
 
-#: ../../source/mp/configuration.rst:429
+#: ../../source/mp/configuration.rst:449
+msgid "Connector ``extra_config`` Keys"
+msgstr "连接器 ``extra_config`` 键"
+
+#: ../../source/mp/configuration.rst:451
+msgid ""
+"All connector-level options are passed through "
+"``kv_connector_extra_config`` and use the ``lmcache.mp.`` prefix."
+msgstr "所有连接器级别的选项都通过 ``kv_connector_extra_config`` 传递，并使用 ``lmcache.mp.`` 前缀。"
+
+#: ../../source/mp/configuration.rst:458
 msgid "Key"
 msgstr "键"
 
-#: ../../source/mp/configuration.rst:432
+#: ../../source/mp/configuration.rst:461
 msgid "``lmcache.mp.host``"
 msgstr "``lmcache.mp.host``"
 
-#: ../../source/mp/configuration.rst:433
+#: ../../source/mp/configuration.rst:462
 msgid "``tcp://localhost``"
 msgstr "``tcp://localhost``"
 
-#: ../../source/mp/configuration.rst:434
+#: ../../source/mp/configuration.rst:463
 msgid "Host (with ZMQ transport prefix) of the LMCache MP server."
 msgstr "LMCache MP 服务器的主机（带有 ZMQ 传输前缀）。"
 
-#: ../../source/mp/configuration.rst:435
+#: ../../source/mp/configuration.rst:464
 msgid "``lmcache.mp.port``"
 msgstr "``lmcache.mp.port``"
 
-#: ../../source/mp/configuration.rst:437
+#: ../../source/mp/configuration.rst:466
 msgid "Port of the LMCache MP server. Must match the server's ``--port``."
 msgstr "LMCache MP 服务器的端口。必须与服务器的 ``--port`` 匹配。"
 
-#: ../../source/mp/configuration.rst:438
+#: ../../source/mp/configuration.rst:467
 msgid "``lmcache.mp.mq_timeout``"
 msgstr "``lmcache.mp.mq_timeout``"
 
-#: ../../source/mp/configuration.rst:439
+#: ../../source/mp/configuration.rst:468
 msgid "``300.0``"
 msgstr "``300.0``"
 
-#: ../../source/mp/configuration.rst:440
+#: ../../source/mp/configuration.rst:469
 msgid ""
 "Timeout (seconds) for blocking message-queue requests, including the "
 "initial chunk-size query and KV cache registration/unregistration. If the"
@@ -864,33 +918,45 @@ msgstr ""
 "阻塞消息队列请求的超时时间（秒），包括初始块大小查询和 KV Cache 注册/注销。如果服务器在此时间窗口内未响应，连接器将在启动时引发 "
 "``ConnectionError``。"
 
-#: ../../source/mp/configuration.rst:444
+#: ../../source/mp/configuration.rst:473
 msgid "``lmcache.mp.heartbeat_interval``"
 msgstr "``lmcache.mp.heartbeat_interval``"
 
-#: ../../source/mp/configuration.rst:445
+#: ../../source/mp/configuration.rst:474
 msgid "``10.0``"
 msgstr "``10.0``"
 
-#: ../../source/mp/configuration.rst:446
+#: ../../source/mp/configuration.rst:475
 msgid ""
 "Interval (seconds) between periodic heartbeat pings sent from the "
 "connector to the server."
 msgstr "连接器向服务器发送的定期心跳 ping 之间的间隔（秒）。"
 
-#: ../../source/mp/configuration.rst:450
+#: ../../source/mp/configuration.rst:477
+msgid "``lmcache.mp.mp_transfer_mode``"
+msgstr "``lmcache.mp.mp_transfer_mode``"
+
+#: ../../source/mp/configuration.rst:479
+msgid ""
+"Routing mode for the worker -> server transfer context. One of ``auto`` "
+"(CUDA -> handle, others -> data), ``handle`` (force IPC / SHM zero-copy),"
+" or ``data`` (force worker-side gather/scatter copy). Overrides the "
+"``LMCACHE_MP_TRANSFER_MODE`` env var when set."
+msgstr "用于工作者与服务器传输上下文的路由模式。可以是 ``auto``（CUDA -> 句柄，其他 -> 数据）、``handle``（强制 IPC / SHM 零拷贝）或 ``data``（强制工作者端聚合/分散拷贝）。当设置时，将覆盖 ``LMCACHE_MP_TRANSFER_MODE`` 环境变量。"
+
+#: ../../source/mp/configuration.rst:485
 msgid "Environment Variables"
 msgstr "环境变量"
 
-#: ../../source/mp/configuration.rst:456
+#: ../../source/mp/configuration.rst:491
 msgid "Variable"
 msgstr "变量"
 
-#: ../../source/mp/configuration.rst:458
+#: ../../source/mp/configuration.rst:493
 msgid "``LMCACHE_LOG_LEVEL``"
 msgstr "``LMCACHE_LOG_LEVEL``"
 
-#: ../../source/mp/configuration.rst:459
+#: ../../source/mp/configuration.rst:494
 msgid ""
 "Log level for LMCache (``DEBUG``, ``INFO``, ``WARNING``, ``ERROR``). Set "
 "to ``DEBUG`` to see L2 store activity, prefetch results, etc."
@@ -898,17 +964,17 @@ msgstr ""
 "LMCache 的日志级别（``DEBUG``、``INFO``、``WARNING``、``ERROR``）。设置为 ``DEBUG`` 以查看"
 " L2 存储活动、预取结果等。"
 
-#: ../../source/mp/configuration.rst:461
+#: ../../source/mp/configuration.rst:496
 msgid "``PYTHONHASHSEED``"
 msgstr "``PYTHONHASHSEED``"
 
-#: ../../source/mp/configuration.rst:462
+#: ../../source/mp/configuration.rst:497
 msgid ""
 "Set to a fixed value for reproducible hashing across processes (relevant "
 "when using ``--hash-algorithm builtin``)."
 msgstr "设置为固定值以实现跨进程的可重复哈希（在使用 ``--hash-algorithm builtin`` 时相关）。"
 
-#: ../../source/mp/configuration.rst:466
+#: ../../source/mp/configuration.rst:501
 msgid "Full Example"
 msgstr "完整示例"
 
@@ -921,3 +987,13 @@ msgstr "完整示例"
 #~ "启用或禁用 L1 内存的延迟分配。传递 ``--l1-use-lazy`` "
 #~ "以启用（默认）或 ``--no-l1-use-lazy`` 以显式禁用。"
 
+#~ msgid ""
+#~ "Cache engine backend type. ``default`` "
+#~ "uses MPCacheEngine; ``blend`` uses "
+#~ "BlendEngineV2 for cross-request KV "
+#~ "reuse. Choices: ``default``, ``blend``."
+#~ msgstr ""
+#~ "缓存引擎后端类型。``default`` 使用 MPCacheEngine；``blend`` 使用"
+#~ " BlendEngineV2 进行跨请求的 KV "
+#~ "重用。可选项：``default``，``blend``。"
+
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/mp/coordinator.po b/docs/source/locale/zh_CN/LC_MESSAGES/mp/coordinator.po
new file mode 100644
index 0000000000..940973bb87
--- /dev/null
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/mp/coordinator.po
@@ -0,0 +1,222 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2024, The LMCache Team
+# This file is distributed under the same license as the LMCache package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2026.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: LMCache \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2026-06-08 10:45+0000\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: zh_CN\n"
+"Language-Team: zh_CN <LL@li.org>\n"
+"Plural-Forms: nplurals=1; plural=0;\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.18.0\n"
+
+#: ../../source/mp/coordinator.rst:2
+msgid "Multi-Server Coordination"
+msgstr "多服务器协调"
+
+#: ../../source/mp/coordinator.rst:4
+msgid ""
+"When you run more than one LMCache multiprocess (MP) server, the **MP "
+"Coordinator** is a standalone service they register with, giving you a "
+"single, fleet-wide view of every running server. Each MP server caches "
+"independently; the coordinator ties them together into one coordinated "
+"fleet."
+msgstr "当您运行多个 LMCache 多进程 (MP) 服务器时，**MP 协调器**是一个独立的服务，它们会向其注册，从而为您提供每个运行中的服务器的单一、全队视图。每个 MP 服务器独立缓存；协调器将它们结合成一个协调的队列。"
+
+#: ../../source/mp/coordinator.rst:10
+msgid "Running the coordinator"
+msgstr "运行协调器"
+
+#: ../../source/mp/coordinator.rst:12
+msgid "The coordinator is a FastAPI service. Start it with:"
+msgstr "协调器是一个 FastAPI 服务。使用以下命令启动它："
+
+#: ../../source/mp/coordinator.rst:18
+msgid "Expected log output:"
+msgstr "预期的日志输出："
+
+#: ../../source/mp/coordinator.rst:24
+msgid ""
+"The CLI accepts ``--host``, ``--port``, ``--instance-timeout``, and "
+"``--health-check-interval``; any flag overrides the matching environment "
+"variable below. See :doc:`/cli/coordinator` for details. Equivalently, "
+"the coordinator can still be launched as a module with ``python3 -m "
+"lmcache.v1.mp_coordinator``."
+msgstr "CLI 接受 ``--host``、``--port``、``--instance-timeout`` 和 ``--health-check-interval``；任何标志都会覆盖下面匹配的环境变量。有关详细信息，请参见 :doc:`/cli/coordinator`。同样，协调器仍然可以通过 ``python3 -m lmcache.v1.mp_coordinator`` 作为模块启动。"
+
+#: ../../source/mp/coordinator.rst:31
+msgid "Configuration"
+msgstr "配置"
+
+#: ../../source/mp/coordinator.rst:33
+msgid ""
+"The coordinator is configured through ``LMCACHE_MP_COORDINATOR_*`` "
+"environment variables:"
+msgstr "协调器通过 ``LMCACHE_MP_COORDINATOR_*`` 环境变量进行配置："
+
+#: ../../source/mp/coordinator.rst:40
+msgid "Environment variable"
+msgstr "环境变量"
+
+#: ../../source/mp/coordinator.rst:41
+msgid "Default"
+msgstr "默认"
+
+#: ../../source/mp/coordinator.rst:42 ../../source/mp/coordinator.rst:73
+msgid "Description"
+msgstr "描述"
+
+#: ../../source/mp/coordinator.rst:43
+msgid "``LMCACHE_MP_COORDINATOR_HOST``"
+msgstr "``LMCACHE_MP_COORDINATOR_HOST``"
+
+#: ../../source/mp/coordinator.rst:44
+msgid "``0.0.0.0``"
+msgstr "``0.0.0.0``"
+
+#: ../../source/mp/coordinator.rst:45
+msgid "Host the HTTP server binds to."
+msgstr "HTTP服务器绑定的主机。"
+
+#: ../../source/mp/coordinator.rst:46
+msgid "``LMCACHE_MP_COORDINATOR_PORT``"
+msgstr "``LMCACHE_MP_COORDINATOR_PORT``"
+
+#: ../../source/mp/coordinator.rst:47
+msgid "``9300``"
+msgstr "``9300``"
+
+#: ../../source/mp/coordinator.rst:48
+msgid "Port the HTTP server binds to."
+msgstr "HTTP服务器绑定的端口。"
+
+#: ../../source/mp/coordinator.rst:49
+msgid "``LMCACHE_MP_COORDINATOR_INSTANCE_TIMEOUT``"
+msgstr "``LMCACHE_MP_COORDINATOR_INSTANCE_TIMEOUT``"
+
+#: ../../source/mp/coordinator.rst:50
+msgid "``30``"
+msgstr "``30``"
+
+#: ../../source/mp/coordinator.rst:51
+msgid ""
+"Seconds without a heartbeat after which a server is dropped from the "
+"fleet."
+msgstr "在没有心跳的情况下经过的秒数，之后服务器将从集群中移除。"
+
+#: ../../source/mp/coordinator.rst:53
+msgid "``LMCACHE_MP_COORDINATOR_HEALTH_CHECK_INTERVAL``"
+msgstr "``LMCACHE_MP_COORDINATOR_HEALTH_CHECK_INTERVAL``"
+
+#: ../../source/mp/coordinator.rst:54
+msgid "``10``"
+msgstr "``10``"
+
+#: ../../source/mp/coordinator.rst:55
+msgid "Seconds between health-check sweeps. ``0`` disables eviction."
+msgstr "健康检查扫描之间的秒数。``0`` 禁用逐出。"
+
+#: ../../source/mp/coordinator.rst:58
+msgid "Connecting MP servers"
+msgstr "连接 MP 服务器"
+
+#: ../../source/mp/coordinator.rst:60
+msgid ""
+"An MP server (``lmcache server``) joins the coordinator when you point it"
+" at one with ``--coordinator-url``. It registers on startup, heartbeats "
+"while running, and deregisters on shutdown -- all on the server's own "
+"event loop. This is opt-in: with no URL set, the server runs exactly as "
+"before. Each flag falls back to a matching ``LMCACHE_COORDINATOR_*`` "
+"environment variable (handy for the Kubernetes downward API); an explicit"
+" flag wins over the env var."
+msgstr "当您使用 ``--coordinator-url`` 指向一个协调器时，MP 服务器（``lmcache server``）会加入协调器。它在启动时注册，运行时进行心跳，关闭时注销——所有这些都在服务器自己的事件循环中。这是可选的：如果未设置 URL，服务器将按以前的方式运行。每个标志都可以回退到匹配的 ``LMCACHE_COORDINATOR_*`` 环境变量（对于 Kubernetes 向下 API 很方便）；显式标志优先于环境变量。"
+
+#: ../../source/mp/coordinator.rst:71
+msgid "Flag (on the MP server)"
+msgstr "标志（在 MP 服务器上）"
+
+#: ../../source/mp/coordinator.rst:72
+msgid "Env fallback"
+msgstr "环境回退"
+
+#: ../../source/mp/coordinator.rst:74
+msgid "``--coordinator-url``"
+msgstr "``--coordinator-url``"
+
+#: ../../source/mp/coordinator.rst:75
+msgid "``LMCACHE_COORDINATOR_URL``"
+msgstr "``LMCACHE_COORDINATOR_URL``"
+
+#: ../../source/mp/coordinator.rst:76
+msgid ""
+"Coordinator base URL, e.g. ``http://coordinator:9300``. Enables "
+"registration when set."
+msgstr "协调器基础 URL，例如 ``http://coordinator:9300``。设置后启用注册。"
+
+#: ../../source/mp/coordinator.rst:78
+msgid "``--coordinator-advertise-ip``"
+msgstr "``--coordinator-advertise-ip``"
+
+#: ../../source/mp/coordinator.rst:79
+msgid "``LMCACHE_COORDINATOR_ADVERTISE_IP``"
+msgstr "``LMCACHE_COORDINATOR_ADVERTISE_IP``"
+
+#: ../../source/mp/coordinator.rst:80
+msgid ""
+"IP the coordinator should reach this server at (defaults to the server's "
+"outbound IP)."
+msgstr "协调器应通过此服务器访问的 IP（默认为服务器的外部 IP）。"
+
+#: ../../source/mp/coordinator.rst:82
+msgid "``--coordinator-heartbeat-interval``"
+msgstr "``--coordinator-heartbeat-interval``"
+
+#: ../../source/mp/coordinator.rst:83
+msgid "``LMCACHE_COORDINATOR_HEARTBEAT_INTERVAL``"
+msgstr "``LMCACHE_COORDINATOR_HEARTBEAT_INTERVAL``"
+
+#: ../../source/mp/coordinator.rst:84
+msgid ""
+"Seconds between heartbeats (must be ``> 0``, default ``5``). Keep it well"
+" below the coordinator's ``INSTANCE_TIMEOUT``."
+msgstr "心跳之间的秒数（必须为 ``> 0``，默认值为 ``5``）。保持远低于协调器的 ``INSTANCE_TIMEOUT``。"
+
+#: ../../source/mp/coordinator.rst:87
+msgid ""
+"The server registers under its telemetry identity (``--service-instance-"
+"id`` / OTel ``service.instance.id``); if that is unset, the coordinator "
+"assigns an id."
+msgstr "服务器在其遥测身份下注册 (``--service-instance-id`` / OTel ``service.instance.id``)；如果未设置，则协调器会分配一个 ID。"
+
+#: ../../source/mp/coordinator.rst:90
+msgid ""
+"Registration is best-effort: if the coordinator is unreachable, the MP "
+"server logs a warning, keeps retrying, and continues serving. A malformed"
+" heartbeat-interval value is rejected at startup."
+msgstr "注册是尽力而为的：如果协调器无法访问，MP 服务器会记录警告，持续重试，并继续提供服务。启动时会拒绝格式错误的心跳间隔值。"
+
+#: ../../source/mp/coordinator.rst:95
+msgid "Inspecting the fleet"
+msgstr "检查集群"
+
+#: ../../source/mp/coordinator.rst:97
+msgid "Two read-only endpoints let you observe the coordinator:"
+msgstr "两个只读端点让您观察协调器："
+
+#: ../../source/mp/coordinator.rst:99
+msgid "``GET /instances`` -- list every registered MP server."
+msgstr "``GET /instances`` -- 列出每个注册的 MP 服务器。"
+
+#: ../../source/mp/coordinator.rst:100
+msgid "``GET /healthz`` -- coordinator liveness probe (for Kubernetes)."
+msgstr "``GET /healthz`` -- 协调器存活探测（用于 Kubernetes）。"
+
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/mp/deployment.po b/docs/source/locale/zh_CN/LC_MESSAGES/mp/deployment.po
index 01866d63ed..d125748d74 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/mp/deployment.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/mp/deployment.po
@@ -8,7 +8,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: LMCache \n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2026-06-01 10:55+0000\n"
+"POT-Creation-Date: 2026-06-08 10:45+0000\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language: zh_CN\n"
@@ -273,55 +273,86 @@ msgstr ""
 "``INFO``（默认）以减少日志量。"
 
 #: ../../source/mp/deployment.rst:223
-msgid "Non-GPU Transfer Mode (``--shm-name``)"
-msgstr "非 GPU 传输模式 (``--shm-name``)"
+msgid "Non-GPU Transfer Mode (``--supported-transfer-mode``, ``--shm-name``)"
+msgstr "非 GPU 传输模式 (``--supported-transfer-mode``, ``--shm-name``)"
 
 #: ../../source/mp/deployment.rst:225
 msgid ""
-"By default, LMCache creates a shared-memory (SHM) pool for non-GPU KV "
-"transfers between the server and vLLM workers.  The ``--shm-name`` option"
-" lets you control this behavior:"
-msgstr "默认情况下，LMCache 为服务器和 vLLM 工作节点之间的非 GPU KV 传输创建一个共享内存 (SHM) 池。 ``--shm-name`` 选项让您可以控制此行为："
+"LMCache supports two worker → server transfer paths: a **GPU** path (CUDA"
+" IPC, used for STORE/RETRIEVE) and a **non-GPU** path (PREPARE/COMMIT, "
+"used by CPU-only or non-CUDA accelerator workers). The server picks which"
+" paths to load via ``--supported-transfer-mode``:"
+msgstr "LMCache 支持两种工作进程 → 服务器传输路径：一种是 **GPU** 路径 (CUDA IPC，用于 STORE/RETRIEVE)，另一种是 **非 GPU** 路径 (PREPARE/COMMIT，用于仅 CPU 或非 CUDA 加速器工作进程)。服务器通过 ``--supported-transfer-mode`` 选择加载哪些路径："
+
+#: ../../source/mp/deployment.rst:230
+msgid ""
+"``auto`` *(default)* -- load both paths.  Workers of either device type "
+"can connect without manual configuration; the server has no upfront "
+"knowledge of the connecting worker's device."
+msgstr "``auto`` *(默认)* -- 加载两条路径。任一设备类型的工作节点都可以连接，无需手动配置；服务器对连接工作节点的设备没有事先的了解。"
 
 #: ../../source/mp/deployment.rst:233
+msgid ""
+"``gpu`` -- load only the GPU IPC path.  Use when every worker is a CUDA "
+"device and you want to skip allocating the non-GPU resources (SHM pool, "
+"pickle codec)."
+msgstr "``gpu`` -- 仅加载 GPU IPC 路径。当每个工作节点都是 CUDA 设备时使用，并且您希望跳过分配非 GPU 资源（SHM 池、pickle 编解码器）。"
+
+#: ../../source/mp/deployment.rst:236
+msgid ""
+"``non_gpu`` -- load only the non-GPU path.  Use when serving CPU-only or "
+"non-CUDA accelerator workers."
+msgstr "``non_gpu`` -- 仅加载非 GPU 路径。用于服务仅 CPU 或非 CUDA 加速器的工作节点。"
+
+#: ../../source/mp/deployment.rst:239
+msgid ""
+"When the non-GPU path is loaded (``auto`` or ``non_gpu``), LMCache by "
+"default creates a shared-memory (SHM) pool for non-GPU KV transfers "
+"between the server and vLLM workers.  The ``--shm-name`` option lets you "
+"control this behavior:"
+msgstr "当加载非 GPU 路径（``auto`` 或 ``non_gpu``）时，LMCache 默认会为服务器和 vLLM 工作节点之间的非 GPU KV 传输创建一个共享内存（SHM）池。``--shm-name`` 选项允许您控制此行为："
+
+#: ../../source/mp/deployment.rst:248
 msgid "Value"
 msgstr "值"
 
-#: ../../source/mp/deployment.rst:234
+#: ../../source/mp/deployment.rst:249
 msgid "Effect"
 msgstr "效果"
 
-#: ../../source/mp/deployment.rst:235
+#: ../../source/mp/deployment.rst:250
 msgid "*(not set)* (default)"
 msgstr "*(未设置)* (默认)"
 
-#: ../../source/mp/deployment.rst:236
+#: ../../source/mp/deployment.rst:251
 msgid "Auto-allocate a SHM pool (current default behavior)."
 msgstr "自动分配一个 SHM 池（当前默认行为）。"
 
-#: ../../source/mp/deployment.rst:237
+#: ../../source/mp/deployment.rst:252
 msgid "``\"\"`` (empty string)"
 msgstr "``\"\"`` (空字符串)"
 
-#: ../../source/mp/deployment.rst:238
+#: ../../source/mp/deployment.rst:253
 msgid ""
 "Disable the SHM pool entirely and fall back to the pickle-based transfer "
 "path.  Useful when ``/dev/shm`` is unavailable or when running without "
 "``--ipc host`` in Docker."
-msgstr "完全禁用 SHM 池并回退到基于 pickle 的传输路径。当 ``/dev/shm`` 不可用或在 Docker 中未使用 ``--ipc host`` 时非常有用。"
+msgstr ""
+"完全禁用 SHM 池并回退到基于 pickle 的传输路径。当 ``/dev/shm`` 不可用或在 Docker 中未使用 ``--ipc "
+"host`` 时非常有用。"
 
-#: ../../source/mp/deployment.rst:241
+#: ../../source/mp/deployment.rst:256
 msgid "``\"my_pool\"`` (any non-empty name)"
 msgstr "``\"my_pool\"``（任何非空名称）"
 
-#: ../../source/mp/deployment.rst:242
+#: ../../source/mp/deployment.rst:257
 msgid ""
 "Use that exact name for the SHM segment instead of the auto-generated "
 "one.  Handy when you need a deterministic, human-readable segment name "
 "for monitoring or debugging."
 msgstr "使用该确切名称作为 SHM 段，而不是自动生成的名称。当您需要一个确定性、可读性强的段名称以便于监控或调试时，这非常方便。"
 
-#: ../../source/mp/deployment.rst:246
+#: ../../source/mp/deployment.rst:261
 msgid "**Examples:**"
 msgstr "**示例：**"
 
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/mp/frontend_dashboard.po b/docs/source/locale/zh_CN/LC_MESSAGES/mp/frontend_dashboard.po
new file mode 100644
index 0000000000..9f0825cfb6
--- /dev/null
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/mp/frontend_dashboard.po
@@ -0,0 +1,354 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2024, The LMCache Team
+# This file is distributed under the same license as the LMCache package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2026.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: LMCache \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2026-06-08 10:45+0000\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: zh_CN\n"
+"Language-Team: zh_CN <LL@li.org>\n"
+"Plural-Forms: nplurals=1; plural=0;\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.18.0\n"
+
+#: ../../source/mp/frontend_dashboard.rst:2
+msgid "Frontend Dashboard"
+msgstr "前端仪表板"
+
+#: ../../source/mp/frontend_dashboard.rst:4
+msgid ""
+"The **LMCache Frontend Dashboard** is a lightweight web UI that lets you "
+"monitor and manage a fleet of LMCache multiprocess (MP) servers from a "
+"single browser tab.  It is shipped as part of the ``lmcache`` package and"
+" requires no extra infrastructure beyond a small discovery service."
+msgstr "**LMCache 前端仪表板**是一个轻量级的网页用户界面，允许您从一个浏览器标签页监控和管理一组 LMCache 多进程 (MP) 服务器。它作为 ``lmcache`` 包的一部分提供，不需要额外的基础设施，只需一个小型发现服务。"
+
+#: ../../source/mp/frontend_dashboard.rst:10
+msgid "Architecture Overview"
+msgstr "架构概述"
+
+#: ../../source/mp/frontend_dashboard.rst:12
+msgid "Architecture Diagram"
+msgstr "架构图"
+
+#: ../../source/mp/frontend_dashboard.rst:39
+msgid ""
+"Each LMCache MP server runs a **frontend plugin subprocess** that "
+"periodically sends a heartbeat to the discovery service.  The dashboard "
+"queries the discovery service to discover all live nodes and proxies "
+"their HTTP APIs through a built-in reverse proxy."
+msgstr "每个 LMCache MP 服务器运行一个 **前端插件子进程**，定期向发现服务发送心跳。仪表板查询发现服务以发现所有活动节点，并通过内置反向代理代理它们的 HTTP API。"
+
+#: ../../source/mp/frontend_dashboard.rst:45
+msgid "Components"
+msgstr "组件"
+
+#: ../../source/mp/frontend_dashboard.rst:51
+msgid "Component"
+msgstr "组件"
+
+#: ../../source/mp/frontend_dashboard.rst:52
+#: ../../source/mp/frontend_dashboard.rst:153
+#: ../../source/mp/frontend_dashboard.rst:193
+msgid "Description"
+msgstr "描述"
+
+#: ../../source/mp/frontend_dashboard.rst:53
+msgid "``lmcache.lmcache_frontend.app``"
+msgstr "``lmcache.lmcache_frontend.app``"
+
+#: ../../source/mp/frontend_dashboard.rst:54
+msgid ""
+"FastAPI application serving the web UI and a reverse proxy to every "
+"registered LMCache node.  Start with ``python -m "
+"lmcache.lmcache_frontend.app``."
+msgstr "FastAPI 应用程序提供 Web UI 和每个注册的 LMCache 节点的反向代理。使用 ``python -m lmcache.lmcache_frontend.app`` 启动。"
+
+#: ../../source/mp/frontend_dashboard.rst:57
+msgid "``lmcache_mp_frontend_plugin``"
+msgstr "``lmcache_mp_frontend_plugin``"
+
+#: ../../source/mp/frontend_dashboard.rst:58
+msgid ""
+"Runtime plugin subprocess launched by ``MPRuntimePluginLauncher``. Runs "
+"``HeartbeatService`` (``--no-http`` mode) to register the server with the"
+" discovery service."
+msgstr "由 ``MPRuntimePluginLauncher`` 启动的运行时插件子进程。运行 ``HeartbeatService``（``--no-http`` 模式）以将服务器注册到发现服务。"
+
+#: ../../source/mp/frontend_dashboard.rst:61
+msgid "``lmcache.tools.simple_discover_service``"
+msgstr "``lmcache.tools.simple_discover_service``"
+
+#: ../../source/mp/frontend_dashboard.rst:62
+msgid ""
+"Reference Flask discovery service.  Accepts heartbeats at "
+"``/lmcache_heartbeat`` and exposes the node list at ``/lmcache_infos``.  "
+"Start with ``python -m lmcache.tools.simple_discover_service``."
+msgstr "参考 Flask 发现服务。接受 ``/lmcache_heartbeat`` 的心跳并在 ``/lmcache_infos`` 处公开节点列表。使用 ``python -m lmcache.tools.simple_discover_service`` 启动。"
+
+#: ../../source/mp/frontend_dashboard.rst:68
+msgid "Prerequisites"
+msgstr "先决条件"
+
+#: ../../source/mp/frontend_dashboard.rst:70
+msgid "Install the extra dependencies used by the frontend and discovery service:"
+msgstr "安装前端和发现服务所需的额外依赖项："
+
+#: ../../source/mp/frontend_dashboard.rst:76
+msgid "These are not pulled in by the base ``lmcache`` install to keep it slim."
+msgstr "这些并不是通过基础的 ``lmcache`` 安装引入的，以保持其精简。"
+
+#: ../../source/mp/frontend_dashboard.rst:79
+msgid "Quick Start"
+msgstr "快速开始"
+
+#: ../../source/mp/frontend_dashboard.rst:81
+msgid "**Step 1 — Start the discovery service**"
+msgstr "**步骤 1 — 启动发现服务**"
+
+#: ../../source/mp/frontend_dashboard.rst:87
+msgid "The service listens on ``0.0.0.0:5000`` and exposes:"
+msgstr "该服务监听 ``0.0.0.0:5000`` 并暴露："
+
+#: ../../source/mp/frontend_dashboard.rst:89
+msgid "``GET /lmcache_heartbeat`` — record a heartbeat from an MP server."
+msgstr "``GET /lmcache_heartbeat`` — 记录来自 MP 服务器的心跳。"
+
+#: ../../source/mp/frontend_dashboard.rst:90
+msgid "``GET /lmcache_infos`` — return all registered nodes as JSON."
+msgstr "``GET /lmcache_infos`` — 以 JSON 格式返回所有注册的节点。"
+
+#: ../../source/mp/frontend_dashboard.rst:92
+msgid "**Step 2 — Start the LMCache MP server with the frontend plugin**"
+msgstr "**步骤 2 — 启动带有前端插件的 LMCache MP 服务器**"
+
+#: ../../source/mp/frontend_dashboard.rst:105
+msgid ""
+"The plugin subprocess will start sending heartbeats to the discovery "
+"service every 30 seconds (configurable via ``plugin.frontend.heartbeat-"
+"interval``)."
+msgstr "插件子进程将每 30 秒向发现服务发送一次心跳（可通过 ``plugin.frontend.heartbeat-interval`` 配置）。"
+
+#: ../../source/mp/frontend_dashboard.rst:109
+msgid "Alternatively, use the provided example script:"
+msgstr "或者，使用提供的示例脚本："
+
+#: ../../source/mp/frontend_dashboard.rst:115
+msgid "**Step 3 — Start the dashboard**"
+msgstr "**步骤 3 — 启动仪表板**"
+
+#: ../../source/mp/frontend_dashboard.rst:124
+msgid "Open ``http://localhost:8000`` in your browser."
+msgstr "在浏览器中打开 ``http://localhost:8000``。"
+
+#: ../../source/mp/frontend_dashboard.rst:127
+msgid ""
+"The dashboard auto-refreshes the node list from the supplier URL at most "
+"once every 30 seconds when the homepage is loaded."
+msgstr "仪表板在加载主页时，最多每 30 秒自动从供应商 URL 刷新一次节点列表。"
+
+#: ../../source/mp/frontend_dashboard.rst:131
+msgid "Dashboard Features"
+msgstr "仪表板功能"
+
+#: ../../source/mp/frontend_dashboard.rst:133
+msgid ""
+"**Node tree view** — shows all proxies and their child nodes in a "
+"collapsible tree."
+msgstr "**节点树视图** — 显示所有代理及其子节点，以可折叠的树形结构呈现。"
+
+#: ../../source/mp/frontend_dashboard.rst:135
+msgid ""
+"**Metrics aggregation** — ``GET /metrics`` on the dashboard aggregates "
+"Prometheus metrics from every leaf node."
+msgstr "**指标聚合** — ``GET /metrics`` 在仪表板上聚合来自每个叶节点的 Prometheus 指标。"
+
+#: ../../source/mp/frontend_dashboard.rst:137
+#, python-brace-format
+msgid ""
+"**Reverse proxy** — ``/proxy2/{node_name}/{path}`` forwards requests to "
+"the named node, enabling direct API access from the browser."
+msgstr "**反向代理** — ``/proxy2/{node_name}/{path}`` 将请求转发到指定节点，使浏览器能够直接访问 API。"
+
+#: ../../source/mp/frontend_dashboard.rst:139
+#, python-brace-format
+msgid ""
+"**Health endpoint** — ``GET /health`` returns ``{\"status\": "
+"\"healthy\"}``."
+msgstr "**健康端点** — ``GET /health`` 返回 ``{\"status\": \"healthy\"}``."
+
+#: ../../source/mp/frontend_dashboard.rst:142
+msgid "CLI Reference"
+msgstr "CLI 参考"
+
+#: ../../source/mp/frontend_dashboard.rst:145
+msgid "``python -m lmcache.lmcache_frontend.app``"
+msgstr "``python -m lmcache.lmcache_frontend.app``"
+
+#: ../../source/mp/frontend_dashboard.rst:151
+msgid "Flag"
+msgstr "标志"
+
+#: ../../source/mp/frontend_dashboard.rst:152
+msgid "Default"
+msgstr "默认"
+
+#: ../../source/mp/frontend_dashboard.rst:154
+msgid "``--host``"
+msgstr "``--host``"
+
+#: ../../source/mp/frontend_dashboard.rst:155
+msgid "``0.0.0.0``"
+msgstr "``0.0.0.0``"
+
+#: ../../source/mp/frontend_dashboard.rst:156
+msgid "Bind address for the dashboard HTTP server."
+msgstr "仪表板 HTTP 服务器的绑定地址。"
+
+#: ../../source/mp/frontend_dashboard.rst:157
+msgid "``--port``"
+msgstr "``--port``"
+
+#: ../../source/mp/frontend_dashboard.rst:158
+msgid "``8000``"
+msgstr "``8000``"
+
+#: ../../source/mp/frontend_dashboard.rst:159
+msgid "Port for the dashboard HTTP server."
+msgstr "仪表板 HTTP 服务器的端口。"
+
+#: ../../source/mp/frontend_dashboard.rst:160
+msgid "``--node-supplier-url``"
+msgstr "``--node-supplier-url``"
+
+#: ../../source/mp/frontend_dashboard.rst:161
+#: ../../source/mp/frontend_dashboard.rst:169
+#: ../../source/mp/frontend_dashboard.rst:173
+msgid "*(none)*"
+msgstr "*(无)*"
+
+#: ../../source/mp/frontend_dashboard.rst:162
+msgid ""
+"URL of the discovery service's node-list endpoint, e.g. "
+"``http://localhost:5000/lmcache_infos``."
+msgstr "发现服务的节点列表端点的 URL，例如 ``http://localhost:5000/lmcache_infos``。"
+
+#: ../../source/mp/frontend_dashboard.rst:164
+msgid "``--config``"
+msgstr "``--config``"
+
+#: ../../source/mp/frontend_dashboard.rst:165
+msgid "*(built-in)*"
+msgstr "*(内置)*"
+
+#: ../../source/mp/frontend_dashboard.rst:166
+msgid ""
+"Path to a JSON config file listing proxy nodes.  Used when ``--node-"
+"supplier-url`` is not set."
+msgstr "代理节点的 JSON 配置文件路径。当未设置 ``--node-supplier-url`` 时使用。"
+
+#: ../../source/mp/frontend_dashboard.rst:168
+msgid "``--nodes``"
+msgstr "``--nodes``"
+
+#: ../../source/mp/frontend_dashboard.rst:170
+#, python-brace-format
+msgid ""
+"Inline JSON array of node dicts, e.g. "
+"``'[{\"name\":\"n1\",\"host\":\"127.0.0.1\",\"port\":\"8085\"}]'``."
+msgstr "节点字典的内联 JSON 数组，例如 ``'[{\\\"name\\\":\\\"n1\\\",\\\"host\\\":\\\"127.0.0.1\\\",\\\"port\\\":\\\"8085\\\"}]'``。"
+
+#: ../../source/mp/frontend_dashboard.rst:172
+msgid "``--heartbeat-url``"
+msgstr "``--heartbeat-url``"
+
+#: ../../source/mp/frontend_dashboard.rst:174
+msgid "If set, the dashboard itself also sends heartbeats to this URL."
+msgstr "如果设置，仪表板本身也会向此 URL 发送心跳。"
+
+#: ../../source/mp/frontend_dashboard.rst:175
+msgid "``--log-level``"
+msgstr "``--log-level``"
+
+#: ../../source/mp/frontend_dashboard.rst:176
+msgid "``warning``"
+msgstr "``warning``"
+
+#: ../../source/mp/frontend_dashboard.rst:177
+msgid "Uvicorn log level (``debug``, ``info``, ``warning``, …)."
+msgstr "Uvicorn 日志级别 (``debug``, ``info``, ``warning``, …)。"
+
+#: ../../source/mp/frontend_dashboard.rst:178
+msgid "``--no-http``"
+msgstr "``--no-http``"
+
+#: ../../source/mp/frontend_dashboard.rst:179
+msgid "``false``"
+msgstr "``false``"
+
+#: ../../source/mp/frontend_dashboard.rst:180
+msgid ""
+"Disable the HTTP server; only the heartbeat loop runs.  Used internally "
+"by the MP plugin."
+msgstr "禁用 HTTP 服务器；仅运行心跳循环。由 MP 插件内部使用。"
+
+#: ../../source/mp/frontend_dashboard.rst:184
+msgid "Plugin Config Keys"
+msgstr "插件配置键"
+
+#: ../../source/mp/frontend_dashboard.rst:186
+msgid ""
+"Pass these inside ``--runtime-plugin-config`` when launching the MP "
+"server:"
+msgstr "在启动 MP 服务器时，将这些放在 ``--runtime-plugin-config`` 中："
+
+#: ../../source/mp/frontend_dashboard.rst:192
+msgid "Key"
+msgstr "键"
+
+#: ../../source/mp/frontend_dashboard.rst:194
+msgid "``plugin.frontend.heartbeat-url``"
+msgstr "``plugin.frontend.heartbeat-url``"
+
+#: ../../source/mp/frontend_dashboard.rst:195
+msgid "**(Required)** Heartbeat endpoint of the discovery service."
+msgstr "**（必需）** 发现服务的心跳端点。"
+
+#: ../../source/mp/frontend_dashboard.rst:196
+msgid "``plugin.frontend.heartbeat-interval``"
+msgstr "``plugin.frontend.heartbeat-interval``"
+
+#: ../../source/mp/frontend_dashboard.rst:197
+msgid "Heartbeat interval in seconds (default: ``30``)."
+msgstr "心跳间隔（单位：秒，默认值：``30``）。"
+
+#: ../../source/mp/frontend_dashboard.rst:198
+msgid "``plugin.frontend.heartbeat-initial-delay``"
+msgstr "``plugin.frontend.heartbeat-initial-delay``"
+
+#: ../../source/mp/frontend_dashboard.rst:199
+msgid "Seconds to wait before the first heartbeat (default: ``0``)."
+msgstr "在第一次心跳之前等待的秒数（默认值：``0``）。"
+
+#: ../../source/mp/frontend_dashboard.rst:202
+msgid "Using a Custom Discovery Service"
+msgstr "使用自定义发现服务"
+
+#: ../../source/mp/frontend_dashboard.rst:204
+msgid ""
+"The ``simple_discover_service`` is a reference implementation.  Any HTTP "
+"service that accepts the following GET request can be used:"
+msgstr "``simple_discover_service`` 是一个参考实现。任何接受以下 GET 请求的 HTTP 服务都可以使用："
+
+#: ../../source/mp/frontend_dashboard.rst:211
+msgid "And exposes a node-list endpoint that returns JSON in the shape:"
+msgstr "并暴露一个节点列表端点，该端点返回形状为 JSON 的数据："
+
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/mp/http_api.po b/docs/source/locale/zh_CN/LC_MESSAGES/mp/http_api.po
index 810df583d9..dd3b678db9 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/mp/http_api.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/mp/http_api.po
@@ -8,7 +8,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: LMCache \n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2026-05-18 17:25+0000\n"
+"POT-Creation-Date: 2026-06-08 10:45+0000\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language: zh_CN\n"
@@ -30,7 +30,10 @@ msgid ""
 "socket used by vLLM. This HTTP API is intended for operators, "
 "orchestrators (e.g. Kubernetes), and debugging tools — it is **not** on "
 "the inference data path."
-msgstr "当 MP 服务器通过 ``lmcache server`` 启动时（推荐的入口点），会暴露一个基于 FastAPI 的 HTTP 前端，以及 vLLM 使用的 ZMQ 套接字。此 HTTP API 旨在供操作员、编排者（例如 Kubernetes）和调试工具使用——它**不**在推理数据路径上。"
+msgstr ""
+"当 MP 服务器通过 ``lmcache server`` 启动时（推荐的入口点），会暴露一个基于 FastAPI 的 HTTP 前端，以及 "
+"vLLM 使用的 ZMQ 套接字。此 HTTP API 旨在供操作员、编排者（例如 "
+"Kubernetes）和调试工具使用——它**不**在推理数据路径上。"
 
 #: ../../source/mp/http_api.rst:10
 msgid ""
@@ -38,366 +41,509 @@ msgid ""
 "``lmcache/v1/multiprocess/http_apis/``: any module named ``*_api.py`` "
 "that exposes a module-level ``router`` (a :class:`fastapi.APIRouter`) is "
 "discovered at startup."
-msgstr "新的端点会自动从 ``lmcache/v1/multiprocess/http_apis/`` 注册：任何名为 ``*_api.py`` 的模块，只要暴露一个模块级的 ``router``（一个 :class:`fastapi.APIRouter`），在启动时都会被发现。"
+msgstr ""
+"新的端点会自动从 ``lmcache/v1/multiprocess/http_apis/`` 注册：任何名为 ``*_api.py`` "
+"的模块，只要暴露一个模块级的 ``router``（一个 :class:`fastapi.APIRouter`），在启动时都会被发现。"
 
 #: ../../source/mp/http_api.rst:15
 msgid ""
 "A subset of routes defined under "
 "``lmcache/v1/internal_api_server/common/`` is also exposed on this HTTP "
 "server. The module ``lmcache/v1/multiprocess/http_apis/common_api.py`` "
-"aggregates those routers (skipping modules listed in "
-"``_MP_INCOMPATIBLE_MODULES``, such as ``run_script_api``) and forwards "
-"them to the auto-discovery pipeline. Adding a new compatible module under"
-" ``internal_api_server/common`` therefore requires no wiring changes on "
+"aggregates those routers (skipping any module listed in "
+"``_MP_INCOMPATIBLE_MODULES``, which is currently empty) and forwards them"
+" to the auto-discovery pipeline. Adding a new compatible module under "
+"``internal_api_server/common`` therefore requires no wiring changes on "
 "the MP side."
-msgstr "在 ``lmcache/v1/internal_api_server/common/`` 下定义的部分路由也在此 HTTP 服务器上暴露。模块 ``lmcache/v1/multiprocess/http_apis/common_api.py`` 聚合了这些路由（跳过在 ``_MP_INCOMPATIBLE_MODULES`` 中列出的模块，例如 ``run_script_api``），并将它们转发到自动发现管道。因此，在 ``internal_api_server/common`` 下添加一个新的兼容模块不需要在 MP 端进行接线更改。"
+msgstr "在 ``lmcache/v1/internal_api_server/common/`` 下定义的部分路由也会在此 HTTP 服务器上暴露。模块 ``lmcache/v1/multiprocess/http_apis/common_api.py`` 聚合了这些路由（跳过任何列在 ``_MP_INCOMPATIBLE_MODULES`` 中的模块，目前该列表为空），并将它们转发到自动发现管道。因此，在 ``internal_api_server/common`` 下添加一个新的兼容模块不需要在 MP 端进行接线更改。"
 
-#: ../../source/mp/http_api.rst:29
+#: ../../source/mp/http_api.rst:30
 msgid "Server Configuration"
 msgstr "服务器配置"
 
-#: ../../source/mp/http_api.rst:35
+#: ../../source/mp/http_api.rst:36
 msgid "Argument"
 msgstr "参数"
 
-#: ../../source/mp/http_api.rst:36
+#: ../../source/mp/http_api.rst:37
 msgid "Default"
 msgstr "默认"
 
-#: ../../source/mp/http_api.rst:37
+#: ../../source/mp/http_api.rst:38 ../../source/mp/http_api.rst:331
 msgid "Description"
 msgstr "描述"
 
-#: ../../source/mp/http_api.rst:38
+#: ../../source/mp/http_api.rst:39
 msgid "``--http-host``"
 msgstr "``--http-host``"
 
-#: ../../source/mp/http_api.rst:39
+#: ../../source/mp/http_api.rst:40
 msgid "``0.0.0.0``"
 msgstr "``0.0.0.0``"
 
-#: ../../source/mp/http_api.rst:40
+#: ../../source/mp/http_api.rst:41
 msgid "Host to bind the HTTP server."
 msgstr "绑定 HTTP 服务器的主机。"
 
-#: ../../source/mp/http_api.rst:41
+#: ../../source/mp/http_api.rst:42
 msgid "``--http-port``"
 msgstr "``--http-port``"
 
-#: ../../source/mp/http_api.rst:42
+#: ../../source/mp/http_api.rst:43
 msgid "``8080``"
 msgstr "``8080``"
 
-#: ../../source/mp/http_api.rst:43
+#: ../../source/mp/http_api.rst:44
 msgid "Port to bind the HTTP server."
 msgstr "绑定 HTTP 服务器的端口。"
 
-#: ../../source/mp/http_api.rst:45
+#: ../../source/mp/http_api.rst:46
 msgid "Example:"
 msgstr "示例："
 
-#: ../../source/mp/http_api.rst:53
+#: ../../source/mp/http_api.rst:54
 msgid ""
 "All examples below assume the server is reachable at "
 "``http://localhost:8080``."
 msgstr "以下所有示例都假设服务器可以通过 ``http://localhost:8080`` 访问。"
 
-#: ../../source/mp/http_api.rst:57
+#: ../../source/mp/http_api.rst:58
 msgid "Endpoints"
 msgstr "端点"
 
-#: ../../source/mp/http_api.rst:59
+#: ../../source/mp/http_api.rst:60
 msgid ""
 "The table below groups the routes by purpose. The operational surface "
 "(health, status, cache control) is exposed at top-level paths. Routes "
 "inherited from the shared ``internal_api_server`` package are kept at "
 "their original paths for compatibility with the vLLM-embedded API server."
-msgstr "下表按目的对路由进行了分组。操作接口（健康检查、状态、缓存控制）在顶层路径中暴露。来自共享 ``internal_api_server`` 包的路由保持在其原始路径上，以便与 vLLM 嵌入的 API 服务器兼容。"
+msgstr ""
+"下表按目的对路由进行了分组。操作接口（健康检查、状态、缓存控制）在顶层路径中暴露。来自共享 ``internal_api_server`` "
+"包的路由保持在其原始路径上，以便与 vLLM 嵌入的 API 服务器兼容。"
 
-#: ../../source/mp/http_api.rst:69
+#: ../../source/mp/http_api.rst:70
 msgid "Method"
 msgstr "方法"
 
-#: ../../source/mp/http_api.rst:70
+#: ../../source/mp/http_api.rst:71
 msgid "Path"
 msgstr "路径"
 
-#: ../../source/mp/http_api.rst:71
+#: ../../source/mp/http_api.rst:72
 msgid "Purpose"
 msgstr "目的"
 
-#: ../../source/mp/http_api.rst:72 ../../source/mp/http_api.rst:75
-#: ../../source/mp/http_api.rst:78 ../../source/mp/http_api.rst:84
-#: ../../source/mp/http_api.rst:90 ../../source/mp/http_api.rst:97
-#: ../../source/mp/http_api.rst:101 ../../source/mp/http_api.rst:104
-#: ../../source/mp/http_api.rst:107 ../../source/mp/http_api.rst:110
-#: ../../source/mp/http_api.rst:113 ../../source/mp/http_api.rst:116
-#: ../../source/mp/http_api.rst:122 ../../source/mp/http_api.rst:125
-#: ../../source/mp/http_api.rst:128 ../../source/mp/http_api.rst:131
+#: ../../source/mp/http_api.rst:73 ../../source/mp/http_api.rst:76
+#: ../../source/mp/http_api.rst:79 ../../source/mp/http_api.rst:85
+#: ../../source/mp/http_api.rst:90 ../../source/mp/http_api.rst:96
+#: ../../source/mp/http_api.rst:103 ../../source/mp/http_api.rst:107
+#: ../../source/mp/http_api.rst:110 ../../source/mp/http_api.rst:113
+#: ../../source/mp/http_api.rst:116 ../../source/mp/http_api.rst:119
+#: ../../source/mp/http_api.rst:122 ../../source/mp/http_api.rst:128
+#: ../../source/mp/http_api.rst:131 ../../source/mp/http_api.rst:134
+#: ../../source/mp/http_api.rst:137
 msgid "GET"
 msgstr "获取"
 
-#: ../../source/mp/http_api.rst:73
+#: ../../source/mp/http_api.rst:74
 msgid "``/``"
 msgstr "``/``"
 
-#: ../../source/mp/http_api.rst:74
+#: ../../source/mp/http_api.rst:75
 msgid "Basic liveness ping."
 msgstr "基本存活探测。"
 
-#: ../../source/mp/http_api.rst:76
+#: ../../source/mp/http_api.rst:77
 msgid "``/healthcheck``"
 msgstr "``/healthcheck``"
 
-#: ../../source/mp/http_api.rst:77
+#: ../../source/mp/http_api.rst:78
 msgid "K8s liveness/readiness probe."
 msgstr "K8s 存活/就绪探针。"
 
-#: ../../source/mp/http_api.rst:79
+#: ../../source/mp/http_api.rst:80
 msgid "``/status``"
 msgstr "``/status``"
 
-#: ../../source/mp/http_api.rst:80
+#: ../../source/mp/http_api.rst:81
 msgid "Detailed engine status for inspection and debugging."
 msgstr "详细的引擎状态以供检查和调试。"
 
-#: ../../source/mp/http_api.rst:81 ../../source/mp/http_api.rst:119
+#: ../../source/mp/http_api.rst:82 ../../source/mp/http_api.rst:125
+#: ../../source/mp/http_api.rst:140
 msgid "POST"
 msgstr "POST"
 
-#: ../../source/mp/http_api.rst:82
+#: ../../source/mp/http_api.rst:83
 msgid "``/clear-cache``"
 msgstr "``/clear-cache``"
 
-#: ../../source/mp/http_api.rst:83
+#: ../../source/mp/http_api.rst:84
 msgid "Force-clear all KV data in L1 (CPU) memory."
 msgstr "强制清除 L1 (CPU) 内存中的所有 KV 数据。"
 
-#: ../../source/mp/http_api.rst:85
+#: ../../source/mp/http_api.rst:86
+msgid "``/kvcache/check``"
+msgstr "``/kvcache/check``"
+
+#: ../../source/mp/http_api.rst:87
+msgid ""
+"Compute MD5 checksums over the GPU KV cache for a set of block IDs. "
+"Intended for diagnostics and round-trip integrity checks from ``lmcache "
+"bench server``."
+msgstr "计算一组块 ID 的 GPU KV Cache 的 MD5 校验和。用于诊断和从 ``lmcache bench server`` 进行往返完整性检查。"
+
+#: ../../source/mp/http_api.rst:91
 msgid "``/quota``"
 msgstr "``/quota``"
 
-#: ../../source/mp/http_api.rst:86
+#: ../../source/mp/http_api.rst:92
 msgid "List every registered ``cache_salt`` quota with live usage."
 msgstr "列出每个注册的 ``cache_salt`` 配额及其实时使用情况。"
 
-#: ../../source/mp/http_api.rst:87
+#: ../../source/mp/http_api.rst:93
 msgid "PUT"
 msgstr "PUT"
 
-#: ../../source/mp/http_api.rst:88 ../../source/mp/http_api.rst:91
-#: ../../source/mp/http_api.rst:94
+#: ../../source/mp/http_api.rst:94 ../../source/mp/http_api.rst:97
+#: ../../source/mp/http_api.rst:100
 #, python-brace-format
 msgid "``/quota/{cache_salt}``"
 msgstr "``/quota/{cache_salt}``"
 
-#: ../../source/mp/http_api.rst:89
+#: ../../source/mp/http_api.rst:95
 msgid "Set or update the quota (in GB) for a ``cache_salt``."
 msgstr "设置或更新 ``cache_salt`` 的配额（以 GB 为单位）。"
 
-#: ../../source/mp/http_api.rst:92
+#: ../../source/mp/http_api.rst:98
 msgid "Read the quota and live usage for a single ``cache_salt``."
 msgstr "读取单个 ``cache_salt`` 的配额和实时使用情况。"
 
-#: ../../source/mp/http_api.rst:93
+#: ../../source/mp/http_api.rst:99
 msgid "DELETE"
 msgstr "删除"
 
-#: ../../source/mp/http_api.rst:95
+#: ../../source/mp/http_api.rst:101
 msgid "Remove a ``cache_salt``'s quota entry (its data is evicted next cycle)."
 msgstr "移除 ``cache_salt`` 的配额条目（其数据将在下一个周期被逐出）。"
 
-#: ../../source/mp/http_api.rst:98
+#: ../../source/mp/http_api.rst:104
 msgid "``/conf``"
 msgstr "``/conf``"
 
-#: ../../source/mp/http_api.rst:99
+#: ../../source/mp/http_api.rst:105
 msgid "Dump merged server configurations (mp, storage_manager, observability)."
 msgstr "转储合并的服务器配置（mp、存储管理器、可观察性）。"
 
-#: ../../source/mp/http_api.rst:102
+#: ../../source/mp/http_api.rst:108
 msgid "``/version``"
 msgstr "``/version``"
 
-#: ../../source/mp/http_api.rst:103
+#: ../../source/mp/http_api.rst:109
 msgid "Full version descriptor (package version + commit id)."
 msgstr "完整版本描述符（软件包版本 + 提交 ID）。"
 
-#: ../../source/mp/http_api.rst:105
+#: ../../source/mp/http_api.rst:111
 msgid "``/lmc_version``"
 msgstr "``/lmc_version``"
 
-#: ../../source/mp/http_api.rst:106
+#: ../../source/mp/http_api.rst:112
 msgid "LMCache package version string."
 msgstr "LMCache 包版本字符串。"
 
-#: ../../source/mp/http_api.rst:108
+#: ../../source/mp/http_api.rst:114
 msgid "``/commit_id``"
 msgstr "``/commit_id``"
 
-#: ../../source/mp/http_api.rst:109
+#: ../../source/mp/http_api.rst:115
 msgid "Current build commit id."
 msgstr "当前构建提交 ID。"
 
-#: ../../source/mp/http_api.rst:111
+#: ../../source/mp/http_api.rst:117
 msgid "``/env``"
 msgstr "``/env``"
 
-#: ../../source/mp/http_api.rst:112
+#: ../../source/mp/http_api.rst:118
 msgid "Dump process environment variables (JSON, plain text)."
 msgstr "转储进程环境变量（JSON，纯文本）。"
 
-#: ../../source/mp/http_api.rst:114
+#: ../../source/mp/http_api.rst:120
 msgid "``/loglevel``"
 msgstr "``/loglevel``"
 
-#: ../../source/mp/http_api.rst:115
+#: ../../source/mp/http_api.rst:121
 msgid "List or inspect logger levels; also accepts ``level`` to mutate."
 msgstr "列出或检查日志记录器级别；也接受 ``level`` 以进行修改。"
 
-#: ../../source/mp/http_api.rst:117
+#: ../../source/mp/http_api.rst:123
 msgid "``/metrics``"
 msgstr "``/metrics``"
 
-#: ../../source/mp/http_api.rst:118
+#: ../../source/mp/http_api.rst:124
 msgid "Prometheus exposition format."
 msgstr "Prometheus 展示格式。"
 
-#: ../../source/mp/http_api.rst:120
+#: ../../source/mp/http_api.rst:126
 msgid "``/metrics/reset``"
 msgstr "``/metrics/reset``"
 
-#: ../../source/mp/http_api.rst:121
+#: ../../source/mp/http_api.rst:127
 msgid "Reset all observability metrics to their initial state."
 msgstr "重置所有可观察性指标为其初始状态。"
 
-#: ../../source/mp/http_api.rst:123
+#: ../../source/mp/http_api.rst:129
 msgid "``/threads``"
 msgstr "``/threads``"
 
-#: ../../source/mp/http_api.rst:124
+#: ../../source/mp/http_api.rst:130
 msgid "Enumerate active Python threads and their stack traces."
 msgstr "列出活动的 Python 线程及其堆栈跟踪。"
 
-#: ../../source/mp/http_api.rst:126
+#: ../../source/mp/http_api.rst:132
 msgid "``/periodic-threads``"
 msgstr "``/periodic-threads``"
 
-#: ../../source/mp/http_api.rst:127
+#: ../../source/mp/http_api.rst:133
 msgid "List registered periodic threads with summary counts."
 msgstr "列出注册的周期性线程及其摘要计数。"
 
-#: ../../source/mp/http_api.rst:129
+#: ../../source/mp/http_api.rst:135
 #, python-brace-format
 msgid "``/periodic-threads/{thread_name}``"
 msgstr "``/periodic-threads/{thread_name}``"
 
-#: ../../source/mp/http_api.rst:130
+#: ../../source/mp/http_api.rst:136
 msgid "Detailed status for a single periodic thread."
 msgstr "单个周期线程的详细状态。"
 
-#: ../../source/mp/http_api.rst:132
+#: ../../source/mp/http_api.rst:138
 msgid "``/periodic-threads-health``"
 msgstr "``/periodic-threads-health``"
 
-#: ../../source/mp/http_api.rst:133
+#: ../../source/mp/http_api.rst:139
 msgid "Quick health check for critical/high-level periodic threads."
 msgstr "对关键/高层周期线程的快速健康检查。"
 
-#: ../../source/mp/http_api.rst:136
+#: ../../source/mp/http_api.rst:141
+msgid "``/run_script``"
+msgstr "``/run_script``"
+
+#: ../../source/mp/http_api.rst:142
+msgid ""
+"Execute an uploaded Python script in a restricted sandbox. Only modules "
+"listed in ``--script-allowed-imports`` can be imported."
+msgstr "在受限的沙箱中执行上传的 Python 脚本。只能导入在 ``--script-allowed-imports`` 中列出的模块。"
+
+#: ../../source/mp/http_api.rst:146
 msgid "``GET /``"
 msgstr "``GET /``"
 
-#: ../../source/mp/http_api.rst:138
+#: ../../source/mp/http_api.rst:148
 msgid ""
 "Basic liveness check. Returns a static payload indicating the HTTP server"
 " is running. Use ``/healthcheck`` instead for probes that also verify the"
 " cache engine is initialized."
 msgstr "基本的存活检查。返回一个静态负载，指示 HTTP 服务器正在运行。对于还验证缓存引擎是否初始化的探测，请使用 ``/healthcheck``。"
 
-#: ../../source/mp/http_api.rst:142 ../../source/mp/http_api.rst:165
-#: ../../source/mp/http_api.rst:214 ../../source/mp/http_api.rst:279
-#: ../../source/mp/http_api.rst:334 ../../source/mp/http_api.rst:357
-#: ../../source/mp/http_api.rst:380 ../../source/mp/http_api.rst:394
-#: ../../source/mp/http_api.rst:415 ../../source/mp/http_api.rst:454
-#: ../../source/mp/http_api.rst:560 ../../source/mp/http_api.rst:617
-#: ../../source/mp/http_api.rst:657
+#: ../../source/mp/http_api.rst:152 ../../source/mp/http_api.rst:175
+#: ../../source/mp/http_api.rst:224 ../../source/mp/http_api.rst:289
+#: ../../source/mp/http_api.rst:346 ../../source/mp/http_api.rst:414
+#: ../../source/mp/http_api.rst:437 ../../source/mp/http_api.rst:460
+#: ../../source/mp/http_api.rst:474 ../../source/mp/http_api.rst:495
+#: ../../source/mp/http_api.rst:534 ../../source/mp/http_api.rst:640
+#: ../../source/mp/http_api.rst:697 ../../source/mp/http_api.rst:737
 msgid "**Response** (``200 OK``):"
 msgstr "**响应** (``200 OK``):"
 
-#: ../../source/mp/http_api.rst:151 ../../source/mp/http_api.rst:182
-#: ../../source/mp/http_api.rst:260 ../../source/mp/http_api.rst:296
-#: ../../source/mp/http_api.rst:344 ../../source/mp/http_api.rst:442
-#: ../../source/mp/http_api.rst:460 ../../source/mp/http_api.rst:471
-#: ../../source/mp/http_api.rst:482 ../../source/mp/http_api.rst:500
-#: ../../source/mp/http_api.rst:547 ../../source/mp/http_api.rst:566
-#: ../../source/mp/http_api.rst:591 ../../source/mp/http_api.rst:633
-#: ../../source/mp/http_api.rst:644 ../../source/mp/http_api.rst:684
+#: ../../source/mp/http_api.rst:161 ../../source/mp/http_api.rst:192
+#: ../../source/mp/http_api.rst:270 ../../source/mp/http_api.rst:306
+#: ../../source/mp/http_api.rst:374 ../../source/mp/http_api.rst:424
+#: ../../source/mp/http_api.rst:522 ../../source/mp/http_api.rst:540
+#: ../../source/mp/http_api.rst:551 ../../source/mp/http_api.rst:562
+#: ../../source/mp/http_api.rst:580 ../../source/mp/http_api.rst:627
+#: ../../source/mp/http_api.rst:646 ../../source/mp/http_api.rst:671
+#: ../../source/mp/http_api.rst:713 ../../source/mp/http_api.rst:724
+#: ../../source/mp/http_api.rst:764
 msgid "**Example:**"
 msgstr "**示例:**"
 
-#: ../../source/mp/http_api.rst:158
+#: ../../source/mp/http_api.rst:168
 msgid "``GET /healthcheck``"
 msgstr "``GET /healthcheck``"
 
-#: ../../source/mp/http_api.rst:160
+#: ../../source/mp/http_api.rst:170
 msgid ""
 "Health check endpoint suitable for Kubernetes liveness and readiness "
 "probes. A ``200`` response implies the HTTP server is alive **and** the "
 "MP cache engine is initialized. A ``503`` response indicates the engine "
 "is not yet ready (still initializing, or failed to initialize)."
-msgstr "健康检查端点适用于 Kubernetes 的存活和就绪探针。``200`` 响应意味着 HTTP 服务器处于活动状态 **并且** MP 缓存引擎已初始化。``503`` 响应表示引擎尚未准备好（仍在初始化中，或初始化失败）。"
+msgstr ""
+"健康检查端点适用于 Kubernetes 的存活和就绪探针。``200`` 响应意味着 HTTP 服务器处于活动状态 **并且** MP "
+"缓存引擎已初始化。``503`` 响应表示引擎尚未准备好（仍在初始化中，或初始化失败）。"
 
-#: ../../source/mp/http_api.rst:173 ../../source/mp/http_api.rst:287
+#: ../../source/mp/http_api.rst:183 ../../source/mp/http_api.rst:297
 msgid "**Response** (``503 Service Unavailable``):"
 msgstr "**响应** (``503 服务不可用``):"
 
-#: ../../source/mp/http_api.rst:188
+#: ../../source/mp/http_api.rst:198
 msgid "**Kubernetes probe snippet:**"
 msgstr "**Kubernetes 探针代码片段:**"
 
-#: ../../source/mp/http_api.rst:206
+#: ../../source/mp/http_api.rst:216
 msgid "``GET /status``"
 msgstr "``GET /status``"
 
-#: ../../source/mp/http_api.rst:208
+#: ../../source/mp/http_api.rst:218
 msgid ""
 "Returns a detailed snapshot of the MP engine's internal state: L1 cache, "
 "L2 adapters, registered GPU contexts, active sessions, and in-flight "
 "prefetch jobs. Intended for operators and debugging, not for monitoring "
 "(use Prometheus metrics for time-series data — see :doc:`observability`)."
-msgstr "返回 MP 引擎内部状态的详细快照：L1 缓存、L2 适配器、注册的 GPU 上下文、活动会话和正在进行的预取任务。旨在供操作员和调试使用，而非监控（请使用 Prometheus 指标获取时间序列数据 — 参见 :doc:`observability`）。"
+msgstr ""
+"返回 MP 引擎内部状态的详细快照：L1 缓存、L2 适配器、注册的 GPU "
+"上下文、活动会话和正在进行的预取任务。旨在供操作员和调试使用，而非监控（请使用 Prometheus 指标获取时间序列数据 — 参见 "
+":doc:`observability`）。"
 
-#: ../../source/mp/http_api.rst:251
+#: ../../source/mp/http_api.rst:261
 msgid ""
 "**Response** (``503 Service Unavailable``) when the engine has not yet "
 "been initialized:"
 msgstr "**响应** (``503 服务不可用``) 当引擎尚未初始化时："
 
-#: ../../source/mp/http_api.rst:267
+#: ../../source/mp/http_api.rst:277
 msgid "``POST /clear-cache``"
 msgstr "``POST /clear-cache``"
 
-#: ../../source/mp/http_api.rst:269
+#: ../../source/mp/http_api.rst:279
 msgid "Force-clears **all** KV cache data currently held in L1 (CPU) memory."
 msgstr "强制清除当前保存在 L1 (CPU) 内存中的 **所有** KV Cache 数据。"
 
-#: ../../source/mp/http_api.rst:273
+#: ../../source/mp/http_api.rst:283
 msgid ""
 "This endpoint is destructive and bypasses read/write locks. In-flight "
 "store or prefetch operations may be corrupted. Use only when the server "
 "is idle, or when recovering from a known-bad cache state."
 msgstr "此端点是破坏性的，并绕过读/写锁。正在进行的存储或预取操作可能会被损坏。仅在服务器空闲时或从已知的坏缓存状态恢复时使用。"
 
-#: ../../source/mp/http_api.rst:277
+#: ../../source/mp/http_api.rst:287
 msgid "The request body is ignored."
 msgstr "请求体将被忽略。"
 
-#: ../../source/mp/http_api.rst:305
+#: ../../source/mp/http_api.rst:313
+msgid "``GET /kvcache/check``"
+msgstr "``GET /kvcache/check``"
+
+#: ../../source/mp/http_api.rst:315
+msgid ""
+"Compute MD5 checksums over the GPU KV cache, grouped ``chunk_size`` "
+"blocks per hashed chunk. MP mode addresses KV storage by block IDs "
+"natively (the same units used by ``STORE`` / ``RETRIEVE``), so the "
+"endpoint is fully block-centric: ``block_ids`` enumerates the target "
+"blocks and ``chunk_size`` counts blocks per chunk. Intended for "
+"diagnostics and round-trip integrity checks from ``lmcache bench server``"
+" — not for the inference data path."
+msgstr "计算 GPU KV Cache 的 MD5 校验和，每个哈希块分组为 ``chunk_size`` 块。MP 模式通过块 ID 原生地处理 KV 存储（与 ``STORE`` / ``RETRIEVE`` 使用的单位相同），因此该端点完全以块为中心：``block_ids`` 列举目标块，``chunk_size`` 计算每个块的数量。旨在用于诊断和 ``lmcache bench server`` 的往返完整性检查——而非推理数据路径。"
+
+#: ../../source/mp/http_api.rst:323
+msgid "**Query parameters:**"
+msgstr "**查询参数：**"
+
+#: ../../source/mp/http_api.rst:329
+msgid "Name"
+msgstr "名称"
+
+#: ../../source/mp/http_api.rst:330
+msgid "Required"
+msgstr "必需的"
+
+#: ../../source/mp/http_api.rst:332
+msgid "``block_ids``"
+msgstr "``block_ids``"
+
+#: ../../source/mp/http_api.rst:333 ../../source/mp/http_api.rst:336
+msgid "yes"
+msgstr "是"
+
+#: ../../source/mp/http_api.rst:334
+msgid "GPU block IDs in mixed format, e.g. ``\"0,[2,5],8\"``."
+msgstr "混合格式的 GPU 块 ID，例如 ``\"0,[2,5],8\"``。"
+
+#: ../../source/mp/http_api.rst:335
+msgid "``chunk_size``"
+msgstr "``chunk_size``"
+
+#: ../../source/mp/http_api.rst:337
+msgid "Positive integer — number of blocks per hashed chunk."
+msgstr "正整数 — 每个哈希块的块数。"
+
+#: ../../source/mp/http_api.rst:338
+msgid "``instance_id``"
+msgstr "``instance_id``"
+
+#: ../../source/mp/http_api.rst:339
+msgid "no (default ``0``)"
+msgstr "否（默认 ``0``）"
+
+#: ../../source/mp/http_api.rst:340
+msgid "Registered GPU context ID on the engine."
+msgstr "引擎上注册的 GPU 上下文 ID。"
+
+#: ../../source/mp/http_api.rst:341
+msgid "``layerwise``"
+msgstr "``layerwise``"
+
+#: ../../source/mp/http_api.rst:342
+msgid "no (default ``false``)"
+msgstr "不（默认 ``false``）"
+
+#: ../../source/mp/http_api.rst:343
+msgid ""
+"If ``true``, return per-layer checksums keyed by ``\"layer_<idx>\"``; "
+"otherwise a single aggregated digest per chunk over all layers."
+msgstr "如果为 ``true``，则返回按 ``\"layer_<idx>\"`` 键入的逐层校验和；否则返回每个块的所有层的单个聚合摘要。"
+
+#: ../../source/mp/http_api.rst:359
+msgid ""
+"When ``layerwise=true``, ``chunk_checksums`` is a dict keyed by "
+"``\"layer_<idx>\"`` whose values are per-layer lists."
+msgstr "当 ``layerwise=true`` 时，``chunk_checksums`` 是一个以 ``\"layer_<idx>\"`` 为键的字典，其值是逐层列表。"
+
+#: ../../source/mp/http_api.rst:362
+msgid "**HTTP status codes:**"
+msgstr "**HTTP 状态码:**"
+
+#: ../../source/mp/http_api.rst:364
+msgid "``200``: success."
+msgstr "``200``: 成功。"
+
+#: ../../source/mp/http_api.rst:365
+msgid ""
+"``400``: ``block_ids`` missing/malformed, or ``chunk_size`` missing or "
+"non-positive."
+msgstr "``400``: ``block_ids`` 缺失/格式错误，或 ``chunk_size`` 缺失或非正数。"
+
+#: ../../source/mp/http_api.rst:367
+msgid ""
+"``404``: ``instance_id`` not registered, or the registered KV tensors are"
+" empty."
+msgstr "``404``: ``instance_id`` 未注册，或者注册的 KV 张量为空。"
+
+#: ../../source/mp/http_api.rst:369
+msgid ""
+"``501``: engine has no ``gpu_contexts``, or the GPU KV format is not "
+"supported by this endpoint (page-buffer-fused and cross-layer layouts are"
+" declined until a real need appears)."
+msgstr "``501``: 引擎没有 ``gpu_contexts``，或者该端点不支持 GPU KV 格式（页面缓冲融合和跨层布局在真正需要之前被拒绝）。"
+
+#: ../../source/mp/http_api.rst:372
+msgid "``503``: engine not yet initialized on ``app.state``."
+msgstr "``503``: 引擎尚未在 ``app.state`` 上初始化。"
+
+#: ../../source/mp/http_api.rst:385
 msgid "``/quota`` — per-``cache_salt`` quota management"
 msgstr "``/quota`` — 每个``cache_salt``的配额管理"
 
-#: ../../source/mp/http_api.rst:307
+#: ../../source/mp/http_api.rst:387
 msgid ""
 "These endpoints manage the per-``cache_salt`` storage budgets consumed by"
 " the ``IsolatedLRU`` eviction policy (selected via ``--eviction-policy "
@@ -406,16 +552,22 @@ msgid ""
 "cycle (~1 s). A ``cache_salt`` with no registered quota has an effective "
 "limit of ``0`` bytes, so its data is cleared next cycle (allowlist "
 "semantics)."
-msgstr "这些端点管理由 ``IsolatedLRU`` 逐出策略（通过 ``--eviction-policy IsolatedLRU`` 选择）消耗的每个 ``cache_salt`` 存储预算。配额是 **软性** 的：设置限制并不会拒绝写入 — 任何超出预算的 ``cache_salt`` 会在下一个逐出周期（约 1 秒）被逐出。没有注册配额的 ``cache_salt`` 有一个有效限制为 ``0`` 字节，因此其数据将在下一个周期被清除（白名单语义）。"
+msgstr ""
+"这些端点管理由 ``IsolatedLRU`` 逐出策略（通过 ``--eviction-policy IsolatedLRU`` "
+"选择）消耗的每个 ``cache_salt`` 存储预算。配额是 **软性** 的：设置限制并不会拒绝写入 — 任何超出预算的 "
+"``cache_salt`` 会在下一个逐出周期（约 1 秒）被逐出。没有注册配额的 ``cache_salt`` 有一个有效限制为 ``0`` "
+"字节，因此其数据将在下一个周期被清除（白名单语义）。"
 
-#: ../../source/mp/http_api.rst:315
+#: ../../source/mp/http_api.rst:395
 msgid ""
 "These endpoints are no-ops on engines that did not start with "
 "``--eviction-policy IsolatedLRU``: the ``QuotaManager`` is still present,"
 " but the LRU policy ignores the registered quotas."
-msgstr "这些端点在未使用 ``--eviction-policy IsolatedLRU`` 启动的引擎上是无操作的：``QuotaManager`` 仍然存在，但 LRU 策略会忽略注册的配额。"
+msgstr ""
+"这些端点在未使用 ``--eviction-policy IsolatedLRU`` 启动的引擎上是无操作的：``QuotaManager`` "
+"仍然存在，但 LRU 策略会忽略注册的配额。"
 
-#: ../../source/mp/http_api.rst:319
+#: ../../source/mp/http_api.rst:399
 msgid ""
 "**URL escaping for the empty salt.** ``cache_salt=\"\"`` (un-salted / "
 "anonymous traffic) cannot appear in a URL path parameter, so the API "
@@ -424,79 +576,92 @@ msgid ""
 "data with ``cache_salt=\"_default\"`` cannot be managed via this HTTP API"
 " distinctly from anonymous traffic — both map to the same path parameter;"
 " pick any other value (e.g. ``\"default\"``) to disambiguate."
-msgstr "**空盐的 URL 转义。** ``cache_salt=\\\"\\\"``（无盐 / 匿名流量）不能出现在 URL 路径参数中，因此 API 接受哨兵 ``_default`` 作为替代。``PUT /quota/_default`` 设置 ``cache_salt=\\\"\\\"`` 的配额。合法存储数据的用户使用 ``cache_salt=\\\"_default\\\"``，无法通过此 HTTP API 与匿名流量区分管理——两者映射到相同的路径参数；选择任何其他值（例如 ``\\\"default\\\"``）以消除歧义。"
+msgstr ""
+"**空盐的 URL 转义。** ``cache_salt=\\\"\\\"``（无盐 / 匿名流量）不能出现在 URL 路径参数中，因此 API "
+"接受哨兵 ``_default`` 作为替代。``PUT /quota/_default`` 设置 ``cache_salt=\\\"\\\"``"
+" 的配额。合法存储数据的用户使用 ``cache_salt=\\\"_default\\\"``，无法通过此 HTTP API "
+"与匿名流量区分管理——两者映射到相同的路径参数；选择任何其他值（例如 ``\\\"default\\\"``）以消除歧义。"
 
-#: ../../source/mp/http_api.rst:328
+#: ../../source/mp/http_api.rst:408
 #, python-brace-format
 msgid "``PUT /quota/{cache_salt}``"
 msgstr "``PUT /quota/{cache_salt}``"
 
-#: ../../source/mp/http_api.rst:330
+#: ../../source/mp/http_api.rst:410
 msgid "Create or update a quota."
 msgstr "创建或更新配额。"
 
-#: ../../source/mp/http_api.rst:332
+#: ../../source/mp/http_api.rst:412
 #, python-brace-format
 msgid "**Body:** ``{\"limit_gb\": <float>}`` (required, finite, non-negative)."
 msgstr "**主体:** ``{\\\"limit_gb\\\": <float>}`` （必需，有限，非负）。"
 
-#: ../../source/mp/http_api.rst:340
+#: ../../source/mp/http_api.rst:420
 msgid ""
 "**Errors:** ``400`` for malformed JSON, missing ``limit_gb``, non-numeric"
 " ``limit_gb``, ``nan`` / ``inf``, or negative values; ``503`` if the "
 "engine is not initialized."
-msgstr "**错误:** ``400`` 表示 JSON 格式错误、缺少 ``limit_gb``、``limit_gb`` 不是数字、``nan`` / ``inf`` 或负值；``503`` 表示引擎未初始化。"
+msgstr ""
+"**错误:** ``400`` 表示 JSON 格式错误、缺少 ``limit_gb``、``limit_gb`` 不是数字、``nan`` / "
+"``inf`` 或负值；``503`` 表示引擎未初始化。"
 
-#: ../../source/mp/http_api.rst:353
+#: ../../source/mp/http_api.rst:433
 #, python-brace-format
 msgid "``GET /quota/{cache_salt}``"
 msgstr "``GET /quota/{cache_salt}``"
 
-#: ../../source/mp/http_api.rst:355
+#: ../../source/mp/http_api.rst:435
 msgid "Read the current quota and live usage for one ``cache_salt``."
 msgstr "读取当前配额和一个 ``cache_salt`` 的实时使用情况。"
 
-#: ../../source/mp/http_api.rst:368
+#: ../../source/mp/http_api.rst:448
 msgid ""
 "``exists`` is ``false`` when no quota was ever registered for this "
 "``cache_salt`` (``limit_gb`` is then ``0.0`` and ``current_usage_gb`` "
 "reflects whatever bytes are currently cached for that salt — those bytes "
 "will evict next cycle under ``IsolatedLRU``)."
-msgstr "``exists`` 为 ``false`` 当该 ``cache_salt`` 从未注册过配额时（``limit_gb`` 此时为 ``0.0``，而 ``current_usage_gb`` 反映当前为该盐缓存的字节数——这些字节将在下一个周期根据 ``IsolatedLRU`` 被逐出）。"
+msgstr ""
+"``exists`` 为 ``false`` 当该 ``cache_salt`` 从未注册过配额时（``limit_gb`` 此时为 "
+"``0.0``，而 ``current_usage_gb`` 反映当前为该盐缓存的字节数——这些字节将在下一个周期根据 "
+"``IsolatedLRU`` 被逐出）。"
 
-#: ../../source/mp/http_api.rst:374
+#: ../../source/mp/http_api.rst:454
 #, python-brace-format
 msgid "``DELETE /quota/{cache_salt}``"
 msgstr "``DELETE /quota/{cache_salt}``"
 
-#: ../../source/mp/http_api.rst:376
+#: ../../source/mp/http_api.rst:456
 msgid ""
 "Remove a ``cache_salt``'s quota entry. Any bytes still cached under this "
 "``cache_salt`` become over-budget on the next eviction cycle (effective "
 "limit drops to ``0``) and will be evicted."
-msgstr "删除 ``cache_salt`` 的配额条目。任何仍然缓存于此 ``cache_salt`` 下的字节将在下一个逐出周期中超出预算（有效限制降至 ``0``），并将被逐出。"
+msgstr ""
+"删除 ``cache_salt`` 的配额条目。任何仍然缓存于此 ``cache_salt`` 下的字节将在下一个逐出周期中超出预算（有效限制降至"
+" ``0``），并将被逐出。"
 
-#: ../../source/mp/http_api.rst:386
+#: ../../source/mp/http_api.rst:466
 #, python-brace-format
 msgid ""
 "When no quota was registered for the given ``cache_salt``, the response "
 "is ``{\"cache_salt\": \"...\", \"status\": \"not_found\"}`` (still ``200 "
 "OK``)."
-msgstr "当给定的 ``cache_salt`` 没有注册配额时，响应为 ``{\\\"cache_salt\\\": \\\"...\\\", \\\"status\\\": \\\"not_found\\\"}``（仍然是 ``200 OK``）。"
+msgstr ""
+"当给定的 ``cache_salt`` 没有注册配额时，响应为 ``{\\\"cache_salt\\\": \\\"...\\\", "
+"\\\"status\\\": \\\"not_found\\\"}``（仍然是 ``200 OK``）。"
 
-#: ../../source/mp/http_api.rst:390
+#: ../../source/mp/http_api.rst:470
 msgid "``GET /quota``"
 msgstr "``GET /quota``"
 
-#: ../../source/mp/http_api.rst:392
+#: ../../source/mp/http_api.rst:472
 msgid "List every registered quota alongside its live usage."
 msgstr "列出每个注册的配额及其实时使用情况。"
 
-#: ../../source/mp/http_api.rst:406
+#: ../../source/mp/http_api.rst:486
 msgid "``GET /conf``"
 msgstr "``GET /conf``"
 
-#: ../../source/mp/http_api.rst:408
+#: ../../source/mp/http_api.rst:488
 msgid ""
 "Returns every server-side configuration object registered on "
 "``app.state.configs`` (typically ``mp``, ``storage_manager`` and "
@@ -504,266 +669,284 @@ msgid ""
 "serialized via ``safe_asdict``; other values go through "
 "``make_json_safe``. Useful for confirming what the process actually "
 "loaded — including environment overrides — without restarting."
-msgstr "返回在 ``app.state.configs`` 上注册的每个服务器端配置对象（通常是 ``mp``、``storage_manager`` 和 ``observability``），作为一个单一的缩进 JSON 文档。数据类通过 ``safe_asdict`` 序列化；其他值通过 ``make_json_safe`` 处理。对于确认进程实际加载的内容（包括环境覆盖）非常有用，而无需重启。"
+msgstr ""
+"返回在 ``app.state.configs`` 上注册的每个服务器端配置对象（通常是 ``mp``、``storage_manager`` 和"
+" ``observability``），作为一个单一的缩进 JSON 文档。数据类通过 ``safe_asdict`` 序列化；其他值通过 "
+"``make_json_safe`` 处理。对于确认进程实际加载的内容（包括环境覆盖）非常有用，而无需重启。"
 
-#: ../../source/mp/http_api.rst:433
+#: ../../source/mp/http_api.rst:513
 msgid ""
 "**Response** (``503 Service Unavailable``) when configs are not wired "
 "onto ``app.state`` yet:"
 msgstr "**响应**（``503 服务不可用``），当配置尚未连接到 ``app.state`` 时："
 
-#: ../../source/mp/http_api.rst:449
+#: ../../source/mp/http_api.rst:529
 msgid "``GET /version``"
 msgstr "``GET /version``"
 
-#: ../../source/mp/http_api.rst:451
+#: ../../source/mp/http_api.rst:531
 msgid ""
 "Returns the full version descriptor (package version combined with the "
 "current commit id), formatted by ``lmcache.utils.get_version()``."
 msgstr "返回完整的版本描述符（软件包版本与当前提交 ID 的组合），格式由 ``lmcache.utils.get_version()`` 生成。"
 
-#: ../../source/mp/http_api.rst:467
+#: ../../source/mp/http_api.rst:547
 msgid "``GET /lmc_version``"
 msgstr "``GET /lmc_version``"
 
-#: ../../source/mp/http_api.rst:469
+#: ../../source/mp/http_api.rst:549
 msgid ""
 "Returns the raw LMCache package version string "
 "(``lmcache.utils.VERSION``)."
 msgstr "返回原始的 LMCache 包版本字符串 (``lmcache.utils.VERSION``)。"
 
-#: ../../source/mp/http_api.rst:478
+#: ../../source/mp/http_api.rst:558
 msgid "``GET /commit_id``"
 msgstr "``GET /commit_id``"
 
-#: ../../source/mp/http_api.rst:480
+#: ../../source/mp/http_api.rst:560
 msgid ""
 "Returns the git commit id baked into the build "
 "(``lmcache.utils.COMMIT_ID``)."
 msgstr "返回构建中嵌入的 git 提交 id (``lmcache.utils.COMMIT_ID``)。"
 
-#: ../../source/mp/http_api.rst:489
+#: ../../source/mp/http_api.rst:569
 msgid "``GET /env``"
 msgstr "``GET /env``"
 
-#: ../../source/mp/http_api.rst:491
+#: ../../source/mp/http_api.rst:571
 msgid ""
 "Dumps the process environment variables as a sorted, pretty-printed JSON "
 "document. Response ``Content-Type`` is ``text/plain`` so it can be piped "
 "directly to a terminal."
-msgstr "将进程环境变量转储为排序的、格式良好的 JSON 文档。响应的 ``Content-Type`` 为 ``text/plain``，因此可以直接通过管道传输到终端。"
+msgstr ""
+"将进程环境变量转储为排序的、格式良好的 JSON 文档。响应的 ``Content-Type`` 为 "
+"``text/plain``，因此可以直接通过管道传输到终端。"
 
-#: ../../source/mp/http_api.rst:497
+#: ../../source/mp/http_api.rst:577
 msgid ""
 "The payload may contain secrets injected via environment variables. "
 "Restrict network access to this endpoint in production."
 msgstr "有效负载可能包含通过环境变量注入的机密。在生产环境中限制对该端点的网络访问。"
 
-#: ../../source/mp/http_api.rst:507
+#: ../../source/mp/http_api.rst:587
 msgid "``GET /loglevel``"
 msgstr "``GET /loglevel``"
 
-#: ../../source/mp/http_api.rst:509
+#: ../../source/mp/http_api.rst:589
 msgid ""
 "Inspect or mutate Python logger levels at runtime. All responses are "
 "``text/plain``. The endpoint has three modes driven by query parameters:"
 msgstr "在运行时检查或修改 Python 日志记录器级别。所有响应都是 ``text/plain``。该端点有三种模式，由查询参数驱动："
 
-#: ../../source/mp/http_api.rst:516 ../../source/mp/http_api.rst:583
-#: ../../source/mp/http_api.rst:608
+#: ../../source/mp/http_api.rst:596 ../../source/mp/http_api.rst:663
+#: ../../source/mp/http_api.rst:688
 msgid "Query"
 msgstr "查询"
 
-#: ../../source/mp/http_api.rst:517 ../../source/mp/http_api.rst:584
-#: ../../source/mp/http_api.rst:609
+#: ../../source/mp/http_api.rst:597 ../../source/mp/http_api.rst:664
+#: ../../source/mp/http_api.rst:689
 msgid "Behavior"
 msgstr "行为"
 
-#: ../../source/mp/http_api.rst:518
+#: ../../source/mp/http_api.rst:598
 msgid "(no params)"
 msgstr "(无参数)"
 
-#: ../../source/mp/http_api.rst:519
+#: ../../source/mp/http_api.rst:599
 msgid "List every logger registered with :mod:`logging` and its level."
 msgstr "列出所有在 :mod:`logging` 中注册的记录器及其级别。"
 
-#: ../../source/mp/http_api.rst:520
+#: ../../source/mp/http_api.rst:600
 msgid "``?logger_name=<name>``"
 msgstr "``?logger_name=<name>``"
 
-#: ../../source/mp/http_api.rst:521
+#: ../../source/mp/http_api.rst:601
 msgid "Return the effective level of the named logger."
 msgstr "返回指定记录器的有效级别。"
 
-#: ../../source/mp/http_api.rst:522
+#: ../../source/mp/http_api.rst:602
 msgid "``?logger_name=<name>&level=<LEVEL>``"
 msgstr "``?logger_name=<name>&level=<LEVEL>``"
 
-#: ../../source/mp/http_api.rst:523
+#: ../../source/mp/http_api.rst:603
 msgid ""
 "Set the named logger (and its handlers) to ``LEVEL`` "
 "(``DEBUG``/``INFO``/``WARNING``/``ERROR``/``CRITICAL``). Returns ``400`` "
 "on an unknown level."
-msgstr "将命名的日志记录器（及其处理程序）设置为 ``LEVEL``（``DEBUG``/``INFO``/``WARNING``/``ERROR``/``CRITICAL``）。在未知级别时返回 ``400``。"
+msgstr ""
+"将命名的日志记录器（及其处理程序）设置为 "
+"``LEVEL``（``DEBUG``/``INFO``/``WARNING``/``ERROR``/``CRITICAL``）。在未知级别时返回"
+" ``400``。"
 
-#: ../../source/mp/http_api.rst:527
+#: ../../source/mp/http_api.rst:607
 msgid "**Examples:**"
 msgstr "**示例：**"
 
-#: ../../source/mp/http_api.rst:541
+#: ../../source/mp/http_api.rst:621
 msgid "``GET /metrics``"
 msgstr "``GET /metrics``"
 
-#: ../../source/mp/http_api.rst:543
+#: ../../source/mp/http_api.rst:623
 msgid ""
 "Prometheus exposition format for every metric registered on the default "
 "``prometheus_client`` registry. Scrape this directly from Prometheus. See"
 " :doc:`observability` for the list of exported metrics."
-msgstr "默认 ``prometheus_client`` 注册表中注册的每个指标的 Prometheus 展示格式。直接从 Prometheus 抓取此内容。有关导出指标的列表，请参见 :doc:`observability`。"
+msgstr ""
+"默认 ``prometheus_client`` 注册表中注册的每个指标的 Prometheus 展示格式。直接从 Prometheus "
+"抓取此内容。有关导出指标的列表，请参见 :doc:`observability`。"
 
-#: ../../source/mp/http_api.rst:554
+#: ../../source/mp/http_api.rst:634
 msgid "``POST /metrics/reset``"
 msgstr "``POST /metrics/reset``"
 
-#: ../../source/mp/http_api.rst:556
+#: ../../source/mp/http_api.rst:636
 msgid ""
 "Resets all LMCache observability metrics to their initial state "
 "(``reset_observability_metrics``). Intended for test harnesses and "
 "benchmarks — not for production."
-msgstr "重置所有 LMCache 可观察性指标到其初始状态（``reset_observability_metrics``）。旨在用于测试工具和基准测试——不适用于生产环境。"
+msgstr ""
+"重置所有 LMCache "
+"可观察性指标到其初始状态（``reset_observability_metrics``）。旨在用于测试工具和基准测试——不适用于生产环境。"
 
-#: ../../source/mp/http_api.rst:573
+#: ../../source/mp/http_api.rst:653
 msgid "``GET /threads``"
 msgstr "``GET /threads``"
 
-#: ../../source/mp/http_api.rst:575
+#: ../../source/mp/http_api.rst:655
 msgid ""
 "Enumerate active Python threads in the server process along with their "
 "stack traces, plus a total-count summary. Useful for live debugging of "
 "hangs or runaway workers."
 msgstr "列出服务器进程中活动的 Python 线程及其堆栈跟踪，并提供总计摘要。这对于实时调试挂起或失控的工作线程非常有用。"
 
-#: ../../source/mp/http_api.rst:585
+#: ../../source/mp/http_api.rst:665
 msgid "``?name=<substr>``"
 msgstr "``?name=<substr>``"
 
-#: ../../source/mp/http_api.rst:586
+#: ../../source/mp/http_api.rst:666
 msgid "Keep only threads whose name contains ``<substr>`` (case-insensitive)."
 msgstr "仅保留名称包含 ``<substr>`` 的线程（不区分大小写）。"
 
-#: ../../source/mp/http_api.rst:588
+#: ../../source/mp/http_api.rst:668
 msgid "``?thread_id=<int>``"
 msgstr "``?thread_id=<int>``"
 
-#: ../../source/mp/http_api.rst:589
+#: ../../source/mp/http_api.rst:669
 msgid "Keep only the thread with the matching ``ident``."
 msgstr "只保留与匹配的 ``ident`` 相关的线程。"
 
-#: ../../source/mp/http_api.rst:598
+#: ../../source/mp/http_api.rst:678
 msgid "``GET /periodic-threads``"
 msgstr "``GET /periodic-threads``"
 
-#: ../../source/mp/http_api.rst:600
+#: ../../source/mp/http_api.rst:680
 msgid ""
 "Returns a JSON snapshot of the "
 ":class:`~lmcache.v1.periodic_thread.PeriodicThreadRegistry`: counts by "
 "level plus per-thread status (last run timestamp, latest summary, etc.)."
-msgstr "返回 :class:`~lmcache.v1.periodic_thread.PeriodicThreadRegistry` 的 JSON 快照：按级别统计以及每个线程的状态（上次运行时间戳、最新摘要等）。"
+msgstr ""
+"返回 :class:`~lmcache.v1.periodic_thread.PeriodicThreadRegistry` 的 JSON "
+"快照：按级别统计以及每个线程的状态（上次运行时间戳、最新摘要等）。"
 
-#: ../../source/mp/http_api.rst:610
+#: ../../source/mp/http_api.rst:690
 msgid "``?level=critical|high|medium|low``"
 msgstr "``?level=critical|high|medium|low``"
 
-#: ../../source/mp/http_api.rst:611
+#: ../../source/mp/http_api.rst:691
 msgid "Only include threads at the given level. ``400`` on unknown."
 msgstr "仅包含给定级别的线程。对未知情况返回 ``400``。"
 
-#: ../../source/mp/http_api.rst:612
+#: ../../source/mp/http_api.rst:692
 msgid "``?running_only=true``"
 msgstr "``?running_only=true``"
 
-#: ../../source/mp/http_api.rst:613
+#: ../../source/mp/http_api.rst:693
 msgid "Only include threads currently running."
 msgstr "仅包含当前正在运行的线程。"
 
-#: ../../source/mp/http_api.rst:614
+#: ../../source/mp/http_api.rst:694
 msgid "``?active_only=true``"
 msgstr "``?active_only=true``"
 
-#: ../../source/mp/http_api.rst:615
+#: ../../source/mp/http_api.rst:695
 msgid "Only include threads considered active (recent tick)."
 msgstr "仅包括被视为活动的线程（最近的滴答）。"
 
-#: ../../source/mp/http_api.rst:640
+#: ../../source/mp/http_api.rst:720
 #, python-brace-format
 msgid "``GET /periodic-threads/{thread_name}``"
 msgstr "``GET /periodic-threads/{thread_name}``"
 
-#: ../../source/mp/http_api.rst:642
+#: ../../source/mp/http_api.rst:722
 msgid "Detailed status for a single periodic thread (``404`` if not found)."
 msgstr "单个周期性线程的详细状态（如果未找到则返回 ``404``）。"
 
-#: ../../source/mp/http_api.rst:651
+#: ../../source/mp/http_api.rst:731
 msgid "``GET /periodic-threads-health``"
 msgstr "``GET /periodic-threads-health``"
 
-#: ../../source/mp/http_api.rst:653
+#: ../../source/mp/http_api.rst:733
 msgid ""
 "Fast health check covering only ``critical`` and ``high`` level periodic "
 "threads. A thread is flagged unhealthy when it is marked running but has "
 "not ticked within its expected interval."
-msgstr "快速健康检查仅覆盖 ``critical`` 和 ``high`` 级别的周期性线程。当线程被标记为正在运行但在预期间隔内未进行滴答时，它会被标记为不健康。"
+msgstr ""
+"快速健康检查仅覆盖 ``critical`` 和 ``high`` "
+"级别的周期性线程。当线程被标记为正在运行但在预期间隔内未进行滴答时，它会被标记为不健康。"
 
-#: ../../source/mp/http_api.rst:667
+#: ../../source/mp/http_api.rst:747
 msgid "When something is lagging:"
 msgstr "当某些东西滞后时："
 
-#: ../../source/mp/http_api.rst:691
+#: ../../source/mp/http_api.rst:771
 msgid "Adding New Endpoints"
 msgstr "添加新端点"
 
-#: ../../source/mp/http_api.rst:693
+#: ../../source/mp/http_api.rst:773
 msgid ""
 "Endpoints are auto-discovered from "
 "``lmcache/v1/multiprocess/http_apis/``. To add a new endpoint:"
 msgstr "端点会从 ``lmcache/v1/multiprocess/http_apis/`` 自动发现。要添加一个新端点："
 
-#: ../../source/mp/http_api.rst:696
+#: ../../source/mp/http_api.rst:776
 msgid "Create a new module in that directory named ``<name>_api.py``."
 msgstr "在该目录中创建一个名为 ``<name>_api.py`` 的新模块。"
 
-#: ../../source/mp/http_api.rst:697
+#: ../../source/mp/http_api.rst:777
 msgid "Define a module-level ``router = APIRouter()``."
 msgstr "定义一个模块级的 ``router = APIRouter()``。"
 
-#: ../../source/mp/http_api.rst:698
+#: ../../source/mp/http_api.rst:778
 msgid "Register handlers on ``router`` using FastAPI decorators."
 msgstr "使用 FastAPI 装饰器在 ``router`` 上注册处理程序。"
 
-#: ../../source/mp/http_api.rst:699
+#: ../../source/mp/http_api.rst:779
 msgid ""
 "Access the engine via ``request.app.state.engine`` and guard for the "
 "``None`` case (engine not yet initialized)."
 msgstr "通过 ``request.app.state.engine`` 访问引擎，并检查 ``None`` 情况（引擎尚未初始化）。"
 
-#: ../../source/mp/http_api.rst:702
+#: ../../source/mp/http_api.rst:782
 msgid ""
 "The :class:`~lmcache.v1.multiprocess.http_api_registry.HTTPAPIRegistry` "
 "will pick the module up automatically at startup — no central "
 "registration list to edit."
-msgstr ":class:`~lmcache.v1.multiprocess.http_api_registry.HTTPAPIRegistry` 将在启动时自动加载模块 — 无需编辑中央注册列表。"
+msgstr ""
+":class:`~lmcache.v1.multiprocess.http_api_registry.HTTPAPIRegistry` "
+"将在启动时自动加载模块 — 无需编辑中央注册列表。"
 
-#: ../../source/mp/http_api.rst:706
+#: ../../source/mp/http_api.rst:786
 msgid ""
 "If the route is generic enough to be shared with the vLLM-embedded API "
 "server, add it under ``lmcache/v1/internal_api_server/common/`` instead. "
 "It will be picked up on the MP side via ``common_api.py`` unless its "
-"module name is listed in ``_MP_INCOMPATIBLE_MODULES`` there (used for "
-"modules that require vLLM-specific ``app.state`` attributes, e.g. "
-"``run_script_api``)."
-msgstr "如果路由足够通用，可以与嵌入 vLLM 的 API 服务器共享，请将其添加到 ``lmcache/v1/internal_api_server/common/`` 下。它将在 MP 端通过 ``common_api.py`` 被识别，除非其模块名称在 ``_MP_INCOMPATIBLE_MODULES`` 中列出（用于需要 vLLM 特定 ``app.state`` 属性的模块，例如 ``run_script_api``）。"
+"module name is listed in ``_MP_INCOMPATIBLE_MODULES`` there (reserved for"
+" modules that require vLLM-specific ``app.state`` attributes; the list is"
+" currently empty)."
+msgstr "如果路由足够通用，可以与嵌入式 vLLM API 服务器共享，请将其添加到 ``lmcache/v1/internal_api_server/common/`` 下。它将在 MP 端通过 ``common_api.py`` 被加载，除非其模块名称在 ``_MP_INCOMPATIBLE_MODULES`` 中列出（保留给需要 vLLM 特定 ``app.state`` 属性的模块；该列表当前为空）。"
 
-#: ../../source/mp/http_api.rst:713
+#: ../../source/mp/http_api.rst:793
 msgid ""
 "When adding a new endpoint, please also add a matching section to this "
 "page documenting the endpoint's purpose, request/response schema, and an "
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/mp/hybrid_models.po b/docs/source/locale/zh_CN/LC_MESSAGES/mp/hybrid_models.po
new file mode 100644
index 0000000000..616593bbba
--- /dev/null
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/mp/hybrid_models.po
@@ -0,0 +1,180 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2024, The LMCache Team
+# This file is distributed under the same license as the LMCache package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2026.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: LMCache \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2026-06-08 10:45+0000\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: zh_CN\n"
+"Language-Team: zh_CN <LL@li.org>\n"
+"Plural-Forms: nplurals=1; plural=0;\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.18.0\n"
+
+#: ../../source/mp/hybrid_models.rst:2
+msgid "Hybrid-Attention Models"
+msgstr "混合注意力模型"
+
+#: ../../source/mp/hybrid_models.rst:4
+msgid ""
+"Some models interleave more than one attention type across their layers —"
+" most commonly **sliding-window attention** on most layers and **full "
+"attention** on a few. vLLM serves these with its *hybrid KV cache "
+"manager*, which splits the model's layers into multiple **KV cache "
+"groups** (one per attention behavior)."
+msgstr "一些模型在其层中交错使用多种注意力类型——最常见的是在大多数层上使用 **滑动窗口注意力**，而在少数层上使用 **全注意力**。vLLM 通过其 *混合 KV 缓存管理器* 来服务这些模型，该管理器将模型的层分成多个 **KV 缓存组**（每种注意力行为一个）。"
+
+#: ../../source/mp/hybrid_models.rst:9
+msgid ""
+"The LMCache multiprocess connector (``LMCacheMPConnector``) supports "
+"these hybrid models: it stores and retrieves the KV cache for every "
+"group, so prefix caching and KV reuse work the same way they do for plain"
+" models."
+msgstr "LMCache 多进程连接器 (``LMCacheMPConnector``) 支持这些混合模型：它为每个组存储和检索 KV Cache，因此前缀缓存和 KV 重用的工作方式与普通模型相同。"
+
+#: ../../source/mp/hybrid_models.rst:18
+msgid "What Works"
+msgstr "有效的内容"
+
+#: ../../source/mp/hybrid_models.rst:20
+msgid ""
+"Models whose layers all use **standard paged attention** — including "
+"hybrids that mix sliding-window and full attention — are supported with "
+"no special configuration. Examples:"
+msgstr "所有层均使用 **标准分页注意力** 的模型——包括混合滑动窗口和全注意力的混合模型——在没有特殊配置的情况下得到支持。示例："
+
+#: ../../source/mp/hybrid_models.rst:28
+msgid "Model family"
+msgstr "模型家族"
+
+#: ../../source/mp/hybrid_models.rst:29
+msgid "Attention layout"
+msgstr "注意力布局"
+
+#: ../../source/mp/hybrid_models.rst:30
+msgid "Status"
+msgstr "状态"
+
+#: ../../source/mp/hybrid_models.rst:31
+msgid "Gemma 2 / Gemma 3"
+msgstr "Gemma 2 / Gemma 3"
+
+#: ../../source/mp/hybrid_models.rst:32 ../../source/mp/hybrid_models.rst:35
+msgid "Interleaved sliding-window + full"
+msgstr "交错滑动窗口 + 完整"
+
+#: ../../source/mp/hybrid_models.rst:33 ../../source/mp/hybrid_models.rst:36
+#: ../../source/mp/hybrid_models.rst:39
+msgid "Supported"
+msgstr "支持"
+
+#: ../../source/mp/hybrid_models.rst:34
+msgid "gpt-oss"
+msgstr "gpt-oss"
+
+#: ../../source/mp/hybrid_models.rst:37
+msgid "Llama, Qwen2/Qwen3 (dense), Mistral, …"
+msgstr "Llama, Qwen2/Qwen3 (dense), Mistral, …"
+
+#: ../../source/mp/hybrid_models.rst:38
+msgid "Single attention type"
+msgstr "单一注意力类型"
+
+#: ../../source/mp/hybrid_models.rst:41
+msgid ""
+"Just point vLLM at the LMCache server as usual (see :doc:`quickstart`); "
+"LMCache detects the model's KV cache groups automatically at registration"
+" time."
+msgstr "只需像往常一样将 vLLM 指向 LMCache 服务器（请参见 :doc:`quickstart`）；LMCache 在注册时会自动检测模型的 KV Cache 组。"
+
+#: ../../source/mp/hybrid_models.rst:46
+msgid ""
+"Because ``LMCacheMPConnector`` advertises hybrid support to vLLM, vLLM "
+"keeps its hybrid KV cache manager **enabled** for these models (it does "
+"not fall back to a single unified group). You do not need ``--no-disable-"
+"hybrid-kv-cache-manager`` or any related flag."
+msgstr "因为 ``LMCacheMPConnector`` 向 vLLM 宣传了混合支持，vLLM 为这些模型保持其混合 KV 缓存管理器 **启用**（它不会回退到单一统一组）。您不需要 ``--no-disable-hybrid-kv-cache-manager`` 或任何相关标志。"
+
+#: ../../source/mp/hybrid_models.rst:52
+msgid "What Is Not Supported Yet"
+msgstr "尚未支持的内容"
+
+#: ../../source/mp/hybrid_models.rst:54
+msgid ""
+"**Mamba / linear-attention hybrids** (e.g. Qwen3-Next, Qwen3.5, and other"
+" Gated-DeltaNet models). These layers keep a recurrent *state cache* (a "
+"convolution + SSM state) instead of a paged key/value cache, which "
+"LMCache's transfer path cannot represent today. Such models will fail to "
+"register with the LMCache server. Tracking support is future work."
+msgstr "**Mamba / 线性注意力混合模型**（例如 Qwen3-Next、Qwen3.5 和其他 Gated-DeltaNet 模型）。这些层保持一个递归的 *状态缓存*（一个卷积 + SSM 状态），而不是分页的键/值缓存，这在今天的 LMCache 传输路径中无法表示。这些模型将无法在 LMCache 服务器上注册。跟踪支持是未来的工作。"
+
+#: ../../source/mp/hybrid_models.rst:59
+msgid ""
+"**DeepSeek-V4-style compressed / indexer caches** are likewise not yet "
+"handled by the multiprocess connector."
+msgstr "**DeepSeek-V4 风格的压缩 / 索引器缓存** 同样尚未被多进程连接器处理。"
+
+#: ../../source/mp/hybrid_models.rst:63
+msgid "Verifying Correctness"
+msgstr "验证正确性"
+
+#: ../../source/mp/hybrid_models.rst:65
+msgid ""
+"To convince yourself that a hybrid model's KV is being cached and reused "
+"correctly, you can compare a cold run against a run served from LMCache:"
+msgstr "为了确保混合模型的 KV 被正确缓存和重用，您可以将冷启动与从 LMCache 提供的运行进行比较："
+
+#: ../../source/mp/hybrid_models.rst:68
+msgid ""
+"Run an evaluation (e.g. ``lm_eval`` on ``gsm8k``) against vLLM + LMCache."
+" This computes the KV cache and **stores** it in LMCache."
+msgstr "对 vLLM + LMCache 运行评估（例如在 ``gsm8k`` 上使用 ``lm_eval``）。这会计算 KV Cache 并 **存储** 在 LMCache 中。"
+
+#: ../../source/mp/hybrid_models.rst:70
+msgid ""
+"Reset *only* vLLM's local prefix cache, leaving the LMCache-managed cache"
+" intact (requires launching vLLM with ``VLLM_SERVER_DEV_MODE=1``)::"
+msgstr "重置 *仅* vLLM 的本地前缀缓存，保持 LMCache 管理的缓存不变（需要以 ``VLLM_SERVER_DEV_MODE=1`` 启动 vLLM）::"
+
+#: ../../source/mp/hybrid_models.rst:75
+msgid ""
+"Omit the ``reset_external=true`` query parameter so the LMCache cache is "
+"preserved."
+msgstr "省略 ``reset_external=true`` 查询参数，以便保留 LMCache 缓存。"
+
+#: ../../source/mp/hybrid_models.rst:77
+msgid ""
+"Re-run the same evaluation. vLLM now misses in its local cache, so the "
+"prefix KV is **retrieved** from LMCache. The score should match the first"
+" run."
+msgstr "重新运行相同的评估。vLLM 现在在其本地缓存中未命中，因此前缀 KV 从 LMCache 中 **检索**。得分应与第一次运行匹配。"
+
+#: ../../source/mp/hybrid_models.rst:80
+msgid ""
+"The project ships this as the ``hma_lm_eval`` continuous-integration test"
+" (see ``.buildkite/k3_tests/multiprocess``)."
+msgstr "该项目将其作为 ``hma_lm_eval`` 持续集成测试发布（参见 ``.buildkite/k3_tests/multiprocess``）。"
+
+#: ../../source/mp/hybrid_models.rst:84
+msgid "See Also"
+msgstr "另请参阅"
+
+#: ../../source/mp/hybrid_models.rst:86
+msgid ":doc:`quickstart` — launching the LMCache server and a vLLM client."
+msgstr "：doc:`quickstart` — 启动 LMCache 服务器和 vLLM 客户端。"
+
+#: ../../source/mp/hybrid_models.rst:87
+msgid ""
+"Design notes on how groups are detected and addressed: "
+"``docs/design/integration/vllm/hybrid-kv-cache-groups.md`` in the source "
+"tree."
+msgstr "关于如何检测和处理组的设计说明：``docs/design/integration/vllm/hybrid-kv-cache-groups.md`` 在源代码树中。"
+
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/mp/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/mp/index.po
index 63182bf8d1..8c70b24722 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/mp/index.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/mp/index.po
@@ -8,7 +8,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: LMCache \n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2026-05-18 17:25+0000\n"
+"POT-Creation-Date: 2026-06-08 10:45+0000\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language: zh_CN\n"
@@ -19,7 +19,7 @@ msgstr ""
 "Content-Transfer-Encoding: 8bit\n"
 "Generated-By: Babel 2.18.0\n"
 
-#: ../../source/mp/index.rst:54
+#: ../../source/mp/index.rst:50
 msgid "Contents"
 msgstr "目录"
 
@@ -33,7 +33,9 @@ msgid ""
 "that vLLM instances connect to over ZMQ.  One LMCache server per node can"
 " serve multiple vLLM pods, providing process isolation, shared caching, "
 "and independent resource scaling."
-msgstr "LMCache 多进程 (MP) 模式将 LMCache 作为一个 **独立服务** 运行，vLLM 实例通过 ZMQ 连接到该服务。每个节点的 LMCache 服务器可以为多个 vLLM pod 提供服务，提供进程隔离、共享缓存和独立资源扩展。"
+msgstr ""
+"LMCache 多进程 (MP) 模式将 LMCache 作为一个 **独立服务** 运行，vLLM 实例通过 ZMQ 连接到该服务。每个节点的 "
+"LMCache 服务器可以为多个 vLLM pod 提供服务，提供进程隔离、共享缓存和独立资源扩展。"
 
 #: ../../source/mp/index.rst:10
 msgid "Key Benefits"
@@ -52,7 +54,9 @@ msgid ""
 "running LMCache in a separate process, its Python GIL and CPU work "
 "(hashing, memory management, L2 I/O) do not compete with vLLM's inference"
 " threads."
-msgstr "**推理路径上没有 GIL 争用或 Python 开销** -- 通过在单独的进程中运行 LMCache，它的 Python GIL 和 CPU 工作（哈希、内存管理、L2 I/O）不会与 vLLM 的推理线程竞争。"
+msgstr ""
+"**推理路径上没有 GIL 争用或 Python 开销** -- 通过在单独的进程中运行 LMCache，它的 Python GIL 和 CPU "
+"工作（哈希、内存管理、L2 I/O）不会与 vLLM 的推理线程竞争。"
 
 #: ../../source/mp/index.rst:17
 msgid ""
@@ -95,8 +99,8 @@ msgid "Server Variants"
 msgstr "服务器变体"
 
 #: ../../source/mp/index.rst:35
-msgid "LMCache ships three server entry points:"
-msgstr "LMCache 提供三个服务器入口点："
+msgid "LMCache ships two server entry points:"
+msgstr "LMCache 提供两个服务器入口点："
 
 #: ../../source/mp/index.rst:41
 msgid "Entry Point"
@@ -111,30 +115,44 @@ msgid "``lmcache server``"
 msgstr "``lmcache server``"
 
 #: ../../source/mp/index.rst:44
-msgid ""
-"**Recommended.** ZMQ + FastAPI HTTP frontend (adds ``/healthcheck`` for "
-"K8s probes, ``/clear-cache``, ``/status`` — see :doc:`http_api`). Use "
-"``--engine-type blend`` to enable BlendEngineV2 for cross-request KV "
-"reuse."
-msgstr "**推荐。** ZMQ + FastAPI HTTP 前端（为 K8s 探针添加 ``/healthcheck``、``/clear-cache``、``/status`` — 参见 :doc:`http_api`）。使用 ``--engine-type blend`` 启用 BlendEngineV2 以实现跨请求的 KV 重用。"
+msgid "**Recommended.** ZMQ + FastAPI HTTP frontend — see :doc:`http_api`."
+msgstr "**推荐。** ZMQ + FastAPI HTTP 前端 — 参见 :doc:`http_api`。"
 
-#: ../../source/mp/index.rst:48
+#: ../../source/mp/index.rst:45
 msgid "``python3 -m lmcache.v1.multiprocess.server``"
 msgstr "``python3 -m lmcache.v1.multiprocess.server``"
 
-#: ../../source/mp/index.rst:49
+#: ../../source/mp/index.rst:46
 msgid ""
-"(Legacy) ZMQ-only server using MPCacheEngine (no HTTP endpoints). Prefer "
+"(Legacy) ZMQ-only server with no HTTP endpoints; same ``--engine-type`` /"
+" ``--supported-transfer-mode`` flags as ``lmcache server``. Prefer "
 "``lmcache server``."
-msgstr "（遗留）仅使用 MPCacheEngine 的 ZMQ 服务器（没有 HTTP 端点）。请使用 ``lmcache server``。"
-
-#: ../../source/mp/index.rst:51
-msgid "``python3 -m lmcache.v1.multiprocess.blend_server_v2``"
-msgstr "``python3 -m lmcache.v1.multiprocess.blend_server_v2``"
-
-#: ../../source/mp/index.rst:52
-msgid ""
-"(Legacy) CacheBlend-enabled server. Prefer ``lmcache server --engine-type"
-" blend``."
-msgstr "（遗留）启用 CacheBlend 的服务器。建议使用 ``lmcache server --engine-type blend``。"
-
+msgstr "（遗留）仅支持 ZMQ 的服务器，没有 HTTP 端点；与 ``lmcache server`` 相同的 ``--engine-type`` / ``--supported-transfer-mode`` 标志。建议使用 ``lmcache server``。"
+
+#~ msgid ""
+#~ "**Recommended.** ZMQ + FastAPI HTTP "
+#~ "frontend (adds ``/healthcheck`` for K8s "
+#~ "probes, ``/clear-cache``, ``/status`` — "
+#~ "see :doc:`http_api`). Use ``--engine-type "
+#~ "blend`` to enable BlendEngineV2 for "
+#~ "cross-request KV reuse."
+#~ msgstr ""
+#~ "**推荐。** ZMQ + FastAPI HTTP 前端（为 "
+#~ "K8s 探针添加 ``/healthcheck``、``/clear-"
+#~ "cache``、``/status`` — 参见 :doc:`http_api`）。使用 "
+#~ "``--engine-type blend`` 启用 BlendEngineV2 "
+#~ "以实现跨请求的 KV 重用。"
+
+#~ msgid ""
+#~ "(Legacy) ZMQ-only server using "
+#~ "MPCacheEngine (no HTTP endpoints). Prefer "
+#~ "``lmcache server``."
+#~ msgstr "(遗留) 仅使用 MPCacheEngine 的 ZMQ 服务器（没有 HTTP 端点）。请使用 ``lmcache server``。"
+
+#~ msgid "``python3 -m lmcache.v1.multiprocess.blend_server_v2``"
+#~ msgstr "``python3 -m lmcache.v1.multiprocess.blend_server_v2``"
+
+#~ msgid ""
+#~ "(Legacy) CacheBlend-enabled server. Prefer "
+#~ "``lmcache server --engine-type blend``."
+#~ msgstr "（遗留）启用 CacheBlend 的服务器。建议使用 ``lmcache server --engine-type blend``。"
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/mp/observability.po b/docs/source/locale/zh_CN/LC_MESSAGES/mp/observability.po
index 35654551fb..dfd0ca52f3 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/mp/observability.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/mp/observability.po
@@ -8,7 +8,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: LMCache \n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2026-05-29 22:44+0000\n"
+"POT-Creation-Date: 2026-06-08 10:45+0000\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language: zh_CN\n"
@@ -68,13 +68,13 @@ msgid "Default"
 msgstr "默认"
 
 #: ../../source/mp/observability.rst:44 ../../source/mp/observability.rst:99
-#: ../../source/mp/observability.rst:152 ../../source/mp/observability.rst:189
-#: ../../source/mp/observability.rst:224 ../../source/mp/observability.rst:246
-#: ../../source/mp/observability.rst:303 ../../source/mp/observability.rst:349
-#: ../../source/mp/observability.rst:401 ../../source/mp/observability.rst:435
-#: ../../source/mp/observability.rst:472 ../../source/mp/observability.rst:494
-#: ../../source/mp/observability.rst:525 ../../source/mp/observability.rst:595
-#: ../../source/mp/observability.rst:715
+#: ../../source/mp/observability.rst:152 ../../source/mp/observability.rst:190
+#: ../../source/mp/observability.rst:225 ../../source/mp/observability.rst:247
+#: ../../source/mp/observability.rst:308 ../../source/mp/observability.rst:354
+#: ../../source/mp/observability.rst:406 ../../source/mp/observability.rst:440
+#: ../../source/mp/observability.rst:477 ../../source/mp/observability.rst:499
+#: ../../source/mp/observability.rst:530 ../../source/mp/observability.rst:600
+#: ../../source/mp/observability.rst:720
 msgid "Description"
 msgstr "描述"
 
@@ -278,7 +278,7 @@ msgstr ""
 "每个由 MP 服务器导出的指标和跨度都携带在启动时构建的资源级属性。这些属性标识生成遥测的进程，并且与每个指标的属性（例如 "
 "``cache_salt``）是正交的。"
 
-#: ../../source/mp/observability.rst:131 ../../source/mp/observability.rst:713
+#: ../../source/mp/observability.rst:131 ../../source/mp/observability.rst:718
 msgid "Attribute"
 msgstr "属性"
 
@@ -316,21 +316,21 @@ msgstr ""
 msgid "L1 Metrics"
 msgstr "L1 指标"
 
-#: ../../source/mp/observability.rst:150 ../../source/mp/observability.rst:187
-#: ../../source/mp/observability.rst:222 ../../source/mp/observability.rst:244
-#: ../../source/mp/observability.rst:301 ../../source/mp/observability.rst:347
-#: ../../source/mp/observability.rst:399 ../../source/mp/observability.rst:433
-#: ../../source/mp/observability.rst:470 ../../source/mp/observability.rst:492
-#: ../../source/mp/observability.rst:523 ../../source/mp/observability.rst:593
+#: ../../source/mp/observability.rst:150 ../../source/mp/observability.rst:188
+#: ../../source/mp/observability.rst:223 ../../source/mp/observability.rst:245
+#: ../../source/mp/observability.rst:306 ../../source/mp/observability.rst:352
+#: ../../source/mp/observability.rst:404 ../../source/mp/observability.rst:438
+#: ../../source/mp/observability.rst:475 ../../source/mp/observability.rst:497
+#: ../../source/mp/observability.rst:528 ../../source/mp/observability.rst:598
 msgid "Metric"
 msgstr "指标"
 
-#: ../../source/mp/observability.rst:151 ../../source/mp/observability.rst:188
-#: ../../source/mp/observability.rst:223 ../../source/mp/observability.rst:245
-#: ../../source/mp/observability.rst:302 ../../source/mp/observability.rst:348
-#: ../../source/mp/observability.rst:400 ../../source/mp/observability.rst:434
-#: ../../source/mp/observability.rst:471 ../../source/mp/observability.rst:493
-#: ../../source/mp/observability.rst:524 ../../source/mp/observability.rst:594
+#: ../../source/mp/observability.rst:151 ../../source/mp/observability.rst:189
+#: ../../source/mp/observability.rst:224 ../../source/mp/observability.rst:246
+#: ../../source/mp/observability.rst:307 ../../source/mp/observability.rst:353
+#: ../../source/mp/observability.rst:405 ../../source/mp/observability.rst:439
+#: ../../source/mp/observability.rst:476 ../../source/mp/observability.rst:498
+#: ../../source/mp/observability.rst:529 ../../source/mp/observability.rst:599
 msgid "Type"
 msgstr "类型"
 
@@ -339,52 +339,56 @@ msgid "``lmcache_mp.l1_read``"
 msgstr "``lmcache_mp.l1_read``"
 
 #: ../../source/mp/observability.rst:154 ../../source/mp/observability.rst:157
-#: ../../source/mp/observability.rst:160 ../../source/mp/observability.rst:163
-#: ../../source/mp/observability.rst:167 ../../source/mp/observability.rst:248
-#: ../../source/mp/observability.rst:251 ../../source/mp/observability.rst:257
-#: ../../source/mp/observability.rst:260 ../../source/mp/observability.rst:263
-#: ../../source/mp/observability.rst:266 ../../source/mp/observability.rst:269
-#: ../../source/mp/observability.rst:272 ../../source/mp/observability.rst:275
-#: ../../source/mp/observability.rst:305 ../../source/mp/observability.rst:311
-#: ../../source/mp/observability.rst:320
-msgid "Counter"
-msgstr "计数器"
+#: ../../source/mp/observability.rst:160 ../../source/mp/observability.rst:252
+#: ../../source/mp/observability.rst:258 ../../source/mp/observability.rst:264
+#: ../../source/mp/observability.rst:274 ../../source/mp/observability.rst:277
+#: ../../source/mp/observability.rst:283
+msgid "Counter (attr: ``cache_salt``)"
+msgstr "计数器（属性：``cache_salt``）"
 
 #: ../../source/mp/observability.rst:155
-msgid "Number of chunks read from L1."
-msgstr "从 L1 读取的块数。"
+msgid "Number of chunks read from L1, grouped by tenant."
+msgstr "按租户分组的从 L1 读取的块数。"
 
 #: ../../source/mp/observability.rst:156
 msgid "``lmcache_mp.l1_write``"
 msgstr "``lmcache_mp.l1_write``"
 
 #: ../../source/mp/observability.rst:158
-msgid "Number of chunks written to L1."
-msgstr "写入 L1 的块数。"
+msgid "Number of chunks written to L1, grouped by tenant."
+msgstr "按租户分组写入 L1 的块数。"
 
 #: ../../source/mp/observability.rst:159
 msgid "``lmcache_mp.l1_evicted``"
 msgstr "``lmcache_mp.l1_evicted``"
 
 #: ../../source/mp/observability.rst:161
-msgid "Number of chunks evicted by the EvictionController."
-msgstr "EvictionController 逐出的块数。"
+msgid "Number of chunks evicted by the EvictionController, grouped by tenant."
+msgstr "按租户分组的被 EvictionController 逐出的块数。"
 
-#: ../../source/mp/observability.rst:162
+#: ../../source/mp/observability.rst:163
 msgid "``lmcache_mp.l1_eviction_loop_ticks``"
 msgstr "``lmcache_mp.l1_eviction_loop_ticks``"
 
-#: ../../source/mp/observability.rst:164
+#: ../../source/mp/observability.rst:164 ../../source/mp/observability.rst:168
+#: ../../source/mp/observability.rst:249 ../../source/mp/observability.rst:261
+#: ../../source/mp/observability.rst:268 ../../source/mp/observability.rst:271
+#: ../../source/mp/observability.rst:310 ../../source/mp/observability.rst:316
+#: ../../source/mp/observability.rst:325
+msgid "Counter"
+msgstr "计数器"
+
+#: ../../source/mp/observability.rst:165
 msgid ""
 "L1 eviction-loop iterations (every cycle, regardless of whether the "
 "watermark was crossed). Driven by ``L1_EVICTION_LOOP_TICK``."
 msgstr "L1 逐出循环迭代（每个周期，无论水位线是否被跨越）。由 ``L1_EVICTION_LOOP_TICK`` 驱动。"
 
-#: ../../source/mp/observability.rst:166
+#: ../../source/mp/observability.rst:167
 msgid "``lmcache_mp.l1_eviction_loop_triggered``"
 msgstr "``lmcache_mp.l1_eviction_loop_triggered``"
 
-#: ../../source/mp/observability.rst:168
+#: ../../source/mp/observability.rst:169
 msgid ""
 "L1 eviction-loop iterations where ``usage >= watermark`` and the eviction"
 " policy actually ran. The two counters distinguish \"loop is alive\" from"
@@ -394,11 +398,11 @@ msgstr ""
 "L1 逐出循环迭代，其中 ``usage >= watermark`` "
 "且逐出策略实际运行。两个计数器区分“循环仍在运行”和“逐出已触发”——在调试完成速度快于 1 Hz 轮询周期的短期基准时，这一点很重要。"
 
-#: ../../source/mp/observability.rst:175
+#: ../../source/mp/observability.rst:176
 msgid "L1 Chunk Lifecycle Histograms"
 msgstr "L1 块生命周期直方图"
 
-#: ../../source/mp/observability.rst:177
+#: ../../source/mp/observability.rst:178
 msgid ""
 "Sampled (default 1%) chunk-level lifecycle tracking via "
 "``L1LifecycleSubscriber``. Only sampled chunks contribute to histograms; "
@@ -409,52 +413,52 @@ msgstr ""
 "通过 ``L1LifecycleSubscriber`` 进行采样的（默认 "
 "1%）块级生命周期跟踪。只有采样的块会对直方图产生贡献；上述计数器始终计算所有事件。采样是确定性的（基于哈希），因此相同的键总是会得到相同的决策，且没有内存开销。"
 
-#: ../../source/mp/observability.rst:190
+#: ../../source/mp/observability.rst:191
 msgid "``lmcache_mp.l1_chunk_lifetime``"
 msgstr "``lmcache_mp.l1_chunk_lifetime``"
 
-#: ../../source/mp/observability.rst:191 ../../source/mp/observability.rst:194
-#: ../../source/mp/observability.rst:197 ../../source/mp/observability.rst:200
-#: ../../source/mp/observability.rst:403 ../../source/mp/observability.rst:406
-#: ../../source/mp/observability.rst:409 ../../source/mp/observability.rst:437
-#: ../../source/mp/observability.rst:440 ../../source/mp/observability.rst:474
-#: ../../source/mp/observability.rst:477
+#: ../../source/mp/observability.rst:192 ../../source/mp/observability.rst:195
+#: ../../source/mp/observability.rst:198 ../../source/mp/observability.rst:201
+#: ../../source/mp/observability.rst:408 ../../source/mp/observability.rst:411
+#: ../../source/mp/observability.rst:414 ../../source/mp/observability.rst:442
+#: ../../source/mp/observability.rst:445 ../../source/mp/observability.rst:479
+#: ../../source/mp/observability.rst:482
 msgid "Histogram"
 msgstr "直方图"
 
-#: ../../source/mp/observability.rst:192
+#: ../../source/mp/observability.rst:193
 msgid "Time from allocation to eviction per sampled chunk."
 msgstr "每个采样块从分配到逐出的时间。"
 
-#: ../../source/mp/observability.rst:193
+#: ../../source/mp/observability.rst:194
 msgid "``lmcache_mp.l1_chunk_idle_before_evict``"
 msgstr "``lmcache_mp.l1_chunk_idle_before_evict``"
 
-#: ../../source/mp/observability.rst:195
+#: ../../source/mp/observability.rst:196
 msgid "Time from last access to eviction per sampled chunk."
 msgstr "每个采样块从最后访问到逐出的时间。"
 
-#: ../../source/mp/observability.rst:196
+#: ../../source/mp/observability.rst:197
 msgid "``lmcache_mp.l1_chunk_reuse_gap``"
 msgstr "``lmcache_mp.l1_chunk_reuse_gap``"
 
-#: ../../source/mp/observability.rst:198
+#: ../../source/mp/observability.rst:199
 msgid "Time gap between consecutive touches (read or write) of the same chunk."
 msgstr "同一块的连续访问（读取或写入）之间的时间间隔。"
 
-#: ../../source/mp/observability.rst:199
+#: ../../source/mp/observability.rst:200
 msgid "``lmcache_mp.l1_chunk_evict_reuse_gap``"
 msgstr "``lmcache_mp.l1_chunk_evict_reuse_gap``"
 
-#: ../../source/mp/observability.rst:201
+#: ../../source/mp/observability.rst:202
 msgid "Time from eviction to next reuse (capped at 300 s)."
 msgstr "逐出到下次重用的时间（上限为 300 秒）。"
 
-#: ../../source/mp/observability.rst:204
+#: ../../source/mp/observability.rst:205
 msgid "StorageManager Real-Reuse Metrics"
 msgstr "存储管理器真实重用指标"
 
-#: ../../source/mp/observability.rst:206
+#: ../../source/mp/observability.rst:207
 msgid ""
 "Workload-level reuse histograms emitted by ``SMLifecycleSubscriber``, "
 "driven by caller-facing StorageManager events "
@@ -466,7 +470,7 @@ msgstr ""
 "事件驱动（``SM_READ_PREFETCHED_FINISHED``，``SM_WRITE_FINISHED``）。 "
 "存储/预取控制器的内部读锁释放被排除，因此信号仅反映用户驱动的访问。"
 
-#: ../../source/mp/observability.rst:212
+#: ../../source/mp/observability.rst:213
 msgid ""
 "Both histograms are tagged with ``cache_salt`` for per-tenant isolation."
 "  The per-salt access counter advances on every read and write of every "
@@ -477,26 +481,26 @@ msgstr ""
 "两个直方图都标记有 ``cache_salt`` "
 "以实现每个租户的隔离。每个块的每次读取和写入（无论是否采样）都会使每个盐值的访问计数器增加，因此块间隙反映了真实的存储量；直方图本身仅记录通过（确定性、基于哈希的）采样门的块的间隙。"
 
-#: ../../source/mp/observability.rst:225
+#: ../../source/mp/observability.rst:226
 msgid "``lmcache_mp.real_reuse_gap``"
 msgstr "``lmcache_mp.real_reuse_gap``"
 
-#: ../../source/mp/observability.rst:226 ../../source/mp/observability.rst:231
+#: ../../source/mp/observability.rst:227 ../../source/mp/observability.rst:232
 msgid "Histogram (tag: ``cache_salt``)"
 msgstr "直方图（标签：``cache_salt``）"
 
-#: ../../source/mp/observability.rst:227
+#: ../../source/mp/observability.rst:228
 msgid ""
 "Time gap between a chunk's last access (read or write) and its next read."
 "  Captures storage cost — how long a stored chunk sat between accesses.  "
 "Emitted only on read events."
 msgstr "块的最后一次访问（读取或写入）与下一次读取之间的时间间隔。捕获存储成本——一个存储块在访问之间静止了多久。仅在读取事件中发出。"
 
-#: ../../source/mp/observability.rst:230
+#: ../../source/mp/observability.rst:231
 msgid "``lmcache_mp.real_reuse_gap_objects``"
 msgstr "``lmcache_mp.real_reuse_gap_objects``"
 
-#: ../../source/mp/observability.rst:232
+#: ../../source/mp/observability.rst:233
 msgid ""
 "Per-``cache_salt`` access-counter gap between two reads of the same "
 "chunk.  Captures storage volume — how many chunk-accesses occurred while "
@@ -506,103 +510,111 @@ msgstr ""
 "每个 ``cache_salt`` "
 "的访问计数器在同一块的两次读取之间的间隙。捕获存储量——在此块等待下一个读取时发生了多少次块访问。在采样块的读取事件中发出。"
 
-#: ../../source/mp/observability.rst:238
+#: ../../source/mp/observability.rst:239
 msgid "L2 Metrics"
 msgstr "L2 指标"
 
-#: ../../source/mp/observability.rst:247
+#: ../../source/mp/observability.rst:248
 msgid "``lmcache_mp.l2_store_submitted``"
 msgstr "``lmcache_mp.l2_store_submitted``"
 
-#: ../../source/mp/observability.rst:249
+#: ../../source/mp/observability.rst:250
 msgid "Number of L2 store requests submitted."
 msgstr "提交的 L2 存储请求数量。"
 
-#: ../../source/mp/observability.rst:250
+#: ../../source/mp/observability.rst:251
 msgid "``lmcache_mp.l2_store_submitted_objects``"
 msgstr "``lmcache_mp.l2_store_submitted_objects``"
 
-#: ../../source/mp/observability.rst:252
-msgid "Number of chunks submitted for L2 store."
-msgstr "提交到 L2 存储的块数量。"
-
 #: ../../source/mp/observability.rst:253
+msgid "Number of chunks submitted for L2 store, grouped by tenant."
+msgstr "按租户分组的提交到 L2 存储的块数。"
+
+#: ../../source/mp/observability.rst:254
 msgid "``lmcache_mp.l2_store_completed``"
 msgstr "``lmcache_mp.l2_store_completed``"
 
-#: ../../source/mp/observability.rst:254 ../../source/mp/observability.rst:278
+#: ../../source/mp/observability.rst:255 ../../source/mp/observability.rst:280
 msgid "Counter (attr: ``l2_name``)"
 msgstr "计数器 (属性: ``l2_name``)"
 
-#: ../../source/mp/observability.rst:255
+#: ../../source/mp/observability.rst:256
 msgid "Number of L2 store requests completed, labeled by adapter type."
 msgstr "按适配器类型标记的完成的 L2 存储请求数量。"
 
-#: ../../source/mp/observability.rst:256
+#: ../../source/mp/observability.rst:257
 msgid "``lmcache_mp.l2_store_completed_objects``"
 msgstr "``lmcache_mp.l2_store_completed_objects``"
 
-#: ../../source/mp/observability.rst:258
-msgid "Number of chunks successfully stored to L2."
-msgstr "成功存储到 L2 的块数。"
-
 #: ../../source/mp/observability.rst:259
+msgid "Number of chunks successfully stored to L2, grouped by tenant."
+msgstr "按租户分组成功存储到 L2 的块数。"
+
+#: ../../source/mp/observability.rst:260
 msgid "``lmcache_mp.l2_prefetch_lookup``"
 msgstr "``lmcache_mp.l2_prefetch_lookup``"
 
-#: ../../source/mp/observability.rst:261
+#: ../../source/mp/observability.rst:262
 msgid "Number of L2 prefetch lookup requests."
 msgstr "L2 预取查找请求的数量。"
 
-#: ../../source/mp/observability.rst:262
+#: ../../source/mp/observability.rst:263
 msgid "``lmcache_mp.l2_prefetch_lookup_objects``"
 msgstr "``lmcache_mp.l2_prefetch_lookup_objects``"
 
-#: ../../source/mp/observability.rst:264
-msgid "Number of chunks submitted for L2 prefetch lookup."
-msgstr "L2 预取查找提交的块数。"
-
 #: ../../source/mp/observability.rst:265
+msgid "Number of chunks submitted for L2 prefetch lookup, grouped by tenant."
+msgstr "按租户分组的提交用于 L2 预取查找的块数量。"
+
+#: ../../source/mp/observability.rst:267
 msgid "``lmcache_mp.l2_prefetch_hit``"
 msgstr "``lmcache_mp.l2_prefetch_hit``"
 
-#: ../../source/mp/observability.rst:267
+#: ../../source/mp/observability.rst:269
 msgid "Number of prefix chunks found in L2 lookup."
 msgstr "在 L2 查找中找到的前缀块数量。"
 
-#: ../../source/mp/observability.rst:268
+#: ../../source/mp/observability.rst:270
 msgid "``lmcache_mp.l2_prefetch_load_submitted``"
 msgstr "``lmcache_mp.l2_prefetch_load_submitted``"
 
-#: ../../source/mp/observability.rst:270
+#: ../../source/mp/observability.rst:272
 msgid "Number of L2 prefetch load requests submitted."
 msgstr "提交的 L2 预取加载请求数量。"
 
-#: ../../source/mp/observability.rst:271
+#: ../../source/mp/observability.rst:273
 msgid "``lmcache_mp.l2_prefetch_load_submitted_objects``"
 msgstr "``lmcache_mp.l2_prefetch_load_submitted_objects``"
 
-#: ../../source/mp/observability.rst:273
-msgid "Number of chunks submitted for L2 load."
-msgstr "提交用于 L2 加载的块数量。"
+#: ../../source/mp/observability.rst:275
+msgid "Number of chunks submitted for L2 load, grouped by tenant."
+msgstr "按租户分组的提交用于 L2 加载的块数量。"
 
-#: ../../source/mp/observability.rst:274
+#: ../../source/mp/observability.rst:276
 msgid "``lmcache_mp.l2_prefetch_load_completed``"
 msgstr "``lmcache_mp.l2_prefetch_load_completed``"
 
-#: ../../source/mp/observability.rst:276
-msgid "Number of chunks successfully loaded from L2."
-msgstr "成功从 L2 加载的块数。"
+#: ../../source/mp/observability.rst:278
+msgid "Number of chunks successfully loaded from L2, grouped by tenant."
+msgstr "按租户分组，从 L2 成功加载的块数。"
 
-#: ../../source/mp/observability.rst:277
+#: ../../source/mp/observability.rst:279
 msgid "``lmcache_mp.l2_load_completed``"
 msgstr "``lmcache_mp.l2_load_completed``"
 
-#: ../../source/mp/observability.rst:279
+#: ../../source/mp/observability.rst:281
 msgid "Number of per-adapter L2 load requests completed, labeled by adapter type."
 msgstr "按适配器类型标记的每个适配器 L2 加载请求完成的数量。"
 
-#: ../../source/mp/observability.rst:281
+#: ../../source/mp/observability.rst:282
+msgid "``lmcache_mp.l2_evicted_objects``"
+msgstr "``lmcache_mp.l2_evicted_objects``"
+
+#: ../../source/mp/observability.rst:284
+msgid "Number of chunks evicted from L2, grouped by tenant."
+msgstr "按租户分组的从 L2 逐出的块数量。"
+
+#: ../../source/mp/observability.rst:286
 #, python-brace-format
 msgid ""
 "The ``l2_name``-labeled counters (``l2_store_completed`` and "
@@ -618,11 +630,11 @@ msgstr ""
 "``rate(lmcache_mp_l2_store_completed_requests_total{l2_name=\\\"...\\\"}[1m])``（以及加载的等效项）按需计算每个后端的"
 " IOPS。没有单独导出 ``*_iops`` 指标；保留原始计数器让仪表板用户可以选择自己的时间窗口。"
 
-#: ../../source/mp/observability.rst:288
+#: ../../source/mp/observability.rst:293
 msgid "Failure & Health Counters"
 msgstr "失败与健康计数器"
 
-#: ../../source/mp/observability.rst:290
+#: ../../source/mp/observability.rst:295
 msgid ""
 "Health-monitoring counters emitted on the dedicated ``lmcache_mp.health``"
 " OTel meter. Driven by the ``L1FailureMetricsSubscriber`` and "
@@ -636,11 +648,11 @@ msgstr ""
 "驱动，这些订阅者在启用指标时会自动注册。所有三个计数器都携带 ``model_name``（从每个 ``ObjectKey`` "
 "中提取），以便操作员可以在 Prometheus ``/metrics`` 端点上按模型进行切片。"
 
-#: ../../source/mp/observability.rst:304
+#: ../../source/mp/observability.rst:309
 msgid "``lmcache_mp.l1_allocation_failure``"
 msgstr "``lmcache_mp.l1_allocation_failure``"
 
-#: ../../source/mp/observability.rst:306
+#: ../../source/mp/observability.rst:311
 #, python-brace-format
 msgid ""
 "L1 memory allocation failures (OOM) during ``reserve_write``. Tagged by "
@@ -651,11 +663,11 @@ msgstr ""
 "在 ``reserve_write`` 期间发生 L1 内存分配失败（OOM）。通过 ``during`` ∈ {``l1_store``, "
 "``l2_prefetch``} 标记，以区分用户发起的存储与预取触发的分配，以及 ``model_name``。"
 
-#: ../../source/mp/observability.rst:310
+#: ../../source/mp/observability.rst:315
 msgid "``lmcache_mp.l1_read_failure``"
 msgstr "``lmcache_mp.l1_read_failure``"
 
-#: ../../source/mp/observability.rst:312
+#: ../../source/mp/observability.rst:317
 #, python-brace-format
 msgid ""
 "L1 ``reserve_read`` failures. Tagged by ``during`` ∈ {``l2_store``, "
@@ -670,11 +682,11 @@ msgstr ""
 "``model_name``。**后查找异常计数器**，而不是缓存未命中计数器——在 MP 模式下，``reserve_read`` "
 "仅在成功查找后调用，因此任何非零值都表示查找/保留竞争或意外逐出，健康运行时应保持接近零。"
 
-#: ../../source/mp/observability.rst:319
+#: ../../source/mp/observability.rst:324
 msgid "``lmcache_mp.l2_prefetch_failure``"
 msgstr "``lmcache_mp.l2_prefetch_failure``"
 
-#: ../../source/mp/observability.rst:321
+#: ../../source/mp/observability.rst:326
 #, python-brace-format
 msgid ""
 "Chunks that L2 reported present at lookup but failed to land in L1. "
@@ -687,7 +699,7 @@ msgstr ""
 "``model_name``。``l1_oom`` 表示 L1 没有空间接收预取的对象；``not_found`` "
 "表示适配器在正查找的情况下未返回任何数据（例如并发删除）。"
 
-#: ../../source/mp/observability.rst:327
+#: ../../source/mp/observability.rst:332
 msgid ""
 "A ``reason=serde_failure`` value will be added to ``l2_prefetch_failure``"
 " as an additive, non-breaking extension once L2 adapters distinguish "
@@ -697,7 +709,7 @@ msgstr ""
 "一旦 L2 适配器区分反序列化错误和缺失对象，将会将 ``reason=serde_failure`` 值作为附加的、非破坏性的扩展添加到 "
 "``l2_prefetch_failure`` 中——当这项功能上线时，无需进行仪表板迁移。"
 
-#: ../../source/mp/observability.rst:332
+#: ../../source/mp/observability.rst:337
 msgid ""
 "For the full design rationale (including which event types drive each "
 "counter and why ``lmcache_instance_id`` is deferred), see "
@@ -706,11 +718,11 @@ msgstr ""
 "有关完整的设计原理（包括哪些事件类型驱动每个计数器以及为什么推迟 ``lmcache_instance_id``），请参阅源树中的 "
 "``docs/design/v1/mp_observability/METRICS.md``。"
 
-#: ../../source/mp/observability.rst:337
+#: ../../source/mp/observability.rst:342
 msgid "Lookup Hit-Rate Metrics"
 msgstr "查找命中率指标"
 
-#: ../../source/mp/observability.rst:339
+#: ../../source/mp/observability.rst:344
 msgid ""
 "Token-level counters whose ratio gives the fraction of tokens requested "
 "by a lookup that were served from either L1 or L2. L0 (GPU prefix cache) "
@@ -720,31 +732,31 @@ msgstr ""
 "按令牌级别计数器，其比例表示由查找请求的令牌中，从 L1 或 L2 服务的令牌的比例。L0（GPU 前缀缓存）故意被排除在外——它是 vLLM "
 "所有的，无法从 LMCache 中观察到。"
 
-#: ../../source/mp/observability.rst:350
+#: ../../source/mp/observability.rst:355
 msgid "``lmcache_mp.lookup_requested``"
 msgstr "``lmcache_mp.lookup_requested``"
 
-#: ../../source/mp/observability.rst:351 ../../source/mp/observability.rst:355
+#: ../../source/mp/observability.rst:356 ../../source/mp/observability.rst:360
 msgid "Counter (attrs: ``model_name``, ``cache_salt``)"
 msgstr "计数器（属性：``model_name``, ``cache_salt``）"
 
-#: ../../source/mp/observability.rst:352
+#: ../../source/mp/observability.rst:357
 msgid ""
 "Total tokens submitted for lookup (denominator of the L1+L2 token-level "
 "hit rate). Only chunk-aligned tokens are counted."
 msgstr "提交查找的总令牌数（L1+L2 令牌级命中率的分母）。仅计算与块对齐的令牌。"
 
-#: ../../source/mp/observability.rst:354
+#: ../../source/mp/observability.rst:359
 msgid "``lmcache_mp.lookup_hit``"
 msgstr "``lmcache_mp.lookup_hit``"
 
-#: ../../source/mp/observability.rst:356
+#: ../../source/mp/observability.rst:361
 msgid ""
 "Total tokens found in L1 or L2 during lookup (numerator of the L1+L2 "
 "token-level hit rate). Counts the contiguous prefix hit only."
 msgstr "在查找过程中在 L1 或 L2 中找到的总令牌数（L1+L2 令牌级命中率的分子）。仅计算连续前缀命中。"
 
-#: ../../source/mp/observability.rst:359
+#: ../../source/mp/observability.rst:364
 msgid ""
 "Both counters are driven by the same event (``MP_LOOKUP_PREFETCH_END``), "
 "so they always advance together per completed lookup. Early-exit lookups "
@@ -753,7 +765,7 @@ msgstr ""
 "这两个计数器由同一事件（``MP_LOOKUP_PREFETCH_END``）驱动，因此它们在每次完成查找时总是一起增加。提前退出的查找对两者都贡献"
 " ``0``，而放弃的查找则对两者都没有贡献。"
 
-#: ../../source/mp/observability.rst:363
+#: ../../source/mp/observability.rst:368
 msgid ""
 "The ``model_name`` and ``cache_salt`` attributes are captured at lookup "
 "time from ``IPCCacheEngineKey`` so dashboards can compute per-model or "
@@ -765,15 +777,15 @@ msgstr ""
 "中捕获，以便仪表板可以计算每个模型或每个租户的命中率。``cache_salt`` "
 "可能具有高基数（每个租户或隔离域一个条目）；如果存储成本重要，请在抓取时通过 ``metric_relabel_configs`` 丢弃它。"
 
-#: ../../source/mp/observability.rst:369
+#: ../../source/mp/observability.rst:374
 msgid "**PromQL for hit rate:**"
 msgstr "**PromQL 查询命中率:**"
 
-#: ../../source/mp/observability.rst:382
+#: ../../source/mp/observability.rst:387
 msgid "L0 (GPU) Block Lifecycle Histograms"
 msgstr "L0 (GPU) 块生命周期直方图"
 
-#: ../../source/mp/observability.rst:384
+#: ../../source/mp/observability.rst:389
 msgid ""
 "Sampled (default 1%) GPU KV cache block lifecycle tracking via "
 "``L0LifecycleSubscriber``. Eviction is detected at reallocation time "
@@ -785,7 +797,7 @@ msgstr ""
 "块生命周期跟踪。在重新分配时（当块被分配不同的令牌时）检测逐出。采样使用随机选择，并带有一个 ``_skipped`` 集（受物理 GPU "
 "块有限数量的限制）。"
 
-#: ../../source/mp/observability.rst:390
+#: ../../source/mp/observability.rst:395
 #, python-brace-format
 msgid ""
 "All L0 histograms are emitted with ``instance_id`` and ``model_name`` "
@@ -799,35 +811,35 @@ msgstr ""
 "``lmcache_mp_l0_block_lifetime_seconds{instance_id=\\\"12345\\\",model_name"
 "=\\\"llama-7b\\\"}``）。"
 
-#: ../../source/mp/observability.rst:402
+#: ../../source/mp/observability.rst:407
 msgid "``lmcache_mp.l0_block_lifetime``"
 msgstr "``lmcache_mp.l0_block_lifetime``"
 
-#: ../../source/mp/observability.rst:404
+#: ../../source/mp/observability.rst:409
 msgid "Time from allocation to eviction per sampled GPU block."
 msgstr "每个采样的 GPU 块从分配到逐出的时间。"
 
-#: ../../source/mp/observability.rst:405
+#: ../../source/mp/observability.rst:410
 msgid "``lmcache_mp.l0_block_idle_before_evict``"
 msgstr "``lmcache_mp.l0_block_idle_before_evict``"
 
-#: ../../source/mp/observability.rst:407
+#: ../../source/mp/observability.rst:412
 msgid "Time from last access to eviction per sampled GPU block."
 msgstr "从最后访问到逐出的时间（每个采样的 GPU 块）。"
 
-#: ../../source/mp/observability.rst:408
+#: ../../source/mp/observability.rst:413
 msgid "``lmcache_mp.l0_block_reuse_gap``"
 msgstr "``lmcache_mp.l0_block_reuse_gap``"
 
-#: ../../source/mp/observability.rst:410
+#: ../../source/mp/observability.rst:415
 msgid "Time gaps between consecutive accesses of the same GPU block."
 msgstr "同一 GPU 块的连续访问之间的时间间隔。"
 
-#: ../../source/mp/observability.rst:413
+#: ../../source/mp/observability.rst:418
 msgid "L0 ↔ L1 Throughput Histograms"
 msgstr "L0 ↔ L1 吞吐量直方图"
 
-#: ../../source/mp/observability.rst:415
+#: ../../source/mp/observability.rst:420
 #, python-brace-format
 msgid ""
 "Per-request throughput of GPU↔CPU copies via "
@@ -843,7 +855,7 @@ msgstr ""
 "GB/s。时间戳来自在 GPU cupy 流上发布的 ``MP_{STORE,RETRIEVE}_{START,END}`` "
 "事件，因此它们反映了真实的 GPU 流复制时间——而不是 Python/锁的开销。"
 
-#: ../../source/mp/observability.rst:423
+#: ../../source/mp/observability.rst:428
 #, python-brace-format
 msgid ""
 "All throughput histograms are emitted with ``engine_id`` (vLLM worker "
@@ -858,27 +870,27 @@ msgstr ""
 "``lmcache_mp_l0_l1_store_throughput_GB_per_second{engine_id=\"0\",device=\"cuda:3\",model_name"
 "=\"meta-llama/Llama-3.1-8B\"}``)。"
 
-#: ../../source/mp/observability.rst:436
+#: ../../source/mp/observability.rst:441
 msgid "``lmcache_mp.l0_l1_store_throughput``"
 msgstr "``lmcache_mp.l0_l1_store_throughput``"
 
-#: ../../source/mp/observability.rst:438
+#: ../../source/mp/observability.rst:443
 msgid "GPU→CPU (L0→L1) store throughput in GB/s per request."
 msgstr "每个请求的 GPU→CPU (L0→L1) 存储吞吐量（单位：GB/s）。"
 
-#: ../../source/mp/observability.rst:439
+#: ../../source/mp/observability.rst:444
 msgid "``lmcache_mp.l0_l1_load_throughput``"
 msgstr "``lmcache_mp.l0_l1_load_throughput``"
 
-#: ../../source/mp/observability.rst:441
+#: ../../source/mp/observability.rst:446
 msgid "CPU→GPU (L1→L0) load throughput in GB/s per request."
 msgstr "每个请求的 CPU→GPU (L1→L0) 加载吞吐量（GB/s）。"
 
-#: ../../source/mp/observability.rst:444
+#: ../../source/mp/observability.rst:449
 msgid "L1 ↔ L2 Throughput Histograms"
 msgstr "L1 ↔ L2 吞吐量直方图"
 
-#: ../../source/mp/observability.rst:446
+#: ../../source/mp/observability.rst:451
 msgid ""
 "Per-request throughput of L1↔L2 transfers via ``L2ThroughputSubscriber``."
 " The store path correlates ``L2_STORE_SUBMITTED`` → "
@@ -895,7 +907,7 @@ msgstr ""
 "``L2_LOAD_TASK_COMPLETED`` 事件；请求级别的 ``L2_PREFETCH_LOAD_*`` "
 "事件用于块计数器在适配器之间聚合，无法归因于特定的 ``l2_name``。"
 
-#: ../../source/mp/observability.rst:455
+#: ../../source/mp/observability.rst:460
 msgid ""
 "Timestamps span **submit → complete**, so the duration includes adapter "
 "queue, network, and disk I/O — the value is *bytes / end-to-end latency*,"
@@ -906,7 +918,7 @@ msgstr ""
 "时间戳跨越 **提交 → 完成**，因此持续时间包括适配器队列、网络和磁盘 I/O — 该值为 *字节 / "
 "端到端延迟*，而不是原始传输速率。使用这些直方图来比较适配器类型并捕捉回归；当您需要纯粹的复制时间吞吐量时，请使用 L0↔L1 直方图。"
 
-#: ../../source/mp/observability.rst:461
+#: ../../source/mp/observability.rst:466
 #, python-brace-format
 msgid ""
 "All L1↔L2 throughput histograms carry a single ``l2_name`` OTel attribute"
@@ -920,27 +932,27 @@ msgstr ""
 "中能够按后端进行切片（例如 "
 "``lmcache_mp_l2_store_throughput_GB_per_second{l2_name=\\\"nixl_store\\\"}``）。"
 
-#: ../../source/mp/observability.rst:473
+#: ../../source/mp/observability.rst:478
 msgid "``lmcache_mp.l2_store_throughput``"
 msgstr "``lmcache_mp.l2_store_throughput``"
 
-#: ../../source/mp/observability.rst:475
+#: ../../source/mp/observability.rst:480
 msgid "L1→L2 store throughput in GB/s per request."
 msgstr "每个请求的 L1→L2 存储吞吐量（GB/s）。"
 
-#: ../../source/mp/observability.rst:476
+#: ../../source/mp/observability.rst:481
 msgid "``lmcache_mp.l2_load_throughput``"
 msgstr "``lmcache_mp.l2_load_throughput``"
 
-#: ../../source/mp/observability.rst:478
+#: ../../source/mp/observability.rst:483
 msgid "L2→L1 load throughput in GB/s per (request, adapter) pair."
 msgstr "每对（请求，适配器）的 L2→L1 加载吞吐量（单位：GB/s）。"
 
-#: ../../source/mp/observability.rst:481
+#: ../../source/mp/observability.rst:486
 msgid "Engine Counters"
 msgstr "引擎计数器"
 
-#: ../../source/mp/observability.rst:483
+#: ../../source/mp/observability.rst:488
 msgid ""
 "Worker-scoped counters tied to what the MP server delivers back to each "
 "vLLM worker via ``retrieve()``.  Labeled by ``worker_id`` (the vLLM "
@@ -950,15 +962,15 @@ msgstr ""
 "与 MP 服务器通过 ``retrieve()`` 返回给每个 vLLM 工作线程相关的工作线程范围计数器。通过 "
 "``worker_id``（vLLM 工作线程实例 ID）标记——与可能出现在其他指标上的任何调度器范围 ID 不同。"
 
-#: ../../source/mp/observability.rst:495
+#: ../../source/mp/observability.rst:500
 msgid "``lmcache_mp.num_chunks_loaded``"
 msgstr "``lmcache_mp.num_chunks_loaded``"
 
-#: ../../source/mp/observability.rst:496
+#: ../../source/mp/observability.rst:501
 msgid "Counter (attrs: ``worker_id``, ``model_name``, ``cache_salt``)"
 msgstr "计数器（属性：``worker_id``, ``model_name``, ``cache_salt``）"
 
-#: ../../source/mp/observability.rst:497
+#: ../../source/mp/observability.rst:502
 msgid ""
 "Total number of LMCache chunks loaded into the engine, summed over all "
 "``retrieve()`` completions.  Sliceable per worker, per model, and per "
@@ -970,17 +982,17 @@ msgstr ""
 "完成的总和计算。可以按工作者、模型和租户/隔离域（``cache_salt``）进行切片。``cache_salt`` "
 "可能具有高基数；如果存储成本很重要，请在抓取时使用 ``metric_relabel_configs`` 丢弃它。"
 
-#: ../../source/mp/observability.rst:504
+#: ../../source/mp/observability.rst:509
 msgid "Observable Gauges"
 msgstr "可观察的仪表盘"
 
-#: ../../source/mp/observability.rst:506
+#: ../../source/mp/observability.rst:511
 msgid ""
 "Point-in-time state snapshots registered via ``register_gauge`` (pull-"
 "based OTel observable gauges)."
 msgstr "通过 ``register_gauge`` 注册的时间点状态快照（基于拉取的 OTel 可观察量度）。"
 
-#: ../../source/mp/observability.rst:509
+#: ../../source/mp/observability.rst:514
 msgid ""
 "The three in-flight metrics carry two attributes that distinguish "
 "adapters even when more than one is registered with the same backend type"
@@ -989,7 +1001,7 @@ msgstr ""
 "这三个正在进行的指标携带两个属性，即使注册了多个相同后端类型的适配器，也能区分它们——与 "
 "``lmcache_mp.l2_store_completed`` 具有相同形状："
 
-#: ../../source/mp/observability.rst:513
+#: ../../source/mp/observability.rst:518
 msgid ""
 "``l2_name`` — the registered adapter type (e.g. ``\"fs\"``, "
 "``\"nixl_store\"``, ``\"mooncake_store\"``)."
@@ -997,35 +1009,35 @@ msgstr ""
 "``l2_name`` — 注册的适配器类型（例如 ``\"fs\"``, ``\"nixl_store\"``, "
 "``\"mooncake_store\"``）。"
 
-#: ../../source/mp/observability.rst:515
+#: ../../source/mp/observability.rst:520
 msgid "``adapter_index`` — position in the controller's adapter list."
 msgstr "``adapter_index`` — 控制器适配器列表中的位置。"
 
-#: ../../source/mp/observability.rst:517
+#: ../../source/mp/observability.rst:522
 msgid "Adapters with no in-flight work emit no datapoint for that scrape."
 msgstr "没有正在进行的工作的适配器不会为该抓取发出数据点。"
 
-#: ../../source/mp/observability.rst:526
+#: ../../source/mp/observability.rst:531
 msgid "``lmcache_mp.active_prefetch_jobs``"
 msgstr "``lmcache_mp.active_prefetch_jobs``"
 
-#: ../../source/mp/observability.rst:527 ../../source/mp/observability.rst:531
-#: ../../source/mp/observability.rst:536 ../../source/mp/observability.rst:597
-#: ../../source/mp/observability.rst:601
+#: ../../source/mp/observability.rst:532 ../../source/mp/observability.rst:536
+#: ../../source/mp/observability.rst:541 ../../source/mp/observability.rst:602
+#: ../../source/mp/observability.rst:606
 msgid "ObservableGauge"
 msgstr "可观察仪表"
 
-#: ../../source/mp/observability.rst:528
+#: ../../source/mp/observability.rst:533
 msgid ""
 "Number of prefetch jobs currently in-flight. A sustained high value may "
 "indicate slow L2 backends or polling delays."
 msgstr "当前正在进行的预取作业数量。持续的高值可能表示 L2 后端缓慢或轮询延迟。"
 
-#: ../../source/mp/observability.rst:530
+#: ../../source/mp/observability.rst:535
 msgid "``lmcache_mp.l1_memory_usage_bytes``"
 msgstr "``lmcache_mp.l1_memory_usage_bytes``"
 
-#: ../../source/mp/observability.rst:532
+#: ../../source/mp/observability.rst:537
 msgid ""
 "Bytes currently held in L1.  Rising without plateauing typically "
 "indicates a leak; saturating at the configured ``--l1-size-gb`` indicates"
@@ -1034,11 +1046,11 @@ msgstr ""
 "当前在 L1 中占用的字节数。持续上升而没有达到平稳状态通常表示存在泄漏；在配置的 ``--l1-size-gb`` "
 "达到饱和时表示工作集超过了容量。"
 
-#: ../../source/mp/observability.rst:535
+#: ../../source/mp/observability.rst:540
 msgid "``lmcache_mp.l1_usage_ratio``"
 msgstr "``lmcache_mp.l1_usage_ratio``"
 
-#: ../../source/mp/observability.rst:537
+#: ../../source/mp/observability.rst:542
 msgid ""
 "L1 used/total ratio (``0.0``–``1.0``), sampled at scrape time from "
 "``L1Manager.get_memory_usage()``. Returns ``0.0`` when the gauge target "
@@ -1051,15 +1063,15 @@ msgstr ""
 "采样。当计量目标尚未连接或 ``total_bytes`` 为零时返回 ``0.0``，因此在抓取期间回调不会触发。与逐出水位线 (默认 "
 "``0.8``) 进行比较，以判断逐出循环是否低于或高于触发阈值。"
 
-#: ../../source/mp/observability.rst:543
+#: ../../source/mp/observability.rst:548
 msgid "``lmcache_mp.l2_usage_bytes``"
 msgstr "``lmcache_mp.l2_usage_bytes``"
 
-#: ../../source/mp/observability.rst:544
+#: ../../source/mp/observability.rst:549
 msgid "ObservableGauge (attr: ``l2_name``)"
 msgstr "可观察仪表 (属性: ``l2_name``)"
 
-#: ../../source/mp/observability.rst:545
+#: ../../source/mp/observability.rst:550
 msgid ""
 "Bytes currently held in each L2 adapter, sampled at scrape time from "
 "``adapter.get_usage()``.  One observation per configured adapter, tagged "
@@ -1070,28 +1082,33 @@ msgid ""
 "poisoning the observation, so a missing datapoint for one ``l2_name`` can"
 " mean either \"not configured\" or \"adapter errored on this scrape\" — "
 "cross-check with the L2 store/load counters."
-msgstr "每个 L2 适配器当前持有的字节数，在抓取时从 ``adapter.get_usage()`` 采样。每个配置的适配器一个观测值，按 ``l2_name``（适配器类型，例如 ``\"fs\"``, ``\"nixl_store\"``, ``\"mooncake_store\"``）标记。与 L2 层的 ``l1_memory_usage_bytes`` 并行 — 使用它来查看每个 L2 后端当前持有多少。抛出异常的适配器 ``get_usage()`` 会被静默跳过，而不是污染观测，因此某个 ``l2_name`` 缺失的数据点可能意味着“未配置”或“适配器在此抓取时出错” — 请与 L2 存储/负载计数器交叉检查。"
+msgstr ""
+"每个 L2 适配器当前持有的字节数，在抓取时从 ``adapter.get_usage()`` 采样。每个配置的适配器一个观测值，按 "
+"``l2_name``（适配器类型，例如 ``\"fs\"``, ``\"nixl_store\"``, "
+"``\"mooncake_store\"``）标记。与 L2 层的 ``l1_memory_usage_bytes`` 并行 — 使用它来查看每个"
+" L2 后端当前持有多少。抛出异常的适配器 ``get_usage()`` 会被静默跳过，而不是污染观测，因此某个 ``l2_name`` "
+"缺失的数据点可能意味着“未配置”或“适配器在此抓取时出错” — 请与 L2 存储/负载计数器交叉检查。"
 
-#: ../../source/mp/observability.rst:555
+#: ../../source/mp/observability.rst:560
 msgid "``lmcache_mp.num_inflight_l2_stores``"
 msgstr "``lmcache_mp.num_inflight_l2_stores``"
 
-#: ../../source/mp/observability.rst:556 ../../source/mp/observability.rst:561
-#: ../../source/mp/observability.rst:566
+#: ../../source/mp/observability.rst:561 ../../source/mp/observability.rst:566
+#: ../../source/mp/observability.rst:571
 msgid "ObservableGauge (attrs: ``l2_name``, ``adapter_index``)"
 msgstr "可观察的仪表 (属性: ``l2_name``, ``adapter_index``)"
 
-#: ../../source/mp/observability.rst:557
+#: ../../source/mp/observability.rst:562
 msgid ""
 "L2 store tasks currently executing, per adapter.  Sustained non-zero "
 "values indicate the adapter cannot keep up with the L1 → L2 write rate."
 msgstr "每个适配器当前正在执行的 L2 存储任务。 持续的非零值表明适配器无法跟上 L1 → L2 写入速率。"
 
-#: ../../source/mp/observability.rst:560
+#: ../../source/mp/observability.rst:565
 msgid "``lmcache_mp.num_inflight_l2_loads``"
 msgstr "``lmcache_mp.num_inflight_l2_loads``"
 
-#: ../../source/mp/observability.rst:562
+#: ../../source/mp/observability.rst:567
 msgid ""
 "L2 → L1 prefetch load tasks currently executing, per adapter. Pair with "
 "``num_inflight_l2_stores`` to see whether read or write traffic dominates"
@@ -1100,11 +1117,11 @@ msgstr ""
 "每个适配器当前正在执行的 L2 → L1 预取加载任务。与 ``num_inflight_l2_stores`` "
 "配对，以查看给定后端是以读取流量还是写入流量为主。"
 
-#: ../../source/mp/observability.rst:565
+#: ../../source/mp/observability.rst:570
 msgid "``lmcache_mp.inflight_load_memory_usage_bytes``"
 msgstr "``lmcache_mp.inflight_load_memory_usage_bytes``"
 
-#: ../../source/mp/observability.rst:567
+#: ../../source/mp/observability.rst:572
 msgid ""
 "L1 bytes reserved by in-flight L2 → L1 prefetch loads, per adapter.  "
 "Rising in-flight bytes alongside rising ``l1_memory_usage_bytes`` is a "
@@ -1115,11 +1132,11 @@ msgstr ""
 "每个适配器保留的 L1 字节，用于正在进行的 L2 → L1 预取加载。随着正在进行的字节和 ``l1_memory_usage_bytes`` "
 "的增加，预取保留正在挤占可缓存数据。每个适配器的字节归属遵循每个请求的 ``load_plan`` 位图，因此跨适配器求和时不会重复计算。"
 
-#: ../../source/mp/observability.rst:575
+#: ../../source/mp/observability.rst:580
 msgid "EventBus Self-Monitoring"
 msgstr "事件总线自我监控"
 
-#: ../../source/mp/observability.rst:577
+#: ../../source/mp/observability.rst:582
 msgid ""
 "Health metrics for the EventBus itself, registered by "
 "``EventBusSelfMetricsSubscriber`` on the ``lmcache.event_bus`` OTel "
@@ -1131,7 +1148,7 @@ msgstr ""
 "``lmcache.event_bus`` OTel 计量器上注册。这些指标通过 ``EventBus`` 访问器直接观察总线状态，并在每次 "
 "OTel 抓取时报告——它们不是由事件驱动的，因此丢弃或失败的订阅者无法使其静默。"
 
-#: ../../source/mp/observability.rst:583
+#: ../../source/mp/observability.rst:588
 msgid ""
 "Use them to answer: is the EventBus keeping up with publishers, is "
 "anything being dropped, and are any subscriber callbacks raising? A non-"
@@ -1143,47 +1160,47 @@ msgstr ""
 "``dropped_events_total`` 或持续非零的 ``drain_lag_seconds`` 表明总线处于 ``--event-"
 "bus-queue-size`` 并且正在尾部丢弃；请提高该标志或调查慢速订阅者。"
 
-#: ../../source/mp/observability.rst:596
+#: ../../source/mp/observability.rst:601
 msgid "``lmcache_mp.event_bus.queue_depth``"
 msgstr "``lmcache_mp.event_bus.queue_depth``"
 
-#: ../../source/mp/observability.rst:598
+#: ../../source/mp/observability.rst:603
 msgid "Events currently queued in the EventBus (``len(_queue)`` at scrape time)."
 msgstr "事件总线中当前排队的事件（在抓取时的 ``len(_queue)``）。"
 
-#: ../../source/mp/observability.rst:600
+#: ../../source/mp/observability.rst:605
 msgid "``lmcache_mp.event_bus.drain_lag_seconds``"
 msgstr "``lmcache_mp.event_bus.drain_lag_seconds``"
 
-#: ../../source/mp/observability.rst:602
+#: ../../source/mp/observability.rst:607
 msgid ""
 "Seconds since the oldest queued event was published; ``0.0`` when empty."
 "  Rising values mean the drain thread is falling behind."
 msgstr "自最旧的排队事件发布以来的秒数；当为空时为 ``0.0``。值上升意味着排出线程落后。"
 
-#: ../../source/mp/observability.rst:605
+#: ../../source/mp/observability.rst:610
 msgid "``lmcache_mp.event_bus.dropped_events_total``"
 msgstr "``lmcache_mp.event_bus.dropped_events_total``"
 
-#: ../../source/mp/observability.rst:606
+#: ../../source/mp/observability.rst:611
 msgid "ObservableCounter"
 msgstr "可观察计数器"
 
-#: ../../source/mp/observability.rst:607
+#: ../../source/mp/observability.rst:612
 msgid ""
 "Cumulative events dropped because the EventBus queue was at ``--event-"
 "bus-queue-size``."
 msgstr "由于 EventBus 队列达到 ``--event-bus-queue-size``，累计丢弃的事件。"
 
-#: ../../source/mp/observability.rst:609
+#: ../../source/mp/observability.rst:614
 msgid "``lmcache_mp.event_bus.subscriber_exceptions``"
 msgstr "``lmcache_mp.event_bus.subscriber_exceptions``"
 
-#: ../../source/mp/observability.rst:610
+#: ../../source/mp/observability.rst:615
 msgid "ObservableCounter (attr: ``subscriber_name``)"
 msgstr "可观察计数器 (属性: ``subscriber_name``)"
 
-#: ../../source/mp/observability.rst:611
+#: ../../source/mp/observability.rst:616
 msgid ""
 "Cumulative exceptions raised by subscriber callbacks during EventBus "
 "dispatch, tagged by ``subscriber_name`` (the failing callback's owning "
@@ -1192,7 +1209,7 @@ msgstr ""
 "由 EventBus 分发期间由订阅者回调引发的累计异常，按 ``subscriber_name`` "
 "标记（对于绑定方法为失败回调的拥有类，对于自由函数为 ``__qualname__``）。"
 
-#: ../../source/mp/observability.rst:616
+#: ../../source/mp/observability.rst:621
 msgid ""
 "For the full design rationale and the in-process accessors that back each"
 " metric see ``docs/design/v1/mp_observability/METRICS.md`` and "
@@ -1202,19 +1219,19 @@ msgstr ""
 "``docs/design/v1/mp_observability/METRICS.md`` 和 "
 "``docs/design/v1/mp_observability/event-bus.md``。"
 
-#: ../../source/mp/observability.rst:621
+#: ../../source/mp/observability.rst:626
 msgid "Prometheus Scrape Configuration"
 msgstr "Prometheus 抓取配置"
 
-#: ../../source/mp/observability.rst:623
+#: ../../source/mp/observability.rst:628
 msgid "Add the LMCache server as a Prometheus scrape target:"
 msgstr "将 LMCache 服务器添加为 Prometheus 抓取目标："
 
-#: ../../source/mp/observability.rst:633
+#: ../../source/mp/observability.rst:638
 msgid "Logging"
 msgstr "日志记录"
 
-#: ../../source/mp/observability.rst:635
+#: ../../source/mp/observability.rst:640
 msgid ""
 "Logging subscribers emit debug-level messages for store, retrieve, "
 "lookup, L1, and StorageManager events via Python's standard ``logging`` "
@@ -1223,7 +1240,7 @@ msgstr ""
 "日志订阅者通过 Python 的标准 ``logging`` 模块为存储、检索、查找、L1 和 StorageManager "
 "事件发出调试级别的消息。"
 
-#: ../../source/mp/observability.rst:638
+#: ../../source/mp/observability.rst:643
 msgid ""
 "When OpenTelemetry is installed, ``init_logger`` automatically attaches "
 "an OTel ``LoggingHandler`` so that log records are forwarded to any "
@@ -1234,56 +1251,56 @@ msgstr ""
 "``LoggingHandler``，以便将日志记录转发到任何配置的 OTel ``LoggerProvider``。该处理程序遵循 "
 "``LMCACHE_LOG_LEVEL`` 环境变量。"
 
-#: ../../source/mp/observability.rst:647
+#: ../../source/mp/observability.rst:652
 msgid "Key log messages:"
 msgstr "关键日志消息："
 
-#: ../../source/mp/observability.rst:653
+#: ../../source/mp/observability.rst:658
 msgid "Level"
 msgstr "级别"
 
-#: ../../source/mp/observability.rst:654
+#: ../../source/mp/observability.rst:659
 msgid "Message"
 msgstr "消息"
 
-#: ../../source/mp/observability.rst:655 ../../source/mp/observability.rst:657
-#: ../../source/mp/observability.rst:659
+#: ../../source/mp/observability.rst:660 ../../source/mp/observability.rst:662
+#: ../../source/mp/observability.rst:664
 msgid "INFO"
 msgstr "信息"
 
-#: ../../source/mp/observability.rst:656
+#: ../../source/mp/observability.rst:661
 msgid "``Stored N tokens in X seconds``"
 msgstr "``在 X 秒内存储了 N 个令牌``"
 
-#: ../../source/mp/observability.rst:658
+#: ../../source/mp/observability.rst:663
 msgid "``Retrieved N tokens in X seconds``"
 msgstr ""
 "```\n"
 "在 X 秒内检索到 N 个令牌``"
 
-#: ../../source/mp/observability.rst:660
+#: ../../source/mp/observability.rst:665
 msgid "``Prefetch request completed (L1+L2): N/M prefix hits``"
 msgstr ""
 "```\n"
 "预取请求完成 (L1+L2): N/M 前缀命中``"
 
-#: ../../source/mp/observability.rst:661 ../../source/mp/observability.rst:663
+#: ../../source/mp/observability.rst:666 ../../source/mp/observability.rst:668
 msgid "DEBUG"
 msgstr "调试"
 
-#: ../../source/mp/observability.rst:662
+#: ../../source/mp/observability.rst:667
 msgid "``MP store start: session=... device=...``"
 msgstr "``MP store start: session=... device=...``"
 
-#: ../../source/mp/observability.rst:664
+#: ../../source/mp/observability.rst:669
 msgid "``MP retrieve end: session=... retrieved_count=...``"
 msgstr "``MP retrieve end: session=... retrieved_count=...``"
 
-#: ../../source/mp/observability.rst:667
+#: ../../source/mp/observability.rst:672
 msgid "Tracing"
 msgstr "追踪"
 
-#: ../../source/mp/observability.rst:671
+#: ../../source/mp/observability.rst:676
 msgid ""
 "``--enable-tracing`` **requires** ``--otlp-endpoint`` to be set. The "
 "server will refuse to start if tracing is enabled without an OTLP "
@@ -1292,7 +1309,7 @@ msgstr ""
 "``--enable-tracing`` **要求** 设置 ``--otlp-endpoint``。如果在没有 OTLP "
 "端点的情况下启用追踪，服务器将拒绝启动，因为没有本地回退用于追踪导出。"
 
-#: ../../source/mp/observability.rst:675
+#: ../../source/mp/observability.rst:680
 msgid ""
 "When tracing is enabled (``--enable-tracing --otlp-endpoint <URL>``), the"
 " tracing subscriber creates OTel spans from START/END event pairs:"
@@ -1300,15 +1317,15 @@ msgstr ""
 "当启用追踪时（``--enable-tracing --otlp-endpoint <URL>``），追踪订阅者会从 START/END "
 "事件对创建 OTel spans："
 
-#: ../../source/mp/observability.rst:678
+#: ../../source/mp/observability.rst:683
 msgid "``mp.store`` — from ``MP_STORE_START`` to ``MP_STORE_END``"
 msgstr "``mp.store`` — 从 ``MP_STORE_START`` 到 ``MP_STORE_END``"
 
-#: ../../source/mp/observability.rst:679
+#: ../../source/mp/observability.rst:684
 msgid "``mp.retrieve`` — from ``MP_RETRIEVE_START`` to ``MP_RETRIEVE_END``"
 msgstr "``mp.retrieve`` — from ``MP_RETRIEVE_START`` to ``MP_RETRIEVE_END``"
 
-#: ../../source/mp/observability.rst:680
+#: ../../source/mp/observability.rst:685
 msgid ""
 "``mp.lookup_prefetch`` — from ``MP_LOOKUP_PREFETCH_START`` to "
 "``MP_LOOKUP_PREFETCH_END``"
@@ -1316,23 +1333,23 @@ msgstr ""
 "``mp.lookup_prefetch`` — from ``MP_LOOKUP_PREFETCH_START`` to "
 "``MP_LOOKUP_PREFETCH_END``"
 
-#: ../../source/mp/observability.rst:682
+#: ../../source/mp/observability.rst:687
 msgid ""
 "Each span carries event metadata as span attributes (e.g. ``device``, "
 "``stored_count``, ``found_count``)."
 msgstr "每个跨度携带事件元数据作为跨度属性（例如 ``device``、``stored_count``、``found_count``）。"
 
-#: ../../source/mp/observability.rst:685
+#: ../../source/mp/observability.rst:690
 msgid ""
 "View traces in any OTel-compatible backend such as **Jaeger** or "
 "**Grafana Tempo**."
 msgstr "在任何 OTel 兼容的后端中查看追踪，例如 **Jaeger** 或 **Grafana Tempo**。"
 
-#: ../../source/mp/observability.rst:701
+#: ../../source/mp/observability.rst:706
 msgid "Per-Request Hit-Rate Attributes"
 msgstr "每个请求的命中率属性"
 
-#: ../../source/mp/observability.rst:703
+#: ../../source/mp/observability.rst:708
 msgid ""
 "Each session is wrapped in a per-request root span — ``request`` for the "
 "standard MP path and ``cb.request`` for the CacheBlend path — that nests "
@@ -1344,39 +1361,39 @@ msgstr ""
 "``cb.request``——它将所有子跨度（``mp.store``、``mp.retrieve``、``mp.lookup_prefetch``）嵌套在其下。当查找阶段结束时，根跨度会用三个"
 " OTel 属性进行注释，这些属性总结了请求级别的缓存命中率："
 
-#: ../../source/mp/observability.rst:714
+#: ../../source/mp/observability.rst:719
 msgid "OTel type"
 msgstr "OTel 类型"
 
-#: ../../source/mp/observability.rst:716
+#: ../../source/mp/observability.rst:721
 msgid "``hit_tokens``"
 msgstr "``hit_tokens``"
 
-#: ../../source/mp/observability.rst:717 ../../source/mp/observability.rst:720
+#: ../../source/mp/observability.rst:722 ../../source/mp/observability.rst:725
 msgid "``int``"
 msgstr "``int``"
 
-#: ../../source/mp/observability.rst:718
+#: ../../source/mp/observability.rst:723
 msgid "Tokens served from L1+L2 (numerator)."
 msgstr "从 L1+L2 服务的令牌（分子）。"
 
-#: ../../source/mp/observability.rst:719
+#: ../../source/mp/observability.rst:724
 msgid "``requested_tokens``"
 msgstr "``requested_tokens``"
 
-#: ../../source/mp/observability.rst:721
+#: ../../source/mp/observability.rst:726
 msgid "Chunk-aligned tokens submitted for lookup (denominator)."
 msgstr "用于查找的块对齐令牌（分母）。"
 
-#: ../../source/mp/observability.rst:722
+#: ../../source/mp/observability.rst:727
 msgid "``hit_rate``"
 msgstr "``hit_rate``"
 
-#: ../../source/mp/observability.rst:723
+#: ../../source/mp/observability.rst:728
 msgid "``float``"
 msgstr "``float``"
 
-#: ../../source/mp/observability.rst:724
+#: ../../source/mp/observability.rst:729
 msgid ""
 "``hit_tokens / requested_tokens``; ``0.0`` when the denominator is zero."
 "  Stored as a precomputed float because trace UIs (Tempo, Jaeger) cannot "
@@ -1385,7 +1402,7 @@ msgstr ""
 "``hit_tokens / requested_tokens``; ``0.0`` "
 "当分母为零时。存储为预计算的浮点数，因为跟踪用户界面（Tempo, Jaeger）无法在查询时从两个整数属性中推导出它。"
 
-#: ../../source/mp/observability.rst:728
+#: ../../source/mp/observability.rst:733
 msgid ""
 "The attributes are written when ``MP_LOOKUP_PREFETCH_END`` (standard MP "
 "path) or ``CB_LOOKUP_END`` (CacheBlend path) is processed — while the "
@@ -1397,11 +1414,11 @@ msgstr ""
 "路径）时被写入——当根跨度仍然打开时。**仅存储请求**在未调用 ``lookup_prefetch_start()`` "
 "时不会为查找阶段发出结束事件，因此它们的根跨度将不会携带这些属性。"
 
-#: ../../source/mp/observability.rst:734
+#: ../../source/mp/observability.rst:739
 msgid "Example TraceQL queries (Grafana Tempo):"
 msgstr "示例 TraceQL 查询 (Grafana Tempo):"
 
-#: ../../source/mp/observability.rst:747
+#: ../../source/mp/observability.rst:752
 msgid ""
 "For the full event-to-span mapping and the registry pattern that links "
 "child spans back to the root see ``docs/design/observability/request-"
@@ -1410,11 +1427,11 @@ msgstr ""
 "有关完整的事件到跨度映射以及将子跨度链接回根的注册模式，请参见源树中的 ``docs/design/observability/request-"
 "event-span.md``。"
 
-#: ../../source/mp/observability.rst:754
+#: ../../source/mp/observability.rst:759
 msgid "Trace Recording"
 msgstr "追踪记录"
 
-#: ../../source/mp/observability.rst:758
+#: ../../source/mp/observability.rst:763
 msgid ""
 "Trace recording is **distinct from** ``--enable-tracing`` (OTel spans). "
 "Trace recording captures every ``StorageManager`` public-API call to a "
@@ -1429,7 +1446,7 @@ msgstr ""
 "vLLM，并且最终无需 GPU。``--enable-tracing`` 将实时 OTel spans 导出到 OTLP "
 "端点以进行在线可观察性。这两个功能是独立的，可以一起使用。"
 
-#: ../../source/mp/observability.rst:766
+#: ../../source/mp/observability.rst:771
 #, python-brace-format
 msgid ""
 "When ``--trace-level storage`` is set, LMCache records every call to "
@@ -1441,7 +1458,7 @@ msgstr ""
 "``StorageManager.{reserve_write, finish_write, submit_prefetch_task, "
 "read_prefetched_results, finish_read_prefetched}`` 的调用记录到一个二进制文件中，以便后续重放。"
 
-#: ../../source/mp/observability.rst:771
+#: ../../source/mp/observability.rst:776
 msgid ""
 "Recording is **off by default** and adds near-zero overhead when off (a "
 "single boolean check per ``StorageManager`` call). When on, recording "
@@ -1450,48 +1467,48 @@ msgstr ""
 "记录默认是**关闭的**，关闭时几乎没有开销（每个``StorageManager``调用只需进行一次布尔检查）。开启时，记录发生在 "
 "EventBus 排出线程上，而不在请求路径上。"
 
-#: ../../source/mp/observability.rst:776
+#: ../../source/mp/observability.rst:781
 msgid "Capturing a trace"
 msgstr "捕获跟踪"
 
-#: ../../source/mp/observability.rst:778
+#: ../../source/mp/observability.rst:783
 msgid "With an explicit output path:"
 msgstr "带有显式输出路径："
 
-#: ../../source/mp/observability.rst:786
+#: ../../source/mp/observability.rst:791
 msgid "With an implicit timestamped output path under ``$TMPDIR``:"
 msgstr "在 ``$TMPDIR`` 下使用隐式时间戳输出路径："
 
-#: ../../source/mp/observability.rst:797
+#: ../../source/mp/observability.rst:802
 msgid ""
 "The trace file is closed cleanly on shutdown (SIGTERM is handled by the "
 "EventBus stop path)."
 msgstr "在关闭时（SIGTERM 由 EventBus 停止路径处理），跟踪文件会被干净地关闭。"
 
-#: ../../source/mp/observability.rst:801
+#: ../../source/mp/observability.rst:806
 msgid "Replay"
 msgstr "重放"
 
-#: ../../source/mp/observability.rst:803
+#: ../../source/mp/observability.rst:808
 msgid ""
 "Replaying a recorded trace, plus the full set of CLI flags for driving, "
 "monitoring, and exporting replay results, is covered in its own page: "
 ":doc:`tracing_and_debugging`."
 msgstr "重放记录的追踪，以及用于驱动、监控和导出重放结果的完整 CLI 标志集，详见其独立页面： :doc:`tracing_and_debugging`。"
 
-#: ../../source/mp/observability.rst:808
+#: ../../source/mp/observability.rst:813
 msgid "What is captured (and what is not)"
 msgstr "捕获了什么（以及未捕获的内容）"
 
-#: ../../source/mp/observability.rst:810
+#: ../../source/mp/observability.rst:815
 msgid "**Captured:**"
 msgstr "**捕获：**"
 
-#: ../../source/mp/observability.rst:812
+#: ../../source/mp/observability.rst:817
 msgid "The fully-qualified name of every decorated ``StorageManager`` call."
 msgstr "每个被装饰的 ``StorageManager`` 调用的完全限定名称。"
 
-#: ../../source/mp/observability.rst:813
+#: ../../source/mp/observability.rst:818
 msgid ""
 "Each call's input arguments (e.g. ``keys``, ``layout_desc``, ``mode``, "
 "``extra_count``, ``external_request_id``)."
@@ -1499,11 +1516,11 @@ msgstr ""
 "每个调用的输入参数（例如 ``keys``, ``layout_desc``, ``mode``, ``extra_count``, "
 "``external_request_id``）。"
 
-#: ../../source/mp/observability.rst:815
+#: ../../source/mp/observability.rst:820
 msgid "Wall-clock and monotonic timestamps of each call."
 msgstr "每个调用的墙钟时间和单调时间戳。"
 
-#: ../../source/mp/observability.rst:816
+#: ../../source/mp/observability.rst:821
 msgid ""
 "A header carrying a trace schema version, start times, and a SHA-256 "
 "digest of the active ``StorageManagerConfig`` so replay can detect "
@@ -1512,31 +1529,31 @@ msgstr ""
 "一个包含跟踪模式版本、开始时间和活动的 ``StorageManagerConfig`` 的 SHA-256 "
 "摘要的头部，以便重放可以检测到不匹配的配置。"
 
-#: ../../source/mp/observability.rst:820
+#: ../../source/mp/observability.rst:825
 msgid "**Not captured:**"
 msgstr "**未捕获：**"
 
-#: ../../source/mp/observability.rst:822
+#: ../../source/mp/observability.rst:827
 msgid ""
 "KV tensor bytes. Replay exercises bookkeeping and controller logic; "
 "payloads at replay time are zeros."
 msgstr "KV 张量字节。重放练习的记账和控制逻辑；重放时的有效载荷为零。"
 
-#: ../../source/mp/observability.rst:824
+#: ../../source/mp/observability.rst:829
 msgid ""
 "Calls inside the ``MPCacheEngine``, the message queue, or any GPU-copy "
 "code. These layers are **out of scope** for the storage trace level."
 msgstr "在 ``MPCacheEngine``、消息队列或任何 GPU 复制代码中的调用。这些层级在存储跟踪级别中是 **超出范围** 的。"
 
-#: ../../source/mp/observability.rst:829
+#: ../../source/mp/observability.rst:834
 msgid "File format"
 msgstr "文件格式"
 
-#: ../../source/mp/observability.rst:831
+#: ../../source/mp/observability.rst:836
 msgid "A length-prefixed `msgpack <https://msgpack.org/>`_ stream:"
 msgstr "一个长度前缀的 `msgpack <https://msgpack.org/>`_ 流："
 
-#: ../../source/mp/observability.rst:840
+#: ../../source/mp/observability.rst:845
 msgid ""
 "The ``Header`` carries a magic prefix (``LMCT``), a format version, the "
 "trace level (``storage`` today), a trace schema version, start "
@@ -1548,7 +1565,7 @@ msgstr ""
 "``storage``)、一个跟踪模式版本、开始时间戳和 StorageManagerConfig 摘要。每个 ``Record`` "
 "包含一个相对时间戳、一个墙钟时间戳、完全限定的调用位置 (``qualname``) 和一个参数字典。"
 
-#: ../../source/mp/observability.rst:846
+#: ../../source/mp/observability.rst:851
 msgid ""
 "The format is deliberately extensible: future trace **levels** (``mq``, "
 "``gpu``) will share this layout and use the ``level`` header field to "
@@ -1558,7 +1575,7 @@ msgstr ""
 "该格式故意具有可扩展性：未来的跟踪 **级别**（``mq``，``gpu``）将共享此布局，并使用 ``level`` "
 "头字段进行区分。额外捕获的操作会添加新的 ``qualname`` 字符串，而不会提升格式版本。"
 
-#: ../../source/mp/observability.rst:851
+#: ../../source/mp/observability.rst:856
 msgid ""
 "For the full design rationale see "
 "``docs/design/v1/mp_observability/trace.md`` in the source tree."
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/recipes/gemma3.po b/docs/source/locale/zh_CN/LC_MESSAGES/recipes/gemma3.po
new file mode 100644
index 0000000000..1493de0fd0
--- /dev/null
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/recipes/gemma3.po
@@ -0,0 +1,151 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2024, The LMCache Team
+# This file is distributed under the same license as the LMCache package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2026.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: LMCache \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2026-06-08 10:45+0000\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: zh_CN\n"
+"Language-Team: zh_CN <LL@li.org>\n"
+"Plural-Forms: nplurals=1; plural=0;\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.18.0\n"
+
+#: ../../source/recipes/gemma3.rst:4
+msgid "Gemma3ForConditionalGeneration"
+msgstr "Gemma3ForConditionalGeneration"
+
+#: ../../source/recipes/gemma3.rst:7
+msgid "Validated models"
+msgstr "验证过的模型"
+
+#: ../../source/recipes/gemma3.rst:9
+msgid "`google/gemma-3-4b-it <https://huggingface.co/google/gemma-3-4b-it>`_"
+msgstr "`google/gemma-3-4b-it <https://huggingface.co/google/gemma-3-4b-it>`_"
+
+#: ../../source/recipes/gemma3.rst
+msgid "vLLM"
+msgstr "vLLM"
+
+#: ../../source/recipes/gemma3.rst:16
+msgid ""
+"**Engine documentation:** `Gemma 3 in vLLM supported models "
+"<https://docs.vllm.ai/en/latest/models/supported_models.html#multimodal-"
+"language-models>`_ (architecture ``Gemma3ForConditionalGeneration``)."
+msgstr "**引擎文档：** `Gemma 3 在 vLLM 支持的模型 <https://docs.vllm.ai/en/latest/models/supported_models.html#multimodal-language-models>`_ (架构 ``Gemma3ForConditionalGeneration``)。"
+
+#: ../../source/recipes/gemma3.rst:21
+msgid "**Status:** Validated with LMCache."
+msgstr "**状态:** 已通过 LMCache 验证。"
+
+#: ../../source/recipes/gemma3.rst:23
+msgid "Start the LMCache MP server:"
+msgstr "启动 LMCache MP 服务器："
+
+#: ../../source/recipes/gemma3.rst:31
+msgid "Start vLLM with the LMCache MP connector:"
+msgstr "启动 vLLM 与 LMCache MP 连接器："
+
+#: ../../source/recipes/gemma3.rst:42
+msgid ""
+"Gemma 3 interleaves local (sliding-window) and global (full) attention "
+"layers, so vLLM keeps its **hybrid KV cache manager** on and exposes "
+"multiple KV cache groups. LMCache stores and retrieves all of them "
+"through its hybrid memory allocator support -- ``LMCacheMPConnector`` "
+"advertises ``SupportsHMA``, so vLLM does not auto-disable the hybrid "
+"manager and no extra configuration is required."
+msgstr "Gemma 3 交错使用局部（滑动窗口）和全局（完整）注意力层，因此 vLLM 保持其 **混合 KV Cache 管理器** 开启，并暴露多个 KV Cache 组。LMCache 通过其混合内存分配器支持存储和检索所有这些组 -- ``LMCacheMPConnector`` 宣告 ``SupportsHMA``，因此 vLLM 不会自动禁用混合管理器，也不需要额外的配置。"
+
+#: ../../source/recipes/gemma3.rst:49
+msgid ""
+"``google/gemma-3-4b-it`` is a gated model; authenticate with the Hugging "
+"Face Hub (e.g. set ``HF_TOKEN``) before serving. Adjust ``--tensor-"
+"parallel-size`` to match your hardware. For the generic LMCache + vLLM "
+"wiring (ports, remote hosts, in-process mode), see "
+":doc:`../mp/quickstart`."
+msgstr "``google/gemma-3-4b-it`` 是一个受限模型；在服务之前，请先通过 Hugging Face Hub 进行身份验证（例如，设置 ``HF_TOKEN``）。调整 ``--tensor-parallel-size`` 以匹配您的硬件。有关通用 LMCache + vLLM 连接（端口、远程主机、进程内模式），请参见 :doc:`../mp/quickstart`。"
+
+#: ../../source/recipes/gemma3.rst:55
+msgid ""
+"If there are any issues with vLLM setup, please refer to the `vLLM "
+"Recipes <https://docs.vllm.ai/projects/recipes/en/latest/index.html>`_ "
+"for more details."
+msgstr "如果在 vLLM 设置中遇到任何问题，请参考 `vLLM Recipes <https://docs.vllm.ai/projects/recipes/en/latest/index.html>`_ 以获取更多详细信息。"
+
+#: ../../source/recipes/gemma3.rst
+msgid "SGLang"
+msgstr "SGLang"
+
+#: ../../source/recipes/gemma3.rst:61
+msgid "**Status:** Not validated with LMCache."
+msgstr "**状态：** 未通过 LMCache 验证。"
+
+#: ../../source/recipes/gemma3.rst
+msgid "TRT-LLM"
+msgstr "TRT-LLM"
+
+#: ../../source/recipes/gemma3.rst:65
+msgid "**Status:** Not supported. LMCache TRT-LLM integration is in progress."
+msgstr "**状态：** 不支持。LMCache TRT-LLM 集成正在进行中。"
+
+#: ../../source/recipes/gemma3.rst:68
+msgid "CacheBlend support"
+msgstr "CacheBlend 支持"
+
+#: ../../source/recipes/gemma3.rst:70
+msgid "Not validated."
+msgstr "未验证。"
+
+#: ../../source/recipes/gemma3.rst:73
+msgid "Compression support"
+msgstr "压缩支持"
+
+#: ../../source/recipes/gemma3.rst:79
+msgid "Method"
+msgstr "方法"
+
+#: ../../source/recipes/gemma3.rst:80
+msgid "Status"
+msgstr "状态"
+
+#: ../../source/recipes/gemma3.rst:81
+msgid "Notes"
+msgstr "笔记"
+
+#: ../../source/recipes/gemma3.rst:82
+msgid ":doc:`CacheGen <../kv_cache_optimizations/compression/cachegen>`"
+msgstr ":doc:`CacheGen <../kv_cache_optimizations/compression/cachegen>`"
+
+#: ../../source/recipes/gemma3.rst:83
+msgid "Not validated"
+msgstr "未验证"
+
+#: ../../source/recipes/gemma3.rst:87
+msgid "Caveats"
+msgstr "注意事项"
+
+#: ../../source/recipes/gemma3.rst:89
+msgid ""
+"**Gated model.** ``google/gemma-3-4b-it`` requires accepting the license "
+"on Hugging Face and authenticating (e.g. ``HF_TOKEN``) before it can be "
+"served."
+msgstr "**门控模型。** ``google/gemma-3-4b-it`` 需要在 Hugging Face 上接受许可并进行身份验证（例如 ``HF_TOKEN``），才能提供服务。"
+
+#: ../../source/recipes/gemma3.rst:91
+msgid ""
+"**Hybrid attention.** Gemma 3 is a hybrid (sliding-window + full-"
+"attention) model. LMCache transfers every KV cache group via its hybrid "
+"memory allocator support, so caching works transparently. This applies to"
+" the standard paged attention used by Gemma 3; Mamba / linear-attention "
+"hybrids (whose recurrent state caches LMCache cannot yet transfer) are "
+"not supported."
+msgstr "**混合注意力。** Gemma 3 是一个混合（滑动窗口 + 全注意力）模型。LMCache 通过其混合内存分配器支持转移每个 KV Cache 组，因此缓存可以透明地工作。这适用于 Gemma 3 使用的标准分页注意力；不支持 Mamba / 线性注意力混合（其递归状态缓存 LMCache 目前尚无法转移）。"
+
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/recipes/gemma4.po b/docs/source/locale/zh_CN/LC_MESSAGES/recipes/gemma4.po
index e7435c648b..b12e72b16e 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/recipes/gemma4.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/recipes/gemma4.po
@@ -8,7 +8,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: LMCache \n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2026-05-18 17:25+0000\n"
+"POT-Creation-Date: 2026-06-08 10:45+0000\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language: zh_CN\n"
@@ -31,48 +31,63 @@ msgstr "验证过的模型"
 msgid "`google/gemma-4-31B-it <https://huggingface.co/google/gemma-4-31B-it>`_"
 msgstr "`google/gemma-4-31B-it <https://huggingface.co/google/gemma-4-31B-it>`_"
 
+#: ../../source/recipes/gemma4.rst:10
+msgid "`google/gemma-4-E4B-it <https://huggingface.co/google/gemma-4-E4B-it>`_"
+msgstr "`google/gemma-4-E4B-it <https://huggingface.co/google/gemma-4-E4B-it>`_"
+
 #: ../../source/recipes/gemma4.rst
 msgid "vLLM"
 msgstr "vLLM"
 
-#: ../../source/recipes/gemma4.rst:16
+#: ../../source/recipes/gemma4.rst:17
 msgid ""
 "**Engine documentation:** `Gemma 4 in vLLM supported models "
 "<https://docs.vllm.ai/en/latest/models/supported_models.html#multimodal-"
 "language-models>`_ (architecture ``Gemma4ForConditionalGeneration``)."
-msgstr "**引擎文档:** `Gemma 4 在 vLLM 支持的模型 <https://docs.vllm.ai/en/latest/models/supported_models.html#multimodal-language-models>`_ (架构 ``Gemma4ForConditionalGeneration``)。"
+msgstr ""
+"**引擎文档:** `Gemma 4 在 vLLM 支持的模型 "
+"<https://docs.vllm.ai/en/latest/models/supported_models.html#multimodal-"
+"language-models>`_ (架构 ``Gemma4ForConditionalGeneration``)。"
 
-#: ../../source/recipes/gemma4.rst:21
+#: ../../source/recipes/gemma4.rst:22
 msgid "**Status:** Validated with LMCache."
 msgstr "**状态:** 已通过 LMCache 验证。"
 
-#: ../../source/recipes/gemma4.rst:23
+#: ../../source/recipes/gemma4.rst:24
 msgid "Start the LMCache MP server:"
 msgstr "启动 LMCache MP 服务器："
 
-#: ../../source/recipes/gemma4.rst:31
+#: ../../source/recipes/gemma4.rst:32
 msgid "Start vLLM with the LMCache MP connector:"
 msgstr "启动 vLLM 与 LMCache MP 连接器："
 
-#: ../../source/recipes/gemma4.rst:42
+#: ../../source/recipes/gemma4.rst:43
+msgid "The smaller ``google/gemma-4-E4B-it`` runs on a single GPU:"
+msgstr "较小的 ``google/gemma-4-E4B-it`` 在单个 GPU 上运行："
+
+#: ../../source/recipes/gemma4.rst:53
 msgid ""
 "Adjust ``--tensor-parallel-size`` to match your hardware. For the generic"
 " LMCache + vLLM wiring (ports, remote hosts, in-process mode), see "
 ":doc:`../mp/quickstart`."
-msgstr "调整 ``--tensor-parallel-size`` 以匹配您的硬件。有关通用 LMCache + vLLM 连接（端口、远程主机、进程内模式），请参见 :doc:`../mp/quickstart`。"
+msgstr ""
+"调整 ``--tensor-parallel-size`` 以匹配您的硬件。有关通用 LMCache + vLLM "
+"连接（端口、远程主机、进程内模式），请参见 :doc:`../mp/quickstart`。"
 
-#: ../../source/recipes/gemma4.rst:46
+#: ../../source/recipes/gemma4.rst:57
 msgid ""
 "If there are any issues with vLLM setup, please refer to the `vLLM "
 "Recipes <https://docs.vllm.ai/projects/recipes/en/latest/index.html>`_ "
 "for more details."
-msgstr "如果在 vLLM 设置中遇到任何问题，请参考 `vLLM Recipes <https://docs.vllm.ai/projects/recipes/en/latest/index.html>`_ 以获取更多详细信息。"
+msgstr ""
+"如果在 vLLM 设置中遇到任何问题，请参考 `vLLM Recipes "
+"<https://docs.vllm.ai/projects/recipes/en/latest/index.html>`_ 以获取更多详细信息。"
 
 #: ../../source/recipes/gemma4.rst
 msgid "SGLang"
 msgstr "SGLang"
 
-#: ../../source/recipes/gemma4.rst:52
+#: ../../source/recipes/gemma4.rst:63
 msgid "**Status:** Not validated with LMCache."
 msgstr "**状态：** 未通过 LMCache 验证。"
 
@@ -80,43 +95,69 @@ msgstr "**状态：** 未通过 LMCache 验证。"
 msgid "TRT-LLM"
 msgstr "TRT-LLM"
 
-#: ../../source/recipes/gemma4.rst:56
+#: ../../source/recipes/gemma4.rst:67
 msgid "**Status:** Not supported. LMCache TRT-LLM integration is in progress."
 msgstr "**状态：** 不支持。LMCache TRT-LLM 集成正在进行中。"
 
-#: ../../source/recipes/gemma4.rst:59
+#: ../../source/recipes/gemma4.rst:70
 msgid "CacheBlend support"
 msgstr "CacheBlend 支持"
 
-#: ../../source/recipes/gemma4.rst:62
+#: ../../source/recipes/gemma4.rst:73
 msgid "Compression support"
 msgstr "压缩支持"
 
-#: ../../source/recipes/gemma4.rst:68
+#: ../../source/recipes/gemma4.rst:79
 msgid "Method"
 msgstr "方法"
 
-#: ../../source/recipes/gemma4.rst:69
+#: ../../source/recipes/gemma4.rst:80
 msgid "Status"
 msgstr "状态"
 
-#: ../../source/recipes/gemma4.rst:70
+#: ../../source/recipes/gemma4.rst:81
 msgid "Notes"
 msgstr "笔记"
 
-#: ../../source/recipes/gemma4.rst:71
+#: ../../source/recipes/gemma4.rst:82
 msgid ":doc:`CacheGen <../kv_cache_optimizations/compression/cachegen>`"
 msgstr ":doc:`CacheGen <../kv_cache_optimizations/compression/cachegen>`"
 
-#: ../../source/recipes/gemma4.rst:72
+#: ../../source/recipes/gemma4.rst:83
 msgid "Not validated"
 msgstr "未验证"
 
-#: ../../source/recipes/gemma4.rst:76
+#: ../../source/recipes/gemma4.rst:87
 msgid "Caveats"
 msgstr "注意事项"
 
-#: ../../source/recipes/gemma4.rst:78
-msgid "None known."
-msgstr "没有已知的问题。"
+#: ../../source/recipes/gemma4.rst:89
+msgid ""
+"**Hybrid KV cache with heterogeneous block sizes.** Gemma 4 interleaves "
+"sliding-window and full-attention layers whose head dimensions differ "
+"(sliding 256, full 512), so vLLM unifies the physical page size by giving"
+" the two attention types different ``block_size``\\ s (e.g. "
+"``google/gemma-4-E4B-it``: sliding 32, full 16). LMCache stores and "
+"retrieves each KV cache group in its own block size; no extra flags are "
+"required."
+msgstr "**混合 KV Cache，具有异构块大小。** Gemma 4 交错了滑动窗口和全注意力层，其头部维度不同（滑动 256，全 512），因此 vLLM 通过为这两种注意力类型提供不同的 ``block_size``\\\\ s（例如 ``google/gemma-4-E4B-it``：滑动 32，全 16）来统一物理页面大小。LMCache 在其自己的块大小中存储和检索每个 KV Cache 组；不需要额外的标志。"
+
+#: ../../source/recipes/gemma4.rst:95
+msgid ""
+"**Cross-layer KV sharing.** ``google/gemma-4-E4B-it`` reuses some layers'"
+" KV caches across layers. LMCache stores the cache-owning layers only; "
+"the sharing layers' KV lives in the same blocks and is restored "
+"automatically."
+msgstr "**跨层 KV 共享。** ``google/gemma-4-E4B-it`` 在层与层之间重用一些层的 KV 缓存。LMCache 仅存储拥有缓存的层；共享层的 KV 存储在相同的块中，并会自动恢复。"
+
+#: ../../source/recipes/gemma4.rst:98
+msgid ""
+"**Determinism.** Gemma 4 runs on the Triton attention backend, which is "
+"not bit-exact under vLLM's batch-invariant mode, so a retrieved result "
+"may differ from a freshly computed one by a small numerical margin rather"
+" than being byte-identical."
+msgstr "**确定性。** Gemma 4 在 Triton 注意力后端上运行，该后端在 vLLM 的批量不变模式下不是位精确的，因此检索到的结果可能与新计算的结果存在小的数值差异，而不是字节完全相同。"
+
+#~ msgid "None known."
+#~ msgstr "没有已知的问题。"
 
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/recipes/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/recipes/index.po
index ced9f6bde5..a8b62446d9 100644
--- a/docs/source/locale/zh_CN/LC_MESSAGES/recipes/index.po
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/recipes/index.po
@@ -8,7 +8,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: LMCache \n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2026-05-29 22:44+0000\n"
+"POT-Creation-Date: 2026-06-08 10:45+0000\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language: zh_CN\n"
@@ -119,21 +119,23 @@ msgstr "``MiniMaxM2ForCausalLM``"
 msgid "``MiniMaxAI/MiniMax-M2``"
 msgstr "``MiniMaxAI/MiniMax-M2``"
 
-#: ../../source/recipes/index.rst:48 ../../source/recipes/index.rst:55
-#: ../../source/recipes/index.rst:62 ../../source/recipes/index.rst:69
-#: ../../source/recipes/index.rst:76 ../../source/recipes/index.rst:83
-#: ../../source/recipes/index.rst:90 ../../source/recipes/index.rst:97
+#: ../../source/recipes/index.rst:48 ../../source/recipes/index.rst:56
+#: ../../source/recipes/index.rst:63 ../../source/recipes/index.rst:70
+#: ../../source/recipes/index.rst:77 ../../source/recipes/index.rst:84
+#: ../../source/recipes/index.rst:91 ../../source/recipes/index.rst:98
+#: ../../source/recipes/index.rst:105
 msgid "✓"
 msgstr "✓"
 
 #: ../../source/recipes/index.rst:49 ../../source/recipes/index.rst:50
-#: ../../source/recipes/index.rst:56 ../../source/recipes/index.rst:57
-#: ../../source/recipes/index.rst:63 ../../source/recipes/index.rst:64
-#: ../../source/recipes/index.rst:70 ../../source/recipes/index.rst:71
-#: ../../source/recipes/index.rst:77 ../../source/recipes/index.rst:78
-#: ../../source/recipes/index.rst:84 ../../source/recipes/index.rst:85
-#: ../../source/recipes/index.rst:91 ../../source/recipes/index.rst:92
-#: ../../source/recipes/index.rst:98 ../../source/recipes/index.rst:99
+#: ../../source/recipes/index.rst:57 ../../source/recipes/index.rst:58
+#: ../../source/recipes/index.rst:64 ../../source/recipes/index.rst:65
+#: ../../source/recipes/index.rst:71 ../../source/recipes/index.rst:72
+#: ../../source/recipes/index.rst:78 ../../source/recipes/index.rst:79
+#: ../../source/recipes/index.rst:85 ../../source/recipes/index.rst:86
+#: ../../source/recipes/index.rst:92 ../../source/recipes/index.rst:93
+#: ../../source/recipes/index.rst:99 ../../source/recipes/index.rst:100
+#: ../../source/recipes/index.rst:106 ../../source/recipes/index.rst:107
 msgid "—"
 msgstr "—"
 
@@ -145,105 +147,121 @@ msgstr ":doc:`minimax_m2`"
 msgid "``Gemma4ForConditionalGeneration``"
 msgstr "``Gemma4ForConditionalGeneration``"
 
-#: ../../source/recipes/index.rst:54
+#: ../../source/recipes/index.rst
 msgid "``google/gemma-4-31B-it``"
 msgstr "``google/gemma-4-31B-it``"
 
-#: ../../source/recipes/index.rst:58
+#: ../../source/recipes/index.rst
+msgid "``google/gemma-4-E4B-it``"
+msgstr "``google/gemma-4-E4B-it``"
+
+#: ../../source/recipes/index.rst:59
 msgid ":doc:`gemma4`"
 msgstr ":doc:`gemma4`"
 
-#: ../../source/recipes/index.rst:60
+#: ../../source/recipes/index.rst:61
+msgid "``Gemma3ForConditionalGeneration``"
+msgstr "``Gemma3ForConditionalGeneration``"
+
+#: ../../source/recipes/index.rst:62
+msgid "``google/gemma-3-4b-it``"
+msgstr "``google/gemma-3-4b-it``"
+
+#: ../../source/recipes/index.rst:66
+msgid ":doc:`gemma3`"
+msgstr "`:doc:`gemma3``"
+
+#: ../../source/recipes/index.rst:68
 msgid "``MistralForCausalLM``"
 msgstr "``MistralForCausalLM``"
 
-#: ../../source/recipes/index.rst:61
+#: ../../source/recipes/index.rst:69
 msgid "``mistralai/Devstral-2-123B-Instruct-2512``"
 msgstr "``mistralai/Devstral-2-123B-Instruct-2512``"
 
-#: ../../source/recipes/index.rst:65
+#: ../../source/recipes/index.rst:73
 msgid ":doc:`devstral`"
 msgstr ":doc:`devstral`"
 
-#: ../../source/recipes/index.rst:67
+#: ../../source/recipes/index.rst:75
 msgid "``GptOssForCausalLM``"
 msgstr "``GptOssForCausalLM``"
 
-#: ../../source/recipes/index.rst:68
+#: ../../source/recipes/index.rst:76
 msgid "``openai/gpt-oss-120b``"
 msgstr "``openai/gpt-oss-120b``"
 
-#: ../../source/recipes/index.rst:72
+#: ../../source/recipes/index.rst:80
 msgid ":doc:`gpt_oss`"
 msgstr ":gpt_oss:"
 
-#: ../../source/recipes/index.rst:74
+#: ../../source/recipes/index.rst:82
 msgid "``Qwen3MoeForCausalLM``"
 msgstr "``Qwen3MoeForCausalLM``"
 
-#: ../../source/recipes/index.rst:75
+#: ../../source/recipes/index.rst:83
 msgid "``Qwen/Qwen3-235B-A22B``"
 msgstr "``Qwen/Qwen3-235B-A22B``"
 
-#: ../../source/recipes/index.rst:79
+#: ../../source/recipes/index.rst:87
 msgid ":doc:`qwen3`"
 msgstr "：doc:`qwen3`"
 
-#: ../../source/recipes/index.rst:81
+#: ../../source/recipes/index.rst:89
 msgid "``LlamaForCausalLM``"
 msgstr "``LlamaForCausalLM``"
 
-#: ../../source/recipes/index.rst:82
+#: ../../source/recipes/index.rst:90
 msgid "``meta-llama/Meta-Llama-3.1-70B-Instruct``"
 msgstr "``meta-llama/Meta-Llama-3.1-70B-Instruct``"
 
-#: ../../source/recipes/index.rst:86
+#: ../../source/recipes/index.rst:94
 msgid ":doc:`llama`"
 msgstr "`:doc:`llama`"
 
-#: ../../source/recipes/index.rst:88
+#: ../../source/recipes/index.rst:96
 msgid "``Phi3ForCausalLM``"
 msgstr "``Phi3ForCausalLM``"
 
-#: ../../source/recipes/index.rst:89
+#: ../../source/recipes/index.rst:97
 msgid "``microsoft/Phi-4-mini-instruct``"
 msgstr "``microsoft/Phi-4-mini-instruct``"
 
-#: ../../source/recipes/index.rst:93
+#: ../../source/recipes/index.rst:101
 msgid ":doc:`phi3`"
 msgstr ":doc:`phi3`"
 
-#: ../../source/recipes/index.rst:95
+#: ../../source/recipes/index.rst:103
 msgid "``MixtralForCausalLM``"
 msgstr "``MixtralForCausalLM``"
 
-#: ../../source/recipes/index.rst:96
+#: ../../source/recipes/index.rst:104
 msgid "``mistralai/Mixtral-8x7B-Instruct-v0.1``"
 msgstr "``mistralai/Mixtral-8x7B-Instruct-v0.1``"
 
-#: ../../source/recipes/index.rst:100
+#: ../../source/recipes/index.rst:108
 msgid ":doc:`mixtral`"
 msgstr ":doc:`mixtral`"
 
-#: ../../source/recipes/index.rst:102
+#: ../../source/recipes/index.rst:110
 msgid "Legend: ``✓`` validated, ``—`` not validated."
 msgstr "图例：``✓`` 已验证，``—`` 未验证。"
 
-#: ../../source/recipes/index.rst:105
+#: ../../source/recipes/index.rst:113
 msgid "Contributing a recipe"
 msgstr "贡献一个使用指南"
 
-#: ../../source/recipes/index.rst:107
+#: ../../source/recipes/index.rst:115
 msgid "To add a new architecture:"
 msgstr "要添加一个新架构："
 
-#: ../../source/recipes/index.rst:109
+#: ../../source/recipes/index.rst:117
 msgid ""
 "Copy an existing page (e.g. ``minimax_m2.rst``) to "
 "``recipes/<architecture_snake_case>.rst``."
 msgstr "将现有页面（例如 ``minimax_m2.rst``）复制到 ``recipes/<architecture_snake_case>.rst``。"
 
-#: ../../source/recipes/index.rst:111
+#: ../../source/recipes/index.rst:119
 msgid ""
 "Fill in **Validated models**, **Engines**, **LMCache configuration**, and"
 " **Caveats**. Keep each section terse -- if a field has nothing to say, "
@@ -252,7 +270,7 @@ msgstr ""
 "填写 **已验证模型**、**引擎**、**LMCache 配置** 和 **注意事项**。保持每个部分简洁 -- "
 "如果某个字段没有内容，请用一行说明，而不是填充内容。"
 
-#: ../../source/recipes/index.rst:114
+#: ../../source/recipes/index.rst:122
 msgid "Add a row to the table above and an entry to the hidden toctree below."
 msgstr "在上面的表格中添加一行，并在下面的隐藏 toctree 中添加一个条目。"
 

From 068578fd38995a6aeec5ad8d43abbedccc470e19 Mon Sep 17 00:00:00 2001
From: LIUyujie <130735711+Lyj1007@users.noreply.github.com>
Date: Tue, 9 Jun 2026 15:16:44 +0800
Subject: [PATCH 13/57] [Refactor] Consolidate ParallelStrategy construction in
 vllm_multi_process_adapter (#3478)

Signed-off-by: Yujie Liu <milan021007@163.com>
---
 .../integration/vllm/lmcache_mp_connector.py  | 126 +++++-------------
 .../vllm/vllm_multi_process_adapter.py        |  66 ++++-----
 tests/v1/multiprocess/test_free_locks.py      |   4 +-
 tests/v1/test_vllm_mp_adapter.py              |   6 +-
 4 files changed, 72 insertions(+), 130 deletions(-)

diff --git a/lmcache/integration/vllm/lmcache_mp_connector.py b/lmcache/integration/vllm/lmcache_mp_connector.py
index 3ca1c9086d..cb231da758 100644
--- a/lmcache/integration/vllm/lmcache_mp_connector.py
+++ b/lmcache/integration/vllm/lmcache_mp_connector.py
@@ -95,89 +95,26 @@ class SupportsHMA:  # type: ignore[no-redef]
 
 
 # Helper functions
-def extract_world_size_and_kv_rank(
-    world_size: int,
-    rank: int,
-    vllm_config: VllmConfig,
-) -> tuple[int, int]:
-    """
-    Convert the rank for the MLA.
-    """
-    use_mla = mla_enabled(vllm_config.model_config)
-    if not use_mla:
-        return world_size, rank
-    else:
-        # Tensor parallel does not change the KV caches for MLA models.
-        # So we need to "exclude" the effect of TP on rank and world size
-        tp_size = vllm_config.parallel_config.tensor_parallel_size
-        # vLLM constructs TP groups first, and then construct other
-        # parallel groups on top of TP groups.
-        # for example, TP=4, PP=2,
-        # PP group: [0, 1, 2, 3], [4, 5, 6, 7]
-        # TP group: [0, 4], [1, 5], [2, 6], [3, 7]
-        # So we can "exclude" the effect of TP by rank // tp_size.
-        return world_size // tp_size, rank // tp_size
-
-
-def create_scheduler_adapter(
-    server_url: str,
-    zmq_context: zmq.Context,
-    vllm_config: VllmConfig,
-) -> LMCacheMPSchedulerAdapter:
-    world_size, kv_rank = extract_world_size_and_kv_rank(
-        vllm_config.parallel_config.world_size,
-        vllm_config.parallel_config.rank,
-        vllm_config,
-    )
-    parallel_strategy = ParallelStrategy(
-        mla_enabled(vllm_config.model_config),
-        world_size,
-        kv_rank,
-        vllm_config.parallel_config.world_size,
-        vllm_config.parallel_config.rank,
-        vllm_config.parallel_config.tensor_parallel_size,
-        vllm_config.parallel_config.pipeline_parallel_size,
-    )
-
-    extra_config = vllm_config.kv_transfer_config.kv_connector_extra_config
-    return LMCacheMPSchedulerAdapter(
-        server_url=server_url,
-        context=zmq_context,
-        model_name=vllm_config.model_config.model,
-        vllm_block_size=vllm_config.cache_config.block_size,
-        parallel_strategy=parallel_strategy,
-        extra_config=extra_config,
-    )
+def build_parallel_strategy_from_vllm_config(
+    vllm_config: "VllmConfig",
+) -> ParallelStrategy:
+    """Build a ParallelStrategy from a vLLM config.
 
+    Centralises the (vllm_config -> KV parallel geometry) mapping.
 
-def create_worker_adapter(
-    server_url: str,
-    zmq_context: zmq.Context,
-    vllm_config: VllmConfig,
-) -> LMCacheMPWorkerAdapter:
-    world_size, kv_rank = extract_world_size_and_kv_rank(
-        vllm_config.parallel_config.world_size,
-        vllm_config.parallel_config.rank,
-        vllm_config,
-    )
-    parallel_strategy = ParallelStrategy(
-        mla_enabled(vllm_config.model_config),
-        world_size,
-        kv_rank,
-        vllm_config.parallel_config.world_size,
-        vllm_config.parallel_config.rank,
-        vllm_config.parallel_config.tensor_parallel_size,
-        vllm_config.parallel_config.pipeline_parallel_size,
-    )
+    Args:
+        vllm_config: The vLLM configuration object.
 
-    extra_config = vllm_config.kv_transfer_config.kv_connector_extra_config
-    return LMCacheMPWorkerAdapter(
-        server_url=server_url,
-        context=zmq_context,
-        model_name=vllm_config.model_config.model,
-        vllm_block_size=vllm_config.cache_config.block_size,
-        parallel_strategy=parallel_strategy,
-        extra_config=extra_config,
+    Returns:
+        The constructed ParallelStrategy.
+    """
+    pc = vllm_config.parallel_config
+    return ParallelStrategy(
+        use_mla=mla_enabled(vllm_config.model_config),
+        vllm_world_size=pc.world_size,
+        vllm_worker_id=pc.rank,
+        tp_size=pc.tensor_parallel_size,
+        pp_size=pc.pipeline_parallel_size,
     )
 
 
@@ -549,18 +486,26 @@ def __init__(
 
         server_url = f"{server_host}:{server_port}"
         zmq_context = zmq.Context.instance()
+        parallel_strategy = build_parallel_strategy_from_vllm_config(vllm_config)
+
         if self.role == KVConnectorRole.SCHEDULER:
-            self.scheduler_adapter = create_scheduler_adapter(
-                server_url,
-                zmq_context,
-                vllm_config,
+            self.scheduler_adapter = LMCacheMPSchedulerAdapter(
+                server_url=server_url,
+                context=zmq_context,
+                model_name=vllm_config.model_config.model,
+                vllm_block_size=vllm_config.cache_config.block_size,
+                parallel_strategy=parallel_strategy,
+                extra_config=vllm_config.kv_transfer_config.kv_connector_extra_config,
             )
             self.request_trackers: dict[str, LMCacheMPRequestTracker] = {}
         elif self.role == KVConnectorRole.WORKER:
-            self.worker_adapter = create_worker_adapter(
-                server_url,
-                zmq_context,
-                vllm_config,
+            self.worker_adapter = LMCacheMPWorkerAdapter(
+                server_url=server_url,
+                context=zmq_context,
+                model_name=vllm_config.model_config.model,
+                vllm_block_size=vllm_config.cache_config.block_size,
+                parallel_strategy=parallel_strategy,
+                extra_config=vllm_config.kv_transfer_config.kv_connector_extra_config,
             )
         else:
             raise ValueError(f"Unknown KVConnectorRole: {self.role}")
@@ -713,10 +658,7 @@ def wait_for_save(self):
         """
         # In MLA scenario, only the first rank of the pipeline group
         # needs to save the KV cache.
-        if (
-            self.worker_adapter.use_mla
-            and not self.worker_adapter.is_first_rank_of_pp_group
-        ):
+        if not self.worker_adapter.is_kv_writer:
             return
 
         metadata = self._get_connector_metadata()
diff --git a/lmcache/integration/vllm/vllm_multi_process_adapter.py b/lmcache/integration/vllm/vllm_multi_process_adapter.py
index 3fcc0054e6..7e0477b92c 100644
--- a/lmcache/integration/vllm/vllm_multi_process_adapter.py
+++ b/lmcache/integration/vllm/vllm_multi_process_adapter.py
@@ -285,26 +285,14 @@ class ParallelStrategy:
     use_mla: bool
     """Whether to use the MLA."""
 
-    kv_world_size: int
-    """
-    The kv world size, kv_world_size may not be equal to the actual_world_size, 
-    in the case of mla, it will 'exclude' the effect of TP, the value is 
-    calculated by `extract_world_size_and_kv_rank` in `lmcache_mp_connector.py`.
-    """
+    vllm_world_size: int
+    """Number of workers managed by one vLLM scheduler (TP × PP; excludes DP).
 
-    kv_worker_id: int
+    Mirrors ``vllm.parallel_config.world_size``.
     """
-    The kv worker id of the sub-process, kv_worker_id may not be equal to the 
-    actual_worker_id, in the case of mla, it will 'exclude' the effect of TP, 
-    the value is calculated by `extract_world_size_and_kv_rank` in 
-    `lmcache_mp_connector.py`.
-    """
-
-    actual_world_size: int
-    """The actual world size."""
 
-    actual_worker_id: int
-    """The actual worker id of the sub-process."""
+    vllm_worker_id: int
+    """This worker's rank within its scheduler group."""
 
     tp_size: int
     """The tensor parallel size."""
@@ -312,6 +300,30 @@ class ParallelStrategy:
     pp_size: int
     """The pipeline parallel size."""
 
+    @property
+    def kv_world_size(self) -> int:
+        """Number of pieces a single token chunk's KV cache is split into
+        on the LMCache server storage."""
+        if self.use_mla:
+            return self.vllm_world_size // self.tp_size
+        return self.vllm_world_size
+
+    @property
+    def kv_worker_id(self) -> int:
+        """Index of the piece of a single token chunk's KV cache
+        that the current worker is responsible for,
+        in ``[0, kv_world_size)``."""
+        if self.use_mla:
+            return self.vllm_worker_id // self.tp_size
+        return self.vllm_worker_id
+
+    @property
+    def is_kv_writer(self) -> bool:
+        """Whether this rank is responsible for storing KV."""
+        if not self.use_mla:
+            return True
+        return self.vllm_worker_id % self.tp_size == 0
+
 
 def _normalize_adapter_init_args(
     vllm_block_size: int,
@@ -349,10 +361,8 @@ def _normalize_adapter_init_args(
     kv_worker_id = int(parallel_strategy)
     strategy = ParallelStrategy(
         use_mla=False,
-        kv_world_size=kv_world_size,
-        kv_worker_id=kv_worker_id,
-        actual_world_size=kv_world_size,
-        actual_worker_id=kv_worker_id,
+        vllm_world_size=kv_world_size,
+        vllm_worker_id=kv_worker_id,
         tp_size=kv_world_size,
         pp_size=1,
     )
@@ -1019,17 +1029,9 @@ def worker_id(self) -> int:
         return self.parallel_strategy.kv_worker_id
 
     @property
-    def use_mla(self) -> bool:
-        """Whether to use MLA."""
-        return self.parallel_strategy.use_mla
-
-    @property
-    def is_first_rank_of_pp_group(self) -> bool:
-        """Is the first rank of the pipeline parallel group."""
-        return (
-            self.parallel_strategy.actual_worker_id % self.parallel_strategy.tp_size
-            == 0
-        )
+    def is_kv_writer(self) -> bool:
+        """Whether this worker is responsible for storing KV."""
+        return self.parallel_strategy.is_kv_writer
 
     def register_kv_caches(
         self,
diff --git a/tests/v1/multiprocess/test_free_locks.py b/tests/v1/multiprocess/test_free_locks.py
index e08a791151..7efc0c6fb5 100644
--- a/tests/v1/multiprocess/test_free_locks.py
+++ b/tests/v1/multiprocess/test_free_locks.py
@@ -168,7 +168,7 @@ def test_adapter_free_lookup_locks_sends_request():
     adapter.model_name = "test_model"
     adapter.chunk_size = 256
     adapter.blocks_in_chunk = 16
-    adapter.parallel_strategy = ParallelStrategy(False, 1, 0, 1, 0, 1, 1)
+    adapter.parallel_strategy = ParallelStrategy(False, 1, 0, 1, 1)
     adapter._health_event = threading.Event()
     adapter._health_event.set()
     adapter._mq_timeout = 30.0
@@ -218,7 +218,7 @@ def test_adapter_free_lookup_locks_key_matches_lookup():
     adapter.model_name = "test_model"
     adapter.chunk_size = 256
     adapter.blocks_in_chunk = 16
-    adapter.parallel_strategy = ParallelStrategy(False, 1, 0, 1, 0, 1, 1)
+    adapter.parallel_strategy = ParallelStrategy(False, 1, 0, 1, 1)
     adapter._health_event = threading.Event()
     adapter._health_event.set()
     adapter._mq_timeout = 30.0
diff --git a/tests/v1/test_vllm_mp_adapter.py b/tests/v1/test_vllm_mp_adapter.py
index 54f5e36e23..d3f082a5b0 100644
--- a/tests/v1/test_vllm_mp_adapter.py
+++ b/tests/v1/test_vllm_mp_adapter.py
@@ -79,10 +79,8 @@ def start(self) -> None:
 
     parallel_strategy = ParallelStrategy(
         use_mla=False,
-        kv_world_size=1,
-        kv_worker_id=0,
-        actual_world_size=1,
-        actual_worker_id=0,
+        vllm_world_size=1,
+        vllm_worker_id=0,
         tp_size=1,
         pp_size=1,
     )

From a64ef9a78a83a2063ca98a6f9df0a16f1f9f20c7 Mon Sep 17 00:00:00 2001
From: maobaolong <baoloongmao@tencent.com>
Date: Tue, 9 Jun 2026 20:11:57 +0800
Subject: [PATCH 14/57] fix: handle NL_X_NB_NH_BS_TWO_HS in get_group_data_ptrs
 (#3602)

Signed-off-by: baoloongmao <baoloongmao@tencent.com>
---
 lmcache/v1/gpu_connector/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lmcache/v1/gpu_connector/utils.py b/lmcache/v1/gpu_connector/utils.py
index 8e39bbf672..21fd5b58cf 100644
--- a/lmcache/v1/gpu_connector/utils.py
+++ b/lmcache/v1/gpu_connector/utils.py
@@ -1274,6 +1274,7 @@ def get_group_data_ptrs(
         F.NL_X_NB_TWO_NH_BS_HS,
         F.NL_X_NB_BS_HS,
         F.NL_X_NBBS_ONE_HS,
+        F.NL_X_NB_NH_BS_TWO_HS,
     ):
         layers = cast(list[torch.Tensor], kv_caches)
         return [layers[i].data_ptr() for i in layer_indices]

From 36baf626350a962c167b491af4a279da9fcebfc7 Mon Sep 17 00:00:00 2001
From: Yihua Cheng <yihua98@uchicago.edu>
Date: Tue, 9 Jun 2026 12:30:14 -0700
Subject: [PATCH 15/57] [Core][MP] Introduce object_group_id into the ObjectKey
 (#3608)

Signed-off-by: ApostaC <yihua@tensormesh.ai>
---
 lmcache/v1/distributed/api.py                 | 12 +++
 .../distributed/l2_adapters/fs_l2_adapter.py  | 27 +++---
 .../l2_adapters/hfbucket_l2_adapter.py        |  8 +-
 .../native_connector_l2_adapter.py            | 11 +--
 .../nixl_store_dynamic_l2_adapter.py          |  4 +-
 .../distributed/l2_adapters/s3_l2_adapter.py  | 16 ++--
 lmcache/v1/distributed/serde/utils.py         |  3 +-
 lmcache/v1/mp_observability/trace/codecs.py   |  2 +
 .../v1/storage_backend/raw_block/key_codec.py | 18 ++--
 tests/v1/distributed/serde/test_utils.py      |  6 +-
 .../v1/distributed/test_fs_l2_adapter_keys.py | 96 ++++++++++++++++---
 .../distributed/test_hfbucket_l2_adapter.py   | 15 ++-
 .../test_native_connector_l2_adapter.py       | 23 +++--
 tests/v1/distributed/test_s3_l2_adapter.py    | 15 ++-
 .../v1/mp_observability/trace/test_codecs.py  | 11 +++
 .../test_raw_block_key_codec.py               | 26 +++++
 16 files changed, 227 insertions(+), 66 deletions(-)

diff --git a/lmcache/v1/distributed/api.py b/lmcache/v1/distributed/api.py
index 105996cc03..b09e187434 100644
--- a/lmcache/v1/distributed/api.py
+++ b/lmcache/v1/distributed/api.py
@@ -55,6 +55,9 @@ class ObjectKey:
     kv_rank: int
     """ The rank that uniquely identifies the slice of the KV cache """
 
+    object_group_id: int = 0
+    """ Index of the object group this chunk belongs to. """
+
     cache_salt: str = ""
     """ Per-user isolation salt. Same content from different users with
     different cache_salt values produces different ObjectKeys, giving
@@ -79,6 +82,10 @@ def __post_init__(self) -> None:
             raise ValueError(
                 f"model_name must not contain '@' (got {self.model_name!r})"
             )
+        if self.object_group_id < 0:
+            raise ValueError(
+                f"object_group_id must be >= 0 (got {self.object_group_id})"
+            )
         bad = self._SALT_FORBIDDEN_CHARS & set(self.cache_salt)
         if bad:
             raise ValueError(
@@ -202,6 +209,7 @@ class PrefetchHandle:
 def ipc_key_to_object_keys(
     ipc_key: IPCCacheEngineKey,
     chunk_hashes: list[bytes],
+    object_group_id: int = 0,
 ) -> list[ObjectKey]:
     """
     Convert a single IPCCacheEngineKey and its chunk hashes to a list of ObjectKey.
@@ -219,6 +227,8 @@ def ipc_key_to_object_keys(
         ipc_key: The IPC key providing model_name, world_size, worker_id,
             and cache_salt.
         chunk_hashes: List of chunk hash bytes, one per chunk.
+        object_group_id: Index of the object group the chunks belong to.
+            Defaults to 0, the single-group case.
 
     Returns:
         list[ObjectKey]: The converted list of ObjectKey.
@@ -243,6 +253,7 @@ def ipc_key_to_object_keys(
                         chunk_hash=chunk_hash,
                         model_name=ipc_key.model_name,
                         kv_rank=kv_rank,
+                        object_group_id=object_group_id,
                         cache_salt=cache_salt,
                     )
                 )
@@ -259,6 +270,7 @@ def ipc_key_to_object_keys(
                     chunk_hash=chunk_hash,
                     model_name=ipc_key.model_name,
                     kv_rank=kv_rank,
+                    object_group_id=object_group_id,
                     cache_salt=cache_salt,
                 )
             )
diff --git a/lmcache/v1/distributed/l2_adapters/fs_l2_adapter.py b/lmcache/v1/distributed/l2_adapters/fs_l2_adapter.py
index 6ba7a1e840..4333ffa1b6 100644
--- a/lmcache/v1/distributed/l2_adapters/fs_l2_adapter.py
+++ b/lmcache/v1/distributed/l2_adapters/fs_l2_adapter.py
@@ -100,22 +100,21 @@ def _object_key_to_filename(key: ObjectKey) -> str:
 
     Unsalted::
 
-        <safe_model>@0x<kv_rank_hex>@<chunk_hash_hex>.data
+        <safe_model>@0x<kv_rank_hex>@<object_group_id_hex>@<chunk_hash_hex>.data
 
     Salted (trailing ``cache_salt``)::
 
-        <safe_model>@0x<kv_rank_hex>@<chunk_hash_hex>@<cache_salt>.data
-
-    The 3-field unsalted shape is bit-identical to the pre-cache_salt
-    format, so existing un-salted cache directories remain valid and
-    no migration is needed.
+        <safe_model>@0x<kv_rank_hex>@<object_group_id_hex>@<chunk_hash_hex>@<cache_salt>.data
 
     ``kv_rank`` is written in ``0x`` prefixed hex so each byte
     of the bitmap ``(ws<<24)|(rank<<16)|(local_ws<<8)|local``
-    is directly readable.
+    is directly readable. ``object_group_id`` is written in plain hex.
     """
     safe_model = key.model_name.replace("/", _PATH_SLASH_REPLACEMENT)
-    base = f"{safe_model}{_KEY_SEP}{key.kv_rank:#010x}{_KEY_SEP}{key.chunk_hash.hex()}"
+    base = (
+        f"{safe_model}{_KEY_SEP}{key.kv_rank:#010x}"
+        f"{_KEY_SEP}{key.object_group_id:x}{_KEY_SEP}{key.chunk_hash.hex()}"
+    )
     if key.cache_salt:
         return f"{base}{_KEY_SEP}{key.cache_salt}{_FILE_EXT}"
     return f"{base}{_FILE_EXT}"
@@ -126,7 +125,7 @@ def _filename_to_object_key(
 ) -> Optional[ObjectKey]:
     """Reverse ``_object_key_to_filename``.
 
-    Accepts both the 3-field unsalted shape and the 4-field salted
+    Accepts both the 4-field unsalted shape and the 5-field salted
     shape (trailing ``cache_salt``). Returns ``None`` for anything
     else. Since ``model_name`` is guaranteed not to contain ``@``,
     plain ``split`` suffices — no marker, no rsplit.
@@ -135,11 +134,11 @@ def _filename_to_object_key(
         return None
     stem = filename[: -len(_FILE_EXT)]
     parts = stem.split(_KEY_SEP)
-    if len(parts) == 3:
-        safe_model, kv_rank_str, chunk_hash_hex = parts
+    if len(parts) == 4:
+        safe_model, kv_rank_str, object_group_str, chunk_hash_hex = parts
         cache_salt = ""
-    elif len(parts) == 4:
-        safe_model, kv_rank_str, chunk_hash_hex, cache_salt = parts
+    elif len(parts) == 5:
+        safe_model, kv_rank_str, object_group_str, chunk_hash_hex, cache_salt = parts
     else:
         return None
 
@@ -147,6 +146,7 @@ def _filename_to_object_key(
     try:
         chunk_hash = bytes.fromhex(chunk_hash_hex)
         kv_rank = int(kv_rank_str, 16)
+        object_group_id = int(object_group_str, 16)
         # ObjectKey.__post_init__ raises ValueError when the decoded
         # model_name / cache_salt violate the forbidden-char or length
         # invariants (e.g. a stray file from another tool on disk).
@@ -156,6 +156,7 @@ def _filename_to_object_key(
             chunk_hash=chunk_hash,
             model_name=model_name,
             kv_rank=kv_rank,
+            object_group_id=object_group_id,
             cache_salt=cache_salt,
         )
     except ValueError:
diff --git a/lmcache/v1/distributed/l2_adapters/hfbucket_l2_adapter.py b/lmcache/v1/distributed/l2_adapters/hfbucket_l2_adapter.py
index b1cf97f576..e8b0807068 100644
--- a/lmcache/v1/distributed/l2_adapters/hfbucket_l2_adapter.py
+++ b/lmcache/v1/distributed/l2_adapters/hfbucket_l2_adapter.py
@@ -80,11 +80,15 @@ def __init__(
 def _object_key_to_string(key: ObjectKey) -> str:
     """Serialize an MP ``ObjectKey`` to the shared L2 object-name format.
 
-    Unsalted keys use ``<model_name>@<kv_rank_hex>@<chunk_hash_hex>``. Salted
+    Unsalted keys use
+    ``<model_name>@<kv_rank_hex>@<object_group_id_hex>@<chunk_hash_hex>``. Salted
     keys append ``@<cache_salt>`` so tenants/users with identical token chunks
     do not collide in the backing bucket.
     """
-    base = f"{key.model_name}@{key.kv_rank:08x}@{key.chunk_hash.hex()}"
+    base = (
+        f"{key.model_name}@{key.kv_rank:08x}"
+        f"@{key.object_group_id:x}@{key.chunk_hash.hex()}"
+    )
     if key.cache_salt:
         return f"{base}@{key.cache_salt}"
     return base
diff --git a/lmcache/v1/distributed/l2_adapters/native_connector_l2_adapter.py b/lmcache/v1/distributed/l2_adapters/native_connector_l2_adapter.py
index 8464e485a3..67ec384531 100644
--- a/lmcache/v1/distributed/l2_adapters/native_connector_l2_adapter.py
+++ b/lmcache/v1/distributed/l2_adapters/native_connector_l2_adapter.py
@@ -53,18 +53,15 @@ def _object_key_to_string(key: ObjectKey) -> str:
 
     Unsalted::
 
-        <model_name>@<kv_rank_hex>@<chunk_hash_hex>
+        <model_name>@<kv_rank_hex>@<object_group_id_hex>@<chunk_hash_hex>
 
     Salted (trailing ``cache_salt``)::
 
-        <model_name>@<kv_rank_hex>@<chunk_hash_hex>@<cache_salt>
-
-    Keys with ``cache_salt=""`` produce the 3-field shape, which is
-    bit-identical to the format used before ``cache_salt`` existed —
-    so existing un-salted caches remain valid with no migration.
+        <model_name>@<kv_rank_hex>@<object_group_id_hex>@<chunk_hash_hex>@<cache_salt>
     """
     base = (
-        f"{key.model_name}{_KEY_SEP}{key.kv_rank:08x}{_KEY_SEP}{key.chunk_hash.hex()}"
+        f"{key.model_name}{_KEY_SEP}{key.kv_rank:08x}"
+        f"{_KEY_SEP}{key.object_group_id:x}{_KEY_SEP}{key.chunk_hash.hex()}"
     )
     if key.cache_salt:
         return f"{base}{_KEY_SEP}{key.cache_salt}"
diff --git a/lmcache/v1/distributed/l2_adapters/nixl_store_dynamic_l2_adapter.py b/lmcache/v1/distributed/l2_adapters/nixl_store_dynamic_l2_adapter.py
index 9193918b76..cb65fef5c1 100644
--- a/lmcache/v1/distributed/l2_adapters/nixl_store_dynamic_l2_adapter.py
+++ b/lmcache/v1/distributed/l2_adapters/nixl_store_dynamic_l2_adapter.py
@@ -73,7 +73,9 @@ def _object_key_to_filename(key: ObjectKey) -> str:
     """
     safe_model_name = key.model_name.replace("/", "--")
     chunk_hex = key.chunk_hash.hex()
-    return f"{safe_model_name}_{key.kv_rank:08x}_{chunk_hex}.bin"
+    return (
+        f"{safe_model_name}_{key.kv_rank:08x}_{key.object_group_id:x}_{chunk_hex}.bin"
+    )
 
 
 # ---------------------------------------------------------------
diff --git a/lmcache/v1/distributed/l2_adapters/s3_l2_adapter.py b/lmcache/v1/distributed/l2_adapters/s3_l2_adapter.py
index 749c570cc2..bbeb7402c4 100644
--- a/lmcache/v1/distributed/l2_adapters/s3_l2_adapter.py
+++ b/lmcache/v1/distributed/l2_adapters/s3_l2_adapter.py
@@ -62,19 +62,19 @@ def _object_key_to_string(key: ObjectKey) -> str:
 
     Unsalted::
 
-        <model_name>@<kv_rank_hex>@<chunk_hash_hex>
+        <model_name>@<kv_rank_hex>@<object_group_id_hex>@<chunk_hash_hex>
 
     Salted (trailing ``cache_salt``)::
 
-        <model_name>@<kv_rank_hex>@<chunk_hash_hex>@<cache_salt>
+        <model_name>@<kv_rank_hex>@<object_group_id_hex>@<chunk_hash_hex>@<cache_salt>
 
-    Keys with ``cache_salt=""`` produce the 3-field shape (bit-identical
-    to the pre-``cache_salt`` format), so existing un-salted caches
-    remain valid with no migration. ``@`` in ``model_name`` and
-    ``cache_salt`` is rejected by ``ObjectKey.__post_init__``, so the
-    format is unambiguous.
+    ``@`` in ``model_name`` and ``cache_salt`` is rejected by
+    ``ObjectKey.__post_init__``, so the format is unambiguous.
     """
-    base = f"{key.model_name}@{key.kv_rank:08x}@{key.chunk_hash.hex()}"
+    base = (
+        f"{key.model_name}@{key.kv_rank:08x}"
+        f"@{key.object_group_id:x}@{key.chunk_hash.hex()}"
+    )
     if key.cache_salt:
         return f"{base}@{key.cache_salt}"
     return base
diff --git a/lmcache/v1/distributed/serde/utils.py b/lmcache/v1/distributed/serde/utils.py
index 85fc3c16fe..ed220863b3 100644
--- a/lmcache/v1/distributed/serde/utils.py
+++ b/lmcache/v1/distributed/serde/utils.py
@@ -39,7 +39,7 @@ def make_temp_key(original_key: ObjectKey) -> ObjectKey:
     effectively zero at any realistic scale, so the same original key
     can be serde'd repeatedly without practical concern.
 
-    ``cache_salt`` is propagated so per-tenant L1 byte accounting and
+    ``cache_salt`` are propagated so per-tenant L1 byte accounting and
     quota / eviction logic continue to attribute temp buffers to the
     same bucket as the originals.
 
@@ -50,5 +50,6 @@ def make_temp_key(original_key: ObjectKey) -> ObjectKey:
         chunk_hash=original_key.chunk_hash + os.urandom(16),
         model_name=original_key.model_name,
         kv_rank=original_key.kv_rank,
+        object_group_id=original_key.object_group_id,
         cache_salt=original_key.cache_salt,
     )
diff --git a/lmcache/v1/mp_observability/trace/codecs.py b/lmcache/v1/mp_observability/trace/codecs.py
index b41dcad7bf..da6ec81ab9 100644
--- a/lmcache/v1/mp_observability/trace/codecs.py
+++ b/lmcache/v1/mp_observability/trace/codecs.py
@@ -152,6 +152,7 @@ def _enc_object_key(k: ObjectKey) -> dict[str, Any]:
         "chunk_hash": k.chunk_hash,
         "model_name": k.model_name,
         "kv_rank": k.kv_rank,
+        "object_group_id": k.object_group_id,
     }
 
 
@@ -160,6 +161,7 @@ def _dec_object_key(d: dict[str, Any]) -> ObjectKey:
         chunk_hash=d["chunk_hash"],
         model_name=d["model_name"],
         kv_rank=d["kv_rank"],
+        object_group_id=d.get("object_group_id", 0),
     )
 
 
diff --git a/lmcache/v1/storage_backend/raw_block/key_codec.py b/lmcache/v1/storage_backend/raw_block/key_codec.py
index 2809e6779d..5b59601911 100644
--- a/lmcache/v1/storage_backend/raw_block/key_codec.py
+++ b/lmcache/v1/storage_backend/raw_block/key_codec.py
@@ -34,14 +34,17 @@ def object_key_to_string(key: ObjectKey) -> str:
         key: Object key supplied by the MP storage layer.
 
     Returns:
-        A stable string containing model name, KV rank, chunk hash, and
-        optional cache salt.
+        A stable string containing model name, KV rank, object group id,
+        chunk hash, and optional cache salt.
 
     Raises:
         AttributeError: If ``key`` does not expose the ObjectKey fields.
     """
     safe_model = urllib.parse.quote(key.model_name, safe="")
-    base = f"{safe_model}{_KEY_SEP}{key.kv_rank:#010x}{_KEY_SEP}{key.chunk_hash.hex()}"
+    base = (
+        f"{safe_model}{_KEY_SEP}{key.kv_rank:#010x}"
+        f"{_KEY_SEP}{key.object_group_id:x}{_KEY_SEP}{key.chunk_hash.hex()}"
+    )
     if key.cache_salt:
         return f"{base}{_KEY_SEP}{key.cache_salt}"
     return base
@@ -61,11 +64,11 @@ def decode_object_key(encoded: str) -> ObjectKey:
             hexadecimal chunk hash.
     """
     parts = encoded.split(_KEY_SEP)
-    if len(parts) == 3:
-        safe_model, kv_rank_str, chunk_hash_hex = parts
+    if len(parts) == 4:
+        safe_model, kv_rank_str, object_group_str, chunk_hash_hex = parts
         cache_salt = ""
-    elif len(parts) == 4:
-        safe_model, kv_rank_str, chunk_hash_hex, cache_salt = parts
+    elif len(parts) == 5:
+        safe_model, kv_rank_str, object_group_str, chunk_hash_hex, cache_salt = parts
     else:
         raise ValueError(f"Invalid raw-block ObjectKey encoding: {encoded!r}")
 
@@ -73,6 +76,7 @@ def decode_object_key(encoded: str) -> ObjectKey:
         chunk_hash=bytes.fromhex(chunk_hash_hex),
         model_name=urllib.parse.unquote(safe_model),
         kv_rank=int(kv_rank_str, 16),
+        object_group_id=int(object_group_str, 16),
         cache_salt=cache_salt,
     )
 
diff --git a/tests/v1/distributed/serde/test_utils.py b/tests/v1/distributed/serde/test_utils.py
index a08f79ac8e..b8f4a53542 100644
--- a/tests/v1/distributed/serde/test_utils.py
+++ b/tests/v1/distributed/serde/test_utils.py
@@ -6,11 +6,12 @@
 from lmcache.v1.distributed.serde.utils import make_temp_key
 
 
-def _orig(salt: str = "") -> ObjectKey:
+def _orig(salt: str = "", object_group_id: int = 0) -> ObjectKey:
     return ObjectKey(
         chunk_hash=b"\x00" * 16,
         model_name="model",
         kv_rank=0,
+        object_group_id=object_group_id,
         cache_salt=salt,
     )
 
@@ -25,10 +26,11 @@ def test_make_temp_key_propagates_cache_salt() -> None:
 
 def test_make_temp_key_propagates_other_fields() -> None:
     """Non-hash identity fields are preserved verbatim."""
-    orig = _orig(salt="tenant-X")
+    orig = _orig(salt="tenant-X", object_group_id=3)
     temp = make_temp_key(orig)
     assert temp.model_name == orig.model_name
     assert temp.kv_rank == orig.kv_rank
+    assert temp.object_group_id == orig.object_group_id
 
 
 def test_make_temp_key_differs_from_original() -> None:
diff --git a/tests/v1/distributed/test_fs_l2_adapter_keys.py b/tests/v1/distributed/test_fs_l2_adapter_keys.py
index 8eb94bc3ef..858e7c9abf 100644
--- a/tests/v1/distributed/test_fs_l2_adapter_keys.py
+++ b/tests/v1/distributed/test_fs_l2_adapter_keys.py
@@ -2,9 +2,10 @@
 """
 Unit tests for fs_l2_adapter key serialization helpers.
 
-These helpers round-trip ObjectKey <-> filename. ``cache_salt`` is
-appended as a trailing field when non-empty; unsalted keys use the
-3-field shape that matches what older LMCache builds wrote to disk.
+These helpers round-trip ObjectKey <-> filename. ``object_group_id`` is
+embedded as a fixed field right after ``kv_rank``; ``cache_salt`` is
+appended as a trailing field when non-empty. Unsalted keys use the
+4-field shape and salted keys use the 5-field shape.
 """
 
 # Third Party
@@ -20,7 +21,7 @@
 
 class TestFilenameRoundtrip:
     """``_object_key_to_filename`` and ``_filename_to_object_key`` are
-    exact inverses for both the 3-field (unsalted) and 4-field (salted)
+    exact inverses for both the 4-field (unsalted) and 5-field (salted)
     shapes."""
 
     @pytest.mark.parametrize(
@@ -31,11 +32,13 @@ class TestFilenameRoundtrip:
         ],
     )
     @pytest.mark.parametrize("cache_salt", ["", "alice", "user-abc_123.xyz:42"])
-    def test_roundtrip(self, model_name: str, cache_salt: str):
+    @pytest.mark.parametrize("object_group_id", [0, 1, 255])
+    def test_roundtrip(self, model_name: str, cache_salt: str, object_group_id: int):
         key = ObjectKey(
             chunk_hash=b"\xde\xad\xbe\xef",
             model_name=model_name,
             kv_rank=42,
+            object_group_id=object_group_id,
             cache_salt=cache_salt,
         )
         fn = _object_key_to_filename(key)
@@ -46,26 +49,47 @@ def test_roundtrip(self, model_name: str, cache_salt: str):
         parsed = _filename_to_object_key(fn)
         assert parsed == key
 
+    def test_object_group_id_distinguishes_filenames(self):
+        """Keys differing only in object_group_id must not collide."""
+        fn0 = _object_key_to_filename(
+            ObjectKey(
+                chunk_hash=b"\xde\xad\xbe\xef",
+                model_name="llama",
+                kv_rank=42,
+                object_group_id=0,
+            )
+        )
+        fn1 = _object_key_to_filename(
+            ObjectKey(
+                chunk_hash=b"\xde\xad\xbe\xef",
+                model_name="llama",
+                kv_rank=42,
+                object_group_id=1,
+            )
+        )
+        assert fn0 != fn1
+
     def test_unsalted_format(self):
-        """Unsalted keys use the 3-field shape — identical to the
-        pre-cache_salt filename format, so existing caches stay valid."""
-        fn = "llama@0x0000002a@deadbeef.data"
+        """Unsalted keys use the 4-field shape."""
+        fn = "llama@0x0000002a@0@deadbeef.data"
         parsed = _filename_to_object_key(fn)
         assert parsed == ObjectKey(
             chunk_hash=b"\xde\xad\xbe\xef",
             model_name="llama",
             kv_rank=42,
+            object_group_id=0,
             cache_salt="",
         )
 
     def test_salted_format(self):
         """Salted keys append ``@<cache_salt>`` before the extension."""
-        fn = "llama@0x0000002a@deadbeef@alice.data"
+        fn = "llama@0x0000002a@2@deadbeef@alice.data"
         parsed = _filename_to_object_key(fn)
         assert parsed == ObjectKey(
             chunk_hash=b"\xde\xad\xbe\xef",
             model_name="llama",
             kv_rank=42,
+            object_group_id=2,
             cache_salt="alice",
         )
 
@@ -75,15 +99,19 @@ def test_non_data_file_returns_none(self):
     def test_too_few_fields_returns_none(self):
         assert _filename_to_object_key("just-one-field.data") is None
 
+    def test_old_three_field_format_returns_none(self):
+        """The pre-object_group_id 3-field shape is no longer accepted."""
+        assert _filename_to_object_key("llama@0x0000002a@deadbeef.data") is None
+
     def test_too_many_fields_returns_none(self):
-        assert _filename_to_object_key("a@b@c@d@e.data") is None
+        assert _filename_to_object_key("a@b@c@d@e@f.data") is None
 
     def test_salt_with_forbidden_char_returns_none(self):
-        # A filename that parses into 4 fields but whose trailing "salt"
+        # A filename that parses into 5 fields but whose trailing "salt"
         # slot contains a char ObjectKey.__post_init__ rejects (NUL here
         # is impossible in filenames, so use the length cap instead).
         too_long_salt = "x" * 129
-        fn = f"llama@0x0000002a@deadbeef@{too_long_salt}.data"
+        fn = f"llama@0x0000002a@0@deadbeef@{too_long_salt}.data"
         assert _filename_to_object_key(fn) is None
 
 
@@ -139,6 +167,50 @@ def test_empty_salt_passes_through(self):
         out = ipc_key_to_object_keys(k, [b"h1"])
         assert all(o.cache_salt == "" for o in out)
 
+    def test_object_group_id_defaults_to_zero(self):
+        # First Party
+        from lmcache.v1.distributed.api import ipc_key_to_object_keys
+        from lmcache.v1.multiprocess.custom_types import IPCCacheEngineKey
+
+        k = IPCCacheEngineKey.from_token_ids(
+            model_name="m",
+            world_size=1,
+            worker_id=0,
+            token_ids=[1, 2],
+        )
+        out = ipc_key_to_object_keys(k, [b"h1", b"h2"])
+        assert all(o.object_group_id == 0 for o in out)
+
+    def test_object_group_id_propagates_to_all_keys(self):
+        """A non-zero object_group_id reaches every produced ObjectKey,
+        including the worker-expansion (scheduler) path."""
+        # First Party
+        from lmcache.v1.distributed.api import ipc_key_to_object_keys
+        from lmcache.v1.multiprocess.custom_types import IPCCacheEngineKey
+
+        k = IPCCacheEngineKey.from_token_ids(
+            model_name="m",
+            world_size=4,
+            worker_id=None,
+            token_ids=[1, 2, 3],
+        )
+        out = ipc_key_to_object_keys(k, [b"h1"], object_group_id=3)
+        assert len(out) == 4
+        assert all(o.object_group_id == 3 for o in out)
+
+
+class TestObjectKeyValidation:
+    """``ObjectKey.__post_init__`` rejects invalid ``object_group_id``."""
+
+    def test_negative_object_group_id_rejected(self):
+        with pytest.raises(ValueError, match="object_group_id"):
+            ObjectKey(
+                chunk_hash=b"\xde\xad\xbe\xef",
+                model_name="llama",
+                kv_rank=0,
+                object_group_id=-1,
+            )
+
 
 class TestIPCCacheEngineKeyCacheSalt:
     """cache_salt on IPCCacheEngineKey: validation + wire compat."""
diff --git a/tests/v1/distributed/test_hfbucket_l2_adapter.py b/tests/v1/distributed/test_hfbucket_l2_adapter.py
index 2e69c6bb74..7647be7c9e 100644
--- a/tests/v1/distributed/test_hfbucket_l2_adapter.py
+++ b/tests/v1/distributed/test_hfbucket_l2_adapter.py
@@ -212,7 +212,16 @@ def test_format(self) -> None:
             model_name="llama",
             kv_rank=255,
         )
-        assert _object_key_to_string(key) == "llama@000000ff@00010203"
+        assert _object_key_to_string(key) == "llama@000000ff@0@00010203"
+
+    def test_object_group_id_embedded(self) -> None:
+        key = ObjectKey(
+            chunk_hash=b"\x00\x01\x02\x03",
+            model_name="llama",
+            kv_rank=255,
+            object_group_id=5,
+        )
+        assert _object_key_to_string(key) == "llama@000000ff@5@00010203"
 
     def test_cache_salt_appended(self) -> None:
         base_key = ObjectKey(
@@ -226,8 +235,8 @@ def test_cache_salt_appended(self) -> None:
             kv_rank=255,
             cache_salt="user-42",
         )
-        assert _object_key_to_string(base_key) == "llama@000000ff@00010203"
-        assert _object_key_to_string(salted) == "llama@000000ff@00010203@user-42"
+        assert _object_key_to_string(base_key) == "llama@000000ff@0@00010203"
+        assert _object_key_to_string(salted) == "llama@000000ff@0@00010203@user-42"
         assert _object_key_to_string(base_key) != _object_key_to_string(salted)
 
     def test_bucket_path_uses_prefix_and_encoding(self) -> None:
diff --git a/tests/v1/distributed/test_native_connector_l2_adapter.py b/tests/v1/distributed/test_native_connector_l2_adapter.py
index 4f24248163..fe83c10a5b 100644
--- a/tests/v1/distributed/test_native_connector_l2_adapter.py
+++ b/tests/v1/distributed/test_native_connector_l2_adapter.py
@@ -219,19 +219,28 @@ def test_different_keys_produce_different_strings(self):
         assert _object_key_to_string(k1) != _object_key_to_string(k2)
 
     def test_serialization_format(self):
-        """Unsalted keys use the 3-field shape — identical to the
-        pre-cache_salt wire format, so existing remote storage
-        stays valid."""
+        """Unsalted keys use the 4-field shape with object_group_id
+        embedded right after kv_rank."""
         key = ObjectKey(
             chunk_hash=b"\x00\x01\x02\x03",
             model_name="llama",
             kv_rank=255,
         )
         s = _object_key_to_string(key)
-        assert s == "llama@000000ff@00010203"
+        assert s == "llama@000000ff@0@00010203"
+
+    def test_object_group_id_embedded(self):
+        key = ObjectKey(
+            chunk_hash=b"\x00\x01\x02\x03",
+            model_name="llama",
+            kv_rank=255,
+            object_group_id=5,
+        )
+        s = _object_key_to_string(key)
+        assert s == "llama@000000ff@5@00010203"
 
     def test_salted_serialization_format(self):
-        """Salted keys append ``@<cache_salt>`` as a 4th field."""
+        """Salted keys append ``@<cache_salt>`` as a 5th field."""
         key = ObjectKey(
             chunk_hash=b"\x00\x01\x02\x03",
             model_name="llama",
@@ -239,7 +248,7 @@ def test_salted_serialization_format(self):
             cache_salt="alice",
         )
         s = _object_key_to_string(key)
-        assert s == "llama@000000ff@00010203@alice"
+        assert s == "llama@000000ff@0@00010203@alice"
 
     def test_different_salts_produce_different_strings(self):
         base = {
@@ -256,7 +265,7 @@ def test_different_salts_produce_different_strings(self):
         assert s_empty != s_alice
         assert s_alice != s_bob
         # Empty salt has no trailing "@salt", salted keys do.
-        assert s_empty.count("@") == 2  # 3 fields
+        assert s_empty.count("@") == 3  # 4 fields (model, kv_rank, group, hash)
         assert s_alice.endswith("@alice")
         assert s_bob.endswith("@bob")
 
diff --git a/tests/v1/distributed/test_s3_l2_adapter.py b/tests/v1/distributed/test_s3_l2_adapter.py
index afcb23c96a..094c07fe4c 100644
--- a/tests/v1/distributed/test_s3_l2_adapter.py
+++ b/tests/v1/distributed/test_s3_l2_adapter.py
@@ -330,7 +330,16 @@ def test_format(self):
             model_name="llama",
             kv_rank=255,
         )
-        assert _object_key_to_string(key) == "llama@000000ff@00010203"
+        assert _object_key_to_string(key) == "llama@000000ff@0@00010203"
+
+    def test_object_group_id_embedded(self):
+        key = ObjectKey(
+            chunk_hash=b"\x00\x01\x02\x03",
+            model_name="llama",
+            kv_rank=255,
+            object_group_id=5,
+        )
+        assert _object_key_to_string(key) == "llama@000000ff@5@00010203"
 
     def test_cache_salt_appended(self):
         """A non-empty cache_salt must be included in the S3 object name so
@@ -347,8 +356,8 @@ def test_cache_salt_appended(self):
             kv_rank=255,
             cache_salt="user-42",
         )
-        assert _object_key_to_string(base_key) == "llama@000000ff@00010203"
-        assert _object_key_to_string(salted) == "llama@000000ff@00010203@user-42"
+        assert _object_key_to_string(base_key) == "llama@000000ff@0@00010203"
+        assert _object_key_to_string(salted) == "llama@000000ff@0@00010203@user-42"
         assert _object_key_to_string(base_key) != _object_key_to_string(salted)
 
 
diff --git a/tests/v1/mp_observability/trace/test_codecs.py b/tests/v1/mp_observability/trace/test_codecs.py
index f9c6794e13..dbfd008526 100644
--- a/tests/v1/mp_observability/trace/test_codecs.py
+++ b/tests/v1/mp_observability/trace/test_codecs.py
@@ -50,6 +50,17 @@ def test_roundtrip(self):
         out = _roundtrip(k)
         assert out == k
 
+    def test_object_group_id_roundtrip(self):
+        k = ObjectKey(
+            chunk_hash=b"\x00\x01\x02",
+            model_name="m",
+            kv_rank=42,
+            object_group_id=7,
+        )
+        out = _roundtrip(k)
+        assert out == k
+        assert out.object_group_id == 7
+
     def test_inside_list(self):
         keys = [
             ObjectKey(chunk_hash=b"a", model_name="m", kv_rank=1),
diff --git a/tests/v1/storage_backend/test_raw_block_key_codec.py b/tests/v1/storage_backend/test_raw_block_key_codec.py
index 2ee1906a72..1bc3770e10 100644
--- a/tests/v1/storage_backend/test_raw_block_key_codec.py
+++ b/tests/v1/storage_backend/test_raw_block_key_codec.py
@@ -35,3 +35,29 @@ def test_raw_block_object_key_codec_roundtrips_slash_and_sep() -> None:
 
     assert "%2F" in encoded
     assert decoded == key
+
+
+def test_raw_block_object_key_codec_roundtrips_object_group_id() -> None:
+    """object_group_id must survive encode/decode for salted and unsalted keys."""
+    for object_group_id in (0, 1, 255):
+        for cache_salt in ("", "tenant"):
+            key = ObjectKey(
+                chunk_hash=ObjectKey.IntHash2Bytes(789),
+                model_name="org/model",
+                kv_rank=7,
+                object_group_id=object_group_id,
+                cache_salt=cache_salt,
+            )
+            assert decode_object_key(object_key_to_string(key)) == key
+
+
+def test_raw_block_object_group_id_distinguishes_encoding() -> None:
+    """Keys differing only in object_group_id must encode differently."""
+    chunk_hash = ObjectKey.IntHash2Bytes(1)
+    enc0 = object_key_to_string(
+        ObjectKey(chunk_hash=chunk_hash, model_name="m", kv_rank=0, object_group_id=0)
+    )
+    enc1 = object_key_to_string(
+        ObjectKey(chunk_hash=chunk_hash, model_name="m", kv_rank=0, object_group_id=1)
+    )
+    assert enc0 != enc1

From e10dc5f84fd352350314389373eaa99075fed7bc Mon Sep 17 00:00:00 2001
From: maobaolong <baoloongmao@tencent.com>
Date: Wed, 10 Jun 2026 07:58:33 +0800
Subject: [PATCH 16/57] feat(mp): add SHM-based NonGpuContext (server-side
 copy)  (#3352)

* feat(mp): add SHM-based NonGpuContext (server-side copy)  (#3346)

* feat(mp): add SHM-based NonGpuContext (server-side copy)

Porting upstream PR https://github.com/LMCache/LMCache/pull/3328 (commit 2/2)

Adapted to current branch:

- non_cuda_equivalents.py changes redirected to python_ops_fallback.py (renamed)

- test_cache.py changes redirected to bench/test_cache.py (relocated)

- skipped manual registration in cli/commands/__init__.py (now uses dynamic discovery)

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* Refactor

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* address gemini review on shm + stage_block_ids

- shm.py: hook munmap+shm_unlink via weakref.finalize so mmap and SHM
  segments are released when migrated tensors / to_tensor views are GC-ed
- shm.py: stop using id(tensor) as registry key; clear stale entries on
  finalize and use a monotonic counter for SHM names so id reuse can't
  trigger EEXIST in shm_open(O_EXCL)
- shm.py: use numel*element_size for the wrapped tensor byte count so
  views of larger storages are sized correctly
- cache_context.py: reject empty/None block_ids and bound-check against
  block_ids_buffer_ in stage_block_ids

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* shm: guard fd/mmap with try/finally on error paths

ensure shm_create_readwrite / shm_map_readwrite never leak the fd
or the mmap when ftruncate or mmap fails. also rename _nbytes to
nbytes so the test can read it without poking a private attribute.

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* shm: validate cached entry via weakref to defeat id reuse

Cache id(tensor)->(weakref, name) instead of id->name. Lookups
verify ref() is the same tensor before reusing the cached SHM
name; a stale entry left behind by a GC'd tensor whose id has
since been recycled now reads as a miss instead of crashing the
next migration with EEXIST.

Adds inject_stale_cache_entry_for_test so the new regression
test can simulate id recycling without poking module-private
state.

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* address comments

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

---------

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* Address comment

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

---------

Signed-off-by: baoloongmao <baoloongmao@tencent.com>
---
 .buildkite/k3_tests/multiprocess/pipeline.yml |  27 +
 .../scripts/run-cpu-e2e-validation.sh         |  52 +-
 docs/design/cli/commands.md                   |   5 +-
 docs/source/cli/bench.rst                     |  32 +-
 docs/source/mp/quickstart.rst                 |  62 ++
 .../commands/bench/server_bench/command.py    | 151 ++--
 .../commands/bench/server_bench/helpers.py    | 273 +++++---
 .../vllm/vllm_multi_process_adapter.py        |  14 +-
 lmcache/python_ops_fallback.py                |   8 +
 .../v1/multiprocess/modules/gpu_transfer.py   |  12 +-
 lmcache/v1/multiprocess/server.py             |   2 +-
 lmcache/v1/multiprocess/token_hasher.py       |  10 +-
 lmcache/v1/platform/cache_context.py          |  18 +-
 lmcache/v1/platform/cpu/cache_context.py      | 644 ++++++++++++++++++
 lmcache/v1/platform/cpu/shm.py                |  33 +-
 lmcache/v1/platform/cpu/stub_cpu_device.py    |  24 +
 .../test_blocks_first_fused_kv_format.py      |  76 +++
 .../test_non_cuda_data_transfer.py            | 114 +++-
 tests/v1/platform/test_cpu_shm.py             |  45 +-
 19 files changed, 1385 insertions(+), 217 deletions(-)
 create mode 100644 lmcache/v1/platform/cpu/cache_context.py

diff --git a/.buildkite/k3_tests/multiprocess/pipeline.yml b/.buildkite/k3_tests/multiprocess/pipeline.yml
index fa47e37108..b044224854 100644
--- a/.buildkite/k3_tests/multiprocess/pipeline.yml
+++ b/.buildkite/k3_tests/multiprocess/pipeline.yml
@@ -170,3 +170,30 @@ steps:
                 volumes:
                   - { name: hf-cache, hostPath: { path: /data/huggingface, type: DirectoryOrCreate } }
                   - { name: dshm, emptyDir: { medium: Memory, sizeLimit: 4Gi } }
+
+      - label: ":compression: cpu_e2e_validation (server-side copy)"
+        command: bash .buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh
+        env:
+          LMCACHE_MP_TRANSFER_MODE: "handle"
+        timeout_in_minutes: 30
+        agents: { queue: "k8s" }
+        plugins:
+          - kubernetes:
+              podSpec:
+                containers:
+                  - name: container-0
+                    image: lmcache/ci-base:latest
+                    imagePullPolicy: Never
+                    resources:
+                      requests:
+                        cpu: "8"
+                        memory: "256Gi"
+                      limits:
+                        cpu: "8"
+                        memory: "256Gi"
+                    volumeMounts:
+                      - { name: hf-cache, mountPath: /root/.cache/huggingface }
+                      - { name: dshm, mountPath: /dev/shm }
+                volumes:
+                  - { name: hf-cache, hostPath: { path: /data/huggingface, type: DirectoryOrCreate } }
+                  - { name: dshm, emptyDir: { medium: Memory, sizeLimit: 4Gi } }
diff --git a/.buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh b/.buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh
index 6c3448199c..7825f37e8a 100755
--- a/.buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh
+++ b/.buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh
@@ -20,6 +20,8 @@ LMCACHE_HEALTHCHECK_TIMEOUT="${LMCACHE_HEALTHCHECK_TIMEOUT:-30}"
 VLLM_READY_TIMEOUT="${VLLM_READY_TIMEOUT:-120}"
 # Set LMCACHE_SHM_NAME="" to use pickle transport; unset/default uses shm transport
 LMCACHE_SHM_NAME="${LMCACHE_SHM_NAME-__default__}"
+# Set LMCACHE_MP_TRANSFER_MODE=handle for server-side copy (POSIX SHM IPC)
+LMCACHE_MP_TRANSFER_MODE="${LMCACHE_MP_TRANSFER_MODE:-auto}"
 
 # Directory to collect artifacts before workspace is deleted
 ARTIFACT_DIR="/tmp/build_${BUILD_ID}_artifacts"
@@ -141,6 +143,36 @@ print(int(total))
 EOF
 }
 
+# Wait for a metric to change from its previous value
+wait_for_metric_change() {
+  local metric_name="$1"
+  local previous_value="$2"
+  local timeout_seconds="${3:-5}"
+  
+  echo "Waiting for metric '${metric_name}' to change from ${previous_value} (timeout: ${timeout_seconds}s)"
+  
+  local start_time current_time
+  start_time=$(date +%s)
+  
+  while true; do
+    current_time=$(date +%s)
+    if [ $((current_time - start_time)) -ge "${timeout_seconds}" ]; then
+      echo "Timeout: Metric '${metric_name}' did not change within ${timeout_seconds}s"
+      return 1
+    fi
+    
+    local current_value
+    current_value="$(scrape_metric "${metric_name}")"
+    
+    if [ "${current_value}" -gt "${previous_value}" ]; then
+      echo "Metric '${metric_name}' changed from ${previous_value} to ${current_value}"
+      return 0
+    fi
+    
+    sleep 1
+  done
+}
+
 # Send a completion request and print the text output
 send_completion() {
   local prompt_file="$1"
@@ -165,7 +197,9 @@ print(json.dumps({
 
 start_vllm() {
   echo "Starting vLLM server..."
-  VLLM_TARGET_DEVICE=cpu vllm serve facebook/opt-125m \
+  VLLM_TARGET_DEVICE=cpu \
+  LMCACHE_MP_TRANSFER_MODE="${LMCACHE_MP_TRANSFER_MODE}" \
+  vllm serve facebook/opt-125m \
     --port "${VLLM_PORT}" \
     --dtype bfloat16 \
     --disable-hybrid-kv-cache-manager \
@@ -273,7 +307,10 @@ LMCACHE_ARGS=(
   --eviction-policy "${LMCACHE_EVICTION_POLICY}"
   --chunk-size "${LMCACHE_CHUNK_SIZE}"
 )
-if [ "${LMCACHE_SHM_NAME}" = "__default__" ]; then
+if [ "${LMCACHE_MP_TRANSFER_MODE}" = "handle" ]; then
+  echo "Transport mode: server-side copy (handle via POSIX SHM IPC)"
+  EXPECTED_TRANSPORT="handle"
+elif [ "${LMCACHE_SHM_NAME}" = "__default__" ]; then
   echo "Transport mode: shared memory (shm)"
   EXPECTED_TRANSPORT="shm"
 else
@@ -320,7 +357,14 @@ echo "✅ E2E request validation passed"
 
 # Verify transport mode (logged after vLLM connects to LMCache server)
 echo "[Phase 2 / Step 5.5] Verifying transport mode: expecting '${EXPECTED_TRANSPORT}'"
-if [ "${EXPECTED_TRANSPORT}" = "shm" ]; then
+if [ "${EXPECTED_TRANSPORT}" = "handle" ]; then
+  if ! grep -q "CpuCacheContext" "${LMCACHE_LOG}" 2>/dev/null; then
+    echo "❌ Expected server-side copy but 'CpuCacheContext' not found in log"
+    tail -50 "${LMCACHE_LOG}"
+    false
+  fi
+  echo "✅ Transport mode confirmed: handle (server-side copy)"
+elif [ "${EXPECTED_TRANSPORT}" = "shm" ]; then
   if ! grep -q "Using shm" "${LMCACHE_LOG}" 2>/dev/null; then
     echo "❌ Expected shm transport but 'Using shm' not found in log"
     tail -50 "${LMCACHE_LOG}"
@@ -474,4 +518,4 @@ echo "=========================================="
 
 # Upload artifacts BEFORE deleting the workspace
 upload_artifacts
-cleanup_workspace
+cleanup_workspace
\ No newline at end of file
diff --git a/docs/design/cli/commands.md b/docs/design/cli/commands.md
index ccbbc56e13..7e2ec21382 100644
--- a/docs/design/cli/commands.md
+++ b/docs/design/cli/commands.md
@@ -194,11 +194,12 @@ Supports two run modes via ``--mode``:
 - **``gpu``** (default) -- allocates real CUDA tensors and uses CUDA IPC
   (handle transfer path).
 - **``cpu``** -- allocates POSIX-SHM-backed tensors; the server maps the same
-  physical pages for zero-copy STORE/RETRIEVE (data transfer path).
+  physical pages for zero-copy STORE/RETRIEVE (data transfer path by default).
+  To use the zero-copy SHM handle path, add ``--transfer-mode handle``.
 
 The transfer path can be overridden explicitly with ``--transfer-mode
 {auto,handle,data}``. ``auto`` keeps the historical mapping: gpu→handle,
-cpu→data. Note: ``--transfer-mode handle`` on CPU is not yet implemented.
+cpu→data.
 
 ```bash
 $ lmcache bench server \
diff --git a/docs/source/cli/bench.rst b/docs/source/cli/bench.rst
index 37d7410cb2..fd6462affc 100644
--- a/docs/source/cli/bench.rst
+++ b/docs/source/cli/bench.rst
@@ -777,6 +777,32 @@ Options
      - KV cache shape spec (see below).
 
 
+CPU mode (no GPU)
+~~~~~~~~~~~~~~~~~
+
+``--mode cpu`` runs the same end-to-end path without a GPU. The server
+runs on a CPU-only host (``StubCPUDevice``); the bench tool allocates
+POSIX-SHM-backed KV tensors and exercises the full RPC path.
+
+By default ``--mode cpu`` uses the data-transfer path (``auto`` →
+``cpu→data``). To use the zero-copy SHM handle path instead, pass
+``--transfer-mode handle``:
+
+.. code-block:: bash
+
+   # Terminal 1 -- start the LMCache server (no GPU required)
+   lmcache server \
+       --host localhost --port 5555 \
+       --l1-size-gb 2 --eviction-policy LRU
+
+   # Terminal 2 -- run bench in CPU + handle mode
+   lmcache bench server \
+       --rpc-url tcp://localhost:5555 \
+       --url http://localhost:8080 \
+       --mode cpu --transfer-mode handle \
+       --start 0 --end 2
+
+
 KV cache shape spec
 ~~~~~~~~~~~~~~~~~~~
 
@@ -851,12 +877,6 @@ Exit codes
      - Fatal error (for example, CUDA unavailable in ``--mode gpu``,
        server unreachable, or a checksum mismatch).
 
-.. note::
-
-   ``--transfer-mode handle`` on CPU mode is not yet implemented and
-   will be added in a future release.
-
-
 .. _lmcache-bench-l2:
 
 l2
diff --git a/docs/source/mp/quickstart.rst b/docs/source/mp/quickstart.rst
index c88a97d089..61a80b7333 100644
--- a/docs/source/mp/quickstart.rst
+++ b/docs/source/mp/quickstart.rst
@@ -180,3 +180,65 @@ Examples:
 
 The ZMQ server runs on the same default port (5555) and accepts vLLM
 connections exactly as in the local quick start.
+
+CPU-Only Quick Start
+--------------------
+
+LMCache MP mode works on hosts without a GPU. The server runs with a
+``StubCPUDevice`` and vLLM uses its CPU backend. KV tensors are shared
+between vLLM and the LMCache server via POSIX shared memory (zero-copy,
+no GPU required).
+
+**Step 1: Start the LMCache server (no GPU needed)**
+
+.. code-block:: bash
+
+    lmcache server \
+        --l1-size-gb 2 --eviction-policy LRU --port 5555
+
+Expected log output:
+
+.. code-block:: text
+
+    LMCache INFO: torch_dev=StubCPUDevice(device_type=cpu), ...
+    LMCache INFO: LMCache cache server is running...
+
+**Step 2: Start vLLM with the handle transfer mode**
+
+Pass ``lmcache.mp.mp_transfer_mode=handle`` in
+``kv_connector_extra_config`` to enable the POSIX-SHM zero-copy path.
+At startup the vLLM worker migrates each KV cache tensor to a shared
+memory segment (``/lmcache_kv_<pid>_<idx>``) so the LMCache server can
+map the same physical pages directly.
+
+.. code-block:: bash
+
+    vllm serve <model> --dtype bfloat16 \
+        --disable-hybrid-kv-cache-manager \
+        --no-enable-prefix-caching \
+        --kv-transfer-config \
+        '{"kv_connector": "LMCacheMPConnector",
+          "kv_role": "kv_both",
+          "kv_connector_module_path":
+            "lmcache.integration.vllm.lmcache_mp_connector",
+          "kv_connector_extra_config": {
+            "lmcache.mp.host": "tcp://localhost",
+            "lmcache.mp.port": 5555,
+            "lmcache.mp.mp_transfer_mode": "handle"
+          }}'
+
+Expected log output on the vLLM side:
+
+.. code-block:: text
+
+    LMCache INFO: lmcache.mp.mp_transfer_mode = handle (overridden, ...)
+    LMCache INFO: Creating transfer context (device_type=cpu, mode=handle)
+    LMCache INFO: Migrated CPU KV cache tensor (nbytes=...) to SHM /lmcache_kv_...
+
+**Step 3: Send requests** the same way as in the local quick start.
+
+.. note::
+   The default ``auto`` transfer mode routes CPU tensors to the
+   ``data`` path (worker-side gather/scatter). Use
+   ``mp_transfer_mode=handle`` explicitly to get the zero-copy SHM
+   path described above.
diff --git a/lmcache/cli/commands/bench/server_bench/command.py b/lmcache/cli/commands/bench/server_bench/command.py
index 269f01a22e..180ab29086 100644
--- a/lmcache/cli/commands/bench/server_bench/command.py
+++ b/lmcache/cli/commands/bench/server_bench/command.py
@@ -33,11 +33,11 @@
 from __future__ import annotations
 
 # Standard
-from multiprocessing import shared_memory
-from multiprocessing.resource_tracker import unregister
 from typing import TYPE_CHECKING
 import argparse
 import itertools
+import mmap
+import os
 import sys
 import time
 
@@ -53,11 +53,13 @@
     _DEFAULT_SHAPE_SPEC,
     _IMPORT_ERROR,
     DTYPE_MAP,
-    _allocate_kv_cache,
+    _allocate_cpu_shm_kv_cache,
+    _allocate_gpu_kv_cache,
     _get_chunk_size,
     _process_request,
     _require_full_install,
     _send_register_kv_cache,
+    shm_open_pool_as_mmap,
 )
 
 if TYPE_CHECKING:
@@ -134,10 +136,9 @@ def register_server_parser(
         choices=["cpu", "gpu"],
         default="gpu",
         help=(
-            "Run mode (default: gpu). In cpu mode the bench drives "
-            "the data-transfer path: the server allocates a SHM pool "
-            "and the client gathers/scatters chunks via slot "
-            "descriptors. CPU handle mode is not yet supported."
+            "Run mode (default: gpu). In cpu mode the client allocates "
+            "POSIX-SHM-backed KV cache tensors and the server maps the "
+            "same physical pages."
         ),
     )
     parser.add_argument(
@@ -147,7 +148,8 @@ def register_server_parser(
         help=(
             "Transport routing for STORE/RETRIEVE (default: auto). "
             "`handle` forces the GPU-style single-shot path "
-            "(REGISTER_KV_CACHE + STORE/RETRIEVE). "
+            "(REGISTER_KV_CACHE + STORE/RETRIEVE), which on CPU mode "
+            "uses POSIX SHM to back zero-copy server-side mappings. "
             "`data` forces the worker-side gather/scatter path "
             "(REGISTER_KV_CACHE_NON_GPU_CONTEXT + PREPARE/COMMIT). "
             "`auto` keeps the historical mapping: gpu->handle, cpu->data."
@@ -265,25 +267,29 @@ def run_server_bench(  # noqa: ARG001  (command kept for symmetry with siblings)
         sys.exit(1)
 
     # Resolve transfer mode. ``auto`` reproduces the historical
-    # behaviour: gpu -> handle path, cpu -> data path.
-    transfer_mode = args.transfer_mode
+    # behaviour: gpu -> handle path, cpu -> data path. ``handle``
+    # / ``data`` are explicit overrides; ``handle`` on CPU mode is
+    # the SHM-backed zero-copy path (server-side copy).
+    transfer_mode = getattr(args, "transfer_mode", "auto")
     if transfer_mode == "auto":
         use_handle = use_gpu
     elif transfer_mode == "handle":
         use_handle = True
     else:
         use_handle = False
+    if use_handle and not use_gpu:
+        print(
+            "  [info] --transfer-mode=handle on cpu mode: using "
+            "REGISTER_KV_CACHE + STORE/RETRIEVE over POSIX SHM"
+        )
 
     url = args.rpc_url
     print(
-        "Connecting to LMCache MP Server at %s (mode=%s, transfer=%s) ..."
-        % (url, args.mode, transfer_mode),
+        "Connecting to LMCache MP Server at %s (mode=%s) ..." % (url, args.mode),
     )
 
     ctx = zmq.Context()
     client = MessageQueueClient(url, ctx)
-    server_shm: "shared_memory.SharedMemory | None" = None
-    server_pool: "memoryview | None" = None
 
     try:
         # Query chunk size from server
@@ -303,7 +309,7 @@ def run_server_bench(  # noqa: ARG001  (command kept for symmetry with siblings)
         )
         # Paged KV demands identical ``NB`` / ``BS`` across all groups
         # (block_id -> slot maths is shared), but ``kv_size`` / ``NH`` /
-        # ``HS`` / ``dtype`` may vary per group. ``_allocate_kv_cache(
+        # ``HS`` / ``dtype`` may vary per group. ``_allocate_gpu_kv_cache(
         # groups=...)`` honours each group's own shape; ``_process_request``
         # only needs a single ``block_size`` / ``total_blocks``.
         first = layer_groups[0]
@@ -364,6 +370,14 @@ def run_server_bench(  # noqa: ARG001  (command kept for symmetry with siblings)
             "head_size": head_size_disp,
             "num_blocks": num_blocks,
             "block_size": block_size,
+            # Tell the server the inference-engine-side logical block
+            # size explicitly. Otherwise ``KVLayerGroupsManager`` falls
+            # back to ``shape_desc.bs``, which on the CPU/HND path can
+            # be the per-block ``num_heads`` value instead of the real
+            # ``block_size`` (HND swaps NH and BS in the tensor shape),
+            # and STORE/RETRIEVE would then expect twice as many block
+            # IDs as the bench client actually sends.
+            "inference_engine_logical_block_size": block_size,
             "dtype": dtype_str,
         }
 
@@ -390,63 +404,65 @@ def run_server_bench(  # noqa: ARG001  (command kept for symmetry with siblings)
         )
 
         # Allocate KV tensors. GPU mode wraps real CUDA tensors
-        # via CUDA IPC; CPU mode (data transfer mode) allocates
-        # plain CPU tensors used for client-side checksum self-check.
-        # TODO(baoloongmao): CPU handle mode (zero-copy SHM IPC with
-        # the server) will be implemented in a separate PR.
+        # via CUDA IPC; CPU mode allocates POSIX-SHM-backed
+        # tensors so the server can map the same physical pages.
+        # shm_names tracks per-layer SHM segment names allocated
+        # on demand (one per layer) so we can shm_unlink on exit.
+        shm_names: list[str] = []
         if use_gpu:
             # First Party
             from lmcache.v1.multiprocess.custom_types import CudaIPCWrapper
 
-            allocated = _allocate_kv_cache(groups=layer_groups, use_gpu=True)
+            allocated = _allocate_gpu_kv_cache(groups=layer_groups)
             print(
                 "Allocated %d GPU tensors on %s" % (len(allocated), allocated[0].device)
             )
-            kv_wrappers: list = [CudaIPCWrapper(t) for t in allocated]
+            kv_wrappers = [CudaIPCWrapper(t) for t in allocated]
+            # Keep the CUDA tensors alive for the lifetime of the
+            # bench process -- storage may be reclaimed otherwise --
+            # and reuse the same list as the client-side data-mode
+            # source/sink for the round-trip self-check.
             client_kv_tensors = allocated
         else:
-            if use_handle:
-                print(
-                    "ERROR: --mode cpu --transfer-mode handle is not yet "
-                    "supported in this PR (TODO: separate PR)."
-                )
-                sys.exit(1)
-            cpu_tensors = _allocate_kv_cache(groups=layer_groups, use_gpu=False)
-            print("Allocated %d CPU tensors" % len(cpu_tensors))
-            kv_wrappers = []
+            # First Party
+            from lmcache.v1.platform.cpu.shm import CpuShmTensorWrapper
+
+            shm_prefix = CpuShmTensorWrapper.SHM_NAME_PREFIX + str(os.getpid())
+            cpu_tensors, cpu_wrappers, shm_names = _allocate_cpu_shm_kv_cache(
+                groups=layer_groups, shm_prefix=shm_prefix
+            )
+            print(
+                "Allocated %d CPU SHM tensors (prefix=%s)"
+                % (len(cpu_tensors), shm_prefix)
+            )
+            kv_wrappers = list(cpu_wrappers)
             client_kv_tensors = cpu_tensors
 
-        # Register KV cache before any store/retrieve.
-        register_ok, register_response = _send_register_kv_cache(
+        # Register KV cache before any store/retrieve. In handle mode
+        # both GPU (CUDA-IPC) and CPU (POSIX-SHM) paths share the same
+        # ``REGISTER_KV_CACHE`` protocol since ``CpuShmTensorWrapper``
+        # is a ``CudaIPCWrapper`` subclass on the wire. In data mode
+        # we fall through to the non-GPU registration protocol.
+        register_result = _send_register_kv_cache(
             client,
             layout_hints=layout_hints,
             kv_caches=kv_wrappers if use_handle else None,
             use_gpu=use_gpu,
             use_handle=use_handle,
         )
-        print("REGISTER_KV_CACHE: %s" % ("OK" if register_ok else "FAIL"))
+        print("REGISTER_KV_CACHE: %s" % ("OK" if register_result else "FAIL"))
         print()
 
         # In data mode the server reply carries the SHM pool name
-        # and size; the bench attaches to the same pool so
-        # STORE/RETRIEVE can exchange tensor data via slot
-        # descriptors. We open via :class:`SharedMemory` (matching
-        # the server-side allocator in ``transfer_context/shm.py``)
-        # and unregister from the worker's resource tracker so the
-        # segment is not unlinked when the bench exits -- the server
-        # owns its lifetime.
-        if not use_handle and register_ok and register_response is not None:
-            shm_name = register_response.shm_name
-            pool_size = register_response.pool_size
+        # and size; the bench mmaps the same pool so STORE/RETRIEVE
+        # can exchange tensor data via slot descriptors instead of
+        # round-tripping pickle through the RPC layer.
+        server_pool: "mmap.mmap | None" = None
+        if not use_handle and not isinstance(register_result, bool):
+            shm_name = getattr(register_result, "shm_name", "")
+            pool_size = getattr(register_result, "pool_size", 0)
             if shm_name and pool_size > 0:
-                server_shm = shared_memory.SharedMemory(
-                    name=shm_name.lstrip("/"), create=False
-                )
-                try:
-                    unregister("/%s" % server_shm.name, "shared_memory")
-                except KeyError:
-                    pass
-                server_pool = server_shm.buf
+                server_pool = shm_open_pool_as_mmap(shm_name, pool_size)
 
         if args.end is not None:
             seq_iter: itertools.count | range = range(args.start, args.end)
@@ -455,9 +471,11 @@ def run_server_bench(  # noqa: ARG001  (command kept for symmetry with siblings)
 
         http_base = args.url.rstrip("/")
 
-        # In data mode the server has no paged kv_tensors view to
-        # hash, so we self-check on the client. Handle mode keeps
-        # the legacy server-side /kvcache/check path.
+        # In data mode the server has no paged ``kv_tensors`` view to
+        # hash, so we self-check on the client: cold pass captures
+        # ground truth, warm pass zero-fills + re-hashes after
+        # RETRIEVE. Handle mode keeps the legacy server-side
+        # ``/kvcache/check`` path.
         client_tensors = None if use_handle else client_kv_tensors
 
         for seq_no in seq_iter:
@@ -527,21 +545,22 @@ def run_server_bench(  # noqa: ARG001  (command kept for symmetry with siblings)
     except KeyboardInterrupt:
         print("\nStopping...")
     finally:
-        # Release the bench-side view of the server SHM pool first
-        # (data mode only; server_shm stays None otherwise). The
-        # ``memoryview`` returned by ``SharedMemory.buf`` must be
-        # released before ``SharedMemory.close``, otherwise CPython
-        # raises ``BufferError`` on shutdown.
-        if server_pool is not None:
+        # Release the bench-side mmap of the server SHM pool first
+        # (data mode only; ``server_pool`` stays ``None`` otherwise).
+        if "server_pool" in locals() and server_pool is not None:
             try:
-                server_pool.release()
-            except (BufferError, ValueError):
-                pass
-        if server_shm is not None:
-            try:
-                server_shm.close()
+                server_pool.close()
             except (BufferError, ValueError):
                 pass
         client.close()
         ctx.term()
+        # Best-effort SHM cleanup so segments don't linger.
+        for _name in shm_names if "shm_names" in locals() else []:
+            try:
+                # First Party
+                from lmcache.v1.platform.cpu.shm import shm_unlink
+
+                shm_unlink(_name)
+            except OSError:
+                pass
     print("Done.")
diff --git a/lmcache/cli/commands/bench/server_bench/helpers.py b/lmcache/cli/commands/bench/server_bench/helpers.py
index 1ed0be6d00..02e0a2d80b 100644
--- a/lmcache/cli/commands/bench/server_bench/helpers.py
+++ b/lmcache/cli/commands/bench/server_bench/helpers.py
@@ -18,8 +18,10 @@
 
 # Standard
 from typing import Any
+import ctypes
 import hashlib
 import json
+import mmap
 import sys
 import time
 import urllib.error
@@ -58,17 +60,30 @@
     )
     from lmcache.v1.multiprocess.futures import MessagingFuture
     from lmcache.v1.multiprocess.mq import MessageQueueClient
+    from lmcache.v1.multiprocess.posix_shm import shm_open_pool_as_mmap
     from lmcache.v1.multiprocess.protocols.base import RequestType
     from lmcache.v1.multiprocess.protocols.engine import (
         RegisterNonGpuContextResponse,
     )
     from lmcache.v1.multiprocess.transfer_context.shm import ShmSlotDescriptor
+    from lmcache.v1.platform.cpu.shm import (
+        CpuShmTensorWrapper,
+        shm_create_readwrite,
+    )
 except ImportError as _exc:
     _IMPORT_ERROR = _exc
     # Fallback placeholder so ``add_arguments`` can still build its
     # help text without crashing on a CLI-only install.
     DTYPE_MAP = {}  # type: ignore[assignment]
 
+    # Stubs so other modules (notably ``command.py``) can still import
+    # the SHM helpers on a slim install; ``_require_full_install`` is
+    # the gate that prevents them from ever being invoked there.
+    def shm_open_pool_as_mmap(name: str, nbytes: int) -> Any:  # type: ignore[misc]
+        raise RuntimeError(
+            "shm_open_pool_as_mmap unavailable on slim lmcache-cli install"
+        )
+
 
 def _require_full_install() -> None:
     """Exit with an install hint if the full LMCache runtime is missing.
@@ -180,36 +195,18 @@ def _make_key(
 # ------------------------------------------------------------------ #
 
 
-def _alloc_tensor(
-    shape: tuple[int, ...],
-    dtype: "torch.dtype",
-    device: "torch.device | None" = None,
-) -> "torch.Tensor":
-    """Allocate a random tensor of the given shape and dtype.
-
-    ``torch.randn`` only supports floating-point dtypes; integer dtypes
-    (e.g. ``uint8`` for FP8-quantised KV) fall back to ``randint``.
-    """
-    kwargs = {} if device is None else {"device": device}
-    if dtype.is_floating_point:
-        return torch.randn(shape, dtype=dtype, **kwargs)
-    iinfo = torch.iinfo(dtype)
-    return torch.randint(iinfo.min, iinfo.max + 1, shape, dtype=dtype, **kwargs)
-
-
-def _allocate_kv_cache(
+def _allocate_gpu_kv_cache(
     num_layers: int = 32,
     num_heads: int = 8,
     head_size: int = 128,
     num_blocks: int = 1024,
     block_size: int = 16,
-    dtype: "torch.dtype | None" = None,
-    device: "str | torch.device | None" = None,
+    dtype: torch.dtype | None = None,
+    device: str | torch.device | None = None,
     kv_size: int = 2,
-    groups: "list[KVLayerGroupInfo] | None" = None,
-    use_gpu: bool = True,
-) -> "list[torch.Tensor]":
-    """Allocate paged KV cache tensors for either GPU or CPU mode.
+    groups: list[KVLayerGroupInfo] | None = None,
+) -> list[torch.Tensor]:
+    """Allocate paged GPU KV cache tensors.
 
     Each layer is a tensor of shape
     ``(kv_size, num_blocks, block_size, num_heads, head_size)``
@@ -223,39 +220,97 @@ def _allocate_kv_cache(
     (for heterogeneous multi-group specs). In that mode the flat
     ``num_heads`` / ``head_size`` / ``dtype`` / ``kv_size`` kwargs
     are ignored, and ``num_layers`` is derived from the groups.
-
-    GPU mode wraps real CUDA tensors; CPU mode (``use_gpu=False``,
-    used by ``--transfer-mode data``) allocates plain CPU tensors --
-    the server owns the SHM pool and the bench only needs these
-    tensors for client-side checksum self-check. The handle-mode
-    CPU path (true zero-copy SHM IPC with the server) will need
-    :class:`CpuShmTensorWrapper` and is left for a separate PR --
-    see ``TODO(baoloongmao)`` markers below.
     """
     # ``torch.float16`` cannot be used as a default value because the
     # module must load on ``lmcache-cli`` (no torch) installs.
     if dtype is None:
         dtype = torch.float16
     torch.random.manual_seed(42)
-    if use_gpu:
-        dev: torch.device | None = (
-            torch.device(device)
-            if device
-            else torch.device(torch_device_type, torch_dev.current_device())
-        )
-    else:
-        dev = None
+    dev = (
+        torch.device(device)
+        if device
+        else torch.device(torch_device_type, torch_dev.current_device())
+    )
+
+    def _alloc(
+        shape: tuple[int, ...],
+        a_dtype: torch.dtype,
+    ) -> torch.Tensor:
+        if a_dtype.is_floating_point:
+            return torch.randn(shape, dtype=a_dtype, device=dev)
+        # ``torch.randn`` only supports floating-point dtypes; fall
+        # back to ``randint`` for integer dtypes (e.g. ``uint8``
+        # used by FP8 quantized KV cache layouts).
+        iinfo = torch.iinfo(a_dtype)
+        return torch.randint(iinfo.min, iinfo.max + 1, shape, dtype=a_dtype, device=dev)
 
     if groups:
         tensors: list[torch.Tensor] = []
         for g in groups:
             sd = g.shape_desc
             g_shape = (sd.kv_size, sd.nb, sd.bs, sd.nh, sd.hs)
-            tensors.extend(_alloc_tensor(g_shape, g.dtype, dev) for _ in range(sd.nl))
+            tensors.extend(_alloc(g_shape, g.dtype) for _ in range(sd.nl))
         return tensors
 
     shape = (kv_size, num_blocks, block_size, num_heads, head_size)
-    return [_alloc_tensor(shape, dtype, dev) for _ in range(num_layers)]
+    return [_alloc(shape, dtype) for _ in range(num_layers)]
+
+
+# Backward-compatible alias used by tests and older callers.
+_allocate_kv_cache = _allocate_gpu_kv_cache
+
+
+def _allocate_cpu_shm_kv_cache(
+    groups: list[KVLayerGroupInfo],
+    shm_prefix: str,
+) -> tuple[list[torch.Tensor], list[CpuShmTensorWrapper], list[str]]:
+    """Allocate paged CPU KV cache tensors backed by POSIX SHM.
+
+    For each (group, layer) we ``shm_open`` a fresh segment and
+    ``mmap`` it into the client process. The returned tensors share
+    storage with the SHM mapping, and the matching
+    :class:`CpuShmTensorWrapper` instances tell the LMCache mp
+    server how to map the very same physical pages -- i.e. true
+    zero-copy across processes (matching the GPU CUDA-IPC path).
+
+    Returns:
+        ``(tensors, wrappers, shm_names)``. ``shm_names`` is kept
+        so the caller can ``shm_unlink`` on shutdown.
+    """
+    # Fixed seed so the deterministic random fill below produces
+    # reproducible checksums across cold/warm bench iterations.
+    torch.random.manual_seed(42)
+    tensors: list[torch.Tensor] = []
+    wrappers: list[CpuShmTensorWrapper] = []
+    shm_names: list[str] = []
+    layer_idx = 0
+    for g_idx, g in enumerate(groups):
+        sd = g.shape_desc
+        g_shape = (sd.kv_size, sd.nb, sd.bs, sd.nh, sd.hs)
+        for _ in range(sd.nl):
+            n_elems = 1
+            for d in g_shape:
+                n_elems *= d
+            nbytes = n_elems * g.dtype.itemsize
+            name = "%s_%d_%d" % (shm_prefix, g_idx, layer_idx)
+            addr = shm_create_readwrite(name, nbytes)
+            buf_type = ctypes.c_uint8 * nbytes
+            buf = buf_type.from_address(addr)
+            flat = torch.frombuffer(buf, dtype=torch.uint8)
+            t = flat.view(g.dtype).reshape(g_shape)
+            # Initialise with deterministic random data so the
+            # cold/warm checksum compare in the bench loop is
+            # meaningful.
+            if g.dtype.is_floating_point:
+                t.copy_(torch.randn(g_shape, dtype=g.dtype))
+            else:
+                iinfo = torch.iinfo(g.dtype)
+                t.copy_(torch.randint(iinfo.min, iinfo.max + 1, g_shape, dtype=g.dtype))
+            tensors.append(t)
+            wrappers.append(CpuShmTensorWrapper(t, name))
+            shm_names.append(name)
+            layer_idx += 1
+    return tensors, wrappers, shm_names
 
 
 def _send_register_kv_cache(
@@ -267,27 +322,18 @@ def _send_register_kv_cache(
     kv_caches: list[CudaIPCWrapper] | None = None,
     use_gpu: bool = True,
     use_handle: bool | None = None,
-) -> "tuple[bool, RegisterNonGpuContextResponse | None]":
+) -> "bool | RegisterNonGpuContextResponse":
     """Register a KV cache context with the MP server.
 
     Dispatches to the correct protocol based on ``use_handle``:
 
-    * Handle mode: ``REGISTER_KV_CACHE`` with a ``CudaIPCWrapper``
-      list (GPU only in this PR).
+    * Handle mode: ``REGISTER_KV_CACHE`` with a wrapper list
+      (``CudaIPCWrapper`` for GPU, ``CpuShmTensorWrapper`` for CPU).
     * Data mode: ``REGISTER_KV_CACHE_NON_GPU_CONTEXT`` with a
       ``RegisterNonGpuContextPayload`` derived from ``layout_hints``.
 
-    Returns ``(success, response)`` where ``response`` is the
-    data-mode SHM pool descriptor (``None`` in handle mode or on
-    failure). Using an explicit success flag avoids relying on the
-    truthiness of a dataclass response, which is always truthy even
-    when the server returned an empty pool descriptor.
-
     ``use_handle`` defaults to ``use_gpu`` for backwards compatibility:
     GPU always goes through the handle path, CPU defaults to data.
-
-    TODO(baoloongmao): CPU handle mode (zero-copy SHM IPC with the
-    server) will be implemented in a separate PR.
     """
     if use_handle is None:
         use_handle = use_gpu
@@ -295,7 +341,7 @@ def _send_register_kv_cache(
         if not kv_caches:
             raise ValueError(
                 "kv_caches must be a non-empty list of wrappers "
-                "(CudaIPCWrapper for GPU)"
+                "(CudaIPCWrapper for GPU, CpuShmTensorWrapper for CPU)"
             )
         hints: dict = {"kv_layout": "NHD"}
         if layout_hints:
@@ -308,12 +354,12 @@ def _send_register_kv_cache(
             world_size,
             EngineType.VLLM,
             hints,
-            [],
+            [],  # group_views: empty = single non-hybrid group
         ]
         result = _call(client, RequestType.REGISTER_KV_CACHE, payloads)
-        return (result is not _TIMEOUT, None)
+        return result is not _TIMEOUT
 
-    # Data mode: use the non-GPU context registration protocol.
+    # CPU mode: use the non-GPU context registration protocol.
     # layout_hints carries num_layers, num_heads, head_size, block_size,
     # dtype.  hidden_dim_size = num_heads * head_size (NHD layout).
     hints_d: dict = layout_hints or {}
@@ -339,14 +385,13 @@ def _send_register_kv_cache(
         use_mla=False,
     )
     result = _call(client, RequestType.REGISTER_KV_CACHE_NON_GPU_CONTEXT, [payload])
-    if result is _TIMEOUT or result is None:
-        return (False, None)
+    if result is _TIMEOUT:
+        return False
     # The data-mode register reply carries the server's SHM pool name
-    # and size; the bench keeps it so STORE/RETRIEVE can mmap the same
-    # pool and exchange tensor data via slot descriptors. A non-empty
-    # ``shm_name`` is the authoritative success signal here.
-    success = bool(getattr(result, "shm_name", ""))
-    return (success, result)
+    # and size; the bench keeps it on the side so STORE / RETRIEVE
+    # can mmap the same pool and exchange tensor data without going
+    # through pickle.
+    return result
 
 
 def _send_lookup(
@@ -405,7 +450,7 @@ def _make_event_handle(use_gpu: bool = True) -> bytes:
 
 
 def _build_server_slot_views(
-    server_pool: "memoryview",
+    server_pool: "mmap.mmap",
     slots: list[dict[str, Any]],
 ) -> list["torch.Tensor"]:
     """Build zero-copy tensor views over server SHM slot descriptors.
@@ -442,9 +487,12 @@ def _gather_paged_to_flat_chunks(
     """Gather paged client tensors into flat per-chunk CPU tensors.
 
     Output layout matches the server's expected ``commit_store``
-    payload: each chunk is ``[2, num_layers, chunk_size, hidden_dim]``,
+    payload (set up at register time by
+    ``register_kv_cache_non_gpu_context``):
+    each chunk is ``[2, num_layers, chunk_size, hidden_dim]``,
     where ``hidden_dim = NH * HS``. Assumes a homogeneous group
-    (same NH/HS/dtype across all layers).
+    (same NH/HS/dtype across all layers); heterogeneous specs
+    fall outside the bench scope.
     """
     if chunk_size % block_size != 0:
         raise ValueError(
@@ -459,10 +507,14 @@ def _gather_paged_to_flat_chunks(
         start_b = block_offset + c * blocks_per_chunk
         per_layer: list[torch.Tensor] = []
         for t in tensors:
+            # paged: (2, NB, BS, NH, HS) -> slice block range ->
+            # (2, blocks_per_chunk, BS, NH, HS) -> flatten to
+            # (2, chunk_size, NH*HS).
             sliced = t.narrow(1, start_b, blocks_per_chunk)
             kv, _, bs, nh, hs = sliced.shape
             flat = sliced.contiguous().view(kv, blocks_per_chunk * bs, nh * hs)
             per_layer.append(flat)
+        # Stack along a new layer dim -> (2, NL, chunk_size, hidden).
         chunk = torch.stack(per_layer, dim=1).contiguous()
         if chunk.shape[1] != num_layers:
             raise RuntimeError(
@@ -497,11 +549,18 @@ def _scatter_flat_chunks_to_paged(
         for layer_idx, t in enumerate(tensors):
             kv, _, bs, nh, hs = t.shape
             target = t.narrow(1, start_b, blocks_per_chunk)
+            # chunk[:, layer_idx] is (chunk_size, hidden); reshape
+            # back to (2, blocks_per_chunk, BS, NH, HS).
             flat = chunk[:, layer_idx]
             reshaped = flat.reshape(kv, blocks_per_chunk, bs, nh, hs)
             target.copy_(reshaped)
 
 
+# ------------------------------------------------------------------ #
+#  Client-side checksum / zero-fill (data-mode self-check)             #
+# ------------------------------------------------------------------ #
+
+
 def _compute_client_checksums(
     tensors: list["torch.Tensor"],
     block_offset: int,
@@ -511,10 +570,13 @@ def _compute_client_checksums(
 ) -> list[str]:
     """Hash a paged block range from client-side KV tensors.
 
-    For each chunk, feed every layer's bytes for that block range
-    into a single MD5 digest. Used by the data-mode self-check:
-    cold-pass digest vs warm-pass digest verifies that RETRIEVE
-    actually wrote back the data we wrote during STORE.
+    For each chunk (``chunk_size // block_size`` consecutive blocks),
+    feed every layer's bytes for that block range into a single MD5
+    digest. The returned list maps 1:1 to the chunks the bench loop
+    expects, so a cold-pass digest can be compared with a warm-pass
+    digest to verify that ``RETRIEVE`` actually wrote back the data
+    we wrote during ``STORE`` -- without relying on a server-side
+    ``/kvcache/check`` endpoint (which only exists in handle mode).
     """
     if chunk_size % block_size != 0:
         raise ValueError(
@@ -529,6 +591,11 @@ def _compute_client_checksums(
         end_b = start_b + blocks_per_chunk
         h = hashlib.md5()
         for t in tensors:
+            # Paged layout: dim 1 is the block dim for both kv-major
+            # ``(kv, NB, BS, NH, HS)`` and MLA ``(NB, BS, NH, HS)``
+            # tensors. ``contiguous().numpy().tobytes()`` survives
+            # non-contiguous slices and dtype quirks (bfloat16 has no
+            # numpy view, but uint8 reinterpret works after slice).
             view = t.narrow(1, start_b, end_b - start_b).contiguous()
             h.update(view.view(torch.uint8).numpy().tobytes())
         checksums.append(h.hexdigest())
@@ -542,8 +609,11 @@ def _zero_fill_client_blocks(
 ) -> None:
     """Zero out a paged block range across all client tensors.
 
-    Used right before a warm-pass RETRIEVE so that any non-zero
+    Used right before a warm-pass ``RETRIEVE`` so that any non-zero
     bytes observed afterwards must have been written by the server.
+    Without this, a warm checksum equal to the cold checksum could
+    still happen even if ``RETRIEVE`` was a silent no-op (the SHM
+    pages were never overwritten in the first place).
     """
     for t in tensors:
         t.narrow(1, block_offset, num_blocks).zero_()
@@ -558,20 +628,20 @@ def _send_store(
     use_gpu: bool = True,
     use_handle: bool | None = None,
     client_tensors: list["torch.Tensor"] | None = None,
-    chunk_size: int = 256,
-    server_pool: "memoryview | None" = None,
+    chunk_size: int = 0,
+    server_pool: "mmap.mmap | None" = None,
 ) -> str:
     """Store KV cache blocks. Returns status string.
 
-    Handle mode uses the single-shot ``STORE`` RPC (GPU CUDA-IPC).
+    Handle mode uses the single-shot ``STORE`` RPC (GPU CUDA-IPC, or
+    CPU SHM with an empty event handle).
     Data mode uses the two-phase ``PREPARE_STORE`` + ``COMMIT_STORE``.
     When ``server_pool`` and ``client_tensors`` are both supplied the
     bench gathers the paged block range into flat per-chunk CPU
-    tensors and writes them into the server-owned SHM pool via the
-    slot descriptors returned by ``PREPARE_STORE``.
-
-    TODO(baoloongmao): CPU handle mode (zero-copy SHM IPC STORE path)
-    will be implemented in a separate PR.
+    tensors and writes them straight into the server-owned SHM pool
+    via the slot descriptors returned by ``PREPARE_STORE``, so the
+    follow-up ``COMMIT_STORE`` carries an empty payload and the
+    server stays on its zero-copy SHM path.
     """
     if use_handle is None:
         use_handle = use_gpu
@@ -583,22 +653,19 @@ def _send_store(
             key,
             _INSTANCE_ID,
             [block_ids] * num_engine_group_infos,
-            _make_event_handle(use_gpu),
+            _make_event_handle(),
         ]
         result = _call(client, RequestType.STORE, payloads)
         if result is _TIMEOUT:
             return "timeout"
         return "stored" if result[1] else "store_failed"
 
-    # Data mode: PREPARE_STORE -> COMMIT_STORE
+    # CPU mode: PREPARE_STORE -> COMMIT_STORE
     prep = _call(client, RequestType.PREPARE_STORE, [key, _INSTANCE_ID])
     if prep is _TIMEOUT:
         return "timeout"
-    if prep is None:
-        return "store_failed"
     if server_pool is not None and client_tensors is not None and chunk_size > 0:
-        raw_ctx = getattr(prep, "context", None)
-        ctx = raw_ctx if isinstance(raw_ctx, dict) else {}
+        ctx = prep.context if isinstance(prep.context, dict) else {}
         slots = ctx.get("slots", []) or []
         chunk_indices = ctx.get("chunk_indices", []) or []
         if slots and chunk_indices:
@@ -613,7 +680,7 @@ def _send_store(
             slot_views = _build_server_slot_views(server_pool, slots)
             for slot_view, chunk_idx in zip(slot_views, chunk_indices, strict=False):
                 if 0 <= chunk_idx < len(full_chunks):
-                    slot_view.copy_(full_chunks[chunk_idx].reshape(slot_view.shape))
+                    slot_view.copy_(full_chunks[chunk_idx].view(slot_view.shape))
     commit = _call(client, RequestType.COMMIT_STORE, [key, _INSTANCE_ID, b""])
     if commit is _TIMEOUT:
         return "timeout"
@@ -631,19 +698,19 @@ def _send_retrieve(
     use_gpu: bool = True,
     use_handle: bool | None = None,
     client_tensors: list["torch.Tensor"] | None = None,
-    server_pool: "memoryview | None" = None,
+    server_pool: "mmap.mmap | None" = None,
 ) -> str:
     """Retrieve KV cache blocks. Returns status.
 
-    Handle mode uses the single-shot ``RETRIEVE`` RPC (GPU CUDA-IPC).
+    Handle mode uses the single-shot ``RETRIEVE`` RPC (GPU CUDA-IPC, or
+    CPU SHM with an empty event handle).
     Data mode uses the two-phase ``PREPARE_RETRIEVE`` +
     ``COMMIT_RETRIEVE``. When ``server_pool`` and ``client_tensors``
     are both supplied the bench builds zero-copy tensor views over
     the slot descriptors returned by ``PREPARE_RETRIEVE`` and
-    scatters them back into the paged client SHM.
-
-    TODO(baoloongmao): CPU handle mode (zero-copy SHM IPC RETRIEVE
-    path) will be implemented in a separate PR.
+    scatters them back into the paged client SHM, so the round-trip
+    self-check can run without ``PREPARE_RETRIEVE`` having to ship a
+    pickled copy of the chunks.
     """
     if use_handle is None:
         use_handle = use_gpu
@@ -655,7 +722,7 @@ def _send_retrieve(
             key,
             _INSTANCE_ID,
             [block_ids] * num_engine_group_infos,
-            _make_event_handle(use_gpu),
+            _make_event_handle(),
             0,  # skip_first_n_tokens
         ]
         result = _call(client, RequestType.RETRIEVE, payloads)
@@ -663,15 +730,14 @@ def _send_retrieve(
             return "timeout"
         return "retrieved" if result[1] else "retrieve_failed"
 
-    # Data mode: PREPARE_RETRIEVE -> COMMIT_RETRIEVE
+    # CPU mode: PREPARE_RETRIEVE -> COMMIT_RETRIEVE
     prep = _call(client, RequestType.PREPARE_RETRIEVE, [key, _INSTANCE_ID])
     if prep is _TIMEOUT:
         return "timeout"
-    if prep is None or not getattr(prep, "success", False):
+    if not prep.success:
         return "retrieve_failed"
     if server_pool is not None and client_tensors is not None:
-        raw_ctx = getattr(prep, "context", None)
-        ctx = raw_ctx if isinstance(raw_ctx, dict) else {}
+        ctx = prep.context if isinstance(prep.context, dict) else {}
         slots = ctx.get("slots", []) or []
         if slots:
             try:
@@ -788,7 +854,7 @@ def _process_request(
     use_gpu: bool = True,
     use_handle: bool | None = None,
     client_tensors: list["torch.Tensor"] | None = None,
-    server_pool: "memoryview | None" = None,
+    server_pool: "mmap.mmap | None" = None,
 ) -> list[str] | None:
     """Run the full lookup -> retrieve/store flow.
 
@@ -802,7 +868,9 @@ def _process_request(
       proves the server returned the exact bytes we sent.
 
     Handle mode keeps the historical server-side
-    ``/kvcache/check`` path; client tensors are not consulted.
+    ``/kvcache/check`` path; client tensors are not consulted (in
+    handle mode the client and server share the same SHM/IPC
+    pages, so a client-side hash equals itself by construction).
     """
     token_ids = _build_token_ids(seq_no, num_tokens)
     request_id = "req-%d-%s" % (seq_no, pass_label)
@@ -963,7 +1031,8 @@ def _process_request(
     #       cold -> ground truth captured pre-STORE
     #       warm -> hash post-RETRIEVE; cold == warm proves the
     #               server returned the exact bytes we wrote.
-    #   * handle mode: query /kvcache/check on the server.
+    #   * handle mode: query /kvcache/check on the server, which
+    #     reads the shared SHM/IPC pages directly.
     checksums: list[str] | None = None
     if client_tensors is not None and num_full_tokens > 0:
         if pass_label == "cold":
diff --git a/lmcache/integration/vllm/vllm_multi_process_adapter.py b/lmcache/integration/vllm/vllm_multi_process_adapter.py
index 7e0477b92c..c2c4b9a4d9 100644
--- a/lmcache/integration/vllm/vllm_multi_process_adapter.py
+++ b/lmcache/integration/vllm/vllm_multi_process_adapter.py
@@ -919,9 +919,19 @@ def __init__(
             cfg = _resolve_extra_config(extra_config)
             mq_timeout = cfg[ExtraConfigDefault.mq_timeout.name]
             heartbeat_interval = cfg[ExtraConfigDefault.heartbeat_interval.name]
-            self._mp_transfer_mode = cfg[ExtraConfigDefault.mp_transfer_mode.name]
+            # Only treat ``mp_transfer_mode`` as an explicit override when
+            # the user actually set it in extra_config; otherwise leave it
+            # as ``None`` so ``create_transfer_context`` can still consult
+            # the ``LMCACHE_MP_TRANSFER_MODE`` env var.
+            mp_mode_key = (
+                _EXTRA_CONFIG_KEY_PREFIX + ExtraConfigDefault.mp_transfer_mode.name
+            )
+            if mp_mode_key in extra_config:
+                self._mp_transfer_mode = cfg[ExtraConfigDefault.mp_transfer_mode.name]
+            else:
+                self._mp_transfer_mode = None
         else:
-            self._mp_transfer_mode = ExtraConfigDefault.mp_transfer_mode.value
+            self._mp_transfer_mode = None
         self.mq_client = MessageQueueClient(server_url, context)
         self._mq_timeout = mq_timeout
 
diff --git a/lmcache/python_ops_fallback.py b/lmcache/python_ops_fallback.py
index a95bf24493..df15d2579e 100644
--- a/lmcache/python_ops_fallback.py
+++ b/lmcache/python_ops_fallback.py
@@ -775,6 +775,7 @@ def _is_hnd_format(gpu_kv_format: GPUKVFormat) -> bool:
     return int(gpu_kv_format) in (
         int(GPUKVFormat.NL_X_TWO_NB_NH_BS_HS),
         int(GPUKVFormat.NL_X_NB_TWO_NH_BS_HS),
+        int(GPUKVFormat.NL_X_NB_NH_BS_TWO_HS),
     )
 
 
@@ -836,6 +837,10 @@ def _per_layer_paged_shape(
         return (2, nb, nh, bs, hs)
     if fmt == int(GPUKVFormat.NL_X_NB_TWO_NH_BS_HS):
         return (nb, 2, nh, bs, hs)
+    if fmt == int(GPUKVFormat.NL_X_NB_NH_BS_TWO_HS):
+        # vLLM CPU blocks-first fused KV: K and V interleaved at the
+        # second-to-last dim so each layer is [NB, NH, BS, 2, HS].
+        return (nb, nh, bs, 2, hs)
     if fmt == int(GPUKVFormat.NL_X_TWO_NB_BS_NH_HS):
         return (2, nb, bs, nh, hs)
     # Covers NL_X_NB_TWO_BS_NH_HS and any future NHD variants.
@@ -1480,6 +1485,9 @@ def _transfer_per_layer_hnd(
         # Determine K/V split based on specific format
         if int(gpu_kv_format) == int(GPUKVFormat.NL_X_TWO_NB_NH_BS_HS):
             k_t, v_t = layer[0], layer[1]
+        elif int(gpu_kv_format) == int(GPUKVFormat.NL_X_NB_NH_BS_TWO_HS):
+            # vLLM CPU blocks-first fused KV: [NB, NH, BS, 2, HS].
+            k_t, v_t = layer[:, :, :, 0], layer[:, :, :, 1]
         else:
             k_t, v_t = layer[:, 0], layer[:, 1]
         _nb, nh, _bs, hs = k_t.shape
diff --git a/lmcache/v1/multiprocess/modules/gpu_transfer.py b/lmcache/v1/multiprocess/modules/gpu_transfer.py
index eaf4d1ff83..99bbe24ab0 100644
--- a/lmcache/v1/multiprocess/modules/gpu_transfer.py
+++ b/lmcache/v1/multiprocess/modules/gpu_transfer.py
@@ -104,12 +104,16 @@ def batched_iteration(lst: list, batch_size: int) -> Generator[tuple, None, None
 class ContextEntry:
     """Registered cache context metadata for a single worker instance.
 
-    The actual concrete type is whatever :func:`create_cache_context`
-    returned -- currently always a :class:`GPUCacheContext`.
+    The concrete type is whatever :func:`create_cache_context` returned
+    for the wrapper list at registration time -- a
+    :class:`GPUCacheContext` for CUDA-IPC wrappers, a
+    :class:`CpuCacheContext` for POSIX-SHM wrappers. Both expose
+    the same ``kv_tensors`` / ``gpu_kv_format_`` / ``num_layers`` / ...
+    duck-typed surface, so downstream consumers stay agnostic.
 
     Args:
-        cache_context: Platform cache context managing shape and pointers
-            to the registered KV cache tensors.
+        cache_context: Platform cache context (GPU or CPU) managing
+            shape and pointers to the registered KV cache tensors.
         model_name: The name of the model associated with this KV cache.
         world_size: The world size associated with this KV cache.
     """
diff --git a/lmcache/v1/multiprocess/server.py b/lmcache/v1/multiprocess/server.py
index eb70ca9d3b..b6d26cf1e3 100644
--- a/lmcache/v1/multiprocess/server.py
+++ b/lmcache/v1/multiprocess/server.py
@@ -113,7 +113,7 @@ def storage_manager(self) -> StorageManager:
 
     @property
     def gpu_contexts(self) -> dict[int, GPUCacheContext] | None:
-        """Used by ``/kvcache/check``; unwraps :class:`GPUContextEntry`."""
+        """Used by ``/kvcache/check``; unwraps :class:`ContextEntry`."""
         for module in self._modules:
             if isinstance(module, GPUTransferModule):
                 return {i: e.cache_context for i, e in module.cache_contexts.items()}
diff --git a/lmcache/v1/multiprocess/token_hasher.py b/lmcache/v1/multiprocess/token_hasher.py
index 14ddb0b6df..5fffc94236 100644
--- a/lmcache/v1/multiprocess/token_hasher.py
+++ b/lmcache/v1/multiprocess/token_hasher.py
@@ -164,7 +164,15 @@ def _init_none_hash(self) -> Any:
                     none_hash = kv_cache_utils.NONE_HASH
                     logger.info("Initialized NONE_HASH=%s from vLLM", none_hash)
                     return none_hash
-            except (ImportError, AttributeError, ValueError, RuntimeError):
+            except (
+                ImportError,
+                AttributeError,
+                ValueError,
+                RuntimeError,
+                # torch._dynamo.device_interface raises AssertionError
+                # when CudaInterface is defined on non-CUDA platforms.
+                AssertionError,
+            ):
                 pass
 
         # Fallback: compute none_hash using our hash function
diff --git a/lmcache/v1/platform/cache_context.py b/lmcache/v1/platform/cache_context.py
index b3f983e74f..81a9e4430d 100644
--- a/lmcache/v1/platform/cache_context.py
+++ b/lmcache/v1/platform/cache_context.py
@@ -5,10 +5,13 @@
 
 * :class:`~lmcache.v1.multiprocess.gpu_context.GPUCacheContext` --
   CUDA-backed.
+* :class:`~lmcache.v1.platform.cpu.cache_context.CpuCacheContext` --
+  CPU-only fallback (POSIX-SHM-backed KV tensors).
 
 :func:`create_cache_context` keeps the dispatch out of the call site
 in :mod:`lmcache.v1.multiprocess.server` so adding a new accelerator
-only requires shipping a new sub-package + extending the factory below.
+only requires shipping a new sub-package + extending the wrapper
+isinstance check below.
 """
 
 # Future
@@ -22,6 +25,7 @@
 from lmcache.utils import EngineType
 from lmcache.v1.gpu_connector.utils import LayoutHints
 from lmcache.v1.multiprocess.custom_types import KVCache
+from lmcache.v1.platform.cpu.cache_context import CpuCacheContext
 
 if TYPE_CHECKING:
     # First Party
@@ -41,6 +45,7 @@ def create_cache_context(
     forward their kwargs verbatim and stay agnostic of the active
     backend.
 
+    Selection is driven by the wrapper type of *kv_caches*:
     Currently only :class:`GPUCacheContext` is supported.  CPU and
     other accelerator backends will be added in follow-up PRs.
 
@@ -54,19 +59,24 @@ def create_cache_context(
         engine_type: Which serving engine produced the caches.
 
     Returns:
-        A concrete cache context instance (currently always
-        :class:`~lmcache.v1.multiprocess.gpu_context.GPUCacheContext`).
+        A concrete cache context instance.
 
     Raises:
         ValueError: If *kv_caches* is empty.
     """
     # First Party
     from lmcache.v1.multiprocess.gpu_context import GPUCacheContext
+    from lmcache.v1.platform.cpu.shm import CpuShmTensorWrapper
 
     if not kv_caches:
         raise ValueError("create_cache_context requires a non-empty kv_caches list")
 
-    return GPUCacheContext(
+    cls: type = (
+        CpuCacheContext
+        if any(isinstance(w, CpuShmTensorWrapper) for w in kv_caches)
+        else GPUCacheContext
+    )
+    return cls(
         kv_caches,
         lmcache_logical_chunk_size,
         layout_hints,
diff --git a/lmcache/v1/platform/cpu/cache_context.py b/lmcache/v1/platform/cpu/cache_context.py
new file mode 100644
index 0000000000..04dc8ea7fa
--- /dev/null
+++ b/lmcache/v1/platform/cpu/cache_context.py
@@ -0,0 +1,644 @@
+# SPDX-License-Identifier: Apache-2.0
+"""CPU-only cache context for platforms without CUDA GPUs.
+
+This module lives in the ``platform.cpu`` sub-package because it is
+the CPU-specific implementation of the cross-platform cache context
+-- it provides the same public API as
+:class:`~lmcache.v1.multiprocess.gpu_context.GPUCacheContext` but
+keeps all tensors on CPU. Stream / Event objects are provided by
+:class:`~lmcache.v1.platform.cpu.stub_cpu_device.StubStream` so
+CPU-only hosts never import ``cupy`` or instantiate a real CUDA
+stream object.
+
+The platform-agnostic dispatcher ``create_cache_context`` lives in
+:mod:`lmcache.v1.platform.cache_context`.
+"""
+
+# Future
+from __future__ import annotations
+
+# Standard
+from collections.abc import Sequence
+from typing import TYPE_CHECKING
+import array
+import os
+
+# Third Party
+import torch
+
+# First Party
+from lmcache.logging import init_logger
+from lmcache.utils import EngineType
+from lmcache.v1.gpu_connector.utils import (
+    LayoutHints,
+    get_attention_backend,
+    get_concrete_gpu_kv_shape_from_shape_desc,
+    get_gpu_kv_shape_description,
+    get_group_data_ptrs,
+    get_num_blocks,
+    get_num_layers,
+    is_mla,
+    normalize_kv_and_discover_format,
+)
+from lmcache.v1.kv_layer_groups import KVLayerGroupsManager
+from lmcache.v1.multiprocess.custom_types import KVCache
+from lmcache.v1.platform.cpu.stub_cpu_device import StubStream
+import lmcache.c_ops as lmc_ops
+
+if TYPE_CHECKING:
+    # First Party
+    from lmcache.v1.multiprocess.group_view import EngineGroupInfo
+
+logger = init_logger(__name__)
+
+
+class CpuCacheContext:
+    """CPU-only cache context with the same public API as
+    :class:`GPUCacheContext`.
+
+    All tensors live on CPU. CUDA streams and cupy streams are
+    replaced by :class:`StubStream` no-op objects so callers can keep
+    using ``stream.synchronize()`` / ``wait_event(...)`` etc. without
+    branching on the active backend.
+
+    KV cache tensors are reconstructed from the
+    :class:`CpuShmTensorWrapper` instances sent by the client over
+    POSIX shared memory -- the server does **not** allocate the KV
+    cache itself. This mirrors the GPU-mode CUDA-IPC flow where the
+    client owns the buffers and the server only maps them.
+    """
+
+    def __init__(
+        self,
+        kv_caches: KVCache,
+        lmcache_logical_chunk_size: int = 256,
+        layout_hints: LayoutHints | None = None,
+        engine_group_infos: "Sequence[EngineGroupInfo]" = (),
+        engine_type: EngineType = EngineType.VLLM,
+    ) -> None:
+        if not kv_caches:
+            raise ValueError(
+                "CpuCacheContext requires a non-empty list of "
+                "CpuShmTensorWrapper; the legacy server-side "
+                "self-allocation path has been removed."
+            )
+
+        # First Party
+        from lmcache.v1.multiprocess.gpu_context import (
+            unwrap_kv_cache_tensors,
+        )
+
+        unwrapped = unwrap_kv_cache_tensors(kv_caches)
+        self.device_ = torch.device("cpu")
+        self.lmcache_logical_chunk_size = lmcache_logical_chunk_size
+
+        # Discover layout & build KV layer groups via the same path
+        # GPUCacheContext uses, so we don't need to hand-roll any
+        # PageBufferShapeDesc here. ``layout_hints`` / ``engine_type``
+        # are forwarded so the signature matches GPUCacheContext.
+        (
+            self._gpu_kv_format,
+            kv_caches_normalized,
+        ) = normalize_kv_and_discover_format(
+            unwrapped,
+            engine_type,
+            layout_hints=layout_hints,
+        )
+        self.kv_caches_: list[torch.Tensor] = list(kv_caches_normalized)
+        self.is_mla_ = is_mla(self._gpu_kv_format)
+        self.num_layers_ = get_num_layers(self.kv_caches_, self._gpu_kv_format)
+        self.num_blocks_ = get_num_blocks(self.kv_caches_, self._gpu_kv_format)
+        self.kv_layer_groups_manager_ = KVLayerGroupsManager(
+            self.kv_caches_,
+            gpu_kv_format=self._gpu_kv_format,
+            num_blocks=self.num_blocks_,
+            layout_hints=layout_hints,
+            engine_group_infos=engine_group_infos,
+            lmcache_logical_chunk_size=lmcache_logical_chunk_size,
+        )
+
+        # Per-group KV pointer tensors (CPU). Reuse the same helper
+        # GPUCacheContext relies on so the layout matches exactly.
+        self.group_kv_pointers_: list[torch.Tensor] = [
+            torch.tensor(
+                get_group_data_ptrs(
+                    self.kv_caches_,
+                    self.gpu_kv_format_,
+                    group.layer_indices,
+                ),
+                dtype=torch.long,
+            )
+            for group in self.kv_layer_groups_manager_.kv_layer_groups
+        ]
+
+        # Backwards-compat aliases (a few callers still expect these).
+        self.hidden_dim_sizes_: list[int] = [
+            group.hidden_dim_size
+            for group in self.kv_layer_groups_manager_.kv_layer_groups
+        ]
+        self.kv_cache_pointers_ = torch.tensor(
+            [t.data_ptr() for t in self.kv_caches_], dtype=torch.long
+        )
+
+        # Pre-allocated block IDs buffer (CPU).
+        _MAX_BLOCK_IDS = 1_000_000
+        self.block_ids_buffer_ = torch.empty(_MAX_BLOCK_IDS, dtype=torch.long)
+
+        # Temporary buffer for transfers (same layout as
+        # GPUCacheContext but on CPU).
+        self.max_batch_size = 4
+        self.tmp_chunk_group_offsets_: list[int] = [0]
+        for group_idx, group in enumerate(
+            self.kv_layer_groups_manager_.kv_layer_groups
+        ):
+            shape = self.get_kv_buffer_shape(lmcache_logical_chunk_size, group_idx)
+            byte_size = shape.numel() * group.dtype.itemsize
+            self.tmp_chunk_group_offsets_.append(
+                self.tmp_chunk_group_offsets_[-1] + byte_size
+            )
+        self.tmp_chunk_bytes_ = self.tmp_chunk_group_offsets_[-1]
+        # Buffer lives on CPU; keep the attribute name aligned with the
+        # context to avoid GPU-prefixed naming bleeding into a CPU-only
+        # class. The public ``get_tmp_gpu_buffer_flat`` method name is
+        # preserved so ``server.py`` can duck-type across backends.
+        self.tmp_cpu_buffer_ = torch.empty(
+            self.tmp_chunk_bytes_ * self.max_batch_size,
+            dtype=torch.uint8,
+        )
+
+        # Mock streams. ``StubStream`` already implements the small
+        # subset of the API server-side code uses (``synchronize``,
+        # ``wait_event``, ``record_event`` ...), so we never import
+        # cupy or instantiate a real CUDA stream object here.
+        self.cuda_stream_: StubStream = StubStream(device="cpu")
+        self.cupy_stream_: StubStream = self.cuda_stream_
+        self.high_priority_cuda_stream_: StubStream = StubStream(
+            device="cpu", priority=0
+        )
+        self.high_priority_cupy_stream_: StubStream = self.high_priority_cuda_stream_
+
+        # Sanity-check: warn if /dev/shm looks too small for the
+        # registered KV cache. Only meaningful on Linux where
+        # /dev/shm is the default tmpfs backing POSIX SHM.
+        self._check_shm_capacity()
+
+        logger.info(
+            "CpuCacheContext: %d layers, %d blocks, dtype=%s (shm-backed)",
+            self.num_layers_,
+            self.num_blocks_,
+            self.kv_caches_[0].dtype,
+        )
+
+    # -- Internal helpers --
+
+    _SHM_PATH = "/dev/shm"
+
+    def _check_shm_capacity(self) -> None:
+        """Warn if /dev/shm free space is smaller than the KV cache."""
+        if not os.path.isdir(self._SHM_PATH):
+            return
+        try:
+            st = os.statvfs(self._SHM_PATH)
+        except OSError:
+            return
+        free_bytes = st.f_bavail * st.f_frsize
+        kv_bytes = sum(t.numel() * t.element_size() for t in self.kv_caches_)
+        if kv_bytes > free_bytes:
+            logger.warning(
+                "Insufficient /dev/shm space for CPU KV cache: "
+                "need %d bytes but only %d bytes available. "
+                "Consider increasing the size of /dev/shm "
+                "(e.g. mount -o remount,size=<N>G /dev/shm).",
+                kv_bytes,
+                free_bytes,
+            )
+
+    # -- Properties (same API as GPUCacheContext) --
+
+    @property
+    def dtype(self) -> torch.dtype:
+        """Returns the dtype of the KV cache tensors."""
+        return self.kv_caches_[0].dtype
+
+    @property
+    def device(self) -> torch.device:
+        """Returns the device (always CPU)."""
+        return self.device_
+
+    @property
+    def kv_tensors(self) -> list[torch.Tensor]:
+        """Returns the list of per-layer KV cache tensors."""
+        return self.kv_caches_
+
+    @property
+    def kv_pointers(self) -> torch.Tensor:
+        """Returns a tensor of KV cache data pointers."""
+        return self.kv_cache_pointers_
+
+    @property
+    def stream(self) -> StubStream:
+        """Returns the (mock) CUDA stream."""
+        return self.cuda_stream_
+
+    @property
+    def cupy_stream(self) -> StubStream:
+        """Returns the (mock) external stream."""
+        return self.cupy_stream_
+
+    @property
+    def high_priority_stream(self) -> StubStream:
+        """Returns the (mock) high-priority CUDA stream."""
+        return self.high_priority_cuda_stream_
+
+    @property
+    def high_priority_cupy_stream(self) -> StubStream:
+        """Returns the (mock) high-priority external stream."""
+        return self.high_priority_cupy_stream_
+
+    @property
+    def block_size(self) -> int:
+        """Returns the block size (tokens per block)."""
+        return self.kv_layer_groups_manager_.kv_layer_groups[0].shape_desc.bs
+
+    @property
+    def num_layers(self) -> int:
+        """Returns the number of layers in the model."""
+        return self.num_layers_
+
+    @property
+    def num_blocks(self) -> int:
+        """Returns the number of blocks in the KV cache."""
+        return self.num_blocks_
+
+    @property
+    def is_mla(self) -> bool:
+        """Returns whether the model uses MLA."""
+        return self.is_mla_
+
+    @property
+    def hidden_dim_sizes(self) -> list[int]:
+        """Returns hidden dimension sizes per KV layer group."""
+        return self.hidden_dim_sizes_
+
+    @property
+    def group_physical_block_sizes(self) -> list[int]:
+        """Per-group physical slot count (``shape_desc.bs``) in group
+        order."""
+        return [
+            group.shape_desc.bs
+            for group in self.kv_layer_groups_manager_.kv_layer_groups
+        ]
+
+    @property
+    def group_compress_ratios(self) -> list[int]:
+        """Per-group compression ratio in group order.
+        ``1`` for non-compressed groups.
+        """
+        return [
+            group.compress_ratio
+            for group in self.kv_layer_groups_manager_.kv_layer_groups
+        ]
+
+    @property
+    def kv_layer_groups_manager(self) -> KVLayerGroupsManager:
+        """Returns the KV layer groups manager."""
+        return self.kv_layer_groups_manager_
+
+    @property
+    def gpu_kv_format_(self):
+        """Returns the GPU KV format enum (API parity with GPUCacheContext)."""
+        return self._gpu_kv_format
+
+    @property
+    def gpu_kv_shape(self) -> str:
+        """Returns the symbolic GPU KV cache layout description."""
+        return get_gpu_kv_shape_description(self._gpu_kv_format)
+
+    @property
+    def attention_backend(self) -> str:
+        """Returns the attention backend name."""
+        return get_attention_backend(self._gpu_kv_format)
+
+    @property
+    def concrete_gpu_kv_shape(self) -> str:
+        """Returns the GPU KV shape with actual numeric values."""
+        group = self.kv_layer_groups_manager_.kv_layer_groups[0]
+        return get_concrete_gpu_kv_shape_from_shape_desc(
+            group.shape_desc, self._gpu_kv_format
+        )
+
+    def calculate_num_blocks(self, num_tokens: int, kernel_group_idx: int) -> int:
+        """Calculate the number of blocks for a given number of tokens.
+
+        Mirrors :meth:`GPUCacheContext.calculate_num_blocks`.
+
+        Args:
+            num_tokens: The total number of tokens to be processed.
+            kernel_group_idx: 0-based index of the kernel group.
+
+        Returns:
+            The number of blocks.
+        """
+        return self.kv_layer_groups_manager_.calculate_num_blocks(
+            kernel_group_idx, num_tokens
+        )
+
+    def get_shape_desc(self, group_idx: int) -> "lmc_ops.PageBufferShapeDesc":
+        """Returns the PageBufferShapeDesc for the given group."""
+        return self.kv_layer_groups_manager_.get_shape_desc(group_idx)
+
+    def get_physical_chunk_size(self, group_idx: int) -> int:
+        """Returns the per-chunk physical slot count for the group."""
+        return self.kv_layer_groups_manager_.get_physical_chunk_size(group_idx)
+
+    def blocks_for_tokens(self, num_logical_tokens: int, group_idx: int) -> int:
+        """Number of blocks that span *num_logical_tokens* for a group.
+
+        Mirrors :meth:`GPUCacheContext.blocks_for_tokens`.
+        """
+        group = self.kv_layer_groups_manager_.kv_layer_groups[group_idx]
+        physical_slots = num_logical_tokens // group.compress_ratio
+        return physical_slots // group.shape_desc.bs
+
+    def get_group_kv_pointers(self, group_idx: int) -> torch.Tensor:
+        """Returns the KV cache pointer tensor for the given group."""
+        return self.group_kv_pointers_[group_idx]
+
+    def get_kernel_group_kv_pointers(self, kernel_group_idx: int) -> torch.Tensor:
+        """Returns the KV pointer tensor for the given kernel group.
+
+        Mirrors :meth:`GPUCacheContext.get_kernel_group_kv_pointers`.
+        """
+        return self.group_kv_pointers_[kernel_group_idx]
+
+    def get_kernel_group_shape_dtype(
+        self,
+        num_tokens: int,
+        kernel_group_idx: int,
+    ) -> tuple[torch.Size, torch.dtype]:
+        """Returns the shape and dtype for the given kernel group index and
+        number of tokens.
+
+        Mirrors :meth:`GPUCacheContext.get_kernel_group_shape_dtype` so
+        callers such as ``gpu_transfer.get_layout_desc`` can duck-type
+        across GPU and CPU backends.
+
+        Args:
+            num_tokens: Number of tokens.
+            kernel_group_idx: Index of the kernel group.
+
+        Returns:
+            A ``(shape, dtype)`` tuple for the given kernel group.
+        """
+        group = self.kv_layer_groups_manager_.kv_layer_groups[kernel_group_idx]
+        compress_ratio = group.compress_ratio
+        if num_tokens % compress_ratio != 0:
+            raise ValueError(
+                "num_tokens (%d) is not a multiple of compress_ratio (%d) "
+                "for kernel_group_idx %d"
+                % (num_tokens, compress_ratio, kernel_group_idx)
+            )
+        num_slots = num_tokens // compress_ratio
+        sd = group.shape_desc
+        shape = torch.Size(
+            (sd.kv_size, group.num_layers, num_slots, group.hidden_dim_size)
+        )
+        return shape, group.dtype
+
+    def get_kv_buffer_shape(
+        self, logical_num_tokens: int, group_idx: int = 0
+    ) -> torch.Size:
+        """Returns the KV buffer shape for the given number of
+        *logical* tokens.
+
+        Mirrors :meth:`GPUCacheContext.get_kv_buffer_shape`:
+        divides by ``compress_ratio`` and uses ``sd.kv_size`` so
+        compressed groups (MLA etc.) get the correct shape.
+        """
+        group = self.kv_layer_groups_manager_.kv_layer_groups[group_idx]
+        compress_ratio = group.compress_ratio
+        if logical_num_tokens % compress_ratio != 0:
+            raise ValueError(
+                "logical_num_tokens (%d) is not a multiple of "
+                "compress_ratio (%d) for group %d"
+                % (logical_num_tokens, compress_ratio, group_idx)
+            )
+        num_slots = logical_num_tokens // compress_ratio
+        sd = group.shape_desc
+        return torch.Size(
+            (sd.kv_size, group.num_layers, num_slots, group.hidden_dim_size)
+        )
+
+    def get_tmp_gpu_buffer_flat(self, chunk_idx: int) -> torch.Tensor:
+        """Returns the flat uint8 temp buffer for the given chunk."""
+        if chunk_idx >= self.max_batch_size:
+            raise ValueError(
+                "chunk_idx %d >= max_batch_size %d" % (chunk_idx, self.max_batch_size)
+            )
+        start = chunk_idx * self.tmp_chunk_bytes_
+        return self.tmp_cpu_buffer_[start : start + self.tmp_chunk_bytes_]
+
+    def get_temp_kernel_group_buffer(
+        self, batch_idx: int, kernel_group_idx: int
+    ) -> torch.Tensor:
+        """Returns the typed temp buffer for the given batch and kernel group.
+
+        Mirrors :meth:`GPUCacheContext.get_temp_kernel_group_buffer`.
+
+        Args:
+            batch_idx: Batch slot index (0 <= batch_idx < max_batch_size).
+            kernel_group_idx: Index of the kernel group.
+
+        Returns:
+            A typed tensor view with the correct shape and dtype.
+        """
+        if batch_idx >= self.max_batch_size:
+            raise ValueError(
+                "batch_idx %d >= max_batch_size %d" % (batch_idx, self.max_batch_size)
+            )
+        group = self.kv_layer_groups_manager_.kv_layer_groups[kernel_group_idx]
+        shape = self.get_kv_buffer_shape(
+            self.lmcache_logical_chunk_size, kernel_group_idx
+        )
+        g_start = self.tmp_chunk_group_offsets_[kernel_group_idx]
+        g_end = self.tmp_chunk_group_offsets_[kernel_group_idx + 1]
+        chunk = self.tmp_chunk_bytes_
+        return (
+            self.tmp_cpu_buffer_[
+                batch_idx * chunk + g_start : batch_idx * chunk + g_end
+            ]
+            .view(group.dtype)
+            .view(shape)
+        )
+
+    def get_temp_object_group_buffer(
+        self, batch_idx: int, object_group_idx: int
+    ) -> torch.Tensor:
+        """Returns the flat uint8 temp buffer for the given batch and object
+        group.
+
+        Mirrors :meth:`GPUCacheContext.get_temp_object_group_buffer`.
+
+        Args:
+            batch_idx: Batch slot index (0 <= batch_idx < max_batch_size).
+            object_group_idx: Index of the object group.
+
+        Returns:
+            A flat uint8 tensor view covering the object group's byte range.
+        """
+        if batch_idx >= self.max_batch_size:
+            raise ValueError(
+                "batch_idx %d >= max_batch_size %d" % (batch_idx, self.max_batch_size)
+            )
+        manager = self.kv_layer_groups_manager_
+        object_group = manager.object_groups[object_group_idx]
+        kg_indices = object_group.kernel_group_indices
+        # Object group spans from the first to the last kernel group's range.
+        g_start = self.tmp_chunk_group_offsets_[kg_indices[0]]
+        g_end = self.tmp_chunk_group_offsets_[kg_indices[-1] + 1]
+        chunk = self.tmp_chunk_bytes_
+        return self.tmp_cpu_buffer_[
+            batch_idx * chunk + g_start : batch_idx * chunk + g_end
+        ]
+
+    def get_tmp_chunk_gpu_buffer(self, group_idx: int = 0) -> torch.Tensor:
+        """Returns a typed view of the temp buffer for one chunk."""
+        group = self.kv_layer_groups_manager_.kv_layer_groups[group_idx]
+        shape = self.get_kv_buffer_shape(self.lmcache_logical_chunk_size, group_idx)
+        start = self.tmp_chunk_group_offsets_[group_idx]
+        end = self.tmp_chunk_group_offsets_[group_idx + 1]
+        return self.tmp_cpu_buffer_[start:end].view(group.dtype).view(shape)
+
+    def get_tmp_chunk_gpu_buffer_batched(
+        self, batch_size: int, group_idx: int = 0
+    ) -> list[torch.Tensor]:
+        """Returns a list of non-overlapping temp buffer views."""
+        if batch_size > self.max_batch_size:
+            raise ValueError(
+                "batch_size %d > max_batch_size %d" % (batch_size, self.max_batch_size)
+            )
+        group = self.kv_layer_groups_manager_.kv_layer_groups[group_idx]
+        shape = self.get_kv_buffer_shape(self.lmcache_logical_chunk_size, group_idx)
+        g_start = self.tmp_chunk_group_offsets_[group_idx]
+        g_end = self.tmp_chunk_group_offsets_[group_idx + 1]
+        chunk = self.tmp_chunk_bytes_
+        return [
+            self.tmp_cpu_buffer_[i * chunk + g_start : i * chunk + g_end]
+            .view(group.dtype)
+            .view(shape)
+            for i in range(batch_size)
+        ]
+
+    def stage_block_ids(self, block_ids: list[int]) -> torch.Tensor:
+        """Copy block IDs into the pre-allocated buffer."""
+        if not block_ids:
+            raise ValueError("stage_block_ids requires a non-empty block_ids list")
+        n = len(block_ids)
+        capacity = self.block_ids_buffer_.shape[0]
+        if n > capacity:
+            raise ValueError(
+                "stage_block_ids: %d block IDs exceeds buffer capacity %d"
+                % (n, capacity)
+            )
+        cpu_tensor = torch.tensor(block_ids, dtype=torch.long)
+        buf = self.block_ids_buffer_[:n]
+        buf.copy_(cpu_tensor)
+        return buf
+
+    def copy_view_block_ids_to_gpu(
+        self, block_ids_per_group: list[list[int]]
+    ) -> list[torch.Tensor]:
+        """CPU-side counterpart to ``GPUCacheContext.copy_view_block_ids_to_gpu``.
+
+        Packs all per-group block IDs into the shared CPU buffer and
+        returns one non-overlapping view per LMCache group. The name
+        is kept for API parity; on a CPU-only host the buffer simply
+        lives on the host.
+        """
+        offsets = [0]
+        flat: array.array = array.array("l")
+        for view_block_ids in block_ids_per_group:
+            flat.extend(view_block_ids)
+            offsets.append(len(flat))
+
+        total = offsets[-1]
+        if total > self.block_ids_buffer_.shape[0]:
+            raise ValueError(
+                "block ID total %d exceeds the pre-allocated buffer "
+                "size %d" % (total, self.block_ids_buffer_.shape[0])
+            )
+        if total:
+            cpu_tensor = torch.frombuffer(flat, dtype=torch.long)
+            self.block_ids_buffer_[:total].copy_(cpu_tensor)
+
+        return [
+            self.block_ids_buffer_[offsets[i] : offsets[i + 1]]
+            for i in range(len(block_ids_per_group))
+        ]
+
+    def report_status(self) -> dict:
+        """Return this context's KV cache layout metadata.
+
+        Mirrors :meth:`GPUCacheContext.report_status` so
+        ``GPUTransferModule.report_status`` can duck-type across backends.
+        """
+        manager = self.kv_layer_groups_manager_
+        kernel_groups = manager.kernel_groups
+
+        kernel_group_to_object_group: dict[int, int] = {
+            kg_idx: og_idx
+            for og_idx, og in enumerate(manager.object_groups)
+            for kg_idx in og.kernel_group_indices
+        }
+
+        gpu_kv_format = self._gpu_kv_format
+        group_reports: list[dict] = []
+        for kernel_group_idx, group in enumerate(kernel_groups):
+            group_reports.append(
+                {
+                    "kernel_group_idx": kernel_group_idx,
+                    "engine_group_idx": group.engine_group_idx,
+                    "object_group_idx": kernel_group_to_object_group.get(
+                        kernel_group_idx, 0
+                    ),
+                    "num_layers": group.num_layers,
+                    "layer_indices": list(group.layer_indices),
+                    "physical_block_size": group.shape_desc.bs,
+                    "compress_ratio": group.compress_ratio,
+                    "dtype": str(group.dtype),
+                    "gpu_kv_concrete_shape": (
+                        get_concrete_gpu_kv_shape_from_shape_desc(
+                            group.shape_desc, gpu_kv_format
+                        )
+                    ),
+                    "is_mla": is_mla(gpu_kv_format),
+                    "gpu_kv_format": gpu_kv_format.name,
+                    "gpu_kv_shape": get_gpu_kv_shape_description(gpu_kv_format),
+                    "attention_backend": get_attention_backend(gpu_kv_format),
+                }
+            )
+
+        return {
+            "num_layers": self.num_layers_,
+            "inference_engine_logical_block_size": (
+                manager.inference_engine_logical_block_size
+            ),
+            "num_blocks": self.num_blocks_,
+            "cache_size_per_token": self.cache_size_per_token(),
+            "kernel_groups": group_reports,
+        }
+
+    def cache_size_per_token(self) -> int:
+        """Returns cache size per *logical* token in bytes,
+        summed across all groups.
+
+        Mirrors :meth:`GPUCacheContext.cache_size_per_token`.
+        """
+        total = 0
+        for group_idx, group in enumerate(
+            self.kv_layer_groups_manager_.kv_layer_groups
+        ):
+            numels = self.get_kv_buffer_shape(group.compress_ratio, group_idx).numel()
+            slot_bytes = numels * group.dtype.itemsize
+            total += slot_bytes // group.compress_ratio
+        return total
diff --git a/lmcache/v1/platform/cpu/shm.py b/lmcache/v1/platform/cpu/shm.py
index 7bbb702d5b..f438ae63da 100644
--- a/lmcache/v1/platform/cpu/shm.py
+++ b/lmcache/v1/platform/cpu/shm.py
@@ -123,10 +123,16 @@ def to_tensor(self) -> torch.Tensor:
         flat = torch.frombuffer(buf, dtype=torch.uint8)
         typed = flat.view(self.dtype)
         out = torch.as_strided(typed, self.shape, self.stride, self.storage_offset)
-        # Keep ``flat`` alive for the lifetime of ``out`` so its mmap
-        # is not released while still in use, then munmap on cleanup.
-        out._lmcache_shm_buf = flat  # type: ignore[attr-defined]
-        weakref.finalize(out, shm_munmap, addr, self.nbytes)
+        # Pin the mmap to the *storage*, not the outer tensor: views
+        # (reshape / slicing) create new tensor objects that share the
+        # storage but do not inherit Python attributes, so a finalizer
+        # attached to ``out`` would munmap as soon as ``out`` is GC'd
+        # even when a view is still reading the SHM segment.
+        # ``UntypedStorage`` is shared across views, so finalizing on it
+        # only fires once every view is also dropped.
+        storage = out.untyped_storage()
+        _CPU_SHM_KEEP_ALIVE[id(storage)] = flat
+        weakref.finalize(storage, _release_shm_segment, id(storage), addr, self.nbytes)
         return out
 
 
@@ -153,6 +159,25 @@ def to_tensor(self) -> torch.Tensor:
 _CPU_SHM_COUNTER = itertools.count()
 
 
+# Process-level registry that pins the base ``flat`` buffer of every live
+# ``to_tensor()`` mmap until its storage is finalized. Keyed by ``id(storage)``,
+# which is stable across views because PyTorch caches the storage Python
+# wrapper (so reshape / slicing returns the same ``UntypedStorage`` object).
+_CPU_SHM_KEEP_ALIVE: dict[int, torch.Tensor] = {}
+
+
+def _release_shm_segment(storage_id: int, addr: int, nbytes: int) -> None:
+    """Drop the pinned base buffer and ``munmap`` the mapping.
+
+    Invoked by ``weakref.finalize`` on the tensor's ``UntypedStorage`` once
+    every view of the mapping is gone, so views (e.g. ``reshape`` returning
+    a new tensor without ``_lmcache_shm_buf``) cannot trigger a premature
+    unmap that would turn into a use-after-free in the next read.
+    """
+    _CPU_SHM_KEEP_ALIVE.pop(storage_id, None)
+    shm_munmap(addr, nbytes)
+
+
 def _cleanup_shm_segment(tid: int, shm_name: str, addr: int, nbytes: int) -> None:
     """Release the mmap, unlink, and forget the cached SHM name."""
     with _CPU_SHM_LOCK:
diff --git a/lmcache/v1/platform/cpu/stub_cpu_device.py b/lmcache/v1/platform/cpu/stub_cpu_device.py
index af54bf3a31..8c5e6eaa4e 100644
--- a/lmcache/v1/platform/cpu/stub_cpu_device.py
+++ b/lmcache/v1/platform/cpu/stub_cpu_device.py
@@ -6,6 +6,11 @@
 from contextlib import nullcontext
 from typing import Any
 
+# First Party
+from lmcache.logging import init_logger
+
+logger = init_logger(__name__)
+
 
 class StubDeviceProperties:
     """Stub for torch_dev.get_device_properties() return value."""
@@ -116,6 +121,25 @@ def __init__(self, device: Any = "cpu", priority: int = 0, **kwargs: Any) -> Non
         self.device = device
         self.priority = priority
         self.cuda_stream = 0
+        # Mirrors the ``ptr`` attribute exposed by ``cupy.cuda.Stream``
+        # so callers (e.g. ``mp_observability.event_bus``) that pass a
+        # raw stream pointer to native recorders accept this stub
+        # without an isinstance check.
+        self.ptr = 0
+
+    def launch_host_func(self, callback: Any, arg: Any = None) -> None:
+        """Run ``callback(arg)`` synchronously.
+
+        ``cupy.cuda.Stream.launch_host_func`` schedules the callback
+        on the GPU stream's host-side completion queue; with no real
+        stream there's nothing to wait for, so we just invoke it
+        immediately. Exceptions are swallowed to mirror the cupy
+        contract (callbacks are best-effort and must not propagate).
+        """
+        try:
+            callback(arg)
+        except Exception as e:  # noqa: BLE001
+            logger.warning("launch_host_func callback raised: %s", e)
 
     def synchronize(self) -> None:
         """Block the host until all kernels on this stream complete.
diff --git a/tests/v1/gpu_connector/test_blocks_first_fused_kv_format.py b/tests/v1/gpu_connector/test_blocks_first_fused_kv_format.py
index 72ce1ecd3e..bf93d66a04 100644
--- a/tests/v1/gpu_connector/test_blocks_first_fused_kv_format.py
+++ b/tests/v1/gpu_connector/test_blocks_first_fused_kv_format.py
@@ -16,6 +16,12 @@
 import torch
 
 # First Party
+from lmcache.python_ops_fallback import (
+    multi_layer_block_kv_transfer as fallback_multi_layer_block_kv_transfer,
+)
+from lmcache.python_ops_fallback import (
+    set_shape_desc_dtype,
+)
 from lmcache.utils import EngineType
 from lmcache.v1.gpu_connector import utils as U
 from lmcache.v1.multiprocess.transfer_context.base import (
@@ -98,3 +104,73 @@ def test_mp_gather_scatter_roundtrip():
     untouched = torch.tensor([b for b in range(NB) if b not in block_ids])
     for k in dst:
         assert torch.equal(dst[k][untouched], ref[k][untouched])
+
+
+def test_multi_layer_block_kv_transfer_roundtrip():
+    """Server-side copy (handle mode) D2H + H2D round-trip.
+
+    Regression for the CI ``cpu_e2e_validation (server-side copy)`` failure:
+    ``GPUTransferModule.store`` calls ``multi_layer_block_kv_transfer`` for
+    this format, so the per-layer HND fallback must recognize it and split
+    K/V at dim 3.
+    """
+    # Use canonical 5D layers so the fallback exercises the HND split path.
+    fmt, norm = U.normalize_kv_and_discover_format(
+        _raw_blocks_first_caches(), EngineType.VLLM, HINTS
+    )
+    chunk_tokens = NB * BS
+    obj = torch.zeros((2, NL, chunk_tokens, NH * HS), dtype=norm[0].dtype)
+
+    sd = lmc_ops.PageBufferShapeDesc()
+    sd.kv_size = 2
+    sd.nl = NL
+    sd.nb = NB
+    sd.bs = BS
+    sd.nh = NH
+    sd.hs = HS
+    sd.element_size = norm[0].element_size()
+    sd.block_stride_elems = NH * BS * 2 * HS
+    set_shape_desc_dtype(sd, norm[0].dtype)
+
+    block_ids = torch.tensor(list(range(NB)), dtype=torch.long)
+
+    # Match the C++ binding's strict signature: 1D int64 tensor of paged
+    # buffer ``data_ptr()`` values, and a list of int ``data_ptr()`` for
+    # the lmcache objects (the fallback also accepts tensors directly,
+    # but the compiled extension does not).
+    norm_ptrs = torch.tensor([t.data_ptr() for t in norm], dtype=torch.long)
+    obj_ptrs = [obj.data_ptr()]
+
+    # Drive the python fallback directly: this regression specifically
+    # targets the CPU handle-mode path. ``lmc_ops.multi_layer_block_kv_transfer``
+    # is replaced by the CUDA C++ extension when CUDA is available, and that
+    # extension rejects ``torch.device("cpu")`` with a CUDAGuard error.
+    fallback_multi_layer_block_kv_transfer(
+        norm_ptrs,
+        obj_ptrs,
+        block_ids,
+        torch.device("cpu"),
+        lmc_ops.TransferDirection.D2H,
+        sd,
+        chunk_tokens,
+        fmt,
+        0,
+    )
+
+    # H2D into a fresh per-layer buffer set; round-trip must be bit-exact.
+    out = [torch.zeros_like(layer) for layer in norm]
+    out_ptrs = torch.tensor([t.data_ptr() for t in out], dtype=torch.long)
+    fallback_multi_layer_block_kv_transfer(
+        out_ptrs,
+        obj_ptrs,
+        block_ids,
+        torch.device("cpu"),
+        lmc_ops.TransferDirection.H2D,
+        sd,
+        chunk_tokens,
+        fmt,
+        0,
+    )
+
+    for original, recovered in zip(norm, out, strict=True):
+        assert torch.equal(original, recovered)
diff --git a/tests/v1/multiprocess/test_non_cuda_data_transfer.py b/tests/v1/multiprocess/test_non_cuda_data_transfer.py
index c60290b917..cfea3d0abf 100644
--- a/tests/v1/multiprocess/test_non_cuda_data_transfer.py
+++ b/tests/v1/multiprocess/test_non_cuda_data_transfer.py
@@ -4,7 +4,6 @@
 from contextlib import contextmanager
 from typing import TYPE_CHECKING, Any, Callable, Protocol
 from unittest.mock import MagicMock, patch
-import mmap
 import os
 import pickle
 import sys
@@ -15,6 +14,12 @@
 
 # First Party
 from lmcache.v1.distributed.api import MemoryLayoutDesc
+from lmcache.v1.multiprocess.posix_shm import (
+    shm_create_readwrite,
+    shm_munmap,
+    shm_open_pool_as_mmap,
+    shm_unlink,
+)
 from lmcache.v1.multiprocess.protocol import RequestType
 from lmcache.v1.multiprocess.protocols.engine import (
     PrepareRetrieveResponse,
@@ -277,6 +282,43 @@ def test_resolve_extra_config_overrides_mp_transfer_mode() -> None:
     assert cfg[ExtraConfigDefault.mp_transfer_mode.name] == "data"
 
 
+def test_extra_config_default_lets_env_var_select_mp_transfer_mode(
+    monkeypatch: Any,
+) -> None:
+    """When extra_config omits mp_transfer_mode, env var must still win.
+
+    The adapter detects the absence of ``lmcache.mp.mp_transfer_mode`` and
+    passes ``mode=None`` to ``create_transfer_context``, which then reads
+    the ``LMCACHE_MP_TRANSFER_MODE`` env var. Regression test for
+    buildkite k3-multiprocess CI ``cpu_e2e_validation (server-side copy)``.
+    """
+    # First Party
+    from lmcache.integration.vllm.vllm_multi_process_adapter import (
+        _EXTRA_CONFIG_KEY_PREFIX,
+        ExtraConfigDefault,
+    )
+    from lmcache.v1.multiprocess.transfer_context import (
+        HandleTransferContext,
+        create_transfer_context,
+    )
+    from lmcache.v1.multiprocess.transfer_context.worker_transfer import (
+        ENV_MP_TRANSFER_MODE,
+    )
+
+    mp_mode_key = _EXTRA_CONFIG_KEY_PREFIX + ExtraConfigDefault.mp_transfer_mode.name
+    # Simulate adapter init: extra_config omits the mp_transfer_mode key.
+    extra_config: dict[str, Any] = {"lmcache.mp.mq_timeout": "1"}
+    resolved_mode = extra_config[mp_mode_key] if mp_mode_key in extra_config else None
+    assert resolved_mode is None
+
+    # With env=handle and mode=None, CPU KV must pick HandleTransferContext.
+    monkeypatch.setenv(ENV_MP_TRANSFER_MODE, "handle")
+    context = create_transfer_context(
+        {"layer_0": torch.randn(2, 2)}, mode=resolved_mode
+    )
+    assert isinstance(context, HandleTransferContext)
+
+
 def test_create_transfer_context_force_data_mode() -> None:
     """``mode='data'`` must always pick DataTransferContext, even for CUDA."""
     # First Party
@@ -292,6 +334,21 @@ def test_create_transfer_context_force_data_mode() -> None:
     assert isinstance(context, DataTransferContext)
 
 
+def test_create_transfer_context_force_handle_mode_on_cpu() -> None:
+    """``mode='handle'`` on CPU works because the CPU SHM wrapper is registered."""
+    # First Party
+    from lmcache.v1.multiprocess.transfer_context import (
+        HandleTransferContext,
+        create_transfer_context,
+    )
+
+    # Importing the CPU sub-package self-registers its KV-wrapper factory.
+    import lmcache.v1.platform.cpu  # noqa: F401
+
+    context = create_transfer_context({"layer_0": torch.randn(2, 2)}, mode="handle")
+    assert isinstance(context, HandleTransferContext)
+
+
 def test_create_transfer_context_invalid_mode_raises() -> None:
     """Unknown mode strings must raise a clear ValueError."""
     # First Party
@@ -319,6 +376,24 @@ def test_create_transfer_context_handle_mode_unsupported_device_raises(
         platform_registry.restore(snapshot)
 
 
+def test_create_transfer_context_env_var_overrides_default(
+    monkeypatch: Any,
+) -> None:
+    """``LMCACHE_MP_TRANSFER_MODE=data`` must force the data path."""
+    # First Party
+    from lmcache.v1.multiprocess.transfer_context import (
+        DataTransferContext,
+        create_transfer_context,
+    )
+    from lmcache.v1.multiprocess.transfer_context.worker_transfer import (
+        ENV_MP_TRANSFER_MODE,
+    )
+
+    monkeypatch.setenv(ENV_MP_TRANSFER_MODE, "data")
+    context = create_transfer_context({"layer_0": torch.randn(2, 2)})
+    assert isinstance(context, DataTransferContext)
+
+
 @pytest.mark.parametrize(
     ("builder_fn", "expected_block_size", "expected_hidden_dim", "layout_hints"),
     [
@@ -965,22 +1040,25 @@ def result(self, timeout=None):  # noqa: ARG002
         return self._value
 
 
-def _create_shm_file(shm_name: str, size: int) -> str:
-    path = os.path.join("/dev/shm", shm_name.lstrip("/"))
-    fd = os.open(path, os.O_CREAT | os.O_RDWR, 0o600)
-    os.ftruncate(fd, size)
-    os.close(fd)
-    return path
+def _create_shm_segment(shm_name: str, size: int) -> int:
+    """Create a POSIX SHM segment via the project facade.
+
+    Returns the owner mmap address so the test can release the segment
+    with ``shm_munmap`` + ``shm_unlink`` regardless of platform
+    (Linux/macOS), instead of hard-coding ``/dev/shm`` paths.
+    """
+    return shm_create_readwrite(shm_name, size)
 
 
 def test_non_gpu_context_shm_tensor_view_from_buffer() -> None:
     shm_name = f"lmcache_test_view_{os.getpid()}"
-    shm_path = _create_shm_file(shm_name, 4096)
+    addr = _create_shm_segment(shm_name, 4096)
     try:
-        with open(shm_path, "r+b") as f:
-            mm = mmap.mmap(f.fileno(), 4096, access=mmap.ACCESS_WRITE)
+        mm = shm_open_pool_as_mmap(shm_name, 4096)
+        try:
             src = torch.arange(8, dtype=torch.float32).reshape(2, 4)
             mm[: src.numel() * src.element_size()] = src.numpy().tobytes()
+        finally:
             mm.close()
 
         context = NonGpuContextShm(
@@ -1008,13 +1086,13 @@ def test_non_gpu_context_shm_tensor_view_from_buffer() -> None:
         finally:
             context.close()
     finally:
-        if os.path.exists(shm_path):
-            os.unlink(shm_path)
+        shm_munmap(addr, 4096)
+        shm_unlink(shm_name)
 
 
 def test_non_gpu_context_shm_store_retrieve_flow_with_mocked_mq() -> None:
     shm_name = f"lmcache_test_flow_{os.getpid()}"
-    shm_path = _create_shm_file(shm_name, 4096)
+    addr = _create_shm_segment(shm_name, 4096)
     slots = [
         {
             "offset": 0,
@@ -1080,8 +1158,8 @@ def _submit_request(req_type, payload, response_cls):  # noqa: ARG001
         assert context.commit_retrieve(key, 1)
     finally:
         context.close()
-        if os.path.exists(shm_path):
-            os.unlink(shm_path)
+        shm_munmap(addr, 4096)
+        shm_unlink(shm_name)
 
 
 def test_non_gpu_context_shm_init_raises_when_segment_missing() -> None:
@@ -1141,7 +1219,7 @@ def test_create_non_gpu_context_use_pickle_ignores_valid_shm_info() -> None:
 
 def test_non_gpu_context_shm_close_is_idempotent() -> None:
     shm_name = f"lmcache_test_close_{os.getpid()}"
-    shm_path = _create_shm_file(shm_name, 4096)
+    addr = _create_shm_segment(shm_name, 4096)
     try:
         context = NonGpuContextShm(
             metadata=NonGpuContextMetadata(
@@ -1160,5 +1238,5 @@ def test_non_gpu_context_shm_close_is_idempotent() -> None:
         context.close()
         context.close()
     finally:
-        if os.path.exists(shm_path):
-            os.unlink(shm_path)
+        shm_munmap(addr, 4096)
+        shm_unlink(shm_name)
diff --git a/tests/v1/platform/test_cpu_shm.py b/tests/v1/platform/test_cpu_shm.py
index 65a52cedae..02a11a4e49 100644
--- a/tests/v1/platform/test_cpu_shm.py
+++ b/tests/v1/platform/test_cpu_shm.py
@@ -127,7 +127,7 @@ def test_shm_create_cleans_up_on_existing_name():
 
 
 def test_to_tensor_view_carries_munmap_finalizer():
-    """``to_tensor`` returns a tensor that releases its mmap on GC."""
+    """``to_tensor`` returns a tensor whose storage releases its mmap on GC."""
     # Standard
     import gc
     import weakref
@@ -136,8 +136,6 @@ def test_to_tensor_view_carries_munmap_finalizer():
     w = migrate_to_shm_and_wrap(src)
     try:
         view = w.to_tensor()
-        # The view must keep ``flat`` alive so its mmap stays valid.
-        assert hasattr(view, "_lmcache_shm_buf")
         ref = weakref.ref(view)
         del view
         gc.collect()
@@ -148,6 +146,47 @@ def test_to_tensor_view_carries_munmap_finalizer():
         shm_unlink(w.shm_name)
 
 
+def test_to_tensor_view_survives_reshape_after_original_is_gced():
+    """A reshape view must keep working after the original tensor is GC-ed.
+
+    Regression for the CI ``cpu_e2e_validation (server-side copy)`` segfault
+    where ``normalize_kv_and_discover_format`` reshapes the unwrapped 4D
+    tensor into the canonical 5D ``[NB, NH, BS, 2, HS]`` and the original
+    4D tensor goes out of scope. The reshape view shares the same storage
+    but used to be left without a lifetime hook, so the next read after GC
+    landed on an already-``munmap``-ed page and crashed the LMCache server.
+    """
+    # Standard
+    import gc
+
+    NB, NH, BS, HS = 8, 4, 16, 8
+    src = torch.zeros((NB, NH, BS, 2 * HS), dtype=torch.bfloat16)
+    w = migrate_to_shm_and_wrap(src)
+    try:
+        unwrapped = [w.to_tensor()]
+        normalized = [
+            layer.reshape(*layer.shape[:3], 2, layer.shape[3] // 2)
+            for layer in unwrapped
+        ]
+        # Drop every reference to the 4D unwrapped tensor; only the 5D
+        # reshape view keeps the storage alive now.
+        del unwrapped
+        gc.collect()
+
+        # These ops would segfault before the fix because the SHM mapping
+        # had been munmap-ed when the 4D tensor's finalizer ran.
+        idx = torch.tensor([0, 3, 5], dtype=torch.long)
+        gathered = normalized[0].index_select(0, idx)
+        assert tuple(gathered.shape) == (3, NH, BS, 2, HS)
+        # The SHM segment is freshly mmap'd zeros; just make sure the
+        # bytes are addressable end-to-end so the kernel does not fault.
+        assert gathered.float().abs().sum().item() == 0.0
+    finally:
+        del src
+        gc.collect()
+        shm_unlink(w.shm_name)
+
+
 def test_to_tensor_replays_stride_and_storage_offset():
     """``to_tensor`` rebuilds the view via stride+offset (not reshape)."""
     src = torch.arange(24, dtype=torch.float32).reshape(2, 3, 4).contiguous()

From 2cb0bc14f37cc3efc77146c746d89cfc8339c312 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 10 Jun 2026 09:08:25 +0800
Subject: [PATCH 17/57] [Feat] Print LMCache startup banner in CLI and vLLM
 connectors (#3611)

Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
---
 lmcache/banner.py                             | 119 ++++++++++++++++++
 lmcache/cli/main.py                           |   2 +
 .../integration/vllm/lmcache_mp_connector.py  |   5 +
 lmcache/integration/vllm/vllm_v1_adapter.py   |   7 ++
 tests/test_banner.py                          |  81 ++++++++++++
 tests/v1/distributed/serde/test_serde_e2e.py  |  38 ++++--
 .../v1/distributed/serde/test_serde_fs_e2e.py |  22 +++-
 7 files changed, 262 insertions(+), 12 deletions(-)
 create mode 100644 lmcache/banner.py
 create mode 100644 tests/test_banner.py

diff --git a/lmcache/banner.py b/lmcache/banner.py
new file mode 100644
index 0000000000..6d58830baa
--- /dev/null
+++ b/lmcache/banner.py
@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Startup banner shown when LMCache starts serving.
+
+The banner is printed at most once per process, to ``stderr``, by the
+``lmcache`` CLI and by the vLLM connector integrations (scheduler role
+only, so tensor-parallel deployments print a single banner). Setting the
+``LMCACHE_DISABLE_BANNER=1`` environment variable suppresses it.
+"""
+
+# Standard
+from typing import TextIO
+import os
+
+try:
+    # First Party
+    from lmcache import _version
+
+    _LMCACHE_VERSION = getattr(_version, "__version__", "unknown")
+    _LMCACHE_COMMIT = getattr(_version, "__commit_id__", "")
+except ImportError:  # pragma: no cover - version file is generated at build time
+    _LMCACHE_VERSION = "unknown"
+    _LMCACHE_COMMIT = ""
+
+DISABLE_BANNER_ENV = "LMCACHE_DISABLE_BANNER"
+
+LMCACHE_WEBSITE = "https://lmcache.ai/"
+LMCACHE_RECIPES = "https://docs.lmcache.ai/recipes"
+LMCACHE_LINKEDIN = "https://www.linkedin.com/company/lmcache-lab"
+
+# Solarized palette, 24-bit ANSI escapes (TTY only): "LM" in bold italic
+# orange (#cb4b16), "Cache" in cyan (#2aa198).
+_LM_STYLE = "\x1b[1;3;38;2;203;75;22m"
+_CACHE_STYLE = "\x1b[38;2;42;161;152m"
+_DIM_STYLE = "\x1b[2m"
+_RESET = "\x1b[0m"
+
+# Figlet "standard" font, split into the two color groups.
+_LM_ART = (
+    " _     __  __ ",
+    "| |   |  \\/  |",
+    "| |   | |\\/| |",
+    "| |___| |  | |",
+    "|_____|_|  |_|",
+)
+_CACHE_ART = (
+    "  ____           _          ",
+    " / ___|__ _  ___| |__   ___ ",
+    "| |   / _` |/ __| '_ \\ / _ \\",
+    "| |__| (_| | (__| | | |  __/",
+    " \\____\\__,_|\\___|_| |_|\\___|",
+)
+_RIGHT_TEXT_GAP = "     "
+
+_banner_printed = False
+
+
+def _banner_disabled() -> bool:
+    """Return whether ``LMCACHE_DISABLE_BANNER`` is set to a truthy value."""
+    return os.getenv(DISABLE_BANNER_ENV, "").strip().lower() in ("1", "true", "yes")
+
+
+def _render_banner(colored: bool) -> str:
+    """Render the banner text.
+
+    Args:
+        colored: Whether to wrap the logo in ANSI color escapes.
+
+    Returns:
+        The multi-line banner: the LMCache logo with the version (and
+        commit id when available), website, recipes, and LinkedIn links
+        on its right, and a final line describing the
+        ``LMCACHE_DISABLE_BANNER`` opt-out. A blank line surrounds the
+        banner on each side to set it apart from adjacent log output.
+    """
+    lm_style = _LM_STYLE if colored else ""
+    cache_style = _CACHE_STYLE if colored else ""
+    dim_style = _DIM_STYLE if colored else ""
+    reset = _RESET if colored else ""
+
+    version = f"LMCache v{_LMCACHE_VERSION}"
+    if _LMCACHE_COMMIT:
+        version += f" ({_LMCACHE_COMMIT[:9]})"
+    right_text = {
+        1: version,
+        2: f"Website:  {LMCACHE_WEBSITE}",
+        3: f"Recipes:  {LMCACHE_RECIPES}",
+        4: f"LinkedIn: {LMCACHE_LINKEDIN}",
+    }
+    lines = [""]
+    for row, (lm_part, cache_part) in enumerate(zip(_LM_ART, _CACHE_ART, strict=True)):
+        line = f"{lm_style}{lm_part}{reset} {cache_style}{cache_part}{reset}"
+        if row in right_text:
+            line += _RIGHT_TEXT_GAP + right_text[row]
+        lines.append(line)
+    lines.append(f"{dim_style}Set {DISABLE_BANNER_ENV}=1 to hide this banner.{reset}")
+    lines.append("")
+    return "\n".join(lines)
+
+
+def print_banner_once(stream: TextIO) -> None:
+    """Print the LMCache startup banner to ``stream`` at most once.
+
+    The banner shows the LMCache logo, version, and website, followed by
+    a hint describing the ``LMCACHE_DISABLE_BANNER`` opt-out. ANSI colors
+    are used only when ``stream`` is a TTY. Subsequent calls in the same
+    process are no-ops, as are all calls when ``LMCACHE_DISABLE_BANNER``
+    is set to ``1``/``true``/``yes``.
+
+    Args:
+        stream: Destination text stream. Callers should pass
+            ``sys.stderr`` so the banner never interferes with
+            machine-readable stdout output.
+    """
+    global _banner_printed
+    if _banner_printed or _banner_disabled():
+        return
+    _banner_printed = True
+    stream.write(_render_banner(stream.isatty()) + "\n")
+    stream.flush()
diff --git a/lmcache/cli/main.py b/lmcache/cli/main.py
index a1649f9e11..754b8c6961 100644
--- a/lmcache/cli/main.py
+++ b/lmcache/cli/main.py
@@ -10,6 +10,7 @@
 import sys
 
 # First Party
+from lmcache.banner import print_banner_once
 from lmcache.cli.commands import ALL_COMMANDS
 from lmcache.logging import init_logger
 
@@ -18,6 +19,7 @@
 
 def main() -> None:
     """CLI entry point registered as ``lmcache`` in *pyproject.toml*."""
+    print_banner_once(sys.stderr)
     parser = argparse.ArgumentParser(
         prog="lmcache",
         description="LMCache — KV cache management for LLM serving",
diff --git a/lmcache/integration/vllm/lmcache_mp_connector.py b/lmcache/integration/vllm/lmcache_mp_connector.py
index cb231da758..1357cbbed6 100644
--- a/lmcache/integration/vllm/lmcache_mp_connector.py
+++ b/lmcache/integration/vllm/lmcache_mp_connector.py
@@ -5,6 +5,7 @@
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Literal
 import enum
+import sys
 
 # Third Party
 from vllm.config import VllmConfig
@@ -36,6 +37,7 @@ class SupportsHMA:  # type: ignore[no-redef]
 
 # First Party
 from lmcache import torch_dev
+from lmcache.banner import print_banner_once
 from lmcache.integration.vllm.kv_cache_groups import (
     create_engine_group_infos_from_vllm,
 )
@@ -489,6 +491,9 @@ def __init__(
         parallel_strategy = build_parallel_strategy_from_vllm_config(vllm_config)
 
         if self.role == KVConnectorRole.SCHEDULER:
+            # Banner from the scheduler role only, so tensor-parallel
+            # deployments print it once rather than once per worker.
+            print_banner_once(sys.stderr)
             self.scheduler_adapter = LMCacheMPSchedulerAdapter(
                 server_url=server_url,
                 context=zmq_context,
diff --git a/lmcache/integration/vllm/vllm_v1_adapter.py b/lmcache/integration/vllm/vllm_v1_adapter.py
index 60928af4ee..82899dc311 100644
--- a/lmcache/integration/vllm/vllm_v1_adapter.py
+++ b/lmcache/integration/vllm/vllm_v1_adapter.py
@@ -5,6 +5,7 @@
 from typing import TYPE_CHECKING, Any, Generator, Optional, Union
 import math
 import os
+import sys
 
 # Third Party
 from vllm.config import (
@@ -28,6 +29,7 @@
 # Use LMCache's own math utilities instead of vllm's
 # (avoids dependency on vllm internal changes like https://github.com/vllm-project/vllm/pull/27188)
 from lmcache import utils
+from lmcache.banner import print_banner_once
 from lmcache.integration.vllm.utils import (
     ENGINE_NAME,
     apply_mm_hashes_to_token_ids,
@@ -455,6 +457,11 @@ def __init__(
         role: KVConnectorRole,
         parent: KVConnectorBase_V1,
     ):
+        # Banner from the scheduler role only, so tensor-parallel
+        # deployments print it once rather than once per worker.
+        if role == KVConnectorRole.SCHEDULER:
+            print_banner_once(sys.stderr)
+
         self._parent = parent
         self._vllm_config = vllm_config
         self._role = role
diff --git a/tests/test_banner.py b/tests/test_banner.py
new file mode 100644
index 0000000000..b7598a7d78
--- /dev/null
+++ b/tests/test_banner.py
@@ -0,0 +1,81 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for the LMCache startup banner."""
+
+# Standard
+import io
+
+# Third Party
+import pytest
+
+# First Party
+from lmcache import banner
+from lmcache.banner import (
+    DISABLE_BANNER_ENV,
+    LMCACHE_LINKEDIN,
+    LMCACHE_RECIPES,
+    LMCACHE_WEBSITE,
+    print_banner_once,
+)
+
+
+class _TTYStream(io.StringIO):
+    """In-memory stream that reports itself as a terminal."""
+
+    def isatty(self) -> bool:
+        return True
+
+
+@pytest.fixture(autouse=True)
+def _fresh_banner_state(monkeypatch):
+    """Reset the once-per-process guard and opt-out env var between tests."""
+    monkeypatch.setattr(banner, "_banner_printed", False)
+    monkeypatch.delenv(DISABLE_BANNER_ENV, raising=False)
+
+
+def test_banner_contains_version_links_and_opt_out_hint():
+    stream = io.StringIO()
+    print_banner_once(stream)
+    output = stream.getvalue()
+    assert "LMCache v" in output
+    assert LMCACHE_WEBSITE in output
+    assert LMCACHE_RECIPES in output
+    assert LMCACHE_LINKEDIN in output
+    assert DISABLE_BANNER_ENV in output
+
+
+def test_banner_is_plain_on_non_tty_stream():
+    stream = io.StringIO()
+    print_banner_once(stream)
+    assert "\x1b[" not in stream.getvalue()
+
+
+def test_banner_is_colored_on_tty_stream():
+    stream = _TTYStream()
+    print_banner_once(stream)
+    output = stream.getvalue()
+    assert "\x1b[1;3;38;2;203;75;22m" in output  # bold italic solarized orange
+    assert "\x1b[38;2;42;161;152m" in output  # solarized cyan
+
+
+def test_banner_prints_at_most_once_per_process():
+    first = io.StringIO()
+    second = io.StringIO()
+    print_banner_once(first)
+    print_banner_once(second)
+    assert first.getvalue() != ""
+    assert second.getvalue() == ""
+
+
+@pytest.mark.parametrize("value", ["1", "true", "YES"])
+def test_banner_disabled_by_env_var(monkeypatch, value):
+    monkeypatch.setenv(DISABLE_BANNER_ENV, value)
+    stream = io.StringIO()
+    print_banner_once(stream)
+    assert stream.getvalue() == ""
+
+
+def test_banner_not_disabled_by_falsy_env_var(monkeypatch):
+    monkeypatch.setenv(DISABLE_BANNER_ENV, "0")
+    stream = io.StringIO()
+    print_banner_once(stream)
+    assert stream.getvalue() != ""
diff --git a/tests/v1/distributed/serde/test_serde_e2e.py b/tests/v1/distributed/serde/test_serde_e2e.py
index 2aa42f7bbb..2f2c0a6b82 100644
--- a/tests/v1/distributed/serde/test_serde_e2e.py
+++ b/tests/v1/distributed/serde/test_serde_e2e.py
@@ -131,6 +131,13 @@ def make_storage_manager_config(
     )
 
 
+def get_l2_stored_object_count(sm: StorageManager) -> int:
+    """Return the total stored object count across all L2 adapters."""
+    return sum(
+        adapter["stored_object_count"] for adapter in sm.report_status()["l2_adapters"]
+    )
+
+
 def write_and_wait_for_l2(
     sm: StorageManager,
     keys: list[ObjectKey],
@@ -141,6 +148,8 @@ def write_and_wait_for_l2(
 
     Fills each chunk with deterministic data so round-trip can be verified.
     """
+    stored_before = get_l2_stored_object_count(sm)
+
     ret = sm.reserve_write(keys, layout, mode="new")
     assert len(ret) == len(keys), f"reserve_write: {len(ret)}/{len(keys)} succeeded"
 
@@ -153,15 +162,26 @@ def write_and_wait_for_l2(
 
     sm.finish_write(list(ret.keys()))
 
-    # Wait for StoreController to flush to L2.
-    # We poll the store_controller status for in_flight==0 and pending==0.
-    ok = wait_for_condition(
-        lambda: (
-            sm.report_status()["store_controller"]["in_flight_task_count"] == 0
-            and sm.report_status()["store_controller"]["pending_keys_count"] == 0
-        ),
-        timeout=timeout,
-    )
+    # Wait for StoreController to flush to L2. Polling the controller's queue
+    # counters alone is racy: the background loop pops the pending keys
+    # (pending_keys_count -> 0) before it submits the store tasks
+    # (in_flight_task_count is still 0 in between), so a poll landing in that
+    # window declares the store complete before anything reached L2. Anchor
+    # the wait on the adapters' stored object counts, then use the queue
+    # counters only to confirm the controller has settled.
+    def flushed_to_l2() -> bool:
+        status = sm.report_status()
+        stored_total = sum(
+            adapter["stored_object_count"] for adapter in status["l2_adapters"]
+        )
+        store_controller = status["store_controller"]
+        return (
+            stored_total >= stored_before + len(keys)
+            and store_controller["in_flight_task_count"] == 0
+            and store_controller["pending_keys_count"] == 0
+        )
+
+    ok = wait_for_condition(flushed_to_l2, timeout=timeout)
     assert ok, "Store to L2 did not complete within timeout"
 
 
diff --git a/tests/v1/distributed/serde/test_serde_fs_e2e.py b/tests/v1/distributed/serde/test_serde_fs_e2e.py
index f29aceaf07..a2ed305161 100644
--- a/tests/v1/distributed/serde/test_serde_fs_e2e.py
+++ b/tests/v1/distributed/serde/test_serde_fs_e2e.py
@@ -150,14 +150,30 @@ def _run(self, disk_path: str) -> None:
         sm.finish_write(keys)
 
         # ---- Step 2: wait for L2 store to disk ----
+        # The FS adapter writes one file per key (staged as "*.tmp" in the
+        # same directory), so wait for all final files rather than the
+        # first: the controller's queue counters can both read zero in the
+        # window between popping pending keys and submitting the store
+        # tasks, so they only confirm settling after the files prove the
+        # store actually happened.
+        def count_stored_files() -> int:
+            return sum(
+                1
+                for e in os.scandir(disk_path)
+                if e.is_file() and not e.name.endswith(".tmp")
+            )
+
         ok = wait_for_condition(
-            lambda: any(e.is_file() for e in os.scandir(disk_path)),
+            lambda: count_stored_files() >= len(keys),
             timeout=10.0,
         )
-        assert ok, f"No files appeared under {disk_path}"
+        assert ok, f"Expected {len(keys)} files under {disk_path}"
 
         ok = wait_for_condition(
-            lambda: sm.report_status()["store_controller"]["in_flight_task_count"] == 0,
+            lambda: (
+                sm.report_status()["store_controller"]["in_flight_task_count"] == 0
+                and sm.report_status()["store_controller"]["pending_keys_count"] == 0
+            ),
             timeout=10.0,
         )
         assert ok, "Store controller did not finish in time"

From d4c16f89a38c2776ed53907198251aa9afdd5607 Mon Sep 17 00:00:00 2001
From: Zhengfei He <157287166+zhengfeihe@users.noreply.github.com>
Date: Wed, 10 Jun 2026 11:01:23 +0900
Subject: [PATCH 18/57] [Doc] Auto-select model in CPU-offloading example to
 fit GPU (#3433)

Signed-off-by: zhengfeihe <hezhengfei1999@gmail.com>
---
 .../quickstart/offload_kv_cache.rst           | 124 +++++++++++++-----
 1 file changed, 93 insertions(+), 31 deletions(-)

diff --git a/docs/source/getting_started/quickstart/offload_kv_cache.rst b/docs/source/getting_started/quickstart/offload_kv_cache.rst
index 1e3aeac48c..7a9b52fdf2 100644
--- a/docs/source/getting_started/quickstart/offload_kv_cache.rst
+++ b/docs/source/getting_started/quickstart/offload_kv_cache.rst
@@ -167,8 +167,17 @@ This section demonstrates the performance benefits of using CPU offloading with
 Prerequisites (Setup)
 ~~~~~~~~~~~~~~~~~~~~~~
 
-- At least 24GB GPU memory
-- Sufficient CPU memory (LMCache will use 15 GB by default in this example).
+- A CUDA GPU. The example picks a model that fits the GPU automatically:
+
+  - ``Qwen/Qwen3-8B`` (bf16) when the GPU has ~36 GiB or more (e.g. A100-80G, H100).
+  - ``Qwen/Qwen3-8B-FP8`` with ``kv_cache_dtype="fp8"`` when the GPU has ~24 GiB
+    and supports native FP8 (Ada Lovelace / Hopper, ``sm_89+``; e.g. L4, L40, RTX 4090).
+  - ``Qwen/Qwen3-1.7B`` as the fallback for smaller GPUs (~10 GiB and up),
+    including Ampere 24 GiB cards (RTX A5000, RTX 3090) where FP8 is unsupported.
+
+- Sufficient CPU memory. The example clamps the LMCache pinned host buffer to
+  fit your system RAM and ``RLIMIT_MEMLOCK`` (``ulimit -l``), so it also works
+  on smaller hosts without manual tuning.
 
 Example script
 ~~~~~~~~~~~~~~
@@ -194,7 +203,7 @@ Save the following script as ``cpu-offloading.py``:
     from vllm import LLM, SamplingParams
     from vllm.config import KVTransferConfig
 
-    def parse_arguments():
+    def parse_arguments() -> argparse.Namespace:
         """Parse command line arguments."""
         parser = argparse.ArgumentParser(description="CPU offloading example with LMCache")
         parser.add_argument("--num-prompts", type=int, default=10,
@@ -205,43 +214,89 @@ Save the following script as ``cpu-offloading.py``:
                           help="Enable LMCache for CPU offloading (default: True)")
         return parser.parse_args()
 
-    def setup_lmcache_environment(num_prompts, num_tokens):
+    def pick_cpu_size_gb(workload_gb: float) -> float:
+        """
+        Clamp the LMCache pinned host buffer to fit system RAM and RLIMIT_MEMLOCK.
+
+        cudaHostAlloc pins pages, so the buffer cannot exceed total RAM nor the
+        per-process memlock limit (`ulimit -l`). On hosts where either is small,
+        the original "1.5 GB per 10k tokens" formula fails with cudaErrorMemoryAllocation.
+
+        Args:
+            workload_gb: Desired buffer size for the workload, in GiB.
+        Returns:
+            float: A buffer size in GiB that fits both caps, never below 1.0.
+        """
+        import psutil
+
+        ram_gib = psutil.virtual_memory().total / (1024 ** 3)
+        try:
+            import resource
+            memlock_soft, _ = resource.getrlimit(resource.RLIMIT_MEMLOCK)
+            memlock_gib = (
+                float("inf")
+                if memlock_soft == resource.RLIM_INFINITY
+                else memlock_soft / (1024 ** 3)
+            )
+        except ImportError:
+            # `resource` is POSIX-only; on Windows treat memlock as unbounded.
+            memlock_gib = float("inf")
+        return max(min(workload_gb, ram_gib * 0.5, memlock_gib * 0.9), 1.0)
+
+    def setup_lmcache_environment(num_prompts: int, num_tokens: int) -> None:
         """
         Configure LMCache environment variables.
         Args:
             num_prompts: Number of prompts to process
             num_tokens: Number of tokens per prompt
         """
-        cpu_size = num_prompts * num_tokens * 1.5 / 10000  # 1.5GB per 10000 tokens
-        
+        workload_gb = num_prompts * num_tokens * 1.5 / 10000  # 1.5 GB per 10k tokens
+        cpu_size = pick_cpu_size_gb(workload_gb)
+
         env_vars = {
             "LMCACHE_CHUNK_SIZE": "256",         # Set tokens per chunk
             "LMCACHE_LOCAL_CPU": "True",         # Enable local CPU backend
-            "LMCACHE_MAX_LOCAL_CPU_SIZE": str(cpu_size)  # Dynamic CPU memory limit (GB)
+            "LMCACHE_MAX_LOCAL_CPU_SIZE": str(cpu_size)  # CPU memory limit (GB)
         }
         for key, value in env_vars.items():
             os.environ[key] = value
 
-    def calculate_gpu_utilization(target_memory_gb=24):
+    def pick_model_and_kwargs() -> tuple[str, dict]:
         """
-        Calculate GPU memory utilization to use exactly target_memory_gb of GPU memory.
-        Args:
-            target_memory_gb: Target GPU memory usage in gigabytes
+        Pick a Qwen model that fits the current GPU's memory and compute capability.
+
+        Tiers:
+            - >= 36 GiB                    -> Qwen/Qwen3-8B (bf16)
+            - >= 20 GiB and sm >= 89       -> Qwen/Qwen3-8B-FP8 (native FP8)
+            - >= 10 GiB                    -> Qwen/Qwen3-1.7B
+            - otherwise                    -> RuntimeError
+
         Returns:
-            float: GPU memory utilization ratio (0.0 to 1.0)
+            tuple[str, dict]: (model id, extra kwargs to pass to ``LLM``).
         Raises:
-            RuntimeError: If GPU memory is less than target_memory_gb
+            RuntimeError: If no CUDA GPU is visible or it is too small.
         """
         if not torch.cuda.is_available():
             raise RuntimeError("No GPU available")
-        
-        total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # Convert to GB
-        if total_memory < target_memory_gb:
-            raise RuntimeError(f"GPU memory ({total_memory:.1f}GB) is less than required memory ({target_memory_gb}GB)")
-        
-        return target_memory_gb / total_memory
 
-    def create_test_prompts(num_prompts=10, num_tokens=1000):
+        total_gib = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
+        major, minor = torch.cuda.get_device_capability(0)
+        sm = major * 10 + minor
+        has_fp8 = sm >= 89  # Ada Lovelace / Hopper
+
+        if total_gib >= 36:
+            return "Qwen/Qwen3-8B", {}
+        if total_gib >= 20 and has_fp8:
+            print(f"[fallback] GPU {total_gib:.1f} GiB sm_{sm}: using Qwen3-8B-FP8")
+            return "Qwen/Qwen3-8B-FP8", {"kv_cache_dtype": "fp8"}
+        if total_gib >= 10:
+            print(f"[fallback] GPU {total_gib:.1f} GiB sm_{sm}: using Qwen3-1.7B")
+            return "Qwen/Qwen3-1.7B", {}
+        raise RuntimeError(
+            f"GPU has {total_gib:.1f} GiB; need at least 10 GiB for Qwen3-1.7B"
+        )
+
+    def create_test_prompts(num_prompts: int = 10, num_tokens: int = 1000) -> list[str]:
         """
         Create test prompts with index prefix and dummy body.
         Args:
@@ -252,36 +307,43 @@ Save the following script as ``cpu-offloading.py``:
         """
         prompts = []
         dummy_text = "Hi " * num_tokens
-        
+
         for i in range(num_prompts):
             prompt = f"[Prompt {i}] {dummy_text} how are you?"
             prompts.append(prompt)
-        
+
         return prompts
 
-    def initialize_llm(model_name="Qwen/Qwen3-8B", max_len=16384, enable_lmcache=True):
+    def initialize_llm(max_len: int = 16384, enable_lmcache: bool = True) -> LLM:
         """
-        Initialize the LLM with appropriate configurations.
+        Initialize the LLM with a model auto-selected for the current GPU.
         Args:
-            model_name: Name of the model to load
             max_len: Maximum sequence length
+            enable_lmcache: Whether to wire up the LMCache KV connector
         Returns:
             LLM: Configured LLM instance
         """
+        model_name, extra_kwargs = pick_model_and_kwargs()
+
         ktc = KVTransferConfig(
             kv_connector="LMCacheConnectorV1",
             kv_role="kv_both",
         ) if enable_lmcache else None
-        
+
         return LLM(
             model=model_name,
             kv_transfer_config=ktc,
             max_model_len=max_len,
             enable_prefix_caching=False,
-            gpu_memory_utilization=calculate_gpu_utilization()
+            gpu_memory_utilization=0.9,
+            **extra_kwargs,
         )
 
-    def generate_and_print_output(llm, prompts, sampling_params):
+    def generate_and_print_output(
+        llm: LLM,
+        prompts: list[str],
+        sampling_params: SamplingParams,
+    ) -> float:
         """
         Generate text and print the results.
         Args:
@@ -294,14 +356,14 @@ Save the following script as ``cpu-offloading.py``:
         start_time = time.time()
         outputs = llm.generate(prompts, sampling_params)
         end_time = time.time()
-        
+
         for output in outputs:
             generated_text = output.outputs[0].text
             print(f"Generated text: {generated_text!r}")
-        
+
         return end_time - start_time
 
-    def main():
+    def main() -> None:
         """Main execution function."""
         # Parse command line arguments
         args = parse_arguments()

From aad3fdb706c1f20db4c44345d539ffc65e1edc0c Mon Sep 17 00:00:00 2001
From: Tony Lin <tony.lin@intel.com>
Date: Wed, 10 Jun 2026 10:44:48 +0800
Subject: [PATCH 19/57] refactor: utilize multi_layer_block_kv_transfer ops for
 data transfer path (#3600)

* refactor: utilize multi_layer_block_kv_transfer ops for data transfer path

Consolidate the data transfer path by utilizing the `multi_layer_block_kv_transfer` operation.
This update allows a single op to support both handle and data paths simultaneously,
streamlining the underlying transfer logic.

Signed-off-by: Tony Lin <tony.lin@intel.com>

* more comments for clarity

Signed-off-by: Tony Lin <tony.lin@intel.com>

* refactor(test): skip blocks-first fused KV tests on non-CPU devices

- Apply module-level pytestmark to skip all tests in this file when
  torch_device_type is not 'cpu', as the blocks-first fused shape
  (Format 10) is currently CPU-only.
- Move pytestmark to the top of the file for better clarity and
  correct test execution control.

Signed-off-by: Tony Lin <tony.lin@intel.com>

* add GPUKVFormat.NL_X_NB_NH_BS_TWO_HS in python fallback path

Signed-off-by: Tony Lin <tony.lin@intel.com>

* properly handle cuda kernel's limitation

Signed-off-by: Tony Lin <tony.lin@intel.com>

* fix bug

Signed-off-by: Tony Lin <tony.lin@intel.com>

---------

Signed-off-by: Tony Lin <tony.lin@intel.com>
---
 lmcache/python_ops_fallback.py                |   2 +
 .../v1/multiprocess/transfer_context/base.py  | 561 ++++++++++++------
 .../test_blocks_first_fused_kv_format.py      |   6 +
 .../test_non_cuda_data_transfer.py            |  16 +-
 4 files changed, 404 insertions(+), 181 deletions(-)

diff --git a/lmcache/python_ops_fallback.py b/lmcache/python_ops_fallback.py
index df15d2579e..0c28a176d8 100644
--- a/lmcache/python_ops_fallback.py
+++ b/lmcache/python_ops_fallback.py
@@ -843,6 +843,8 @@ def _per_layer_paged_shape(
         return (nb, nh, bs, 2, hs)
     if fmt == int(GPUKVFormat.NL_X_TWO_NB_BS_NH_HS):
         return (2, nb, bs, nh, hs)
+    if fmt == int(GPUKVFormat.NL_X_NB_NH_BS_TWO_HS):
+        return (nb, nh, bs, 2, hs)
     # Covers NL_X_NB_TWO_BS_NH_HS and any future NHD variants.
     return (nb, 2, bs, nh, hs)
 
diff --git a/lmcache/v1/multiprocess/transfer_context/base.py b/lmcache/v1/multiprocess/transfer_context/base.py
index ef43dd6121..afdcc29df7 100644
--- a/lmcache/v1/multiprocess/transfer_context/base.py
+++ b/lmcache/v1/multiprocess/transfer_context/base.py
@@ -18,12 +18,15 @@
 # Standard
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING
+import inspect
 
 # Third Party
+import numpy as np
 import torch
 
 # First Party
+from lmcache import torch_dev
 from lmcache.logging import init_logger
 from lmcache.utils import EngineType
 from lmcache.v1.distributed.api import MemoryLayoutDesc
@@ -38,6 +41,70 @@
 logger = init_logger(__name__)
 
 
+# ---------------------------------------------------------------------------
+# Global capability flag: does lmc_ops.multi_layer_block_kv_transfer accept
+# list[torch.Tensor] directly for lmcache_objects_ptrs, or only list[int]?
+#
+# We inspect the function signature once at import time. If the annotation
+# for ``lmcache_objects_ptrs`` includes ``Tensor``, the op can handle tensors
+# natively and we pass them through. Otherwise (annotation is list[int], or
+# inspect fails entirely) we must convert tensors to data pointers before
+# calling.
+# ---------------------------------------------------------------------------
+def _detect_block_transfer_accepts_tensor() -> bool:
+    """Return True if lmc_ops.multi_layer_block_kv_transfer accepts
+    list[torch.Tensor] for its lmcache_objects_ptrs parameter."""
+    try:
+        # First Party
+        import lmcache.c_ops as _lmc_ops
+
+        fn = _lmc_ops.multi_layer_block_kv_transfer
+
+        # Attempt: use inspect.signature (works on newer pybind11 builds)
+        # Assumptions: if lmcache_objects_ptrs accepts tensors,
+        # it's fallback path, and we do not convert tensors to ptrs explicitly.
+        # TODO: String matching on annotations is fragile. Wait for lmc_ops to
+        # expose a direct version flag (e.g., lmc_ops.__version__) or
+        # an explicit capability boolean.
+        try:
+            sig = inspect.signature(fn)
+            param = sig.parameters.get("lmcache_objects_ptrs")
+            if param is not None and param.annotation is not inspect.Parameter.empty:
+                ann_str = str(param.annotation)
+                if "Tensor" in ann_str:
+                    return True
+                # Annotation exists but no Tensor mention → ptr-only
+                return False
+        except (ValueError, TypeError):
+            pass
+
+    except Exception:
+        # Import failed or any other error → conservative: assume ptr-only
+        pass
+
+    # Default: inspect failed or lmc_ops not available → assume ptr-only
+    return False
+
+
+_LMC_OPS_BLOCK_TRANSFER_ACCEPTS_TENSOR: bool = _detect_block_transfer_accepts_tensor()
+"""If True, ``lmc_ops.multi_layer_block_kv_transfer`` accepts
+``list[torch.Tensor]`` directly for ``lmcache_objects_ptrs``.
+If False, callers must convert tensors to ``list[int]`` data pointers."""
+
+logger.info(
+    "multi_layer_block_kv_transfer mode: %s",
+    "tensor" if _LMC_OPS_BLOCK_TRANSFER_ACCEPTS_TENSOR else "ptr",
+)
+
+
+def _tensors_to_ptrs(tensors: list[torch.Tensor]) -> list[int]:
+    """Convert a list of tensors to a list of their data_ptr() values."""
+    return [t.data_ptr() for t in tensors]
+
+
+# ---------------------------------------------------------------------------
+
+
 @dataclass
 class NonGpuContextMetadata:
     """Non-GPU context layout metadata for non-CUDA workers.
@@ -249,9 +316,10 @@ def gather_paged_kv_to_cpu(
         blocks_per_chunk: Number of paged blocks in one LMCache chunk.
         layout_hints: Optional engine layout hints.
         gpu_kv_format: Optional pre-detected KV format.
-        out: Optional pre-allocated output tensors (one per entry in
-            ``chunk_indices`` when ``chunk_indices`` is given, or one per
-            chunk otherwise).
+        out: Optional pre-allocated output tensors.  If provided, length
+            must be at least ``len(chunk_indices)`` when ``chunk_indices``
+            is given, or the total number of chunks otherwise.  Any extra
+            buffers beyond the number of gathered chunks are ignored.
         chunk_indices: Optional list of chunk positions (into the full
             ``block_ids`` sequence) to gather.  When provided together with
             ``out``, only those chunks are gathered and written into
@@ -263,11 +331,19 @@ def gather_paged_kv_to_cpu(
         ``[2, num_layers, chunk_tokens, hidden_dim]`` where dimension ``0``
         stores ``(K, V)``. For MLA (multi-head latent attention) each chunk
         has shape ``[num_layers, chunk_tokens, hidden_dim]``.
+
+    Raises:
+        ValueError: If ``out`` is provided with fewer buffers than the number
+            of gathered chunks.
     """
     # First Party
     from lmcache.v1.gpu_connector.utils import (
         get_block_size,
+        get_hidden_dim_size,
+        get_num_blocks,
+        get_num_layers,
         is_mla,
+        make_page_buffer_shape_desc,
         normalize_kv_and_discover_format,
     )
     import lmcache.c_ops as lmc_ops
@@ -278,104 +354,187 @@ def gather_paged_kv_to_cpu(
     )
     if gpu_kv_format is None:
         gpu_kv_format = fmt
-    use_mla = is_mla(gpu_kv_format)
-    is_hnd = gpu_kv_format in (
-        lmc_ops.GPUKVFormat.NL_X_TWO_NB_NH_BS_HS,
-        lmc_ops.GPUKVFormat.NL_X_NB_TWO_NH_BS_HS,
-        lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS,
-    )
 
     block_size = get_block_size(normalized, gpu_kv_format)
+    num_layers = get_num_layers(normalized, gpu_kv_format)
+    hidden_dim_size = get_hidden_dim_size(normalized, gpu_kv_format)
+    num_blocks = get_num_blocks(normalized, gpu_kv_format)
     num_chunks = len(block_ids) // blocks_per_chunk
+    chunk_tokens = blocks_per_chunk * block_size
+
+    shape_desc = make_page_buffer_shape_desc(
+        normalized,
+        gpu_kv_format,
+        layer_idx=0,
+        num_layers_in_group=num_layers,
+        num_blocks=num_blocks,
+        block_size=block_size,
+    )
+
+    iter_indices = (
+        list(chunk_indices) if chunk_indices is not None else list(range(num_chunks))
+    )
+    # Require at least one output buffer per gathered chunk. Extra trailing
+    # buffers are ignored (see ``chunks = out[: len(iter_indices)]`` below),
+    # mirroring the scatter-side length check for consistency.
+    if out is not None and len(out) < len(iter_indices):
+        raise ValueError(
+            f"out length ({len(out)}) must be at least the number of "
+            f"gathered chunks ({len(iter_indices)})"
+        )
+
+    # Determine if pinned memory is strictly required
+    # (only for the compiled C++ path which does not accept tensor)
+    requires_pinned = not _LMC_OPS_BLOCK_TRANSFER_ACCEPTS_TENSOR
+    needs_staging = False
+    staged_chunks = []
+
+    if out is None:
+        use_mla = is_mla(gpu_kv_format)
+        if use_mla:
+            chunks = [
+                torch.empty(
+                    (num_layers, chunk_tokens, hidden_dim_size),
+                    dtype=tensors[0].dtype,
+                    device=torch.device("cpu"),
+                    pin_memory=requires_pinned,
+                )
+                for _ in iter_indices
+            ]
+        else:
+            chunks = [
+                torch.empty(
+                    (2, num_layers, chunk_tokens, hidden_dim_size),
+                    dtype=tensors[0].dtype,
+                    device=torch.device("cpu"),
+                    pin_memory=requires_pinned,
+                )
+                for _ in iter_indices
+            ]
+    else:
+        _target_out = out[: len(iter_indices)]
+
+        if requires_pinned and not all(t.is_pinned() for t in _target_out):
+            # Core fallback: Unpinned memory (e.g., IPC Shared Memory) detected.
+            # We cannot dynamically call `.pin_memory()` on `out` because it
+            # would allocate new tensors, breaking the caller's expectation
+            # of an in-place update. Instead, we allocate a temporary pinned
+            # staging buffer for the C++ kernel to write to safely.
+            logger.warning(
+                "Unpinned memory detected in 'out' during "
+                "gather_paged_kv_to_cpu (likely Shared Memory). "
+                "Using an internal pinned staging buffer, which "
+                "adds a CPU memory copy overhead."
+            )
+            needs_staging = True
+            staged_chunks = [torch.empty_like(t, pin_memory=True) for t in _target_out]
+            chunks = (
+                staged_chunks  # Point to the safe staging buffer for the H2D transfer
+            )
+        else:
+            # Ideal case: Memory is pinned, or we are using Python fallback.
+            # Ignore any extra trailing buffers beyond what we actually gather so
+            # the kernel's ``total_blocks % num_objects`` invariant still holds.
+            # Return ``out`` unchanged when no trimming is needed so the in-place
+            # fill contract (result is out) is preserved.
+            if len(out) == len(iter_indices):
+                chunks = out
+            else:
+                chunks = out[: len(iter_indices)]
+
+    selected_block_ids: list[int] = []
+    for chunk_idx in iter_indices:
+        selected_block_ids.extend(
+            block_ids[chunk_idx * blocks_per_chunk : (chunk_idx + 1) * blocks_per_chunk]
+        )
+
+    if selected_block_ids:
+        if _LMC_OPS_BLOCK_TRANSFER_ACCEPTS_TENSOR:
+            # Python fallback: accepts tensor list directly for all params.
+            paged_arg = normalized
+            objs_arg = chunks
+            block_ids_arg = selected_block_ids
+
+            # call kernel in one shot
+            lmc_ops.multi_layer_block_kv_transfer(
+                paged_arg,
+                objs_arg,
+                block_ids_arg,
+                tensors[0].device,
+                lmc_ops.TransferDirection.D2H,
+                shape_desc,
+                chunk_tokens,
+                gpu_kv_format,
+                0,
+            )
 
-    # After normalization the structure is always a list of per-layer
-    # tensors. Cast once so all downstream indexing is typed correctly.
-    layer_tensors = cast(list[torch.Tensor], normalized)
+        else:
+            # Compiled C++/CUDA/XPU: requires int64 pointer tensor and list[int].
+            _ptrs_np = np.array(
+                [t.data_ptr() for t in normalized],  # type: ignore[union-attr]
+                dtype=np.uint64,
+            ).view(np.int64)
+            paged_arg = torch.from_numpy(_ptrs_np).to(device=tensors[0].device)
+
+            # This safely points to either the pre-pinned chunks
+            # OR the temporary staged_chunks
+            objs_arg = _tensors_to_ptrs(chunks)
+
+            block_ids_arg = torch.tensor(
+                selected_block_ids, dtype=torch.int64, device=tensors[0].device
+            )
 
-    chunks: list[torch.Tensor] = [] if out is None else out
+            # Split transfer to respect CUDA kernel's object count limitation
+            MAX_OBJECTS = 4
+            req_blocks_per_obj = blocks_per_chunk
+            total_objects = len(objs_arg)
 
-    # When chunk_indices is given (SHM partial-reservation path), only
-    # process the specified subset.  The i-th entry in chunk_indices is the
-    # position of that chunk within the full block_ids sequence; the
-    # corresponding pre-allocated slot lives at out[i].
-    iter_indices = chunk_indices if chunk_indices is not None else range(num_chunks)
+            for i in range(0, total_objects, MAX_OBJECTS):
+                # Slice object pointers and corresponding block IDs
+                batch_objs_ptrs = objs_arg[i : i + MAX_OBJECTS]
 
-    for out_idx, chunk_idx in enumerate(iter_indices):
-        chunk_block_ids = block_ids[
-            chunk_idx * blocks_per_chunk : (chunk_idx + 1) * blocks_per_chunk
-        ]
-        if use_mla:
-            mla_layers: list[torch.Tensor] = []
-            idx = torch.tensor(chunk_block_ids, dtype=torch.long)
-            for layer in layer_tensors:
-                layer_blocks = layer[idx]
-                mla_layers.append(
-                    layer_blocks.reshape(
-                        len(chunk_block_ids) * block_size, layer_blocks.shape[-1]
-                    )
+                start_block = i * req_blocks_per_obj
+                end_block = min(
+                    (i + MAX_OBJECTS) * req_blocks_per_obj, len(selected_block_ids)
                 )
-            chunk_tensor = torch.stack(mla_layers, dim=0)
-            if out is not None:
-                out[out_idx].copy_(chunk_tensor, non_blocking=True)
-            else:
-                chunks.append(chunk_tensor.cpu())
+                batch_blocks = block_ids_arg[start_block:end_block]
+
+                # Execute batched transfer
+                lmc_ops.multi_layer_block_kv_transfer(
+                    paged_arg,
+                    batch_objs_ptrs,
+                    batch_blocks,
+                    tensors[0].device,
+                    lmc_ops.TransferDirection.D2H,
+                    shape_desc,
+                    chunk_tokens,
+                    gpu_kv_format,
+                    0,
+                )
+
+    # --- Final reconciliation ---
+    # If we used a staging buffer to protect unpinned shared memory,
+    # we now copy the gathered data back into the caller's original tensors.
+    if needs_staging:
+        assert out is not None
+        # The CPU MUST block and wait for the GPU ONLY when a temporary
+        # staging buffer is used. This is because the CPU needs to immediately
+        # read this data for the memory copy below.
+        torch_dev.synchronize()
+
+        for dst, src in zip(_target_out, staged_chunks, strict=False):
+            dst.copy_(src)  # High-speed CPU-to-CPU memory copy
+
+        if len(out) == len(iter_indices):
+            chunks = out
         else:
-            k_layers: list[torch.Tensor] = []
-            v_layers: list[torch.Tensor] = []
-            for layer in layer_tensors:
-                if is_hnd:
-                    if gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_TWO_NB_NH_BS_HS:
-                        k_t = layer[0]
-                        v_t = layer[1]
-                    elif gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS:
-                        # [NB, NH, BS, 2, HS] — K/V fused at dim 3
-                        k_t = layer[:, :, :, 0]
-                        v_t = layer[:, :, :, 1]
-                    else:
-                        k_t = layer[:, 0]
-                        v_t = layer[:, 1]
-                    _num_blocks, num_heads, _block_size, head_size = k_t.shape
-                    k_blocks = k_t[torch.tensor(chunk_block_ids, dtype=torch.long)]
-                    v_blocks = v_t[torch.tensor(chunk_block_ids, dtype=torch.long)]
-                    # HND blocks are [NB, NH, BS, HS]; convert to token-major
-                    # [NB, BS, NH, HS] before flattening to [tokens, NH*HS].
-                    k_layers.append(
-                        k_blocks.permute(0, 2, 1, 3).reshape(
-                            len(chunk_block_ids) * block_size, num_heads * head_size
-                        )
-                    )
-                    v_layers.append(
-                        v_blocks.permute(0, 2, 1, 3).reshape(
-                            len(chunk_block_ids) * block_size, num_heads * head_size
-                        )
-                    )
-                else:
-                    if gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_TWO_NB_BS_NH_HS:
-                        k_t = layer[0]
-                        v_t = layer[1]
-                    else:
-                        k_t = layer[:, 0]
-                        v_t = layer[:, 1]
-                    _num_blocks, _block_size, num_heads, head_size = k_t.shape
-                    k_blocks = k_t[torch.tensor(chunk_block_ids, dtype=torch.long)]
-                    v_blocks = v_t[torch.tensor(chunk_block_ids, dtype=torch.long)]
-                    k_layers.append(
-                        k_blocks.reshape(
-                            len(chunk_block_ids) * block_size, num_heads * head_size
-                        )
-                    )
-                    v_layers.append(
-                        v_blocks.reshape(
-                            len(chunk_block_ids) * block_size, num_heads * head_size
-                        )
-                    )
-            k_stacked = torch.stack(k_layers, dim=0)
-            v_stacked = torch.stack(v_layers, dim=0)
-            chunk_tensor = torch.stack([k_stacked, v_stacked], dim=0)
-            if out is not None:
-                out[out_idx].copy_(chunk_tensor, non_blocking=True)
-            else:
-                chunks.append(chunk_tensor.cpu())
+            chunks = _target_out
+
+    # Fast path: The async GPU copy might still be in progress.
+    # We intentionally omit synchronization here for performance.
+    # WARNING: The caller MUST explicitly call `torch_dev.synchronize()`
+    # before consuming these chunks to ensure data validity.
+
     return chunks
 
 
@@ -392,24 +551,44 @@ def scatter_cpu_to_paged_kv(
 
     Args:
         kv_caches: Per-layer KV tensor mapping to write into.
-        block_ids: Flattened destination block IDs for all chunks.
+        block_ids: Flattened destination block IDs for all chunks.  Length
+            must be at least ``len(chunks) * blocks_per_chunk``; any extra
+            trailing block IDs are ignored.
         chunks: List of CPU chunk tensors (as returned by
             :func:`gather_paged_kv_to_cpu`).
         blocks_per_chunk: Number of paged blocks in one LMCache chunk.
-        skip_first_n_tokens: Token prefix to skip when scattering.
+        skip_first_n_tokens: Token prefix to skip when scattering.  Must be a
+            multiple of ``block_size``; non-aligned values are rounded down
+            to the nearest whole block and an error is logged (matching the
+            GPU transfer path).
         layout_hints: Optional engine layout hints.
         gpu_kv_format: Optional pre-detected KV format.
+
+    Raises:
+        ValueError: If ``block_ids`` is shorter than
+            ``len(chunks) * blocks_per_chunk``.
     """
     # First Party
     from lmcache.v1.gpu_connector.utils import (
         get_block_size,
-        is_mla,
+        get_num_blocks,
+        get_num_layers,
+        make_page_buffer_shape_desc,
         normalize_kv_and_discover_format,
     )
     import lmcache.c_ops as lmc_ops
 
     if not chunks:
         return
+    # Require enough block IDs to cover every chunk. Extra trailing block IDs
+    # are ignored by the per-chunk slicing below, mirroring the gather-side
+    # ``out`` length check for consistency.
+    if len(block_ids) < len(chunks) * blocks_per_chunk:
+        raise ValueError(
+            f"block_ids length ({len(block_ids)}) must be at least "
+            f"len(chunks) ({len(chunks)}) * blocks_per_chunk "
+            f"({blocks_per_chunk})"
+        )
 
     tensors = list(kv_caches.values())
     fmt, normalized = normalize_kv_and_discover_format(
@@ -417,89 +596,119 @@ def scatter_cpu_to_paged_kv(
     )
     if gpu_kv_format is None:
         gpu_kv_format = fmt
-    use_mla = is_mla(gpu_kv_format)
 
     block_size = get_block_size(normalized, gpu_kv_format)
-    device = tensors[0].device
-    is_hnd = gpu_kv_format in (
-        lmc_ops.GPUKVFormat.NL_X_TWO_NB_NH_BS_HS,
-        lmc_ops.GPUKVFormat.NL_X_NB_TWO_NH_BS_HS,
-        lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS,
+    num_layers = get_num_layers(normalized, gpu_kv_format)
+    num_blocks = get_num_blocks(normalized, gpu_kv_format)
+    chunk_tokens = blocks_per_chunk * block_size
+
+    # Block-level transfer can only skip whole blocks. A non-aligned prefix is
+    # rounded down to the nearest block (matching the GPU transfer path in
+    # gpu_transfer.py) rather than raising, so a slightly misaligned skip
+    # degrades gracefully instead of failing the whole retrieve.
+    if skip_first_n_tokens % block_size != 0:
+        logger.error(
+            "skip_first_n_tokens (%d) is not block-aligned (block_size=%d); "
+            "rounding down to %d blocks",
+            skip_first_n_tokens,
+            block_size,
+            skip_first_n_tokens // block_size,
+        )
+    skip_prefix_n_blocks = skip_first_n_tokens // block_size
+
+    shape_desc = make_page_buffer_shape_desc(
+        normalized,
+        gpu_kv_format,
+        layer_idx=0,
+        num_layers_in_group=num_layers,
+        num_blocks=num_blocks,
+        block_size=block_size,
     )
 
-    # After normalization the structure is always a list of per-layer
-    # tensors. Cast once so all downstream indexing is typed correctly.
-    layer_tensors = cast(list[torch.Tensor], normalized)
-
-    for chunk_idx, chunk_cpu in enumerate(chunks):
-        chunk_block_ids = block_ids[
-            chunk_idx * blocks_per_chunk : (chunk_idx + 1) * blocks_per_chunk
-        ]
-        if not chunk_block_ids:
-            continue
-
-        chunk_start_token = chunk_idx * blocks_per_chunk * block_size
-        chunk_end_token = chunk_start_token + len(chunk_block_ids) * block_size
-        effective_start = max(chunk_start_token, skip_first_n_tokens)
-        if effective_start >= chunk_end_token:
-            continue
+    selected_block_ids: list[int] = []
+    for chunk_idx in range(len(chunks)):
+        selected_block_ids.extend(
+            block_ids[chunk_idx * blocks_per_chunk : (chunk_idx + 1) * blocks_per_chunk]
+        )
 
-        skip_blocks_in_chunk = (effective_start - chunk_start_token) // block_size
-        effective_block_ids = chunk_block_ids[skip_blocks_in_chunk:]
-        if not effective_block_ids:
-            continue
-
-        skip_tokens = skip_blocks_in_chunk * block_size
-        chunk_device = chunk_cpu.to(device)
+    if not selected_block_ids:
+        return
 
-        if use_mla:
-            eff_idx = torch.tensor(effective_block_ids, dtype=torch.long)
-            for layer_idx, layer in enumerate(layer_tensors):
-                mla_src = chunk_device[layer_idx, skip_tokens:]
-                hidden_size = layer.shape[-1]
-                mla_src_3d = mla_src.reshape(
-                    len(effective_block_ids), block_size, hidden_size
-                )
-                layer[eff_idx] = mla_src_3d
-        elif is_hnd:
-            for layer_idx, layer in enumerate(layer_tensors):
-                k_src = chunk_device[0, layer_idx, skip_tokens:]
-                v_src = chunk_device[1, layer_idx, skip_tokens:]
-                if gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_TWO_NB_NH_BS_HS:
-                    k_t = layer[0]
-                    v_t = layer[1]
-                elif gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_NB_NH_BS_TWO_HS:
-                    # [NB, NH, BS, 2, HS] — K/V fused at dim 3
-                    k_t = layer[:, :, :, 0]
-                    v_t = layer[:, :, :, 1]
-                else:
-                    k_t = layer[:, 0]
-                    v_t = layer[:, 1]
-                _nb, nh, _bs, hs = k_t.shape
-                k_blocks = k_src.reshape(
-                    len(effective_block_ids), block_size, nh, hs
-                ).permute(0, 2, 1, 3)
-                v_blocks = v_src.reshape(
-                    len(effective_block_ids), block_size, nh, hs
-                ).permute(0, 2, 1, 3)
-                k_t[effective_block_ids] = k_blocks
-                v_t[effective_block_ids] = v_blocks
-        else:
-            for layer_idx, layer in enumerate(layer_tensors):
-                k_src = chunk_device[0, layer_idx, skip_tokens:]
-                v_src = chunk_device[1, layer_idx, skip_tokens:]
-                if gpu_kv_format == lmc_ops.GPUKVFormat.NL_X_TWO_NB_BS_NH_HS:
-                    k_t = layer[0]
-                    v_t = layer[1]
-                else:
-                    k_t = layer[:, 0]
-                    v_t = layer[:, 1]
-                _num_blocks, _block_size, num_heads, head_size = k_t.shape
-                k_src_4d = k_src.reshape(
-                    len(effective_block_ids), block_size, num_heads, head_size
-                )
-                v_src_4d = v_src.reshape(
-                    len(effective_block_ids), block_size, num_heads, head_size
-                )
-                k_t[effective_block_ids] = k_src_4d
-                v_t[effective_block_ids] = v_src_4d
+    if _LMC_OPS_BLOCK_TRANSFER_ACCEPTS_TENSOR:
+        # Python fallback: accepts tensor list directly for all params.
+        paged_arg = normalized
+        objs_arg = chunks
+        block_ids_arg = selected_block_ids
+
+        lmc_ops.multi_layer_block_kv_transfer(
+            paged_arg,
+            objs_arg,
+            block_ids_arg,
+            tensors[0].device,
+            lmc_ops.TransferDirection.H2D,
+            shape_desc,
+            chunk_tokens,
+            gpu_kv_format,
+            skip_prefix_n_blocks,
+        )
+    else:
+        # assuming this is c ops path which requires pin memory
+        # TODO: may have a better approach here
+        # Defensive check: Ensure all incoming CPU chunks are pinned memory.
+        # Otherwise, the underlying CUDA kernel may throw an Illegal
+        # Memory Access error during H2D transfer.
+        if not all(chunk.is_pinned() for chunk in chunks):
+            logger.warning(
+                "Received unpinned CPU tensors in scatter_cpu_to_paged_kv. "
+                "Dynamically pinning memory now, which may incur additional"
+                "synchronization overhead."
+            )
+            chunks = [
+                chunk.pin_memory() if not chunk.is_pinned() else chunk
+                for chunk in chunks
+            ]
+
+        # Compiled C++/CUDA/XPU: requires int64 pointer tensor and list[int].
+        _ptrs_np = np.array(
+            [t.data_ptr() for t in normalized],  # type: ignore[union-attr]
+            dtype=np.uint64,
+        ).view(np.int64)
+        paged_arg = torch.from_numpy(_ptrs_np).to(device=tensors[0].device)
+        objs_arg = _tensors_to_ptrs(chunks)
+        block_ids_arg = torch.tensor(
+            selected_block_ids, dtype=torch.int64, device=tensors[0].device
+        )
+
+        # Batched transfer to satisfy cuda's limitation (max 4 objects)
+        MAX_OBJECTS = 4
+        req_blocks_per_obj = (
+            blocks_per_chunk  # Each chunk corresponds to one object's blocks
+        )
+        total_chunks = len(chunks)
+
+        for i in range(0, total_chunks, MAX_OBJECTS):
+            # Slice objects and block IDs for this batch
+            batch_objs_ptrs = objs_arg[i : i + MAX_OBJECTS]
+
+            start_block = i * req_blocks_per_obj
+            end_block = min(
+                (i + MAX_OBJECTS) * req_blocks_per_obj, len(selected_block_ids)
+            )
+            batch_blocks = block_ids_arg[start_block:end_block]
+
+            # Execute transfer for this batch
+            lmc_ops.multi_layer_block_kv_transfer(
+                paged_arg,
+                batch_objs_ptrs,
+                batch_blocks,
+                tensors[0].device,
+                lmc_ops.TransferDirection.H2D,
+                shape_desc,
+                chunk_tokens,
+                gpu_kv_format,
+                skip_prefix_n_blocks if i == 0 else 0,
+            )
+    # Fast path: The async GPU copy might still be in progress.
+    # We intentionally omit synchronization here for performance.
+    # WARNING: The caller MUST explicitly call `torch_dev.synchronize()`
+    # before consuming these chunks to ensure data validity.
diff --git a/tests/v1/gpu_connector/test_blocks_first_fused_kv_format.py b/tests/v1/gpu_connector/test_blocks_first_fused_kv_format.py
index bf93d66a04..e664c368a1 100644
--- a/tests/v1/gpu_connector/test_blocks_first_fused_kv_format.py
+++ b/tests/v1/gpu_connector/test_blocks_first_fused_kv_format.py
@@ -16,6 +16,7 @@
 import torch
 
 # First Party
+from lmcache import torch_device_type
 from lmcache.python_ops_fallback import (
     multi_layer_block_kv_transfer as fallback_multi_layer_block_kv_transfer,
 )
@@ -30,6 +31,11 @@
 )
 import lmcache.c_ops as lmc_ops
 
+pytestmark = pytest.mark.skipif(
+    torch_device_type != "cpu",
+    reason="vLLM blocks-first fused format (Format 10) is strictly CPU-only.",
+)
+
 NB, NH, BS, HS, NL = 16, 4, 128, 64, 3
 HINTS = {"kv_layout": "HND"}
 
diff --git a/tests/v1/multiprocess/test_non_cuda_data_transfer.py b/tests/v1/multiprocess/test_non_cuda_data_transfer.py
index cfea3d0abf..6c0f56cd64 100644
--- a/tests/v1/multiprocess/test_non_cuda_data_transfer.py
+++ b/tests/v1/multiprocess/test_non_cuda_data_transfer.py
@@ -13,6 +13,7 @@
 import torch
 
 # First Party
+from lmcache import torch_dev, torch_device_type
 from lmcache.v1.distributed.api import MemoryLayoutDesc
 from lmcache.v1.multiprocess.posix_shm import (
     shm_create_readwrite,
@@ -429,7 +430,7 @@ def test_compute_kv_layout_and_gather_scatter_roundtrip(
         scatter_cpu_to_paged_kv,
     )
 
-    source = builder_fn()
+    source = {k: v.to(torch_device_type) for k, v in builder_fn().items()}
     (
         block_size,
         num_layers,
@@ -477,7 +478,7 @@ def test_gather_scatter_roundtrip_hnd_layout(
     )
     import lmcache.c_ops as lmc_ops
 
-    source = hnd_builder(2, 8, 4, 2, 8)
+    source = {k: v.to(torch_device_type) for k, v in hnd_builder(2, 8, 4, 2, 8).items()}
     layout_hints: LayoutHints = {"kv_layout": "HND"}
     (
         block_size,
@@ -576,7 +577,7 @@ def test_scatter_respects_skip_first_n_tokens(
         scatter_cpu_to_paged_kv,
     )
 
-    source = builder_fn()
+    source = {k: v.to(torch_device_type) for k, v in builder_fn().items()}
     destination = {
         name: torch.full_like(tensor, 999.0) for name, tensor in source.items()
     }
@@ -962,7 +963,10 @@ def test_gather_paged_kv_with_chunk_indices_subset() -> None:
     from lmcache.v1.multiprocess.transfer_context.base import gather_paged_kv_to_cpu
 
     # 3 chunks (6 blocks, 2 blocks per chunk), but we only want chunks 0 and 2
-    source = _make_kv_caches(num_layers=2, num_blocks=6, block_size=4)
+    source = {
+        k: v.to(torch_device_type)
+        for k, v in _make_kv_caches(num_layers=2, num_blocks=6, block_size=4).items()
+    }
     blocks_per_chunk = 2
     # Pre-allocate output buffers for chunks 0 and 2 only (2 tensors, not 3).
     # Shape: [2, num_layers, chunk_tokens, hidden_dim] where
@@ -980,7 +984,7 @@ def test_gather_paged_kv_with_chunk_indices_subset() -> None:
         out=out_buffers,
         chunk_indices=[0, 2],
     )
-
+    torch_dev.synchronize()
     # Result should be the same list as out_buffers (in-place fill)
     assert result is out_buffers
 
@@ -988,6 +992,8 @@ def test_gather_paged_kv_with_chunk_indices_subset() -> None:
     # out_buffers[1] should contain chunk 2 (blocks 4,5) data
     # Verify by independently gathering all chunks and comparing
     all_chunks = gather_paged_kv_to_cpu(source, [0, 1, 2, 3, 4, 5], blocks_per_chunk)
+    torch_dev.synchronize()
+
     assert torch.allclose(out_buffers[0], all_chunks[0])
     assert torch.allclose(out_buffers[1], all_chunks[2])
 

From 566698e88fa20ebeda3503147ed65a95685a7953 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 10 Jun 2026 02:58:21 +0000
Subject: [PATCH 20/57] chore(deps): bump sphinxcontrib-mermaid from 1.2.2 to
 2.0.2 (#3596)

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 requirements/docs.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/docs.txt b/requirements/docs.txt
index 65589f4284..4666cf5a52 100644
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@@ -9,7 +9,7 @@ sphinxcontrib-qthelp==2.0.0
 sphinxcontrib-serializinghtml==2.0.0
 sphinxawesome_theme==5.3.2
 sphinx-copybutton==0.5.2
-sphinxcontrib-mermaid==1.2.2
+sphinxcontrib-mermaid==2.0.2
 sphinx-multiversion==0.2.4
 sphinx-intl==2.3.1
 sphinxcontrib-images

From fc39f476a0a9aaf1b873dfd0d362f316b6bc2044 Mon Sep 17 00:00:00 2001
From: maobaolong <baoloongmao@tencent.com>
Date: Wed, 10 Jun 2026 11:28:47 +0800
Subject: [PATCH 21/57] [CI] Reduce ci cpu e2e test memory request (#3614)

* Reduce ci cpu e2e test memory request

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* use python to compute kv cache bytes so float values work

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

---------

Signed-off-by: baoloongmao <baoloongmao@tencent.com>
---
 .../scripts/run-cpu-e2e-validation.sh         | 33 ++++++++++++++-----
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/.buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh b/.buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh
index 7825f37e8a..cd9bd24604 100755
--- a/.buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh
+++ b/.buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh
@@ -13,9 +13,16 @@ LMCACHE_PID=""
 VLLM_PID=""
 LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
 VLLM_PORT="${VLLM_PORT:-8000}"
-LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-2}"
+LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-1}"
 LMCACHE_EVICTION_POLICY="${LMCACHE_EVICTION_POLICY:-LRU}"
 LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-128}"
+VLLM_GPU_MEMORY_UTILIZATION="${VLLM_GPU_MEMORY_UTILIZATION:-0.1}"
+# CPU-only KV cache pool size in GiB. vLLM defaults to 4 GiB when unset;
+# 1 GiB is plenty for opt-125m e2e validation.
+VLLM_CPU_KVCACHE_SPACE="${VLLM_CPU_KVCACHE_SPACE:-1}"
+# Cap context length and concurrent sequences to shrink scheduler buffers.
+VLLM_MAX_MODEL_LEN="${VLLM_MAX_MODEL_LEN:-2048}"
+VLLM_MAX_NUM_SEQS="${VLLM_MAX_NUM_SEQS:-4}"
 LMCACHE_HEALTHCHECK_TIMEOUT="${LMCACHE_HEALTHCHECK_TIMEOUT:-30}"
 VLLM_READY_TIMEOUT="${VLLM_READY_TIMEOUT:-120}"
 # Set LMCACHE_SHM_NAME="" to use pickle transport; unset/default uses shm transport
@@ -197,14 +204,24 @@ print(json.dumps({
 
 start_vllm() {
   echo "Starting vLLM server..."
-  VLLM_TARGET_DEVICE=cpu \
-  LMCACHE_MP_TRANSFER_MODE="${LMCACHE_MP_TRANSFER_MODE}" \
+  # Export so multiproc_executor worker children inherit these. Without
+  # VLLM_CPU_KVCACHE_SPACE, CPU backend falls back to
+  # `total_memory * gpu_memory_utilization`, which can request 100s of GiB
+  # on big hosts and OOM (see vllm/v1/worker/cpu_worker.py:determine_available_memory).
+  export VLLM_TARGET_DEVICE=cpu
+  export VLLM_CPU_KVCACHE_SPACE="${VLLM_CPU_KVCACHE_SPACE}"
+  export LMCACHE_MP_TRANSFER_MODE="${LMCACHE_MP_TRANSFER_MODE}"
+  local kv_cache_bytes
+  kv_cache_bytes="$(python3 -c "print(int(${VLLM_CPU_KVCACHE_SPACE} * 1024 * 1024 * 1024))")"
   vllm serve facebook/opt-125m \
     --port "${VLLM_PORT}" \
     --dtype bfloat16 \
     --disable-hybrid-kv-cache-manager \
     --no-enable-prefix-caching \
-    --gpu-memory-utilization 0.3 \
+    --gpu-memory-utilization "${VLLM_GPU_MEMORY_UTILIZATION}" \
+    --kv-cache-memory-bytes "${kv_cache_bytes}" \
+    --max-model-len "${VLLM_MAX_MODEL_LEN}" \
+    --max-num-seqs "${VLLM_MAX_NUM_SEQS}" \
     --kv-transfer-config '{"kv_connector":"LMCacheMPConnector","kv_role":"kv_both"}' \
     >"${VLLM_LOG}" 2>&1 &
   VLLM_PID=$!
@@ -421,7 +438,7 @@ start_vllm
 # Request A (first time) → should trigger store
 echo "[Phase 3 / Step 3] Request A (first) — expecting LMCache store"
 L1_WRITE_BEFORE=$(scrape_metric "lmcache_mp_l1_write_chunks_total")
-OUTPUT_1=$(send_completion "${PROMPT_FILE}" 200)
+OUTPUT_1=$(send_completion "${PROMPT_FILE}" 50)
 echo "Output 1: ${OUTPUT_1}"
 sleep 2  # allow async store to complete
 L1_WRITE_AFTER=$(scrape_metric "lmcache_mp_l1_write_chunks_total")
@@ -436,7 +453,7 @@ echo "✅ LMCache store verified (${STORE_DELTA} chunks written)"
 # Request A (second time, same vLLM instance) → should trigger read/hit
 echo "[Phase 3 / Step 4] Request A (second) — expecting LMCache hit"
 L1_READ_BEFORE=$(scrape_metric "lmcache_mp_l1_read_chunks_total")
-OUTPUT_2=$(send_completion "${PROMPT_FILE}" 200)
+OUTPUT_2=$(send_completion "${PROMPT_FILE}" 50)
 echo "Output 2: ${OUTPUT_2}"
 sleep 2
 L1_READ_AFTER=$(scrape_metric "lmcache_mp_l1_read_chunks_total")
@@ -457,7 +474,7 @@ start_vllm
 # Request A (third time, new vLLM instance) → should trigger read/hit from LMCache
 echo "[Phase 3 / Step 6] Request A (third) — expecting LMCache hit after vLLM restart"
 L1_READ_BEFORE=$(scrape_metric "lmcache_mp_l1_read_chunks_total")
-OUTPUT_3=$(send_completion "${PROMPT_FILE}" 200)
+OUTPUT_3=$(send_completion "${PROMPT_FILE}" 50)
 echo "Output 3: ${OUTPUT_3}"
 sleep 2
 L1_READ_AFTER=$(scrape_metric "lmcache_mp_l1_read_chunks_total")
@@ -494,7 +511,7 @@ story_b = '''In the year 2147, humanity established its first permanent colony o
 print(story_b, end='')
 " > "${PROMPT_FILE_B}"
 L1_READ_BEFORE=$(scrape_metric "lmcache_mp_l1_read_chunks_total")
-OUTPUT_B=$(send_completion "${PROMPT_FILE_B}" 200)
+OUTPUT_B=$(send_completion "${PROMPT_FILE_B}" 50)
 echo "Output B: ${OUTPUT_B}"
 wait_for_metric_change "lmcache_mp_l1_read_chunks_total" "${L1_READ_BEFORE}" 5 || true
 L1_READ_AFTER=$(scrape_metric "lmcache_mp_l1_read_chunks_total")

From a5b70473675e1c1358ff274c3c4ac70b2235ab2f Mon Sep 17 00:00:00 2001
From: Javen-Ke <javen@arcfra.com>
Date: Wed, 10 Jun 2026 12:28:28 +0800
Subject: [PATCH 22/57] bench: support aligned L1 buffers for L2 adapters
 (#3603)

Signed-off-by: Javen Ke <javen@arcfra.com>
---
 docs/source/cli/bench.rst                     | 27 ++++++-
 .../bench/l2_adapter_bench/command.py         | 73 ++++++++++++-----
 .../commands/bench/l2_adapter_bench/data.py   | 80 ++++++++++++++++---
 .../bench/l2_adapter_bench/test_data.py       | 61 ++++++++++++++
 4 files changed, 205 insertions(+), 36 deletions(-)
 create mode 100644 tests/cli/commands/bench/l2_adapter_bench/test_data.py

diff --git a/docs/source/cli/bench.rst b/docs/source/cli/bench.rst
index fd6462affc..67d66e267e 100644
--- a/docs/source/cli/bench.rst
+++ b/docs/source/cli/bench.rst
@@ -949,6 +949,12 @@ support a clean store -> load round-trip.
    ``"use_odirect": true``) or that talk to a remote service without
    a local cache, the default combined run is usually fine.
 
+   O_DIRECT adapters may also require the benchmark L1 buffer to
+   satisfy the adapter's block alignment. Use ``--l1-align-bytes`` to
+   set that alignment, commonly ``4096`` for local block devices. The
+   payload size (``--data-size-kb * 1024``) must be a multiple of the
+   selected alignment.
+
 
 Quick start
 ~~~~~~~~~~~
@@ -973,6 +979,15 @@ Stress the adapter with more in-flight submits and larger payloads:
        --data-size-kb 512 \
        --rounds 5 --warmup-rounds 1
 
+Benchmark an O_DIRECT adapter with aligned L1 buffers:
+
+.. code-block:: bash
+
+   lmcache bench l2 \
+       --l2-adapter '{"type":"raw_block","device_path":"/dev/nvme0n1","slot_bytes":4194304,"use_odirect":true,"block_align":4096}' \
+       --data-size-kb 1024 \
+       --l1-align-bytes 4096
+
 Run only one operation (useful to isolate store vs. load throughput):
 
 .. code-block:: bash
@@ -1039,6 +1054,13 @@ Options
    * - ``--data-size-kb N``
      - ``256``
      - Data size per key, in KiB.
+   * - ``--l1-align-bytes N``
+     - ``1``
+     - Alignment in bytes for benchmark L1 buffers. Use a value
+       at least as large as the adapter's block alignment when
+       benchmarking O_DIRECT backends, for example ``4096`` for local
+       block devices. ``--data-size-kb * 1024`` must be a multiple of
+       this value.
    * - ``--rounds N``
      - ``1``
      - Measurement rounds per operation.
@@ -1186,9 +1208,8 @@ round against the byte pattern that ``store`` wrote (see
    [Verify] OK
 
 Verification is **off** by default because the stricter byte pattern
-also forces every key to allocate its own ``data_size`` buffer
-(otherwise the runner is free to reuse a single shared buffer across
-keys to keep the memory footprint small).
+requires both the store and load object batches to stay resident so the
+loaded data can be compared against the original store pattern.
 
 
 Exit codes
diff --git a/lmcache/cli/commands/bench/l2_adapter_bench/command.py b/lmcache/cli/commands/bench/l2_adapter_bench/command.py
index 0befc1e057..1b07d2f5b0 100644
--- a/lmcache/cli/commands/bench/l2_adapter_bench/command.py
+++ b/lmcache/cli/commands/bench/l2_adapter_bench/command.py
@@ -99,6 +99,15 @@ def register_l2_parser(
         default=256,
         help="Data size per key in KB (default: 256).",
     )
+    parser.add_argument(
+        "--l1-align-bytes",
+        type=int,
+        default=1,
+        help=(
+            "Alignment in bytes for benchmark L1 buffers. "
+            "Use 4096 when benchmarking O_DIRECT backends. Default: 1."
+        ),
+    )
     parser.add_argument(
         "--rounds",
         type=int,
@@ -125,8 +134,8 @@ def register_l2_parser(
             "never stored. Default: 0.0."
         ),
     )
-    # Round-trip verification is OFF by default (cheaper memory
-    # footprint: see make_memory_objects' share_buffer layout).
+    # Round-trip verification is OFF by default because it needs both
+    # store and load object batches resident at the same time.
     # Use --no-skip-verify to enable verification.
     parser.add_argument(
         "--skip-verify",
@@ -168,12 +177,10 @@ def run_l2_adapter_bench(command: "BaseCommand", args: argparse.Namespace) -> No
         args: Parsed CLI arguments from the ``bench l2`` subparser.
     """
     # Lazy imports: keep CLI loadable without torch / native deps.
-    # Third Party
-    import torch
-
     # First Party
     from lmcache.cli.commands.bench.l2_adapter_bench.data import (
         create_l1_memory_desc,
+        make_aligned_tensor,
         make_memory_objects,
         make_object_keys,
         verify_round_trip,
@@ -191,6 +198,17 @@ def run_l2_adapter_bench(command: "BaseCommand", args: argparse.Namespace) -> No
     kb = 1024
     mb = 1024 * 1024
     data_size = args.data_size_kb * kb
+    l1_align_bytes = int(args.l1_align_bytes)
+    if l1_align_bytes <= 0:
+        print("Error: --l1-align-bytes must be positive", file=sys.stderr)
+        sys.exit(2)
+    if data_size % l1_align_bytes != 0:
+        print(
+            "Error: --data-size-kb must produce a payload size that is "
+            "a multiple of --l1-align-bytes",
+            file=sys.stderr,
+        )
+        sys.exit(2)
     in_flight = args.in_flight
     num_keys = args.num_keys
     rounds = args.rounds
@@ -241,8 +259,8 @@ def log(msg: str) -> None:
 
     # Backing L1 memory buffer for adapters that need an L1 desc.
     # Sized for one in-flight wave of store + load buffers.
-    l1_buffer = torch.empty(2 * keys_per_round * data_size, dtype=torch.uint8)
-    l1_memory_desc = create_l1_memory_desc(l1_buffer)
+    l1_buffer = make_aligned_tensor(2 * keys_per_round * data_size, l1_align_bytes)
+    l1_memory_desc = create_l1_memory_desc(l1_buffer, align_bytes=l1_align_bytes)
 
     log("\n[Init] Creating adapter...")
     try:
@@ -277,20 +295,28 @@ def _build_round_keys(r: int) -> list[list]:
             for i in range(in_flight)
         ]
 
-    def _build_round_objs() -> list[list]:
-        """Allocate per-submit object batches for one round.
+    def _build_round_objs(base_offset: int, fill_offset: int = 0) -> list[list]:
+        """Build per-submit object batches backed by the registered L1 buffer.
 
-        Every key in every batch gets its OWN ``data_size`` tensor,
-        pre-filled with a distinguishing byte pattern. This keeps
-        ``verify_round_trip`` meaningful (it can detect cross-key
-        corruption after a store -> load cycle) and keeps the
-        memory layout consistent regardless of whether verify is
-        actually run.
+        Some adapters register the L1 buffer passed through ``L1MemoryDesc``
+        during initialization. The benchmark objects must therefore be views
+        into that same buffer rather than independent tensors allocated
+        elsewhere.
 
-        Per-round (per direction) memory:
-        ``in_flight * num_keys * data_size`` bytes.
+        ``fill_offset`` lets load buffers start with a pattern that differs
+        from store buffers, so round-trip verification catches silent no-op
+        loads that nevertheless report success.
         """
-        return [make_memory_objects(num_keys, data_size) for _ in range(in_flight)]
+        return [
+            make_memory_objects(
+                l1_buffer,
+                num_keys,
+                data_size,
+                base_offset + i * num_keys * data_size,
+                fill_offset=fill_offset,
+            )
+            for i in range(in_flight)
+        ]
 
     # Lookup hit/miss split per round.
     per_round_hit = int(keys_per_round * max_hit_rate)
@@ -338,13 +364,16 @@ def _build_lookup_round_keys(r: int) -> list[list]:
     def _store_objs(_r: int) -> list[list]:
         nonlocal store_obj_batches
         if store_obj_batches is None:
-            store_obj_batches = _build_round_objs()
+            store_obj_batches = _build_round_objs(0)
         return store_obj_batches
 
     def _load_objs(_r: int) -> list[list]:
         nonlocal load_obj_batches
         if load_obj_batches is None:
-            load_obj_batches = _build_round_objs()
+            load_obj_batches = _build_round_objs(
+                keys_per_round * data_size,
+                fill_offset=1,
+            )
         return load_obj_batches
 
     results: list = []
@@ -418,8 +447,8 @@ def _load_objs(_r: int) -> list[list]:
             # Sanity: store and load used the same key idx range for
             # the last measured round, and load buffers now hold what
             # the adapter returned. Compare against the byte pattern
-            # written by store (i & 0xFF, where i is position within
-            # the batch — see make_memory_objects).
+            # written by the store object batch (i & 0xFF, where i is
+            # position within the batch).
             log(
                 "[Verify] Checking store -> load data integrity for last "
                 "measured round..."
diff --git a/lmcache/cli/commands/bench/l2_adapter_bench/data.py b/lmcache/cli/commands/bench/l2_adapter_bench/data.py
index 9da2fb9242..6224a8feac 100644
--- a/lmcache/cli/commands/bench/l2_adapter_bench/data.py
+++ b/lmcache/cli/commands/bench/l2_adapter_bench/data.py
@@ -24,6 +24,38 @@
 _KB = 1024
 
 
+def make_aligned_tensor(num_bytes: int, align_bytes: int = 1) -> torch.Tensor:
+    """Create a 1-D uint8 tensor whose data pointer is aligned.
+
+    Args:
+        num_bytes: Number of bytes in the returned tensor.
+        align_bytes: Required data pointer alignment in bytes.
+
+    Returns:
+        A 1-D ``torch.uint8`` tensor with ``num_bytes`` elements.
+
+    Raises:
+        ValueError: If ``num_bytes`` is negative or ``align_bytes`` is
+            not positive.
+        RuntimeError: If the allocated tensor cannot be aligned.
+    """
+    if num_bytes < 0:
+        raise ValueError("num_bytes must be non-negative")
+    if align_bytes <= 0:
+        raise ValueError("align_bytes must be positive")
+    if align_bytes == 1:
+        return torch.empty(num_bytes, dtype=torch.uint8)
+
+    backing = torch.empty(num_bytes + align_bytes - 1, dtype=torch.uint8)
+    offset = (-backing.data_ptr()) % align_bytes
+    aligned = backing[offset : offset + num_bytes]
+    if aligned.data_ptr() % align_bytes != 0:
+        raise RuntimeError(
+            f"failed to allocate {align_bytes}-byte aligned benchmark buffer"
+        )
+    return aligned
+
+
 def make_object_keys(
     num_keys: int, model_name: str = "bench-model", key_offset: int = 0
 ) -> list[ObjectKey]:
@@ -53,23 +85,46 @@ def make_object_keys(
 
 
 def make_memory_objects(
+    buffer: torch.Tensor,
     num_keys: int,
     data_size: int,
+    base_offset: int,
+    fill_offset: int = 0,
 ) -> list[MemoryObj]:
-    """Create *num_keys* ``TensorMemoryObj`` instances of *data_size* bytes.
+    """Create MemoryObj views backed by a shared L1 benchmark buffer.
 
-    Each returned object owns an independent ``data_size``-byte tensor
-    pre-filled with a distinguishing byte pattern (key index mod 256)
-    so that ``verify_round_trip`` can detect cross-key corruption after
-    a store -> load cycle.
+    Each returned object is a ``data_size``-byte slice of ``buffer``,
+    pre-filled with a distinguishing byte pattern
+    ``(key_index + fill_offset) mod 256`` so that ``verify_round_trip``
+    can detect cross-key corruption after a store -> load cycle.
 
-    Per-call memory: ``num_keys * data_size``.
+    Args:
+        buffer: Contiguous benchmark L1 buffer that backs all objects.
+        num_keys: Number of memory objects to create.
+        data_size: Size of each memory object in bytes.
+        base_offset: Byte offset of the first object within ``buffer``.
+        fill_offset: Offset added to each key index before generating the
+            byte fill pattern.
+
+    Returns:
+        ``TensorMemoryObj`` instances whose ``raw_data`` tensors are views
+        into ``buffer``.
+
+    Raises:
+        ValueError: If the requested object range falls outside ``buffer``.
     """
-    # Independent buffers with distinguishing fill patterns for verify.
+    flat_buffer = buffer.view(-1)
     objects: list[MemoryObj] = []
     for i in range(num_keys):
-        raw_tensor = torch.empty(data_size, dtype=torch.uint8)
-        raw_tensor.fill_(i & 0xFF)
+        start = base_offset + i * data_size
+        end = start + data_size
+        if start < 0 or end > flat_buffer.numel():
+            raise ValueError(
+                f"L1 benchmark buffer too small for object {i}: "
+                f"[{start}, {end}) > {flat_buffer.numel()}"
+            )
+        raw_tensor = flat_buffer[start:end]
+        raw_tensor.fill_((i + fill_offset) & 0xFF)
         metadata = MemoryObjMetadata(
             shape=torch.Size([data_size]),
             dtype=torch.uint8,
@@ -88,13 +143,16 @@ def make_memory_objects(
     return objects
 
 
-def create_l1_memory_desc(buffer: torch.Tensor) -> L1MemoryDesc:
+def create_l1_memory_desc(
+    buffer: torch.Tensor,
+    align_bytes: int = 1,
+) -> L1MemoryDesc:
     """Create an L1 memory descriptor for a contiguous test buffer."""
     flat_buffer = buffer.view(-1)
     return L1MemoryDesc(
         ptr=flat_buffer.data_ptr(),
         size=flat_buffer.numel() * flat_buffer.element_size(),
-        align_bytes=flat_buffer.element_size(),
+        align_bytes=align_bytes,
     )
 
 
diff --git a/tests/cli/commands/bench/l2_adapter_bench/test_data.py b/tests/cli/commands/bench/l2_adapter_bench/test_data.py
new file mode 100644
index 0000000000..affa8c70c4
--- /dev/null
+++ b/tests/cli/commands/bench/l2_adapter_bench/test_data.py
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Third Party
+import torch
+
+# First Party
+from lmcache.cli.commands.bench.l2_adapter_bench.data import (
+    create_l1_memory_desc,
+    make_aligned_tensor,
+    make_memory_objects,
+)
+
+
+def test_make_aligned_tensor_returns_aligned_buffer() -> None:
+    tensor = make_aligned_tensor(4096 * 3, align_bytes=4096)
+
+    assert tensor.numel() == 4096 * 3
+    assert tensor.dtype == torch.uint8
+    assert tensor.data_ptr() % 4096 == 0
+
+
+def test_create_l1_memory_desc_uses_requested_alignment() -> None:
+    tensor = make_aligned_tensor(8192, align_bytes=4096)
+
+    desc = create_l1_memory_desc(tensor, align_bytes=4096)
+
+    assert desc.ptr == tensor.data_ptr()
+    assert desc.size == 8192
+    assert desc.align_bytes == 4096
+
+
+def test_make_memory_objects_uses_shared_l1_range() -> None:
+    buffer = make_aligned_tensor(4096, align_bytes=1024)
+
+    objects = make_memory_objects(
+        buffer,
+        num_keys=2,
+        data_size=1024,
+        base_offset=1024,
+    )
+
+    assert len(objects) == 2
+    assert objects[0].raw_data.data_ptr() == buffer.data_ptr() + 1024
+    assert objects[1].raw_data.data_ptr() == buffer.data_ptr() + 2048
+    assert torch.all(objects[0].raw_data == 0)
+    assert torch.all(objects[1].raw_data == 1)
+
+
+def test_make_memory_objects_can_use_different_fill_pattern() -> None:
+    buffer = make_aligned_tensor(2048, align_bytes=1024)
+
+    objects = make_memory_objects(
+        buffer,
+        num_keys=2,
+        data_size=1024,
+        base_offset=0,
+        fill_offset=1,
+    )
+
+    assert torch.all(objects[0].raw_data == 1)
+    assert torch.all(objects[1].raw_data == 2)

From 4bbfd11b0f7b57af61fa0868b444d41ce14f401b Mon Sep 17 00:00:00 2001
From: Shaoting <shaotingf@uchicago.edu>
Date: Wed, 10 Jun 2026 12:32:52 -0700
Subject: [PATCH 23/57] [core] Add GDS L1 tier (cuFile DMA) for MP mode (#3589)

Signed-off-by: Shaoting-Feng <shaotingf@tensormesh.ai>
---
 .buildkite/k3_tests/multiprocess/pipeline.yml |  22 +
 .buildkite/k3_tests/multiprocess/run.sh       |   1 +
 .../k3_tests/multiprocess/scripts/cleanup.sh  |   8 +
 .../multiprocess/scripts/launch-processes.sh  |  11 +
 .../multiprocess/scripts/run-gds-smoke.sh     | 113 +++++
 .../multiprocess/scripts/run-single-test.sh   |  14 +-
 docs/source/mp/architecture.rst               |  15 +-
 docs/source/mp/configuration.rst              |  31 +-
 docs/source/mp/index.rst                      |   5 +-
 docs/source/mp/l2_storage.rst                 |   6 +-
 lmcache/v1/distributed/config.py              |  61 +++
 lmcache/v1/distributed/l1_manager.py          |  16 +-
 .../v1/distributed/memory_manager/__init__.py |  25 ++
 .../memory_manager/gds_l1_memory_manager.py   | 158 +++++++
 .../memory_manager/l1_manager_protocol.py     |  50 +++
 .../l1_memory_manager.py}                     |   1 +
 lmcache/v1/gpu_connector/_cufile_async.py     | 384 +++++++++++++++++
 lmcache/v1/gpu_connector/gds_context.py       | 408 ++++++++++++++++++
 lmcache/v1/gpu_connector/gpu_ops.py           |   9 +-
 lmcache/v1/memory_management.py               | 122 ++++++
 lmcache/v1/multiprocess/engine_context.py     |  17 +
 lmcache/v1/multiprocess/gpu_context.py        |  19 +
 .../v1/multiprocess/modules/gpu_transfer.py   |   3 +
 lmcache/v1/multiprocess/server.py             |   2 +-
 .../v1/distributed/memory_manager/__init__.py |   1 +
 .../test_gds_l1_memory_manager.py             |  85 ++++
 tests/v1/gpu_connector/test_gds_context.py    | 300 +++++++++++++
 .../test_gpu_transfer_layout_registry.py      |   3 +
 .../test_non_cuda_data_transfer.py            |   8 +-
 29 files changed, 1884 insertions(+), 14 deletions(-)
 create mode 100755 .buildkite/k3_tests/multiprocess/scripts/run-gds-smoke.sh
 create mode 100644 lmcache/v1/distributed/memory_manager/__init__.py
 create mode 100644 lmcache/v1/distributed/memory_manager/gds_l1_memory_manager.py
 create mode 100644 lmcache/v1/distributed/memory_manager/l1_manager_protocol.py
 rename lmcache/v1/distributed/{memory_manager.py => memory_manager/l1_memory_manager.py} (99%)
 create mode 100644 lmcache/v1/gpu_connector/_cufile_async.py
 create mode 100644 lmcache/v1/gpu_connector/gds_context.py
 create mode 100644 tests/v1/distributed/memory_manager/__init__.py
 create mode 100644 tests/v1/distributed/memory_manager/test_gds_l1_memory_manager.py
 create mode 100644 tests/v1/gpu_connector/test_gds_context.py

diff --git a/.buildkite/k3_tests/multiprocess/pipeline.yml b/.buildkite/k3_tests/multiprocess/pipeline.yml
index b044224854..64e965935f 100644
--- a/.buildkite/k3_tests/multiprocess/pipeline.yml
+++ b/.buildkite/k3_tests/multiprocess/pipeline.yml
@@ -117,6 +117,28 @@ steps:
         plugins: [{ kubernetes: { podSpec: *pod-1gpu } }]
         artifact_paths: ["*.log"]
 
+      - label: ":compression: gds_smoke_test"
+        command: .buildkite/k3_tests/multiprocess/run.sh gds_smoke_test
+        timeout_in_minutes: 30
+        agents: { queue: "k8s" }
+        plugins:
+          - kubernetes:
+              podSpec:
+                containers:
+                  - name: container-0
+                    image: lmcache/ci-base:latest
+                    imagePullPolicy: Never
+                    resources: { limits: { "nvidia.com/gpu": "1" } }
+                    volumeMounts:
+                      - { name: hf-cache, mountPath: /root/.cache/huggingface }
+                      - { name: scratch, mountPath: /scratch }
+                      - { name: udev, mountPath: /run/udev, readOnly: true }
+                volumes:
+                  - { name: hf-cache, hostPath: { path: /data/huggingface, type: DirectoryOrCreate } }
+                  - { name: scratch, hostPath: { path: /data/gds-scratch, type: DirectoryOrCreate } }
+                  - { name: udev, hostPath: { path: /run/udev, type: Directory } }
+        artifact_paths: ["*.log"]
+
   - group: ":compression: Multiprocess (CPU-only)"
     steps:
       - label: ":compression: cpu_e2e_validation (shm)"
diff --git a/.buildkite/k3_tests/multiprocess/run.sh b/.buildkite/k3_tests/multiprocess/run.sh
index 62e2e9b0ba..369c90bec5 100755
--- a/.buildkite/k3_tests/multiprocess/run.sh
+++ b/.buildkite/k3_tests/multiprocess/run.sh
@@ -3,6 +3,7 @@
 # Usage: run.sh <test_name>
 #   test_name: lm_eval | hma_lm_eval_gemma4 | vllm_bench | long_doc_qa
 #              | long_doc_qa_l2 | fault_tolerance | deadlock | restart_recovery
+#              | gds_smoke_test
 # Thin wrapper: sets up environment, then delegates to scripts/.
 # No Docker -- all processes run natively in the pod.
 set -euo pipefail
diff --git a/.buildkite/k3_tests/multiprocess/scripts/cleanup.sh b/.buildkite/k3_tests/multiprocess/scripts/cleanup.sh
index 4ae44b79e9..2a6160118f 100755
--- a/.buildkite/k3_tests/multiprocess/scripts/cleanup.sh
+++ b/.buildkite/k3_tests/multiprocess/scripts/cleanup.sh
@@ -28,6 +28,14 @@ for port in "${VLLM_PORT:-8000}" "${VLLM_BASELINE_PORT:-9000}" "${LMCACHE_PORT:-
     fuser -k "${port}/tcp" 2>/dev/null || true
 done
 
+# Remove the GDS slab scratch dir (only set for gds_* tests). It lives on the
+# /scratch hostPath (host-local NVMe), so it persists past the pod and the
+# preallocated slab is large -- drop it now that the server is stopped.
+if [[ -n "${GDS_L1_PATH:-}" ]]; then
+    echo "Removing GDS slab dir: $GDS_L1_PATH"
+    rm -rf "${GDS_L1_PATH}" 2>/dev/null || true
+fi
+
 echo "=== Cleanup complete ==="
 
 # Copy server logs to the workspace so Buildkite can collect them as artifacts
diff --git a/.buildkite/k3_tests/multiprocess/scripts/launch-processes.sh b/.buildkite/k3_tests/multiprocess/scripts/launch-processes.sh
index c0b52ae605..65034f2e55 100755
--- a/.buildkite/k3_tests/multiprocess/scripts/launch-processes.sh
+++ b/.buildkite/k3_tests/multiprocess/scripts/launch-processes.sh
@@ -76,12 +76,23 @@ PID_FILE="/tmp/lmcache_mp_pids_${BUILD_ID}"
 echo "=== Launching LMCache MP server ==="
 echo "Port: $LMCACHE_PORT"
 
+# Optional GDS L1 slab tier (gds_* tests). When GDS_L1_PATH is set, the L1
+# medium becomes an NVMe slab accessed via cuFile DMA instead of pinned DRAM;
+# --l1-size-gb then sizes the slab. The path must be on a GDS-capable
+# filesystem (local NVMe), provided by the /scratch hostPath mount.
+GDS_L1_ARG=""
+if [ -n "${GDS_L1_PATH:-}" ]; then
+    echo "GDS L1 tier enabled; slab directory: $GDS_L1_PATH"
+    GDS_L1_ARG="--gds-l1-path ${GDS_L1_PATH}"
+fi
+
 CUDA_VISIBLE_DEVICES="${GPU_FOR_VLLM}" \
 lmcache server \
     --l1-size-gb "$CPU_BUFFER_SIZE" \
     --eviction-policy LRU \
     --max-workers "$MAX_WORKERS" \
     --port "$LMCACHE_PORT" \
+    ${GDS_L1_ARG} \
     > "/tmp/build_${BUILD_ID}_lmcache.log" 2>&1 &
 
 LMCACHE_PID=$!
diff --git a/.buildkite/k3_tests/multiprocess/scripts/run-gds-smoke.sh b/.buildkite/k3_tests/multiprocess/scripts/run-gds-smoke.sh
new file mode 100755
index 0000000000..c33e5c676a
--- /dev/null
+++ b/.buildkite/k3_tests/multiprocess/scripts/run-gds-smoke.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+# GDS L1 smoke test. Sends a few completions (cold) to store KV to the slab,
+# resets vLLM's prefix cache, then re-sends them (warm) to read the KV back from
+# LMCache/GDS. Passes if every request returns HTTP 200, a real LMCache retrieve
+# happened, and the warm (GDS-retrieved) outputs match the cold (recomputed)
+# ones -- i.e. the GDS store/retrieve path works and is correct.
+#
+# Expects the GDS-enabled LMCache server + vLLM to already be running, with
+# VLLM_SERVER_DEV_MODE=1 (for /reset_prefix_cache).
+set -e
+set -o pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"
+source "${REPO_ROOT}/.buildkite/k3_tests/common_scripts/helpers.sh"
+
+VLLM_PORT="${VLLM_PORT:-8000}"
+MODEL="${MODEL:-Qwen/Qwen3-14B}"
+BUILD_ID="${BUILD_ID:-local_$$}"
+LMCACHE_LOG="/tmp/build_${BUILD_ID}_lmcache.log"
+N_PROMPTS="${GDS_SMOKE_PROMPTS:-4}"
+OUT_DIR="$(mktemp -d)"
+trap 'rm -rf "$OUT_DIR"' EXIT
+
+# A long-ish prompt so each request stores at least one LMCache chunk.
+build_prompt() {  # $1 = unique id
+    local filler="The key-value cache stores attention keys and values across transformer layers. "
+    local body="" i
+    for i in $(seq 1 80); do body="${body}${filler}"; done
+    printf 'Document %s. %s' "$1" "$body"
+}
+
+# Send N_PROMPTS completions; capture each generated text to
+# $OUT_DIR/<label>_<i>.txt and require every request to return HTTP 200.
+send_batch() {  # $1 = phase label (cold|warm)
+    local label="$1" ok=0 i prompt payload resp http body
+    for i in $(seq 1 "$N_PROMPTS"); do
+        prompt="$(build_prompt "$i")"
+        payload=$(python3 -c 'import json,sys; print(json.dumps({"model":sys.argv[1],"prompt":sys.argv[2],"max_tokens":16,"temperature":0}))' "$MODEL" "$prompt")
+        resp=$(curl -s -w $'\n%{http_code}' \
+            "http://127.0.0.1:${VLLM_PORT}/v1/completions" \
+            -H "Content-Type: application/json" -d "$payload")
+        http="${resp##*$'\n'}"
+        body="${resp%$'\n'*}"
+        printf '%s' "$body" \
+            | python3 -c 'import json,sys; print(json.load(sys.stdin)["choices"][0]["text"])' \
+            > "${OUT_DIR}/${label}_${i}.txt" 2>/dev/null \
+            || echo "<no-output>" > "${OUT_DIR}/${label}_${i}.txt"
+        echo "  [$label] req $i -> HTTP $http"
+        [ "$http" = "200" ] && ok=$((ok + 1))
+    done
+    [ "$ok" -eq "$N_PROMPTS" ] || { echo "[$label] only $ok/$N_PROMPTS returned HTTP 200"; return 1; }
+}
+
+# Count completed LMCache retrieves recorded in the server log (0 if no log yet).
+count_retrieves() {
+    [ -f "$LMCACHE_LOG" ] || { echo 0; return; }
+    grep -c "Retrieved" "$LMCACHE_LOG" 2>/dev/null || true
+}
+
+echo "============================================"
+echo "=== GDS smoke: phase 1 (cold -> store KV to the GDS slab) ==="
+echo "============================================"
+send_batch cold
+echo "Waiting for async stores to drain to the LMCache server..."
+sleep 3
+retrieves_before=$(count_retrieves)
+
+echo "============================================"
+echo "=== Reset vLLM prefix cache (force warm requests through LMCache/GDS) ==="
+echo "============================================"
+reset_code=$(curl -s -o /dev/null -w "%{http_code}" -X POST \
+    "http://127.0.0.1:${VLLM_PORT}/reset_prefix_cache")
+if [ "$reset_code" != "200" ]; then
+    echo "reset_prefix_cache failed (HTTP $reset_code); is VLLM_SERVER_DEV_MODE=1?"
+    exit 1
+fi
+sleep 2
+
+echo "============================================"
+echo "=== GDS smoke: phase 2 (warm -> retrieve KV from the GDS slab) ==="
+echo "============================================"
+send_batch warm
+retrieves_after=$(count_retrieves)
+
+# 1. A real GDS retrieve must have happened (else warm recomputed / hit the APC).
+echo ""
+echo "LMCache retrieves logged: before=${retrieves_before} after=${retrieves_after}"
+if [ "$retrieves_after" -le "$retrieves_before" ]; then
+    echo "GDS smoke FAILED: no LMCache retrieve recorded -- the GDS read path was"
+    echo "not exercised (warm requests recomputed or hit vLLM's prefix cache)."
+    exit 1
+fi
+
+# 2. The KV retrieved from the GDS slab must produce the same output as the
+#    cold recompute (deterministic decoding -> byte-identical completions).
+echo "=== Verifying warm (GDS-retrieved) outputs match cold (recomputed) ==="
+mismatch=0
+for i in $(seq 1 "$N_PROMPTS"); do
+    if diff -q "${OUT_DIR}/cold_${i}.txt" "${OUT_DIR}/warm_${i}.txt" >/dev/null 2>&1; then
+        echo "  prompt $i: match"
+    else
+        echo "  prompt $i: MISMATCH"
+        mismatch=$((mismatch + 1))
+    fi
+done
+if [ "$mismatch" -ne 0 ]; then
+    echo "GDS smoke FAILED: ${mismatch}/${N_PROMPTS} warm outputs differ from cold"
+    echo "-- the KV retrieved from the GDS slab is incorrect."
+    exit 1
+fi
+
+echo "=== GDS smoke test passed: GDS store + retrieve path works and is correct ==="
diff --git a/.buildkite/k3_tests/multiprocess/scripts/run-single-test.sh b/.buildkite/k3_tests/multiprocess/scripts/run-single-test.sh
index 4df0e2ad95..58bedc7619 100755
--- a/.buildkite/k3_tests/multiprocess/scripts/run-single-test.sh
+++ b/.buildkite/k3_tests/multiprocess/scripts/run-single-test.sh
@@ -3,6 +3,7 @@
 # Usage: run-single-test.sh <test_name>
 #   test_name: lm_eval | hma_lm_eval_gemma4 | vllm_bench | long_doc_qa
 #              | long_doc_qa_l2 | fault_tolerance | deadlock | restart_recovery
+#              | gds_smoke_test
 #
 # Each invocation is self-contained: launches servers, runs one test, cleans up.
 # This mirrors the comprehensive tests' run-single-config.sh pattern.
@@ -21,6 +22,14 @@ export VLLM_PORT="${VLLM_PORT:-8000}"
 export VLLM_BASELINE_PORT="${VLLM_BASELINE_PORT:-9000}"
 export MAX_WAIT_SECONDS="${MAX_WAIT_SECONDS:-300}"
 export BUILD_ID="${BUILDKITE_BUILD_ID:-local_$$}"
+
+# gds_smoke_test enables the GDS L1 NVMe-slab tier
+GDS_SCRATCH="${GDS_SCRATCH:-/scratch}"
+if [ "$TEST_NAME" = "gds_smoke_test" ]; then
+    export GDS_L1_PATH="${GDS_SCRATCH}/lmcache-gds-${BUILD_ID}-${TEST_NAME}"
+    echo "GDS L1 tier enabled (slab dir: $GDS_L1_PATH)"
+fi
+
 # Per-test default model (overridable via the MODEL env var). The HMA test needs
 # a hybrid model whose KV cache groups have different block sizes, so the
 # connector exercises the per-group hybrid-memory-allocator path.
@@ -127,9 +136,12 @@ case "$TEST_NAME" in
     http_api)
         exec_script="${SCRIPT_DIR}/run-http-api.sh"
         ;;
+    gds_smoke_test)
+        exec_script="${SCRIPT_DIR}/run-gds-smoke.sh"
+        ;;
     *)
         echo "Unknown test: $TEST_NAME"
-        echo "Valid tests: lm_eval, hma_lm_eval_gemma4, vllm_bench, long_doc_qa, long_doc_qa_l2, fault_tolerance, deadlock, restart_recovery, cache_stats, http_api"
+        echo "Valid tests: lm_eval, hma_lm_eval_gemma4, vllm_bench, long_doc_qa, long_doc_qa_l2, fault_tolerance, deadlock, restart_recovery, cache_stats, http_api, gds_smoke_test"
         exit 1
         ;;
 esac
diff --git a/docs/source/mp/architecture.rst b/docs/source/mp/architecture.rst
index 5ea835b8aa..4392cfd6ac 100644
--- a/docs/source/mp/architecture.rst
+++ b/docs/source/mp/architecture.rst
@@ -29,7 +29,8 @@ High-Level Architecture
     StorageManager (distributed/storage_manager.py)
          |
          |--- L1Manager (l1_manager.py)
-         |       |--- L1MemoryManager (memory allocator)
+         |       |--- L1MemoryManager (CPU DRAM) or
+         |       |    GDSL1MemoryManager (NVMe slab via cuFile)
          |       |--- TTLLock per object (read/write)
          |
          |--- StoreController  -----> L2 Adapter(s) (async L1->L2 push)
@@ -235,8 +236,16 @@ Manages objects in CPU memory with a state machine:
 Each object has two ``TTLLock`` instances (read and write) with configurable
 timeouts to prevent deadlocks from crashed clients.
 
-The ``L1MemoryManager`` handles the underlying memory allocation (lazy growth
-up to ``--l1-size-gb``).
+The underlying memory allocation is handled by one of two interchangeable
+tiers selected at startup (both satisfy ``L1ManagerProtocol``):
+
+- ``L1MemoryManager`` (default) -- pinned CPU DRAM, with lazy growth up to
+  ``--l1-size-gb``.
+- ``GDSL1MemoryManager`` -- an NVMe slab file when ``--gds-l1-path`` is set.
+  The bytes live on disk; reads/writes DMA directly between the GPU staging
+  buffer and the slab via cuFile, driven by the process-global ``GDSContext``
+  (``gpu_connector/gds_context.py``) and dispatched from ``gpu_ops``. The CPU
+  tier is disabled in this mode.
 
 L2 Adapters
 ~~~~~~~~~~~
diff --git a/docs/source/mp/configuration.rst b/docs/source/mp/configuration.rst
index 6eb0468624..9a9d2e749e 100644
--- a/docs/source/mp/configuration.rst
+++ b/docs/source/mp/configuration.rst
@@ -156,7 +156,8 @@ Source: ``lmcache/v1/distributed/config.py``
      - Description
    * - ``--l1-size-gb``
      - *required*
-     - Size of L1 memory in GB.
+     - Size of the L1 tier in GB. Sizes the pinned-DRAM L1 by default, or the
+       GDS slab file when ``--gds-l1-path`` is set (see *GDS L1 Tier* below).
    * - ``--l1-use-lazy`` / ``--no-l1-use-lazy``
      - ``True``
      - Enable or disable lazy allocation for L1 memory.
@@ -173,6 +174,34 @@ Source: ``lmcache/v1/distributed/config.py``
      - ``4096``
      - Alignment size in bytes (default 4 KB).
 
+GDS L1 Tier
+-----------
+
+Source: ``lmcache/v1/distributed/config.py``
+
+Opt-in. Setting ``--gds-l1-path`` switches the L1 medium from pinned DRAM to
+an NVMe slab file accessed via GPUDirect Storage (cuFile DMA). The CPU
+pinned-DRAM tier is then disabled, and ``--l1-size-gb`` sizes the slab.
+Disable byte-array L2 adapters when this is on (the GDS tier exposes no L1
+memory buffer for them to register).
+
+.. list-table::
+   :header-rows: 1
+   :widths: 30 15 55
+
+   * - Argument
+     - Default
+     - Description
+   * - ``--gds-l1-path``
+     - Not set
+     - NVMe directory for the GDS L1 slab. Setting this enables the GDS L1
+       tier; one shared slab per process lives at
+       ``<path>/lmcache_gds_slab.bin``.
+   * - ``--gds-l1-use-direct-io`` / ``--no-gds-l1-use-direct-io``
+     - ``True``
+     - Open the slab with ``O_DIRECT`` (required for the GDS DMA fast path on
+       ext4).
+
 L1 Manager TTLs
 ----------------
 
diff --git a/docs/source/mp/index.rst b/docs/source/mp/index.rst
index 77ec21e3b5..35d5eba164 100644
--- a/docs/source/mp/index.rst
+++ b/docs/source/mp/index.rst
@@ -18,8 +18,9 @@ Key Benefits
   share a single L1 cache, maximizing KV reuse.
 - **Independent resource scaling** -- Allocate CPU memory for caching
   independently of GPU memory for inference.
-- **Multi-tier storage (L1 + L2)** -- In-memory L1 cache backed by persistent
-  L2 storage via NIXL (GDS, POSIX, HF3FS, and more).
+- **Multi-tier storage (L1 + L2)** -- An L1 cache (in CPU DRAM, or an NVMe
+  slab via GPUDirect Storage) backed by persistent L2 storage via NIXL (GDS,
+  POSIX, HF3FS, and more).
 - **Built-in observability** -- Prometheus metrics and a telemetry event system
   out of the box.
 
diff --git a/docs/source/mp/l2_storage.rst b/docs/source/mp/l2_storage.rst
index 0f5138c251..0ac34d4cf0 100644
--- a/docs/source/mp/l2_storage.rst
+++ b/docs/source/mp/l2_storage.rst
@@ -3,8 +3,10 @@ L2 Storage (Persistent Cache)
 
 LMCache multiprocess mode supports a two-tier storage architecture:
 
-- **L1 (in-memory)** -- Fast CPU memory managed by the L1 Manager.  All KV
-  cache chunks live here during active use.
+- **L1 (fast tier)** -- CPU memory by default, or an NVMe slab via GPUDirect
+  Storage (cuFile) when ``--gds-l1-path`` is set, managed by the L1 Manager.
+  All KV cache chunks live here during active use. (Byte-array L2 adapters are
+  unsupported under the GDS L1 tier, which exposes no L1 memory buffer.)
 - **L2 (persistent)** -- Durable storage backends (NIXL-based or plain
   file-system/raw-block).  The StoreController asynchronously pushes data from L1
   to L2, and the PrefetchController loads data from L2 back into L1 on
diff --git a/lmcache/v1/distributed/config.py b/lmcache/v1/distributed/config.py
index 69cb178054..6ce9dc450a 100644
--- a/lmcache/v1/distributed/config.py
+++ b/lmcache/v1/distributed/config.py
@@ -56,6 +56,31 @@ def __post_init__(self):
             self.use_lazy = False
 
 
+@dataclass
+class GdsL1Config:
+    """Configuration for the GDS slab-file L1 tier.
+
+    When present on :class:`L1ManagerConfig`, the L1 medium becomes an NVMe
+    slab file accessed via cuFile DMA instead of pinned DRAM (mutually
+    exclusive with the pinned-DRAM tier in ``memory_config``). Carries the
+    slab location, capacity, and DMA mode.
+    """
+
+    file_location: str
+    """Directory for the slab file (one shared slab per process, used by all
+    GPU instances)."""
+
+    size_in_bytes: int
+    """Slab capacity in bytes (from ``--l1-size-gb``). Sizes both the
+    preallocated slab file and the GDS tier's address space."""
+
+    use_direct_io: bool = True
+    """Open the slab with ``O_DIRECT`` (required for the GDS DMA fast path)."""
+
+    align_bytes: int = 4096
+    """Allocation alignment; cuFile/O_DIRECT require 4 KiB."""
+
+
 @dataclass
 class L1ManagerConfig:
     """
@@ -65,6 +90,10 @@ class L1ManagerConfig:
     memory_config: L1MemoryManagerConfig
     """ The memory manager configuration for L1 cache. """
 
+    gds_l1_config: "GdsL1Config | None" = None
+    """ Optional GDS L1 tier. When set, the GDS slab is the L1 medium
+    (mutually exclusive with the pinned-DRAM tier in ``memory_config``). """
+
     write_ttl_seconds: int = field(default=600)
     """ Time to live for each object's write lock. Default is 600s (10 minutes). """
 
@@ -172,6 +201,28 @@ def add_storage_manager_args(
         help="The alignment size in bytes. Default is 4KB (4096 bytes).",
     )
 
+    # GDS L1 tier (optional, opt-in via --gds-l1-path)
+    gds_group = parser.add_argument_group(
+        "GDS L1 tier",
+        "Configuration for the GDS slab-file L1 tier. Setting --gds-l1-path "
+        "makes the L1 medium an NVMe slab accessed via cuFile DMA instead of "
+        "pinned DRAM; --l1-size-gb then sizes the slab. Disable byte-array L2 "
+        "adapters when this is on.",
+    )
+    gds_group.add_argument(
+        "--gds-l1-path",
+        type=str,
+        default=None,
+        help="NVMe directory path for the GDS L1 slab. Setting this enables GDS L1.",
+    )
+    gds_group.add_argument(
+        "--gds-l1-use-direct-io",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help="Open the slab file with O_DIRECT (required for the GDS DMA fast "
+        "path on ext4). Default True.",
+    )
+
     # L1 Manager Config (TTL settings)
     ttl_group = parser.add_argument_group(
         "L1 Manager TTL", "TTL configuration for L1 manager locks"
@@ -307,8 +358,18 @@ def parse_args_to_config(
         align_bytes=args.l1_align_bytes,
     )
 
+    gds_l1_config: GdsL1Config | None = None
+    if getattr(args, "gds_l1_path", None):
+        # --l1-size-gb is the single L1 size flag; under GDS it sizes the slab.
+        gds_l1_config = GdsL1Config(
+            file_location=args.gds_l1_path,
+            size_in_bytes=int(args.l1_size_gb * (1 << 30)),
+            use_direct_io=args.gds_l1_use_direct_io,
+        )
+
     l1_manager_config = L1ManagerConfig(
         memory_config=memory_config,
+        gds_l1_config=gds_l1_config,
         write_ttl_seconds=args.l1_write_ttl_seconds,
         read_ttl_seconds=args.l1_read_ttl_seconds,
     )
diff --git a/lmcache/v1/distributed/l1_manager.py b/lmcache/v1/distributed/l1_manager.py
index e4e4379f30..64ae54cfaa 100644
--- a/lmcache/v1/distributed/l1_manager.py
+++ b/lmcache/v1/distributed/l1_manager.py
@@ -15,7 +15,11 @@
 from lmcache.v1.distributed.config import L1ManagerConfig
 from lmcache.v1.distributed.error import L1Error
 from lmcache.v1.distributed.internal_api import L1ManagerListener
-from lmcache.v1.distributed.memory_manager import L1MemoryManager
+from lmcache.v1.distributed.memory_manager import (
+    GDSL1MemoryManager,
+    L1ManagerProtocol,
+    L1MemoryManager,
+)
 from lmcache.v1.memory_management import MemoryObj
 from lmcache.v1.mp_observability.event import Event, EventType
 from lmcache.v1.mp_observability.event_bus import get_event_bus
@@ -184,7 +188,15 @@ def __init__(self, config: L1ManagerConfig):
 
         self._objects: dict[ObjectKey, L1ObjectState] = {}
 
-        self._memory_manager = L1MemoryManager(config.memory_config)
+        # GDS and CPU L1 are mutually exclusive tiers, each driven by its own
+        # config: the GDS tier reads only ``gds_l1_config`` (slab size +
+        # alignment), the CPU tier only ``memory_config``.
+        self._memory_manager: L1ManagerProtocol
+        if config.gds_l1_config is not None:
+            self._memory_manager = GDSL1MemoryManager(config.gds_l1_config)
+            logger.info("L1Manager: GDS L1 tier enabled; CPU pinned-DRAM L1 disabled")
+        else:
+            self._memory_manager = L1MemoryManager(config.memory_config)
 
         self._write_ttl_seconds = config.write_ttl_seconds
         self._read_ttl_seconds = config.read_ttl_seconds
diff --git a/lmcache/v1/distributed/memory_manager/__init__.py b/lmcache/v1/distributed/memory_manager/__init__.py
new file mode 100644
index 0000000000..b7fe59ba95
--- /dev/null
+++ b/lmcache/v1/distributed/memory_manager/__init__.py
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: Apache-2.0
+"""L1 memory managers for the distributed cache.
+
+Two interchangeable tiers behind :class:`L1ManagerProtocol`:
+
+- :class:`L1MemoryManager` -- CPU pinned-DRAM slab.
+- :class:`GDSL1MemoryManager` -- GDS slab file (cuFile DMA).
+"""
+
+# First Party
+from lmcache.v1.distributed.memory_manager.gds_l1_memory_manager import (
+    GDSL1MemoryManager,
+)
+from lmcache.v1.distributed.memory_manager.l1_manager_protocol import L1ManagerProtocol
+from lmcache.v1.distributed.memory_manager.l1_memory_manager import (
+    L1MemoryManager,
+    create_memory_allocator,
+)
+
+__all__ = [
+    "GDSL1MemoryManager",
+    "L1ManagerProtocol",
+    "L1MemoryManager",
+    "create_memory_allocator",
+]
diff --git a/lmcache/v1/distributed/memory_manager/gds_l1_memory_manager.py b/lmcache/v1/distributed/memory_manager/gds_l1_memory_manager.py
new file mode 100644
index 0000000000..e1275f3535
--- /dev/null
+++ b/lmcache/v1/distributed/memory_manager/gds_l1_memory_manager.py
@@ -0,0 +1,158 @@
+# SPDX-License-Identifier: Apache-2.0
+"""GDS slab-file L1 memory manager."""
+
+# Standard
+from typing import Optional
+
+# First Party
+from lmcache.integration.vllm.utils import get_size_bytes
+from lmcache.logging import init_logger
+from lmcache.v1.distributed.api import MemoryLayoutDesc
+from lmcache.v1.distributed.config import GdsL1Config
+from lmcache.v1.distributed.error import L1Error
+from lmcache.v1.distributed.internal_api import L1MemoryDesc
+from lmcache.v1.memory_management import (
+    AddressManager,
+    GDSMemoryObject,
+    MemoryObj,
+    MemoryObjMetadata,
+)
+
+logger = init_logger(__name__)
+
+
+class GDSL1MemoryManager:
+    """L1 memory manager for the GDS slab-file tier.
+
+    A peer of
+    :class:`~lmcache.v1.distributed.memory_manager.l1_memory_manager.L1MemoryManager`
+    (both satisfy
+    :class:`~lmcache.v1.distributed.memory_manager.l1_manager_protocol.L1ManagerProtocol`).
+    It owns an :class:`AddressManager` over the slab's byte-offset space and
+    hands out :class:`GDSMemoryObject` chunks; the actual GPU<->slab DMA is
+    performed by the global
+    :class:`~lmcache.v1.gpu_connector.gds_context.GDSContext`, reached from the
+    ``gpu_ops`` dispatch.
+
+    There is no on-disk index: the slab is created and cleared at startup
+    (treated like DRAM), so allocations do not survive a restart.
+    """
+
+    def __init__(self, config: GdsL1Config) -> None:
+        """Create the manager.
+
+        Args:
+            config: The GDS tier config. ``size_in_bytes`` sizes the slab
+                address space, and ``align_bytes`` sets the allocation alignment
+                (cuFile/O_DIRECT require 4 KiB). The same ``GdsL1Config`` drives
+                the :class:`GDSContext` that preallocates the slab file, so the
+                address space and the file match by construction. The CPU-tier
+                ``memory_config`` is not referenced on the GDS path.
+        """
+        self._address_manager = AddressManager(config.size_in_bytes, config.align_bytes)
+
+    def allocate(
+        self, layout_desc: MemoryLayoutDesc, count: int
+    ) -> tuple[L1Error, list[MemoryObj]]:
+        """Reserve ``count`` slab regions for the given layout.
+
+        All-or-nothing: on the first slab OOM, frees what was reserved and
+        returns ``(L1Error.OUT_OF_MEMORY, [])``.
+
+        Args:
+            layout_desc: Layout descriptor; all ``count`` chunks share its
+                shape/dtype, and its byte size sets each chunk's size.
+            count: Number of chunks to reserve.
+
+        Returns:
+            ``(L1Error.SUCCESS, objects)`` on success, otherwise
+            ``(L1Error.OUT_OF_MEMORY, [])``.
+        """
+        chunk_bytes = get_size_bytes(layout_desc.shapes, layout_desc.dtypes)
+        shape = layout_desc.shapes[0]
+        dtype = layout_desc.dtypes[0]
+        objects: list[MemoryObj] = []
+        for _ in range(count):
+            try:
+                address, allocated = self._address_manager.allocate(chunk_bytes)
+            except RuntimeError:
+                for obj in objects:
+                    self._address_manager.free(
+                        obj.metadata.address, obj.get_physical_size()
+                    )
+                return L1Error.OUT_OF_MEMORY, []
+            meta = MemoryObjMetadata(
+                shape=shape,
+                dtype=dtype,
+                address=address,
+                phy_size=allocated,
+                ref_count=0,
+            )
+            objects.append(GDSMemoryObject(meta))
+        return L1Error.SUCCESS, objects
+
+    def free(self, mem_objs: list[MemoryObj]) -> L1Error:
+        """Return the slab regions of the given objects to the address manager.
+
+        Args:
+            mem_objs: Objects to free (slab-anchored :class:`GDSMemoryObject`s
+                handed out by :meth:`allocate`).
+
+        Returns:
+            ``L1Error.SUCCESS``.
+        """
+        for mo in mem_objs:
+            self._address_manager.free(mo.metadata.address, mo.get_physical_size())
+        return L1Error.SUCCESS
+
+    def get_memory_usage(self) -> tuple[int, int]:
+        """Return ``(used_bytes, total_bytes)`` of the slab."""
+        free_size = self._address_manager.get_free_size()
+        total_size = self._address_manager.get_heap_size()
+        return total_size - free_size, total_size
+
+    def get_l1_memory_desc(self) -> Optional[L1MemoryDesc]:
+        """Return ``None``: the GDS L1 medium is the slab file, not a buffer.
+
+        The only registerable memory on the GDS path is the GPU staging buffer,
+        not an L1 pool, so there is no descriptor to hand to L2 adapters (which
+        must be disabled when GDS L1 is enabled).
+        """
+        return None
+
+    def close(self) -> None:
+        """No-op: the GDSContext owning the slab is closed at server shutdown."""
+        return
+
+    def memcheck(self) -> bool:
+        """For debug purposes; logs allocator state and checks consistency.
+
+        Mirrors ``TensorMemoryAllocator.memcheck`` for the GDS slab address
+        space: logs the allocated / free sizes, then verifies the free and
+        allocated bytes add up to the slab size and that free blocks are
+        coalesced. Returns ``True`` when consistent, ``False`` otherwise.
+        """
+        clear = True
+        logger.info("Checking memory allocator consistency")
+        logger.info(
+            " - Total allocated size: %s MB",
+            self._address_manager.total_allocated_size / 1048576,
+        )
+
+        total_free_size = self._address_manager.get_free_size()
+        logger.info(" - Total free size: %s MB", total_free_size / 1048576)
+
+        if (
+            total_free_size + self._address_manager.total_allocated_size
+            != self._address_manager.get_heap_size()
+        ):
+            logger.error("Memory allocator size is inconsistent")
+            logger.error("This implies a bug in the memory allocator")
+            clear = False
+
+        if not self._address_manager.check_consistency():
+            logger.error("Memory allocator has non-coalesced blocks")
+            logger.error("This implies a bug in the memory allocator")
+            clear = False
+
+        return clear
diff --git a/lmcache/v1/distributed/memory_manager/l1_manager_protocol.py b/lmcache/v1/distributed/memory_manager/l1_manager_protocol.py
new file mode 100644
index 0000000000..d4560050a1
--- /dev/null
+++ b/lmcache/v1/distributed/memory_manager/l1_manager_protocol.py
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Structural interface shared by the L1 memory manager tiers."""
+
+# Standard
+from typing import Optional, Protocol, runtime_checkable
+
+# First Party
+from lmcache.v1.distributed.api import MemoryLayoutDesc
+from lmcache.v1.distributed.error import L1Error
+from lmcache.v1.distributed.internal_api import L1MemoryDesc
+from lmcache.v1.memory_management import MemoryObj
+
+
+@runtime_checkable
+class L1ManagerProtocol(Protocol):
+    """Structural interface for an L1 memory manager.
+
+    Both :class:`L1MemoryManager` (CPU pinned-DRAM tier) and
+    :class:`GDSL1MemoryManager` (GDS slab-file tier) satisfy this protocol, so
+    ``L1Manager`` can hold either behind one type.
+    """
+
+    def allocate(
+        self, layout_desc: MemoryLayoutDesc, count: int
+    ) -> tuple[L1Error, list[MemoryObj]]:
+        """Allocate ``count`` memory objects for the given layout."""
+        ...
+
+    def free(self, mem_objs: list[MemoryObj]) -> L1Error:
+        """Free the given memory objects."""
+        ...
+
+    def get_memory_usage(self) -> tuple[int, int]:
+        """Return ``(used_bytes, total_bytes)``."""
+        ...
+
+    def get_l1_memory_desc(self) -> Optional[L1MemoryDesc]:
+        """Describe the underlying L1 buffer for L2-adapter registration.
+
+        Returns ``None`` for tiers with no registerable L1 buffer (e.g. GDS).
+        """
+        ...
+
+    def close(self) -> None:
+        """Release all resources."""
+        ...
+
+    def memcheck(self) -> bool:
+        """Verify allocator bookkeeping consistency."""
+        ...
diff --git a/lmcache/v1/distributed/memory_manager.py b/lmcache/v1/distributed/memory_manager/l1_memory_manager.py
similarity index 99%
rename from lmcache/v1/distributed/memory_manager.py
rename to lmcache/v1/distributed/memory_manager/l1_memory_manager.py
index 29b53f16a5..1ad21400bd 100644
--- a/lmcache/v1/distributed/memory_manager.py
+++ b/lmcache/v1/distributed/memory_manager/l1_memory_manager.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+"""CPU pinned-DRAM L1 memory manager."""
 
 # Standard
 from multiprocessing import shared_memory
diff --git a/lmcache/v1/gpu_connector/_cufile_async.py b/lmcache/v1/gpu_connector/_cufile_async.py
new file mode 100644
index 0000000000..a0d0c94f19
--- /dev/null
+++ b/lmcache/v1/gpu_connector/_cufile_async.py
@@ -0,0 +1,384 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Minimal ctypes wrapper around the cuFile async C API.
+
+kvikio's ``raw_read_async`` / ``raw_write_async`` work but on ext4 +
+real GDS they leave ~60% read throughput on the table compared to the
+bare C path (``cuFileReadAsync`` + ``cuFileStreamRegister`` + batched
+submit + single ``cudaStreamSynchronize``). This module exposes the
+same C primitives directly from Python so callers that batch
+submissions can match the C-direct throughput.
+
+Surface:
+
+- :func:`register_buffer` / :func:`deregister_buffer` — wrap
+  ``cuFileBufRegister`` / ``cuFileBufDeregister`` on a torch tensor.
+- :func:`register_stream` / :func:`deregister_stream` — wrap
+  ``cuFileStreamRegister`` / ``cuFileStreamDeregister`` on a raw
+  CUDA stream handle.
+- :class:`AsyncHandle` — opens a file with ``O_DIRECT`` (required by
+  cuFile on ext4) and registers the cuFile handle. ``read_async`` /
+  ``write_async`` enqueue an async IO on a stream and return a
+  :class:`Submission`. Callers run ``cudaStreamSynchronize`` once to
+  drain a batch; :meth:`Submission.bytes_done` returns the actual
+  byte count after the sync.
+
+This module is intentionally narrow: no thread pool, no future
+abstraction, no LRU. It is the layer :class:`GDSContext`
+(``lmcache.v1.gpu_connector.gds_context``) uses to talk to libcufile on
+the GDS DMA fast path.
+"""
+
+# Standard
+from typing import TYPE_CHECKING, Any, Optional
+import ctypes
+import os
+
+# Third Party
+import torch
+
+if TYPE_CHECKING:
+    # Third Party
+    from cufile.bindings import CUfileError
+
+# ``cufile.bindings`` dlopens ``libcufile.so`` at import time, which is absent
+# on CPU-only / macOS hosts. Importing this module (transitively pulled in by
+# the CLI command discovery via ``storage_manager``) must not trigger that, so
+# every cufile symbol is imported lazily inside the function that uses it and
+# the dlopen happens only when GDS is actually exercised. This mirrors the
+# lazy ``import cufile`` in the legacy ``GdsBackend``.
+
+# --- Declare ctypes signatures for the async symbols (see cufile.h) --
+
+
+def _declare_signatures() -> None:
+    """Set argtypes/restype on libcufile symbols. Idempotent."""
+    # Third Party
+    from cufile.bindings import CUfileError, libcufile
+
+    if getattr(libcufile.cuFileReadAsync, "argtypes", None):
+        return
+    libcufile.cuFileReadAsync.argtypes = [
+        ctypes.c_void_p,  # CUfileHandle_t fh
+        ctypes.c_void_p,  # void *bufPtr_base
+        ctypes.POINTER(ctypes.c_size_t),  # size_t *size_p
+        ctypes.POINTER(ctypes.c_int64),  # off_t *file_offset_p
+        ctypes.POINTER(ctypes.c_int64),  # off_t *bufPtr_offset_p
+        ctypes.POINTER(ctypes.c_int64),  # ssize_t *bytes_read_p
+        ctypes.c_void_p,  # CUstream stream
+    ]
+    libcufile.cuFileReadAsync.restype = CUfileError
+
+    libcufile.cuFileWriteAsync.argtypes = [
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.POINTER(ctypes.c_size_t),
+        ctypes.POINTER(ctypes.c_int64),
+        ctypes.POINTER(ctypes.c_int64),
+        ctypes.POINTER(ctypes.c_int64),
+        ctypes.c_void_p,
+    ]
+    libcufile.cuFileWriteAsync.restype = CUfileError
+
+    libcufile.cuFileStreamRegister.argtypes = [ctypes.c_void_p, ctypes.c_uint]
+    libcufile.cuFileStreamRegister.restype = CUfileError
+
+    libcufile.cuFileStreamDeregister.argtypes = [ctypes.c_void_p]
+    libcufile.cuFileStreamDeregister.restype = CUfileError
+
+
+_driver_opened = False
+
+
+def _ensure_driver_open() -> None:
+    """Idempotently open the cuFile driver and declare async signatures."""
+    global _driver_opened
+    if _driver_opened:
+        return
+    # Third Party
+    from cufile.bindings import cuFileDriverOpen
+
+    cuFileDriverOpen()
+    _declare_signatures()
+    _driver_opened = True
+
+
+def close_driver() -> None:
+    """Close the cuFile driver. Optional — useful in tests."""
+    global _driver_opened
+    if not _driver_opened:
+        return
+    # Third Party
+    from cufile.bindings import cuFileDriverClose
+
+    try:
+        cuFileDriverClose()
+    finally:
+        _driver_opened = False
+
+
+def _check(err: "CUfileError", op: str) -> None:
+    """Convert a non-zero ``CUfileError_t`` into a Python exception."""
+    if err.err != 0:
+        raise RuntimeError(
+            f"{op} failed: cuFileError(err={err.err}, cu_err={err.cu_err})"
+        )
+
+
+# --- Buffer / stream registration ----------------------------------
+
+
+def register_buffer(buf: torch.Tensor) -> None:
+    """Register a device tensor with cuFile for GDS DMA.
+
+    Must be called before any ``read_async`` / ``write_async`` whose
+    ``buf_base`` falls inside this tensor's allocation. Implicitly
+    opens the cuFile driver on first use.
+
+    Uses ``libcufile.cuFileBufRegister`` directly (not the
+    ``cufile.bindings`` wrapper) because the wrapper hides the error
+    code by raising internally — we want the raw status so callers
+    see ``cuFileError(err=…, cu_err=…)`` instead of a Python re-raise.
+    """
+    if not buf.is_cuda:
+        raise ValueError("register_buffer: tensor must be on CUDA")
+    # Third Party
+    from cufile.bindings import libcufile
+
+    nbytes = buf.numel() * buf.element_size()
+    _check(
+        libcufile.cuFileBufRegister(
+            ctypes.c_void_p(buf.data_ptr()),
+            ctypes.c_size_t(nbytes),
+            ctypes.c_int(0),
+        ),
+        "cuFileBufRegister",
+    )
+
+
+def deregister_buffer(buf: torch.Tensor) -> None:
+    """Reverse of :func:`register_buffer`."""
+    # Third Party
+    from cufile.bindings import libcufile
+
+    _check(
+        libcufile.cuFileBufDeregister(ctypes.c_void_p(buf.data_ptr())),
+        "cuFileBufDeregister",
+    )
+
+
+# cuFileStreamRegister flags (cufile.h): declare the buffer offset, file offset,
+# and size are all set at submission time (CU_FILE_STREAM_FIXED_* = 0x1|0x2|0x4).
+# Worth ~12% higher read throughput vs 0x0 in our benchmark (write unchanged).
+# PAGE_ALIGNED_INPUTS (0x8) is omitted -- transfer sizes are not always 4 KiB.
+_STREAM_REGISTER_FLAGS = 0x7
+
+
+def register_stream(raw_stream: int) -> None:
+    """Register a CUDA stream with cuFile.
+
+    ``raw_stream`` is the integer ``CUstream`` handle — get it via
+    ``torch_dev.current_stream().cuda_stream``.
+
+    Optional for correctness (``read_async`` / ``write_async`` also take the
+    stream per call). We register with the FIXED_* flags (0x7): cuFile still
+    reads the size/offset pointers at stream-execution time -- so their storage
+    must stay alive and unchanged until completion (see ``Submission``) -- but
+    promising the values are fixed at submission lets cuFile skip per-op setup,
+    worth ~12% higher read throughput in our benchmark.
+    """
+    # Third Party
+    from cufile.bindings import libcufile
+
+    _ensure_driver_open()
+    _check(
+        libcufile.cuFileStreamRegister(
+            ctypes.c_void_p(raw_stream), _STREAM_REGISTER_FLAGS
+        ),
+        "cuFileStreamRegister",
+    )
+
+
+def deregister_stream(raw_stream: int) -> None:
+    """Reverse of :func:`register_stream`."""
+    # Third Party
+    from cufile.bindings import libcufile
+
+    _check(
+        libcufile.cuFileStreamDeregister(ctypes.c_void_p(raw_stream)),
+        "cuFileStreamDeregister",
+    )
+
+
+# --- AsyncHandle + Submission --------------------------------------
+
+
+class Submission:
+    """One in-flight ``cuFileReadAsync`` / ``cuFileWriteAsync``.
+
+    Holds the host-side ``size_p`` / ``file_offset_p`` /
+    ``bufPtr_offset_p`` / ``bytes_done_p`` storage that cuFile writes
+    into asynchronously. These ctypes objects MUST stay alive until
+    the stream actually executes the op — keep the :class:`Submission`
+    reference (or stash it in a list) until after the stream sync.
+    """
+
+    __slots__ = ("_size", "_file_offset", "_buf_offset", "_bytes_done")
+
+    def __init__(
+        self,
+        size: int,
+        file_offset: int,
+        buf_offset: int,
+    ) -> None:
+        self._size = ctypes.c_size_t(size)
+        self._file_offset = ctypes.c_int64(file_offset)
+        self._buf_offset = ctypes.c_int64(buf_offset)
+        self._bytes_done = ctypes.c_int64(0)
+
+    @property
+    def bytes_done(self) -> int:
+        """Bytes actually transferred. Valid only AFTER the stream sync."""
+        return self._bytes_done.value
+
+
+class AsyncHandle:
+    """Open file + cuFile handle wrapper.
+
+    Opens with ``O_DIRECT`` (required for cuFile's GDS fast path on
+    ext4). Optionally pre-allocates the file via ``posix_fallocate``.
+    """
+
+    __slots__ = ("_fd", "_handle", "path", "writable")
+
+    def __init__(
+        self,
+        path: str,
+        writable: bool = False,
+        fallocate_size: Optional[int] = None,
+        mode: int = 0o644,
+    ) -> None:
+        _ensure_driver_open()
+        # Third Party
+        from cufile.bindings import cuFileHandleRegister
+
+        flags = os.O_DIRECT
+        if writable:
+            flags |= os.O_CREAT | os.O_RDWR
+        else:
+            flags |= os.O_RDONLY
+        self.path = path
+        self.writable = writable
+        self._fd = os.open(path, flags, mode)
+        try:
+            if fallocate_size is not None and writable:
+                os.posix_fallocate(self._fd, 0, fallocate_size)
+            self._handle = cuFileHandleRegister(self._fd)
+        except Exception:
+            os.close(self._fd)
+            raise
+
+    @classmethod
+    def from_fd(
+        cls,
+        fd: int,
+        handle: Any,
+        path: str,
+        writable: bool = False,
+    ) -> "AsyncHandle":
+        """Wrap an already-opened fd and registered cuFile handle.
+
+        For callers that open + register the file themselves (e.g. a slab that
+        must be created, truncated, and ``posix_fallocate``d before
+        ``cuFileHandleRegister``) and just need an ``AsyncHandle`` around the
+        result.
+        """
+        obj = cls.__new__(cls)
+        obj._fd = fd
+        obj._handle = handle
+        obj.path = path
+        obj.writable = writable
+        return obj
+
+    @property
+    def fd(self) -> int:
+        return self._fd
+
+    def read_async(
+        self,
+        buf_base: int,
+        size: int,
+        file_offset: int,
+        buf_offset: int,
+        raw_stream: int,
+    ) -> Submission:
+        """Enqueue a ``cuFileReadAsync`` on the stream.
+
+        ``buf_base`` is the registered base pointer (e.g.
+        ``buf.data_ptr()``). ``buf_offset`` is the byte offset within
+        that registration that the data should land at.
+        """
+        # Third Party
+        from cufile.bindings import libcufile
+
+        sub = Submission(size=size, file_offset=file_offset, buf_offset=buf_offset)
+        _check(
+            libcufile.cuFileReadAsync(
+                self._handle,
+                ctypes.c_void_p(buf_base),
+                ctypes.byref(sub._size),
+                ctypes.byref(sub._file_offset),
+                ctypes.byref(sub._buf_offset),
+                ctypes.byref(sub._bytes_done),
+                ctypes.c_void_p(raw_stream),
+            ),
+            "cuFileReadAsync",
+        )
+        return sub
+
+    def write_async(
+        self,
+        buf_base: int,
+        size: int,
+        file_offset: int,
+        buf_offset: int,
+        raw_stream: int,
+    ) -> Submission:
+        """Enqueue a ``cuFileWriteAsync`` on the stream."""
+        # Third Party
+        from cufile.bindings import libcufile
+
+        sub = Submission(size=size, file_offset=file_offset, buf_offset=buf_offset)
+        _check(
+            libcufile.cuFileWriteAsync(
+                self._handle,
+                ctypes.c_void_p(buf_base),
+                ctypes.byref(sub._size),
+                ctypes.byref(sub._file_offset),
+                ctypes.byref(sub._buf_offset),
+                ctypes.byref(sub._bytes_done),
+                ctypes.c_void_p(raw_stream),
+            ),
+            "cuFileWriteAsync",
+        )
+        return sub
+
+    def close(self) -> None:
+        """Deregister the cuFile handle and close the fd."""
+        if self._fd < 0:
+            return
+        # Third Party
+        from cufile.bindings import cuFileHandleDeregister
+
+        try:
+            cuFileHandleDeregister(self._handle)
+        finally:
+            try:
+                os.close(self._fd)
+            finally:
+                self._fd = -1
+
+    def __enter__(self) -> "AsyncHandle":
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        self.close()
diff --git a/lmcache/v1/gpu_connector/gds_context.py b/lmcache/v1/gpu_connector/gds_context.py
new file mode 100644
index 0000000000..dbdcae7c68
--- /dev/null
+++ b/lmcache/v1/gpu_connector/gds_context.py
@@ -0,0 +1,408 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Process-global cuFile data path for the GDS L1 tier.
+
+One :class:`GDSContext` per worker process owns the slab file, its cuFile
+handle, the registered GPU staging buffers, and the stream-ordered cuFile
+submissions. Created once at startup by :func:`initialize_gds_context`,
+reached via :func:`get_gds_context`. :meth:`GDSContext.register_gpu_buffer`
+registers a staging buffer; :meth:`GDSContext.transfer_async` moves a chunk
+between that buffer and the slab. No POSIX fallback -- if cuFile is
+unavailable, construction fails loudly. The slab is cleared on init, so it
+does not survive a restart (GDS L1 is treated like DRAM).
+"""
+
+# Standard
+from dataclasses import dataclass, field
+from typing import Optional
+import bisect
+import enum
+import functools
+import os
+import threading
+
+# Third Party
+import torch
+
+# First Party
+from lmcache import torch_dev
+from lmcache.logging import init_logger
+from lmcache.v1.distributed.config import GdsL1Config
+from lmcache.v1.gpu_connector import _cufile_async as ca
+from lmcache.v1.memory_management import GDSMemoryObject
+
+logger = init_logger(__name__)
+
+_SLAB_FILENAME = "lmcache_gds_slab.bin"
+_CUFILE_ALIGNMENT = 4096
+# A single cuFileBufRegister/DMA is capped at 16 MiB; larger buffers and chunks
+# are registered and transferred in <=16 MiB regions.
+_MAX_CUFILE_REGION = 16 * 1024 * 1024
+# cuFile submissions to accumulate before recording a completion event and
+# draining finished ones (keeps the live submission set bounded).
+_SUBMISSION_CHECKPOINT_EVERY = 64
+
+
+class SlabDirection(enum.Enum):
+    """Direction of a GDS slab transfer. GPUDirect DMAs run straight between GPU
+    memory and the slab *file* (no host buffer), so directions are file I/O
+    (READ/WRITE), not host<->device (H2D/D2H)."""
+
+    READ = enum.auto()  # slab file -> GPU buffer
+    WRITE = enum.auto()  # GPU buffer -> slab file
+
+
+@dataclass
+class _StreamSubmissions:
+    """Per-stream cuFile submissions, kept alive until their DMA has run.
+
+    Submissions accumulate in ``uncommitted``, move to ``inflight`` behind a
+    CUDA event on the stream, and drop once it completes. Per-stream because an
+    event only orders work on its own stream.
+    """
+
+    uncommitted: list[ca.Submission] = field(default_factory=list)
+    inflight: list[tuple[torch.Event, list[ca.Submission]]] = field(
+        default_factory=list
+    )
+    ops_since_checkpoint: int = 0
+
+
+class GDSContext:
+    """Per-process cuFile context owning the slab file and its DMA path.
+
+    The singleton always exists but is inert until :meth:`initialize` creates
+    the slab and registers the cuFile handle (flipping :attr:`initialized`).
+    While off, ``register_gpu_buffer`` is a no-op.
+    """
+
+    #: Whether :meth:`initialize` has completed (GDS L1 is active).
+    initialized: bool = False
+
+    def __init__(self) -> None:
+        # ``initialized`` defaults to False via the class attribute; it is
+        # flipped to True by ``initialize``.
+        self._slab_size = 0
+        self._slab_path = ""
+        self._slab_handle: Optional[ca.AsyncHandle] = None
+        # Per-stream in-flight submissions (keyed by raw ``CUstream``), released
+        # once a CUDA event recorded on that stream completes. Guarded by
+        # ``_submissions_lock`` (see ``_record_submission``).
+        self._submissions_lock = threading.Lock()
+        self._submissions: dict[int, _StreamSubmissions] = {}
+        # Registry of cuFile-registered GPU regions and the CUDA streams they run on
+        self._registry_lock = threading.Lock()
+        self._buffers: list[torch.Tensor] = []
+        self._base_ptrs: list[int] = []
+        self._nbytes: list[int] = []
+        self._registered_streams: set[int] = set()  # maintained for close()
+
+    def initialize(self, config: GdsL1Config) -> None:
+        """Create + clear the slab and register it with cuFile.
+
+        Args:
+            config: GDS tier config. ``size_in_bytes`` sizes the preallocated
+                slab (rounded up to 4 KiB) at
+                ``<file_location>/lmcache_gds_slab.bin`` (one per process);
+                ``use_direct_io`` opens it with ``O_DIRECT``.
+
+        Raises:
+            Exception: Whatever ``cufile`` raises if GDS is unavailable.
+        """
+        self._slab_size = (config.size_in_bytes + _CUFILE_ALIGNMENT - 1) & ~(
+            _CUFILE_ALIGNMENT - 1
+        )
+
+        # One shared slab per process (the GDSContext is a process-global
+        # singleton used by every GPU instance).
+        selected = config.file_location
+        os.makedirs(selected, exist_ok=True)
+        self._slab_path = os.path.join(selected, _SLAB_FILENAME)
+
+        self._open_and_register_slab(config.use_direct_io)
+        self.initialized = True
+
+    # --- Public API ---------------------------------------------------
+
+    def register_gpu_buffer(self, buffer: torch.Tensor) -> None:
+        """Register a staging buffer (and its CUDA stream) with cuFile.
+
+        Registered as contiguous <=16 MiB regions (the cuFileBufRegister cap);
+        :meth:`transfer_async` splits transfers at these boundaries.
+
+        Args:
+            buffer: Contiguous CUDA staging buffer, 4 KiB-aligned in size.
+        """
+        if not self.initialized:
+            return
+        raw_stream = torch_dev.current_stream().cuda_stream
+        buf = buffer.view(torch.uint8)
+        nbytes = buf.numel()
+        with self._registry_lock:
+            if raw_stream not in self._registered_streams:
+                ca.register_stream(raw_stream)
+                self._registered_streams.add(raw_stream)
+            for start in range(0, nbytes, _MAX_CUFILE_REGION):
+                self._register_region_locked(
+                    buf[start : min(start + _MAX_CUFILE_REGION, nbytes)]
+                )
+
+    def deregister_gpu_buffer(self, buffer: torch.Tensor) -> None:
+        """Reverse of :meth:`register_gpu_buffer`: deregister its regions + stream.
+
+        Args:
+            buffer: The buffer passed to :meth:`register_gpu_buffer`.
+        """
+        if not self.initialized:
+            return
+        stream = torch_dev.current_stream()
+        raw_stream = stream.cuda_stream
+        # No in-flight DMA on this stream may still reference the buffer.
+        stream.synchronize()
+        buf = buffer.view(torch.uint8)
+        nbytes = buf.numel()
+        with self._registry_lock:
+            for start in range(0, nbytes, _MAX_CUFILE_REGION):
+                self._deregister_region_locked(
+                    buf[start : min(start + _MAX_CUFILE_REGION, nbytes)]
+                )
+            if raw_stream in self._registered_streams:
+                try:
+                    ca.deregister_stream(raw_stream)
+                except Exception as e:
+                    logger.warning(
+                        "GDSContext.deregister_gpu_buffer: deregister_stream: %s", e
+                    )
+                self._registered_streams.discard(raw_stream)
+        # Stream is synced above, so its submissions' DMAs are done -- drop them.
+        with self._submissions_lock:
+            self._submissions.pop(raw_stream, None)
+
+    def transfer_async(
+        self,
+        memory_obj: GDSMemoryObject,
+        gpu_buffer: torch.Tensor,
+        direction: SlabDirection,
+    ) -> None:
+        """DMA a chunk between ``gpu_buffer`` and its slab region.
+
+        ``READ`` pulls slab -> ``gpu_buffer``; ``WRITE`` pushes the reverse.
+        Split at registered-region boundaries (each cuFile DMA must stay within
+        one <=16 MiB region), so any chunk size works. Stream-ordered, no sync.
+
+        Args:
+            memory_obj: The chunk; ``slab_offset`` / ``get_size()`` give the
+                file offset and length.
+            gpu_buffer: A slice of a registered staging buffer; its first
+                ``get_size()`` bytes are transferred.
+            direction: :attr:`SlabDirection.READ` or ``.WRITE``.
+        """
+        slab_op = (
+            self._slab_read if direction is SlabDirection.READ else self._slab_write
+        )
+        nbytes = memory_obj.get_size()
+        buf = gpu_buffer.view(torch.uint8)
+        pos = 0
+        while pos < nbytes:
+            base_ptr, dev_offset, region_nbytes = self._resolve_buffer(buf[pos:])
+            seg_len = min(nbytes - pos, region_nbytes - dev_offset)
+            slab_op(memory_obj.slab_offset + pos, seg_len, dev_offset, base_ptr)
+            pos += seg_len
+
+    def close(self) -> None:
+        """Sync the stream, deregister cuFile state, and close the slab handle."""
+        if self._buffers:
+            torch_dev.synchronize(device=self._buffers[0].device)
+        with self._submissions_lock:
+            self._submissions.clear()
+        # Deregister any regions/streams still live (per-instance teardown via
+        # ``deregister_gpu_buffer`` normally clears these first; this is the
+        # shutdown sweep for anything left).
+        with self._registry_lock:
+            for buf in self._buffers:
+                try:
+                    ca.deregister_buffer(buf)
+                except Exception as e:
+                    logger.warning("GDSContext.close: deregister_buffer: %s", e)
+            self._buffers.clear()
+            self._base_ptrs.clear()
+            self._nbytes.clear()
+            for raw_stream in list(self._registered_streams):
+                try:
+                    ca.deregister_stream(raw_stream)
+                except Exception as e:
+                    logger.warning("GDSContext.close: deregister_stream: %s", e)
+            self._registered_streams.clear()
+        if self._slab_handle is not None:
+            try:
+                self._slab_handle.close()
+            except Exception as e:
+                logger.warning("GDSContext.close: slab handle close failed: %s", e)
+            self._slab_handle = None
+
+    # --- Internal -----------------------------------------------------
+
+    def _open_and_register_slab(self, use_direct_io: bool) -> None:
+        """Create, truncate, preallocate the slab file and register it with cuFile.
+
+        Args:
+            use_direct_io: Open with ``O_DIRECT`` (required for the GDS fast path).
+        """
+        # Create, truncate, and fallocate via a regular (non-O_DIRECT) fd.
+        creator_fd = os.open(
+            self._slab_path, os.O_CREAT | os.O_RDWR | os.O_TRUNC, 0o644
+        )
+        try:
+            os.posix_fallocate(creator_fd, 0, self._slab_size)
+        finally:
+            os.close(creator_fd)
+        flags = os.O_RDWR
+        if use_direct_io:
+            flags |= os.O_DIRECT
+        fd = os.open(self._slab_path, flags)
+        try:
+            # Third Party
+            from cufile.bindings import cuFileHandleRegister
+
+            handle = cuFileHandleRegister(fd)
+        except Exception:
+            os.close(fd)
+            raise
+        self._slab_handle = ca.AsyncHandle.from_fd(
+            fd, handle, self._slab_path, writable=True
+        )
+        logger.info(
+            "GDSContext: slab created at %s (%.1f GiB, O_DIRECT=%s), cuFile "
+            "handle registered",
+            self._slab_path,
+            self._slab_size / (1 << 30),
+            use_direct_io,
+        )
+
+    def _register_region_locked(self, buffer: torch.Tensor) -> None:
+        """cuFile-register one <=16 MiB region (caller holds the lock)."""
+        nbytes = buffer.numel() * buffer.element_size()
+        base = buffer.data_ptr()
+        ca.register_buffer(buffer)
+        idx = bisect.bisect_left(self._base_ptrs, base)
+        self._buffers.insert(idx, buffer)
+        self._base_ptrs.insert(idx, base)
+        self._nbytes.insert(idx, nbytes)
+        logger.info(
+            "GDSContext: registered %d bytes at 0x%x via cuFile "
+            "(total registrations: %d)",
+            nbytes,
+            base,
+            len(self._buffers),
+        )
+
+    def _deregister_region_locked(self, buffer: torch.Tensor) -> None:
+        """Deregister one region with cuFile (caller holds the lock).
+
+        Args:
+            buffer: A staging-buffer slot previously registered.
+        """
+        base = buffer.data_ptr()
+        idx = bisect.bisect_left(self._base_ptrs, base)
+        try:
+            ca.deregister_buffer(self._buffers[idx])
+        except Exception as e:
+            logger.warning("GDSContext: deregister_buffer: %s", e)
+        del self._buffers[idx]
+        del self._base_ptrs[idx]
+        del self._nbytes[idx]
+
+    def _resolve_buffer(self, gpu_buffer: torch.Tensor) -> tuple[int, int, int]:
+        """Locate the registered region ``gpu_buffer`` starts in.
+
+        Returns ``(base_ptr, dev_offset, region_nbytes)``; ``region_nbytes -
+        dev_offset`` is the room left in the region, which :meth:`transfer_async`
+        uses to cut DMAs at region boundaries. Callers always pass a pointer
+        inside a registered region.
+        """
+        ptr = gpu_buffer.data_ptr()
+        # Held briefly so a concurrent deregister can't mutate the parallel
+        # lists mid-lookup.
+        with self._registry_lock:
+            idx = bisect.bisect_right(self._base_ptrs, ptr) - 1
+            base = self._base_ptrs[idx]
+            nbytes = self._nbytes[idx]
+        offset = ptr - base
+        return base, offset, nbytes
+
+    def _slab_read(
+        self, slab_offset: int, size: int, dev_offset: int, buf_base: int
+    ) -> None:
+        """Submit one ``cuFileReadAsync`` against the slab handle (stream-ordered)."""
+        if self._slab_handle is None:
+            raise RuntimeError("GDSContext._slab_read: slab handle not open")
+        stream_handle = torch_dev.current_stream().cuda_stream
+        sub = self._slab_handle.read_async(
+            buf_base, size, slab_offset, dev_offset, stream_handle
+        )
+        self._record_submission(sub)
+
+    def _slab_write(
+        self, slab_offset: int, size: int, dev_offset: int, buf_base: int
+    ) -> None:
+        """Submit one ``cuFileWriteAsync`` against the slab handle (stream-ordered)."""
+        if self._slab_handle is None:
+            raise RuntimeError("GDSContext._slab_write: slab handle not open")
+        stream_handle = torch_dev.current_stream().cuda_stream
+        sub = self._slab_handle.write_async(
+            buf_base, size, slab_offset, dev_offset, stream_handle
+        )
+        self._record_submission(sub)
+
+    def _record_submission(self, sub: "ca.Submission") -> None:
+        """Track an in-flight submission so its ctypes storage outlives the DMA.
+
+        Accumulated per (current) stream; every ``_SUBMISSION_CHECKPOINT_EVERY``
+        ops a CUDA event is recorded and completed batches are released.
+        """
+        stream = torch_dev.current_stream()
+        raw_stream = stream.cuda_stream
+        with self._submissions_lock:
+            st = self._submissions.get(raw_stream)
+            if st is None:
+                st = self._submissions[raw_stream] = _StreamSubmissions()
+            st.uncommitted.append(sub)
+            st.ops_since_checkpoint += 1
+            if st.ops_since_checkpoint >= _SUBMISSION_CHECKPOINT_EVERY:
+                self._checkpoint_submissions_locked(st, stream)
+
+    def _checkpoint_submissions_locked(
+        self, st: _StreamSubmissions, stream: "torch.Stream"
+    ) -> None:
+        """Close ``st``'s current batch behind a CUDA event on ``stream`` and
+        drop earlier batches whose event has completed. Hold
+        ``self._submissions_lock``.
+        """
+        if st.uncommitted:
+            event = torch_dev.Event()
+            event.record(stream)
+            st.inflight.append((event, st.uncommitted))
+            st.uncommitted = []
+        st.ops_since_checkpoint = 0
+        st.inflight = [
+            (event, subs) for (event, subs) in st.inflight if not event.query()
+        ]
+
+
+@functools.cache
+def get_gds_context() -> GDSContext:
+    """Return the process-global :class:`GDSContext` singleton (created empty on
+    first access). Consult :attr:`GDSContext.initialized` to tell whether GDS L1
+    is active."""
+    return GDSContext()
+
+
+def initialize_gds_context(config: Optional[GdsL1Config]) -> GDSContext:
+    """Set up the process-global :class:`GDSContext` (once, at startup).
+
+    ``config=None`` leaves it uninitialized (GDS L1 disabled); otherwise the
+    slab is created and registered. Returns the singleton.
+    """
+    context = get_gds_context()
+    if config is not None:
+        context.initialize(config)
+    return context
diff --git a/lmcache/v1/gpu_connector/gpu_ops.py b/lmcache/v1/gpu_connector/gpu_ops.py
index 19d902f585..e1e49ab9cb 100644
--- a/lmcache/v1/gpu_connector/gpu_ops.py
+++ b/lmcache/v1/gpu_connector/gpu_ops.py
@@ -3,8 +3,9 @@
 import torch
 
 # First Party
+from lmcache.v1.gpu_connector.gds_context import SlabDirection, get_gds_context
 from lmcache.v1.lazy_memory_allocator import LazyMemoryAllocator
-from lmcache.v1.memory_management import MemoryObj
+from lmcache.v1.memory_management import GDSMemoryObject, MemoryObj
 import lmcache.c_ops as lmc_ops
 
 
@@ -21,6 +22,9 @@ def lmcache_memcpy_async_h2d(
     :param MemoryObj memory_obj: The memory object to be copied.
     :param torch.Tensor gpu_buffer: The GPU buffer to copy the data to.
     """
+    if isinstance(memory_obj, GDSMemoryObject):
+        get_gds_context().transfer_async(memory_obj, gpu_buffer, SlabDirection.READ)
+        return
     src_tensor = memory_obj.raw_tensor
     if src_tensor is None:
         raise ValueError(
@@ -59,6 +63,9 @@ def lmcache_memcpy_async_d2h(
     :param torch.Tensor gpu_buffer: The GPU buffer to copy the data from.
     :param MemoryObj memory_obj: The memory object to be copied to.
     """
+    if isinstance(memory_obj, GDSMemoryObject):
+        get_gds_context().transfer_async(memory_obj, gpu_buffer, SlabDirection.WRITE)
+        return
     dst_tensor = memory_obj.raw_tensor
     if dst_tensor is None:
         raise ValueError(
diff --git a/lmcache/v1/memory_management.py b/lmcache/v1/memory_management.py
index 25326d6aea..fc099d9b98 100644
--- a/lmcache/v1/memory_management.py
+++ b/lmcache/v1/memory_management.py
@@ -937,6 +937,128 @@ def parent(self) -> Optional["MemoryAllocatorInterface"]:
         return None
 
 
+class GDSMemoryObject(MemoryObj):
+    """A slab-anchored ``MemoryObj`` for the GDS L1 tier.
+
+    The bytes live in the GDS slab file, not in host or device memory, so
+    this object carries only the slab ``(offset, size)`` (in ``meta.address``
+    / ``meta.phy_size``) and is otherwise a placeholder: ``tensor`` is always
+    ``None`` and ``byte_array`` / ``data_ptr`` raise.
+    """
+
+    def __init__(self, metadata: MemoryObjMetadata) -> None:
+        super().__init__(metadata)
+        self.valid = True
+
+    @property
+    def slab_offset(self) -> int:
+        """Byte offset of this chunk within the slab file (== ``meta.address``)."""
+        return self.meta.address
+
+    def invalidate(self) -> None:
+        self.valid = False
+
+    def is_valid(self) -> bool:
+        return self.valid
+
+    def get_size(self) -> int:
+        return self.meta.phy_size
+
+    def get_shape(self) -> torch.Size:
+        return self.meta.shape
+
+    def get_dtype(self) -> Optional[torch.dtype]:
+        return self.meta.dtype
+
+    def get_shapes(self) -> list[torch.Size]:
+        raise NotImplementedError(
+            "GDSMemoryObject.get_shapes: per-group shapes are not tracked on "
+            "the GDS path (only the singular meta.shape is); use get_shape()"
+        )
+
+    def get_dtypes(self) -> list[torch.dtype]:
+        raise NotImplementedError(
+            "GDSMemoryObject.get_dtypes: per-group dtypes are not tracked on "
+            "the GDS path (only the singular meta.dtype is); use get_dtype()"
+        )
+
+    def get_memory_format(self) -> MemoryFormat:
+        return self.meta.fmt
+
+    def get_physical_size(self) -> int:
+        return self.meta.phy_size
+
+    def ref_count_up(self) -> None:
+        raise NotImplementedError(
+            "GDSMemoryObject.ref_count_up: not used on the GDS path"
+        )
+
+    def ref_count_down(self) -> None:
+        raise NotImplementedError(
+            "GDSMemoryObject.ref_count_down: not used on the GDS path"
+        )
+
+    def get_ref_count(self) -> int:
+        raise NotImplementedError(
+            "GDSMemoryObject.get_ref_count: not used on the GDS path"
+        )
+
+    def get_num_tokens(self) -> int:
+        raise NotImplementedError(
+            "GDSMemoryObject.get_num_tokens: not used on the GDS path"
+        )
+
+    def pin(self) -> bool:
+        raise NotImplementedError("GDSMemoryObject.pin: not used on the GDS path")
+
+    def unpin(self) -> bool:
+        raise NotImplementedError("GDSMemoryObject.unpin: not used on the GDS path")
+
+    @property
+    def metadata(self) -> MemoryObjMetadata:
+        return self.meta
+
+    @property
+    def tensor(self) -> Optional[torch.Tensor]:
+        return None
+
+    @property
+    def byte_array(self) -> bytes:
+        raise NotImplementedError(
+            f"GDSMemoryObject(slab_offset={self.slab_offset}).byte_array is not "
+            "supported; bytes live in the GDS slab file and the staging buffer "
+            "is registered VRAM (no buffer protocol)."
+        )
+
+    @property
+    def data_ptr(self) -> int:
+        raise NotImplementedError(
+            f"GDSMemoryObject(slab_offset={self.slab_offset}).data_ptr is not "
+            "supported; GDS reads/writes use gpu_buffer.data_ptr() via the "
+            "gpu_ops dispatch, never the MemoryObj's data_ptr."
+        )
+
+    @property
+    def is_pinned(self) -> bool:
+        raise NotImplementedError("GDSMemoryObject.is_pinned: not used on the GDS path")
+
+    @property
+    def can_evict(self) -> bool:
+        raise NotImplementedError("GDSMemoryObject.can_evict: not used on the GDS path")
+
+    @property
+    def raw_tensor(self) -> Optional[torch.Tensor]:
+        return None
+
+    def get_tensor(self, index: int) -> Optional[torch.Tensor]:
+        return None
+
+    def parent(self) -> Optional["MemoryAllocatorInterface"]:
+        # The GDS slab is not a MemoryAllocatorInterface; dispatch in gpu_ops
+        # keys off the GDSMemoryObject type, not the parent allocator.
+        return None
+
+
 class MemoryAllocatorInterface(metaclass=abc.ABCMeta):
     @abc.abstractmethod
     def allocate(
diff --git a/lmcache/v1/multiprocess/engine_context.py b/lmcache/v1/multiprocess/engine_context.py
index 2224e65d05..14e97e0e4d 100644
--- a/lmcache/v1/multiprocess/engine_context.py
+++ b/lmcache/v1/multiprocess/engine_context.py
@@ -15,6 +15,10 @@
 )
 from lmcache.v1.distributed.config import StorageManagerConfig
 from lmcache.v1.distributed.storage_manager import StorageManager
+from lmcache.v1.gpu_connector.gds_context import (
+    get_gds_context,
+    initialize_gds_context,
+)
 from lmcache.v1.mp_observability.event_bus import EventBus, get_event_bus
 from lmcache.v1.multiprocess.custom_types import IPCCacheEngineKey
 from lmcache.v1.multiprocess.session import SessionManager
@@ -141,6 +145,11 @@ def __init__(
         hash_algorithm: str = "blake3",
     ) -> None:
         self._chunk_size = chunk_size
+
+        # Initialize the process-global GDS context.
+        # No-op when GDS L1 is disabled (config is None).
+        initialize_gds_context(storage_manager_config.l1_manager_config.gds_l1_config)
+
         self.shm_pool_info: ShmPoolInfo = self._compute_shm_pool_info(
             storage_manager_config
         )
@@ -152,6 +161,14 @@ def __init__(
         self._event_bus = get_event_bus()
         self._layout_desc_registry = LayoutDescRegistry()
 
+    def close(self) -> None:
+        """
+        Tear down the storage manager and the process-global GDS context.
+        """
+        self._storage_manager.close()
+        # Tear down the GDS cuFile context (the shared slab + its handle).
+        get_gds_context().close()
+
     @property
     def chunk_size(self) -> int:
         """Chunk size for KV cache operations."""
diff --git a/lmcache/v1/multiprocess/gpu_context.py b/lmcache/v1/multiprocess/gpu_context.py
index ac20961721..dbf074ceb0 100644
--- a/lmcache/v1/multiprocess/gpu_context.py
+++ b/lmcache/v1/multiprocess/gpu_context.py
@@ -23,6 +23,7 @@
 from lmcache import torch_dev
 from lmcache.logging import init_logger
 from lmcache.utils import EngineType, lmcache_deprecate
+from lmcache.v1.gpu_connector.gds_context import get_gds_context
 from lmcache.v1.gpu_connector.utils import (
     LayoutHints,
     get_attention_backend,
@@ -178,6 +179,11 @@ def max_batch_size(self) -> int:
         """Maximum number of chunks (batch slots) the buffer holds."""
         return self._max_batch_size
 
+    @property
+    def buffer(self) -> torch.Tensor:
+        """The flat staging tensor (for GDS cuFile registration)."""
+        return self._temp_buffer
+
     def get_temp_kernel_group_buffer(
         self, batch_idx: int, kernel_group_idx: int
     ) -> torch.Tensor:
@@ -391,6 +397,12 @@ def __init__(
 
         # GPU streams
         self.cuda_stream_ = torch_dev.Stream(device=self.device_)
+
+        # Register the staging buffer with the GDS cuFile context on the
+        # context's CUDA stream.
+        with torch_dev.stream(self.cuda_stream_):
+            get_gds_context().register_gpu_buffer(self._temp_buffer.buffer)
+
         # Third Party
         import cupy
 
@@ -406,6 +418,13 @@ def __init__(
             logger,
         )
 
+    def close(self) -> None:
+        """
+        Deregister this context's GDS staging buffer (reverse of __init__).
+        """
+        with torch_dev.stream(self.cuda_stream_):
+            get_gds_context().deregister_gpu_buffer(self._temp_buffer.buffer)
+
     @property
     def dtype(self) -> torch.dtype:
         return get_dtype(self.kv_caches_, self.gpu_kv_format_)
diff --git a/lmcache/v1/multiprocess/modules/gpu_transfer.py b/lmcache/v1/multiprocess/modules/gpu_transfer.py
index 99bbe24ab0..6c43a28eb1 100644
--- a/lmcache/v1/multiprocess/modules/gpu_transfer.py
+++ b/lmcache/v1/multiprocess/modules/gpu_transfer.py
@@ -223,6 +223,8 @@ def close(self) -> None:
         self._device_host_func_dispatcher.stop()
 
         had_contexts = len(self._cache_contexts) > 0
+        for entry in self._cache_contexts.values():
+            entry.cache_context.close()
         self._cache_contexts.clear()
         if had_contexts:
             torch_dev.empty_cache()
@@ -297,6 +299,7 @@ def unregister_kv_cache(self, instance_id: int) -> None:
             )
             return
 
+        entry.cache_context.close()
         self._ctx.layout_desc_registry.unregister(entry.model_name, entry.world_size)
         logger.info("Unregistered KV cache for GPU ID %d", instance_id)
         torch_dev.empty_cache()
diff --git a/lmcache/v1/multiprocess/server.py b/lmcache/v1/multiprocess/server.py
index b6d26cf1e3..76e8d148f6 100644
--- a/lmcache/v1/multiprocess/server.py
+++ b/lmcache/v1/multiprocess/server.py
@@ -101,7 +101,7 @@ def close(self) -> None:
         """Close all modules and release shared resources."""
         for module in self._modules:
             module.close()
-        self._context.storage_manager.close()
+        self._context.close()
         logger.info("MPCacheEngine closed")
 
     # HTTP-layer passthroughs lost in the engine refactor.
diff --git a/tests/v1/distributed/memory_manager/__init__.py b/tests/v1/distributed/memory_manager/__init__.py
new file mode 100644
index 0000000000..9881313609
--- /dev/null
+++ b/tests/v1/distributed/memory_manager/__init__.py
@@ -0,0 +1 @@
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/v1/distributed/memory_manager/test_gds_l1_memory_manager.py b/tests/v1/distributed/memory_manager/test_gds_l1_memory_manager.py
new file mode 100644
index 0000000000..56f770267c
--- /dev/null
+++ b/tests/v1/distributed/memory_manager/test_gds_l1_memory_manager.py
@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Unit tests for ``GDSL1MemoryManager``.
+
+These are pure: the manager sits on the in-memory ``AddressManager``, so no
+CUDA / cuFile / GDS hardware is required (the actual slab DMA lives in
+``GDSContext`` and is covered separately).
+"""
+
+# Third Party
+import torch
+
+# First Party
+from lmcache.v1.distributed.api import MemoryLayoutDesc
+from lmcache.v1.distributed.config import GdsL1Config
+from lmcache.v1.distributed.error import L1Error
+from lmcache.v1.distributed.memory_manager import GDSL1MemoryManager
+from lmcache.v1.memory_management import GDSMemoryObject
+
+_ALIGN = 4096
+
+
+def _config(size_bytes: int) -> GdsL1Config:
+    return GdsL1Config(file_location="/unused", size_in_bytes=size_bytes)
+
+
+def _layout(nbytes: int) -> MemoryLayoutDesc:
+    return MemoryLayoutDesc(shapes=[torch.Size([nbytes])], dtypes=[torch.uint8])
+
+
+class TestAllocate:
+    def test_returns_distinct_gds_objects(self):
+        mgr = GDSL1MemoryManager(_config(1 << 20))
+        err, objs = mgr.allocate(_layout(4096), 3)
+        assert err == L1Error.SUCCESS
+        assert len(objs) == 3
+        assert all(isinstance(o, GDSMemoryObject) for o in objs)
+        # Non-overlapping slab regions.
+        assert len({o.slab_offset for o in objs}) == 3
+
+    def test_oom_is_all_or_nothing(self):
+        # Slab fits exactly two 4 KiB chunks; asking for three must fail and
+        # leave nothing reserved.
+        mgr = GDSL1MemoryManager(_config(2 * _ALIGN))
+        err, objs = mgr.allocate(_layout(4096), 3)
+        assert err == L1Error.OUT_OF_MEMORY
+        assert objs == []
+        assert mgr.get_memory_usage()[0] == 0
+
+    def test_chunk_size_rounded_up_to_alignment(self):
+        # 5000-byte chunk rounds up to 8192 (next 4 KiB multiple).
+        mgr = GDSL1MemoryManager(_config(1 << 20))
+        _, objs = mgr.allocate(_layout(5000), 1)
+        assert objs[0].get_physical_size() == 8192
+
+
+class TestFreeAndUsage:
+    def test_free_returns_all_space(self):
+        mgr = GDSL1MemoryManager(_config(1 << 20))
+        _, objs = mgr.allocate(_layout(4096), 2)
+        assert mgr.get_memory_usage()[0] == 2 * _ALIGN
+        assert mgr.free(objs) == L1Error.SUCCESS
+        assert mgr.get_memory_usage()[0] == 0
+
+    def test_total_is_slab_size(self):
+        used, total = GDSL1MemoryManager(_config(1 << 20)).get_memory_usage()
+        assert used == 0
+        assert total == (1 << 20)
+
+    def test_memcheck_consistent_through_cycles(self):
+        mgr = GDSL1MemoryManager(_config(1 << 20))
+        assert mgr.memcheck() is True
+        _, a = mgr.allocate(_layout(4096), 2)
+        _, b = mgr.allocate(_layout(8192), 1)
+        assert mgr.memcheck() is True
+        mgr.free(a)
+        mgr.free(b)
+        assert mgr.memcheck() is True
+
+
+class TestMisc:
+    def test_get_l1_memory_desc_is_none(self):
+        assert GDSL1MemoryManager(_config(1 << 20)).get_l1_memory_desc() is None
+
+    def test_close_does_not_raise(self):
+        GDSL1MemoryManager(_config(1 << 20)).close()
diff --git a/tests/v1/gpu_connector/test_gds_context.py b/tests/v1/gpu_connector/test_gds_context.py
new file mode 100644
index 0000000000..e08e5c4ce6
--- /dev/null
+++ b/tests/v1/gpu_connector/test_gds_context.py
@@ -0,0 +1,300 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Unit tests for the GDS cuFile context (``GDSContext``).
+
+Most tests are pure (no cuFile): they exercise the public interface
+(singleton/no-op semantics, the <=16 MiB region split observed at the ``ca``
+cuFile seam, and the registered-region mapping driven through
+:meth:`GDSContext.transfer_async`). The ``test_gds_*_roundtrip`` tests
+exercise the real cuFile DMA path and are skipped unless CUDA + nvidia-fs
+(real GDS) are present.
+"""
+
+# Standard
+from types import SimpleNamespace
+import os
+
+# Third Party
+import pytest
+import torch
+
+# First Party
+from lmcache import torch_dev
+from lmcache.v1.distributed.api import MemoryLayoutDesc
+from lmcache.v1.distributed.config import GdsL1Config
+from lmcache.v1.distributed.error import L1Error
+from lmcache.v1.distributed.memory_manager import GDSL1MemoryManager
+from lmcache.v1.gpu_connector import _cufile_async as ca
+from lmcache.v1.gpu_connector.gds_context import (
+    GDSContext,
+    SlabDirection,
+    get_gds_context,
+    initialize_gds_context,
+)
+
+
+def _fake_stream(handle: int):
+    """A stand-in for ``torch_dev.current_stream()`` (no CUDA needed)."""
+    return SimpleNamespace(cuda_stream=handle, synchronize=lambda: None)
+
+
+requires_gds = pytest.mark.skipif(
+    not (torch.cuda.is_available() and os.path.exists("/proc/driver/nvidia-fs/stats")),
+    reason="needs CUDA + nvidia-fs (real GPUDirect Storage)",
+)
+
+
+@pytest.fixture(autouse=True)
+def _reset_singleton():
+    """Drop the process-global GDSContext between tests."""
+    get_gds_context.cache_clear()
+    yield
+    get_gds_context.cache_clear()
+
+
+class TestSingleton:
+    def test_singleton_identity(self):
+        assert get_gds_context() is get_gds_context()
+
+    def test_fresh_context_is_off(self):
+        assert GDSContext().initialized is False
+
+    def test_initialize_with_none_is_noop(self):
+        ctx = initialize_gds_context(None)
+        assert ctx is get_gds_context()
+        assert ctx.initialized is False
+
+
+class TestRegisterGpuBuffer:
+    def test_noop_when_uninitialized(self, monkeypatch):
+        ctx = GDSContext()
+        registered = []
+        monkeypatch.setattr(ca, "register_buffer", registered.append)
+        # GDS off -> registers nothing, makes no cuFile calls.
+        ctx.register_gpu_buffer(torch.empty(4096, dtype=torch.uint8))
+        assert registered == []
+
+    def test_splits_buffer_into_regions(self, monkeypatch):
+        ctx = GDSContext()
+        ctx.initialized = True
+        # Record each cuFile registration's byte size at the ca seam.
+        sizes = []
+        monkeypatch.setattr(
+            ca,
+            "register_buffer",
+            lambda buf: sizes.append(buf.numel() * buf.element_size()),
+        )
+        monkeypatch.setattr(ca, "register_stream", lambda raw: None)
+        monkeypatch.setattr(torch_dev, "current_stream", lambda: _fake_stream(0))
+
+        # The whole buffer is registered in <=16 MiB regions, irrespective of
+        # any chunk/slot layout. A 40 MiB buffer -> 16 + 16 + 8 MiB.
+        # A CPU tensor is fine: the cuFile calls are mocked.
+        buf = torch.empty(40 << 20, dtype=torch.uint8)
+        ctx.register_gpu_buffer(buf)
+
+        assert sizes == [16 << 20, 16 << 20, 8 << 20]
+
+
+class TestResolveBuffer:
+    """Region mapping: a buffer slice resolves to ``(region base, offset)``,
+    exercised through the public ``transfer_async`` path."""
+
+    def _registered_ctx(self, monkeypatch, buf: torch.Tensor):
+        """Register ``buf``; capture the ``(base, offset)`` that
+        ``transfer_async`` resolves a slice to before handing it to the slab."""
+        ctx = GDSContext()
+        ctx.initialized = True
+        monkeypatch.setattr(ca, "register_buffer", lambda b: None)
+        monkeypatch.setattr(ca, "register_stream", lambda raw: None)
+        monkeypatch.setattr(torch_dev, "current_stream", lambda: _fake_stream(0))
+        ctx.register_gpu_buffer(buf)
+        resolved: list[tuple[int, int]] = []
+        monkeypatch.setattr(
+            ctx,
+            "_slab_write",
+            lambda slab_offset, size, dev_offset, buf_base: resolved.append(
+                (buf_base, dev_offset)
+            ),
+        )
+        return ctx, resolved
+
+    def test_maps_slice_to_base_and_offset(self, monkeypatch):
+        buf = torch.empty(8192, dtype=torch.uint8)
+        ctx, resolved = self._registered_ctx(monkeypatch, buf)
+        mem_obj = SimpleNamespace(get_size=lambda: 4096, slab_offset=0)
+        # A slice 4 KiB into the region must map to (region base, offset 4096).
+        ctx.transfer_async(mem_obj, buf[4096:], SlabDirection.WRITE)
+        assert resolved == [(buf.data_ptr(), 4096)]
+
+
+class TestPerStreamRegistration:
+    """Each distinct stream is cuFile-registered once and deregistered once its
+    last region is gone -- observed at the ``ca`` seam (no private state)."""
+
+    def test_register_and_deregister_per_stream(self, monkeypatch):
+        ctx = GDSContext()
+        ctx.initialized = True
+        reg_str: list[int] = []
+        dereg_str: list[int] = []
+        dereg_buf: list[int] = []
+        monkeypatch.setattr(ca, "register_buffer", lambda b: None)
+        monkeypatch.setattr(
+            ca, "deregister_buffer", lambda b: dereg_buf.append(b.data_ptr())
+        )
+        monkeypatch.setattr(ca, "register_stream", reg_str.append)
+        monkeypatch.setattr(ca, "deregister_stream", dereg_str.append)
+
+        def use_stream(handle: int):
+            monkeypatch.setattr(
+                torch_dev, "current_stream", lambda: _fake_stream(handle)
+            )
+
+        buf_a = torch.empty(24 << 20, dtype=torch.uint8)  # 2 regions on stream 11
+        buf_b = torch.empty(4096, dtype=torch.uint8)  # 1 region on stream 22
+        use_stream(11)
+        ctx.register_gpu_buffer(buf_a)
+        use_stream(22)
+        ctx.register_gpu_buffer(buf_b)
+        # Each distinct stream registered exactly once.
+        assert reg_str == [11, 22]
+
+        # Deregistering buf_b frees stream 22's only region -> stream 22 dropped.
+        use_stream(22)
+        ctx.deregister_gpu_buffer(buf_b)
+        assert dereg_str == [22]
+        # Stream 11 still has 2 regions (24 MiB -> 16 + 8), so not yet dropped.
+        use_stream(11)
+        ctx.deregister_gpu_buffer(buf_a)
+        assert dereg_str == [22, 11]
+        assert len(dereg_buf) == 3  # all three slots deregistered
+
+
+@requires_gds
+def test_gds_two_stream_write_read(tmp_path):
+    """Two CUDA streams each register their own buffer and round-trip a chunk
+    through real cuFile DMA; verify the data stays isolated per stream."""
+    cfg = GdsL1Config(file_location=str(tmp_path), size_in_bytes=64 << 20)
+    chunk_bytes = 8 << 20
+    ctx = GDSContext()
+    ctx.initialize(cfg)
+    mgr = GDSL1MemoryManager(cfg)
+
+    def register_and_write(stream, pattern):
+        """Register a buffer on ``stream`` and write ``pattern`` to a chunk."""
+        with torch.cuda.stream(stream):
+            buf = torch.empty(chunk_bytes, dtype=torch.uint8, device="cuda")
+            ctx.register_gpu_buffer(buf)
+            err, objs = mgr.allocate(
+                MemoryLayoutDesc(
+                    shapes=[torch.Size([chunk_bytes])], dtypes=[torch.uint8]
+                ),
+                1,
+            )
+            assert err == L1Error.SUCCESS
+            buf.fill_(pattern)
+            torch.cuda.synchronize()
+            ctx.transfer_async(objs[0], buf, SlabDirection.WRITE)
+            torch.cuda.synchronize()
+        return buf, objs[0]
+
+    stream_a = torch.cuda.Stream()
+    stream_b = torch.cuda.Stream()
+    try:
+        buf_a, mem_a = register_and_write(stream_a, 0xA1)
+        buf_b, mem_b = register_and_write(stream_b, 0xB2)
+
+        # Read each chunk back on its own stream; each must see its own pattern,
+        # confirming the two streams' buffers/regions don't clobber each other.
+        for stream, buf, mem, pattern in (
+            (stream_a, buf_a, mem_a, 0xA1),
+            (stream_b, buf_b, mem_b, 0xB2),
+        ):
+            with torch.cuda.stream(stream):
+                buf.zero_()
+                torch.cuda.synchronize()
+                ctx.transfer_async(mem, buf, SlabDirection.READ)
+                torch.cuda.synchronize()
+                expected = torch.full((chunk_bytes,), pattern, dtype=torch.uint8)
+                assert torch.equal(buf.cpu(), expected)
+
+        # Deregister each buffer on its own stream.
+        for stream, buf in ((stream_a, buf_a), (stream_b, buf_b)):
+            with torch.cuda.stream(stream):
+                ctx.deregister_gpu_buffer(buf)
+    finally:
+        ctx.close()
+
+
+@requires_gds
+def test_gds_write_read_roundtrip(tmp_path):
+    """Cold write then read of a chunk through the real cuFile DMA path."""
+    cfg = GdsL1Config(file_location=str(tmp_path), size_in_bytes=64 << 20)
+    ctx = GDSContext()
+    ctx.initialize(cfg)
+    try:
+        chunk_bytes = 8 << 20
+        buf = torch.empty(chunk_bytes, dtype=torch.uint8, device="cuda")
+        ctx.register_gpu_buffer(buf)
+
+        mgr = GDSL1MemoryManager(cfg)
+        err, objs = mgr.allocate(
+            MemoryLayoutDesc(shapes=[torch.Size([chunk_bytes])], dtypes=[torch.uint8]),
+            1,
+        )
+        assert err == L1Error.SUCCESS
+        mem_obj = objs[0]
+
+        buf.fill_(0xAB)
+        torch.cuda.synchronize()
+        ctx.transfer_async(mem_obj, buf, SlabDirection.WRITE)
+
+        buf.zero_()
+        torch.cuda.synchronize()
+        ctx.transfer_async(mem_obj, buf, SlabDirection.READ)
+        torch.cuda.synchronize()
+
+        expected = torch.full((chunk_bytes,), 0xAB, dtype=torch.uint8)
+        assert torch.equal(buf.cpu(), expected)
+    finally:
+        ctx.close()
+
+
+@requires_gds
+def test_gds_chunk_larger_than_region_roundtrip(tmp_path):
+    """A chunk larger than the 16 MiB cuFile region cap round-trips correctly.
+
+    Exercises the multi-region registration and the split (per-segment) DMA
+    path: a 24 MiB chunk is registered/transferred as a 16 MiB + 8 MiB pair.
+    """
+    cfg = GdsL1Config(file_location=str(tmp_path), size_in_bytes=64 << 20)
+    ctx = GDSContext()
+    ctx.initialize(cfg)
+    try:
+        chunk_bytes = 24 << 20  # > 16 MiB -> two registered regions / two DMAs
+        buf = torch.empty(chunk_bytes, dtype=torch.uint8, device="cuda")
+        ctx.register_gpu_buffer(buf)
+
+        mgr = GDSL1MemoryManager(cfg)
+        err, objs = mgr.allocate(
+            MemoryLayoutDesc(shapes=[torch.Size([chunk_bytes])], dtypes=[torch.uint8]),
+            1,
+        )
+        assert err == L1Error.SUCCESS
+        mem_obj = objs[0]
+
+        # Position-dependent pattern: a mis-offset or swapped segment (e.g. the
+        # second segment using the wrong slab offset) would corrupt the bytes
+        # around the 16 MiB boundary, which a uniform fill would not catch.
+        pattern = (torch.arange(chunk_bytes, dtype=torch.int64) % 251).to(torch.uint8)
+        buf.copy_(pattern.cuda())
+        torch.cuda.synchronize()
+        ctx.transfer_async(mem_obj, buf, SlabDirection.WRITE)
+
+        buf.zero_()
+        torch.cuda.synchronize()
+        ctx.transfer_async(mem_obj, buf, SlabDirection.READ)
+        torch.cuda.synchronize()
+
+        assert torch.equal(buf.cpu(), pattern)
+    finally:
+        ctx.close()
diff --git a/tests/v1/multiprocess/test_gpu_transfer_layout_registry.py b/tests/v1/multiprocess/test_gpu_transfer_layout_registry.py
index 09abf56228..7143f82066 100644
--- a/tests/v1/multiprocess/test_gpu_transfer_layout_registry.py
+++ b/tests/v1/multiprocess/test_gpu_transfer_layout_registry.py
@@ -17,6 +17,9 @@ class _FakeGPUContext:
 
     num_layers: int = 2
 
+    def close(self) -> None:
+        """No-op teardown (real GPUCacheContext.close deregisters its GDS buffer)."""
+
 
 class _FakeDeviceHostFuncDispatcher:
     """No-op dispatcher to avoid starting native completion threads."""
diff --git a/tests/v1/multiprocess/test_non_cuda_data_transfer.py b/tests/v1/multiprocess/test_non_cuda_data_transfer.py
index 6c0f56cd64..c8e998b5a6 100644
--- a/tests/v1/multiprocess/test_non_cuda_data_transfer.py
+++ b/tests/v1/multiprocess/test_non_cuda_data_transfer.py
@@ -683,8 +683,14 @@ def _create(
         )
 
         session_cls.return_value.get_or_create.return_value = mock_session
+        if storage_manager_config is None:
+            storage_manager_config = MagicMock()
+            # GDS L1 is off in these tests. A bare MagicMock would auto-vivify
+            # gds_l1_config to a truthy mock, making MPCacheEngineContext attempt
+            # real cuFile init; pin it to None so GDS init stays a no-op.
+            storage_manager_config.l1_manager_config.gds_l1_config = None
         ctx = MPCacheEngineContext(
-            storage_manager_config=storage_manager_config or MagicMock(),
+            storage_manager_config=storage_manager_config,
             chunk_size=chunk_size,
         )
         module = NonGPUTransferModule(ctx)

From 65c2ae814526289ec9086493e94606c87e88ba1a Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 11 Jun 2026 06:02:42 +0800
Subject: [PATCH 24/57] [Core][MP] Support Mamba/GDN hybrid models (Qwen3.5)
 (#3613)

Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
---
 .../multiprocess/scripts/launch-processes.sh  |  32 +-
 .../multiprocess/scripts/run-hma-lm-eval.sh   |  42 +-
 .../multiprocess/scripts/run-single-test.sh   |  21 +
 .../vllm/hybrid-kv-cache-groups.md            |  13 +-
 .../integration/vllm/kv_cache_group_edits.md  | 150 +++++++
 docs/source/mp/hybrid_models.rst              |  55 ++-
 docs/source/recipes/index.rst                 |  12 +-
 docs/source/recipes/qwen3_5.rst               |  96 +++++
 .../integration/vllm/kv_cache_group_edits.py  | 402 ++++++++++++++++++
 .../integration/vllm/lmcache_mp_connector.py  |  48 +++
 10 files changed, 834 insertions(+), 37 deletions(-)
 create mode 100644 docs/design/integration/vllm/kv_cache_group_edits.md
 create mode 100644 docs/source/recipes/qwen3_5.rst
 create mode 100644 lmcache/integration/vllm/kv_cache_group_edits.py

diff --git a/.buildkite/k3_tests/multiprocess/scripts/launch-processes.sh b/.buildkite/k3_tests/multiprocess/scripts/launch-processes.sh
index 65034f2e55..4d70634298 100755
--- a/.buildkite/k3_tests/multiprocess/scripts/launch-processes.sh
+++ b/.buildkite/k3_tests/multiprocess/scripts/launch-processes.sh
@@ -17,9 +17,9 @@ MAX_WORKERS="${MAX_WORKERS:-4}"
 MODEL="${MODEL:-Qwen/Qwen3-14B}"
 BUILD_ID="${BUILD_ID:-local_$$}"
 
-# K8s assigns exactly 2 GPUs as devices 0 and 1
-GPU_FOR_VLLM=0
-GPU_FOR_BASELINE=1
+# K8s assigns exactly 2 GPUs as devices 0 and 1 (overridable for local runs).
+GPU_FOR_VLLM="${GPU_FOR_VLLM:-0}"
+GPU_FOR_BASELINE="${GPU_FOR_BASELINE:-1}"
 echo "Using GPU $GPU_FOR_VLLM for vLLM with LMCache"
 echo "Using GPU $GPU_FOR_BASELINE for vLLM baseline"
 
@@ -68,6 +68,27 @@ fi
 MAX_MODEL_LEN="${MAX_MODEL_LEN:-auto}"
 MAX_MODEL_LEN_ARG="--max-model-len ${MAX_MODEL_LEN}"
 
+# LMCache server chunk size in tokens. Empty -> server default.
+CHUNK_SIZE_ARG=""
+if [ -n "${CHUNK_SIZE:-}" ]; then
+    CHUNK_SIZE_ARG="--chunk-size ${CHUNK_SIZE}"
+fi
+
+# vLLM batch-invariant mode. On by default; GDN/Mamba backends do not support it.
+BATCH_INVARIANT="${BATCH_INVARIANT:-1}"
+
+# Mamba KV cache mode + prefix caching, set only for hybrid Mamba models.
+MAMBA_ARGS=""
+if [ -n "${MAMBA_CACHE_MODE:-}" ]; then
+    MAMBA_ARGS="--mamba-cache-mode ${MAMBA_CACHE_MODE} --enable-prefix-caching"
+fi
+
+# Max tokens per scheduler step. Empty -> vLLM default.
+MAX_NUM_BATCHED_TOKENS_ARG=""
+if [ -n "${MAX_NUM_BATCHED_TOKENS:-}" ]; then
+    MAX_NUM_BATCHED_TOKENS_ARG="--max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS}"
+fi
+
 # Store PIDs in a file so cleanup.sh can find them
 PID_FILE="/tmp/lmcache_mp_pids_${BUILD_ID}"
 > "$PID_FILE"
@@ -91,6 +112,7 @@ lmcache server \
     --l1-size-gb "$CPU_BUFFER_SIZE" \
     --eviction-policy LRU \
     --max-workers "$MAX_WORKERS" \
+    $CHUNK_SIZE_ARG \
     --port "$LMCACHE_PORT" \
     ${GDS_L1_ARG} \
     > "/tmp/build_${BUILD_ID}_lmcache.log" 2>&1 &
@@ -116,7 +138,7 @@ echo "Port: $vllm_port"
 CUDA_VISIBLE_DEVICES="${GPU_FOR_VLLM}" \
 VLLM_ENABLE_V1_MULTIPROCESSING=0 \
 VLLM_SERVER_DEV_MODE=1 \
-VLLM_BATCH_INVARIANT=1 \
+VLLM_BATCH_INVARIANT=${BATCH_INVARIANT} \
 PYTHONHASHSEED=0 \
 vllm serve "$MODEL" \
     --kv-transfer-config "{\"kv_connector\":\"LMCacheMPConnector\", \"kv_role\":\"kv_both\", \"kv_load_failure_policy\": \"recompute\", \"kv_connector_extra_config\": {\"lmcache.mp.port\": $LMCACHE_PORT, \"lmcache.mp.mq_timeout\": 10}}" \
@@ -126,6 +148,8 @@ vllm serve "$MODEL" \
     $MAX_MODEL_LEN_ARG \
     $ENFORCE_EAGER_ARG \
     $GPU_MEMORY_UTIL_ARG \
+    $MAMBA_ARGS \
+    $MAX_NUM_BATCHED_TOKENS_ARG \
     > "/tmp/build_${BUILD_ID}_vllm.log" 2>&1 &
 
 VLLM_PID=$!
diff --git a/.buildkite/k3_tests/multiprocess/scripts/run-hma-lm-eval.sh b/.buildkite/k3_tests/multiprocess/scripts/run-hma-lm-eval.sh
index 5d2f33e9c5..89dcfaa0da 100755
--- a/.buildkite/k3_tests/multiprocess/scripts/run-hma-lm-eval.sh
+++ b/.buildkite/k3_tests/multiprocess/scripts/run-hma-lm-eval.sh
@@ -1,15 +1,12 @@
 #!/usr/bin/env bash
 # HMA (hybrid memory allocator) correctness test using a real hybrid model.
 #
-# Why google/gemma-4-31B-it:
-#   - Hybrid (sliding-window + full-attention), so vLLM keeps its hybrid KV
-#     cache manager on and exposes multiple KV cache groups.
-#   - Its full layers have a larger head_dim, so the groups get different block
-#     sizes -- this exercises the per-group HMA store/retrieve path.
-#   - Standard paged attention for both layer families, so LMCache's transfer
-#     kernels support it (unlike Mamba/linear-attention hybrids like
-#     Qwen3.5/Qwen3-Next, whose state caches LMCache cannot yet transfer).
-#   - Public, so no HF_TOKEN is required.
+# Models (selected by run-single-test.sh):
+#   - google/gemma-4-31B-it: sliding-window + full-attention hybrid whose full
+#     layers have a larger head_dim, so vLLM gives the KV cache groups
+#     different block sizes -- exercising per-group HMA store/retrieve.
+#   - Qwen/Qwen3.5-0.8B: Mamba/GDN + full-attention hybrid, exercising the
+#     registration-time cache re-views (kv_cache_group_edits.py).
 #
 # Flow (single GPU, no baseline server):
 #   1. vLLM run: lm_eval (gsm8k) against vLLM+LMCache, populating LMCache.
@@ -39,7 +36,8 @@ NUM_CONCURRENT="${NUM_CONCURRENT:-50}"
 # set fits the CPU pool (a too-large set thrashes and the retrieve run misses).
 LIMIT="${LIMIT:-100}"
 # Max abs difference allowed between the two runs' gsm8k scores; 0 requires an
-# exact match.
+# exact match. For non-bit-exact backends, raise LIMIT to shrink run-to-run
+# drift (~1/sqrt(LIMIT)) rather than loosening this.
 SCORE_TOLERANCE="${SCORE_TOLERANCE:-0}"
 # Seconds to let async LMCache stores drain before the retrieve run.
 STORE_DRAIN_SECONDS="${STORE_DRAIN_SECONDS:-20}"
@@ -177,11 +175,11 @@ retrieves_before = int(before_s)
 retrieves_after = int(after_s)
 
 
-def gsm8k_exact_match(results_dir: str) -> float:
-    """Return the gsm8k exact_match score from an lm_eval results directory.
+def gsm8k_score_and_stderr(results_dir: str) -> tuple[float, float]:
+    """Return the gsm8k (exact_match, stderr) from an lm_eval results directory.
 
     Prefers the strict-match variant; falls back to any non-stderr
-    ``exact_match`` metric key.
+    ``exact_match`` metric key (paired with its ``exact_match_stderr`` twin).
 
     Args:
         results_dir: Directory passed to ``lm_eval --output_path``. Searched
@@ -190,7 +188,8 @@ def gsm8k_exact_match(results_dir: str) -> float:
             timestamp).
 
     Returns:
-        The gsm8k ``exact_match`` accuracy as a float in ``[0.0, 1.0]``.
+        ``(score, stderr)``: the gsm8k ``exact_match`` accuracy in
+        ``[0.0, 1.0]`` and its reported sampling stderr (0.0 if absent).
 
     Raises:
         SystemExit: If no ``results_*.json`` exists under ``results_dir`` or the
@@ -205,18 +204,21 @@ def gsm8k_exact_match(results_dir: str) -> float:
     metrics = data["results"]["gsm8k"]
     preferred = "exact_match,strict-match"
     if preferred in metrics:
-        return float(metrics[preferred])
+        stderr = float(metrics.get("exact_match_stderr,strict-match", 0.0))
+        return float(metrics[preferred]), stderr
     for key, value in metrics.items():
         if key.startswith("exact_match,") and "stderr" not in key:
-            return float(value)
+            variant = key.split(",", 1)[1]
+            stderr = float(metrics.get(f"exact_match_stderr,{variant}", 0.0))
+            return float(value), stderr
     raise SystemExit(f"No exact_match metric in {latest}: {sorted(metrics)}")
 
 
-s_vllm = gsm8k_exact_match(vllm_run_dir)
-s_retrieve = gsm8k_exact_match(retrieve_run_dir)
+s_vllm, e_vllm = gsm8k_score_and_stderr(vllm_run_dir)
+s_retrieve, e_retrieve = gsm8k_score_and_stderr(retrieve_run_dir)
 
-print(f"  vLLM run             gsm8k exact_match = {s_vllm:.4f}")
-print(f"  LMCache retrieve run gsm8k exact_match = {s_retrieve:.4f}")
+print(f"  vLLM run             gsm8k exact_match = {s_vllm:.4f} +/- {e_vllm:.4f}")
+print(f"  LMCache retrieve run gsm8k exact_match = {s_retrieve:.4f} +/- {e_retrieve:.4f}")
 print(f"  tolerance = {tol}")
 
 failures = []
diff --git a/.buildkite/k3_tests/multiprocess/scripts/run-single-test.sh b/.buildkite/k3_tests/multiprocess/scripts/run-single-test.sh
index 58bedc7619..23a3dbcfb3 100755
--- a/.buildkite/k3_tests/multiprocess/scripts/run-single-test.sh
+++ b/.buildkite/k3_tests/multiprocess/scripts/run-single-test.sh
@@ -41,6 +41,24 @@ if [ "$TEST_NAME" = "hma_lm_eval_gemma4" ]; then
     # pipeline sets ATTENTION_BACKEND=auto; its ~63GB of weights also need a
     # higher GPU_MEMORY_UTILIZATION than the default (all set in pipeline.yml).
     export MODEL="${MODEL:-google/gemma-4-31B-it}"
+elif [ "$TEST_NAME" = "hma_lm_eval_qwen3_5" ]; then
+    # Qwen3.5-0.8B is a Mamba/GDN + full-attention hybrid (caches re-viewed at
+    # registration; see lmcache/integration/vllm/kv_cache_group_edits.py).
+    export MODEL="${MODEL:-Qwen/Qwen3.5-0.8B}"
+    export ATTENTION_BACKEND="${ATTENTION_BACKEND:-auto}"
+    # LMCache chunk size must be a multiple of the unified vLLM block size (544).
+    export CHUNK_SIZE="${CHUNK_SIZE:-544}"
+    # GDN supports only the 'align' Mamba cache mode.
+    export MAMBA_CACHE_MODE="${MAMBA_CACHE_MODE:-align}"
+    # 'align' snapshots the Mamba state only at scheduler-step boundaries; cap
+    # the step at the chunk size for one reusable snapshot per chunk.
+    export MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-544}"
+    # GDN has no batch-invariant mode, so runs are not bit-exact; compare within
+    # a score tolerance and use enough samples to shrink run-to-run drift
+    # (~1/sqrt(LIMIT)) well inside it.
+    export BATCH_INVARIANT="${BATCH_INVARIANT:-0}"
+    export SCORE_TOLERANCE="${SCORE_TOLERANCE:-0.05}"
+    export LIMIT="${LIMIT:-300}"
 else
     export MODEL="${MODEL:-Qwen/Qwen3-14B}"
 fi
@@ -112,6 +130,9 @@ case "$TEST_NAME" in
     hma_lm_eval_gemma4)
         exec_script="${SCRIPT_DIR}/run-hma-lm-eval.sh"
         ;;
+    hma_lm_eval_qwen3_5)
+        exec_script="${SCRIPT_DIR}/run-hma-lm-eval.sh"
+        ;;
     vllm_bench)
         exec_script="${SCRIPT_DIR}/run-vllm-bench.sh"
         ;;
diff --git a/docs/design/integration/vllm/hybrid-kv-cache-groups.md b/docs/design/integration/vllm/hybrid-kv-cache-groups.md
index 95f2f05ad8..40769b0a26 100644
--- a/docs/design/integration/vllm/hybrid-kv-cache-groups.md
+++ b/docs/design/integration/vllm/hybrid-kv-cache-groups.md
@@ -141,11 +141,15 @@ Block IDs `{group 0: [10,11], group 1: [20,21]}` are sent as
 - The server reproduces grouping with the same `group_layers_by_identity`; real
   tensors remain the source of truth for shape/dtype/stride.
 
-## Not supported
+## Mamba / linear-attention hybrids
 
-Mamba / linear-attention hybrids (e.g. Qwen3-Next): their recurrent state caches
-have no LMCache transfer format yet. vLLM still exposes them as KV cache groups,
-but LMCache cannot store/retrieve those layers.
+Supported via registration-time tensor re-views (e.g. Qwen3.5 GDN): Mamba
+state pairs become opaque page views, and full-attention layers whose logical
+block size was inflated for page-size unification are re-viewed at
+logical-block granularity. See
+[kv-cache-group-edits](kv_cache_group_edits.md) for the design and its
+limits (notably: edited groups are byte-opaque — no content-aware processing,
+no cross-backend cache sharing).
 
 ## Code map
 
@@ -154,6 +158,7 @@ but LMCache cannot store/retrieve those layers.
 | Engine group info (IPC type) + helpers | `lmcache/v1/multiprocess/group_view.py` |
 | Shared grouping primitive | `lmcache/v1/kv_layer_groups.py` |
 | vLLM → `list[EngineGroupInfo]` | `lmcache/integration/vllm/kv_cache_groups.py` |
+| Group metadata edits (Mamba, sub-paged attention) | `lmcache/integration/vllm/kv_cache_group_edits.py` |
 | Register / store / retrieve | `lmcache/integration/vllm/{lmcache_mp_connector,vllm_multi_process_adapter}.py` |
 | Server GPU context / transfer | `lmcache/v1/multiprocess/{gpu_context,modules/gpu_transfer}.py` |
 | ZMQ protocol | `lmcache/v1/multiprocess/protocols/engine.py` |
diff --git a/docs/design/integration/vllm/kv_cache_group_edits.md b/docs/design/integration/vllm/kv_cache_group_edits.md
new file mode 100644
index 0000000000..b63a3b51da
--- /dev/null
+++ b/docs/design/integration/vllm/kv_cache_group_edits.md
@@ -0,0 +1,150 @@
+# KV Cache Group Edits
+
+## Summary
+
+`lmcache/integration/vllm/kv_cache_group_edits.py` is the single place where
+vLLM KV cache groups are re-presented ("edited") before LMCache registration.
+The connector calls `apply_kv_cache_group_edits(kv_cache_config, kv_caches)`
+once in `register_kv_caches`, and the edited dict feeds both engine-group-info
+creation (`kv_cache_groups.py`) and transfer registration.
+
+LMCache derives each group's transfer metadata (block size, page layout,
+dtype) from the registered tensors, and interprets store/retrieve block IDs in
+vLLM's scheduler-side block-id space (`kv_cache_spec.block_size` units). The
+edits exist to restore one invariant the raw tensors can violate:
+
+> **The registered tensor's paging granularity must equal the block-id
+> granularity.**
+
+## Structure
+
+Each case is one `KVCacheGroupEdit` rule in the module's `_EDITS` registry:
+`matches(spec, kv_cache)` decides **structurally** — from the vLLM spec kind
+(`get_kv_cache_spec_kind`, which also unwraps `UniformTypeKVCacheSpecs`) and
+the registered tensor, never from model name or architecture — and
+`apply(spec, kv_cache)` produces a view over the same storage. First matching
+rule wins; unmatched layers pass through. Covering a new group kind means
+adding one rule.
+
+Model name/arch is deliberately not an input: the same architecture yields
+different group structures depending on runtime decisions
+(`mamba_cache_mode`, attention backend's kernel block size, TP), all of which
+the config + tensors already resolve.
+
+## Edits
+
+Both rules apply only to Mamba-hybrid models (the registry is only consulted
+when `kv_cache_config.has_mamba_layers`).
+
+### 1. Mamba state pages
+
+A Mamba / linear-attention layer (e.g. Qwen3.5 GDN) registers `[conv_state,
+ssm_state]` — two tensors with different shapes and dtypes, laid out
+contiguously in one padded page (`conv | ssm | pad`). The raw pair trips
+format discovery (the SSM view starts mid-page). The edit reinterprets each
+page as one bf16 tensor shaped `(num_blocks, 2, block_size, 1, head_size)`
+over the same storage, where `head_size` is derived so the bytes fill the page
+exactly.
+
+### 2. Sub-paged full attention
+
+vLLM unifies page sizes across hybrid groups by inflating the attention
+*logical* block size (`vllm/platforms/interface.py:_align_hybrid_block_size`;
+Qwen3.5-0.8B: 544), while the attention backend re-pages the physical tensor
+at its own *kernel* block size (`vllm/v1/worker/utils.py:
+prepare_kernel_block_sizes`; FlashAttention on hybrids: 32). Logical block `n`
+then occupies the `k = logical/kernel` contiguous kernel pages
+`n*k .. n*k+k-1` (vLLM expands the worker-side block table the same way,
+`BlockTable.map_to_kernel_blocks`); the scheduler-side block IDs LMCache
+receives stay logical.
+
+Registering the raw kernel-paged tensor makes LMCache discover
+`block_size == kernel < logical`, and `_derive_compression_metadata`
+(`lmcache/v1/kv_layer_groups.py`) misclassifies the group as compressed:
+only `1/k` of each chunk's KV is transferred, addressed against the kernel
+page space. The edit re-views the tensor as
+`(num_kernel_pages / k, 2, logical_block_size, 1, head_size)` — a pure
+`view()`, valid because `k` kernel pages tile each logical page's bytes
+exactly (enforced; see Invariants).
+
+## Startup validation
+
+`validate_kv_cache_groups` (called at connector init and again at
+registration) rejects group specs the transfer path cannot serve correctly,
+with one aggregated error: `CrossAttentionSpec`, and Mamba with
+`mamba_cache_mode != "align"` (no reusable snapshots). Declared slot
+compression (`compress_ratio > 1` / `tq_slot_size > 0`, e.g. DeepSeek-V4) is
+*not* rejected — those groups are served by the compression path in
+`lmcache/v1/kv_layer_groups.py` and only skipped by the edits here. Note the
+compression path still derives per-group ratios from the unified vLLM block
+size; switching it to per-group block sizes is pending in a separate PR.
+
+Reference: vLLM PR #42828 (Mooncake store HMA support) uses the same
+validate-and-reject-up-front pattern, and is the reference design for the
+deferred follow-ups (per-group store/load masks; manager-mirroring hit
+computation). Caveat for the latter: LMCache's lookup doubles as prefetch, so
+vLLM's lookup-first-then-trim flow does not map directly. See the module
+docstring for the full check-when list.
+
+## Non-edit: declared compression
+
+Groups whose spec *declares* slot compression — `MLAAttentionSpec.
+compress_ratio > 1` (DeepSeek-V4 slot packing, `storage_block_size <
+block_size`) or `TQFullAttentionSpec.tq_slot_size > 0` — genuinely store fewer
+physical slots than logical tokens. They must reach the compression path in
+`lmcache/v1/kv_layer_groups.py` unedited. (DeepSeek-V3.2's `fp8_ds_mla` cache
+packs *bytes per slot*, not slots per block: its specs keep
+`block_size == scheduler block size` and `compress_ratio == 1`, so it never
+needs an edit either.)
+
+The sub-paged rule's `matches` excludes declared-compression specs by their
+own fields (`compress_ratio` / `tq_slot_size`), and its `apply` additionally
+verifies by byte accounting that `k` kernel pages tile the logical page's
+bytes exactly — any *undeclared* packed layout fails with a loud `ValueError`
+rather than being transferred wrongly.
+
+## The opaque-page contract
+
+An edited view's dims are addressing metadata only (block id → byte range).
+The named dims are **not** semantic: a Mamba view's "K plane" is conv/ssm
+bytes, and a sub-paged attention view's "K plane" interleaves true K and V at
+kernel-page granularity (true K is not contiguous across kernel pages, so no
+logical-block view can have a pure-K plane). The synthetic head shape
+`(1, page_bytes / (2 * block_size * elem))` signals this deliberately.
+
+Byte transport round-trips correctly because store and retrieve share the same
+bijective block-id → bytes mapping. Consequences:
+
+- **Valid**: store/retrieve through the MP transfer path on the same engine
+  configuration.
+- **Not valid** for edited groups: content-aware processing (serde
+  compression, blending, head resharding, layout conversion), and sharing
+  cache entries across engines whose attention backends choose different
+  kernel block sizes (the byte order inside a logical page is
+  backend-dependent).
+
+## Invariants
+
+- Edits are pure tensor views over the registered storage — never copies.
+- A sub-paged view is only produced when `kernel_page_bytes * k ==
+  spec.page_size_bytes`; any mismatch raises `ValueError` (fail loudly rather
+  than silently transfer a compressed layout).
+- After edits, every registered tensor's block dim equals its group's
+  `kv_cache_spec.block_size`, so the server derives `compress_ratio == 1` for
+  these groups.
+
+## Code map
+
+| Area | File |
+|---|---|
+| Edits (this doc) | `lmcache/integration/vllm/kv_cache_group_edits.py` |
+| Caller | `lmcache/integration/vllm/lmcache_mp_connector.py` (`register_kv_caches`) |
+| Compression-ratio derivation (downstream consumer) | `lmcache/v1/kv_layer_groups.py` |
+| vLLM block-size inflation | `vllm/platforms/interface.py` (`_align_hybrid_block_size`) |
+| vLLM kernel-page split + block-table expansion | `vllm/v1/worker/utils.py`, `vllm/v1/worker/block_table.py` |
+| End-to-end test | `.buildkite/k3_tests/multiprocess/scripts/run-single-test.sh` (`hma_lm_eval_qwen3_5`) |
+
+Testing is end-to-end only (the `hma_lm_eval_qwen3_5` store-vs-retrieve gsm8k
+check): the edit internals are expected to change as more group kinds are
+covered, so tests pin the observable contract — faithful retrieve — rather
+than view shapes.
diff --git a/docs/source/mp/hybrid_models.rst b/docs/source/mp/hybrid_models.rst
index 9810a376a1..616c5fef87 100644
--- a/docs/source/mp/hybrid_models.rst
+++ b/docs/source/mp/hybrid_models.rst
@@ -34,6 +34,9 @@ configuration. Examples:
    * - gpt-oss
      - Interleaved sliding-window + full
      - Supported
+   * - Qwen3.5 (and other Gated-DeltaNet hybrids)
+     - Interleaved Mamba/GDN + full
+     - Supported (see below)
    * - Llama, Qwen2/Qwen3 (dense), Mistral, …
      - Single attention type
      - Supported
@@ -48,16 +51,54 @@ detects the model's KV cache groups automatically at registration time.
    back to a single unified group). You do not need
    ``--no-disable-hybrid-kv-cache-manager`` or any related flag.
 
+Mamba / Linear-Attention Hybrids
+--------------------------------
+
+Models that interleave **Mamba / Gated-DeltaNet layers** with full attention
+(e.g. ``Qwen/Qwen3.5-0.8B``) are supported. Their recurrent state caches are
+reinterpreted as opaque pages at registration time, so prefix caching and KV
+reuse work end to end. They need three extra flags:
+
+#. vLLM must run with prefix caching and the ``align`` Mamba cache mode::
+
+       vllm serve Qwen/Qwen3.5-0.8B \
+           --enable-prefix-caching --mamba-cache-mode align \
+           --kv-transfer-config \
+           '{"kv_connector":"LMCacheMPConnector", "kv_role":"kv_both"}'
+
+#. The LMCache server's ``--chunk-size`` must be a multiple of vLLM's unified
+   block size for the model (vLLM logs ``Setting attention block size to N
+   tokens`` at startup; for Qwen3.5-0.8B, ``N = 544``)::
+
+       lmcache server --chunk-size 544 --l1-size-gb 100 --eviction-policy LRU
+
+#. ``--max-num-batched-tokens`` must be at least the unified block size and
+   below twice it (LMCache raises at engine startup otherwise; setting it
+   equal to the block size is the simple choice)::
+
+       vllm serve ... --max-num-batched-tokens 544
+
+   ``align`` mode snapshots the Mamba state only at the *end* of each
+   scheduler step; a larger budget would let one step skip block boundaries,
+   leaving no snapshot for LMCache to store at those prefixes.
+
+Caveats:
+
+- Generation is **not bit-exact** between a cached and a fresh run: GDN
+  backends do not support vLLM's batch-invariant mode. Expect score-level
+  equivalence, not token-level.
+- The cached pages are byte-opaque, so content-aware features (CacheGen
+  compression, CacheBlend) do not apply, and cache entries must not be shared
+  across engines with different attention backends or kernel block sizes.
+
+See the :doc:`Qwen3.5 recipe <../recipes/qwen3_5>` for the validated
+end-to-end commands.
+
 What Is Not Supported Yet
 -------------------------
 
-- **Mamba / linear-attention hybrids** (e.g. Qwen3-Next, Qwen3.5, and other
-  Gated-DeltaNet models). These layers keep a recurrent *state cache* (a
-  convolution + SSM state) instead of a paged key/value cache, which LMCache's
-  transfer path cannot represent today. Such models will fail to register with
-  the LMCache server. Tracking support is future work.
-- **DeepSeek-V4-style compressed / indexer caches** are likewise not yet
-  handled by the multiprocess connector.
+- **DeepSeek-V4-style compressed / indexer caches** are not yet handled by the
+  multiprocess connector.
 
 Verifying Correctness
 ---------------------
diff --git a/docs/source/recipes/index.rst b/docs/source/recipes/index.rst
index 8dd0fdd370..ecd8a53c5d 100644
--- a/docs/source/recipes/index.rst
+++ b/docs/source/recipes/index.rst
@@ -26,8 +26,8 @@ Each recipe page is intentionally minimal:
 - **Caveats** -- known limitations, if any.
 
 For the generic LMCache + engine wiring (ports, remote hosts, in-process mode,
-sending a first request), see :doc:`../getting_started/quickstart` and
-:doc:`../mp/quickstart`. Recipes assume those pages as a prerequisite.
+sending a first request), see :doc:`../mp/quickstart`. Recipes assume that
+page as a prerequisite.
 
 Supported architectures
 -----------------------
@@ -93,6 +93,13 @@ Supported architectures
      - —
      - :doc:`qwen3`
 
+   * - ``Qwen3_5ForConditionalGeneration``
+     - ``Qwen/Qwen3.5-0.8B``
+     - ✓
+     - —
+     - —
+     - :doc:`qwen3_5`
+
    * - ``LlamaForCausalLM``
      - ``meta-llama/Meta-Llama-3.1-70B-Instruct``
      - ✓
@@ -138,6 +145,7 @@ To add a new architecture:
    devstral
    gpt_oss
    qwen3
+   qwen3_5
    llama
    phi3
    mixtral
\ No newline at end of file
diff --git a/docs/source/recipes/qwen3_5.rst b/docs/source/recipes/qwen3_5.rst
new file mode 100644
index 0000000000..2a94418fe4
--- /dev/null
+++ b/docs/source/recipes/qwen3_5.rst
@@ -0,0 +1,96 @@
+.. _recipe_qwen3_5:
+
+Qwen3_5ForConditionalGeneration
+===============================
+
+A hybrid architecture interleaving Mamba / Gated-DeltaNet (GDN) linear-attention
+layers with full-attention layers. LMCache reinterprets the recurrent state
+caches as opaque pages at registration time; see :doc:`../mp/hybrid_models`.
+
+Validated models
+----------------
+
+- `Qwen/Qwen3.5-0.8B <https://huggingface.co/Qwen/Qwen3.5-0.8B>`_
+
+.. tab-set::
+   :sync-group: engine
+
+   .. tab-item:: vLLM
+
+      **Engine documentation:**
+      `Qwen3.5 in vLLM supported models
+      <https://docs.vllm.ai/en/latest/models/supported_models.html#text-generation>`_
+      (architecture ``Qwen3_5ForConditionalGeneration``).
+
+      **Status:** Validated with LMCache.
+
+      Start the LMCache MP server. ``--chunk-size`` must be a multiple of
+      vLLM's unified block size for the model — vLLM logs ``Setting attention
+      block size to N tokens`` at startup; for Qwen3.5-0.8B, ``N = 544``:
+
+      .. code-block:: bash
+
+         lmcache server --chunk-size 544 --l1-size-gb 100 --eviction-policy LRU
+
+      |
+
+      **Qwen3.5-0.8B** (1 GPU):
+
+      .. code-block:: bash
+
+         vllm serve Qwen/Qwen3.5-0.8B \
+             --enable-prefix-caching \
+             --mamba-cache-mode align \
+             --max-num-batched-tokens 544 \
+             --kv-transfer-config \
+             '{"kv_connector":"LMCacheMPConnector", "kv_role":"kv_both"}'
+
+      ``--mamba-cache-mode align`` is required (GDN does not support the
+      ``all`` mode). ``--max-num-batched-tokens`` must be at least the unified
+      block size and below twice it — LMCache raises at engine startup
+      otherwise. ``align`` snapshots the Mamba state only at scheduler-step
+      ends, so each prefill step must advance exactly one block for every
+      block boundary to hold a reusable snapshot.
+
+      For the generic LMCache + vLLM wiring (ports, remote hosts, in-process
+      mode), see :doc:`../mp/quickstart`.
+
+   .. tab-item:: SGLang
+
+      **Status:** Not validated with LMCache.
+
+   .. tab-item:: TRT-LLM
+
+      **Status:** Not supported. LMCache TRT-LLM integration is in progress.
+
+CacheBlend support
+------------------
+
+Not supported: the hybrid groups' cached pages are byte-opaque (see Caveats).
+
+Compression support
+-------------------
+
+.. list-table::
+   :header-rows: 1
+   :widths: 25 20 55
+
+   * - Method
+     - Status
+     - Notes
+   * - :doc:`CacheGen <../kv_cache_optimizations/compression/cachegen>`
+     - Not supported
+     - Hybrid groups' cached pages are byte-opaque.
+
+Caveats
+-------
+
+- Generation is **not bit-exact** between a cached and a fresh run: GDN
+  backends do not support vLLM's batch-invariant mode. Expect score-level
+  equivalence, not token-level (the CI gate is the ``hma_lm_eval_qwen3_5``
+  gsm8k store-vs-retrieve comparison).
+- Cached pages for the Mamba and full-attention groups are byte-opaque views,
+  so content-aware processing does not apply, and cache entries must not be
+  shared across engines with different attention backends or kernel block
+  sizes.
+- vLLM's Mamba prefix caching in ``align`` mode is experimental.
diff --git a/lmcache/integration/vllm/kv_cache_group_edits.py b/lmcache/integration/vllm/kv_cache_group_edits.py
new file mode 100644
index 0000000000..c8feb86ed5
--- /dev/null
+++ b/lmcache/integration/vllm/kv_cache_group_edits.py
@@ -0,0 +1,402 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Centralized edits to vLLM kv cache specs.
+
+This is needed to mask out attention-specific details while making sure that
+LMCache can still store / load KV cache correctly.
+
+Currently there are two edits, one per :class:`KVCacheGroupEdit` subclass
+below. Both are for Mamba-hybrid models; the registry is only consulted when
+``kv_cache_config.has_mamba_layers``. :func:`validate_kv_cache_groups`
+additionally rejects, at startup, group specs the transfer path cannot serve
+correctly yet (see its docstring).
+
+Reference design: vLLM PR #42828 ("[KVConnector][DSV4] HMA support for
+Mooncake store connector") solves the same problem class for an external KV
+store. Check it again when working on any of the following:
+
+- Extending :func:`validate_kv_cache_groups` (its connector validates and
+  rejects unsupported specs up front, with one aggregated error).
+- Per-group store/load masks (SWA / Mamba tail-only transfer, the
+  "sliding-window load-plan trimming" deferred in the hybrid design doc):
+  its ``MooncakeStoreCoordinator.store_mask`` / ``load_mask`` derive masks
+  from vLLM's own per-spec managers. Requires per-group objects in LMCache
+  (``ObjectKey.object_group_id``, LMCache #3608).
+- Hit-length computation for hybrid models: its ``ExternalCachedBlockPool``
+  duck-types vLLM's ``BlockPool`` over a ``(group_id, hash)`` existence set
+  and reuses ``KVCacheSpecRegistry`` manager classes, so promised hits are
+  consumable by construction. NOTE: LMCache's lookup doubles as prefetch, so
+  vLLM's lookup-first-then-trim flow does not map directly -- this needs its
+  own design before borrowing.
+- Eagle + HMA: see its ``apply_eagle`` notes (the eagle last-block prune must
+  be applied exactly once between hit-length and mask computation).
+"""
+
+# Future
+from __future__ import annotations
+
+# Standard
+from abc import ABC, abstractmethod
+from collections import Counter
+from collections.abc import Mapping
+from typing import TypeAlias
+
+# Third Party
+from vllm.v1.kv_cache_interface import (
+    KVCacheConfig,
+    KVCacheSpec,
+    KVCacheSpecKind,
+    get_kv_cache_spec_kind,
+)
+import torch
+
+# First Party
+from lmcache.logging import init_logger
+
+logger = init_logger(__name__)
+
+# One registered cache value: a paged KV tensor, or [conv_state, ssm_state]
+# for Mamba layers.
+RegisteredKVCache: TypeAlias = torch.Tensor | list[torch.Tensor]
+
+# Synthetic head count for a reinterpreted page. The page is opaque bytes, so
+# one "head" holding the whole per-(K/V) slab is enough; head_size is derived
+# to fill the page.
+_SYNTHETIC_NUM_HEADS = 1
+
+# Standard-paged (non-MLA) attention kinds eligible for the sub-paged edit.
+_SUBPAGEABLE_ATTENTION_KINDS = frozenset(
+    {
+        KVCacheSpecKind.FULL_ATTENTION,
+        KVCacheSpecKind.SLIDING_WINDOW,
+        KVCacheSpecKind.CHUNKED_LOCAL_ATTENTION,
+        KVCacheSpecKind.SINK_FULL_ATTENTION,
+    }
+)
+
+
+def _declares_slot_compression(spec: KVCacheSpec) -> bool:
+    """Return whether a spec declares slot compression (must not be edited).
+
+    Covers ``MLAAttentionSpec.compress_ratio > 1`` (DeepSeek-V4 slot packing)
+    and ``TQFullAttentionSpec.tq_slot_size > 0`` (TurboQuant slots); such
+    groups belong to the compression path in ``lmcache.v1.kv_layer_groups``.
+    """
+    return (
+        getattr(spec, "compress_ratio", 1) > 1 or getattr(spec, "tq_slot_size", 0) > 0
+    )
+
+
+def _leaf_specs(spec: KVCacheSpec) -> list[KVCacheSpec]:
+    """Return a group spec's leaf specs, unwrapping ``UniformTypeKVCacheSpecs``."""
+    inner = getattr(spec, "kv_cache_specs", None)
+    if isinstance(inner, dict):
+        return list(inner.values())
+    return [spec]
+
+
+def validate_kv_cache_groups(kv_cache_config: KVCacheConfig | None) -> None:
+    """Reject KV cache group specs the transfer path cannot serve correctly.
+
+    Rejected, with one aggregated error listing every offending group:
+
+    - ``CrossAttentionSpec`` (encoder-decoder caches).
+    - Mamba groups with ``mamba_cache_mode != "align"``: other modes keep no
+      reusable per-block state snapshots.
+
+    Specs declaring slot compression (``compress_ratio > 1`` /
+    ``tq_slot_size > 0``, e.g. DeepSeek-V4) are NOT rejected: they are served
+    by the compression path in ``lmcache.v1.kv_layer_groups`` and merely
+    skipped by the edits here (see ``_declares_slot_compression``).
+
+    Args:
+        kv_cache_config: vLLM ``KVCacheConfig``; ``None`` skips validation
+            (callers without the config validate again at registration).
+
+    Raises:
+        ValueError: Listing every unsupported group and why.
+    """
+    if kv_cache_config is None:
+        return
+    unsupported: list[str] = []
+    for group_idx, group in enumerate(kv_cache_config.kv_cache_groups):
+        for spec in _leaf_specs(group.kv_cache_spec):
+            kind = get_kv_cache_spec_kind(spec)
+            if kind == KVCacheSpecKind.CROSS_ATTENTION:
+                unsupported.append(f"group {group_idx}: CrossAttentionSpec")
+            elif (
+                kind == KVCacheSpecKind.MAMBA
+                and getattr(spec, "mamba_cache_mode", "none") != "align"
+            ):
+                unsupported.append(
+                    f"group {group_idx}: MambaSpec with mamba_cache_mode="
+                    f"'{getattr(spec, 'mamba_cache_mode', 'none')}' "
+                    f"(only 'align' keeps reusable state snapshots)"
+                )
+    if unsupported:
+        raise ValueError(
+            "LMCache cannot serve this model's KV cache groups: "
+            + "; ".join(unsupported)
+            + ". See lmcache/integration/vllm/kv_cache_group_edits.py."
+        )
+
+
+def _synthetic_attention_shape(elems_per_page: int, block_size: int) -> tuple[int, int]:
+    """Factor a page's element count into the synthetic attention layout.
+
+    Args:
+        elems_per_page: Total elements in one page (one logical block).
+        block_size: Logical block size (tokens per page).
+
+    Returns:
+        ``(num_heads, head_size)`` such that
+        ``2 * block_size * num_heads * head_size == elems_per_page``.
+
+    Raises:
+        ValueError: If the page size does not factor into the target shape.
+    """
+    denom = 2 * block_size * _SYNTHETIC_NUM_HEADS
+    if elems_per_page % denom != 0:
+        raise ValueError(
+            f"page ({elems_per_page} elems) does not factor into "
+            f"(2, block_size={block_size}, num_heads={_SYNTHETIC_NUM_HEADS}, head_size)"
+        )
+    return _SYNTHETIC_NUM_HEADS, elems_per_page // denom
+
+
+class KVCacheGroupEdit(ABC):
+    """One structural edit rule for a KV cache group's registered cache.
+
+    ``matches`` must be side-effect free and decide purely from the vLLM spec
+    and the registered cache value; ``apply`` must return a view over the same
+    storage (never a copy). ``name`` labels the rule in logs.
+    """
+
+    name: str
+
+    @abstractmethod
+    def matches(self, spec: KVCacheSpec, kv_cache: RegisteredKVCache) -> bool:
+        """Return whether this rule applies to the layer's registered cache.
+
+        Args:
+            spec: The layer's vLLM KV cache spec (from its group).
+            kv_cache: The layer's registered cache value -- a tensor, or a
+                list of tensors for Mamba layers.
+        """
+
+    @abstractmethod
+    def apply(self, spec: KVCacheSpec, kv_cache: RegisteredKVCache) -> torch.Tensor:
+        """Return the edited view for a layer this rule matched.
+
+        Args:
+            spec: The layer's vLLM KV cache spec (from its group).
+            kv_cache: The layer's registered cache value.
+
+        Raises:
+            ValueError: If the cache's layout violates the rule's invariants.
+        """
+
+
+class _MambaPageViewEdit(KVCacheGroupEdit):
+    """Convert a Mamba page to its equivalent: a sliding-window-style layer.
+
+    A Mamba layer registers ``[conv_state, ssm_state]`` -- two tensors with
+    different shapes and dtypes sharing one padded page per block
+    (``conv | ssm | pad``) -- which LMCache's attention-shaped transfer path
+    cannot represent. Each page is one recurrent state snapshot, equivalent
+    for caching purposes to one block of a sliding-window attention layer with
+    window == block_size: only the last matched block is ever consumed. The
+    edit reinterprets the page buffer as one
+    ``[#blocks, 2, block_size, 1, head_size]`` tensor in the conv state's
+    dtype, with ``head_size`` derived to fill the page exactly.
+
+    The view's dims are addressing metadata only; the bytes are opaque
+    (conv | ssm | pad, not K/V), so content-aware processing does not apply.
+    """
+
+    name = "mamba-page-view"
+
+    def matches(self, spec: KVCacheSpec, kv_cache: RegisteredKVCache) -> bool:
+        return get_kv_cache_spec_kind(spec) == KVCacheSpecKind.MAMBA
+
+    def apply(self, spec: KVCacheSpec, kv_cache: RegisteredKVCache) -> torch.Tensor:
+        # vLLM lays out one padded page per block as (conv | ssm | pad), and
+        # the conv state is the view that starts at the page base: its leading
+        # dim is the block count and its per-block stride is one full page.
+        # Re-striding it therefore covers the whole page, ssm and pad included.
+        if not isinstance(kv_cache, list) or not kv_cache:
+            raise ValueError(
+                f"expected a Mamba [conv_state, ssm_state] tensor list, "
+                f"got {type(kv_cache).__name__}"
+            )
+        conv_state = kv_cache[0]
+        if conv_state.storage_offset() != 0:
+            raise ValueError(
+                f"Mamba conv state must view the page base, got "
+                f"storage_offset={conv_state.storage_offset()}"
+            )
+        if conv_state.stride(0) * conv_state.element_size() != spec.page_size_bytes:
+            raise ValueError(
+                f"Mamba conv state per-block stride "
+                f"({conv_state.stride(0) * conv_state.element_size()} bytes) "
+                f"does not equal the page size ({spec.page_size_bytes} bytes)"
+            )
+        num_blocks = conv_state.shape[0]
+        elems_per_page = spec.page_size_bytes // conv_state.element_size()
+        num_heads, head_size = _synthetic_attention_shape(
+            elems_per_page, spec.block_size
+        )
+        flat = conv_state.as_strided((num_blocks, elems_per_page), (elems_per_page, 1))
+        return flat.reshape(num_blocks, 2, spec.block_size, num_heads, head_size)
+
+
+class _SubpagedAttentionViewEdit(KVCacheGroupEdit):
+    """Re-view a kernel-paged attention tensor as logical-block pages.
+
+    For a Mamba-hybrid model vLLM inflates the attention block size to align
+    with the Mamba page (e.g. 544 for Qwen3.5-0.8B), and that size is used for
+    all prefix-caching logic at the scheduler. But at the worker the attention
+    kernel has to run at block size 32 for numerical stability (vLLM #27753,
+    working around the NaN-propagation issue
+    Dao-AILab/flash-attention#1974), so the registered tensor is paged as
+
+        ``[#blocks, 2, 32, #heads, head_size]``
+
+    which makes LMCache detect block size 32, mistake the group for a
+    DeepSeek-compression layer (``block size < scheduler block size``), and
+    corrupt the store/retrieve. Fix: re-view the tensor at the scheduler
+    block size (one logical block = its 17 contiguous kernel pages),
+
+        ``[#blocks / 17, 2, 544, 1, head_size']``
+
+    Cost: before this fix ``kv_caches[:, 0]`` is just the K tensor; after, it
+    interleaves K and V at kernel-page granularity. The bytes round-trip
+    correctly (store and retrieve share the mapping), but the dims are no
+    longer semantic, so content-aware processing does not apply.
+    """
+
+    name = "subpaged-attention-view"
+
+    def matches(self, spec: KVCacheSpec, kv_cache: RegisteredKVCache) -> bool:
+        return (
+            # Standard-paged attention only; MLA layouts and declared slot
+            # compression (DeepSeek) belong to other transfer paths.
+            get_kv_cache_spec_kind(spec) in _SUBPAGEABLE_ATTENTION_KINDS
+            and not _declares_slot_compression(spec)
+            # (num_blocks, 2, block_size, num_heads, head_size) layout whose
+            # block dim disagrees with the scheduler block-id unit -- the
+            # backend re-paged the tensor at its kernel block size.
+            and isinstance(kv_cache, torch.Tensor)
+            and kv_cache.ndim == 5
+            and kv_cache.shape[2] != spec.block_size
+        )
+
+    def apply(self, spec: KVCacheSpec, kv_cache: RegisteredKVCache) -> torch.Tensor:
+        """Re-view ``kv_cache`` at logical-block granularity.
+
+        The tensor is kernel-paged as ``(num_kernel_pages, 2,
+        kernel_block_size, num_kv_heads, head_size)``; the result is
+        ``(num_logical_blocks, 2, spec.block_size, num_heads, head_size)``
+        over the same storage.
+
+        Raises:
+            ValueError: If the layout is not the expected kernel-paged shape,
+                the sizes do not divide evenly, or the kernel pages of one
+                logical block do not tile its page bytes exactly (which would
+                indicate an undeclared packed layout that must not be edited).
+        """
+        if not isinstance(kv_cache, torch.Tensor) or kv_cache.shape[1] != 2:
+            got = (
+                tuple(kv_cache.shape)
+                if isinstance(kv_cache, torch.Tensor)
+                else type(kv_cache).__name__
+            )
+            raise ValueError(
+                f"expected a (num_blocks, 2, block_size, num_heads, head_size) "
+                f"attention KV tensor, got {got}"
+            )
+        logical_block_size = spec.block_size
+        kernel_block_size = kv_cache.shape[2]
+        if logical_block_size % kernel_block_size != 0:
+            raise ValueError(
+                f"logical block size {logical_block_size} is not a multiple of "
+                f"kernel block size {kernel_block_size}"
+            )
+        ratio = logical_block_size // kernel_block_size
+
+        num_kernel_pages = kv_cache.shape[0]
+        if num_kernel_pages % ratio != 0:
+            raise ValueError(
+                f"kernel page count {num_kernel_pages} is not a multiple of "
+                f"the logical/kernel block ratio {ratio}"
+            )
+        kernel_page_bytes = kv_cache.shape[1:].numel() * kv_cache.element_size()
+        if kernel_page_bytes * ratio != spec.page_size_bytes:
+            raise ValueError(
+                f"{ratio} kernel pages ({kernel_page_bytes * ratio} bytes) do "
+                f"not tile the logical page ({spec.page_size_bytes} bytes)"
+            )
+        if not kv_cache.is_contiguous():
+            raise ValueError(
+                "kernel-paged attention KV tensor must be contiguous to "
+                "re-view as logical pages"
+            )
+
+        num_blocks = num_kernel_pages // ratio
+        elems_per_page = spec.page_size_bytes // kv_cache.element_size()
+        num_heads, head_size = _synthetic_attention_shape(
+            elems_per_page, logical_block_size
+        )
+        return kv_cache.view(num_blocks, 2, logical_block_size, num_heads, head_size)
+
+
+# Rule registry, in match priority order.
+_EDITS: tuple[KVCacheGroupEdit, ...] = (
+    _MambaPageViewEdit(),
+    _SubpagedAttentionViewEdit(),
+)
+
+
+def apply_kv_cache_group_edits(
+    kv_cache_config: KVCacheConfig | None,
+    kv_caches: Mapping[str, RegisteredKVCache],
+) -> dict[str, RegisteredKVCache]:
+    """Apply all KV cache group metadata edits for LMCache registration.
+
+    Each layer is checked against the ``_EDITS`` rules (first match wins) and
+    re-viewed by the matching rule; layers matching no rule pass through
+    unchanged. ``None`` configs and configs without Mamba groups are returned
+    as-is (as a dict): all current rules only apply to Mamba-hybrid models.
+
+    Args:
+        kv_cache_config: vLLM ``KVCacheConfig`` (read for per-group specs).
+        kv_caches: Registered tensors keyed by layer name. Mamba entries are
+            ``[conv_state, ssm_state]`` lists; others are single tensors.
+
+    Returns:
+        A new ``dict`` with edited layers re-viewed, others untouched.
+
+    Raises:
+        ValueError: If the groups fail :func:`validate_kv_cache_groups`, or a
+            matched layer's cache layout violates its rule's invariants (see
+            each rule's ``apply``).
+    """
+    # Backstop for connectors initialized without a kv_cache_config.
+    validate_kv_cache_groups(kv_cache_config)
+    if kv_cache_config is None or not kv_cache_config.has_mamba_layers:
+        return dict(kv_caches)
+
+    edited = dict(kv_caches)
+    counts: Counter[str] = Counter()
+    for group in kv_cache_config.kv_cache_groups:
+        spec = group.kv_cache_spec
+        for name in group.layer_names:
+            for edit in _EDITS:
+                if edit.matches(spec, kv_caches[name]):
+                    edited[name] = edit.apply(spec, kv_caches[name])
+                    counts[edit.name] += 1
+                    break
+    logger.info(
+        "KV cache group edits applied: %s",
+        dict(counts) if counts else "none",
+    )
+    return edited
diff --git a/lmcache/integration/vllm/lmcache_mp_connector.py b/lmcache/integration/vllm/lmcache_mp_connector.py
index 1357cbbed6..57b5d1d390 100644
--- a/lmcache/integration/vllm/lmcache_mp_connector.py
+++ b/lmcache/integration/vllm/lmcache_mp_connector.py
@@ -38,6 +38,10 @@ class SupportsHMA:  # type: ignore[no-redef]
 # First Party
 from lmcache import torch_dev
 from lmcache.banner import print_banner_once
+from lmcache.integration.vllm.kv_cache_group_edits import (
+    apply_kv_cache_group_edits,
+    validate_kv_cache_groups,
+)
 from lmcache.integration.vllm.kv_cache_groups import (
     create_engine_group_infos_from_vllm,
 )
@@ -97,6 +101,43 @@ class SupportsHMA:  # type: ignore[no-redef]
 
 
 # Helper functions
+def validate_mamba_step_alignment(vllm_config: VllmConfig) -> None:
+    """Reject scheduler configs that can skip Mamba state snapshots.
+
+    In ``mamba_cache_mode="align"`` vLLM snapshots the recurrent state only at
+    the end of each scheduler step, and a step that advances more than one
+    block fills the skipped block-table positions with the null block
+    (``MambaManager.allocate_new_blocks``). LMCache keys chunks by token hash,
+    so a skipped boundary would be stored as null-block garbage under a valid
+    key and silently corrupt any request that later resumes from that prefix.
+    Requiring ``block_size <= max_num_batched_tokens < 2 * block_size`` makes
+    vLLM's block-aligned splitting (``Scheduler._mamba_block_aligned_split``)
+    advance every mid-prefill step by exactly one block, so every chunk
+    boundary holds a real snapshot.
+
+    Args:
+        vllm_config: The vLLM config; only Mamba-hybrid models in ``align``
+            cache mode are constrained, others pass.
+
+    Raises:
+        ValueError: If ``max_num_batched_tokens`` is not in
+            ``[block_size, 2 * block_size)``.
+    """
+    if getattr(vllm_config.cache_config, "mamba_cache_mode", "none") != "align":
+        return
+    block_size = vllm_config.cache_config.block_size
+    max_batched = vllm_config.scheduler_config.max_num_batched_tokens
+    if not (block_size <= max_batched < 2 * block_size):
+        raise ValueError(
+            f"Mamba-hybrid models with LMCache require "
+            f"block_size <= max_num_batched_tokens < 2 * block_size so every "
+            f"prefill step advances exactly one block and every block boundary "
+            f"gets a state snapshot; got max_num_batched_tokens={max_batched}, "
+            f"block_size={block_size}. Set --max-num-batched-tokens "
+            f"{block_size}."
+        )
+
+
 def build_parallel_strategy_from_vllm_config(
     vllm_config: "VllmConfig",
 ) -> ParallelStrategy:
@@ -478,6 +519,10 @@ def __init__(
     ):
         super().__init__(vllm_config, role, kv_cache_config)
 
+        # Fail fast, before the server handshake below.
+        validate_mamba_step_alignment(vllm_config)
+        validate_kv_cache_groups(getattr(self, "_kv_cache_config", None))
+
         assert vllm_config.kv_transfer_config is not None
         server_host = vllm_config.kv_transfer_config.get_from_extra_config(
             "lmcache.mp.host", "tcp://localhost"
@@ -569,6 +614,9 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
         """
         logger.info("Registering kv caches!")
         kv_cache_config = getattr(self, "_kv_cache_config", None)
+        # Must precede both group-info creation and transfer registration so
+        # they see the same edited views.
+        kv_caches = apply_kv_cache_group_edits(kv_cache_config, kv_caches)
         engine_group_infos = create_engine_group_infos_from_vllm(
             kv_cache_config,
             kv_caches,

From fca2e497a0c90955e3e69c1ae8071401f21ad710 Mon Sep 17 00:00:00 2001
From: Chris Manteuffel <christopher.manteuffel@intel.com>
Date: Wed, 10 Jun 2026 16:39:50 -0700
Subject: [PATCH 25/57] (fix) Add missing enum to GPUVKFormat (#3606)

---
 csrc/sycl/mem_kernels_sycl.cpp |  6 +++++-
 csrc/sycl/mem_kernels_sycl.h   | 10 ++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/csrc/sycl/mem_kernels_sycl.cpp b/csrc/sycl/mem_kernels_sycl.cpp
index 3465867c92..72d625e308 100644
--- a/csrc/sycl/mem_kernels_sycl.cpp
+++ b/csrc/sycl/mem_kernels_sycl.cpp
@@ -568,10 +568,14 @@ void single_layer_kv_transfer(torch::Tensor& lmc_key_value_cache,
     vllm_block_key_stride_in_64bit =
         vllm_key_value_cache.stride(1) / elements_per_entry;
     vllm_value_offset = vllm_key_value_cache.stride(0) / elements_per_entry;
-  } else {  // NL_X_NB_TWO_BS_NH_HS
+  } else if (gpu_kv_format == GPUKVFormat::NL_X_NB_TWO_BS_NH_HS) {
     vllm_block_key_stride_in_64bit =
         vllm_key_value_cache.stride(0) / elements_per_entry;
     vllm_value_offset = vllm_key_value_cache.stride(1) / elements_per_entry;
+  } else {
+    throw std::runtime_error(
+        "Unsupported non-MLA GPUKVFormat in single_layer_kv_transfer: " +
+        std::to_string(static_cast<int>(gpu_kv_format)));
   }
 
   int n = num_heads * head_size_in_64bit;
diff --git a/csrc/sycl/mem_kernels_sycl.h b/csrc/sycl/mem_kernels_sycl.h
index 16d4656344..def2a3ab5e 100644
--- a/csrc/sycl/mem_kernels_sycl.h
+++ b/csrc/sycl/mem_kernels_sycl.h
@@ -98,6 +98,16 @@ enum class GPUKVFormat : int {
   - SGLang MHA via the MP daemon path
   physical shape per layer: [num_blocks, block_size, num_heads, head_size]
   */
+
+  NL_X_NB_NH_BS_TWO_HS = 10,
+  /*
+  used by:
+  - vLLM non-MLA blocks-first attention with K/V fused into the trailing dim
+  physical shape per layer: [num_blocks, num_heads, block_size, 2, head_size]
+  (recovered by splitting the fused trailing [block_size, 2 * head_size]).
+  Currently only reached via the host gather/scatter path, not the SYCL
+  transfer kernels.
+  */
 };
 
 void multi_layer_kv_transfer(

From efa6900abedafa559672a68b80c8129e8e8787d0 Mon Sep 17 00:00:00 2001
From: aeon-x <talexcao@gmail.com>
Date: Wed, 10 Jun 2026 16:46:10 -0700
Subject: [PATCH 26/57] init

Signed-off-by: aeon-x <talexcao@gmail.com>
---
 .../mp_coordinator/l2_usage_and_eviction.md   | 168 +++++++++++++
 lmcache/v1/distributed/eviction.py            |   2 +-
 lmcache/v1/distributed/internal_api.py        |   3 +-
 lmcache/v1/distributed/l2_adapters/base.py    |   2 +-
 .../l2_adapters/raw_block_l2_adapter.py       |   2 +-
 lmcache/v1/distributed/storage_manager.py     |  10 +
 lmcache/v1/mp_coordinator/app.py              |  46 +++-
 lmcache/v1/mp_coordinator/config.py           |  14 ++
 lmcache/v1/mp_coordinator/http_apis/l2_api.py | 202 +++++++++++++++
 lmcache/v1/mp_coordinator/l2/__init__.py      |   1 +
 .../v1/mp_coordinator/l2/event_listener.py    | 140 +++++++++++
 .../mp_coordinator/l2/eviction_controller.py  | 193 +++++++++++++++
 lmcache/v1/mp_coordinator/l2/quota_store.py   |  92 +++++++
 lmcache/v1/mp_coordinator/l2/usage_tracker.py | 121 +++++++++
 lmcache/v1/mp_coordinator/schemas.py          | 123 ++++++++++
 lmcache/v1/multiprocess/config.py             |  51 ++++
 lmcache/v1/multiprocess/http_server.py        |  29 +++
 .../test_eviction_controller.py               | 175 +++++++++++++
 tests/v1/mp_coordinator/test_l2_api.py        | 229 ++++++++++++++++++
 tests/v1/mp_coordinator/test_quota_store.py   |  64 +++++
 tests/v1/mp_coordinator/test_usage_tracker.py |  91 +++++++
 21 files changed, 1744 insertions(+), 14 deletions(-)
 create mode 100644 docs/design/v1/mp_coordinator/l2_usage_and_eviction.md
 create mode 100644 lmcache/v1/mp_coordinator/http_apis/l2_api.py
 create mode 100644 lmcache/v1/mp_coordinator/l2/__init__.py
 create mode 100644 lmcache/v1/mp_coordinator/l2/event_listener.py
 create mode 100644 lmcache/v1/mp_coordinator/l2/eviction_controller.py
 create mode 100644 lmcache/v1/mp_coordinator/l2/quota_store.py
 create mode 100644 lmcache/v1/mp_coordinator/l2/usage_tracker.py
 create mode 100644 tests/v1/mp_coordinator/test_eviction_controller.py
 create mode 100644 tests/v1/mp_coordinator/test_l2_api.py
 create mode 100644 tests/v1/mp_coordinator/test_quota_store.py
 create mode 100644 tests/v1/mp_coordinator/test_usage_tracker.py

diff --git a/docs/design/v1/mp_coordinator/l2_usage_and_eviction.md b/docs/design/v1/mp_coordinator/l2_usage_and_eviction.md
new file mode 100644
index 0000000000..141ec8a6e6
--- /dev/null
+++ b/docs/design/v1/mp_coordinator/l2_usage_and_eviction.md
@@ -0,0 +1,168 @@
+# Fleet-Wide L2 Usage Tracking and Eviction
+
+A coordinator-level capability that gives fleet-wide visibility into per-tenant
+L2 cache usage and enforces per-``cache_salt`` byte quotas via LRU eviction.
+MP servers **report store/lookup events** to the coordinator; the coordinator
+aggregates usage, manages quotas, and periodically selects LRU keys to evict
+when a tenant exceeds its quota. It is **opt-in** (gated by
+``l2_event_reporting`` in ``CoordinatorConfig``) and **additive** (the existing
+per-server eviction is unchanged).
+
+Code: `lmcache/v1/mp_coordinator/l2/` (coordinator side),
+`lmcache/v1/mp_coordinator/http_apis/l2_api.py` (REST endpoints),
+`lmcache/v1/mp_coordinator/schemas.py` (wire types),
+`lmcache/v1/multiprocess/http_server.py` (MP-server wiring).
+
+## Why
+
+L2 eviction today is **local to each MP server**: the ``IsolatedLRUEvictionPolicy``
+tracks only what that server stored and enforces quotas within that scope. With
+a shared L2 backend (e.g. S3), multiple servers write to the same storage, but
+no single server has a fleet-wide view of total per-tenant usage. The coordinator
+centralizes usage accounting and quota enforcement so limits apply to the
+aggregate, not per-replica.
+
+## Architecture
+
+```
+MP server (store/lookup)
+  L2 adapter fires on_l2_keys_stored / on_l2_keys_accessed
+        │
+        ▼
+  CoordinatorL2EventListener (L2AdapterListener)
+    converts ObjectKey → CacheKey, buffers UsageEvents
+        │  flush every l2_event_flush_interval (default 1s)
+        │
+        ▼
+  POST /l2/events ──▶ Coordinator
+                        ├─ UsageTracker: per-salt byte accounting
+                        ├─ CoordinatorEvictionController: per-salt LRU
+                        └─ QuotaStore: per-salt byte limits
+
+  Coordinator background loop (every eviction_check_interval, default 5s)
+        │
+        ▼
+  execute_evictions():
+    for each tracked salt:
+      limit = quota (default 0 → evict all)
+      if usage > limit → select LRU keys, log eviction plan
+```
+
+## Wire types (`schemas.py`)
+
+- **``CacheKey``** — frozen dataclass: ``chunk_hash_hex``, ``model_name``,
+  ``kv_rank``, ``cache_salt``. Torch-free equivalent of ``ObjectKey``;
+  ``chunk_hash`` is hex-encoded instead of raw bytes.
+- **``EventType``** — ``str`` enum: ``STORE``, ``LOOKUP``.
+- **``UsageEvent``** — ``type: EventType``, ``key: CacheKey``, ``bytes: int``.
+- **``ReportUsageRequest``** — batch of ``UsageEvent``s.
+
+The ``ObjectKey`` → ``CacheKey`` conversion happens at the MP-server boundary
+(``_object_key_to_cache_key`` in ``event_listener.py``), so the coordinator
+never imports ``torch``.
+
+## Coordinator components (`l2/`)
+
+### UsageTracker (`usage_tracker.py`)
+
+Thread-safe per-salt byte counter. Two operations:
+
+- ``record_stored(cache_salt, n_bytes)`` — increment.
+- ``record_evicted(cache_salt, n_bytes)`` — decrement (clamped at zero).
+
+Exposes ``get(salt)``, ``get_all()``, ``get_total()`` for the status endpoints.
+
+### QuotaStore (`quota_store.py`)
+
+Thread-safe in-memory quota registry (``dict[str, int]`` + lock). CRUD via
+``set``, ``get``, ``delete``, ``list_all``. Quotas are set in GiB at the API
+and stored as bytes internally.
+
+### CoordinatorEvictionController (`eviction_controller.py`)
+
+Per-``cache_salt`` LRU, mirroring ``IsolatedLRUEvictionPolicy`` but using
+``CacheKey`` and running in the coordinator process.
+
+Data structures:
+
+```
+_per_salt_order : dict[str, OrderedDict[CacheKey, None]]   # LRU per salt
+_key_sizes      : dict[CacheKey, int]                       # byte size per key
+```
+
+- ``on_store(key, size_bytes)`` — insert/refresh in LRU, record size.
+- ``on_lookup(key)`` — touch (move to MRU end).
+- ``on_remove(keys)`` — remove from LRU tracking after confirmed deletion.
+- ``execute_evictions()`` — for each tracked salt, compare usage (from
+  ``UsageTracker``) against quota (from ``QuotaStore``, default 0). If over
+  quota, select LRU keys targeting ``eviction_ratio`` of the overage. No quota
+  or zero quota means evict all keys for that salt.
+
+Eviction is currently **log-only**: ``execute_evictions`` returns the plan but
+does not issue deletes. Once wired end-to-end, ``on_remove`` will be called
+after the MP server confirms deletion.
+
+## REST endpoints (`l2_api.py`)
+
+| Method | Path | Description |
+| --- | --- | --- |
+| ``PUT`` | ``/l2/quota/{cache_salt}`` | Set quota (GiB) |
+| ``DELETE`` | ``/l2/quota/{cache_salt}`` | Remove quota |
+| ``POST`` | ``/l2/events`` | Ingest batch of store/lookup events |
+| ``GET`` | ``/l2/status/{cache_salt}`` | Quota + usage for one salt |
+| ``GET`` | ``/l2/status`` | Quota + usage for all salts |
+
+Status responses report usage in GiB only (no raw bytes in the API).
+
+## MP-server event listener (`event_listener.py`)
+
+``CoordinatorL2EventListener`` implements ``L2AdapterListener`` and is registered
+on all L2 adapters via ``StorageManager.register_l2_listener()``. It:
+
+1. Receives ``on_l2_keys_stored(keys, sizes)`` and ``on_l2_keys_accessed(keys)``
+   callbacks from the L2 adapter (any thread).
+2. Converts each ``ObjectKey`` to ``CacheKey`` (hex-encodes ``chunk_hash``).
+3. Buffers ``UsageEvent``s under a lock.
+4. Flushes the buffer to ``POST /l2/events`` on a timer
+   (``l2_event_flush_interval``, default 1s). Failures are logged and the
+   batch is dropped to prevent unbounded growth.
+
+``on_l2_keys_deleted`` is a no-op — the coordinator handles deletion via its
+own eviction loop.
+
+## Configuration
+
+### Coordinator side (`MPCoordinatorConfig`)
+
+| Field | Default | Description |
+| --- | --- | --- |
+| ``eviction_check_interval`` | ``5.0`` | Seconds between eviction cycles (0 disables) |
+| ``eviction_ratio`` | ``0.5`` | Fraction of over-quota bytes to target per cycle |
+
+### MP-server side (`CoordinatorConfig`)
+
+| Field | Default | Env var | Description |
+| --- | --- | --- | --- |
+| ``l2_event_reporting`` | ``False`` | ``LMCACHE_COORDINATOR_L2_EVENT_REPORTING`` | Enable event reporting |
+| ``l2_event_flush_interval`` | ``1.0`` | ``LMCACHE_COORDINATOR_L2_EVENT_FLUSH_INTERVAL`` | Seconds between flushes |
+
+Both also accept CLI flags (``--coordinator-l2-event-reporting``,
+``--coordinator-l2-event-flush-interval``).
+
+## Failure modes
+
+| Event | Effect | Handling |
+| --- | --- | --- |
+| Coordinator down | Events not delivered | Flush fails → batch dropped, logged; MP server unaffected |
+| Coordinator restart | Usage/LRU state lost | Rebuilt from incoming events; stale until servers report |
+| Flush timeout | One batch delayed | Next flush sends new batch; no retry of old batch |
+| Usage accounting drift | Quota enforcement imprecise | Self-correcting as new events arrive |
+
+## Scope
+
+Additive: no change to per-server eviction, L2 adapter store/lookup paths, or
+the coordinator's membership/health loop. Composes via the ``L2AdapterListener``
+interface and the ``http_apis`` auto-discovery — a new router reading
+``app.state``, plus the opt-in event listener — with no edits to existing
+controllers or adapters beyond passing ``sizes`` through to
+``on_l2_keys_stored``.
diff --git a/lmcache/v1/distributed/eviction.py b/lmcache/v1/distributed/eviction.py
index 74edab3a41..00583673c0 100644
--- a/lmcache/v1/distributed/eviction.py
+++ b/lmcache/v1/distributed/eviction.py
@@ -182,7 +182,7 @@ def __init__(self, policy: EvictionPolicy):
     def policy(self) -> EvictionPolicy:
         return self._policy
 
-    def on_l2_keys_stored(self, keys: list[ObjectKey]):
+    def on_l2_keys_stored(self, keys: list[ObjectKey], sizes: list[int]):
         self._policy.on_keys_created(keys)
 
     def on_l2_keys_accessed(self, keys: list[ObjectKey]):
diff --git a/lmcache/v1/distributed/internal_api.py b/lmcache/v1/distributed/internal_api.py
index ad76ab206d..025274aaf2 100644
--- a/lmcache/v1/distributed/internal_api.py
+++ b/lmcache/v1/distributed/internal_api.py
@@ -114,12 +114,13 @@ class L2AdapterListener(EventListener):
     """Listener for L2 adapter events, analogous to L1ManagerListener."""
 
     @abstractmethod
-    def on_l2_keys_stored(self, keys: list[ObjectKey]):
+    def on_l2_keys_stored(self, keys: list[ObjectKey], sizes: list[int]):
         """
         Notify the listener that keys have been successfully stored in L2.
 
         Args:
             keys (list[ObjectKey]): The keys that have been stored.
+            sizes (list[int]): The byte size of each stored key.
         """
         pass
 
diff --git a/lmcache/v1/distributed/l2_adapters/base.py b/lmcache/v1/distributed/l2_adapters/base.py
index a972efe043..892ac6aa2b 100644
--- a/lmcache/v1/distributed/l2_adapters/base.py
+++ b/lmcache/v1/distributed/l2_adapters/base.py
@@ -374,7 +374,7 @@ def _notify_keys_stored(self, keys: list[ObjectKey], sizes: list[int]) -> None:
                     self._bytes_by_cache_salt.get(salt, 0) + d
                 )
         for listener in self._listeners:
-            listener.on_l2_keys_stored(keys)
+            listener.on_l2_keys_stored(keys, sizes)
 
     def _notify_keys_accessed(self, keys: list[ObjectKey]) -> None:
         # ``_notify_keys_accessed`` carries no byte impact — only LRU
diff --git a/lmcache/v1/distributed/l2_adapters/raw_block_l2_adapter.py b/lmcache/v1/distributed/l2_adapters/raw_block_l2_adapter.py
index 558df6907c..8df3dbc4fa 100644
--- a/lmcache/v1/distributed/l2_adapters/raw_block_l2_adapter.py
+++ b/lmcache/v1/distributed/l2_adapters/raw_block_l2_adapter.py
@@ -527,7 +527,7 @@ def register_listener(self, listener: "L2AdapterListener") -> None:
         if not keys:
             return
         try:
-            listener.on_l2_keys_stored(keys)
+            listener.on_l2_keys_stored(keys, [0] * len(keys))
         except Exception as e:
             logger.warning(
                 "RawBlockL2Adapter listener recovery bootstrap failed: %s", e
diff --git a/lmcache/v1/distributed/storage_manager.py b/lmcache/v1/distributed/storage_manager.py
index 06574b9869..89bf85d28e 100644
--- a/lmcache/v1/distributed/storage_manager.py
+++ b/lmcache/v1/distributed/storage_manager.py
@@ -19,6 +19,7 @@
 )
 from lmcache.v1.distributed.config import StorageManagerConfig
 from lmcache.v1.distributed.error import L1Error, strerror
+from lmcache.v1.distributed.internal_api import L2AdapterListener
 from lmcache.v1.distributed.l1_manager import L1Manager
 from lmcache.v1.distributed.l2_adapters import create_l2_adapter
 from lmcache.v1.distributed.l2_adapters.base import L2AdapterInterface
@@ -769,6 +770,15 @@ def report_status(self) -> dict:
             "num_l2_adapters": len(self._l2_adapters),
         }
 
+    def register_l2_listener(self, listener: L2AdapterListener) -> None:
+        """Register a listener on all L2 adapters.
+
+        Args:
+            listener: The listener to register.
+        """
+        for adapter in self._l2_adapters:
+            adapter.register_listener(listener)
+
     # Functions for debugging and testing
     def memcheck(self) -> bool:
         """
diff --git a/lmcache/v1/mp_coordinator/app.py b/lmcache/v1/mp_coordinator/app.py
index d4dc64c849..97426bff2a 100644
--- a/lmcache/v1/mp_coordinator/app.py
+++ b/lmcache/v1/mp_coordinator/app.py
@@ -3,9 +3,10 @@
 
 The coordinator is a FastAPI app. Endpoints are auto-discovered from the
 ``http_apis`` package (the same convention as the mp server's HTTP API) and stay
-thin, operating on the shared collaborators carried on ``app.state``: ``config``
-and ``registry``. The lifespan runs the background health-check task (eviction
-of instances whose heartbeats have lapsed).
+thin, operating on the shared collaborators carried on ``app.state``: ``config``,
+``registry``, ``quota_store``, ``usage_tracker``, and ``eviction_controller``.
+The lifespan runs background tasks for health-checking (eviction of instances
+whose heartbeats have lapsed) and L2 eviction (quota enforcement).
 
 Adding a capability = a new ``http_apis/<name>_api.py`` router (auto-discovered)
 that uses those shared collaborators. To push to an mp server, a future router
@@ -28,6 +29,11 @@
 # First Party
 from lmcache.logging import init_logger
 from lmcache.v1.mp_coordinator.config import MPCoordinatorConfig
+from lmcache.v1.mp_coordinator.l2.eviction_controller import (
+    CoordinatorEvictionController,
+)
+from lmcache.v1.mp_coordinator.l2.quota_store import QuotaStore
+from lmcache.v1.mp_coordinator.l2.usage_tracker import UsageTracker
 from lmcache.v1.mp_coordinator.registry import InstanceRegistry
 from lmcache.v1.utils.router_discovery import discover_api_routers
 
@@ -60,10 +66,17 @@ def create_app(config: MPCoordinatorConfig) -> FastAPI:
 
     Returns:
         A configured FastAPI application. ``app.state`` carries the shared
-        collaborators (``config``, ``registry``); all ``http_apis`` routers are
-        registered.
+        collaborators (``config``, ``registry``, ``quota_store``,
+        ``usage_tracker``); all ``http_apis`` routers are registered.
     """
     registry = InstanceRegistry()
+    quota_store = QuotaStore()
+    usage_tracker = UsageTracker()
+    eviction_controller = CoordinatorEvictionController(
+        quota_store=quota_store,
+        usage_tracker=usage_tracker,
+        eviction_ratio=config.eviction_ratio,
+    )
 
     async def _health_loop() -> None:
         """Evict stale instances on a timer until cancelled."""
@@ -71,27 +84,40 @@ async def _health_loop() -> None:
             await asyncio.sleep(config.health_check_interval)
             evict_stale(registry, config.instance_timeout)
 
+    async def _eviction_loop() -> None:
+        """Periodically check usage against quotas and log eviction plans."""
+        while True:
+            await asyncio.sleep(config.eviction_check_interval)
+            eviction_controller.execute_evictions()
+
     @asynccontextmanager
     async def lifespan(app: FastAPI) -> AsyncIterator[None]:
-        """Start the health-check task and clean up resources on shutdown."""
+        """Start background tasks and clean up resources on shutdown."""
         health_task = None
+        eviction_task = None
         if config.health_check_interval > 0:
             health_task = asyncio.create_task(_health_loop())
+        if config.eviction_check_interval > 0:
+            eviction_task = asyncio.create_task(_eviction_loop())
         logger.info(
             "MP coordinator listening on http://%s:%d", config.host, config.port
         )
         try:
             yield
         finally:
-            if health_task is not None:
-                health_task.cancel()
-                with contextlib.suppress(asyncio.CancelledError):
-                    await health_task
+            for task in (health_task, eviction_task):
+                if task is not None:
+                    task.cancel()
+                    with contextlib.suppress(asyncio.CancelledError):
+                        await task
 
     app = FastAPI(title="LMCache MP Coordinator", version="1.0.0", lifespan=lifespan)
     # Shared collaborators on app.state so routers compose from them.
     app.state.config = config
     app.state.registry = registry
+    app.state.quota_store = quota_store
+    app.state.usage_tracker = usage_tracker
+    app.state.eviction_controller = eviction_controller
 
     apis_path = Path(__file__).parent / "http_apis"
     package = f"{__package__}.http_apis"
diff --git a/lmcache/v1/mp_coordinator/config.py b/lmcache/v1/mp_coordinator/config.py
index 4c32e831b4..dd7876dcbb 100644
--- a/lmcache/v1/mp_coordinator/config.py
+++ b/lmcache/v1/mp_coordinator/config.py
@@ -29,12 +29,18 @@ class MPCoordinatorConfig:
             servers' own heartbeat cadence (which they choose).
         health_check_interval: Seconds between health-check sweeps. A value of
             ``0`` disables the health-check loop.
+        eviction_check_interval: Seconds between eviction sweeps. A value of
+            ``0`` disables the eviction loop.
+        eviction_ratio: Fraction of over-quota bytes to target per eviction
+            cycle (0.0 to 1.0).
     """
 
     host: str = "0.0.0.0"
     port: int = 9300
     instance_timeout: float = 30.0
     health_check_interval: float = 10.0
+    eviction_check_interval: float = 5.0
+    eviction_ratio: float = 0.5
 
     def __post_init__(self) -> None:
         """Validate timing parameters.
@@ -46,6 +52,10 @@ def __post_init__(self) -> None:
             raise ValueError("instance_timeout must be positive")
         if self.health_check_interval < 0:
             raise ValueError("health_check_interval must be non-negative")
+        if self.eviction_check_interval < 0:
+            raise ValueError("eviction_check_interval must be non-negative")
+        if not 0.0 <= self.eviction_ratio <= 1.0:
+            raise ValueError("eviction_ratio must be between 0.0 and 1.0")
 
     @classmethod
     def from_env(cls) -> "MPCoordinatorConfig":
@@ -79,4 +89,8 @@ def _num(name: str, default: float, cast) -> float:
             health_check_interval=_num(
                 "HEALTH_CHECK_INTERVAL", cls.health_check_interval, float
             ),
+            eviction_check_interval=_num(
+                "EVICTION_CHECK_INTERVAL", cls.eviction_check_interval, float
+            ),
+            eviction_ratio=_num("EVICTION_RATIO", cls.eviction_ratio, float),
         )
diff --git a/lmcache/v1/mp_coordinator/http_apis/l2_api.py b/lmcache/v1/mp_coordinator/http_apis/l2_api.py
new file mode 100644
index 0000000000..8a03263a34
--- /dev/null
+++ b/lmcache/v1/mp_coordinator/http_apis/l2_api.py
@@ -0,0 +1,202 @@
+# SPDX-License-Identifier: Apache-2.0
+"""L2 cache management endpoints on the coordinator.
+
+Quota writes (set/delete), usage event ingestion, and combined
+status queries (quota + usage) for per-``cache_salt`` L2 data.
+"""
+
+# Third Party
+from fastapi import APIRouter, Request
+from fastapi.responses import JSONResponse
+
+# First Party
+from lmcache.v1.mp_coordinator.l2.eviction_controller import (
+    CoordinatorEvictionController,
+)
+from lmcache.v1.mp_coordinator.l2.quota_store import QuotaStore
+from lmcache.v1.mp_coordinator.l2.usage_tracker import UsageTracker
+from lmcache.v1.mp_coordinator.schemas import (
+    EventType,
+    L2StatusListResponse,
+    L2StatusResponse,
+    QuotaResponse,
+    ReportUsageRequest,
+    ReportUsageResponse,
+    SetQuotaRequest,
+)
+
+router = APIRouter()
+
+_GB = 1024**3
+
+
+def _gb(n_bytes: int) -> float:
+    """Convert bytes to GiB."""
+    return n_bytes / _GB
+
+
+def _quota_store(request: Request) -> QuotaStore:
+    """Return the shared quota store from app state.
+
+    Args:
+        request: The incoming request.
+
+    Returns:
+        The shared :class:`QuotaStore`.
+
+    Raises:
+        RuntimeError: If the store is not initialized.
+    """
+    store = getattr(request.app.state, "quota_store", None)
+    if store is None:
+        raise RuntimeError("quota store not initialized")
+    return store
+
+
+def _tracker(request: Request) -> UsageTracker:
+    """Return the shared usage tracker from app state.
+
+    Args:
+        request: The incoming request.
+
+    Returns:
+        The shared :class:`UsageTracker`.
+
+    Raises:
+        RuntimeError: If the tracker is not initialized.
+    """
+    tracker = getattr(request.app.state, "usage_tracker", None)
+    if tracker is None:
+        raise RuntimeError("usage tracker not initialized")
+    return tracker
+
+
+def _eviction_controller(request: Request) -> CoordinatorEvictionController:
+    """Return the shared eviction controller from app state.
+
+    Args:
+        request: The incoming request.
+
+    Returns:
+        The shared :class:`CoordinatorEvictionController`.
+
+    Raises:
+        RuntimeError: If the controller is not initialized.
+    """
+    ctrl = getattr(request.app.state, "eviction_controller", None)
+    if ctrl is None:
+        raise RuntimeError("eviction controller not initialized")
+    return ctrl
+
+
+# -- Quota writes ------------------------------------------------------------
+
+
+@router.put("/l2/quota/{cache_salt}")
+async def set_quota(
+    cache_salt: str, body: SetQuotaRequest, request: Request
+) -> QuotaResponse | JSONResponse:
+    """Create or update a quota for the given ``cache_salt``.
+
+    Returns:
+        The applied quota.
+    """
+    limit_bytes = int(body.limit_gb * _GB)
+    try:
+        _quota_store(request).set(cache_salt, limit_bytes)
+    except ValueError as exc:
+        return JSONResponse(status_code=400, content={"error": str(exc)})
+    return QuotaResponse(
+        cache_salt=cache_salt,
+        limit_gb=body.limit_gb,
+        status="ok",
+    )
+
+
+@router.delete("/l2/quota/{cache_salt}")
+async def delete_quota(cache_salt: str, request: Request) -> QuotaResponse:
+    """Remove a salt's quota entry.
+
+    Returns:
+        Whether the entry was found and removed.
+    """
+    removed = _quota_store(request).delete(cache_salt)
+    return QuotaResponse(
+        cache_salt=cache_salt,
+        limit_gb=0.0,
+        status="removed" if removed else "not_found",
+    )
+
+
+# -- event ingestion ---------------------------------------------------
+
+
+@router.post("/l2/events")
+async def report_events(
+    body: ReportUsageRequest, request: Request
+) -> ReportUsageResponse:
+    """Record a batch of L2 store/lookup events.
+
+    Returns:
+        Number of events processed.
+    """
+    tracker = _tracker(request)
+    ctrl = _eviction_controller(request)
+    for event in body.events:
+        if event.type == EventType.STORE:
+            tracker.record_stored(event.key.cache_salt, event.bytes)
+            ctrl.on_store(event.key, event.bytes)
+        elif event.type == EventType.LOOKUP:
+            ctrl.on_lookup(event.key)
+    return ReportUsageResponse(recorded=len(body.events))
+
+
+# -- Combined status queries -------------------------------------------------
+
+
+@router.get("/l2/status/{cache_salt}")
+async def get_status(cache_salt: str, request: Request) -> L2StatusResponse:
+    """Read quota and usage for a single salt.
+
+    Returns:
+        Combined quota and usage detail.
+    """
+    tracker = _tracker(request)
+    store = _quota_store(request)
+    usage = tracker.get(cache_salt)
+    limit = store.get(cache_salt)
+    return L2StatusResponse(
+        cache_salt=cache_salt,
+        quota_limit_gb=_gb(limit) if limit is not None else 0.0,
+        quota_exists=limit is not None,
+        usage_gb=_gb(usage),
+    )
+
+
+@router.get("/l2/status")
+async def list_status(request: Request) -> L2StatusListResponse:
+    """List quota and usage across all cache salts.
+
+    Returns:
+        Total usage and per-salt breakdown with quota info.
+    """
+    tracker = _tracker(request)
+    store = _quota_store(request)
+    by_salt = tracker.get_all()
+    total = tracker.get_total()
+    quota_entries = {e.cache_salt: e.limit_bytes for e in store.list_all()}
+    all_salts = sorted(set(by_salt) | set(quota_entries))
+    return L2StatusListResponse(
+        total_gb=_gb(total),
+        by_cache_salt=[
+            L2StatusResponse(
+                cache_salt=salt,
+                quota_limit_gb=_gb(quota_entries[salt])
+                if salt in quota_entries
+                else 0.0,
+                quota_exists=salt in quota_entries,
+                usage_gb=_gb(by_salt.get(salt, 0)),
+            )
+            for salt in all_salts
+        ],
+    )
diff --git a/lmcache/v1/mp_coordinator/l2/__init__.py b/lmcache/v1/mp_coordinator/l2/__init__.py
new file mode 100644
index 0000000000..9881313609
--- /dev/null
+++ b/lmcache/v1/mp_coordinator/l2/__init__.py
@@ -0,0 +1 @@
+# SPDX-License-Identifier: Apache-2.0
diff --git a/lmcache/v1/mp_coordinator/l2/event_listener.py b/lmcache/v1/mp_coordinator/l2/event_listener.py
new file mode 100644
index 0000000000..1518a16ea8
--- /dev/null
+++ b/lmcache/v1/mp_coordinator/l2/event_listener.py
@@ -0,0 +1,140 @@
+# SPDX-License-Identifier: Apache-2.0
+"""MP-server-side L2 event client.
+
+Implements :class:`L2AdapterListener` to receive store/lookup/delete
+notifications from the L2 adapter, converts ``ObjectKey`` to
+``CacheKey``, buffers events, and flushes them to the coordinator in
+batches on a timer.
+
+Thread-safe: listener callbacks can fire from any thread while
+``run`` drains the buffer on the event loop.
+"""
+
+# Standard
+import asyncio
+import threading
+
+# Third Party
+import httpx
+
+# First Party
+from lmcache.logging import init_logger
+from lmcache.v1.distributed.api import ObjectKey
+from lmcache.v1.distributed.internal_api import L2AdapterListener
+from lmcache.v1.mp_coordinator.schemas import (
+    CacheKey,
+    EventType,
+    ReportUsageRequest,
+    ReportUsageResponse,
+    UsageEvent,
+)
+
+logger = init_logger(__name__)
+
+_DEFAULT_FLUSH_INTERVAL = 1.0
+
+
+def _object_key_to_cache_key(obj: ObjectKey) -> CacheKey:
+    """Convert an ``ObjectKey`` to a ``CacheKey``.
+
+    Args:
+        obj: The object key to convert.
+
+    Returns:
+        The equivalent cache key.
+    """
+    return CacheKey(
+        chunk_hash_hex=obj.chunk_hash.hex(),
+        model_name=obj.model_name,
+        kv_rank=obj.kv_rank,
+        cache_salt=obj.cache_salt,
+    )
+
+
+class CoordinatorL2EventListener(L2AdapterListener):
+    """L2 adapter listener that batches events and flushes to the coordinator.
+
+    Register as a listener on the L2 adapter via
+    ``adapter.register_listener(client)``. The ``run`` coroutine should
+    be started as a background task and cancelled on shutdown.
+
+    Args:
+        client: The HTTP client to send with.
+        coordinator_url: Coordinator base URL (e.g. ``http://host:9300``).
+        flush_interval: Seconds between flush attempts.
+    """
+
+    def __init__(
+        self,
+        client: httpx.AsyncClient,
+        coordinator_url: str,
+        flush_interval: float = _DEFAULT_FLUSH_INTERVAL,
+    ) -> None:
+        self._client = client
+        self._base_url = coordinator_url.rstrip("/")
+        self._flush_interval = flush_interval
+        self._lock = threading.Lock()
+        self._buffer: list[UsageEvent] = []
+
+    # -- L2AdapterListener implementation ------------------------------------
+
+    def on_l2_keys_stored(self, keys: list[ObjectKey], sizes: list[int]):
+        """Buffer store events for each key. Thread-safe."""
+        for obj, size in zip(keys, sizes, strict=True):
+            event = UsageEvent(
+                type=EventType.STORE,
+                key=_object_key_to_cache_key(obj),
+                bytes=size,
+            )
+            with self._lock:
+                self._buffer.append(event)
+
+    def on_l2_keys_accessed(self, keys: list[ObjectKey]):
+        """Buffer lookup events for each key. Thread-safe."""
+        for obj in keys:
+            event = UsageEvent(
+                type=EventType.LOOKUP,
+                key=_object_key_to_cache_key(obj),
+                bytes=0,
+            )
+            with self._lock:
+                self._buffer.append(event)
+
+    def on_l2_keys_deleted(self, keys: list[ObjectKey]):
+        """No-op — the coordinator handles deletion separately."""
+
+    # -- Flush loop ----------------------------------------------------------
+
+    async def run(self) -> None:
+        """Drain the buffer on a timer until cancelled.
+
+        Resilient: flush failures are logged and the batch is dropped
+        to prevent unbounded growth when the coordinator is down.
+        """
+        while True:
+            await asyncio.sleep(self._flush_interval)
+            await self._flush()
+
+    async def _flush(self) -> None:
+        """Send buffered events to the coordinator."""
+        with self._lock:
+            if not self._buffer:
+                return
+            batch = self._buffer
+            self._buffer = []
+
+        body = ReportUsageRequest(events=batch)
+        try:
+            resp = await self._client.post(
+                f"{self._base_url}/l2/events",
+                json=body.model_dump(),
+            )
+            resp.raise_for_status()
+            result = ReportUsageResponse.model_validate(resp.json())
+            logger.debug("Flushed %d L2 events to coordinator", result.recorded)
+        except (httpx.HTTPError, ValueError) as e:
+            logger.warning(
+                "Failed to flush %d L2 events to coordinator: %s",
+                len(batch),
+                e,
+            )
diff --git a/lmcache/v1/mp_coordinator/l2/eviction_controller.py b/lmcache/v1/mp_coordinator/l2/eviction_controller.py
new file mode 100644
index 0000000000..b7e83acb4a
--- /dev/null
+++ b/lmcache/v1/mp_coordinator/l2/eviction_controller.py
@@ -0,0 +1,193 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Coordinator-side eviction controller with per-``cache_salt`` LRU.
+
+Mirrors the structure of
+:class:`~lmcache.v1.distributed.eviction_policy.isolated_lru.IsolatedLRUEvictionPolicy`
+but runs inside the coordinator process and uses a lightweight
+:class:`CacheKey` instead of :class:`ObjectKey` (which pulls in
+``torch``).
+
+The controller periodically checks per-salt usage (from
+:class:`UsageTracker`) against limits (from :class:`QuotaStore`).
+When a salt exceeds its quota, it selects LRU victims and **logs**
+them — actual deletion is not implemented yet.
+"""
+
+# Future
+from __future__ import annotations
+
+# Standard
+from collections import OrderedDict
+import threading
+
+# First Party
+from lmcache.logging import init_logger
+from lmcache.v1.mp_coordinator.l2.quota_store import QuotaStore
+from lmcache.v1.mp_coordinator.l2.usage_tracker import UsageTracker
+from lmcache.v1.mp_coordinator.schemas import CacheKey
+
+logger = init_logger(__name__)
+
+
+class CoordinatorEvictionController:
+    """Per-``cache_salt`` LRU eviction controller for the coordinator.
+
+    Maintains one ``OrderedDict`` per ``cache_salt``, ordered from
+    least-recently-used (front) to most-recently-used (end). Also
+    tracks per-key byte sizes so eviction can be byte-aware.
+
+    Thread-safety: every public method acquires ``_lock``.
+
+    Args:
+        quota_store: The shared quota registry.
+        usage_tracker: The shared usage tracker.
+        eviction_ratio: Fraction of over-quota bytes to target for
+            eviction each cycle.
+    """
+
+    def __init__(
+        self,
+        quota_store: QuotaStore,
+        usage_tracker: UsageTracker,
+        eviction_ratio: float = 0.5,
+    ) -> None:
+        self._lock = threading.Lock()
+        self._quota_store = quota_store
+        self._usage_tracker = usage_tracker
+        self._eviction_ratio = max(0.0, min(1.0, eviction_ratio))
+        self._per_salt_order: dict[str, OrderedDict[CacheKey, None]] = {}
+        self._key_sizes: dict[CacheKey, int] = {}
+
+    def on_store(self, key: CacheKey, size_bytes: int) -> None:
+        """Record that a key was stored.
+
+        Inserts into (or refreshes) the LRU for the key's
+        ``cache_salt``, and records the per-key byte size.
+
+        Args:
+            key: The cache key that was stored.
+            size_bytes: Number of bytes stored for this key.
+        """
+        with self._lock:
+            order = self._per_salt_order.get(key.cache_salt)
+            if order is None:
+                order = OrderedDict()
+                self._per_salt_order[key.cache_salt] = order
+            if key in order:
+                order.move_to_end(key)
+            else:
+                order[key] = None
+            self._key_sizes[key] = size_bytes
+
+    def on_lookup(self, key: CacheKey) -> None:
+        """Record that a key was looked up (touch — move to MRU end).
+
+        Args:
+            key: The cache key that was looked up.
+        """
+        with self._lock:
+            order = self._per_salt_order.get(key.cache_salt)
+            if order is not None and key in order:
+                order.move_to_end(key)
+
+    def on_remove(self, keys: list[CacheKey]) -> None:
+        """Remove keys from LRU tracking (after eviction is executed).
+
+        Args:
+            keys: The cache keys that were removed.
+        """
+        if not keys:
+            return
+        with self._lock:
+            for key in keys:
+                order = self._per_salt_order.get(key.cache_salt)
+                if order is None:
+                    continue
+                order.pop(key, None)
+                if not order:
+                    del self._per_salt_order[key.cache_salt]
+                self._key_sizes.pop(key, None)
+
+    def execute_evictions(self) -> dict[str, list[CacheKey]]:
+        """Check all tracked salts against their quotas and log eviction candidates.
+
+        Salts with no quota or a zero quota are fully evicted. Salts
+        over quota have LRU keys selected targeting ``eviction_ratio``
+        of the over-quota bytes. Keys are logged but not actually
+        deleted.
+
+        Returns:
+            A mapping of ``cache_salt`` to the list of keys selected
+            for eviction.
+        """
+        quotas = {e.cache_salt: e.limit_bytes for e in self._quota_store.list_all()}
+        with self._lock:
+            tracked_salts = list(self._per_salt_order.keys())
+
+        eviction_plan: dict[str, list[CacheKey]] = {}
+
+        for cache_salt in tracked_salts:
+            limit_bytes = quotas.get(cache_salt, 0)
+            current_bytes = self._usage_tracker.get(cache_salt)
+            if current_bytes <= limit_bytes:
+                continue
+
+            over_bytes = current_bytes - limit_bytes
+            target_bytes = int(over_bytes * self._eviction_ratio)
+            if target_bytes == 0 and over_bytes > 0:
+                target_bytes = over_bytes
+
+            keys_to_evict = self._select_keys_to_evict(cache_salt, target_bytes)
+            if keys_to_evict:
+                eviction_plan[cache_salt] = keys_to_evict
+                evict_bytes = sum(self._key_sizes.get(k, 0) for k in keys_to_evict)
+                logger.info(
+                    "Eviction plan for cache_salt=%r: %d keys "
+                    "(%d bytes) to free; usage=%d, quota=%d, "
+                    "over_by=%d",
+                    cache_salt,
+                    len(keys_to_evict),
+                    evict_bytes,
+                    current_bytes,
+                    limit_bytes,
+                    over_bytes,
+                )
+                for k in keys_to_evict:
+                    logger.info(
+                        "  -> evict key: model=%s, kv_rank=%d, hash=%s, size=%d",
+                        k.model_name,
+                        k.kv_rank,
+                        k.chunk_hash_hex,
+                        self._key_sizes.get(k, 0),
+                    )
+
+        # TODO: once eviction is wired end-to-end, call on_remove()
+        # for each salt's victims after the MP server confirms deletion.
+        return eviction_plan
+
+    def _select_keys_to_evict(
+        self, cache_salt: str, target_bytes: int
+    ) -> list[CacheKey]:
+        """Select LRU victims from a salt's bucket to free ``target_bytes``.
+
+        Args:
+            cache_salt: The salt to evict from.
+            target_bytes: Target number of bytes to free.
+
+        Returns:
+            List of keys in LRU order (oldest first).
+        """
+        with self._lock:
+            order = self._per_salt_order.get(cache_salt)
+            if not order:
+                return []
+
+            keys_to_evict: list[CacheKey] = []
+            freed = 0
+            for key in order:
+                keys_to_evict.append(key)
+                freed += self._key_sizes.get(key, 0)
+                if freed >= target_bytes:
+                    break
+
+            return keys_to_evict
diff --git a/lmcache/v1/mp_coordinator/l2/quota_store.py b/lmcache/v1/mp_coordinator/l2/quota_store.py
new file mode 100644
index 0000000000..a9d0c3fef7
--- /dev/null
+++ b/lmcache/v1/mp_coordinator/l2/quota_store.py
@@ -0,0 +1,92 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Lightweight in-memory quota registry for the MP coordinator.
+
+Holds per-``cache_salt`` byte limits. The coordinator is the single
+source of truth for quotas; MP servers query the coordinator to obtain
+their limits. This class is intentionally free of heavy dependencies
+(no ``torch``, no ``distributed`` layer imports) so the coordinator
+process stays lightweight.
+"""
+
+# Future
+from __future__ import annotations
+
+# Standard
+from dataclasses import dataclass
+import threading
+
+
+@dataclass(frozen=True)
+class QuotaEntry:
+    """Snapshot of a single quota registration.
+
+    Attributes:
+        cache_salt: The tenant identifier.
+        limit_bytes: The byte budget for this tenant.
+    """
+
+    cache_salt: str
+    limit_bytes: int
+
+
+class QuotaStore:
+    """Thread-safe in-memory registry of byte quotas keyed by ``cache_salt``.
+
+    All public methods acquire an internal lock so the store stays
+    consistent under concurrent access from FastAPI endpoint handlers.
+    """
+
+    def __init__(self) -> None:
+        self._lock = threading.Lock()
+        self._limits: dict[str, int] = {}
+
+    def set(self, cache_salt: str, limit_bytes: int) -> None:
+        """Create or update the quota for ``cache_salt``.
+
+        Args:
+            cache_salt: The tenant identifier.
+            limit_bytes: The byte budget (must be non-negative).
+
+        Raises:
+            ValueError: If ``limit_bytes`` is negative.
+        """
+        if limit_bytes < 0:
+            raise ValueError(f"limit_bytes must be non-negative (got {limit_bytes})")
+        with self._lock:
+            self._limits[cache_salt] = limit_bytes
+
+    def get(self, cache_salt: str) -> int | None:
+        """Return the limit for ``cache_salt``, or ``None`` if unregistered.
+
+        Args:
+            cache_salt: The tenant identifier.
+
+        Returns:
+            The byte limit, or ``None`` if no quota is registered.
+        """
+        with self._lock:
+            return self._limits.get(cache_salt)
+
+    def delete(self, cache_salt: str) -> bool:
+        """Remove the quota entry for ``cache_salt``.
+
+        Args:
+            cache_salt: The tenant identifier.
+
+        Returns:
+            ``True`` if an entry was removed, ``False`` if none existed.
+        """
+        with self._lock:
+            return self._limits.pop(cache_salt, None) is not None
+
+    def list_all(self) -> list[QuotaEntry]:
+        """Return a snapshot of all registered quotas.
+
+        Returns:
+            A detached list of all quota entries.
+        """
+        with self._lock:
+            return [
+                QuotaEntry(cache_salt=salt, limit_bytes=limit)
+                for salt, limit in self._limits.items()
+            ]
diff --git a/lmcache/v1/mp_coordinator/l2/usage_tracker.py b/lmcache/v1/mp_coordinator/l2/usage_tracker.py
new file mode 100644
index 0000000000..241202f409
--- /dev/null
+++ b/lmcache/v1/mp_coordinator/l2/usage_tracker.py
@@ -0,0 +1,121 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Per-``cache_salt`` L2 usage tracker for the MP coordinator.
+
+Maintains running byte totals per tenant, updated by store events
+reported by MP servers. Eviction (byte subtraction) is driven by
+the coordinator itself, not by MP servers.
+Thread-safe and dependency-free.
+"""
+
+# Future
+from __future__ import annotations
+
+# Standard
+import threading
+
+# First Party
+from lmcache.logging import init_logger
+
+logger = init_logger(__name__)
+
+
+class UsageTracker:
+    """Thread-safe in-memory tracker of L2 byte usage per ``cache_salt``.
+
+    MP servers report ``store`` events. The coordinator calls
+    ``record_evicted`` when it decides to evict data. Byte counters
+    are clamped at zero on underflow.
+    """
+
+    def __init__(self) -> None:
+        self._lock = threading.Lock()
+        self._bytes_by_salt: dict[str, int] = {}
+        self._total_bytes: int = 0
+
+    def record_stored(self, cache_salt: str, num_bytes: int) -> None:
+        """Record that ``num_bytes`` were stored under ``cache_salt``.
+
+        Args:
+            cache_salt: The tenant identifier.
+            num_bytes: Bytes stored (must be non-negative).
+
+        Raises:
+            ValueError: If ``num_bytes`` is negative.
+        """
+        if num_bytes < 0:
+            raise ValueError(f"num_bytes must be non-negative (got {num_bytes})")
+        if num_bytes == 0:
+            return
+        with self._lock:
+            self._bytes_by_salt[cache_salt] = (
+                self._bytes_by_salt.get(cache_salt, 0) + num_bytes
+            )
+            self._total_bytes += num_bytes
+
+    def record_evicted(self, cache_salt: str, num_bytes: int) -> None:
+        """Record that the coordinator evicted ``num_bytes`` under ``cache_salt``.
+
+        Clamps per-salt and total counters at zero if a subtraction
+        would underflow (logs a warning).
+
+        Args:
+            cache_salt: The tenant identifier.
+            num_bytes: Bytes evicted (must be non-negative).
+
+        Raises:
+            ValueError: If ``num_bytes`` is negative.
+        """
+        if num_bytes < 0:
+            raise ValueError(f"num_bytes must be non-negative (got {num_bytes})")
+        if num_bytes == 0:
+            return
+        with self._lock:
+            current = self._bytes_by_salt.get(cache_salt, 0)
+            new_val = current - num_bytes
+            if new_val < 0:
+                logger.warning(
+                    "Usage underflow for cache_salt=%r: %d - %d = %d, clamping to 0",
+                    cache_salt,
+                    current,
+                    num_bytes,
+                    new_val,
+                )
+                new_val = 0
+            if new_val == 0:
+                self._bytes_by_salt.pop(cache_salt, None)
+            else:
+                self._bytes_by_salt[cache_salt] = new_val
+
+            self._total_bytes -= num_bytes
+            if self._total_bytes < 0:
+                self._total_bytes = 0
+
+    def get(self, cache_salt: str) -> int:
+        """Return the current byte usage for ``cache_salt``.
+
+        Args:
+            cache_salt: The tenant identifier.
+
+        Returns:
+            Bytes currently tracked, or 0 if no usage recorded.
+        """
+        with self._lock:
+            return self._bytes_by_salt.get(cache_salt, 0)
+
+    def get_all(self) -> dict[str, int]:
+        """Return a snapshot of per-salt byte usage.
+
+        Returns:
+            A copy of the internal mapping (salt -> bytes).
+        """
+        with self._lock:
+            return dict(self._bytes_by_salt)
+
+    def get_total(self) -> int:
+        """Return the total bytes tracked across all salts.
+
+        Returns:
+            Total byte usage.
+        """
+        with self._lock:
+            return self._total_bytes
diff --git a/lmcache/v1/mp_coordinator/schemas.py b/lmcache/v1/mp_coordinator/schemas.py
index fdf87ecda5..880bb37877 100644
--- a/lmcache/v1/mp_coordinator/schemas.py
+++ b/lmcache/v1/mp_coordinator/schemas.py
@@ -8,6 +8,8 @@
 """
 
 # Standard
+from dataclasses import dataclass
+from enum import Enum
 from typing import Annotated
 
 # Third Party
@@ -53,3 +55,124 @@ class HeartbeatResponse(BaseModel):
     """
 
     instance_id: str
+
+
+# -- Quota management --------------------------------------------------------
+
+
+class SetQuotaRequest(BaseModel):
+    """Body of ``PUT /quota/{cache_salt}``.
+
+    Attributes:
+        limit_gb: Non-negative byte budget in GiB.
+    """
+
+    limit_gb: float = Field(ge=0.0)
+
+
+class QuotaResponse(BaseModel):
+    """Reply to ``PUT`` or ``DELETE /quota/{cache_salt}``.
+
+    Attributes:
+        cache_salt: The tenant identifier (``_default`` for empty salt).
+        limit_gb: The current limit in GiB (0.0 after deletion).
+        status: ``"ok"`` or ``"removed"`` or ``"not_found"``.
+    """
+
+    cache_salt: str
+    limit_gb: float
+    status: str
+
+
+# -- L2 usage tracking -------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class CacheKey:
+    """Lightweight, torch-free equivalent of ``ObjectKey``.
+
+    Used both as the wire format in usage events and as the key type
+    in the eviction controller's per-salt LRU. Frozen so it can be
+    used as a dict key / in ``OrderedDict``.
+
+    Attributes:
+        chunk_hash_hex: Hex-encoded content hash of the chunk.
+        model_name: Name of the model this chunk belongs to.
+        kv_rank: Packed rank bitmap (world_size, global_rank, etc.).
+        cache_salt: The tenant identifier.
+    """
+
+    chunk_hash_hex: str
+    model_name: str
+    kv_rank: int
+    cache_salt: str
+
+
+class EventType(str, Enum):
+    """Type of L2 cache event."""
+
+    STORE = "store"
+    LOOKUP = "lookup"
+
+
+class UsageEvent(BaseModel):
+    """A single L2 store or lookup event reported by an MP server.
+
+    Attributes:
+        type: The event type.
+        key: The cache key this event applies to. The tenant
+            identifier (``cache_salt``) is carried inside the key.
+        bytes: Number of bytes stored (for ``"store"`` events).
+    """
+
+    type: EventType
+    key: CacheKey
+    bytes: int = Field(ge=0)
+
+
+class ReportUsageRequest(BaseModel):
+    """Body of ``POST /l2/events``.
+
+    Attributes:
+        events: Batch of store/lookup events to record.
+    """
+
+    events: list[UsageEvent]
+
+
+class ReportUsageResponse(BaseModel):
+    """Reply to ``POST /l2/events``.
+
+    Attributes:
+        recorded: Number of events processed.
+    """
+
+    recorded: int
+
+
+class L2StatusResponse(BaseModel):
+    """Combined quota and usage for a single ``cache_salt``.
+
+    Attributes:
+        cache_salt: The tenant identifier.
+        quota_limit_gb: The byte budget in GiB (0.0 if no quota set).
+        quota_exists: Whether an explicit quota is registered.
+        usage_gb: Current L2 usage in GiB.
+    """
+
+    cache_salt: str
+    quota_limit_gb: float
+    quota_exists: bool
+    usage_gb: float
+
+
+class L2StatusListResponse(BaseModel):
+    """Reply to ``GET /l2/status``.
+
+    Attributes:
+        total_gb: Aggregate L2 usage in GiB.
+        by_cache_salt: Per-tenant breakdown with quota and usage.
+    """
+
+    total_gb: float
+    by_cache_salt: list[L2StatusResponse]
diff --git a/lmcache/v1/multiprocess/config.py b/lmcache/v1/multiprocess/config.py
index 7b8eec772a..452bf9415a 100644
--- a/lmcache/v1/multiprocess/config.py
+++ b/lmcache/v1/multiprocess/config.py
@@ -114,6 +114,13 @@ class CoordinatorConfig:
     """Seconds between heartbeats. Must be strictly positive and kept well below
     the coordinator's ``INSTANCE_TIMEOUT``."""
 
+    l2_event_reporting: bool = False
+    """When ``True``, report L2 store/lookup events to the coordinator for
+    fleet-wide usage tracking and eviction."""
+
+    l2_event_flush_interval: float = 1.0
+    """Seconds between L2 event flush attempts to the coordinator."""
+
 
 DEFAULT_COORDINATOR_CONFIG = CoordinatorConfig()
 
@@ -362,6 +369,21 @@ def add_coordinator_args(
         help="Seconds between heartbeats (must be > 0). Defaults to "
         "LMCACHE_COORDINATOR_HEARTBEAT_INTERVAL, then 5.0.",
     )
+    group.add_argument(
+        "--coordinator-l2-event-reporting",
+        action="store_true",
+        default=None,
+        help="Report L2 store/lookup events to the coordinator for "
+        "fleet-wide usage tracking and eviction. Defaults to "
+        "LMCACHE_COORDINATOR_L2_EVENT_REPORTING; unset disables.",
+    )
+    group.add_argument(
+        "--coordinator-l2-event-flush-interval",
+        type=float,
+        default=None,
+        help="Seconds between L2 event flush attempts (must be > 0). "
+        "Defaults to LMCACHE_COORDINATOR_L2_EVENT_FLUSH_INTERVAL, then 1.0.",
+    )
     return parser
 
 
@@ -413,8 +435,37 @@ def parse_args_to_coordinator_config(
             "coordinator heartbeat interval must be a finite number > 0, "
             "got %s" % heartbeat_interval
         )
+    if args.coordinator_l2_event_reporting is not None:
+        l2_event_reporting = args.coordinator_l2_event_reporting
+    else:
+        l2_event_reporting = os.getenv(
+            "LMCACHE_COORDINATOR_L2_EVENT_REPORTING", ""
+        ).lower() in ("1", "true", "yes")
+
+    if args.coordinator_l2_event_flush_interval is not None:
+        l2_event_flush_interval = args.coordinator_l2_event_flush_interval
+    else:
+        raw = os.getenv("LMCACHE_COORDINATOR_L2_EVENT_FLUSH_INTERVAL")
+        if raw:
+            try:
+                l2_event_flush_interval = float(raw)
+            except ValueError as exc:
+                raise ValueError(
+                    "LMCACHE_COORDINATOR_L2_EVENT_FLUSH_INTERVAL is not a number: %r"
+                    % raw
+                ) from exc
+        else:
+            l2_event_flush_interval = 1.0
+    if not math.isfinite(l2_event_flush_interval) or l2_event_flush_interval <= 0:
+        raise ValueError(
+            "coordinator L2 event flush interval must be a finite number > 0, "
+            "got %s" % l2_event_flush_interval
+        )
+
     return CoordinatorConfig(
         url=url,
         advertise_ip=advertise_ip,
         heartbeat_interval=heartbeat_interval,
+        l2_event_reporting=l2_event_reporting,
+        l2_event_flush_interval=l2_event_flush_interval,
     )
diff --git a/lmcache/v1/multiprocess/http_server.py b/lmcache/v1/multiprocess/http_server.py
index e01c714bbc..1c606aad7d 100644
--- a/lmcache/v1/multiprocess/http_server.py
+++ b/lmcache/v1/multiprocess/http_server.py
@@ -18,6 +18,7 @@
     add_storage_manager_args,
     parse_args_to_config,
 )
+from lmcache.v1.mp_coordinator.l2.event_listener import CoordinatorL2EventListener
 from lmcache.v1.mp_coordinator.registrar import keep_registered
 from lmcache.v1.mp_observability.config import (
     ObservabilityConfig,
@@ -131,8 +132,31 @@ async def lifespan(app: FastAPI):
                 heartbeat_interval=coordinator_config.heartbeat_interval,
             )
         )
+    # Optionally report L2 store/lookup events to the coordinator for
+    # fleet-wide usage tracking and eviction. Registers as a listener on
+    # all L2 adapters and flushes batched events on a timer.
+    coordinator_l2_event_client = None
+    coordinator_l2_event_task = None
+    if (
+        coordinator_client is not None
+        and coordinator_config is not None
+        and coordinator_config.url
+        and coordinator_config.l2_event_reporting
+    ):
+        coordinator_l2_event_client = CoordinatorL2EventListener(
+            coordinator_client,
+            coordinator_config.url,
+            flush_interval=coordinator_config.l2_event_flush_interval,
+        )
+        if engine.storage_manager is not None:
+            engine.storage_manager.register_l2_listener(coordinator_l2_event_client)
+        coordinator_l2_event_task = asyncio.create_task(
+            coordinator_l2_event_client.run()
+        )
+
     app.state.coordinator_client = coordinator_client
     app.state.coordinator_registration_task = coordinator_registration_task
+    app.state.coordinator_l2_event_task = coordinator_l2_event_task
 
     logger.info("LMCache HTTP server initialized")
 
@@ -140,6 +164,11 @@ async def lifespan(app: FastAPI):
 
     # Shutdown
     logger.info("Shutting down LMCache HTTP server...")
+    coordinator_l2_event_task = getattr(app.state, "coordinator_l2_event_task", None)
+    if coordinator_l2_event_task is not None:
+        coordinator_l2_event_task.cancel()
+        with contextlib.suppress(asyncio.CancelledError):
+            await coordinator_l2_event_task
     coordinator_registration_task = getattr(
         app.state, "coordinator_registration_task", None
     )
diff --git a/tests/v1/mp_coordinator/test_eviction_controller.py b/tests/v1/mp_coordinator/test_eviction_controller.py
new file mode 100644
index 0000000000..84d55c9f46
--- /dev/null
+++ b/tests/v1/mp_coordinator/test_eviction_controller.py
@@ -0,0 +1,175 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for the coordinator eviction controller."""
+
+# First Party
+from lmcache.v1.mp_coordinator.l2.eviction_controller import (
+    CoordinatorEvictionController,
+)
+from lmcache.v1.mp_coordinator.l2.quota_store import QuotaStore
+from lmcache.v1.mp_coordinator.l2.usage_tracker import UsageTracker
+from lmcache.v1.mp_coordinator.schemas import CacheKey
+
+
+def _make_key(salt: str, model: str = "m", rank: int = 0, h: str = "aa") -> CacheKey:
+    return CacheKey(chunk_hash_hex=h, model_name=model, kv_rank=rank, cache_salt=salt)
+
+
+def _setup(
+    eviction_ratio: float = 0.5,
+) -> tuple[CoordinatorEvictionController, QuotaStore, UsageTracker]:
+    qs = QuotaStore()
+    ut = UsageTracker()
+    ctrl = CoordinatorEvictionController(qs, ut, eviction_ratio=eviction_ratio)
+    return ctrl, qs, ut
+
+
+def test_on_store_tracks_key():
+    ctrl, _, _ = _setup()
+    k = _make_key("a")
+    ctrl.on_store(k, 100)
+    assert ctrl._select_keys_to_evict("a", 100) == [k]
+
+
+def test_on_store_updates_existing_key():
+    ctrl, _, _ = _setup()
+    k = _make_key("a")
+    ctrl.on_store(k, 100)
+    ctrl.on_store(k, 200)
+    assert ctrl._select_keys_to_evict("a", 200) == [k]
+
+
+def test_on_lookup_touches_key():
+    ctrl, _, _ = _setup()
+    k1 = _make_key("a", h="01")
+    k2 = _make_key("a", h="02")
+    ctrl.on_store(k1, 100)
+    ctrl.on_store(k2, 100)
+    ctrl.on_lookup(k1)
+    keys_to_evict = ctrl._select_keys_to_evict("a", 100)
+    assert keys_to_evict[0] == k2
+
+
+def test_on_lookup_unknown_key_is_noop():
+    ctrl, _, _ = _setup()
+    k = _make_key("a")
+    ctrl.on_lookup(k)
+    assert ctrl._select_keys_to_evict("a", 1) == []
+
+
+def test_on_remove():
+    ctrl, _, _ = _setup()
+    k1 = _make_key("a", h="01")
+    k2 = _make_key("a", h="02")
+    ctrl.on_store(k1, 100)
+    ctrl.on_store(k2, 200)
+    ctrl.on_remove([k1])
+    assert ctrl._select_keys_to_evict("a", 200) == [k2]
+
+
+def test_on_remove_cleans_empty_bucket():
+    ctrl, _, _ = _setup()
+    k = _make_key("a")
+    ctrl.on_store(k, 100)
+    ctrl.on_remove([k])
+    assert ctrl._select_keys_to_evict("a", 1) == []
+
+
+def test_on_remove_empty_list_is_noop():
+    ctrl, _, _ = _setup()
+    ctrl.on_remove([])
+    assert ctrl._select_keys_to_evict("a", 1) == []
+
+
+def test_select_keys_to_evict_lru_order():
+    ctrl, _, _ = _setup()
+    k1 = _make_key("a", h="01")
+    k2 = _make_key("a", h="02")
+    k3 = _make_key("a", h="03")
+    ctrl.on_store(k1, 100)
+    ctrl.on_store(k2, 200)
+    ctrl.on_store(k3, 300)
+    keys_to_evict = ctrl._select_keys_to_evict("a", 250)
+    assert keys_to_evict == [k1, k2]
+
+
+def test_select_keys_to_evict_empty_bucket():
+    ctrl, _, _ = _setup()
+    assert ctrl._select_keys_to_evict("nonexistent", 100) == []
+
+
+def test_check_and_log_no_quotas_evicts_all():
+    ctrl, _, ut = _setup()
+    k = _make_key("a")
+    ctrl.on_store(k, 1000)
+    ut.record_stored("a", 1000)
+    result = ctrl.execute_evictions()
+    assert "a" in result
+    assert result["a"] == [k]
+
+
+def test_check_and_log_under_quota():
+    ctrl, qs, ut = _setup()
+    qs.set("a", 2000)
+    ut.record_stored("a", 1000)
+    ctrl.on_store(_make_key("a"), 1000)
+    result = ctrl.execute_evictions()
+    assert result == {}
+
+
+def test_check_and_log_over_quota():
+    ctrl, qs, ut = _setup(eviction_ratio=1.0)
+    qs.set("a", 500)
+    ut.record_stored("a", 1000)
+    k1 = _make_key("a", h="01")
+    k2 = _make_key("a", h="02")
+    ctrl.on_store(k1, 400)
+    ctrl.on_store(k2, 600)
+    result = ctrl.execute_evictions()
+    assert "a" in result
+    keys_to_evict = result["a"]
+    assert keys_to_evict[0] == k1
+    total_evict_bytes = 400 + 600
+    assert total_evict_bytes >= 500
+
+
+def test_check_and_log_eviction_ratio():
+    ctrl, qs, ut = _setup(eviction_ratio=0.5)
+    qs.set("a", 500)
+    ut.record_stored("a", 1000)
+    k1 = _make_key("a", h="01")
+    k2 = _make_key("a", h="02")
+    k3 = _make_key("a", h="03")
+    ctrl.on_store(k1, 200)
+    ctrl.on_store(k2, 200)
+    ctrl.on_store(k3, 600)
+    result = ctrl.execute_evictions()
+    assert "a" in result
+    keys_to_evict = result["a"]
+    assert len(keys_to_evict) >= 1
+    assert keys_to_evict[0] == k1
+
+
+def test_check_and_log_zero_quota_evicts_all():
+    ctrl, qs, ut = _setup()
+    qs.set("a", 0)
+    ut.record_stored("a", 1000)
+    k = _make_key("a")
+    ctrl.on_store(k, 1000)
+    result = ctrl.execute_evictions()
+    assert "a" in result
+    assert result["a"] == [k]
+
+
+def test_multiple_salts_independent():
+    ctrl, qs, ut = _setup(eviction_ratio=1.0)
+    qs.set("a", 100)
+    qs.set("b", 5000)
+    ut.record_stored("a", 500)
+    ut.record_stored("b", 1000)
+    ka = _make_key("a", h="01")
+    kb = _make_key("b", h="02")
+    ctrl.on_store(ka, 500)
+    ctrl.on_store(kb, 1000)
+    result = ctrl.execute_evictions()
+    assert "a" in result
+    assert "b" not in result
diff --git a/tests/v1/mp_coordinator/test_l2_api.py b/tests/v1/mp_coordinator/test_l2_api.py
new file mode 100644
index 0000000000..4ba5e4bb5e
--- /dev/null
+++ b/tests/v1/mp_coordinator/test_l2_api.py
@@ -0,0 +1,229 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for the coordinator L2 REST API (quota, usage, status)."""
+
+# Third Party
+from fastapi.testclient import TestClient
+
+# First Party
+from lmcache.v1.mp_coordinator.app import create_app
+from lmcache.v1.mp_coordinator.config import MPCoordinatorConfig
+
+
+def _client() -> TestClient:
+    config = MPCoordinatorConfig(health_check_interval=0.0, eviction_check_interval=0.0)
+    return TestClient(create_app(config))
+
+
+def _key(salt: str, h: str = "aa", model: str = "m", rank: int = 0) -> dict:
+    return {
+        "chunk_hash_hex": h,
+        "model_name": model,
+        "kv_rank": rank,
+        "cache_salt": salt,
+    }
+
+
+def _store(salt: str, nbytes: int, **kw) -> dict:
+    return {"type": "store", "key": _key(salt, **kw), "bytes": nbytes}
+
+
+def _lookup(salt: str, **kw) -> dict:
+    return {"type": "lookup", "key": _key(salt, **kw), "bytes": 0}
+
+
+# -- Quota writes ------------------------------------------------------------
+
+
+def test_set_quota():
+    with _client() as client:
+        resp = client.put("/l2/quota/user-a", json={"limit_gb": 2.5})
+        assert resp.status_code == 200
+        data = resp.json()
+        assert data["cache_salt"] == "user-a"
+        assert data["limit_gb"] == 2.5
+        assert data["status"] == "ok"
+
+
+def test_update_quota():
+    with _client() as client:
+        client.put("/l2/quota/user-a", json={"limit_gb": 1.0})
+        client.put("/l2/quota/user-a", json={"limit_gb": 5.0})
+        data = client.get("/l2/status/user-a").json()
+        assert abs(data["quota_limit_gb"] - 5.0) < 1e-6
+
+
+def test_delete_quota():
+    with _client() as client:
+        client.put("/l2/quota/user-a", json={"limit_gb": 1.0})
+        resp = client.delete("/l2/quota/user-a")
+        assert resp.status_code == 200
+        assert resp.json()["status"] == "removed"
+
+        data = client.get("/l2/status/user-a").json()
+        assert data["quota_exists"] is False
+
+
+def test_delete_nonexistent_quota():
+    with _client() as client:
+        resp = client.delete("/l2/quota/unknown")
+        assert resp.status_code == 200
+        assert resp.json()["status"] == "not_found"
+
+
+def test_negative_limit_rejected():
+    with _client() as client:
+        resp = client.put("/l2/quota/user-a", json={"limit_gb": -1.0})
+        assert resp.status_code == 422
+
+
+def test_missing_body_rejected():
+    with _client() as client:
+        resp = client.put("/l2/quota/user-a")
+        assert resp.status_code == 422
+
+
+def test_zero_limit_accepted():
+    with _client() as client:
+        resp = client.put("/l2/quota/user-a", json={"limit_gb": 0.0})
+        assert resp.status_code == 200
+        data = client.get("/l2/status/user-a").json()
+        assert data["quota_exists"] is True
+        assert data["quota_limit_gb"] == 0.0
+
+
+# -- Usage event ingestion ---------------------------------------------------
+
+
+def test_report_store_events():
+    with _client() as client:
+        resp = client.post(
+            "/l2/events",
+            json={
+                "events": [
+                    _store("user-a", 1000, h="01"),
+                    _store("user-a", 500, h="02"),
+                    _store("user-b", 2000, h="03"),
+                ]
+            },
+        )
+        assert resp.status_code == 200
+        assert resp.json()["recorded"] == 3
+
+        data = client.get("/l2/status/user-a").json()
+        assert abs(data["usage_gb"] - 1500 / 1024**3) < 1e-12
+
+        data = client.get("/l2/status/user-b").json()
+        assert abs(data["usage_gb"] - 2000 / 1024**3) < 1e-12
+
+
+def test_report_lookup_events_accepted():
+    with _client() as client:
+        resp = client.post(
+            "/l2/events",
+            json={"events": [_lookup("user-a")]},
+        )
+        assert resp.status_code == 200
+        assert resp.json()["recorded"] == 1
+
+
+def test_empty_events_batch():
+    with _client() as client:
+        resp = client.post("/l2/events", json={"events": []})
+        assert resp.status_code == 200
+        assert resp.json()["recorded"] == 0
+
+
+def test_invalid_event_type_rejected():
+    with _client() as client:
+        resp = client.post(
+            "/l2/events",
+            json={"events": [{"type": "delete", "key": _key("a"), "bytes": 100}]},
+        )
+        assert resp.status_code == 422
+
+
+def test_negative_bytes_rejected():
+    with _client() as client:
+        resp = client.post(
+            "/l2/events",
+            json={"events": [{"type": "store", "key": _key("a"), "bytes": -1}]},
+        )
+        assert resp.status_code == 422
+
+
+# -- Combined status queries -------------------------------------------------
+
+
+def test_status_single_salt():
+    with _client() as client:
+        client.put("/l2/quota/user-a", json={"limit_gb": 2.5})
+        client.post(
+            "/l2/events",
+            json={"events": [_store("user-a", 1000)]},
+        )
+        data = client.get("/l2/status/user-a").json()
+        assert data["cache_salt"] == "user-a"
+        assert abs(data["quota_limit_gb"] - 2.5) < 1e-6
+        assert data["quota_exists"] is True
+        assert abs(data["usage_gb"] - 1000 / 1024**3) < 1e-12
+
+
+def test_status_unknown_salt():
+    with _client() as client:
+        data = client.get("/l2/status/unknown").json()
+        assert data["usage_gb"] == 0.0
+        assert data["quota_exists"] is False
+        assert data["quota_limit_gb"] == 0.0
+
+
+def test_status_list():
+    with _client() as client:
+        client.put("/l2/quota/a", json={"limit_gb": 1.0})
+        client.post(
+            "/l2/events",
+            json={
+                "events": [
+                    _store("a", 100, h="01"),
+                    _store("b", 200, h="02"),
+                ]
+            },
+        )
+        data = client.get("/l2/status").json()
+        assert abs(data["total_gb"] - 300 / 1024**3) < 1e-12
+        by_salt = {e["cache_salt"]: e for e in data["by_cache_salt"]}
+        assert abs(by_salt["a"]["usage_gb"] - 100 / 1024**3) < 1e-12
+        assert by_salt["a"]["quota_exists"] is True
+        assert abs(by_salt["b"]["usage_gb"] - 200 / 1024**3) < 1e-12
+        assert by_salt["b"]["quota_exists"] is False
+
+
+def test_status_list_empty():
+    with _client() as client:
+        data = client.get("/l2/status").json()
+        assert data["total_gb"] == 0.0
+        assert data["by_cache_salt"] == []
+
+
+def test_status_list_includes_quota_only_salt():
+    """A salt with a quota but no usage should appear in the list."""
+    with _client() as client:
+        client.put("/l2/quota/q-only", json={"limit_gb": 5.0})
+        data = client.get("/l2/status").json()
+        by_salt = {e["cache_salt"]: e for e in data["by_cache_salt"]}
+        assert "q-only" in by_salt
+        assert by_salt["q-only"]["quota_exists"] is True
+        assert by_salt["q-only"]["usage_gb"] == 0.0
+
+
+def test_default_salt_sentinel():
+    with _client() as client:
+        client.put("/l2/quota/_default", json={"limit_gb": 3.0})
+        client.post(
+            "/l2/events",
+            json={"events": [_store("_default", 500)]},
+        )
+        data = client.get("/l2/status/_default").json()
+        assert data["cache_salt"] == "_default"
+        assert data["quota_exists"] is True
+        assert abs(data["quota_limit_gb"] - 3.0) < 1e-6
+        assert abs(data["usage_gb"] - 500 / 1024**3) < 1e-12
diff --git a/tests/v1/mp_coordinator/test_quota_store.py b/tests/v1/mp_coordinator/test_quota_store.py
new file mode 100644
index 0000000000..b79b44f6f3
--- /dev/null
+++ b/tests/v1/mp_coordinator/test_quota_store.py
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for the coordinator QuotaStore."""
+
+# Third Party
+import pytest
+
+# First Party
+from lmcache.v1.mp_coordinator.l2.quota_store import QuotaStore
+
+
+def test_set_and_get():
+    store = QuotaStore()
+    store.set("salt-a", 1000)
+    assert store.get("salt-a") == 1000
+
+
+def test_get_unregistered_returns_none():
+    store = QuotaStore()
+    assert store.get("unknown") is None
+
+
+def test_set_overwrites():
+    store = QuotaStore()
+    store.set("salt-a", 1000)
+    store.set("salt-a", 2000)
+    assert store.get("salt-a") == 2000
+
+
+def test_delete():
+    store = QuotaStore()
+    store.set("salt-a", 1000)
+    assert store.delete("salt-a") is True
+    assert store.get("salt-a") is None
+
+
+def test_delete_nonexistent():
+    store = QuotaStore()
+    assert store.delete("unknown") is False
+
+
+def test_list_all():
+    store = QuotaStore()
+    store.set("a", 100)
+    store.set("b", 200)
+    entries = store.list_all()
+    by_salt = {e.cache_salt: e.limit_bytes for e in entries}
+    assert by_salt == {"a": 100, "b": 200}
+
+
+def test_list_all_empty():
+    store = QuotaStore()
+    assert store.list_all() == []
+
+
+def test_negative_limit_raises():
+    store = QuotaStore()
+    with pytest.raises(ValueError, match="non-negative"):
+        store.set("salt-a", -1)
+
+
+def test_zero_limit_accepted():
+    store = QuotaStore()
+    store.set("salt-a", 0)
+    assert store.get("salt-a") == 0
diff --git a/tests/v1/mp_coordinator/test_usage_tracker.py b/tests/v1/mp_coordinator/test_usage_tracker.py
new file mode 100644
index 0000000000..4f00a58890
--- /dev/null
+++ b/tests/v1/mp_coordinator/test_usage_tracker.py
@@ -0,0 +1,91 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for the coordinator UsageTracker."""
+
+# Third Party
+import pytest
+
+# First Party
+from lmcache.v1.mp_coordinator.l2.usage_tracker import UsageTracker
+
+
+def test_record_stored():
+    t = UsageTracker()
+    t.record_stored("a", 100)
+    assert t.get("a") == 100
+    assert t.get_total() == 100
+
+
+def test_record_stored_accumulates():
+    t = UsageTracker()
+    t.record_stored("a", 100)
+    t.record_stored("a", 200)
+    assert t.get("a") == 300
+    assert t.get_total() == 300
+
+
+def test_record_evicted():
+    t = UsageTracker()
+    t.record_stored("a", 100)
+    t.record_evicted("a", 40)
+    assert t.get("a") == 60
+    assert t.get_total() == 60
+
+
+def test_evict_clamps_at_zero():
+    t = UsageTracker()
+    t.record_stored("a", 50)
+    t.record_evicted("a", 100)
+    assert t.get("a") == 0
+    assert t.get_total() == 0
+
+
+def test_evict_removes_zero_entry():
+    t = UsageTracker()
+    t.record_stored("a", 100)
+    t.record_evicted("a", 100)
+    assert t.get_all() == {}
+
+
+def test_multiple_salts():
+    t = UsageTracker()
+    t.record_stored("a", 100)
+    t.record_stored("b", 200)
+    assert t.get("a") == 100
+    assert t.get("b") == 200
+    assert t.get_total() == 300
+
+
+def test_get_unknown_returns_zero():
+    t = UsageTracker()
+    assert t.get("unknown") == 0
+
+
+def test_get_all():
+    t = UsageTracker()
+    t.record_stored("a", 100)
+    t.record_stored("b", 200)
+    assert t.get_all() == {"a": 100, "b": 200}
+
+
+def test_get_all_empty():
+    t = UsageTracker()
+    assert t.get_all() == {}
+
+
+def test_zero_bytes_is_noop():
+    t = UsageTracker()
+    t.record_stored("a", 0)
+    assert t.get("a") == 0
+    assert t.get_all() == {}
+
+
+def test_negative_store_raises():
+    t = UsageTracker()
+    with pytest.raises(ValueError, match="non-negative"):
+        t.record_stored("a", -1)
+
+
+def test_negative_evict_raises():
+    t = UsageTracker()
+    with pytest.raises(ValueError, match="non-negative"):
+        t.record_evicted("a", -1)

From 5db61993c9e99db56d0b60ca98bf0b025f1755f7 Mon Sep 17 00:00:00 2001
From: chunxiaozheng <idellzheng@tencent.com>
Date: Thu, 11 Jun 2026 07:52:46 +0800
Subject: [PATCH 27/57] [cli] refactor query and trace cli (#3625)

* refactor: refactor query cli

Signed-off-by: idellzheng <idellzheng@tencent.com>

* refactor: refactor trace cli

Signed-off-by: idellzheng <idellzheng@tencent.com>

* bugfix

Signed-off-by: idellzheng <idellzheng@tencent.com>

---------

Signed-off-by: idellzheng <idellzheng@tencent.com>
---
 lmcache/cli/commands/query/__init__.py        | 149 ++----
 lmcache/cli/commands/query/engine_command.py  | 130 ++++++
 lmcache/cli/commands/query/kvcache_command.py |  46 ++
 lmcache/cli/commands/trace/__init__.py        | 439 ++----------------
 lmcache/cli/commands/trace/info_command.py    |  77 +++
 lmcache/cli/commands/trace/replay_command.py  | 340 ++++++++++++++
 tests/cli/commands/test_query.py              |   4 +-
 7 files changed, 665 insertions(+), 520 deletions(-)
 create mode 100644 lmcache/cli/commands/query/engine_command.py
 create mode 100644 lmcache/cli/commands/query/kvcache_command.py
 create mode 100644 lmcache/cli/commands/trace/info_command.py
 create mode 100644 lmcache/cli/commands/trace/replay_command.py

diff --git a/lmcache/cli/commands/query/__init__.py b/lmcache/cli/commands/query/__init__.py
index 8a3d164a57..9d4ecb1bb1 100644
--- a/lmcache/cli/commands/query/__init__.py
+++ b/lmcache/cli/commands/query/__init__.py
@@ -1,14 +1,26 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Run one OpenAI-compatible inference request and report token/latency metrics."""
+"""``lmcache query`` command — single-shot inference request interface.
+
+Subcommands:
+
+* ``engine`` — send one request to an OpenAI-compatible HTTP API
+* ``kvcache`` — query KV-cache endpoints (not implemented yet)
+"""
 
 # Standard
 import argparse
 import sys
 
 # First Party
-from lmcache.cli.commands.base import BaseCommand, _add_output_args
-from lmcache.cli.commands.query.prompt import PromptBuilder
-from lmcache.cli.commands.query.request import Request
+from lmcache.cli.commands.base import BaseCommand
+from lmcache.cli.commands.query.engine_command import (
+    register_engine_parser,
+    run_query_engine,
+)
+from lmcache.cli.commands.query.kvcache_command import (
+    register_kvcache_parser,
+    run_query_kvcache,
+)
 
 
 class QueryCommand(BaseCommand):
@@ -21,9 +33,14 @@ def help(self) -> str:
         return "Run one inference request and report metrics."
 
     def add_arguments(self, _parser: argparse.ArgumentParser) -> None:
-        pass
+        pass  # args registered in register() via subparsers
 
     def register(self, subparsers: argparse._SubParsersAction) -> None:
+        """Register ``lmcache query`` and all query sub-subcommands.
+
+        Args:
+            subparsers: The subparsers action from the root parser.
+        """
         parser = subparsers.add_parser(
             self.name(),
             help=self.help(),
@@ -36,120 +53,24 @@ def register(self, subparsers: argparse._SubParsersAction) -> None:
             required=True,
             metavar="{engine,kvcache}",
         )
-        self._register_engine(inner)
-        self._register_kvcache(inner)
-
-    def _register_engine(self, subparsers: argparse._SubParsersAction) -> None:
-        parser = subparsers.add_parser(
-            "engine",
-            help="Send one request to an OpenAI-compatible HTTP API.",
-        )
-        parser.add_argument("--url", required=True, help="Serving engine base URL.")
-        parser.add_argument(
-            "--prompt",
-            required=True,
-            help="Prompt text with optional {name} placeholders.",
-        )
-        parser.add_argument(
-            "--model",
-            default=None,
-            metavar="ID",
-            help="Model ID for the serving engine.",
-        )
-        parser.add_argument(
-            "--max-tokens",
-            type=int,
-            default=128,
-            help="Maximum completion tokens (default: 128).",
-        )
-        parser.add_argument(
-            "--timeout",
-            type=float,
-            default=30.0,
-            help="HTTP timeout in seconds (default: 30).",
-        )
-        parser.add_argument(
-            "--documents",
-            action="extend",
-            nargs="+",
-            default=[],
-            metavar="NAME=PATH",
-            help=(
-                "Load file text for {NAME} in --prompt. "
-                "Accepts one or more NAME=PATH values."
-            ),
-        )
-        parser.add_argument(
-            "--path",
-            dest="documents",
-            action="extend",
-            nargs="+",
-            metavar="NAME=PATH",
-            help=argparse.SUPPRESS,
-        )
-        parser.add_argument(
-            "--completions",
-            action="store_true",
-            help="Use POST /v1/completions only.",
-        )
-        parser.add_argument(
-            "--chat-first",
-            action="store_true",
-            help="Try /v1/chat/completions first, then fall back to /v1/completions.",
-        )
-        _add_output_args(parser)
-        parser.set_defaults(func=self.execute)
-
-    def _register_kvcache(self, subparsers: argparse._SubParsersAction) -> None:
-        parser = subparsers.add_parser(
-            "kvcache",
-            help="Query KV-cache endpoints (not implemented yet).",
-        )
-        _add_output_args(parser)
-        parser.set_defaults(func=self.execute)
+        register_engine_parser(inner, self.execute)
+        register_kvcache_parser(inner, self.execute)
 
     def execute(self, args: argparse.Namespace) -> None:
+        """Dispatch to the appropriate query subcommand handler.
+
+        Args:
+            args: Parsed CLI arguments containing ``query_target``.
+        """
         handlers = {
-            "engine": self.query_engine,
-            "kvcache": self.query_kvcache,
+            "engine": lambda a: run_query_engine(self, a),
+            "kvcache": lambda a: run_query_kvcache(self, a),
         }
         handler = handlers.get(args.query_target)
         if handler is None:
-            print(f"Unknown query target: {args.query_target}", file=sys.stderr)
-            sys.exit(1)
-        handler(args)
-
-    def query_engine(self, args: argparse.Namespace) -> None:
-        try:
-            prompt_builder = PromptBuilder(args.prompt, args.documents)
-            sender = Request(
-                base=args.url,
-                model=args.model,
-                max_tokens=args.max_tokens,
-                timeout=args.timeout,
-                completions_only=args.completions,
-                chat_first=args.chat_first,
+            print(
+                f"Unknown query target: {args.query_target}",
+                file=sys.stderr,
             )
-            engine_stats = sender.send_request(prompt_builder.complete_prompt)
-
-            model_id = args.model or str(engine_stats["model"][1])
-            metrics = self.create_metrics("Query Engine", args)
-            metrics.add("model", "Model", model_id)
-            prompt_name, prompt_value = engine_stats["prompt_tokens"]
-            metrics.add("prompt_tokens", prompt_name, int(prompt_value))
-            output_name, output_value = engine_stats["output_tokens"]
-            metrics.add("output_tokens", output_name, int(output_value))
-
-            latency = metrics.add_section("latency", "Latency Metrics")
-            for key, (name, value) in engine_stats.items():
-                if key in ("model", "prompt_tokens", "output_tokens"):
-                    continue
-                latency.add(key, name, round(float(value), 2))
-
-            metrics.emit()
-        except (RuntimeError, ValueError) as err:
-            print(str(err), file=sys.stderr)
             sys.exit(1)
-
-    def query_kvcache(self, args: argparse.Namespace) -> None:
-        pass
+        handler(args)
diff --git a/lmcache/cli/commands/query/engine_command.py b/lmcache/cli/commands/query/engine_command.py
new file mode 100644
index 0000000000..79610d9fe6
--- /dev/null
+++ b/lmcache/cli/commands/query/engine_command.py
@@ -0,0 +1,130 @@
+# SPDX-License-Identifier: Apache-2.0
+"""``lmcache query engine`` — send one request to an OpenAI-compatible API."""
+
+# Standard
+from typing import TYPE_CHECKING
+import argparse
+import sys
+
+# First Party
+from lmcache.cli.commands.base import _add_output_args
+from lmcache.cli.commands.query.prompt import PromptBuilder
+from lmcache.cli.commands.query.request import Request
+
+if TYPE_CHECKING:
+    # First Party
+    from lmcache.cli.commands.base import BaseCommand
+
+
+def register_engine_parser(
+    subparsers: argparse._SubParsersAction,
+    dispatch_func,
+) -> argparse.ArgumentParser:
+    """Register the ``lmcache query engine`` subcommand parser.
+
+    Args:
+        subparsers: The ``query`` subparsers action.
+        dispatch_func: Function to bind via ``set_defaults(func=...)``.
+
+    Returns:
+        The created ``ArgumentParser``.
+    """
+    parser = subparsers.add_parser(
+        "engine",
+        help="Send one request to an OpenAI-compatible HTTP API.",
+    )
+    parser.add_argument("--url", required=True, help="Serving engine base URL.")
+    parser.add_argument(
+        "--prompt",
+        required=True,
+        help="Prompt text with optional {name} placeholders.",
+    )
+    parser.add_argument(
+        "--model",
+        default=None,
+        metavar="ID",
+        help="Model ID for the serving engine.",
+    )
+    parser.add_argument(
+        "--max-tokens",
+        type=int,
+        default=128,
+        help="Maximum completion tokens (default: 128).",
+    )
+    parser.add_argument(
+        "--timeout",
+        type=float,
+        default=30.0,
+        help="HTTP timeout in seconds (default: 30).",
+    )
+    parser.add_argument(
+        "--documents",
+        action="extend",
+        nargs="+",
+        default=[],
+        metavar="NAME=PATH",
+        help=(
+            "Load file text for {NAME} in --prompt. "
+            "Accepts one or more NAME=PATH values."
+        ),
+    )
+    parser.add_argument(
+        "--path",
+        dest="documents",
+        action="extend",
+        nargs="+",
+        metavar="NAME=PATH",
+        help=argparse.SUPPRESS,
+    )
+    parser.add_argument(
+        "--completions",
+        action="store_true",
+        help="Use POST /v1/completions only.",
+    )
+    parser.add_argument(
+        "--chat-first",
+        action="store_true",
+        help="Try /v1/chat/completions first, then fall back to /v1/completions.",
+    )
+    _add_output_args(parser)
+    parser.set_defaults(func=dispatch_func)
+    return parser
+
+
+def run_query_engine(cmd: "BaseCommand", args: argparse.Namespace) -> None:
+    """Execute the ``lmcache query engine`` subcommand.
+
+    Args:
+        cmd: The parent command instance (for metrics creation).
+        args: Parsed CLI arguments.
+    """
+    try:
+        prompt_builder = PromptBuilder(args.prompt, args.documents)
+        sender = Request(
+            base=args.url,
+            model=args.model,
+            max_tokens=args.max_tokens,
+            timeout=args.timeout,
+            completions_only=args.completions,
+            chat_first=args.chat_first,
+        )
+        engine_stats = sender.send_request(prompt_builder.complete_prompt)
+
+        model_id = args.model or str(engine_stats["model"][1])
+        metrics = cmd.create_metrics("Query Engine", args)
+        metrics.add("model", "Model", model_id)
+        prompt_name, prompt_value = engine_stats["prompt_tokens"]
+        metrics.add("prompt_tokens", prompt_name, int(prompt_value))
+        output_name, output_value = engine_stats["output_tokens"]
+        metrics.add("output_tokens", output_name, int(output_value))
+
+        latency = metrics.add_section("latency", "Latency Metrics")
+        for key, (name, value) in engine_stats.items():
+            if key in ("model", "prompt_tokens", "output_tokens"):
+                continue
+            latency.add(key, name, round(float(value), 2))
+
+        metrics.emit()
+    except (RuntimeError, ValueError) as err:
+        print(str(err), file=sys.stderr)
+        sys.exit(1)
diff --git a/lmcache/cli/commands/query/kvcache_command.py b/lmcache/cli/commands/query/kvcache_command.py
new file mode 100644
index 0000000000..3fed79a633
--- /dev/null
+++ b/lmcache/cli/commands/query/kvcache_command.py
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: Apache-2.0
+"""``lmcache query kvcache`` — query KV-cache endpoints (placeholder)."""
+
+# Standard
+from typing import TYPE_CHECKING
+import argparse
+
+# First Party
+from lmcache.cli.commands.base import _add_output_args
+
+if TYPE_CHECKING:
+    # First Party
+    from lmcache.cli.commands.base import BaseCommand
+
+
+def register_kvcache_parser(
+    subparsers: argparse._SubParsersAction,
+    dispatch_func,
+) -> argparse.ArgumentParser:
+    """Register the ``lmcache query kvcache`` subcommand parser.
+
+    Args:
+        subparsers: The ``query`` subparsers action.
+        dispatch_func: Function to bind via ``set_defaults(func=...)``.
+
+    Returns:
+        The created ``ArgumentParser``.
+    """
+    parser = subparsers.add_parser(
+        "kvcache",
+        help="Query KV-cache endpoints (not implemented yet).",
+    )
+    _add_output_args(parser)
+    parser.set_defaults(func=dispatch_func)
+    return parser
+
+
+def run_query_kvcache(cmd: "BaseCommand", args: argparse.Namespace) -> None:
+    """Execute the ``lmcache query kvcache`` subcommand.
+
+    Args:
+        cmd: The parent command instance (for metrics creation).
+        args: Parsed CLI arguments.
+    """
+    # TODO: implement kvcache query logic
+    pass
diff --git a/lmcache/cli/commands/trace/__init__.py b/lmcache/cli/commands/trace/__init__.py
index f9d67f6d3d..569a6c00fd 100644
--- a/lmcache/cli/commands/trace/__init__.py
+++ b/lmcache/cli/commands/trace/__init__.py
@@ -1,75 +1,57 @@
 # SPDX-License-Identifier: Apache-2.0
-
-"""``lmcache trace`` — inspect and replay storage-level trace files.
+"""``lmcache trace`` command — inspect and replay storage-level trace files.
 
 Subcommands:
 
 * ``info FILE`` — print a summary (header metadata + per-qualname
   record counts).
 * ``replay FILE ...`` — reissue every recorded call against a fresh
-  StorageManager, honoring the recorded inter-call timings.  Takes
-  the standard storage-manager CLI flags (see
-  :func:`lmcache.v1.distributed.config.add_storage_manager_args`),
-  plus per-record output (``--verbose`` / ``--jsonl-out``),
-  aggregated CSV/JSON summary export (``--output-dir`` / ``--no-csv``
-  / ``--json``), and a terminal metrics table (suppressible with
-  ``-q``).
+  StorageManager, honoring the recorded inter-call timings.
 
 Trace *capture* is not a ``trace`` subcommand — recording is bound to
 the live process via ``lmcache server --trace-level storage
-[--trace-output ...]``.  Surfacing a CLI stub here would only
-duplicate that flag while leaving the user wondering why it cannot
-start a recorder against an already-running server.
+[--trace-output ...]``.
 """
 
 # Future
 from __future__ import annotations
 
 # Standard
-from collections import Counter
 from typing import Callable
 import argparse
-import json
-import os
 import sys
 
 # First Party
 from lmcache.cli.commands.base import BaseCommand
-from lmcache.cli.metrics import Metrics, StreamHandler, get_formatter
-from lmcache.logging import init_logger
-
-logger = init_logger(__name__)
-
-# ``lmcache trace`` drives a real StorageManager and decodes a binary
-# trace file — both pulled from the full LMCache runtime
-# (``lmcache.v1.*``, torch kernels, native ops).  Users who installed
-# the thin ``lmcache-cli`` shell lack those modules, so importing them
-# unconditionally at the top of this file would kill the *entire*
-# ``lmcache`` CLI with an opaque ImportError the first time
-# ``lmcache/cli/commands/__init__.py`` loads the command registry.
-#
-# Wrap the heavy imports and remember the error so each subcommand
-# handler can bail out with an actionable install hint.  ``record`` is
-# a stub that needs no runtime, so it keeps working on a CLI-only
-# install.
+from lmcache.cli.commands.trace.info_command import register_info_parser, run_trace_info
+from lmcache.cli.commands.trace.replay_command import (
+    register_replay_parser,
+    run_trace_replay,
+)
+
+# The full LMCache runtime (``lmcache.v1.*``, torch kernels, native
+# ops) is required for ``trace info`` and ``trace replay``.  Users who
+# installed the thin ``lmcache-cli`` shell lack those modules.  Wrap
+# the heavy imports and remember the error so each subcommand handler
+# can bail out with an actionable install hint.
 _IMPORT_ERROR: ImportError | None = None
 try:
     # First Party
-    from lmcache.cli.commands.trace.driver import (
+    from lmcache.cli.commands.trace.driver import (  # noqa: F401
         ReplayResult,
         StorageReplayDriver,
     )
-    from lmcache.cli.commands.trace.stats import ReplayStatsCollector
-    from lmcache.v1.distributed.config import (
+    from lmcache.cli.commands.trace.stats import ReplayStatsCollector  # noqa: F401
+    from lmcache.v1.distributed.config import (  # noqa: F401
         StorageManagerConfig,
         add_storage_manager_args,
         parse_args_to_config,
     )
-    from lmcache.v1.mp_observability.config import (
+    from lmcache.v1.mp_observability.config import (  # noqa: F401
         add_observability_args,
         parse_args_to_observability_config,
     )
-    from lmcache.v1.mp_observability.trace.reader import TraceReader
+    from lmcache.v1.mp_observability.trace.reader import TraceReader  # noqa: F401
 except ImportError as _exc:
     _IMPORT_ERROR = _exc
 
@@ -85,10 +67,6 @@ def _require_full_install() -> None:
     exits with status ``2`` so scripts can detect the install gap
     programmatically.
 
-    Writes directly to :data:`sys.stderr` rather than going through
-    :mod:`logging` so the message reaches the user even when the
-    lmcache logger has been suppressed or its handlers redirected.
-
     No-op when imports succeeded, so it is safe to call
     unconditionally at the top of every trace handler.
     """
@@ -116,19 +94,13 @@ def help(self) -> str:
         return "Inspect and replay LMCache storage-level trace files."
 
     def add_arguments(self, _parser: argparse.ArgumentParser) -> None:
-        # Empty; all args live under the per-subcommand parsers added
-        # in :meth:`register`.
-        pass
+        pass  # args registered in register() via subparsers
 
     def register(self, subparsers: argparse._SubParsersAction) -> None:
         """Register ``trace`` with the root parser.
 
         Overrides :meth:`BaseCommand.register` because ``trace`` has
-        its own nested subparsers (``info`` and ``replay``).  The
-        base-class ``--format``/``--output``/``--quiet`` flags do not
-        apply uniformly across the subcommands — ``replay`` has its
-        own ``--jsonl-out`` output channel — so they are added only
-        to ``info``.
+        its own nested subparsers (``info`` and ``replay``).
         """
         parser = subparsers.add_parser(
             self.name(),
@@ -140,366 +112,25 @@ def register(self, subparsers: argparse._SubParsersAction) -> None:
             required=True,
             metavar="{info,replay}",
         )
-        self._register_info(inner)
-        self._register_replay(inner)
+        register_info_parser(inner, self.execute)
+        register_replay_parser(inner, self.execute)
 
     def execute(self, args: argparse.Namespace) -> None:
-        """Dispatch the parsed subcommand."""
+        """Dispatch to the appropriate trace subcommand handler.
+
+        Args:
+            args: Parsed CLI arguments containing ``trace_target``.
+        """
+        _require_full_install()
         handlers: dict[str, Callable[[argparse.Namespace], None]] = {
-            "info": self._run_info,
-            "replay": self._run_replay,
+            "info": lambda a: run_trace_info(self, a),
+            "replay": lambda a: run_trace_replay(self, a),
         }
         handler = handlers.get(args.trace_target)
         if handler is None:
-            # ``required=True`` on the subparser makes this unreachable
-            # in practice; branch is kept for defensive logging.
-            print(f"Unknown trace target: {args.trace_target}", file=sys.stderr)
-            sys.exit(1)
-        handler(args)
-
-    # ------------------------------------------------------------------
-    # ``info``
-    # ------------------------------------------------------------------
-
-    def _register_info(self, subparsers: argparse._SubParsersAction) -> None:
-        parser = subparsers.add_parser(
-            "info",
-            help="Print a summary of a trace file.",
-        )
-        parser.add_argument(
-            "trace_path",
-            metavar="FILE",
-            help="Path to a .lct trace file.",
-        )
-        parser.set_defaults(func=self.execute)
-
-    def _run_info(self, args: argparse.Namespace) -> None:
-        """Read a trace file and print a one-screen summary."""
-        _require_full_install()
-        with TraceReader(args.trace_path) as r:
-            header = r.header
-            counts: Counter[str] = Counter()
-            max_mono = 0.0
-            for record in r.records():
-                counts[record.qualname] += 1
-                if record.t_mono > max_mono:
-                    max_mono = record.t_mono
-
-        print(f"Trace file: {args.trace_path}")
-        print(f"  level:                {header.level}")
-        print(f"  format_version:       {header.format_version}")
-        print(f"  trace_schema_version: {header.trace_schema_version}")
-        print(f"  duration:             {max_mono:.3f}s")
-        print(f"  sm_config_digest:     {header.sm_config_digest or '(none)'}")
-        print(f"  total_records:        {sum(counts.values())}")
-        if counts:
-            print("  ops:")
-            for qn in sorted(counts):
-                print(f"    {qn}: {counts[qn]}")
-        else:
-            print("  ops: (none)")
-
-    # ------------------------------------------------------------------
-    # ``replay``
-    # ------------------------------------------------------------------
-
-    def _register_replay(self, subparsers: argparse._SubParsersAction) -> None:
-        parser = subparsers.add_parser(
-            "replay",
-            help="Replay a trace file against a fresh StorageManager.",
-            description=(
-                "Replay a trace file against a fresh StorageManager.  "
-                "Accepts the standard storage-manager config flags "
-                "(--l1-size-gb, --eviction-policy, --l2-…); see "
-                "'lmcache server --help' for the full list."
-            ),
-        )
-        parser.add_argument(
-            "trace_path",
-            metavar="FILE",
-            help="Path to a .lct trace file.",
-        )
-        parser.add_argument(
-            "--verbose",
-            action="store_true",
-            default=False,
-            help="Print one line per replayed record.",
-        )
-        parser.add_argument(
-            "--jsonl-out",
-            default=None,
-            metavar="PATH",
-            help=(
-                "Write one JSON object per replayed record to PATH "
-                "(qualname, latency_ms, failed).  Useful for post-hoc "
-                "analysis."
-            ),
-        )
-        parser.add_argument(
-            "--output-dir",
-            default=".",
-            help=(
-                "Directory for aggregated CSV/JSON summary output "
-                "(default: current directory)."
-            ),
-        )
-        parser.add_argument(
-            "--no-csv",
-            action="store_true",
-            help="Skip the aggregated CSV summary export.",
-        )
-        parser.add_argument(
-            "--json",
-            action="store_true",
-            help="Also export an aggregated JSON summary.",
-        )
-        parser.add_argument(
-            "-q",
-            "--quiet",
-            action="store_true",
-            help="Suppress the terminal metrics table (files are still written).",
-        )
-        # ``add_storage_manager_args`` and ``add_observability_args``
-        # live in the full LMCache runtime (``lmcache.v1.*``) and are
-        # unavailable in the CLI-only install.  When those imports
-        # failed at module load, register ``replay`` with only the
-        # CLI-local flags above so ``--help`` still works; the actual
-        # execute path bails via :func:`_require_full_install` before
-        # it would try to parse the missing-flag namespace.
-        #
-        # When the full runtime *is* present we share the whole
-        # observability surface with ``lmcache server``.
-        # ``--trace-level`` / ``--trace-output`` configure *recording*
-        # and have no effect during replay; :meth:`_run_replay`
-        # overrides them to ``None`` before constructing the
-        # observability config rather than duplicating the argparse
-        # registration to strip them.
-        if _IMPORT_ERROR is None:
-            add_storage_manager_args(parser)
-            add_observability_args(parser)
-        parser.set_defaults(func=self.execute)
-
-    def _run_replay(self, args: argparse.Namespace) -> None:
-        """Construct a StorageManager from *args* and drive replay.
-
-        Produces three kinds of output:
-
-        * Per-record stream: every dispatch is logged at INFO with its
-          progress (``[N/total]``), qualname, and latency.
-          ``--verbose`` additionally mirrors each record to stdout,
-          and ``--jsonl-out PATH`` writes one JSON object per record
-          to ``PATH`` for post-hoc analysis.
-        * Aggregated per-qualname summary: CSV (unless ``--no-csv``)
-          and JSON (with ``--json``) written under ``--output-dir``.
-        * Terminal metrics table (unless ``--quiet``) using the shared
-          :class:`~lmcache.cli.metrics.Metrics` renderer.
-        """
-        _require_full_install()
-        sm_config: StorageManagerConfig = parse_args_to_config(args)
-
-        # ``--trace-level`` / ``--trace-output`` belong to the recording
-        # surface.  They are still registered on the parser (see
-        # :meth:`_register_replay`) so the flag set stays in lock-step
-        # with ``lmcache server``, but they have no meaning here — any
-        # value a caller passes is silently clobbered to ``None`` so
-        # the replay-side ``ObservabilityConfig`` never tries to start
-        # a recorder.
-        args.trace_level = None
-        args.trace_output = None
-        obs_config = parse_args_to_observability_config(args)
-
-        # Create output directories *before* replay starts.  A replay
-        # can run for minutes; surfacing a bad ``--output-dir`` or
-        # unwritable ``--jsonl-out`` parent now avoids silently losing
-        # the summary/stream after the work has already happened.
-        os.makedirs(args.output_dir, exist_ok=True)
-        if args.jsonl_out:
-            jsonl_parent = os.path.dirname(os.path.abspath(args.jsonl_out))
-            if jsonl_parent:
-                os.makedirs(jsonl_parent, exist_ok=True)
-
-        # ANSI: bold + yellow for the banner text, reset at the end.
-        # The lmcache log formatter only colors the WARNING prefix;
-        # these codes highlight the message body too.  Writing them
-        # into a file via shell redirection leaves the escape bytes
-        # visible but still readable.
-        bold = "\033[1;33m"
-        reset = "\033[0m"
-        bar = "=" * 78
-        logger.warning(
-            "\n%s%s\n"
-            "  !! REPLAY ENVIRONMENT MISMATCH MAY CAUSE RETRIEVE MISSES !!\n"
-            "%s%s\n"
-            "  * Replay uses the *replay-side* StorageManager config, which\n"
-            "    may differ from the config recorded in the trace.\n"
-            "  * Replay runs on a host whose performance may differ from\n"
-            "    the recording host.\n"
-            "  * StorageManager reads/writes are async — an L2 load that\n"
-            "    had finished at record time may not have finished yet at\n"
-            "    replay time, so the matching retrieve can miss.\n"
-            "\n"
-            "  Treat retrieve-miss counts as a signal about the replay\n"
-            "  environment, not as a defect in the trace.\n"
-            "%s%s",
-            bold,
-            bar,
-            bar,
-            reset,
-            bar,
-            reset,
-        )
-
-        # Pre-scan to count total records so progress logs can show
-        # [N/total].  The reader streams frames, so counting is cheap
-        # relative to replay (which actually dispatches StorageManager
-        # calls).
-        with TraceReader(args.trace_path) as r:
-            total_records = sum(1 for _ in r.records())
-        logger.info(
-            "trace replay: file=%s records=%d",
-            args.trace_path,
-            total_records,
-        )
-
-        jsonl_fh = open(args.jsonl_out, "w") if args.jsonl_out else None
-        verbose = args.verbose
-        counter = {"n": 0}
-
-        def _on_record(qualname: str, latency_s: float, failed: bool) -> None:
-            counter["n"] += 1
-            status = "FAIL" if failed else "OK"
-            logger.info(
-                "[%d/%d] %s %s (%.3fms)",
-                counter["n"],
-                total_records,
-                status,
-                qualname,
-                latency_s * 1000.0,
+            print(
+                f"Unknown trace target: {args.trace_target}",
+                file=sys.stderr,
             )
-            if verbose:
-                print(
-                    f"  [{counter['n']}/{total_records}]  "
-                    f"{status:<4}  {latency_s * 1000:8.3f}ms  {qualname}"
-                )
-            if jsonl_fh is not None:
-                jsonl_fh.write(
-                    json.dumps(
-                        {
-                            "qualname": qualname,
-                            "latency_ms": latency_s * 1000.0,
-                            "failed": failed,
-                        }
-                    )
-                    + "\n"
-                )
-
-        try:
-            with StorageReplayDriver(
-                sm_config, args.trace_path, obs_config=obs_config
-            ) as driver:
-                result = driver.run(on_record=_on_record)
-        finally:
-            if jsonl_fh is not None:
-                jsonl_fh.close()
-
-        if not args.no_csv:
-            csv_path = os.path.join(args.output_dir, "trace_replay_ops.csv")
-            result.stats.export_csv(csv_path)
-            logger.info("CSV written to %s", csv_path)
-        if args.json:
-            json_path = os.path.join(args.output_dir, "trace_replay_summary.json")
-            result.stats.export_json(json_path)
-            logger.info("JSON written to %s", json_path)
-
-        if not args.quiet:
-            self._emit_replay_metrics(result.stats, result)
-
-        if result.records_failed > 0:
             sys.exit(1)
-
-    @staticmethod
-    def _emit_replay_metrics(
-        stats: ReplayStatsCollector,
-        result: ReplayResult,
-    ) -> None:
-        """Print the replay summary using the shared :class:`Metrics` renderer.
-
-        Args:
-            stats: The stats collector populated during replay.
-            result: The full :class:`ReplayResult` — used for the
-                replayed/skipped/failed totals and digest comparison.
-        """
-        metrics = Metrics(title="Trace Replay Result")
-        metrics.add_handler(StreamHandler(get_formatter("terminal", width=64)))
-
-        overall = metrics.add_section("overall", "Overall")
-        overall.add("level", "Trace level", result.header_level)
-        overall.add("replayed", "Records replayed", result.records_replayed)
-        overall.add("skipped", "Records skipped", result.records_skipped)
-        overall.add("failed", "Records failed", result.records_failed)
-        overall.add(
-            "duration",
-            "Replay duration (s)",
-            round(stats.total_duration_s(), 3),
-        )
-        header_digest = result.header_digest
-        replay_digest = result.replay_config_digest
-        if header_digest and replay_digest and header_digest != replay_digest:
-            overall.add(
-                "digest",
-                "Config digest",
-                f"MISMATCH (rec={header_digest[:8]}, run={replay_digest[:8]})",
-            )
-        elif header_digest:
-            overall.add("digest", "Config digest", f"match ({header_digest[:8]})")
-
-        summary = stats.summary()
-        if summary:
-            ops_section = metrics.add_section("ops", "Per-Op Latency (ms)")
-            for qn in sorted(summary):
-                s = summary[qn]
-                short = _short_op_name(qn)
-                ops_section.add(f"{short}_count", f"{short} count", s.count)
-                ops_section.add(
-                    f"{short}_mean",
-                    f"{short} mean",
-                    round(s.mean_ms, 3),
-                )
-                ops_section.add(
-                    f"{short}_p50",
-                    f"{short} p50",
-                    round(s.p50_ms, 3),
-                )
-                ops_section.add(
-                    f"{short}_p99",
-                    f"{short} p99",
-                    round(s.p99_ms, 3),
-                )
-
-        metrics.emit()
-
-
-def _short_op_name(qualname: str) -> str:
-    """Return a compact, human-readable label for a traced qualname.
-
-    Plain methods collapse to the method name: the table has limited
-    column width and the fully-qualified path is verbose.
-
-    Context-manager handlers (``__enter__`` / ``__exit__``) instead
-    collapse to ``<owning_method>.enter`` / ``<owning_method>.exit``,
-    so the reader can tell *which* context manager the pair belongs
-    to — the bare ``__enter__`` / ``__exit__`` label is useless when
-    multiple context-manager-returning methods are traced.
-
-    Args:
-        qualname: Dotted qualname recorded by the tracer, e.g.
-            ``lmcache.v1.distributed.storage_manager.StorageManager.read_prefetched_results.__enter__``.
-
-    Returns:
-        A short label suitable as a metrics row prefix.
-    """
-    parts = qualname.split(".")
-    last = parts[-1]
-    if last in ("__enter__", "__exit__") and len(parts) >= 2:
-        return f"{parts[-2]}.{last.strip('_')}"
-    return last
+        handler(args)
diff --git a/lmcache/cli/commands/trace/info_command.py b/lmcache/cli/commands/trace/info_command.py
new file mode 100644
index 0000000000..b62fee8455
--- /dev/null
+++ b/lmcache/cli/commands/trace/info_command.py
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: Apache-2.0
+"""``lmcache trace info`` — print a summary of a trace file."""
+
+# Future
+from __future__ import annotations
+
+# Standard
+from collections import Counter
+from typing import TYPE_CHECKING
+import argparse
+
+if TYPE_CHECKING:
+    # First Party
+    from lmcache.cli.commands.base import BaseCommand
+
+
+def register_info_parser(
+    subparsers: argparse._SubParsersAction,
+    dispatch_func,
+) -> argparse.ArgumentParser:
+    """Register the ``lmcache trace info`` subcommand parser.
+
+    Args:
+        subparsers: The ``trace`` subparsers action.
+        dispatch_func: Function to bind via ``set_defaults(func=...)``.
+
+    Returns:
+        The created ``ArgumentParser``.
+    """
+    parser = subparsers.add_parser(
+        "info",
+        help="Print a summary of a trace file.",
+    )
+    parser.add_argument(
+        "trace_path",
+        metavar="FILE",
+        help="Path to a .lct trace file.",
+    )
+    parser.set_defaults(func=dispatch_func)
+    return parser
+
+
+def run_trace_info(cmd: "BaseCommand", args: argparse.Namespace) -> None:
+    """Read a trace file and print a one-screen summary.
+
+    Args:
+        cmd: The parent command instance (unused but kept for interface
+            consistency with other subcommands).
+        args: Parsed CLI arguments containing ``trace_path``.
+    """
+    # Deferred import — guarded by _require_full_install() in the
+    # dispatcher before this function is called.
+    # First Party
+    from lmcache.v1.mp_observability.trace.reader import TraceReader
+
+    with TraceReader(args.trace_path) as r:
+        header = r.header
+        counts: Counter[str] = Counter()
+        max_mono = 0.0
+        for record in r.records():
+            counts[record.qualname] += 1
+            if record.t_mono > max_mono:
+                max_mono = record.t_mono
+
+    print(f"Trace file: {args.trace_path}")
+    print(f"  level:                {header.level}")
+    print(f"  format_version:       {header.format_version}")
+    print(f"  trace_schema_version: {header.trace_schema_version}")
+    print(f"  duration:             {max_mono:.3f}s")
+    print(f"  sm_config_digest:     {header.sm_config_digest or '(none)'}")
+    print(f"  total_records:        {sum(counts.values())}")
+    if counts:
+        print("  ops:")
+        for qn in sorted(counts):
+            print(f"    {qn}: {counts[qn]}")
+    else:
+        print("  ops: (none)")
diff --git a/lmcache/cli/commands/trace/replay_command.py b/lmcache/cli/commands/trace/replay_command.py
new file mode 100644
index 0000000000..51deb4cca1
--- /dev/null
+++ b/lmcache/cli/commands/trace/replay_command.py
@@ -0,0 +1,340 @@
+# SPDX-License-Identifier: Apache-2.0
+"""``lmcache trace replay`` — replay a trace file against a StorageManager."""
+
+# Future
+from __future__ import annotations
+
+# Standard
+from typing import TYPE_CHECKING
+import argparse
+import json
+import os
+import sys
+
+# First Party
+from lmcache.cli.metrics import Metrics, StreamHandler, get_formatter
+from lmcache.logging import init_logger
+
+logger = init_logger(__name__)
+
+if TYPE_CHECKING:
+    # First Party
+    from lmcache.cli.commands.base import BaseCommand
+    from lmcache.cli.commands.trace.driver import ReplayResult
+    from lmcache.cli.commands.trace.stats import ReplayStatsCollector
+
+
+def register_replay_parser(
+    subparsers: argparse._SubParsersAction,
+    dispatch_func,
+) -> argparse.ArgumentParser:
+    """Register the ``lmcache trace replay`` subcommand parser.
+
+    Args:
+        subparsers: The ``trace`` subparsers action.
+        dispatch_func: Function to bind via ``set_defaults(func=...)``.
+
+    Returns:
+        The created ``ArgumentParser``.
+    """
+    parser = subparsers.add_parser(
+        "replay",
+        help="Replay a trace file against a fresh StorageManager.",
+        description=(
+            "Replay a trace file against a fresh StorageManager.  "
+            "Accepts the standard storage-manager config flags "
+            "(--l1-size-gb, --eviction-policy, --l2-…); see "
+            "'lmcache server --help' for the full list."
+        ),
+    )
+    parser.add_argument(
+        "trace_path",
+        metavar="FILE",
+        help="Path to a .lct trace file.",
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        default=False,
+        help="Print one line per replayed record.",
+    )
+    parser.add_argument(
+        "--jsonl-out",
+        default=None,
+        metavar="PATH",
+        help=(
+            "Write one JSON object per replayed record to PATH "
+            "(qualname, latency_ms, failed).  Useful for post-hoc "
+            "analysis."
+        ),
+    )
+    parser.add_argument(
+        "--output-dir",
+        default=".",
+        help=(
+            "Directory for aggregated CSV/JSON summary output "
+            "(default: current directory)."
+        ),
+    )
+    parser.add_argument(
+        "--no-csv",
+        action="store_true",
+        help="Skip the aggregated CSV summary export.",
+    )
+    parser.add_argument(
+        "--json",
+        action="store_true",
+        help="Also export an aggregated JSON summary.",
+    )
+    parser.add_argument(
+        "-q",
+        "--quiet",
+        action="store_true",
+        help="Suppress the terminal metrics table (files are still written).",
+    )
+
+    try:
+        # First Party
+        from lmcache.v1.distributed.config import add_storage_manager_args
+        from lmcache.v1.mp_observability.config import add_observability_args
+
+        add_storage_manager_args(parser)
+        add_observability_args(parser)
+    except ImportError as e:
+        logger.warning("lmcache trace replay import error, error is %s", e)
+
+    parser.set_defaults(func=dispatch_func)
+    return parser
+
+
+def run_trace_replay(cmd: "BaseCommand", args: argparse.Namespace) -> None:
+    """Construct a StorageManager from *args* and drive replay.
+
+    Produces three kinds of output:
+
+    * Per-record stream: every dispatch is logged at INFO with its
+      progress (``[N/total]``), qualname, and latency.
+      ``--verbose`` additionally mirrors each record to stdout,
+      and ``--jsonl-out PATH`` writes one JSON object per record
+      to ``PATH`` for post-hoc analysis.
+    * Aggregated per-qualname summary: CSV (unless ``--no-csv``)
+      and JSON (with ``--json``) written under ``--output-dir``.
+    * Terminal metrics table (unless ``--quiet``) using the shared
+      :class:`~lmcache.cli.metrics.Metrics` renderer.
+
+    Args:
+        cmd: The parent command instance (unused but kept for interface
+            consistency).
+        args: Parsed CLI arguments.
+    """
+    # Deferred imports — guarded by _require_full_install() in the
+    # dispatcher before this function is called.
+    # First Party
+    from lmcache.cli.commands.trace.driver import StorageReplayDriver
+    from lmcache.v1.distributed.config import StorageManagerConfig, parse_args_to_config
+    from lmcache.v1.mp_observability.config import parse_args_to_observability_config
+    from lmcache.v1.mp_observability.trace.reader import TraceReader
+
+    sm_config: StorageManagerConfig = parse_args_to_config(args)
+
+    # ``--trace-level`` / ``--trace-output`` belong to the recording
+    # surface.  They are still registered on the parser so the flag set
+    # stays in lock-step with ``lmcache server``, but they have no
+    # meaning here — any value a caller passes is silently clobbered to
+    # ``None`` so the replay-side ``ObservabilityConfig`` never tries to
+    # start a recorder.
+    args.trace_level = None
+    args.trace_output = None
+    obs_config = parse_args_to_observability_config(args)
+
+    # Create output directories *before* replay starts.  A replay
+    # can run for minutes; surfacing a bad ``--output-dir`` or
+    # unwritable ``--jsonl-out`` parent now avoids silently losing
+    # the summary/stream after the work has already happened.
+    os.makedirs(args.output_dir, exist_ok=True)
+    if args.jsonl_out:
+        jsonl_parent = os.path.dirname(os.path.abspath(args.jsonl_out))
+        if jsonl_parent:
+            os.makedirs(jsonl_parent, exist_ok=True)
+
+    # ANSI: bold + yellow for the banner text, reset at the end.
+    bold = "\033[1;33m"
+    reset = "\033[0m"
+    bar = "=" * 78
+    logger.warning(
+        "\n%s%s\n"
+        "  !! REPLAY ENVIRONMENT MISMATCH MAY CAUSE RETRIEVE MISSES !!\n"
+        "%s%s\n"
+        "  * Replay uses the *replay-side* StorageManager config, which\n"
+        "    may differ from the config recorded in the trace.\n"
+        "  * Replay runs on a host whose performance may differ from\n"
+        "    the recording host.\n"
+        "  * StorageManager reads/writes are async — an L2 load that\n"
+        "    had finished at record time may not have finished yet at\n"
+        "    replay time, so the matching retrieve can miss.\n"
+        "\n"
+        "  Treat retrieve-miss counts as a signal about the replay\n"
+        "  environment, not as a defect in the trace.\n"
+        "%s%s",
+        bold,
+        bar,
+        bar,
+        reset,
+        bar,
+        reset,
+    )
+
+    # Pre-scan to count total records so progress logs can show
+    # [N/total].  The reader streams frames, so counting is cheap
+    # relative to replay (which actually dispatches StorageManager
+    # calls).
+    with TraceReader(args.trace_path) as r:
+        total_records = sum(1 for _ in r.records())
+    logger.info(
+        "trace replay: file=%s records=%d",
+        args.trace_path,
+        total_records,
+    )
+
+    jsonl_fh = open(args.jsonl_out, "w") if args.jsonl_out else None
+    verbose = args.verbose
+    counter = {"n": 0}
+
+    def _on_record(qualname: str, latency_s: float, failed: bool) -> None:
+        counter["n"] += 1
+        status = "FAIL" if failed else "OK"
+        logger.info(
+            "[%d/%d] %s %s (%.3fms)",
+            counter["n"],
+            total_records,
+            status,
+            qualname,
+            latency_s * 1000.0,
+        )
+        if verbose:
+            print(
+                f"  [{counter['n']}/{total_records}]  "
+                f"{status:<4}  {latency_s * 1000:8.3f}ms  {qualname}"
+            )
+        if jsonl_fh is not None:
+            jsonl_fh.write(
+                json.dumps(
+                    {
+                        "qualname": qualname,
+                        "latency_ms": latency_s * 1000.0,
+                        "failed": failed,
+                    }
+                )
+                + "\n"
+            )
+
+    try:
+        with StorageReplayDriver(
+            sm_config, args.trace_path, obs_config=obs_config
+        ) as driver:
+            result = driver.run(on_record=_on_record)
+    finally:
+        if jsonl_fh is not None:
+            jsonl_fh.close()
+
+    if not args.no_csv:
+        csv_path = os.path.join(args.output_dir, "trace_replay_ops.csv")
+        result.stats.export_csv(csv_path)
+        logger.info("CSV written to %s", csv_path)
+    if args.json:
+        json_path = os.path.join(args.output_dir, "trace_replay_summary.json")
+        result.stats.export_json(json_path)
+        logger.info("JSON written to %s", json_path)
+
+    if not args.quiet:
+        _emit_replay_metrics(result.stats, result)
+
+    if result.records_failed > 0:
+        sys.exit(1)
+
+
+def _emit_replay_metrics(
+    stats: "ReplayStatsCollector",
+    result: "ReplayResult",
+) -> None:
+    """Print the replay summary using the shared :class:`Metrics` renderer.
+
+    Args:
+        stats: The stats collector populated during replay.
+        result: The full :class:`ReplayResult` — used for the
+            replayed/skipped/failed totals and digest comparison.
+    """
+    metrics = Metrics(title="Trace Replay Result")
+    metrics.add_handler(StreamHandler(get_formatter("terminal", width=64)))
+
+    overall = metrics.add_section("overall", "Overall")
+    overall.add("level", "Trace level", result.header_level)
+    overall.add("replayed", "Records replayed", result.records_replayed)
+    overall.add("skipped", "Records skipped", result.records_skipped)
+    overall.add("failed", "Records failed", result.records_failed)
+    overall.add(
+        "duration",
+        "Replay duration (s)",
+        round(stats.total_duration_s(), 3),
+    )
+    header_digest = result.header_digest
+    replay_digest = result.replay_config_digest
+    if header_digest and replay_digest and header_digest != replay_digest:
+        overall.add(
+            "digest",
+            "Config digest",
+            f"MISMATCH (rec={header_digest[:8]}, run={replay_digest[:8]})",
+        )
+    elif header_digest:
+        overall.add("digest", "Config digest", f"match ({header_digest[:8]})")
+
+    summary = stats.summary()
+    if summary:
+        ops_section = metrics.add_section("ops", "Per-Op Latency (ms)")
+        for qn in sorted(summary):
+            s = summary[qn]
+            short = _short_op_name(qn)
+            ops_section.add(f"{short}_count", f"{short} count", s.count)
+            ops_section.add(
+                f"{short}_mean",
+                f"{short} mean",
+                round(s.mean_ms, 3),
+            )
+            ops_section.add(
+                f"{short}_p50",
+                f"{short} p50",
+                round(s.p50_ms, 3),
+            )
+            ops_section.add(
+                f"{short}_p99",
+                f"{short} p99",
+                round(s.p99_ms, 3),
+            )
+
+    metrics.emit()
+
+
+def _short_op_name(qualname: str) -> str:
+    """Return a compact, human-readable label for a traced qualname.
+
+    Plain methods collapse to the method name: the table has limited
+    column width and the fully-qualified path is verbose.
+
+    Context-manager handlers (``__enter__`` / ``__exit__``) instead
+    collapse to ``<owning_method>.enter`` / ``<owning_method>.exit``,
+    so the reader can tell *which* context manager the pair belongs
+    to — the bare ``__enter__`` / ``__exit__`` label is useless when
+    multiple context-manager-returning methods are traced.
+
+    Args:
+        qualname: Dotted qualname recorded by the tracer, e.g.
+            ``lmcache.v1.distributed.storage_manager.StorageManager.read_prefetched_results.__enter__``.
+
+    Returns:
+        A short label suitable as a metrics row prefix.
+    """
+    parts = qualname.split(".")
+    last = parts[-1]
+    if last in ("__enter__", "__exit__") and len(parts) >= 2:
+        return f"{parts[-2]}.{last.strip('_')}"
+    return last
diff --git a/tests/cli/commands/test_query.py b/tests/cli/commands/test_query.py
index 9ab3576566..9640078fce 100644
--- a/tests/cli/commands/test_query.py
+++ b/tests/cli/commands/test_query.py
@@ -141,7 +141,7 @@ def test_func_bound_to_execute(
         )
         assert args.func == cmd.execute
 
-    @patch("lmcache.cli.commands.query.Request")
+    @patch("lmcache.cli.commands.query.engine_command.Request")
     def test_execute_calls_request_send_request(
         self,
         mock_request_cls: MagicMock,
@@ -184,7 +184,7 @@ def test_execute_calls_request_send_request(
         assert "Input tokens" in out
         assert "Prompt tokens" not in out
 
-    @patch("lmcache.cli.commands.query.Request")
+    @patch("lmcache.cli.commands.query.engine_command.Request")
     def test_execute_uses_engine_model_when_cli_model_omitted(
         self,
         mock_request_cls: MagicMock,

From 3235ca7edb282cc3a2a28e09df7b38d2c8a0a13c Mon Sep 17 00:00:00 2001
From: Emine Ugur Kaynar <Ugur.Kaynar@dell.com>
Date: Wed, 10 Jun 2026 20:26:24 -0400
Subject: [PATCH 28/57] [Core] Add multipath KV-cache offloading support in
 LMCache NIXL backend (#2418)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [Core] Add multipath KV-cache offloading support in LMCache NIXL backend


Signed-off-by: Ugur Kaynar <Ugur.Kaynar@dell.com>

* Address feedback: add validate_nixl_path helper function and update NixlFilePool path handling


Signed-off-by: Ugur Kaynar <Ugur.Kaynar@dell.com>

* Addresses PR feedback for documentation, unit tests, and formatting


Signed-off-by: Ugur Kaynar <Ugur.Kaynar@dell.com>

* Use metadata.worker_id for path sharding instead of torch.cuda.current_device()

For CPU-buffer backends (POSIX, HF3FS), initialize_allocator does not call
torch.cuda.set_device(), so torch.cuda.current_device() may return 0 for
all workers, defeating multipath sharding. Replace with metadata.worker_id
which reliably distinguishes workers regardless of CUDA state.


Signed-off-by: Ugur Kaynar <Ugur.Kaynar@dell.com>

* Use local_worker_id instead of worker_id for path sharding

In multi-node deployments, worker_id is the global rank which causes
inconsistent path distribution across nodes. local_worker_id is the
local GPU ID on the node, ensuring each node's GPUs map to paths
consistently (e.g. GPU 0 -> path0, GPU 1 -> path1 on every node).


Signed-off-by: Ugur Kaynar <Ugur.Kaynar@dell.com>

* Fix code formatting (ruff format)


Signed-off-by: Ugur Kaynar <Ugur.Kaynar@dell.com>

* Remove redundant assert in NixlFilePool.__init__

validate_nixl_path already checks for None path with a more
informative error message, making this assertion unnecessary.


Signed-off-by: Ugur Kaynar <Ugur.Kaynar@dell.com>

* Add test_nixl_multipath.py to Buildkite unit-test ignore list

test_nixl_multipath.py imports NixlStorageConfig from
nixl_storage_backend.py, which has top-level nixl C extension imports.
When the nixl native bindings cannot fully load in the CI environment,
this causes an ImportError during pytest collection, and --maxfail=1
immediately aborts the entire test suite.

This matches the existing ignore for test_nixl_storage.py.


Signed-off-by: Ugur Kaynar <Ugur.Kaynar@dell.com>

* Rebase NIXL multipath support to use PathSharder

- Remove validate_nixl_path method from NixlStorageConfig
- Update NixlFilePool to accept PathSharder instance
- Update createPool to use PathSharder with buffer_device
- Update tests to use PathSharder directly

This aligns with PR #2982 which centralized path sharding logic.


Signed-off-by: Ugur Kaynar <Ugur.Kaynar@dell.com>

* Fix cursor bot issues: nixl_path validation and multipath sharding

1. Add validation to ensure nixl_path is not None
   - Add assert in from_cache_engine_config to validate nixl_path
   - Add assert in createPool as additional safeguard
   - Prevents TypeError when PathSharder receives None value

2. Fix CPU buffer device multipath sharding issue
   - Pass f'cuda:{metadata.worker_id}' to PathSharder instead of buffer_device
   - Ensures proper path selection based on worker_id for by_gpu sharding
   - Agent still uses correct buffer_device for memory allocation

These fixes resolve both high-severity issues identified by cursor bot.


Signed-off-by: Ugur Kaynar <Ugur.Kaynar@dell.com>

* Add docstring and warning for nixl createPool path sharding

- Document createPool arguments/returns and path sharding behavior
- Warn when list paths contain commas (may affect sharding); PathSharder unchanged


Signed-off-by: Ugur Kaynar <Ugur.Kaynar@dell.com>

* Resolve merge conflicts in nixl_storage_backend.py


Signed-off-by: Ugur Kaynar <Ugur.Kaynar@dell.com>

* fix: resolve buildkite pipeline merge conflict


Signed-off-by: Ugur Kaynar <Ugur.Kaynar@dell.com>

* Fix: Restore missing use_hugepages extraction from merge conflict resolution

During the merge conflict resolution in commit 01da799f, the use_hugepages
extraction line was accidentally deleted. This line is part of the huge pages
feature (commit a68bd0a0) and is needed for the NIXL backend to properly
allocate CPU memory with hugepages support.

Changes:
- Restore use_hugepages: bool field in NixlStorageConfig dataclass
- Restore use_hugepages extraction: extra_config.get("nixl_use_hugepages", False)
- Remove unused import sys from test file (auto-fixed by ruff)

Signed-off-by: Ugur Kaynar <Ugur.kaynar@dell.com>

* [Core] Fix NIXL multipath PR: lint, tests, and dead code

Make CI green for the multipath KV-cache offloading change:

- createPool: remove the duplicate `elif backend in ("OBJ","AZURE_BLOB")`
  branch and the unreachable `return NixlFilePool(...)` left over from a
  merge; collapse back to the single OBJ/AZURE_BLOB/DOCA_MEMOS object-pool
  branch (no behavior change — OBJ/AZURE_BLOB still get b128=False).
- NixlDynamicStorageBackend: reject a list `nixl_path` at init. The dynamic
  backend uses self.path directly as a single directory, and path sharding
  across multiple paths is only implemented for static pools. This narrows
  self.path to str and fixes the three mypy str|list[str] arg-type errors
  by failing loud instead of silently mishandling a list.
- test_nixl_doca_memos: pass the new createPool path_sharding/dst_device
  args (ignored for object backends) to fix the missing-positional-arg
  failures.
- test_nixl_posix_backend_multipath: use the valid 5-element kv_shape
  torch.Size([4, 2, 256, 8, 128]) like the other run()-based tests; the
  previous [2048, 2048] shape crashed in metadata.get_shapes().

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Samuel Shen <slshen@tensormesh.ai>

* [Core] NIXL: don't require nixl_path for non-file backends

The multipath change added an unconditional
`assert path is not None, "nixl_path cannot be None"` in
NixlStorageConfig.from_cache_engine_config, which broke object/CPU
backends that legitimately have no path (e.g. OBJ/DOCA_MEMOS) — this is
what caused the test_nixl_shared_pool.py failures on CI.

Remove the unconditional assert; the existing conditional check already
requires a path only for the file backends that need one:

    if backend in ("GDS", "GDS_MT", "POSIX", "HF3FS"):
        assert path is not None, f"nixl_path must be provided for {backend} backend"

This restores the pre-PR behavior (path optional for object backends).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Samuel Shen <slshen@tensormesh.ai>

---------

Signed-off-by: Ugur Kaynar <Ugur.Kaynar@dell.com>
Signed-off-by: Ugur Kaynar <Ugur.kaynar@dell.com>
Signed-off-by: Emine Ugur Kaynar <Ugur.Kaynar@dell.com>
Signed-off-by: Samuel Shen <slshen@tensormesh.ai>
Co-authored-by: Samuel Shen <slshen@tensormesh.ai>
Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .buildkite/pipeline.yml                       |   1 +
 .../source/kv_cache/storage_backends/nixl.rst |  21 +-
 .../storage_backend/nixl_storage_backend.py   |  82 +++++-
 tests/v1/data/nixl_multipath.yaml             |  14 +
 tests/v1/test_nixl_doca_memos.py              |  14 +-
 tests/v1/test_nixl_multipath.py               |  65 +++++
 tests/v1/test_nixl_storage.py                 | 248 +-----------------
 7 files changed, 193 insertions(+), 252 deletions(-)
 create mode 100644 tests/v1/data/nixl_multipath.yaml
 create mode 100644 tests/v1/test_nixl_multipath.py

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 313b0e1e6d..505df83cdd 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -40,6 +40,7 @@ steps:
         --ignore=tests/disagg --ignore=tests/v1/test_pos_kernels.py \
         --ignore=tests/v1/test_nixl_batched_contains.py \
         --ignore=tests/v1/test_device_id_race.py \
+        --ignore=tests/v1/test_nixl_multipath.py \
         --ignore=tests/skipped \
         --ignore=tests/v1/storage_backend/test_eic.py
 
diff --git a/docs/source/kv_cache/storage_backends/nixl.rst b/docs/source/kv_cache/storage_backends/nixl.rst
index 681ccbe14d..12fd2b436c 100644
--- a/docs/source/kv_cache/storage_backends/nixl.rst
+++ b/docs/source/kv_cache/storage_backends/nixl.rst
@@ -47,12 +47,13 @@ Key settings:
 
 - ``nixl_pool_size``: number of descriptors opened at init time for nixl backend. Set to 0 for dynamic mode.
 
-- ``nixl_path``: directory under which the storage files will be saved (e.g. /mnt/nixl/). Needed for NIXL backends that store to file.
+- ``nixl_path``: directory (or list of directories) under which the storage files will be saved (e.g. /mnt/nixl/). Needed for NIXL backends that store to file. When using a list of paths with ``path_sharding``, paths will be selected based on the sharding strategy.
 
 - ``nixl_buffer_device``: dictates where the memory managed by NIXL should be on. "cpu" or "cuda" is supported for "GDS", "GDS_MT", and "OBJ" backends - for "POSIX", "HF3FS", "AZURE_BLOB" & "DOCA_MEMOS", must be "cpu". In CPU mode, NIXL shares ``LocalCPUBackend``'s pinned buffer; ``LocalCPUBackend`` is always created when ``nixl_buffer_device: cpu``, regardless of the ``local_cpu`` setting. ``local_cpu: false`` still suppresses hot-cache promotions — the backend acts as a staging buffer only, mirroring how ``local_disk`` already uses ``LocalCPUBackend``.
 
 - ``nixl_backend``: configuration of which nixl backend to use for storage.
 
+- ``nixl_path_sharding``: strategy for selecting path when multiple paths are provided. Currently only "by_gpu" is supported, which selects paths based on GPU device ID.
 - ``local_cpu_use_hugepages``: whether to use Linux hugepages (2 MiB) for ``LocalCPUBackend``'s pinned pool (which NIXL shares in CPU mode). Requires pre-allocated hugepages (``sysctl vm.nr_hugepages``). Default: ``false``. **Deprecated alias:** ``extra_config.nixl_use_hugepages`` — accepted with a warning and copied into this field; will be removed in a future release.
 
 .. note::
@@ -69,6 +70,24 @@ Key settings:
 
     Backend specific params should be provided via ``extra_config.nixl_backend_params``. Please refer to NIXL documentation for specifics.
 
+Example ``lmcache-config.yaml`` for POSIX backend with multipath support:
+
+.. code-block:: yaml
+
+    chunk_size: 256
+    nixl_buffer_size: 1073741824 # 1GB
+    nixl_buffer_device: cpu
+    extra_config:
+      enable_nixl_storage: true
+      nixl_backend: POSIX
+      nixl_pool_size: 64
+      nixl_path: 
+        - /mnt/nixl/cache0/
+        - /mnt/nixl/cache1/
+        - /mnt/nixl/cache2/
+      nixl_path_sharding: by_gpu
+      use_direct_io: True
+
 Example ``lmcache-config.yaml`` for OBJ backend using S3 API:
 
 .. code-block:: yaml
diff --git a/lmcache/v1/storage_backend/nixl_storage_backend.py b/lmcache/v1/storage_backend/nixl_storage_backend.py
index 3690623a5c..aba33bbad8 100644
--- a/lmcache/v1/storage_backend/nixl_storage_backend.py
+++ b/lmcache/v1/storage_backend/nixl_storage_backend.py
@@ -79,6 +79,7 @@
 from lmcache.v1.metadata import LMCacheMetadata
 from lmcache.v1.storage_backend.abstract_backend import AllocatorBackendInterface
 from lmcache.v1.storage_backend.cache_policy import get_cache_policy
+from lmcache.v1.storage_backend.path_sharder import PathSharder
 from lmcache.v1.transfer_channel.transfer_utils import get_correct_device
 
 if TYPE_CHECKING:
@@ -110,10 +111,11 @@ class NixlStorageConfig:
     enable_presence_cache: bool
     enable_async_put: bool
     use_direct_io: bool
-    path: str
+    path: Union[str, List[str]]
     use_hugepages: bool
     enable_prog_thread: bool
     sync_mode: Optional[Any]  # nixl_thread_sync_t, None if unsupported
+    path_sharding: str
 
     @staticmethod
     def validate_nixl_backend(backend: str, device: str) -> bool:
@@ -192,6 +194,7 @@ def from_cache_engine_config(
                     f"in nixl_thread_sync_t."
                 )
             sync_mode = getattr(nixl_thread_sync_t, attr_name)
+        path_sharding = extra_config.get("nixl_path_sharding", "by_gpu")
 
         assert pool_size is not None
         assert backend is not None
@@ -250,6 +253,7 @@ def from_cache_engine_config(
             use_hugepages=use_hugepages,
             enable_prog_thread=enable_prog_thread,
             sync_mode=sync_mode,
+            path_sharding=path_sharding,
         )
 
 
@@ -280,13 +284,15 @@ def close(self):
 
 
 class NixlFilePool(NixlDescPool):
-    def __init__(self, size: int, path: str, use_direct_io: bool):
+    def __init__(
+        self,
+        size: int,
+        sharder: PathSharder,
+        use_direct_io: bool,
+    ):
         super().__init__(size)
         self.fds: List[int] = []
 
-        assert path is not None
-        os.makedirs(path, exist_ok=True)
-
         flags = os.O_CREAT | os.O_RDWR
         if use_direct_io:
             if hasattr(os, "O_DIRECT"):
@@ -296,10 +302,12 @@ def __init__(self, size: int, path: str, use_direct_io: bool):
                     "use_direct_io is True, but O_DIRECT is not available on "
                     "this system. Falling back to buffered I/O."
                 )
+        base_path = sharder.selected
+
         for i in reversed(range(size)):
             filename = f"obj_{i}_{uuid.uuid4().hex[0:4]}.bin"
-            tmp_path = os.path.join(path, filename)
-            fd = os.open(tmp_path, flags, DEFAULT_FILE_CREATE_MODE)
+            tmp_path = os.path.join(base_path, filename)
+            fd = os.open(tmp_path, flags)
             self.fds.append(fd)
 
     def close(self):
@@ -993,6 +1001,8 @@ def __init__(
             nixl_config.pool_size,
             nixl_config.path,
             nixl_config.use_direct_io,
+            nixl_config.path_sharding,
+            f"cuda:{metadata.worker_id}",
         )
         assert self.pool is not None
 
@@ -1010,9 +1020,50 @@ def __init__(
         )
 
     @staticmethod
-    def createPool(backend: str, size: int, path: str, use_direct_io: bool):
+    def createPool(
+        backend: str,
+        size: int,
+        path: Union[str, List[str]],
+        use_direct_io: bool,
+        path_sharding: str,
+        dst_device: str,
+    ) -> NixlDescPool:
+        """Create a NIXL descriptor pool with path sharding support.
+
+        Args:
+            backend: Backend type (e.g., "GDS", "POSIX", "OBJ").
+            size: Pool size.
+            path: Single path string or list of paths for sharding.
+            use_direct_io: Whether to use direct I/O.
+            path_sharding: Sharding strategy (e.g., "by_gpu").
+            dst_device: Device string for path selection.
+
+        Returns:
+            NixlDescPool: The created descriptor pool.
+
+        Raises:
+            ValueError: If backend is unsupported or path is invalid.
+
+        Note:
+            When *path* is provided as a list, entries containing commas will be
+            split when joined for PathSharder. Avoid commas in path entries to
+            prevent unintended sharding.
+        """
+
         if backend in ("GDS", "GDS_MT", "POSIX", "HF3FS"):
-            return NixlFilePool(size, path, use_direct_io)
+            if isinstance(path, list) and any("," in p for p in path):
+                logger.warning(
+                    "nixl_path entries contain commas; joining for PathSharder may "
+                    "cause unintended sharding. Consider paths without commas or a "
+                    "single comma-separated string."
+                )
+            sharder = PathSharder(
+                raw_csv=path if isinstance(path, str) else ",".join(path),
+                strategy=path_sharding,
+                dst_device=dst_device,
+                create_dirs=True,
+            )
+            return NixlFilePool(size, sharder, use_direct_io)
         elif backend in ("OBJ", "AZURE_BLOB", "DOCA_MEMOS"):
             return NixlObjectPool(size, b128=(backend == "DOCA_MEMOS"))
         else:
@@ -1309,7 +1360,18 @@ def __init__(
 
         self.async_mode = nixl_config.enable_async_put
         self.enable_presence_cache = nixl_config.enable_presence_cache
-        self.path = nixl_config.path
+        # The dynamic backend uses ``self.path`` directly as a single directory
+        # (see ``_build_descs``/``key_exists``). Path sharding across multiple
+        # paths is only supported for static pools via ``PathSharder``, so reject
+        # a list here rather than silently mishandling it later.
+        if isinstance(nixl_config.path, list):
+            raise ValueError(
+                "NixlDynamicStorageBackend (nixl_pool_size=0) does not support "
+                "multiple nixl_path entries; provide a single path string. "
+                "Path sharding across multiple paths is only available for "
+                "static pools."
+            )
+        self.path: str = nixl_config.path
         self.direct_io_flag = 0
         if nixl_config.use_direct_io:
             if hasattr(os, "O_DIRECT"):
diff --git a/tests/v1/data/nixl_multipath.yaml b/tests/v1/data/nixl_multipath.yaml
new file mode 100644
index 0000000000..01ec7a8b4f
--- /dev/null
+++ b/tests/v1/data/nixl_multipath.yaml
@@ -0,0 +1,14 @@
+local_cpu: false
+chunk_size: 256
+
+nixl_buffer_size: 1073741824
+nixl_buffer_device: cpu
+extra_config:
+  enable_nixl_storage: true
+  nixl_backend: POSIX
+  nixl_pool_size: 2
+  nixl_path: 
+    - /tmp/nixl/cache0
+    - /tmp/nixl/cache1
+    - /tmp/nixl/cache2
+  nixl_path_sharding: by_gpu
diff --git a/tests/v1/test_nixl_doca_memos.py b/tests/v1/test_nixl_doca_memos.py
index bc64966774..8fa0464e62 100644
--- a/tests/v1/test_nixl_doca_memos.py
+++ b/tests/v1/test_nixl_doca_memos.py
@@ -156,7 +156,12 @@ class TestCreatePool:
 
     def test_doca_memos_creates_b128_object_pool(self) -> None:
         pool = NixlStaticStorageBackend.createPool(
-            "DOCA_MEMOS", size=8, path="/tmp/unused", use_direct_io=False
+            "DOCA_MEMOS",
+            size=8,
+            path="/tmp/unused",
+            use_direct_io=False,
+            path_sharding="by_gpu",
+            dst_device="cpu",
         )
         assert isinstance(pool, NixlObjectPool)
         # b128 slot names are 32-char lowercase hex (no "obj_" prefix).
@@ -165,7 +170,12 @@ def test_doca_memos_creates_b128_object_pool(self) -> None:
 
     def test_obj_creates_non_b128_object_pool(self) -> None:
         pool = NixlStaticStorageBackend.createPool(
-            "OBJ", size=8, path="/tmp/unused", use_direct_io=False
+            "OBJ",
+            size=8,
+            path="/tmp/unused",
+            use_direct_io=False,
+            path_sharding="by_gpu",
+            dst_device="cpu",
         )
         assert isinstance(pool, NixlObjectPool)
         assert all(k.startswith("obj_") for k in pool.keys)
diff --git a/tests/v1/test_nixl_multipath.py b/tests/v1/test_nixl_multipath.py
new file mode 100644
index 0000000000..3781b5edfa
--- /dev/null
+++ b/tests/v1/test_nixl_multipath.py
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: Apache-2.0
+# Third Party
+import pytest
+
+pytest.importorskip("nixl", reason="nixl package is required for nixl tests")
+
+# First Party
+from lmcache.v1.storage_backend.path_sharder import PathSharder
+
+
+class TestNixlMultipath:
+    """Test cases for NIXL multipath functionality using PathSharder."""
+
+    def test_path_sharder_single_path(self):
+        """Test PathSharder with a single path string."""
+        path = "/tmp/nixl/cache"
+        path_sharding = "by_gpu"
+        dst_device = "cuda:0"
+
+        sharder = PathSharder(path, path_sharding, dst_device)
+
+        # Should return the same path since there's only one
+        assert sharder.selected == path
+        assert sharder.all_paths == [path]
+
+    def test_path_sharder_multiple_paths(self):
+        """Test PathSharder with a CSV string of paths."""
+        paths = "/tmp/nixl/cache0,/tmp/nixl/cache1,/tmp/nixl/cache2"
+        path_sharding = "by_gpu"
+
+        # Test with cuda:0 (device 0)
+        sharder = PathSharder(paths, path_sharding, "cuda:0")
+        assert sharder.selected == "/tmp/nixl/cache0"
+
+        # Test with cuda:1 (device 1)
+        sharder = PathSharder(paths, path_sharding, "cuda:1")
+        assert sharder.selected == "/tmp/nixl/cache1"
+
+        # Test with cuda:2 (device 2)
+        sharder = PathSharder(paths, path_sharding, "cuda:2")
+        assert sharder.selected == "/tmp/nixl/cache2"
+
+        # Test with cuda:3 (should wrap around to cache0)
+        sharder = PathSharder(paths, path_sharding, "cuda:3")
+        assert sharder.selected == "/tmp/nixl/cache0"
+
+    def test_path_sharder_empty_path(self):
+        """Test PathSharder with empty path."""
+        with pytest.raises(ValueError, match="At least one path must be provided"):
+            PathSharder("", "by_gpu", "cuda:0")
+
+    def test_path_sharder_unsupported_sharding(self):
+        """Test PathSharder with unsupported path sharding."""
+        path = "/tmp/nixl/cache"
+        with pytest.raises(ValueError, match="Unsupported path sharding"):
+            PathSharder(path, "unsupported_sharding", "cuda:0")
+
+    def test_path_sharder_cpu_device(self):
+        """Test PathSharder with CPU device."""
+        path = "/tmp/nixl/cache"
+        path_sharding = "by_gpu"
+        dst_device = "cpu"
+
+        sharder = PathSharder(path, path_sharding, dst_device)
+        assert sharder.selected == path
diff --git a/tests/v1/test_nixl_storage.py b/tests/v1/test_nixl_storage.py
index d2f887f315..0b63e43342 100644
--- a/tests/v1/test_nixl_storage.py
+++ b/tests/v1/test_nixl_storage.py
@@ -6,7 +6,6 @@
 import functools
 import os
 import shutil
-import sys
 import tempfile
 import threading
 import uuid
@@ -410,251 +409,22 @@ def test_nixl_posix_backend(nixl_tmp_path):
     run(config, shape, dtype)
 
 
-_DYNAMIC_KV_SHAPE = (4, 2, 256, 8, 128)
-
-
-def _build_dynamic_file_backend(config, dtype):
-    """
-    Build a NixlStorageBackend in dynamic-FILE mode and the surrounding
-    event-loop thread. Returns (backend, backends, thread_loop, thread, keys,
-    objs) so the caller can drive the test and tear everything down via
-    ``_teardown_dynamic_file_backend``.
-    """
-    BACKEND_NAME = "NixlStorageBackend"
-
-    keys = [
-        create_key("e3229141e680fb413d2c5d3ebb416c4ad300d381e309fc9e417757b91406c157"),
-        create_key("e3229141e680fb413d2c5d3ebb416c4ad300d381e309fc9e417757b91406d268"),
-    ]
-
-    thread_loop = asyncio.new_event_loop()
-    thread = threading.Thread(target=thread_loop.run_forever)
-    thread.start()
-
-    metadata = LMCacheMetadata(
-        model_name="Llama-3.1-70B-Instruct",
-        world_size=1,
-        local_world_size=1,
-        worker_id=0,
-        local_worker_id=0,
-        kv_dtype=dtype,
-        kv_shape=_DYNAMIC_KV_SHAPE,
-    )
-
-    backends = CreateStorageBackends(
-        config,
-        metadata,
-        thread_loop,
-        dst_device=config.nixl_buffer_device,
-    )
-    nixl_backend = backends[BACKEND_NAME]
-    assert isinstance(nixl_backend, NixlStorageBackend)
-
-    # In dynamic mode the backend internally allocates with meta_shape
-    # (derived from kv_shape via init_chunk_meta), so allocate the test
-    # objects with the same shape so put/get round-trip shapes match.
-    obj_shape = nixl_backend.meta_shape
-    obj_dtype = nixl_backend.meta_dtype
-    assert obj_shape is not None
-    assert obj_dtype is not None
-
-    obj_fmt = nixl_backend.meta_fmt
-    assert obj_fmt is not None
-
-    objs = []
-    for _ in keys:
-        obj = nixl_backend.memory_allocator.allocate(obj_shape, obj_dtype, obj_fmt)
-        assert obj is not None
-        assert obj.tensor is not None
-        objs.append(obj)
-
-    objs[0].tensor.zero_()
-    objs[1].tensor.zero_()
-    objs[0].tensor[0, 0, 100, 200] = 1
-    objs[1].tensor[1, 0, 50, 300] = 1
-
-    return nixl_backend, backends, thread_loop, thread, keys, objs
-
-
-def _teardown_dynamic_file_backend(backends, thread_loop, thread, objs=()):
-    for obj in objs:
-        if obj is None:
-            continue
-        if obj.is_valid() and obj.get_ref_count() > 0:
-            obj.ref_count_down()
-    for backend in backends.values():
-        backend.close()
-    if thread_loop and thread_loop.is_running():
-        thread_loop.call_soon_threadsafe(thread_loop.stop)
-    if thread and thread.is_alive():
-        thread.join()
-
-
-def run_dynamic_file(config, dtype, tmp_path):
-    """
-    Exercise the dynamic-FILE backend's new code paths: contains/key_exists,
-    put/get round-trip, and remove for both present and missing files.
-    """
-    nixl_backend, backends, thread_loop, thread, keys, objs = (
-        _build_dynamic_file_backend(config, dtype)
-    )
-
-    retained_objs = list(objs)
-
-    try:
-        for key in keys:
-            assert not nixl_backend.contains(key, False)
-            assert not nixl_backend.exists_in_put_tasks(key)
-
-        nixl_backend.batched_submit_put_task(keys, objs)
-
-        for key in keys:
-            assert nixl_backend.contains(key, False)
-
-        files_after_put = set(os.listdir(str(tmp_path)))
-        expected_files = {nixl_backend._format_object_key(k) for k in keys}
-        assert expected_files.issubset(files_after_put), (
-            f"missing files in {tmp_path}: {expected_files - files_after_put}"
-        )
-
-        for key, obj in zip(keys, objs, strict=False):
-            returned = nixl_backend.get_blocking(key)
-            assert returned is not None
-            retained_objs.append(returned)
-            assert returned.get_size() == obj.get_size()
-            assert returned.get_shape() == obj.get_shape()
-            assert returned.get_dtype() == obj.get_dtype()
-            assert torch.equal(returned.tensor, obj.tensor)
-
-        first_remove = nixl_backend.remove(keys[0])
-        assert first_remove is True
-        assert not os.path.exists(
-            os.path.join(str(tmp_path), nixl_backend._format_object_key(keys[0]))
-        )
-
-        # Removing an already-gone file must return False
-        # instead of raising FileNotFoundError.
-        second_remove = nixl_backend.remove(keys[0])
-        assert second_remove is False
-    finally:
-        _teardown_dynamic_file_backend(backends, thread_loop, thread, retained_objs)
-
-
 @pytest.mark.no_shared_allocator
-def test_nixl_posix_dynamic_file_backend(tmp_path):
+def test_nixl_posix_backend_multipath():
+    """Test NIXL backend with multipath support and path sharding."""
     BASE_DIR = Path(__file__).parent
-    config = LMCacheEngineConfig.from_file(BASE_DIR / "data/nixl.yaml")
-
-    dtype = torch.bfloat16
-
-    config.nixl_buffer_device = "cpu"
-    config.extra_config["nixl_backend"] = "POSIX"
-    config.extra_config["nixl_pool_size"] = 0  # dynamic mode
-    config.extra_config["nixl_path"] = str(tmp_path)
-    config.extra_config["enable_cuda"] = False
-
-    run_dynamic_file(config, dtype, tmp_path)
-
-
-def _count_open_fds() -> int:
-    return len(os.listdir("/proc/self/fd"))
-
-
-@pytest.mark.no_shared_allocator
-@pytest.mark.skipif(
-    not sys.platform.startswith("linux"),
-    reason="Requires /proc/self/fd to count open FDs",
-)
-def test_nixl_dynamic_file_fd_leak_on_setup_failure(tmp_path, monkeypatch):
-    """
-    If any operation between the per-key ``os.open`` loop and
-    ``release_storage_handler`` raises, the already-opened FDs must be
-    closed and the just-created files unlinked instead of leaked.
-    """
-    BASE_DIR = Path(__file__).parent
-    config = LMCacheEngineConfig.from_file(BASE_DIR / "data/nixl.yaml")
-
-    dtype = torch.bfloat16
-
-    config.nixl_buffer_device = "cpu"
-    config.extra_config["nixl_backend"] = "POSIX"
-    config.extra_config["nixl_pool_size"] = 0
-    config.extra_config["nixl_path"] = str(tmp_path)
-    config.extra_config["nixl_async_put"] = False
-    config.extra_config["enable_cuda"] = False
-
-    nixl_backend, backends, thread_loop, thread, keys, objs = (
-        _build_dynamic_file_backend(config, dtype)
-    )
-
-    try:
-        baseline = _count_open_fds()
-
-        def boom(*args, **kwargs):
-            raise RuntimeError("induced failure")
-
-        monkeypatch.setattr(nixl_backend.agent, "create_batched_storage_handler", boom)
-
-        # Sync mode: batched_submit_put_task calls future.result(), so the
-        # induced RuntimeError propagates here.
-        with pytest.raises(RuntimeError):
-            nixl_backend.batched_submit_put_task(keys, objs)
-
-        assert _count_open_fds() == baseline, "FDs leaked on transfer-setup failure"
-
-        # The put path opens the final key files with O_CREAT before
-        # registering the storage handler, so a failure here must clean
-        # up those just-created files.
-        for key in keys:
-            assert not os.path.exists(
-                os.path.join(str(tmp_path), nixl_backend._format_object_key(key))
-            ), "final key file leaked on transfer-setup failure"
-    finally:
-        _teardown_dynamic_file_backend(backends, thread_loop, thread, objs)
-
-
-@pytest.mark.no_shared_allocator
-def test_nixl_dynamic_file_no_leak_on_transfer_failure(tmp_path, monkeypatch):
-    """
-    When the NIXL transfer itself fails after the final key
-    files have been opened with ``O_CREAT``, the backend must remove
-    those empty / partially-written files.
-    """
-    BASE_DIR = Path(__file__).parent
-    config = LMCacheEngineConfig.from_file(BASE_DIR / "data/nixl.yaml")
+    config = LMCacheEngineConfig.from_file(BASE_DIR / "data/nixl_multipath.yaml")
 
     dtype = torch.bfloat16
+    shape = torch.Size([4, 2, 256, 8, 128])
 
     config.nixl_buffer_device = "cpu"
     config.extra_config["nixl_backend"] = "POSIX"
-    config.extra_config["nixl_pool_size"] = 0
-    config.extra_config["nixl_path"] = str(tmp_path)
-    config.extra_config["nixl_async_put"] = False
     config.extra_config["enable_cuda"] = False
 
-    nixl_backend, backends, thread_loop, thread, keys, objs = (
-        _build_dynamic_file_backend(config, dtype)
-    )
-
-    try:
-
-        def boom(*args, **kwargs):
-            raise RuntimeError("induced post_blocking failure")
+    # Test that multipath configuration is properly handled
+    assert isinstance(config.extra_config["nixl_path"], list)
+    assert len(config.extra_config["nixl_path"]) == 3
+    assert config.extra_config["nixl_path_sharding"] == "by_gpu"
 
-        monkeypatch.setattr(nixl_backend.agent, "post_blocking", boom)
-
-        with pytest.raises(RuntimeError):
-            nixl_backend.batched_submit_put_task(keys, objs)
-
-        for key in keys:
-            final_path = os.path.join(
-                str(tmp_path), nixl_backend._format_object_key(key)
-            )
-            assert not os.path.exists(final_path), (
-                f"final key file leaked on transfer failure: {final_path}"
-            )
-            assert not nixl_backend.contains(key, False), (
-                "contains() reports key present after failed write"
-            )
-    finally:
-        _teardown_dynamic_file_backend(backends, thread_loop, thread, objs)
+    run(config, shape, dtype)

From 14876f28f9102dd6e8d9496c9a6d56fb6ea5a44d Mon Sep 17 00:00:00 2001
From: Dhruva Kumar <dhruvadhruvakumar160@gmail.com>
Date: Thu, 11 Jun 2026 06:10:41 +0530
Subject: [PATCH 29/57] MAINT: Remove unnecessary global statement in
 cuda_extension (#3581)

MAINT: Remove unnecessary global statement and trailing whitespace

Signed-off-by: Dhruva Kumar <dhruvadhruvakumar160@gmail.com>
---
 setup.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index f0693a3dd1..9c59f2d9d9 100644
--- a/setup.py
+++ b/setup.py
@@ -58,7 +58,7 @@ def hipify_wrapper() -> None:
     # Third Party
     from torch.utils.hipify.hipify_python import hipify
 
-    print("Hipifying sources ")
+    print("Hipifying sources")
 
     # Get absolute path for all source files.
     extra_files = [
@@ -221,7 +221,6 @@ def cuda_extension() -> tuple[list, dict]:
     from torch.utils import cpp_extension  # Import here
 
     print("Building CUDA extensions")
-    global ENABLE_CXX11_ABI
     if ENABLE_CXX11_ABI:
         flag_cxx_abi = "-D_GLIBCXX_USE_CXX11_ABI=1"
     else:

From 5f699d3f21cc14782f129c5fdf46fe1bdfb27651 Mon Sep 17 00:00:00 2001
From: Yihua Cheng <yihua98@uchicago.edu>
Date: Wed, 10 Jun 2026 17:53:21 -0700
Subject: [PATCH 30/57] [MP][Core][HMA] Implement the interface for
 multi-object group and sliding window support (#3612)

Signed-off-by: ApostaC <yihua@tensormesh.ai>
---
 lmcache/v1/distributed/api.py                 |  83 ++-
 lmcache/v1/kv_layer_groups.py                 |  75 ++-
 lmcache/v1/multiprocess/engine_context.py     |  12 +-
 lmcache/v1/multiprocess/gpu_context.py        |   1 +
 lmcache/v1/multiprocess/modules/blend.py      |   8 +-
 lmcache/v1/multiprocess/modules/blend_v3.py   |   6 +-
 .../v1/multiprocess/modules/gpu_transfer.py   | 611 ++++++++++++------
 lmcache/v1/multiprocess/modules/lookup.py     |   6 +-
 .../multiprocess/modules/non_gpu_transfer.py  |  19 +-
 .../v1/distributed/test_fs_l2_adapter_keys.py |  37 +-
 .../test_batched_iteration_with_skip.py       | 112 ++++
 tests/v1/multiprocess/test_free_locks.py      |   2 +-
 .../test_non_cuda_data_transfer.py            |   2 +-
 tests/v1/multiprocess/test_unified_touch.py   |   6 +-
 14 files changed, 682 insertions(+), 298 deletions(-)
 create mode 100644 tests/v1/multiprocess/test_batched_iteration_with_skip.py

diff --git a/lmcache/v1/distributed/api.py b/lmcache/v1/distributed/api.py
index b09e187434..c0da7f30ef 100644
--- a/lmcache/v1/distributed/api.py
+++ b/lmcache/v1/distributed/api.py
@@ -209,10 +209,11 @@ class PrefetchHandle:
 def ipc_key_to_object_keys(
     ipc_key: IPCCacheEngineKey,
     chunk_hashes: list[bytes],
-    object_group_id: int = 0,
-) -> list[ObjectKey]:
+    object_group_ids: list[int],
+) -> list[list[ObjectKey]]:
     """
-    Convert a single IPCCacheEngineKey and its chunk hashes to a list of ObjectKey.
+    Convert a single IPCCacheEngineKey and its chunk hashes to per-object-group
+    lists of ObjectKey.
 
     When the ipc_key's worker_id is None, each chunk hash is exploded into
     multiple ObjectKeys (one per worker in world_size).
@@ -227,52 +228,50 @@ def ipc_key_to_object_keys(
         ipc_key: The IPC key providing model_name, world_size, worker_id,
             and cache_salt.
         chunk_hashes: List of chunk hash bytes, one per chunk.
-        object_group_id: Index of the object group the chunks belong to.
-            Defaults to 0, the single-group case.
+        object_group_ids: Object group ids to produce keys for.
 
     Returns:
-        list[ObjectKey]: The converted list of ObjectKey.
+        list[list[ObjectKey]]: The i-th element is the list of ObjectKeys
+        for ``object_group_ids[i]``.
     """
     cache_salt = ipc_key.cache_salt
-    storage_keys = []
-    for chunk_hash in chunk_hashes:
-        if ipc_key.worker_id is None:
-            # For look up request, we want to expand to all workers
-            for worker_id in range(ipc_key.world_size):
-                # TODO (ApostaC): include local world size/rank info
-                # in the future once it's in IPCCacheEngineKey
-                kv_rank = ObjectKey.ComputeKVRank(
-                    world_size=ipc_key.world_size,
-                    global_rank=worker_id,
-                    local_world_size=ipc_key.world_size,
-                    local_rank=worker_id,
-                )
-
-                storage_keys.append(
-                    ObjectKey(
-                        chunk_hash=chunk_hash,
-                        model_name=ipc_key.model_name,
-                        kv_rank=kv_rank,
-                        object_group_id=object_group_id,
-                        cache_salt=cache_salt,
-                    )
-                )
-        else:
-            kv_rank = ObjectKey.ComputeKVRank(
+
+    # The (chunk_hash, kv_rank) expansion is independent of the object group,
+    # so compute it once and reuse it for every group.
+    if ipc_key.worker_id is None:
+        # For look up request, we want to expand to all workers
+        # TODO (ApostaC): include local world size/rank info
+        # in the future once it's in IPCCacheEngineKey
+        kv_ranks = [
+            ObjectKey.ComputeKVRank(
+                world_size=ipc_key.world_size,
+                global_rank=worker_id,
+                local_world_size=ipc_key.world_size,
+                local_rank=worker_id,
+            )
+            for worker_id in range(ipc_key.world_size)
+        ]
+    else:
+        kv_ranks = [
+            ObjectKey.ComputeKVRank(
                 world_size=ipc_key.world_size,
                 global_rank=ipc_key.worker_id,
                 local_world_size=ipc_key.world_size,
                 local_rank=ipc_key.worker_id,
             )
-
-            storage_keys.append(
-                ObjectKey(
-                    chunk_hash=chunk_hash,
-                    model_name=ipc_key.model_name,
-                    kv_rank=kv_rank,
-                    object_group_id=object_group_id,
-                    cache_salt=cache_salt,
-                )
+        ]
+
+    return [
+        [
+            ObjectKey(
+                chunk_hash=chunk_hash,
+                model_name=ipc_key.model_name,
+                kv_rank=kv_rank,
+                object_group_id=object_group_id,
+                cache_salt=cache_salt,
             )
-
-    return storage_keys
+            for chunk_hash in chunk_hashes
+            for kv_rank in kv_ranks
+        ]
+        for object_group_id in object_group_ids
+    ]
diff --git a/lmcache/v1/kv_layer_groups.py b/lmcache/v1/kv_layer_groups.py
index 78ea6ba055..4b563a75de 100644
--- a/lmcache/v1/kv_layer_groups.py
+++ b/lmcache/v1/kv_layer_groups.py
@@ -291,27 +291,16 @@ def __init__(
                 :func:`normalize_kv_and_discover_format`.
             gpu_kv_format: Format returned by
                 :func:`normalize_kv_and_discover_format`.
-            num_blocks: Number of paged blocks. Stamped into every
-                ``shape_desc.nb``. Each group's ``shape_desc.bs`` is
-                discovered per-layer via :func:`get_block_size`, so
-                compressed and non-compressed groups can coexist.
+            num_blocks: Number of paged blocks in the device KV cache.
             layout_hints: Engine-provided hints. The manager only reads
                 ``inference_engine_logical_block_size`` (logical tokens
                 per inference-engine block) from it to derive each
                 group's ``compress_ratio`` and ``physical_chunk_size``.
                 ``None`` means every group is treated as non-compressed
                 (``compress_ratio == 1``).
-            engine_group_infos: LMCache-owned engine KV cache group
-                metadata. When present, it is used to keep layers from
-                different engine block-ID spaces in separate LMCache
-                transfer groups.
-            lmcache_logical_chunk_size: Logical tokens per LMCache chunk
-                (one logical token = one inference-engine token).
-                Together with ``compress_ratio`` it determines each
-                group's ``physical_chunk_size =
-                lmcache_logical_chunk_size // compress_ratio``, the
-                number of *physical* slots per chunk fed to the
-                block-level transfer kernel.
+            engine_group_infos: Engine KV cache group metadata, including
+                the engine group ids, and the sliding window information.
+            lmcache_logical_chunk_size: Tokens per LMCache chunk
         """
         # Import here to break a circular import via
         # lmcache.v1.gpu_connector.__init__ → metadata → kv_layer_groups.
@@ -406,6 +395,8 @@ def __init__(
             or self._kernel_groups[0].shape_desc.bs
         )
 
+        self._lmcache_chunk_size = lmcache_logical_chunk_size
+
         logger.info(
             "KV layer groups: ---\n%s\n---",
             "\n".join(repr(g) for g in self._kernel_groups),
@@ -453,6 +444,7 @@ def num_groups(self) -> int:
         return len(self._kernel_groups)
 
     @property
+    @lmcache_deprecate("This function will be removed soon")
     def inference_engine_logical_block_size(self) -> int:
         """Inference-engine-side logical block size.
 
@@ -469,8 +461,6 @@ def inference_engine_logical_block_size(self) -> int:
     def get_shape_desc(self, kernel_group_idx: int) -> "lmc_ops.PageBufferShapeDesc":
         """Return the :class:`PageBufferShapeDesc` for *kernel_group_idx*.
 
-        Equivalent to ``self._kernel_groups[kernel_group_idx].shape_desc``.
-
         Args:
             kernel_group_idx: 0-based kernel group index.
 
@@ -479,17 +469,10 @@ def get_shape_desc(self, kernel_group_idx: int) -> "lmc_ops.PageBufferShapeDesc"
         """
         return self._kernel_groups[kernel_group_idx].shape_desc
 
+    @lmcache_deprecate("This function will be renamed to get_num_slots_per_chunk")
     def get_physical_chunk_size(self, kernel_group_idx: int) -> int:
         """Return the per-chunk *physical* slot count for *kernel_group_idx*.
 
-        Equivalent to
-        ``self._kernel_groups[kernel_group_idx].physical_chunk_size``.
-        For non-compressed groups this equals
-        ``lmcache_logical_chunk_size``; for compressed groups it equals
-        ``lmcache_logical_chunk_size // compress_ratio`` and is what the
-        block-level transfer kernel must be told (the logical chunk size
-        in *vLLM tokens* is not what the kernel addresses).
-
         Args:
             kernel_group_idx: 0-based kernel group index.
 
@@ -498,6 +481,48 @@ def get_physical_chunk_size(self, kernel_group_idx: int) -> int:
         """
         return self._kernel_groups[kernel_group_idx].physical_chunk_size
 
+    def get_subchunk_sw_size_tokens(self, kernel_group_idx: int) -> int:
+        """Return the sub-chunk sliding window size of a given kernel group.
+        The size is measured in the number of tokens.
+
+        This is for the models like DSV4 where the sliding window size is
+        smaller than the tokens in a single lmcache chunk.
+
+        Args:
+            kernel_group_idx: 0-based kernel group index.
+
+        Returns:
+            The sub-chunk sliding window size. Will be the same as the
+            chunk size for non-slding-window models or big-sliding-
+            window models.
+        """
+        # TODO(ApostaC): now here's the 'dummy' implementation.
+        # Need to wire the real sw size from the kernel group info once it's available
+        return self._lmcache_chunk_size
+
+    def get_sw_size_chunks(self, object_group_idx: int) -> int:
+        """Return the sliding window size of a given kernel group,
+        The size is measured in lmcache chunks.
+
+        If the kernel group is non-sliding window, return -1
+
+        Args:
+            object_group_idx: 0-based kernel group index.
+
+        Returns:
+            The sliding window size rounded up to chunks for sliding
+            window models. -1 otherwise.
+
+        Note:
+            It uses object_group_idx, because the kernel groups in the same
+            object group must share the same "big-sliding-window" size -- so that
+            they can be retrieved at the same time from the same object.
+            For small sliding window (subchunk window) models, it will return 1.
+        """
+        # TODO(ApostaC): now here's the 'dummy' implementation.
+        # Need to wire the real sw size from the object group info once it's available
+        return -1
+
     def calculate_num_blocks(self, kernel_group_idx: int, num_tokens: int) -> int:
         """Calculate the number of blocks for a given number of tokens in a
         specified kernel group.
diff --git a/lmcache/v1/multiprocess/engine_context.py b/lmcache/v1/multiprocess/engine_context.py
index 14e97e0e4d..c7055e5156 100644
--- a/lmcache/v1/multiprocess/engine_context.py
+++ b/lmcache/v1/multiprocess/engine_context.py
@@ -199,17 +199,21 @@ def layout_desc_registry(self) -> LayoutDescRegistry:
         """Registry mapping (model_name, world_size) to MemoryLayoutDesc."""
         return self._layout_desc_registry
 
-    def resolve_obj_keys(self, key: IPCCacheEngineKey) -> list[ObjectKey]:
-        """Resolve object keys from an IPC cache key.
+    def resolve_obj_keys(
+        self, key: IPCCacheEngineKey, object_group_ids: list[int]
+    ) -> list[list[ObjectKey]]:
+        """Resolve per-object-group object keys from an IPC cache key.
 
         Uses the session manager to track token state and the token hasher
         to compute chunk hashes for the requested range.
 
         Args:
             key: IPC cache key describing model/session/token range.
+            object_group_ids: Object group ids to produce keys for.
 
         Returns:
-            Resolved object keys for the requested token range.
+            The i-th element is the list of ObjectKeys for
+            ``object_group_ids[i]``.
 
         Raises:
             ValueError: If ``key.worker_id`` is ``None``.
@@ -221,7 +225,7 @@ def resolve_obj_keys(self, key: IPCCacheEngineKey) -> list[ObjectKey]:
         ]
         if key.worker_id is None:
             raise ValueError("Must resolve keys with worker_id != None")
-        return ipc_key_to_object_keys(key, chunk_hashes)
+        return ipc_key_to_object_keys(key, chunk_hashes, object_group_ids)
 
     @staticmethod
     def _compute_shm_pool_info(
diff --git a/lmcache/v1/multiprocess/gpu_context.py b/lmcache/v1/multiprocess/gpu_context.py
index dbf074ceb0..ed5f81bdf2 100644
--- a/lmcache/v1/multiprocess/gpu_context.py
+++ b/lmcache/v1/multiprocess/gpu_context.py
@@ -486,6 +486,7 @@ def get_shape_desc(self, group_idx: int) -> "lmc_ops.PageBufferShapeDesc":
         """Returns the PageBufferShapeDesc for the given KV layer group."""
         return self.kv_layer_groups_manager_.get_shape_desc(group_idx)
 
+    @lmcache_deprecate("this function will be renamed to get_num_slots_per_chunk")
     def get_physical_chunk_size(self, group_idx: int) -> int:
         """Returns the per-chunk physical slot count for the given group.
 
diff --git a/lmcache/v1/multiprocess/modules/blend.py b/lmcache/v1/multiprocess/modules/blend.py
index e9b7eaf38c..93fb3d7be6 100644
--- a/lmcache/v1/multiprocess/modules/blend.py
+++ b/lmcache/v1/multiprocess/modules/blend.py
@@ -597,7 +597,7 @@ def cb_lookup_pre_computed(self, key: IPCCacheEngineKey) -> list[CBMatchResult]:
         # time, so ipc_key_to_object_keys resolves correctly.
         for group in groups:
             chunk_hashes = [r.hash for r in group]
-            obj_keys = ipc_key_to_object_keys(key, chunk_hashes)
+            obj_keys = ipc_key_to_object_keys(key, chunk_hashes, [0])[0]
             handle = self._ctx.storage_manager.submit_prefetch_task(
                 obj_keys,
                 layout_desc,
@@ -827,7 +827,7 @@ def cb_store_pre_computed(
         # the CB lookup path and via the standard lookup/retrieve path.
         chunk_hashes = self._ctx.token_hasher.compute_chunk_hashes(list(key.token_ids))
         # convert to object key
-        obj_keys = ipc_key_to_object_keys(key, chunk_hashes)
+        obj_keys = ipc_key_to_object_keys(key, chunk_hashes, [0])[0]
 
         reserved_dict: dict = {}
         try:
@@ -937,7 +937,7 @@ def cb_retrieve_pre_computed(
         cb_match_result = sorted(cb_match_result, key=lambda r: r.cur_st)
         num_chunks = len(cb_match_result)
         chunk_hashes = [r.hash for r in cb_match_result]
-        all_obj_keys = ipc_key_to_object_keys(key, chunk_hashes)
+        all_obj_keys = ipc_key_to_object_keys(key, chunk_hashes, [0])[0]
 
         # CPU-synchronous sentinel: GPU retrieve is about to be enqueued.
         self._ctx.event_bus.publish(
@@ -1110,7 +1110,7 @@ def cb_store_final(
         chunk_hashes = self._ctx.token_hasher.compute_chunk_hashes(list(key.token_ids))
 
         # convert to object key
-        obj_keys = ipc_key_to_object_keys(key, chunk_hashes)
+        obj_keys = ipc_key_to_object_keys(key, chunk_hashes, [0])[0]
 
         reserved_dict: dict = {}
         try:
diff --git a/lmcache/v1/multiprocess/modules/blend_v3.py b/lmcache/v1/multiprocess/modules/blend_v3.py
index 7f0dba6889..1409aa6a6d 100644
--- a/lmcache/v1/multiprocess/modules/blend_v3.py
+++ b/lmcache/v1/multiprocess/modules/blend_v3.py
@@ -539,7 +539,7 @@ def _sparse_prefetch_submit(
         world_size = key.world_size
         per_hash_obj_keys: dict[bytes, list] = {}
         all_hashes = [r.hash for r in matches]
-        all_obj_keys = ipc_key_to_object_keys(key, all_hashes)
+        all_obj_keys = ipc_key_to_object_keys(key, all_hashes, [0])[0]
         for i, h in enumerate(all_hashes):
             per_hash_obj_keys[h] = all_obj_keys[i * world_size : (i + 1) * world_size]
 
@@ -906,8 +906,8 @@ def cb_retrieve_pre_computed(
                 all_obj_keys = [k for r in cb_match_result for k in cached[r.hash]]
         else:
             all_obj_keys = ipc_key_to_object_keys(
-                key, [r.hash for r in cb_match_result]
-            )
+                key, [r.hash for r in cb_match_result], [0]
+            )[0]
 
         # Lookup read-locked the full found set, but the connector may have
         # dropped some matches (parent-covered / misaligned) before retrieve,
diff --git a/lmcache/v1/multiprocess/modules/gpu_transfer.py b/lmcache/v1/multiprocess/modules/gpu_transfer.py
index 6c43a28eb1..27fe390de9 100644
--- a/lmcache/v1/multiprocess/modules/gpu_transfer.py
+++ b/lmcache/v1/multiprocess/modules/gpu_transfer.py
@@ -4,9 +4,12 @@
 # Standard
 from dataclasses import dataclass
 from itertools import islice
-from typing import Generator
+from typing import Generator, Sequence
 import time
 
+# Third Party
+import torch
+
 # First Party
 from lmcache import torch_dev, torch_device_type
 from lmcache.logging import init_logger
@@ -51,7 +54,7 @@
 def get_layout_desc(
     gpu_context: GPUCacheContext,
     num_tokens: int,
-    object_group_id: int = 0,
+    object_group_id: int,
 ) -> MemoryLayoutDesc:
     """Get the memory layout description for a specific object group.
 
@@ -64,8 +67,6 @@ def get_layout_desc(
         cache_context: The GPU cache context containing the KV cache information.
         num_tokens: The number of tokens to determine the layout for.
         object_group_id: Index of the object group whose layout to build.
-            Defaults to 0; under the current single-object-group assumption this
-            covers every kernel group.
 
     Returns:
         MemoryLayoutDesc: The memory layout description containing shapes and
@@ -80,24 +81,303 @@ def get_layout_desc(
     return MemoryLayoutDesc(shapes=list(shapes), dtypes=list(dtypes))
 
 
-def batched_iteration(lst: list, batch_size: int) -> Generator[tuple, None, None]:
-    """Utility function to iterate over a list in batches.
+def batched_iteration_with_skip(
+    lst: Sequence,
+    batch_size: int,
+    skip_count: int,
+) -> Generator[tuple[int, tuple], None, None]:
+    """Utility function to iterate over a list in batches with an initial skip.
 
     Args:
         lst: The list to iterate over.
         batch_size: The size of each batch.
+        skip_count: The number of items to skip at the start of the list.
 
     Yields:
-        Batches of the list as tuples.
+        Tuples of (batch_start_idx, batch) where batch is a tuple of items
+        from the list, and batch_start_idx is the "original" index of the first
+        item in the batch.
 
     Raises:
-        ValueError: If batch_size is less than 1.
+        ValueError: If batch_size is less than 1 or skip_count is negative.
+
+    Note:
+        Batch_idx is the index of the batch in the original list, accounting
+        for the skipped items. For example, if skip_count is 10 and batch_size
+        is 5, the first yielded batch will have batch_start_idx=10.
     """
     if batch_size < 1:
         raise ValueError("batch size must be at least one")
+    if skip_count < 0:
+        raise ValueError("skip_count must be non-negative")
+
     it = iter(lst)
+    # Skip the initial items
+    for _ in range(skip_count):
+        next(it, None)
+    batch_start_idx = skip_count
     while batch := tuple(islice(it, batch_size)):
-        yield batch
+        yield batch_start_idx, batch
+        batch_start_idx += len(batch)
+
+
+def downsample_and_stage_block_ids(
+    cache_context: GPUCacheContext,
+    block_ids: list[list[int]],
+) -> list[torch.Tensor]:
+    """Cut the block id lists to skip the unneeded blocks in a chunk and
+    stage it into GPU tensors for later use.
+
+    This mainly targets the case where a portion of the blocks are not
+    needed for every chunk, such as deepseek v4's swa cache.
+
+    Note that the we do NOT do any object-level skipping here.
+
+    Args:
+        cache_context: The GPU cache context containing the KV cache information.
+        block_ids: The original block id lists, indexed by LMCache KV group index.
+
+    Returns:
+        The cut block id lists, indexed by LMCache KV group index.
+
+    Note:
+        This function has some coupled logic with transfer_kv_per_object_group below.
+        The caller need to make sure that the block ids seen by
+        transfer_kv_per_object_group are produced by this function.
+
+    Example:
+        If a model have 2 kernel groups, one is full attention with block size 32,
+        one is swa attention with block size 32 and sliding window size 64, and
+        LMCache has a chunk size of 128. And there are 2 chunks in total (256 tokens).
+
+        The input will be:
+        [
+          [1, 2, 3, 4, 5, 6, 7, 8],  # block ids for the full attention group
+          [11, 12, 13, 14, 15, 16, 17, 18], # block ids for the swa attention group
+        ]
+
+        The output will be
+        [
+          [1, 2, 3, 4, 5, 6, 7, 8],  # full attention group still needs all block ids
+          [13, 14, 17, 18], # swa attention group only needs the last 2 block per chunk
+        ]
+    """
+    num_kernel_groups = cache_context.kv_layer_groups_manager.num_kernel_groups
+    for kernel_group_id in range(num_kernel_groups):
+        subchunk_sw_size_tokens = (
+            cache_context.kv_layer_groups_manager.get_subchunk_sw_size_tokens(
+                kernel_group_id
+            )
+        )
+        tokens_per_chunk = min(
+            cache_context.lmcache_logical_chunk_size, subchunk_sw_size_tokens
+        )
+        keep_blocks_per_chunk = cache_context.calculate_num_blocks(
+            tokens_per_chunk, kernel_group_id
+        )
+        total_blocks_per_chunk = cache_context.calculate_num_blocks(
+            cache_context.lmcache_logical_chunk_size, kernel_group_id
+        )
+
+        new_block_ids = []
+        old_block_ids = block_ids[kernel_group_id]
+        assert len(old_block_ids) % total_blocks_per_chunk == 0, (
+            f"len(block_ids[{kernel_group_id}]) should be a multiple "
+            f"of total_blocks_per_chunk ({total_blocks_per_chunk}), but got "
+            f"{len(old_block_ids)}"
+        )
+
+        for i in range(0, len(old_block_ids), total_blocks_per_chunk):
+            chunk_block_ids = old_block_ids[i : i + total_blocks_per_chunk]
+            new_block_ids.extend(chunk_block_ids[-keep_blocks_per_chunk:])
+
+        block_ids[kernel_group_id] = new_block_ids
+
+    # Stage the cut block ids into GPU tensors
+    block_ids_gpu = cache_context.copy_view_block_ids_to_gpu(block_ids)
+    return block_ids_gpu
+
+
+def _recalculate_blocks_to_skip(
+    blocks_per_chunk: int,
+    blocks_per_window: int,
+    blocks_to_skip: int,
+) -> int:
+    """Re-calculate the number of blocks to skip for a batch of chunks based
+    on the blocks per chunk and blocks per sliding window WHEN the window
+    size is smaller than the lmcache chunk size.
+
+    Args:
+        blocks_per_chunk: The total number of blocks in one chunk for the
+            current group.
+        blocks_per_window: The number of blocks in the sliding window
+            for the current group. Should be less than or equal to
+            blocks_per_chunk.
+        blocks_to_skip: The number of blocks to skip.
+
+    Returns:
+        The re-calculated number of blocks to skip for the current batch of
+        chunks.
+    """
+    if blocks_per_chunk == blocks_per_window:
+        return blocks_to_skip
+
+    full_windows_to_skip = blocks_to_skip // blocks_per_chunk
+    tail_blocks = blocks_to_skip % blocks_per_chunk
+    tail_blocks_to_skip = tail_blocks - (blocks_per_chunk - blocks_per_window)
+    return full_windows_to_skip * blocks_per_window + max(0, tail_blocks_to_skip)
+
+
+def transfer_kv_per_object_group(
+    cache_context: GPUCacheContext,
+    block_ids_gpu: list[torch.Tensor],
+    memory_objs: Sequence[MemoryObj | None],
+    object_group_id: int,
+    batch_size: int,
+    skip_first_n_tokens: int,
+    direction: "lmc_ops.TransferDirection",
+) -> None:
+    """Helper function to transfer memory objects of a single object group
+    to/from GPU, with batching support.
+
+    Args:
+        cache_context: The GPU cache context containing the KV cache information.
+        block_ids_gpu: GPU block IDs to retrieve into, indexed by LMCache KV group
+            index. It should satisfy `len(block_ids_gpu[i]) == len(memory_objs) *
+            blocks_per_chunk[i]` for each group `i`.
+            Note that the block IDs list are already on GPU.
+        memory_objs: The list of MemoryObj instances to copy from. It could be
+            None when allocation or retrieval fails. For store (D2H), it should
+            ignore the None entry and continue copying the rest. For retrieve
+            (H2D), it should raise the error and stop copying.
+        object_group_id: Index of the object group being copied.
+        batch_size: The number of memory objects to perform batched copy
+        skip_first_n_tokens: Number of tokens to skip writing at the start of
+            the retrieve range. This avoids overwriting APC-shared GPU blocks that
+            may be read concurrently by other requests.
+        direction: The transfer direction, H2D (retrieve) or D2H (store).
+
+    Raises:
+        ValueError: If it founds None entry in memory_objs when direction is H2D.
+    Note:
+        This function expects the caller to stage the block ids (list[list[int]])
+        into GPU tensors and pass them in as `block_ids_gpu`.
+    """
+    lmcache_chunk_size = cache_context.lmcache_logical_chunk_size
+    kv_groups_manager = cache_context.kv_layer_groups_manager
+    object_group = kv_groups_manager.object_groups[object_group_id]
+    kernel_group_ids = object_group.kernel_group_indices
+    is_h2d = direction == lmc_ops.TransferDirection.H2D
+
+    sw_size_chunks = kv_groups_manager.get_sw_size_chunks(object_group_id)
+    num_objects_to_skip = 0
+    if sw_size_chunks >= 1 and is_h2d:
+        num_objects_to_skip = max(0, len(memory_objs) - sw_size_chunks)
+        logger.debug(
+            "Detected sliding window for object group %d: "
+            "skipping the first %d objects in the batch",
+            object_group_id,
+            num_objects_to_skip,
+        )
+
+    for start_object_idx, memory_object_batch in batched_iteration_with_skip(
+        memory_objs, batch_size, skip_count=num_objects_to_skip
+    ):
+        if any(mo is None for mo in memory_object_batch):
+            if is_h2d:
+                raise ValueError(
+                    "MemoryObj is None for some objects in the batch, cannot "
+                    "perform H2D copy. memory_object_batch: "
+                    f"{memory_object_batch}"
+                )
+            else:
+                continue
+
+        batch_len = len(memory_object_batch)
+        batch_start_token = start_object_idx * lmcache_chunk_size
+        batch_end_token = batch_start_token + batch_len * lmcache_chunk_size
+
+        effective_start = max(batch_start_token, skip_first_n_tokens)
+        if effective_start >= batch_end_token:
+            continue
+
+        skip_tokens_in_chunk = effective_start - batch_start_token
+
+        # For H2D, copy from CPU to GPU tmp buffers before the kernel launch
+        if is_h2d:
+            for chunk_idx, memory_obj in enumerate(memory_object_batch):
+                lmcache_memcpy_async_h2d(
+                    memory_obj,
+                    cache_context.get_temp_object_group_buffer(
+                        chunk_idx, object_group_id
+                    ),
+                )
+
+        # Do paged KV copy
+        for kernel_group_id in kernel_group_ids:
+            blocks_per_chunk = cache_context.calculate_num_blocks(
+                lmcache_chunk_size, kernel_group_id
+            )
+            tokens_per_window = min(
+                lmcache_chunk_size,
+                kv_groups_manager.get_subchunk_sw_size_tokens(kernel_group_id),
+            )
+            blocks_per_window = cache_context.calculate_num_blocks(
+                tokens_per_window, kernel_group_id
+            )
+
+            # Get the block ids for this chunk
+            start_block_pos = start_object_idx * blocks_per_window
+            end_block_pos = (start_object_idx + batch_len) * blocks_per_window
+
+            block_ids_curr_batch = block_ids_gpu[kernel_group_id][
+                start_block_pos:end_block_pos
+            ]
+
+            # Re-calculate the skip blocks for this kernel group
+            orig_skip_blocks = cache_context.calculate_num_blocks(
+                skip_tokens_in_chunk, kernel_group_id
+            )
+            recalculated_skip_blocks = _recalculate_blocks_to_skip(
+                blocks_per_chunk,
+                blocks_per_window,
+                orig_skip_blocks,
+            )
+
+            # Launch kernel
+            group_kv_pointers = cache_context.get_kernel_group_kv_pointers(
+                kernel_group_id
+            )
+            group_lmcache_chunk_size = cache_context.get_physical_chunk_size(
+                kernel_group_id
+            )
+            tmp_gpu_buffers_batched = [
+                cache_context.get_temp_kernel_group_buffer(
+                    i, kernel_group_id
+                ).data_ptr()
+                for i in range(batch_len)
+            ]
+            lmc_ops.multi_layer_block_kv_transfer(
+                group_kv_pointers,
+                tmp_gpu_buffers_batched,
+                block_ids_curr_batch,
+                cache_context.device,
+                direction,
+                cache_context.get_shape_desc(kernel_group_id),
+                group_lmcache_chunk_size,
+                cache_context.gpu_kv_format_,
+                recalculated_skip_blocks,
+            )
+
+        # For D2H, copy from GPU tmp buffers to CPU after the kernel launch
+        if not is_h2d:
+            for chunk_idx, memory_obj in enumerate(memory_object_batch):
+                lmcache_memcpy_async_d2h(
+                    cache_context.get_temp_object_group_buffer(
+                        chunk_idx, object_group_id
+                    ),
+                    memory_obj,
+                )
 
 
 @dataclass
@@ -342,7 +622,6 @@ def store(
             store completed without such a failure.
         """
         st = time.perf_counter()
-        obj_keys = self._ctx.resolve_obj_keys(key)
 
         entry = self._cache_contexts.get(instance_id)
         if entry is None:
@@ -350,15 +629,20 @@ def store(
         cache_context = entry.cache_context
         model_name = entry.model_name
 
-        # TODO(refactor): only single-object-group transfers are wired up so far.
-        assert cache_context.kv_layer_groups_manager.num_object_groups == 1
+        num_object_groups = cache_context.kv_layer_groups_manager.num_object_groups
+        obj_keys_per_obj_group = self._ctx.resolve_obj_keys(
+            key, list(range(num_object_groups))
+        )
+        num_chunks = len(obj_keys_per_obj_group[0])
 
         # NOTE: different engine groups may have different block sizes, so
         # ``blocks_per_chunk[i]`` is the number of blocks in one chunk for
         # group ``i``.
         blocks_per_chunk = [
             cache_context.calculate_num_blocks(self._ctx.chunk_size, group_idx)
-            for group_idx in range(cache_context.kv_layer_groups_manager.num_groups)
+            for group_idx in range(
+                cache_context.kv_layer_groups_manager.num_kernel_groups
+            )
         ]
 
         with (
@@ -368,33 +652,34 @@ def store(
             check_interprocess_event_support()
             event = torch_dev.Event(interprocess=True)
 
-            block_ids_per_group_gpu = cache_context.copy_view_block_ids_to_gpu(
-                gpu_block_ids
-            )
-
             # Fail closed: every LMCache group must have block IDs covering all
             # chunks. A short list (e.g. a caller/protocol bug) would otherwise
             # drive the transfer kernel to read out-of-bounds GPU memory, so skip
             # the whole store and commit nothing rather than caching a partial or
             # garbage entry. A later request can store it once the block IDs are
-            # complete.
+            # complete. Checked on the raw block ids, before cutting drops the
+            # per-chunk blocks that sliding-window groups do not need.
             if any(
-                group_block_ids.shape[0] < len(obj_keys) * bpc
+                len(group_block_ids) < num_chunks * bpc
                 for group_block_ids, bpc in zip(
-                    block_ids_per_group_gpu, blocks_per_chunk, strict=True
+                    gpu_block_ids, blocks_per_chunk, strict=True
                 )
             ):
                 logger.warning(
                     "STORE block ID underflow for request_id=%s: each group needs "
-                    "len(obj_keys) * blocks_per_chunk block IDs for %d chunks "
+                    "num_chunks * blocks_per_chunk block IDs for %d chunks "
                     "(per-group blocks_per_chunk=%s); skipping the store.",
                     key.request_id,
-                    len(obj_keys),
+                    num_chunks,
                     blocks_per_chunk,
                 )
                 event.record()
                 return event.ipc_handle(), False
 
+            block_ids_per_group_gpu = downsample_and_stage_block_ids(
+                cache_context, gpu_block_ids
+            )
+
             if not hasattr(torch_dev.Event, "from_ipc_handle"):
                 raise RuntimeError(
                     f"Backend '{torch_device_type}' does not support IPC event "
@@ -431,62 +716,43 @@ def store(
             )
 
             reserved_dict: dict[ObjectKey, MemoryObj] = {}
+            all_dict: dict[ObjectKey, MemoryObj] = {}
+            total_bytes: int = 0
             store_succeeded = False
             try:
-                layout_desc = get_layout_desc(
-                    cache_context, self._ctx.chunk_size, object_group_id=0
-                )
-                reserved_dict = self._ctx.storage_manager.reserve_write(
-                    obj_keys, layout_desc, "new"
-                )
+                for obj_group_id in range(num_object_groups):
+                    obj_keys = obj_keys_per_obj_group[obj_group_id]
+                    layout_desc = get_layout_desc(
+                        cache_context,
+                        self._ctx.chunk_size,
+                        object_group_id=obj_group_id,
+                    )
+                    reserved_dict = self._ctx.storage_manager.reserve_write(
+                        obj_keys, layout_desc, "new"
+                    )
+                    all_dict.update(reserved_dict)
+                    if reserved_dict:
+                        total_bytes += next(
+                            iter(reserved_dict.values())
+                        ).get_size() * len(reserved_dict)
+
+                    # Keys not in reserved_dict (skipped by the storage manager)
+                    # become None entries; the helper skips them for D2H.
+                    memory_objs: list[MemoryObj | None] = [
+                        reserved_dict.get(obj_key) for obj_key in obj_keys
+                    ]
 
-                # NOTE: Store is not batched because some obj_keys may be
-                # skipped (not in reserved_dict), making block_ids
-                # non-contiguous. Batching would require torch.cat to
-                # reassemble block_ids, negating the benefit.
-                num_groups = cache_context.kv_layer_groups_manager.num_groups
-                for idx, obj_key in enumerate(obj_keys):
-                    if obj_key in reserved_dict:
-                        memory_obj = reserved_dict[obj_key]
-                    else:
-                        continue
-
-                    # Copy from GPU paged buffer to tmp buffer, then to CPU — per
-                    # group. Each group uses its own block-id list (HMA).
-                    for group_idx in range(num_groups):
-                        bpc = blocks_per_chunk[group_idx]
-                        chunk_block_ids_gpu = block_ids_per_group_gpu[group_idx][
-                            idx * bpc : (idx + 1) * bpc
-                        ]
-                        # Store is not batched, so we always use batch_idx=0.
-                        tmp_buffer = cache_context.get_temp_kernel_group_buffer(
-                            0, group_idx
-                        )
-                        group_kv_pointers = cache_context.get_kernel_group_kv_pointers(
-                            group_idx
-                        )
-                        # Kernel contract: ``group_lmcache_chunk_size`` here is the
-                        # number of *physical* slots per chunk for this group
-                        # (= logical chunk_size // compress_ratio).
-                        group_lmcache_chunk_size = (
-                            cache_context.get_physical_chunk_size(group_idx)
-                        )
-                        lmc_ops.multi_layer_block_kv_transfer(
-                            group_kv_pointers,
-                            [tmp_buffer.data_ptr()],
-                            chunk_block_ids_gpu,
-                            cache_context.device,
-                            lmc_ops.TransferDirection.D2H,
-                            cache_context.get_shape_desc(group_idx),
-                            group_lmcache_chunk_size,
-                            cache_context.gpu_kv_format_,
-                            0,
-                        )
-                    # Store is not batched, so we always use batch_idx=0 (single
-                    # slot). Single object group => object_group_idx=0.
-                    lmcache_memcpy_async_d2h(
-                        cache_context.get_temp_object_group_buffer(0, 0), memory_obj
+                    # NOTE: batch_size must stay 1 for store.
+                    transfer_kv_per_object_group(
+                        cache_context,
+                        block_ids_per_group_gpu,
+                        memory_objs,
+                        object_group_id=obj_group_id,
+                        batch_size=1,
+                        skip_first_n_tokens=0,
+                        direction=lmc_ops.TransferDirection.D2H,
                     )
+
                 store_succeeded = True
             except Exception:
                 logger.exception("Cannot store keys due to exception")
@@ -495,20 +761,15 @@ def store(
                 event.record()
                 # Fail closed: commit the reserved objects only when every chunk
                 # copied successfully; otherwise the whole store is skipped.
-                stored_count = len(reserved_dict) if store_succeeded else 0
+                stored_count = len(all_dict) if store_succeeded else 0
                 if stored_count:
                     submit_callback_to_stream(
                         cache_context.cupy_stream,
                         "finish_write",
-                        list(reserved_dict.keys()),
+                        list(all_dict.keys()),
                     )
-                # All reserved MemoryObjs share one layout_desc, so per-object
-                # size is identical — avoid summing N identical values.
-                total_bytes = (
-                    next(iter(reserved_dict.values())).get_size() * stored_count
-                    if stored_count
-                    else 0
-                )
+                else:
+                    total_bytes = 0
                 self._ctx.event_bus.publish_on_stream(
                     cache_context.cupy_stream,
                     Event(
@@ -525,10 +786,10 @@ def store(
                 )
 
         ed = time.perf_counter()
-        if length := len(reserved_dict):
+        if stored_count:
             logger.info(
                 "Stored %d tokens in %.3f seconds",
-                length * self._ctx.chunk_size,
+                num_chunks * self._ctx.chunk_size,
                 ed - st,
             )
         return event.ipc_handle(), True
@@ -565,7 +826,6 @@ def retrieve(
             ValueError: If no GPU context is registered for the given instance ID.
         """
         st = time.perf_counter()
-        obj_keys = self._ctx.resolve_obj_keys(key)
 
         entry = self._cache_contexts.get(instance_id)
         if entry is None:
@@ -573,8 +833,11 @@ def retrieve(
         cache_context = entry.cache_context
         model_name = entry.model_name
 
-        # TODO(refactor): only single-object-group transfers are wired up so far.
-        assert cache_context.kv_layer_groups_manager.num_object_groups == 1
+        num_object_groups = cache_context.kv_layer_groups_manager.num_object_groups
+        obj_keys_per_obj_group = self._ctx.resolve_obj_keys(
+            key, list(range(num_object_groups))
+        )
+        num_chunks = len(obj_keys_per_obj_group[0])
 
         # CPU-synchronous sentinel: a GPU retrieve is about to be enqueued.
         # Must be published via publish() (not publish_on_stream) so the
@@ -600,134 +863,80 @@ def retrieve(
             ),
         )
 
-        # ``skip_*_in_chunk`` is expressed in engine-block units
-        # (logical tokens), which is what the kernel's
-        # ``skip_blocks_in_chunk`` argument expects regardless
-        # of per-group compression.
-        ie_logical_block_size = (
-            cache_context.kv_layer_groups_manager.inference_engine_logical_block_size
-        )
-
-        def _retrieve_loop(keys: list[ObjectKey], memory_objs: list[MemoryObj]) -> None:
-            _BATCH_SIZE = cache_context.max_batch_size
-            groups = cache_context.kv_layer_groups_manager.kv_layer_groups
-            for batch_idx, memory_obj_batch in enumerate(
-                batched_iteration(memory_objs, batch_size=_BATCH_SIZE)
-            ):
-                batch_len = len(memory_obj_batch)
-                chunk_start = batch_idx * self._ctx.chunk_size * _BATCH_SIZE
-                chunk_end = chunk_start + self._ctx.chunk_size * batch_len
-
-                effective_start = max(chunk_start, skip_first_n_tokens)
-                if effective_start >= chunk_end:
-                    # Entire batch is within APC range, skip it
-                    continue
-
-                skip_tokens_in_chunk = max(
-                    0,
-                    min(
-                        effective_start - chunk_start,
-                        self._ctx.chunk_size * batch_len - 1,
-                    ),
-                )
-                if skip_tokens_in_chunk % ie_logical_block_size != 0:
-                    logger.error(
-                        "skip_first_n_tokens (%d) is not aligned to "
-                        "inference_engine_logical_block_size (%d), "
-                        "rounding down from %d tokens to %d blocks",
-                        skip_first_n_tokens,
-                        ie_logical_block_size,
-                        skip_tokens_in_chunk,
-                        skip_tokens_in_chunk // ie_logical_block_size,
-                    )
-                start_chunk_id = batch_idx * _BATCH_SIZE
-                end_chunk_id = start_chunk_id + batch_len
-                # Copy from CPU to GPU tmp buffers, then scatter to paged KV — per group
-                # H2D copy: each memory_obj maps to its own batch slot
-                for chunk_idx, memory_obj in enumerate(memory_obj_batch):
-                    # Single object group => object_group_idx=0.
-                    lmcache_memcpy_async_h2d(
-                        memory_obj,
-                        cache_context.get_temp_object_group_buffer(chunk_idx, 0),
-                    )
-                for group_idx, group in enumerate(groups):
-                    bpc = cache_context.calculate_num_blocks(
-                        self._ctx.chunk_size, group_idx
-                    )
-                    chunk_block_ids_gpu = block_ids_per_group_gpu[group_idx][
-                        start_chunk_id * bpc : end_chunk_id * bpc
-                    ]
-                    if chunk_block_ids_gpu.shape[0] != batch_len * bpc:
-                        # Fail closed: a short block-id slice would make the
-                        # transfer kernel write out-of-bounds GPU memory.
-                        raise ValueError(
-                            "RETRIEVE block ID underflow: "
-                            f"group_idx={group_idx} "
-                            f"engine_group_idx={group.engine_group_idx} "
-                            f"batch={batch_idx} "
-                            f"expected={batch_len * bpc} "
-                            f"got={chunk_block_ids_gpu.shape[0]}"
-                        )
-                    group_skip_blocks = cache_context.calculate_num_blocks(
-                        skip_tokens_in_chunk, group_idx
-                    )
-                    tmp_buffers = [
-                        cache_context.get_temp_kernel_group_buffer(i, group_idx)
-                        for i in range(batch_len)
-                    ]
-                    group_kv_pointers = cache_context.get_kernel_group_kv_pointers(
-                        group_idx
-                    )
-                    group_lmcache_chunk_size = cache_context.get_physical_chunk_size(
-                        group_idx
-                    )
-
-                    lmc_ops.multi_layer_block_kv_transfer(
-                        group_kv_pointers,
-                        [tb.data_ptr() for tb in tmp_buffers],
-                        chunk_block_ids_gpu,
-                        cache_context.device,
-                        lmc_ops.TransferDirection.H2D,
-                        cache_context.get_shape_desc(group_idx),
-                        group_lmcache_chunk_size,
-                        cache_context.gpu_kv_format_,
-                        group_skip_blocks,
-                    )
+        blocks_per_chunk = [
+            cache_context.calculate_num_blocks(self._ctx.chunk_size, group_idx)
+            for group_idx in range(
+                cache_context.kv_layer_groups_manager.num_kernel_groups
+            )
+        ]
 
         with (
             torch_dev.device(cache_context.device),
             torch_dev.stream(cache_context.stream),
         ):
-            # Copy all block_ids to GPU once before the loop
-            block_ids_per_group_gpu = cache_context.copy_view_block_ids_to_gpu(
-                gpu_block_ids
-            )
-
             check_interprocess_event_support()
             event = torch_dev.Event(interprocess=True)
 
+            # Fail closed: a short block-id list would drive the transfer
+            # kernel to write out-of-bounds GPU memory. Checked on the raw
+            # block ids, before cutting drops the per-chunk blocks that
+            # sliding-window groups do not need.
+            if any(
+                len(group_block_ids) < num_chunks * bpc
+                for group_block_ids, bpc in zip(
+                    gpu_block_ids, blocks_per_chunk, strict=True
+                )
+            ):
+                logger.error(
+                    "RETRIEVE block ID underflow for request_id=%s: each group "
+                    "needs num_chunks * blocks_per_chunk block IDs for %d "
+                    "chunks (per-group blocks_per_chunk=%s); skipping the "
+                    "retrieve.",
+                    key.request_id,
+                    num_chunks,
+                    blocks_per_chunk,
+                )
+                event.record()
+                return event.ipc_handle(), False
+
+            # Cut and stage all block_ids to GPU once before the transfer
+            block_ids_per_group_gpu = downsample_and_stage_block_ids(
+                cache_context, gpu_block_ids
+            )
+
             prefetched_keys: list[ObjectKey] = []
-            retrieve_succeeded = False
             total_bytes = 0
             try:
-                with self._ctx.storage_manager.read_prefetched_results(
-                    obj_keys
-                ) as memory_objs:
-                    if not memory_objs or len(memory_objs) != len(obj_keys):
-                        logger.error("Some keys not found during retrieve!")
-                        return event.ipc_handle(), False
-
-                    prefetched_keys = obj_keys[: len(memory_objs)]
-                    total_bytes = sum(mo.get_size() for mo in memory_objs)
-                    _retrieve_loop(obj_keys, memory_objs)
-                # Only set True when with-block exits normally
-                retrieve_succeeded = True
+                for obj_group_id in range(num_object_groups):
+                    obj_keys = obj_keys_per_obj_group[obj_group_id]
+                    with self._ctx.storage_manager.read_prefetched_results(
+                        obj_keys
+                    ) as memory_objs:
+                        if not memory_objs or len(memory_objs) != len(obj_keys):
+                            logger.error("Some keys not found during retrieve!")
+                            return event.ipc_handle(), False
+
+                        total_bytes += sum(mo.get_size() for mo in memory_objs)
+
+                        transfer_kv_per_object_group(
+                            cache_context,
+                            block_ids_per_group_gpu,
+                            memory_objs,
+                            object_group_id=obj_group_id,
+                            batch_size=cache_context.max_batch_size,
+                            skip_first_n_tokens=skip_first_n_tokens,
+                            direction=lmc_ops.TransferDirection.H2D,
+                        )
+                        # Extend only after the copy is enqueued: on exception,
+                        # read_prefetched_results releases this group's locks
+                        # itself, and a key must not be released twice.
+                        prefetched_keys.extend(obj_keys)
             except Exception:
                 logger.exception("Cannot retrieve keys due to exception")
                 return event.ipc_handle(), False
             finally:
                 event.record()
-                if retrieve_succeeded:
+                if prefetched_keys:
                     submit_callback_to_stream(
                         cache_context.cupy_stream,
                         "finish_read_prefetched",
@@ -748,7 +957,7 @@ def _retrieve_loop(keys: list[ObjectKey], memory_objs: list[MemoryObj]) -> None:
                         },
                     ),
                 )
-        tokens_retrieved = len(obj_keys) * self._ctx.chunk_size
+        tokens_retrieved = num_chunks * self._ctx.chunk_size
         ed = time.perf_counter()
         logger.info(
             "Retrieved %d tokens in %.3f seconds",
diff --git a/lmcache/v1/multiprocess/modules/lookup.py b/lmcache/v1/multiprocess/modules/lookup.py
index 56f814da51..8172ad03a3 100644
--- a/lmcache/v1/multiprocess/modules/lookup.py
+++ b/lmcache/v1/multiprocess/modules/lookup.py
@@ -266,7 +266,7 @@ def lookup(
         session.set_tokens(list(key.token_ids))
         session.lookup_ipc_key = key
 
-        obj_keys = ipc_key_to_object_keys(key, chunk_hashes)
+        obj_keys = ipc_key_to_object_keys(key, chunk_hashes, [0])[0]
 
         handle = self._ctx.storage_manager.submit_prefetch_task(
             obj_keys,
@@ -399,7 +399,7 @@ def free_lookup_locks(
         )
         if not chunk_hashes:
             return
-        obj_keys = ipc_key_to_object_keys(key, chunk_hashes)
+        obj_keys = ipc_key_to_object_keys(key, chunk_hashes, [0])[0]
 
         extra_count = compute_extra_count(tp_size, key.world_size)
 
@@ -437,7 +437,7 @@ def end_session(self, request_id: str) -> None:
             return
 
         chunk_hashes = [TokenHasher.hash_to_bytes(h) for h in session.get_hashes(0)]
-        obj_keys = ipc_key_to_object_keys(session.lookup_ipc_key, chunk_hashes)
+        obj_keys = ipc_key_to_object_keys(session.lookup_ipc_key, chunk_hashes, [0])[0]
         # unified touch of all keys, which include retrieved and stored keys
         # TODO(chunxiaozheng): when l2 is enabled, the prefetched keys from l2 are temp
         #  and will be deleted after finish_read_prefetched, when we touch all keys,
diff --git a/lmcache/v1/multiprocess/modules/non_gpu_transfer.py b/lmcache/v1/multiprocess/modules/non_gpu_transfer.py
index 0f15026176..5e17911e91 100644
--- a/lmcache/v1/multiprocess/modules/non_gpu_transfer.py
+++ b/lmcache/v1/multiprocess/modules/non_gpu_transfer.py
@@ -161,6 +161,11 @@ def _make_transfer_key(
     ) -> tuple[int, IPCCacheEngineKey]:
         return (instance_id, key)
 
+    def _resolve_single_group_obj_keys(self, key: IPCCacheEngineKey) -> list[ObjectKey]:
+        """Resolve object keys for the single object group used by
+        non-GPU transfers."""
+        return self._ctx.resolve_obj_keys(key, [0])[0]
+
     def register_kv_cache_non_gpu_context(
         self,
         payload: RegisterNonGpuContextPayload,
@@ -310,7 +315,7 @@ def prepare_store(
             key=key,
             instance_id=instance_id,
             context=entry.metadata,
-            resolve_obj_keys=self._ctx.resolve_obj_keys,
+            resolve_obj_keys=self._resolve_single_group_obj_keys,
         )
         session = self._ctx.session_manager.get_or_create(key.request_id)
         session.extras["store_start_time"] = time.perf_counter()
@@ -354,10 +359,12 @@ def commit_store(
             instance_id=instance_id,
             cpu_data=cpu_data,
             context=entry.metadata,
-            resolve_obj_keys=self._ctx.resolve_obj_keys,
+            resolve_obj_keys=self._resolve_single_group_obj_keys,
         )
         if st is not None and result:
-            num_tokens = len(self._ctx.resolve_obj_keys(key)) * self._ctx.chunk_size
+            num_tokens = (
+                len(self._resolve_single_group_obj_keys(key)) * self._ctx.chunk_size
+            )
             logger.info(
                 "Stored %d tokens in %.3f seconds",
                 num_tokens,
@@ -392,7 +399,7 @@ def prepare_retrieve(
         response = strategy.prepare_retrieve(
             key=key,
             instance_id=instance_id,
-            resolve_obj_keys=self._ctx.resolve_obj_keys,
+            resolve_obj_keys=self._resolve_single_group_obj_keys,
         )
         session = self._ctx.session_manager.get_or_create(key.request_id)
         session.extras["retrieve_start_time"] = time.perf_counter()
@@ -422,7 +429,9 @@ def commit_retrieve(
         st = session.extras.pop("retrieve_start_time", None)
         result = strategy.commit_retrieve(key=key, instance_id=instance_id)
         if st is not None:
-            num_tokens = len(self._ctx.resolve_obj_keys(key)) * self._ctx.chunk_size
+            num_tokens = (
+                len(self._resolve_single_group_obj_keys(key)) * self._ctx.chunk_size
+            )
             logger.info(
                 "Retrieved %d tokens in %.3f seconds",
                 num_tokens,
diff --git a/tests/v1/distributed/test_fs_l2_adapter_keys.py b/tests/v1/distributed/test_fs_l2_adapter_keys.py
index 858e7c9abf..c2d834e933 100644
--- a/tests/v1/distributed/test_fs_l2_adapter_keys.py
+++ b/tests/v1/distributed/test_fs_l2_adapter_keys.py
@@ -132,7 +132,7 @@ def test_forwards_cache_salt_single_worker(self):
             token_ids=[1, 2, 3],
             cache_salt="alice",
         )
-        out = ipc_key_to_object_keys(k, [b"h1", b"h2"])
+        out = ipc_key_to_object_keys(k, [b"h1", b"h2"], [0])[0]
         assert len(out) == 2
         assert all(o.cache_salt == "alice" for o in out)
 
@@ -149,7 +149,7 @@ def test_forwards_cache_salt_scheduler_path(self):
             token_ids=[1, 2, 3],
             cache_salt="alice",
         )
-        out = ipc_key_to_object_keys(k, [b"h1"])
+        out = ipc_key_to_object_keys(k, [b"h1"], [0])[0]
         assert len(out) == 4
         assert all(o.cache_salt == "alice" for o in out)
 
@@ -164,10 +164,10 @@ def test_empty_salt_passes_through(self):
             worker_id=0,
             token_ids=[1],
         )
-        out = ipc_key_to_object_keys(k, [b"h1"])
+        out = ipc_key_to_object_keys(k, [b"h1"], [0])[0]
         assert all(o.cache_salt == "" for o in out)
 
-    def test_object_group_id_defaults_to_zero(self):
+    def test_object_group_id_zero(self):
         # First Party
         from lmcache.v1.distributed.api import ipc_key_to_object_keys
         from lmcache.v1.multiprocess.custom_types import IPCCacheEngineKey
@@ -178,7 +178,7 @@ def test_object_group_id_defaults_to_zero(self):
             worker_id=0,
             token_ids=[1, 2],
         )
-        out = ipc_key_to_object_keys(k, [b"h1", b"h2"])
+        out = ipc_key_to_object_keys(k, [b"h1", b"h2"], [0])[0]
         assert all(o.object_group_id == 0 for o in out)
 
     def test_object_group_id_propagates_to_all_keys(self):
@@ -194,10 +194,35 @@ def test_object_group_id_propagates_to_all_keys(self):
             worker_id=None,
             token_ids=[1, 2, 3],
         )
-        out = ipc_key_to_object_keys(k, [b"h1"], object_group_id=3)
+        out = ipc_key_to_object_keys(k, [b"h1"], [3])[0]
         assert len(out) == 4
         assert all(o.object_group_id == 3 for o in out)
 
+    def test_multiple_object_groups(self):
+        """Each requested object group gets its own positional key list."""
+        # First Party
+        from lmcache.v1.distributed.api import ipc_key_to_object_keys
+        from lmcache.v1.multiprocess.custom_types import IPCCacheEngineKey
+
+        k = IPCCacheEngineKey.from_token_ids(
+            model_name="m",
+            world_size=2,
+            worker_id=None,
+            token_ids=[1, 2, 3],
+            cache_salt="alice",
+        )
+        out = ipc_key_to_object_keys(k, [b"h1", b"h2"], [0, 3])
+        assert len(out) == 2
+        # 2 chunks * 2 workers = 4 keys per group.
+        assert all(len(group_keys) == 4 for group_keys in out)
+        assert all(o.object_group_id == 0 for o in out[0])
+        assert all(o.object_group_id == 3 for o in out[1])
+        # The groups differ only in object_group_id.
+        for first, second in zip(out[0], out[1], strict=True):
+            assert first.chunk_hash == second.chunk_hash
+            assert first.kv_rank == second.kv_rank
+            assert first.cache_salt == second.cache_salt
+
 
 class TestObjectKeyValidation:
     """``ObjectKey.__post_init__`` rejects invalid ``object_group_id``."""
diff --git a/tests/v1/multiprocess/test_batched_iteration_with_skip.py b/tests/v1/multiprocess/test_batched_iteration_with_skip.py
new file mode 100644
index 0000000000..bc9a7bee50
--- /dev/null
+++ b/tests/v1/multiprocess/test_batched_iteration_with_skip.py
@@ -0,0 +1,112 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Unit tests for ``batched_iteration_with_skip``."""
+
+# Third Party
+import pytest
+
+# First Party
+from lmcache.v1.multiprocess.modules.gpu_transfer import (
+    batched_iteration_with_skip,
+)
+
+
+def test_basic_batching_with_skip():
+    """Skipped items are dropped and reported indices stay in original space."""
+    data = list(range(10))
+    result = list(batched_iteration_with_skip(data, batch_size=3, skip_count=2))
+
+    assert result == [
+        (2, (2, 3, 4)),
+        (5, (5, 6, 7)),
+        (8, (8, 9)),
+    ]
+
+
+def test_skip_count_zero_matches_plain_batching():
+    """With skip_count=0 every item is yielded, indexed from 0."""
+    data = list(range(7))
+    result = list(batched_iteration_with_skip(data, batch_size=2, skip_count=0))
+
+    assert result == [
+        (0, (0, 1)),
+        (2, (2, 3)),
+        (4, (4, 5)),
+        (6, (6,)),
+    ]
+    # The concatenation of all batches equals the unskipped tail of the list.
+    flattened = [item for _, batch in result for item in batch]
+    assert flattened == data
+
+
+def test_batch_start_indices_are_original_indices():
+    """Reported start index is the original list index, accounting for skip."""
+    data = list(range(20))
+    result = list(batched_iteration_with_skip(data, batch_size=5, skip_count=10))
+
+    start_indices = [start for start, _ in result]
+    assert start_indices == [10, 15]
+    # The docstring example: skip_count=10, batch_size=5 -> first start idx 10.
+    assert result[0] == (10, (10, 11, 12, 13, 14))
+
+
+def test_partial_final_batch():
+    """The final short batch still reports the correct start index."""
+    data = list(range(8))
+    result = list(batched_iteration_with_skip(data, batch_size=3, skip_count=1))
+
+    assert result == [
+        (1, (1, 2, 3)),
+        (4, (4, 5, 6)),
+        (7, (7,)),
+    ]
+
+
+def test_skip_equal_to_length_yields_nothing():
+    """Skipping the entire list yields no batches."""
+    data = list(range(5))
+    result = list(batched_iteration_with_skip(data, batch_size=2, skip_count=5))
+    assert result == []
+
+
+def test_skip_larger_than_length_yields_nothing():
+    """Skipping past the end of the list yields no batches and does not raise."""
+    data = list(range(5))
+    result = list(batched_iteration_with_skip(data, batch_size=2, skip_count=100))
+    assert result == []
+
+
+def test_empty_list():
+    """An empty input yields no batches regardless of skip_count."""
+    assert list(batched_iteration_with_skip([], batch_size=4, skip_count=0)) == []
+    assert list(batched_iteration_with_skip([], batch_size=4, skip_count=3)) == []
+
+
+def test_batch_size_larger_than_remaining():
+    """A batch_size exceeding the remaining items yields one full-remainder batch."""
+    data = list(range(6))
+    result = list(batched_iteration_with_skip(data, batch_size=100, skip_count=2))
+    assert result == [(2, (2, 3, 4, 5))]
+
+
+@pytest.mark.parametrize("batch_size", [0, -1, -10])
+def test_invalid_batch_size_raises(batch_size):
+    """A batch_size below 1 raises ValueError."""
+    with pytest.raises(ValueError, match="batch size must be at least one"):
+        list(batched_iteration_with_skip([1, 2, 3], batch_size, skip_count=0))
+
+
+@pytest.mark.parametrize("skip_count", [-1, -5])
+def test_negative_skip_count_raises(skip_count):
+    """A negative skip_count raises ValueError."""
+    with pytest.raises(ValueError, match="skip_count must be non-negative"):
+        list(
+            batched_iteration_with_skip([1, 2, 3], batch_size=2, skip_count=skip_count)
+        )
+
+
+def test_returns_tuples_not_lists():
+    """Each yielded batch is a tuple, mirroring batched_iteration."""
+    _, batch = next(
+        batched_iteration_with_skip([1, 2, 3, 4], batch_size=2, skip_count=0)
+    )
+    assert isinstance(batch, tuple)
diff --git a/tests/v1/multiprocess/test_free_locks.py b/tests/v1/multiprocess/test_free_locks.py
index 7efc0c6fb5..a9e695d235 100644
--- a/tests/v1/multiprocess/test_free_locks.py
+++ b/tests/v1/multiprocess/test_free_locks.py
@@ -105,7 +105,7 @@ def test_server_free_lookup_locks_calls_finish_read_prefetched():
     sentinel_obj_keys = [MagicMock()]
     with patch(
         "lmcache.v1.multiprocess.modules.lookup.ipc_key_to_object_keys",
-        return_value=sentinel_obj_keys,
+        return_value=[sentinel_obj_keys],
     ):
         module.free_lookup_locks(key, 1)
 
diff --git a/tests/v1/multiprocess/test_non_cuda_data_transfer.py b/tests/v1/multiprocess/test_non_cuda_data_transfer.py
index c8e998b5a6..8262d8566d 100644
--- a/tests/v1/multiprocess/test_non_cuda_data_transfer.py
+++ b/tests/v1/multiprocess/test_non_cuda_data_transfer.py
@@ -678,7 +678,7 @@ def _create(
         stack.enter_context(
             patch(
                 "lmcache.v1.multiprocess.engine_context.ipc_key_to_object_keys",
-                return_value=object_keys or ["obj"],
+                return_value=[object_keys or ["obj"]],
             )
         )
 
diff --git a/tests/v1/multiprocess/test_unified_touch.py b/tests/v1/multiprocess/test_unified_touch.py
index 530af8de8b..856cbb7157 100644
--- a/tests/v1/multiprocess/test_unified_touch.py
+++ b/tests/v1/multiprocess/test_unified_touch.py
@@ -210,7 +210,7 @@ def test_end_session_generates_correct_keys(self, hasher: TokenHasher):
         assert removed.lookup_ipc_key is not None
 
         chunk_hashes = [TokenHasher.hash_to_bytes(h) for h in removed.get_hashes(0)]
-        obj_keys = ipc_key_to_object_keys(removed.lookup_ipc_key, chunk_hashes)
+        obj_keys = ipc_key_to_object_keys(removed.lookup_ipc_key, chunk_hashes, [0])[0]
 
         # With world_size=1 and worker_id=None, should have 3 keys
         assert len(obj_keys) == 3
@@ -238,7 +238,7 @@ def test_end_session_expands_keys_for_world_size(self, hasher: TokenHasher):
         assert removed.lookup_ipc_key is not None
 
         chunk_hashes = [TokenHasher.hash_to_bytes(h) for h in removed.get_hashes(0)]
-        obj_keys = ipc_key_to_object_keys(removed.lookup_ipc_key, chunk_hashes)
+        obj_keys = ipc_key_to_object_keys(removed.lookup_ipc_key, chunk_hashes, [0])[0]
 
         # 2 chunks * 2 workers = 4 keys
         assert len(obj_keys) == 4
@@ -279,7 +279,7 @@ def test_end_session_empty_hashes_produces_no_keys(self, hasher: TokenHasher):
         assert removed.lookup_ipc_key is not None
 
         chunk_hashes = [TokenHasher.hash_to_bytes(h) for h in removed.get_hashes(0)]
-        obj_keys = ipc_key_to_object_keys(removed.lookup_ipc_key, chunk_hashes)
+        obj_keys = ipc_key_to_object_keys(removed.lookup_ipc_key, chunk_hashes, [0])[0]
         assert len(obj_keys) == 0
 
     def test_end_session_hashes_cover_retrieve_and_store(self, hasher: TokenHasher):

From adedd1fcfdee3ccf1025918a33c39e80b39f90ef Mon Sep 17 00:00:00 2001
From: Roy Huang <roy.y.huang@gmail.com>
Date: Wed, 10 Jun 2026 18:03:38 -0700
Subject: [PATCH 31/57] [Operator] CacheBlend: CacheBlendEngine CRD + injection
 webhook (#3543)

* feat(operator): add CacheBlendEngine CRD + CacheBlend injection webhook

Signed-off-by: royyhuang <roy.y.huang@gmail.com>
---
 docs/source/mp/operator.rst                   |  168 +
 operator/DESIGN.md                            |  122 +-
 operator/PROJECT                              |    9 +
 operator/README.md                            |  279 +-
 .../api/v1alpha1/cacheblendengine_defaults.go |   85 +
 .../api/v1alpha1/cacheblendengine_test.go     |  559 +++
 .../api/v1alpha1/cacheblendengine_types.go    |  265 ++
 .../v1alpha1/cacheblendengine_validation.go   |   91 +
 .../api/v1alpha1/lmcacheengine_defaults.go    |   12 +-
 operator/api/v1alpha1/lmcacheengine_test.go   |    8 +-
 .../api/v1alpha1/lmcacheengine_validation.go  |  134 +-
 .../api/v1alpha1/zz_generated.deepcopy.go     |  296 +-
 operator/cmd/main.go                          |   19 +
 operator/config/certmanager/certificate.yaml  |   26 +
 operator/config/certmanager/issuer.yaml       |   12 +
 .../config/certmanager/kustomization.yaml     |    6 +
 .../config/certmanager/kustomizeconfig.yaml   |    8 +
 .../lmcache.lmcache.ai_cacheblendengines.yaml | 3712 +++++++++++++++++
 operator/config/crd/kustomization.yaml        |    5 +-
 operator/config/default/kustomization.yaml    |  144 +-
 .../config/default/manager_webhook_patch.yaml |   20 +
 operator/config/rbac/role.yaml                |    3 +
 operator/config/samples/kustomization.yaml    |    1 +
 .../lmcache_v1alpha1_cacheblendengine.yaml    |   88 +
 .../samples/vllm_cacheblend_deployment.yaml   |   86 +
 operator/config/webhook/kustomization.yaml    |   15 +
 operator/config/webhook/kustomizeconfig.yaml  |   25 +
 operator/config/webhook/manifests.yaml        |   26 +
 .../mutating_webhook_selectors_patch.yaml     |   26 +
 operator/config/webhook/service.yaml          |   16 +
 operator/go.mod                               |    2 +-
 .../cacheblend_reconcile_helpers.go           |  435 ++
 .../controller/cacheblendengine_controller.go |  128 +
 .../cacheblendengine_controller_test.go       |  212 +
 .../internal/resources/cacheblend_engine.go   |  183 +
 .../resources/cacheblend_engine_test.go       |  379 ++
 operator/internal/resources/configmap.go      |   63 +-
 operator/internal/resources/daemonset.go      |   65 +-
 operator/internal/resources/resources_test.go |   20 +-
 operator/internal/resources/service.go        |   39 +-
 operator/internal/resources/servicemonitor.go |   17 +-
 .../webhook/cacheblend_inject_builders.go     |  296 ++
 operator/internal/webhook/pod_injector.go     |  415 ++
 .../webhook/pod_injector_envtest_test.go      |  146 +
 .../internal/webhook/pod_injector_test.go     |  574 +++
 .../internal/webhook/webhook_suite_test.go    |  173 +
 operator/make/build.mk                        |    4 +-
 47 files changed, 8984 insertions(+), 433 deletions(-)
 create mode 100644 operator/api/v1alpha1/cacheblendengine_defaults.go
 create mode 100644 operator/api/v1alpha1/cacheblendengine_test.go
 create mode 100644 operator/api/v1alpha1/cacheblendengine_types.go
 create mode 100644 operator/api/v1alpha1/cacheblendengine_validation.go
 create mode 100644 operator/config/certmanager/certificate.yaml
 create mode 100644 operator/config/certmanager/issuer.yaml
 create mode 100644 operator/config/certmanager/kustomization.yaml
 create mode 100644 operator/config/certmanager/kustomizeconfig.yaml
 create mode 100644 operator/config/crd/bases/lmcache.lmcache.ai_cacheblendengines.yaml
 create mode 100644 operator/config/default/manager_webhook_patch.yaml
 create mode 100644 operator/config/samples/lmcache_v1alpha1_cacheblendengine.yaml
 create mode 100644 operator/config/samples/vllm_cacheblend_deployment.yaml
 create mode 100644 operator/config/webhook/kustomization.yaml
 create mode 100644 operator/config/webhook/kustomizeconfig.yaml
 create mode 100644 operator/config/webhook/manifests.yaml
 create mode 100644 operator/config/webhook/mutating_webhook_selectors_patch.yaml
 create mode 100644 operator/config/webhook/service.yaml
 create mode 100644 operator/internal/controller/cacheblend_reconcile_helpers.go
 create mode 100644 operator/internal/controller/cacheblendengine_controller.go
 create mode 100644 operator/internal/controller/cacheblendengine_controller_test.go
 create mode 100644 operator/internal/resources/cacheblend_engine.go
 create mode 100644 operator/internal/resources/cacheblend_engine_test.go
 create mode 100644 operator/internal/webhook/cacheblend_inject_builders.go
 create mode 100644 operator/internal/webhook/pod_injector.go
 create mode 100644 operator/internal/webhook/pod_injector_envtest_test.go
 create mode 100644 operator/internal/webhook/pod_injector_test.go
 create mode 100644 operator/internal/webhook/webhook_suite_test.go

diff --git a/docs/source/mp/operator.rst b/docs/source/mp/operator.rst
index b01b73642b..666a914b0e 100644
--- a/docs/source/mp/operator.rst
+++ b/docs/source/mp/operator.rst
@@ -591,6 +591,174 @@ Override Auto-Computed Resources
         limits:
           memory: "100Gi"
 
+CacheBlend
+----------
+
+CacheBlend reuses cached KV at shifted (non-prefix) positions by recomputing a
+small subset of tokens.  The operator manages it as a second CRD,
+``CacheBlendEngine``, plus a **mutating admission webhook** that injects the
+pure-Python ``lmcache-cacheblend`` vLLM plugin into your serving pods -- so you
+do **not** rebuild the vLLM image.  See :doc:`/kv_cache_optimizations/blending`
+for the technique itself.
+
+It has two halves the operator runs together:
+
+- a GPU-resident ``blend_v3`` engine (``lmcache server --engine-type blend_v3``),
+  deployed as a DaemonSet with the **same GPU model as** ``LMCacheEngine``
+  (``privileged`` + ``runtimeClassName: nvidia`` + ``NVIDIA_VISIBLE_DEVICES=all``
+  + ``hostIPC``, and **no** ``nvidia.com/gpu`` claim) so it shares the vLLM GPU
+  for same-device CUDA IPC; and
+- the vLLM-side plugin, injected into opted-in pods by the webhook.
+
+Additional Prerequisites
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Beyond the operator prerequisites above:
+
+- **cert-manager** -- the webhook's serving certificate is issued by a
+  cert-manager ``Issuer`` + ``Certificate``.  Install it before ``make deploy``:
+
+  .. code-block:: bash
+
+      kubectl apply -f https://github.com/cert-manager/cert-manager/releases/latest/download/cert-manager.yaml
+      kubectl -n cert-manager wait --for=condition=Available deploy --all --timeout=180s
+
+- **Deploy with the webhook** -- use ``make deploy`` (not ``make run``, which is
+  controller-only and disables the webhook via ``ENABLE_WEBHOOKS=false``).
+- **Pod Security Standards** -- the webhook injects ``hostIPC``/``privileged``,
+  which the ``baseline``/``restricted`` profiles reject, so label the engine's
+  and the vLLM pod's namespaces ``pod-security.kubernetes.io/enforce=privileged``.
+
+Deploying a CacheBlendEngine
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: yaml
+
+    apiVersion: lmcache.lmcache.ai/v1alpha1
+    kind: CacheBlendEngine
+    metadata:
+      name: my-cacheblend
+    spec:
+      l1:
+        sizeGB: 60
+      injection:
+        # The (private) cacheblend-plugin init-container image -- repository/tag/
+        # pullPolicy, like spec.image.  Set repository to YOUR image; the
+        # inherited engine-image default is not a valid payload.
+        payloadImage:
+          repository: <registry>/cacheblend-plugin
+          tag: <tag>
+        # Appended to the vLLM pod so the private payload image can pull; the
+        # Secret must exist in the vLLM pod's namespace.
+        imagePullSecrets:
+          - name: my-registry-secret
+
+The engine runs ``lmcache server --engine-type blend_v3`` as a DaemonSet and
+emits a ``my-cacheblend-connection`` ConfigMap with the ``CBKVConnector``
+``kv-transfer-config`` (the operator wires the node-local Service host/port and
+the ``cb.*`` tunables).
+
+Opting a vLLM Pod In
+~~~~~~~~~~~~~~~~~~~~~
+
+Label the pod template for the webhook and bind it to an engine by name.  Launch
+vLLM via the image **ENTRYPOINT** (args only) -- a
+``command: ["/bin/sh", "-c", ...]`` wrapper is skipped, since appended args would
+not reach ``vllm serve``:
+
+.. code-block:: yaml
+
+    apiVersion: apps/v1
+    kind: Deployment
+    metadata:
+      name: vllm-cacheblend
+    spec:
+      replicas: 1
+      selector:
+        matchLabels:
+          app: vllm-cacheblend
+      template:
+        metadata:
+          labels:
+            app: vllm-cacheblend
+            lmcache.ai/cacheblend-inject: "true"          # opt-in (webhook objectSelector)
+          annotations:
+            lmcache.ai/cacheblend-engine: "my-cacheblend" # bind to the engine
+        spec:
+          runtimeClassName: nvidia
+          containers:
+            - name: vllm
+              image: lmcache/vllm-openai:<pinned-tag>
+              args: ["<your-model>", "--port", "8000", "--gpu-memory-utilization", "0.8"]
+              resources:
+                limits:
+                  nvidia.com/gpu: "1"
+
+The webhook injects the plugin init container, ``PYTHONPATH``, ``hostIPC``, the
+private-image pull secret, and the required CacheBlend vLLM flags
+(``--attention-backend CUSTOM``, ``--kv-transfer-config`` from the engine's
+connection ConfigMap, ``--block-size 64``, ``--pipeline-parallel-size 1``,
+``--no-enable-chunked-prefill``, ``--no-async-scheduling``, ``--enforce-eager``).
+You supply only the model and your non-CacheBlend flags.
+
+Verifying Injection
+~~~~~~~~~~~~~~~~~~~~~
+
+The webhook mutates **Pods**, not the Deployment, so inspect a pod:
+
+.. code-block:: bash
+
+    kubectl get pod -l app=vllm-cacheblend -o yaml | \
+      grep -E "initContainers|cb-plugin|PYTHONPATH|attention-backend|cacheblend-injected|skip-reason"
+
+If nothing was injected, check the pod's ``lmcache.ai/cacheblend-skip-reason``
+annotation: ``command-override`` (a ``sh -c`` wrapper was used),
+``kv-transfer-config-present`` (you set your own), ``engine-not-found`` (the
+``<name>-connection`` ConfigMap is missing), ``payload-image-unset`` (the
+engine's ``injection.payloadImage`` has no repository), or
+``target-container-not-found`` (the requested ``targetContainer`` /
+``cacheblend-container`` annotation names a container the pod does not have).
+With ``failurePolicy: Ignore`` a
+webhook/cert problem also leaves the pod un-mutated silently -- confirm the
+operator pod is ``Running`` and the ``MutatingWebhookConfiguration`` exists.
+
+CacheBlendEngine Fields
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+``CacheBlendEngineSpec`` mirrors ``LMCacheEngineSpec`` (every field in the CRD
+Spec Reference above) and adds:
+
+.. list-table::
+   :header-rows: 1
+   :widths: 35 20 45
+
+   * - Field
+     - Default
+     - Description
+   * - ``blend.checkLayer``
+     - ``1``
+     - Layer at which token importance is scored (``cb.check_layer``).
+   * - ``blend.recompRatio``
+     - ``0.15``
+     - Fraction of non-prefix-hit tokens recomputed (``cb.recomp_ratio``).
+   * - ``injection.payloadImage``
+     - *required*
+     - The (private) cacheblend-plugin init-container image
+       (``repository`` / ``tag`` / ``pullPolicy``).  Set ``repository`` -- the
+       inherited engine-image default is not a valid payload.
+   * - ``injection.imagePullSecrets``
+     - --
+     - Pull secrets appended to the vLLM pod for the private payload image.
+   * - ``injection.targetContainer``
+     - first container
+     - Name of the vLLM container to inject into.
+   * - ``injection.cudagraph``
+     - ``eager``
+     - ``eager`` | ``piecewise`` | ``full_decode_only`` (never ``full``).
+
+``server.chunkSize`` defaults to ``256`` and must equal 256 (the blend matcher
+requires ``chunk_size == vLLM --block-size * 4``).
+
 Operator vs Manual Deployment
 -----------------------------
 
diff --git a/operator/DESIGN.md b/operator/DESIGN.md
index 883ae5cc06..044e3079d4 100644
--- a/operator/DESIGN.md
+++ b/operator/DESIGN.md
@@ -357,10 +357,130 @@ reach (Redis L2 keys, federation deregistration, etc.).
 
 ---
 
+## CacheBlend: `CacheBlendEngine` CRD + Injection Webhook
+
+CacheBlend reuses cached KV at shifted positions. It has two halves the operator
+manages together: a **GPU-resident blend engine** (server side) and a
+**vLLM-side plugin** that must be loaded into the serving container. The operator
+ships both as a second CRD plus a mutating admission webhook.
+
+> This implements what was previously deferred as a future `blend.enabled` field
+> on `LMCacheEngine`. It is instead a **separate `CacheBlendEngine` kind** (with
+> its own controller) plus an injection webhook — cleaner separation, and no
+> behavior change to `LMCacheEngine`.
+
+### `CacheBlendEngine` CRD
+
+Group `lmcache.lmcache.ai`, `v1alpha1`, kind `CacheBlendEngine` (shortName `cbe`).
+The spec **mirrors `LMCacheEngineSpec`** (image, server, l1, eviction, prometheus,
+l2Backend, scheduling, overrides, imagePullSecrets) and adds:
+
+- `blend.checkLayer` (default 1) and `blend.recompRatio` (default 0.15) — CB
+  tunables fed to the vLLM connector.
+- `injection` — what the webhook injects into vLLM pods: `payloadImage` (an
+  `ImageSpec` — `repository`/`tag`/`pullPolicy`, like `spec.image` — for the
+  private `lmcache-cacheblend` init-container image; set `repository` explicitly,
+  the inherited engine-image default is not a valid payload), `imagePullSecrets`
+  (appended to the vLLM pod so the private payload image can pull — the Secret
+  must exist in the vLLM pod's namespace), `targetContainer` (default: first
+  container), and `cudagraph` (`eager`|`piecewise`|`full_decode_only`, default
+  `eager`).
+- `server.chunkSize` defaults to **256** and is validated to equal 256 (the blend
+  matcher requires `chunk_size == vLLM --block-size * 4`).
+
+### The blend engine (controller)
+
+`CacheBlendEngineReconciler` mirrors `LMCacheEngineReconciler` and reconciles a
+DaemonSet running `lmcache server --engine-type blend_v3` (plus
+`--l1-align-bytes 16777216`), a node-local lookup Service, a metrics Service, and
+a `<name>-connection` ConfigMap. **GPU model is identical to `LMCacheEngine`**:
+`privileged` + `runtimeClassName: nvidia` + `NVIDIA_VISIBLE_DEVICES=all` +
+`hostIPC: true`, with **no `nvidia.com/gpu` device-plugin claim** — the engine
+*shares* the vLLM GPU rather than reserving one, because the blend server scatters
+re-RoPE'd KV directly into vLLM's paged KV over **same-device CUDA IPC**. The
+engine resource builders are the same name/spec-keyed cores used by
+`LMCacheEngine`.
+
+The `<name>-connection` ConfigMap carries the **`CBKVConnector`**
+`kv-transfer-config` (vs `LMCacheMPConnector` for `LMCacheEngine`) — same node-local
+`tcp://` host/port shape, plus the `cb.*` tunables:
+
+```json
+{
+  "kv_connector": "CBKVConnector",
+  "kv_connector_module_path": "lmcache_cacheblend.connector",
+  "kv_role": "kv_both",
+  "kv_connector_extra_config": {
+    "lmcache.mp.host": "tcp://<name>.<namespace>.svc.cluster.local",
+    "lmcache.mp.port": "<server.port>",
+    "cb.check_layer": <blend.checkLayer>,
+    "cb.recomp_ratio": <blend.recompRatio>
+  }
+}
+```
+
+Co-location works exactly like `LMCacheEngine`: one engine per GPU node
+(DaemonSet), and the node-local Service (`internalTrafficPolicy: Local`) routes a
+vLLM pod to the same-node engine. The control-plane RPC is TCP via that Service;
+the data-plane KV write is CUDA IPC on the shared GPU.
+
+### The injection webhook
+
+A mutating admission webhook (`/mutate--v1-pod`, `CREATE`, `failurePolicy: Ignore`)
+injects the `lmcache-cacheblend` plugin into opted-in pods so a **stock vLLM image
+needs no rebuild**. A pod opts in with label `lmcache.ai/cacheblend-inject: "true"`
+and binds to an engine with annotation `lmcache.ai/cacheblend-engine: <name>`. The
+webhook then applies:
+
+| Mutation | What |
+|---|---|
+| pod `hostIPC: true` | required for CUDA IPC with the node-local engine |
+| `cb-plugin` emptyDir + payload init container | the busybox payload `cp -a`'s the pure-Python plugin tree onto the shared volume |
+| readOnly mount + `PYTHONPATH=/cb-plugin` on the vLLM container | vLLM discovers the plugin via its `vllm.general_plugins` entry point |
+| append required vLLM args | `--attention-backend CUSTOM`, `--kv-transfer-config <from the connection ConfigMap>`, `--block-size 64`, `--pipeline-parallel-size 1`, `--no-enable-chunked-prefill`, `--no-async-scheduling`, `--enforce-eager` (or the configured cudagraph) |
+| append `injection.imagePullSecrets` | so the private payload image can pull |
+| stamp `lmcache.ai/cacheblend-injected: "true"` | idempotency guard |
+
+The webhook **skips** (stamping `lmcache.ai/cacheblend-skip-reason`) when: the
+target container overrides `command` (a `sh -c` wrapper — appended args wouldn't
+reach `vllm serve`); the user already supplies `--kv-transfer-config` (not
+clobbered); the named engine's connection ConfigMap doesn't exist; the engine's
+`injection.payloadImage` resolves to an empty reference (`payload-image-unset`);
+or the requested `targetContainer`/`cacheblend-container` annotation names a
+container that does not exist on the pod (`target-container-not-found`). It does
+**not** gate on engine readiness — like `LMCacheEngine`, the connector connects
+when the engine comes up. Args are emitted in two-token form
+(`--attention-backend CUSTOM`); the replace-not-duplicate dedup still recognizes a
+user-supplied `--flag=value`.
+
+### Prerequisites
+
+- **cert-manager** — the webhook's serving cert is a cert-manager `Issuer` +
+  `Certificate` (caBundle injected via `inject-ca-from`); install it before
+  `make deploy`.
+- **`make deploy`, not `make run`** — `make run` sets `ENABLE_WEBHOOKS=false` and
+  installs no `MutatingWebhookConfiguration`; it is controller-only. The webhook
+  needs the operator running as an in-cluster pod.
+- **Pod Security Standards** — the injected `hostIPC`/`privileged` is rejected by
+  the `baseline`/`restricted` profiles, so the engine's and the vLLM pod's
+  namespaces must be labeled `pod-security.kubernetes.io/enforce=privileged`.
+
+### Resources created (for a `CacheBlendEngine` named `cb`)
+
+| Resource | Name | Purpose |
+|---|---|---|
+| DaemonSet | `cb` | `lmcache server --engine-type blend_v3` on GPU nodes |
+| Service (node-local) | `cb` | same-node discovery for vLLM (`CBKVConnector`) |
+| Service (headless) | `cb-metrics` | Prometheus scrape target |
+| ConfigMap | `cb-connection` | `CBKVConnector` kv-transfer-config |
+| MutatingWebhookConfiguration | (operator-wide) | injects the plugin into opted-in vLLM pods |
+
+---
+
 ## Future Extensibility
 
 - **L2 backends:** The RESP (Redis/Valkey) adapter is natively supported with typed CRD fields and Secret-based auth injection. Other adapter types (nixl_store, fs, mock, mooncake_store, raw_block) can be configured via the `raw` escape hatch. Currently only a single L2 adapter is supported at a time. LMCache MP mode is designed to support multiple adapters in cascade, but this is not yet fully tested — once validated, the operator will support multiple adapters.
-- **Blend mode:** Future `LMCacheEngine` field `blend.enabled` to switch entrypoint from `server.py` to `blend_server.py` (deferred from v1alpha1).
+- **Blend mode:** Implemented as the separate `CacheBlendEngine` CRD + injection webhook — see [CacheBlend](#cacheblend-cacheblendengine-crd--injection-webhook) above. (This supersedes the earlier idea of a `blend.enabled` field on `LMCacheEngine`.)
 - **Update strategy:** Future `spec.updateStrategy` field for `RollingUpdate`/`OnDelete` control on the DaemonSet.
 - **Additional CRDs:** `LMCacheKeyManager` (global key management), `LMCacheMonitor` (engine state monitoring), `LMCacheFederation` (cross-cluster P2P topology).
 
diff --git a/operator/PROJECT b/operator/PROJECT
index 4146b6ec23..c41409d423 100644
--- a/operator/PROJECT
+++ b/operator/PROJECT
@@ -18,4 +18,13 @@ resources:
   kind: LMCacheEngine
   path: github.com/LMCache/LMCache/api/v1alpha1
   version: v1alpha1
+- api:
+    crdVersion: v1
+    namespaced: true
+  controller: true
+  domain: lmcache.ai
+  group: lmcache
+  kind: CacheBlendEngine
+  path: github.com/LMCache/LMCache/api/v1alpha1
+  version: v1alpha1
 version: "3"
diff --git a/operator/README.md b/operator/README.md
index 2c79771b9c..633a9f0202 100644
--- a/operator/README.md
+++ b/operator/README.md
@@ -11,6 +11,7 @@ See [DESIGN.md](DESIGN.md) for architecture details, reconciliation logic, and C
 - For NVIDIA GPUs (default): NVIDIA GPU Operator with the `nvidia` RuntimeClass available on GPU nodes
 - For AMD GPUs: set `spec.gpuVendor: amd` in your `LMCacheEngine` (see [AMD GPUs (ROCm)](#amd-gpus-rocm) below)
 - (Optional) [Prometheus Operator](https://github.com/prometheus-operator/prometheus-operator) for ServiceMonitor support
+- (CacheBlend only) [cert-manager](https://cert-manager.io) for the injection webhook's serving cert — see [CacheBlend](#cacheblend) below
 
 > [!IMPORTANT]
 > By default the operator runs LMCache pods with `runtimeClassName: nvidia` and `privileged: true` to gain GPU visibility without consuming GPU resources via the device plugin. This allows the serving engine (e.g., vLLM) to claim all GPUs on the node. Clusters using Pod Security Standards must allow the `privileged` profile for the LMCache namespace.
@@ -46,72 +47,17 @@ make deploy IMG=<your-registry>/lmcache-operator:latest
 
 ### 2. Deploy an LMCacheEngine
 
-A minimal CR deploys a DaemonSet with 60 GB L1 cache on every node:
-
-```yaml
-# lmcache-engine.yaml
-apiVersion: lmcache.lmcache.ai/v1alpha1
-kind: LMCacheEngine
-metadata:
-  name: my-cache
-spec:
-  l1:
-    sizeGB: 60
-```
+The minimal CR just needs `l1.sizeGB`. Apply the sample (a fully-commented field reference covering every option):
 
 ```bash
-kubectl apply -f lmcache-engine.yaml
+kubectl apply -f config/samples/lmcache_v1alpha1_lmcacheengine.yaml
 ```
 
 The operator automatically handles `hostIPC`, GPU visibility (`runtimeClassName: nvidia`, `privileged: true`), node-local service routing, resource sizing, and Prometheus metrics — see [DESIGN.md](DESIGN.md) for details.
 
 ### 3. Connect vLLM to LMCache
 
-The operator creates a ConfigMap named `<engine-name>-connection` containing the `kv-transfer-config` JSON that vLLM needs. Use it in your vLLM Deployment:
-
-```yaml
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vllm
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vllm
-  template:
-    metadata:
-      labels:
-        app: vllm
-    spec:
-      # Required for CUDA IPC between vLLM and LMCache
-      hostIPC: true
-      containers:
-        - name: vllm
-          image: lmcache/vllm-openai:<pinned-tag>
-          command: ["/bin/sh", "-c"]
-          args:
-            - |
-              exec python3 -m vllm.entrypoints.openai.api_server \
-                --model <your-model> \
-                --port 8000 \
-                --gpu-memory-utilization 0.8 \
-                --kv-transfer-config "$(cat /etc/lmcache/kv-transfer-config.json)"
-          ports:
-            - name: http
-              containerPort: 8000
-          volumeMounts:
-            - name: kv-transfer-config
-              mountPath: /etc/lmcache
-              readOnly: true
-          resources:
-            limits:
-              nvidia.com/gpu: "1"
-      volumes:
-        - name: kv-transfer-config
-          configMap:
-            name: my-cache-connection  # Must match your LMCacheEngine name + "-connection"
-```
+The operator creates a ConfigMap named `<engine-name>-connection` with the `kv-transfer-config` JSON vLLM needs. The vLLM pod mounts it and passes it to `--kv-transfer-config` — see the sample [`config/samples/vllm_deployment.yaml`](config/samples/vllm_deployment.yaml).
 
 Key points for vLLM pods:
 
@@ -151,187 +97,60 @@ kubectl describe lmc my-cache
 
 ## Examples
 
-### Target Only GPU Nodes
-
-Use `nodeSelector` to run LMCache only on GPU nodes. New GPU nodes automatically get an LMCache pod:
-
-```yaml
-apiVersion: lmcache.lmcache.ai/v1alpha1
-kind: LMCacheEngine
-metadata:
-  name: my-cache
-spec:
-  nodeSelector:
-    nvidia.com/gpu.present: "true"
-  l1:
-    sizeGB: 60
-```
-
-### AMD GPUs (ROCm)
-
-Set `spec.gpuVendor: amd` to run on AMD GPU nodes. The operator omits `runtimeClassName` from the pod spec and skips the NVIDIA env vars. AMD GPU nodes don't have a universal label equivalent to `nvidia.com/gpu.present`, so supply a `nodeSelector` that matches the label your platform exposes (e.g. `feature.node.kubernetes.io/amd-gpu: "true"` when using the [ROCm/gpu-operator](https://github.com/ROCm/gpu-operator)):
-
-```yaml
-apiVersion: lmcache.lmcache.ai/v1alpha1
-kind: LMCacheEngine
-metadata:
-  name: amd-cache
-spec:
-  gpuVendor: amd
-  nodeSelector:
-    feature.node.kubernetes.io/amd-gpu: "true"
-  l1:
-    sizeGB: 60
-```
-
-vLLM connects to LMCache via HIP IPC over `hostIPC` exactly the same way as CUDA IPC on NVIDIA — the `hostIPC: true` and `PYTHONHASHSEED=0` requirements above apply unchanged. Use a ROCm-built LMCache image for `spec.image`.
-
-### Custom Server Port
-
-If the default port (5555) conflicts with other services:
-
-```yaml
-apiVersion: lmcache.lmcache.ai/v1alpha1
-kind: LMCacheEngine
-metadata:
-  name: my-cache
-spec:
-  server:
-    port: 6555
-  l1:
-    sizeGB: 60
-```
+Every scenario has a ready-to-edit manifest under [`config/samples/`](config/samples/) (`kubectl apply -f config/samples/<file>`):
 
-The connection ConfigMap updates automatically — vLLM pods pick up the new port on restart.
-
-### Production with Prometheus Monitoring
-
-```yaml
-apiVersion: lmcache.lmcache.ai/v1alpha1
-kind: LMCacheEngine
-metadata:
-  name: production-cache
-  namespace: llm-serving
-spec:
-  nodeSelector:
-    nvidia.com/gpu.present: "true"
-  image:
-    repository: lmcache/standalone
-    tag: v0.1.0
-  server:
-    port: 6555
-    chunkSize: 256
-    maxWorkers: 4
-  l1:
-    sizeGB: 60
-  eviction:
-    triggerWatermark: 0.8
-    evictionRatio: 0.2
-  prometheus:
-    enabled: true
-    port: 9090
-    serviceMonitor:
-      enabled: true
-      labels:
-        release: kube-prometheus-stack
-  podAnnotations:
-    prometheus.io/scrape: "true"
-    prometheus.io/port: "9090"
-  priorityClassName: system-node-critical
-```
+| Scenario | Sample |
+|---|---|
+| Minimal + **full commented field reference** (GPU `nodeSelector`, custom `server.port`, L2 `raw`/`raw_block`, `resourceOverrides`, …) | [`lmcache_v1alpha1_lmcacheengine.yaml`](config/samples/lmcache_v1alpha1_lmcacheengine.yaml) |
+| Production: Prometheus `ServiceMonitor`, custom port, `priorityClassName` | [`lmcache_v1alpha1_lmcacheengine_production.yaml`](config/samples/lmcache_v1alpha1_lmcacheengine_production.yaml) |
+| L2 storage: Redis/Valkey (optional Secret auth) | [`lmcache_v1alpha1_lmcacheengine_l2_redis.yaml`](config/samples/lmcache_v1alpha1_lmcacheengine_l2_redis.yaml) |
+| AMD GPUs (ROCm) | [`lmcache_v1alpha1_lmcacheengine_amd.yaml`](config/samples/lmcache_v1alpha1_lmcacheengine_amd.yaml) |
+| vLLM Deployment wired to an LMCacheEngine | [`vllm_deployment.yaml`](config/samples/vllm_deployment.yaml) |
+| CacheBlend engine + opted-in vLLM (see [CacheBlend](#cacheblend)) | [`lmcache_v1alpha1_cacheblendengine.yaml`](config/samples/lmcache_v1alpha1_cacheblendengine.yaml), [`vllm_cacheblend_deployment.yaml`](config/samples/vllm_cacheblend_deployment.yaml) |
 
-### L2 Storage: Redis/Valkey
-
-Add a Redis L2 adapter for persistent KV cache storage beyond L1 memory:
-
-```yaml
-apiVersion: lmcache.lmcache.ai/v1alpha1
-kind: LMCacheEngine
-metadata:
-  name: cache-with-redis
-spec:
-  l1:
-    sizeGB: 60
-  l2Backend:
-    resp:
-      host: redis.default.svc.cluster.local
-      port: 6379
-      numWorkers: 8
-```
+Notes:
 
-For Redis authentication, create a Secret with `username` and `password` keys and reference it. Credentials are injected as environment variables and never appear in pod args or `kubectl describe` output. The Secret can live in a different namespace — the operator creates a managed copy automatically:
-
-```yaml
-# Create the secret (or reference an existing one in another namespace):
-# kubectl create secret generic redis-auth \
-#   --from-literal=username=myuser \
-#   --from-literal=password=mypassword
-spec:
-  l2Backend:
-    resp:
-      host: redis.default.svc.cluster.local
-      port: 6379
-      authSecretRef:
-        name: redis-auth
-        namespace: redis    # omit if the Secret is in the same namespace
-```
+- **GPU targeting** — `nodeSelector: {nvidia.com/gpu.present: "true"}` runs LMCache only on GPU nodes; new GPU nodes auto-get a pod.
+- **AMD (ROCm)** — `spec.gpuVendor: amd` omits `runtimeClassName` and the NVIDIA env vars; vLLM connects via HIP IPC over `hostIPC` the same way (`PYTHONHASHSEED=0` still required). Supply a `nodeSelector` matching your platform's AMD label and a ROCm-built `spec.image`.
+- **Custom port** — set `server.port`; the connection ConfigMap updates automatically and vLLM picks it up on restart.
+- **L2 adapters** — only one at a time today. Redis/Valkey is natively typed; cross-namespace auth Secrets are copied automatically and injected via env (never in args or `kubectl describe`). Other types (`nixl_store`, `fs`, `mock`, `raw_block`) use the `raw` escape hatch — see the commented blocks in the minimal sample. For `raw_block` with `use_odirect: true`, `--l1-align-bytes` must be ≥ `block_align`.
+- **Resources** auto-compute from `l1.sizeGB`; override with `resourceOverrides`.
 
-### L2 Storage: Other Adapters (Raw Escape Hatch)
-
-For adapter types not yet natively supported by the operator (e.g. `nixl_store`, `fs`, `mock`, `raw_block`), use the `raw` escape hatch. The JSON is passed through to `--l2-adapter` as-is:
-
-```yaml
-spec:
-  l2Backend:
-    raw:
-      type: nixl_store
-      config:
-        backend: "POSIX"
-        backend_params:
-          file_path: "/data/lmcache/l2"
-          use_direct_io: "false"
-        pool_size: 64
-```
+## CacheBlend
 
-Example `raw_block` configuration via the same escape hatch:
-
-```yaml
-spec:
-  l2Backend:
-    raw:
-      type: raw_block
-      config:
-        device_path: "/dev/nvme0n1"
-        slot_bytes: 1048576
-        block_align: 4096
-        header_bytes: 4096
-        meta_total_bytes: 268435456
-        use_odirect: true
-        num_store_workers: 2
-        num_lookup_workers: 1
-        num_load_workers: 4
-```
-
-Use an unmounted raw block device or a dedicated file path reserved for LMCache. With `use_odirect: true`, the LMCache server's `--l1-align-bytes` setting must be at least `block_align`.
+CacheBlend reuses cached KV at shifted positions. The operator manages it as a
+second CRD (`CacheBlendEngine`) plus a **mutating webhook** that injects the
+`lmcache-cacheblend` plugin into your vLLM pods — no vLLM image rebuild. See
+[DESIGN.md](DESIGN.md#cacheblend-cacheblendengine-crd--injection-webhook) for the
+architecture and the full field reference.
 
-> [!NOTE]
-> Currently only a single L2 adapter is supported at a time. While LMCache multiprocess mode is designed to support multiple L2 adapters in cascade, this functionality is not yet fully tested. Once the multi-adapter pipeline is validated and performance is confirmed, the operator will be updated to support multiple adapters.
+Quick start: deploy an engine, then opt a vLLM pod in with the label
+`lmcache.ai/cacheblend-inject: "true"` and the annotation
+`lmcache.ai/cacheblend-engine: "<engine>"` on its pod template (launch vLLM via the
+image ENTRYPOINT — a `sh -c` wrapper is skipped). Editable samples:
 
-### Override Auto-Computed Resources
+- [`config/samples/lmcache_v1alpha1_cacheblendengine.yaml`](config/samples/lmcache_v1alpha1_cacheblendengine.yaml) — the `CacheBlendEngine`
+- [`config/samples/vllm_cacheblend_deployment.yaml`](config/samples/vllm_cacheblend_deployment.yaml) — an opted-in vLLM Deployment
 
-By default, the operator derives memory requests/limits from `l1.sizeGB`. To override:
+> [!IMPORTANT]
+> CacheBlend needs the **webhook**, so deploy with `make deploy` (not `make run`,
+> which is controller-only) and install **cert-manager** first
+> (`kubectl apply -f https://github.com/cert-manager/cert-manager/releases/latest/download/cert-manager.yaml`).
+> If Pod Security Standards are enforced, label the engine's and the vLLM pod's
+> namespaces `pod-security.kubernetes.io/enforce=privileged` — the webhook injects
+> `hostIPC`/`privileged`, which `baseline`/`restricted` reject.
 
-```yaml
-spec:
-  l1:
-    sizeGB: 60
-  resourceOverrides:
-    requests:
-      memory: "70Gi"
-      cpu: "8"
-    limits:
-      memory: "100Gi"
-```
+> [!IMPORTANT]
+> CacheBlend is still in early stage development and under heavy testing. Its
+> docker image will not be publicly released until we are confident that it is
+> ready to be shipped for general use cases. If you would like to try it first,
+> please contact us in Slack Channel.
+
+The webhook mutates **Pods**, not the Deployment, so verify on a pod
+(`kubectl get pod -l app=vllm-cacheblend -o yaml | grep -E "cb-plugin|cacheblend-injected|skip-reason"`).
+If nothing was injected, check the pod's `lmcache.ai/cacheblend-skip-reason`
+annotation (`command-override`, `kv-transfer-config-present`, `engine-not-found`,
+`payload-image-unset`, or `target-container-not-found`).
 
 ## Development
 
diff --git a/operator/api/v1alpha1/cacheblendengine_defaults.go b/operator/api/v1alpha1/cacheblendengine_defaults.go
new file mode 100644
index 0000000000..eb823d6a07
--- /dev/null
+++ b/operator/api/v1alpha1/cacheblendengine_defaults.go
@@ -0,0 +1,85 @@
+/*
+Copyright 2026.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package v1alpha1
+
+// SetDefaults applies defaults that cannot be expressed purely via kubebuilder
+// markers. It mirrors LMCacheEngine.SetDefaults and additionally pins the blend
+// tunables, injection defaults, and the CacheBlend-required chunk size of 256.
+func (e *CacheBlendEngine) SetDefaults() {
+	spec := &e.Spec
+
+	// Default logLevel to INFO if unset (belt-and-suspenders with kubebuilder default).
+	if spec.LogLevel == nil {
+		info := defaultLogLevel
+		spec.LogLevel = &info
+	}
+
+	if spec.GPUVendor == nil {
+		v := GPUVendorNvidia
+		spec.GPUVendor = &v
+	}
+
+	if spec.NodeSelector == nil && *spec.GPUVendor == GPUVendorNvidia {
+		spec.NodeSelector = map[string]string{
+			"nvidia.com/gpu.present": labelValueTrue,
+		}
+	}
+
+	// CacheBlend requires chunk_size == 256 (block_size 64 * 4). Default the
+	// server block and pin chunkSize to 256 if unset.
+	if spec.Server == nil {
+		spec.Server = &ServerSpec{}
+	}
+	if spec.Server.ChunkSize == nil {
+		cs := CacheBlendChunkSize
+		spec.Server.ChunkSize = &cs
+	}
+
+	// Blend tunables.
+	if spec.Blend == nil {
+		spec.Blend = &BlendSpec{}
+	}
+	if spec.Blend.CheckLayer == nil {
+		cl := int32(1)
+		spec.Blend.CheckLayer = &cl
+	}
+	if spec.Blend.RecompRatio == nil {
+		rr := 0.15
+		spec.Blend.RecompRatio = &rr
+	}
+
+	// Injection defaults.
+	if spec.Injection == nil {
+		spec.Injection = &InjectionSpec{}
+	}
+	// payloadImage tag/pullPolicy default when the image is set; repository is
+	// required (no cluster-wide default for the private payload image).
+	if spec.Injection.PayloadImage != nil {
+		if spec.Injection.PayloadImage.Tag == nil {
+			t := "latest"
+			spec.Injection.PayloadImage.Tag = &t
+		}
+		if spec.Injection.PayloadImage.PullPolicy == nil {
+			pp := "IfNotPresent"
+			spec.Injection.PayloadImage.PullPolicy = &pp
+		}
+	}
+	if spec.Injection.Cudagraph == nil {
+		cg := CudagraphEager
+		spec.Injection.Cudagraph = &cg
+	}
+}
diff --git a/operator/api/v1alpha1/cacheblendengine_test.go b/operator/api/v1alpha1/cacheblendengine_test.go
new file mode 100644
index 0000000000..b679017384
--- /dev/null
+++ b/operator/api/v1alpha1/cacheblendengine_test.go
@@ -0,0 +1,559 @@
+/*
+Copyright 2026.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package v1alpha1
+
+import (
+	"testing"
+)
+
+// ptr is defined in lmcacheengine_test.go (same package); reuse it here.
+
+// --- SetDefaults tests ---
+
+func TestCBSetDefaults_LogLevelNil(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{L1: L1BackendSpec{SizeGB: 10}}}
+	e.SetDefaults()
+	if e.Spec.LogLevel == nil || *e.Spec.LogLevel != defaultLogLevel {
+		t.Fatalf("expected LogLevel=INFO, got %v", e.Spec.LogLevel)
+	}
+}
+
+func TestCBSetDefaults_LogLevelPreserved(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+		L1:       L1BackendSpec{SizeGB: 10},
+		LogLevel: ptr("DEBUG"),
+	}}
+	e.SetDefaults()
+	if *e.Spec.LogLevel != "DEBUG" {
+		t.Fatalf("expected LogLevel=DEBUG, got %s", *e.Spec.LogLevel)
+	}
+}
+
+func TestCBSetDefaults_NodeSelectorDefaultGPU(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{L1: L1BackendSpec{SizeGB: 10}}}
+	e.SetDefaults()
+	if e.Spec.NodeSelector == nil {
+		t.Fatal("expected default NodeSelector, got nil")
+	}
+	if e.Spec.NodeSelector["nvidia.com/gpu.present"] != labelValueTrue {
+		t.Fatalf("expected nvidia.com/gpu.present=true, got %v", e.Spec.NodeSelector)
+	}
+}
+
+func TestCBSetDefaults_NodeSelectorPreserved(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+		L1:           L1BackendSpec{SizeGB: 10},
+		NodeSelector: map[string]string{"custom": "label"},
+	}}
+	e.SetDefaults()
+	if e.Spec.NodeSelector["custom"] != "label" {
+		t.Fatal("expected custom node selector preserved")
+	}
+	if _, ok := e.Spec.NodeSelector["nvidia.com/gpu.present"]; ok {
+		t.Fatal("default should not override user-provided NodeSelector")
+	}
+}
+
+func TestCBSetDefaults_GPUVendorDefaultNvidia(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{L1: L1BackendSpec{SizeGB: 10}}}
+	e.SetDefaults()
+	if e.Spec.GPUVendor == nil || *e.Spec.GPUVendor != GPUVendorNvidia {
+		t.Fatalf("expected GPUVendor=nvidia, got %v", e.Spec.GPUVendor)
+	}
+}
+
+func TestCBSetDefaults_GPUVendorAMDSkipsNodeSelectorDefault(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+		L1:        L1BackendSpec{SizeGB: 10},
+		GPUVendor: ptr(GPUVendorAMD),
+	}}
+	e.SetDefaults()
+	if e.Spec.NodeSelector != nil {
+		t.Fatalf("expected nil NodeSelector for AMD vendor, got %v", e.Spec.NodeSelector)
+	}
+}
+
+func TestCBSetDefaults_ChunkSizeDefault256(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{L1: L1BackendSpec{SizeGB: 10}}}
+	e.SetDefaults()
+	if e.Spec.Server == nil || e.Spec.Server.ChunkSize == nil {
+		t.Fatal("expected Server.ChunkSize to be defaulted, got nil")
+	}
+	if *e.Spec.Server.ChunkSize != CacheBlendChunkSize {
+		t.Fatalf("expected ChunkSize=256, got %d", *e.Spec.Server.ChunkSize)
+	}
+}
+
+func TestCBSetDefaults_ChunkSizePreserved(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+		L1:     L1BackendSpec{SizeGB: 10},
+		Server: &ServerSpec{ChunkSize: ptr(int32(512))},
+	}}
+	e.SetDefaults()
+	if *e.Spec.Server.ChunkSize != 512 {
+		t.Fatalf("expected ChunkSize preserved at 512, got %d", *e.Spec.Server.ChunkSize)
+	}
+}
+
+func TestCBSetDefaults_BlendDefaults(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{L1: L1BackendSpec{SizeGB: 10}}}
+	e.SetDefaults()
+	if e.Spec.Blend == nil {
+		t.Fatal("expected Blend to be defaulted, got nil")
+	}
+	if e.Spec.Blend.CheckLayer == nil || *e.Spec.Blend.CheckLayer != 1 {
+		t.Fatalf("expected CheckLayer=1, got %v", e.Spec.Blend.CheckLayer)
+	}
+	if e.Spec.Blend.RecompRatio == nil || *e.Spec.Blend.RecompRatio != 0.15 {
+		t.Fatalf("expected RecompRatio=0.15, got %v", e.Spec.Blend.RecompRatio)
+	}
+}
+
+func TestCBSetDefaults_BlendPreserved(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+		L1:    L1BackendSpec{SizeGB: 10},
+		Blend: &BlendSpec{CheckLayer: ptr(int32(3)), RecompRatio: ptr(0.5)},
+	}}
+	e.SetDefaults()
+	if *e.Spec.Blend.CheckLayer != 3 {
+		t.Fatalf("expected CheckLayer preserved at 3, got %d", *e.Spec.Blend.CheckLayer)
+	}
+	if *e.Spec.Blend.RecompRatio != 0.5 {
+		t.Fatalf("expected RecompRatio preserved at 0.5, got %v", *e.Spec.Blend.RecompRatio)
+	}
+}
+
+func TestCBSetDefaults_InjectionDefaults(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{L1: L1BackendSpec{SizeGB: 10}}}
+	e.SetDefaults()
+	if e.Spec.Injection == nil {
+		t.Fatal("expected Injection to be defaulted, got nil")
+	}
+	if e.Spec.Injection.Cudagraph == nil || *e.Spec.Injection.Cudagraph != CudagraphEager {
+		t.Fatalf("expected Cudagraph=eager, got %v", e.Spec.Injection.Cudagraph)
+	}
+	// payloadImage was not set, so it stays nil (no cluster-wide default).
+	if e.Spec.Injection.PayloadImage != nil {
+		t.Fatalf("expected PayloadImage nil when unset, got %v", e.Spec.Injection.PayloadImage)
+	}
+
+	// When payloadImage is set, tag and pullPolicy default.
+	e2 := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+		L1:        L1BackendSpec{SizeGB: 10},
+		Injection: &InjectionSpec{PayloadImage: &ImageSpec{Repository: ptr("myreg/cacheblend-plugin")}},
+	}}
+	e2.SetDefaults()
+	pi := e2.Spec.Injection.PayloadImage
+	if pi.Tag == nil || *pi.Tag != "latest" {
+		t.Fatalf("expected payloadImage.Tag=latest, got %v", pi.Tag)
+	}
+	if pi.PullPolicy == nil || *pi.PullPolicy != "IfNotPresent" {
+		t.Fatalf("expected payloadImage.PullPolicy=IfNotPresent, got %v", pi.PullPolicy)
+	}
+}
+
+func TestCBSetDefaults_InjectionPreserved(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+		L1: L1BackendSpec{SizeGB: 10},
+		Injection: &InjectionSpec{
+			PayloadImage: &ImageSpec{Repository: ptr("myreg/cacheblend-plugin"), Tag: ptr("v1"), PullPolicy: ptr("Always")},
+			Cudagraph:    ptr(CudagraphFullDecodeOnly),
+		},
+	}}
+	e.SetDefaults()
+	if *e.Spec.Injection.PayloadImage.PullPolicy != "Always" {
+		t.Fatalf("expected payloadImage.PullPolicy preserved at Always, got %s", *e.Spec.Injection.PayloadImage.PullPolicy)
+	}
+	if *e.Spec.Injection.PayloadImage.Tag != "v1" {
+		t.Fatalf("expected payloadImage.Tag preserved at v1, got %s", *e.Spec.Injection.PayloadImage.Tag)
+	}
+	if *e.Spec.Injection.Cudagraph != CudagraphFullDecodeOnly {
+		t.Fatalf("expected Cudagraph preserved at full_decode_only, got %s", *e.Spec.Injection.Cudagraph)
+	}
+}
+
+// --- ValidateSpec tests ---
+
+// validCBInjection returns the minimal injection block ValidateSpec requires
+// (injection.payloadImage.repository). Tests that exercise other fields include
+// it so the injection requirement does not perturb their expected error counts.
+func validCBInjection() *InjectionSpec {
+	return &InjectionSpec{PayloadImage: &ImageSpec{Repository: ptr("myreg/cacheblend-plugin")}}
+}
+
+func TestCBValidateSpec_ValidMinimal(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+		L1:        L1BackendSpec{SizeGB: 10},
+		Injection: validCBInjection(),
+	}}
+	errs := e.ValidateSpec()
+	if len(errs) != 0 {
+		t.Fatalf("expected no errors, got %v", errs)
+	}
+}
+
+func TestCBValidateSpec_ValidDefaulted(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+		L1:        L1BackendSpec{SizeGB: 10},
+		Injection: validCBInjection(),
+	}}
+	e.SetDefaults()
+	errs := e.ValidateSpec()
+	if len(errs) != 0 {
+		t.Fatalf("expected no errors after SetDefaults, got %v", errs)
+	}
+}
+
+func TestCBValidateSpec_InjectionRequired(t *testing.T) {
+	// A spec with no injection block is invalid: the webhook has no payload
+	// image to inject.
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{L1: L1BackendSpec{SizeGB: 10}}}
+	errs := e.ValidateSpec()
+	if len(errs) != 1 {
+		t.Fatalf("expected 1 error, got %d: %v", len(errs), errs)
+	}
+	if errs[0].Field != "spec.injection" {
+		t.Fatalf("expected field spec.injection, got %s", errs[0].Field)
+	}
+}
+
+func TestCBValidateSpec_InjectionPayloadRepositoryRequired(t *testing.T) {
+	// injection present but payloadImage.repository empty is invalid.
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+		L1:        L1BackendSpec{SizeGB: 10},
+		Injection: &InjectionSpec{PayloadImage: &ImageSpec{Repository: ptr("")}},
+	}}
+	errs := e.ValidateSpec()
+	if len(errs) != 1 {
+		t.Fatalf("expected 1 error, got %d: %v", len(errs), errs)
+	}
+	if errs[0].Field != "spec.injection.payloadImage.repository" {
+		t.Fatalf("expected field spec.injection.payloadImage.repository, got %s", errs[0].Field)
+	}
+}
+
+func TestCBValidateSpec_SizeGBZero(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+		L1:        L1BackendSpec{SizeGB: 0},
+		Injection: validCBInjection(),
+	}}
+	errs := e.ValidateSpec()
+	if len(errs) != 1 {
+		t.Fatalf("expected 1 error, got %d: %v", len(errs), errs)
+	}
+	if errs[0].Field != "spec.l1.sizeGB" {
+		t.Fatalf("expected field spec.l1.sizeGB, got %s", errs[0].Field)
+	}
+}
+
+func TestCBValidateSpec_SizeGBNegative(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+		L1:        L1BackendSpec{SizeGB: -1},
+		Injection: validCBInjection(),
+	}}
+	errs := e.ValidateSpec()
+	if len(errs) != 1 {
+		t.Fatalf("expected 1 error, got %d", len(errs))
+	}
+}
+
+func TestCBValidateSpec_EvictionPolicyInvalid(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+		L1:        L1BackendSpec{SizeGB: 10},
+		Injection: validCBInjection(),
+		Eviction:  &EvictionSpec{Policy: ptr("FIFO")},
+	}}
+	errs := e.ValidateSpec()
+	if len(errs) != 1 {
+		t.Fatalf("expected 1 error, got %d: %v", len(errs), errs)
+	}
+}
+
+func TestCBValidateSpec_EvictionPolicyLRU(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+		L1:        L1BackendSpec{SizeGB: 10},
+		Injection: validCBInjection(),
+		Eviction:  &EvictionSpec{Policy: ptr("LRU")},
+	}}
+	errs := e.ValidateSpec()
+	if len(errs) != 0 {
+		t.Fatalf("expected no errors, got %v", errs)
+	}
+}
+
+func TestCBValidateSpec_TriggerWatermarkBounds(t *testing.T) {
+	tests := []struct {
+		name    string
+		value   float64
+		wantErr bool
+	}{
+		{"zero", 0.0, true},
+		{"negative", -0.1, true},
+		{"valid_low", 0.01, false},
+		{"valid_mid", 0.5, false},
+		{"valid_one", 1.0, false},
+		{"above_one", 1.1, true},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+				L1:        L1BackendSpec{SizeGB: 10},
+				Injection: validCBInjection(),
+				Eviction:  &EvictionSpec{TriggerWatermark: ptr(tt.value)},
+			}}
+			errs := e.ValidateSpec()
+			if tt.wantErr && len(errs) == 0 {
+				t.Fatal("expected error, got none")
+			}
+			if !tt.wantErr && len(errs) != 0 {
+				t.Fatalf("expected no error, got %v", errs)
+			}
+		})
+	}
+}
+
+func TestCBValidateSpec_EvictionRatioBounds(t *testing.T) {
+	tests := []struct {
+		name    string
+		value   float64
+		wantErr bool
+	}{
+		{"zero", 0.0, true},
+		{"negative", -0.5, true},
+		{"valid_low", 0.1, false},
+		{"valid_one", 1.0, false},
+		{"above_one", 1.5, true},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+				L1:        L1BackendSpec{SizeGB: 10},
+				Injection: validCBInjection(),
+				Eviction:  &EvictionSpec{EvictionRatio: ptr(tt.value)},
+			}}
+			errs := e.ValidateSpec()
+			if tt.wantErr && len(errs) == 0 {
+				t.Fatal("expected error, got none")
+			}
+			if !tt.wantErr && len(errs) != 0 {
+				t.Fatalf("expected no error, got %v", errs)
+			}
+		})
+	}
+}
+
+func TestCBValidateSpec_ServerPort(t *testing.T) {
+	tests := []struct {
+		name    string
+		port    int32
+		wantErr bool
+	}{
+		{"below_min", 80, true},
+		{"min_valid", 1024, false},
+		{"max_valid", 65535, false},
+		{"above_max", 65536, true},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+				L1:        L1BackendSpec{SizeGB: 10},
+				Injection: validCBInjection(),
+				Server:    &ServerSpec{Port: ptr(tt.port), ChunkSize: ptr(CacheBlendChunkSize)},
+			}}
+			errs := e.ValidateSpec()
+			if tt.wantErr && len(errs) == 0 {
+				t.Fatal("expected error, got none")
+			}
+			if !tt.wantErr && len(errs) != 0 {
+				t.Fatalf("expected no error, got %v", errs)
+			}
+		})
+	}
+}
+
+func TestCBValidateSpec_ChunkSize(t *testing.T) {
+	tests := []struct {
+		name    string
+		value   int32
+		wantErr bool
+	}{
+		{"valid_256", 256, false},
+		{"too_small", 128, true},
+		{"too_large", 512, true},
+		{"zero", 0, true},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+				L1:        L1BackendSpec{SizeGB: 10},
+				Injection: validCBInjection(),
+				Server:    &ServerSpec{ChunkSize: ptr(tt.value)},
+			}}
+			errs := e.ValidateSpec()
+			if tt.wantErr && len(errs) == 0 {
+				t.Fatal("expected error, got none")
+			}
+			if !tt.wantErr && len(errs) != 0 {
+				t.Fatalf("expected no error, got %v", errs)
+			}
+		})
+	}
+}
+
+func TestCBValidateSpec_CheckLayerBounds(t *testing.T) {
+	tests := []struct {
+		name    string
+		value   int32
+		wantErr bool
+	}{
+		{"zero", 0, false},
+		{"positive", 5, false},
+		{"negative", -1, true},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+				L1:        L1BackendSpec{SizeGB: 10},
+				Injection: validCBInjection(),
+				Blend:     &BlendSpec{CheckLayer: ptr(tt.value)},
+			}}
+			errs := e.ValidateSpec()
+			if tt.wantErr && len(errs) == 0 {
+				t.Fatal("expected error, got none")
+			}
+			if !tt.wantErr && len(errs) != 0 {
+				t.Fatalf("expected no error, got %v", errs)
+			}
+		})
+	}
+}
+
+func TestCBValidateSpec_RecompRatioBounds(t *testing.T) {
+	tests := []struct {
+		name    string
+		value   float64
+		wantErr bool
+	}{
+		{"zero", 0.0, true},
+		{"negative", -0.1, true},
+		{"valid_low", 0.01, false},
+		{"valid_default", 0.15, false},
+		{"valid_one", 1.0, false},
+		{"above_one", 1.1, true},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+				L1:        L1BackendSpec{SizeGB: 10},
+				Injection: validCBInjection(),
+				Blend:     &BlendSpec{RecompRatio: ptr(tt.value)},
+			}}
+			errs := e.ValidateSpec()
+			if tt.wantErr && len(errs) == 0 {
+				t.Fatal("expected error, got none")
+			}
+			if !tt.wantErr && len(errs) != 0 {
+				t.Fatalf("expected no error, got %v", errs)
+			}
+		})
+	}
+}
+
+func TestCBValidateSpec_MultipleErrors(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+		L1:        L1BackendSpec{SizeGB: 0},
+		Injection: validCBInjection(),
+		Server:    &ServerSpec{Port: ptr(int32(80)), ChunkSize: ptr(int32(128))},
+		Eviction: &EvictionSpec{
+			Policy:           ptr("FIFO"),
+			TriggerWatermark: ptr(0.0),
+			EvictionRatio:    ptr(0.0),
+		},
+		Blend: &BlendSpec{CheckLayer: ptr(int32(-1)), RecompRatio: ptr(2.0)},
+	}}
+	errs := e.ValidateSpec()
+	// sizeGB, port, chunkSize, policy, watermark, ratio, checkLayer, recompRatio = 8 errors
+	if len(errs) != 8 {
+		t.Fatalf("expected 8 errors, got %d: %v", len(errs), errs)
+	}
+}
+
+// --- L2 Backend validation tests ---
+
+func TestCBValidateSpec_L2RESPValid(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+		L1:        L1BackendSpec{SizeGB: 10},
+		Injection: validCBInjection(),
+		L2Backend: &L2BackendSpec{
+			RESP: &RESPL2AdapterSpec{
+				Host: "redis.default.svc",
+				Port: 6379,
+			},
+		},
+	}}
+	errs := e.ValidateSpec()
+	if len(errs) != 0 {
+		t.Fatalf("expected no errors, got %v", errs)
+	}
+}
+
+func TestCBValidateSpec_L2RESPEmptyHost(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+		L1:        L1BackendSpec{SizeGB: 10},
+		Injection: validCBInjection(),
+		L2Backend: &L2BackendSpec{
+			RESP: &RESPL2AdapterSpec{
+				Host: "",
+				Port: 6379,
+			},
+		},
+	}}
+	errs := e.ValidateSpec()
+	if len(errs) != 1 {
+		t.Fatalf("expected 1 error, got %d: %v", len(errs), errs)
+	}
+	if errs[0].Field != "spec.l2Backend.resp.host" {
+		t.Fatalf("expected field spec.l2Backend.resp.host, got %s", errs[0].Field)
+	}
+}
+
+func TestCBValidateSpec_L2NoneSet(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+		L1:        L1BackendSpec{SizeGB: 10},
+		Injection: validCBInjection(),
+		L2Backend: &L2BackendSpec{},
+	}}
+	errs := e.ValidateSpec()
+	if len(errs) != 1 {
+		t.Fatalf("expected 1 error, got %d: %v", len(errs), errs)
+	}
+}
+
+func TestCBValidateSpec_L2BothSet(t *testing.T) {
+	e := &CacheBlendEngine{Spec: CacheBlendEngineSpec{
+		L1:        L1BackendSpec{SizeGB: 10},
+		Injection: validCBInjection(),
+		L2Backend: &L2BackendSpec{
+			RESP: &RESPL2AdapterSpec{Host: "redis", Port: 6379},
+			Raw:  &RawL2AdapterSpec{Type: "mock"},
+		},
+	}}
+	errs := e.ValidateSpec()
+	if len(errs) != 1 {
+		t.Fatalf("expected 1 error, got %d: %v", len(errs), errs)
+	}
+}
diff --git a/operator/api/v1alpha1/cacheblendengine_types.go b/operator/api/v1alpha1/cacheblendengine_types.go
new file mode 100644
index 0000000000..36c35ef1f5
--- /dev/null
+++ b/operator/api/v1alpha1/cacheblendengine_types.go
@@ -0,0 +1,265 @@
+/*
+Copyright 2026.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package v1alpha1
+
+import (
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// CacheBlendChunkSize is the only chunk size CacheBlend supports. The blend
+// matcher requires chunk_size == vLLM --block-size (64) * 4 == 256, so the
+// CacheBlendEngine server is locked to this value (see design §4).
+const CacheBlendChunkSize int32 = 256
+
+// Cudagraph mode constants for CacheBlendEngine injection.
+const (
+	// CudagraphEager forces eager execution (--enforce-eager). Default.
+	CudagraphEager = "eager"
+	// CudagraphPiecewise enables piecewise CUDA graph capture.
+	CudagraphPiecewise = "piecewise"
+	// CudagraphFullDecodeOnly enables full CUDA graphs for decode only.
+	CudagraphFullDecodeOnly = "full_decode_only"
+)
+
+// BlendSpec defines the CacheBlend tunables injected into the vLLM connect-config.
+type BlendSpec struct {
+	// checkLayer is the layer index used by CacheBlend to decide which tokens
+	// to recompute. It is surfaced to the connector as
+	// kv_connector_extra_config["cb.check_layer"].
+	// +optional
+	// +kubebuilder:default=1
+	// +kubebuilder:validation:Minimum=0
+	CheckLayer *int32 `json:"checkLayer,omitempty"`
+
+	// recompRatio is the fraction of tokens CacheBlend recomputes. It is
+	// surfaced to the connector as kv_connector_extra_config["cb.recomp_ratio"]
+	// and must be in (0, 1].
+	// +optional
+	// +kubebuilder:default=0.15
+	RecompRatio *float64 `json:"recompRatio,omitempty"`
+}
+
+// InjectionSpec defines the defaults the mutating webhook reads when injecting
+// the CacheBlend payload into vLLM pods bound to this engine (see design §7, §8).
+type InjectionSpec struct {
+	// payloadImage is the init-container image (repository/tag/pullPolicy, like
+	// spec.image) that stages the lmcache-cacheblend vLLM plugin into a shared
+	// emptyDir. It is a SEPARATE, usually PRIVATE image: set
+	// payloadImage.repository to your cacheblend-plugin image — the repository
+	// default inherited from ImageSpec is the engine image and is NOT a valid
+	// payload. For private registries, imagePullSecrets must reference Secret(s)
+	// that exist in the vLLM pod's namespace.
+	// +optional
+	PayloadImage *ImageSpec `json:"payloadImage,omitempty"`
+
+	// imagePullSecrets are appended to the vLLM pod's spec.imagePullSecrets so
+	// the PRIVATE payload init-container image can pull. The referenced
+	// Secret(s) must already exist in the vLLM pod's namespace; the operator
+	// does not copy them cross-namespace.
+	// +optional
+	ImagePullSecrets []corev1.LocalObjectReference `json:"imagePullSecrets,omitempty"`
+
+	// targetContainer is the name of the vLLM container to inject into. Empty
+	// (the default) selects the first container; a per-pod annotation may
+	// override it.
+	// +optional
+	TargetContainer *string `json:"targetContainer,omitempty"`
+
+	// cudagraph selects the CUDA graph mode injected into the vLLM args. "eager"
+	// (default) maps to --enforce-eager; "full_decode_only" enables decode-only
+	// graphs. Full graphs are never used.
+	// +optional
+	// +kubebuilder:default="eager"
+	// +kubebuilder:validation:Enum=eager;piecewise;full_decode_only
+	Cudagraph *string `json:"cudagraph,omitempty"`
+}
+
+// CacheBlendEngineSpec defines the desired state of CacheBlendEngine. It mirrors
+// LMCacheEngineSpec (reusing its shared sub-structs) and adds the blend tunables
+// and injection defaults specific to CacheBlend.
+type CacheBlendEngineSpec struct {
+	// gpuVendor selects the GPU vendor. "nvidia" (default) requires the NVIDIA
+	// GPU Operator's "nvidia" RuntimeClass; "amd" runs on the default container
+	// runtime with privileged: true.
+	// +optional
+	// +kubebuilder:default="nvidia"
+	// +kubebuilder:validation:Enum=nvidia;amd
+	GPUVendor *string `json:"gpuVendor,omitempty"`
+
+	// image defines the container image to use for the blend_v3 engine. This
+	// may be a PRIVATE image; use imagePullSecrets to pull it.
+	// +optional
+	Image *ImageSpec `json:"image,omitempty"`
+
+	// imagePullSecrets is a list of references to secrets for pulling the
+	// engine image.
+	// +optional
+	ImagePullSecrets []corev1.LocalObjectReference `json:"imagePullSecrets,omitempty"`
+
+	// server defines server configuration. chunkSize defaults to 256 because
+	// CacheBlend requires chunk_size == 256.
+	// +optional
+	Server *ServerSpec `json:"server,omitempty"`
+
+	// l1 defines the L1 memory cache configuration.
+	L1 L1BackendSpec `json:"l1"`
+
+	// eviction defines the cache eviction configuration.
+	// +optional
+	Eviction *EvictionSpec `json:"eviction,omitempty"`
+
+	// prometheus defines Prometheus monitoring configuration.
+	// +optional
+	Prometheus *PrometheusSpec `json:"prometheus,omitempty"`
+
+	// l2Backend defines the L2 storage backend.
+	// Currently only a single adapter is supported.
+	// +optional
+	L2Backend *L2BackendSpec `json:"l2Backend,omitempty"`
+
+	// blend defines the CacheBlend tunables injected into the vLLM connect-config.
+	// +optional
+	Blend *BlendSpec `json:"blend,omitempty"`
+
+	// injection defines the defaults the mutating webhook reads for pods bound
+	// to this engine.
+	// +optional
+	Injection *InjectionSpec `json:"injection,omitempty"`
+
+	// resourceOverrides allows overriding auto-computed resource requirements.
+	// +optional
+	ResourceOverrides *corev1.ResourceRequirements `json:"resourceOverrides,omitempty"`
+
+	// logLevel is the log level for the LMCache server.
+	// +optional
+	// +kubebuilder:default="INFO"
+	// +kubebuilder:validation:Enum=DEBUG;INFO;WARNING;ERROR
+	LogLevel *string `json:"logLevel,omitempty"`
+
+	// nodeSelector determines which nodes get a CacheBlend engine instance.
+	// +optional
+	NodeSelector map[string]string `json:"nodeSelector,omitempty"`
+
+	// affinity defines pod scheduling affinity rules.
+	// +optional
+	Affinity *corev1.Affinity `json:"affinity,omitempty"`
+
+	// tolerations defines pod tolerations.
+	// +optional
+	Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
+
+	// env defines additional environment variables.
+	// +optional
+	Env []corev1.EnvVar `json:"env,omitempty"`
+
+	// volumes defines additional volumes.
+	// +optional
+	Volumes []corev1.Volume `json:"volumes,omitempty"`
+
+	// volumeMounts defines additional volume mounts.
+	// +optional
+	VolumeMounts []corev1.VolumeMount `json:"volumeMounts,omitempty"`
+
+	// podAnnotations are additional annotations added to pods.
+	// +optional
+	PodAnnotations map[string]string `json:"podAnnotations,omitempty"`
+
+	// podLabels are additional labels added to pods.
+	// +optional
+	PodLabels map[string]string `json:"podLabels,omitempty"`
+
+	// serviceAccountName is the name of the ServiceAccount to use.
+	// +optional
+	ServiceAccountName string `json:"serviceAccountName,omitempty"`
+
+	// priorityClassName is the priority class for the pods.
+	// +optional
+	PriorityClassName string `json:"priorityClassName,omitempty"`
+
+	// extraArgs are additional CLI flags appended to the server command.
+	// They are appended last and can override any auto-generated flag.
+	// +optional
+	ExtraArgs []string `json:"extraArgs,omitempty"`
+}
+
+// CacheBlendEngineStatus defines the observed state of CacheBlendEngine.
+type CacheBlendEngineStatus struct {
+	// phase is the overall phase of the CacheBlendEngine.
+	// +optional
+	Phase string `json:"phase,omitempty"`
+
+	// observedGeneration is the most recent generation observed.
+	// +optional
+	ObservedGeneration int64 `json:"observedGeneration,omitempty"`
+
+	// desiredInstances is the number of desired instances.
+	// +optional
+	DesiredInstances int32 `json:"desiredInstances,omitempty"`
+
+	// readyInstances is the number of ready instances.
+	// +optional
+	ReadyInstances int32 `json:"readyInstances,omitempty"`
+
+	// endpoints lists per-node connection info.
+	// +optional
+	Endpoints []EndpointStatus `json:"endpoints,omitempty"`
+
+	// conditions represent the current state of the CacheBlendEngine resource.
+	// +listType=map
+	// +listMapKey=type
+	// +optional
+	Conditions []metav1.Condition `json:"conditions,omitempty"`
+}
+
+// +kubebuilder:object:root=true
+// +kubebuilder:subresource:status
+// +kubebuilder:resource:shortName=cbe
+// +kubebuilder:printcolumn:name="Phase",type=string,JSONPath=`.status.phase`
+// +kubebuilder:printcolumn:name="Ready",type=integer,JSONPath=`.status.readyInstances`
+// +kubebuilder:printcolumn:name="Desired",type=integer,JSONPath=`.status.desiredInstances`
+// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp`
+
+// CacheBlendEngine is the Schema for the cacheblendengines API.
+type CacheBlendEngine struct {
+	metav1.TypeMeta `json:",inline"`
+
+	// metadata is a standard object metadata.
+	// +optional
+	metav1.ObjectMeta `json:"metadata,omitzero"`
+
+	// spec defines the desired state of CacheBlendEngine.
+	// +required
+	Spec CacheBlendEngineSpec `json:"spec"`
+
+	// status defines the observed state of CacheBlendEngine.
+	// +optional
+	Status CacheBlendEngineStatus `json:"status,omitzero"`
+}
+
+// +kubebuilder:object:root=true
+
+// CacheBlendEngineList contains a list of CacheBlendEngine.
+type CacheBlendEngineList struct {
+	metav1.TypeMeta `json:",inline"`
+	metav1.ListMeta `json:"metadata,omitzero"`
+	Items           []CacheBlendEngine `json:"items"`
+}
+
+func init() {
+	SchemeBuilder.Register(&CacheBlendEngine{}, &CacheBlendEngineList{})
+}
diff --git a/operator/api/v1alpha1/cacheblendengine_validation.go b/operator/api/v1alpha1/cacheblendengine_validation.go
new file mode 100644
index 0000000000..3070d49152
--- /dev/null
+++ b/operator/api/v1alpha1/cacheblendengine_validation.go
@@ -0,0 +1,91 @@
+/*
+Copyright 2026.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package v1alpha1
+
+import (
+	"k8s.io/apimachinery/pkg/util/validation/field"
+)
+
+// ValidateSpec validates the CacheBlendEngineSpec and returns any validation
+// errors. It mirrors LMCacheEngine.ValidateSpec and additionally enforces the
+// CacheBlend invariants: chunkSize == 256, recompRatio in (0, 1], and
+// checkLayer >= 0.
+func (e *CacheBlendEngine) ValidateSpec() field.ErrorList {
+	var errs field.ErrorList
+	spec := &e.Spec
+
+	// l1.sizeGB must be > 0
+	if spec.L1.SizeGB <= 0 {
+		errs = append(errs, field.Invalid(field.NewPath("spec", "l1", "sizeGB"), spec.L1.SizeGB, "must be greater than 0"))
+	}
+
+	errs = append(errs, validateEvictionSpec(spec.Eviction)...)
+
+	// Server validation
+	if spec.Server != nil {
+		serverPath := field.NewPath("spec", "server")
+
+		if spec.Server.Port != nil {
+			port := *spec.Server.Port
+			if port < 1024 || port > 65535 {
+				errs = append(errs, field.Invalid(serverPath.Child("port"), port, "must be in [1024, 65535]"))
+			}
+		}
+
+		// CacheBlend requires chunk_size == 256 (block_size 64 * 4).
+		if spec.Server.ChunkSize != nil && *spec.Server.ChunkSize != CacheBlendChunkSize {
+			errs = append(errs, field.Invalid(serverPath.Child("chunkSize"), *spec.Server.ChunkSize,
+				"must be 256 for CacheBlend (chunk_size == block_size 64 * 4)"))
+		}
+	}
+
+	// Blend validation
+	if spec.Blend != nil {
+		blendPath := field.NewPath("spec", "blend")
+
+		if spec.Blend.CheckLayer != nil && *spec.Blend.CheckLayer < 0 {
+			errs = append(errs, field.Invalid(blendPath.Child("checkLayer"), *spec.Blend.CheckLayer, "must be >= 0"))
+		}
+
+		if spec.Blend.RecompRatio != nil {
+			rr := *spec.Blend.RecompRatio
+			if rr <= 0.0 || rr > 1.0 {
+				errs = append(errs, field.Invalid(blendPath.Child("recompRatio"), rr, "must be in (0.0, 1.0]"))
+			}
+		}
+	}
+
+	// Injection validation. injection.payloadImage.repository is functionally
+	// required: the mutating webhook needs it to inject a valid init container.
+	// Without it the webhook would produce a Pod with an empty init-container
+	// image, which the API server rejects at Pod creation. Enforce it here so
+	// the misconfiguration is caught at `kubectl apply` time instead.
+	injPath := field.NewPath("spec", "injection")
+	if spec.Injection == nil {
+		errs = append(errs, field.Required(injPath, "must be specified for CacheBlend injection"))
+	} else if spec.Injection.PayloadImage == nil {
+		errs = append(errs, field.Required(injPath.Child("payloadImage"),
+			"must be specified for CacheBlend injection"))
+	} else if spec.Injection.PayloadImage.Repository == nil || *spec.Injection.PayloadImage.Repository == "" {
+		errs = append(errs, field.Required(injPath.Child("payloadImage", "repository"),
+			"must be a non-empty string"))
+	}
+
+	errs = append(errs, validateL2BackendSpec(spec.L2Backend)...)
+
+	return errs
+}
diff --git a/operator/api/v1alpha1/lmcacheengine_defaults.go b/operator/api/v1alpha1/lmcacheengine_defaults.go
index a3c81bb896..0c910e4cab 100644
--- a/operator/api/v1alpha1/lmcacheengine_defaults.go
+++ b/operator/api/v1alpha1/lmcacheengine_defaults.go
@@ -16,13 +16,21 @@ limitations under the License.
 
 package v1alpha1
 
+// defaultLogLevel is the log level applied when spec.logLevel is unset (it
+// mirrors the kubebuilder default on both engine kinds).
+const defaultLogLevel = "INFO"
+
+// labelValueTrue is the string value of boolean-style node-selector labels
+// (e.g. nvidia.com/gpu.present: "true").
+const labelValueTrue = "true"
+
 // SetDefaults applies defaults that cannot be expressed purely via kubebuilder markers.
 func (e *LMCacheEngine) SetDefaults() {
 	spec := &e.Spec
 
 	// Default logLevel to INFO if unset (belt-and-suspenders with kubebuilder default).
 	if spec.LogLevel == nil {
-		info := "INFO"
+		info := defaultLogLevel
 		spec.LogLevel = &info
 	}
 
@@ -33,7 +41,7 @@ func (e *LMCacheEngine) SetDefaults() {
 
 	if spec.NodeSelector == nil && *spec.GPUVendor == GPUVendorNvidia {
 		spec.NodeSelector = map[string]string{
-			"nvidia.com/gpu.present": "true",
+			"nvidia.com/gpu.present": labelValueTrue,
 		}
 	}
 }
diff --git a/operator/api/v1alpha1/lmcacheengine_test.go b/operator/api/v1alpha1/lmcacheengine_test.go
index 8755e98a22..2efcf9c6eb 100644
--- a/operator/api/v1alpha1/lmcacheengine_test.go
+++ b/operator/api/v1alpha1/lmcacheengine_test.go
@@ -29,7 +29,7 @@ func ptr[T any](v T) *T { return &v }
 func TestSetDefaults_LogLevelNil(t *testing.T) {
 	e := &LMCacheEngine{Spec: LMCacheEngineSpec{L1: L1BackendSpec{SizeGB: 10}}}
 	e.SetDefaults()
-	if e.Spec.LogLevel == nil || *e.Spec.LogLevel != "INFO" {
+	if e.Spec.LogLevel == nil || *e.Spec.LogLevel != defaultLogLevel {
 		t.Fatalf("expected LogLevel=INFO, got %v", e.Spec.LogLevel)
 	}
 }
@@ -51,7 +51,7 @@ func TestSetDefaults_NodeSelectorDefaultGPU(t *testing.T) {
 	if e.Spec.NodeSelector == nil {
 		t.Fatal("expected default NodeSelector, got nil")
 	}
-	if e.Spec.NodeSelector["nvidia.com/gpu.present"] != "true" {
+	if e.Spec.NodeSelector["nvidia.com/gpu.present"] != labelValueTrue {
 		t.Fatalf("expected nvidia.com/gpu.present=true, got %v", e.Spec.NodeSelector)
 	}
 }
@@ -93,10 +93,10 @@ func TestSetDefaults_GPUVendorAMDPreservesUserNodeSelector(t *testing.T) {
 	e := &LMCacheEngine{Spec: LMCacheEngineSpec{
 		L1:           L1BackendSpec{SizeGB: 10},
 		GPUVendor:    ptr(GPUVendorAMD),
-		NodeSelector: map[string]string{"feature.node.kubernetes.io/amd-gpu": "true"},
+		NodeSelector: map[string]string{"feature.node.kubernetes.io/amd-gpu": labelValueTrue},
 	}}
 	e.SetDefaults()
-	if e.Spec.NodeSelector["feature.node.kubernetes.io/amd-gpu"] != "true" {
+	if e.Spec.NodeSelector["feature.node.kubernetes.io/amd-gpu"] != labelValueTrue {
 		t.Fatalf("expected user-supplied AMD NodeSelector preserved, got %v", e.Spec.NodeSelector)
 	}
 }
diff --git a/operator/api/v1alpha1/lmcacheengine_validation.go b/operator/api/v1alpha1/lmcacheengine_validation.go
index a484bb3d4c..2fbdd077db 100644
--- a/operator/api/v1alpha1/lmcacheengine_validation.go
+++ b/operator/api/v1alpha1/lmcacheengine_validation.go
@@ -24,35 +24,13 @@ import (
 func (e *LMCacheEngine) ValidateSpec() field.ErrorList {
 	var errs field.ErrorList
 	spec := &e.Spec
-	l1Path := field.NewPath("spec", "l1")
 
 	// l1.sizeGB must be > 0
 	if spec.L1.SizeGB <= 0 {
-		errs = append(errs, field.Invalid(l1Path.Child("sizeGB"), spec.L1.SizeGB, "must be greater than 0"))
+		errs = append(errs, field.Invalid(field.NewPath("spec", "l1", "sizeGB"), spec.L1.SizeGB, "must be greater than 0"))
 	}
 
-	// Eviction validation
-	if spec.Eviction != nil {
-		evPath := field.NewPath("spec", "eviction")
-
-		if spec.Eviction.Policy != nil && *spec.Eviction.Policy != "LRU" {
-			errs = append(errs, field.NotSupported(evPath.Child("policy"), *spec.Eviction.Policy, []string{"LRU"}))
-		}
-
-		if spec.Eviction.TriggerWatermark != nil {
-			tw := *spec.Eviction.TriggerWatermark
-			if tw <= 0.0 || tw > 1.0 {
-				errs = append(errs, field.Invalid(evPath.Child("triggerWatermark"), tw, "must be in (0.0, 1.0]"))
-			}
-		}
-
-		if spec.Eviction.EvictionRatio != nil {
-			er := *spec.Eviction.EvictionRatio
-			if er <= 0.0 || er > 1.0 {
-				errs = append(errs, field.Invalid(evPath.Child("evictionRatio"), er, "must be in (0.0, 1.0]"))
-			}
-		}
-	}
+	errs = append(errs, validateEvictionSpec(spec.Eviction)...)
 
 	// Server port validation
 	if spec.Server != nil && spec.Server.Port != nil {
@@ -62,46 +40,86 @@ func (e *LMCacheEngine) ValidateSpec() field.ErrorList {
 		}
 	}
 
-	// L2 backend validation
-	if spec.L2Backend != nil {
-		l2Path := field.NewPath("spec", "l2Backend")
-		b := spec.L2Backend
+	errs = append(errs, validateL2BackendSpec(spec.L2Backend)...)
 
-		setCount := 0
-		if b.RESP != nil {
-			setCount++
-		}
-		if b.Raw != nil {
-			setCount++
+	return errs
+}
+
+// validateEvictionSpec validates the shared eviction configuration used by both
+// LMCacheEngine and CacheBlendEngine. A nil eviction is valid (defaults apply).
+// Returned errors are rooted at spec.eviction.
+func validateEvictionSpec(eviction *EvictionSpec) field.ErrorList {
+	var errs field.ErrorList
+	if eviction == nil {
+		return errs
+	}
+	evPath := field.NewPath("spec", "eviction")
+
+	if eviction.Policy != nil && *eviction.Policy != "LRU" {
+		errs = append(errs, field.NotSupported(evPath.Child("policy"), *eviction.Policy, []string{"LRU"}))
+	}
+
+	if eviction.TriggerWatermark != nil {
+		tw := *eviction.TriggerWatermark
+		if tw <= 0.0 || tw > 1.0 {
+			errs = append(errs, field.Invalid(evPath.Child("triggerWatermark"), tw, "must be in (0.0, 1.0]"))
 		}
-		// For now we only support one kind at each LMCache server. LMCache
-		// MP mode is designed to support multiple ones at the same time
-		// but tests and performance validation is needed before we ship
-		// it into operator.
-		if setCount == 0 {
-			errs = append(errs, field.Required(l2Path, "exactly one of resp or raw must be set"))
-		} else if setCount > 1 {
-			errs = append(errs, field.Invalid(l2Path, "", "exactly one of resp or raw must be set, got multiple"))
+	}
+
+	if eviction.EvictionRatio != nil {
+		er := *eviction.EvictionRatio
+		if er <= 0.0 || er > 1.0 {
+			errs = append(errs, field.Invalid(evPath.Child("evictionRatio"), er, "must be in (0.0, 1.0]"))
 		}
+	}
+
+	return errs
+}
+
+// validateL2BackendSpec validates the shared L2 backend configuration used by
+// both LMCacheEngine and CacheBlendEngine. A nil backend is valid (L2 is
+// optional). Returned errors are rooted at spec.l2Backend.
+func validateL2BackendSpec(b *L2BackendSpec) field.ErrorList {
+	var errs field.ErrorList
+	if b == nil {
+		return errs
+	}
+	l2Path := field.NewPath("spec", "l2Backend")
+
+	setCount := 0
+	if b.RESP != nil {
+		setCount++
+	}
+	if b.Raw != nil {
+		setCount++
+	}
+	// For now we only support one kind at each LMCache server. LMCache
+	// MP mode is designed to support multiple ones at the same time
+	// but tests and performance validation is needed before we ship
+	// it into operator.
+	if setCount == 0 {
+		errs = append(errs, field.Required(l2Path, "exactly one of resp or raw must be set"))
+	} else if setCount > 1 {
+		errs = append(errs, field.Invalid(l2Path, "", "exactly one of resp or raw must be set, got multiple"))
+	}
 
-		if b.RESP != nil {
-			respPath := l2Path.Child("resp")
-			if b.RESP.Host == "" {
-				errs = append(errs, field.Required(respPath.Child("host"), "must be a non-empty string"))
-			}
-			if b.RESP.Port < 1 || b.RESP.Port > 65535 {
-				errs = append(errs, field.Invalid(respPath.Child("port"), b.RESP.Port, "must be in [1, 65535]"))
-			}
-			if b.RESP.AuthSecretRef != nil && b.RESP.AuthSecretRef.Name == "" {
-				errs = append(errs, field.Required(respPath.Child("authSecretRef", "name"), "must be non-empty"))
-			}
+	if b.RESP != nil {
+		respPath := l2Path.Child("resp")
+		if b.RESP.Host == "" {
+			errs = append(errs, field.Required(respPath.Child("host"), "must be a non-empty string"))
+		}
+		if b.RESP.Port < 1 || b.RESP.Port > 65535 {
+			errs = append(errs, field.Invalid(respPath.Child("port"), b.RESP.Port, "must be in [1, 65535]"))
 		}
+		if b.RESP.AuthSecretRef != nil && b.RESP.AuthSecretRef.Name == "" {
+			errs = append(errs, field.Required(respPath.Child("authSecretRef", "name"), "must be non-empty"))
+		}
+	}
 
-		if b.Raw != nil {
-			rawPath := l2Path.Child("raw")
-			if b.Raw.Type == "" {
-				errs = append(errs, field.Required(rawPath.Child("type"), "must be a non-empty string"))
-			}
+	if b.Raw != nil {
+		rawPath := l2Path.Child("raw")
+		if b.Raw.Type == "" {
+			errs = append(errs, field.Required(rawPath.Child("type"), "must be a non-empty string"))
 		}
 	}
 
diff --git a/operator/api/v1alpha1/zz_generated.deepcopy.go b/operator/api/v1alpha1/zz_generated.deepcopy.go
index 2009859225..3bdad7657b 100644
--- a/operator/api/v1alpha1/zz_generated.deepcopy.go
+++ b/operator/api/v1alpha1/zz_generated.deepcopy.go
@@ -21,12 +21,253 @@ limitations under the License.
 package v1alpha1
 
 import (
-	corev1 "k8s.io/api/core/v1"
-	"k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
+	"k8s.io/api/core/v1"
+	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	runtime "k8s.io/apimachinery/pkg/runtime"
 )
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *BlendSpec) DeepCopyInto(out *BlendSpec) {
+	*out = *in
+	if in.CheckLayer != nil {
+		in, out := &in.CheckLayer, &out.CheckLayer
+		*out = new(int32)
+		**out = **in
+	}
+	if in.RecompRatio != nil {
+		in, out := &in.RecompRatio, &out.RecompRatio
+		*out = new(float64)
+		**out = **in
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new BlendSpec.
+func (in *BlendSpec) DeepCopy() *BlendSpec {
+	if in == nil {
+		return nil
+	}
+	out := new(BlendSpec)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *CacheBlendEngine) DeepCopyInto(out *CacheBlendEngine) {
+	*out = *in
+	out.TypeMeta = in.TypeMeta
+	in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
+	in.Spec.DeepCopyInto(&out.Spec)
+	in.Status.DeepCopyInto(&out.Status)
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CacheBlendEngine.
+func (in *CacheBlendEngine) DeepCopy() *CacheBlendEngine {
+	if in == nil {
+		return nil
+	}
+	out := new(CacheBlendEngine)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
+func (in *CacheBlendEngine) DeepCopyObject() runtime.Object {
+	if c := in.DeepCopy(); c != nil {
+		return c
+	}
+	return nil
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *CacheBlendEngineList) DeepCopyInto(out *CacheBlendEngineList) {
+	*out = *in
+	out.TypeMeta = in.TypeMeta
+	in.ListMeta.DeepCopyInto(&out.ListMeta)
+	if in.Items != nil {
+		in, out := &in.Items, &out.Items
+		*out = make([]CacheBlendEngine, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CacheBlendEngineList.
+func (in *CacheBlendEngineList) DeepCopy() *CacheBlendEngineList {
+	if in == nil {
+		return nil
+	}
+	out := new(CacheBlendEngineList)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
+func (in *CacheBlendEngineList) DeepCopyObject() runtime.Object {
+	if c := in.DeepCopy(); c != nil {
+		return c
+	}
+	return nil
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *CacheBlendEngineSpec) DeepCopyInto(out *CacheBlendEngineSpec) {
+	*out = *in
+	if in.GPUVendor != nil {
+		in, out := &in.GPUVendor, &out.GPUVendor
+		*out = new(string)
+		**out = **in
+	}
+	if in.Image != nil {
+		in, out := &in.Image, &out.Image
+		*out = new(ImageSpec)
+		(*in).DeepCopyInto(*out)
+	}
+	if in.ImagePullSecrets != nil {
+		in, out := &in.ImagePullSecrets, &out.ImagePullSecrets
+		*out = make([]v1.LocalObjectReference, len(*in))
+		copy(*out, *in)
+	}
+	if in.Server != nil {
+		in, out := &in.Server, &out.Server
+		*out = new(ServerSpec)
+		(*in).DeepCopyInto(*out)
+	}
+	out.L1 = in.L1
+	if in.Eviction != nil {
+		in, out := &in.Eviction, &out.Eviction
+		*out = new(EvictionSpec)
+		(*in).DeepCopyInto(*out)
+	}
+	if in.Prometheus != nil {
+		in, out := &in.Prometheus, &out.Prometheus
+		*out = new(PrometheusSpec)
+		(*in).DeepCopyInto(*out)
+	}
+	if in.L2Backend != nil {
+		in, out := &in.L2Backend, &out.L2Backend
+		*out = new(L2BackendSpec)
+		(*in).DeepCopyInto(*out)
+	}
+	if in.Blend != nil {
+		in, out := &in.Blend, &out.Blend
+		*out = new(BlendSpec)
+		(*in).DeepCopyInto(*out)
+	}
+	if in.Injection != nil {
+		in, out := &in.Injection, &out.Injection
+		*out = new(InjectionSpec)
+		(*in).DeepCopyInto(*out)
+	}
+	if in.ResourceOverrides != nil {
+		in, out := &in.ResourceOverrides, &out.ResourceOverrides
+		*out = new(v1.ResourceRequirements)
+		(*in).DeepCopyInto(*out)
+	}
+	if in.LogLevel != nil {
+		in, out := &in.LogLevel, &out.LogLevel
+		*out = new(string)
+		**out = **in
+	}
+	if in.NodeSelector != nil {
+		in, out := &in.NodeSelector, &out.NodeSelector
+		*out = make(map[string]string, len(*in))
+		for key, val := range *in {
+			(*out)[key] = val
+		}
+	}
+	if in.Affinity != nil {
+		in, out := &in.Affinity, &out.Affinity
+		*out = new(v1.Affinity)
+		(*in).DeepCopyInto(*out)
+	}
+	if in.Tolerations != nil {
+		in, out := &in.Tolerations, &out.Tolerations
+		*out = make([]v1.Toleration, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+	if in.Env != nil {
+		in, out := &in.Env, &out.Env
+		*out = make([]v1.EnvVar, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+	if in.Volumes != nil {
+		in, out := &in.Volumes, &out.Volumes
+		*out = make([]v1.Volume, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+	if in.VolumeMounts != nil {
+		in, out := &in.VolumeMounts, &out.VolumeMounts
+		*out = make([]v1.VolumeMount, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+	if in.PodAnnotations != nil {
+		in, out := &in.PodAnnotations, &out.PodAnnotations
+		*out = make(map[string]string, len(*in))
+		for key, val := range *in {
+			(*out)[key] = val
+		}
+	}
+	if in.PodLabels != nil {
+		in, out := &in.PodLabels, &out.PodLabels
+		*out = make(map[string]string, len(*in))
+		for key, val := range *in {
+			(*out)[key] = val
+		}
+	}
+	if in.ExtraArgs != nil {
+		in, out := &in.ExtraArgs, &out.ExtraArgs
+		*out = make([]string, len(*in))
+		copy(*out, *in)
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CacheBlendEngineSpec.
+func (in *CacheBlendEngineSpec) DeepCopy() *CacheBlendEngineSpec {
+	if in == nil {
+		return nil
+	}
+	out := new(CacheBlendEngineSpec)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *CacheBlendEngineStatus) DeepCopyInto(out *CacheBlendEngineStatus) {
+	*out = *in
+	if in.Endpoints != nil {
+		in, out := &in.Endpoints, &out.Endpoints
+		*out = make([]EndpointStatus, len(*in))
+		copy(*out, *in)
+	}
+	if in.Conditions != nil {
+		in, out := &in.Conditions, &out.Conditions
+		*out = make([]metav1.Condition, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CacheBlendEngineStatus.
+func (in *CacheBlendEngineStatus) DeepCopy() *CacheBlendEngineStatus {
+	if in == nil {
+		return nil
+	}
+	out := new(CacheBlendEngineStatus)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *EndpointStatus) DeepCopyInto(out *EndpointStatus) {
 	*out = *in
@@ -102,6 +343,41 @@ func (in *ImageSpec) DeepCopy() *ImageSpec {
 	return out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *InjectionSpec) DeepCopyInto(out *InjectionSpec) {
+	*out = *in
+	if in.PayloadImage != nil {
+		in, out := &in.PayloadImage, &out.PayloadImage
+		*out = new(ImageSpec)
+		(*in).DeepCopyInto(*out)
+	}
+	if in.ImagePullSecrets != nil {
+		in, out := &in.ImagePullSecrets, &out.ImagePullSecrets
+		*out = make([]v1.LocalObjectReference, len(*in))
+		copy(*out, *in)
+	}
+	if in.TargetContainer != nil {
+		in, out := &in.TargetContainer, &out.TargetContainer
+		*out = new(string)
+		**out = **in
+	}
+	if in.Cudagraph != nil {
+		in, out := &in.Cudagraph, &out.Cudagraph
+		*out = new(string)
+		**out = **in
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InjectionSpec.
+func (in *InjectionSpec) DeepCopy() *InjectionSpec {
+	if in == nil {
+		return nil
+	}
+	out := new(InjectionSpec)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *L1BackendSpec) DeepCopyInto(out *L1BackendSpec) {
 	*out = *in
@@ -231,7 +507,7 @@ func (in *LMCacheEngineSpec) DeepCopyInto(out *LMCacheEngineSpec) {
 	}
 	if in.ImagePullSecrets != nil {
 		in, out := &in.ImagePullSecrets, &out.ImagePullSecrets
-		*out = make([]corev1.LocalObjectReference, len(*in))
+		*out = make([]v1.LocalObjectReference, len(*in))
 		copy(*out, *in)
 	}
 	if in.Server != nil {
@@ -257,7 +533,7 @@ func (in *LMCacheEngineSpec) DeepCopyInto(out *LMCacheEngineSpec) {
 	}
 	if in.ResourceOverrides != nil {
 		in, out := &in.ResourceOverrides, &out.ResourceOverrides
-		*out = new(corev1.ResourceRequirements)
+		*out = new(v1.ResourceRequirements)
 		(*in).DeepCopyInto(*out)
 	}
 	if in.LogLevel != nil {
@@ -274,33 +550,33 @@ func (in *LMCacheEngineSpec) DeepCopyInto(out *LMCacheEngineSpec) {
 	}
 	if in.Affinity != nil {
 		in, out := &in.Affinity, &out.Affinity
-		*out = new(corev1.Affinity)
+		*out = new(v1.Affinity)
 		(*in).DeepCopyInto(*out)
 	}
 	if in.Tolerations != nil {
 		in, out := &in.Tolerations, &out.Tolerations
-		*out = make([]corev1.Toleration, len(*in))
+		*out = make([]v1.Toleration, len(*in))
 		for i := range *in {
 			(*in)[i].DeepCopyInto(&(*out)[i])
 		}
 	}
 	if in.Env != nil {
 		in, out := &in.Env, &out.Env
-		*out = make([]corev1.EnvVar, len(*in))
+		*out = make([]v1.EnvVar, len(*in))
 		for i := range *in {
 			(*in)[i].DeepCopyInto(&(*out)[i])
 		}
 	}
 	if in.Volumes != nil {
 		in, out := &in.Volumes, &out.Volumes
-		*out = make([]corev1.Volume, len(*in))
+		*out = make([]v1.Volume, len(*in))
 		for i := range *in {
 			(*in)[i].DeepCopyInto(&(*out)[i])
 		}
 	}
 	if in.VolumeMounts != nil {
 		in, out := &in.VolumeMounts, &out.VolumeMounts
-		*out = make([]corev1.VolumeMount, len(*in))
+		*out = make([]v1.VolumeMount, len(*in))
 		for i := range *in {
 			(*in)[i].DeepCopyInto(&(*out)[i])
 		}
@@ -428,7 +704,7 @@ func (in *RawL2AdapterSpec) DeepCopyInto(out *RawL2AdapterSpec) {
 	*out = *in
 	if in.Config != nil {
 		in, out := &in.Config, &out.Config
-		*out = make(map[string]v1.JSON, len(*in))
+		*out = make(map[string]apiextensionsv1.JSON, len(*in))
 		for key, val := range *in {
 			(*out)[key] = *val.DeepCopy()
 		}
diff --git a/operator/cmd/main.go b/operator/cmd/main.go
index 69cda26a68..93a45ff764 100644
--- a/operator/cmd/main.go
+++ b/operator/cmd/main.go
@@ -34,11 +34,13 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
 	metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
 	"sigs.k8s.io/controller-runtime/pkg/webhook"
+	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
 
 	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
 
 	lmcachev1alpha1 "github.com/LMCache/LMCache/api/v1alpha1"
 	"github.com/LMCache/LMCache/internal/controller"
+	cbwebhook "github.com/LMCache/LMCache/internal/webhook"
 	// +kubebuilder:scaffold:imports
 )
 
@@ -188,6 +190,23 @@ func main() {
 		setupLog.Error(err, "Failed to create controller", "controller", "LMCacheEngine")
 		os.Exit(1)
 	}
+	if err := (&controller.CacheBlendEngineReconciler{
+		Client: mgr.GetClient(),
+		Scheme: mgr.GetScheme(),
+	}).SetupWithManager(mgr); err != nil {
+		setupLog.Error(err, "Failed to create controller", "controller", "CacheBlendEngine")
+		os.Exit(1)
+	}
+	// ENABLE_WEBHOOKS=false skips registering the pod-mutating webhook, so a
+	// local `make run` (which has no serving certs on the host) can start the
+	// controller without the webhook server failing on a missing tls.crt. The
+	// deployed manager leaves it unset, so the webhook is on by default.
+	if os.Getenv("ENABLE_WEBHOOKS") != "false" {
+		mgr.GetWebhookServer().Register("/mutate--v1-pod", &webhook.Admission{Handler: &cbwebhook.PodInjector{
+			Client:  mgr.GetClient(),
+			Decoder: admission.NewDecoder(mgr.GetScheme()),
+		}})
+	}
 	// +kubebuilder:scaffold:builder
 
 	if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
diff --git a/operator/config/certmanager/certificate.yaml b/operator/config/certmanager/certificate.yaml
new file mode 100644
index 0000000000..41c2259946
--- /dev/null
+++ b/operator/config/certmanager/certificate.yaml
@@ -0,0 +1,26 @@
+# The serving certificate for the mutating admission webhook. cert-manager writes the
+# resulting key pair into the secret named by secretName, which the controller-manager
+# Deployment mounts at /tmp/k8s-webhook-server/serving-certs (see manager_webhook_patch.yaml).
+# The MutatingWebhookConfiguration's cert-manager.io/inject-ca-from annotation points at
+# this Certificate so the caBundle is injected and rotated automatically (design §10).
+#
+# The dnsNames carry a placeholder Service name/namespace; the config/default overlay
+# rewrites the SERVICE_NAME / SERVICE_NAMESPACE segments via kustomize replacements so the
+# rendered SAN matches the actual (namePrefix-ed, namespaced) Service.
+apiVersion: cert-manager.io/v1
+kind: Certificate
+metadata:
+  name: serving-cert
+  namespace: system
+  labels:
+    app.kubernetes.io/name: operator
+    app.kubernetes.io/managed-by: kustomize
+spec:
+  dnsNames:
+  # SERVICE_NAME and SERVICE_NAMESPACE will be substituted by kustomize replacements.
+  - SERVICE_NAME.SERVICE_NAMESPACE.svc
+  - SERVICE_NAME.SERVICE_NAMESPACE.svc.cluster.local
+  issuerRef:
+    kind: Issuer
+    name: selfsigned-issuer
+  secretName: webhook-server-cert
diff --git a/operator/config/certmanager/issuer.yaml b/operator/config/certmanager/issuer.yaml
new file mode 100644
index 0000000000..e1fd502a04
--- /dev/null
+++ b/operator/config/certmanager/issuer.yaml
@@ -0,0 +1,12 @@
+# A self-signed Issuer used to mint the webhook serving certificate. cert-manager must
+# be installed in the cluster (it is a hard prerequisite for the webhook, per design §10).
+apiVersion: cert-manager.io/v1
+kind: Issuer
+metadata:
+  name: selfsigned-issuer
+  namespace: system
+  labels:
+    app.kubernetes.io/name: operator
+    app.kubernetes.io/managed-by: kustomize
+spec:
+  selfSigned: {}
diff --git a/operator/config/certmanager/kustomization.yaml b/operator/config/certmanager/kustomization.yaml
new file mode 100644
index 0000000000..3d0c656edd
--- /dev/null
+++ b/operator/config/certmanager/kustomization.yaml
@@ -0,0 +1,6 @@
+resources:
+- issuer.yaml
+- certificate.yaml
+
+configurations:
+- kustomizeconfig.yaml
diff --git a/operator/config/certmanager/kustomizeconfig.yaml b/operator/config/certmanager/kustomizeconfig.yaml
new file mode 100644
index 0000000000..ddc1f90698
--- /dev/null
+++ b/operator/config/certmanager/kustomizeconfig.yaml
@@ -0,0 +1,8 @@
+# This configuration is for teaching kustomize how to update name ref substitution.
+nameReference:
+- kind: Issuer
+  group: cert-manager.io
+  fieldSpecs:
+  - kind: Certificate
+    group: cert-manager.io
+    path: spec/issuerRef/name
diff --git a/operator/config/crd/bases/lmcache.lmcache.ai_cacheblendengines.yaml b/operator/config/crd/bases/lmcache.lmcache.ai_cacheblendengines.yaml
new file mode 100644
index 0000000000..8f6fe2a543
--- /dev/null
+++ b/operator/config/crd/bases/lmcache.lmcache.ai_cacheblendengines.yaml
@@ -0,0 +1,3712 @@
+---
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  annotations:
+    controller-gen.kubebuilder.io/version: v0.20.1
+  name: cacheblendengines.lmcache.lmcache.ai
+spec:
+  group: lmcache.lmcache.ai
+  names:
+    kind: CacheBlendEngine
+    listKind: CacheBlendEngineList
+    plural: cacheblendengines
+    shortNames:
+    - cbe
+    singular: cacheblendengine
+  scope: Namespaced
+  versions:
+  - additionalPrinterColumns:
+    - jsonPath: .status.phase
+      name: Phase
+      type: string
+    - jsonPath: .status.readyInstances
+      name: Ready
+      type: integer
+    - jsonPath: .status.desiredInstances
+      name: Desired
+      type: integer
+    - jsonPath: .metadata.creationTimestamp
+      name: Age
+      type: date
+    name: v1alpha1
+    schema:
+      openAPIV3Schema:
+        description: CacheBlendEngine is the Schema for the cacheblendengines API.
+        properties:
+          apiVersion:
+            description: |-
+              APIVersion defines the versioned schema of this representation of an object.
+              Servers should convert recognized schemas to the latest internal value, and
+              may reject unrecognized values.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
+            type: string
+          kind:
+            description: |-
+              Kind is a string value representing the REST resource this object represents.
+              Servers may infer this from the endpoint the client submits requests to.
+              Cannot be updated.
+              In CamelCase.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+            type: string
+          metadata:
+            type: object
+          spec:
+            description: spec defines the desired state of CacheBlendEngine.
+            properties:
+              affinity:
+                description: affinity defines pod scheduling affinity rules.
+                properties:
+                  nodeAffinity:
+                    description: Describes node affinity scheduling rules for the
+                      pod.
+                    properties:
+                      preferredDuringSchedulingIgnoredDuringExecution:
+                        description: |-
+                          The scheduler will prefer to schedule pods to nodes that satisfy
+                          the affinity expressions specified by this field, but it may choose
+                          a node that violates one or more of the expressions. The node that is
+                          most preferred is the one with the greatest sum of weights, i.e.
+                          for each node that meets all of the scheduling requirements (resource
+                          request, requiredDuringScheduling affinity expressions, etc.),
+                          compute a sum by iterating through the elements of this field and adding
+                          "weight" to the sum if the node matches the corresponding matchExpressions; the
+                          node(s) with the highest sum are the most preferred.
+                        items:
+                          description: |-
+                            An empty preferred scheduling term matches all objects with implicit weight 0
+                            (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op).
+                          properties:
+                            preference:
+                              description: A node selector term, associated with the
+                                corresponding weight.
+                              properties:
+                                matchExpressions:
+                                  description: A list of node selector requirements
+                                    by node's labels.
+                                  items:
+                                    description: |-
+                                      A node selector requirement is a selector that contains values, a key, and an operator
+                                      that relates the key and values.
+                                    properties:
+                                      key:
+                                        description: The label key that the selector
+                                          applies to.
+                                        type: string
+                                      operator:
+                                        description: |-
+                                          Represents a key's relationship to a set of values.
+                                          Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.
+                                        type: string
+                                      values:
+                                        description: |-
+                                          An array of string values. If the operator is In or NotIn,
+                                          the values array must be non-empty. If the operator is Exists or DoesNotExist,
+                                          the values array must be empty. If the operator is Gt or Lt, the values
+                                          array must have a single element, which will be interpreted as an integer.
+                                          This array is replaced during a strategic merge patch.
+                                        items:
+                                          type: string
+                                        type: array
+                                        x-kubernetes-list-type: atomic
+                                    required:
+                                    - key
+                                    - operator
+                                    type: object
+                                  type: array
+                                  x-kubernetes-list-type: atomic
+                                matchFields:
+                                  description: A list of node selector requirements
+                                    by node's fields.
+                                  items:
+                                    description: |-
+                                      A node selector requirement is a selector that contains values, a key, and an operator
+                                      that relates the key and values.
+                                    properties:
+                                      key:
+                                        description: The label key that the selector
+                                          applies to.
+                                        type: string
+                                      operator:
+                                        description: |-
+                                          Represents a key's relationship to a set of values.
+                                          Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.
+                                        type: string
+                                      values:
+                                        description: |-
+                                          An array of string values. If the operator is In or NotIn,
+                                          the values array must be non-empty. If the operator is Exists or DoesNotExist,
+                                          the values array must be empty. If the operator is Gt or Lt, the values
+                                          array must have a single element, which will be interpreted as an integer.
+                                          This array is replaced during a strategic merge patch.
+                                        items:
+                                          type: string
+                                        type: array
+                                        x-kubernetes-list-type: atomic
+                                    required:
+                                    - key
+                                    - operator
+                                    type: object
+                                  type: array
+                                  x-kubernetes-list-type: atomic
+                              type: object
+                              x-kubernetes-map-type: atomic
+                            weight:
+                              description: Weight associated with matching the corresponding
+                                nodeSelectorTerm, in the range 1-100.
+                              format: int32
+                              type: integer
+                          required:
+                          - preference
+                          - weight
+                          type: object
+                        type: array
+                        x-kubernetes-list-type: atomic
+                      requiredDuringSchedulingIgnoredDuringExecution:
+                        description: |-
+                          If the affinity requirements specified by this field are not met at
+                          scheduling time, the pod will not be scheduled onto the node.
+                          If the affinity requirements specified by this field cease to be met
+                          at some point during pod execution (e.g. due to an update), the system
+                          may or may not try to eventually evict the pod from its node.
+                        properties:
+                          nodeSelectorTerms:
+                            description: Required. A list of node selector terms.
+                              The terms are ORed.
+                            items:
+                              description: |-
+                                A null or empty node selector term matches no objects. The requirements of
+                                them are ANDed.
+                                The TopologySelectorTerm type implements a subset of the NodeSelectorTerm.
+                              properties:
+                                matchExpressions:
+                                  description: A list of node selector requirements
+                                    by node's labels.
+                                  items:
+                                    description: |-
+                                      A node selector requirement is a selector that contains values, a key, and an operator
+                                      that relates the key and values.
+                                    properties:
+                                      key:
+                                        description: The label key that the selector
+                                          applies to.
+                                        type: string
+                                      operator:
+                                        description: |-
+                                          Represents a key's relationship to a set of values.
+                                          Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.
+                                        type: string
+                                      values:
+                                        description: |-
+                                          An array of string values. If the operator is In or NotIn,
+                                          the values array must be non-empty. If the operator is Exists or DoesNotExist,
+                                          the values array must be empty. If the operator is Gt or Lt, the values
+                                          array must have a single element, which will be interpreted as an integer.
+                                          This array is replaced during a strategic merge patch.
+                                        items:
+                                          type: string
+                                        type: array
+                                        x-kubernetes-list-type: atomic
+                                    required:
+                                    - key
+                                    - operator
+                                    type: object
+                                  type: array
+                                  x-kubernetes-list-type: atomic
+                                matchFields:
+                                  description: A list of node selector requirements
+                                    by node's fields.
+                                  items:
+                                    description: |-
+                                      A node selector requirement is a selector that contains values, a key, and an operator
+                                      that relates the key and values.
+                                    properties:
+                                      key:
+                                        description: The label key that the selector
+                                          applies to.
+                                        type: string
+                                      operator:
+                                        description: |-
+                                          Represents a key's relationship to a set of values.
+                                          Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt.
+                                        type: string
+                                      values:
+                                        description: |-
+                                          An array of string values. If the operator is In or NotIn,
+                                          the values array must be non-empty. If the operator is Exists or DoesNotExist,
+                                          the values array must be empty. If the operator is Gt or Lt, the values
+                                          array must have a single element, which will be interpreted as an integer.
+                                          This array is replaced during a strategic merge patch.
+                                        items:
+                                          type: string
+                                        type: array
+                                        x-kubernetes-list-type: atomic
+                                    required:
+                                    - key
+                                    - operator
+                                    type: object
+                                  type: array
+                                  x-kubernetes-list-type: atomic
+                              type: object
+                              x-kubernetes-map-type: atomic
+                            type: array
+                            x-kubernetes-list-type: atomic
+                        required:
+                        - nodeSelectorTerms
+                        type: object
+                        x-kubernetes-map-type: atomic
+                    type: object
+                  podAffinity:
+                    description: Describes pod affinity scheduling rules (e.g. co-locate
+                      this pod in the same node, zone, etc. as some other pod(s)).
+                    properties:
+                      preferredDuringSchedulingIgnoredDuringExecution:
+                        description: |-
+                          The scheduler will prefer to schedule pods to nodes that satisfy
+                          the affinity expressions specified by this field, but it may choose
+                          a node that violates one or more of the expressions. The node that is
+                          most preferred is the one with the greatest sum of weights, i.e.
+                          for each node that meets all of the scheduling requirements (resource
+                          request, requiredDuringScheduling affinity expressions, etc.),
+                          compute a sum by iterating through the elements of this field and adding
+                          "weight" to the sum if the node has pods which matches the corresponding podAffinityTerm; the
+                          node(s) with the highest sum are the most preferred.
+                        items:
+                          description: The weights of all of the matched WeightedPodAffinityTerm
+                            fields are added per-node to find the most preferred node(s)
+                          properties:
+                            podAffinityTerm:
+                              description: Required. A pod affinity term, associated
+                                with the corresponding weight.
+                              properties:
+                                labelSelector:
+                                  description: |-
+                                    A label query over a set of resources, in this case pods.
+                                    If it's null, this PodAffinityTerm matches with no Pods.
+                                  properties:
+                                    matchExpressions:
+                                      description: matchExpressions is a list of label
+                                        selector requirements. The requirements are
+                                        ANDed.
+                                      items:
+                                        description: |-
+                                          A label selector requirement is a selector that contains values, a key, and an operator that
+                                          relates the key and values.
+                                        properties:
+                                          key:
+                                            description: key is the label key that
+                                              the selector applies to.
+                                            type: string
+                                          operator:
+                                            description: |-
+                                              operator represents a key's relationship to a set of values.
+                                              Valid operators are In, NotIn, Exists and DoesNotExist.
+                                            type: string
+                                          values:
+                                            description: |-
+                                              values is an array of string values. If the operator is In or NotIn,
+                                              the values array must be non-empty. If the operator is Exists or DoesNotExist,
+                                              the values array must be empty. This array is replaced during a strategic
+                                              merge patch.
+                                            items:
+                                              type: string
+                                            type: array
+                                            x-kubernetes-list-type: atomic
+                                        required:
+                                        - key
+                                        - operator
+                                        type: object
+                                      type: array
+                                      x-kubernetes-list-type: atomic
+                                    matchLabels:
+                                      additionalProperties:
+                                        type: string
+                                      description: |-
+                                        matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels
+                                        map is equivalent to an element of matchExpressions, whose key field is "key", the
+                                        operator is "In", and the values array contains only "value". The requirements are ANDed.
+                                      type: object
+                                  type: object
+                                  x-kubernetes-map-type: atomic
+                                matchLabelKeys:
+                                  description: |-
+                                    MatchLabelKeys is a set of pod label keys to select which pods will
+                                    be taken into consideration. The keys are used to lookup values from the
+                                    incoming pod labels, those key-value labels are merged with `labelSelector` as `key in (value)`
+                                    to select the group of existing pods which pods will be taken into consideration
+                                    for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming
+                                    pod labels will be ignored. The default value is empty.
+                                    The same key is forbidden to exist in both matchLabelKeys and labelSelector.
+                                    Also, matchLabelKeys cannot be set when labelSelector isn't set.
+                                  items:
+                                    type: string
+                                  type: array
+                                  x-kubernetes-list-type: atomic
+                                mismatchLabelKeys:
+                                  description: |-
+                                    MismatchLabelKeys is a set of pod label keys to select which pods will
+                                    be taken into consideration. The keys are used to lookup values from the
+                                    incoming pod labels, those key-value labels are merged with `labelSelector` as `key notin (value)`
+                                    to select the group of existing pods which pods will be taken into consideration
+                                    for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming
+                                    pod labels will be ignored. The default value is empty.
+                                    The same key is forbidden to exist in both mismatchLabelKeys and labelSelector.
+                                    Also, mismatchLabelKeys cannot be set when labelSelector isn't set.
+                                  items:
+                                    type: string
+                                  type: array
+                                  x-kubernetes-list-type: atomic
+                                namespaceSelector:
+                                  description: |-
+                                    A label query over the set of namespaces that the term applies to.
+                                    The term is applied to the union of the namespaces selected by this field
+                                    and the ones listed in the namespaces field.
+                                    null selector and null or empty namespaces list means "this pod's namespace".
+                                    An empty selector ({}) matches all namespaces.
+                                  properties:
+                                    matchExpressions:
+                                      description: matchExpressions is a list of label
+                                        selector requirements. The requirements are
+                                        ANDed.
+                                      items:
+                                        description: |-
+                                          A label selector requirement is a selector that contains values, a key, and an operator that
+                                          relates the key and values.
+                                        properties:
+                                          key:
+                                            description: key is the label key that
+                                              the selector applies to.
+                                            type: string
+                                          operator:
+                                            description: |-
+                                              operator represents a key's relationship to a set of values.
+                                              Valid operators are In, NotIn, Exists and DoesNotExist.
+                                            type: string
+                                          values:
+                                            description: |-
+                                              values is an array of string values. If the operator is In or NotIn,
+                                              the values array must be non-empty. If the operator is Exists or DoesNotExist,
+                                              the values array must be empty. This array is replaced during a strategic
+                                              merge patch.
+                                            items:
+                                              type: string
+                                            type: array
+                                            x-kubernetes-list-type: atomic
+                                        required:
+                                        - key
+                                        - operator
+                                        type: object
+                                      type: array
+                                      x-kubernetes-list-type: atomic
+                                    matchLabels:
+                                      additionalProperties:
+                                        type: string
+                                      description: |-
+                                        matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels
+                                        map is equivalent to an element of matchExpressions, whose key field is "key", the
+                                        operator is "In", and the values array contains only "value". The requirements are ANDed.
+                                      type: object
+                                  type: object
+                                  x-kubernetes-map-type: atomic
+                                namespaces:
+                                  description: |-
+                                    namespaces specifies a static list of namespace names that the term applies to.
+                                    The term is applied to the union of the namespaces listed in this field
+                                    and the ones selected by namespaceSelector.
+                                    null or empty namespaces list and null namespaceSelector means "this pod's namespace".
+                                  items:
+                                    type: string
+                                  type: array
+                                  x-kubernetes-list-type: atomic
+                                topologyKey:
+                                  description: |-
+                                    This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching
+                                    the labelSelector in the specified namespaces, where co-located is defined as running on a node
+                                    whose value of the label with key topologyKey matches that of any node on which any of the
+                                    selected pods is running.
+                                    Empty topologyKey is not allowed.
+                                  type: string
+                              required:
+                              - topologyKey
+                              type: object
+                            weight:
+                              description: |-
+                                weight associated with matching the corresponding podAffinityTerm,
+                                in the range 1-100.
+                              format: int32
+                              type: integer
+                          required:
+                          - podAffinityTerm
+                          - weight
+                          type: object
+                        type: array
+                        x-kubernetes-list-type: atomic
+                      requiredDuringSchedulingIgnoredDuringExecution:
+                        description: |-
+                          If the affinity requirements specified by this field are not met at
+                          scheduling time, the pod will not be scheduled onto the node.
+                          If the affinity requirements specified by this field cease to be met
+                          at some point during pod execution (e.g. due to a pod label update), the
+                          system may or may not try to eventually evict the pod from its node.
+                          When there are multiple elements, the lists of nodes corresponding to each
+                          podAffinityTerm are intersected, i.e. all terms must be satisfied.
+                        items:
+                          description: |-
+                            Defines a set of pods (namely those matching the labelSelector
+                            relative to the given namespace(s)) that this pod should be
+                            co-located (affinity) or not co-located (anti-affinity) with,
+                            where co-located is defined as running on a node whose value of
+                            the label with key <topologyKey> matches that of any node on which
+                            a pod of the set of pods is running
+                          properties:
+                            labelSelector:
+                              description: |-
+                                A label query over a set of resources, in this case pods.
+                                If it's null, this PodAffinityTerm matches with no Pods.
+                              properties:
+                                matchExpressions:
+                                  description: matchExpressions is a list of label
+                                    selector requirements. The requirements are ANDed.
+                                  items:
+                                    description: |-
+                                      A label selector requirement is a selector that contains values, a key, and an operator that
+                                      relates the key and values.
+                                    properties:
+                                      key:
+                                        description: key is the label key that the
+                                          selector applies to.
+                                        type: string
+                                      operator:
+                                        description: |-
+                                          operator represents a key's relationship to a set of values.
+                                          Valid operators are In, NotIn, Exists and DoesNotExist.
+                                        type: string
+                                      values:
+                                        description: |-
+                                          values is an array of string values. If the operator is In or NotIn,
+                                          the values array must be non-empty. If the operator is Exists or DoesNotExist,
+                                          the values array must be empty. This array is replaced during a strategic
+                                          merge patch.
+                                        items:
+                                          type: string
+                                        type: array
+                                        x-kubernetes-list-type: atomic
+                                    required:
+                                    - key
+                                    - operator
+                                    type: object
+                                  type: array
+                                  x-kubernetes-list-type: atomic
+                                matchLabels:
+                                  additionalProperties:
+                                    type: string
+                                  description: |-
+                                    matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels
+                                    map is equivalent to an element of matchExpressions, whose key field is "key", the
+                                    operator is "In", and the values array contains only "value". The requirements are ANDed.
+                                  type: object
+                              type: object
+                              x-kubernetes-map-type: atomic
+                            matchLabelKeys:
+                              description: |-
+                                MatchLabelKeys is a set of pod label keys to select which pods will
+                                be taken into consideration. The keys are used to lookup values from the
+                                incoming pod labels, those key-value labels are merged with `labelSelector` as `key in (value)`
+                                to select the group of existing pods which pods will be taken into consideration
+                                for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming
+                                pod labels will be ignored. The default value is empty.
+                                The same key is forbidden to exist in both matchLabelKeys and labelSelector.
+                                Also, matchLabelKeys cannot be set when labelSelector isn't set.
+                              items:
+                                type: string
+                              type: array
+                              x-kubernetes-list-type: atomic
+                            mismatchLabelKeys:
+                              description: |-
+                                MismatchLabelKeys is a set of pod label keys to select which pods will
+                                be taken into consideration. The keys are used to lookup values from the
+                                incoming pod labels, those key-value labels are merged with `labelSelector` as `key notin (value)`
+                                to select the group of existing pods which pods will be taken into consideration
+                                for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming
+                                pod labels will be ignored. The default value is empty.
+                                The same key is forbidden to exist in both mismatchLabelKeys and labelSelector.
+                                Also, mismatchLabelKeys cannot be set when labelSelector isn't set.
+                              items:
+                                type: string
+                              type: array
+                              x-kubernetes-list-type: atomic
+                            namespaceSelector:
+                              description: |-
+                                A label query over the set of namespaces that the term applies to.
+                                The term is applied to the union of the namespaces selected by this field
+                                and the ones listed in the namespaces field.
+                                null selector and null or empty namespaces list means "this pod's namespace".
+                                An empty selector ({}) matches all namespaces.
+                              properties:
+                                matchExpressions:
+                                  description: matchExpressions is a list of label
+                                    selector requirements. The requirements are ANDed.
+                                  items:
+                                    description: |-
+                                      A label selector requirement is a selector that contains values, a key, and an operator that
+                                      relates the key and values.
+                                    properties:
+                                      key:
+                                        description: key is the label key that the
+                                          selector applies to.
+                                        type: string
+                                      operator:
+                                        description: |-
+                                          operator represents a key's relationship to a set of values.
+                                          Valid operators are In, NotIn, Exists and DoesNotExist.
+                                        type: string
+                                      values:
+                                        description: |-
+                                          values is an array of string values. If the operator is In or NotIn,
+                                          the values array must be non-empty. If the operator is Exists or DoesNotExist,
+                                          the values array must be empty. This array is replaced during a strategic
+                                          merge patch.
+                                        items:
+                                          type: string
+                                        type: array
+                                        x-kubernetes-list-type: atomic
+                                    required:
+                                    - key
+                                    - operator
+                                    type: object
+                                  type: array
+                                  x-kubernetes-list-type: atomic
+                                matchLabels:
+                                  additionalProperties:
+                                    type: string
+                                  description: |-
+                                    matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels
+                                    map is equivalent to an element of matchExpressions, whose key field is "key", the
+                                    operator is "In", and the values array contains only "value". The requirements are ANDed.
+                                  type: object
+                              type: object
+                              x-kubernetes-map-type: atomic
+                            namespaces:
+                              description: |-
+                                namespaces specifies a static list of namespace names that the term applies to.
+                                The term is applied to the union of the namespaces listed in this field
+                                and the ones selected by namespaceSelector.
+                                null or empty namespaces list and null namespaceSelector means "this pod's namespace".
+                              items:
+                                type: string
+                              type: array
+                              x-kubernetes-list-type: atomic
+                            topologyKey:
+                              description: |-
+                                This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching
+                                the labelSelector in the specified namespaces, where co-located is defined as running on a node
+                                whose value of the label with key topologyKey matches that of any node on which any of the
+                                selected pods is running.
+                                Empty topologyKey is not allowed.
+                              type: string
+                          required:
+                          - topologyKey
+                          type: object
+                        type: array
+                        x-kubernetes-list-type: atomic
+                    type: object
+                  podAntiAffinity:
+                    description: Describes pod anti-affinity scheduling rules (e.g.
+                      avoid putting this pod in the same node, zone, etc. as some
+                      other pod(s)).
+                    properties:
+                      preferredDuringSchedulingIgnoredDuringExecution:
+                        description: |-
+                          The scheduler will prefer to schedule pods to nodes that satisfy
+                          the anti-affinity expressions specified by this field, but it may choose
+                          a node that violates one or more of the expressions. The node that is
+                          most preferred is the one with the greatest sum of weights, i.e.
+                          for each node that meets all of the scheduling requirements (resource
+                          request, requiredDuringScheduling anti-affinity expressions, etc.),
+                          compute a sum by iterating through the elements of this field and subtracting
+                          "weight" from the sum if the node has pods which matches the corresponding podAffinityTerm; the
+                          node(s) with the highest sum are the most preferred.
+                        items:
+                          description: The weights of all of the matched WeightedPodAffinityTerm
+                            fields are added per-node to find the most preferred node(s)
+                          properties:
+                            podAffinityTerm:
+                              description: Required. A pod affinity term, associated
+                                with the corresponding weight.
+                              properties:
+                                labelSelector:
+                                  description: |-
+                                    A label query over a set of resources, in this case pods.
+                                    If it's null, this PodAffinityTerm matches with no Pods.
+                                  properties:
+                                    matchExpressions:
+                                      description: matchExpressions is a list of label
+                                        selector requirements. The requirements are
+                                        ANDed.
+                                      items:
+                                        description: |-
+                                          A label selector requirement is a selector that contains values, a key, and an operator that
+                                          relates the key and values.
+                                        properties:
+                                          key:
+                                            description: key is the label key that
+                                              the selector applies to.
+                                            type: string
+                                          operator:
+                                            description: |-
+                                              operator represents a key's relationship to a set of values.
+                                              Valid operators are In, NotIn, Exists and DoesNotExist.
+                                            type: string
+                                          values:
+                                            description: |-
+                                              values is an array of string values. If the operator is In or NotIn,
+                                              the values array must be non-empty. If the operator is Exists or DoesNotExist,
+                                              the values array must be empty. This array is replaced during a strategic
+                                              merge patch.
+                                            items:
+                                              type: string
+                                            type: array
+                                            x-kubernetes-list-type: atomic
+                                        required:
+                                        - key
+                                        - operator
+                                        type: object
+                                      type: array
+                                      x-kubernetes-list-type: atomic
+                                    matchLabels:
+                                      additionalProperties:
+                                        type: string
+                                      description: |-
+                                        matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels
+                                        map is equivalent to an element of matchExpressions, whose key field is "key", the
+                                        operator is "In", and the values array contains only "value". The requirements are ANDed.
+                                      type: object
+                                  type: object
+                                  x-kubernetes-map-type: atomic
+                                matchLabelKeys:
+                                  description: |-
+                                    MatchLabelKeys is a set of pod label keys to select which pods will
+                                    be taken into consideration. The keys are used to lookup values from the
+                                    incoming pod labels, those key-value labels are merged with `labelSelector` as `key in (value)`
+                                    to select the group of existing pods which pods will be taken into consideration
+                                    for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming
+                                    pod labels will be ignored. The default value is empty.
+                                    The same key is forbidden to exist in both matchLabelKeys and labelSelector.
+                                    Also, matchLabelKeys cannot be set when labelSelector isn't set.
+                                  items:
+                                    type: string
+                                  type: array
+                                  x-kubernetes-list-type: atomic
+                                mismatchLabelKeys:
+                                  description: |-
+                                    MismatchLabelKeys is a set of pod label keys to select which pods will
+                                    be taken into consideration. The keys are used to lookup values from the
+                                    incoming pod labels, those key-value labels are merged with `labelSelector` as `key notin (value)`
+                                    to select the group of existing pods which pods will be taken into consideration
+                                    for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming
+                                    pod labels will be ignored. The default value is empty.
+                                    The same key is forbidden to exist in both mismatchLabelKeys and labelSelector.
+                                    Also, mismatchLabelKeys cannot be set when labelSelector isn't set.
+                                  items:
+                                    type: string
+                                  type: array
+                                  x-kubernetes-list-type: atomic
+                                namespaceSelector:
+                                  description: |-
+                                    A label query over the set of namespaces that the term applies to.
+                                    The term is applied to the union of the namespaces selected by this field
+                                    and the ones listed in the namespaces field.
+                                    null selector and null or empty namespaces list means "this pod's namespace".
+                                    An empty selector ({}) matches all namespaces.
+                                  properties:
+                                    matchExpressions:
+                                      description: matchExpressions is a list of label
+                                        selector requirements. The requirements are
+                                        ANDed.
+                                      items:
+                                        description: |-
+                                          A label selector requirement is a selector that contains values, a key, and an operator that
+                                          relates the key and values.
+                                        properties:
+                                          key:
+                                            description: key is the label key that
+                                              the selector applies to.
+                                            type: string
+                                          operator:
+                                            description: |-
+                                              operator represents a key's relationship to a set of values.
+                                              Valid operators are In, NotIn, Exists and DoesNotExist.
+                                            type: string
+                                          values:
+                                            description: |-
+                                              values is an array of string values. If the operator is In or NotIn,
+                                              the values array must be non-empty. If the operator is Exists or DoesNotExist,
+                                              the values array must be empty. This array is replaced during a strategic
+                                              merge patch.
+                                            items:
+                                              type: string
+                                            type: array
+                                            x-kubernetes-list-type: atomic
+                                        required:
+                                        - key
+                                        - operator
+                                        type: object
+                                      type: array
+                                      x-kubernetes-list-type: atomic
+                                    matchLabels:
+                                      additionalProperties:
+                                        type: string
+                                      description: |-
+                                        matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels
+                                        map is equivalent to an element of matchExpressions, whose key field is "key", the
+                                        operator is "In", and the values array contains only "value". The requirements are ANDed.
+                                      type: object
+                                  type: object
+                                  x-kubernetes-map-type: atomic
+                                namespaces:
+                                  description: |-
+                                    namespaces specifies a static list of namespace names that the term applies to.
+                                    The term is applied to the union of the namespaces listed in this field
+                                    and the ones selected by namespaceSelector.
+                                    null or empty namespaces list and null namespaceSelector means "this pod's namespace".
+                                  items:
+                                    type: string
+                                  type: array
+                                  x-kubernetes-list-type: atomic
+                                topologyKey:
+                                  description: |-
+                                    This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching
+                                    the labelSelector in the specified namespaces, where co-located is defined as running on a node
+                                    whose value of the label with key topologyKey matches that of any node on which any of the
+                                    selected pods is running.
+                                    Empty topologyKey is not allowed.
+                                  type: string
+                              required:
+                              - topologyKey
+                              type: object
+                            weight:
+                              description: |-
+                                weight associated with matching the corresponding podAffinityTerm,
+                                in the range 1-100.
+                              format: int32
+                              type: integer
+                          required:
+                          - podAffinityTerm
+                          - weight
+                          type: object
+                        type: array
+                        x-kubernetes-list-type: atomic
+                      requiredDuringSchedulingIgnoredDuringExecution:
+                        description: |-
+                          If the anti-affinity requirements specified by this field are not met at
+                          scheduling time, the pod will not be scheduled onto the node.
+                          If the anti-affinity requirements specified by this field cease to be met
+                          at some point during pod execution (e.g. due to a pod label update), the
+                          system may or may not try to eventually evict the pod from its node.
+                          When there are multiple elements, the lists of nodes corresponding to each
+                          podAffinityTerm are intersected, i.e. all terms must be satisfied.
+                        items:
+                          description: |-
+                            Defines a set of pods (namely those matching the labelSelector
+                            relative to the given namespace(s)) that this pod should be
+                            co-located (affinity) or not co-located (anti-affinity) with,
+                            where co-located is defined as running on a node whose value of
+                            the label with key <topologyKey> matches that of any node on which
+                            a pod of the set of pods is running
+                          properties:
+                            labelSelector:
+                              description: |-
+                                A label query over a set of resources, in this case pods.
+                                If it's null, this PodAffinityTerm matches with no Pods.
+                              properties:
+                                matchExpressions:
+                                  description: matchExpressions is a list of label
+                                    selector requirements. The requirements are ANDed.
+                                  items:
+                                    description: |-
+                                      A label selector requirement is a selector that contains values, a key, and an operator that
+                                      relates the key and values.
+                                    properties:
+                                      key:
+                                        description: key is the label key that the
+                                          selector applies to.
+                                        type: string
+                                      operator:
+                                        description: |-
+                                          operator represents a key's relationship to a set of values.
+                                          Valid operators are In, NotIn, Exists and DoesNotExist.
+                                        type: string
+                                      values:
+                                        description: |-
+                                          values is an array of string values. If the operator is In or NotIn,
+                                          the values array must be non-empty. If the operator is Exists or DoesNotExist,
+                                          the values array must be empty. This array is replaced during a strategic
+                                          merge patch.
+                                        items:
+                                          type: string
+                                        type: array
+                                        x-kubernetes-list-type: atomic
+                                    required:
+                                    - key
+                                    - operator
+                                    type: object
+                                  type: array
+                                  x-kubernetes-list-type: atomic
+                                matchLabels:
+                                  additionalProperties:
+                                    type: string
+                                  description: |-
+                                    matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels
+                                    map is equivalent to an element of matchExpressions, whose key field is "key", the
+                                    operator is "In", and the values array contains only "value". The requirements are ANDed.
+                                  type: object
+                              type: object
+                              x-kubernetes-map-type: atomic
+                            matchLabelKeys:
+                              description: |-
+                                MatchLabelKeys is a set of pod label keys to select which pods will
+                                be taken into consideration. The keys are used to lookup values from the
+                                incoming pod labels, those key-value labels are merged with `labelSelector` as `key in (value)`
+                                to select the group of existing pods which pods will be taken into consideration
+                                for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming
+                                pod labels will be ignored. The default value is empty.
+                                The same key is forbidden to exist in both matchLabelKeys and labelSelector.
+                                Also, matchLabelKeys cannot be set when labelSelector isn't set.
+                              items:
+                                type: string
+                              type: array
+                              x-kubernetes-list-type: atomic
+                            mismatchLabelKeys:
+                              description: |-
+                                MismatchLabelKeys is a set of pod label keys to select which pods will
+                                be taken into consideration. The keys are used to lookup values from the
+                                incoming pod labels, those key-value labels are merged with `labelSelector` as `key notin (value)`
+                                to select the group of existing pods which pods will be taken into consideration
+                                for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming
+                                pod labels will be ignored. The default value is empty.
+                                The same key is forbidden to exist in both mismatchLabelKeys and labelSelector.
+                                Also, mismatchLabelKeys cannot be set when labelSelector isn't set.
+                              items:
+                                type: string
+                              type: array
+                              x-kubernetes-list-type: atomic
+                            namespaceSelector:
+                              description: |-
+                                A label query over the set of namespaces that the term applies to.
+                                The term is applied to the union of the namespaces selected by this field
+                                and the ones listed in the namespaces field.
+                                null selector and null or empty namespaces list means "this pod's namespace".
+                                An empty selector ({}) matches all namespaces.
+                              properties:
+                                matchExpressions:
+                                  description: matchExpressions is a list of label
+                                    selector requirements. The requirements are ANDed.
+                                  items:
+                                    description: |-
+                                      A label selector requirement is a selector that contains values, a key, and an operator that
+                                      relates the key and values.
+                                    properties:
+                                      key:
+                                        description: key is the label key that the
+                                          selector applies to.
+                                        type: string
+                                      operator:
+                                        description: |-
+                                          operator represents a key's relationship to a set of values.
+                                          Valid operators are In, NotIn, Exists and DoesNotExist.
+                                        type: string
+                                      values:
+                                        description: |-
+                                          values is an array of string values. If the operator is In or NotIn,
+                                          the values array must be non-empty. If the operator is Exists or DoesNotExist,
+                                          the values array must be empty. This array is replaced during a strategic
+                                          merge patch.
+                                        items:
+                                          type: string
+                                        type: array
+                                        x-kubernetes-list-type: atomic
+                                    required:
+                                    - key
+                                    - operator
+                                    type: object
+                                  type: array
+                                  x-kubernetes-list-type: atomic
+                                matchLabels:
+                                  additionalProperties:
+                                    type: string
+                                  description: |-
+                                    matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels
+                                    map is equivalent to an element of matchExpressions, whose key field is "key", the
+                                    operator is "In", and the values array contains only "value". The requirements are ANDed.
+                                  type: object
+                              type: object
+                              x-kubernetes-map-type: atomic
+                            namespaces:
+                              description: |-
+                                namespaces specifies a static list of namespace names that the term applies to.
+                                The term is applied to the union of the namespaces listed in this field
+                                and the ones selected by namespaceSelector.
+                                null or empty namespaces list and null namespaceSelector means "this pod's namespace".
+                              items:
+                                type: string
+                              type: array
+                              x-kubernetes-list-type: atomic
+                            topologyKey:
+                              description: |-
+                                This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching
+                                the labelSelector in the specified namespaces, where co-located is defined as running on a node
+                                whose value of the label with key topologyKey matches that of any node on which any of the
+                                selected pods is running.
+                                Empty topologyKey is not allowed.
+                              type: string
+                          required:
+                          - topologyKey
+                          type: object
+                        type: array
+                        x-kubernetes-list-type: atomic
+                    type: object
+                type: object
+              blend:
+                description: blend defines the CacheBlend tunables injected into the
+                  vLLM connect-config.
+                properties:
+                  checkLayer:
+                    default: 1
+                    description: |-
+                      checkLayer is the layer index used by CacheBlend to decide which tokens
+                      to recompute. It is surfaced to the connector as
+                      kv_connector_extra_config["cb.check_layer"].
+                    format: int32
+                    minimum: 0
+                    type: integer
+                  recompRatio:
+                    default: 0.15
+                    description: |-
+                      recompRatio is the fraction of tokens CacheBlend recomputes. It is
+                      surfaced to the connector as kv_connector_extra_config["cb.recomp_ratio"]
+                      and must be in (0, 1].
+                    type: number
+                type: object
+              env:
+                description: env defines additional environment variables.
+                items:
+                  description: EnvVar represents an environment variable present in
+                    a Container.
+                  properties:
+                    name:
+                      description: |-
+                        Name of the environment variable.
+                        May consist of any printable ASCII characters except '='.
+                      type: string
+                    value:
+                      description: |-
+                        Variable references $(VAR_NAME) are expanded
+                        using the previously defined environment variables in the container and
+                        any service environment variables. If a variable cannot be resolved,
+                        the reference in the input string will be unchanged. Double $$ are reduced
+                        to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e.
+                        "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)".
+                        Escaped references will never be expanded, regardless of whether the variable
+                        exists or not.
+                        Defaults to "".
+                      type: string
+                    valueFrom:
+                      description: Source for the environment variable's value. Cannot
+                        be used if value is not empty.
+                      properties:
+                        configMapKeyRef:
+                          description: Selects a key of a ConfigMap.
+                          properties:
+                            key:
+                              description: The key to select.
+                              type: string
+                            name:
+                              default: ""
+                              description: |-
+                                Name of the referent.
+                                This field is effectively required, but due to backwards compatibility is
+                                allowed to be empty. Instances of this type with an empty value here are
+                                almost certainly wrong.
+                                More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                              type: string
+                            optional:
+                              description: Specify whether the ConfigMap or its key
+                                must be defined
+                              type: boolean
+                          required:
+                          - key
+                          type: object
+                          x-kubernetes-map-type: atomic
+                        fieldRef:
+                          description: |-
+                            Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['<KEY>']`, `metadata.annotations['<KEY>']`,
+                            spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs.
+                          properties:
+                            apiVersion:
+                              description: Version of the schema the FieldPath is
+                                written in terms of, defaults to "v1".
+                              type: string
+                            fieldPath:
+                              description: Path of the field to select in the specified
+                                API version.
+                              type: string
+                          required:
+                          - fieldPath
+                          type: object
+                          x-kubernetes-map-type: atomic
+                        fileKeyRef:
+                          description: |-
+                            FileKeyRef selects a key of the env file.
+                            Requires the EnvFiles feature gate to be enabled.
+                          properties:
+                            key:
+                              description: |-
+                                The key within the env file. An invalid key will prevent the pod from starting.
+                                The keys defined within a source may consist of any printable ASCII characters except '='.
+                                During Alpha stage of the EnvFiles feature gate, the key size is limited to 128 characters.
+                              type: string
+                            optional:
+                              default: false
+                              description: |-
+                                Specify whether the file or its key must be defined. If the file or key
+                                does not exist, then the env var is not published.
+                                If optional is set to true and the specified key does not exist,
+                                the environment variable will not be set in the Pod's containers.
+
+                                If optional is set to false and the specified key does not exist,
+                                an error will be returned during Pod creation.
+                              type: boolean
+                            path:
+                              description: |-
+                                The path within the volume from which to select the file.
+                                Must be relative and may not contain the '..' path or start with '..'.
+                              type: string
+                            volumeName:
+                              description: The name of the volume mount containing
+                                the env file.
+                              type: string
+                          required:
+                          - key
+                          - path
+                          - volumeName
+                          type: object
+                          x-kubernetes-map-type: atomic
+                        resourceFieldRef:
+                          description: |-
+                            Selects a resource of the container: only resources limits and requests
+                            (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported.
+                          properties:
+                            containerName:
+                              description: 'Container name: required for volumes,
+                                optional for env vars'
+                              type: string
+                            divisor:
+                              anyOf:
+                              - type: integer
+                              - type: string
+                              description: Specifies the output format of the exposed
+                                resources, defaults to "1"
+                              pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                              x-kubernetes-int-or-string: true
+                            resource:
+                              description: 'Required: resource to select'
+                              type: string
+                          required:
+                          - resource
+                          type: object
+                          x-kubernetes-map-type: atomic
+                        secretKeyRef:
+                          description: Selects a key of a secret in the pod's namespace
+                          properties:
+                            key:
+                              description: The key of the secret to select from.  Must
+                                be a valid secret key.
+                              type: string
+                            name:
+                              default: ""
+                              description: |-
+                                Name of the referent.
+                                This field is effectively required, but due to backwards compatibility is
+                                allowed to be empty. Instances of this type with an empty value here are
+                                almost certainly wrong.
+                                More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                              type: string
+                            optional:
+                              description: Specify whether the Secret or its key must
+                                be defined
+                              type: boolean
+                          required:
+                          - key
+                          type: object
+                          x-kubernetes-map-type: atomic
+                      type: object
+                  required:
+                  - name
+                  type: object
+                type: array
+              eviction:
+                description: eviction defines the cache eviction configuration.
+                properties:
+                  evictionRatio:
+                    default: 0.2
+                    description: evictionRatio is the fraction of cache to evict when
+                      triggered.
+                    type: number
+                  policy:
+                    default: LRU
+                    description: policy is the eviction policy. Currently only LRU
+                      is supported.
+                    enum:
+                    - LRU
+                    type: string
+                  triggerWatermark:
+                    default: 0.8
+                    description: triggerWatermark is the cache usage ratio that triggers
+                      eviction.
+                    type: number
+                type: object
+              extraArgs:
+                description: |-
+                  extraArgs are additional CLI flags appended to the server command.
+                  They are appended last and can override any auto-generated flag.
+                items:
+                  type: string
+                type: array
+              gpuVendor:
+                default: nvidia
+                description: |-
+                  gpuVendor selects the GPU vendor. "nvidia" (default) requires the NVIDIA
+                  GPU Operator's "nvidia" RuntimeClass; "amd" runs on the default container
+                  runtime with privileged: true.
+                enum:
+                - nvidia
+                - amd
+                type: string
+              image:
+                description: |-
+                  image defines the container image to use for the blend_v3 engine. This
+                  may be a PRIVATE image; use imagePullSecrets to pull it.
+                properties:
+                  pullPolicy:
+                    default: IfNotPresent
+                    description: pullPolicy is the image pull policy.
+                    enum:
+                    - Always
+                    - Never
+                    - IfNotPresent
+                    type: string
+                  repository:
+                    default: lmcache/vllm-openai
+                    description: repository is the container image repository.
+                    type: string
+                  tag:
+                    default: latest
+                    description: tag is the container image tag.
+                    type: string
+                type: object
+              imagePullSecrets:
+                description: |-
+                  imagePullSecrets is a list of references to secrets for pulling the
+                  engine image.
+                items:
+                  description: |-
+                    LocalObjectReference contains enough information to let you locate the
+                    referenced object inside the same namespace.
+                  properties:
+                    name:
+                      default: ""
+                      description: |-
+                        Name of the referent.
+                        This field is effectively required, but due to backwards compatibility is
+                        allowed to be empty. Instances of this type with an empty value here are
+                        almost certainly wrong.
+                        More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                      type: string
+                  type: object
+                  x-kubernetes-map-type: atomic
+                type: array
+              injection:
+                description: |-
+                  injection defines the defaults the mutating webhook reads for pods bound
+                  to this engine.
+                properties:
+                  cudagraph:
+                    default: eager
+                    description: |-
+                      cudagraph selects the CUDA graph mode injected into the vLLM args. "eager"
+                      (default) maps to --enforce-eager; "full_decode_only" enables decode-only
+                      graphs. Full graphs are never used.
+                    enum:
+                    - eager
+                    - piecewise
+                    - full_decode_only
+                    type: string
+                  imagePullSecrets:
+                    description: |-
+                      imagePullSecrets are appended to the vLLM pod's spec.imagePullSecrets so
+                      the PRIVATE payload init-container image can pull. The referenced
+                      Secret(s) must already exist in the vLLM pod's namespace; the operator
+                      does not copy them cross-namespace.
+                    items:
+                      description: |-
+                        LocalObjectReference contains enough information to let you locate the
+                        referenced object inside the same namespace.
+                      properties:
+                        name:
+                          default: ""
+                          description: |-
+                            Name of the referent.
+                            This field is effectively required, but due to backwards compatibility is
+                            allowed to be empty. Instances of this type with an empty value here are
+                            almost certainly wrong.
+                            More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                          type: string
+                      type: object
+                      x-kubernetes-map-type: atomic
+                    type: array
+                  payloadImage:
+                    description: |-
+                      payloadImage is the init-container image (repository/tag/pullPolicy, like
+                      spec.image) that stages the lmcache-cacheblend vLLM plugin into a shared
+                      emptyDir. It is a SEPARATE, usually PRIVATE image: set
+                      payloadImage.repository to your cacheblend-plugin image — the repository
+                      default inherited from ImageSpec is the engine image and is NOT a valid
+                      payload. For private registries, imagePullSecrets must reference Secret(s)
+                      that exist in the vLLM pod's namespace.
+                    properties:
+                      pullPolicy:
+                        default: IfNotPresent
+                        description: pullPolicy is the image pull policy.
+                        enum:
+                        - Always
+                        - Never
+                        - IfNotPresent
+                        type: string
+                      repository:
+                        default: lmcache/vllm-openai
+                        description: repository is the container image repository.
+                        type: string
+                      tag:
+                        default: latest
+                        description: tag is the container image tag.
+                        type: string
+                    type: object
+                  targetContainer:
+                    description: |-
+                      targetContainer is the name of the vLLM container to inject into. Empty
+                      (the default) selects the first container; a per-pod annotation may
+                      override it.
+                    type: string
+                type: object
+              l1:
+                description: l1 defines the L1 memory cache configuration.
+                properties:
+                  sizeGB:
+                    description: |-
+                      sizeGB is the L1 cache size in gigabytes. Required, must be > 0.
+                      The CRD-level constraint (exclusiveMinimum=0) rejects invalid values
+                      at admission time so the controller never sees them; the in-Go
+                      ValidateSpec keeps the same rule for defense in depth.
+                    exclusiveMinimum: true
+                    minimum: 0
+                    type: number
+                required:
+                - sizeGB
+                type: object
+              l2Backend:
+                description: |-
+                  l2Backend defines the L2 storage backend.
+                  Currently only a single adapter is supported.
+                properties:
+                  prefetchMaxInFlight:
+                    default: 8
+                    description: |-
+                      prefetchMaxInFlight limits the number of concurrent prefetch
+                      (L2→L1 load) requests, preventing excessive L1 memory pressure.
+                    format: int32
+                    minimum: 1
+                    type: integer
+                  prefetchPolicy:
+                    default: default
+                    description: |-
+                      prefetchPolicy controls how keys flow from L2 back to L1 on
+                      cache misses. "default" picks the first adapter that has the key.
+                    enum:
+                    - default
+                    type: string
+                  raw:
+                    description: |-
+                      raw is an escape hatch for adapter types not yet natively
+                      supported by the operator (e.g. nixl_store, fs, mock).
+                      The JSON is passed through to --l2-adapter as-is.
+                    properties:
+                      config:
+                        additionalProperties:
+                          x-kubernetes-preserve-unknown-fields: true
+                        description: config is type-specific configuration as a free-form
+                          map.
+                        type: object
+                      type:
+                        description: type is the adapter type name (e.g. "nixl_store",
+                          "fs", "mock").
+                        type: string
+                    required:
+                    - type
+                    type: object
+                  resp:
+                    description: |-
+                      resp configures a Redis/Valkey RESP L2 adapter backed by the
+                      native C++ connector.
+                    properties:
+                      authSecretRef:
+                        description: |-
+                          authSecretRef is a reference to a Secret containing "username"
+                          and "password" keys for Redis authentication.
+                          The Secret may live in a different namespace; the operator will
+                          create a managed copy in the LMCacheEngine's namespace.
+                          The credentials are injected via environment variables so they
+                          do not appear in container args or kubectl describe output.
+                        properties:
+                          name:
+                            description: name is the name of the Secret.
+                            type: string
+                          namespace:
+                            description: |-
+                              namespace is the namespace of the Secret.
+                              If empty, defaults to the namespace of the LMCacheEngine resource.
+                            type: string
+                        required:
+                        - name
+                        type: object
+                      host:
+                        description: host is the Redis/Valkey server hostname or IP.
+                        type: string
+                      maxCapacityGB:
+                        default: 0
+                        description: |-
+                          maxCapacityGB is the max L2 capacity in GB for usage tracking
+                          and eviction. 0 means disabled.
+                        type: number
+                      numWorkers:
+                        default: 8
+                        description: numWorkers is the number of C++ worker threads
+                          for I/O.
+                        format: int32
+                        minimum: 1
+                        type: integer
+                      port:
+                        description: port is the Redis/Valkey server port.
+                        format: int32
+                        maximum: 65535
+                        minimum: 1
+                        type: integer
+                    required:
+                    - host
+                    - port
+                    type: object
+                  storePolicy:
+                    default: default
+                    description: |-
+                      storePolicy controls how keys flow from L1 to L2.
+                      "default" stores all keys to the adapter and keeps L1.
+                      "skip_l1" stores all keys to the adapter and deletes them from L1
+                      (buffer-only mode — pair with eviction.policy=noop).
+                    enum:
+                    - default
+                    - skip_l1
+                    type: string
+                type: object
+              logLevel:
+                default: INFO
+                description: logLevel is the log level for the LMCache server.
+                enum:
+                - DEBUG
+                - INFO
+                - WARNING
+                - ERROR
+                type: string
+              nodeSelector:
+                additionalProperties:
+                  type: string
+                description: nodeSelector determines which nodes get a CacheBlend
+                  engine instance.
+                type: object
+              podAnnotations:
+                additionalProperties:
+                  type: string
+                description: podAnnotations are additional annotations added to pods.
+                type: object
+              podLabels:
+                additionalProperties:
+                  type: string
+                description: podLabels are additional labels added to pods.
+                type: object
+              priorityClassName:
+                description: priorityClassName is the priority class for the pods.
+                type: string
+              prometheus:
+                description: prometheus defines Prometheus monitoring configuration.
+                properties:
+                  enabled:
+                    default: true
+                    description: enabled controls whether Prometheus metrics are exposed.
+                    type: boolean
+                  port:
+                    default: 9090
+                    description: port is the Prometheus metrics port.
+                    format: int32
+                    type: integer
+                  serviceMonitor:
+                    description: serviceMonitor configures the Prometheus ServiceMonitor.
+                    properties:
+                      enabled:
+                        default: false
+                        description: enabled controls whether a ServiceMonitor CR
+                          is created.
+                        type: boolean
+                      interval:
+                        default: 30s
+                        description: interval is the Prometheus scrape interval.
+                        type: string
+                      labels:
+                        additionalProperties:
+                          type: string
+                        description: labels are additional labels added to the ServiceMonitor.
+                        type: object
+                    type: object
+                type: object
+              resourceOverrides:
+                description: resourceOverrides allows overriding auto-computed resource
+                  requirements.
+                properties:
+                  claims:
+                    description: |-
+                      Claims lists the names of resources, defined in spec.resourceClaims,
+                      that are used by this container.
+
+                      This field depends on the
+                      DynamicResourceAllocation feature gate.
+
+                      This field is immutable. It can only be set for containers.
+                    items:
+                      description: ResourceClaim references one entry in PodSpec.ResourceClaims.
+                      properties:
+                        name:
+                          description: |-
+                            Name must match the name of one entry in pod.spec.resourceClaims of
+                            the Pod where this field is used. It makes that resource available
+                            inside a container.
+                          type: string
+                        request:
+                          description: |-
+                            Request is the name chosen for a request in the referenced claim.
+                            If empty, everything from the claim is made available, otherwise
+                            only the result of this request.
+                          type: string
+                      required:
+                      - name
+                      type: object
+                    type: array
+                    x-kubernetes-list-map-keys:
+                    - name
+                    x-kubernetes-list-type: map
+                  limits:
+                    additionalProperties:
+                      anyOf:
+                      - type: integer
+                      - type: string
+                      pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                      x-kubernetes-int-or-string: true
+                    description: |-
+                      Limits describes the maximum amount of compute resources allowed.
+                      More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
+                    type: object
+                  requests:
+                    additionalProperties:
+                      anyOf:
+                      - type: integer
+                      - type: string
+                      pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                      x-kubernetes-int-or-string: true
+                    description: |-
+                      Requests describes the minimum amount of compute resources required.
+                      If Requests is omitted for a container, it defaults to Limits if that is explicitly specified,
+                      otherwise to an implementation-defined value. Requests cannot exceed Limits.
+                      More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
+                    type: object
+                type: object
+              server:
+                description: |-
+                  server defines server configuration. chunkSize defaults to 256 because
+                  CacheBlend requires chunk_size == 256.
+                properties:
+                  chunkSize:
+                    default: 256
+                    description: chunkSize is the token chunk size.
+                    format: int32
+                    type: integer
+                  hashAlgorithm:
+                    default: blake3
+                    description: hashAlgorithm is the hash algorithm used for token
+                      hashing.
+                    enum:
+                    - builtin
+                    - sha256_cbor
+                    - blake3
+                    type: string
+                  httpPort:
+                    default: 8080
+                    description: httpPort is the HTTP frontend port (health checks,
+                      cache admin).
+                    format: int32
+                    maximum: 65535
+                    minimum: 1024
+                    type: integer
+                  maxWorkers:
+                    default: 1
+                    description: maxWorkers is the number of worker threads.
+                    format: int32
+                    type: integer
+                  port:
+                    default: 5555
+                    description: port is the server listening port.
+                    format: int32
+                    maximum: 65535
+                    minimum: 1024
+                    type: integer
+                type: object
+              serviceAccountName:
+                description: serviceAccountName is the name of the ServiceAccount
+                  to use.
+                type: string
+              tolerations:
+                description: tolerations defines pod tolerations.
+                items:
+                  description: |-
+                    The pod this Toleration is attached to tolerates any taint that matches
+                    the triple <key,value,effect> using the matching operator <operator>.
+                  properties:
+                    effect:
+                      description: |-
+                        Effect indicates the taint effect to match. Empty means match all taint effects.
+                        When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.
+                      type: string
+                    key:
+                      description: |-
+                        Key is the taint key that the toleration applies to. Empty means match all taint keys.
+                        If the key is empty, operator must be Exists; this combination means to match all values and all keys.
+                      type: string
+                    operator:
+                      description: |-
+                        Operator represents a key's relationship to the value.
+                        Valid operators are Exists, Equal, Lt, and Gt. Defaults to Equal.
+                        Exists is equivalent to wildcard for value, so that a pod can
+                        tolerate all taints of a particular category.
+                        Lt and Gt perform numeric comparisons (requires feature gate TaintTolerationComparisonOperators).
+                      type: string
+                    tolerationSeconds:
+                      description: |-
+                        TolerationSeconds represents the period of time the toleration (which must be
+                        of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default,
+                        it is not set, which means tolerate the taint forever (do not evict). Zero and
+                        negative values will be treated as 0 (evict immediately) by the system.
+                      format: int64
+                      type: integer
+                    value:
+                      description: |-
+                        Value is the taint value the toleration matches to.
+                        If the operator is Exists, the value should be empty, otherwise just a regular string.
+                      type: string
+                  type: object
+                type: array
+              volumeMounts:
+                description: volumeMounts defines additional volume mounts.
+                items:
+                  description: VolumeMount describes a mounting of a Volume within
+                    a container.
+                  properties:
+                    mountPath:
+                      description: |-
+                        Path within the container at which the volume should be mounted.  Must
+                        not contain ':'.
+                      type: string
+                    mountPropagation:
+                      description: |-
+                        mountPropagation determines how mounts are propagated from the host
+                        to container and the other way around.
+                        When not set, MountPropagationNone is used.
+                        This field is beta in 1.10.
+                        When RecursiveReadOnly is set to IfPossible or to Enabled, MountPropagation must be None or unspecified
+                        (which defaults to None).
+                      type: string
+                    name:
+                      description: This must match the Name of a Volume.
+                      type: string
+                    readOnly:
+                      description: |-
+                        Mounted read-only if true, read-write otherwise (false or unspecified).
+                        Defaults to false.
+                      type: boolean
+                    recursiveReadOnly:
+                      description: |-
+                        RecursiveReadOnly specifies whether read-only mounts should be handled
+                        recursively.
+
+                        If ReadOnly is false, this field has no meaning and must be unspecified.
+
+                        If ReadOnly is true, and this field is set to Disabled, the mount is not made
+                        recursively read-only.  If this field is set to IfPossible, the mount is made
+                        recursively read-only, if it is supported by the container runtime.  If this
+                        field is set to Enabled, the mount is made recursively read-only if it is
+                        supported by the container runtime, otherwise the pod will not be started and
+                        an error will be generated to indicate the reason.
+
+                        If this field is set to IfPossible or Enabled, MountPropagation must be set to
+                        None (or be unspecified, which defaults to None).
+
+                        If this field is not specified, it is treated as an equivalent of Disabled.
+                      type: string
+                    subPath:
+                      description: |-
+                        Path within the volume from which the container's volume should be mounted.
+                        Defaults to "" (volume's root).
+                      type: string
+                    subPathExpr:
+                      description: |-
+                        Expanded path within the volume from which the container's volume should be mounted.
+                        Behaves similarly to SubPath but environment variable references $(VAR_NAME) are expanded using the container's environment.
+                        Defaults to "" (volume's root).
+                        SubPathExpr and SubPath are mutually exclusive.
+                      type: string
+                  required:
+                  - mountPath
+                  - name
+                  type: object
+                type: array
+              volumes:
+                description: volumes defines additional volumes.
+                items:
+                  description: Volume represents a named volume in a pod that may
+                    be accessed by any container in the pod.
+                  properties:
+                    awsElasticBlockStore:
+                      description: |-
+                        awsElasticBlockStore represents an AWS Disk resource that is attached to a
+                        kubelet's host machine and then exposed to the pod.
+                        Deprecated: AWSElasticBlockStore is deprecated. All operations for the in-tree
+                        awsElasticBlockStore type are redirected to the ebs.csi.aws.com CSI driver.
+                        More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore
+                      properties:
+                        fsType:
+                          description: |-
+                            fsType is the filesystem type of the volume that you want to mount.
+                            Tip: Ensure that the filesystem type is supported by the host operating system.
+                            Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified.
+                            More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore
+                          type: string
+                        partition:
+                          description: |-
+                            partition is the partition in the volume that you want to mount.
+                            If omitted, the default is to mount by volume name.
+                            Examples: For volume /dev/sda1, you specify the partition as "1".
+                            Similarly, the volume partition for /dev/sda is "0" (or you can leave the property empty).
+                          format: int32
+                          type: integer
+                        readOnly:
+                          description: |-
+                            readOnly value true will force the readOnly setting in VolumeMounts.
+                            More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore
+                          type: boolean
+                        volumeID:
+                          description: |-
+                            volumeID is unique ID of the persistent disk resource in AWS (Amazon EBS volume).
+                            More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore
+                          type: string
+                      required:
+                      - volumeID
+                      type: object
+                    azureDisk:
+                      description: |-
+                        azureDisk represents an Azure Data Disk mount on the host and bind mount to the pod.
+                        Deprecated: AzureDisk is deprecated. All operations for the in-tree azureDisk type
+                        are redirected to the disk.csi.azure.com CSI driver.
+                      properties:
+                        cachingMode:
+                          description: 'cachingMode is the Host Caching mode: None,
+                            Read Only, Read Write.'
+                          type: string
+                        diskName:
+                          description: diskName is the Name of the data disk in the
+                            blob storage
+                          type: string
+                        diskURI:
+                          description: diskURI is the URI of data disk in the blob
+                            storage
+                          type: string
+                        fsType:
+                          default: ext4
+                          description: |-
+                            fsType is Filesystem type to mount.
+                            Must be a filesystem type supported by the host operating system.
+                            Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified.
+                          type: string
+                        kind:
+                          description: 'kind expected values are Shared: multiple
+                            blob disks per storage account  Dedicated: single blob
+                            disk per storage account  Managed: azure managed data
+                            disk (only in managed availability set). defaults to shared'
+                          type: string
+                        readOnly:
+                          default: false
+                          description: |-
+                            readOnly Defaults to false (read/write). ReadOnly here will force
+                            the ReadOnly setting in VolumeMounts.
+                          type: boolean
+                      required:
+                      - diskName
+                      - diskURI
+                      type: object
+                    azureFile:
+                      description: |-
+                        azureFile represents an Azure File Service mount on the host and bind mount to the pod.
+                        Deprecated: AzureFile is deprecated. All operations for the in-tree azureFile type
+                        are redirected to the file.csi.azure.com CSI driver.
+                      properties:
+                        readOnly:
+                          description: |-
+                            readOnly defaults to false (read/write). ReadOnly here will force
+                            the ReadOnly setting in VolumeMounts.
+                          type: boolean
+                        secretName:
+                          description: secretName is the  name of secret that contains
+                            Azure Storage Account Name and Key
+                          type: string
+                        shareName:
+                          description: shareName is the azure share Name
+                          type: string
+                      required:
+                      - secretName
+                      - shareName
+                      type: object
+                    cephfs:
+                      description: |-
+                        cephFS represents a Ceph FS mount on the host that shares a pod's lifetime.
+                        Deprecated: CephFS is deprecated and the in-tree cephfs type is no longer supported.
+                      properties:
+                        monitors:
+                          description: |-
+                            monitors is Required: Monitors is a collection of Ceph monitors
+                            More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it
+                          items:
+                            type: string
+                          type: array
+                          x-kubernetes-list-type: atomic
+                        path:
+                          description: 'path is Optional: Used as the mounted root,
+                            rather than the full Ceph tree, default is /'
+                          type: string
+                        readOnly:
+                          description: |-
+                            readOnly is Optional: Defaults to false (read/write). ReadOnly here will force
+                            the ReadOnly setting in VolumeMounts.
+                            More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it
+                          type: boolean
+                        secretFile:
+                          description: |-
+                            secretFile is Optional: SecretFile is the path to key ring for User, default is /etc/ceph/user.secret
+                            More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it
+                          type: string
+                        secretRef:
+                          description: |-
+                            secretRef is Optional: SecretRef is reference to the authentication secret for User, default is empty.
+                            More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it
+                          properties:
+                            name:
+                              default: ""
+                              description: |-
+                                Name of the referent.
+                                This field is effectively required, but due to backwards compatibility is
+                                allowed to be empty. Instances of this type with an empty value here are
+                                almost certainly wrong.
+                                More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                              type: string
+                          type: object
+                          x-kubernetes-map-type: atomic
+                        user:
+                          description: |-
+                            user is optional: User is the rados user name, default is admin
+                            More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it
+                          type: string
+                      required:
+                      - monitors
+                      type: object
+                    cinder:
+                      description: |-
+                        cinder represents a cinder volume attached and mounted on kubelets host machine.
+                        Deprecated: Cinder is deprecated. All operations for the in-tree cinder type
+                        are redirected to the cinder.csi.openstack.org CSI driver.
+                        More info: https://examples.k8s.io/mysql-cinder-pd/README.md
+                      properties:
+                        fsType:
+                          description: |-
+                            fsType is the filesystem type to mount.
+                            Must be a filesystem type supported by the host operating system.
+                            Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified.
+                            More info: https://examples.k8s.io/mysql-cinder-pd/README.md
+                          type: string
+                        readOnly:
+                          description: |-
+                            readOnly defaults to false (read/write). ReadOnly here will force
+                            the ReadOnly setting in VolumeMounts.
+                            More info: https://examples.k8s.io/mysql-cinder-pd/README.md
+                          type: boolean
+                        secretRef:
+                          description: |-
+                            secretRef is optional: points to a secret object containing parameters used to connect
+                            to OpenStack.
+                          properties:
+                            name:
+                              default: ""
+                              description: |-
+                                Name of the referent.
+                                This field is effectively required, but due to backwards compatibility is
+                                allowed to be empty. Instances of this type with an empty value here are
+                                almost certainly wrong.
+                                More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                              type: string
+                          type: object
+                          x-kubernetes-map-type: atomic
+                        volumeID:
+                          description: |-
+                            volumeID used to identify the volume in cinder.
+                            More info: https://examples.k8s.io/mysql-cinder-pd/README.md
+                          type: string
+                      required:
+                      - volumeID
+                      type: object
+                    configMap:
+                      description: configMap represents a configMap that should populate
+                        this volume
+                      properties:
+                        defaultMode:
+                          description: |-
+                            defaultMode is optional: mode bits used to set permissions on created files by default.
+                            Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511.
+                            YAML accepts both octal and decimal values, JSON requires decimal values for mode bits.
+                            Defaults to 0644.
+                            Directories within the path are not affected by this setting.
+                            This might be in conflict with other options that affect the file
+                            mode, like fsGroup, and the result can be other mode bits set.
+                          format: int32
+                          type: integer
+                        items:
+                          description: |-
+                            items if unspecified, each key-value pair in the Data field of the referenced
+                            ConfigMap will be projected into the volume as a file whose name is the
+                            key and content is the value. If specified, the listed keys will be
+                            projected into the specified paths, and unlisted keys will not be
+                            present. If a key is specified which is not present in the ConfigMap,
+                            the volume setup will error unless it is marked optional. Paths must be
+                            relative and may not contain the '..' path or start with '..'.
+                          items:
+                            description: Maps a string key to a path within a volume.
+                            properties:
+                              key:
+                                description: key is the key to project.
+                                type: string
+                              mode:
+                                description: |-
+                                  mode is Optional: mode bits used to set permissions on this file.
+                                  Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511.
+                                  YAML accepts both octal and decimal values, JSON requires decimal values for mode bits.
+                                  If not specified, the volume defaultMode will be used.
+                                  This might be in conflict with other options that affect the file
+                                  mode, like fsGroup, and the result can be other mode bits set.
+                                format: int32
+                                type: integer
+                              path:
+                                description: |-
+                                  path is the relative path of the file to map the key to.
+                                  May not be an absolute path.
+                                  May not contain the path element '..'.
+                                  May not start with the string '..'.
+                                type: string
+                            required:
+                            - key
+                            - path
+                            type: object
+                          type: array
+                          x-kubernetes-list-type: atomic
+                        name:
+                          default: ""
+                          description: |-
+                            Name of the referent.
+                            This field is effectively required, but due to backwards compatibility is
+                            allowed to be empty. Instances of this type with an empty value here are
+                            almost certainly wrong.
+                            More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                          type: string
+                        optional:
+                          description: optional specify whether the ConfigMap or its
+                            keys must be defined
+                          type: boolean
+                      type: object
+                      x-kubernetes-map-type: atomic
+                    csi:
+                      description: csi (Container Storage Interface) represents ephemeral
+                        storage that is handled by certain external CSI drivers.
+                      properties:
+                        driver:
+                          description: |-
+                            driver is the name of the CSI driver that handles this volume.
+                            Consult with your admin for the correct name as registered in the cluster.
+                          type: string
+                        fsType:
+                          description: |-
+                            fsType to mount. Ex. "ext4", "xfs", "ntfs".
+                            If not provided, the empty value is passed to the associated CSI driver
+                            which will determine the default filesystem to apply.
+                          type: string
+                        nodePublishSecretRef:
+                          description: |-
+                            nodePublishSecretRef is a reference to the secret object containing
+                            sensitive information to pass to the CSI driver to complete the CSI
+                            NodePublishVolume and NodeUnpublishVolume calls.
+                            This field is optional, and  may be empty if no secret is required. If the
+                            secret object contains more than one secret, all secret references are passed.
+                          properties:
+                            name:
+                              default: ""
+                              description: |-
+                                Name of the referent.
+                                This field is effectively required, but due to backwards compatibility is
+                                allowed to be empty. Instances of this type with an empty value here are
+                                almost certainly wrong.
+                                More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                              type: string
+                          type: object
+                          x-kubernetes-map-type: atomic
+                        readOnly:
+                          description: |-
+                            readOnly specifies a read-only configuration for the volume.
+                            Defaults to false (read/write).
+                          type: boolean
+                        volumeAttributes:
+                          additionalProperties:
+                            type: string
+                          description: |-
+                            volumeAttributes stores driver-specific properties that are passed to the CSI
+                            driver. Consult your driver's documentation for supported values.
+                          type: object
+                      required:
+                      - driver
+                      type: object
+                    downwardAPI:
+                      description: downwardAPI represents downward API about the pod
+                        that should populate this volume
+                      properties:
+                        defaultMode:
+                          description: |-
+                            Optional: mode bits to use on created files by default. Must be a
+                            Optional: mode bits used to set permissions on created files by default.
+                            Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511.
+                            YAML accepts both octal and decimal values, JSON requires decimal values for mode bits.
+                            Defaults to 0644.
+                            Directories within the path are not affected by this setting.
+                            This might be in conflict with other options that affect the file
+                            mode, like fsGroup, and the result can be other mode bits set.
+                          format: int32
+                          type: integer
+                        items:
+                          description: Items is a list of downward API volume file
+                          items:
+                            description: DownwardAPIVolumeFile represents information
+                              to create the file containing the pod field
+                            properties:
+                              fieldRef:
+                                description: 'Required: Selects a field of the pod:
+                                  only annotations, labels, name, namespace and uid
+                                  are supported.'
+                                properties:
+                                  apiVersion:
+                                    description: Version of the schema the FieldPath
+                                      is written in terms of, defaults to "v1".
+                                    type: string
+                                  fieldPath:
+                                    description: Path of the field to select in the
+                                      specified API version.
+                                    type: string
+                                required:
+                                - fieldPath
+                                type: object
+                                x-kubernetes-map-type: atomic
+                              mode:
+                                description: |-
+                                  Optional: mode bits used to set permissions on this file, must be an octal value
+                                  between 0000 and 0777 or a decimal value between 0 and 511.
+                                  YAML accepts both octal and decimal values, JSON requires decimal values for mode bits.
+                                  If not specified, the volume defaultMode will be used.
+                                  This might be in conflict with other options that affect the file
+                                  mode, like fsGroup, and the result can be other mode bits set.
+                                format: int32
+                                type: integer
+                              path:
+                                description: 'Required: Path is  the relative path
+                                  name of the file to be created. Must not be absolute
+                                  or contain the ''..'' path. Must be utf-8 encoded.
+                                  The first item of the relative path must not start
+                                  with ''..'''
+                                type: string
+                              resourceFieldRef:
+                                description: |-
+                                  Selects a resource of the container: only resources limits and requests
+                                  (limits.cpu, limits.memory, requests.cpu and requests.memory) are currently supported.
+                                properties:
+                                  containerName:
+                                    description: 'Container name: required for volumes,
+                                      optional for env vars'
+                                    type: string
+                                  divisor:
+                                    anyOf:
+                                    - type: integer
+                                    - type: string
+                                    description: Specifies the output format of the
+                                      exposed resources, defaults to "1"
+                                    pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                                    x-kubernetes-int-or-string: true
+                                  resource:
+                                    description: 'Required: resource to select'
+                                    type: string
+                                required:
+                                - resource
+                                type: object
+                                x-kubernetes-map-type: atomic
+                            required:
+                            - path
+                            type: object
+                          type: array
+                          x-kubernetes-list-type: atomic
+                      type: object
+                    emptyDir:
+                      description: |-
+                        emptyDir represents a temporary directory that shares a pod's lifetime.
+                        More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir
+                      properties:
+                        medium:
+                          description: |-
+                            medium represents what type of storage medium should back this directory.
+                            The default is "" which means to use the node's default medium.
+                            Must be an empty string (default) or Memory.
+                            More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir
+                          type: string
+                        sizeLimit:
+                          anyOf:
+                          - type: integer
+                          - type: string
+                          description: |-
+                            sizeLimit is the total amount of local storage required for this EmptyDir volume.
+                            The size limit is also applicable for memory medium.
+                            The maximum usage on memory medium EmptyDir would be the minimum value between
+                            the SizeLimit specified here and the sum of memory limits of all containers in a pod.
+                            The default is nil which means that the limit is undefined.
+                            More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir
+                          pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                          x-kubernetes-int-or-string: true
+                      type: object
+                    ephemeral:
+                      description: |-
+                        ephemeral represents a volume that is handled by a cluster storage driver.
+                        The volume's lifecycle is tied to the pod that defines it - it will be created before the pod starts,
+                        and deleted when the pod is removed.
+
+                        Use this if:
+                        a) the volume is only needed while the pod runs,
+                        b) features of normal volumes like restoring from snapshot or capacity
+                           tracking are needed,
+                        c) the storage driver is specified through a storage class, and
+                        d) the storage driver supports dynamic volume provisioning through
+                           a PersistentVolumeClaim (see EphemeralVolumeSource for more
+                           information on the connection between this volume type
+                           and PersistentVolumeClaim).
+
+                        Use PersistentVolumeClaim or one of the vendor-specific
+                        APIs for volumes that persist for longer than the lifecycle
+                        of an individual pod.
+
+                        Use CSI for light-weight local ephemeral volumes if the CSI driver is meant to
+                        be used that way - see the documentation of the driver for
+                        more information.
+
+                        A pod can use both types of ephemeral volumes and
+                        persistent volumes at the same time.
+                      properties:
+                        volumeClaimTemplate:
+                          description: |-
+                            Will be used to create a stand-alone PVC to provision the volume.
+                            The pod in which this EphemeralVolumeSource is embedded will be the
+                            owner of the PVC, i.e. the PVC will be deleted together with the
+                            pod.  The name of the PVC will be `<pod name>-<volume name>` where
+                            `<volume name>` is the name from the `PodSpec.Volumes` array
+                            entry. Pod validation will reject the pod if the concatenated name
+                            is not valid for a PVC (for example, too long).
+
+                            An existing PVC with that name that is not owned by the pod
+                            will *not* be used for the pod to avoid using an unrelated
+                            volume by mistake. Starting the pod is then blocked until
+                            the unrelated PVC is removed. If such a pre-created PVC is
+                            meant to be used by the pod, the PVC has to updated with an
+                            owner reference to the pod once the pod exists. Normally
+                            this should not be necessary, but it may be useful when
+                            manually reconstructing a broken cluster.
+
+                            This field is read-only and no changes will be made by Kubernetes
+                            to the PVC after it has been created.
+
+                            Required, must not be nil.
+                          properties:
+                            metadata:
+                              description: |-
+                                May contain labels and annotations that will be copied into the PVC
+                                when creating it. No other fields are allowed and will be rejected during
+                                validation.
+                              type: object
+                            spec:
+                              description: |-
+                                The specification for the PersistentVolumeClaim. The entire content is
+                                copied unchanged into the PVC that gets created from this
+                                template. The same fields as in a PersistentVolumeClaim
+                                are also valid here.
+                              properties:
+                                accessModes:
+                                  description: |-
+                                    accessModes contains the desired access modes the volume should have.
+                                    More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1
+                                  items:
+                                    type: string
+                                  type: array
+                                  x-kubernetes-list-type: atomic
+                                dataSource:
+                                  description: |-
+                                    dataSource field can be used to specify either:
+                                    * An existing VolumeSnapshot object (snapshot.storage.k8s.io/VolumeSnapshot)
+                                    * An existing PVC (PersistentVolumeClaim)
+                                    If the provisioner or an external controller can support the specified data source,
+                                    it will create a new volume based on the contents of the specified data source.
+                                    When the AnyVolumeDataSource feature gate is enabled, dataSource contents will be copied to dataSourceRef,
+                                    and dataSourceRef contents will be copied to dataSource when dataSourceRef.namespace is not specified.
+                                    If the namespace is specified, then dataSourceRef will not be copied to dataSource.
+                                  properties:
+                                    apiGroup:
+                                      description: |-
+                                        APIGroup is the group for the resource being referenced.
+                                        If APIGroup is not specified, the specified Kind must be in the core API group.
+                                        For any other third-party types, APIGroup is required.
+                                      type: string
+                                    kind:
+                                      description: Kind is the type of resource being
+                                        referenced
+                                      type: string
+                                    name:
+                                      description: Name is the name of resource being
+                                        referenced
+                                      type: string
+                                  required:
+                                  - kind
+                                  - name
+                                  type: object
+                                  x-kubernetes-map-type: atomic
+                                dataSourceRef:
+                                  description: |-
+                                    dataSourceRef specifies the object from which to populate the volume with data, if a non-empty
+                                    volume is desired. This may be any object from a non-empty API group (non
+                                    core object) or a PersistentVolumeClaim object.
+                                    When this field is specified, volume binding will only succeed if the type of
+                                    the specified object matches some installed volume populator or dynamic
+                                    provisioner.
+                                    This field will replace the functionality of the dataSource field and as such
+                                    if both fields are non-empty, they must have the same value. For backwards
+                                    compatibility, when namespace isn't specified in dataSourceRef,
+                                    both fields (dataSource and dataSourceRef) will be set to the same
+                                    value automatically if one of them is empty and the other is non-empty.
+                                    When namespace is specified in dataSourceRef,
+                                    dataSource isn't set to the same value and must be empty.
+                                    There are three important differences between dataSource and dataSourceRef:
+                                    * While dataSource only allows two specific types of objects, dataSourceRef
+                                      allows any non-core object, as well as PersistentVolumeClaim objects.
+                                    * While dataSource ignores disallowed values (dropping them), dataSourceRef
+                                      preserves all values, and generates an error if a disallowed value is
+                                      specified.
+                                    * While dataSource only allows local objects, dataSourceRef allows objects
+                                      in any namespaces.
+                                    (Beta) Using this field requires the AnyVolumeDataSource feature gate to be enabled.
+                                    (Alpha) Using the namespace field of dataSourceRef requires the CrossNamespaceVolumeDataSource feature gate to be enabled.
+                                  properties:
+                                    apiGroup:
+                                      description: |-
+                                        APIGroup is the group for the resource being referenced.
+                                        If APIGroup is not specified, the specified Kind must be in the core API group.
+                                        For any other third-party types, APIGroup is required.
+                                      type: string
+                                    kind:
+                                      description: Kind is the type of resource being
+                                        referenced
+                                      type: string
+                                    name:
+                                      description: Name is the name of resource being
+                                        referenced
+                                      type: string
+                                    namespace:
+                                      description: |-
+                                        Namespace is the namespace of resource being referenced
+                                        Note that when a namespace is specified, a gateway.networking.k8s.io/ReferenceGrant object is required in the referent namespace to allow that namespace's owner to accept the reference. See the ReferenceGrant documentation for details.
+                                        (Alpha) This field requires the CrossNamespaceVolumeDataSource feature gate to be enabled.
+                                      type: string
+                                  required:
+                                  - kind
+                                  - name
+                                  type: object
+                                resources:
+                                  description: |-
+                                    resources represents the minimum resources the volume should have.
+                                    Users are allowed to specify resource requirements
+                                    that are lower than previous value but must still be higher than capacity recorded in the
+                                    status field of the claim.
+                                    More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#resources
+                                  properties:
+                                    limits:
+                                      additionalProperties:
+                                        anyOf:
+                                        - type: integer
+                                        - type: string
+                                        pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                                        x-kubernetes-int-or-string: true
+                                      description: |-
+                                        Limits describes the maximum amount of compute resources allowed.
+                                        More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
+                                      type: object
+                                    requests:
+                                      additionalProperties:
+                                        anyOf:
+                                        - type: integer
+                                        - type: string
+                                        pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                                        x-kubernetes-int-or-string: true
+                                      description: |-
+                                        Requests describes the minimum amount of compute resources required.
+                                        If Requests is omitted for a container, it defaults to Limits if that is explicitly specified,
+                                        otherwise to an implementation-defined value. Requests cannot exceed Limits.
+                                        More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
+                                      type: object
+                                  type: object
+                                selector:
+                                  description: selector is a label query over volumes
+                                    to consider for binding.
+                                  properties:
+                                    matchExpressions:
+                                      description: matchExpressions is a list of label
+                                        selector requirements. The requirements are
+                                        ANDed.
+                                      items:
+                                        description: |-
+                                          A label selector requirement is a selector that contains values, a key, and an operator that
+                                          relates the key and values.
+                                        properties:
+                                          key:
+                                            description: key is the label key that
+                                              the selector applies to.
+                                            type: string
+                                          operator:
+                                            description: |-
+                                              operator represents a key's relationship to a set of values.
+                                              Valid operators are In, NotIn, Exists and DoesNotExist.
+                                            type: string
+                                          values:
+                                            description: |-
+                                              values is an array of string values. If the operator is In or NotIn,
+                                              the values array must be non-empty. If the operator is Exists or DoesNotExist,
+                                              the values array must be empty. This array is replaced during a strategic
+                                              merge patch.
+                                            items:
+                                              type: string
+                                            type: array
+                                            x-kubernetes-list-type: atomic
+                                        required:
+                                        - key
+                                        - operator
+                                        type: object
+                                      type: array
+                                      x-kubernetes-list-type: atomic
+                                    matchLabels:
+                                      additionalProperties:
+                                        type: string
+                                      description: |-
+                                        matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels
+                                        map is equivalent to an element of matchExpressions, whose key field is "key", the
+                                        operator is "In", and the values array contains only "value". The requirements are ANDed.
+                                      type: object
+                                  type: object
+                                  x-kubernetes-map-type: atomic
+                                storageClassName:
+                                  description: |-
+                                    storageClassName is the name of the StorageClass required by the claim.
+                                    More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1
+                                  type: string
+                                volumeAttributesClassName:
+                                  description: |-
+                                    volumeAttributesClassName may be used to set the VolumeAttributesClass used by this claim.
+                                    If specified, the CSI driver will create or update the volume with the attributes defined
+                                    in the corresponding VolumeAttributesClass. This has a different purpose than storageClassName,
+                                    it can be changed after the claim is created. An empty string or nil value indicates that no
+                                    VolumeAttributesClass will be applied to the claim. If the claim enters an Infeasible error state,
+                                    this field can be reset to its previous value (including nil) to cancel the modification.
+                                    If the resource referred to by volumeAttributesClass does not exist, this PersistentVolumeClaim will be
+                                    set to a Pending state, as reflected by the modifyVolumeStatus field, until such as a resource
+                                    exists.
+                                    More info: https://kubernetes.io/docs/concepts/storage/volume-attributes-classes/
+                                  type: string
+                                volumeMode:
+                                  description: |-
+                                    volumeMode defines what type of volume is required by the claim.
+                                    Value of Filesystem is implied when not included in claim spec.
+                                  type: string
+                                volumeName:
+                                  description: volumeName is the binding reference
+                                    to the PersistentVolume backing this claim.
+                                  type: string
+                              type: object
+                          required:
+                          - spec
+                          type: object
+                      type: object
+                    fc:
+                      description: fc represents a Fibre Channel resource that is
+                        attached to a kubelet's host machine and then exposed to the
+                        pod.
+                      properties:
+                        fsType:
+                          description: |-
+                            fsType is the filesystem type to mount.
+                            Must be a filesystem type supported by the host operating system.
+                            Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified.
+                          type: string
+                        lun:
+                          description: 'lun is Optional: FC target lun number'
+                          format: int32
+                          type: integer
+                        readOnly:
+                          description: |-
+                            readOnly is Optional: Defaults to false (read/write). ReadOnly here will force
+                            the ReadOnly setting in VolumeMounts.
+                          type: boolean
+                        targetWWNs:
+                          description: 'targetWWNs is Optional: FC target worldwide
+                            names (WWNs)'
+                          items:
+                            type: string
+                          type: array
+                          x-kubernetes-list-type: atomic
+                        wwids:
+                          description: |-
+                            wwids Optional: FC volume world wide identifiers (wwids)
+                            Either wwids or combination of targetWWNs and lun must be set, but not both simultaneously.
+                          items:
+                            type: string
+                          type: array
+                          x-kubernetes-list-type: atomic
+                      type: object
+                    flexVolume:
+                      description: |-
+                        flexVolume represents a generic volume resource that is
+                        provisioned/attached using an exec based plugin.
+                        Deprecated: FlexVolume is deprecated. Consider using a CSIDriver instead.
+                      properties:
+                        driver:
+                          description: driver is the name of the driver to use for
+                            this volume.
+                          type: string
+                        fsType:
+                          description: |-
+                            fsType is the filesystem type to mount.
+                            Must be a filesystem type supported by the host operating system.
+                            Ex. "ext4", "xfs", "ntfs". The default filesystem depends on FlexVolume script.
+                          type: string
+                        options:
+                          additionalProperties:
+                            type: string
+                          description: 'options is Optional: this field holds extra
+                            command options if any.'
+                          type: object
+                        readOnly:
+                          description: |-
+                            readOnly is Optional: defaults to false (read/write). ReadOnly here will force
+                            the ReadOnly setting in VolumeMounts.
+                          type: boolean
+                        secretRef:
+                          description: |-
+                            secretRef is Optional: secretRef is reference to the secret object containing
+                            sensitive information to pass to the plugin scripts. This may be
+                            empty if no secret object is specified. If the secret object
+                            contains more than one secret, all secrets are passed to the plugin
+                            scripts.
+                          properties:
+                            name:
+                              default: ""
+                              description: |-
+                                Name of the referent.
+                                This field is effectively required, but due to backwards compatibility is
+                                allowed to be empty. Instances of this type with an empty value here are
+                                almost certainly wrong.
+                                More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                              type: string
+                          type: object
+                          x-kubernetes-map-type: atomic
+                      required:
+                      - driver
+                      type: object
+                    flocker:
+                      description: |-
+                        flocker represents a Flocker volume attached to a kubelet's host machine. This depends on the Flocker control service being running.
+                        Deprecated: Flocker is deprecated and the in-tree flocker type is no longer supported.
+                      properties:
+                        datasetName:
+                          description: |-
+                            datasetName is Name of the dataset stored as metadata -> name on the dataset for Flocker
+                            should be considered as deprecated
+                          type: string
+                        datasetUUID:
+                          description: datasetUUID is the UUID of the dataset. This
+                            is unique identifier of a Flocker dataset
+                          type: string
+                      type: object
+                    gcePersistentDisk:
+                      description: |-
+                        gcePersistentDisk represents a GCE Disk resource that is attached to a
+                        kubelet's host machine and then exposed to the pod.
+                        Deprecated: GCEPersistentDisk is deprecated. All operations for the in-tree
+                        gcePersistentDisk type are redirected to the pd.csi.storage.gke.io CSI driver.
+                        More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk
+                      properties:
+                        fsType:
+                          description: |-
+                            fsType is filesystem type of the volume that you want to mount.
+                            Tip: Ensure that the filesystem type is supported by the host operating system.
+                            Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified.
+                            More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk
+                          type: string
+                        partition:
+                          description: |-
+                            partition is the partition in the volume that you want to mount.
+                            If omitted, the default is to mount by volume name.
+                            Examples: For volume /dev/sda1, you specify the partition as "1".
+                            Similarly, the volume partition for /dev/sda is "0" (or you can leave the property empty).
+                            More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk
+                          format: int32
+                          type: integer
+                        pdName:
+                          description: |-
+                            pdName is unique name of the PD resource in GCE. Used to identify the disk in GCE.
+                            More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk
+                          type: string
+                        readOnly:
+                          description: |-
+                            readOnly here will force the ReadOnly setting in VolumeMounts.
+                            Defaults to false.
+                            More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk
+                          type: boolean
+                      required:
+                      - pdName
+                      type: object
+                    gitRepo:
+                      description: |-
+                        gitRepo represents a git repository at a particular revision.
+                        Deprecated: GitRepo is deprecated. To provision a container with a git repo, mount an
+                        EmptyDir into an InitContainer that clones the repo using git, then mount the EmptyDir
+                        into the Pod's container.
+                      properties:
+                        directory:
+                          description: |-
+                            directory is the target directory name.
+                            Must not contain or start with '..'.  If '.' is supplied, the volume directory will be the
+                            git repository.  Otherwise, if specified, the volume will contain the git repository in
+                            the subdirectory with the given name.
+                          type: string
+                        repository:
+                          description: repository is the URL
+                          type: string
+                        revision:
+                          description: revision is the commit hash for the specified
+                            revision.
+                          type: string
+                      required:
+                      - repository
+                      type: object
+                    glusterfs:
+                      description: |-
+                        glusterfs represents a Glusterfs mount on the host that shares a pod's lifetime.
+                        Deprecated: Glusterfs is deprecated and the in-tree glusterfs type is no longer supported.
+                      properties:
+                        endpoints:
+                          description: endpoints is the endpoint name that details
+                            Glusterfs topology.
+                          type: string
+                        path:
+                          description: |-
+                            path is the Glusterfs volume path.
+                            More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod
+                          type: string
+                        readOnly:
+                          description: |-
+                            readOnly here will force the Glusterfs volume to be mounted with read-only permissions.
+                            Defaults to false.
+                            More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod
+                          type: boolean
+                      required:
+                      - endpoints
+                      - path
+                      type: object
+                    hostPath:
+                      description: |-
+                        hostPath represents a pre-existing file or directory on the host
+                        machine that is directly exposed to the container. This is generally
+                        used for system agents or other privileged things that are allowed
+                        to see the host machine. Most containers will NOT need this.
+                        More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath
+                      properties:
+                        path:
+                          description: |-
+                            path of the directory on the host.
+                            If the path is a symlink, it will follow the link to the real path.
+                            More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath
+                          type: string
+                        type:
+                          description: |-
+                            type for HostPath Volume
+                            Defaults to ""
+                            More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath
+                          type: string
+                      required:
+                      - path
+                      type: object
+                    image:
+                      description: |-
+                        image represents an OCI object (a container image or artifact) pulled and mounted on the kubelet's host machine.
+                        The volume is resolved at pod startup depending on which PullPolicy value is provided:
+
+                        - Always: the kubelet always attempts to pull the reference. Container creation will fail If the pull fails.
+                        - Never: the kubelet never pulls the reference and only uses a local image or artifact. Container creation will fail if the reference isn't present.
+                        - IfNotPresent: the kubelet pulls if the reference isn't already present on disk. Container creation will fail if the reference isn't present and the pull fails.
+
+                        The volume gets re-resolved if the pod gets deleted and recreated, which means that new remote content will become available on pod recreation.
+                        A failure to resolve or pull the image during pod startup will block containers from starting and may add significant latency. Failures will be retried using normal volume backoff and will be reported on the pod reason and message.
+                        The types of objects that may be mounted by this volume are defined by the container runtime implementation on a host machine and at minimum must include all valid types supported by the container image field.
+                        The OCI object gets mounted in a single directory (spec.containers[*].volumeMounts.mountPath) by merging the manifest layers in the same way as for container images.
+                        The volume will be mounted read-only (ro) and non-executable files (noexec).
+                        Sub path mounts for containers are not supported (spec.containers[*].volumeMounts.subpath) before 1.33.
+                        The field spec.securityContext.fsGroupChangePolicy has no effect on this volume type.
+                      properties:
+                        pullPolicy:
+                          description: |-
+                            Policy for pulling OCI objects. Possible values are:
+                            Always: the kubelet always attempts to pull the reference. Container creation will fail If the pull fails.
+                            Never: the kubelet never pulls the reference and only uses a local image or artifact. Container creation will fail if the reference isn't present.
+                            IfNotPresent: the kubelet pulls if the reference isn't already present on disk. Container creation will fail if the reference isn't present and the pull fails.
+                            Defaults to Always if :latest tag is specified, or IfNotPresent otherwise.
+                          type: string
+                        reference:
+                          description: |-
+                            Required: Image or artifact reference to be used.
+                            Behaves in the same way as pod.spec.containers[*].image.
+                            Pull secrets will be assembled in the same way as for the container image by looking up node credentials, SA image pull secrets, and pod spec image pull secrets.
+                            More info: https://kubernetes.io/docs/concepts/containers/images
+                            This field is optional to allow higher level config management to default or override
+                            container images in workload controllers like Deployments and StatefulSets.
+                          type: string
+                      type: object
+                    iscsi:
+                      description: |-
+                        iscsi represents an ISCSI Disk resource that is attached to a
+                        kubelet's host machine and then exposed to the pod.
+                        More info: https://kubernetes.io/docs/concepts/storage/volumes/#iscsi
+                      properties:
+                        chapAuthDiscovery:
+                          description: chapAuthDiscovery defines whether support iSCSI
+                            Discovery CHAP authentication
+                          type: boolean
+                        chapAuthSession:
+                          description: chapAuthSession defines whether support iSCSI
+                            Session CHAP authentication
+                          type: boolean
+                        fsType:
+                          description: |-
+                            fsType is the filesystem type of the volume that you want to mount.
+                            Tip: Ensure that the filesystem type is supported by the host operating system.
+                            Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified.
+                            More info: https://kubernetes.io/docs/concepts/storage/volumes#iscsi
+                          type: string
+                        initiatorName:
+                          description: |-
+                            initiatorName is the custom iSCSI Initiator Name.
+                            If initiatorName is specified with iscsiInterface simultaneously, new iSCSI interface
+                            <target portal>:<volume name> will be created for the connection.
+                          type: string
+                        iqn:
+                          description: iqn is the target iSCSI Qualified Name.
+                          type: string
+                        iscsiInterface:
+                          default: default
+                          description: |-
+                            iscsiInterface is the interface Name that uses an iSCSI transport.
+                            Defaults to 'default' (tcp).
+                          type: string
+                        lun:
+                          description: lun represents iSCSI Target Lun number.
+                          format: int32
+                          type: integer
+                        portals:
+                          description: |-
+                            portals is the iSCSI Target Portal List. The portal is either an IP or ip_addr:port if the port
+                            is other than default (typically TCP ports 860 and 3260).
+                          items:
+                            type: string
+                          type: array
+                          x-kubernetes-list-type: atomic
+                        readOnly:
+                          description: |-
+                            readOnly here will force the ReadOnly setting in VolumeMounts.
+                            Defaults to false.
+                          type: boolean
+                        secretRef:
+                          description: secretRef is the CHAP Secret for iSCSI target
+                            and initiator authentication
+                          properties:
+                            name:
+                              default: ""
+                              description: |-
+                                Name of the referent.
+                                This field is effectively required, but due to backwards compatibility is
+                                allowed to be empty. Instances of this type with an empty value here are
+                                almost certainly wrong.
+                                More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                              type: string
+                          type: object
+                          x-kubernetes-map-type: atomic
+                        targetPortal:
+                          description: |-
+                            targetPortal is iSCSI Target Portal. The Portal is either an IP or ip_addr:port if the port
+                            is other than default (typically TCP ports 860 and 3260).
+                          type: string
+                      required:
+                      - iqn
+                      - lun
+                      - targetPortal
+                      type: object
+                    name:
+                      description: |-
+                        name of the volume.
+                        Must be a DNS_LABEL and unique within the pod.
+                        More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                      type: string
+                    nfs:
+                      description: |-
+                        nfs represents an NFS mount on the host that shares a pod's lifetime
+                        More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs
+                      properties:
+                        path:
+                          description: |-
+                            path that is exported by the NFS server.
+                            More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs
+                          type: string
+                        readOnly:
+                          description: |-
+                            readOnly here will force the NFS export to be mounted with read-only permissions.
+                            Defaults to false.
+                            More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs
+                          type: boolean
+                        server:
+                          description: |-
+                            server is the hostname or IP address of the NFS server.
+                            More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs
+                          type: string
+                      required:
+                      - path
+                      - server
+                      type: object
+                    persistentVolumeClaim:
+                      description: |-
+                        persistentVolumeClaimVolumeSource represents a reference to a
+                        PersistentVolumeClaim in the same namespace.
+                        More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#persistentvolumeclaims
+                      properties:
+                        claimName:
+                          description: |-
+                            claimName is the name of a PersistentVolumeClaim in the same namespace as the pod using this volume.
+                            More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#persistentvolumeclaims
+                          type: string
+                        readOnly:
+                          description: |-
+                            readOnly Will force the ReadOnly setting in VolumeMounts.
+                            Default false.
+                          type: boolean
+                      required:
+                      - claimName
+                      type: object
+                    photonPersistentDisk:
+                      description: |-
+                        photonPersistentDisk represents a PhotonController persistent disk attached and mounted on kubelets host machine.
+                        Deprecated: PhotonPersistentDisk is deprecated and the in-tree photonPersistentDisk type is no longer supported.
+                      properties:
+                        fsType:
+                          description: |-
+                            fsType is the filesystem type to mount.
+                            Must be a filesystem type supported by the host operating system.
+                            Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified.
+                          type: string
+                        pdID:
+                          description: pdID is the ID that identifies Photon Controller
+                            persistent disk
+                          type: string
+                      required:
+                      - pdID
+                      type: object
+                    portworxVolume:
+                      description: |-
+                        portworxVolume represents a portworx volume attached and mounted on kubelets host machine.
+                        Deprecated: PortworxVolume is deprecated. All operations for the in-tree portworxVolume type
+                        are redirected to the pxd.portworx.com CSI driver when the CSIMigrationPortworx feature-gate
+                        is on.
+                      properties:
+                        fsType:
+                          description: |-
+                            fSType represents the filesystem type to mount
+                            Must be a filesystem type supported by the host operating system.
+                            Ex. "ext4", "xfs". Implicitly inferred to be "ext4" if unspecified.
+                          type: string
+                        readOnly:
+                          description: |-
+                            readOnly defaults to false (read/write). ReadOnly here will force
+                            the ReadOnly setting in VolumeMounts.
+                          type: boolean
+                        volumeID:
+                          description: volumeID uniquely identifies a Portworx volume
+                          type: string
+                      required:
+                      - volumeID
+                      type: object
+                    projected:
+                      description: projected items for all in one resources secrets,
+                        configmaps, and downward API
+                      properties:
+                        defaultMode:
+                          description: |-
+                            defaultMode are the mode bits used to set permissions on created files by default.
+                            Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511.
+                            YAML accepts both octal and decimal values, JSON requires decimal values for mode bits.
+                            Directories within the path are not affected by this setting.
+                            This might be in conflict with other options that affect the file
+                            mode, like fsGroup, and the result can be other mode bits set.
+                          format: int32
+                          type: integer
+                        sources:
+                          description: |-
+                            sources is the list of volume projections. Each entry in this list
+                            handles one source.
+                          items:
+                            description: |-
+                              Projection that may be projected along with other supported volume types.
+                              Exactly one of these fields must be set.
+                            properties:
+                              clusterTrustBundle:
+                                description: |-
+                                  ClusterTrustBundle allows a pod to access the `.spec.trustBundle` field
+                                  of ClusterTrustBundle objects in an auto-updating file.
+
+                                  Alpha, gated by the ClusterTrustBundleProjection feature gate.
+
+                                  ClusterTrustBundle objects can either be selected by name, or by the
+                                  combination of signer name and a label selector.
+
+                                  Kubelet performs aggressive normalization of the PEM contents written
+                                  into the pod filesystem.  Esoteric PEM features such as inter-block
+                                  comments and block headers are stripped.  Certificates are deduplicated.
+                                  The ordering of certificates within the file is arbitrary, and Kubelet
+                                  may change the order over time.
+                                properties:
+                                  labelSelector:
+                                    description: |-
+                                      Select all ClusterTrustBundles that match this label selector.  Only has
+                                      effect if signerName is set.  Mutually-exclusive with name.  If unset,
+                                      interpreted as "match nothing".  If set but empty, interpreted as "match
+                                      everything".
+                                    properties:
+                                      matchExpressions:
+                                        description: matchExpressions is a list of
+                                          label selector requirements. The requirements
+                                          are ANDed.
+                                        items:
+                                          description: |-
+                                            A label selector requirement is a selector that contains values, a key, and an operator that
+                                            relates the key and values.
+                                          properties:
+                                            key:
+                                              description: key is the label key that
+                                                the selector applies to.
+                                              type: string
+                                            operator:
+                                              description: |-
+                                                operator represents a key's relationship to a set of values.
+                                                Valid operators are In, NotIn, Exists and DoesNotExist.
+                                              type: string
+                                            values:
+                                              description: |-
+                                                values is an array of string values. If the operator is In or NotIn,
+                                                the values array must be non-empty. If the operator is Exists or DoesNotExist,
+                                                the values array must be empty. This array is replaced during a strategic
+                                                merge patch.
+                                              items:
+                                                type: string
+                                              type: array
+                                              x-kubernetes-list-type: atomic
+                                          required:
+                                          - key
+                                          - operator
+                                          type: object
+                                        type: array
+                                        x-kubernetes-list-type: atomic
+                                      matchLabels:
+                                        additionalProperties:
+                                          type: string
+                                        description: |-
+                                          matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels
+                                          map is equivalent to an element of matchExpressions, whose key field is "key", the
+                                          operator is "In", and the values array contains only "value". The requirements are ANDed.
+                                        type: object
+                                    type: object
+                                    x-kubernetes-map-type: atomic
+                                  name:
+                                    description: |-
+                                      Select a single ClusterTrustBundle by object name.  Mutually-exclusive
+                                      with signerName and labelSelector.
+                                    type: string
+                                  optional:
+                                    description: |-
+                                      If true, don't block pod startup if the referenced ClusterTrustBundle(s)
+                                      aren't available.  If using name, then the named ClusterTrustBundle is
+                                      allowed not to exist.  If using signerName, then the combination of
+                                      signerName and labelSelector is allowed to match zero
+                                      ClusterTrustBundles.
+                                    type: boolean
+                                  path:
+                                    description: Relative path from the volume root
+                                      to write the bundle.
+                                    type: string
+                                  signerName:
+                                    description: |-
+                                      Select all ClusterTrustBundles that match this signer name.
+                                      Mutually-exclusive with name.  The contents of all selected
+                                      ClusterTrustBundles will be unified and deduplicated.
+                                    type: string
+                                required:
+                                - path
+                                type: object
+                              configMap:
+                                description: configMap information about the configMap
+                                  data to project
+                                properties:
+                                  items:
+                                    description: |-
+                                      items if unspecified, each key-value pair in the Data field of the referenced
+                                      ConfigMap will be projected into the volume as a file whose name is the
+                                      key and content is the value. If specified, the listed keys will be
+                                      projected into the specified paths, and unlisted keys will not be
+                                      present. If a key is specified which is not present in the ConfigMap,
+                                      the volume setup will error unless it is marked optional. Paths must be
+                                      relative and may not contain the '..' path or start with '..'.
+                                    items:
+                                      description: Maps a string key to a path within
+                                        a volume.
+                                      properties:
+                                        key:
+                                          description: key is the key to project.
+                                          type: string
+                                        mode:
+                                          description: |-
+                                            mode is Optional: mode bits used to set permissions on this file.
+                                            Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511.
+                                            YAML accepts both octal and decimal values, JSON requires decimal values for mode bits.
+                                            If not specified, the volume defaultMode will be used.
+                                            This might be in conflict with other options that affect the file
+                                            mode, like fsGroup, and the result can be other mode bits set.
+                                          format: int32
+                                          type: integer
+                                        path:
+                                          description: |-
+                                            path is the relative path of the file to map the key to.
+                                            May not be an absolute path.
+                                            May not contain the path element '..'.
+                                            May not start with the string '..'.
+                                          type: string
+                                      required:
+                                      - key
+                                      - path
+                                      type: object
+                                    type: array
+                                    x-kubernetes-list-type: atomic
+                                  name:
+                                    default: ""
+                                    description: |-
+                                      Name of the referent.
+                                      This field is effectively required, but due to backwards compatibility is
+                                      allowed to be empty. Instances of this type with an empty value here are
+                                      almost certainly wrong.
+                                      More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                                    type: string
+                                  optional:
+                                    description: optional specify whether the ConfigMap
+                                      or its keys must be defined
+                                    type: boolean
+                                type: object
+                                x-kubernetes-map-type: atomic
+                              downwardAPI:
+                                description: downwardAPI information about the downwardAPI
+                                  data to project
+                                properties:
+                                  items:
+                                    description: Items is a list of DownwardAPIVolume
+                                      file
+                                    items:
+                                      description: DownwardAPIVolumeFile represents
+                                        information to create the file containing
+                                        the pod field
+                                      properties:
+                                        fieldRef:
+                                          description: 'Required: Selects a field
+                                            of the pod: only annotations, labels,
+                                            name, namespace and uid are supported.'
+                                          properties:
+                                            apiVersion:
+                                              description: Version of the schema the
+                                                FieldPath is written in terms of,
+                                                defaults to "v1".
+                                              type: string
+                                            fieldPath:
+                                              description: Path of the field to select
+                                                in the specified API version.
+                                              type: string
+                                          required:
+                                          - fieldPath
+                                          type: object
+                                          x-kubernetes-map-type: atomic
+                                        mode:
+                                          description: |-
+                                            Optional: mode bits used to set permissions on this file, must be an octal value
+                                            between 0000 and 0777 or a decimal value between 0 and 511.
+                                            YAML accepts both octal and decimal values, JSON requires decimal values for mode bits.
+                                            If not specified, the volume defaultMode will be used.
+                                            This might be in conflict with other options that affect the file
+                                            mode, like fsGroup, and the result can be other mode bits set.
+                                          format: int32
+                                          type: integer
+                                        path:
+                                          description: 'Required: Path is  the relative
+                                            path name of the file to be created. Must
+                                            not be absolute or contain the ''..''
+                                            path. Must be utf-8 encoded. The first
+                                            item of the relative path must not start
+                                            with ''..'''
+                                          type: string
+                                        resourceFieldRef:
+                                          description: |-
+                                            Selects a resource of the container: only resources limits and requests
+                                            (limits.cpu, limits.memory, requests.cpu and requests.memory) are currently supported.
+                                          properties:
+                                            containerName:
+                                              description: 'Container name: required
+                                                for volumes, optional for env vars'
+                                              type: string
+                                            divisor:
+                                              anyOf:
+                                              - type: integer
+                                              - type: string
+                                              description: Specifies the output format
+                                                of the exposed resources, defaults
+                                                to "1"
+                                              pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                                              x-kubernetes-int-or-string: true
+                                            resource:
+                                              description: 'Required: resource to
+                                                select'
+                                              type: string
+                                          required:
+                                          - resource
+                                          type: object
+                                          x-kubernetes-map-type: atomic
+                                      required:
+                                      - path
+                                      type: object
+                                    type: array
+                                    x-kubernetes-list-type: atomic
+                                type: object
+                              podCertificate:
+                                description: |-
+                                  Projects an auto-rotating credential bundle (private key and certificate
+                                  chain) that the pod can use either as a TLS client or server.
+
+                                  Kubelet generates a private key and uses it to send a
+                                  PodCertificateRequest to the named signer.  Once the signer approves the
+                                  request and issues a certificate chain, Kubelet writes the key and
+                                  certificate chain to the pod filesystem.  The pod does not start until
+                                  certificates have been issued for each podCertificate projected volume
+                                  source in its spec.
+
+                                  Kubelet will begin trying to rotate the certificate at the time indicated
+                                  by the signer using the PodCertificateRequest.Status.BeginRefreshAt
+                                  timestamp.
+
+                                  Kubelet can write a single file, indicated by the credentialBundlePath
+                                  field, or separate files, indicated by the keyPath and
+                                  certificateChainPath fields.
+
+                                  The credential bundle is a single file in PEM format.  The first PEM
+                                  entry is the private key (in PKCS#8 format), and the remaining PEM
+                                  entries are the certificate chain issued by the signer (typically,
+                                  signers will return their certificate chain in leaf-to-root order).
+
+                                  Prefer using the credential bundle format, since your application code
+                                  can read it atomically.  If you use keyPath and certificateChainPath,
+                                  your application must make two separate file reads. If these coincide
+                                  with a certificate rotation, it is possible that the private key and leaf
+                                  certificate you read may not correspond to each other.  Your application
+                                  will need to check for this condition, and re-read until they are
+                                  consistent.
+
+                                  The named signer controls chooses the format of the certificate it
+                                  issues; consult the signer implementation's documentation to learn how to
+                                  use the certificates it issues.
+                                properties:
+                                  certificateChainPath:
+                                    description: |-
+                                      Write the certificate chain at this path in the projected volume.
+
+                                      Most applications should use credentialBundlePath.  When using keyPath
+                                      and certificateChainPath, your application needs to check that the key
+                                      and leaf certificate are consistent, because it is possible to read the
+                                      files mid-rotation.
+                                    type: string
+                                  credentialBundlePath:
+                                    description: |-
+                                      Write the credential bundle at this path in the projected volume.
+
+                                      The credential bundle is a single file that contains multiple PEM blocks.
+                                      The first PEM block is a PRIVATE KEY block, containing a PKCS#8 private
+                                      key.
+
+                                      The remaining blocks are CERTIFICATE blocks, containing the issued
+                                      certificate chain from the signer (leaf and any intermediates).
+
+                                      Using credentialBundlePath lets your Pod's application code make a single
+                                      atomic read that retrieves a consistent key and certificate chain.  If you
+                                      project them to separate files, your application code will need to
+                                      additionally check that the leaf certificate was issued to the key.
+                                    type: string
+                                  keyPath:
+                                    description: |-
+                                      Write the key at this path in the projected volume.
+
+                                      Most applications should use credentialBundlePath.  When using keyPath
+                                      and certificateChainPath, your application needs to check that the key
+                                      and leaf certificate are consistent, because it is possible to read the
+                                      files mid-rotation.
+                                    type: string
+                                  keyType:
+                                    description: |-
+                                      The type of keypair Kubelet will generate for the pod.
+
+                                      Valid values are "RSA3072", "RSA4096", "ECDSAP256", "ECDSAP384",
+                                      "ECDSAP521", and "ED25519".
+                                    type: string
+                                  maxExpirationSeconds:
+                                    description: |-
+                                      maxExpirationSeconds is the maximum lifetime permitted for the
+                                      certificate.
+
+                                      Kubelet copies this value verbatim into the PodCertificateRequests it
+                                      generates for this projection.
+
+                                      If omitted, kube-apiserver will set it to 86400(24 hours). kube-apiserver
+                                      will reject values shorter than 3600 (1 hour).  The maximum allowable
+                                      value is 7862400 (91 days).
+
+                                      The signer implementation is then free to issue a certificate with any
+                                      lifetime *shorter* than MaxExpirationSeconds, but no shorter than 3600
+                                      seconds (1 hour).  This constraint is enforced by kube-apiserver.
+                                      `kubernetes.io` signers will never issue certificates with a lifetime
+                                      longer than 24 hours.
+                                    format: int32
+                                    type: integer
+                                  signerName:
+                                    description: Kubelet's generated CSRs will be
+                                      addressed to this signer.
+                                    type: string
+                                  userAnnotations:
+                                    additionalProperties:
+                                      type: string
+                                    description: |-
+                                      userAnnotations allow pod authors to pass additional information to
+                                      the signer implementation.  Kubernetes does not restrict or validate this
+                                      metadata in any way.
+
+                                      These values are copied verbatim into the `spec.unverifiedUserAnnotations` field of
+                                      the PodCertificateRequest objects that Kubelet creates.
+
+                                      Entries are subject to the same validation as object metadata annotations,
+                                      with the addition that all keys must be domain-prefixed. No restrictions
+                                      are placed on values, except an overall size limitation on the entire field.
+
+                                      Signers should document the keys and values they support. Signers should
+                                      deny requests that contain keys they do not recognize.
+                                    type: object
+                                required:
+                                - keyType
+                                - signerName
+                                type: object
+                              secret:
+                                description: secret information about the secret data
+                                  to project
+                                properties:
+                                  items:
+                                    description: |-
+                                      items if unspecified, each key-value pair in the Data field of the referenced
+                                      Secret will be projected into the volume as a file whose name is the
+                                      key and content is the value. If specified, the listed keys will be
+                                      projected into the specified paths, and unlisted keys will not be
+                                      present. If a key is specified which is not present in the Secret,
+                                      the volume setup will error unless it is marked optional. Paths must be
+                                      relative and may not contain the '..' path or start with '..'.
+                                    items:
+                                      description: Maps a string key to a path within
+                                        a volume.
+                                      properties:
+                                        key:
+                                          description: key is the key to project.
+                                          type: string
+                                        mode:
+                                          description: |-
+                                            mode is Optional: mode bits used to set permissions on this file.
+                                            Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511.
+                                            YAML accepts both octal and decimal values, JSON requires decimal values for mode bits.
+                                            If not specified, the volume defaultMode will be used.
+                                            This might be in conflict with other options that affect the file
+                                            mode, like fsGroup, and the result can be other mode bits set.
+                                          format: int32
+                                          type: integer
+                                        path:
+                                          description: |-
+                                            path is the relative path of the file to map the key to.
+                                            May not be an absolute path.
+                                            May not contain the path element '..'.
+                                            May not start with the string '..'.
+                                          type: string
+                                      required:
+                                      - key
+                                      - path
+                                      type: object
+                                    type: array
+                                    x-kubernetes-list-type: atomic
+                                  name:
+                                    default: ""
+                                    description: |-
+                                      Name of the referent.
+                                      This field is effectively required, but due to backwards compatibility is
+                                      allowed to be empty. Instances of this type with an empty value here are
+                                      almost certainly wrong.
+                                      More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                                    type: string
+                                  optional:
+                                    description: optional field specify whether the
+                                      Secret or its key must be defined
+                                    type: boolean
+                                type: object
+                                x-kubernetes-map-type: atomic
+                              serviceAccountToken:
+                                description: serviceAccountToken is information about
+                                  the serviceAccountToken data to project
+                                properties:
+                                  audience:
+                                    description: |-
+                                      audience is the intended audience of the token. A recipient of a token
+                                      must identify itself with an identifier specified in the audience of the
+                                      token, and otherwise should reject the token. The audience defaults to the
+                                      identifier of the apiserver.
+                                    type: string
+                                  expirationSeconds:
+                                    description: |-
+                                      expirationSeconds is the requested duration of validity of the service
+                                      account token. As the token approaches expiration, the kubelet volume
+                                      plugin will proactively rotate the service account token. The kubelet will
+                                      start trying to rotate the token if the token is older than 80 percent of
+                                      its time to live or if the token is older than 24 hours.Defaults to 1 hour
+                                      and must be at least 10 minutes.
+                                    format: int64
+                                    type: integer
+                                  path:
+                                    description: |-
+                                      path is the path relative to the mount point of the file to project the
+                                      token into.
+                                    type: string
+                                required:
+                                - path
+                                type: object
+                            type: object
+                          type: array
+                          x-kubernetes-list-type: atomic
+                      type: object
+                    quobyte:
+                      description: |-
+                        quobyte represents a Quobyte mount on the host that shares a pod's lifetime.
+                        Deprecated: Quobyte is deprecated and the in-tree quobyte type is no longer supported.
+                      properties:
+                        group:
+                          description: |-
+                            group to map volume access to
+                            Default is no group
+                          type: string
+                        readOnly:
+                          description: |-
+                            readOnly here will force the Quobyte volume to be mounted with read-only permissions.
+                            Defaults to false.
+                          type: boolean
+                        registry:
+                          description: |-
+                            registry represents a single or multiple Quobyte Registry services
+                            specified as a string as host:port pair (multiple entries are separated with commas)
+                            which acts as the central registry for volumes
+                          type: string
+                        tenant:
+                          description: |-
+                            tenant owning the given Quobyte volume in the Backend
+                            Used with dynamically provisioned Quobyte volumes, value is set by the plugin
+                          type: string
+                        user:
+                          description: |-
+                            user to map volume access to
+                            Defaults to serivceaccount user
+                          type: string
+                        volume:
+                          description: volume is a string that references an already
+                            created Quobyte volume by name.
+                          type: string
+                      required:
+                      - registry
+                      - volume
+                      type: object
+                    rbd:
+                      description: |-
+                        rbd represents a Rados Block Device mount on the host that shares a pod's lifetime.
+                        Deprecated: RBD is deprecated and the in-tree rbd type is no longer supported.
+                      properties:
+                        fsType:
+                          description: |-
+                            fsType is the filesystem type of the volume that you want to mount.
+                            Tip: Ensure that the filesystem type is supported by the host operating system.
+                            Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified.
+                            More info: https://kubernetes.io/docs/concepts/storage/volumes#rbd
+                          type: string
+                        image:
+                          description: |-
+                            image is the rados image name.
+                            More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it
+                          type: string
+                        keyring:
+                          default: /etc/ceph/keyring
+                          description: |-
+                            keyring is the path to key ring for RBDUser.
+                            Default is /etc/ceph/keyring.
+                            More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it
+                          type: string
+                        monitors:
+                          description: |-
+                            monitors is a collection of Ceph monitors.
+                            More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it
+                          items:
+                            type: string
+                          type: array
+                          x-kubernetes-list-type: atomic
+                        pool:
+                          default: rbd
+                          description: |-
+                            pool is the rados pool name.
+                            Default is rbd.
+                            More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it
+                          type: string
+                        readOnly:
+                          description: |-
+                            readOnly here will force the ReadOnly setting in VolumeMounts.
+                            Defaults to false.
+                            More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it
+                          type: boolean
+                        secretRef:
+                          description: |-
+                            secretRef is name of the authentication secret for RBDUser. If provided
+                            overrides keyring.
+                            Default is nil.
+                            More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it
+                          properties:
+                            name:
+                              default: ""
+                              description: |-
+                                Name of the referent.
+                                This field is effectively required, but due to backwards compatibility is
+                                allowed to be empty. Instances of this type with an empty value here are
+                                almost certainly wrong.
+                                More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                              type: string
+                          type: object
+                          x-kubernetes-map-type: atomic
+                        user:
+                          default: admin
+                          description: |-
+                            user is the rados user name.
+                            Default is admin.
+                            More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it
+                          type: string
+                      required:
+                      - image
+                      - monitors
+                      type: object
+                    scaleIO:
+                      description: |-
+                        scaleIO represents a ScaleIO persistent volume attached and mounted on Kubernetes nodes.
+                        Deprecated: ScaleIO is deprecated and the in-tree scaleIO type is no longer supported.
+                      properties:
+                        fsType:
+                          default: xfs
+                          description: |-
+                            fsType is the filesystem type to mount.
+                            Must be a filesystem type supported by the host operating system.
+                            Ex. "ext4", "xfs", "ntfs".
+                            Default is "xfs".
+                          type: string
+                        gateway:
+                          description: gateway is the host address of the ScaleIO
+                            API Gateway.
+                          type: string
+                        protectionDomain:
+                          description: protectionDomain is the name of the ScaleIO
+                            Protection Domain for the configured storage.
+                          type: string
+                        readOnly:
+                          description: |-
+                            readOnly Defaults to false (read/write). ReadOnly here will force
+                            the ReadOnly setting in VolumeMounts.
+                          type: boolean
+                        secretRef:
+                          description: |-
+                            secretRef references to the secret for ScaleIO user and other
+                            sensitive information. If this is not provided, Login operation will fail.
+                          properties:
+                            name:
+                              default: ""
+                              description: |-
+                                Name of the referent.
+                                This field is effectively required, but due to backwards compatibility is
+                                allowed to be empty. Instances of this type with an empty value here are
+                                almost certainly wrong.
+                                More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                              type: string
+                          type: object
+                          x-kubernetes-map-type: atomic
+                        sslEnabled:
+                          description: sslEnabled Flag enable/disable SSL communication
+                            with Gateway, default false
+                          type: boolean
+                        storageMode:
+                          default: ThinProvisioned
+                          description: |-
+                            storageMode indicates whether the storage for a volume should be ThickProvisioned or ThinProvisioned.
+                            Default is ThinProvisioned.
+                          type: string
+                        storagePool:
+                          description: storagePool is the ScaleIO Storage Pool associated
+                            with the protection domain.
+                          type: string
+                        system:
+                          description: system is the name of the storage system as
+                            configured in ScaleIO.
+                          type: string
+                        volumeName:
+                          description: |-
+                            volumeName is the name of a volume already created in the ScaleIO system
+                            that is associated with this volume source.
+                          type: string
+                      required:
+                      - gateway
+                      - secretRef
+                      - system
+                      type: object
+                    secret:
+                      description: |-
+                        secret represents a secret that should populate this volume.
+                        More info: https://kubernetes.io/docs/concepts/storage/volumes#secret
+                      properties:
+                        defaultMode:
+                          description: |-
+                            defaultMode is Optional: mode bits used to set permissions on created files by default.
+                            Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511.
+                            YAML accepts both octal and decimal values, JSON requires decimal values
+                            for mode bits. Defaults to 0644.
+                            Directories within the path are not affected by this setting.
+                            This might be in conflict with other options that affect the file
+                            mode, like fsGroup, and the result can be other mode bits set.
+                          format: int32
+                          type: integer
+                        items:
+                          description: |-
+                            items If unspecified, each key-value pair in the Data field of the referenced
+                            Secret will be projected into the volume as a file whose name is the
+                            key and content is the value. If specified, the listed keys will be
+                            projected into the specified paths, and unlisted keys will not be
+                            present. If a key is specified which is not present in the Secret,
+                            the volume setup will error unless it is marked optional. Paths must be
+                            relative and may not contain the '..' path or start with '..'.
+                          items:
+                            description: Maps a string key to a path within a volume.
+                            properties:
+                              key:
+                                description: key is the key to project.
+                                type: string
+                              mode:
+                                description: |-
+                                  mode is Optional: mode bits used to set permissions on this file.
+                                  Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511.
+                                  YAML accepts both octal and decimal values, JSON requires decimal values for mode bits.
+                                  If not specified, the volume defaultMode will be used.
+                                  This might be in conflict with other options that affect the file
+                                  mode, like fsGroup, and the result can be other mode bits set.
+                                format: int32
+                                type: integer
+                              path:
+                                description: |-
+                                  path is the relative path of the file to map the key to.
+                                  May not be an absolute path.
+                                  May not contain the path element '..'.
+                                  May not start with the string '..'.
+                                type: string
+                            required:
+                            - key
+                            - path
+                            type: object
+                          type: array
+                          x-kubernetes-list-type: atomic
+                        optional:
+                          description: optional field specify whether the Secret or
+                            its keys must be defined
+                          type: boolean
+                        secretName:
+                          description: |-
+                            secretName is the name of the secret in the pod's namespace to use.
+                            More info: https://kubernetes.io/docs/concepts/storage/volumes#secret
+                          type: string
+                      type: object
+                    storageos:
+                      description: |-
+                        storageOS represents a StorageOS volume attached and mounted on Kubernetes nodes.
+                        Deprecated: StorageOS is deprecated and the in-tree storageos type is no longer supported.
+                      properties:
+                        fsType:
+                          description: |-
+                            fsType is the filesystem type to mount.
+                            Must be a filesystem type supported by the host operating system.
+                            Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified.
+                          type: string
+                        readOnly:
+                          description: |-
+                            readOnly defaults to false (read/write). ReadOnly here will force
+                            the ReadOnly setting in VolumeMounts.
+                          type: boolean
+                        secretRef:
+                          description: |-
+                            secretRef specifies the secret to use for obtaining the StorageOS API
+                            credentials.  If not specified, default values will be attempted.
+                          properties:
+                            name:
+                              default: ""
+                              description: |-
+                                Name of the referent.
+                                This field is effectively required, but due to backwards compatibility is
+                                allowed to be empty. Instances of this type with an empty value here are
+                                almost certainly wrong.
+                                More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                              type: string
+                          type: object
+                          x-kubernetes-map-type: atomic
+                        volumeName:
+                          description: |-
+                            volumeName is the human-readable name of the StorageOS volume.  Volume
+                            names are only unique within a namespace.
+                          type: string
+                        volumeNamespace:
+                          description: |-
+                            volumeNamespace specifies the scope of the volume within StorageOS.  If no
+                            namespace is specified then the Pod's namespace will be used.  This allows the
+                            Kubernetes name scoping to be mirrored within StorageOS for tighter integration.
+                            Set VolumeName to any name to override the default behaviour.
+                            Set to "default" if you are not using namespaces within StorageOS.
+                            Namespaces that do not pre-exist within StorageOS will be created.
+                          type: string
+                      type: object
+                    vsphereVolume:
+                      description: |-
+                        vsphereVolume represents a vSphere volume attached and mounted on kubelets host machine.
+                        Deprecated: VsphereVolume is deprecated. All operations for the in-tree vsphereVolume type
+                        are redirected to the csi.vsphere.vmware.com CSI driver.
+                      properties:
+                        fsType:
+                          description: |-
+                            fsType is filesystem type to mount.
+                            Must be a filesystem type supported by the host operating system.
+                            Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified.
+                          type: string
+                        storagePolicyID:
+                          description: storagePolicyID is the storage Policy Based
+                            Management (SPBM) profile ID associated with the StoragePolicyName.
+                          type: string
+                        storagePolicyName:
+                          description: storagePolicyName is the storage Policy Based
+                            Management (SPBM) profile name.
+                          type: string
+                        volumePath:
+                          description: volumePath is the path that identifies vSphere
+                            volume vmdk
+                          type: string
+                      required:
+                      - volumePath
+                      type: object
+                  required:
+                  - name
+                  type: object
+                type: array
+            required:
+            - l1
+            type: object
+          status:
+            description: status defines the observed state of CacheBlendEngine.
+            properties:
+              conditions:
+                description: conditions represent the current state of the CacheBlendEngine
+                  resource.
+                items:
+                  description: Condition contains details for one aspect of the current
+                    state of this API Resource.
+                  properties:
+                    lastTransitionTime:
+                      description: |-
+                        lastTransitionTime is the last time the condition transitioned from one status to another.
+                        This should be when the underlying condition changed.  If that is not known, then using the time when the API field changed is acceptable.
+                      format: date-time
+                      type: string
+                    message:
+                      description: |-
+                        message is a human readable message indicating details about the transition.
+                        This may be an empty string.
+                      maxLength: 32768
+                      type: string
+                    observedGeneration:
+                      description: |-
+                        observedGeneration represents the .metadata.generation that the condition was set based upon.
+                        For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
+                        with respect to the current state of the instance.
+                      format: int64
+                      minimum: 0
+                      type: integer
+                    reason:
+                      description: |-
+                        reason contains a programmatic identifier indicating the reason for the condition's last transition.
+                        Producers of specific condition types may define expected values and meanings for this field,
+                        and whether the values are considered a guaranteed API.
+                        The value should be a CamelCase string.
+                        This field may not be empty.
+                      maxLength: 1024
+                      minLength: 1
+                      pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
+                      type: string
+                    status:
+                      description: status of the condition, one of True, False, Unknown.
+                      enum:
+                      - "True"
+                      - "False"
+                      - Unknown
+                      type: string
+                    type:
+                      description: type of condition in CamelCase or in foo.example.com/CamelCase.
+                      maxLength: 316
+                      pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
+                      type: string
+                  required:
+                  - lastTransitionTime
+                  - message
+                  - reason
+                  - status
+                  - type
+                  type: object
+                type: array
+                x-kubernetes-list-map-keys:
+                - type
+                x-kubernetes-list-type: map
+              desiredInstances:
+                description: desiredInstances is the number of desired instances.
+                format: int32
+                type: integer
+              endpoints:
+                description: endpoints lists per-node connection info.
+                items:
+                  description: EndpointStatus represents a single LMCache instance
+                    endpoint.
+                  properties:
+                    hostIP:
+                      description: hostIP is the IP address of the host.
+                      type: string
+                    metricsPort:
+                      description: metricsPort is the Prometheus metrics port.
+                      format: int32
+                      type: integer
+                    nodeName:
+                      description: nodeName is the name of the node running this instance.
+                      type: string
+                    podName:
+                      description: podName is the name of the pod.
+                      type: string
+                    port:
+                      description: port is the server port.
+                      format: int32
+                      type: integer
+                    ready:
+                      description: ready indicates whether this instance is ready.
+                      type: boolean
+                  required:
+                  - hostIP
+                  - metricsPort
+                  - nodeName
+                  - podName
+                  - port
+                  - ready
+                  type: object
+                type: array
+              observedGeneration:
+                description: observedGeneration is the most recent generation observed.
+                format: int64
+                type: integer
+              phase:
+                description: phase is the overall phase of the CacheBlendEngine.
+                type: string
+              readyInstances:
+                description: readyInstances is the number of ready instances.
+                format: int32
+                type: integer
+            type: object
+        required:
+        - spec
+        type: object
+    served: true
+    storage: true
+    subresources:
+      status: {}
diff --git a/operator/config/crd/kustomization.yaml b/operator/config/crd/kustomization.yaml
index 1b870b8f13..86c58497d0 100644
--- a/operator/config/crd/kustomization.yaml
+++ b/operator/config/crd/kustomization.yaml
@@ -3,6 +3,7 @@
 # It should be run by config/default
 resources:
 - bases/lmcache.lmcache.ai_lmcacheengines.yaml
+- bases/lmcache.lmcache.ai_cacheblendengines.yaml
 # +kubebuilder:scaffold:crdkustomizeresource
 
 patches:
@@ -12,5 +13,5 @@ patches:
 
 # [WEBHOOK] To enable webhook, uncomment the following section
 # the following config is for teaching kustomize how to do kustomization for CRDs.
-#configurations:
-#- kustomizeconfig.yaml
+configurations:
+- kustomizeconfig.yaml
diff --git a/operator/config/default/kustomization.yaml b/operator/config/default/kustomization.yaml
index 1898df7ad3..19bf3c8282 100644
--- a/operator/config/default/kustomization.yaml
+++ b/operator/config/default/kustomization.yaml
@@ -20,9 +20,9 @@ resources:
 - ../manager
 # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
 # crd/kustomization.yaml
-#- ../webhook
+- ../webhook
 # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required.
-#- ../certmanager
+- ../certmanager
 # [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'.
 #- ../prometheus
 # [METRICS] Expose the controller manager metrics service.
@@ -50,13 +50,13 @@ patches:
 
 # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
 # crd/kustomization.yaml
-#- path: manager_webhook_patch.yaml
-#  target:
-#    kind: Deployment
+- path: manager_webhook_patch.yaml
+  target:
+    kind: Deployment
 
 # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER' prefix.
 # Uncomment the following replacements to add the cert-manager CA injection annotations
-#replacements:
+replacements:
 # - source: # Uncomment the following block to enable certificates for metrics
 #     kind: Service
 #     version: v1
@@ -117,42 +117,42 @@ patches:
 #         index: 1
 #         create: true
 
-# - source: # Uncomment the following block if you have any webhook
-#     kind: Service
-#     version: v1
-#     name: webhook-service
-#     fieldPath: .metadata.name # Name of the service
-#   targets:
-#     - select:
-#         kind: Certificate
-#         group: cert-manager.io
-#         version: v1
-#         name: serving-cert
-#       fieldPaths:
-#         - .spec.dnsNames.0
-#         - .spec.dnsNames.1
-#       options:
-#         delimiter: '.'
-#         index: 0
-#         create: true
-# - source:
-#     kind: Service
-#     version: v1
-#     name: webhook-service
-#     fieldPath: .metadata.namespace # Namespace of the service
-#   targets:
-#     - select:
-#         kind: Certificate
-#         group: cert-manager.io
-#         version: v1
-#         name: serving-cert
-#       fieldPaths:
-#         - .spec.dnsNames.0
-#         - .spec.dnsNames.1
-#       options:
-#         delimiter: '.'
-#         index: 1
-#         create: true
+- source: # Uncomment the following block if you have any webhook
+    kind: Service
+    version: v1
+    name: webhook-service
+    fieldPath: .metadata.name # Name of the service
+  targets:
+    - select:
+        kind: Certificate
+        group: cert-manager.io
+        version: v1
+        name: serving-cert
+      fieldPaths:
+        - .spec.dnsNames.0
+        - .spec.dnsNames.1
+      options:
+        delimiter: '.'
+        index: 0
+        create: true
+- source:
+    kind: Service
+    version: v1
+    name: webhook-service
+    fieldPath: .metadata.namespace # Namespace of the service
+  targets:
+    - select:
+        kind: Certificate
+        group: cert-manager.io
+        version: v1
+        name: serving-cert
+      fieldPaths:
+        - .spec.dnsNames.0
+        - .spec.dnsNames.1
+      options:
+        delimiter: '.'
+        index: 1
+        create: true
 
 # - source: # Uncomment the following block if you have a ValidatingWebhook (--programmatic-validation)
 #     kind: Certificate
@@ -185,36 +185,36 @@ patches:
 #         index: 1
 #         create: true
 
-# - source: # Uncomment the following block if you have a DefaultingWebhook (--defaulting )
-#     kind: Certificate
-#     group: cert-manager.io
-#     version: v1
-#     name: serving-cert
-#     fieldPath: .metadata.namespace # Namespace of the certificate CR
-#   targets:
-#     - select:
-#         kind: MutatingWebhookConfiguration
-#       fieldPaths:
-#         - .metadata.annotations.[cert-manager.io/inject-ca-from]
-#       options:
-#         delimiter: '/'
-#         index: 0
-#         create: true
-# - source:
-#     kind: Certificate
-#     group: cert-manager.io
-#     version: v1
-#     name: serving-cert
-#     fieldPath: .metadata.name
-#   targets:
-#     - select:
-#         kind: MutatingWebhookConfiguration
-#       fieldPaths:
-#         - .metadata.annotations.[cert-manager.io/inject-ca-from]
-#       options:
-#         delimiter: '/'
-#         index: 1
-#         create: true
+- source: # Uncomment the following block if you have a DefaultingWebhook (--defaulting )
+    kind: Certificate
+    group: cert-manager.io
+    version: v1
+    name: serving-cert
+    fieldPath: .metadata.namespace # Namespace of the certificate CR
+  targets:
+    - select:
+        kind: MutatingWebhookConfiguration
+      fieldPaths:
+        - .metadata.annotations.[cert-manager.io/inject-ca-from]
+      options:
+        delimiter: '/'
+        index: 0
+        create: true
+- source:
+    kind: Certificate
+    group: cert-manager.io
+    version: v1
+    name: serving-cert
+    fieldPath: .metadata.name
+  targets:
+    - select:
+        kind: MutatingWebhookConfiguration
+      fieldPaths:
+        - .metadata.annotations.[cert-manager.io/inject-ca-from]
+      options:
+        delimiter: '/'
+        index: 1
+        create: true
 
 # - source: # Uncomment the following block if you have a ConversionWebhook (--conversion)
 #     kind: Certificate
diff --git a/operator/config/default/manager_webhook_patch.yaml b/operator/config/default/manager_webhook_patch.yaml
new file mode 100644
index 0000000000..bc918a32f0
--- /dev/null
+++ b/operator/config/default/manager_webhook_patch.yaml
@@ -0,0 +1,20 @@
+# This patch wires the webhook serving certificate into the controller-manager Deployment:
+# it tells the manager where to read the cert (the --webhook-cert-path arg consumed by
+# cmd/main.go), and mounts the cert-manager-managed secret (webhook-server-cert) read-only
+# at that path. The default cert file names (tls.crt / tls.key) match cmd/main.go's
+# --webhook-cert-name / --webhook-cert-key defaults.
+- op: add
+  path: /spec/template/spec/containers/0/args/-
+  value: --webhook-cert-path=/tmp/k8s-webhook-server/serving-certs
+- op: add
+  path: /spec/template/spec/containers/0/volumeMounts/-
+  value:
+    name: webhook-certs
+    mountPath: /tmp/k8s-webhook-server/serving-certs
+    readOnly: true
+- op: add
+  path: /spec/template/spec/volumes/-
+  value:
+    name: webhook-certs
+    secret:
+      secretName: webhook-server-cert
diff --git a/operator/config/rbac/role.yaml b/operator/config/rbac/role.yaml
index 93131ecf1d..b9892bcc18 100644
--- a/operator/config/rbac/role.yaml
+++ b/operator/config/rbac/role.yaml
@@ -41,6 +41,7 @@ rules:
 - apiGroups:
   - lmcache.lmcache.ai
   resources:
+  - cacheblendengines
   - lmcacheengines
   verbs:
   - create
@@ -53,12 +54,14 @@ rules:
 - apiGroups:
   - lmcache.lmcache.ai
   resources:
+  - cacheblendengines/finalizers
   - lmcacheengines/finalizers
   verbs:
   - update
 - apiGroups:
   - lmcache.lmcache.ai
   resources:
+  - cacheblendengines/status
   - lmcacheengines/status
   verbs:
   - get
diff --git a/operator/config/samples/kustomization.yaml b/operator/config/samples/kustomization.yaml
index 30bfa6a3af..cf3430830b 100644
--- a/operator/config/samples/kustomization.yaml
+++ b/operator/config/samples/kustomization.yaml
@@ -1,4 +1,5 @@
 ## Append samples of your project ##
 resources:
 - lmcache_v1alpha1_lmcacheengine.yaml
+- lmcache_v1alpha1_cacheblendengine.yaml
 # +kubebuilder:scaffold:manifestskustomizesamples
diff --git a/operator/config/samples/lmcache_v1alpha1_cacheblendengine.yaml b/operator/config/samples/lmcache_v1alpha1_cacheblendengine.yaml
new file mode 100644
index 0000000000..3340ec7243
--- /dev/null
+++ b/operator/config/samples/lmcache_v1alpha1_cacheblendengine.yaml
@@ -0,0 +1,88 @@
+apiVersion: lmcache.lmcache.ai/v1alpha1
+kind: CacheBlendEngine
+metadata:
+  name: my-cacheblend
+  namespace: default
+spec:
+  # -- Container image (the LMCache blend_v3 ENGINE image) --
+  # The engine runs `lmcache server --engine-type blend_v3`. If this image lives
+  # in a PRIVATE registry, set imagePullSecrets (the Secret must exist in this
+  # namespace).
+  image:
+    repository: lmcache/vllm-openai # default
+    tag: latest-nightly # default
+    pullPolicy: IfNotPresent # Always | Never | IfNotPresent
+  # imagePullSecrets:
+  #   - name: my-registry-secret
+
+  # -- Server config --
+  # server:
+  #   port: 6566                  # MP port the CB connector dials (default 6566)
+  #   httpPort: 8080              # HTTP frontend port (health, admin)
+  #   chunkSize: 256              # FIXED at 256 (must equal vLLM --block-size * 4)
+  #   maxWorkers: 1               # default, worker threads
+  #   hashAlgorithm: blake3       # builtin | sha256_cbor | blake3
+
+  # -- L1 cache (REQUIRED) --
+  l1:
+    sizeGB: 60
+
+  # -- CacheBlend tunables (injected into the vLLM connector config) --
+  # blend:
+  #   checkLayer: 1               # default, cb.check_layer
+  #   recompRatio: 0.15           # default, cb.recomp_ratio
+
+  # -- Injection: how the webhook injects the plugin into annotated vLLM pods --
+  injection:
+    # payloadImage is the PRIVATE cacheblend-plugin init-container image (not
+    # open-source) — repository/tag/pullPolicy, like spec.image. Set repository
+    # to YOUR cacheblend-plugin image and pin the tag to the target vLLM/lmcache
+    # window. The webhook injects it as an init container into vLLM pods.
+    payloadImage:
+      repository: lmcache/cacheblend-plugin
+      tag: latest
+      pullPolicy: IfNotPresent             # Always | Never | IfNotPresent
+    # imagePullSecrets pulls the PRIVATE payload image. The webhook appends these
+    # to the vLLM pod's spec.imagePullSecrets — the Secret(s) must exist in the
+    # vLLM pod's namespace.
+    # imagePullSecrets:
+    #   - name: my-registry-secret
+    # targetContainer: ""          # default: first container; name the vLLM container
+    # cudagraph: eager             # eager | piecewise | full_decode_only (never full)
+
+  # -- Eviction --
+  # eviction:
+  #   policy: LRU                 # default (only supported value)
+  #   triggerWatermark: 0.8       # default, range (0.0, 1.0]
+  #   evictionRatio: 0.2          # default, range (0.0, 1.0]
+
+  # -- Prometheus monitoring --
+  # prometheus:
+  #   enabled: true               # default
+  #   port: 9090                  # default
+  #   serviceMonitor:
+  #     enabled: false            # default, requires Prometheus Operator
+
+  # -- Scheduling (place one engine per GPU node; vLLM pods must land on these nodes) --
+  # nodeSelector:
+  #   nvidia.com/gpu.present: "true"
+
+  # -- Logging --
+  # logLevel: INFO                # DEBUG | INFO | WARNING | ERROR
+
+# ---------------------------------------------------------------------------
+# To enable CacheBlend on a vLLM pod, label it for the webhook and annotate it
+# with this engine's name. The pod (and this engine) must run in a namespace
+# labeled `pod-security.kubernetes.io/enforce=privileged` (hostIPC is injected).
+#
+#   metadata:
+#     labels:
+#       lmcache.ai/cacheblend-inject: "true"
+#     annotations:
+#       lmcache.ai/cacheblend-engine: "my-cacheblend"
+#       # lmcache.ai/cacheblend-container: "vllm"          # optional
+#       # lmcache.ai/cacheblend-image-pull-secrets: "..."  # optional override
+#
+# The vLLM container must launch via the image ENTRYPOINT (args-only) — a
+# `command: ["/bin/sh","-c", ...]` wrapper is skipped (injection cannot reach it).
+# ---------------------------------------------------------------------------
diff --git a/operator/config/samples/vllm_cacheblend_deployment.yaml b/operator/config/samples/vllm_cacheblend_deployment.yaml
new file mode 100644
index 0000000000..c1d4bc8436
--- /dev/null
+++ b/operator/config/samples/vllm_cacheblend_deployment.yaml
@@ -0,0 +1,86 @@
+# Sample vLLM Deployment that OPTS IN to CacheBlend dependency injection.
+#
+# The mutating webhook injects everything CacheBlend needs at pod CREATE — you
+# only supply the two highlighted bits below (the opt-in label + the engine
+# annotation) and a normal, args-only vLLM launch. After admission the webhook
+# adds: pod hostIPC=true; the `cb-plugin` emptyDir + payload init container;
+# a readOnly mount + PYTHONPATH on the vLLM container; the required vLLM flags
+# (--attention-backend CUSTOM, --kv-transfer-config <node-local engine>,
+# --block-size 64, --pipeline-parallel-size 1, --no-enable-chunked-prefill,
+# --no-async-scheduling, --enforce-eager); and the private-payload imagePullSecrets.
+#
+# PREREQUISITES (else injection silently no-ops or the pod is rejected):
+#   1. A CacheBlendEngine named below exists IN THIS NAMESPACE and is reconciled
+#      (its `<name>-connection` ConfigMap exists — the webhook reads it).
+#   2. This namespace is labeled pod-security.kubernetes.io/enforce=privileged
+#      (the injected hostIPC/privileged is rejected by baseline/restricted PSS).
+#   3. The vLLM container launches via the image ENTRYPOINT (["vllm","serve"])
+#      with args ONLY. Do NOT use `command: ["/bin/sh","-c", ...]` — a command
+#      override makes the webhook SKIP injection (appended args can't reach
+#      `vllm serve`); it stamps lmcache.ai/cacheblend-skip-reason=command-override.
+#   4. The private payload image's pull secret exists in THIS namespace
+#      (referenced by the engine's injection.imagePullSecrets, or overridden via
+#      the annotation below).
+#
+# Verify after creation:
+#   kubectl get pod -l app=vllm-cacheblend -o yaml | grep -E "hostIPC|cb-plugin|PYTHONPATH|attention-backend|cacheblend-injected"
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-cacheblend
+  namespace: default            # must match the CacheBlendEngine's namespace + be PSS-privileged
+  labels:
+    app: vllm-cacheblend
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-cacheblend
+  template:
+    metadata:
+      labels:
+        app: vllm-cacheblend
+        # (1) OPT-IN — the webhook's objectSelector matches this label.
+        lmcache.ai/cacheblend-inject: "true"
+      annotations:
+        # (2) BIND — names the CacheBlendEngine (same namespace) to inject for.
+        lmcache.ai/cacheblend-engine: "my-cacheblend"
+        # Optional: name the vLLM container if it is not the first one.
+        # lmcache.ai/cacheblend-container: "vllm"
+        # Optional: override the engine's payload imagePullSecrets (comma-separated).
+        # lmcache.ai/cacheblend-image-pull-secrets: "my-registry-secret"
+    spec:
+      # Needed for GPU access unless `nvidia` is the default containerd runtime.
+      # The blend engine shares THIS pod's GPU via CUDA IPC (it does not claim
+      # its own), so the engine + this pod must land on the same GPU node.
+      runtimeClassName: nvidia
+      # NOTE: do NOT set hostIPC here or mount an emptyDir at /dev/shm — the
+      # webhook injects hostIPC=true, which shares the host's /dev/shm for CUDA
+      # IPC; an emptyDir would shadow it and break cudaIpcOpenMemHandle.
+      containers:
+        - name: vllm
+          image: lmcache/vllm-openai:latest-nightly # vLLM + lmcache; must be in the cacheblend-supported version window
+          env:
+            # Deterministic prefix hashing across processes — required by LMCache.
+            - name: PYTHONHASHSEED
+              value: "0"
+          # Args-only launch (image ENTRYPOINT is ["vllm","serve"]). Put the
+          # model + your NON-CacheBlend serving flags here; the webhook APPENDS
+          # the CacheBlend flags. Do NOT add --attention-backend / --kv-transfer-config
+          # yourself (a user-supplied --kv-transfer-config makes the webhook skip it).
+          args:
+            - "Qwen/Qwen3-4B-Instruct-2507"
+            - "--port"
+            - "8000"
+            - "--gpu-memory-utilization"
+            - "0.8"
+            - "--load-format"        # uncomment for a weightless smoke test
+            - "dummy"
+          ports:
+            - name: http
+              containerPort: 8000
+          resources:
+            limits:
+              # vLLM claims its GPU normally; the blend engine (NVIDIA_VISIBLE_DEVICES=all)
+              # opens CUDA IPC handles on whichever GPU this pod is assigned.
+              nvidia.com/gpu: "1"
diff --git a/operator/config/webhook/kustomization.yaml b/operator/config/webhook/kustomization.yaml
new file mode 100644
index 0000000000..9f1267cb0d
--- /dev/null
+++ b/operator/config/webhook/kustomization.yaml
@@ -0,0 +1,15 @@
+resources:
+- manifests.yaml
+- service.yaml
+
+# The generated manifests.yaml only carries the fields controller-gen can emit
+# (path/failurePolicy/sideEffects/reinvocationPolicy/rules). controller-gen cannot
+# emit arbitrary objectSelector/namespaceSelector matchExpressions, so the
+# blast-radius gating is layered on here as a strategic-merge patch. Never edit the
+# generated manifests.yaml directly.
+patches:
+- path: mutating_webhook_selectors_patch.yaml
+
+# the following config is for teaching kustomize how to do var substitution
+configurations:
+- kustomizeconfig.yaml
diff --git a/operator/config/webhook/kustomizeconfig.yaml b/operator/config/webhook/kustomizeconfig.yaml
new file mode 100644
index 0000000000..a78df75b0d
--- /dev/null
+++ b/operator/config/webhook/kustomizeconfig.yaml
@@ -0,0 +1,25 @@
+# the following config is for teaching kustomize where to look at when substituting nameReference.
+# It requires kustomize v2.1.0 or newer to work properly.
+nameReference:
+- kind: Service
+  version: v1
+  fieldSpecs:
+  - kind: MutatingWebhookConfiguration
+    group: admissionregistration.k8s.io
+    path: webhooks/clientConfig/service/name
+  - kind: ValidatingWebhookConfiguration
+    group: admissionregistration.k8s.io
+    path: webhooks/clientConfig/service/name
+
+namespace:
+- kind: MutatingWebhookConfiguration
+  group: admissionregistration.k8s.io
+  path: webhooks/clientConfig/service/namespace
+  create: true
+- kind: ValidatingWebhookConfiguration
+  group: admissionregistration.k8s.io
+  path: webhooks/clientConfig/service/namespace
+  create: true
+
+varReference:
+- path: metadata/annotations
diff --git a/operator/config/webhook/manifests.yaml b/operator/config/webhook/manifests.yaml
new file mode 100644
index 0000000000..8fb2327c74
--- /dev/null
+++ b/operator/config/webhook/manifests.yaml
@@ -0,0 +1,26 @@
+---
+apiVersion: admissionregistration.k8s.io/v1
+kind: MutatingWebhookConfiguration
+metadata:
+  name: mutating-webhook-configuration
+webhooks:
+- admissionReviewVersions:
+  - v1
+  clientConfig:
+    service:
+      name: webhook-service
+      namespace: system
+      path: /mutate--v1-pod
+  failurePolicy: Ignore
+  name: mpod.lmcache.ai
+  reinvocationPolicy: Never
+  rules:
+  - apiGroups:
+    - ""
+    apiVersions:
+    - v1
+    operations:
+    - CREATE
+    resources:
+    - pods
+  sideEffects: None
diff --git a/operator/config/webhook/mutating_webhook_selectors_patch.yaml b/operator/config/webhook/mutating_webhook_selectors_patch.yaml
new file mode 100644
index 0000000000..878dcf1b4a
--- /dev/null
+++ b/operator/config/webhook/mutating_webhook_selectors_patch.yaml
@@ -0,0 +1,26 @@
+# Strategic-merge patch adding the blast-radius selectors that controller-gen
+# cannot express. Webhooks are merged by their `name`, so only the CacheBlend pod
+# webhook (mpod.lmcache.ai) is affected.
+#
+# - objectSelector: only mutate pods carrying lmcache.ai/cacheblend-inject="true";
+#   keeps the cluster-wide CREATE-pods rule from touching unrelated workloads.
+# - namespaceSelector: never mutate pods in kube-system / kube-public / the operator
+#   namespace (which runs under the "restricted" PSS profile and would reject the
+#   injected hostIPC), per design §10.
+apiVersion: admissionregistration.k8s.io/v1
+kind: MutatingWebhookConfiguration
+metadata:
+  name: mutating-webhook-configuration
+webhooks:
+- name: mpod.lmcache.ai
+  objectSelector:
+    matchLabels:
+      lmcache.ai/cacheblend-inject: "true"
+  namespaceSelector:
+    matchExpressions:
+    - key: kubernetes.io/metadata.name
+      operator: NotIn
+      values:
+      - kube-system
+      - kube-public
+      - lmcache-operator-system
diff --git a/operator/config/webhook/service.yaml b/operator/config/webhook/service.yaml
new file mode 100644
index 0000000000..79879ea9ae
--- /dev/null
+++ b/operator/config/webhook/service.yaml
@@ -0,0 +1,16 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: webhook-service
+  namespace: system
+  labels:
+    app.kubernetes.io/name: operator
+    app.kubernetes.io/managed-by: kustomize
+spec:
+  ports:
+    - port: 443
+      protocol: TCP
+      targetPort: 9443
+  selector:
+    control-plane: controller-manager
+    app.kubernetes.io/name: operator
diff --git a/operator/go.mod b/operator/go.mod
index 17b52e5bd1..808a1d40fb 100644
--- a/operator/go.mod
+++ b/operator/go.mod
@@ -3,6 +3,7 @@ module github.com/LMCache/LMCache
 go 1.25.3
 
 require (
+	github.com/evanphx/json-patch/v5 v5.9.11
 	github.com/onsi/ginkgo/v2 v2.27.2
 	github.com/onsi/gomega v1.38.2
 	github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.89.0
@@ -24,7 +25,6 @@ require (
 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/davecgh/go-spew v1.1.1 // indirect
 	github.com/emicklei/go-restful/v3 v3.12.2 // indirect
-	github.com/evanphx/json-patch/v5 v5.9.11 // indirect
 	github.com/felixge/httpsnoop v1.0.4 // indirect
 	github.com/fsnotify/fsnotify v1.9.0 // indirect
 	github.com/fxamacker/cbor/v2 v2.9.0 // indirect
diff --git a/operator/internal/controller/cacheblend_reconcile_helpers.go b/operator/internal/controller/cacheblend_reconcile_helpers.go
new file mode 100644
index 0000000000..7eec5c4e61
--- /dev/null
+++ b/operator/internal/controller/cacheblend_reconcile_helpers.go
@@ -0,0 +1,435 @@
+/*
+Copyright 2026.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controller
+
+import (
+	"context"
+	"fmt"
+
+	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/meta"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	logf "sigs.k8s.io/controller-runtime/pkg/log"
+
+	lmcachev1alpha1 "github.com/LMCache/LMCache/api/v1alpha1"
+	"github.com/LMCache/LMCache/internal/resources"
+)
+
+// validateAndSetCondition runs validation and updates the ConfigValid condition.
+// Returns an error if validation fails (to stop reconciliation).
+func (r *CacheBlendEngineReconciler) validateAndSetCondition(ctx context.Context, engine *lmcachev1alpha1.CacheBlendEngine) error {
+	errs := engine.ValidateSpec()
+
+	if len(errs) > 0 {
+		// Re-fetch to get the latest resourceVersion before status update.
+		generation := engine.Generation
+		if err := r.Get(ctx, types.NamespacedName{Name: engine.Name, Namespace: engine.Namespace}, engine); err != nil {
+			return fmt.Errorf("failed to re-fetch engine for status update: %w", err)
+		}
+		meta.SetStatusCondition(&engine.Status.Conditions, metav1.Condition{
+			Type:               lmcachev1alpha1.ConditionConfigValid,
+			Status:             metav1.ConditionFalse,
+			Reason:             "ValidationFailed",
+			Message:            errs.ToAggregate().Error(),
+			ObservedGeneration: generation,
+		})
+		engine.Status.Phase = lmcachev1alpha1.PhaseFailed
+		engine.Status.ObservedGeneration = generation
+		if err := r.Status().Update(ctx, engine); err != nil {
+			return fmt.Errorf("failed to update status after validation failure: %w", err)
+		}
+		return fmt.Errorf("spec validation failed: %s", errs.ToAggregate().Error())
+	}
+
+	// ConfigValid=True condition is set in updateStatus (after re-fetch)
+	// to avoid resourceVersion conflicts.
+	return nil
+}
+
+// reconcileDaemonSet creates or updates the blend_v3 engine DaemonSet.
+func (r *CacheBlendEngineReconciler) reconcileDaemonSet(ctx context.Context, engine *lmcachev1alpha1.CacheBlendEngine) error {
+	desired := resources.BuildCBEngineDaemonSet(engine)
+
+	existing := &appsv1.DaemonSet{}
+	err := r.Get(ctx, types.NamespacedName{Name: desired.Name, Namespace: desired.Namespace}, existing)
+	if err != nil {
+		if apierrors.IsNotFound(err) {
+			if err := ctrl.SetControllerReference(engine, desired, r.Scheme); err != nil {
+				return err
+			}
+			return r.Create(ctx, desired)
+		}
+		return err
+	}
+
+	// Preserve immutable selector
+	desired.Spec.Selector = existing.Spec.Selector
+	desired.Spec.Template.Labels = resources.MergeLabels(
+		existing.Spec.Selector.MatchLabels,
+		desired.Spec.Template.Labels,
+	)
+
+	if err := ctrl.SetControllerReference(engine, desired, r.Scheme); err != nil {
+		return err
+	}
+
+	patch := client.MergeFrom(existing.DeepCopy())
+	existing.Spec.Template = desired.Spec.Template
+	existing.Labels = desired.Labels
+
+	return r.Patch(ctx, existing, patch)
+}
+
+// reconcileLookupService creates or updates the node-local lookup Service.
+func (r *CacheBlendEngineReconciler) reconcileLookupService(ctx context.Context, engine *lmcachev1alpha1.CacheBlendEngine) error {
+	desired := resources.BuildCBEngineLookupService(engine)
+
+	existing := &corev1.Service{}
+	err := r.Get(ctx, types.NamespacedName{Name: desired.Name, Namespace: desired.Namespace}, existing)
+	if err != nil {
+		if apierrors.IsNotFound(err) {
+			if err := ctrl.SetControllerReference(engine, desired, r.Scheme); err != nil {
+				return err
+			}
+			return r.Create(ctx, desired)
+		}
+		return err
+	}
+
+	if err := ctrl.SetControllerReference(engine, desired, r.Scheme); err != nil {
+		return err
+	}
+
+	patch := client.MergeFrom(existing.DeepCopy())
+	existing.Spec.Ports = desired.Spec.Ports
+	existing.Spec.InternalTrafficPolicy = desired.Spec.InternalTrafficPolicy
+	existing.Labels = desired.Labels
+
+	return r.Patch(ctx, existing, patch)
+}
+
+// reconcileMetricsService creates or updates the headless metrics Service.
+func (r *CacheBlendEngineReconciler) reconcileMetricsService(ctx context.Context, engine *lmcachev1alpha1.CacheBlendEngine) error {
+	desired := resources.BuildCBEngineMetricsService(engine)
+
+	existing := &corev1.Service{}
+	err := r.Get(ctx, types.NamespacedName{Name: desired.Name, Namespace: desired.Namespace}, existing)
+	if err != nil {
+		if apierrors.IsNotFound(err) {
+			if err := ctrl.SetControllerReference(engine, desired, r.Scheme); err != nil {
+				return err
+			}
+			return r.Create(ctx, desired)
+		}
+		return err
+	}
+
+	if err := ctrl.SetControllerReference(engine, desired, r.Scheme); err != nil {
+		return err
+	}
+
+	patch := client.MergeFrom(existing.DeepCopy())
+	existing.Spec.Ports = desired.Spec.Ports
+	existing.Labels = desired.Labels
+
+	return r.Patch(ctx, existing, patch)
+}
+
+// reconcileConnectionConfigMap creates or updates the <engine>-connection ConfigMap.
+func (r *CacheBlendEngineReconciler) reconcileConnectionConfigMap(ctx context.Context, engine *lmcachev1alpha1.CacheBlendEngine) error {
+	desired := resources.BuildCBConnectionConfigMap(engine)
+
+	existing := &corev1.ConfigMap{}
+	err := r.Get(ctx, types.NamespacedName{Name: desired.Name, Namespace: desired.Namespace}, existing)
+	if err != nil {
+		if apierrors.IsNotFound(err) {
+			if err := ctrl.SetControllerReference(engine, desired, r.Scheme); err != nil {
+				return err
+			}
+			return r.Create(ctx, desired)
+		}
+		return err
+	}
+
+	if err := ctrl.SetControllerReference(engine, desired, r.Scheme); err != nil {
+		return err
+	}
+
+	patch := client.MergeFrom(existing.DeepCopy())
+	existing.Data = desired.Data
+	existing.Labels = desired.Labels
+
+	return r.Patch(ctx, existing, patch)
+}
+
+// reconcileServiceMonitor creates, updates, or deletes the ServiceMonitor.
+func (r *CacheBlendEngineReconciler) reconcileServiceMonitor(ctx context.Context, engine *lmcachev1alpha1.CacheBlendEngine) error {
+	log := logf.FromContext(ctx)
+
+	if !resources.CBServiceMonitorEnabled(engine) {
+		// Delete ServiceMonitor if it exists
+		existing := &monitoringv1.ServiceMonitor{}
+		err := r.Get(ctx, types.NamespacedName{Name: engine.Name, Namespace: engine.Namespace}, existing)
+		if err != nil {
+			if apierrors.IsNotFound(err) {
+				return nil
+			}
+			// If the CRD is not installed, ignore the error
+			if meta.IsNoMatchError(err) {
+				return nil
+			}
+			return err
+		}
+		log.Info("Deleting ServiceMonitor", "name", engine.Name)
+		return r.Delete(ctx, existing)
+	}
+
+	desired := resources.BuildCBServiceMonitor(engine)
+
+	existing := &monitoringv1.ServiceMonitor{}
+	err := r.Get(ctx, types.NamespacedName{Name: desired.Name, Namespace: desired.Namespace}, existing)
+	if err != nil {
+		if apierrors.IsNotFound(err) {
+			if err := ctrl.SetControllerReference(engine, desired, r.Scheme); err != nil {
+				return err
+			}
+			return r.Create(ctx, desired)
+		}
+		if meta.IsNoMatchError(err) {
+			log.Info("ServiceMonitor CRD not available, skipping")
+			return nil
+		}
+		return err
+	}
+
+	if err := ctrl.SetControllerReference(engine, desired, r.Scheme); err != nil {
+		return err
+	}
+
+	patch := client.MergeFrom(existing.DeepCopy())
+	existing.Spec = desired.Spec
+	existing.Labels = desired.Labels
+
+	return r.Patch(ctx, existing, patch)
+}
+
+// updateStatus queries the DaemonSet and pods to compute status fields.
+// It re-fetches the engine to get the latest resourceVersion, avoiding
+// conflicts from watch events triggered by earlier reconcile steps.
+func (r *CacheBlendEngineReconciler) updateStatus(ctx context.Context, engine *lmcachev1alpha1.CacheBlendEngine) error {
+	// Re-fetch to get the latest resourceVersion, avoiding conflicts
+	// from watch events triggered by earlier reconcile steps (e.g.
+	// DaemonSet/Service creation fires Owns watches).
+	if err := r.Get(ctx, types.NamespacedName{Name: engine.Name, Namespace: engine.Namespace}, engine); err != nil {
+		return err
+	}
+
+	// ConfigValid condition (set here after re-fetch so it's not lost).
+	meta.SetStatusCondition(&engine.Status.Conditions, metav1.Condition{
+		Type:               lmcachev1alpha1.ConditionConfigValid,
+		Status:             metav1.ConditionTrue,
+		Reason:             "Valid",
+		Message:            "Spec validation passed",
+		ObservedGeneration: engine.Generation,
+	})
+
+	ds := &appsv1.DaemonSet{}
+	err := r.Get(ctx, types.NamespacedName{Name: engine.Name, Namespace: engine.Namespace}, ds)
+	if err != nil {
+		if apierrors.IsNotFound(err) {
+			engine.Status.Phase = lmcachev1alpha1.PhasePending
+			engine.Status.DesiredInstances = 0
+			engine.Status.ReadyInstances = 0
+			engine.Status.ObservedGeneration = engine.Generation
+			return r.Status().Update(ctx, engine)
+		}
+		return err
+	}
+
+	engine.Status.DesiredInstances = ds.Status.DesiredNumberScheduled
+	engine.Status.ReadyInstances = ds.Status.NumberReady
+	engine.Status.ObservedGeneration = engine.Generation
+
+	// Compute phase
+	switch {
+	case ds.Status.DesiredNumberScheduled == 0:
+		engine.Status.Phase = lmcachev1alpha1.PhasePending
+	case ds.Status.NumberReady == ds.Status.DesiredNumberScheduled:
+		engine.Status.Phase = lmcachev1alpha1.PhaseRunning
+	case ds.Status.NumberReady > 0:
+		engine.Status.Phase = lmcachev1alpha1.PhaseDegraded
+	default:
+		engine.Status.Phase = lmcachev1alpha1.PhasePending
+	}
+
+	// Set conditions
+	meta.SetStatusCondition(&engine.Status.Conditions, metav1.Condition{
+		Type:               lmcachev1alpha1.ConditionAvailable,
+		Status:             conditionBool(ds.Status.NumberReady > 0),
+		Reason:             reasonFromReady(ds.Status.NumberReady > 0, "AtLeastOneReady", "NoReadyInstances"),
+		Message:            fmt.Sprintf("%d/%d instances ready", ds.Status.NumberReady, ds.Status.DesiredNumberScheduled),
+		ObservedGeneration: engine.Generation,
+	})
+
+	allReady := ds.Status.NumberReady == ds.Status.DesiredNumberScheduled && ds.Status.DesiredNumberScheduled > 0
+	meta.SetStatusCondition(&engine.Status.Conditions, metav1.Condition{
+		Type:               lmcachev1alpha1.ConditionAllInstancesReady,
+		Status:             conditionBool(allReady),
+		Reason:             reasonFromReady(allReady, "AllReady", "NotAllReady"),
+		Message:            fmt.Sprintf("%d/%d instances ready", ds.Status.NumberReady, ds.Status.DesiredNumberScheduled),
+		ObservedGeneration: engine.Generation,
+	})
+
+	// Build endpoints from pods
+	serverPort := int32(5555)
+	if engine.Spec.Server != nil && engine.Spec.Server.Port != nil {
+		serverPort = *engine.Spec.Server.Port
+	}
+	metricsPort := int32(9090)
+	if engine.Spec.Prometheus != nil && engine.Spec.Prometheus.Port != nil {
+		metricsPort = *engine.Spec.Prometheus.Port
+	}
+
+	podList := &corev1.PodList{}
+	if err := r.List(ctx, podList,
+		client.InNamespace(engine.Namespace),
+		client.MatchingLabels(resources.SelectorLabels(engine.Name)),
+	); err != nil {
+		return err
+	}
+
+	endpoints := make([]lmcachev1alpha1.EndpointStatus, 0, len(podList.Items))
+	for i := range podList.Items {
+		pod := &podList.Items[i]
+		ready := false
+		for _, cond := range pod.Status.Conditions {
+			if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue {
+				ready = true
+				break
+			}
+		}
+		endpoints = append(endpoints, lmcachev1alpha1.EndpointStatus{
+			NodeName:    pod.Spec.NodeName,
+			HostIP:      pod.Status.HostIP,
+			PodName:     pod.Name,
+			Port:        serverPort,
+			MetricsPort: metricsPort,
+			Ready:       ready,
+		})
+	}
+	engine.Status.Endpoints = endpoints
+
+	return r.Status().Update(ctx, engine)
+}
+
+// reconcileRESPAuthSecret ensures a local copy of the RESP auth secret exists in
+// the engine's namespace. It mirrors the LMCacheEngine path: a no-op unless
+// spec.l2Backend.RESP.authSecretRef is set, otherwise it reads the source secret
+// (possibly cross-namespace) and creates/updates a managed copy owned by the CR.
+func (r *CacheBlendEngineReconciler) reconcileRESPAuthSecret(ctx context.Context, engine *lmcachev1alpha1.CacheBlendEngine) error {
+	log := logf.FromContext(ctx)
+	spec := &engine.Spec
+
+	// No RESP auth configured — clean up any stale managed secret.
+	if spec.L2Backend == nil || spec.L2Backend.RESP == nil || spec.L2Backend.RESP.AuthSecretRef == nil {
+		return r.deleteRESPAuthSecretIfExists(ctx, engine)
+	}
+
+	ref := spec.L2Backend.RESP.AuthSecretRef
+	sourceNS := ref.Namespace
+	if sourceNS == "" {
+		sourceNS = engine.Namespace
+	}
+	localName := resources.RESPAuthSecretName(engine.Name)
+
+	// Read the source secret.
+	source := &corev1.Secret{}
+	if err := r.Get(ctx, types.NamespacedName{Name: ref.Name, Namespace: sourceNS}, source); err != nil {
+		return fmt.Errorf("failed to read RESP auth secret %s/%s: %w", sourceNS, ref.Name, err)
+	}
+
+	// Validate that the source secret contains the required "password" key.
+	password, ok := source.Data["password"]
+	if !ok || len(password) == 0 {
+		return fmt.Errorf("RESP auth secret %s/%s is missing required 'password' key", sourceNS, ref.Name)
+	}
+
+	// Build the local managed copy.
+	// Only "password" is required; "username" is optional (Redis Enterprise
+	// often uses password-only auth).
+	secretData := map[string][]byte{
+		"password": password,
+	}
+	if u, ok := source.Data["username"]; ok {
+		secretData["username"] = u
+	}
+	desired := &corev1.Secret{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      localName,
+			Namespace: engine.Namespace,
+			Labels:    resources.StandardLabels(engine.Name),
+		},
+		Data: secretData,
+	}
+
+	existing := &corev1.Secret{}
+	err := r.Get(ctx, types.NamespacedName{Name: localName, Namespace: engine.Namespace}, existing)
+	if err != nil {
+		if apierrors.IsNotFound(err) {
+			if err := ctrl.SetControllerReference(engine, desired, r.Scheme); err != nil {
+				return err
+			}
+			log.Info("Creating managed RESP auth secret", "name", localName, "source", sourceNS+"/"+ref.Name)
+			return r.Create(ctx, desired)
+		}
+		return err
+	}
+
+	// Update existing — apply ownerRef, data, and labels.
+	if err := ctrl.SetControllerReference(engine, existing, r.Scheme); err != nil {
+		return err
+	}
+	patch := client.MergeFrom(existing.DeepCopy())
+	existing.Data = desired.Data
+	existing.Labels = desired.Labels
+	return r.Patch(ctx, existing, patch)
+}
+
+// deleteRESPAuthSecretIfExists removes the managed RESP auth secret if it exists
+// (e.g. when authSecretRef is removed from the spec).
+func (r *CacheBlendEngineReconciler) deleteRESPAuthSecretIfExists(ctx context.Context, engine *lmcachev1alpha1.CacheBlendEngine) error {
+	secret := &corev1.Secret{}
+	name := resources.RESPAuthSecretName(engine.Name)
+	err := r.Get(ctx, types.NamespacedName{Name: name, Namespace: engine.Namespace}, secret)
+	if err != nil {
+		if apierrors.IsNotFound(err) {
+			return nil
+		}
+		return err
+	}
+	// Only delete if we own it.
+	if metav1.IsControlledBy(secret, engine) {
+		return r.Delete(ctx, secret)
+	}
+	return nil
+}
diff --git a/operator/internal/controller/cacheblendengine_controller.go b/operator/internal/controller/cacheblendengine_controller.go
new file mode 100644
index 0000000000..c30a8dbae3
--- /dev/null
+++ b/operator/internal/controller/cacheblendengine_controller.go
@@ -0,0 +1,128 @@
+/*
+Copyright 2026.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controller
+
+import (
+	"context"
+
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/runtime"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	logf "sigs.k8s.io/controller-runtime/pkg/log"
+
+	lmcachev1alpha1 "github.com/LMCache/LMCache/api/v1alpha1"
+)
+
+// CacheBlendEngineReconciler reconciles a CacheBlendEngine object. It mirrors
+// LMCacheEngineReconciler but targets the blend_v3 engine workload and carries
+// no finalizer of its own: a brand-new CRD has no legacy CRs to migrate, so
+// owner-reference garbage collection alone cascade-deletes the child DaemonSet,
+// Services, ConfigMap, and managed Secret when the CR is removed.
+type CacheBlendEngineReconciler struct {
+	client.Client
+	Scheme *runtime.Scheme
+}
+
+// +kubebuilder:rbac:groups=lmcache.lmcache.ai,resources=cacheblendengines,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=lmcache.lmcache.ai,resources=cacheblendengines/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=lmcache.lmcache.ai,resources=cacheblendengines/finalizers,verbs=update
+
+// Reconcile reconciles the CacheBlendEngine CR. It applies defaults, validates
+// the spec, then converges the RESP auth Secret (only when RESP auth is
+// configured), the blend_v3 DaemonSet, the node-local lookup Service, the
+// headless metrics Service, the <engine>-connection ConfigMap, and the optional
+// ServiceMonitor before updating status. Every child carries a controller
+// reference for cascade deletion.
+func (r *CacheBlendEngineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	log := logf.FromContext(ctx)
+
+	// 1. Fetch CR
+	engine := &lmcachev1alpha1.CacheBlendEngine{}
+	if err := r.Get(ctx, req.NamespacedName, engine); err != nil {
+		if apierrors.IsNotFound(err) {
+			return ctrl.Result{}, nil
+		}
+		return ctrl.Result{}, err
+	}
+
+	// 2. Set defaults
+	engine.SetDefaults()
+
+	// 3. Validate
+	if err := r.validateAndSetCondition(ctx, engine); err != nil {
+		return ctrl.Result{}, err
+	}
+
+	// 4. Reconcile RESP auth secret (cross-namespace copy if needed)
+	if err := r.reconcileRESPAuthSecret(ctx, engine); err != nil {
+		log.Error(err, "Failed to reconcile RESP auth secret")
+		return ctrl.Result{}, err
+	}
+
+	// 5. Reconcile DaemonSet
+	if err := r.reconcileDaemonSet(ctx, engine); err != nil {
+		log.Error(err, "Failed to reconcile DaemonSet")
+		return ctrl.Result{}, err
+	}
+
+	// 6. Reconcile lookup Service (node-local discovery for vLLM)
+	if err := r.reconcileLookupService(ctx, engine); err != nil {
+		log.Error(err, "Failed to reconcile lookup Service")
+		return ctrl.Result{}, err
+	}
+
+	// 7. Reconcile metrics Service
+	if err := r.reconcileMetricsService(ctx, engine); err != nil {
+		log.Error(err, "Failed to reconcile metrics Service")
+		return ctrl.Result{}, err
+	}
+
+	// 8. Reconcile ConfigMap
+	if err := r.reconcileConnectionConfigMap(ctx, engine); err != nil {
+		log.Error(err, "Failed to reconcile ConfigMap")
+		return ctrl.Result{}, err
+	}
+
+	// 9. Reconcile ServiceMonitor
+	if err := r.reconcileServiceMonitor(ctx, engine); err != nil {
+		log.Error(err, "Failed to reconcile ServiceMonitor")
+		return ctrl.Result{}, err
+	}
+
+	// 10. Update status
+	if err := r.updateStatus(ctx, engine); err != nil {
+		log.Error(err, "Failed to update status")
+		return ctrl.Result{}, err
+	}
+
+	return ctrl.Result{}, nil
+}
+
+// SetupWithManager sets up the controller with the Manager.
+func (r *CacheBlendEngineReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	return ctrl.NewControllerManagedBy(mgr).
+		For(&lmcachev1alpha1.CacheBlendEngine{}).
+		Owns(&appsv1.DaemonSet{}).
+		Owns(&corev1.ConfigMap{}).
+		Owns(&corev1.Service{}).
+		Owns(&corev1.Secret{}).
+		Named("cacheblendengine").
+		Complete(r)
+}
diff --git a/operator/internal/controller/cacheblendengine_controller_test.go b/operator/internal/controller/cacheblendengine_controller_test.go
new file mode 100644
index 0000000000..2e19dcb8d9
--- /dev/null
+++ b/operator/internal/controller/cacheblendengine_controller_test.go
@@ -0,0 +1,212 @@
+/*
+Copyright 2026.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controller
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/controller-runtime/pkg/reconcile"
+
+	lmcachev1alpha1 "github.com/LMCache/LMCache/api/v1alpha1"
+)
+
+// cbeResourceName is the name of the CacheBlendEngine fixture reconciled by the
+// controller tests; the owner-reference helper checks against it.
+const cbeResourceName = "test-cbe"
+
+// argsContainFlagValue reports whether the ["--flag", "value", ...] slice
+// contains the given two-token flag/value pair.
+func argsContainFlagValue(args []string, flag, value string) bool {
+	for i := 0; i < len(args)-1; i++ {
+		if args[i] == flag && args[i+1] == value {
+			return true
+		}
+	}
+	return false
+}
+
+// ownedBy reports whether owners contains a controller reference to the
+// CacheBlendEngine test fixture (cbeResourceName).
+func ownedBy(owners []metav1.OwnerReference) bool {
+	for _, o := range owners {
+		if o.Kind == "CacheBlendEngine" && o.Name == cbeResourceName && o.Controller != nil && *o.Controller {
+			return true
+		}
+	}
+	return false
+}
+
+var _ = Describe("CacheBlendEngine Controller", func() {
+	Context("When reconciling a resource", func() {
+		const resourceName = cbeResourceName
+
+		ctx := context.Background()
+
+		typeNamespacedName := types.NamespacedName{
+			Name:      resourceName,
+			Namespace: "default",
+		}
+
+		BeforeEach(func() {
+			By("creating the custom resource for the Kind CacheBlendEngine")
+			engine := &lmcachev1alpha1.CacheBlendEngine{}
+			err := k8sClient.Get(ctx, typeNamespacedName, engine)
+			if err != nil && errors.IsNotFound(err) {
+				// injection.payloadImage.repository is required by ValidateSpec
+				// (the webhook needs it to inject a valid init container).
+				payloadRepo := "lmcache/cacheblend-plugin"
+				resource := &lmcachev1alpha1.CacheBlendEngine{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      resourceName,
+						Namespace: "default",
+					},
+					Spec: lmcachev1alpha1.CacheBlendEngineSpec{
+						L1: lmcachev1alpha1.L1BackendSpec{SizeGB: 10},
+						Injection: &lmcachev1alpha1.InjectionSpec{
+							PayloadImage: &lmcachev1alpha1.ImageSpec{Repository: &payloadRepo},
+						},
+					},
+				}
+				Expect(k8sClient.Create(ctx, resource)).To(Succeed())
+			}
+		})
+
+		AfterEach(func() {
+			resource := &lmcachev1alpha1.CacheBlendEngine{}
+			err := k8sClient.Get(ctx, typeNamespacedName, resource)
+			Expect(err).NotTo(HaveOccurred())
+
+			By("Cleanup the specific resource instance CacheBlendEngine")
+			Expect(k8sClient.Delete(ctx, resource)).To(Succeed())
+
+			// Drain child resources so a subsequent test starts clean (envtest
+			// has no GC controller, so ownerRef cascade deletion does not run).
+			// The lookup Service shares the engine's name (no suffix).
+			_ = k8sClient.Delete(ctx, &appsv1.DaemonSet{ObjectMeta: metav1.ObjectMeta{Name: resourceName, Namespace: "default"}})
+			_ = k8sClient.Delete(ctx, &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: resourceName, Namespace: "default"}})
+			_ = k8sClient.Delete(ctx, &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: resourceName + "-metrics", Namespace: "default"}})
+			_ = k8sClient.Delete(ctx, &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: resourceName + "-connection", Namespace: "default"}})
+		})
+
+		It("should reconcile to a blend_v3 DaemonSet, Services, and connection ConfigMap with ownerRefs", func() {
+			controllerReconciler := &CacheBlendEngineReconciler{
+				Client: k8sClient,
+				Scheme: k8sClient.Scheme(),
+			}
+
+			By("Reconciling the created resource")
+			_, err := controllerReconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: typeNamespacedName,
+			})
+			Expect(err).NotTo(HaveOccurred())
+
+			By("Verifying the blend_v3 DaemonSet")
+			ds := &appsv1.DaemonSet{}
+			Expect(k8sClient.Get(ctx, typeNamespacedName, ds)).To(Succeed())
+			Expect(ownedBy(ds.OwnerReferences)).To(BeTrue())
+
+			podSpec := ds.Spec.Template.Spec
+			Expect(podSpec.HostIPC).To(BeTrue())
+			Expect(podSpec.Containers).To(HaveLen(1))
+			engineContainer := podSpec.Containers[0]
+
+			Expect(argsContainFlagValue(engineContainer.Args, "--engine-type", "blend_v3")).To(BeTrue())
+			Expect(argsContainFlagValue(engineContainer.Args, "--l1-align-bytes", "16777216")).To(BeTrue())
+
+			By("Verifying there is no GPU resource claim")
+			_, hasGPU := engineContainer.Resources.Limits["nvidia.com/gpu"]
+			Expect(hasGPU).To(BeFalse())
+
+			By("Verifying the lookup Service is node-local (named after the engine)")
+			lookupSvc := &corev1.Service{}
+			Expect(k8sClient.Get(ctx, typeNamespacedName, lookupSvc)).To(Succeed())
+			Expect(ownedBy(lookupSvc.OwnerReferences)).To(BeTrue())
+			Expect(lookupSvc.Spec.InternalTrafficPolicy).NotTo(BeNil())
+			Expect(*lookupSvc.Spec.InternalTrafficPolicy).To(Equal(corev1.ServiceInternalTrafficPolicyLocal))
+
+			By("Verifying the headless metrics Service")
+			metricsSvc := &corev1.Service{}
+			Expect(k8sClient.Get(ctx, types.NamespacedName{Name: resourceName + "-metrics", Namespace: "default"}, metricsSvc)).To(Succeed())
+			Expect(ownedBy(metricsSvc.OwnerReferences)).To(BeTrue())
+			Expect(metricsSvc.Spec.ClusterIP).To(Equal(corev1.ClusterIPNone))
+
+			By("Verifying the connection ConfigMap carries CBKVConnector JSON")
+			cm := &corev1.ConfigMap{}
+			Expect(k8sClient.Get(ctx, types.NamespacedName{Name: resourceName + "-connection", Namespace: "default"}, cm)).To(Succeed())
+			Expect(ownedBy(cm.OwnerReferences)).To(BeTrue())
+			jsonStr, ok := cm.Data["kv-transfer-config.json"]
+			Expect(ok).To(BeTrue())
+			Expect(strings.Contains(jsonStr, "CBKVConnector")).To(BeTrue())
+
+			config := map[string]any{}
+			Expect(json.Unmarshal([]byte(jsonStr), &config)).To(Succeed())
+			Expect(config["kv_connector"]).To(Equal("CBKVConnector"))
+
+			// design §7: the connector must dial the node-local Service over TCP
+			// (the key correction vs the single-machine model card), and carry the
+			// blend tunables.
+			extra, ok := config["kv_connector_extra_config"].(map[string]any)
+			Expect(ok).To(BeTrue())
+			Expect(extra["lmcache.mp.host"]).To(Equal(
+				fmt.Sprintf("tcp://%s.default.svc.cluster.local", resourceName)))
+			Expect(extra).To(HaveKey("lmcache.mp.port"))
+			Expect(extra).To(HaveKey("cb.check_layer"))
+			Expect(extra).To(HaveKey("cb.recomp_ratio"))
+		})
+
+		It("should converge status with ConfigValid=True after reconcile", func() {
+			controllerReconciler := &CacheBlendEngineReconciler{
+				Client: k8sClient,
+				Scheme: k8sClient.Scheme(),
+			}
+
+			_, err := controllerReconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: typeNamespacedName,
+			})
+			Expect(err).NotTo(HaveOccurred())
+
+			updated := &lmcachev1alpha1.CacheBlendEngine{}
+			Expect(k8sClient.Get(ctx, typeNamespacedName, updated)).To(Succeed())
+
+			By("Verifying the ConfigValid condition is True")
+			var found bool
+			for _, cond := range updated.Status.Conditions {
+				if cond.Type == lmcachev1alpha1.ConditionConfigValid {
+					found = true
+					Expect(cond.Status).To(Equal(metav1.ConditionTrue))
+				}
+			}
+			Expect(found).To(BeTrue())
+
+			By("Verifying the observed generation tracks the spec")
+			Expect(updated.Status.ObservedGeneration).To(Equal(updated.Generation))
+			// envtest has no kubelet, so no instances are scheduled: phase Pending.
+			Expect(updated.Status.Phase).To(Equal(lmcachev1alpha1.PhasePending))
+		})
+	})
+})
diff --git a/operator/internal/resources/cacheblend_engine.go b/operator/internal/resources/cacheblend_engine.go
new file mode 100644
index 0000000000..156f22ac31
--- /dev/null
+++ b/operator/internal/resources/cacheblend_engine.go
@@ -0,0 +1,183 @@
+/*
+Copyright 2026.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package resources
+
+import (
+	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+
+	lmcachev1alpha1 "github.com/LMCache/LMCache/api/v1alpha1"
+)
+
+const (
+	// cbEngineType is the value of the --engine-type flag that selects the
+	// CacheBlend blend_v3 engine on the lmcache server binary
+	// (cacheblend-plugin/README.md:24).
+	cbEngineType = "blend_v3"
+
+	// cbL1AlignBytes is the value of the --l1-align-bytes flag required by the
+	// blend server (blend_server.sh:31).
+	cbL1AlignBytes = "16777216"
+
+	// cbDefaultImageRepo is the default engine image repository. CacheBlend runs
+	// the same lmcache server binary as LMCacheEngine, so it shares the default
+	// image; a private blend image is set via spec.image.
+	cbDefaultImageRepo = "lmcache/vllm-openai"
+
+	// cbKVConnector is the connector name injected into vLLM pods bound to a
+	// CacheBlendEngine (design §7).
+	cbKVConnector = "CBKVConnector"
+
+	// cbKVConnectorModulePath is the import path of the CacheBlend connector
+	// (cacheblend-plugin/README.md:66).
+	cbKVConnectorModulePath = "lmcache_cacheblend.connector"
+)
+
+// cbSpecToEngineSpec projects a CacheBlendEngineSpec onto an LMCacheEngineSpec so
+// the shared, spec-keyed resource builders (ComputeResources, BuildContainerArgs,
+// and the buildDaemonSetCore/buildLookupServiceCore/buildMetricsServiceCore
+// scaffolding) can be reused without duplicating the GPU/security pod template.
+//
+// CacheBlendEngineSpec deliberately reuses the same shared sub-structs as
+// LMCacheEngineSpec (ServerSpec, L1BackendSpec, EvictionSpec, PrometheusSpec,
+// L2BackendSpec, ImageSpec, etc.), so every field consumed by the shared builders
+// maps across one-to-one. The CacheBlend-only fields (Blend, Injection) are not
+// consumed by the engine builders and are intentionally dropped here; they are
+// surfaced separately (Blend via BuildCBConnectionConfigMap, Injection via the
+// admission webhook).
+func cbSpecToEngineSpec(spec *lmcachev1alpha1.CacheBlendEngineSpec) *lmcachev1alpha1.LMCacheEngineSpec {
+	return &lmcachev1alpha1.LMCacheEngineSpec{
+		GPUVendor:          spec.GPUVendor,
+		Image:              spec.Image,
+		ImagePullSecrets:   spec.ImagePullSecrets,
+		Server:             spec.Server,
+		L1:                 spec.L1,
+		Eviction:           spec.Eviction,
+		Prometheus:         spec.Prometheus,
+		L2Backend:          spec.L2Backend,
+		ResourceOverrides:  spec.ResourceOverrides,
+		LogLevel:           spec.LogLevel,
+		NodeSelector:       spec.NodeSelector,
+		Affinity:           spec.Affinity,
+		Tolerations:        spec.Tolerations,
+		Env:                spec.Env,
+		Volumes:            spec.Volumes,
+		VolumeMounts:       spec.VolumeMounts,
+		PodAnnotations:     spec.PodAnnotations,
+		PodLabels:          spec.PodLabels,
+		ServiceAccountName: spec.ServiceAccountName,
+		PriorityClassName:  spec.PriorityClassName,
+		ExtraArgs:          spec.ExtraArgs,
+	}
+}
+
+// BuildCBEngineArgs returns the server CLI args for the blend_v3 engine: the
+// proven LMCacheEngine serialization (--host/--port/--l1-size-gb/--chunk-size/
+// eviction/prometheus/L2) plus the CacheBlend-specific --engine-type blend_v3 and
+// --l1-align-bytes flags. The blend flags are inserted before the user-supplied
+// extraArgs so a user can still override them.
+func BuildCBEngineArgs(spec *lmcachev1alpha1.CacheBlendEngineSpec) []string {
+	// Serialize the base server args WITHOUT the user extraArgs, append the blend
+	// flags, then append extraArgs last so they retain their override precedence.
+	base := cbSpecToEngineSpec(spec)
+	base.ExtraArgs = nil
+	args := BuildContainerArgs(base)
+	args = append(args,
+		"--engine-type", cbEngineType,
+		"--l1-align-bytes", cbL1AlignBytes,
+	)
+	args = append(args, spec.ExtraArgs...)
+	return args
+}
+
+// BuildCBEngineDaemonSet constructs the DaemonSet for the blend_v3 engine of the
+// given CacheBlendEngine. It reuses the shared GPU/security pod-template
+// scaffolding (hostIPC, runtimeClassName=nvidia, privileged, NVIDIA_VISIBLE_DEVICES=all,
+// CPU+memory-only resources with no nvidia.com/gpu claim) so the engine shares the
+// node's GPU via CUDA IPC, and adds the blend-specific server args.
+func BuildCBEngineDaemonSet(engine *lmcachev1alpha1.CacheBlendEngine) *appsv1.DaemonSet {
+	engineSpec := cbSpecToEngineSpec(&engine.Spec)
+	return buildDaemonSetCore(
+		engine.Name,
+		engine.Namespace,
+		engineSpec,
+		BuildCBEngineArgs(&engine.Spec),
+		cbDefaultImageRepo,
+	)
+}
+
+// BuildCBEngineLookupService creates the node-local lookup Service
+// (internalTrafficPolicy=Local) for the CacheBlendEngine, so opted-in vLLM pods
+// reach the blend_v3 engine on their own node.
+func BuildCBEngineLookupService(engine *lmcachev1alpha1.CacheBlendEngine) *corev1.Service {
+	return buildLookupServiceCore(engine.Name, engine.Namespace, cbSpecToEngineSpec(&engine.Spec))
+}
+
+// BuildCBEngineMetricsService creates the headless metrics Service for the
+// CacheBlendEngine.
+func BuildCBEngineMetricsService(engine *lmcachev1alpha1.CacheBlendEngine) *corev1.Service {
+	return buildMetricsServiceCore(engine.Name, engine.Namespace, cbSpecToEngineSpec(&engine.Spec))
+}
+
+// BuildCBConnectionConfigMap creates the <engine>-connection ConfigMap carrying
+// the CBKVConnector kv-transfer-config JSON (design §7). The JSON points vLLM at
+// the node-local Service (lmcache.mp.host) and carries the blend tunables
+// cb.check_layer and cb.recomp_ratio read from spec.Blend (defaults are pinned by
+// SetDefaults: checkLayer=1, recompRatio=0.15).
+func BuildCBConnectionConfigMap(engine *lmcachev1alpha1.CacheBlendEngine) *corev1.ConfigMap {
+	spec := &engine.Spec
+	// Use the same default (5555) as BuildContainerArgs/getServerPort so the
+	// connection ConfigMap's lmcache.mp.port always matches the engine's actual
+	// --port; the two artifacts must never drift (design §9.10).
+	port := derefInt32(getServerPort(cbSpecToEngineSpec(spec)), 5555)
+
+	checkLayer := int32(1)
+	recompRatio := 0.15
+	if spec.Blend != nil {
+		checkLayer = derefInt32(spec.Blend.CheckLayer, 1)
+		recompRatio = derefFloat64(spec.Blend.RecompRatio, 0.15)
+	}
+
+	extra := map[string]any{
+		"cb.check_layer":  checkLayer,
+		"cb.recomp_ratio": recompRatio,
+	}
+
+	return buildConnectionConfigMapCore(
+		engine.Name,
+		engine.Namespace,
+		cbKVConnector,
+		cbKVConnectorModulePath,
+		port,
+		extra,
+	)
+}
+
+// CBServiceMonitorEnabled reports whether a ServiceMonitor should be created for
+// the given CacheBlendEngine. It reuses the shared spec-keyed predicate since
+// CacheBlendEngineSpec embeds the same PrometheusSpec sub-struct.
+func CBServiceMonitorEnabled(engine *lmcachev1alpha1.CacheBlendEngine) bool {
+	return ServiceMonitorEnabled(cbSpecToEngineSpec(&engine.Spec))
+}
+
+// BuildCBServiceMonitor creates the ServiceMonitor CR for the CacheBlendEngine,
+// reusing the shared name/namespace/spec-keyed core. Callers must ensure
+// CBServiceMonitorEnabled(engine) is true.
+func BuildCBServiceMonitor(engine *lmcachev1alpha1.CacheBlendEngine) *monitoringv1.ServiceMonitor {
+	return buildServiceMonitorCore(engine.Name, engine.Namespace, cbSpecToEngineSpec(&engine.Spec))
+}
diff --git a/operator/internal/resources/cacheblend_engine_test.go b/operator/internal/resources/cacheblend_engine_test.go
new file mode 100644
index 0000000000..f92876221e
--- /dev/null
+++ b/operator/internal/resources/cacheblend_engine_test.go
@@ -0,0 +1,379 @@
+/*
+Copyright 2026.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package resources
+
+import (
+	"encoding/json"
+	"slices"
+	"testing"
+
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	lmcachev1alpha1 "github.com/LMCache/LMCache/api/v1alpha1"
+)
+
+// minimalCBEngine returns a CacheBlendEngine with only the required L1 size set,
+// then applies SetDefaults so blend/injection/chunk-size defaults are pinned —
+// matching how the controller hands the object to the builders.
+func minimalCBEngine() *lmcachev1alpha1.CacheBlendEngine {
+	e := &lmcachev1alpha1.CacheBlendEngine{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      testEngineName,
+			Namespace: testNamespace,
+		},
+		Spec: lmcachev1alpha1.CacheBlendEngineSpec{
+			L1: lmcachev1alpha1.L1BackendSpec{SizeGB: 10},
+		},
+	}
+	e.SetDefaults()
+	return e
+}
+
+// ===========================
+// BuildCBEngineArgs
+// ===========================
+
+func TestBuildCBEngineArgs_BlendFlags(t *testing.T) {
+	args := BuildCBEngineArgs(&minimalCBEngine().Spec)
+
+	// Blend-specific flags.
+	assertArg(t, args, "--engine-type", "blend_v3")
+	assertArg(t, args, "--l1-align-bytes", "16777216")
+
+	// Reuses the proven LMCacheEngine serialization (NOT --l1-size).
+	assertArg(t, args, "--l1-size-gb", "10.0")
+	assertNoArg(t, args, "--l1-size")
+
+	// chunk-size pinned to 256 by SetDefaults.
+	assertArg(t, args, "--chunk-size", "256")
+
+	// Standard server args carried over.
+	assertArg(t, args, "--host", "0.0.0.0")
+	assertArg(t, args, "--port", "5555")
+	assertArg(t, args, "--hash-algorithm", "blake3")
+}
+
+func TestBuildCBEngineArgs_ExtraArgsAfterBlendFlags(t *testing.T) {
+	engine := minimalCBEngine()
+	engine.Spec.ExtraArgs = []string{"--engine-type", "override-me"}
+
+	args := BuildCBEngineArgs(&engine.Spec)
+
+	// The user's extraArgs are appended last so they can override the blend
+	// defaults; assert the override value appears after the operator-set one.
+	firstIdx := slices.Index(args, "--engine-type")
+	lastIdx := -1
+	for i, a := range args {
+		if a == "--engine-type" {
+			lastIdx = i
+		}
+	}
+	if firstIdx == lastIdx {
+		t.Fatalf("expected --engine-type to appear twice, got args=%v", args)
+	}
+	if args[firstIdx+1] != "blend_v3" {
+		t.Fatalf("expected operator-set --engine-type blend_v3 first, got %s", args[firstIdx+1])
+	}
+	if args[lastIdx+1] != "override-me" {
+		t.Fatalf("expected user --engine-type override-me last, got %s", args[lastIdx+1])
+	}
+}
+
+// ===========================
+// BuildCBEngineDaemonSet
+// ===========================
+
+func TestBuildCBEngineDaemonSet_GPUAndSecurity(t *testing.T) {
+	engine := minimalCBEngine()
+	ds := BuildCBEngineDaemonSet(engine)
+
+	if ds.Name != testEngineName {
+		t.Fatalf("expected name %s, got %s", testEngineName, ds.Name)
+	}
+	if ds.Namespace != testNamespace {
+		t.Fatalf("expected namespace %s, got %s", testNamespace, ds.Namespace)
+	}
+
+	podSpec := ds.Spec.Template.Spec
+
+	// hostIPC required for CUDA IPC with the node-local engine.
+	if !podSpec.HostIPC {
+		t.Fatal("expected HostIPC=true")
+	}
+	// runtimeClassName=nvidia for the default (nvidia) vendor.
+	if podSpec.RuntimeClassName == nil || *podSpec.RuntimeClassName != nvidiaRuntimeClass {
+		t.Fatalf("expected RuntimeClassName=nvidia, got %v", podSpec.RuntimeClassName)
+	}
+
+	if len(podSpec.Containers) != 1 {
+		t.Fatalf("expected 1 container, got %d", len(podSpec.Containers))
+	}
+	c := podSpec.Containers[0]
+
+	// privileged: true.
+	if c.SecurityContext == nil || c.SecurityContext.Privileged == nil || !*c.SecurityContext.Privileged {
+		t.Fatal("expected privileged=true")
+	}
+
+	// NVIDIA env exposes all GPUs without a device-plugin claim.
+	if !hasEnvAll(c.Env, "NVIDIA_VISIBLE_DEVICES") {
+		t.Fatal("missing NVIDIA_VISIBLE_DEVICES=all")
+	}
+	if !hasEnvAll(c.Env, "NVIDIA_DRIVER_CAPABILITIES") {
+		t.Fatal("missing NVIDIA_DRIVER_CAPABILITIES=all")
+	}
+
+	// Command is the same lmcache server binary as LMCacheEngine.
+	if len(c.Command) < 2 || c.Command[0] != lmcacheServerBinary || c.Command[1] != serverSubcommand {
+		t.Fatalf("expected lmcache server command, got %v", c.Command)
+	}
+
+	// Blend args present on the container.
+	assertArg(t, c.Args, "--engine-type", "blend_v3")
+	assertArg(t, c.Args, "--l1-align-bytes", "16777216")
+}
+
+func TestBuildCBEngineDaemonSet_NoGPUResourceClaim(t *testing.T) {
+	engine := minimalCBEngine()
+	ds := BuildCBEngineDaemonSet(engine)
+	c := ds.Spec.Template.Spec.Containers[0]
+
+	// The engine shares vLLM's GPU via CUDA IPC; it must NOT claim a
+	// device-plugin GPU (nvidia.com/gpu) in requests or limits.
+	const gpuResource = corev1.ResourceName("nvidia.com/gpu")
+	if _, ok := c.Resources.Requests[gpuResource]; ok {
+		t.Fatal("nvidia.com/gpu must not be in resource requests")
+	}
+	if _, ok := c.Resources.Limits[gpuResource]; ok {
+		t.Fatal("nvidia.com/gpu must not be in resource limits")
+	}
+
+	// CPU + memory ARE present (auto-computed from L1 size = 10).
+	if c.Resources.Requests.Cpu().IsZero() {
+		t.Fatal("expected a CPU request")
+	}
+	if c.Resources.Requests.Memory().IsZero() {
+		t.Fatal("expected a memory request")
+	}
+}
+
+func TestBuildCBEngineDaemonSet_ImagePullSecrets(t *testing.T) {
+	engine := minimalCBEngine()
+	engine.Spec.ImagePullSecrets = []corev1.LocalObjectReference{{Name: "private-engine-reg"}}
+
+	ds := BuildCBEngineDaemonSet(engine)
+
+	secrets := ds.Spec.Template.Spec.ImagePullSecrets
+	if len(secrets) != 1 || secrets[0].Name != "private-engine-reg" {
+		t.Fatalf("expected engine imagePullSecrets wired onto the pod, got %v", secrets)
+	}
+}
+
+func TestBuildCBEngineDaemonSet_DefaultImage(t *testing.T) {
+	engine := minimalCBEngine()
+	ds := BuildCBEngineDaemonSet(engine)
+	c := ds.Spec.Template.Spec.Containers[0]
+
+	if c.Image != "lmcache/vllm-openai:latest" {
+		t.Fatalf("expected default image lmcache/vllm-openai:latest, got %s", c.Image)
+	}
+}
+
+func TestBuildCBEngineDaemonSet_AMDNoRuntimeClass(t *testing.T) {
+	engine := minimalCBEngine()
+	engine.Spec.GPUVendor = ptr(lmcachev1alpha1.GPUVendorAMD)
+
+	ds := BuildCBEngineDaemonSet(engine)
+	podSpec := ds.Spec.Template.Spec
+
+	if podSpec.RuntimeClassName != nil {
+		t.Fatalf("expected nil RuntimeClassName for AMD, got %q", *podSpec.RuntimeClassName)
+	}
+	// Still privileged + hostIPC.
+	if !podSpec.HostIPC {
+		t.Fatal("expected HostIPC=true even for AMD")
+	}
+}
+
+// ===========================
+// BuildCBEngineLookupService / MetricsService
+// ===========================
+
+func TestBuildCBEngineLookupService_Local(t *testing.T) {
+	engine := minimalCBEngine()
+	svc := BuildCBEngineLookupService(engine)
+
+	if svc.Name != testEngineName {
+		t.Fatalf("expected name %s, got %s", testEngineName, svc.Name)
+	}
+	if svc.Spec.ClusterIP == corev1.ClusterIPNone {
+		t.Fatal("lookup service must not be headless")
+	}
+	if svc.Spec.InternalTrafficPolicy == nil ||
+		*svc.Spec.InternalTrafficPolicy != corev1.ServiceInternalTrafficPolicyLocal {
+		t.Fatal("expected internalTrafficPolicy=Local")
+	}
+	if svc.Spec.Ports[0].Port != 5555 {
+		t.Fatalf("expected server port 5555, got %d", svc.Spec.Ports[0].Port)
+	}
+}
+
+func TestBuildCBEngineMetricsService_Headless(t *testing.T) {
+	engine := minimalCBEngine()
+	svc := BuildCBEngineMetricsService(engine)
+
+	if svc.Name != testEngineName+"-metrics" {
+		t.Fatalf("expected name %s-metrics, got %s", testEngineName, svc.Name)
+	}
+	if svc.Spec.ClusterIP != corev1.ClusterIPNone {
+		t.Fatal("expected headless metrics service (ClusterIP=None)")
+	}
+	if svc.Spec.Ports[0].Port != 9090 {
+		t.Fatalf("expected metrics port 9090, got %d", svc.Spec.Ports[0].Port)
+	}
+}
+
+// ===========================
+// BuildCBConnectionConfigMap
+// ===========================
+
+// parseCBConnectionConfig unmarshals the CBKVConnector kv-transfer-config JSON
+// from the connection ConfigMap and returns the top-level config plus the
+// kv_connector_extra_config submap.
+func parseCBConnectionConfig(t *testing.T, cm *corev1.ConfigMap) (map[string]any, map[string]any) {
+	t.Helper()
+	jsonStr, ok := cm.Data["kv-transfer-config.json"]
+	if !ok {
+		t.Fatal("missing kv-transfer-config.json key")
+	}
+	var config map[string]any
+	if err := json.Unmarshal([]byte(jsonStr), &config); err != nil {
+		t.Fatalf("invalid JSON: %v", err)
+	}
+	extra, ok := config["kv_connector_extra_config"].(map[string]any)
+	if !ok {
+		t.Fatal("missing kv_connector_extra_config map")
+	}
+	return config, extra
+}
+
+func TestBuildCBConnectionConfigMap_Default(t *testing.T) {
+	engine := minimalCBEngine()
+	cm := BuildCBConnectionConfigMap(engine)
+
+	if cm.Name != "test-engine-connection" {
+		t.Fatalf("expected name test-engine-connection, got %s", cm.Name)
+	}
+	if cm.Namespace != testNamespace {
+		t.Fatalf("expected namespace %s, got %s", testNamespace, cm.Namespace)
+	}
+
+	config, extra := parseCBConnectionConfig(t, cm)
+
+	if config["kv_connector"] != "CBKVConnector" {
+		t.Fatalf("expected kv_connector=CBKVConnector, got %v", config["kv_connector"])
+	}
+	if config["kv_connector_module_path"] != "lmcache_cacheblend.connector" {
+		t.Fatalf("expected kv_connector_module_path=lmcache_cacheblend.connector, got %v",
+			config["kv_connector_module_path"])
+	}
+	if config["kv_role"] != "kv_both" {
+		t.Fatalf("expected kv_role=kv_both, got %v", config["kv_role"])
+	}
+
+	if extra["lmcache.mp.host"] != "tcp://test-engine.default.svc.cluster.local" {
+		t.Fatalf("expected tcp:// node-local Service host, got %v", extra["lmcache.mp.host"])
+	}
+	if extra["lmcache.mp.port"] != "5555" {
+		t.Fatalf("expected lmcache.mp.port=5555, got %v", extra["lmcache.mp.port"])
+	}
+
+	// Blend tunables from SetDefaults (checkLayer=1, recompRatio=0.15). JSON
+	// numbers decode to float64.
+	if extra["cb.check_layer"] != float64(1) {
+		t.Fatalf("expected cb.check_layer=1, got %v", extra["cb.check_layer"])
+	}
+	if extra["cb.recomp_ratio"] != 0.15 {
+		t.Fatalf("expected cb.recomp_ratio=0.15, got %v", extra["cb.recomp_ratio"])
+	}
+}
+
+func TestBuildCBConnectionConfigMap_CustomBlendAndPort(t *testing.T) {
+	engine := minimalCBEngine()
+	engine.Spec.Server.Port = ptr(int32(6566))
+	engine.Spec.Blend = &lmcachev1alpha1.BlendSpec{
+		CheckLayer:  ptr(int32(3)),
+		RecompRatio: ptr(0.5),
+	}
+
+	cm := BuildCBConnectionConfigMap(engine)
+	_, extra := parseCBConnectionConfig(t, cm)
+
+	if extra["lmcache.mp.port"] != "6566" {
+		t.Fatalf("expected lmcache.mp.port=6566, got %v", extra["lmcache.mp.port"])
+	}
+	if extra["cb.check_layer"] != float64(3) {
+		t.Fatalf("expected cb.check_layer=3, got %v", extra["cb.check_layer"])
+	}
+	if extra["cb.recomp_ratio"] != 0.5 {
+		t.Fatalf("expected cb.recomp_ratio=0.5, got %v", extra["cb.recomp_ratio"])
+	}
+}
+
+// TestBuildCBConnectionConfigMap_PortMatchesEngineArgs asserts the connection
+// ConfigMap's lmcache.mp.port and the engine DaemonSet's --port never drift, for
+// both the default and a user-set port.
+func TestBuildCBConnectionConfigMap_PortMatchesEngineArgs(t *testing.T) {
+	tests := []struct {
+		name     string
+		port     *int32
+		wantPort string
+	}{
+		{name: "default", port: nil, wantPort: "5555"},
+		{name: "custom", port: ptr(int32(6566)), wantPort: "6566"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			engine := minimalCBEngine()
+			engine.Spec.Server.Port = tt.port
+
+			args := BuildCBEngineArgs(&engine.Spec)
+			cm := BuildCBConnectionConfigMap(engine)
+			_, extra := parseCBConnectionConfig(t, cm)
+
+			assertArg(t, args, "--port", tt.wantPort)
+			if extra["lmcache.mp.port"] != tt.wantPort {
+				t.Fatalf("connection port %v != engine --port %s", extra["lmcache.mp.port"], tt.wantPort)
+			}
+		})
+	}
+}
+
+// TestBuildCBEngine_ChunkSizeConsistency asserts the engine's chunk-size is 256
+// (the only value CacheBlend supports — block_size 64 * 4), matching the locked
+// CacheBlendChunkSize constant, so it cannot drift from the injected --block-size.
+func TestBuildCBEngine_ChunkSizeConsistency(t *testing.T) {
+	engine := minimalCBEngine()
+	args := BuildCBEngineArgs(&engine.Spec)
+
+	assertArg(t, args, "--chunk-size", "256")
+	if lmcachev1alpha1.CacheBlendChunkSize != 256 {
+		t.Fatalf("CacheBlendChunkSize constant must be 256, got %d", lmcachev1alpha1.CacheBlendChunkSize)
+	}
+}
diff --git a/operator/internal/resources/configmap.go b/operator/internal/resources/configmap.go
index 7ebc413122..db4a2a6251 100644
--- a/operator/internal/resources/configmap.go
+++ b/operator/internal/resources/configmap.go
@@ -19,6 +19,7 @@ package resources
 import (
 	"encoding/json"
 	"fmt"
+	"maps"
 
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -31,28 +32,68 @@ func LookupServiceName(engineName string) string {
 	return engineName
 }
 
+// ConnectionConfigMapName returns the name of the <engine>-connection ConfigMap.
+func ConnectionConfigMapName(engineName string) string {
+	return fmt.Sprintf("%s-connection", engineName)
+}
+
 // BuildConnectionConfigMap creates the <name>-connection ConfigMap with kv-transfer-config JSON.
 func BuildConnectionConfigMap(engine *lmcachev1alpha1.LMCacheEngine) *corev1.ConfigMap {
 	port := derefInt32(getServerPort(&engine.Spec), 5555)
-	svcHost := fmt.Sprintf("%s.%s.svc.cluster.local", LookupServiceName(engine.Name), engine.Namespace)
+
+	return buildConnectionConfigMapCore(
+		engine.Name,
+		engine.Namespace,
+		"LMCacheMPConnector",
+		"lmcache.integration.vllm.lmcache_mp_connector",
+		port,
+		nil,
+	)
+}
+
+// buildConnectionConfigMapCore is the shared core for the <engine>-connection
+// ConfigMap that both engine controllers emit. It produces the kv-transfer-config
+// JSON with the node-local Service host/port and lets the caller select the
+// connector name, its module path, and any connector-specific extra config keys
+// (e.g. CacheBlend's cb.check_layer / cb.recomp_ratio).
+//
+// Parameters:
+//   - name, namespace: the owning engine's identity (drives the ConfigMap name,
+//     labels, and the node-local Service DNS host).
+//   - connectorName: the kv_connector value (e.g. "LMCacheMPConnector" or
+//     "CBKVConnector").
+//   - modulePath: the kv_connector_module_path value.
+//   - port: the engine server port, emitted as lmcache.mp.port (string).
+//   - extraConfig: additional kv_connector_extra_config keys merged on top of the
+//     base lmcache.mp.host / lmcache.mp.port entries; nil for the default
+//     connector.
+func buildConnectionConfigMapCore(
+	name, namespace, connectorName, modulePath string,
+	port int32,
+	extraConfig map[string]any,
+) *corev1.ConfigMap {
+	svcHost := fmt.Sprintf("%s.%s.svc.cluster.local", LookupServiceName(name), namespace)
+
+	extra := map[string]any{
+		"lmcache.mp.host": fmt.Sprintf("tcp://%s", svcHost),
+		"lmcache.mp.port": fmt.Sprintf("%d", port),
+	}
+	maps.Copy(extra, extraConfig)
 
 	config := map[string]any{
-		"kv_connector":             "LMCacheMPConnector",
-		"kv_connector_module_path": "lmcache.integration.vllm.lmcache_mp_connector",
-		"kv_role":                  "kv_both",
-		"kv_connector_extra_config": map[string]any{
-			"lmcache.mp.host": fmt.Sprintf("tcp://%s", svcHost),
-			"lmcache.mp.port": fmt.Sprintf("%d", port),
-		},
+		"kv_connector":              connectorName,
+		"kv_connector_module_path":  modulePath,
+		"kv_role":                   "kv_both",
+		"kv_connector_extra_config": extra,
 	}
 
 	configJSON, _ := json.MarshalIndent(config, "", "  ")
 
 	return &corev1.ConfigMap{
 		ObjectMeta: metav1.ObjectMeta{
-			Name:      fmt.Sprintf("%s-connection", engine.Name),
-			Namespace: engine.Namespace,
-			Labels:    StandardLabels(engine.Name),
+			Name:      ConnectionConfigMapName(name),
+			Namespace: namespace,
+			Labels:    StandardLabels(name),
 		},
 		Data: map[string]string{
 			"kv-transfer-config.json": string(configJSON),
diff --git a/operator/internal/resources/daemonset.go b/operator/internal/resources/daemonset.go
index 35b89b01ca..33b3d306aa 100644
--- a/operator/internal/resources/daemonset.go
+++ b/operator/internal/resources/daemonset.go
@@ -27,23 +27,63 @@ import (
 	lmcachev1alpha1 "github.com/LMCache/LMCache/api/v1alpha1"
 )
 
+const (
+	// nvidiaRuntimeClass is the RuntimeClass name registered by the NVIDIA GPU
+	// Operator; engine pods request it when gpuVendor is nvidia.
+	nvidiaRuntimeClass = "nvidia"
+
+	// lmcacheServerBinary is the entrypoint binary for the LMCache server inside
+	// the engine image.
+	lmcacheServerBinary = "/opt/venv/bin/lmcache"
+
+	// serverSubcommand is the `lmcache server` subcommand that starts the engine.
+	serverSubcommand = "server"
+
+	// serverPortName is the name of the engine's serving port on the container
+	// and the node-local Service.
+	serverPortName = "server"
+)
+
 // BuildDaemonSet constructs a DaemonSet for the given LMCacheEngine.
 func BuildDaemonSet(engine *lmcachev1alpha1.LMCacheEngine) *appsv1.DaemonSet {
-	spec := &engine.Spec
-	selectorLabels := SelectorLabels(engine.Name)
-	podLabels := MergeLabels(StandardLabels(engine.Name), spec.PodLabels)
+	return buildDaemonSetCore(engine.Name, engine.Namespace, &engine.Spec, BuildContainerArgs(&engine.Spec), "lmcache/vllm-openai")
+}
+
+// buildDaemonSetCore constructs the DaemonSet shared by the LMCacheEngine and
+// CacheBlendEngine controllers. It is the single source of truth for the
+// GPU/security pod-template scaffolding (hostIPC, runtimeClassName, privileged,
+// NVIDIA_VISIBLE_DEVICES, resources without a device-plugin GPU claim) so those
+// settings cannot drift between the two engines.
+//
+// Parameters:
+//   - name, namespace: the owning object's identity, used for labels and metadata.
+//   - spec: the engine spec (LMCacheEngine and CacheBlendEngine reuse the same
+//     shared sub-structs, so callers project the CacheBlendEngine spec into an
+//     *LMCacheEngineSpec before calling).
+//   - containerArgs: the fully serialized server CLI args (callers append any
+//     engine-specific flags such as --engine-type before passing them in).
+//   - defaultImageRepo: the container image repository to use when spec.Image
+//     does not set one.
+func buildDaemonSetCore(
+	name, namespace string,
+	spec *lmcachev1alpha1.LMCacheEngineSpec,
+	containerArgs []string,
+	defaultImageRepo string,
+) *appsv1.DaemonSet {
+	selectorLabels := SelectorLabels(name)
+	podLabels := MergeLabels(StandardLabels(name), spec.PodLabels)
 	podAnnotations := spec.PodAnnotations
 
 	gpuVendor := derefString(spec.GPUVendor, lmcachev1alpha1.GPUVendorNvidia)
 	var runtimeClassName *string
 	if gpuVendor == lmcachev1alpha1.GPUVendorNvidia {
-		rc := "nvidia"
+		rc := nvidiaRuntimeClass
 		runtimeClassName = &rc
 	}
 	privileged := true
 
 	serverPort := derefInt32(getServerPort(spec), 5555)
-	imgRepo := "lmcache/vllm-openai"
+	imgRepo := defaultImageRepo
 	imgTag := "latest"
 	imgPullPolicy := corev1.PullIfNotPresent
 	if spec.Image != nil {
@@ -86,7 +126,7 @@ func BuildDaemonSet(engine *lmcachev1alpha1.LMCacheEngine) *appsv1.DaemonSet {
 	// The DaemonSet references the local (same-namespace) managed copy
 	// created by the controller via reconcileRESPAuthSecret.
 	if spec.L2Backend != nil && spec.L2Backend.RESP != nil && spec.L2Backend.RESP.AuthSecretRef != nil {
-		secretName := RESPAuthSecretName(engine.Name)
+		secretName := RESPAuthSecretName(name)
 		optional := true
 		envVars = append(envVars,
 			corev1.EnvVar{
@@ -124,10 +164,9 @@ func BuildDaemonSet(engine *lmcachev1alpha1.LMCacheEngine) *appsv1.DaemonSet {
 	// (LMCACHE_RESP_USERNAME / LMCACHE_RESP_PASSWORD) injected above,
 	// so no shell wrapper is needed.
 	containerCommand := []string{
-		"/opt/venv/bin/lmcache",
-		"server",
+		lmcacheServerBinary,
+		serverSubcommand,
 	}
-	containerArgs := BuildContainerArgs(spec)
 
 	// Probes
 	tcpProbe := &corev1.TCPSocketAction{
@@ -161,7 +200,7 @@ func BuildDaemonSet(engine *lmcachev1alpha1.LMCacheEngine) *appsv1.DaemonSet {
 	httpPort := getHTTPPort(spec)
 	containerPorts := []corev1.ContainerPort{
 		{
-			Name:          "server",
+			Name:          serverPortName,
 			ContainerPort: serverPort,
 			Protocol:      corev1.ProtocolTCP,
 		},
@@ -189,9 +228,9 @@ func BuildDaemonSet(engine *lmcachev1alpha1.LMCacheEngine) *appsv1.DaemonSet {
 
 	ds := &appsv1.DaemonSet{
 		ObjectMeta: metav1.ObjectMeta{
-			Name:      engine.Name,
-			Namespace: engine.Namespace,
-			Labels:    StandardLabels(engine.Name),
+			Name:      name,
+			Namespace: namespace,
+			Labels:    StandardLabels(name),
 		},
 		Spec: appsv1.DaemonSetSpec{
 			Selector: &metav1.LabelSelector{
diff --git a/operator/internal/resources/resources_test.go b/operator/internal/resources/resources_test.go
index c9e1f2ae5c..554983116c 100644
--- a/operator/internal/resources/resources_test.go
+++ b/operator/internal/resources/resources_test.go
@@ -633,7 +633,7 @@ func TestBuildDaemonSet_PrometheusDisabled(t *testing.T) {
 	if len(c.Ports) != 2 {
 		t.Fatalf("expected 2 container ports, got %d", len(c.Ports))
 	}
-	if c.Ports[0].Name != "server" { //nolint:goconst // test assertion
+	if c.Ports[0].Name != serverPortName {
 		t.Fatalf("expected port name 'server', got %s", c.Ports[0].Name)
 	}
 }
@@ -723,7 +723,7 @@ func TestBuildDaemonSet_RESPNoAuth(t *testing.T) {
 	c := ds.Spec.Template.Spec.Containers[0]
 
 	// Should use direct command, not shell wrapper
-	if c.Command[0] != "/opt/venv/bin/lmcache" || c.Command[1] != "server" {
+	if c.Command[0] != lmcacheServerBinary || c.Command[1] != serverSubcommand {
 		t.Fatalf("expected lmcache server command, got %v", c.Command)
 	}
 
@@ -756,7 +756,7 @@ func TestBuildDaemonSet_RESPWithAuth(t *testing.T) {
 	c := ds.Spec.Template.Spec.Containers[0]
 
 	// Should use direct python command (no shell wrapper needed)
-	if c.Command[0] != "/opt/venv/bin/lmcache" || c.Command[1] != "server" {
+	if c.Command[0] != lmcacheServerBinary || c.Command[1] != serverSubcommand {
 		t.Fatalf("expected lmcache server command, got %v", c.Command)
 	}
 
@@ -823,7 +823,7 @@ func TestBuildLookupService_Default(t *testing.T) {
 	if svc.Spec.Ports[0].Port != 5555 {
 		t.Fatalf("expected server port 5555, got %d", svc.Spec.Ports[0].Port)
 	}
-	if svc.Spec.Ports[0].Name != "server" {
+	if svc.Spec.Ports[0].Name != serverPortName {
 		t.Fatalf("expected port name server, got %s", svc.Spec.Ports[0].Name)
 	}
 	if svc.Spec.Ports[1].Port != 8080 {
@@ -1103,15 +1103,15 @@ func TestBuildDaemonSet_GPUVendorNvidiaDefault(t *testing.T) {
 	ds := BuildDaemonSet(engine)
 	podSpec := ds.Spec.Template.Spec
 
-	if podSpec.RuntimeClassName == nil || *podSpec.RuntimeClassName != "nvidia" {
+	if podSpec.RuntimeClassName == nil || *podSpec.RuntimeClassName != nvidiaRuntimeClass {
 		t.Fatalf("expected RuntimeClassName=nvidia, got %v", podSpec.RuntimeClassName)
 	}
 
 	c := podSpec.Containers[0]
-	if !hasEnv(c.Env, "NVIDIA_VISIBLE_DEVICES", "all") {
+	if !hasEnvAll(c.Env, "NVIDIA_VISIBLE_DEVICES") {
 		t.Fatal("missing NVIDIA_VISIBLE_DEVICES=all on default (nvidia) vendor")
 	}
-	if !hasEnv(c.Env, "NVIDIA_DRIVER_CAPABILITIES", "all") {
+	if !hasEnvAll(c.Env, "NVIDIA_DRIVER_CAPABILITIES") {
 		t.Fatal("missing NVIDIA_DRIVER_CAPABILITIES=all on default (nvidia) vendor")
 	}
 }
@@ -1136,9 +1136,11 @@ func TestBuildDaemonSet_GPUVendorAMD(t *testing.T) {
 	}
 }
 
-func hasEnv(envs []corev1.EnvVar, name, value string) bool {
+// hasEnvAll reports whether envs contains an env var named name set to the
+// literal "all" (the value the GPU passthrough vars are always set to).
+func hasEnvAll(envs []corev1.EnvVar, name string) bool {
 	for _, e := range envs {
-		if e.Name == name && e.Value == value {
+		if e.Name == name && e.Value == "all" {
 			return true
 		}
 	}
diff --git a/operator/internal/resources/service.go b/operator/internal/resources/service.go
index addad80a0c..2fbc3c523c 100644
--- a/operator/internal/resources/service.go
+++ b/operator/internal/resources/service.go
@@ -29,22 +29,29 @@ import (
 // for node-local service discovery. vLLM pods connect to this service and kube-proxy
 // routes traffic only to the LMCache pod on the same node.
 func BuildLookupService(engine *lmcachev1alpha1.LMCacheEngine) *corev1.Service {
-	serverPort := derefInt32(getServerPort(&engine.Spec), 5555)
-	httpPort := getHTTPPort(&engine.Spec)
+	return buildLookupServiceCore(engine.Name, engine.Namespace, &engine.Spec)
+}
+
+// buildLookupServiceCore is the name/namespace/spec-keyed core shared by the
+// LMCacheEngine and CacheBlendEngine lookup-Service builders. The node-local
+// internalTrafficPolicy=Local routing guarantee is owned here so it cannot drift.
+func buildLookupServiceCore(name, namespace string, spec *lmcachev1alpha1.LMCacheEngineSpec) *corev1.Service {
+	serverPort := derefInt32(getServerPort(spec), 5555)
+	httpPort := getHTTPPort(spec)
 	localPolicy := corev1.ServiceInternalTrafficPolicyLocal
 
 	return &corev1.Service{
 		ObjectMeta: metav1.ObjectMeta{
-			Name:      LookupServiceName(engine.Name),
-			Namespace: engine.Namespace,
-			Labels:    StandardLabels(engine.Name),
+			Name:      LookupServiceName(name),
+			Namespace: namespace,
+			Labels:    StandardLabels(name),
 		},
 		Spec: corev1.ServiceSpec{
-			Selector:              SelectorLabels(engine.Name),
+			Selector:              SelectorLabels(name),
 			InternalTrafficPolicy: &localPolicy,
 			Ports: []corev1.ServicePort{
 				{
-					Name:     "server",
+					Name:     serverPortName,
 					Port:     serverPort,
 					Protocol: corev1.ProtocolTCP,
 				},
@@ -60,20 +67,26 @@ func BuildLookupService(engine *lmcachev1alpha1.LMCacheEngine) *corev1.Service {
 
 // BuildMetricsService creates a headless Service for Prometheus scraping.
 func BuildMetricsService(engine *lmcachev1alpha1.LMCacheEngine) *corev1.Service {
+	return buildMetricsServiceCore(engine.Name, engine.Namespace, &engine.Spec)
+}
+
+// buildMetricsServiceCore is the name/namespace/spec-keyed core shared by the
+// LMCacheEngine and CacheBlendEngine metrics-Service builders.
+func buildMetricsServiceCore(name, namespace string, spec *lmcachev1alpha1.LMCacheEngineSpec) *corev1.Service {
 	promPort := int32(9090)
-	if engine.Spec.Prometheus != nil {
-		promPort = derefInt32(engine.Spec.Prometheus.Port, 9090)
+	if spec.Prometheus != nil {
+		promPort = derefInt32(spec.Prometheus.Port, 9090)
 	}
 
 	return &corev1.Service{
 		ObjectMeta: metav1.ObjectMeta{
-			Name:      fmt.Sprintf("%s-metrics", engine.Name),
-			Namespace: engine.Namespace,
-			Labels:    StandardLabels(engine.Name),
+			Name:      fmt.Sprintf("%s-metrics", name),
+			Namespace: namespace,
+			Labels:    StandardLabels(name),
 		},
 		Spec: corev1.ServiceSpec{
 			ClusterIP: corev1.ClusterIPNone,
-			Selector:  SelectorLabels(engine.Name),
+			Selector:  SelectorLabels(name),
 			Ports: []corev1.ServicePort{
 				{
 					Name:     "metrics",
diff --git a/operator/internal/resources/servicemonitor.go b/operator/internal/resources/servicemonitor.go
index b81a64bc52..898af2ef3b 100644
--- a/operator/internal/resources/servicemonitor.go
+++ b/operator/internal/resources/servicemonitor.go
@@ -33,22 +33,29 @@ func ServiceMonitorEnabled(spec *lmcachev1alpha1.LMCacheEngineSpec) bool {
 
 // BuildServiceMonitor creates a ServiceMonitor CR for Prometheus Operator integration.
 func BuildServiceMonitor(engine *lmcachev1alpha1.LMCacheEngine) *monitoringv1.ServiceMonitor {
-	spec := &engine.Spec
+	return buildServiceMonitorCore(engine.Name, engine.Namespace, &engine.Spec)
+}
+
+// buildServiceMonitorCore is the name/namespace/spec-keyed core shared by the
+// LMCacheEngine and CacheBlendEngine ServiceMonitor builders. Callers must
+// ensure ServiceMonitorEnabled(spec) is true (spec.Prometheus.ServiceMonitor is
+// dereferenced here).
+func buildServiceMonitorCore(name, namespace string, spec *lmcachev1alpha1.LMCacheEngineSpec) *monitoringv1.ServiceMonitor {
 	smSpec := spec.Prometheus.ServiceMonitor
 
 	interval := monitoringv1.Duration(derefString(smSpec.Interval, "30s"))
 
-	labels := MergeLabels(StandardLabels(engine.Name), smSpec.Labels)
+	labels := MergeLabels(StandardLabels(name), smSpec.Labels)
 
 	return &monitoringv1.ServiceMonitor{
 		ObjectMeta: metav1.ObjectMeta{
-			Name:      engine.Name,
-			Namespace: engine.Namespace,
+			Name:      name,
+			Namespace: namespace,
 			Labels:    labels,
 		},
 		Spec: monitoringv1.ServiceMonitorSpec{
 			Selector: metav1.LabelSelector{
-				MatchLabels: SelectorLabels(engine.Name),
+				MatchLabels: SelectorLabels(name),
 			},
 			Endpoints: []monitoringv1.Endpoint{
 				{
diff --git a/operator/internal/webhook/cacheblend_inject_builders.go b/operator/internal/webhook/cacheblend_inject_builders.go
new file mode 100644
index 0000000000..e69a00b1c3
--- /dev/null
+++ b/operator/internal/webhook/cacheblend_inject_builders.go
@@ -0,0 +1,296 @@
+/*
+Copyright 2026.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package webhook implements the CacheBlend mutating admission webhook that
+// injects the lmcache-cacheblend vLLM plugin into opted-in vLLM pods (see
+// design §7). This file holds the pure, side-effect-free mutation builders; the
+// admission handler that orchestrates them lives in pod_injector.go.
+package webhook
+
+import (
+	corev1 "k8s.io/api/core/v1"
+
+	lmcachev1alpha1 "github.com/LMCache/LMCache/api/v1alpha1"
+)
+
+const (
+	// cbPluginVolumeName is the name of the shared emptyDir volume that the init
+	// container stages the lmcache-cacheblend plugin tree into and the vLLM
+	// container reads it back from (design §7 M1/M2/M3).
+	cbPluginVolumeName = "cb-plugin"
+
+	// cbPluginMountPath is the in-container path the cb-plugin volume mounts at,
+	// in both the init container (read-write, the cp target) and the vLLM
+	// container (read-only). It must stay in lockstep with cbPythonPath
+	// (design §9.5).
+	cbPluginMountPath = "/cb-plugin"
+
+	// cbSharedDirEnvName is the env var the payload init container reads to learn
+	// where to copy the plugin tree (cacheblend-plugin docker/Dockerfile:22-29).
+	cbSharedDirEnvName = "SHARED_DIR"
+
+	// cbPythonPath is the value prepended to the vLLM container's PYTHONPATH so
+	// vLLM (and every spawned engine-core/worker/front-end subprocess) discovers
+	// the staged plugin (design §7 M4). It must equal cbPluginMountPath.
+	cbPythonPath = "/cb-plugin"
+
+	// pythonPathEnvName is the standard Python module search-path env var.
+	pythonPathEnvName = "PYTHONPATH"
+
+	// cbInitContainerName is the name of the injected payload init container.
+	cbInitContainerName = "cb-plugin-stage"
+)
+
+// CacheBlend-required vLLM flag names and fixed values (design §7 M5). The
+// CacheBlend matcher and connector hard-require these; several fail loudly,
+// --no-async-scheduling fails silently (MoE garble).
+const (
+	cbFlagAttentionBackend = "--attention-backend"
+	cbValAttentionBackend  = "CUSTOM"
+
+	cbFlagKVTransferConfig = "--kv-transfer-config"
+
+	cbFlagNoChunkedPrefill = "--no-enable-chunked-prefill"
+
+	cbFlagBlockSize = "--block-size"
+	cbValBlockSize  = "64"
+
+	cbFlagPipelineParallelSize = "--pipeline-parallel-size"
+	cbValPipelineParallelSize  = "1"
+
+	cbFlagNoAsyncScheduling = "--no-async-scheduling"
+
+	// cudagraph mode flags. Eager (default) forces --enforce-eager; full
+	// decode-only enables decode graphs while never using full graphs in prefill
+	// (design §4, §7 M5).
+	cbFlagEnforceEager      = "--enforce-eager"
+	cbFlagCompilationConfig = "--compilation-config"
+)
+
+// BuildCBPluginVolume returns the shared emptyDir volume the init container
+// stages the plugin into and the vLLM container reads it back from (M1).
+func BuildCBPluginVolume() corev1.Volume {
+	return corev1.Volume{
+		Name: cbPluginVolumeName,
+		VolumeSource: corev1.VolumeSource{
+			EmptyDir: &corev1.EmptyDirVolumeSource{},
+		},
+	}
+}
+
+// BuildCBInitContainer returns the payload init container (M2). It mounts the
+// cb-plugin volume read-write at /cb-plugin, sets SHARED_DIR=/cb-plugin, and runs
+// the payload image's own ENTRYPOINT (busybox `cp -a`) with no command override.
+//
+// Parameters:
+//   - payloadImage: the (possibly private) image that ships the unpacked
+//     lmcache_cacheblend plugin tree under /payload.
+//   - pullPolicy: the image pull policy for that image.
+func BuildCBInitContainer(payloadImage string, pullPolicy corev1.PullPolicy) corev1.Container {
+	return corev1.Container{
+		Name:            cbInitContainerName,
+		Image:           payloadImage,
+		ImagePullPolicy: pullPolicy,
+		Env: []corev1.EnvVar{
+			{Name: cbSharedDirEnvName, Value: cbPluginMountPath},
+		},
+		VolumeMounts: []corev1.VolumeMount{
+			{
+				Name:      cbPluginVolumeName,
+				MountPath: cbPluginMountPath,
+				ReadOnly:  false,
+			},
+		},
+	}
+}
+
+// BuildCBVolumeMount returns the read-only mount of the cb-plugin volume added to
+// the target vLLM container (M3).
+func BuildCBVolumeMount() corev1.VolumeMount {
+	return corev1.VolumeMount{
+		Name:      cbPluginVolumeName,
+		MountPath: cbPluginMountPath,
+		ReadOnly:  true,
+	}
+}
+
+// BuildCBPodEnv returns the env list for the target vLLM container with
+// PYTHONPATH set to (or prepended with) /cb-plugin (M4). It is set on the
+// container, never the pod, so every spawned worker inherits it; it never sets
+// VLLM_PLUGINS (design §9.8). An existing PYTHONPATH is prepended, not replaced,
+// so /cb-plugin:<existing> keeps the plugin discoverable without dropping the
+// user's path entries.
+//
+// Parameters:
+//   - existing: the target container's current env list (may be nil).
+//
+// Returns a new env list; the input is not mutated.
+func BuildCBPodEnv(existing []corev1.EnvVar) []corev1.EnvVar {
+	out := make([]corev1.EnvVar, 0, len(existing)+1)
+	found := false
+	for _, e := range existing {
+		if e.Name == pythonPathEnvName {
+			found = true
+			prepended := e
+			if prepended.ValueFrom != nil {
+				// A valueFrom PYTHONPATH cannot be string-prepended safely; in
+				// that rare case overwrite with the plugin path so the plugin is
+				// at least discoverable (the alternative is no plugin at all).
+				prepended.ValueFrom = nil
+				prepended.Value = cbPythonPath
+			} else if prepended.Value == "" {
+				prepended.Value = cbPythonPath
+			} else {
+				prepended.Value = cbPythonPath + ":" + prepended.Value
+			}
+			out = append(out, prepended)
+			continue
+		}
+		out = append(out, e)
+	}
+	if !found {
+		out = append(out, corev1.EnvVar{Name: pythonPathEnvName, Value: cbPythonPath})
+	}
+	return out
+}
+
+// cudagraphArgs returns the cudagraph-mode flag set for the given mode. "eager"
+// (and any unrecognized value) maps to --enforce-eager; "full_decode_only"
+// enables decode-only CUDA graphs without ever using full graphs in prefill;
+// "piecewise" enables piecewise capture (no --enforce-eager).
+func cudagraphArgs(cudagraph string) []string {
+	switch cudagraph {
+	case lmcachev1alpha1.CudagraphPiecewise:
+		// Piecewise graph capture: do not force eager. vLLM's default
+		// compilation already does piecewise capture, so no extra flag is
+		// emitted; we simply omit --enforce-eager.
+		return nil
+	case lmcachev1alpha1.CudagraphFullDecodeOnly:
+		// Decode-only full graphs: enable cudagraph for decode but never full
+		// graphs in prefill (CacheBlend re-RoPE happens in prefill).
+		return []string{cbFlagCompilationConfig, `{"cudagraph_mode":"FULL_DECODE_ONLY"}`}
+	default:
+		// eager (default): force --enforce-eager.
+		return []string{cbFlagEnforceEager}
+	}
+}
+
+// BuildCBArgs returns the target vLLM container's args with the CacheBlend
+// required flag set applied (M5). Each flag is appended-or-replaced (design §9.1):
+// a user-supplied --flag value (in either "--flag v" or "--flag=v" form) is
+// overwritten in place rather than duplicated, and a flag the user never set is
+// appended.
+//
+// The --kv-transfer-config flag is included here only when the caller passes a
+// non-empty kvTransferConfigJSON; the handler skips it (and stamps a reason)
+// when the user already supplied their own --kv-transfer-config (design §9.2),
+// in which case it passes "" so this builder leaves the user's value untouched.
+//
+// Parameters:
+//   - existingArgs: the target container's current args (may be nil).
+//   - kvTransferConfigJSON: the CBKVConnector JSON from the engine's connection
+//     ConfigMap, or "" to skip injecting/replacing --kv-transfer-config.
+//   - cudagraph: the cudagraph mode (eager|piecewise|full_decode_only).
+//
+// Returns a new args slice; the input is not mutated.
+func BuildCBArgs(existingArgs []string, kvTransferConfigJSON, cudagraph string) []string {
+	args := make([]string, len(existingArgs))
+	copy(args, existingArgs)
+
+	args = applyArg(args, cbFlagAttentionBackend, cbValAttentionBackend)
+	if kvTransferConfigJSON != "" {
+		args = applyArg(args, cbFlagKVTransferConfig, kvTransferConfigJSON)
+	}
+	args = applyBareFlag(args, cbFlagNoChunkedPrefill)
+	args = applyArg(args, cbFlagBlockSize, cbValBlockSize)
+	args = applyArg(args, cbFlagPipelineParallelSize, cbValPipelineParallelSize)
+	args = applyBareFlag(args, cbFlagNoAsyncScheduling)
+
+	// cudagraphArgs returns either a single bare flag (--enforce-eager), a
+	// [flag, value] pair (full_decode_only), or nil (piecewise). Apply with
+	// append-or-replace semantics so a user's pre-existing value is overwritten.
+	switch cg := cudagraphArgs(cudagraph); len(cg) {
+	case 1:
+		args = applyBareFlag(args, cg[0])
+	case 2:
+		args = applyArg(args, cg[0], cg[1])
+	}
+
+	return args
+}
+
+// applyArg appends-or-replaces a "--flag value" pair in args (design §9.1). It
+// recognizes both the two-token form ["--flag", "value"] and the single-token
+// form ["--flag=value"]; a match is overwritten in place (preserving the form),
+// otherwise the two-token pair is appended. Returns a new slice.
+func applyArg(args []string, flag, value string) []string {
+	eqPrefix := flag + "="
+	for i := range len(args) {
+		if args[i] == flag {
+			// Two-token form: overwrite the following value token if present,
+			// else append the value after the flag.
+			if i+1 < len(args) {
+				args[i+1] = value
+				return args
+			}
+			return append(args, value)
+		}
+		if len(args[i]) >= len(eqPrefix) && args[i][:len(eqPrefix)] == eqPrefix {
+			// Single-token --flag=value form: overwrite in place.
+			args[i] = eqPrefix + value
+			return args
+		}
+	}
+	return append(args, flag, value)
+}
+
+// applyBareFlag appends a valueless flag (e.g. --no-enable-chunked-prefill) if it
+// is not already present in either bare or --flag=... form. Returns a new slice.
+func applyBareFlag(args []string, flag string) []string {
+	eqPrefix := flag + "="
+	for _, a := range args {
+		if a == flag {
+			return args
+		}
+		if len(a) >= len(eqPrefix) && a[:len(eqPrefix)] == eqPrefix {
+			return args
+		}
+	}
+	return append(args, flag)
+}
+
+// MergeImagePullSecrets returns existing with injected appended, deduplicated by
+// secret name (M7). A secret already present in existing is not duplicated, and
+// the order of existing is preserved with new secrets appended in injected order.
+// Returns a new slice; the inputs are not mutated.
+func MergeImagePullSecrets(
+	existing, injected []corev1.LocalObjectReference,
+) []corev1.LocalObjectReference {
+	seen := make(map[string]struct{}, len(existing))
+	out := make([]corev1.LocalObjectReference, 0, len(existing)+len(injected))
+	for _, ref := range existing {
+		out = append(out, ref)
+		seen[ref.Name] = struct{}{}
+	}
+	for _, ref := range injected {
+		if _, ok := seen[ref.Name]; ok {
+			continue
+		}
+		seen[ref.Name] = struct{}{}
+		out = append(out, ref)
+	}
+	return out
+}
diff --git a/operator/internal/webhook/pod_injector.go b/operator/internal/webhook/pod_injector.go
new file mode 100644
index 0000000000..6d5b66343c
--- /dev/null
+++ b/operator/internal/webhook/pod_injector.go
@@ -0,0 +1,415 @@
+/*
+Copyright 2026.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package webhook
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"strings"
+
+	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/types"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
+
+	lmcachev1alpha1 "github.com/LMCache/LMCache/api/v1alpha1"
+	"github.com/LMCache/LMCache/internal/resources"
+)
+
+// Annotation keys the webhook reads and stamps (design §8).
+const (
+	// AnnotationEngine binds a pod to a CacheBlendEngine in the same namespace.
+	// Its presence is the opt-in signal; its value is the engine name.
+	AnnotationEngine = "lmcache.ai/cacheblend-engine"
+
+	// AnnotationContainer optionally names the target vLLM container; empty or
+	// absent selects the first container.
+	AnnotationContainer = "lmcache.ai/cacheblend-container"
+
+	// AnnotationImagePullSecrets optionally overrides the engine's
+	// injection.imagePullSecrets with a comma-separated list of Secret names
+	// appended to the pod's spec.imagePullSecrets for the private payload image.
+	AnnotationImagePullSecrets = "lmcache.ai/cacheblend-image-pull-secrets"
+
+	// AnnotationInjected is the idempotency guard stamped after a successful
+	// injection; a re-admitted pod carrying it is allowed unchanged.
+	AnnotationInjected = "lmcache.ai/cacheblend-injected"
+
+	// AnnotationSkipReason records why injection was skipped (fail-open).
+	AnnotationSkipReason = "lmcache.ai/cacheblend-skip-reason"
+)
+
+// valueTrue is the boolean-true string stamped on AnnotationInjected and used as
+// the opt-in value of the lmcache.ai/cacheblend-inject label.
+const valueTrue = "true"
+
+// Skip-reason values stamped on AnnotationSkipReason (design §8).
+const (
+	// SkipReasonEngineNotFound is stamped when the named engine's connection
+	// ConfigMap (or the engine CR) does not exist (fail-open).
+	SkipReasonEngineNotFound = "engine-not-found"
+
+	// SkipReasonCommandOverride is stamped when the target container overrides
+	// command, so appended args may never reach `vllm serve`.
+	SkipReasonCommandOverride = "command-override"
+
+	// SkipReasonKVTransferConfigPresent is stamped when the user already supplies
+	// --kv-transfer-config; the webhook does not clobber their structured JSON.
+	SkipReasonKVTransferConfigPresent = "kv-transfer-config-present"
+
+	// SkipReasonPayloadImageUnset is stamped when the engine's
+	// injection.payloadImage resolves to an empty reference (no repository). The
+	// webhook skips rather than inject an init container with an empty image,
+	// which the API server would reject. CRD validation normally prevents this.
+	SkipReasonPayloadImageUnset = "payload-image-unset"
+
+	// SkipReasonTargetContainerNotFound is stamped when the requested target
+	// container (injection.targetContainer or the cacheblend-container
+	// annotation) names a container that does not exist on the pod, so there is
+	// nothing to inject into.
+	SkipReasonTargetContainerNotFound = "target-container-not-found"
+)
+
+// kvTransferConfigDataKey is the key within the <engine>-connection ConfigMap's
+// Data map that holds the CBKVConnector kv-transfer-config JSON. It must match
+// the key written by resources.buildConnectionConfigMapCore.
+const kvTransferConfigDataKey = "kv-transfer-config.json"
+
+// +kubebuilder:webhook:path=/mutate--v1-pod,mutating=true,failurePolicy=ignore,sideEffects=None,groups="",resources=pods,verbs=create,versions=v1,name=mpod.lmcache.ai,admissionReviewVersions=v1,reinvocationPolicy=Never
+
+// PodInjector is the mutating admission handler that injects the
+// lmcache-cacheblend vLLM plugin into opted-in pods (design §7). It is gated by
+// the CacheBlendEngine CR: it mutates a pod only when the pod's
+// lmcache.ai/cacheblend-engine annotation names an engine whose connection
+// ConfigMap exists. It fails open (failurePolicy: Ignore) and is idempotent.
+type PodInjector struct {
+	// Client reads the named CacheBlendEngine and its connection ConfigMap. It
+	// uses the shared manager ServiceAccount, whose RBAC already grants
+	// cacheblendengines get and configmaps get (design §7 RBAC note).
+	Client client.Client
+
+	// Decoder decodes the admission request's raw pod object.
+	Decoder admission.Decoder
+}
+
+// Handle implements admission.Handler. It applies mutations M0–M7 to an opted-in
+// pod whose named CacheBlendEngine connection ConfigMap exists, then returns a
+// JSON patch via admission.PatchResponseFromRaw. It short-circuits to an
+// unchanged Allowed response for non-opted-in or already-injected pods, and
+// stamps a skip-reason annotation (still Allowed, fail-open) when it declines to
+// mutate (engine missing, command override, or user-supplied
+// --kv-transfer-config).
+func (p *PodInjector) Handle(ctx context.Context, req admission.Request) admission.Response {
+	log := ctrl.LoggerFrom(ctx)
+
+	pod := &corev1.Pod{}
+	if err := p.Decoder.Decode(req, pod); err != nil {
+		return admission.Errored(http.StatusBadRequest, err)
+	}
+
+	// (1) Idempotency short-circuit: a pod already carrying the injected guard is
+	// allowed unchanged on re-admission.
+	if pod.Annotations[AnnotationInjected] == valueTrue {
+		return admission.Allowed("already injected")
+	}
+
+	// (2) Opt-in gate: no engine annotation means this pod did not opt in.
+	engineName := strings.TrimSpace(pod.Annotations[AnnotationEngine])
+	if engineName == "" {
+		return admission.Allowed("not opted in")
+	}
+
+	// The webhook config carries no defaulting for the pod object, so apply the
+	// engine annotation value as the lookup namespace using the admission
+	// request's namespace (the pod's namespace; pod.Namespace may be empty on
+	// CREATE before the API server stamps it).
+	namespace := req.Namespace
+	if namespace == "" {
+		namespace = pod.Namespace
+	}
+
+	// (3a) Resolve the engine CR for its injection defaults.
+	engine := &lmcachev1alpha1.CacheBlendEngine{}
+	if err := p.Client.Get(ctx, types.NamespacedName{Name: engineName, Namespace: namespace}, engine); err != nil {
+		if apierrors.IsNotFound(err) {
+			log.Info("Skipped CacheBlend injection: engine not found",
+				"engine", engineName, "namespace", namespace)
+			return p.skip(req, pod, SkipReasonEngineNotFound)
+		}
+		return admission.Errored(http.StatusInternalServerError, err)
+	}
+	engine.SetDefaults()
+
+	// (3b) Read the engine's connection ConfigMap (existence gate, no readiness
+	// check — design §7/§9.9). Absent means the engine is not provisioned yet.
+	connCM := &corev1.ConfigMap{}
+	connName := resources.ConnectionConfigMapName(engineName)
+	if err := p.Client.Get(ctx, types.NamespacedName{Name: connName, Namespace: namespace}, connCM); err != nil {
+		if apierrors.IsNotFound(err) {
+			log.Info("Skipped CacheBlend injection: connection ConfigMap not found",
+				"configMap", connName, "namespace", namespace)
+			return p.skip(req, pod, SkipReasonEngineNotFound)
+		}
+		return admission.Errored(http.StatusInternalServerError, err)
+	}
+	kvTransferConfigJSON := connCM.Data[kvTransferConfigDataKey]
+
+	// (6) Resolve the target container (annotation or first). A requested
+	// container name that does not exist (or a pod with no containers) is a
+	// misconfiguration: skip + stamp rather than silently allowing it through.
+	containerIdx, ok := resolveTargetContainer(pod, engine.Spec.Injection.TargetContainer,
+		pod.Annotations[AnnotationContainer])
+	if !ok {
+		log.Info("Skipped CacheBlend injection: target container not found",
+			"engine", engineName,
+			"annotationContainer", pod.Annotations[AnnotationContainer],
+			"specTargetContainer", deref(engine.Spec.Injection.TargetContainer))
+		return p.skip(req, pod, SkipReasonTargetContainerNotFound)
+	}
+	target := &pod.Spec.Containers[containerIdx]
+
+	// (4) command-override gate: a wrapper command means appended args may never
+	// reach `vllm serve`, so skip + stamp (design §8).
+	if len(target.Command) > 0 {
+		log.Info("Skipped CacheBlend injection: target container overrides command",
+			"engine", engineName, "container", target.Name)
+		return p.skip(req, pod, SkipReasonCommandOverride)
+	}
+
+	// (5) user --kv-transfer-config gate: skip that flag (do not clobber the
+	// user's structured JSON) but still apply the rest of the mutation, stamping
+	// the skip reason for diagnostics (design §9.2).
+	userHasKVTransferConfig := argsHasFlag(target.Args, cbFlagKVTransferConfig)
+
+	original := req.Object.Raw
+
+	// --- Apply mutations M0–M7 ---
+
+	// M0: pod hostIPC for CUDA IPC with the node-local engine.
+	pod.Spec.HostIPC = true
+
+	// M1: shared emptyDir volume.
+	pod.Spec.Volumes = appendVolumeIfAbsent(pod.Spec.Volumes, BuildCBPluginVolume())
+
+	// M2: payload init container (payloadImage is an ImageSpec: repo/tag/policy).
+	// Fail open if the image resolves to empty (no repository) rather than inject
+	// an init container with an empty image, which the API server would reject.
+	payloadRef, payloadPullPolicy := resolvePayloadImage(engine.Spec.Injection.PayloadImage)
+	if payloadRef == "" {
+		log.Info("Skipped CacheBlend injection: payload image repository is unset",
+			"engine", engineName)
+		return p.skip(req, pod, SkipReasonPayloadImageUnset)
+	}
+	pod.Spec.InitContainers = appendInitContainerIfAbsent(pod.Spec.InitContainers,
+		BuildCBInitContainer(payloadRef, payloadPullPolicy))
+
+	// M3: read-only mount on the target container.
+	target.VolumeMounts = appendVolumeMountIfAbsent(target.VolumeMounts, BuildCBVolumeMount())
+
+	// M4: PYTHONPATH on the target container.
+	target.Env = BuildCBPodEnv(target.Env)
+
+	// M5: required vLLM args. Pass "" for the kv-transfer-config JSON when the
+	// user already supplies one so BuildCBArgs leaves their value untouched.
+	kvForArgs := kvTransferConfigJSON
+	if userHasKVTransferConfig {
+		kvForArgs = ""
+	}
+	cudagraph := deref(engine.Spec.Injection.Cudagraph)
+	target.Args = BuildCBArgs(target.Args, kvForArgs, cudagraph)
+
+	// M7: append injection pull secrets (annotation override wins) to the pod's
+	// imagePullSecrets, deduped (private payload image).
+	injectedSecrets := resolveInjectedPullSecrets(engine.Spec.Injection.ImagePullSecrets,
+		pod.Annotations[AnnotationImagePullSecrets])
+	pod.Spec.ImagePullSecrets = MergeImagePullSecrets(pod.Spec.ImagePullSecrets, injectedSecrets)
+
+	// M6 + skip-reason stamping: idempotency guard, plus the diagnostic reason if
+	// we skipped the --kv-transfer-config flag.
+	if pod.Annotations == nil {
+		pod.Annotations = map[string]string{}
+	}
+	pod.Annotations[AnnotationInjected] = valueTrue
+	if userHasKVTransferConfig {
+		pod.Annotations[AnnotationSkipReason] = SkipReasonKVTransferConfigPresent
+	}
+
+	marshaled, err := json.Marshal(pod)
+	if err != nil {
+		return admission.Errored(http.StatusInternalServerError, err)
+	}
+	log.Info("Injected CacheBlend plugin", "engine", engineName, "container", target.Name)
+	return admission.PatchResponseFromRaw(original, marshaled)
+}
+
+// skip stamps the given skip reason on the pod (without injecting), marshals it,
+// and returns an Allowed patch response. The pod is still admitted (fail-open).
+func (p *PodInjector) skip(req admission.Request, pod *corev1.Pod, reason string) admission.Response {
+	if pod.Annotations == nil {
+		pod.Annotations = map[string]string{}
+	}
+	pod.Annotations[AnnotationSkipReason] = reason
+	marshaled, err := json.Marshal(pod)
+	if err != nil {
+		return admission.Errored(http.StatusInternalServerError, err)
+	}
+	return admission.PatchResponseFromRaw(req.Object.Raw, marshaled)
+}
+
+// resolveTargetContainer returns the index of the container to inject into and
+// whether one was found. The per-pod annotation override (annotationName) takes
+// precedence over the engine's injection.targetContainer default; an empty
+// selection falls back to the first container. A non-empty name that matches no
+// container yields ok=false.
+//
+// Parameters:
+//   - pod: the decoded pod.
+//   - specDefault: the engine's injection.targetContainer (nil/"" = first).
+//   - annotationName: the per-pod cacheblend-container annotation value.
+func resolveTargetContainer(
+	pod *corev1.Pod,
+	specDefault *string,
+	annotationName string,
+) (int, bool) {
+	if len(pod.Spec.Containers) == 0 {
+		return 0, false
+	}
+
+	name := strings.TrimSpace(annotationName)
+	if name == "" && specDefault != nil {
+		name = strings.TrimSpace(*specDefault)
+	}
+	if name == "" {
+		return 0, true
+	}
+	for i := range pod.Spec.Containers {
+		if pod.Spec.Containers[i].Name == name {
+			return i, true
+		}
+	}
+	return 0, false
+}
+
+// resolveInjectedPullSecrets returns the pull-secret references to inject: the
+// per-pod annotation override (a comma-separated list of Secret names) when
+// present, otherwise the engine's injection.imagePullSecrets.
+//
+// Parameters:
+//   - specSecrets: the engine's injection.imagePullSecrets.
+//   - annotationCSV: the cacheblend-image-pull-secrets annotation value.
+func resolveInjectedPullSecrets(
+	specSecrets []corev1.LocalObjectReference,
+	annotationCSV string,
+) []corev1.LocalObjectReference {
+	csv := strings.TrimSpace(annotationCSV)
+	if csv == "" {
+		return specSecrets
+	}
+	out := make([]corev1.LocalObjectReference, 0)
+	for part := range strings.SplitSeq(csv, ",") {
+		name := strings.TrimSpace(part)
+		if name == "" {
+			continue
+		}
+		out = append(out, corev1.LocalObjectReference{Name: name})
+	}
+	return out
+}
+
+// argsHasFlag reports whether args already carries the given flag in either the
+// two-token "--flag value" form or the single-token "--flag=value" form.
+func argsHasFlag(args []string, flag string) bool {
+	eqPrefix := flag + "="
+	for _, a := range args {
+		if a == flag {
+			return true
+		}
+		if strings.HasPrefix(a, eqPrefix) {
+			return true
+		}
+	}
+	return false
+}
+
+// appendVolumeIfAbsent appends v to volumes unless a volume of the same name is
+// already present (idempotency within a single Handle call). Returns the slice.
+func appendVolumeIfAbsent(volumes []corev1.Volume, v corev1.Volume) []corev1.Volume {
+	for i := range volumes {
+		if volumes[i].Name == v.Name {
+			return volumes
+		}
+	}
+	return append(volumes, v)
+}
+
+// appendInitContainerIfAbsent appends c to initContainers unless one of the same
+// name is already present. Returns the slice.
+func appendInitContainerIfAbsent(
+	initContainers []corev1.Container,
+	c corev1.Container,
+) []corev1.Container {
+	for i := range initContainers {
+		if initContainers[i].Name == c.Name {
+			return initContainers
+		}
+	}
+	return append(initContainers, c)
+}
+
+// appendVolumeMountIfAbsent appends m to mounts unless one of the same name is
+// already present. Returns the slice.
+func appendVolumeMountIfAbsent(
+	mounts []corev1.VolumeMount,
+	m corev1.VolumeMount,
+) []corev1.VolumeMount {
+	for i := range mounts {
+		if mounts[i].Name == m.Name {
+			return mounts
+		}
+	}
+	return append(mounts, m)
+}
+
+// deref returns the value pointed to by s, or "" if s is nil.
+func deref(s *string) string {
+	if s == nil {
+		return ""
+	}
+	return *s
+}
+
+// resolvePayloadImage builds the "<repository>:<tag>" reference and pull policy
+// for the payload init container from the engine's injection.payloadImage. Tag
+// and pull policy fall back to "latest" / IfNotPresent when unset; repository is
+// taken as-is (it has no sensible cluster-wide default — see InjectionSpec docs).
+func resolvePayloadImage(img *lmcachev1alpha1.ImageSpec) (string, corev1.PullPolicy) {
+	if img == nil || deref(img.Repository) == "" {
+		return "", corev1.PullIfNotPresent
+	}
+	tag := deref(img.Tag)
+	if tag == "" {
+		tag = "latest"
+	}
+	policy := corev1.PullPolicy(deref(img.PullPolicy))
+	if policy == "" {
+		policy = corev1.PullIfNotPresent
+	}
+	return deref(img.Repository) + ":" + tag, policy
+}
diff --git a/operator/internal/webhook/pod_injector_envtest_test.go b/operator/internal/webhook/pod_injector_envtest_test.go
new file mode 100644
index 0000000000..1fdd8d7cf9
--- /dev/null
+++ b/operator/internal/webhook/pod_injector_envtest_test.go
@@ -0,0 +1,146 @@
+/*
+Copyright 2026.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package webhook
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+
+	"github.com/LMCache/LMCache/internal/resources"
+)
+
+// These specs exercise the FULL admission pipeline: a pod CREATE goes to the
+// envtest API server, which calls the registered PodInjector webhook over TLS,
+// and we read the persisted (mutated) pod back. This validates the wiring the
+// fake-client unit tests cannot: the generated webhook manifest path, the
+// decoder, and PatchResponseFromRaw round-tripping through the API server.
+var _ = Describe("PodInjector webhook (envtest)", Ordered, func() {
+	BeforeAll(func() {
+		By("creating the target namespace")
+		ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: testNamespace}}
+		Expect(client.IgnoreAlreadyExists(k8sClient.Create(envtestCtx, ns))).To(Succeed())
+
+		By("creating the CacheBlendEngine and its connection ConfigMap")
+		engine := newTestEngine(nil)
+		Expect(k8sClient.Create(envtestCtx, engine)).To(Succeed())
+		Expect(k8sClient.Create(envtestCtx, resources.BuildCBConnectionConfigMap(engine))).To(Succeed())
+	})
+
+	It("injects the plugin, flags, hostIPC, and private-image pull secret into an annotated pod", func() {
+		pod := vllmPod(func(p *corev1.Pod) {
+			p.Name = "vllm-injected"
+			if p.Labels == nil {
+				p.Labels = map[string]string{}
+			}
+			p.Labels["lmcache.ai/cacheblend-inject"] = valueTrue
+		})
+		Expect(k8sClient.Create(envtestCtx, pod)).To(Succeed())
+
+		got := &corev1.Pod{}
+		Expect(k8sClient.Get(envtestCtx,
+			types.NamespacedName{Name: "vllm-injected", Namespace: testNamespace}, got)).To(Succeed())
+
+		By("the idempotency annotation is stamped")
+		Expect(got.Annotations).To(HaveKeyWithValue(AnnotationInjected, valueTrue))
+
+		By("M0: hostIPC is set on the pod")
+		Expect(got.Spec.HostIPC).To(BeTrue())
+
+		By("M1/M2: the cb-plugin emptyDir volume and payload init container are present")
+		Expect(hasVolume(got, "cb-plugin")).To(BeTrue())
+		Expect(got.Spec.InitContainers).NotTo(BeEmpty())
+		init := got.Spec.InitContainers[0]
+		Expect(init.Image).To(Equal("registry.example.com/lmcache/cacheblend-payload:pinned"))
+
+		By("M3/M4: the vLLM container has the readOnly mount and PYTHONPATH")
+		c := findContainer(got, "vllm")
+		Expect(c).NotTo(BeNil())
+		Expect(hasReadOnlyMount(c, "cb-plugin")).To(BeTrue())
+		Expect(envValue(c, "PYTHONPATH")).To(ContainSubstring("/cb-plugin"))
+
+		By("M5: every required vLLM flag is present, with the node-local CBKVConnector config")
+		// Form-agnostic: the handler may emit two-token (--flag value) or
+		// =-token (--flag=value) forms; argsHasFlagValue handles both.
+		Expect(argsHasFlagValue(c.Args, "--attention-backend", "CUSTOM")).To(BeTrue())
+		Expect(argsHasFlagValue(c.Args, "--block-size", "64")).To(BeTrue())
+		Expect(argsHasFlagValue(c.Args, "--pipeline-parallel-size", "1")).To(BeTrue())
+		Expect(c.Args).To(ContainElement("--no-enable-chunked-prefill"))
+		Expect(c.Args).To(ContainElement("--no-async-scheduling"))
+		kv := argsFlagValue(c.Args, "--kv-transfer-config")
+		Expect(kv).To(ContainSubstring("CBKVConnector"))
+		Expect(kv).To(ContainSubstring("tcp://" + testEngineName + "." + testNamespace + ".svc"))
+
+		By("M7: the private payload pull secret is appended to the pod")
+		Expect(pullSecretNames(got)).To(ContainElement("cb-payload-pull"))
+	})
+
+	It("leaves a pod without the engine annotation untouched", func() {
+		pod := vllmPod(func(p *corev1.Pod) {
+			p.Name = "vllm-no-annotation"
+			p.Annotations = nil
+		})
+		Expect(k8sClient.Create(envtestCtx, pod)).To(Succeed())
+
+		got := &corev1.Pod{}
+		Expect(k8sClient.Get(envtestCtx,
+			types.NamespacedName{Name: "vllm-no-annotation", Namespace: testNamespace}, got)).To(Succeed())
+
+		Expect(got.Annotations).NotTo(HaveKey(AnnotationInjected))
+		Expect(got.Spec.HostIPC).To(BeFalse())
+		Expect(hasVolume(got, "cb-plugin")).To(BeFalse())
+		Expect(got.Spec.InitContainers).To(BeEmpty())
+	})
+
+	It("skips a pod whose target container overrides command, stamping a skip reason", func() {
+		pod := vllmPod(func(p *corev1.Pod) {
+			p.Name = "vllm-wrapped"
+			p.Spec.Containers[0].Command = []string{"/bin/sh", "-c", "exec vllm serve"}
+		})
+		Expect(k8sClient.Create(envtestCtx, pod)).To(Succeed())
+
+		got := &corev1.Pod{}
+		Expect(k8sClient.Get(envtestCtx,
+			types.NamespacedName{Name: "vllm-wrapped", Namespace: testNamespace}, got)).To(Succeed())
+
+		Expect(got.Annotations).To(HaveKeyWithValue(AnnotationSkipReason, SkipReasonCommandOverride))
+		Expect(got.Annotations).NotTo(HaveKey(AnnotationInjected))
+		Expect(hasVolume(got, "cb-plugin")).To(BeFalse())
+	})
+})
+
+func hasVolume(pod *corev1.Pod, name string) bool {
+	for _, v := range pod.Spec.Volumes {
+		if v.Name == name {
+			return true
+		}
+	}
+	return false
+}
+
+func hasReadOnlyMount(c *corev1.Container, name string) bool {
+	for _, m := range c.VolumeMounts {
+		if m.Name == name && m.ReadOnly {
+			return true
+		}
+	}
+	return false
+}
diff --git a/operator/internal/webhook/pod_injector_test.go b/operator/internal/webhook/pod_injector_test.go
new file mode 100644
index 0000000000..ba8a7606cc
--- /dev/null
+++ b/operator/internal/webhook/pod_injector_test.go
@@ -0,0 +1,574 @@
+/*
+Copyright 2026.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package webhook
+
+import (
+	"context"
+	"encoding/json"
+	"strings"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	jsonpatch "github.com/evanphx/json-patch/v5"
+	admissionv1 "k8s.io/api/admission/v1"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
+
+	lmcachev1alpha1 "github.com/LMCache/LMCache/api/v1alpha1"
+	"github.com/LMCache/LMCache/internal/resources"
+)
+
+const (
+	testEngineName = "cb"
+	testNamespace  = "vllm-ns"
+	testPodName    = "vllm-pod"
+	testSvcHost    = "tcp://cb.vllm-ns.svc.cluster.local"
+)
+
+// newTestScheme returns a scheme with clientgo + the lmcache v1alpha1 types.
+func newTestScheme() *runtime.Scheme {
+	s := runtime.NewScheme()
+	Expect(clientgoscheme.AddToScheme(s)).To(Succeed())
+	Expect(lmcachev1alpha1.AddToScheme(s)).To(Succeed())
+	return s
+}
+
+// newTestEngine returns a defaulted CacheBlendEngine with the given injection
+// overrides applied via mutate (nil = pure defaults).
+func newTestEngine(mutate func(*lmcachev1alpha1.CacheBlendEngine)) *lmcachev1alpha1.CacheBlendEngine {
+	payloadRepo := "registry.example.com/lmcache/cacheblend-payload"
+	payloadTag := "pinned"
+	engine := &lmcachev1alpha1.CacheBlendEngine{
+		ObjectMeta: metav1.ObjectMeta{Name: testEngineName, Namespace: testNamespace},
+		Spec: lmcachev1alpha1.CacheBlendEngineSpec{
+			L1: lmcachev1alpha1.L1BackendSpec{SizeGB: 10},
+			Injection: &lmcachev1alpha1.InjectionSpec{
+				PayloadImage: &lmcachev1alpha1.ImageSpec{Repository: &payloadRepo, Tag: &payloadTag},
+				ImagePullSecrets: []corev1.LocalObjectReference{
+					{Name: "cb-payload-pull"},
+				},
+			},
+		},
+	}
+	engine.SetDefaults()
+	if mutate != nil {
+		mutate(engine)
+	}
+	return engine
+}
+
+// newPodInjector returns a PodInjector backed by a fake client seeded with the
+// given objects, plus the engine's connection ConfigMap when seedConn is true.
+func newPodInjector(
+	engine *lmcachev1alpha1.CacheBlendEngine,
+	seedConn bool,
+) *PodInjector {
+	scheme := newTestScheme()
+	builder := fake.NewClientBuilder().WithScheme(scheme)
+	objs := []runtime.Object{engine}
+	if seedConn {
+		objs = append(objs, resources.BuildCBConnectionConfigMap(engine))
+	}
+	builder = builder.WithRuntimeObjects(objs...)
+	return &PodInjector{
+		Client:  builder.Build(),
+		Decoder: admission.NewDecoder(scheme),
+	}
+}
+
+// makeRequest builds a CREATE admission.Request carrying the given pod as raw
+// JSON in req.Object.
+func makeRequest(pod *corev1.Pod) admission.Request {
+	pod.TypeMeta = metav1.TypeMeta{APIVersion: "v1", Kind: "Pod"}
+	if pod.Namespace == "" {
+		pod.Namespace = testNamespace
+	}
+	raw, err := json.Marshal(pod)
+	Expect(err).NotTo(HaveOccurred())
+	return admission.Request{
+		AdmissionRequest: admissionv1.AdmissionRequest{
+			Operation: admissionv1.Create,
+			Namespace: pod.Namespace,
+			Object:    runtime.RawExtension{Raw: raw},
+		},
+	}
+}
+
+// applyResponse applies the response's JSON patches to the original pod JSON and
+// returns the mutated pod. It asserts the response is Allowed.
+func applyResponse(original *corev1.Pod, resp admission.Response) *corev1.Pod {
+	Expect(resp.Allowed).To(BeTrue(), "expected the response to be Allowed")
+
+	origRaw, err := json.Marshal(original)
+	Expect(err).NotTo(HaveOccurred())
+
+	if len(resp.Patches) == 0 {
+		out := &corev1.Pod{}
+		Expect(json.Unmarshal(origRaw, out)).To(Succeed())
+		return out
+	}
+
+	patchRaw, err := json.Marshal(resp.Patches)
+	Expect(err).NotTo(HaveOccurred())
+	patch, err := jsonpatch.DecodePatch(patchRaw)
+	Expect(err).NotTo(HaveOccurred())
+	mutatedRaw, err := patch.Apply(origRaw)
+	Expect(err).NotTo(HaveOccurred())
+
+	out := &corev1.Pod{}
+	Expect(json.Unmarshal(mutatedRaw, out)).To(Succeed())
+	return out
+}
+
+// vllmPod returns a minimal args-only vLLM pod (no command override) bound to
+// the test engine via annotation. mutate may further customize it.
+func vllmPod(mutate func(*corev1.Pod)) *corev1.Pod {
+	pod := &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      testPodName,
+			Namespace: testNamespace,
+			Annotations: map[string]string{
+				AnnotationEngine: testEngineName,
+			},
+		},
+		Spec: corev1.PodSpec{
+			Containers: []corev1.Container{
+				{
+					Name:  "vllm",
+					Image: "vllm/vllm-openai:latest",
+					Args:  []string{"--model", "Qwen/Qwen2.5-0.5B"},
+				},
+			},
+		},
+	}
+	if mutate != nil {
+		mutate(pod)
+	}
+	return pod
+}
+
+// findContainer returns the named container from the pod, or nil.
+func findContainer(pod *corev1.Pod, name string) *corev1.Container {
+	for i := range pod.Spec.Containers {
+		if pod.Spec.Containers[i].Name == name {
+			return &pod.Spec.Containers[i]
+		}
+	}
+	return nil
+}
+
+// envValue returns the value of the named env var on the container, or "".
+func envValue(c *corev1.Container, name string) string {
+	for _, e := range c.Env {
+		if e.Name == name {
+			return e.Value
+		}
+	}
+	return ""
+}
+
+// pullSecretNames returns the names in the pod's imagePullSecrets.
+func pullSecretNames(pod *corev1.Pod) []string {
+	out := make([]string, 0, len(pod.Spec.ImagePullSecrets))
+	for _, s := range pod.Spec.ImagePullSecrets {
+		out = append(out, s.Name)
+	}
+	return out
+}
+
+var _ = Describe("PodInjector", func() {
+	ctx := context.Background()
+
+	Describe("full M0–M7 injection", func() {
+		It("injects all required mutations for an opted-in pod", func() {
+			engine := newTestEngine(nil)
+			injector := newPodInjector(engine, true)
+			pod := vllmPod(nil)
+
+			resp := injector.Handle(ctx, makeRequest(pod))
+			out := applyResponse(pod, resp)
+
+			By("M0: hostIPC")
+			Expect(out.Spec.HostIPC).To(BeTrue())
+
+			By("M1: cb-plugin emptyDir volume")
+			var vol *corev1.Volume
+			for i := range out.Spec.Volumes {
+				if out.Spec.Volumes[i].Name == cbPluginVolumeName {
+					vol = &out.Spec.Volumes[i]
+				}
+			}
+			Expect(vol).NotTo(BeNil())
+			Expect(vol.EmptyDir).NotTo(BeNil())
+
+			By("M2: payload init container with image + pull policy + SHARED_DIR")
+			Expect(out.Spec.InitContainers).To(HaveLen(1))
+			init := out.Spec.InitContainers[0]
+			Expect(init.Image).To(Equal("registry.example.com/lmcache/cacheblend-payload:pinned"))
+			Expect(init.ImagePullPolicy).To(Equal(corev1.PullIfNotPresent))
+			Expect(init.Command).To(BeEmpty())
+			Expect(envValue(&init, cbSharedDirEnvName)).To(Equal(cbPluginMountPath))
+			Expect(init.VolumeMounts).To(HaveLen(1))
+			Expect(init.VolumeMounts[0].Name).To(Equal(cbPluginVolumeName))
+			Expect(init.VolumeMounts[0].MountPath).To(Equal(cbPluginMountPath))
+			Expect(init.VolumeMounts[0].ReadOnly).To(BeFalse())
+
+			c := findContainer(out, "vllm")
+			Expect(c).NotTo(BeNil())
+
+			By("M3: read-only mount on the target container")
+			var mount *corev1.VolumeMount
+			for i := range c.VolumeMounts {
+				if c.VolumeMounts[i].Name == cbPluginVolumeName {
+					mount = &c.VolumeMounts[i]
+				}
+			}
+			Expect(mount).NotTo(BeNil())
+			Expect(mount.ReadOnly).To(BeTrue())
+			Expect(mount.MountPath).To(Equal(cbPluginMountPath))
+
+			By("M4: PYTHONPATH on the container")
+			Expect(envValue(c, pythonPathEnvName)).To(Equal(cbPythonPath))
+
+			By("M5: required vLLM args asserted individually")
+			Expect(argsHasFlagValue(c.Args, cbFlagAttentionBackend, cbValAttentionBackend)).To(BeTrue(),
+				"--attention-backend=CUSTOM")
+			Expect(argsHasFlag(c.Args, cbFlagNoChunkedPrefill)).To(BeTrue(),
+				"--no-enable-chunked-prefill")
+			Expect(argsHasFlagValue(c.Args, cbFlagBlockSize, cbValBlockSize)).To(BeTrue(),
+				"--block-size=64")
+			Expect(argsHasFlagValue(c.Args, cbFlagPipelineParallelSize, cbValPipelineParallelSize)).To(BeTrue(),
+				"--pipeline-parallel-size=1")
+			Expect(argsHasFlag(c.Args, cbFlagNoAsyncScheduling)).To(BeTrue(),
+				"--no-async-scheduling")
+			Expect(argsHasFlag(c.Args, cbFlagEnforceEager)).To(BeTrue(),
+				"default cudagraph eager -> --enforce-eager")
+
+			By("M5: --kv-transfer-config carries CBKVConnector + tcp:// host")
+			kv := argsFlagValue(c.Args, cbFlagKVTransferConfig)
+			Expect(kv).NotTo(BeEmpty())
+			Expect(kv).To(ContainSubstring("CBKVConnector"))
+			Expect(kv).To(ContainSubstring(testSvcHost))
+
+			By("M7: injection.imagePullSecrets appended to spec.imagePullSecrets")
+			Expect(pullSecretNames(out)).To(ContainElement("cb-payload-pull"))
+
+			By("M6: idempotency annotation stamped")
+			Expect(out.Annotations[AnnotationInjected]).To(Equal(valueTrue))
+			Expect(out.Annotations).NotTo(HaveKey(AnnotationSkipReason))
+		})
+	})
+
+	Describe("M7 image pull secrets", func() {
+		It("does not duplicate a secret the pod already lists", func() {
+			engine := newTestEngine(nil)
+			injector := newPodInjector(engine, true)
+			pod := vllmPod(func(p *corev1.Pod) {
+				p.Spec.ImagePullSecrets = []corev1.LocalObjectReference{{Name: "cb-payload-pull"}}
+			})
+
+			resp := injector.Handle(ctx, makeRequest(pod))
+			out := applyResponse(pod, resp)
+
+			Expect(pullSecretNames(out)).To(Equal([]string{"cb-payload-pull"}))
+		})
+
+		It("honors the cacheblend-image-pull-secrets annotation override", func() {
+			engine := newTestEngine(nil)
+			injector := newPodInjector(engine, true)
+			pod := vllmPod(func(p *corev1.Pod) {
+				p.Annotations[AnnotationImagePullSecrets] = "override-a, override-b"
+			})
+
+			resp := injector.Handle(ctx, makeRequest(pod))
+			out := applyResponse(pod, resp)
+
+			names := pullSecretNames(out)
+			Expect(names).To(ContainElements("override-a", "override-b"))
+			Expect(names).NotTo(ContainElement("cb-payload-pull"))
+		})
+	})
+
+	Describe("gating", func() {
+		It("allows a pod with no engine annotation unchanged", func() {
+			engine := newTestEngine(nil)
+			injector := newPodInjector(engine, true)
+			pod := vllmPod(func(p *corev1.Pod) {
+				delete(p.Annotations, AnnotationEngine)
+			})
+
+			resp := injector.Handle(ctx, makeRequest(pod))
+			Expect(resp.Allowed).To(BeTrue())
+			Expect(resp.Patches).To(BeEmpty())
+		})
+
+		It("skips + stamps engine-not-found when the connection ConfigMap is absent", func() {
+			engine := newTestEngine(nil)
+			injector := newPodInjector(engine, false) // no connection ConfigMap seeded
+			pod := vllmPod(nil)
+
+			resp := injector.Handle(ctx, makeRequest(pod))
+			out := applyResponse(pod, resp)
+
+			Expect(out.Annotations[AnnotationSkipReason]).To(Equal(SkipReasonEngineNotFound))
+			Expect(out.Annotations).NotTo(HaveKey(AnnotationInjected))
+			Expect(out.Spec.HostIPC).To(BeFalse())
+			Expect(out.Spec.InitContainers).To(BeEmpty())
+		})
+
+		It("skips + stamps engine-not-found when the engine CR is absent", func() {
+			engine := newTestEngine(nil)
+			// Seed an injector whose engine name differs from the pod's annotation.
+			injector := newPodInjector(engine, true)
+			pod := vllmPod(func(p *corev1.Pod) {
+				p.Annotations[AnnotationEngine] = "does-not-exist"
+			})
+
+			resp := injector.Handle(ctx, makeRequest(pod))
+			out := applyResponse(pod, resp)
+
+			Expect(out.Annotations[AnnotationSkipReason]).To(Equal(SkipReasonEngineNotFound))
+			Expect(out.Spec.HostIPC).To(BeFalse())
+		})
+
+		It("allows an already-injected pod as a no-op", func() {
+			engine := newTestEngine(nil)
+			injector := newPodInjector(engine, true)
+			pod := vllmPod(func(p *corev1.Pod) {
+				p.Annotations[AnnotationInjected] = valueTrue
+			})
+
+			resp := injector.Handle(ctx, makeRequest(pod))
+			Expect(resp.Allowed).To(BeTrue())
+			Expect(resp.Patches).To(BeEmpty())
+		})
+
+		It("skips + stamps command-override when the target container overrides command", func() {
+			engine := newTestEngine(nil)
+			injector := newPodInjector(engine, true)
+			pod := vllmPod(func(p *corev1.Pod) {
+				p.Spec.Containers[0].Command = []string{"/bin/sh", "-c", "exec vllm serve"}
+			})
+
+			resp := injector.Handle(ctx, makeRequest(pod))
+			out := applyResponse(pod, resp)
+
+			Expect(out.Annotations[AnnotationSkipReason]).To(Equal(SkipReasonCommandOverride))
+			Expect(out.Annotations).NotTo(HaveKey(AnnotationInjected))
+			Expect(out.Spec.HostIPC).To(BeFalse())
+			Expect(out.Spec.InitContainers).To(BeEmpty())
+		})
+	})
+
+	Describe("append-or-replace arg semantics", func() {
+		It("replaces a pre-existing --attention-backend value", func() {
+			engine := newTestEngine(nil)
+			injector := newPodInjector(engine, true)
+			pod := vllmPod(func(p *corev1.Pod) {
+				p.Spec.Containers[0].Args = []string{"--attention-backend", "FLASH_ATTN", "--model", "m"}
+			})
+
+			resp := injector.Handle(ctx, makeRequest(pod))
+			out := applyResponse(pod, resp)
+			c := findContainer(out, "vllm")
+
+			Expect(argsHasFlagValue(c.Args, cbFlagAttentionBackend, cbValAttentionBackend)).To(BeTrue())
+			Expect(argsHasFlagValue(c.Args, cbFlagAttentionBackend, "FLASH_ATTN")).To(BeFalse())
+			// Not duplicated.
+			Expect(countFlag(c.Args, cbFlagAttentionBackend)).To(Equal(1))
+		})
+
+		It("replaces a pre-existing --attention-backend=value (single-token form)", func() {
+			engine := newTestEngine(nil)
+			injector := newPodInjector(engine, true)
+			pod := vllmPod(func(p *corev1.Pod) {
+				p.Spec.Containers[0].Args = []string{"--attention-backend=FLASH_ATTN", "--model", "m"}
+			})
+
+			resp := injector.Handle(ctx, makeRequest(pod))
+			out := applyResponse(pod, resp)
+			c := findContainer(out, "vllm")
+
+			Expect(c.Args).To(ContainElement("--attention-backend=CUSTOM"))
+			Expect(c.Args).NotTo(ContainElement("--attention-backend=FLASH_ATTN"))
+		})
+
+		It("skips + stamps when the user already supplies --kv-transfer-config", func() {
+			engine := newTestEngine(nil)
+			injector := newPodInjector(engine, true)
+			pod := vllmPod(func(p *corev1.Pod) {
+				p.Spec.Containers[0].Args = []string{"--kv-transfer-config", `{"kv_connector":"Other"}`}
+			})
+
+			resp := injector.Handle(ctx, makeRequest(pod))
+			out := applyResponse(pod, resp)
+			c := findContainer(out, "vllm")
+
+			By("the user's kv-transfer-config JSON is untouched")
+			Expect(argsFlagValue(c.Args, cbFlagKVTransferConfig)).To(Equal(`{"kv_connector":"Other"}`))
+			Expect(argsFlagValue(c.Args, cbFlagKVTransferConfig)).NotTo(ContainSubstring("CBKVConnector"))
+
+			By("the skip reason is stamped but the rest of the injection still applies")
+			Expect(out.Annotations[AnnotationSkipReason]).To(Equal(SkipReasonKVTransferConfigPresent))
+			Expect(out.Annotations[AnnotationInjected]).To(Equal(valueTrue))
+			Expect(out.Spec.HostIPC).To(BeTrue())
+			Expect(argsHasFlagValue(c.Args, cbFlagBlockSize, cbValBlockSize)).To(BeTrue())
+		})
+
+		It("prepends to a pre-existing PYTHONPATH", func() {
+			engine := newTestEngine(nil)
+			injector := newPodInjector(engine, true)
+			pod := vllmPod(func(p *corev1.Pod) {
+				p.Spec.Containers[0].Env = []corev1.EnvVar{
+					{Name: "PYTHONPATH", Value: "/opt/extra"},
+				}
+			})
+
+			resp := injector.Handle(ctx, makeRequest(pod))
+			out := applyResponse(pod, resp)
+			c := findContainer(out, "vllm")
+
+			Expect(envValue(c, pythonPathEnvName)).To(Equal("/cb-plugin:/opt/extra"))
+		})
+	})
+
+	Describe("target container resolution", func() {
+		It("injects into the annotation-named non-first container", func() {
+			engine := newTestEngine(nil)
+			injector := newPodInjector(engine, true)
+			pod := vllmPod(func(p *corev1.Pod) {
+				p.Annotations[AnnotationContainer] = "vllm"
+				p.Spec.Containers = []corev1.Container{
+					{Name: "sidecar", Image: "busybox", Args: []string{"sleep"}},
+					{Name: "vllm", Image: "vllm/vllm-openai:latest", Args: []string{"--model", "m"}},
+				}
+			})
+
+			resp := injector.Handle(ctx, makeRequest(pod))
+			out := applyResponse(pod, resp)
+
+			By("the vLLM container is mutated")
+			vllm := findContainer(out, "vllm")
+			Expect(envValue(vllm, pythonPathEnvName)).To(Equal(cbPythonPath))
+			Expect(argsHasFlagValue(vllm.Args, cbFlagAttentionBackend, cbValAttentionBackend)).To(BeTrue())
+
+			By("the sidecar container is untouched")
+			sidecar := findContainer(out, "sidecar")
+			Expect(envValue(sidecar, pythonPathEnvName)).To(BeEmpty())
+			Expect(sidecar.Args).To(Equal([]string{"sleep"}))
+		})
+
+		It("uses the engine injection.targetContainer default when set", func() {
+			named := "vllm"
+			engine := newTestEngine(func(e *lmcachev1alpha1.CacheBlendEngine) {
+				e.Spec.Injection.TargetContainer = &named
+			})
+			injector := newPodInjector(engine, true)
+			pod := vllmPod(func(p *corev1.Pod) {
+				p.Spec.Containers = []corev1.Container{
+					{Name: "sidecar", Image: "busybox", Args: []string{"sleep"}},
+					{Name: "vllm", Image: "vllm/vllm-openai:latest", Args: []string{"--model", "m"}},
+				}
+			})
+
+			resp := injector.Handle(ctx, makeRequest(pod))
+			out := applyResponse(pod, resp)
+
+			vllm := findContainer(out, "vllm")
+			Expect(argsHasFlagValue(vllm.Args, cbFlagAttentionBackend, cbValAttentionBackend)).To(BeTrue())
+			sidecar := findContainer(out, "sidecar")
+			Expect(sidecar.Args).To(Equal([]string{"sleep"}))
+		})
+
+		It("skips + stamps target-container-not-found for an unknown container name", func() {
+			engine := newTestEngine(nil)
+			injector := newPodInjector(engine, true)
+			pod := vllmPod(func(p *corev1.Pod) {
+				p.Annotations[AnnotationContainer] = "does-not-exist"
+			})
+
+			resp := injector.Handle(ctx, makeRequest(pod))
+			out := applyResponse(pod, resp)
+
+			Expect(out.Annotations[AnnotationSkipReason]).To(Equal(SkipReasonTargetContainerNotFound))
+			Expect(out.Annotations).NotTo(HaveKey(AnnotationInjected))
+			Expect(out.Spec.HostIPC).To(BeFalse())
+			Expect(out.Spec.InitContainers).To(BeEmpty())
+			By("the original vLLM container is left untouched")
+			vllm := findContainer(out, "vllm")
+			Expect(envValue(vllm, pythonPathEnvName)).To(BeEmpty())
+		})
+	})
+
+	Describe("cudagraph modes", func() {
+		It("emits decode-only compilation config for full_decode_only", func() {
+			mode := lmcachev1alpha1.CudagraphFullDecodeOnly
+			engine := newTestEngine(func(e *lmcachev1alpha1.CacheBlendEngine) {
+				e.Spec.Injection.Cudagraph = &mode
+			})
+			injector := newPodInjector(engine, true)
+			pod := vllmPod(nil)
+
+			resp := injector.Handle(ctx, makeRequest(pod))
+			out := applyResponse(pod, resp)
+			c := findContainer(out, "vllm")
+
+			Expect(argsHasFlag(c.Args, cbFlagEnforceEager)).To(BeFalse())
+			Expect(argsFlagValue(c.Args, cbFlagCompilationConfig)).To(ContainSubstring("FULL_DECODE_ONLY"))
+		})
+	})
+})
+
+// --- test arg helpers (mirror the package's two-token / =-token recognition) ---
+
+// argsHasFlagValue reports whether args carries flag with the given value in
+// either the two-token or single-token form.
+func argsHasFlagValue(args []string, flag, value string) bool {
+	return argsFlagValue(args, flag) == value
+}
+
+// argsFlagValue returns the value bound to flag in args (two-token or
+// single-token form), or "" if the flag is absent.
+func argsFlagValue(args []string, flag string) string {
+	eqPrefix := flag + "="
+	for i := range len(args) {
+		if args[i] == flag && i+1 < len(args) {
+			return args[i+1]
+		}
+		if after, ok := strings.CutPrefix(args[i], eqPrefix); ok {
+			return after
+		}
+	}
+	return ""
+}
+
+// countFlag returns how many times flag appears (two-token or single-token).
+func countFlag(args []string, flag string) int {
+	eqPrefix := flag + "="
+	n := 0
+	for _, a := range args {
+		if a == flag || strings.HasPrefix(a, eqPrefix) {
+			n++
+		}
+	}
+	return n
+}
diff --git a/operator/internal/webhook/webhook_suite_test.go b/operator/internal/webhook/webhook_suite_test.go
new file mode 100644
index 0000000000..d4eea967c8
--- /dev/null
+++ b/operator/internal/webhook/webhook_suite_test.go
@@ -0,0 +1,173 @@
+/*
+Copyright 2026.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package webhook
+
+import (
+	"context"
+	"crypto/tls"
+	"fmt"
+	"net"
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"k8s.io/apimachinery/pkg/runtime"
+	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
+	"k8s.io/client-go/rest"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/envtest"
+	logf "sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/controller-runtime/pkg/log/zap"
+	metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
+	"sigs.k8s.io/controller-runtime/pkg/webhook"
+	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
+
+	lmcachev1alpha1 "github.com/LMCache/LMCache/api/v1alpha1"
+)
+
+// These package-level handles back the envtest-based integration spec
+// (pod_injector_envtest_test.go). The fake-client unit tests in
+// pod_injector_test.go do not use them. envtest is started once for the suite;
+// if the binaries are missing the suite fails fast (run `make setup-envtest`).
+var (
+	envtestCtx    context.Context
+	envtestCancel context.CancelFunc
+	mgrCancel     context.CancelFunc
+	testEnv       *envtest.Environment
+	cfg           *rest.Config
+	k8sClient     client.Client
+	envtestScheme *runtime.Scheme
+)
+
+// TestWebhook runs the Ginkgo suite for the CacheBlend mutating webhook package.
+func TestWebhook(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "CacheBlend Webhook Suite")
+}
+
+var _ = BeforeSuite(func() {
+	logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true)))
+
+	envtestCtx, envtestCancel = context.WithCancel(context.TODO())
+
+	envtestScheme = runtime.NewScheme()
+	Expect(clientgoscheme.AddToScheme(envtestScheme)).To(Succeed())
+	Expect(lmcachev1alpha1.AddToScheme(envtestScheme)).To(Succeed())
+
+	By("bootstrapping the test environment with the mutating webhook installed")
+	testEnv = &envtest.Environment{
+		CRDDirectoryPaths:     []string{filepath.Join("..", "..", "config", "crd", "bases")},
+		ErrorIfCRDPathMissing: true,
+		// Point at the generated manifest FILE, not the config/webhook dir: the
+		// dir also holds the kustomize selectors patch (a partial
+		// MutatingWebhookConfiguration with the same name), which envtest's
+		// path loader would try to install and reject as invalid. The base
+		// manifest has no objectSelector, so the webhook matches every pod —
+		// fine here, since the specs control which pods exist.
+		WebhookInstallOptions: envtest.WebhookInstallOptions{
+			Paths: []string{filepath.Join("..", "..", "config", "webhook", "manifests.yaml")},
+		},
+	}
+	if dir := firstFoundEnvtestBinaryDir(); dir != "" {
+		testEnv.BinaryAssetsDirectory = dir
+	}
+
+	var err error
+	cfg, err = testEnv.Start()
+	Expect(err).NotTo(HaveOccurred())
+	Expect(cfg).NotTo(BeNil())
+
+	k8sClient, err = client.New(cfg, client.Options{Scheme: envtestScheme})
+	Expect(err).NotTo(HaveOccurred())
+
+	By("starting a manager that serves the PodInjector webhook")
+	wio := &testEnv.WebhookInstallOptions
+	mgr, err := ctrl.NewManager(cfg, ctrl.Options{
+		Scheme:  envtestScheme,
+		Metrics: metricsserver.Options{BindAddress: "0"},
+		WebhookServer: webhook.NewServer(webhook.Options{
+			Host:    wio.LocalServingHost,
+			Port:    wio.LocalServingPort,
+			CertDir: wio.LocalServingCertDir,
+		}),
+		LeaderElection: false,
+	})
+	Expect(err).NotTo(HaveOccurred())
+
+	// The handler uses a direct (uncached) client so reads of the
+	// CacheBlendEngine and its connection ConfigMap succeed without waiting on
+	// informer cache sync — the production wiring uses mgr.GetClient().
+	directClient, err := client.New(cfg, client.Options{Scheme: envtestScheme})
+	Expect(err).NotTo(HaveOccurred())
+	mgr.GetWebhookServer().Register("/mutate--v1-pod", &webhook.Admission{Handler: &PodInjector{
+		Client:  directClient,
+		Decoder: admission.NewDecoder(mgr.GetScheme()),
+	}})
+
+	var mgrCtx context.Context
+	mgrCtx, mgrCancel = context.WithCancel(envtestCtx)
+	go func() {
+		defer GinkgoRecover()
+		Expect(mgr.Start(mgrCtx)).To(Succeed())
+	}()
+
+	By("waiting for the webhook server's TLS port to accept connections")
+	addr := fmt.Sprintf("%s:%d", wio.LocalServingHost, wio.LocalServingPort)
+	Eventually(func() error {
+		conn, derr := tls.DialWithDialer(&net.Dialer{Timeout: time.Second}, "tcp", addr,
+			&tls.Config{InsecureSkipVerify: true}) //nolint:gosec // test-only readiness probe
+		if derr != nil {
+			return derr
+		}
+		return conn.Close()
+	}, 30*time.Second, 200*time.Millisecond).Should(Succeed())
+})
+
+var _ = AfterSuite(func() {
+	if mgrCancel != nil {
+		mgrCancel()
+	}
+	if envtestCancel != nil {
+		envtestCancel()
+	}
+	By("tearing down the test environment")
+	if testEnv != nil {
+		Eventually(func() error { return testEnv.Stop() }, time.Minute, time.Second).Should(Succeed())
+	}
+})
+
+// firstFoundEnvtestBinaryDir locates the envtest binary directory under bin/k8s
+// so the suite runs from an IDE without KUBEBUILDER_ASSETS set, mirroring the
+// controller suite's helper.
+func firstFoundEnvtestBinaryDir() string {
+	basePath := filepath.Join("..", "..", "bin", "k8s")
+	entries, err := os.ReadDir(basePath)
+	if err != nil {
+		return ""
+	}
+	for _, entry := range entries {
+		if entry.IsDir() {
+			return filepath.Join(basePath, entry.Name())
+		}
+	}
+	return ""
+}
diff --git a/operator/make/build.mk b/operator/make/build.mk
index afdbff11c0..eb77c2be7f 100644
--- a/operator/make/build.mk
+++ b/operator/make/build.mk
@@ -5,8 +5,8 @@ build: manifests generate fmt vet ## Build manager binary.
 	go build -o bin/manager cmd/main.go
 
 .PHONY: run
-run: manifests generate fmt vet ## Run a controller from your host.
-	go run ./cmd/main.go
+run: manifests generate fmt vet ## Run a controller from your host (webhook off; no host certs needed).
+	ENABLE_WEBHOOKS=false go run ./cmd/main.go
 
 # If you wish to build the manager image targeting other platforms you can use the --platform flag.
 # (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it.

From 04ea5fbc522a9fb278ec61384659f88fb3bd419a Mon Sep 17 00:00:00 2001
From: longguo <107740309+abinggo@users.noreply.github.com>
Date: Thu, 11 Jun 2026 10:26:44 +0800
Subject: [PATCH 32/57] fix(v1): graceful skip on slot_mapping/token_ids desync
 in wait_for_save (fixes #3318) (#3325)

Signed-off-by: abinggo <107740309+abinggo@users.noreply.github.com>
---
 lmcache/integration/vllm/vllm_v1_adapter.py |  12 +-
 tests/v1/test_v1_adapter_state_desync.py    | 155 ++++++++++++++++++++
 2 files changed, 166 insertions(+), 1 deletion(-)
 create mode 100644 tests/v1/test_v1_adapter_state_desync.py

diff --git a/lmcache/integration/vllm/vllm_v1_adapter.py b/lmcache/integration/vllm/vllm_v1_adapter.py
index 82899dc311..1bd3a675e7 100644
--- a/lmcache/integration/vllm/vllm_v1_adapter.py
+++ b/lmcache/integration/vllm/vllm_v1_adapter.py
@@ -1146,7 +1146,17 @@ def wait_for_save(self):
 
             slot_mapping = request.slot_mapping
             assert isinstance(slot_mapping, torch.Tensor)
-            assert len(slot_mapping) == len(token_ids)
+            if len(slot_mapping) != len(token_ids):
+                logger.warning(
+                    "Skipping KV save for request %s: slot_mapping/token_ids "
+                    "length mismatch (slot_mapping=%d, token_ids=%d). Likely "
+                    "an upstream allocation/preemption desync; the engine "
+                    "stays alive and only this request's save is dropped.",
+                    request.req_id,
+                    len(slot_mapping),
+                    len(token_ids),
+                )
+                continue
 
             # TODO: have a pre-allocated buffer to hold the slot_mappings
             slot_mapping = slot_mapping.to(self.device)
diff --git a/tests/v1/test_v1_adapter_state_desync.py b/tests/v1/test_v1_adapter_state_desync.py
new file mode 100644
index 0000000000..a57b77c0c5
--- /dev/null
+++ b/tests/v1/test_v1_adapter_state_desync.py
@@ -0,0 +1,155 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Regression test for LMCache#3318.
+
+The vLLM v1 adapter previously asserted
+``len(slot_mapping) == len(token_ids)`` inside ``wait_for_save``. When the
+state desynced (e.g. upstream allocation failure or preemption-induced
+mismatch) the assertion fired as an unhandled ``AssertionError`` and
+killed the entire EngineCore process for every connected user.
+
+The fix replaces the assert with a logged ``continue`` so the engine
+stays alive and only the affected request's save is dropped. This test
+locks in that behavior by feeding ``wait_for_save`` a request whose
+``slot_mapping`` and ``token_ids`` lengths disagree and asserting:
+
+1. ``wait_for_save`` does not raise.
+2. A warning is emitted naming the request id and both lengths.
+3. ``lmcache_engine.store`` is not called for the desynced request
+   (the save is dropped, not silently corrupted).
+4. ``lookup_unpin`` is still called so the pin count stays balanced.
+"""
+
+# Standard
+from types import SimpleNamespace
+import logging
+
+# Third Party
+import pytest
+import torch
+
+pytest.importorskip("vllm")
+
+# First Party
+from lmcache.integration.vllm.vllm_v1_adapter import (
+    LMCacheConnectorMetadata,
+    LMCacheConnectorV1Impl,
+    SaveSpec,
+)
+
+
+class _FakeParent:
+    def __init__(self, metadata: LMCacheConnectorMetadata) -> None:
+        self._connector_metadata = metadata
+
+    def _get_connector_metadata(self) -> LMCacheConnectorMetadata:
+        return self._connector_metadata
+
+
+class _FakeEngine:
+    """Records calls to ``lookup_unpin`` and ``store`` so the test can
+    assert which paths fired."""
+
+    def __init__(self) -> None:
+        self.unpinned: list[str] = []
+        self.store_calls: list[str] = []
+
+    def lookup_unpin(self, req_id: str) -> None:
+        self.unpinned.append(req_id)
+
+    def store(self, *args, **kwargs) -> None:
+        self.store_calls.append(kwargs.get("req_id", "<unknown>"))
+
+
+def _make_desync_request(
+    req_id: str, token_ids_len: int, slot_mapping_len: int
+) -> SimpleNamespace:
+    """Build a request whose ``token_ids`` and ``slot_mapping`` lengths
+    disagree, simulating a state desync."""
+    return SimpleNamespace(
+        req_id=req_id,
+        token_ids=list(range(token_ids_len)),
+        slot_mapping=torch.arange(slot_mapping_len, dtype=torch.long),
+        save_spec=SaveSpec(skip_leading_tokens=0, can_save=True),
+        disagg_spec=None,
+        is_last_prefill=True,
+        request_configs=None,
+    )
+
+
+def _make_connector(
+    requests: list[SimpleNamespace],
+) -> tuple[LMCacheConnectorV1Impl, _FakeEngine]:
+    metadata = LMCacheConnectorMetadata(requests=requests)  # type: ignore[arg-type]
+    engine = _FakeEngine()
+    connector = LMCacheConnectorV1Impl.__new__(LMCacheConnectorV1Impl)
+    connector._parent = _FakeParent(metadata)
+    # ``lmcache_engine`` is a read-only property backed by ``self._manager``;
+    # inject the fake engine through the manager so the property resolves to it.
+    connector._manager = SimpleNamespace(  # type: ignore[assignment]
+        lmcache_engine=engine
+    )
+    connector.kv_role = "kv_producer"
+    connector.use_layerwise = False
+    connector.enable_blending = False
+    connector.device = "cpu"
+    connector._lmcache_chunk_size = 8
+    connector.kv_caches = {"layer0": torch.zeros(1)}
+    connector.config = SimpleNamespace(pd_bidirectional=False)
+    return connector, engine
+
+
+def test_wait_for_save_skips_desynced_request_and_keeps_engine_alive() -> None:
+    """Length mismatch must drop only the affected request's save, log a
+    warning, and let ``wait_for_save`` return normally.
+
+    Regression for https://github.com/LMCache/LMCache/issues/3318.
+    """
+    # lmcache's ``init_logger`` sets ``propagate = False`` on the adapter
+    # logger so its records do not reach pytest's ``caplog`` (which
+    # attaches to the root logger). Toggling ``propagate`` is fragile --
+    # any lazy import that re-runs ``init_logger`` resets it. Attach a
+    # local handler directly to the named logger instead so we capture
+    # the warning regardless of how lmcache configures propagation.
+    captured_records: list[logging.LogRecord] = []
+
+    class _ListHandler(logging.Handler):
+        def emit(self, record: logging.LogRecord) -> None:
+            captured_records.append(record)
+
+    handler = _ListHandler(level=logging.WARNING)
+    adapter_logger = logging.getLogger("lmcache.integration.vllm.vllm_v1_adapter")
+    # ``init_logger`` sets the logger level from ``LMCACHE_LOG_LEVEL`` (default
+    # INFO). If a prior import set it above WARNING, ``logger.warning`` would be
+    # filtered before reaching our handler. Force WARNING for the duration of
+    # the test and restore the original level in ``finally``.
+    original_level = adapter_logger.level
+    adapter_logger.setLevel(logging.WARNING)
+    adapter_logger.addHandler(handler)
+    try:
+        desync_req = _make_desync_request(
+            "req-desync", token_ids_len=4, slot_mapping_len=3
+        )
+        connector, engine = _make_connector([desync_req])
+
+        connector.wait_for_save()
+
+        # 1. lookup_unpin still ran (pin balance preserved)
+        assert engine.unpinned == ["req-desync"]
+
+        # 2. store was NOT called for the desynced request (save dropped)
+        assert engine.store_calls == []
+
+        # 3. A warning was emitted naming the request and both lengths
+        warnings = [r for r in captured_records if r.levelno == logging.WARNING]
+        assert any(
+            "req-desync" in r.getMessage()
+            and "slot_mapping=3" in r.getMessage()
+            and "token_ids=4" in r.getMessage()
+            for r in warnings
+        ), (
+            "Expected desync warning naming req-desync; "
+            f"got {[r.getMessage() for r in warnings]}"
+        )
+    finally:
+        adapter_logger.removeHandler(handler)
+        adapter_logger.setLevel(original_level)

From fc9df67d1b6898e8cb4d3709da0e22df117a7bd4 Mon Sep 17 00:00:00 2001
From: aeon-x <talexcao@gmail.com>
Date: Wed, 10 Jun 2026 19:27:57 -0700
Subject: [PATCH 33/57] renaming

Signed-off-by: aeon-x <talexcao@gmail.com>
---
 .../mp_coordinator/l2_usage_and_eviction.md   | 14 ++--
 lmcache/v1/mp_coordinator/app.py              | 32 ++++-----
 lmcache/v1/mp_coordinator/http_apis/l2_api.py | 72 +++++++++----------
 ...tion_controller.py => eviction_manager.py} | 31 ++++----
 .../l2/{quota_store.py => quota_manager.py}   |  2 +-
 .../l2/{usage_tracker.py => usage_manager.py} |  6 +-
 ...controller.py => test_eviction_manager.py} | 20 +++---
 ...t_quota_store.py => test_quota_manager.py} | 22 +++---
 ...usage_tracker.py => test_usage_manager.py} | 28 ++++----
 9 files changed, 115 insertions(+), 112 deletions(-)
 rename lmcache/v1/mp_coordinator/l2/{eviction_controller.py => eviction_manager.py} (88%)
 rename lmcache/v1/mp_coordinator/l2/{quota_store.py => quota_manager.py} (98%)
 rename lmcache/v1/mp_coordinator/l2/{usage_tracker.py => usage_manager.py} (95%)
 rename tests/v1/mp_coordinator/{test_eviction_controller.py => test_eviction_manager.py} (88%)
 rename tests/v1/mp_coordinator/{test_quota_store.py => test_quota_manager.py} (69%)
 rename tests/v1/mp_coordinator/{test_usage_tracker.py => test_usage_manager.py} (74%)

diff --git a/docs/design/v1/mp_coordinator/l2_usage_and_eviction.md b/docs/design/v1/mp_coordinator/l2_usage_and_eviction.md
index 141ec8a6e6..1cb77af14f 100644
--- a/docs/design/v1/mp_coordinator/l2_usage_and_eviction.md
+++ b/docs/design/v1/mp_coordinator/l2_usage_and_eviction.md
@@ -35,9 +35,9 @@ MP server (store/lookup)
         │
         ▼
   POST /l2/events ──▶ Coordinator
-                        ├─ UsageTracker: per-salt byte accounting
-                        ├─ CoordinatorEvictionController: per-salt LRU
-                        └─ QuotaStore: per-salt byte limits
+                        ├─ CoordinatorUsageManager: per-salt byte accounting
+                        ├─ CoordinatorEvictionManager: per-salt LRU
+                        └─ CoordinatorQuotaManager: per-salt byte limits
 
   Coordinator background loop (every eviction_check_interval, default 5s)
         │
@@ -63,7 +63,7 @@ never imports ``torch``.
 
 ## Coordinator components (`l2/`)
 
-### UsageTracker (`usage_tracker.py`)
+### CoordinatorUsageManager (`usage_manager.py`)
 
 Thread-safe per-salt byte counter. Two operations:
 
@@ -72,13 +72,13 @@ Thread-safe per-salt byte counter. Two operations:
 
 Exposes ``get(salt)``, ``get_all()``, ``get_total()`` for the status endpoints.
 
-### QuotaStore (`quota_store.py`)
+### CoordinatorQuotaManager (`quota_manager.py`)
 
 Thread-safe in-memory quota registry (``dict[str, int]`` + lock). CRUD via
 ``set``, ``get``, ``delete``, ``list_all``. Quotas are set in GiB at the API
 and stored as bytes internally.
 
-### CoordinatorEvictionController (`eviction_controller.py`)
+### CoordinatorEvictionManager (`eviction_manager.py`)
 
 Per-``cache_salt`` LRU, mirroring ``IsolatedLRUEvictionPolicy`` but using
 ``CacheKey`` and running in the coordinator process.
@@ -94,7 +94,7 @@ _key_sizes      : dict[CacheKey, int]                       # byte size per key
 - ``on_lookup(key)`` — touch (move to MRU end).
 - ``on_remove(keys)`` — remove from LRU tracking after confirmed deletion.
 - ``execute_evictions()`` — for each tracked salt, compare usage (from
-  ``UsageTracker``) against quota (from ``QuotaStore``, default 0). If over
+  ``CoordinatorUsageManager``) against quota (from ``CoordinatorQuotaManager``, default 0). If over
   quota, select LRU keys targeting ``eviction_ratio`` of the overage. No quota
   or zero quota means evict all keys for that salt.
 
diff --git a/lmcache/v1/mp_coordinator/app.py b/lmcache/v1/mp_coordinator/app.py
index 97426bff2a..7fe1ce6d19 100644
--- a/lmcache/v1/mp_coordinator/app.py
+++ b/lmcache/v1/mp_coordinator/app.py
@@ -4,7 +4,7 @@
 The coordinator is a FastAPI app. Endpoints are auto-discovered from the
 ``http_apis`` package (the same convention as the mp server's HTTP API) and stay
 thin, operating on the shared collaborators carried on ``app.state``: ``config``,
-``registry``, ``quota_store``, ``usage_tracker``, and ``eviction_controller``.
+``registry``, ``quota_manager``, ``usage_manager``, and ``eviction_manager``.
 The lifespan runs background tasks for health-checking (eviction of instances
 whose heartbeats have lapsed) and L2 eviction (quota enforcement).
 
@@ -29,11 +29,11 @@
 # First Party
 from lmcache.logging import init_logger
 from lmcache.v1.mp_coordinator.config import MPCoordinatorConfig
-from lmcache.v1.mp_coordinator.l2.eviction_controller import (
-    CoordinatorEvictionController,
+from lmcache.v1.mp_coordinator.l2.eviction_manager import (
+    CoordinatorEvictionManager,
 )
-from lmcache.v1.mp_coordinator.l2.quota_store import QuotaStore
-from lmcache.v1.mp_coordinator.l2.usage_tracker import UsageTracker
+from lmcache.v1.mp_coordinator.l2.quota_manager import CoordinatorQuotaManager
+from lmcache.v1.mp_coordinator.l2.usage_manager import CoordinatorUsageManager
 from lmcache.v1.mp_coordinator.registry import InstanceRegistry
 from lmcache.v1.utils.router_discovery import discover_api_routers
 
@@ -66,15 +66,15 @@ def create_app(config: MPCoordinatorConfig) -> FastAPI:
 
     Returns:
         A configured FastAPI application. ``app.state`` carries the shared
-        collaborators (``config``, ``registry``, ``quota_store``,
-        ``usage_tracker``); all ``http_apis`` routers are registered.
+        collaborators (``config``, ``registry``, ``quota_manager``,
+        ``usage_manager``); all ``http_apis`` routers are registered.
     """
     registry = InstanceRegistry()
-    quota_store = QuotaStore()
-    usage_tracker = UsageTracker()
-    eviction_controller = CoordinatorEvictionController(
-        quota_store=quota_store,
-        usage_tracker=usage_tracker,
+    quota_manager = CoordinatorQuotaManager()
+    usage_manager = CoordinatorUsageManager()
+    eviction_manager = CoordinatorEvictionManager(
+        quota_manager=quota_manager,
+        usage_manager=usage_manager,
         eviction_ratio=config.eviction_ratio,
     )
 
@@ -88,7 +88,7 @@ async def _eviction_loop() -> None:
         """Periodically check usage against quotas and log eviction plans."""
         while True:
             await asyncio.sleep(config.eviction_check_interval)
-            eviction_controller.execute_evictions()
+            eviction_manager.execute_evictions()
 
     @asynccontextmanager
     async def lifespan(app: FastAPI) -> AsyncIterator[None]:
@@ -115,9 +115,9 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]:
     # Shared collaborators on app.state so routers compose from them.
     app.state.config = config
     app.state.registry = registry
-    app.state.quota_store = quota_store
-    app.state.usage_tracker = usage_tracker
-    app.state.eviction_controller = eviction_controller
+    app.state.quota_manager = quota_manager
+    app.state.usage_manager = usage_manager
+    app.state.eviction_manager = eviction_manager
 
     apis_path = Path(__file__).parent / "http_apis"
     package = f"{__package__}.http_apis"
diff --git a/lmcache/v1/mp_coordinator/http_apis/l2_api.py b/lmcache/v1/mp_coordinator/http_apis/l2_api.py
index 8a03263a34..b7bfc8a747 100644
--- a/lmcache/v1/mp_coordinator/http_apis/l2_api.py
+++ b/lmcache/v1/mp_coordinator/http_apis/l2_api.py
@@ -10,11 +10,11 @@
 from fastapi.responses import JSONResponse
 
 # First Party
-from lmcache.v1.mp_coordinator.l2.eviction_controller import (
-    CoordinatorEvictionController,
+from lmcache.v1.mp_coordinator.l2.eviction_manager import (
+    CoordinatorEvictionManager,
 )
-from lmcache.v1.mp_coordinator.l2.quota_store import QuotaStore
-from lmcache.v1.mp_coordinator.l2.usage_tracker import UsageTracker
+from lmcache.v1.mp_coordinator.l2.quota_manager import CoordinatorQuotaManager
+from lmcache.v1.mp_coordinator.l2.usage_manager import CoordinatorUsageManager
 from lmcache.v1.mp_coordinator.schemas import (
     EventType,
     L2StatusListResponse,
@@ -35,58 +35,58 @@ def _gb(n_bytes: int) -> float:
     return n_bytes / _GB
 
 
-def _quota_store(request: Request) -> QuotaStore:
-    """Return the shared quota store from app state.
+def _quota_manager(request: Request) -> CoordinatorQuotaManager:
+    """Return the shared quota manager from app state.
 
     Args:
         request: The incoming request.
 
     Returns:
-        The shared :class:`QuotaStore`.
+        The shared :class:`CoordinatorQuotaManager`.
 
     Raises:
-        RuntimeError: If the store is not initialized.
+        RuntimeError: If the manager is not initialized.
     """
-    store = getattr(request.app.state, "quota_store", None)
-    if store is None:
-        raise RuntimeError("quota store not initialized")
-    return store
+    mgr = getattr(request.app.state, "quota_manager", None)
+    if mgr is None:
+        raise RuntimeError("quota manager not initialized")
+    return mgr
 
 
-def _tracker(request: Request) -> UsageTracker:
-    """Return the shared usage tracker from app state.
+def _usage_manager(request: Request) -> CoordinatorUsageManager:
+    """Return the shared usage manager from app state.
 
     Args:
         request: The incoming request.
 
     Returns:
-        The shared :class:`UsageTracker`.
+        The shared :class:`CoordinatorUsageManager`.
 
     Raises:
-        RuntimeError: If the tracker is not initialized.
+        RuntimeError: If the manager is not initialized.
     """
-    tracker = getattr(request.app.state, "usage_tracker", None)
-    if tracker is None:
-        raise RuntimeError("usage tracker not initialized")
-    return tracker
+    mgr = getattr(request.app.state, "usage_manager", None)
+    if mgr is None:
+        raise RuntimeError("usage manager not initialized")
+    return mgr
 
 
-def _eviction_controller(request: Request) -> CoordinatorEvictionController:
-    """Return the shared eviction controller from app state.
+def _eviction_manager(request: Request) -> CoordinatorEvictionManager:
+    """Return the shared eviction manager from app state.
 
     Args:
         request: The incoming request.
 
     Returns:
-        The shared :class:`CoordinatorEvictionController`.
+        The shared :class:`CoordinatorEvictionManager`.
 
     Raises:
-        RuntimeError: If the controller is not initialized.
+        RuntimeError: If the manager is not initialized.
     """
-    ctrl = getattr(request.app.state, "eviction_controller", None)
-    if ctrl is None:
-        raise RuntimeError("eviction controller not initialized")
-    return ctrl
+    mgr = getattr(request.app.state, "eviction_manager", None)
+    if mgr is None:
+        raise RuntimeError("eviction manager not initialized")
+    return mgr
 
 
 # -- Quota writes ------------------------------------------------------------
@@ -103,7 +103,7 @@ async def set_quota(
     """
     limit_bytes = int(body.limit_gb * _GB)
     try:
-        _quota_store(request).set(cache_salt, limit_bytes)
+        _quota_manager(request).set(cache_salt, limit_bytes)
     except ValueError as exc:
         return JSONResponse(status_code=400, content={"error": str(exc)})
     return QuotaResponse(
@@ -120,7 +120,7 @@ async def delete_quota(cache_salt: str, request: Request) -> QuotaResponse:
     Returns:
         Whether the entry was found and removed.
     """
-    removed = _quota_store(request).delete(cache_salt)
+    removed = _quota_manager(request).delete(cache_salt)
     return QuotaResponse(
         cache_salt=cache_salt,
         limit_gb=0.0,
@@ -140,8 +140,8 @@ async def report_events(
     Returns:
         Number of events processed.
     """
-    tracker = _tracker(request)
-    ctrl = _eviction_controller(request)
+    tracker = _usage_manager(request)
+    ctrl = _eviction_manager(request)
     for event in body.events:
         if event.type == EventType.STORE:
             tracker.record_stored(event.key.cache_salt, event.bytes)
@@ -161,8 +161,8 @@ async def get_status(cache_salt: str, request: Request) -> L2StatusResponse:
     Returns:
         Combined quota and usage detail.
     """
-    tracker = _tracker(request)
-    store = _quota_store(request)
+    tracker = _usage_manager(request)
+    store = _quota_manager(request)
     usage = tracker.get(cache_salt)
     limit = store.get(cache_salt)
     return L2StatusResponse(
@@ -180,8 +180,8 @@ async def list_status(request: Request) -> L2StatusListResponse:
     Returns:
         Total usage and per-salt breakdown with quota info.
     """
-    tracker = _tracker(request)
-    store = _quota_store(request)
+    tracker = _usage_manager(request)
+    store = _quota_manager(request)
     by_salt = tracker.get_all()
     total = tracker.get_total()
     quota_entries = {e.cache_salt: e.limit_bytes for e in store.list_all()}
diff --git a/lmcache/v1/mp_coordinator/l2/eviction_controller.py b/lmcache/v1/mp_coordinator/l2/eviction_manager.py
similarity index 88%
rename from lmcache/v1/mp_coordinator/l2/eviction_controller.py
rename to lmcache/v1/mp_coordinator/l2/eviction_manager.py
index b7e83acb4a..cbba14cc1b 100644
--- a/lmcache/v1/mp_coordinator/l2/eviction_controller.py
+++ b/lmcache/v1/mp_coordinator/l2/eviction_manager.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Coordinator-side eviction controller with per-``cache_salt`` LRU.
+"""Coordinator-side eviction manager with per-``cache_salt`` LRU.
 
 Mirrors the structure of
 :class:`~lmcache.v1.distributed.eviction_policy.isolated_lru.IsolatedLRUEvictionPolicy`
@@ -7,8 +7,9 @@
 :class:`CacheKey` instead of :class:`ObjectKey` (which pulls in
 ``torch``).
 
-The controller periodically checks per-salt usage (from
-:class:`UsageTracker`) against limits (from :class:`QuotaStore`).
+The manager periodically checks per-salt usage
+(from :class:`CoordinatorUsageManager`) against limits
+(from :class:`CoordinatorQuotaManager`).
 When a salt exceeds its quota, it selects LRU victims and **logs**
 them — actual deletion is not implemented yet.
 """
@@ -22,15 +23,15 @@
 
 # First Party
 from lmcache.logging import init_logger
-from lmcache.v1.mp_coordinator.l2.quota_store import QuotaStore
-from lmcache.v1.mp_coordinator.l2.usage_tracker import UsageTracker
+from lmcache.v1.mp_coordinator.l2.quota_manager import CoordinatorQuotaManager
+from lmcache.v1.mp_coordinator.l2.usage_manager import CoordinatorUsageManager
 from lmcache.v1.mp_coordinator.schemas import CacheKey
 
 logger = init_logger(__name__)
 
 
-class CoordinatorEvictionController:
-    """Per-``cache_salt`` LRU eviction controller for the coordinator.
+class CoordinatorEvictionManager:
+    """Per-``cache_salt`` LRU eviction manager for the coordinator.
 
     Maintains one ``OrderedDict`` per ``cache_salt``, ordered from
     least-recently-used (front) to most-recently-used (end). Also
@@ -39,21 +40,21 @@ class CoordinatorEvictionController:
     Thread-safety: every public method acquires ``_lock``.
 
     Args:
-        quota_store: The shared quota registry.
-        usage_tracker: The shared usage tracker.
+        quota_manager: The shared quota registry.
+        usage_manager: The shared usage manager.
         eviction_ratio: Fraction of over-quota bytes to target for
             eviction each cycle.
     """
 
     def __init__(
         self,
-        quota_store: QuotaStore,
-        usage_tracker: UsageTracker,
+        quota_manager: CoordinatorQuotaManager,
+        usage_manager: CoordinatorUsageManager,
         eviction_ratio: float = 0.5,
     ) -> None:
         self._lock = threading.Lock()
-        self._quota_store = quota_store
-        self._usage_tracker = usage_tracker
+        self._quota_manager = quota_manager
+        self._usage_manager = usage_manager
         self._eviction_ratio = max(0.0, min(1.0, eviction_ratio))
         self._per_salt_order: dict[str, OrderedDict[CacheKey, None]] = {}
         self._key_sizes: dict[CacheKey, int] = {}
@@ -120,7 +121,7 @@ def execute_evictions(self) -> dict[str, list[CacheKey]]:
             A mapping of ``cache_salt`` to the list of keys selected
             for eviction.
         """
-        quotas = {e.cache_salt: e.limit_bytes for e in self._quota_store.list_all()}
+        quotas = {e.cache_salt: e.limit_bytes for e in self._quota_manager.list_all()}
         with self._lock:
             tracked_salts = list(self._per_salt_order.keys())
 
@@ -128,7 +129,7 @@ def execute_evictions(self) -> dict[str, list[CacheKey]]:
 
         for cache_salt in tracked_salts:
             limit_bytes = quotas.get(cache_salt, 0)
-            current_bytes = self._usage_tracker.get(cache_salt)
+            current_bytes = self._usage_manager.get(cache_salt)
             if current_bytes <= limit_bytes:
                 continue
 
diff --git a/lmcache/v1/mp_coordinator/l2/quota_store.py b/lmcache/v1/mp_coordinator/l2/quota_manager.py
similarity index 98%
rename from lmcache/v1/mp_coordinator/l2/quota_store.py
rename to lmcache/v1/mp_coordinator/l2/quota_manager.py
index a9d0c3fef7..102deaccc9 100644
--- a/lmcache/v1/mp_coordinator/l2/quota_store.py
+++ b/lmcache/v1/mp_coordinator/l2/quota_manager.py
@@ -29,7 +29,7 @@ class QuotaEntry:
     limit_bytes: int
 
 
-class QuotaStore:
+class CoordinatorQuotaManager:
     """Thread-safe in-memory registry of byte quotas keyed by ``cache_salt``.
 
     All public methods acquire an internal lock so the store stays
diff --git a/lmcache/v1/mp_coordinator/l2/usage_tracker.py b/lmcache/v1/mp_coordinator/l2/usage_manager.py
similarity index 95%
rename from lmcache/v1/mp_coordinator/l2/usage_tracker.py
rename to lmcache/v1/mp_coordinator/l2/usage_manager.py
index 241202f409..426275a763 100644
--- a/lmcache/v1/mp_coordinator/l2/usage_tracker.py
+++ b/lmcache/v1/mp_coordinator/l2/usage_manager.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Per-``cache_salt`` L2 usage tracker for the MP coordinator.
+"""Per-``cache_salt`` L2 usage manager for the MP coordinator.
 
 Maintains running byte totals per tenant, updated by store events
 reported by MP servers. Eviction (byte subtraction) is driven by
@@ -19,8 +19,8 @@
 logger = init_logger(__name__)
 
 
-class UsageTracker:
-    """Thread-safe in-memory tracker of L2 byte usage per ``cache_salt``.
+class CoordinatorUsageManager:
+    """Thread-safe in-memory manager of L2 byte usage per ``cache_salt``.
 
     MP servers report ``store`` events. The coordinator calls
     ``record_evicted`` when it decides to evict data. Byte counters
diff --git a/tests/v1/mp_coordinator/test_eviction_controller.py b/tests/v1/mp_coordinator/test_eviction_manager.py
similarity index 88%
rename from tests/v1/mp_coordinator/test_eviction_controller.py
rename to tests/v1/mp_coordinator/test_eviction_manager.py
index 84d55c9f46..ff815a3502 100644
--- a/tests/v1/mp_coordinator/test_eviction_controller.py
+++ b/tests/v1/mp_coordinator/test_eviction_manager.py
@@ -1,12 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Tests for the coordinator eviction controller."""
+"""Tests for the coordinator eviction manager."""
 
 # First Party
-from lmcache.v1.mp_coordinator.l2.eviction_controller import (
-    CoordinatorEvictionController,
+from lmcache.v1.mp_coordinator.l2.eviction_manager import (
+    CoordinatorEvictionManager,
 )
-from lmcache.v1.mp_coordinator.l2.quota_store import QuotaStore
-from lmcache.v1.mp_coordinator.l2.usage_tracker import UsageTracker
+from lmcache.v1.mp_coordinator.l2.quota_manager import CoordinatorQuotaManager
+from lmcache.v1.mp_coordinator.l2.usage_manager import CoordinatorUsageManager
 from lmcache.v1.mp_coordinator.schemas import CacheKey
 
 
@@ -16,10 +16,12 @@ def _make_key(salt: str, model: str = "m", rank: int = 0, h: str = "aa") -> Cach
 
 def _setup(
     eviction_ratio: float = 0.5,
-) -> tuple[CoordinatorEvictionController, QuotaStore, UsageTracker]:
-    qs = QuotaStore()
-    ut = UsageTracker()
-    ctrl = CoordinatorEvictionController(qs, ut, eviction_ratio=eviction_ratio)
+) -> tuple[
+    CoordinatorEvictionManager, CoordinatorQuotaManager, CoordinatorUsageManager
+]:
+    qs = CoordinatorQuotaManager()
+    ut = CoordinatorUsageManager()
+    ctrl = CoordinatorEvictionManager(qs, ut, eviction_ratio=eviction_ratio)
     return ctrl, qs, ut
 
 
diff --git a/tests/v1/mp_coordinator/test_quota_store.py b/tests/v1/mp_coordinator/test_quota_manager.py
similarity index 69%
rename from tests/v1/mp_coordinator/test_quota_store.py
rename to tests/v1/mp_coordinator/test_quota_manager.py
index b79b44f6f3..295cf26233 100644
--- a/tests/v1/mp_coordinator/test_quota_store.py
+++ b/tests/v1/mp_coordinator/test_quota_manager.py
@@ -1,45 +1,45 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Tests for the coordinator QuotaStore."""
+"""Tests for the coordinator CoordinatorQuotaManager."""
 
 # Third Party
 import pytest
 
 # First Party
-from lmcache.v1.mp_coordinator.l2.quota_store import QuotaStore
+from lmcache.v1.mp_coordinator.l2.quota_manager import CoordinatorQuotaManager
 
 
 def test_set_and_get():
-    store = QuotaStore()
+    store = CoordinatorQuotaManager()
     store.set("salt-a", 1000)
     assert store.get("salt-a") == 1000
 
 
 def test_get_unregistered_returns_none():
-    store = QuotaStore()
+    store = CoordinatorQuotaManager()
     assert store.get("unknown") is None
 
 
 def test_set_overwrites():
-    store = QuotaStore()
+    store = CoordinatorQuotaManager()
     store.set("salt-a", 1000)
     store.set("salt-a", 2000)
     assert store.get("salt-a") == 2000
 
 
 def test_delete():
-    store = QuotaStore()
+    store = CoordinatorQuotaManager()
     store.set("salt-a", 1000)
     assert store.delete("salt-a") is True
     assert store.get("salt-a") is None
 
 
 def test_delete_nonexistent():
-    store = QuotaStore()
+    store = CoordinatorQuotaManager()
     assert store.delete("unknown") is False
 
 
 def test_list_all():
-    store = QuotaStore()
+    store = CoordinatorQuotaManager()
     store.set("a", 100)
     store.set("b", 200)
     entries = store.list_all()
@@ -48,17 +48,17 @@ def test_list_all():
 
 
 def test_list_all_empty():
-    store = QuotaStore()
+    store = CoordinatorQuotaManager()
     assert store.list_all() == []
 
 
 def test_negative_limit_raises():
-    store = QuotaStore()
+    store = CoordinatorQuotaManager()
     with pytest.raises(ValueError, match="non-negative"):
         store.set("salt-a", -1)
 
 
 def test_zero_limit_accepted():
-    store = QuotaStore()
+    store = CoordinatorQuotaManager()
     store.set("salt-a", 0)
     assert store.get("salt-a") == 0
diff --git a/tests/v1/mp_coordinator/test_usage_tracker.py b/tests/v1/mp_coordinator/test_usage_manager.py
similarity index 74%
rename from tests/v1/mp_coordinator/test_usage_tracker.py
rename to tests/v1/mp_coordinator/test_usage_manager.py
index 4f00a58890..a5cee0b6db 100644
--- a/tests/v1/mp_coordinator/test_usage_tracker.py
+++ b/tests/v1/mp_coordinator/test_usage_manager.py
@@ -1,22 +1,22 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Tests for the coordinator UsageTracker."""
+"""Tests for the coordinator CoordinatorUsageManager."""
 
 # Third Party
 import pytest
 
 # First Party
-from lmcache.v1.mp_coordinator.l2.usage_tracker import UsageTracker
+from lmcache.v1.mp_coordinator.l2.usage_manager import CoordinatorUsageManager
 
 
 def test_record_stored():
-    t = UsageTracker()
+    t = CoordinatorUsageManager()
     t.record_stored("a", 100)
     assert t.get("a") == 100
     assert t.get_total() == 100
 
 
 def test_record_stored_accumulates():
-    t = UsageTracker()
+    t = CoordinatorUsageManager()
     t.record_stored("a", 100)
     t.record_stored("a", 200)
     assert t.get("a") == 300
@@ -24,7 +24,7 @@ def test_record_stored_accumulates():
 
 
 def test_record_evicted():
-    t = UsageTracker()
+    t = CoordinatorUsageManager()
     t.record_stored("a", 100)
     t.record_evicted("a", 40)
     assert t.get("a") == 60
@@ -32,7 +32,7 @@ def test_record_evicted():
 
 
 def test_evict_clamps_at_zero():
-    t = UsageTracker()
+    t = CoordinatorUsageManager()
     t.record_stored("a", 50)
     t.record_evicted("a", 100)
     assert t.get("a") == 0
@@ -40,14 +40,14 @@ def test_evict_clamps_at_zero():
 
 
 def test_evict_removes_zero_entry():
-    t = UsageTracker()
+    t = CoordinatorUsageManager()
     t.record_stored("a", 100)
     t.record_evicted("a", 100)
     assert t.get_all() == {}
 
 
 def test_multiple_salts():
-    t = UsageTracker()
+    t = CoordinatorUsageManager()
     t.record_stored("a", 100)
     t.record_stored("b", 200)
     assert t.get("a") == 100
@@ -56,36 +56,36 @@ def test_multiple_salts():
 
 
 def test_get_unknown_returns_zero():
-    t = UsageTracker()
+    t = CoordinatorUsageManager()
     assert t.get("unknown") == 0
 
 
 def test_get_all():
-    t = UsageTracker()
+    t = CoordinatorUsageManager()
     t.record_stored("a", 100)
     t.record_stored("b", 200)
     assert t.get_all() == {"a": 100, "b": 200}
 
 
 def test_get_all_empty():
-    t = UsageTracker()
+    t = CoordinatorUsageManager()
     assert t.get_all() == {}
 
 
 def test_zero_bytes_is_noop():
-    t = UsageTracker()
+    t = CoordinatorUsageManager()
     t.record_stored("a", 0)
     assert t.get("a") == 0
     assert t.get_all() == {}
 
 
 def test_negative_store_raises():
-    t = UsageTracker()
+    t = CoordinatorUsageManager()
     with pytest.raises(ValueError, match="non-negative"):
         t.record_stored("a", -1)
 
 
 def test_negative_evict_raises():
-    t = UsageTracker()
+    t = CoordinatorUsageManager()
     with pytest.raises(ValueError, match="non-negative"):
         t.record_evicted("a", -1)

From f1897fe6294b12ae77c13d3aef37ef378859b5d2 Mon Sep 17 00:00:00 2001
From: aeon-x <talexcao@gmail.com>
Date: Wed, 10 Jun 2026 19:36:32 -0700
Subject: [PATCH 34/57] change naming again

Signed-off-by: aeon-x <talexcao@gmail.com>
---
 .../mp_coordinator/l2_usage_and_eviction.md   | 18 ++++++------
 lmcache/v1/mp_coordinator/app.py              | 12 ++++----
 lmcache/v1/mp_coordinator/http_apis/l2_api.py | 18 ++++++------
 .../v1/mp_coordinator/l2/event_listener.py    |  2 +-
 .../v1/mp_coordinator/l2/eviction_manager.py  | 14 +++++-----
 lmcache/v1/mp_coordinator/l2/quota_manager.py |  2 +-
 lmcache/v1/mp_coordinator/l2/usage_manager.py |  2 +-
 lmcache/v1/multiprocess/http_server.py        |  4 +--
 .../mp_coordinator/test_eviction_manager.py   | 16 +++++------
 tests/v1/mp_coordinator/test_quota_manager.py | 22 +++++++--------
 tests/v1/mp_coordinator/test_usage_manager.py | 28 +++++++++----------
 11 files changed, 68 insertions(+), 70 deletions(-)

diff --git a/docs/design/v1/mp_coordinator/l2_usage_and_eviction.md b/docs/design/v1/mp_coordinator/l2_usage_and_eviction.md
index 1cb77af14f..ccca93bf7f 100644
--- a/docs/design/v1/mp_coordinator/l2_usage_and_eviction.md
+++ b/docs/design/v1/mp_coordinator/l2_usage_and_eviction.md
@@ -29,15 +29,15 @@ MP server (store/lookup)
   L2 adapter fires on_l2_keys_stored / on_l2_keys_accessed
         │
         ▼
-  CoordinatorL2EventListener (L2AdapterListener)
+  L2EventListener (L2AdapterListener)
     converts ObjectKey → CacheKey, buffers UsageEvents
         │  flush every l2_event_flush_interval (default 1s)
         │
         ▼
   POST /l2/events ──▶ Coordinator
-                        ├─ CoordinatorUsageManager: per-salt byte accounting
-                        ├─ CoordinatorEvictionManager: per-salt LRU
-                        └─ CoordinatorQuotaManager: per-salt byte limits
+                        ├─ L2UsageManager: per-salt byte accounting
+                        ├─ L2EvictionManager: per-salt LRU
+                        └─ L2QuotaManager: per-salt byte limits
 
   Coordinator background loop (every eviction_check_interval, default 5s)
         │
@@ -63,7 +63,7 @@ never imports ``torch``.
 
 ## Coordinator components (`l2/`)
 
-### CoordinatorUsageManager (`usage_manager.py`)
+### L2UsageManager (`usage_manager.py`)
 
 Thread-safe per-salt byte counter. Two operations:
 
@@ -72,13 +72,13 @@ Thread-safe per-salt byte counter. Two operations:
 
 Exposes ``get(salt)``, ``get_all()``, ``get_total()`` for the status endpoints.
 
-### CoordinatorQuotaManager (`quota_manager.py`)
+### L2QuotaManager (`quota_manager.py`)
 
 Thread-safe in-memory quota registry (``dict[str, int]`` + lock). CRUD via
 ``set``, ``get``, ``delete``, ``list_all``. Quotas are set in GiB at the API
 and stored as bytes internally.
 
-### CoordinatorEvictionManager (`eviction_manager.py`)
+### L2EvictionManager (`eviction_manager.py`)
 
 Per-``cache_salt`` LRU, mirroring ``IsolatedLRUEvictionPolicy`` but using
 ``CacheKey`` and running in the coordinator process.
@@ -94,7 +94,7 @@ _key_sizes      : dict[CacheKey, int]                       # byte size per key
 - ``on_lookup(key)`` — touch (move to MRU end).
 - ``on_remove(keys)`` — remove from LRU tracking after confirmed deletion.
 - ``execute_evictions()`` — for each tracked salt, compare usage (from
-  ``CoordinatorUsageManager``) against quota (from ``CoordinatorQuotaManager``, default 0). If over
+  ``L2UsageManager``) against quota (from ``L2QuotaManager``, default 0). If over
   quota, select LRU keys targeting ``eviction_ratio`` of the overage. No quota
   or zero quota means evict all keys for that salt.
 
@@ -116,7 +116,7 @@ Status responses report usage in GiB only (no raw bytes in the API).
 
 ## MP-server event listener (`event_listener.py`)
 
-``CoordinatorL2EventListener`` implements ``L2AdapterListener`` and is registered
+``L2EventListener`` implements ``L2AdapterListener`` and is registered
 on all L2 adapters via ``StorageManager.register_l2_listener()``. It:
 
 1. Receives ``on_l2_keys_stored(keys, sizes)`` and ``on_l2_keys_accessed(keys)``
diff --git a/lmcache/v1/mp_coordinator/app.py b/lmcache/v1/mp_coordinator/app.py
index 7fe1ce6d19..13b52bbc6a 100644
--- a/lmcache/v1/mp_coordinator/app.py
+++ b/lmcache/v1/mp_coordinator/app.py
@@ -30,10 +30,10 @@
 from lmcache.logging import init_logger
 from lmcache.v1.mp_coordinator.config import MPCoordinatorConfig
 from lmcache.v1.mp_coordinator.l2.eviction_manager import (
-    CoordinatorEvictionManager,
+    L2EvictionManager,
 )
-from lmcache.v1.mp_coordinator.l2.quota_manager import CoordinatorQuotaManager
-from lmcache.v1.mp_coordinator.l2.usage_manager import CoordinatorUsageManager
+from lmcache.v1.mp_coordinator.l2.quota_manager import L2QuotaManager
+from lmcache.v1.mp_coordinator.l2.usage_manager import L2UsageManager
 from lmcache.v1.mp_coordinator.registry import InstanceRegistry
 from lmcache.v1.utils.router_discovery import discover_api_routers
 
@@ -70,9 +70,9 @@ def create_app(config: MPCoordinatorConfig) -> FastAPI:
         ``usage_manager``); all ``http_apis`` routers are registered.
     """
     registry = InstanceRegistry()
-    quota_manager = CoordinatorQuotaManager()
-    usage_manager = CoordinatorUsageManager()
-    eviction_manager = CoordinatorEvictionManager(
+    quota_manager = L2QuotaManager()
+    usage_manager = L2UsageManager()
+    eviction_manager = L2EvictionManager(
         quota_manager=quota_manager,
         usage_manager=usage_manager,
         eviction_ratio=config.eviction_ratio,
diff --git a/lmcache/v1/mp_coordinator/http_apis/l2_api.py b/lmcache/v1/mp_coordinator/http_apis/l2_api.py
index b7bfc8a747..d12d775e6f 100644
--- a/lmcache/v1/mp_coordinator/http_apis/l2_api.py
+++ b/lmcache/v1/mp_coordinator/http_apis/l2_api.py
@@ -11,10 +11,10 @@
 
 # First Party
 from lmcache.v1.mp_coordinator.l2.eviction_manager import (
-    CoordinatorEvictionManager,
+    L2EvictionManager,
 )
-from lmcache.v1.mp_coordinator.l2.quota_manager import CoordinatorQuotaManager
-from lmcache.v1.mp_coordinator.l2.usage_manager import CoordinatorUsageManager
+from lmcache.v1.mp_coordinator.l2.quota_manager import L2QuotaManager
+from lmcache.v1.mp_coordinator.l2.usage_manager import L2UsageManager
 from lmcache.v1.mp_coordinator.schemas import (
     EventType,
     L2StatusListResponse,
@@ -35,14 +35,14 @@ def _gb(n_bytes: int) -> float:
     return n_bytes / _GB
 
 
-def _quota_manager(request: Request) -> CoordinatorQuotaManager:
+def _quota_manager(request: Request) -> L2QuotaManager:
     """Return the shared quota manager from app state.
 
     Args:
         request: The incoming request.
 
     Returns:
-        The shared :class:`CoordinatorQuotaManager`.
+        The shared :class:`L2QuotaManager`.
 
     Raises:
         RuntimeError: If the manager is not initialized.
@@ -53,14 +53,14 @@ def _quota_manager(request: Request) -> CoordinatorQuotaManager:
     return mgr
 
 
-def _usage_manager(request: Request) -> CoordinatorUsageManager:
+def _usage_manager(request: Request) -> L2UsageManager:
     """Return the shared usage manager from app state.
 
     Args:
         request: The incoming request.
 
     Returns:
-        The shared :class:`CoordinatorUsageManager`.
+        The shared :class:`L2UsageManager`.
 
     Raises:
         RuntimeError: If the manager is not initialized.
@@ -71,14 +71,14 @@ def _usage_manager(request: Request) -> CoordinatorUsageManager:
     return mgr
 
 
-def _eviction_manager(request: Request) -> CoordinatorEvictionManager:
+def _eviction_manager(request: Request) -> L2EvictionManager:
     """Return the shared eviction manager from app state.
 
     Args:
         request: The incoming request.
 
     Returns:
-        The shared :class:`CoordinatorEvictionManager`.
+        The shared :class:`L2EvictionManager`.
 
     Raises:
         RuntimeError: If the manager is not initialized.
diff --git a/lmcache/v1/mp_coordinator/l2/event_listener.py b/lmcache/v1/mp_coordinator/l2/event_listener.py
index 1518a16ea8..75285c55e0 100644
--- a/lmcache/v1/mp_coordinator/l2/event_listener.py
+++ b/lmcache/v1/mp_coordinator/l2/event_listener.py
@@ -51,7 +51,7 @@ def _object_key_to_cache_key(obj: ObjectKey) -> CacheKey:
     )
 
 
-class CoordinatorL2EventListener(L2AdapterListener):
+class L2EventListener(L2AdapterListener):
     """L2 adapter listener that batches events and flushes to the coordinator.
 
     Register as a listener on the L2 adapter via
diff --git a/lmcache/v1/mp_coordinator/l2/eviction_manager.py b/lmcache/v1/mp_coordinator/l2/eviction_manager.py
index cbba14cc1b..a7b8559994 100644
--- a/lmcache/v1/mp_coordinator/l2/eviction_manager.py
+++ b/lmcache/v1/mp_coordinator/l2/eviction_manager.py
@@ -8,8 +8,8 @@
 ``torch``).
 
 The manager periodically checks per-salt usage
-(from :class:`CoordinatorUsageManager`) against limits
-(from :class:`CoordinatorQuotaManager`).
+(from :class:`L2UsageManager`) against limits
+(from :class:`L2QuotaManager`).
 When a salt exceeds its quota, it selects LRU victims and **logs**
 them — actual deletion is not implemented yet.
 """
@@ -23,14 +23,14 @@
 
 # First Party
 from lmcache.logging import init_logger
-from lmcache.v1.mp_coordinator.l2.quota_manager import CoordinatorQuotaManager
-from lmcache.v1.mp_coordinator.l2.usage_manager import CoordinatorUsageManager
+from lmcache.v1.mp_coordinator.l2.quota_manager import L2QuotaManager
+from lmcache.v1.mp_coordinator.l2.usage_manager import L2UsageManager
 from lmcache.v1.mp_coordinator.schemas import CacheKey
 
 logger = init_logger(__name__)
 
 
-class CoordinatorEvictionManager:
+class L2EvictionManager:
     """Per-``cache_salt`` LRU eviction manager for the coordinator.
 
     Maintains one ``OrderedDict`` per ``cache_salt``, ordered from
@@ -48,8 +48,8 @@ class CoordinatorEvictionManager:
 
     def __init__(
         self,
-        quota_manager: CoordinatorQuotaManager,
-        usage_manager: CoordinatorUsageManager,
+        quota_manager: L2QuotaManager,
+        usage_manager: L2UsageManager,
         eviction_ratio: float = 0.5,
     ) -> None:
         self._lock = threading.Lock()
diff --git a/lmcache/v1/mp_coordinator/l2/quota_manager.py b/lmcache/v1/mp_coordinator/l2/quota_manager.py
index 102deaccc9..c85962e2b1 100644
--- a/lmcache/v1/mp_coordinator/l2/quota_manager.py
+++ b/lmcache/v1/mp_coordinator/l2/quota_manager.py
@@ -29,7 +29,7 @@ class QuotaEntry:
     limit_bytes: int
 
 
-class CoordinatorQuotaManager:
+class L2QuotaManager:
     """Thread-safe in-memory registry of byte quotas keyed by ``cache_salt``.
 
     All public methods acquire an internal lock so the store stays
diff --git a/lmcache/v1/mp_coordinator/l2/usage_manager.py b/lmcache/v1/mp_coordinator/l2/usage_manager.py
index 426275a763..793fd2f4c0 100644
--- a/lmcache/v1/mp_coordinator/l2/usage_manager.py
+++ b/lmcache/v1/mp_coordinator/l2/usage_manager.py
@@ -19,7 +19,7 @@
 logger = init_logger(__name__)
 
 
-class CoordinatorUsageManager:
+class L2UsageManager:
     """Thread-safe in-memory manager of L2 byte usage per ``cache_salt``.
 
     MP servers report ``store`` events. The coordinator calls
diff --git a/lmcache/v1/multiprocess/http_server.py b/lmcache/v1/multiprocess/http_server.py
index 1c606aad7d..f4b71fbd0a 100644
--- a/lmcache/v1/multiprocess/http_server.py
+++ b/lmcache/v1/multiprocess/http_server.py
@@ -18,7 +18,7 @@
     add_storage_manager_args,
     parse_args_to_config,
 )
-from lmcache.v1.mp_coordinator.l2.event_listener import CoordinatorL2EventListener
+from lmcache.v1.mp_coordinator.l2.event_listener import L2EventListener
 from lmcache.v1.mp_coordinator.registrar import keep_registered
 from lmcache.v1.mp_observability.config import (
     ObservabilityConfig,
@@ -143,7 +143,7 @@ async def lifespan(app: FastAPI):
         and coordinator_config.url
         and coordinator_config.l2_event_reporting
     ):
-        coordinator_l2_event_client = CoordinatorL2EventListener(
+        coordinator_l2_event_client = L2EventListener(
             coordinator_client,
             coordinator_config.url,
             flush_interval=coordinator_config.l2_event_flush_interval,
diff --git a/tests/v1/mp_coordinator/test_eviction_manager.py b/tests/v1/mp_coordinator/test_eviction_manager.py
index ff815a3502..811ea0dac0 100644
--- a/tests/v1/mp_coordinator/test_eviction_manager.py
+++ b/tests/v1/mp_coordinator/test_eviction_manager.py
@@ -3,10 +3,10 @@
 
 # First Party
 from lmcache.v1.mp_coordinator.l2.eviction_manager import (
-    CoordinatorEvictionManager,
+    L2EvictionManager,
 )
-from lmcache.v1.mp_coordinator.l2.quota_manager import CoordinatorQuotaManager
-from lmcache.v1.mp_coordinator.l2.usage_manager import CoordinatorUsageManager
+from lmcache.v1.mp_coordinator.l2.quota_manager import L2QuotaManager
+from lmcache.v1.mp_coordinator.l2.usage_manager import L2UsageManager
 from lmcache.v1.mp_coordinator.schemas import CacheKey
 
 
@@ -16,12 +16,10 @@ def _make_key(salt: str, model: str = "m", rank: int = 0, h: str = "aa") -> Cach
 
 def _setup(
     eviction_ratio: float = 0.5,
-) -> tuple[
-    CoordinatorEvictionManager, CoordinatorQuotaManager, CoordinatorUsageManager
-]:
-    qs = CoordinatorQuotaManager()
-    ut = CoordinatorUsageManager()
-    ctrl = CoordinatorEvictionManager(qs, ut, eviction_ratio=eviction_ratio)
+) -> tuple[L2EvictionManager, L2QuotaManager, L2UsageManager]:
+    qs = L2QuotaManager()
+    ut = L2UsageManager()
+    ctrl = L2EvictionManager(qs, ut, eviction_ratio=eviction_ratio)
     return ctrl, qs, ut
 
 
diff --git a/tests/v1/mp_coordinator/test_quota_manager.py b/tests/v1/mp_coordinator/test_quota_manager.py
index 295cf26233..7ccf110378 100644
--- a/tests/v1/mp_coordinator/test_quota_manager.py
+++ b/tests/v1/mp_coordinator/test_quota_manager.py
@@ -1,45 +1,45 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Tests for the coordinator CoordinatorQuotaManager."""
+"""Tests for the coordinator L2QuotaManager."""
 
 # Third Party
 import pytest
 
 # First Party
-from lmcache.v1.mp_coordinator.l2.quota_manager import CoordinatorQuotaManager
+from lmcache.v1.mp_coordinator.l2.quota_manager import L2QuotaManager
 
 
 def test_set_and_get():
-    store = CoordinatorQuotaManager()
+    store = L2QuotaManager()
     store.set("salt-a", 1000)
     assert store.get("salt-a") == 1000
 
 
 def test_get_unregistered_returns_none():
-    store = CoordinatorQuotaManager()
+    store = L2QuotaManager()
     assert store.get("unknown") is None
 
 
 def test_set_overwrites():
-    store = CoordinatorQuotaManager()
+    store = L2QuotaManager()
     store.set("salt-a", 1000)
     store.set("salt-a", 2000)
     assert store.get("salt-a") == 2000
 
 
 def test_delete():
-    store = CoordinatorQuotaManager()
+    store = L2QuotaManager()
     store.set("salt-a", 1000)
     assert store.delete("salt-a") is True
     assert store.get("salt-a") is None
 
 
 def test_delete_nonexistent():
-    store = CoordinatorQuotaManager()
+    store = L2QuotaManager()
     assert store.delete("unknown") is False
 
 
 def test_list_all():
-    store = CoordinatorQuotaManager()
+    store = L2QuotaManager()
     store.set("a", 100)
     store.set("b", 200)
     entries = store.list_all()
@@ -48,17 +48,17 @@ def test_list_all():
 
 
 def test_list_all_empty():
-    store = CoordinatorQuotaManager()
+    store = L2QuotaManager()
     assert store.list_all() == []
 
 
 def test_negative_limit_raises():
-    store = CoordinatorQuotaManager()
+    store = L2QuotaManager()
     with pytest.raises(ValueError, match="non-negative"):
         store.set("salt-a", -1)
 
 
 def test_zero_limit_accepted():
-    store = CoordinatorQuotaManager()
+    store = L2QuotaManager()
     store.set("salt-a", 0)
     assert store.get("salt-a") == 0
diff --git a/tests/v1/mp_coordinator/test_usage_manager.py b/tests/v1/mp_coordinator/test_usage_manager.py
index a5cee0b6db..05d4bd860f 100644
--- a/tests/v1/mp_coordinator/test_usage_manager.py
+++ b/tests/v1/mp_coordinator/test_usage_manager.py
@@ -1,22 +1,22 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Tests for the coordinator CoordinatorUsageManager."""
+"""Tests for the coordinator L2UsageManager."""
 
 # Third Party
 import pytest
 
 # First Party
-from lmcache.v1.mp_coordinator.l2.usage_manager import CoordinatorUsageManager
+from lmcache.v1.mp_coordinator.l2.usage_manager import L2UsageManager
 
 
 def test_record_stored():
-    t = CoordinatorUsageManager()
+    t = L2UsageManager()
     t.record_stored("a", 100)
     assert t.get("a") == 100
     assert t.get_total() == 100
 
 
 def test_record_stored_accumulates():
-    t = CoordinatorUsageManager()
+    t = L2UsageManager()
     t.record_stored("a", 100)
     t.record_stored("a", 200)
     assert t.get("a") == 300
@@ -24,7 +24,7 @@ def test_record_stored_accumulates():
 
 
 def test_record_evicted():
-    t = CoordinatorUsageManager()
+    t = L2UsageManager()
     t.record_stored("a", 100)
     t.record_evicted("a", 40)
     assert t.get("a") == 60
@@ -32,7 +32,7 @@ def test_record_evicted():
 
 
 def test_evict_clamps_at_zero():
-    t = CoordinatorUsageManager()
+    t = L2UsageManager()
     t.record_stored("a", 50)
     t.record_evicted("a", 100)
     assert t.get("a") == 0
@@ -40,14 +40,14 @@ def test_evict_clamps_at_zero():
 
 
 def test_evict_removes_zero_entry():
-    t = CoordinatorUsageManager()
+    t = L2UsageManager()
     t.record_stored("a", 100)
     t.record_evicted("a", 100)
     assert t.get_all() == {}
 
 
 def test_multiple_salts():
-    t = CoordinatorUsageManager()
+    t = L2UsageManager()
     t.record_stored("a", 100)
     t.record_stored("b", 200)
     assert t.get("a") == 100
@@ -56,36 +56,36 @@ def test_multiple_salts():
 
 
 def test_get_unknown_returns_zero():
-    t = CoordinatorUsageManager()
+    t = L2UsageManager()
     assert t.get("unknown") == 0
 
 
 def test_get_all():
-    t = CoordinatorUsageManager()
+    t = L2UsageManager()
     t.record_stored("a", 100)
     t.record_stored("b", 200)
     assert t.get_all() == {"a": 100, "b": 200}
 
 
 def test_get_all_empty():
-    t = CoordinatorUsageManager()
+    t = L2UsageManager()
     assert t.get_all() == {}
 
 
 def test_zero_bytes_is_noop():
-    t = CoordinatorUsageManager()
+    t = L2UsageManager()
     t.record_stored("a", 0)
     assert t.get("a") == 0
     assert t.get_all() == {}
 
 
 def test_negative_store_raises():
-    t = CoordinatorUsageManager()
+    t = L2UsageManager()
     with pytest.raises(ValueError, match="non-negative"):
         t.record_stored("a", -1)
 
 
 def test_negative_evict_raises():
-    t = CoordinatorUsageManager()
+    t = L2UsageManager()
     with pytest.raises(ValueError, match="non-negative"):
         t.record_evicted("a", -1)

From 113043def41dd7236a514bc7b71a80007b907610 Mon Sep 17 00:00:00 2001
From: deng451e <57919305+deng451e@users.noreply.github.com>
Date: Wed, 10 Jun 2026 19:39:44 -0700
Subject: [PATCH 35/57] [observability] blend server trace sub-spans + V3
 hit-rate breakdown (#3607)

Signed-off-by: deng451e <838677410@qq.com>
---
 .../blend_v3_observability.md                 | 207 ++++++++++++++
 examples/observability/README.md              |  60 +++-
 .../provisioning/dashboards/lmcache.json      | 247 +++++++++++++++-
 examples/observability/tempo.yml              |  23 ++
 lmcache/v1/mp_observability/event.py          |  16 ++
 .../subscribers/metrics/cb_server.py          |  14 +
 .../subscribers/tracing/cb_server.py          |  47 ++-
 lmcache/v1/multiprocess/modules/blend_v3.py   | 270 ++++++++++++++++--
 .../subscribers/tracing/test_cb_server.py     | 170 +++++++++++
 9 files changed, 1011 insertions(+), 43 deletions(-)
 create mode 100644 docs/design/v1/mp_observability/blend_v3_observability.md

diff --git a/docs/design/v1/mp_observability/blend_v3_observability.md b/docs/design/v1/mp_observability/blend_v3_observability.md
new file mode 100644
index 0000000000..66336293db
--- /dev/null
+++ b/docs/design/v1/mp_observability/blend_v3_observability.md
@@ -0,0 +1,207 @@
+# CacheBlend V3 Observability — Design
+
+**Status:** Proposal · **Scope:** unify CB V3 tracing across the vLLM plugin
+(scheduler + worker) and the LMCache blend server into one distributed trace,
+plus the metrics each side exposes.
+
+## 1. Goal
+
+A single CB request touches **three processes**:
+
+```
+vLLM scheduler ──CB_UNIFIED_LOOKUP──▶ LMCache blend server
+vLLM worker    ──CB_RETRIEVE_V3─────▶ LMCache blend server
+vLLM worker    ── model forward (FULL_RECOMP → CHECK → PARTIAL)
+```
+
+Today these are observed by **two disjoint systems**:
+
+| side | mechanism | output |
+|---|---|---|
+| LMCache blend server | EventBus → `BlendTracingSubscriber` (`subscribers/tracing/cb_server.py`) | **OTel spans** (`cb.request` + children), OTLP export |
+| vLLM plugin (connector/shim/attn) | `_cb_span` / `_cb_stats_emit` (`lmcache_cacheblend/connector.py`) | **ad-hoc JSONL** (`CB_PROFILE=1`) |
+
+They never share a trace: you cannot see, in one view, that a slow request's
+time went into the server-side L2 load vs the worker-side scatter vs the PARTIAL
+forward. **Goal: one `cb.request` trace spanning all three processes**, with
+sub-spans owned by whichever process did the work, plus aligned metrics.
+
+## 2. The unified trace model
+
+One trace per `request_id`. Process owner in brackets; cross-process children
+linked by trace-context propagation (§5).
+
+```
+cb.request                         [scheduler — root]  request_id, model, world_size, n_prompt_tokens
+│
+├─ cb.schedule                     [scheduler]  the get_num_new_matched_tokens defer loop
+│  ├─ cb.lookup.rpc                [scheduler]  CB_UNIFIED_LOOKUP incl. N poll re-issues; attr: n_polls
+│  │  └─ cb.lookup                 [SERVER]  ← cross-process child; attr prefix_chunks
+│  │     ├─ cb.fingerprint_match   [server]  n_probes, table_hits, matches (token-stride=1, any offset)
+│  │     │  (no cb.prefix_lookup span — prefix is traced by mp.lookup_prefetch)
+│  │     ├─ cb.sparse_prefetch     [server]  n_keys, l1_hits, l2_misses
+│  │     │  └─ cb.l2_load          [server·IO]  chunks, bytes, ms        (coalesced L2→L1)
+│  │     └─ cb.classify            [server]  found, stale, per_rank_ok
+│  │        ↳ on end: stamp hit_rate / prefix_coverage_tokens / n_non_prefix_tokens on cb.request
+│  └─ cb.build_meta                [scheduler]  broadcast metadata to workers
+│
+└─ cb.execute                      [worker]
+   ├─ cb.start_load_kv             [worker]  submit + stream-wait (may fire 2×: partial→full block alloc)
+   │  └─ cb.retrieve               [SERVER]  ← cross-process child
+   │     └─ cb.scatter             [server·GPU]  scattered_tokens, n_prefix, n_shifted (re-RoPE'd), dropped
+   │        (re-RoPE folded in — interleaved per-batch, not a separate span)
+   └─ cb.model_forward             [worker]  the sliced forward
+      ├─ cb.full_recomp            [worker·GPU]  layers 0..cl-1
+      ├─ cb.check                  [worker·GPU]  layer cl; imp_count, recomp_ratio
+      └─ cb.partial                [worker·GPU]  layers cl+1..L; dispatch=flex|unified, imp_empty
+```
+
+This is the contract both sides implement against. The server owns `cb.lookup`
+and `cb.retrieve` subtrees; the plugin owns everything else.
+
+### 2.1 V3 reuse is token-granular (#3582)
+
+As of #3582, CB matches and scatters at **token** granularity, not vLLM-block /
+chunk granularity — which is what the spans/attrs below must reflect:
+
+- **Matching** runs at `probe_stride=1`, so the shared body is found at *any*
+  token offset (`cb_unified_lookup` no longer filters non-prefix matches to a
+  chunk-aligned `cur_st`). `cb.fingerprint_match` reports token-offset matches,
+  not aligned chunks.
+- **Scatter** writes per-token via `multi_layer_kv_transfer` with
+  `slot_mapping = block_id[pos // bs] * bs + pos % bs`. The reused token range is
+  written slot-by-slot, so a **partial vLLM block** holding both matched and
+  recomputed tokens is written correctly with **no block-alignment trim on the
+  write** — the old whole-block scatter path and block-aligned drop checks are
+  gone. `cb.scatter`'s unit is `scattered_tokens` / `slot_writes`.
+- **L2 storage stays chunk-granular** (256-token chunks): a non-block-aligned
+  match still fetches whole chunks (`cb.l2_load` = chunks/bytes), then
+  `cb.scatter` writes only the matched token sub-range. So `cb.l2_load` is
+  measured in chunks while `cb.scatter` is measured in tokens/slots.
+- **`cb.start_load_kv` may fire twice** (vLLM allocates the request's blocks
+  partial-then-full). The first pass writes only slots inside the
+  already-allocated block table — a slot-bound guard (`cur_ed > num_slots`),
+  *not* a block-alignment trim; the second writes the rest. Expect two
+  `cb.retrieve` children, the first with `scattered_tokens` < total.
+
+## 3. LMCache-server side — what to expose
+
+The server already emits `cb.request` / `cb.lookup` / `cb.retrieve` via the
+EventBus. Two changes:
+
+**(a) Finer V3 events for the lookup/retrieve subtrees.** V3 currently emits only
+`CB_LOOKUP_START/END` and `CB_RETRIEVE_START/END` — too coarse for the subtree in
+§2. Add paired events (CPU-sync for compute, `publish_on_stream` for GPU ops so
+timing is GPU-accurate):
+
+| new event pair | span | timing source |
+|---|---|---|
+| `CB_FINGERPRINT_MATCH_*` | `cb.fingerprint_match` | CPU |
+| (prefix lookup) | `mp.lookup_prefetch` (reused; `prefix_chunks` attr on `cb.lookup`) | CPU |
+| `CB_SPARSE_PREFETCH_*` | `cb.sparse_prefetch` (+ existing L2 prefetch span as `cb.l2_load`) | CPU + IO |
+| `CB_SCATTER_*` | `cb.scatter` (re-RoPE folded in via `n_shifted`) | `publish_on_stream` (GPU) |
+
+`BlendTracingSubscriber.SPAN_DEFS` gains the matching entries; all nest under
+`cb.lookup` / `cb.retrieve` via the existing `SpanRegistry`.
+
+**(b) Simplify the deferral logic for the V3 model.** The current root-close
+deferral (`_waiting_for_store_final`, the `STORE_FINAL_SUBMITTED` bridge) is V2-only
+and **inert under V3** (V3 never emits those). Under V3 the request ends at
+`CB_RETRIEVE_END` (no async store-final after inference). Gate the `cb.request`
+close on `_pending_gpu_ops[sid] == 0` only, and drop the V2 store-final bridge
+from the V3 path. (The V2-only event handlers stay for `blend_legacy`.)
+
+**Span attributes (server):** `request_id`, `prefix_coverage_tokens`,
+`fingerprint_hits`, `storage_hits`, `stale_chunks`, `hit_tokens`,
+`requested_tokens`, `hit_rate`, `prefix_hit_tokens`, `non_prefix_hit_tokens`,
+`scatter_ms`, `scattered_tokens`, `slot_writes`, `partial_blocks`,
+`n_shifted_tokens`, `n_prefix_tokens` (token-granular per §2.1 — not chunks).
+
+**V3 hit rate.** `hit_rate = hit_tokens / requested_tokens`, where the numerator
+counts **both reuse paths**: `hit_tokens = prefix_hit_tokens +
+non_prefix_hit_tokens`. The two ranges are disjoint (the non-prefix complement
+is `cur_st >= prefix coverage`), so they sum without double-counting. Both
+components are also recorded individually on `cb.request` so a dashboard can
+split prefix-reuse vs re-RoPE'd non-prefix reuse.
+
+**Metrics (already present, keep):** the `lmcache_blend.*` counters
+(`lookup_requests`, `lookup_hit_tokens`, `lookup_storage_hits`,
+`lookup_stale_chunks`, `retrieve_requests`, `retrieve_failures`,
+`chunks_evicted`, …). Note the V2-only store counters won't populate under V3 —
+document, or recompute the "stored" notion from the unified path.
+
+## 4. vLLM-plugin side — what to expose
+
+The plugin's `_cb_span` spans (`sched.gnnmt`, `gnnmt.cb_unified_submit/poll`,
+`sched.build_meta`, `slk.*`, `shim.wrapper.{prepare,fwd}`, `flex.*`,
+`cb_admission_check`) already cover the §2 plugin subtree — but as **JSONL, not
+OTel**. Make `_cb_span` dual-mode:
+
+- when an OTel tracer is available → emit an **OTel span** (start/end, attributes);
+- always (under `CB_PROFILE`) → keep the JSONL line (cheap local profiling).
+
+Mapping plugin span → unified name: `sched.gnnmt`→`cb.schedule`,
+`gnnmt.cb_unified_*`→`cb.lookup.rpc`, `sched.build_meta`→`cb.build_meta`,
+`slk.*`→`cb.start_load_kv`, `shim.wrapper.fwd`→`cb.model_forward`,
+`flex.*` + the layer hooks → `cb.full_recomp`/`cb.check`/`cb.partial`.
+
+**Tracer source.** vLLM has its own OTel (`--otlp-traces-endpoint`) and creates a
+per-request span. Prefer to **reuse vLLM's tracer** so CB spans nest under vLLM's
+request span intra-process; if vLLM tracing is off, the plugin owns a tracer
+pointed at the same OTLP endpoint as the LMCache server. Gate on a single
+`CB_TRACING=1` (or reuse `--enable-tracing` semantics) so it's off by default.
+
+## 5. Unification — linking the three processes
+
+The blocker (from the surface map): **the RPC envelope carries no trace-context**
+— `IPCCacheEngineKey` and `CBUnifiedLookupResult` have no `traceparent` field. Two
+ways to bridge:
+
+**Option A — propagate W3C trace-context through the RPC (recommended).**
+Add an optional `trace_context: str | None` (W3C `traceparent`) to the CB RPC
+payloads (the lookup key + the retrieve args). The scheduler/worker **inject**
+the current span's context; the server **extracts** it and starts `cb.lookup` /
+`cb.retrieve` as remote children of it. Result: a *true* parent→child distributed
+trace across processes. Cost: one optional protocol field (backward-compatible —
+`None` when tracing off), an `inject`/`extract` at the two RPC boundaries.
+
+**Option B — deterministic trace-id from `request_id` (zero protocol change).**
+Both sides derive a 128-bit trace-id `= hash(request_id)` and tag every span with
+it (+ `request_id` attribute). Backends group by trace-id, so the spans land in
+one trace — but there are **no cross-process parent links** (sibling spans, not
+nested). Use if the protocol field is undesirable short-term.
+
+**Recommendation:** Option A. The field is tiny, optional, and gives real
+parent/child causality (e.g. "the 90 ms gnnmt was 50 ms server L2-load + 40 ms
+poll-wait"). Keep `request_id` as a span attribute regardless, so Option B is a
+trivial fallback. The `SpanRegistry` already handles intra-process nesting on
+each side; Option A only adds the *cross*-process edge.
+
+## 6. Phasing
+
+1. **Plugin OTel** — make `_cb_span` dual-mode (OTel + JSONL); reuse vLLM's tracer;
+   gate with `CB_TRACING`. (plugin repo)
+2. **V3 server sub-spans** — add the §3(a) events + `SPAN_DEFS`; simplify the
+   §3(b) V3 deferral. (LMCache) — **DONE**: `cb.fingerprint_match` /
+   `cb.sparse_prefetch` nest under `cb.lookup` (prefix lookup reuses
+   `mp.lookup_prefetch`; `prefix_chunks` is a `cb.lookup` attr); `cb.scatter`
+   (re-RoPE folded) nests under `cb.retrieve`; `hit_rate` = prefix + non-prefix.
+   `cb.l2_load` GB/s is already covered by the existing `L2ThroughputSubscriber`
+   (`L2_LOAD_TASK_*`), correlated by request; nesting that span under
+   `cb.sparse_prefetch` is a cross-subsystem follow-up.
+3. **Cross-process link** — add the optional `trace_context` RPC field; inject on
+   the plugin side, extract on the server side. (both repos, in lockstep)
+4. **Dashboards** — one trace view + the `lmcache_blend.*` / plugin latency metrics
+   aligned on `request_id`.
+
+Each phase is independently useful (1 and 2 give per-process traces; 3 unifies).
+
+## 7. Open questions
+
+- Reuse vLLM's tracer/provider, or a CB-owned one? (affects nesting under vLLM's
+  request span vs a standalone `cb.request` root)
+- Is the protocol field (Option A) acceptable for upstream, or start with the
+  deterministic-trace-id fallback (Option B)?
+- Sampling: per-request tracing is expensive at scale — head sampling at the
+  scheduler (propagated via the same `trace_context`) so a sampled-out request is
+  cheap on all three processes.
diff --git a/examples/observability/README.md b/examples/observability/README.md
index fe195aca98..f422b37e49 100644
--- a/examples/observability/README.md
+++ b/examples/observability/README.md
@@ -82,14 +82,70 @@ request  [═══════════════════════
 Store-only requests (no lookup phase) do not carry these attributes.
 
 The pre-provisioned **LMCache** dashboard under **Dashboards** shows cache hit
-rate, StorageManager read/write rates, and the live trace panel.
+rate, StorageManager read/write rates, and the live trace panel. The collapsed
+**CacheBlend** row adds blend-server panels (see below).
+
+## CacheBlend (blend server) traces
+
+When LMCache runs the **blend** engine (`lmcache server --engine-type blend`),
+CacheBlend V3 emits its own span tree to Tempo alongside the standard spans.
+Expand the collapsed **CacheBlend** row on the dashboard, or query Tempo:
+
+```
+# All CacheBlend request traces
+{ name = "cb.request" }
+
+# Requests that actually blended non-prefix (shifted) KV
+{ name = "cb.request" && span.non_prefix_hit_tokens > 0 }
+
+# The token-scatter GPU step
+{ name = "cb.scatter" }
+```
+
+Click a `cb.request` row to open the waterfall:
+
+```
+cb.request
+  cb.lookup                (attr prefix_chunks; prefix timing is in mp.lookup_prefetch)
+    cb.fingerprint_match   match probe hashes vs stored fingerprints
+    cb.sparse_prefetch     non-prefix (shifted) chunks, sparse L2->L1
+                           (emitted only on an actual L2 load; carries l2_keys)
+  cb.retrieve
+    cb.scatter             L1 -> paged KV per-token slot-scatter + re-RoPE
+  cb.store_pre_computed
+  cb.store_final
+```
+
+The root `cb.request` span carries the V3 hit-rate breakdown
+(`hit_rate = prefix + non-prefix`):
+
+| Attribute | Type | Description |
+|-----------|------|-------------|
+| `prefix_hit_tokens` | int | tokens reused from the prefix (L1+L2) |
+| `non_prefix_hit_tokens` | int | tokens reused from sparse non-prefix chunks |
+| `hit_tokens` | int | `prefix_hit_tokens + non_prefix_hit_tokens` |
+| `requested_tokens` | int | total chunk-aligned tokens submitted |
+| `hit_rate` | float | `hit_tokens / requested_tokens` |
+| `prefix_hit_rate` | float | `prefix_hit_tokens / requested_tokens` |
+| `non_prefix_hit_rate` | float | `non_prefix_hit_tokens / requested_tokens` (sums to `hit_rate`) |
+
+The **CacheBlend Hit Rate & Chunks** panel overlays the overall token hit rate
+(Prometheus) with the per-request prefix/non-prefix breakdown via
+[TraceQL metrics](https://grafana.com/docs/tempo/latest/metrics-from-traces/),
+served by Tempo's `local-blocks` metrics generator (enabled in `tempo.yml`):
+
+```
+# prefix vs non-prefix hit rate over time
+{ name = "cb.request" } | avg_over_time(span.prefix_hit_rate)
+{ name = "cb.request" } | avg_over_time(span.non_prefix_hit_rate)
+```
 
 ## Files
 
 ```
 docker-compose.yml          — 4-service stack (collector, tempo, prometheus, grafana)
 otel-collector.yml          — OTLP receiver → Tempo + Prometheus fan-out
-tempo.yml                   — local trace storage
+tempo.yml                   — local trace storage + local-blocks TraceQL metrics
 prometheus.yml              — scrapes lmcache metrics from collector
 grafana/provisioning/       — auto-provisioned datasources + dashboard
 start-server.sh             — launches LMCache server + vLLM with OTLP enabled
diff --git a/examples/observability/grafana/provisioning/dashboards/lmcache.json b/examples/observability/grafana/provisioning/dashboards/lmcache.json
index 0178be2b34..c77db8d068 100644
--- a/examples/observability/grafana/provisioning/dashboards/lmcache.json
+++ b/examples/observability/grafana/provisioning/dashboards/lmcache.json
@@ -2348,7 +2348,7 @@
             "type": "prometheus",
             "uid": "prometheus"
           },
-          "description": "CacheBlend token-level hit rate and chunk throughput for store and retrieve paths.",
+          "description": "Overall blend token hit rate (left, %) and chunk throughput (right, ops/s). Prometheus.",
           "fieldConfig": {
             "defaults": {
               "color": {
@@ -2389,6 +2389,8 @@
                 }
               },
               "mappings": [],
+              "min": 0,
+              "unit": "percentunit",
               "thresholds": {
                 "mode": "absolute",
                 "steps": [
@@ -2397,14 +2399,93 @@
                     "value": null
                   }
                 ]
-              },
-              "unit": "percentunit"
+              }
             },
             "overrides": [
               {
                 "matcher": {
-                  "id": "byRegexp",
-                  "options": "/chunks/"
+                  "id": "byFrameRefID",
+                  "options": "B"
+                },
+                "properties": [
+                  {
+                    "id": "unit",
+                    "value": "ops"
+                  },
+                  {
+                    "id": "custom.axisPlacement",
+                    "value": "right"
+                  },
+                  {
+                    "id": "custom.axisLabel",
+                    "value": "chunks/s"
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byFrameRefID",
+                  "options": "C"
+                },
+                "properties": [
+                  {
+                    "id": "unit",
+                    "value": "ops"
+                  },
+                  {
+                    "id": "custom.axisPlacement",
+                    "value": "right"
+                  },
+                  {
+                    "id": "custom.axisLabel",
+                    "value": "chunks/s"
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byFrameRefID",
+                  "options": "D"
+                },
+                "properties": [
+                  {
+                    "id": "unit",
+                    "value": "ops"
+                  },
+                  {
+                    "id": "custom.axisPlacement",
+                    "value": "right"
+                  },
+                  {
+                    "id": "custom.axisLabel",
+                    "value": "chunks/s"
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byFrameRefID",
+                  "options": "E"
+                },
+                "properties": [
+                  {
+                    "id": "unit",
+                    "value": "ops"
+                  },
+                  {
+                    "id": "custom.axisPlacement",
+                    "value": "right"
+                  },
+                  {
+                    "id": "custom.axisLabel",
+                    "value": "chunks/s"
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byFrameRefID",
+                  "options": "F"
                 },
                 "properties": [
                   {
@@ -2414,6 +2495,10 @@
                   {
                     "id": "custom.axisPlacement",
                     "value": "right"
+                  },
+                  {
+                    "id": "custom.axisLabel",
+                    "value": "chunks/s"
                   }
                 ]
               }
@@ -2482,6 +2567,154 @@
           ],
           "title": "CacheBlend Hit Rate & Chunks",
           "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "description": "Prefix vs non-prefix token hit rate (each / requested tokens; they sum to the overall hit rate). Prometheus.",
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "custom": {
+                "axisBorderShow": false,
+                "axisCenteredZero": false,
+                "axisColorMode": "text",
+                "axisLabel": "",
+                "axisPlacement": "auto",
+                "barAlignment": 0,
+                "barWidthFactor": 0.6,
+                "drawStyle": "line",
+                "fillOpacity": 10,
+                "gradientMode": "none",
+                "hideFrom": {
+                  "legend": false,
+                  "tooltip": false,
+                  "viz": false
+                },
+                "insertNulls": false,
+                "lineInterpolation": "linear",
+                "lineWidth": 2,
+                "pointSize": 5,
+                "scaleDistribution": {
+                  "type": "linear"
+                },
+                "showPoints": "auto",
+                "showValues": false,
+                "spanNulls": false,
+                "stacking": {
+                  "group": "A",
+                  "mode": "none"
+                },
+                "thresholdsStyle": {
+                  "mode": "off"
+                }
+              },
+              "mappings": [],
+              "min": 0,
+              "unit": "percentunit",
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 24,
+            "x": 0,
+            "y": 92
+          },
+          "id": 34,
+          "options": {
+            "legend": {
+              "calcs": [
+                "mean",
+                "lastNotNull"
+              ],
+              "displayMode": "table",
+              "placement": "bottom",
+              "showLegend": true
+            },
+            "tooltip": {
+              "hideZeros": false,
+              "mode": "multi",
+              "sort": "none"
+            }
+          },
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "prometheus"
+              },
+              "editorMode": "code",
+              "refId": "A",
+              "expr": "rate(lmcache_blend_lookup_prefix_hit_tokens_total[$__rate_interval]) / rate(lmcache_blend_lookup_requested_tokens_total[$__rate_interval])",
+              "legendFormat": "prefix hit rate"
+            },
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "prometheus"
+              },
+              "editorMode": "code",
+              "refId": "B",
+              "expr": "rate(lmcache_blend_lookup_non_prefix_hit_tokens_total[$__rate_interval]) / rate(lmcache_blend_lookup_requested_tokens_total[$__rate_interval])",
+              "legendFormat": "non-prefix hit rate"
+            }
+          ],
+          "title": "CacheBlend prefix vs non-prefix hit rate",
+          "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "uid": "tempo"
+          },
+          "description": "Waterfall of CacheBlend (blend server) request spans: cb.request \u2192 cb.lookup {cb.fingerprint_match, cb.sparse_prefetch}, cb.retrieve {cb.scatter}, cb.store_pre_computed, cb.store_final. Click a row for the full Tempo waterfall.",
+          "fieldConfig": {
+            "defaults": {},
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 12,
+            "w": 24,
+            "x": 0,
+            "y": 100
+          },
+          "id": 51,
+          "options": {
+            "dedupStrategy": "none",
+            "sortBy": [
+              {
+                "desc": true,
+                "displayName": "Start time"
+              }
+            ],
+            "spanFilters": {
+              "criticalPathOnly": false,
+              "matchesOnly": false
+            }
+          },
+          "targets": [
+            {
+              "limit": 20,
+              "query": "{ name = \"cb.request\" }",
+              "queryType": "traceql",
+              "refId": "A"
+            }
+          ],
+          "title": "CacheBlend Request Traces",
+          "type": "traces"
         }
       ]
     },
@@ -2553,6 +2786,6 @@
   "timezone": "browser",
   "title": "LMCache",
   "uid": "lmcache-standard",
-  "version": 2,
+  "version": 8,
   "weekStart": ""
-}
\ No newline at end of file
+}
diff --git a/examples/observability/tempo.yml b/examples/observability/tempo.yml
index de6cda2878..a7fe8d41d2 100644
--- a/examples/observability/tempo.yml
+++ b/examples/observability/tempo.yml
@@ -10,6 +10,23 @@ distributor:
         http:
           endpoint: 0.0.0.0:4318
 
+# Keep recent traces in the local-blocks processor so Grafana can compute
+# TraceQL metrics (span latency / rate graphs) without a separate backend.
+metrics_generator:
+  processor:
+    local_blocks:
+      flush_to_storage: true
+      # Cut/complete blocks quickly so TraceQL-metrics see recent traces within
+      # seconds (sparse demo traffic otherwise sits in an open block for minutes).
+      trace_idle_period: 5s
+      flush_check_period: 5s
+      max_block_duration: 30s
+      complete_block_timeout: 1h
+  storage:
+    path: /var/tempo/generator/wal
+  traces_storage:
+    path: /var/tempo/generator/traces
+
 storage:
   trace:
     backend: local
@@ -17,3 +34,9 @@ storage:
       path: /var/tempo/blocks
     wal:
       path: /var/tempo/wal
+
+# Turn on the local-blocks processor for the (single, default) tenant.
+overrides:
+  defaults:
+    metrics_generator:
+      processors: [local-blocks]
diff --git a/lmcache/v1/mp_observability/event.py b/lmcache/v1/mp_observability/event.py
index f5e47ddde6..47133c562a 100644
--- a/lmcache/v1/mp_observability/event.py
+++ b/lmcache/v1/mp_observability/event.py
@@ -105,6 +105,22 @@ class EventType(Enum):
     CB_FINGERPRINTS_REGISTERED = "cb.fingerprints.registered"
     CB_CHUNKS_EVICTED = "cb.chunks.evicted"
 
+    # CB V3 lookup sub-spans (CPU) — nest under cb.lookup. Submitted-once but
+    # END may fire on a later poll (the non-blocking lookup re-issues), so the
+    # span captures submit→resident incl. poll-wait.
+    CB_FINGERPRINT_MATCH_START = "cb.fingerprint_match.start"
+    CB_FINGERPRINT_MATCH_END = "cb.fingerprint_match.end"
+    # No cb.prefix_lookup span: the prefix lookup is already traced by
+    # mp.lookup_prefetch (CB reuses LookupModule). prefix_chunks rides on
+    # cb.lookup via CB_LOOKUP_END instead.
+    CB_SPARSE_PREFETCH_START = "cb.sparse_prefetch.start"
+    CB_SPARSE_PREFETCH_END = "cb.sparse_prefetch.end"
+
+    # CB V3 retrieve sub-span (GPU) — nest under cb.retrieve. Emitted via
+    # publish_on_stream for GPU-accurate timing of the L1->paged scatter.
+    CB_SCATTER_START = "cb.scatter.start"
+    CB_SCATTER_END = "cb.scatter.end"
+
     # Cache Blending (CB) events — lifecycle sentinels (CPU-synchronous)
     CB_REQUEST_START = "cb.request.start"
     CB_STORE_PRE_COMPUTED_SUBMITTED = "cb.store_pre_computed.submitted"
diff --git a/lmcache/v1/mp_observability/subscribers/metrics/cb_server.py b/lmcache/v1/mp_observability/subscribers/metrics/cb_server.py
index 230a5413e4..f0dc3ed7d2 100644
--- a/lmcache/v1/mp_observability/subscribers/metrics/cb_server.py
+++ b/lmcache/v1/mp_observability/subscribers/metrics/cb_server.py
@@ -67,6 +67,16 @@ def __init__(self) -> None:
             ),
             unit="tokens",
         )
+        self._lookup_prefix_hit_tokens = meter.create_counter(
+            "lmcache_blend.lookup_prefix_hit_tokens",
+            description="Tokens served by blend from the prefix (L1+L2).",
+            unit="tokens",
+        )
+        self._lookup_non_prefix_hit_tokens = meter.create_counter(
+            "lmcache_blend.lookup_non_prefix_hit_tokens",
+            description="Tokens served by blend from non-prefix (shifted) chunks.",
+            unit="tokens",
+        )
         self._lookup_fingerprint_hits = meter.create_counter(
             "lmcache_blend.lookup_fingerprint_hits",
             description="Chunks matched by local fingerprint table",
@@ -149,6 +159,10 @@ def _on_lookup_start(self, event: Event) -> None:
     def _on_lookup_end(self, event: Event) -> None:
         self._lookup_requested_tokens.add(event.metadata["requested_tokens"])
         self._lookup_hit_tokens.add(event.metadata["hit_tokens"])
+        self._lookup_prefix_hit_tokens.add(event.metadata.get("prefix_hit_tokens", 0))
+        self._lookup_non_prefix_hit_tokens.add(
+            event.metadata.get("non_prefix_hit_tokens", 0)
+        )
         self._lookup_fingerprint_hits.add(event.metadata["fingerprint_hits"])
         self._lookup_storage_hits.add(event.metadata["storage_hits"])
         self._lookup_stale_chunks.add(event.metadata["stale_chunks"])
diff --git a/lmcache/v1/mp_observability/subscribers/tracing/cb_server.py b/lmcache/v1/mp_observability/subscribers/tracing/cb_server.py
index 04cf5fa315..28a5dec09a 100644
--- a/lmcache/v1/mp_observability/subscribers/tracing/cb_server.py
+++ b/lmcache/v1/mp_observability/subscribers/tracing/cb_server.py
@@ -72,6 +72,19 @@ class BlendTracingSubscriber(EventSubscriber):
         EventType.CB_LOOKUP_START: "cb.lookup",
         EventType.CB_RETRIEVE_START: "cb.retrieve",
         EventType.CB_STORE_FINAL_START: "cb.store_final",
+        # V3 lookup sub-spans (nest under cb.lookup, see _SPAN_PARENTS).
+        EventType.CB_FINGERPRINT_MATCH_START: "cb.fingerprint_match",
+        EventType.CB_SPARSE_PREFETCH_START: "cb.sparse_prefetch",
+        # V3 retrieve sub-span (nests under cb.retrieve).
+        EventType.CB_SCATTER_START: "cb.scatter",
+    }
+
+    # Child span -> parent span name for nesting; absent => nest under the
+    # cb.request root (the default for the top-level lookup/retrieve/store spans).
+    _SPAN_PARENTS: dict[str, str] = {
+        "cb.fingerprint_match": "cb.lookup",
+        "cb.sparse_prefetch": "cb.lookup",
+        "cb.scatter": "cb.retrieve",
     }
 
     _END_TO_START: dict[EventType, EventType] = {
@@ -79,6 +92,9 @@ class BlendTracingSubscriber(EventSubscriber):
         EventType.CB_LOOKUP_END: EventType.CB_LOOKUP_START,
         EventType.CB_RETRIEVE_END: EventType.CB_RETRIEVE_START,
         EventType.CB_STORE_FINAL_END: EventType.CB_STORE_FINAL_START,
+        EventType.CB_FINGERPRINT_MATCH_END: EventType.CB_FINGERPRINT_MATCH_START,
+        EventType.CB_SPARSE_PREFETCH_END: EventType.CB_SPARSE_PREFETCH_START,
+        EventType.CB_SCATTER_END: EventType.CB_SCATTER_START,
     }
 
     # END events that correspond to a SUBMITTED sentinel (decrement ops counter)
@@ -126,6 +142,14 @@ def get_subscriptions(self) -> dict[EventType, EventCallback]:
             EventType.CB_RETRIEVE_END: self._on_end,
             EventType.CB_STORE_FINAL_START: self._on_start,
             EventType.CB_STORE_FINAL_END: self._on_end,
+            # V3 lookup sub-spans (nested under cb.lookup)
+            EventType.CB_FINGERPRINT_MATCH_START: self._on_start,
+            EventType.CB_FINGERPRINT_MATCH_END: self._on_end,
+            EventType.CB_SPARSE_PREFETCH_START: self._on_start,
+            EventType.CB_SPARSE_PREFETCH_END: self._on_end,
+            # V3 retrieve sub-span (nested under cb.retrieve, GPU-timed)
+            EventType.CB_SCATTER_START: self._on_start,
+            EventType.CB_SCATTER_END: self._on_end,
             # Point events
             EventType.CB_FINGERPRINTS_REGISTERED: self._on_point,
             EventType.CB_CHUNKS_EVICTED: self._on_point,
@@ -245,7 +269,12 @@ def _on_start(self, event: Event) -> None:
         sid = event.session_id
         span_name = self._SPAN_DEFS[event.event_type]
 
-        parent_ctx = self._registry.get_context(sid, "cb.request")
+        # Nest under the mapped parent span (e.g. cb.fingerprint_match under
+        # cb.lookup); top-level spans and any orphan fall back to cb.request.
+        parent_name = self._SPAN_PARENTS.get(span_name, "cb.request")
+        parent_ctx = self._registry.get_context(sid, parent_name)
+        if parent_ctx is None and parent_name != "cb.request":
+            parent_ctx = self._registry.get_context(sid, "cb.request")
         span = _tracer.start_span(
             span_name,
             context=parent_ctx,
@@ -298,15 +327,25 @@ def _on_end(self, event: Event) -> None:
                 root_span, _ = root_entry
                 hit_tokens = int(event.metadata.get("hit_tokens", 0))
                 requested_tokens = int(event.metadata.get("requested_tokens", 0))
-                hit_rate = (
-                    hit_tokens / requested_tokens if requested_tokens > 0 else 0.0
+                prefix_hit_tokens = int(event.metadata.get("prefix_hit_tokens", 0))
+                non_prefix_hit_tokens = int(
+                    event.metadata.get("non_prefix_hit_tokens", 0)
                 )
+                denom = requested_tokens or 1  # avoid /0; rates are 0 when requested=0
                 root_span.set_attribute("hit_tokens", hit_tokens)
                 root_span.set_attribute("requested_tokens", requested_tokens)
-                root_span.set_attribute("hit_rate", hit_rate)
+                # hit_rate numerator = prefix + non-prefix reuse (hit_tokens).
+                root_span.set_attribute("hit_rate", hit_tokens / denom)
                 root_span.set_attribute(
                     "prefix_hits", int(event.metadata.get("prefix_hits", 0))
                 )
+                root_span.set_attribute("prefix_hit_tokens", prefix_hit_tokens)
+                root_span.set_attribute("non_prefix_hit_tokens", non_prefix_hit_tokens)
+                # Per-component hit rates (sum to hit_rate).
+                root_span.set_attribute("prefix_hit_rate", prefix_hit_tokens / denom)
+                root_span.set_attribute(
+                    "non_prefix_hit_rate", non_prefix_hit_tokens / denom
+                )
 
         if event.event_type in self._GPU_OP_END_EVENTS:
             if (count := self._pending_gpu_ops.get(sid, 0)) > 0:
diff --git a/lmcache/v1/multiprocess/modules/blend_v3.py b/lmcache/v1/multiprocess/modules/blend_v3.py
index 1409aa6a6d..d1dcd8ee83 100644
--- a/lmcache/v1/multiprocess/modules/blend_v3.py
+++ b/lmcache/v1/multiprocess/modules/blend_v3.py
@@ -79,6 +79,7 @@ class _CBUnifiedJob:
     per_hash_obj_keys: dict | None = None
     expanded_uidx: list[int] | None = None
     found_uidx: set[int] | None = None  # stashed when the sparse poll completes
+    l2_keys: int = 0  # sparse keys needing an L2 load (0 => no L2 read, span skipped)
 
 
 class BlendTokenRangeMatcherV3:
@@ -255,7 +256,14 @@ def match_sub_sequence(
             return results
 
     def remove_chunks(self, token_hashes: list[bytes]) -> None:
-        """Evict stale entries; clears poly_hash so re-probes can't match."""
+        """Evict the given chunks from the matcher.
+
+        Clears each chunk's table slot + poly hash so later probes cannot match
+        it. Thread-safe.
+
+        Args:
+            token_hashes (list[bytes]): Content hashes of the chunks to evict.
+        """
         with self._lock:
             for th in token_hashes:
                 cid = self._token_hash_to_compact_id.get(th)
@@ -409,8 +417,22 @@ def cb_register_rope(
         head_size: int,
         is_neox_style: bool,
     ) -> None:
-        """Bolt rope state onto an already-registered cache_contexts entry;
-        idempotent. REGISTER_KV_CACHE must precede this."""
+        """Bolt CB re-RoPE state onto an already-registered KV-cache instance.
+
+        Idempotent; ``REGISTER_KV_CACHE`` must precede this. Strips any
+        YaRN/longrope mscale baked into the rope cache so re-RoPE stays a pure
+        rotation.
+
+        Args:
+            instance_id (int): KV-cache instance to attach rope state to.
+            cos_sin_cache_ipc (CudaIPCWrapper): IPC handle to vLLM's cos/sin
+                rope cache.
+            head_size (int): Rotary head dimension.
+            is_neox_style (bool): True for NeoX (contiguous halves), else GPT-J.
+
+        Raises:
+            ValueError: If ``instance_id`` has no registered KV cache.
+        """
         cache_contexts = self._gpu_transfer.cache_contexts
         if instance_id not in cache_contexts:
             raise ValueError(
@@ -463,7 +485,12 @@ def cb_register_rope(
         )
 
     def cb_unregister_rope(self, instance_id: int) -> None:
-        """Drop rope state. Paged KV cache stays (use UNREGISTER_KV_CACHE)."""
+        """Drop the instance's CB rope state; the paged KV cache is left intact.
+
+        Args:
+            instance_id (int): Instance whose rope state to remove (use
+                ``UNREGISTER_KV_CACHE`` to free the KV cache itself).
+        """
         self._cb_rope_state.pop(instance_id, None)
         self._cb_gpu_contexts.pop(instance_id, None)
         self._cb_gpu_context_meta.pop(instance_id, None)
@@ -498,9 +525,18 @@ def _drain_fingerprints_sync(self) -> None:
                 logger.exception("CB fingerprint registration failed (sync drain)")
 
     def _match_fingerprints(self, key: IPCCacheEngineKey) -> list[CBMatchResult]:
-        """Drain pending registrations, fingerprint-match sub-sequences, then
-        leftmost-greedy dedup over overlapping ranges. Returns matches sorted
-        by cur_st (empty if none)."""
+        """Match the query's reusable chunks, leftmost-greedy deduped.
+
+        Drains pending fingerprint registrations, probes the matcher, then keeps
+        a non-overlapping leftmost-greedy subset.
+
+        Args:
+            key (IPCCacheEngineKey): The query request key.
+
+        Returns:
+            list[CBMatchResult]: Non-overlapping matches sorted by cur_st
+            (empty if none).
+        """
         self._drain_fingerprints_sync()
         matches = self._token_range_matcher.match_sub_sequence(list(key.token_ids))
         if not matches:
@@ -517,7 +553,16 @@ def _match_fingerprints(self, key: IPCCacheEngineKey) -> list[CBMatchResult]:
     def _resolve_cb_layout_desc(
         self, model_name: str, world_size: int
     ) -> "MemoryLayoutDesc | None":
-        """Find the CB KV buffer layout for (model, world_size), or None."""
+        """Find the CB KV buffer layout for ``(model_name, world_size)``.
+
+        Args:
+            model_name (str): Model name to match.
+            world_size (int): Tensor-parallel world size to match.
+
+        Returns:
+            MemoryLayoutDesc | None: The matching layout, or None if no
+            registered CB GPU context matches.
+        """
         for gpu_id, (m_name, w_size) in self._cb_gpu_context_meta.items():
             if m_name == model_name and w_size == world_size:
                 cb_ctx = self._cb_gpu_contexts[gpu_id]
@@ -533,9 +578,24 @@ def _sparse_prefetch_submit(
         layout_desc: "MemoryLayoutDesc",
         matches: list[CBMatchResult],
     ) -> "tuple[PrefetchHandle, dict[bytes, list], list[int]]":
-        """Coalesce all matches into one sparse prefetch and submit it
-        (non-blocking). The caller polls query_prefetch_status(handle) then
-        calls :meth:`_sparse_classify` with the found set."""
+        """Coalesce all matches into one sparse L2->L1 prefetch and submit it.
+
+        Non-blocking. Dedups object keys before submit (sparse keeps one read
+        lock per loaded key, so a duplicate would leak). The caller polls
+        ``query_prefetch_status(handle)`` then calls :meth:`_sparse_classify`
+        with the found set.
+
+        Args:
+            key (IPCCacheEngineKey): The request key.
+            layout_desc (MemoryLayoutDesc): CB KV buffer layout for L1 alloc.
+            matches (list[CBMatchResult]): Non-prefix matches to prefetch.
+
+        Returns:
+            tuple[PrefetchHandle, dict[bytes, list], list[int]]: the prefetch
+            handle, per-hash TP-expanded object keys, and each expanded
+            position's deduped-key index (maps the per-key found set back to
+            every chunk).
+        """
         world_size = key.world_size
         per_hash_obj_keys: dict[bytes, list] = {}
         all_hashes = [r.hash for r in matches]
@@ -573,9 +633,22 @@ def _sparse_classify(
         per_hash_obj_keys: dict[bytes, list],
         expanded_uidx: list[int],
     ) -> list[CBMatchResult]:
-        """Classify each chunk found/stale by whether every TP rank's key
-        loaded, run stale-strike bookkeeping, and stash the obj_keys cache for
-        retrieve. Returns the found subset (cur_st order)."""
+        """Classify each prefetched chunk as found or stale, and finalize state.
+
+        A chunk is found only if every TP rank's key loaded; stale chunks take
+        an eviction strike (evicted at threshold, kept while still in-flight).
+        Stashes the found chunks' obj_keys for the retrieve path.
+
+        Args:
+            key (IPCCacheEngineKey): The request key.
+            matches (list[CBMatchResult]): The submitted non-prefix matches.
+            found_uidx (set[int]): Deduped-key indices that loaded.
+            per_hash_obj_keys (dict[bytes, list]): Per-hash TP-expanded keys.
+            expanded_uidx (list[int]): Each expanded position's deduped index.
+
+        Returns:
+            list[CBMatchResult]: The found subset, in cur_st order.
+        """
         world_size = key.world_size
         found_cb_match_result: list[CBMatchResult] = []
         stale_hashes: list[bytes] = []
@@ -631,10 +704,20 @@ def cb_unified_lookup(
         """Non-blocking single-RPC CB lookup (submit-once, poll-on-recall).
 
         First call submits the prefix lookup + fingerprint match; later calls
-        poll both legs, returning None until the prefix and the sparse
-        complement are both resident in L1 (so a worker thread never blocks on
-        the L2->L1 loads). The prefix job's L1 read locks persist for the
-        retrieve.
+        poll both legs, returning ``None`` until the prefix and the sparse
+        non-prefix complement are both resident in L1 (so a worker thread never
+        blocks on the L2->L1 loads). The prefix job's L1 read locks persist for
+        the retrieve.
+
+        Args:
+            key (IPCCacheEngineKey): Request key (token IDs, request_id, model,
+                world_size).
+            tp_size (int): Tensor-parallel size for the prefix lookup.
+
+        Returns:
+            CBUnifiedLookupResult | None: ``None`` while either leg is still
+            loading (the caller re-issues to poll); on completion, the prefix
+            coverage in tokens plus the found non-prefix segments.
         """
         rid = key.request_id
         chunk_size = self._ctx.chunk_size
@@ -653,11 +736,23 @@ def cb_unified_lookup(
                     metadata={"num_tokens": len(key.token_ids)},
                 )
             )
-            self._lookup_module.lookup(key, tp_size)  # submit prefix (non-blocking)
-            job = _CBUnifiedJob(
-                matches=self._match_fingerprints(key),
-                num_tokens=len(key.token_ids),
+            # Prefix leg: submit (non-blocking). Already traced upstream by
+            # mp.lookup_prefetch (LookupModule self-instruments); prefix_chunks
+            # lands on cb.lookup via CB_LOOKUP_END below.
+            self._lookup_module.lookup(key, tp_size)
+            # Fingerprint match: CPU-bound, tight span.
+            self._event_bus.publish(
+                Event(event_type=EventType.CB_FINGERPRINT_MATCH_START, session_id=rid)
             )
+            matches = self._match_fingerprints(key)
+            self._event_bus.publish(
+                Event(
+                    event_type=EventType.CB_FINGERPRINT_MATCH_END,
+                    session_id=rid,
+                    metadata={"matches": len(matches)},
+                )
+            )
+            job = _CBUnifiedJob(matches=matches, num_tokens=len(key.token_ids))
             with self._cb_jobs_lock:
                 self._cb_jobs[rid] = job
 
@@ -686,6 +781,22 @@ def cb_unified_lookup(
                         job.per_hash_obj_keys,
                         job.expanded_uidx,
                     ) = self._sparse_prefetch_submit(key, layout_desc, job.non_prefix)
+                    # Only trace the span when the prefetch actually reads L2;
+                    # all-L1-resident matches do no L2 work worth a span.
+                    job.l2_keys = len(job.handle.l2_orig_indices)
+                    if job.l2_keys > 0:
+                        self._event_bus.publish(
+                            Event(
+                                event_type=EventType.CB_SPARSE_PREFETCH_START,
+                                session_id=rid,
+                                metadata={
+                                    "n_chunks": len(job.non_prefix),
+                                    "world_size": key.world_size,
+                                    "n_keys": len(job.non_prefix) * key.world_size,
+                                    "l2_keys": job.l2_keys,
+                                },
+                            )
+                        )
                 else:
                     logger.error(
                         "No CB GPU context for model %s ws %d during cb_unified_lookup",
@@ -701,6 +812,17 @@ def cb_unified_lookup(
             if bm is None:
                 return None  # sparse still loading -> defer
             job.found_uidx = set(bm.get_indices_list())
+            if job.l2_keys > 0:
+                self._event_bus.publish(
+                    Event(
+                        event_type=EventType.CB_SPARSE_PREFETCH_END,
+                        session_id=rid,
+                        metadata={
+                            "found_keys": len(job.found_uidx),
+                            "l2_keys": job.l2_keys,
+                        },
+                    )
+                )
 
         # --- BOTH legs ready: classify the complement + finalize. ---
         if job.handle is not None:
@@ -716,6 +838,10 @@ def cb_unified_lookup(
 
         prefix_tokens = job.prefix_chunks * chunk_size
         num_tokens = job.num_tokens
+        # V3 hit rate = (prefix + non-prefix) reuse. The two ranges are disjoint
+        # (non_prefix has cur_st >= prefix_tokens), so they sum without double-
+        # counting. hit_tokens carries the sum (the hit_rate numerator).
+        non_prefix_hit_tokens = _unique_token_coverage(found)
         self._event_bus.publish(
             Event(
                 event_type=EventType.CB_LOOKUP_END,
@@ -724,10 +850,13 @@ def cb_unified_lookup(
                     "num_tokens": num_tokens,
                     "fingerprint_hits": len(found),
                     "prefix_hits": job.prefix_chunks,
+                    "prefix_chunks": job.prefix_chunks,
                     "storage_hits": len(found),
                     "stale_chunks": len(job.non_prefix or []) - len(found),
                     "no_gpu_context": False,
-                    "hit_tokens": _unique_token_coverage(found),
+                    "prefix_hit_tokens": prefix_tokens,
+                    "non_prefix_hit_tokens": non_prefix_hit_tokens,
+                    "hit_tokens": prefix_tokens + non_prefix_hit_tokens,
                     "requested_tokens": (num_tokens // chunk_size) * chunk_size,
                 },
             )
@@ -746,8 +875,24 @@ def store(
         gpu_block_ids: list[list[int]],
         event_ipc_handle: bytes,
     ) -> tuple[bytes, bool]:
-        """Paged store + matcher fingerprint registration (skips pos-0
-        chunks; fingerprint failures logged, never raised)."""
+        """Paged store, then register the stored chunks as match fingerprints.
+
+        Delegates the KV write to ``GPUTransfer.store``, then (worker 0 only)
+        enqueues the chunk hashes for async fingerprint registration ordered
+        after the L1 commit. Chunk 0 of a position-0 store is skipped (owned by
+        the standard prefix path). Fingerprint failures are logged, never
+        raised — they do not affect store correctness.
+
+        Args:
+            key (IPCCacheEngineKey): Store key (token IDs + ``[start, end)``).
+            instance_id (int): Target KV-cache instance.
+            gpu_block_ids (list[list[int]]): Per-layer-group paged block IDs.
+            event_ipc_handle (bytes): IPC handle to the producer's CUDA event.
+
+        Returns:
+            tuple[bytes, bool]: The underlying ``GPUTransfer.store`` result
+            (event handle, success).
+        """
         result = self._gpu_transfer.store(
             key, instance_id, gpu_block_ids, event_ipc_handle
         )
@@ -819,8 +964,20 @@ def _apply_cb_rope_batched(
         batch_len: int,
         slots_to_rope: list[tuple[int, int, int]],
     ) -> None:
-        """Re-RoPE tmp-pool slots in-place (K-only, per group); list of
-        (slot_idx, old_st, cur_st)."""
+        """Re-RoPE the given tmp-pool slots in place (K-only, per kernel group).
+
+        Args:
+            gpu_context (GPUCacheContext): The instance's GPU cache context.
+            rope_state (_CBRopeState): Cached cos/sin + head layout.
+            batch_len (int): Number of tmp slots staged for this batch.
+            slots_to_rope (list[tuple[int, int, int]]): ``(slot_idx, old_st,
+                cur_st)`` per shifted slot — re-RoPE K from stored position
+                ``old_st`` to new position ``cur_st``.
+
+        Raises:
+            RuntimeError: On a compressed (compress_ratio != 1) or MLA
+                (kv_size != 2) layout, or a head_size/hidden_dim mismatch.
+        """
         if not slots_to_rope:
             return
         num_groups = gpu_context.kv_layer_groups_manager.num_kernel_groups
@@ -873,9 +1030,31 @@ def cb_retrieve_pre_computed(
         instance_id: int,
         event_ipc_handle: bytes,
     ) -> tuple[bytes, bool]:
-        """Scatter EVERY matched chunk into paged KV (prefix-hit + shifted);
-        K-only re-RoPE on the shifted subset. Drops misaligned matches;
-        MLA layouts unsupported."""
+        """Scatter every matched token range into the request's paged KV.
+
+        Reuses the lookup's prefetched chunks: fills tmp slots, K-only re-RoPEs
+        the shifted (non-prefix) subset, then writes per-token via the slot
+        kernel — so non-block-aligned matches and partial vLLM blocks shared
+        with recomputed tokens are written correctly (no block-alignment trim).
+        Only matches past the currently allocated slots are dropped (vLLM may
+        call this twice: partial- then full-block alloc).
+
+        Args:
+            key (IPCCacheEngineKey): The request key.
+            cb_match_result (list[CBMatchResult]): Matched ranges to scatter
+                (prefix-hit and shifted), any order.
+            gpu_block_ids (list[int]): This request's full paged block table.
+            instance_id (int): Target KV-cache instance.
+            event_ipc_handle (bytes): IPC handle to the forward's CUDA event.
+
+        Returns:
+            tuple[bytes, bool]: The scatter-complete event handle and whether
+            the scatter ran (False if the prefetched objects were unavailable).
+
+        Raises:
+            ValueError: If the instance has no registered KV cache or rope
+                state. MLA layouts are unsupported (raised during re-RoPE).
+        """
         cache_contexts = self._gpu_transfer.cache_contexts
         if instance_id not in cache_contexts:
             raise ValueError(
@@ -1010,6 +1189,29 @@ def cb_retrieve_pre_computed(
                             continue
                         pairs.append((r, memory_obj))
 
+                    # cb.scatter span (GPU): the L1->paged write of every
+                    # applied match. Re-RoPE is folded in (n_shifted) — it is
+                    # interleaved per-batch, so not a separate span.
+                    self._event_bus.publish_on_stream(
+                        gpu_context.cupy_stream,
+                        Event(
+                            event_type=EventType.CB_SCATTER_START,
+                            session_id=key.request_id,
+                            metadata={
+                                "scattered_tokens": sum(
+                                    r.cur_ed - r.cur_st for r, _ in pairs
+                                ),
+                                "n_prefix": sum(
+                                    1 for r, _ in pairs if r.old_st == r.cur_st
+                                ),
+                                "n_shifted": sum(
+                                    1 for r, _ in pairs if r.old_st != r.cur_st
+                                ),
+                                "dropped": len(cb_match_result) - len(pairs),
+                            },
+                        ),
+                    )
+
                     # Consecutive matches → one batched scatter per group.
                     runs: list[list[tuple[CBMatchResult, Any]]] = []
                     for r_obj in pairs:
@@ -1080,6 +1282,14 @@ def cb_retrieve_pre_computed(
                                     block_size=bs,
                                     head_size=rope_state.head_size,
                                 )
+
+                    self._event_bus.publish_on_stream(
+                        gpu_context.cupy_stream,
+                        Event(
+                            event_type=EventType.CB_SCATTER_END,
+                            session_id=key.request_id,
+                        ),
+                    )
             except Exception:
                 logger.exception("Error during retrieving prefetched results")
                 self._event_bus.publish_on_stream(
diff --git a/tests/v1/mp_observability/subscribers/tracing/test_cb_server.py b/tests/v1/mp_observability/subscribers/tracing/test_cb_server.py
index 39b5e68ba2..37555d1fc5 100644
--- a/tests/v1/mp_observability/subscribers/tracing/test_cb_server.py
+++ b/tests/v1/mp_observability/subscribers/tracing/test_cb_server.py
@@ -867,3 +867,173 @@ def test_prefix_hits_defaults_to_zero_when_absent(self, exporter):
         root = self._root_span(exporter, sid)
         assert root is not None
         assert root.attributes["prefix_hits"] == 0
+
+    def test_hit_rate_includes_prefix_and_non_prefix(self, exporter):
+        """V3 hit_rate numerator = prefix_hit_tokens + non_prefix_hit_tokens;
+        both are also recorded as separate attributes on the root span."""
+        bus = EventBus(EventBusConfig(enabled=True, max_queue_size=100))
+        bus.register_subscriber(BlendTracingSubscriber(SpanRegistry()))
+        bus.start()
+        now = time.time()
+        sid = "cb-hr-split"
+        bus.publish(
+            Event(event_type=EventType.CB_REQUEST_START, session_id=sid, timestamp=now)
+        )
+        bus.publish(
+            Event(
+                event_type=EventType.CB_LOOKUP_START,
+                session_id=sid,
+                timestamp=now + 0.001,
+                metadata={"num_tokens": 1024},
+            )
+        )
+        bus.publish(
+            Event(
+                event_type=EventType.CB_LOOKUP_END,
+                session_id=sid,
+                timestamp=now + 0.010,
+                metadata={
+                    "prefix_hit_tokens": 256,
+                    "non_prefix_hit_tokens": 256,
+                    "hit_tokens": 512,  # = prefix + non_prefix (set by blend_v3)
+                    "requested_tokens": 1024,
+                },
+            )
+        )
+        bus.publish(
+            Event(
+                event_type=EventType.CB_REQUEST_END,
+                session_id=sid,
+                timestamp=now + 0.011,
+            )
+        )
+        time.sleep(0.2)
+        bus.stop()
+
+        root = self._root_span(exporter, sid)
+        assert root is not None
+        assert root.attributes["prefix_hit_tokens"] == 256
+        assert root.attributes["non_prefix_hit_tokens"] == 256
+        assert root.attributes["hit_tokens"] == 512
+        assert root.attributes["hit_rate"] == 0.5
+
+
+class TestCBLookupSubspans:
+    """V3 lookup sub-spans (cb.fingerprint_match / cb.sparse_prefetch) nest under
+    cb.lookup, not the cb.request root. The prefix lookup has no cb.* span (it is
+    traced by mp.lookup_prefetch); prefix_chunks rides on cb.lookup."""
+
+    @pytest.fixture
+    def exporter(self):
+        exp = InMemorySpanExporter()
+        provider = TracerProvider()
+        provider.add_span_processor(SimpleSpanProcessor(exp))
+        real_tracer = provider.get_tracer("lmcache_mp.blend")
+        with (
+            patch.object(cb_server_module, "_tracer", real_tracer),
+            patch.object(cb_server_module, "_HAS_OTEL", True),
+        ):
+            yield exp
+        exp.shutdown()
+
+    def _spans_by_name(self, exporter: InMemorySpanExporter, sid: str):
+        return {
+            s.name: s
+            for s in exporter.get_finished_spans()
+            if s.attributes.get("session_id") == sid
+        }
+
+    def test_lookup_subspans_nest_under_cb_lookup(self, exporter):
+        bus = EventBus(EventBusConfig(enabled=True, max_queue_size=100))
+        bus.register_subscriber(BlendTracingSubscriber(SpanRegistry()))
+        bus.start()
+        now = time.time()
+        sid = "cb-subspans"
+        seq = [
+            (EventType.CB_REQUEST_START, {}),
+            (EventType.CB_LOOKUP_START, {"num_tokens": 1024}),
+            (EventType.CB_FINGERPRINT_MATCH_START, {}),
+            (EventType.CB_FINGERPRINT_MATCH_END, {"matches": 7}),
+            (EventType.CB_SPARSE_PREFETCH_START, {"n_chunks": 5}),
+            (EventType.CB_SPARSE_PREFETCH_END, {"found_keys": 5}),
+            (EventType.CB_LOOKUP_END, {"num_tokens": 1024, "prefix_chunks": 2}),
+            (EventType.CB_REQUEST_END, {}),
+        ]
+        for i, (et, md) in enumerate(seq):
+            bus.publish(
+                Event(
+                    event_type=et,
+                    session_id=sid,
+                    timestamp=now + i * 0.001,
+                    metadata=md,
+                )
+            )
+        time.sleep(0.3)
+        bus.stop()
+
+        spans = self._spans_by_name(exporter, sid)
+        for name in (
+            "cb.lookup",
+            "cb.fingerprint_match",
+            "cb.sparse_prefetch",
+        ):
+            assert name in spans, f"missing span {name}; have {sorted(spans)}"
+        # No cb.prefix_lookup span (traced by mp.lookup_prefetch instead).
+        assert "cb.prefix_lookup" not in spans
+        lookup_id = spans["cb.lookup"].context.span_id
+        for name in ("cb.fingerprint_match", "cb.sparse_prefetch"):
+            assert spans[name].parent is not None, f"{name} has no parent span"
+            assert spans[name].parent.span_id == lookup_id, (
+                f"{name} should nest under cb.lookup"
+            )
+        # cb.lookup itself nests under the cb.request root (unchanged behavior).
+        assert spans["cb.lookup"].parent.span_id == spans["cb.request"].context.span_id
+        # sub-span metadata propagates as attributes.
+        assert spans["cb.fingerprint_match"].attributes.get("matches") == "7"
+        # prefix coverage rides on cb.lookup (no dedicated prefix span).
+        assert spans["cb.lookup"].attributes.get("prefix_chunks") == "2"
+        assert spans["cb.sparse_prefetch"].attributes.get("found_keys") == "5"
+
+    def test_scatter_span_nests_under_cb_retrieve(self, exporter):
+        bus = EventBus(EventBusConfig(enabled=True, max_queue_size=100))
+        bus.register_subscriber(BlendTracingSubscriber(SpanRegistry()))
+        bus.start()
+        now = time.time()
+        sid = "cb-scatter"
+        seq = [
+            (EventType.CB_REQUEST_START, {}),
+            (EventType.CB_RETRIEVE_START, {"num_chunks": 5}),
+            (
+                EventType.CB_SCATTER_START,
+                {
+                    "scattered_tokens": 1280,
+                    "n_prefix": 1,
+                    "n_shifted": 4,
+                    "dropped": 0,
+                },
+            ),
+            (EventType.CB_SCATTER_END, {}),
+            (EventType.CB_RETRIEVE_END, {}),
+            (EventType.CB_REQUEST_END, {}),
+        ]
+        for i, (et, md) in enumerate(seq):
+            bus.publish(
+                Event(
+                    event_type=et,
+                    session_id=sid,
+                    timestamp=now + i * 0.001,
+                    metadata=md,
+                )
+            )
+        time.sleep(0.3)
+        bus.stop()
+
+        spans = self._spans_by_name(exporter, sid)
+        assert "cb.scatter" in spans, f"missing cb.scatter; have {sorted(spans)}"
+        assert "cb.retrieve" in spans
+        assert (
+            spans["cb.scatter"].parent.span_id == spans["cb.retrieve"].context.span_id
+        ), "cb.scatter should nest under cb.retrieve"
+        assert spans["cb.scatter"].attributes.get("scattered_tokens") == "1280"
+        assert spans["cb.scatter"].attributes.get("n_shifted") == "4"
+        assert spans["cb.scatter"].attributes.get("dropped") == "0"

From 27f4eee53ecc05809a021bc7ff03f42c60a8f8df Mon Sep 17 00:00:00 2001
From: aeon-x <talexcao@gmail.com>
Date: Wed, 10 Jun 2026 19:49:34 -0700
Subject: [PATCH 36/57] add comments

Signed-off-by: aeon-x <talexcao@gmail.com>
---
 lmcache/v1/mp_coordinator/http_apis/l2_api.py | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/lmcache/v1/mp_coordinator/http_apis/l2_api.py b/lmcache/v1/mp_coordinator/http_apis/l2_api.py
index d12d775e6f..65fb599455 100644
--- a/lmcache/v1/mp_coordinator/http_apis/l2_api.py
+++ b/lmcache/v1/mp_coordinator/http_apis/l2_api.py
@@ -98,6 +98,11 @@ async def set_quota(
 ) -> QuotaResponse | JSONResponse:
     """Create or update a quota for the given ``cache_salt``.
 
+    Args:
+        cache_salt: The tenant identifier.
+        body: Quota limit to apply.
+        request: The incoming request.
+
     Returns:
         The applied quota.
     """
@@ -117,6 +122,10 @@ async def set_quota(
 async def delete_quota(cache_salt: str, request: Request) -> QuotaResponse:
     """Remove a salt's quota entry.
 
+    Args:
+        cache_salt: The tenant identifier.
+        request: The incoming request.
+
     Returns:
         Whether the entry was found and removed.
     """
@@ -137,6 +146,10 @@ async def report_events(
 ) -> ReportUsageResponse:
     """Record a batch of L2 store/lookup events.
 
+    Args:
+        body: Batch of store/lookup events to record.
+        request: The incoming request.
+
     Returns:
         Number of events processed.
     """
@@ -158,6 +171,10 @@ async def report_events(
 async def get_status(cache_salt: str, request: Request) -> L2StatusResponse:
     """Read quota and usage for a single salt.
 
+    Args:
+        cache_salt: The tenant identifier.
+        request: The incoming request.
+
     Returns:
         Combined quota and usage detail.
     """
@@ -177,6 +194,9 @@ async def get_status(cache_salt: str, request: Request) -> L2StatusResponse:
 async def list_status(request: Request) -> L2StatusListResponse:
     """List quota and usage across all cache salts.
 
+    Args:
+        request: The incoming request.
+
     Returns:
         Total usage and per-salt breakdown with quota info.
     """

From 3f0313f5488597a1a9e021bf785edc377eb6e979 Mon Sep 17 00:00:00 2001
From: aeon-x <talexcao@gmail.com>
Date: Wed, 10 Jun 2026 19:52:12 -0700
Subject: [PATCH 37/57] fix info leak

Signed-off-by: aeon-x <talexcao@gmail.com>
---
 lmcache/v1/mp_coordinator/http_apis/l2_api.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lmcache/v1/mp_coordinator/http_apis/l2_api.py b/lmcache/v1/mp_coordinator/http_apis/l2_api.py
index 65fb599455..7312605a84 100644
--- a/lmcache/v1/mp_coordinator/http_apis/l2_api.py
+++ b/lmcache/v1/mp_coordinator/http_apis/l2_api.py
@@ -109,8 +109,8 @@ async def set_quota(
     limit_bytes = int(body.limit_gb * _GB)
     try:
         _quota_manager(request).set(cache_salt, limit_bytes)
-    except ValueError as exc:
-        return JSONResponse(status_code=400, content={"error": str(exc)})
+    except ValueError:
+        return JSONResponse(status_code=400, content={"error": "invalid quota limit"})
     return QuotaResponse(
         cache_salt=cache_salt,
         limit_gb=body.limit_gb,

From d424ba863dedde21e3c77c5aa6b0417536a9b0c2 Mon Sep 17 00:00:00 2001
From: aeon-x <talexcao@gmail.com>
Date: Wed, 10 Jun 2026 19:56:25 -0700
Subject: [PATCH 38/57] fix data race

Signed-off-by: aeon-x <talexcao@gmail.com>
---
 lmcache/v1/mp_coordinator/l2/eviction_manager.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lmcache/v1/mp_coordinator/l2/eviction_manager.py b/lmcache/v1/mp_coordinator/l2/eviction_manager.py
index a7b8559994..1ce491d23c 100644
--- a/lmcache/v1/mp_coordinator/l2/eviction_manager.py
+++ b/lmcache/v1/mp_coordinator/l2/eviction_manager.py
@@ -141,7 +141,9 @@ def execute_evictions(self) -> dict[str, list[CacheKey]]:
             keys_to_evict = self._select_keys_to_evict(cache_salt, target_bytes)
             if keys_to_evict:
                 eviction_plan[cache_salt] = keys_to_evict
-                evict_bytes = sum(self._key_sizes.get(k, 0) for k in keys_to_evict)
+                with self._lock:
+                    sizes = [self._key_sizes.get(k, 0) for k in keys_to_evict]
+                evict_bytes = sum(sizes)
                 logger.info(
                     "Eviction plan for cache_salt=%r: %d keys "
                     "(%d bytes) to free; usage=%d, quota=%d, "
@@ -153,13 +155,13 @@ def execute_evictions(self) -> dict[str, list[CacheKey]]:
                     limit_bytes,
                     over_bytes,
                 )
-                for k in keys_to_evict:
+                for k, size in zip(keys_to_evict, sizes, strict=True):
                     logger.info(
                         "  -> evict key: model=%s, kv_rank=%d, hash=%s, size=%d",
                         k.model_name,
                         k.kv_rank,
                         k.chunk_hash_hex,
-                        self._key_sizes.get(k, 0),
+                        size,
                     )
 
         # TODO: once eviction is wired end-to-end, call on_remove()

From 7021790bf5d562ed8887d8e901bf25edfa2ff18e Mon Sep 17 00:00:00 2001
From: Ankit Kumar <33925670+ankit-sam@users.noreply.github.com>
Date: Thu, 11 Jun 2026 10:01:43 +0530
Subject: [PATCH 39/57] Missing io_uring changes and introducing nvme
 io_uring_cmd (passthrough) changes (#3274)

* rust: Add io_uring sync write path for checkpoint

Signed-off-by: Ankit Kumar <ankit.kumar@samsung.com>

* enable io_uring from raw block plugin

The io_uring changes were omitted during the MP mode integration
This commit partially adds them back.
The request batching is only done for headers and payloads, since
for MP mode we need to order the requests as they may send together.
This will be fixed later.

Fixes: #3119

Signed-off-by: Ankit Kumar <ankit.kumar@samsung.com>
Signed-off-by: Dongjoo Seo <dongjoo.seo1@samsung.com>

* Add nvme helpers to enable io_uring command

This adds the required nvme helpers for getting namespace information,
lba size etc. to enable io_uring command support.

NVMe io_uring command support (io_uring_cmd) enables asynchronous,
low-latency passthrough of NVMe commands directly from user space,
bypassing file system and most of block layer overhead.
Introduced in Linux kernel 5.19, it allows using IORING_OP_URING_CMD
for raw NVMe commands, featuring big SQE (128 bytes) / CQE (32 bytes)
support for larger command structures.

Unlike block device interface it requires nvme namespace character device
(/dev/ngXnY)

Signed-off-by: Ankit Kumar <ankit.kumar@samsung.com>
Signed-off-by: Daegyu Han <daegyu94.han@samsung.com>
Signed-off-by: Dongjoo Seo <dongjoo.seo1@samsung.com>

* rust Add io_uring command read write support

NVMe io_uring command utilizes
 * big Submission Queue Entries 128 bytes, standard is 64 bytes
 * and, big Completion Queue Entries 32 bytes, standard is 16 bytes.

The NVMe command is embedded within the last 80 bytes of the submission
queue entry.

The io_uring worker thread has been rebased for better readability.

The nvme namespace character device doesn't support I/O
sizes greater than /sys/block/nvmeXnY/queue/max_hw_sectors_kb
Usually the block layer handles I/O split for any requests
larger than this limitation, which is not there for char devices

To handle this add support for I/O splitting based on user specified
maximum data transfer limit. "rust_raw_block.max_data_transfer_size"
If not specified the commands will be auto split based to the queue
max_hw_sectors_kb limit.

Added comprehensive test suite for uring_command.
Expanded the raw block l2_adpater tests.

Signed-off-by: Ankit Kumar <ankit.kumar@samsung.com>
Signed-off-by: Daegyu Han <daegyu94.han@samsung.com>
Signed-off-by: Dongjoo Seo <dongjoo.seo1@samsung.com>

* doc: Add missing io_uring and use_uring_cmd docs

Signed-off-by: Ankit Kumar <ankit.kumar@samsung.com>

* fix (rawblock): callback for succeeded keys before raising error

Signed-off-by: Ankit Kumar <ankit.kumar@samsung.com>

---------

Signed-off-by: Ankit Kumar <ankit.kumar@samsung.com>
Signed-off-by: Dongjoo Seo <dongjoo.seo1@samsung.com>
Signed-off-by: Daegyu Han <daegyu94.han@samsung.com>
---
 benchmarks/storage_backend_io/README.md       |   21 +-
 .../storage_backend_io_benchmark.py           |   43 +-
 docs/source/mp/l2_storage.rst                 |   21 +-
 .../l2_adapters/raw_block_l2_adapter.py       |   30 +-
 .../plugins/rust_raw_block_backend.py         |  115 +-
 lmcache/v1/storage_backend/raw_block/core.py  |  530 ++++++-
 rust/raw_block/README.md                      |   34 +-
 rust/raw_block/src/lib.rs                     | 1389 +++++++++++++----
 .../distributed/test_raw_block_l2_adapter.py  |   45 +
 .../test_raw_block_uring_cmd.py               |  320 ++++
 .../test_rust_raw_block_backend.py            |    1 +
 11 files changed, 2222 insertions(+), 327 deletions(-)
 create mode 100644 tests/v1/storage_backend/test_raw_block_uring_cmd.py

diff --git a/benchmarks/storage_backend_io/README.md b/benchmarks/storage_backend_io/README.md
index 5e1edac7fa..ca52b4dcee 100644
--- a/benchmarks/storage_backend_io/README.md
+++ b/benchmarks/storage_backend_io/README.md
@@ -5,6 +5,7 @@ This microbenchmark compares multiple storage backends under high write/read con
 - **RustRawBlockBackend** - Raw block device with optional O_DIRECT and io_uring
 - **Hf3fsBackend** - HF3FS remote storage
 - **FsBackend** - Filesystem connector backend
+- The **io_uring** backend can optionally use **io_uring_cmd** (NVMe passthrough) for direct device access.
 
 ## What It Measures
 
@@ -45,6 +46,19 @@ python benchmarks/storage_backend_io/storage_backend_io_benchmark.py \
   --verify-integrity \
   --output-json /tmp/storage_backend_io.json
 
+# Rust raw block backend with io_uring_cmd (write benchmark)
+# Note: io_uring_cmd requires NVMe character device node (/dev/ngXnY)
+python benchmarks/storage_backend_io/storage_backend_io_benchmark.py \
+  --num-ops 1024 \
+  --concurrency 4 \
+  --backend rust_raw_block \
+  --raw-device /dev/ng0n1 \
+  --chunk-size 256 \
+  --alignment 4096 \
+  --use-uring \
+  --use-uring-cmd \
+  --output-json /tmp/rust_raw_block_uring_cmd.json
+
 # HF3FS backend (write benchmark)
 python benchmarks/storage_backend_io/storage_backend_io_benchmark.py \
   --num-ops 512 \
@@ -89,6 +103,10 @@ python benchmarks/storage_backend_io/storage_backend_io_benchmark.py \
 - When `--local-disk-odirect` is enabled, the benchmark allocates **page-aligned** buffers to avoid EINVAL from O_DIRECT.
 - Local disk backend uses its internal worker pool; completion is tracked via callbacks.
 - Rust raw block benchmark uses a unique manifest path per run to avoid stale-index reuse between runs.
+- For io_uring there is a limit on the number of fixed buffers that can be registered. For unprivileged users its 16384.
+- Buffer registration and de-registration is time consuming.
+- **io_uring_cmd** requires using the NVMe character device node (e.g., `/dev/ng0n1`) instead of the block device node (e.g., `/dev/nvme0n1`).
+- **io_uring_cmd** requires io_uring as the underlying I/O engine.
 
 ## How To Compare On Real NVMe
 
@@ -201,7 +219,8 @@ For write+read benchmarks (`--write_bench False`):
     "verify_integrity": true,
     "integrity_errors": 0,
     "integrity_passed": true,
-    "use_uring": false
+    "use_uring": false,
+    "use_uring_cmd": false
   }
 ]
 ```
diff --git a/benchmarks/storage_backend_io/storage_backend_io_benchmark.py b/benchmarks/storage_backend_io/storage_backend_io_benchmark.py
index 10d6c7be5f..bfcde62ebc 100644
--- a/benchmarks/storage_backend_io/storage_backend_io_benchmark.py
+++ b/benchmarks/storage_backend_io/storage_backend_io_benchmark.py
@@ -708,8 +708,10 @@ def __init__(
         cleanup_raw_device: bool,
         write_bench: bool,
         use_uring: bool,
+        use_uring_cmd: bool,
         chunk_size: int = DEFAULT_CHUNK_SIZE,
         verify_integrity: bool = False,
+        max_data_transfer_size: int = 0,
     ):
         super().__init__(
             "rust_raw_block",
@@ -728,6 +730,8 @@ def __init__(
         self._temp_dir: Optional[str] = None
         self._manifest_path: Optional[str] = None
         self.use_uring = use_uring
+        self.use_uring_cmd = use_uring_cmd
+        self.max_data_transfer_size = max_data_transfer_size
         # Create manifest path
         self._manifest_path = os.path.join(
             tempfile.gettempdir(),
@@ -744,11 +748,14 @@ def extra_config_keys(self) -> dict:
             "rust_raw_block.manifest_path": self._manifest_path,
             "rust_raw_block.manifest_write_interval": 0,
             "rust_raw_block.use_uring": self.use_uring,
+            "rust_raw_block.use_uring_cmd": self.use_uring_cmd,
+            "rust_raw_block.max_data_transfer_size": self.max_data_transfer_size,
         }
 
     def _setup_device(self) -> None:
         """Setup raw block device or temp file."""
         is_block_device = False
+        is_char_device = False
         self._temp_dir = None
         prefix = "write" if self.write_bench else "read"
         prefix = prefix + "raw_block_bench_"
@@ -756,16 +763,18 @@ def _setup_device(self) -> None:
             try:
                 st_mode = os.stat(self.raw_device).st_mode
                 is_block_device = stat.S_ISBLK(st_mode)
+                is_char_device = stat.S_ISCHR(st_mode)
             except FileNotFoundError:
                 is_block_device = False
+                is_char_device = False
 
         # Create temp file if no device specified
         if not self.raw_device:
             self._temp_dir = tempfile.mkdtemp(prefix)
             self.raw_device = os.path.join(self._temp_dir, "raw_block.bin")
 
-        # Truncate if not a real block device
-        if self.raw_device and not is_block_device:
+        # Truncate if not a real block device or character device
+        if self.raw_device and not is_block_device and not is_char_device:
             with open(self.raw_device, "wb") as f:
                 f.truncate(int(self.raw_device_size_gb * 1024**3))
 
@@ -823,7 +832,10 @@ def _close_backend(self) -> None:
 
     def _get_extra_result_fields(self) -> dict:
         """Get extra fields for RustRawBlockBackend benchmark results."""
-        return {"use_uring": self.use_uring}
+        return {
+            "use_uring": self.use_uring,
+            "use_uring_cmd": self.use_uring_cmd,
+        }
 
     def _cleanup_device(self) -> None:
         """Cleanup temp files."""
@@ -1006,6 +1018,25 @@ def main() -> None:
         action="store_true",
         help="Enable io_uring for raw block backend",
     )
+    parser.add_argument(
+        "--use-uring-cmd",
+        action="store_true",
+        help=(
+            "Enable io_uring_cmd for raw block backend. "
+            "This automatically enables --use-uring. "
+            "Must use nvme character device node (/dev/ngXnY)"
+        ),
+    )
+    parser.add_argument(
+        "--max-data-transfer-size",
+        type=int,
+        default=0,
+        help=(
+            "Maximum data transfer size for use_uring_cmd. "
+            " > 0: Split based on the specified limit. "
+            " <= 0: Split based on device reported max hardware sector size"
+        ),
+    )
     parser.add_argument(
         "--verify-integrity",
         action="store_true",
@@ -1058,6 +1089,10 @@ def main() -> None:
 
     args = parser.parse_args()
 
+    # use_uring_cmd requires io_uring as io_engine
+    if args.use_uring_cmd:
+        args.use_uring = True
+
     # write_bench defaults to True (write benchmark), set to False for read benchmark
     write_bench = args.write_bench.lower() in ("true", "1", "yes", "y", "")
 
@@ -1098,8 +1133,10 @@ def main() -> None:
             cleanup_raw_device=cleanup_raw_device,
             write_bench=write_bench,
             use_uring=args.use_uring,
+            use_uring_cmd=args.use_uring_cmd,
             chunk_size=args.chunk_size,
             verify_integrity=args.verify_integrity,
+            max_data_transfer_size=args.max_data_transfer_size,
         )
         result = rustraw_bench.run()
         result["raw_device"] = raw_device
diff --git a/docs/source/mp/l2_storage.rst b/docs/source/mp/l2_storage.rst
index 0ac34d4cf0..5d92746cc5 100644
--- a/docs/source/mp/l2_storage.rst
+++ b/docs/source/mp/l2_storage.rst
@@ -366,7 +366,13 @@ caller-provided load buffers during prefetch.
 - ``io_engine``: Rust raw-block I/O engine. Valid values are ``"posix"``
   (default synchronous ``pread``/``pwrite`` path), ``"io_uring"`` (direct Rust
   io_uring syscall path).
+- ``use_uring_cmd``: Enable NVMe passthrough via io_uring command interface
+  for direct device access. Requires ``io_engine="io_uring"`` and NVMe
+  character device node (e.g., ``/dev/ng0n1``).
 - ``iouring_queue_depth``: Queue depth for ``io_engine="io_uring"``.
+- ``max_data_transfer_size``: Maximum data transfer size for
+  ``use_uring_cmd=true``. Large transfers are split into smaller chunks
+  that fit within device limits.
 - ``num_store_workers`` / ``num_lookup_workers`` / ``num_load_workers``:
   Worker-thread counts for each operation type.
 
@@ -374,22 +380,35 @@ caller-provided load buffers during prefetch.
 
 - ``raw_block`` is a server-owned MP adapter. It does **not** support
   per-TP device-path mappings in MP mode.
-- ``raw_block`` remains ``"type": "raw_block"`` for both supported engines.
+- ``raw_block`` remains ``"type": "raw_block"`` for all supported engines.
 - ``raw_block`` owns on-device slot allocation, checkpointing, and recovery
   through ``RawBlockCore``. Slot reclamation is driven by the shared/global
   L2 eviction controller or explicit ``delete()`` calls.
 - If ``use_odirect`` is enabled, the server's ``--l1-align-bytes`` should be
   at least ``block_align``.
 - ``persist_enabled`` must remain ``true`` for this adapter.
+- For ``use_uring_cmd=true``, ``device_path`` must use the NVMe character
+  device node (e.g., ``/dev/ng0n1``) instead of the block device node
+  (``/dev/nvme0n1``). The character device provides direct NVMe
+  command passthrough.
+- ``use_uring_cmd`` requires ``io_engine="io_uring"`` to be set.
+- When ``use_uring_cmd=true``, ``use_odirect`` is ignored for NVMe namespace
+  character devices.
 
 **Configuration examples:**
 
 .. code-block:: bash
 
+    # Basic raw_block with posix I/O
     --l2-adapter '{"type": "raw_block", "device_path": "/dev/nvme0n1", "slot_bytes": 1048576, "block_align": 4096, "header_bytes": 4096, "meta_total_bytes": 268435456, "use_odirect": true, "num_store_workers": 2, "num_lookup_workers": 1, "num_load_workers": 4}'
 
+    # With io_uring
     --l2-adapter '{"type": "raw_block", "device_path": "/dev/nvme0n1", "slot_bytes": 1048576, "io_engine": "io_uring", "iouring_queue_depth": 256, "use_odirect": true}'
 
+    # With io_uring_cmd (NVMe passthrough)
+    --l2-adapter '{"type": "raw_block", "device_path": "/dev/ng0n1", "slot_bytes": 1048576, "io_engine": "io_uring", "use_uring_cmd": true, "iouring_queue_depth": 256, "max_data_transfer_size": 131072, "use_odirect": false}'
+
+    # With eviction
     --l2-adapter '{"type": "raw_block", "device_path": "/dev/nvme0n1", "slot_bytes": 1048576, "load_checkpoint_on_init": false, "eviction": {"eviction_policy": "LRU", "trigger_watermark": 0.9, "eviction_ratio": 0.1}}'
 
 ``mooncake_store`` -- Mooncake Store native connector
diff --git a/lmcache/v1/distributed/l2_adapters/raw_block_l2_adapter.py b/lmcache/v1/distributed/l2_adapters/raw_block_l2_adapter.py
index 558df6907c..7f57a7979a 100644
--- a/lmcache/v1/distributed/l2_adapters/raw_block_l2_adapter.py
+++ b/lmcache/v1/distributed/l2_adapters/raw_block_l2_adapter.py
@@ -86,6 +86,8 @@ def __init__(
         enable_zero_copy: bool = True,
         io_engine: str = "posix",
         iouring_queue_depth: int = DEFAULT_IOURING_QUEUE_DEPTH,
+        use_uring_cmd: bool = False,
+        max_data_transfer_size: int = 0,
         num_store_workers: int = 2,
         num_lookup_workers: int = 1,
         num_load_workers: int = 4,
@@ -110,6 +112,8 @@ def __init__(
             enable_zero_copy: Whether to use aligned direct-buffer I/O.
             io_engine: Raw-block I/O engine: ``"posix"`` or ``"io_uring"``.
             iouring_queue_depth: Queue depth for the Rust io_uring engine.
+            use_uring_cmd: Whether to use NVMe io_uring_cmd passthrough.
+            max_data_transfer_size: Max data transfer size for a single request.
             num_store_workers: Number of store worker threads.
             num_lookup_workers: Number of lookup worker threads.
             num_load_workers: Number of load worker threads.
@@ -135,6 +139,8 @@ def __init__(
         validate_raw_block_io_options(
             iouring_queue_depth=self.iouring_queue_depth,
         )
+        self.use_uring_cmd = bool(use_uring_cmd)
+        self.max_data_transfer_size = int(max_data_transfer_size)
         self.num_store_workers = int(num_store_workers)
         self.num_lookup_workers = int(num_lookup_workers)
         self.num_load_workers = int(num_load_workers)
@@ -168,6 +174,8 @@ def from_dict(cls, d: dict) -> "RawBlockL2AdapterConfig":
         iouring_queue_depth = int(
             d.get("iouring_queue_depth", DEFAULT_IOURING_QUEUE_DEPTH)
         )
+        use_uring_cmd = bool(d.get("use_uring_cmd", False))
+        max_data_transfer_size = int(d.get("max_data_transfer_size", 0))
 
         if block_align <= 0:
             raise ValueError("block_align must be > 0")
@@ -184,6 +192,8 @@ def from_dict(cls, d: dict) -> "RawBlockL2AdapterConfig":
         validate_raw_block_io_options(
             iouring_queue_depth=iouring_queue_depth,
         )
+        if use_uring_cmd and io_engine != "io_uring":
+            raise ValueError("use_uring_cmd requires io_uring io_engine")
 
         worker_defaults = {
             "num_store_workers": 2,
@@ -215,6 +225,8 @@ def from_dict(cls, d: dict) -> "RawBlockL2AdapterConfig":
             enable_zero_copy=bool(d.get("enable_zero_copy", True)),
             io_engine=io_engine,
             iouring_queue_depth=iouring_queue_depth,
+            use_uring_cmd=use_uring_cmd,
+            max_data_transfer_size=max_data_transfer_size,
             num_store_workers=worker_counts["num_store_workers"],
             num_lookup_workers=worker_counts["num_lookup_workers"],
             num_load_workers=worker_counts["num_load_workers"],
@@ -251,6 +263,11 @@ def help(cls) -> str:
             "- io_engine (str): posix or io_uring (default posix)\n"
             "- iouring_queue_depth (int): Rust io_uring queue depth "
             f"(default {DEFAULT_IOURING_QUEUE_DEPTH})\n"
+            "- use_uring_cmd (bool): enable NVMe io_uring_cmd path "
+            "(default false, requires io_uring as the io_engine)\n"
+            "- max_data_transfer_size (int): for a single I/O request "
+            "(0: (default) auto detect limit splitting, > 0: explicit split, "
+            "< 0: auto detect limit splitting)\n"
             "- num_store_workers (int): store worker threads (default 2)\n"
             "- num_lookup_workers (int): lookup worker threads (default 1)\n"
             "- num_load_workers (int): load worker threads (default 4)"
@@ -276,6 +293,8 @@ def to_core_config(self) -> RawBlockCoreConfig:
             meta_verify_on_load=self.meta_verify_on_load,
             io_engine=self.io_engine,
             iouring_queue_depth=self.iouring_queue_depth,
+            use_uring_cmd=self.use_uring_cmd,
+            max_data_transfer_size=self.max_data_transfer_size,
         )
 
 
@@ -305,12 +324,13 @@ def __init__(
         """
         super().__init__()
         if (
-            config.use_odirect
+            (config.use_odirect or config.io_engine == "io_uring")
             and l1_memory_desc is not None
             and l1_memory_desc.align_bytes < config.block_align
         ):
             raise ValueError(
-                "raw_block requires l1_align_bytes >= block_align when use_odirect=true"
+                "raw_block requires l1_align_bytes >= block_align when "
+                "use_odirect=true or io_engine=io_uring"
             )
 
         self._closed = False
@@ -324,6 +344,12 @@ def __init__(
 
         try:
             self._core = RawBlockCore(config.to_core_config(), key_namespace="object")
+            if config.io_engine == "io_uring":
+                logger.warning(
+                    "RawBlockL2Adapter: MP raw_block uses io_uring without "
+                    "fixed-buffer registration; zero-copy fixed buffers are "
+                    "disabled unless registered by a future MP allocator path"
+                )
             self._max_capacity_bytes = int(
                 self._core.report_status().get("usable_capacity_bytes", 0)
             )
diff --git a/lmcache/v1/storage_backend/plugins/rust_raw_block_backend.py b/lmcache/v1/storage_backend/plugins/rust_raw_block_backend.py
index 27c9e9265a..b19f6828f0 100644
--- a/lmcache/v1/storage_backend/plugins/rust_raw_block_backend.py
+++ b/lmcache/v1/storage_backend/plugins/rust_raw_block_backend.py
@@ -139,6 +139,17 @@ def __init__(
             self._build_core_config(extra),
             key_namespace="legacy",
         )
+        if self._core.io_engine == "io_uring":
+            try:
+                self._core.register_fixed_buffers_from_allocator(
+                    self.local_cpu_backend.get_memory_allocator()
+                )
+            except Exception as e:
+                logger.warning(
+                    "RustRawBlockBackend: failed to register io_uring fixed "
+                    "buffers: %s. Falling back to non-fixed buffer mode.",
+                    e,
+                )
         self._warn_if_loaded_metadata_looks_cross_rank()
 
         self._put_lock = threading.Lock()
@@ -189,6 +200,16 @@ def data_base_offset(self) -> int:
         """Return the byte offset where data slots begin."""
         return self._core.data_base_offset()
 
+    @property
+    def _raw(self) -> Any:
+        """Return the raw device handle for legacy test compatibility."""
+        return self._core.raw_device()
+
+    @_raw.setter
+    def _raw(self, raw_device: Any) -> None:
+        """Replace the raw device handle for legacy test compatibility."""
+        self._core.set_raw_device_for_testing(raw_device)
+
     def lock_refcount(self, encoded_key: str) -> int:
         """Return the L2 lock refcount for a legacy encoded key."""
         return self._core.lock_refcount(encoded_key)
@@ -227,6 +248,10 @@ def _build_core_config(self, extra: Mapping[str, Any]) -> RawBlockCoreConfig:
         iouring_queue_depth = int(
             extra.get("rust_raw_block.iouring_queue_depth", DEFAULT_IOURING_QUEUE_DEPTH)
         )
+        use_uring_cmd = bool(extra.get("rust_raw_block.use_uring_cmd", False))
+        max_data_transfer_size = int(
+            extra.get("rust_raw_block.max_data_transfer_size", 0)
+        )
         validate_raw_block_io_options(
             iouring_queue_depth=iouring_queue_depth,
         )
@@ -285,8 +310,10 @@ def _build_core_config(self, extra: Mapping[str, Any]) -> RawBlockCoreConfig:
             meta_verify_on_load=bool(
                 extra.get("rust_raw_block.meta_verify_on_load", True)
             ),
+            max_data_transfer_size=max_data_transfer_size,
             io_engine=io_engine,
             iouring_queue_depth=iouring_queue_depth,
+            use_uring_cmd=use_uring_cmd,
         )
 
     def _warn_if_loaded_metadata_looks_cross_rank(self) -> None:
@@ -352,7 +379,11 @@ def batched_submit_put_task(
         on_complete_callback: Optional[Callable[[CacheEngineKey], None]] = None,
     ) -> list[Future] | None:
         del transfer_spec
-        futures: list[Future] = []
+        loop = self.loop
+        if loop is None:
+            raise RuntimeError("RustRawBlockBackend requires an asyncio event loop")
+
+        pending: list[tuple[CacheEngineKey, RawBlockKeySpec, MemoryObj]] = []
         for key, obj in zip(keys, objs, strict=False):
             with self._put_lock:
                 if key in self._put_tasks:
@@ -370,16 +401,27 @@ def batched_submit_put_task(
                 continue
 
             obj.ref_count_up()
-            loop = self.loop
-            if loop is None:
-                obj.ref_count_down()
-                raise RuntimeError("RustRawBlockBackend requires an asyncio event loop")
+            pending.append((key, spec, obj))
+
+        if not pending:
+            return None
+
+        if self._core.io_engine == "io_uring" and len(pending) > 1:
             fut = asyncio.run_coroutine_threadsafe(
-                self._submit_put_one(key, spec, obj, on_complete_callback),
+                self._submit_put_many(pending, on_complete_callback),
                 loop,
             )
-            futures.append(fut)
-        return futures or None
+            return [fut]
+
+        futures: list[Future] = []
+        for key, spec, obj in pending:
+            futures.append(
+                asyncio.run_coroutine_threadsafe(
+                    self._submit_put_one(key, spec, obj, on_complete_callback),
+                    loop,
+                )
+            )
+        return futures
 
     async def _submit_put_one(
         self,
@@ -406,6 +448,63 @@ async def _submit_put_one(
             with self._put_lock:
                 self._put_tasks.discard(key)
 
+    async def _submit_put_many(
+        self,
+        pending: Sequence[tuple[CacheEngineKey, RawBlockKeySpec, MemoryObj]],
+        on_complete_callback: Optional[Callable[[CacheEngineKey], None]],
+    ) -> None:
+        """Persist multiple legacy raw-block keys in one background batch.
+
+        Args:
+            pending: Ordered ``(key, spec, memory_obj)`` tuples to persist.
+            on_complete_callback: Optional per-key completion callback.
+
+        Raises:
+            RuntimeError: If any key fails to persist.
+            Exception: Propagates raw-device write failures from the core.
+        """
+        keys = [item[0] for item in pending]
+        specs = [item[1] for item in pending]
+        memory_objs = [item[2] for item in pending]
+        try:
+            put_result = await asyncio.to_thread(
+                self._core.put_many,
+                specs,
+                memory_objs,
+            )
+            if len(put_result.results) != len(pending) or not all(put_result.results):
+                failed = []
+
+                for key, spec, ok in zip(keys, specs, put_result.results, strict=False):
+                    if ok:
+                        if on_complete_callback is not None:
+                            try:
+                                on_complete_callback(key)
+                            except Exception as e:
+                                logger.warning(
+                                    "on_complete_callback failed for key %s: %s", key, e
+                                )
+                    else:
+                        failed.append(spec.encoded)
+
+                if failed:
+                    raise RuntimeError(
+                        "Failed to persist raw-block keys: " + ", ".join(failed)
+                    )
+            if on_complete_callback is not None:
+                for key in keys:
+                    try:
+                        on_complete_callback(key)
+                    except Exception as e:
+                        logger.warning(
+                            "on_complete_callback failed for key %s: %s", key, e
+                        )
+        finally:
+            for key, _spec, memory_obj in pending:
+                memory_obj.ref_count_down()
+                with self._put_lock:
+                    self._put_tasks.discard(key)
+
     def _batched_get_prefix(
         self,
         keys: Sequence[CacheEngineKey],
diff --git a/lmcache/v1/storage_backend/raw_block/core.py b/lmcache/v1/storage_backend/raw_block/core.py
index 903295aba4..d48e5e1cbe 100644
--- a/lmcache/v1/storage_backend/raw_block/core.py
+++ b/lmcache/v1/storage_backend/raw_block/core.py
@@ -9,6 +9,9 @@
 from typing import Any, Optional
 import ctypes
 import json
+import os
+import re
+import stat
 import struct
 import threading
 import time
@@ -104,6 +107,25 @@ def validate_raw_block_io_options(
         raise ValueError("iouring_queue_depth must be > 0")
 
 
+def _resolve_sysfs_queue_dir(device_path: str) -> Optional[str]:
+    """Resolve sysfs queue directory for NVMe character device paths."""
+    base_name = os.path.basename(device_path)
+    match = re.fullmatch(r"ng(\d+)n(\d+)", base_name)
+    if match:
+        ctrl, nsid = match.groups()
+        return f"/sys/block/nvme{ctrl}n{nsid}/queue"
+    return None
+
+
+def _read_sysfs_int(path: str) -> Optional[int]:
+    """Read an integer value from sysfs and return None on failure."""
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            return int(f.read().strip())
+    except Exception:
+        return None
+
+
 @dataclass(frozen=True)
 class RawBlockCoreConfig:
     """Configuration for RawBlockCore device layout, I/O, and checkpoints."""
@@ -122,9 +144,11 @@ class RawBlockCoreConfig:
     meta_idle_quiet_ms: int
     meta_enable_periodic: bool
     meta_verify_on_load: bool
+    max_data_transfer_size: int = 0
     load_checkpoint_on_init: bool = True
     io_engine: str = "posix"
     iouring_queue_depth: int = DEFAULT_IOURING_QUEUE_DEPTH
+    use_uring_cmd: bool = False
 
 
 @dataclass
@@ -197,6 +221,13 @@ def __init__(
         self.meta_verify_on_load = bool(config.meta_verify_on_load)
         self.io_engine = normalize_raw_block_io_engine(config.io_engine)
         self.iouring_queue_depth = int(config.iouring_queue_depth)
+        self.use_uring_cmd = bool(config.use_uring_cmd)
+        if self.use_uring_cmd and self.use_odirect:
+            logger.warning(
+                "RawBlockCore: use_odirect is ignored for NVMe namespace "
+                "character devices when use_uring_cmd=true"
+            )
+            self.use_odirect = False
         self.key_namespace = key_namespace
 
         if not self.device_path:
@@ -222,6 +253,38 @@ def __init__(
         validate_raw_block_io_options(
             iouring_queue_depth=self.iouring_queue_depth,
         )
+        if self.use_uring_cmd and self.io_engine != "io_uring":
+            raise ValueError("use_uring_cmd requires io_uring as io_engine")
+        if self.use_uring_cmd:
+            try:
+                mode = os.stat(self.device_path).st_mode
+            except OSError as e:
+                raise ValueError(
+                    "use_uring_cmd requires an existing NVMe namespace "
+                    f"character device path, got {self.device_path!r}"
+                ) from e
+            if not stat.S_ISCHR(mode):
+                raise ValueError(
+                    "use_uring_cmd requires an NVMe namespace character device "
+                    f"(for example /dev/ng0n1), got {self.device_path!r}"
+                )
+            # Validate NVMe generic namespace naming pattern (ng<ctrl>n<ns>)
+            basename = os.path.basename(self.device_path)
+            if not re.match(r"^ng\d+n\d+$", basename):
+                raise ValueError(
+                    "use_uring_cmd requires an NVMe generic namespace character device "
+                    f"with naming pattern ng<ctrl>n<ns> (for example /dev/ng0n1), "
+                    f"got {self.device_path!r}"
+                )
+
+        # Maximum data transfer size for a single I/O request.
+        # Default is 0 (no splitting).
+        # > 0 : explicit manual split size
+        # <= 0: opt-in auto-detect from device queue limits
+        if self.use_uring_cmd:
+            self.max_data_transfer_size = self._resolve_max_data_transfer_size(
+                config.max_data_transfer_size
+            )
 
         try:
             self.meta_magic_text = self.meta_magic.decode("ascii")
@@ -277,6 +340,69 @@ def __init__(
             self._cleanup_after_init_failure()
             raise
 
+    @property
+    def _requires_transfer_alignment(self) -> bool:
+        """Return whether I/O transfers require block alignment.
+
+        Returns:
+            True when transfers must be aligned to ``self.block_align``.
+            This is required for O_DIRECT I/O and for io_uring_cmd operations.
+        """
+        return self.use_odirect or self.use_uring_cmd
+
+    def _resolve_max_data_transfer_size(self, configured_size: int) -> int:
+        """Resolve transfer split size from config or NVMe sysfs queue limits.
+
+        Args:
+            configured_size: Explicitly configured max data transfer size in bytes.
+                If > 0, this value is used directly. If <= 0, the size is
+                auto-detected from device queue limits.
+
+        Returns:
+            The resolved max data transfer size in bytes, guaranteed to be
+            a multiple of ``self.block_align``.
+
+        Raises:
+            ValueError: If ``configured_size`` is > 0 but not a multiple of
+                ``self.block_align``.
+        """
+        if configured_size > 0:
+            if configured_size % self.block_align != 0:
+                raise ValueError(
+                    f"max_data_transfer_size ({configured_size}) must be a "
+                    f"multiple of block_align ({self.block_align})"
+                )
+            return configured_size
+
+        queue_dir = _resolve_sysfs_queue_dir(self.device_path)
+        if queue_dir is None:
+            raise RuntimeError(
+                "RustRawBlockBackend: unable to derive NVMe sysfs queue path from "
+                "NVMe character device path "
+                f"{self.device_path} for auto max_data_transfer_size"
+            )
+
+        max_hw_sectors_kb = _read_sysfs_int(f"{queue_dir}/max_hw_sectors_kb")
+        if max_hw_sectors_kb is None or max_hw_sectors_kb <= 0:
+            raise RuntimeError(
+                "RustRawBlockBackend: failed to read max_hw_sectors_kb from "
+                f"{queue_dir} for auto max_data_transfer_size"
+            )
+
+        resolved_bytes = max_hw_sectors_kb * 1024
+        aligned_bytes = (resolved_bytes // self.block_align) * self.block_align
+        if aligned_bytes <= 0:
+            aligned_bytes = self.block_align
+
+        logger.info(
+            "RustRawBlockBackend: auto max_data_transfer_size=%d bytes "
+            "(device=%s, max_hw_sectors_kb=%s)",
+            aligned_bytes,
+            self.device_path,
+            max_hw_sectors_kb,
+        )
+        return aligned_bytes
+
     def _rawdev(self):
         """Return the lazily opened Rust raw-block device binding."""
         if self._raw is None:
@@ -295,9 +421,63 @@ def _rawdev(self):
                 alignment=self.block_align,
                 io_engine=self.io_engine,
                 iouring_queue_depth=self.iouring_queue_depth,
+                use_uring_cmd=self.use_uring_cmd,
             )
         return self._raw
 
+    def raw_device(self) -> Any:
+        """Return the lazily opened Rust raw-block device.
+
+        Returns:
+            The underlying Rust ``RawBlockDevice`` object.
+
+        Raises:
+            Exception: Propagates raw-device open errors from the Rust binding.
+        """
+        return self._rawdev()
+
+    def set_raw_device_for_testing(self, raw_device: Any) -> None:
+        """Replace the raw device handle used by this core.
+
+        Args:
+            raw_device: Object implementing the Rust raw-device methods.
+        """
+        self._raw = raw_device
+
+    def register_fixed_buffers_from_allocator(self, memory_allocator: Any) -> None:
+        """Register allocator pages with io_uring when the allocator exposes them.
+
+        Args:
+            memory_allocator: Local CPU allocator that may expose
+                ``get_paged_buffers()``.
+
+        Raises:
+            Exception: Propagates Rust registration errors after logging.
+        """
+        if self.io_engine != "io_uring":
+            return
+        paged_buffers = getattr(memory_allocator, "get_paged_buffers", None)
+        if not callable(paged_buffers):
+            logger.warning(
+                "RawBlockCore: allocator does not expose paged buffers; "
+                "io_uring fixed-buffer zero-copy is disabled"
+            )
+            return
+        buffers = paged_buffers()
+        if not buffers:
+            logger.warning(
+                "RawBlockCore: allocator returned no paged buffers; "
+                "io_uring fixed-buffer zero-copy is disabled"
+            )
+            return
+        buffer_ptrs = [buf.data_ptr() for buf in buffers]
+        buffer_sizes = [buf.numel() * buf.element_size() for buf in buffers]
+        self._rawdev().register_fixed_buffers(buffer_ptrs, buffer_sizes)
+        logger.info(
+            "RawBlockCore: registered %d paged buffers for io_uring fixed I/O",
+            len(buffers),
+        )
+
     def contains_key(self, encoded_key: str, *, lock: bool = False) -> bool:
         """Return whether one encoded key is present in the raw-block index.
 
@@ -585,7 +765,6 @@ def load_many_into(
 
         results = [False] * len(encoded_keys)
         try:
-            raw_dev = self._rawdev()
             for i, (encoded_key, entry) in enumerate(items):
                 if entry is None:
                     continue
@@ -593,7 +772,7 @@ def load_many_into(
                     payload_len = int(entry.size)
                     total_len = (
                         round_up(payload_len, self.block_align)
-                        if self.use_odirect
+                        if self._requires_transfer_alignment
                         else payload_len
                     )
                     buf = memoryview(objs[i].byte_array)
@@ -610,18 +789,22 @@ def load_many_into(
                         zero_tail=False,
                     )
                     if direct_view is not None:
-                        raw_dev.pread_into(
-                            entry.offset + self.header_bytes,
-                            direct_view,
-                            total_len if len(direct_view) >= total_len else payload_len,
-                            total_len,
+                        self._read_buffers(
+                            [entry.offset + self.header_bytes],
+                            [direct_view],
+                            [
+                                total_len
+                                if len(direct_view) >= total_len
+                                else payload_len
+                            ],
+                            [total_len],
                         )
                     else:
-                        raw_dev.pread_into(
-                            entry.offset + self.header_bytes,
-                            buf,
-                            payload_len,
-                            total_len,
+                        self._read_buffers(
+                            [entry.offset + self.header_bytes],
+                            [buf],
+                            [payload_len],
+                            [total_len],
                         )
                     objs[i].metadata.cached_positions = entry.meta.cached_positions
                     results[i] = True
@@ -749,6 +932,7 @@ def report_status(self) -> dict:
                 "enable_zero_copy": self.enable_zero_copy,
                 "io_engine": self.io_engine,
                 "iouring_queue_depth": self.iouring_queue_depth,
+                "use_uring_cmd": self.use_uring_cmd,
             }
 
     def close(self) -> None:
@@ -795,6 +979,39 @@ def _cleanup_after_init_failure(self) -> None:
                 self._raw = None
         self._closed = True
 
+    def _byte_view(self, buf: Any) -> memoryview:
+        """Return a byte-addressable memoryview over a Python buffer.
+
+        Args:
+            buf: Object implementing the Python buffer protocol.
+
+        Returns:
+            A memoryview with one-byte elements.
+
+        Raises:
+            TypeError: If ``buf`` does not expose a compatible contiguous buffer.
+        """
+        view = buf if isinstance(buf, memoryview) else memoryview(buf)
+        if view.itemsize == 1 and view.format in ("B", "b", "c"):
+            return view
+        return view.cast("B")
+
+    def _is_buffer_aligned(self, buf: Any) -> bool:
+        """Check if a buffer is aligned to the block alignment boundary.
+
+        Args:
+            buf: Object implementing the Python buffer protocol.
+
+        Returns:
+            True if the buffer is aligned, False otherwise.
+        """
+        if not self.use_odirect:
+            return True
+        view = self._byte_view(buf)
+        # Check if the buffer pointer is aligned
+        ptr = ctypes.addressof((ctypes.c_byte * 1).from_buffer(view))
+        return ptr % self.block_align == 0
+
     def _build_direct_odirect_view(
         self,
         memory_obj: MemoryObj,
@@ -874,11 +1091,11 @@ def _prepare_write_payload(self, memory_obj: MemoryObj) -> tuple[Any, int, int]:
                 f"{payload_capacity}"
             )
         total_len = payload_len
-        if self.use_odirect:
+        if self._requires_transfer_alignment:
             total_len = round_up(payload_len, self.block_align)
             if total_len > payload_capacity:
                 raise RuntimeError(
-                    f"O_DIRECT payload {total_len} exceeds slot capacity "
+                    f"Aligned payload {total_len} exceeds slot capacity "
                     f"{payload_capacity}"
                 )
             direct_view = self._build_direct_odirect_view(
@@ -892,6 +1109,244 @@ def _prepare_write_payload(self, memory_obj: MemoryObj) -> tuple[Any, int, int]:
                 buf = direct_view
         return buf, payload_len, total_len
 
+    def _validate_uring_cmd_chunk(self, offset: int, total_len: int) -> None:
+        """Validate one NVMe raw-command transfer range.
+
+        Args:
+            offset: Device byte offset for the transfer.
+            total_len: Transfer size in bytes.
+
+        Raises:
+            ValueError: If either value is not block aligned.
+        """
+        if offset % self.block_align != 0:
+            raise ValueError("io_uring_cmd requires aligned offsets")
+        if total_len % self.block_align != 0:
+            raise ValueError("io_uring_cmd requires aligned transfer lengths")
+
+    def _write_uring_cmd_buffers(
+        self,
+        offsets: Sequence[int],
+        buffers: Sequence[Any],
+        payload_lens: Sequence[int],
+        total_lens: Sequence[int],
+    ) -> None:
+        """Write buffers as bounded NVMe raw-command chunks.
+
+        Args:
+            offsets: Device offsets for each logical write.
+            buffers: Source buffers.
+            payload_lens: Logical source byte counts.
+            total_lens: Physical transfer sizes, including padding.
+
+        Raises:
+            ValueError: If lengths are inconsistent or unaligned.
+            Exception: Propagates Rust raw-device write errors.
+        """
+        raw_dev = self._rawdev()
+        chunk_offsets: list[int] = []
+        chunk_buffers: list[memoryview] = []
+        chunk_lens: list[int] = []
+        keepalive: list[memoryview] = []
+
+        for offset, buf, payload_len, total_len in zip(
+            offsets, buffers, payload_lens, total_lens, strict=True
+        ):
+            offset = int(offset)
+            payload_len = int(payload_len)
+            total_len = int(total_len)
+            self._validate_uring_cmd_chunk(offset, total_len)
+
+            view = self._byte_view(buf)
+            if len(view) < total_len:
+                if len(view) < payload_len:
+                    raise ValueError("input buffer shorter than payload_len")
+                padded = bytearray(total_len)
+                padded[:payload_len] = view[:payload_len]
+                view = memoryview(padded)
+            else:
+                view = view[:total_len]
+            keepalive.append(view)
+
+            cursor = 0
+            while cursor < total_len:
+                chunk_len = min(self.max_data_transfer_size, total_len - cursor)
+                self._validate_uring_cmd_chunk(offset + cursor, chunk_len)
+                chunk_offsets.append(offset + cursor)
+                chunk_buffers.append(view[cursor : cursor + chunk_len])
+                chunk_lens.append(chunk_len)
+                cursor += chunk_len
+
+        if not chunk_offsets:
+            return
+        batch_id = raw_dev.batched_write(
+            chunk_offsets,
+            chunk_buffers,
+            chunk_lens,
+        )
+        raw_dev.wait_iouring(batch_id)
+        keepalive.clear()
+
+    def _read_uring_cmd_buffers(
+        self,
+        offsets: Sequence[int],
+        buffers: Sequence[Any],
+        payload_lens: Sequence[int],
+        total_lens: Sequence[int],
+    ) -> None:
+        """Read buffers as bounded NVMe raw-command chunks.
+
+        Args:
+            offsets: Device offsets for each logical read.
+            buffers: Destination buffers.
+            payload_lens: Logical bytes to expose to callers.
+            total_lens: Physical transfer sizes, including padding.
+
+        Raises:
+            ValueError: If lengths are inconsistent or unaligned.
+            Exception: Propagates Rust raw-device read errors.
+        """
+        raw_dev = self._rawdev()
+        read_uring = raw_dev.read_uring
+
+        for offset, buf, payload_len, total_len in zip(
+            offsets, buffers, payload_lens, total_lens, strict=True
+        ):
+            offset = int(offset)
+            payload_len = int(payload_len)
+            total_len = int(total_len)
+            self._validate_uring_cmd_chunk(offset, total_len)
+
+            dst = self._byte_view(buf)
+            if len(dst) < total_len:
+                if len(dst) < payload_len:
+                    raise ValueError("output buffer shorter than payload_len")
+                target = memoryview(bytearray(total_len))
+                copy_back = True
+            else:
+                target = dst[:total_len]
+                copy_back = False
+
+            cursor = 0
+            while cursor < total_len:
+                chunk_len = min(self.max_data_transfer_size, total_len - cursor)
+                self._validate_uring_cmd_chunk(offset + cursor, chunk_len)
+                read_uring(
+                    offset + cursor,
+                    target[cursor : cursor + chunk_len],
+                    chunk_len,
+                    chunk_len,
+                )
+                cursor += chunk_len
+
+            if copy_back:
+                dst[:payload_len] = target[:payload_len]
+
+    def _write_buffers(
+        self,
+        offsets: Sequence[int],
+        buffers: Sequence[Any],
+        payload_lens: Sequence[int],
+        total_lens: Sequence[int],
+    ) -> None:
+        """Write one or more buffers through the configured Rust I/O path.
+
+        Args:
+            offsets: Device offsets for each write.
+            buffers: Python buffers to write.
+            payload_lens: Logical payload lengths for each buffer.
+            total_lens: Physical I/O lengths for each buffer.
+
+        Raises:
+            RuntimeError: If the requested io_uring mode is unavailable.
+            Exception: Propagates Rust raw-device write errors.
+        """
+        raw_dev = self._rawdev()
+        if self.io_engine != "io_uring":
+            for offset, buf, payload_len, total_len in zip(
+                offsets, buffers, payload_lens, total_lens, strict=True
+            ):
+                raw_dev.pwrite_from_buffer(offset, buf, payload_len, total_len)
+            return
+
+        if self.use_uring_cmd:
+            self._write_uring_cmd_buffers(
+                offsets,
+                buffers,
+                payload_lens,
+                total_lens,
+            )
+            return
+
+        can_batch = all(
+            int(payload_len) == int(total_len)
+            for payload_len, total_len in zip(payload_lens, total_lens, strict=True)
+        )
+        if can_batch:
+            batch_id = raw_dev.batched_write(
+                [int(offset) for offset in offsets],
+                list(buffers),
+                [int(total_len) for total_len in total_lens],
+            )
+            raw_dev.wait_iouring(batch_id)
+            return
+
+        for offset, buf, payload_len, total_len in zip(
+            offsets, buffers, payload_lens, total_lens, strict=True
+        ):
+            raw_dev.write_uring(int(offset), buf, int(payload_len), int(total_len))
+
+    def _read_buffers(
+        self,
+        offsets: Sequence[int],
+        buffers: Sequence[Any],
+        payload_lens: Sequence[int],
+        total_lens: Sequence[int],
+    ) -> None:
+        """Read one or more buffers through the configured Rust I/O path.
+
+        Args:
+            offsets: Device offsets for each read.
+            buffers: Destination Python buffers.
+            payload_lens: Logical payload lengths to expose to callers.
+            total_lens: Physical I/O lengths for each read.
+
+        Raises:
+            RuntimeError: If the requested io_uring mode is unavailable.
+            Exception: Propagates Rust raw-device read errors.
+        """
+        raw_dev = self._rawdev()
+        if self.io_engine != "io_uring":
+            for offset, buf, payload_len, total_len in zip(
+                offsets, buffers, payload_lens, total_lens, strict=True
+            ):
+                raw_dev.pread_into(offset, buf, payload_len, total_len)
+            return
+
+        if self.use_uring_cmd:
+            self._read_uring_cmd_buffers(offsets, buffers, payload_lens, total_lens)
+            return
+
+        can_batch = all(
+            int(payload_len) == int(total_len)
+            for payload_len, total_len in zip(payload_lens, total_lens, strict=True)
+        )
+        # batched_read requires aligned buffers when O_DIRECT is enabled
+        # Check alignment before using batched_read
+        if can_batch and all(self._is_buffer_aligned(buf) for buf in buffers):
+            batch_id = raw_dev.batched_read(
+                [int(offset) for offset in offsets],
+                list(buffers),
+                [int(total_len) for total_len in total_lens],
+            )
+            raw_dev.wait_iouring(batch_id)
+            return
+
+        for offset, buf, payload_len, total_len in zip(
+            offsets, buffers, payload_lens, total_lens, strict=True
+        ):
+            raw_dev.read_uring(int(offset), buf, int(payload_len), int(total_len))
+
     def _write_one(
         self, key: RawBlockKeySpec, memory_obj: MemoryObj, offset: int
     ) -> bool:
@@ -912,18 +1367,24 @@ def _write_one(
             with self._lock:
                 self._inflight_io_count += 1
             try:
-                raw_dev = self._rawdev()
                 hdr_total = (
                     round_up(len(header), self.block_align)
-                    if self.use_odirect
+                    if self._requires_transfer_alignment
                     else len(header)
                 )
-                raw_dev.pwrite_from_buffer(offset, header, len(header), hdr_total)
-                raw_dev.pwrite_from_buffer(
-                    offset + self.header_bytes,
-                    buf,
-                    payload_len,
-                    total_len,
+                header_buf: Any = header
+                if self.io_engine != "io_uring" and len(header) < hdr_total:
+                    padded_header = bytearray(header)
+                    padded_header.extend(b"\x00" * (hdr_total - len(header)))
+                    header_buf = padded_header
+                self._write_buffers(
+                    [offset, offset + self.header_bytes],
+                    [header_buf, buf],
+                    [
+                        hdr_total if self.io_engine == "io_uring" else len(header),
+                        payload_len,
+                    ],
+                    [hdr_total, total_len],
                 )
             finally:
                 with self._lock:
@@ -960,7 +1421,12 @@ def _read_slot_header(self, offset: int) -> Optional[tuple[int, int]]:
         try:
             with self._lock:
                 self._inflight_io_count += 1
-            self._rawdev().pread_into(offset, buf, self.header_bytes, self.header_bytes)
+            self._read_buffers(
+                [offset],
+                [buf],
+                [self.header_bytes],
+                [self.header_bytes],
+            )
             return self._decode_slot_header(buf)
         except Exception:
             return None
@@ -1040,8 +1506,11 @@ def _read_meta_header(self, container_offset: int) -> Optional[dict[str, int]]:
         """Read and validate a metadata checkpoint header."""
         buf = bytearray(self.block_align)
         try:
-            self._rawdev().pread_into(
-                container_offset, buf, self.block_align, self.block_align
+            self._read_buffers(
+                [container_offset],
+                [buf],
+                [self.block_align],
+                [self.block_align],
             )
         except Exception:
             return None
@@ -1068,7 +1537,7 @@ def _load_meta_payload(self, header: dict[str, int]) -> Optional[bytes]:
         total_len = round_up(payload_len, self.block_align)
         buf = bytearray(total_len)
         try:
-            self._rawdev().pread_into(payload_off, buf, payload_len, total_len)
+            self._read_buffers([payload_off], [buf], [payload_len], [total_len])
         except Exception:
             return None
 
@@ -1185,9 +1654,12 @@ def _write_checkpoint(self, payload: bytes, dirty_total_snapshot: int) -> bool:
             int(crc),
         )
 
-        raw = self._rawdev()
-        raw.pwrite_from_buffer(payload_off, payload, payload_len, payload_total_len)
-        raw.pwrite_from_buffer(target, header_block, self.block_align, self.block_align)
+        self._write_buffers(
+            [payload_off, target],
+            [payload, header_block],
+            [payload_len, self.block_align],
+            [payload_total_len, self.block_align],
+        )
 
         with self._lock:
             self._meta_seq = int(next_seq)
diff --git a/rust/raw_block/README.md b/rust/raw_block/README.md
index 8bd40cf022..7c788e8ff6 100644
--- a/rust/raw_block/README.md
+++ b/rust/raw_block/README.md
@@ -21,6 +21,21 @@ checkpointing, recovery, and MP task orchestration all live in Python.
 `use_iouring=True` remains accepted for backward compatibility. If `io_engine`
 is explicitly set, it wins over the legacy flag.
 
+## io_uring_cmd (NVMe Passthrough)
+
+When `io_engine="io_uring"`, you can optionally enable `use_uring_cmd=True` to
+use NVMe passthrough via the io_uring command interface for direct device access.
+
+**io_uring_cmd notes:**
+
+- Requires NVMe character device node (`/dev/ngXnY`) instead of the block device
+  node (`/dev/nvmeXnY`) for direct NVMe passthrough command.
+- Requires `io_engine="io_uring"` to be set.
+- Supports `max_data_transfer_size` parameter to split large transfers into
+  smaller chunks that fit within device limits.
+- When `use_uring_cmd=True`, `use_odirect` is ignored for NVMe namespace
+  character devices.
+
 ## MP Mode Integration
 
 In MP mode, the stack looks like this:
@@ -112,6 +127,21 @@ dev = RawBlockDevice(
 )
 ```
 
+io_uring with io_uring_cmd (NVMe passthrough):
+
+```python
+dev = RawBlockDevice(
+    "/dev/ng0n1",  # Note: NVMe character device node
+    True,
+    use_odirect=False,
+    alignment=4096,
+    io_engine="io_uring",
+    use_uring_cmd=True,
+    iouring_queue_depth=256,
+    max_data_transfer_size=131072,  # Optional: split large transfers
+)
+```
+
 ## MP Adapter Example
 
 To use the MP adapter from `lmcache server`, pass a `raw_block` L2 adapter
@@ -141,9 +171,11 @@ Notes:
 
 - `device_path` should point to an unmounted raw block device or a dedicated
   file used only by LMCache.
+- For `use_uring_cmd=true`, `device_path` must use the NVMe character
+  device node (e.g., `/dev/ng0n1`) instead of the block device node.
 - With `use_odirect=true`, LMCache MP L1 alignment must be at least
   `block_align`.
 - Restart recovery uses the metadata checkpoint region on the same device.
 - Raw-block slot reclamation is driven by the shared/global L2 eviction
   controller or explicit `delete()` calls.
-- `raw_block` remains the adapter type for both supported engines.
+- `raw_block` remains the adapter type for all supported engines.
diff --git a/rust/raw_block/src/lib.rs b/rust/raw_block/src/lib.rs
index f170de532a..8da60477eb 100644
--- a/rust/raw_block/src/lib.rs
+++ b/rust/raw_block/src/lib.rs
@@ -28,9 +28,151 @@ use std::sync::{Arc, Condvar, Mutex};
 use std::thread;
 use std::time::Duration;
 
+use io_uring::cqueue::{Entry, Entry32};
+use io_uring::squeue::{Entry as SqueueEntry, Entry128};
 use io_uring::types::Fd;
 use io_uring::{opcode, IoUring};
 
+// Wrapper enum to support both standard and big io_uring entries
+// This allows fallback to standard entries on kernels < 5.19
+#[derive(Clone)]
+enum IoUringWrapper {
+    Standard(Arc<Mutex<IoUring<SqueueEntry, Entry>>>),
+    Big(Arc<Mutex<IoUring<Entry128, Entry32>>>),
+}
+
+impl IoUringWrapper {
+    // Get the submission queue length
+    fn submission_len(&self) -> usize {
+        match self {
+            IoUringWrapper::Standard(ring) => {
+                let mut ring = ring.lock().unwrap();
+                let len = ring.submission().len();
+                len
+            }
+            IoUringWrapper::Big(ring) => {
+                let mut ring = ring.lock().unwrap();
+                let len = ring.submission().len();
+                len
+            }
+        }
+    }
+
+    // Sync the submission queue
+    fn submission_sync(&self) {
+        match self {
+            IoUringWrapper::Standard(ring) => {
+                let mut ring = ring.lock().unwrap();
+                ring.submission().sync();
+            }
+            IoUringWrapper::Big(ring) => {
+                let mut ring = ring.lock().unwrap();
+                ring.submission().sync();
+            }
+        }
+    }
+}
+
+// NVMe identify namespace data structure
+#[repr(C)]
+#[derive(Debug, Clone, Copy)]
+struct NvmeIdNs {
+    nsze: u64,
+    ncap: u64,
+    nuse: u64,
+    nsfeat: u8,
+    nlbaf: u8,
+    flbas: u8,
+    mc: u8,
+    dpc: u8,
+    dps: u8,
+    nmic: u8,
+    rescap: u8,
+    fpi: u8,
+    dlfeat: u8,
+    nawun: u16,
+    nawupf: u16,
+    nacwu: u16,
+    nabsn: u16,
+    nabo: u16,
+    nabspf: u16,
+    noiob: u16,
+    nvmcap: [u8; 16],
+    npwg: u16,
+    npwa: u16,
+    npdg: u16,
+    npda: u16,
+    nows: u16,
+    mssrl: u16,
+    mcl: u32,
+    msrc: u8,
+    rsvd81: [u8; 11],
+    anagrpid: u32,
+    rsvd96: [u8; 3],
+    nsattr: u8,
+    nvmsetid: u16,
+    endgid: u16,
+    nguid: [u8; 16],
+    eui64: [u8; 8],
+    lbaf: [NvmeLbaf; 64],
+    vs: [u8; 3712],
+}
+
+#[repr(C)]
+#[derive(Debug, Clone, Copy)]
+struct NvmeLbaf {
+    ms: u16,
+    ds: u8,
+    rp: u8,
+}
+
+// NVMe admin opcodes
+const NVME_ADMIN_IDENTIFY: u8 = 0x06;
+
+// NVMe identify CNS values
+const NVME_IDENTIFY_CNS_NS: u32 = 0x00;
+
+// NVMe I/O opcodes
+const NVME_IO_READ: u8 = 0x02;
+const NVME_IO_WRITE: u8 = 0x01;
+
+// NVMe uring command structure (80 bytes)
+#[repr(C)]
+#[derive(Debug, Clone, Copy)]
+struct NvmeUringCmd {
+    opcode: u8,
+    flags: u8,
+    rsvd1: u16,
+    nsid: u32,
+    cdw2: u32,
+    cdw3: u32,
+    metadata: u64,
+    addr: u64,
+    metadata_len: u32,
+    data_len: u32,
+    cdw10: u32,
+    cdw11: u32,
+    cdw12: u32,
+    cdw13: u32,
+    cdw14: u32,
+    cdw15: u32,
+    rsvd2: [u32; 4],
+}
+
+// Linux ioctl for NVMe admin command
+// Defined in <linux/nvme_ioctl.h>: NVME_IOCTL_ADMIN_CMD _IOWR ('N', 0x41)
+const NVME_IOCTL_ADMIN_CMD: libc::c_ulong = 0xC048_4E41;
+
+// Defined in <linux/nvme_ioctl.h>: NVME_IOCTL_IO_CMD _IOWR ('N', 0x43)
+// const NVME_IOCTL_IO_CMD: libc::c_ulong = 0xC048_4E43;
+
+// NVMe io_uring_cmd opcodes
+const NVME_URING_CMD_IO: u32 = 0xC048_4E80;
+
+// Linux ioctl for NVMe namespace ID
+// Defined in <linux/nvme_ioctl.h>: NVME_IOCTL_ID _IO ('N', 0x40)
+const NVME_IOCTL_ID: libc::c_ulong = 0x4e40;
+
 // Linux ioctl for block device size in bytes.
 // Defined in <linux/fs.h>: BLKGETSIZE64 _IOR(0x12,114,size_t)
 const BLKGETSIZE64: libc::c_ulong = 0x8008_1272; // ioctl op to query block size
@@ -165,6 +307,213 @@ fn fd_size_bytes(fd: RawFd) -> Result<u64, PyErr> {
     Ok(st.st_size as u64)
 }
 
+// NVMe helper functions for io_uring command support
+
+// Calculate NVMe namespace size in bytes from identify namespace data
+fn nvme_ns_size_bytes(id_ns: &NvmeIdNs, lba_size: u32) -> u64 {
+    id_ns.nsze * lba_size as u64
+}
+
+/// Check if device path is a character device (e.g., /dev/ng0n1)
+fn is_character_device(path: &str) -> Result<bool, PyErr> {
+    let cpath = CString::new(path).map_err(|_| PyValueError::new_err("path contains NUL"))?;
+
+    // SAFETY: stat call
+    let mut st: libc::stat = unsafe { std::mem::zeroed() };
+    let rc = unsafe { libc::stat(cpath.as_ptr(), &mut st as *mut libc::stat) };
+
+    if rc != 0 {
+        return Err(os_err("stat failed"));
+    }
+
+    Ok((st.st_mode & libc::S_IFMT) == libc::S_IFCHR)
+}
+
+/// Get namespace ID from NVMe device using ioctl.
+fn nvme_get_nsid_from_fd(fd: RawFd) -> Result<u32, PyErr> {
+    //SAFETY: ioctl call with request that returns an integer
+    let ret = unsafe { libc::ioctl(fd, NVME_IOCTL_ID) };
+    if ret < 0 {
+        return Err(os_err("Failed to get namespace ID via ioctl"));
+    }
+    Ok(ret as u32)
+}
+
+/// Get LBA shift (log2 of LBA size) from identify namespace data
+fn nvme_get_lba_shift(id_ns: &NvmeIdNs) -> Result<u32, PyErr> {
+    // Extract LBA format index from FLBAS
+
+    let lbaf_index = if id_ns.nlbaf < 16 {
+        (id_ns.flbas & 0x0F) as usize
+    } else {
+        let lsb = (id_ns.flbas & 0x0F) as usize;
+        let msb = ((id_ns.flbas >> 5) & 0x03) as usize;
+        lsb + (msb << 4)
+    };
+
+    if lbaf_index >= 64 {
+        return Err(PyValueError::new_err("Invalid LBA format index"));
+    }
+
+    // Get LBA data size from LBAF
+    let ds = id_ns.lbaf[lbaf_index].ds;
+    if ds == 0 {
+        return Err(PyValueError::new_err("Invalid LBA data size"));
+    }
+
+    // Check for metadata support
+    let ms = id_ns.lbaf[lbaf_index].ms;
+    if ms != 0 {
+        return Err(PyValueError::new_err(
+            "Device is formatted with metadata, can't be supported.",
+        ));
+    }
+
+    Ok(ds as u32)
+}
+
+/// Get LBA size in bytes from identify namespace data
+fn nvme_get_lba_size(id_ns: &NvmeIdNs) -> Result<u32, PyErr> {
+    let lba_shift = nvme_get_lba_shift(id_ns)?;
+    Ok(1u32 << lba_shift)
+}
+
+/// NVMe passthrough command structure for ioctl
+#[repr(C)]
+struct NvmePassthruCmd {
+    opcode: u8,
+    flags: u8,
+    rsvd1: u16,
+    nsid: u32,
+    cdw2: u32,
+    cdw3: u32,
+    metadata: u64,
+    addr: u64,
+    metadata_len: u32,
+    data_len: u32,
+    cdw10: u32,
+    cdw11: u32,
+    cdw12: u32,
+    cdw13: u32,
+    cdw14: u32,
+    cdw15: u32,
+    timeout_ms: u32,
+    result: u32,
+}
+
+/// Send NVMe identify namespace command via ioctl
+fn nvme_identify_ns(fd: RawFd, nsid: u32) -> Result<NvmeIdNs, PyErr> {
+    let mut id_ns: NvmeIdNs = unsafe { std::mem::zeroed() };
+
+    let cmd = NvmePassthruCmd {
+        opcode: NVME_ADMIN_IDENTIFY,
+        nsid,
+        addr: &mut id_ns as *mut NvmeIdNs as u64,
+        data_len: std::mem::size_of::<NvmeIdNs>() as u32,
+        cdw10: NVME_IDENTIFY_CNS_NS,
+        timeout_ms: 0,
+        ..unsafe { std::mem::zeroed() }
+    };
+
+    // SAFETY: ioctl with properly initialized command structure
+    let rc = unsafe { libc::ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd as *const NvmePassthruCmd) };
+
+    if rc < 0 {
+        return Err(os_err("NVMe identify namespace ioctl failed"));
+    }
+
+    Ok(id_ns)
+}
+
+/// Prepare NVMe uring command for read/write operations
+#[allow(clippy::too_many_arguments)]
+fn nvme_uring_cmd_prep(
+    cmd: &mut NvmeUringCmd,
+    is_write: bool,
+    nsid: u32,
+    offset: u64,
+    len: usize,
+    lba_shift: u32,
+    ptr: *const u8,
+    dtype: u8,
+    dspec: u16,
+) -> Result<(), PyErr> {
+    let lba_size = 1usize << lba_shift;
+
+    // Validate offset alignment
+    if !(offset as usize).is_multiple_of(lba_size) {
+        return Err(PyValueError::new_err(format!(
+            "offset must be aligned to LBA size ({} bytes), got offset={}",
+            lba_size, offset
+        )));
+    }
+
+    // Validate length alignment
+    if !len.is_multiple_of(lba_size) {
+        return Err(PyValueError::new_err(format!(
+            "length must be aligned to LBA size ({} bytes), got len={}",
+            lba_size, len
+        )));
+    }
+
+    // Validate non-zero length
+    if len == 0 {
+        return Err(PyValueError::new_err("length must be non-zero"));
+    }
+
+    // Calculate SLBA (Starting LBA) and NLB (Number of LBAs)
+    let slba = offset >> lba_shift;
+    let nlb = (len >> lba_shift) - 1; // NLB is 0-based
+
+    // Validate NLB fits in NVMe field (16 bits, max 0xFFFF)
+    if nlb > 0xFFFF {
+        return Err(PyValueError::new_err(format!(
+            "NLB ({}) exceeds NVMe field maximum (65535)",
+            nlb
+        )));
+    }
+
+    // Set opcode
+    cmd.opcode = if is_write {
+        NVME_IO_WRITE
+    } else {
+        NVME_IO_READ
+    };
+    cmd.nsid = nsid;
+
+    // Set SLBA in cdw10 and cdw11
+    cmd.cdw10 = (slba & 0xFFFFFFFF) as u32;
+    cmd.cdw11 = (slba >> 32) as u32;
+
+    // Set NLB in cdw12 (bits 0-15) and dtype in bits 20-23
+    cmd.cdw12 = nlb as u32 | ((dtype as u32) << 20);
+
+    // Set dspec in cdw13 bits 16-31
+    cmd.cdw13 = (dspec as u32) << 16;
+
+    // Set data address and length
+    cmd.addr = ptr as u64;
+    cmd.data_len = len as u32;
+
+    // No metadata support for now
+    cmd.metadata = 0;
+    cmd.metadata_len = 0;
+
+    Ok(())
+}
+
+/// NVMe command data for io_uring_cmd submissions.
+///
+/// This structure contains NVMe-specific information needed for
+/// passthrough commands via io_uring_cmd.
+#[derive(Clone, Debug)]
+struct NvmeCmdData {
+    nsid: u32,      // Namespace ID
+    lba_shift: u32, // LBA shift (log2 of LBA size)
+    dtype: u8,      // Directive Type
+    dspec: u16,     // Directive Specific
+}
+
 /// Aligned buffer for O_DIRECT I/O.
 /// Allocated with posix_memalign so the pointer satisfies alignment requirements.
 /// Automatically freed on drop.
@@ -483,6 +832,7 @@ impl Drop for UringNotify {
 /// - `original_ptr`: For reads with bounce buffer, the original destination pointer.
 /// - `payload_len`: For reads with bounce buffer, the actual payload length to copy back.
 /// - `batch_id`: The batch ID this submission belongs to (for per-batch tracking)
+/// - `nvme_cmd_data`: Optional NVMe command for io_uring_cmd submission
 #[derive(Clone)]
 struct IoSubmission {
     fd: RawFd,
@@ -493,9 +843,10 @@ struct IoSubmission {
     completion: Arc<IoCompletion>,
     fixed_buffer_idx: Option<u16>,
     bounce: Option<std::sync::Arc<AlignedBuf>>,
-    original_ptr: Option<usize>, // For bounce buffer reads
-    payload_len: Option<usize>,  // For bounce buffer reads
-    batch_id: u64,               // Batch ID for per-batch tracking
+    original_ptr: Option<usize>,        // For bounce buffer reads
+    payload_len: Option<usize>,         // For bounce buffer reads
+    batch_id: u64,                      // Batch ID for per-batch tracking
+    nvme_cmd_data: Option<NvmeCmdData>, // NVMe command data for io_uring_cmd
 }
 
 impl Default for IoSubmission {
@@ -512,6 +863,7 @@ impl Default for IoSubmission {
             original_ptr: None,
             payload_len: None,
             batch_id: 0,
+            nvme_cmd_data: None,
         }
     }
 }
@@ -523,14 +875,20 @@ impl Default for IoSubmission {
 /// Higher-level policies (slotting, manifests, etc.) live in Python.
 #[pyclass]
 struct RawBlockDevice {
-    fd: RawFd,          // raw file descriptor
-    size: u64,          // cached device size in bytes
-    closed: AtomicBool, // avoid double-close
-    use_odirect: bool,  // enforce alignment + bypass page cache
-    alignment: usize,   // required alignment in bytes
-    use_iouring: bool,  // Enable io_uring
+    fd: RawFd,           // raw file descriptor
+    size: u64,           // cached device size in bytes
+    closed: AtomicBool,  // avoid double-close
+    use_odirect: bool,   // enforce alignment + bypass page cache
+    alignment: usize,    // required alignment in bytes
+    use_iouring: bool,   // Enable io_uring
+    use_uring_cmd: bool, // Enable io_uring_cmd for NVMe passthrough
+    // NVMe device data (only when use_uring_cmd=true)
+    nvme_nsid: Option<u32>,      // Namespace ID
+    nvme_lba_shift: Option<u32>, // LBA shift (log2 of LBA size)
+    nvme_lba_size: Option<u32>,  // LBA size in bytes
     // io_uring ring instance (only when use_iouring=true)
-    ring: Option<Arc<Mutex<IoUring>>>,
+    // Uses wrapper to support both standard and big entries for kernel compatibility
+    ring: Option<IoUringWrapper>,
     // Queue for sending I/O requests from Python to worker thread
     queue: Option<Arc<Mutex<Vec<IoSubmission>>>>,
     // Background worker thread handle
@@ -603,24 +961,33 @@ impl Drop for FdGuard {
 
 impl RawBlockDevice {
     /// Internal constructor performs all low level setup.
+    #[allow(clippy::too_many_arguments)]
     fn new_internal(
         path: String,
         writable: bool,
         use_odirect: bool,
         alignment: usize,
         use_iouring: bool,
+        use_uring_cmd: bool,
         io_engine: Option<String>,
         iouring_queue_depth: usize,
     ) -> PyResult<Self> {
         let use_iouring = parse_use_iouring(io_engine, use_iouring)?;
         let iouring_queue_depth = iouring_queue_depth.max(1);
-        let cpath = CString::new(path).map_err(|_| PyValueError::new_err("path contains NUL"))?;
+        // use_uring_cmd requires use_iouring to be enabled
+        if use_uring_cmd && !use_iouring {
+            return Err(PyValueError::new_err(
+                "use_uring_cmd requires use_iouring to be enabled",
+            ));
+        }
+        let cpath =
+            CString::new(path.clone()).map_err(|_| PyValueError::new_err("path contains NUL"))?;
         let mut flags = if writable {
             libc::O_RDWR
         } else {
             libc::O_RDONLY
         };
-        if use_odirect {
+        if use_odirect && !use_uring_cmd {
             flags |= O_DIRECT;
         }
         // SAFETY: open returns fd or -1.
@@ -632,7 +999,40 @@ impl RawBlockDevice {
         // below returns early before the fd is moved into the RawBlockDevice.
         // Disarmed once the struct is successfully constructed.
         let fd_guard = FdGuard::new(fd);
-        let size = fd_size_bytes(fd)?;
+
+        // Initialize NVMe data if io_uring command support is enabled
+        let (nvme_nsid, nvme_lba_shift, nvme_lba_size, nvme_id_ns) = if use_uring_cmd {
+            // Validate that device is a character device (required for io_uring_cmd)
+            let is_char_dev = is_character_device(&path)?;
+            if !is_char_dev {
+                return Err(PyValueError::new_err(
+                    "use_uring_cmd requires an NVMe namespace character device (e.g., /dev/ng0n1)",
+                ));
+            }
+
+            // Get namespace ID from device path
+            let nsid = nvme_get_nsid_from_fd(fd)?;
+            // Send identify namespace command to get LBA size
+            let id_ns = nvme_identify_ns(fd, nsid)?;
+            let lba_shift = nvme_get_lba_shift(&id_ns)?;
+            let lba_size = nvme_get_lba_size(&id_ns)?;
+
+            (Some(nsid), Some(lba_shift), Some(lba_size), Some(id_ns))
+        } else {
+            (None, None, None, None)
+        };
+
+        // Calculate device size. Use NVMe ns info for character device
+        // Use ioctl/fstat for block devices and regular files
+        let size = if use_uring_cmd {
+            if let (Some(id_ns), Some(lba_size)) = (nvme_id_ns, nvme_lba_size) {
+                nvme_ns_size_bytes(&id_ns, lba_size)
+            } else {
+                0
+            }
+        } else {
+            fd_size_bytes(fd)?
+        };
 
         let (
             ring_opt,
@@ -647,17 +1047,63 @@ impl RawBlockDevice {
             next_batch_id_opt,
             batch_in_flight_opt,
         ) = if use_iouring {
-            let ring = IoUring::new(iouring_queue_depth as u32)
-                .map_err(|e| PyRuntimeError::new_err(format!("io_uring init failed: {}", e)))?;
             let notify = UringNotify::new()
                 .map_err(|e| PyRuntimeError::new_err(format!("UringNotify init failed: {}", e)))?;
-            // Register the CQ eventfd with the ring so the kernel writes to it
-            // whenever a CQE is posted. Must happen before the ring is wrapped
-            // in a Mutex / handed to the worker.
-            ring.submitter()
-                .register_eventfd(notify.cq_efd)
-                .map_err(|e| PyRuntimeError::new_err(format!("register_eventfd failed: {}", e)))?;
-            let ring = Arc::new(Mutex::new(ring));
+            // Try to create IoUring with big entries (Entry128/Entry32) first
+            // This is required for io_uring_cmd support (kernel 5.19+)
+            // If that fails, fall back to standard entries (Entry/Entry) for kernel 5.4-5.18
+            let ring = match IoUring::<Entry128, Entry32>::builder()
+                .build(iouring_queue_depth as u32)
+            {
+                Ok(big_ring) => {
+                    // Big entries supported - io_uring_cmd can be used
+                    if use_uring_cmd {
+                        // Validate that device is a character device (required for io_uring_cmd)
+                        let is_char_dev = is_character_device(&path)?;
+                        if !is_char_dev {
+                            return Err(PyValueError::new_err(
+                                "use_uring_cmd requires an NVMe namespace character device (e.g., /dev/ng0n1)",
+                            ));
+                        }
+                    }
+                    // Register the CQ eventfd with the ring so the kernel writes to it
+                    // whenever a CQE is posted. Must happen before the ring is wrapped
+                    // in a Mutex / handed to the worker.
+                    big_ring
+                        .submitter()
+                        .register_eventfd(notify.cq_efd)
+                        .map_err(|e| {
+                            PyRuntimeError::new_err(format!("register_eventfd failed: {}", e))
+                        })?;
+                    let big_ring = Arc::new(Mutex::new(big_ring));
+                    IoUringWrapper::Big(big_ring)
+                }
+                Err(_) => {
+                    // Big entries not supported (kernel < 5.19), fall back to standard entries
+                    // io_uring_cmd is not available on these kernels
+                    if use_uring_cmd {
+                        return Err(PyRuntimeError::new_err(
+                            "io_uring_cmd requires kernel 5.19 or later (big SQE/CQE entries not supported)",
+                        ));
+                    }
+                    let std_ring = IoUring::<SqueueEntry, Entry>::builder()
+                        .build(iouring_queue_depth as u32)
+                        .map_err(|e| {
+                            PyRuntimeError::new_err(format!("io_uring init failed: {}", e))
+                        })?;
+                    // Register the CQ eventfd with the ring so the kernel writes to it
+                    // whenever a CQE is posted. Must happen before the ring is wrapped
+                    // in a Mutex / handed to the worker.
+                    std_ring
+                        .submitter()
+                        .register_eventfd(notify.cq_efd)
+                        .map_err(|e| {
+                            PyRuntimeError::new_err(format!("register_eventfd failed: {}", e))
+                        })?;
+                    let std_ring = Arc::new(Mutex::new(std_ring));
+                    IoUringWrapper::Standard(std_ring)
+                }
+            };
             let queue = Arc::new(Mutex::new(Vec::<IoSubmission>::new()));
             let shutdown = Arc::new(AtomicBool::new(false));
             let batch_ready = Arc::new(notify);
@@ -669,7 +1115,7 @@ impl RawBlockDevice {
             let next_batch_id = Arc::new(AtomicU64::new(1));
             let batch_in_flight = Arc::new(Mutex::new(HashMap::<u64, BatchTracking>::new()));
 
-            let ring_clone = Arc::clone(&ring);
+            let ring_clone = ring.clone();
             let queue_clone = Arc::clone(&queue);
             let shutdown_clone = Arc::clone(&shutdown);
             let batch_ready_clone = Arc::clone(&batch_ready);
@@ -678,6 +1124,213 @@ impl RawBlockDevice {
             let batch_in_flight_clone = Arc::clone(&batch_in_flight);
             let ring_size = iouring_queue_depth;
 
+            // Helper function to copy data from bounce buffer to original buffer
+            fn copy_from_bounce_buffer(bounce: &AlignedBuf, orig_ptr: usize, payload_len: usize) {
+                unsafe {
+                    libc::memcpy(
+                        orig_ptr as *mut libc::c_void,
+                        bounce.as_ptr() as *const libc::c_void,
+                        payload_len,
+                    );
+                }
+            }
+
+            // Helper function to handle completion result and set IoCompletion
+            // Returns Ok(()) for successful completion, Err for errors
+            // Note: Short I/O resubmission for regular I/O is handled BEFORE calling this function
+            // in the main completion loop. This function only handles:
+            // - Full completions
+            // - Errors (negative results)
+            // - Short I/O during shutdown (cannot resubmit)
+            fn handle_completion_result(
+                sub: &mut IoSubmission,
+                cqe_result: i32,
+                is_shutdown: bool,
+            ) -> PyResult<()> {
+                let is_uring_cmd = sub.nvme_cmd_data.is_some();
+
+                if cqe_result < 0 {
+                    let code = -cqe_result;
+                    let _ = sub.bounce.take();
+                    Err(PyOSError::new_err((code, "io_uring I/O error")))
+                } else if is_uring_cmd {
+                    // Non-zero result indicates NVMe command error
+                    if cqe_result != 0 {
+                        let code = cqe_result;
+                        let _ = sub.bounce.take();
+                        Err(PyOSError::new_err((code, "io_uring_cmd NVMe error")))
+                    } else {
+                        // io_uring_cmd successful completion (result == 0)
+                        // For reads with bounce buffer, copy data back to original buffer
+                        if !sub.is_write {
+                            if let (Some(bounce), Some(orig_ptr), Some(payload_len)) =
+                                (sub.bounce.take(), sub.original_ptr, sub.payload_len)
+                            {
+                                copy_from_bounce_buffer(&bounce, orig_ptr, payload_len);
+                            }
+                        } else {
+                            let _ = sub.bounce.take();
+                        }
+                        Ok(())
+                    }
+                } else {
+                    // Regular io_uring read/write
+                    let bytes_transferred = cqe_result as usize;
+                    if bytes_transferred < sub.len {
+                        if is_shutdown {
+                            // Short read/write during shutdown: fail the request
+                            let _ = sub.bounce.take();
+                            Err(PyRuntimeError::new_err(
+                                "io_uring worker shutting down: short I/O during shutdown",
+                            ))
+                        } else {
+                            // This should never happen
+                            let _ = sub.bounce.take();
+                            Err(PyRuntimeError::new_err(
+                                "Unexpected short I/O: internal error",
+                            ))
+                        }
+                    } else {
+                        // Full completion
+                        // For reads with bounce buffer, copy data back to original buffer
+                        if !sub.is_write {
+                            if let (Some(bounce), Some(orig_ptr), Some(payload_len)) =
+                                (sub.bounce.take(), sub.original_ptr, sub.payload_len)
+                            {
+                                copy_from_bounce_buffer(&bounce, orig_ptr, payload_len);
+                            }
+                        } else {
+                            let _ = sub.bounce.take();
+                        }
+                        Ok(())
+                    }
+                }
+            }
+
+            // Helper function to decrement in-flight counts and notify condition variables
+            fn decrement_in_flight(
+                in_flight_count: &Arc<AtomicU64>,
+                in_flight_cvar: &Arc<Condvar>,
+                batch_in_flight: &Arc<Mutex<HashMap<u64, BatchTracking>>>,
+                batch_id: u64,
+            ) {
+                let prev = in_flight_count.fetch_sub(1, Ordering::Relaxed);
+                if prev == 1 {
+                    in_flight_cvar.notify_all();
+                }
+                // Decrement per-batch in-flight count and notify if batch is complete
+                if batch_id != 0 {
+                    let batch_map = batch_in_flight.lock().unwrap();
+                    if let Some((batch_count, batch_cvar)) = batch_map.get(&batch_id) {
+                        let prev_batch = batch_count.fetch_sub(1, Ordering::Relaxed);
+                        if prev_batch == 1 {
+                            batch_cvar.notify_all();
+                        }
+                    }
+                }
+            }
+
+            // Helper function to build and submit an SQE for a submission
+            fn build_and_submit_sqe(
+                ring: &IoUringWrapper,
+                sub: &IoSubmission,
+                user_data: u64,
+            ) -> Result<(), PyErr> {
+                let ptr = sub.ptr_addr as *mut u8;
+
+                // Check if this is an io_uring_cmd submission
+                if let Some(nvme_data) = &sub.nvme_cmd_data {
+                    // Prepare NVMe uring command
+                    let mut nvme_cmd: NvmeUringCmd = unsafe { std::mem::zeroed() };
+                    nvme_uring_cmd_prep(
+                        &mut nvme_cmd,
+                        sub.is_write,
+                        nvme_data.nsid,
+                        sub.offset,
+                        sub.len,
+                        nvme_data.lba_shift,
+                        ptr,
+                        nvme_data.dtype,
+                        nvme_data.dspec,
+                    )?;
+
+                    // Convert NvmeUringCmd to byte array for UringCmd80
+                    let cmd_bytes: [u8; 80] = unsafe { std::mem::transmute_copy(&nvme_cmd) };
+
+                    // Build UringCmd80 with big SQE entry
+                    let mut uring_cmd =
+                        opcode::UringCmd80::new(Fd(sub.fd), NVME_URING_CMD_IO).cmd(cmd_bytes);
+
+                    // Set buf_index if using fixed buffers
+                    if let Some(idx) = sub.fixed_buffer_idx {
+                        uring_cmd = uring_cmd.buf_index(Some(idx));
+                    }
+
+                    let sqe128 = uring_cmd.build().user_data(user_data);
+
+                    // Push the big SQE entry (128 bytes)
+                    match ring {
+                        IoUringWrapper::Big(ring) => {
+                            let mut ring = ring.lock().unwrap();
+                            unsafe {
+                                ring.submission()
+                                    .push(&sqe128)
+                                    .expect("failed to push sqe128");
+                            }
+                        }
+                        IoUringWrapper::Standard(_) => {
+                            return Err(PyRuntimeError::new_err(
+                                "io_uring_cmd requires big entries (kernel 5.19+)",
+                            ));
+                        }
+                    }
+                } else {
+                    // Regular read/write operations
+                    let sqe = if sub.is_write {
+                        if let Some(idx) = sub.fixed_buffer_idx {
+                            opcode::WriteFixed::new(
+                                Fd(sub.fd),
+                                ptr as *const u8,
+                                sub.len as u32,
+                                idx,
+                            )
+                            .offset(sub.offset)
+                            .build()
+                        } else {
+                            opcode::Write::new(Fd(sub.fd), ptr as *const u8, sub.len as u32)
+                                .offset(sub.offset)
+                                .build()
+                        }
+                    } else if let Some(idx) = sub.fixed_buffer_idx {
+                        opcode::ReadFixed::new(Fd(sub.fd), ptr, sub.len as u32, idx)
+                            .offset(sub.offset)
+                            .build()
+                    } else {
+                        opcode::Read::new(Fd(sub.fd), ptr, sub.len as u32)
+                            .offset(sub.offset)
+                            .build()
+                    };
+                    let sqe = sqe.user_data(user_data);
+                    // Convert to appropriate entry type based on ring type
+                    match ring {
+                        IoUringWrapper::Big(ring) => {
+                            let mut ring = ring.lock().unwrap();
+                            let sqe128: Entry128 = sqe.into();
+                            unsafe {
+                                ring.submission().push(&sqe128).expect("failed to push sqe");
+                            }
+                        }
+                        IoUringWrapper::Standard(ring) => {
+                            let mut ring = ring.lock().unwrap();
+                            unsafe {
+                                ring.submission().push(&sqe).expect("failed to push sqe");
+                            }
+                        }
+                    }
+                }
+                Ok(())
+            }
+
             // Worker thread that handles io_uring submissions and completions.
             //
             // Runs a continuous loop that:
@@ -709,24 +1362,25 @@ impl RawBlockDevice {
                         //   - Decrement the in_flight_count atomic
                         //   - Wake up any threads waiting for all I/O to complete
                         {
-                            let mut ring = ring_clone.lock().unwrap();
-                            let completions: Vec<_> = ring.completion().collect();
-                            for cqe in completions {
-                                let user_data = cqe.user_data();
-                                if let Some(mut sub) = in_flight.remove(&user_data) {
-                                    let batch_id = sub.batch_id;
-                                    if cqe.result() < 0 {
-                                        let code = -cqe.result();
-                                        // Drop any bounce buffer associated with this submission.
-                                        let _ = sub.bounce.take();
-                                        sub.completion.set(Err(PyOSError::new_err((
-                                            code,
-                                            "io_uring I/O error",
-                                        ))));
-                                    } else {
-                                        let bytes_transferred = cqe.result() as usize;
-                                        if bytes_transferred < sub.len {
-                                            // Short read/write: update offset and length, then resubmit
+                            // Process completions for standard ring
+                            if let IoUringWrapper::Standard(ring) = &ring_clone {
+                                let completions: Vec<_> = {
+                                    let mut ring = ring.lock().unwrap();
+                                    ring.completion().collect()
+                                };
+                                for cqe in completions {
+                                    let user_data = cqe.user_data();
+                                    if let Some(mut sub) = in_flight.remove(&user_data) {
+                                        let batch_id = sub.batch_id;
+                                        let cqe_result = cqe.result();
+
+                                        // Handle short I/O with resubmission (only for regular I/O, not io_uring_cmd)
+                                        if cqe_result >= 0
+                                            && (cqe_result as usize) < sub.len
+                                            && sub.nvme_cmd_data.is_none()
+                                        {
+                                            let bytes_transferred = cqe_result as usize;
+                                            // Update offset and length for resubmission
                                             sub.offset += bytes_transferred as u64;
                                             sub.len -= bytes_transferred;
                                             // Update buffer pointer for writes and direct reads
@@ -744,13 +1398,11 @@ impl RawBlockDevice {
                                                     sub.original_ptr,
                                                     sub.payload_len,
                                                 ) {
-                                                    unsafe {
-                                                        libc::memcpy(
-                                                            orig_ptr as *mut libc::c_void,
-                                                            bounce.as_ptr() as *const libc::c_void,
-                                                            bytes_transferred.min(payload_len),
-                                                        );
-                                                    }
+                                                    copy_from_bounce_buffer(
+                                                        bounce,
+                                                        orig_ptr,
+                                                        bytes_transferred.min(payload_len),
+                                                    );
                                                     sub.original_ptr =
                                                         Some(orig_ptr + bytes_transferred);
                                                     sub.payload_len = Some(
@@ -763,97 +1415,118 @@ impl RawBlockDevice {
                                             // Don't decrement in_flight_count since we're resubmitting
                                             in_flight.insert(user_data, sub.clone());
                                             // Push a new SQE for the remaining data
-                                            let ptr = sub.ptr_addr as *mut u8;
-                                            let sqe = if sub.is_write {
-                                                if let Some(idx) = sub.fixed_buffer_idx {
-                                                    opcode::WriteFixed::new(
-                                                        Fd(sub.fd),
-                                                        ptr as *const u8,
-                                                        sub.len as u32,
-                                                        idx,
-                                                    )
-                                                    .offset(sub.offset)
-                                                    .build()
-                                                } else {
-                                                    opcode::Write::new(
-                                                        Fd(sub.fd),
-                                                        ptr as *const u8,
-                                                        sub.len as u32,
-                                                    )
-                                                    .offset(sub.offset)
-                                                    .build()
+                                            let _ =
+                                                build_and_submit_sqe(&ring_clone, &sub, user_data);
+                                            let _ = match &ring_clone {
+                                                IoUringWrapper::Standard(ring) => {
+                                                    let ring = ring.lock().unwrap();
+                                                    ring.submitter().submit()
+                                                }
+                                                IoUringWrapper::Big(ring) => {
+                                                    let ring = ring.lock().unwrap();
+                                                    ring.submitter().submit()
                                                 }
-                                            } else if let Some(idx) = sub.fixed_buffer_idx {
-                                                opcode::ReadFixed::new(
-                                                    Fd(sub.fd),
-                                                    ptr,
-                                                    sub.len as u32,
-                                                    idx,
-                                                )
-                                                .offset(sub.offset)
-                                                .build()
-                                            } else {
-                                                opcode::Read::new(Fd(sub.fd), ptr, sub.len as u32)
-                                                    .offset(sub.offset)
-                                                    .build()
                                             };
-                                            let sqe = sqe.user_data(user_data);
-                                            unsafe {
-                                                ring.submission().push(&sqe).expect(
-                                                    "failed to push sqe for short read/write",
-                                                );
-                                            }
-                                            // Submit the new SQE to the kernel
-                                            let _ = ring.submitter().submit();
                                             continue;
                                         }
-                                        // Full completion
-                                        // For reads with bounce buffer, copy data back to original buffer
-                                        if !sub.is_write {
-                                            if let (
-                                                Some(bounce),
-                                                Some(orig_ptr),
-                                                Some(payload_len),
-                                            ) = (
-                                                sub.bounce.take(),
-                                                sub.original_ptr,
-                                                sub.payload_len,
-                                            ) {
-                                                unsafe {
-                                                    libc::memcpy(
-                                                        orig_ptr as *mut libc::c_void,
-                                                        bounce.as_ptr() as *const libc::c_void,
-                                                        payload_len,
-                                                    );
-                                                }
-                                            }
-                                        } else {
-                                            // Drop any bounce buffer associated with this submission.
-                                            let _ = sub.bounce.take();
-                                        }
-                                        sub.completion.set(Ok(()));
-                                    }
-                                    let prev =
-                                        in_flight_count_clone.fetch_sub(1, Ordering::Relaxed);
-                                    if prev == 1 {
-                                        in_flight_cvar_clone.notify_all();
+
+                                        // Handle completion result
+                                        let result =
+                                            handle_completion_result(&mut sub, cqe_result, false);
+                                        sub.completion.set(result);
+
+                                        // Decrement in-flight counts and notify
+                                        decrement_in_flight(
+                                            &in_flight_count_clone,
+                                            &in_flight_cvar_clone,
+                                            &batch_in_flight_clone,
+                                            batch_id,
+                                        );
                                     }
-                                    // Decrement per-batch in-flight count and notify if batch is complete
-                                    if batch_id != 0 {
-                                        let batch_map = batch_in_flight_clone.lock().unwrap();
-                                        if let Some((batch_count, batch_cvar)) =
-                                            batch_map.get(&batch_id)
+                                }
+                            } else if let IoUringWrapper::Big(ring) = &ring_clone {
+                                let completions: Vec<_> = {
+                                    let mut ring = ring.lock().unwrap();
+                                    ring.completion().collect()
+                                };
+                                for cqe in completions {
+                                    let user_data = cqe.user_data();
+                                    if let Some(mut sub) = in_flight.remove(&user_data) {
+                                        let batch_id = sub.batch_id;
+                                        let cqe_result = cqe.result();
+
+                                        // Handle short I/O with resubmission (only for regular I/O, not io_uring_cmd)
+                                        if cqe_result >= 0
+                                            && (cqe_result as usize) < sub.len
+                                            && sub.nvme_cmd_data.is_none()
                                         {
-                                            let prev_batch =
-                                                batch_count.fetch_sub(1, Ordering::Relaxed);
-                                            if prev_batch == 1 {
-                                                batch_cvar.notify_all();
+                                            let bytes_transferred = cqe_result as usize;
+                                            // Update offset and length for resubmission
+                                            sub.offset += bytes_transferred as u64;
+                                            sub.len -= bytes_transferred;
+                                            // Update buffer pointer for writes and direct reads
+                                            if sub.is_write || sub.bounce.is_none() {
+                                                sub.ptr_addr += bytes_transferred;
+                                            }
+                                            // For read with bounce buffer, copy partial data back
+                                            if !sub.is_write {
+                                                if let (
+                                                    Some(bounce),
+                                                    Some(orig_ptr),
+                                                    Some(payload_len),
+                                                ) = (
+                                                    sub.bounce.as_ref(),
+                                                    sub.original_ptr,
+                                                    sub.payload_len,
+                                                ) {
+                                                    copy_from_bounce_buffer(
+                                                        bounce,
+                                                        orig_ptr,
+                                                        bytes_transferred.min(payload_len),
+                                                    );
+                                                    sub.original_ptr =
+                                                        Some(orig_ptr + bytes_transferred);
+                                                    sub.payload_len = Some(
+                                                        payload_len
+                                                            .saturating_sub(bytes_transferred),
+                                                    );
+                                                }
                                             }
+                                            // Re-insert into in_flight with updated values
+                                            // Don't decrement in_flight_count since we're resubmitting
+                                            in_flight.insert(user_data, sub.clone());
+                                            // Push a new SQE for the remaining data
+                                            let _ =
+                                                build_and_submit_sqe(&ring_clone, &sub, user_data);
+                                            let _ = match &ring_clone {
+                                                IoUringWrapper::Standard(ring) => {
+                                                    let ring = ring.lock().unwrap();
+                                                    ring.submitter().submit()
+                                                }
+                                                IoUringWrapper::Big(ring) => {
+                                                    let ring = ring.lock().unwrap();
+                                                    ring.submitter().submit()
+                                                }
+                                            };
+                                            continue;
                                         }
+
+                                        // Handle completion result
+                                        let result =
+                                            handle_completion_result(&mut sub, cqe_result, false);
+                                        sub.completion.set(result);
+
+                                        // Decrement in-flight counts and notify
+                                        decrement_in_flight(
+                                            &in_flight_count_clone,
+                                            &in_flight_cvar_clone,
+                                            &batch_in_flight_clone,
+                                            batch_id,
+                                        );
                                     }
                                 }
                             }
-                            ring.submission().sync();
+                            ring_clone.submission_sync();
                         }
 
                         // Block on epoll only if there's truly nothing pending. The empty +
@@ -886,9 +1559,7 @@ impl RawBlockDevice {
                             let mut batch: Vec<IoSubmission> = std::mem::take(&mut *q);
                             let batch_len = batch.len();
 
-                            let mut ring = ring_clone.lock().unwrap();
-
-                            let available = ring_size - ring.submission().len();
+                            let available = ring_size - ring_clone.submission_len();
                             let to_submit_count = std::cmp::min(available, batch_len);
 
                             if to_submit_count < batch_len {
@@ -909,42 +1580,20 @@ impl RawBlockDevice {
                                 user_data_list.push(user_data);
                                 in_flight.insert(user_data, sub.clone());
 
-                                let ptr = sub.ptr_addr as *mut u8;
-                                let sqe = if sub.is_write {
-                                    if let Some(idx) = sub.fixed_buffer_idx {
-                                        opcode::WriteFixed::new(
-                                            Fd(sub.fd),
-                                            ptr as *const u8,
-                                            sub.len as u32,
-                                            idx,
-                                        )
-                                        .offset(sub.offset)
-                                        .build()
-                                    } else {
-                                        opcode::Write::new(
-                                            Fd(sub.fd),
-                                            ptr as *const u8,
-                                            sub.len as u32,
-                                        )
-                                        .offset(sub.offset)
-                                        .build()
-                                    }
-                                } else if let Some(idx) = sub.fixed_buffer_idx {
-                                    opcode::ReadFixed::new(Fd(sub.fd), ptr, sub.len as u32, idx)
-                                        .offset(sub.offset)
-                                        .build()
-                                } else {
-                                    opcode::Read::new(Fd(sub.fd), ptr, sub.len as u32)
-                                        .offset(sub.offset)
-                                        .build()
-                                };
-                                let sqe = sqe.user_data(user_data);
-                                unsafe {
-                                    ring.submission().push(&sqe).expect("failed to push sqe");
-                                }
+                                // Build and submit SQE
+                                let _ = build_and_submit_sqe(&ring_clone, sub, user_data);
                             }
 
-                            let submit_result = ring.submitter().submit();
+                            let submit_result = match &ring_clone {
+                                IoUringWrapper::Standard(ring) => {
+                                    let ring = ring.lock().unwrap();
+                                    ring.submitter().submit()
+                                }
+                                IoUringWrapper::Big(ring) => {
+                                    let ring = ring.lock().unwrap();
+                                    ring.submitter().submit()
+                                }
+                            };
                             // Handle EAGAIN (ring full) and EINTR (interrupted syscall)
                             match submit_result {
                                 Ok(submitted) => {
@@ -959,7 +1608,6 @@ impl RawBlockDevice {
                                         let unsubmitted: Vec<_> =
                                             batch[submitted..to_submit_count].to_vec();
                                         if !unsubmitted.is_empty() {
-                                            drop(ring);
                                             let mut q = queue_clone.lock().unwrap();
                                             // Insert unsubmitted requests back at the front preserving order
                                             q.splice(0..0, unsubmitted);
@@ -981,7 +1629,6 @@ impl RawBlockDevice {
                                             if to_submit_count > 0 {
                                                 let unsubmitted: Vec<_> =
                                                     batch[..to_submit_count].to_vec();
-                                                drop(ring);
                                                 let mut q = queue_clone.lock().unwrap();
                                                 // Insert unsubmitted requests back at the front preserving order
                                                 q.splice(0..0, unsubmitted);
@@ -999,25 +1646,12 @@ impl RawBlockDevice {
                                                     format!("io_uring submit error: {:?}", e),
                                                 )));
                                                 let _ = sub.bounce.take();
-                                                let prev = in_flight_count_clone
-                                                    .fetch_sub(1, Ordering::Relaxed);
-                                                if prev == 1 {
-                                                    in_flight_cvar_clone.notify_all();
-                                                }
-                                                // Decrement per-batch in-flight count and notify if batch is complete
-                                                if batch_id != 0 {
-                                                    let batch_map =
-                                                        batch_in_flight_clone.lock().unwrap();
-                                                    if let Some((batch_count, batch_cvar)) =
-                                                        batch_map.get(&batch_id)
-                                                    {
-                                                        let prev_batch = batch_count
-                                                            .fetch_sub(1, Ordering::Relaxed);
-                                                        if prev_batch == 1 {
-                                                            batch_cvar.notify_all();
-                                                        }
-                                                    }
-                                                }
+                                                decrement_in_flight(
+                                                    &in_flight_count_clone,
+                                                    &in_flight_cvar_clone,
+                                                    &batch_in_flight_clone,
+                                                    batch_id,
+                                                );
                                             }
                                         }
                                     }
@@ -1034,22 +1668,16 @@ impl RawBlockDevice {
                             .expect("Worker: queue mutex poisoned during shutdown");
                         while let Some(mut sub) = q.pop() {
                             let batch_id = sub.batch_id;
-                            // Drop any bounce buffer associated with this submission.
                             let _ = sub.bounce.take();
-                            in_flight_count_clone.fetch_sub(1, Ordering::Relaxed);
                             sub.completion.set(Err(PyRuntimeError::new_err(
                                 "io_uring worker shutting down",
                             )));
-                            // Decrement per-batch in-flight count and notify if batch is complete
-                            if batch_id != 0 {
-                                let batch_map = batch_in_flight_clone.lock().unwrap();
-                                if let Some((batch_count, batch_cvar)) = batch_map.get(&batch_id) {
-                                    let prev_batch = batch_count.fetch_sub(1, Ordering::Relaxed);
-                                    if prev_batch == 1 {
-                                        batch_cvar.notify_all();
-                                    }
-                                }
-                            }
+                            decrement_in_flight(
+                                &in_flight_count_clone,
+                                &in_flight_cvar_clone,
+                                &batch_in_flight_clone,
+                                batch_id,
+                            );
                         }
                     }
 
@@ -1059,100 +1687,65 @@ impl RawBlockDevice {
                     let graceful_shutdown = Duration::from_millis(1000);
                     thread::sleep(graceful_shutdown);
                     {
-                        let mut ring = ring_clone
-                            .lock()
-                            .expect("Worker: ring mutex poisoned during shutdown");
-                        for cqe in ring.completion() {
-                            let user_data = cqe.user_data();
-                            if let Some(mut sub) = in_flight.remove(&user_data) {
-                                let batch_id = sub.batch_id;
-                                if cqe.result() < 0 {
-                                    let code = -cqe.result();
-                                    // Drop any bounce buffer associated with this submission.
-                                    let _ = sub.bounce.take();
-                                    sub.completion
-                                        .set(Err(PyOSError::new_err((code, "io_uring I/O error"))));
-                                } else {
-                                    let bytes_transferred = cqe.result() as usize;
-                                    if bytes_transferred < sub.len {
-                                        // Short read/write during shutdown: fail the request
-                                        // We cannot resubmit because the worker is about to exit
-                                        // Drop any bounce buffer associated with this submission.
-                                        let _ = sub.bounce.take();
-                                        sub.completion.set(Err(PyRuntimeError::new_err(
-                                        "io_uring worker shutting down - short I/O during shutdown",
-                                    )));
-                                        // Continue to decrement in_flight_count below
-                                    } else {
-                                        // Full completion
-                                        // For reads with bounce buffer, copy data back to original buffer
-                                        if !sub.is_write {
-                                            if let (
-                                                Some(bounce),
-                                                Some(orig_ptr),
-                                                Some(payload_len),
-                                            ) = (
-                                                sub.bounce.take(),
-                                                sub.original_ptr,
-                                                sub.payload_len,
-                                            ) {
-                                                unsafe {
-                                                    libc::memcpy(
-                                                        orig_ptr as *mut libc::c_void,
-                                                        bounce.as_ptr() as *const libc::c_void,
-                                                        payload_len,
-                                                    );
-                                                }
-                                            }
-                                        } else {
-                                            // Drop any bounce buffer associated with this submission.
-                                            let _ = sub.bounce.take();
-                                        }
-                                        sub.completion.set(Ok(()));
-                                    }
-                                }
-                                let prev = in_flight_count_clone.fetch_sub(1, Ordering::Relaxed);
-                                if prev == 1 {
-                                    in_flight_cvar_clone.notify_all();
+                        // Process completions for standard ring
+                        if let IoUringWrapper::Standard(ring) = &ring_clone {
+                            let completions: Vec<_> = {
+                                let mut ring = ring.lock().unwrap();
+                                ring.completion().collect()
+                            };
+                            for cqe in completions {
+                                let user_data = cqe.user_data();
+                                if let Some(mut sub) = in_flight.remove(&user_data) {
+                                    let batch_id = sub.batch_id;
+                                    let result =
+                                        handle_completion_result(&mut sub, cqe.result(), true);
+                                    sub.completion.set(result);
+                                    decrement_in_flight(
+                                        &in_flight_count_clone,
+                                        &in_flight_cvar_clone,
+                                        &batch_in_flight_clone,
+                                        batch_id,
+                                    );
                                 }
-                                // Decrement per-batch in-flight count and notify if batch is complete
-                                if batch_id != 0 {
-                                    let batch_map = batch_in_flight_clone.lock().unwrap();
-                                    if let Some((batch_count, batch_cvar)) =
-                                        batch_map.get(&batch_id)
-                                    {
-                                        let prev_batch =
-                                            batch_count.fetch_sub(1, Ordering::Relaxed);
-                                        if prev_batch == 1 {
-                                            batch_cvar.notify_all();
-                                        }
-                                    }
+                            }
+                        } else if let IoUringWrapper::Big(ring) = &ring_clone {
+                            let completions: Vec<_> = {
+                                let mut ring = ring.lock().unwrap();
+                                ring.completion().collect()
+                            };
+                            for cqe in completions {
+                                let user_data = cqe.user_data();
+                                if let Some(mut sub) = in_flight.remove(&user_data) {
+                                    let batch_id = sub.batch_id;
+                                    let result =
+                                        handle_completion_result(&mut sub, cqe.result(), true);
+                                    sub.completion.set(result);
+                                    decrement_in_flight(
+                                        &in_flight_count_clone,
+                                        &in_flight_cvar_clone,
+                                        &batch_in_flight_clone,
+                                        batch_id,
+                                    );
                                 }
                             }
                         }
-                        ring.submission().sync();
+                        ring_clone.submission_sync();
                     }
 
                     // Any remaining in_flight requests, force wake with error
                     // (these were submitted to kernel but won't get completions)
                     for (_user_data, mut sub) in in_flight.drain() {
                         let batch_id = sub.batch_id;
-                        // Drop any bounce buffer associated with this submission.
                         let _ = sub.bounce.take();
-                        in_flight_count_clone.fetch_sub(1, Ordering::Relaxed);
                         sub.completion.set(Err(PyRuntimeError::new_err(
                             "io_uring worker shutting down - request cancelled",
                         )));
-                        // Decrement per-batch in-flight count and notify if batch is complete
-                        if batch_id != 0 {
-                            let batch_map = batch_in_flight_clone.lock().unwrap();
-                            if let Some((batch_count, batch_cvar)) = batch_map.get(&batch_id) {
-                                let prev_batch = batch_count.fetch_sub(1, Ordering::Relaxed);
-                                if prev_batch == 1 {
-                                    batch_cvar.notify_all();
-                                }
-                            }
-                        }
+                        decrement_in_flight(
+                            &in_flight_count_clone,
+                            &in_flight_cvar_clone,
+                            &batch_in_flight_clone,
+                            batch_id,
+                        );
                     }
 
                     // Final notification in case any thread is waiting on in_flight_count
@@ -1191,6 +1784,10 @@ impl RawBlockDevice {
             use_odirect,
             alignment,
             use_iouring,
+            use_uring_cmd,
+            nvme_nsid,
+            nvme_lba_shift,
+            nvme_lba_size,
             ring: ring_opt,
             queue: queue_opt,
             worker: worker_opt,
@@ -1209,6 +1806,24 @@ impl RawBlockDevice {
                 .unwrap_or_else(|| Arc::new(Mutex::new(HashMap::new()))),
         })
     }
+
+    /// Build NVMe command data for io_uring_cmd operations.
+    /// Returns None if use_uring_cmd is disabled.
+    fn _build_nvme_cmd_data(&self, dtype: u8, dspec: u16) -> PyResult<Option<NvmeCmdData>> {
+        if !self.use_uring_cmd {
+            return Ok(None);
+        }
+        Ok(Some(NvmeCmdData {
+            nsid: self
+                .nvme_nsid
+                .ok_or_else(|| PyRuntimeError::new_err("NVMe namespace ID not available"))?,
+            lba_shift: self
+                .nvme_lba_shift
+                .ok_or_else(|| PyRuntimeError::new_err("NVMe LBA shift not available"))?,
+            dtype,
+            dspec,
+        }))
+    }
 }
 
 #[pymethods]
@@ -1220,6 +1835,7 @@ impl RawBlockDevice {
             writable,
             use_odirect = false,
             use_iouring = false,
+            use_uring_cmd = false,
             alignment = 4096,
             io_engine = None,
             iouring_queue_depth = RING_SIZE
@@ -1231,6 +1847,7 @@ impl RawBlockDevice {
         writable: bool,
         use_odirect: bool,
         use_iouring: bool,
+        use_uring_cmd: bool,
         alignment: usize,
         io_engine: Option<String>,
         iouring_queue_depth: usize,
@@ -1241,6 +1858,7 @@ impl RawBlockDevice {
             use_odirect,
             alignment,
             use_iouring,
+            use_uring_cmd,
             io_engine,
             iouring_queue_depth,
         )
@@ -1251,6 +1869,27 @@ impl RawBlockDevice {
         Ok(self.size)
     }
 
+    /// Get NVMe namespace ID (only available when use_uring_cmd=true)
+    fn nvme_nsid(&self) -> PyResult<u32> {
+        self.nvme_nsid.ok_or_else(|| {
+            PyRuntimeError::new_err("NVMe namespace ID not available (use_uring_cmd not enabled)")
+        })
+    }
+
+    /// Get NVMe LBA shift (log2 of LBA size, only available when use_uring_cmd=true)
+    fn nvme_lba_shift(&self) -> PyResult<u32> {
+        self.nvme_lba_shift.ok_or_else(|| {
+            PyRuntimeError::new_err("NVMe LBA shift not available (use_uring_cmd not enabled)")
+        })
+    }
+
+    /// Get NVMe LBA size in bytes (only available when use_uring_cmd=true)
+    fn nvme_lba_size(&self) -> PyResult<u32> {
+        self.nvme_lba_size.ok_or_else(|| {
+            PyRuntimeError::new_err("NVMe LBA size not available (use_uring_cmd not enabled)")
+        })
+    }
+
     /// Register fixed buffers for zero-copy io_uring operations.
     ///
     /// - Pre-registering memory buffers with the kernel
@@ -1292,7 +1931,6 @@ impl RawBlockDevice {
         }
 
         if let Some(ring) = &self.ring {
-            let ring = ring.lock().unwrap();
             let mut iovecs: Vec<libc::iovec> = Vec::new();
             for (ptr, size) in buffer_ptrs.iter().zip(buffer_sizes.iter()) {
                 iovecs.push(libc::iovec {
@@ -1301,7 +1939,17 @@ impl RawBlockDevice {
                 });
             }
             unsafe {
-                match ring.submitter().register_buffers(&iovecs) {
+                let result = match ring {
+                    IoUringWrapper::Standard(ring) => {
+                        let ring = ring.lock().unwrap();
+                        ring.submitter().register_buffers(&iovecs)
+                    }
+                    IoUringWrapper::Big(ring) => {
+                        let ring = ring.lock().unwrap();
+                        ring.submitter().register_buffers(&iovecs)
+                    }
+                };
+                match result {
                     Ok(_) => {
                         self.fixed_buffers_registered.store(true, Ordering::Relaxed);
                     }
@@ -1395,6 +2043,7 @@ impl RawBlockDevice {
         let fd = self.fd;
         let use_odirect = self.use_odirect;
         let alignment = self.alignment;
+        let use_uring_cmd = self.use_uring_cmd;
         let fixed_buffers_registered = self.fixed_buffers_registered.load(Ordering::Relaxed);
         // Clone the fixed buffer map before releasing GIL to avoid lock contention
         let fixed_buffer_map: HashMap<usize, (u16, usize)> = if fixed_buffers_registered {
@@ -1403,6 +2052,17 @@ impl RawBlockDevice {
         } else {
             HashMap::new()
         };
+
+        let nvme_cmd_data_base = if use_uring_cmd {
+            Some((
+                self.nvme_nsid
+                    .ok_or_else(|| PyRuntimeError::new_err("NVMe namespace ID not available"))?,
+                self.nvme_lba_shift
+                    .ok_or_else(|| PyRuntimeError::new_err("NVMe LBA shift not available"))?,
+            ))
+        } else {
+            None
+        };
         let in_flight_count = Arc::clone(&self.in_flight_count);
         let queue = Arc::clone(self.queue.as_ref().unwrap());
         let batch_ready = Arc::clone(self.batch_ready.as_ref().unwrap());
@@ -1451,6 +2111,18 @@ impl RawBlockDevice {
                     (ptr, None, fixed_idx)
                 };
 
+                // Build NVMe command data
+                let nvme_cmd_data = if let Some((nsid, lba_shift)) = nvme_cmd_data_base {
+                    Some(NvmeCmdData {
+                        nsid,
+                        lba_shift,
+                        dtype: 0,
+                        dspec: 0,
+                    })
+                } else {
+                    None
+                };
+
                 let sub = IoSubmission {
                     fd,
                     offset,
@@ -1463,6 +2135,7 @@ impl RawBlockDevice {
                     original_ptr: None,
                     payload_len: None,
                     batch_id,
+                    nvme_cmd_data,
                 };
 
                 submissions.push((sub, comp));
@@ -1697,6 +2370,7 @@ impl RawBlockDevice {
                 original_ptr: None,
                 payload_len: None,
                 batch_id: 0,
+                nvme_cmd_data: self._build_nvme_cmd_data(0, 0)?,
             };
             {
                 let q = self.queue.as_ref().expect("queue must exist");
@@ -1725,6 +2399,146 @@ impl RawBlockDevice {
                 original_ptr: Some(ptr as usize),
                 payload_len: Some(payload_len),
                 batch_id: 0,
+                nvme_cmd_data: self._build_nvme_cmd_data(0, 0)?,
+            };
+            {
+                let q = self.queue.as_ref().expect("queue must exist");
+                let mut q = q.lock().unwrap();
+                q.push(sub);
+            }
+            if let Some(batch_ready) = &self.batch_ready {
+                batch_ready.signal_producer();
+            }
+            py.allow_threads(move || comp.wait())
+        };
+
+        release_pybuffer(view);
+        res?;
+        Ok(())
+    }
+
+    /// Synchronous write using io_uring.
+    #[pyo3(signature = (offset, data, payload_len, total_len = None))]
+    fn write_uring(
+        &self,
+        py: Python<'_>,
+        offset: u64,
+        data: &Bound<'_, PyAny>,
+        payload_len: usize,
+        total_len: Option<usize>,
+    ) -> PyResult<()> {
+        if !self.use_iouring {
+            return Err(PyRuntimeError::new_err("io_uring not enabled"));
+        }
+        if self.closed.load(Ordering::Relaxed) {
+            return Err(PyRuntimeError::new_err("device is closed"));
+        }
+
+        let view = get_pybuffer(py, data, false)?;
+        let ptr = view.buf as *const u8;
+        if ptr.is_null() {
+            release_pybuffer(view);
+            return Err(PyValueError::new_err("null buffer pointer"));
+        }
+
+        let cap = view.len as usize;
+        let total_len = total_len.unwrap_or(payload_len);
+        if cap < payload_len {
+            release_pybuffer(view);
+            return Err(PyValueError::new_err(format!(
+                "input buffer too small: cap={cap} need={payload_len}"
+            )));
+        }
+        if total_len < payload_len {
+            release_pybuffer(view);
+            return Err(PyValueError::new_err("total_len must be >= payload_len"));
+        }
+
+        let align = self.alignment;
+        if self.use_odirect {
+            #[allow(clippy::manual_is_multiple_of)]
+            if (offset as usize) % align != 0 {
+                release_pybuffer(view);
+                return Err(PyValueError::new_err("O_DIRECT requires aligned offset"));
+            }
+            #[allow(clippy::manual_is_multiple_of)]
+            if total_len % align != 0 {
+                release_pybuffer(view);
+                return Err(PyValueError::new_err("O_DIRECT requires aligned total_len"));
+            }
+        }
+
+        // Check if the buffer is aligned for O_DIRECT
+        let ptr_aligned = if self.use_odirect {
+            (ptr as usize).is_multiple_of(align)
+        } else {
+            true
+        };
+
+        // Fixed buffers are pre-registered with io_uring, enabling true zero-copy I/O
+        let use_fixed = self.fixed_buffers_registered.load(Ordering::Relaxed);
+        let fixed_idx = if use_fixed && ptr_aligned {
+            let map = self.fixed_buffer_map.lock().unwrap();
+            let ptr_addr = ptr as usize;
+            map.get(&ptr_addr).map(|(idx, _)| *idx)
+        } else {
+            None
+        };
+
+        // Use bounce buffer if:
+        // Buffer is not aligned (O_DIRECT requirement)
+        // Buffer capacity is less than total_len
+        let use_bounce = !ptr_aligned || cap < total_len;
+
+        let res = if !use_bounce {
+            self.in_flight_count.fetch_add(1, Ordering::Relaxed);
+            let comp = Arc::new(IoCompletion::new());
+            let sub = IoSubmission {
+                fd: self.fd,
+                offset,
+                len: total_len,
+                ptr_addr: ptr as usize,
+                is_write: true,
+                completion: comp.clone(),
+                fixed_buffer_idx: fixed_idx,
+                bounce: None,
+                original_ptr: None,
+                payload_len: None,
+                batch_id: 0,
+                nvme_cmd_data: self._build_nvme_cmd_data(0, 0)?,
+            };
+            {
+                let q = self.queue.as_ref().expect("queue must exist");
+                let mut q = q.lock().unwrap();
+                q.push(sub);
+            }
+            if let Some(batch_ready) = &self.batch_ready {
+                batch_ready.signal_producer();
+            }
+            py.allow_threads(move || comp.wait())
+        } else {
+            let bounce = AlignedBuf::new(total_len, align)?;
+            let bounce_arc = std::sync::Arc::new(bounce);
+            let bounce_ptr = bounce_arc.as_mut_ptr();
+            // Copy data to bounce buffer before submission
+            unsafe {
+                std::ptr::copy_nonoverlapping(ptr, bounce_ptr, payload_len);
+            }
+            self.in_flight_count.fetch_add(1, Ordering::Relaxed);
+            let comp = Arc::new(IoCompletion::new());
+            let sub = IoSubmission {
+                fd: self.fd,
+                offset,
+                len: total_len,
+                ptr_addr: bounce_ptr as usize,
+                is_write: true,
+                completion: comp.clone(),
+                fixed_buffer_idx: None,
+                bounce: Some(bounce_arc),
+                original_ptr: None,
+                payload_len: Some(payload_len),
+                batch_id: 0,
+                nvme_cmd_data: self._build_nvme_cmd_data(0, 0)?,
             };
             {
                 let q = self.queue.as_ref().expect("queue must exist");
@@ -1837,6 +2651,8 @@ impl RawBlockDevice {
         } else {
             HashMap::new()
         };
+        // Get NVMe data for io_uring_cmd
+        let nvme_cmd_data = self._build_nvme_cmd_data(0, 0)?;
         let in_flight_count = Arc::clone(&self.in_flight_count);
         let queue = Arc::clone(self.queue.as_ref().unwrap());
         let batch_ready = Arc::clone(self.batch_ready.as_ref().unwrap());
@@ -1898,6 +2714,7 @@ impl RawBlockDevice {
                     original_ptr: None,
                     payload_len: None,
                     batch_id,
+                    nvme_cmd_data: nvme_cmd_data.clone(),
                 };
 
                 submissions.push((sub, comp));
@@ -2235,8 +3052,16 @@ impl RawBlockDevice {
 
             if self.fixed_buffers_registered.load(Ordering::Relaxed) {
                 if let Some(ring) = &self.ring {
-                    let ring = ring.lock().unwrap();
-                    let _ = ring.submitter().unregister_buffers();
+                    let _ = match ring {
+                        IoUringWrapper::Standard(ring) => {
+                            let ring = ring.lock().unwrap();
+                            ring.submitter().unregister_buffers()
+                        }
+                        IoUringWrapper::Big(ring) => {
+                            let ring = ring.lock().unwrap();
+                            ring.submitter().unregister_buffers()
+                        }
+                    };
                 }
                 self.fixed_buffers_registered
                     .store(false, Ordering::Relaxed);
diff --git a/tests/v1/distributed/test_raw_block_l2_adapter.py b/tests/v1/distributed/test_raw_block_l2_adapter.py
index 697dad81ee..db3102e959 100644
--- a/tests/v1/distributed/test_raw_block_l2_adapter.py
+++ b/tests/v1/distributed/test_raw_block_l2_adapter.py
@@ -134,12 +134,16 @@ def _make_config(
     *,
     slot_bytes: int = 64 * 1024,
     capacity_bytes: int = 0,
+    io_engine: str = "posix",
+    use_uring_cmd: bool = False,
 ) -> RawBlockL2AdapterConfig:
     return RawBlockL2AdapterConfig(
         device_path=device_path,
         slot_bytes=slot_bytes,
         capacity_bytes=capacity_bytes,
         use_odirect=False,
+        io_engine=io_engine,
+        use_uring_cmd=use_uring_cmd,
         block_align=4096,
         header_bytes=4096,
         meta_total_bytes=1 * 1024 * 1024,
@@ -218,6 +222,47 @@ def _run_load(adapter: RawBlockL2Adapter, keys, objects):
     return task_id, adapter.query_load_result(task_id)
 
 
+def test_raw_block_l2_adapter_config_parses_uring_flags():
+    cfg = RawBlockL2AdapterConfig.from_dict(
+        {
+            "type": "raw_block",
+            "device_path": "/tmp/raw-block-dev",
+            "slot_bytes": 64 * 1024,
+            "use_odirect": False,
+            "io_engine": "io_uring",
+        }
+    )
+
+    assert cfg.io_engine == "io_uring"
+    assert cfg.use_uring_cmd is False
+
+    with pytest.raises(ValueError, match="use_uring_cmd requires io_uring"):
+        RawBlockL2AdapterConfig.from_dict(
+            {
+                "type": "raw_block",
+                "device_path": "/tmp/raw-block-dev",
+                "slot_bytes": 64 * 1024,
+                "use_uring_cmd": True,
+            }
+        )
+
+
+def test_raw_block_l2_adapter_uring_cmd_rejects_regular_file():
+    with tempfile.TemporaryDirectory() as td:
+        dev_path = os.path.join(td, "dev.bin")
+        with open(dev_path, "wb") as f:
+            f.truncate(8 * 1024 * 1024)
+
+        with pytest.raises(ValueError, match="NVMe namespace character device"):
+            RawBlockL2Adapter(
+                _make_config(
+                    dev_path,
+                    io_engine="io_uring",
+                    use_uring_cmd=True,
+                )
+            )
+
+
 @requires_raw_block_ext
 def test_raw_block_l2_adapter_store_lookup_load_roundtrip():
     with tempfile.TemporaryDirectory() as td:
diff --git a/tests/v1/storage_backend/test_raw_block_uring_cmd.py b/tests/v1/storage_backend/test_raw_block_uring_cmd.py
new file mode 100644
index 0000000000..e109953acc
--- /dev/null
+++ b/tests/v1/storage_backend/test_raw_block_uring_cmd.py
@@ -0,0 +1,320 @@
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests for io_uring command (passthrough) support in Rust raw block backend."""
+
+# Standard
+from unittest.mock import MagicMock, patch
+import asyncio
+import os
+
+# Third Party
+import pytest
+import torch
+
+# First Party
+from lmcache.logging import init_logger
+from lmcache.v1.metadata import LMCacheMetadata
+from lmcache.v1.storage_backend.plugins.rust_raw_block_backend import (
+    RustRawBlockBackend,
+)
+from lmcache.v1.storage_backend.raw_block.core import (
+    RawBlockCore,
+)
+
+logger = init_logger(__name__)
+
+
+# GLobal test device configuration with environment variables
+TEST_DEVICES = {
+    "block_device": os.environ.get("LMCACHE_TEST_BLOCK_DEVICE", "/dev/nvme0n1"),
+    "char_device": os.environ.get("LMCACHE_TEST_CHAR_DEVICE", "/dev/ng0n1"),
+    "null_device": os.environ.get("LMCACHE_TEST_NULL_DEVICE", "/dev/null"),
+}
+
+
+def _get_sysfs_path(device_path: str) -> str:
+    """Derive sysfs path from device path."""
+
+    device_name = os.path.basename(device_path)
+
+    if device_name.startswith("ng"):
+        parts = device_name[2:]
+        device_name = f"nvme{parts}"
+
+    return f"/sys/block/{device_name}"
+
+
+def _has_ext() -> bool:
+    """Check if the Rust raw block I/O extension is available."""
+    try:
+        # Third Party
+        import lmcache_rust_raw_block_io  # noqa: F401
+
+        return True
+    except Exception:
+        return False
+
+
+# Skip all tests in this file if the Rust extension is not available
+pytestmark = pytest.mark.skipif(
+    not _has_ext(), reason="lmcache_rust_raw_block_io extension not available"
+)
+
+
+@pytest.fixture
+def loop_in_thread():
+    loop = asyncio.new_event_loop()
+    try:
+        yield loop
+    finally:
+        loop.close()
+
+
+class MockConfig:
+    """Mock configuration for testing."""
+
+    def __init__(
+        self,
+        device_path: str,
+        use_uring_cmd: bool = False,
+        meta_total_bytes=4 * 1024 * 1024,
+    ):
+        self.extra_config = {
+            "rust_raw_block.device_path": device_path,
+            "rust_raw_block.use_odirect": False,
+            "rust_raw_block.use_uring": True,
+            "rust_raw_block.use_uring_cmd": use_uring_cmd,
+            "rust_raw_block.capacity_bytes": 1024 * 1024 * 1024,  # 1GB
+            "rust_raw_block.block_align": 4096,
+            "rust_raw_block.header_bytes": 4096,
+            "rust_raw_block.meta_total_bytes": meta_total_bytes,
+        }
+
+
+class MockMetadata:
+    """Mock metadata for testing."""
+
+    def __init__(self, worker_id: int = 0, world_size: int = 1):
+        self.worker_id = worker_id
+        self.world_size = world_size
+
+
+class MockLocalCPUBackend:
+    """Mock local CPU backend for testing."""
+
+    def __init__(self):
+        pass
+
+    def get_memory_allocator(self):
+        return None
+
+    def get_full_chunk_size_bytes(self) -> int:
+        """return a default chunk size only for testing."""
+        return 256 * 1024
+
+
+def _build_rust_raw_block_metadata(
+    worker_id: int = 0,
+    world_size: int = 1,
+) -> LMCacheMetadata:
+    return LMCacheMetadata(
+        model_name="test_model",
+        world_size=world_size,
+        local_world_size=world_size,
+        worker_id=worker_id,
+        local_worker_id=worker_id,
+        kv_dtype=torch.bfloat16,
+        kv_shape=(4, 2, 256, 8, 128),
+    )
+
+
+def _build_rust_raw_block_local_cpu_backend() -> MagicMock:
+    local_cpu_backend = MagicMock()
+    local_cpu_backend.get_full_chunk_size_bytes.return_value = 4096
+    return local_cpu_backend
+
+
+def _build_transfer_limit_backend(
+    dev_path: str,
+    max_data_transfer_size: int | None = None,
+) -> RustRawBlockBackend:
+    config = MockConfig(device_path=dev_path, use_uring_cmd=True)
+    if max_data_transfer_size is not None:
+        config.extra_config["rust_raw_block.max_data_transfer_size"] = (
+            max_data_transfer_size
+        )
+
+    metadata = MockMetadata()
+    loop = asyncio.new_event_loop()
+    try:
+        with (
+            patch.object(RawBlockCore, "_rawdev", return_value=MagicMock()),
+            patch.object(RawBlockCore, "_ensure_capacity_and_layout"),
+            patch.object(RawBlockCore, "_load_checkpoint_from_device"),
+        ):
+            backend = RustRawBlockBackend(
+                config=config,
+                metadata=metadata,
+                local_cpu_backend=MockLocalCPUBackend(),
+                loop=loop,
+                dst_device="cpu",
+            )
+            return backend
+    finally:
+        loop.close()
+
+
+def test_uring_cmd_requires_character_device(loop_in_thread):
+    """Test that io_uring_cmd requires a character device, not a block device."""
+    # This test requires a block device device
+    # Skip if this doesn't exist
+    device_path = TEST_DEVICES["block_device"]
+
+    if not os.path.exists(device_path):
+        pytest.skip(f"Test device {device_path} not found.")
+
+    config = MockConfig(device_path=device_path, use_uring_cmd=True)
+    metadata = MockMetadata(worker_id=0, world_size=1)
+    local_cpu_backend = MockLocalCPUBackend()
+
+    # This should raise an error because the device is not a character device
+    with pytest.raises(
+        ValueError, match="use_uring_cmd requires an NVMe namespace character device"
+    ):
+        RustRawBlockBackend(
+            config=config,
+            metadata=metadata,
+            local_cpu_backend=local_cpu_backend,
+            loop=loop_in_thread,
+        )
+
+
+def test_uring_cmd_get_nvme_info(loop_in_thread):
+    """Test getting NVMe namespace ID and LBA size from character device."""
+    # This test requires a nvme NS character device
+    # Skip if this doesn't exist
+    device_path = TEST_DEVICES["char_device"]
+
+    if not os.path.exists(device_path):
+        pytest.skip(f"Test device {device_path} not found.")
+
+    config = MockConfig(device_path=device_path, use_uring_cmd=True)
+    metadata = MockMetadata(worker_id=0, world_size=1)
+    local_cpu_backend = MockLocalCPUBackend()
+
+    try:
+        backend = RustRawBlockBackend(
+            config=config,
+            metadata=metadata,
+            local_cpu_backend=local_cpu_backend,
+            loop=loop_in_thread,
+        )
+
+        # Get the raw device
+        raw_device = backend._core.raw_device()
+
+        # Test getting namespace ID
+        nsid = raw_device.nvme_nsid()
+        assert nsid > 0, f"Expected positive nsid, got {nsid}"
+        logger.info(f"NVMe namespace ID: {nsid}")
+
+        # Test getting LBA size
+        lba_size = raw_device.nvme_lba_size()
+        assert lba_size > 0, f"Expected positive lba_size, got {lba_size}"
+        logger.info(f"NVMe LBA size: {lba_size} bytes")
+
+    except Exception as e:
+        pytest.fail(f"Failed to get NVMe info: {e}")
+
+
+def test_uring_cmd_disabled(loop_in_thread):
+    """Test that NVMe methods are not available when use_uring_cmd is disabled."""
+    config = MockConfig(device_path=TEST_DEVICES["null_device"], use_uring_cmd=False)
+    metadata = MockMetadata(worker_id=0, world_size=1)
+    local_cpu_backend = MockLocalCPUBackend()
+    raw_device = MagicMock()
+    raw_device.nvme_nsid.side_effect = RuntimeError("use_uring_cmd not enabled")
+    raw_device.nvme_lba_size.side_effect = RuntimeError("use_uring_cmd not enabled")
+
+    with (
+        patch.object(RawBlockCore, "_rawdev", return_value=MagicMock()),
+        patch.object(RawBlockCore, "_ensure_capacity_and_layout"),
+        patch.object(RawBlockCore, "_load_checkpoint_from_device"),
+    ):
+        backend = RustRawBlockBackend(
+            config=config,
+            metadata=metadata,
+            local_cpu_backend=local_cpu_backend,
+            loop=loop_in_thread,
+        )
+        backend._raw = raw_device
+
+    # These should raise errors when use_uring_cmd is disabled
+    with pytest.raises(RuntimeError, match="use_uring_cmd not enabled"):
+        raw_device.nvme_nsid()
+
+    with pytest.raises(RuntimeError, match="use_uring_cmd not enabled"):
+        raw_device.nvme_lba_size()
+
+
+def test_uring_cmd_auto_transfer_limit_from_sysfs_ng_device():
+    expected_path = (
+        f"{_get_sysfs_path(TEST_DEVICES['char_device'])}/queue/max_hw_sectors_kb"
+    )
+    with patch(
+        "lmcache.v1.storage_backend.raw_block.core._read_sysfs_int",
+        return_value=1024,
+    ) as mock_read:
+        backend = _build_transfer_limit_backend(
+            TEST_DEVICES["char_device"], max_data_transfer_size=-1
+        )
+
+    mock_read.assert_called_once_with(expected_path)
+    assert backend._core.max_data_transfer_size == 1024 * 1024
+
+
+def test_uring_cmd_auto_transfer_limit_fails_when_sysfs_unavailable():
+    expected_path = (
+        f"{_get_sysfs_path(TEST_DEVICES['char_device'])}/queue/max_hw_sectors_kb"
+    )
+    with patch(
+        "lmcache.v1.storage_backend.raw_block.core._read_sysfs_int",
+        return_value=None,
+    ) as mock_read:
+        with pytest.raises(RuntimeError, match="failed to read max_hw_sectors_kb"):
+            _build_transfer_limit_backend(
+                TEST_DEVICES["char_device"], max_data_transfer_size=-1
+            )
+
+    mock_read.assert_called_once_with(expected_path)
+
+
+def test_uring_cmd_explicit_transfer_limit_must_be_block_aligned():
+    """Test that explicitly configured max_data_transfer_size must be block-aligned."""
+    # Default block_align is 4096 bytes
+    # 4096 is block-aligned (4096 % 4096 == 0)
+    backend = _build_transfer_limit_backend(
+        TEST_DEVICES["char_device"], max_data_transfer_size=4096
+    )
+    assert backend._core.max_data_transfer_size == 4096
+
+    # 8192 is block-aligned (8192 % 4096 == 0)
+    backend = _build_transfer_limit_backend(
+        TEST_DEVICES["char_device"], max_data_transfer_size=8192
+    )
+    assert backend._core.max_data_transfer_size == 8192
+
+    # 5000 is NOT block-aligned (5000 % 4096 != 0)
+    with pytest.raises(
+        ValueError,
+        match=r"max_data_transfer_size \(5000\) must be a multiple of "
+        "block_align \(4096\)",
+    ):
+        _build_transfer_limit_backend(
+            TEST_DEVICES["char_device"], max_data_transfer_size=5000
+        )
+
+
+if __name__ == "__main__":
+    # Run tests with pytest
+    pytest.main([__file__, "-v"])
diff --git a/tests/v1/storage_backend/test_rust_raw_block_backend.py b/tests/v1/storage_backend/test_rust_raw_block_backend.py
index 48d5308e9f..099933ba9a 100644
--- a/tests/v1/storage_backend/test_rust_raw_block_backend.py
+++ b/tests/v1/storage_backend/test_rust_raw_block_backend.py
@@ -178,6 +178,7 @@ def close(self):
                 "use_odirect": False,
                 "alignment": 4096,
                 "io_engine": "io_uring",
+                "use_uring_cmd": False,
                 "iouring_queue_depth": 512,
             }
         ]

From 8efc6a61d0a2d3090775017854f9f5588b43d0f2 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 11 Jun 2026 13:09:59 +0800
Subject: [PATCH 40/57] [Core] implement per-group `tokens_per_chunk` and
 `slots_per_chunk`, instead of inferring from cache_config.block_size (#3616)

Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
---
 docs/design/cli/commands/describe.md          |  14 +-
 .../vllm/hybrid-kv-cache-groups.md            |  69 +++--
 docs/source/cli/describe.rst                  |   6 +-
 docs/source/mp/http_api.rst                   |   5 +-
 docs/source/recipes/deepseek_v4_flash.rst     | 107 +++++++
 docs/source/recipes/index.rst                 |   8 +
 .../commands/bench/server_bench/command.py    |  25 +-
 .../commands/bench/server_bench/helpers.py    |  11 +-
 lmcache/cli/commands/describe.py              |   4 +-
 lmcache/integration/vllm/kv_cache_groups.py   |   6 +
 .../integration/vllm/lmcache_mp_connector.py  | 286 +++++++++---------
 .../vllm/vllm_multi_process_adapter.py        |  47 +--
 lmcache/v1/gpu_connector/utils.py             |  11 -
 lmcache/v1/kv_layer_groups.py                 | 201 ++++++------
 lmcache/v1/multiprocess/gpu_context.py        |  45 ++-
 lmcache/v1/multiprocess/group_view.py         |  62 ++--
 lmcache/v1/multiprocess/modules/blend_v3.py   |  25 +-
 .../v1/multiprocess/modules/gpu_transfer.py   |   8 +-
 .../transfer_context/worker_transfer.py       |   6 +-
 lmcache/v1/platform/cache_context.py          |   6 +-
 lmcache/v1/platform/cpu/cache_context.py      |  51 ++--
 tests/cli/test_describe.py                    |   8 +-
 tests/v1/multiprocess/test_blend_server_v2.py |   2 +-
 .../test_blend_v3_load_store_opts.py          |  19 +-
 tests/v1/multiprocess/test_cache_server.py    |  16 +-
 tests/v1/multiprocess/test_free_locks.py      |   7 +-
 tests/v1/multiprocess/test_gpu_context.py     |  43 +--
 .../test_gpu_transfer_layout_registry.py      |   2 +-
 .../multiprocess/test_mq_handler_helpers.py   |  11 +-
 tests/v1/test_kv_cache_groups.py              |  63 ++--
 tests/v1/test_kv_layer_groups_manager.py      |  87 ++++--
 tests/v1/test_vllm_kv_cache_groups.py         |  23 +-
 tests/v1/test_vllm_mp_adapter.py              |   5 +-
 33 files changed, 734 insertions(+), 555 deletions(-)
 create mode 100644 docs/source/recipes/deepseek_v4_flash.rst

diff --git a/docs/design/cli/commands/describe.md b/docs/design/cli/commands/describe.md
index 0613a80620..303396fb26 100644
--- a/docs/design/cli/commands/describe.md
+++ b/docs/design/cli/commands/describe.md
@@ -41,8 +41,7 @@ Kernel group index:                      0
 Engine group index:                      0
 Object group index:                      0
 Num layers:                              80
-Physical block size:                     128
-Compress ratio:                          1
+Slots per block:                         128
 Dtype:                                   torch.float16
 MLA:                                     False
 Attention backend:         vLLM non-MLA flash attention
@@ -85,8 +84,7 @@ programmatic access:
         "engine_group_idx": 0,
         "object_group_idx": 0,
         "num_layers": 80,
-        "physical_block_size": 128,
-        "compress_ratio": 1,
+        "slots_per_block": 128,
         "dtype": "torch.float16",
         "is_mla": false,
         "attention_backend": "vLLM non-MLA flash attention",
@@ -119,9 +117,8 @@ Each kernel group section includes:
   `kernel_group_idx` enumerates the manager's kernel groups, `engine_group_idx`
   is the paged-block address space (0 for non-hybrid), and `object_group_idx` is
   the owning object group.
-- **Num layers** and **Physical block size** — the group's layer count and
+- **Num layers** and **Slots per block** — the group's layer count and
   `shape_desc.bs`.
-- **Compress ratio** — logical tokens per physical slot (1 for non-compressed).
 - **Dtype** and **MLA** — the group's torch dtype and MLA flag.
 - **Attention backend** — which attention implementation is active (e.g.,
   `vLLM non-MLA flash attention`, `vLLM MLA`, `SGLang MHA`), derived from the
@@ -301,7 +298,6 @@ live inside each group:
 ```python
 "kv_cache_layout": {
     "num_layers": 80,
-    "inference_engine_logical_block_size": 128,
     "num_blocks": 2048,
     "cache_size_per_token": 327680,
     "kernel_groups": [
@@ -311,8 +307,8 @@ live inside each group:
             "object_group_idx": 0,
             "num_layers": 80,
             "layer_indices": [0, 1, ...],
-            "physical_block_size": 128,
-            "compress_ratio": 1,
+            "tokens_per_block": 128,
+            "slots_per_block": 128,
             "dtype": "torch.float16",
             "gpu_kv_concrete_shape": "80 x [2, 2048, 128, 8, 128]",
             "is_mla": false,
diff --git a/docs/design/integration/vllm/hybrid-kv-cache-groups.md b/docs/design/integration/vllm/hybrid-kv-cache-groups.md
index 40769b0a26..3204dca0e3 100644
--- a/docs/design/integration/vllm/hybrid-kv-cache-groups.md
+++ b/docs/design/integration/vllm/hybrid-kv-cache-groups.md
@@ -27,26 +27,28 @@ store/retrieve address those infos directly.
   IDs are indexed by that order.
 - Reuse one grouping primitive (`group_layers_by_identity`) on both the vLLM and
   server sides so group order matches.
-- **Not** in scope: sliding-window load-plan trimming; DeepSeek-V4 slot
-  compression (`compress_ratio > 1`, packing several logical tokens per physical
-  slot — the per-group machinery exists but is validated separately); HMA on the
-  non-GPU transfer path (it rejects multi-group); removing `layout_hints` (still
-  used for tensor layout). Per-group block *sizes* and cross-layer KV sharing
-  *are* supported (see Store and retrieve).
+- **Not** in scope: sliding-window load-plan trimming; HMA on the non-GPU
+  transfer path (it rejects multi-group); removing `layout_hints` (still used
+  for tensor layout). Per-group block *sizes*, cross-layer KV sharing, and
+  DeepSeek-V4-style slot compression (`compress_ratio > 1`, packing several
+  logical tokens per physical slot) *are* supported (see Store and retrieve).
 
 ## Types
 
 - **`EngineGroupInfo`** (`msgspec.Struct`): `engine_group_id` (which engine
-  block group its layers live in; dense from 0) + `layer_indices`. Several infos
-  may share an `engine_group_id` when one engine group is split by physical
-  transfer identity. The list order is the protocol-visible group order; an
-  empty list means a single non-hybrid group.
+  block group its layers live in; dense from 0) + `layer_indices` +
+  `tokens_per_block` (logical tokens covered by one of the group's paged
+  chunks, from the engine's KV cache spec `block_size`; `0` = unreported).
+  Several infos may share an `engine_group_id` when one engine group is split
+  by physical transfer identity. The list order is the protocol-visible group
+  order; an empty list means a single non-hybrid group.
 - Helpers in `group_view.py` operate on `Sequence[EngineGroupInfo]`:
   `num_engine_groups`, `num_engine_group_infos`, `expand_engine_block_ids`,
   `get_engine_group_indices`.
 - **`KVLayerGroupInfo`** (runtime, server-only): layer indices,
-  `PageBufferShapeDesc`, dtype, compress ratio, physical chunk size,
-  `engine_group_idx`. Derived from real tensors — never the API contract.
+  `PageBufferShapeDesc`, dtype, `tokens_per_block` / `slots_per_block`,
+  `slots_per_chunk`, `engine_group_idx`. Derived from real tensors — never the
+  API contract.
 
 ## Data flow
 
@@ -83,22 +85,33 @@ info reuses its source engine group's block IDs), so `STORE`/`RETRIEVE` receive
 `list[list[int]]` indexed by info order. The server loop is then trivial: for
 info `i`, use `gpu_block_ids[i]`.
 
-### Per-group block sizes
-
-Engine groups may use *different* `block_size`s. When a hybrid model's
-attention types have different per-token page sizes, vLLM unifies the physical
-page size by scaling the smaller-page group's `block_size` up (e.g.
-`google/gemma-4-E4B-it`: sliding-window groups `block_size=32`, full-attention
-groups `block_size=16`). The connector's block accounting (hit counts,
-`blocks_in_chunk`, the `start`/`end` range) stays in the *canonical* unit —
-`cache_config.block_size`, the GCD of all group block sizes — while each group's
-block IDs are in its own `block_size`. So the scheduler-side slice divides the
-canonical range by `k_g = group_block_size / canonical` per group
-(`_slice_block_ids`), and the server counts `blocks_per_chunk = chunk // bs` per
-group (`GPUCacheContext.blocks_for_tokens`). The server's per-group
-`compress_ratio` is derived from the *per-group* logical block size
-(`max(canonical, bs)`), so an uncompressed larger-block group gets
-`compress_ratio == 1` rather than being rejected.
+### Per-group block sizes and compression
+
+There is no single "engine block size". Each group has two per-group
+quantities, and everything else is derived from them:
+
+- **`tokens_per_block`** — logical tokens covered by one of the group's paged
+  chunks (one block ID). Read from the group's KV cache spec `block_size` in
+  `kv_cache_config` at initialization and carried in `EngineGroupInfo`.
+  Hybrid models mix values freely (`google/gemma-4-E4B-it`: sliding-window
+  groups 32, full-attention groups 16; DeepSeek-V4-Flash: 256/64/8/4).
+- **`slots_per_block`** — physical slots in one paged chunk, detected from the
+  registered tensors at registration time (the batch-size dimension,
+  `shape_desc.bs`). Only available per kernel group.
+
+A group is compressed when `tokens_per_block > slots_per_block` (each physical
+slot packs `tokens_per_block // slots_per_block` logical tokens): ordinary
+attention has one token per slot, while DeepSeek-V4-Flash's MLA / indexer
+caches pack 4 and 128. No `compress_ratio` is stored — wherever a ratio is
+needed it is computed inline from these two ground-truth quantities. The
+LMCache chunk size must be a multiple of every group's `tokens_per_block`
+(validated at connector init and registration).
+
+The scheduler-side connector does all accounting (hit counts, store/retrieve
+ranges) in *tokens* — the only unit shared by every group — and slices each
+group's block IDs by `token_range / tokens_per_block_g`
+(`slice_block_ids_per_group`). The server counts
+`blocks_per_chunk = lmcache_tokens_per_chunk // tokens_per_block` per group.
 
 ### Cross-layer KV sharing
 
diff --git a/docs/source/cli/describe.rst b/docs/source/cli/describe.rst
index 1b07751210..af8ad57132 100644
--- a/docs/source/cli/describe.rst
+++ b/docs/source/cli/describe.rst
@@ -33,8 +33,7 @@ L2 adapters.
    Engine group index:                              0
    Object group index:                              0
    Num layers:                                     80
-   Physical block size:                           128
-   Compress ratio:                                  1
+   Slots per block:                               128
    Dtype:                               torch.float16
    MLA:                                         False
    Attention backend:    vLLM non-MLA flash attention
@@ -119,8 +118,7 @@ L2 adapters are collected into lists for easy programmatic access:
            "engine_group_idx": 0,
            "object_group_idx": 0,
            "num_layers": 80,
-           "physical_block_size": 128,
-           "compress_ratio": 1,
+           "slots_per_block": 128,
            "dtype": "torch.float16",
            "is_mla": false,
            "attention_backend": "vLLM non-MLA flash attention",
diff --git a/docs/source/mp/http_api.rst b/docs/source/mp/http_api.rst
index 06086b76de..619b0d42af 100644
--- a/docs/source/mp/http_api.rst
+++ b/docs/source/mp/http_api.rst
@@ -237,7 +237,6 @@ prefetch jobs. Intended for operators and debugging, not for monitoring
           "world_size": 1,
           "kv_cache_layout": {
             "num_layers": 32,
-            "inference_engine_logical_block_size": 16,
             "num_blocks": 12345,
             "cache_size_per_token": 131072,
             "kernel_groups": [
@@ -247,8 +246,8 @@ prefetch jobs. Intended for operators and debugging, not for monitoring
                 "object_group_idx": 0,
                 "num_layers": 32,
                 "layer_indices": [0, 1, "..."],
-                "physical_block_size": 16,
-                "compress_ratio": 1,
+                "tokens_per_block": 16,
+                "slots_per_block": 16,
                 "dtype": "torch.bfloat16",
                 "gpu_kv_concrete_shape": "...",
                 "is_mla": false,
diff --git a/docs/source/recipes/deepseek_v4_flash.rst b/docs/source/recipes/deepseek_v4_flash.rst
new file mode 100644
index 0000000000..4bb4baaad6
--- /dev/null
+++ b/docs/source/recipes/deepseek_v4_flash.rst
@@ -0,0 +1,107 @@
+.. _recipe_deepseek_v4_flash:
+
+DeepSeek-V4-Flash
+=================
+
+Validated models
+----------------
+
+- `deepseek-ai/DeepSeek-V4-Flash <https://huggingface.co/deepseek-ai/DeepSeek-V4-Flash>`_
+
+.. tab-set::
+   :sync-group: engine
+
+   .. tab-item:: vLLM
+
+      **Engine documentation:**
+      `DeepSeek-V4-Flash in vLLM supported models
+      <https://docs.vllm.ai/en/latest/models/supported_models.html#text-generation>`_
+      (architecture ``DeepseekV4ForCausalLM``).
+
+      **Status:** Validated with LMCache.
+
+      **Installing vLLM:** DeepSeek-V4-Flash needs the sparse-MLA attention
+      backends and the ``fp8_ds_mla`` KV cache kernels, so install vLLM by
+      following its own recipe rather than a bare ``pip install vllm``:
+      `vLLM DeepSeek-V4-Flash recipe
+      <https://docs.vllm.ai/projects/recipes/en/latest/index.html>`_
+      (also mirrored at https://recipes.vllm.ai/deepseek-ai/DeepSeek-V4-Flash).
+
+      .. warning::
+
+         Use the **latest vLLM release**, not the ``main``/dev branch. The
+         current vLLM development branch is broken for DeepSeek-V4-Flash (the
+         ``fp4`` MoE experts are misdispatched and the real weights fail to
+         load). Pin to the latest tagged release as the vLLM recipe instructs.
+
+      Start the LMCache MP server:
+
+      .. code-block:: bash
+
+         lmcache server --l1-size-gb 100 --eviction-policy LRU
+
+      |
+
+      Start vLLM with the LMCache MP connector (8 GPUs):
+
+      .. code-block:: bash
+
+         vllm serve deepseek-ai/DeepSeek-V4-Flash \
+             --tensor-parallel-size 8 \
+             --enable-expert-parallel \
+             --kv-cache-dtype fp8_ds_mla \
+             --trust-remote-code \
+             --tokenizer-mode deepseek_v4 \
+             --kv-transfer-config \
+             '{"kv_connector":"LMCacheMPConnector", "kv_role":"kv_both"}'
+
+      |
+
+      ``--kv-cache-dtype fp8_ds_mla`` and ``--tokenizer-mode deepseek_v4`` are
+      required for this model; ``--enable-expert-parallel`` distributes the MoE
+      experts across the tensor-parallel ranks. Adjust
+      ``--tensor-parallel-size`` to match your hardware. For the generic
+      LMCache + vLLM wiring (ports, remote hosts, in-process mode), see
+      :doc:`../mp/quickstart`.
+
+      If there are any issues with vLLM setup, please refer to the
+      `vLLM Recipes <https://docs.vllm.ai/projects/recipes/en/latest/index.html>`_
+      for more details.
+
+   .. tab-item:: SGLang
+
+      **Status:** Not validated with LMCache.
+
+   .. tab-item:: TRT-LLM
+
+      **Status:** Not supported. LMCache TRT-LLM integration is in progress.
+
+CacheBlend support
+------------------
+
+Compression support
+-------------------
+
+.. list-table::
+   :header-rows: 1
+   :widths: 25 20 55
+
+   * - Method
+     - Status
+     - Notes
+   * - :doc:`CacheGen <../kv_cache_optimizations/compression/cachegen>`
+     - Not validated
+     -
+
+Caveats
+-------
+
+- **Requires the latest vLLM release.** The vLLM dev branch is currently broken
+  for this model (see the warning above) -- use a tagged release installed via
+  the vLLM recipe.
+- **Sparse-MLA hybrid KV cache.** DeepSeek-V4-Flash interleaves several KV
+  cache groups with different block geometries (the compressed MLA latents are
+  stored as ``fp8``/``uint8`` while the sparse-attention indexer groups are
+  ``float32``), so the groups do not share a single block size. LMCache stores
+  and retrieves each group in its own block size; no extra flags are required
+  beyond ``--kv-cache-dtype fp8_ds_mla``.
diff --git a/docs/source/recipes/index.rst b/docs/source/recipes/index.rst
index ecd8a53c5d..aaf053a3d7 100644
--- a/docs/source/recipes/index.rst
+++ b/docs/source/recipes/index.rst
@@ -43,6 +43,13 @@ Supported architectures
      - TRT-LLM
      - Recipe
 
+   * - ``DeepseekV4ForCausalLM``
+     - ``deepseek-ai/DeepSeek-V4-Flash``
+     - ✓
+     - —
+     - —
+     - :doc:`deepseek_v4_flash`
+
    * - ``MiniMaxM2ForCausalLM``
      - ``MiniMaxAI/MiniMax-M2``
      - ✓
@@ -139,6 +146,7 @@ To add a new architecture:
    :hidden:
    :maxdepth: 1
 
+   deepseek_v4_flash
    minimax_m2
    gemma4
    gemma3
diff --git a/lmcache/cli/commands/bench/server_bench/command.py b/lmcache/cli/commands/bench/server_bench/command.py
index 180ab29086..112be89f1e 100644
--- a/lmcache/cli/commands/bench/server_bench/command.py
+++ b/lmcache/cli/commands/bench/server_bench/command.py
@@ -259,6 +259,7 @@ def run_server_bench(  # noqa: ARG001  (command kept for symmetry with siblings)
         format_kvcache_shape_spec,
         parse_kvcache_shape_spec,
     )
+    from lmcache.v1.multiprocess.group_view import EngineGroupInfo
     from lmcache.v1.multiprocess.mq import MessageQueueClient
 
     use_gpu = args.mode == "gpu"
@@ -370,16 +371,23 @@ def run_server_bench(  # noqa: ARG001  (command kept for symmetry with siblings)
             "head_size": head_size_disp,
             "num_blocks": num_blocks,
             "block_size": block_size,
-            # Tell the server the inference-engine-side logical block
-            # size explicitly. Otherwise ``KVLayerGroupsManager`` falls
-            # back to ``shape_desc.bs``, which on the CPU/HND path can
-            # be the per-block ``num_heads`` value instead of the real
-            # ``block_size`` (HND swaps NH and BS in the tensor shape),
-            # and STORE/RETRIEVE would then expect twice as many block
-            # IDs as the bench client actually sends.
-            "inference_engine_logical_block_size": block_size,
             "dtype": dtype_str,
         }
+        # Tell the server each group's true tokens-per-paged-chunk
+        # explicitly. Otherwise the server falls back to the block size
+        # discovered from the tensors (``shape_desc.bs``), which on the
+        # CPU/HND path can be the per-block ``num_heads`` value instead
+        # of the real ``block_size`` (HND swaps NH and BS in the tensor
+        # shape), and STORE/RETRIEVE would then expect twice as many
+        # block IDs as the bench client actually sends.
+        engine_group_infos = [
+            EngineGroupInfo(
+                engine_group_id=group_idx,
+                layer_indices=tuple(group.layer_indices),
+                tokens_per_block=block_size,
+            )
+            for group_idx, group in enumerate(layer_groups)
+        ]
 
         num_tokens = args.num_tokens
         print(
@@ -449,6 +457,7 @@ def run_server_bench(  # noqa: ARG001  (command kept for symmetry with siblings)
             kv_caches=kv_wrappers if use_handle else None,
             use_gpu=use_gpu,
             use_handle=use_handle,
+            engine_group_infos=engine_group_infos,
         )
         print("REGISTER_KV_CACHE: %s" % ("OK" if register_result else "FAIL"))
         print()
diff --git a/lmcache/cli/commands/bench/server_bench/helpers.py b/lmcache/cli/commands/bench/server_bench/helpers.py
index 02e0a2d80b..7980f8ce55 100644
--- a/lmcache/cli/commands/bench/server_bench/helpers.py
+++ b/lmcache/cli/commands/bench/server_bench/helpers.py
@@ -59,6 +59,7 @@
         RegisterNonGpuContextPayload,
     )
     from lmcache.v1.multiprocess.futures import MessagingFuture
+    from lmcache.v1.multiprocess.group_view import EngineGroupInfo
     from lmcache.v1.multiprocess.mq import MessageQueueClient
     from lmcache.v1.multiprocess.posix_shm import shm_open_pool_as_mmap
     from lmcache.v1.multiprocess.protocols.base import RequestType
@@ -322,6 +323,7 @@ def _send_register_kv_cache(
     kv_caches: list[CudaIPCWrapper] | None = None,
     use_gpu: bool = True,
     use_handle: bool | None = None,
+    engine_group_infos: "list[EngineGroupInfo] | None" = None,
 ) -> "bool | RegisterNonGpuContextResponse":
     """Register a KV cache context with the MP server.
 
@@ -334,6 +336,13 @@ def _send_register_kv_cache(
 
     ``use_handle`` defaults to ``use_gpu`` for backwards compatibility:
     GPU always goes through the handle path, CPU defaults to data.
+
+    ``engine_group_infos`` (handle mode only) carries the per-group
+    metadata — including each group's true ``tokens_per_block`` — so the
+    server does not have to trust the block size discovered from the
+    tensors (which the HND layout can swap with ``num_heads``). ``None``
+    sends an empty list (single non-hybrid group, geometry discovered
+    from the tensors).
     """
     if use_handle is None:
         use_handle = use_gpu
@@ -354,7 +363,7 @@ def _send_register_kv_cache(
             world_size,
             EngineType.VLLM,
             hints,
-            [],  # group_views: empty = single non-hybrid group
+            list(engine_group_infos or ()),
         ]
         result = _call(client, RequestType.REGISTER_KV_CACHE, payloads)
         return result is not _TIMEOUT
diff --git a/lmcache/cli/commands/describe.py b/lmcache/cli/commands/describe.py
index d23350821b..2772dc99a6 100644
--- a/lmcache/cli/commands/describe.py
+++ b/lmcache/cli/commands/describe.py
@@ -241,8 +241,8 @@ def _add_kernel_groups(
                 ("engine_group_idx", "Engine group index"),
                 ("object_group_idx", "Object group index"),
                 ("num_layers", "Num layers"),
-                ("physical_block_size", "Physical block size"),
-                ("compress_ratio", "Compress ratio"),
+                ("tokens_per_block", "Tokens per block"),
+                ("slots_per_block", "Slots per block"),
                 ("dtype", "Dtype"),
                 ("is_mla", "MLA"),
                 ("attention_backend", "Attention backend"),
diff --git a/lmcache/integration/vllm/kv_cache_groups.py b/lmcache/integration/vllm/kv_cache_groups.py
index fdc9459410..4945a08481 100644
--- a/lmcache/integration/vllm/kv_cache_groups.py
+++ b/lmcache/integration/vllm/kv_cache_groups.py
@@ -81,9 +81,14 @@ def create_engine_group_infos_from_vllm(
     # them EXCLUDED_ENGINE_GROUP so they form no group of their own (a
     # wrong-block-size group would corrupt the per-group block-id counts).
     per_layer_group_idx: list[int] | None = None
+    group_tokens_per_block: dict[int, int] = {}
     if vllm_groups:
         per_layer_group_idx = [EXCLUDED_ENGINE_GROUP] * num_layers
         for engine_group_id, group in enumerate(vllm_groups):
+            # The spec's block_size is the logical tokens covered by one of
+            # this group's paged chunks (block IDs); the physical slot count
+            # per chunk is discovered later from the registered tensors.
+            group_tokens_per_block[engine_group_id] = group.kv_cache_spec.block_size
             for name in group.layer_names:
                 per_layer_group_idx[layer_to_idx[name]] = engine_group_id
 
@@ -98,6 +103,7 @@ def create_engine_group_infos_from_vllm(
         EngineGroupInfo(
             engine_group_id=identity[4],
             layer_indices=tuple(indices),
+            tokens_per_block=group_tokens_per_block.get(identity[4], 0),
         )
         for identity, indices in group_layers_by_identity(
             normalized_kv_caches,
diff --git a/lmcache/integration/vllm/lmcache_mp_connector.py b/lmcache/integration/vllm/lmcache_mp_connector.py
index 57b5d1d390..ebdbf78186 100644
--- a/lmcache/integration/vllm/lmcache_mp_connector.py
+++ b/lmcache/integration/vllm/lmcache_mp_connector.py
@@ -5,6 +5,7 @@
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Literal
 import enum
+import math
 import sys
 
 # Third Party
@@ -93,7 +94,6 @@ class SupportsHMA:  # type: ignore[no-redef]
     )
     from vllm.forward_context import ForwardContext
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
-    from vllm.v1.core.kv_cache_utils import BlockHash
     from vllm.v1.kv_cache_interface import KVCacheConfig
     from vllm.v1.request import Request
 
@@ -180,26 +180,25 @@ class LMCacheMPRequestTracker:
 
     request_id: str
 
-    # Read-only lists to track the token ids and block hashes
+    # Read-only list to track the token ids
     all_token_ids: ConstantList[int]
-    block_hashes: ConstantList["BlockHash"]
 
-    # Block ids and hashes will be updated at update_states_after_alloc and
+    # Block ids will be updated at update_states_after_alloc and
     # during generation. Keyed by engine_group_idx; non-HMA models use 0.
     allocated_block_ids: dict[int, list[int]] = field(default_factory=dict)
 
     # Number of scheduled tokens in this request. We keep tracking this to
-    # avoid saving half-full blocks.
+    # avoid saving tokens whose KV has not been computed yet.
     num_scheduled_tokens: int = 0
 
-    # Number of blocks stored will be initialized when lookup the external
+    # Number of tokens stored will be initialized when lookup the external
     # hit tokens and will be updated when processing new requests and cached
     # requests.
-    num_stored_blocks: int = 0
+    num_stored_tokens: int = 0
 
     # Staging load operation -- save vllm and lmcache hit tokens during lookup
-    num_vllm_hit_blocks: int = 0
-    num_lmcache_hit_blocks: int = 0
+    num_vllm_hit_tokens: int = 0
+    num_lmcache_hit_tokens: int = 0
 
     # Main state
     state: LMCacheMPRequestState = LMCacheMPRequestState.PREFETCHING
@@ -210,11 +209,10 @@ def __init__(self, request: "Request"):
         self.request_id = request.request_id
         self.cache_salt: str = request.cache_salt or ""
         self.all_token_ids = request.all_token_ids
-        self.block_hashes = ConstantList(request.block_hashes)
         self.allocated_block_ids = {}
-        self.num_stored_blocks = 0
-        self.num_vllm_hit_blocks = 0
-        self.num_lmcache_hit_blocks = 0
+        self.num_stored_tokens = 0
+        self.num_vllm_hit_tokens = 0
+        self.num_lmcache_hit_tokens = 0
         self.state = LMCacheMPRequestState.PREFETCHING
 
     ####
@@ -224,7 +222,7 @@ def needs_retrieve(self) -> bool:
         """Check whether the current request needs retrieve, will be used
         update_stage_after_alloc"""
         return (
-            self.num_lmcache_hit_blocks > self.num_vllm_hit_blocks
+            self.num_lmcache_hit_tokens > self.num_vllm_hit_tokens
             and self.state != LMCacheMPRequestState.READY
         )
 
@@ -242,11 +240,11 @@ def is_ready_for_retrieving(self) -> bool:
     def increase_num_scheduled_tokens(self, num_new_tokens: int):
         self.num_scheduled_tokens += num_new_tokens
 
-    def increase_num_stored_blocks(self, num_new_blocks: int):
-        """Increase the number of stored blocks for the current request
+    def increase_num_stored_tokens(self, num_new_tokens: int):
+        """Increase the number of stored tokens for the current request
         This function will be called when processing the cached requests.
         """
-        self.num_stored_blocks += num_new_blocks
+        self.num_stored_tokens += num_new_tokens
 
     def append_block_ids(
         self,
@@ -274,12 +272,11 @@ def __repr__(self) -> str:
         return (
             f"LMCacheMPRequestTracker(request_id={self.request_id}, "
             f"num_tokens={len(self.all_token_ids)}, "
-            f"num_block_hashes={len(self.block_hashes)}, "
             f"num_allocated_blocks="
             f"{self.num_allocated_blocks()}, "
-            f"num_stored_blocks={self.num_stored_blocks}, "
-            f"vllm_hit_blocks={self.num_vllm_hit_blocks}, "
-            f"lmcache_hit_blocks={self.num_lmcache_hit_blocks}, "
+            f"num_stored_tokens={self.num_stored_tokens}, "
+            f"vllm_hit_tokens={self.num_vllm_hit_tokens}, "
+            f"lmcache_hit_tokens={self.num_lmcache_hit_tokens}, "
             f"state={self.state})"
         )
 
@@ -297,83 +294,76 @@ class LMCacheMPRequestMetadata:
     @staticmethod
     def GetStoreMetadata(
         tracker: LMCacheMPRequestTracker,
-        blocks_in_chunk: int,
-        vllm_block_size: int,
-        group_block_sizes: list[int],
+        lmcache_tokens_per_chunk: int,
+        group_tokens_per_block: list[int],
     ) -> "LMCacheMPRequestMetadata | None":
         """
         Generate the store metadata for the current request tracker.
 
         Args:
             tracker: The request tracker to generate the metadata from.
-            blocks_in_chunk: the number of ``vllm_block_size`` blocks in a
-                LMCache data chunk
-            vllm_block_size: the vLLM block size (= ``cache_config.block_size``);
-                block IDs and ranges are counted in this unit
-            group_block_sizes: per-engine-group vLLM block size. A group's own
-                block size may be a larger multiple of ``vllm_block_size``
-                (hybrid models).
+            lmcache_tokens_per_chunk: the number of tokens in a LMCache data chunk
+            group_tokens_per_block: per-engine-group tokens covered by one
+                paged chunk (one block ID) of that group, i.e. the group's
+                KV cache spec ``block_size``. Must each divide
+                ``lmcache_tokens_per_chunk`` (hybrid models can mix different values).
         """
-        num_engine_groups = len(group_block_sizes)
-        # Store the blocks that has block hashes
-        # NOTE: the invariant here is that `num_stored_blocks` should
-        # always be a multiple of `blocks_in_chunk`
-        # TODO: This should be checked every time we update the num_stored_blocks
+        num_engine_groups = len(group_tokens_per_block)
+        # NOTE: the invariant here is that `num_stored_tokens` should
+        # always be a multiple of `lmcache_tokens_per_chunk`
+        # TODO: This should be checked every time we update the num_stored_tokens
         #
-        # Why computed_blocks uses max(num_vllm_hit_blocks, num_lmcache_hit_blocks):
+        # Why computed_tokens uses max(num_vllm_hit_tokens, num_lmcache_hit_tokens):
         #
-        # Both values represent a prefix of blocks whose KV data is already
+        # Both values represent a prefix of tokens whose KV data is already
         # available (either from vLLM APC or from LMCache), so they must NOT
         # be summed (that would double-count the overlapping prefix).
         #
-        # * num_lmcache_hit_blocks: LMCache-hit blocks are already counted in
-        #   num_stored_blocks (set during lookup), so they must be included
+        # * num_lmcache_hit_tokens: LMCache-hit tokens are already counted in
+        #   num_stored_tokens (set during lookup), so they must be included
         #   here to keep the upper bound consistent.  They are NOT re-stored.
-        # * num_vllm_hit_blocks: LMCache stores in units of chunks (N blocks),
-        #   so num_lmcache_hit_blocks is rounded DOWN to the nearest chunk
-        #   boundary.  When vLLM APC hits more blocks than that rounded value
-        #   (e.g. APC=44 blocks, LMCache=32 blocks after chunk alignment),
-        #   using only num_lmcache_hit_blocks would set the upper bound too
-        #   low and silently skip the APC-hit blocks that fall between the
+        # * num_vllm_hit_tokens: LMCache stores in units of chunks, so
+        #   num_lmcache_hit_tokens is rounded DOWN to the nearest chunk
+        #   boundary.  When vLLM APC hits more tokens than that rounded value
+        #   (e.g. APC=704 tokens, LMCache=512 tokens after chunk alignment),
+        #   using only num_lmcache_hit_tokens would set the upper bound too
+        #   low and silently skip the APC-hit tokens that fall between the
         #   two values, causing under-storing.  Taking the max ensures we
         #   always use the tighter (larger) of the two hit counts.
-        computed_blocks = tracker.num_scheduled_tokens // vllm_block_size + max(
-            tracker.num_vllm_hit_blocks, tracker.num_lmcache_hit_blocks
+        computed_tokens = tracker.num_scheduled_tokens + max(
+            tracker.num_vllm_hit_tokens, tracker.num_lmcache_hit_tokens
         )
-        # Normalize each group's count to ``vllm_block_size`` units before the
-        # min: a group with block size ``k * vllm_block_size`` holds ``k`` such
-        # blocks per stored block ID (e.g. gemma-4 sliding: 32-token IDs = 2 of
-        # the 16-token blocks).
+        # Each group covers ``len(block_ids) * tokens_per_block`` tokens; the
+        # storable prefix is bounded by the least-covered group (e.g.
+        # gemma-4 sliding: one 32-token ID covers 2x the tokens of a
+        # 16-token full-attention ID).
         allocated_lengths = tracker.num_allocated_blocks()
-        allocated_blocks = (
+        allocated_tokens = (
             min(
                 allocated_lengths.get(engine_group_idx, 0)
-                * (group_block_sizes[engine_group_idx] // vllm_block_size)
+                * group_tokens_per_block[engine_group_idx]
                 for engine_group_idx in range(num_engine_groups)
             )
             if num_engine_groups > 0
             else 0
         )
-        min_available_blocks = min(
-            len(tracker.block_hashes),
-            allocated_blocks,
-            computed_blocks,
+        min_available_tokens = min(
+            len(tracker.all_token_ids),
+            allocated_tokens,
+            computed_tokens,
         )
-        num_staging_blocks = min_available_blocks - tracker.num_stored_blocks
-        num_chunks = num_staging_blocks // blocks_in_chunk
+        num_staging_tokens = min_available_tokens - tracker.num_stored_tokens
+        num_chunks = num_staging_tokens // lmcache_tokens_per_chunk
 
         if num_chunks >= 1:
-            start = tracker.num_stored_blocks
-            end = start + num_chunks * blocks_in_chunk
+            start_token_idx = tracker.num_stored_tokens
+            end_token_idx = start_token_idx + num_chunks * lmcache_tokens_per_chunk
             block_ids = slice_block_ids_per_group(
                 tracker.allocated_block_ids,
-                group_block_sizes,
-                vllm_block_size,
-                start,
-                end,
+                group_tokens_per_block,
+                start_token_idx,
+                end_token_idx,
             )
-            start_token_idx = start * vllm_block_size
-            end_token_idx = end * vllm_block_size
             token_ids = list(tracker.all_token_ids)
             op = LoadStoreOp(
                 token_ids=token_ids,
@@ -390,7 +380,7 @@ def GetStoreMetadata(
             )
 
             # Update the request tracker
-            tracker.increase_num_stored_blocks(end - start)
+            tracker.increase_num_stored_tokens(end_token_idx - start_token_idx)
             return ret
 
         return None
@@ -398,51 +388,49 @@ def GetStoreMetadata(
     @staticmethod
     def GetRetrieveMetadata(
         tracker: LMCacheMPRequestTracker,
-        blocks_in_chunk: int,
-        vllm_block_size: int,
-        group_block_sizes: list[int],
+        lmcache_tokens_per_chunk: int,
+        group_tokens_per_block: list[int],
     ) -> "LMCacheMPRequestMetadata | None":
         """
         Generate the retrieve metadata for the current request tracker.
 
         Args:
             tracker: The request tracker to generate the metadata from.
-            blocks_in_chunk: the number of ``vllm_block_size`` blocks in a
-                LMCache data chunk
-            vllm_block_size: the vLLM block size (= ``cache_config.block_size``);
-                block IDs and ranges are counted in this unit
-            group_block_sizes: per-engine-group vLLM block size. A group's own
-                block size may be a larger multiple of ``vllm_block_size``
-                (hybrid models).
+            lmcache_tokens_per_chunk: the number of tokens in a LMCache data chunk
+            group_tokens_per_block: per-engine-group tokens covered by one
+                paged chunk (one block ID) of that group, i.e. the group's
+                KV cache spec ``block_size``. Must each divide
+                ``lmcache_tokens_per_chunk`` (hybrid models can mix different values).
         """
         if not tracker.is_ready_for_retrieving():
             return None
 
         # |---------------------|-----------------|----------------|
-        # | num_vllm_hit_blocks |
+        # | num_vllm_hit_tokens |
         # | lmcache chunk 1   | lmcache chunk 2   |
         #                     |  need to retrieve |
 
-        start = tracker.num_vllm_hit_blocks // blocks_in_chunk * blocks_in_chunk
-        end = tracker.num_lmcache_hit_blocks
-        assert end % blocks_in_chunk == 0, (
-            "The number of LMCache hit blocks should be a multiple of the "
-            "number of blocks in a lmcache chunk. "
+        start_token_idx = (
+            tracker.num_vllm_hit_tokens
+            // lmcache_tokens_per_chunk
+            * lmcache_tokens_per_chunk
         )
-        assert len(tracker.block_hashes) >= end, (
-            "The number of block hashes should be greater than or equal to the "
-            "number of LMCache hit blocks. "
+        end_token_idx = tracker.num_lmcache_hit_tokens
+        assert end_token_idx % lmcache_tokens_per_chunk == 0, (
+            "The number of LMCache hit tokens should be a multiple of the "
+            "LMCache chunk size. "
         )
-        if end > start:
+        assert len(tracker.all_token_ids) >= end_token_idx, (
+            "The number of tokens should be greater than or equal to the "
+            "number of LMCache hit tokens. "
+        )
+        if end_token_idx > start_token_idx:
             block_ids = slice_block_ids_per_group(
                 tracker.allocated_block_ids,
-                group_block_sizes,
-                vllm_block_size,
-                start,
-                end,
+                group_tokens_per_block,
+                start_token_idx,
+                end_token_idx,
             )
-            start_token_idx = start * vllm_block_size
-            end_token_idx = end * vllm_block_size
             token_ids = list(tracker.all_token_ids)
 
             # Compute how many tokens at the start of the retrieve range
@@ -450,8 +438,7 @@ def GetRetrieveMetadata(
             # to these positions to avoid a cross-stream data race: the
             # retrieve writes on the LMCache CUDA stream while concurrent
             # requests may read these APC-shared blocks on the vLLM stream.
-            apc_overlap_blocks = tracker.num_vllm_hit_blocks - start
-            skip_first_n_tokens = apc_overlap_blocks * vllm_block_size
+            skip_first_n_tokens = tracker.num_vllm_hit_tokens - start_token_idx
 
             op = LoadStoreOp(
                 token_ids=token_ids,
@@ -560,28 +547,45 @@ def __init__(
         else:
             raise ValueError(f"Unknown KVConnectorRole: {self.role}")
 
-        self.vllm_block_size = vllm_config.cache_config.block_size
         kv_cache_config = getattr(self, "_kv_cache_config", None)
         vllm_groups = (
             getattr(kv_cache_config, "kv_cache_groups", ()) or ()
             if kv_cache_config is not None
             else ()
         )
-        # NOTE: Hybrid models can give each group its own block size that is
-        # different from ``vllm_block_size`` (e.g. gemma-4: sliding-window
-        # groups 32, full-attention groups 16, vllm_block_size 16).
-        self._group_block_sizes: list[int] = [
+        # Tokens covered by one paged chunk (one block ID) of each engine
+        # group, from the group's KV cache spec. Hybrid models can mix
+        # different values (e.g. gemma-4: sliding-window groups 32,
+        # full-attention groups 16; DeepSeek V4: 256/64/8/4). Falls back to
+        # the engine's base block size when no group metadata is available
+        # (single non-hybrid group).
+        self._group_tokens_per_block: list[int] = [
             group.kv_cache_spec.block_size for group in vllm_groups
-        ] or [self.vllm_block_size]
-        # Validate that the block size for each group can be divided by
-        # ``self.vllm_block_size`` (per-group slicing relies on it).
-        for engine_group_idx, block_size in enumerate(self._group_block_sizes):
-            if block_size <= 0 or block_size % self.vllm_block_size != 0:
+        ] or [vllm_config.cache_config.block_size]
+        for engine_group_idx, tokens_per_block in enumerate(
+            self._group_tokens_per_block
+        ):
+            if tokens_per_block <= 0:
                 raise ValueError(
-                    f"group {engine_group_idx} block size {block_size} must be "
-                    f"a positive multiple of vllm_block_size "
-                    f"{self.vllm_block_size}"
+                    f"group {engine_group_idx} tokens_per_block "
+                    f"{tokens_per_block} must be positive"
                 )
+        # Smallest token count aligned to every group's paged-chunk
+        # boundary; used to round down vLLM APC hit counts.
+        self._hit_alignment_tokens = math.lcm(*self._group_tokens_per_block)
+        if self.role == KVConnectorRole.SCHEDULER:
+            # Chunk boundaries must land on every group's paged-chunk
+            # boundary so per-group block-id slicing stays aligned.
+            lmcache_tokens_per_chunk = self.scheduler_adapter.lmcache_tokens_per_chunk
+            for engine_group_idx, tokens_per_block in enumerate(
+                self._group_tokens_per_block
+            ):
+                if lmcache_tokens_per_chunk % tokens_per_block != 0:
+                    raise ValueError(
+                        f"LMCache chunk size {lmcache_tokens_per_chunk} must be "
+                        f"a multiple of group {engine_group_idx} "
+                        f"tokens_per_block {tokens_per_block}"
+                    )
 
     @property
     def role(self) -> KVConnectorRole:
@@ -848,19 +852,21 @@ def get_num_new_matched_tokens(
         if ret == 0:
             return 0, False
 
-        assert (
-            ret % (self.scheduler_adapter.num_blocks_per_chunk() * self.vllm_block_size)
-            == 0
-        )
+        assert ret % self.scheduler_adapter.lmcache_tokens_per_chunk == 0
 
-        # Update num stored blocks for the tracker
-        num_vllm_blocks = num_computed_tokens // self.vllm_block_size
-        num_lmcache_blocks = ret // self.vllm_block_size
-        tracker.increase_num_stored_blocks(num_lmcache_blocks)
+        # Update num stored tokens for the tracker
+        tracker.increase_num_stored_tokens(ret)
 
-        # Save the vllm and lmcache hit tokens
-        tracker.num_vllm_hit_blocks = num_vllm_blocks
-        tracker.num_lmcache_hit_blocks = num_lmcache_blocks
+        # Save the vllm and lmcache hit tokens. The vLLM hit count is
+        # rounded down to a boundary aligned for every engine group (e.g.
+        # a full-prompt APC hit reports ``num_prompt_tokens - 1``), so the
+        # retrieve-skip range stays paged-chunk-aligned in all groups.
+        tracker.num_vllm_hit_tokens = (
+            num_computed_tokens
+            // self._hit_alignment_tokens
+            * self._hit_alignment_tokens
+        )
+        tracker.num_lmcache_hit_tokens = ret
 
         need_to_load = max(0, ret - num_computed_tokens)
         logger.debug(
@@ -921,10 +927,10 @@ def update_state_after_alloc(
 
             # Free locks on chunks that vLLM already computed and won't
             # retrieve from LMCache.
-            if tracker.num_lmcache_hit_blocks > 0:
+            if tracker.num_lmcache_hit_tokens > 0:
                 if not condition:
                     # No retrieve needed — free ALL locked chunks
-                    free_end = tracker.num_lmcache_hit_blocks * self.vllm_block_size
+                    free_end = tracker.num_lmcache_hit_tokens
                 else:
                     # Note(Roy): Boundary misalignment between vLLM blocks and LMCache
                     # blocks is handled in free_lookup_locks. It makes sure that if
@@ -932,7 +938,7 @@ def update_state_after_alloc(
                     # block, the end LMCache block is not freed (i.e., floor division)
                     # since it will still be needed by vLLM and such block's lock will
                     # be freed by vLLM's retrieve.
-                    free_end = tracker.num_vllm_hit_blocks * self.vllm_block_size
+                    free_end = tracker.num_vllm_hit_tokens
 
                 if free_end > 0:
                     self.scheduler_adapter.free_lookup_locks(
@@ -1012,8 +1018,8 @@ def request_finished(
             and "cached_token_stats" in params
         ):
             request_tracker = self._get_request_tracker(request.request_id)
-            num_vllm = request_tracker.num_vllm_hit_blocks * self.vllm_block_size
-            num_lmcache = request_tracker.num_lmcache_hit_blocks * self.vllm_block_size
+            num_vllm = request_tracker.num_vllm_hit_tokens
+            num_lmcache = request_tracker.num_lmcache_hit_tokens
             return_params["cached_token_stats"] = {
                 "num_vllm_cached_tokens": num_vllm,
                 "num_lmcache_cached_tokens": num_lmcache,
@@ -1107,16 +1113,15 @@ def _process_retrieve_requests(
         self,
         metadata: LMCacheMPConnectorMetadata,
     ) -> None:
-        blocks_per_chunk = self.scheduler_adapter.num_blocks_per_chunk()
+        lmcache_tokens_per_chunk = self.scheduler_adapter.lmcache_tokens_per_chunk
 
         for request_tracker in self.request_trackers.values():
             if request_tracker.state != LMCacheMPRequestState.WAITING_FOR_LOAD:
                 continue
             r_metadata = LMCacheMPRequestMetadata.GetRetrieveMetadata(
                 request_tracker,
-                blocks_per_chunk,
-                vllm_block_size=self.vllm_block_size,
-                group_block_sizes=self._group_block_sizes,
+                lmcache_tokens_per_chunk,
+                group_tokens_per_block=self._group_tokens_per_block,
             )
             if r_metadata is not None:
                 metadata.add_request_metadata(r_metadata)
@@ -1127,7 +1132,7 @@ def _process_new_requests(
         scheduler_output: SchedulerOutput,
         metadata: LMCacheMPConnectorMetadata,
     ) -> None:
-        blocks_per_chunk = self.scheduler_adapter.num_blocks_per_chunk()
+        lmcache_tokens_per_chunk = self.scheduler_adapter.lmcache_tokens_per_chunk
 
         for new_request in scheduler_output.scheduled_new_reqs:
             request_tracker = self._get_request_tracker(new_request.req_id)
@@ -1137,9 +1142,8 @@ def _process_new_requests(
 
             r_meta = LMCacheMPRequestMetadata.GetStoreMetadata(
                 request_tracker,
-                blocks_per_chunk,
-                self.vllm_block_size,
-                self._group_block_sizes,
+                lmcache_tokens_per_chunk,
+                self._group_tokens_per_block,
             )
             if r_meta is not None:
                 metadata.add_request_metadata(r_meta)
@@ -1149,7 +1153,7 @@ def _process_cached_requests(
         scheduler_output: SchedulerOutput,
         metadata: LMCacheMPConnectorMetadata,
     ) -> None:
-        blocks_per_chunk = self.scheduler_adapter.num_blocks_per_chunk()
+        lmcache_tokens_per_chunk = self.scheduler_adapter.lmcache_tokens_per_chunk
 
         cached_reqs = scheduler_output.scheduled_cached_reqs
         for idx, request_id in enumerate(cached_reqs.req_ids):
@@ -1167,9 +1171,8 @@ def _process_cached_requests(
 
             r_meta = LMCacheMPRequestMetadata.GetStoreMetadata(
                 request_tracker,
-                blocks_per_chunk,
-                self.vllm_block_size,
-                self._group_block_sizes,
+                lmcache_tokens_per_chunk,
+                self._group_tokens_per_block,
             )
 
             if r_meta is not None:
@@ -1196,7 +1199,7 @@ def _report_block_allocation_deltas(
                 continue
             primary_block_ids = tracker.allocated_block_ids.get(0, [])
             num_blocks = len(primary_block_ids)
-            total_tokens = num_blocks * self.vllm_block_size
+            total_tokens = num_blocks * self._group_tokens_per_block[0]
             records.append(
                 RequestAllocationRecord(
                     req_id=new_request.req_id,
@@ -1223,8 +1226,9 @@ def _report_block_allocation_deltas(
             # Compute the token range they cover.
             total_blocks = len(tracker.allocated_block_ids.get(0, []))
             num_new_blocks = len(new_block_ids)
-            start_token = (total_blocks - num_new_blocks) * self.vllm_block_size
-            end_token = total_blocks * self.vllm_block_size
+            tokens_per_block = self._group_tokens_per_block[0]
+            start_token = (total_blocks - num_new_blocks) * tokens_per_block
+            end_token = total_blocks * tokens_per_block
             new_token_ids = list(tracker.all_token_ids[start_token:end_token])
             records.append(
                 RequestAllocationRecord(
diff --git a/lmcache/integration/vllm/vllm_multi_process_adapter.py b/lmcache/integration/vllm/vllm_multi_process_adapter.py
index c2c4b9a4d9..631453767b 100644
--- a/lmcache/integration/vllm/vllm_multi_process_adapter.py
+++ b/lmcache/integration/vllm/vllm_multi_process_adapter.py
@@ -227,8 +227,8 @@ def get_lmcache_chunk_size(
         An integer representing the LMCache chunk size
     """
     future = send_lmcache_request(mq_client, RequestType.GET_CHUNK_SIZE, [])
-    chunk_size = future.result(timeout=timeout)
-    return chunk_size
+    lmcache_tokens_per_chunk = future.result(timeout=timeout)
+    return lmcache_tokens_per_chunk
 
 
 def _raise_server_unreachable(server_url: str, timeout: float) -> NoReturn:
@@ -570,16 +570,16 @@ def __init__(
 
         # Read chunk size from lmcache
         try:
-            self.chunk_size = get_lmcache_chunk_size(
+            self.lmcache_tokens_per_chunk = get_lmcache_chunk_size(
                 self.mq_client, timeout=self._mq_timeout
             )
         except TimeoutError:
             self.mq_client.close()
             _raise_server_unreachable(server_url, self._mq_timeout)
-        assert self.chunk_size % vllm_block_size == 0, (
+        assert self.lmcache_tokens_per_chunk % vllm_block_size == 0, (
             "LMCache chunk size should be a multiple of vLLM block size"
         )
-        self.blocks_in_chunk = self.chunk_size // vllm_block_size
+        self.blocks_in_chunk = self.lmcache_tokens_per_chunk // vllm_block_size
 
         # Health state (shared with heartbeat thread)
         self._health_event = threading.Event()
@@ -661,7 +661,9 @@ def maybe_submit_lookup_request(
             # Skip if there is already a lookup request
             return
 
-        aligned_end = (len(token_ids) // self.chunk_size) * self.chunk_size
+        aligned_end = (
+            len(token_ids) // self.lmcache_tokens_per_chunk
+        ) * self.lmcache_tokens_per_chunk
 
         key = self._create_key(
             token_ids,
@@ -736,7 +738,7 @@ def check_lookup_result(self, request_id: str) -> int | None:
         if result is None:
             return None
 
-        token_count = result * self.chunk_size
+        token_count = result * self.lmcache_tokens_per_chunk
         self._finished_lookup_results[request_id] = token_count
         return token_count
 
@@ -973,24 +975,17 @@ def __init__(
 
         # Read chunk size from lmcache
         try:
-            chunk_size = get_lmcache_chunk_size(
+            lmcache_tokens_per_chunk = get_lmcache_chunk_size(
                 self.mq_client, timeout=self._mq_timeout
             )
         except TimeoutError:
             self.mq_client.close()
             _raise_server_unreachable(server_url, self._mq_timeout)
-        assert chunk_size % vllm_block_size == 0, (
+        self.lmcache_tokens_per_chunk = lmcache_tokens_per_chunk
+        assert lmcache_tokens_per_chunk % vllm_block_size == 0, (
             "LMCache chunk size should be a multiple of vLLM block size"
         )
-        self.blocks_in_chunk = chunk_size // vllm_block_size
-        # Retain the vLLM logical block size so we can ship it to the
-        # LMCache server in ``register_kv_caches`` — the server uses it
-        # (as ``layout_hints["inference_engine_logical_block_size"]``)
-        # to derive per-group compression ratios when some KV layer
-        # groups compress multiple logical tokens into a single physical
-        # slot (``shape_desc.bs <
-        # inference_engine_logical_block_size``).
-        self.vllm_logical_block_size = vllm_block_size
+        self.blocks_in_chunk = lmcache_tokens_per_chunk // vllm_block_size
 
         # Health state (shared with heartbeat thread)
         self._health_event = threading.Event()
@@ -1059,8 +1054,21 @@ def register_kv_caches(
         Raises:
             ConnectionError: if the server does not respond within
                 mq_timeout.
+            ValueError: if the LMCache chunk size is not a multiple of an
+                engine group's ``tokens_per_block`` (chunk boundaries would
+                not align with that group's paged-chunk boundaries).
         """
         logger.info("Registering kv caches")
+        for info in engine_group_infos:
+            if (
+                info.tokens_per_block > 0
+                and self.lmcache_tokens_per_chunk % info.tokens_per_block
+            ):
+                raise ValueError(
+                    f"LMCache chunk size {self.lmcache_tokens_per_chunk} must be a "
+                    f"multiple of engine group {info.engine_group_id} "
+                    f"tokens_per_block {info.tokens_per_block}"
+                )
         self.kv_caches = kv_caches
         self.engine_group_infos = list(engine_group_infos)
         self._send_register_kv_caches_request(kv_caches)
@@ -1088,9 +1096,6 @@ def _send_register_kv_caches_request(
             kv_caches, mode=self._mp_transfer_mode
         )
         layout_hints = vllm_layout_hints()
-        layout_hints["inference_engine_logical_block_size"] = (
-            self.vllm_logical_block_size
-        )
         try:
             self.transfer_ctx.register(
                 self.instance_id,
diff --git a/lmcache/v1/gpu_connector/utils.py b/lmcache/v1/gpu_connector/utils.py
index 21fd5b58cf..a2729939d0 100644
--- a/lmcache/v1/gpu_connector/utils.py
+++ b/lmcache/v1/gpu_connector/utils.py
@@ -77,23 +77,12 @@ class LayoutHints(TypedDict, total=False):
             on a SGLang registration is what triggers the daemon-side
             depth-1 → depth-2 un-flatten + 3-D → 4-D reshape.
         head_dim: Per-head dimension. Used by TRT-LLM (same).
-        inference_engine_logical_block_size: Inference-engine-side block
-            size (logical tokens per engine block; for vLLM this is
-            ``cache_config.block_size``). Carried inside
-            ``LayoutHints`` (instead of as a standalone
-            ``REGISTER_KV_CACHE`` argument) so that engines without a
-            logical block-size concept can simply omit it. The server
-            uses it to derive per-group compression ratios when some
-            KV layer groups compress multiple logical tokens into a
-            single physical slot
-            (``shape_desc.bs < inference_engine_logical_block_size``).
     """
 
     kv_layout: Literal["NHD", "HND"]
     num_kv_heads: int
     tokens_per_block: int
     head_dim: int
-    inference_engine_logical_block_size: int
 
 
 def attempt_permute_to_contiguous_view(
diff --git a/lmcache/v1/kv_layer_groups.py b/lmcache/v1/kv_layer_groups.py
index 4b563a75de..4b712687de 100644
--- a/lmcache/v1/kv_layer_groups.py
+++ b/lmcache/v1/kv_layer_groups.py
@@ -19,7 +19,7 @@
 
 if TYPE_CHECKING:
     # First Party
-    from lmcache.v1.gpu_connector.utils import DiscoverableKVCache, LayoutHints
+    from lmcache.v1.gpu_connector.utils import DiscoverableKVCache
     from lmcache.v1.multiprocess.group_view import EngineGroupInfo
 
 logger = init_logger(__name__)
@@ -173,21 +173,19 @@ class KernelGroupInfo:
     """Torch dtype of the KV cache tensors for this group. Used for
     kernel template instantiation; see class docstring for why we keep
     this alongside ``shape_desc.element_size``."""
-    compress_ratio: int = 1
-    """Logical-tokens-per-physical-slot for this group. ``1`` for
-    non-compressed groups (one logical token per physical slot);
-    greater than ``1`` for compressed groups where each physical slot
-    packs ``compress_ratio`` logical tokens (e.g. DeepSeek V4
-    compressor / indexer caches). Derived from
-    ``inference_engine_logical_block_size`` carried in ``layout_hints``
-    at :class:`KVLayerGroupsManager` construction time."""
-    physical_chunk_size: int = 0
+    tokens_per_block: int = 0
+    """Logical engine tokens covered by one paged chunk (one engine block
+    ID) of this group, as declared by the engine's KV cache spec at
+    initialization time (carried in ``EngineGroupInfo.tokens_per_block``).
+    ``0`` means the engine did not report it; the group is then treated as
+    uncompressed (``compress_ratio == 1``)."""
+    slots_per_chunk: int = 0
     """Number of *physical* slots in one LMCache chunk for this group
-    (= ``lmcache_logical_chunk_size // compress_ratio``). This is what
-    the block-level transfer kernel must be told, not the logical
-    ``lmcache_logical_chunk_size`` which counts vLLM tokens. ``0``
+    (= ``lmcache_tokens_per_chunk // tokens_per_block * slots_per_block``).
+    This is what the block-level transfer kernel must be told, not the
+    logical ``lmcache_tokens_per_chunk`` which counts vLLM tokens. ``0``
     means the field has not been populated yet; ``GPUCacheContext``
-    fills it in after construction once ``lmcache_logical_chunk_size``
+    fills it in after construction once ``lmcache_tokens_per_chunk``
     is known."""
     engine_group_idx: int = 0
     """Engine group index (paged-block address space). 0 for non-hybrid."""
@@ -206,8 +204,9 @@ def __repr__(self) -> str:
             f"element_size={sd.element_size}, "
             f"block_stride_elems={sd.block_stride_elems}), "
             f"dtype={self.dtype}, "
-            f"compress_ratio={self.compress_ratio}, "
-            f"physical_chunk_size={self.physical_chunk_size}, "
+            f"tokens_per_block={self.tokens_per_block}, "
+            f"slots_per_block={self.slots_per_block}, "
+            f"slots_per_chunk={self.slots_per_chunk}, "
             f"engine_group_idx={self.engine_group_idx})"
         )
 
@@ -221,6 +220,13 @@ def hidden_dim_size(self) -> int:
         """Hidden dimension size (``num_heads * head_size``)."""
         return self.shape_desc.nh * self.shape_desc.hs
 
+    @property
+    def slots_per_block(self) -> int:
+        """Physical slots in one paged chunk of this group, detected from
+        the registered KV tensors at registration time (the batch-size
+        dimension, ``shape_desc.bs``)."""
+        return self.shape_desc.bs
+
 
 KVLayerGroupInfo = KernelGroupInfo  # Alias for compatibility
 
@@ -270,9 +276,8 @@ def __init__(
         kv_caches: "DiscoverableKVCache",
         gpu_kv_format: "lmc_ops.GPUKVFormat",
         num_blocks: int,
-        layout_hints: "LayoutHints | None" = None,
         engine_group_infos: "Sequence[EngineGroupInfo]" = (),
-        lmcache_logical_chunk_size: int = 256,
+        lmcache_tokens_per_chunk: int = 256,
     ) -> None:
         """Partition layers into groups keyed by
         :data:`LayerGroupIdentity`.
@@ -292,12 +297,6 @@ def __init__(
             gpu_kv_format: Format returned by
                 :func:`normalize_kv_and_discover_format`.
             num_blocks: Number of paged blocks in the device KV cache.
-            layout_hints: Engine-provided hints. The manager only reads
-                ``inference_engine_logical_block_size`` (logical tokens
-                per inference-engine block) from it to derive each
-                group's ``compress_ratio`` and ``physical_chunk_size``.
-                ``None`` means every group is treated as non-compressed
-                (``compress_ratio == 1``).
             engine_group_infos: Engine KV cache group metadata, including
                 the engine group ids, and the sliding window information.
             lmcache_logical_chunk_size: Tokens per LMCache chunk
@@ -312,17 +311,6 @@ def __init__(
         )
         from lmcache.v1.multiprocess.group_view import get_engine_group_indices
 
-        # Pull the inference-engine logical block size out of
-        # ``layout_hints`` once; ``None`` means no compression info
-        # available and every group is treated as non-compressed below.
-        # The attribute is finalised after the group-building loop
-        # below, where ``None`` is replaced by the first group's
-        # physical ``bs`` so the public ``int`` contract holds.
-        self.inference_engine_logical_block_size_: "int | None" = (
-            layout_hints.get("inference_engine_logical_block_size")
-            if layout_hints
-            else None
-        )
         self._kernel_groups: list[KernelGroupInfo] = []
         self._object_groups: list[ObjectGroupInfo] = []
 
@@ -335,6 +323,14 @@ def __init__(
             engine_group_infos, num_layers
         )
 
+        # Engine-reported logical tokens per paged chunk, keyed by engine
+        # group id. 0 / missing means the engine did not report it.
+        engine_tokens_per_block: dict[int, int] = {
+            info.engine_group_id: info.tokens_per_block
+            for info in engine_group_infos
+            if info.tokens_per_block > 0
+        }
+
         groups_by_identity = group_layers_by_identity(
             kv_caches, gpu_kv_format, num_layers, per_layer_engine_group_idx
         )
@@ -360,23 +356,19 @@ def __init__(
                 block_stride_elems=block_stride_elems,
             )
 
-            # Per-group logical block size: a group's own block_size can exceed
-            # the global GCD hint (e.g. gemma-4 sliding=32, hint=16).
-            # ``max(hint, bs)`` gives compress_ratio=1 for uncompressed groups
-            # and the engine block size for compressed ones (bs < hint, DeepSeek).
-            global_logical = self.inference_engine_logical_block_size_
-            group_logical_block_size = (
-                max(global_logical, bs) if global_logical is not None else None
-            )
+            # tokens_per_block comes from the engine's KV cache spec; when
+            # absent, fall back to the physical slot count so the group is
+            # treated as non-compressed (compress_ratio == 1).
+            tokens_per_block = engine_tokens_per_block.get(engine_group_idx, bs)
 
             # TODO (ApostaC): the code here is not very good.
             # Conceptually, KV Layer Group should not be aware of lmcache logical
             # chunk size at all.
-            compress_ratio, physical_chunk_size = self._derive_compression_metadata(
+            slots_per_chunk = self._derive_slots_per_chunk(
                 group_idx=group_idx,
-                bs=bs,
-                ie_logical_block_size=group_logical_block_size,
-                lmcache_logical_chunk_size=lmcache_logical_chunk_size,
+                slots_per_block=bs,
+                tokens_per_block=tokens_per_block,
+                lmcache_tokens_per_chunk=lmcache_tokens_per_chunk,
             )
 
             self._kernel_groups.append(
@@ -384,18 +376,13 @@ def __init__(
                     layer_indices=indices,
                     shape_desc=shape_desc,
                     dtype=dt,
-                    compress_ratio=compress_ratio,
-                    physical_chunk_size=physical_chunk_size,
+                    tokens_per_block=tokens_per_block,
+                    slots_per_chunk=slots_per_chunk,
                     engine_group_idx=engine_group_idx,
                 )
             )
 
-        self.inference_engine_logical_block_size_ = (
-            self.inference_engine_logical_block_size_
-            or self._kernel_groups[0].shape_desc.bs
-        )
-
-        self._lmcache_chunk_size = lmcache_logical_chunk_size
+        self._lmcache_chunk_size = lmcache_tokens_per_chunk
 
         logger.info(
             "KV layer groups: ---\n%s\n---",
@@ -443,21 +430,6 @@ def num_groups(self) -> int:
         """
         return len(self._kernel_groups)
 
-    @property
-    @lmcache_deprecate("This function will be removed soon")
-    def inference_engine_logical_block_size(self) -> int:
-        """Inference-engine-side logical block size.
-
-        Taken from ``layout_hints`` at construction time, or falls back
-        to the first group's physical ``bs`` when no hint is provided
-        (non-vLLM engines, or vLLM without mixed-compression KV groups),
-        in which case every group is treated as non-compressed.
-        """
-        return (
-            self.inference_engine_logical_block_size_
-            or self._kernel_groups[0].shape_desc.bs
-        )
-
     def get_shape_desc(self, kernel_group_idx: int) -> "lmc_ops.PageBufferShapeDesc":
         """Return the :class:`PageBufferShapeDesc` for *kernel_group_idx*.
 
@@ -469,8 +441,7 @@ def get_shape_desc(self, kernel_group_idx: int) -> "lmc_ops.PageBufferShapeDesc"
         """
         return self._kernel_groups[kernel_group_idx].shape_desc
 
-    @lmcache_deprecate("This function will be renamed to get_num_slots_per_chunk")
-    def get_physical_chunk_size(self, kernel_group_idx: int) -> int:
+    def get_slots_per_chunk(self, kernel_group_idx: int) -> int:
         """Return the per-chunk *physical* slot count for *kernel_group_idx*.
 
         Args:
@@ -479,7 +450,7 @@ def get_physical_chunk_size(self, kernel_group_idx: int) -> int:
         Raises:
             IndexError: If *kernel_group_idx* is out of range.
         """
-        return self._kernel_groups[kernel_group_idx].physical_chunk_size
+        return self._kernel_groups[kernel_group_idx].slots_per_chunk
 
     def get_subchunk_sw_size_tokens(self, kernel_group_idx: int) -> int:
         """Return the sub-chunk sliding window size of a given kernel group.
@@ -538,7 +509,11 @@ def calculate_num_blocks(self, kernel_group_idx: int, num_tokens: int) -> int:
             IndexError: If *kernel_group_idx* is out of range.
         """
         group = self._kernel_groups[kernel_group_idx]
-        num_physical_slots = num_tokens // group.compress_ratio
+        # Physical slots for num_tokens, derived from the per-block geometry
+        # (slots_per_block / tokens_per_block) rather than a compress ratio.
+        num_physical_slots = (
+            num_tokens * group.slots_per_block // group.tokens_per_block
+        )
         return num_physical_slots // group.shape_desc.bs
 
     ### Helper methods
@@ -563,51 +538,53 @@ def _detect_object_groups(
         ]
 
     @staticmethod
-    def _derive_compression_metadata(
+    def _derive_slots_per_chunk(
         group_idx: int,
-        bs: int,
-        ie_logical_block_size: "int | None",
-        lmcache_logical_chunk_size: int,
-    ) -> tuple[int, int]:
-        """Resolve ``(compress_ratio, physical_chunk_size)`` for one group.
-
-        ``compress_ratio`` falls back to ``1`` when
-        ``ie_logical_block_size`` is absent (no compression info
-        available); otherwise it equals
-        ``ie_logical_block_size // bs`` and the divisibility invariants
-        are enforced loudly. ``physical_chunk_size`` is then
-        ``lmcache_logical_chunk_size // compress_ratio``, the per-chunk
-        physical slot count fed to the block-level transfer kernel.
+        slots_per_block: int,
+        tokens_per_block: int,
+        lmcache_tokens_per_chunk: int,
+    ) -> int:
+        """Resolve ``slots_per_chunk`` (physical slots per LMCache chunk).
+
+        Derived directly from the three ground-truth quantities: the LMCache
+        chunk size ``lmcache_tokens_per_chunk`` (logical tokens), the group's
+        logical ``tokens_per_block`` and its physical ``slots_per_block``. One
+        LMCache chunk spans ``lmcache_tokens_per_chunk // tokens_per_block``
+        paged blocks, each holding ``slots_per_block`` physical slots, so
+        ``slots_per_chunk = lmcache_tokens_per_chunk // tokens_per_block
+        * slots_per_block``. This is the per-chunk physical slot count fed to
+        the block-level transfer kernel.
+
+        Raises:
+            ValueError: If ``tokens_per_block`` is not a whole multiple of
+                ``slots_per_block`` (each physical slot must pack a whole number
+                of logical tokens), or if ``lmcache_tokens_per_chunk`` is not a
+                whole multiple of ``tokens_per_block`` (an LMCache chunk must
+                align to a whole number of the group's paged blocks).
         """
-        if ie_logical_block_size is None:
-            compress_ratio = 1
-        else:
-            if ie_logical_block_size % bs != 0:
-                raise ValueError(
-                    f"inference engine logical block size "
-                    f"{ie_logical_block_size} must be a multiple of "
-                    f"group {group_idx} physical slot count {bs}"
-                )
-            compress_ratio = ie_logical_block_size // bs
-        if lmcache_logical_chunk_size % compress_ratio != 0:
+        if tokens_per_block % slots_per_block != 0:
+            raise ValueError(
+                f"group {group_idx}: tokens_per_block {tokens_per_block} "
+                f"must be a multiple of slots_per_block {slots_per_block}"
+            )
+        if lmcache_tokens_per_chunk % tokens_per_block != 0:
             raise ValueError(
-                f"lmcache_logical_chunk_size {lmcache_logical_chunk_size} "
-                f"must be a multiple of compress_ratio {compress_ratio} "
-                f"(group {group_idx})"
+                f"group {group_idx}: lmcache_tokens_per_chunk "
+                f"{lmcache_tokens_per_chunk} must be a multiple of "
+                f"tokens_per_block {tokens_per_block}"
             )
-        physical_chunk_size = lmcache_logical_chunk_size // compress_ratio
-        if compress_ratio != 1:
+        blocks_per_chunk = lmcache_tokens_per_chunk // tokens_per_block
+        slots_per_chunk = blocks_per_chunk * slots_per_block
+        if slots_per_block != tokens_per_block:
             logger.info(
-                "group %d: compressed "
-                "(inference_engine_logical_block_size=%d -> "
-                "slots=%d, compress_ratio=%d, physical_chunk_size=%d)",
+                "group %d: compressed (tokens_per_block=%d, slots_per_block=%d "
+                "-> slots_per_chunk=%d)",
                 group_idx,
-                ie_logical_block_size,
-                bs,
-                compress_ratio,
-                physical_chunk_size,
+                tokens_per_block,
+                slots_per_block,
+                slots_per_chunk,
             )
-        return compress_ratio, physical_chunk_size
+        return slots_per_chunk
 
 
 # ------------------------------------------------------------------ #
diff --git a/lmcache/v1/multiprocess/gpu_context.py b/lmcache/v1/multiprocess/gpu_context.py
index ed5f81bdf2..e21494f24d 100644
--- a/lmcache/v1/multiprocess/gpu_context.py
+++ b/lmcache/v1/multiprocess/gpu_context.py
@@ -112,12 +112,12 @@ class _TempGPUBuffer:
     def __init__(
         self,
         kv_layer_groups_manager: KVLayerGroupsManager,
-        lmcache_logical_chunk_size: int,
+        lmcache_tokens_per_chunk: int,
         device: torch.device,
         max_batch_size: int = 4,
     ) -> None:
         self._kv_groups_manager = kv_layer_groups_manager
-        self._lmcache_chunk_size = lmcache_logical_chunk_size
+        self._lmcache_chunk_size = lmcache_tokens_per_chunk
         self._max_batch_size = max_batch_size
 
         self._temp_buffer = torch.empty(
@@ -282,7 +282,7 @@ def _get_shape_for_kernel_group(
             The shape of the temp GPU buffer for the given kernel group index.
         """
         group = self._kv_groups_manager.kernel_groups[kernel_group_idx]
-        compress_ratio = group.compress_ratio
+        compress_ratio = group.tokens_per_block // group.slots_per_block
         sd = group.shape_desc
 
         if num_tokens % compress_ratio != 0:
@@ -346,7 +346,7 @@ class GPUCacheContext:
     def __init__(
         self,
         kv_caches: KVCache,
-        lmcache_logical_chunk_size: int = 256,
+        lmcache_tokens_per_chunk: int = 256,
         layout_hints: LayoutHints | None = None,
         engine_group_infos: Sequence[EngineGroupInfo] = (),
         engine_type: EngineType = EngineType.VLLM,
@@ -361,15 +361,14 @@ def __init__(
         self.is_mla_ = is_mla(self.gpu_kv_format_)
         self.num_layers_ = get_num_layers(self.kv_caches_, self.gpu_kv_format_)
         self.num_blocks_ = get_num_blocks(self.kv_caches_, self.gpu_kv_format_)
-        self.lmcache_logical_chunk_size = lmcache_logical_chunk_size
+        self.lmcache_tokens_per_chunk = lmcache_tokens_per_chunk
 
         self.kv_layer_groups_manager_ = KVLayerGroupsManager(
             self.kv_caches_,
             gpu_kv_format=self.gpu_kv_format_,
             num_blocks=self.num_blocks_,
-            layout_hints=layout_hints,
             engine_group_infos=engine_group_infos,
-            lmcache_logical_chunk_size=lmcache_logical_chunk_size,
+            lmcache_tokens_per_chunk=lmcache_tokens_per_chunk,
         )
 
         self.group_kv_pointers_: list[torch.Tensor] = []
@@ -390,7 +389,7 @@ def __init__(
         # Temporary GPU buffer for transfers — a single flat uint8 buffer
         self._temp_buffer = _TempGPUBuffer(
             kv_layer_groups_manager=self.kv_layer_groups_manager_,
-            lmcache_logical_chunk_size=lmcache_logical_chunk_size,
+            lmcache_tokens_per_chunk=lmcache_tokens_per_chunk,
             device=self.device_,
             max_batch_size=4,
         )
@@ -482,19 +481,15 @@ def kv_layer_groups_manager(self) -> KVLayerGroupsManager:
         """Returns the KV layer groups manager."""
         return self.kv_layer_groups_manager_
 
-    def get_shape_desc(self, group_idx: int) -> "lmc_ops.PageBufferShapeDesc":
+    def get_shape_desc(self, kernel_group_idx: int) -> "lmc_ops.PageBufferShapeDesc":
         """Returns the PageBufferShapeDesc for the given KV layer group."""
-        return self.kv_layer_groups_manager_.get_shape_desc(group_idx)
-
-    @lmcache_deprecate("this function will be renamed to get_num_slots_per_chunk")
-    def get_physical_chunk_size(self, group_idx: int) -> int:
-        """Returns the per-chunk physical slot count for the given group.
+        return self.kv_layer_groups_manager_.get_shape_desc(kernel_group_idx)
 
-        Equal to ``lmcache_logical_chunk_size // compress_ratio``; for
-        non-compressed groups this is just ``lmcache_logical_chunk_size``.
-        This is the value the block-level transfer kernel must be told.
+    def get_slots_per_chunk(self, kernel_group_idx: int) -> int:
+        """Returns the per-chunk physical slot count for the given kernel
+        group.
         """
-        return self.kv_layer_groups_manager_.get_physical_chunk_size(group_idx)
+        return self.kv_layer_groups_manager_.get_slots_per_chunk(kernel_group_idx)
 
     def get_kernel_group_kv_pointers(self, kernel_group_idx: int) -> torch.Tensor:
         """Returns the pre-computed GPU tensor of KV cache pointers for the
@@ -613,7 +608,7 @@ def get_kv_buffer_shape(
         """
         # TODO: remove this!
         group = self.kv_layer_groups_manager_.kv_layer_groups[group_idx]
-        compress_ratio = group.compress_ratio
+        compress_ratio = group.tokens_per_block // group.slots_per_block
         if logical_num_tokens % compress_ratio != 0:
             raise ValueError(
                 f"logical_num_tokens ({logical_num_tokens}) is not a multiple of "
@@ -663,7 +658,6 @@ def report_status(self) -> dict:
             A dict with these top-level fields:
 
             - ``num_layers`` (int): total layers in the model.
-            - ``inference_engine_logical_block_size`` (int)
             - ``num_blocks`` (int)
             - ``cache_size_per_token`` (int): bytes per logical token,
               summed across groups.
@@ -675,7 +669,9 @@ def report_status(self) -> dict:
               - ``object_group_idx`` (int): owning object group.
               - ``num_layers`` (int): layers in this group.
               - ``layer_indices`` (list[int]): the group's layer indices.
-              - ``physical_block_size`` (int): ``shape_desc.bs``.
+              - ``tokens_per_block`` (int): logical tokens per paged chunk.
+              - ``slots_per_block`` (int): physical slots per paged
+                chunk (``slots_per_block``, i.e. ``shape_desc.bs``).
               - ``compress_ratio`` (int)
               - ``dtype`` (str): stringified torch dtype.
               - ``gpu_kv_concrete_shape`` (str): group-accurate numeric shape.
@@ -706,8 +702,8 @@ def report_status(self) -> dict:
                     ),
                     "num_layers": group.num_layers,
                     "layer_indices": list(group.layer_indices),
-                    "physical_block_size": group.shape_desc.bs,
-                    "compress_ratio": group.compress_ratio,
+                    "tokens_per_block": group.tokens_per_block,
+                    "slots_per_block": group.slots_per_block,
                     "dtype": str(group.dtype),
                     "gpu_kv_concrete_shape": get_concrete_gpu_kv_shape_from_shape_desc(
                         group.shape_desc, gpu_kv_format
@@ -721,9 +717,6 @@ def report_status(self) -> dict:
 
         return {
             "num_layers": self.num_layers,
-            "inference_engine_logical_block_size": (
-                manager.inference_engine_logical_block_size
-            ),
             "num_blocks": self.num_blocks,
             "cache_size_per_token": self.cache_size_per_token(),
             "kernel_groups": group_reports,
diff --git a/lmcache/v1/multiprocess/group_view.py b/lmcache/v1/multiprocess/group_view.py
index 5c95cef30d..46d45f040b 100644
--- a/lmcache/v1/multiprocess/group_view.py
+++ b/lmcache/v1/multiprocess/group_view.py
@@ -41,6 +41,14 @@ class EngineGroupInfo(msgspec.Struct, frozen=True):
     layer_indices: tuple[int, ...] = ()
     """Registered KV tensor indices assigned to this group."""
 
+    tokens_per_block: int = 0
+    """Logical tokens covered by one paged chunk (one engine block ID) of
+    this engine group, as declared by the engine's KV cache spec
+    (``kv_cache_spec.block_size`` for vLLM). ``0`` means the engine did not
+    report it; consumers then fall back to the physical slot count detected
+    from the registered tensors (i.e. the group is treated as
+    uncompressed)."""
+
 
 def num_engine_groups(groups: Sequence[EngineGroupInfo]) -> int:
     """Return the number of engine groups (block-id lists per transfer request).
@@ -131,45 +139,53 @@ def expand_engine_block_ids(
 
 def slice_block_ids_per_group(
     allocated_block_ids: Mapping[int, Sequence[int]],
-    group_block_sizes: Sequence[int],
-    base_block_size: int,
-    start_block_idx: int,
-    end_block_idx: int,
+    group_tokens_per_block: Sequence[int],
+    start_token_idx: int,
+    end_token_idx: int,
 ) -> list[list[int]]:
-    """Slice each engine group's block IDs for a block range.
+    """Slice each engine group's block IDs for a token range.
 
-    The range is given in *base* blocks -- the block size that every group's
-    block size is a multiple of. A group whose own block size is ``k`` times the
-    base size holds ``1/k`` as many block IDs over the same tokens, so the range
-    is divided by ``k = group_block_size // base_block_size`` for that group.
-    Example: with base 16, a block_size-32 group gets half the IDs of a
-    block_size-16 group.
+    The range is given in tokens — the only unit shared by every engine
+    group. A group whose paged chunks each cover ``tokens_per_block`` tokens
+    holds one block ID per ``tokens_per_block`` tokens, so the range is
+    divided by that group's ``tokens_per_block``. Example: over the same 256
+    tokens, a tokens_per_block-64 group gets 4 IDs while a
+    tokens_per_block-256 group gets 1.
 
     Args:
         allocated_block_ids: Block IDs keyed by engine group id; a missing group
             yields an empty list.
-        group_block_sizes: Each group's block size, in engine-group order. Every
-            value must be a positive multiple of ``base_block_size``.
-        base_block_size: Block size the range indices are counted in.
-        start_block_idx: Range start block index, inclusive.
-        end_block_idx: Range end block index, exclusive.
+        group_tokens_per_block: Each group's tokens-per-paged-chunk, in
+            engine-group order. Every value must be positive and divide both
+            range endpoints.
+        start_token_idx: Range start token index, inclusive.
+        end_token_idx: Range end token index, exclusive.
 
     Returns:
         One block-ID list per engine group, in engine-group order.
 
     Raises:
-        ValueError: If the range does not align to a group's block boundary.
+        ValueError: If the range does not align to a group's chunk boundary.
     """
     sliced: list[list[int]] = []
-    for engine_group_idx, block_size in enumerate(group_block_sizes):
-        k = block_size // base_block_size
-        if start_block_idx % k != 0 or end_block_idx % k != 0:
+    for engine_group_idx, tokens_per_block in enumerate(group_tokens_per_block):
+        if start_token_idx % tokens_per_block != 0 or (
+            end_token_idx % tokens_per_block != 0
+        ):
             raise ValueError(
-                f"block range [{start_block_idx}, {end_block_idx}) does not "
-                f"align to group {engine_group_idx} block factor {k}"
+                f"token range [{start_token_idx}, {end_token_idx}) does not "
+                f"align to group {engine_group_idx} tokens_per_block "
+                f"{tokens_per_block}"
             )
         group_block_ids = allocated_block_ids.get(engine_group_idx, [])
-        sliced.append(list(group_block_ids[start_block_idx // k : end_block_idx // k]))
+        sliced.append(
+            list(
+                group_block_ids[
+                    start_token_idx // tokens_per_block : end_token_idx
+                    // tokens_per_block
+                ]
+            )
+        )
     return sliced
 
 
diff --git a/lmcache/v1/multiprocess/modules/blend_v3.py b/lmcache/v1/multiprocess/modules/blend_v3.py
index d1dcd8ee83..e24ad36be7 100644
--- a/lmcache/v1/multiprocess/modules/blend_v3.py
+++ b/lmcache/v1/multiprocess/modules/blend_v3.py
@@ -983,10 +983,12 @@ def _apply_cb_rope_batched(
         num_groups = gpu_context.kv_layer_groups_manager.num_kernel_groups
         for group_idx in range(num_groups):
             group = gpu_context.kv_layer_groups_manager.kernel_groups[group_idx]
-            if group.compress_ratio != 1:
+            if group.tokens_per_block != group.slots_per_block:
                 raise RuntimeError(
-                    f"CB v3: group {group_idx} has compress_ratio="
-                    f"{group.compress_ratio}; compressed layouts unsupported."
+                    f"CB v3: group {group_idx} is compressed "
+                    f"(tokens_per_block={group.tokens_per_block}, "
+                    f"slots_per_block={group.slots_per_block}); "
+                    f"compressed layouts unsupported."
                 )
             all_slots = [
                 gpu_context.get_temp_kernel_group_buffer(slot_idx, group_idx)
@@ -1120,13 +1122,16 @@ def cb_retrieve_pre_computed(
 
         logger.debug("CB V3 retrieving object keys: %s", all_obj_keys)
 
-        ie_logical_block_size = (
-            gpu_context.kv_layer_groups_manager.inference_engine_logical_block_size
-        )
-        if chunk_size % ie_logical_block_size != 0:
+        # CB v3 only supports uncompressed single-block-id-space layouts
+        # (enforced per group in ``_apply_cb_rope_batched``), so the first
+        # kernel group's chunk geometry is representative.
+        tokens_per_block = gpu_context.kv_layer_groups_manager.kernel_groups[
+            0
+        ].tokens_per_block
+        if chunk_size % tokens_per_block != 0:
             raise ValueError(
                 f"chunk_size {chunk_size} must be a multiple of "
-                f"inference_engine_logical_block_size {ie_logical_block_size}"
+                f"tokens_per_block {tokens_per_block}"
             )
         num_groups = gpu_context.kv_layer_groups_manager.num_kernel_groups
 
@@ -1175,7 +1180,7 @@ def cb_retrieve_pre_computed(
                     # Per-token scatter handles any cur_st; just bound the
                     # matched range to the allocated slots.
                     pairs: list[tuple[CBMatchResult, Any]] = []
-                    num_slots = int(all_block_ids_gpu.numel()) * ie_logical_block_size
+                    num_slots = int(all_block_ids_gpu.numel()) * tokens_per_block
                     for r, memory_obj in zip(cb_match_result, memory_objs, strict=True):
                         if r.cur_ed > num_slots:
                             logger.warning(
@@ -1247,7 +1252,7 @@ def cb_retrieve_pre_computed(
 
                             # (c) Per-token slot scatter: partial vLLM blocks
                             # shared with recomputed tokens stay disjoint.
-                            bs = ie_logical_block_size
+                            bs = tokens_per_block
                             pos = torch.cat(
                                 [
                                     torch.arange(
diff --git a/lmcache/v1/multiprocess/modules/gpu_transfer.py b/lmcache/v1/multiprocess/modules/gpu_transfer.py
index 27fe390de9..1d95038e4d 100644
--- a/lmcache/v1/multiprocess/modules/gpu_transfer.py
+++ b/lmcache/v1/multiprocess/modules/gpu_transfer.py
@@ -170,13 +170,13 @@ def downsample_and_stage_block_ids(
             )
         )
         tokens_per_chunk = min(
-            cache_context.lmcache_logical_chunk_size, subchunk_sw_size_tokens
+            cache_context.lmcache_tokens_per_chunk, subchunk_sw_size_tokens
         )
         keep_blocks_per_chunk = cache_context.calculate_num_blocks(
             tokens_per_chunk, kernel_group_id
         )
         total_blocks_per_chunk = cache_context.calculate_num_blocks(
-            cache_context.lmcache_logical_chunk_size, kernel_group_id
+            cache_context.lmcache_tokens_per_chunk, kernel_group_id
         )
 
         new_block_ids = []
@@ -263,7 +263,7 @@ def transfer_kv_per_object_group(
         This function expects the caller to stage the block ids (list[list[int]])
         into GPU tensors and pass them in as `block_ids_gpu`.
     """
-    lmcache_chunk_size = cache_context.lmcache_logical_chunk_size
+    lmcache_chunk_size = cache_context.lmcache_tokens_per_chunk
     kv_groups_manager = cache_context.kv_layer_groups_manager
     object_group = kv_groups_manager.object_groups[object_group_id]
     kernel_group_ids = object_group.kernel_group_indices
@@ -348,7 +348,7 @@ def transfer_kv_per_object_group(
             group_kv_pointers = cache_context.get_kernel_group_kv_pointers(
                 kernel_group_id
             )
-            group_lmcache_chunk_size = cache_context.get_physical_chunk_size(
+            group_lmcache_chunk_size = cache_context.get_slots_per_chunk(
                 kernel_group_id
             )
             tmp_gpu_buffers_batched = [
diff --git a/lmcache/v1/multiprocess/transfer_context/worker_transfer.py b/lmcache/v1/multiprocess/transfer_context/worker_transfer.py
index 7ab3dc3cdc..1d90fc522e 100644
--- a/lmcache/v1/multiprocess/transfer_context/worker_transfer.py
+++ b/lmcache/v1/multiprocess/transfer_context/worker_transfer.py
@@ -330,9 +330,9 @@ def register(
         hybrid KV cache groups and rejects multi-group transfers at store /
         retrieve time (see ``_single_group_block_ids``).
         """
-        # TODO: inference_engine_logical_block_size is currently used by
-        # DeepSeek V4 on the CUDA path. The non-CUDA path is yet to be
-        # implemented.
+        # TODO: per-group compression (EngineGroupInfo.tokens_per_block vs
+        # the tensor-detected slot count, e.g. DeepSeek V4) is only handled
+        # on the CUDA path. The non-CUDA path is yet to be implemented.
         (
             block_size,
             num_layers,
diff --git a/lmcache/v1/platform/cache_context.py b/lmcache/v1/platform/cache_context.py
index 81a9e4430d..0da963d9b6 100644
--- a/lmcache/v1/platform/cache_context.py
+++ b/lmcache/v1/platform/cache_context.py
@@ -34,7 +34,7 @@
 
 def create_cache_context(
     kv_caches: KVCache,
-    lmcache_logical_chunk_size: int = 256,
+    lmcache_tokens_per_chunk: int = 256,
     layout_hints: LayoutHints | None = None,
     engine_group_infos: "Sequence[EngineGroupInfo]" = (),
     engine_type: EngineType = EngineType.VLLM,
@@ -52,7 +52,7 @@ def create_cache_context(
     Args:
         kv_caches: KV cache tensor wrappers from the serving engine.
             Must be non-empty.
-        lmcache_logical_chunk_size: Number of tokens per LMCache chunk.
+        lmcache_tokens_per_chunk: Number of tokens per LMCache chunk.
         layout_hints: Optional hints for GPU KV format detection.
             Forwarded verbatim to the concrete context constructor.
         engine_group_infos: Engine-neutral KV cache group metadata.
@@ -78,7 +78,7 @@ def create_cache_context(
     )
     return cls(
         kv_caches,
-        lmcache_logical_chunk_size,
+        lmcache_tokens_per_chunk,
         layout_hints,
         engine_group_infos,
         engine_type,
diff --git a/lmcache/v1/platform/cpu/cache_context.py b/lmcache/v1/platform/cpu/cache_context.py
index 04dc8ea7fa..b26e2bda9f 100644
--- a/lmcache/v1/platform/cpu/cache_context.py
+++ b/lmcache/v1/platform/cpu/cache_context.py
@@ -71,7 +71,7 @@ class CpuCacheContext:
     def __init__(
         self,
         kv_caches: KVCache,
-        lmcache_logical_chunk_size: int = 256,
+        lmcache_tokens_per_chunk: int = 256,
         layout_hints: LayoutHints | None = None,
         engine_group_infos: "Sequence[EngineGroupInfo]" = (),
         engine_type: EngineType = EngineType.VLLM,
@@ -90,7 +90,7 @@ def __init__(
 
         unwrapped = unwrap_kv_cache_tensors(kv_caches)
         self.device_ = torch.device("cpu")
-        self.lmcache_logical_chunk_size = lmcache_logical_chunk_size
+        self.lmcache_tokens_per_chunk = lmcache_tokens_per_chunk
 
         # Discover layout & build KV layer groups via the same path
         # GPUCacheContext uses, so we don't need to hand-roll any
@@ -112,9 +112,8 @@ def __init__(
             self.kv_caches_,
             gpu_kv_format=self._gpu_kv_format,
             num_blocks=self.num_blocks_,
-            layout_hints=layout_hints,
             engine_group_infos=engine_group_infos,
-            lmcache_logical_chunk_size=lmcache_logical_chunk_size,
+            lmcache_tokens_per_chunk=lmcache_tokens_per_chunk,
         )
 
         # Per-group KV pointer tensors (CPU). Reuse the same helper
@@ -151,7 +150,7 @@ def __init__(
         for group_idx, group in enumerate(
             self.kv_layer_groups_manager_.kv_layer_groups
         ):
-            shape = self.get_kv_buffer_shape(lmcache_logical_chunk_size, group_idx)
+            shape = self.get_kv_buffer_shape(lmcache_tokens_per_chunk, group_idx)
             byte_size = shape.numel() * group.dtype.itemsize
             self.tmp_chunk_group_offsets_.append(
                 self.tmp_chunk_group_offsets_[-1] + byte_size
@@ -281,7 +280,7 @@ def hidden_dim_sizes(self) -> list[int]:
         return self.hidden_dim_sizes_
 
     @property
-    def group_physical_block_sizes(self) -> list[int]:
+    def group_slots_per_blocks(self) -> list[int]:
         """Per-group physical slot count (``shape_desc.bs``) in group
         order."""
         return [
@@ -289,16 +288,6 @@ def group_physical_block_sizes(self) -> list[int]:
             for group in self.kv_layer_groups_manager_.kv_layer_groups
         ]
 
-    @property
-    def group_compress_ratios(self) -> list[int]:
-        """Per-group compression ratio in group order.
-        ``1`` for non-compressed groups.
-        """
-        return [
-            group.compress_ratio
-            for group in self.kv_layer_groups_manager_.kv_layer_groups
-        ]
-
     @property
     def kv_layer_groups_manager(self) -> KVLayerGroupsManager:
         """Returns the KV layer groups manager."""
@@ -347,9 +336,9 @@ def get_shape_desc(self, group_idx: int) -> "lmc_ops.PageBufferShapeDesc":
         """Returns the PageBufferShapeDesc for the given group."""
         return self.kv_layer_groups_manager_.get_shape_desc(group_idx)
 
-    def get_physical_chunk_size(self, group_idx: int) -> int:
+    def get_slots_per_chunk(self, group_idx: int) -> int:
         """Returns the per-chunk physical slot count for the group."""
-        return self.kv_layer_groups_manager_.get_physical_chunk_size(group_idx)
+        return self.kv_layer_groups_manager_.get_slots_per_chunk(group_idx)
 
     def blocks_for_tokens(self, num_logical_tokens: int, group_idx: int) -> int:
         """Number of blocks that span *num_logical_tokens* for a group.
@@ -357,7 +346,9 @@ def blocks_for_tokens(self, num_logical_tokens: int, group_idx: int) -> int:
         Mirrors :meth:`GPUCacheContext.blocks_for_tokens`.
         """
         group = self.kv_layer_groups_manager_.kv_layer_groups[group_idx]
-        physical_slots = num_logical_tokens // group.compress_ratio
+        physical_slots = (
+            num_logical_tokens * group.slots_per_block // group.tokens_per_block
+        )
         return physical_slots // group.shape_desc.bs
 
     def get_group_kv_pointers(self, group_idx: int) -> torch.Tensor:
@@ -391,7 +382,7 @@ def get_kernel_group_shape_dtype(
             A ``(shape, dtype)`` tuple for the given kernel group.
         """
         group = self.kv_layer_groups_manager_.kv_layer_groups[kernel_group_idx]
-        compress_ratio = group.compress_ratio
+        compress_ratio = group.tokens_per_block // group.slots_per_block
         if num_tokens % compress_ratio != 0:
             raise ValueError(
                 "num_tokens (%d) is not a multiple of compress_ratio (%d) "
@@ -416,7 +407,7 @@ def get_kv_buffer_shape(
         compressed groups (MLA etc.) get the correct shape.
         """
         group = self.kv_layer_groups_manager_.kv_layer_groups[group_idx]
-        compress_ratio = group.compress_ratio
+        compress_ratio = group.tokens_per_block // group.slots_per_block
         if logical_num_tokens % compress_ratio != 0:
             raise ValueError(
                 "logical_num_tokens (%d) is not a multiple of "
@@ -458,7 +449,7 @@ def get_temp_kernel_group_buffer(
             )
         group = self.kv_layer_groups_manager_.kv_layer_groups[kernel_group_idx]
         shape = self.get_kv_buffer_shape(
-            self.lmcache_logical_chunk_size, kernel_group_idx
+            self.lmcache_tokens_per_chunk, kernel_group_idx
         )
         g_start = self.tmp_chunk_group_offsets_[kernel_group_idx]
         g_end = self.tmp_chunk_group_offsets_[kernel_group_idx + 1]
@@ -504,7 +495,7 @@ def get_temp_object_group_buffer(
     def get_tmp_chunk_gpu_buffer(self, group_idx: int = 0) -> torch.Tensor:
         """Returns a typed view of the temp buffer for one chunk."""
         group = self.kv_layer_groups_manager_.kv_layer_groups[group_idx]
-        shape = self.get_kv_buffer_shape(self.lmcache_logical_chunk_size, group_idx)
+        shape = self.get_kv_buffer_shape(self.lmcache_tokens_per_chunk, group_idx)
         start = self.tmp_chunk_group_offsets_[group_idx]
         end = self.tmp_chunk_group_offsets_[group_idx + 1]
         return self.tmp_cpu_buffer_[start:end].view(group.dtype).view(shape)
@@ -518,7 +509,7 @@ def get_tmp_chunk_gpu_buffer_batched(
                 "batch_size %d > max_batch_size %d" % (batch_size, self.max_batch_size)
             )
         group = self.kv_layer_groups_manager_.kv_layer_groups[group_idx]
-        shape = self.get_kv_buffer_shape(self.lmcache_logical_chunk_size, group_idx)
+        shape = self.get_kv_buffer_shape(self.lmcache_tokens_per_chunk, group_idx)
         g_start = self.tmp_chunk_group_offsets_[group_idx]
         g_end = self.tmp_chunk_group_offsets_[group_idx + 1]
         chunk = self.tmp_chunk_bytes_
@@ -603,8 +594,8 @@ def report_status(self) -> dict:
                     ),
                     "num_layers": group.num_layers,
                     "layer_indices": list(group.layer_indices),
-                    "physical_block_size": group.shape_desc.bs,
-                    "compress_ratio": group.compress_ratio,
+                    "tokens_per_block": group.tokens_per_block,
+                    "slots_per_block": group.slots_per_block,
                     "dtype": str(group.dtype),
                     "gpu_kv_concrete_shape": (
                         get_concrete_gpu_kv_shape_from_shape_desc(
@@ -620,9 +611,6 @@ def report_status(self) -> dict:
 
         return {
             "num_layers": self.num_layers_,
-            "inference_engine_logical_block_size": (
-                manager.inference_engine_logical_block_size
-            ),
             "num_blocks": self.num_blocks_,
             "cache_size_per_token": self.cache_size_per_token(),
             "kernel_groups": group_reports,
@@ -638,7 +626,8 @@ def cache_size_per_token(self) -> int:
         for group_idx, group in enumerate(
             self.kv_layer_groups_manager_.kv_layer_groups
         ):
-            numels = self.get_kv_buffer_shape(group.compress_ratio, group_idx).numel()
+            compress_ratio = group.tokens_per_block // group.slots_per_block
+            numels = self.get_kv_buffer_shape(compress_ratio, group_idx).numel()
             slot_bytes = numels * group.dtype.itemsize
-            total += slot_bytes // group.compress_ratio
+            total += slot_bytes // compress_ratio
         return total
diff --git a/tests/cli/test_describe.py b/tests/cli/test_describe.py
index c99aea8458..7240a3f796 100644
--- a/tests/cli/test_describe.py
+++ b/tests/cli/test_describe.py
@@ -35,7 +35,6 @@
             "world_size": 1,
             "kv_cache_layout": {
                 "num_layers": 32,
-                "inference_engine_logical_block_size": 16,
                 "num_blocks": 2048,
                 "cache_size_per_token": 163840,
                 "kernel_groups": [
@@ -45,8 +44,8 @@
                         "object_group_idx": 0,
                         "num_layers": 32,
                         "layer_indices": list(range(32)),
-                        "physical_block_size": 16,
-                        "compress_ratio": 1,
+                        "tokens_per_block": 16,
+                        "slots_per_block": 16,
                         "dtype": "torch.float16",
                         "gpu_kv_concrete_shape": "32 x [2, 2048, 16, 8, 128]",
                         "is_mla": False,
@@ -205,8 +204,7 @@ class FakeArgs:
         assert kg["engine_group_idx"] == 0
         assert kg["object_group_idx"] == 0
         assert kg["num_layers"] == 32
-        assert kg["physical_block_size"] == 16
-        assert kg["compress_ratio"] == 1
+        assert kg["slots_per_block"] == 16
         assert kg["dtype"] == "torch.float16"
         assert kg["is_mla"] is False
         assert kg["attention_backend"] == "vLLM non-MLA flash attention"
diff --git a/tests/v1/multiprocess/test_blend_server_v2.py b/tests/v1/multiprocess/test_blend_server_v2.py
index 4f6f2eaf37..0742c8bacf 100644
--- a/tests/v1/multiprocess/test_blend_server_v2.py
+++ b/tests/v1/multiprocess/test_blend_server_v2.py
@@ -833,7 +833,7 @@ def registered_instance(
             "testmodel",
             1,
             EngineType.VLLM,
-            {"inference_engine_logical_block_size": 16},
+            {},
             [],
         ],
         get_response_class(RequestType.REGISTER_KV_CACHE),
diff --git a/tests/v1/multiprocess/test_blend_v3_load_store_opts.py b/tests/v1/multiprocess/test_blend_v3_load_store_opts.py
index fd8047ed6a..aef071031a 100644
--- a/tests/v1/multiprocess/test_blend_v3_load_store_opts.py
+++ b/tests/v1/multiprocess/test_blend_v3_load_store_opts.py
@@ -231,17 +231,20 @@ def _build_fake_gpu_context(batch_size: int, num_groups: int):
     used by _apply_cb_rope_batched."""
     gpu_context = MagicMock()
     gpu_context.kv_layer_groups_manager.num_kernel_groups = num_groups
-    # All groups: compress_ratio=1, kv_size=2.
-    groups = [SimpleNamespace(compress_ratio=1) for _ in range(num_groups)]
+    # All groups: uncompressed (tokens_per_block == slots_per_block), kv_size=2.
+    groups = [
+        SimpleNamespace(tokens_per_block=4, slots_per_block=4)
+        for _ in range(num_groups)
+    ]
     gpu_context.kv_layer_groups_manager.kernel_groups = groups
 
     # Each per-(slot, group) buffer has shape
-    # (2 kv, num_layers, slots_per_chunk, hidden_dim).
-    num_layers, slots_per_chunk, hidden_dim = 2, 4, 64
+    # (2 kv, num_layers, slots_per_block, hidden_dim).
+    num_layers, slots_per_block, hidden_dim = 2, 4, 64
     head_size = 32
 
     def _get_temp_kernel_group_buffer(batch_idx, kernel_group_idx):
-        return _FakeTensor((2, num_layers, slots_per_chunk, hidden_dim))
+        return _FakeTensor((2, num_layers, slots_per_block, hidden_dim))
 
     gpu_context.get_temp_kernel_group_buffer.side_effect = _get_temp_kernel_group_buffer
     return gpu_context, head_size
@@ -317,14 +320,14 @@ def test_batched_rope_noop_on_empty_slots():
 
 
 def test_batched_rope_raises_on_compressed_layout():
-    """compress_ratio != 1 → RuntimeError."""
+    """A compressed group (tokens_per_block != slots_per_block) → RuntimeError."""
     # First Party
     from lmcache.v1.multiprocess.modules import blend_v3 as v3_mod
 
     gpu_context = MagicMock()
     gpu_context.kv_layer_groups_manager.num_kernel_groups = 1
     gpu_context.kv_layer_groups_manager.kernel_groups = [
-        SimpleNamespace(compress_ratio=2)
+        SimpleNamespace(tokens_per_block=8, slots_per_block=4)
     ]
     rope_state = SimpleNamespace(
         head_size=32, cos_sin_cache=MagicMock(), is_neox_style=True
@@ -335,5 +338,5 @@ def test_batched_rope_raises_on_compressed_layout():
         eng
     )
 
-    with pytest.raises(RuntimeError, match="compress_ratio="):
+    with pytest.raises(RuntimeError, match="is compressed"):
         eng._apply_cb_rope_batched(gpu_context, rope_state, 2, [(0, 1, 2)])
diff --git a/tests/v1/multiprocess/test_cache_server.py b/tests/v1/multiprocess/test_cache_server.py
index d0a53486f7..87ec7ec031 100644
--- a/tests/v1/multiprocess/test_cache_server.py
+++ b/tests/v1/multiprocess/test_cache_server.py
@@ -329,11 +329,9 @@ def registered_instance(
     """
     instance_id = os.getpid()
 
-    # Register KV cache. ``layout_hints['inference_engine_logical_block_size']``
-    # must match the client context's ``page_size`` (=16) — mismatching
-    # them would cause the server to compute a bogus ``compress_ratio``
-    # and the retrieve path would size the tmp GPU buffer in physical
-    # slots while the stored memory_obj is still sized in logical tokens.
+    # Register KV cache. No engine group infos are sent, so the server
+    # detects ``slots_per_block`` from the tensors and treats every group
+    # as uncompressed (``compress_ratio == 1``).
     future = client.submit_request(
         RequestType.REGISTER_KV_CACHE,
         [
@@ -342,7 +340,7 @@ def registered_instance(
             "testmodel",
             1,
             EngineType.VLLM,
-            {"inference_engine_logical_block_size": 16},
+            {},
             [],
         ],
         get_response_class(RequestType.REGISTER_KV_CACHE),
@@ -391,8 +389,8 @@ def test_register_unregister_kv_cache(
     """
     instance_id = os.getpid()
 
-    # Register. ``layout_hints['inference_engine_logical_block_size']``
-    # must match ClientContext.page_size (=16).
+    # Register. No engine group infos: geometry is detected from the
+    # tensors (uncompressed).
     future = client.submit_request(
         RequestType.REGISTER_KV_CACHE,
         [
@@ -401,7 +399,7 @@ def test_register_unregister_kv_cache(
             "testmodel",
             1,
             EngineType.VLLM,
-            {"inference_engine_logical_block_size": 16},
+            {},
             [],
         ],
         get_response_class(RequestType.REGISTER_KV_CACHE),
diff --git a/tests/v1/multiprocess/test_free_locks.py b/tests/v1/multiprocess/test_free_locks.py
index a9e695d235..f6790b4080 100644
--- a/tests/v1/multiprocess/test_free_locks.py
+++ b/tests/v1/multiprocess/test_free_locks.py
@@ -166,7 +166,7 @@ def test_adapter_free_lookup_locks_sends_request():
 
     adapter = LMCacheMPSchedulerAdapter.__new__(LMCacheMPSchedulerAdapter)
     adapter.model_name = "test_model"
-    adapter.chunk_size = 256
+    adapter.lmcache_tokens_per_chunk = 256
     adapter.blocks_in_chunk = 16
     adapter.parallel_strategy = ParallelStrategy(False, 1, 0, 1, 1)
     adapter._health_event = threading.Event()
@@ -216,7 +216,7 @@ def test_adapter_free_lookup_locks_key_matches_lookup():
 
     adapter = LMCacheMPSchedulerAdapter.__new__(LMCacheMPSchedulerAdapter)
     adapter.model_name = "test_model"
-    adapter.chunk_size = 256
+    adapter.lmcache_tokens_per_chunk = 256
     adapter.blocks_in_chunk = 16
     adapter.parallel_strategy = ParallelStrategy(False, 1, 0, 1, 1)
     adapter._health_event = threading.Event()
@@ -245,7 +245,8 @@ def test_adapter_free_lookup_locks_key_matches_lookup():
     mock_client.submit_request.reset_mock()
 
     # Submit free_lookup_locks with aligned end
-    aligned_end = (len(token_ids) // adapter.chunk_size) * adapter.chunk_size
+    tokens_per_chunk = adapter.lmcache_tokens_per_chunk
+    aligned_end = (len(token_ids) // tokens_per_chunk) * tokens_per_chunk
     adapter.free_lookup_locks(
         token_ids=token_ids,
         start=0,
diff --git a/tests/v1/multiprocess/test_gpu_context.py b/tests/v1/multiprocess/test_gpu_context.py
index 75790bed51..9123a418b7 100644
--- a/tests/v1/multiprocess/test_gpu_context.py
+++ b/tests/v1/multiprocess/test_gpu_context.py
@@ -30,12 +30,12 @@
 )
 
 # First Party
-from lmcache.v1.gpu_connector.utils import LayoutHints  # noqa: E402
 from lmcache.v1.kv_layer_groups import KVLayerGroupsManager  # noqa: E402
 from lmcache.v1.multiprocess.gpu_context import (  # noqa: E402
     GPUCacheContext,
     _TempGPUBuffer,
 )
+from lmcache.v1.multiprocess.group_view import EngineGroupInfo  # noqa: E402
 import lmcache.c_ops as lmc_ops  # noqa: E402
 
 _DEVICE = torch.device("cuda")
@@ -91,14 +91,14 @@ def _build_manager(
     tensors: list[torch.Tensor],
     num_blocks: int = 4,
     gpu_kv_format: "lmc_ops.GPUKVFormat" = lmc_ops.GPUKVFormat.NL_X_TWO_NB_BS_NH_HS,
-    layout_hints: LayoutHints | None = None,
+    engine_group_infos: Sequence[EngineGroupInfo] = (),
 ) -> KVLayerGroupsManager:
     """Build a real :class:`KVLayerGroupsManager` from synthetic tensors."""
     return KVLayerGroupsManager(
         tensors,
         gpu_kv_format=gpu_kv_format,
         num_blocks=num_blocks,
-        layout_hints=layout_hints,
+        engine_group_infos=engine_group_infos,
     )
 
 
@@ -107,14 +107,16 @@ def _make_temp_buffer(
     chunk_size: int = 256,
     max_batch_size: int = 4,
     num_blocks: int = 4,
-    layout_hints: LayoutHints | None = None,
+    engine_group_infos: Sequence[EngineGroupInfo] = (),
 ) -> _TempGPUBuffer:
     """Build a ``_TempGPUBuffer`` backed by a real manager."""
     tensors = _make_kv_tensors(specs, num_blocks=num_blocks)
-    manager = _build_manager(tensors, num_blocks=num_blocks, layout_hints=layout_hints)
+    manager = _build_manager(
+        tensors, num_blocks=num_blocks, engine_group_infos=engine_group_infos
+    )
     return _TempGPUBuffer(
         kv_layer_groups_manager=manager,
-        lmcache_logical_chunk_size=chunk_size,
+        lmcache_tokens_per_chunk=chunk_size,
         device=_DEVICE,
         max_batch_size=max_batch_size,
     )
@@ -126,7 +128,7 @@ def _expected_kernel_group_shape(
     """Compute the expected kernel-group buffer shape from the manager's
     public metadata (kv_size, num_layers, slots, hidden_dim)."""
     group = manager.kernel_groups[kernel_group_idx]
-    num_slots = num_tokens // group.compress_ratio
+    num_slots = num_tokens * group.slots_per_block // group.tokens_per_block
     return torch.Size(
         (
             group.shape_desc.kv_size,
@@ -182,15 +184,15 @@ def _make_context(
     specs: Sequence[_GroupSpec],
     chunk_size: int = 256,
     num_blocks: int = 4,
-    layout_hints: LayoutHints | None = None,
+    engine_group_infos: Sequence[EngineGroupInfo] = (),
 ) -> GPUCacheContext:
     """Build a real ``GPUCacheContext`` via its public constructor."""
     tensors = _make_kv_tensors(specs, num_blocks=num_blocks)
     kv_caches = [_FakeIPCWrapper(t) for t in tensors]
     return GPUCacheContext(
         kv_caches,  # type: ignore
-        lmcache_logical_chunk_size=chunk_size,
-        layout_hints=layout_hints,
+        lmcache_tokens_per_chunk=chunk_size,
+        engine_group_infos=engine_group_infos,
     )
 
 
@@ -363,9 +365,11 @@ def test_shape_compressed_group(self) -> None:
         """For a compressed group, the token dim is divided by compress_ratio."""
         tensors = _make_kv_tensors([_GroupSpec(num_layers=2, block_size=8)])
         manager = _build_manager(
-            tensors, layout_hints={"inference_engine_logical_block_size": 16}
+            tensors,
+            engine_group_infos=[EngineGroupInfo(0, (0, 1), tokens_per_block=16)],
         )
-        assert manager.kernel_groups[0].compress_ratio == 2
+        kg = manager.kernel_groups[0]
+        assert kg.tokens_per_block // kg.slots_per_block == 2
         buf = _TempGPUBuffer(manager, 256, _DEVICE)
         shape, _ = buf.get_kernel_group_shape_dtype(256, 0)
         assert shape[2] == 256 // 2
@@ -373,7 +377,8 @@ def test_shape_compressed_group(self) -> None:
     def test_not_divisible_by_compress_ratio_raises(self) -> None:
         tensors = _make_kv_tensors([_GroupSpec(num_layers=2, block_size=8)])
         manager = _build_manager(
-            tensors, layout_hints={"inference_engine_logical_block_size": 16}
+            tensors,
+            engine_group_infos=[EngineGroupInfo(0, (0, 1), tokens_per_block=16)],
         )
         buf = _TempGPUBuffer(manager, 256, _DEVICE)
         with pytest.raises(ValueError, match="not a multiple of"):
@@ -401,7 +406,7 @@ def test_cache_size_per_token_compressed(self) -> None:
         uncompressed = _make_temp_buffer([_GroupSpec(num_layers=2, block_size=16)])
         compressed = _make_temp_buffer(
             [_GroupSpec(num_layers=2, block_size=8)],
-            layout_hints={"inference_engine_logical_block_size": 16},
+            engine_group_infos=[EngineGroupInfo(0, (0, 1), tokens_per_block=16)],
         )
         assert (
             compressed.get_cache_size_per_token() * 2
@@ -476,7 +481,6 @@ def test_calculate_num_blocks_matches_manager(self) -> None:
 class TestGPUCacheContextReportStatus:
     _TOP_LEVEL_KEYS = {
         "num_layers",
-        "inference_engine_logical_block_size",
         "num_blocks",
         "cache_size_per_token",
         "kernel_groups",
@@ -487,8 +491,8 @@ class TestGPUCacheContextReportStatus:
         "object_group_idx",
         "num_layers",
         "layer_indices",
-        "physical_block_size",
-        "compress_ratio",
+        "tokens_per_block",
+        "slots_per_block",
         "dtype",
         "gpu_kv_concrete_shape",
         "is_mla",
@@ -512,7 +516,6 @@ def test_report_status_fields(self) -> None:
         assert group["num_layers"] == 4
         assert group["layer_indices"] == [0, 1, 2, 3]
         assert group["is_mla"] is False
-        assert group["compress_ratio"] == 1
         assert group["gpu_kv_format"] == "NL_X_TWO_NB_BS_NH_HS"
         assert group["dtype"] == str(ctx.dtype)
 
@@ -532,8 +535,8 @@ def test_report_status_multi_group(self) -> None:
             assert group["kernel_group_idx"] == kg_idx
             assert group["engine_group_idx"] == kernel_group.engine_group_idx
             assert group["num_layers"] == kernel_group.num_layers
-            assert group["physical_block_size"] == kernel_group.shape_desc.bs
-            assert group["compress_ratio"] == kernel_group.compress_ratio
+            assert group["slots_per_block"] == kernel_group.slots_per_block
+            assert group["tokens_per_block"] == kernel_group.tokens_per_block
             assert 0 <= group["object_group_idx"] < manager.num_object_groups
 
 
diff --git a/tests/v1/multiprocess/test_gpu_transfer_layout_registry.py b/tests/v1/multiprocess/test_gpu_transfer_layout_registry.py
index 7143f82066..3ef138f074 100644
--- a/tests/v1/multiprocess/test_gpu_transfer_layout_registry.py
+++ b/tests/v1/multiprocess/test_gpu_transfer_layout_registry.py
@@ -73,7 +73,7 @@ def test_unregister_one_shared_gpu_layout_keeps_registry_until_last_instance(
 
     def fake_create_cache_context(
         kv_caches: object,
-        lmcache_logical_chunk_size: int,
+        lmcache_tokens_per_chunk: int,
         layout_hints: object = None,
         engine_group_infos: object = (),
         engine_type: object = None,
diff --git a/tests/v1/multiprocess/test_mq_handler_helpers.py b/tests/v1/multiprocess/test_mq_handler_helpers.py
index 60b1b72345..b1d65251d3 100644
--- a/tests/v1/multiprocess/test_mq_handler_helpers.py
+++ b/tests/v1/multiprocess/test_mq_handler_helpers.py
@@ -52,10 +52,7 @@ def register_kv_cache_handler(
         model_name: Name of the model associated with this KV cache
         world_size: World size associated with this KV cache
         engine_type: Which serving engine produced the caches
-        layout_hints: Engine-provided hints dict. For vLLM,
-            ``layout_hints["inference_engine_logical_block_size"]``
-            carries the logical tokens-per-engine-block (previously a
-            standalone argument).
+        layout_hints: Engine-provided hints dict.
         engine_group_infos: Engine-neutral KV cache group metadata,
             msgspec-decoded from the request payload.
 
@@ -80,12 +77,6 @@ def register_kv_cache_handler(
     assert isinstance(layout_hints, dict), (
         f"Expected layout_hints to be dict, got {type(layout_hints)}"
     )
-    # inference_engine_logical_block_size, if present, must be an int.
-    ie_logical_block_size = layout_hints.get("inference_engine_logical_block_size")
-    assert ie_logical_block_size is None or isinstance(ie_logical_block_size, int), (
-        "Expected layout_hints['inference_engine_logical_block_size'] to be int, got "
-        f"{type(ie_logical_block_size)}"
-    )
     assert isinstance(engine_group_infos, list), (
         f"Expected engine_group_infos to be a list, got {type(engine_group_infos)}"
     )
diff --git a/tests/v1/test_kv_cache_groups.py b/tests/v1/test_kv_cache_groups.py
index b554f1f699..d7fb8e5499 100644
--- a/tests/v1/test_kv_cache_groups.py
+++ b/tests/v1/test_kv_cache_groups.py
@@ -89,44 +89,55 @@ def test_engine_group_infos_reject_out_of_range_layer():
 
 
 def test_slice_block_ids_uniform_block_sizes():
-    """Groups sharing the base block size slice to equal counts."""
+    """Groups sharing one tokens_per_block slice to equal counts."""
     allocated = {0: list(range(16)), 1: list(range(100, 116))}
     sliced = slice_block_ids_per_group(
         allocated,
-        group_block_sizes=[16, 16],
-        base_block_size=16,
-        start_block_idx=0,
-        end_block_idx=16,
+        group_tokens_per_block=[16, 16],
+        start_token_idx=0,
+        end_token_idx=256,
     )
     assert sliced == [list(range(16)), list(range(100, 116))]
 
 
 def test_slice_block_ids_heterogeneous_block_sizes():
-    """A block_size-32 group gets half the IDs of a block_size-16 group.
+    """A tokens_per_block-32 group gets half the IDs of a 16 group.
 
-    The range [0, 16) spans 256 tokens: the block_size-16 group needs
-    16 block IDs, the block_size-32 group 8, for the same token span.
+    The range [0, 256) spans 256 tokens: the tokens_per_block-16 group
+    needs 16 block IDs, the tokens_per_block-32 group 8, for the same
+    token span.
     """
     allocated = {0: list(range(16)), 1: list(range(8))}
     sliced = slice_block_ids_per_group(
         allocated,
-        group_block_sizes=[16, 32],
-        base_block_size=16,
-        start_block_idx=0,
-        end_block_idx=16,
+        group_tokens_per_block=[16, 32],
+        start_token_idx=0,
+        end_token_idx=256,
     )
     assert sliced == [list(range(16)), list(range(8))]
 
 
+def test_slice_block_ids_smaller_than_base_block_sizes():
+    """Groups with tiny paged chunks (e.g. DeepSeek V4 compressor state,
+    tokens_per_block 4/8) get proportionally more IDs over one token span."""
+    allocated = {0: [0], 1: list(range(64)), 2: list(range(32))}
+    sliced = slice_block_ids_per_group(
+        allocated,
+        group_tokens_per_block=[256, 4, 8],
+        start_token_idx=0,
+        end_token_idx=256,
+    )
+    assert sliced == [[0], list(range(64)), list(range(32))]
+
+
 def test_slice_block_ids_nonzero_start_offset():
-    """Start/end offsets are divided per group by the block factor."""
+    """Start/end token offsets are divided per group by tokens_per_block."""
     allocated = {0: list(range(32)), 1: list(range(16))}
     sliced = slice_block_ids_per_group(
         allocated,
-        group_block_sizes=[16, 32],
-        base_block_size=16,
-        start_block_idx=16,
-        end_block_idx=32,
+        group_tokens_per_block=[16, 32],
+        start_token_idx=256,
+        end_token_idx=512,
     )
     assert sliced == [list(range(16, 32)), list(range(8, 16))]
 
@@ -136,25 +147,23 @@ def test_slice_block_ids_missing_group_yields_empty():
     allocated = {0: list(range(16))}  # group 1 absent
     sliced = slice_block_ids_per_group(
         allocated,
-        group_block_sizes=[16, 16],
-        base_block_size=16,
-        start_block_idx=0,
-        end_block_idx=16,
+        group_tokens_per_block=[16, 16],
+        start_token_idx=0,
+        end_token_idx=256,
     )
     assert sliced == [list(range(16)), []]
 
 
 def test_slice_block_ids_misaligned_range_raises():
-    """A range that is not a whole number of a group's blocks is rejected."""
+    """A range that is not a whole number of a group's chunks is rejected."""
     allocated = {0: list(range(8)), 1: list(range(8))}
-    # group 1 block_size 48 -> factor 3; end=8 is not a multiple of 3.
+    # group 1 tokens_per_block 48; end=128 is not a multiple of 48.
     try:
         slice_block_ids_per_group(
             allocated,
-            group_block_sizes=[16, 48],
-            base_block_size=16,
-            start_block_idx=0,
-            end_block_idx=8,
+            group_tokens_per_block=[16, 48],
+            start_token_idx=0,
+            end_token_idx=128,
         )
     except ValueError as exc:
         assert "does not align" in str(exc)
diff --git a/tests/v1/test_kv_layer_groups_manager.py b/tests/v1/test_kv_layer_groups_manager.py
index 58aae45fcc..c913aa8de5 100644
--- a/tests/v1/test_kv_layer_groups_manager.py
+++ b/tests/v1/test_kv_layer_groups_manager.py
@@ -7,7 +7,6 @@
 import torch
 
 # First Party
-from lmcache.v1.gpu_connector.utils import LayoutHints
 from lmcache.v1.kv_layer_groups import (
     EXCLUDED_ENGINE_GROUP,
     KernelGroupIdentity,
@@ -30,7 +29,6 @@ def _build_manager(
     tensors: list[torch.Tensor],
     *,
     num_blocks: int,
-    layout_hints: LayoutHints | None = None,
     engine_group_infos: Sequence[EngineGroupInfo] = (),
 ) -> KVLayerGroupsManager:
     """Build a manager using the per-layer NHD format.
@@ -47,7 +45,6 @@ def _build_manager(
         tensors,
         gpu_kv_format=lmc_ops.GPUKVFormat.NL_X_TWO_NB_BS_NH_HS,
         num_blocks=num_blocks,
-        layout_hints=layout_hints,
         engine_group_infos=engine_group_infos,
     )
 
@@ -281,34 +278,43 @@ def test_empty_groups_raises(self):
             format_kvcache_shape_spec([])
 
 
-class TestDeriveCompressionMetadata:
-    """``(compress_ratio, physical_chunk_size)`` derivation: ``1`` when there is
-    no engine block size, else ``ie_logical_block_size // bs`` (e.g. DeepSeek V4
-    compression where ``bs < logical``), with divisibility enforced.
+class TestDeriveSlotsPerChunk:
+    """``slots_per_chunk`` derivation from the two block-size sources:
+    ``tokens_per_block`` (engine KV cache spec, known at initialization) and
+    ``slots_per_block`` (registered tensor batch dimension), with
+    ``compress_ratio = tokens_per_block // slots_per_block`` (e.g. DeepSeek
+    V4 compression where ``slots < tokens``) and divisibility enforced.
     """
 
-    def _derive(self, bs: int, logical: "int | None", chunk: int = 256):
-        return KVLayerGroupsManager._derive_compression_metadata(
+    def _derive(self, slots: int, tokens: int, chunk: int = 256) -> int:
+        return KVLayerGroupsManager._derive_slots_per_chunk(
             group_idx=0,
-            bs=bs,
-            ie_logical_block_size=logical,
-            lmcache_logical_chunk_size=chunk,
+            slots_per_block=slots,
+            tokens_per_block=tokens,
+            lmcache_tokens_per_chunk=chunk,
         )
 
     def test_one_to_one(self):
-        assert self._derive(bs=16, logical=16) == (1, 256)
+        assert self._derive(slots=16, tokens=16) == 256
 
-    def test_no_block_size_info(self):
-        assert self._derive(bs=16, logical=None) == (1, 256)
+    def test_compression_slots_lt_tokens(self):
+        # slots=8 packs 2 logical tokens per physical slot (DeepSeek V4 style).
+        assert self._derive(slots=8, tokens=16) == 128
 
-    def test_compression_bs_lt_logical(self):
-        # bs=8 packs 2 logical tokens per physical slot (DeepSeek V4 style).
-        assert self._derive(bs=8, logical=16) == (2, 128)
+    def test_dsv4_declared_ratios(self):
+        # DeepSeek-V4-Flash MLA groups declare 256 tokens per paged chunk
+        # over 64 (compress_ratio 4) or 2 (compress_ratio 128) slots.
+        assert self._derive(slots=64, tokens=256) == 64
+        assert self._derive(slots=2, tokens=256) == 2
 
     def test_not_divisible_raises(self):
-        # Divisibility is enforced loudly (e.g. bs=6 does not divide 16).
+        # Divisibility is enforced loudly (e.g. slots=6 does not divide 16).
         with pytest.raises(ValueError, match="must be a multiple of"):
-            self._derive(bs=6, logical=16)
+            self._derive(slots=6, tokens=16)
+
+    def test_chunk_not_divisible_by_ratio_raises(self):
+        with pytest.raises(ValueError, match="lmcache_tokens_per_chunk"):
+            self._derive(slots=1, tokens=96, chunk=256)
 
 
 class TestKernelGroupIdentity:
@@ -400,15 +406,50 @@ def test_calculate_num_blocks_uncompressed(self):
         manager = _build_manager(tensors, num_blocks=32)
         assert manager.calculate_num_blocks(0, 256) == 16
 
+    def test_dsv4_flash_style_mixed_compression(self):
+        # Mirrors DeepSeek-V4-Flash: one 256-token engine group whose layers
+        # have 64- and 2-slot pages (declared compress ratios 4 and 128), one
+        # 64-token SWA group and one 4-token compressor-state group (ratio 1).
+        tensors = [
+            torch.randn(2, 8, 64, 1, 64, dtype=torch.float16),
+            torch.randn(2, 8, 2, 1, 64, dtype=torch.float16),
+            torch.randn(2, 8, 64, 1, 32, dtype=torch.float16),
+            torch.randn(2, 8, 4, 1, 128, dtype=torch.float32),
+        ]
+        manager = _build_manager(
+            tensors,
+            num_blocks=8,
+            engine_group_infos=[
+                EngineGroupInfo(0, (0, 1), tokens_per_block=256),
+                EngineGroupInfo(1, (2,), tokens_per_block=64),
+                EngineGroupInfo(2, (3,), tokens_per_block=4),
+            ],
+        )
+        by_layer = {g.layer_indices[0]: g for g in manager.kernel_groups}
+        assert by_layer[0].tokens_per_block // by_layer[0].slots_per_block == 4
+        assert by_layer[1].tokens_per_block // by_layer[1].slots_per_block == 128
+        assert by_layer[2].tokens_per_block // by_layer[2].slots_per_block == 1
+        assert by_layer[3].tokens_per_block // by_layer[3].slots_per_block == 1
+        # 256-token LMCache chunk -> 2 physical slots in the ratio-128 group.
+        assert by_layer[1].slots_per_chunk == 2
+        assert by_layer[0].slots_per_chunk == 64
+
     def test_calculate_num_blocks_compressed(self):
-        # bs=8, ie_logical_block_size=16 -> compress_ratio=2;
-        # 256 logical tokens -> 128 physical slots -> 128 // 8 = 16 blocks.
+        # slots_per_block=8 (tensor), tokens_per_block=16 (engine spec) ->
+        # compress_ratio=2; 256 logical tokens -> 128 physical slots ->
+        # 128 // 8 = 16 blocks.
         tensors = [torch.randn(2, 32, 8, 8, 64, dtype=torch.float16) for _ in range(2)]
         manager = _build_manager(
             tensors,
             num_blocks=32,
-            layout_hints={"inference_engine_logical_block_size": 16},
+            engine_group_infos=[
+                EngineGroupInfo(0, (0, 1), tokens_per_block=16),
+            ],
         )
+        group = manager.kernel_groups[0]
+        assert group.tokens_per_block == 16
+        assert group.slots_per_block == 8
+        assert group.tokens_per_block // group.slots_per_block == 2
         assert manager.calculate_num_blocks(0, 256) == 16
 
 
diff --git a/tests/v1/test_vllm_kv_cache_groups.py b/tests/v1/test_vllm_kv_cache_groups.py
index d6e990c40a..0b1fe3bf82 100644
--- a/tests/v1/test_vllm_kv_cache_groups.py
+++ b/tests/v1/test_vllm_kv_cache_groups.py
@@ -16,9 +16,15 @@
 )
 
 
+@dataclass
+class MockKVCacheSpec:
+    block_size: int
+
+
 @dataclass
 class MockKVCacheGroup:
     layer_names: list[str]
+    kv_cache_spec: MockKVCacheSpec
 
 
 @dataclass
@@ -46,8 +52,12 @@ def test_conversion_preserves_engine_group_layers():
     spec = create_engine_group_infos_from_vllm(
         MockKVCacheConfig(
             kv_cache_groups=[
-                MockKVCacheGroup(["layer.0", "layer.2"]),
-                MockKVCacheGroup(["layer.1", "layer.3"]),
+                MockKVCacheGroup(
+                    ["layer.0", "layer.2"], MockKVCacheSpec(block_size=16)
+                ),
+                MockKVCacheGroup(
+                    ["layer.1", "layer.3"], MockKVCacheSpec(block_size=16)
+                ),
             ]
         ),
         _same_shape_caches(["layer.0", "layer.1", "layer.2", "layer.3"]),
@@ -55,6 +65,7 @@ def test_conversion_preserves_engine_group_layers():
 
     assert num_engine_groups(spec) == 2
     assert get_engine_group_indices(spec, 4) == [0, 1, 0, 1]
+    assert [group.tokens_per_block for group in spec] == [16, 16]
 
 
 def test_conversion_splits_by_lmcache_layer_identity():
@@ -65,8 +76,12 @@ def test_conversion_splits_by_lmcache_layer_identity():
     spec = create_engine_group_infos_from_vllm(
         MockKVCacheConfig(
             kv_cache_groups=[
-                MockKVCacheGroup(["layer.0", "layer.2", "layer.4"]),
-                MockKVCacheGroup(["layer.1", "layer.3"]),
+                MockKVCacheGroup(
+                    ["layer.0", "layer.2", "layer.4"], MockKVCacheSpec(block_size=16)
+                ),
+                MockKVCacheGroup(
+                    ["layer.1", "layer.3"], MockKVCacheSpec(block_size=16)
+                ),
             ]
         ),
         caches,
diff --git a/tests/v1/test_vllm_mp_adapter.py b/tests/v1/test_vllm_mp_adapter.py
index d3f082a5b0..07cbb0e9ac 100644
--- a/tests/v1/test_vllm_mp_adapter.py
+++ b/tests/v1/test_vllm_mp_adapter.py
@@ -68,10 +68,7 @@ def start(self) -> None:
     # KV-cache wrapping pulls in CUDA IPC; bypass for unit tests.
     monkeypatch.setattr(adapter_mod, "wrap_kv_caches", lambda kv: list(kv.values()))
     # ``vllm_layout_hints`` returns a ``LayoutHints`` (TypedDict / dict at
-    # runtime); the production path performs item assignment on it
-    # (``layout_hints["inference_engine_logical_block_size"] = ...``), so
-    # the stub must also be a real dict — a string would raise
-    # ``TypeError: 'str' object does not support item assignment``.
+    # runtime); stub it with an empty dict.
     monkeypatch.setattr(
         "lmcache.integration.vllm.utils.vllm_layout_hints",
         lambda: {},

From 1d2b3de5c587182c7333b2c86ab341317b03a446 Mon Sep 17 00:00:00 2001
From: Rui Zhang <51696593+ruizhang0101@users.noreply.github.com>
Date: Wed, 10 Jun 2026 22:10:25 -0700
Subject: [PATCH 41/57] [Misc] align MP server id with OTel service.instance.id
 (#3558)

Signed-off-by: Rui Zhang <zrfishnoodles@gmail.com>
---
 lmcache/v1/mp_observability/README.md  |  3 +--
 lmcache/v1/mp_observability/config.py  | 29 ++++++++----------------
 lmcache/v1/multiprocess/config.py      | 18 +++++++++++++++
 lmcache/v1/multiprocess/http_server.py |  9 +++-----
 lmcache/v1/multiprocess/server.py      |  7 ++++++
 tests/v1/multiprocess/test_config.py   | 31 ++++++++++++++++++++++++++
 6 files changed, 69 insertions(+), 28 deletions(-)

diff --git a/lmcache/v1/mp_observability/README.md b/lmcache/v1/mp_observability/README.md
index e2250ae8f8..f5dc98cde6 100644
--- a/lmcache/v1/mp_observability/README.md
+++ b/lmcache/v1/mp_observability/README.md
@@ -58,7 +58,6 @@ CLI, pass the flags below; when embedding programmatically, construct an
 | `--event-bus-queue-size N` | `10000` | Maximum number of events in the EventBus queue before tail-drop. |
 | `--otlp-endpoint URL` | *(none)* | OTLP gRPC endpoint (e.g. `http://localhost:4317`). When set, metrics and traces are pushed to an OTel collector. When unset, metrics fall back to Prometheus pull mode. |
 | `--prometheus-port PORT` | `9090` | Port for the Prometheus `/metrics` endpoint. Only used when `--otlp-endpoint` is not set. |
-| `--service-instance-id ID` | *unset* (default random UUID v4) | Identifier for this MP server instance.  Attached as the OTel Resource attribute `service.instance.id` on every metric and span.  When the flag is not passed, defaults to a random UUID v4 minted at startup.  Pass `--service-instance-id=""` to force an explicit empty value. |
 
 ### `ObservabilityConfig` fields
 
@@ -71,7 +70,7 @@ CLI, pass the flags below; when embedding programmatically, construct an
 | `tracing_enabled` | `bool` | `False` | Register tracing subscribers (OTel spans). |
 | `otlp_endpoint` | `str \| None` | `None` | OTLP gRPC endpoint. When set, metrics and traces are pushed. When `None`, metrics use Prometheus pull fallback. |
 | `prometheus_port` | `int` | `9090` | Port for the Prometheus `/metrics` endpoint (pull fallback only). |
-| `service_instance_id` | `str \| None` | `None` (default random UUID v4) | Identifier for this MP server instance; attached as the OTel Resource attribute `service.instance.id` on every metric and span.  `None` defaults to a random UUID v4 at `init_observability` time.  An explicit `""` is preserved. |
+| `service_instance_id` | `str \| None` | `None` | OTel Resource attribute `service.instance.id`, attached to every metric and span. No CLI flag: `run_cache_server` projects the MP server's `--instance-id` onto it. `None` (standalone callers only) falls back to a random UUID v4 at `init_observability` time; an explicit value is preserved. |
 
 ### Metrics export modes
 
diff --git a/lmcache/v1/mp_observability/config.py b/lmcache/v1/mp_observability/config.py
index 275d17e492..b8a894451e 100644
--- a/lmcache/v1/mp_observability/config.py
+++ b/lmcache/v1/mp_observability/config.py
@@ -74,14 +74,16 @@ class ObservabilityConfig:
     minted and logged at INFO."""
 
     service_instance_id: str | None = None
-    """Identifier for this MP server instance.  Attached as the OTel
-    Resource attribute ``service.instance.id`` on every metric and span.
-    One MP server has exactly one instance id.
+    """OTel ``service.instance.id`` resource attribute, attached to every
+    metric and span. There is no CLI flag for it: the operator-facing id is
+    ``--instance-id`` (``MPServerConfig.instance_id``), which ``run_cache_server``
+    projects onto this attribute so telemetry and coordinator membership share
+    one id.
 
-    ``None`` (the default, also the state when the CLI flag is not
-    passed) falls back to a random UUID v4 at ``init_observability``
-    time.  An explicit empty string is preserved verbatim so operators
-    who want the attribute to report ``""`` can ask for it."""
+    ``None`` (the default) means "not set": standalone callers that build an
+    ``ObservabilityConfig`` directly (e.g. the trace CLI driver) fall back to a
+    random UUID v4 at ``init_observability`` time. An explicit value is
+    preserved verbatim."""
 
 
 DEFAULT_OBSERVABILITY_CONFIG = ObservabilityConfig(enabled=False)
@@ -162,18 +164,6 @@ def add_observability_args(
             "(0, 1.0]. Counters always count all events. Default is 0.01 (1%%)."
         ),
     )
-    group.add_argument(
-        "--service-instance-id",
-        type=str,
-        default=None,
-        help=(
-            "Identifier for this MP server instance. Attached as the OTel "
-            "Resource attribute 'service.instance.id' on every metric and "
-            "span. When the flag is not passed, defaults to a random "
-            "UUID v4 minted at startup. Pass --service-instance-id='' to "
-            "force an empty attribute value."
-        ),
-    )
 
     # Lookup hash logging config
     log_group = parser.add_argument_group(
@@ -262,7 +252,6 @@ def parse_args_to_observability_config(
         ),
         trace_level=args.trace_level,
         trace_output=args.trace_output,
-        service_instance_id=args.service_instance_id,
     )
 
     if config.tracing_enabled and config.otlp_endpoint is None:
diff --git a/lmcache/v1/multiprocess/config.py b/lmcache/v1/multiprocess/config.py
index 7b8eec772a..9e19bf3db5 100644
--- a/lmcache/v1/multiprocess/config.py
+++ b/lmcache/v1/multiprocess/config.py
@@ -10,6 +10,7 @@
 import json
 import math
 import os
+import uuid
 
 
 @dataclass
@@ -61,6 +62,13 @@ class MPServerConfig:
     script_allowed_imports: list[str] = field(default_factory=list)
     """Modules that /run_script endpoint is allowed to import."""
 
+    instance_id: str = field(default_factory=lambda: str(uuid.uuid4()))
+    """Stable identity of this MP server, the single source of truth for who
+    this server is. Used as the coordinator membership key and projected onto
+    the OTel ``service.instance.id`` resource attribute (see
+    ``run_cache_server``) so metrics, traces, and coordinator state all key on
+    the same id. Set via ``--instance-id``; defaults to a random UUID v4."""
+
 
 @dataclass
 class RuntimePluginConfig:
@@ -133,6 +141,15 @@ def add_mp_server_args(
     mp_group = parser.add_argument_group(
         "MP Server", "Configuration for the ZMQ multiprocess cache server"
     )
+    mp_group.add_argument(
+        "--instance-id",
+        type=str,
+        default=None,
+        help="Stable identity of this MP server. Used as the coordinator "
+        "membership key and as the OTel 'service.instance.id' resource "
+        "attribute on every metric and span. Defaults to a random UUID v4 "
+        "minted at startup.",
+    )
     mp_group.add_argument(
         "--host",
         type=str,
@@ -256,6 +273,7 @@ def parse_args_to_mp_server_config(
     except json.JSONDecodeError as exc:
         raise ValueError("--runtime-plugin-config is not valid JSON: %s" % exc) from exc
     return MPServerConfig(
+        instance_id=args.instance_id or str(uuid.uuid4()),
         host=args.host,
         port=args.port,
         chunk_size=args.chunk_size,
diff --git a/lmcache/v1/multiprocess/http_server.py b/lmcache/v1/multiprocess/http_server.py
index e01c714bbc..ddc3ab621f 100644
--- a/lmcache/v1/multiprocess/http_server.py
+++ b/lmcache/v1/multiprocess/http_server.py
@@ -107,7 +107,6 @@ async def lifespan(app: FastAPI):
     # the keep_registered task registers, heartbeats, and deregisters on
     # shutdown. Best-effort: failures are logged and retried, never fatal.
     http_config = _configs.get("http")
-    obs_config = _configs.get("observability")
     coordinator_config = _configs.get("coordinator")
     coordinator_client = None
     coordinator_registration_task = None
@@ -117,16 +116,14 @@ async def lifespan(app: FastAPI):
         and http_config is not None
     ):
         coordinator_client = httpx.AsyncClient(timeout=10.0)
-        # Reuse this server's telemetry identity (OTel service.instance.id) so
-        # coordinator membership lines up with metrics/traces. Empty lets the
-        # coordinator assign one.
-        service_instance_id = getattr(obs_config, "service_instance_id", None)
+        # Canonical id resolved by run_cache_server above; shared with
+        # the OTel service.instance.id so membership matches metrics/traces.
         coordinator_registration_task = asyncio.create_task(
             keep_registered(
                 coordinator_client,
                 coordinator_config.url,
                 http_port=http_config.http_port,
-                instance_id=service_instance_id or "",
+                instance_id=mp_config.instance_id,
                 advertise_ip=coordinator_config.advertise_ip,
                 heartbeat_interval=coordinator_config.heartbeat_interval,
             )
diff --git a/lmcache/v1/multiprocess/server.py b/lmcache/v1/multiprocess/server.py
index 76e8d148f6..d7c68e246c 100644
--- a/lmcache/v1/multiprocess/server.py
+++ b/lmcache/v1/multiprocess/server.py
@@ -235,6 +235,13 @@ def run_cache_server(
         If return_engine is True: tuple of (MessageQueueServer, MPCacheEngine).
         If return_engine is False: None (blocks until interrupted).
     """
+    # mp_config.instance_id is this server's single source of identity (set via
+    # --instance-id, else a random UUID v4). Project it onto the OTel
+    # service.instance.id unless observability set that attribute explicitly, so
+    # metrics/traces and coordinator membership all key on the same id.
+    if obs_config.service_instance_id is None:
+        obs_config.service_instance_id = mp_config.instance_id
+
     event_bus = init_observability(
         obs_config, start_prometheus_http_server=start_prometheus_http_server
     )
diff --git a/tests/v1/multiprocess/test_config.py b/tests/v1/multiprocess/test_config.py
index 686d7b884a..b3c98a6ec2 100644
--- a/tests/v1/multiprocess/test_config.py
+++ b/tests/v1/multiprocess/test_config.py
@@ -8,6 +8,7 @@
 
 # Standard
 import argparse
+import uuid
 
 # Third Party
 import pytest
@@ -15,8 +16,11 @@
 # First Party
 from lmcache.v1.multiprocess.config import (
     CoordinatorConfig,
+    MPServerConfig,
     add_coordinator_args,
+    add_mp_server_args,
     parse_args_to_coordinator_config,
+    parse_args_to_mp_server_config,
 )
 
 _COORD_ENV = (
@@ -96,3 +100,30 @@ def test_garbage_env_heartbeat_rejected(monkeypatch):
     monkeypatch.setenv("LMCACHE_COORDINATOR_HEARTBEAT_INTERVAL", "abc")
     with pytest.raises(ValueError, match="not a number"):
         _parse([])
+
+
+def _parse_mp(argv: list[str]) -> MPServerConfig:
+    parser = argparse.ArgumentParser()
+    add_mp_server_args(parser)
+    return parse_args_to_mp_server_config(parser.parse_args(argv))
+
+
+def test_instance_id_defaults_to_uuid4():
+    # No --instance-id flag => a random UUID v4 is minted.
+    config = _parse_mp([])
+    assert uuid.UUID(config.instance_id).version == 4
+
+
+def test_instance_id_flag_is_preserved():
+    config = _parse_mp(["--instance-id", "mp-server-7"])
+    assert config.instance_id == "mp-server-7"
+
+
+def test_instance_id_defaults_are_distinct():
+    # Each parse without the flag gets its own id (no shared default).
+    assert _parse_mp([]).instance_id != _parse_mp([]).instance_id
+
+
+def test_instance_id_dataclass_default_is_distinct():
+    # Direct construction (no CLI) also mints a fresh id per instance.
+    assert MPServerConfig().instance_id != MPServerConfig().instance_id

From 45c02cbe3bd874a5f380a6f29de6a6335abccc31 Mon Sep 17 00:00:00 2001
From: Kushagra Singh Gaur <kushagrasingh.gaur2022@vitstudent.ac.in>
Date: Thu, 11 Jun 2026 11:02:27 +0530
Subject: [PATCH 42/57] docs: add filesystem connector backend guide (#3534)

Signed-off-by: Kushagra963-lab <147275307+Kushagra963-lab@users.noreply.github.com>
Co-authored-by: Kushagra963-lab <147275307+Kushagra963-lab@users.noreply.github.com>
---
 docs/source/kv_cache/storage_backends/fs.rst  | 189 ++++++++++++++++++
 .../kv_cache/storage_backends/index.rst       |   1 +
 2 files changed, 190 insertions(+)
 create mode 100644 docs/source/kv_cache/storage_backends/fs.rst

diff --git a/docs/source/kv_cache/storage_backends/fs.rst b/docs/source/kv_cache/storage_backends/fs.rst
new file mode 100644
index 0000000000..94a165cbcf
--- /dev/null
+++ b/docs/source/kv_cache/storage_backends/fs.rst
@@ -0,0 +1,189 @@
+Filesystem Backend
+==================
+
+The filesystem backend uses ``FSConnector`` to store LMCache remote chunks as
+files under one or more POSIX filesystem directories. It is useful when you want
+a simple persistent remote backend, or when multiple inference workers can see
+the same mounted directory through local disk, NFS, a parallel filesystem, or a
+container volume.
+
+This backend is different from :doc:`local_storage`. Local disk offloading is a
+per-process local tier configured through ``local_disk``. ``FSConnector`` is a
+remote backend configured through ``remote_storage_plugins`` or the legacy
+``remote_url`` field, so it participates in the same remote backend path as
+Redis, S3, Mooncake, and other remote connectors.
+
+When to use it
+--------------
+
+Use ``FSConnector`` when:
+
+* You need a lightweight persistent remote backend for development, examples,
+  or benchmark runs.
+* Multiple LMCache or vLLM processes share a mounted cache directory.
+* Your storage is already exposed as a filesystem and does not need a separate
+  object-store or key-value service.
+* You want to test remote-backend behavior before moving to a production
+  backend such as Redis, Valkey, S3, Mooncake, or InfiniStore.
+
+Avoid using it when:
+
+* The filesystem is not shared by every process that must read the cache.
+* You need object-store semantics, cross-region persistence, or service-level
+  access control.
+* The storage path is on a slow network filesystem and sits on the hot request
+  path.
+
+Recommended configuration
+-------------------------
+
+The recommended form is the built-in remote storage plugin configuration. The
+plugin name ``fs`` selects ``FSConnector`` and the base path is configured in
+``extra_config``.
+
+.. code-block:: yaml
+
+   chunk_size: 256
+   local_cpu: false
+   max_local_cpu_size: 1
+   save_unfull_chunk: false
+   remote_serde: "naive"
+   blocking_timeout_secs: 10
+
+   remote_storage_plugins: ["fs"]
+   extra_config:
+     remote_storage_plugin.fs.base_path: "/tmp/lmcache-fs"
+     save_chunk_meta: false
+
+``FSConnector`` creates the base directory if it does not already exist. Each
+cache chunk is written as a ``.data`` file whose name is derived from the
+LMCache cache key.
+
+Multiple filesystem instances
+-----------------------------
+
+You can configure multiple named ``fs`` instances by appending an instance name
+after the connector type. The part before the first dot is still the connector
+type; the full plugin name becomes the ``extra_config`` prefix.
+
+.. code-block:: yaml
+
+   remote_storage_plugins: ["fs.primary", "fs.backup"]
+   extra_config:
+     remote_storage_plugin.fs.primary.base_path: "/mnt/cache-primary/lmcache"
+     remote_storage_plugin.fs.backup.base_path: "/mnt/cache-backup/lmcache"
+     save_chunk_meta: false
+
+This is useful when a deployment wants separate filesystem-backed remote stores
+for different cache policies, traffic classes, or experiments.
+
+Multiple base paths
+-------------------
+
+``remote_storage_plugin.<name>.base_path`` may contain a comma-separated list of
+directories. The connector chooses a directory by hashing the cache chunk key,
+which spreads files across the configured paths.
+
+.. code-block:: yaml
+
+   remote_storage_plugins: ["fs"]
+   extra_config:
+     remote_storage_plugin.fs.base_path: "/mnt/nvme0/lmcache,/mnt/nvme1/lmcache"
+     save_chunk_meta: false
+
+Use multiple paths when each path maps to an independent storage device or mount
+point. For best results, keep every path visible to the LMCache processes that
+need to retrieve the same chunks.
+
+Legacy ``remote_url`` configuration
+-----------------------------------
+
+The legacy ``remote_url`` form is still supported. The host and port are parsed
+for compatibility with other remote URL formats; ``FSConnector`` uses the path.
+
+.. code-block:: yaml
+
+   chunk_size: 256
+   local_cpu: false
+   max_local_cpu_size: 1
+   save_unfull_chunk: false
+   remote_url: "fs://localhost:0/tmp/lmcache-fs"
+   remote_serde: "naive"
+   blocking_timeout_secs: 10
+   extra_config:
+     save_chunk_meta: false
+
+Prefer ``remote_storage_plugins`` for new deployments because it also supports
+named instances and keeps connector-specific settings grouped by plugin name.
+
+Optional settings
+-----------------
+
+The connector reads the following optional settings from ``extra_config``.
+
+``fs_connector_relative_tmp_dir``
+   Relative directory used for temporary files before an atomic rename into the
+   final chunk path. The value must be relative, not absolute. When omitted,
+   temporary files are created next to the final file with a ``.tmp`` suffix.
+
+``fs_connector_read_ahead_size``
+   Number of bytes to read first when loading a chunk. If the read fills that
+   window, the connector reads the remaining bytes. This can trigger filesystem
+   readahead on filesystems that support it.
+
+``fs_connector_use_odirect``
+   Enables ``O_DIRECT`` for aligned reads and writes on platforms that expose
+   it. The connector falls back to normal I/O when a chunk size is not aligned
+   to the filesystem block size. ``O_DIRECT`` is disabled automatically when
+   ``save_chunk_meta`` is enabled because the metadata prefix is not block
+   aligned.
+
+``fs_base_path``
+   Compatibility fallback for plugin mode. Prefer
+   ``remote_storage_plugin.<name>.base_path`` so the setting remains scoped to a
+   specific plugin instance.
+
+Example with optional settings:
+
+.. code-block:: yaml
+
+   remote_storage_plugins: ["fs"]
+   extra_config:
+     remote_storage_plugin.fs.base_path: "/data/lmcache-fs"
+     fs_connector_relative_tmp_dir: ".tmp"
+     fs_connector_read_ahead_size: 1048576
+     fs_connector_use_odirect: true
+     save_chunk_meta: false
+
+Operational notes
+-----------------
+
+* Ensure the LMCache process has permission to create directories and write
+  files under every configured base path.
+* Put the path on durable storage if cache reuse must survive process restarts.
+  Temporary directories such as ``/tmp`` are convenient for tests but may be
+  cleaned by the operating system.
+* Use the same mounted path for every process that should share cache chunks.
+  If one process writes to a private container path, other processes will miss
+  those chunks even if they use the same configuration text.
+* Leave ``save_chunk_meta`` enabled when workers may infer different metadata
+  for the same chunk. Disable it only when you need the lower overhead path and
+  the workers share compatible cache metadata.
+* For MP mode L2 storage, see :doc:`../../mp/l2_storage`, which documents the
+  ``fs`` and ``fs_native`` L2 adapters configured through ``--l2-adapter``.
+
+Minimal vLLM usage
+------------------
+
+After writing ``fs.yaml`` with one of the configurations above, start vLLM with
+LMCache enabled:
+
+.. code-block:: bash
+
+   LMCACHE_CONFIG_FILE=fs.yaml vllm serve meta-llama/Llama-3.1-8B-Instruct \
+       --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}' \
+       --disable-log-requests
+
+Then send the same long-prefix request twice. The first request stores chunks in
+the filesystem backend. The second request should report LMCache hit tokens and
+load matching chunks from the configured filesystem path.
diff --git a/docs/source/kv_cache/storage_backends/index.rst b/docs/source/kv_cache/storage_backends/index.rst
index 7dd611fddf..feef262d6c 100644
--- a/docs/source/kv_cache/storage_backends/index.rst
+++ b/docs/source/kv_cache/storage_backends/index.rst
@@ -13,6 +13,7 @@ Supported Backends
    custom_backend
    dax
    eic
+   fs
    gds
    hfbucket
    infinistore

From 266f9a9ed9188e73a6c2469f3438d5c7706aa8cc Mon Sep 17 00:00:00 2001
From: chunxiaozheng <idellzheng@tencent.com>
Date: Thu, 11 Jun 2026 14:54:48 +0800
Subject: [PATCH 43/57] [cli] add quota management commands
 (set/get/list/delete) (#3623)

* [cli] add quota management commands (set/get/list/delete)

Signed-off-by: idellzheng <idellzheng@tencent.com>

* checkstyle error fix

Signed-off-by: idellzheng <idellzheng@tencent.com>

---------

Signed-off-by: idellzheng <idellzheng@tencent.com>
---
 docs/source/cli/index.rst                    |   5 +-
 docs/source/cli/quota.rst                    | 301 +++++++++++++++++++
 lmcache/cli/commands/quota/__init__.py       |  91 ++++++
 lmcache/cli/commands/quota/delete_command.py |  77 +++++
 lmcache/cli/commands/quota/get_command.py    |  80 +++++
 lmcache/cli/commands/quota/helpers.py        |  78 +++++
 lmcache/cli/commands/quota/list_command.py   |  76 +++++
 lmcache/cli/commands/quota/set_command.py    |  90 ++++++
 tests/cli/commands/test_quota.py             | 158 ++++++++++
 9 files changed, 955 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/cli/quota.rst
 create mode 100644 lmcache/cli/commands/quota/__init__.py
 create mode 100644 lmcache/cli/commands/quota/delete_command.py
 create mode 100644 lmcache/cli/commands/quota/get_command.py
 create mode 100644 lmcache/cli/commands/quota/helpers.py
 create mode 100644 lmcache/cli/commands/quota/list_command.py
 create mode 100644 lmcache/cli/commands/quota/set_command.py
 create mode 100644 tests/cli/commands/test_quota.py

diff --git a/docs/source/cli/index.rst b/docs/source/cli/index.rst
index fbaf700039..cf798ca9b6 100644
--- a/docs/source/cli/index.rst
+++ b/docs/source/cli/index.rst
@@ -33,7 +33,7 @@ The ``lmcache`` CLI ships in two packages:
    * - ``lmcache-cli``
      - ``pip install lmcache-cli``
      - CLI only: ``ping``, ``query``, ``describe``, ``kvcache``,
-       ``bench engine``. No GPU required, any OS.
+       ``quota``, ``bench engine``. No GPU required, any OS.
 
 .. note::
 
@@ -65,6 +65,8 @@ Available Commands
        adapter (``l2``).
    * - :doc:`kvcache`
      - Manage KV cache state (e.g. clear L1 cache) on a running server.
+   * - :doc:`quota`
+     - Manage per-salt cache quotas (set, get, list, delete).
    * - :doc:`trace`
      - Inspect and replay storage-level trace files.
    * - :doc:`tool`
@@ -100,5 +102,6 @@ See :doc:`/developer_guide/cli` for details.
    query
    bench
    kvcache
+   quota
    trace
    tool
diff --git a/docs/source/cli/quota.rst b/docs/source/cli/quota.rst
new file mode 100644
index 0000000000..114d5b8228
--- /dev/null
+++ b/docs/source/cli/quota.rst
@@ -0,0 +1,301 @@
+lmcache quota
+=============
+
+The ``lmcache quota`` command manages per-salt cache quotas on a running
+LMCache server. Quotas are soft limits: exceeding a quota triggers eviction
+on the next cycle (~1 s) rather than rejecting writes.
+
+.. code-block:: bash
+
+   lmcache quota <sub-command> [options]
+
+.. code-block:: text
+
+   $ lmcache quota -h
+   usage: lmcache quota [-h] {set,get,list,delete} ...
+
+   Manage per-salt cache quotas on the LMCache server.
+
+   subcommands:
+     set            Create or update a quota for a cache_salt
+     get            Show the quota and current usage for a cache_salt
+     list           List all registered quotas and their usage
+     delete         Remove a quota for a cache_salt
+
+   options:
+     -h, --help     show this help message and exit
+
+set
+---
+
+Create or update a quota for a given ``cache_salt``.
+
+.. code-block:: bash
+
+   lmcache quota set <salt> --limit-gb <GB> [--url <URL>]
+
+**Example:**
+
+.. code-block:: bash
+
+   $ lmcache quota set tenant1 --limit-gb 10.5
+
+   ================ Quota Set =================
+   Cache salt:                          tenant1
+   Limit (GB):                             10.5
+   Status:                                   ok
+   =============================================
+
+**Options:**
+
+.. list-table::
+   :header-rows: 1
+   :widths: 25 10 65
+
+   * - Flag
+     - Required
+     - Description
+   * - ``<salt>``
+     - Yes
+     - The ``cache_salt`` identifier. Use ``_default`` for anonymous
+       (un-salted) traffic.
+   * - ``--limit-gb``
+     - Yes
+     - Quota limit in gigabytes (non-negative float).
+   * - ``--url``
+     - No
+     - LMCache HTTP server URL (default: ``http://localhost:8080``).
+   * - ``--format``
+     - No
+     - Output format: ``terminal`` (default) or ``json``.
+   * - ``--output``
+     - No
+     - Save output to a file (uses the format chosen by ``--format``).
+   * - ``-q`` / ``--quiet``
+     - No
+     - Suppress stdout output. Exit code only.
+
+get
+---
+
+Show the current quota limit and live usage for a specific ``cache_salt``.
+
+.. code-block:: bash
+
+   lmcache quota get <salt> [--url <URL>]
+
+**Example:**
+
+.. code-block:: bash
+
+   $ lmcache quota get tenant1
+
+   ================ Quota Info ================
+   Cache salt:                          tenant1
+   Limit (GB):                             10.5
+   Current usage (GB):                     3.27
+   Exists:                                 True
+   =============================================
+
+**Options:**
+
+.. list-table::
+   :header-rows: 1
+   :widths: 25 10 65
+
+   * - Flag
+     - Required
+     - Description
+   * - ``<salt>``
+     - Yes
+     - The ``cache_salt`` identifier.
+   * - ``--url``
+     - No
+     - LMCache HTTP server URL (default: ``http://localhost:8080``).
+   * - ``--format``
+     - No
+     - Output format: ``terminal`` (default) or ``json``.
+   * - ``--output``
+     - No
+     - Save output to a file.
+   * - ``-q`` / ``--quiet``
+     - No
+     - Suppress stdout output.
+
+list
+----
+
+List all registered quotas along with their current usage.
+
+.. code-block:: bash
+
+   lmcache quota list [--url <URL>]
+
+**Example:**
+
+.. code-block:: bash
+
+   $ lmcache quota list
+
+   =================== Quota List ====================
+   --- Salt: tenant1 ---
+   Cache salt:                               tenant1
+   Limit (GB):                                  10.5
+   Current usage (GB):                          3.27
+   --- Salt: _default ---
+   Cache salt:                              _default
+   Limit (GB):                                   5.0
+   Current usage (GB):                          1.82
+   ===================================================
+
+**JSON output:**
+
+.. code-block:: bash
+
+   $ lmcache quota list --format json
+   {
+     "title": "Quota List",
+     "sections": {
+       "quota_0": {
+         "label": "Salt: tenant1",
+         "metrics": {
+           "cache_salt": "tenant1",
+           "limit_gb": 10.5,
+           "current_usage_gb": 3.27
+         }
+       },
+       "quota_1": {
+         "label": "Salt: _default",
+         "metrics": {
+           "cache_salt": "_default",
+           "limit_gb": 5.0,
+           "current_usage_gb": 1.82
+         }
+       }
+     }
+   }
+
+**Options:**
+
+.. list-table::
+   :header-rows: 1
+   :widths: 25 10 65
+
+   * - Flag
+     - Required
+     - Description
+   * - ``--url``
+     - No
+     - LMCache HTTP server URL (default: ``http://localhost:8080``).
+   * - ``--format``
+     - No
+     - Output format: ``terminal`` (default) or ``json``.
+   * - ``--output``
+     - No
+     - Save output to a file.
+   * - ``-q`` / ``--quiet``
+     - No
+     - Suppress stdout output.
+
+delete
+------
+
+Remove a quota entry for a given ``cache_salt``. Any bytes still cached
+under this salt become over-budget on the next eviction cycle and will be
+evicted (effective limit drops to 0).
+
+.. code-block:: bash
+
+   lmcache quota delete <salt> [--url <URL>]
+
+**Example:**
+
+.. code-block:: bash
+
+   $ lmcache quota delete tenant1
+
+   ============== Quota Delete ===============
+   Cache salt:                        tenant1
+   Status:                            removed
+   ===========================================
+
+**Options:**
+
+.. list-table::
+   :header-rows: 1
+   :widths: 25 10 65
+
+   * - Flag
+     - Required
+     - Description
+   * - ``<salt>``
+     - Yes
+     - The ``cache_salt`` identifier.
+   * - ``--url``
+     - No
+     - LMCache HTTP server URL (default: ``http://localhost:8080``).
+   * - ``--format``
+     - No
+     - Output format: ``terminal`` (default) or ``json``.
+   * - ``--output``
+     - No
+     - Save output to a file.
+   * - ``-q`` / ``--quiet``
+     - No
+     - Suppress stdout output.
+
+Exit Codes
+----------
+
+.. list-table::
+   :header-rows: 1
+   :widths: 15 85
+
+   * - Code
+     - Meaning
+   * - ``0``
+     - Success.
+   * - ``1``
+     - Error (connection failure, server error, bad arguments).
+
+The ``_default`` Salt
+---------------------
+
+The LMCache server uses an empty string (``""``) as the ``cache_salt`` for
+anonymous / un-salted traffic. Since empty strings cannot appear in URL path
+parameters, the HTTP API (and this CLI) uses the sentinel ``_default`` in
+its place.
+
+.. code-block:: bash
+
+   # Set a 5 GB quota for anonymous traffic
+   lmcache quota set _default --limit-gb 5.0
+
+   # Check usage
+   lmcache quota get _default
+
+Common Patterns
+---------------
+
+**Provision quotas for multiple tenants:**
+
+.. code-block:: bash
+
+   for tenant in tenant_a tenant_b tenant_c; do
+       lmcache quota set "$tenant" --limit-gb 8.0
+   done
+
+**Monitor usage in a script:**
+
+.. code-block:: bash
+
+   USAGE=$(lmcache quota get tenant1 --format json | jq '.metrics.current_usage_gb')
+   LIMIT=$(lmcache quota get tenant1 --format json | jq '.metrics.limit_gb')
+   echo "tenant1: ${USAGE} / ${LIMIT} GB"
+
+**Revoke access (evict all cached data for a salt):**
+
+.. code-block:: bash
+
+   # Deleting the quota causes all data under this salt to be evicted
+   lmcache quota delete tenant1
diff --git a/lmcache/cli/commands/quota/__init__.py b/lmcache/cli/commands/quota/__init__.py
new file mode 100644
index 0000000000..0d2f32c1ef
--- /dev/null
+++ b/lmcache/cli/commands/quota/__init__.py
@@ -0,0 +1,91 @@
+# SPDX-License-Identifier: Apache-2.0
+"""``lmcache quota`` command — per-salt quota management.
+
+Subcommands:
+
+* ``set SALT --limit-gb N``  — create or update a quota
+* ``get SALT``               — show quota and current usage for a salt
+* ``list``                   — list all registered quotas
+* ``delete SALT``            — remove a quota entry
+"""
+
+# Standard
+import argparse
+import sys
+
+# First Party
+from lmcache.cli.commands.base import BaseCommand
+from lmcache.cli.commands.quota.delete_command import (
+    register_delete_parser,
+    run_quota_delete,
+)
+from lmcache.cli.commands.quota.get_command import (
+    register_get_parser,
+    run_quota_get,
+)
+from lmcache.cli.commands.quota.list_command import (
+    register_list_parser,
+    run_quota_list,
+)
+from lmcache.cli.commands.quota.set_command import (
+    register_set_parser,
+    run_quota_set,
+)
+from lmcache.logging import init_logger
+
+logger = init_logger(__name__)
+
+
+class QuotaCommand(BaseCommand):
+    """CLI command for per-salt quota management on LMCache server."""
+
+    def name(self) -> str:
+        return "quota"
+
+    def help(self) -> str:
+        return "Manage per-salt cache quotas."
+
+    def add_arguments(self, _parser: argparse.ArgumentParser) -> None:
+        pass  # args registered in register() via subparsers
+
+    def register(self, subparsers: argparse._SubParsersAction) -> None:
+        """Register ``lmcache quota`` and all quota sub-subcommands.
+
+        Args:
+            subparsers: The subparsers action from the root parser.
+        """
+        parser = subparsers.add_parser(
+            self.name(),
+            help=self.help(),
+            description="Manage per-salt cache quotas on the LMCache server.",
+        )
+        inner = parser.add_subparsers(
+            dest="quota_action",
+            required=True,
+            metavar="{set,get,list,delete}",
+        )
+        register_set_parser(inner, self.execute)
+        register_get_parser(inner, self.execute)
+        register_list_parser(inner, self.execute)
+        register_delete_parser(inner, self.execute)
+
+    def execute(self, args: argparse.Namespace) -> None:
+        """Dispatch to the appropriate quota subcommand handler.
+
+        Args:
+            args: Parsed CLI arguments containing ``quota_action``.
+        """
+        handlers = {
+            "set": lambda a: run_quota_set(self, a),
+            "get": lambda a: run_quota_get(self, a),
+            "list": lambda a: run_quota_list(self, a),
+            "delete": lambda a: run_quota_delete(self, a),
+        }
+        handler = handlers.get(args.quota_action)
+        if handler is None:
+            print(
+                f"Unknown quota action: {args.quota_action}",
+                file=sys.stderr,
+            )
+            sys.exit(1)
+        handler(args)
diff --git a/lmcache/cli/commands/quota/delete_command.py b/lmcache/cli/commands/quota/delete_command.py
new file mode 100644
index 0000000000..873186453a
--- /dev/null
+++ b/lmcache/cli/commands/quota/delete_command.py
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: Apache-2.0
+"""``lmcache quota delete`` — remove a quota for a cache_salt."""
+
+# Standard
+from typing import TYPE_CHECKING
+import argparse
+
+# First Party
+from lmcache.cli.commands.base import _add_output_args
+from lmcache.cli.commands.quota.helpers import (
+    escape_salt,
+    http_request,
+    normalize_url,
+)
+
+if TYPE_CHECKING:
+    # First Party
+    from lmcache.cli.commands.base import BaseCommand
+
+
+def register_delete_parser(
+    subparsers: argparse._SubParsersAction,
+    dispatch_func,
+) -> argparse.ArgumentParser:
+    """Register the ``lmcache quota delete`` subcommand parser.
+
+    Args:
+        subparsers: The ``quota`` subparsers action.
+        dispatch_func: Function to bind via ``set_defaults(func=...)``.
+
+    Returns:
+        The created ``ArgumentParser``.
+    """
+    parser = subparsers.add_parser(
+        "delete",
+        help="Remove a quota for a cache_salt.",
+        description=(
+            "Delete the quota entry for a given cache_salt. Any bytes "
+            "still cached under this salt become over-budget on the "
+            "next eviction cycle and will be evicted."
+        ),
+    )
+    parser.add_argument(
+        "salt",
+        type=str,
+        help=(
+            "The cache_salt identifier. Use '_default' for anonymous "
+            "(un-salted) traffic."
+        ),
+    )
+    parser.add_argument(
+        "--url",
+        type=str,
+        default="http://localhost:8080",
+        help="LMCache HTTP server URL (default: http://localhost:8080).",
+    )
+    _add_output_args(parser)
+    parser.set_defaults(func=dispatch_func)
+    return parser
+
+
+def run_quota_delete(cmd: "BaseCommand", args: argparse.Namespace) -> None:
+    """Execute the ``lmcache quota delete`` subcommand.
+
+    Args:
+        cmd: The parent command instance (for metrics creation).
+        args: Parsed CLI arguments.
+    """
+    base_url = normalize_url(args.url)
+    salt = escape_salt(args.salt)
+
+    result = http_request("DELETE", f"{base_url}/quota/{salt}")
+
+    metrics = cmd.create_metrics("Quota Delete", args)
+    metrics.add("cache_salt", "Cache salt", result.get("cache_salt", salt))
+    metrics.add("status", "Status", result.get("status", "unknown"))
+    metrics.emit()
diff --git a/lmcache/cli/commands/quota/get_command.py b/lmcache/cli/commands/quota/get_command.py
new file mode 100644
index 0000000000..d179e022dc
--- /dev/null
+++ b/lmcache/cli/commands/quota/get_command.py
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: Apache-2.0
+"""``lmcache quota get`` — read the quota and usage for a cache_salt."""
+
+# Standard
+from typing import TYPE_CHECKING
+import argparse
+
+# First Party
+from lmcache.cli.commands.base import _add_output_args
+from lmcache.cli.commands.quota.helpers import (
+    escape_salt,
+    http_request,
+    normalize_url,
+)
+
+if TYPE_CHECKING:
+    # First Party
+    from lmcache.cli.commands.base import BaseCommand
+
+
+def register_get_parser(
+    subparsers: argparse._SubParsersAction,
+    dispatch_func,
+) -> argparse.ArgumentParser:
+    """Register the ``lmcache quota get`` subcommand parser.
+
+    Args:
+        subparsers: The ``quota`` subparsers action.
+        dispatch_func: Function to bind via ``set_defaults(func=...)``.
+
+    Returns:
+        The created ``ArgumentParser``.
+    """
+    parser = subparsers.add_parser(
+        "get",
+        help="Show the quota and current usage for a cache_salt.",
+        description=(
+            "Query the current quota limit and live usage for a "
+            "specific cache_salt on the LMCache server."
+        ),
+    )
+    parser.add_argument(
+        "salt",
+        type=str,
+        help=(
+            "The cache_salt identifier. Use '_default' for anonymous "
+            "(un-salted) traffic."
+        ),
+    )
+    parser.add_argument(
+        "--url",
+        type=str,
+        default="http://localhost:8080",
+        help="LMCache HTTP server URL (default: http://localhost:8080).",
+    )
+    _add_output_args(parser)
+    parser.set_defaults(func=dispatch_func)
+    return parser
+
+
+def run_quota_get(cmd: "BaseCommand", args: argparse.Namespace) -> None:
+    """Execute the ``lmcache quota get`` subcommand.
+
+    Args:
+        cmd: The parent command instance (for metrics creation).
+        args: Parsed CLI arguments.
+    """
+    base_url = normalize_url(args.url)
+    salt = escape_salt(args.salt)
+
+    result = http_request("GET", f"{base_url}/quota/{salt}")
+
+    metrics = cmd.create_metrics("Quota Info", args)
+    metrics.add("cache_salt", "Cache salt", result.get("cache_salt", salt))
+    metrics.add("limit_gb", "Limit (GB)", result.get("limit_gb"))
+    metrics.add(
+        "current_usage_gb", "Current usage (GB)", result.get("current_usage_gb")
+    )
+    metrics.add("exists", "Exists", result.get("exists"))
+    metrics.emit()
diff --git a/lmcache/cli/commands/quota/helpers.py b/lmcache/cli/commands/quota/helpers.py
new file mode 100644
index 0000000000..4d64ee6e15
--- /dev/null
+++ b/lmcache/cli/commands/quota/helpers.py
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Shared helpers for ``lmcache quota`` subcommands."""
+
+# Standard
+from typing import Any, Optional
+import json
+import sys
+import urllib.error
+import urllib.request
+
+# First Party
+from lmcache.logging import init_logger
+
+logger = init_logger(__name__)
+
+# The MP HTTP server uses "_default" as a sentinel for the empty-string
+# cache_salt (anonymous / un-salted traffic).
+DEFAULT_SALT_SENTINEL = "_default"
+
+
+def normalize_url(url: str) -> str:
+    """Ensure *url* has an ``http://`` or ``https://`` scheme."""
+    if not url.startswith(("http://", "https://")):
+        url = f"http://{url}"
+    return url.rstrip("/")
+
+
+def escape_salt(salt: str) -> str:
+    """Translate the empty-string salt to the URL sentinel."""
+    return DEFAULT_SALT_SENTINEL if salt == "" else salt
+
+
+def unescape_salt(salt: str) -> str:
+    """Translate the URL sentinel back to the empty-string salt."""
+    return "" if salt == DEFAULT_SALT_SENTINEL else salt
+
+
+def http_request(
+    method: str,
+    url: str,
+    data: Optional[dict[str, Any]] = None,
+    timeout: int = 10,
+) -> dict[str, Any]:
+    """Send an HTTP request and return the parsed JSON response.
+
+    Args:
+        method: HTTP method (GET, POST, PUT, DELETE).
+        url: Full URL to request.
+        data: Optional JSON body to send.
+        timeout: HTTP timeout in seconds.
+
+    Returns:
+        Parsed JSON response as a dict.
+
+    Raises:
+        SystemExit: On connection error or non-2xx HTTP response.
+    """
+    body = None
+    headers: dict[str, str] = {}
+    if data is not None:
+        body = json.dumps(data).encode()
+        headers["Content-Type"] = "application/json"
+
+    req = urllib.request.Request(url, data=body, headers=headers, method=method)
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            return json.loads(resp.read().decode())
+    except urllib.error.HTTPError as e:
+        try:
+            error_body = json.loads(e.read().decode())
+            msg = error_body.get("error") or error_body.get("message") or str(e)
+        except (json.JSONDecodeError, ValueError, OSError):
+            msg = str(e)
+        logger.error("Server error: %s", msg)
+        sys.exit(1)
+    except urllib.error.URLError as e:
+        logger.error("Cannot reach %s — is the server running? (%s)", url, e.reason)
+        sys.exit(1)
diff --git a/lmcache/cli/commands/quota/list_command.py b/lmcache/cli/commands/quota/list_command.py
new file mode 100644
index 0000000000..94316668d6
--- /dev/null
+++ b/lmcache/cli/commands/quota/list_command.py
@@ -0,0 +1,76 @@
+# SPDX-License-Identifier: Apache-2.0
+"""``lmcache quota list`` — list all registered quotas and their usage."""
+
+# Standard
+from typing import TYPE_CHECKING
+import argparse
+
+# First Party
+from lmcache.cli.commands.base import _add_output_args
+from lmcache.cli.commands.quota.helpers import http_request, normalize_url
+
+if TYPE_CHECKING:
+    # First Party
+    from lmcache.cli.commands.base import BaseCommand
+
+
+def register_list_parser(
+    subparsers: argparse._SubParsersAction,
+    dispatch_func,
+) -> argparse.ArgumentParser:
+    """Register the ``lmcache quota list`` subcommand parser.
+
+    Args:
+        subparsers: The ``quota`` subparsers action.
+        dispatch_func: Function to bind via ``set_defaults(func=...)``.
+
+    Returns:
+        The created ``ArgumentParser``.
+    """
+    parser = subparsers.add_parser(
+        "list",
+        help="List all registered quotas and their usage.",
+        description=(
+            "Retrieve all per-salt quotas from the LMCache server "
+            "along with their current usage."
+        ),
+    )
+    parser.add_argument(
+        "--url",
+        type=str,
+        default="http://localhost:8080",
+        help="LMCache HTTP server URL (default: http://localhost:8080).",
+    )
+    _add_output_args(parser)
+    parser.set_defaults(func=dispatch_func)
+    return parser
+
+
+def run_quota_list(cmd: "BaseCommand", args: argparse.Namespace) -> None:
+    """Execute the ``lmcache quota list`` subcommand.
+
+    Args:
+        cmd: The parent command instance (for metrics creation).
+        args: Parsed CLI arguments.
+    """
+    base_url = normalize_url(args.url)
+
+    result = http_request("GET", f"{base_url}/quota")
+    users = result.get("users", {})
+
+    metrics = cmd.create_metrics("Quota List", args, width=55)
+
+    if not users:
+        metrics.add("info", "Info", "No quotas configured")
+        metrics.emit()
+        return
+
+    for idx, (salt, info) in enumerate(users.items()):
+        section_key = f"quota_{idx}"
+        metrics.add_list_section("quotas", section_key, f"Salt: {salt}")
+        sec = metrics[section_key]
+        sec.add("cache_salt", "Cache salt", salt)
+        sec.add("limit_gb", "Limit (GB)", info.get("limit_gb"))
+        sec.add("current_usage_gb", "Current usage (GB)", info.get("current_usage_gb"))
+
+    metrics.emit()
diff --git a/lmcache/cli/commands/quota/set_command.py b/lmcache/cli/commands/quota/set_command.py
new file mode 100644
index 0000000000..a84121e57c
--- /dev/null
+++ b/lmcache/cli/commands/quota/set_command.py
@@ -0,0 +1,90 @@
+# SPDX-License-Identifier: Apache-2.0
+"""``lmcache quota set`` — create or update a quota for a cache_salt."""
+
+# Standard
+from typing import TYPE_CHECKING
+import argparse
+
+# First Party
+from lmcache.cli.commands.base import _add_output_args
+from lmcache.cli.commands.quota.helpers import (
+    escape_salt,
+    http_request,
+    normalize_url,
+)
+
+if TYPE_CHECKING:
+    # First Party
+    from lmcache.cli.commands.base import BaseCommand
+
+
+def register_set_parser(
+    subparsers: argparse._SubParsersAction,
+    dispatch_func,
+) -> argparse.ArgumentParser:
+    """Register the ``lmcache quota set`` subcommand parser.
+
+    Args:
+        subparsers: The ``quota`` subparsers action.
+        dispatch_func: Function to bind via ``set_defaults(func=...)``.
+
+    Returns:
+        The created ``ArgumentParser``.
+    """
+    parser = subparsers.add_parser(
+        "set",
+        help="Create or update a quota for a cache_salt.",
+        description=(
+            "Set a per-salt quota (in GB) on the LMCache server. "
+            "The quota is soft: exceeding it triggers eviction on the "
+            "next cycle rather than rejecting writes."
+        ),
+    )
+    parser.add_argument(
+        "salt",
+        type=str,
+        help=(
+            "The cache_salt identifier. Use '_default' for anonymous "
+            "(un-salted) traffic."
+        ),
+    )
+    parser.add_argument(
+        "--limit-gb",
+        type=float,
+        required=True,
+        metavar="GB",
+        help="Quota limit in gigabytes (non-negative).",
+    )
+    parser.add_argument(
+        "--url",
+        type=str,
+        default="http://localhost:8080",
+        help="LMCache HTTP server URL (default: http://localhost:8080).",
+    )
+    _add_output_args(parser)
+    parser.set_defaults(func=dispatch_func)
+    return parser
+
+
+def run_quota_set(cmd: "BaseCommand", args: argparse.Namespace) -> None:
+    """Execute the ``lmcache quota set`` subcommand.
+
+    Args:
+        cmd: The parent command instance (for metrics creation).
+        args: Parsed CLI arguments.
+    """
+    base_url = normalize_url(args.url)
+    salt = escape_salt(args.salt)
+    limit_gb = args.limit_gb
+
+    result = http_request(
+        "PUT",
+        f"{base_url}/quota/{salt}",
+        data={"limit_gb": limit_gb},
+    )
+
+    metrics = cmd.create_metrics("Quota Set", args)
+    metrics.add("cache_salt", "Cache salt", result.get("cache_salt", salt))
+    metrics.add("limit_gb", "Limit (GB)", result.get("limit_gb", limit_gb))
+    metrics.add("status", "Status", result.get("status", "ok"))
+    metrics.emit()
diff --git a/tests/cli/commands/test_quota.py b/tests/cli/commands/test_quota.py
new file mode 100644
index 0000000000..f279e105d4
--- /dev/null
+++ b/tests/cli/commands/test_quota.py
@@ -0,0 +1,158 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for the ``lmcache quota`` CLI command."""
+
+# Standard
+from unittest.mock import MagicMock, patch
+import argparse
+import sys
+
+# Inject a fake openai module so that the auto-discovery of
+# lmcache.cli.commands (which imports bench -> openai) does not fail
+# in environments where openai is not installed.
+if "openai" not in sys.modules:
+    _fake_openai = MagicMock()
+    sys.modules["openai"] = _fake_openai
+
+# Third Party
+import pytest
+
+# First Party
+from lmcache.cli.commands.quota import QuotaCommand
+from lmcache.cli.commands.quota.helpers import (
+    DEFAULT_SALT_SENTINEL,
+    escape_salt,
+    normalize_url,
+    unescape_salt,
+)
+
+
+@pytest.fixture
+def cmd() -> QuotaCommand:
+    return QuotaCommand()
+
+
+@pytest.fixture
+def parser(cmd: QuotaCommand) -> argparse.ArgumentParser:
+    """An ArgumentParser with QuotaCommand registered."""
+    p = argparse.ArgumentParser()
+    sub = p.add_subparsers(dest="command")
+    cmd.register(sub)
+    return p
+
+
+class TestHelpers:
+    def test_normalize_url(self) -> None:
+        assert normalize_url("localhost:8080") == "http://localhost:8080"
+        assert normalize_url("https://host:443/") == "https://host:443"
+
+    def test_escape_unescape_salt(self) -> None:
+        assert escape_salt("") == DEFAULT_SALT_SENTINEL
+        assert escape_salt("tenant1") == "tenant1"
+        assert unescape_salt(DEFAULT_SALT_SENTINEL) == ""
+        assert unescape_salt("tenant1") == "tenant1"
+
+
+class TestQuotaCommandMetadata:
+    def test_name_and_help(self, cmd: QuotaCommand) -> None:
+        assert cmd.name() == "quota"
+        assert "quota" in cmd.help().lower()
+
+
+class TestQuotaCommandExecute:
+    @patch("lmcache.cli.commands.quota.set_command.http_request")
+    def test_set(
+        self,
+        mock_http,
+        cmd,
+        parser,
+        capsys,
+    ) -> None:
+        mock_http.return_value = {
+            "cache_salt": "tenant1",
+            "limit_gb": 10.5,
+            "status": "ok",
+        }
+        args = parser.parse_args(["quota", "set", "tenant1", "--limit-gb", "10.5"])
+        cmd.execute(args)
+
+        mock_http.assert_called_once_with(
+            "PUT",
+            "http://localhost:8080/quota/tenant1",
+            data={"limit_gb": 10.5},
+        )
+        out = capsys.readouterr().out
+        assert "Quota Set" in out and "tenant1" in out
+
+    @patch("lmcache.cli.commands.quota.get_command.http_request")
+    def test_get(
+        self,
+        mock_http,
+        cmd,
+        parser,
+        capsys,
+    ) -> None:
+        mock_http.return_value = {
+            "cache_salt": "tenant1",
+            "limit_gb": 10.5,
+            "current_usage_gb": 3.27,
+            "exists": True,
+        }
+        args = parser.parse_args(["quota", "get", "tenant1"])
+        cmd.execute(args)
+
+        mock_http.assert_called_once_with("GET", "http://localhost:8080/quota/tenant1")
+        out = capsys.readouterr().out
+        assert "Quota Info" in out and "3.27" in out
+
+    @patch("lmcache.cli.commands.quota.list_command.http_request")
+    def test_list(
+        self,
+        mock_http,
+        cmd,
+        parser,
+        capsys,
+    ) -> None:
+        mock_http.return_value = {
+            "users": {
+                "tenant1": {"limit_gb": 10.5, "current_usage_gb": 3.27},
+                "_default": {"limit_gb": 5.0, "current_usage_gb": 1.82},
+            }
+        }
+        args = parser.parse_args(["quota", "list"])
+        cmd.execute(args)
+
+        mock_http.assert_called_once_with("GET", "http://localhost:8080/quota")
+        out = capsys.readouterr().out
+        assert "tenant1" in out and "_default" in out
+
+    @patch("lmcache.cli.commands.quota.delete_command.http_request")
+    def test_delete(
+        self,
+        mock_http,
+        cmd,
+        parser,
+        capsys,
+    ) -> None:
+        mock_http.return_value = {"cache_salt": "tenant1", "status": "removed"}
+        args = parser.parse_args(["quota", "delete", "tenant1"])
+        cmd.execute(args)
+
+        mock_http.assert_called_once_with(
+            "DELETE",
+            "http://localhost:8080/quota/tenant1",
+        )
+        out = capsys.readouterr().out
+        assert "Quota Delete" in out and "removed" in out
+
+    @patch("lmcache.cli.commands.quota.set_command.http_request")
+    def test_quiet_suppresses_output(
+        self,
+        mock_http,
+        cmd,
+        parser,
+        capsys,
+    ) -> None:
+        mock_http.return_value = {"cache_salt": "t1", "limit_gb": 1.0, "status": "ok"}
+        args = parser.parse_args(["quota", "set", "t1", "--limit-gb", "1", "-q"])
+        cmd.execute(args)
+        assert capsys.readouterr().out == ""

From 045a0a90cfaddd3383583d6191495700af9fe95d Mon Sep 17 00:00:00 2001
From: deng451e <57919305+deng451e@users.noreply.github.com>
Date: Thu, 11 Jun 2026 10:50:51 -0700
Subject: [PATCH 44/57] [CI] cu129 images: pin vllm to the cu129 index (drop
 unsafe-best-match) (#3621)

Signed-off-by: deng451e <838677410@qq.com>
---
 docker/Dockerfile            | 9 ++++-----
 docker/Dockerfile.standalone | 1 -
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 629915875d..2719f03ad0 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -84,13 +84,12 @@ RUN --mount=type=cache,target=/root/.cache/ccache,id=ccache \
         VLLM_PRECOMPILED_WHEEL_VARIANT=${CUDA_TAG} uv pip install --prerelease=allow \
             'vllm[runai,tensorizer,flashinfer]' \
             --extra-index-url https://wheels.vllm.ai/nightly/${CUDA_TAG} \
-            --extra-index-url https://download.pytorch.org/whl/${CUDA_TAG} \
-            --index-strategy unsafe-best-match ; \
+            --extra-index-url https://download.pytorch.org/whl/${CUDA_TAG} ; \
     else \
         VLLM_PRECOMPILED_WHEEL_VARIANT=${CUDA_TAG} uv pip install --prerelease=allow \
             "vllm[runai,tensorizer,flashinfer]==${VLLM_VERSION}" ; \
     fi && \
-    python3 -c 'import torch; print("TORCH=", torch.__version__)' && \
+    python3 -c 'import vllm._C; import torch; print("TORCH=", torch.__version__)' && \
     python3 setup.py bdist_wheel --dist-dir=dist_lmcache && \
     uv pip install ./dist_lmcache/*.whl --verbose
 
@@ -136,8 +135,8 @@ RUN . /opt/venv/bin/activate && \
     VLLM_PRECOMPILED_WHEEL_VARIANT=${CUDA_TAG} uv pip install --prerelease=allow \
         vllm[runai,tensorizer,flashinfer] \
         --extra-index-url https://wheels.vllm.ai/nightly/${CUDA_TAG} \
-        --extra-index-url https://download.pytorch.org/whl/${CUDA_TAG} \
-        --index-strategy unsafe-best-match && \
+        --extra-index-url https://download.pytorch.org/whl/${CUDA_TAG} && \
+    python3 -c 'import vllm._C' && \
     uv pip install lmcache==${VER} \
         --extra-index-url https://download.pytorch.org/whl/${CUDA_TAG} \
         --find-links https://github.com/LMCache/LMCache/releases/expanded_assets/v${VER}-cu129 \
diff --git a/docker/Dockerfile.standalone b/docker/Dockerfile.standalone
index 5899b0db90..b5ed866420 100644
--- a/docker/Dockerfile.standalone
+++ b/docker/Dockerfile.standalone
@@ -70,7 +70,6 @@ RUN --mount=type=cache,target=/root/.cache/uv,id=uv-resolver \
         uv pip compile /tmp/req.in --quiet --prerelease=allow \
             --extra-index-url https://wheels.vllm.ai/nightly/${CUDA_TAG} \
             --extra-index-url https://download.pytorch.org/whl/${CUDA_TAG} \
-            --index-strategy unsafe-best-match \
             > /tmp/resolved.txt ; \
     else \
         echo "vllm==${VLLM_VERSION}" > /tmp/req.in && \

From e115420496a33d9356b71e95d681046c9e1b9617 Mon Sep 17 00:00:00 2001
From: aeon-x <talexcao@gmail.com>
Date: Thu, 11 Jun 2026 10:53:37 -0700
Subject: [PATCH 45/57] fix UTs

Signed-off-by: aeon-x <talexcao@gmail.com>
---
 tests/v1/distributed/test_dax_l2_adapter.py                | 2 +-
 tests/v1/distributed/test_hfbucket_l2_adapter.py           | 2 +-
 tests/v1/distributed/test_l2_adapter_base.py               | 2 +-
 tests/v1/distributed/test_mock_l2_adapter.py               | 2 +-
 tests/v1/distributed/test_nixl_store_dynamic_l2_adapter.py | 2 +-
 tests/v1/distributed/test_nixl_store_l2_adapter.py         | 2 +-
 tests/v1/distributed/test_raw_block_l2_adapter.py          | 4 ++--
 tests/v1/distributed/test_s3_l2_adapter.py                 | 2 +-
 8 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/v1/distributed/test_dax_l2_adapter.py b/tests/v1/distributed/test_dax_l2_adapter.py
index b8c5d72128..2222db73d1 100644
--- a/tests/v1/distributed/test_dax_l2_adapter.py
+++ b/tests/v1/distributed/test_dax_l2_adapter.py
@@ -48,7 +48,7 @@ def __init__(self):
         self.accessed: list[list[ObjectKey]] = []
         self.deleted: list[list[ObjectKey]] = []
 
-    def on_l2_keys_stored(self, keys: list[ObjectKey]):
+    def on_l2_keys_stored(self, keys: list[ObjectKey], sizes: list[int]):
         self.stored.append(list(keys))
 
     def on_l2_keys_accessed(self, keys: list[ObjectKey]):
diff --git a/tests/v1/distributed/test_hfbucket_l2_adapter.py b/tests/v1/distributed/test_hfbucket_l2_adapter.py
index 7647be7c9e..a10b6b75f1 100644
--- a/tests/v1/distributed/test_hfbucket_l2_adapter.py
+++ b/tests/v1/distributed/test_hfbucket_l2_adapter.py
@@ -195,7 +195,7 @@ def __init__(self) -> None:
         self.accessed: list[list[ObjectKey]] = []
         self.deleted: list[list[ObjectKey]] = []
 
-    def on_l2_keys_stored(self, keys):
+    def on_l2_keys_stored(self, keys, sizes):
         self.stored.append(list(keys))
 
     def on_l2_keys_accessed(self, keys):
diff --git a/tests/v1/distributed/test_l2_adapter_base.py b/tests/v1/distributed/test_l2_adapter_base.py
index dbaf10410d..c8e7ec1692 100644
--- a/tests/v1/distributed/test_l2_adapter_base.py
+++ b/tests/v1/distributed/test_l2_adapter_base.py
@@ -284,7 +284,7 @@ def __init__(self) -> None:
         self.accessed: list[list[ObjectKey]] = []
         self.deleted: list[list[ObjectKey]] = []
 
-    def on_l2_keys_stored(self, keys: list[ObjectKey]) -> None:
+    def on_l2_keys_stored(self, keys: list[ObjectKey], sizes: list[int]) -> None:
         self.stored.append(list(keys))
 
     def on_l2_keys_accessed(self, keys: list[ObjectKey]) -> None:
diff --git a/tests/v1/distributed/test_mock_l2_adapter.py b/tests/v1/distributed/test_mock_l2_adapter.py
index b3180c1a29..c786279be4 100644
--- a/tests/v1/distributed/test_mock_l2_adapter.py
+++ b/tests/v1/distributed/test_mock_l2_adapter.py
@@ -37,7 +37,7 @@ def __init__(self):
         self.accessed: list[list[ObjectKey]] = []
         self.deleted: list[list[ObjectKey]] = []
 
-    def on_l2_keys_stored(self, keys: list[ObjectKey]):
+    def on_l2_keys_stored(self, keys: list[ObjectKey], sizes: list[int]):
         self.stored.append(list(keys))
 
     def on_l2_keys_accessed(self, keys: list[ObjectKey]):
diff --git a/tests/v1/distributed/test_nixl_store_dynamic_l2_adapter.py b/tests/v1/distributed/test_nixl_store_dynamic_l2_adapter.py
index 59a4c92434..381c833081 100644
--- a/tests/v1/distributed/test_nixl_store_dynamic_l2_adapter.py
+++ b/tests/v1/distributed/test_nixl_store_dynamic_l2_adapter.py
@@ -46,7 +46,7 @@ def __init__(self):
         self.accessed: list[list[ObjectKey]] = []
         self.deleted: list[list[ObjectKey]] = []
 
-    def on_l2_keys_stored(self, keys: list[ObjectKey]):
+    def on_l2_keys_stored(self, keys: list[ObjectKey], sizes: list[int]):
         self.stored.append(list(keys))
 
     def on_l2_keys_accessed(self, keys: list[ObjectKey]):
diff --git a/tests/v1/distributed/test_nixl_store_l2_adapter.py b/tests/v1/distributed/test_nixl_store_l2_adapter.py
index fed9b40d2f..125e54bc1f 100644
--- a/tests/v1/distributed/test_nixl_store_l2_adapter.py
+++ b/tests/v1/distributed/test_nixl_store_l2_adapter.py
@@ -38,7 +38,7 @@ def __init__(self):
         self.accessed: list[list[ObjectKey]] = []
         self.deleted: list[list[ObjectKey]] = []
 
-    def on_l2_keys_stored(self, keys: list[ObjectKey]):
+    def on_l2_keys_stored(self, keys: list[ObjectKey], sizes: list[int]):
         self.stored.append(list(keys))
 
     def on_l2_keys_accessed(self, keys: list[ObjectKey]):
diff --git a/tests/v1/distributed/test_raw_block_l2_adapter.py b/tests/v1/distributed/test_raw_block_l2_adapter.py
index db3102e959..7242da6434 100644
--- a/tests/v1/distributed/test_raw_block_l2_adapter.py
+++ b/tests/v1/distributed/test_raw_block_l2_adapter.py
@@ -49,7 +49,7 @@ def __init__(self):
         self.accessed: list[list[ObjectKey]] = []
         self.deleted: list[list[ObjectKey]] = []
 
-    def on_l2_keys_stored(self, keys: list[ObjectKey]):
+    def on_l2_keys_stored(self, keys: list[ObjectKey], sizes: list[int]):
         self.stored.append(list(keys))
 
     def on_l2_keys_accessed(self, keys: list[ObjectKey]):
@@ -60,7 +60,7 @@ def on_l2_keys_deleted(self, keys: list[ObjectKey]):
 
 
 class _FailingListener(L2AdapterListener):
-    def on_l2_keys_stored(self, keys: list[ObjectKey]):
+    def on_l2_keys_stored(self, keys: list[ObjectKey], sizes: list[int]):
         del keys
         raise RuntimeError("store listener failed")
 
diff --git a/tests/v1/distributed/test_s3_l2_adapter.py b/tests/v1/distributed/test_s3_l2_adapter.py
index 094c07fe4c..8fe2f3a2ec 100644
--- a/tests/v1/distributed/test_s3_l2_adapter.py
+++ b/tests/v1/distributed/test_s3_l2_adapter.py
@@ -308,7 +308,7 @@ def __init__(self):
         self.accessed: list[list[ObjectKey]] = []
         self.deleted: list[list[ObjectKey]] = []
 
-    def on_l2_keys_stored(self, keys):
+    def on_l2_keys_stored(self, keys, sizes):
         self.stored.append(list(keys))
 
     def on_l2_keys_accessed(self, keys):

From 7bdb8fd59c97f8090b45efe3bb04a004afe2bc98 Mon Sep 17 00:00:00 2001
From: aeon-x <talexcao@gmail.com>
Date: Thu, 11 Jun 2026 12:08:03 -0700
Subject: [PATCH 46/57] fix UT

Signed-off-by: aeon-x <talexcao@gmail.com>
---
 lmcache/v1/mp_coordinator/http_apis/l2_api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lmcache/v1/mp_coordinator/http_apis/l2_api.py b/lmcache/v1/mp_coordinator/http_apis/l2_api.py
index 7312605a84..d94f9a9562 100644
--- a/lmcache/v1/mp_coordinator/http_apis/l2_api.py
+++ b/lmcache/v1/mp_coordinator/http_apis/l2_api.py
@@ -92,7 +92,7 @@ def _eviction_manager(request: Request) -> L2EvictionManager:
 # -- Quota writes ------------------------------------------------------------
 
 
-@router.put("/l2/quota/{cache_salt}")
+@router.put("/l2/quota/{cache_salt}", response_model=None)
 async def set_quota(
     cache_salt: str, body: SetQuotaRequest, request: Request
 ) -> QuotaResponse | JSONResponse:

From 5abbb704d2b7db265512c623bb64366e208460cb Mon Sep 17 00:00:00 2001
From: An Nguyen <annenguyen@google.com>
Date: Thu, 11 Jun 2026 17:47:41 -0400
Subject: [PATCH 47/57] feat: Integrate native Cloud Bigtable remote storage
 connector (#3404)

* feat: add Google Cloud Bigtable remote storage connector

- Integrate thread-safe gRPC AsyncPQExecutor for Point-Reads and Batched Mutations

- Implement numerical-precision partial chunk reshaping falling back to CPU recompute

- Incorporate 20MB MutateRow request payload thresholds and 10s TTLCache local shielding

- Include 100% portable dynamic gapic mock-isolated PyTest unit test coverage

- Append comprehensive user guide and integration documentation

- Add google-cloud-bigtable SDK to common requirements to resolve upstream GitHub Actions CI ImportErrors

- Implement graceful FileNotFoundError fallback for credentials_path to ensure 100% Buildkite K3 CI pipeline resilience

- Wrap all internal logger and warning strings to strictly comply with LMCache's 88-character ruff cap

Signed-off-by: An Nguyen <annenguyen@google.com>

Y

* update docs & address comment

Signed-off-by: An Nguyen <annenguyen@google.com>

* add bigtable bench

Signed-off-by: An Nguyen <annenguyen@google.com>

* fix mypy type checking for bigtable connector mock namespace

Signed-off-by: An Nguyen <annenguyen@google.com>

* fix: make bigtable max_chunk_size_mb default value consistent and add unit tests

Signed-off-by: An Nguyen <annenguyen@google.com>

* test: Add Bigtable Emulator integration tests and fix remove_sync context issues

Signed-off-by: An Nguyen <annenguyen@google.com>

* feat(storage): cache TableAsync and optimize remove_sync to fire-and-forget in Bigtable connector

- Lazily initialize and cache TableAsync in BigtableConnector to prevent memory registry leaks.
- Optimize remove_sync to be fire-and-forget to avoid blocking caller thread on the critical path, yielding a ~20% throughput improvement.
- Update unit tests to poll-wait for background deletion task before asserting.

Signed-off-by: An Nguyen <annenguyen@google.com>

* test: skip/reuse Bigtable emulator in CI, resolve mock pollution, and fix style lints

Signed-off-by: An Nguyen <annenguyen@google.com>

---------

Signed-off-by: An Nguyen <annenguyen@google.com>
---
 .../storage_backend_io_benchmark.py           |  48 +
 .../kv_cache/storage_backends/bigtable.rst    | 147 +++
 .../kv_cache/storage_backends/index.rst       |   1 +
 .../connector/bigtable_adapter.py             |  30 +
 .../connector/bigtable_config.py              | 136 +++
 .../connector/bigtable_connector.py           | 929 +++++++++++++++++
 .../connector/bigtable_schema.py              |  57 +
 requirements/common.txt                       |   3 +
 tests/conftest.py                             |  88 ++
 .../test_bigtable_connector.py                | 981 ++++++++++++++++++
 .../test_bigtable_integration.py              | 281 +++++
 11 files changed, 2701 insertions(+)
 create mode 100644 docs/source/kv_cache/storage_backends/bigtable.rst
 create mode 100644 lmcache/v1/storage_backend/connector/bigtable_adapter.py
 create mode 100644 lmcache/v1/storage_backend/connector/bigtable_config.py
 create mode 100644 lmcache/v1/storage_backend/connector/bigtable_connector.py
 create mode 100644 lmcache/v1/storage_backend/connector/bigtable_schema.py
 create mode 100644 tests/v1/storage_backend/test_bigtable_connector.py
 create mode 100644 tests/v1/storage_backend/test_bigtable_integration.py

diff --git a/benchmarks/storage_backend_io/storage_backend_io_benchmark.py b/benchmarks/storage_backend_io/storage_backend_io_benchmark.py
index bfcde62ebc..7fbaa5124c 100644
--- a/benchmarks/storage_backend_io/storage_backend_io_benchmark.py
+++ b/benchmarks/storage_backend_io/storage_backend_io_benchmark.py
@@ -982,6 +982,37 @@ def extra_config_keys(self) -> dict:
 
 
 # ============================================================================
+# BigtableBackendBenchmark Implementation
+# ============================================================================
+class BigtableBackendBenchmark(RemoteBackendBenchmark):
+    def __init__(
+        self,
+        num_ops: int,
+        concurrency: int,
+        remote_url: str,
+        use_odirect: bool,
+        alignment: int,
+        write_bench: bool,
+        chunk_size: int = DEFAULT_CHUNK_SIZE,
+        verify_integrity: bool = False,
+    ):
+        super().__init__(
+            "bigtable",
+            num_ops,
+            concurrency,
+            remote_url,
+            use_odirect,
+            alignment,
+            write_bench,
+            chunk_size,
+            verify_integrity,
+        )
+
+    @property
+    def extra_config_keys(self) -> dict:
+        return {}
+
+
 # Main Entry Point
 # ============================================================================
 def main() -> None:
@@ -1000,6 +1031,7 @@ def main() -> None:
             "both",
             "hf3fs_backend",
             "fs_backend",
+            "bigtable",
         ],
         default="both",
         help=(
@@ -1174,6 +1206,22 @@ def main() -> None:
         result["fs_dir"] = args.remote_url
         results.append(result)
 
+    # Run BigtableBackend benchmark
+    if args.backend in ("bigtable",):
+        bigtable_bench = BigtableBackendBenchmark(
+            num_ops=args.num_ops,
+            concurrency=args.concurrency,
+            remote_url=args.remote_url,
+            use_odirect=False,
+            alignment=args.alignment,
+            write_bench=write_bench,
+            chunk_size=args.chunk_size,
+            verify_integrity=args.verify_integrity,
+        )
+        result = bigtable_bench.run()
+        result["bigtable_table"] = args.remote_url
+        results.append(result)
+
     # Print results
     for result in results:
         print(
diff --git a/docs/source/kv_cache/storage_backends/bigtable.rst b/docs/source/kv_cache/storage_backends/bigtable.rst
new file mode 100644
index 0000000000..16416151c2
--- /dev/null
+++ b/docs/source/kv_cache/storage_backends/bigtable.rst
@@ -0,0 +1,147 @@
+Google Cloud Bigtable
+=====================
+
+.. _bigtable-overview:
+
+Overview
+--------
+
+Google Cloud Bigtable is a petabyte-scale, fully managed NoSQL database. Integrating Cloud Bigtable as a built-in remote storage connector inside LMCache bridges volatile high-cost in-memory tiers (Redis) and low-cost, high-latency archival object stores (S3). 
+
+For more information, see the `Cloud Bigtable Overview <https://cloud.google.com/bigtable/docs/overview>`_ and `Cloud Bigtable Pricing <https://cloud.google.com/bigtable/pricing>`_.
+
+Architecture & Payload Limits
+-----------------------------
+
+- **Chunk Size Optimization**: Set LMCache's logical ``chunk_size`` to **256 tokens**. This groups payloads to minimize sequential Point-Read gRPC calls, preventing Python event-loop (GIL) bottlenecks.
+- **MutateRow Limit**: Enforces a strict **90.0 MB request limit** for a single ``MutateRow`` gRPC request.
+- **Storage Tier Row Limits**: The **SSD Tier** ceiling is **100 MiB per cell/row**. The **Enterprise Plus In-Memory Tier** is limited to **1.0 MiB per row**.
+- **TTLCache Shielding**: Embeds a thread-safe ``TTLCache`` (10-second TTL default) to shield Bigtable nodes from concurrent prefetch lookup spikes.
+
+Infrastructure Setup
+--------------------
+
+**1. Enable GCP APIs**
+
+.. code-block:: bash
+
+gcloud services enable bigtable.googleapis.com bigtableadmin.googleapis.com --project=your-gcp-project-id
+
+**2. Provision Bigtable Instance**
+
+Refer to the `gcloud beta bigtable Reference <https://cloud.google.com/sdk/gcloud/reference/beta/bigtable>`_ for additional parameter details.
+
+.. code-block:: bash
+
+gcloud beta bigtable instances create your-bigtable-instance-id \
+    --display-name="LMCache SSD Instance" \
+    --edition=ENTERPRISE \
+    --cluster-storage-type=ssd \
+    --cluster-config=id=your-cluster-id,zone=us-central1-a,nodes=1 \
+    --project=your-gcp-project-id
+
+**3. Create Database Table & Column Family**
+
+.. code-block:: bash
+
+gcloud bigtable instances tables create lmcache-benchmark-v1 \
+    --instance=your-bigtable-instance-id \
+    --column-families=cf \
+    --project=your-gcp-project-id
+
+**4. Install LMCache & Bigtable SDK**
+
+.. code-block:: bash
+
+export NO_NATIVE_EXT=1
+pip install --no-cache-dir lmcache google-cloud-bigtable
+
+Configuration
+-------------
+
+**Example A: Standard Bigtable SSD Integration (L2 Only)**
+
+.. code-block:: yaml
+
+chunk_size: 256
+
+local_cpu: true
+max_local_cpu_size: 10.0
+remote_url: "bigtable://your-gcp-project-id/your-bigtable-instance-id"
+
+remote_serde: "naive"
+
+extra_config:
+  bigtable_project_id: "your-gcp-project-id"
+  bigtable_instance_id: "your-bigtable-instance-id"
+  bigtable_table_name: "lmcache-benchmark-v1"
+
+.. note::
+   Alternatively, you can set the environment variables ``BT_PROJECT_ID``, ``BT_INSTANCE_ID``, and ``BT_TABLE_NAME`` instead of using ``extra_config``.
+
+**Example B: 3-Tier Multi-Connector Hybrid (Local CPU -> Redis L2 -> Bigtable SSD L3)**
+
+Deploy Redis for hot-cache loopbacks while offloading long-tail persistent storage to Bigtable SSD, using LMCache's dynamic OrderedDict routing.
+
+.. code-block:: yaml
+
+chunk_size: 256
+local_cpu: true
+max_local_cpu_size: 15.0
+
+remote_storage_plugins:
+  - "redis"
+  - "bigtable"
+
+extra_config:
+  remote_storage_plugin.redis.redis_url: "redis://your-redis-host:6379"
+  
+  remote_storage_plugin.bigtable.bigtable_project_id: "your-gcp-project-id"
+  remote_storage_plugin.bigtable.bigtable_instance_id: "your-bigtable-instance-id"
+  remote_storage_plugin.bigtable.bigtable_table_name: "lmcache-benchmark-v1"
+  remote_storage_plugin.bigtable.bigtable_family_name: "cf"
+  remote_storage_plugin.bigtable.bigtable_column_name: "data"
+  
+  remote_storage_plugin.bigtable.credentials_path: "/etc/gcp/key.json"
+  
+  remote_storage_plugin.bigtable.bigtable_max_chunk_size_mb: 90.0
+  remote_storage_plugin.bigtable.exists_cache_ttl_seconds: 10.0
+  remote_storage_plugin.bigtable.exists_cache_size: 10000
+  
+  remote_storage_plugin.bigtable.bigtable_write_timeout_ms: 10000.0
+  remote_storage_plugin.bigtable.bigtable_read_timeout_ms: 5000.0
+
+Authentication
+--------------
+
+- **Application Default Credentials (ADC)**: If ``credentials_path`` is omitted or ``null``, the connector natively invokes ADC. Compatible with local development via ``gcloud auth application-default login`` or GKE Workload Identity Federation.
+- **Explicit Keys**: Pass the absolute filesystem path containing a mounted GCP Service Account JSON secret to ``credentials_path``.
+
+Verification
+------------
+
+Ensure you have installed the required dependencies:
+
+.. code-block:: bash
+
+pip install cachetools google-cloud-bigtable
+
+Run the unit tests:
+
+.. code-block:: bash
+
+pytest tests/v1/storage_backend/test_bigtable_connector.py
+
+Troubleshooting Large Payload Warnings
+--------------------------------------
+
+If you see a warning in the logs indicating that a chunk size exceeds the limit and is skipped (e.g. ``Bigtable chunk size ... MB exceeds threshold ... MB. Skipping write to prevent hard failures``), choose one of the following approaches:
+
+1. **Reduce the LMCache Chunk Size (Recommended)**:
+   The serialized chunk size depends on LMCache's logical ``chunk_size`` (number of tokens per chunk) and model shape. You can reduce ``chunk_size`` (e.g., from ``256`` to ``128``) in your configuration file to shrink individual chunk payloads.
+   
+2. **Increase the Max Chunk Size**:
+   If your Bigtable instance uses the SSD storage tier (which supports up to 100 MB per cell/row), you can raise the maximum allowed write threshold in the configuration up to ``99.0`` MB using the ``bigtable_max_chunk_size_mb`` config key (or the ``BT_MAX_CHUNK_SIZE_MB`` environment variable).
+   
+   .. warning::
+      Do not set ``bigtable_max_chunk_size_mb`` higher than ``100.0`` MB. While Cloud Bigtable supports up to ``256.0`` MB for a single row, a single cell value (which LMCache uses to store the chunk payload) has a hard limit of ``100.0`` MB. Exceeding this will trigger hard gRPC exceptions.
diff --git a/docs/source/kv_cache/storage_backends/index.rst b/docs/source/kv_cache/storage_backends/index.rst
index feef262d6c..a8a254a6e4 100644
--- a/docs/source/kv_cache/storage_backends/index.rst
+++ b/docs/source/kv_cache/storage_backends/index.rst
@@ -23,6 +23,7 @@ Supported Backends
    mooncake
    nixl
    redis
+   bigtable
    resp
    s3
    sagemaker_hyperpod
diff --git a/lmcache/v1/storage_backend/connector/bigtable_adapter.py b/lmcache/v1/storage_backend/connector/bigtable_adapter.py
new file mode 100644
index 0000000000..f6e0da0479
--- /dev/null
+++ b/lmcache/v1/storage_backend/connector/bigtable_adapter.py
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# First Party
+from lmcache.v1.storage_backend.connector import ConnectorAdapter, ConnectorContext
+from lmcache.v1.storage_backend.connector.base_connector import RemoteConnector
+
+# Local
+from .bigtable_connector import BigtableConnector
+
+
+class BigtableConnectorAdapter(ConnectorAdapter):
+    """Adapter for BigtableConnector to integrate natively with LMCache
+    built-in connectors.
+
+    Supports both 'plugin://bigtable' and 'bigtable://' URL schemas.
+    """
+
+    def __init__(self):
+        super().__init__("plugin://bigtable")
+
+    def can_parse(self, url: str) -> bool:
+        return url.startswith(self.schema) or url.startswith("bigtable://")
+
+    def create_connector(self, context: ConnectorContext) -> RemoteConnector:
+        return BigtableConnector(
+            loop=context.loop,
+            local_cpu_backend=context.local_cpu_backend,
+            config=context.config,
+            plugin_name=context.plugin_name,
+        )
diff --git a/lmcache/v1/storage_backend/connector/bigtable_config.py b/lmcache/v1/storage_backend/connector/bigtable_config.py
new file mode 100644
index 0000000000..5b88848044
--- /dev/null
+++ b/lmcache/v1/storage_backend/connector/bigtable_config.py
@@ -0,0 +1,136 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Standard
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+
+@dataclass
+class BigtablePluginConfig:
+    project_id: str
+    instance_id: str
+    table_name: str
+    app_profile_id: Optional[str] = None
+    read_timeout_sec: float = 0.2
+    write_timeout_sec: float = 0.5
+    exists_cache_ttl_seconds: float = 30.0
+    exists_cache_size: int = 10000
+    thread_pool_size: int = 16
+    row_key_template: str = "hash#model"
+    credentials_path: Optional[str] = None
+    max_retries: int = 3
+    max_chunk_size_mb: float = 90.0
+    family_name: str = "cf"
+    column_name: str = "data"
+
+    @classmethod
+    def from_extra_config(
+        cls, extra_config: Dict[str, Any], plugin_name: Optional[str] = None
+    ) -> "BigtablePluginConfig":
+        # Standard
+        import os
+
+        def get_val(key: str, default: Any = None) -> Any:
+            if plugin_name:
+                full_key = f"remote_storage_plugin.{plugin_name}.{key}"
+                if full_key in extra_config:
+                    return extra_config[full_key]
+            return extra_config.get(key, default)
+
+        project_id = get_val("bigtable_project_id") or os.environ.get("BT_PROJECT_ID")
+        instance_id = get_val("bigtable_instance_id") or os.environ.get(
+            "BT_INSTANCE_ID"
+        )
+        table_name = get_val("bigtable_table_name") or os.environ.get("BT_TABLE_NAME")
+
+        if not project_id or not instance_id or not table_name:
+            raise ValueError(
+                f"Bigtable out-of-tree connector requires bigtable_project_id, "
+                f"bigtable_instance_id, and bigtable_table_name (or BT_* env vars). "
+                f"Got project={project_id}, instance={instance_id}, table={table_name}"
+            )
+
+        return cls(
+            project_id=project_id,
+            instance_id=instance_id,
+            table_name=table_name,
+            app_profile_id=get_val(
+                "bigtable_app_profile",
+                get_val("app_profile", os.environ.get("BT_APP_PROFILE")),
+            ),
+            read_timeout_sec=float(
+                get_val(
+                    "bigtable_read_timeout_ms",
+                    get_val(
+                        "read_timeout_ms",
+                        os.environ.get("BT_READ_TIMEOUT_MS", 200.0),
+                    ),
+                )
+            )
+            / 1000.0,
+            write_timeout_sec=float(
+                get_val(
+                    "bigtable_write_timeout_ms",
+                    get_val(
+                        "write_timeout_ms",
+                        os.environ.get("BT_WRITE_TIMEOUT_MS", 500.0),
+                    ),
+                )
+            )
+            / 1000.0,
+            exists_cache_ttl_seconds=float(
+                get_val(
+                    "bigtable_exists_cache_ttl_seconds",
+                    get_val(
+                        "exists_cache_ttl_seconds",
+                        os.environ.get("BT_EXISTS_CACHE_TTL_SECONDS", 30.0),
+                    ),
+                )
+            ),
+            exists_cache_size=int(
+                get_val(
+                    "bigtable_exists_cache_size",
+                    get_val(
+                        "exists_cache_size",
+                        os.environ.get("BT_EXISTS_CACHE_SIZE", 10000),
+                    ),
+                )
+            ),
+            thread_pool_size=int(
+                get_val(
+                    "bigtable_thread_pool_size",
+                    get_val(
+                        "thread_pool_size", os.environ.get("BT_THREAD_POOL_SIZE", 16)
+                    ),
+                )
+            ),
+            row_key_template=get_val(
+                "bigtable_row_key_template",
+                get_val(
+                    "row_key_template",
+                    os.environ.get("BT_ROW_KEY_TEMPLATE", "hash#model"),
+                ),
+            ),
+            credentials_path=get_val(
+                "bigtable_credentials_path",
+                get_val("credentials_path", os.environ.get("BT_CREDENTIALS_PATH")),
+            ),
+            max_retries=int(
+                get_val(
+                    "bigtable_max_retries",
+                    get_val("max_retries", os.environ.get("BT_MAX_RETRIES", 3)),
+                )
+            ),
+            max_chunk_size_mb=float(
+                get_val(
+                    "bigtable_max_chunk_size_mb",
+                    os.environ.get("BT_MAX_CHUNK_SIZE_MB", 90.0),
+                )
+            ),
+            family_name=get_val(
+                "bigtable_family_name", os.environ.get("BT_FAMILY_NAME", "cf")
+            ),
+            column_name=get_val(
+                "bigtable_column_name", os.environ.get("BT_COLUMN_NAME", "data")
+            ),
+        )
diff --git a/lmcache/v1/storage_backend/connector/bigtable_connector.py b/lmcache/v1/storage_backend/connector/bigtable_connector.py
new file mode 100644
index 0000000000..fe3d7129b8
--- /dev/null
+++ b/lmcache/v1/storage_backend/connector/bigtable_connector.py
@@ -0,0 +1,929 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Standard
+from enum import IntEnum, auto
+from typing import List, Optional
+import asyncio
+import inspect
+import threading
+
+# Third Party
+from cachetools import TTLCache as _TTLCache  # type: ignore
+
+# First Party
+from lmcache.logging import init_logger
+from lmcache.utils import CacheEngineKey
+from lmcache.v1.config import LMCacheEngineConfig
+from lmcache.v1.memory_management import MemoryObj
+from lmcache.v1.storage_backend.connector.base_connector import RemoteConnector
+from lmcache.v1.storage_backend.job_executor.pq_executor import AsyncPQExecutor
+from lmcache.v1.storage_backend.local_cpu_backend import LocalCPUBackend
+
+# Local
+from .bigtable_config import BigtablePluginConfig
+from .bigtable_schema import BigtableSchema
+
+logger = init_logger(__name__)
+
+
+class Priorities(IntEnum):
+    PEEK = auto()
+    PREFETCH = auto()
+    GET = auto()
+    PUT = auto()
+
+
+class TTLCache:
+    """Thread-safe wrapper around cachetools.TTLCache for existence checks."""
+
+    def __init__(self, max_size: int, ttl_seconds: float):
+        self.cache: _TTLCache = _TTLCache(maxsize=max_size, ttl=ttl_seconds)
+        self.lock = threading.RLock()
+
+    def get(self, key: str) -> Optional[bool]:
+        with self.lock:
+            return self.cache.get(key)
+
+    def put(self, key: str, val: bool):
+        with self.lock:
+            self.cache[key] = val
+
+    def invalidate(self, key: str):
+        with self.lock:
+            self.cache.pop(key, None)
+
+
+class BigtableConnector(RemoteConnector):
+    """
+    Native Bigtable remote connector integrated into LMCache.
+    Tier Agnostic Design: serves all tier roles driven by per-instance configuration.
+    """
+
+    def __init__(
+        self,
+        loop: asyncio.AbstractEventLoop,
+        local_cpu_backend: LocalCPUBackend,
+        config: LMCacheEngineConfig,
+        plugin_name: Optional[str] = None,
+    ):
+        super().__init__(local_cpu_backend.config, local_cpu_backend.metadata)
+
+        extra_config = config.extra_config if config.extra_config is not None else {}
+        self.cfg = BigtablePluginConfig.from_extra_config(extra_config, plugin_name)
+        self.schema = BigtableSchema(
+            self.cfg.row_key_template, self.cfg.family_name, self.cfg.column_name
+        )
+
+        self.loop = loop
+        self.local_cpu_backend = local_cpu_backend
+
+        self.exists_cache = TTLCache(
+            max_size=self.cfg.exists_cache_size,
+            ttl_seconds=self.cfg.exists_cache_ttl_seconds,
+        )
+
+        # Independent gRPC client pool initialized lazily on first operation
+        self._client = None
+        self._table = None
+
+        # Use native AsyncPQExecutor to run pure coroutines directly on the event loop
+        self.pq_executor = AsyncPQExecutor(loop, max_workers=self.cfg.thread_pool_size)
+
+        logger.info(
+            f"Initialized Bigtable remote connector for project={self.cfg.project_id}, "
+            f"instance={self.cfg.instance_id}, table={self.cfg.table_name}"
+        )
+
+    def _get_client(self):
+        """Lazy initialization of independent Bigtable async data client."""
+        if self._client is not None:
+            return self._client
+
+        try:
+            # Third Party
+            from google.api_core import exceptions as google_exceptions
+            from google.api_core.gapic_v1.client_info import ClientInfo
+            from google.cloud.bigtable.data import BigtableDataClientAsync
+
+            self.google_exceptions = google_exceptions
+
+            logger.info(
+                f"Lazily initializing Bigtable async client "
+                f"for project={self.cfg.project_id}, "
+                f"instance={self.cfg.instance_id}, "
+                f"table={self.cfg.table_name}"
+            )
+
+            client_info = ClientInfo(user_agent="lmcache")
+
+            if self.cfg.credentials_path:
+                # Third Party
+                from google.oauth2 import service_account
+                import google.auth.exceptions
+
+                try:
+                    credentials = service_account.Credentials.from_service_account_file(
+                        self.cfg.credentials_path
+                    )
+                    self._client = BigtableDataClientAsync(
+                        project=self.cfg.project_id,
+                        credentials=credentials,
+                        client_info=client_info,
+                    )
+                except (
+                    OSError,
+                    ValueError,
+                    google.auth.exceptions.GoogleAuthError,
+                ) as e:
+                    logger.warning(
+                        f"Failed to load credentials from "
+                        f"{self.cfg.credentials_path} due to {e}. "
+                        f"Falling back to Application Default Credentials."
+                    )
+                    self._client = BigtableDataClientAsync(
+                        project=self.cfg.project_id,
+                        client_info=client_info,
+                    )
+            else:
+                self._client = BigtableDataClientAsync(
+                    project=self.cfg.project_id,
+                    client_info=client_info,
+                )
+
+            return self._client
+        except Exception as e:
+            logger.error(f"Failed to initialize Bigtable async client: {e}")
+            raise
+
+    def _get_table(self):
+        """Lazy initialization and caching of TableAsync instance."""
+        if self._table is not None:
+            return self._table
+
+        client = self._get_client()
+        try:
+            asyncio.get_running_loop()
+        except RuntimeError as e:
+            raise RuntimeError(
+                "TableAsync must be retrieved within an async event loop context."
+            ) from e
+
+        self._table = client.get_table(self.cfg.instance_id, self.cfg.table_name)
+        return self._table
+
+    def _get_row_filters_module(self):
+        try:
+            # Third Party
+            from google.cloud.bigtable.data import row_filters
+
+            return row_filters
+        except ImportError:
+            # Third Party
+            from google.cloud.bigtable import row_filters
+
+            return row_filters
+
+    async def _exists_internal(self, key: CacheEngineKey) -> bool:
+        key_str = key.to_string()
+        cached_val = self.exists_cache.get(key_str)
+        if cached_val is not None:
+            return cached_val
+
+        row_key = self.schema.get_row_key(key)
+
+        row_filters = self._get_row_filters_module()
+        row_filter = getattr(
+            row_filters, "StripValueTransformerFilter", lambda flag: None
+        )(True)
+
+        retries = 0
+        while True:
+            try:
+                kwargs = {}
+                if row_filter is not None:
+                    kwargs["row_filter"] = row_filter
+                if self.cfg.app_profile_id:
+                    kwargs["app_profile_id"] = self.cfg.app_profile_id
+
+                table = self._get_table()
+                row = await table.read_row(
+                    row_key,
+                    operation_timeout=self.cfg.read_timeout_sec,
+                    **kwargs,
+                )
+                exists = row is not None
+                self.exists_cache.put(key_str, exists)
+                return exists
+            except (
+                self.google_exceptions.DeadlineExceeded,
+                TimeoutError,
+            ) as e:
+                logger.warning(
+                    f"Bigtable async timeout in exists: {e}. Treating as miss."
+                )
+                return False
+            except (
+                self.google_exceptions.PermissionDenied,
+                self.google_exceptions.Unauthenticated,
+            ) as e:
+                logger.error(f"Bigtable permission/auth error in exists: {e}")
+                raise
+            except self.google_exceptions.NotFound as e:
+                logger.error(f"Bigtable NotFound in exists: {e}")
+                raise
+            except self.google_exceptions.ResourceExhausted:
+                if retries < self.cfg.max_retries:
+                    sleep_time = 0.5 * (2**retries)
+                    logger.warning(
+                        f"Bigtable ResourceExhausted. Retrying in {sleep_time}s."
+                    )
+                    await asyncio.sleep(sleep_time)
+                    retries += 1
+                else:
+                    logger.warning(
+                        "Bigtable ResourceExhausted max retries reached in exists."
+                    )
+                    return False
+
+    async def exists(self, key: CacheEngineKey) -> bool:
+        return await self.pq_executor.submit_job(
+            self._exists_internal, key=key, priority=Priorities.PEEK
+        )
+
+    def exists_sync(self, key: CacheEngineKey) -> bool:
+        future = asyncio.run_coroutine_threadsafe(self.exists(key), self.loop)
+        return bool(future.result())
+
+    async def _get_internal(self, key: CacheEngineKey) -> Optional[MemoryObj]:
+        key_str = key.to_string()
+        row_key = self.schema.get_row_key(key)
+
+        row_filters = self._get_row_filters_module()
+        row_filter = getattr(row_filters, "CellsColumnLimitFilter", lambda n: None)(1)
+
+        retries = 0
+        while True:
+            try:
+                kwargs = {}
+                if row_filter is not None:
+                    kwargs["row_filter"] = row_filter
+                if self.cfg.app_profile_id:
+                    kwargs["app_profile_id"] = self.cfg.app_profile_id
+
+                table = self._get_table()
+                row = await table.read_row(
+                    row_key,
+                    operation_timeout=self.cfg.read_timeout_sec,
+                    **kwargs,
+                )
+                if row is None:
+                    self.exists_cache.put(key_str, False)
+                    return None
+
+                self.exists_cache.put(key_str, True)
+
+                cell_value = self.schema.extract_cell_value(row)
+                if cell_value is None:
+                    return None
+
+                memory_obj = self.local_cpu_backend.allocate(
+                    self.meta_shapes,
+                    self.meta_dtypes,
+                    self.meta_fmt,
+                )
+                if memory_obj is None:
+                    logger.warning("Failed to allocate memory during Bigtable receive")
+                    return None
+
+                view = memory_obj.byte_array
+                if not isinstance(view, memoryview):
+                    view = memoryview(view)
+
+                if isinstance(view.format, str) and view.format == "<B":
+                    view = view.cast("B")
+
+                if len(cell_value) > len(view):
+                    logger.warning(
+                        f"Bigtable cell size {len(cell_value)} exceeds "
+                        f"allocated view size {len(view)}"
+                    )
+                    memory_obj.ref_count_down()
+                    return None
+
+                view[: len(cell_value)] = cell_value
+                if len(cell_value) < len(view):
+                    memory_obj = self.reshape_partial_chunk(memory_obj, len(cell_value))
+                return memory_obj
+
+            except (
+                self.google_exceptions.DeadlineExceeded,
+                TimeoutError,
+            ) as e:
+                logger.warning(f"Bigtable async timeout in get: {e}. Treating as miss.")
+                return None
+            except (
+                self.google_exceptions.PermissionDenied,
+                self.google_exceptions.Unauthenticated,
+            ) as e:
+                logger.error(f"Bigtable permission/auth error in get: {e}")
+                raise
+            except self.google_exceptions.NotFound as e:
+                logger.error(f"Bigtable NotFound in get: {e}")
+                raise
+            except self.google_exceptions.ResourceExhausted:
+                if retries < self.cfg.max_retries:
+                    sleep_time = 0.5 * (2**retries)
+                    logger.warning(
+                        f"Bigtable ResourceExhausted. Retrying in {sleep_time}s."
+                    )
+                    await asyncio.sleep(sleep_time)
+                    retries += 1
+                else:
+                    logger.warning(
+                        "Bigtable ResourceExhausted max retries reached in get."
+                    )
+                    return None
+
+    async def get(self, key: CacheEngineKey) -> Optional[MemoryObj]:
+        return await self.pq_executor.submit_job(
+            self._get_internal, key=key, priority=Priorities.GET
+        )
+
+    async def _put_internal(self, key: CacheEngineKey, memory_obj: MemoryObj):
+        try:
+            key_str = key.to_string()
+            blob = memory_obj.byte_array
+            blob_size_mb = len(blob) / (1024 * 1024)
+
+            if blob_size_mb > self.cfg.max_chunk_size_mb:
+                logger.warning(
+                    f"Bigtable chunk size {blob_size_mb:.2f} MB exceeds "
+                    f"threshold {self.cfg.max_chunk_size_mb} MB. "
+                    f"Skipping write to prevent hard failures."
+                )
+                return
+
+            row_key = self.schema.get_row_key(key)
+            data_bytes = (
+                bytes(blob) if not isinstance(blob, (bytes, bytearray)) else blob
+            )
+
+            # Third Party
+            from google.cloud.bigtable.data import SetCell
+
+            mutation = SetCell(self.cfg.family_name, self.cfg.column_name, data_bytes)
+
+            retries = 0
+            while True:
+                try:
+                    kwargs = {}
+                    if self.cfg.app_profile_id:
+                        kwargs["app_profile_id"] = self.cfg.app_profile_id
+
+                    table = self._get_table()
+                    await table.mutate_row(
+                        row_key,
+                        mutation,
+                        operation_timeout=self.cfg.write_timeout_sec,
+                        **kwargs,
+                    )
+                    self.exists_cache.put(key_str, True)
+                    logger.debug(
+                        f"Successfully wrote "
+                        f"{row_key.decode('utf-8', errors='ignore')} "
+                        f"via async Bigtable"
+                    )
+                    return
+                except (
+                    self.google_exceptions.DeadlineExceeded,
+                    TimeoutError,
+                ) as e:
+                    logger.warning(
+                        f"Bigtable async timeout in put: {e}. Skipping write."
+                    )
+                    return
+                except (
+                    self.google_exceptions.PermissionDenied,
+                    self.google_exceptions.Unauthenticated,
+                ) as e:
+                    logger.error(f"Bigtable permission/auth error in put: {e}")
+                    raise
+                except self.google_exceptions.NotFound as e:
+                    logger.error(f"Bigtable NotFound in put: {e}")
+                    raise
+                except self.google_exceptions.ResourceExhausted:
+                    if retries < self.cfg.max_retries:
+                        sleep_time = 0.5 * (2**retries)
+                        logger.warning(
+                            f"Bigtable ResourceExhausted. Retrying in {sleep_time}s."
+                        )
+                        await asyncio.sleep(sleep_time)
+                        retries += 1
+                    else:
+                        logger.warning(
+                            "Bigtable ResourceExhausted max retries reached in put."
+                        )
+                        return
+        finally:
+            memory_obj.ref_count_down()
+
+    async def put(self, key: CacheEngineKey, memory_obj: MemoryObj):
+        await self.pq_executor.submit_job(
+            self._put_internal,
+            key=key,
+            memory_obj=memory_obj,
+            priority=Priorities.PUT,
+        )
+
+    def support_batched_get(self) -> bool:
+        return True
+
+    async def _batched_get_internal(
+        self, keys: List[CacheEngineKey]
+    ) -> List[Optional[MemoryObj]]:
+        # Third Party
+        from google.cloud.bigtable.data import ReadRowsQuery
+
+        row_keys: List[str | bytes] = [self.schema.get_row_key(k) for k in keys]
+        row_filters = self._get_row_filters_module()
+        row_filter = getattr(row_filters, "CellsColumnLimitFilter", lambda n: None)(1)
+
+        query = ReadRowsQuery(row_keys=row_keys, row_filter=row_filter)
+
+        retries = 0
+        while True:
+            try:
+                kwargs = {}
+                if self.cfg.app_profile_id:
+                    kwargs["app_profile_id"] = self.cfg.app_profile_id
+
+                table = self._get_table()
+                rows_gen = await table.read_rows(
+                    query=query,
+                    operation_timeout=self.cfg.read_timeout_sec,
+                    **kwargs,
+                )
+
+                row_dict = {}
+                for row in rows_gen:
+                    row_dict[row.row_key] = row
+
+                memory_objs: List[Optional[MemoryObj]] = []
+                for key, rk in zip(keys, row_keys, strict=False):
+                    key_str = key.to_string()
+                    row = row_dict.get(rk)
+                    if row is None:
+                        self.exists_cache.put(key_str, False)
+                        memory_objs.append(None)
+                        continue
+
+                    self.exists_cache.put(key_str, True)
+                    cell_value = self.schema.extract_cell_value(row)
+                    if cell_value is None:
+                        memory_objs.append(None)
+                        continue
+
+                    memory_obj = self.local_cpu_backend.allocate(
+                        self.meta_shapes,
+                        self.meta_dtypes,
+                        self.meta_fmt,
+                    )
+                    if memory_obj is None:
+                        logger.warning(
+                            "Failed to allocate memory during batched Bigtable receive"
+                        )
+                        memory_objs.append(None)
+                        continue
+
+                    view = memory_obj.byte_array
+                    if not isinstance(view, memoryview):
+                        view = memoryview(view)
+                    if isinstance(view.format, str) and view.format == "<B":
+                        view = view.cast("B")
+
+                    if len(cell_value) > len(view):
+                        logger.warning(
+                            f"Bigtable cell size {len(cell_value)} "
+                            f"exceeds allocated view size {len(view)}"
+                        )
+                        memory_obj.ref_count_down()
+                        memory_objs.append(None)
+                        continue
+
+                    view[: len(cell_value)] = cell_value
+                    if len(cell_value) < len(view):
+                        memory_obj = self.reshape_partial_chunk(
+                            memory_obj, len(cell_value)
+                        )
+                    memory_objs.append(memory_obj)
+
+                return memory_objs
+
+            except (
+                self.google_exceptions.DeadlineExceeded,
+                TimeoutError,
+            ) as e:
+                logger.warning(
+                    f"Bigtable async timeout in batched_get: {e}. Treating as miss."
+                )
+                return [None] * len(keys)
+            except (
+                self.google_exceptions.PermissionDenied,
+                self.google_exceptions.Unauthenticated,
+            ) as e:
+                logger.error(f"Bigtable permission/auth error in batched_get: {e}")
+                raise
+            except self.google_exceptions.NotFound as e:
+                logger.error(f"Bigtable NotFound in batched_get: {e}")
+                raise
+            except self.google_exceptions.ResourceExhausted:
+                if retries < self.cfg.max_retries:
+                    sleep_time = 0.5 * (2**retries)
+                    logger.warning(
+                        f"Bigtable ResourceExhausted. Retrying in {sleep_time}s."
+                    )
+                    await asyncio.sleep(sleep_time)
+                    retries += 1
+                else:
+                    logger.warning(
+                        "Bigtable ResourceExhausted max retries reached in batched_get."
+                    )
+                    return [None] * len(keys)
+
+    async def batched_get(
+        self, keys: List[CacheEngineKey]
+    ) -> List[Optional[MemoryObj]]:
+        return await self.pq_executor.submit_job(
+            self._batched_get_internal, keys=keys, priority=Priorities.GET
+        )
+
+    def support_batched_put(self) -> bool:
+        return True
+
+    async def _batched_put_internal(
+        self, keys: List[CacheEngineKey], memory_objs: List[MemoryObj]
+    ):
+        try:
+            # Third Party
+            from google.cloud.bigtable.data import RowMutationEntry, SetCell
+
+            current_batch: List[RowMutationEntry] = []
+            current_batch_keys: List[str] = []
+            current_batch_size = 0
+            MAX_BATCH_SIZE_BYTES = 30 * 1024 * 1024  # 30MB safety limit
+
+            table = self._get_table()
+            kwargs = {}
+            if self.cfg.app_profile_id:
+                kwargs["app_profile_id"] = self.cfg.app_profile_id
+
+            async def flush_batch(batch, batch_keys):
+                if not batch:
+                    return
+                retries = 0
+                while True:
+                    try:
+                        await table.bulk_mutate_rows(
+                            batch,
+                            operation_timeout=self.cfg.write_timeout_sec,
+                            **kwargs,
+                        )
+                        for k_str in batch_keys:
+                            self.exists_cache.put(k_str, True)
+                        logger.debug(
+                            f"Successfully batched put {len(batch)} rows "
+                            f"via async Bigtable"
+                        )
+                        return
+                    except (
+                        self.google_exceptions.DeadlineExceeded,
+                        TimeoutError,
+                    ) as e:
+                        logger.warning(
+                            f"Bigtable async timeout in batched_put: {e}. Skipping."
+                        )
+                        return
+                    except (
+                        self.google_exceptions.PermissionDenied,
+                        self.google_exceptions.Unauthenticated,
+                    ) as e:
+                        logger.error(
+                            f"Bigtable permission/auth error in batched_put: {e}"
+                        )
+                        raise
+                    except self.google_exceptions.NotFound as e:
+                        logger.error(f"Bigtable NotFound in batched_put: {e}")
+                        raise
+                    except self.google_exceptions.ResourceExhausted:
+                        if retries < self.cfg.max_retries:
+                            sleep_time = 0.5 * (2**retries)
+                            logger.warning(f"Retrying Bigtable in {sleep_time}s.")
+                            await asyncio.sleep(sleep_time)
+                            retries += 1
+                        else:
+                            logger.warning(
+                                "Bigtable ResourceExhausted max retries reached "
+                                "in batched_put."
+                            )
+                            return
+                    except Exception as e:
+                        logger.error(
+                            f"Unexpected error in Bigtable "
+                            f"_batched_put_internal flush: {e}",
+                            exc_info=True,
+                        )
+                        raise
+
+            for key, memory_obj in zip(keys, memory_objs, strict=False):
+                if memory_obj is None:
+                    continue
+                blob = memory_obj.byte_array
+                blob_size = len(blob)
+                blob_size_mb = blob_size / (1024 * 1024)
+
+                if blob_size_mb > self.cfg.max_chunk_size_mb:
+                    logger.warning(
+                        f"Bigtable chunk size {blob_size_mb:.2f} MB exceeds "
+                        f"threshold {self.cfg.max_chunk_size_mb} MB. "
+                        f"Skipping write for key {key.to_string()}."
+                    )
+                    continue
+
+                if (
+                    current_batch_size + blob_size > MAX_BATCH_SIZE_BYTES
+                    and current_batch
+                ):
+                    await flush_batch(current_batch, current_batch_keys)
+                    current_batch = []
+                    current_batch_keys = []
+                    current_batch_size = 0
+
+                row_key = self.schema.get_row_key(key)
+                data_bytes = (
+                    bytes(blob) if not isinstance(blob, (bytes, bytearray)) else blob
+                )
+
+                mutation = SetCell(
+                    self.cfg.family_name, self.cfg.column_name, data_bytes
+                )
+                entry = RowMutationEntry(row_key, mutation)
+                current_batch.append(entry)
+                current_batch_keys.append(key.to_string())
+                current_batch_size += blob_size
+
+            if current_batch:
+                await flush_batch(current_batch, current_batch_keys)
+        finally:
+            for memory_obj in memory_objs:
+                if memory_obj is not None:
+                    memory_obj.ref_count_down()
+
+    async def batched_put(
+        self, keys: List[CacheEngineKey], memory_objs: List[MemoryObj]
+    ):
+        await self.pq_executor.submit_job(
+            self._batched_put_internal,
+            keys=keys,
+            memory_objs=memory_objs,
+            priority=Priorities.PUT,
+        )
+
+    def support_batched_async_contains(self) -> bool:
+        return True
+
+    async def _batched_contains_internal(self, keys: List[CacheEngineKey]) -> int:
+        count = 0
+        missing_keys = []
+        for key in keys:
+            val = self.exists_cache.get(key.to_string())
+            if val is False:
+                return count
+            elif val is None:
+                missing_keys = keys[count:]
+                break
+            count += 1
+
+        if not missing_keys:
+            return count
+
+        # Third Party
+        from google.cloud.bigtable.data import ReadRowsQuery
+
+        row_keys_missing: List[str | bytes] = [
+            self.schema.get_row_key(k) for k in missing_keys
+        ]
+        row_filters = self._get_row_filters_module()
+        row_filter = getattr(
+            row_filters, "StripValueTransformerFilter", lambda flag: None
+        )(True)
+
+        query = ReadRowsQuery(row_keys=row_keys_missing, row_filter=row_filter)
+
+        retries = 0
+        while True:
+            try:
+                kwargs = {}
+                if self.cfg.app_profile_id:
+                    kwargs["app_profile_id"] = self.cfg.app_profile_id
+
+                table = self._get_table()
+                rows_gen = await table.read_rows(
+                    query=query,
+                    operation_timeout=self.cfg.read_timeout_sec,
+                    **kwargs,
+                )
+
+                existing_rk_set = set()
+                for row in rows_gen:
+                    existing_rk_set.add(row.row_key)
+
+                for k, rk in zip(missing_keys, row_keys_missing, strict=False):
+                    exists = rk in existing_rk_set
+                    self.exists_cache.put(k.to_string(), exists)
+                    if not exists:
+                        return count
+                    count += 1
+
+                return count
+
+            except (
+                self.google_exceptions.DeadlineExceeded,
+                TimeoutError,
+            ) as e:
+                logger.warning(f"Bigtable async timeout in batched_contains: {e}")
+                return count
+            except (
+                self.google_exceptions.PermissionDenied,
+                self.google_exceptions.Unauthenticated,
+            ) as e:
+                logger.error(f"Bigtable auth error in batched_contains: {e}")
+                raise
+            except self.google_exceptions.NotFound as e:
+                logger.error(f"Bigtable NotFound in batched_contains: {e}")
+                raise
+            except self.google_exceptions.ResourceExhausted:
+                if retries < self.cfg.max_retries:
+                    sleep_time = 0.5 * (2**retries)
+                    logger.warning(
+                        f"Bigtable ResourceExhausted. Retrying in {sleep_time}s."
+                    )
+                    await asyncio.sleep(sleep_time)
+                    retries += 1
+                else:
+                    logger.warning(
+                        "Bigtable ResourceExhausted max retries reached "
+                        "in batched_contains."
+                    )
+                    return count
+
+    async def batched_async_contains(
+        self,
+        lookup_id: str,
+        keys: List[CacheEngineKey],
+        pin: bool = False,
+    ) -> int:
+        return await self.pq_executor.submit_job(
+            self._batched_contains_internal,
+            keys=keys,
+            priority=Priorities.PREFETCH,
+        )
+
+    def remove_sync(self, key: CacheEngineKey) -> bool:
+        try:
+            self.exists_cache.invalidate(key.to_string())
+            # Third Party
+            from google.cloud.bigtable.data import DeleteAllFromRow
+
+            row_key = self.schema.get_row_key(key)
+
+            kwargs = {}
+            if self.cfg.app_profile_id:
+                kwargs["app_profile_id"] = self.cfg.app_profile_id
+
+            async def _do_remove():
+                try:
+                    table = self._get_table()
+                    await table.mutate_row(
+                        row_key,
+                        DeleteAllFromRow(),
+                        operation_timeout=self.cfg.write_timeout_sec,
+                        **kwargs,
+                    )
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to remove key {key} from Bigtable in background: {e}"
+                    )
+
+            asyncio.run_coroutine_threadsafe(
+                _do_remove(),
+                self.loop,
+            )
+            return True
+        except Exception as e:
+            logger.warning(
+                f"Failed to schedule removal of key {key} from Bigtable: {e}"
+            )
+            return False
+
+    async def _list_internal(self) -> List[str]:
+        # Third Party
+        from google.cloud.bigtable.data import ReadRowsQuery
+
+        row_filters = self._get_row_filters_module()
+        row_filter = getattr(
+            row_filters, "StripValueTransformerFilter", lambda flag: None
+        )(True)
+        query = ReadRowsQuery(row_filter=row_filter)
+
+        kwargs = {}
+        if self.cfg.app_profile_id:
+            kwargs["app_profile_id"] = self.cfg.app_profile_id
+
+        table = self._get_table()
+        rows_gen = await table.read_rows(
+            query=query,
+            operation_timeout=self.cfg.read_timeout_sec,
+            **kwargs,
+        )
+
+        res = []
+        for row in rows_gen:
+            rk_str = row.row_key.decode("utf-8")
+            parts_split = rk_str.split("#", 1)
+            if len(parts_split) != 2:
+                continue
+            p1, p2 = parts_split
+            if "@" in p1:
+                fingerprint, chunk_hash_hex = p1, p2
+            else:
+                chunk_hash_hex, fingerprint = p1, p2
+            parts = fingerprint.split("@")
+            if len(parts) < 4:
+                continue
+            model_name = parts[0]
+            world_size = parts[1]
+            worker_id = parts[2]
+            dtype_str = parts[3]
+
+            std_str = (
+                f"{model_name}@{world_size}@{worker_id}@{chunk_hash_hex}@{dtype_str}"
+            )
+            if len(parts) > 4:
+                std_str += "@" + "@".join(parts[4:])
+            res.append(std_str)
+        return res
+
+    async def list(self) -> List[str]:
+        return await self.pq_executor.submit_job(
+            self._list_internal, priority=Priorities.GET
+        )
+
+    async def close(self):
+        await self.pq_executor.shutdown_async(wait=False)
+        if getattr(self, "_table", None) is not None:
+            try:
+                res = self._table.close()
+                if inspect.isawaitable(res):
+                    await res
+            except Exception as e:
+                logger.warning(f"Failed to close Bigtable table cleanly: {e}")
+            self._table = None
+        if getattr(self, "_client", None) is not None:
+            try:
+                res = self._client.close()
+                if inspect.isawaitable(res):
+                    await res
+            except AttributeError:
+                pass
+        logger.info("Closed Bigtable connector cleanly.")
+
+    def support_batched_contains(self) -> bool:
+        return True
+
+    def batched_contains(self, keys: List[CacheEngineKey]) -> int:
+        future = asyncio.run_coroutine_threadsafe(
+            self.batched_async_contains("sync_lookup", keys), self.loop
+        )
+        return int(future.result())
+
+    def support_ping(self) -> bool:
+        return True
+
+    async def ping(self) -> int:
+        try:
+            client = self._get_client()
+            kwargs = {}
+            if self.cfg.app_profile_id:
+                kwargs["app_profile_id"] = self.cfg.app_profile_id
+            iterator = await client.execute_query(
+                "SELECT 1;",
+                self.cfg.instance_id,
+                operation_timeout=self.cfg.read_timeout_sec,
+                **kwargs,
+            )
+            async for _ in iterator:
+                pass
+            return 0
+        except Exception as e:
+            logger.warning(f"Bigtable ping failed: {e}")
+            return 1
diff --git a/lmcache/v1/storage_backend/connector/bigtable_schema.py b/lmcache/v1/storage_backend/connector/bigtable_schema.py
new file mode 100644
index 0000000000..11e757ac4b
--- /dev/null
+++ b/lmcache/v1/storage_backend/connector/bigtable_schema.py
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Standard
+from typing import Any, Optional
+
+# First Party
+from lmcache.utils import CacheEngineKey
+
+
+class BigtableSchema:
+    def __init__(self, row_key_template: str, family_name: str, column_name: str):
+        self.row_key_template = row_key_template
+        self.family_name = family_name
+        self.column_name = column_name
+
+    def get_row_key(self, key: CacheEngineKey) -> bytes:
+        fingerprint = (
+            f"{key.model_name}@{key.world_size}@{key.worker_id}@{key._dtype_str}"
+        )
+        if key.tags is not None and len(key.tags) != 0:
+            tags_str = "@".join([f"{k}%{v}" for k, v in key.tags])
+            fingerprint += f"@{tags_str}"
+
+        template = self.row_key_template
+        if "{hash}" in template:
+            row_key_str = template.replace("{hash}", key.chunk_hash_hex)
+        else:
+            row_key_str = template.replace("hash", key.chunk_hash_hex)
+
+        if "{model}" in row_key_str:
+            row_key_str = row_key_str.replace("{model}", fingerprint)
+        else:
+            row_key_str = row_key_str.replace("model", fingerprint)
+
+        return row_key_str.encode("utf-8")
+
+    def extract_cell_value(self, row: Any) -> Optional[bytes]:
+        if row is None or not hasattr(row, "cells"):
+            return None
+
+        col_bytes = (
+            self.column_name.encode("utf-8")
+            if isinstance(self.column_name, str)
+            else self.column_name
+        )
+
+        # Handle dict access (classic) or list access (v2 data client) flexibly
+        if isinstance(row.cells, dict):
+            cells_dict = row.cells.get(self.family_name, {})
+            col_cells = cells_dict.get(col_bytes, [])
+            if col_cells:
+                return col_cells[0].value
+        else:
+            for cell in row.cells:
+                if cell.family == self.family_name and cell.qualifier == col_bytes:
+                    return cell.value
+        return None
diff --git a/requirements/common.txt b/requirements/common.txt
index 90170412b7..79ac301f63 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -44,3 +44,6 @@ torch
 transformers >= 5.4
 uvicorn
 httptools
+cachetools
+google-api-core
+google-cloud-bigtable
diff --git a/tests/conftest.py b/tests/conftest.py
index 330adb8720..4d73dfa064 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -799,3 +799,91 @@ def lmcache_engine_metadata(role="worker"):
         use_mla=False,
         role=role,
     )
+
+
+@pytest.fixture(scope="session")
+def bigtable_emulator():
+    """Start or connect to the Bigtable emulator for integration testing."""
+    # Standard
+    import os
+    import shutil
+    import subprocess
+    import time
+
+    existing_host = os.environ.get("BIGTABLE_EMULATOR_HOST")
+    if existing_host:
+        print(f"\n[Fixture] Reusing existing Bigtable emulator at {existing_host}...")
+        yield existing_host
+        return
+
+    if os.environ.get("CI") == "true" or os.environ.get("GITHUB_ACTIONS") == "true":
+        pytest.skip("Skipping Bigtable emulator integration tests in CI")
+
+    if not shutil.which("gcloud"):
+        pytest.skip(
+            "gcloud CLI not found, skipping Bigtable Emulator integration tests"
+        )
+
+    port = "8899"
+    host_port = f"localhost:{port}"
+
+    print(f"\n[Fixture] Starting Bigtable emulator on {host_port}...")
+    emulator_process = subprocess.Popen(
+        [
+            "gcloud",
+            "beta",
+            "emulators",
+            "bigtable",
+            "start",
+            f"--host-port={host_port}",
+        ],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+
+    os.environ["BIGTABLE_EMULATOR_HOST"] = host_port
+
+    def is_port_open(h: str, p: int, t: float = 1.0) -> bool:
+        # Standard
+        import socket
+
+        try:
+            with socket.create_connection((h, p), timeout=t):
+                return True
+        except OSError:
+            return False
+
+    # Wait for the emulator to initialize
+    port_num = int(port)
+    start_time = time.time()
+    success = False
+    while time.time() - start_time < 10.0:
+        if is_port_open("localhost", port_num):
+            success = True
+            break
+        time.sleep(0.5)
+
+    if not success:
+        print(
+            f"\n[Fixture] Bigtable emulator failed to bind to {host_port} within 10s!"
+        )
+        try:
+            stdout, stderr = emulator_process.communicate(timeout=2.0)
+            print(f"Stdout:\n{stdout.decode('utf-8', errors='ignore')}")
+            print(f"Stderr:\n{stderr.decode('utf-8', errors='ignore')}")
+        except Exception as e:
+            print(f"Could not retrieve process logs: {e}")
+        emulator_process.kill()
+        pytest.fail("Failed to start Bigtable emulator")
+
+    yield host_port
+
+    print("\n[Fixture] Stopping Bigtable emulator...")
+    emulator_process.terminate()
+    try:
+        emulator_process.wait(timeout=5)
+    except subprocess.TimeoutExpired:
+        emulator_process.kill()
+
+    if "BIGTABLE_EMULATOR_HOST" in os.environ:
+        del os.environ["BIGTABLE_EMULATOR_HOST"]
diff --git a/tests/v1/storage_backend/test_bigtable_connector.py b/tests/v1/storage_backend/test_bigtable_connector.py
new file mode 100644
index 0000000000..f4876eaab6
--- /dev/null
+++ b/tests/v1/storage_backend/test_bigtable_connector.py
@@ -0,0 +1,981 @@
+# SPDX-License-Identifier: Apache-2.0
+# Backup original state of sys.modules and google package attributes
+
+# Standard
+from typing import Any
+from unittest.mock import ANY, AsyncMock, MagicMock, patch
+import asyncio
+import sys
+
+_original_sys_modules: dict[str, Any] = {}
+_original_google_attrs: dict[str, Any] = {}
+_mocked_modules = [
+    "google.api_core",
+    "google.api_core.exceptions",
+    "google.api_core.gapic_v1",
+    "google.api_core.gapic_v1.client_info",
+    "google.cloud",
+    "google.cloud.bigtable",
+    "google.cloud.bigtable.row_filters",
+    "google.cloud.bigtable.data",
+    "google.cloud.bigtable.data.row_filters",
+    "google.oauth2",
+    "google.oauth2.service_account",
+]
+
+
+class MockDeadlineExceeded(Exception):
+    pass
+
+
+class MockPermissionDenied(Exception):
+    pass
+
+
+class MockNotFound(Exception):
+    pass
+
+
+class MockResourceExhausted(Exception):
+    pass
+
+
+# Setup mock google packages before any test runs
+mock_exceptions = MagicMock()
+mock_exceptions.DeadlineExceeded = MockDeadlineExceeded
+mock_exceptions.Timeout = MockDeadlineExceeded
+mock_exceptions.PermissionDenied = MockPermissionDenied
+mock_exceptions.Unauthenticated = MockPermissionDenied
+mock_exceptions.NotFound = MockNotFound
+mock_exceptions.ResourceExhausted = MockResourceExhausted
+
+mock_row_filters = MagicMock()
+mock_row_filters.StripValueTransformerFilter.return_value = MagicMock()
+
+mock_data = MagicMock()
+mock_data.BigtableDataClientAsync = MagicMock()
+
+
+def fake_client_constructor(*args, **kwargs):
+    client = mock_data.BigtableDataClientAsync.return_value
+    if isinstance(client, MagicMock):
+        client.get_table.return_value = client
+    return client
+
+
+mock_data.BigtableDataClientAsync.side_effect = fake_client_constructor
+
+mock_data.ReadRowsQuery = MagicMock()
+mock_data.RowMutationEntry = MagicMock()
+mock_data.row_filters = mock_row_filters
+
+mock_bigtable = MagicMock(data=mock_data, row_filters=mock_row_filters)
+
+mock_service_account = MagicMock()
+mock_oauth2 = MagicMock(service_account=mock_service_account)
+
+mock_api_core = MagicMock(exceptions=mock_exceptions)
+mock_gapic = MagicMock()
+mock_cloud = MagicMock(bigtable=mock_bigtable)
+
+# Third Party
+import pytest  # noqa: E402
+import torch  # noqa: E402
+
+# First Party
+from lmcache.utils import CacheEngineKey  # noqa: E402
+from lmcache.v1.config import LMCacheEngineConfig  # noqa: E402
+from lmcache.v1.memory_management import MemoryObj  # noqa: E402
+from lmcache.v1.metadata import LMCacheMetadata  # noqa: E402
+from lmcache.v1.storage_backend.connector.bigtable_connector import (  # noqa: E402
+    BigtableConnector,
+)
+from lmcache.v1.storage_backend.local_cpu_backend import LocalCPUBackend  # noqa: E402
+from lmcache.v1.storage_backend.remote_backend import RemoteBackend  # noqa: E402
+from tests.v1.utils import create_test_memory_obj  # noqa: E402
+
+
+@pytest.fixture(scope="module", autouse=True)
+def cleanup_google_mocks():
+    print("\n[Fixture] cleanup_google_mocks: running setup...")
+    # 1. Backup original state
+    for name in _mocked_modules:
+        if name in sys.modules:
+            _original_sys_modules[name] = sys.modules[name]
+    if "google" in sys.modules:
+        google_mod = sys.modules["google"]
+        for attr in ["oauth2", "cloud", "api_core"]:
+            if hasattr(google_mod, attr):
+                _original_google_attrs[attr] = getattr(google_mod, attr)
+
+    # 2. Inject mocks
+    sys.modules["google.api_core"] = mock_api_core
+    sys.modules["google.api_core.exceptions"] = mock_exceptions
+    sys.modules["google.api_core.gapic_v1"] = mock_gapic
+    sys.modules["google.api_core.gapic_v1.client_info"] = MagicMock()
+
+    sys.modules["google.cloud"] = mock_cloud
+    sys.modules["google.cloud.bigtable"] = mock_bigtable
+    sys.modules["google.cloud.bigtable.row_filters"] = mock_row_filters
+    sys.modules["google.cloud.bigtable.data"] = mock_data
+    sys.modules["google.cloud.bigtable.data.row_filters"] = mock_row_filters
+
+    sys.modules["google.oauth2"] = mock_oauth2
+    sys.modules["google.oauth2.service_account"] = mock_service_account
+
+    if "google" in sys.modules:
+        google_mod = sys.modules["google"]
+        google_mod.oauth2 = mock_oauth2  # type: ignore[attr-defined]
+        google_mod.cloud = mock_cloud  # type: ignore[attr-defined]
+        google_mod.api_core = mock_api_core  # type: ignore[attr-defined]
+
+    if "google.cloud" in sys.modules:
+        sys.modules["google.cloud"].bigtable = mock_bigtable  # type: ignore[attr-defined]
+
+    if "google.oauth2" in sys.modules:
+        sys.modules["google.oauth2"].service_account = mock_service_account  # type: ignore[attr-defined]
+
+    if "google.api_core" in sys.modules:
+        sys.modules["google.api_core"].exceptions = mock_exceptions  # type: ignore[attr-defined]
+        sys.modules["google.api_core"].gapic_v1 = mock_gapic  # type: ignore[attr-defined]
+
+    yield
+    print("\n[Fixture] cleanup_google_mocks: running teardown/cleanup...")
+    # 1. Restore sys.modules
+    for name in _mocked_modules:
+        if name in _original_sys_modules:
+            print(f"  Restoring sys.modules[{name}]")
+            sys.modules[name] = _original_sys_modules[name]
+        elif name in sys.modules:
+            print(f"  Deleting sys.modules[{name}]")
+            del sys.modules[name]
+
+    # 2. Restore google module attributes
+    if "google" in sys.modules:
+        google_mod = sys.modules["google"]
+        for attr in ["oauth2", "cloud", "api_core"]:
+            if attr in _original_google_attrs:
+                print(f"  Restoring google.{attr}")
+                setattr(google_mod, attr, _original_google_attrs[attr])
+            elif hasattr(google_mod, attr):
+                print(f"  Deleting google.{attr}")
+                delattr(google_mod, attr)
+
+
+async def mock_async_gen(items):
+    for item in items:
+        yield item
+
+
+def create_test_config(extra_overrides=None):
+    extras = {
+        "bigtable_project_id": "test-project",
+        "bigtable_instance_id": "test-instance",
+        "bigtable_table_name": "test-table",
+        "bigtable_max_chunk_size_mb": 90.0,
+        "bigtable_max_retries": 2,
+        "bigtable_exists_cache_ttl_seconds": 10.0,
+    }
+    if extra_overrides:
+        extras.update(extra_overrides)
+
+    return LMCacheEngineConfig.from_defaults(
+        chunk_size=256,
+        remote_storage_plugins=["bigtable"],
+        remote_serde="naive",
+        lmcache_instance_id="test_instance",
+        extra_config=extras,
+    )
+
+
+def create_test_metadata(kv_shape=(28, 2, 256, 8, 128), chunk_size=256):
+    return LMCacheMetadata(
+        model_name="test_model",
+        world_size=1,
+        local_world_size=1,
+        worker_id=0,
+        local_worker_id=0,
+        kv_dtype=torch.bfloat16,
+        kv_shape=kv_shape,
+        chunk_size=chunk_size,
+    )
+
+
+def create_test_key(key_id: int = 0) -> CacheEngineKey:
+    return CacheEngineKey(
+        model_name="test_model",
+        world_size=3,
+        worker_id=1,
+        chunk_hash=hash(key_id),
+        dtype=torch.bfloat16,
+    )
+
+
+@pytest.fixture
+def async_loop():
+    loop = asyncio.new_event_loop()
+    # Standard
+    import threading
+
+    # First Party
+    from lmcache.utils import start_loop_in_thread_with_exceptions
+
+    thread = threading.Thread(
+        target=start_loop_in_thread_with_exceptions,
+        args=(loop,),
+        name="test-async-loop",
+    )
+    thread.start()
+    yield loop
+    loop.call_soon_threadsafe(loop.stop)
+    thread.join(timeout=5.0)
+
+
+@pytest.fixture
+def local_cpu_backend(memory_allocator):
+    config = LMCacheEngineConfig.from_legacy(chunk_size=256)
+    metadata = create_test_metadata()
+    return LocalCPUBackend(config, metadata, memory_allocator=memory_allocator)
+
+
+@pytest.fixture(autouse=True)
+def mock_pq_executor():
+    # Patch AsyncPQExecutor so it doesn't spawn real thread workers during unit tests,
+    # but still executes jobs inline when submit_job is called.
+    with patch(
+        "lmcache.v1.storage_backend.connector.bigtable_connector.AsyncPQExecutor"
+    ) as mock:
+        instance = MagicMock()
+
+        async def fake_submit_job(fn, *args, **kwargs):
+            kwargs.pop("priority", None)
+            return await fn(*args, **kwargs)
+
+        instance.submit_job = AsyncMock(side_effect=fake_submit_job)
+        instance.shutdown_async = AsyncMock()
+        instance.shutdown = MagicMock()
+        mock.return_value = instance
+        yield mock
+
+
+class TestBigtableConnector:
+    def test_init_and_lazy_pool(self, async_loop, local_cpu_backend):
+        """Verify independent lazy async client pool initialization."""
+        config = create_test_config()
+        metadata = create_test_metadata()
+
+        mock_client_instance = MagicMock()
+        mock_client_instance.read_row = AsyncMock(return_value=None)
+        mock_data.BigtableDataClientAsync.return_value = mock_client_instance
+
+        backend = RemoteBackend(
+            config=config,
+            metadata=metadata,
+            loop=async_loop,
+            local_cpu_backend=local_cpu_backend,
+            dst_device="cpu",
+            plugin_name="bigtable",
+        )
+
+        connector = (
+            backend.connection._connector
+            if hasattr(backend.connection, "_connector")
+            else backend.connection
+        )
+        assert isinstance(connector, BigtableConnector)
+        assert connector.cfg.project_id == "test-project"
+        assert connector.cfg.instance_id == "test-instance"
+        assert connector.cfg.table_name == "test-table"
+
+        # Client pool should not be initialized yet (lazy)
+        assert connector._client is None
+
+        # Trigger first operation
+        key = create_test_key(1)
+
+        assert not backend.contains(key)
+        # Pool now initialized
+        assert connector._client is not None
+        mock_data.BigtableDataClientAsync.assert_called_once_with(
+            project="test-project", client_info=ANY
+        )
+
+        backend.close()
+        local_cpu_backend.memory_allocator.close()
+
+    def test_out_of_tree_plugin_init(self, async_loop, local_cpu_backend):
+        """Verify standalone out-of-tree plugin initialization via
+        class_name/module_path.
+        """
+        config = create_test_config(
+            {
+                "remote_storage_plugin.bigtable.module_path": "lmc_bigtable_connector",
+                "remote_storage_plugin.bigtable.class_name": "BigtableConnector",
+                "bigtable_project_id": "ext-project",
+                "bigtable_instance_id": "ext-instance",
+                "bigtable_table_name": "ext-table",
+            }
+        )
+        metadata = create_test_metadata()
+
+        mock_plugin_module = MagicMock()
+        mock_plugin_module.BigtableConnector = BigtableConnector
+        sys.modules["lmc_bigtable_connector"] = mock_plugin_module
+
+        backend = RemoteBackend(
+            config=config,
+            metadata=metadata,
+            loop=async_loop,
+            local_cpu_backend=local_cpu_backend,
+            dst_device="cpu",
+            plugin_name="bigtable",
+        )
+
+        connector = (
+            backend.connection._connector
+            if hasattr(backend.connection, "_connector")
+            else backend.connection
+        )
+        assert connector.__class__.__name__ == "BigtableConnector"
+        assert connector.cfg.project_id == "ext-project"
+        assert connector.cfg.instance_id == "ext-instance"
+        assert connector.cfg.table_name == "ext-table"
+
+        backend.close()
+        local_cpu_backend.memory_allocator.close()
+
+    def test_credentials_path_init(self, async_loop, local_cpu_backend):
+        """Verify passing credentials_path initializes client with
+        explicit service account.
+        """
+        config = create_test_config(
+            {"bigtable_credentials_path": "/path/to/creds.json"}
+        )
+        metadata = create_test_metadata()
+
+        mock_creds = MagicMock()
+        mock_service_account.Credentials.from_service_account_file.return_value = (
+            mock_creds
+        )
+
+        backend = RemoteBackend(
+            config=config,
+            metadata=metadata,
+            loop=async_loop,
+            local_cpu_backend=local_cpu_backend,
+            dst_device="cpu",
+            plugin_name="bigtable",
+        )
+
+        connector = (
+            backend.connection._connector
+            if hasattr(backend.connection, "_connector")
+            else backend.connection
+        )
+        connector._get_client()
+
+        mock_service_account.Credentials.from_service_account_file.assert_called_once_with(
+            "/path/to/creds.json"
+        )
+        mock_data.BigtableDataClientAsync.assert_called_with(
+            project="test-project", credentials=mock_creds, client_info=ANY
+        )
+
+        backend.close()
+        local_cpu_backend.memory_allocator.close()
+
+    def test_custom_row_key_template(self, async_loop, local_cpu_backend):
+        """Verify substituting {model} and {hash} placeholders flexibly."""
+        config = create_test_config({"bigtable_row_key_template": "{model}#{hash}"})
+        metadata = create_test_metadata()
+        backend = RemoteBackend(
+            config=config,
+            metadata=metadata,
+            loop=async_loop,
+            local_cpu_backend=local_cpu_backend,
+            dst_device="cpu",
+            plugin_name="bigtable",
+        )
+
+        key = create_test_key(123)
+        connector = (
+            backend.connection._connector
+            if hasattr(backend.connection, "_connector")
+            else backend.connection
+        )
+        row_key = connector.schema.get_row_key(key)
+        row_key_str = row_key.decode("utf-8")
+
+        assert row_key_str.startswith("test_model@3@1@bfloat16#")
+        assert row_key_str.endswith(key.chunk_hash_hex)
+
+        backend.close()
+        local_cpu_backend.memory_allocator.close()
+
+    def test_local_ttl_cache_hits(self, async_loop, local_cpu_backend):
+        """Verify that exists calls accelerate via local TTL cache
+        without hitting Bigtable.
+        """
+        config = create_test_config()
+        metadata = create_test_metadata()
+
+        mock_client_instance = MagicMock()
+        mock_client_instance.read_row = AsyncMock(return_value=MagicMock())
+        mock_data.BigtableDataClientAsync.return_value = mock_client_instance
+
+        backend = RemoteBackend(
+            config=config,
+            metadata=metadata,
+            loop=async_loop,
+            local_cpu_backend=local_cpu_backend,
+            dst_device="cpu",
+            plugin_name="bigtable",
+        )
+
+        key = create_test_key(1)
+
+        assert backend.contains(key)
+        assert mock_client_instance.read_row.call_count == 1
+
+        assert backend.contains(key)
+        assert mock_client_instance.read_row.call_count == 1
+
+        backend.close()
+        local_cpu_backend.memory_allocator.close()
+
+    def test_get_blocking_hit(self, async_loop, local_cpu_backend):
+        config = create_test_config()
+        metadata = create_test_metadata(kv_shape=(1, 2, 16, 8, 128), chunk_size=16)
+        local_cpu_backend.metadata = metadata
+
+        memory_obj = create_test_memory_obj()
+        expected_bytes = bytes(memory_obj.byte_array)
+
+        mock_cell = MagicMock(value=expected_bytes)
+        mock_row = MagicMock()
+        mock_row.cells = {"cf": {b"data": [mock_cell]}}
+
+        mock_client_instance = MagicMock()
+        mock_client_instance.read_row = AsyncMock(return_value=mock_row)
+        mock_data.BigtableDataClientAsync.return_value = mock_client_instance
+
+        backend = RemoteBackend(
+            config=config,
+            metadata=metadata,
+            loop=async_loop,
+            local_cpu_backend=local_cpu_backend,
+            dst_device="cpu",
+            plugin_name="bigtable",
+        )
+
+        key = create_test_key(1)
+
+        res = backend.get_blocking(key)
+        assert res is not None
+        assert isinstance(res, MemoryObj)
+        assert bytes(res.byte_array)[: len(expected_bytes)] == expected_bytes
+
+        connector = (
+            backend.connection._connector
+            if hasattr(backend.connection, "_connector")
+            else backend.connection
+        )
+        assert connector.exists_cache.get(key.to_string()) is True
+
+        backend.close()
+        local_cpu_backend.memory_allocator.close()
+
+    def test_cell_size_validation_skips_large_writes(
+        self, async_loop, local_cpu_backend
+    ):
+        """Validate chunk size at write time. Skips > 90MB rather than failing."""
+        config = create_test_config()
+        metadata = create_test_metadata()
+
+        mock_client_instance = MagicMock()
+        mock_client_instance.mutate_row = AsyncMock()
+        mock_data.BigtableDataClientAsync.return_value = mock_client_instance
+
+        backend = RemoteBackend(
+            config=config,
+            metadata=metadata,
+            loop=async_loop,
+            local_cpu_backend=local_cpu_backend,
+            dst_device="cpu",
+            plugin_name="bigtable",
+        )
+
+        key = create_test_key(1)
+        large_view = bytearray(95 * 1024 * 1024)
+        mock_memory_obj = MagicMock()
+        mock_memory_obj.byte_array = large_view
+
+        connector = (
+            backend.connection._connector
+            if hasattr(backend.connection, "_connector")
+            else backend.connection
+        )
+        asyncio.run_coroutine_threadsafe(
+            connector._put_internal(key, mock_memory_obj),
+            async_loop,
+        ).result()
+
+        mock_client_instance.mutate_row.assert_not_called()
+
+        backend.close()
+        local_cpu_backend.memory_allocator.close()
+
+    def test_error_handling_timeouts_as_miss(self, async_loop, local_cpu_backend):
+        """TimeoutError / DeadlineExceeded treated as cache miss and fall through."""
+        config = create_test_config()
+        metadata = create_test_metadata()
+
+        mock_client_instance = MagicMock()
+        mock_client_instance.read_row = AsyncMock(
+            side_effect=MockDeadlineExceeded("timeout")
+        )
+        mock_data.BigtableDataClientAsync.return_value = mock_client_instance
+
+        backend = RemoteBackend(
+            config=config,
+            metadata=metadata,
+            loop=async_loop,
+            local_cpu_backend=local_cpu_backend,
+            dst_device="cpu",
+            plugin_name="bigtable",
+        )
+
+        key = create_test_key(1)
+
+        assert not backend.contains(key)
+
+        res = backend.get_blocking(key)
+        assert res is None
+
+        backend.close()
+        local_cpu_backend.memory_allocator.close()
+
+    def test_error_handling_auth_raises(self, async_loop, local_cpu_backend):
+        """PermissionDenied / Unauthenticated propagate up."""
+        config = create_test_config()
+        metadata = create_test_metadata()
+
+        mock_client_instance = MagicMock()
+        mock_client_instance.read_row = AsyncMock(
+            side_effect=MockPermissionDenied("auth error")
+        )
+        mock_data.BigtableDataClientAsync.return_value = mock_client_instance
+
+        backend = RemoteBackend(
+            config=config,
+            metadata=metadata,
+            loop=async_loop,
+            local_cpu_backend=local_cpu_backend,
+            dst_device="cpu",
+            plugin_name="bigtable",
+        )
+
+        key = create_test_key(1)
+
+        connector = (
+            backend.connection._connector
+            if hasattr(backend.connection, "_connector")
+            else backend.connection
+        )
+        with pytest.raises(MockPermissionDenied):
+            asyncio.run_coroutine_threadsafe(connector.exists(key), async_loop).result()
+
+        backend.close()
+        local_cpu_backend.memory_allocator.close()
+
+    def test_batched_get(self, async_loop, local_cpu_backend):
+        """Verify batched_get retrieves multiple memory objects cleanly."""
+        config = create_test_config()
+        metadata = create_test_metadata(kv_shape=(1, 2, 16, 8, 128), chunk_size=16)
+        local_cpu_backend.metadata = metadata
+
+        memory_obj1 = create_test_memory_obj()
+        memory_obj2 = create_test_memory_obj()
+        bytes1 = bytes(memory_obj1.byte_array)
+        bytes2 = bytes(memory_obj2.byte_array)
+
+        mock_client_instance = MagicMock()
+        mock_data.BigtableDataClientAsync.return_value = mock_client_instance
+
+        backend = RemoteBackend(
+            config=config,
+            metadata=metadata,
+            loop=async_loop,
+            local_cpu_backend=local_cpu_backend,
+            dst_device="cpu",
+            plugin_name="bigtable",
+        )
+
+        key1 = create_test_key(1)
+        key2 = create_test_key(2)
+
+        connector = (
+            backend.connection._connector
+            if hasattr(backend.connection, "_connector")
+            else backend.connection
+        )
+        row_key1 = connector.schema.get_row_key(key1)
+        row_key2 = connector.schema.get_row_key(key2)
+
+        mock_cell1 = MagicMock(value=bytes1)
+        mock_cell2 = MagicMock(value=bytes2)
+        mock_row1 = MagicMock(row_key=row_key1)
+        mock_row2 = MagicMock(row_key=row_key2)
+        mock_row1.cells = {"cf": {b"data": [mock_cell1]}}
+        mock_row2.cells = {"cf": {b"data": [mock_cell2]}}
+        mock_client_instance.read_rows = AsyncMock(return_value=[mock_row1, mock_row2])
+
+        res = asyncio.run_coroutine_threadsafe(
+            backend.connection.batched_get([key1, key2]),
+            async_loop,
+        ).result()
+
+        assert len(res) == 2
+        assert res[0] is not None and res[1] is not None
+        assert bytes(res[0].byte_array)[: len(bytes1)] == bytes1
+        assert bytes(res[1].byte_array)[: len(bytes2)] == bytes2
+
+        backend.close()
+        local_cpu_backend.memory_allocator.close()
+
+    def test_batched_put(self, async_loop, local_cpu_backend):
+        """Verify batched_put packs mutations and sends bulk mutate rows cleanly."""
+        config = create_test_config()
+        metadata = create_test_metadata()
+
+        mock_client_instance = MagicMock()
+        mock_client_instance.bulk_mutate_rows = AsyncMock()
+        mock_data.BigtableDataClientAsync.return_value = mock_client_instance
+
+        backend = RemoteBackend(
+            config=config,
+            metadata=metadata,
+            loop=async_loop,
+            local_cpu_backend=local_cpu_backend,
+            dst_device="cpu",
+            plugin_name="bigtable",
+        )
+
+        key1 = create_test_key(1)
+        key2 = create_test_key(2)
+        memory_obj1 = create_test_memory_obj()
+        memory_obj2 = create_test_memory_obj()
+
+        asyncio.run_coroutine_threadsafe(
+            backend.connection.batched_put([key1, key2], [memory_obj1, memory_obj2]),
+            async_loop,
+        ).result()
+
+        mock_client_instance.bulk_mutate_rows.assert_called_once()
+
+        backend.close()
+        local_cpu_backend.memory_allocator.close()
+
+    def test_batched_async_contains(self, async_loop, local_cpu_backend):
+        """Verify batched_async_contains returns correct match counts."""
+        config = create_test_config()
+        metadata = create_test_metadata()
+
+        mock_client_instance = MagicMock()
+        mock_data.BigtableDataClientAsync.return_value = mock_client_instance
+
+        backend = RemoteBackend(
+            config=config,
+            metadata=metadata,
+            loop=async_loop,
+            local_cpu_backend=local_cpu_backend,
+            dst_device="cpu",
+            plugin_name="bigtable",
+        )
+
+        key1 = create_test_key(1)
+        key2 = create_test_key(2)
+
+        connector = (
+            backend.connection._connector
+            if hasattr(backend.connection, "_connector")
+            else backend.connection
+        )
+        row_key1 = connector.schema.get_row_key(key1)
+        mock_row = MagicMock(row_key=row_key1)
+        mock_client_instance.read_rows = AsyncMock(return_value=[mock_row])
+
+        res = asyncio.run_coroutine_threadsafe(
+            backend.connection.batched_async_contains("test", [key1, key2]),
+            async_loop,
+        ).result()
+
+        # Only key1 rowkey starts with matched row_key
+        # from read_rows, so match count is 1
+        assert res == 1
+
+        backend.close()
+        local_cpu_backend.memory_allocator.close()
+
+    def test_remove_sync(self, async_loop, local_cpu_backend):
+        """Verify remove_sync issues DeleteAllFromRow mutation cleanly."""
+        config = create_test_config()
+        metadata = create_test_metadata()
+
+        mock_client_instance = MagicMock()
+        mock_client_instance.mutate_row = AsyncMock()
+        mock_data.BigtableDataClientAsync.return_value = mock_client_instance
+
+        backend = RemoteBackend(
+            config=config,
+            metadata=metadata,
+            loop=async_loop,
+            local_cpu_backend=local_cpu_backend,
+            dst_device="cpu",
+            plugin_name="bigtable",
+        )
+
+        key = create_test_key(1)
+        res = backend.connection.remove_sync(key)
+
+        assert res is True
+        # Wait up to 1 second for background deletion task to execute
+        # Standard
+        import time
+
+        for _ in range(100):
+            if mock_client_instance.mutate_row.call_count == 1:
+                break
+            time.sleep(0.01)
+        mock_client_instance.mutate_row.assert_called_once()
+
+        backend.close()
+        local_cpu_backend.memory_allocator.close()
+
+    def test_list(self, async_loop, local_cpu_backend):
+        """Verify list() returns parsed standardized LMCache key metadata formats."""
+        config = create_test_config()
+        metadata = create_test_metadata()
+
+        mock_row1 = MagicMock(row_key=b"hash1#test_model@3@1@bfloat16")
+        mock_row2 = MagicMock(row_key=b"hash2#test_model@3@1@bfloat16")
+
+        mock_client_instance = MagicMock()
+        mock_client_instance.read_rows = AsyncMock(return_value=[mock_row1, mock_row2])
+        mock_data.BigtableDataClientAsync.return_value = mock_client_instance
+
+        backend = RemoteBackend(
+            config=config,
+            metadata=metadata,
+            loop=async_loop,
+            local_cpu_backend=local_cpu_backend,
+            dst_device="cpu",
+            plugin_name="bigtable",
+        )
+
+        res = asyncio.run_coroutine_threadsafe(
+            backend.connection.list(),
+            async_loop,
+        ).result()
+
+        assert len(res) == 2
+        assert "test_model@3@1@hash1@bfloat16" in res
+        assert "test_model@3@1@hash2@bfloat16" in res
+
+        backend.close()
+        local_cpu_backend.memory_allocator.close()
+
+    def test_ping(self, async_loop, local_cpu_backend):
+        """Verify ping executes SELECT 1; health queries cleanly."""
+        config = create_test_config()
+        metadata = create_test_metadata()
+
+        async def mock_query_iter():
+            yield MagicMock()
+
+        mock_client_instance = MagicMock()
+        mock_client_instance.execute_query = AsyncMock(return_value=mock_query_iter())
+        mock_data.BigtableDataClientAsync.return_value = mock_client_instance
+
+        backend = RemoteBackend(
+            config=config,
+            metadata=metadata,
+            loop=async_loop,
+            local_cpu_backend=local_cpu_backend,
+            dst_device="cpu",
+            plugin_name="bigtable",
+        )
+
+        res = asyncio.run_coroutine_threadsafe(
+            backend.connection.ping(),
+            async_loop,
+        ).result()
+
+        assert res == 0
+        mock_client_instance.execute_query.assert_called_once()
+
+        backend.close()
+        local_cpu_backend.memory_allocator.close()
+
+    def test_get_client_credentials_success(self, async_loop, local_cpu_backend):
+        config = create_test_config(
+            extra_overrides={"bigtable_credentials_path": "/path/to/fake_creds.json"}
+        )
+        connector = BigtableConnector(async_loop, local_cpu_backend, config)
+
+        mock_creds = MagicMock()
+        with (
+            patch(
+                "google.oauth2.service_account.Credentials.from_service_account_file",
+                return_value=mock_creds,
+            ) as mock_from_file,
+            patch(
+                "google.cloud.bigtable.data.BigtableDataClientAsync"
+            ) as mock_client_cls,
+        ):
+            client = connector._get_client()
+
+            mock_from_file.assert_called_once_with("/path/to/fake_creds.json")
+            mock_client_cls.assert_called_once()
+            assert mock_client_cls.call_args[1]["credentials"] == mock_creds
+            assert mock_client_cls.call_args[1]["project"] == "test-project"
+            assert client == mock_client_cls.return_value
+
+    def test_get_client_credentials_os_error(self, async_loop, local_cpu_backend):
+        config = create_test_config(
+            extra_overrides={
+                "bigtable_credentials_path": "/path/to/permission_denied_creds.json"
+            }
+        )
+        connector = BigtableConnector(async_loop, local_cpu_backend, config)
+
+        with (
+            patch(
+                "google.oauth2.service_account.Credentials.from_service_account_file",
+                side_effect=PermissionError("Permission denied"),
+            ),
+            patch(
+                "google.cloud.bigtable.data.BigtableDataClientAsync"
+            ) as mock_client_cls,
+            patch(
+                "lmcache.v1.storage_backend.connector.bigtable_connector.logger.warning"
+            ) as mock_warning,
+        ):
+            client = connector._get_client()
+
+            mock_client_cls.assert_called_once()
+            assert (
+                "credentials" not in mock_client_cls.call_args[1]
+                or mock_client_cls.call_args[1]["credentials"] is None
+            )
+            assert mock_client_cls.call_args[1]["project"] == "test-project"
+
+            mock_warning.assert_called_once()
+            assert (
+                "Falling back to Application Default Credentials"
+                in mock_warning.call_args[0][0]
+            )
+            assert client == mock_client_cls.return_value
+
+    def test_get_client_credentials_value_error(self, async_loop, local_cpu_backend):
+        config = create_test_config(
+            extra_overrides={
+                "bigtable_credentials_path": "/path/to/corrupted_creds.json"
+            }
+        )
+        connector = BigtableConnector(async_loop, local_cpu_backend, config)
+
+        with (
+            patch(
+                "google.oauth2.service_account.Credentials.from_service_account_file",
+                side_effect=ValueError("Invalid JSON"),
+            ),
+            patch(
+                "google.cloud.bigtable.data.BigtableDataClientAsync"
+            ) as mock_client_cls,
+            patch(
+                "lmcache.v1.storage_backend.connector.bigtable_connector.logger.warning"
+            ) as mock_warning,
+        ):
+            client = connector._get_client()
+
+            mock_client_cls.assert_called_once()
+            assert (
+                "credentials" not in mock_client_cls.call_args[1]
+                or mock_client_cls.call_args[1]["credentials"] is None
+            )
+            assert mock_client_cls.call_args[1]["project"] == "test-project"
+
+            mock_warning.assert_called_once()
+            assert (
+                "Falling back to Application Default Credentials"
+                in mock_warning.call_args[0][0]
+            )
+            assert client == mock_client_cls.return_value
+
+    def test_get_client_credentials_auth_error(self, async_loop, local_cpu_backend):
+        config = create_test_config(
+            extra_overrides={"bigtable_credentials_path": "/path/to/expired_creds.json"}
+        )
+        connector = BigtableConnector(async_loop, local_cpu_backend, config)
+
+        # Third Party
+        import google.auth.exceptions
+
+        with (
+            patch(
+                "google.oauth2.service_account.Credentials.from_service_account_file",
+                side_effect=google.auth.exceptions.GoogleAuthError("Auth failed"),
+            ),
+            patch(
+                "google.cloud.bigtable.data.BigtableDataClientAsync"
+            ) as mock_client_cls,
+            patch(
+                "lmcache.v1.storage_backend.connector.bigtable_connector.logger.warning"
+            ) as mock_warning,
+        ):
+            client = connector._get_client()
+
+            mock_client_cls.assert_called_once()
+            assert (
+                "credentials" not in mock_client_cls.call_args[1]
+                or mock_client_cls.call_args[1]["credentials"] is None
+            )
+            assert mock_client_cls.call_args[1]["project"] == "test-project"
+
+            mock_warning.assert_called_once()
+            assert (
+                "Falling back to Application Default Credentials"
+                in mock_warning.call_args[0][0]
+            )
+            assert client == mock_client_cls.return_value
+
+    def test_bigtable_config_defaults(self):
+        """Verify that Bigtable configuration defaults are consistent."""
+        # First Party
+        from lmcache.v1.storage_backend.connector.bigtable_config import (
+            BigtablePluginConfig,
+        )
+
+        # Test defaults directly from class instantiation
+        config = BigtablePluginConfig(
+            project_id="test-project",
+            instance_id="test-instance",
+            table_name="test-table",
+        )
+        assert config.max_chunk_size_mb == 90.0
+        assert config.read_timeout_sec == 0.2
+        assert config.write_timeout_sec == 0.5
+        assert config.max_retries == 3
+
+        # Test defaults loaded via from_extra_config (no explicit overrides)
+        extra_config = {
+            "bigtable_project_id": "test-project",
+            "bigtable_instance_id": "test-instance",
+            "bigtable_table_name": "test-table",
+        }
+        loaded_config = BigtablePluginConfig.from_extra_config(extra_config)
+        assert loaded_config.max_chunk_size_mb == 90.0
+        assert loaded_config.read_timeout_sec == 0.2
+        assert loaded_config.write_timeout_sec == 0.5
+        assert loaded_config.max_retries == 3
diff --git a/tests/v1/storage_backend/test_bigtable_integration.py b/tests/v1/storage_backend/test_bigtable_integration.py
new file mode 100644
index 0000000000..946def03dc
--- /dev/null
+++ b/tests/v1/storage_backend/test_bigtable_integration.py
@@ -0,0 +1,281 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 Samsung Electronics Co., Ltd. All Rights Reserved
+
+# Standard
+import asyncio
+import threading
+
+# Third Party
+from google.cloud.bigtable import Client
+import pytest
+import torch
+
+# First Party
+from lmcache.utils import CacheEngineKey
+from lmcache.v1.config import LMCacheEngineConfig
+from lmcache.v1.memory_management import MixedMemoryAllocator
+from lmcache.v1.metadata import LMCacheMetadata
+from lmcache.v1.storage_backend.local_cpu_backend import LocalCPUBackend
+from lmcache.v1.storage_backend.remote_backend import RemoteBackend
+from tests.v1.utils import create_test_memory_obj
+
+
+# Simple test helpers
+def create_test_metadata(kv_shape=(2, 2, 256, 8, 128)):
+    return LMCacheMetadata(
+        model_name="test_model",
+        world_size=1,
+        local_world_size=1,
+        worker_id=0,
+        local_worker_id=0,
+        kv_dtype=torch.bfloat16,
+        kv_shape=kv_shape,
+        use_mla=False,
+        role="worker",
+    )
+
+
+def create_test_config(extra_overrides=None):
+    extras = {
+        "bigtable_project_id": "test-project",
+        "bigtable_instance_id": "test-instance",
+        "bigtable_table_name": "test-table",
+        "bigtable_family_name": "cf",
+        "bigtable_column_name": "data",
+        "bigtable_max_chunk_size_mb": 5.0,  # 5MB threshold for testing skip logic
+        "bigtable_max_retries": 2,
+    }
+    if extra_overrides:
+        extras.update(extra_overrides)
+
+    return LMCacheEngineConfig.from_defaults(
+        chunk_size=256,
+        remote_storage_plugins=["bigtable"],
+        remote_serde="naive",
+        lmcache_instance_id="test_instance",
+        extra_config=extras,
+    )
+
+
+@pytest.fixture
+def async_loop():
+    loop = asyncio.new_event_loop()
+    # Standard
+    import threading
+
+    # First Party
+    from lmcache.utils import start_loop_in_thread_with_exceptions
+
+    thread = threading.Thread(
+        target=start_loop_in_thread_with_exceptions,
+        args=(loop,),
+        name="test-async-loop",
+    )
+    thread.start()
+    yield loop
+    loop.call_soon_threadsafe(loop.stop)
+    thread.join(timeout=5.0)
+
+
+@pytest.mark.integration
+class TestBigtableEmulatorIntegration:
+    @pytest.fixture(autouse=True)
+    def setup_emulator_table(self, bigtable_emulator):
+        """Prepare instance, table, and column family in the emulator."""
+        project_id = "test-project"
+        instance_id = "test-instance"
+        table_name = "test-table"
+        family_name = "cf"
+
+        # Initialize sync admin client using the emulator host
+        client = Client(project=project_id, admin=True)
+        instance = client.instance(instance_id)
+
+        table = instance.table(table_name)
+        try:
+            if table.exists():
+                table.delete()
+        except Exception:
+            pass
+
+        table.create()
+        cf = table.column_family(family_name)
+        cf.create()
+
+        yield
+
+        # Clean up table after each test
+        try:
+            if table.exists():
+                table.delete()
+        except Exception:
+            pass
+
+    @pytest.fixture
+    def memory_allocator(self):
+        alloc = MixedMemoryAllocator(100 * 1024 * 1024)  # 100MB
+        yield alloc
+        alloc.close()
+
+    @pytest.fixture
+    def local_cpu_backend(self, memory_allocator):
+        config = LMCacheEngineConfig.from_defaults(chunk_size=256)
+        metadata = create_test_metadata()
+        backend = LocalCPUBackend(config, metadata, memory_allocator=memory_allocator)
+        yield backend
+        backend.close()
+
+    def test_integration_put_and_get(self, async_loop, local_cpu_backend):
+        """Verify standard put and get of chunk bytes with emulator."""
+        config = create_test_config()
+        metadata = create_test_metadata()
+
+        backend = RemoteBackend(
+            config=config,
+            metadata=metadata,
+            loop=async_loop,
+            local_cpu_backend=local_cpu_backend,
+            dst_device="cpu",
+            plugin_name="bigtable",
+        )
+
+        key = CacheEngineKey("test_model", 0, 0, 256, torch.bfloat16)
+
+        # Use test helper to create concrete MemoryObj and fill it with values
+        memory_obj = create_test_memory_obj(
+            shape=torch.Size([2, 2, 256, 8, 128]), dtype=torch.bfloat16
+        )
+        memory_obj.tensor.fill_(3.14)
+
+        # Write asynchronously and wait for future
+        fut = backend.submit_put_task(key, memory_obj)
+        fut.result(timeout=5.0)
+
+        # Query contains
+        assert backend.contains(key)
+
+        # Read back using get_blocking (allocates automatically)
+        retrieved_obj = backend.get_blocking(key)
+        assert retrieved_obj is not None
+        assert torch.all(retrieved_obj.tensor == 3.14)
+
+        backend.close()
+
+    def test_integration_batched_put_and_get(self, async_loop, local_cpu_backend):
+        """Verify dynamic batching and batched operations with emulator."""
+        config = create_test_config()
+        metadata = create_test_metadata()
+
+        backend = RemoteBackend(
+            config=config,
+            metadata=metadata,
+            loop=async_loop,
+            local_cpu_backend=local_cpu_backend,
+            dst_device="cpu",
+            plugin_name="bigtable",
+        )
+
+        keys = [
+            CacheEngineKey("test_model", 0, i, 256, torch.bfloat16) for i in range(5)
+        ]
+
+        memory_objs = []
+        for i in range(5):
+            memory_obj = create_test_memory_obj(
+                shape=torch.Size([2, 2, 256, 8, 128]), dtype=torch.bfloat16
+            )
+            memory_obj.tensor.fill_(float(i))
+            memory_objs.append(memory_obj)
+
+        # Use threading events to wait for batched write completion
+        done_events = [threading.Event() for _ in range(5)]
+
+        def on_complete(key):
+            idx = keys.index(key)
+            done_events[idx].set()
+
+        # Batched Put
+        backend.batched_submit_put_task(
+            keys, memory_objs, on_complete_callback=on_complete
+        )
+
+        # Wait for all writes to finish
+        for ev in done_events:
+            assert ev.wait(timeout=10.0)
+
+        # Assert all keys are in backend
+        for key in keys:
+            assert backend.contains(key)
+
+        # Read all back using batched_get_blocking
+        retrieved_objs = backend.batched_get_blocking(keys)
+        assert len(retrieved_objs) == 5
+
+        for i in range(5):
+            assert retrieved_objs[i] is not None
+            assert torch.all(retrieved_objs[i].tensor == float(i))
+
+        backend.close()
+
+    def test_integration_remove(self, async_loop, local_cpu_backend):
+        """Verify deleting chunks from emulator works."""
+        config = create_test_config()
+        metadata = create_test_metadata()
+
+        backend = RemoteBackend(
+            config=config,
+            metadata=metadata,
+            loop=async_loop,
+            local_cpu_backend=local_cpu_backend,
+            dst_device="cpu",
+            plugin_name="bigtable",
+        )
+
+        key = CacheEngineKey("test_model", 0, 10, 256, torch.bfloat16)
+        memory_obj = create_test_memory_obj(
+            shape=torch.Size([2, 2, 256, 8, 128]), dtype=torch.bfloat16
+        )
+        memory_obj.tensor.fill_(1.0)
+
+        fut = backend.submit_put_task(key, memory_obj)
+        fut.result(timeout=5.0)
+        assert backend.contains(key)
+
+        # Remove
+        assert backend.remove(key)
+        assert not backend.contains(key)
+
+        backend.close()
+
+    def test_integration_skips_large_writes(self, async_loop, local_cpu_backend):
+        """Verify writing a chunk larger than max_chunk_size_mb is skipped
+        without failure.
+        """
+        config = create_test_config()  # Max size is 5.0 MB
+        metadata = create_test_metadata()
+
+        backend = RemoteBackend(
+            config=config,
+            metadata=metadata,
+            loop=async_loop,
+            local_cpu_backend=local_cpu_backend,
+            dst_device="cpu",
+            plugin_name="bigtable",
+        )
+
+        key = CacheEngineKey("test_model", 0, 99, 256, torch.bfloat16)
+
+        # Create a payload of ~8.38 MB (exceeds the 5.0 MB threshold)
+        # 8 * 2 * 256 * 8 * 128 * 2 bytes = 8,388,608 bytes
+        memory_obj = create_test_memory_obj(
+            shape=torch.Size([8, 2, 256, 8, 128]), dtype=torch.bfloat16
+        )
+
+        # Try to put and wait for it
+        fut = backend.submit_put_task(key, memory_obj)
+        fut.result(timeout=5.0)
+
+        # Verify it was NOT written (skips write)
+        assert not backend.contains(key)
+
+        backend.close()

From 83164de9523eb80310c86c76ff2571bfc97074a0 Mon Sep 17 00:00:00 2001
From: Yihua Cheng <yihua98@uchicago.edu>
Date: Thu, 11 Jun 2026 15:05:28 -0700
Subject: [PATCH 48/57] [Core][MP] Optimize DSV4 store/load size (#3635)

Signed-off-by: ApostaC <yihua@tensormesh.ai>
---
 lmcache/integration/vllm/kv_cache_groups.py   |  78 +++++++-
 lmcache/v1/kv_layer_groups.py                 | 188 +++++++++++-------
 lmcache/v1/multiprocess/gpu_context.py        |  65 ++++--
 lmcache/v1/multiprocess/group_view.py         |   4 +
 .../v1/multiprocess/modules/gpu_transfer.py   |   2 +-
 lmcache/v1/platform/cpu/cache_context.py      |  15 +-
 tests/v1/multiprocess/test_gpu_context.py     |  21 +-
 tests/v1/test_kv_cache_groups.py              |  13 +-
 tests/v1/test_kv_layer_groups_manager.py      | 147 +++++++++++---
 tests/v1/test_vllm_kv_cache_groups.py         | 131 +++++++++++-
 10 files changed, 536 insertions(+), 128 deletions(-)

diff --git a/lmcache/integration/vllm/kv_cache_groups.py b/lmcache/integration/vllm/kv_cache_groups.py
index 4945a08481..b15ae67126 100644
--- a/lmcache/integration/vllm/kv_cache_groups.py
+++ b/lmcache/integration/vllm/kv_cache_groups.py
@@ -5,7 +5,7 @@
 from __future__ import annotations
 
 # Standard
-from collections.abc import Mapping
+from collections.abc import Mapping, Sequence
 from typing import TYPE_CHECKING, Any
 
 if TYPE_CHECKING:
@@ -13,8 +13,79 @@
     from lmcache.v1.gpu_connector.utils import LayoutHints
 
 # First Party
+from lmcache.logging import init_logger
 from lmcache.v1.multiprocess.group_view import EngineGroupInfo
 
+logger = init_logger(__name__)
+
+
+def _is_sliding_window_spec(spec: Any) -> bool:
+    """Return whether the KV cache spec is a vLLM sliding-window spec.
+
+    Checked by class name so this module stays importable without vLLM.
+    Subclasses such as ``SlidingWindowMLASpec`` count.
+    """
+    return any(cls.__name__ == "SlidingWindowSpec" for cls in type(spec).__mro__)
+
+
+def _resolve_per_layer_sw_sizes(
+    vllm_groups: Sequence[Any],
+    layer_to_idx: Mapping[str, int],
+    num_layers: int,
+) -> list[int]:
+    """Resolve the sliding window size in tokens for each registered KV tensor.
+
+    Will resolve -1 for non-sliding-window layers.
+
+    Args:
+        vllm_groups: vLLM ``KVCacheGroupSpec`` instances.
+        layer_to_idx: Layer name to registered tensor index mapping.
+        num_layers: Number of registered KV tensors.
+
+    Returns:
+        A list of length ``num_layers`` mapping each registered tensor index
+        to its sliding window size in tokens, or ``-1`` for
+        non-sliding-window layers.
+    """
+    per_layer_sw_size = [-1] * num_layers
+    for group in vllm_groups:
+        spec = getattr(group, "kv_cache_spec", None)
+        if spec is None:
+            continue
+        # ``UniformTypeKVCacheSpecs`` carries per-layer specs in
+        # ``kv_cache_specs``; other specs apply to all of the group's layers.
+        per_layer_specs = getattr(spec, "kv_cache_specs", None)
+        for name in group.layer_names:
+            layer_spec = per_layer_specs[name] if per_layer_specs else spec
+            if _is_sliding_window_spec(layer_spec):
+                per_layer_sw_size[layer_to_idx[name]] = layer_spec.sliding_window
+    return per_layer_sw_size
+
+
+def _merge_layer_sw_sizes(per_layer_sw_size: list[int], indices: list[int]) -> int:
+    """Merge the per-layer sliding window sizes of one LMCache group.
+
+    Args:
+        per_layer_sw_size: Sliding window size per registered tensor index.
+        indices: Registered tensor indices of the group's layers.
+
+    Returns:
+        The group's common sliding window size in tokens, or ``-1`` when the
+        layers are not sliding-window attention.
+
+    Raises:
+        ValueError: If the layers have different non-negative sliding window sizes.
+    """
+    sw_sizes = {per_layer_sw_size[idx] for idx in indices}
+    if len(sw_sizes) != 1:
+        raise ValueError(
+            f"Layers with indices {indices} have different sliding window sizes "
+            f"{sw_sizes}, but they are in the same group. This should "
+            "not happen because vLLM should only group layers with the same "
+            "KV cache spec, but got inconsistent metadata or registered tensors."
+        )
+    return sw_sizes.pop()
+
 
 def create_engine_group_infos_from_vllm(
     kv_cache_config: Any,
@@ -82,6 +153,7 @@ def create_engine_group_infos_from_vllm(
     # wrong-block-size group would corrupt the per-group block-id counts).
     per_layer_group_idx: list[int] | None = None
     group_tokens_per_block: dict[int, int] = {}
+    per_layer_sw_size = [-1] * num_layers
     if vllm_groups:
         per_layer_group_idx = [EXCLUDED_ENGINE_GROUP] * num_layers
         for engine_group_id, group in enumerate(vllm_groups):
@@ -91,6 +163,9 @@ def create_engine_group_infos_from_vllm(
             group_tokens_per_block[engine_group_id] = group.kv_cache_spec.block_size
             for name in group.layer_names:
                 per_layer_group_idx[layer_to_idx[name]] = engine_group_id
+        per_layer_sw_size = _resolve_per_layer_sw_sizes(
+            vllm_groups, layer_to_idx, num_layers
+        )
 
     # Within one vLLM engine group, layers can have different hidden dimensions
     # (e.g. a different head count), which require different GPU copy kernels.
@@ -104,6 +179,7 @@ def create_engine_group_infos_from_vllm(
             engine_group_id=identity[4],
             layer_indices=tuple(indices),
             tokens_per_block=group_tokens_per_block.get(identity[4], 0),
+            sw_size_tokens=_merge_layer_sw_sizes(per_layer_sw_size, indices),
         )
         for identity, indices in group_layers_by_identity(
             normalized_kv_caches,
diff --git a/lmcache/v1/kv_layer_groups.py b/lmcache/v1/kv_layer_groups.py
index 4b712687de..f00cdf4cf3 100644
--- a/lmcache/v1/kv_layer_groups.py
+++ b/lmcache/v1/kv_layer_groups.py
@@ -179,16 +179,11 @@ class KernelGroupInfo:
     initialization time (carried in ``EngineGroupInfo.tokens_per_block``).
     ``0`` means the engine did not report it; the group is then treated as
     uncompressed (``compress_ratio == 1``)."""
-    slots_per_chunk: int = 0
-    """Number of *physical* slots in one LMCache chunk for this group
-    (= ``lmcache_tokens_per_chunk // tokens_per_block * slots_per_block``).
-    This is what the block-level transfer kernel must be told, not the
-    logical ``lmcache_tokens_per_chunk`` which counts vLLM tokens. ``0``
-    means the field has not been populated yet; ``GPUCacheContext``
-    fills it in after construction once ``lmcache_tokens_per_chunk``
-    is known."""
     engine_group_idx: int = 0
     """Engine group index (paged-block address space). 0 for non-hybrid."""
+    sw_size_tokens: int = -1
+    """Sliding window size in logical tokens for this group's layers.
+    ``-1`` means the layers are not sliding-window attention."""
 
     def __repr__(self) -> str:
         if not self.layer_indices:
@@ -206,8 +201,8 @@ def __repr__(self) -> str:
             f"dtype={self.dtype}, "
             f"tokens_per_block={self.tokens_per_block}, "
             f"slots_per_block={self.slots_per_block}, "
-            f"slots_per_chunk={self.slots_per_chunk}, "
-            f"engine_group_idx={self.engine_group_idx})"
+            f"engine_group_idx={self.engine_group_idx}, "
+            f"sw_size_tokens={self.sw_size_tokens})"
         )
 
     @property
@@ -227,6 +222,12 @@ def slots_per_block(self) -> int:
         dimension, ``shape_desc.bs``)."""
         return self.shape_desc.bs
 
+    def calculate_slots(self, num_tokens: int) -> int:
+        """Calculate the number of slots for the specified number of
+        tokens. Assuming the number of tokens are already aligned.
+        """
+        return num_tokens * self.slots_per_block // self.tokens_per_block
+
 
 KVLayerGroupInfo = KernelGroupInfo  # Alias for compatibility
 
@@ -247,8 +248,10 @@ class ObjectGroupInfo:
     """Indices of the kernel groups belonging to this object group, in the
     order they should be laid out in memory."""
 
-    # NOTE: will add fields to indicate the "kv cache type" of this
-    # object group in the follow-up PRs
+    sw_size_chunks: int = -1
+    """Cross-chunk sliding window size in LMCache chunks shared by every
+    kernel group in this object group. ``-1`` means the kernel groups are
+    not sliding-window attention."""
 
 
 class KVLayerGroupsManager:
@@ -297,8 +300,8 @@ def __init__(
             gpu_kv_format: Format returned by
                 :func:`normalize_kv_and_discover_format`.
             num_blocks: Number of paged blocks in the device KV cache.
-            engine_group_infos: Engine KV cache group metadata, including
-                the engine group ids, and the sliding window information.
+            engine_group_infos: Engine KV cache group metadata, one info per
+                kernel group in kernel-group order, or empty.
             lmcache_logical_chunk_size: Tokens per LMCache chunk
         """
         # Import here to break a circular import via
@@ -323,18 +326,20 @@ def __init__(
             engine_group_infos, num_layers
         )
 
-        # Engine-reported logical tokens per paged chunk, keyed by engine
-        # group id. 0 / missing means the engine did not report it.
-        engine_tokens_per_block: dict[int, int] = {
-            info.engine_group_id: info.tokens_per_block
-            for info in engine_group_infos
-            if info.tokens_per_block > 0
-        }
-
         groups_by_identity = group_layers_by_identity(
             kv_caches, gpu_kv_format, num_layers, per_layer_engine_group_idx
         )
 
+        # Engine group infos are produced by the same group_layers_by_identity
+        # bucketing on the engine side, so they correspond 1:1, in order, to
+        # the kernel groups built below.
+        if engine_group_infos and len(engine_group_infos) != len(groups_by_identity):
+            raise ValueError(
+                f"Got {len(engine_group_infos)} engine group infos for "
+                f"{len(groups_by_identity)} kernel groups; expecting one "
+                "engine group info per kernel group"
+            )
+
         # Emit groups in order of their first-appearing layer so that group
         # indices remain deterministic across runs.
         for group_idx, ((_, _, _, bs, engine_group_idx, dt), indices) in enumerate(
@@ -356,19 +361,30 @@ def __init__(
                 block_stride_elems=block_stride_elems,
             )
 
+            info = engine_group_infos[group_idx] if engine_group_infos else None
+            if info is not None and tuple(indices) != tuple(info.layer_indices):
+                raise ValueError(
+                    f"group {group_idx}: engine group info covers layers "
+                    f"{info.layer_indices}, but the kernel group covers "
+                    f"layers {indices}"
+                )
+
             # tokens_per_block comes from the engine's KV cache spec; when
             # absent, fall back to the physical slot count so the group is
             # treated as non-compressed (compress_ratio == 1).
-            tokens_per_block = engine_tokens_per_block.get(engine_group_idx, bs)
+            tokens_per_block = (
+                info.tokens_per_block
+                if info is not None and info.tokens_per_block > 0
+                else bs
+            )
+            sw_size_tokens = info.sw_size_tokens if info is not None else -1
 
-            # TODO (ApostaC): the code here is not very good.
-            # Conceptually, KV Layer Group should not be aware of lmcache logical
-            # chunk size at all.
-            slots_per_chunk = self._derive_slots_per_chunk(
-                group_idx=group_idx,
+            self._validate_block_chunk_size_config(
+                group_idx,
                 slots_per_block=bs,
                 tokens_per_block=tokens_per_block,
                 lmcache_tokens_per_chunk=lmcache_tokens_per_chunk,
+                sw_size_tokens=sw_size_tokens,
             )
 
             self._kernel_groups.append(
@@ -377,12 +393,12 @@ def __init__(
                     shape_desc=shape_desc,
                     dtype=dt,
                     tokens_per_block=tokens_per_block,
-                    slots_per_chunk=slots_per_chunk,
                     engine_group_idx=engine_group_idx,
+                    sw_size_tokens=sw_size_tokens,
                 )
             )
 
-        self._lmcache_chunk_size = lmcache_tokens_per_chunk
+        self._lmcache_tokens_per_chunk = lmcache_tokens_per_chunk
 
         logger.info(
             "KV layer groups: ---\n%s\n---",
@@ -441,16 +457,32 @@ def get_shape_desc(self, kernel_group_idx: int) -> "lmc_ops.PageBufferShapeDesc"
         """
         return self._kernel_groups[kernel_group_idx].shape_desc
 
+    @lmcache_deprecate("It does not have hybrid model support")
     def get_slots_per_chunk(self, kernel_group_idx: int) -> int:
-        """Return the per-chunk *physical* slot count for *kernel_group_idx*.
+        """Return the per-chunk slot count for *kernel_group_idx*.
 
         Args:
             kernel_group_idx: 0-based kernel group index.
 
-        Raises:
-            IndexError: If *kernel_group_idx* is out of range.
+        Note:
+            This is a deprecated function because it does not have
+            hybrid model support
         """
-        return self._kernel_groups[kernel_group_idx].slots_per_chunk
+        group = self._kernel_groups[kernel_group_idx]
+        return group.calculate_slots(self._lmcache_tokens_per_chunk)
+
+    def get_slots_per_chunk_in_sw(self, kernel_group_idx: int) -> int:
+        """Return the per-chunk *transfer* slot count for *kernel_group_idx*.
+
+        For sub-chunk sliding window groups, the transfer slots is smaller
+        than the physical slots in a chunk.
+
+        Args:
+            kernel_group_idx: 0-based kernel group index.
+        """
+        group = self._kernel_groups[kernel_group_idx]
+        sw_size = self.get_subchunk_sw_size_tokens(kernel_group_idx)
+        return group.calculate_slots(sw_size)
 
     def get_subchunk_sw_size_tokens(self, kernel_group_idx: int) -> int:
         """Return the sub-chunk sliding window size of a given kernel group.
@@ -467,9 +499,10 @@ def get_subchunk_sw_size_tokens(self, kernel_group_idx: int) -> int:
             chunk size for non-slding-window models or big-sliding-
             window models.
         """
-        # TODO(ApostaC): now here's the 'dummy' implementation.
-        # Need to wire the real sw size from the kernel group info once it's available
-        return self._lmcache_chunk_size
+        sw_size_tokens = self._kernel_groups[kernel_group_idx].sw_size_tokens
+        if sw_size_tokens == -1 or sw_size_tokens >= self._lmcache_tokens_per_chunk:
+            return self._lmcache_tokens_per_chunk
+        return sw_size_tokens
 
     def get_sw_size_chunks(self, object_group_idx: int) -> int:
         """Return the sliding window size of a given kernel group,
@@ -490,8 +523,11 @@ def get_sw_size_chunks(self, object_group_idx: int) -> int:
             they can be retrieved at the same time from the same object.
             For small sliding window (subchunk window) models, it will return 1.
         """
-        # TODO(ApostaC): now here's the 'dummy' implementation.
-        # Need to wire the real sw size from the object group info once it's available
+        # NOTE(ApostaC): object-level skipping is not enabled yet, so we
+        # always return -1 here instead of reading the object group's
+        # ``sw_size_chunks``. Switch to
+        # ``self._object_groups[object_group_idx].sw_size_chunks`` once the
+        # lookup/registry side supports multiple object groups.
         return -1
 
     def calculate_num_blocks(self, kernel_group_idx: int, num_tokens: int) -> int:
@@ -528,39 +564,53 @@ def _detect_object_groups(
         Returns:
             A list of ObjectGroupInfo instances representing the detected object groups.
         """
-        # TODO: add the real object group detection logic based on
-        # the attention type metadata in the engine group infos once it's
-        # available.
-        # Now, we are using a single object group, which means
-        # all kernel groups' KV caches will be stored in the same memory object.
+        # TODO(ApostaC): The following commented code groups the object groups based
+        # on the sliding window information. We need to re-enable this after the lookup
+        # logic for sliding window has been implemented.
+        # For now, we put all the kernel groups into one object group.
+
+        # chunk_size = self._lmcache_tokens_per_chunk
+        # groups_by_sw_size: dict[int, list[int]] = defaultdict(list)
+        # for kernel_group_idx, group in enumerate(self._kernel_groups):
+        #    if group.sw_size_tokens == -1:
+        #        sw_size_chunks = -1
+        #    else:
+        #        sw_size_chunks = (
+        #            group.sw_size_tokens + chunk_size - 1
+        #        ) // chunk_size
+        #    groups_by_sw_size[sw_size_chunks].append(kernel_group_idx)
+        # return [
+        #    ObjectGroupInfo(
+        #        kernel_group_indices=kernel_group_indices,
+        #        sw_size_chunks=sw_size_chunks,
+        #    )
+        #    for sw_size_chunks, kernel_group_indices in sorted(
+        #        groups_by_sw_size.items(), key=lambda kv: kv[1][0]
+        #    )
+        # ]
         return [
             ObjectGroupInfo(kernel_group_indices=list(range(len(self._kernel_groups))))
         ]
 
     @staticmethod
-    def _derive_slots_per_chunk(
+    def _validate_block_chunk_size_config(
         group_idx: int,
         slots_per_block: int,
         tokens_per_block: int,
         lmcache_tokens_per_chunk: int,
-    ) -> int:
-        """Resolve ``slots_per_chunk`` (physical slots per LMCache chunk).
-
-        Derived directly from the three ground-truth quantities: the LMCache
-        chunk size ``lmcache_tokens_per_chunk`` (logical tokens), the group's
-        logical ``tokens_per_block`` and its physical ``slots_per_block``. One
-        LMCache chunk spans ``lmcache_tokens_per_chunk // tokens_per_block``
-        paged blocks, each holding ``slots_per_block`` physical slots, so
-        ``slots_per_chunk = lmcache_tokens_per_chunk // tokens_per_block
-        * slots_per_block``. This is the per-chunk physical slot count fed to
-        the block-level transfer kernel.
+        sw_size_tokens: int = -1,
+    ) -> None:
+        """Validate the chunk size configuration against the slot and
+        tokens block detected from the serving engine.
 
         Raises:
-            ValueError: If ``tokens_per_block`` is not a whole multiple of
-                ``slots_per_block`` (each physical slot must pack a whole number
-                of logical tokens), or if ``lmcache_tokens_per_chunk`` is not a
-                whole multiple of ``tokens_per_block`` (an LMCache chunk must
-                align to a whole number of the group's paged blocks).
+            ValueError: If one of the following conditions is met:
+                - ``tokens_per_block`` is not a whole multiple of
+                  ``slots_per_block``
+                - ``lmcache_tokens_per_chunk`` is not a whole multiple of
+                  ``tokens_per_block``
+                - a sub-chunk sliding window is not a whole multiple of
+                  ``tokens_per_block``
         """
         if tokens_per_block % slots_per_block != 0:
             raise ValueError(
@@ -573,18 +623,22 @@ def _derive_slots_per_chunk(
                 f"{lmcache_tokens_per_chunk} must be a multiple of "
                 f"tokens_per_block {tokens_per_block}"
             )
-        blocks_per_chunk = lmcache_tokens_per_chunk // tokens_per_block
-        slots_per_chunk = blocks_per_chunk * slots_per_block
+        if (
+            0 < sw_size_tokens < lmcache_tokens_per_chunk
+            and sw_size_tokens % tokens_per_block != 0
+        ):
+            raise ValueError(
+                f"group {group_idx}: sub-chunk sliding window size "
+                f"{sw_size_tokens} must be a multiple of tokens_per_block "
+                f"{tokens_per_block}"
+            )
         if slots_per_block != tokens_per_block:
             logger.info(
-                "group %d: compressed (tokens_per_block=%d, slots_per_block=%d "
-                "-> slots_per_chunk=%d)",
+                "group %d: compressed (tokens_per_block=%d, slots_per_block=%d)",
                 group_idx,
                 tokens_per_block,
                 slots_per_block,
-                slots_per_chunk,
             )
-        return slots_per_chunk
 
 
 # ------------------------------------------------------------------ #
diff --git a/lmcache/v1/multiprocess/gpu_context.py b/lmcache/v1/multiprocess/gpu_context.py
index e21494f24d..c5aba69a61 100644
--- a/lmcache/v1/multiprocess/gpu_context.py
+++ b/lmcache/v1/multiprocess/gpu_context.py
@@ -117,7 +117,7 @@ def __init__(
         max_batch_size: int = 4,
     ) -> None:
         self._kv_groups_manager = kv_layer_groups_manager
-        self._lmcache_chunk_size = lmcache_tokens_per_chunk
+        self._lmcache_tokens_per_chunk = lmcache_tokens_per_chunk
         self._max_batch_size = max_batch_size
 
         self._temp_buffer = torch.empty(
@@ -167,7 +167,7 @@ def __init__(
         self._shape_cache_kernel_group: dict[int, tuple[torch.Size, torch.dtype]] = {}
         for kernel_group_idx in range(self._kv_groups_manager.num_kernel_groups):
             shape = self._get_shape_for_kernel_group(
-                self._lmcache_chunk_size, kernel_group_idx
+                self._lmcache_tokens_per_chunk, kernel_group_idx
             )
             group = self._kv_groups_manager.kernel_groups[kernel_group_idx]
             dtype = group.dtype
@@ -247,7 +247,8 @@ def get_kernel_group_shape_dtype(
         MemoryLayoutDesc
 
         Args:
-            num_tokens: Number of tokens
+            num_tokens: Number of tokens. Must be a whole number of lmcache
+                chunk size.
             kernel_group_idx: Index of the kernel group.
 
         Returns:
@@ -263,7 +264,7 @@ def get_cache_size_per_token(self) -> int:
         """
         Returns the cache size per token (in bytes), summed across all kernel groups.
         """
-        return self._get_size_for_single_batch() // self._lmcache_chunk_size
+        return self._get_size_for_single_batch() // self._lmcache_tokens_per_chunk
 
     # Helper functions
     def _get_shape_for_kernel_group(
@@ -280,17 +281,26 @@ def _get_shape_for_kernel_group(
 
         Returns:
             The shape of the temp GPU buffer for the given kernel group index.
+
+        Raises:
+            ValueError: If ``num_tokens`` is not a whole number of LMCache
+                chunks.
         """
+        if num_tokens % self._lmcache_tokens_per_chunk != 0:
+            raise ValueError(
+                f"num_tokens ({num_tokens}) must be a multiple of "
+                f"lmcache_tokens_per_chunk ({self._lmcache_tokens_per_chunk})"
+            )
+
         group = self._kv_groups_manager.kernel_groups[kernel_group_idx]
-        compress_ratio = group.tokens_per_block // group.slots_per_block
         sd = group.shape_desc
 
-        if num_tokens % compress_ratio != 0:
-            raise ValueError(
-                f"logical_num_tokens ({num_tokens}) is not a multiple of "
-                f"compress_ratio ({compress_ratio}) for group {kernel_group_idx}"
-            )
-        num_slots = num_tokens // compress_ratio
+        num_chunks = num_tokens // self._lmcache_tokens_per_chunk
+        num_slots = (
+            self._kv_groups_manager.get_slots_per_chunk_in_sw(kernel_group_idx)
+            * num_chunks
+        )
+
         return torch.Size(
             (sd.kv_size, group.num_layers, num_slots, group.hidden_dim_size)
         )
@@ -300,12 +310,12 @@ def _get_size_for_kernel_group(self, kernel_group_idx: int) -> int:
         Returns the size in bytes of the temp GPU buffer for the given kernel group
         index
 
-        **Assumes the size is lmcache_chunk_size
+        **Assumes the size is lmcache_tokens_per_chunk
 
         Will only be called during initialization
         """
         shape = self._get_shape_for_kernel_group(
-            self._lmcache_chunk_size, kernel_group_idx
+            self._lmcache_tokens_per_chunk, kernel_group_idx
         )
         kernel_group = self._kv_groups_manager.kernel_groups[kernel_group_idx]
         dtype = kernel_group.dtype
@@ -315,7 +325,7 @@ def _get_size_for_object_group(self, object_group_idx: int) -> int:
         """
         Returns the size in bytes of the temp GPU buffer for the given object group
 
-        **Assumes the size is lmcache_chunk_size
+        **Assumes the size is lmcache_tokens_per_chunk
 
         Will only be called during initialization
         """
@@ -330,7 +340,7 @@ def _get_size_for_single_batch(self) -> int:
         Returns the size in bytes of the temp GPU buffer for a single batch
         (i.e., a single chunk)
 
-        **Assumes the size is lmcache_chunk_size
+        **Assumes the size is lmcache_tokens_per_chunk
         """
         return sum(
             self._get_size_for_object_group(object_group_idx)
@@ -485,11 +495,21 @@ def get_shape_desc(self, kernel_group_idx: int) -> "lmc_ops.PageBufferShapeDesc"
         """Returns the PageBufferShapeDesc for the given KV layer group."""
         return self.kv_layer_groups_manager_.get_shape_desc(kernel_group_idx)
 
-    def get_slots_per_chunk(self, kernel_group_idx: int) -> int:
-        """Returns the per-chunk physical slot count for the given kernel
-        group.
+    def get_slots_per_chunk_in_sw(self, kernel_group_idx: int) -> int:
+        """Returns the number of slots per lmcache chunk when doing
+        D/H transfer.
+
+        This function will take into account that for subchunk sliding window
+        case, the transfer slots will be smaller than the "actual" slots per
+        chunk (which is calculated based on the lmcache chunk size).
+
+        Args:
+            kernel_group_idx: Index of the kernel group.
+
+        Returns:
+            The number of used slots per lmcache chunk when doing D/H transfer.
         """
-        return self.kv_layer_groups_manager_.get_slots_per_chunk(kernel_group_idx)
+        return self.kv_layer_groups_manager.get_slots_per_chunk_in_sw(kernel_group_idx)
 
     def get_kernel_group_kv_pointers(self, kernel_group_idx: int) -> torch.Tensor:
         """Returns the pre-computed GPU tensor of KV cache pointers for the
@@ -546,7 +566,8 @@ def get_kernel_group_shape_dtype(
         Will be exported by GPUCacheContext and used to construct the MemoryLayoutDesc
 
         Args:
-            num_tokens: Number of tokens
+            num_tokens: Number of tokens. Must be a whole number of lmcache
+                chunk size.
             kernel_group_idx: Index of the kernel group.
 
         Returns:
@@ -728,7 +749,7 @@ class PlainGPUCacheContext:
     A plain GPU cache context that have a single contiguous 2LTD buffer
     """
 
-    def __init__(self, kv_caches: KVCache, lmcache_chunk_size: int = 256):
+    def __init__(self, kv_caches: KVCache, lmcache_tokens_per_chunk: int = 256):
         assert len(kv_caches) == 1, (
             "PlainGPUCacheContext only supports a single KV cache tensor"
         )
@@ -746,7 +767,7 @@ def __init__(self, kv_caches: KVCache, lmcache_chunk_size: int = 256):
         self._hidden_dim_size = shape[3]
 
         # Temporary buffer
-        tmp_buffer_shape = self.get_kv_buffer_shape(lmcache_chunk_size)
+        tmp_buffer_shape = self.get_kv_buffer_shape(lmcache_tokens_per_chunk)
         self._tmp_gpu_buffer = torch.empty(
             tmp_buffer_shape, dtype=self.dtype, device=self.device
         )
diff --git a/lmcache/v1/multiprocess/group_view.py b/lmcache/v1/multiprocess/group_view.py
index 46d45f040b..4b7742a2e4 100644
--- a/lmcache/v1/multiprocess/group_view.py
+++ b/lmcache/v1/multiprocess/group_view.py
@@ -49,6 +49,10 @@ class EngineGroupInfo(msgspec.Struct, frozen=True):
     from the registered tensors (i.e. the group is treated as
     uncompressed)."""
 
+    sw_size_tokens: int = -1
+    """Sliding window size in tokens for the layers of this group.
+    ``-1`` means the layers are not sliding-window attention."""
+
 
 def num_engine_groups(groups: Sequence[EngineGroupInfo]) -> int:
     """Return the number of engine groups (block-id lists per transfer request).
diff --git a/lmcache/v1/multiprocess/modules/gpu_transfer.py b/lmcache/v1/multiprocess/modules/gpu_transfer.py
index 1d95038e4d..47b1963170 100644
--- a/lmcache/v1/multiprocess/modules/gpu_transfer.py
+++ b/lmcache/v1/multiprocess/modules/gpu_transfer.py
@@ -348,7 +348,7 @@ def transfer_kv_per_object_group(
             group_kv_pointers = cache_context.get_kernel_group_kv_pointers(
                 kernel_group_id
             )
-            group_lmcache_chunk_size = cache_context.get_slots_per_chunk(
+            group_lmcache_chunk_size = cache_context.get_slots_per_chunk_in_sw(
                 kernel_group_id
             )
             tmp_gpu_buffers_batched = [
diff --git a/lmcache/v1/platform/cpu/cache_context.py b/lmcache/v1/platform/cpu/cache_context.py
index b26e2bda9f..da286a3743 100644
--- a/lmcache/v1/platform/cpu/cache_context.py
+++ b/lmcache/v1/platform/cpu/cache_context.py
@@ -336,9 +336,18 @@ def get_shape_desc(self, group_idx: int) -> "lmc_ops.PageBufferShapeDesc":
         """Returns the PageBufferShapeDesc for the given group."""
         return self.kv_layer_groups_manager_.get_shape_desc(group_idx)
 
-    def get_slots_per_chunk(self, group_idx: int) -> int:
-        """Returns the per-chunk physical slot count for the group."""
-        return self.kv_layer_groups_manager_.get_slots_per_chunk(group_idx)
+    def get_slots_per_chunk_in_sw(self, kernel_group_idx: int) -> int:
+        """Returns the number of slots per lmcache chunk when doing D/H transfer.
+
+        Mirrors :meth:`GPUCacheContext.get_slots_per_chunk_in_sw`.
+
+        Args:
+            kernel_group_idx: Index of the kernel group.
+
+        Returns:
+            The number of used slots per lmcache chunk when doing D/H transfer.
+        """
+        return self.kv_layer_groups_manager_.get_slots_per_chunk_in_sw(kernel_group_idx)
 
     def blocks_for_tokens(self, num_logical_tokens: int, group_idx: int) -> int:
         """Number of blocks that span *num_logical_tokens* for a group.
diff --git a/tests/v1/multiprocess/test_gpu_context.py b/tests/v1/multiprocess/test_gpu_context.py
index 9123a418b7..0ca3ebd56f 100644
--- a/tests/v1/multiprocess/test_gpu_context.py
+++ b/tests/v1/multiprocess/test_gpu_context.py
@@ -92,6 +92,7 @@ def _build_manager(
     num_blocks: int = 4,
     gpu_kv_format: "lmc_ops.GPUKVFormat" = lmc_ops.GPUKVFormat.NL_X_TWO_NB_BS_NH_HS,
     engine_group_infos: Sequence[EngineGroupInfo] = (),
+    lmcache_tokens_per_chunk: int = 256,
 ) -> KVLayerGroupsManager:
     """Build a real :class:`KVLayerGroupsManager` from synthetic tensors."""
     return KVLayerGroupsManager(
@@ -99,6 +100,7 @@ def _build_manager(
         gpu_kv_format=gpu_kv_format,
         num_blocks=num_blocks,
         engine_group_infos=engine_group_infos,
+        lmcache_tokens_per_chunk=lmcache_tokens_per_chunk,
     )
 
 
@@ -112,7 +114,10 @@ def _make_temp_buffer(
     """Build a ``_TempGPUBuffer`` backed by a real manager."""
     tensors = _make_kv_tensors(specs, num_blocks=num_blocks)
     manager = _build_manager(
-        tensors, num_blocks=num_blocks, engine_group_infos=engine_group_infos
+        tensors,
+        num_blocks=num_blocks,
+        engine_group_infos=engine_group_infos,
+        lmcache_tokens_per_chunk=chunk_size,
     )
     return _TempGPUBuffer(
         kv_layer_groups_manager=manager,
@@ -318,8 +323,8 @@ def test_contains_kernel_group_data(self) -> None:
         """Bytes written through kernel-group views are visible through the
         object-group flat view at matching offsets."""
         tensors = _make_kv_tensors(_MULTI_GROUP)
-        manager = _build_manager(tensors)
         chunk_size = 64
+        manager = _build_manager(tensors, lmcache_tokens_per_chunk=chunk_size)
         buf = _TempGPUBuffer(manager, chunk_size, _DEVICE)
         obj_group = manager.object_groups[0]
 
@@ -353,10 +358,12 @@ def test_object_groups_non_overlapping(self) -> None:
 
 class TestTempGPUBufferShapeDtype:
     def test_shape_scales_with_num_tokens(self) -> None:
+        # num_tokens must be a whole number of chunks; the shape scales
+        # linearly with the chunk count.
         tensors = _make_kv_tensors(_SINGLE_GROUP)
         manager = _build_manager(tensors)
         buf = _TempGPUBuffer(manager, 256, _DEVICE)
-        for num_tokens in (16, 128, 256):
+        for num_tokens in (256, 512, 768):
             shape, dtype = buf.get_kernel_group_shape_dtype(num_tokens, 0)
             assert shape == _expected_kernel_group_shape(manager, num_tokens, 0)
             assert dtype == manager.kernel_groups[0].dtype
@@ -374,14 +381,14 @@ def test_shape_compressed_group(self) -> None:
         shape, _ = buf.get_kernel_group_shape_dtype(256, 0)
         assert shape[2] == 256 // 2
 
-    def test_not_divisible_by_compress_ratio_raises(self) -> None:
+    def test_not_whole_chunks_raises(self) -> None:
         tensors = _make_kv_tensors([_GroupSpec(num_layers=2, block_size=8)])
         manager = _build_manager(
             tensors,
             engine_group_infos=[EngineGroupInfo(0, (0, 1), tokens_per_block=16)],
         )
         buf = _TempGPUBuffer(manager, 256, _DEVICE)
-        with pytest.raises(ValueError, match="not a multiple of"):
+        with pytest.raises(ValueError, match="must be a multiple of"):
             buf.get_kernel_group_shape_dtype(255, 0)
 
 
@@ -447,8 +454,8 @@ def test_get_temp_object_group_buffer(self) -> None:
     def test_get_kernel_group_shape_dtype(self) -> None:
         ctx = _make_context(_SINGLE_GROUP)
         manager = ctx.kv_layer_groups_manager
-        shape, dtype = ctx.get_kernel_group_shape_dtype(128, 0)
-        assert shape == _expected_kernel_group_shape(manager, 128, 0)
+        shape, dtype = ctx.get_kernel_group_shape_dtype(256, 0)
+        assert shape == _expected_kernel_group_shape(manager, 256, 0)
         assert dtype == manager.kernel_groups[0].dtype
 
 
diff --git a/tests/v1/test_kv_cache_groups.py b/tests/v1/test_kv_cache_groups.py
index d7fb8e5499..b7a0f51a35 100644
--- a/tests/v1/test_kv_cache_groups.py
+++ b/tests/v1/test_kv_cache_groups.py
@@ -44,11 +44,22 @@ def test_engine_group_infos_expand_engine_block_ids():
     ]
 
 
+def test_engine_group_info_old_payload_defaults_sw_size():
+    """A pre-sw_size_tokens msgspec payload decodes with the -1 default."""
+    old_payload = {"engine_group_id": 0, "layer_indices": (0, 1)}
+
+    decoded = msgspec.msgpack.decode(
+        msgspec.msgpack.encode(old_payload), type=EngineGroupInfo
+    )
+
+    assert decoded.sw_size_tokens == -1
+
+
 def test_engine_group_infos_msgspec_round_trip():
     """The groups encode/decode losslessly via msgspec (the IPC path)."""
     groups = [
         EngineGroupInfo(0, (0, 2)),
-        EngineGroupInfo(1, (1, 3)),
+        EngineGroupInfo(1, (1, 3), sw_size_tokens=128),
     ]
 
     decoded = msgspec.msgpack.decode(
diff --git a/tests/v1/test_kv_layer_groups_manager.py b/tests/v1/test_kv_layer_groups_manager.py
index c913aa8de5..38ce753d85 100644
--- a/tests/v1/test_kv_layer_groups_manager.py
+++ b/tests/v1/test_kv_layer_groups_manager.py
@@ -116,6 +116,21 @@ def test_build_rejects_bad_engine_group_infos(self):
                 engine_group_infos=[EngineGroupInfo(0, (2,))],
             )
 
+    def test_build_rejects_coarse_engine_group_infos(self):
+        # One info covering two layers that split into two kernel groups
+        # (different num_heads) violates the one-info-per-kernel-group
+        # contract.
+        tensors = [
+            torch.randn(2, 32, 256, 8, 64, dtype=torch.float16),
+            torch.randn(2, 32, 256, 16, 64, dtype=torch.float16),
+        ]
+        with pytest.raises(ValueError, match="engine group info"):
+            _build_manager(
+                tensors,
+                num_blocks=32,
+                engine_group_infos=[EngineGroupInfo(0, (0, 1))],
+            )
+
     def test_build_different_shapes(self):
         tensors = [
             torch.randn(2, 32, 256, 8, 64, dtype=torch.float16),
@@ -278,43 +293,49 @@ def test_empty_groups_raises(self):
             format_kvcache_shape_spec([])
 
 
-class TestDeriveSlotsPerChunk:
-    """``slots_per_chunk`` derivation from the two block-size sources:
-    ``tokens_per_block`` (engine KV cache spec, known at initialization) and
-    ``slots_per_block`` (registered tensor batch dimension), with
-    ``compress_ratio = tokens_per_block // slots_per_block`` (e.g. DeepSeek
-    V4 compression where ``slots < tokens``) and divisibility enforced.
+class TestValidateBlockChunkSizeConfig:
+    """Construction-time validation of the block/chunk size configuration:
+    ``tokens_per_block`` (engine KV cache spec) must pack whole
+    ``slots_per_block`` (registered tensor batch dimension), an LMCache chunk
+    must span whole paged blocks, and a sub-chunk sliding window must cover
+    whole paged blocks.
     """
 
-    def _derive(self, slots: int, tokens: int, chunk: int = 256) -> int:
-        return KVLayerGroupsManager._derive_slots_per_chunk(
+    def _validate(
+        self, slots: int, tokens: int, chunk: int = 256, sw: int = -1
+    ) -> None:
+        KVLayerGroupsManager._validate_block_chunk_size_config(
             group_idx=0,
             slots_per_block=slots,
             tokens_per_block=tokens,
             lmcache_tokens_per_chunk=chunk,
+            sw_size_tokens=sw,
         )
 
-    def test_one_to_one(self):
-        assert self._derive(slots=16, tokens=16) == 256
-
-    def test_compression_slots_lt_tokens(self):
+    def test_valid_configs_pass(self):
+        self._validate(slots=16, tokens=16)
         # slots=8 packs 2 logical tokens per physical slot (DeepSeek V4 style).
-        assert self._derive(slots=8, tokens=16) == 128
-
-    def test_dsv4_declared_ratios(self):
-        # DeepSeek-V4-Flash MLA groups declare 256 tokens per paged chunk
-        # over 64 (compress_ratio 4) or 2 (compress_ratio 128) slots.
-        assert self._derive(slots=64, tokens=256) == 64
-        assert self._derive(slots=2, tokens=256) == 2
+        self._validate(slots=8, tokens=16)
+        # Sub-chunk window aligned to whole paged blocks.
+        self._validate(slots=16, tokens=16, sw=64)
+        # Big window (>= chunk) needs no sub-chunk alignment.
+        self._validate(slots=16, tokens=16, sw=1000)
 
     def test_not_divisible_raises(self):
         # Divisibility is enforced loudly (e.g. slots=6 does not divide 16).
         with pytest.raises(ValueError, match="must be a multiple of"):
-            self._derive(slots=6, tokens=16)
+            self._validate(slots=6, tokens=16)
 
     def test_chunk_not_divisible_by_ratio_raises(self):
         with pytest.raises(ValueError, match="lmcache_tokens_per_chunk"):
-            self._derive(slots=1, tokens=96, chunk=256)
+            self._validate(slots=1, tokens=96, chunk=256)
+
+    def test_subchunk_window_not_block_aligned_raises(self):
+        # A sub-chunk window of 100 tokens does not cover whole 16-token
+        # blocks, so the transfer slot count would disagree with the kept
+        # block IDs.
+        with pytest.raises(ValueError, match="sliding window"):
+            self._validate(slots=16, tokens=16, sw=100)
 
 
 class TestKernelGroupIdentity:
@@ -375,6 +396,83 @@ def test_single_object_group_covers_all_kernel_groups(self):
         obj = manager.object_groups[0]
         assert isinstance(obj, ObjectGroupInfo)
         assert obj.kernel_group_indices == list(range(manager.num_kernel_groups))
+        assert obj.sw_size_chunks == -1
+        assert manager.get_sw_size_chunks(0) == -1
+
+    def test_kernel_groups_carry_sw_size_tokens(self):
+        # Same-shape layers split by engine group; the sliding-window group's
+        # window size lands on its kernel group, the other stays -1.
+        tensors = [torch.randn(2, 32, 32, 8, 64, dtype=torch.float16) for _ in range(2)]
+        manager = _build_manager(
+            tensors,
+            num_blocks=32,
+            engine_group_infos=[
+                EngineGroupInfo(0, (0,)),
+                EngineGroupInfo(1, (1,), sw_size_tokens=64),
+            ],
+        )
+        assert [g.sw_size_tokens for g in manager.kernel_groups] == [-1, 64]
+
+    def test_subchunk_window_not_block_aligned_rejected(self):
+        # A 64-token window over 256-slot blocks does not cover whole blocks;
+        # construction fails loudly instead of mistransferring.
+        tensors = [torch.randn(2, 32, 256, 8, 64, dtype=torch.float16)]
+        with pytest.raises(ValueError, match="sliding window"):
+            _build_manager(
+                tensors,
+                num_blocks=32,
+                engine_group_infos=[EngineGroupInfo(0, (0,), sw_size_tokens=64)],
+            )
+
+    def test_subchunk_sw_size_tokens(self):
+        # lmcache chunk size is 256 (default), 32-slot blocks. Sub-chunk
+        # window (64) is returned as-is; non-SW (-1) and big-SW (512) return
+        # the chunk size.
+        tensors = [
+            torch.randn(2, 32, 32, 8, 64, dtype=torch.float16),
+            torch.randn(2, 32, 32, 16, 64, dtype=torch.float16),
+            torch.randn(2, 32, 32, 32, 64, dtype=torch.float16),
+        ]
+        manager = _build_manager(
+            tensors,
+            num_blocks=32,
+            engine_group_infos=[
+                EngineGroupInfo(0, (0,)),
+                EngineGroupInfo(0, (1,), sw_size_tokens=64),
+                EngineGroupInfo(0, (2,), sw_size_tokens=512),
+            ],
+        )
+        assert manager.get_subchunk_sw_size_tokens(0) == 256
+        assert manager.get_subchunk_sw_size_tokens(1) == 64
+        assert manager.get_subchunk_sw_size_tokens(2) == 256
+        # Transfer slots follow the sub-chunk window (ratio 1 here).
+        assert manager.get_slots_per_chunk_in_sw(0) == 256
+        assert manager.get_slots_per_chunk_in_sw(1) == 64
+        assert manager.get_slots_per_chunk_in_sw(2) == 256
+
+    def test_mixed_sw_kernel_groups_share_single_object_group(self):
+        # Object-level bucketing by sliding window size is not enabled yet:
+        # kernel groups with differing window sizes still land in ONE object
+        # group and get_sw_size_chunks stays -1.
+        tensors = [
+            torch.randn(2, 32, 32, 8, 64, dtype=torch.float16),
+            torch.randn(2, 32, 32, 16, 64, dtype=torch.float16),
+            torch.randn(2, 32, 32, 32, 64, dtype=torch.float16),
+        ]
+        manager = _build_manager(
+            tensors,
+            num_blocks=32,
+            engine_group_infos=[
+                EngineGroupInfo(0, (0,)),
+                EngineGroupInfo(0, (1,), sw_size_tokens=64),
+                EngineGroupInfo(0, (2,), sw_size_tokens=512),
+            ],
+        )
+        assert manager.num_object_groups == 1
+        obj = manager.object_groups[0]
+        assert obj.kernel_group_indices == list(range(manager.num_kernel_groups))
+        assert obj.sw_size_chunks == -1
+        assert manager.get_sw_size_chunks(0) == -1
 
     def test_empty_manager_has_no_groups(self):
         # Empty registration returns early in __init__; both group lists must
@@ -420,7 +518,8 @@ def test_dsv4_flash_style_mixed_compression(self):
             tensors,
             num_blocks=8,
             engine_group_infos=[
-                EngineGroupInfo(0, (0, 1), tokens_per_block=256),
+                EngineGroupInfo(0, (0,), tokens_per_block=256),
+                EngineGroupInfo(0, (1,), tokens_per_block=256),
                 EngineGroupInfo(1, (2,), tokens_per_block=64),
                 EngineGroupInfo(2, (3,), tokens_per_block=4),
             ],
@@ -431,8 +530,8 @@ def test_dsv4_flash_style_mixed_compression(self):
         assert by_layer[2].tokens_per_block // by_layer[2].slots_per_block == 1
         assert by_layer[3].tokens_per_block // by_layer[3].slots_per_block == 1
         # 256-token LMCache chunk -> 2 physical slots in the ratio-128 group.
-        assert by_layer[1].slots_per_chunk == 2
-        assert by_layer[0].slots_per_chunk == 64
+        assert by_layer[1].calculate_slots(256) == 2
+        assert by_layer[0].calculate_slots(256) == 64
 
     def test_calculate_num_blocks_compressed(self):
         # slots_per_block=8 (tensor), tokens_per_block=16 (engine spec) ->
diff --git a/tests/v1/test_vllm_kv_cache_groups.py b/tests/v1/test_vllm_kv_cache_groups.py
index 0b1fe3bf82..021a72cb94 100644
--- a/tests/v1/test_vllm_kv_cache_groups.py
+++ b/tests/v1/test_vllm_kv_cache_groups.py
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # Standard
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
 # Third Party
+import pytest
 import torch
 
 # First Party
@@ -15,16 +16,43 @@
     num_engine_groups,
 )
 
+# Test doubles for the vLLM KV cache spec classes. Unit tests must run
+# without vLLM installed; sliding-window specs are detected by class name,
+# so the doubles share the vLLM class names.
+
 
 @dataclass
 class MockKVCacheSpec:
     block_size: int
 
 
+@dataclass
+class SlidingWindowSpec:
+    block_size: int
+    sliding_window: int
+
+
+@dataclass
+class SlidingWindowMLASpec(SlidingWindowSpec):
+    pass
+
+
+@dataclass
+class FullAttentionSpec:
+    block_size: int
+    sliding_window: "int | None" = None
+
+
+@dataclass
+class UniformTypeKVCacheSpecs:
+    block_size: int
+    kv_cache_specs: "dict[str, object]" = field(default_factory=dict)
+
+
 @dataclass
 class MockKVCacheGroup:
     layer_names: list[str]
-    kv_cache_spec: MockKVCacheSpec
+    kv_cache_spec: object
 
 
 @dataclass
@@ -94,3 +122,102 @@ def test_conversion_splits_by_lmcache_layer_identity():
         [20],
         [10],
     ]
+
+
+def test_conversion_resolves_sliding_window_size():
+    """A SlidingWindowSpec group carries its window size in tokens;
+    subclasses count too."""
+    spec = create_engine_group_infos_from_vllm(
+        MockKVCacheConfig(
+            kv_cache_groups=[
+                MockKVCacheGroup(["layer.0"], FullAttentionSpec(block_size=16)),
+                MockKVCacheGroup(
+                    ["layer.1"], SlidingWindowSpec(block_size=16, sliding_window=64)
+                ),
+                MockKVCacheGroup(
+                    ["layer.2"],
+                    SlidingWindowMLASpec(block_size=16, sliding_window=128),
+                ),
+            ]
+        ),
+        _same_shape_caches(["layer.0", "layer.1", "layer.2"]),
+    )
+
+    assert [group.sw_size_tokens for group in spec] == [-1, 64, 128]
+
+
+def test_conversion_ignores_full_attention_sliding_window():
+    """SWA layers managed as full attention (hybrid allocator disabled) are
+    not sliding window: vLLM allocates blocks for all tokens."""
+    spec = create_engine_group_infos_from_vllm(
+        MockKVCacheConfig(
+            kv_cache_groups=[
+                MockKVCacheGroup(
+                    ["layer.0", "layer.1"],
+                    FullAttentionSpec(block_size=16, sliding_window=1024),
+                ),
+            ]
+        ),
+        _same_shape_caches(["layer.0", "layer.1"]),
+    )
+
+    assert [group.sw_size_tokens for group in spec] == [-1]
+
+
+def test_conversion_defaults_sliding_window_for_non_sw_spec():
+    """Groups whose spec is not a SlidingWindowSpec resolve to
+    non-sliding-window."""
+    spec = create_engine_group_infos_from_vllm(
+        MockKVCacheConfig(
+            kv_cache_groups=[
+                MockKVCacheGroup(["layer.0"], MockKVCacheSpec(block_size=16))
+            ]
+        ),
+        _same_shape_caches(["layer.0"]),
+    )
+
+    assert [group.sw_size_tokens for group in spec] == [-1]
+
+
+def test_conversion_uniform_type_specs_resolve_per_layer():
+    """Inside a UniformTypeKVCacheSpecs group, per-layer specs decide the
+    window. SW layers with a distinct transfer identity get their own group
+    carrying the window size."""
+    caches = _same_shape_caches(["layer.0", "layer.1"])
+    # layer.1 has a different head count -> distinct transfer identity.
+    caches["layer.1"] = torch.randn(2, 32, 16, 16, 64, dtype=torch.float16)
+    uniform_spec = UniformTypeKVCacheSpecs(
+        block_size=16,
+        kv_cache_specs={
+            "layer.0": FullAttentionSpec(block_size=16),
+            "layer.1": SlidingWindowSpec(block_size=16, sliding_window=512),
+        },
+    )
+    spec = create_engine_group_infos_from_vllm(
+        MockKVCacheConfig(
+            kv_cache_groups=[MockKVCacheGroup(["layer.0", "layer.1"], uniform_spec)]
+        ),
+        caches,
+    )
+
+    assert [group.layer_indices for group in spec] == [(0,), (1,)]
+    assert [group.sw_size_tokens for group in spec] == [-1, 512]
+
+
+def test_conversion_mixed_window_layers_in_one_group_rejected():
+    """Same-identity layers mixing different windows are inconsistent vLLM
+    metadata and fail loudly."""
+    uniform_spec = UniformTypeKVCacheSpecs(
+        block_size=16,
+        kv_cache_specs={
+            "layer.0": FullAttentionSpec(block_size=16),
+            "layer.1": SlidingWindowSpec(block_size=16, sliding_window=64),
+        },
+    )
+    with pytest.raises(ValueError, match="different sliding window sizes"):
+        create_engine_group_infos_from_vllm(
+            MockKVCacheConfig(
+                kv_cache_groups=[MockKVCacheGroup(["layer.0", "layer.1"], uniform_spec)]
+            ),
+            _same_shape_caches(["layer.0", "layer.1"]),
+        )

From 08c93df09e00296c740f0ae26fa541a285f76cbc Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Fri, 12 Jun 2026 06:59:37 +0800
Subject: [PATCH 49/57] [Recipe] Recipe update for Qwen 3.6 27B, and general
 guideline for mamba models (#3645)

Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
---
 docs/source/mp/hybrid_models.rst          | 135 +++++++++++++++++-----
 docs/source/recipes/deepseek_v4_flash.rst |   2 +-
 docs/source/recipes/devstral.rst          |   2 +-
 docs/source/recipes/gemma3.rst            |   2 +-
 docs/source/recipes/gemma4.rst            |   2 +-
 docs/source/recipes/gpt_oss.rst           |   2 +-
 docs/source/recipes/index.rst             |   5 +-
 docs/source/recipes/llama.rst             |   2 +-
 docs/source/recipes/minimax_m2.rst        |   2 +-
 docs/source/recipes/mixtral.rst           |   2 +-
 docs/source/recipes/phi3.rst              |   2 +-
 docs/source/recipes/qwen3.rst             |   2 +-
 docs/source/recipes/qwen3_5.rst           |  56 ++++++---
 13 files changed, 163 insertions(+), 53 deletions(-)

diff --git a/docs/source/mp/hybrid_models.rst b/docs/source/mp/hybrid_models.rst
index 616c5fef87..9b84df3f96 100644
--- a/docs/source/mp/hybrid_models.rst
+++ b/docs/source/mp/hybrid_models.rst
@@ -54,45 +54,126 @@ detects the model's KV cache groups automatically at registration time.
 Mamba / Linear-Attention Hybrids
 --------------------------------
 
-Models that interleave **Mamba / Gated-DeltaNet layers** with full attention
-(e.g. ``Qwen/Qwen3.5-0.8B``) are supported. Their recurrent state caches are
-reinterpreted as opaque pages at registration time, so prefix caching and KV
-reuse work end to end. They need three extra flags:
+Models that interleave **Mamba / Gated-DeltaNet (GDN) linear-attention layers**
+with full attention — the Qwen3.5 and Qwen3.6 series (``Qwen/Qwen3.5-0.8B``,
+``Qwen/Qwen3.6-27B``, …), Qwen3-Next, and other GDN hybrids — are supported.
+Unlike a paged key/value cache, their linear-attention layers keep a recurrent
+**state cache** (a convolution + SSM state). LMCache reinterprets that state as
+an opaque page at registration time, so prefix caching and KV reuse work end to
+end without any model-specific transfer code.
+
+This section is the **general procedure for any such model**. The only
+per-model variable is the *unified block size* ``N`` (step 1); everything else
+is identical across models.
+
+.. _mamba-block-size:
+
+Step 1 — find the model's unified block size ``N``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``N`` is the **single number** that drives every other setting: the LMCache
+server's ``--chunk-size`` and vLLM's ``--max-num-batched-tokens`` are both
+derived from it (step 2). Get it wrong and LMCache raises at engine startup.
+
+For a Mamba / GDN hybrid, vLLM forces **one** block size across all KV cache
+groups, chosen large enough that an attention page is at least as big as a
+Mamba state page. It depends on the model's head dimensions and GDN state size,
+so it is **model-specific — never assume a value, read it from the model**.
+vLLM prints it once at startup::
+
+    INFO ... interface.py:670] Setting attention block size to 784 tokens to
+    ensure that attention page size is >= mamba page size.
+
+You do not need LMCache, a full serving run, or the weights to be quantized to
+read it — just launch vLLM until the line appears, then stop. The snippet below
+does exactly that and prints ``N``:
+
+.. code-block:: bash
+
+   MODEL=Qwen/Qwen3.6-27B
+   LOG=$(mktemp)
+
+   # Launch vLLM just far enough to size the KV cache; cheap settings only.
+   vllm serve "$MODEL" \
+       --mamba-cache-mode align --enable-prefix-caching \
+       --max-model-len 8192 --gpu-memory-utilization 0.5 \
+       --port 8011 > "$LOG" 2>&1 &
+   VLLM_PID=$!
+
+   # Wait for the block-size line (or a fatal error), then stop vLLM.
+   until grep -qiE "Setting attention block size|Error|Traceback" "$LOG"; do
+       sleep 3
+   done
+   grep -i "Setting attention block size" "$LOG"
+   kill "$VLLM_PID"
+
+The number in ``to N tokens`` is your ``N``. Values grow with model size; for
+example:
 
-#. vLLM must run with prefix caching and the ``align`` Mamba cache mode::
-
-       vllm serve Qwen/Qwen3.5-0.8B \
+.. list-table::
+   :header-rows: 1
+   :widths: 50 25 25
+
+   * - Model
+     - Unified block size ``N``
+     - GPUs
+   * - ``Qwen/Qwen3.6-27B``
+     - 784
+     - 1
+   * - ``Qwen/Qwen3.5-0.8B``
+     - 544
+     - 1
+
+Step 2 — derive the three required flags from ``N``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+#. **LMCache server** ``--chunk-size`` **= N** (or any multiple of ``N``). This
+   is the rule the connector enforces: LMCache's chunk size must be a multiple
+   of vLLM's unified block size, or registration fails::
+
+       lmcache server --chunk-size 784 --l1-size-gb 100 --eviction-policy LRU
+
+#. **vLLM** ``--max-num-batched-tokens`` **in [N, 2·N)** — setting it equal to
+   ``N`` is the simple, always-valid choice. Outside this range LMCache raises
+   at engine startup. ``align`` mode snapshots the Mamba state only at the
+   *end* of each scheduler step, so each prefill step must advance exactly one
+   block; a larger budget would let a step skip block boundaries, leaving no
+   snapshot for LMCache to store at those prefixes.
+
+#. **vLLM** ``--mamba-cache-mode align --enable-prefix-caching`` — ``align`` is
+   mandatory (GDN backends do not support the ``all`` mode)::
+
+       vllm serve <model> \
            --enable-prefix-caching --mamba-cache-mode align \
+           --max-num-batched-tokens 784 \
            --kv-transfer-config \
            '{"kv_connector":"LMCacheMPConnector", "kv_role":"kv_both"}'
 
-#. The LMCache server's ``--chunk-size`` must be a multiple of vLLM's unified
-   block size for the model (vLLM logs ``Setting attention block size to N
-   tokens`` at startup; for Qwen3.5-0.8B, ``N = 544``)::
-
-       lmcache server --chunk-size 544 --l1-size-gb 100 --eviction-policy LRU
-
-#. ``--max-num-batched-tokens`` must be at least the unified block size and
-   below twice it (LMCache raises at engine startup otherwise; setting it
-   equal to the block size is the simple choice)::
-
-       vllm serve ... --max-num-batched-tokens 544
+So for a freshly-probed model the whole derivation is just: read ``N`` (step 1),
+then pass ``--chunk-size N`` to the server and ``--max-num-batched-tokens N`` to
+vLLM.
 
-   ``align`` mode snapshots the Mamba state only at the *end* of each
-   scheduler step; a larger budget would let one step skip block boundaries,
-   leaving no snapshot for LMCache to store at those prefixes.
+No ``--no-disable-hybrid-kv-cache-manager`` or attention-backend flag is needed;
+``LMCacheMPConnector`` advertises hybrid support and vLLM auto-selects the GDN
+backend.
 
-Caveats:
+Caveats
+^^^^^^^
 
 - Generation is **not bit-exact** between a cached and a fresh run: GDN
-  backends do not support vLLM's batch-invariant mode. Expect score-level
-  equivalence, not token-level.
-- The cached pages are byte-opaque, so content-aware features (CacheGen
+  backends do not support vLLM's batch-invariant mode. Validate with a
+  **score-level** comparison (see `Verifying Correctness`_), not a token-level
+  diff.
+- The cached pages are **byte-opaque**, so content-aware features (CacheGen
   compression, CacheBlend) do not apply, and cache entries must not be shared
   across engines with different attention backends or kernel block sizes.
+- Several of these models are **vision-language** (they load a vision tower).
+  The validated, supported path is **text** KV caching; image/video KV caching
+  is not validated.
+- vLLM's Mamba prefix caching in ``align`` mode is marked experimental upstream.
 
-See the :doc:`Qwen3.5 recipe <../recipes/qwen3_5>` for the validated
-end-to-end commands.
+See the :doc:`Qwen3.5 / Qwen3.6 recipe <../recipes/qwen3_5>` for the validated
+end-to-end commands and the per-model block sizes.
 
 What Is Not Supported Yet
 -------------------------
diff --git a/docs/source/recipes/deepseek_v4_flash.rst b/docs/source/recipes/deepseek_v4_flash.rst
index 4bb4baaad6..ac2a32dc95 100644
--- a/docs/source/recipes/deepseek_v4_flash.rst
+++ b/docs/source/recipes/deepseek_v4_flash.rst
@@ -61,7 +61,7 @@ Validated models
       required for this model; ``--enable-expert-parallel`` distributes the MoE
       experts across the tensor-parallel ranks. Adjust
       ``--tensor-parallel-size`` to match your hardware. For the generic
-      LMCache + vLLM wiring (ports, remote hosts, in-process mode), see
+      LMCache + vLLM wiring (ports, remote hosts), see
       :doc:`../mp/quickstart`.
 
       If there are any issues with vLLM setup, please refer to the
diff --git a/docs/source/recipes/devstral.rst b/docs/source/recipes/devstral.rst
index 1f7e4814a2..5459a842fc 100644
--- a/docs/source/recipes/devstral.rst
+++ b/docs/source/recipes/devstral.rst
@@ -42,7 +42,7 @@ Validated models
       |
 
       Adjust ``--tensor-parallel-size`` to match your hardware. For the
-      generic LMCache + vLLM wiring (ports, remote hosts, in-process mode),
+      generic LMCache + vLLM wiring (ports, remote hosts),
       see :doc:`../mp/quickstart`.
 
       If there are any issues with vLLM setup, please refer to the
diff --git a/docs/source/recipes/gemma3.rst b/docs/source/recipes/gemma3.rst
index f7de109aed..4a50a182f7 100644
--- a/docs/source/recipes/gemma3.rst
+++ b/docs/source/recipes/gemma3.rst
@@ -49,7 +49,7 @@ Validated models
       ``google/gemma-3-4b-it`` is a gated model; authenticate with the Hugging
       Face Hub (e.g. set ``HF_TOKEN``) before serving. Adjust
       ``--tensor-parallel-size`` to match your hardware. For the generic LMCache
-      + vLLM wiring (ports, remote hosts, in-process mode), see
+      + vLLM wiring (ports, remote hosts), see
       :doc:`../mp/quickstart`.
 
       If there are any issues with vLLM setup, please refer to the
diff --git a/docs/source/recipes/gemma4.rst b/docs/source/recipes/gemma4.rst
index a2ed89894e..ed62997ee6 100644
--- a/docs/source/recipes/gemma4.rst
+++ b/docs/source/recipes/gemma4.rst
@@ -54,7 +54,7 @@ Validated models
       |
 
       Adjust ``--tensor-parallel-size`` to match your hardware. For the
-      generic LMCache + vLLM wiring (ports, remote hosts, in-process mode),
+      generic LMCache + vLLM wiring (ports, remote hosts),
       see :doc:`../mp/quickstart`.
 
       If there are any issues with vLLM setup, please refer to the
diff --git a/docs/source/recipes/gpt_oss.rst b/docs/source/recipes/gpt_oss.rst
index 97e3d6c705..11603879da 100644
--- a/docs/source/recipes/gpt_oss.rst
+++ b/docs/source/recipes/gpt_oss.rst
@@ -55,7 +55,7 @@ Validated models
       |
 
       Adjust ``--tensor-parallel-size`` to match your hardware. For the
-      generic LMCache + vLLM wiring (ports, remote hosts, in-process mode),
+      generic LMCache + vLLM wiring (ports, remote hosts),
       see :doc:`../mp/quickstart`.
 
       If there are any issues with vLLM setup, please refer to the
diff --git a/docs/source/recipes/index.rst b/docs/source/recipes/index.rst
index aaf053a3d7..37ca099ab2 100644
--- a/docs/source/recipes/index.rst
+++ b/docs/source/recipes/index.rst
@@ -25,7 +25,7 @@ Each recipe page is intentionally minimal:
   with per-method validation status. Extensible: new methods get a row.
 - **Caveats** -- known limitations, if any.
 
-For the generic LMCache + engine wiring (ports, remote hosts, in-process mode,
+For the generic LMCache + engine wiring (ports, remote hosts,
 sending a first request), see :doc:`../mp/quickstart`. Recipes assume that
 page as a prerequisite.
 
@@ -101,7 +101,8 @@ Supported architectures
      - :doc:`qwen3`
 
    * - ``Qwen3_5ForConditionalGeneration``
-     - ``Qwen/Qwen3.5-0.8B``
+     - | ``Qwen/Qwen3.6-27B``
+       | ``Qwen/Qwen3.5-0.8B``
      - ✓
      - —
      - —
diff --git a/docs/source/recipes/llama.rst b/docs/source/recipes/llama.rst
index d7fafb5575..ef151b3b99 100644
--- a/docs/source/recipes/llama.rst
+++ b/docs/source/recipes/llama.rst
@@ -98,7 +98,7 @@ Validated models
       |
 
       Adjust ``--tensor-parallel-size`` to match your hardware. For the
-      generic LMCache + vLLM wiring (ports, remote hosts, in-process mode),
+      generic LMCache + vLLM wiring (ports, remote hosts),
       see :doc:`../mp/quickstart`.
 
    .. tab-item:: SGLang
diff --git a/docs/source/recipes/minimax_m2.rst b/docs/source/recipes/minimax_m2.rst
index 44ab625544..59f9dbc1e1 100644
--- a/docs/source/recipes/minimax_m2.rst
+++ b/docs/source/recipes/minimax_m2.rst
@@ -75,7 +75,7 @@ Validated models
       |
 
       Adjust ``--tensor-parallel-size`` to match your hardware. For the
-      generic LMCache + vLLM wiring (ports, remote hosts, in-process mode),
+      generic LMCache + vLLM wiring (ports, remote hosts),
       see :doc:`../mp/quickstart`.
 
       If there are any issues with vLLM setup, please refer to the
diff --git a/docs/source/recipes/mixtral.rst b/docs/source/recipes/mixtral.rst
index 7e94c6feeb..6345bdba14 100644
--- a/docs/source/recipes/mixtral.rst
+++ b/docs/source/recipes/mixtral.rst
@@ -58,7 +58,7 @@ Validated models
       |
 
       Adjust ``--tensor-parallel-size`` to match your hardware. For the
-      generic LMCache + vLLM wiring (ports, remote hosts, in-process mode),
+      generic LMCache + vLLM wiring (ports, remote hosts),
       see :doc:`../mp/quickstart`.
 
    .. tab-item:: SGLang
diff --git a/docs/source/recipes/phi3.rst b/docs/source/recipes/phi3.rst
index 743bf35906..3214c9085c 100644
--- a/docs/source/recipes/phi3.rst
+++ b/docs/source/recipes/phi3.rst
@@ -56,7 +56,7 @@ Validated models
       |
       
       Adjust ``--tensor-parallel-size`` to match your hardware. For the
-      generic LMCache + vLLM wiring (ports, remote hosts, in-process mode),
+      generic LMCache + vLLM wiring (ports, remote hosts),
       see :doc:`../mp/quickstart`.
 
    .. tab-item:: SGLang
diff --git a/docs/source/recipes/qwen3.rst b/docs/source/recipes/qwen3.rst
index 1a7bddc46e..5ee5e4425e 100644
--- a/docs/source/recipes/qwen3.rst
+++ b/docs/source/recipes/qwen3.rst
@@ -86,7 +86,7 @@ Validated models
       |
 
       Adjust ``--tensor-parallel-size`` to match your hardware. For the
-      generic LMCache + vLLM wiring (ports, remote hosts, in-process mode),
+      generic LMCache + vLLM wiring (ports, remote hosts),
       see :doc:`../mp/quickstart`.
 
       If there are any issues with vLLM setup, please refer to the
diff --git a/docs/source/recipes/qwen3_5.rst b/docs/source/recipes/qwen3_5.rst
index 2a94418fe4..4d44f9eadb 100644
--- a/docs/source/recipes/qwen3_5.rst
+++ b/docs/source/recipes/qwen3_5.rst
@@ -4,13 +4,16 @@ Qwen3_5ForConditionalGeneration
 ===============================
 
 A hybrid architecture interleaving Mamba / Gated-DeltaNet (GDN) linear-attention
-layers with full-attention layers. LMCache reinterprets the recurrent state
-caches as opaque pages at registration time; see :doc:`../mp/hybrid_models`.
+layers with full-attention layers, shared by the **Qwen3.5 and Qwen3.6**
+series. LMCache reinterprets the recurrent state caches as opaque pages at
+registration time; see :doc:`../mp/hybrid_models` for the general handling of
+Mamba / linear-attention models.
 
 Validated models
 ----------------
 
-- `Qwen/Qwen3.5-0.8B <https://huggingface.co/Qwen/Qwen3.5-0.8B>`_
+- `Qwen/Qwen3.6-27B <https://huggingface.co/Qwen/Qwen3.6-27B>`_ (1 GPU)
+- `Qwen/Qwen3.5-0.8B <https://huggingface.co/Qwen/Qwen3.5-0.8B>`_ (1 GPU)
 
 .. tab-set::
    :sync-group: engine
@@ -24,27 +27,49 @@ Validated models
 
       **Status:** Validated with LMCache.
 
-      Start the LMCache MP server. ``--chunk-size`` must be a multiple of
-      vLLM's unified block size for the model — vLLM logs ``Setting attention
-      block size to N tokens`` at startup; for Qwen3.5-0.8B, ``N = 544``:
+      Every model in this family needs the same three settings: the ``align``
+      Mamba cache mode, prefix caching, and a chunk size matched to vLLM's
+      *unified block size*. That block size is model-specific — vLLM logs
+      ``Setting attention block size to N tokens`` at startup:
 
-      .. code-block:: bash
+      .. list-table::
+         :header-rows: 1
+         :widths: 50 25 25
 
-         lmcache server --chunk-size 544 --l1-size-gb 100 --eviction-policy LRU
+         * - Model
+           - Unified block size ``N``
+           - GPUs
+         * - ``Qwen/Qwen3.6-27B``
+           - 784
+           - 1
+         * - ``Qwen/Qwen3.5-0.8B``
+           - 544
+           - 1
 
-      |
+      Set the LMCache server's ``--chunk-size`` to that ``N`` (or a multiple of
+      it) and vLLM's ``--max-num-batched-tokens`` to ``N``.
+
+      **Qwen3.6-27B** (1 GPU, ``N = 784``):
+
+      .. code-block:: bash
 
-      **Qwen3.5-0.8B** (1 GPU):
+         lmcache server --chunk-size 784 --l1-size-gb 100 --eviction-policy LRU
 
       .. code-block:: bash
 
-         vllm serve Qwen/Qwen3.5-0.8B \
+         vllm serve Qwen/Qwen3.6-27B \
              --enable-prefix-caching \
              --mamba-cache-mode align \
-             --max-num-batched-tokens 544 \
+             --max-num-batched-tokens 784 \
              --kv-transfer-config \
              '{"kv_connector":"LMCacheMPConnector", "kv_role":"kv_both"}'
 
+      |
+
+      **Qwen3.5-0.8B** (1 GPU, ``N = 544``): identical to the above, with
+      ``784`` replaced by ``544`` in both ``--chunk-size`` and
+      ``--max-num-batched-tokens``.
+
       ``--mamba-cache-mode align`` is required (GDN does not support the
       ``all`` mode). ``--max-num-batched-tokens`` must be at least the unified
       block size and below twice it — LMCache raises at engine startup
@@ -52,8 +77,8 @@ Validated models
       ends, so each prefill step must advance exactly one block for every
       block boundary to hold a reusable snapshot.
 
-      For the generic LMCache + vLLM wiring (ports, remote hosts, in-process
-      mode), see :doc:`../mp/quickstart`.
+      For the generic LMCache + vLLM wiring (ports, remote hosts), see
+      :doc:`../mp/quickstart`.
 
    .. tab-item:: SGLang
 
@@ -94,3 +119,6 @@ Caveats
   shared across engines with different attention backends or kernel block
   sizes.
 - vLLM's Mamba prefix caching in ``align`` mode is experimental.
+- ``Qwen/Qwen3.6-27B`` is a vision-language model (it loads a vision tower);
+  the LMCache validation covers **text** generation (the ``hma_lm_eval_qwen3_5``
+  gsm8k store-vs-retrieve gate). Caching of image/video KV is not validated.

From a8a21fff593bd07f613ca876157ea7e44804be18 Mon Sep 17 00:00:00 2001
From: aeon-x <talexcao@gmail.com>
Date: Thu, 11 Jun 2026 16:26:34 -0700
Subject: [PATCH 50/57] fix comments

Signed-off-by: aeon-x <talexcao@gmail.com>
---
 lmcache/v1/mp_coordinator/app.py              |   5 +-
 lmcache/v1/mp_coordinator/config.py           |  12 +-
 lmcache/v1/mp_coordinator/http_apis/l2_api.py |  61 +++++--
 .../v1/mp_coordinator/l2/eviction_manager.py  | 155 +++++++-----------
 lmcache/v1/mp_coordinator/l2/quota_manager.py |  92 -----------
 5 files changed, 120 insertions(+), 205 deletions(-)
 delete mode 100644 lmcache/v1/mp_coordinator/l2/quota_manager.py

diff --git a/lmcache/v1/mp_coordinator/app.py b/lmcache/v1/mp_coordinator/app.py
index 13b52bbc6a..1ff4ebd874 100644
--- a/lmcache/v1/mp_coordinator/app.py
+++ b/lmcache/v1/mp_coordinator/app.py
@@ -28,11 +28,11 @@
 
 # First Party
 from lmcache.logging import init_logger
+from lmcache.v1.distributed.quota_manager import QuotaManager
 from lmcache.v1.mp_coordinator.config import MPCoordinatorConfig
 from lmcache.v1.mp_coordinator.l2.eviction_manager import (
     L2EvictionManager,
 )
-from lmcache.v1.mp_coordinator.l2.quota_manager import L2QuotaManager
 from lmcache.v1.mp_coordinator.l2.usage_manager import L2UsageManager
 from lmcache.v1.mp_coordinator.registry import InstanceRegistry
 from lmcache.v1.utils.router_discovery import discover_api_routers
@@ -70,12 +70,13 @@ def create_app(config: MPCoordinatorConfig) -> FastAPI:
         ``usage_manager``); all ``http_apis`` routers are registered.
     """
     registry = InstanceRegistry()
-    quota_manager = L2QuotaManager()
+    quota_manager = QuotaManager()
     usage_manager = L2UsageManager()
     eviction_manager = L2EvictionManager(
         quota_manager=quota_manager,
         usage_manager=usage_manager,
         eviction_ratio=config.eviction_ratio,
+        trigger_watermark=config.trigger_watermark,
     )
 
     async def _health_loop() -> None:
diff --git a/lmcache/v1/mp_coordinator/config.py b/lmcache/v1/mp_coordinator/config.py
index dd7876dcbb..90bf77da0e 100644
--- a/lmcache/v1/mp_coordinator/config.py
+++ b/lmcache/v1/mp_coordinator/config.py
@@ -31,8 +31,10 @@ class MPCoordinatorConfig:
             ``0`` disables the health-check loop.
         eviction_check_interval: Seconds between eviction sweeps. A value of
             ``0`` disables the eviction loop.
-        eviction_ratio: Fraction of over-quota bytes to target per eviction
+        eviction_ratio: Fraction of tracked keys (by count) to evict per
             cycle (0.0 to 1.0).
+        trigger_watermark: Eviction fires when usage reaches this fraction
+            of the quota (0.0 to 1.0).
     """
 
     host: str = "0.0.0.0"
@@ -40,7 +42,8 @@ class MPCoordinatorConfig:
     instance_timeout: float = 30.0
     health_check_interval: float = 10.0
     eviction_check_interval: float = 5.0
-    eviction_ratio: float = 0.5
+    eviction_ratio: float = 0.2
+    trigger_watermark: float = 1.0
 
     def __post_init__(self) -> None:
         """Validate timing parameters.
@@ -56,6 +59,10 @@ def __post_init__(self) -> None:
             raise ValueError("eviction_check_interval must be non-negative")
         if not 0.0 <= self.eviction_ratio <= 1.0:
             raise ValueError("eviction_ratio must be between 0.0 and 1.0")
+        if not 0.0 < self.trigger_watermark <= 1.0:
+            raise ValueError(
+                "trigger_watermark must be between 0.0 (exclusive) and 1.0"
+            )
 
     @classmethod
     def from_env(cls) -> "MPCoordinatorConfig":
@@ -93,4 +100,5 @@ def _num(name: str, default: float, cast) -> float:
                 "EVICTION_CHECK_INTERVAL", cls.eviction_check_interval, float
             ),
             eviction_ratio=_num("EVICTION_RATIO", cls.eviction_ratio, float),
+            trigger_watermark=_num("TRIGGER_WATERMARK", cls.trigger_watermark, float),
         )
diff --git a/lmcache/v1/mp_coordinator/http_apis/l2_api.py b/lmcache/v1/mp_coordinator/http_apis/l2_api.py
index d94f9a9562..753d42d966 100644
--- a/lmcache/v1/mp_coordinator/http_apis/l2_api.py
+++ b/lmcache/v1/mp_coordinator/http_apis/l2_api.py
@@ -10,12 +10,14 @@
 from fastapi.responses import JSONResponse
 
 # First Party
+from lmcache.v1.distributed.api import ObjectKey
+from lmcache.v1.distributed.quota_manager import QuotaManager
 from lmcache.v1.mp_coordinator.l2.eviction_manager import (
     L2EvictionManager,
 )
-from lmcache.v1.mp_coordinator.l2.quota_manager import L2QuotaManager
 from lmcache.v1.mp_coordinator.l2.usage_manager import L2UsageManager
 from lmcache.v1.mp_coordinator.schemas import (
+    CacheKey,
     EventType,
     L2StatusListResponse,
     L2StatusResponse,
@@ -28,6 +30,12 @@
 router = APIRouter()
 
 _GB = 1024**3
+_DEFAULT_SALT_SENTINEL = "_default"
+
+
+def _resolve_salt_from_api_path(cache_salt: str) -> str:
+    """Map the ``_default`` sentinel to the empty string."""
+    return "" if cache_salt == _DEFAULT_SALT_SENTINEL else cache_salt
 
 
 def _gb(n_bytes: int) -> float:
@@ -35,14 +43,31 @@ def _gb(n_bytes: int) -> float:
     return n_bytes / _GB
 
 
-def _quota_manager(request: Request) -> L2QuotaManager:
+def _to_object_key(ck: CacheKey) -> ObjectKey:
+    """Convert a wire-format CacheKey to an ObjectKey.
+
+    Args:
+        ck: The cache key from the REST API.
+
+    Returns:
+        The equivalent ObjectKey for internal use.
+    """
+    return ObjectKey(
+        chunk_hash=bytes.fromhex(ck.chunk_hash_hex),
+        model_name=ck.model_name,
+        kv_rank=ck.kv_rank,
+        cache_salt=ck.cache_salt,
+    )
+
+
+def _quota_manager(request: Request) -> QuotaManager:
     """Return the shared quota manager from app state.
 
     Args:
         request: The incoming request.
 
     Returns:
-        The shared :class:`L2QuotaManager`.
+        The shared :class:`QuotaManager`.
 
     Raises:
         RuntimeError: If the manager is not initialized.
@@ -99,16 +124,18 @@ async def set_quota(
     """Create or update a quota for the given ``cache_salt``.
 
     Args:
-        cache_salt: The tenant identifier.
+        cache_salt: The tenant identifier. Use ``_default`` for the
+            empty salt.
         body: Quota limit to apply.
         request: The incoming request.
 
     Returns:
         The applied quota.
     """
+    cache_salt = _resolve_salt_from_api_path(cache_salt)
     limit_bytes = int(body.limit_gb * _GB)
     try:
-        _quota_manager(request).set(cache_salt, limit_bytes)
+        _quota_manager(request).set_quota(cache_salt, limit_bytes)
     except ValueError:
         return JSONResponse(status_code=400, content={"error": "invalid quota limit"})
     return QuotaResponse(
@@ -123,13 +150,15 @@ async def delete_quota(cache_salt: str, request: Request) -> QuotaResponse:
     """Remove a salt's quota entry.
 
     Args:
-        cache_salt: The tenant identifier.
+        cache_salt: The tenant identifier. Use ``_default`` for the
+            empty salt.
         request: The incoming request.
 
     Returns:
         Whether the entry was found and removed.
     """
-    removed = _quota_manager(request).delete(cache_salt)
+    cache_salt = _resolve_salt_from_api_path(cache_salt)
+    removed = _quota_manager(request).delete_quota(cache_salt)
     return QuotaResponse(
         cache_salt=cache_salt,
         limit_gb=0.0,
@@ -156,11 +185,12 @@ async def report_events(
     tracker = _usage_manager(request)
     ctrl = _eviction_manager(request)
     for event in body.events:
+        ok = _to_object_key(event.key)
         if event.type == EventType.STORE:
             tracker.record_stored(event.key.cache_salt, event.bytes)
-            ctrl.on_store(event.key, event.bytes)
+            ctrl.on_store(ok, event.bytes)
         elif event.type == EventType.LOOKUP:
-            ctrl.on_lookup(event.key)
+            ctrl.on_lookup(ok)
     return ReportUsageResponse(recorded=len(body.events))
 
 
@@ -172,20 +202,23 @@ async def get_status(cache_salt: str, request: Request) -> L2StatusResponse:
     """Read quota and usage for a single salt.
 
     Args:
-        cache_salt: The tenant identifier.
+        cache_salt: The tenant identifier. Use ``_default`` for the
+            empty salt.
         request: The incoming request.
 
     Returns:
         Combined quota and usage detail.
     """
+    cache_salt = _resolve_salt_from_api_path(cache_salt)
     tracker = _usage_manager(request)
     store = _quota_manager(request)
     usage = tracker.get(cache_salt)
-    limit = store.get(cache_salt)
+    exists = store.has_quota(cache_salt)
+    limit = store.get_limit_bytes(cache_salt)
     return L2StatusResponse(
         cache_salt=cache_salt,
-        quota_limit_gb=_gb(limit) if limit is not None else 0.0,
-        quota_exists=limit is not None,
+        quota_limit_gb=_gb(limit) if exists else 0.0,
+        quota_exists=exists,
         usage_gb=_gb(usage),
     )
 
@@ -204,7 +237,7 @@ async def list_status(request: Request) -> L2StatusListResponse:
     store = _quota_manager(request)
     by_salt = tracker.get_all()
     total = tracker.get_total()
-    quota_entries = {e.cache_salt: e.limit_bytes for e in store.list_all()}
+    quota_entries = {e.cache_salt: e.limit_bytes for e in store.list_quotas()}
     all_salts = sorted(set(by_salt) | set(quota_entries))
     return L2StatusListResponse(
         total_gb=_gb(total),
diff --git a/lmcache/v1/mp_coordinator/l2/eviction_manager.py b/lmcache/v1/mp_coordinator/l2/eviction_manager.py
index 1ce491d23c..04398167f7 100644
--- a/lmcache/v1/mp_coordinator/l2/eviction_manager.py
+++ b/lmcache/v1/mp_coordinator/l2/eviction_manager.py
@@ -1,16 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 """Coordinator-side eviction manager with per-``cache_salt`` LRU.
 
-Mirrors the structure of
-:class:`~lmcache.v1.distributed.eviction_policy.isolated_lru.IsolatedLRUEvictionPolicy`
-but runs inside the coordinator process and uses a lightweight
-:class:`CacheKey` instead of :class:`ObjectKey` (which pulls in
-``torch``).
+Wraps :class:`IsolatedLRUEvictionPolicy` for LRU key ordering,
+matching the eviction logic in
+:class:`~lmcache.v1.distributed.storage_controllers.eviction_controller.L2EvictionController`.
 
 The manager periodically checks per-salt usage
-(from :class:`L2UsageManager`) against limits
-(from :class:`L2QuotaManager`).
-When a salt exceeds its quota, it selects LRU victims and **logs**
+(from :class:`L2UsageManager`) against ``watermark * quota``
+(from :class:`QuotaManager`).
+When a salt exceeds its threshold, it selects LRU keys and **logs**
 them — actual deletion is not implemented yet.
 """
 
@@ -18,14 +16,16 @@
 from __future__ import annotations
 
 # Standard
-from collections import OrderedDict
 import threading
 
 # First Party
 from lmcache.logging import init_logger
-from lmcache.v1.mp_coordinator.l2.quota_manager import L2QuotaManager
+from lmcache.v1.distributed.api import ObjectKey
+from lmcache.v1.distributed.eviction_policy.isolated_lru import (
+    IsolatedLRUEvictionPolicy,
+)
+from lmcache.v1.distributed.quota_manager import QuotaManager
 from lmcache.v1.mp_coordinator.l2.usage_manager import L2UsageManager
-from lmcache.v1.mp_coordinator.schemas import CacheKey
 
 logger = init_logger(__name__)
 
@@ -33,112 +33,103 @@
 class L2EvictionManager:
     """Per-``cache_salt`` LRU eviction manager for the coordinator.
 
-    Maintains one ``OrderedDict`` per ``cache_salt``, ordered from
-    least-recently-used (front) to most-recently-used (end). Also
-    tracks per-key byte sizes so eviction can be byte-aware.
+    Delegates LRU ordering to :class:`IsolatedLRUEvictionPolicy`.
+    Mirrors the trigger and ratio logic of
+    :class:`L2EvictionController._check_and_evict_by_cache_salt`:
+    eviction fires when ``usage >= watermark * quota``, and
+    ``eviction_ratio`` is passed directly to the policy as a
+    fraction of keys by count.
 
-    Thread-safety: every public method acquires ``_lock``.
+    Thread-safety: ``_key_sizes`` is guarded by ``_lock``;
+    the policy has its own internal lock.
 
     Args:
         quota_manager: The shared quota registry.
         usage_manager: The shared usage manager.
-        eviction_ratio: Fraction of over-quota bytes to target for
-            eviction each cycle.
+        eviction_ratio: Fraction of tracked keys to evict per
+            cycle (by count). Passed to the policy.
+        trigger_watermark: Eviction fires when usage reaches
+            this fraction of the quota.
     """
 
     def __init__(
         self,
-        quota_manager: L2QuotaManager,
+        quota_manager: QuotaManager,
         usage_manager: L2UsageManager,
         eviction_ratio: float = 0.5,
+        trigger_watermark: float = 1.0,
     ) -> None:
         self._lock = threading.Lock()
         self._quota_manager = quota_manager
         self._usage_manager = usage_manager
         self._eviction_ratio = max(0.0, min(1.0, eviction_ratio))
-        self._per_salt_order: dict[str, OrderedDict[CacheKey, None]] = {}
-        self._key_sizes: dict[CacheKey, int] = {}
+        self._trigger_watermark = trigger_watermark
+        self._policy = IsolatedLRUEvictionPolicy()
+        self._key_sizes: dict[ObjectKey, int] = {}
 
-    def on_store(self, key: CacheKey, size_bytes: int) -> None:
+    def on_store(self, key: ObjectKey, size_bytes: int) -> None:
         """Record that a key was stored.
 
-        Inserts into (or refreshes) the LRU for the key's
-        ``cache_salt``, and records the per-key byte size.
-
         Args:
-            key: The cache key that was stored.
+            key: The object key that was stored.
             size_bytes: Number of bytes stored for this key.
         """
+        self._policy.on_keys_created([key])
         with self._lock:
-            order = self._per_salt_order.get(key.cache_salt)
-            if order is None:
-                order = OrderedDict()
-                self._per_salt_order[key.cache_salt] = order
-            if key in order:
-                order.move_to_end(key)
-            else:
-                order[key] = None
             self._key_sizes[key] = size_bytes
 
-    def on_lookup(self, key: CacheKey) -> None:
+    def on_lookup(self, key: ObjectKey) -> None:
         """Record that a key was looked up (touch — move to MRU end).
 
         Args:
-            key: The cache key that was looked up.
+            key: The object key that was looked up.
         """
-        with self._lock:
-            order = self._per_salt_order.get(key.cache_salt)
-            if order is not None and key in order:
-                order.move_to_end(key)
+        self._policy.on_keys_touched([key])
 
-    def on_remove(self, keys: list[CacheKey]) -> None:
+    def on_remove(self, keys: list[ObjectKey]) -> None:
         """Remove keys from LRU tracking (after eviction is executed).
 
         Args:
-            keys: The cache keys that were removed.
+            keys: The object keys that were removed.
         """
         if not keys:
             return
+        self._policy.on_keys_removed(keys)
         with self._lock:
             for key in keys:
-                order = self._per_salt_order.get(key.cache_salt)
-                if order is None:
-                    continue
-                order.pop(key, None)
-                if not order:
-                    del self._per_salt_order[key.cache_salt]
                 self._key_sizes.pop(key, None)
 
-    def execute_evictions(self) -> dict[str, list[CacheKey]]:
+    def execute_evictions(self) -> dict[str, list[ObjectKey]]:
         """Check all tracked salts against their quotas and log eviction candidates.
 
-        Salts with no quota or a zero quota are fully evicted. Salts
-        over quota have LRU keys selected targeting ``eviction_ratio``
-        of the over-quota bytes. Keys are logged but not actually
-        deleted.
+        For every tracked salt, compare usage against
+        ``watermark * quota``. Salts over threshold get eviction
+        scoped to their own LRU list. Salts with no quota or zero
+        quota get a full eviction (ratio=1.0).
 
         Returns:
             A mapping of ``cache_salt`` to the list of keys selected
             for eviction.
         """
-        quotas = {e.cache_salt: e.limit_bytes for e in self._quota_manager.list_all()}
-        with self._lock:
-            tracked_salts = list(self._per_salt_order.keys())
-
-        eviction_plan: dict[str, list[CacheKey]] = {}
+        tracked_salts = self._policy.get_tracked_salts()
+        eviction_plan: dict[str, list[ObjectKey]] = {}
 
         for cache_salt in tracked_salts:
-            limit_bytes = quotas.get(cache_salt, 0)
             current_bytes = self._usage_manager.get(cache_salt)
-            if current_bytes <= limit_bytes:
+            if current_bytes <= 0:
+                continue
+            limit = self._quota_manager.get_limit_bytes(cache_salt)
+            if current_bytes < self._trigger_watermark * limit:
                 continue
 
-            over_bytes = current_bytes - limit_bytes
-            target_bytes = int(over_bytes * self._eviction_ratio)
-            if target_bytes == 0 and over_bytes > 0:
-                target_bytes = over_bytes
+            effective_ratio = 1.0 if limit == 0 else self._eviction_ratio
+            actions = self._policy.get_eviction_actions(
+                effective_ratio, cache_salt=cache_salt
+            )
+            keys_to_evict: list[ObjectKey] = []
+            for action in actions:
+                keys_to_evict.extend(action.keys)
 
-            keys_to_evict = self._select_keys_to_evict(cache_salt, target_bytes)
             if keys_to_evict:
                 eviction_plan[cache_salt] = keys_to_evict
                 with self._lock:
@@ -147,50 +138,24 @@ def execute_evictions(self) -> dict[str, list[CacheKey]]:
                 logger.info(
                     "Eviction plan for cache_salt=%r: %d keys "
                     "(%d bytes) to free; usage=%d, quota=%d, "
-                    "over_by=%d",
+                    "watermark=%.2f, ratio=%.2f",
                     cache_salt,
                     len(keys_to_evict),
                     evict_bytes,
                     current_bytes,
-                    limit_bytes,
-                    over_bytes,
+                    limit,
+                    self._trigger_watermark,
+                    effective_ratio,
                 )
                 for k, size in zip(keys_to_evict, sizes, strict=True):
                     logger.info(
                         "  -> evict key: model=%s, kv_rank=%d, hash=%s, size=%d",
                         k.model_name,
                         k.kv_rank,
-                        k.chunk_hash_hex,
+                        k.chunk_hash.hex(),
                         size,
                     )
 
         # TODO: once eviction is wired end-to-end, call on_remove()
         # for each salt's victims after the MP server confirms deletion.
         return eviction_plan
-
-    def _select_keys_to_evict(
-        self, cache_salt: str, target_bytes: int
-    ) -> list[CacheKey]:
-        """Select LRU victims from a salt's bucket to free ``target_bytes``.
-
-        Args:
-            cache_salt: The salt to evict from.
-            target_bytes: Target number of bytes to free.
-
-        Returns:
-            List of keys in LRU order (oldest first).
-        """
-        with self._lock:
-            order = self._per_salt_order.get(cache_salt)
-            if not order:
-                return []
-
-            keys_to_evict: list[CacheKey] = []
-            freed = 0
-            for key in order:
-                keys_to_evict.append(key)
-                freed += self._key_sizes.get(key, 0)
-                if freed >= target_bytes:
-                    break
-
-            return keys_to_evict
diff --git a/lmcache/v1/mp_coordinator/l2/quota_manager.py b/lmcache/v1/mp_coordinator/l2/quota_manager.py
deleted file mode 100644
index c85962e2b1..0000000000
--- a/lmcache/v1/mp_coordinator/l2/quota_manager.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-"""Lightweight in-memory quota registry for the MP coordinator.
-
-Holds per-``cache_salt`` byte limits. The coordinator is the single
-source of truth for quotas; MP servers query the coordinator to obtain
-their limits. This class is intentionally free of heavy dependencies
-(no ``torch``, no ``distributed`` layer imports) so the coordinator
-process stays lightweight.
-"""
-
-# Future
-from __future__ import annotations
-
-# Standard
-from dataclasses import dataclass
-import threading
-
-
-@dataclass(frozen=True)
-class QuotaEntry:
-    """Snapshot of a single quota registration.
-
-    Attributes:
-        cache_salt: The tenant identifier.
-        limit_bytes: The byte budget for this tenant.
-    """
-
-    cache_salt: str
-    limit_bytes: int
-
-
-class L2QuotaManager:
-    """Thread-safe in-memory registry of byte quotas keyed by ``cache_salt``.
-
-    All public methods acquire an internal lock so the store stays
-    consistent under concurrent access from FastAPI endpoint handlers.
-    """
-
-    def __init__(self) -> None:
-        self._lock = threading.Lock()
-        self._limits: dict[str, int] = {}
-
-    def set(self, cache_salt: str, limit_bytes: int) -> None:
-        """Create or update the quota for ``cache_salt``.
-
-        Args:
-            cache_salt: The tenant identifier.
-            limit_bytes: The byte budget (must be non-negative).
-
-        Raises:
-            ValueError: If ``limit_bytes`` is negative.
-        """
-        if limit_bytes < 0:
-            raise ValueError(f"limit_bytes must be non-negative (got {limit_bytes})")
-        with self._lock:
-            self._limits[cache_salt] = limit_bytes
-
-    def get(self, cache_salt: str) -> int | None:
-        """Return the limit for ``cache_salt``, or ``None`` if unregistered.
-
-        Args:
-            cache_salt: The tenant identifier.
-
-        Returns:
-            The byte limit, or ``None`` if no quota is registered.
-        """
-        with self._lock:
-            return self._limits.get(cache_salt)
-
-    def delete(self, cache_salt: str) -> bool:
-        """Remove the quota entry for ``cache_salt``.
-
-        Args:
-            cache_salt: The tenant identifier.
-
-        Returns:
-            ``True`` if an entry was removed, ``False`` if none existed.
-        """
-        with self._lock:
-            return self._limits.pop(cache_salt, None) is not None
-
-    def list_all(self) -> list[QuotaEntry]:
-        """Return a snapshot of all registered quotas.
-
-        Returns:
-            A detached list of all quota entries.
-        """
-        with self._lock:
-            return [
-                QuotaEntry(cache_salt=salt, limit_bytes=limit)
-                for salt, limit in self._limits.items()
-            ]

From 147b2356496aa836615261e49cf3939bbff75bea Mon Sep 17 00:00:00 2001
From: aeon-x <talexcao@gmail.com>
Date: Thu, 11 Jun 2026 16:51:42 -0700
Subject: [PATCH 51/57] fix UT

Signed-off-by: aeon-x <talexcao@gmail.com>
---
 .../mp_coordinator/l2_usage_and_eviction.md   |  11 +-
 .../mp_coordinator/test_eviction_manager.py   | 145 ++++++++++--------
 tests/v1/mp_coordinator/test_l2_api.py        |   5 +-
 tests/v1/mp_coordinator/test_quota_manager.py |  64 --------
 4 files changed, 87 insertions(+), 138 deletions(-)
 delete mode 100644 tests/v1/mp_coordinator/test_quota_manager.py

diff --git a/docs/design/v1/mp_coordinator/l2_usage_and_eviction.md b/docs/design/v1/mp_coordinator/l2_usage_and_eviction.md
index ccca93bf7f..cf7d25f838 100644
--- a/docs/design/v1/mp_coordinator/l2_usage_and_eviction.md
+++ b/docs/design/v1/mp_coordinator/l2_usage_and_eviction.md
@@ -37,7 +37,7 @@ MP server (store/lookup)
   POST /l2/events ──▶ Coordinator
                         ├─ L2UsageManager: per-salt byte accounting
                         ├─ L2EvictionManager: per-salt LRU
-                        └─ L2QuotaManager: per-salt byte limits
+                        └─ QuotaManager: per-salt byte limits
 
   Coordinator background loop (every eviction_check_interval, default 5s)
         │
@@ -72,11 +72,12 @@ Thread-safe per-salt byte counter. Two operations:
 
 Exposes ``get(salt)``, ``get_all()``, ``get_total()`` for the status endpoints.
 
-### L2QuotaManager (`quota_manager.py`)
+### QuotaManager (reused from ``lmcache.v1.distributed.quota_manager``)
 
 Thread-safe in-memory quota registry (``dict[str, int]`` + lock). CRUD via
-``set``, ``get``, ``delete``, ``list_all``. Quotas are set in GiB at the API
-and stored as bytes internally.
+``set_quota``, ``get_limit_bytes``, ``delete_quota``, ``list_quotas``.
+Quotas are set in GiB at the API and stored as bytes internally.
+Unregistered salts default to a 0-byte limit (allowlist semantics).
 
 ### L2EvictionManager (`eviction_manager.py`)
 
@@ -94,7 +95,7 @@ _key_sizes      : dict[CacheKey, int]                       # byte size per key
 - ``on_lookup(key)`` — touch (move to MRU end).
 - ``on_remove(keys)`` — remove from LRU tracking after confirmed deletion.
 - ``execute_evictions()`` — for each tracked salt, compare usage (from
-  ``L2UsageManager``) against quota (from ``L2QuotaManager``, default 0). If over
+  ``L2UsageManager``) against quota (from ``QuotaManager``, default 0). If over
   quota, select LRU keys targeting ``eviction_ratio`` of the overage. No quota
   or zero quota means evict all keys for that salt.
 
diff --git a/tests/v1/mp_coordinator/test_eviction_manager.py b/tests/v1/mp_coordinator/test_eviction_manager.py
index 811ea0dac0..1ed3cab257 100644
--- a/tests/v1/mp_coordinator/test_eviction_manager.py
+++ b/tests/v1/mp_coordinator/test_eviction_manager.py
@@ -2,103 +2,99 @@
 """Tests for the coordinator eviction manager."""
 
 # First Party
+from lmcache.v1.distributed.api import ObjectKey
+from lmcache.v1.distributed.quota_manager import QuotaManager
 from lmcache.v1.mp_coordinator.l2.eviction_manager import (
     L2EvictionManager,
 )
-from lmcache.v1.mp_coordinator.l2.quota_manager import L2QuotaManager
 from lmcache.v1.mp_coordinator.l2.usage_manager import L2UsageManager
-from lmcache.v1.mp_coordinator.schemas import CacheKey
 
 
-def _make_key(salt: str, model: str = "m", rank: int = 0, h: str = "aa") -> CacheKey:
-    return CacheKey(chunk_hash_hex=h, model_name=model, kv_rank=rank, cache_salt=salt)
+def _make_key(salt: str, model: str = "m", rank: int = 0, h: str = "aa") -> ObjectKey:
+    return ObjectKey(
+        chunk_hash=bytes.fromhex(h),
+        model_name=model,
+        kv_rank=rank,
+        cache_salt=salt,
+    )
 
 
 def _setup(
     eviction_ratio: float = 0.5,
-) -> tuple[L2EvictionManager, L2QuotaManager, L2UsageManager]:
-    qs = L2QuotaManager()
+    trigger_watermark: float = 1.0,
+) -> tuple[L2EvictionManager, QuotaManager, L2UsageManager]:
+    qs = QuotaManager()
     ut = L2UsageManager()
-    ctrl = L2EvictionManager(qs, ut, eviction_ratio=eviction_ratio)
+    ctrl = L2EvictionManager(
+        qs,
+        ut,
+        eviction_ratio=eviction_ratio,
+        trigger_watermark=trigger_watermark,
+    )
     return ctrl, qs, ut
 
 
 def test_on_store_tracks_key():
-    ctrl, _, _ = _setup()
-    k = _make_key("a")
-    ctrl.on_store(k, 100)
-    assert ctrl._select_keys_to_evict("a", 100) == [k]
-
-
-def test_on_store_updates_existing_key():
-    ctrl, _, _ = _setup()
+    ctrl, _, ut = _setup(eviction_ratio=1.0)
     k = _make_key("a")
     ctrl.on_store(k, 100)
-    ctrl.on_store(k, 200)
-    assert ctrl._select_keys_to_evict("a", 200) == [k]
+    ut.record_stored("a", 100)
+    result = ctrl.execute_evictions()
+    assert result["a"] == [k]
 
 
 def test_on_lookup_touches_key():
-    ctrl, _, _ = _setup()
+    ctrl, _, ut = _setup(eviction_ratio=1.0)
     k1 = _make_key("a", h="01")
     k2 = _make_key("a", h="02")
     ctrl.on_store(k1, 100)
     ctrl.on_store(k2, 100)
     ctrl.on_lookup(k1)
-    keys_to_evict = ctrl._select_keys_to_evict("a", 100)
-    assert keys_to_evict[0] == k2
+    ut.record_stored("a", 200)
+    result = ctrl.execute_evictions()
+    assert result["a"][0] == k2
 
 
 def test_on_lookup_unknown_key_is_noop():
-    ctrl, _, _ = _setup()
+    ctrl, _, ut = _setup(eviction_ratio=1.0)
     k = _make_key("a")
     ctrl.on_lookup(k)
-    assert ctrl._select_keys_to_evict("a", 1) == []
+    ut.record_stored("a", 100)
+    result = ctrl.execute_evictions()
+    assert result == {}
 
 
 def test_on_remove():
-    ctrl, _, _ = _setup()
+    ctrl, _, ut = _setup(eviction_ratio=1.0)
     k1 = _make_key("a", h="01")
     k2 = _make_key("a", h="02")
     ctrl.on_store(k1, 100)
     ctrl.on_store(k2, 200)
     ctrl.on_remove([k1])
-    assert ctrl._select_keys_to_evict("a", 200) == [k2]
+    ut.record_stored("a", 200)
+    result = ctrl.execute_evictions()
+    assert result["a"] == [k2]
 
 
 def test_on_remove_cleans_empty_bucket():
-    ctrl, _, _ = _setup()
+    ctrl, _, ut = _setup(eviction_ratio=1.0)
     k = _make_key("a")
     ctrl.on_store(k, 100)
     ctrl.on_remove([k])
-    assert ctrl._select_keys_to_evict("a", 1) == []
+    ut.record_stored("a", 100)
+    result = ctrl.execute_evictions()
+    assert result == {}
 
 
 def test_on_remove_empty_list_is_noop():
     ctrl, _, _ = _setup()
     ctrl.on_remove([])
-    assert ctrl._select_keys_to_evict("a", 1) == []
-
-
-def test_select_keys_to_evict_lru_order():
-    ctrl, _, _ = _setup()
-    k1 = _make_key("a", h="01")
-    k2 = _make_key("a", h="02")
-    k3 = _make_key("a", h="03")
-    ctrl.on_store(k1, 100)
-    ctrl.on_store(k2, 200)
-    ctrl.on_store(k3, 300)
-    keys_to_evict = ctrl._select_keys_to_evict("a", 250)
-    assert keys_to_evict == [k1, k2]
-
-
-def test_select_keys_to_evict_empty_bucket():
-    ctrl, _, _ = _setup()
-    assert ctrl._select_keys_to_evict("nonexistent", 100) == []
+    result = ctrl.execute_evictions()
+    assert result == {}
 
 
-def test_check_and_log_no_quotas_evicts_all():
-    ctrl, _, ut = _setup()
+def test_no_quotas_evicts_all():
+    ctrl, _, ut = _setup(eviction_ratio=1.0)
     k = _make_key("a")
     ctrl.on_store(k, 1000)
     ut.record_stored("a", 1000)
@@ -107,18 +103,18 @@ def test_check_and_log_no_quotas_evicts_all():
     assert result["a"] == [k]
 
 
-def test_check_and_log_under_quota():
+def test_under_quota():
     ctrl, qs, ut = _setup()
-    qs.set("a", 2000)
+    qs.set_quota("a", 2000)
     ut.record_stored("a", 1000)
     ctrl.on_store(_make_key("a"), 1000)
     result = ctrl.execute_evictions()
     assert result == {}
 
 
-def test_check_and_log_over_quota():
+def test_over_quota():
     ctrl, qs, ut = _setup(eviction_ratio=1.0)
-    qs.set("a", 500)
+    qs.set_quota("a", 500)
     ut.record_stored("a", 1000)
     k1 = _make_key("a", h="01")
     k2 = _make_key("a", h="02")
@@ -126,32 +122,27 @@ def test_check_and_log_over_quota():
     ctrl.on_store(k2, 600)
     result = ctrl.execute_evictions()
     assert "a" in result
-    keys_to_evict = result["a"]
-    assert keys_to_evict[0] == k1
-    total_evict_bytes = 400 + 600
-    assert total_evict_bytes >= 500
+    assert k1 in result["a"]
+    assert k2 in result["a"]
 
 
-def test_check_and_log_eviction_ratio():
+def test_eviction_ratio():
     ctrl, qs, ut = _setup(eviction_ratio=0.5)
-    qs.set("a", 500)
+    qs.set_quota("a", 500)
     ut.record_stored("a", 1000)
     k1 = _make_key("a", h="01")
     k2 = _make_key("a", h="02")
-    k3 = _make_key("a", h="03")
     ctrl.on_store(k1, 200)
-    ctrl.on_store(k2, 200)
-    ctrl.on_store(k3, 600)
+    ctrl.on_store(k2, 800)
     result = ctrl.execute_evictions()
     assert "a" in result
-    keys_to_evict = result["a"]
-    assert len(keys_to_evict) >= 1
-    assert keys_to_evict[0] == k1
+    assert len(result["a"]) == 1
+    assert result["a"][0] == k1
 
 
-def test_check_and_log_zero_quota_evicts_all():
-    ctrl, qs, ut = _setup()
-    qs.set("a", 0)
+def test_zero_quota_evicts_all():
+    ctrl, qs, ut = _setup(eviction_ratio=1.0)
+    qs.set_quota("a", 0)
     ut.record_stored("a", 1000)
     k = _make_key("a")
     ctrl.on_store(k, 1000)
@@ -162,8 +153,8 @@ def test_check_and_log_zero_quota_evicts_all():
 
 def test_multiple_salts_independent():
     ctrl, qs, ut = _setup(eviction_ratio=1.0)
-    qs.set("a", 100)
-    qs.set("b", 5000)
+    qs.set_quota("a", 100)
+    qs.set_quota("b", 5000)
     ut.record_stored("a", 500)
     ut.record_stored("b", 1000)
     ka = _make_key("a", h="01")
@@ -173,3 +164,23 @@ def test_multiple_salts_independent():
     result = ctrl.execute_evictions()
     assert "a" in result
     assert "b" not in result
+
+
+def test_watermark_below_threshold_skips():
+    ctrl, qs, ut = _setup(trigger_watermark=0.8)
+    qs.set_quota("a", 1000)
+    ut.record_stored("a", 700)
+    ctrl.on_store(_make_key("a"), 700)
+    result = ctrl.execute_evictions()
+    assert result == {}
+
+
+def test_watermark_above_threshold_evicts():
+    ctrl, qs, ut = _setup(eviction_ratio=1.0, trigger_watermark=0.8)
+    qs.set_quota("a", 1000)
+    ut.record_stored("a", 900)
+    k = _make_key("a")
+    ctrl.on_store(k, 900)
+    result = ctrl.execute_evictions()
+    assert "a" in result
+    assert result["a"] == [k]
diff --git a/tests/v1/mp_coordinator/test_l2_api.py b/tests/v1/mp_coordinator/test_l2_api.py
index 4ba5e4bb5e..27b0ddf5a7 100644
--- a/tests/v1/mp_coordinator/test_l2_api.py
+++ b/tests/v1/mp_coordinator/test_l2_api.py
@@ -216,14 +216,15 @@ def test_status_list_includes_quota_only_salt():
 
 
 def test_default_salt_sentinel():
+    """``_default`` in path maps to the empty-string salt."""
     with _client() as client:
         client.put("/l2/quota/_default", json={"limit_gb": 3.0})
         client.post(
             "/l2/events",
-            json={"events": [_store("_default", 500)]},
+            json={"events": [_store("", 500)]},
         )
         data = client.get("/l2/status/_default").json()
-        assert data["cache_salt"] == "_default"
+        assert data["cache_salt"] == ""
         assert data["quota_exists"] is True
         assert abs(data["quota_limit_gb"] - 3.0) < 1e-6
         assert abs(data["usage_gb"] - 500 / 1024**3) < 1e-12
diff --git a/tests/v1/mp_coordinator/test_quota_manager.py b/tests/v1/mp_coordinator/test_quota_manager.py
deleted file mode 100644
index 7ccf110378..0000000000
--- a/tests/v1/mp_coordinator/test_quota_manager.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-"""Tests for the coordinator L2QuotaManager."""
-
-# Third Party
-import pytest
-
-# First Party
-from lmcache.v1.mp_coordinator.l2.quota_manager import L2QuotaManager
-
-
-def test_set_and_get():
-    store = L2QuotaManager()
-    store.set("salt-a", 1000)
-    assert store.get("salt-a") == 1000
-
-
-def test_get_unregistered_returns_none():
-    store = L2QuotaManager()
-    assert store.get("unknown") is None
-
-
-def test_set_overwrites():
-    store = L2QuotaManager()
-    store.set("salt-a", 1000)
-    store.set("salt-a", 2000)
-    assert store.get("salt-a") == 2000
-
-
-def test_delete():
-    store = L2QuotaManager()
-    store.set("salt-a", 1000)
-    assert store.delete("salt-a") is True
-    assert store.get("salt-a") is None
-
-
-def test_delete_nonexistent():
-    store = L2QuotaManager()
-    assert store.delete("unknown") is False
-
-
-def test_list_all():
-    store = L2QuotaManager()
-    store.set("a", 100)
-    store.set("b", 200)
-    entries = store.list_all()
-    by_salt = {e.cache_salt: e.limit_bytes for e in entries}
-    assert by_salt == {"a": 100, "b": 200}
-
-
-def test_list_all_empty():
-    store = L2QuotaManager()
-    assert store.list_all() == []
-
-
-def test_negative_limit_raises():
-    store = L2QuotaManager()
-    with pytest.raises(ValueError, match="non-negative"):
-        store.set("salt-a", -1)
-
-
-def test_zero_limit_accepted():
-    store = L2QuotaManager()
-    store.set("salt-a", 0)
-    assert store.get("salt-a") == 0

From f6e2081d5b4c6258693ae69d205c83ca87abb829 Mon Sep 17 00:00:00 2001
From: Tony Lin <tony.lin@intel.com>
Date: Fri, 12 Jun 2026 09:46:22 +0800
Subject: [PATCH 52/57] Optimize the Python fallback path for block transfer
 operations with notable speedup (#3591)

* Perf: optimize Python fallback block transfer for 3x speedup

- Optimize fallback block-id and D2H staging overhead
- Restructure per-layer transfer loops to iterate over objects first
  then layers

Signed-off-by: Tony Lin <tony.lin@intel.com>

* apply gemini's suggestion

Signed-off-by: Tony Lin <tony.lin@intel.com>

* optimize flash_infer block transfer paths in python fallback

Signed-off-by: Tony Lin <tony.lin@intel.com>

---------

Signed-off-by: Tony Lin <tony.lin@intel.com>
---
 lmcache/python_ops_fallback.py | 458 +++++++++++++++++++++------------
 1 file changed, 293 insertions(+), 165 deletions(-)

diff --git a/lmcache/python_ops_fallback.py b/lmcache/python_ops_fallback.py
index 0c28a176d8..00790eb059 100644
--- a/lmcache/python_ops_fallback.py
+++ b/lmcache/python_ops_fallback.py
@@ -1156,7 +1156,11 @@ def multi_layer_block_kv_transfer(
         gpu_kv_format=gpu_kv_format,
         dtype=kv_dtype,
     )
-    block_id_list = _to_block_id_list(block_ids)
+    n_block_ids = (
+        int(block_ids.numel())
+        if isinstance(block_ids, torch.Tensor)
+        else len(block_ids)
+    )
     blocks_per_object = lmcache_chunk_size // int(shape_desc.bs)
     block_size = int(shape_desc.bs)
 
@@ -1164,7 +1168,8 @@ def multi_layer_block_kv_transfer(
         _transfer_cross_layer(
             normalized,
             object_tensors,
-            block_id_list,
+            block_ids,
+            n_block_ids,
             blocks_per_object,
             block_size,
             gpu_kv_format,
@@ -1175,7 +1180,8 @@ def multi_layer_block_kv_transfer(
         _transfer_sglang_mha(
             normalized,
             object_tensors,
-            block_id_list,
+            block_ids,
+            n_block_ids,
             blocks_per_object,
             block_size,
             gpu_kv_format,
@@ -1186,7 +1192,8 @@ def multi_layer_block_kv_transfer(
         _transfer_per_layer_mla(
             normalized,
             object_tensors,
-            block_id_list,
+            block_ids,
+            n_block_ids,
             blocks_per_object,
             block_size,
             gpu_kv_format,
@@ -1197,7 +1204,8 @@ def multi_layer_block_kv_transfer(
         _transfer_per_layer_hnd(
             normalized,
             object_tensors,
-            block_id_list,
+            block_ids,
+            n_block_ids,
             blocks_per_object,
             block_size,
             gpu_kv_format,
@@ -1208,7 +1216,8 @@ def multi_layer_block_kv_transfer(
         _transfer_per_layer_nhd(
             normalized,
             object_tensors,
-            block_id_list,
+            block_ids,
+            n_block_ids,
             blocks_per_object,
             block_size,
             gpu_kv_format,
@@ -1247,10 +1256,28 @@ def _valid_block_range(
     return block_id_list[valid_flat_start:valid_flat_end], offset_in_object
 
 
+def _valid_block_range_indices(
+    object_idx: int,
+    n_block_ids: int,
+    blocks_per_object: int,
+    block_size: int,
+    skip_prefix_n_blocks: int,
+) -> tuple[int, int, int] | None:
+    """Return valid [start, end) range over flat block IDs and object token offset."""
+    object_flat_start = object_idx * blocks_per_object
+    valid_flat_start = max(object_flat_start, skip_prefix_n_blocks)
+    valid_flat_end = min(object_flat_start + blocks_per_object, n_block_ids)
+    if valid_flat_start >= valid_flat_end:
+        return None
+    offset_in_object = (valid_flat_start - object_flat_start) * block_size
+    return valid_flat_start, valid_flat_end, offset_in_object
+
+
 def _transfer_cross_layer(
     paged_tensor: torch.Tensor,
     object_tensors: list[torch.Tensor],
-    block_id_list: list[int],
+    block_ids: torch.Tensor | list[int],
+    n_block_ids: int,
     blocks_per_object: int,
     block_size: int,
     gpu_kv_format: GPUKVFormat,
@@ -1274,23 +1301,24 @@ def _transfer_cross_layer(
     # H2D: pre-transfer objects to paged device
     if not is_d2h and object_tensors:
         objs_on_device = [obj.to(paged_tensor.device) for obj in object_tensors]
+    block_ids_dev = torch.as_tensor(
+        block_ids, dtype=torch.long, device=paged_tensor.device
+    )
 
     for object_idx, obj in enumerate(object_tensors):
-        valid = _valid_block_range(
+        valid = _valid_block_range_indices(
             object_idx,
-            block_id_list,
+            n_block_ids,
             blocks_per_object,
             block_size,
             skip_prefix_n_blocks,
         )
         if valid is None:
             continue
-        engine_block_ids, offset_in_object = valid
-        n_valid = len(engine_block_ids)
+        idx_start, idx_end, offset_in_object = valid
+        n_valid = idx_end - idx_start
         token_end = offset_in_object + n_valid * block_size
-        eff_idx = torch.tensor(
-            engine_block_ids, dtype=torch.long, device=paged_tensor.device
-        )
+        eff_idx = block_ids_dev[idx_start:idx_end]
 
         if is_d2h:
             selected = paged_tensor.index_select(0, eff_idx)
@@ -1330,7 +1358,8 @@ def _transfer_cross_layer(
 def _transfer_sglang_mha(
     paged_tensors: list[list[torch.Tensor]],
     object_tensors: list[torch.Tensor],
-    block_id_list: list[int],
+    block_ids: torch.Tensor | list[int],
+    n_block_ids: int,
     blocks_per_object: int,
     block_size: int,
     gpu_kv_format: GPUKVFormat,
@@ -1349,21 +1378,22 @@ def _transfer_sglang_mha(
     # H2D: pre-transfer objects
     if not is_d2h and object_tensors:
         objs_on_device = [obj.to(target_device) for obj in object_tensors]
+    block_ids_dev = torch.as_tensor(block_ids, dtype=torch.long, device=target_device)
 
     for object_idx, obj in enumerate(object_tensors):
-        valid = _valid_block_range(
+        valid = _valid_block_range_indices(
             object_idx,
-            block_id_list,
+            n_block_ids,
             blocks_per_object,
             block_size,
             skip_prefix_n_blocks,
         )
         if valid is None:
             continue
-        engine_block_ids, offset_in_object = valid
-        n_valid = len(engine_block_ids)
+        idx_start, idx_end, offset_in_object = valid
+        n_valid = idx_end - idx_start
         token_end = offset_in_object + n_valid * block_size
-        eff_idx = torch.tensor(engine_block_ids, dtype=torch.long, device=target_device)
+        eff_idx = block_ids_dev[idx_start:idx_end]
         if is_flat:
             # Flat token positions for all valid blocks:
             # block_id * block_size + token offset. Reused across layer/KV pairs.
@@ -1408,7 +1438,8 @@ def _transfer_sglang_mha(
 def _transfer_per_layer_mla(
     layer_tensors: list[torch.Tensor],
     object_tensors: list[torch.Tensor],
-    block_id_list: list[int],
+    block_ids: torch.Tensor | list[int],
+    n_block_ids: int,
     blocks_per_object: int,
     block_size: int,
     gpu_kv_format: GPUKVFormat,
@@ -1416,49 +1447,59 @@ def _transfer_per_layer_mla(
     skip_prefix_n_blocks: int,
 ) -> None:
     """Handle MLA per-layer formats: [NB, BS, HS]."""
-    if not is_d2h and layer_tensors and object_tensors:
-        target_device = layer_tensors[0].device
-        objs_on_device = [obj.to(target_device) for obj in object_tensors]
+    if not layer_tensors or not object_tensors:
+        return
 
-    for layer_idx, layer in enumerate(layer_tensors):
-        is_flat = int(gpu_kv_format) == int(GPUKVFormat.NL_X_NBBS_ONE_HS)
+    is_flat = int(gpu_kv_format) == int(GPUKVFormat.NL_X_NBBS_ONE_HS)
+    target_device = layer_tensors[0].device
+    if is_flat:
+        token_offsets = torch.arange(block_size, dtype=torch.long, device=target_device)
+    block_ids_dev = torch.as_tensor(block_ids, dtype=torch.long, device=target_device)
+
+    for object_idx, obj in enumerate(object_tensors):
+        valid = _valid_block_range_indices(
+            object_idx,
+            n_block_ids,
+            blocks_per_object,
+            block_size,
+            skip_prefix_n_blocks,
+        )
+        if valid is None:
+            continue
+        idx_start, idx_end, offset_in_object = valid
+        n_valid = idx_end - idx_start
+        token_end = offset_in_object + n_valid * block_size
+        eff_idx = block_ids_dev[idx_start:idx_end]
         if is_flat:
-            token_offsets = torch.arange(
-                block_size, dtype=torch.long, device=layer.device
-            )
-        for object_idx, obj in enumerate(object_tensors):
-            valid = _valid_block_range(
-                object_idx,
-                block_id_list,
-                blocks_per_object,
-                block_size,
-                skip_prefix_n_blocks,
-            )
-            if valid is None:
-                continue
-            engine_block_ids, offset_in_object = valid
-            n_valid = len(engine_block_ids)
-            token_end = offset_in_object + n_valid * block_size
-            eff_idx = torch.tensor(
-                engine_block_ids, dtype=torch.long, device=layer.device
-            )
-            if is_flat:
-                token_indices = (
-                    eff_idx[:, None] * block_size + token_offsets[None, :]
-                ).reshape(-1)
+            token_indices = (
+                eff_idx[:, None] * block_size + token_offsets[None, :]
+            ).reshape(-1)
 
-            if is_d2h:
+        if is_d2h:
+            hidden_size = layer_tensors[0].shape[-1]
+            chunk_gpu = torch.empty(
+                len(layer_tensors),
+                n_valid * block_size,
+                hidden_size,
+                dtype=layer_tensors[0].dtype,
+                device=target_device,
+            )
+            for layer_idx, layer in enumerate(layer_tensors):
                 if is_flat:
-                    layer_blocks = layer.index_select(0, token_indices)
+                    dst = chunk_gpu[layer_idx].view(
+                        n_valid * block_size, 1, hidden_size
+                    )
+                    torch.index_select(layer, 0, token_indices, out=dst)
                 else:
-                    layer_blocks = layer.index_select(0, eff_idx)
-                flat = layer_blocks.reshape(n_valid * block_size, layer.shape[-1])
-                obj[layer_idx, offset_in_object:token_end].copy_(
-                    flat, non_blocking=True
-                )
-            else:
-                obj_device = objs_on_device[object_idx]
-                src = obj_device[layer_idx, offset_in_object:token_end]
+                    dst = chunk_gpu[layer_idx].view(n_valid, block_size, hidden_size)
+                    torch.index_select(layer, 0, eff_idx, out=dst)
+            obj[:, offset_in_object:token_end].copy_(chunk_gpu, non_blocking=True)
+        else:
+            chunk_gpu = obj[:, offset_in_object:token_end].to(
+                target_device, non_blocking=True
+            )
+            for layer_idx, layer in enumerate(layer_tensors):
+                src = chunk_gpu[layer_idx]
                 hidden_size = layer.shape[-1]
                 if is_flat:
                     src_tokens = src.reshape(n_valid * block_size, 1, hidden_size)
@@ -1471,7 +1512,8 @@ def _transfer_per_layer_mla(
 def _transfer_per_layer_hnd(
     layer_tensors: list[torch.Tensor],
     object_tensors: list[torch.Tensor],
-    block_id_list: list[int],
+    block_ids: torch.Tensor | list[int],
+    n_block_ids: int,
     blocks_per_object: int,
     block_size: int,
     gpu_kv_format: GPUKVFormat,
@@ -1479,73 +1521,123 @@ def _transfer_per_layer_hnd(
     skip_prefix_n_blocks: int,
 ) -> None:
     """Handle per-layer HND formats: heads before block tokens."""
-    if not is_d2h and layer_tensors and object_tensors:
-        target_device = layer_tensors[0].device
-        objs_on_device = [obj.to(target_device) for obj in object_tensors]
+    if not layer_tensors or not object_tensors:
+        return
 
-    for layer_idx, layer in enumerate(layer_tensors):
-        # Determine K/V split based on specific format
-        if int(gpu_kv_format) == int(GPUKVFormat.NL_X_TWO_NB_NH_BS_HS):
-            k_t, v_t = layer[0], layer[1]
-        elif int(gpu_kv_format) == int(GPUKVFormat.NL_X_NB_NH_BS_TWO_HS):
-            # vLLM CPU blocks-first fused KV: [NB, NH, BS, 2, HS].
-            k_t, v_t = layer[:, :, :, 0], layer[:, :, :, 1]
-        else:
-            k_t, v_t = layer[:, 0], layer[:, 1]
-        _nb, nh, _bs, hs = k_t.shape
-
-        for object_idx, obj in enumerate(object_tensors):
-            valid = _valid_block_range(
-                object_idx,
-                block_id_list,
-                blocks_per_object,
+    target_device = layer_tensors[0].device
+    block_ids_dev = torch.as_tensor(block_ids, dtype=torch.long, device=target_device)
+
+    first_layer = layer_tensors[0]
+    if int(gpu_kv_format) == int(GPUKVFormat.NL_X_TWO_NB_NH_BS_HS):
+        first_k = first_layer[0]
+    elif int(gpu_kv_format) == int(GPUKVFormat.NL_X_NB_NH_BS_TWO_HS):
+        first_k = first_layer[:, :, :, 0]
+    else:
+        first_k = first_layer[:, 0]
+    _nb0, nh0, _bs0, hs0 = first_k.shape
+
+    for object_idx, obj in enumerate(object_tensors):
+        valid = _valid_block_range_indices(
+            object_idx,
+            n_block_ids,
+            blocks_per_object,
+            block_size,
+            skip_prefix_n_blocks,
+        )
+        if valid is None:
+            continue
+        idx_start, idx_end, offset_in_object = valid
+        n_valid = idx_end - idx_start
+        token_end = offset_in_object + n_valid * block_size
+        eff_idx = block_ids_dev[idx_start:idx_end]
+
+        if is_d2h:
+            chunk_gpu = torch.empty(
+                2,
+                len(layer_tensors),
+                n_valid * block_size,
+                nh0 * hs0,
+                dtype=first_k.dtype,
+                device=target_device,
+            )
+            scratch = torch.empty(
+                n_valid,
+                nh0,
                 block_size,
-                skip_prefix_n_blocks,
+                hs0,
+                dtype=first_k.dtype,
+                device=target_device,
             )
-            if valid is None:
-                continue
-            engine_block_ids, offset_in_object = valid
-            n_valid = len(engine_block_ids)
-            token_end = offset_in_object + n_valid * block_size
-            eff_idx = torch.tensor(
-                engine_block_ids, dtype=torch.long, device=layer.device
+            for layer_idx, layer in enumerate(layer_tensors):
+                if int(gpu_kv_format) == int(GPUKVFormat.NL_X_TWO_NB_NH_BS_HS):
+                    k_t, v_t = layer[0], layer[1]
+                    torch.index_select(k_t, 0, eff_idx, out=scratch)
+                    chunk_gpu[0, layer_idx].view(n_valid, block_size, nh0, hs0).copy_(
+                        scratch.permute(0, 2, 1, 3)
+                    )
+                    torch.index_select(v_t, 0, eff_idx, out=scratch)
+                    chunk_gpu[1, layer_idx].view(n_valid, block_size, nh0, hs0).copy_(
+                        scratch.permute(0, 2, 1, 3)
+                    )
+                elif int(gpu_kv_format) == int(GPUKVFormat.NL_X_NB_NH_BS_TWO_HS):
+                    k_t, v_t = layer[:, :, :, 0], layer[:, :, :, 1]
+                    torch.index_select(k_t, 0, eff_idx, out=scratch)
+                    chunk_gpu[0, layer_idx].view(n_valid, block_size, nh0, hs0).copy_(
+                        scratch.permute(0, 2, 1, 3)
+                    )
+                    torch.index_select(v_t, 0, eff_idx, out=scratch)
+                    chunk_gpu[1, layer_idx].view(n_valid, block_size, nh0, hs0).copy_(
+                        scratch.permute(0, 2, 1, 3)
+                    )
+                else:
+                    # FlashInfer HND stores KV as [NB, 2, NH, BS, HS].
+                    # Gather on dim=0 first so reads stay contiguous in memory;
+                    # index_select on layer[:, 0]/layer[:, 1] non-contiguous views
+                    # triggers slower element-wise gather reads.
+                    selected = layer.index_select(0, eff_idx)
+                    chunk_gpu[0, layer_idx].view(n_valid, block_size, nh0, hs0).copy_(
+                        selected[:, 0].permute(0, 2, 1, 3)
+                    )
+                    chunk_gpu[1, layer_idx].view(n_valid, block_size, nh0, hs0).copy_(
+                        selected[:, 1].permute(0, 2, 1, 3)
+                    )
+            obj[:, :, offset_in_object:token_end].copy_(chunk_gpu, non_blocking=True)
+        else:
+            chunk_gpu = obj[:, :, offset_in_object:token_end].to(
+                target_device, non_blocking=True
             )
-
-            if is_d2h:
+            for layer_idx, layer in enumerate(layer_tensors):
+                if int(gpu_kv_format) == int(GPUKVFormat.NL_X_TWO_NB_NH_BS_HS):
+                    k_t, v_t = layer[0], layer[1]
+                elif int(gpu_kv_format) == int(GPUKVFormat.NL_X_NB_NH_BS_TWO_HS):
+                    k_t, v_t = layer[:, :, :, 0], layer[:, :, :, 1]
+                else:
+                    k_t, v_t = layer[:, 0], layer[:, 1]
+                _nb, nh, _bs, hs = k_t.shape
                 k_blocks = (
-                    k_t.index_select(0, eff_idx)
+                    chunk_gpu[0, layer_idx]
+                    .reshape(n_valid, block_size, nh, hs)
                     .permute(0, 2, 1, 3)
-                    .reshape(n_valid * block_size, nh * hs)
                 )
                 v_blocks = (
-                    v_t.index_select(0, eff_idx)
+                    chunk_gpu[1, layer_idx]
+                    .reshape(n_valid, block_size, nh, hs)
                     .permute(0, 2, 1, 3)
-                    .reshape(n_valid * block_size, nh * hs)
-                )
-                obj[0, layer_idx, offset_in_object:token_end].copy_(
-                    k_blocks, non_blocking=True
-                )
-                obj[1, layer_idx, offset_in_object:token_end].copy_(
-                    v_blocks, non_blocking=True
                 )
-            else:
-                obj_device = objs_on_device[object_idx]
-                k_src = obj_device[0, layer_idx, offset_in_object:token_end]
-                v_src = obj_device[1, layer_idx, offset_in_object:token_end]
-                k_blocks = k_src.reshape(n_valid, block_size, nh, hs).permute(
-                    0, 2, 1, 3
-                )
-                v_blocks = v_src.reshape(n_valid, block_size, nh, hs).permute(
-                    0, 2, 1, 3
-                )
-                k_t.index_copy_(0, eff_idx, k_blocks)
-                v_t.index_copy_(0, eff_idx, v_blocks)
+                if int(gpu_kv_format) == int(GPUKVFormat.NL_X_NB_TWO_NH_BS_HS):
+                    layer.index_copy_(
+                        0, eff_idx, torch.stack([k_blocks, v_blocks], dim=1)
+                    )
+                else:
+                    k_t.index_copy_(0, eff_idx, k_blocks)
+                    v_t.index_copy_(0, eff_idx, v_blocks)
 
 
 def _transfer_per_layer_nhd(
     layer_tensors: list[torch.Tensor],
     object_tensors: list[torch.Tensor],
-    block_id_list: list[int],
+    block_ids: torch.Tensor | list[int],
+    n_block_ids: int,
     blocks_per_object: int,
     block_size: int,
     gpu_kv_format: GPUKVFormat,
@@ -1553,62 +1645,98 @@ def _transfer_per_layer_nhd(
     skip_prefix_n_blocks: int,
 ) -> None:
     """Handle per-layer NHD formats: block tokens before heads."""
-    if not is_d2h and layer_tensors and object_tensors:
-        target_device = layer_tensors[0].device
-        objs_on_device = [obj.to(target_device) for obj in object_tensors]
+    if not layer_tensors or not object_tensors:
+        return
 
-    for layer_idx, layer in enumerate(layer_tensors):
-        # Determine K/V split based on specific format
-        if int(gpu_kv_format) == int(GPUKVFormat.NL_X_TWO_NB_BS_NH_HS):
-            k_t, v_t = layer[0], layer[1]
-        else:
-            k_t, v_t = layer[:, 0], layer[:, 1]
-        _nb, _bs, nh, hs = k_t.shape
-
-        for object_idx, obj in enumerate(object_tensors):
-            valid = _valid_block_range(
-                object_idx,
-                block_id_list,
-                blocks_per_object,
-                block_size,
-                skip_prefix_n_blocks,
+    target_device = layer_tensors[0].device
+    block_ids_dev = torch.as_tensor(block_ids, dtype=torch.long, device=target_device)
+
+    first_layer = layer_tensors[0]
+    if int(gpu_kv_format) == int(GPUKVFormat.NL_X_TWO_NB_BS_NH_HS):
+        first_k = first_layer[0]
+    else:
+        first_k = first_layer[:, 0]
+    _nb0, _bs0, nh0, hs0 = first_k.shape
+
+    for object_idx, obj in enumerate(object_tensors):
+        valid = _valid_block_range_indices(
+            object_idx,
+            n_block_ids,
+            blocks_per_object,
+            block_size,
+            skip_prefix_n_blocks,
+        )
+        if valid is None:
+            continue
+        idx_start, idx_end, offset_in_object = valid
+        n_valid = idx_end - idx_start
+        token_end = offset_in_object + n_valid * block_size
+        eff_idx = block_ids_dev[idx_start:idx_end]
+
+        if is_d2h:
+            chunk_gpu = torch.empty(
+                2,
+                len(layer_tensors),
+                n_valid * block_size,
+                nh0 * hs0,
+                dtype=first_k.dtype,
+                device=target_device,
             )
-            if valid is None:
-                continue
-            engine_block_ids, offset_in_object = valid
-            n_valid = len(engine_block_ids)
-            token_end = offset_in_object + n_valid * block_size
-            eff_idx = torch.tensor(
-                engine_block_ids, dtype=torch.long, device=layer.device
+            for layer_idx, layer in enumerate(layer_tensors):
+                if int(gpu_kv_format) == int(GPUKVFormat.NL_X_TWO_NB_BS_NH_HS):
+                    k_t, v_t = layer[0], layer[1]
+                    torch.index_select(
+                        k_t,
+                        0,
+                        eff_idx,
+                        out=chunk_gpu[0, layer_idx].view(n_valid, block_size, nh0, hs0),
+                    )
+                    torch.index_select(
+                        v_t,
+                        0,
+                        eff_idx,
+                        out=chunk_gpu[1, layer_idx].view(n_valid, block_size, nh0, hs0),
+                    )
+                else:
+                    # FlashInfer NHD stores KV as [NB, 2, BS, NH, HS].
+                    # Gather on dim=0 first to avoid index_select from
+                    # non-contiguous layer[:, 0]/layer[:, 1] views, which
+                    # trigger slower element-wise gather reads.
+                    selected = layer.index_select(0, eff_idx)
+                    chunk_gpu[0, layer_idx].copy_(
+                        selected[:, 0].reshape(n_valid * block_size, nh0 * hs0)
+                    )
+                    chunk_gpu[1, layer_idx].copy_(
+                        selected[:, 1].reshape(n_valid * block_size, nh0 * hs0)
+                    )
+            obj[:, :, offset_in_object:token_end].copy_(chunk_gpu, non_blocking=True)
+        else:
+            chunk_gpu = obj[:, :, offset_in_object:token_end].to(
+                target_device, non_blocking=True
             )
-
-            if is_d2h:
-                k_blocks = k_t.index_select(0, eff_idx).reshape(
-                    n_valid * block_size, nh * hs
-                )
-                v_blocks = v_t.index_select(0, eff_idx).reshape(
-                    n_valid * block_size, nh * hs
-                )
-                obj[0, layer_idx, offset_in_object:token_end].copy_(
-                    k_blocks, non_blocking=True
-                )
-                obj[1, layer_idx, offset_in_object:token_end].copy_(
-                    v_blocks, non_blocking=True
-                )
-            else:
-                obj_device = objs_on_device[object_idx]
-                k_src = obj_device[0, layer_idx, offset_in_object:token_end]
-                v_src = obj_device[1, layer_idx, offset_in_object:token_end]
-                k_t.index_copy_(
-                    0,
-                    eff_idx,
-                    k_src.reshape(n_valid, block_size, nh, hs),
-                )
-                v_t.index_copy_(
-                    0,
-                    eff_idx,
-                    v_src.reshape(n_valid, block_size, nh, hs),
-                )
+            for layer_idx, layer in enumerate(layer_tensors):
+                if int(gpu_kv_format) == int(GPUKVFormat.NL_X_TWO_NB_BS_NH_HS):
+                    k_t, v_t = layer[0], layer[1]
+                    k_t.index_copy_(
+                        0,
+                        eff_idx,
+                        chunk_gpu[0, layer_idx].reshape(n_valid, block_size, nh0, hs0),
+                    )
+                    v_t.index_copy_(
+                        0,
+                        eff_idx,
+                        chunk_gpu[1, layer_idx].reshape(n_valid, block_size, nh0, hs0),
+                    )
+                else:
+                    k_blocks = chunk_gpu[0, layer_idx].reshape(
+                        n_valid, block_size, nh0, hs0
+                    )
+                    v_blocks = chunk_gpu[1, layer_idx].reshape(
+                        n_valid, block_size, nh0, hs0
+                    )
+                    layer.index_copy_(
+                        0, eff_idx, torch.stack([k_blocks, v_blocks], dim=1)
+                    )
 
 
 def single_layer_kv_transfer(

From 8622fa2e61080413e68e4dd6e792a737821b7bbb Mon Sep 17 00:00:00 2001
From: deng451e <838677410@qq.com>
Date: Thu, 11 Jun 2026 20:44:22 -0700
Subject: [PATCH 53/57] [fix ]correct retrieve log label prefix -> non_shifted
 (#3648)

Signed-off-by: deng451e <838677410@qq.com>
---
 lmcache/v1/multiprocess/modules/blend_v3.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lmcache/v1/multiprocess/modules/blend_v3.py b/lmcache/v1/multiprocess/modules/blend_v3.py
index e24ad36be7..20a9d587ff 100644
--- a/lmcache/v1/multiprocess/modules/blend_v3.py
+++ b/lmcache/v1/multiprocess/modules/blend_v3.py
@@ -1107,9 +1107,9 @@ def cb_retrieve_pre_computed(
                     key.request_id,
                 )
 
-        # prefix (no re-rope) vs shifted (re-rope), for logging.
-        n_prefix = sum(1 for r in cb_match_result if r.old_st == r.cur_st)
-        n_shifted = len(cb_match_result) - n_prefix
+        # Non-prefix sparse hits split by re-rope need (not prefix coverage).
+        n_non_shifted = sum(1 for r in cb_match_result if r.old_st == r.cur_st)
+        n_shifted = len(cb_match_result) - n_non_shifted
 
         if not all_obj_keys:
             self._event_bus.publish(
@@ -1327,11 +1327,11 @@ def cb_retrieve_pre_computed(
         _scatter_ms = (time.perf_counter() - _retrieve_t0) * 1000
         logger.info(
             "Retrieved pre-computed for %d match results into request %s "
-            "paged blocks (scatter_ms=%.2f, prefix=%d shifted=%d)",
+            "paged blocks (scatter_ms=%.2f, non_shifted=%d shifted=%d)",
             len(cb_match_result),
             key.request_id,
             _scatter_ms,
-            n_prefix,
+            n_non_shifted,
             n_shifted,
         )
         self._event_bus.publish_on_stream(

From 549b0070e27f0fbcb55c0c5f378797941d51ae15 Mon Sep 17 00:00:00 2001
From: Roy Huang <roy.y.huang@gmail.com>
Date: Thu, 11 Jun 2026 20:52:55 -0700
Subject: [PATCH 54/57] fix(operator): emit --engine-type blend for CacheBlend
 engine (#3647)

Signed-off-by: royyhuang <roy.y.huang@gmail.com>
---
 docs/source/mp/operator.rst                              | 4 ++--
 operator/DESIGN.md                                       | 4 ++--
 .../samples/lmcache_v1alpha1_cacheblendengine.yaml       | 2 +-
 .../controller/cacheblendengine_controller_test.go       | 2 +-
 operator/internal/resources/cacheblend_engine.go         | 9 +++++----
 operator/internal/resources/cacheblend_engine_test.go    | 8 ++++----
 6 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/docs/source/mp/operator.rst b/docs/source/mp/operator.rst
index 666a914b0e..7c964502d7 100644
--- a/docs/source/mp/operator.rst
+++ b/docs/source/mp/operator.rst
@@ -603,7 +603,7 @@ for the technique itself.
 
 It has two halves the operator runs together:
 
-- a GPU-resident ``blend_v3`` engine (``lmcache server --engine-type blend_v3``),
+- a GPU-resident ``blend_v3`` engine (``lmcache server --engine-type blend``),
   deployed as a DaemonSet with the **same GPU model as** ``LMCacheEngine``
   (``privileged`` + ``runtimeClassName: nvidia`` + ``NVIDIA_VISIBLE_DEVICES=all``
   + ``hostIPC``, and **no** ``nvidia.com/gpu`` claim) so it shares the vLLM GPU
@@ -653,7 +653,7 @@ Deploying a CacheBlendEngine
         imagePullSecrets:
           - name: my-registry-secret
 
-The engine runs ``lmcache server --engine-type blend_v3`` as a DaemonSet and
+The engine runs ``lmcache server --engine-type blend`` as a DaemonSet and
 emits a ``my-cacheblend-connection`` ConfigMap with the ``CBKVConnector``
 ``kv-transfer-config`` (the operator wires the node-local Service host/port and
 the ``cb.*`` tunables).
diff --git a/operator/DESIGN.md b/operator/DESIGN.md
index 044e3079d4..33148d1419 100644
--- a/operator/DESIGN.md
+++ b/operator/DESIGN.md
@@ -391,7 +391,7 @@ l2Backend, scheduling, overrides, imagePullSecrets) and adds:
 ### The blend engine (controller)
 
 `CacheBlendEngineReconciler` mirrors `LMCacheEngineReconciler` and reconciles a
-DaemonSet running `lmcache server --engine-type blend_v3` (plus
+DaemonSet running `lmcache server --engine-type blend` (plus
 `--l1-align-bytes 16777216`), a node-local lookup Service, a metrics Service, and
 a `<name>-connection` ConfigMap. **GPU model is identical to `LMCacheEngine`**:
 `privileged` + `runtimeClassName: nvidia` + `NVIDIA_VISIBLE_DEVICES=all` +
@@ -469,7 +469,7 @@ user-supplied `--flag=value`.
 
 | Resource | Name | Purpose |
 |---|---|---|
-| DaemonSet | `cb` | `lmcache server --engine-type blend_v3` on GPU nodes |
+| DaemonSet | `cb` | `lmcache server --engine-type blend` on GPU nodes |
 | Service (node-local) | `cb` | same-node discovery for vLLM (`CBKVConnector`) |
 | Service (headless) | `cb-metrics` | Prometheus scrape target |
 | ConfigMap | `cb-connection` | `CBKVConnector` kv-transfer-config |
diff --git a/operator/config/samples/lmcache_v1alpha1_cacheblendengine.yaml b/operator/config/samples/lmcache_v1alpha1_cacheblendengine.yaml
index 3340ec7243..b6554e97aa 100644
--- a/operator/config/samples/lmcache_v1alpha1_cacheblendengine.yaml
+++ b/operator/config/samples/lmcache_v1alpha1_cacheblendengine.yaml
@@ -5,7 +5,7 @@ metadata:
   namespace: default
 spec:
   # -- Container image (the LMCache blend_v3 ENGINE image) --
-  # The engine runs `lmcache server --engine-type blend_v3`. If this image lives
+  # The engine runs `lmcache server --engine-type blend`. If this image lives
   # in a PRIVATE registry, set imagePullSecrets (the Secret must exist in this
   # namespace).
   image:
diff --git a/operator/internal/controller/cacheblendengine_controller_test.go b/operator/internal/controller/cacheblendengine_controller_test.go
index 2e19dcb8d9..e804f62510 100644
--- a/operator/internal/controller/cacheblendengine_controller_test.go
+++ b/operator/internal/controller/cacheblendengine_controller_test.go
@@ -135,7 +135,7 @@ var _ = Describe("CacheBlendEngine Controller", func() {
 			Expect(podSpec.Containers).To(HaveLen(1))
 			engineContainer := podSpec.Containers[0]
 
-			Expect(argsContainFlagValue(engineContainer.Args, "--engine-type", "blend_v3")).To(BeTrue())
+			Expect(argsContainFlagValue(engineContainer.Args, "--engine-type", "blend")).To(BeTrue())
 			Expect(argsContainFlagValue(engineContainer.Args, "--l1-align-bytes", "16777216")).To(BeTrue())
 
 			By("Verifying there is no GPU resource claim")
diff --git a/operator/internal/resources/cacheblend_engine.go b/operator/internal/resources/cacheblend_engine.go
index 156f22ac31..f749f79ec4 100644
--- a/operator/internal/resources/cacheblend_engine.go
+++ b/operator/internal/resources/cacheblend_engine.go
@@ -26,9 +26,10 @@ import (
 
 const (
 	// cbEngineType is the value of the --engine-type flag that selects the
-	// CacheBlend blend_v3 engine on the lmcache server binary
-	// (cacheblend-plugin/README.md:24).
-	cbEngineType = "blend_v3"
+	// CacheBlend V3 engine on the lmcache server binary. The server maps the
+	// value "blend" to BlendV3Module; "blend_v3" is no longer recognized
+	// (lmcache/v1/multiprocess/server.py).
+	cbEngineType = "blend"
 
 	// cbL1AlignBytes is the value of the --l1-align-bytes flag required by the
 	// blend server (blend_server.sh:31).
@@ -88,7 +89,7 @@ func cbSpecToEngineSpec(spec *lmcachev1alpha1.CacheBlendEngineSpec) *lmcachev1al
 
 // BuildCBEngineArgs returns the server CLI args for the blend_v3 engine: the
 // proven LMCacheEngine serialization (--host/--port/--l1-size-gb/--chunk-size/
-// eviction/prometheus/L2) plus the CacheBlend-specific --engine-type blend_v3 and
+// eviction/prometheus/L2) plus the CacheBlend-specific --engine-type blend and
 // --l1-align-bytes flags. The blend flags are inserted before the user-supplied
 // extraArgs so a user can still override them.
 func BuildCBEngineArgs(spec *lmcachev1alpha1.CacheBlendEngineSpec) []string {
diff --git a/operator/internal/resources/cacheblend_engine_test.go b/operator/internal/resources/cacheblend_engine_test.go
index f92876221e..86984882c3 100644
--- a/operator/internal/resources/cacheblend_engine_test.go
+++ b/operator/internal/resources/cacheblend_engine_test.go
@@ -52,7 +52,7 @@ func TestBuildCBEngineArgs_BlendFlags(t *testing.T) {
 	args := BuildCBEngineArgs(&minimalCBEngine().Spec)
 
 	// Blend-specific flags.
-	assertArg(t, args, "--engine-type", "blend_v3")
+	assertArg(t, args, "--engine-type", "blend")
 	assertArg(t, args, "--l1-align-bytes", "16777216")
 
 	// Reuses the proven LMCacheEngine serialization (NOT --l1-size).
@@ -86,8 +86,8 @@ func TestBuildCBEngineArgs_ExtraArgsAfterBlendFlags(t *testing.T) {
 	if firstIdx == lastIdx {
 		t.Fatalf("expected --engine-type to appear twice, got args=%v", args)
 	}
-	if args[firstIdx+1] != "blend_v3" {
-		t.Fatalf("expected operator-set --engine-type blend_v3 first, got %s", args[firstIdx+1])
+	if args[firstIdx+1] != "blend" {
+		t.Fatalf("expected operator-set --engine-type blend first, got %s", args[firstIdx+1])
 	}
 	if args[lastIdx+1] != "override-me" {
 		t.Fatalf("expected user --engine-type override-me last, got %s", args[lastIdx+1])
@@ -144,7 +144,7 @@ func TestBuildCBEngineDaemonSet_GPUAndSecurity(t *testing.T) {
 	}
 
 	// Blend args present on the container.
-	assertArg(t, c.Args, "--engine-type", "blend_v3")
+	assertArg(t, c.Args, "--engine-type", "blend")
 	assertArg(t, c.Args, "--l1-align-bytes", "16777216")
 }
 

From 7bf829150c0446a1627800d388cce03201ea00f2 Mon Sep 17 00:00:00 2001
From: maobaolong <baoloongmao@tencent.com>
Date: Fri, 12 Jun 2026 14:21:11 +0800
Subject: [PATCH 55/57] ci: add CPU e2e test(vLLM and bench server) (#3590)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* ci: add cpu device e2e test

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* improve

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* improve

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* improve

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* improve

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* improve

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* improve

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* improve

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* improve

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* improve

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* improve

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* ci: alias vllm-cpu-nightly dist-info as vllm to fix CLI version lookup

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* ci: macOS job also installs vllm-cpu-nightly from PyPI

Drop the in-CI git+url build, drop the manual pip cache step (now
handled by setup-python's cache: pip), reuse the same dist-info alias
trick as ubuntu so importlib.metadata.version('vllm') works.

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* ci: tag aliased vllm dist-info with +cpu so platform plugin activates

vllm.platforms.cpu_platform_plugin() decides whether the CPU platform
is available by checking 'cpu' in importlib.metadata.version('vllm').
Our build script strips the +cpu local label before upload (PyPI
rejects local versions), so the alias version was just a date string
without 'cpu', making the plugin return None and 'vllm serve' fail
with 'Failed to infer device type'. Re-tag the alias copy with +cpu;
the original dist-info is untouched.

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* ci(buildkite): use unsafe-best-match for uv when installing vllm-cpu-nightly

uv's default first-index strategy locked setuptools to whatever the
pytorch CPU index serves (<=70.2.0), so vllm-cpu-nightly's pinned
setuptools==80.10.2 could not be satisfied. Tell uv to consider the
full cross-index version pool just for this install.

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* ci(buildkite): alias vllm-cpu-nightly dist-info to vllm with +cpu tag

vllm CLI calls importlib.metadata.version('vllm'), but our wheel
registers as vllm-cpu-nightly so the lookup raises PackageNotFoundError
and 'vllm serve' dies. Same fix already applied in the GH Actions
cpu_device.yml — port it to the buildkite script.

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* ci(buildkite): drop stale CpuCacheContext check in handle transport verify

handle (server-side copy) now goes through ShmTransferStrategy after
the non_gpu_transfer refactor; CpuCacheContext is no longer
instantiated on this path. Match on the actual log line
'Using shm non-GPU transfer strategy' instead.

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* ci(buildkite): verify transport via worker 'Creating transfer context' line

Step 5.5 transport-mode verification was checking server-side strategy
strings, but for handle mode the worker enters HandleTransferContext
which goes through gpu_transfer.py, not non_gpu_transfer.py - so the
shm strategy line never shows up. Switch to grepping the worker's own
'Creating transfer context (device_type=*, mode=*)' log line, which
is the single source of truth for which TransferContext got created.
Also split the previously-conflated 'auto' and 'handle' branches: on
CPU, auto falls back to DataTransferContext, while handle stays as
HandleTransferContext.

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* ci(buildkite): grep handle/auto transport verify in vllm log, not lmcache log

Worker is a child of vllm serve, so its 'Creating transfer context'
line goes to VLLM_LOG (vllm stdout), not LMCACHE_LOG (lmcache server
stdout). Step 5.5 was grepping the wrong file.

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* ci(cpu): factor out shared install/download scripts

Pull the duplicated vLLM-CPU install + dist-info alias, lmcache CPU install and opt-125m download out of cpu_device.yml and run-cpu-e2e-validation.sh into three small scripts under .github/scripts/. Also collapse the ubuntu/macos jobs into a single matrix job, and trim a few overly long inline comments.

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* ci(cpu): generalize download script; define wait_for_metric_change

- Rename download_opt125m.sh -> download_model.sh and accept the repo id as a positional arg (or via MODEL_ID), so the script is reusable for other models. - Add the missing wait_for_metric_change helper that run-cpu-e2e-validation.sh has been calling but never defined; previously bash silently swallowed it via '|| true'.

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* improve

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* improve

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* Address comment, remove SKIP_CACHE_HIT_VALIDATION env var

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

* Address comment, move scripts together

Signed-off-by: baoloongmao <baoloongmao@tencent.com>

---------

Signed-off-by: baoloongmao <baoloongmao@tencent.com>
---
 .buildkite/k3_tests/multiprocess/pipeline.yml |   6 +-
 .github/scripts/cpu_device_test.sh            | 156 ++++++++++
 .github/scripts/cpu_server_bench_test.sh      | 138 ++++++++
 .github/scripts/cpu_vllm_e2e_test.sh          |  88 ++++++
 .github/scripts/download_model.sh             |  79 +++++
 .github/scripts/install_lmcache_cpu.sh        |  27 ++
 .github/scripts/install_vllm_cpu.sh           |  66 ++++
 .github/scripts/macos_smoke_test.sh           | 124 --------
 .../scripts/run-cpu-e2e-validation.sh         | 294 ++++++++++++------
 .github/workflows/cpu_device.yml              | 159 ++++++++++
 .github/workflows/macos_compat.yml            | 147 ---------
 11 files changed, 907 insertions(+), 377 deletions(-)
 create mode 100644 .github/scripts/cpu_device_test.sh
 create mode 100644 .github/scripts/cpu_server_bench_test.sh
 create mode 100644 .github/scripts/cpu_vllm_e2e_test.sh
 create mode 100644 .github/scripts/download_model.sh
 create mode 100644 .github/scripts/install_lmcache_cpu.sh
 create mode 100644 .github/scripts/install_vllm_cpu.sh
 delete mode 100644 .github/scripts/macos_smoke_test.sh
 rename {.buildkite/k3_tests/multiprocess => .github}/scripts/run-cpu-e2e-validation.sh (67%)
 create mode 100644 .github/workflows/cpu_device.yml
 delete mode 100644 .github/workflows/macos_compat.yml

diff --git a/.buildkite/k3_tests/multiprocess/pipeline.yml b/.buildkite/k3_tests/multiprocess/pipeline.yml
index 64e965935f..e59b236f4d 100644
--- a/.buildkite/k3_tests/multiprocess/pipeline.yml
+++ b/.buildkite/k3_tests/multiprocess/pipeline.yml
@@ -142,7 +142,7 @@ steps:
   - group: ":compression: Multiprocess (CPU-only)"
     steps:
       - label: ":compression: cpu_e2e_validation (shm)"
-        command: bash .buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh
+        command: bash .github/scripts/run-cpu-e2e-validation.sh
         timeout_in_minutes: 30
         agents: { queue: "k8s" }
         plugins:
@@ -167,7 +167,7 @@ steps:
                   - { name: dshm, emptyDir: { medium: Memory, sizeLimit: 4Gi } }
 
       - label: ":compression: cpu_e2e_validation (pickle)"
-        command: bash .buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh
+        command: bash .github/scripts/run-cpu-e2e-validation.sh
         env:
           LMCACHE_SHM_NAME: ""
         timeout_in_minutes: 30
@@ -194,7 +194,7 @@ steps:
                   - { name: dshm, emptyDir: { medium: Memory, sizeLimit: 4Gi } }
 
       - label: ":compression: cpu_e2e_validation (server-side copy)"
-        command: bash .buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh
+        command: bash .github/scripts/run-cpu-e2e-validation.sh
         env:
           LMCACHE_MP_TRANSFER_MODE: "handle"
         timeout_in_minutes: 30
diff --git a/.github/scripts/cpu_device_test.sh b/.github/scripts/cpu_device_test.sh
new file mode 100644
index 0000000000..0b191662d8
--- /dev/null
+++ b/.github/scripts/cpu_device_test.sh
@@ -0,0 +1,156 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: Apache-2.0
+#
+# CPU device test: runs both server bench and vLLM e2e tests
+# in the same environment to avoid repeated vLLM installation.
+#
+# Usage: cpu_device_test.sh [mode]
+#   mode: server_bench, vllm_e2e, or all (default)
+#
+# Environment variables:
+#   LMCACHE_BENCH_TRANSFER_MODE  data|handle (default: handle)
+#   LMCACHE_E2E_TRANSPORT_MODE  handle|data|shm|pickle (default: handle)
+#     shm/pickle are user-friendly aliases for data mode with corresponding sub-mode
+#   LMCACHE_E2E_DATA_MODE       shm|pickle (default: shm, data transport sub-mode)
+#   LMCACHE_HTTP_PORT_BENCH     HTTP port for bench (default: 18080)
+#   LMCACHE_ZMQ_PORT_BENCH      ZMQ port for bench (default: 15555)
+#   LMCACHE_HTTP_PORT_E2E       HTTP port for e2e (default: 18081)
+#   LMCACHE_ZMQ_PORT_E2E        ZMQ port for e2e (default: 15557)
+#   VLLM_PORT_E2E               HTTP port for vLLM (default: 18000)
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+OS="$(uname -s)"
+TEST_MODE="${1:-all}"
+
+echo "==> CPU device test (OS: ${OS}, Mode: ${TEST_MODE})"
+echo "    Python: $(python3 --version 2>&1 || true)"
+
+# Configuration
+BENCH_TRANSFER_MODE="${LMCACHE_BENCH_TRANSFER_MODE:-handle}"
+E2E_TRANSPORT_MODE="${LMCACHE_E2E_TRANSPORT_MODE:-handle}"
+E2E_DATA_MODE="${LMCACHE_E2E_DATA_MODE:-shm}"
+HTTP_PORT_BENCH="${LMCACHE_HTTP_PORT_BENCH:-18080}"
+ZMQ_PORT_BENCH="${LMCACHE_ZMQ_PORT_BENCH:-15555}"
+HTTP_PORT_E2E="${LMCACHE_HTTP_PORT_E2E:-18081}"
+ZMQ_PORT_E2E="${LMCACHE_ZMQ_PORT_E2E:-15557}"
+VLLM_PORT_E2E="${VLLM_PORT_E2E:-18000}"
+
+# Validate modes
+case "${BENCH_TRANSFER_MODE}" in
+  data|handle) ;;
+  *)
+    echo "!! Unknown LMCACHE_BENCH_TRANSFER_MODE='${BENCH_TRANSFER_MODE}'"
+    exit 1
+    ;;
+esac
+
+# Map user-facing LMCACHE_E2E_TRANSPORT_MODE to internal representation.
+# shm/pickle are aliases for data mode with corresponding sub-mode selection,
+# preserved for backward compatibility with CI yaml and user convenience.
+case "${E2E_TRANSPORT_MODE}" in
+  data|handle)
+    MAPPED_TRANSPORT_MODE="${E2E_TRANSPORT_MODE}"
+    MAPPED_DATA_MODE="${E2E_DATA_MODE}"   # keep user override if any
+    ;;
+  shm)
+    MAPPED_TRANSPORT_MODE="data"
+    MAPPED_DATA_MODE="shm"
+    ;;
+  pickle)
+    MAPPED_TRANSPORT_MODE="data"
+    MAPPED_DATA_MODE="pickle"
+    ;;
+  *)
+    echo "!! Unknown LMCACHE_E2E_TRANSPORT_MODE='${E2E_TRANSPORT_MODE}'"
+    echo "   Valid values: handle, data, shm, pickle"
+    exit 1
+    ;;
+esac
+
+echo "    Bench transfer mode: ${BENCH_TRANSFER_MODE}"
+echo "    E2E transport mode: ${E2E_TRANSPORT_MODE}"
+echo "    Ports: bench=${HTTP_PORT_BENCH}/${ZMQ_PORT_BENCH}, e2e=${HTTP_PORT_E2E}/${ZMQ_PORT_E2E}/${VLLM_PORT_E2E}"
+
+# Reap any LMCache/vLLM children started by this run on exit so the
+# next workflow step does not collide on default ZMQ/HTTP ports.
+cleanup_processes_safe() {
+    local rc=$?
+    set +e
+    # Kill child processes started by this shell first (e.g. lmcache server
+    # backgrounded by the shared validation script).
+    pkill -P $$ 2>/dev/null || true
+    sleep 1
+    pkill -9 -P $$ 2>/dev/null || true
+    return $rc
+}
+trap cleanup_processes_safe EXIT
+
+# Function to run server bench test
+run_server_bench() {
+    echo ""
+    echo "==> Running CPU server bench test"
+    
+    # Set environment for bench test
+    export LMCACHE_BENCH_TRANSFER_MODE="${BENCH_TRANSFER_MODE}"
+    export LMCACHE_HTTP_PORT="${HTTP_PORT_BENCH}"
+    export LMCACHE_ZMQ_PORT="${ZMQ_PORT_BENCH}"
+    export LMCACHE_LOG_FILE="/tmp/cpu_device_bench_${BENCH_TRANSFER_MODE}_lmcache.log"
+    export BENCH_OUTPUT_LOG="/tmp/cpu_device_bench_${BENCH_TRANSFER_MODE}_output.log"
+    export LMCACHE_HEALTHCHECK_TIMEOUT="30"
+    export BENCH_NUM_REQUESTS="3"
+    export BENCH_NUM_TOKENS="512"
+    
+    # Run bench test
+    bash "${SCRIPT_DIR}/cpu_server_bench_test.sh"
+    
+    echo "==> CPU server bench test completed successfully"
+}
+
+# Function to run vLLM e2e test
+run_vllm_e2e() {
+    echo ""
+    echo "==> Running CPU vLLM e2e test"
+    
+    # Set environment for e2e test
+    export LMCACHE_TRANSPORT_MODE="${MAPPED_TRANSPORT_MODE}"
+    export LMCACHE_DATA_MODE="${MAPPED_DATA_MODE}"
+    export LMCACHE_HTTP_PORT="${HTTP_PORT_E2E}"
+    export LMCACHE_ZMQ_PORT="${ZMQ_PORT_E2E}"
+    export VLLM_PORT="${VLLM_PORT_E2E}"
+    export LMCACHE_LOG_FILE="/tmp/cpu_device_e2e_${E2E_TRANSPORT_MODE}_lmcache.log"
+    export VLLM_LOG_FILE="/tmp/cpu_device_e2e_${E2E_TRANSPORT_MODE}_vllm.log"
+    export LMCACHE_HEALTHCHECK_TIMEOUT="30"
+    export VLLM_READY_TIMEOUT="300"
+    
+    # Run e2e test
+    bash "${SCRIPT_DIR}/cpu_vllm_e2e_test.sh"
+    
+    echo "==> CPU vLLM e2e test completed successfully"
+}
+
+# Determine which tests to run
+case "${TEST_MODE}" in
+    "server_bench")
+        run_server_bench
+        ;;
+    "vllm_e2e")
+        run_vllm_e2e
+        ;;
+    "all")
+        run_server_bench
+        run_vllm_e2e
+        ;;
+    *)
+        echo "!! Unknown test mode: ${TEST_MODE}"
+        echo "    Supported modes: server_bench, vllm_e2e, all"
+        exit 1
+        ;;
+esac
+
+echo ""
+echo "==> CPU device test passed for modes:"
+echo "    Test mode: ${TEST_MODE}"
+echo "    Server bench: ${BENCH_TRANSFER_MODE}"
+echo "    vLLM e2e: ${E2E_TRANSPORT_MODE}"
diff --git a/.github/scripts/cpu_server_bench_test.sh b/.github/scripts/cpu_server_bench_test.sh
new file mode 100644
index 0000000000..7aa97512ee
--- /dev/null
+++ b/.github/scripts/cpu_server_bench_test.sh
@@ -0,0 +1,138 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: Apache-2.0
+#
+# CPU server bench test: starts lmcache server, runs
+# ``lmcache bench server --mode cpu`` with the requested transfer mode,
+# then tears down.
+#
+# Transfer modes (LMCACHE_BENCH_TRANSFER_MODE):
+#   data    - worker-side gather/scatter via POSIX SHM pool
+#   handle  - server-side copy via POSIX SHM IPC (shm_open/mmap)
+#
+# Environment variables (all optional, defaults shown):
+#   LMCACHE_BENCH_TRANSFER_MODE  data|handle          (default: handle)
+#   LMCACHE_HTTP_PORT            HTTP port            (default: 18080)
+#   LMCACHE_ZMQ_PORT             ZMQ RPC port         (default: 15555)
+#   LMCACHE_LOG_FILE             server log path      (default: /tmp/...)
+#   LMCACHE_HEALTHCHECK_TIMEOUT  seconds              (default: 60)
+#   BENCH_NUM_REQUESTS           requests to run      (default: 3)
+#   BENCH_NUM_TOKENS             tokens per request   (default: 512)
+
+set -euo pipefail
+
+OS="$(uname -s)"
+echo "==> CPU server bench test (OS: ${OS})"
+echo "    Python: $(python3 --version 2>&1 || true)"
+
+TRANSFER_MODE="${LMCACHE_BENCH_TRANSFER_MODE:-handle}"
+HTTP_PORT="${LMCACHE_HTTP_PORT:-18080}"
+ZMQ_PORT="${LMCACHE_ZMQ_PORT:-15555}"
+LOG_FILE="${LMCACHE_LOG_FILE:-/tmp/cpu_server_bench_lmcache.log}"
+HEALTHCHECK_TIMEOUT="${LMCACHE_HEALTHCHECK_TIMEOUT:-60}"
+BENCH_NUM_REQUESTS="${BENCH_NUM_REQUESTS:-3}"
+BENCH_NUM_TOKENS="${BENCH_NUM_TOKENS:-512}"
+
+case "${TRANSFER_MODE}" in
+  data|handle) ;;
+  *)
+    echo "!! Unknown LMCACHE_BENCH_TRANSFER_MODE='${TRANSFER_MODE}'"
+    echo "   Valid values: data, handle"
+    exit 1
+    ;;
+esac
+
+echo "    TRANSFER_MODE=${TRANSFER_MODE}"
+echo "    HTTP_PORT=${HTTP_PORT}  ZMQ_PORT=${ZMQ_PORT}"
+echo "    BENCH_NUM_REQUESTS=${BENCH_NUM_REQUESTS}"
+echo "    BENCH_NUM_TOKENS=${BENCH_NUM_TOKENS}"
+
+# ------------------------------------------------------------------ #
+# Start lmcache server
+# ------------------------------------------------------------------ #
+echo ""
+echo "==> Starting lmcache server (log: ${LOG_FILE})"
+rm -f "${LOG_FILE}"
+
+lmcache server \
+  --port "${ZMQ_PORT}" \
+  --http-port "${HTTP_PORT}" \
+  --l1-size-gb 1 \
+  --eviction-policy LRU \
+  >"${LOG_FILE}" 2>&1 &
+SERVER_PID=$!
+
+cleanup() {
+  echo "==> Cleanup: stopping lmcache server (pid=${SERVER_PID})"
+  if kill -0 "${SERVER_PID}" 2>/dev/null; then
+    kill "${SERVER_PID}" 2>/dev/null || true
+    for _ in $(seq 1 10); do
+      kill -0 "${SERVER_PID}" 2>/dev/null || break
+      sleep 1
+    done
+    kill -9 "${SERVER_PID}" 2>/dev/null || true
+  fi
+  echo "==> Last 50 lines of server log:"
+  tail -n 50 "${LOG_FILE}" 2>/dev/null || true
+}
+trap cleanup EXIT
+
+# ------------------------------------------------------------------ #
+# Wait for healthcheck
+# ------------------------------------------------------------------ #
+echo "==> Waiting for healthcheck (timeout: ${HEALTHCHECK_TIMEOUT}s)"
+READY=0
+for i in $(seq 1 "${HEALTHCHECK_TIMEOUT}"); do
+  if ! kill -0 "${SERVER_PID}" 2>/dev/null; then
+    echo "!! lmcache server exited prematurely after ${i}s"
+    break
+  fi
+  if curl -fsS --max-time 2 \
+      "http://127.0.0.1:${HTTP_PORT}/healthcheck" >/dev/null 2>&1; then
+    READY=1
+    echo "    Server healthy after ${i}s"
+    break
+  fi
+  sleep 1
+done
+
+if [ "${READY}" != "1" ]; then
+  echo "!! lmcache server did not become healthy within ${HEALTHCHECK_TIMEOUT}s"
+  exit 1
+fi
+
+# ------------------------------------------------------------------ #
+# Run bench and validate results
+# ------------------------------------------------------------------ #
+echo ""
+echo "==> Running: lmcache bench server" \
+  "--mode cpu --transfer-mode ${TRANSFER_MODE}" \
+  "--num-tokens ${BENCH_NUM_TOKENS}" \
+  "--end ${BENCH_NUM_REQUESTS}"
+
+BENCH_LOG="${BENCH_OUTPUT_LOG:-/tmp/cpu_server_bench_output.log}"
+lmcache bench server \
+  --rpc-url "tcp://127.0.0.1:${ZMQ_PORT}" \
+  --url "http://127.0.0.1:${HTTP_PORT}" \
+  --mode cpu \
+  --transfer-mode "${TRANSFER_MODE}" \
+  --num-tokens "${BENCH_NUM_TOKENS}" \
+  --end "${BENCH_NUM_REQUESTS}" \
+  2>&1 | tee "${BENCH_LOG}"
+
+echo ""
+echo "==> Validating bench results"
+
+if grep -q "CHECKSUM MISMATCH" "${BENCH_LOG}"; then
+  echo "!! CHECKSUM MISMATCH detected — store/retrieve data corruption"
+  exit 1
+fi
+
+MATCH_COUNT="$(grep -c "CHECKSUM MATCH OK" "${BENCH_LOG}" || true)"
+if [ "${MATCH_COUNT}" -lt "${BENCH_NUM_REQUESTS}" ]; then
+  echo "!! CHECKSUM MATCH count (${MATCH_COUNT}) < expected (${BENCH_NUM_REQUESTS})"
+  exit 1
+fi
+echo "    CHECKSUM MATCH: ${MATCH_COUNT}/${BENCH_NUM_REQUESTS} request(s) verified OK"
+
+echo ""
+echo "==> CPU server bench (${TRANSFER_MODE}) passed."
diff --git a/.github/scripts/cpu_vllm_e2e_test.sh b/.github/scripts/cpu_vllm_e2e_test.sh
new file mode 100644
index 0000000000..a85743d8f4
--- /dev/null
+++ b/.github/scripts/cpu_vllm_e2e_test.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: Apache-2.0
+#
+# Generic CPU vLLM e2e wrapper for both macOS and Ubuntu.
+# Assumes vLLM CPU build and facebook/opt-125m are already installed/
+# downloaded by the CI workflow steps before this script is invoked.
+#
+# Delegates all logic to the shared run-cpu-e2e-validation.sh with:
+#   SKIP_INSTALL=1              (install done by CI workflow steps)
+#
+# Transport mode is selected via LMCACHE_TRANSPORT_MODE:
+#   handle -> LMCACHE_MP_TRANSFER_MODE=handle (POSIX SHM server-side copy)
+#   data   -> LMCACHE_DATA_MODE selects shm (default) or pickle
+#
+# Environment variables (all optional, defaults shown):
+#   LMCACHE_TRANSPORT_MODE   Transport mode: handle|data (default: handle)
+#   LMCACHE_DATA_MODE        Data transfer mode: shm|pickle (default: shm)
+#   LMCACHE_HTTP_PORT        HTTP port for LMCache server  (default: 8080)
+#   VLLM_PORT                HTTP port for vLLM server     (default: 8000)
+#   LMCACHE_L1_SIZE_GB       LMCache L1 cache size in GB   (default: 2)
+#   VLLM_READY_TIMEOUT       Seconds to wait for vLLM      (default: 300)
+#   LMCACHE_HEALTHCHECK_TIMEOUT  Seconds to wait for LMCache (default: 60)
+
+set -euo pipefail
+
+OS="$(uname -s)"
+echo "==> CPU vLLM e2e test (OS: ${OS})"
+echo "    Python: $(python3 --version 2>&1 || true)"
+echo "    uname:  $(uname -a)"
+if [ "${OS}" = "Darwin" ]; then
+    sw_vers 2>/dev/null || true
+fi
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+SHARED_SCRIPT="${REPO_ROOT}/.github/scripts/run-cpu-e2e-validation.sh"
+
+if [ ! -f "${SHARED_SCRIPT}" ]; then
+    echo "!! Shared script not found: ${SHARED_SCRIPT}"
+    exit 1
+fi
+
+LMCACHE_TRANSPORT_MODE="${LMCACHE_TRANSPORT_MODE:-handle}"
+
+# When LMCACHE_TRANSPORT_MODE=data, LMCACHE_DATA_MODE selects the
+# specific data transfer mechanism: shm (default) or pickle.
+LMCACHE_DATA_MODE="${LMCACHE_DATA_MODE:-shm}"
+
+# Map LMCACHE_TRANSPORT_MODE to the vars expected by the shared script.
+case "${LMCACHE_TRANSPORT_MODE}" in
+  data)
+    case "${LMCACHE_DATA_MODE}" in
+      shm)
+        export LMCACHE_SHM_NAME="__default__"
+        ;;
+      pickle)
+        export LMCACHE_SHM_NAME=""
+        ;;
+      *)
+        echo "!! Unknown LMCACHE_DATA_MODE='${LMCACHE_DATA_MODE}'"
+        echo "   Valid values: shm, pickle"
+        exit 1
+        ;;
+    esac
+    export LMCACHE_MP_TRANSFER_MODE="data"
+    ;;
+  handle)
+    export LMCACHE_MP_TRANSFER_MODE="handle"
+    ;;
+  *)
+    echo "!! Unknown LMCACHE_TRANSPORT_MODE='${LMCACHE_TRANSPORT_MODE}'"
+    echo "   Valid values: handle, data"
+    exit 1
+    ;;
+esac
+
+export SKIP_INSTALL="${SKIP_INSTALL:-1}"
+export LMCACHE_HEALTHCHECK_TIMEOUT="${LMCACHE_HEALTHCHECK_TIMEOUT:-60}"
+export VLLM_READY_TIMEOUT="${VLLM_READY_TIMEOUT:-300}"
+export LMCACHE_LOG_FILE="${LMCACHE_LOG_FILE:-/tmp/cpu_e2e_lmcache.log}"
+export VLLM_LOG_FILE="${VLLM_LOG_FILE:-/tmp/cpu_e2e_vllm.log}"
+
+echo "    LMCACHE_TRANSPORT_MODE=${LMCACHE_TRANSPORT_MODE}"
+echo "    LMCACHE_DATA_MODE=${LMCACHE_DATA_MODE}"
+echo "    SKIP_INSTALL=${SKIP_INSTALL}"
+echo "    Delegating to: ${SHARED_SCRIPT}"
+
+exec bash "${SHARED_SCRIPT}"
diff --git a/.github/scripts/download_model.sh b/.github/scripts/download_model.sh
new file mode 100644
index 0000000000..488cc4c050
--- /dev/null
+++ b/.github/scripts/download_model.sh
@@ -0,0 +1,79 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: Apache-2.0
+#
+# Download (or just verify the local cache of) a HuggingFace model
+# repo, with bounded retry + exponential backoff so flaky HF mirrors
+# don't fail the whole CI run.
+#
+# Usage:
+#   download_model.sh <repo_id> [<repo_id> ...]
+#   MODEL_ID=facebook/opt-125m download_model.sh
+#
+# Environment:
+#   MODEL_ID                  fallback when no positional args given
+#   HF_DOWNLOAD_MAX_RETRIES   default 3
+#   HF_DOWNLOAD_RETRY_DELAY   default 30 (seconds, doubled per retry)
+#   HF_DOWNLOAD_FAIL_ON_ERROR default 0  (1 -> exit non-zero on failure)
+
+set -euo pipefail
+
+if [ "$#" -eq 0 ]; then
+  if [ -z "${MODEL_ID:-}" ]; then
+    echo "!! download_model.sh: no model id provided"
+    echo "   pass repo ids as args or set MODEL_ID=..."
+    exit 2
+  fi
+  set -- "${MODEL_ID}"
+fi
+
+MAX_RETRIES="${HF_DOWNLOAD_MAX_RETRIES:-3}"
+RETRY_DELAY="${HF_DOWNLOAD_RETRY_DELAY:-30}"
+FAIL_ON_ERROR="${HF_DOWNLOAD_FAIL_ON_ERROR:-0}"
+
+MAX_RETRIES="${MAX_RETRIES}" RETRY_DELAY="${RETRY_DELAY}" \
+FAIL_ON_ERROR="${FAIL_ON_ERROR}" python3 - "$@" <<'PY'
+import os
+import sys
+import time
+
+from huggingface_hub import snapshot_download
+
+max_retries = int(os.environ["MAX_RETRIES"])
+base_delay = int(os.environ["RETRY_DELAY"])
+fail_on_error = os.environ["FAIL_ON_ERROR"] == "1"
+
+repos = sys.argv[1:]
+failures = []
+
+for repo in repos:
+    # Try local cache first to avoid unnecessary HF API calls
+    # (which can 429 on busy CI runners even when the model is cached).
+    try:
+        snapshot_download(repo, local_files_only=True)
+        print(f"CACHED: {repo} (local, no network)")
+        continue
+    except Exception:
+        pass
+
+    delay = base_delay
+    ok = False
+    for attempt in range(max_retries):
+        try:
+            print(f"Attempt {attempt + 1}/{max_retries}: {repo}")
+            snapshot_download(repo)
+            print(f"OK: {repo}")
+            ok = True
+            break
+        except Exception as exc:
+            print(f"Attempt {attempt + 1} failed for {repo}: {exc}")
+            if attempt < max_retries - 1:
+                print(f"Waiting {delay}s before retry...")
+                time.sleep(delay)
+                delay *= 2
+    if not ok:
+        failures.append(repo)
+
+if failures:
+    print(f"All retry attempts failed for: {', '.join(failures)}")
+    sys.exit(1 if fail_on_error else 0)
+PY
diff --git a/.github/scripts/install_lmcache_cpu.sh b/.github/scripts/install_lmcache_cpu.sh
new file mode 100644
index 0000000000..bafa7722f6
--- /dev/null
+++ b/.github/scripts/install_lmcache_cpu.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: Apache-2.0
+#
+# Install lmcache in editable, CPU-only mode (NO_GPU_EXT=1).
+# Assumes vLLM (and therefore torch) was already installed by
+# install_vllm_cpu.sh, so we use --no-deps to keep that pinned torch
+# in place.
+#
+# Usage:
+#   install_lmcache_cpu.sh           # plain pip
+#   PIP_BIN="uv pip" install_lmcache_cpu.sh
+
+set -euo pipefail
+
+PIP_BIN="${PIP_BIN:-pip}"
+
+export NO_GPU_EXT=1
+export SETUPTOOLS_SCM_PRETEND_VERSION="${SETUPTOOLS_SCM_PRETEND_VERSION:-0.0.0.dev0}"
+
+${PIP_BIN} install --upgrade pip
+${PIP_BIN} install -r requirements/build.txt
+${PIP_BIN} install -r requirements/common.txt
+${PIP_BIN} install -r requirements/cli.txt
+${PIP_BIN} install -e . --no-deps --no-build-isolation
+
+python -c "import lmcache, vllm; \
+print('lmcache:', lmcache.__version__, 'vllm:', vllm.__version__)"
diff --git a/.github/scripts/install_vllm_cpu.sh b/.github/scripts/install_vllm_cpu.sh
new file mode 100644
index 0000000000..fd44a369e2
--- /dev/null
+++ b/.github/scripts/install_vllm_cpu.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: Apache-2.0
+#
+# Install the prebuilt CPU-only vLLM wheel (`vllm-cpu-nightly`) plus a
+# `vllm-<ver>+cpu.dist-info` alias.
+#
+# Why the alias: the wheel installs the `vllm/` package but registers
+# its dist metadata under `vllm-cpu-nightly`. vLLM's CLI / internal
+# callers do `importlib.metadata.version("vllm")` (distribution name,
+# not import name); without the alias that raises PackageNotFoundError
+# and `vllm serve` won't start. The `+cpu` local label is also needed
+# so `cpu_platform_plugin()` activates the CPU platform (it greps the
+# dist metadata for the substring "cpu"); our build strips `+cpu`
+# before PyPI upload because PyPI rejects local versions.
+#
+# Usage:
+#   install_vllm_cpu.sh           # use plain `pip`
+#   PIP_BIN="uv pip" install_vllm_cpu.sh
+#                                 # use `uv pip` and pass extra flags
+#                                 # via PIP_INSTALL_EXTRA_ARGS
+#
+# Idempotent: re-running just rewrites the alias.
+
+set -euo pipefail
+
+PIP_BIN="${PIP_BIN:-pip}"
+PIP_INSTALL_EXTRA_ARGS="${PIP_INSTALL_EXTRA_ARGS:-}"
+
+# `--extra-index-url` is required because the wheel pins torch==2.11.0
+# which only lives on the pytorch CPU index. Harmless on macOS.
+${PIP_BIN} install "numpy<2"
+# shellcheck disable=SC2086
+${PIP_BIN} install vllm-cpu-nightly \
+  --extra-index-url https://download.pytorch.org/whl/cpu \
+  ${PIP_INSTALL_EXTRA_ARGS}
+
+python - <<'PY'
+import importlib.metadata as md
+import pathlib
+import shutil
+
+dist = md.distribution("vllm-cpu-nightly")
+ver = dist.version
+fake_ver = f"{ver}+cpu"
+site_root = pathlib.Path(dist.locate_file(""))
+info_name = next(
+    p.parts[0] for p in (dist.files or [])
+    if p.parts and p.parts[0].endswith(".dist-info")
+)
+src = site_root / info_name
+dst = src.with_name(f"vllm-{fake_ver}.dist-info")
+if dst.exists():
+    shutil.rmtree(dst)
+shutil.copytree(src, dst)
+meta = dst / "METADATA"
+txt = meta.read_text()
+txt = txt.replace("Name: vllm-cpu-nightly", "Name: vllm", 1)
+txt = txt.replace(f"Version: {ver}", f"Version: {fake_ver}", 1)
+meta.write_text(txt)
+print(f"Aliased {src.name} -> {dst.name}")
+print("vllm version (via importlib.metadata):", md.version("vllm"))
+PY
+
+python -c "import vllm, torch; \
+print('vllm:', vllm.__version__, 'torch:', torch.__version__, \
+      'cuda:', torch.cuda.is_available())"
diff --git a/.github/scripts/macos_smoke_test.sh b/.github/scripts/macos_smoke_test.sh
deleted file mode 100644
index 15932ff2ea..0000000000
--- a/.github/scripts/macos_smoke_test.sh
+++ /dev/null
@@ -1,124 +0,0 @@
-#!/usr/bin/env bash
-# SPDX-License-Identifier: Apache-2.0
-#
-# macOS basic-compatibility smoke test for the lmcache multiprocess
-# server. Verifies that:
-#   1) `lmcache --help` works (CLI entry point is importable)
-#   2) common C++ extensions (native_storage_ops / lmcache_redis /
-#      lmcache_fs) load on macOS, exercising the PipeNotifier fallback
-#      in csrc/storage_backends/event_notifier.h
-#   3) `lmcache server` can launch the ZMQ + HTTP server on CPU
-#   4) the HTTP server answers GET / and GET /healthcheck
-#
-# This script is intentionally minimal — it does not exercise any
-# GPU / CUDA / vLLM code paths. It is meant to be a fast regression
-# signal against accidental Linux-only imports or filesystem usage
-# (e.g. /dev/shm, librt, eventfd, fcntl at import time).
-
-set -euo pipefail
-
-HTTP_HOST="127.0.0.1"
-HTTP_PORT="${LMCACHE_HTTP_PORT:-18080}"
-ZMQ_PORT="${LMCACHE_ZMQ_PORT:-15555}"
-LOG_FILE="${LMCACHE_LOG_FILE:-/tmp/lmcache_server.log}"
-# GitHub macOS runners are noticeably slower than local macs on cold
-# `import torch / fastapi / opentelemetry` chains, so budget enough
-# wall-clock for the first HTTP hit after `lmcache server` starts.
-STARTUP_TIMEOUT="${LMCACHE_STARTUP_TIMEOUT:-180}"
-
-echo "==> Environment"
-uname -a || true
-sw_vers || true
-python --version
-python -c "import torch; print('torch', torch.__version__, 'cuda', torch.cuda.is_available())"
-
-echo "==> Step 1: lmcache CLI help"
-lmcache --help >/dev/null
-
-echo "==> Step 1.5: import common C++ extensions (CPU-only build)"
-# Mirrors the verify step in .github/workflows/build_cpu_artifacts.yml
-# on the macOS axis: makes sure NO_GPU_EXT=1 produced loadable .so's
-# and that c_ops resolves to the python_ops_fallback shim.
-python -c "
-import sys
-import lmcache
-import lmcache.native_storage_ops  # noqa: F401
-import lmcache.lmcache_redis  # noqa: F401
-import lmcache.lmcache_fs  # noqa: F401
-import lmcache.c_ops  # noqa: F401
-assert lmcache.torch_device_type == 'cpu', lmcache.torch_device_type
-assert sys.modules['lmcache.c_ops'].__name__ == 'lmcache.python_ops_fallback'
-"
-
-echo "==> Step 2: launch 'lmcache server' on ${HTTP_HOST}:${HTTP_PORT} (zmq ${ZMQ_PORT})"
-rm -f "${LOG_FILE}"
-# Run the server in the background. Using `setsid`-like behavior via
-# a subshell so we can kill the whole process group cleanly.
-(
-  lmcache server \
-    --host "${HTTP_HOST}" \
-    --port "${ZMQ_PORT}" \
-    --http-host "${HTTP_HOST}" \
-    --http-port "${HTTP_PORT}" \
-    --l1-size-gb "${LMCACHE_L1_SIZE_GB:-1}" \
-    --eviction-policy "${LMCACHE_EVICTION_POLICY:-LRU}" \
-    --no-l1-use-lazy \
-    >"${LOG_FILE}" 2>&1
-) &
-SERVER_PID=$!
-
-cleanup() {
-  echo "==> Cleanup: stopping server (pid=${SERVER_PID})"
-  if kill -0 "${SERVER_PID}" 2>/dev/null; then
-    kill "${SERVER_PID}" 2>/dev/null || true
-    # Give it a moment to exit; escalate if needed.
-    for _ in $(seq 1 10); do
-      kill -0 "${SERVER_PID}" 2>/dev/null || break
-      sleep 1
-    done
-    kill -9 "${SERVER_PID}" 2>/dev/null || true
-  fi
-  if [[ -f "${LOG_FILE}" ]]; then
-    echo "==> Last 100 lines of server log:"
-    tail -n 100 "${LOG_FILE}" || true
-  fi
-}
-trap cleanup EXIT
-
-echo "==> Step 3: wait for HTTP endpoint (timeout=${STARTUP_TIMEOUT}s)"
-READY=0
-for i in $(seq 1 "${STARTUP_TIMEOUT}"); do
-  if ! kill -0 "${SERVER_PID}" 2>/dev/null; then
-    echo "!! lmcache server exited prematurely after ${i}s"
-    break
-  fi
-  if curl -fsS --max-time 2 "http://${HTTP_HOST}:${HTTP_PORT}/" >/dev/null 2>&1; then
-    READY=1
-    echo "==> Server reachable after ${i}s"
-    break
-  fi
-  sleep 1
-done
-
-if [[ "${READY}" != "1" ]]; then
-  echo "!! lmcache server did not become ready within ${STARTUP_TIMEOUT}s"
-  exit 1
-fi
-
-echo "==> Step 4: curl GET /"
-ROOT_BODY="$(curl -fsS "http://${HTTP_HOST}:${HTTP_PORT}/")"
-echo "    body: ${ROOT_BODY}"
-echo "${ROOT_BODY}" | grep -q '"status"' || {
-  echo "!! GET / did not return expected status field"
-  exit 1
-}
-
-echo "==> Step 5: curl GET /healthcheck"
-HEALTH_BODY="$(curl -fsS "http://${HTTP_HOST}:${HTTP_PORT}/healthcheck")"
-echo "    body: ${HEALTH_BODY}"
-echo "${HEALTH_BODY}" | grep -q '"status"[[:space:]]*:[[:space:]]*"healthy"' || {
-  echo "!! GET /healthcheck did not report healthy"
-  exit 1
-}
-
-echo "==> macOS smoke test passed."
diff --git a/.buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh b/.github/scripts/run-cpu-e2e-validation.sh
similarity index 67%
rename from .buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh
rename to .github/scripts/run-cpu-e2e-validation.sh
index cd9bd24604..2a47b3ae91 100755
--- a/.buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh
+++ b/.github/scripts/run-cpu-e2e-validation.sh
@@ -3,15 +3,21 @@ set -euo pipefail
 
 echo "Build ID: ${BUILDKITE_BUILD_ID:-local}"
 echo "Python: $(python3 --version 2>&1 || true)"
-echo "uv: $(uv --version 2>&1 || true)"
+if command -v uv >/dev/null 2>&1; then
+  echo "uv: $(uv --version 2>&1 || true)"
+else
+  echo "uv: not installed"
+fi
 
 BUILD_ID="${BUILDKITE_BUILD_ID:-local_$$}"
 VENV_DIR=".venv-${BUILD_ID}"
-LMCACHE_LOG="/tmp/build_${BUILD_ID}_lmcache_cpu_validation.log"
-VLLM_LOG="/tmp/build_${BUILD_ID}_vllm_cpu_validation.log"
+SHARED_SCRIPTS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+LMCACHE_LOG="${LMCACHE_LOG_FILE:-/tmp/build_${BUILD_ID}_lmcache_cpu_validation.log}"
+VLLM_LOG="${VLLM_LOG_FILE:-/tmp/build_${BUILD_ID}_vllm_cpu_validation.log}"
 LMCACHE_PID=""
 VLLM_PID=""
 LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
+LMCACHE_ZMQ_PORT="${LMCACHE_ZMQ_PORT:-5555}"
 VLLM_PORT="${VLLM_PORT:-8000}"
 LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-1}"
 LMCACHE_EVICTION_POLICY="${LMCACHE_EVICTION_POLICY:-LRU}"
@@ -25,10 +31,16 @@ VLLM_MAX_MODEL_LEN="${VLLM_MAX_MODEL_LEN:-2048}"
 VLLM_MAX_NUM_SEQS="${VLLM_MAX_NUM_SEQS:-4}"
 LMCACHE_HEALTHCHECK_TIMEOUT="${LMCACHE_HEALTHCHECK_TIMEOUT:-30}"
 VLLM_READY_TIMEOUT="${VLLM_READY_TIMEOUT:-120}"
-# Set LMCACHE_SHM_NAME="" to use pickle transport; unset/default uses shm transport
+# Transport mode selection:
+#   LMCACHE_MP_TRANSFER_MODE=handle  -> handle mode (POSIX SHM server-side copy)
+#   LMCACHE_MP_TRANSFER_MODE=data    -> data mode, sub-selected by LMCACHE_SHM_NAME:
+#       LMCACHE_SHM_NAME=""              -> pickle transport
+#       LMCACHE_SHM_NAME=__default__     -> shm transport (default)
 LMCACHE_SHM_NAME="${LMCACHE_SHM_NAME-__default__}"
-# Set LMCACHE_MP_TRANSFER_MODE=handle for server-side copy (POSIX SHM IPC)
-LMCACHE_MP_TRANSFER_MODE="${LMCACHE_MP_TRANSFER_MODE:-auto}"
+LMCACHE_MP_TRANSFER_MODE="${LMCACHE_MP_TRANSFER_MODE:-data}"
+# Set SKIP_INSTALL=1 to skip Phase 1 (install) — useful when the caller
+# has already installed everything (e.g. macOS CI workflow steps).
+SKIP_INSTALL="${SKIP_INSTALL:-0}"
 
 # Directory to collect artifacts before workspace is deleted
 ARTIFACT_DIR="/tmp/build_${BUILD_ID}_artifacts"
@@ -150,55 +162,53 @@ print(int(total))
 EOF
 }
 
-# Wait for a metric to change from its previous value
+# Poll a Prometheus counter until its value differs from `baseline`,
+# or until `timeout` seconds elapse. Returns 0 on change, 1 on timeout
+# (callers typically `|| true` it because not every probe expects a
+# change to actually happen).
 wait_for_metric_change() {
   local metric_name="$1"
-  local previous_value="$2"
-  local timeout_seconds="${3:-5}"
-  
-  echo "Waiting for metric '${metric_name}' to change from ${previous_value} (timeout: ${timeout_seconds}s)"
-  
-  local start_time current_time
-  start_time=$(date +%s)
-  
-  while true; do
-    current_time=$(date +%s)
-    if [ $((current_time - start_time)) -ge "${timeout_seconds}" ]; then
-      echo "Timeout: Metric '${metric_name}' did not change within ${timeout_seconds}s"
-      return 1
-    fi
-    
-    local current_value
-    current_value="$(scrape_metric "${metric_name}")"
-    
-    if [ "${current_value}" -gt "${previous_value}" ]; then
-      echo "Metric '${metric_name}' changed from ${previous_value} to ${current_value}"
+  local baseline="$2"
+  local timeout="${3:-10}"
+  local current
+  for _ in $(seq 1 "${timeout}"); do
+    current="$(scrape_metric "${metric_name}")"
+    if [ "${current}" != "${baseline}" ]; then
       return 0
     fi
-    
     sleep 1
   done
+  return 1
 }
 
 # Send a completion request and print the text output
 send_completion() {
   local prompt_file="$1"
   local max_tokens="${2:-50}"
-  local prompt
-  prompt="$(cat "${prompt_file}")"
-  local response
-  response="$(curl -fsS "http://localhost:${VLLM_PORT}/v1/completions" \
-    -H "Content-Type: application/json" \
-    -d "$(python3 -c "
-import json, sys
-prompt = open('${prompt_file}').read()
+  local body_file
+  body_file="$(mktemp)"
+  # Build the JSON body in a separate process to avoid nested-quote
+  # quoting nightmares with -d "$(python3 -c "...")". Pass the prompt
+  # file path and max_tokens via argv so the python snippet itself
+  # does not need any shell interpolation inside its string body.
+  PROMPT_FILE="${prompt_file}" MAX_TOKENS="${max_tokens}" \
+    python3 - >"${body_file}" <<'PYEOF'
+import json
+import os
+
+prompt = open(os.environ['PROMPT_FILE']).read()
 print(json.dumps({
     'model': 'facebook/opt-125m',
     'prompt': prompt,
-    'max_tokens': ${max_tokens},
-    'temperature': 0
+    'max_tokens': int(os.environ['MAX_TOKENS']),
+    'temperature': 0,
 }))
-")")"
+PYEOF
+  local response
+  response="$(curl -fsS "http://localhost:${VLLM_PORT}/v1/completions" \
+    -H "Content-Type: application/json" \
+    --data-binary "@${body_file}")"
+  rm -f "${body_file}"
   echo "${response}" | python3 -c "import json,sys; print(json.load(sys.stdin)['choices'][0]['text'])"
 }
 
@@ -208,11 +218,42 @@ start_vllm() {
   # VLLM_CPU_KVCACHE_SPACE, CPU backend falls back to
   # `total_memory * gpu_memory_utilization`, which can request 100s of GiB
   # on big hosts and OOM (see vllm/v1/worker/cpu_worker.py:determine_available_memory).
+  # VLLM_DEVICE is the modern env var (vLLM 0.8+); VLLM_TARGET_DEVICE is
+  # kept for backwards-compatibility with older vLLM CPU wheels.
+  export VLLM_DEVICE=cpu
   export VLLM_TARGET_DEVICE=cpu
   export VLLM_CPU_KVCACHE_SPACE="${VLLM_CPU_KVCACHE_SPACE}"
   export LMCACHE_MP_TRANSFER_MODE="${LMCACHE_MP_TRANSFER_MODE}"
+  # Pin gloo / vLLM rendezvous to loopback. Otherwise vLLM's
+  # network_utils.get_ip() picks a LAN address (e.g. 192.168.x.x on the
+  # macOS GHA runner) and gloo's init_process_group sits there for ~16
+  # minutes doing slow socket bind/connect retries before the engine
+  # ever loads weights.
+  export VLLM_HOST_IP="${VLLM_HOST_IP:-127.0.0.1}"
+  if [ -z "${GLOO_SOCKET_IFNAME:-}" ]; then
+    case "$(uname -s)" in
+      Darwin) export GLOO_SOCKET_IFNAME=lo0 ;;
+      Linux)  export GLOO_SOCKET_IFNAME=lo ;;
+    esac
+  fi
   local kv_cache_bytes
   kv_cache_bytes="$(python3 -c "print(int(${VLLM_CPU_KVCACHE_SPACE} * 1024 * 1024 * 1024))")"
+  # Tell LMCacheMPConnector where the lmcache server actually listens.
+  # Without this it falls back to tcp://localhost:5555 and dies with
+  # "Cannot reach the LMCache MP server" whenever we run multiple e2e
+  # steps in parallel/sequence on different ZMQ ports.
+  local kv_transfer_config
+  kv_transfer_config="$(python3 -c "
+import json
+print(json.dumps({
+    'kv_connector': 'LMCacheMPConnector',
+    'kv_role': 'kv_both',
+    'kv_connector_module_path': 'lmcache.integration.vllm.lmcache_mp_connector',
+    'kv_connector_extra_config': {
+        'lmcache.mp.host': 'tcp://localhost',
+        'lmcache.mp.port': int('${LMCACHE_ZMQ_PORT}'),
+    },
+}))")"
   vllm serve facebook/opt-125m \
     --port "${VLLM_PORT}" \
     --dtype bfloat16 \
@@ -222,7 +263,8 @@ start_vllm() {
     --kv-cache-memory-bytes "${kv_cache_bytes}" \
     --max-model-len "${VLLM_MAX_MODEL_LEN}" \
     --max-num-seqs "${VLLM_MAX_NUM_SEQS}" \
-    --kv-transfer-config '{"kv_connector":"LMCacheMPConnector","kv_role":"kv_both"}' \
+    --kv-transfer-config "${kv_transfer_config}" \
+    --enforce-eager \
     >"${VLLM_LOG}" 2>&1 &
   VLLM_PID=$!
   echo "vLLM server started (PID=${VLLM_PID})"
@@ -262,78 +304,89 @@ on_error() {
 
 trap on_error ERR
 
-echo "=== CPU Install Validation (Phase 1) ==="
-echo "Creating virtual environment with uv at ${VENV_DIR}"
-uv venv --python 3.12 "${VENV_DIR}"
-source "${VENV_DIR}/bin/activate"
-echo "✅ Virtual environment ready"
-
-echo "Upgrading pip/setuptools/wheel"
-uv pip install --upgrade pip setuptools wheel
-echo "✅ Upgraded pip/setuptools/wheel"
-
-echo "Installing build dependencies from requirements/build.txt"
-uv pip install -r requirements/build.txt
-echo "✅ Installed requirements/build.txt"
-
-echo "Installing common dependencies from requirements/common.txt"
-uv pip install -r requirements/common.txt
-echo "✅ Installed requirements/common.txt"
-
-echo "Installing vLLM CPU build"
-# Un-pinned from 71df063c (LMCache #3538) now that LMCache handles the
-# blocks-first fused KV layout. Running against nightly means a passing CPU
-# e2e proves the new GPUKVFormat path works.
-uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly/cpu --index-strategy first-index --torch-backend cpu
-echo "✅ vLLM CPU install completed"
-
-echo "Installing LMCache in editable mode with NO_GPU_EXT=1"
-NO_GPU_EXT=1 uv pip install -e . --no-build-isolation
-echo "✅ LMCache install completed"
-
-echo "Freezing installed package versions"
-uv pip freeze
-
-echo "Validating imports"
-python -c "import lmcache; import vllm; print('✅ Imports OK')"
-
-echo "Printing package versions"
-python -c "import vllm; print('vllm:', vllm.__version__)"
-python -c "import lmcache; print('lmcache:', lmcache.__version__)"
-
-echo "✅ CPU install validation passed"
+if [ "${SKIP_INSTALL}" = "1" ]; then
+  echo "=== CPU Install Validation (Phase 1) — SKIPPED (SKIP_INSTALL=1) ==="
+else
+  echo "=== CPU Install Validation (Phase 1) ==="
+  echo "Creating virtual environment with uv at ${VENV_DIR}"
+  uv venv --python 3.12 "${VENV_DIR}"
+  # shellcheck disable=SC1091
+  source "${VENV_DIR}/bin/activate"
+
+  uv pip install --upgrade pip setuptools wheel
+
+  # `--index-strategy unsafe-best-match` is required because uv's
+  # default `first-index` strategy locks each package to the first
+  # index that lists it; the pytorch CPU index ships an older mirror
+  # of `setuptools` that would block the version vllm-cpu-nightly
+  # pins.
+  uv pip uninstall -y vllm vllm-cpu-nightly 2>/dev/null || true
+  PIP_BIN="uv pip" \
+  PIP_INSTALL_EXTRA_ARGS="--index-strategy unsafe-best-match" \
+    bash "${SHARED_SCRIPTS_DIR}/install_vllm_cpu.sh"
+
+  PIP_BIN="uv pip" \
+    bash "${SHARED_SCRIPTS_DIR}/install_lmcache_cpu.sh"
+
+  echo "Freezing installed package versions"
+  uv pip freeze
+
+  echo "✅ CPU install validation passed"
+fi
 
 echo "=== CPU E2E Validation (Phase 2) ==="
 
-echo "[Phase 2 / Step 1] Installing numpy<2 for scipy/vLLM compatibility"
-uv pip install "numpy<2"
-echo "✅ numpy<2 installed"
+if [ "${SKIP_INSTALL}" = "1" ]; then
+  echo "[Phase 2 / Step 1] numpy<2 install — SKIPPED (SKIP_INSTALL=1)"
+else
+  echo "[Phase 2 / Step 1] Installing numpy<2 for scipy/vLLM compatibility"
+  uv pip install "numpy<2"
+  echo "✅ numpy<2 installed"
+fi
 
 echo "[Phase 2 / Step 2] Downloading facebook/opt-125m model (cache-aware)"
-if ! python -c "from huggingface_hub import snapshot_download; snapshot_download('facebook/opt-125m')"; then
-  echo "❌ Failed to download/cache facebook/opt-125m"
-  false
-fi
+HF_DOWNLOAD_FAIL_ON_ERROR=1 \
+  bash "${SHARED_SCRIPTS_DIR}/download_model.sh" facebook/opt-125m
 echo "✅ Model download/check complete"
 
 echo "[Phase 2 / Step 3] Starting LMCache server"
 echo "LMCache log: ${LMCACHE_LOG}"
 # Build lmcache server args
 LMCACHE_ARGS=(
+  --port "${LMCACHE_ZMQ_PORT}"
+  --http-port "${LMCACHE_HTTP_PORT}"
   --l1-size-gb "${LMCACHE_L1_SIZE_GB}"
   --eviction-policy "${LMCACHE_EVICTION_POLICY}"
   --chunk-size "${LMCACHE_CHUNK_SIZE}"
 )
+# `data` handle look identical here (both leave SHM_NAME at
+# default) but resolve to different worker-side TransferContexts:
+#   handle -> HandleTransferContext (server-side copy via POSIX SHM IPC)
+#   data   -> DataTransferContext on non-CUDA devices
+# Step 5.5 verifies which one the worker actually entered.
 if [ "${LMCACHE_MP_TRANSFER_MODE}" = "handle" ]; then
   echo "Transport mode: server-side copy (handle via POSIX SHM IPC)"
   EXPECTED_TRANSPORT="handle"
-elif [ "${LMCACHE_SHM_NAME}" = "__default__" ]; then
-  echo "Transport mode: shared memory (shm)"
-  EXPECTED_TRANSPORT="shm"
+elif [ "${LMCACHE_MP_TRANSFER_MODE}" = "data" ]; then
+  if [ "${LMCACHE_SHM_NAME}" = "__default__" ]; then
+    echo "Transport mode: data/shm (shared memory)"
+    EXPECTED_TRANSPORT="shm"
+  else
+    echo "Transport mode: data/pickle (--shm-name '${LMCACHE_SHM_NAME}')"
+    LMCACHE_ARGS+=(--shm-name "${LMCACHE_SHM_NAME}")
+    EXPECTED_TRANSPORT="pickle"
+  fi
 else
-  echo "Transport mode: pickle (--shm-name '${LMCACHE_SHM_NAME}')"
-  LMCACHE_ARGS+=(--shm-name "${LMCACHE_SHM_NAME}")
-  EXPECTED_TRANSPORT="pickle"
+  echo "Transport mode: unknown '${LMCACHE_MP_TRANSFER_MODE}',"
+  echo "  falling back to LMCACHE_SHM_NAME-based detection"
+  if [ "${LMCACHE_SHM_NAME}" = "__default__" ]; then
+    echo "Transport mode: data/shm (shared memory, fallback)"
+    EXPECTED_TRANSPORT="shm"
+  else
+    echo "Transport mode: data/pickle (--shm-name '${LMCACHE_SHM_NAME}')"
+    LMCACHE_ARGS+=(--shm-name "${LMCACHE_SHM_NAME}")
+    EXPECTED_TRANSPORT="pickle"
+  fi
 fi
 
 lmcache server "${LMCACHE_ARGS[@]}" \
@@ -352,8 +405,31 @@ if ! wait_for_endpoint_contains "http://localhost:${LMCACHE_HTTP_PORT}/healthche
 fi
 echo "✅ LMCache server is healthy"
 
-echo "[Phase 2 / Step 4] Installing libnuma and starting vLLM server"
-apt-get update && apt-get install -y --no-install-recommends libnuma1
+echo "[Phase 2 / Step 4] Installing libnuma (Linux only) and starting vLLM server"
+if [ "$(uname -s)" = "Linux" ]; then
+  # libnuma1 is required by some vLLM CPU paths. On GitHub Actions
+  # ubuntu runners apt-get must be invoked via sudo; on hardened images
+  # without passwordless sudo it may not be available at all. Skip the
+  # install if the shared object is already present, and never let an
+  # apt-get hiccup fail the whole e2e step (vLLM startup itself will
+  # surface a clearer error if libnuma is genuinely missing).
+  if [ ! -e /usr/lib/x86_64-linux-gnu/libnuma.so.1 ] \
+     && [ ! -e /lib/x86_64-linux-gnu/libnuma.so.1 ]; then
+    if command -v sudo >/dev/null 2>&1; then
+      sudo apt-get update \
+        && sudo apt-get install -y --no-install-recommends libnuma1 \
+        || echo "⚠️  libnuma1 install via sudo apt-get failed; continuing"
+    else
+      apt-get update \
+        && apt-get install -y --no-install-recommends libnuma1 \
+        || echo "⚠️  libnuma1 install via apt-get failed; continuing"
+    fi
+  else
+    echo "libnuma1 already present, skipping apt install"
+  fi
+fi
+# VLLM_DEVICE is the modern env var (vLLM 0.8+)
+export VLLM_DEVICE=cpu
 export VLLM_TARGET_DEVICE=cpu
 start_vllm
 
@@ -374,23 +450,31 @@ echo "✅ E2E request validation passed"
 
 # Verify transport mode (logged after vLLM connects to LMCache server)
 echo "[Phase 2 / Step 5.5] Verifying transport mode: expecting '${EXPECTED_TRANSPORT}'"
+# Worker logs `Creating transfer context (device_type=<dev>, mode=<m>)`
+# from worker_transfer.py:create_transfer_context, where <m> is the
+# resolved MPTransferMode after env-var lookup. This is the single source
+# of truth for which TransferContext the worker actually entered. Note:
+# the worker is a child of `vllm serve`, so its LMCache log lines land in
+# VLLM_LOG (vllm's stdout), not in LMCACHE_LOG (lmcache server's stdout).
+# The shm/pickle branches still grep LMCACHE_LOG because the strategy
+# line is emitted by the lmcache server itself.
 if [ "${EXPECTED_TRANSPORT}" = "handle" ]; then
-  if ! grep -q "CpuCacheContext" "${LMCACHE_LOG}" 2>/dev/null; then
-    echo "❌ Expected server-side copy but 'CpuCacheContext' not found in log"
-    tail -50 "${LMCACHE_LOG}"
+  if ! grep -q "Creating transfer context.*mode=handle" "${VLLM_LOG}" 2>/dev/null; then
+    echo "❌ Expected handle worker context but 'mode=handle' not found in vLLM log"
+    tail -50 "${VLLM_LOG}"
     false
   fi
   echo "✅ Transport mode confirmed: handle (server-side copy)"
 elif [ "${EXPECTED_TRANSPORT}" = "shm" ]; then
-  if ! grep -q "Using shm" "${LMCACHE_LOG}" 2>/dev/null; then
-    echo "❌ Expected shm transport but 'Using shm' not found in log"
+  if ! grep -q "Using shm non-GPU transfer strategy" "${LMCACHE_LOG}" 2>/dev/null; then
+    echo "❌ Expected shm transport but server strategy line not found in log"
     tail -50 "${LMCACHE_LOG}"
     false
   fi
   echo "✅ Transport mode confirmed: shm"
 elif [ "${EXPECTED_TRANSPORT}" = "pickle" ]; then
-  if ! grep -q "Using pickle" "${LMCACHE_LOG}" 2>/dev/null; then
-    echo "❌ Expected pickle transport but 'Using pickle' not found in log"
+  if ! grep -q "Using pickle non-GPU transfer strategy" "${LMCACHE_LOG}" 2>/dev/null; then
+    echo "❌ Expected pickle transport but server strategy line not found in log"
     tail -50 "${LMCACHE_LOG}"
     false
   fi
@@ -533,6 +617,10 @@ echo "=========================================="
 echo "✅ All phases passed (Phase 1 + 2 + 3)"
 echo "=========================================="
 
+# Make sure lmcache/vllm processes started in this run are reaped so
+# the next CI step does not collide on their default ZMQ/HTTP ports.
+cleanup_processes
+
 # Upload artifacts BEFORE deleting the workspace
 upload_artifacts
-cleanup_workspace
\ No newline at end of file
+cleanup_workspace
diff --git a/.github/workflows/cpu_device.yml b/.github/workflows/cpu_device.yml
new file mode 100644
index 0000000000..f7995353a0
--- /dev/null
+++ b/.github/workflows/cpu_device.yml
@@ -0,0 +1,159 @@
+name: CPU Device Tests
+
+# CPU device tests: server bench + vLLM e2e. Both job types share a
+# single environment (built once per OS) to avoid re-installing vLLM.
+
+on:
+  workflow_call:
+  workflow_dispatch:
+  pull_request:
+    branches:
+      - "dev"
+      - "release-**"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+
+defaults:
+  run:
+    shell: bash
+
+jobs:
+  changes:
+    name: Detect changes
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: read
+    outputs:
+      lmcache: >-
+        ${{
+          github.event_name == 'workflow_dispatch' ||
+          github.event_name == 'workflow_call' ||
+          steps.filter.outputs.lmcache == 'true'
+        }}
+    steps:
+      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        if: github.event_name == 'push'
+        with:
+          fetch-depth: 2
+
+      - uses: dorny/paths-filter@v3
+        if: >-
+          github.event_name == 'push' ||
+          github.event_name == 'pull_request'
+        id: filter
+        with:
+          filters: |
+            lmcache:
+              - 'lmcache/**'
+              - 'pyproject.toml'
+              - 'setup.py'
+              - 'requirements/**.txt'
+              - '.github/workflows/cpu_device.yml'
+              - '.github/scripts/cpu_device_test.sh'
+              - '.github/scripts/cpu_server_bench_test.sh'
+              - '.github/scripts/cpu_vllm_e2e_test.sh'
+              - '.github/scripts/install_vllm_cpu.sh'
+              - '.github/scripts/install_lmcache_cpu.sh'
+              - '.github/scripts/download_model.sh'
+              - '.github/scripts/run-cpu-e2e-validation.sh'
+              - '!operator/**'
+
+  cpu-device-test:
+    needs: changes
+    if: needs.changes.outputs.lmcache == 'true'
+    name: CPU device (${{ matrix.os }})
+    runs-on: ${{ matrix.os }}
+    timeout-minutes: 120
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-22.04, macos-latest]
+
+    steps:
+      - name: Harden Runner
+        uses: step-security/harden-runner@20cf305ff2072d973412fa9b1e3a4f227bda3c76 # v2.14.0
+        with:
+          egress-policy: audit
+
+      - name: Checkout
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          fetch-depth: 1
+
+      - name: Setup Python 3.12
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
+        with:
+          python-version: "3.12"
+          cache: pip
+          cache-dependency-path: |
+            pyproject.toml
+            requirements/*.txt
+
+      - name: Cache HuggingFace models
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/huggingface
+          key: hf-opt-125m-v1
+
+      - name: Install vLLM CPU (prebuilt nightly from PyPI)
+        run: bash .github/scripts/install_vllm_cpu.sh
+
+      - name: Install lmcache (CPU-only, no vLLM)
+        run: bash .github/scripts/install_lmcache_cpu.sh
+
+      - name: Download facebook/opt-125m
+        run: bash .github/scripts/download_model.sh facebook/opt-125m
+
+      - name: Server bench — Data
+        run: |
+          LMCACHE_BENCH_TRANSFER_MODE=data \
+          LMCACHE_HTTP_PORT_BENCH=18080 \
+          LMCACHE_ZMQ_PORT_BENCH=15555 \
+            bash .github/scripts/cpu_device_test.sh server_bench
+
+      - name: Server bench — Handle
+        run: |
+          LMCACHE_BENCH_TRANSFER_MODE=handle \
+          LMCACHE_HTTP_PORT_BENCH=18081 \
+          LMCACHE_ZMQ_PORT_BENCH=15556 \
+            bash .github/scripts/cpu_device_test.sh server_bench
+
+      - name: vLLM e2e — Data (pickle)
+        run: |
+          LMCACHE_E2E_TRANSPORT_MODE=pickle \
+          LMCACHE_HTTP_PORT_E2E=18082 \
+          LMCACHE_ZMQ_PORT_E2E=15557 \
+          VLLM_PORT_E2E=18000 \
+            bash .github/scripts/cpu_device_test.sh vllm_e2e
+
+      # POSIX SHM transport is Linux-only.
+      - name: vLLM e2e — Data (shm)
+        if: runner.os == 'Linux'
+        run: |
+          LMCACHE_E2E_TRANSPORT_MODE=shm \
+          LMCACHE_HTTP_PORT_E2E=18083 \
+          LMCACHE_ZMQ_PORT_E2E=15558 \
+          VLLM_PORT_E2E=18001 \
+            bash .github/scripts/cpu_device_test.sh vllm_e2e
+
+      - name: vLLM e2e — Handle
+        run: |
+          LMCACHE_E2E_TRANSPORT_MODE=handle \
+          LMCACHE_HTTP_PORT_E2E=18084 \
+          LMCACHE_ZMQ_PORT_E2E=15559 \
+          VLLM_PORT_E2E=18002 \
+            bash .github/scripts/cpu_device_test.sh vllm_e2e
+
+      - name: Upload logs on failure
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: cpu-device-logs-${{ matrix.os }}
+          path: |
+            /tmp/cpu_device_*.log
+          if-no-files-found: ignore
diff --git a/.github/workflows/macos_compat.yml b/.github/workflows/macos_compat.yml
deleted file mode 100644
index 5559896857..0000000000
--- a/.github/workflows/macos_compat.yml
+++ /dev/null
@@ -1,147 +0,0 @@
-name: macOS Compat
-
-# Smoke test that verifies the lmcache multiprocess server can be
-# installed and launched on macOS (GPU backend skipped via NO_GPU_EXT=1;
-# common C++ extensions — native_storage_ops / lmcache_redis /
-# lmcache_fs — are still built so the cross-platform PipeNotifier
-# fallback in csrc/storage_backends/event_notifier.h is exercised),
-# and that both the HTTP endpoint and the `lmcache` CLI are usable.
-# This is a "best-effort basic compatibility" check — it does not
-# cover GPU / CUDA / vLLM code paths.
-
-on:
-  workflow_call:
-  workflow_dispatch:
-  pull_request:
-    branches:
-      - "dev"
-      - "release-**"
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-defaults:
-  run:
-    shell: bash
-
-# Single source of truth for the smoke-script contract. The script
-# (.github/scripts/macos_smoke_test.sh) reads these from the env, and
-# the failure-artifact upload step below references LMCACHE_LOG_FILE.
-env:
-  LMCACHE_HTTP_PORT: "18080"
-  LMCACHE_ZMQ_PORT: "15555"
-  LMCACHE_LOG_FILE: "/tmp/lmcache_server.log"
-  LMCACHE_STARTUP_TIMEOUT: "180"
-
-jobs:
-  changes:
-    name: Detect changes
-    runs-on: ubuntu-latest
-    permissions:
-      pull-requests: read
-    outputs:
-      # workflow_call / workflow_dispatch bypass the paths-filter (which
-      # only runs for push / pull_request). Treat both as "always run"
-      # so callers don't get a silently-skipped job.
-      lmcache: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call' || steps.filter.outputs.lmcache == 'true' }}
-    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
-        if: github.event_name == 'push'
-        with:
-          fetch-depth: 2
-
-      - uses: dorny/paths-filter@v3
-        if: github.event_name == 'push' || github.event_name == 'pull_request'
-        id: filter
-        with:
-          filters: |
-            lmcache:
-              - 'lmcache/**'
-              - 'pyproject.toml'
-              - 'setup.py'
-              - 'requirements/**.txt'
-              - '.github/workflows/macos_compat.yml'
-              - '.github/scripts/macos_smoke_test.sh'
-              - '!operator/**'
-
-  macos-smoke:
-    needs: changes
-    if: needs.changes.outputs.lmcache == 'true'
-    name: "macOS smoke: py${{ matrix.python }} on ${{ matrix.platform }}"
-    runs-on: "${{ matrix.platform }}"
-    strategy:
-      fail-fast: false
-      matrix:
-        python:
-          - "3.11"
-          - "3.12"
-        platform:
-          # macos-latest is Apple Silicon (arm64);
-          - "macos-latest"
-
-    steps:
-      - name: Harden Runner
-        uses: step-security/harden-runner@20cf305ff2072d973412fa9b1e3a4f227bda3c76 # v2.14.0
-        with:
-          egress-policy: audit
-
-      - name: Checkout
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
-        with:
-          # Shallow clone is enough; SETUPTOOLS_SCM_PRETEND_VERSION
-          # below avoids any git-tag resolution by setuptools-scm.
-          fetch-depth: 1
-
-      - name: Setup Python ${{ matrix.python }}
-        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
-        with:
-          python-version: ${{ matrix.python }}
-          cache: pip
-          cache-dependency-path: |
-            pyproject.toml
-            requirements/*.txt
-
-      - name: Install lmcache (CPU-only, common C++ ext, no vLLM)
-        env:
-          # NO_GPU_EXT=1: build common C++ extensions (native_storage_ops,
-          # lmcache_redis, lmcache_fs) but skip the CUDA / ROCm / SYCL
-          # backend. Mirrors build_cpu_artifacts.yml on the macOS axis.
-          NO_GPU_EXT: "1"
-          # Pin a synthetic version so setuptools-scm never inspects
-          # git tags. Defense-in-depth on top of pyproject.toml's
-          # tag_regex; harmless if tag_regex already filters tags.
-          SETUPTOOLS_SCM_PRETEND_VERSION: "0.0.0.dev0"
-        run: |
-          python -m pip install --upgrade pip
-          python -m pip install -r requirements/build.txt
-          # Pure-Python deps. macOS wheels exist for everything in
-          # common.txt; we explicitly skip requirements/cuda13_core.txt
-          # (the default core file picked by setup.py) since cupy and
-          # nixl are CUDA/Linux-only.
-          python -m pip install torch
-          python -m pip install -r requirements/common.txt
-          # CLI entry points eagerly import openai / matplotlib (e.g.
-          # lmcache.cli.commands.bench), so cli.txt is needed for the
-          # `lmcache --help` step inside the smoke script.
-          python -m pip install -r requirements/cli.txt
-          # Install lmcache itself with --no-deps so setup.py's
-          # install_requires (which appends cuda13_core.txt by default)
-          # cannot drag CUDA-only wheels onto macOS.
-          python -m pip install -e . --no-deps --no-build-isolation
-
-      - name: Run macOS smoke test
-        run: |
-          chmod +x .github/scripts/macos_smoke_test.sh
-          .github/scripts/macos_smoke_test.sh
-
-      - name: Upload server log on failure
-        if: failure()
-        uses: actions/upload-artifact@v4
-        with:
-          name: lmcache-server-log-${{ matrix.platform }}-py${{ matrix.python }}
-          path: ${{ env.LMCACHE_LOG_FILE }}
-          if-no-files-found: ignore

From bc245d96d640973a8259867471d73e92bc95ff7a Mon Sep 17 00:00:00 2001
From: Jinwoo Jeong <35993396+JinuJeong@users.noreply.github.com>
Date: Fri, 12 Jun 2026 23:38:29 +0900
Subject: [PATCH 56/57] fix(nixl): create storage directory if it doesn't exist
 (#3568)

Signed-off-by: Jinwoo Jeong <jwjeong@csl.korea.ac.kr>
---
 .../v1/distributed/l2_adapters/nixl_store_dynamic_l2_adapter.py  | 1 +
 lmcache/v1/distributed/l2_adapters/nixl_store_l2_adapter.py      | 1 +
 2 files changed, 2 insertions(+)

diff --git a/lmcache/v1/distributed/l2_adapters/nixl_store_dynamic_l2_adapter.py b/lmcache/v1/distributed/l2_adapters/nixl_store_dynamic_l2_adapter.py
index cb65fef5c1..91b17fe938 100644
--- a/lmcache/v1/distributed/l2_adapters/nixl_store_dynamic_l2_adapter.py
+++ b/lmcache/v1/distributed/l2_adapters/nixl_store_dynamic_l2_adapter.py
@@ -103,6 +103,7 @@ def __init__(
         self.backend_params = backend_params
         self.l1_align_bytes = l1_memory_desc.align_bytes
         self.file_path = backend_params["file_path"]
+        os.makedirs(self.file_path, exist_ok=True)
         self.use_direct_io = (
             str(backend_params.get("use_direct_io", "false")).lower() == "true"
         )
diff --git a/lmcache/v1/distributed/l2_adapters/nixl_store_l2_adapter.py b/lmcache/v1/distributed/l2_adapters/nixl_store_l2_adapter.py
index 17c783a82e..73ef11fe26 100644
--- a/lmcache/v1/distributed/l2_adapters/nixl_store_l2_adapter.py
+++ b/lmcache/v1/distributed/l2_adapters/nixl_store_l2_adapter.py
@@ -254,6 +254,7 @@ def init_storage_handlers_file(
             file_path: Directory where storage files are created.
             use_direct_io: Whether to open files with O_DIRECT.
         """
+        os.makedirs(file_path, exist_ok=True)
         if file_size % page_size != 0:
             raise ValueError(
                 f"file_size ({file_size}) must be a multiple of page_size ({page_size})"

From a2bb0419f9f62f82f7942a1b6a0d758b578df906 Mon Sep 17 00:00:00 2001
From: aeon-x <talexcao@gmail.com>
Date: Fri, 12 Jun 2026 11:08:53 -0700
Subject: [PATCH 57/57] fix comments

Signed-off-by: aeon-x <talexcao@gmail.com>
---
 docs/source/cli/coordinator.rst               |  14 ++-
 docs/source/cli/server.rst                    |   5 +
 docs/source/mp/coordinator.rst                | 109 +++++++++++++++++-
 lmcache/cli/commands/coordinator.py           |  29 +++++
 .../v1/mp_coordinator/l2/event_listener.py    |  14 ++-
 lmcache/v1/mp_coordinator/schemas.py          |   6 +
 lmcache/v1/multiprocess/http_server.py        |   1 +
 tests/v1/mp_coordinator/test_l2_api.py        |  36 ++++--
 8 files changed, 193 insertions(+), 21 deletions(-)

diff --git a/docs/source/cli/coordinator.rst b/docs/source/cli/coordinator.rst
index 66d1a60a2c..585a446908 100644
--- a/docs/source/cli/coordinator.rst
+++ b/docs/source/cli/coordinator.rst
@@ -40,15 +40,25 @@ Options
      - Seconds without a heartbeat after which an instance is evicted
        (default: ``30``).
    * - ``--health-check-interval SECS``
-     - Seconds between eviction sweeps; ``0`` disables the loop
+     - Seconds between health-check sweeps; ``0`` disables the loop
        (default: ``10``).
+   * - ``--eviction-check-interval SECS``
+     - Seconds between L2 eviction sweeps; ``0`` disables the loop
+       (default: ``5``).
+   * - ``--eviction-ratio RATIO``
+     - Fraction of tracked keys (by count) to evict per cycle, ``0.0`` to
+       ``1.0`` (default: ``0.2``).
+   * - ``--trigger-watermark RATIO``
+     - Eviction fires when usage reaches this fraction of the quota, ``0.0``
+       (exclusive) to ``1.0`` (default: ``1.0``).
 
 Configuration
 -------------
 
 Every flag is optional. Unset flags fall back to the
 ``LMCACHE_MP_COORDINATOR_*`` environment variables (``HOST``, ``PORT``,
-``INSTANCE_TIMEOUT``, ``HEALTH_CHECK_INTERVAL``), and then to the built-in
+``INSTANCE_TIMEOUT``, ``HEALTH_CHECK_INTERVAL``, ``EVICTION_CHECK_INTERVAL``,
+``EVICTION_RATIO``, ``TRIGGER_WATERMARK``), and then to the built-in
 defaults. A supplied flag always overrides the matching env-derived value, so
 env-only deployments keep working unchanged.
 
diff --git a/docs/source/cli/server.rst b/docs/source/cli/server.rst
index fe1381894a..f81f905a39 100644
--- a/docs/source/cli/server.rst
+++ b/docs/source/cli/server.rst
@@ -73,6 +73,11 @@ Commonly used flags include:
    * - ``--coordinator-heartbeat-interval SECONDS``
      - Seconds between heartbeats (``> 0``, default ``5``). Keep well below the
        coordinator's instance timeout.
+   * - ``--coordinator-l2-event-reporting``
+     - Enable reporting L2 store/lookup events to the coordinator for
+       fleet-wide usage tracking and quota-based eviction.
+   * - ``--coordinator-l2-event-flush-interval SECONDS``
+     - Seconds between L2 event batch flushes (``> 0``, default ``1``).
    * - ``--trace-level {storage}``
      - Enable storage-level trace recording (see :doc:`trace`).
    * - ``--trace-output PATH``
diff --git a/docs/source/mp/coordinator.rst b/docs/source/mp/coordinator.rst
index e9da70368d..36c4f07ce9 100644
--- a/docs/source/mp/coordinator.rst
+++ b/docs/source/mp/coordinator.rst
@@ -21,10 +21,11 @@ Expected log output:
 
     LMCache INFO: MP coordinator listening on http://0.0.0.0:9300
 
-The CLI accepts ``--host``, ``--port``, ``--instance-timeout``, and
-``--health-check-interval``; any flag overrides the matching environment
-variable below. See :doc:`/cli/coordinator` for details. Equivalently, the
-coordinator can still be launched as a module with
+The CLI accepts ``--host``, ``--port``, ``--instance-timeout``,
+``--health-check-interval``, ``--eviction-check-interval``,
+``--eviction-ratio``, and ``--trigger-watermark``; any flag overrides the
+matching environment variable below. See :doc:`/cli/coordinator` for details.
+Equivalently, the coordinator can still be launched as a module with
 ``python3 -m lmcache.v1.mp_coordinator``.
 
 Configuration
@@ -53,6 +54,16 @@ variables:
    * - ``LMCACHE_MP_COORDINATOR_HEALTH_CHECK_INTERVAL``
      - ``10``
      - Seconds between health-check sweeps. ``0`` disables eviction.
+   * - ``LMCACHE_MP_COORDINATOR_EVICTION_CHECK_INTERVAL``
+     - ``5``
+     - Seconds between L2 eviction sweeps. ``0`` disables the loop.
+   * - ``LMCACHE_MP_COORDINATOR_EVICTION_RATIO``
+     - ``0.2``
+     - Fraction of tracked keys (by count) to evict per cycle (0.0 to 1.0).
+   * - ``LMCACHE_MP_COORDINATOR_TRIGGER_WATERMARK``
+     - ``1.0``
+     - Eviction fires when usage reaches this fraction of the quota
+       (0.0 exclusive to 1.0).
 
 Connecting MP servers
 ---------------------
@@ -83,6 +94,13 @@ Kubernetes downward API); an explicit flag wins over the env var.
      - ``LMCACHE_COORDINATOR_HEARTBEAT_INTERVAL``
      - Seconds between heartbeats (must be ``> 0``, default ``5``). Keep it well
        below the coordinator's ``INSTANCE_TIMEOUT``.
+   * - ``--coordinator-l2-event-reporting``
+     - ``LMCACHE_COORDINATOR_L2_EVENT_REPORTING``
+     - Enable reporting L2 store/lookup events to the coordinator for
+       fleet-wide usage tracking and quota-based eviction.
+   * - ``--coordinator-l2-event-flush-interval``
+     - ``LMCACHE_COORDINATOR_L2_EVENT_FLUSH_INTERVAL``
+     - Seconds between L2 event batch flushes (must be ``> 0``, default ``1``).
 
 The server registers under its telemetry identity (``--service-instance-id`` /
 OTel ``service.instance.id``); if that is unset, the coordinator assigns an id.
@@ -106,4 +124,85 @@ Two read-only endpoints let you observe the coordinator:
 
     curl -s http://localhost:9300/healthz
     # -> {"status": "healthy"}
-    
\ No newline at end of file
+
+L2 usage tracking and eviction
+------------------------------
+
+When MP servers enable ``--coordinator-l2-event-reporting``, they stream L2
+store and lookup events to the coordinator. The coordinator aggregates
+per-``cache_salt`` usage, enforces quotas, and selects LRU keys to evict.
+
+Each event batch carries the server's ``instance_id`` and a monotonically
+increasing sequence number (``seq``) scoped to that instance. These fields
+enable future gap detection to identify lost batches.
+
+**Quota management** -- set per-``cache_salt`` byte budgets. Salts without a
+quota default to a 0-byte limit (allowlist semantics).
+
+.. code-block:: bash
+
+    # Set a 10 GiB quota for tenant "user-a"
+    curl -s -X PUT http://localhost:9300/l2/quota/user-a \
+        -H 'Content-Type: application/json' \
+        -d '{"limit_gb": 10.0}'
+    # -> {"cache_salt": "user-a", "limit_gb": 10.0, "status": "ok"}
+
+    # Remove the quota
+    curl -s -X DELETE http://localhost:9300/l2/quota/user-a
+    # -> {"cache_salt": "user-a", "limit_gb": 0.0, "status": "removed"}
+
+Use ``_default`` as the path parameter to target the empty-string salt.
+
+**Event ingestion** -- MP servers POST batched events; this is handled
+automatically by the event listener and is not typically called manually.
+
+.. code-block:: bash
+
+    curl -s -X POST http://localhost:9300/l2/events \
+        -H 'Content-Type: application/json' \
+        -d '{
+            "instance_id": "server-1",
+            "seq": 1,
+            "events": [
+                {"type": "store", "key": {"chunk_hash_hex": "aa", "model_name": "m", "kv_rank": 0, "cache_salt": "user-a"}, "bytes": 1024}
+            ]
+        }'
+    # -> {"recorded": 1}
+
+**Status queries** -- inspect usage and quota info.
+
+.. code-block:: bash
+
+    # Single salt
+    curl -s http://localhost:9300/l2/status/user-a
+    # -> {"cache_salt": "user-a", "quota_limit_gb": 10.0, "quota_exists": true, "usage_gb": 0.001}
+
+    # All salts
+    curl -s http://localhost:9300/l2/status
+    # -> {"total_gb": 0.005, "by_cache_salt": [...]}
+
+L2 endpoint summary
+~~~~~~~~~~~~~~~~~~~~
+
+.. list-table::
+   :header-rows: 1
+   :widths: 12 38 50
+
+   * - Method
+     - Path
+     - Description
+   * - ``PUT``
+     - ``/l2/quota/{cache_salt}``
+     - Create or update a quota (body: ``{"limit_gb": N}``).
+   * - ``DELETE``
+     - ``/l2/quota/{cache_salt}``
+     - Remove a salt's quota entry.
+   * - ``POST``
+     - ``/l2/events``
+     - Ingest a batch of L2 store/lookup events.
+   * - ``GET``
+     - ``/l2/status/{cache_salt}``
+     - Quota and usage for a single salt.
+   * - ``GET``
+     - ``/l2/status``
+     - Total usage and per-salt breakdown.
diff --git a/lmcache/cli/commands/coordinator.py b/lmcache/cli/commands/coordinator.py
index e27f8370bc..f5f58a1fa7 100644
--- a/lmcache/cli/commands/coordinator.py
+++ b/lmcache/cli/commands/coordinator.py
@@ -75,6 +75,32 @@ def add_arguments(self, parser: argparse.ArgumentParser) -> None:
                 "(default: 10)."
             ),
         )
+        parser.add_argument(
+            "--eviction-check-interval",
+            type=float,
+            default=None,
+            help=(
+                "Seconds between L2 eviction sweeps; 0 disables the loop (default: 5)."
+            ),
+        )
+        parser.add_argument(
+            "--eviction-ratio",
+            type=float,
+            default=None,
+            help=(
+                "Fraction of tracked keys (by count) to evict per cycle, "
+                "0.0 to 1.0 (default: 0.2)."
+            ),
+        )
+        parser.add_argument(
+            "--trigger-watermark",
+            type=float,
+            default=None,
+            help=(
+                "Eviction fires when usage reaches this fraction of the "
+                "quota, 0.0 (exclusive) to 1.0 (default: 1.0)."
+            ),
+        )
 
     def execute(self, args: argparse.Namespace) -> None:
         """Build the coordinator config and serve the app with uvicorn.
@@ -116,6 +142,9 @@ def execute(self, args: argparse.Namespace) -> None:
                 ("port", args.port),
                 ("instance_timeout", args.instance_timeout),
                 ("health_check_interval", args.health_check_interval),
+                ("eviction_check_interval", args.eviction_check_interval),
+                ("eviction_ratio", args.eviction_ratio),
+                ("trigger_watermark", args.trigger_watermark),
             )
             if value is not None
         }
diff --git a/lmcache/v1/mp_coordinator/l2/event_listener.py b/lmcache/v1/mp_coordinator/l2/event_listener.py
index 75285c55e0..fda2ed62c2 100644
--- a/lmcache/v1/mp_coordinator/l2/event_listener.py
+++ b/lmcache/v1/mp_coordinator/l2/event_listener.py
@@ -61,6 +61,7 @@ class L2EventListener(L2AdapterListener):
     Args:
         client: The HTTP client to send with.
         coordinator_url: Coordinator base URL (e.g. ``http://host:9300``).
+        instance_id: Identifier of this MP server (included in every batch).
         flush_interval: Seconds between flush attempts.
     """
 
@@ -68,11 +69,14 @@ def __init__(
         self,
         client: httpx.AsyncClient,
         coordinator_url: str,
+        instance_id: str,
         flush_interval: float = _DEFAULT_FLUSH_INTERVAL,
     ) -> None:
         self._client = client
         self._base_url = coordinator_url.rstrip("/")
+        self._instance_id = instance_id
         self._flush_interval = flush_interval
+        self._seq = 0
         self._lock = threading.Lock()
         self._buffer: list[UsageEvent] = []
 
@@ -122,8 +126,14 @@ async def _flush(self) -> None:
                 return
             batch = self._buffer
             self._buffer = []
-
-        body = ReportUsageRequest(events=batch)
+            self._seq += 1
+            seq = self._seq
+
+        body = ReportUsageRequest(
+            instance_id=self._instance_id,
+            seq=seq,
+            events=batch,
+        )
         try:
             resp = await self._client.post(
                 f"{self._base_url}/l2/events",
diff --git a/lmcache/v1/mp_coordinator/schemas.py b/lmcache/v1/mp_coordinator/schemas.py
index 880bb37877..b4ece79a90 100644
--- a/lmcache/v1/mp_coordinator/schemas.py
+++ b/lmcache/v1/mp_coordinator/schemas.py
@@ -134,9 +134,15 @@ class ReportUsageRequest(BaseModel):
     """Body of ``POST /l2/events``.
 
     Attributes:
+        instance_id: Identifier of the MP server that produced this batch.
+        seq: Monotonically increasing sequence number scoped to this
+            ``instance_id``. Starts at 1 for the first flush after the
+            server starts.
         events: Batch of store/lookup events to record.
     """
 
+    instance_id: str
+    seq: int = Field(ge=1)
     events: list[UsageEvent]
 
 
diff --git a/lmcache/v1/multiprocess/http_server.py b/lmcache/v1/multiprocess/http_server.py
index 1a3cb40741..6b7f35fadd 100644
--- a/lmcache/v1/multiprocess/http_server.py
+++ b/lmcache/v1/multiprocess/http_server.py
@@ -143,6 +143,7 @@ async def lifespan(app: FastAPI):
         coordinator_l2_event_client = L2EventListener(
             coordinator_client,
             coordinator_config.url,
+            instance_id=mp_config.instance_id,
             flush_interval=coordinator_config.l2_event_flush_interval,
         )
         if engine.storage_manager is not None:
diff --git a/tests/v1/mp_coordinator/test_l2_api.py b/tests/v1/mp_coordinator/test_l2_api.py
index 27b0ddf5a7..94ac109924 100644
--- a/tests/v1/mp_coordinator/test_l2_api.py
+++ b/tests/v1/mp_coordinator/test_l2_api.py
@@ -31,6 +31,15 @@ def _lookup(salt: str, **kw) -> dict:
     return {"type": "lookup", "key": _key(salt, **kw), "bytes": 0}
 
 
+_seq_counter = 0
+
+
+def _events_body(events: list[dict], instance_id: str = "test-server") -> dict:
+    global _seq_counter
+    _seq_counter += 1
+    return {"instance_id": instance_id, "seq": _seq_counter, "events": events}
+
+
 # -- Quota writes ------------------------------------------------------------
 
 
@@ -98,13 +107,13 @@ def test_report_store_events():
     with _client() as client:
         resp = client.post(
             "/l2/events",
-            json={
-                "events": [
+            json=_events_body(
+                [
                     _store("user-a", 1000, h="01"),
                     _store("user-a", 500, h="02"),
                     _store("user-b", 2000, h="03"),
                 ]
-            },
+            ),
         )
         assert resp.status_code == 200
         assert resp.json()["recorded"] == 3
@@ -120,7 +129,7 @@ def test_report_lookup_events_accepted():
     with _client() as client:
         resp = client.post(
             "/l2/events",
-            json={"events": [_lookup("user-a")]},
+            json=_events_body([_lookup("user-a")]),
         )
         assert resp.status_code == 200
         assert resp.json()["recorded"] == 1
@@ -128,7 +137,10 @@ def test_report_lookup_events_accepted():
 
 def test_empty_events_batch():
     with _client() as client:
-        resp = client.post("/l2/events", json={"events": []})
+        resp = client.post(
+            "/l2/events",
+            json=_events_body([]),
+        )
         assert resp.status_code == 200
         assert resp.json()["recorded"] == 0
 
@@ -137,7 +149,7 @@ def test_invalid_event_type_rejected():
     with _client() as client:
         resp = client.post(
             "/l2/events",
-            json={"events": [{"type": "delete", "key": _key("a"), "bytes": 100}]},
+            json=_events_body([{"type": "delete", "key": _key("a"), "bytes": 100}]),
         )
         assert resp.status_code == 422
 
@@ -146,7 +158,7 @@ def test_negative_bytes_rejected():
     with _client() as client:
         resp = client.post(
             "/l2/events",
-            json={"events": [{"type": "store", "key": _key("a"), "bytes": -1}]},
+            json=_events_body([{"type": "store", "key": _key("a"), "bytes": -1}]),
         )
         assert resp.status_code == 422
 
@@ -159,7 +171,7 @@ def test_status_single_salt():
         client.put("/l2/quota/user-a", json={"limit_gb": 2.5})
         client.post(
             "/l2/events",
-            json={"events": [_store("user-a", 1000)]},
+            json=_events_body([_store("user-a", 1000)]),
         )
         data = client.get("/l2/status/user-a").json()
         assert data["cache_salt"] == "user-a"
@@ -181,12 +193,12 @@ def test_status_list():
         client.put("/l2/quota/a", json={"limit_gb": 1.0})
         client.post(
             "/l2/events",
-            json={
-                "events": [
+            json=_events_body(
+                [
                     _store("a", 100, h="01"),
                     _store("b", 200, h="02"),
                 ]
-            },
+            ),
         )
         data = client.get("/l2/status").json()
         assert abs(data["total_gb"] - 300 / 1024**3) < 1e-12
@@ -221,7 +233,7 @@ def test_default_salt_sentinel():
         client.put("/l2/quota/_default", json={"limit_gb": 3.0})
         client.post(
             "/l2/events",
-            json={"events": [_store("", 500)]},
+            json=_events_body([_store("", 500)]),
         )
         data = client.get("/l2/status/_default").json()
         assert data["cache_salt"] == ""