aeon-x · aeon-x · Jun 7, 2026 · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026
diff --git a/.buildkite/k3_tests/multiprocess/pipeline.yml b/.buildkite/k3_tests/multiprocess/pipeline.yml
@@ -26,40 +26,6 @@ steps:
                   - { name: hf-cache, hostPath: { path: /data/huggingface, type: DirectoryOrCreate } }
         artifact_paths: ["*.log"]
 
-      # HMA (hybrid memory allocator) correctness check on google/gemma-4-31B-it.
-      # It interleaves sliding + full attention whose full layers use a larger
-      # head_dim (512 vs 256), so vLLM gives its KV cache groups different block
-      # sizes -- exercising LMCache's per-group block-size handling. Runs gsm8k,
-      # resets vLLM's local prefix cache (LMCache preserved), reruns, and asserts
-      # the scores match (run1 == run2 == no-LMCache baseline). Needs 2 GPUs
-      # (LMCache+vLLM + baseline). It is public (no HF_TOKEN), forces TRITON_ATTN
-      # (so ATTENTION_BACKEND=auto and a non-zero SCORE_TOLERANCE, since
-      # TRITON_ATTN is not bit-exact under batch invariance), and its ~63GB of
-      # weights need a higher GPU_MEMORY_UTILIZATION than the 0.5 default.
-      - label: ":compression: hma_lm_eval_gemma4"
-        command: .buildkite/k3_tests/multiprocess/run.sh hma_lm_eval_gemma4
-        timeout_in_minutes: 60
-        env:
-          MODEL: "google/gemma-4-31B-it"
-          SCORE_TOLERANCE: "0.05"
-          ATTENTION_BACKEND: "auto"
-          GPU_MEMORY_UTILIZATION: "0.85"
-          # Skip CUDA-graph capture so the large model doesn't time out at launch
-          # (safe here: this test uses a tolerance, not the bit-exact check).
-          ENFORCE_EAGER: "1"
-          # 31B weights are large; allow longer for download + load before the
-          # readiness probe gives up (other models keep the 300s default).
-          MAX_WAIT_SECONDS: "400"
-          # LIMIT = number of gsm8k samples. 31B's large per-token KV makes the
-          # full 200-sample working set overflow the CPU pool and thrash, so run
-          # 2 misses LMCache; cap the samples and enlarge the pool to keep run 2
-          # cache-served. CPU_BUFFER_SIZE (GB) is bounded by node RAM.
-          LIMIT: "100"
-          CPU_BUFFER_SIZE: "200"
-        agents: { queue: "k8s" }
-        plugins: [{ kubernetes: { podSpec: *pod-2gpu } }]
-        artifact_paths: ["*.log"]
-
       - label: ":compression: long_doc_qa"
         command: .buildkite/k3_tests/multiprocess/run.sh long_doc_qa
         timeout_in_minutes: 30
@@ -99,6 +65,30 @@ steps:
                 volumes: *vols
         artifact_paths: ["*.log"]
 
+      # HMA correctness check on google/gemma-4-31B-it (a hybrid model whose KV
+      # cache groups get different block sizes). Runs gsm8k, resets vLLM's prefix
+      # cache (LMCache preserved), reruns served by LMCache, and asserts the two
+      # runs' scores match. Single GPU, no baseline.
+      - label: ":compression: hma_lm_eval_gemma4"
+        command: .buildkite/k3_tests/multiprocess/run.sh hma_lm_eval_gemma4
+        timeout_in_minutes: 60
+        env:
+          MODEL: "google/gemma-4-31B-it"
+          # Require an exact score match between the two runs.
+          SCORE_TOLERANCE: "0"  # NOTE(review): 2-GPU variant used "0.05" -- confirm 0 is stable
+          ATTENTION_BACKEND: "auto"
+          GPU_MEMORY_UTILIZATION: "0.85"  # ~63GB of weights; the 0.5 default is too low
+          # 31B load + CUDA-graph capture is slow; raise the readiness timeout
+          # above the 300s default.
+          MAX_WAIT_SECONDS: "600"
+          # Cap samples and enlarge the CPU pool so the retrieve run stays
+          # cache-served (31B's per-token KV is large).
+          LIMIT: "100"  # number of gsm8k samples
+          CPU_BUFFER_SIZE: "200"  # GB; bounded by node RAM
+        agents: { queue: "k8s" }
+        plugins: [{ kubernetes: { podSpec: *pod-1gpu } }]
+        artifact_paths: ["*.log"]
+
       - label: ":compression: fault_tolerance"
         command: .buildkite/k3_tests/multiprocess/run.sh fault_tolerance
         timeout_in_minutes: 30
@@ -127,10 +117,32 @@ steps:
         plugins: [{ kubernetes: { podSpec: *pod-1gpu } }]
         artifact_paths: ["*.log"]
 
+      - label: ":compression: gds_smoke_test"
+        command: .buildkite/k3_tests/multiprocess/run.sh gds_smoke_test
+        timeout_in_minutes: 30
+        agents: { queue: "k8s" }
+        plugins:
+          - kubernetes:
+              podSpec:
+                containers:
+                  - name: container-0
+                    image: lmcache/ci-base:latest
+                    imagePullPolicy: Never
+                    resources: { limits: { "nvidia.com/gpu": "1" } }
+                    volumeMounts:  # /scratch holds the GDS slab dir, so it must be host-local NVMe
+                      - { name: hf-cache, mountPath: /root/.cache/huggingface }
+                      - { name: scratch, mountPath: /scratch }
+                      - { name: udev, mountPath: /run/udev, readOnly: true }
+                volumes:  # NOTE(review): udev is presumably for cuFile device lookup -- confirm
+                  - { name: hf-cache, hostPath: { path: /data/huggingface, type: DirectoryOrCreate } }
+                  - { name: scratch, hostPath: { path: /data/gds-scratch, type: DirectoryOrCreate } }
+                  - { name: udev, hostPath: { path: /run/udev, type: Directory } }
+        artifact_paths: ["*.log"]
+
   - group: ":compression: Multiprocess (CPU-only)"
     steps:
       - label: ":compression: cpu_e2e_validation (shm)"
-        command: bash .buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh
+        command: bash .github/scripts/run-cpu-e2e-validation.sh
         timeout_in_minutes: 30
         agents: { queue: "k8s" }
         plugins:
@@ -155,7 +167,7 @@ steps:
                   - { name: dshm, emptyDir: { medium: Memory, sizeLimit: 4Gi } }
 
       - label: ":compression: cpu_e2e_validation (pickle)"
-        command: bash .buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh
+        command: bash .github/scripts/run-cpu-e2e-validation.sh
         env:
           LMCACHE_SHM_NAME: ""
         timeout_in_minutes: 30
@@ -180,3 +192,30 @@ steps:
                 volumes:
                   - { name: hf-cache, hostPath: { path: /data/huggingface, type: DirectoryOrCreate } }
                   - { name: dshm, emptyDir: { medium: Memory, sizeLimit: 4Gi } }
+
+      - label: ":compression: cpu_e2e_validation (server-side copy)"
+        command: bash .github/scripts/run-cpu-e2e-validation.sh  # same script as the shm/pickle variants
+        env:
+          LMCACHE_MP_TRANSFER_MODE: "handle"  # NOTE(review): presumably selects server-side copy -- confirm
+        timeout_in_minutes: 30
+        agents: { queue: "k8s" }
+        plugins:
+          - kubernetes:
+              podSpec:
+                containers:
+                  - name: container-0
+                    image: lmcache/ci-base:latest
+                    imagePullPolicy: Never
+                    resources:  # CPU-only: no nvidia.com/gpu limit is requested
+                      requests:
+                        cpu: "8"
+                        memory: "256Gi"
+                      limits:
+                        cpu: "8"
+                        memory: "256Gi"
+                    volumeMounts:
+                      - { name: hf-cache, mountPath: /root/.cache/huggingface }
+                      - { name: dshm, mountPath: /dev/shm }
+                volumes:
+                  - { name: hf-cache, hostPath: { path: /data/huggingface, type: DirectoryOrCreate } }
+                  - { name: dshm, emptyDir: { medium: Memory, sizeLimit: 4Gi } }
diff --git a/.buildkite/k3_tests/multiprocess/run.sh b/.buildkite/k3_tests/multiprocess/run.sh
@@ -3,6 +3,7 @@
 # Usage: run.sh <test_name>
 #   test_name: lm_eval | hma_lm_eval_gemma4 | vllm_bench | long_doc_qa
 #              | long_doc_qa_l2 | fault_tolerance | deadlock | restart_recovery
+#              | gds_smoke_test
 # Thin wrapper: sets up environment, then delegates to scripts/.
 # No Docker -- all processes run natively in the pod.
 set -euo pipefail

diff --git a/.buildkite/k3_tests/multiprocess/scripts/cleanup.sh b/.buildkite/k3_tests/multiprocess/scripts/cleanup.sh
@@ -28,6 +28,14 @@ for port in "${VLLM_PORT:-8000}" "${VLLM_BASELINE_PORT:-9000}" "${LMCACHE_PORT:-
     fuser -k "${port}/tcp" 2>/dev/null || true
 done
 
+# Remove the GDS slab scratch dir (only set for gds_* tests). It lives on the
+# /scratch hostPath (host-local NVMe), so it outlives the pod and the slab is
+# large -- drop it now the server is stopped; warn (not fail) if removal fails.
+if [[ -n "${GDS_L1_PATH:-}" ]]; then
+    echo "Removing GDS slab dir: $GDS_L1_PATH"
+    rm -rf -- "${GDS_L1_PATH}" || echo "WARNING: failed to remove GDS slab dir: $GDS_L1_PATH" >&2
+fi
+
 echo "=== Cleanup complete ==="
 
 # Copy server logs to the workspace so Buildkite can collect them as artifacts

diff --git a/.buildkite/k3_tests/multiprocess/scripts/launch-processes.sh b/.buildkite/k3_tests/multiprocess/scripts/launch-processes.sh
@@ -17,9 +17,9 @@ MAX_WORKERS="${MAX_WORKERS:-4}"
 MODEL="${MODEL:-Qwen/Qwen3-14B}"
 BUILD_ID="${BUILD_ID:-local_$$}"
 
-# K8s assigns exactly 2 GPUs as devices 0 and 1
-GPU_FOR_VLLM=0
-GPU_FOR_BASELINE=1
+# Defaults assume the K8s 2-GPU pod (devices 0 and 1); overridable for local runs.
+GPU_FOR_VLLM="${GPU_FOR_VLLM:-0}"
+GPU_FOR_BASELINE="${GPU_FOR_BASELINE:-1}"
 echo "Using GPU $GPU_FOR_VLLM for vLLM with LMCache"
 echo "Using GPU $GPU_FOR_BASELINE for vLLM baseline"
 
@@ -68,6 +68,27 @@ fi
 MAX_MODEL_LEN="${MAX_MODEL_LEN:-auto}"
 MAX_MODEL_LEN_ARG="--max-model-len ${MAX_MODEL_LEN}"
 
+# LMCache server chunk size in tokens. Empty -> server default.
+CHUNK_SIZE_ARG=""  # expanded unquoted in the server command so it splits into flag + value
+if [ -n "${CHUNK_SIZE:-}" ]; then
+    CHUNK_SIZE_ARG="--chunk-size ${CHUNK_SIZE}"
+fi
+
+# vLLM batch-invariant mode. On by default; GDN/Mamba backends do not support it.
+BATCH_INVARIANT="${BATCH_INVARIANT:-1}"  # passed to vLLM as VLLM_BATCH_INVARIANT
+
+# Mamba KV cache mode + prefix caching, set only for hybrid Mamba models.
+MAMBA_ARGS=""  # appended unquoted to the "vllm serve" command line
+if [ -n "${MAMBA_CACHE_MODE:-}" ]; then
+    MAMBA_ARGS="--mamba-cache-mode ${MAMBA_CACHE_MODE} --enable-prefix-caching"
+fi
+
+# Max tokens per scheduler step. Empty -> vLLM default.
+MAX_NUM_BATCHED_TOKENS_ARG=""  # appended unquoted to the "vllm serve" command line
+if [ -n "${MAX_NUM_BATCHED_TOKENS:-}" ]; then
+    MAX_NUM_BATCHED_TOKENS_ARG="--max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS}"
+fi
+
 # Store PIDs in a file so cleanup.sh can find them
 PID_FILE="/tmp/lmcache_mp_pids_${BUILD_ID}"
 > "$PID_FILE"
@@ -76,12 +97,24 @@ PID_FILE="/tmp/lmcache_mp_pids_${BUILD_ID}"
 echo "=== Launching LMCache MP server ==="
 echo "Port: $LMCACHE_PORT"
 
+# Optional GDS L1 slab tier (gds_* tests). When GDS_L1_PATH is set, the L1
+# medium becomes an NVMe slab accessed via cuFile DMA instead of pinned DRAM;
+# --l1-size-gb then sizes the slab. The path must be on a GDS-capable
+# filesystem (local NVMe), provided by the /scratch hostPath mount.
+GDS_L1_ARG=""  # expanded unquoted below, so the path must not contain whitespace
+if [ -n "${GDS_L1_PATH:-}" ]; then
+    echo "GDS L1 tier enabled; slab directory: $GDS_L1_PATH"
+    GDS_L1_ARG="--gds-l1-path ${GDS_L1_PATH}"
+fi
+
 CUDA_VISIBLE_DEVICES="${GPU_FOR_VLLM}" \
 lmcache server \
     --l1-size-gb "$CPU_BUFFER_SIZE" \
     --eviction-policy LRU \
     --max-workers "$MAX_WORKERS" \
+    $CHUNK_SIZE_ARG \
     --port "$LMCACHE_PORT" \
+    ${GDS_L1_ARG} \
     > "/tmp/build_${BUILD_ID}_lmcache.log" 2>&1 &
 
 LMCACHE_PID=$!
@@ -105,7 +138,7 @@ echo "Port: $vllm_port"
 CUDA_VISIBLE_DEVICES="${GPU_FOR_VLLM}" \
 VLLM_ENABLE_V1_MULTIPROCESSING=0 \
 VLLM_SERVER_DEV_MODE=1 \
-VLLM_BATCH_INVARIANT=1 \
+VLLM_BATCH_INVARIANT=${BATCH_INVARIANT} \
 PYTHONHASHSEED=0 \
 vllm serve "$MODEL" \
     --kv-transfer-config "{\"kv_connector\":\"LMCacheMPConnector\", \"kv_role\":\"kv_both\", \"kv_load_failure_policy\": \"recompute\", \"kv_connector_extra_config\": {\"lmcache.mp.port\": $LMCACHE_PORT, \"lmcache.mp.mq_timeout\": 10}}" \
@@ -115,6 +148,8 @@ vllm serve "$MODEL" \
     $MAX_MODEL_LEN_ARG \
     $ENFORCE_EAGER_ARG \
     $GPU_MEMORY_UTIL_ARG \
+    $MAMBA_ARGS \
+    $MAX_NUM_BATCHED_TOKENS_ARG \
     > "/tmp/build_${BUILD_ID}_vllm.log" 2>&1 &
 
 VLLM_PID=$!

diff --git a/.buildkite/k3_tests/multiprocess/scripts/run-gds-smoke.sh b/.buildkite/k3_tests/multiprocess/scripts/run-gds-smoke.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+# GDS L1 smoke test. Sends a few completions (cold) to store KV to the slab,
+# resets vLLM's prefix cache, then re-sends them (warm) to read the KV back from
+# LMCache/GDS. Passes if every request returns HTTP 200, a real LMCache retrieve
+# happened, and the warm (GDS-retrieved) outputs match the cold (recomputed)
+# ones -- i.e. the GDS store/retrieve path works and is correct.
+#
+# Expects the GDS-enabled LMCache server + vLLM to already be running, with
+# VLLM_SERVER_DEV_MODE=1 (for /reset_prefix_cache).
+set -e
+set -o pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"
+source "${REPO_ROOT}/.buildkite/k3_tests/common_scripts/helpers.sh"
+
+VLLM_PORT="${VLLM_PORT:-8000}"
+MODEL="${MODEL:-Qwen/Qwen3-14B}"
+BUILD_ID="${BUILD_ID:-local_$$}"  # NOTE(review): $$ differs per process; export BUILD_ID for local runs
+LMCACHE_LOG="/tmp/build_${BUILD_ID}_lmcache.log"  # path launch-processes.sh redirects the server to
+N_PROMPTS="${GDS_SMOKE_PROMPTS:-4}"
+OUT_DIR="$(mktemp -d)"  # holds the captured cold/warm completions
+trap 'rm -rf "$OUT_DIR"' EXIT
+
+# Emit a prompt long enough that each request stores at least one LMCache chunk.
+build_prompt() {  # $1 = unique id
+    local sentence="The key-value cache stores attention keys and values across transformer layers. "
+    local text="" n=0
+    while [ "$n" -lt 80 ]; do text+="$sentence"; n=$((n + 1)); done
+    printf 'Document %s. %s' "$1" "$text"
+}
+
+# Send N_PROMPTS completions; capture each generated text to
+# $OUT_DIR/<label>_<i>.txt and require every request to return HTTP 200.
+send_batch() {  # $1 = phase label (cold|warm); returns 1 unless all requests got HTTP 200
+    local label="$1" ok=0 i prompt payload resp http body
+    for i in $(seq 1 "$N_PROMPTS"); do
+        prompt="$(build_prompt "$i")"
+        payload=$(python3 -c 'import json,sys; print(json.dumps({"model":sys.argv[1],"prompt":sys.argv[2],"max_tokens":16,"temperature":0}))' "$MODEL" "$prompt")
+        resp=$(curl -s -w $'\n%{http_code}' \
+            "http://127.0.0.1:${VLLM_PORT}/v1/completions" \
+            -H "Content-Type: application/json" -d "$payload") || true  # set -e: report below, don't abort
+        http="${resp##*$'\n'}"  # last line = status code appended by -w
+        body="${resp%$'\n'*}"   # everything before it = response body
+        printf '%s' "$body" \
+            | python3 -c 'import json,sys; print(json.load(sys.stdin)["choices"][0]["text"])' \
+            > "${OUT_DIR}/${label}_${i}.txt" 2>/dev/null \
+            || echo "<no-output:${label}>" > "${OUT_DIR}/${label}_${i}.txt"  # phase-tagged: never "matches"
+        echo "  [$label] req $i -> HTTP $http"
+        [ "$http" = "200" ] && ok=$((ok + 1))
+    done
+    [ "$ok" -eq "$N_PROMPTS" ] || { echo "[$label] only $ok/$N_PROMPTS returned HTTP 200"; return 1; }
+}
+
+count_retrieves() {  # prints the count of "Retrieved" lines in the server log; 0 if none/no log
+    local n=0
+    [ -f "$LMCACHE_LOG" ] && n=$(grep -c "Retrieved" "$LMCACHE_LOG" 2>/dev/null || true)
+    echo "${n:-0}"  # always an integer, so the caller's -le comparison cannot error out
+}
+
+echo "============================================"
+echo "=== GDS smoke: phase 1 (cold -> store KV to the GDS slab) ==="
+echo "============================================"
+send_batch cold
+echo "Waiting for async stores to drain to the LMCache server..."
+sleep 3  # fixed grace period; store completion is not polled
+retrieves_before=$(count_retrieves)  # baseline: only warm-phase retrieves should count
+
+echo "============================================"
+echo "=== Reset vLLM prefix cache (force warm requests through LMCache/GDS) ==="
+echo "============================================"
+reset_code=$(curl -s -o /dev/null -w "%{http_code}" -X POST \
+    "http://127.0.0.1:${VLLM_PORT}/reset_prefix_cache") || true  # set -e: report below, don't abort
+if [ "$reset_code" != "200" ]; then
+    echo "reset_prefix_cache failed (HTTP $reset_code); is VLLM_SERVER_DEV_MODE=1?"
+    exit 1
+fi
+sleep 2
+
+echo "============================================"
+echo "=== GDS smoke: phase 2 (warm -> retrieve KV from the GDS slab) ==="
+echo "============================================"
+send_batch warm
+retrieves_after=$(count_retrieves)
+
+# 1. A real GDS retrieve must have happened (else warm recomputed / hit the APC).
+echo ""
+echo "LMCache retrieves logged: before=${retrieves_before} after=${retrieves_after}"
+if [ "$retrieves_after" -le "$retrieves_before" ]; then  # counts are "Retrieved" lines in the server log
+    echo "GDS smoke FAILED: no LMCache retrieve recorded -- the GDS read path was"
+    echo "not exercised (warm requests recomputed or hit vLLM's prefix cache)."
+    exit 1
+fi
+
+# 2. Decoding is deterministic, so completions served from the GDS slab must be
+#    byte-identical to the ones produced by the cold recompute.
+echo "=== Verifying warm (GDS-retrieved) outputs match cold (recomputed) ==="
+n_diff=0
+for idx in $(seq 1 "$N_PROMPTS"); do
+    if ! diff -q "${OUT_DIR}/cold_${idx}.txt" "${OUT_DIR}/warm_${idx}.txt" >/dev/null 2>&1; then
+        echo "  prompt $idx: MISMATCH"
+        n_diff=$((n_diff + 1))
+        continue
+    fi
+    echo "  prompt $idx: match"
+done
+if [ "$n_diff" -gt 0 ]; then
+    echo "GDS smoke FAILED: ${n_diff}/${N_PROMPTS} warm outputs differ from cold"
+    echo "-- the KV retrieved from the GDS slab is incorrect."
+    exit 1
+fi
+
+echo "=== GDS smoke test passed: GDS store + retrieve path works and is correct ==="