diff --git a/.buildkite/k3_tests/multiprocess/pipeline.yml b/.buildkite/k3_tests/multiprocess/pipeline.yml
index b9fa979b86..e59b236f4d 100644
--- a/.buildkite/k3_tests/multiprocess/pipeline.yml
+++ b/.buildkite/k3_tests/multiprocess/pipeline.yml
@@ -26,40 +26,6 @@ steps:
                   - { name: hf-cache, hostPath: { path: /data/huggingface, type: DirectoryOrCreate } }
         artifact_paths: ["*.log"]
 
-      # HMA (hybrid memory allocator) correctness check on google/gemma-4-31B-it.
-      # It interleaves sliding + full attention whose full layers use a larger
-      # head_dim (512 vs 256), so vLLM gives its KV cache groups different block
-      # sizes -- exercising LMCache's per-group block-size handling. Runs gsm8k,
-      # resets vLLM's local prefix cache (LMCache preserved), reruns, and asserts
-      # the scores match (run1 == run2 == no-LMCache baseline). Needs 2 GPUs
-      # (LMCache+vLLM + baseline). It is public (no HF_TOKEN), forces TRITON_ATTN
-      # (so ATTENTION_BACKEND=auto and a non-zero SCORE_TOLERANCE, since
-      # TRITON_ATTN is not bit-exact under batch invariance), and its ~63GB of
-      # weights need a higher GPU_MEMORY_UTILIZATION than the 0.5 default.
-      - label: ":compression: hma_lm_eval_gemma4"
-        command: .buildkite/k3_tests/multiprocess/run.sh hma_lm_eval_gemma4
-        timeout_in_minutes: 60
-        env:
-          MODEL: "google/gemma-4-31B-it"
-          SCORE_TOLERANCE: "0.05"
-          ATTENTION_BACKEND: "auto"
-          GPU_MEMORY_UTILIZATION: "0.85"
-          # Skip CUDA-graph capture so the large model doesn't time out at launch
-          # (safe here: this test uses a tolerance, not the bit-exact check).
-          ENFORCE_EAGER: "1"
-          # 31B weights are large; allow longer for download + load before the
-          # readiness probe gives up (other models keep the 300s default).
-          MAX_WAIT_SECONDS: "400"
-          # LIMIT = number of gsm8k samples. 31B's large per-token KV makes the
-          # full 200-sample working set overflow the CPU pool and thrash, so run
-          # 2 misses LMCache; cap the samples and enlarge the pool to keep run 2
-          # cache-served. CPU_BUFFER_SIZE (GB) is bounded by node RAM.
-          LIMIT: "100"
-          CPU_BUFFER_SIZE: "200"
-        agents: { queue: "k8s" }
-        plugins: [{ kubernetes: { podSpec: *pod-2gpu } }]
-        artifact_paths: ["*.log"]
-
       - label: ":compression: long_doc_qa"
         command: .buildkite/k3_tests/multiprocess/run.sh long_doc_qa
         timeout_in_minutes: 30
@@ -99,6 +65,30 @@ steps:
                 volumes: *vols
         artifact_paths: ["*.log"]
 
+      # HMA correctness check on google/gemma-4-31B-it (a hybrid model whose KV
+      # cache groups get different block sizes). Runs gsm8k, resets vLLM's prefix
+      # cache (LMCache preserved), reruns served by LMCache, and asserts the two
+      # runs' scores match. Single GPU, no baseline.
+      - label: ":compression: hma_lm_eval_gemma4"
+        command: .buildkite/k3_tests/multiprocess/run.sh hma_lm_eval_gemma4
+        timeout_in_minutes: 60
+        env:
+          MODEL: "google/gemma-4-31B-it"
+          # Require an exact score match between the two runs.
+          SCORE_TOLERANCE: "0"
+          ATTENTION_BACKEND: "auto"
+          GPU_MEMORY_UTILIZATION: "0.85"
+          # 31B load + CUDA-graph capture is slow; raise the readiness timeout
+          # above the 300s default.
+          MAX_WAIT_SECONDS: "600"
+          # Cap samples and enlarge the CPU pool so the retrieve run stays
+          # cache-served (31B's per-token KV is large).
+          LIMIT: "100"
+          CPU_BUFFER_SIZE: "200"
+        agents: { queue: "k8s" }
+        plugins: [{ kubernetes: { podSpec: *pod-1gpu } }]
+        artifact_paths: ["*.log"]
+
       - label: ":compression: fault_tolerance"
         command: .buildkite/k3_tests/multiprocess/run.sh fault_tolerance
         timeout_in_minutes: 30
@@ -127,10 +117,32 @@ steps:
         plugins: [{ kubernetes: { podSpec: *pod-1gpu } }]
         artifact_paths: ["*.log"]
 
+      - label: ":compression: gds_smoke_test"
+        command: .buildkite/k3_tests/multiprocess/run.sh gds_smoke_test
+        timeout_in_minutes: 30
+        agents: { queue: "k8s" }
+        plugins:
+          - kubernetes:
+              podSpec:
+                containers:
+                  - name: container-0
+                    image: lmcache/ci-base:latest
+                    imagePullPolicy: Never
+                    resources: { limits: { "nvidia.com/gpu": "1" } }
+                    volumeMounts:
+                      - { name: hf-cache, mountPath: /root/.cache/huggingface }
+                      - { name: scratch, mountPath: /scratch }
+                      - { name: udev, mountPath: /run/udev, readOnly: true }
+                volumes:
+                  - { name: hf-cache, hostPath: { path: /data/huggingface, type: DirectoryOrCreate } }
+                  - { name: scratch, hostPath: { path: /data/gds-scratch, type: DirectoryOrCreate } }
+                  - { name: udev, hostPath: { path: /run/udev, type: Directory } }
+        artifact_paths: ["*.log"]
+
   - group: ":compression: Multiprocess (CPU-only)"
     steps:
       - label: ":compression: cpu_e2e_validation (shm)"
-        command: bash .buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh
+        command: bash .github/scripts/run-cpu-e2e-validation.sh
         timeout_in_minutes: 30
         agents: { queue: "k8s" }
         plugins:
@@ -155,7 +167,7 @@ steps:
                   - { name: dshm, emptyDir: { medium: Memory, sizeLimit: 4Gi } }
 
       - label: ":compression: cpu_e2e_validation (pickle)"
-        command: bash .buildkite/k3_tests/multiprocess/scripts/run-cpu-e2e-validation.sh
+        command: bash .github/scripts/run-cpu-e2e-validation.sh
         env:
           LMCACHE_SHM_NAME: ""
         timeout_in_minutes: 30
@@ -180,3 +192,30 @@ steps:
                 volumes:
                   - { name: hf-cache, hostPath: { path: /data/huggingface, type: DirectoryOrCreate } }
                   - { name: dshm, emptyDir: { medium: Memory, sizeLimit: 4Gi } }
+
+      - label: ":compression: cpu_e2e_validation (server-side copy)"
+        command: bash .github/scripts/run-cpu-e2e-validation.sh
+        env:
+          LMCACHE_MP_TRANSFER_MODE: "handle"
+        timeout_in_minutes: 30
+        agents: { queue: "k8s" }
+        plugins:
+          - kubernetes:
+              podSpec:
+                containers:
+                  - name: container-0
+                    image: lmcache/ci-base:latest
+                    imagePullPolicy: Never
+                    resources:
+                      requests:
+                        cpu: "8"
+                        memory: "256Gi"
+                      limits:
+                        cpu: "8"
+                        memory: "256Gi"
+                    volumeMounts:
+                      - { name: hf-cache, mountPath: /root/.cache/huggingface }
+                      - { name: dshm, mountPath: /dev/shm }
+                volumes:
+                  - { name: hf-cache, hostPath: { path: /data/huggingface, type: DirectoryOrCreate } }
+                  - { name: dshm, emptyDir: { medium: Memory, sizeLimit: 4Gi } }
diff --git a/.buildkite/k3_tests/multiprocess/run.sh b/.buildkite/k3_tests/multiprocess/run.sh
index 62e2e9b0ba..369c90bec5 100755
--- a/.buildkite/k3_tests/multiprocess/run.sh
+++ b/.buildkite/k3_tests/multiprocess/run.sh
@@ -3,6 +3,7 @@
 # Usage: run.sh
 # test_name: lm_eval | hma_lm_eval_gemma4 | vllm_bench | long_doc_qa
 # | long_doc_qa_l2 | fault_tolerance | deadlock | restart_recovery
+# | gds_smoke_test
 # Thin wrapper: sets up environment, then delegates to scripts/.
 # No Docker -- all processes run natively in the pod.
 set -euo pipefail
diff --git a/.buildkite/k3_tests/multiprocess/scripts/cleanup.sh b/.buildkite/k3_tests/multiprocess/scripts/cleanup.sh
index 4ae44b79e9..2a6160118f 100755
--- a/.buildkite/k3_tests/multiprocess/scripts/cleanup.sh
+++ b/.buildkite/k3_tests/multiprocess/scripts/cleanup.sh
@@ -28,6 +28,14 @@ for port in "${VLLM_PORT:-8000}" "${VLLM_BASELINE_PORT:-9000}" "${LMCACHE_PORT:-
     fuser -k "${port}/tcp" 2>/dev/null || true
 done
 
+# Remove the GDS slab scratch dir (only set for gds_* tests). It lives on the
+# /scratch hostPath (host-local NVMe), so it persists past the pod and the
+# preallocated slab is large -- drop it now that the server is stopped.
+if [[ -n "${GDS_L1_PATH:-}" ]]; then
+    echo "Removing GDS slab dir: $GDS_L1_PATH"
+    rm -rf -- "${GDS_L1_PATH}" || echo "WARNING: could not remove GDS slab dir: ${GDS_L1_PATH}" >&2
+fi
+
 echo "=== Cleanup complete ==="
 
 # Copy server logs to the workspace so Buildkite can collect them as artifacts
diff --git a/.buildkite/k3_tests/multiprocess/scripts/launch-processes.sh b/.buildkite/k3_tests/multiprocess/scripts/launch-processes.sh
index c0b52ae605..4d70634298 100755
--- a/.buildkite/k3_tests/multiprocess/scripts/launch-processes.sh
+++ b/.buildkite/k3_tests/multiprocess/scripts/launch-processes.sh
@@ -17,9 +17,9 @@ MAX_WORKERS="${MAX_WORKERS:-4}"
 MODEL="${MODEL:-Qwen/Qwen3-14B}"
 BUILD_ID="${BUILD_ID:-local_$$}"
 
-# K8s assigns exactly 2 GPUs as devices 0 and 1
-GPU_FOR_VLLM=0
-GPU_FOR_BASELINE=1
+# K8s assigns exactly 2 GPUs as devices 0 and 1 (overridable for local runs).
+GPU_FOR_VLLM="${GPU_FOR_VLLM:-0}"
+GPU_FOR_BASELINE="${GPU_FOR_BASELINE:-1}"
 
 echo "Using GPU $GPU_FOR_VLLM for vLLM with LMCache"
 echo "Using GPU $GPU_FOR_BASELINE for vLLM baseline"
@@ -68,6 +68,27 @@ fi
 MAX_MODEL_LEN="${MAX_MODEL_LEN:-auto}"
 MAX_MODEL_LEN_ARG="--max-model-len ${MAX_MODEL_LEN}"
 
+# LMCache server chunk size in tokens. Empty -> server default.
+CHUNK_SIZE_ARG=""
+if [ -n "${CHUNK_SIZE:-}" ]; then
+    CHUNK_SIZE_ARG="--chunk-size ${CHUNK_SIZE}"
+fi
+
+# vLLM batch-invariant mode. On by default; GDN/Mamba backends do not support it.
+BATCH_INVARIANT="${BATCH_INVARIANT:-1}"
+
+# Mamba KV cache mode + prefix caching, set only for hybrid Mamba models.
+MAMBA_ARGS=""
+if [ -n "${MAMBA_CACHE_MODE:-}" ]; then
+    MAMBA_ARGS="--mamba-cache-mode ${MAMBA_CACHE_MODE} --enable-prefix-caching"
+fi
+
+# Max tokens per scheduler step. Empty -> vLLM default.
+MAX_NUM_BATCHED_TOKENS_ARG=""
+if [ -n "${MAX_NUM_BATCHED_TOKENS:-}" ]; then
+    MAX_NUM_BATCHED_TOKENS_ARG="--max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS}"
+fi
+
 # Store PIDs in a file so cleanup.sh can find them
 PID_FILE="/tmp/lmcache_mp_pids_${BUILD_ID}"
 > "$PID_FILE"
@@ -76,12 +97,24 @@ PID_FILE="/tmp/lmcache_mp_pids_${BUILD_ID}"
 echo "=== Launching LMCache MP server ==="
 echo "Port: $LMCACHE_PORT"
 
+# Optional GDS L1 slab tier (gds_* tests). When GDS_L1_PATH is set, the L1
+# medium becomes an NVMe slab accessed via cuFile DMA instead of pinned DRAM;
+# --l1-size-gb then sizes the slab. The path must be on a GDS-capable
+# filesystem (local NVMe), provided by the /scratch hostPath mount.
+GDS_L1_ARG=""
+if [ -n "${GDS_L1_PATH:-}" ]; then
+    echo "GDS L1 tier enabled; slab directory: $GDS_L1_PATH"
+    GDS_L1_ARG="--gds-l1-path ${GDS_L1_PATH}"
+fi
+
 CUDA_VISIBLE_DEVICES="${GPU_FOR_VLLM}" \
 lmcache server \
     --l1-size-gb "$CPU_BUFFER_SIZE" \
     --eviction-policy LRU \
     --max-workers "$MAX_WORKERS" \
+    $CHUNK_SIZE_ARG \
     --port "$LMCACHE_PORT" \
+    ${GDS_L1_ARG} \
     > "/tmp/build_${BUILD_ID}_lmcache.log" 2>&1 &
 LMCACHE_PID=$!
 
@@ -105,7 +138,7 @@ echo "Port: $vllm_port"
 CUDA_VISIBLE_DEVICES="${GPU_FOR_VLLM}" \
 VLLM_ENABLE_V1_MULTIPROCESSING=0 \
 VLLM_SERVER_DEV_MODE=1 \
-VLLM_BATCH_INVARIANT=1 \
+VLLM_BATCH_INVARIANT=${BATCH_INVARIANT} \
 PYTHONHASHSEED=0 \
 vllm serve "$MODEL" \
     --kv-transfer-config "{\"kv_connector\":\"LMCacheMPConnector\", \"kv_role\":\"kv_both\", \"kv_load_failure_policy\": \"recompute\", \"kv_connector_extra_config\": {\"lmcache.mp.port\": $LMCACHE_PORT, \"lmcache.mp.mq_timeout\": 10}}" \
@@ -115,6 +148,8 @@ vllm serve "$MODEL" \
     $MAX_MODEL_LEN_ARG \
     $ENFORCE_EAGER_ARG \
     $GPU_MEMORY_UTIL_ARG \
+    $MAMBA_ARGS \
+    $MAX_NUM_BATCHED_TOKENS_ARG \
     > "/tmp/build_${BUILD_ID}_vllm.log" 2>&1 &
 VLLM_PID=$!
 
diff --git a/.buildkite/k3_tests/multiprocess/scripts/run-gds-smoke.sh b/.buildkite/k3_tests/multiprocess/scripts/run-gds-smoke.sh new file mode 100755 index 0000000000..c33e5c676a --- /dev/null +++ b/.buildkite/k3_tests/multiprocess/scripts/run-gds-smoke.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env bash +# GDS L1 smoke test. Sends a few completions (cold) to store KV to the slab, +# resets vLLM's prefix cache, then re-sends them (warm) to read the KV back from +# LMCache/GDS. Passes if every request returns HTTP 200, a real LMCache retrieve +# happened, and the warm (GDS-retrieved) outputs match the cold (recomputed) +# ones -- i.e. the GDS store/retrieve path works and is correct. +# +# Expects the GDS-enabled LMCache server + vLLM to already be running, with +# VLLM_SERVER_DEV_MODE=1 (for /reset_prefix_cache). +set -e +set -o pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)" +source "${REPO_ROOT}/.buildkite/k3_tests/common_scripts/helpers.sh" + +VLLM_PORT="${VLLM_PORT:-8000}" +MODEL="${MODEL:-Qwen/Qwen3-14B}" +BUILD_ID="${BUILD_ID:-local_$$}" +LMCACHE_LOG="/tmp/build_${BUILD_ID}_lmcache.log" +N_PROMPTS="${GDS_SMOKE_PROMPTS:-4}" +OUT_DIR="$(mktemp -d)" +trap 'rm -rf "$OUT_DIR"' EXIT + +# A long-ish prompt so each request stores at least one LMCache chunk. +build_prompt() { # $1 = unique id + local filler="The key-value cache stores attention keys and values across transformer layers. " + local body="" i + for i in $(seq 1 80); do body="${body}${filler}"; done + printf 'Document %s. %s' "$1" "$body" +} + +# Send N_PROMPTS completions; capture each generated text to +# $OUT_DIR/