From 260a2c72d24de26967c88309c95a70e4d81e115f Mon Sep 17 00:00:00 2001 From: OceanLi <122793010+ohdearquant@users.noreply.github.com> Date: Sun, 31 May 2026 13:46:26 -0400 Subject: [PATCH 1/4] feat(inference): ADR-064 Phase-0 decode slope measurement harness Add `scripts/bench_decode_slope.sh` that runs decode at multiple context lengths (64-1024), fits a linear model (per_tok_ms = slope*ctx + intercept), and outputs JSON with slope_ms, intercept_ms, r_squared, tok_per_sec_64. Add `make bench-decode` Makefile target for convenient invocation. Add debug-only runtime assertion in MetalKvCache::new verifying buffer sizes match expected f32 layout (will need updating when #154 migrates to f16). Implements #168 (decode slope harness) and #170 (KV layout assertion). Co-Authored-By: Claude Opus 4.6 (1M context) --- Makefile | 6 + crates/inference/src/forward/metal_qwen35.rs | 14 ++- scripts/bench_decode_slope.sh | 125 +++++++++++++++++++ 3 files changed, 144 insertions(+), 1 deletion(-) create mode 100755 scripts/bench_decode_slope.sh diff --git a/Makefile b/Makefile index 049a9de5d..062762b4e 100644 --- a/Makefile +++ b/Makefile @@ -57,6 +57,12 @@ bench-gate: cargo bench -p lattice-embed --bench simd -- --baseline base --noplot; \ python3 scripts/perf-bench-gate.py target/criterion "$$arch-local" +# ADR-064: measure decode slope/intercept and output JSON. +# Usage: make bench-decode (default 5 runs, ctx 64-1024) +# RUNS=3 make bench-decode (faster) +bench-decode: + ./scripts/bench_decode_slope.sh + # A/B benchmark across git refs. Uses worktree for base, leaves working tree untouched. 
# Usage: make bench-compare (origin/main vs HEAD) # make bench-compare BASE=main HEAD=pr/x (explicit refs) diff --git a/crates/inference/src/forward/metal_qwen35.rs b/crates/inference/src/forward/metal_qwen35.rs index 56bb49e3a..1ca972c74 100644 --- a/crates/inference/src/forward/metal_qwen35.rs +++ b/crates/inference/src/forward/metal_qwen35.rs @@ -3534,13 +3534,25 @@ kernel void moe_shared_gate_add( &format!("kv_v_{i}"), )); } - Self { + let cache = Self { k_bufs, v_bufs, seq_len: 0, kv_dim, max_cache_len, + }; + // ADR-064: runtime KV layout assertion (debug builds only). + // Current layout is f32; update this when #154 migrates to f16. + #[cfg(debug_assertions)] + { + let expected_bytes = max_cache_len * kv_dim * std::mem::size_of::<f32>(); + debug_assert_eq!( + cache.k_bufs[0].length() as usize, + expected_bytes, + "KV cache buffer size mismatch: expected f32 layout ({expected_bytes} bytes)" + ); } + cache } /// Append a K/V pair into the cache for a given full-attention layer. diff --git a/scripts/bench_decode_slope.sh b/scripts/bench_decode_slope.sh new file mode 100755 index 000000000..81f1a81ea --- /dev/null +++ b/scripts/bench_decode_slope.sh @@ -0,0 +1,125 @@ +#!/usr/bin/env bash +# ADR-064 Phase-0: Decode slope/intercept measurement harness. +# +# Runs decode at multiple context lengths, fits a linear model: +# per_tok_ms = slope * ctx + intercept +# +# Output: JSON to stdout with slope_ms, intercept_ms, r_squared, tok_per_sec_64 +# +# Usage: +# ./scripts/bench_decode_slope.sh # default 5 runs, contexts 64-1024 +# RUNS=3 ./scripts/bench_decode_slope.sh # fewer runs (faster) +# CONTEXTS="64 256 512" ./scripts/bench_decode_slope.sh +set -uo pipefail + +REPO="$(cd "$(dirname "$0")/.." && pwd)" +LAT_BIN="$REPO/target/release/bench_decode_ab" +Q8_DIR="$HOME/.lattice/models/qwen3.5-0.8b" +RUNS="${RUNS:-5}" +N1=8 + +if [[ -z "${CONTEXTS:-}" ]]; then + CONTEXTS_ARR=(64 128 256 512 1024) +else + read -ra CONTEXTS_ARR <<< "$CONTEXTS" +fi + +# Build if needed +if [[ !
-x "$LAT_BIN" ]]; then + cargo build --release -p lattice-inference --bin bench_decode_ab \ + --features "f16,metal-gpu" 2>/dev/null \ + || { echo '{"error":"build failed"}'; exit 1; } +fi + +# Measure at N1 (baseline) +lattice_median() { + local n=$1 + env BENCH_N="$n" BENCH_RUNS="$RUNS" LATTICE_MODEL_DIR="$Q8_DIR" "$LAT_BIN" 2>/dev/null \ + | awk -F'total_ms=' '/^RESULT/{print $2}' \ + | sort -n \ + | awk "NR==$(( (RUNS + 1) / 2 ))" +} + +T1=$(lattice_median $N1) +if [[ -z "$T1" ]]; then + echo '{"error":"baseline measurement failed"}' + exit 1 +fi + +# Collect (ctx, per_tok_ms) pairs +declare -a CTX_VALS=() +declare -a PTM_VALS=() + +for CTX in "${CONTEXTS_ARR[@]}"; do + T2=$(lattice_median "$CTX") + if [[ -n "$T2" && "$T2" != "0" ]]; then + # per_tok_ms = (T2 - T1) / (CTX - N1) + PTM=$(echo "scale=6; ($T2 - $T1) / ($CTX - $N1)" | bc) + CTX_VALS+=("$CTX") + PTM_VALS+=("$PTM") + >&2 echo " ctx=$CTX: T2=${T2}ms per_tok=${PTM}ms" + else + >&2 echo " ctx=$CTX: FAILED (skipping)" + fi +done + +N=${#CTX_VALS[@]} +if (( N < 2 )); then + echo '{"error":"insufficient data points","n":'$N'}' + exit 1 +fi + +# Linear regression: per_tok_ms = slope * ctx + intercept +# Using least-squares via awk +RESULT=$(awk -v n="$N" ' +BEGIN { + split(ENVIRON["CTX_STR"], xs, " ") + split(ENVIRON["PTM_STR"], ys, " ") + sx = 0; sy = 0; sxx = 0; sxy = 0 + for (i = 1; i <= n; i++) { + x = xs[i] + 0 + y = ys[i] + 0 + sx += x; sy += y; sxx += x*x; sxy += x*y + } + denom = n * sxx - sx * sx + slope = (n * sxy - sx * sy) / denom + intercept = (sy - slope * sx) / n + # R-squared + ybar = sy / n + sstot = 0; ssres = 0 + for (i = 1; i <= n; i++) { + y = ys[i] + 0 + yhat = slope * (xs[i] + 0) + intercept + sstot += (y - ybar)^2 + ssres += (y - yhat)^2 + } + r2 = (sstot > 0) ? 1 - ssres/sstot : 1 + # tok/s at ctx=64 + ptm64 = slope * 64 + intercept + tps64 = (ptm64 > 0) ? 
1000 / ptm64 : 0 + printf "{\"slope_ms\":%.6f,\"intercept_ms\":%.4f,\"r_squared\":%.6f,\"tok_per_sec_64\":%.1f,\"n_points\":%d,\"contexts\":[", slope, intercept, r2, tps64, n + for (i = 1; i <= n; i++) { + if (i > 1) printf "," + printf "%d", xs[i] + } + printf "],\"per_tok_ms\":[" + for (i = 1; i <= n; i++) { + if (i > 1) printf "," + printf "%.4f", ys[i] + } + printf "]}\n" +}' <<< "" CTX_STR="${CTX_VALS[*]}" PTM_STR="${PTM_VALS[*]}") + +# Pretty print to stderr, raw JSON to stdout +>&2 echo "" +>&2 echo "=== Decode Slope Fit ===" +>&2 echo "$RESULT" | python3 -c " +import json, sys +d = json.load(sys.stdin) +print(f\" slope: {d['slope_ms']:.6f} ms/ctx-token\") +print(f\" intercept: {d['intercept_ms']:.4f} ms\") +print(f\" R²: {d['r_squared']:.6f}\") +print(f\" tok/s@64: {d['tok_per_sec_64']:.1f}\") +" 2>/dev/null + +echo "$RESULT" From 060ed1ef475160404ecca3ec5891e779efdbf13d Mon Sep 17 00:00:00 2001 From: OceanLi <122793010+ohdearquant@users.noreply.github.com> Date: Sun, 31 May 2026 13:53:04 -0400 Subject: [PATCH 2/4] fix(scripts): pass env vars correctly to awk in bench_decode_slope The ENVIRON[] lookup requires variables to be in the environment, not positional shell args. Prefix the awk invocation with the variable assignments so they appear in ENVIRON. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/bench_decode_slope.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/bench_decode_slope.sh b/scripts/bench_decode_slope.sh index 81f1a81ea..fa7424c91 100755 --- a/scripts/bench_decode_slope.sh +++ b/scripts/bench_decode_slope.sh @@ -71,7 +71,7 @@ fi # Linear regression: per_tok_ms = slope * ctx + intercept # Using least-squares via awk -RESULT=$(awk -v n="$N" ' +RESULT=$(CTX_STR="${CTX_VALS[*]}" PTM_STR="${PTM_VALS[*]}" awk -v n="$N" ' BEGIN { split(ENVIRON["CTX_STR"], xs, " ") split(ENVIRON["PTM_STR"], ys, " ") @@ -108,7 +108,7 @@ BEGIN { printf "%.4f", ys[i] } printf "]}\n" -}' <<< "" CTX_STR="${CTX_VALS[*]}" PTM_STR="${PTM_VALS[*]}") +}' /dev/null) # Pretty print to stderr, raw JSON to stdout >&2 echo "" From 53a2e648abc72d3f4f83321feae86f1d8ee06796 Mon Sep 17 00:00:00 2001 From: OceanLi <122793010+ohdearquant@users.noreply.github.com> Date: Sun, 31 May 2026 13:59:43 -0400 Subject: [PATCH 3/4] fix(scripts): use actual completion tokens in slope calculation The model may hit EOS before generating the requested token count (Qwen3.5-0.8B caps at ~346 tokens). Previously used the requested count as the denominator, giving incorrect per-token times. Now extracts actual completion count from bench_decode_ab RESULT output. Also fixed macOS awk compatibility (no match() with capture groups). Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/bench_decode_slope.sh | 37 ++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/scripts/bench_decode_slope.sh b/scripts/bench_decode_slope.sh index fa7424c91..362642a38 100755 --- a/scripts/bench_decode_slope.sh +++ b/scripts/bench_decode_slope.sh @@ -31,33 +31,42 @@ if [[ ! -x "$LAT_BIN" ]]; then || { echo '{"error":"build failed"}'; exit 1; } fi -# Measure at N1 (baseline) -lattice_median() { +# Measure at N1 (baseline). Returns "total_ms completion_tokens" (median by total_ms). 
+lattice_measure() { local n=$1 env BENCH_N="$n" BENCH_RUNS="$RUNS" LATTICE_MODEL_DIR="$Q8_DIR" "$LAT_BIN" 2>/dev/null \ - | awk -F'total_ms=' '/^RESULT/{print $2}' \ + | sed -n 's/^RESULT.*completion=\([0-9]*\).*total_ms=\([0-9.]*\)/\2 \1/p' \ | sort -n \ - | awk "NR==$(( (RUNS + 1) / 2 ))" + | awk "NR==$(( (RUNS + 1) / 2 )){print}" } -T1=$(lattice_median $N1) -if [[ -z "$T1" ]]; then +BASELINE=$(lattice_measure $N1) +if [[ -z "$BASELINE" ]]; then echo '{"error":"baseline measurement failed"}' exit 1 fi +T1=$(echo "$BASELINE" | awk '{print $1}') +C1=$(echo "$BASELINE" | awk '{print $2}') -# Collect (ctx, per_tok_ms) pairs +# Collect (actual_ctx, per_tok_ms) pairs declare -a CTX_VALS=() declare -a PTM_VALS=() for CTX in "${CONTEXTS_ARR[@]}"; do - T2=$(lattice_median "$CTX") - if [[ -n "$T2" && "$T2" != "0" ]]; then - # per_tok_ms = (T2 - T1) / (CTX - N1) - PTM=$(echo "scale=6; ($T2 - $T1) / ($CTX - $N1)" | bc) - CTX_VALS+=("$CTX") - PTM_VALS+=("$PTM") - >&2 echo " ctx=$CTX: T2=${T2}ms per_tok=${PTM}ms" + MEAS=$(lattice_measure "$CTX") + if [[ -n "$MEAS" ]]; then + T2=$(echo "$MEAS" | awk '{print $1}') + C2=$(echo "$MEAS" | awk '{print $2}') + # Use ACTUAL completion tokens, not requested — model may hit EOS early + ACTUAL_DELTA=$(( C2 - C1 )) + if (( ACTUAL_DELTA > 0 )); then + PTM=$(echo "scale=6; ($T2 - $T1) / $ACTUAL_DELTA" | bc) + CTX_VALS+=("$C2") + PTM_VALS+=("$PTM") + >&2 echo " ctx=$C2 (req=$CTX): T2=${T2}ms per_tok=${PTM}ms" + else + >&2 echo " ctx=$CTX: no token delta (C2=$C2, C1=$C1), skipping" + fi else >&2 echo " ctx=$CTX: FAILED (skipping)" fi From 5085d21854638c56d3ecaf096d30a1500f4cc4be Mon Sep 17 00:00:00 2001 From: OceanLi <122793010+ohdearquant@users.noreply.github.com> Date: Sun, 31 May 2026 14:03:49 -0400 Subject: [PATCH 4/4] fix(scripts): address codex review findings on bench_decode_slope Fixes from PR #184 review (round 1): 1. 
[Major] Input validation: reject contexts <= N1, reject duplicates, validate RUNS is positive integer, guard bc output, guard awk denom=0. Script now exits 1 with JSON error for all invalid inputs. 2. [Medium] KV assertion safety: skip when num_full_layers == 0, verify ALL K and V buffers (not just k_bufs[0]). 3. Use `set -euo pipefail` (was `set -uo pipefail`). Co-Authored-By: Claude Opus 4.6 (1M context) --- crates/inference/src/forward/metal_qwen35.rs | 19 +++++--- scripts/bench_decode_slope.sh | 46 ++++++++++++++++++-- 2 files changed, 56 insertions(+), 9 deletions(-) diff --git a/crates/inference/src/forward/metal_qwen35.rs b/crates/inference/src/forward/metal_qwen35.rs index 1ca972c74..114b0793a 100644 --- a/crates/inference/src/forward/metal_qwen35.rs +++ b/crates/inference/src/forward/metal_qwen35.rs @@ -3544,13 +3544,20 @@ kernel void moe_shared_gate_add( // ADR-064: runtime KV layout assertion (debug builds only). // Current layout is f32; update this when #154 migrates to f16. 
#[cfg(debug_assertions)] - { + if num_full_layers > 0 { let expected_bytes = max_cache_len * kv_dim * std::mem::size_of::<f32>(); - debug_assert_eq!( - cache.k_bufs[0].length() as usize, - expected_bytes, - "KV cache buffer size mismatch: expected f32 layout ({expected_bytes} bytes)" - ); + for (i, (k, v)) in cache.k_bufs.iter().zip(cache.v_bufs.iter()).enumerate() { + debug_assert_eq!( + k.length() as usize, + expected_bytes, + "KV cache K[{i}] size mismatch: expected f32 layout ({expected_bytes} bytes)" + ); + debug_assert_eq!( + v.length() as usize, + expected_bytes, + "KV cache V[{i}] size mismatch: expected f32 layout ({expected_bytes} bytes)" + ); + } } cache } diff --git a/scripts/bench_decode_slope.sh b/scripts/bench_decode_slope.sh index 362642a38..bdb7928ba 100755 --- a/scripts/bench_decode_slope.sh +++ b/scripts/bench_decode_slope.sh @@ -10,7 +10,7 @@ # ./scripts/bench_decode_slope.sh # default 5 runs, contexts 64-1024 # RUNS=3 ./scripts/bench_decode_slope.sh # fewer runs (faster) # CONTEXTS="64 256 512" ./scripts/bench_decode_slope.sh -set -uo pipefail +set -euo pipefail REPO="$(cd "$(dirname "$0")/.." && pwd)" LAT_BIN="$REPO/target/release/bench_decode_ab" @@ -18,12 +18,38 @@ Q8_DIR="$HOME/.lattice/models/qwen3.5-0.8b" RUNS="${RUNS:-5}" N1=8 +# Validate RUNS +if ! [[ "$RUNS" =~ ^[0-9]+$ ]] || (( RUNS < 1 )); then + echo '{"error":"RUNS must be a positive integer"}'; exit 1 +fi + if [[ -z "${CONTEXTS:-}" ]]; then CONTEXTS_ARR=(64 128 256 512 1024) else read -ra CONTEXTS_ARR <<< "$CONTEXTS" fi +# Validate contexts: integers > N1, no duplicates (string set; macOS bash 3.2 has no declare -A) +SEEN_CTX=" " +VALID_CTX=() +for CTX in "${CONTEXTS_ARR[@]}"; do + if !
[[ "$CTX" =~ ^[0-9]+$ ]] || (( CTX <= N1 )); then + >&2 echo "WARNING: skipping invalid context $CTX (must be integer > $N1)" + continue + fi + if [[ "$SEEN_CTX" == *" $CTX "* ]]; then + >&2 echo "WARNING: skipping duplicate context $CTX" + continue + fi + SEEN_CTX+="$CTX " + VALID_CTX+=("$CTX") +done +CONTEXTS_ARR=(${VALID_CTX[@]+"${VALID_CTX[@]}"}) +if (( ${#CONTEXTS_ARR[@]} < 2 )); then + echo '{"error":"need at least 2 valid contexts (integers > '$N1')"}' + exit 1 +fi + # Build if needed if [[ ! -x "$LAT_BIN" ]]; then cargo build --release -p lattice-inference --bin bench_decode_ab \ @@ -60,7 +86,11 @@ for CTX in "${CONTEXTS_ARR[@]}"; do # Use ACTUAL completion tokens, not requested — model may hit EOS early ACTUAL_DELTA=$(( C2 - C1 )) if (( ACTUAL_DELTA > 0 )); then - PTM=$(echo "scale=6; ($T2 - $T1) / $ACTUAL_DELTA" | bc) + PTM=$(echo "scale=6; ($T2 - $T1) / $ACTUAL_DELTA" | bc) || true + if [[ -z "$PTM" || ! "$PTM" =~ ^-?[.]?[0-9] ]]; then + >&2 echo " ctx=$CTX: bc failed (T2=$T2, T1=$T1, delta=$ACTUAL_DELTA), skipping" + continue + fi CTX_VALS+=("$C2") PTM_VALS+=("$PTM") >&2 echo " ctx=$C2 (req=$CTX): T2=${T2}ms per_tok=${PTM}ms" @@ -91,6 +121,7 @@ BEGIN { sx += x; sy += y; sxx += x*x; sxy += x*y } denom = n * sxx - sx * sx + if (denom == 0) { printf "{\"error\":\"degenerate input (all x identical)\"}\n"; exit } slope = (n * sxy - sx * sy) / denom intercept = (sy - slope * sx) / n # R-squared @@ -119,16 +150,25 @@ BEGIN { printf "]}\n" }' /dev/null) +# Validate output +if [[ -z "$RESULT" ]]; then + echo '{"error":"regression calculation produced no output"}' + exit 1 +fi + # Pretty print to stderr, raw JSON to stdout >&2 echo "" >&2 echo "=== Decode Slope Fit ===" ->&2 echo "$RESULT" | python3 -c " +echo "$RESULT" | >&2 python3 -c " import json, sys d = json.load(sys.stdin) +if 'error' in d: + print(f\" ERROR: {d['error']}\") + sys.exit(0) print(f\" slope: {d['slope_ms']:.6f} ms/ctx-token\") print(f\" intercept: {d['intercept_ms']:.4f} ms\") print(f\" R²: {d['r_squared']:.6f}\") print(f\" tok/s@64:
{d['tok_per_sec_64']:.1f}\") -" 2>/dev/null +" 2>/dev/null || true echo "$RESULT"