From 260a2c72d24de26967c88309c95a70e4d81e115f Mon Sep 17 00:00:00 2001
From: OceanLi <122793010+ohdearquant@users.noreply.github.com>
Date: Sun, 31 May 2026 13:46:26 -0400
Subject: [PATCH 1/4] feat(inference): ADR-064 Phase-0 decode slope measurement
 harness

Add `scripts/bench_decode_slope.sh` that runs decode at multiple context
lengths (64-1024), fits a linear model (per_tok_ms = slope*ctx + intercept),
and outputs JSON with slope_ms, intercept_ms, r_squared, tok_per_sec_64.

Add `make bench-decode` Makefile target for convenient invocation.

Add debug-only runtime assertion in MetalKvCache::new verifying buffer sizes
match expected f32 layout (will need updating when #154 migrates to f16).

Implements #168 (decode slope harness) and #170 (KV layout assertion).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 Makefile                                     |   6 +
 crates/inference/src/forward/metal_qwen35.rs |  14 ++-
 scripts/bench_decode_slope.sh                | 125 +++++++++++++++++++
 3 files changed, 144 insertions(+), 1 deletion(-)
 create mode 100755 scripts/bench_decode_slope.sh
diff --git a/Makefile b/Makefile
index 049a9de5d..062762b4e 100644
--- a/Makefile
+++ b/Makefile
@@ -57,6 +57,13 @@ bench-gate:
 		cargo bench -p lattice-embed --bench simd -- --baseline base --noplot; \
 		python3 scripts/perf-bench-gate.py target/criterion "$$arch-local"
 
+# ADR-064: measure decode slope/intercept and output JSON.
+# Usage: make bench-decode                   (default 5 runs, ctx 64-1024)
+#        RUNS=3 make bench-decode            (faster)
+#        CONTEXTS="64 256 512" make bench-decode  (explicit context lengths)
+bench-decode:
+	./scripts/bench_decode_slope.sh
+
 # A/B benchmark across git refs. Uses worktree for base, leaves working tree untouched.
 # Usage: make bench-compare                     (origin/main vs HEAD)
 #        make bench-compare BASE=main HEAD=pr/x (explicit refs)
diff --git a/crates/inference/src/forward/metal_qwen35.rs b/crates/inference/src/forward/metal_qwen35.rs
index 56bb49e3a..1ca972c74 100644
--- a/crates/inference/src/forward/metal_qwen35.rs
+++ b/crates/inference/src/forward/metal_qwen35.rs
@@ -3534,13 +3534,27 @@ kernel void moe_shared_gate_add(
                     &format!("kv_v_{i}"),
                 ));
             }
-            Self {
+            // Bind the new cache to a local instead of returning the struct
+            // literal directly, so the debug-only size check below can read it.
+            let cache = Self {
                 k_bufs,
                 v_bufs,
                 seq_len: 0,
                 kv_dim,
                 max_cache_len,
+            };
+            // ADR-064: runtime KV layout assertion (debug builds only).
+            // Current layout is f32; update this when #154 migrates to f16.
+            #[cfg(debug_assertions)]
+            {
+                let expected_bytes = max_cache_len * kv_dim * std::mem::size_of::<f32>();
+                debug_assert_eq!(
+                    cache.k_bufs[0].length() as usize,
+                    expected_bytes,
+                    "KV cache buffer size mismatch: expected f32 layout ({expected_bytes} bytes)"
+                );
             }
+            cache
         }
 
         /// Append a K/V pair into the cache for a given full-attention layer.
diff --git a/scripts/bench_decode_slope.sh b/scripts/bench_decode_slope.sh
new file mode 100755
index 000000000..81f1a81ea
--- /dev/null
+++ b/scripts/bench_decode_slope.sh
@@ -0,0 +1,133 @@
+#!/usr/bin/env bash
+# ADR-064 Phase-0: Decode slope/intercept measurement harness.
+#
+# Runs decode at multiple context lengths, fits a linear model:
+#   per_tok_ms = slope * ctx + intercept
+#
+# Output: JSON to stdout with slope_ms, intercept_ms, r_squared, tok_per_sec_64
+# Per-context progress lines are written to stderr.
+#
+# Needs bc and awk on PATH; python3 is used only for the summary printout.
+# The benchmark binary is given $HOME/.lattice/models/qwen3.5-0.8b as
+# LATTICE_MODEL_DIR.
+#
+# Usage:
+#   ./scripts/bench_decode_slope.sh              # default 5 runs, contexts 64-1024
+#   RUNS=3 ./scripts/bench_decode_slope.sh       # fewer runs (faster)
+#   CONTEXTS="64 256 512" ./scripts/bench_decode_slope.sh
+set -uo pipefail
+
+REPO="$(cd "$(dirname "$0")/.." && pwd)"
+LAT_BIN="$REPO/target/release/bench_decode_ab"
+Q8_DIR="$HOME/.lattice/models/qwen3.5-0.8b"
+RUNS="${RUNS:-5}"
+N1=8
+
+if [[ -z "${CONTEXTS:-}" ]]; then
+    CONTEXTS_ARR=(64 128 256 512 1024)
+else
+    read -ra CONTEXTS_ARR <<< "$CONTEXTS"
+fi
+
+# Build if needed
+if [[ ! -x "$LAT_BIN" ]]; then
+    cargo build --release -p lattice-inference --bin bench_decode_ab \
+        --features "f16,metal-gpu" 2>/dev/null \
+        || { echo '{"error":"build failed"}'; exit 1; }
+fi
+
+# Measure at N1 (baseline)
+lattice_median() {
+    local n=$1
+    env BENCH_N="$n" BENCH_RUNS="$RUNS" LATTICE_MODEL_DIR="$Q8_DIR" "$LAT_BIN" 2>/dev/null \
+        | awk -F'total_ms=' '/^RESULT/{print $2}' \
+        | sort -n \
+        | awk "NR==$(( (RUNS + 1) / 2 ))"
+}
+
+T1=$(lattice_median $N1)
+if [[ -z "$T1" ]]; then
+    echo '{"error":"baseline measurement failed"}'
+    exit 1
+fi
+
+# Collect (ctx, per_tok_ms) pairs
+declare -a CTX_VALS=()
+declare -a PTM_VALS=()
+
+for CTX in "${CONTEXTS_ARR[@]}"; do
+    T2=$(lattice_median "$CTX")
+    if [[ -n "$T2" && "$T2" != "0" ]]; then
+        # per_tok_ms = (T2 - T1) / (CTX - N1)
+        PTM=$(echo "scale=6; ($T2 - $T1) / ($CTX - $N1)" | bc)
+        CTX_VALS+=("$CTX")
+        PTM_VALS+=("$PTM")
+        >&2 echo "  ctx=$CTX: T2=${T2}ms per_tok=${PTM}ms"
+    else
+        >&2 echo "  ctx=$CTX: FAILED (skipping)"
+    fi
+done
+
+# Fitting a line needs at least two (ctx, per_tok_ms) points.
+N=${#CTX_VALS[@]}
+if (( N < 2 )); then
+    echo '{"error":"insufficient data points","n":'$N'}'
+    exit 1
+fi
+
+# Linear regression: per_tok_ms = slope * ctx + intercept
+# Using least-squares via awk
+RESULT=$(awk -v n="$N" '
+BEGIN {
+    split(ENVIRON["CTX_STR"], xs, " ")
+    split(ENVIRON["PTM_STR"], ys, " ")
+    sx = 0; sy = 0; sxx = 0; sxy = 0
+    for (i = 1; i <= n; i++) {
+        x = xs[i] + 0
+        y = ys[i] + 0
+        sx += x; sy += y; sxx += x*x; sxy += x*y
+    }
+    denom = n * sxx - sx * sx
+    slope = (n * sxy - sx * sy) / denom
+    intercept = (sy - slope * sx) / n
+    # R-squared
+    ybar = sy / n
+    sstot = 0; ssres = 0
+    for (i = 1; i <= n; i++) {
+        y = ys[i] + 0
+        yhat = slope * (xs[i] + 0) + intercept
+        sstot += (y - ybar)^2
+        ssres += (y - yhat)^2
+    }
+    # A constant y series (sstot == 0) is reported as r2 = 1.
+    r2 = (sstot > 0) ? 1 - ssres/sstot : 1
+    # tok/s at ctx=64
+    # (0 when the fitted per-token time at ctx=64 is not positive)
+    ptm64 = slope * 64 + intercept
+    tps64 = (ptm64 > 0) ? 1000 / ptm64 : 0
+    printf "{\"slope_ms\":%.6f,\"intercept_ms\":%.4f,\"r_squared\":%.6f,\"tok_per_sec_64\":%.1f,\"n_points\":%d,\"contexts\":[", slope, intercept, r2, tps64, n
+    for (i = 1; i <= n; i++) {
+        if (i > 1) printf ","
+        printf "%d", xs[i]
+    }
+    printf "],\"per_tok_ms\":["
+    for (i = 1; i <= n; i++) {
+        if (i > 1) printf ","
+        printf "%.4f", ys[i]
+    }
+    printf "]}\n"
+}' <<< "" CTX_STR="${CTX_VALS[*]}" PTM_STR="${PTM_VALS[*]}")
+
+# Pretty print to stderr, raw JSON to stdout
+>&2 echo ""
+>&2 echo "=== Decode Slope Fit ==="
+>&2 echo "$RESULT" | python3 -c "
+import json, sys
+d = json.load(sys.stdin)
+print(f\"  slope:     {d['slope_ms']:.6f} ms/ctx-token\")
+print(f\"  intercept: {d['intercept_ms']:.4f} ms\")
+print(f\"  R²:        {d['r_squared']:.6f}\")
+print(f\"  tok/s@64:  {d['tok_per_sec_64']:.1f}\")
+" 2>/dev/null
+
+echo "$RESULT"

From 060ed1ef475160404ecca3ec5891e779efdbf13d Mon Sep 17 00:00:00 2001
From: OceanLi <122793010+ohdearquant@users.noreply.github.com>
Date: Sun, 31 May 2026 13:53:04 -0400
Subject: [PATCH 2/4] fix(scripts): pass env vars correctly to awk in
 bench_decode_slope

The ENVIRON[] lookup requires variables to be in the environment, not
positional shell args. Prefix the awk invocation with the variable
assignments so they appear in ENVIRON.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 scripts/bench_decode_slope.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/bench_decode_slope.sh b/scripts/bench_decode_slope.sh
index 81f1a81ea..fa7424c91 100755
--- a/scripts/bench_decode_slope.sh
+++ b/scripts/bench_decode_slope.sh
@@ -71,7 +71,9 @@ fi
 
 # Linear regression: per_tok_ms = slope * ctx + intercept
 # Using least-squares via awk
-RESULT=$(awk -v n="$N" '
+# CTX_STR/PTM_STR are written as command-prefix assignments so that they are
+# placed in the environment of awk, where the BEGIN block reads them via ENVIRON.
+RESULT=$(CTX_STR="${CTX_VALS[*]}" PTM_STR="${PTM_VALS[*]}" awk -v n="$N" '
 BEGIN {
     split(ENVIRON["CTX_STR"], xs, " ")
     split(ENVIRON["PTM_STR"], ys, " ")
@@ -108,7 +108,9 @@ BEGIN {
         printf "%.4f", ys[i]
     }
+    # Close the JSON object. The whole program is this BEGIN block; awk is
+    # given /dev/null as its input file operand.
     printf "]}\n"
-}' <<< "" CTX_STR="${CTX_VALS[*]}" PTM_STR="${PTM_VALS[*]}")
+}' /dev/null)
 
 # Pretty print to stderr, raw JSON to stdout
 >&2 echo ""

From 53a2e648abc72d3f4f83321feae86f1d8ee06796 Mon Sep 17 00:00:00 2001
From: OceanLi <122793010+ohdearquant@users.noreply.github.com>
Date: Sun, 31 May 2026 13:59:43 -0400
Subject: [PATCH 3/4] fix(scripts): use actual completion tokens in slope
 calculation

The model may hit EOS before generating the requested token count
(Qwen3.5-0.8B caps at ~346 tokens). Previously used the requested
count as the denominator, giving incorrect per-token times. Now
extracts actual completion count from bench_decode_ab RESULT output.

Also fixed macOS awk compatibility (no match() with capture groups).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 scripts/bench_decode_slope.sh | 37 ++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/scripts/bench_decode_slope.sh b/scripts/bench_decode_slope.sh
index fa7424c91..362642a38 100755
--- a/scripts/bench_decode_slope.sh
+++ b/scripts/bench_decode_slope.sh
@@ -31,33 +31,43 @@ if [[ ! -x "$LAT_BIN" ]]; then
         || { echo '{"error":"build failed"}'; exit 1; }
 fi
 
-# Measure at N1 (baseline)
-lattice_median() {
+# Measure at N1 (baseline). Returns "total_ms completion_tokens" (median by total_ms).
+lattice_measure() {
     local n=$1
     env BENCH_N="$n" BENCH_RUNS="$RUNS" LATTICE_MODEL_DIR="$Q8_DIR" "$LAT_BIN" 2>/dev/null \
-        | awk -F'total_ms=' '/^RESULT/{print $2}' \
+        | sed -n 's/^RESULT.*completion=\([0-9]*\).*total_ms=\([0-9.]*\)/\2 \1/p' \
         | sort -n \
-        | awk "NR==$(( (RUNS + 1) / 2 ))"
+        | awk "NR==$(( (RUNS + 1) / 2 )){print}"
 }
 
-T1=$(lattice_median $N1)
-if [[ -z "$T1" ]]; then
+BASELINE=$(lattice_measure $N1)
+if [[ -z "$BASELINE" ]]; then
     echo '{"error":"baseline measurement failed"}'
     exit 1
 fi
+# Split the "total_ms completion_tokens" pair into its two fields.
+T1=$(echo "$BASELINE" | awk '{print $1}')
+C1=$(echo "$BASELINE" | awk '{print $2}')
 
-# Collect (ctx, per_tok_ms) pairs
+# Collect (actual_ctx, per_tok_ms) pairs
 declare -a CTX_VALS=()
 declare -a PTM_VALS=()
 
 for CTX in "${CONTEXTS_ARR[@]}"; do
-    T2=$(lattice_median "$CTX")
-    if [[ -n "$T2" && "$T2" != "0" ]]; then
-        # per_tok_ms = (T2 - T1) / (CTX - N1)
-        PTM=$(echo "scale=6; ($T2 - $T1) / ($CTX - $N1)" | bc)
-        CTX_VALS+=("$CTX")
-        PTM_VALS+=("$PTM")
-        >&2 echo "  ctx=$CTX: T2=${T2}ms per_tok=${PTM}ms"
+    MEAS=$(lattice_measure "$CTX")
+    if [[ -n "$MEAS" ]]; then
+        T2=$(echo "$MEAS" | awk '{print $1}')
+        C2=$(echo "$MEAS" | awk '{print $2}')
+        # Use ACTUAL completion tokens, not requested — model may hit EOS early
+        ACTUAL_DELTA=$(( C2 - C1 ))
+        if (( ACTUAL_DELTA > 0 )); then
+            PTM=$(echo "scale=6; ($T2 - $T1) / $ACTUAL_DELTA" | bc)
+            CTX_VALS+=("$C2")
+            PTM_VALS+=("$PTM")
+            >&2 echo "  ctx=$C2 (req=$CTX): T2=${T2}ms per_tok=${PTM}ms"
+        else
+            >&2 echo "  ctx=$CTX: no token delta (C2=$C2, C1=$C1), skipping"
+        fi
     else
         >&2 echo "  ctx=$CTX: FAILED (skipping)"
     fi

From 5085d21854638c56d3ecaf096d30a1500f4cc4be Mon Sep 17 00:00:00 2001
From: OceanLi <122793010+ohdearquant@users.noreply.github.com>
Date: Sun, 31 May 2026 14:03:49 -0400
Subject: [PATCH 4/4] fix(scripts): address codex review findings on
 bench_decode_slope

Fixes from PR #184 review (round 1):

1. [Major] Input validation: skip (with a warning) contexts <= N1 and
   duplicate contexts, validate that RUNS is a positive integer, guard bc
   output, guard awk denom=0. The script exits 1 with a JSON error when RUNS
   is invalid or fewer than two valid contexts remain.

2. [Medium] KV assertion safety: skip when num_full_layers == 0, verify
   ALL K and V buffers (not just k_bufs[0]).

3. Use `set -euo pipefail` (was `set -uo pipefail`).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 crates/inference/src/forward/metal_qwen35.rs | 19 +++++---
 scripts/bench_decode_slope.sh                | 46 ++++++++++++++++++--
 2 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/crates/inference/src/forward/metal_qwen35.rs b/crates/inference/src/forward/metal_qwen35.rs
index 1ca972c74..114b0793a 100644
--- a/crates/inference/src/forward/metal_qwen35.rs
+++ b/crates/inference/src/forward/metal_qwen35.rs
@@ -3544,13 +3544,22 @@ kernel void moe_shared_gate_add(
             // ADR-064: runtime KV layout assertion (debug builds only).
             // Current layout is f32; update this when #154 migrates to f16.
             #[cfg(debug_assertions)]
-            {
+            if num_full_layers > 0 {
                 let expected_bytes = max_cache_len * kv_dim * std::mem::size_of::<f32>();
-                debug_assert_eq!(
-                    cache.k_bufs[0].length() as usize,
-                    expected_bytes,
-                    "KV cache buffer size mismatch: expected f32 layout ({expected_bytes} bytes)"
-                );
+                // Check every layer: zip pairs K[i] with V[i] and stops at the
+                // shorter of the two lists.
+                for (i, (k, v)) in cache.k_bufs.iter().zip(cache.v_bufs.iter()).enumerate() {
+                    debug_assert_eq!(
+                        k.length() as usize,
+                        expected_bytes,
+                        "KV cache K[{i}] size mismatch: expected f32 layout ({expected_bytes} bytes)"
+                    );
+                    debug_assert_eq!(
+                        v.length() as usize,
+                        expected_bytes,
+                        "KV cache V[{i}] size mismatch: expected f32 layout ({expected_bytes} bytes)"
+                    );
+                }
             }
             cache
         }
diff --git a/scripts/bench_decode_slope.sh b/scripts/bench_decode_slope.sh
index 362642a38..bdb7928ba 100755
--- a/scripts/bench_decode_slope.sh
+++ b/scripts/bench_decode_slope.sh
@@ -10,7 +10,9 @@
 #   ./scripts/bench_decode_slope.sh              # default 5 runs, contexts 64-1024
 #   RUNS=3 ./scripts/bench_decode_slope.sh       # fewer runs (faster)
 #   CONTEXTS="64 256 512" ./scripts/bench_decode_slope.sh
-set -uo pipefail
+# Strict mode: -e exits on an unguarded command failure, -u makes unset
+# variables an error, pipefail makes a pipeline fail when any stage fails.
+set -euo pipefail
 
 REPO="$(cd "$(dirname "$0")/.." && pwd)"
 LAT_BIN="$REPO/target/release/bench_decode_ab"
@@ -18,12 +18,44 @@ Q8_DIR="$HOME/.lattice/models/qwen3.5-0.8b"
 RUNS="${RUNS:-5}"
 N1=8
 
+# Validate RUNS. The 10# prefix forces base 10, so a value such as "08" is
+# not rejected by the shell as an invalid octal literal.
+if ! [[ "$RUNS" =~ ^[0-9]+$ ]] || (( 10#$RUNS < 1 )); then
+    echo '{"error":"RUNS must be a positive integer"}'; exit 1
+fi
+RUNS=$(( 10#$RUNS ))
+
 if [[ -z "${CONTEXTS:-}" ]]; then
     CONTEXTS_ARR=(64 128 256 512 1024)
 else
     read -ra CONTEXTS_ARR <<< "$CONTEXTS"
 fi
 
+# Validate contexts: must be positive integers > N1, no duplicates.
+# Kept compatible with bash 3.2 (the /bin/bash shipped with macOS): no
+# associative arrays, and no bare "${arr[@]}" on a possibly-empty array,
+# which bash < 4.4 treats as an unbound variable under `set -u`.
+VALID_CTX=()
+for CTX in ${CONTEXTS_ARR[@]+"${CONTEXTS_ARR[@]}"}; do
+    if ! [[ "$CTX" =~ ^[0-9]+$ ]] || (( 10#$CTX <= N1 )); then
+        >&2 echo "WARNING: skipping invalid context $CTX (must be integer > $N1)"
+        continue
+    fi
+    # Canonical decimal form, so "064" and "64" count as the same context.
+    CTX=$(( 10#$CTX ))
+    # Duplicate check against the space-joined list of accepted values.
+    if [[ " ${VALID_CTX[*]-} " == *" $CTX "* ]]; then
+        >&2 echo "WARNING: skipping duplicate context $CTX"
+        continue
+    fi
+    VALID_CTX+=("$CTX")
+done
+if (( ${#VALID_CTX[@]} < 2 )); then
+    echo '{"error":"need at least 2 valid contexts (integers > '"$N1"')"}'
+    exit 1
+fi
+CONTEXTS_ARR=("${VALID_CTX[@]}")
+
 # Build if needed
 if [[ ! -x "$LAT_BIN" ]]; then
     cargo build --release -p lattice-inference --bin bench_decode_ab \
@@ -34,10 +60,14 @@ fi
 # Measure at N1 (baseline). Returns "total_ms completion_tokens" (median by total_ms).
 lattice_measure() {
     local n=$1
+    # Always return 0: under `set -e` + pipefail a failing benchmark run would
+    # otherwise abort the script inside $(...) before the callers can print
+    # their JSON error / "FAILED (skipping)" message. Callers test for empty output.
     env BENCH_N="$n" BENCH_RUNS="$RUNS" LATTICE_MODEL_DIR="$Q8_DIR" "$LAT_BIN" 2>/dev/null \
         | sed -n 's/^RESULT.*completion=\([0-9]*\).*total_ms=\([0-9.]*\)/\2 \1/p' \
         | sort -n \
-        | awk "NR==$(( (RUNS + 1) / 2 )){print}"
+        | awk "NR==$(( (RUNS + 1) / 2 )){print}" \
+        || true
 }
 
 BASELINE=$(lattice_measure $N1)
@@ -60,7 +90,13 @@ for CTX in "${CONTEXTS_ARR[@]}"; do
         # Use ACTUAL completion tokens, not requested — model may hit EOS early
         ACTUAL_DELTA=$(( C2 - C1 ))
         if (( ACTUAL_DELTA > 0 )); then
-            PTM=$(echo "scale=6; ($T2 - $T1) / $ACTUAL_DELTA" | bc)
+            PTM=$(echo "scale=6; ($T2 - $T1) / $ACTUAL_DELTA" | bc) || true
+            # bc omits the leading zero when |x| < 1 (".004500", "-.004500"),
+            # so accept an optional sign, an optional integer part, and a fraction.
+            if ! [[ "$PTM" =~ ^-?[0-9]*\.?[0-9]+$ ]]; then
+                >&2 echo "  ctx=$CTX: bc failed (T2=$T2, T1=$T1, delta=$ACTUAL_DELTA), skipping"
+                continue
+            fi
             CTX_VALS+=("$C2")
             PTM_VALS+=("$PTM")
             >&2 echo "  ctx=$C2 (req=$CTX): T2=${T2}ms per_tok=${PTM}ms"
@@ -91,6 +121,9 @@ BEGIN {
         sx += x; sy += y; sxx += x*x; sxy += x*y
     }
     denom = n * sxx - sx * sx
+    # denom is 0 when every x has the same value; print an error object and
+    # stop instead of dividing by zero below.
+    if (denom == 0) { printf "{\"error\":\"degenerate input (all x identical)\"}\n"; exit }
     slope = (n * sxy - sx * sy) / denom
     intercept = (sy - slope * sx) / n
     # R-squared
@@ -119,16 +150,34 @@ BEGIN {
     printf "]}\n"
 }' /dev/null)
 
+# Validate output
+if [[ -z "$RESULT" ]]; then
+    echo '{"error":"regression calculation produced no output"}'
+    exit 1
+fi
+
 # Pretty print to stderr, raw JSON to stdout
 >&2 echo ""
 >&2 echo "=== Decode Slope Fit ==="
->&2 echo "$RESULT" | python3 -c "
+# The summary is best-effort (python3 may be missing), hence `|| true`.
+# The redirections belong to python3, not to the producer: its stdout is
+# pointed at the script's stderr first, then its own stderr is discarded.
+printf '%s\n' "$RESULT" | python3 -c "
 import json, sys
 d = json.load(sys.stdin)
+if 'error' in d:
+    print(f\"  ERROR: {d['error']}\")
+    sys.exit(0)
 print(f\"  slope:     {d['slope_ms']:.6f} ms/ctx-token\")
 print(f\"  intercept: {d['intercept_ms']:.4f} ms\")
 print(f\"  R²:        {d['r_squared']:.6f}\")
 print(f\"  tok/s@64:  {d['tok_per_sec_64']:.1f}\")
-" 2>/dev/null
+" >&2 2>/dev/null || true
 
 echo "$RESULT"
+
+# A degenerate fit is reported by awk as {"error":...}. The JSON has been
+# printed above; make the exit status reflect the failure as well.
+if [[ "$RESULT" == '{"error"'* ]]; then
+    exit 1
+fi