ohdearquant · ohdearquant · May 31, 2026 · May 31, 2026 · May 31, 2026 · May 31, 2026
diff --git a/.githooks/pre-commit b/.githooks/pre-commit
@@ -11,7 +11,7 @@ if [ $? -ne 0 ]; then
 fi
 
 # Clippy
-cargo clippy --workspace -- -D warnings
+cargo clippy --workspace --all-targets -- -D warnings
 if [ $? -ne 0 ]; then
     echo "Clippy check failed. Fix warnings before committing."
     exit 1

diff --git a/.github/workflows/bench-regression.yml b/.github/workflows/bench-regression.yml
@@ -77,17 +77,13 @@ jobs:
           cargo bench -p lattice-inference --bench elementwise_cpu_bench --no-run
           cargo bench -p lattice-embed --bench simd --no-run
 
-      - name: Run benches against baseline
-        if: steps.fetch_baseline.outputs.have_baseline == 'true'
-        run: |
-          cargo bench -p lattice-inference --bench elementwise_cpu_bench -- --baseline base --noplot
-          cargo bench -p lattice-embed --bench simd -- --baseline base --noplot
-
-      - name: Run benches without baseline (seed)
-        if: steps.fetch_baseline.outputs.have_baseline == 'false'
+      - name: Run benches
         run: |
-          cargo bench -p lattice-inference --bench elementwise_cpu_bench -- --save-baseline base --noplot
-          cargo bench -p lattice-embed --bench simd -- --save-baseline base --noplot
+          # --save-baseline saves new data AND compares against existing baseline
+          # if present. Unlike --baseline, it doesn't panic when a bench group
+          # has no prior baseline (e.g., newly added bench groups).
+          cargo bench -p lattice-inference --bench elementwise_cpu_bench -- --save-baseline base --noplot --quick
+          cargo bench -p lattice-embed --bench simd -- --save-baseline base --noplot --quick
 
       - name: Apply gate
         id: gate
@@ -98,15 +94,14 @@ jobs:
           && echo "gate=pass" >> "$GITHUB_OUTPUT" \
           || echo "gate=fail" >> "$GITHUB_OUTPUT"
 
-      - name: Note seed run
+      - name: Note no baseline
         if: steps.fetch_baseline.outputs.have_baseline == 'false'
         run: |
-          mkdir -p .
-          cat > report-${{ matrix.arch }}.md <<EOF
-          ### \`${{ matrix.arch }}\` — no baseline available
+          cat > report-${{ matrix.arch }}.md <<'EOF'
+          ### `${{ matrix.arch }}` — no baseline available
 
-          The \`perf-baselines\` branch has no data for this arch yet. Run
-          \`bench-update.yml\` on \`main\` to seed it. This PR is not gated.
+          The `perf-baselines` branch has no data for this arch yet. Run
+          `bench-update.yml` on `main` to seed it. This PR is not gated.
           EOF
 
       - name: Upload report

diff --git a/crates/inference/benches/metrics_bench.rs b/crates/inference/benches/metrics_bench.rs
@@ -60,7 +60,7 @@ fn naive_entropy_nats(logits: &[f32]) -> f32 {
     if logits.len() < 2 {
         return 0.0;
     }
-    let max_l = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
+    let max_l = logits.iter().copied().fold(f32::NEG_INFINITY, f32::max);
     let mut sum_exp = 0.0_f32;
     let mut exps: Vec<f32> = logits
         .iter()

diff --git a/crates/inference/src/attention/mod.rs b/crates/inference/src/attention/mod.rs
@@ -448,7 +448,7 @@ mod attention_kind_tests {
             head_dim: 128,
         };
         let kind = AttentionKind::Gqa(cfg);
-        let cloned = kind.clone();
+        let cloned = kind;
         assert_eq!(cloned.name(), "gqa");
         if let AttentionKind::Gqa(c) = cloned {
             assert_eq!(c.num_heads, 32);

diff --git a/crates/inference/src/kv_cache/flat.rs b/crates/inference/src/kv_cache/flat.rs
@@ -675,10 +675,10 @@ mod tests {
         };
         let kv_dim = 2 * 4; // 8
         // f16: 2 * 1 * 16 * 8 * 2 = 512 bytes
-        let expected_f16 = 2 * 1 * 16 * kv_dim * std::mem::size_of::<f16>();
+        let expected_f16 = 2 * 16 * kv_dim * std::mem::size_of::<f16>();
         assert_eq!(config.total_bytes(), expected_f16);
         // Would have been 1024 with f32
-        let would_be_f32 = 2 * 1 * 16 * kv_dim * std::mem::size_of::<f32>();
+        let would_be_f32 = 2 * 16 * kv_dim * std::mem::size_of::<f32>();
         assert_eq!(config.total_bytes() * 2, would_be_f32);
     }
 
@@ -947,8 +947,7 @@ mod tests {
         // Measured relative error must be < 0.1% (0.001)
         assert!(
             max_rel_kv < 0.001,
-            "max relative error for KV in [-10,10] is {:.4e}, expected < 0.001",
-            max_rel_kv
+            "max relative error for KV in [-10,10] is {max_rel_kv:.4e}, expected < 0.001"
         );
     }
 
@@ -1023,7 +1022,7 @@ mod tests {
             // Phase 2: stable softmax
             let max_s = scores[..kv_seq_len]
                 .iter()
-                .cloned()
+                .copied()
                 .fold(f32::NEG_INFINITY, f32::max);
             let sum: f32 = scores[..kv_seq_len]
                 .iter_mut()
@@ -1227,31 +1226,27 @@ mod tests {
 
         let top1_rate = top1_match_count as f32 / total_cases as f32;
         eprintln!(
-            "\n=== Tensor Oracle Summary ===\n  logit_max_abs_diff = {:.4e}  (gate: < 0.02)\n  top1_match_rate    = {:.4}    (gate: >= 0.95)\n  nan_count          = {}\n  max_synth_nll_delta= {:.4e}  (gate: < 0.01)",
-            global_max_logit_diff, top1_rate, nan_count, max_synth_nll_delta
+            "\n=== Tensor Oracle Summary ===\n  logit_max_abs_diff = {global_max_logit_diff:.4e}  (gate: < 0.02)\n  top1_match_rate    = {top1_rate:.4}    (gate: >= 0.95)\n  nan_count          = {nan_count}\n  max_synth_nll_delta= {max_synth_nll_delta:.4e}  (gate: < 0.01)"
         );
 
         assert_eq!(nan_count, 0, "f16 KV dequant introduced NaN in logits");
         assert!(
             global_max_logit_diff < 0.02,
-            "logit_max_abs_diff {:.4e} >= 0.02 gate",
-            global_max_logit_diff
+            "logit_max_abs_diff {global_max_logit_diff:.4e} >= 0.02 gate"
         );
         assert!(
             top1_rate >= 0.95,
-            "top1_match_rate {:.4} < 0.95 gate",
-            top1_rate
+            "top1_match_rate {top1_rate:.4} < 0.95 gate"
         );
         assert!(
             max_synth_nll_delta < 0.01,
-            "max synthetic NLL delta {:.4e} >= 0.01",
-            max_synth_nll_delta
+            "max synthetic NLL delta {max_synth_nll_delta:.4e} >= 0.01"
         );
     }
 
     /// Compute log softmax probability for target token (for synthetic NLL).
     fn softmax_log_prob(logits: &[f32], target: usize) -> f32 {
-        let max_l = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
+        let max_l = logits.iter().copied().fold(f32::NEG_INFINITY, f32::max);
         let sum: f32 = logits.iter().map(|&l| (l - max_l).exp()).sum();
         let log_sum = sum.ln();
         (logits[target] - max_l) - log_sum

diff --git a/crates/inference/src/metrics.rs b/crates/inference/src/metrics.rs
@@ -343,7 +343,7 @@ mod tests {
         let online_h = acc.entropy_nats();
 
         // Naive reference: softmax → -sum p log p.
-        let max_l = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
+        let max_l = logits.iter().copied().fold(f32::NEG_INFINITY, f32::max);
         let exps: Vec<f32> = logits.iter().map(|&l| (l - max_l).exp()).collect();
         let sum_exp: f32 = exps.iter().sum();
         let naive_h: f32 = exps
@@ -557,7 +557,7 @@ mod tests {
         // 32 tokens at +3.0, 32 tokens at -3.0 → two-cluster split.
         // Both clusters uniform within themselves; compare online to naive.
         let mut logits = vec![3.0_f32; 32];
-        logits.extend(std::iter::repeat(-3.0_f32).take(32));
+        logits.extend(std::iter::repeat_n(-3.0_f32, 32));
 
         let mut acc = OnlineSoftmaxEntropy::new();
         for &l in &logits {
@@ -606,7 +606,7 @@ mod tests {
         assert!(h >= 0.0, "subnormal entropy must be non-negative, got {h}");
 
         // Verify within tolerance of naive reference.
-        let max_l = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
+        let max_l = logits.iter().copied().fold(f32::NEG_INFINITY, f32::max);
         let exps: Vec<f32> = logits.iter().map(|&l| (l - max_l).exp()).collect();
         let sum_exp: f32 = exps.iter().sum();
         let naive_h: f32 = exps

diff --git a/crates/inference/src/pruning.rs b/crates/inference/src/pruning.rs
@@ -309,8 +309,7 @@ mod tests {
         );
         assert!(
             c.unwrap().is_finite(),
-            "large identical cosine must be finite, got {:?}",
-            c
+            "large identical cosine must be finite, got {c:?}"
         );
         assert_close(
             c.unwrap(),
@@ -505,7 +504,7 @@ mod tests {
         // Paper: mean(cos) = (1.0 + 0.0) / 2 = 0.5
         // Pseudocode drift: sum(dot) / sum(norms) = 0.0 / 10001.0 ≈ 0.0 ← WRONG
         let unit_x = vec![1.0_f32, 0.0];
-        let _unit_y = vec![0.0_f32, 1.0];
+        let _unit_y = [0.0_f32, 1.0];
         let big_x = vec![100.0_f32, 0.0];
         let big_y = vec![0.0_f32, 100.0];