diff --git a/docs/anchor-transitive.md b/docs/anchor-transitive.md
new file mode 100644
index 00000000..6f1b02f8
--- /dev/null
+++ b/docs/anchor-transitive.md
@@ -0,0 +1,15 @@
+# anchor transitive scan
+
+`scan_and_force_transitive` (anchor_scan.cpp) expands the query pool with
+tokens from newly-forced chunks and re-runs `scan_and_force` until a fixed
+point is reached or `max_iters` rounds (default 3) have run.
+
+Improves multi-hop retrieval: enables discovery of intermediate context
+chunks whose tokens do not appear in the original query but connect
+query-to-needle via shared rare tokens.
+
+Empirical result: F1=0.628 on LongBench HotpotQA at ee7 + keep=0.15
+(vs uncompressed F1=0.697). This is the ceiling for attention-score-based
+prefill compression on this task; see bench/2026-05-25_longbench_hotpotqa/.
+
+On by default. Disable with `PFLASH_COMPRESS_ANCHOR_TRANSITIVE=0`; a per-request `use_transitive_override` (0 or 1) takes precedence over the env var.
diff --git a/docs/pflash-compress-cfg.md b/docs/pflash-compress-cfg.md
new file mode 100644
index 00000000..5755e314
--- /dev/null
+++ b/docs/pflash-compress-cfg.md
@@ -0,0 +1,46 @@
+# pflash compression knobs
+
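+The anchor-scan and head/tail env vars (PFLASH_COMPRESS_* and their legacy DFLASH_COMPRESS_* names) are read once per
+request in `compress_cfg_from_env(n_chunks, n_keep)` in qwen3_drafter.cpp; other knobs (e.g. DFLASH_COMPRESS_POOL_KERNEL, DFLASH_COMPRESS_REPEAT_*) are read at their point of use.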
+
+## anchor_radius adaptive ladder
+
+Prevents the 64K NIAH cliff: at long context the needle text is more likely
+to straddle multiple chunks, and a fixed radius=2 window (5 chunks / ~160
+tokens) loses the back half of the needle.
+
+Default ladder (override via PFLASH_COMPRESS_ANCHOR_RADIUS):
+
+| n_chunks   | anchor_radius |
+|------------|---------------|
+| < 1024     | 2             |
+| 1024-2047  | 4             |
+| >= 2048    | 8             |
+
+## max_anchor_hits adaptive ladder
+
+Same breakpoints as anchor_radius (override via PFLASH_COMPRESS_MAX_ANCHOR_HITS).
+At long context anchors are sparser, so more hits per query token are affordable.
+
+| n_chunks   | max_anchor_hits |
+|------------|-----------------|
+| < 1024     | 8               |
+| 1024-2047  | 16              |
+| >= 2048    | 32              |
+
+## anchor_transitive
+
+On by default. A gated rare-token bridge expands the query pool with tokens
+from newly-forced chunks and re-runs the anchor scan until a fixed point
+(at most PFLASH_COMPRESS_ANCHOR_MAX_ITERS rounds, default 3). Improves multi-hop F1 on LongBench
+HotpotQA (empirically; F1=0.628 ceiling at ee7+anchor-transitive on RTX 3090 — see bench/2026-05-25_longbench_hotpotqa/).
+Set PFLASH_COMPRESS_ANCHOR_TRANSITIVE=0 to disable.
+
+## head/tail chunk forcing
+
+Head and tail chunks are force-included before top-K scoring fills the
+remainder. When head_raw + tail_raw >= n_keep, both counts are scaled down
+proportionally to fit n_keep - 1, so top-K still gets at least one slot.
+
+Defaults: head=8, tail=24 (override via DFLASH_COMPRESS_HEAD_CHUNKS /
+DFLASH_COMPRESS_TAIL_CHUNKS).
diff --git a/server/src/qwen3/anchor_scan.cpp b/server/src/qwen3/anchor_scan.cpp
new file mode 100644
index 00000000..e0088167
--- /dev/null
+++ b/server/src/qwen3/anchor_scan.cpp
@@ -0,0 +1,169 @@
+#include "anchor_scan.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <unordered_map>
+#include <vector>
+
+namespace dflash::qwen3 {
+
+// Sets forced[c] = 1 for every c in [chunk - radius, chunk + radius], clamped to [0, n_chunks - 1].
+static void force_neighborhood(std::vector<uint8_t>& forced, int n_chunks,
+                                int chunk, int radius) {
+    const int first = (chunk - radius < 0) ? 0 : chunk - radius;
+    const int last  = (chunk + radius > n_chunks - 1) ? n_chunks - 1 : chunk + radius;
+    if (first <= last) std::fill(forced.begin() + first, forced.begin() + last + 1, uint8_t{1});
+}
+
+// Marks every chunk (plus its cfg.anchor_radius neighborhood) that contains a
+// cfg.ngram-token window of query_pool somewhere in ids[0, body_end). A window
+// counts only if it occurs 1..cfg.max_anchor_hits times; commoner ones are skipped.
+void scan_and_force(
+    const std::vector<int32_t>& ids,
+    int body_end,
+    const std::vector<int32_t>& query_pool,
+    const AnchorScanCfg& cfg,
+    std::vector<uint8_t>& forced)
+{
+    const int n_chunks = (int)forced.size();
+    const int ngram    = cfg.ngram;
+    // Degenerate input: nothing to match, a zero divisor, or a body shorter than
+    // one window (the old scan then compared past body_end at position 0).
+    if (ngram <= 0 || cfg.chunk_size <= 0 || cfg.max_anchor_hits <= 0) return;
+    if (n_chunks == 0 || body_end < ngram || (size_t)body_end > ids.size()) return;
+    const int last_start = body_end - ngram;   // last position whose window fits in the body
+
+    // Grows with the config: a fixed 8-slot array silently dropped hits 9..N
+    // whenever max_anchor_hits was above 8 (the adaptive ladder uses 16 and 32).
+    std::vector<int> hit_pos;
+    hit_pos.reserve((size_t)std::min(cfg.max_anchor_hits, 64));
+    for (int qi = 0; qi + ngram <= (int)query_pool.size(); ++qi) {
+        hit_pos.clear();
+        bool too_common = false;
+        for (int p = 0; p <= last_start && !too_common; ++p) {
+            if (!std::equal(query_pool.begin() + qi, query_pool.begin() + qi + ngram,
+                            ids.begin() + p)) continue;
+            if ((int)hit_pos.size() == cfg.max_anchor_hits) too_common = true;
+            else hit_pos.push_back(p);
+        }
+        if (too_common) continue;
+        for (int p : hit_pos)
+            force_neighborhood(forced, n_chunks, p / cfg.chunk_size, cfg.anchor_radius);
+    }
+}
+
+// Number of non-zero entries in `forced`, i.e. how many chunks are currently marked.
+static int count_set(const std::vector<uint8_t>& forced) {
+    const auto marked = std::count_if(forced.begin(), forced.end(),
+                                      [](uint8_t flag) { return flag != 0; });
+    return (int)marked;
+}
+
+// Transitive anchor scan. After the plain n-gram pass it repeatedly (a) follows
+// rare body tokens out of newly forced chunks to the other chunks that contain
+// them, (b) appends the tokens of newly forced chunks to the query pool and
+// (c) re-runs scan_and_force, until nothing new is forced, max_iters rounds
+// have run, or cfg.max_forced_count is exceeded (the offending step is undone).
+void scan_and_force_transitive(
+    const std::vector<int32_t>& ids,
+    int body_end,
+    const std::vector<int32_t>& initial_query_pool,
+    const AnchorScanCfg& cfg,
+    int max_iters,
+    std::vector<uint8_t>& forced)
+{
+    if (cfg.chunk_size <= 0) return;   // chunk index math below divides by it
+    auto pool = initial_query_pool;
+    const int n_chunks = (int)forced.size();
+
+    // Inverted index: rare token -> body positions. "Rare" means the token
+    // occurs at most cfg.rare_token_max_freq times in ids[0, body_end).
+    std::unordered_map<int32_t, std::vector<int>> rare_positions;
+    if (cfg.rare_token_max_freq > 0) {
+        std::unordered_map<int32_t, int> body_freq;
+        body_freq.reserve((size_t)body_end);
+        for (int j = 0; j < body_end; ++j) ++body_freq[ids[(size_t)j]];
+        for (int p = 0; p < body_end; ++p) {
+            const int32_t tok = ids[(size_t)p];
+            if (body_freq[tok] <= cfg.rare_token_max_freq) rare_positions[tok].push_back(p);
+        }
+    }
+
+    // Pass 1: plain n-gram scan with the caller's query pool.
+    const int count_before_pass1 = count_set(forced);
+    scan_and_force(ids, body_end, pool, cfg, forced);
+    const int gained_pass1 = count_set(forced) - count_before_pass1;
+
+    // Gate: when pass 1 alone forced enough chunks, skip the cascade.
+    if (cfg.cascade_min_anchor_count > 0 &&
+        gained_pass1 >= cfg.cascade_min_anchor_count) {
+        return;
+    }
+
+    // `baseline` is the forced set whose chunks have already been expanded
+    // (rare tokens followed, tokens pooled). Round 0 starts from the pass-1
+    // result; later rounds start from the state just before the previous
+    // n-gram re-scan, so the chunks that re-scan added are processed next.
+    // (The old loop re-took its snapshot at the top of every round, which left
+    // the worklist empty and ended the cascade after a single round.)
+    std::vector<uint8_t> baseline = forced;
+    for (int it = 0; it < max_iters; ++it) {
+        const std::vector<uint8_t> round_start = forced;   // undo point for the cap
+
+        // Rare-token bridge, worklist-driven so a chain hop3 -> hop2 -> hop1
+        // is resolved inside one round.
+        if (cfg.rare_token_max_freq > 0) {
+            std::vector<int> worklist;
+            for (int c = 0; c < n_chunks; ++c) {
+                // Round 0 seeds from everything forced so far (pass-1 results
+                // and whatever the caller pre-forced); later rounds only from
+                // chunks added since `baseline`.
+                if (!forced[(size_t)c]) continue;
+                if (it == 0 || !baseline[(size_t)c]) worklist.push_back(c);
+            }
+            for (size_t wi = 0; wi < worklist.size(); ++wi) {   // grows while iterating
+                const int c = worklist[wi];
+                const int s = c * cfg.chunk_size;
+                const int e = std::min(body_end, (c + 1) * cfg.chunk_size);
+                for (int j = s; j < e; ++j) {
+                    const auto hit = rare_positions.find(ids[(size_t)j]);
+                    if (hit == rare_positions.end()) continue;
+                    for (int p : hit->second) {
+                        const int target_c = p / cfg.chunk_size;
+                        if (forced[(size_t)target_c]) continue;
+                        force_neighborhood(forced, n_chunks, target_c, cfg.anchor_radius);
+                        worklist.push_back(target_c);
+                    }
+                }
+            }
+        }
+
+        // Hard cap: undo this round's rare-token additions and stop.
+        if (count_set(forced) > cfg.max_forced_count) {
+            forced = round_start;
+            break;
+        }
+
+        // Feed the tokens of every chunk forced since `baseline` into the pool.
+        bool pool_grew = false;
+        for (int c = 0; c < n_chunks; ++c) {
+            if (!forced[(size_t)c] || baseline[(size_t)c]) continue;
+            const int s = c * cfg.chunk_size;
+            const int e = std::min((int)ids.size(), (c + 1) * cfg.chunk_size);
+            for (int j = s; j < e; ++j) pool.push_back(ids[(size_t)j]);
+            pool_grew = true;
+        }
+        if (!pool_grew) break;   // fixed point: nothing new to search for
+
+        // Re-scan with the expanded pool; `baseline` doubles as the undo point
+        // because the hard cap applies to the n-gram expansion too.
+        baseline = forced;
+        scan_and_force(ids, body_end, pool, cfg, forced);
+        if (count_set(forced) > cfg.max_forced_count) {
+            forced = baseline;
+            break;
+        }
+    }
+}
+
+} // namespace dflash::qwen3
diff --git a/server/src/qwen3/anchor_scan.h b/server/src/qwen3/anchor_scan.h
new file mode 100644
index 00000000..8f75a085
--- /dev/null
+++ b/server/src/qwen3/anchor_scan.h
@@ -0,0 +1,42 @@
+// N-gram anchor scan: mark chunks forced by token-match between a query pool
+// and the body of an ids sequence.  Pure CPU, no GPU, no model required.
+#pragma once
+
+#include <climits>
+#include <cstdint>
+#include <vector>
+
+namespace dflash::qwen3 {
+
+struct AnchorScanCfg {                  // settings shared by scan_and_force and scan_and_force_transitive
+    int chunk_size = 32;                // tokens per chunk; must be > 0 because it is used as a divisor
+    int anchor_radius = 2;              // chunks forced on each side of a chunk that contains a match
+    int max_anchor_hits = 8;            // a query n-gram matching more often than this is ignored
+    int ngram = 4;                      // length, in tokens, of one match window
+    int rare_token_max_freq = 8;        // tokens appearing <= this many times in body count as rare (0 = rare stage off)
+    int cascade_min_anchor_count = 0;   // skip cascade if pass-1 forced >= this many chunks (0 = always cascade)
+    int max_forced_count = INT_MAX;     // hard cap on total forced chunks
+};
+
+// Marks chunks forced by ngram-matches between query_pool and ids[0..body_end).
+// `forced` is in-out; new hits are OR-merged.  Idempotent.
+void scan_and_force(
+    const std::vector<int32_t>& ids,          // token sequence; match windows are searched in ids[0..body_end)
+    int body_end,                             // exclusive end of the searched prefix of ids
+    const std::vector<int32_t>& query_pool,   // tokens whose cfg.ngram-long windows are looked up
+    const AnchorScanCfg& cfg,                 // a window with more than cfg.max_anchor_hits matches is ignored
+    std::vector<uint8_t>& forced              // one flag per chunk; a match sets the chunk and its cfg.anchor_radius neighbors to 1
+);
+
+// Transitive variant: after a plain scan_and_force pass it follows rare tokens out of forced chunks,
+// adds newly forced chunks' tokens to the query pool and re-scans, for at most max_iters rounds.
+void scan_and_force_transitive(
+    const std::vector<int32_t>& ids,                  // token sequence; only ids[0..body_end) is searched
+    int body_end,                                     // exclusive end of the searched prefix of ids
+    const std::vector<int32_t>& initial_query_pool,   // pool for the first pass; copied, never modified
+    const AnchorScanCfg& cfg,                         // also supplies rare_token_max_freq, cascade_min_anchor_count, max_forced_count
+    int max_iters,                                    // upper bound on cascade rounds after the first pass
+    std::vector<uint8_t>& forced                      // in-out, one flag per chunk; entries already set stay set
+);
+
+} // namespace dflash::qwen3
diff --git a/server/src/qwen3/qwen3_drafter.cpp b/server/src/qwen3/qwen3_drafter.cpp
index f65cb079..852fc96e 100644
--- a/server/src/qwen3/qwen3_drafter.cpp
+++ b/server/src/qwen3/qwen3_drafter.cpp
@@ -17,6 +17,7 @@
 #include "qwen3_drafter_model.h"
 #include "common/backend_precision.h"
 #include "internal.h"
+#include "anchor_scan.h"
 
 #include "ggml.h"
 #include "ggml-alloc.h"
@@ -64,11 +65,122 @@ static int env_int(const char * name, int fallback) {
     return fallback;
 }
 
-static void force_chunk_neighborhood(std::vector<uint8_t> & forced, int n_chunks,
-                                     int chunk, int radius) {
-    int lo = std::max(0, chunk - radius);
-    int hi = std::min(n_chunks - 1, chunk + radius);
-    for (int c = lo; c <= hi; ++c) forced[(size_t)c] = 1;
+static float env_float(const char * name, float def) {
+    const char * raw = std::getenv(name);
+    if (raw == nullptr) return def;   // variable not set
+    // std::stof throws on unparsable or out-of-range text; fall back to `def` then.
+    try { return std::stof(raw); } catch (...) { return def; }
+}
+
+// Anchor-scan and head/tail compression knobs read from env, derived per-request.
+// anchor_radius and max_anchor_hits use an adaptive ladder keyed on n_chunks
+// to prevent the 64K NIAH cliff; see docs/pflash-compress-cfg.md.
+// Override any ladder value via PFLASH_COMPRESS_* env vars.
+struct CompressCfg {
+    int   query_tokens;    // number of trailing prompt tokens used as the anchor query pool
+    int   head_chunks;     // leading chunks that are always kept
+    int   tail_chunks;     // trailing chunks that are always kept
+    dflash::qwen3::AnchorScanCfg anchor;  // anchor-scan settings; chunk_size is filled in by the caller
+    bool  use_transitive;  // true: scan_and_force_transitive, false: plain scan_and_force
+    int   max_iters;       // round limit handed to scan_and_force_transitive
+};
+
+// Builds the per-request CompressCfg from the PFLASH_COMPRESS_* / DFLASH_COMPRESS_*
+// environment variables.
+//   n_chunks  number of chunks in the prompt; picks the adaptive-ladder rung
+//   n_keep    chunk budget; head/tail counts and the cascade limits scale with it
+//   use_transitive_override  0 = cascade off, 1 = cascade on, other = decide from env
+// A PFLASH_ name wins over its legacy DFLASH_ twin; values < 0 count as unset.
+// Every knob has a built-in default. anchor.chunk_size is NOT set here — the
+// callers fill it in from their own chunk_size argument.
+static CompressCfg compress_cfg_from_env(int n_chunks, int n_keep,
+                                          int use_transitive_override = -1) {
+    // One reader for the "new name, else legacy name, else fallback" pattern.
+    // `warn` prints the deprecation notice when the legacy name supplies the value.
+    const auto pick_int = [](const char * name, const char * legacy, int fallback, bool warn) {
+        const int nv = env_int(name, -1);
+        const int lv = env_int(legacy, -1);
+        if (nv >= 0) return nv;
+        if (lv < 0)  return fallback;
+        if (warn) fprintf(stderr, "[WARN] %s deprecated, use %s\n", legacy, name);
+        return lv;
+    };
+
+    const auto pick_float = [](const char * name, const char * legacy, float fallback) {
+        const float nv = env_float(name, -1.0f);
+        const float lv = env_float(legacy, -1.0f);
+        if (nv >= 0.0f) return nv;
+        if (!(lv >= 0.0f)) return fallback;   // negative or NaN: treat as unset
+        fprintf(stderr, "[WARN] %s deprecated, use %s\n", legacy, name);
+        return lv;
+    };
+
+    // ratio * n_keep truncated to int, saturating at INT_MAX: converting an
+    // out-of-range float to int is undefined behaviour.
+    const auto scaled = [n_keep](float ratio) {
+        const float v = ratio * (float)n_keep;
+        return (v >= (float)INT_MAX) ? INT_MAX : (int)v;
+    };
+
+    CompressCfg c{};
+    // Number of trailing prompt tokens the callers use as the anchor query pool.
+    c.query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96);
+
+    // Head/tail forced chunks: when they would use up the whole budget, shrink
+    // them proportionally to n_keep - 1 so top-K scoring keeps a slot.
+    const int h_raw = env_int("DFLASH_COMPRESS_HEAD_CHUNKS", 8);
+    const int t_raw = env_int("DFLASH_COMPRESS_TAIL_CHUNKS", 24);
+    c.head_chunks = h_raw;
+    c.tail_chunks = t_raw;
+    if (h_raw + t_raw >= n_keep && h_raw + t_raw != 0) {   // != 0 guards the division
+        const int budget = std::max(1, n_keep - 1);
+        c.head_chunks = std::max(0, h_raw * budget / (h_raw + t_raw));
+        c.tail_chunks = std::max(0, budget - c.head_chunks);
+    }
+
+    // anchor_radius / max_anchor_hits: env override, else the adaptive ladder
+    // that prevents the 64K NIAH cliff (<32K=2, 32-64K=4, >=64K=8 for the
+    // radius); see docs/pflash-compress-cfg.md. These two legacy names are
+    // accepted without a deprecation warning.
+    const int rung = (n_chunks < 1024) ? 0 : (n_chunks < 2048) ? 1 : 2;
+    static const int kRadius[3] = {2, 4, 8};
+    static const int kHits[3]   = {8, 16, 32};
+    c.anchor.anchor_radius   = pick_int("PFLASH_COMPRESS_ANCHOR_RADIUS",
+                                        "DFLASH_COMPRESS_ANCHOR_RADIUS", kRadius[rung], false);
+    c.anchor.max_anchor_hits = pick_int("PFLASH_COMPRESS_MAX_ANCHOR_HITS",
+                                        "DFLASH_COMPRESS_MAX_ANCHOR_HITS", kHits[rung], false);
+
+    // A 0-token window would match at every position, so ngram < 1 falls back to 4.
+    c.anchor.ngram = pick_int("PFLASH_COMPRESS_ANCHOR_NGRAM",
+                              "DFLASH_COMPRESS_ANCHOR_NGRAM", 4, true);
+    if (c.anchor.ngram < 1) c.anchor.ngram = 4;
+
+    // 0 switches the rare-token bridge of scan_and_force_transitive off.
+    c.anchor.rare_token_max_freq = pick_int("PFLASH_COMPRESS_RARE_MAX_FREQ",
+                                            "DFLASH_COMPRESS_RARE_MAX_FREQ", 2, true);
+
+    // Both cascade limits are configured as a multiple of n_keep.
+    c.anchor.cascade_min_anchor_count =
+        scaled(pick_float("PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC",
+                          "DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", 0.0f));
+    c.anchor.max_forced_count =
+        scaled(pick_float("PFLASH_COMPRESS_MAX_FORCED_RATIO",
+                          "DFLASH_COMPRESS_MAX_FORCED_RATIO", 10.0f));
+
+    // Per-request router decision (0 = off, 1 = on) beats the env; without
+    // either the cascade is on, see docs/anchor-transitive.md.
+    if (use_transitive_override == 0 || use_transitive_override == 1) {
+        c.use_transitive = (use_transitive_override == 1);
+    } else {
+        c.use_transitive = pick_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE",
+                                    "DFLASH_COMPRESS_ANCHOR_TRANSITIVE", 1, true) != 0;
+    }
+
+    // Upper bound on cascade rounds run after the plain first pass.
+    c.max_iters = pick_int("PFLASH_COMPRESS_ANCHOR_MAX_ITERS",
+                           "DFLASH_COMPRESS_ANCHOR_MAX_ITERS", 3, true);
+
+    return c;
 }
 
 #if defined(DFLASH27B_BACKEND_HIP)
@@ -120,21 +232,6 @@ const char * drafter_arch_name(DrafterArch arch) {
     return "unknown";
 }
 
-bool load_drafter(const std::string & gguf_path, int /*gpu_layers*/,
-                  DrafterContext & out) {
-    return load_drafter(gguf_path, /*gpu_layers=*/999, /*gpu=*/0, out);
-}
-
-bool load_drafter(const std::string & gguf_path, int /*gpu_layers*/,
-                  int gpu, DrafterContext & out) {
-    return load_drafter(gguf_path, /*gpu_layers=*/999, DrafterArch::Qwen3_0p6b, gpu, out);
-}
-
-bool load_drafter(const std::string & gguf_path, int /*gpu_layers*/,
-                  DrafterArch arch, DrafterContext & out) {
-    return load_drafter(gguf_path, /*gpu_layers=*/999, arch, /*gpu=*/0, out);
-}
-
 bool load_drafter(const std::string & gguf_path, int /*gpu_layers*/,
                   DrafterArch arch, int gpu, DrafterContext & out) {
     if (gpu < 0) {
@@ -224,6 +321,22 @@ bool load_drafter(const std::string & gguf_path, int /*gpu_layers*/,
     return true;
 }
 
+// Thin overloads for API compat; all forward to the canonical 5-arg form (defaults: arch = DrafterArch::Qwen3_0p6b, gpu = 0).
+bool load_drafter(const std::string & gguf_path, int gpu_layers,
+                  DrafterContext & out) {
+    return load_drafter(gguf_path, gpu_layers, DrafterArch::Qwen3_0p6b, /*gpu=*/0, out);
+}
+
+bool load_drafter(const std::string & gguf_path, int gpu_layers,
+                  int gpu, DrafterContext & out) {
+    return load_drafter(gguf_path, gpu_layers, DrafterArch::Qwen3_0p6b, gpu, out);
+}
+
+bool load_drafter(const std::string & gguf_path, int gpu_layers,
+                  DrafterArch arch, DrafterContext & out) {
+    return load_drafter(gguf_path, gpu_layers, arch, /*gpu=*/0, out);
+}
+
 void free_drafter(DrafterContext & ctx) {
     free_drafter_weights(ctx);
     if (ctx.backend) {
@@ -254,7 +367,8 @@ static std::vector<int32_t> qwen35_score_and_compress(
     float keep_ratio,
     int chunk_size,
     int n_lookahead,
-    int pool_kernel) {
+    int pool_kernel,
+    int use_transitive_override = -1) {
 
     const int S = (int)ids.size();
     const int hidden = w.n_embd;
@@ -505,24 +619,23 @@ static std::vector<int32_t> qwen35_score_and_compress(
 
     const int n_chunks = (S + chunk_size - 1) / chunk_size;
     const int n_keep = std::max(1, (int)((float)n_chunks * keep_ratio));
-    
-    std::vector<float> smooth_score = score;
-    // Caller pool_kernel takes precedence; if zero/negative, fall back to env or 5.
+
     const int pk = (pool_kernel > 0)
         ? pool_kernel
         : std::max(3, env_int("DFLASH_COMPRESS_POOL_KERNEL", 5));
-    std::vector<float> smoothed((size_t)S, 0.0f);
-    int half = pk / 2;
-    for (int j = 0; j < S; ++j) {
-        int lo = std::max(0, j - half);
-        int hi = std::min(S - 1, j + half);
-        float s = 0.0f;
-        int n = 0;
-        for (int k = lo; k <= hi; ++k) { s += score[(size_t)k]; ++n; }
-        smoothed[(size_t)j] = (n > 0) ? (s / (float)n) : 0.0f;
+    std::vector<float> smooth_score((size_t)S, 0.0f);
+    {
+        int half = pk / 2;
+        for (int j = 0; j < S; ++j) {
+            int lo = std::max(0, j - half);
+            int hi = std::min(S - 1, j + half);
+            float s = 0.0f;
+            int n = 0;
+            for (int k = lo; k <= hi; ++k) { s += score[(size_t)k]; ++n; }
+            smooth_score[(size_t)j] = (n > 0) ? (s / (float)n) : 0.0f;
+        }
     }
-    smooth_score.swap(smoothed);
-    
+
     std::vector<std::pair<float, int>> chunk_means;
     for (int c = 0; c < n_chunks; ++c) {
         int lo = c * chunk_size, hi = std::min(S, lo + chunk_size);
@@ -531,50 +644,28 @@ static std::vector<int32_t> qwen35_score_and_compress(
         chunk_means.push_back({s / std::max(1, hi - lo), c});
     }
     std::sort(chunk_means.begin(), chunk_means.end(), [](auto a, auto b) { return a.first > b.first; });
-    
+
+    const CompressCfg cfg = compress_cfg_from_env(n_chunks, n_keep, use_transitive_override);
+
     std::vector<uint8_t> selected((size_t)n_chunks, 0);
     int count = 0;
-    // Scale head/tail forced chunks so they don't crowd out top-K scoring.
-    {
-        const int h_raw = env_int("DFLASH_COMPRESS_HEAD_CHUNKS", 8);
-        const int t_raw = env_int("DFLASH_COMPRESS_TAIL_CHUNKS", 24);
-        int h_n = h_raw, t_n = t_raw;
-        if (h_n + t_n >= n_keep) {
-            const int budget = std::max(1, n_keep - 1);
-            h_n = std::max(0, h_raw * budget / (h_raw + t_raw));
-            t_n = std::max(0, budget - h_n);
-        }
-        for (int c = 0; c < std::min(n_chunks, h_n); ++c) { selected[(size_t)c] = 1; ++count; }
-        for (int c = std::max(0, n_chunks - t_n); c < n_chunks; ++c) if (!selected[(size_t)c]) { selected[(size_t)c] = 1; ++count; }
-    }
+    for (int c = 0; c < std::min(n_chunks, cfg.head_chunks); ++c) { selected[(size_t)c] = 1; ++count; }
+    for (int c = std::max(0, n_chunks - cfg.tail_chunks); c < n_chunks; ++c) if (!selected[(size_t)c]) { selected[(size_t)c] = 1; ++count; }
 
-    const int query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96);
-    const int anchor_radius = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", 2);
-    const int max_anchor_hits = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", 8);
+    const int q0 = std::max(0, S - cfg.query_tokens);
+    std::vector<int32_t> query_pool(ids.begin() + q0, ids.end());
     std::vector<uint8_t> forced((size_t)n_chunks, 0);
 
-    const int q0 = std::max(0, S - query_tokens);
-    constexpr int NGRAM = 4;
-    for (int q = q0; q + NGRAM <= S; ++q) {
-        int hits = 0;
-        int hit_pos[8];
-        const int search_end = std::max(0, q0 - NGRAM);
-        for (int p = 0; p <= search_end && hits <= max_anchor_hits; ++p) {
-            bool same = true;
-            for (int k = 0; k < NGRAM; ++k) {
-                if (ids[(size_t)p + k] != ids[(size_t)q + k]) { same = false; break; }
-            }
-            if (same) {
-                if (hits < 8) hit_pos[hits] = p;
-                ++hits;
-            }
-        }
-        if (hits > 0 && hits <= max_anchor_hits) {
-            for (int i = 0; i < hits && i < 8; ++i) {
-                force_chunk_neighborhood(forced, n_chunks, hit_pos[i] / chunk_size, anchor_radius);
-            }
-        }
+    dflash::qwen3::AnchorScanCfg anchor_cfg = cfg.anchor;
+    anchor_cfg.chunk_size = chunk_size;
+
+    if (cfg.use_transitive) {
+        dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool,
+                                                  anchor_cfg, cfg.max_iters, forced);
+    } else {
+        dflash::qwen3::scan_and_force(ids, q0, query_pool, anchor_cfg, forced);
     }
+
     for (int c = 0; c < n_chunks; ++c) {
         if (forced[(size_t)c] && !selected[(size_t)c]) {
             selected[(size_t)c] = 1;
@@ -582,16 +673,14 @@ static std::vector<int32_t> qwen35_score_and_compress(
         }
     }
 
-    // Global aggregation tasks often depend on repeated rare tokens that do
-    // not appear in the final query. Preserve high-frequency-but-not-filler
-    // token chunks before filling with model-score top-K.
+    // Global aggregation tasks: preserve high-frequency-but-not-filler token chunks.
     const int repeat_min = env_int("DFLASH_COMPRESS_REPEAT_MIN", 4);
     const int repeat_max = env_int("DFLASH_COMPRESS_REPEAT_MAX", 32);
     const int repeat_limit = env_int("DFLASH_COMPRESS_REPEAT_CHUNKS", n_keep);
     if (repeat_min > 1 && count < repeat_limit) {
         std::unordered_map<int32_t, int> freq;
         freq.reserve((size_t)S);
-        const int repeat_scan_end = std::max(0, S - query_tokens);
+        const int repeat_scan_end = std::max(0, S - cfg.query_tokens);
         for (int j = 0; j < repeat_scan_end; ++j) {
             ++freq[ids[(size_t)j]];
         }
@@ -619,12 +708,12 @@ static std::vector<int32_t> qwen35_score_and_compress(
             }
         }
     }
-    
+
     for (auto [_, c] : chunk_means) {
         if (count >= n_keep) break;
         if (!selected[(size_t)c]) { selected[(size_t)c] = 1; ++count; }
     }
-    
+
     std::vector<int32_t> out_ids;
     std::vector<int> selected_chunks;
     for (int c = 0; c < n_chunks; ++c) {
@@ -660,7 +749,8 @@ std::vector<int32_t> drafter_score_and_compress(
     float keep_ratio,
     int chunk_size,
     int n_lookahead,
-    int pool_kernel) {
+    int pool_kernel,
+    int use_transitive_override) {
     if (!ctx.loaded) {
         set_last_error("drafter not loaded");
         return {};
@@ -671,7 +761,7 @@ std::vector<int32_t> drafter_score_and_compress(
             return {};
         }
         auto * st = static_cast<Qwen35DrafterState *>(ctx.arch_state);
-        return qwen35_score_and_compress(st->weights, ids, keep_ratio, chunk_size, n_lookahead, pool_kernel);
+        return qwen35_score_and_compress(st->weights, ids, keep_ratio, chunk_size, n_lookahead, pool_kernel, use_transitive_override);
     }
     const int S = (int)ids.size();
     if (S < n_lookahead + 1) {
@@ -728,46 +818,27 @@ std::vector<int32_t> drafter_score_and_compress(
     std::sort(chunk_means.begin(), chunk_means.end(),
                       [](auto a, auto b) { return a.first > b.first; });
 
-    // Retrieval tasks often repeat a rare key in the final query and in the
-    // needle span. Exact scores alone can keep the query while dropping the
-    // neighboring answer chunk, so force a small token-only anchor neighborhood.
-    // Head/tail forced chunks scale with n_keep so top-K scoring always gets slots.
-    const int h_raw = env_int("DFLASH_COMPRESS_HEAD_CHUNKS", 8);
-    const int t_raw = env_int("DFLASH_COMPRESS_TAIL_CHUNKS", 24);
-    int head_chunks = h_raw, tail_chunks = t_raw;
-    if (head_chunks + tail_chunks >= n_keep) {
-        const int budget = std::max(1, n_keep - 1);
-        head_chunks = std::max(0, h_raw * budget / (h_raw + t_raw));
-        tail_chunks = std::max(0, budget - head_chunks);
-    }
-    const int query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96);
-    const int anchor_radius = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", 2);
-    const int max_anchor_hits = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", 8);
+    const CompressCfg cfg = compress_cfg_from_env(n_chunks, n_keep, use_transitive_override);
+
     std::vector<uint8_t> selected_mask((size_t)n_chunks, 0);
     std::vector<uint8_t> forced((size_t)n_chunks, 0);
-    for (int c = 0; c < std::min(n_chunks, head_chunks); ++c) forced[(size_t)c] = 1;
-    for (int c = std::max(0, n_chunks - tail_chunks); c < n_chunks; ++c) forced[(size_t)c] = 1;
-
-    const int q0 = std::max(0, S - query_tokens);
-    constexpr int NGRAM = 4;
-    for (int q = q0; q + NGRAM <= S; ++q) {
-        int hits = 0;
-        int hit_pos[8];
-        const int search_end = std::max(0, q0 - NGRAM);
-        for (int p = 0; p <= search_end && hits <= max_anchor_hits; ++p) {
-            bool same = true;
-            for (int k = 0; k < NGRAM; ++k) {
-                if (ids[(size_t)p + k] != ids[(size_t)q + k]) { same = false; break; }
-            }
-            if (same) {
-                if (hits < 8) hit_pos[hits] = p;
-                ++hits;
-            }
-        }
-        if (hits > 0 && hits <= max_anchor_hits) {
-            for (int i = 0; i < hits && i < 8; ++i) {
-                force_chunk_neighborhood(forced, n_chunks, hit_pos[i] / chunk_size, anchor_radius);
-            }
+    for (int c = 0; c < std::min(n_chunks, cfg.head_chunks); ++c) forced[(size_t)c] = 1;
+    for (int c = std::max(0, n_chunks - cfg.tail_chunks); c < n_chunks; ++c) forced[(size_t)c] = 1;
+
+    const int q0 = std::max(0, S - cfg.query_tokens);
+    {
+        std::vector<int32_t> query_pool(ids.begin() + q0, ids.end());
+        dflash::qwen3::AnchorScanCfg anchor_cfg = cfg.anchor;
+        anchor_cfg.chunk_size = chunk_size;
+        std::fprintf(stderr, "[drafter_cascade] n_keep=%d max_forced=%d min_anchor=%d\n",
+            n_keep, anchor_cfg.max_forced_count, anchor_cfg.cascade_min_anchor_count);
+        std::fflush(stderr);
+
+        if (cfg.use_transitive) {
+            dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool,
+                                                      anchor_cfg, cfg.max_iters, forced);
+        } else {
+            dflash::qwen3::scan_and_force(ids, q0, query_pool, anchor_cfg, forced);
         }
     }
 
@@ -824,4 +895,18 @@ std::vector<int32_t> drafter_score_and_compress(
     return out;
 }
 
+// ABI-stable 6-arg overload — callers built against the old signature still link here.
+// Forwards with use_transitive_override = -1, i.e. the env decides whether the cascade runs.
+std::vector<int32_t> drafter_score_and_compress(
+    DrafterContext & ctx,
+    const std::vector<int32_t> & ids,
+    float keep_ratio,
+    int chunk_size,
+    int n_lookahead,
+    int pool_kernel) {
+    return drafter_score_and_compress(ctx, ids, keep_ratio,
+                                      chunk_size, n_lookahead, pool_kernel,
+                                      /*use_transitive_override=*/-1);
+}
+
 } // namespace dflash::common
diff --git a/server/src/qwen3/qwen3_drafter.h b/server/src/qwen3/qwen3_drafter.h
index e5424f9d..08aed3e9 100644
--- a/server/src/qwen3/qwen3_drafter.h
+++ b/server/src/qwen3/qwen3_drafter.h
@@ -66,13 +66,27 @@ void free_drafter_weights(DrafterContext & ctx);
 // Score importance per token via Liu Q-hook tail attention, then chunk-top-K
 // span merge. Returns surviving token IDs (drafter vocab).
 //
-//   ids          input token IDs of length S
-//   keep_ratio   fraction of `chunk_size`-token chunks to keep
-//   chunk_size   span granularity (default 32)
-//   n_lookahead  trailing Q tokens used for tail attention (default 8)
-//   pool_kernel  AvgPool kernel for score smoothing (default 13)
+//   ids                    input token IDs of length S
+//   keep_ratio             fraction of `chunk_size`-token chunks to keep
+//   chunk_size             span granularity (default 32)
+//   n_lookahead            trailing Q tokens used for tail attention (default 8)
+//   pool_kernel            AvgPool kernel for score smoothing (default 13)
+//   use_transitive_override  -1 = read from env (default, no behaviour change)
+//                             0 = cascade off (agentic path)
+//                             1 = cascade on  (retrieval path)
 //
 // On failure returns empty vector + sets last_error.
+std::vector<int32_t> drafter_score_and_compress(
+    DrafterContext & ctx,
+    const std::vector<int32_t> & ids,
+    float  keep_ratio,
+    int    chunk_size,
+    int    n_lookahead,
+    int    pool_kernel,
+    int    use_transitive_override);
+
+// Backward-compatible 6-arg overload — ABI-stable wrapper, defined in qwen3_drafter.cpp.
+// Old callers compiled against the 6-arg signature continue to link without recompile.
 std::vector<int32_t> drafter_score_and_compress(
     DrafterContext & ctx,
     const std::vector<int32_t> & ids,
diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h
index 71c544ac..117514f7 100644
--- a/server/src/server/http_server.h
+++ b/server/src/server/http_server.h
@@ -144,7 +144,7 @@ struct ServerConfig {
     enum class PflashMode { OFF, AUTO, ALWAYS };
     PflashMode  pflash_mode      = PflashMode::OFF;
     int         pflash_threshold = 32000;   // token count threshold for AUTO mode
-    float       pflash_keep_ratio = 0.05f;  // fraction of tokens to keep
+    float       pflash_keep_ratio = 0.10f;  // fraction of tokens to keep
     std::string pflash_drafter_path;        // path to drafter GGUF (Qwen3-0.6B)
     int         pflash_drafter_gpu = 0;     // backend-local GPU for PFlash drafter
     bool        pflash_remote_drafter = false; // use IPC drafter for mixed backends
diff --git a/server/src/server/server_main.cpp b/server/src/server/server_main.cpp
index 5f00d4df..9438bc38 100644
--- a/server/src/server/server_main.cpp
+++ b/server/src/server/server_main.cpp
@@ -209,7 +209,7 @@ static void print_usage(const char * prog) {
         "PFlash (speculative prefill compression):\n"
         "  --prefill-compression off|auto|always  (default: off)\n"
         "  --prefill-threshold <N>     Token threshold for auto mode (default: 32000)\n"
-        "  --prefill-keep-ratio <F>    Fraction of tokens to keep (default: 0.05)\n"
+        "  --prefill-keep-ratio <F>    Fraction of tokens to keep (default: 0.10)\n"
         "  --prefill-drafter <path>    Drafter GGUF for compression (Qwen3-0.6B)\n"
         "  --prefill-skip-park         Skip park/unpark (for >=32GB GPUs)\n"
         "  --draft-residency auto|persistent|request-scoped\n"
diff --git a/server/test/test_anchor_transitive.cpp b/server/test/test_anchor_transitive.cpp
new file mode 100644
index 00000000..ae8a0bbc
--- /dev/null
+++ b/server/test/test_anchor_transitive.cpp
@@ -0,0 +1,355 @@
+// TDD: anchor transitive multi-pass.
+//
+// T1/T2 — single-pass: query match preserved (T1); chain hops missed (T2, characterises limitation)
+// T3/T4 — transitive: rescues all hops via 4-gram chain (T3) / rare-token bridge (T4)
+// T5/T6 — cascade gate (cascade_min_anchor_count, T5) / hard cap (max_forced_count, T6)
+//
+// Pure CPU — no GPU, no model load.
+#include "../src/qwen3/anchor_scan.h"
+
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+#define REQUIRE(cond) /* on failure: print file, line and expression to stderr, then exit(1) */ \
+    do { if (!(cond)) { \
+        std::fprintf(stderr, "FAIL: %s line %d: %s\n", __FILE__, __LINE__, #cond); \
+        std::exit(1); \
+    } } while (0)
+
+static constexpr int32_t FILLER = 1;  // background token every fixture sequence is filled with
+static constexpr int32_t M1 = 1001, M2 = 1002, M3 = 1003;  // marker tokens for the T1–T3 hop chains
+static constexpr int CHUNK = 64;  // chunk size: first AnchorScanCfg field and position→chunk divisor
+
+// Overwrite ids[pos..pos+3] with the marker 4-gram [FILLER, FILLER, marker, FILLER] (no bounds check).
+static void place_marker_4gram(std::vector<int32_t>& ids, int pos, int32_t marker) {
+    ids[(size_t)pos]     = FILLER;
+    ids[(size_t)pos + 1] = FILLER;
+    ids[(size_t)pos + 2] = marker;  // the marker is the third token of the 4-gram
+    ids[(size_t)pos + 3] = FILLER;
+}
+
+// T1 — regression pin: single-pass scan_and_force forces the body chunk matched by the query.
+static void t1_single_pass_match() {
+    const int N = 2048;
+    std::vector<int32_t> ids((size_t)N, FILLER);
+
+    // Body marker 4-gram at pos 100..103 (chunk 1); M3 itself sits at pos 102.
+    place_marker_4gram(ids, 100, M3);
+    // Same 4-gram as the last four tokens (pos 2044..2047), inside the query window.
+    place_marker_4gram(ids, 2044, M3);
+
+    const int q0 = 1948;  // N - 100: query pool = the last 100 tokens
+    std::vector<int32_t> query_pool(ids.begin() + q0, ids.end());
+
+    const int n_chunks = (N + CHUNK - 1) / CHUNK;  // ceil(N / CHUNK) = 32
+    std::vector<uint8_t> forced((size_t)n_chunks, 0);  // one flag per chunk, all clear
+
+    dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0,
+                                     /*max_anchor_hits=*/8, /*ngram=*/4};
+    dflash::qwen3::scan_and_force(ids, q0, query_pool, cfg, forced);
+
+    // The chunk containing pos 100 must now be flagged.
+    const int target_chunk = 100 / CHUNK;  // chunk 1
+    REQUIRE(forced[(size_t)target_chunk] == 1);
+
+    std::printf("T1 PASS: chunk %d forced by single-pass M3 match\n", target_chunk);
+}
+
+// T2 — characterises the single-pass limitation: only the direct query match is forced.
+static void t2_single_pass_misses_hops() {
+    const int N = 2048;
+    std::vector<int32_t> ids((size_t)N, FILLER);
+
+    // hop1 at pos 200 (chunk 3): contains M1 (at pos 202).
+    place_marker_4gram(ids, 200, M1);
+
+    // hop2 at pos 600 (chunk 9): contains M2 (pos 602) + M1 (pos 606, bridge to hop1).
+    place_marker_4gram(ids, 600, M2);
+    place_marker_4gram(ids, 604, M1);
+
+    // hop3 at pos 1200 (chunk 18): contains M3 (pos 1202) + M2 (pos 1206, bridge to hop2).
+    place_marker_4gram(ids, 1200, M3);
+    place_marker_4gram(ids, 1204, M2);
+
+    // Query suffix at pos 2044: contains M3 only, so only hop3 matches the query directly.
+    place_marker_4gram(ids, 2044, M3);
+
+    const int q0 = 1948;  // query pool = the last 100 tokens
+    std::vector<int32_t> query_pool(ids.begin() + q0, ids.end());
+
+    const int n_chunks = (N + CHUNK - 1) / CHUNK;  // 32
+    std::vector<uint8_t> forced((size_t)n_chunks, 0);
+
+    dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0,
+                                     /*max_anchor_hits=*/8, /*ngram=*/4};
+    dflash::qwen3::scan_and_force(ids, q0, query_pool, cfg, forced);
+
+    const int chunk_hop3 = 1200 / CHUNK;  // 18
+    const int chunk_hop2 = 600  / CHUNK;  // 9
+    const int chunk_hop1 = 200  / CHUNK;  // 3
+
+    // Single-pass: the direct M3 match at pos 1200 is forced; the bridged hops are not.
+    REQUIRE(forced[(size_t)chunk_hop3] == 1);
+    REQUIRE(forced[(size_t)chunk_hop2] == 0);
+    REQUIRE(forced[(size_t)chunk_hop1] == 0);
+
+    std::printf("T2 PASS: chunk(%d) forced, chunk(%d) and chunk(%d) NOT forced (single-pass)\n",
+                chunk_hop3, chunk_hop2, chunk_hop1);
+}
+
+// T3 — scan_and_force_transitive rescues every hop of the chain that single-pass misses in T2.
+static void t3_transitive_rescues_all() {
+    const int N = 2048;
+    std::vector<int32_t> ids((size_t)N, FILLER);
+
+    place_marker_4gram(ids, 200, M1);   // hop1 (chunk 3): M1
+
+    place_marker_4gram(ids, 600, M2);   // hop2 (chunk 9): M2 ...
+    place_marker_4gram(ids, 604, M1);   // ... plus M1, the bridge to hop1
+
+    place_marker_4gram(ids, 1200, M3);  // hop3 (chunk 18): M3 ...
+    place_marker_4gram(ids, 1204, M2);  // ... plus M2, the bridge to hop2
+
+    place_marker_4gram(ids, 2044, M3);  // query suffix: M3 only
+
+    const int q0 = 1948;  // query pool = the last 100 tokens
+    std::vector<int32_t> initial_query_pool(ids.begin() + q0, ids.end());
+
+    const int n_chunks = (N + CHUNK - 1) / CHUNK;  // 32
+    std::vector<uint8_t> forced((size_t)n_chunks, 0);
+
+    dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0,
+                                     /*max_anchor_hits=*/8, /*ngram=*/4};
+    dflash::qwen3::scan_and_force_transitive(ids, q0, initial_query_pool,
+                                              cfg, /*max_iters=*/3, forced);  // presumably one pass per hop
+
+    const int chunk_hop3 = 1200 / CHUNK;  // 18
+    const int chunk_hop2 = 600  / CHUNK;  // 9
+    const int chunk_hop1 = 200  / CHUNK;  // 3
+
+    REQUIRE(forced[(size_t)chunk_hop3] == 1);  // direct query match
+    REQUIRE(forced[(size_t)chunk_hop2] == 1);  // reached through M2
+    REQUIRE(forced[(size_t)chunk_hop1] == 1);  // reached through M1
+
+    std::printf("T3 PASS: all hops forced transitively\n");
+}
+
+// T4 — variable-name reuse across templates; needs the rare-token match (rare_token_max_freq).
+//
+// Token layout:
+//   FILLER=1, V1=2001(X42), V2=2002(Y42), V3=2003(Z42)
+//   Template-context tokens: A=3001,B=3002,C=3003,D=3004,E=3005,F=3006
+//   Query-match tokens: X1=4001,X2=4002,X3=4003
+//
+// hop3 (chunk 18, pos 1200): [X1,X2,V3,X3,E,V2,F,FILL] — 4-gram [X1,X2,V3,X3] matches query
+// hop2 (chunk  9, pos  600): [C,V2,FILL,V1,D,FILL,FILL] — V2 in DIFFERENT context than hop3
+// hop1 (chunk  3, pos  200): [A,V1,FILL,B]              — V1 in DIFFERENT context than hop2
+// query (pos 2044):          [X1,X2,V3,X3]              — matches hop3 4-gram exactly
+//
+// Pass 1 (4-gram): forces hop3.
+// Pass 1 rare-token: V2 (freq=2) found in hop3 → also at pos 601 (hop2 chunk 9) → forces hop2.
+// Pass 2 rare-token: V1 (freq=2) found in hop2 → also at pos 201 (hop1 chunk 3) → forces hop1.
+// A 4-gram-only cascade cannot bridge these hops: V2 4-grams in hop3 ≠ V2 4-grams in hop2.
+static void t4_rare_token_bridges_different_context() {
+    static constexpr int32_t V1 = 2001, V2 = 2002, V3 = 2003;
+    static constexpr int32_t A = 3001, B = 3002, C = 3003, D = 3004, E = 3005, F = 3006;
+    static constexpr int32_t X1 = 4001, X2 = 4002, X3 = 4003;
+
+    const int N = 2048;
+    std::vector<int32_t> ids((size_t)N, FILLER);
+
+    // hop1 (chunk 3, pos 200): [A, V1, FILL, B]
+    ids[200] = A; ids[201] = V1; ids[202] = FILLER; ids[203] = B;
+
+    // hop2 (chunk 9, pos 600): [C, V2, FILL, V1, D, FILL, FILL]
+    ids[600] = C; ids[601] = V2; ids[602] = FILLER; ids[603] = V1;
+    ids[604] = D; ids[605] = FILLER; ids[606] = FILLER;
+
+    // hop3 (chunk 18, pos 1200): [X1, X2, V3, X3, E, V2, F, FILL]
+    // V2 here is in 4-gram context [E,V2,F,FILL] — differs from hop2's [C,V2,FILL,V1]
+    ids[1200] = X1; ids[1201] = X2; ids[1202] = V3; ids[1203] = X3;
+    ids[1204] = E;  ids[1205] = V2; ids[1206] = F;  ids[1207] = FILLER;
+
+    // query suffix (pos 2044): [X1, X2, V3, X3] — exact 4-gram match to hop3
+    ids[2044] = X1; ids[2045] = X2; ids[2046] = V3; ids[2047] = X3;
+
+    const int q0 = 1948;  // query pool = the last 100 tokens
+    std::vector<int32_t> initial_query_pool(ids.begin() + q0, ids.end());
+
+    const int n_chunks = (N + CHUNK - 1) / CHUNK;  // 32
+    std::vector<uint8_t> forced((size_t)n_chunks, 0);
+
+    dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0,
+                                     /*max_anchor_hits=*/8, /*ngram=*/4,
+                                     /*rare_token_max_freq=*/8};
+    dflash::qwen3::scan_and_force_transitive(ids, q0, initial_query_pool,
+                                              cfg, /*max_iters=*/3, forced);
+
+    const int chunk_hop3 = 1200 / CHUNK;  // 18
+    const int chunk_hop2 =  600 / CHUNK;  //  9
+    const int chunk_hop1 =  200 / CHUNK;  //  3
+
+    REQUIRE(forced[(size_t)chunk_hop3] == 1);  // direct 4-gram match with the query
+    REQUIRE(forced[(size_t)chunk_hop2] == 1);  // bridged by V2
+    REQUIRE(forced[(size_t)chunk_hop1] == 1);  // bridged by V1
+
+    std::printf("T4 PASS: all hops forced via rare-token bridge (V2 freq=2, V1 freq=2)\n");
+}
+
+// T5: gate closes when pass-1 already finds >= cascade_min_anchor_count chunks.
+//
+// Layout (N=4096, chunk=64 → 64 chunks):
+//   A common 4-gram [CMN,CMN,CMN,CMN] appears once in each of body chunks 0..49 (50 hits).
+//   One of those chunks (chunk 5, pos 320) also contains a rare token RT (freq=2 in total).
+//   RT's only other occurrence is at a separate body position in chunk 60 (pos 3840).
+//   Query suffix contains the common 4-gram → pass-1 forces all 50 matching chunks.
+//
+// With cascade_min_anchor_count=5: gained=50 >= 5 → gate closes → cascade skipped.
+// chunk 60 (pos 3840, which has RT but is only reachable via cascade) stays UNFORCED.
+//
+// With cascade_min_anchor_count=0: gate open → cascade runs → chunk 60 gets forced.
+// This contrast proves the gate is operative.
+static void t5_gate_closes_when_pass1_finds_many() {
+    static constexpr int32_t CMN = 5001;  // common token (4-gram made of it)
+    static constexpr int32_t RT  = 5002;  // rare token (freq=2)
+
+    const int N = 4096;
+    const int n_chunks = (N + CHUNK - 1) / CHUNK;  // 64
+    std::vector<int32_t> ids((size_t)N, FILLER);
+
+    // Place common 4-gram at 50 scattered body positions (chunks 0..49).
+    // Spaced 64 tokens (one CHUNK) apart so each copy lands in a different chunk.
+    for (int i = 0; i < 50; ++i) {
+        int pos = i * 64 + 4;  // pos 4, 68, 132, ... (well within body)
+        ids[(size_t)pos]     = CMN;
+        ids[(size_t)pos + 1] = CMN;
+        ids[(size_t)pos + 2] = CMN;
+        ids[(size_t)pos + 3] = CMN;
+    }
+
+    // RT appears in chunk 5 (pos 320) and chunk 60 (pos 3840).
+    ids[320] = RT;
+    ids[3840] = RT;
+
+    // Query suffix: just the common 4-gram so pass-1 fires on all 50 body positions.
+    const int q0 = N - 32;  // 4064: query pool = the last 32 tokens
+    ids[(size_t)q0]     = CMN;
+    ids[(size_t)q0 + 1] = CMN;
+    ids[(size_t)q0 + 2] = CMN;
+    ids[(size_t)q0 + 3] = CMN;
+    std::vector<int32_t> query_pool(ids.begin() + q0, ids.end());
+
+    // --- Test A: gate CLOSED (cascade_min_anchor_count=5) ---
+    {
+        std::vector<uint8_t> forced_a((size_t)n_chunks, 0);
+        dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0,
+                                         /*max_anchor_hits=*/64, /*ngram=*/4,
+                                         /*rare_token_max_freq=*/2,
+                                         /*cascade_min_anchor_count=*/5,
+                                         /*max_forced_count=*/INT_MAX};
+        dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool,
+                                                  cfg, /*max_iters=*/3, forced_a);
+
+        // Pass-1 forces chunks 0..49 (50 chunks); gate closes → cascade skipped.
+        // chunk 60 (pos 3840 has RT but only reachable via cascade) must be UNFORCED.
+        const int chunk_rt_extra = 3840 / CHUNK;  // 60
+        REQUIRE(forced_a[(size_t)chunk_rt_extra] == 0);
+        // chunk 5 (contains RT at pos 320) is forced by pass-1 (common 4-gram at pos 324).
+        REQUIRE(forced_a[5] == 1);
+
+        std::printf("T5a PASS: gate closed (gained=50 >= min=5), chunk %d unforced\n",
+                    chunk_rt_extra);
+    }
+
+    // --- Test B: gate OPEN (cascade_min_anchor_count=0) → cascade forces chunk 60 ---
+    {
+        std::vector<uint8_t> forced_b((size_t)n_chunks, 0);
+        dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0,
+                                         /*max_anchor_hits=*/64, /*ngram=*/4,
+                                         /*rare_token_max_freq=*/2,
+                                         /*cascade_min_anchor_count=*/0,
+                                         /*max_forced_count=*/INT_MAX};
+        dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool,
+                                                  cfg, /*max_iters=*/3, forced_b);
+
+        // Cascade runs; chunk 5 is forced by pass-1 and contains RT;
+        // RT at pos 3840 → chunk 60 forced via rare-token cascade.
+        const int chunk_rt_extra = 3840 / CHUNK;  // 60
+        REQUIRE(forced_b[(size_t)chunk_rt_extra] == 1);
+
+        std::printf("T5b PASS: gate open (min=0), cascade forced chunk %d via RT\n",
+                    chunk_rt_extra);
+    }
+}
+
+// T6: hard cap (max_forced_count) prevents runaway cascade.
+//
+// Layout (N=2048, chunk=64 → 32 chunks):
+//   Query contains 4-gram [TGR,TGR,TGR,TGR] which matches body chunk 0.
+//   Chunk 0 contains chain token C0 (freq=2): also appears in chunk 1.
+//   Chunk 1 contains chain token C1 (freq=2): also appears in chunk 2.
+//   ... 20 such chain links.
+//   Pass-1 forces chunk 0 (1 chunk gained; cascade_min_anchor_count=0 → gate open).
+//   Cascade rare-token worklist propagates: chunk 0→1→2→...→20 (20 more).
+//   max_forced_count=5 → cascade must stop before the total exceeds 5. Result: forced <= 5.
+static void t6_hard_cap_prevents_runaway() {
+    static constexpr int32_t TGR = 7000;  // trigger token for 4-gram pass-1 match
+
+    const int N = 2048;
+    const int n_chunks = (N + CHUNK - 1) / CHUNK;  // 32
+    std::vector<int32_t> ids((size_t)N, FILLER);
+
+    // body chunk 0 (pos 0): place 4-gram [TGR,TGR,TGR,TGR] so pass-1 forces it.
+    ids[0] = TGR; ids[1] = TGR; ids[2] = TGR; ids[3] = TGR;
+
+    // Rare-token chain: C_i (= 7100 + i) appears in chunk i (offset 8) and chunk i+1 (offset 9).
+    // Offsets 8 and 9 within each chunk don't collide between consecutive tokens.
+    // Cascade worklist: chunk i forced → C_i found at offset 8 → chunk i+1 forced.
+    for (int i = 0; i < 20; ++i) {
+        int32_t tok = 7100 + i;
+        ids[(size_t)(i * 64 + 8)]           = tok;  // in chunk i, offset 8
+        ids[(size_t)((i + 1) * 64 + 9)]     = tok;  // in chunk i+1, offset 9
+    }
+
+    // Query suffix: contains [TGR,TGR,TGR,TGR] → pass-1 matches body chunk 0.
+    const int q0 = N - 64;  // 1984: query pool = the last 64 tokens (chunk 31)
+    ids[(size_t)q0]     = TGR;
+    ids[(size_t)q0 + 1] = TGR;
+    ids[(size_t)q0 + 2] = TGR;
+    ids[(size_t)q0 + 3] = TGR;
+    std::vector<int32_t> query_pool(ids.begin() + q0, ids.end());
+
+    // Without cap: cascade forces chunks 0..20 (21 chunks total).
+    // With cap=5: at most 5 chunks may end up forced.
+    std::vector<uint8_t> forced((size_t)n_chunks, 0);
+    dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0,
+                                     /*max_anchor_hits=*/8, /*ngram=*/4,
+                                     /*rare_token_max_freq=*/2,
+                                     /*cascade_min_anchor_count=*/0,
+                                     /*max_forced_count=*/5};
+    dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool,
+                                              cfg, /*max_iters=*/25, forced);  // 25 > chain length 20
+
+    int total_forced = 0;  // number of chunks flagged after the scan
+    for (int c = 0; c < n_chunks; ++c) total_forced += (int)forced[(size_t)c];
+
+    REQUIRE(total_forced <= 5);
+    REQUIRE(forced[0] == 1);  // chunk 0 always forced by pass-1
+
+    std::printf("T6 PASS: hard cap engaged, forced=%d (cap=5, chain length=20)\n",
+                total_forced);
+}
+
+int main() {  // runs T1..T6 in order; a failing REQUIRE exits with status 1 immediately
+    t1_single_pass_match();
+    t2_single_pass_misses_hops();
+    t3_transitive_rescues_all();
+    t4_rare_token_bridges_different_context();
+    t5_gate_closes_when_pass1_finds_many();
+    t6_hard_cap_prevents_runaway();
+    std::printf("\nAll anchor_transitive tests passed.\n");  // reached only if every test passed
+    return 0;
+}