diff --git a/docs/anchor-transitive.md b/docs/anchor-transitive.md new file mode 100644 index 00000000..6f1b02f8 --- /dev/null +++ b/docs/anchor-transitive.md @@ -0,0 +1,15 @@ +# anchor transitive scan + +`scan_and_force_transitive` (anchor_scan.cpp) expands the query pool with +tokens from newly-forced chunks and re-runs `scan_and_force` until fixed +point or max_iters (default 3) is reached. + +Improves multi-hop retrieval: enables discovery of intermediate context +chunks whose tokens do not appear in the original query but connect +query-to-needle via shared rare tokens. + +Empirical result: F1=0.628 on LongBench HotpotQA at ee7 + keep=0.15 +(vs uncompressed F1=0.697). This is the ceiling for attention-score-based +prefill compression on this task; see bench/2026-05-25_longbench_hotpotqa/. + +On by default. Disable via PFLASH_COMPRESS_ANCHOR_TRANSITIVE=0. diff --git a/docs/pflash-compress-cfg.md b/docs/pflash-compress-cfg.md new file mode 100644 index 00000000..5755e314 --- /dev/null +++ b/docs/pflash-compress-cfg.md @@ -0,0 +1,46 @@ +# pflash compression knobs + +The PFLASH_COMPRESS_* / DFLASH_COMPRESS_* knobs documented below are read once per +request in `compress_cfg_from_env(n_chunks, n_keep)` in qwen3_drafter.cpp. + +## anchor_radius adaptive ladder + +Prevents the 64K NIAH cliff: at long context the needle text is more likely +to straddle multiple chunks, and a fixed radius=2 window (5 chunks / ~160 +tokens) loses the back half of the needle. + +Default ladder (override via PFLASH_COMPRESS_ANCHOR_RADIUS): + +| n_chunks | anchor_radius | +|------------|---------------| +| < 1024 | 2 | +| 1024-2047 | 4 | +| >= 2048 | 8 | + +## max_anchor_hits adaptive ladder + +Same breakpoints as anchor_radius. At long context anchors are sparser, so +more hits per query token are affordable. + +| n_chunks | max_anchor_hits | +|------------|-----------------| +| < 1024 | 8 | +| 1024-2047 | 16 | +| >= 2048 | 32 | + +## anchor_transitive + +On by default. 
Gated rare-token bridge expands the query pool with tokens +from newly-forced chunks and re-runs anchor scan to fixed point. +Improves multi-hop F1 on LongBench HotpotQA (empirically; F1=0.628 ceiling +at ee7+anchor-transitive on RTX 3090 — see bench/2026-05-25_longbench_hotpotqa/). +Control via PFLASH_COMPRESS_ANCHOR_TRANSITIVE=0 to disable. + +## head/tail chunk forcing + +Head and tail chunks are force-included before top-K scoring fills the +remainder. The counts scale with n_keep so top-K always gets at least one +slot even when head_raw + tail_raw >= n_keep. + +Defaults: head=8, tail=24 (override via DFLASH_COMPRESS_HEAD_CHUNKS / +DFLASH_COMPRESS_TAIL_CHUNKS). diff --git a/server/src/qwen3/anchor_scan.cpp b/server/src/qwen3/anchor_scan.cpp new file mode 100644 index 00000000..e0088167 --- /dev/null +++ b/server/src/qwen3/anchor_scan.cpp @@ -0,0 +1,169 @@ +#include "anchor_scan.h" + +#include +#include +#include +#include + +namespace dflash::qwen3 { + +// Force chunk and its radius-neighborhood into `forced`. 
+static void force_neighborhood(std::vector<uint8_t>& forced, int n_chunks,
+                               int chunk, int radius) {
+    const int lo = std::max(0, chunk - radius);
+    const int hi = std::min(n_chunks - 1, chunk + radius);
+    for (int c = lo; c <= hi; ++c) forced[(size_t)c] = 1;
+}
+
+// Forces each chunk (and its anchor_radius neighborhood) holding an n-gram of
+// `query_pool` that occurs 1..max_anchor_hits times in ids[0, body_end);
+// n-grams with more hits are treated as filler and ignored.
+void scan_and_force(
+    const std::vector<int32_t>& ids,
+    int body_end,
+    const std::vector<int32_t>& query_pool,
+    const AnchorScanCfg& cfg,
+    std::vector<uint8_t>& forced)
+{
+    const int n_chunks = (int)forced.size();
+    const int ngram = cfg.ngram;
+    const int max_hits = cfg.max_anchor_hits;
+    body_end = std::min(body_end, (int)ids.size());
+    // Degenerate config, or a body shorter than one n-gram: nothing can match.
+    if (ngram <= 0 || cfg.chunk_size <= 0 || max_hits <= 0 || body_end < ngram) return;
+    const int last_start = body_end - ngram;
+
+    // Sized from cfg: a fixed 8-slot buffer lost hits 9..N when max_anchor_hits > 8.
+    std::vector<int> hit_pos;
+    hit_pos.reserve((size_t)max_hits);
+    for (int qi = 0; qi + ngram <= (int)query_pool.size(); ++qi) {
+        hit_pos.clear();
+        int hits = 0;
+        for (int p = 0; p <= last_start && hits <= max_hits; ++p) {
+            bool same = true;
+            for (int k = 0; k < ngram; ++k) {
+                if (ids[(size_t)p + k] != query_pool[(size_t)qi + k]) {
+                    same = false;
+                    break;
+                }
+            }
+            if (same) {
+                if (hits < max_hits) hit_pos.push_back(p);
+                ++hits;
+            }
+        }
+        if (hits > max_hits) continue;  // too common to be an anchor
+        for (int p : hit_pos)
+            force_neighborhood(forced, n_chunks, p / cfg.chunk_size, cfg.anchor_radius);
+    }
+}
+
+// Helper: count set entries in forced.
+static int count_set(const std::vector<uint8_t>& forced) {
+    int n = 0;
+    for (uint8_t v : forced) n += (v != 0);
+    return n;
+}
+
+// Direct scan, then up to max_iters cascade rounds. A round (1) bridges from
+// newly forced chunks to chunks sharing a rare token with them, (2) appends
+// the tokens of chunks forced since the last rescan to the pool, (3) rescans.
+// A stage that exceeds cfg.max_forced_count is rolled back and ends the cascade.
+void scan_and_force_transitive(
+    const std::vector<int32_t>& ids,
+    int body_end,
+    const std::vector<int32_t>& initial_query_pool,
+    const AnchorScanCfg& cfg,
+    int max_iters,
+    std::vector<uint8_t>& forced)
+{
+    if (cfg.chunk_size <= 0) return;  // chunk indexing would divide by zero
+    std::vector<int32_t> pool = initial_query_pool;
+    const int n_chunks = (int)forced.size();
+    body_end = std::min(body_end, (int)ids.size());
+    const bool use_rare = cfg.rare_token_max_freq > 0;
+
+    // Inverted index: token -> body positions, for tokens occurring at most
+    // rare_token_max_freq times in ids[0, body_end).
+    std::unordered_map<int32_t, std::vector<int>> rare_positions;
+    if (use_rare) {
+        std::unordered_map<int32_t, int> body_freq;
+        body_freq.reserve((size_t)std::max(0, body_end));
+        for (int j = 0; j < body_end; ++j) ++body_freq[ids[(size_t)j]];
+        for (int p = 0; p < body_end; ++p) {
+            const int32_t tok = ids[(size_t)p];
+            if (body_freq[tok] <= cfg.rare_token_max_freq) rare_positions[tok].push_back(p);
+        }
+    }
+
+    // Pass-1: run the initial scan.
+    const int count_before_pass1 = count_set(forced);
+    scan_and_force(ids, body_end, pool, cfg, forced);
+    const int gained_pass1 = count_set(forced) - count_before_pass1;
+
+    // Gating: if pass-1 already found many anchors, skip the cascade entirely.
+    if (cfg.cascade_min_anchor_count > 0 && gained_pass1 >= cfg.cascade_min_anchor_count) return;
+
+    // `pooled` marks chunks the pool is in sync with. Pass-1 chunks start marked, so
+    // (as before) their tokens are never appended. NOTE(review): confirm intended.
+    std::vector<uint8_t> pooled = forced;
+    std::vector<uint8_t> before_rescan;  // state just before the previous rescan
+    for (int it = 0; it < max_iters; ++it) {
+        const std::vector<uint8_t> round_start = forced;
+
+        // Rare-token bridge: worklist-driven so chains within one round are
+        // followed (e.g. hop3 forces hop2 which forces hop1).
+        if (use_rare) {
+            // Round 0 seeds from everything forced so far, later rounds from
+            // what the previous rescan added. (The old seed diff used a snapshot
+            // from the top of the same round, so rounds >= 1 never ran.)
+            std::vector<int> worklist;
+            for (int c = 0; c < n_chunks; ++c) {
+                if (forced[(size_t)c] && (it == 0 || !before_rescan[(size_t)c])) worklist.push_back(c);
+            }
+            for (size_t wi = 0; wi < worklist.size(); ++wi) {
+                const int c = worklist[wi];
+                const int s = c * cfg.chunk_size;
+                const int e = std::min(body_end, (c + 1) * cfg.chunk_size);
+                for (int j = s; j < e; ++j) {
+                    const auto found = rare_positions.find(ids[(size_t)j]);
+                    if (found == rare_positions.end()) continue;
+                    for (int p : found->second) {
+                        const int target_c = p / cfg.chunk_size;
+                        if (target_c >= n_chunks || forced[(size_t)target_c]) continue;
+                        force_neighborhood(forced, n_chunks, target_c, cfg.anchor_radius);
+                        worklist.push_back(target_c);
+                    }
+                }
+            }
+        }
+
+        // Hard cap: if we exceeded max_forced_count, revert this round and stop.
+        if (count_set(forced) > cfg.max_forced_count) {
+            forced = round_start;
+            break;
+        }
+
+        // Expand pool with the tokens of every forced chunk not pooled yet.
+        bool grew = false;
+        for (int c = 0; c < n_chunks; ++c) {
+            if (!forced[(size_t)c] || pooled[(size_t)c]) continue;
+            pooled[(size_t)c] = 1;
+            grew = true;
+            const int s = c * cfg.chunk_size;
+            const int e = std::min((int)ids.size(), (c + 1) * cfg.chunk_size);
+            for (int j = s; j < e; ++j) pool.push_back(ids[(size_t)j]);
+        }
+        if (!grew) break;  // fixed point: nothing new to scan with
+
+        // N-gram rescan with the expanded pool; the hard cap applies here too.
+        before_rescan = forced;
+        scan_and_force(ids, body_end, pool, cfg, forced);
+        if (count_set(forced) > cfg.max_forced_count) {
+            forced = before_rescan;
+            break;
+        }
+    }
+}
+
+} // namespace dflash::qwen3
diff --git a/server/src/qwen3/anchor_scan.h b/server/src/qwen3/anchor_scan.h
new file mode 100644
index 00000000..8f75a085
--- /dev/null
+++ b/server/src/qwen3/anchor_scan.h
@@ -0,0 +1,42 @@
+// N-gram anchor scan: mark chunks forced by token-match between a query pool
+// and the body of an ids sequence. Pure CPU, no GPU, no model required.
+#pragma once + +#include +#include +#include + +namespace dflash::qwen3 { + +struct AnchorScanCfg { + int chunk_size; + int anchor_radius; + int max_anchor_hits; + int ngram = 4; + int rare_token_max_freq = 8; // tokens appearing <= this many times in body count as rare + int cascade_min_anchor_count = 0; // skip cascade if pass-1 forced >= this many chunks (0 = always cascade) + int max_forced_count = INT_MAX; // hard cap on total forced chunks +}; + +// Marks chunks forced by ngram-matches between query_pool and ids[0..body_end). +// `forced` is in-out; new hits are OR-merged. Idempotent. +void scan_and_force( + const std::vector& ids, + int body_end, + const std::vector& query_pool, + const AnchorScanCfg& cfg, + std::vector& forced +); + +// Transitive variant: expands the query pool with tokens from newly-forced +// chunks and re-runs scan_and_force until a fixed point or max_iters reached. +void scan_and_force_transitive( + const std::vector& ids, + int body_end, + const std::vector& initial_query_pool, + const AnchorScanCfg& cfg, + int max_iters, + std::vector& forced +); + +} // namespace dflash::qwen3 diff --git a/server/src/qwen3/qwen3_drafter.cpp b/server/src/qwen3/qwen3_drafter.cpp index f65cb079..852fc96e 100644 --- a/server/src/qwen3/qwen3_drafter.cpp +++ b/server/src/qwen3/qwen3_drafter.cpp @@ -17,6 +17,7 @@ #include "qwen3_drafter_model.h" #include "common/backend_precision.h" #include "internal.h" +#include "anchor_scan.h" #include "ggml.h" #include "ggml-alloc.h" @@ -64,11 +65,122 @@ static int env_int(const char * name, int fallback) { return fallback; } -static void force_chunk_neighborhood(std::vector & forced, int n_chunks, - int chunk, int radius) { - int lo = std::max(0, chunk - radius); - int hi = std::min(n_chunks - 1, chunk + radius); - for (int c = lo; c <= hi; ++c) forced[(size_t)c] = 1; +static float env_float(const char * name, float def) { + if (const char * v = std::getenv(name)) { + try { return std::stof(v); } catch (...) 
{} + } + return def; +} + +// All pflash/dflash compression knobs read from env, derived per-request. +// anchor_radius and max_anchor_hits use an adaptive ladder keyed on n_chunks +// to prevent the 64K NIAH cliff; see docs/pflash-compress-cfg.md. +// Override any ladder value via PFLASH_COMPRESS_* env vars. +struct CompressCfg { + int query_tokens; + int head_chunks; + int tail_chunks; + dflash::qwen3::AnchorScanCfg anchor; + bool use_transitive; + int max_iters; +}; + +static CompressCfg compress_cfg_from_env(int n_chunks, int n_keep, + int use_transitive_override = -1) { + CompressCfg c{}; + + c.query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96); + + // head/tail forced chunks scale so top-K scoring always gets slots + const int h_raw = env_int("DFLASH_COMPRESS_HEAD_CHUNKS", 8); + const int t_raw = env_int("DFLASH_COMPRESS_TAIL_CHUNKS", 24); + c.head_chunks = h_raw; + c.tail_chunks = t_raw; + if (c.head_chunks + c.tail_chunks >= n_keep) { + const int budget = std::max(1, n_keep - 1); + c.head_chunks = std::max(0, h_raw * budget / (h_raw + t_raw)); + c.tail_chunks = std::max(0, budget - c.head_chunks); + } + + // anchor_radius: adaptive ladder prevents 64K NIAH cliff + // (<32K=2, 32-64K=4, >=64K=8); override via PFLASH_COMPRESS_ANCHOR_RADIUS + { + const int env_r = env_int("PFLASH_COMPRESS_ANCHOR_RADIUS", -1); + const int legacy_r = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", -1); + if (env_r >= 0) c.anchor.anchor_radius = env_r; + else if (legacy_r >= 0) c.anchor.anchor_radius = legacy_r; + else if (n_chunks < 1024) c.anchor.anchor_radius = 2; + else if (n_chunks < 2048) c.anchor.anchor_radius = 4; + else c.anchor.anchor_radius = 8; + } + + // max_anchor_hits: same ladder — sparser anchors at long context + { + const int env_h = env_int("PFLASH_COMPRESS_MAX_ANCHOR_HITS", -1); + const int legacy_h = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", -1); + if (env_h >= 0) c.anchor.max_anchor_hits = env_h; + else if (legacy_h >= 0) c.anchor.max_anchor_hits = 
legacy_h; + else if (n_chunks < 1024) c.anchor.max_anchor_hits = 8; + else if (n_chunks < 2048) c.anchor.max_anchor_hits = 16; + else c.anchor.max_anchor_hits = 32; + } + + c.anchor.ngram = [&]{ + const int nv = env_int("PFLASH_COMPRESS_ANCHOR_NGRAM", -1); + const int lv = env_int("DFLASH_COMPRESS_ANCHOR_NGRAM", -1); + if (nv >= 0) return nv; + if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_NGRAM deprecated, use PFLASH_COMPRESS_ANCHOR_NGRAM\n"); return lv; } + return 4; + }(); + + c.anchor.rare_token_max_freq = [&]{ + const int nv = env_int("PFLASH_COMPRESS_RARE_MAX_FREQ", -1); + const int lv = env_int("DFLASH_COMPRESS_RARE_MAX_FREQ", -1); + if (nv >= 0) return nv; + if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_RARE_MAX_FREQ deprecated, use PFLASH_COMPRESS_RARE_MAX_FREQ\n"); return lv; } + return 2; + }(); + + const float cascade_min_anchor_frac = [&]{ + const float nv = env_float("PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f); + const float lv = env_float("DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC", -1.0f); + if (nv >= 0.0f) return nv; + if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC deprecated, use PFLASH_COMPRESS_CASCADE_MIN_ANCHOR_FRAC\n"); return lv; } + return 0.0f; + }(); + + const float max_forced_ratio = [&]{ + const float nv = env_float("PFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f); + const float lv = env_float("DFLASH_COMPRESS_MAX_FORCED_RATIO", -1.0f); + if (nv >= 0.0f) return nv; + if (lv >= 0.0f) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_MAX_FORCED_RATIO deprecated, use PFLASH_COMPRESS_MAX_FORCED_RATIO\n"); return lv; } + return 10.0f; + }(); + + c.anchor.cascade_min_anchor_count = (int)(cascade_min_anchor_frac * n_keep); + c.anchor.max_forced_count = (int)(max_forced_ratio * n_keep); + + c.use_transitive = [&]{ + // Per-request override (0=off, 1=on) from router decision takes precedence. 
+ if (use_transitive_override == 0) return false; + if (use_transitive_override == 1) return true; + // Fallback: read from env (same as before, no behaviour change when -1). + const int nv = env_int("PFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1); + const int lv = env_int("DFLASH_COMPRESS_ANCHOR_TRANSITIVE", -1); + if (nv >= 0) return nv != 0; + if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_TRANSITIVE deprecated, use PFLASH_COMPRESS_ANCHOR_TRANSITIVE\n"); return lv != 0; } + return true; // on by default; see docs/anchor-transitive.md + }(); + + c.max_iters = [&]{ + const int nv = env_int("PFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1); + const int lv = env_int("DFLASH_COMPRESS_ANCHOR_MAX_ITERS", -1); + if (nv >= 0) return nv; + if (lv >= 0) { fprintf(stderr, "[WARN] DFLASH_COMPRESS_ANCHOR_MAX_ITERS deprecated, use PFLASH_COMPRESS_ANCHOR_MAX_ITERS\n"); return lv; } + return 3; + }(); + + return c; } #if defined(DFLASH27B_BACKEND_HIP) @@ -120,21 +232,6 @@ const char * drafter_arch_name(DrafterArch arch) { return "unknown"; } -bool load_drafter(const std::string & gguf_path, int /*gpu_layers*/, - DrafterContext & out) { - return load_drafter(gguf_path, /*gpu_layers=*/999, /*gpu=*/0, out); -} - -bool load_drafter(const std::string & gguf_path, int /*gpu_layers*/, - int gpu, DrafterContext & out) { - return load_drafter(gguf_path, /*gpu_layers=*/999, DrafterArch::Qwen3_0p6b, gpu, out); -} - -bool load_drafter(const std::string & gguf_path, int /*gpu_layers*/, - DrafterArch arch, DrafterContext & out) { - return load_drafter(gguf_path, /*gpu_layers=*/999, arch, /*gpu=*/0, out); -} - bool load_drafter(const std::string & gguf_path, int /*gpu_layers*/, DrafterArch arch, int gpu, DrafterContext & out) { if (gpu < 0) { @@ -224,6 +321,22 @@ bool load_drafter(const std::string & gguf_path, int /*gpu_layers*/, return true; } +// Thin overloads for API compat; all forward to the canonical 4-arg form. 
+bool load_drafter(const std::string & gguf_path, int gpu_layers, + DrafterContext & out) { + return load_drafter(gguf_path, gpu_layers, DrafterArch::Qwen3_0p6b, /*gpu=*/0, out); +} + +bool load_drafter(const std::string & gguf_path, int gpu_layers, + int gpu, DrafterContext & out) { + return load_drafter(gguf_path, gpu_layers, DrafterArch::Qwen3_0p6b, gpu, out); +} + +bool load_drafter(const std::string & gguf_path, int gpu_layers, + DrafterArch arch, DrafterContext & out) { + return load_drafter(gguf_path, gpu_layers, arch, /*gpu=*/0, out); +} + void free_drafter(DrafterContext & ctx) { free_drafter_weights(ctx); if (ctx.backend) { @@ -254,7 +367,8 @@ static std::vector qwen35_score_and_compress( float keep_ratio, int chunk_size, int n_lookahead, - int pool_kernel) { + int pool_kernel, + int use_transitive_override = -1) { const int S = (int)ids.size(); const int hidden = w.n_embd; @@ -505,24 +619,23 @@ static std::vector qwen35_score_and_compress( const int n_chunks = (S + chunk_size - 1) / chunk_size; const int n_keep = std::max(1, (int)((float)n_chunks * keep_ratio)); - - std::vector smooth_score = score; - // Caller pool_kernel takes precedence; if zero/negative, fall back to env or 5. + const int pk = (pool_kernel > 0) ? pool_kernel : std::max(3, env_int("DFLASH_COMPRESS_POOL_KERNEL", 5)); - std::vector smoothed((size_t)S, 0.0f); - int half = pk / 2; - for (int j = 0; j < S; ++j) { - int lo = std::max(0, j - half); - int hi = std::min(S - 1, j + half); - float s = 0.0f; - int n = 0; - for (int k = lo; k <= hi; ++k) { s += score[(size_t)k]; ++n; } - smoothed[(size_t)j] = (n > 0) ? (s / (float)n) : 0.0f; + std::vector smooth_score((size_t)S, 0.0f); + { + int half = pk / 2; + for (int j = 0; j < S; ++j) { + int lo = std::max(0, j - half); + int hi = std::min(S - 1, j + half); + float s = 0.0f; + int n = 0; + for (int k = lo; k <= hi; ++k) { s += score[(size_t)k]; ++n; } + smooth_score[(size_t)j] = (n > 0) ? 
(s / (float)n) : 0.0f; + } } - smooth_score.swap(smoothed); - + std::vector> chunk_means; for (int c = 0; c < n_chunks; ++c) { int lo = c * chunk_size, hi = std::min(S, lo + chunk_size); @@ -531,50 +644,28 @@ static std::vector qwen35_score_and_compress( chunk_means.push_back({s / std::max(1, hi - lo), c}); } std::sort(chunk_means.begin(), chunk_means.end(), [](auto a, auto b) { return a.first > b.first; }); - + + const CompressCfg cfg = compress_cfg_from_env(n_chunks, n_keep, use_transitive_override); + std::vector selected((size_t)n_chunks, 0); int count = 0; - // Scale head/tail forced chunks so they don't crowd out top-K scoring. - { - const int h_raw = env_int("DFLASH_COMPRESS_HEAD_CHUNKS", 8); - const int t_raw = env_int("DFLASH_COMPRESS_TAIL_CHUNKS", 24); - int h_n = h_raw, t_n = t_raw; - if (h_n + t_n >= n_keep) { - const int budget = std::max(1, n_keep - 1); - h_n = std::max(0, h_raw * budget / (h_raw + t_raw)); - t_n = std::max(0, budget - h_n); - } - for (int c = 0; c < std::min(n_chunks, h_n); ++c) { selected[(size_t)c] = 1; ++count; } - for (int c = std::max(0, n_chunks - t_n); c < n_chunks; ++c) if (!selected[(size_t)c]) { selected[(size_t)c] = 1; ++count; } - } + for (int c = 0; c < std::min(n_chunks, cfg.head_chunks); ++c) { selected[(size_t)c] = 1; ++count; } + for (int c = std::max(0, n_chunks - cfg.tail_chunks); c < n_chunks; ++c) if (!selected[(size_t)c]) { selected[(size_t)c] = 1; ++count; } - const int query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96); - const int anchor_radius = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", 2); - const int max_anchor_hits = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", 8); + const int q0 = std::max(0, S - cfg.query_tokens); + std::vector query_pool(ids.begin() + q0, ids.end()); std::vector forced((size_t)n_chunks, 0); - const int q0 = std::max(0, S - query_tokens); - constexpr int NGRAM = 4; - for (int q = q0; q + NGRAM <= S; ++q) { - int hits = 0; - int hit_pos[8]; - const int search_end = std::max(0, q0 - 
NGRAM); - for (int p = 0; p <= search_end && hits <= max_anchor_hits; ++p) { - bool same = true; - for (int k = 0; k < NGRAM; ++k) { - if (ids[(size_t)p + k] != ids[(size_t)q + k]) { same = false; break; } - } - if (same) { - if (hits < 8) hit_pos[hits] = p; - ++hits; - } - } - if (hits > 0 && hits <= max_anchor_hits) { - for (int i = 0; i < hits && i < 8; ++i) { - force_chunk_neighborhood(forced, n_chunks, hit_pos[i] / chunk_size, anchor_radius); - } - } + dflash::qwen3::AnchorScanCfg anchor_cfg = cfg.anchor; + anchor_cfg.chunk_size = chunk_size; + + if (cfg.use_transitive) { + dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool, + anchor_cfg, cfg.max_iters, forced); + } else { + dflash::qwen3::scan_and_force(ids, q0, query_pool, anchor_cfg, forced); } + for (int c = 0; c < n_chunks; ++c) { if (forced[(size_t)c] && !selected[(size_t)c]) { selected[(size_t)c] = 1; @@ -582,16 +673,14 @@ static std::vector qwen35_score_and_compress( } } - // Global aggregation tasks often depend on repeated rare tokens that do - // not appear in the final query. Preserve high-frequency-but-not-filler - // token chunks before filling with model-score top-K. + // Global aggregation tasks: preserve high-frequency-but-not-filler token chunks. 
const int repeat_min = env_int("DFLASH_COMPRESS_REPEAT_MIN", 4); const int repeat_max = env_int("DFLASH_COMPRESS_REPEAT_MAX", 32); const int repeat_limit = env_int("DFLASH_COMPRESS_REPEAT_CHUNKS", n_keep); if (repeat_min > 1 && count < repeat_limit) { std::unordered_map freq; freq.reserve((size_t)S); - const int repeat_scan_end = std::max(0, S - query_tokens); + const int repeat_scan_end = std::max(0, S - cfg.query_tokens); for (int j = 0; j < repeat_scan_end; ++j) { ++freq[ids[(size_t)j]]; } @@ -619,12 +708,12 @@ static std::vector qwen35_score_and_compress( } } } - + for (auto [_, c] : chunk_means) { if (count >= n_keep) break; if (!selected[(size_t)c]) { selected[(size_t)c] = 1; ++count; } } - + std::vector out_ids; std::vector selected_chunks; for (int c = 0; c < n_chunks; ++c) { @@ -660,7 +749,8 @@ std::vector drafter_score_and_compress( float keep_ratio, int chunk_size, int n_lookahead, - int pool_kernel) { + int pool_kernel, + int use_transitive_override) { if (!ctx.loaded) { set_last_error("drafter not loaded"); return {}; @@ -671,7 +761,7 @@ std::vector drafter_score_and_compress( return {}; } auto * st = static_cast(ctx.arch_state); - return qwen35_score_and_compress(st->weights, ids, keep_ratio, chunk_size, n_lookahead, pool_kernel); + return qwen35_score_and_compress(st->weights, ids, keep_ratio, chunk_size, n_lookahead, pool_kernel, use_transitive_override); } const int S = (int)ids.size(); if (S < n_lookahead + 1) { @@ -728,46 +818,27 @@ std::vector drafter_score_and_compress( std::sort(chunk_means.begin(), chunk_means.end(), [](auto a, auto b) { return a.first > b.first; }); - // Retrieval tasks often repeat a rare key in the final query and in the - // needle span. Exact scores alone can keep the query while dropping the - // neighboring answer chunk, so force a small token-only anchor neighborhood. - // Head/tail forced chunks scale with n_keep so top-K scoring always gets slots. 
- const int h_raw = env_int("DFLASH_COMPRESS_HEAD_CHUNKS", 8); - const int t_raw = env_int("DFLASH_COMPRESS_TAIL_CHUNKS", 24); - int head_chunks = h_raw, tail_chunks = t_raw; - if (head_chunks + tail_chunks >= n_keep) { - const int budget = std::max(1, n_keep - 1); - head_chunks = std::max(0, h_raw * budget / (h_raw + t_raw)); - tail_chunks = std::max(0, budget - head_chunks); - } - const int query_tokens = env_int("DFLASH_COMPRESS_QUERY_TOKENS", 96); - const int anchor_radius = env_int("DFLASH_COMPRESS_ANCHOR_RADIUS", 2); - const int max_anchor_hits = env_int("DFLASH_COMPRESS_MAX_ANCHOR_HITS", 8); + const CompressCfg cfg = compress_cfg_from_env(n_chunks, n_keep, use_transitive_override); + std::vector selected_mask((size_t)n_chunks, 0); std::vector forced((size_t)n_chunks, 0); - for (int c = 0; c < std::min(n_chunks, head_chunks); ++c) forced[(size_t)c] = 1; - for (int c = std::max(0, n_chunks - tail_chunks); c < n_chunks; ++c) forced[(size_t)c] = 1; - - const int q0 = std::max(0, S - query_tokens); - constexpr int NGRAM = 4; - for (int q = q0; q + NGRAM <= S; ++q) { - int hits = 0; - int hit_pos[8]; - const int search_end = std::max(0, q0 - NGRAM); - for (int p = 0; p <= search_end && hits <= max_anchor_hits; ++p) { - bool same = true; - for (int k = 0; k < NGRAM; ++k) { - if (ids[(size_t)p + k] != ids[(size_t)q + k]) { same = false; break; } - } - if (same) { - if (hits < 8) hit_pos[hits] = p; - ++hits; - } - } - if (hits > 0 && hits <= max_anchor_hits) { - for (int i = 0; i < hits && i < 8; ++i) { - force_chunk_neighborhood(forced, n_chunks, hit_pos[i] / chunk_size, anchor_radius); - } + for (int c = 0; c < std::min(n_chunks, cfg.head_chunks); ++c) forced[(size_t)c] = 1; + for (int c = std::max(0, n_chunks - cfg.tail_chunks); c < n_chunks; ++c) forced[(size_t)c] = 1; + + const int q0 = std::max(0, S - cfg.query_tokens); + { + std::vector query_pool(ids.begin() + q0, ids.end()); + dflash::qwen3::AnchorScanCfg anchor_cfg = cfg.anchor; + anchor_cfg.chunk_size = 
chunk_size; + std::fprintf(stderr, "[drafter_cascade] n_keep=%d max_forced=%d min_anchor=%d\n", + n_keep, anchor_cfg.max_forced_count, anchor_cfg.cascade_min_anchor_count); + std::fflush(stderr); + + if (cfg.use_transitive) { + dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool, + anchor_cfg, cfg.max_iters, forced); + } else { + dflash::qwen3::scan_and_force(ids, q0, query_pool, anchor_cfg, forced); } } @@ -824,4 +895,18 @@ std::vector drafter_score_and_compress( return out; } +// ABI-stable 6-arg overload — old callers compiled before the use_transitive_override +// parameter was added link here without requiring recompilation. +std::vector drafter_score_and_compress( + DrafterContext & ctx, + const std::vector & ids, + float keep_ratio, + int chunk_size, + int n_lookahead, + int pool_kernel) { + return drafter_score_and_compress(ctx, ids, keep_ratio, + chunk_size, n_lookahead, pool_kernel, + /*use_transitive_override=*/-1); +} + } // namespace dflash::common diff --git a/server/src/qwen3/qwen3_drafter.h b/server/src/qwen3/qwen3_drafter.h index e5424f9d..08aed3e9 100644 --- a/server/src/qwen3/qwen3_drafter.h +++ b/server/src/qwen3/qwen3_drafter.h @@ -66,13 +66,27 @@ void free_drafter_weights(DrafterContext & ctx); // Score importance per token via Liu Q-hook tail attention, then chunk-top-K // span merge. Returns surviving token IDs (drafter vocab). 
// -// ids input token IDs of length S -// keep_ratio fraction of `chunk_size`-token chunks to keep -// chunk_size span granularity (default 32) -// n_lookahead trailing Q tokens used for tail attention (default 8) -// pool_kernel AvgPool kernel for score smoothing (default 13) +// ids input token IDs of length S +// keep_ratio fraction of `chunk_size`-token chunks to keep +// chunk_size span granularity (default 32) +// n_lookahead trailing Q tokens used for tail attention (default 8) +// pool_kernel AvgPool kernel for score smoothing (default 13) +// use_transitive_override -1 = read from env (default, no behaviour change) +// 0 = cascade off (agentic path) +// 1 = cascade on (retrieval path) // // On failure returns empty vector + sets last_error. +std::vector drafter_score_and_compress( + DrafterContext & ctx, + const std::vector & ids, + float keep_ratio, + int chunk_size, + int n_lookahead, + int pool_kernel, + int use_transitive_override); + +// Backward-compatible 6-arg overload — ABI-stable wrapper, defined in qwen3_drafter.cpp. +// Old callers compiled against the 6-arg signature continue to link without recompile. 
std::vector drafter_score_and_compress( DrafterContext & ctx, const std::vector & ids, diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h index 71c544ac..117514f7 100644 --- a/server/src/server/http_server.h +++ b/server/src/server/http_server.h @@ -144,7 +144,7 @@ struct ServerConfig { enum class PflashMode { OFF, AUTO, ALWAYS }; PflashMode pflash_mode = PflashMode::OFF; int pflash_threshold = 32000; // token count threshold for AUTO mode - float pflash_keep_ratio = 0.05f; // fraction of tokens to keep + float pflash_keep_ratio = 0.10f; // fraction of tokens to keep std::string pflash_drafter_path; // path to drafter GGUF (Qwen3-0.6B) int pflash_drafter_gpu = 0; // backend-local GPU for PFlash drafter bool pflash_remote_drafter = false; // use IPC drafter for mixed backends diff --git a/server/src/server/server_main.cpp b/server/src/server/server_main.cpp index 5f00d4df..9438bc38 100644 --- a/server/src/server/server_main.cpp +++ b/server/src/server/server_main.cpp @@ -209,7 +209,7 @@ static void print_usage(const char * prog) { "PFlash (speculative prefill compression):\n" " --prefill-compression off|auto|always (default: off)\n" " --prefill-threshold Token threshold for auto mode (default: 32000)\n" - " --prefill-keep-ratio Fraction of tokens to keep (default: 0.05)\n" + " --prefill-keep-ratio Fraction of tokens to keep (default: 0.10)\n" " --prefill-drafter Drafter GGUF for compression (Qwen3-0.6B)\n" " --prefill-skip-park Skip park/unpark (for >=32GB GPUs)\n" " --draft-residency auto|persistent|request-scoped\n" diff --git a/server/test/test_anchor_transitive.cpp b/server/test/test_anchor_transitive.cpp new file mode 100644 index 00000000..ae8a0bbc --- /dev/null +++ b/server/test/test_anchor_transitive.cpp @@ -0,0 +1,355 @@ +// TDD: anchor transitive multi-pass. 
+// +// T1 — single-pass query-match preserved (regression pin, PASS today) +// T2 — single-pass misses chain hops (characterises limitation, PASS today) +// T3 — transitive scan rescues all hops (implemented by scan_and_force_transitive) +// +// Pure CPU — no GPU, no model load. + +#include "../src/qwen3/anchor_scan.h" + +#include +#include +#include +#include + +#define REQUIRE(cond) \ + do { if (!(cond)) { \ + std::fprintf(stderr, "FAIL: %s line %d: %s\n", __FILE__, __LINE__, #cond); \ + std::exit(1); \ + } } while (0) + +static constexpr int32_t FILLER = 1; +static constexpr int32_t M1 = 1001, M2 = 1002, M3 = 1003; +static constexpr int CHUNK = 64; + +// Place a marker 4-gram [FILLER, FILLER, MARKER, FILLER] at position pos (the marker token lands at pos + 2). +static void place_marker_4gram(std::vector& ids, int pos, int32_t marker) { + ids[(size_t)pos] = FILLER; + ids[(size_t)pos + 1] = FILLER; + ids[(size_t)pos + 2] = marker; + ids[(size_t)pos + 3] = FILLER; +} + +// T1 — single-pass finds a query-matching marker in the body. +static void t1_single_pass_match() { + const int N = 2048; + std::vector ids((size_t)N, FILLER); + + // Body marker 4-gram at pos 100 (chunk 1). + place_marker_4gram(ids, 100, M3); + // Same 4-gram in the query suffix at pos 2044 (inside query window). + place_marker_4gram(ids, 2044, M3); + + const int q0 = 1948; // N - 100 + std::vector query_pool(ids.begin() + q0, ids.end()); + + const int n_chunks = (N + CHUNK - 1) / CHUNK; + std::vector forced((size_t)n_chunks, 0); + + dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0, + /*max_anchor_hits=*/8, /*ngram=*/4}; + dflash::qwen3::scan_and_force(ids, q0, query_pool, cfg, forced); + + // Chunk containing pos 100 must be forced. + const int target_chunk = 100 / CHUNK; // chunk 1 + REQUIRE(forced[(size_t)target_chunk] == 1); + + std::printf("T1 PASS: chunk %d forced by single-pass M3 match\n", target_chunk); +} + +// T2 — single-pass only forces the direct match; chain hops stay unforced. 
+static void t2_single_pass_misses_hops() { + const int N = 2048; + std::vector ids((size_t)N, FILLER); + + // hop1 at pos 200 (chunk 3): contains M1. + place_marker_4gram(ids, 200, M1); + + // hop2 at pos 600 (chunk 9): contains M2 + M1 (bridge to hop1). + place_marker_4gram(ids, 600, M2); + place_marker_4gram(ids, 604, M1); + + // hop3 at pos 1200 (chunk 18): contains M3 + M2 (bridge to hop2). + place_marker_4gram(ids, 1200, M3); + place_marker_4gram(ids, 1204, M2); + + // Query suffix at pos 2044: contains M3. + place_marker_4gram(ids, 2044, M3); + + const int q0 = 1948; // N - 100: body is ids[0, q0), the rest is the query pool + std::vector query_pool(ids.begin() + q0, ids.end()); + + const int n_chunks = (N + CHUNK - 1) / CHUNK; + std::vector forced((size_t)n_chunks, 0); + + dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0, + /*max_anchor_hits=*/8, /*ngram=*/4}; + dflash::qwen3::scan_and_force(ids, q0, query_pool, cfg, forced); + + const int chunk_hop3 = 1200 / CHUNK; // 18 + const int chunk_hop2 = 600 / CHUNK; // 9 + const int chunk_hop1 = 200 / CHUNK; // 3 + + // Single-pass: only the direct M3 match at pos 1200 is forced. + REQUIRE(forced[(size_t)chunk_hop3] == 1); + REQUIRE(forced[(size_t)chunk_hop2] == 0); + REQUIRE(forced[(size_t)chunk_hop1] == 0); + + std::printf("T2 PASS: chunk(%d) forced, chunk(%d) and chunk(%d) NOT forced (single-pass)\n", + chunk_hop3, chunk_hop2, chunk_hop1); +} + +// T3 — transitive scan rescues every hop of the same chain that T2 leaves unforced. 
+static void t3_transitive_rescues_all() { + const int N = 2048; + std::vector ids((size_t)N, FILLER); + + place_marker_4gram(ids, 200, M1); + + place_marker_4gram(ids, 600, M2); + place_marker_4gram(ids, 604, M1); + + place_marker_4gram(ids, 1200, M3); + place_marker_4gram(ids, 1204, M2); + + place_marker_4gram(ids, 2044, M3); + + const int q0 = 1948; + std::vector initial_query_pool(ids.begin() + q0, ids.end()); + + const int n_chunks = (N + CHUNK - 1) / CHUNK; + std::vector forced((size_t)n_chunks, 0); + + dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0, + /*max_anchor_hits=*/8, /*ngram=*/4}; + dflash::qwen3::scan_and_force_transitive(ids, q0, initial_query_pool, + cfg, /*max_iters=*/3, forced); + + const int chunk_hop3 = 1200 / CHUNK; + const int chunk_hop2 = 600 / CHUNK; + const int chunk_hop1 = 200 / CHUNK; + + REQUIRE(forced[(size_t)chunk_hop3] == 1); + REQUIRE(forced[(size_t)chunk_hop2] == 1); + REQUIRE(forced[(size_t)chunk_hop1] == 1); + + std::printf("T3 PASS: all hops forced transitively\n"); +} + +// T4 — variable-name reuse across templates (FAILS until v2 adds rare-token match). +// +// Token layout: +// FILLER=1, V1=2001(X42), V2=2002(Y42), V3=2003(Z42) +// Template-context tokens: A=3001,B=3002,C=3003,D=3004,E=3005,F=3006 +// Query-match tokens: X1=4001,X2=4002,X3=4003 +// +// hop3 (chunk 18, pos 1200): [X1,X2,V3,X3,E,V2,F,FILL] — 4-gram [X1,X2,V3,X3] matches query +// hop2 (chunk 9, pos 600): [C,V2,FILL,V1,D,FILL,FILL] — V2 in DIFFERENT context than hop3 +// hop1 (chunk 3, pos 200): [A,V1,FILL,B] — V1 in DIFFERENT context than hop2 +// query (pos 2044): [X1,X2,V3,X3] — matches hop3 4-gram exactly +// +// Pass 1 (4-gram): forces hop3. +// Pass 1 rare-token: V2 (freq=2) found in hop3 → also at pos 601 (hop2 chunk 9) → forces hop2. +// Pass 2 rare-token: V1 (freq=2) found in hop2 → also at pos 201 (hop1 chunk 3) → forces hop1. +// Today's impl (4-gram only) fails because V2 4-grams in hop3 ≠ V2 4-grams in hop2. 
+static void t4_rare_token_bridges_different_context() { + static constexpr int32_t V1 = 2001, V2 = 2002, V3 = 2003; + static constexpr int32_t A = 3001, B = 3002, C = 3003, D = 3004, E = 3005, F = 3006; + static constexpr int32_t X1 = 4001, X2 = 4002, X3 = 4003; + + const int N = 2048; + std::vector ids((size_t)N, FILLER); + + // hop1 (chunk 3, pos 200): [A, V1, FILL, B] + ids[200] = A; ids[201] = V1; ids[202] = FILLER; ids[203] = B; + + // hop2 (chunk 9, pos 600): [C, V2, FILL, V1, D, FILL, FILL] + ids[600] = C; ids[601] = V2; ids[602] = FILLER; ids[603] = V1; + ids[604] = D; ids[605] = FILLER; ids[606] = FILLER; + + // hop3 (chunk 18, pos 1200): [X1, X2, V3, X3, E, V2, F, FILL] + // V2 here is in 4-gram context [E,V2,F,FILL] — differs from hop2's [C,V2,FILL,V1] + ids[1200] = X1; ids[1201] = X2; ids[1202] = V3; ids[1203] = X3; + ids[1204] = E; ids[1205] = V2; ids[1206] = F; ids[1207] = FILLER; + + // query suffix (pos 2044): [X1, X2, V3, X3] — exact 4-gram match to hop3 + ids[2044] = X1; ids[2045] = X2; ids[2046] = V3; ids[2047] = X3; + + const int q0 = 1948; + std::vector initial_query_pool(ids.begin() + q0, ids.end()); + + const int n_chunks = (N + CHUNK - 1) / CHUNK; + std::vector forced((size_t)n_chunks, 0); + + dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0, + /*max_anchor_hits=*/8, /*ngram=*/4, + /*rare_token_max_freq=*/8}; + dflash::qwen3::scan_and_force_transitive(ids, q0, initial_query_pool, + cfg, /*max_iters=*/3, forced); + + const int chunk_hop3 = 1200 / CHUNK; // 18 + const int chunk_hop2 = 600 / CHUNK; // 9 + const int chunk_hop1 = 200 / CHUNK; // 3 + + REQUIRE(forced[(size_t)chunk_hop3] == 1); + REQUIRE(forced[(size_t)chunk_hop2] == 1); + REQUIRE(forced[(size_t)chunk_hop1] == 1); + + std::printf("T4 PASS: all hops forced via rare-token bridge (V2 freq=2, V1 freq=2)\n"); +} + +// T5: gate closes when pass-1 already finds >= cascade_min_anchor_count chunks. 
+// +// Layout (N=4096, chunk=64 → 64 chunks): +// A common 4-gram [CMN,CMN,CMN,CMN] appears 50 times at scattered body positions. +// One forced chunk (chunk 5, pos 320) also contains a unique rare token RT (freq=1). +// RT appears once more at a separate body position in chunk 60 (pos 3840). +// Query suffix contains the common 4-gram → pass-1 forces all 50 matching chunks. +// +// With cascade_min_anchor_count=5: gained=50 >= 5 → gate closes → cascade skipped. +// chunk 60 (pos 3840, which has RT but is only reachable via cascade) stays UNFORCED. +// +// With cascade_min_anchor_count=0: gate open → cascade runs → chunk 60 gets forced. +// This contrast proves the gate is operative. +static void t5_gate_closes_when_pass1_finds_many() { + static constexpr int32_t CMN = 5001; // common token (4-gram made of it) + static constexpr int32_t RT = 5002; // rare token (freq=2) + + const int N = 4096; + const int n_chunks = (N + CHUNK - 1) / CHUNK; // 64 + std::vector ids((size_t)N, FILLER); + + // Place common 4-gram at 50 scattered body positions (chunks 0..49). + // Spaced 64 tokens apart to land in different chunks. + for (int i = 0; i < 50; ++i) { + int pos = i * 64 + 4; // pos 4, 68, 132, ... (well within body) + ids[(size_t)pos] = CMN; + ids[(size_t)pos + 1] = CMN; + ids[(size_t)pos + 2] = CMN; + ids[(size_t)pos + 3] = CMN; + } + + // RT appears in chunk 5 (pos 320) and chunk 60 (pos 3840). + ids[320] = RT; + ids[3840] = RT; + + // Query suffix: just the common 4-gram so pass-1 fires on all 50 body positions. 
+ const int q0 = N - 32; + ids[(size_t)q0] = CMN; + ids[(size_t)q0 + 1] = CMN; + ids[(size_t)q0 + 2] = CMN; + ids[(size_t)q0 + 3] = CMN; + std::vector query_pool(ids.begin() + q0, ids.end()); + + // --- Test A: gate CLOSED (cascade_min_anchor_count=5) --- + { + std::vector forced_a((size_t)n_chunks, 0); + dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0, + /*max_anchor_hits=*/64, /*ngram=*/4, + /*rare_token_max_freq=*/2, + /*cascade_min_anchor_count=*/5, + /*max_forced_count=*/INT_MAX}; + dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool, + cfg, /*max_iters=*/3, forced_a); + + // Pass-1 forces chunks 0..49 (50 chunks); gate closes → cascade skipped. + // chunk 60 (pos 3840 has RT but only reachable via cascade) must be UNFORCED. + const int chunk_rt_extra = 3840 / CHUNK; // 60 + REQUIRE(forced_a[(size_t)chunk_rt_extra] == 0); + // chunk 5 (contains RT at pos 320) is forced by pass-1 (common 4-gram at pos 324). + REQUIRE(forced_a[5] == 1); + + std::printf("T5a PASS: gate closed (gained=50 >= min=5), chunk %d unforced\n", + chunk_rt_extra); + } + + // --- Test B: gate OPEN (cascade_min_anchor_count=0) → cascade forces chunk 60 --- + { + std::vector forced_b((size_t)n_chunks, 0); + dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0, + /*max_anchor_hits=*/64, /*ngram=*/4, + /*rare_token_max_freq=*/2, + /*cascade_min_anchor_count=*/0, + /*max_forced_count=*/INT_MAX}; + dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool, + cfg, /*max_iters=*/3, forced_b); + + // Cascade runs; chunk 5 is forced by pass-1 and contains RT; + // RT at pos 3840 → chunk 60 forced via rare-token cascade. + const int chunk_rt_extra = 3840 / CHUNK; + REQUIRE(forced_b[(size_t)chunk_rt_extra] == 1); + + std::printf("T5b PASS: gate open (min=0), cascade forced chunk %d via RT\n", + chunk_rt_extra); + } +} + +// T6: hard cap (max_forced_count) prevents runaway cascade. 
+// +// Layout (N=2048, chunk=64 → 32 chunks): +// Query contains 4-gram [TGR,TGR,TGR,TGR] which matches body chunk 0. +// Chunk 0 contains chain token C0 (freq=2): also appears in chunk 1. +// Chunk 1 contains chain token C1 (freq=2): also appears in chunk 2. +// ... 20 such chain links. +// Pass-1 forces chunk 0 (1 chunk gained < cascade_min_anchor_count=0 → gate open). +// Cascade rare-token worklist propagates: chunk 0→1→2→...→20 (20 more). +// max_forced_count=5 → cascade stops when total > 5. Result: forced <= 5. +static void t6_hard_cap_prevents_runaway() { + static constexpr int32_t TGR = 7000; // trigger token for 4-gram pass-1 match + + const int N = 2048; + const int n_chunks = (N + CHUNK - 1) / CHUNK; // 32 + std::vector ids((size_t)N, FILLER); + + // body chunk 0 (pos 0): place 4-gram [TGR,TGR,TGR,TGR] so pass-1 forces it. + ids[0] = TGR; ids[1] = TGR; ids[2] = TGR; ids[3] = TGR; + + // Rare-token chain: C_i appears in chunk i (at offset 8) and chunk i+1 (at offset 9). + // Offsets 8 and 9 within each chunk don't collide between consecutive tokens. + // Cascade worklist: chunk i forced → C_i found at offset 8 → chunk i+1 forced. + for (int i = 0; i < 20; ++i) { + int32_t tok = 7100 + i; + ids[(size_t)(i * 64 + 8)] = tok; // in chunk i, offset 8 + ids[(size_t)((i + 1) * 64 + 9)] = tok; // in chunk i+1, offset 9 + } + + // Query suffix: contains [TGR,TGR,TGR,TGR] → pass-1 matches body chunk 0. + const int q0 = N - 64; + ids[(size_t)q0] = TGR; + ids[(size_t)q0 + 1] = TGR; + ids[(size_t)q0 + 2] = TGR; + ids[(size_t)q0 + 3] = TGR; + std::vector query_pool(ids.begin() + q0, ids.end()); + + // Without cap: cascade forces chunks 0..20 (21 chunks total). + // With cap=5: stops at 5. 
+ std::vector forced((size_t)n_chunks, 0); + dflash::qwen3::AnchorScanCfg cfg{CHUNK, /*anchor_radius=*/0, + /*max_anchor_hits=*/8, /*ngram=*/4, + /*rare_token_max_freq=*/2, + /*cascade_min_anchor_count=*/0, + /*max_forced_count=*/5}; + dflash::qwen3::scan_and_force_transitive(ids, q0, query_pool, + cfg, /*max_iters=*/25, forced); + + int total_forced = 0; + for (int c = 0; c < n_chunks; ++c) total_forced += (int)forced[(size_t)c]; + + REQUIRE(total_forced <= 5); + REQUIRE(forced[0] == 1); // chunk 0 always forced by pass-1 + + std::printf("T6 PASS: hard cap engaged, forced=%d (cap=5, chain length=20)\n", + total_forced); +} + +int main() { + t1_single_pass_match(); + t2_single_pass_misses_hops(); + t3_transitive_rescues_all(); + t4_rare_token_bridges_different_context(); + t5_gate_closes_when_pass1_finds_many(); + t6_hard_cap_prevents_runaway(); + std::printf("\nAll anchor_transitive tests passed.\n"); + return 0; +}