From 50698a2053b8154d32f06de85bcd1cb43f4fa6ef Mon Sep 17 00:00:00 2001 From: Howard Su Date: Fri, 22 May 2026 13:46:35 +0800 Subject: [PATCH] feat(pflash): add dynamic threshold for chunk selection and lower auto-mode trigger - Introduce score-based dynamic n_keep via DFLASH_COMPRESS_DYNAMIC_THRESHOLD env var (float alpha in (0,1)). Keeps all chunks whose mean score >= max_score * alpha, clamped to [1, min(n_chunks/2, fixed_n_keep*3)]. - Lower pflash auto-mode token threshold from 32000 to 2000. --- dflash/src/qwen3/qwen3_drafter.cpp | 61 +++++++++++++++++++++++++++++- dflash/src/server/http_server.h | 2 +- dflash/src/server/server_main.cpp | 2 +- 3 files changed, 61 insertions(+), 4 deletions(-) diff --git a/dflash/src/qwen3/qwen3_drafter.cpp b/dflash/src/qwen3/qwen3_drafter.cpp index c5e31fb3..a137c2d1 100644 --- a/dflash/src/qwen3/qwen3_drafter.cpp +++ b/dflash/src/qwen3/qwen3_drafter.cpp @@ -70,6 +70,59 @@ static void force_chunk_neighborhood(std::vector & forced, int n_chunks for (int c = lo; c <= hi; ++c) forced[(size_t)c] = 1; } +// Dynamic threshold: override n_keep based on score distribution rather than +// a fixed ratio. Enabled via DFLASH_COMPRESS_DYNAMIC_THRESHOLD env var (float +// alpha in (0,1)). Keeps all chunks whose mean score >= max_score * alpha, +// clamped between [1, min(n_chunks/2, fixed_n_keep*3)]. +// Returns the original n_keep unchanged if env var is unset or invalid. +static int maybe_dynamic_n_keep( + const std::vector> & chunk_means_sorted_desc, + int n_chunks, + int fixed_n_keep) +{ + static const char * dyn_env = std::getenv("DFLASH_COMPRESS_DYNAMIC_THRESHOLD"); + if (!dyn_env) return fixed_n_keep; + + char * end = nullptr; + float alpha = std::strtof(dyn_env, &end); + if (end == dyn_env || alpha <= 0.0f || alpha >= 1.0f) { + // Invalid value — log once and fall back. + static bool warned = false; + if (!warned) { + std::fprintf(stderr, "[compress] DFLASH_COMPRESS_DYNAMIC_THRESHOLD='%s' " + "invalid (need float in (0,1)), using fixed keep_ratio\n", dyn_env); + warned = true; + } + return fixed_n_keep; + } + + if (chunk_means_sorted_desc.empty()) return fixed_n_keep; + + float max_score = chunk_means_sorted_desc[0].first; + if (max_score <= 0.0f || !std::isfinite(max_score)) return fixed_n_keep; + + float thresh = max_score * alpha; + int dyn_keep = 0; + for (const auto & cm : chunk_means_sorted_desc) { + if (cm.first >= thresh) ++dyn_keep; + } + + // Clamp: floor=1, ceiling=min(n_chunks/2, fixed_n_keep*3) to prevent + // pathological cases (flat scores → keep everything). + int ceiling = std::min(n_chunks / 2, std::max(fixed_n_keep * 3, 1)); + int result = std::max(1, std::min(dyn_keep, ceiling)); + + static bool logged_once = false; + if (!logged_once) { + std::fprintf(stderr, "[compress] dynamic threshold: alpha=%.3f max_score=%.4f " + "thresh=%.4f dyn_keep=%d ceiling=%d final=%d (fixed was %d)\n", + alpha, max_score, thresh, dyn_keep, ceiling, result, fixed_n_keep); + logged_once = true; + } + + return result; +} + #if defined(DFLASH27B_BACKEND_HIP) bool prewarm_drafter_once(const Qwen3DrafterWeights & w) { static bool warmed = false; @@ -475,7 +528,7 @@ static std::vector qwen35_score_and_compress( } const int n_chunks = (S + chunk_size - 1) / chunk_size; - const int n_keep = std::max(1, (int)((float)n_chunks * keep_ratio)); + const int fixed_n_keep = std::max(1, (int)((float)n_chunks * keep_ratio)); std::vector smooth_score = score; // Caller pool_kernel takes precedence; if zero/negative, fall back to env or 5. @@ -502,6 +555,8 @@ static std::vector qwen35_score_and_compress( chunk_means.push_back({s / std::max(1, hi - lo), c}); } std::sort(chunk_means.begin(), chunk_means.end(), [](auto a, auto b) { return a.first > b.first; }); + + const int n_keep = maybe_dynamic_n_keep(chunk_means, n_chunks, fixed_n_keep); std::vector selected((size_t)n_chunks, 0); int count = 0; @@ -685,7 +740,7 @@ std::vector drafter_score_and_compress( // ── 4. Chunk-top-K + span merge ─────────────────────────────────── int n_chunks = (S + chunk_size - 1) / chunk_size; - int n_keep = std::max(1, (int)((float)n_chunks * keep_ratio)); + int fixed_n_keep = std::max(1, (int)((float)n_chunks * keep_ratio)); std::vector> chunk_means; chunk_means.reserve((size_t)n_chunks); for (int c = 0; c < n_chunks; ++c) { @@ -699,6 +754,8 @@ std::vector drafter_score_and_compress( std::sort(chunk_means.begin(), chunk_means.end(), [](auto a, auto b) { return a.first > b.first; }); + int n_keep = maybe_dynamic_n_keep(chunk_means, n_chunks, fixed_n_keep); + // Retrieval tasks often repeat a rare key in the final query and in the // needle span. Exact scores alone can keep the query while dropping the // neighboring answer chunk, so force a small token-only anchor neighborhood. diff --git a/dflash/src/server/http_server.h b/dflash/src/server/http_server.h index 8c0ec9eb..554b4bb4 100644 --- a/dflash/src/server/http_server.h +++ b/dflash/src/server/http_server.h @@ -52,7 +52,7 @@ struct ServerConfig { // PFlash (speculative prefill compression) enum class PflashMode { OFF, AUTO, ALWAYS }; PflashMode pflash_mode = PflashMode::OFF; - int pflash_threshold = 32000; // token count threshold for AUTO mode + int pflash_threshold = 2000; // token count threshold for AUTO mode float pflash_keep_ratio = 0.05f; // fraction of tokens to keep std::string pflash_drafter_path; // path to drafter GGUF (Qwen3-0.6B) bool pflash_skip_park = false; // skip park/unpark for ≥32GB GPUs diff --git a/dflash/src/server/server_main.cpp b/dflash/src/server/server_main.cpp index 319f97de..45ef5558 100644 --- a/dflash/src/server/server_main.cpp +++ b/dflash/src/server/server_main.cpp @@ -64,7 +64,7 @@ static void print_usage(const char * prog) { "\n" "PFlash (speculative prefill compression):\n" " --prefill-compression off|auto|always (default: off)\n" - " --prefill-threshold Token threshold for auto mode (default: 32000)\n" + " --prefill-threshold Token threshold for auto mode (default: 2000)\n" " --prefill-keep-ratio Fraction of tokens to keep (default: 0.05)\n" " --prefill-drafter Drafter GGUF for compression (Qwen3-0.6B)\n" " --prefill-skip-park Skip park/unpark (for >=32GB GPUs)\n"