From 912b797a6c97f239a0043357d1f50f4edcf277fd Mon Sep 17 00:00:00 2001 From: marksverdhei Date: Sun, 14 Jun 2026 05:43:22 +0200 Subject: [PATCH] fix(decode): reserve >= dflash_block_size outputs for DFlash draft contexts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The DFlash draft decodes an entire diffusion block per step with every token flagged for output (block_size tokens, default 16), so output_reserve() needs n_outputs_max >= dflash_block_size. The server sizes draft contexts to n_parallel (server-context.cpp:933), which is < block_size, tripping GGML_ASSERT(n_outputs_max <= cparams.n_outputs_max) in output_reserve() on the first draft() call — the "loads clean, child dies on first decode" crash. Clamp cparams.n_outputs_max up to dflash_block_size in the llama_context ctor for DFlash-arch models. In-library guard so it defends every caller, not just the server path. No effect on non-DFlash models (guarded on arch). Addresses #108. --- src/llama-context.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 1157c21c33cd..2f96823c1785 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -208,6 +208,15 @@ llama_context::llama_context( cparams.n_outputs_max = params.n_outputs_max == 0 || llama_model_has_encoder(&model) ? cparams.n_batch : params.n_outputs_max; + // DFlash decodes an entire diffusion block per step with every token flagged + // for output (block_size tokens), so the output buffer must hold at least the + // block size. The server sizes draft contexts to n_parallel (server-context.cpp), + // which is < dflash_block_size and trips GGML_ASSERT(n_outputs_max <= cparams.n_outputs_max) + // in output_reserve() on the first draft(). See heiervang-technologies/ht-llama.cpp#108. + if (model.arch == LLM_ARCH_DFLASH) { + cparams.n_outputs_max = std::max(cparams.n_outputs_max, model.hparams.dflash_block_size); + } + cparams.op_offload = params.op_offload; cparams.kv_unified = params.kv_unified;