From 912b797a6c97f239a0043357d1f50f4edcf277fd Mon Sep 17 00:00:00 2001
From: marksverdhei <mark.sverdhei@gmail.com>
Date: Sun, 14 Jun 2026 05:43:22 +0200
Subject: [PATCH] fix(decode): reserve >= dflash_block_size outputs for DFlash
 draft contexts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The DFlash draft decodes an entire diffusion block per step with every token
flagged for output (dflash_block_size tokens, default 16), so output_reserve()
needs cparams.n_outputs_max >= dflash_block_size. The server sizes draft
contexts to n_parallel (server-context.cpp:933); whenever that is smaller than
dflash_block_size, GGML_ASSERT(n_outputs_max <= cparams.n_outputs_max) in
output_reserve() trips on the first draft() call — the "loads clean, child
dies on first decode" crash.

Raise cparams.n_outputs_max to at least dflash_block_size in the llama_context
ctor for DFlash-arch models. The guard lives in the library so that it protects
every caller, not just the server path. Non-DFlash models are unaffected (the
adjustment is guarded on arch).

Addresses #108.
---
 src/llama-context.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 1157c21c33cd..2f96823c1785 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -208,6 +208,15 @@ llama_context::llama_context(
 
     cparams.n_outputs_max = params.n_outputs_max == 0 || llama_model_has_encoder(&model) ? cparams.n_batch : params.n_outputs_max;
 
+    // DFlash decodes an entire diffusion block per step with every token flagged
+    // for output (dflash_block_size tokens), so the output buffer must hold at least
+    // one block. The server sizes draft contexts to n_parallel (server-context.cpp);
+    // when that is < dflash_block_size, GGML_ASSERT(n_outputs_max <= cparams.n_outputs_max)
+    // in output_reserve() trips on the first draft(). See heiervang-technologies/ht-llama.cpp#108.
+    if (model.arch == LLM_ARCH_DFLASH) {
+        cparams.n_outputs_max = std::max(cparams.n_outputs_max, model.hparams.dflash_block_size);
+    }
+
     cparams.op_offload = params.op_offload;
     cparams.kv_unified = params.kv_unified;