From 2033bc8ebbe67b59521499127288dab0260f5998 Mon Sep 17 00:00:00 2001
From: Loveneet Nigam <lnigam@nvidia.com>
Date: Mon, 8 Jun 2026 23:23:29 +0530
Subject: [PATCH 01/13] diffusion-gemma: chat-completions-compatible HTTP
 server (llama-server analogue)

---
 examples/diffusion-gemma/CMakeLists.txt       |  10 +
 .../diffusion-gemma-server.cpp                | 889 ++++++++++++++++++
 2 files changed, 899 insertions(+)
 create mode 100644 examples/diffusion-gemma/diffusion-gemma-server.cpp
diff --git a/examples/diffusion-gemma/CMakeLists.txt b/examples/diffusion-gemma/CMakeLists.txt
index 6ee3414d38df..b0f805717a7c 100644
--- a/examples/diffusion-gemma/CMakeLists.txt
+++ b/examples/diffusion-gemma/CMakeLists.txt
@@ -5,3 +5,13 @@ target_link_libraries(${TARGET} PRIVATE llama llama-common mtmd ${CMAKE_THREAD_L
 # mtmd (tools/) is added after examples/, so add its include dir explicitly for the headers
 target_include_directories(${TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../tools/mtmd)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+# OpenAI-compatible HTTP server for the block-diffusion models (llama-server analogue)
+set(TARGET llama-diffusion-gemma-server)
+add_executable(${TARGET} diffusion-gemma-server.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama llama-common mtmd cpp-httplib ${CMAKE_THREAD_LIBS_INIT})
+target_include_directories(${TARGET} PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../tools/mtmd
+    ${CMAKE_SOURCE_DIR}/vendor)   # cpp-httplib/httplib.h, nlohmann/json.hpp
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/diffusion-gemma/diffusion-gemma-server.cpp b/examples/diffusion-gemma/diffusion-gemma-server.cpp
new file mode 100644
index 000000000000..e6fd0a536a1b
--- /dev/null
+++ b/examples/diffusion-gemma/diffusion-gemma-server.cpp
@@ -0,0 +1,889 @@
+// OpenAI-compatible HTTP server for the block-diffusion models (diffusion-gemma).
+//
+// This is the llama-server analogue for the diffusion family: it loads a block-diffusion model
+// once and serves the same denoising generation loop as diffusion-gemma-cli over HTTP, exposing
+// the OpenAI endpoints (/v1/chat/completions, /v1/completions, /v1/models) plus the llama-server
+// observability surface (/health, /v1/health, /props, /metrics, /slots) with the same response
+// `timings`/`usage` objects, the same per-request timing logs, the same access log, and a Prometheus
+// /metrics endpoint -- all with the metric semantics re-mapped to block diffusion (canvas blocks,
+// denoising steps, s/step) instead of autoregressive token-by-token decode.
+//
+// Generation is NOT autoregressive: each request denoises one or more 256-token canvases against a
+// cached prompt prefix (see diffusion-gemma-cli.cpp for the full description). A single llama_context
+// is reused across requests and is not thread-safe, so generation is serialized behind a mutex (one
+// slot); the HTTP layer still accepts many connections concurrently.
+
+#include "arg.h"
+#include "chat.h"
+#include "common.h"
+#include "llama.h"
+#include "log.h"
+
+#include <cpp-httplib/httplib.h>
+#include <nlohmann/json.hpp>
+
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <ctime>
+#include <mutex>
+#include <numeric>
+#include <random>
+#include <string>
+#include <utility>
+#include <vector>
+
+using json = nlohmann::ordered_json;
+
+// llama-server-style log macros (server-common.h): "srv  <func>: ..." / "slot <func>: id N | ..."
+#define SRV_INF(fmt, ...) LOG_INF("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_WRN(fmt, ...) LOG_WRN("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_ERR(fmt, ...) LOG_ERR("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_DBG(fmt, ...) LOG_DBG("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SLT_INF(id, fmt, ...) LOG_INF("slot %12.*s: id %2d | " fmt, 12, __func__, (id), __VA_ARGS__)
+
+// reference defaults from generation_config.json / DiffusionGemmaGenerationConfig (mirror the CLI)
+static constexpr int   DEF_CANVAS_LENGTH    = 256;
+static constexpr int   DEF_MAX_DENOISE_STEPS = 48;
+static constexpr float ENTROPY_BOUND        = 0.1f;
+static constexpr float TEMP_MIN             = 0.4f;
+static constexpr float TEMP_MAX             = 0.8f;
+static constexpr float CONFIDENCE_THRESHOLD = 0.005f;
+static constexpr int   STABILITY_THRESHOLD  = 1;
+static constexpr int   SC_K                 = 256; // must match llama_model_diffusion_gemma::N_SC_TOPK
+
+// ------------------------------------------------------------------------------------------------
+// per-request generation parameters and result
+// ------------------------------------------------------------------------------------------------
+struct diffusion_request {
+    int      canvas_length = DEF_CANVAS_LENGTH;
+    int      n_steps       = DEF_MAX_DENOISE_STEPS;
+    int      max_canvases  = 1;     // ceil(max_tokens / canvas_length)
+    int      topk_fixed    = 0;     // 0 = full softmax
+    int      topk_start    = 0;
+    int      topk_end      = 0;
+    bool     topk_tail     = false;
+    bool     ignore_eos    = false; // run all max_canvases blocks (don't stop at end-of-text)
+    uint32_t seed          = 1234;
+};
+
+struct diffusion_result {
+    std::string answer;            // final response text (post channel-split / eog truncation)
+    std::string full;             // full detokenized canvas (thought + response)
+    int    prompt_tokens   = 0;    // prompt prefix tokens processed (encoder prefill)
+    int    answer_tokens   = 0;    // tokens in the extracted answer
+    int    n_blocks        = 0;    // canvas blocks run
+    int    n_steps_total   = 0;    // denoising steps across all blocks
+    int    canvas_tokens   = 0;    // n_blocks * canvas_length
+    int    n_decode        = 0;    // total llama_decode() calls (prefill + denoise + commit)
+    double prefill_ms      = 0.0;  // encoder-phase prompt prefill wall time
+    double gen_ms          = 0.0;  // denoising block-loop wall time
+    bool   ok              = false;
+    std::string error;
+};
+
+// ------------------------------------------------------------------------------------------------
+// server metrics (llama-server analogue; protected by its own mutex)
+// ------------------------------------------------------------------------------------------------
+struct server_metrics {
+    std::mutex mu;
+    int64_t  t_start = 0;                 // process start (unix seconds)
+
+    // cumulative counters
+    uint64_t n_requests_total           = 0;
+    uint64_t n_prompt_tokens_total      = 0;
+    double   t_prompt_ms_total          = 0.0;
+    uint64_t n_tokens_predicted_total   = 0;   // answer tokens
+    double   t_predicted_ms_total       = 0.0; // denoise time
+    uint64_t n_decode_total             = 0;
+    uint64_t n_blocks_total             = 0;
+    uint64_t n_steps_total              = 0;
+    uint64_t n_canvas_tokens_total      = 0;
+
+    // last-request gauges
+    double   last_prompt_tps            = 0.0;
+    double   last_predicted_tps         = 0.0;
+    double   last_steps_per_second      = 0.0;
+    double   last_canvas_tps            = 0.0;
+    double   last_ms_per_step           = 0.0;
+
+    std::atomic<int> n_processing{0};
+
+    void add(const diffusion_result & r) {
+        std::lock_guard<std::mutex> lk(mu);
+        n_requests_total          += 1;
+        n_prompt_tokens_total     += r.prompt_tokens;
+        t_prompt_ms_total         += r.prefill_ms;
+        n_tokens_predicted_total  += r.answer_tokens;
+        t_predicted_ms_total      += r.gen_ms;
+        n_decode_total            += r.n_decode;
+        n_blocks_total            += r.n_blocks;
+        n_steps_total             += r.n_steps_total;
+        n_canvas_tokens_total     += r.canvas_tokens;
+        last_prompt_tps      = r.prefill_ms > 0 ? r.prompt_tokens * 1e3 / r.prefill_ms : 0.0;
+        last_predicted_tps   = r.gen_ms     > 0 ? r.answer_tokens * 1e3 / r.gen_ms     : 0.0;
+        last_steps_per_second= r.gen_ms     > 0 ? r.n_steps_total * 1e3 / r.gen_ms     : 0.0;
+        last_canvas_tps      = r.gen_ms     > 0 ? r.canvas_tokens * 1e3 / r.gen_ms     : 0.0;
+        last_ms_per_step     = r.n_steps_total > 0 ? r.gen_ms / r.n_steps_total        : 0.0;
+    }
+};
+
+// ------------------------------------------------------------------------------------------------
+// server state (one model + one reused context == one slot, guarded by a mutex)
+// ------------------------------------------------------------------------------------------------
+struct diffusion_server {
+    llama_model       * model = nullptr;
+    llama_context     * ctx   = nullptr;
+    const llama_vocab * vocab = nullptr;
+    llama_memory_t      mem   = nullptr;
+    llama_batch         batch{};
+    common_chat_templates_ptr templates;
+
+    int    canvas_length = DEF_CANVAS_LENGTH;
+    int    n_steps       = DEF_MAX_DENOISE_STEPS;
+    int    n_ctx         = 0;
+    int    n_ub          = 0;       // ubatch == canvas_length (keeps the decoder gather graph small)
+    int    n_vocab       = 0;
+    std::string model_id;
+    std::string model_path;
+    std::string build_info = "llama.cpp diffusion-gemma-server";
+
+    server_metrics metrics;
+    std::mutex gen_mutex;            // serialize generation (ctx is single-threaded == one slot)
+
+    std::string format_messages(const json & messages) const {
+        common_chat_templates_inputs inputs;
+        inputs.messages = common_chat_msgs_parse_oaicompat(messages);
+        inputs.add_generation_prompt = true;
+        return common_chat_templates_apply(templates.get(), inputs).prompt;
+    }
+
+    // causal prefill of `toks` starting at position pos0, chunked to n_ub (encoder phase).
+    // increments *n_dec with the number of llama_decode() calls made.
+    bool prefill_causal(const std::vector<llama_token> & toks, int pos0, int * n_dec) {
+        llama_set_causal_attn(ctx, true);
+        llama_set_diffusion_decoder_phase(ctx, false);
+        llama_set_diffusion_self_cond_topk(ctx, nullptr, nullptr, 0, 0);
+        const int n = (int) toks.size();
+        for (int off = 0; off < n; off += n_ub) {
+            const int cnt = std::min(n_ub, n - off);
+            batch.n_tokens = cnt;
+            for (int i = 0; i < cnt; ++i) {
+                batch.token[i]     = toks[off + i];
+                batch.pos[i]       = pos0 + off + i;
+                batch.n_seq_id[i]  = 1;
+                batch.seq_id[i][0] = 0;
+                batch.logits[i]    = (off + i == n - 1) ? 1 : 0;
+            }
+            if (llama_decode(ctx, batch) != 0) return false;
+            ++(*n_dec);
+        }
+        return true;
+    }
+
+    // run the block-diffusion denoising loop for one request (mirrors diffusion-gemma-cli.cpp)
+    diffusion_result generate(const std::vector<llama_token> & prompt_tokens, const diffusion_request & rq) {
+        diffusion_result out;
+        const int prefix_len = (int) prompt_tokens.size();
+        int n_decode = 0;
+
+        int max_canvases = rq.max_canvases;
+        const int fit = (n_ctx - prefix_len) / canvas_length - 1;
+        if (fit < 1) {
+            out.error = "prompt too long for context (n_ctx=" + std::to_string(n_ctx) + ")";
+            return out;
+        }
+        if (max_canvases > fit) max_canvases = fit;
+        if (max_canvases < 1)   max_canvases = 1;
+
+        llama_memory_clear(mem, true);
+
+        std::mt19937 rng(rq.seed);
+        std::uniform_int_distribution<int> rand_tok(0, n_vocab - 1);
+        std::uniform_real_distribution<float> rand_unif(0.0f, 1.0f);
+
+        // ---- ENCODER phase: prefill the prompt prefix (causal, no self-conditioning) ----
+        const auto t_prefill0 = std::chrono::steady_clock::now();
+        int n_past = 0;
+        if (prefix_len > 0) {
+            if (!prefill_causal(prompt_tokens, 0, &n_decode)) {
+                out.error = "prompt prefill (encoder) decode failed";
+                return out;
+            }
+            n_past = prefix_len;
+        }
+        out.prefill_ms = std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - t_prefill0).count();
+
+        std::vector<llama_token> canvas(canvas_length);
+        std::vector<llama_token> argmax_canvas(canvas_length, -1);
+        std::vector<llama_token> prev_argmax(canvas_length, -1);
+        std::vector<llama_token> accepted(canvas_length);
+        std::vector<int32_t>     sc_ids ((size_t) SC_K * canvas_length, 0);
+        std::vector<float>       sc_probs((size_t) SC_K * canvas_length, 0.0f);
+        std::vector<llama_token> generated;
+
+        int n_blocks_run = 0, n_steps_total = 0;
+        bool done = false;
+
+        const auto t_gen0 = std::chrono::steady_clock::now();
+        for (int block = 0; block < max_canvases && !done; ++block) {
+            ++n_blocks_run;
+            for (auto & t : canvas) t = rand_tok(rng);
+            std::fill(prev_argmax.begin(), prev_argmax.end(), -1);
+            llama_set_diffusion_self_cond_topk(ctx, nullptr, nullptr, 0, 0);
+
+            for (int cur_step = n_steps; cur_step >= 1; --cur_step) {
+                ++n_steps_total;
+                llama_set_causal_attn(ctx, false);
+                llama_set_diffusion_decoder_phase(ctx, true);
+                batch.n_tokens = canvas_length;
+                for (int j = 0; j < canvas_length; ++j) {
+                    batch.token[j]     = canvas[j];
+                    batch.pos[j]       = n_past + j;
+                    batch.n_seq_id[j]  = 1;
+                    batch.seq_id[j][0] = 0;
+                    batch.logits[j]    = 1;
+                }
+                if (llama_decode(ctx, batch) != 0) {
+                    out.error = "llama_decode failed at denoising step " + std::to_string(cur_step);
+                    return out;
+                }
+                ++n_decode;
+                const float * logits = llama_get_logits(ctx);
+
+                const float temp = TEMP_MIN + (TEMP_MAX - TEMP_MIN) * ((float) cur_step / (float) n_steps);
+
+                std::vector<float> entropy(canvas_length);
+                std::vector<llama_token> sampled(canvas_length);
+
+                int k_step = 0;
+                if (rq.topk_start > 0 && rq.topk_end > 0) {
+                    const float frac = (n_steps > 1) ? (float) (cur_step - 1) / (float) (n_steps - 1) : 0.0f;
+                    k_step = (int) lroundf(rq.topk_end + (rq.topk_start - rq.topk_end) * frac);
+                } else if (rq.topk_fixed > 0) {
+                    k_step = rq.topk_fixed;
+                }
+                if (k_step <= 0 || k_step >= n_vocab) k_step = 0;
+
+                std::fill(sc_probs.begin(), sc_probs.end(), 0.0f);
+                std::fill(sc_ids.begin(),   sc_ids.end(),   0);
+
+                const auto cmp = [](const std::pair<float,int>&a, const std::pair<float,int>&b){ return a.first > b.first; };
+
+                if (k_step == 0) {
+                    std::vector<float> probs(n_vocab);
+                    std::vector<std::pair<float,int>> scheap; scheap.reserve(SC_K);
+                    for (int j = 0; j < canvas_length; ++j) {
+                        const float * lg = logits + (size_t) j * n_vocab;
+                        float maxl = -INFINITY; int amax = 0;
+                        scheap.clear();
+                        for (int v = 0; v < n_vocab; ++v) {
+                            const float x = lg[v] / temp;
+                            if (x > maxl) { maxl = x; amax = v; }
+                            if ((int) scheap.size() < SC_K) {
+                                scheap.push_back({x, v});
+                                std::push_heap(scheap.begin(), scheap.end(), cmp);
+                            } else if (x > scheap.front().first) {
+                                std::pop_heap(scheap.begin(), scheap.end(), cmp);
+                                scheap.back() = {x, v};
+                                std::push_heap(scheap.begin(), scheap.end(), cmp);
+                            }
+                        }
+                        float sum = 0.0f;
+                        for (int v = 0; v < n_vocab; ++v) { const float p = expf(lg[v] / temp - maxl); probs[v] = p; sum += p; }
+                        float ent = 0.0f;
+                        const float r = rand_unif(rng) * sum;
+                        float cum = 0.0f; int tok = amax; bool picked = false;
+                        for (int v = 0; v < n_vocab; ++v) {
+                            const float p = probs[v] / sum;
+                            if (p > 0.0f) ent -= p * logf(p);
+                            cum += probs[v];
+                            if (!picked && cum >= r) { tok = v; picked = true; }
+                        }
+                        int32_t * sid = sc_ids.data()   + (size_t) j * SC_K;
+                        float   * spr = sc_probs.data() + (size_t) j * SC_K;
+                        int slot = 0;
+                        for (auto & h : scheap) { sid[slot] = h.second; spr[slot] = expf(h.first - maxl) / sum; ++slot; }
+                        entropy[j] = ent; sampled[j] = tok; argmax_canvas[j] = amax;
+                    }
+                } else {
+                    const int heap_k = std::max(k_step, SC_K);
+                    std::vector<std::pair<float,int>> heap; heap.reserve(heap_k);
+                    for (int j = 0; j < canvas_length; ++j) {
+                        const float * lg = logits + (size_t) j * n_vocab;
+                        float maxl = -INFINITY; int amax = 0;
+                        heap.clear();
+                        for (int v = 0; v < n_vocab; ++v) {
+                            const float x = lg[v] / temp;
+                            if (x > maxl) { maxl = x; amax = v; }
+                            if ((int) heap.size() < heap_k) {
+                                heap.push_back({x, v});
+                                std::push_heap(heap.begin(), heap.end(), cmp);
+                            } else if (x > heap.front().first) {
+                                std::pop_heap(heap.begin(), heap.end(), cmp);
+                                heap.back() = {x, v};
+                                std::push_heap(heap.begin(), heap.end(), cmp);
+                            }
+                        }
+                        std::sort(heap.begin(), heap.end(), [](const std::pair<float,int>&a, const std::pair<float,int>&b){ return a.first > b.first; });
+                        float Zk = 0.0f;
+                        for (int i = 0; i < k_step; ++i) { const float e = expf(heap[i].first - maxl); heap[i].first = e; Zk += e; }
+                        float ent;
+                        if (rq.topk_tail) {
+                            double Zf = 0.0, T = 0.0;
+                            for (int v = 0; v < n_vocab; ++v) {
+                                const double d = (double) (lg[v] / temp) - (double) maxl;
+                                const double e = exp(d); Zf += e; T += d * e;
+                            }
+                            ent = (float) (log(Zf) - T / Zf);
+                        } else {
+                            ent = 0.0f;
+                            for (int i = 0; i < k_step; ++i) { const float q = heap[i].first / Zk; if (q > 0.0f) ent -= q * logf(q); }
+                        }
+                        const float r = rand_unif(rng) * Zk;
+                        float cum = 0.0f; int tok = amax; bool picked = false;
+                        for (int i = 0; i < k_step; ++i) { cum += heap[i].first; if (!picked && cum >= r) { tok = heap[i].second; picked = true; } }
+                        int32_t * sid = sc_ids.data()   + (size_t) j * SC_K;
+                        float   * spr = sc_probs.data() + (size_t) j * SC_K;
+                        const int n_sc = std::min(k_step, SC_K);
+                        for (int i = 0; i < n_sc; ++i) { sid[i] = heap[i].second; spr[i] = heap[i].first / Zk; }
+                        entropy[j] = ent; sampled[j] = tok; argmax_canvas[j] = amax;
+                    }
+                }
+
+                llama_memory_seq_rm(mem, 0, n_past, -1);
+                llama_set_diffusion_self_cond_topk(ctx, sc_ids.data(), sc_probs.data(), SC_K, canvas_length);
+
+                std::vector<int> order(canvas_length);
+                std::iota(order.begin(), order.end(), 0);
+                std::sort(order.begin(), order.end(), [&](int a, int b) { return entropy[a] < entropy[b]; });
+                std::vector<char> accept_mask(canvas_length, 0);
+                float prefix = 0.0f;
+                for (int k = 0; k < canvas_length; ++k) {
+                    if (prefix <= ENTROPY_BOUND) { accept_mask[order[k]] = 1; prefix += entropy[order[k]]; }
+                    else break;
+                }
+                for (int i = 0; i < canvas_length; ++i) if (accept_mask[i]) accepted[i] = sampled[i];
+
+                const float mean_ent = std::accumulate(entropy.begin(), entropy.end(), 0.0f) / canvas_length;
+                const bool stable    = (STABILITY_THRESHOLD == 0) || (argmax_canvas == prev_argmax);
+                const bool confident = mean_ent < CONFIDENCE_THRESHOLD;
+                if (stable && confident) break;
+                prev_argmax = argmax_canvas;
+
+                for (int i = 0; i < canvas_length; ++i) canvas[i] = accept_mask[i] ? accepted[i] : rand_tok(rng);
+            }
+
+            const std::vector<llama_token> & block_out = argmax_canvas;
+            generated.insert(generated.end(), block_out.begin(), block_out.end());
+            if (!rq.ignore_eos) {
+                for (int j = 0; j < canvas_length; ++j) {
+                    if (llama_vocab_is_eog(vocab, block_out[j])) { done = true; break; }
+                }
+            }
+            if (!done && block + 1 < max_canvases) {
+                if (!prefill_causal(block_out, n_past, &n_decode)) {
+                    out.error = "canvas commit (encoder) decode failed";
+                    return out;
+                }
+                n_past += canvas_length;
+            }
+        }
+        out.gen_ms = std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - t_gen0).count();
+        out.full   = common_detokenize(vocab, generated, false);
+
+        // extract the final response: after the last "<channel|>" close, until the first eog token
+        llama_token chan_close = LLAMA_TOKEN_NULL;
+        {
+            auto t = common_tokenize(vocab, "<channel|>", false, true);
+            if (t.size() == 1) chan_close = t[0];
+        }
+        const int n_gen = (int) generated.size();
+        int start = 0;
+        if (chan_close != LLAMA_TOKEN_NULL) {
+            for (int j = 0; j < n_gen; ++j) if (generated[j] == chan_close) start = j + 1;
+        }
+        std::vector<llama_token> answer;
+        for (int j = start; j < n_gen; ++j) {
+            if (llama_vocab_is_eog(vocab, generated[j])) break;
+            answer.push_back(generated[j]);
+        }
+        std::string ans = common_detokenize(vocab, answer, false);
+        {
+            std::string s = ans;
+            size_t h = s.find_first_not_of(" \n\t"); if (h != std::string::npos) s = s.substr(h);
+            const size_t half = s.size() / 2;
+            if (half > 0 && s.compare(0, half, s, s.size() - half, half) == 0) ans = s.substr(0, half);
+        }
+
+        out.answer        = ans;
+        out.prompt_tokens = prefix_len;
+        out.answer_tokens = (int) answer.size();
+        out.n_blocks      = n_blocks_run;
+        out.n_steps_total = n_steps_total;
+        out.canvas_tokens = n_blocks_run * canvas_length;
+        out.n_decode      = n_decode;
+        out.ok            = true;
+        return out;
+    }
+};
+
+// ------------------------------------------------------------------------------------------------
+// JSON / log helpers
+// ------------------------------------------------------------------------------------------------
+static std::atomic<uint64_t> g_req_counter{0};
+static std::string gen_id(const char * prefix) { return std::string(prefix) + "-" + std::to_string(g_req_counter.fetch_add(1)); }
+static json error_json(const std::string & msg, const std::string & type, int code) {
+    return json{{"error", {{"message", msg}, {"type", type}, {"code", code}}}};
+}
+
+// llama-server `timings` object, with diffusion fields. prompt_* describes the encoder prefill;
+// predicted_* describes the answer tokens; the nested `diffusion` object describes the denoising.
+static json timings_json(const diffusion_result & r) {
+    const double ppt = r.prompt_tokens > 0 ? r.prefill_ms / r.prompt_tokens : 0.0;
+    const double pps = r.prefill_ms    > 0 ? r.prompt_tokens * 1e3 / r.prefill_ms : 0.0;
+    const double dpt = r.answer_tokens > 0 ? r.gen_ms / r.answer_tokens : 0.0;
+    const double dps = r.gen_ms        > 0 ? r.answer_tokens * 1e3 / r.gen_ms : 0.0;
+    const double sps = r.gen_ms        > 0 ? r.n_steps_total * 1e3 / r.gen_ms : 0.0;
+    const double cps = r.gen_ms        > 0 ? r.canvas_tokens * 1e3 / r.gen_ms : 0.0;
+    const double mps = r.n_steps_total > 0 ? r.gen_ms / r.n_steps_total : 0.0;
+    return json{
+        {"cache_n", 0},
+        {"prompt_n", r.prompt_tokens},
+        {"prompt_ms", r.prefill_ms},
+        {"prompt_per_token_ms", ppt},
+        {"prompt_per_second", pps},
+        {"predicted_n", r.answer_tokens},
+        {"predicted_ms", r.gen_ms},
+        {"predicted_per_token_ms", dpt},
+        {"predicted_per_second", dps},
+        {"diffusion", {
+            {"n_blocks", r.n_blocks},
+            {"n_steps", r.n_steps_total},
+            {"canvas_tokens", r.canvas_tokens},
+            {"ms_per_step", mps},
+            {"steps_per_second", sps},
+            {"canvas_tokens_per_second", cps},
+            {"n_decode", r.n_decode},
+        }},
+    };
+}
+
+static json usage_json(const diffusion_result & r) {
+    return json{
+        {"prompt_tokens", r.prompt_tokens},
+        {"completion_tokens", r.answer_tokens},
+        {"total_tokens", r.prompt_tokens + r.answer_tokens},
+        {"prompt_tokens_details", {{"cached_tokens", 0}}},
+    };
+}
+
+// per-request timing log, llama_perf-style but with the denoise phase substituted for AR eval
+static void log_timings(const diffusion_result & r) {
+    const double ppt = r.prompt_tokens > 0 ? r.prefill_ms / r.prompt_tokens : 0.0;
+    const double pps = r.prefill_ms    > 0 ? r.prompt_tokens * 1e3 / r.prefill_ms : 0.0;
+    const double mps = r.n_steps_total > 0 ? r.gen_ms / r.n_steps_total : 0.0;
+    const double sps = r.gen_ms        > 0 ? r.n_steps_total * 1e3 / r.gen_ms : 0.0;
+    const double cps = r.gen_ms        > 0 ? r.canvas_tokens * 1e3 / r.gen_ms : 0.0;
+    SLT_INF(0, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+            r.prefill_ms, r.prompt_tokens, ppt, pps);
+    SLT_INF(0, "    denoise time = %10.2f ms / %5d steps  (%8.2f ms per step,  %8.2f steps per second)\n",
+            r.gen_ms, r.n_steps_total, mps, sps);
+    SLT_INF(0, "      gen tokens = %5d answer | %5d canvas over %d block(s) (%8.2f canvas tok/s)\n",
+            r.answer_tokens, r.canvas_tokens, r.n_blocks, cps);
+    SLT_INF(0, "      total time = %10.2f ms / %5d decode call(s)\n",
+            r.prefill_ms + r.gen_ms, r.n_decode);
+}
+
+static diffusion_request request_from_body(const json & body, const diffusion_server & srv, uint32_t default_seed) {
+    diffusion_request rq;
+    rq.canvas_length = srv.canvas_length;
+    rq.n_steps       = srv.n_steps;
+    rq.seed          = default_seed;
+    int max_tokens = 0;
+    if (body.contains("max_tokens") && body["max_tokens"].is_number_integer())                       max_tokens = body["max_tokens"].get<int>();
+    else if (body.contains("max_completion_tokens") && body["max_completion_tokens"].is_number_integer()) max_tokens = body["max_completion_tokens"].get<int>();
+    rq.max_canvases = max_tokens > 0 ? (max_tokens + rq.canvas_length - 1) / rq.canvas_length : 1;
+    if (body.contains("top_k") && body["top_k"].is_number_integer()) rq.topk_fixed = body["top_k"].get<int>();
+    if (body.contains("seed")  && body["seed"].is_number_integer())  rq.seed = (uint32_t) body["seed"].get<int64_t>();
+    if (body.contains("ignore_eos") && body["ignore_eos"].is_boolean()) rq.ignore_eos = body["ignore_eos"].get<bool>();
+    return rq;
+}
+
+// ------------------------------------------------------------------------------------------------
+int main(int argc, char ** argv) {
+    std::string hostname = "127.0.0.1";
+    int         port     = 8080;
+    std::string api_key;
+    bool        enable_metrics = false;
+    bool        enable_slots   = false;
+
+    // Pull server-only flags out of argv before common_params_parse (which validates against
+    // LLAMA_EXAMPLE_DIFFUSION and would reject these). Everything else (-m, --mmproj, -ngl, -c,
+    // --seed, --top-k*, -t, ...) is forwarded and parsed normally.
+    std::vector<char *> fwd;
+    fwd.push_back(argv[0]);
+    for (int i = 1; i < argc; ++i) {
+        const std::string a = argv[i];
+        auto next = [&](const char * def) -> std::string { return (i + 1 < argc) ? argv[++i] : def; };
+        if (a == "--host")            { hostname = next("127.0.0.1"); }
+        else if (a == "--port")       { port = atoi(next("8080").c_str()); }
+        else if (a == "--api-key")    { api_key = next(""); }
+        else if (a == "--metrics")    { enable_metrics = true; }
+        else if (a == "--slots")      { enable_slots = true; }
+        else if (a == "--no-metrics") { enable_metrics = false; }
+        else if (a == "--no-slots")   { enable_slots = false; }
+        else                          { fwd.push_back(argv[i]); }
+    }
+
+    common_params params;
+    if (!common_params_parse((int) fwd.size(), fwd.data(), params, LLAMA_EXAMPLE_DIFFUSION)) {
+        return 1;
+    }
+    common_init();
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    SRV_INF("%s\n", "loading model");
+
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = params.n_gpu_layers >= 0 ? params.n_gpu_layers : 999;
+    model_params.devices      = params.devices.data();
+    model_params.use_mmap     = params.use_mmap;
+
+    diffusion_server srv;
+    srv.model = llama_model_load_from_file(params.model.path.c_str(), model_params);
+    if (!srv.model) {
+        SRV_ERR("failed to load model '%s'\n", params.model.path.c_str());
+        return 1;
+    }
+    if (!llama_model_is_diffusion(srv.model)) {
+        SRV_ERR("'%s' is not a diffusion model\n", params.model.path.c_str());
+        llama_model_free(srv.model);
+        return 1;
+    }
+
+    srv.vocab         = llama_model_get_vocab(srv.model);
+    srv.n_vocab       = llama_vocab_n_tokens(srv.vocab);
+    srv.canvas_length = DEF_CANVAS_LENGTH;
+    srv.n_steps       = DEF_MAX_DENOISE_STEPS;
+    srv.n_ub          = srv.canvas_length;
+    srv.n_ctx         = params.n_ctx > 0 ? (int) params.n_ctx : 4096;
+    if (srv.n_ctx < 2 * srv.canvas_length) srv.n_ctx = 2 * srv.canvas_length;
+    srv.model_path    = params.model.path;
+    srv.metrics.t_start = (int64_t) std::time(nullptr);
+
+    {
+        std::string p = params.model.path;
+        size_t slash = p.find_last_of("/\\");
+        srv.model_id = (slash == std::string::npos) ? p : p.substr(slash + 1);
+        size_t dot = srv.model_id.rfind(".gguf");
+        if (dot != std::string::npos) srv.model_id = srv.model_id.substr(0, dot);
+        if (srv.model_id.empty()) srv.model_id = "diffusion-gemma";
+    }
+
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx    = srv.n_ctx;
+    ctx_params.n_batch  = srv.n_ub;
+    ctx_params.n_ubatch = srv.n_ub;
+    ctx_params.no_perf  = params.no_perf;
+
+    srv.ctx = llama_init_from_model(srv.model, ctx_params);
+    if (!srv.ctx) {
+        SRV_ERR("%s\n", "failed to create context");
+        llama_model_free(srv.model);
+        return 1;
+    }
+    llama_set_n_threads(srv.ctx, params.cpuparams.n_threads, params.cpuparams_batch.n_threads);
+    srv.mem   = llama_get_memory(srv.ctx);
+    srv.batch = llama_batch_init(srv.n_ub, 0, 1);
+    srv.templates = common_chat_templates_init(srv.model, "");
+
+    const uint32_t default_seed = params.sampling.seed == LLAMA_DEFAULT_SEED ? 1234u : params.sampling.seed;
+
+    SRV_INF("%s\n", llama_print_system_info());
+    SRV_INF("model loaded: '%s' | n_ctx = %d | canvas = %d | denoise steps = %d | 1 slot\n",
+            srv.model_id.c_str(), srv.n_ctx, srv.canvas_length, srv.n_steps);
+
+    // ------------------------------------------------------------------------------------------
+    // HTTP server
+    // ------------------------------------------------------------------------------------------
+    httplib::Server http;
+    http.set_default_headers({
+        {"Server", "llama.cpp-diffusion-gemma"},
+        {"Access-Control-Allow-Origin", "*"},
+        {"Access-Control-Allow-Headers", "Content-Type, Authorization"},
+        {"Access-Control-Allow-Methods", "GET, POST, OPTIONS"},
+    });
+
+    // access log (llama-server: SRV_TRC "done request: METHOD PATH ADDR STATUS"; skip noisy paths)
+    http.set_logger([](const httplib::Request & req, const httplib::Response & res) {
+        if (req.path == "/health" || req.path == "/v1/health" || req.path == "/metrics" ||
+            req.path == "/props"  || req.path == "/models"    || req.path == "/v1/models") {
+            return;
+        }
+        SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
+        SRV_DBG("request body: %s\n", req.body.c_str());
+    });
+
+    if (!api_key.empty()) {
+        http.set_pre_routing_handler([api_key](const httplib::Request & req, httplib::Response & res) {
+            if (req.method == "OPTIONS" || req.path == "/health" || req.path == "/v1/health") {
+                return httplib::Server::HandlerResponse::Unhandled;
+            }
+            if (req.get_header_value("Authorization") != "Bearer " + api_key) {
+                res.status = 401;
+                res.set_content(error_json("invalid api key", "authentication_error", 401).dump(), "application/json");
+                return httplib::Server::HandlerResponse::Handled;
+            }
+            return httplib::Server::HandlerResponse::Unhandled;
+        });
+    }
+
+    http.Options(R"(.*)", [](const httplib::Request &, httplib::Response & res) { res.status = 204; });
+
+    auto health = [](const httplib::Request &, httplib::Response & res) {
+        res.set_content(json{{"status", "ok"}}.dump(), "application/json");
+    };
+    http.Get("/health", health);
+    http.Get("/v1/health", health);
+
+    http.Get("/v1/models", [&srv](const httplib::Request &, httplib::Response & res) {
+        json models = json::array();
+        models.push_back({{"id", srv.model_id}, {"object", "model"}, {"created", (int64_t) std::time(nullptr)}, {"owned_by", "local"}});
+        res.set_content(json{{"object", "list"}, {"data", models}}.dump(), "application/json");
+    });
+    http.Get("/models", [&srv](const httplib::Request &, httplib::Response & res) {
+        json models = json::array();
+        models.push_back({{"id", srv.model_id}, {"object", "model"}, {"created", (int64_t) std::time(nullptr)}, {"owned_by", "local"}});
+        res.set_content(json{{"object", "list"}, {"data", models}}.dump(), "application/json");
+    });
+
+    // /props (llama-server analogue): server + model + default generation settings
+    http.Get("/props", [&](const httplib::Request &, httplib::Response & res) {
+        json props{
+            {"default_generation_settings", {
+                {"n_ctx", srv.n_ctx},
+                {"canvas_length", srv.canvas_length},
+                {"n_denoise_steps", srv.n_steps},
+                {"temperature_min", TEMP_MIN},
+                {"temperature_max", TEMP_MAX},
+                {"entropy_bound", ENTROPY_BOUND},
+                {"confidence_threshold", CONFIDENCE_THRESHOLD},
+                {"stability_threshold", STABILITY_THRESHOLD},
+                {"self_cond_topk", SC_K},
+            }},
+            {"total_slots", 1},
+            {"model_path", srv.model_path},
+            {"model_alias", srv.model_id},
+            {"n_ctx", srv.n_ctx},
+            {"build_info", srv.build_info},
+            {"generation_type", "block-diffusion"},
+            {"endpoint_slots", enable_slots},
+            {"endpoint_metrics", enable_metrics},
+        };
+        res.set_content(props.dump(), "application/json");
+    });
+
+    // /metrics (Prometheus). Gated behind --metrics, like llama-server.
+    if (enable_metrics) {
+        http.Get("/metrics", [&srv](const httplib::Request &, httplib::Response & res) {
+            server_metrics & m = srv.metrics;
+            std::lock_guard<std::mutex> lk(m.mu);
+            const int n_proc = m.n_processing.load();
+            char buf[8192];
+            int n = snprintf(buf, sizeof(buf),
+                // --- counters (re-mapped to diffusion where the AR concept differs) ---
+                "# HELP llamacpp:prompt_tokens_total Number of prompt tokens processed.\n"
+                "# TYPE llamacpp:prompt_tokens_total counter\n"
+                "llamacpp:prompt_tokens_total %llu\n"
+                "# HELP llamacpp:prompt_seconds_total Prompt (encoder prefill) process time.\n"
+                "# TYPE llamacpp:prompt_seconds_total counter\n"
+                "llamacpp:prompt_seconds_total %.3f\n"
+                "# HELP llamacpp:tokens_predicted_total Number of answer tokens generated.\n"
+                "# TYPE llamacpp:tokens_predicted_total counter\n"
+                "llamacpp:tokens_predicted_total %llu\n"
+                "# HELP llamacpp:tokens_predicted_seconds_total Denoising (generation) process time.\n"
+                "# TYPE llamacpp:tokens_predicted_seconds_total counter\n"
+                "llamacpp:tokens_predicted_seconds_total %.3f\n"
+                "# HELP llamacpp:n_decode_total Total number of llama_decode() calls (prefill + denoise + commit).\n"
+                "# TYPE llamacpp:n_decode_total counter\n"
+                "llamacpp:n_decode_total %llu\n"
+                "# HELP llamacpp:requests_total Total number of completed generation requests.\n"
+                "# TYPE llamacpp:requests_total counter\n"
+                "llamacpp:requests_total %llu\n"
+                "# HELP llamacpp:diffusion_blocks_total Total number of canvas blocks denoised.\n"
+                "# TYPE llamacpp:diffusion_blocks_total counter\n"
+                "llamacpp:diffusion_blocks_total %llu\n"
+                "# HELP llamacpp:diffusion_steps_total Total number of denoising steps.\n"
+                "# TYPE llamacpp:diffusion_steps_total counter\n"
+                "llamacpp:diffusion_steps_total %llu\n"
+                "# HELP llamacpp:diffusion_canvas_tokens_total Total canvas tokens denoised (blocks * canvas_length).\n"
+                "# TYPE llamacpp:diffusion_canvas_tokens_total counter\n"
+                "llamacpp:diffusion_canvas_tokens_total %llu\n"
+                // --- gauges (last request) ---
+                "# HELP llamacpp:prompt_tokens_seconds Average prompt throughput in tokens/s.\n"
+                "# TYPE llamacpp:prompt_tokens_seconds gauge\n"
+                "llamacpp:prompt_tokens_seconds %.3f\n"
+                "# HELP llamacpp:predicted_tokens_seconds Average answer-token throughput in tokens/s.\n"
+                "# TYPE llamacpp:predicted_tokens_seconds gauge\n"
+                "llamacpp:predicted_tokens_seconds %.3f\n"
+                "# HELP llamacpp:diffusion_steps_per_second Denoising steps per second (last request).\n"
+                "# TYPE llamacpp:diffusion_steps_per_second gauge\n"
+                "llamacpp:diffusion_steps_per_second %.3f\n"
+                "# HELP llamacpp:diffusion_canvas_tokens_per_second Canvas tokens per second (last request).\n"
+                "# TYPE llamacpp:diffusion_canvas_tokens_per_second gauge\n"
+                "llamacpp:diffusion_canvas_tokens_per_second %.3f\n"
+                "# HELP llamacpp:diffusion_ms_per_step Milliseconds per denoising step (last request).\n"
+                "# TYPE llamacpp:diffusion_ms_per_step gauge\n"
+                "llamacpp:diffusion_ms_per_step %.3f\n"
+                "# HELP llamacpp:requests_processing Number of requests currently generating.\n"
+                "# TYPE llamacpp:requests_processing gauge\n"
+                "llamacpp:requests_processing %d\n",
+                (unsigned long long) m.n_prompt_tokens_total,
+                m.t_prompt_ms_total / 1000.0,
+                (unsigned long long) m.n_tokens_predicted_total,
+                m.t_predicted_ms_total / 1000.0,
+                (unsigned long long) m.n_decode_total,
+                (unsigned long long) m.n_requests_total,
+                (unsigned long long) m.n_blocks_total,
+                (unsigned long long) m.n_steps_total,
+                (unsigned long long) m.n_canvas_tokens_total,
+                m.last_prompt_tps,
+                m.last_predicted_tps,
+                m.last_steps_per_second,
+                m.last_canvas_tps,
+                m.last_ms_per_step,
+                n_proc);
+            res.set_header("Process-Start-Time-Unix", std::to_string(m.t_start));
+            res.set_content(std::string(buf, n > 0 ? (size_t) n : 0), "text/plain; version=0.0.4");
+        });
+        SRV_INF("%s\n", "metrics endpoint enabled at /metrics");
+    }
+
+    // /slots (single slot). Gated behind --slots, like llama-server.
+    if (enable_slots) {
+        http.Get("/slots", [&srv](const httplib::Request &, httplib::Response & res) {
+            json slot{
+                {"id", 0},
+                {"n_ctx", srv.n_ctx},
+                {"is_processing", srv.metrics.n_processing.load() > 0},
+                {"generation_type", "block-diffusion"},
+                {"params", {
+                    {"canvas_length", srv.canvas_length},
+                    {"n_denoise_steps", srv.n_steps},
+                    {"self_cond_topk", SC_K},
+                }},
+            };
+            res.set_content(json::array({slot}).dump(), "application/json");
+        });
+        SRV_INF("%s\n", "slots endpoint enabled at /slots");
+    }
+
+    // shared: run one generation (under the slot mutex), update metrics, print timing log
+    auto run_for_body = [&srv, default_seed](const json & body, const std::vector<llama_token> & prompt_tokens) {
+        diffusion_request rq = request_from_body(body, srv, default_seed);
+        srv.metrics.n_processing.fetch_add(1);
+        std::unique_lock<std::mutex> lock(srv.gen_mutex);
+        diffusion_result r = srv.generate(prompt_tokens, rq);
+        lock.unlock();
+        srv.metrics.n_processing.fetch_sub(1);
+        if (r.ok) { srv.metrics.add(r); log_timings(r); }
+        return r;
+    };
+
+    // POST /v1/chat/completions
+    http.Post("/v1/chat/completions", [&](const httplib::Request & req, httplib::Response & res) {
+        json body;
+        try { body = json::parse(req.body); }
+        catch (const std::exception & e) { res.status = 400; res.set_content(error_json(std::string("invalid JSON: ") + e.what(), "invalid_request_error", 400).dump(), "application/json"); return; }
+        if (!body.contains("messages") || !body["messages"].is_array()) {
+            res.status = 400; res.set_content(error_json("'messages' (array) is required", "invalid_request_error", 400).dump(), "application/json"); return;
+        }
+        std::string prompt;
+        try { prompt = srv.format_messages(body["messages"]); }
+        catch (const std::exception & e) { res.status = 400; res.set_content(error_json(std::string("failed to format messages: ") + e.what(), "invalid_request_error", 400).dump(), "application/json"); return; }
+        const std::vector<llama_token> prompt_tokens = common_tokenize(srv.vocab, prompt, false, true);
+        const bool stream = body.value("stream", false);
+
+        const diffusion_result r = run_for_body(body, prompt_tokens);
+        if (!r.ok) { res.status = 500; res.set_content(error_json(r.error, "server_error", 500).dump(), "application/json"); return; }
+
+        if (!stream) {
+            json out{
+                {"id", gen_id("chatcmpl")}, {"object", "chat.completion"}, {"created", (int64_t) std::time(nullptr)}, {"model", srv.model_id},
+                {"choices", json::array({ json{{"index", 0}, {"message", {{"role", "assistant"}, {"content", r.answer}}}, {"finish_reason", "stop"}} })},
+                {"usage", usage_json(r)},
+                {"timings", timings_json(r)},
+            };
+            res.set_content(out.dump(), "application/json");
+            return;
+        }
+
+        const std::string id = gen_id("chatcmpl");
+        const int64_t created = (int64_t) std::time(nullptr);
+        const std::string model_id = srv.model_id;
+        const std::string answer = r.answer;
+        const json timings = timings_json(r);
+        res.set_chunked_content_provider("text/event-stream",
+            [id, created, model_id, answer, timings](size_t, httplib::DataSink & sink) {
+                auto send = [&](const json & c) { std::string s = "data: " + c.dump() + "\n\n"; return sink.write(s.data(), s.size()); };
+                auto chunk = [&](const json & delta, const char * finish, const json * extra) {
+                    json c{{"id", id}, {"object", "chat.completion.chunk"}, {"created", created}, {"model", model_id},
+                           {"choices", json::array({ json{{"index", 0}, {"delta", delta}, {"finish_reason", finish ? json(finish) : json(nullptr)}} })}};
+                    if (extra) c["timings"] = *extra;
+                    return send(c);
+                };
+                if (!chunk(json{{"role", "assistant"}}, nullptr, nullptr)) return false;
+                if (!chunk(json{{"content", answer}}, nullptr, nullptr))   return false;
+                if (!chunk(json::object(), "stop", &timings))              return false;
+                const std::string done = "data: [DONE]\n\n";
+                sink.write(done.data(), done.size());
+                sink.done();
+                return true;
+            });
+    });
+
+    // POST /v1/completions (plain prompt)
+    http.Post("/v1/completions", [&](const httplib::Request & req, httplib::Response & res) {
+        json body;
+        try { body = json::parse(req.body); }
+        catch (const std::exception & e) { res.status = 400; res.set_content(error_json(std::string("invalid JSON: ") + e.what(), "invalid_request_error", 400).dump(), "application/json"); return; }
+        if (!body.contains("prompt") || !body["prompt"].is_string()) {
+            res.status = 400; res.set_content(error_json("'prompt' (string) is required", "invalid_request_error", 400).dump(), "application/json"); return;
+        }
+        const std::string prompt = body["prompt"].get<std::string>();
+        const std::vector<llama_token> prompt_tokens = common_tokenize(srv.vocab, prompt, true, true);
+
+        const diffusion_result r = run_for_body(body, prompt_tokens);
+        if (!r.ok) { res.status = 500; res.set_content(error_json(r.error, "server_error", 500).dump(), "application/json"); return; }
+        json out{
+            {"id", gen_id("cmpl")}, {"object", "text_completion"}, {"created", (int64_t) std::time(nullptr)}, {"model", srv.model_id},
+            {"choices", json::array({ json{{"index", 0}, {"text", r.answer}, {"finish_reason", "stop"}} })},
+            {"usage", usage_json(r)},
+            {"timings", timings_json(r)},
+        };
+        res.set_content(out.dump(), "application/json");
+    });
+
+    SRV_INF("server is listening on http://%s:%d\n", hostname.c_str(), port);
+    SRV_INF("%s\n", "all slots are idle");
+    if (!http.listen(hostname, port)) {
+        SRV_ERR("failed to bind to %s:%d\n", hostname.c_str(), port);
+        llama_batch_free(srv.batch);
+        llama_free(srv.ctx);
+        llama_model_free(srv.model);
+        llama_backend_free();
+        return 1;
+    }
+
+    llama_batch_free(srv.batch);
+    llama_free(srv.ctx);
+    llama_model_free(srv.model);
+    llama_backend_free();
+    return 0;
+}

From ed011ec8153a50cedc0e3b317489bab30df17525 Mon Sep 17 00:00:00 2001
From: Loveneet Nigam <lnigam@nvidia.com>
Date: Tue, 9 Jun 2026 12:10:55 +0530
Subject: [PATCH 02/13] diffusion-gemma: add CUDA graph GPU sampling server
 path

---
 .../diffusion-gemma/diffusion-gemma-cli.cpp   |  76 ++-
 .../diffusion-gemma-server.cpp                | 115 +++-
 ggml/include/ggml-cuda.h                      |  25 +
 ggml/src/ggml-cuda/diffusion-sampling.cu      | 493 ++++++++++++++++++
 ggml/src/ggml-cuda/diffusion-sampling.cuh     |   9 +
 ggml/src/ggml-cuda/ggml-cuda.cu               |  27 +-
 include/llama.h                               |  29 ++
 src/llama-context.cpp                         |  98 +++-
 src/llama-context.h                           |   7 +
 9 files changed, 855 insertions(+), 24 deletions(-)
 create mode 100644 ggml/src/ggml-cuda/diffusion-sampling.cu
 create mode 100644 ggml/src/ggml-cuda/diffusion-sampling.cuh

diff --git a/examples/diffusion-gemma/diffusion-gemma-cli.cpp b/examples/diffusion-gemma/diffusion-gemma-cli.cpp
index d7510f8e1e23..6beea37005b8 100644
--- a/examples/diffusion-gemma/diffusion-gemma-cli.cpp
+++ b/examples/diffusion-gemma/diffusion-gemma-cli.cpp
@@ -40,6 +40,7 @@ static constexpr float TEMP_MIN               = 0.4f;   // LinearTemperatureSche
 static constexpr float TEMP_MAX               = 0.8f;   // LinearTemperatureScheduleConfig.t_max
 static constexpr float CONFIDENCE_THRESHOLD   = 0.005f; // StableAndConfident.confidence_threshold
 static constexpr int   STABILITY_THRESHOLD    = 1;      // StableAndConfident.stability_threshold
+static constexpr int   GPU_SAMPLING_MAX_TOP_K = 1024;   // small top-k sort kernel limit
 
 static int env_int(const char * name, int def) {
     const char * v = getenv(name);
@@ -199,8 +200,21 @@ int main(int argc, char ** argv) {
     llama_set_diffusion_prompt_len(ctx, prefix_len);
     llama_memory_t mem = llama_get_memory(ctx);
 
+    const int topk_max_requested =
+        (topk_start > 0 && topk_end > 0) ? std::max(topk_start, topk_end) : topk_fixed;
+    const bool gpu_sampling_requested = env_int("DG_GPU_SAMPLING", 1) != 0;
+    const bool gpu_sampling_topk_ok = topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K;
+    const bool use_gpu_sampling = gpu_sampling_requested &&
+                                  gpu_sampling_topk_ok &&
+                                  llama_diffusion_sample_topk_supported(ctx);
+    llama_set_diffusion_gpu_sampling(ctx, use_gpu_sampling);
+
     LOG_INF("diffusion-gemma: prefix=%d canvas=%d max_canvases=%d steps=%d entropy_bound=%.3f temp=[%.2f,%.2f] n_ctx=%d mm=%d\n",
             prefix_len, canvas_length, max_canvases, n_steps, entropy_bound, TEMP_MIN, TEMP_MAX, n_ctx, (int) use_mm);
+    LOG_INF("diffusion-gemma: gpu sampling: %s%s\n",
+            use_gpu_sampling ? "on" : "off",
+            (!gpu_sampling_topk_ok ? " (top-k exceeds CUDA fast-path limit)" :
+             (!gpu_sampling_requested ? " (disabled by DG_GPU_SAMPLING=0)" : "")));
     if (topk_fixed > 0 || (topk_start > 0 && topk_end > 0)) {
         LOG_INF("diffusion-gemma: top-k sampling: fixed=%d anneal=[%d->%d] tail_correction=%d (vocab=%d)\n",
                 topk_fixed, topk_start, topk_end, topk_tail, n_vocab);
@@ -278,8 +292,13 @@ int main(int argc, char ** argv) {
     int n_blocks_run  = 0;
     int n_steps_total = 0;
     const auto t_gen_start = std::chrono::steady_clock::now();
+    const bool log_step_timing = env_int("DG_TIMING", 0) != 0;
+    double decode_enqueue_s = 0.0;
+    double sample_sync_s    = 0.0;
+    double host_loop_s      = 0.0;
 
     bool done = false;
+    bool failed = false;
     for (int block = 0; block < max_canvases && !done; ++block) {
     ++n_blocks_run;
     // 1. initialize canvas with random tokens
@@ -303,12 +322,12 @@ int main(int argc, char ** argv) {
             batch.seq_id[j][0] = 0;
             batch.logits[j]    = 1;
         }
+        const auto t_decode_start = std::chrono::steady_clock::now();
         if (llama_decode(ctx, batch) != 0) {
             LOG_ERR("error: llama_decode failed at step %d\n", cur_step);
             break;
         }
-        // canvas logits occupy rows [0, canvas_length) (canvas-only ubatch)
-        const float * logits = llama_get_logits(ctx);
+        decode_enqueue_s += std::chrono::duration<double>(std::chrono::steady_clock::now() - t_decode_start).count();
 
         // 2b. linear temperature schedule: t = t_min + (t_max - t_min) * (cur_step / n_steps)
         const float temp = TEMP_MIN + (TEMP_MAX - TEMP_MIN) * ((float) cur_step / (float) n_steps);
@@ -334,7 +353,32 @@ int main(int argc, char ** argv) {
         std::fill(sc_probs.begin(), sc_probs.end(), 0.0f);
         std::fill(sc_ids.begin(),   sc_ids.end(),   0);
 
-        if (k_step == 0) {
+        bool sampled_ok = true;
+        const auto t_sample_start = std::chrono::steady_clock::now();
+        if (use_gpu_sampling) {
+            llama_diffusion_sample_params sample_params = {
+                /* .n_tokens              = */ canvas_length,
+                /* .top_k                 = */ k_step,
+                /* .self_cond_top_k       = */ SC_K,
+                /* .temperature           = */ temp,
+                /* .seed                  = */ params.sampling.seed == LLAMA_DEFAULT_SEED ? 1234u : params.sampling.seed,
+                /* .step                  = */ (uint32_t) n_steps_total,
+                /* .top_k_tail_correction = */ topk_tail != 0,
+            };
+            llama_diffusion_sample_result sample_result = {
+                /* .sampled         = */ sampled.data(),
+                /* .argmax          = */ argmax_canvas.data(),
+                /* .entropy         = */ entropy.data(),
+                /* .self_cond_ids   = */ sc_ids.data(),
+                /* .self_cond_probs = */ sc_probs.data(),
+            };
+            sampled_ok = llama_diffusion_sample_topk(ctx, &sample_params, &sample_result);
+            if (!sampled_ok) {
+                LOG_ERR("error: CUDA diffusion sampling failed at step %d (k=%d)\n", cur_step, k_step);
+            }
+        } else if (k_step == 0) {
+            // canvas logits occupy rows [0, canvas_length) (canvas-only ubatch)
+            const float * logits = llama_get_logits(ctx);
             // ---- full softmax over the whole vocabulary (reference behaviour) ----
             // self-cond still feeds only the top-SC_K tokens (full-normalized probs); the dropped
             // tail carries negligible embedding weight and the post RMS norm absorbs the scale.
@@ -385,6 +429,8 @@ int main(int argc, char ** argv) {
                 argmax_canvas[j] = amax;
             }
         } else {
+            // canvas logits occupy rows [0, canvas_length) (canvas-only ubatch)
+            const float * logits = llama_get_logits(ctx);
             // ---- top-k host sampling: softmax / entropy / sample / self-cond over the top-k
             // logits only. Self-cond feeds the top min(k,SC_K) tokens (renormalized over the
             // sampled top-k), gathered in-graph; the dropped tail carries negligible weight. ----
@@ -453,6 +499,15 @@ int main(int argc, char ** argv) {
                 argmax_canvas[j] = amax;
             }
         }
+        sample_sync_s += std::chrono::duration<double>(std::chrono::steady_clock::now() - t_sample_start).count();
+        if (!sampled_ok) {
+            llama_memory_seq_rm(mem, 0, n_past, -1);
+            failed = true;
+            done = true;
+            break;
+        }
+
+        const auto t_host_start = std::chrono::steady_clock::now();
 
         // roll back the canvas K/V written by this decode so the cache keeps only the committed
         // prefix [0, n_past); the next step re-decodes the canvas fresh against that prefix.
@@ -502,6 +557,10 @@ int main(int argc, char ** argv) {
         for (int i = 0; i < canvas_length; ++i) {
             canvas[i] = accept_mask[i] ? accepted[i] : rand_tok(rng);
         }
+        host_loop_s += std::chrono::duration<double>(std::chrono::steady_clock::now() - t_host_start).count();
+    }
+    if (failed) {
+        break;
     }
 
     // 3. block output = the inline argmax of the last (stable) denoising step's logits.
@@ -545,6 +604,13 @@ int main(int argc, char ** argv) {
 
     llama_batch_free(batch);
 
+    if (failed) {
+        llama_free(ctx);
+        llama_model_free(model);
+        llama_backend_free();
+        return 1;
+    }
+
     // the full denoised canvas (thought channel + response), for reference
     LOG_INF("\n=== generated canvas ===\n%s\n", common_detokenize(vocab, generated, false).c_str());
 
@@ -588,6 +654,10 @@ int main(int argc, char ** argv) {
             gen_s > 0.0 ? n_canvas_tok / gen_s : 0.0,
             n_steps_total > 0 ? gen_s / n_steps_total : 0.0,
             (int) answer.size());
+    if (log_step_timing) {
+        LOG_INF("timing: decode enqueue %.3f s, sample/sync %.3f s, host loop %.3f s\n",
+                decode_enqueue_s, sample_sync_s, host_loop_s);
+    }
 
     llama_free(ctx);
     llama_model_free(model);
diff --git a/examples/diffusion-gemma/diffusion-gemma-server.cpp b/examples/diffusion-gemma/diffusion-gemma-server.cpp
index e6fd0a536a1b..ebad596a5515 100644
--- a/examples/diffusion-gemma/diffusion-gemma-server.cpp
+++ b/examples/diffusion-gemma/diffusion-gemma-server.cpp
@@ -28,6 +28,7 @@
 #include <cmath>
 #include <cstdint>
 #include <cstdio>
+#include <cstdlib>
 #include <ctime>
 #include <mutex>
 #include <numeric>
@@ -54,6 +55,12 @@ static constexpr float TEMP_MAX             = 0.8f;
 static constexpr float CONFIDENCE_THRESHOLD = 0.005f;
 static constexpr int   STABILITY_THRESHOLD  = 1;
 static constexpr int   SC_K                 = 256; // must match llama_model_diffusion_gemma::N_SC_TOPK
+static constexpr int   GPU_SAMPLING_MAX_TOP_K = 1024; // CUDA diffusion sampler limit
+
+static int env_int(const char * name, int def) {
+    const char * v = getenv(name);
+    return v ? atoi(v) : def;
+}
 
 // ------------------------------------------------------------------------------------------------
 // per-request generation parameters and result
@@ -147,6 +154,11 @@ struct diffusion_server {
     int    n_ctx         = 0;
     int    n_ub          = 0;       // ubatch == canvas_length (keeps the decoder gather graph small)
     int    n_vocab       = 0;
+    int    topk_fixed    = 0;
+    int    topk_start    = 0;
+    int    topk_end      = 0;
+    bool   topk_tail     = false;
+    bool   use_gpu_sampling = false;
     std::string model_id;
     std::string model_path;
     std::string build_info = "llama.cpp diffusion-gemma-server";
@@ -237,8 +249,22 @@ struct diffusion_server {
 
             for (int cur_step = n_steps; cur_step >= 1; --cur_step) {
                 ++n_steps_total;
+                const float temp = TEMP_MIN + (TEMP_MAX - TEMP_MIN) * ((float) cur_step / (float) n_steps);
+
+                int k_step = 0;
+                if (rq.topk_start > 0 && rq.topk_end > 0) {
+                    const float frac = (n_steps > 1) ? (float) (cur_step - 1) / (float) (n_steps - 1) : 0.0f;
+                    k_step = (int) lroundf(rq.topk_end + (rq.topk_start - rq.topk_end) * frac);
+                } else if (rq.topk_fixed > 0) {
+                    k_step = rq.topk_fixed;
+                }
+                if (k_step <= 0 || k_step >= n_vocab) k_step = 0;
+                const bool use_gpu_sampling_step = use_gpu_sampling &&
+                    (k_step <= 0 || k_step <= GPU_SAMPLING_MAX_TOP_K);
+
                 llama_set_causal_attn(ctx, false);
                 llama_set_diffusion_decoder_phase(ctx, true);
+                llama_set_diffusion_gpu_sampling(ctx, use_gpu_sampling_step);
                 batch.n_tokens = canvas_length;
                 for (int j = 0; j < canvas_length; ++j) {
                     batch.token[j]     = canvas[j];
@@ -252,28 +278,39 @@ struct diffusion_server {
                     return out;
                 }
                 ++n_decode;
-                const float * logits = llama_get_logits(ctx);
-
-                const float temp = TEMP_MIN + (TEMP_MAX - TEMP_MIN) * ((float) cur_step / (float) n_steps);
 
                 std::vector<float> entropy(canvas_length);
                 std::vector<llama_token> sampled(canvas_length);
 
-                int k_step = 0;
-                if (rq.topk_start > 0 && rq.topk_end > 0) {
-                    const float frac = (n_steps > 1) ? (float) (cur_step - 1) / (float) (n_steps - 1) : 0.0f;
-                    k_step = (int) lroundf(rq.topk_end + (rq.topk_start - rq.topk_end) * frac);
-                } else if (rq.topk_fixed > 0) {
-                    k_step = rq.topk_fixed;
-                }
-                if (k_step <= 0 || k_step >= n_vocab) k_step = 0;
-
                 std::fill(sc_probs.begin(), sc_probs.end(), 0.0f);
                 std::fill(sc_ids.begin(),   sc_ids.end(),   0);
 
-                const auto cmp = [](const std::pair<float,int>&a, const std::pair<float,int>&b){ return a.first > b.first; };
-
-                if (k_step == 0) {
+                if (use_gpu_sampling_step) {
+                    llama_diffusion_sample_params sample_params = {
+                        /* .n_tokens              = */ canvas_length,
+                        /* .top_k                 = */ k_step,
+                        /* .self_cond_top_k       = */ SC_K,
+                        /* .temperature           = */ temp,
+                        /* .seed                  = */ rq.seed,
+                        /* .step                  = */ (uint32_t) n_steps_total,
+                        /* .top_k_tail_correction = */ rq.topk_tail,
+                    };
+                    llama_diffusion_sample_result sample_result = {
+                        /* .sampled         = */ sampled.data(),
+                        /* .argmax          = */ argmax_canvas.data(),
+                        /* .entropy         = */ entropy.data(),
+                        /* .self_cond_ids   = */ sc_ids.data(),
+                        /* .self_cond_probs = */ sc_probs.data(),
+                    };
+                    if (!llama_diffusion_sample_topk(ctx, &sample_params, &sample_result)) {
+                        llama_memory_seq_rm(mem, 0, n_past, -1);
+                        out.error = "CUDA diffusion sampling failed at denoising step " +
+                            std::to_string(cur_step) + " (k=" + std::to_string(k_step) + ")";
+                        return out;
+                    }
+                } else if (k_step == 0) {
+                    const float * logits = llama_get_logits(ctx);
+                    const auto cmp = [](const std::pair<float,int>&a, const std::pair<float,int>&b){ return a.first > b.first; };
                     std::vector<float> probs(n_vocab);
                     std::vector<std::pair<float,int>> scheap; scheap.reserve(SC_K);
                     for (int j = 0; j < canvas_length; ++j) {
@@ -310,6 +347,8 @@ struct diffusion_server {
                         entropy[j] = ent; sampled[j] = tok; argmax_canvas[j] = amax;
                     }
                 } else {
+                    const float * logits = llama_get_logits(ctx);
+                    const auto cmp = [](const std::pair<float,int>&a, const std::pair<float,int>&b){ return a.first > b.first; };
                     const int heap_k = std::max(k_step, SC_K);
                     std::vector<std::pair<float,int>> heap; heap.reserve(heap_k);
                     for (int j = 0; j < canvas_length; ++j) {
@@ -503,12 +542,25 @@ static diffusion_request request_from_body(const json & body, const diffusion_se
     rq.canvas_length = srv.canvas_length;
     rq.n_steps       = srv.n_steps;
     rq.seed          = default_seed;
+    rq.topk_fixed    = srv.topk_fixed;
+    rq.topk_start    = srv.topk_start;
+    rq.topk_end      = srv.topk_end;
+    rq.topk_tail     = srv.topk_tail;
     int max_tokens = 0;
     if (body.contains("max_tokens") && body["max_tokens"].is_number_integer())                       max_tokens = body["max_tokens"].get<int>();
     else if (body.contains("max_completion_tokens") && body["max_completion_tokens"].is_number_integer()) max_tokens = body["max_completion_tokens"].get<int>();
     rq.max_canvases = max_tokens > 0 ? (max_tokens + rq.canvas_length - 1) / rq.canvas_length : 1;
-    if (body.contains("top_k") && body["top_k"].is_number_integer()) rq.topk_fixed = body["top_k"].get<int>();
-    if (body.contains("seed")  && body["seed"].is_number_integer())  rq.seed = (uint32_t) body["seed"].get<int64_t>();
+    if (body.contains("top_k") && body["top_k"].is_number_integer()) {
+        rq.topk_fixed = body["top_k"].get<int>();
+        rq.topk_start = 0;
+        rq.topk_end   = 0;
+    }
+    if (body.contains("top_k_start") && body["top_k_start"].is_number_integer()) rq.topk_start = body["top_k_start"].get<int>();
+    if (body.contains("top_k_end")   && body["top_k_end"].is_number_integer())   rq.topk_end   = body["top_k_end"].get<int>();
+    if (body.contains("top_k_tail_correction") && body["top_k_tail_correction"].is_boolean()) {
+        rq.topk_tail = body["top_k_tail_correction"].get<bool>();
+    }
+    if (body.contains("seed")  && body["seed"].is_number_integer()) rq.seed = (uint32_t) body["seed"].get<int64_t>();
     if (body.contains("ignore_eos") && body["ignore_eos"].is_boolean()) rq.ignore_eos = body["ignore_eos"].get<bool>();
     return rq;
 }
@@ -574,6 +626,12 @@ int main(int argc, char ** argv) {
     srv.n_ub          = srv.canvas_length;
     srv.n_ctx         = params.n_ctx > 0 ? (int) params.n_ctx : 4096;
     if (srv.n_ctx < 2 * srv.canvas_length) srv.n_ctx = 2 * srv.canvas_length;
+    srv.topk_fixed    = (params.sampling.user_sampling_config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K)
+                            ? params.sampling.top_k
+                            : 0;
+    srv.topk_start    = params.diffusion.top_k_start;
+    srv.topk_end      = params.diffusion.top_k_end;
+    srv.topk_tail     = params.diffusion.top_k_tail_correction;
     srv.model_path    = params.model.path;
     srv.metrics.t_start = (int64_t) std::time(nullptr);
 
@@ -603,11 +661,24 @@ int main(int argc, char ** argv) {
     srv.batch = llama_batch_init(srv.n_ub, 0, 1);
     srv.templates = common_chat_templates_init(srv.model, "");
 
+    const int topk_max_requested =
+        (srv.topk_start > 0 && srv.topk_end > 0) ? std::max(srv.topk_start, srv.topk_end) : srv.topk_fixed;
+    const bool gpu_sampling_requested = env_int("DG_GPU_SAMPLING", 1) != 0;
+    srv.use_gpu_sampling = gpu_sampling_requested &&
+                           llama_diffusion_sample_topk_supported(srv.ctx);
+    llama_set_diffusion_gpu_sampling(srv.ctx, srv.use_gpu_sampling);
+    const bool default_topk_gpu_ok = topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K;
+
     const uint32_t default_seed = params.sampling.seed == LLAMA_DEFAULT_SEED ? 1234u : params.sampling.seed;
 
     SRV_INF("%s\n", llama_print_system_info());
     SRV_INF("model loaded: '%s' | n_ctx = %d | canvas = %d | denoise steps = %d | 1 slot\n",
             srv.model_id.c_str(), srv.n_ctx, srv.canvas_length, srv.n_steps);
+    SRV_INF("gpu sampling: %s%s | top-k fixed=%d anneal=[%d->%d] tail_correction=%d\n",
+            srv.use_gpu_sampling ? "on" : "off",
+            (!default_topk_gpu_ok ? " (default top-k will use CPU fallback until k <= CUDA limit)" :
+             (!gpu_sampling_requested ? " (disabled by DG_GPU_SAMPLING=0)" : "")),
+            srv.topk_fixed, srv.topk_start, srv.topk_end, srv.topk_tail ? 1 : 0);
 
     // ------------------------------------------------------------------------------------------
     // HTTP server
@@ -676,6 +747,11 @@ int main(int argc, char ** argv) {
                 {"confidence_threshold", CONFIDENCE_THRESHOLD},
                 {"stability_threshold", STABILITY_THRESHOLD},
                 {"self_cond_topk", SC_K},
+                {"gpu_sampling", srv.use_gpu_sampling},
+                {"top_k", srv.topk_fixed},
+                {"top_k_start", srv.topk_start},
+                {"top_k_end", srv.topk_end},
+                {"top_k_tail_correction", srv.topk_tail},
             }},
             {"total_slots", 1},
             {"model_path", srv.model_path},
@@ -777,6 +853,11 @@ int main(int argc, char ** argv) {
                     {"canvas_length", srv.canvas_length},
                     {"n_denoise_steps", srv.n_steps},
                     {"self_cond_topk", SC_K},
+                    {"gpu_sampling", srv.use_gpu_sampling},
+                    {"top_k", srv.topk_fixed},
+                    {"top_k_start", srv.topk_start},
+                    {"top_k_end", srv.topk_end},
+                    {"top_k_tail_correction", srv.topk_tail},
                 }},
             };
             res.set_content(json::array({slot}).dump(), "application/json");
diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h
index 5436c7ef579c..4f3691c2980f 100644
--- a/ggml/include/ggml-cuda.h
+++ b/ggml/include/ggml-cuda.h
@@ -43,6 +43,31 @@ GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * f
 GGML_BACKEND_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
 GGML_BACKEND_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
 
+struct ggml_cuda_diffusion_sample_params {
+    int32_t  n_vocab;
+    int32_t  n_tokens;
+    int32_t  top_k;
+    int32_t  self_cond_top_k;
+    float    temperature;
+    uint32_t seed;
+    uint32_t step;
+    bool     top_k_tail_correction;
+};
+
+struct ggml_cuda_diffusion_sample_result {
+    int32_t * sampled;
+    int32_t * argmax;
+    float   * entropy;
+    int32_t * self_cond_ids;
+    float   * self_cond_probs;
+};
+
+typedef bool (*ggml_backend_cuda_diffusion_sample_topk_t)(
+        ggml_backend_t backend,
+        const struct ggml_tensor * logits,
+        const struct ggml_cuda_diffusion_sample_params * params,
+        struct ggml_cuda_diffusion_sample_result * result);
+
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
 
 #ifdef  __cplusplus
diff --git a/ggml/src/ggml-cuda/diffusion-sampling.cu b/ggml/src/ggml-cuda/diffusion-sampling.cu
new file mode 100644
index 000000000000..042e3af0520d
--- /dev/null
+++ b/ggml/src/ggml-cuda/diffusion-sampling.cu
@@ -0,0 +1,493 @@
+#include "argsort.cuh"
+#include "diffusion-sampling.cuh"
+#include "../ggml-backend-impl.h"
+
+#include <algorithm>
+#include <cfloat>
+#include <cstdlib>
+#include <cstring>
+
+#ifdef GGML_CUDA_USE_CUB
+#    include <cub/cub.cuh>
+#    if (CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2)
+#        define CUB_DIFFUSION_TOP_K_AVAILABLE
+#        include <cuda/iterator>
+using namespace cub;
+#    endif
+#endif
+
+static int next_power_of_2_host(int x) {
+    int n = 1;
+    while (n < x) {
+        n <<= 1;
+    }
+    return n;
+}
+
+static bool diffusion_fast_topk_enabled() {
+    const char * v = std::getenv("GGML_CUDA_DIFFUSION_FAST_TOPK");
+    return !v || std::strcmp(v, "0") != 0;
+}
+
+static __device__ __forceinline__ bool diffusion_should_swap_desc(
+        const float a_val, const int a_id,
+        const float b_val, const int b_id) {
+    return a_val < b_val || (a_val == b_val && a_id > b_id);
+}
+
+template<int LOCAL_K>
+static __global__ void diffusion_select_topk_local_kernel(
+        const float * __restrict__ logits,
+        int * __restrict__ top_ids,
+        const int n_vocab,
+        const int heap_k) {
+    const int row = blockIdx.x;
+    const int tid = threadIdx.x;
+
+    float vals[LOCAL_K];
+    int ids[LOCAL_K];
+#pragma unroll
+    for (int i = 0; i < LOCAL_K; ++i) {
+        vals[i] = -FLT_MAX;
+        ids[i] = 0;
+    }
+
+    const float * row_logits = logits + (size_t) row * n_vocab;
+    for (int v = tid; v < n_vocab; v += blockDim.x) {
+        const float x = row_logits[v];
+        if (diffusion_should_swap_desc(vals[LOCAL_K - 1], ids[LOCAL_K - 1], x, v)) {
+            int pos = LOCAL_K - 1;
+#pragma unroll
+            for (int i = LOCAL_K - 1; i > 0; --i) {
+                if (pos == i && diffusion_should_swap_desc(vals[i - 1], ids[i - 1], x, v)) {
+                    vals[i] = vals[i - 1];
+                    ids[i] = ids[i - 1];
+                    --pos;
+                }
+            }
+            vals[pos] = x;
+            ids[pos] = v;
+        }
+    }
+
+    constexpr int candidate_count = LOCAL_K * 256;
+
+    extern __shared__ unsigned char smem[];
+    float * s_vals = (float *) smem;
+    int * s_ids = (int *) (s_vals + candidate_count);
+
+#pragma unroll
+    for (int i = 0; i < LOCAL_K; ++i) {
+        const int dst = tid * LOCAL_K + i;
+        s_vals[dst] = vals[i];
+        s_ids[dst] = ids[i];
+    }
+    __syncthreads();
+
+    for (int k = 2; k <= candidate_count; k <<= 1) {
+        for (int j = k >> 1; j > 0; j >>= 1) {
+            for (int i = tid; i < candidate_count; i += blockDim.x) {
+                const int ixj = i ^ j;
+                if (ixj > i) {
+                    const bool descending = (i & k) == 0;
+                    const bool swap = descending
+                        ? diffusion_should_swap_desc(s_vals[i], s_ids[i], s_vals[ixj], s_ids[ixj])
+                        : diffusion_should_swap_desc(s_vals[ixj], s_ids[ixj], s_vals[i], s_ids[i]);
+                    if (swap) {
+                        const float tv = s_vals[i];
+                        s_vals[i] = s_vals[ixj];
+                        s_vals[ixj] = tv;
+                        const int ti = s_ids[i];
+                        s_ids[i] = s_ids[ixj];
+                        s_ids[ixj] = ti;
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    for (int i = tid; i < heap_k; i += blockDim.x) {
+        top_ids[(size_t) row * heap_k + i] = s_ids[i];
+    }
+}
+
+static void diffusion_select_topk_local(
+        const float * logits,
+        int * top_ids,
+        const int n_vocab,
+        const int n_tokens,
+        const int heap_k,
+        cudaStream_t stream) {
+    constexpr int block_size = 256;
+
+    if (heap_k <= 256) {
+        constexpr int local_k = 8;
+        const size_t smem = (size_t) local_k * block_size * (sizeof(float) + sizeof(int));
+        diffusion_select_topk_local_kernel<local_k><<<n_tokens, block_size, smem, stream>>>(
+                logits, top_ids, n_vocab, heap_k);
+    } else {
+        constexpr int local_k = 16;
+        const size_t smem = (size_t) local_k * block_size * (sizeof(float) + sizeof(int));
+        diffusion_select_topk_local_kernel<local_k><<<n_tokens, block_size, smem, stream>>>(
+                logits, top_ids, n_vocab, heap_k);
+    }
+}
+
+#ifdef CUB_DIFFUSION_TOP_K_AVAILABLE
+static void diffusion_top_k_cub(
+        ggml_cuda_pool & pool,
+        const float * src,
+        int * dst,
+        const int ncols,
+        const int k,
+        cudaStream_t stream) {
+    auto requirements = cuda::execution::require(cuda::execution::determinism::not_guaranteed,
+                                                 cuda::execution::output_ordering::unsorted);
+    auto stream_env   = cuda::stream_ref{ stream };
+    auto env          = cuda::std::execution::env{ stream_env, requirements };
+    auto indexes_in   = cuda::make_counting_iterator(0);
+
+    size_t temp_storage_bytes = 0;
+    CUDA_CHECK(DeviceTopK::MaxPairs(nullptr, temp_storage_bytes, src, cuda::discard_iterator(), indexes_in, dst, ncols, k, env));
+
+    ggml_cuda_pool_alloc<uint8_t> temp_storage_alloc(pool, temp_storage_bytes);
+    void * d_temp_storage = temp_storage_alloc.get();
+
+    CUDA_CHECK(DeviceTopK::MaxPairs(d_temp_storage, temp_storage_bytes, src, cuda::discard_iterator(), indexes_in, dst, ncols, k, env));
+}
+#endif
+
+static __global__ void diffusion_sort_top_ids_kernel(
+        const float * __restrict__ logits,
+        int * __restrict__ top_ids,
+        const int n_vocab,
+        const int heap_k,
+        const int heap_k_pad,
+        const float inv_temp) {
+    const int row = blockIdx.x;
+    const int tid = threadIdx.x;
+
+    extern __shared__ unsigned char smem[];
+    int * ids = (int *) smem;
+    float * vals = (float *) (ids + heap_k_pad);
+
+    const int base = row * heap_k;
+    for (int i = tid; i < heap_k_pad; i += blockDim.x) {
+        if (i < heap_k) {
+            const int id = top_ids[base + i];
+            ids[i] = id;
+            vals[i] = logits[(size_t) row * n_vocab + id] * inv_temp;
+        } else {
+            ids[i] = 0;
+            vals[i] = -FLT_MAX;
+        }
+    }
+    __syncthreads();
+
+    for (int k = 2; k <= heap_k_pad; k <<= 1) {
+        for (int j = k >> 1; j > 0; j >>= 1) {
+            const int ixj = tid ^ j;
+            if (ixj > tid && ixj < heap_k_pad) {
+                const bool up = (tid & k) == 0;
+                const bool swap = up ? (vals[tid] < vals[ixj]) : (vals[tid] > vals[ixj]);
+                if (swap) {
+                    const float tv = vals[tid];
+                    vals[tid] = vals[ixj];
+                    vals[ixj] = tv;
+                    const int ti = ids[tid];
+                    ids[tid] = ids[ixj];
+                    ids[ixj] = ti;
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    for (int i = tid; i < heap_k; i += blockDim.x) {
+        top_ids[base + i] = ids[i];
+    }
+}
+
+static __device__ __forceinline__ uint32_t diffusion_rng_u32(uint32_t x) {
+    x ^= x >> 16;
+    x *= 0x7feb352du;
+    x ^= x >> 15;
+    x *= 0x846ca68bu;
+    x ^= x >> 16;
+    return x;
+}
+
+static __device__ __forceinline__ float diffusion_rng_uniform(uint32_t seed, uint32_t step, uint32_t row) {
+    const uint32_t x = diffusion_rng_u32(seed ^ (step * 0x9e3779b9u) ^ (row * 0x85ebca6bu));
+    return ((x >> 8) + 0.5f) * (1.0f / 16777216.0f);
+}
+
+static __global__ void diffusion_sample_kernel(
+        const float * __restrict__ logits,
+        const int * __restrict__ top_ids,
+        const int n_vocab,
+        const int n_tokens,
+        const int top_k,
+        const int heap_k,
+        const int sc_k,
+        const float inv_temp,
+        const uint32_t seed,
+        const uint32_t step,
+        const bool tail_correction,
+        int * __restrict__ sampled,
+        int * __restrict__ argmax,
+        float * __restrict__ entropy,
+        int * __restrict__ sc_ids,
+        float * __restrict__ sc_probs) {
+    const int row = blockIdx.x;
+    const int tid = threadIdx.x;
+    if (row >= n_tokens) {
+        return;
+    }
+
+    __shared__ float s_val[1024];
+    __shared__ float s_sum[1024];
+    __shared__ int   s_idx[1024];
+
+    const float * row_logits = logits + (size_t) row * n_vocab;
+    const int * row_top = top_ids + (size_t) row * heap_k;
+
+    float local_max = -FLT_MAX;
+    int local_idx = 0;
+
+    if (top_k == 0 || tail_correction) {
+        for (int v = tid; v < n_vocab; v += blockDim.x) {
+            const float x = row_logits[v] * inv_temp;
+            if (x > local_max) {
+                local_max = x;
+                local_idx = v;
+            }
+        }
+    } else {
+        for (int i = tid; i < top_k; i += blockDim.x) {
+            const int v = row_top[i];
+            const float x = row_logits[v] * inv_temp;
+            if (x > local_max) {
+                local_max = x;
+                local_idx = v;
+            }
+        }
+    }
+
+    s_val[tid] = local_max;
+    s_idx[tid] = local_idx;
+    __syncthreads();
+
+    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
+        if (tid < stride && s_val[tid + stride] > s_val[tid]) {
+            s_val[tid] = s_val[tid + stride];
+            s_idx[tid] = s_idx[tid + stride];
+        }
+        __syncthreads();
+    }
+
+    const float max_l = s_val[0];
+    const int amax = s_idx[0];
+
+    float local_sum = 0.0f;
+    float local_t = 0.0f;
+
+    if (top_k == 0 || tail_correction) {
+        for (int v = tid; v < n_vocab; v += blockDim.x) {
+            const float d = row_logits[v] * inv_temp - max_l;
+            const float e = expf(d);
+            local_sum += e;
+            local_t   += d * e;
+        }
+    } else {
+        for (int i = tid; i < top_k; i += blockDim.x) {
+            const int v = row_top[i];
+            const float d = row_logits[v] * inv_temp - max_l;
+            const float e = expf(d);
+            local_sum += e;
+            local_t   += d * e;
+        }
+    }
+
+    s_sum[tid] = local_sum;
+    s_val[tid] = local_t;
+    __syncthreads();
+
+    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
+        if (tid < stride) {
+            s_sum[tid] += s_sum[tid + stride];
+            s_val[tid] += s_val[tid + stride];
+        }
+        __syncthreads();
+    }
+
+    const float z = s_sum[0];
+    const float t = s_val[0];
+
+    if (tid == 0) {
+        argmax[row] = amax;
+        entropy[row] = logf(z) - t / z;
+
+        float sample_z = z;
+        if (top_k > 0 && tail_correction) {
+            sample_z = 0.0f;
+            for (int i = 0; i < top_k; ++i) {
+                const int v = row_top[i];
+                sample_z += expf(row_logits[v] * inv_temp - max_l);
+            }
+        }
+
+        const float r = diffusion_rng_uniform(seed, step, row) * sample_z;
+        float cum = 0.0f;
+        int tok = amax;
+
+        if (top_k == 0) {
+            for (int v = 0; v < n_vocab; ++v) {
+                cum += expf(row_logits[v] * inv_temp - max_l);
+                if (cum >= r) {
+                    tok = v;
+                    break;
+                }
+            }
+        } else {
+            for (int i = 0; i < top_k; ++i) {
+                const int v = row_top[i];
+                cum += expf(row_logits[v] * inv_temp - max_l);
+                if (cum >= r) {
+                    tok = v;
+                    break;
+                }
+            }
+        }
+        sampled[row] = tok;
+
+        const int n_sc = top_k == 0 ? sc_k : min(sc_k, top_k);
+        for (int i = 0; i < sc_k; ++i) {
+            const int out = row * sc_k + i;
+            if (i < n_sc) {
+                const int v = row_top[i];
+                sc_ids[out] = v;
+                sc_probs[out] = expf(row_logits[v] * inv_temp - max_l) / sample_z;
+            } else {
+                sc_ids[out] = 0;
+                sc_probs[out] = 0.0f;
+            }
+        }
+    }
+}
+
+bool ggml_cuda_diffusion_sample_topk(
+        ggml_backend_t backend,
+        const ggml_tensor * logits,
+        const ggml_cuda_diffusion_sample_params * params,
+        ggml_cuda_diffusion_sample_result * result) {
+    if (!backend || !logits || !params || !result) {
+        return false;
+    }
+    if (!ggml_backend_is_cuda(backend) || logits->type != GGML_TYPE_F32 || !ggml_is_contiguous(logits)) {
+        return false;
+    }
+    if (!result->sampled || !result->argmax || !result->entropy || !result->self_cond_ids || !result->self_cond_probs) {
+        return false;
+    }
+
+    const int n_vocab = params->n_vocab > 0 ? params->n_vocab : (int) logits->ne[0];
+    const int n_tokens = params->n_tokens > 0 ? params->n_tokens : (int) ggml_nrows(logits);
+    if (n_vocab <= 0 || n_tokens <= 0 || logits->ne[0] != n_vocab || ggml_nrows(logits) < n_tokens) {
+        return false;
+    }
+
+    int top_k = params->top_k;
+    if (top_k <= 0 || top_k >= n_vocab) {
+        top_k = 0;
+    }
+
+    const int sc_k = params->self_cond_top_k;
+    if (sc_k <= 0 || sc_k > 1024) {
+        return false;
+    }
+
+    const int heap_k = top_k == 0 ? sc_k : std::max(top_k, sc_k);
+    if (heap_k <= 0 || heap_k > 1024 || heap_k > n_vocab) {
+        return false;
+    }
+
+    const float temp = params->temperature > 0.0f ? params->temperature : 1.0f;
+    const float inv_temp = 1.0f / temp;
+
+    ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *) backend->context;
+    ggml_cuda_set_device(ctx->device);
+    ggml_cuda_pool & pool = ctx->pool();
+    cudaStream_t stream = ctx->stream();
+
+    const float * logits_d = (const float *) logits->data;
+
+    ggml_cuda_pool_alloc<int> top_ids_alloc(pool, (size_t) n_tokens * heap_k);
+    int * top_ids = top_ids_alloc.get();
+    bool top_ids_sorted = false;
+
+    const bool use_fast_topk = diffusion_fast_topk_enabled() && heap_k <= 1024 && n_vocab >= heap_k;
+    if (use_fast_topk) {
+        diffusion_select_topk_local(logits_d, top_ids, n_vocab, n_tokens, heap_k, stream);
+        top_ids_sorted = true;
+    } else {
+#ifdef CUB_DIFFUSION_TOP_K_AVAILABLE
+        for (int row = 0; row < n_tokens; ++row) {
+            diffusion_top_k_cub(pool, logits_d + (size_t) row * n_vocab, top_ids + (size_t) row * heap_k, n_vocab, heap_k, stream);
+        }
+#elif defined(GGML_CUDA_USE_CUB)
+        ggml_cuda_pool_alloc<int> sorted_ids_alloc(pool, (size_t) n_tokens * n_vocab);
+        int * sorted_ids = sorted_ids_alloc.get();
+        argsort_f32_i32_cuda_cub(pool, logits_d, sorted_ids, n_vocab, n_tokens, GGML_SORT_ORDER_DESC, stream);
+        CUDA_CHECK(cudaMemcpy2DAsync(top_ids, heap_k * sizeof(int), sorted_ids, n_vocab * sizeof(int),
+                                     heap_k * sizeof(int), n_tokens, cudaMemcpyDeviceToDevice, stream));
+        top_ids_sorted = true;
+#else
+        if (n_vocab > 1024) {
+            return false;
+        }
+        ggml_cuda_pool_alloc<int> sorted_ids_alloc(pool, (size_t) n_tokens * n_vocab);
+        int * sorted_ids = sorted_ids_alloc.get();
+        argsort_f32_i32_cuda_bitonic(logits_d, sorted_ids, n_vocab, n_tokens, GGML_SORT_ORDER_DESC, stream);
+        CUDA_CHECK(cudaMemcpy2DAsync(top_ids, heap_k * sizeof(int), sorted_ids, n_vocab * sizeof(int),
+                                     heap_k * sizeof(int), n_tokens, cudaMemcpyDeviceToDevice, stream));
+        top_ids_sorted = true;
+#endif
+    }
+
+    if (!top_ids_sorted) {
+        const int heap_k_pad = next_power_of_2_host(heap_k);
+        const int sort_threads = std::max(32, heap_k_pad);
+        const size_t sort_smem = (size_t) heap_k_pad * (sizeof(int) + sizeof(float));
+        diffusion_sort_top_ids_kernel<<<n_tokens, sort_threads, sort_smem, stream>>>(
+                logits_d, top_ids, n_vocab, heap_k, heap_k_pad, inv_temp);
+    }
+
+    ggml_cuda_pool_alloc<int>   sampled_alloc(pool, n_tokens);
+    ggml_cuda_pool_alloc<int>   argmax_alloc(pool, n_tokens);
+    ggml_cuda_pool_alloc<float> entropy_alloc(pool, n_tokens);
+    ggml_cuda_pool_alloc<int>   sc_ids_alloc(pool, (size_t) n_tokens * sc_k);
+    ggml_cuda_pool_alloc<float> sc_probs_alloc(pool, (size_t) n_tokens * sc_k);
+
+    constexpr int block_size = 256;
+    diffusion_sample_kernel<<<n_tokens, block_size, 0, stream>>>(
+            logits_d, top_ids, n_vocab, n_tokens, top_k, heap_k, sc_k, inv_temp,
+            params->seed, params->step, params->top_k_tail_correction,
+            sampled_alloc.get(), argmax_alloc.get(), entropy_alloc.get(),
+            sc_ids_alloc.get(), sc_probs_alloc.get());
+
+    CUDA_CHECK(cudaMemcpyAsync(result->sampled, sampled_alloc.get(), (size_t) n_tokens * sizeof(int),
+                               cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaMemcpyAsync(result->argmax, argmax_alloc.get(), (size_t) n_tokens * sizeof(int),
+                               cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaMemcpyAsync(result->entropy, entropy_alloc.get(), (size_t) n_tokens * sizeof(float),
+                               cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaMemcpyAsync(result->self_cond_ids, sc_ids_alloc.get(), (size_t) n_tokens * sc_k * sizeof(int),
+                               cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs, sc_probs_alloc.get(), (size_t) n_tokens * sc_k * sizeof(float),
+                               cudaMemcpyDeviceToHost, stream));
+
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    return true;
+}
diff --git a/ggml/src/ggml-cuda/diffusion-sampling.cuh b/ggml/src/ggml-cuda/diffusion-sampling.cuh
new file mode 100644
index 000000000000..dfce406d9303
--- /dev/null
+++ b/ggml/src/ggml-cuda/diffusion-sampling.cuh
@@ -0,0 +1,9 @@
+#pragma once
+
+#include "common.cuh"
+
+bool ggml_cuda_diffusion_sample_topk(
+        ggml_backend_t backend,
+        const ggml_tensor * logits,
+        const ggml_cuda_diffusion_sample_params * params,
+        ggml_cuda_diffusion_sample_result * result);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index e779a9be9e95..17e2c10f2b9c 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -23,6 +23,7 @@
 #include "ggml-cuda/cumsum.cuh"
 #include "ggml-cuda/diagmask.cuh"
 #include "ggml-cuda/diag.cuh"
+#include "ggml-cuda/diffusion-sampling.cuh"
 #include "ggml-cuda/fattn.cuh"
 #include "ggml-cuda/fwht.cuh"
 #include "ggml-cuda/getrows.cuh"
@@ -3296,6 +3297,10 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
 static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
 
     bool use_cuda_graph = true;
+    static const bool log_graph_incompat = [] {
+        const char * env = getenv("GGML_CUDA_GRAPH_DIAG");
+        return env != nullptr && atoi(env) != 0;
+    }();
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -3307,6 +3312,10 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
 
         if (node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_cuda_split(node->src[0]->buffer->buft)) {
             use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
+            if (log_graph_incompat) {
+                GGML_LOG_INFO("%s: disabling CUDA graphs due to split buffer at node %d %s (%s)\n",
+                        __func__, i, node->name, ggml_op_name(node->op));
+            }
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
 #endif
@@ -3315,12 +3324,23 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
         // [TAG_MUL_MAT_ID_CUDA_GRAPHS]
         if (node->op == GGML_OP_MUL_MAT_ID) {
             const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
-            const int mmvq_mmid_max = get_mmvq_mmid_max_batch(node->src[0]->type, cc);
-            if (!ggml_is_quantized(node->src[0]->type) || node->ne[2] > mmvq_mmid_max) {
+            const ggml_tensor * src0 = node->src[0];
+            const ggml_tensor * src1 = node->src[1];
+            const int mmvq_mmid_max = get_mmvq_mmid_max_batch(src0->type, cc);
+            const bool use_mmvq = ggml_is_quantized(src0->type) && node->ne[2] <= mmvq_mmid_max;
+            const bool use_mmq  = ggml_is_quantized(src0->type) && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[2], /*n_experts=*/src0->ne[2]);
+            const bool use_mmf  = ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true);
+            if (!use_mmvq && !use_mmq && !use_mmf) {
                 // under these conditions, the mul_mat_id operation will need to synchronize the stream, so we cannot use CUDA graphs
                 // TODO: figure out a way to enable for larger batch sizes, without hurting performance
                 // ref: https://github.com/ggml-org/llama.cpp/pull/18958
                 use_cuda_graph = false;
+                if (log_graph_incompat) {
+                    GGML_LOG_INFO("%s: disabling CUDA graphs due to MUL_MAT_ID at node %d %s: src0=%s ne2=%lld src1_ne2=%lld n_experts=%lld mmvq_mmid_max=%d quantized=%d mmq=%d mmf=%d\n",
+                            __func__, i, node->name, ggml_type_name(src0->type),
+                            (long long) node->ne[2], (long long) src1->ne[2], (long long) src0->ne[2],
+                            mmvq_mmid_max, (int) ggml_is_quantized(src0->type), (int) use_mmq, (int) use_mmf);
+                }
 #ifndef NDEBUG
                 GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
 #endif
@@ -5664,6 +5684,9 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con
     if (strcmp(name, "ggml_backend_get_features") == 0) {
         return (void *)ggml_backend_cuda_get_features;
     }
+    if (strcmp(name, "ggml_backend_cuda_diffusion_sample_topk") == 0) {
+        return (void *)ggml_cuda_diffusion_sample_topk;
+    }
     return nullptr;
 }
 
diff --git a/include/llama.h b/include/llama.h
index cbb20b200ba5..eb2d9cb338fb 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1015,6 +1015,35 @@ extern "C" {
                          int64_t   k,
                          int64_t   n_tokens);
 
+    struct llama_diffusion_sample_params {
+        int32_t  n_tokens;
+        int32_t  top_k;
+        int32_t  self_cond_top_k;
+        float    temperature;
+        uint32_t seed;
+        uint32_t step;
+        bool     top_k_tail_correction;
+    };
+
+    struct llama_diffusion_sample_result {
+        llama_token * sampled;
+        llama_token * argmax;
+        float       * entropy;
+        int32_t     * self_cond_ids;
+        float       * self_cond_probs;
+    };
+
+    // CUDA-only fast path for block-diffusion sampling. When enabled, llama_decode()
+    // keeps the dense diffusion logits on the backend and does not copy them to the
+    // host output buffer. Call llama_diffusion_sample_topk() after llama_decode() to
+    // sample the most recent logits tensor and retrieve compact row results.
+    LLAMA_API bool llama_diffusion_sample_topk_supported(struct llama_context * ctx);
+    LLAMA_API void llama_set_diffusion_gpu_sampling(struct llama_context * ctx, bool enabled);
+    LLAMA_API bool llama_diffusion_sample_topk(
+            struct llama_context * ctx,
+            const struct llama_diffusion_sample_params * params,
+            struct llama_diffusion_sample_result * result);
+
     // Set whether the model is in warmup mode or not
     // If true, all model tensors are activated during llama_decode() to load and cache their weights.
     //
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index d2a624392ada..4ec0292704e6 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1,6 +1,7 @@
 #include "llama-context.h"
 
 #include "ggml.h"
+#include "ggml-cuda.h"
 #include "llama-arch.h"
 #include "llama-graph.h"
 #include "llama-impl.h"
@@ -1160,6 +1161,80 @@ void llama_context::set_diffusion_self_cond_topk(const int32_t * ids, const floa
     diffusion_cond.sc_topk_probs.assign(probs, probs + n);
 }
 
+static ggml_backend_cuda_diffusion_sample_topk_t get_cuda_diffusion_sample_topk_proc(ggml_backend_t backend) {
+    if (!backend) {
+        return nullptr;
+    }
+
+    ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+    if (!dev) {
+        return nullptr;
+    }
+
+    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+    if (!reg) {
+        return nullptr;
+    }
+
+    return (ggml_backend_cuda_diffusion_sample_topk_t)
+        ggml_backend_reg_get_proc_address(reg, "ggml_backend_cuda_diffusion_sample_topk");
+}
+
+bool llama_context::diffusion_sample_topk_supported() const {
+    for (ggml_backend_t backend : backend_ptrs) {
+        if (get_cuda_diffusion_sample_topk_proc(backend)) {
+            return true;
+        }
+    }
+    return false;
+}
+
+void llama_context::set_diffusion_gpu_sampling(bool enabled) {
+    diffusion_gpu_sampling = enabled;
+}
+
+bool llama_context::diffusion_sample_topk(
+        const llama_diffusion_sample_params * params,
+              llama_diffusion_sample_result * result) {
+    if (!params || !result || !gf_res_prev) {
+        return false;
+    }
+
+    static_assert(sizeof(llama_token) == sizeof(int32_t), "llama_token must be int32_t for CUDA diffusion sampling");
+
+    ggml_tensor * t_logits = gf_res_prev->get_logits();
+    if (!t_logits) {
+        return false;
+    }
+
+    ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
+    auto sample_proc = get_cuda_diffusion_sample_topk_proc(backend);
+    if (!sample_proc) {
+        return false;
+    }
+
+    ggml_cuda_diffusion_sample_params cuda_params = {
+        /* .n_vocab               = */ (int32_t) model.vocab.n_tokens(),
+        /* .n_tokens              = */ params->n_tokens,
+        /* .top_k                 = */ params->top_k,
+        /* .self_cond_top_k       = */ params->self_cond_top_k,
+        /* .temperature           = */ params->temperature,
+        /* .seed                  = */ params->seed,
+        /* .step                  = */ params->step,
+        /* .top_k_tail_correction = */ params->top_k_tail_correction,
+    };
+
+    ggml_cuda_diffusion_sample_result cuda_result = {
+        /* .sampled         = */ (int32_t *) result->sampled,
+        /* .argmax          = */ (int32_t *) result->argmax,
+        /* .entropy         = */ result->entropy,
+        /* .self_cond_ids   = */ result->self_cond_ids,
+        /* .self_cond_probs = */ result->self_cond_probs,
+    };
+
+    return sample_proc(backend, t_logits, &cuda_params, &cuda_result);
+}
+
 void llama_context::set_causal_attn(bool value) {
     LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
 
@@ -1659,7 +1734,11 @@ static void copy_tensor_async_candidates(
     }
 }
 
-static bool needs_raw_logits(const llama_ubatch & ubatch, const std::map<llama_seq_id, llama_sampler *> & samplers) {
+static bool needs_raw_logits(const llama_ubatch & ubatch, const std::map<llama_seq_id, llama_sampler *> & samplers, bool skip_raw_logits) {
+    if (skip_raw_logits) {
+        return false;
+    }
+
     for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
         if (!ubatch.output[i]) {
             continue;
@@ -1886,7 +1965,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
         }
 
         // extract logits
-        if (logits.data && t_logits && n_outputs > 0 && needs_raw_logits(ubatch, sampling.samplers)) {
+        if (logits.data && t_logits && n_outputs > 0 && needs_raw_logits(ubatch, sampling.samplers, diffusion_gpu_sampling)) {
             ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
             GGML_ASSERT(backend_res != nullptr);
             GGML_ASSERT(logits.data != nullptr);
@@ -3606,6 +3685,21 @@ void llama_set_diffusion_self_cond_topk(llama_context * ctx, const int32_t * ids
     ctx->set_diffusion_self_cond_topk(ids, probs, k, n_tokens);
 }
 
+bool llama_diffusion_sample_topk_supported(llama_context * ctx) {
+    return ctx->diffusion_sample_topk_supported();
+}
+
+void llama_set_diffusion_gpu_sampling(llama_context * ctx, bool enabled) {
+    ctx->set_diffusion_gpu_sampling(enabled);
+}
+
+bool llama_diffusion_sample_topk(
+        llama_context * ctx,
+        const llama_diffusion_sample_params * params,
+        llama_diffusion_sample_result * result) {
+    return ctx->diffusion_sample_topk(params, result);
+}
+
 void llama_set_warmup(llama_context * ctx, bool warmup) {
     ctx->set_warmup(warmup);
 }
diff --git a/src/llama-context.h b/src/llama-context.h
index 3ee5cf124b28..48928162acbd 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -128,6 +128,12 @@ struct llama_context {
     // diffusion: sparse (top-k) self-conditioning (top-k token ids + probs per position)
     void set_diffusion_self_cond_topk(const int32_t * ids, const float * probs, int64_t k, int64_t n_tokens);
 
+    bool diffusion_sample_topk_supported() const;
+    void set_diffusion_gpu_sampling(bool enabled);
+    bool diffusion_sample_topk(
+            const llama_diffusion_sample_params * params,
+                  llama_diffusion_sample_result * result);
+
     void set_adapters_lora(llama_adapter_lora ** adapters, size_t n_adapters, float * scales);
 
     bool adapters_lora_are_same(llama_adapter_lora ** adapters, size_t n_adapters, float * scales);
@@ -288,6 +294,7 @@ struct llama_context {
     llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably
 
     llama_diffusion_cond diffusion_cond; // diffusion self-conditioning (set per-decode by the sampler)
+    bool diffusion_gpu_sampling = false; // skip dense logits D2H; sampled via CUDA backend proc
 
     llama_memory_ptr memory;
 

From 0c69257e69dd0437ca7242da5eba898b91520fd7 Mon Sep 17 00:00:00 2001
From: Loveneet Nigam <lnigam@nvidia.com>
Date: Tue, 9 Jun 2026 14:12:45 +0530
Subject: [PATCH 03/13] diffusion-gemma: keep self-cond sampling on device

---
 .../diffusion-gemma/diffusion-gemma-cli.cpp   | 25 ++++++----
 .../diffusion-gemma-server.cpp                | 23 ++++++---
 ggml/include/ggml-cuda.h                      |  2 +
 ggml/src/ggml-cuda/diffusion-sampling.cu      | 48 +++++++++++++++++--
 include/llama.h                               |  2 +
 src/llama-context.cpp                         | 36 +++++++++++++-
 src/llama-graph.cpp                           | 37 ++++++++++++++
 src/llama-graph.h                             |  8 ++++
 8 files changed, 160 insertions(+), 21 deletions(-)

diff --git a/examples/diffusion-gemma/diffusion-gemma-cli.cpp b/examples/diffusion-gemma/diffusion-gemma-cli.cpp
index 6beea37005b8..84fd64be6895 100644
--- a/examples/diffusion-gemma/diffusion-gemma-cli.cpp
+++ b/examples/diffusion-gemma/diffusion-gemma-cli.cpp
@@ -207,14 +207,16 @@ int main(int argc, char ** argv) {
     const bool use_gpu_sampling = gpu_sampling_requested &&
                                   gpu_sampling_topk_ok &&
                                   llama_diffusion_sample_topk_supported(ctx);
+    const bool use_device_self_cond = use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0;
     llama_set_diffusion_gpu_sampling(ctx, use_gpu_sampling);
 
     LOG_INF("diffusion-gemma: prefix=%d canvas=%d max_canvases=%d steps=%d entropy_bound=%.3f temp=[%.2f,%.2f] n_ctx=%d mm=%d\n",
             prefix_len, canvas_length, max_canvases, n_steps, entropy_bound, TEMP_MIN, TEMP_MAX, n_ctx, (int) use_mm);
-    LOG_INF("diffusion-gemma: gpu sampling: %s%s\n",
+    LOG_INF("diffusion-gemma: gpu sampling: %s%s%s\n",
             use_gpu_sampling ? "on" : "off",
             (!gpu_sampling_topk_ok ? " (top-k exceeds CUDA fast-path limit)" :
-             (!gpu_sampling_requested ? " (disabled by DG_GPU_SAMPLING=0)" : "")));
+             (!gpu_sampling_requested ? " (disabled by DG_GPU_SAMPLING=0)" : "")),
+            use_device_self_cond ? " | device self-cond: on" : "");
     if (topk_fixed > 0 || (topk_start > 0 && topk_end > 0)) {
         LOG_INF("diffusion-gemma: top-k sampling: fixed=%d anneal=[%d->%d] tail_correction=%d (vocab=%d)\n",
                 topk_fixed, topk_start, topk_end, topk_tail, n_vocab);
@@ -350,8 +352,10 @@ int main(int argc, char ** argv) {
 
         // sparse self-cond for the NEXT step: top-SC_K (id, prob) per position. Cleared each step;
         // unused slots stay (id 0, prob 0) so the graph gather contributes nothing for them.
-        std::fill(sc_probs.begin(), sc_probs.end(), 0.0f);
-        std::fill(sc_ids.begin(),   sc_ids.end(),   0);
+        if (!use_device_self_cond) {
+            std::fill(sc_probs.begin(), sc_probs.end(), 0.0f);
+            std::fill(sc_ids.begin(),   sc_ids.end(),   0);
+        }
 
         bool sampled_ok = true;
         const auto t_sample_start = std::chrono::steady_clock::now();
@@ -369,8 +373,8 @@ int main(int argc, char ** argv) {
                 /* .sampled         = */ sampled.data(),
                 /* .argmax          = */ argmax_canvas.data(),
                 /* .entropy         = */ entropy.data(),
-                /* .self_cond_ids   = */ sc_ids.data(),
-                /* .self_cond_probs = */ sc_probs.data(),
+                /* .self_cond_ids   = */ use_device_self_cond ? nullptr : sc_ids.data(),
+                /* .self_cond_probs = */ use_device_self_cond ? nullptr : sc_probs.data(),
             };
             sampled_ok = llama_diffusion_sample_topk(ctx, &sample_params, &sample_result);
             if (!sampled_ok) {
@@ -513,9 +517,6 @@ int main(int argc, char ** argv) {
         // prefix [0, n_past); the next step re-decodes the canvas fresh against that prefix.
         llama_memory_seq_rm(mem, 0, n_past, -1);
 
-        // self-conditioning for the NEXT denoising step: feed this step's top-SC_K (id, prob)
-        llama_set_diffusion_self_cond_topk(ctx, sc_ids.data(), sc_probs.data(), SC_K, canvas_length);
-
         // 2c. entropy-bound accept: sort positions by entropy ascending, accept the prefix
         // where sum(entropy of all-but-last) <= entropy_bound (monotonic -> prefix selection)
         std::vector<int> order(canvas_length);
@@ -553,6 +554,12 @@ int main(int argc, char ** argv) {
         }
         prev_argmax = argmax_canvas;
 
+        // self-conditioning for the NEXT denoising step. With device self-cond this was already
+        // copied D2D into the reused graph input tensors by llama_diffusion_sample_topk().
+        if (!use_device_self_cond) {
+            llama_set_diffusion_self_cond_topk(ctx, sc_ids.data(), sc_probs.data(), SC_K, canvas_length);
+        }
+
         // 2e. renoise non-accepted positions with fresh random tokens -> next canvas
         for (int i = 0; i < canvas_length; ++i) {
             canvas[i] = accept_mask[i] ? accepted[i] : rand_tok(rng);
diff --git a/examples/diffusion-gemma/diffusion-gemma-server.cpp b/examples/diffusion-gemma/diffusion-gemma-server.cpp
index ebad596a5515..bc6ea6e7caf1 100644
--- a/examples/diffusion-gemma/diffusion-gemma-server.cpp
+++ b/examples/diffusion-gemma/diffusion-gemma-server.cpp
@@ -159,6 +159,7 @@ struct diffusion_server {
     int    topk_end      = 0;
     bool   topk_tail     = false;
     bool   use_gpu_sampling = false;
+    bool   use_device_self_cond = false;
     std::string model_id;
     std::string model_path;
     std::string build_info = "llama.cpp diffusion-gemma-server";
@@ -261,6 +262,7 @@ struct diffusion_server {
                 if (k_step <= 0 || k_step >= n_vocab) k_step = 0;
                 const bool use_gpu_sampling_step = use_gpu_sampling &&
                     (k_step <= 0 || k_step <= GPU_SAMPLING_MAX_TOP_K);
+                const bool use_device_self_cond_step = use_gpu_sampling_step && use_device_self_cond;
 
                 llama_set_causal_attn(ctx, false);
                 llama_set_diffusion_decoder_phase(ctx, true);
@@ -282,8 +284,10 @@ struct diffusion_server {
                 std::vector<float> entropy(canvas_length);
                 std::vector<llama_token> sampled(canvas_length);
 
-                std::fill(sc_probs.begin(), sc_probs.end(), 0.0f);
-                std::fill(sc_ids.begin(),   sc_ids.end(),   0);
+                if (!use_device_self_cond_step) {
+                    std::fill(sc_probs.begin(), sc_probs.end(), 0.0f);
+                    std::fill(sc_ids.begin(),   sc_ids.end(),   0);
+                }
 
                 if (use_gpu_sampling_step) {
                     llama_diffusion_sample_params sample_params = {
@@ -299,8 +303,8 @@ struct diffusion_server {
                         /* .sampled         = */ sampled.data(),
                         /* .argmax          = */ argmax_canvas.data(),
                         /* .entropy         = */ entropy.data(),
-                        /* .self_cond_ids   = */ sc_ids.data(),
-                        /* .self_cond_probs = */ sc_probs.data(),
+                        /* .self_cond_ids   = */ use_device_self_cond_step ? nullptr : sc_ids.data(),
+                        /* .self_cond_probs = */ use_device_self_cond_step ? nullptr : sc_probs.data(),
                     };
                     if (!llama_diffusion_sample_topk(ctx, &sample_params, &sample_result)) {
                         llama_memory_seq_rm(mem, 0, n_past, -1);
@@ -394,7 +398,6 @@ struct diffusion_server {
                 }
 
                 llama_memory_seq_rm(mem, 0, n_past, -1);
-                llama_set_diffusion_self_cond_topk(ctx, sc_ids.data(), sc_probs.data(), SC_K, canvas_length);
 
                 std::vector<int> order(canvas_length);
                 std::iota(order.begin(), order.end(), 0);
@@ -413,6 +416,10 @@ struct diffusion_server {
                 if (stable && confident) break;
                 prev_argmax = argmax_canvas;
 
+                if (!use_device_self_cond_step) {
+                    llama_set_diffusion_self_cond_topk(ctx, sc_ids.data(), sc_probs.data(), SC_K, canvas_length);
+                }
+
                 for (int i = 0; i < canvas_length; ++i) canvas[i] = accept_mask[i] ? accepted[i] : rand_tok(rng);
             }
 
@@ -666,6 +673,7 @@ int main(int argc, char ** argv) {
     const bool gpu_sampling_requested = env_int("DG_GPU_SAMPLING", 1) != 0;
     srv.use_gpu_sampling = gpu_sampling_requested &&
                            llama_diffusion_sample_topk_supported(srv.ctx);
+    srv.use_device_self_cond = srv.use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0;
     llama_set_diffusion_gpu_sampling(srv.ctx, srv.use_gpu_sampling);
     const bool default_topk_gpu_ok = topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K;
 
@@ -674,10 +682,11 @@ int main(int argc, char ** argv) {
     SRV_INF("%s\n", llama_print_system_info());
     SRV_INF("model loaded: '%s' | n_ctx = %d | canvas = %d | denoise steps = %d | 1 slot\n",
             srv.model_id.c_str(), srv.n_ctx, srv.canvas_length, srv.n_steps);
-    SRV_INF("gpu sampling: %s%s | top-k fixed=%d anneal=[%d->%d] tail_correction=%d\n",
+    SRV_INF("gpu sampling: %s%s%s | top-k fixed=%d anneal=[%d->%d] tail_correction=%d\n",
             srv.use_gpu_sampling ? "on" : "off",
             (!default_topk_gpu_ok ? " (default top-k will use CPU fallback until k <= CUDA limit)" :
              (!gpu_sampling_requested ? " (disabled by DG_GPU_SAMPLING=0)" : "")),
+            srv.use_device_self_cond ? " | device self-cond: on" : "",
             srv.topk_fixed, srv.topk_start, srv.topk_end, srv.topk_tail ? 1 : 0);
 
     // ------------------------------------------------------------------------------------------
@@ -748,6 +757,7 @@ int main(int argc, char ** argv) {
                 {"stability_threshold", STABILITY_THRESHOLD},
                 {"self_cond_topk", SC_K},
                 {"gpu_sampling", srv.use_gpu_sampling},
+                {"device_self_cond", srv.use_device_self_cond},
                 {"top_k", srv.topk_fixed},
                 {"top_k_start", srv.topk_start},
                 {"top_k_end", srv.topk_end},
@@ -854,6 +864,7 @@ int main(int argc, char ** argv) {
                     {"n_denoise_steps", srv.n_steps},
                     {"self_cond_topk", SC_K},
                     {"gpu_sampling", srv.use_gpu_sampling},
+                    {"device_self_cond", srv.use_device_self_cond},
                     {"top_k", srv.topk_fixed},
                     {"top_k_start", srv.topk_start},
                     {"top_k_end", srv.topk_end},
diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h
index 4f3691c2980f..f16842c3f835 100644
--- a/ggml/include/ggml-cuda.h
+++ b/ggml/include/ggml-cuda.h
@@ -60,6 +60,8 @@ struct ggml_cuda_diffusion_sample_result {
     float   * entropy;
     int32_t * self_cond_ids;
     float   * self_cond_probs;
+    struct ggml_tensor * self_cond_ids_tensor;
+    struct ggml_tensor * self_cond_probs_tensor;
 };
 
 typedef bool (*ggml_backend_cuda_diffusion_sample_topk_t)(
diff --git a/ggml/src/ggml-cuda/diffusion-sampling.cu b/ggml/src/ggml-cuda/diffusion-sampling.cu
index 042e3af0520d..34c8efdac7a9 100644
--- a/ggml/src/ggml-cuda/diffusion-sampling.cu
+++ b/ggml/src/ggml-cuda/diffusion-sampling.cu
@@ -388,7 +388,7 @@ bool ggml_cuda_diffusion_sample_topk(
     if (!ggml_backend_is_cuda(backend) || logits->type != GGML_TYPE_F32 || !ggml_is_contiguous(logits)) {
         return false;
     }
-    if (!result->sampled || !result->argmax || !result->entropy || !result->self_cond_ids || !result->self_cond_probs) {
+    if (!result->sampled || !result->argmax || !result->entropy) {
         return false;
     }
 
@@ -408,6 +408,36 @@ bool ggml_cuda_diffusion_sample_topk(
         return false;
     }
 
+    const bool have_self_cond_host = result->self_cond_ids != nullptr && result->self_cond_probs != nullptr;
+    const bool have_self_cond_tensor = result->self_cond_ids_tensor != nullptr && result->self_cond_probs_tensor != nullptr;
+    if (!have_self_cond_host && !have_self_cond_tensor) {
+        return false;
+    }
+    if ((result->self_cond_ids == nullptr) != (result->self_cond_probs == nullptr)) {
+        return false;
+    }
+    if ((result->self_cond_ids_tensor == nullptr) != (result->self_cond_probs_tensor == nullptr)) {
+        return false;
+    }
+    if (have_self_cond_tensor) {
+        if (result->self_cond_ids_tensor->type != GGML_TYPE_I32 ||
+            result->self_cond_probs_tensor->type != GGML_TYPE_F32 ||
+            !ggml_is_contiguous(result->self_cond_ids_tensor) ||
+            !ggml_is_contiguous(result->self_cond_probs_tensor) ||
+            result->self_cond_ids_tensor->data == nullptr ||
+            result->self_cond_probs_tensor->data == nullptr ||
+            result->self_cond_ids_tensor->buffer == nullptr ||
+            result->self_cond_probs_tensor->buffer == nullptr ||
+            ggml_backend_buffer_is_host(result->self_cond_ids_tensor->buffer) ||
+            ggml_backend_buffer_is_host(result->self_cond_probs_tensor->buffer)) {
+            return false;
+        }
+        if (ggml_nbytes(result->self_cond_ids_tensor) != (size_t) n_tokens * sc_k * sizeof(int) ||
+            ggml_nbytes(result->self_cond_probs_tensor) != (size_t) n_tokens * sc_k * sizeof(float)) {
+            return false;
+        }
+    }
+
     const int heap_k = top_k == 0 ? sc_k : std::max(top_k, sc_k);
     if (heap_k <= 0 || heap_k > 1024 || heap_k > n_vocab) {
         return false;
@@ -483,10 +513,18 @@ bool ggml_cuda_diffusion_sample_topk(
                                cudaMemcpyDeviceToHost, stream));
     CUDA_CHECK(cudaMemcpyAsync(result->entropy, entropy_alloc.get(), (size_t) n_tokens * sizeof(float),
                                cudaMemcpyDeviceToHost, stream));
-    CUDA_CHECK(cudaMemcpyAsync(result->self_cond_ids, sc_ids_alloc.get(), (size_t) n_tokens * sc_k * sizeof(int),
-                               cudaMemcpyDeviceToHost, stream));
-    CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs, sc_probs_alloc.get(), (size_t) n_tokens * sc_k * sizeof(float),
-                               cudaMemcpyDeviceToHost, stream));
+    if (have_self_cond_host) {
+        CUDA_CHECK(cudaMemcpyAsync(result->self_cond_ids, sc_ids_alloc.get(), (size_t) n_tokens * sc_k * sizeof(int),
+                                   cudaMemcpyDeviceToHost, stream));
+        CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs, sc_probs_alloc.get(), (size_t) n_tokens * sc_k * sizeof(float),
+                                   cudaMemcpyDeviceToHost, stream));
+    }
+    if (have_self_cond_tensor) {
+        CUDA_CHECK(cudaMemcpyAsync(result->self_cond_ids_tensor->data, sc_ids_alloc.get(), (size_t) n_tokens * sc_k * sizeof(int),
+                                   cudaMemcpyDeviceToDevice, stream));
+        CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs_tensor->data, sc_probs_alloc.get(), (size_t) n_tokens * sc_k * sizeof(float),
+                                   cudaMemcpyDeviceToDevice, stream));
+    }
 
     CUDA_CHECK(cudaStreamSynchronize(stream));
     return true;
diff --git a/include/llama.h b/include/llama.h
index eb2d9cb338fb..e6062b4bbfe9 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1029,6 +1029,8 @@ extern "C" {
         llama_token * sampled;
         llama_token * argmax;
         float       * entropy;
+        // Optional host outputs for self-conditioning. If both are null, the CUDA fast path writes
+        // sparse self-conditioning directly into the reused decoder graph input tensors.
         int32_t     * self_cond_ids;
         float       * self_cond_probs;
     };
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 4ec0292704e6..2a0a5568a1c6 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1149,6 +1149,12 @@ void llama_context::set_diffusion_decoder_phase(bool decoder_phase) {
 }
 
 void llama_context::set_diffusion_self_cond_topk(const int32_t * ids, const float * probs, int64_t k, int64_t n_tokens) {
+    diffusion_cond.sc_topk_device_ready       = false;
+    diffusion_cond.sc_topk_device_ids_data    = nullptr;
+    diffusion_cond.sc_topk_device_probs_data  = nullptr;
+    diffusion_cond.sc_topk_device_ids_bytes   = 0;
+    diffusion_cond.sc_topk_device_probs_bytes = 0;
+
     if (ids == nullptr || probs == nullptr || k <= 0 || n_tokens <= 0) {
         diffusion_cond.sc_topk = 0;
         diffusion_cond.sc_topk_ids.clear();
@@ -1213,6 +1219,20 @@ bool llama_context::diffusion_sample_topk(
         return false;
     }
 
+    ggml_tensor * t_self_cond_ids   = nullptr;
+    ggml_tensor * t_self_cond_probs = nullptr;
+    const bool device_self_cond = result->self_cond_ids == nullptr && result->self_cond_probs == nullptr;
+    if (device_self_cond) {
+        auto * inp = gf_res_prev->get_inp_diffusion_self_cond_topk();
+        if (!inp || !inp->ids || !inp->probs) {
+            return false;
+        }
+        t_self_cond_ids   = inp->ids;
+        t_self_cond_probs = inp->probs;
+    } else if (result->self_cond_ids == nullptr || result->self_cond_probs == nullptr) {
+        return false;
+    }
+
     ggml_cuda_diffusion_sample_params cuda_params = {
         /* .n_vocab               = */ (int32_t) model.vocab.n_tokens(),
         /* .n_tokens              = */ params->n_tokens,
@@ -1230,9 +1250,23 @@ bool llama_context::diffusion_sample_topk(
         /* .entropy         = */ result->entropy,
         /* .self_cond_ids   = */ result->self_cond_ids,
         /* .self_cond_probs = */ result->self_cond_probs,
+        /* .self_cond_ids_tensor   = */ t_self_cond_ids,
+        /* .self_cond_probs_tensor = */ t_self_cond_probs,
     };
 
-    return sample_proc(backend, t_logits, &cuda_params, &cuda_result);
+    const bool ok = sample_proc(backend, t_logits, &cuda_params, &cuda_result);
+    if (ok && device_self_cond) {
+        diffusion_cond.sc_topk = params->self_cond_top_k;
+        diffusion_cond.sc_topk_ids.clear();
+        diffusion_cond.sc_topk_probs.clear();
+        diffusion_cond.sc_topk_device_ready       = true;
+        diffusion_cond.sc_topk_device_ids_data    = t_self_cond_ids->data;
+        diffusion_cond.sc_topk_device_probs_data  = t_self_cond_probs->data;
+        diffusion_cond.sc_topk_device_ids_bytes   = ggml_nbytes(t_self_cond_ids);
+        diffusion_cond.sc_topk_device_probs_bytes = ggml_nbytes(t_self_cond_probs);
+    }
+
+    return ok;
 }
 
 void llama_context::set_causal_attn(bool value) {
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 83615641fdb5..b0a7a31f8131 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -415,6 +415,18 @@ void llm_graph_input_diffusion_self_cond_topk::set_input(const llama_ubatch * ub
     const size_t n_id_bytes = ggml_nbytes(ids);
     const size_t n_pr_bytes = ggml_nbytes(probs);
 
+    const bool have_device = diffusion
+        && diffusion->sc_topk > 0
+        && diffusion->sc_topk_device_ready
+        && diffusion->sc_topk_device_ids_data    == ids->data
+        && diffusion->sc_topk_device_probs_data  == probs->data
+        && diffusion->sc_topk_device_ids_bytes   == n_id_bytes
+        && diffusion->sc_topk_device_probs_bytes == n_pr_bytes;
+
+    if (have_device) {
+        return;
+    }
+
     const bool have = diffusion
         && diffusion->sc_topk > 0
         && diffusion->sc_topk_ids.size()   * sizeof(int32_t) == n_id_bytes
@@ -1020,6 +1032,16 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
     }
 }
 
+llm_graph_input_diffusion_self_cond_topk * llm_graph_result::get_inp_diffusion_self_cond_topk() const {
+    for (auto & input : inputs) {
+        if (auto * topk = dynamic_cast<llm_graph_input_diffusion_self_cond_topk *>(input.get())) {
+            return topk;
+        }
+    }
+
+    return nullptr;
+}
+
 void llm_graph_result::set_outputs() {
     if (t_logits != nullptr) {
         ggml_set_output(t_logits);
@@ -1033,6 +1055,9 @@ void llm_graph_result::set_outputs() {
     if (t_h_nextn != nullptr) {
         ggml_set_output(t_h_nextn);
     }
+    if (t_h_pre_norm != nullptr) {
+        ggml_set_output(t_h_pre_norm);
+    }
     for (auto & [seq_id, t] : t_sampled) {
         if (t != nullptr) {
             ggml_set_output(t);
@@ -2105,6 +2130,18 @@ llm_graph_input_diffusion_self_cond_topk * llm_graph_context::build_inp_diffusio
     inp->probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, k, n_tokens);
     ggml_set_input(inp->probs);
 
+    if (sched) {
+        for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); ++i) {
+            ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
+            if (backend && backend != backend_cpu &&
+                ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                ggml_backend_sched_set_tensor_backend(sched, inp->ids, backend);
+                ggml_backend_sched_set_tensor_backend(sched, inp->probs, backend);
+                break;
+            }
+        }
+    }
+
     auto * ptr = inp.get();
     res->add_input(std::move(inp));
 
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 609dd282a9e9..3d14c47968f4 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -93,6 +93,11 @@ struct llama_diffusion_cond {
     int64_t sc_topk = 0;
     std::vector<int32_t> sc_topk_ids;
     std::vector<float>   sc_topk_probs;
+    bool     sc_topk_device_ready      = false;
+    void   * sc_topk_device_ids_data   = nullptr;
+    void   * sc_topk_device_probs_data = nullptr;
+    size_t   sc_topk_device_ids_bytes  = 0;
+    size_t   sc_topk_device_probs_bytes = 0;
 
     // KV-cache reuse phase selector (block-diffusion):
     //   false (encoder phase) = plain token embeddings, no self-conditioning; KV is
@@ -784,6 +789,8 @@ class llm_graph_result {
     ggml_tensor * get_embd()        const { return t_embd; }
     ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
     ggml_tensor * get_h_nextn()     const { return t_h_nextn; }
+    ggml_tensor * get_h_pre_norm()  const { return t_h_pre_norm; }
+    llm_graph_input_diffusion_self_cond_topk * get_inp_diffusion_self_cond_topk() const;
 
     ggml_cgraph  * get_gf()  const { return gf; }
     ggml_context * get_ctx() const { return ctx_compute.get(); }
@@ -813,6 +820,7 @@ class llm_graph_result {
     ggml_tensor * t_embd        = nullptr;
     ggml_tensor * t_embd_pooled = nullptr;
     ggml_tensor * t_h_nextn     = nullptr; // [n_embd, n_outputs] hidden state before final output norm
+    ggml_tensor * t_h_pre_norm  = nullptr; // [n_embd, n_outputs] hidden state before final output norm
 
     std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
     std::map<llama_seq_id, ggml_tensor*> t_candidates;

From a1b087404d01773402e202ecb960418533934a70 Mon Sep 17 00:00:00 2001
From: Loveneet Nigam <lnigam@nvidia.com>
Date: Tue, 9 Jun 2026 15:18:06 +0530
Subject: [PATCH 04/13] diffusion-gemma: keep denoising loop on device

---
 .../diffusion-gemma/diffusion-gemma-cli.cpp   |  40 +++-
 .../diffusion-gemma-server.cpp                |  42 +++-
 ggml/include/ggml-cuda.h                      |   4 +
 ggml/include/ggml.h                           |   1 +
 ggml/src/ggml-alloc.c                         |  13 +-
 ggml/src/ggml-cuda/diffusion-sampling.cu      | 212 +++++++++++++++---
 include/llama.h                               |   7 +
 src/llama-context.cpp                         |  53 ++++-
 src/llama-graph.cpp                           | 139 +++++++++---
 src/llama-graph.h                             |   9 +-
 src/llama-kv-cache.cpp                        |  66 ++++--
 src/models/diffusion-gemma.cpp                |  15 +-
 12 files changed, 509 insertions(+), 92 deletions(-)

diff --git a/examples/diffusion-gemma/diffusion-gemma-cli.cpp b/examples/diffusion-gemma/diffusion-gemma-cli.cpp
index 84fd64be6895..853d5f36a649 100644
--- a/examples/diffusion-gemma/diffusion-gemma-cli.cpp
+++ b/examples/diffusion-gemma/diffusion-gemma-cli.cpp
@@ -208,15 +208,17 @@ int main(int argc, char ** argv) {
                                   gpu_sampling_topk_ok &&
                                   llama_diffusion_sample_topk_supported(ctx);
     const bool use_device_self_cond = use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0;
+    const bool use_device_loop      = use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0;
     llama_set_diffusion_gpu_sampling(ctx, use_gpu_sampling);
 
     LOG_INF("diffusion-gemma: prefix=%d canvas=%d max_canvases=%d steps=%d entropy_bound=%.3f temp=[%.2f,%.2f] n_ctx=%d mm=%d\n",
             prefix_len, canvas_length, max_canvases, n_steps, entropy_bound, TEMP_MIN, TEMP_MAX, n_ctx, (int) use_mm);
-    LOG_INF("diffusion-gemma: gpu sampling: %s%s%s\n",
+    LOG_INF("diffusion-gemma: gpu sampling: %s%s%s%s\n",
             use_gpu_sampling ? "on" : "off",
             (!gpu_sampling_topk_ok ? " (top-k exceeds CUDA fast-path limit)" :
              (!gpu_sampling_requested ? " (disabled by DG_GPU_SAMPLING=0)" : "")),
-            use_device_self_cond ? " | device self-cond: on" : "");
+            use_device_self_cond ? " | device self-cond: on" : "",
+            use_device_loop ? " | device loop: on" : "");
     if (topk_fixed > 0 || (topk_start > 0 && topk_end > 0)) {
         LOG_INF("diffusion-gemma: top-k sampling: fixed=%d anneal=[%d->%d] tail_correction=%d (vocab=%d)\n",
                 topk_fixed, topk_start, topk_end, topk_tail, n_vocab);
@@ -350,6 +352,40 @@ int main(int argc, char ** argv) {
         if (k_step <= 0 || k_step >= n_vocab) k_step = 0;
         step_k = k_step;
 
+        if (use_device_loop) {
+            bool sampled_ok = true;
+            const auto t_sample_start = std::chrono::steady_clock::now();
+            llama_diffusion_sample_params sample_params = {
+                /* .n_tokens              = */ canvas_length,
+                /* .top_k                 = */ k_step,
+                /* .self_cond_top_k       = */ SC_K,
+                /* .temperature           = */ temp,
+                /* .seed                  = */ params.sampling.seed == LLAMA_DEFAULT_SEED ? 1234u : params.sampling.seed,
+                /* .step                  = */ (uint32_t) n_steps_total,
+                /* .top_k_tail_correction = */ topk_tail != 0,
+            };
+            llama_diffusion_sample_result sample_result = {
+                /* .sampled         = */ nullptr,
+                /* .argmax          = */ nullptr,
+                /* .entropy         = */ nullptr,
+                /* .self_cond_ids   = */ nullptr,
+                /* .self_cond_probs = */ nullptr,
+                /* .final_tokens    = */ cur_step == 1 ? argmax_canvas.data() : nullptr,
+                /* .update_canvas_on_device = */ cur_step > 1,
+                /* .entropy_bound   = */ entropy_bound,
+            };
+            sampled_ok = llama_diffusion_sample_topk(ctx, &sample_params, &sample_result);
+            sample_sync_s += std::chrono::duration<double>(std::chrono::steady_clock::now() - t_sample_start).count();
+            llama_memory_seq_rm(mem, 0, n_past, -1);
+            if (!sampled_ok) {
+                LOG_ERR("error: CUDA diffusion device loop failed at step %d (k=%d)\n", cur_step, k_step);
+                failed = true;
+                done = true;
+                break;
+            }
+            continue;
+        }
+
         // sparse self-cond for the NEXT step: top-SC_K (id, prob) per position. Cleared each step;
         // unused slots stay (id 0, prob 0) so the graph gather contributes nothing for them.
         if (!use_device_self_cond) {
diff --git a/examples/diffusion-gemma/diffusion-gemma-server.cpp b/examples/diffusion-gemma/diffusion-gemma-server.cpp
index bc6ea6e7caf1..3985e2740d5a 100644
--- a/examples/diffusion-gemma/diffusion-gemma-server.cpp
+++ b/examples/diffusion-gemma/diffusion-gemma-server.cpp
@@ -160,6 +160,7 @@ struct diffusion_server {
     bool   topk_tail     = false;
     bool   use_gpu_sampling = false;
     bool   use_device_self_cond = false;
+    bool   use_device_loop = false;
     std::string model_id;
     std::string model_path;
     std::string build_info = "llama.cpp diffusion-gemma-server";
@@ -240,6 +241,10 @@ struct diffusion_server {
 
         int n_blocks_run = 0, n_steps_total = 0;
         bool done = false;
+        const int topk_max_requested =
+            (rq.topk_start > 0 && rq.topk_end > 0) ? std::max(rq.topk_start, rq.topk_end) : rq.topk_fixed;
+        const bool use_device_loop_request = use_device_loop &&
+            (topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K);
 
         const auto t_gen0 = std::chrono::steady_clock::now();
         for (int block = 0; block < max_canvases && !done; ++block) {
@@ -263,6 +268,7 @@ struct diffusion_server {
                 const bool use_gpu_sampling_step = use_gpu_sampling &&
                     (k_step <= 0 || k_step <= GPU_SAMPLING_MAX_TOP_K);
                 const bool use_device_self_cond_step = use_gpu_sampling_step && use_device_self_cond;
+                const bool use_device_loop_step = use_device_loop_request && use_gpu_sampling_step;
 
                 llama_set_causal_attn(ctx, false);
                 llama_set_diffusion_decoder_phase(ctx, true);
@@ -284,6 +290,36 @@ struct diffusion_server {
                 std::vector<float> entropy(canvas_length);
                 std::vector<llama_token> sampled(canvas_length);
 
+                if (use_device_loop_step) {
+                    llama_diffusion_sample_params sample_params = {
+                        /* .n_tokens              = */ canvas_length,
+                        /* .top_k                 = */ k_step,
+                        /* .self_cond_top_k       = */ SC_K,
+                        /* .temperature           = */ temp,
+                        /* .seed                  = */ rq.seed,
+                        /* .step                  = */ (uint32_t) n_steps_total,
+                        /* .top_k_tail_correction = */ rq.topk_tail,
+                    };
+                    llama_diffusion_sample_result sample_result = {
+                        /* .sampled         = */ nullptr,
+                        /* .argmax          = */ nullptr,
+                        /* .entropy         = */ nullptr,
+                        /* .self_cond_ids   = */ nullptr,
+                        /* .self_cond_probs = */ nullptr,
+                        /* .final_tokens    = */ cur_step == 1 ? argmax_canvas.data() : nullptr,
+                        /* .update_canvas_on_device = */ cur_step > 1,
+                        /* .entropy_bound   = */ ENTROPY_BOUND,
+                    };
+                    if (!llama_diffusion_sample_topk(ctx, &sample_params, &sample_result)) {
+                        llama_memory_seq_rm(mem, 0, n_past, -1);
+                        out.error = "CUDA diffusion device loop failed at denoising step " +
+                            std::to_string(cur_step) + " (k=" + std::to_string(k_step) + ")";
+                        return out;
+                    }
+                    llama_memory_seq_rm(mem, 0, n_past, -1);
+                    continue;
+                }
+
                 if (!use_device_self_cond_step) {
                     std::fill(sc_probs.begin(), sc_probs.end(), 0.0f);
                     std::fill(sc_ids.begin(),   sc_ids.end(),   0);
@@ -674,6 +710,7 @@ int main(int argc, char ** argv) {
     srv.use_gpu_sampling = gpu_sampling_requested &&
                            llama_diffusion_sample_topk_supported(srv.ctx);
     srv.use_device_self_cond = srv.use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0;
+    srv.use_device_loop = srv.use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0;
     llama_set_diffusion_gpu_sampling(srv.ctx, srv.use_gpu_sampling);
     const bool default_topk_gpu_ok = topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K;
 
@@ -682,11 +719,12 @@ int main(int argc, char ** argv) {
     SRV_INF("%s\n", llama_print_system_info());
     SRV_INF("model loaded: '%s' | n_ctx = %d | canvas = %d | denoise steps = %d | 1 slot\n",
             srv.model_id.c_str(), srv.n_ctx, srv.canvas_length, srv.n_steps);
-    SRV_INF("gpu sampling: %s%s%s | top-k fixed=%d anneal=[%d->%d] tail_correction=%d\n",
+    SRV_INF("gpu sampling: %s%s%s%s | top-k fixed=%d anneal=[%d->%d] tail_correction=%d\n",
             srv.use_gpu_sampling ? "on" : "off",
             (!default_topk_gpu_ok ? " (default top-k will use CPU fallback until k <= CUDA limit)" :
              (!gpu_sampling_requested ? " (disabled by DG_GPU_SAMPLING=0)" : "")),
             srv.use_device_self_cond ? " | device self-cond: on" : "",
+            srv.use_device_loop ? " | device loop: on" : "",
             srv.topk_fixed, srv.topk_start, srv.topk_end, srv.topk_tail ? 1 : 0);
 
     // ------------------------------------------------------------------------------------------
@@ -758,6 +796,7 @@ int main(int argc, char ** argv) {
                 {"self_cond_topk", SC_K},
                 {"gpu_sampling", srv.use_gpu_sampling},
                 {"device_self_cond", srv.use_device_self_cond},
+                {"device_loop", srv.use_device_loop},
                 {"top_k", srv.topk_fixed},
                 {"top_k_start", srv.topk_start},
                 {"top_k_end", srv.topk_end},
@@ -865,6 +904,7 @@ int main(int argc, char ** argv) {
                     {"self_cond_topk", SC_K},
                     {"gpu_sampling", srv.use_gpu_sampling},
                     {"device_self_cond", srv.use_device_self_cond},
+                    {"device_loop", srv.use_device_loop},
                     {"top_k", srv.topk_fixed},
                     {"top_k_start", srv.topk_start},
                     {"top_k_end", srv.topk_end},
diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h
index f16842c3f835..d966e5a3d0d7 100644
--- a/ggml/include/ggml-cuda.h
+++ b/ggml/include/ggml-cuda.h
@@ -62,6 +62,10 @@ struct ggml_cuda_diffusion_sample_result {
     float   * self_cond_probs;
     struct ggml_tensor * self_cond_ids_tensor;
     struct ggml_tensor * self_cond_probs_tensor;
+    struct ggml_tensor * canvas_tokens_tensor;
+    int32_t * final_tokens;
+    float entropy_bound;
+    bool update_canvas_on_device;
 };
 
 typedef bool (*ggml_backend_cuda_diffusion_sample_topk_t)(
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 374934aacf32..ff3bbce1b531 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -647,6 +647,7 @@ extern "C" {
         GGML_TENSOR_FLAG_PARAM   =  4, // ...contains trainable parameters
         GGML_TENSOR_FLAG_LOSS    =  8, // ...defines loss for numerical optimization (multiple loss tensors add up)
         GGML_TENSOR_FLAG_COMPUTE = 16, // ...must be computed
+        GGML_TENSOR_FLAG_PERSIST = 32, // ...must not be overwritten by graph allocator reuse
     };
 
     enum ggml_tri_type {
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 3bda9abbe03d..5eec18a059c9 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -641,9 +641,10 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
                     continue;
                 }
 
-                // outputs cannot be reused
-                if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
-                    AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
+                // outputs and persistent inputs cannot be reused
+                if ((parent->flags & (GGML_TENSOR_FLAG_OUTPUT | GGML_TENSOR_FLAG_PERSIST)) ||
+                    (parent->view_src != NULL && (parent->view_src->flags & (GGML_TENSOR_FLAG_OUTPUT | GGML_TENSOR_FLAG_PERSIST)))) {
+                    AT_PRINTF("not reusing parent %s for %s as it is persistent\n", parent->name, node->name);
                     continue;
                 }
 
@@ -688,9 +689,9 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
 }
 
 static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
-    // graph outputs are never freed
-    if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
-        AT_PRINTF("not freeing output %s\n", node->name);
+    // graph outputs and persistent inputs are never freed
+    if (node->flags & (GGML_TENSOR_FLAG_OUTPUT | GGML_TENSOR_FLAG_PERSIST)) {
+        AT_PRINTF("not freeing persistent %s\n", node->name);
         return;
     }
 
diff --git a/ggml/src/ggml-cuda/diffusion-sampling.cu b/ggml/src/ggml-cuda/diffusion-sampling.cu
index 34c8efdac7a9..f0b6f53e3146 100644
--- a/ggml/src/ggml-cuda/diffusion-sampling.cu
+++ b/ggml/src/ggml-cuda/diffusion-sampling.cu
@@ -6,6 +6,8 @@
 #include <cfloat>
 #include <cstdlib>
 #include <cstring>
+#include <map>
+#include <mutex>
 
 #ifdef GGML_CUDA_USE_CUB
 #    include <cub/cub.cuh>
@@ -29,6 +31,62 @@ static bool diffusion_fast_topk_enabled() {
     return !v || std::strcmp(v, "0") != 0;
 }
 
+struct diffusion_sample_scratch {
+    int   * top_ids  = nullptr;
+    int   * sampled  = nullptr;
+    int   * argmax   = nullptr;
+    float * entropy  = nullptr;
+    int   * sc_ids   = nullptr;
+    float * sc_probs = nullptr;
+
+    size_t top_ids_cap  = 0;
+    size_t sampled_cap  = 0;
+    size_t argmax_cap   = 0;
+    size_t entropy_cap  = 0;
+    size_t sc_ids_cap   = 0;
+    size_t sc_probs_cap = 0;
+};
+
+static std::mutex g_diffusion_scratch_mutex;
+static std::map<cudaStream_t, diffusion_sample_scratch> g_diffusion_scratch;
+
+template<typename T>
+static void diffusion_scratch_reserve(cudaStream_t stream, T ** ptr, size_t * cap, const size_t need, bool * synced) {
+    if (*cap >= need) {
+        return;
+    }
+    if (*ptr) {
+        if (!*synced) {
+            CUDA_CHECK(cudaStreamSynchronize(stream));
+            *synced = true;
+        }
+        CUDA_CHECK(cudaFree(*ptr));
+        *ptr = nullptr;
+        *cap = 0;
+    }
+    CUDA_CHECK(cudaMalloc((void **) ptr, need * sizeof(T)));
+    *cap = need;
+}
+
+static diffusion_sample_scratch * diffusion_get_scratch(
+        cudaStream_t stream,
+        const int n_tokens,
+        const int heap_k,
+        const int sc_k) {
+    std::lock_guard<std::mutex> lock(g_diffusion_scratch_mutex);
+    diffusion_sample_scratch & scratch = g_diffusion_scratch[stream];
+    bool synced = false;
+
+    diffusion_scratch_reserve(stream, &scratch.top_ids,  &scratch.top_ids_cap,  (size_t) n_tokens * heap_k, &synced);
+    diffusion_scratch_reserve(stream, &scratch.sampled,  &scratch.sampled_cap,  (size_t) n_tokens,          &synced);
+    diffusion_scratch_reserve(stream, &scratch.argmax,   &scratch.argmax_cap,   (size_t) n_tokens,          &synced);
+    diffusion_scratch_reserve(stream, &scratch.entropy,  &scratch.entropy_cap,  (size_t) n_tokens,          &synced);
+    diffusion_scratch_reserve(stream, &scratch.sc_ids,   &scratch.sc_ids_cap,   (size_t) n_tokens * sc_k,   &synced);
+    diffusion_scratch_reserve(stream, &scratch.sc_probs, &scratch.sc_probs_cap, (size_t) n_tokens * sc_k,   &synced);
+
+    return &scratch;
+}
+
 static __device__ __forceinline__ bool diffusion_should_swap_desc(
         const float a_val, const int a_id,
         const float b_val, const int b_id) {
@@ -223,6 +281,82 @@ static __device__ __forceinline__ float diffusion_rng_uniform(uint32_t seed, uin
     return ((x >> 8) + 0.5f) * (1.0f / 16777216.0f);
 }
 
+static __device__ __forceinline__ bool diffusion_pair_gt(float a_val, int a_id, float b_val, int b_id) {
+    return a_val > b_val || (a_val == b_val && a_id > b_id);
+}
+
+static __global__ void diffusion_update_canvas_kernel(
+        const float * __restrict__ entropy,
+        const int * __restrict__ sampled,
+        int * __restrict__ canvas_tokens,
+        const int n_tokens,
+        const int n_vocab,
+        const float entropy_bound,
+        const uint32_t seed,
+        const uint32_t step) {
+    const int tid = threadIdx.x;
+
+    __shared__ float s_entropy[1024];
+    __shared__ int   s_index[1024];
+    __shared__ unsigned char s_accept[1024];
+
+    if (tid < n_tokens) {
+        s_entropy[tid] = entropy[tid];
+        s_index[tid]   = tid;
+        s_accept[tid]  = 0;
+    } else {
+        s_entropy[tid] = FLT_MAX;
+        s_index[tid]   = tid;
+    }
+    __syncthreads();
+
+    for (int k = 2; k <= blockDim.x; k <<= 1) {
+        for (int j = k >> 1; j > 0; j >>= 1) {
+            const int ixj = tid ^ j;
+            if (ixj > tid) {
+                const bool ascending = (tid & k) == 0;
+                const float a_val = s_entropy[tid];
+                const int   a_idx = s_index[tid];
+                const float b_val = s_entropy[ixj];
+                const int   b_idx = s_index[ixj];
+                const bool swap = ascending ?
+                    diffusion_pair_gt(a_val, a_idx, b_val, b_idx) :
+                    diffusion_pair_gt(b_val, b_idx, a_val, a_idx);
+                if (swap) {
+                    s_entropy[tid] = b_val;
+                    s_index[tid]   = b_idx;
+                    s_entropy[ixj] = a_val;
+                    s_index[ixj]   = a_idx;
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    if (tid == 0) {
+        float prefix = 0.0f;
+        for (int i = 0; i < n_tokens; ++i) {
+            const int pos = s_index[i];
+            if (prefix <= entropy_bound) {
+                s_accept[pos] = 1;
+                prefix += s_entropy[i];
+            } else {
+                break;
+            }
+        }
+    }
+    __syncthreads();
+
+    if (tid < n_tokens) {
+        if (s_accept[tid]) {
+            canvas_tokens[tid] = sampled[tid];
+        } else {
+            const uint32_t r = diffusion_rng_u32(seed ^ ((step + 1u) * 0x9e3779b9u) ^ ((uint32_t) tid * 0x7f4a7c15u) ^ 0xa5a5a5a5u);
+            canvas_tokens[tid] = (int) (r % (uint32_t) n_vocab);
+        }
+    }
+}
+
 static __global__ void diffusion_sample_kernel(
         const float * __restrict__ logits,
         const int * __restrict__ top_ids,
@@ -388,10 +522,6 @@ bool ggml_cuda_diffusion_sample_topk(
     if (!ggml_backend_is_cuda(backend) || logits->type != GGML_TYPE_F32 || !ggml_is_contiguous(logits)) {
         return false;
     }
-    if (!result->sampled || !result->argmax || !result->entropy) {
-        return false;
-    }
-
     const int n_vocab = params->n_vocab > 0 ? params->n_vocab : (int) logits->ne[0];
     const int n_tokens = params->n_tokens > 0 ? params->n_tokens : (int) ggml_nrows(logits);
     if (n_vocab <= 0 || n_tokens <= 0 || logits->ne[0] != n_vocab || ggml_nrows(logits) < n_tokens) {
@@ -408,6 +538,19 @@ bool ggml_cuda_diffusion_sample_topk(
         return false;
     }
 
+    if (result->update_canvas_on_device) {
+        if (n_tokens > 1024 ||
+            result->canvas_tokens_tensor == nullptr ||
+            result->canvas_tokens_tensor->type != GGML_TYPE_I32 ||
+            !ggml_is_contiguous(result->canvas_tokens_tensor) ||
+            result->canvas_tokens_tensor->data == nullptr ||
+            result->canvas_tokens_tensor->buffer == nullptr ||
+            ggml_backend_buffer_is_host(result->canvas_tokens_tensor->buffer) ||
+            ggml_nbytes(result->canvas_tokens_tensor) != (size_t) n_tokens * sizeof(int)) {
+            return false;
+        }
+    }
+
     const bool have_self_cond_host = result->self_cond_ids != nullptr && result->self_cond_probs != nullptr;
     const bool have_self_cond_tensor = result->self_cond_ids_tensor != nullptr && result->self_cond_probs_tensor != nullptr;
     if (!have_self_cond_host && !have_self_cond_tensor) {
@@ -453,11 +596,12 @@ bool ggml_cuda_diffusion_sample_topk(
 
     const float * logits_d = (const float *) logits->data;
 
-    ggml_cuda_pool_alloc<int> top_ids_alloc(pool, (size_t) n_tokens * heap_k);
-    int * top_ids = top_ids_alloc.get();
+    diffusion_sample_scratch * scratch = diffusion_get_scratch(stream, n_tokens, heap_k, sc_k);
+    int * top_ids = scratch->top_ids;
     bool top_ids_sorted = false;
 
     const bool use_fast_topk = diffusion_fast_topk_enabled() && heap_k <= 1024 && n_vocab >= heap_k;
+    bool sync_required = !use_fast_topk;
     if (use_fast_topk) {
         diffusion_select_topk_local(logits_d, top_ids, n_vocab, n_tokens, heap_k, stream);
         top_ids_sorted = true;
@@ -494,38 +638,54 @@ bool ggml_cuda_diffusion_sample_topk(
                 logits_d, top_ids, n_vocab, heap_k, heap_k_pad, inv_temp);
     }
 
-    ggml_cuda_pool_alloc<int>   sampled_alloc(pool, n_tokens);
-    ggml_cuda_pool_alloc<int>   argmax_alloc(pool, n_tokens);
-    ggml_cuda_pool_alloc<float> entropy_alloc(pool, n_tokens);
-    ggml_cuda_pool_alloc<int>   sc_ids_alloc(pool, (size_t) n_tokens * sc_k);
-    ggml_cuda_pool_alloc<float> sc_probs_alloc(pool, (size_t) n_tokens * sc_k);
-
     constexpr int block_size = 256;
     diffusion_sample_kernel<<<n_tokens, block_size, 0, stream>>>(
             logits_d, top_ids, n_vocab, n_tokens, top_k, heap_k, sc_k, inv_temp,
             params->seed, params->step, params->top_k_tail_correction,
-            sampled_alloc.get(), argmax_alloc.get(), entropy_alloc.get(),
-            sc_ids_alloc.get(), sc_probs_alloc.get());
-
-    CUDA_CHECK(cudaMemcpyAsync(result->sampled, sampled_alloc.get(), (size_t) n_tokens * sizeof(int),
-                               cudaMemcpyDeviceToHost, stream));
-    CUDA_CHECK(cudaMemcpyAsync(result->argmax, argmax_alloc.get(), (size_t) n_tokens * sizeof(int),
-                               cudaMemcpyDeviceToHost, stream));
-    CUDA_CHECK(cudaMemcpyAsync(result->entropy, entropy_alloc.get(), (size_t) n_tokens * sizeof(float),
-                               cudaMemcpyDeviceToHost, stream));
+            scratch->sampled, scratch->argmax, scratch->entropy,
+            scratch->sc_ids, scratch->sc_probs);
+
+    if (result->update_canvas_on_device) {
+        const int update_threads = next_power_of_2_host(n_tokens);
+        diffusion_update_canvas_kernel<<<1, update_threads, 0, stream>>>(
+                scratch->entropy, scratch->sampled, (int *) result->canvas_tokens_tensor->data,
+                n_tokens, n_vocab, result->entropy_bound, params->seed, params->step);
+    }
+
+    if (result->sampled) {
+        CUDA_CHECK(cudaMemcpyAsync(result->sampled, scratch->sampled, (size_t) n_tokens * sizeof(int),
+                                   cudaMemcpyDeviceToHost, stream));
+    }
+    if (result->argmax) {
+        CUDA_CHECK(cudaMemcpyAsync(result->argmax, scratch->argmax, (size_t) n_tokens * sizeof(int),
+                                   cudaMemcpyDeviceToHost, stream));
+    }
+    if (result->entropy) {
+        CUDA_CHECK(cudaMemcpyAsync(result->entropy, scratch->entropy, (size_t) n_tokens * sizeof(float),
+                                   cudaMemcpyDeviceToHost, stream));
+    }
+    if (result->final_tokens) {
+        CUDA_CHECK(cudaMemcpyAsync(result->final_tokens, scratch->argmax, (size_t) n_tokens * sizeof(int),
+                                   cudaMemcpyDeviceToHost, stream));
+    }
     if (have_self_cond_host) {
-        CUDA_CHECK(cudaMemcpyAsync(result->self_cond_ids, sc_ids_alloc.get(), (size_t) n_tokens * sc_k * sizeof(int),
+        CUDA_CHECK(cudaMemcpyAsync(result->self_cond_ids, scratch->sc_ids, (size_t) n_tokens * sc_k * sizeof(int),
                                    cudaMemcpyDeviceToHost, stream));
-        CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs, sc_probs_alloc.get(), (size_t) n_tokens * sc_k * sizeof(float),
+        CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs, scratch->sc_probs, (size_t) n_tokens * sc_k * sizeof(float),
                                    cudaMemcpyDeviceToHost, stream));
     }
     if (have_self_cond_tensor) {
-        CUDA_CHECK(cudaMemcpyAsync(result->self_cond_ids_tensor->data, sc_ids_alloc.get(), (size_t) n_tokens * sc_k * sizeof(int),
+        CUDA_CHECK(cudaMemcpyAsync(result->self_cond_ids_tensor->data, scratch->sc_ids, (size_t) n_tokens * sc_k * sizeof(int),
                                    cudaMemcpyDeviceToDevice, stream));
-        CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs_tensor->data, sc_probs_alloc.get(), (size_t) n_tokens * sc_k * sizeof(float),
+        CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs_tensor->data, scratch->sc_probs, (size_t) n_tokens * sc_k * sizeof(float),
                                    cudaMemcpyDeviceToDevice, stream));
     }
 
-    CUDA_CHECK(cudaStreamSynchronize(stream));
+    const bool host_outputs_requested = result->sampled || result->argmax || result->entropy ||
+        result->final_tokens || have_self_cond_host;
+    CUDA_CHECK(cudaGetLastError());
+    if (sync_required || host_outputs_requested) {
+        CUDA_CHECK(cudaStreamSynchronize(stream));
+    }
     return true;
 }
diff --git a/include/llama.h b/include/llama.h
index e6062b4bbfe9..0e39b71a388e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1033,6 +1033,13 @@ extern "C" {
         // sparse self-conditioning directly into the reused decoder graph input tensors.
         int32_t     * self_cond_ids;
         float       * self_cond_probs;
+        // Optional device-resident denoise update. When update_canvas_on_device is true, the CUDA
+        // fast path performs entropy accept/renoise and writes the next canvas into the reused
+        // decoder token input tensor. final_tokens is copied only when requested, typically on
+        // the final denoising step.
+        llama_token * final_tokens;
+        bool          update_canvas_on_device;
+        float         entropy_bound;
     };
 
     // CUDA-only fast path for block-diffusion sampling. When enabled, llama_decode()
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 2a0a5568a1c6..fd6c8bc99a36 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1154,6 +1154,9 @@ void llama_context::set_diffusion_self_cond_topk(const int32_t * ids, const floa
     diffusion_cond.sc_topk_device_probs_data  = nullptr;
     diffusion_cond.sc_topk_device_ids_bytes   = 0;
     diffusion_cond.sc_topk_device_probs_bytes = 0;
+    diffusion_cond.canvas_tokens_device_ready = false;
+    diffusion_cond.canvas_tokens_device_data  = nullptr;
+    diffusion_cond.canvas_tokens_device_bytes = 0;
 
     if (ids == nullptr || probs == nullptr || k <= 0 || n_tokens <= 0) {
         diffusion_cond.sc_topk = 0;
@@ -1186,6 +1189,35 @@ static ggml_backend_cuda_diffusion_sample_topk_t get_cuda_diffusion_sample_topk_
         ggml_backend_reg_get_proc_address(reg, "ggml_backend_cuda_diffusion_sample_topk");
 }
 
+static bool diffusion_decoder_inputs_device_ready(
+        const llama_diffusion_cond & diffusion_cond,
+        const llm_graph_result * res) {
+    if (!diffusion_cond.decoder_phase ||
+        !diffusion_cond.canvas_tokens_device_ready ||
+        !diffusion_cond.sc_topk_device_ready ||
+        !res) {
+        return false;
+    }
+
+    ggml_tensor * t_tokens = res->get_inp_tokens();
+    if (!t_tokens ||
+        diffusion_cond.canvas_tokens_device_data  != t_tokens->data ||
+        diffusion_cond.canvas_tokens_device_bytes != ggml_nbytes(t_tokens)) {
+        return false;
+    }
+
+    auto * inp_sc = res->get_inp_diffusion_self_cond_topk();
+    if (!inp_sc || !inp_sc->ids || !inp_sc->probs ||
+        diffusion_cond.sc_topk_device_ids_data    != inp_sc->ids->data ||
+        diffusion_cond.sc_topk_device_probs_data  != inp_sc->probs->data ||
+        diffusion_cond.sc_topk_device_ids_bytes   != ggml_nbytes(inp_sc->ids) ||
+        diffusion_cond.sc_topk_device_probs_bytes != ggml_nbytes(inp_sc->probs)) {
+        return false;
+    }
+
+    return true;
+}
+
 bool llama_context::diffusion_sample_topk_supported() const {
     for (ggml_backend_t backend : backend_ptrs) {
         if (get_cuda_diffusion_sample_topk_proc(backend)) {
@@ -1221,6 +1253,7 @@ bool llama_context::diffusion_sample_topk(
 
     ggml_tensor * t_self_cond_ids   = nullptr;
     ggml_tensor * t_self_cond_probs = nullptr;
+    ggml_tensor * t_canvas_tokens   = nullptr;
     const bool device_self_cond = result->self_cond_ids == nullptr && result->self_cond_probs == nullptr;
     if (device_self_cond) {
         auto * inp = gf_res_prev->get_inp_diffusion_self_cond_topk();
@@ -1233,6 +1266,13 @@ bool llama_context::diffusion_sample_topk(
         return false;
     }
 
+    if (result->update_canvas_on_device) {
+        t_canvas_tokens = gf_res_prev->get_inp_tokens();
+        if (!t_canvas_tokens) {
+            return false;
+        }
+    }
+
     ggml_cuda_diffusion_sample_params cuda_params = {
         /* .n_vocab               = */ (int32_t) model.vocab.n_tokens(),
         /* .n_tokens              = */ params->n_tokens,
@@ -1252,6 +1292,10 @@ bool llama_context::diffusion_sample_topk(
         /* .self_cond_probs = */ result->self_cond_probs,
         /* .self_cond_ids_tensor   = */ t_self_cond_ids,
         /* .self_cond_probs_tensor = */ t_self_cond_probs,
+        /* .canvas_tokens_tensor   = */ t_canvas_tokens,
+        /* .final_tokens           = */ (int32_t *) result->final_tokens,
+        /* .entropy_bound          = */ result->entropy_bound,
+        /* .update_canvas_on_device = */ result->update_canvas_on_device,
     };
 
     const bool ok = sample_proc(backend, t_logits, &cuda_params, &cuda_result);
@@ -1265,6 +1309,11 @@ bool llama_context::diffusion_sample_topk(
         diffusion_cond.sc_topk_device_ids_bytes   = ggml_nbytes(t_self_cond_ids);
         diffusion_cond.sc_topk_device_probs_bytes = ggml_nbytes(t_self_cond_probs);
     }
+    if (ok && result->update_canvas_on_device) {
+        diffusion_cond.canvas_tokens_device_ready = true;
+        diffusion_cond.canvas_tokens_device_data  = t_canvas_tokens->data;
+        diffusion_cond.canvas_tokens_device_bytes = ggml_nbytes(t_canvas_tokens);
+    }
 
     return ok;
 }
@@ -1465,7 +1514,9 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
         //const auto t_start_us = ggml_time_us();
 
         // FIXME this call causes a crash if any model inputs were not used in the graph and were therefore not allocated
-        res->set_inputs(&ubatch);
+        if (!diffusion_decoder_inputs_device_ready(diffusion_cond, res)) {
+            res->set_inputs(&ubatch);
+        }
 
         //LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
     }
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index b0a7a31f8131..778e413b0851 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -14,6 +14,7 @@
 
 #include <cassert>
 #include <cmath>
+#include <cstdlib>
 #include <cstring>
 #include <numeric>
 #include <sstream>
@@ -84,8 +85,16 @@ static ggml_tensor * ggml_mul_mat_aux(
 void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
     if (ubatch->token) {
         const int64_t n_tokens = ubatch->n_tokens;
+        const size_t n_bytes = n_tokens*ggml_element_size(tokens);
 
-        ggml_backend_tensor_set(tokens, ubatch->token, 0, n_tokens*ggml_element_size(tokens));
+        const bool have_device = diffusion
+            && diffusion->canvas_tokens_device_ready
+            && diffusion->canvas_tokens_device_data  == tokens->data
+            && diffusion->canvas_tokens_device_bytes == n_bytes;
+
+        if (!have_device) {
+            ggml_backend_tensor_set(tokens, ubatch->token, 0, n_bytes);
+        }
     }
 
     if (ubatch->embd) {
@@ -216,26 +225,25 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
 
     const int64_t n_tokens = ubatch->n_tokens;
 
-    GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer));
-    int32_t * data = (int32_t *) out_ids->data;
+    std::vector<int32_t> data(n_outputs);
 
     if (n_outputs == n_tokens) {
         for (int i = 0; i < n_tokens; ++i) {
             data[i] = i;
         }
+    } else {
+        GGML_ASSERT(ubatch->output);
 
-        return;
-    }
-
-    GGML_ASSERT(ubatch->output);
-
-    int n_outputs = 0;
+        int n_outputs = 0;
 
-    for (int i = 0; i < n_tokens; ++i) {
-        if (ubatch->output[i]) {
-            data[n_outputs++] = i;
+        for (int i = 0; i < n_tokens; ++i) {
+            if (ubatch->output[i]) {
+                data[n_outputs++] = i;
+            }
         }
     }
+
+    ggml_backend_tensor_set(out_ids, data.data(), 0, data.size()*ggml_element_size(out_ids));
 }
 
 bool llm_graph_input_out_ids::can_reuse(const llm_graph_params & params) {
@@ -524,20 +532,26 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
     };
 
     GGML_ASSERT(self_kq_mask);
-    GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
     if (self_kq_mask->type == GGML_TYPE_F16) {
-        fill_mask((ggml_fp16_t *) self_kq_mask->data, ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE);
+        std::vector<ggml_fp16_t> data(ggml_nelements(self_kq_mask));
+        fill_mask(data.data(), ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE);
+        ggml_backend_tensor_set(self_kq_mask, data.data(), 0, ggml_nbytes(self_kq_mask));
     } else {
-        fill_mask((float       *) self_kq_mask->data, ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE);
+        std::vector<float> data(ggml_nelements(self_kq_mask));
+        fill_mask(data.data(), ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE);
+        ggml_backend_tensor_set(self_kq_mask, data.data(), 0, ggml_nbytes(self_kq_mask));
     }
 
     if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
         GGML_ASSERT(self_kq_mask_swa);
-        GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
         if (self_kq_mask_swa->type == GGML_TYPE_F16) {
-            fill_mask((ggml_fp16_t *) self_kq_mask_swa->data, ggml_nelements(self_kq_mask_swa), hparams.n_swa, hparams.swa_type);
+            std::vector<ggml_fp16_t> data(ggml_nelements(self_kq_mask_swa));
+            fill_mask(data.data(), ggml_nelements(self_kq_mask_swa), hparams.n_swa, hparams.swa_type);
+            ggml_backend_tensor_set(self_kq_mask_swa, data.data(), 0, ggml_nbytes(self_kq_mask_swa));
         } else {
-            fill_mask((float       *) self_kq_mask_swa->data, ggml_nelements(self_kq_mask_swa), hparams.n_swa, hparams.swa_type);
+            std::vector<float> data(ggml_nelements(self_kq_mask_swa));
+            fill_mask(data.data(), ggml_nelements(self_kq_mask_swa), hparams.n_swa, hparams.swa_type);
+            ggml_backend_tensor_set(self_kq_mask_swa, data.data(), 0, ggml_nbytes(self_kq_mask_swa));
         }
     }
 }
@@ -574,11 +588,14 @@ void llm_graph_input_attn_no_cache_prefix::set_input(const llama_ubatch * ubatch
     };
 
     GGML_ASSERT(self_kq_mask);
-    GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
     if (self_kq_mask->type == GGML_TYPE_F16) {
-        fill_mask((ggml_fp16_t *) self_kq_mask->data, ggml_nelements(self_kq_mask));
+        std::vector<ggml_fp16_t> data(ggml_nelements(self_kq_mask));
+        fill_mask(data.data(), ggml_nelements(self_kq_mask));
+        ggml_backend_tensor_set(self_kq_mask, data.data(), 0, ggml_nbytes(self_kq_mask));
     } else {
-        fill_mask((float       *) self_kq_mask->data, ggml_nelements(self_kq_mask));
+        std::vector<float> data(ggml_nelements(self_kq_mask));
+        fill_mask(data.data(), ggml_nelements(self_kq_mask));
+        ggml_backend_tensor_set(self_kq_mask, data.data(), 0, ggml_nbytes(self_kq_mask));
     }
 }
 
@@ -1177,6 +1194,34 @@ void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
     }
 }
 
+void llm_graph_context::set_diffusion_input_backend(ggml_tensor * tensor, uint32_t group) const {
+    if (!diffusion || !diffusion->decoder_phase || !sched || !tensor) {
+        return;
+    }
+
+    static const uint32_t enabled_groups = [] {
+        const char * env = getenv("DG_GPU_INPUT_GROUPS");
+        // Keep the default to inputs used by the fixed diffusion decoder graph:
+        // canvas/self-cond, positions, attention scale, KV indices, masks and
+        // rotary helpers. They are marked persistent below so the allocator will
+        // not overwrite them between denoising replays.
+        return env ? (uint32_t) strtoul(env, nullptr, 0) : 63u; // 1|2|4|8|16|32
+    }();
+    if ((enabled_groups & group) == 0) {
+        return;
+    }
+
+    for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); ++i) {
+        ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
+        if (backend && backend != backend_cpu &&
+            ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+            tensor->flags |= GGML_TENSOR_FLAG_PERSIST;
+            ggml_backend_sched_set_tensor_backend(sched, tensor, backend);
+            break;
+        }
+    }
+}
+
 ggml_tensor * llm_graph_context::build_cvec(
          ggml_tensor * cur,
                  int   il) const {
@@ -1924,12 +1969,13 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 
     assert(n_embd_inp >= n_embd);
 
-    auto inp = std::make_unique<llm_graph_input_embd>(n_embd_inp);
+    auto inp = std::make_unique<llm_graph_input_embd>(n_embd_inp, diffusion);
 
     inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
     cb(inp->tokens, "inp_tokens", -1);
     ggml_set_input(inp->tokens);
     res->t_inp_tokens = inp->tokens;
+    set_diffusion_input_backend(inp->tokens);
 
     inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_inp, ubatch.n_tokens);
     cb(inp->embd, "inp_embd", -1);
@@ -2014,6 +2060,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 
     cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t)n_tokens*hparams.n_pos_per_embd());
     ggml_set_input(cur);
+    set_diffusion_input_backend(cur, 2);
 
     res->add_input(std::move(inp));
 
@@ -2028,6 +2075,7 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
     // this need to be 1x1xN for broadcasting
     cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
     ggml_set_input(cur);
+    set_diffusion_input_backend(cur, 4);
     ggml_set_name(cur, "attn_scale");
 
     res->add_input(std::move(inp));
@@ -2036,6 +2084,10 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_out_ids() const {
+    if (diffusion && diffusion->decoder_phase && n_outputs == n_tokens) {
+        return nullptr;
+    }
+
     // note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
     //       but this would make the graph topology depend on the number of output tokens, which can interfere with
     //       features that require constant topology such as pipeline parallelism
@@ -2050,6 +2102,7 @@ ggml_tensor * llm_graph_context::build_inp_out_ids() const {
 
     cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
     ggml_set_input(cur);
+    set_diffusion_input_backend(cur, 64);
 
     res->add_input(std::move(inp));
 
@@ -2113,6 +2166,7 @@ ggml_tensor * llm_graph_context::build_inp_diffusion_self_cond(int64_t n_vocab)
 
     cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens);
     ggml_set_input(cur);
+    set_diffusion_input_backend(cur);
 
     res->add_input(std::move(inp));
 
@@ -2126,21 +2180,11 @@ llm_graph_input_diffusion_self_cond_topk * llm_graph_context::build_inp_diffusio
     // dims that must match the data tensor; a flat index list gathers into [n_embd, k*n_tokens]).
     inp->ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, k * n_tokens);
     ggml_set_input(inp->ids);
+    set_diffusion_input_backend(inp->ids);
 
     inp->probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, k, n_tokens);
     ggml_set_input(inp->probs);
-
-    if (sched) {
-        for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); ++i) {
-            ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
-            if (backend && backend != backend_cpu &&
-                ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-                ggml_backend_sched_set_tensor_backend(sched, inp->ids, backend);
-                ggml_backend_sched_set_tensor_backend(sched, inp->probs, backend);
-                break;
-            }
-        }
-    }
+    set_diffusion_input_backend(inp->probs);
 
     auto * ptr = inp.get();
     res->add_input(std::move(inp));
@@ -2337,12 +2381,14 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
     // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
     inp->self_kq_mask = ggml_new_tensor_4d(ctx0, type_mask, n_tokens, n_tokens, 1, 1);
     ggml_set_input(inp->self_kq_mask);
+    set_diffusion_input_backend(inp->self_kq_mask, 128);
 
     inp->self_kq_mask_cnv = inp->self_kq_mask;
 
     if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
         inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, type_mask, n_tokens, n_tokens, 1, 1);
         ggml_set_input(inp->self_kq_mask_swa);
+        set_diffusion_input_backend(inp->self_kq_mask_swa, 128);
 
         inp->self_kq_mask_swa_cnv = inp->self_kq_mask_swa;
     } else {
@@ -2360,6 +2406,7 @@ llm_graph_input_attn_no_cache_prefix * llm_graph_context::build_attn_inp_no_cach
 
     inp->self_kq_mask = ggml_new_tensor_4d(ctx0, type_mask, n_tokens, n_tokens, 1, 1);
     ggml_set_input(inp->self_kq_mask);
+    set_diffusion_input_backend(inp->self_kq_mask, 128);
     inp->self_kq_mask_cnv = inp->self_kq_mask;
 
     // sliding-window layers reuse the same prefix mask (valid while n_tokens <= sliding_window)
@@ -2450,6 +2497,11 @@ llm_graph_input_attn_kv * llm_graph_context::build_attn_inp_kv() const {
     const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);
 
     auto inp = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur);
+    set_diffusion_input_backend(inp->self_k_idxs, 8);
+    set_diffusion_input_backend(inp->self_v_idxs, 8);
+    set_diffusion_input_backend(inp->self_kq_mask, 16);
+    set_diffusion_input_backend(inp->self_k_rot, 32);
+    set_diffusion_input_backend(inp->self_v_rot, 32);
 
     return (llm_graph_input_attn_kv *) res->add_input(std::move(inp));
 }
@@ -2554,6 +2606,8 @@ llm_graph_input_attn_k * llm_graph_context::build_attn_inp_k() const {
     const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);
 
     auto inp = build_attn_inp_k_impl(ctx0, ubatch, hparams, cparams, mctx_cur);
+    set_diffusion_input_backend(inp->self_k_idxs, 8);
+    set_diffusion_input_backend(inp->self_kq_mask, 16);
 
     return (llm_graph_input_attn_k *) res->add_input(std::move(inp));
 }
@@ -2863,6 +2917,12 @@ llm_graph_input_attn_k_dsa * llm_graph_context::build_attn_inp_k_dsa() const {
         inp->self_k_rot_lid = mctx_cur->get_lid()->build_input_k_rot(ctx0);
     }
 
+    set_diffusion_input_backend(inp->self_k_idxs_mla, 8);
+    set_diffusion_input_backend(inp->self_kq_mask_mla, 16);
+    set_diffusion_input_backend(inp->self_k_idxs_lid, 8);
+    set_diffusion_input_backend(inp->self_kq_mask_lid, 16);
+    set_diffusion_input_backend(inp->self_k_rot_lid, 32);
+
     return (llm_graph_input_attn_k_dsa *) res->add_input(std::move(inp));
 }
 
@@ -2898,6 +2958,17 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
     inp->self_k_rot_swa = mctx_cur->get_swa()->build_input_k_rot(ctx0);
     inp->self_v_rot_swa = mctx_cur->get_swa()->build_input_v_rot(ctx0);
 
+    set_diffusion_input_backend(inp->self_k_idxs, 8);
+    set_diffusion_input_backend(inp->self_v_idxs, 8);
+    set_diffusion_input_backend(inp->self_kq_mask, 16);
+    set_diffusion_input_backend(inp->self_k_idxs_swa, 8);
+    set_diffusion_input_backend(inp->self_v_idxs_swa, 8);
+    set_diffusion_input_backend(inp->self_kq_mask_swa, 16);
+    set_diffusion_input_backend(inp->self_k_rot, 32);
+    set_diffusion_input_backend(inp->self_v_rot, 32);
+    set_diffusion_input_backend(inp->self_k_rot_swa, 32);
+    set_diffusion_input_backend(inp->self_v_rot_swa, 32);
+
     return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
 }
 
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 3d14c47968f4..838bbce24b86 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -99,6 +99,10 @@ struct llama_diffusion_cond {
     size_t   sc_topk_device_ids_bytes  = 0;
     size_t   sc_topk_device_probs_bytes = 0;
 
+    bool     canvas_tokens_device_ready = false;
+    void   * canvas_tokens_device_data  = nullptr;
+    size_t   canvas_tokens_device_bytes = 0;
+
     // KV-cache reuse phase selector (block-diffusion):
     //   false (encoder phase) = plain token embeddings, no self-conditioning; KV is
     //          committed to the cache (prompt prefill / finalized-canvas commit).
@@ -147,7 +151,8 @@ using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
 
 class llm_graph_input_embd : public llm_graph_input_i {
 public:
-    llm_graph_input_embd(int64_t n_embd) : n_embd(n_embd) {}
+    llm_graph_input_embd(int64_t n_embd, const llama_diffusion_cond * diffusion = nullptr) :
+        diffusion(diffusion), n_embd(n_embd) {}
     virtual ~llm_graph_input_embd() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
@@ -157,6 +162,7 @@ class llm_graph_input_embd : public llm_graph_input_i {
     ggml_tensor * tokens = nullptr; // I32 [n_batch]
     ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
 
+    const llama_diffusion_cond * diffusion = nullptr;
     const int64_t n_embd = 0;
 };
 
@@ -923,6 +929,7 @@ struct llm_graph_context {
     virtual ~llm_graph_context() = default;
 
     void cb(ggml_tensor * cur, const char * name, int il) const;
+    void set_diffusion_input_backend(ggml_tensor * tensor, uint32_t group = 1) const;
 
     //
     // common
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 2802103bdd82..0e7c92f91b0a 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -1450,8 +1450,15 @@ void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ub
     const uint32_t n_tokens = ubatch->n_tokens;
     GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
 
-    GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
-    int64_t * data = (int64_t *) dst->data;
+    std::vector<int64_t> data_staging;
+    int64_t * data = nullptr;
+    const bool is_host = ggml_backend_buffer_is_host(dst->buffer);
+    if (is_host) {
+        data = (int64_t *) dst->data;
+    } else {
+        data_staging.resize(ggml_nelements(dst));
+        data = data_staging.data();
+    }
 
     for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
         const int64_t offs = sinfo.strm[s]*get_size();
@@ -1460,14 +1467,25 @@ void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ub
             data[s*sinfo.size() + i] = offs + sinfo.idxs[s][i];
         }
     }
+
+    if (!is_host) {
+        ggml_backend_tensor_set(dst, data_staging.data(), 0, ggml_nbytes(dst));
+    }
 }
 
 void llama_kv_cache::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
     const uint32_t n_tokens = ubatch->n_tokens;
     GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
 
-    GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
-    int64_t * data = (int64_t *) dst->data;
+    std::vector<int64_t> data_staging;
+    int64_t * data = nullptr;
+    const bool is_host = ggml_backend_buffer_is_host(dst->buffer);
+    if (is_host) {
+        data = (int64_t *) dst->data;
+    } else {
+        data_staging.resize(ggml_nelements(dst));
+        data = data_staging.data();
+    }
 
     if (!v_trans) {
         for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
@@ -1493,6 +1511,10 @@ void llama_kv_cache::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ub
             }
         }
     }
+
+    if (!is_host) {
+        ggml_backend_tensor_set(dst, data_staging.data(), 0, ggml_nbytes(dst));
+    }
 }
 
 void llama_kv_cache::set_input_k_shift(ggml_tensor * dst) const {
@@ -1715,8 +1737,6 @@ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, T * data
 void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
     const uint32_t n_tokens = ubatch->n_tokens;
 
-    GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
-
     const int64_t n_kv     = dst->ne[0];
     const int64_t n_stream = dst->ne[3]; // num streams in the current ubatch
 
@@ -1740,9 +1760,21 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
     };
 
     if (dst->type == GGML_TYPE_F16) {
-        set_input_kq_mask_impl<ggml_fp16_t>(args, (ggml_fp16_t *) dst->data, causal_attn);
+        if (ggml_backend_buffer_is_host(dst->buffer)) {
+            set_input_kq_mask_impl<ggml_fp16_t>(args, (ggml_fp16_t *) dst->data, causal_attn);
+        } else {
+            std::vector<ggml_fp16_t> data(ggml_nelements(dst));
+            set_input_kq_mask_impl<ggml_fp16_t>(args, data.data(), causal_attn);
+            ggml_backend_tensor_set(dst, data.data(), 0, ggml_nbytes(dst));
+        }
     } else {
-        set_input_kq_mask_impl<float>(args, (float *) dst->data, causal_attn);
+        if (ggml_backend_buffer_is_host(dst->buffer)) {
+            set_input_kq_mask_impl<float>(args, (float *) dst->data, causal_attn);
+        } else {
+            std::vector<float> data(ggml_nelements(dst));
+            set_input_kq_mask_impl<float>(args, data.data(), causal_attn);
+            ggml_backend_tensor_set(dst, data.data(), 0, ggml_nbytes(dst));
+        }
     }
 
     //const int64_t t_end = ggml_time_us();
@@ -1776,21 +1808,27 @@ void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch
 }
 
 void llama_kv_cache::set_input_k_rot(ggml_tensor * dst) const {
-    GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
-
     const auto n_rot = dst->ne[0];
     GGML_ASSERT(attn_rot_hadamard.count(dst->ne[0]));
 
-    memcpy(dst->data, attn_rot_hadamard.at(n_rot).data(), ggml_nbytes(dst));
+    const auto & data = attn_rot_hadamard.at(n_rot);
+    if (ggml_backend_buffer_is_host(dst->buffer)) {
+        memcpy(dst->data, data.data(), ggml_nbytes(dst));
+    } else {
+        ggml_backend_tensor_set(dst, data.data(), 0, ggml_nbytes(dst));
+    }
 }
 
 void llama_kv_cache::set_input_v_rot(ggml_tensor * dst) const {
-    GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
-
     const auto n_rot = dst->ne[0];
     GGML_ASSERT(attn_rot_hadamard.count(dst->ne[0]));
 
-    memcpy(dst->data, attn_rot_hadamard.at(n_rot).data(), ggml_nbytes(dst));
+    const auto & data = attn_rot_hadamard.at(n_rot);
+    if (ggml_backend_buffer_is_host(dst->buffer)) {
+        memcpy(dst->data, data.data(), ggml_nbytes(dst));
+    } else {
+        ggml_backend_tensor_set(dst, data.data(), 0, ggml_nbytes(dst));
+    }
 }
 
 size_t llama_kv_cache::total_size() const {
diff --git a/src/models/diffusion-gemma.cpp b/src/models/diffusion-gemma.cpp
index b01fe2e8de97..fe521082e542 100644
--- a/src/models/diffusion-gemma.cpp
+++ b/src/models/diffusion-gemma.cpp
@@ -53,13 +53,14 @@ llama_model_diffusion_gemma::~llama_model_diffusion_gemma() {
     }
 }
 
-// Place the token embedding the self-conditioning step needs on an offloaded (GPU) backend.
+// Place the token embedding the diffusion decoder needs on an offloaded (GPU) backend.
 //
 // Primary path (sparse gather): store tok_embd {n_embd, n_vocab} as F16 in tok_embd_gpu on-device.
-// The decoder graph gathers only the top-k rows per position (ggml_get_rows), so this is the whole
-// soft-embedding cost: ~1.47 GiB resident, no per-decode PCIe copy, no full-vocab matmul. F16 (not
-// the native Q4_K) because CUDA get_rows has no Q4_K/Q6_K kernel -- a quantized gather would fall
-// back to CPU every step (a large regression). F16 halves the VRAM vs the F32 dense path.
+// The decoder graph gathers the canvas token rows and the self-conditioning top-k rows from this
+// tensor. This keeps token-id driven embedding lookup on device: ~1.47 GiB resident, no per-decode
+// token-id D2H for CPU row selection, no per-decode embedding H2D. F16 (not the native Q4_K)
+// because CUDA get_rows has no Q4_K/Q6_K kernel -- a quantized gather would fall back to CPU every
+// step (a large regression). F16 halves the VRAM vs the F32 dense path.
 //
 // Fallback path (dense matmul): if the F16 copy can't be allocated, precompute the transposed F32
 // embedding {n_vocab, n_embd} so the dense `probs @ token_embd` matmul (build_input) still runs
@@ -118,7 +119,7 @@ void llama_model_diffusion_gemma::load_arch_post(llama_model_loader & ml) {
         if (tok_embd_gpu_buf) {
             ggml_backend_tensor_set(tok_embd_gpu, half.data(), 0, ggml_nbytes(tok_embd_gpu));
 
-            LLAMA_LOG_INFO("%s: placed self-cond token embedding {%lld, %lld} F16 (%.2f GiB) on %s (gather, k=%lld)\n",
+            LLAMA_LOG_INFO("%s: placed diffusion token embedding {%lld, %lld} F16 (%.2f GiB) on %s (gather, k=%lld)\n",
                            __func__, (long long) n_embd_t, (long long) n_vocab_t,
                            ggml_nbytes(tok_embd_gpu) / (1024.0 * 1024.0 * 1024.0),
                            ggml_backend_buffer_name(tok_embd_gpu->buffer),
@@ -184,7 +185,7 @@ std::unique_ptr<llm_graph_context> llama_model_diffusion_gemma::build_arch_graph
 ggml_tensor * llama_model_diffusion_gemma::graph_base::build_input(bool is_decoder) {
     const auto & dmodel = static_cast<const llama_model_diffusion_gemma &>(model);
 
-    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+    ggml_tensor * inpL = build_inp_embd(dmodel.tok_embd_gpu ? dmodel.tok_embd_gpu : model.tok_embd);
 
     // scaled word embeddings (sqrt(hidden_size)); raw embeddings input is not scaled
     inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);

From ba31e9a6d5ec0a60bf4e09283aeab23629a50e64 Mon Sep 17 00:00:00 2001
From: Loveneet Nigam <lnigam@nvidia.com>
Date: Tue, 9 Jun 2026 15:36:43 +0530
Subject: [PATCH 05/13] diffusion-gemma: use output flag for persistent inputs

---
 ggml/include/ggml.h   |  1 -
 ggml/src/ggml-alloc.c | 13 ++++++-------
 src/llama-graph.cpp   |  7 ++++---
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index ff3bbce1b531..374934aacf32 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -647,7 +647,6 @@ extern "C" {
         GGML_TENSOR_FLAG_PARAM   =  4, // ...contains trainable parameters
         GGML_TENSOR_FLAG_LOSS    =  8, // ...defines loss for numerical optimization (multiple loss tensors add up)
         GGML_TENSOR_FLAG_COMPUTE = 16, // ...must be computed
-        GGML_TENSOR_FLAG_PERSIST = 32, // ...must not be overwritten by graph allocator reuse
     };
 
     enum ggml_tri_type {
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 5eec18a059c9..3bda9abbe03d 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -641,10 +641,9 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
                     continue;
                 }
 
-                // outputs and persistent inputs cannot be reused
-                if ((parent->flags & (GGML_TENSOR_FLAG_OUTPUT | GGML_TENSOR_FLAG_PERSIST)) ||
-                    (parent->view_src != NULL && (parent->view_src->flags & (GGML_TENSOR_FLAG_OUTPUT | GGML_TENSOR_FLAG_PERSIST)))) {
-                    AT_PRINTF("not reusing parent %s for %s as it is persistent\n", parent->name, node->name);
+                // outputs cannot be reused
+                if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
+                    AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
                     continue;
                 }
 
@@ -689,9 +688,9 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
 }
 
 static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
-    // graph outputs and persistent inputs are never freed
-    if (node->flags & (GGML_TENSOR_FLAG_OUTPUT | GGML_TENSOR_FLAG_PERSIST)) {
-        AT_PRINTF("not freeing persistent %s\n", node->name);
+    // graph outputs are never freed
+    if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
+        AT_PRINTF("not freeing output %s\n", node->name);
         return;
     }
 
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 778e413b0851..eee04a30de2e 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1203,8 +1203,9 @@ void llm_graph_context::set_diffusion_input_backend(ggml_tensor * tensor, uint32
         const char * env = getenv("DG_GPU_INPUT_GROUPS");
         // Keep the default to inputs used by the fixed diffusion decoder graph:
         // canvas/self-cond, positions, attention scale, KV indices, masks and
-        // rotary helpers. They are marked persistent below so the allocator will
-        // not overwrite them between denoising replays.
+        // rotary helpers. Mark them as outputs too, matching ggml-backend's
+        // existing copy-tensor convention, so the allocator will not overwrite
+        // them between denoising replays.
         return env ? (uint32_t) strtoul(env, nullptr, 0) : 63u; // 1|2|4|8|16|32
     }();
     if ((enabled_groups & group) == 0) {
@@ -1215,7 +1216,7 @@ void llm_graph_context::set_diffusion_input_backend(ggml_tensor * tensor, uint32
         ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
         if (backend && backend != backend_cpu &&
             ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-            tensor->flags |= GGML_TENSOR_FLAG_PERSIST;
+            ggml_set_output(tensor);
             ggml_backend_sched_set_tensor_backend(sched, tensor, backend);
             break;
         }

From 7a7c0a8e4c83dd88823a46eac024dfa5b090f2f3 Mon Sep 17 00:00:00 2001
From: Loveneet Nigam <lnigam@nvidia.com>
Date: Tue, 9 Jun 2026 17:24:15 +0530
Subject: [PATCH 06/13] diffusion-gemma: checkpoint device-loop early stop

---
 .../diffusion-gemma/diffusion-gemma-cli.cpp   | 33 ++++++++-
 .../diffusion-gemma-server.cpp                | 36 +++++++++-
 ggml/include/ggml-cuda.h                      |  6 ++
 ggml/src/ggml-cuda/diffusion-sampling.cu      | 69 ++++++++++++++++++-
 include/llama.h                               | 10 ++-
 src/llama-context.cpp                         |  6 ++
 6 files changed, 154 insertions(+), 6 deletions(-)

diff --git a/examples/diffusion-gemma/diffusion-gemma-cli.cpp b/examples/diffusion-gemma/diffusion-gemma-cli.cpp
index 853d5f36a649..6556df737402 100644
--- a/examples/diffusion-gemma/diffusion-gemma-cli.cpp
+++ b/examples/diffusion-gemma/diffusion-gemma-cli.cpp
@@ -209,6 +209,8 @@ int main(int argc, char ** argv) {
                                   llama_diffusion_sample_topk_supported(ctx);
     const bool use_device_self_cond = use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0;
     const bool use_device_loop      = use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0;
+    const int device_early_stop_interval = use_device_loop ?
+        std::max(env_int("DG_DEVICE_EARLY_STOP_INTERVAL", 0), 0) : 0;
     llama_set_diffusion_gpu_sampling(ctx, use_gpu_sampling);
 
     LOG_INF("diffusion-gemma: prefix=%d canvas=%d max_canvases=%d steps=%d entropy_bound=%.3f temp=[%.2f,%.2f] n_ctx=%d mm=%d\n",
@@ -219,6 +221,9 @@ int main(int argc, char ** argv) {
              (!gpu_sampling_requested ? " (disabled by DG_GPU_SAMPLING=0)" : "")),
             use_device_self_cond ? " | device self-cond: on" : "",
             use_device_loop ? " | device loop: on" : "");
+    if (device_early_stop_interval > 0) {
+        LOG_INF("diffusion-gemma: device early-stop interval=%d\n", device_early_stop_interval);
+    }
     if (topk_fixed > 0 || (topk_start > 0 && topk_end > 0)) {
         LOG_INF("diffusion-gemma: top-k sampling: fixed=%d anneal=[%d->%d] tail_correction=%d (vocab=%d)\n",
                 topk_fixed, topk_start, topk_end, topk_tail, n_vocab);
@@ -354,6 +359,12 @@ int main(int argc, char ** argv) {
 
         if (use_device_loop) {
             bool sampled_ok = true;
+            const int block_step = n_steps - cur_step + 1;
+            const bool use_device_early_stop = device_early_stop_interval > 0;
+            const bool reset_stop_state = use_device_early_stop && block_step == 1;
+            const bool check_stop = use_device_early_stop &&
+                (cur_step == 1 || (block_step % device_early_stop_interval) == 0);
+            int32_t stop_flag = 0;
             const auto t_sample_start = std::chrono::steady_clock::now();
             llama_diffusion_sample_params sample_params = {
                 /* .n_tokens              = */ canvas_length,
@@ -370,9 +381,15 @@ int main(int argc, char ** argv) {
                 /* .entropy         = */ nullptr,
                 /* .self_cond_ids   = */ nullptr,
                 /* .self_cond_probs = */ nullptr,
-                /* .final_tokens    = */ cur_step == 1 ? argmax_canvas.data() : nullptr,
-                /* .update_canvas_on_device = */ cur_step > 1,
+                /* .final_tokens    = */ (cur_step == 1 || check_stop) ? argmax_canvas.data() : nullptr,
+                /* .stop            = */ check_stop ? &stop_flag : nullptr,
                 /* .entropy_bound   = */ entropy_bound,
+                /* .confidence_threshold = */ CONFIDENCE_THRESHOLD,
+                /* .stability_threshold  = */ STABILITY_THRESHOLD,
+                /* .update_canvas_on_device = */ cur_step > 1,
+                /* .update_stop_state_on_device = */ use_device_early_stop,
+                /* .check_stop_on_device = */ check_stop,
+                /* .reset_stop_state = */ reset_stop_state,
             };
             sampled_ok = llama_diffusion_sample_topk(ctx, &sample_params, &sample_result);
             sample_sync_s += std::chrono::duration<double>(std::chrono::steady_clock::now() - t_sample_start).count();
@@ -383,6 +400,9 @@ int main(int argc, char ** argv) {
                 done = true;
                 break;
             }
+            if (check_stop && stop_flag != 0) {
+                break;
+            }
             continue;
         }
 
@@ -411,6 +431,15 @@ int main(int argc, char ** argv) {
                 /* .entropy         = */ entropy.data(),
                 /* .self_cond_ids   = */ use_device_self_cond ? nullptr : sc_ids.data(),
                 /* .self_cond_probs = */ use_device_self_cond ? nullptr : sc_probs.data(),
+                /* .final_tokens    = */ nullptr,
+                /* .stop            = */ nullptr,
+                /* .entropy_bound   = */ 0.0f,
+                /* .confidence_threshold = */ 0.0f,
+                /* .stability_threshold  = */ 0,
+                /* .update_canvas_on_device = */ false,
+                /* .update_stop_state_on_device = */ false,
+                /* .check_stop_on_device = */ false,
+                /* .reset_stop_state = */ false,
             };
             sampled_ok = llama_diffusion_sample_topk(ctx, &sample_params, &sample_result);
             if (!sampled_ok) {
diff --git a/examples/diffusion-gemma/diffusion-gemma-server.cpp b/examples/diffusion-gemma/diffusion-gemma-server.cpp
index 3985e2740d5a..d3e8c169eaf4 100644
--- a/examples/diffusion-gemma/diffusion-gemma-server.cpp
+++ b/examples/diffusion-gemma/diffusion-gemma-server.cpp
@@ -161,6 +161,7 @@ struct diffusion_server {
     bool   use_gpu_sampling = false;
     bool   use_device_self_cond = false;
     bool   use_device_loop = false;
+    int    device_early_stop_interval = 0;
     std::string model_id;
     std::string model_path;
     std::string build_info = "llama.cpp diffusion-gemma-server";
@@ -291,6 +292,12 @@ struct diffusion_server {
                 std::vector<llama_token> sampled(canvas_length);
 
                 if (use_device_loop_step) {
+                    const int block_step = n_steps - cur_step + 1;
+                    const bool use_device_early_stop = device_early_stop_interval > 0;
+                    const bool reset_stop_state = use_device_early_stop && block_step == 1;
+                    const bool check_stop = use_device_early_stop &&
+                        (cur_step == 1 || (block_step % device_early_stop_interval) == 0);
+                    int32_t stop_flag = 0;
                     llama_diffusion_sample_params sample_params = {
                         /* .n_tokens              = */ canvas_length,
                         /* .top_k                 = */ k_step,
@@ -306,9 +313,15 @@ struct diffusion_server {
                         /* .entropy         = */ nullptr,
                         /* .self_cond_ids   = */ nullptr,
                         /* .self_cond_probs = */ nullptr,
-                        /* .final_tokens    = */ cur_step == 1 ? argmax_canvas.data() : nullptr,
-                        /* .update_canvas_on_device = */ cur_step > 1,
+                        /* .final_tokens    = */ (cur_step == 1 || check_stop) ? argmax_canvas.data() : nullptr,
+                        /* .stop            = */ check_stop ? &stop_flag : nullptr,
                         /* .entropy_bound   = */ ENTROPY_BOUND,
+                        /* .confidence_threshold = */ CONFIDENCE_THRESHOLD,
+                        /* .stability_threshold  = */ STABILITY_THRESHOLD,
+                        /* .update_canvas_on_device = */ cur_step > 1,
+                        /* .update_stop_state_on_device = */ use_device_early_stop,
+                        /* .check_stop_on_device = */ check_stop,
+                        /* .reset_stop_state = */ reset_stop_state,
                     };
                     if (!llama_diffusion_sample_topk(ctx, &sample_params, &sample_result)) {
                         llama_memory_seq_rm(mem, 0, n_past, -1);
@@ -317,6 +330,9 @@ struct diffusion_server {
                         return out;
                     }
                     llama_memory_seq_rm(mem, 0, n_past, -1);
+                    if (check_stop && stop_flag != 0) {
+                        break;
+                    }
                     continue;
                 }
 
@@ -341,6 +357,15 @@ struct diffusion_server {
                         /* .entropy         = */ entropy.data(),
                         /* .self_cond_ids   = */ use_device_self_cond_step ? nullptr : sc_ids.data(),
                         /* .self_cond_probs = */ use_device_self_cond_step ? nullptr : sc_probs.data(),
+                        /* .final_tokens    = */ nullptr,
+                        /* .stop            = */ nullptr,
+                        /* .entropy_bound   = */ 0.0f,
+                        /* .confidence_threshold = */ 0.0f,
+                        /* .stability_threshold  = */ 0,
+                        /* .update_canvas_on_device = */ false,
+                        /* .update_stop_state_on_device = */ false,
+                        /* .check_stop_on_device = */ false,
+                        /* .reset_stop_state = */ false,
                     };
                     if (!llama_diffusion_sample_topk(ctx, &sample_params, &sample_result)) {
                         llama_memory_seq_rm(mem, 0, n_past, -1);
@@ -711,6 +736,8 @@ int main(int argc, char ** argv) {
                            llama_diffusion_sample_topk_supported(srv.ctx);
     srv.use_device_self_cond = srv.use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0;
     srv.use_device_loop = srv.use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0;
+    srv.device_early_stop_interval = srv.use_device_loop ?
+        std::max(env_int("DG_DEVICE_EARLY_STOP_INTERVAL", 0), 0) : 0;
     llama_set_diffusion_gpu_sampling(srv.ctx, srv.use_gpu_sampling);
     const bool default_topk_gpu_ok = topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K;
 
@@ -726,6 +753,9 @@ int main(int argc, char ** argv) {
             srv.use_device_self_cond ? " | device self-cond: on" : "",
             srv.use_device_loop ? " | device loop: on" : "",
             srv.topk_fixed, srv.topk_start, srv.topk_end, srv.topk_tail ? 1 : 0);
+    if (srv.device_early_stop_interval > 0) {
+        SRV_INF("device early-stop interval: %d\n", srv.device_early_stop_interval);
+    }
 
     // ------------------------------------------------------------------------------------------
     // HTTP server
@@ -797,6 +827,7 @@ int main(int argc, char ** argv) {
                 {"gpu_sampling", srv.use_gpu_sampling},
                 {"device_self_cond", srv.use_device_self_cond},
                 {"device_loop", srv.use_device_loop},
+                {"device_early_stop_interval", srv.device_early_stop_interval},
                 {"top_k", srv.topk_fixed},
                 {"top_k_start", srv.topk_start},
                 {"top_k_end", srv.topk_end},
@@ -905,6 +936,7 @@ int main(int argc, char ** argv) {
                     {"gpu_sampling", srv.use_gpu_sampling},
                     {"device_self_cond", srv.use_device_self_cond},
                     {"device_loop", srv.use_device_loop},
+                    {"device_early_stop_interval", srv.device_early_stop_interval},
                     {"top_k", srv.topk_fixed},
                     {"top_k_start", srv.topk_start},
                     {"top_k_end", srv.topk_end},
diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h
index d966e5a3d0d7..cb1136ba856d 100644
--- a/ggml/include/ggml-cuda.h
+++ b/ggml/include/ggml-cuda.h
@@ -64,8 +64,14 @@ struct ggml_cuda_diffusion_sample_result {
     struct ggml_tensor * self_cond_probs_tensor;
     struct ggml_tensor * canvas_tokens_tensor;
     int32_t * final_tokens;
+    int32_t * stop;
     float entropy_bound;
+    float confidence_threshold;
+    int32_t stability_threshold;
     bool update_canvas_on_device;
+    bool update_stop_state_on_device;
+    bool check_stop_on_device;
+    bool reset_stop_state;
 };
 
 typedef bool (*ggml_backend_cuda_diffusion_sample_topk_t)(
diff --git a/ggml/src/ggml-cuda/diffusion-sampling.cu b/ggml/src/ggml-cuda/diffusion-sampling.cu
index f0b6f53e3146..81cb0056d352 100644
--- a/ggml/src/ggml-cuda/diffusion-sampling.cu
+++ b/ggml/src/ggml-cuda/diffusion-sampling.cu
@@ -35,6 +35,8 @@ struct diffusion_sample_scratch {
     int   * top_ids  = nullptr;
     int   * sampled  = nullptr;
     int   * argmax   = nullptr;
+    int   * prev_argmax = nullptr;
+    int   * stop     = nullptr;
     float * entropy  = nullptr;
     int   * sc_ids   = nullptr;
     float * sc_probs = nullptr;
@@ -42,6 +44,8 @@ struct diffusion_sample_scratch {
     size_t top_ids_cap  = 0;
     size_t sampled_cap  = 0;
     size_t argmax_cap   = 0;
+    size_t prev_argmax_cap = 0;
+    size_t stop_cap     = 0;
     size_t entropy_cap  = 0;
     size_t sc_ids_cap   = 0;
     size_t sc_probs_cap = 0;
@@ -80,6 +84,8 @@ static diffusion_sample_scratch * diffusion_get_scratch(
     diffusion_scratch_reserve(stream, &scratch.top_ids,  &scratch.top_ids_cap,  (size_t) n_tokens * heap_k, &synced);
     diffusion_scratch_reserve(stream, &scratch.sampled,  &scratch.sampled_cap,  (size_t) n_tokens,          &synced);
     diffusion_scratch_reserve(stream, &scratch.argmax,   &scratch.argmax_cap,   (size_t) n_tokens,          &synced);
+    diffusion_scratch_reserve(stream, &scratch.prev_argmax, &scratch.prev_argmax_cap, (size_t) n_tokens,     &synced);
+    diffusion_scratch_reserve(stream, &scratch.stop,     &scratch.stop_cap,     (size_t) 1,                 &synced);
     diffusion_scratch_reserve(stream, &scratch.entropy,  &scratch.entropy_cap,  (size_t) n_tokens,          &synced);
     diffusion_scratch_reserve(stream, &scratch.sc_ids,   &scratch.sc_ids_cap,   (size_t) n_tokens * sc_k,   &synced);
     diffusion_scratch_reserve(stream, &scratch.sc_probs, &scratch.sc_probs_cap, (size_t) n_tokens * sc_k,   &synced);
@@ -357,6 +363,51 @@ static __global__ void diffusion_update_canvas_kernel(
     }
 }
 
+static __global__ void diffusion_stop_state_kernel(
+        const float * __restrict__ entropy,
+        const int * __restrict__ argmax,
+        int * __restrict__ prev_argmax,
+        int * __restrict__ stop,
+        const int n_tokens,
+        const float confidence_threshold,
+        const int stability_threshold,
+        const int check_stop,
+        const int reset_state) {
+    const int tid = threadIdx.x;
+
+    __shared__ float s_entropy[1024];
+    __shared__ int   s_diff[1024];
+
+    if (tid < n_tokens) {
+        s_entropy[tid] = entropy[tid];
+        s_diff[tid] = reset_state ? 1 : (prev_argmax[tid] != argmax[tid]);
+    } else {
+        s_entropy[tid] = 0.0f;
+        s_diff[tid] = 0;
+    }
+    __syncthreads();
+
+    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
+        if (tid < stride) {
+            s_entropy[tid] += s_entropy[tid + stride];
+            s_diff[tid] += s_diff[tid + stride];
+        }
+        __syncthreads();
+    }
+
+    if (tid == 0 && check_stop) {
+        const bool stable = !reset_state && (stability_threshold == 0 || s_diff[0] == 0);
+        const bool confident = confidence_threshold > 0.0f &&
+                (s_entropy[0] / (float) n_tokens) < confidence_threshold;
+        stop[0] = (stable && confident) ? 1 : 0;
+    }
+    __syncthreads();
+
+    if (tid < n_tokens) {
+        prev_argmax[tid] = argmax[tid];
+    }
+}
+
 static __global__ void diffusion_sample_kernel(
         const float * __restrict__ logits,
         const int * __restrict__ top_ids,
@@ -550,6 +601,11 @@ bool ggml_cuda_diffusion_sample_topk(
             return false;
         }
     }
+    if (result->update_stop_state_on_device || result->check_stop_on_device || result->reset_stop_state) {
+        if (n_tokens > 1024 || (result->check_stop_on_device && result->stop == nullptr)) {
+            return false;
+        }
+    }
 
     const bool have_self_cond_host = result->self_cond_ids != nullptr && result->self_cond_probs != nullptr;
     const bool have_self_cond_tensor = result->self_cond_ids_tensor != nullptr && result->self_cond_probs_tensor != nullptr;
@@ -651,6 +707,13 @@ bool ggml_cuda_diffusion_sample_topk(
                 scratch->entropy, scratch->sampled, (int *) result->canvas_tokens_tensor->data,
                 n_tokens, n_vocab, result->entropy_bound, params->seed, params->step);
     }
+    if (result->update_stop_state_on_device || result->check_stop_on_device || result->reset_stop_state) {
+        const int stop_threads = next_power_of_2_host(n_tokens);
+        diffusion_stop_state_kernel<<<1, stop_threads, 0, stream>>>(
+                scratch->entropy, scratch->argmax, scratch->prev_argmax, scratch->stop,
+                n_tokens, result->confidence_threshold, result->stability_threshold,
+                result->check_stop_on_device ? 1 : 0, result->reset_stop_state ? 1 : 0);
+    }
 
     if (result->sampled) {
         CUDA_CHECK(cudaMemcpyAsync(result->sampled, scratch->sampled, (size_t) n_tokens * sizeof(int),
@@ -668,6 +731,10 @@ bool ggml_cuda_diffusion_sample_topk(
         CUDA_CHECK(cudaMemcpyAsync(result->final_tokens, scratch->argmax, (size_t) n_tokens * sizeof(int),
                                    cudaMemcpyDeviceToHost, stream));
     }
+    if (result->stop) {
+        CUDA_CHECK(cudaMemcpyAsync(result->stop, scratch->stop, sizeof(int),
+                                   cudaMemcpyDeviceToHost, stream));
+    }
     if (have_self_cond_host) {
         CUDA_CHECK(cudaMemcpyAsync(result->self_cond_ids, scratch->sc_ids, (size_t) n_tokens * sc_k * sizeof(int),
                                    cudaMemcpyDeviceToHost, stream));
@@ -682,7 +749,7 @@ bool ggml_cuda_diffusion_sample_topk(
     }
 
     const bool host_outputs_requested = result->sampled || result->argmax || result->entropy ||
-        result->final_tokens || have_self_cond_host;
+        result->final_tokens || result->stop || have_self_cond_host;
     CUDA_CHECK(cudaGetLastError());
     if (sync_required || host_outputs_requested) {
         CUDA_CHECK(cudaStreamSynchronize(stream));
diff --git a/include/llama.h b/include/llama.h
index 0e39b71a388e..089dd2747a8b 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1038,8 +1038,16 @@ extern "C" {
         // decoder token input tensor. final_tokens is copied only when requested, typically on
         // the final denoising step.
         llama_token * final_tokens;
-        bool          update_canvas_on_device;
+        // Optional host stop flag for device-loop early stopping. When requested, CUDA computes
+        // the stable+confident condition on device and copies only this flag plus final_tokens.
+        int32_t     * stop;
         float         entropy_bound;
+        float         confidence_threshold;
+        int32_t       stability_threshold;
+        bool          update_canvas_on_device;
+        bool          update_stop_state_on_device;
+        bool          check_stop_on_device;
+        bool          reset_stop_state;
     };
 
     // CUDA-only fast path for block-diffusion sampling. When enabled, llama_decode()
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index fd6c8bc99a36..cf3b9760a546 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1294,8 +1294,14 @@ bool llama_context::diffusion_sample_topk(
         /* .self_cond_probs_tensor = */ t_self_cond_probs,
         /* .canvas_tokens_tensor   = */ t_canvas_tokens,
         /* .final_tokens           = */ (int32_t *) result->final_tokens,
+        /* .stop                   = */ result->stop,
         /* .entropy_bound          = */ result->entropy_bound,
+        /* .confidence_threshold   = */ result->confidence_threshold,
+        /* .stability_threshold    = */ result->stability_threshold,
         /* .update_canvas_on_device = */ result->update_canvas_on_device,
+        /* .update_stop_state_on_device = */ result->update_stop_state_on_device,
+        /* .check_stop_on_device   = */ result->check_stop_on_device,
+        /* .reset_stop_state       = */ result->reset_stop_state,
     };
 
     const bool ok = sample_proc(backend, t_logits, &cuda_params, &cuda_result);

From 4823ee09e91f2914234e4ecba68ad32e05088225 Mon Sep 17 00:00:00 2001
From: Loveneet Nigam <lnigam@nvidia.com>
Date: Tue, 9 Jun 2026 17:51:29 +0530
Subject: [PATCH 07/13] diffusion-gemma: make device early stop every step

---
 examples/diffusion-gemma/diffusion-gemma-cli.cpp    | 6 ++----
 examples/diffusion-gemma/diffusion-gemma-server.cpp | 6 ++----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/examples/diffusion-gemma/diffusion-gemma-cli.cpp b/examples/diffusion-gemma/diffusion-gemma-cli.cpp
index 6556df737402..87b99b0aa31b 100644
--- a/examples/diffusion-gemma/diffusion-gemma-cli.cpp
+++ b/examples/diffusion-gemma/diffusion-gemma-cli.cpp
@@ -209,8 +209,7 @@ int main(int argc, char ** argv) {
                                   llama_diffusion_sample_topk_supported(ctx);
     const bool use_device_self_cond = use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0;
     const bool use_device_loop      = use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0;
-    const int device_early_stop_interval = use_device_loop ?
-        std::max(env_int("DG_DEVICE_EARLY_STOP_INTERVAL", 0), 0) : 0;
+    const int device_early_stop_interval = use_device_loop ? 1 : 0;
     llama_set_diffusion_gpu_sampling(ctx, use_gpu_sampling);
 
     LOG_INF("diffusion-gemma: prefix=%d canvas=%d max_canvases=%d steps=%d entropy_bound=%.3f temp=[%.2f,%.2f] n_ctx=%d mm=%d\n",
@@ -362,8 +361,7 @@ int main(int argc, char ** argv) {
             const int block_step = n_steps - cur_step + 1;
             const bool use_device_early_stop = device_early_stop_interval > 0;
             const bool reset_stop_state = use_device_early_stop && block_step == 1;
-            const bool check_stop = use_device_early_stop &&
-                (cur_step == 1 || (block_step % device_early_stop_interval) == 0);
+            const bool check_stop = use_device_early_stop;
             int32_t stop_flag = 0;
             const auto t_sample_start = std::chrono::steady_clock::now();
             llama_diffusion_sample_params sample_params = {
diff --git a/examples/diffusion-gemma/diffusion-gemma-server.cpp b/examples/diffusion-gemma/diffusion-gemma-server.cpp
index d3e8c169eaf4..e14920644966 100644
--- a/examples/diffusion-gemma/diffusion-gemma-server.cpp
+++ b/examples/diffusion-gemma/diffusion-gemma-server.cpp
@@ -295,8 +295,7 @@ struct diffusion_server {
                     const int block_step = n_steps - cur_step + 1;
                     const bool use_device_early_stop = device_early_stop_interval > 0;
                     const bool reset_stop_state = use_device_early_stop && block_step == 1;
-                    const bool check_stop = use_device_early_stop &&
-                        (cur_step == 1 || (block_step % device_early_stop_interval) == 0);
+                    const bool check_stop = use_device_early_stop;
                     int32_t stop_flag = 0;
                     llama_diffusion_sample_params sample_params = {
                         /* .n_tokens              = */ canvas_length,
@@ -736,8 +735,7 @@ int main(int argc, char ** argv) {
                            llama_diffusion_sample_topk_supported(srv.ctx);
     srv.use_device_self_cond = srv.use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0;
     srv.use_device_loop = srv.use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0;
-    srv.device_early_stop_interval = srv.use_device_loop ?
-        std::max(env_int("DG_DEVICE_EARLY_STOP_INTERVAL", 0), 0) : 0;
+    srv.device_early_stop_interval = srv.use_device_loop ? 1 : 0;
     llama_set_diffusion_gpu_sampling(srv.ctx, srv.use_gpu_sampling);
     const bool default_topk_gpu_ok = topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K;
 

From 2a3dcf5764eea43299ba2be1b9015b4944ecabe0 Mon Sep 17 00:00:00 2001
From: Loveneet Nigam <lnigam@nvidia.com>
Date: Tue, 9 Jun 2026 23:38:45 +0530
Subject: [PATCH 08/13] examples: update diffusion gemma mtmd bitmap loading

---
 examples/diffusion-gemma/diffusion-gemma-cli.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/diffusion-gemma/diffusion-gemma-cli.cpp b/examples/diffusion-gemma/diffusion-gemma-cli.cpp
index 87b99b0aa31b..8a3e5cbbcd97 100644
--- a/examples/diffusion-gemma/diffusion-gemma-cli.cpp
+++ b/examples/diffusion-gemma/diffusion-gemma-cli.cpp
@@ -140,13 +140,13 @@ int main(int argc, char ** argv) {
         mtmd::bitmaps bitmaps;
         std::string markers;
         for (const auto & img : params.image) {
-            mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(mctx_vision.get(), img.c_str()));
-            if (!bmp.ptr) {
+            auto bmp_res = mtmd_helper_bitmap_init_from_file(mctx_vision.get(), img.c_str(), false);
+            if (!bmp_res.bitmap) {
                 LOG_ERR("error: failed to load image '%s'\n", img.c_str());
                 llama_model_free(model);
                 return 1;
             }
-            bitmaps.entries.push_back(std::move(bmp));
+            bitmaps.entries.emplace_back(bmp_res.bitmap);
             markers += mtmd_default_marker();
             markers += "\n";
         }

From 1347c3d5fa93a9c363326f66609f3aaa967392d8 Mon Sep 17 00:00:00 2001
From: Loveneet Nigam <lnigam@nvidia.com>
Date: Wed, 10 Jun 2026 01:11:44 +0530
Subject: [PATCH 09/13] diffusion: add gated kernel optimization experiments

---
 examples/diffusion-gemma/diffusion-gemma-cli.cpp    |  6 ++++--
 examples/diffusion-gemma/diffusion-gemma-server.cpp |  6 ++++--
 ggml/src/ggml-cuda/diffusion-sampling.cu            | 13 +++++++++++--
 ggml/src/ggml-cuda/mmq.cu                           | 13 ++++++++++---
 4 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/examples/diffusion-gemma/diffusion-gemma-cli.cpp b/examples/diffusion-gemma/diffusion-gemma-cli.cpp
index 8a3e5cbbcd97..ee703087b7bd 100644
--- a/examples/diffusion-gemma/diffusion-gemma-cli.cpp
+++ b/examples/diffusion-gemma/diffusion-gemma-cli.cpp
@@ -209,7 +209,8 @@ int main(int argc, char ** argv) {
                                   llama_diffusion_sample_topk_supported(ctx);
     const bool use_device_self_cond = use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0;
     const bool use_device_loop      = use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0;
-    const int device_early_stop_interval = use_device_loop ? 1 : 0;
+    const int device_early_stop_interval = use_device_loop ?
+        std::max(env_int("DG_DEVICE_EARLY_STOP_INTERVAL", 1), 0) : 0;
     llama_set_diffusion_gpu_sampling(ctx, use_gpu_sampling);
 
     LOG_INF("diffusion-gemma: prefix=%d canvas=%d max_canvases=%d steps=%d entropy_bound=%.3f temp=[%.2f,%.2f] n_ctx=%d mm=%d\n",
@@ -361,7 +362,8 @@ int main(int argc, char ** argv) {
             const int block_step = n_steps - cur_step + 1;
             const bool use_device_early_stop = device_early_stop_interval > 0;
             const bool reset_stop_state = use_device_early_stop && block_step == 1;
-            const bool check_stop = use_device_early_stop;
+            const bool check_stop = use_device_early_stop &&
+                (cur_step == 1 || (block_step % device_early_stop_interval) == 0);
             int32_t stop_flag = 0;
             const auto t_sample_start = std::chrono::steady_clock::now();
             llama_diffusion_sample_params sample_params = {
diff --git a/examples/diffusion-gemma/diffusion-gemma-server.cpp b/examples/diffusion-gemma/diffusion-gemma-server.cpp
index e14920644966..708e2e0af48f 100644
--- a/examples/diffusion-gemma/diffusion-gemma-server.cpp
+++ b/examples/diffusion-gemma/diffusion-gemma-server.cpp
@@ -295,7 +295,8 @@ struct diffusion_server {
                     const int block_step = n_steps - cur_step + 1;
                     const bool use_device_early_stop = device_early_stop_interval > 0;
                     const bool reset_stop_state = use_device_early_stop && block_step == 1;
-                    const bool check_stop = use_device_early_stop;
+                    const bool check_stop = use_device_early_stop &&
+                        (cur_step == 1 || (block_step % device_early_stop_interval) == 0);
                     int32_t stop_flag = 0;
                     llama_diffusion_sample_params sample_params = {
                         /* .n_tokens              = */ canvas_length,
@@ -735,7 +736,8 @@ int main(int argc, char ** argv) {
                            llama_diffusion_sample_topk_supported(srv.ctx);
     srv.use_device_self_cond = srv.use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0;
     srv.use_device_loop = srv.use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0;
-    srv.device_early_stop_interval = srv.use_device_loop ? 1 : 0;
+    srv.device_early_stop_interval = srv.use_device_loop ?
+        std::max(env_int("DG_DEVICE_EARLY_STOP_INTERVAL", 1), 0) : 0;
     llama_set_diffusion_gpu_sampling(srv.ctx, srv.use_gpu_sampling);
     const bool default_topk_gpu_ok = topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K;
 
diff --git a/ggml/src/ggml-cuda/diffusion-sampling.cu b/ggml/src/ggml-cuda/diffusion-sampling.cu
index 81cb0056d352..da8d05680c55 100644
--- a/ggml/src/ggml-cuda/diffusion-sampling.cu
+++ b/ggml/src/ggml-cuda/diffusion-sampling.cu
@@ -31,6 +31,11 @@ static bool diffusion_fast_topk_enabled() {
     return !v || std::strcmp(v, "0") != 0;
 }
 
+static bool diffusion_direct_selfcond_enabled() {
+    const char * v = std::getenv("GGML_CUDA_DIFFUSION_DIRECT_SELFCOND");
+    return v && std::strcmp(v, "0") != 0;
+}
+
 struct diffusion_sample_scratch {
     int   * top_ids  = nullptr;
     int   * sampled  = nullptr;
@@ -694,12 +699,16 @@ bool ggml_cuda_diffusion_sample_topk(
                 logits_d, top_ids, n_vocab, heap_k, heap_k_pad, inv_temp);
     }
 
+    const bool direct_self_cond_tensor = have_self_cond_tensor && !have_self_cond_host && diffusion_direct_selfcond_enabled();
+    int * sc_ids_out = direct_self_cond_tensor ? (int *) result->self_cond_ids_tensor->data : scratch->sc_ids;
+    float * sc_probs_out = direct_self_cond_tensor ? (float *) result->self_cond_probs_tensor->data : scratch->sc_probs;
+
     constexpr int block_size = 256;
     diffusion_sample_kernel<<<n_tokens, block_size, 0, stream>>>(
             logits_d, top_ids, n_vocab, n_tokens, top_k, heap_k, sc_k, inv_temp,
             params->seed, params->step, params->top_k_tail_correction,
             scratch->sampled, scratch->argmax, scratch->entropy,
-            scratch->sc_ids, scratch->sc_probs);
+            sc_ids_out, sc_probs_out);
 
     if (result->update_canvas_on_device) {
         const int update_threads = next_power_of_2_host(n_tokens);
@@ -741,7 +750,7 @@ bool ggml_cuda_diffusion_sample_topk(
         CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs, scratch->sc_probs, (size_t) n_tokens * sc_k * sizeof(float),
                                    cudaMemcpyDeviceToHost, stream));
     }
-    if (have_self_cond_tensor) {
+    if (have_self_cond_tensor && !direct_self_cond_tensor) {
         CUDA_CHECK(cudaMemcpyAsync(result->self_cond_ids_tensor->data, scratch->sc_ids, (size_t) n_tokens * sc_k * sizeof(int),
                                    cudaMemcpyDeviceToDevice, stream));
         CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs_tensor->data, scratch->sc_probs, (size_t) n_tokens * sc_k * sizeof(float),
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index e1add5e03316..a07e49493e06 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -3,6 +3,13 @@
 #include "quantize.cuh"
 #include "mmid.cuh"
 
+#include <cstdlib>
+
+static bool ggml_cuda_mmq_stream_k_enabled() {
+    const char * v = std::getenv("GGML_CUDA_MMQ_STREAM_K");
+    return !v || std::atoi(v) != 0;
+}
+
 static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
     switch (args.type_x) {
         case GGML_TYPE_Q1_0:
@@ -118,8 +125,8 @@ void ggml_cuda_mul_mat_q(
     const int64_t s03 = src0->nb[3] / ts_src0;
     const int64_t s3  =  dst->nb[3] / ts_dst;
 
-    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
-                            || GGML_CUDA_CC_IS_CDNA(cc);
+    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+                            || GGML_CUDA_CC_IS_CDNA(cc)) && ggml_cuda_mmq_stream_k_enabled();
 
     // TODO: tighter pool buffer size vs q8 path
     const bool use_native_fp4 = blackwell_mma_available(cc) && (src0->type == GGML_TYPE_MXFP4 || src0->type == GGML_TYPE_NVFP4);
@@ -251,7 +258,7 @@ void ggml_cuda_op_mul_mat_q(
     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
     const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
                             || GGML_CUDA_CC_IS_CDNA(cc))
-                            && src1_ncols == ne11;
+                            && src1_ncols == ne11 && ggml_cuda_mmq_stream_k_enabled();
     const mmq_args args = {
         src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
         ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst,

From a83d05d8110f29f6c343b9df6c0d0a9283ea1036 Mon Sep 17 00:00:00 2001
From: Loveneet Nigam <lnigam@nvidia.com>
Date: Wed, 10 Jun 2026 01:19:47 +0530
Subject: [PATCH 10/13] diffusion: poll device stop state every denoise step

---
 .../diffusion-gemma/diffusion-gemma-cli.cpp    |  9 ++++-----
 .../diffusion-gemma/diffusion-gemma-server.cpp |  9 ++++-----
 ggml/src/ggml-cuda/diffusion-sampling.cu       | 18 ++++++++++++++++--
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/examples/diffusion-gemma/diffusion-gemma-cli.cpp b/examples/diffusion-gemma/diffusion-gemma-cli.cpp
index ee703087b7bd..ad7059805a34 100644
--- a/examples/diffusion-gemma/diffusion-gemma-cli.cpp
+++ b/examples/diffusion-gemma/diffusion-gemma-cli.cpp
@@ -209,8 +209,7 @@ int main(int argc, char ** argv) {
                                   llama_diffusion_sample_topk_supported(ctx);
     const bool use_device_self_cond = use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0;
     const bool use_device_loop      = use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0;
-    const int device_early_stop_interval = use_device_loop ?
-        std::max(env_int("DG_DEVICE_EARLY_STOP_INTERVAL", 1), 0) : 0;
+    const int device_early_stop_interval = use_device_loop ? 1 : 0;
     llama_set_diffusion_gpu_sampling(ctx, use_gpu_sampling);
 
     LOG_INF("diffusion-gemma: prefix=%d canvas=%d max_canvases=%d steps=%d entropy_bound=%.3f temp=[%.2f,%.2f] n_ctx=%d mm=%d\n",
@@ -360,10 +359,10 @@ int main(int argc, char ** argv) {
         if (use_device_loop) {
             bool sampled_ok = true;
             const int block_step = n_steps - cur_step + 1;
+            const bool final_step = cur_step == 1;
             const bool use_device_early_stop = device_early_stop_interval > 0;
             const bool reset_stop_state = use_device_early_stop && block_step == 1;
-            const bool check_stop = use_device_early_stop &&
-                (cur_step == 1 || (block_step % device_early_stop_interval) == 0);
+            const bool check_stop = use_device_early_stop && !final_step;
             int32_t stop_flag = 0;
             const auto t_sample_start = std::chrono::steady_clock::now();
             llama_diffusion_sample_params sample_params = {
@@ -381,7 +380,7 @@ int main(int argc, char ** argv) {
                 /* .entropy         = */ nullptr,
                 /* .self_cond_ids   = */ nullptr,
                 /* .self_cond_probs = */ nullptr,
-                /* .final_tokens    = */ (cur_step == 1 || check_stop) ? argmax_canvas.data() : nullptr,
+                /* .final_tokens    = */ (final_step || check_stop) ? argmax_canvas.data() : nullptr,
                 /* .stop            = */ check_stop ? &stop_flag : nullptr,
                 /* .entropy_bound   = */ entropy_bound,
                 /* .confidence_threshold = */ CONFIDENCE_THRESHOLD,
diff --git a/examples/diffusion-gemma/diffusion-gemma-server.cpp b/examples/diffusion-gemma/diffusion-gemma-server.cpp
index 708e2e0af48f..f651e32a98bf 100644
--- a/examples/diffusion-gemma/diffusion-gemma-server.cpp
+++ b/examples/diffusion-gemma/diffusion-gemma-server.cpp
@@ -293,10 +293,10 @@ struct diffusion_server {
 
                 if (use_device_loop_step) {
                     const int block_step = n_steps - cur_step + 1;
+                    const bool final_step = cur_step == 1;
                     const bool use_device_early_stop = device_early_stop_interval > 0;
                     const bool reset_stop_state = use_device_early_stop && block_step == 1;
-                    const bool check_stop = use_device_early_stop &&
-                        (cur_step == 1 || (block_step % device_early_stop_interval) == 0);
+                    const bool check_stop = use_device_early_stop && !final_step;
                     int32_t stop_flag = 0;
                     llama_diffusion_sample_params sample_params = {
                         /* .n_tokens              = */ canvas_length,
@@ -313,7 +313,7 @@ struct diffusion_server {
                         /* .entropy         = */ nullptr,
                         /* .self_cond_ids   = */ nullptr,
                         /* .self_cond_probs = */ nullptr,
-                        /* .final_tokens    = */ (cur_step == 1 || check_stop) ? argmax_canvas.data() : nullptr,
+                        /* .final_tokens    = */ (final_step || check_stop) ? argmax_canvas.data() : nullptr,
                         /* .stop            = */ check_stop ? &stop_flag : nullptr,
                         /* .entropy_bound   = */ ENTROPY_BOUND,
                         /* .confidence_threshold = */ CONFIDENCE_THRESHOLD,
@@ -736,8 +736,7 @@ int main(int argc, char ** argv) {
                            llama_diffusion_sample_topk_supported(srv.ctx);
     srv.use_device_self_cond = srv.use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0;
     srv.use_device_loop = srv.use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0;
-    srv.device_early_stop_interval = srv.use_device_loop ?
-        std::max(env_int("DG_DEVICE_EARLY_STOP_INTERVAL", 1), 0) : 0;
+    srv.device_early_stop_interval = srv.use_device_loop ? 1 : 0;
     llama_set_diffusion_gpu_sampling(srv.ctx, srv.use_gpu_sampling);
     const bool default_topk_gpu_ok = topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K;
 
diff --git a/ggml/src/ggml-cuda/diffusion-sampling.cu b/ggml/src/ggml-cuda/diffusion-sampling.cu
index da8d05680c55..73e1472119ea 100644
--- a/ggml/src/ggml-cuda/diffusion-sampling.cu
+++ b/ggml/src/ggml-cuda/diffusion-sampling.cu
@@ -36,6 +36,11 @@ static bool diffusion_direct_selfcond_enabled() {
     return v && std::strcmp(v, "0") != 0;
 }
 
+static bool diffusion_final_tokens_on_stop_enabled() {
+    const char * v = std::getenv("GGML_CUDA_DIFFUSION_FINAL_TOKENS_ON_STOP");
+    return v && std::strcmp(v, "0") != 0;
+}
+
 struct diffusion_sample_scratch {
     int   * top_ids  = nullptr;
     int   * sampled  = nullptr;
@@ -736,7 +741,11 @@ bool ggml_cuda_diffusion_sample_topk(
         CUDA_CHECK(cudaMemcpyAsync(result->entropy, scratch->entropy, (size_t) n_tokens * sizeof(float),
                                    cudaMemcpyDeviceToHost, stream));
     }
-    if (result->final_tokens) {
+    const bool final_tokens_after_stop =
+        result->final_tokens && result->stop && result->check_stop_on_device &&
+        diffusion_final_tokens_on_stop_enabled();
+
+    if (result->final_tokens && !final_tokens_after_stop) {
         CUDA_CHECK(cudaMemcpyAsync(result->final_tokens, scratch->argmax, (size_t) n_tokens * sizeof(int),
                                    cudaMemcpyDeviceToHost, stream));
     }
@@ -758,10 +767,15 @@ bool ggml_cuda_diffusion_sample_topk(
     }
 
     const bool host_outputs_requested = result->sampled || result->argmax || result->entropy ||
-        result->final_tokens || result->stop || have_self_cond_host;
+        (result->final_tokens && !final_tokens_after_stop) || result->stop || have_self_cond_host;
     CUDA_CHECK(cudaGetLastError());
     if (sync_required || host_outputs_requested) {
         CUDA_CHECK(cudaStreamSynchronize(stream));
     }
+    if (final_tokens_after_stop && *result->stop != 0) {
+        CUDA_CHECK(cudaMemcpyAsync(result->final_tokens, scratch->argmax, (size_t) n_tokens * sizeof(int),
+                                   cudaMemcpyDeviceToHost, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
+    }
     return true;
 }

From 1f1e91c826f1708ae511087b1c0c7c4e9d3c9cef Mon Sep 17 00:00:00 2001
From: Loveneet Nigam <lnigam@nvidia.com>
Date: Wed, 10 Jun 2026 01:33:53 +0530
Subject: [PATCH 11/13] diffusion: add gated topk and mmq kernel experiments

---
 ggml/src/ggml-cuda/diffusion-sampling.cu | 381 +++++++++++++++++++----
 ggml/src/ggml-cuda/mmq.cuh               |  54 +++-
 2 files changed, 375 insertions(+), 60 deletions(-)

diff --git a/ggml/src/ggml-cuda/diffusion-sampling.cu b/ggml/src/ggml-cuda/diffusion-sampling.cu
index 73e1472119ea..d28a397fad7b 100644
--- a/ggml/src/ggml-cuda/diffusion-sampling.cu
+++ b/ggml/src/ggml-cuda/diffusion-sampling.cu
@@ -41,6 +41,41 @@ static bool diffusion_final_tokens_on_stop_enabled() {
     return v && std::strcmp(v, "0") != 0;
 }
 
+static bool diffusion_fused_topk_sample_enabled() {
+    const char * v = std::getenv("GGML_CUDA_DIFFUSION_FUSED_TOPK_SAMPLE");
+    return v && std::strcmp(v, "0") != 0;
+}
+
+static bool diffusion_tight_topk_enabled() {
+    const char * v = std::getenv("GGML_CUDA_DIFFUSION_TIGHT_TOPK");
+    return v && std::strcmp(v, "0") != 0;
+}
+
+static int diffusion_topk_local_k(const int heap_k) {
+    int requested = heap_k <= 256 ? 8 : 16;
+    if (const char * v = std::getenv("GGML_CUDA_DIFFUSION_TOPK_LOCAL_K")) {
+        requested = std::atoi(v);
+    }
+
+    const int min_local_k = std::max(1, (heap_k + 255) / 256);
+    if (requested <= 1) {
+        requested = 1;
+    } else if (requested <= 2) {
+        requested = 2;
+    } else if (requested <= 4) {
+        requested = 4;
+    } else if (requested <= 8) {
+        requested = 8;
+    } else {
+        requested = 16;
+    }
+
+    while (requested < min_local_k && requested < 16) {
+        requested <<= 1;
+    }
+    return requested;
+}
+
 struct diffusion_sample_scratch {
     int   * top_ids  = nullptr;
     int   * sampled  = nullptr;
@@ -194,17 +229,39 @@ static void diffusion_select_topk_local(
         const int heap_k,
         cudaStream_t stream) {
     constexpr int block_size = 256;
-
-    if (heap_k <= 256) {
-        constexpr int local_k = 8;
-        const size_t smem = (size_t) local_k * block_size * (sizeof(float) + sizeof(int));
-        diffusion_select_topk_local_kernel<local_k><<<n_tokens, block_size, smem, stream>>>(
-                logits, top_ids, n_vocab, heap_k);
-    } else {
-        constexpr int local_k = 16;
-        const size_t smem = (size_t) local_k * block_size * (sizeof(float) + sizeof(int));
-        diffusion_select_topk_local_kernel<local_k><<<n_tokens, block_size, smem, stream>>>(
-                logits, top_ids, n_vocab, heap_k);
+    const int local_k = diffusion_topk_local_k(heap_k);
+
+    switch (local_k) {
+        case 1: {
+            constexpr int local_k_t = 1;
+            const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int));
+            diffusion_select_topk_local_kernel<local_k_t><<<n_tokens, block_size, smem, stream>>>(
+                    logits, top_ids, n_vocab, heap_k);
+        } break;
+        case 2: {
+            constexpr int local_k_t = 2;
+            const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int));
+            diffusion_select_topk_local_kernel<local_k_t><<<n_tokens, block_size, smem, stream>>>(
+                    logits, top_ids, n_vocab, heap_k);
+        } break;
+        case 4: {
+            constexpr int local_k_t = 4;
+            const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int));
+            diffusion_select_topk_local_kernel<local_k_t><<<n_tokens, block_size, smem, stream>>>(
+                    logits, top_ids, n_vocab, heap_k);
+        } break;
+        case 8: {
+            constexpr int local_k_t = 8;
+            const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int));
+            diffusion_select_topk_local_kernel<local_k_t><<<n_tokens, block_size, smem, stream>>>(
+                    logits, top_ids, n_vocab, heap_k);
+        } break;
+        default: {
+            constexpr int local_k_t = 16;
+            const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int));
+            diffusion_select_topk_local_kernel<local_k_t><<<n_tokens, block_size, smem, stream>>>(
+                    logits, top_ids, n_vocab, heap_k);
+        } break;
     }
 }
 
@@ -297,6 +354,213 @@ static __device__ __forceinline__ float diffusion_rng_uniform(uint32_t seed, uin
     return ((x >> 8) + 0.5f) * (1.0f / 16777216.0f);
 }
 
+template<int LOCAL_K>
+static __global__ void diffusion_sample_topk_fused_local_kernel(
+        const float * __restrict__ logits,
+        const int n_vocab,
+        const int n_tokens,
+        const int top_k,
+        const int sc_k,
+        const float inv_temp,
+        const uint32_t seed,
+        const uint32_t step,
+        int * __restrict__ sampled,
+        int * __restrict__ argmax,
+        float * __restrict__ entropy,
+        int * __restrict__ sc_ids,
+        float * __restrict__ sc_probs) {
+    constexpr int block_size = 256;
+    constexpr int candidate_count = LOCAL_K * block_size;
+
+    const int row = blockIdx.x;
+    const int tid = threadIdx.x;
+    if (row >= n_tokens) {
+        return;
+    }
+
+    float vals[LOCAL_K];
+    int ids[LOCAL_K];
+#pragma unroll
+    for (int i = 0; i < LOCAL_K; ++i) {
+        vals[i] = -FLT_MAX;
+        ids[i] = 0;
+    }
+
+    const float * row_logits = logits + (size_t) row * n_vocab;
+    for (int v = tid; v < n_vocab; v += blockDim.x) {
+        const float x = row_logits[v];
+        if (diffusion_should_swap_desc(vals[LOCAL_K - 1], ids[LOCAL_K - 1], x, v)) {
+            int pos = LOCAL_K - 1;
+#pragma unroll
+            for (int i = LOCAL_K - 1; i > 0; --i) {
+                if (pos == i && diffusion_should_swap_desc(vals[i - 1], ids[i - 1], x, v)) {
+                    vals[i] = vals[i - 1];
+                    ids[i] = ids[i - 1];
+                    --pos;
+                }
+            }
+            vals[pos] = x;
+            ids[pos] = v;
+        }
+    }
+
+    extern __shared__ unsigned char smem[];
+    float * s_vals = (float *) smem;
+    int * s_ids = (int *) (s_vals + candidate_count);
+    float * s_sum = (float *) (s_ids + candidate_count);
+    float * s_t = s_sum + block_size;
+
+#pragma unroll
+    for (int i = 0; i < LOCAL_K; ++i) {
+        const int dst = tid * LOCAL_K + i;
+        s_vals[dst] = vals[i];
+        s_ids[dst] = ids[i];
+    }
+    __syncthreads();
+
+    for (int k = 2; k <= candidate_count; k <<= 1) {
+        for (int j = k >> 1; j > 0; j >>= 1) {
+            for (int i = tid; i < candidate_count; i += blockDim.x) {
+                const int ixj = i ^ j;
+                if (ixj > i) {
+                    const bool descending = (i & k) == 0;
+                    const bool swap = descending
+                        ? diffusion_should_swap_desc(s_vals[i], s_ids[i], s_vals[ixj], s_ids[ixj])
+                        : diffusion_should_swap_desc(s_vals[ixj], s_ids[ixj], s_vals[i], s_ids[i]);
+                    if (swap) {
+                        const float tv = s_vals[i];
+                        s_vals[i] = s_vals[ixj];
+                        s_vals[ixj] = tv;
+                        const int ti = s_ids[i];
+                        s_ids[i] = s_ids[ixj];
+                        s_ids[ixj] = ti;
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    const float max_l = s_vals[0] * inv_temp;
+    const int amax = s_ids[0];
+
+    float local_sum = 0.0f;
+    float local_t = 0.0f;
+    if (tid < top_k) {
+        const float d = s_vals[tid] * inv_temp - max_l;
+        const float e = expf(d);
+        local_sum = e;
+        local_t = d * e;
+    }
+
+    s_sum[tid] = local_sum;
+    s_t[tid] = local_t;
+    __syncthreads();
+
+    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
+        if (tid < stride) {
+            s_sum[tid] += s_sum[tid + stride];
+            s_t[tid] += s_t[tid + stride];
+        }
+        __syncthreads();
+    }
+
+    const float z = s_sum[0];
+    const float t = s_t[0];
+
+    if (tid == 0) {
+        argmax[row] = amax;
+        entropy[row] = logf(z) - t / z;
+
+        const float r = diffusion_rng_uniform(seed, step, row) * z;
+        float cum = 0.0f;
+        int tok = amax;
+        for (int i = 0; i < top_k; ++i) {
+            const float d = s_vals[i] * inv_temp - max_l;
+            cum += expf(d);
+            if (cum >= r) {
+                tok = s_ids[i];
+                break;
+            }
+        }
+        sampled[row] = tok;
+
+        const int n_sc = min(sc_k, top_k);
+        for (int i = 0; i < sc_k; ++i) {
+            const int out = row * sc_k + i;
+            if (i < n_sc) {
+                sc_ids[out] = s_ids[i];
+                sc_probs[out] = expf(s_vals[i] * inv_temp - max_l) / z;
+            } else {
+                sc_ids[out] = 0;
+                sc_probs[out] = 0.0f;
+            }
+        }
+    }
+}
+
+static void diffusion_sample_topk_fused_local(
+        const float * logits,
+        const int n_vocab,
+        const int n_tokens,
+        const int top_k,
+        const int sc_k,
+        const float inv_temp,
+        const uint32_t seed,
+        const uint32_t step,
+        int * sampled,
+        int * argmax,
+        float * entropy,
+        int * sc_ids,
+        float * sc_probs,
+        cudaStream_t stream) {
+    constexpr int block_size = 256;
+    const int local_k = diffusion_topk_local_k(top_k);
+
+    switch (local_k) {
+        case 1: {
+            constexpr int local_k_t = 1;
+            const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) +
+                                2 * block_size * sizeof(float);
+            diffusion_sample_topk_fused_local_kernel<local_k_t><<<n_tokens, block_size, smem, stream>>>(
+                    logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, seed, step,
+                    sampled, argmax, entropy, sc_ids, sc_probs);
+        } break;
+        case 2: {
+            constexpr int local_k_t = 2;
+            const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) +
+                                2 * block_size * sizeof(float);
+            diffusion_sample_topk_fused_local_kernel<local_k_t><<<n_tokens, block_size, smem, stream>>>(
+                    logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, seed, step,
+                    sampled, argmax, entropy, sc_ids, sc_probs);
+        } break;
+        case 4: {
+            constexpr int local_k_t = 4;
+            const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) +
+                                2 * block_size * sizeof(float);
+            diffusion_sample_topk_fused_local_kernel<local_k_t><<<n_tokens, block_size, smem, stream>>>(
+                    logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, seed, step,
+                    sampled, argmax, entropy, sc_ids, sc_probs);
+        } break;
+        case 8: {
+            constexpr int local_k_t = 8;
+            const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) +
+                                2 * block_size * sizeof(float);
+            diffusion_sample_topk_fused_local_kernel<local_k_t><<<n_tokens, block_size, smem, stream>>>(
+                    logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, seed, step,
+                    sampled, argmax, entropy, sc_ids, sc_probs);
+        } break;
+        default: {
+            constexpr int local_k_t = 16;
+            const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) +
+                                2 * block_size * sizeof(float);
+            diffusion_sample_topk_fused_local_kernel<local_k_t><<<n_tokens, block_size, smem, stream>>>(
+                    logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, seed, step,
+                    sampled, argmax, entropy, sc_ids, sc_probs);
+        } break;
+    }
+}
+
 static __device__ __forceinline__ bool diffusion_pair_gt(float a_val, int a_id, float b_val, int b_id) {
     return a_val > b_val || (a_val == b_val && a_id > b_id);
 }
@@ -647,7 +911,7 @@ bool ggml_cuda_diffusion_sample_topk(
         }
     }
 
-    const int heap_k = top_k == 0 ? sc_k : std::max(top_k, sc_k);
+    const int heap_k = top_k == 0 || !diffusion_tight_topk_enabled() ? std::max(top_k, sc_k) : top_k;
     if (heap_k <= 0 || heap_k > 1024 || heap_k > n_vocab) {
         return false;
     }
@@ -664,56 +928,65 @@ bool ggml_cuda_diffusion_sample_topk(
 
     diffusion_sample_scratch * scratch = diffusion_get_scratch(stream, n_tokens, heap_k, sc_k);
     int * top_ids = scratch->top_ids;
-    bool top_ids_sorted = false;
+    const bool direct_self_cond_tensor = have_self_cond_tensor && !have_self_cond_host && diffusion_direct_selfcond_enabled();
+    int * sc_ids_out = direct_self_cond_tensor ? (int *) result->self_cond_ids_tensor->data : scratch->sc_ids;
+    float * sc_probs_out = direct_self_cond_tensor ? (float *) result->self_cond_probs_tensor->data : scratch->sc_probs;
 
-    const bool use_fast_topk = diffusion_fast_topk_enabled() && heap_k <= 1024 && n_vocab >= heap_k;
-    bool sync_required = !use_fast_topk;
-    if (use_fast_topk) {
-        diffusion_select_topk_local(logits_d, top_ids, n_vocab, n_tokens, heap_k, stream);
-        top_ids_sorted = true;
+    constexpr int block_size = 256;
+    bool sync_required = false;
+    const bool use_fused_topk_sample = diffusion_fused_topk_sample_enabled() &&
+        top_k > 0 && top_k <= block_size && !params->top_k_tail_correction && n_vocab >= top_k;
+
+    if (use_fused_topk_sample) {
+        diffusion_sample_topk_fused_local(logits_d, n_vocab, n_tokens, top_k, sc_k, inv_temp,
+                params->seed, params->step, scratch->sampled, scratch->argmax, scratch->entropy,
+                sc_ids_out, sc_probs_out, stream);
     } else {
+        bool top_ids_sorted = false;
+        const bool use_fast_topk = diffusion_fast_topk_enabled() && heap_k <= 1024 && n_vocab >= heap_k;
+        sync_required = !use_fast_topk;
+        if (use_fast_topk) {
+            diffusion_select_topk_local(logits_d, top_ids, n_vocab, n_tokens, heap_k, stream);
+            top_ids_sorted = true;
+        } else {
 #ifdef CUB_DIFFUSION_TOP_K_AVAILABLE
-        for (int row = 0; row < n_tokens; ++row) {
-            diffusion_top_k_cub(pool, logits_d + (size_t) row * n_vocab, top_ids + (size_t) row * heap_k, n_vocab, heap_k, stream);
-        }
+            for (int row = 0; row < n_tokens; ++row) {
+                diffusion_top_k_cub(pool, logits_d + (size_t) row * n_vocab, top_ids + (size_t) row * heap_k, n_vocab, heap_k, stream);
+            }
 #elif defined(GGML_CUDA_USE_CUB)
-        ggml_cuda_pool_alloc<int> sorted_ids_alloc(pool, (size_t) n_tokens * n_vocab);
-        int * sorted_ids = sorted_ids_alloc.get();
-        argsort_f32_i32_cuda_cub(pool, logits_d, sorted_ids, n_vocab, n_tokens, GGML_SORT_ORDER_DESC, stream);
-        CUDA_CHECK(cudaMemcpy2DAsync(top_ids, heap_k * sizeof(int), sorted_ids, n_vocab * sizeof(int),
-                                     heap_k * sizeof(int), n_tokens, cudaMemcpyDeviceToDevice, stream));
-        top_ids_sorted = true;
+            ggml_cuda_pool_alloc<int> sorted_ids_alloc(pool, (size_t) n_tokens * n_vocab);
+            int * sorted_ids = sorted_ids_alloc.get();
+            argsort_f32_i32_cuda_cub(pool, logits_d, sorted_ids, n_vocab, n_tokens, GGML_SORT_ORDER_DESC, stream);
+            CUDA_CHECK(cudaMemcpy2DAsync(top_ids, heap_k * sizeof(int), sorted_ids, n_vocab * sizeof(int),
+                                         heap_k * sizeof(int), n_tokens, cudaMemcpyDeviceToDevice, stream));
+            top_ids_sorted = true;
 #else
-        if (n_vocab > 1024) {
-            return false;
-        }
-        ggml_cuda_pool_alloc<int> sorted_ids_alloc(pool, (size_t) n_tokens * n_vocab);
-        int * sorted_ids = sorted_ids_alloc.get();
-        argsort_f32_i32_cuda_bitonic(logits_d, sorted_ids, n_vocab, n_tokens, GGML_SORT_ORDER_DESC, stream);
-        CUDA_CHECK(cudaMemcpy2DAsync(top_ids, heap_k * sizeof(int), sorted_ids, n_vocab * sizeof(int),
-                                     heap_k * sizeof(int), n_tokens, cudaMemcpyDeviceToDevice, stream));
-        top_ids_sorted = true;
+            if (n_vocab > 1024) {
+                return false;
+            }
+            ggml_cuda_pool_alloc<int> sorted_ids_alloc(pool, (size_t) n_tokens * n_vocab);
+            int * sorted_ids = sorted_ids_alloc.get();
+            argsort_f32_i32_cuda_bitonic(logits_d, sorted_ids, n_vocab, n_tokens, GGML_SORT_ORDER_DESC, stream);
+            CUDA_CHECK(cudaMemcpy2DAsync(top_ids, heap_k * sizeof(int), sorted_ids, n_vocab * sizeof(int),
+                                         heap_k * sizeof(int), n_tokens, cudaMemcpyDeviceToDevice, stream));
+            top_ids_sorted = true;
 #endif
-    }
-
-    if (!top_ids_sorted) {
-        const int heap_k_pad = next_power_of_2_host(heap_k);
-        const int sort_threads = std::max(32, heap_k_pad);
-        const size_t sort_smem = (size_t) heap_k_pad * (sizeof(int) + sizeof(float));
-        diffusion_sort_top_ids_kernel<<<n_tokens, sort_threads, sort_smem, stream>>>(
-                logits_d, top_ids, n_vocab, heap_k, heap_k_pad, inv_temp);
-    }
+        }
 
-    const bool direct_self_cond_tensor = have_self_cond_tensor && !have_self_cond_host && diffusion_direct_selfcond_enabled();
-    int * sc_ids_out = direct_self_cond_tensor ? (int *) result->self_cond_ids_tensor->data : scratch->sc_ids;
-    float * sc_probs_out = direct_self_cond_tensor ? (float *) result->self_cond_probs_tensor->data : scratch->sc_probs;
+        if (!top_ids_sorted) {
+            const int heap_k_pad = next_power_of_2_host(heap_k);
+            const int sort_threads = std::max(32, heap_k_pad);
+            const size_t sort_smem = (size_t) heap_k_pad * (sizeof(int) + sizeof(float));
+            diffusion_sort_top_ids_kernel<<<n_tokens, sort_threads, sort_smem, stream>>>(
+                    logits_d, top_ids, n_vocab, heap_k, heap_k_pad, inv_temp);
+        }
 
-    constexpr int block_size = 256;
-    diffusion_sample_kernel<<<n_tokens, block_size, 0, stream>>>(
-            logits_d, top_ids, n_vocab, n_tokens, top_k, heap_k, sc_k, inv_temp,
-            params->seed, params->step, params->top_k_tail_correction,
-            scratch->sampled, scratch->argmax, scratch->entropy,
-            sc_ids_out, sc_probs_out);
+        diffusion_sample_kernel<<<n_tokens, block_size, 0, stream>>>(
+                logits_d, top_ids, n_vocab, n_tokens, top_k, heap_k, sc_k, inv_temp,
+                params->seed, params->step, params->top_k_tail_correction,
+                scratch->sampled, scratch->argmax, scratch->entropy,
+                sc_ids_out, sc_probs_out);
+    }
 
     if (result->update_canvas_on_device) {
         const int update_threads = next_power_of_2_host(n_tokens);
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index edf546d8f1e2..0986ece35844 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -4,7 +4,9 @@
 #include "vecdotq.cuh"
 #include "mma.cuh"
 
+#include <algorithm>
 #include <climits>
+#include <cstdlib>
 #include <cstdint>
 
 using namespace ggml_cuda_mma;
@@ -14,6 +16,25 @@ using namespace ggml_cuda_mma;
 #define MMQ_ITER_K_FP4         512
 #define MMQ_NWARPS               8
 
+static bool mmq_env_enabled(const char * name, const bool default_value = false) {
+    const char * v = std::getenv(name);
+    return v ? std::atoi(v) != 0 : default_value;
+}
+
+static int mmq_env_int(const char * name, const int default_value) {
+    const char * v = std::getenv(name);
+    return v ? std::atoi(v) : default_value;
+}
+
+static int mmq_largest_divisor_leq(const int value, const int limit) {
+    for (int d = std::min(value, limit); d > 0; --d) {
+        if (value % d == 0) {
+            return d;
+        }
+    }
+    return 1;
+}
+
 typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int kbx0, const int i_max, const int stride);
 typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00);
 typedef void (*mmq_write_back_t)(const float * __restrict__ sum, const int32_t * __restrict__ get_rows_to_sorted,
@@ -3972,7 +3993,31 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
     const uint3 channel_ratio_fd   = init_fastdiv_values(channel_ratio);
     const uint3 sample_ratio_fd    = init_fastdiv_values(sample_ratio);
 
-    if (!args.use_stream_k) {
+    const int ntiles_dst = ntx * nty * ntzw;
+    const int tiles_nwaves = (ntiles_dst + nsm - 1) / nsm;
+    const int tiles_efficiency_percent = 100 * ntiles_dst / (nsm*tiles_nwaves);
+
+    bool use_stream_k = args.use_stream_k;
+    int nblocks_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && tiles_efficiency_percent >= 90 ? ntiles_dst : nsm;
+
+    if (use_stream_k && mmq_env_enabled("GGML_CUDA_MMQ_STREAM_K_DIVISOR")) {
+        const int divisor_min_pct = std::max(0, std::min(100, mmq_env_int("GGML_CUDA_MMQ_STREAM_K_DIVISOR_MIN_PCT", 80)));
+        const int divisor_min_blocks = std::max(1, nsm * divisor_min_pct / 100);
+        const int divisor_blocks = mmq_largest_divisor_leq(ntiles_dst, nsm);
+        if (divisor_blocks >= divisor_min_blocks) {
+            nblocks_stream_k = divisor_blocks;
+        }
+    }
+
+    const bool stream_k_fixup_needed = ntiles_dst % nblocks_stream_k != 0;
+    if (use_stream_k && stream_k_fixup_needed && mmq_env_enabled("GGML_CUDA_MMQ_AVOID_FIXUP")) {
+        const int avoid_min_eff = std::max(0, std::min(100, mmq_env_int("GGML_CUDA_MMQ_AVOID_FIXUP_MIN_EFF", 80)));
+        if (tiles_efficiency_percent >= avoid_min_eff) {
+            use_stream_k = false;
+        }
+    }
+
+    if (!use_stream_k) {
         if (args.nrows_x % mmq_y == 0) {
             constexpr bool need_check = false;
             mul_mat_q<type, mmq_x, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
@@ -3995,14 +4040,11 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
 
     // For the stream-k kernel it is possible to run it with tiling by setting the number of CUDA blocks equal to the number of tiles.
     // This is worthwhile if the efficiency of tiling is high and skipping the fixup kernel is more important.
-    const int ntiles_dst = ntx * nty * ntzw;
-    const int tiles_nwaves = (ntiles_dst + nsm - 1) / nsm;
-    const int tiles_efficiency_percent = 100 * ntiles_dst / (nsm*tiles_nwaves);
-    const dim3 block_nums_stream_k(GGML_CUDA_CC_IS_NVIDIA(cc) && tiles_efficiency_percent >= 90 ? ntiles_dst : nsm, 1, 1);
+    const dim3 block_nums_stream_k(nblocks_stream_k, 1, 1);
 
     GGML_ASSERT(ntiles_dst * blocks_per_ne00_fd.z < (1 << 30)); // Assert that variable kbc will not overflow.
 
-    const bool fixup_needed = ntiles_dst % block_nums_stream_k.x != 0;
+    const bool fixup_needed = stream_k_fixup_needed;
 
     ggml_cuda_pool & pool = ctx.pool(id);
     ggml_cuda_pool_alloc<float> tmp_fixup(pool);

From 027b998fc62238d9907917e07daee70942abd16e Mon Sep 17 00:00:00 2001
From: Loveneet Nigam <lnigam@nvidia.com>
Date: Wed, 10 Jun 2026 12:27:44 +0530
Subject: [PATCH 12/13] diffusion: add kernel optimization controls

---
 .../diffusion-gemma/diffusion-gemma-cli.cpp   |  14 +-
 .../diffusion-gemma-server.cpp                |  15 +-
 ggml/include/ggml-cuda.h                      |   3 +
 ggml/src/ggml-cuda/diffusion-sampling.cu      | 170 ++++++++++++++++--
 ggml/src/ggml-cuda/mmq.cuh                    |   8 +-
 src/llama-context.cpp                         |  87 +++++++--
 src/llama-graph.cpp                           |  49 +++++
 src/llama-graph.h                             |  22 +++
 src/models/diffusion-gemma.cpp                |  52 +++++-
 9 files changed, 383 insertions(+), 37 deletions(-)

diff --git a/examples/diffusion-gemma/diffusion-gemma-cli.cpp b/examples/diffusion-gemma/diffusion-gemma-cli.cpp
index ad7059805a34..cd320009f7fa 100644
--- a/examples/diffusion-gemma/diffusion-gemma-cli.cpp
+++ b/examples/diffusion-gemma/diffusion-gemma-cli.cpp
@@ -47,6 +47,15 @@ static int env_int(const char * name, int def) {
     return v ? atoi(v) : def;
 }
 
+static int diffusion_self_cond_top_k() {
+    constexpr int def = 256;
+    const int k = env_int("GGML_CUDA_DIFFUSION_SC_TOPK", def);
+    if (k <= 0) {
+        return def;
+    }
+    return std::min(k, def);
+}
+
 // apply the model's chat template to the user prompt (this is a chat-trained model)
 static std::string format_chat(llama_model * model, const std::string & prompt) {
     auto tmpls = common_chat_templates_init(model, "");
@@ -287,8 +296,9 @@ int main(int argc, char ** argv) {
     // (renormalized) softmax probabilities per position. Fed to the next decode (Option-2 graph
     // gather: the decoder gathers just these SC_K embedding rows and blends them, instead of a
     // dense full-vocab probs @ token_embd matmul). Zero probs => no self-conditioning (step 1).
-    // SC_K must match llama_model_diffusion_gemma::N_SC_TOPK (the graph's fixed gather width).
-    const int SC_K = 256;
+    // SC_K must match the graph's fixed gather width. Default is 256; use
+    // GGML_CUDA_DIFFUSION_SC_TOPK for gated experiments with a smaller sparse blend.
+    const int SC_K = diffusion_self_cond_top_k();
     std::vector<int32_t> sc_ids ((size_t) SC_K * canvas_length, 0);
     std::vector<float>   sc_probs((size_t) SC_K * canvas_length, 0.0f);
 
diff --git a/examples/diffusion-gemma/diffusion-gemma-server.cpp b/examples/diffusion-gemma/diffusion-gemma-server.cpp
index f651e32a98bf..9b49f55b8d25 100644
--- a/examples/diffusion-gemma/diffusion-gemma-server.cpp
+++ b/examples/diffusion-gemma/diffusion-gemma-server.cpp
@@ -54,7 +54,7 @@ static constexpr float TEMP_MIN             = 0.4f;
 static constexpr float TEMP_MAX             = 0.8f;
 static constexpr float CONFIDENCE_THRESHOLD = 0.005f;
 static constexpr int   STABILITY_THRESHOLD  = 1;
-static constexpr int   SC_K                 = 256; // must match llama_model_diffusion_gemma::N_SC_TOPK
+static constexpr int   DEF_SC_K             = 256;
 static constexpr int   GPU_SAMPLING_MAX_TOP_K = 1024; // CUDA diffusion sampler limit
 
 static int env_int(const char * name, int def) {
@@ -62,6 +62,14 @@ static int env_int(const char * name, int def) {
     return v ? atoi(v) : def;
 }
 
+static int diffusion_self_cond_top_k() {
+    const int k = env_int("GGML_CUDA_DIFFUSION_SC_TOPK", DEF_SC_K);
+    if (k <= 0) {
+        return DEF_SC_K;
+    }
+    return std::min(k, DEF_SC_K);
+}
+
 // ------------------------------------------------------------------------------------------------
 // per-request generation parameters and result
 // ------------------------------------------------------------------------------------------------
@@ -236,6 +244,7 @@ struct diffusion_server {
         std::vector<llama_token> argmax_canvas(canvas_length, -1);
         std::vector<llama_token> prev_argmax(canvas_length, -1);
         std::vector<llama_token> accepted(canvas_length);
+        const int SC_K = diffusion_self_cond_top_k();
         std::vector<int32_t>     sc_ids ((size_t) SC_K * canvas_length, 0);
         std::vector<float>       sc_probs((size_t) SC_K * canvas_length, 0.0f);
         std::vector<llama_token> generated;
@@ -822,7 +831,7 @@ int main(int argc, char ** argv) {
                 {"entropy_bound", ENTROPY_BOUND},
                 {"confidence_threshold", CONFIDENCE_THRESHOLD},
                 {"stability_threshold", STABILITY_THRESHOLD},
-                {"self_cond_topk", SC_K},
+                {"self_cond_topk", diffusion_self_cond_top_k()},
                 {"gpu_sampling", srv.use_gpu_sampling},
                 {"device_self_cond", srv.use_device_self_cond},
                 {"device_loop", srv.use_device_loop},
@@ -931,7 +940,7 @@ int main(int argc, char ** argv) {
                 {"params", {
                     {"canvas_length", srv.canvas_length},
                     {"n_denoise_steps", srv.n_steps},
-                    {"self_cond_topk", SC_K},
+                    {"self_cond_topk", diffusion_self_cond_top_k()},
                     {"gpu_sampling", srv.use_gpu_sampling},
                     {"device_self_cond", srv.use_device_self_cond},
                     {"device_loop", srv.use_device_loop},
diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h
index cb1136ba856d..f8990bea4c10 100644
--- a/ggml/include/ggml-cuda.h
+++ b/ggml/include/ggml-cuda.h
@@ -52,6 +52,7 @@ struct ggml_cuda_diffusion_sample_params {
     uint32_t seed;
     uint32_t step;
     bool     top_k_tail_correction;
+    float    logit_softcap;
 };
 
 struct ggml_cuda_diffusion_sample_result {
@@ -62,6 +63,8 @@ struct ggml_cuda_diffusion_sample_result {
     float   * self_cond_probs;
     struct ggml_tensor * self_cond_ids_tensor;
     struct ggml_tensor * self_cond_probs_tensor;
+    struct ggml_tensor * self_cond_embd_tensor;
+    const struct ggml_tensor * token_embd_tensor;
     struct ggml_tensor * canvas_tokens_tensor;
     int32_t * final_tokens;
     int32_t * stop;
diff --git a/ggml/src/ggml-cuda/diffusion-sampling.cu b/ggml/src/ggml-cuda/diffusion-sampling.cu
index d28a397fad7b..2306e68558a0 100644
--- a/ggml/src/ggml-cuda/diffusion-sampling.cu
+++ b/ggml/src/ggml-cuda/diffusion-sampling.cu
@@ -354,6 +354,107 @@ static __device__ __forceinline__ float diffusion_rng_uniform(uint32_t seed, uin
     return ((x >> 8) + 0.5f) * (1.0f / 16777216.0f);
 }
 
+static __device__ __forceinline__ float diffusion_apply_logit_softcap(const float x, const float softcap) {
+    return softcap > 0.0f ? softcap * tanhf(x / softcap) : x;
+}
+
+template<typename T>
+static __device__ __forceinline__ float diffusion_embd_to_float(const T x) {
+    return (float) x;
+}
+
+template<>
+__device__ __forceinline__ float diffusion_embd_to_float<half>(const half x) {
+    return __half2float(x);
+}
+
+template<typename T>
+static __global__ void diffusion_build_selfcond_embd_kernel(
+        const T * __restrict__ token_embd,
+        const int64_t embd_stride,
+        const int n_embd,
+        const int n_tokens,
+        const int sc_k,
+        const int * __restrict__ sc_ids,
+        const float * __restrict__ sc_probs,
+        float * __restrict__ dst) {
+    extern __shared__ unsigned char smem[];
+    int * s_ids = (int *) smem;
+    float * s_probs = (float *) (s_ids + sc_k);
+
+    const int token = blockIdx.y;
+    const int dim = blockIdx.x * blockDim.x + threadIdx.x;
+
+    for (int i = threadIdx.x; i < sc_k; i += blockDim.x) {
+        const int off = token * sc_k + i;
+        s_ids[i] = sc_ids[off];
+        s_probs[i] = sc_probs[off];
+    }
+    __syncthreads();
+
+    if (token >= n_tokens || dim >= n_embd) {
+        return;
+    }
+
+    float sum = 0.0f;
+    for (int i = 0; i < sc_k; ++i) {
+        const float p = s_probs[i];
+        if (p != 0.0f) {
+            const int id = s_ids[i];
+            sum += p * diffusion_embd_to_float(token_embd[(int64_t) id * embd_stride + dim]);
+        }
+    }
+    dst[(int64_t) token * n_embd + dim] = sum;
+}
+
+static bool diffusion_build_selfcond_embd(
+        const ggml_tensor * token_embd,
+        const int * sc_ids,
+        const float * sc_probs,
+        const int n_tokens,
+        const int sc_k,
+        ggml_tensor * dst,
+        cudaStream_t stream) {
+    if (!token_embd || !dst || !sc_ids || !sc_probs) {
+        return false;
+    }
+    if (!ggml_is_contiguous(token_embd) || !ggml_is_contiguous(dst) ||
+        token_embd->data == nullptr || dst->data == nullptr ||
+        token_embd->buffer == nullptr || dst->buffer == nullptr ||
+        ggml_backend_buffer_is_host(token_embd->buffer) ||
+        ggml_backend_buffer_is_host(dst->buffer) ||
+        dst->type != GGML_TYPE_F32 ||
+        token_embd->ne[1] <= 0 ||
+        dst->ne[0] != token_embd->ne[0] ||
+        dst->ne[1] < n_tokens) {
+        return false;
+    }
+
+    const int n_embd = (int) token_embd->ne[0];
+    const int block_size = 256;
+    const dim3 block(block_size, 1, 1);
+    const dim3 grid((n_embd + block_size - 1) / block_size, n_tokens, 1);
+    const size_t smem = (size_t) sc_k * (sizeof(int) + sizeof(float));
+
+    switch (token_embd->type) {
+        case GGML_TYPE_F16: {
+            const int64_t embd_stride = token_embd->nb[1] / (int64_t) sizeof(half);
+            diffusion_build_selfcond_embd_kernel<half><<<grid, block, smem, stream>>>(
+                    (const half *) token_embd->data, embd_stride, n_embd, n_tokens, sc_k,
+                    sc_ids, sc_probs, (float *) dst->data);
+        } break;
+        case GGML_TYPE_F32: {
+            const int64_t embd_stride = token_embd->nb[1] / (int64_t) sizeof(float);
+            diffusion_build_selfcond_embd_kernel<float><<<grid, block, smem, stream>>>(
+                    (const float *) token_embd->data, embd_stride, n_embd, n_tokens, sc_k,
+                    sc_ids, sc_probs, (float *) dst->data);
+        } break;
+        default:
+            return false;
+    }
+    return true;
+}
+
 template<int LOCAL_K>
 static __global__ void diffusion_sample_topk_fused_local_kernel(
         const float * __restrict__ logits,
@@ -362,6 +463,7 @@ static __global__ void diffusion_sample_topk_fused_local_kernel(
         const int top_k,
         const int sc_k,
         const float inv_temp,
+        const float logit_softcap,
         const uint32_t seed,
         const uint32_t step,
         int * __restrict__ sampled,
@@ -441,6 +543,11 @@ static __global__ void diffusion_sample_topk_fused_local_kernel(
         }
     }
 
+    if (tid < top_k) {
+        s_vals[tid] = diffusion_apply_logit_softcap(s_vals[tid], logit_softcap);
+    }
+    __syncthreads();
+
     const float max_l = s_vals[0] * inv_temp;
     const int amax = s_ids[0];
 
@@ -506,6 +613,7 @@ static void diffusion_sample_topk_fused_local(
         const int top_k,
         const int sc_k,
         const float inv_temp,
+        const float logit_softcap,
         const uint32_t seed,
         const uint32_t step,
         int * sampled,
@@ -523,7 +631,7 @@ static void diffusion_sample_topk_fused_local(
             const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) +
                                 2 * block_size * sizeof(float);
             diffusion_sample_topk_fused_local_kernel<local_k_t><<<n_tokens, block_size, smem, stream>>>(
-                    logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, seed, step,
+                    logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, logit_softcap, seed, step,
                     sampled, argmax, entropy, sc_ids, sc_probs);
         } break;
         case 2: {
@@ -531,7 +639,7 @@ static void diffusion_sample_topk_fused_local(
             const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) +
                                 2 * block_size * sizeof(float);
             diffusion_sample_topk_fused_local_kernel<local_k_t><<<n_tokens, block_size, smem, stream>>>(
-                    logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, seed, step,
+                    logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, logit_softcap, seed, step,
                     sampled, argmax, entropy, sc_ids, sc_probs);
         } break;
         case 4: {
@@ -539,7 +647,7 @@ static void diffusion_sample_topk_fused_local(
             const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) +
                                 2 * block_size * sizeof(float);
             diffusion_sample_topk_fused_local_kernel<local_k_t><<<n_tokens, block_size, smem, stream>>>(
-                    logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, seed, step,
+                    logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, logit_softcap, seed, step,
                     sampled, argmax, entropy, sc_ids, sc_probs);
         } break;
         case 8: {
@@ -547,7 +655,7 @@ static void diffusion_sample_topk_fused_local(
             const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) +
                                 2 * block_size * sizeof(float);
             diffusion_sample_topk_fused_local_kernel<local_k_t><<<n_tokens, block_size, smem, stream>>>(
-                    logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, seed, step,
+                    logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, logit_softcap, seed, step,
                     sampled, argmax, entropy, sc_ids, sc_probs);
         } break;
         default: {
@@ -555,7 +663,7 @@ static void diffusion_sample_topk_fused_local(
             const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) +
                                 2 * block_size * sizeof(float);
             diffusion_sample_topk_fused_local_kernel<local_k_t><<<n_tokens, block_size, smem, stream>>>(
-                    logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, seed, step,
+                    logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, logit_softcap, seed, step,
                     sampled, argmax, entropy, sc_ids, sc_probs);
         } break;
     }
@@ -691,6 +799,7 @@ static __global__ void diffusion_sample_kernel(
         const int heap_k,
         const int sc_k,
         const float inv_temp,
+        const float logit_softcap,
         const uint32_t seed,
         const uint32_t step,
         const bool tail_correction,
@@ -717,7 +826,7 @@ static __global__ void diffusion_sample_kernel(
 
     if (top_k == 0 || tail_correction) {
         for (int v = tid; v < n_vocab; v += blockDim.x) {
-            const float x = row_logits[v] * inv_temp;
+            const float x = diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp;
             if (x > local_max) {
                 local_max = x;
                 local_idx = v;
@@ -726,7 +835,7 @@ static __global__ void diffusion_sample_kernel(
     } else {
         for (int i = tid; i < top_k; i += blockDim.x) {
             const int v = row_top[i];
-            const float x = row_logits[v] * inv_temp;
+            const float x = diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp;
             if (x > local_max) {
                 local_max = x;
                 local_idx = v;
@@ -754,7 +863,7 @@ static __global__ void diffusion_sample_kernel(
 
     if (top_k == 0 || tail_correction) {
         for (int v = tid; v < n_vocab; v += blockDim.x) {
-            const float d = row_logits[v] * inv_temp - max_l;
+            const float d = diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp - max_l;
             const float e = expf(d);
             local_sum += e;
             local_t   += d * e;
@@ -762,7 +871,7 @@ static __global__ void diffusion_sample_kernel(
     } else {
         for (int i = tid; i < top_k; i += blockDim.x) {
             const int v = row_top[i];
-            const float d = row_logits[v] * inv_temp - max_l;
+            const float d = diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp - max_l;
             const float e = expf(d);
             local_sum += e;
             local_t   += d * e;
@@ -793,7 +902,7 @@ static __global__ void diffusion_sample_kernel(
             sample_z = 0.0f;
             for (int i = 0; i < top_k; ++i) {
                 const int v = row_top[i];
-                sample_z += expf(row_logits[v] * inv_temp - max_l);
+                sample_z += expf(diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp - max_l);
             }
         }
 
@@ -803,7 +912,7 @@ static __global__ void diffusion_sample_kernel(
 
         if (top_k == 0) {
             for (int v = 0; v < n_vocab; ++v) {
-                cum += expf(row_logits[v] * inv_temp - max_l);
+                cum += expf(diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp - max_l);
                 if (cum >= r) {
                     tok = v;
                     break;
@@ -812,7 +921,7 @@ static __global__ void diffusion_sample_kernel(
         } else {
             for (int i = 0; i < top_k; ++i) {
                 const int v = row_top[i];
-                cum += expf(row_logits[v] * inv_temp - max_l);
+                cum += expf(diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp - max_l);
                 if (cum >= r) {
                     tok = v;
                     break;
@@ -827,7 +936,7 @@ static __global__ void diffusion_sample_kernel(
             if (i < n_sc) {
                 const int v = row_top[i];
                 sc_ids[out] = v;
-                sc_probs[out] = expf(row_logits[v] * inv_temp - max_l) / sample_z;
+                sc_probs[out] = expf(diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp - max_l) / sample_z;
             } else {
                 sc_ids[out] = 0;
                 sc_probs[out] = 0.0f;
@@ -883,7 +992,8 @@ bool ggml_cuda_diffusion_sample_topk(
 
     const bool have_self_cond_host = result->self_cond_ids != nullptr && result->self_cond_probs != nullptr;
     const bool have_self_cond_tensor = result->self_cond_ids_tensor != nullptr && result->self_cond_probs_tensor != nullptr;
-    if (!have_self_cond_host && !have_self_cond_tensor) {
+    const bool have_self_cond_embd_tensor = result->self_cond_embd_tensor != nullptr;
+    if (!have_self_cond_host && !have_self_cond_tensor && !have_self_cond_embd_tensor) {
         return false;
     }
     if ((result->self_cond_ids == nullptr) != (result->self_cond_probs == nullptr)) {
@@ -910,6 +1020,27 @@ bool ggml_cuda_diffusion_sample_topk(
             return false;
         }
     }
+    if (have_self_cond_embd_tensor) {
+        if (result->token_embd_tensor == nullptr ||
+            result->token_embd_tensor->data == nullptr ||
+            result->token_embd_tensor->buffer == nullptr ||
+            result->self_cond_embd_tensor->type != GGML_TYPE_F32 ||
+            result->self_cond_embd_tensor->data == nullptr ||
+            result->self_cond_embd_tensor->buffer == nullptr ||
+            ggml_backend_buffer_is_host(result->token_embd_tensor->buffer) ||
+            ggml_backend_buffer_is_host(result->self_cond_embd_tensor->buffer) ||
+            !ggml_is_contiguous(result->token_embd_tensor) ||
+            !ggml_is_contiguous(result->self_cond_embd_tensor) ||
+            result->token_embd_tensor->ne[1] < n_vocab ||
+            result->self_cond_embd_tensor->ne[0] != result->token_embd_tensor->ne[0] ||
+            result->self_cond_embd_tensor->ne[1] < n_tokens) {
+            return false;
+        }
+        if (result->token_embd_tensor->type != GGML_TYPE_F16 &&
+            result->token_embd_tensor->type != GGML_TYPE_F32) {
+            return false;
+        }
+    }
 
     const int heap_k = top_k == 0 || !diffusion_tight_topk_enabled() ? std::max(top_k, sc_k) : top_k;
     if (heap_k <= 0 || heap_k > 1024 || heap_k > n_vocab) {
@@ -918,6 +1049,7 @@ bool ggml_cuda_diffusion_sample_topk(
 
     const float temp = params->temperature > 0.0f ? params->temperature : 1.0f;
     const float inv_temp = 1.0f / temp;
+    const float logit_softcap = params->logit_softcap > 0.0f ? params->logit_softcap : 0.0f;
 
     ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *) backend->context;
     ggml_cuda_set_device(ctx->device);
@@ -938,7 +1070,7 @@ bool ggml_cuda_diffusion_sample_topk(
         top_k > 0 && top_k <= block_size && !params->top_k_tail_correction && n_vocab >= top_k;
 
     if (use_fused_topk_sample) {
-        diffusion_sample_topk_fused_local(logits_d, n_vocab, n_tokens, top_k, sc_k, inv_temp,
+        diffusion_sample_topk_fused_local(logits_d, n_vocab, n_tokens, top_k, sc_k, inv_temp, logit_softcap,
                 params->seed, params->step, scratch->sampled, scratch->argmax, scratch->entropy,
                 sc_ids_out, sc_probs_out, stream);
     } else {
@@ -983,7 +1115,7 @@ bool ggml_cuda_diffusion_sample_topk(
 
         diffusion_sample_kernel<<<n_tokens, block_size, 0, stream>>>(
                 logits_d, top_ids, n_vocab, n_tokens, top_k, heap_k, sc_k, inv_temp,
-                params->seed, params->step, params->top_k_tail_correction,
+                logit_softcap, params->seed, params->step, params->top_k_tail_correction,
                 scratch->sampled, scratch->argmax, scratch->entropy,
                 sc_ids_out, sc_probs_out);
     }
@@ -1038,6 +1170,12 @@ bool ggml_cuda_diffusion_sample_topk(
         CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs_tensor->data, scratch->sc_probs, (size_t) n_tokens * sc_k * sizeof(float),
                                    cudaMemcpyDeviceToDevice, stream));
     }
+    if (have_self_cond_embd_tensor) {
+        if (!diffusion_build_selfcond_embd(result->token_embd_tensor, sc_ids_out, sc_probs_out, n_tokens, sc_k,
+                result->self_cond_embd_tensor, stream)) {
+            return false;
+        }
+    }
 
     const bool host_outputs_requested = result->sampled || result->argmax || result->entropy ||
         (result->final_tokens && !final_tokens_after_stop) || result->stop || have_self_cond_host;
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 0986ece35844..cf2c2d2f26c2 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -4102,7 +4102,13 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
     const int warp_size = ggml_cuda_info().devices[id].warp_size;
     const int nwarps    = mmq_get_nwarps_host(cc, warp_size);
 
-    const int mmq_x_max = get_mmq_x_max_host(cc);
+    int mmq_x_max = get_mmq_x_max_host(cc);
+    if (mmq_env_enabled("GGML_CUDA_MMQ_MAX_X")) {
+        const int mmq_x_max_env = mmq_env_int("GGML_CUDA_MMQ_MAX_X", mmq_x_max);
+        if (mmq_x_max_env > 0) {
+            mmq_x_max = std::min(mmq_x_max, std::max(8, 8 * (mmq_x_max_env / 8)));
+        }
+    }
     const int mmq_y = get_mmq_y_host(cc);
 
     int mmq_x_best  = 0;
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index cf3b9760a546..575b647f733d 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1127,6 +1127,18 @@ void llama_context::set_embeddings_nextn(bool value, bool masked) {
 }
 
 void llama_context::set_diffusion_self_cond(const float * probs, int64_t n_vocab, int64_t n_tokens) {
+    diffusion_cond.sc_topk_device_ready       = false;
+    diffusion_cond.sc_topk_device_ids_data    = nullptr;
+    diffusion_cond.sc_topk_device_probs_data  = nullptr;
+    diffusion_cond.sc_topk_device_ids_bytes   = 0;
+    diffusion_cond.sc_topk_device_probs_bytes = 0;
+    diffusion_cond.sc_embd_device_ready       = false;
+    diffusion_cond.sc_embd_device_data        = nullptr;
+    diffusion_cond.sc_embd_device_bytes       = 0;
+    diffusion_cond.canvas_tokens_device_ready = false;
+    diffusion_cond.canvas_tokens_device_data  = nullptr;
+    diffusion_cond.canvas_tokens_device_bytes = 0;
+
     if (probs == nullptr || n_vocab <= 0 || n_tokens <= 0) {
         diffusion_cond.probs.clear();
         diffusion_cond.n_vocab  = 0;
@@ -1154,6 +1166,9 @@ void llama_context::set_diffusion_self_cond_topk(const int32_t * ids, const floa
     diffusion_cond.sc_topk_device_probs_data  = nullptr;
     diffusion_cond.sc_topk_device_ids_bytes   = 0;
     diffusion_cond.sc_topk_device_probs_bytes = 0;
+    diffusion_cond.sc_embd_device_ready       = false;
+    diffusion_cond.sc_embd_device_data        = nullptr;
+    diffusion_cond.sc_embd_device_bytes       = 0;
     diffusion_cond.canvas_tokens_device_ready = false;
     diffusion_cond.canvas_tokens_device_data  = nullptr;
     diffusion_cond.canvas_tokens_device_bytes = 0;
@@ -1189,12 +1204,21 @@ static ggml_backend_cuda_diffusion_sample_topk_t get_cuda_diffusion_sample_topk_
         ggml_backend_reg_get_proc_address(reg, "ggml_backend_cuda_diffusion_sample_topk");
 }
 
+static bool diffusion_cuda_fuse_final_logit_softcap_enabled() {
+    const char * v = std::getenv("GGML_CUDA_DIFFUSION_FUSE_FINAL_SOFTCAP");
+    return v && std::strcmp(v, "0") != 0;
+}
+
+static bool diffusion_cuda_fused_selfcond_embd_enabled() {
+    const char * v = std::getenv("GGML_CUDA_DIFFUSION_FUSED_SELFCOND_EMBD");
+    return v && std::strcmp(v, "0") != 0;
+}
+
 static bool diffusion_decoder_inputs_device_ready(
         const llama_diffusion_cond & diffusion_cond,
         const llm_graph_result * res) {
     if (!diffusion_cond.decoder_phase ||
         !diffusion_cond.canvas_tokens_device_ready ||
-        !diffusion_cond.sc_topk_device_ready ||
         !res) {
         return false;
     }
@@ -1206,8 +1230,17 @@ static bool diffusion_decoder_inputs_device_ready(
         return false;
     }
 
+    auto * inp_sc_embd = res->get_inp_diffusion_self_cond_embd();
+    if (inp_sc_embd) {
+        return inp_sc_embd->embd &&
+            diffusion_cond.sc_embd_device_ready &&
+            diffusion_cond.sc_embd_device_data  == inp_sc_embd->embd->data &&
+            diffusion_cond.sc_embd_device_bytes == ggml_nbytes(inp_sc_embd->embd);
+    }
+
     auto * inp_sc = res->get_inp_diffusion_self_cond_topk();
     if (!inp_sc || !inp_sc->ids || !inp_sc->probs ||
+        !diffusion_cond.sc_topk_device_ready ||
         diffusion_cond.sc_topk_device_ids_data    != inp_sc->ids->data ||
         diffusion_cond.sc_topk_device_probs_data  != inp_sc->probs->data ||
         diffusion_cond.sc_topk_device_ids_bytes   != ggml_nbytes(inp_sc->ids) ||
@@ -1253,15 +1286,26 @@ bool llama_context::diffusion_sample_topk(
 
     ggml_tensor * t_self_cond_ids   = nullptr;
     ggml_tensor * t_self_cond_probs = nullptr;
+    ggml_tensor * t_self_cond_embd  = nullptr;
+    const ggml_tensor * t_token_embd = nullptr;
     ggml_tensor * t_canvas_tokens   = nullptr;
     const bool device_self_cond = result->self_cond_ids == nullptr && result->self_cond_probs == nullptr;
     if (device_self_cond) {
-        auto * inp = gf_res_prev->get_inp_diffusion_self_cond_topk();
-        if (!inp || !inp->ids || !inp->probs) {
-            return false;
+        auto * inp_embd = gf_res_prev->get_inp_diffusion_self_cond_embd();
+        if (inp_embd && diffusion_cuda_fused_selfcond_embd_enabled()) {
+            t_self_cond_embd = inp_embd->embd;
+            t_token_embd = gf_res_prev->get_diffusion_token_embd();
+            if (!t_self_cond_embd || !t_token_embd) {
+                return false;
+            }
+        } else {
+            auto * inp = gf_res_prev->get_inp_diffusion_self_cond_topk();
+            if (!inp || !inp->ids || !inp->probs) {
+                return false;
+            }
+            t_self_cond_ids   = inp->ids;
+            t_self_cond_probs = inp->probs;
         }
-        t_self_cond_ids   = inp->ids;
-        t_self_cond_probs = inp->probs;
     } else if (result->self_cond_ids == nullptr || result->self_cond_probs == nullptr) {
         return false;
     }
@@ -1273,6 +1317,10 @@ bool llama_context::diffusion_sample_topk(
         }
     }
 
+    const float fused_logit_softcap = diffusion_cuda_fuse_final_logit_softcap_enabled()
+        ? model.hparams.f_final_logit_softcapping
+        : 0.0f;
+
     ggml_cuda_diffusion_sample_params cuda_params = {
         /* .n_vocab               = */ (int32_t) model.vocab.n_tokens(),
         /* .n_tokens              = */ params->n_tokens,
@@ -1282,6 +1330,7 @@ bool llama_context::diffusion_sample_topk(
         /* .seed                  = */ params->seed,
         /* .step                  = */ params->step,
         /* .top_k_tail_correction = */ params->top_k_tail_correction,
+        /* .logit_softcap         = */ fused_logit_softcap,
     };
 
     ggml_cuda_diffusion_sample_result cuda_result = {
@@ -1292,6 +1341,8 @@ bool llama_context::diffusion_sample_topk(
         /* .self_cond_probs = */ result->self_cond_probs,
         /* .self_cond_ids_tensor   = */ t_self_cond_ids,
         /* .self_cond_probs_tensor = */ t_self_cond_probs,
+        /* .self_cond_embd_tensor  = */ t_self_cond_embd,
+        /* .token_embd_tensor      = */ t_token_embd,
         /* .canvas_tokens_tensor   = */ t_canvas_tokens,
         /* .final_tokens           = */ (int32_t *) result->final_tokens,
         /* .stop                   = */ result->stop,
@@ -1309,11 +1360,25 @@ bool llama_context::diffusion_sample_topk(
         diffusion_cond.sc_topk = params->self_cond_top_k;
         diffusion_cond.sc_topk_ids.clear();
         diffusion_cond.sc_topk_probs.clear();
-        diffusion_cond.sc_topk_device_ready       = true;
-        diffusion_cond.sc_topk_device_ids_data    = t_self_cond_ids->data;
-        diffusion_cond.sc_topk_device_probs_data  = t_self_cond_probs->data;
-        diffusion_cond.sc_topk_device_ids_bytes   = ggml_nbytes(t_self_cond_ids);
-        diffusion_cond.sc_topk_device_probs_bytes = ggml_nbytes(t_self_cond_probs);
+        diffusion_cond.sc_topk_device_ready       = false;
+        diffusion_cond.sc_topk_device_ids_data    = nullptr;
+        diffusion_cond.sc_topk_device_probs_data  = nullptr;
+        diffusion_cond.sc_topk_device_ids_bytes   = 0;
+        diffusion_cond.sc_topk_device_probs_bytes = 0;
+        diffusion_cond.sc_embd_device_ready       = false;
+        diffusion_cond.sc_embd_device_data        = nullptr;
+        diffusion_cond.sc_embd_device_bytes       = 0;
+        if (t_self_cond_embd) {
+            diffusion_cond.sc_embd_device_ready = true;
+            diffusion_cond.sc_embd_device_data  = t_self_cond_embd->data;
+            diffusion_cond.sc_embd_device_bytes = ggml_nbytes(t_self_cond_embd);
+        } else {
+            diffusion_cond.sc_topk_device_ready       = true;
+            diffusion_cond.sc_topk_device_ids_data    = t_self_cond_ids->data;
+            diffusion_cond.sc_topk_device_probs_data  = t_self_cond_probs->data;
+            diffusion_cond.sc_topk_device_ids_bytes   = ggml_nbytes(t_self_cond_ids);
+            diffusion_cond.sc_topk_device_probs_bytes = ggml_nbytes(t_self_cond_probs);
+        }
     }
     if (ok && result->update_canvas_on_device) {
         diffusion_cond.canvas_tokens_device_ready = true;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index eee04a30de2e..7b21baf75d6c 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -453,6 +453,28 @@ void llm_graph_input_diffusion_self_cond_topk::set_input(const llama_ubatch * ub
     }
 }
 
+void llm_graph_input_diffusion_self_cond_embd::set_input(const llama_ubatch * ubatch) {
+    GGML_UNUSED(ubatch);
+
+    if (!embd) {
+        return;
+    }
+    assert(embd->type == GGML_TYPE_F32);
+
+    const size_t n_bytes = ggml_nbytes(embd);
+    const bool have_device = diffusion
+        && diffusion->sc_embd_device_ready
+        && diffusion->sc_embd_device_data  == embd->data
+        && diffusion->sc_embd_device_bytes == n_bytes;
+
+    if (have_device) {
+        return;
+    }
+
+    std::vector<uint8_t> zeros(n_bytes, 0);
+    ggml_backend_tensor_set(embd, zeros.data(), 0, n_bytes);
+}
+
 template <typename T>
 static void print_mask(const T * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
     LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
@@ -1021,6 +1043,9 @@ void llm_graph_result::reset() {
     t_logits      = nullptr;
     t_embd        = nullptr;
     t_embd_pooled = nullptr;
+    t_h_nextn     = nullptr;
+    t_h_pre_norm  = nullptr;
+    t_diffusion_token_embd = nullptr;
     t_sampled.clear();
     t_sampled_probs.clear();
     t_sampled_logits.clear();
@@ -1059,6 +1084,16 @@ llm_graph_input_diffusion_self_cond_topk * llm_graph_result::get_inp_diffusion_s
     return nullptr;
 }
 
+llm_graph_input_diffusion_self_cond_embd * llm_graph_result::get_inp_diffusion_self_cond_embd() const {
+    for (auto & input : inputs) {
+        if (auto * embd = dynamic_cast<llm_graph_input_diffusion_self_cond_embd *>(input.get())) {
+            return embd;
+        }
+    }
+
+    return nullptr;
+}
+
 void llm_graph_result::set_outputs() {
     if (t_logits != nullptr) {
         ggml_set_output(t_logits);
@@ -2193,6 +2228,20 @@ llm_graph_input_diffusion_self_cond_topk * llm_graph_context::build_inp_diffusio
     return ptr;
 }
 
+ggml_tensor * llm_graph_context::build_inp_diffusion_self_cond_embd(int64_t n_embd) const {
+    auto inp = std::make_unique<llm_graph_input_diffusion_self_cond_embd>(diffusion);
+
+    auto & cur = inp->embd;
+
+    cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+    ggml_set_input(cur);
+    set_diffusion_input_backend(cur);
+
+    res->add_input(std::move(inp));
+
+    return cur;
+}
+
 ggml_tensor * llm_graph_context::build_inp_pos_bucket_enc() const {
     auto inp = std::make_unique<llm_graph_input_pos_bucket>(hparams);
 
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 838bbce24b86..446e8162cf3c 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -99,6 +99,10 @@ struct llama_diffusion_cond {
     size_t   sc_topk_device_ids_bytes  = 0;
     size_t   sc_topk_device_probs_bytes = 0;
 
+    bool     sc_embd_device_ready = false;
+    void   * sc_embd_device_data  = nullptr;
+    size_t   sc_embd_device_bytes = 0;
+
     bool     canvas_tokens_device_ready = false;
     void   * canvas_tokens_device_data  = nullptr;
     size_t   canvas_tokens_device_bytes = 0;
@@ -351,6 +355,20 @@ class llm_graph_input_diffusion_self_cond_topk : public llm_graph_input_i {
     const llama_diffusion_cond * diffusion;
 };
 
+// Dense diffusion self-conditioning embedding input. This is filled by the CUDA sampler from
+// top-k ids/probs when GGML_CUDA_DIFFUSION_FUSED_SELFCOND_EMBD is enabled.
+class llm_graph_input_diffusion_self_cond_embd : public llm_graph_input_i {
+public:
+    llm_graph_input_diffusion_self_cond_embd(const llama_diffusion_cond * diffusion) : diffusion(diffusion) {}
+    virtual ~llm_graph_input_diffusion_self_cond_embd() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * embd = nullptr; // F32 [n_embd, n_tokens]
+
+    const llama_diffusion_cond * diffusion;
+};
+
 class llm_graph_input_attn_no_cache : public llm_graph_input_i {
 public:
     llm_graph_input_attn_no_cache(const llama_hparams & hparams, const llama_cparams & cparams) :
@@ -796,7 +814,9 @@ class llm_graph_result {
     ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
     ggml_tensor * get_h_nextn()     const { return t_h_nextn; }
     ggml_tensor * get_h_pre_norm()  const { return t_h_pre_norm; }
+    ggml_tensor * get_diffusion_token_embd() const { return t_diffusion_token_embd; }
     llm_graph_input_diffusion_self_cond_topk * get_inp_diffusion_self_cond_topk() const;
+    llm_graph_input_diffusion_self_cond_embd * get_inp_diffusion_self_cond_embd() const;
 
     ggml_cgraph  * get_gf()  const { return gf; }
     ggml_context * get_ctx() const { return ctx_compute.get(); }
@@ -827,6 +847,7 @@ class llm_graph_result {
     ggml_tensor * t_embd_pooled = nullptr;
     ggml_tensor * t_h_nextn     = nullptr; // [n_embd, n_outputs] hidden state before final output norm
     ggml_tensor * t_h_pre_norm  = nullptr; // [n_embd, n_outputs] hidden state before final output norm
+    ggml_tensor * t_diffusion_token_embd = nullptr; // on-device F16 token embedding used by diffusion CUDA helpers
 
     std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
     std::map<llama_seq_id, ggml_tensor*> t_candidates;
@@ -1039,6 +1060,7 @@ struct llm_graph_context {
     ggml_tensor * build_inp_diffusion_self_cond(int64_t n_vocab) const; // F32 [n_vocab, n_tokens]
     // sparse self-cond: returns the input object exposing ->ids (I32 [k,n_tokens]) and ->probs (F32 [k,n_tokens])
     llm_graph_input_diffusion_self_cond_topk * build_inp_diffusion_self_cond_topk(int64_t k) const;
+    ggml_tensor * build_inp_diffusion_self_cond_embd(int64_t n_embd) const; // F32 [n_embd, n_tokens]
     ggml_tensor * build_inp_pos() const;
     ggml_tensor * build_inp_attn_scale() const;
     ggml_tensor * build_inp_out_ids() const;
diff --git a/src/models/diffusion-gemma.cpp b/src/models/diffusion-gemma.cpp
index fe521082e542..b8b1091cbb26 100644
--- a/src/models/diffusion-gemma.cpp
+++ b/src/models/diffusion-gemma.cpp
@@ -3,6 +3,42 @@
 #include "ggml-backend.h"
 #include "ggml-alloc.h"
 
+#include <algorithm>
+#include <cstdlib>
+#include <cstring>
+
+static bool diffusion_gemma_fuse_final_logit_softcap_enabled() {
+    const char * v = std::getenv("GGML_CUDA_DIFFUSION_FUSE_FINAL_SOFTCAP");
+    return v && std::strcmp(v, "0") != 0;
+}
+
+static bool diffusion_gemma_env_enabled_default(const char * name, bool def) {
+    const char * v = std::getenv(name);
+    if (!v) {
+        return def;
+    }
+    return std::strcmp(v, "0") != 0;
+}
+
+static bool diffusion_gemma_fused_selfcond_embd_enabled() {
+    return diffusion_gemma_env_enabled_default("GGML_CUDA_DIFFUSION_FUSED_SELFCOND_EMBD", false) &&
+        diffusion_gemma_env_enabled_default("DG_DEVICE_SELFCOND", true);
+}
+
+static int64_t diffusion_gemma_self_cond_top_k() {
+    const int64_t def = llama_model_diffusion_gemma::N_SC_TOPK;
+    const char * v = std::getenv("GGML_CUDA_DIFFUSION_SC_TOPK");
+    if (!v) {
+        return def;
+    }
+
+    const int64_t k = std::atoll(v);
+    if (k <= 0) {
+        return def;
+    }
+    return std::min(k, def);
+}
+
 // diffusion_gemma reuses the gemma4 decoder block (tensor layout + per-layer math) but runs
 // as a bidirectional (non-causal) block-diffusion denoiser over a canvas, with KV-cache reuse:
 // the prompt / previously-finalized canvases form a causal, read-only prefix in the unified
@@ -123,7 +159,7 @@ void llama_model_diffusion_gemma::load_arch_post(llama_model_loader & ml) {
                            __func__, (long long) n_embd_t, (long long) n_vocab_t,
                            ggml_nbytes(tok_embd_gpu) / (1024.0 * 1024.0 * 1024.0),
                            ggml_backend_buffer_name(tok_embd_gpu->buffer),
-                           (long long) N_SC_TOPK);
+                           (long long) diffusion_gemma_self_cond_top_k());
             return;
         }
         LLAMA_LOG_WARN("%s: failed to allocate on-device gather embedding; falling back to dense matmul\n", __func__);
@@ -186,6 +222,9 @@ ggml_tensor * llama_model_diffusion_gemma::graph_base::build_input(bool is_decod
     const auto & dmodel = static_cast<const llama_model_diffusion_gemma &>(model);
 
     ggml_tensor * inpL = build_inp_embd(dmodel.tok_embd_gpu ? dmodel.tok_embd_gpu : model.tok_embd);
+    if (dmodel.tok_embd_gpu) {
+        res->t_diffusion_token_embd = dmodel.tok_embd_gpu;
+    }
 
     // scaled word embeddings (sqrt(hidden_size)); raw embeddings input is not scaled
     inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
@@ -194,12 +233,14 @@ ggml_tensor * llama_model_diffusion_gemma::graph_base::build_input(bool is_decod
     if (is_decoder) {
         ggml_tensor * soft; // soft-embedding {n_embd, n_tokens}: blend of the previous step's
                             // predicted token embeddings, scaled by sqrt(n_embd)
-        if (dmodel.tok_embd_gpu) {
+        if (dmodel.tok_embd_gpu && diffusion_gemma_fused_selfcond_embd_enabled()) {
+            soft = build_inp_diffusion_self_cond_embd(n_embd);
+        } else if (dmodel.tok_embd_gpu) {
             // Sparse gather path (Option-2): the previous step's top-k token ids+probs are fed per
             // position; gather just those k embedding rows and blend them, instead of the dense
             // full-vocab `probs @ token_embd` matmul. Gather width is fixed (N_SC_TOPK) so the
             // graph shape is constant; unused slots carry prob 0 (the CLI zero-pads).
-            const int64_t k = llama_model_diffusion_gemma::N_SC_TOPK;
+            const int64_t k = diffusion_gemma_self_cond_top_k();
             auto * inp = build_inp_diffusion_self_cond_topk(k);
             ggml_tensor * ids   = inp->ids;                                                // I32 {k*n_tokens}
             ggml_tensor * probs = inp->probs;                                              // F32 {k, n_tokens}
@@ -419,7 +460,10 @@ void llama_model_diffusion_gemma::graph_base::build_transformer(ggml_tensor * in
 
     cur = build_lora_mm(model.output, cur, model.output_s);
 
-    if (hparams.f_final_logit_softcapping) {
+    const bool fuse_final_softcap =
+        diffusion && diffusion->decoder_phase && diffusion_gemma_fuse_final_logit_softcap_enabled();
+
+    if (hparams.f_final_logit_softcapping && !fuse_final_softcap) {
         cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
         cur = ggml_tanh(ctx0, cur);
         cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);

From 89856ffdfcfe339356ea81277dd51cfd7f2dabac Mon Sep 17 00:00:00 2001
From: Loveneet Nigam <lnigam@nvidia.com>
Date: Thu, 11 Jun 2026 10:51:19 +0530
Subject: [PATCH 13/13] diffusion: expose tuning switches as options

---
 common/arg.cpp                                |  95 +++++
 common/common.cpp                             |   5 +
 common/common.h                               |  23 ++
 .../diffusion-gemma/diffusion-gemma-cli.cpp   | 116 ++++--
 .../diffusion-gemma-server.cpp                | 104 +++++-
 ggml/include/ggml-cuda.h                      |   8 +
 ggml/src/ggml-backend.cpp                     |   2 +
 ggml/src/ggml-cuda/diffusion-sampling.cu      | 340 +++++++++++++++---
 ggml/src/ggml-cuda/ggml-cuda.cu               |  10 +-
 ggml/src/ggml-impl.h                          |  18 +
 ggml/src/ggml.c                               |   3 +
 include/llama.h                               |  13 +
 scripts/collect_diffusion_ncu_admin.ps1       | 186 ++++++++++
 src/llama-context.cpp                         |  33 +-
 src/llama-graph.cpp                           |  19 +-
 src/llama-graph.h                             |   8 +-
 src/models/diffusion-gemma.cpp                |  48 +--
 17 files changed, 885 insertions(+), 146 deletions(-)
 create mode 100644 scripts/collect_diffusion_ncu_admin.ps1

diff --git a/common/arg.cpp b/common/arg.cpp
index f5d8fb4acf8b..895a9eaadc69 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3865,6 +3865,101 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.diffusion.visual_mode ? "true" : "false"),
         [](common_params & params) { params.diffusion.visual_mode = true; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--no-diffusion-gpu-sampling"},
+        "disable CUDA block-diffusion sampling fast path",
+        [](common_params & params) { params.diffusion.gpu_sampling = false; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--no-diffusion-device-selfcond"},
+        "disable device-resident block-diffusion self-conditioning",
+        [](common_params & params) { params.diffusion.device_self_cond = false; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--no-diffusion-device-denoise-loop"},
+        "disable device-side block-diffusion canvas and stop-state updates",
+        [](common_params & params) { params.diffusion.device_denoise_loop = false; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-pin-host-outputs"},
+        "register compact diffusion output buffers as pinned host memory",
+        [](common_params & params) { params.diffusion.pin_host_outputs = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-self-cond-top-k"}, "N",
+        string_format("block-diffusion sparse self-conditioning width (default: %d)", params.diffusion.self_cond_top_k),
+        [](common_params & params, int value) { params.diffusion.self_cond_top_k = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-input-gpu-groups"}, "N",
+        string_format("bitmask of block-diffusion decoder input groups assigned to GPU backend (default: %u)", params.diffusion.input_gpu_groups),
+        [](common_params & params, int value) { params.diffusion.input_gpu_groups = (uint32_t) std::max(value, 0); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-default-top-k"}, "N",
+        "block-diffusion top-k used when --top-k is not explicitly provided",
+        [](common_params & params, int value) { params.diffusion.default_top_k = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-force-top-k"}, "N",
+        "block-diffusion server: override per-request top_k when N > 0",
+        [](common_params & params, int value) { params.diffusion.force_top_k = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-fused-self-cond-embd"},
+        "use fused device self-conditioning embedding input for block diffusion",
+        [](common_params & params) { params.diffusion.fused_self_cond_embd = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-fuse-final-softcap"},
+        "move final logit softcap into the CUDA diffusion sampling kernel",
+        [](common_params & params) { params.diffusion.fuse_final_logit_softcap = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-separate-encoder-decoder"},
+        "build separate block-diffusion encoder and decoder graph variants",
+        [](common_params & params) { params.diffusion.separate_encoder_decoder = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-cuda-direct-self-cond"},
+        "write CUDA diffusion self-conditioning directly into decoder graph inputs",
+        [](common_params & params) { params.diffusion.cuda_direct_self_cond = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-cuda-final-tokens-on-stop"},
+        "copy final diffusion tokens only when the device stop condition is reached",
+        [](common_params & params) { params.diffusion.cuda_final_tokens_on_stop = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-cuda-fused-top-k-sample"},
+        "fuse CUDA diffusion top-k selection and sampling",
+        [](common_params & params) { params.diffusion.cuda_fused_top_k_sample = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-cuda-tight-top-k"},
+        "avoid extra CUDA diffusion top-k scratch width when possible",
+        [](common_params & params) { params.diffusion.cuda_tight_top_k = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-cuda-parallel-full-softmax"},
+        "parallelize CUDA diffusion full-vocab sampling when top-k is 0",
+        [](common_params & params) { params.diffusion.cuda_parallel_full_softmax = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-cuda-fused-full-softmax"},
+        "fuse CUDA diffusion full-vocab softmax sampling and self-conditioning",
+        [](common_params & params) { params.diffusion.cuda_fused_full_softmax = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-cuda-top-k-local-k"}, "N",
+        "CUDA diffusion local top-k candidates per thread (0 = backend default)",
+        [](common_params & params, int value) { params.diffusion.cuda_top_k_local_k = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--no-diffusion-cuda-fast-top-k"},
+        "disable CUDA diffusion CUB/fast top-k selection path",
+        [](common_params & params) { params.diffusion.cuda_fast_top_k = false; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
         {"--top-k-start"}, "N",
         "block-diffusion: anneal top-k from N at the first (high-entropy) denoising step (with --top-k-end)",
diff --git a/common/common.cpp b/common/common.cpp
index b01772e1cbfe..23209395dbef 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1585,6 +1585,11 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.op_offload        = !params.no_op_offload;
     cparams.swa_full          = params.swa_full;
     cparams.kv_unified        = params.kv_unified;
+    cparams.diffusion_self_cond_top_k = params.diffusion.self_cond_top_k;
+    cparams.diffusion_input_gpu_groups = params.diffusion.input_gpu_groups;
+    cparams.diffusion_fused_self_cond_embd = params.diffusion.fused_self_cond_embd;
+    cparams.diffusion_fuse_final_logit_softcap = params.diffusion.fuse_final_logit_softcap;
+    cparams.diffusion_separate_encoder_decoder = params.diffusion.separate_encoder_decoder;
 
     cparams.type_k = params.cache_type_k;
     cparams.type_v = params.cache_type_v;
diff --git a/common/common.h b/common/common.h
index b85ac5701e63..d702febf2137 100644
--- a/common/common.h
+++ b/common/common.h
@@ -381,6 +381,10 @@ struct common_params_vocoder {
 struct common_params_diffusion {
     int32_t steps         = 128;
     bool    visual_mode   = false;
+    bool    gpu_sampling  = true;     // use CUDA diffusion sampling fast path when available
+    bool    device_self_cond = true;  // keep diffusion self-conditioning state on device
+    bool    device_denoise_loop = true; // update diffusion canvas/stop state on device
+    bool    pin_host_outputs = false; // register compact D2H output buffers as pinned host memory
 
     float   eps           = 0;        // epsilon for timesteps
     int32_t block_length  = 0;        // block length for generation
@@ -395,6 +399,25 @@ struct common_params_diffusion {
     int32_t top_k_start        = 0;     // anneal top-k from this (first/high-entropy step) ...
     int32_t top_k_end          = 0;     // ... to this (last step); both > 0 enables annealing
     bool    top_k_tail_correction = false; // use exact full-vocab entropy for accept/stop
+    int32_t default_top_k      = 0;     // top-k used when --top-k is not explicitly provided
+    int32_t force_top_k        = 0;     // server: override per-request top_k when > 0
+    int32_t self_cond_top_k    = 256;   // sparse self-conditioning gather width
+    uint32_t input_gpu_groups  = 63;    // decoder input tensor groups assigned to GPU backend
+
+    // CUDA diffusion sampling fast-path knobs. Defaults preserve behavior when no tuning flags are passed.
+    bool    cuda_fast_top_k                = true;
+    bool    cuda_direct_self_cond          = false;
+    bool    cuda_final_tokens_on_stop      = false;
+    bool    cuda_fused_top_k_sample        = false;
+    bool    cuda_tight_top_k               = false;
+    bool    cuda_parallel_full_softmax     = false;
+    bool    cuda_fused_full_softmax        = false;
+    int32_t cuda_top_k_local_k             = 0;     // 0 = backend default
+
+    // Diffusion graph-shape knobs.
+    bool    fused_self_cond_embd       = false;
+    bool    fuse_final_logit_softcap   = false;
+    bool    separate_encoder_decoder   = false;
 };
 
 // reasoning API response format (not to be confused as chat template's reasoning format)
diff --git a/examples/diffusion-gemma/diffusion-gemma-cli.cpp b/examples/diffusion-gemma/diffusion-gemma-cli.cpp
index cd320009f7fa..20aff7e7b6e1 100644
--- a/examples/diffusion-gemma/diffusion-gemma-cli.cpp
+++ b/examples/diffusion-gemma/diffusion-gemma-cli.cpp
@@ -21,11 +21,15 @@
 #include "log.h"
 #include "mtmd.h"
 #include "mtmd-helper.h"
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
 
 #include <algorithm>
 #include <chrono>
 #include <cmath>
 #include <cstdio>
+#include <cstdlib>
 #include <functional>
 #include <numeric>
 #include <random>
@@ -42,14 +46,34 @@ static constexpr float CONFIDENCE_THRESHOLD   = 0.005f; // StableAndConfident.co
 static constexpr int   STABILITY_THRESHOLD    = 1;      // StableAndConfident.stability_threshold
 static constexpr int   GPU_SAMPLING_MAX_TOP_K = 1024;   // small top-k sort kernel limit
 
-static int env_int(const char * name, int def) {
-    const char * v = getenv(name);
-    return v ? atoi(v) : def;
-}
+#ifdef GGML_USE_CUDA
+struct diffusion_cuda_host_pin {
+    diffusion_cuda_host_pin(void * ptr, size_t size, bool enabled) : ptr(ptr), size(size) {
+        registered = enabled && ptr && size && ggml_backend_cuda_register_host_buffer(ptr, size);
+    }
+
+    ~diffusion_cuda_host_pin() {
+        if (registered) {
+            ggml_backend_cuda_unregister_host_buffer(ptr);
+        }
+    }
+
+    diffusion_cuda_host_pin(const diffusion_cuda_host_pin &) = delete;
+    diffusion_cuda_host_pin & operator=(const diffusion_cuda_host_pin &) = delete;
 
-static int diffusion_self_cond_top_k() {
+    void * ptr = nullptr;
+    size_t size = 0;
+    bool registered = false;
+};
+#else
+struct diffusion_cuda_host_pin {
+    diffusion_cuda_host_pin(void *, size_t, bool) {}
+};
+#endif
+
+static int diffusion_self_cond_top_k(const common_params & params) {
     constexpr int def = 256;
-    const int k = env_int("GGML_CUDA_DIFFUSION_SC_TOPK", def);
+    const int k = params.diffusion.self_cond_top_k;
     if (k <= 0) {
         return def;
     }
@@ -68,22 +92,44 @@ static std::string format_chat(llama_model * model, const std::string & prompt)
     return common_chat_templates_apply(tmpls.get(), inputs).prompt;
 }
 
+static void diffusion_gemma_print_usage(int, char **) {
+    printf("\nDiffusion-Gemma options:\n");
+    printf("  --diffusion-timing                    print diffusion decode/sample timing breakdown\n");
+}
+
 int main(int argc, char ** argv) {
+    bool log_step_timing = false;
+    std::vector<char *> fwd;
+    fwd.push_back(argv[0]);
+    for (int i = 1; i < argc; ++i) {
+        const std::string arg = argv[i];
+        if (arg == "--diffusion-timing") {
+            log_step_timing = true;
+        } else {
+            fwd.push_back(argv[i]);
+        }
+    }
+
     common_params params;
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DIFFUSION)) {
+    params.diffusion.steps = DEF_MAX_DENOISE_STEPS;
+    if (!common_params_parse((int) fwd.size(), fwd.data(), params, LLAMA_EXAMPLE_DIFFUSION, diffusion_gemma_print_usage)) {
         return 1;
     }
     common_init();
+    if (!params.diffusion.device_self_cond && params.diffusion.fused_self_cond_embd) {
+        LOG_ERR("--no-diffusion-device-selfcond cannot be used with --diffusion-fused-self-cond-embd\n");
+        return 1;
+    }
 
-    // diffusion config (env-overridable for quick CPU testing)
-    // canvas_length is fixed at the trained block size (256); overriding it is for experiments only.
-    const int   canvas_length = env_int("DG_CANVAS", DEF_CANVAS_LENGTH);
-    const int   n_steps       = env_int("DG_STEPS", DEF_MAX_DENOISE_STEPS);
+    // diffusion config
+    // canvas_length is fixed at the trained block size (256).
+    const int   canvas_length = DEF_CANVAS_LENGTH;
+    const int   n_steps       = std::max(params.diffusion.steps, 1);
     // number of autoregressive canvas blocks = ceil(-n / canvas_length), i.e. -n is the total
     // number of tokens to generate (e.g. -n 256 -> 1 canvas, -n 512 -> 2, -n 1024 -> 4). The
-    // model may stop earlier on an EOG token. DG_MAX_CANVASES overrides; default (no -n) is 1.
+    // model may stop earlier on an EOG token; default (no -n) is 1 canvas.
     const int   blocks_from_n = params.n_predict > 0 ? (params.n_predict + canvas_length - 1) / canvas_length : 1;
-    const int   max_canvases  = std::max(env_int("DG_MAX_CANVASES", blocks_from_n), 1); // autoregressive blocks
+    const int   max_canvases  = std::max(blocks_from_n, 1); // autoregressive blocks
     const float entropy_bound = ENTROPY_BOUND;
 
     // top-k host sampling (CLI flags; default = full softmax over the whole vocab):
@@ -94,10 +140,11 @@ int main(int argc, char ** argv) {
     // --top-k uses its own "0 = disabled" convention and is applied only when explicitly passed.
     const int topk_fixed = (params.sampling.user_sampling_config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K)
                          ? params.sampling.top_k
-                         : 0;
+                         : std::max(params.diffusion.default_top_k, 0);
     const int topk_start = params.diffusion.top_k_start;
     const int topk_end   = params.diffusion.top_k_end;
     const int topk_tail  = params.diffusion.top_k_tail_correction ? 1 : 0;
+    const int SC_K       = diffusion_self_cond_top_k(params);
 
     llama_backend_init();
 
@@ -198,6 +245,11 @@ int main(int argc, char ** argv) {
     ctx_params.n_batch  = n_ub;
     ctx_params.n_ubatch = n_ub;
     ctx_params.no_perf  = params.no_perf;
+    ctx_params.diffusion_self_cond_top_k = SC_K;
+    ctx_params.diffusion_input_gpu_groups = params.diffusion.input_gpu_groups;
+    ctx_params.diffusion_fused_self_cond_embd = params.diffusion.fused_self_cond_embd;
+    ctx_params.diffusion_fuse_final_logit_softcap = params.diffusion.fuse_final_logit_softcap;
+    ctx_params.diffusion_separate_encoder_decoder = params.diffusion.separate_encoder_decoder;
 
     llama_context * ctx = llama_init_from_model(model, ctx_params);
     if (!ctx) {
@@ -211,13 +263,13 @@ int main(int argc, char ** argv) {
 
     const int topk_max_requested =
         (topk_start > 0 && topk_end > 0) ? std::max(topk_start, topk_end) : topk_fixed;
-    const bool gpu_sampling_requested = env_int("DG_GPU_SAMPLING", 1) != 0;
+    const bool gpu_sampling_requested = params.diffusion.gpu_sampling;
     const bool gpu_sampling_topk_ok = topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K;
     const bool use_gpu_sampling = gpu_sampling_requested &&
                                   gpu_sampling_topk_ok &&
                                   llama_diffusion_sample_topk_supported(ctx);
-    const bool use_device_self_cond = use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0;
-    const bool use_device_loop      = use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0;
+    const bool use_device_self_cond = use_gpu_sampling && params.diffusion.device_self_cond;
+    const bool use_device_loop      = use_device_self_cond && params.diffusion.device_denoise_loop;
     const int device_early_stop_interval = use_device_loop ? 1 : 0;
     llama_set_diffusion_gpu_sampling(ctx, use_gpu_sampling);
 
@@ -226,7 +278,7 @@ int main(int argc, char ** argv) {
     LOG_INF("diffusion-gemma: gpu sampling: %s%s%s%s\n",
             use_gpu_sampling ? "on" : "off",
             (!gpu_sampling_topk_ok ? " (top-k exceeds CUDA fast-path limit)" :
-             (!gpu_sampling_requested ? " (disabled by DG_GPU_SAMPLING=0)" : "")),
+             (!gpu_sampling_requested ? " (disabled by --no-diffusion-gpu-sampling)" : "")),
             use_device_self_cond ? " | device self-cond: on" : "",
             use_device_loop ? " | device loop: on" : "");
     if (device_early_stop_interval > 0) {
@@ -292,16 +344,21 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> argmax_canvas(canvas_length, -1);
     std::vector<llama_token> prev_argmax(canvas_length, -1);
     std::vector<llama_token> accepted(canvas_length);
+    int32_t stop_flag = 0;
     // sparse self-conditioning over the canvas: the previous step's top-SC_K token ids + their
     // (renormalized) softmax probabilities per position. Fed to the next decode (Option-2 graph
     // gather: the decoder gathers just these SC_K embedding rows and blends them, instead of a
     // dense full-vocab probs @ token_embd matmul). Zero probs => no self-conditioning (step 1).
     // SC_K must match the graph's fixed gather width. Default is 256; use
-    // GGML_CUDA_DIFFUSION_SC_TOPK for gated experiments with a smaller sparse blend.
-    const int SC_K = diffusion_self_cond_top_k();
+    // --diffusion-self-cond-top-k can be used for gated experiments with a smaller sparse blend.
     std::vector<int32_t> sc_ids ((size_t) SC_K * canvas_length, 0);
     std::vector<float>   sc_probs((size_t) SC_K * canvas_length, 0.0f);
 
+    const bool pin_host_outputs = params.diffusion.pin_host_outputs;
+    diffusion_cuda_host_pin pin_argmax_canvas(
+            argmax_canvas.data(), argmax_canvas.size() * sizeof(argmax_canvas[0]), pin_host_outputs);
+    diffusion_cuda_host_pin pin_stop_flag(&stop_flag, sizeof(stop_flag), pin_host_outputs);
+
     // all generated tokens across the autoregressive canvas blocks
     std::vector<llama_token> generated;
 
@@ -310,7 +367,6 @@ int main(int argc, char ** argv) {
     int n_blocks_run  = 0;
     int n_steps_total = 0;
     const auto t_gen_start = std::chrono::steady_clock::now();
-    const bool log_step_timing = env_int("DG_TIMING", 0) != 0;
     double decode_enqueue_s = 0.0;
     double sample_sync_s    = 0.0;
     double host_loop_s      = 0.0;
@@ -373,7 +429,7 @@ int main(int argc, char ** argv) {
             const bool use_device_early_stop = device_early_stop_interval > 0;
             const bool reset_stop_state = use_device_early_stop && block_step == 1;
             const bool check_stop = use_device_early_stop && !final_step;
-            int32_t stop_flag = 0;
+            stop_flag = 0;
             const auto t_sample_start = std::chrono::steady_clock::now();
             llama_diffusion_sample_params sample_params = {
                 /* .n_tokens              = */ canvas_length,
@@ -383,6 +439,14 @@ int main(int argc, char ** argv) {
                 /* .seed                  = */ params.sampling.seed == LLAMA_DEFAULT_SEED ? 1234u : params.sampling.seed,
                 /* .step                  = */ (uint32_t) n_steps_total,
                 /* .top_k_tail_correction = */ topk_tail != 0,
+                /* .cuda_fast_top_k       = */ params.diffusion.cuda_fast_top_k,
+                /* .cuda_direct_self_cond = */ params.diffusion.cuda_direct_self_cond,
+                /* .cuda_final_tokens_on_stop = */ params.diffusion.cuda_final_tokens_on_stop,
+                /* .cuda_fused_top_k_sample = */ params.diffusion.cuda_fused_top_k_sample,
+                /* .cuda_tight_top_k      = */ params.diffusion.cuda_tight_top_k,
+                /* .cuda_parallel_full_softmax = */ params.diffusion.cuda_parallel_full_softmax,
+                /* .cuda_fused_full_softmax = */ params.diffusion.cuda_fused_full_softmax,
+                /* .cuda_top_k_local_k    = */ params.diffusion.cuda_top_k_local_k,
             };
             llama_diffusion_sample_result sample_result = {
                 /* .sampled         = */ nullptr,
@@ -433,6 +497,14 @@ int main(int argc, char ** argv) {
                 /* .seed                  = */ params.sampling.seed == LLAMA_DEFAULT_SEED ? 1234u : params.sampling.seed,
                 /* .step                  = */ (uint32_t) n_steps_total,
                 /* .top_k_tail_correction = */ topk_tail != 0,
+                /* .cuda_fast_top_k       = */ params.diffusion.cuda_fast_top_k,
+                /* .cuda_direct_self_cond = */ params.diffusion.cuda_direct_self_cond,
+                /* .cuda_final_tokens_on_stop = */ params.diffusion.cuda_final_tokens_on_stop,
+                /* .cuda_fused_top_k_sample = */ params.diffusion.cuda_fused_top_k_sample,
+                /* .cuda_tight_top_k      = */ params.diffusion.cuda_tight_top_k,
+                /* .cuda_parallel_full_softmax = */ params.diffusion.cuda_parallel_full_softmax,
+                /* .cuda_fused_full_softmax = */ params.diffusion.cuda_fused_full_softmax,
+                /* .cuda_top_k_local_k    = */ params.diffusion.cuda_top_k_local_k,
             };
             llama_diffusion_sample_result sample_result = {
                 /* .sampled         = */ sampled.data(),
diff --git a/examples/diffusion-gemma/diffusion-gemma-server.cpp b/examples/diffusion-gemma/diffusion-gemma-server.cpp
index 9b49f55b8d25..97f5b99ce65b 100644
--- a/examples/diffusion-gemma/diffusion-gemma-server.cpp
+++ b/examples/diffusion-gemma/diffusion-gemma-server.cpp
@@ -18,6 +18,9 @@
 #include "common.h"
 #include "llama.h"
 #include "log.h"
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
 
 #include <cpp-httplib/httplib.h>
 #include <nlohmann/json.hpp>
@@ -57,13 +60,33 @@ static constexpr int   STABILITY_THRESHOLD  = 1;
 static constexpr int   DEF_SC_K             = 256;
 static constexpr int   GPU_SAMPLING_MAX_TOP_K = 1024; // CUDA diffusion sampler limit
 
-static int env_int(const char * name, int def) {
-    const char * v = getenv(name);
-    return v ? atoi(v) : def;
-}
+#ifdef GGML_USE_CUDA
+struct diffusion_cuda_host_pin {
+    diffusion_cuda_host_pin(void * ptr, size_t size, bool enabled) : ptr(ptr), size(size) {
+        registered = enabled && ptr && size && ggml_backend_cuda_register_host_buffer(ptr, size);
+    }
+
+    ~diffusion_cuda_host_pin() {
+        if (registered) {
+            ggml_backend_cuda_unregister_host_buffer(ptr);
+        }
+    }
+
+    diffusion_cuda_host_pin(const diffusion_cuda_host_pin &) = delete;
+    diffusion_cuda_host_pin & operator=(const diffusion_cuda_host_pin &) = delete;
+
+    void * ptr = nullptr;
+    size_t size = 0;
+    bool registered = false;
+};
+#else
+struct diffusion_cuda_host_pin {
+    diffusion_cuda_host_pin(void *, size_t, bool) {}
+};
+#endif
 
-static int diffusion_self_cond_top_k() {
-    const int k = env_int("GGML_CUDA_DIFFUSION_SC_TOPK", DEF_SC_K);
+static int diffusion_self_cond_top_k(const common_params & params) {
+    const int k = params.diffusion.self_cond_top_k;
     if (k <= 0) {
         return DEF_SC_K;
     }
@@ -170,6 +193,7 @@ struct diffusion_server {
     bool   use_device_self_cond = false;
     bool   use_device_loop = false;
     int    device_early_stop_interval = 0;
+    common_params_diffusion diffusion;
     std::string model_id;
     std::string model_path;
     std::string build_info = "llama.cpp diffusion-gemma-server";
@@ -244,11 +268,17 @@ struct diffusion_server {
         std::vector<llama_token> argmax_canvas(canvas_length, -1);
         std::vector<llama_token> prev_argmax(canvas_length, -1);
         std::vector<llama_token> accepted(canvas_length);
-        const int SC_K = diffusion_self_cond_top_k();
+        int32_t stop_flag = 0;
+        const int SC_K = diffusion.self_cond_top_k;
         std::vector<int32_t>     sc_ids ((size_t) SC_K * canvas_length, 0);
         std::vector<float>       sc_probs((size_t) SC_K * canvas_length, 0.0f);
         std::vector<llama_token> generated;
 
+        const bool pin_host_outputs = diffusion.pin_host_outputs;
+        diffusion_cuda_host_pin pin_argmax_canvas(
+                argmax_canvas.data(), argmax_canvas.size() * sizeof(argmax_canvas[0]), pin_host_outputs);
+        diffusion_cuda_host_pin pin_stop_flag(&stop_flag, sizeof(stop_flag), pin_host_outputs);
+
         int n_blocks_run = 0, n_steps_total = 0;
         bool done = false;
         const int topk_max_requested =
@@ -306,7 +336,7 @@ struct diffusion_server {
                     const bool use_device_early_stop = device_early_stop_interval > 0;
                     const bool reset_stop_state = use_device_early_stop && block_step == 1;
                     const bool check_stop = use_device_early_stop && !final_step;
-                    int32_t stop_flag = 0;
+                    stop_flag = 0;
                     llama_diffusion_sample_params sample_params = {
                         /* .n_tokens              = */ canvas_length,
                         /* .top_k                 = */ k_step,
@@ -315,6 +345,14 @@ struct diffusion_server {
                         /* .seed                  = */ rq.seed,
                         /* .step                  = */ (uint32_t) n_steps_total,
                         /* .top_k_tail_correction = */ rq.topk_tail,
+                        /* .cuda_fast_top_k       = */ diffusion.cuda_fast_top_k,
+                        /* .cuda_direct_self_cond = */ diffusion.cuda_direct_self_cond,
+                        /* .cuda_final_tokens_on_stop = */ diffusion.cuda_final_tokens_on_stop,
+                        /* .cuda_fused_top_k_sample = */ diffusion.cuda_fused_top_k_sample,
+                        /* .cuda_tight_top_k      = */ diffusion.cuda_tight_top_k,
+                        /* .cuda_parallel_full_softmax = */ diffusion.cuda_parallel_full_softmax,
+                        /* .cuda_fused_full_softmax = */ diffusion.cuda_fused_full_softmax,
+                        /* .cuda_top_k_local_k    = */ diffusion.cuda_top_k_local_k,
                     };
                     llama_diffusion_sample_result sample_result = {
                         /* .sampled         = */ nullptr,
@@ -359,6 +397,14 @@ struct diffusion_server {
                         /* .seed                  = */ rq.seed,
                         /* .step                  = */ (uint32_t) n_steps_total,
                         /* .top_k_tail_correction = */ rq.topk_tail,
+                        /* .cuda_fast_top_k       = */ diffusion.cuda_fast_top_k,
+                        /* .cuda_direct_self_cond = */ diffusion.cuda_direct_self_cond,
+                        /* .cuda_final_tokens_on_stop = */ diffusion.cuda_final_tokens_on_stop,
+                        /* .cuda_fused_top_k_sample = */ diffusion.cuda_fused_top_k_sample,
+                        /* .cuda_tight_top_k      = */ diffusion.cuda_tight_top_k,
+                        /* .cuda_parallel_full_softmax = */ diffusion.cuda_parallel_full_softmax,
+                        /* .cuda_fused_full_softmax = */ diffusion.cuda_fused_full_softmax,
+                        /* .cuda_top_k_local_k    = */ diffusion.cuda_top_k_local_k,
                     };
                     llama_diffusion_sample_result sample_result = {
                         /* .sampled         = */ sampled.data(),
@@ -627,6 +673,11 @@ static diffusion_request request_from_body(const json & body, const diffusion_se
     if (body.contains("max_tokens") && body["max_tokens"].is_number_integer())                       max_tokens = body["max_tokens"].get<int>();
     else if (body.contains("max_completion_tokens") && body["max_completion_tokens"].is_number_integer()) max_tokens = body["max_completion_tokens"].get<int>();
     rq.max_canvases = max_tokens > 0 ? (max_tokens + rq.canvas_length - 1) / rq.canvas_length : 1;
+    if (body.contains("diffusion_steps") && body["diffusion_steps"].is_number_integer()) {
+        rq.n_steps = std::max(body["diffusion_steps"].get<int>(), 1);
+    } else if (body.contains("n_denoise_steps") && body["n_denoise_steps"].is_number_integer()) {
+        rq.n_steps = std::max(body["n_denoise_steps"].get<int>(), 1);
+    }
     if (body.contains("top_k") && body["top_k"].is_number_integer()) {
         rq.topk_fixed = body["top_k"].get<int>();
         rq.topk_start = 0;
@@ -637,6 +688,12 @@ static diffusion_request request_from_body(const json & body, const diffusion_se
     if (body.contains("top_k_tail_correction") && body["top_k_tail_correction"].is_boolean()) {
         rq.topk_tail = body["top_k_tail_correction"].get<bool>();
     }
+    const int forced_top_k = std::max(srv.diffusion.force_top_k, 0);
+    if (forced_top_k > 0) {
+        rq.topk_fixed = forced_top_k;
+        rq.topk_start = 0;
+        rq.topk_end   = 0;
+    }
     if (body.contains("seed")  && body["seed"].is_number_integer()) rq.seed = (uint32_t) body["seed"].get<int64_t>();
     if (body.contains("ignore_eos") && body["ignore_eos"].is_boolean()) rq.ignore_eos = body["ignore_eos"].get<bool>();
     return rq;
@@ -669,10 +726,16 @@ int main(int argc, char ** argv) {
     }
 
     common_params params;
+    params.diffusion.steps = DEF_MAX_DENOISE_STEPS;
     if (!common_params_parse((int) fwd.size(), fwd.data(), params, LLAMA_EXAMPLE_DIFFUSION)) {
         return 1;
     }
     common_init();
+    if (!params.diffusion.device_self_cond && params.diffusion.fused_self_cond_embd) {
+        SRV_ERR("%s\n", "--no-diffusion-device-selfcond cannot be used with --diffusion-fused-self-cond-embd");
+        return 1;
+    }
+    params.diffusion.self_cond_top_k = diffusion_self_cond_top_k(params);
 
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -699,13 +762,14 @@ int main(int argc, char ** argv) {
     srv.vocab         = llama_model_get_vocab(srv.model);
     srv.n_vocab       = llama_vocab_n_tokens(srv.vocab);
     srv.canvas_length = DEF_CANVAS_LENGTH;
-    srv.n_steps       = DEF_MAX_DENOISE_STEPS;
+    srv.n_steps       = std::max(params.diffusion.steps, 1);
+    srv.diffusion     = params.diffusion;
     srv.n_ub          = srv.canvas_length;
     srv.n_ctx         = params.n_ctx > 0 ? (int) params.n_ctx : 4096;
     if (srv.n_ctx < 2 * srv.canvas_length) srv.n_ctx = 2 * srv.canvas_length;
     srv.topk_fixed    = (params.sampling.user_sampling_config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K)
                             ? params.sampling.top_k
-                            : 0;
+                            : std::max(params.diffusion.default_top_k, 0);
     srv.topk_start    = params.diffusion.top_k_start;
     srv.topk_end      = params.diffusion.top_k_end;
     srv.topk_tail     = params.diffusion.top_k_tail_correction;
@@ -726,6 +790,11 @@ int main(int argc, char ** argv) {
     ctx_params.n_batch  = srv.n_ub;
     ctx_params.n_ubatch = srv.n_ub;
     ctx_params.no_perf  = params.no_perf;
+    ctx_params.diffusion_self_cond_top_k = srv.diffusion.self_cond_top_k;
+    ctx_params.diffusion_input_gpu_groups = srv.diffusion.input_gpu_groups;
+    ctx_params.diffusion_fused_self_cond_embd = srv.diffusion.fused_self_cond_embd;
+    ctx_params.diffusion_fuse_final_logit_softcap = srv.diffusion.fuse_final_logit_softcap;
+    ctx_params.diffusion_separate_encoder_decoder = srv.diffusion.separate_encoder_decoder;
 
     srv.ctx = llama_init_from_model(srv.model, ctx_params);
     if (!srv.ctx) {
@@ -740,11 +809,11 @@ int main(int argc, char ** argv) {
 
     const int topk_max_requested =
         (srv.topk_start > 0 && srv.topk_end > 0) ? std::max(srv.topk_start, srv.topk_end) : srv.topk_fixed;
-    const bool gpu_sampling_requested = env_int("DG_GPU_SAMPLING", 1) != 0;
+    const bool gpu_sampling_requested = params.diffusion.gpu_sampling;
     srv.use_gpu_sampling = gpu_sampling_requested &&
                            llama_diffusion_sample_topk_supported(srv.ctx);
-    srv.use_device_self_cond = srv.use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0;
-    srv.use_device_loop = srv.use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0;
+    srv.use_device_self_cond = srv.use_gpu_sampling && params.diffusion.device_self_cond;
+    srv.use_device_loop = srv.use_device_self_cond && params.diffusion.device_denoise_loop;
     srv.device_early_stop_interval = srv.use_device_loop ? 1 : 0;
     llama_set_diffusion_gpu_sampling(srv.ctx, srv.use_gpu_sampling);
     const bool default_topk_gpu_ok = topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K;
@@ -757,13 +826,16 @@ int main(int argc, char ** argv) {
     SRV_INF("gpu sampling: %s%s%s%s | top-k fixed=%d anneal=[%d->%d] tail_correction=%d\n",
             srv.use_gpu_sampling ? "on" : "off",
             (!default_topk_gpu_ok ? " (default top-k will use CPU fallback until k <= CUDA limit)" :
-             (!gpu_sampling_requested ? " (disabled by DG_GPU_SAMPLING=0)" : "")),
+             (!gpu_sampling_requested ? " (disabled by --no-diffusion-gpu-sampling)" : "")),
             srv.use_device_self_cond ? " | device self-cond: on" : "",
             srv.use_device_loop ? " | device loop: on" : "",
             srv.topk_fixed, srv.topk_start, srv.topk_end, srv.topk_tail ? 1 : 0);
     if (srv.device_early_stop_interval > 0) {
         SRV_INF("device early-stop interval: %d\n", srv.device_early_stop_interval);
     }
+    if (srv.diffusion.force_top_k > 0) {
+        SRV_INF("forcing request top-k to %d via --diffusion-force-top-k\n", srv.diffusion.force_top_k);
+    }
 
     // ------------------------------------------------------------------------------------------
     // HTTP server
@@ -831,7 +903,7 @@ int main(int argc, char ** argv) {
                 {"entropy_bound", ENTROPY_BOUND},
                 {"confidence_threshold", CONFIDENCE_THRESHOLD},
                 {"stability_threshold", STABILITY_THRESHOLD},
-                {"self_cond_topk", diffusion_self_cond_top_k()},
+                {"self_cond_topk", srv.diffusion.self_cond_top_k},
                 {"gpu_sampling", srv.use_gpu_sampling},
                 {"device_self_cond", srv.use_device_self_cond},
                 {"device_loop", srv.use_device_loop},
@@ -940,7 +1012,7 @@ int main(int argc, char ** argv) {
                 {"params", {
                     {"canvas_length", srv.canvas_length},
                     {"n_denoise_steps", srv.n_steps},
-                    {"self_cond_topk", diffusion_self_cond_top_k()},
+                    {"self_cond_topk", srv.diffusion.self_cond_top_k},
                     {"gpu_sampling", srv.use_gpu_sampling},
                     {"device_self_cond", srv.use_device_self_cond},
                     {"device_loop", srv.use_device_loop},
diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h
index f8990bea4c10..0a72747c3838 100644
--- a/ggml/include/ggml-cuda.h
+++ b/ggml/include/ggml-cuda.h
@@ -53,6 +53,14 @@ struct ggml_cuda_diffusion_sample_params {
     uint32_t step;
     bool     top_k_tail_correction;
     float    logit_softcap;
+    bool     fast_top_k;
+    bool     direct_self_cond;
+    bool     final_tokens_on_stop;
+    bool     fused_top_k_sample;
+    bool     tight_top_k;
+    bool     parallel_full_softmax;
+    bool     fused_full_softmax;
+    int32_t  top_k_local_k;
 };
 
 struct ggml_cuda_diffusion_sample_result {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 87615921c09b..bbdbc40a0513 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1405,6 +1405,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
     }
     sched->graph.n_nodes = 0;
     sched->graph.n_leafs = 0;
+    sched->graph.flags = graph->flags;
 
     struct ggml_cgraph * graph_copy = &sched->graph;
 
@@ -2134,6 +2135,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
         graph_copy->nodes[i] = node_copy;
     }
     graph_copy->n_nodes = graph->n_nodes;
+    graph_copy->flags = graph->flags;
 
     ggml_hash_set_free(&hash_set);
     free(node_copies);
diff --git a/ggml/src/ggml-cuda/diffusion-sampling.cu b/ggml/src/ggml-cuda/diffusion-sampling.cu
index 2306e68558a0..93a66ad3aac2 100644
--- a/ggml/src/ggml-cuda/diffusion-sampling.cu
+++ b/ggml/src/ggml-cuda/diffusion-sampling.cu
@@ -4,8 +4,6 @@
 
 #include <algorithm>
 #include <cfloat>
-#include <cstdlib>
-#include <cstring>
 #include <map>
 #include <mutex>
 
@@ -26,35 +24,9 @@ static int next_power_of_2_host(int x) {
     return n;
 }
 
-static bool diffusion_fast_topk_enabled() {
-    const char * v = std::getenv("GGML_CUDA_DIFFUSION_FAST_TOPK");
-    return !v || std::strcmp(v, "0") != 0;
-}
-
-static bool diffusion_direct_selfcond_enabled() {
-    const char * v = std::getenv("GGML_CUDA_DIFFUSION_DIRECT_SELFCOND");
-    return v && std::strcmp(v, "0") != 0;
-}
-
-static bool diffusion_final_tokens_on_stop_enabled() {
-    const char * v = std::getenv("GGML_CUDA_DIFFUSION_FINAL_TOKENS_ON_STOP");
-    return v && std::strcmp(v, "0") != 0;
-}
-
-static bool diffusion_fused_topk_sample_enabled() {
-    const char * v = std::getenv("GGML_CUDA_DIFFUSION_FUSED_TOPK_SAMPLE");
-    return v && std::strcmp(v, "0") != 0;
-}
-
-static bool diffusion_tight_topk_enabled() {
-    const char * v = std::getenv("GGML_CUDA_DIFFUSION_TIGHT_TOPK");
-    return v && std::strcmp(v, "0") != 0;
-}
-
-static int diffusion_topk_local_k(const int heap_k) {
-    int requested = heap_k <= 256 ? 8 : 16;
-    if (const char * v = std::getenv("GGML_CUDA_DIFFUSION_TOPK_LOCAL_K")) {
-        requested = std::atoi(v);
+static int diffusion_topk_local_k(const int heap_k, int requested) {
+    if (requested <= 0) {
+        requested = heap_k <= 256 ? 8 : 16;
     }
 
     const int min_local_k = std::max(1, (heap_k + 255) / 256);
@@ -227,9 +199,10 @@ static void diffusion_select_topk_local(
         const int n_vocab,
         const int n_tokens,
         const int heap_k,
+        const int top_k_local_k,
         cudaStream_t stream) {
     constexpr int block_size = 256;
-    const int local_k = diffusion_topk_local_k(heap_k);
+    const int local_k = diffusion_topk_local_k(heap_k, top_k_local_k);
 
     switch (local_k) {
         case 1: {
@@ -612,6 +585,7 @@ static void diffusion_sample_topk_fused_local(
         const int n_tokens,
         const int top_k,
         const int sc_k,
+        const int top_k_local_k,
         const float inv_temp,
         const float logit_softcap,
         const uint32_t seed,
@@ -623,7 +597,7 @@ static void diffusion_sample_topk_fused_local(
         float * sc_probs,
         cudaStream_t stream) {
     constexpr int block_size = 256;
-    const int local_k = diffusion_topk_local_k(top_k);
+    const int local_k = diffusion_topk_local_k(top_k, top_k_local_k);
 
     switch (local_k) {
         case 1: {
@@ -669,6 +643,243 @@ static void diffusion_sample_topk_fused_local(
     }
 }
 
+template<int LOCAL_K>
+static __global__ void diffusion_sample_full_softmax_fused_local_kernel(
+        const float * __restrict__ logits,
+        const int n_vocab,
+        const int n_tokens,
+        const int sc_k,
+        const float inv_temp,
+        const float logit_softcap,
+        const uint32_t seed,
+        const uint32_t step,
+        int * __restrict__ sampled,
+        int * __restrict__ argmax,
+        float * __restrict__ entropy,
+        int * __restrict__ sc_ids,
+        float * __restrict__ sc_probs) {
+    constexpr int block_size = 256;
+    constexpr int candidate_count = LOCAL_K * block_size;
+
+    const int row = blockIdx.x;
+    const int tid = threadIdx.x;
+    if (row >= n_tokens) {
+        return;
+    }
+
+    float vals[LOCAL_K];
+    int ids[LOCAL_K];
+#pragma unroll
+    for (int i = 0; i < LOCAL_K; ++i) {
+        vals[i] = -FLT_MAX;
+        ids[i] = 0;
+    }
+
+    const float * row_logits = logits + (size_t) row * n_vocab;
+    float local_max = -FLT_MAX;
+    int local_idx = 0;
+    for (int v = tid; v < n_vocab; v += blockDim.x) {
+        const float raw = row_logits[v];
+        const float x = diffusion_apply_logit_softcap(raw, logit_softcap) * inv_temp;
+        if (x > local_max) {
+            local_max = x;
+            local_idx = v;
+        }
+        if (diffusion_should_swap_desc(vals[LOCAL_K - 1], ids[LOCAL_K - 1], raw, v)) {
+            int pos = LOCAL_K - 1;
+#pragma unroll
+            for (int i = LOCAL_K - 1; i > 0; --i) {
+                if (pos == i && diffusion_should_swap_desc(vals[i - 1], ids[i - 1], raw, v)) {
+                    vals[i] = vals[i - 1];
+                    ids[i] = ids[i - 1];
+                    --pos;
+                }
+            }
+            vals[pos] = raw;
+            ids[pos] = v;
+        }
+    }
+
+    extern __shared__ unsigned char smem[];
+    float * s_vals = (float *) smem;
+    int * s_ids = (int *) (s_vals + candidate_count);
+    int * s_max_ids = s_ids + candidate_count;
+    float * s_sum = (float *) (s_max_ids + block_size);
+    float * s_t = s_sum + block_size;
+    float * s_cdf = s_t + block_size;
+    __shared__ int s_sampled;
+
+#pragma unroll
+    for (int i = 0; i < LOCAL_K; ++i) {
+        const int dst = tid * LOCAL_K + i;
+        s_vals[dst] = vals[i];
+        s_ids[dst] = ids[i];
+    }
+    s_sum[tid] = local_max;
+    s_max_ids[tid] = local_idx;
+    __syncthreads();
+
+    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
+        if (tid < stride && s_sum[tid + stride] > s_sum[tid]) {
+            s_sum[tid] = s_sum[tid + stride];
+            s_max_ids[tid] = s_max_ids[tid + stride];
+        }
+        __syncthreads();
+    }
+
+    const float max_l = s_sum[0];
+    const int amax = s_max_ids[0];
+
+    for (int k = 2; k <= candidate_count; k <<= 1) {
+        for (int j = k >> 1; j > 0; j >>= 1) {
+            for (int i = tid; i < candidate_count; i += blockDim.x) {
+                const int ixj = i ^ j;
+                if (ixj > i) {
+                    const bool descending = (i & k) == 0;
+                    const bool swap = descending
+                        ? diffusion_should_swap_desc(s_vals[i], s_ids[i], s_vals[ixj], s_ids[ixj])
+                        : diffusion_should_swap_desc(s_vals[ixj], s_ids[ixj], s_vals[i], s_ids[i]);
+                    if (swap) {
+                        const float tv = s_vals[i];
+                        s_vals[i] = s_vals[ixj];
+                        s_vals[ixj] = tv;
+                        const int ti = s_ids[i];
+                        s_ids[i] = s_ids[ixj];
+                        s_ids[ixj] = ti;
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    float local_sum = 0.0f;
+    float local_t = 0.0f;
+    for (int v = tid; v < n_vocab; v += blockDim.x) {
+        const float d = diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp - max_l;
+        const float e = expf(d);
+        local_sum += e;
+        local_t += d * e;
+    }
+
+    s_sum[tid] = local_sum;
+    s_t[tid] = local_t;
+    s_cdf[tid] = local_sum;
+    __syncthreads();
+
+    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
+        if (tid < stride) {
+            s_sum[tid] += s_sum[tid + stride];
+            s_t[tid] += s_t[tid + stride];
+        }
+        __syncthreads();
+    }
+
+    const float z = s_sum[0];
+    const float t = s_t[0];
+
+    if (tid == 0) {
+        argmax[row] = amax;
+        entropy[row] = logf(z) - t / z;
+        s_sampled = amax;
+    }
+    __syncthreads();
+
+    for (int stride = 1; stride < blockDim.x; stride <<= 1) {
+        const float add = tid >= stride ? s_cdf[tid - stride] : 0.0f;
+        __syncthreads();
+        s_cdf[tid] += add;
+        __syncthreads();
+    }
+
+    const float r = diffusion_rng_uniform(seed, step, row) * z;
+    const float chunk_begin = tid == 0 ? 0.0f : s_cdf[tid - 1];
+    const float chunk_end   = s_cdf[tid];
+    if (r > chunk_begin && r <= chunk_end) {
+        float thread_cum = chunk_begin;
+        for (int v = tid; v < n_vocab; v += blockDim.x) {
+            thread_cum += expf(diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp - max_l);
+            if (thread_cum >= r) {
+                s_sampled = v;
+                break;
+            }
+        }
+    }
+    __syncthreads();
+
+    if (tid == 0) {
+        sampled[row] = s_sampled;
+        for (int i = 0; i < sc_k; ++i) {
+            const int out = row * sc_k + i;
+            sc_ids[out] = s_ids[i];
+            sc_probs[out] = expf(diffusion_apply_logit_softcap(s_vals[i], logit_softcap) * inv_temp - max_l) / z;
+        }
+    }
+}
+
+static void diffusion_sample_full_softmax_fused_local(
+        const float * logits,
+        const int n_vocab,
+        const int n_tokens,
+        const int sc_k,
+        const int top_k_local_k,
+        const float inv_temp,
+        const float logit_softcap,
+        const uint32_t seed,
+        const uint32_t step,
+        int * sampled,
+        int * argmax,
+        float * entropy,
+        int * sc_ids,
+        float * sc_probs,
+        cudaStream_t stream) {
+    constexpr int block_size = 256;
+    const int local_k = diffusion_topk_local_k(sc_k, top_k_local_k);
+
+    switch (local_k) {
+        case 1: {
+            constexpr int local_k_t = 1;
+            const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) +
+                                3 * block_size * sizeof(float) + block_size * sizeof(int);
+            diffusion_sample_full_softmax_fused_local_kernel<local_k_t><<<n_tokens, block_size, smem, stream>>>(
+                    logits, n_vocab, n_tokens, sc_k, inv_temp, logit_softcap, seed, step,
+                    sampled, argmax, entropy, sc_ids, sc_probs);
+        } break;
+        case 2: {
+            constexpr int local_k_t = 2;
+            const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) +
+                                3 * block_size * sizeof(float) + block_size * sizeof(int);
+            diffusion_sample_full_softmax_fused_local_kernel<local_k_t><<<n_tokens, block_size, smem, stream>>>(
+                    logits, n_vocab, n_tokens, sc_k, inv_temp, logit_softcap, seed, step,
+                    sampled, argmax, entropy, sc_ids, sc_probs);
+        } break;
+        case 4: {
+            constexpr int local_k_t = 4;
+            const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) +
+                                3 * block_size * sizeof(float) + block_size * sizeof(int);
+            diffusion_sample_full_softmax_fused_local_kernel<local_k_t><<<n_tokens, block_size, smem, stream>>>(
+                    logits, n_vocab, n_tokens, sc_k, inv_temp, logit_softcap, seed, step,
+                    sampled, argmax, entropy, sc_ids, sc_probs);
+        } break;
+        case 8: {
+            constexpr int local_k_t = 8;
+            const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) +
+                                3 * block_size * sizeof(float) + block_size * sizeof(int);
+            diffusion_sample_full_softmax_fused_local_kernel<local_k_t><<<n_tokens, block_size, smem, stream>>>(
+                    logits, n_vocab, n_tokens, sc_k, inv_temp, logit_softcap, seed, step,
+                    sampled, argmax, entropy, sc_ids, sc_probs);
+        } break;
+        default: {
+            constexpr int local_k_t = 16;
+            const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) +
+                                3 * block_size * sizeof(float) + block_size * sizeof(int);
+            diffusion_sample_full_softmax_fused_local_kernel<local_k_t><<<n_tokens, block_size, smem, stream>>>(
+                    logits, n_vocab, n_tokens, sc_k, inv_temp, logit_softcap, seed, step,
+                    sampled, argmax, entropy, sc_ids, sc_probs);
+        } break;
+    }
+}
+
 static __device__ __forceinline__ bool diffusion_pair_gt(float a_val, int a_id, float b_val, int b_id) {
     return a_val > b_val || (a_val == b_val && a_id > b_id);
 }
@@ -803,6 +1014,7 @@ static __global__ void diffusion_sample_kernel(
         const uint32_t seed,
         const uint32_t step,
         const bool tail_correction,
+        const bool parallel_full_softmax_sample,
         int * __restrict__ sampled,
         int * __restrict__ argmax,
         float * __restrict__ entropy,
@@ -816,7 +1028,9 @@ static __global__ void diffusion_sample_kernel(
 
     __shared__ float s_val[1024];
     __shared__ float s_sum[1024];
+    __shared__ float s_cdf[1024];
     __shared__ int   s_idx[1024];
+    __shared__ int   s_sampled;
 
     const float * row_logits = logits + (size_t) row * n_vocab;
     const int * row_top = top_ids + (size_t) row * heap_k;
@@ -880,6 +1094,7 @@ static __global__ void diffusion_sample_kernel(
 
     s_sum[tid] = local_sum;
     s_val[tid] = local_t;
+    s_cdf[tid] = local_sum;
     __syncthreads();
 
     for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
@@ -910,7 +1125,7 @@ static __global__ void diffusion_sample_kernel(
         float cum = 0.0f;
         int tok = amax;
 
-        if (top_k == 0) {
+        if (top_k == 0 && !parallel_full_softmax_sample) {
             for (int v = 0; v < n_vocab; ++v) {
                 cum += expf(diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp - max_l);
                 if (cum >= r) {
@@ -943,6 +1158,40 @@ static __global__ void diffusion_sample_kernel(
             }
         }
     }
+
+    if (top_k == 0 && parallel_full_softmax_sample) {
+        for (int stride = 1; stride < blockDim.x; stride <<= 1) {
+            const float add = tid >= stride ? s_cdf[tid - stride] : 0.0f;
+            __syncthreads();
+            s_cdf[tid] += add;
+            __syncthreads();
+        }
+
+        const float z = s_sum[0];
+        const float r = diffusion_rng_uniform(seed, step, row) * z;
+        if (tid == 0) {
+            s_sampled = amax;
+        }
+        __syncthreads();
+
+        const float chunk_begin = tid == 0 ? 0.0f : s_cdf[tid - 1];
+        const float chunk_end   = s_cdf[tid];
+        if (r > chunk_begin && r <= chunk_end) {
+            float thread_cum = chunk_begin;
+            for (int v = tid; v < n_vocab; v += blockDim.x) {
+                thread_cum += expf(diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp - max_l);
+                if (thread_cum >= r) {
+                    s_sampled = v;
+                    break;
+                }
+            }
+        }
+        __syncthreads();
+
+        if (tid == 0) {
+            sampled[row] = s_sampled;
+        }
+    }
 }
 
 bool ggml_cuda_diffusion_sample_topk(
@@ -1042,7 +1291,7 @@ bool ggml_cuda_diffusion_sample_topk(
         }
     }
 
-    const int heap_k = top_k == 0 || !diffusion_tight_topk_enabled() ? std::max(top_k, sc_k) : top_k;
+    const int heap_k = top_k == 0 || !params->tight_top_k ? std::max(top_k, sc_k) : top_k;
     if (heap_k <= 0 || heap_k > 1024 || heap_k > n_vocab) {
         return false;
     }
@@ -1060,25 +1309,33 @@ bool ggml_cuda_diffusion_sample_topk(
 
     diffusion_sample_scratch * scratch = diffusion_get_scratch(stream, n_tokens, heap_k, sc_k);
     int * top_ids = scratch->top_ids;
-    const bool direct_self_cond_tensor = have_self_cond_tensor && !have_self_cond_host && diffusion_direct_selfcond_enabled();
+    const bool direct_self_cond_tensor = have_self_cond_tensor && !have_self_cond_host && params->direct_self_cond;
     int * sc_ids_out = direct_self_cond_tensor ? (int *) result->self_cond_ids_tensor->data : scratch->sc_ids;
     float * sc_probs_out = direct_self_cond_tensor ? (float *) result->self_cond_probs_tensor->data : scratch->sc_probs;
 
     constexpr int block_size = 256;
     bool sync_required = false;
-    const bool use_fused_topk_sample = diffusion_fused_topk_sample_enabled() &&
+    const bool use_fused_topk_sample = params->fused_top_k_sample &&
         top_k > 0 && top_k <= block_size && !params->top_k_tail_correction && n_vocab >= top_k;
+    const bool use_fused_full_softmax_sample =
+        top_k == 0 && !params->top_k_tail_correction && params->fused_full_softmax;
+    const bool use_parallel_full_softmax_sample =
+        top_k == 0 && !use_fused_full_softmax_sample && params->parallel_full_softmax;
 
     if (use_fused_topk_sample) {
-        diffusion_sample_topk_fused_local(logits_d, n_vocab, n_tokens, top_k, sc_k, inv_temp, logit_softcap,
+        diffusion_sample_topk_fused_local(logits_d, n_vocab, n_tokens, top_k, sc_k, params->top_k_local_k, inv_temp, logit_softcap,
+                params->seed, params->step, scratch->sampled, scratch->argmax, scratch->entropy,
+                sc_ids_out, sc_probs_out, stream);
+    } else if (use_fused_full_softmax_sample) {
+        diffusion_sample_full_softmax_fused_local(logits_d, n_vocab, n_tokens, sc_k, params->top_k_local_k, inv_temp, logit_softcap,
                 params->seed, params->step, scratch->sampled, scratch->argmax, scratch->entropy,
                 sc_ids_out, sc_probs_out, stream);
     } else {
         bool top_ids_sorted = false;
-        const bool use_fast_topk = diffusion_fast_topk_enabled() && heap_k <= 1024 && n_vocab >= heap_k;
+        const bool use_fast_topk = params->fast_top_k && heap_k <= 1024 && n_vocab >= heap_k;
         sync_required = !use_fast_topk;
         if (use_fast_topk) {
-            diffusion_select_topk_local(logits_d, top_ids, n_vocab, n_tokens, heap_k, stream);
+            diffusion_select_topk_local(logits_d, top_ids, n_vocab, n_tokens, heap_k, params->top_k_local_k, stream);
             top_ids_sorted = true;
         } else {
 #ifdef CUB_DIFFUSION_TOP_K_AVAILABLE
@@ -1116,6 +1373,7 @@ bool ggml_cuda_diffusion_sample_topk(
         diffusion_sample_kernel<<<n_tokens, block_size, 0, stream>>>(
                 logits_d, top_ids, n_vocab, n_tokens, top_k, heap_k, sc_k, inv_temp,
                 logit_softcap, params->seed, params->step, params->top_k_tail_correction,
+                use_parallel_full_softmax_sample,
                 scratch->sampled, scratch->argmax, scratch->entropy,
                 sc_ids_out, sc_probs_out);
     }
@@ -1148,7 +1406,7 @@ bool ggml_cuda_diffusion_sample_topk(
     }
     const bool final_tokens_after_stop =
         result->final_tokens && result->stop && result->check_stop_on_device &&
-        diffusion_final_tokens_on_stop_enabled();
+        params->final_tokens_on_stop;
 
     if (result->final_tokens && !final_tokens_after_stop) {
         CUDA_CHECK(cudaMemcpyAsync(result->final_tokens, scratch->argmax, (size_t) n_tokens * sizeof(int),
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 17e2c10f2b9c..1b3ba4216227 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3301,6 +3301,8 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
         const char * env = getenv("GGML_CUDA_GRAPH_DIAG");
         return env != nullptr && atoi(env) != 0;
     }();
+    const bool is_diffusion_decoder_graph = ggml_graph_has_flag(cgraph, GGML_CGRAPH_FLAG_DIFFUSION_DECODER);
+
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -3328,18 +3330,18 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
             const ggml_tensor * src1 = node->src[1];
             const int mmvq_mmid_max = get_mmvq_mmid_max_batch(src0->type, cc);
             const bool use_mmvq = ggml_is_quantized(src0->type) && node->ne[2] <= mmvq_mmid_max;
-            const bool use_mmq  = ggml_is_quantized(src0->type) && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[2], /*n_experts=*/src0->ne[2]);
-            const bool use_mmf  = ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true);
+            const bool use_mmq  = is_diffusion_decoder_graph && ggml_is_quantized(src0->type) && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[2], /*n_experts=*/src0->ne[2]);
+            const bool use_mmf  = is_diffusion_decoder_graph && ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true);
             if (!use_mmvq && !use_mmq && !use_mmf) {
                 // under these conditions, the mul_mat_id operation will need to synchronize the stream, so we cannot use CUDA graphs
                 // TODO: figure out a way to enable for larger batch sizes, without hurting performance
                 // ref: https://github.com/ggml-org/llama.cpp/pull/18958
                 use_cuda_graph = false;
                 if (log_graph_incompat) {
-                    GGML_LOG_INFO("%s: disabling CUDA graphs due to MUL_MAT_ID at node %d %s: src0=%s ne2=%lld src1_ne2=%lld n_experts=%lld mmvq_mmid_max=%d quantized=%d mmq=%d mmf=%d\n",
+                    GGML_LOG_INFO("%s: disabling CUDA graphs due to MUL_MAT_ID at node %d %s: src0=%s ne2=%lld src1_ne2=%lld n_experts=%lld mmvq_mmid_max=%d quantized=%d diffusion_decoder=%d mmq=%d mmf=%d\n",
                             __func__, i, node->name, ggml_type_name(src0->type),
                             (long long) node->ne[2], (long long) src1->ne[2], (long long) src0->ne[2],
-                            mmvq_mmid_max, (int) ggml_is_quantized(src0->type), (int) use_mmq, (int) use_mmf);
+                            mmvq_mmid_max, (int) ggml_is_quantized(src0->type), (int) is_diffusion_decoder_graph, (int) use_mmq, (int) use_mmf);
                 }
 #ifndef NDEBUG
                 GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 62b76abbcec9..81d083be30da 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -326,6 +326,10 @@ enum ggml_cgraph_eval_order {
     GGML_CGRAPH_EVAL_ORDER_COUNT
 };
 
+enum ggml_cgraph_flag {
+    GGML_CGRAPH_FLAG_DIFFUSION_DECODER = 1u << 0,
+};
+
 struct ggml_cgraph {
     int size;    // maximum number of nodes/leafs/grads/grad_accs
     int n_nodes; // number of nodes currently in use
@@ -341,11 +345,25 @@ struct ggml_cgraph {
 
     enum ggml_cgraph_eval_order order;
 
+    uint32_t flags;
+
     // an optional identifier that can be utilized to recognize same graphs if two non-zero values match
     // a value of 0 means it is not set and should be ignored
     uint64_t uid;
 };
 
+static inline void ggml_graph_set_flag(struct ggml_cgraph * cgraph, enum ggml_cgraph_flag flag, bool enabled) {
+    if (enabled) {
+        cgraph->flags |= (uint32_t) flag;
+    } else {
+        cgraph->flags &= ~((uint32_t) flag);
+    }
+}
+
+static inline bool ggml_graph_has_flag(const struct ggml_cgraph * cgraph, enum ggml_cgraph_flag flag) {
+    return (cgraph->flags & (uint32_t) flag) != 0;
+}
+
 // returns a slice of cgraph with nodes [i0, i1)
 // the slice does not have leafs or gradients
 // if you need the gradients, get them from the original graph
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 18a5ebd2ab0c..9a841c200aa9 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -7162,6 +7162,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
         /*.use_counts   =*/ use_counts_ptr,
         /*.hash_table   =*/ { hash_size, hash_used, hash_keys_ptr },
         /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
+        /*.flags        =*/ 0,
         /*.uid          =*/ 0,
     };
 
@@ -7190,6 +7191,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
         /*.use_counts       =*/ cgraph0->use_counts,
         /*.visited_hash_set =*/ cgraph0->visited_hash_set,
         /*.order            =*/ cgraph0->order,
+        /*.flags            =*/ cgraph0->flags,
         /*.uid              =*/ 0
     };
 
@@ -7204,6 +7206,7 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
     dst->n_leafs = src->n_leafs;
     dst->n_nodes = src->n_nodes;
     dst->order   = src->order;
+    dst->flags   = src->flags;
 
     for (int i = 0; i < src->n_leafs; ++i) {
         dst->leafs[i] = src->leafs[i];
diff --git a/include/llama.h b/include/llama.h
index 089dd2747a8b..1fef35a215cc 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -340,6 +340,8 @@ extern "C" {
         uint32_t n_seq_max;         // max number of sequences (i.e. distinct states for recurrent models)
         uint32_t n_rs_seq;          // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL]
         uint32_t n_outputs_max;     // max outputs in a ubatch (0 = n_batch)
+        int32_t  diffusion_self_cond_top_k; // sparse self-conditioning width for diffusion models, 0 = model default [EXPERIMENTAL]
+        uint32_t diffusion_input_gpu_groups; // bitmask of diffusion decoder inputs assigned to GPU backend [EXPERIMENTAL]
         int32_t  n_threads;         // number of threads to use for generation
         int32_t  n_threads_batch;   // number of threads to use for batch processing
 
@@ -382,6 +384,9 @@ extern "C" {
         bool kv_unified;  // use a unified buffer across the input sequences when computing the attention
                           // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
                           // ref: https://github.com/ggml-org/llama.cpp/pull/14363
+        bool diffusion_fused_self_cond_embd;     // use fused diffusion self-conditioning embedding input [EXPERIMENTAL]
+        bool diffusion_fuse_final_logit_softcap; // move diffusion final softcap into sampling [EXPERIMENTAL]
+        bool diffusion_separate_encoder_decoder; // build separate diffusion encoder/decoder graph variants [EXPERIMENTAL]
 
         // [EXPERIMENTAL]
         // backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
@@ -1023,6 +1028,14 @@ extern "C" {
         uint32_t seed;
         uint32_t step;
         bool     top_k_tail_correction;
+        bool     cuda_fast_top_k;
+        bool     cuda_direct_self_cond;
+        bool     cuda_final_tokens_on_stop;
+        bool     cuda_fused_top_k_sample;
+        bool     cuda_tight_top_k;
+        bool     cuda_parallel_full_softmax;
+        bool     cuda_fused_full_softmax;
+        int32_t  cuda_top_k_local_k;
     };
 
     struct llama_diffusion_sample_result {
diff --git a/scripts/collect_diffusion_ncu_admin.ps1 b/scripts/collect_diffusion_ncu_admin.ps1
new file mode 100644
index 000000000000..f42a6b9e1864
--- /dev/null
+++ b/scripts/collect_diffusion_ncu_admin.ps1
@@ -0,0 +1,186 @@
+param(
+    [string] $Model = "D:\Day0-1\northbloom\diffusion-gemma-26b-v10-q4_k_m.gguf",
+    [string] $Exe = "D:\Day0-1\northbloom\llama.cpp-codex-sampling\pr-changes\llama.cpp\build-review\bin\Release\llama-diffusion-gemma-cli.exe",
+    [string] $OutDir = "D:\Day0-1\northbloom\llama.cpp-codex-sampling\pr-changes\llama.cpp\profiles\diffusion-kernel-efficiency\ncu-admin",
+    [string] $Prompt = "Write one short sentence about CUDA graphs.",
+    [int] $TopK = 64
+)
+
+$ErrorActionPreference = "Stop"
+
+function Resolve-Ncu {
+    $cmd = Get-Command ncu.exe -ErrorAction SilentlyContinue
+    if ($cmd) {
+        return $cmd.Source
+    }
+
+    $candidates = @(
+        "C:\Program Files\NVIDIA Corporation\Nsight Compute 2026.2.0\target\windows-desktop-win7-x64\ncu.exe",
+        "C:\Program Files\NVIDIA Corporation\Nsight Compute 2026.2.0\host\target-windows-x64\ncu.exe",
+        "C:\Program Files\NVIDIA Corporation\Nsight Compute 2026.1.0\target\windows-desktop-win7-x64\ncu.exe",
+        "C:\Program Files\NVIDIA Corporation\Nsight Compute 2026.1.0\host\target-windows-x64\ncu.exe",
+        "C:\Program Files\NVIDIA Corporation\Nsight Compute 2025.3.1\target\windows-desktop-win7-x64\ncu.exe",
+        "C:\Program Files\NVIDIA Corporation\Nsight Compute 2025.3.1\host\target-windows-x64\ncu.exe"
+    )
+    foreach ($candidate in $candidates) {
+        if (Test-Path $candidate) {
+            return $candidate
+        }
+    }
+
+    $found = Get-ChildItem "C:\Program Files\NVIDIA Corporation" -Recurse -Filter ncu.exe -ErrorAction SilentlyContinue |
+        Select-Object -First 1
+    if ($found) {
+        return $found.FullName
+    }
+
+    throw "Could not find ncu.exe. Add Nsight Compute to PATH or edit Resolve-Ncu in this script."
+}
+
+if (-not (Test-Path $Exe)) {
+    throw "Executable not found: $Exe"
+}
+if (-not (Test-Path $Model)) {
+    throw "Model not found: $Model"
+}
+
+New-Item -ItemType Directory -Force -Path $OutDir | Out-Null
+
+$ncu = Resolve-Ncu
+Write-Host "Using NCU: $ncu"
+Write-Host "Output : $OutDir"
+
+$runArgs = @(
+    $Exe,
+    "-m", $Model,
+    "-p", $Prompt,
+    "-n", "256",
+    "--diffusion-steps", "48",
+    "--top-k", "$TopK",
+    "-ngl", "all",
+    "-sm", "none",
+    "-mg", "0",
+    "-fa", "on",
+    "--ctx-size", "4096",
+    "--seed", "42",
+    "--no-mmproj",
+    "--diffusion-cuda-direct-self-cond",
+    "--diffusion-cuda-final-tokens-on-stop",
+    "--diffusion-cuda-tight-top-k",
+    "--diffusion-cuda-top-k-local-k", "4"
+)
+
+function Set-EnvMap($map) {
+    foreach ($kv in $map.GetEnumerator()) {
+        [Environment]::SetEnvironmentVariable($kv.Key, $kv.Value, "Process")
+    }
+}
+
+function Clear-ExperimentEnv {
+    $names = @(
+        "GGML_CUDA_MMQ_STREAM_K",
+        "GGML_CUDA_MMQ_STREAM_K_DIVISOR",
+        "GGML_CUDA_MMQ_STREAM_K_DIVISOR_MIN_PCT",
+        "GGML_CUDA_MMQ_AVOID_FIXUP",
+        "GGML_CUDA_MMQ_AVOID_FIXUP_MIN_EFF",
+        "GGML_CUDA_DISABLE_GRAPHS"
+    )
+    foreach ($name in $names) {
+        [Environment]::SetEnvironmentVariable($name, $null, "Process")
+    }
+}
+
+function Invoke-NcuProfile {
+    param(
+        [string] $Name,
+        [string] $KernelRegex,
+        [string] $MetricSet = "basic",
+        [int] $LaunchSkip = 0,
+        [int] $LaunchCount = 4,
+        [string[]] $ExtraArgs = @(),
+        [hashtable] $ExtraEnv = @{}
+    )
+
+    Clear-ExperimentEnv
+    Set-EnvMap $ExtraEnv
+
+    $export = Join-Path $OutDir $Name
+    $args = @(
+        "--force-overwrite",
+        "--target-processes", "all",
+        "--graph-profiling", "node",
+        "--replay-mode", "application",
+        "--set", $MetricSet,
+        "--kernel-name", "regex:$KernelRegex",
+        "--launch-skip", "$LaunchSkip",
+        "--launch-count", "$LaunchCount",
+        "--export", $export,
+        "--"
+    ) + $runArgs + $ExtraArgs
+
+    Write-Host ""
+    Write-Host "== $Name =="
+    Write-Host "kernel regex: $KernelRegex"
+    Write-Host "set=$MetricSet skip=$LaunchSkip count=$LaunchCount"
+    & $ncu @args
+    if ($LASTEXITCODE -ne 0) {
+        throw "NCU failed for $Name with exit code $LASTEXITCODE"
+    }
+}
+
+Invoke-NcuProfile `
+    -Name "01_topk_fused_local4_detailed" `
+    -KernelRegex "diffusion_sample_topk_fused_local_kernel" `
+    -MetricSet "detailed" `
+    -LaunchSkip 0 `
+    -LaunchCount 4 `
+    -ExtraArgs @("--diffusion-cuda-fused-top-k-sample")
+
+Invoke-NcuProfile `
+    -Name "02_topk_baseline_select_detailed" `
+    -KernelRegex "diffusion_select_topk_local_kernel" `
+    -MetricSet "detailed" `
+    -LaunchSkip 0 `
+    -LaunchCount 4 `
+    -ExtraArgs @("--diffusion-cuda-top-k-local-k", "8")
+
+Invoke-NcuProfile `
+    -Name "03_mmq_denoise_basic" `
+    -KernelRegex "mul_mat_q|mul_mat_q_stream_k_fixup|quantize_mmq_q8_1" `
+    -MetricSet "basic" `
+    -LaunchSkip 900 `
+    -LaunchCount 16 `
+    -ExtraArgs @("--diffusion-cuda-fused-top-k-sample")
+
+Invoke-NcuProfile `
+    -Name "04_mmq_no_fixup_aggressive_basic" `
+    -KernelRegex "mul_mat_q|mul_mat_q_stream_k_fixup|quantize_mmq_q8_1" `
+    -MetricSet "basic" `
+    -LaunchSkip 900 `
+    -LaunchCount 16 `
+    -ExtraArgs @("--diffusion-cuda-fused-top-k-sample") `
+    -ExtraEnv @{
+        GGML_CUDA_MMQ_STREAM_K_DIVISOR = "1"
+        GGML_CUDA_MMQ_AVOID_FIXUP = "1"
+        GGML_CUDA_MMQ_AVOID_FIXUP_MIN_EFF = "0"
+    }
+
+Invoke-NcuProfile `
+    -Name "05_attention_basic" `
+    -KernelRegex "flash_attn_ext_f16|flash_attn_stream_k_fixup" `
+    -MetricSet "basic" `
+    -LaunchSkip 80 `
+    -LaunchCount 16 `
+    -ExtraArgs @("--diffusion-cuda-fused-top-k-sample")
+
+Invoke-NcuProfile `
+    -Name "06_misc_hotspots_basic" `
+    -KernelRegex "cpy_scalar_transpose|reduce_rows_f32|k_get_rows_float|softcap_f32" `
+    -MetricSet "basic" `
+    -LaunchSkip 0 `
+    -LaunchCount 16 `
+    -ExtraArgs @("--diffusion-cuda-fused-top-k-sample")
+
+Write-Host ""
+Write-Host "NCU collection complete. Reports are in:"
+Write-Host $OutDir
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 575b647f733d..45f966ca1935 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -197,6 +197,12 @@ llama_context::llama_context(
 
     cparams.n_outputs_max = params.n_outputs_max == 0 ? cparams.n_batch : params.n_outputs_max;
 
+    diffusion_cond.self_cond_top_k = params.diffusion_self_cond_top_k > 0 ? params.diffusion_self_cond_top_k : 256;
+    diffusion_cond.input_gpu_groups = params.diffusion_input_gpu_groups;
+    diffusion_cond.fused_self_cond_embd = params.diffusion_fused_self_cond_embd;
+    diffusion_cond.fuse_final_logit_softcap = params.diffusion_fuse_final_logit_softcap;
+    diffusion_cond.separate_encoder_decoder = params.diffusion_separate_encoder_decoder;
+
     cparams.op_offload = params.op_offload;
     cparams.kv_unified = params.kv_unified;
 
@@ -1204,16 +1210,6 @@ static ggml_backend_cuda_diffusion_sample_topk_t get_cuda_diffusion_sample_topk_
         ggml_backend_reg_get_proc_address(reg, "ggml_backend_cuda_diffusion_sample_topk");
 }
 
-static bool diffusion_cuda_fuse_final_logit_softcap_enabled() {
-    const char * v = std::getenv("GGML_CUDA_DIFFUSION_FUSE_FINAL_SOFTCAP");
-    return v && std::strcmp(v, "0") != 0;
-}
-
-static bool diffusion_cuda_fused_selfcond_embd_enabled() {
-    const char * v = std::getenv("GGML_CUDA_DIFFUSION_FUSED_SELFCOND_EMBD");
-    return v && std::strcmp(v, "0") != 0;
-}
-
 static bool diffusion_decoder_inputs_device_ready(
         const llama_diffusion_cond & diffusion_cond,
         const llm_graph_result * res) {
@@ -1292,7 +1288,7 @@ bool llama_context::diffusion_sample_topk(
     const bool device_self_cond = result->self_cond_ids == nullptr && result->self_cond_probs == nullptr;
     if (device_self_cond) {
         auto * inp_embd = gf_res_prev->get_inp_diffusion_self_cond_embd();
-        if (inp_embd && diffusion_cuda_fused_selfcond_embd_enabled()) {
+        if (inp_embd && diffusion_cond.fused_self_cond_embd) {
             t_self_cond_embd = inp_embd->embd;
             t_token_embd = gf_res_prev->get_diffusion_token_embd();
             if (!t_self_cond_embd || !t_token_embd) {
@@ -1317,7 +1313,7 @@ bool llama_context::diffusion_sample_topk(
         }
     }
 
-    const float fused_logit_softcap = diffusion_cuda_fuse_final_logit_softcap_enabled()
+    const float fused_logit_softcap = diffusion_cond.fuse_final_logit_softcap
         ? model.hparams.f_final_logit_softcapping
         : 0.0f;
 
@@ -1331,6 +1327,14 @@ bool llama_context::diffusion_sample_topk(
         /* .step                  = */ params->step,
         /* .top_k_tail_correction = */ params->top_k_tail_correction,
         /* .logit_softcap         = */ fused_logit_softcap,
+        /* .fast_top_k            = */ params->cuda_fast_top_k,
+        /* .direct_self_cond      = */ params->cuda_direct_self_cond,
+        /* .final_tokens_on_stop  = */ params->cuda_final_tokens_on_stop,
+        /* .fused_top_k_sample    = */ params->cuda_fused_top_k_sample,
+        /* .tight_top_k           = */ params->cuda_tight_top_k,
+        /* .parallel_full_softmax = */ params->cuda_parallel_full_softmax,
+        /* .fused_full_softmax    = */ params->cuda_fused_full_softmax,
+        /* .top_k_local_k         = */ params->cuda_top_k_local_k,
     };
 
     ggml_cuda_diffusion_sample_result cuda_result = {
@@ -3628,6 +3632,8 @@ llama_context_params llama_context_default_params() {
         /*.n_seq_max                   =*/ 1,
         /*.n_rs_seq                    =*/ 0,
         /*.n_outputs_max               =*/ 0,
+        /*.diffusion_self_cond_top_k   =*/ 256,
+        /*.diffusion_input_gpu_groups  =*/ 63,
         /*.n_threads                   =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
         /*.n_threads_batch             =*/ GGML_DEFAULT_N_THREADS,
         /*.ctx_type                    =*/ LLAMA_CONTEXT_TYPE_DEFAULT,
@@ -3655,6 +3661,9 @@ llama_context_params llama_context_default_params() {
         /*.op_offload                  =*/ true,
         /*.swa_full                    =*/ true,
         /*.kv_unified                  =*/ false,
+        /*.diffusion_fused_self_cond_embd =*/ false,
+        /*.diffusion_fuse_final_logit_softcap =*/ false,
+        /*.diffusion_separate_encoder_decoder =*/ false,
         /*.sampler                     =*/ nullptr,
         /*.n_sampler                   =*/ 0,
         /*.ctx_other                   =*/ nullptr,
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 7b21baf75d6c..5883c5a9ea9b 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -12,6 +12,8 @@
 #include "llama-memory-hybrid-iswa.h"
 #include "llama-memory-recurrent.h"
 
+#include "../ggml/src/ggml-impl.h"
+
 #include <cassert>
 #include <cmath>
 #include <cstdlib>
@@ -1221,6 +1223,8 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     ctx0             (res->get_ctx()),
     gf               (res->get_gf()) {
         res->set_params(params);
+        ggml_graph_set_flag(gf, GGML_CGRAPH_FLAG_DIFFUSION_DECODER,
+                llm_arch_is_diffusion(arch) && diffusion && diffusion->decoder_phase);
     }
 
 void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
@@ -1234,15 +1238,12 @@ void llm_graph_context::set_diffusion_input_backend(ggml_tensor * tensor, uint32
         return;
     }
 
-    static const uint32_t enabled_groups = [] {
-        const char * env = getenv("DG_GPU_INPUT_GROUPS");
-        // Keep the default to inputs used by the fixed diffusion decoder graph:
-        // canvas/self-cond, positions, attention scale, KV indices, masks and
-        // rotary helpers. Mark them as outputs too, matching ggml-backend's
-        // existing copy-tensor convention, so the allocator will not overwrite
-        // them between denoising replays.
-        return env ? (uint32_t) strtoul(env, nullptr, 0) : 63u; // 1|2|4|8|16|32
-    }();
+    // Keep the default to inputs used by the fixed diffusion decoder graph:
+    // canvas/self-cond, positions, attention scale, KV indices, masks and
+    // rotary helpers. Mark them as outputs too, matching ggml-backend's
+    // existing copy-tensor convention, so the allocator will not overwrite
+    // them between denoising replays.
+    const uint32_t enabled_groups = diffusion->input_gpu_groups;
     if ((enabled_groups & group) == 0) {
         return;
     }
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 446e8162cf3c..ce98d241836d 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -119,6 +119,12 @@ struct llama_diffusion_cond {
     // superset graph (the decoder adds the self-conditioning input + block on top of the
     // encoder graph). The caller sets the actual phase before every decode.
     bool decoder_phase = true;
+
+    int64_t self_cond_top_k = 256;
+    uint32_t input_gpu_groups = 63;
+    bool fused_self_cond_embd = false;
+    bool fuse_final_logit_softcap = false;
+    bool separate_encoder_decoder = false;
 };
 
 struct llm_graph_params;
@@ -356,7 +362,7 @@ class llm_graph_input_diffusion_self_cond_topk : public llm_graph_input_i {
 };
 
 // Dense diffusion self-conditioning embedding input. This is filled by the CUDA sampler from
-// top-k ids/probs when GGML_CUDA_DIFFUSION_FUSED_SELFCOND_EMBD is enabled.
+// top-k ids/probs when the fused diffusion self-conditioning embedding path is enabled.
 class llm_graph_input_diffusion_self_cond_embd : public llm_graph_input_i {
 public:
     llm_graph_input_diffusion_self_cond_embd(const llama_diffusion_cond * diffusion) : diffusion(diffusion) {}
diff --git a/src/models/diffusion-gemma.cpp b/src/models/diffusion-gemma.cpp
index b8b1091cbb26..e7a2d6e01727 100644
--- a/src/models/diffusion-gemma.cpp
+++ b/src/models/diffusion-gemma.cpp
@@ -4,40 +4,6 @@
 #include "ggml-alloc.h"
 
 #include <algorithm>
-#include <cstdlib>
-#include <cstring>
-
-static bool diffusion_gemma_fuse_final_logit_softcap_enabled() {
-    const char * v = std::getenv("GGML_CUDA_DIFFUSION_FUSE_FINAL_SOFTCAP");
-    return v && std::strcmp(v, "0") != 0;
-}
-
-static bool diffusion_gemma_env_enabled_default(const char * name, bool def) {
-    const char * v = std::getenv(name);
-    if (!v) {
-        return def;
-    }
-    return std::strcmp(v, "0") != 0;
-}
-
-static bool diffusion_gemma_fused_selfcond_embd_enabled() {
-    return diffusion_gemma_env_enabled_default("GGML_CUDA_DIFFUSION_FUSED_SELFCOND_EMBD", false) &&
-        diffusion_gemma_env_enabled_default("DG_DEVICE_SELFCOND", true);
-}
-
-static int64_t diffusion_gemma_self_cond_top_k() {
-    const int64_t def = llama_model_diffusion_gemma::N_SC_TOPK;
-    const char * v = std::getenv("GGML_CUDA_DIFFUSION_SC_TOPK");
-    if (!v) {
-        return def;
-    }
-
-    const int64_t k = std::atoll(v);
-    if (k <= 0) {
-        return def;
-    }
-    return std::min(k, def);
-}
 
 // diffusion_gemma reuses the gemma4 decoder block (tensor layout + per-layer math) but runs
 // as a bidirectional (non-causal) block-diffusion denoiser over a canvas, with KV-cache reuse:
@@ -46,7 +12,7 @@ static int64_t diffusion_gemma_self_cond_top_k() {
 // (self-conditioned, bidirectional), rolling back its own K/V afterwards.
 //
 // Two graph variants are provided (see build_arch_graph): a single phase-branching graph, and
-// a separate encoder/decoder pair (DG_SEPARATE_ENC_DEC). Both reuse the gemma4 transformer
+// a separate encoder/decoder pair (--diffusion-separate-encoder-decoder). Both reuse the gemma4 transformer
 // body and differ only in input-embedding handling (encoder: plain; decoder: self-conditioned).
 
 void llama_model_diffusion_gemma::load_arch_hparams(llama_model_loader & ml) {
@@ -159,7 +125,7 @@ void llama_model_diffusion_gemma::load_arch_post(llama_model_loader & ml) {
                            __func__, (long long) n_embd_t, (long long) n_vocab_t,
                            ggml_nbytes(tok_embd_gpu) / (1024.0 * 1024.0 * 1024.0),
                            ggml_backend_buffer_name(tok_embd_gpu->buffer),
-                           (long long) diffusion_gemma_self_cond_top_k());
+                           (long long) llama_model_diffusion_gemma::N_SC_TOPK);
             return;
         }
         LLAMA_LOG_WARN("%s: failed to allocate on-device gather embedding; falling back to dense matmul\n", __func__);
@@ -201,10 +167,10 @@ std::unique_ptr<llm_graph_context> llama_model_diffusion_gemma::build_arch_graph
     const bool is_decoder = params.diffusion && params.diffusion->decoder_phase;
 
     // Variant B ("separate encoder and decoder block", shared weights): opt-in via
-    // DG_SEPARATE_ENC_DEC. Two distinct graphs are built per phase. Functionally identical
+    // --diffusion-separate-encoder-decoder. Two distinct graphs are built per phase. Functionally identical
     // to Variant A here (the checkpoint shares encoder/decoder weights); the split mirrors
     // the HF two-stack structure and generalizes to a checkpoint with distinct weights.
-    if (getenv("DG_SEPARATE_ENC_DEC")) {
+    if (params.diffusion && params.diffusion->separate_encoder_decoder) {
         if (is_decoder) {
             return std::make_unique<graph_decoder>(*this, params);
         }
@@ -233,14 +199,14 @@ ggml_tensor * llama_model_diffusion_gemma::graph_base::build_input(bool is_decod
     if (is_decoder) {
         ggml_tensor * soft; // soft-embedding {n_embd, n_tokens}: blend of the previous step's
                             // predicted token embeddings, scaled by sqrt(n_embd)
-        if (dmodel.tok_embd_gpu && diffusion_gemma_fused_selfcond_embd_enabled()) {
+        if (dmodel.tok_embd_gpu && diffusion && diffusion->fused_self_cond_embd) {
             soft = build_inp_diffusion_self_cond_embd(n_embd);
         } else if (dmodel.tok_embd_gpu) {
             // Sparse gather path (Option-2): the previous step's top-k token ids+probs are fed per
             // position; gather just those k embedding rows and blend them, instead of the dense
             // full-vocab `probs @ token_embd` matmul. Gather width is fixed (N_SC_TOPK) so the
             // graph shape is constant; unused slots carry prob 0 (the CLI zero-pads).
-            const int64_t k = diffusion_gemma_self_cond_top_k();
+            const int64_t k = diffusion ? std::min<int64_t>(std::max<int64_t>(diffusion->self_cond_top_k, 1), llama_model_diffusion_gemma::N_SC_TOPK) : llama_model_diffusion_gemma::N_SC_TOPK;
             auto * inp = build_inp_diffusion_self_cond_topk(k);
             ggml_tensor * ids   = inp->ids;                                                // I32 {k*n_tokens}
             ggml_tensor * probs = inp->probs;                                              // F32 {k, n_tokens}
@@ -461,7 +427,7 @@ void llama_model_diffusion_gemma::graph_base::build_transformer(ggml_tensor * in
     cur = build_lora_mm(model.output, cur, model.output_s);
 
     const bool fuse_final_softcap =
-        diffusion && diffusion->decoder_phase && diffusion_gemma_fuse_final_logit_softcap_enabled();
+        diffusion && diffusion->decoder_phase && diffusion->fuse_final_logit_softcap;
 
     if (hparams.f_final_logit_softcapping && !fuse_final_softcap) {
         cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);