From 2033bc8ebbe67b59521499127288dab0260f5998 Mon Sep 17 00:00:00 2001 From: Loveneet Nigam Date: Mon, 8 Jun 2026 23:23:29 +0530 Subject: [PATCH 01/13] diffusion-gemma: chat-completions-compatible HTTP server (llama-server analogue) --- examples/diffusion-gemma/CMakeLists.txt | 10 + .../diffusion-gemma-server.cpp | 889 ++++++++++++++++++ 2 files changed, 899 insertions(+) create mode 100644 examples/diffusion-gemma/diffusion-gemma-server.cpp diff --git a/examples/diffusion-gemma/CMakeLists.txt b/examples/diffusion-gemma/CMakeLists.txt index 6ee3414d38df..b0f805717a7c 100644 --- a/examples/diffusion-gemma/CMakeLists.txt +++ b/examples/diffusion-gemma/CMakeLists.txt @@ -5,3 +5,13 @@ target_link_libraries(${TARGET} PRIVATE llama llama-common mtmd ${CMAKE_THREAD_L # mtmd (tools/) is added after examples/, so add its include dir explicitly for the headers target_include_directories(${TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../tools/mtmd) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +# OpenAI-compatible HTTP server for the block-diffusion models (llama-server analogue) +set(TARGET llama-diffusion-gemma-server) +add_executable(${TARGET} diffusion-gemma-server.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE llama llama-common mtmd cpp-httplib ${CMAKE_THREAD_LIBS_INIT}) +target_include_directories(${TARGET} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../../tools/mtmd + ${CMAKE_SOURCE_DIR}/vendor) # cpp-httplib/httplib.h, nlohmann/json.hpp +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/diffusion-gemma/diffusion-gemma-server.cpp b/examples/diffusion-gemma/diffusion-gemma-server.cpp new file mode 100644 index 000000000000..e6fd0a536a1b --- /dev/null +++ b/examples/diffusion-gemma/diffusion-gemma-server.cpp @@ -0,0 +1,889 @@ +// OpenAI-compatible HTTP server for the block-diffusion models (diffusion-gemma). +// +// This is the llama-server analogue for the diffusion family: it loads a block-diffusion model +// once and serves the same denoising generation loop as diffusion-gemma-cli over HTTP, exposing +// the OpenAI endpoints (/v1/chat/completions, /v1/completions, /v1/models) plus the llama-server +// observability surface (/health, /v1/health, /props, /metrics, /slots) with the same response +// `timings`/`usage` objects, the same per-request timing logs, the same access log, and a Prometheus +// /metrics endpoint -- all with the metric semantics re-mapped to block diffusion (canvas blocks, +// denoising steps, s/step) instead of autoregressive token-by-token decode. +// +// Generation is NOT autoregressive: each request denoises one or more 256-token canvases against a +// cached prompt prefix (see diffusion-gemma-cli.cpp for the full description). A single llama_context +// is reused across requests and is not thread-safe, so generation is serialized behind a mutex (one +// slot); the HTTP layer still accepts many connections concurrently. + +#include "arg.h" +#include "chat.h" +#include "common.h" +#include "llama.h" +#include "log.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using json = nlohmann::ordered_json; + +// llama-server-style log macros (server-common.h): "srv : ..." / "slot : id N | ..." +#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SLT_INF(id, fmt, ...) LOG_INF("slot %12.*s: id %2d | " fmt, 12, __func__, (id), __VA_ARGS__) + +// reference defaults from generation_config.json / DiffusionGemmaGenerationConfig (mirror the CLI) +static constexpr int DEF_CANVAS_LENGTH = 256; +static constexpr int DEF_MAX_DENOISE_STEPS = 48; +static constexpr float ENTROPY_BOUND = 0.1f; +static constexpr float TEMP_MIN = 0.4f; +static constexpr float TEMP_MAX = 0.8f; +static constexpr float CONFIDENCE_THRESHOLD = 0.005f; +static constexpr int STABILITY_THRESHOLD = 1; +static constexpr int SC_K = 256; // must match llama_model_diffusion_gemma::N_SC_TOPK + +// ------------------------------------------------------------------------------------------------ +// per-request generation parameters and result +// ------------------------------------------------------------------------------------------------ +struct diffusion_request { + int canvas_length = DEF_CANVAS_LENGTH; + int n_steps = DEF_MAX_DENOISE_STEPS; + int max_canvases = 1; // ceil(max_tokens / canvas_length) + int topk_fixed = 0; // 0 = full softmax + int topk_start = 0; + int topk_end = 0; + bool topk_tail = false; + bool ignore_eos = false; // run all max_canvases blocks (don't stop at end-of-text) + uint32_t seed = 1234; +}; + +struct diffusion_result { + std::string answer; // final response text (post channel-split / eog truncation) + std::string full; // full detokenized canvas (thought + response) + int prompt_tokens = 0; // prompt prefix tokens processed (encoder prefill) + int answer_tokens = 0; // tokens in the extracted answer + int n_blocks = 0; // canvas blocks run + int n_steps_total = 0; // denoising steps across all blocks + int canvas_tokens = 0; // n_blocks * canvas_length + int n_decode = 0; // total llama_decode() calls (prefill + denoise + commit) + double prefill_ms = 0.0; // encoder-phase prompt prefill wall time + double gen_ms = 0.0; // denoising block-loop wall time + bool ok = false; + std::string error; +}; + +// ------------------------------------------------------------------------------------------------ +// server metrics (llama-server analogue; protected by its own mutex) +// ------------------------------------------------------------------------------------------------ +struct server_metrics { + std::mutex mu; + int64_t t_start = 0; // process start (unix seconds) + + // cumulative counters + uint64_t n_requests_total = 0; + uint64_t n_prompt_tokens_total = 0; + double t_prompt_ms_total = 0.0; + uint64_t n_tokens_predicted_total = 0; // answer tokens + double t_predicted_ms_total = 0.0; // denoise time + uint64_t n_decode_total = 0; + uint64_t n_blocks_total = 0; + uint64_t n_steps_total = 0; + uint64_t n_canvas_tokens_total = 0; + + // last-request gauges + double last_prompt_tps = 0.0; + double last_predicted_tps = 0.0; + double last_steps_per_second = 0.0; + double last_canvas_tps = 0.0; + double last_ms_per_step = 0.0; + + std::atomic n_processing{0}; + + void add(const diffusion_result & r) { + std::lock_guard lk(mu); + n_requests_total += 1; + n_prompt_tokens_total += r.prompt_tokens; + t_prompt_ms_total += r.prefill_ms; + n_tokens_predicted_total += r.answer_tokens; + t_predicted_ms_total += r.gen_ms; + n_decode_total += r.n_decode; + n_blocks_total += r.n_blocks; + n_steps_total += r.n_steps_total; + n_canvas_tokens_total += r.canvas_tokens; + last_prompt_tps = r.prefill_ms > 0 ? r.prompt_tokens * 1e3 / r.prefill_ms : 0.0; + last_predicted_tps = r.gen_ms > 0 ? r.answer_tokens * 1e3 / r.gen_ms : 0.0; + last_steps_per_second= r.gen_ms > 0 ? r.n_steps_total * 1e3 / r.gen_ms : 0.0; + last_canvas_tps = r.gen_ms > 0 ? r.canvas_tokens * 1e3 / r.gen_ms : 0.0; + last_ms_per_step = r.n_steps_total > 0 ? r.gen_ms / r.n_steps_total : 0.0; + } +}; + +// ------------------------------------------------------------------------------------------------ +// server state (one model + one reused context == one slot, guarded by a mutex) +// ------------------------------------------------------------------------------------------------ +struct diffusion_server { + llama_model * model = nullptr; + llama_context * ctx = nullptr; + const llama_vocab * vocab = nullptr; + llama_memory_t mem = nullptr; + llama_batch batch{}; + common_chat_templates_ptr templates; + + int canvas_length = DEF_CANVAS_LENGTH; + int n_steps = DEF_MAX_DENOISE_STEPS; + int n_ctx = 0; + int n_ub = 0; // ubatch == canvas_length (keeps the decoder gather graph small) + int n_vocab = 0; + std::string model_id; + std::string model_path; + std::string build_info = "llama.cpp diffusion-gemma-server"; + + server_metrics metrics; + std::mutex gen_mutex; // serialize generation (ctx is single-threaded == one slot) + + std::string format_messages(const json & messages) const { + common_chat_templates_inputs inputs; + inputs.messages = common_chat_msgs_parse_oaicompat(messages); + inputs.add_generation_prompt = true; + return common_chat_templates_apply(templates.get(), inputs).prompt; + } + + // causal prefill of `toks` starting at position pos0, chunked to n_ub (encoder phase). + // increments *n_dec with the number of llama_decode() calls made. + bool prefill_causal(const std::vector & toks, int pos0, int * n_dec) { + llama_set_causal_attn(ctx, true); + llama_set_diffusion_decoder_phase(ctx, false); + llama_set_diffusion_self_cond_topk(ctx, nullptr, nullptr, 0, 0); + const int n = (int) toks.size(); + for (int off = 0; off < n; off += n_ub) { + const int cnt = std::min(n_ub, n - off); + batch.n_tokens = cnt; + for (int i = 0; i < cnt; ++i) { + batch.token[i] = toks[off + i]; + batch.pos[i] = pos0 + off + i; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = 0; + batch.logits[i] = (off + i == n - 1) ? 1 : 0; + } + if (llama_decode(ctx, batch) != 0) return false; + ++(*n_dec); + } + return true; + } + + // run the block-diffusion denoising loop for one request (mirrors diffusion-gemma-cli.cpp) + diffusion_result generate(const std::vector & prompt_tokens, const diffusion_request & rq) { + diffusion_result out; + const int prefix_len = (int) prompt_tokens.size(); + int n_decode = 0; + + int max_canvases = rq.max_canvases; + const int fit = (n_ctx - prefix_len) / canvas_length - 1; + if (fit < 1) { + out.error = "prompt too long for context (n_ctx=" + std::to_string(n_ctx) + ")"; + return out; + } + if (max_canvases > fit) max_canvases = fit; + if (max_canvases < 1) max_canvases = 1; + + llama_memory_clear(mem, true); + + std::mt19937 rng(rq.seed); + std::uniform_int_distribution rand_tok(0, n_vocab - 1); + std::uniform_real_distribution rand_unif(0.0f, 1.0f); + + // ---- ENCODER phase: prefill the prompt prefix (causal, no self-conditioning) ---- + const auto t_prefill0 = std::chrono::steady_clock::now(); + int n_past = 0; + if (prefix_len > 0) { + if (!prefill_causal(prompt_tokens, 0, &n_decode)) { + out.error = "prompt prefill (encoder) decode failed"; + return out; + } + n_past = prefix_len; + } + out.prefill_ms = std::chrono::duration(std::chrono::steady_clock::now() - t_prefill0).count(); + + std::vector canvas(canvas_length); + std::vector argmax_canvas(canvas_length, -1); + std::vector prev_argmax(canvas_length, -1); + std::vector accepted(canvas_length); + std::vector sc_ids ((size_t) SC_K * canvas_length, 0); + std::vector sc_probs((size_t) SC_K * canvas_length, 0.0f); + std::vector generated; + + int n_blocks_run = 0, n_steps_total = 0; + bool done = false; + + const auto t_gen0 = std::chrono::steady_clock::now(); + for (int block = 0; block < max_canvases && !done; ++block) { + ++n_blocks_run; + for (auto & t : canvas) t = rand_tok(rng); + std::fill(prev_argmax.begin(), prev_argmax.end(), -1); + llama_set_diffusion_self_cond_topk(ctx, nullptr, nullptr, 0, 0); + + for (int cur_step = n_steps; cur_step >= 1; --cur_step) { + ++n_steps_total; + llama_set_causal_attn(ctx, false); + llama_set_diffusion_decoder_phase(ctx, true); + batch.n_tokens = canvas_length; + for (int j = 0; j < canvas_length; ++j) { + batch.token[j] = canvas[j]; + batch.pos[j] = n_past + j; + batch.n_seq_id[j] = 1; + batch.seq_id[j][0] = 0; + batch.logits[j] = 1; + } + if (llama_decode(ctx, batch) != 0) { + out.error = "llama_decode failed at denoising step " + std::to_string(cur_step); + return out; + } + ++n_decode; + const float * logits = llama_get_logits(ctx); + + const float temp = TEMP_MIN + (TEMP_MAX - TEMP_MIN) * ((float) cur_step / (float) n_steps); + + std::vector entropy(canvas_length); + std::vector sampled(canvas_length); + + int k_step = 0; + if (rq.topk_start > 0 && rq.topk_end > 0) { + const float frac = (n_steps > 1) ? (float) (cur_step - 1) / (float) (n_steps - 1) : 0.0f; + k_step = (int) lroundf(rq.topk_end + (rq.topk_start - rq.topk_end) * frac); + } else if (rq.topk_fixed > 0) { + k_step = rq.topk_fixed; + } + if (k_step <= 0 || k_step >= n_vocab) k_step = 0; + + std::fill(sc_probs.begin(), sc_probs.end(), 0.0f); + std::fill(sc_ids.begin(), sc_ids.end(), 0); + + const auto cmp = [](const std::pair&a, const std::pair&b){ return a.first > b.first; }; + + if (k_step == 0) { + std::vector probs(n_vocab); + std::vector> scheap; scheap.reserve(SC_K); + for (int j = 0; j < canvas_length; ++j) { + const float * lg = logits + (size_t) j * n_vocab; + float maxl = -INFINITY; int amax = 0; + scheap.clear(); + for (int v = 0; v < n_vocab; ++v) { + const float x = lg[v] / temp; + if (x > maxl) { maxl = x; amax = v; } + if ((int) scheap.size() < SC_K) { + scheap.push_back({x, v}); + std::push_heap(scheap.begin(), scheap.end(), cmp); + } else if (x > scheap.front().first) { + std::pop_heap(scheap.begin(), scheap.end(), cmp); + scheap.back() = {x, v}; + std::push_heap(scheap.begin(), scheap.end(), cmp); + } + } + float sum = 0.0f; + for (int v = 0; v < n_vocab; ++v) { const float p = expf(lg[v] / temp - maxl); probs[v] = p; sum += p; } + float ent = 0.0f; + const float r = rand_unif(rng) * sum; + float cum = 0.0f; int tok = amax; bool picked = false; + for (int v = 0; v < n_vocab; ++v) { + const float p = probs[v] / sum; + if (p > 0.0f) ent -= p * logf(p); + cum += probs[v]; + if (!picked && cum >= r) { tok = v; picked = true; } + } + int32_t * sid = sc_ids.data() + (size_t) j * SC_K; + float * spr = sc_probs.data() + (size_t) j * SC_K; + int slot = 0; + for (auto & h : scheap) { sid[slot] = h.second; spr[slot] = expf(h.first - maxl) / sum; ++slot; } + entropy[j] = ent; sampled[j] = tok; argmax_canvas[j] = amax; + } + } else { + const int heap_k = std::max(k_step, SC_K); + std::vector> heap; heap.reserve(heap_k); + for (int j = 0; j < canvas_length; ++j) { + const float * lg = logits + (size_t) j * n_vocab; + float maxl = -INFINITY; int amax = 0; + heap.clear(); + for (int v = 0; v < n_vocab; ++v) { + const float x = lg[v] / temp; + if (x > maxl) { maxl = x; amax = v; } + if ((int) heap.size() < heap_k) { + heap.push_back({x, v}); + std::push_heap(heap.begin(), heap.end(), cmp); + } else if (x > heap.front().first) { + std::pop_heap(heap.begin(), heap.end(), cmp); + heap.back() = {x, v}; + std::push_heap(heap.begin(), heap.end(), cmp); + } + } + std::sort(heap.begin(), heap.end(), [](const std::pair&a, const std::pair&b){ return a.first > b.first; }); + float Zk = 0.0f; + for (int i = 0; i < k_step; ++i) { const float e = expf(heap[i].first - maxl); heap[i].first = e; Zk += e; } + float ent; + if (rq.topk_tail) { + double Zf = 0.0, T = 0.0; + for (int v = 0; v < n_vocab; ++v) { + const double d = (double) (lg[v] / temp) - (double) maxl; + const double e = exp(d); Zf += e; T += d * e; + } + ent = (float) (log(Zf) - T / Zf); + } else { + ent = 0.0f; + for (int i = 0; i < k_step; ++i) { const float q = heap[i].first / Zk; if (q > 0.0f) ent -= q * logf(q); } + } + const float r = rand_unif(rng) * Zk; + float cum = 0.0f; int tok = amax; bool picked = false; + for (int i = 0; i < k_step; ++i) { cum += heap[i].first; if (!picked && cum >= r) { tok = heap[i].second; picked = true; } } + int32_t * sid = sc_ids.data() + (size_t) j * SC_K; + float * spr = sc_probs.data() + (size_t) j * SC_K; + const int n_sc = std::min(k_step, SC_K); + for (int i = 0; i < n_sc; ++i) { sid[i] = heap[i].second; spr[i] = heap[i].first / Zk; } + entropy[j] = ent; sampled[j] = tok; argmax_canvas[j] = amax; + } + } + + llama_memory_seq_rm(mem, 0, n_past, -1); + llama_set_diffusion_self_cond_topk(ctx, sc_ids.data(), sc_probs.data(), SC_K, canvas_length); + + std::vector order(canvas_length); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), [&](int a, int b) { return entropy[a] < entropy[b]; }); + std::vector accept_mask(canvas_length, 0); + float prefix = 0.0f; + for (int k = 0; k < canvas_length; ++k) { + if (prefix <= ENTROPY_BOUND) { accept_mask[order[k]] = 1; prefix += entropy[order[k]]; } + else break; + } + for (int i = 0; i < canvas_length; ++i) if (accept_mask[i]) accepted[i] = sampled[i]; + + const float mean_ent = std::accumulate(entropy.begin(), entropy.end(), 0.0f) / canvas_length; + const bool stable = (STABILITY_THRESHOLD == 0) || (argmax_canvas == prev_argmax); + const bool confident = mean_ent < CONFIDENCE_THRESHOLD; + if (stable && confident) break; + prev_argmax = argmax_canvas; + + for (int i = 0; i < canvas_length; ++i) canvas[i] = accept_mask[i] ? accepted[i] : rand_tok(rng); + } + + const std::vector & block_out = argmax_canvas; + generated.insert(generated.end(), block_out.begin(), block_out.end()); + if (!rq.ignore_eos) { + for (int j = 0; j < canvas_length; ++j) { + if (llama_vocab_is_eog(vocab, block_out[j])) { done = true; break; } + } + } + if (!done && block + 1 < max_canvases) { + if (!prefill_causal(block_out, n_past, &n_decode)) { + out.error = "canvas commit (encoder) decode failed"; + return out; + } + n_past += canvas_length; + } + } + out.gen_ms = std::chrono::duration(std::chrono::steady_clock::now() - t_gen0).count(); + out.full = common_detokenize(vocab, generated, false); + + // extract the final response: after the last "" close, until the first eog token + llama_token chan_close = LLAMA_TOKEN_NULL; + { + auto t = common_tokenize(vocab, "", false, true); + if (t.size() == 1) chan_close = t[0]; + } + const int n_gen = (int) generated.size(); + int start = 0; + if (chan_close != LLAMA_TOKEN_NULL) { + for (int j = 0; j < n_gen; ++j) if (generated[j] == chan_close) start = j + 1; + } + std::vector answer; + for (int j = start; j < n_gen; ++j) { + if (llama_vocab_is_eog(vocab, generated[j])) break; + answer.push_back(generated[j]); + } + std::string ans = common_detokenize(vocab, answer, false); + { + std::string s = ans; + size_t h = s.find_first_not_of(" \n\t"); if (h != std::string::npos) s = s.substr(h); + const size_t half = s.size() / 2; + if (half > 0 && s.compare(0, half, s, s.size() - half, half) == 0) ans = s.substr(0, half); + } + + out.answer = ans; + out.prompt_tokens = prefix_len; + out.answer_tokens = (int) answer.size(); + out.n_blocks = n_blocks_run; + out.n_steps_total = n_steps_total; + out.canvas_tokens = n_blocks_run * canvas_length; + out.n_decode = n_decode; + out.ok = true; + return out; + } +}; + +// ------------------------------------------------------------------------------------------------ +// JSON / log helpers +// ------------------------------------------------------------------------------------------------ +static std::atomic g_req_counter{0}; +static std::string gen_id(const char * prefix) { return std::string(prefix) + "-" + std::to_string(g_req_counter.fetch_add(1)); } +static json error_json(const std::string & msg, const std::string & type, int code) { + return json{{"error", {{"message", msg}, {"type", type}, {"code", code}}}}; +} + +// llama-server `timings` object, with diffusion fields. prompt_* describes the encoder prefill; +// predicted_* describes the answer tokens; the nested `diffusion` object describes the denoising. +static json timings_json(const diffusion_result & r) { + const double ppt = r.prompt_tokens > 0 ? r.prefill_ms / r.prompt_tokens : 0.0; + const double pps = r.prefill_ms > 0 ? r.prompt_tokens * 1e3 / r.prefill_ms : 0.0; + const double dpt = r.answer_tokens > 0 ? r.gen_ms / r.answer_tokens : 0.0; + const double dps = r.gen_ms > 0 ? r.answer_tokens * 1e3 / r.gen_ms : 0.0; + const double sps = r.gen_ms > 0 ? r.n_steps_total * 1e3 / r.gen_ms : 0.0; + const double cps = r.gen_ms > 0 ? r.canvas_tokens * 1e3 / r.gen_ms : 0.0; + const double mps = r.n_steps_total > 0 ? r.gen_ms / r.n_steps_total : 0.0; + return json{ + {"cache_n", 0}, + {"prompt_n", r.prompt_tokens}, + {"prompt_ms", r.prefill_ms}, + {"prompt_per_token_ms", ppt}, + {"prompt_per_second", pps}, + {"predicted_n", r.answer_tokens}, + {"predicted_ms", r.gen_ms}, + {"predicted_per_token_ms", dpt}, + {"predicted_per_second", dps}, + {"diffusion", { + {"n_blocks", r.n_blocks}, + {"n_steps", r.n_steps_total}, + {"canvas_tokens", r.canvas_tokens}, + {"ms_per_step", mps}, + {"steps_per_second", sps}, + {"canvas_tokens_per_second", cps}, + {"n_decode", r.n_decode}, + }}, + }; +} + +static json usage_json(const diffusion_result & r) { + return json{ + {"prompt_tokens", r.prompt_tokens}, + {"completion_tokens", r.answer_tokens}, + {"total_tokens", r.prompt_tokens + r.answer_tokens}, + {"prompt_tokens_details", {{"cached_tokens", 0}}}, + }; +} + +// per-request timing log, llama_perf-style but with the denoise phase substituted for AR eval +static void log_timings(const diffusion_result & r) { + const double ppt = r.prompt_tokens > 0 ? r.prefill_ms / r.prompt_tokens : 0.0; + const double pps = r.prefill_ms > 0 ? r.prompt_tokens * 1e3 / r.prefill_ms : 0.0; + const double mps = r.n_steps_total > 0 ? r.gen_ms / r.n_steps_total : 0.0; + const double sps = r.gen_ms > 0 ? r.n_steps_total * 1e3 / r.gen_ms : 0.0; + const double cps = r.gen_ms > 0 ? r.canvas_tokens * 1e3 / r.gen_ms : 0.0; + SLT_INF(0, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + r.prefill_ms, r.prompt_tokens, ppt, pps); + SLT_INF(0, " denoise time = %10.2f ms / %5d steps (%8.2f ms per step, %8.2f steps per second)\n", + r.gen_ms, r.n_steps_total, mps, sps); + SLT_INF(0, " gen tokens = %5d answer | %5d canvas over %d block(s) (%8.2f canvas tok/s)\n", + r.answer_tokens, r.canvas_tokens, r.n_blocks, cps); + SLT_INF(0, " total time = %10.2f ms / %5d decode call(s)\n", + r.prefill_ms + r.gen_ms, r.n_decode); +} + +static diffusion_request request_from_body(const json & body, const diffusion_server & srv, uint32_t default_seed) { + diffusion_request rq; + rq.canvas_length = srv.canvas_length; + rq.n_steps = srv.n_steps; + rq.seed = default_seed; + int max_tokens = 0; + if (body.contains("max_tokens") && body["max_tokens"].is_number_integer()) max_tokens = body["max_tokens"].get(); + else if (body.contains("max_completion_tokens") && body["max_completion_tokens"].is_number_integer()) max_tokens = body["max_completion_tokens"].get(); + rq.max_canvases = max_tokens > 0 ? (max_tokens + rq.canvas_length - 1) / rq.canvas_length : 1; + if (body.contains("top_k") && body["top_k"].is_number_integer()) rq.topk_fixed = body["top_k"].get(); + if (body.contains("seed") && body["seed"].is_number_integer()) rq.seed = (uint32_t) body["seed"].get(); + if (body.contains("ignore_eos") && body["ignore_eos"].is_boolean()) rq.ignore_eos = body["ignore_eos"].get(); + return rq; +} + +// ------------------------------------------------------------------------------------------------ +int main(int argc, char ** argv) { + std::string hostname = "127.0.0.1"; + int port = 8080; + std::string api_key; + bool enable_metrics = false; + bool enable_slots = false; + + // Pull server-only flags out of argv before common_params_parse (which validates against + // LLAMA_EXAMPLE_DIFFUSION and would reject these). Everything else (-m, --mmproj, -ngl, -c, + // --seed, --top-k*, -t, ...) is forwarded and parsed normally. + std::vector fwd; + fwd.push_back(argv[0]); + for (int i = 1; i < argc; ++i) { + const std::string a = argv[i]; + auto next = [&](const char * def) -> std::string { return (i + 1 < argc) ? argv[++i] : def; }; + if (a == "--host") { hostname = next("127.0.0.1"); } + else if (a == "--port") { port = atoi(next("8080").c_str()); } + else if (a == "--api-key") { api_key = next(""); } + else if (a == "--metrics") { enable_metrics = true; } + else if (a == "--slots") { enable_slots = true; } + else if (a == "--no-metrics") { enable_metrics = false; } + else if (a == "--no-slots") { enable_slots = false; } + else { fwd.push_back(argv[i]); } + } + + common_params params; + if (!common_params_parse((int) fwd.size(), fwd.data(), params, LLAMA_EXAMPLE_DIFFUSION)) { + return 1; + } + common_init(); + + llama_backend_init(); + llama_numa_init(params.numa); + + SRV_INF("%s\n", "loading model"); + + llama_model_params model_params = llama_model_default_params(); + model_params.n_gpu_layers = params.n_gpu_layers >= 0 ? params.n_gpu_layers : 999; + model_params.devices = params.devices.data(); + model_params.use_mmap = params.use_mmap; + + diffusion_server srv; + srv.model = llama_model_load_from_file(params.model.path.c_str(), model_params); + if (!srv.model) { + SRV_ERR("failed to load model '%s'\n", params.model.path.c_str()); + return 1; + } + if (!llama_model_is_diffusion(srv.model)) { + SRV_ERR("'%s' is not a diffusion model\n", params.model.path.c_str()); + llama_model_free(srv.model); + return 1; + } + + srv.vocab = llama_model_get_vocab(srv.model); + srv.n_vocab = llama_vocab_n_tokens(srv.vocab); + srv.canvas_length = DEF_CANVAS_LENGTH; + srv.n_steps = DEF_MAX_DENOISE_STEPS; + srv.n_ub = srv.canvas_length; + srv.n_ctx = params.n_ctx > 0 ? (int) params.n_ctx : 4096; + if (srv.n_ctx < 2 * srv.canvas_length) srv.n_ctx = 2 * srv.canvas_length; + srv.model_path = params.model.path; + srv.metrics.t_start = (int64_t) std::time(nullptr); + + { + std::string p = params.model.path; + size_t slash = p.find_last_of("/\\"); + srv.model_id = (slash == std::string::npos) ? p : p.substr(slash + 1); + size_t dot = srv.model_id.rfind(".gguf"); + if (dot != std::string::npos) srv.model_id = srv.model_id.substr(0, dot); + if (srv.model_id.empty()) srv.model_id = "diffusion-gemma"; + } + + llama_context_params ctx_params = llama_context_default_params(); + ctx_params.n_ctx = srv.n_ctx; + ctx_params.n_batch = srv.n_ub; + ctx_params.n_ubatch = srv.n_ub; + ctx_params.no_perf = params.no_perf; + + srv.ctx = llama_init_from_model(srv.model, ctx_params); + if (!srv.ctx) { + SRV_ERR("%s\n", "failed to create context"); + llama_model_free(srv.model); + return 1; + } + llama_set_n_threads(srv.ctx, params.cpuparams.n_threads, params.cpuparams_batch.n_threads); + srv.mem = llama_get_memory(srv.ctx); + srv.batch = llama_batch_init(srv.n_ub, 0, 1); + srv.templates = common_chat_templates_init(srv.model, ""); + + const uint32_t default_seed = params.sampling.seed == LLAMA_DEFAULT_SEED ? 1234u : params.sampling.seed; + + SRV_INF("%s\n", llama_print_system_info()); + SRV_INF("model loaded: '%s' | n_ctx = %d | canvas = %d | denoise steps = %d | 1 slot\n", + srv.model_id.c_str(), srv.n_ctx, srv.canvas_length, srv.n_steps); + + // ------------------------------------------------------------------------------------------ + // HTTP server + // ------------------------------------------------------------------------------------------ + httplib::Server http; + http.set_default_headers({ + {"Server", "llama.cpp-diffusion-gemma"}, + {"Access-Control-Allow-Origin", "*"}, + {"Access-Control-Allow-Headers", "Content-Type, Authorization"}, + {"Access-Control-Allow-Methods", "GET, POST, OPTIONS"}, + }); + + // access log (llama-server: SRV_TRC "done request: METHOD PATH ADDR STATUS"; skip noisy paths) + http.set_logger([](const httplib::Request & req, const httplib::Response & res) { + if (req.path == "/health" || req.path == "/v1/health" || req.path == "/metrics" || + req.path == "/props" || req.path == "/models" || req.path == "/v1/models") { + return; + } + SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); + SRV_DBG("request body: %s\n", req.body.c_str()); + }); + + if (!api_key.empty()) { + http.set_pre_routing_handler([api_key](const httplib::Request & req, httplib::Response & res) { + if (req.method == "OPTIONS" || req.path == "/health" || req.path == "/v1/health") { + return httplib::Server::HandlerResponse::Unhandled; + } + if (req.get_header_value("Authorization") != "Bearer " + api_key) { + res.status = 401; + res.set_content(error_json("invalid api key", "authentication_error", 401).dump(), "application/json"); + return httplib::Server::HandlerResponse::Handled; + } + return httplib::Server::HandlerResponse::Unhandled; + }); + } + + http.Options(R"(.*)", [](const httplib::Request &, httplib::Response & res) { res.status = 204; }); + + auto health = [](const httplib::Request &, httplib::Response & res) { + res.set_content(json{{"status", "ok"}}.dump(), "application/json"); + }; + http.Get("/health", health); + http.Get("/v1/health", health); + + http.Get("/v1/models", [&srv](const httplib::Request &, httplib::Response & res) { + json models = json::array(); + models.push_back({{"id", srv.model_id}, {"object", "model"}, {"created", (int64_t) std::time(nullptr)}, {"owned_by", "local"}}); + res.set_content(json{{"object", "list"}, {"data", models}}.dump(), "application/json"); + }); + http.Get("/models", [&srv](const httplib::Request &, httplib::Response & res) { + json models = json::array(); + models.push_back({{"id", srv.model_id}, {"object", "model"}, {"created", (int64_t) std::time(nullptr)}, {"owned_by", "local"}}); + res.set_content(json{{"object", "list"}, {"data", models}}.dump(), "application/json"); + }); + + // /props (llama-server analogue): server + model + default generation settings + http.Get("/props", [&](const httplib::Request &, httplib::Response & res) { + json props{ + {"default_generation_settings", { + {"n_ctx", srv.n_ctx}, + {"canvas_length", srv.canvas_length}, + {"n_denoise_steps", srv.n_steps}, + {"temperature_min", TEMP_MIN}, + {"temperature_max", TEMP_MAX}, + {"entropy_bound", ENTROPY_BOUND}, + {"confidence_threshold", CONFIDENCE_THRESHOLD}, + {"stability_threshold", STABILITY_THRESHOLD}, + {"self_cond_topk", SC_K}, + }}, + {"total_slots", 1}, + {"model_path", srv.model_path}, + {"model_alias", srv.model_id}, + {"n_ctx", srv.n_ctx}, + {"build_info", srv.build_info}, + {"generation_type", "block-diffusion"}, + {"endpoint_slots", enable_slots}, + {"endpoint_metrics", enable_metrics}, + }; + res.set_content(props.dump(), "application/json"); + }); + + // /metrics (Prometheus). Gated behind --metrics, like llama-server. + if (enable_metrics) { + http.Get("/metrics", [&srv](const httplib::Request &, httplib::Response & res) { + server_metrics & m = srv.metrics; + std::lock_guard lk(m.mu); + const int n_proc = m.n_processing.load(); + char buf[8192]; + int n = snprintf(buf, sizeof(buf), + // --- counters (re-mapped to diffusion where the AR concept differs) --- + "# HELP llamacpp:prompt_tokens_total Number of prompt tokens processed.\n" + "# TYPE llamacpp:prompt_tokens_total counter\n" + "llamacpp:prompt_tokens_total %llu\n" + "# HELP llamacpp:prompt_seconds_total Prompt (encoder prefill) process time.\n" + "# TYPE llamacpp:prompt_seconds_total counter\n" + "llamacpp:prompt_seconds_total %.3f\n" + "# HELP llamacpp:tokens_predicted_total Number of answer tokens generated.\n" + "# TYPE llamacpp:tokens_predicted_total counter\n" + "llamacpp:tokens_predicted_total %llu\n" + "# HELP llamacpp:tokens_predicted_seconds_total Denoising (generation) process time.\n" + "# TYPE llamacpp:tokens_predicted_seconds_total counter\n" + "llamacpp:tokens_predicted_seconds_total %.3f\n" + "# HELP llamacpp:n_decode_total Total number of llama_decode() calls (prefill + denoise + commit).\n" + "# TYPE llamacpp:n_decode_total counter\n" + "llamacpp:n_decode_total %llu\n" + "# HELP llamacpp:requests_total Total number of completed generation requests.\n" + "# TYPE llamacpp:requests_total counter\n" + "llamacpp:requests_total %llu\n" + "# HELP llamacpp:diffusion_blocks_total Total number of canvas blocks denoised.\n" + "# TYPE llamacpp:diffusion_blocks_total counter\n" + "llamacpp:diffusion_blocks_total %llu\n" + "# HELP llamacpp:diffusion_steps_total Total number of denoising steps.\n" + "# TYPE llamacpp:diffusion_steps_total counter\n" + "llamacpp:diffusion_steps_total %llu\n" + "# HELP llamacpp:diffusion_canvas_tokens_total Total canvas tokens denoised (blocks * canvas_length).\n" + "# TYPE llamacpp:diffusion_canvas_tokens_total counter\n" + "llamacpp:diffusion_canvas_tokens_total %llu\n" + // --- gauges (last request) --- + "# HELP llamacpp:prompt_tokens_seconds Average prompt throughput in tokens/s.\n" + "# TYPE llamacpp:prompt_tokens_seconds gauge\n" + "llamacpp:prompt_tokens_seconds %.3f\n" + "# HELP llamacpp:predicted_tokens_seconds Average answer-token throughput in tokens/s.\n" + "# TYPE llamacpp:predicted_tokens_seconds gauge\n" + "llamacpp:predicted_tokens_seconds %.3f\n" + "# HELP llamacpp:diffusion_steps_per_second Denoising steps per second (last request).\n" + "# TYPE llamacpp:diffusion_steps_per_second gauge\n" + "llamacpp:diffusion_steps_per_second %.3f\n" + "# HELP llamacpp:diffusion_canvas_tokens_per_second Canvas tokens per second (last request).\n" + "# TYPE llamacpp:diffusion_canvas_tokens_per_second gauge\n" + "llamacpp:diffusion_canvas_tokens_per_second %.3f\n" + "# HELP llamacpp:diffusion_ms_per_step Milliseconds per denoising step (last request).\n" + "# TYPE llamacpp:diffusion_ms_per_step gauge\n" + "llamacpp:diffusion_ms_per_step %.3f\n" + "# HELP llamacpp:requests_processing Number of requests currently generating.\n" + "# TYPE llamacpp:requests_processing gauge\n" + "llamacpp:requests_processing %d\n", + (unsigned long long) m.n_prompt_tokens_total, + m.t_prompt_ms_total / 1000.0, + (unsigned long long) m.n_tokens_predicted_total, + m.t_predicted_ms_total / 1000.0, + (unsigned long long) m.n_decode_total, + (unsigned long long) m.n_requests_total, + (unsigned long long) m.n_blocks_total, + (unsigned long long) m.n_steps_total, + (unsigned long long) m.n_canvas_tokens_total, + m.last_prompt_tps, + m.last_predicted_tps, + m.last_steps_per_second, + m.last_canvas_tps, + m.last_ms_per_step, + n_proc); + res.set_header("Process-Start-Time-Unix", std::to_string(m.t_start)); + res.set_content(std::string(buf, n > 0 ? (size_t) n : 0), "text/plain; version=0.0.4"); + }); + SRV_INF("%s\n", "metrics endpoint enabled at /metrics"); + } + + // /slots (single slot). Gated behind --slots, like llama-server. + if (enable_slots) { + http.Get("/slots", [&srv](const httplib::Request &, httplib::Response & res) { + json slot{ + {"id", 0}, + {"n_ctx", srv.n_ctx}, + {"is_processing", srv.metrics.n_processing.load() > 0}, + {"generation_type", "block-diffusion"}, + {"params", { + {"canvas_length", srv.canvas_length}, + {"n_denoise_steps", srv.n_steps}, + {"self_cond_topk", SC_K}, + }}, + }; + res.set_content(json::array({slot}).dump(), "application/json"); + }); + SRV_INF("%s\n", "slots endpoint enabled at /slots"); + } + + // shared: run one generation (under the slot mutex), update metrics, print timing log + auto run_for_body = [&srv, default_seed](const json & body, const std::vector & prompt_tokens) { + diffusion_request rq = request_from_body(body, srv, default_seed); + srv.metrics.n_processing.fetch_add(1); + std::unique_lock lock(srv.gen_mutex); + diffusion_result r = srv.generate(prompt_tokens, rq); + lock.unlock(); + srv.metrics.n_processing.fetch_sub(1); + if (r.ok) { srv.metrics.add(r); log_timings(r); } + return r; + }; + + // POST /v1/chat/completions + http.Post("/v1/chat/completions", [&](const httplib::Request & req, httplib::Response & res) { + json body; + try { body = json::parse(req.body); } + catch (const std::exception & e) { res.status = 400; res.set_content(error_json(std::string("invalid JSON: ") + e.what(), "invalid_request_error", 400).dump(), "application/json"); return; } + if (!body.contains("messages") || !body["messages"].is_array()) { + res.status = 400; res.set_content(error_json("'messages' (array) is required", "invalid_request_error", 400).dump(), "application/json"); return; + } + std::string prompt; + try { prompt = srv.format_messages(body["messages"]); } + catch (const std::exception & e) { res.status = 400; res.set_content(error_json(std::string("failed to format messages: ") + e.what(), "invalid_request_error", 400).dump(), "application/json"); return; } + const std::vector prompt_tokens = common_tokenize(srv.vocab, prompt, false, true); + const bool stream = body.value("stream", false); + + const diffusion_result r = run_for_body(body, prompt_tokens); + if (!r.ok) { res.status = 500; res.set_content(error_json(r.error, "server_error", 500).dump(), "application/json"); return; } + + if (!stream) { + json out{ + {"id", gen_id("chatcmpl")}, {"object", "chat.completion"}, {"created", (int64_t) std::time(nullptr)}, {"model", srv.model_id}, + {"choices", json::array({ json{{"index", 0}, {"message", {{"role", "assistant"}, {"content", r.answer}}}, {"finish_reason", "stop"}} })}, + {"usage", usage_json(r)}, + {"timings", timings_json(r)}, + }; + res.set_content(out.dump(), "application/json"); + return; + } + + const std::string id = gen_id("chatcmpl"); + const int64_t created = (int64_t) std::time(nullptr); + const std::string model_id = srv.model_id; + const std::string answer = r.answer; + const json timings = timings_json(r); + res.set_chunked_content_provider("text/event-stream", + [id, created, model_id, answer, timings](size_t, httplib::DataSink & sink) { + auto send = [&](const json & c) { std::string s = "data: " + c.dump() + "\n\n"; return sink.write(s.data(), s.size()); }; + auto chunk = [&](const json & delta, const char * finish, const json * extra) { + json c{{"id", id}, {"object", "chat.completion.chunk"}, {"created", created}, {"model", model_id}, + {"choices", json::array({ json{{"index", 0}, {"delta", delta}, {"finish_reason", finish ? json(finish) : json(nullptr)}} })}}; + if (extra) c["timings"] = *extra; + return send(c); + }; + if (!chunk(json{{"role", "assistant"}}, nullptr, nullptr)) return false; + if (!chunk(json{{"content", answer}}, nullptr, nullptr)) return false; + if (!chunk(json::object(), "stop", &timings)) return false; + const std::string done = "data: [DONE]\n\n"; + sink.write(done.data(), done.size()); + sink.done(); + return true; + }); + }); + + // POST /v1/completions (plain prompt) + http.Post("/v1/completions", [&](const httplib::Request & req, httplib::Response & res) { + json body; + try { body = json::parse(req.body); } + catch (const std::exception & e) { res.status = 400; res.set_content(error_json(std::string("invalid JSON: ") + e.what(), "invalid_request_error", 400).dump(), "application/json"); return; } + if (!body.contains("prompt") || !body["prompt"].is_string()) { + res.status = 400; res.set_content(error_json("'prompt' (string) is required", "invalid_request_error", 400).dump(), "application/json"); return; + } + const std::string prompt = body["prompt"].get(); + const std::vector prompt_tokens = common_tokenize(srv.vocab, prompt, true, true); + + const diffusion_result r = run_for_body(body, prompt_tokens); + if (!r.ok) { res.status = 500; res.set_content(error_json(r.error, "server_error", 500).dump(), "application/json"); return; } + json out{ + {"id", gen_id("cmpl")}, {"object", "text_completion"}, {"created", (int64_t) std::time(nullptr)}, {"model", srv.model_id}, + {"choices", json::array({ json{{"index", 0}, {"text", r.answer}, {"finish_reason", "stop"}} })}, + {"usage", usage_json(r)}, + {"timings", timings_json(r)}, + }; + res.set_content(out.dump(), "application/json"); + }); + + SRV_INF("server is listening on http://%s:%d\n", hostname.c_str(), port); + SRV_INF("%s\n", "all slots are idle"); + if (!http.listen(hostname, port)) { + SRV_ERR("failed to bind to %s:%d\n", hostname.c_str(), port); + llama_batch_free(srv.batch); + llama_free(srv.ctx); + llama_model_free(srv.model); + llama_backend_free(); + return 1; + } + + llama_batch_free(srv.batch); + llama_free(srv.ctx); + llama_model_free(srv.model); + llama_backend_free(); + return 0; +} From ed011ec8153a50cedc0e3b317489bab30df17525 Mon Sep 17 00:00:00 2001 From: Loveneet Nigam Date: Tue, 9 Jun 2026 12:10:55 +0530 Subject: [PATCH 02/13] diffusion-gemma: add CUDA graph GPU sampling server path --- .../diffusion-gemma/diffusion-gemma-cli.cpp | 76 ++- .../diffusion-gemma-server.cpp | 115 +++- ggml/include/ggml-cuda.h | 25 + ggml/src/ggml-cuda/diffusion-sampling.cu | 493 ++++++++++++++++++ ggml/src/ggml-cuda/diffusion-sampling.cuh | 9 + ggml/src/ggml-cuda/ggml-cuda.cu | 27 +- include/llama.h | 29 ++ src/llama-context.cpp | 98 +++- src/llama-context.h | 7 + 9 files changed, 855 insertions(+), 24 deletions(-) create mode 100644 ggml/src/ggml-cuda/diffusion-sampling.cu create mode 100644 ggml/src/ggml-cuda/diffusion-sampling.cuh diff --git a/examples/diffusion-gemma/diffusion-gemma-cli.cpp b/examples/diffusion-gemma/diffusion-gemma-cli.cpp index d7510f8e1e23..6beea37005b8 100644 --- a/examples/diffusion-gemma/diffusion-gemma-cli.cpp +++ b/examples/diffusion-gemma/diffusion-gemma-cli.cpp @@ -40,6 +40,7 @@ static constexpr float TEMP_MIN = 0.4f; // LinearTemperatureSche static constexpr float TEMP_MAX = 0.8f; // LinearTemperatureScheduleConfig.t_max static constexpr float CONFIDENCE_THRESHOLD = 0.005f; // StableAndConfident.confidence_threshold static constexpr int STABILITY_THRESHOLD = 1; // StableAndConfident.stability_threshold +static constexpr int GPU_SAMPLING_MAX_TOP_K = 1024; // small top-k sort kernel limit static int env_int(const char * name, int def) { const char * v = getenv(name); @@ -199,8 +200,21 @@ int main(int argc, char ** argv) { llama_set_diffusion_prompt_len(ctx, prefix_len); llama_memory_t mem = llama_get_memory(ctx); + const int topk_max_requested = + (topk_start > 0 && topk_end > 0) ? std::max(topk_start, topk_end) : topk_fixed; + const bool gpu_sampling_requested = env_int("DG_GPU_SAMPLING", 1) != 0; + const bool gpu_sampling_topk_ok = topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K; + const bool use_gpu_sampling = gpu_sampling_requested && + gpu_sampling_topk_ok && + llama_diffusion_sample_topk_supported(ctx); + llama_set_diffusion_gpu_sampling(ctx, use_gpu_sampling); + LOG_INF("diffusion-gemma: prefix=%d canvas=%d max_canvases=%d steps=%d entropy_bound=%.3f temp=[%.2f,%.2f] n_ctx=%d mm=%d\n", prefix_len, canvas_length, max_canvases, n_steps, entropy_bound, TEMP_MIN, TEMP_MAX, n_ctx, (int) use_mm); + LOG_INF("diffusion-gemma: gpu sampling: %s%s\n", + use_gpu_sampling ? "on" : "off", + (!gpu_sampling_topk_ok ? " (top-k exceeds CUDA fast-path limit)" : + (!gpu_sampling_requested ? " (disabled by DG_GPU_SAMPLING=0)" : ""))); if (topk_fixed > 0 || (topk_start > 0 && topk_end > 0)) { LOG_INF("diffusion-gemma: top-k sampling: fixed=%d anneal=[%d->%d] tail_correction=%d (vocab=%d)\n", topk_fixed, topk_start, topk_end, topk_tail, n_vocab); @@ -278,8 +292,13 @@ int main(int argc, char ** argv) { int n_blocks_run = 0; int n_steps_total = 0; const auto t_gen_start = std::chrono::steady_clock::now(); + const bool log_step_timing = env_int("DG_TIMING", 0) != 0; + double decode_enqueue_s = 0.0; + double sample_sync_s = 0.0; + double host_loop_s = 0.0; bool done = false; + bool failed = false; for (int block = 0; block < max_canvases && !done; ++block) { ++n_blocks_run; // 1. initialize canvas with random tokens @@ -303,12 +322,12 @@ int main(int argc, char ** argv) { batch.seq_id[j][0] = 0; batch.logits[j] = 1; } + const auto t_decode_start = std::chrono::steady_clock::now(); if (llama_decode(ctx, batch) != 0) { LOG_ERR("error: llama_decode failed at step %d\n", cur_step); break; } - // canvas logits occupy rows [0, canvas_length) (canvas-only ubatch) - const float * logits = llama_get_logits(ctx); + decode_enqueue_s += std::chrono::duration(std::chrono::steady_clock::now() - t_decode_start).count(); // 2b. linear temperature schedule: t = t_min + (t_max - t_min) * (cur_step / n_steps) const float temp = TEMP_MIN + (TEMP_MAX - TEMP_MIN) * ((float) cur_step / (float) n_steps); @@ -334,7 +353,32 @@ int main(int argc, char ** argv) { std::fill(sc_probs.begin(), sc_probs.end(), 0.0f); std::fill(sc_ids.begin(), sc_ids.end(), 0); - if (k_step == 0) { + bool sampled_ok = true; + const auto t_sample_start = std::chrono::steady_clock::now(); + if (use_gpu_sampling) { + llama_diffusion_sample_params sample_params = { + /* .n_tokens = */ canvas_length, + /* .top_k = */ k_step, + /* .self_cond_top_k = */ SC_K, + /* .temperature = */ temp, + /* .seed = */ params.sampling.seed == LLAMA_DEFAULT_SEED ? 1234u : params.sampling.seed, + /* .step = */ (uint32_t) n_steps_total, + /* .top_k_tail_correction = */ topk_tail != 0, + }; + llama_diffusion_sample_result sample_result = { + /* .sampled = */ sampled.data(), + /* .argmax = */ argmax_canvas.data(), + /* .entropy = */ entropy.data(), + /* .self_cond_ids = */ sc_ids.data(), + /* .self_cond_probs = */ sc_probs.data(), + }; + sampled_ok = llama_diffusion_sample_topk(ctx, &sample_params, &sample_result); + if (!sampled_ok) { + LOG_ERR("error: CUDA diffusion sampling failed at step %d (k=%d)\n", cur_step, k_step); + } + } else if (k_step == 0) { + // canvas logits occupy rows [0, canvas_length) (canvas-only ubatch) + const float * logits = llama_get_logits(ctx); // ---- full softmax over the whole vocabulary (reference behaviour) ---- // self-cond still feeds only the top-SC_K tokens (full-normalized probs); the dropped // tail carries negligible embedding weight and the post RMS norm absorbs the scale. @@ -385,6 +429,8 @@ int main(int argc, char ** argv) { argmax_canvas[j] = amax; } } else { + // canvas logits occupy rows [0, canvas_length) (canvas-only ubatch) + const float * logits = llama_get_logits(ctx); // ---- top-k host sampling: softmax / entropy / sample / self-cond over the top-k // logits only. Self-cond feeds the top min(k,SC_K) tokens (renormalized over the // sampled top-k), gathered in-graph; the dropped tail carries negligible weight. ---- @@ -453,6 +499,15 @@ int main(int argc, char ** argv) { argmax_canvas[j] = amax; } } + sample_sync_s += std::chrono::duration(std::chrono::steady_clock::now() - t_sample_start).count(); + if (!sampled_ok) { + llama_memory_seq_rm(mem, 0, n_past, -1); + failed = true; + done = true; + break; + } + + const auto t_host_start = std::chrono::steady_clock::now(); // roll back the canvas K/V written by this decode so the cache keeps only the committed // prefix [0, n_past); the next step re-decodes the canvas fresh against that prefix. @@ -502,6 +557,10 @@ int main(int argc, char ** argv) { for (int i = 0; i < canvas_length; ++i) { canvas[i] = accept_mask[i] ? accepted[i] : rand_tok(rng); } + host_loop_s += std::chrono::duration(std::chrono::steady_clock::now() - t_host_start).count(); + } + if (failed) { + break; } // 3. block output = the inline argmax of the last (stable) denoising step's logits. @@ -545,6 +604,13 @@ int main(int argc, char ** argv) { llama_batch_free(batch); + if (failed) { + llama_free(ctx); + llama_model_free(model); + llama_backend_free(); + return 1; + } + // the full denoised canvas (thought channel + response), for reference LOG_INF("\n=== generated canvas ===\n%s\n", common_detokenize(vocab, generated, false).c_str()); @@ -588,6 +654,10 @@ int main(int argc, char ** argv) { gen_s > 0.0 ? n_canvas_tok / gen_s : 0.0, n_steps_total > 0 ? gen_s / n_steps_total : 0.0, (int) answer.size()); + if (log_step_timing) { + LOG_INF("timing: decode enqueue %.3f s, sample/sync %.3f s, host loop %.3f s\n", + decode_enqueue_s, sample_sync_s, host_loop_s); + } llama_free(ctx); llama_model_free(model); diff --git a/examples/diffusion-gemma/diffusion-gemma-server.cpp b/examples/diffusion-gemma/diffusion-gemma-server.cpp index e6fd0a536a1b..ebad596a5515 100644 --- a/examples/diffusion-gemma/diffusion-gemma-server.cpp +++ b/examples/diffusion-gemma/diffusion-gemma-server.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,12 @@ static constexpr float TEMP_MAX = 0.8f; static constexpr float CONFIDENCE_THRESHOLD = 0.005f; static constexpr int STABILITY_THRESHOLD = 1; static constexpr int SC_K = 256; // must match llama_model_diffusion_gemma::N_SC_TOPK +static constexpr int GPU_SAMPLING_MAX_TOP_K = 1024; // CUDA diffusion sampler limit + +static int env_int(const char * name, int def) { + const char * v = getenv(name); + return v ? atoi(v) : def; +} // ------------------------------------------------------------------------------------------------ // per-request generation parameters and result @@ -147,6 +154,11 @@ struct diffusion_server { int n_ctx = 0; int n_ub = 0; // ubatch == canvas_length (keeps the decoder gather graph small) int n_vocab = 0; + int topk_fixed = 0; + int topk_start = 0; + int topk_end = 0; + bool topk_tail = false; + bool use_gpu_sampling = false; std::string model_id; std::string model_path; std::string build_info = "llama.cpp diffusion-gemma-server"; @@ -237,8 +249,22 @@ struct diffusion_server { for (int cur_step = n_steps; cur_step >= 1; --cur_step) { ++n_steps_total; + const float temp = TEMP_MIN + (TEMP_MAX - TEMP_MIN) * ((float) cur_step / (float) n_steps); + + int k_step = 0; + if (rq.topk_start > 0 && rq.topk_end > 0) { + const float frac = (n_steps > 1) ? (float) (cur_step - 1) / (float) (n_steps - 1) : 0.0f; + k_step = (int) lroundf(rq.topk_end + (rq.topk_start - rq.topk_end) * frac); + } else if (rq.topk_fixed > 0) { + k_step = rq.topk_fixed; + } + if (k_step <= 0 || k_step >= n_vocab) k_step = 0; + const bool use_gpu_sampling_step = use_gpu_sampling && + (k_step <= 0 || k_step <= GPU_SAMPLING_MAX_TOP_K); + llama_set_causal_attn(ctx, false); llama_set_diffusion_decoder_phase(ctx, true); + llama_set_diffusion_gpu_sampling(ctx, use_gpu_sampling_step); batch.n_tokens = canvas_length; for (int j = 0; j < canvas_length; ++j) { batch.token[j] = canvas[j]; @@ -252,28 +278,39 @@ struct diffusion_server { return out; } ++n_decode; - const float * logits = llama_get_logits(ctx); - - const float temp = TEMP_MIN + (TEMP_MAX - TEMP_MIN) * ((float) cur_step / (float) n_steps); std::vector entropy(canvas_length); std::vector sampled(canvas_length); - int k_step = 0; - if (rq.topk_start > 0 && rq.topk_end > 0) { - const float frac = (n_steps > 1) ? (float) (cur_step - 1) / (float) (n_steps - 1) : 0.0f; - k_step = (int) lroundf(rq.topk_end + (rq.topk_start - rq.topk_end) * frac); - } else if (rq.topk_fixed > 0) { - k_step = rq.topk_fixed; - } - if (k_step <= 0 || k_step >= n_vocab) k_step = 0; - std::fill(sc_probs.begin(), sc_probs.end(), 0.0f); std::fill(sc_ids.begin(), sc_ids.end(), 0); - const auto cmp = [](const std::pair&a, const std::pair&b){ return a.first > b.first; }; - - if (k_step == 0) { + if (use_gpu_sampling_step) { + llama_diffusion_sample_params sample_params = { + /* .n_tokens = */ canvas_length, + /* .top_k = */ k_step, + /* .self_cond_top_k = */ SC_K, + /* .temperature = */ temp, + /* .seed = */ rq.seed, + /* .step = */ (uint32_t) n_steps_total, + /* .top_k_tail_correction = */ rq.topk_tail, + }; + llama_diffusion_sample_result sample_result = { + /* .sampled = */ sampled.data(), + /* .argmax = */ argmax_canvas.data(), + /* .entropy = */ entropy.data(), + /* .self_cond_ids = */ sc_ids.data(), + /* .self_cond_probs = */ sc_probs.data(), + }; + if (!llama_diffusion_sample_topk(ctx, &sample_params, &sample_result)) { + llama_memory_seq_rm(mem, 0, n_past, -1); + out.error = "CUDA diffusion sampling failed at denoising step " + + std::to_string(cur_step) + " (k=" + std::to_string(k_step) + ")"; + return out; + } + } else if (k_step == 0) { + const float * logits = llama_get_logits(ctx); + const auto cmp = [](const std::pair&a, const std::pair&b){ return a.first > b.first; }; std::vector probs(n_vocab); std::vector> scheap; scheap.reserve(SC_K); for (int j = 0; j < canvas_length; ++j) { @@ -310,6 +347,8 @@ struct diffusion_server { entropy[j] = ent; sampled[j] = tok; argmax_canvas[j] = amax; } } else { + const float * logits = llama_get_logits(ctx); + const auto cmp = [](const std::pair&a, const std::pair&b){ return a.first > b.first; }; const int heap_k = std::max(k_step, SC_K); std::vector> heap; heap.reserve(heap_k); for (int j = 0; j < canvas_length; ++j) { @@ -503,12 +542,25 @@ static diffusion_request request_from_body(const json & body, const diffusion_se rq.canvas_length = srv.canvas_length; rq.n_steps = srv.n_steps; rq.seed = default_seed; + rq.topk_fixed = srv.topk_fixed; + rq.topk_start = srv.topk_start; + rq.topk_end = srv.topk_end; + rq.topk_tail = srv.topk_tail; int max_tokens = 0; if (body.contains("max_tokens") && body["max_tokens"].is_number_integer()) max_tokens = body["max_tokens"].get(); else if (body.contains("max_completion_tokens") && body["max_completion_tokens"].is_number_integer()) max_tokens = body["max_completion_tokens"].get(); rq.max_canvases = max_tokens > 0 ? (max_tokens + rq.canvas_length - 1) / rq.canvas_length : 1; - if (body.contains("top_k") && body["top_k"].is_number_integer()) rq.topk_fixed = body["top_k"].get(); - if (body.contains("seed") && body["seed"].is_number_integer()) rq.seed = (uint32_t) body["seed"].get(); + if (body.contains("top_k") && body["top_k"].is_number_integer()) { + rq.topk_fixed = body["top_k"].get(); + rq.topk_start = 0; + rq.topk_end = 0; + } + if (body.contains("top_k_start") && body["top_k_start"].is_number_integer()) rq.topk_start = body["top_k_start"].get(); + if (body.contains("top_k_end") && body["top_k_end"].is_number_integer()) rq.topk_end = body["top_k_end"].get(); + if (body.contains("top_k_tail_correction") && body["top_k_tail_correction"].is_boolean()) { + rq.topk_tail = body["top_k_tail_correction"].get(); + } + if (body.contains("seed") && body["seed"].is_number_integer()) rq.seed = (uint32_t) body["seed"].get(); if (body.contains("ignore_eos") && body["ignore_eos"].is_boolean()) rq.ignore_eos = body["ignore_eos"].get(); return rq; } @@ -574,6 +626,12 @@ int main(int argc, char ** argv) { srv.n_ub = srv.canvas_length; srv.n_ctx = params.n_ctx > 0 ? (int) params.n_ctx : 4096; if (srv.n_ctx < 2 * srv.canvas_length) srv.n_ctx = 2 * srv.canvas_length; + srv.topk_fixed = (params.sampling.user_sampling_config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K) + ? params.sampling.top_k + : 0; + srv.topk_start = params.diffusion.top_k_start; + srv.topk_end = params.diffusion.top_k_end; + srv.topk_tail = params.diffusion.top_k_tail_correction; srv.model_path = params.model.path; srv.metrics.t_start = (int64_t) std::time(nullptr); @@ -603,11 +661,24 @@ int main(int argc, char ** argv) { srv.batch = llama_batch_init(srv.n_ub, 0, 1); srv.templates = common_chat_templates_init(srv.model, ""); + const int topk_max_requested = + (srv.topk_start > 0 && srv.topk_end > 0) ? std::max(srv.topk_start, srv.topk_end) : srv.topk_fixed; + const bool gpu_sampling_requested = env_int("DG_GPU_SAMPLING", 1) != 0; + srv.use_gpu_sampling = gpu_sampling_requested && + llama_diffusion_sample_topk_supported(srv.ctx); + llama_set_diffusion_gpu_sampling(srv.ctx, srv.use_gpu_sampling); + const bool default_topk_gpu_ok = topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K; + const uint32_t default_seed = params.sampling.seed == LLAMA_DEFAULT_SEED ? 1234u : params.sampling.seed; SRV_INF("%s\n", llama_print_system_info()); SRV_INF("model loaded: '%s' | n_ctx = %d | canvas = %d | denoise steps = %d | 1 slot\n", srv.model_id.c_str(), srv.n_ctx, srv.canvas_length, srv.n_steps); + SRV_INF("gpu sampling: %s%s | top-k fixed=%d anneal=[%d->%d] tail_correction=%d\n", + srv.use_gpu_sampling ? "on" : "off", + (!default_topk_gpu_ok ? " (default top-k will use CPU fallback until k <= CUDA limit)" : + (!gpu_sampling_requested ? " (disabled by DG_GPU_SAMPLING=0)" : "")), + srv.topk_fixed, srv.topk_start, srv.topk_end, srv.topk_tail ? 1 : 0); // ------------------------------------------------------------------------------------------ // HTTP server @@ -676,6 +747,11 @@ int main(int argc, char ** argv) { {"confidence_threshold", CONFIDENCE_THRESHOLD}, {"stability_threshold", STABILITY_THRESHOLD}, {"self_cond_topk", SC_K}, + {"gpu_sampling", srv.use_gpu_sampling}, + {"top_k", srv.topk_fixed}, + {"top_k_start", srv.topk_start}, + {"top_k_end", srv.topk_end}, + {"top_k_tail_correction", srv.topk_tail}, }}, {"total_slots", 1}, {"model_path", srv.model_path}, @@ -777,6 +853,11 @@ int main(int argc, char ** argv) { {"canvas_length", srv.canvas_length}, {"n_denoise_steps", srv.n_steps}, {"self_cond_topk", SC_K}, + {"gpu_sampling", srv.use_gpu_sampling}, + {"top_k", srv.topk_fixed}, + {"top_k_start", srv.topk_start}, + {"top_k_end", srv.topk_end}, + {"top_k_tail_correction", srv.topk_tail}, }}, }; res.set_content(json::array({slot}).dump(), "application/json"); diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h index 5436c7ef579c..4f3691c2980f 100644 --- a/ggml/include/ggml-cuda.h +++ b/ggml/include/ggml-cuda.h @@ -43,6 +43,31 @@ GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * f GGML_BACKEND_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size); GGML_BACKEND_API void ggml_backend_cuda_unregister_host_buffer(void * buffer); +struct ggml_cuda_diffusion_sample_params { + int32_t n_vocab; + int32_t n_tokens; + int32_t top_k; + int32_t self_cond_top_k; + float temperature; + uint32_t seed; + uint32_t step; + bool top_k_tail_correction; +}; + +struct ggml_cuda_diffusion_sample_result { + int32_t * sampled; + int32_t * argmax; + float * entropy; + int32_t * self_cond_ids; + float * self_cond_probs; +}; + +typedef bool (*ggml_backend_cuda_diffusion_sample_topk_t)( + ggml_backend_t backend, + const struct ggml_tensor * logits, + const struct ggml_cuda_diffusion_sample_params * params, + struct ggml_cuda_diffusion_sample_result * result); + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cuda_reg(void); #ifdef __cplusplus diff --git a/ggml/src/ggml-cuda/diffusion-sampling.cu b/ggml/src/ggml-cuda/diffusion-sampling.cu new file mode 100644 index 000000000000..042e3af0520d --- /dev/null +++ b/ggml/src/ggml-cuda/diffusion-sampling.cu @@ -0,0 +1,493 @@ +#include "argsort.cuh" +#include "diffusion-sampling.cuh" +#include "../ggml-backend-impl.h" + +#include +#include +#include +#include + +#ifdef GGML_CUDA_USE_CUB +# include +# if (CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2) +# define CUB_DIFFUSION_TOP_K_AVAILABLE +# include +using namespace cub; +# endif +#endif + +static int next_power_of_2_host(int x) { + int n = 1; + while (n < x) { + n <<= 1; + } + return n; +} + +static bool diffusion_fast_topk_enabled() { + const char * v = std::getenv("GGML_CUDA_DIFFUSION_FAST_TOPK"); + return !v || std::strcmp(v, "0") != 0; +} + +static __device__ __forceinline__ bool diffusion_should_swap_desc( + const float a_val, const int a_id, + const float b_val, const int b_id) { + return a_val < b_val || (a_val == b_val && a_id > b_id); +} + +template +static __global__ void diffusion_select_topk_local_kernel( + const float * __restrict__ logits, + int * __restrict__ top_ids, + const int n_vocab, + const int heap_k) { + const int row = blockIdx.x; + const int tid = threadIdx.x; + + float vals[LOCAL_K]; + int ids[LOCAL_K]; +#pragma unroll + for (int i = 0; i < LOCAL_K; ++i) { + vals[i] = -FLT_MAX; + ids[i] = 0; + } + + const float * row_logits = logits + (size_t) row * n_vocab; + for (int v = tid; v < n_vocab; v += blockDim.x) { + const float x = row_logits[v]; + if (diffusion_should_swap_desc(vals[LOCAL_K - 1], ids[LOCAL_K - 1], x, v)) { + int pos = LOCAL_K - 1; +#pragma unroll + for (int i = LOCAL_K - 1; i > 0; --i) { + if (pos == i && diffusion_should_swap_desc(vals[i - 1], ids[i - 1], x, v)) { + vals[i] = vals[i - 1]; + ids[i] = ids[i - 1]; + --pos; + } + } + vals[pos] = x; + ids[pos] = v; + } + } + + constexpr int candidate_count = LOCAL_K * 256; + + extern __shared__ unsigned char smem[]; + float * s_vals = (float *) smem; + int * s_ids = (int *) (s_vals + candidate_count); + +#pragma unroll + for (int i = 0; i < LOCAL_K; ++i) { + const int dst = tid * LOCAL_K + i; + s_vals[dst] = vals[i]; + s_ids[dst] = ids[i]; + } + __syncthreads(); + + for (int k = 2; k <= candidate_count; k <<= 1) { + for (int j = k >> 1; j > 0; j >>= 1) { + for (int i = tid; i < candidate_count; i += blockDim.x) { + const int ixj = i ^ j; + if (ixj > i) { + const bool descending = (i & k) == 0; + const bool swap = descending + ? diffusion_should_swap_desc(s_vals[i], s_ids[i], s_vals[ixj], s_ids[ixj]) + : diffusion_should_swap_desc(s_vals[ixj], s_ids[ixj], s_vals[i], s_ids[i]); + if (swap) { + const float tv = s_vals[i]; + s_vals[i] = s_vals[ixj]; + s_vals[ixj] = tv; + const int ti = s_ids[i]; + s_ids[i] = s_ids[ixj]; + s_ids[ixj] = ti; + } + } + } + __syncthreads(); + } + } + + for (int i = tid; i < heap_k; i += blockDim.x) { + top_ids[(size_t) row * heap_k + i] = s_ids[i]; + } +} + +static void diffusion_select_topk_local( + const float * logits, + int * top_ids, + const int n_vocab, + const int n_tokens, + const int heap_k, + cudaStream_t stream) { + constexpr int block_size = 256; + + if (heap_k <= 256) { + constexpr int local_k = 8; + const size_t smem = (size_t) local_k * block_size * (sizeof(float) + sizeof(int)); + diffusion_select_topk_local_kernel<<>>( + logits, top_ids, n_vocab, heap_k); + } else { + constexpr int local_k = 16; + const size_t smem = (size_t) local_k * block_size * (sizeof(float) + sizeof(int)); + diffusion_select_topk_local_kernel<<>>( + logits, top_ids, n_vocab, heap_k); + } +} + +#ifdef CUB_DIFFUSION_TOP_K_AVAILABLE +static void diffusion_top_k_cub( + ggml_cuda_pool & pool, + const float * src, + int * dst, + const int ncols, + const int k, + cudaStream_t stream) { + auto requirements = cuda::execution::require(cuda::execution::determinism::not_guaranteed, + cuda::execution::output_ordering::unsorted); + auto stream_env = cuda::stream_ref{ stream }; + auto env = cuda::std::execution::env{ stream_env, requirements }; + auto indexes_in = cuda::make_counting_iterator(0); + + size_t temp_storage_bytes = 0; + CUDA_CHECK(DeviceTopK::MaxPairs(nullptr, temp_storage_bytes, src, cuda::discard_iterator(), indexes_in, dst, ncols, k, env)); + + ggml_cuda_pool_alloc temp_storage_alloc(pool, temp_storage_bytes); + void * d_temp_storage = temp_storage_alloc.get(); + + CUDA_CHECK(DeviceTopK::MaxPairs(d_temp_storage, temp_storage_bytes, src, cuda::discard_iterator(), indexes_in, dst, ncols, k, env)); +} +#endif + +static __global__ void diffusion_sort_top_ids_kernel( + const float * __restrict__ logits, + int * __restrict__ top_ids, + const int n_vocab, + const int heap_k, + const int heap_k_pad, + const float inv_temp) { + const int row = blockIdx.x; + const int tid = threadIdx.x; + + extern __shared__ unsigned char smem[]; + int * ids = (int *) smem; + float * vals = (float *) (ids + heap_k_pad); + + const int base = row * heap_k; + for (int i = tid; i < heap_k_pad; i += blockDim.x) { + if (i < heap_k) { + const int id = top_ids[base + i]; + ids[i] = id; + vals[i] = logits[(size_t) row * n_vocab + id] * inv_temp; + } else { + ids[i] = 0; + vals[i] = -FLT_MAX; + } + } + __syncthreads(); + + for (int k = 2; k <= heap_k_pad; k <<= 1) { + for (int j = k >> 1; j > 0; j >>= 1) { + const int ixj = tid ^ j; + if (ixj > tid && ixj < heap_k_pad) { + const bool up = (tid & k) == 0; + const bool swap = up ? (vals[tid] < vals[ixj]) : (vals[tid] > vals[ixj]); + if (swap) { + const float tv = vals[tid]; + vals[tid] = vals[ixj]; + vals[ixj] = tv; + const int ti = ids[tid]; + ids[tid] = ids[ixj]; + ids[ixj] = ti; + } + } + __syncthreads(); + } + } + + for (int i = tid; i < heap_k; i += blockDim.x) { + top_ids[base + i] = ids[i]; + } +} + +static __device__ __forceinline__ uint32_t diffusion_rng_u32(uint32_t x) { + x ^= x >> 16; + x *= 0x7feb352du; + x ^= x >> 15; + x *= 0x846ca68bu; + x ^= x >> 16; + return x; +} + +static __device__ __forceinline__ float diffusion_rng_uniform(uint32_t seed, uint32_t step, uint32_t row) { + const uint32_t x = diffusion_rng_u32(seed ^ (step * 0x9e3779b9u) ^ (row * 0x85ebca6bu)); + return ((x >> 8) + 0.5f) * (1.0f / 16777216.0f); +} + +static __global__ void diffusion_sample_kernel( + const float * __restrict__ logits, + const int * __restrict__ top_ids, + const int n_vocab, + const int n_tokens, + const int top_k, + const int heap_k, + const int sc_k, + const float inv_temp, + const uint32_t seed, + const uint32_t step, + const bool tail_correction, + int * __restrict__ sampled, + int * __restrict__ argmax, + float * __restrict__ entropy, + int * __restrict__ sc_ids, + float * __restrict__ sc_probs) { + const int row = blockIdx.x; + const int tid = threadIdx.x; + if (row >= n_tokens) { + return; + } + + __shared__ float s_val[1024]; + __shared__ float s_sum[1024]; + __shared__ int s_idx[1024]; + + const float * row_logits = logits + (size_t) row * n_vocab; + const int * row_top = top_ids + (size_t) row * heap_k; + + float local_max = -FLT_MAX; + int local_idx = 0; + + if (top_k == 0 || tail_correction) { + for (int v = tid; v < n_vocab; v += blockDim.x) { + const float x = row_logits[v] * inv_temp; + if (x > local_max) { + local_max = x; + local_idx = v; + } + } + } else { + for (int i = tid; i < top_k; i += blockDim.x) { + const int v = row_top[i]; + const float x = row_logits[v] * inv_temp; + if (x > local_max) { + local_max = x; + local_idx = v; + } + } + } + + s_val[tid] = local_max; + s_idx[tid] = local_idx; + __syncthreads(); + + for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) { + if (tid < stride && s_val[tid + stride] > s_val[tid]) { + s_val[tid] = s_val[tid + stride]; + s_idx[tid] = s_idx[tid + stride]; + } + __syncthreads(); + } + + const float max_l = s_val[0]; + const int amax = s_idx[0]; + + float local_sum = 0.0f; + float local_t = 0.0f; + + if (top_k == 0 || tail_correction) { + for (int v = tid; v < n_vocab; v += blockDim.x) { + const float d = row_logits[v] * inv_temp - max_l; + const float e = expf(d); + local_sum += e; + local_t += d * e; + } + } else { + for (int i = tid; i < top_k; i += blockDim.x) { + const int v = row_top[i]; + const float d = row_logits[v] * inv_temp - max_l; + const float e = expf(d); + local_sum += e; + local_t += d * e; + } + } + + s_sum[tid] = local_sum; + s_val[tid] = local_t; + __syncthreads(); + + for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) { + if (tid < stride) { + s_sum[tid] += s_sum[tid + stride]; + s_val[tid] += s_val[tid + stride]; + } + __syncthreads(); + } + + const float z = s_sum[0]; + const float t = s_val[0]; + + if (tid == 0) { + argmax[row] = amax; + entropy[row] = logf(z) - t / z; + + float sample_z = z; + if (top_k > 0 && tail_correction) { + sample_z = 0.0f; + for (int i = 0; i < top_k; ++i) { + const int v = row_top[i]; + sample_z += expf(row_logits[v] * inv_temp - max_l); + } + } + + const float r = diffusion_rng_uniform(seed, step, row) * sample_z; + float cum = 0.0f; + int tok = amax; + + if (top_k == 0) { + for (int v = 0; v < n_vocab; ++v) { + cum += expf(row_logits[v] * inv_temp - max_l); + if (cum >= r) { + tok = v; + break; + } + } + } else { + for (int i = 0; i < top_k; ++i) { + const int v = row_top[i]; + cum += expf(row_logits[v] * inv_temp - max_l); + if (cum >= r) { + tok = v; + break; + } + } + } + sampled[row] = tok; + + const int n_sc = top_k == 0 ? sc_k : min(sc_k, top_k); + for (int i = 0; i < sc_k; ++i) { + const int out = row * sc_k + i; + if (i < n_sc) { + const int v = row_top[i]; + sc_ids[out] = v; + sc_probs[out] = expf(row_logits[v] * inv_temp - max_l) / sample_z; + } else { + sc_ids[out] = 0; + sc_probs[out] = 0.0f; + } + } + } +} + +bool ggml_cuda_diffusion_sample_topk( + ggml_backend_t backend, + const ggml_tensor * logits, + const ggml_cuda_diffusion_sample_params * params, + ggml_cuda_diffusion_sample_result * result) { + if (!backend || !logits || !params || !result) { + return false; + } + if (!ggml_backend_is_cuda(backend) || logits->type != GGML_TYPE_F32 || !ggml_is_contiguous(logits)) { + return false; + } + if (!result->sampled || !result->argmax || !result->entropy || !result->self_cond_ids || !result->self_cond_probs) { + return false; + } + + const int n_vocab = params->n_vocab > 0 ? params->n_vocab : (int) logits->ne[0]; + const int n_tokens = params->n_tokens > 0 ? params->n_tokens : (int) ggml_nrows(logits); + if (n_vocab <= 0 || n_tokens <= 0 || logits->ne[0] != n_vocab || ggml_nrows(logits) < n_tokens) { + return false; + } + + int top_k = params->top_k; + if (top_k <= 0 || top_k >= n_vocab) { + top_k = 0; + } + + const int sc_k = params->self_cond_top_k; + if (sc_k <= 0 || sc_k > 1024) { + return false; + } + + const int heap_k = top_k == 0 ? sc_k : std::max(top_k, sc_k); + if (heap_k <= 0 || heap_k > 1024 || heap_k > n_vocab) { + return false; + } + + const float temp = params->temperature > 0.0f ? params->temperature : 1.0f; + const float inv_temp = 1.0f / temp; + + ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *) backend->context; + ggml_cuda_set_device(ctx->device); + ggml_cuda_pool & pool = ctx->pool(); + cudaStream_t stream = ctx->stream(); + + const float * logits_d = (const float *) logits->data; + + ggml_cuda_pool_alloc top_ids_alloc(pool, (size_t) n_tokens * heap_k); + int * top_ids = top_ids_alloc.get(); + bool top_ids_sorted = false; + + const bool use_fast_topk = diffusion_fast_topk_enabled() && heap_k <= 1024 && n_vocab >= heap_k; + if (use_fast_topk) { + diffusion_select_topk_local(logits_d, top_ids, n_vocab, n_tokens, heap_k, stream); + top_ids_sorted = true; + } else { +#ifdef CUB_DIFFUSION_TOP_K_AVAILABLE + for (int row = 0; row < n_tokens; ++row) { + diffusion_top_k_cub(pool, logits_d + (size_t) row * n_vocab, top_ids + (size_t) row * heap_k, n_vocab, heap_k, stream); + } +#elif defined(GGML_CUDA_USE_CUB) + ggml_cuda_pool_alloc sorted_ids_alloc(pool, (size_t) n_tokens * n_vocab); + int * sorted_ids = sorted_ids_alloc.get(); + argsort_f32_i32_cuda_cub(pool, logits_d, sorted_ids, n_vocab, n_tokens, GGML_SORT_ORDER_DESC, stream); + CUDA_CHECK(cudaMemcpy2DAsync(top_ids, heap_k * sizeof(int), sorted_ids, n_vocab * sizeof(int), + heap_k * sizeof(int), n_tokens, cudaMemcpyDeviceToDevice, stream)); + top_ids_sorted = true; +#else + if (n_vocab > 1024) { + return false; + } + ggml_cuda_pool_alloc sorted_ids_alloc(pool, (size_t) n_tokens * n_vocab); + int * sorted_ids = sorted_ids_alloc.get(); + argsort_f32_i32_cuda_bitonic(logits_d, sorted_ids, n_vocab, n_tokens, GGML_SORT_ORDER_DESC, stream); + CUDA_CHECK(cudaMemcpy2DAsync(top_ids, heap_k * sizeof(int), sorted_ids, n_vocab * sizeof(int), + heap_k * sizeof(int), n_tokens, cudaMemcpyDeviceToDevice, stream)); + top_ids_sorted = true; +#endif + } + + if (!top_ids_sorted) { + const int heap_k_pad = next_power_of_2_host(heap_k); + const int sort_threads = std::max(32, heap_k_pad); + const size_t sort_smem = (size_t) heap_k_pad * (sizeof(int) + sizeof(float)); + diffusion_sort_top_ids_kernel<<>>( + logits_d, top_ids, n_vocab, heap_k, heap_k_pad, inv_temp); + } + + ggml_cuda_pool_alloc sampled_alloc(pool, n_tokens); + ggml_cuda_pool_alloc argmax_alloc(pool, n_tokens); + ggml_cuda_pool_alloc entropy_alloc(pool, n_tokens); + ggml_cuda_pool_alloc sc_ids_alloc(pool, (size_t) n_tokens * sc_k); + ggml_cuda_pool_alloc sc_probs_alloc(pool, (size_t) n_tokens * sc_k); + + constexpr int block_size = 256; + diffusion_sample_kernel<<>>( + logits_d, top_ids, n_vocab, n_tokens, top_k, heap_k, sc_k, inv_temp, + params->seed, params->step, params->top_k_tail_correction, + sampled_alloc.get(), argmax_alloc.get(), entropy_alloc.get(), + sc_ids_alloc.get(), sc_probs_alloc.get()); + + CUDA_CHECK(cudaMemcpyAsync(result->sampled, sampled_alloc.get(), (size_t) n_tokens * sizeof(int), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(result->argmax, argmax_alloc.get(), (size_t) n_tokens * sizeof(int), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(result->entropy, entropy_alloc.get(), (size_t) n_tokens * sizeof(float), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(result->self_cond_ids, sc_ids_alloc.get(), (size_t) n_tokens * sc_k * sizeof(int), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs, sc_probs_alloc.get(), (size_t) n_tokens * sc_k * sizeof(float), + cudaMemcpyDeviceToHost, stream)); + + CUDA_CHECK(cudaStreamSynchronize(stream)); + return true; +} diff --git a/ggml/src/ggml-cuda/diffusion-sampling.cuh b/ggml/src/ggml-cuda/diffusion-sampling.cuh new file mode 100644 index 000000000000..dfce406d9303 --- /dev/null +++ b/ggml/src/ggml-cuda/diffusion-sampling.cuh @@ -0,0 +1,9 @@ +#pragma once + +#include "common.cuh" + +bool ggml_cuda_diffusion_sample_topk( + ggml_backend_t backend, + const ggml_tensor * logits, + const ggml_cuda_diffusion_sample_params * params, + ggml_cuda_diffusion_sample_result * result); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index e779a9be9e95..17e2c10f2b9c 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -23,6 +23,7 @@ #include "ggml-cuda/cumsum.cuh" #include "ggml-cuda/diagmask.cuh" #include "ggml-cuda/diag.cuh" +#include "ggml-cuda/diffusion-sampling.cuh" #include "ggml-cuda/fattn.cuh" #include "ggml-cuda/fwht.cuh" #include "ggml-cuda/getrows.cuh" @@ -3296,6 +3297,10 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) { bool use_cuda_graph = true; + static const bool log_graph_incompat = [] { + const char * env = getenv("GGML_CUDA_GRAPH_DIAG"); + return env != nullptr && atoi(env) != 0; + }(); // Loop over nodes in GGML graph to obtain info needed for CUDA graph for (int i = 0; i < cgraph->n_nodes; i++) { @@ -3307,6 +3312,10 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) { if (node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_cuda_split(node->src[0]->buffer->buft)) { use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture + if (log_graph_incompat) { + GGML_LOG_INFO("%s: disabling CUDA graphs due to split buffer at node %d %s (%s)\n", + __func__, i, node->name, ggml_op_name(node->op)); + } #ifndef NDEBUG GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__); #endif @@ -3315,12 +3324,23 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) { // [TAG_MUL_MAT_ID_CUDA_GRAPHS] if (node->op == GGML_OP_MUL_MAT_ID) { const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; - const int mmvq_mmid_max = get_mmvq_mmid_max_batch(node->src[0]->type, cc); - if (!ggml_is_quantized(node->src[0]->type) || node->ne[2] > mmvq_mmid_max) { + const ggml_tensor * src0 = node->src[0]; + const ggml_tensor * src1 = node->src[1]; + const int mmvq_mmid_max = get_mmvq_mmid_max_batch(src0->type, cc); + const bool use_mmvq = ggml_is_quantized(src0->type) && node->ne[2] <= mmvq_mmid_max; + const bool use_mmq = ggml_is_quantized(src0->type) && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[2], /*n_experts=*/src0->ne[2]); + const bool use_mmf = ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true); + if (!use_mmvq && !use_mmq && !use_mmf) { // under these conditions, the mul_mat_id operation will need to synchronize the stream, so we cannot use CUDA graphs // TODO: figure out a way to enable for larger batch sizes, without hurting performance // ref: https://github.com/ggml-org/llama.cpp/pull/18958 use_cuda_graph = false; + if (log_graph_incompat) { + GGML_LOG_INFO("%s: disabling CUDA graphs due to MUL_MAT_ID at node %d %s: src0=%s ne2=%lld src1_ne2=%lld n_experts=%lld mmvq_mmid_max=%d quantized=%d mmq=%d mmf=%d\n", + __func__, i, node->name, ggml_type_name(src0->type), + (long long) node->ne[2], (long long) src1->ne[2], (long long) src0->ne[2], + mmvq_mmid_max, (int) ggml_is_quantized(src0->type), (int) use_mmq, (int) use_mmf); + } #ifndef NDEBUG GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__); #endif @@ -5664,6 +5684,9 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con if (strcmp(name, "ggml_backend_get_features") == 0) { return (void *)ggml_backend_cuda_get_features; } + if (strcmp(name, "ggml_backend_cuda_diffusion_sample_topk") == 0) { + return (void *)ggml_cuda_diffusion_sample_topk; + } return nullptr; } diff --git a/include/llama.h b/include/llama.h index cbb20b200ba5..eb2d9cb338fb 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1015,6 +1015,35 @@ extern "C" { int64_t k, int64_t n_tokens); + struct llama_diffusion_sample_params { + int32_t n_tokens; + int32_t top_k; + int32_t self_cond_top_k; + float temperature; + uint32_t seed; + uint32_t step; + bool top_k_tail_correction; + }; + + struct llama_diffusion_sample_result { + llama_token * sampled; + llama_token * argmax; + float * entropy; + int32_t * self_cond_ids; + float * self_cond_probs; + }; + + // CUDA-only fast path for block-diffusion sampling. When enabled, llama_decode() + // keeps the dense diffusion logits on the backend and does not copy them to the + // host output buffer. Call llama_diffusion_sample_topk() after llama_decode() to + // sample the most recent logits tensor and retrieve compact row results. + LLAMA_API bool llama_diffusion_sample_topk_supported(struct llama_context * ctx); + LLAMA_API void llama_set_diffusion_gpu_sampling(struct llama_context * ctx, bool enabled); + LLAMA_API bool llama_diffusion_sample_topk( + struct llama_context * ctx, + const struct llama_diffusion_sample_params * params, + struct llama_diffusion_sample_result * result); + // Set whether the model is in warmup mode or not // If true, all model tensors are activated during llama_decode() to load and cache their weights. // diff --git a/src/llama-context.cpp b/src/llama-context.cpp index d2a624392ada..4ec0292704e6 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1,6 +1,7 @@ #include "llama-context.h" #include "ggml.h" +#include "ggml-cuda.h" #include "llama-arch.h" #include "llama-graph.h" #include "llama-impl.h" @@ -1160,6 +1161,80 @@ void llama_context::set_diffusion_self_cond_topk(const int32_t * ids, const floa diffusion_cond.sc_topk_probs.assign(probs, probs + n); } +static ggml_backend_cuda_diffusion_sample_topk_t get_cuda_diffusion_sample_topk_proc(ggml_backend_t backend) { + if (!backend) { + return nullptr; + } + + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + if (!dev) { + return nullptr; + } + + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + if (!reg) { + return nullptr; + } + + return (ggml_backend_cuda_diffusion_sample_topk_t) + ggml_backend_reg_get_proc_address(reg, "ggml_backend_cuda_diffusion_sample_topk"); +} + +bool llama_context::diffusion_sample_topk_supported() const { + for (ggml_backend_t backend : backend_ptrs) { + if (get_cuda_diffusion_sample_topk_proc(backend)) { + return true; + } + } + return false; +} + +void llama_context::set_diffusion_gpu_sampling(bool enabled) { + diffusion_gpu_sampling = enabled; +} + +bool llama_context::diffusion_sample_topk( + const llama_diffusion_sample_params * params, + llama_diffusion_sample_result * result) { + if (!params || !result || !gf_res_prev) { + return false; + } + + static_assert(sizeof(llama_token) == sizeof(int32_t), "llama_token must be int32_t for CUDA diffusion sampling"); + + ggml_tensor * t_logits = gf_res_prev->get_logits(); + if (!t_logits) { + return false; + } + + ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); + auto sample_proc = get_cuda_diffusion_sample_topk_proc(backend); + if (!sample_proc) { + return false; + } + + ggml_cuda_diffusion_sample_params cuda_params = { + /* .n_vocab = */ (int32_t) model.vocab.n_tokens(), + /* .n_tokens = */ params->n_tokens, + /* .top_k = */ params->top_k, + /* .self_cond_top_k = */ params->self_cond_top_k, + /* .temperature = */ params->temperature, + /* .seed = */ params->seed, + /* .step = */ params->step, + /* .top_k_tail_correction = */ params->top_k_tail_correction, + }; + + ggml_cuda_diffusion_sample_result cuda_result = { + /* .sampled = */ (int32_t *) result->sampled, + /* .argmax = */ (int32_t *) result->argmax, + /* .entropy = */ result->entropy, + /* .self_cond_ids = */ result->self_cond_ids, + /* .self_cond_probs = */ result->self_cond_probs, + }; + + return sample_proc(backend, t_logits, &cuda_params, &cuda_result); +} + void llama_context::set_causal_attn(bool value) { LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value); @@ -1659,7 +1734,11 @@ static void copy_tensor_async_candidates( } } -static bool needs_raw_logits(const llama_ubatch & ubatch, const std::map & samplers) { +static bool needs_raw_logits(const llama_ubatch & ubatch, const std::map & samplers, bool skip_raw_logits) { + if (skip_raw_logits) { + return false; + } + for (uint32_t i = 0; i < ubatch.n_tokens; i++) { if (!ubatch.output[i]) { continue; @@ -1886,7 +1965,7 @@ int llama_context::decode(const llama_batch & batch_inp) { } // extract logits - if (logits.data && t_logits && n_outputs > 0 && needs_raw_logits(ubatch, sampling.samplers)) { + if (logits.data && t_logits && n_outputs > 0 && needs_raw_logits(ubatch, sampling.samplers, diffusion_gpu_sampling)) { ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); GGML_ASSERT(backend_res != nullptr); GGML_ASSERT(logits.data != nullptr); @@ -3606,6 +3685,21 @@ void llama_set_diffusion_self_cond_topk(llama_context * ctx, const int32_t * ids ctx->set_diffusion_self_cond_topk(ids, probs, k, n_tokens); } +bool llama_diffusion_sample_topk_supported(llama_context * ctx) { + return ctx->diffusion_sample_topk_supported(); +} + +void llama_set_diffusion_gpu_sampling(llama_context * ctx, bool enabled) { + ctx->set_diffusion_gpu_sampling(enabled); +} + +bool llama_diffusion_sample_topk( + llama_context * ctx, + const llama_diffusion_sample_params * params, + llama_diffusion_sample_result * result) { + return ctx->diffusion_sample_topk(params, result); +} + void llama_set_warmup(llama_context * ctx, bool warmup) { ctx->set_warmup(warmup); } diff --git a/src/llama-context.h b/src/llama-context.h index 3ee5cf124b28..48928162acbd 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -128,6 +128,12 @@ struct llama_context { // diffusion: sparse (top-k) self-conditioning (top-k token ids + probs per position) void set_diffusion_self_cond_topk(const int32_t * ids, const float * probs, int64_t k, int64_t n_tokens); + bool diffusion_sample_topk_supported() const; + void set_diffusion_gpu_sampling(bool enabled); + bool diffusion_sample_topk( + const llama_diffusion_sample_params * params, + llama_diffusion_sample_result * result); + void set_adapters_lora(llama_adapter_lora ** adapters, size_t n_adapters, float * scales); bool adapters_lora_are_same(llama_adapter_lora ** adapters, size_t n_adapters, float * scales); @@ -288,6 +294,7 @@ struct llama_context { llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably llama_diffusion_cond diffusion_cond; // diffusion self-conditioning (set per-decode by the sampler) + bool diffusion_gpu_sampling = false; // skip dense logits D2H; sampled via CUDA backend proc llama_memory_ptr memory; From 0c69257e69dd0437ca7242da5eba898b91520fd7 Mon Sep 17 00:00:00 2001 From: Loveneet Nigam Date: Tue, 9 Jun 2026 14:12:45 +0530 Subject: [PATCH 03/13] diffusion-gemma: keep self-cond sampling on device --- .../diffusion-gemma/diffusion-gemma-cli.cpp | 25 ++++++---- .../diffusion-gemma-server.cpp | 23 ++++++--- ggml/include/ggml-cuda.h | 2 + ggml/src/ggml-cuda/diffusion-sampling.cu | 48 +++++++++++++++++-- include/llama.h | 2 + src/llama-context.cpp | 36 +++++++++++++- src/llama-graph.cpp | 37 ++++++++++++++ src/llama-graph.h | 8 ++++ 8 files changed, 160 insertions(+), 21 deletions(-) diff --git a/examples/diffusion-gemma/diffusion-gemma-cli.cpp b/examples/diffusion-gemma/diffusion-gemma-cli.cpp index 6beea37005b8..84fd64be6895 100644 --- a/examples/diffusion-gemma/diffusion-gemma-cli.cpp +++ b/examples/diffusion-gemma/diffusion-gemma-cli.cpp @@ -207,14 +207,16 @@ int main(int argc, char ** argv) { const bool use_gpu_sampling = gpu_sampling_requested && gpu_sampling_topk_ok && llama_diffusion_sample_topk_supported(ctx); + const bool use_device_self_cond = use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0; llama_set_diffusion_gpu_sampling(ctx, use_gpu_sampling); LOG_INF("diffusion-gemma: prefix=%d canvas=%d max_canvases=%d steps=%d entropy_bound=%.3f temp=[%.2f,%.2f] n_ctx=%d mm=%d\n", prefix_len, canvas_length, max_canvases, n_steps, entropy_bound, TEMP_MIN, TEMP_MAX, n_ctx, (int) use_mm); - LOG_INF("diffusion-gemma: gpu sampling: %s%s\n", + LOG_INF("diffusion-gemma: gpu sampling: %s%s%s\n", use_gpu_sampling ? "on" : "off", (!gpu_sampling_topk_ok ? " (top-k exceeds CUDA fast-path limit)" : - (!gpu_sampling_requested ? " (disabled by DG_GPU_SAMPLING=0)" : ""))); + (!gpu_sampling_requested ? " (disabled by DG_GPU_SAMPLING=0)" : "")), + use_device_self_cond ? " | device self-cond: on" : ""); if (topk_fixed > 0 || (topk_start > 0 && topk_end > 0)) { LOG_INF("diffusion-gemma: top-k sampling: fixed=%d anneal=[%d->%d] tail_correction=%d (vocab=%d)\n", topk_fixed, topk_start, topk_end, topk_tail, n_vocab); @@ -350,8 +352,10 @@ int main(int argc, char ** argv) { // sparse self-cond for the NEXT step: top-SC_K (id, prob) per position. Cleared each step; // unused slots stay (id 0, prob 0) so the graph gather contributes nothing for them. - std::fill(sc_probs.begin(), sc_probs.end(), 0.0f); - std::fill(sc_ids.begin(), sc_ids.end(), 0); + if (!use_device_self_cond) { + std::fill(sc_probs.begin(), sc_probs.end(), 0.0f); + std::fill(sc_ids.begin(), sc_ids.end(), 0); + } bool sampled_ok = true; const auto t_sample_start = std::chrono::steady_clock::now(); @@ -369,8 +373,8 @@ int main(int argc, char ** argv) { /* .sampled = */ sampled.data(), /* .argmax = */ argmax_canvas.data(), /* .entropy = */ entropy.data(), - /* .self_cond_ids = */ sc_ids.data(), - /* .self_cond_probs = */ sc_probs.data(), + /* .self_cond_ids = */ use_device_self_cond ? nullptr : sc_ids.data(), + /* .self_cond_probs = */ use_device_self_cond ? nullptr : sc_probs.data(), }; sampled_ok = llama_diffusion_sample_topk(ctx, &sample_params, &sample_result); if (!sampled_ok) { @@ -513,9 +517,6 @@ int main(int argc, char ** argv) { // prefix [0, n_past); the next step re-decodes the canvas fresh against that prefix. llama_memory_seq_rm(mem, 0, n_past, -1); - // self-conditioning for the NEXT denoising step: feed this step's top-SC_K (id, prob) - llama_set_diffusion_self_cond_topk(ctx, sc_ids.data(), sc_probs.data(), SC_K, canvas_length); - // 2c. entropy-bound accept: sort positions by entropy ascending, accept the prefix // where sum(entropy of all-but-last) <= entropy_bound (monotonic -> prefix selection) std::vector order(canvas_length); @@ -553,6 +554,12 @@ int main(int argc, char ** argv) { } prev_argmax = argmax_canvas; + // self-conditioning for the NEXT denoising step. With device self-cond this was already + // copied D2D into the reused graph input tensors by llama_diffusion_sample_topk(). + if (!use_device_self_cond) { + llama_set_diffusion_self_cond_topk(ctx, sc_ids.data(), sc_probs.data(), SC_K, canvas_length); + } + // 2e. renoise non-accepted positions with fresh random tokens -> next canvas for (int i = 0; i < canvas_length; ++i) { canvas[i] = accept_mask[i] ? accepted[i] : rand_tok(rng); diff --git a/examples/diffusion-gemma/diffusion-gemma-server.cpp b/examples/diffusion-gemma/diffusion-gemma-server.cpp index ebad596a5515..bc6ea6e7caf1 100644 --- a/examples/diffusion-gemma/diffusion-gemma-server.cpp +++ b/examples/diffusion-gemma/diffusion-gemma-server.cpp @@ -159,6 +159,7 @@ struct diffusion_server { int topk_end = 0; bool topk_tail = false; bool use_gpu_sampling = false; + bool use_device_self_cond = false; std::string model_id; std::string model_path; std::string build_info = "llama.cpp diffusion-gemma-server"; @@ -261,6 +262,7 @@ struct diffusion_server { if (k_step <= 0 || k_step >= n_vocab) k_step = 0; const bool use_gpu_sampling_step = use_gpu_sampling && (k_step <= 0 || k_step <= GPU_SAMPLING_MAX_TOP_K); + const bool use_device_self_cond_step = use_gpu_sampling_step && use_device_self_cond; llama_set_causal_attn(ctx, false); llama_set_diffusion_decoder_phase(ctx, true); @@ -282,8 +284,10 @@ struct diffusion_server { std::vector entropy(canvas_length); std::vector sampled(canvas_length); - std::fill(sc_probs.begin(), sc_probs.end(), 0.0f); - std::fill(sc_ids.begin(), sc_ids.end(), 0); + if (!use_device_self_cond_step) { + std::fill(sc_probs.begin(), sc_probs.end(), 0.0f); + std::fill(sc_ids.begin(), sc_ids.end(), 0); + } if (use_gpu_sampling_step) { llama_diffusion_sample_params sample_params = { @@ -299,8 +303,8 @@ struct diffusion_server { /* .sampled = */ sampled.data(), /* .argmax = */ argmax_canvas.data(), /* .entropy = */ entropy.data(), - /* .self_cond_ids = */ sc_ids.data(), - /* .self_cond_probs = */ sc_probs.data(), + /* .self_cond_ids = */ use_device_self_cond_step ? nullptr : sc_ids.data(), + /* .self_cond_probs = */ use_device_self_cond_step ? nullptr : sc_probs.data(), }; if (!llama_diffusion_sample_topk(ctx, &sample_params, &sample_result)) { llama_memory_seq_rm(mem, 0, n_past, -1); @@ -394,7 +398,6 @@ struct diffusion_server { } llama_memory_seq_rm(mem, 0, n_past, -1); - llama_set_diffusion_self_cond_topk(ctx, sc_ids.data(), sc_probs.data(), SC_K, canvas_length); std::vector order(canvas_length); std::iota(order.begin(), order.end(), 0); @@ -413,6 +416,10 @@ struct diffusion_server { if (stable && confident) break; prev_argmax = argmax_canvas; + if (!use_device_self_cond_step) { + llama_set_diffusion_self_cond_topk(ctx, sc_ids.data(), sc_probs.data(), SC_K, canvas_length); + } + for (int i = 0; i < canvas_length; ++i) canvas[i] = accept_mask[i] ? accepted[i] : rand_tok(rng); } @@ -666,6 +673,7 @@ int main(int argc, char ** argv) { const bool gpu_sampling_requested = env_int("DG_GPU_SAMPLING", 1) != 0; srv.use_gpu_sampling = gpu_sampling_requested && llama_diffusion_sample_topk_supported(srv.ctx); + srv.use_device_self_cond = srv.use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0; llama_set_diffusion_gpu_sampling(srv.ctx, srv.use_gpu_sampling); const bool default_topk_gpu_ok = topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K; @@ -674,10 +682,11 @@ int main(int argc, char ** argv) { SRV_INF("%s\n", llama_print_system_info()); SRV_INF("model loaded: '%s' | n_ctx = %d | canvas = %d | denoise steps = %d | 1 slot\n", srv.model_id.c_str(), srv.n_ctx, srv.canvas_length, srv.n_steps); - SRV_INF("gpu sampling: %s%s | top-k fixed=%d anneal=[%d->%d] tail_correction=%d\n", + SRV_INF("gpu sampling: %s%s%s | top-k fixed=%d anneal=[%d->%d] tail_correction=%d\n", srv.use_gpu_sampling ? "on" : "off", (!default_topk_gpu_ok ? " (default top-k will use CPU fallback until k <= CUDA limit)" : (!gpu_sampling_requested ? " (disabled by DG_GPU_SAMPLING=0)" : "")), + srv.use_device_self_cond ? " | device self-cond: on" : "", srv.topk_fixed, srv.topk_start, srv.topk_end, srv.topk_tail ? 1 : 0); // ------------------------------------------------------------------------------------------ @@ -748,6 +757,7 @@ int main(int argc, char ** argv) { {"stability_threshold", STABILITY_THRESHOLD}, {"self_cond_topk", SC_K}, {"gpu_sampling", srv.use_gpu_sampling}, + {"device_self_cond", srv.use_device_self_cond}, {"top_k", srv.topk_fixed}, {"top_k_start", srv.topk_start}, {"top_k_end", srv.topk_end}, @@ -854,6 +864,7 @@ int main(int argc, char ** argv) { {"n_denoise_steps", srv.n_steps}, {"self_cond_topk", SC_K}, {"gpu_sampling", srv.use_gpu_sampling}, + {"device_self_cond", srv.use_device_self_cond}, {"top_k", srv.topk_fixed}, {"top_k_start", srv.topk_start}, {"top_k_end", srv.topk_end}, diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h index 4f3691c2980f..f16842c3f835 100644 --- a/ggml/include/ggml-cuda.h +++ b/ggml/include/ggml-cuda.h @@ -60,6 +60,8 @@ struct ggml_cuda_diffusion_sample_result { float * entropy; int32_t * self_cond_ids; float * self_cond_probs; + struct ggml_tensor * self_cond_ids_tensor; + struct ggml_tensor * self_cond_probs_tensor; }; typedef bool (*ggml_backend_cuda_diffusion_sample_topk_t)( diff --git a/ggml/src/ggml-cuda/diffusion-sampling.cu b/ggml/src/ggml-cuda/diffusion-sampling.cu index 042e3af0520d..34c8efdac7a9 100644 --- a/ggml/src/ggml-cuda/diffusion-sampling.cu +++ b/ggml/src/ggml-cuda/diffusion-sampling.cu @@ -388,7 +388,7 @@ bool ggml_cuda_diffusion_sample_topk( if (!ggml_backend_is_cuda(backend) || logits->type != GGML_TYPE_F32 || !ggml_is_contiguous(logits)) { return false; } - if (!result->sampled || !result->argmax || !result->entropy || !result->self_cond_ids || !result->self_cond_probs) { + if (!result->sampled || !result->argmax || !result->entropy) { return false; } @@ -408,6 +408,36 @@ bool ggml_cuda_diffusion_sample_topk( return false; } + const bool have_self_cond_host = result->self_cond_ids != nullptr && result->self_cond_probs != nullptr; + const bool have_self_cond_tensor = result->self_cond_ids_tensor != nullptr && result->self_cond_probs_tensor != nullptr; + if (!have_self_cond_host && !have_self_cond_tensor) { + return false; + } + if ((result->self_cond_ids == nullptr) != (result->self_cond_probs == nullptr)) { + return false; + } + if ((result->self_cond_ids_tensor == nullptr) != (result->self_cond_probs_tensor == nullptr)) { + return false; + } + if (have_self_cond_tensor) { + if (result->self_cond_ids_tensor->type != GGML_TYPE_I32 || + result->self_cond_probs_tensor->type != GGML_TYPE_F32 || + !ggml_is_contiguous(result->self_cond_ids_tensor) || + !ggml_is_contiguous(result->self_cond_probs_tensor) || + result->self_cond_ids_tensor->data == nullptr || + result->self_cond_probs_tensor->data == nullptr || + result->self_cond_ids_tensor->buffer == nullptr || + result->self_cond_probs_tensor->buffer == nullptr || + ggml_backend_buffer_is_host(result->self_cond_ids_tensor->buffer) || + ggml_backend_buffer_is_host(result->self_cond_probs_tensor->buffer)) { + return false; + } + if (ggml_nbytes(result->self_cond_ids_tensor) != (size_t) n_tokens * sc_k * sizeof(int) || + ggml_nbytes(result->self_cond_probs_tensor) != (size_t) n_tokens * sc_k * sizeof(float)) { + return false; + } + } + const int heap_k = top_k == 0 ? sc_k : std::max(top_k, sc_k); if (heap_k <= 0 || heap_k > 1024 || heap_k > n_vocab) { return false; @@ -483,10 +513,18 @@ bool ggml_cuda_diffusion_sample_topk( cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaMemcpyAsync(result->entropy, entropy_alloc.get(), (size_t) n_tokens * sizeof(float), cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaMemcpyAsync(result->self_cond_ids, sc_ids_alloc.get(), (size_t) n_tokens * sc_k * sizeof(int), - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs, sc_probs_alloc.get(), (size_t) n_tokens * sc_k * sizeof(float), - cudaMemcpyDeviceToHost, stream)); + if (have_self_cond_host) { + CUDA_CHECK(cudaMemcpyAsync(result->self_cond_ids, sc_ids_alloc.get(), (size_t) n_tokens * sc_k * sizeof(int), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs, sc_probs_alloc.get(), (size_t) n_tokens * sc_k * sizeof(float), + cudaMemcpyDeviceToHost, stream)); + } + if (have_self_cond_tensor) { + CUDA_CHECK(cudaMemcpyAsync(result->self_cond_ids_tensor->data, sc_ids_alloc.get(), (size_t) n_tokens * sc_k * sizeof(int), + cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs_tensor->data, sc_probs_alloc.get(), (size_t) n_tokens * sc_k * sizeof(float), + cudaMemcpyDeviceToDevice, stream)); + } CUDA_CHECK(cudaStreamSynchronize(stream)); return true; diff --git a/include/llama.h b/include/llama.h index eb2d9cb338fb..e6062b4bbfe9 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1029,6 +1029,8 @@ extern "C" { llama_token * sampled; llama_token * argmax; float * entropy; + // Optional host outputs for self-conditioning. If both are null, the CUDA fast path writes + // sparse self-conditioning directly into the reused decoder graph input tensors. int32_t * self_cond_ids; float * self_cond_probs; }; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 4ec0292704e6..2a0a5568a1c6 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1149,6 +1149,12 @@ void llama_context::set_diffusion_decoder_phase(bool decoder_phase) { } void llama_context::set_diffusion_self_cond_topk(const int32_t * ids, const float * probs, int64_t k, int64_t n_tokens) { + diffusion_cond.sc_topk_device_ready = false; + diffusion_cond.sc_topk_device_ids_data = nullptr; + diffusion_cond.sc_topk_device_probs_data = nullptr; + diffusion_cond.sc_topk_device_ids_bytes = 0; + diffusion_cond.sc_topk_device_probs_bytes = 0; + if (ids == nullptr || probs == nullptr || k <= 0 || n_tokens <= 0) { diffusion_cond.sc_topk = 0; diffusion_cond.sc_topk_ids.clear(); @@ -1213,6 +1219,20 @@ bool llama_context::diffusion_sample_topk( return false; } + ggml_tensor * t_self_cond_ids = nullptr; + ggml_tensor * t_self_cond_probs = nullptr; + const bool device_self_cond = result->self_cond_ids == nullptr && result->self_cond_probs == nullptr; + if (device_self_cond) { + auto * inp = gf_res_prev->get_inp_diffusion_self_cond_topk(); + if (!inp || !inp->ids || !inp->probs) { + return false; + } + t_self_cond_ids = inp->ids; + t_self_cond_probs = inp->probs; + } else if (result->self_cond_ids == nullptr || result->self_cond_probs == nullptr) { + return false; + } + ggml_cuda_diffusion_sample_params cuda_params = { /* .n_vocab = */ (int32_t) model.vocab.n_tokens(), /* .n_tokens = */ params->n_tokens, @@ -1230,9 +1250,23 @@ bool llama_context::diffusion_sample_topk( /* .entropy = */ result->entropy, /* .self_cond_ids = */ result->self_cond_ids, /* .self_cond_probs = */ result->self_cond_probs, + /* .self_cond_ids_tensor = */ t_self_cond_ids, + /* .self_cond_probs_tensor = */ t_self_cond_probs, }; - return sample_proc(backend, t_logits, &cuda_params, &cuda_result); + const bool ok = sample_proc(backend, t_logits, &cuda_params, &cuda_result); + if (ok && device_self_cond) { + diffusion_cond.sc_topk = params->self_cond_top_k; + diffusion_cond.sc_topk_ids.clear(); + diffusion_cond.sc_topk_probs.clear(); + diffusion_cond.sc_topk_device_ready = true; + diffusion_cond.sc_topk_device_ids_data = t_self_cond_ids->data; + diffusion_cond.sc_topk_device_probs_data = t_self_cond_probs->data; + diffusion_cond.sc_topk_device_ids_bytes = ggml_nbytes(t_self_cond_ids); + diffusion_cond.sc_topk_device_probs_bytes = ggml_nbytes(t_self_cond_probs); + } + + return ok; } void llama_context::set_causal_attn(bool value) { diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 83615641fdb5..b0a7a31f8131 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -415,6 +415,18 @@ void llm_graph_input_diffusion_self_cond_topk::set_input(const llama_ubatch * ub const size_t n_id_bytes = ggml_nbytes(ids); const size_t n_pr_bytes = ggml_nbytes(probs); + const bool have_device = diffusion + && diffusion->sc_topk > 0 + && diffusion->sc_topk_device_ready + && diffusion->sc_topk_device_ids_data == ids->data + && diffusion->sc_topk_device_probs_data == probs->data + && diffusion->sc_topk_device_ids_bytes == n_id_bytes + && diffusion->sc_topk_device_probs_bytes == n_pr_bytes; + + if (have_device) { + return; + } + const bool have = diffusion && diffusion->sc_topk > 0 && diffusion->sc_topk_ids.size() * sizeof(int32_t) == n_id_bytes @@ -1020,6 +1032,16 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) { } } +llm_graph_input_diffusion_self_cond_topk * llm_graph_result::get_inp_diffusion_self_cond_topk() const { + for (auto & input : inputs) { + if (auto * topk = dynamic_cast(input.get())) { + return topk; + } + } + + return nullptr; +} + void llm_graph_result::set_outputs() { if (t_logits != nullptr) { ggml_set_output(t_logits); @@ -1033,6 +1055,9 @@ void llm_graph_result::set_outputs() { if (t_h_nextn != nullptr) { ggml_set_output(t_h_nextn); } + if (t_h_pre_norm != nullptr) { + ggml_set_output(t_h_pre_norm); + } for (auto & [seq_id, t] : t_sampled) { if (t != nullptr) { ggml_set_output(t); @@ -2105,6 +2130,18 @@ llm_graph_input_diffusion_self_cond_topk * llm_graph_context::build_inp_diffusio inp->probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, k, n_tokens); ggml_set_input(inp->probs); + if (sched) { + for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); ++i) { + ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i); + if (backend && backend != backend_cpu && + ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_GPU) { + ggml_backend_sched_set_tensor_backend(sched, inp->ids, backend); + ggml_backend_sched_set_tensor_backend(sched, inp->probs, backend); + break; + } + } + } + auto * ptr = inp.get(); res->add_input(std::move(inp)); diff --git a/src/llama-graph.h b/src/llama-graph.h index 609dd282a9e9..3d14c47968f4 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -93,6 +93,11 @@ struct llama_diffusion_cond { int64_t sc_topk = 0; std::vector sc_topk_ids; std::vector sc_topk_probs; + bool sc_topk_device_ready = false; + void * sc_topk_device_ids_data = nullptr; + void * sc_topk_device_probs_data = nullptr; + size_t sc_topk_device_ids_bytes = 0; + size_t sc_topk_device_probs_bytes = 0; // KV-cache reuse phase selector (block-diffusion): // false (encoder phase) = plain token embeddings, no self-conditioning; KV is @@ -784,6 +789,8 @@ class llm_graph_result { ggml_tensor * get_embd() const { return t_embd; } ggml_tensor * get_embd_pooled() const { return t_embd_pooled; } ggml_tensor * get_h_nextn() const { return t_h_nextn; } + ggml_tensor * get_h_pre_norm() const { return t_h_pre_norm; } + llm_graph_input_diffusion_self_cond_topk * get_inp_diffusion_self_cond_topk() const; ggml_cgraph * get_gf() const { return gf; } ggml_context * get_ctx() const { return ctx_compute.get(); } @@ -813,6 +820,7 @@ class llm_graph_result { ggml_tensor * t_embd = nullptr; ggml_tensor * t_embd_pooled = nullptr; ggml_tensor * t_h_nextn = nullptr; // [n_embd, n_outputs] hidden state before final output norm + ggml_tensor * t_h_pre_norm = nullptr; // [n_embd, n_outputs] hidden state before final output norm std::map t_sampled_logits; std::map t_candidates; From a1b087404d01773402e202ecb960418533934a70 Mon Sep 17 00:00:00 2001 From: Loveneet Nigam Date: Tue, 9 Jun 2026 15:18:06 +0530 Subject: [PATCH 04/13] diffusion-gemma: keep denoising loop on device --- .../diffusion-gemma/diffusion-gemma-cli.cpp | 40 +++- .../diffusion-gemma-server.cpp | 42 +++- ggml/include/ggml-cuda.h | 4 + ggml/include/ggml.h | 1 + ggml/src/ggml-alloc.c | 13 +- ggml/src/ggml-cuda/diffusion-sampling.cu | 212 +++++++++++++++--- include/llama.h | 7 + src/llama-context.cpp | 53 ++++- src/llama-graph.cpp | 139 +++++++++--- src/llama-graph.h | 9 +- src/llama-kv-cache.cpp | 66 ++++-- src/models/diffusion-gemma.cpp | 15 +- 12 files changed, 509 insertions(+), 92 deletions(-) diff --git a/examples/diffusion-gemma/diffusion-gemma-cli.cpp b/examples/diffusion-gemma/diffusion-gemma-cli.cpp index 84fd64be6895..853d5f36a649 100644 --- a/examples/diffusion-gemma/diffusion-gemma-cli.cpp +++ b/examples/diffusion-gemma/diffusion-gemma-cli.cpp @@ -208,15 +208,17 @@ int main(int argc, char ** argv) { gpu_sampling_topk_ok && llama_diffusion_sample_topk_supported(ctx); const bool use_device_self_cond = use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0; + const bool use_device_loop = use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0; llama_set_diffusion_gpu_sampling(ctx, use_gpu_sampling); LOG_INF("diffusion-gemma: prefix=%d canvas=%d max_canvases=%d steps=%d entropy_bound=%.3f temp=[%.2f,%.2f] n_ctx=%d mm=%d\n", prefix_len, canvas_length, max_canvases, n_steps, entropy_bound, TEMP_MIN, TEMP_MAX, n_ctx, (int) use_mm); - LOG_INF("diffusion-gemma: gpu sampling: %s%s%s\n", + LOG_INF("diffusion-gemma: gpu sampling: %s%s%s%s\n", use_gpu_sampling ? "on" : "off", (!gpu_sampling_topk_ok ? " (top-k exceeds CUDA fast-path limit)" : (!gpu_sampling_requested ? " (disabled by DG_GPU_SAMPLING=0)" : "")), - use_device_self_cond ? " | device self-cond: on" : ""); + use_device_self_cond ? " | device self-cond: on" : "", + use_device_loop ? " | device loop: on" : ""); if (topk_fixed > 0 || (topk_start > 0 && topk_end > 0)) { LOG_INF("diffusion-gemma: top-k sampling: fixed=%d anneal=[%d->%d] tail_correction=%d (vocab=%d)\n", topk_fixed, topk_start, topk_end, topk_tail, n_vocab); @@ -350,6 +352,40 @@ int main(int argc, char ** argv) { if (k_step <= 0 || k_step >= n_vocab) k_step = 0; step_k = k_step; + if (use_device_loop) { + bool sampled_ok = true; + const auto t_sample_start = std::chrono::steady_clock::now(); + llama_diffusion_sample_params sample_params = { + /* .n_tokens = */ canvas_length, + /* .top_k = */ k_step, + /* .self_cond_top_k = */ SC_K, + /* .temperature = */ temp, + /* .seed = */ params.sampling.seed == LLAMA_DEFAULT_SEED ? 1234u : params.sampling.seed, + /* .step = */ (uint32_t) n_steps_total, + /* .top_k_tail_correction = */ topk_tail != 0, + }; + llama_diffusion_sample_result sample_result = { + /* .sampled = */ nullptr, + /* .argmax = */ nullptr, + /* .entropy = */ nullptr, + /* .self_cond_ids = */ nullptr, + /* .self_cond_probs = */ nullptr, + /* .final_tokens = */ cur_step == 1 ? argmax_canvas.data() : nullptr, + /* .update_canvas_on_device = */ cur_step > 1, + /* .entropy_bound = */ entropy_bound, + }; + sampled_ok = llama_diffusion_sample_topk(ctx, &sample_params, &sample_result); + sample_sync_s += std::chrono::duration(std::chrono::steady_clock::now() - t_sample_start).count(); + llama_memory_seq_rm(mem, 0, n_past, -1); + if (!sampled_ok) { + LOG_ERR("error: CUDA diffusion device loop failed at step %d (k=%d)\n", cur_step, k_step); + failed = true; + done = true; + break; + } + continue; + } + // sparse self-cond for the NEXT step: top-SC_K (id, prob) per position. Cleared each step; // unused slots stay (id 0, prob 0) so the graph gather contributes nothing for them. if (!use_device_self_cond) { diff --git a/examples/diffusion-gemma/diffusion-gemma-server.cpp b/examples/diffusion-gemma/diffusion-gemma-server.cpp index bc6ea6e7caf1..3985e2740d5a 100644 --- a/examples/diffusion-gemma/diffusion-gemma-server.cpp +++ b/examples/diffusion-gemma/diffusion-gemma-server.cpp @@ -160,6 +160,7 @@ struct diffusion_server { bool topk_tail = false; bool use_gpu_sampling = false; bool use_device_self_cond = false; + bool use_device_loop = false; std::string model_id; std::string model_path; std::string build_info = "llama.cpp diffusion-gemma-server"; @@ -240,6 +241,10 @@ struct diffusion_server { int n_blocks_run = 0, n_steps_total = 0; bool done = false; + const int topk_max_requested = + (rq.topk_start > 0 && rq.topk_end > 0) ? std::max(rq.topk_start, rq.topk_end) : rq.topk_fixed; + const bool use_device_loop_request = use_device_loop && + (topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K); const auto t_gen0 = std::chrono::steady_clock::now(); for (int block = 0; block < max_canvases && !done; ++block) { @@ -263,6 +268,7 @@ struct diffusion_server { const bool use_gpu_sampling_step = use_gpu_sampling && (k_step <= 0 || k_step <= GPU_SAMPLING_MAX_TOP_K); const bool use_device_self_cond_step = use_gpu_sampling_step && use_device_self_cond; + const bool use_device_loop_step = use_device_loop_request && use_gpu_sampling_step; llama_set_causal_attn(ctx, false); llama_set_diffusion_decoder_phase(ctx, true); @@ -284,6 +290,36 @@ struct diffusion_server { std::vector entropy(canvas_length); std::vector sampled(canvas_length); + if (use_device_loop_step) { + llama_diffusion_sample_params sample_params = { + /* .n_tokens = */ canvas_length, + /* .top_k = */ k_step, + /* .self_cond_top_k = */ SC_K, + /* .temperature = */ temp, + /* .seed = */ rq.seed, + /* .step = */ (uint32_t) n_steps_total, + /* .top_k_tail_correction = */ rq.topk_tail, + }; + llama_diffusion_sample_result sample_result = { + /* .sampled = */ nullptr, + /* .argmax = */ nullptr, + /* .entropy = */ nullptr, + /* .self_cond_ids = */ nullptr, + /* .self_cond_probs = */ nullptr, + /* .final_tokens = */ cur_step == 1 ? argmax_canvas.data() : nullptr, + /* .update_canvas_on_device = */ cur_step > 1, + /* .entropy_bound = */ ENTROPY_BOUND, + }; + if (!llama_diffusion_sample_topk(ctx, &sample_params, &sample_result)) { + llama_memory_seq_rm(mem, 0, n_past, -1); + out.error = "CUDA diffusion device loop failed at denoising step " + + std::to_string(cur_step) + " (k=" + std::to_string(k_step) + ")"; + return out; + } + llama_memory_seq_rm(mem, 0, n_past, -1); + continue; + } + if (!use_device_self_cond_step) { std::fill(sc_probs.begin(), sc_probs.end(), 0.0f); std::fill(sc_ids.begin(), sc_ids.end(), 0); @@ -674,6 +710,7 @@ int main(int argc, char ** argv) { srv.use_gpu_sampling = gpu_sampling_requested && llama_diffusion_sample_topk_supported(srv.ctx); srv.use_device_self_cond = srv.use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0; + srv.use_device_loop = srv.use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0; llama_set_diffusion_gpu_sampling(srv.ctx, srv.use_gpu_sampling); const bool default_topk_gpu_ok = topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K; @@ -682,11 +719,12 @@ int main(int argc, char ** argv) { SRV_INF("%s\n", llama_print_system_info()); SRV_INF("model loaded: '%s' | n_ctx = %d | canvas = %d | denoise steps = %d | 1 slot\n", srv.model_id.c_str(), srv.n_ctx, srv.canvas_length, srv.n_steps); - SRV_INF("gpu sampling: %s%s%s | top-k fixed=%d anneal=[%d->%d] tail_correction=%d\n", + SRV_INF("gpu sampling: %s%s%s%s | top-k fixed=%d anneal=[%d->%d] tail_correction=%d\n", srv.use_gpu_sampling ? "on" : "off", (!default_topk_gpu_ok ? " (default top-k will use CPU fallback until k <= CUDA limit)" : (!gpu_sampling_requested ? " (disabled by DG_GPU_SAMPLING=0)" : "")), srv.use_device_self_cond ? " | device self-cond: on" : "", + srv.use_device_loop ? " | device loop: on" : "", srv.topk_fixed, srv.topk_start, srv.topk_end, srv.topk_tail ? 1 : 0); // ------------------------------------------------------------------------------------------ @@ -758,6 +796,7 @@ int main(int argc, char ** argv) { {"self_cond_topk", SC_K}, {"gpu_sampling", srv.use_gpu_sampling}, {"device_self_cond", srv.use_device_self_cond}, + {"device_loop", srv.use_device_loop}, {"top_k", srv.topk_fixed}, {"top_k_start", srv.topk_start}, {"top_k_end", srv.topk_end}, @@ -865,6 +904,7 @@ int main(int argc, char ** argv) { {"self_cond_topk", SC_K}, {"gpu_sampling", srv.use_gpu_sampling}, {"device_self_cond", srv.use_device_self_cond}, + {"device_loop", srv.use_device_loop}, {"top_k", srv.topk_fixed}, {"top_k_start", srv.topk_start}, {"top_k_end", srv.topk_end}, diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h index f16842c3f835..d966e5a3d0d7 100644 --- a/ggml/include/ggml-cuda.h +++ b/ggml/include/ggml-cuda.h @@ -62,6 +62,10 @@ struct ggml_cuda_diffusion_sample_result { float * self_cond_probs; struct ggml_tensor * self_cond_ids_tensor; struct ggml_tensor * self_cond_probs_tensor; + struct ggml_tensor * canvas_tokens_tensor; + int32_t * final_tokens; + float entropy_bound; + bool update_canvas_on_device; }; typedef bool (*ggml_backend_cuda_diffusion_sample_topk_t)( diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 374934aacf32..ff3bbce1b531 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -647,6 +647,7 @@ extern "C" { GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up) GGML_TENSOR_FLAG_COMPUTE = 16, // ...must be computed + GGML_TENSOR_FLAG_PERSIST = 32, // ...must not be overwritten by graph allocator reuse }; enum ggml_tri_type { diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index 3bda9abbe03d..5eec18a059c9 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -641,9 +641,10 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor continue; } - // outputs cannot be reused - if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) { - AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name); + // outputs and persistent inputs cannot be reused + if ((parent->flags & (GGML_TENSOR_FLAG_OUTPUT | GGML_TENSOR_FLAG_PERSIST)) || + (parent->view_src != NULL && (parent->view_src->flags & (GGML_TENSOR_FLAG_OUTPUT | GGML_TENSOR_FLAG_PERSIST)))) { + AT_PRINTF("not reusing parent %s for %s as it is persistent\n", parent->name, node->name); continue; } @@ -688,9 +689,9 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor } static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) { - // graph outputs are never freed - if (node->flags & GGML_TENSOR_FLAG_OUTPUT) { - AT_PRINTF("not freeing output %s\n", node->name); + // graph outputs and persistent inputs are never freed + if (node->flags & (GGML_TENSOR_FLAG_OUTPUT | GGML_TENSOR_FLAG_PERSIST)) { + AT_PRINTF("not freeing persistent %s\n", node->name); return; } diff --git a/ggml/src/ggml-cuda/diffusion-sampling.cu b/ggml/src/ggml-cuda/diffusion-sampling.cu index 34c8efdac7a9..f0b6f53e3146 100644 --- a/ggml/src/ggml-cuda/diffusion-sampling.cu +++ b/ggml/src/ggml-cuda/diffusion-sampling.cu @@ -6,6 +6,8 @@ #include #include #include +#include +#include #ifdef GGML_CUDA_USE_CUB # include @@ -29,6 +31,62 @@ static bool diffusion_fast_topk_enabled() { return !v || std::strcmp(v, "0") != 0; } +struct diffusion_sample_scratch { + int * top_ids = nullptr; + int * sampled = nullptr; + int * argmax = nullptr; + float * entropy = nullptr; + int * sc_ids = nullptr; + float * sc_probs = nullptr; + + size_t top_ids_cap = 0; + size_t sampled_cap = 0; + size_t argmax_cap = 0; + size_t entropy_cap = 0; + size_t sc_ids_cap = 0; + size_t sc_probs_cap = 0; +}; + +static std::mutex g_diffusion_scratch_mutex; +static std::map g_diffusion_scratch; + +template +static void diffusion_scratch_reserve(cudaStream_t stream, T ** ptr, size_t * cap, const size_t need, bool * synced) { + if (*cap >= need) { + return; + } + if (*ptr) { + if (!*synced) { + CUDA_CHECK(cudaStreamSynchronize(stream)); + *synced = true; + } + CUDA_CHECK(cudaFree(*ptr)); + *ptr = nullptr; + *cap = 0; + } + CUDA_CHECK(cudaMalloc((void **) ptr, need * sizeof(T))); + *cap = need; +} + +static diffusion_sample_scratch * diffusion_get_scratch( + cudaStream_t stream, + const int n_tokens, + const int heap_k, + const int sc_k) { + std::lock_guard lock(g_diffusion_scratch_mutex); + diffusion_sample_scratch & scratch = g_diffusion_scratch[stream]; + bool synced = false; + + diffusion_scratch_reserve(stream, &scratch.top_ids, &scratch.top_ids_cap, (size_t) n_tokens * heap_k, &synced); + diffusion_scratch_reserve(stream, &scratch.sampled, &scratch.sampled_cap, (size_t) n_tokens, &synced); + diffusion_scratch_reserve(stream, &scratch.argmax, &scratch.argmax_cap, (size_t) n_tokens, &synced); + diffusion_scratch_reserve(stream, &scratch.entropy, &scratch.entropy_cap, (size_t) n_tokens, &synced); + diffusion_scratch_reserve(stream, &scratch.sc_ids, &scratch.sc_ids_cap, (size_t) n_tokens * sc_k, &synced); + diffusion_scratch_reserve(stream, &scratch.sc_probs, &scratch.sc_probs_cap, (size_t) n_tokens * sc_k, &synced); + + return &scratch; +} + static __device__ __forceinline__ bool diffusion_should_swap_desc( const float a_val, const int a_id, const float b_val, const int b_id) { @@ -223,6 +281,82 @@ static __device__ __forceinline__ float diffusion_rng_uniform(uint32_t seed, uin return ((x >> 8) + 0.5f) * (1.0f / 16777216.0f); } +static __device__ __forceinline__ bool diffusion_pair_gt(float a_val, int a_id, float b_val, int b_id) { + return a_val > b_val || (a_val == b_val && a_id > b_id); +} + +static __global__ void diffusion_update_canvas_kernel( + const float * __restrict__ entropy, + const int * __restrict__ sampled, + int * __restrict__ canvas_tokens, + const int n_tokens, + const int n_vocab, + const float entropy_bound, + const uint32_t seed, + const uint32_t step) { + const int tid = threadIdx.x; + + __shared__ float s_entropy[1024]; + __shared__ int s_index[1024]; + __shared__ unsigned char s_accept[1024]; + + if (tid < n_tokens) { + s_entropy[tid] = entropy[tid]; + s_index[tid] = tid; + s_accept[tid] = 0; + } else { + s_entropy[tid] = FLT_MAX; + s_index[tid] = tid; + } + __syncthreads(); + + for (int k = 2; k <= blockDim.x; k <<= 1) { + for (int j = k >> 1; j > 0; j >>= 1) { + const int ixj = tid ^ j; + if (ixj > tid) { + const bool ascending = (tid & k) == 0; + const float a_val = s_entropy[tid]; + const int a_idx = s_index[tid]; + const float b_val = s_entropy[ixj]; + const int b_idx = s_index[ixj]; + const bool swap = ascending ? + diffusion_pair_gt(a_val, a_idx, b_val, b_idx) : + diffusion_pair_gt(b_val, b_idx, a_val, a_idx); + if (swap) { + s_entropy[tid] = b_val; + s_index[tid] = b_idx; + s_entropy[ixj] = a_val; + s_index[ixj] = a_idx; + } + } + __syncthreads(); + } + } + + if (tid == 0) { + float prefix = 0.0f; + for (int i = 0; i < n_tokens; ++i) { + const int pos = s_index[i]; + if (prefix <= entropy_bound) { + s_accept[pos] = 1; + prefix += s_entropy[i]; + } else { + break; + } + } + } + __syncthreads(); + + if (tid < n_tokens) { + if (s_accept[tid]) { + canvas_tokens[tid] = sampled[tid]; + } else { + const uint32_t r = diffusion_rng_u32(seed ^ ((step + 1u) * 0x9e3779b9u) ^ ((uint32_t) tid * 0x7f4a7c15u) ^ 0xa5a5a5a5u); + canvas_tokens[tid] = (int) (r % (uint32_t) n_vocab); + } + } +} + static __global__ void diffusion_sample_kernel( const float * __restrict__ logits, const int * __restrict__ top_ids, @@ -388,10 +522,6 @@ bool ggml_cuda_diffusion_sample_topk( if (!ggml_backend_is_cuda(backend) || logits->type != GGML_TYPE_F32 || !ggml_is_contiguous(logits)) { return false; } - if (!result->sampled || !result->argmax || !result->entropy) { - return false; - } - const int n_vocab = params->n_vocab > 0 ? params->n_vocab : (int) logits->ne[0]; const int n_tokens = params->n_tokens > 0 ? params->n_tokens : (int) ggml_nrows(logits); if (n_vocab <= 0 || n_tokens <= 0 || logits->ne[0] != n_vocab || ggml_nrows(logits) < n_tokens) { @@ -408,6 +538,19 @@ bool ggml_cuda_diffusion_sample_topk( return false; } + if (result->update_canvas_on_device) { + if (n_tokens > 1024 || + result->canvas_tokens_tensor == nullptr || + result->canvas_tokens_tensor->type != GGML_TYPE_I32 || + !ggml_is_contiguous(result->canvas_tokens_tensor) || + result->canvas_tokens_tensor->data == nullptr || + result->canvas_tokens_tensor->buffer == nullptr || + ggml_backend_buffer_is_host(result->canvas_tokens_tensor->buffer) || + ggml_nbytes(result->canvas_tokens_tensor) != (size_t) n_tokens * sizeof(int)) { + return false; + } + } + const bool have_self_cond_host = result->self_cond_ids != nullptr && result->self_cond_probs != nullptr; const bool have_self_cond_tensor = result->self_cond_ids_tensor != nullptr && result->self_cond_probs_tensor != nullptr; if (!have_self_cond_host && !have_self_cond_tensor) { @@ -453,11 +596,12 @@ bool ggml_cuda_diffusion_sample_topk( const float * logits_d = (const float *) logits->data; - ggml_cuda_pool_alloc top_ids_alloc(pool, (size_t) n_tokens * heap_k); - int * top_ids = top_ids_alloc.get(); + diffusion_sample_scratch * scratch = diffusion_get_scratch(stream, n_tokens, heap_k, sc_k); + int * top_ids = scratch->top_ids; bool top_ids_sorted = false; const bool use_fast_topk = diffusion_fast_topk_enabled() && heap_k <= 1024 && n_vocab >= heap_k; + bool sync_required = !use_fast_topk; if (use_fast_topk) { diffusion_select_topk_local(logits_d, top_ids, n_vocab, n_tokens, heap_k, stream); top_ids_sorted = true; @@ -494,38 +638,54 @@ bool ggml_cuda_diffusion_sample_topk( logits_d, top_ids, n_vocab, heap_k, heap_k_pad, inv_temp); } - ggml_cuda_pool_alloc sampled_alloc(pool, n_tokens); - ggml_cuda_pool_alloc argmax_alloc(pool, n_tokens); - ggml_cuda_pool_alloc entropy_alloc(pool, n_tokens); - ggml_cuda_pool_alloc sc_ids_alloc(pool, (size_t) n_tokens * sc_k); - ggml_cuda_pool_alloc sc_probs_alloc(pool, (size_t) n_tokens * sc_k); - constexpr int block_size = 256; diffusion_sample_kernel<<>>( logits_d, top_ids, n_vocab, n_tokens, top_k, heap_k, sc_k, inv_temp, params->seed, params->step, params->top_k_tail_correction, - sampled_alloc.get(), argmax_alloc.get(), entropy_alloc.get(), - sc_ids_alloc.get(), sc_probs_alloc.get()); - - CUDA_CHECK(cudaMemcpyAsync(result->sampled, sampled_alloc.get(), (size_t) n_tokens * sizeof(int), - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaMemcpyAsync(result->argmax, argmax_alloc.get(), (size_t) n_tokens * sizeof(int), - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaMemcpyAsync(result->entropy, entropy_alloc.get(), (size_t) n_tokens * sizeof(float), - cudaMemcpyDeviceToHost, stream)); + scratch->sampled, scratch->argmax, scratch->entropy, + scratch->sc_ids, scratch->sc_probs); + + if (result->update_canvas_on_device) { + const int update_threads = next_power_of_2_host(n_tokens); + diffusion_update_canvas_kernel<<<1, update_threads, 0, stream>>>( + scratch->entropy, scratch->sampled, (int *) result->canvas_tokens_tensor->data, + n_tokens, n_vocab, result->entropy_bound, params->seed, params->step); + } + + if (result->sampled) { + CUDA_CHECK(cudaMemcpyAsync(result->sampled, scratch->sampled, (size_t) n_tokens * sizeof(int), + cudaMemcpyDeviceToHost, stream)); + } + if (result->argmax) { + CUDA_CHECK(cudaMemcpyAsync(result->argmax, scratch->argmax, (size_t) n_tokens * sizeof(int), + cudaMemcpyDeviceToHost, stream)); + } + if (result->entropy) { + CUDA_CHECK(cudaMemcpyAsync(result->entropy, scratch->entropy, (size_t) n_tokens * sizeof(float), + cudaMemcpyDeviceToHost, stream)); + } + if (result->final_tokens) { + CUDA_CHECK(cudaMemcpyAsync(result->final_tokens, scratch->argmax, (size_t) n_tokens * sizeof(int), + cudaMemcpyDeviceToHost, stream)); + } if (have_self_cond_host) { - CUDA_CHECK(cudaMemcpyAsync(result->self_cond_ids, sc_ids_alloc.get(), (size_t) n_tokens * sc_k * sizeof(int), + CUDA_CHECK(cudaMemcpyAsync(result->self_cond_ids, scratch->sc_ids, (size_t) n_tokens * sc_k * sizeof(int), cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs, sc_probs_alloc.get(), (size_t) n_tokens * sc_k * sizeof(float), + CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs, scratch->sc_probs, (size_t) n_tokens * sc_k * sizeof(float), cudaMemcpyDeviceToHost, stream)); } if (have_self_cond_tensor) { - CUDA_CHECK(cudaMemcpyAsync(result->self_cond_ids_tensor->data, sc_ids_alloc.get(), (size_t) n_tokens * sc_k * sizeof(int), + CUDA_CHECK(cudaMemcpyAsync(result->self_cond_ids_tensor->data, scratch->sc_ids, (size_t) n_tokens * sc_k * sizeof(int), cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs_tensor->data, sc_probs_alloc.get(), (size_t) n_tokens * sc_k * sizeof(float), + CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs_tensor->data, scratch->sc_probs, (size_t) n_tokens * sc_k * sizeof(float), cudaMemcpyDeviceToDevice, stream)); } - CUDA_CHECK(cudaStreamSynchronize(stream)); + const bool host_outputs_requested = result->sampled || result->argmax || result->entropy || + result->final_tokens || have_self_cond_host; + CUDA_CHECK(cudaGetLastError()); + if (sync_required || host_outputs_requested) { + CUDA_CHECK(cudaStreamSynchronize(stream)); + } return true; } diff --git a/include/llama.h b/include/llama.h index e6062b4bbfe9..0e39b71a388e 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1033,6 +1033,13 @@ extern "C" { // sparse self-conditioning directly into the reused decoder graph input tensors. int32_t * self_cond_ids; float * self_cond_probs; + // Optional device-resident denoise update. When update_canvas_on_device is true, the CUDA + // fast path performs entropy accept/renoise and writes the next canvas into the reused + // decoder token input tensor. final_tokens is copied only when requested, typically on + // the final denoising step. + llama_token * final_tokens; + bool update_canvas_on_device; + float entropy_bound; }; // CUDA-only fast path for block-diffusion sampling. When enabled, llama_decode() diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 2a0a5568a1c6..fd6c8bc99a36 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1154,6 +1154,9 @@ void llama_context::set_diffusion_self_cond_topk(const int32_t * ids, const floa diffusion_cond.sc_topk_device_probs_data = nullptr; diffusion_cond.sc_topk_device_ids_bytes = 0; diffusion_cond.sc_topk_device_probs_bytes = 0; + diffusion_cond.canvas_tokens_device_ready = false; + diffusion_cond.canvas_tokens_device_data = nullptr; + diffusion_cond.canvas_tokens_device_bytes = 0; if (ids == nullptr || probs == nullptr || k <= 0 || n_tokens <= 0) { diffusion_cond.sc_topk = 0; @@ -1186,6 +1189,35 @@ static ggml_backend_cuda_diffusion_sample_topk_t get_cuda_diffusion_sample_topk_ ggml_backend_reg_get_proc_address(reg, "ggml_backend_cuda_diffusion_sample_topk"); } +static bool diffusion_decoder_inputs_device_ready( + const llama_diffusion_cond & diffusion_cond, + const llm_graph_result * res) { + if (!diffusion_cond.decoder_phase || + !diffusion_cond.canvas_tokens_device_ready || + !diffusion_cond.sc_topk_device_ready || + !res) { + return false; + } + + ggml_tensor * t_tokens = res->get_inp_tokens(); + if (!t_tokens || + diffusion_cond.canvas_tokens_device_data != t_tokens->data || + diffusion_cond.canvas_tokens_device_bytes != ggml_nbytes(t_tokens)) { + return false; + } + + auto * inp_sc = res->get_inp_diffusion_self_cond_topk(); + if (!inp_sc || !inp_sc->ids || !inp_sc->probs || + diffusion_cond.sc_topk_device_ids_data != inp_sc->ids->data || + diffusion_cond.sc_topk_device_probs_data != inp_sc->probs->data || + diffusion_cond.sc_topk_device_ids_bytes != ggml_nbytes(inp_sc->ids) || + diffusion_cond.sc_topk_device_probs_bytes != ggml_nbytes(inp_sc->probs)) { + return false; + } + + return true; +} + bool llama_context::diffusion_sample_topk_supported() const { for (ggml_backend_t backend : backend_ptrs) { if (get_cuda_diffusion_sample_topk_proc(backend)) { @@ -1221,6 +1253,7 @@ bool llama_context::diffusion_sample_topk( ggml_tensor * t_self_cond_ids = nullptr; ggml_tensor * t_self_cond_probs = nullptr; + ggml_tensor * t_canvas_tokens = nullptr; const bool device_self_cond = result->self_cond_ids == nullptr && result->self_cond_probs == nullptr; if (device_self_cond) { auto * inp = gf_res_prev->get_inp_diffusion_self_cond_topk(); @@ -1233,6 +1266,13 @@ bool llama_context::diffusion_sample_topk( return false; } + if (result->update_canvas_on_device) { + t_canvas_tokens = gf_res_prev->get_inp_tokens(); + if (!t_canvas_tokens) { + return false; + } + } + ggml_cuda_diffusion_sample_params cuda_params = { /* .n_vocab = */ (int32_t) model.vocab.n_tokens(), /* .n_tokens = */ params->n_tokens, @@ -1252,6 +1292,10 @@ bool llama_context::diffusion_sample_topk( /* .self_cond_probs = */ result->self_cond_probs, /* .self_cond_ids_tensor = */ t_self_cond_ids, /* .self_cond_probs_tensor = */ t_self_cond_probs, + /* .canvas_tokens_tensor = */ t_canvas_tokens, + /* .final_tokens = */ (int32_t *) result->final_tokens, + /* .entropy_bound = */ result->entropy_bound, + /* .update_canvas_on_device = */ result->update_canvas_on_device, }; const bool ok = sample_proc(backend, t_logits, &cuda_params, &cuda_result); @@ -1265,6 +1309,11 @@ bool llama_context::diffusion_sample_topk( diffusion_cond.sc_topk_device_ids_bytes = ggml_nbytes(t_self_cond_ids); diffusion_cond.sc_topk_device_probs_bytes = ggml_nbytes(t_self_cond_probs); } + if (ok && result->update_canvas_on_device) { + diffusion_cond.canvas_tokens_device_ready = true; + diffusion_cond.canvas_tokens_device_data = t_canvas_tokens->data; + diffusion_cond.canvas_tokens_device_bytes = ggml_nbytes(t_canvas_tokens); + } return ok; } @@ -1465,7 +1514,9 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll //const auto t_start_us = ggml_time_us(); // FIXME this call causes a crash if any model inputs were not used in the graph and were therefore not allocated - res->set_inputs(&ubatch); + if (!diffusion_decoder_inputs_device_ready(diffusion_cond, res)) { + res->set_inputs(&ubatch); + } //LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0); } diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index b0a7a31f8131..778e413b0851 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -84,8 +85,16 @@ static ggml_tensor * ggml_mul_mat_aux( void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) { if (ubatch->token) { const int64_t n_tokens = ubatch->n_tokens; + const size_t n_bytes = n_tokens*ggml_element_size(tokens); - ggml_backend_tensor_set(tokens, ubatch->token, 0, n_tokens*ggml_element_size(tokens)); + const bool have_device = diffusion + && diffusion->canvas_tokens_device_ready + && diffusion->canvas_tokens_device_data == tokens->data + && diffusion->canvas_tokens_device_bytes == n_bytes; + + if (!have_device) { + ggml_backend_tensor_set(tokens, ubatch->token, 0, n_bytes); + } } if (ubatch->embd) { @@ -216,26 +225,25 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { const int64_t n_tokens = ubatch->n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer)); - int32_t * data = (int32_t *) out_ids->data; + std::vector data(n_outputs); if (n_outputs == n_tokens) { for (int i = 0; i < n_tokens; ++i) { data[i] = i; } + } else { + GGML_ASSERT(ubatch->output); - return; - } - - GGML_ASSERT(ubatch->output); - - int n_outputs = 0; + int n_outputs = 0; - for (int i = 0; i < n_tokens; ++i) { - if (ubatch->output[i]) { - data[n_outputs++] = i; + for (int i = 0; i < n_tokens; ++i) { + if (ubatch->output[i]) { + data[n_outputs++] = i; + } } } + + ggml_backend_tensor_set(out_ids, data.data(), 0, data.size()*ggml_element_size(out_ids)); } bool llm_graph_input_out_ids::can_reuse(const llm_graph_params & params) { @@ -524,20 +532,26 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) { }; GGML_ASSERT(self_kq_mask); - GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer)); if (self_kq_mask->type == GGML_TYPE_F16) { - fill_mask((ggml_fp16_t *) self_kq_mask->data, ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE); + std::vector data(ggml_nelements(self_kq_mask)); + fill_mask(data.data(), ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE); + ggml_backend_tensor_set(self_kq_mask, data.data(), 0, ggml_nbytes(self_kq_mask)); } else { - fill_mask((float *) self_kq_mask->data, ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE); + std::vector data(ggml_nelements(self_kq_mask)); + fill_mask(data.data(), ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE); + ggml_backend_tensor_set(self_kq_mask, data.data(), 0, ggml_nbytes(self_kq_mask)); } if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { GGML_ASSERT(self_kq_mask_swa); - GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer)); if (self_kq_mask_swa->type == GGML_TYPE_F16) { - fill_mask((ggml_fp16_t *) self_kq_mask_swa->data, ggml_nelements(self_kq_mask_swa), hparams.n_swa, hparams.swa_type); + std::vector data(ggml_nelements(self_kq_mask_swa)); + fill_mask(data.data(), ggml_nelements(self_kq_mask_swa), hparams.n_swa, hparams.swa_type); + ggml_backend_tensor_set(self_kq_mask_swa, data.data(), 0, ggml_nbytes(self_kq_mask_swa)); } else { - fill_mask((float *) self_kq_mask_swa->data, ggml_nelements(self_kq_mask_swa), hparams.n_swa, hparams.swa_type); + std::vector data(ggml_nelements(self_kq_mask_swa)); + fill_mask(data.data(), ggml_nelements(self_kq_mask_swa), hparams.n_swa, hparams.swa_type); + ggml_backend_tensor_set(self_kq_mask_swa, data.data(), 0, ggml_nbytes(self_kq_mask_swa)); } } } @@ -574,11 +588,14 @@ void llm_graph_input_attn_no_cache_prefix::set_input(const llama_ubatch * ubatch }; GGML_ASSERT(self_kq_mask); - GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer)); if (self_kq_mask->type == GGML_TYPE_F16) { - fill_mask((ggml_fp16_t *) self_kq_mask->data, ggml_nelements(self_kq_mask)); + std::vector data(ggml_nelements(self_kq_mask)); + fill_mask(data.data(), ggml_nelements(self_kq_mask)); + ggml_backend_tensor_set(self_kq_mask, data.data(), 0, ggml_nbytes(self_kq_mask)); } else { - fill_mask((float *) self_kq_mask->data, ggml_nelements(self_kq_mask)); + std::vector data(ggml_nelements(self_kq_mask)); + fill_mask(data.data(), ggml_nelements(self_kq_mask)); + ggml_backend_tensor_set(self_kq_mask, data.data(), 0, ggml_nbytes(self_kq_mask)); } } @@ -1177,6 +1194,34 @@ void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const { } } +void llm_graph_context::set_diffusion_input_backend(ggml_tensor * tensor, uint32_t group) const { + if (!diffusion || !diffusion->decoder_phase || !sched || !tensor) { + return; + } + + static const uint32_t enabled_groups = [] { + const char * env = getenv("DG_GPU_INPUT_GROUPS"); + // Keep the default to inputs used by the fixed diffusion decoder graph: + // canvas/self-cond, positions, attention scale, KV indices, masks and + // rotary helpers. They are marked persistent below so the allocator will + // not overwrite them between denoising replays. + return env ? (uint32_t) strtoul(env, nullptr, 0) : 63u; // 1|2|4|8|16|32 + }(); + if ((enabled_groups & group) == 0) { + return; + } + + for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); ++i) { + ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i); + if (backend && backend != backend_cpu && + ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_GPU) { + tensor->flags |= GGML_TENSOR_FLAG_PERSIST; + ggml_backend_sched_set_tensor_backend(sched, tensor, backend); + break; + } + } +} + ggml_tensor * llm_graph_context::build_cvec( ggml_tensor * cur, int il) const { @@ -1924,12 +1969,13 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { assert(n_embd_inp >= n_embd); - auto inp = std::make_unique(n_embd_inp); + auto inp = std::make_unique(n_embd_inp, diffusion); inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); cb(inp->tokens, "inp_tokens", -1); ggml_set_input(inp->tokens); res->t_inp_tokens = inp->tokens; + set_diffusion_input_backend(inp->tokens); inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_inp, ubatch.n_tokens); cb(inp->embd, "inp_embd", -1); @@ -2014,6 +2060,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const { cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t)n_tokens*hparams.n_pos_per_embd()); ggml_set_input(cur); + set_diffusion_input_backend(cur, 2); res->add_input(std::move(inp)); @@ -2028,6 +2075,7 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const { // this need to be 1x1xN for broadcasting cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens); ggml_set_input(cur); + set_diffusion_input_backend(cur, 4); ggml_set_name(cur, "attn_scale"); res->add_input(std::move(inp)); @@ -2036,6 +2084,10 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const { } ggml_tensor * llm_graph_context::build_inp_out_ids() const { + if (diffusion && diffusion->decoder_phase && n_outputs == n_tokens) { + return nullptr; + } + // note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls, // but this would make the graph topology depend on the number of output tokens, which can interfere with // features that require constant topology such as pipeline parallelism @@ -2050,6 +2102,7 @@ ggml_tensor * llm_graph_context::build_inp_out_ids() const { cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); ggml_set_input(cur); + set_diffusion_input_backend(cur, 64); res->add_input(std::move(inp)); @@ -2113,6 +2166,7 @@ ggml_tensor * llm_graph_context::build_inp_diffusion_self_cond(int64_t n_vocab) cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens); ggml_set_input(cur); + set_diffusion_input_backend(cur); res->add_input(std::move(inp)); @@ -2126,21 +2180,11 @@ llm_graph_input_diffusion_self_cond_topk * llm_graph_context::build_inp_diffusio // dims that must match the data tensor; a flat index list gathers into [n_embd, k*n_tokens]). inp->ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, k * n_tokens); ggml_set_input(inp->ids); + set_diffusion_input_backend(inp->ids); inp->probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, k, n_tokens); ggml_set_input(inp->probs); - - if (sched) { - for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); ++i) { - ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i); - if (backend && backend != backend_cpu && - ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_GPU) { - ggml_backend_sched_set_tensor_backend(sched, inp->ids, backend); - ggml_backend_sched_set_tensor_backend(sched, inp->probs, backend); - break; - } - } - } + set_diffusion_input_backend(inp->probs); auto * ptr = inp.get(); res->add_input(std::move(inp)); @@ -2337,12 +2381,14 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch inp->self_kq_mask = ggml_new_tensor_4d(ctx0, type_mask, n_tokens, n_tokens, 1, 1); ggml_set_input(inp->self_kq_mask); + set_diffusion_input_backend(inp->self_kq_mask, 128); inp->self_kq_mask_cnv = inp->self_kq_mask; if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, type_mask, n_tokens, n_tokens, 1, 1); ggml_set_input(inp->self_kq_mask_swa); + set_diffusion_input_backend(inp->self_kq_mask_swa, 128); inp->self_kq_mask_swa_cnv = inp->self_kq_mask_swa; } else { @@ -2360,6 +2406,7 @@ llm_graph_input_attn_no_cache_prefix * llm_graph_context::build_attn_inp_no_cach inp->self_kq_mask = ggml_new_tensor_4d(ctx0, type_mask, n_tokens, n_tokens, 1, 1); ggml_set_input(inp->self_kq_mask); + set_diffusion_input_backend(inp->self_kq_mask, 128); inp->self_kq_mask_cnv = inp->self_kq_mask; // sliding-window layers reuse the same prefix mask (valid while n_tokens <= sliding_window) @@ -2450,6 +2497,11 @@ llm_graph_input_attn_kv * llm_graph_context::build_attn_inp_kv() const { const auto * mctx_cur = static_cast(mctx); auto inp = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur); + set_diffusion_input_backend(inp->self_k_idxs, 8); + set_diffusion_input_backend(inp->self_v_idxs, 8); + set_diffusion_input_backend(inp->self_kq_mask, 16); + set_diffusion_input_backend(inp->self_k_rot, 32); + set_diffusion_input_backend(inp->self_v_rot, 32); return (llm_graph_input_attn_kv *) res->add_input(std::move(inp)); } @@ -2554,6 +2606,8 @@ llm_graph_input_attn_k * llm_graph_context::build_attn_inp_k() const { const auto * mctx_cur = static_cast(mctx); auto inp = build_attn_inp_k_impl(ctx0, ubatch, hparams, cparams, mctx_cur); + set_diffusion_input_backend(inp->self_k_idxs, 8); + set_diffusion_input_backend(inp->self_kq_mask, 16); return (llm_graph_input_attn_k *) res->add_input(std::move(inp)); } @@ -2863,6 +2917,12 @@ llm_graph_input_attn_k_dsa * llm_graph_context::build_attn_inp_k_dsa() const { inp->self_k_rot_lid = mctx_cur->get_lid()->build_input_k_rot(ctx0); } + set_diffusion_input_backend(inp->self_k_idxs_mla, 8); + set_diffusion_input_backend(inp->self_kq_mask_mla, 16); + set_diffusion_input_backend(inp->self_k_idxs_lid, 8); + set_diffusion_input_backend(inp->self_kq_mask_lid, 16); + set_diffusion_input_backend(inp->self_k_rot_lid, 32); + return (llm_graph_input_attn_k_dsa *) res->add_input(std::move(inp)); } @@ -2898,6 +2958,17 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const inp->self_k_rot_swa = mctx_cur->get_swa()->build_input_k_rot(ctx0); inp->self_v_rot_swa = mctx_cur->get_swa()->build_input_v_rot(ctx0); + set_diffusion_input_backend(inp->self_k_idxs, 8); + set_diffusion_input_backend(inp->self_v_idxs, 8); + set_diffusion_input_backend(inp->self_kq_mask, 16); + set_diffusion_input_backend(inp->self_k_idxs_swa, 8); + set_diffusion_input_backend(inp->self_v_idxs_swa, 8); + set_diffusion_input_backend(inp->self_kq_mask_swa, 16); + set_diffusion_input_backend(inp->self_k_rot, 32); + set_diffusion_input_backend(inp->self_v_rot, 32); + set_diffusion_input_backend(inp->self_k_rot_swa, 32); + set_diffusion_input_backend(inp->self_v_rot_swa, 32); + return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp)); } diff --git a/src/llama-graph.h b/src/llama-graph.h index 3d14c47968f4..838bbce24b86 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -99,6 +99,10 @@ struct llama_diffusion_cond { size_t sc_topk_device_ids_bytes = 0; size_t sc_topk_device_probs_bytes = 0; + bool canvas_tokens_device_ready = false; + void * canvas_tokens_device_data = nullptr; + size_t canvas_tokens_device_bytes = 0; + // KV-cache reuse phase selector (block-diffusion): // false (encoder phase) = plain token embeddings, no self-conditioning; KV is // committed to the cache (prompt prefill / finalized-canvas commit). @@ -147,7 +151,8 @@ using llm_graph_input_ptr = std::unique_ptr; class llm_graph_input_embd : public llm_graph_input_i { public: - llm_graph_input_embd(int64_t n_embd) : n_embd(n_embd) {} + llm_graph_input_embd(int64_t n_embd, const llama_diffusion_cond * diffusion = nullptr) : + diffusion(diffusion), n_embd(n_embd) {} virtual ~llm_graph_input_embd() = default; void set_input(const llama_ubatch * ubatch) override; @@ -157,6 +162,7 @@ class llm_graph_input_embd : public llm_graph_input_i { ggml_tensor * tokens = nullptr; // I32 [n_batch] ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] + const llama_diffusion_cond * diffusion = nullptr; const int64_t n_embd = 0; }; @@ -923,6 +929,7 @@ struct llm_graph_context { virtual ~llm_graph_context() = default; void cb(ggml_tensor * cur, const char * name, int il) const; + void set_diffusion_input_backend(ggml_tensor * tensor, uint32_t group = 1) const; // // common diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 2802103bdd82..0e7c92f91b0a 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1450,8 +1450,15 @@ void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ub const uint32_t n_tokens = ubatch->n_tokens; GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream()); - GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); - int64_t * data = (int64_t *) dst->data; + std::vector data_staging; + int64_t * data = nullptr; + const bool is_host = ggml_backend_buffer_is_host(dst->buffer); + if (is_host) { + data = (int64_t *) dst->data; + } else { + data_staging.resize(ggml_nelements(dst)); + data = data_staging.data(); + } for (uint32_t s = 0; s < sinfo.n_stream(); ++s) { const int64_t offs = sinfo.strm[s]*get_size(); @@ -1460,14 +1467,25 @@ void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ub data[s*sinfo.size() + i] = offs + sinfo.idxs[s][i]; } } + + if (!is_host) { + ggml_backend_tensor_set(dst, data_staging.data(), 0, ggml_nbytes(dst)); + } } void llama_kv_cache::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const { const uint32_t n_tokens = ubatch->n_tokens; GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream()); - GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); - int64_t * data = (int64_t *) dst->data; + std::vector data_staging; + int64_t * data = nullptr; + const bool is_host = ggml_backend_buffer_is_host(dst->buffer); + if (is_host) { + data = (int64_t *) dst->data; + } else { + data_staging.resize(ggml_nelements(dst)); + data = data_staging.data(); + } if (!v_trans) { for (uint32_t s = 0; s < sinfo.n_stream(); ++s) { @@ -1493,6 +1511,10 @@ void llama_kv_cache::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ub } } } + + if (!is_host) { + ggml_backend_tensor_set(dst, data_staging.data(), 0, ggml_nbytes(dst)); + } } void llama_kv_cache::set_input_k_shift(ggml_tensor * dst) const { @@ -1715,8 +1737,6 @@ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, T * data void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const { const uint32_t n_tokens = ubatch->n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); - const int64_t n_kv = dst->ne[0]; const int64_t n_stream = dst->ne[3]; // num streams in the current ubatch @@ -1740,9 +1760,21 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u }; if (dst->type == GGML_TYPE_F16) { - set_input_kq_mask_impl(args, (ggml_fp16_t *) dst->data, causal_attn); + if (ggml_backend_buffer_is_host(dst->buffer)) { + set_input_kq_mask_impl(args, (ggml_fp16_t *) dst->data, causal_attn); + } else { + std::vector data(ggml_nelements(dst)); + set_input_kq_mask_impl(args, data.data(), causal_attn); + ggml_backend_tensor_set(dst, data.data(), 0, ggml_nbytes(dst)); + } } else { - set_input_kq_mask_impl(args, (float *) dst->data, causal_attn); + if (ggml_backend_buffer_is_host(dst->buffer)) { + set_input_kq_mask_impl(args, (float *) dst->data, causal_attn); + } else { + std::vector data(ggml_nelements(dst)); + set_input_kq_mask_impl(args, data.data(), causal_attn); + ggml_backend_tensor_set(dst, data.data(), 0, ggml_nbytes(dst)); + } } //const int64_t t_end = ggml_time_us(); @@ -1776,21 +1808,27 @@ void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch } void llama_kv_cache::set_input_k_rot(ggml_tensor * dst) const { - GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); - const auto n_rot = dst->ne[0]; GGML_ASSERT(attn_rot_hadamard.count(dst->ne[0])); - memcpy(dst->data, attn_rot_hadamard.at(n_rot).data(), ggml_nbytes(dst)); + const auto & data = attn_rot_hadamard.at(n_rot); + if (ggml_backend_buffer_is_host(dst->buffer)) { + memcpy(dst->data, data.data(), ggml_nbytes(dst)); + } else { + ggml_backend_tensor_set(dst, data.data(), 0, ggml_nbytes(dst)); + } } void llama_kv_cache::set_input_v_rot(ggml_tensor * dst) const { - GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); - const auto n_rot = dst->ne[0]; GGML_ASSERT(attn_rot_hadamard.count(dst->ne[0])); - memcpy(dst->data, attn_rot_hadamard.at(n_rot).data(), ggml_nbytes(dst)); + const auto & data = attn_rot_hadamard.at(n_rot); + if (ggml_backend_buffer_is_host(dst->buffer)) { + memcpy(dst->data, data.data(), ggml_nbytes(dst)); + } else { + ggml_backend_tensor_set(dst, data.data(), 0, ggml_nbytes(dst)); + } } size_t llama_kv_cache::total_size() const { diff --git a/src/models/diffusion-gemma.cpp b/src/models/diffusion-gemma.cpp index b01fe2e8de97..fe521082e542 100644 --- a/src/models/diffusion-gemma.cpp +++ b/src/models/diffusion-gemma.cpp @@ -53,13 +53,14 @@ llama_model_diffusion_gemma::~llama_model_diffusion_gemma() { } } -// Place the token embedding the self-conditioning step needs on an offloaded (GPU) backend. +// Place the token embedding the diffusion decoder needs on an offloaded (GPU) backend. // // Primary path (sparse gather): store tok_embd {n_embd, n_vocab} as F16 in tok_embd_gpu on-device. -// The decoder graph gathers only the top-k rows per position (ggml_get_rows), so this is the whole -// soft-embedding cost: ~1.47 GiB resident, no per-decode PCIe copy, no full-vocab matmul. F16 (not -// the native Q4_K) because CUDA get_rows has no Q4_K/Q6_K kernel -- a quantized gather would fall -// back to CPU every step (a large regression). F16 halves the VRAM vs the F32 dense path. +// The decoder graph gathers the canvas token rows and the self-conditioning top-k rows from this +// tensor. This keeps token-id driven embedding lookup on device: ~1.47 GiB resident, no per-decode +// token-id D2H for CPU row selection, no per-decode embedding H2D. F16 (not the native Q4_K) +// because CUDA get_rows has no Q4_K/Q6_K kernel -- a quantized gather would fall back to CPU every +// step (a large regression). F16 halves the VRAM vs the F32 dense path. // // Fallback path (dense matmul): if the F16 copy can't be allocated, precompute the transposed F32 // embedding {n_vocab, n_embd} so the dense `probs @ token_embd` matmul (build_input) still runs @@ -118,7 +119,7 @@ void llama_model_diffusion_gemma::load_arch_post(llama_model_loader & ml) { if (tok_embd_gpu_buf) { ggml_backend_tensor_set(tok_embd_gpu, half.data(), 0, ggml_nbytes(tok_embd_gpu)); - LLAMA_LOG_INFO("%s: placed self-cond token embedding {%lld, %lld} F16 (%.2f GiB) on %s (gather, k=%lld)\n", + LLAMA_LOG_INFO("%s: placed diffusion token embedding {%lld, %lld} F16 (%.2f GiB) on %s (gather, k=%lld)\n", __func__, (long long) n_embd_t, (long long) n_vocab_t, ggml_nbytes(tok_embd_gpu) / (1024.0 * 1024.0 * 1024.0), ggml_backend_buffer_name(tok_embd_gpu->buffer), @@ -184,7 +185,7 @@ std::unique_ptr llama_model_diffusion_gemma::build_arch_graph ggml_tensor * llama_model_diffusion_gemma::graph_base::build_input(bool is_decoder) { const auto & dmodel = static_cast(model); - ggml_tensor * inpL = build_inp_embd(model.tok_embd); + ggml_tensor * inpL = build_inp_embd(dmodel.tok_embd_gpu ? dmodel.tok_embd_gpu : model.tok_embd); // scaled word embeddings (sqrt(hidden_size)); raw embeddings input is not scaled inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f); From ba31e9a6d5ec0a60bf4e09283aeab23629a50e64 Mon Sep 17 00:00:00 2001 From: Loveneet Nigam Date: Tue, 9 Jun 2026 15:36:43 +0530 Subject: [PATCH 05/13] diffusion-gemma: use output flag for persistent inputs --- ggml/include/ggml.h | 1 - ggml/src/ggml-alloc.c | 13 ++++++------- src/llama-graph.cpp | 7 ++++--- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index ff3bbce1b531..374934aacf32 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -647,7 +647,6 @@ extern "C" { GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up) GGML_TENSOR_FLAG_COMPUTE = 16, // ...must be computed - GGML_TENSOR_FLAG_PERSIST = 32, // ...must not be overwritten by graph allocator reuse }; enum ggml_tri_type { diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index 5eec18a059c9..3bda9abbe03d 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -641,10 +641,9 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor continue; } - // outputs and persistent inputs cannot be reused - if ((parent->flags & (GGML_TENSOR_FLAG_OUTPUT | GGML_TENSOR_FLAG_PERSIST)) || - (parent->view_src != NULL && (parent->view_src->flags & (GGML_TENSOR_FLAG_OUTPUT | GGML_TENSOR_FLAG_PERSIST)))) { - AT_PRINTF("not reusing parent %s for %s as it is persistent\n", parent->name, node->name); + // outputs cannot be reused + if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) { + AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name); continue; } @@ -689,9 +688,9 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor } static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) { - // graph outputs and persistent inputs are never freed - if (node->flags & (GGML_TENSOR_FLAG_OUTPUT | GGML_TENSOR_FLAG_PERSIST)) { - AT_PRINTF("not freeing persistent %s\n", node->name); + // graph outputs are never freed + if (node->flags & GGML_TENSOR_FLAG_OUTPUT) { + AT_PRINTF("not freeing output %s\n", node->name); return; } diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 778e413b0851..eee04a30de2e 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1203,8 +1203,9 @@ void llm_graph_context::set_diffusion_input_backend(ggml_tensor * tensor, uint32 const char * env = getenv("DG_GPU_INPUT_GROUPS"); // Keep the default to inputs used by the fixed diffusion decoder graph: // canvas/self-cond, positions, attention scale, KV indices, masks and - // rotary helpers. They are marked persistent below so the allocator will - // not overwrite them between denoising replays. + // rotary helpers. Mark them as outputs too, matching ggml-backend's + // existing copy-tensor convention, so the allocator will not overwrite + // them between denoising replays. return env ? (uint32_t) strtoul(env, nullptr, 0) : 63u; // 1|2|4|8|16|32 }(); if ((enabled_groups & group) == 0) { @@ -1215,7 +1216,7 @@ void llm_graph_context::set_diffusion_input_backend(ggml_tensor * tensor, uint32 ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i); if (backend && backend != backend_cpu && ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_GPU) { - tensor->flags |= GGML_TENSOR_FLAG_PERSIST; + ggml_set_output(tensor); ggml_backend_sched_set_tensor_backend(sched, tensor, backend); break; } From 7a7c0a8e4c83dd88823a46eac024dfa5b090f2f3 Mon Sep 17 00:00:00 2001 From: Loveneet Nigam Date: Tue, 9 Jun 2026 17:24:15 +0530 Subject: [PATCH 06/13] diffusion-gemma: checkpoint device-loop early stop --- .../diffusion-gemma/diffusion-gemma-cli.cpp | 33 ++++++++- .../diffusion-gemma-server.cpp | 36 +++++++++- ggml/include/ggml-cuda.h | 6 ++ ggml/src/ggml-cuda/diffusion-sampling.cu | 69 ++++++++++++++++++- include/llama.h | 10 ++- src/llama-context.cpp | 6 ++ 6 files changed, 154 insertions(+), 6 deletions(-) diff --git a/examples/diffusion-gemma/diffusion-gemma-cli.cpp b/examples/diffusion-gemma/diffusion-gemma-cli.cpp index 853d5f36a649..6556df737402 100644 --- a/examples/diffusion-gemma/diffusion-gemma-cli.cpp +++ b/examples/diffusion-gemma/diffusion-gemma-cli.cpp @@ -209,6 +209,8 @@ int main(int argc, char ** argv) { llama_diffusion_sample_topk_supported(ctx); const bool use_device_self_cond = use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0; const bool use_device_loop = use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0; + const int device_early_stop_interval = use_device_loop ? + std::max(env_int("DG_DEVICE_EARLY_STOP_INTERVAL", 0), 0) : 0; llama_set_diffusion_gpu_sampling(ctx, use_gpu_sampling); LOG_INF("diffusion-gemma: prefix=%d canvas=%d max_canvases=%d steps=%d entropy_bound=%.3f temp=[%.2f,%.2f] n_ctx=%d mm=%d\n", @@ -219,6 +221,9 @@ int main(int argc, char ** argv) { (!gpu_sampling_requested ? " (disabled by DG_GPU_SAMPLING=0)" : "")), use_device_self_cond ? " | device self-cond: on" : "", use_device_loop ? " | device loop: on" : ""); + if (device_early_stop_interval > 0) { + LOG_INF("diffusion-gemma: device early-stop interval=%d\n", device_early_stop_interval); + } if (topk_fixed > 0 || (topk_start > 0 && topk_end > 0)) { LOG_INF("diffusion-gemma: top-k sampling: fixed=%d anneal=[%d->%d] tail_correction=%d (vocab=%d)\n", topk_fixed, topk_start, topk_end, topk_tail, n_vocab); @@ -354,6 +359,12 @@ int main(int argc, char ** argv) { if (use_device_loop) { bool sampled_ok = true; + const int block_step = n_steps - cur_step + 1; + const bool use_device_early_stop = device_early_stop_interval > 0; + const bool reset_stop_state = use_device_early_stop && block_step == 1; + const bool check_stop = use_device_early_stop && + (cur_step == 1 || (block_step % device_early_stop_interval) == 0); + int32_t stop_flag = 0; const auto t_sample_start = std::chrono::steady_clock::now(); llama_diffusion_sample_params sample_params = { /* .n_tokens = */ canvas_length, @@ -370,9 +381,15 @@ int main(int argc, char ** argv) { /* .entropy = */ nullptr, /* .self_cond_ids = */ nullptr, /* .self_cond_probs = */ nullptr, - /* .final_tokens = */ cur_step == 1 ? argmax_canvas.data() : nullptr, - /* .update_canvas_on_device = */ cur_step > 1, + /* .final_tokens = */ (cur_step == 1 || check_stop) ? argmax_canvas.data() : nullptr, + /* .stop = */ check_stop ? &stop_flag : nullptr, /* .entropy_bound = */ entropy_bound, + /* .confidence_threshold = */ CONFIDENCE_THRESHOLD, + /* .stability_threshold = */ STABILITY_THRESHOLD, + /* .update_canvas_on_device = */ cur_step > 1, + /* .update_stop_state_on_device = */ use_device_early_stop, + /* .check_stop_on_device = */ check_stop, + /* .reset_stop_state = */ reset_stop_state, }; sampled_ok = llama_diffusion_sample_topk(ctx, &sample_params, &sample_result); sample_sync_s += std::chrono::duration(std::chrono::steady_clock::now() - t_sample_start).count(); @@ -383,6 +400,9 @@ int main(int argc, char ** argv) { done = true; break; } + if (check_stop && stop_flag != 0) { + break; + } continue; } @@ -411,6 +431,15 @@ int main(int argc, char ** argv) { /* .entropy = */ entropy.data(), /* .self_cond_ids = */ use_device_self_cond ? nullptr : sc_ids.data(), /* .self_cond_probs = */ use_device_self_cond ? nullptr : sc_probs.data(), + /* .final_tokens = */ nullptr, + /* .stop = */ nullptr, + /* .entropy_bound = */ 0.0f, + /* .confidence_threshold = */ 0.0f, + /* .stability_threshold = */ 0, + /* .update_canvas_on_device = */ false, + /* .update_stop_state_on_device = */ false, + /* .check_stop_on_device = */ false, + /* .reset_stop_state = */ false, }; sampled_ok = llama_diffusion_sample_topk(ctx, &sample_params, &sample_result); if (!sampled_ok) { diff --git a/examples/diffusion-gemma/diffusion-gemma-server.cpp b/examples/diffusion-gemma/diffusion-gemma-server.cpp index 3985e2740d5a..d3e8c169eaf4 100644 --- a/examples/diffusion-gemma/diffusion-gemma-server.cpp +++ b/examples/diffusion-gemma/diffusion-gemma-server.cpp @@ -161,6 +161,7 @@ struct diffusion_server { bool use_gpu_sampling = false; bool use_device_self_cond = false; bool use_device_loop = false; + int device_early_stop_interval = 0; std::string model_id; std::string model_path; std::string build_info = "llama.cpp diffusion-gemma-server"; @@ -291,6 +292,12 @@ struct diffusion_server { std::vector sampled(canvas_length); if (use_device_loop_step) { + const int block_step = n_steps - cur_step + 1; + const bool use_device_early_stop = device_early_stop_interval > 0; + const bool reset_stop_state = use_device_early_stop && block_step == 1; + const bool check_stop = use_device_early_stop && + (cur_step == 1 || (block_step % device_early_stop_interval) == 0); + int32_t stop_flag = 0; llama_diffusion_sample_params sample_params = { /* .n_tokens = */ canvas_length, /* .top_k = */ k_step, @@ -306,9 +313,15 @@ struct diffusion_server { /* .entropy = */ nullptr, /* .self_cond_ids = */ nullptr, /* .self_cond_probs = */ nullptr, - /* .final_tokens = */ cur_step == 1 ? argmax_canvas.data() : nullptr, - /* .update_canvas_on_device = */ cur_step > 1, + /* .final_tokens = */ (cur_step == 1 || check_stop) ? argmax_canvas.data() : nullptr, + /* .stop = */ check_stop ? &stop_flag : nullptr, /* .entropy_bound = */ ENTROPY_BOUND, + /* .confidence_threshold = */ CONFIDENCE_THRESHOLD, + /* .stability_threshold = */ STABILITY_THRESHOLD, + /* .update_canvas_on_device = */ cur_step > 1, + /* .update_stop_state_on_device = */ use_device_early_stop, + /* .check_stop_on_device = */ check_stop, + /* .reset_stop_state = */ reset_stop_state, }; if (!llama_diffusion_sample_topk(ctx, &sample_params, &sample_result)) { llama_memory_seq_rm(mem, 0, n_past, -1); @@ -317,6 +330,9 @@ struct diffusion_server { return out; } llama_memory_seq_rm(mem, 0, n_past, -1); + if (check_stop && stop_flag != 0) { + break; + } continue; } @@ -341,6 +357,15 @@ struct diffusion_server { /* .entropy = */ entropy.data(), /* .self_cond_ids = */ use_device_self_cond_step ? nullptr : sc_ids.data(), /* .self_cond_probs = */ use_device_self_cond_step ? nullptr : sc_probs.data(), + /* .final_tokens = */ nullptr, + /* .stop = */ nullptr, + /* .entropy_bound = */ 0.0f, + /* .confidence_threshold = */ 0.0f, + /* .stability_threshold = */ 0, + /* .update_canvas_on_device = */ false, + /* .update_stop_state_on_device = */ false, + /* .check_stop_on_device = */ false, + /* .reset_stop_state = */ false, }; if (!llama_diffusion_sample_topk(ctx, &sample_params, &sample_result)) { llama_memory_seq_rm(mem, 0, n_past, -1); @@ -711,6 +736,8 @@ int main(int argc, char ** argv) { llama_diffusion_sample_topk_supported(srv.ctx); srv.use_device_self_cond = srv.use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0; srv.use_device_loop = srv.use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0; + srv.device_early_stop_interval = srv.use_device_loop ? + std::max(env_int("DG_DEVICE_EARLY_STOP_INTERVAL", 0), 0) : 0; llama_set_diffusion_gpu_sampling(srv.ctx, srv.use_gpu_sampling); const bool default_topk_gpu_ok = topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K; @@ -726,6 +753,9 @@ int main(int argc, char ** argv) { srv.use_device_self_cond ? " | device self-cond: on" : "", srv.use_device_loop ? " | device loop: on" : "", srv.topk_fixed, srv.topk_start, srv.topk_end, srv.topk_tail ? 1 : 0); + if (srv.device_early_stop_interval > 0) { + SRV_INF("device early-stop interval: %d\n", srv.device_early_stop_interval); + } // ------------------------------------------------------------------------------------------ // HTTP server @@ -797,6 +827,7 @@ int main(int argc, char ** argv) { {"gpu_sampling", srv.use_gpu_sampling}, {"device_self_cond", srv.use_device_self_cond}, {"device_loop", srv.use_device_loop}, + {"device_early_stop_interval", srv.device_early_stop_interval}, {"top_k", srv.topk_fixed}, {"top_k_start", srv.topk_start}, {"top_k_end", srv.topk_end}, @@ -905,6 +936,7 @@ int main(int argc, char ** argv) { {"gpu_sampling", srv.use_gpu_sampling}, {"device_self_cond", srv.use_device_self_cond}, {"device_loop", srv.use_device_loop}, + {"device_early_stop_interval", srv.device_early_stop_interval}, {"top_k", srv.topk_fixed}, {"top_k_start", srv.topk_start}, {"top_k_end", srv.topk_end}, diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h index d966e5a3d0d7..cb1136ba856d 100644 --- a/ggml/include/ggml-cuda.h +++ b/ggml/include/ggml-cuda.h @@ -64,8 +64,14 @@ struct ggml_cuda_diffusion_sample_result { struct ggml_tensor * self_cond_probs_tensor; struct ggml_tensor * canvas_tokens_tensor; int32_t * final_tokens; + int32_t * stop; float entropy_bound; + float confidence_threshold; + int32_t stability_threshold; bool update_canvas_on_device; + bool update_stop_state_on_device; + bool check_stop_on_device; + bool reset_stop_state; }; typedef bool (*ggml_backend_cuda_diffusion_sample_topk_t)( diff --git a/ggml/src/ggml-cuda/diffusion-sampling.cu b/ggml/src/ggml-cuda/diffusion-sampling.cu index f0b6f53e3146..81cb0056d352 100644 --- a/ggml/src/ggml-cuda/diffusion-sampling.cu +++ b/ggml/src/ggml-cuda/diffusion-sampling.cu @@ -35,6 +35,8 @@ struct diffusion_sample_scratch { int * top_ids = nullptr; int * sampled = nullptr; int * argmax = nullptr; + int * prev_argmax = nullptr; + int * stop = nullptr; float * entropy = nullptr; int * sc_ids = nullptr; float * sc_probs = nullptr; @@ -42,6 +44,8 @@ struct diffusion_sample_scratch { size_t top_ids_cap = 0; size_t sampled_cap = 0; size_t argmax_cap = 0; + size_t prev_argmax_cap = 0; + size_t stop_cap = 0; size_t entropy_cap = 0; size_t sc_ids_cap = 0; size_t sc_probs_cap = 0; @@ -80,6 +84,8 @@ static diffusion_sample_scratch * diffusion_get_scratch( diffusion_scratch_reserve(stream, &scratch.top_ids, &scratch.top_ids_cap, (size_t) n_tokens * heap_k, &synced); diffusion_scratch_reserve(stream, &scratch.sampled, &scratch.sampled_cap, (size_t) n_tokens, &synced); diffusion_scratch_reserve(stream, &scratch.argmax, &scratch.argmax_cap, (size_t) n_tokens, &synced); + diffusion_scratch_reserve(stream, &scratch.prev_argmax, &scratch.prev_argmax_cap, (size_t) n_tokens, &synced); + diffusion_scratch_reserve(stream, &scratch.stop, &scratch.stop_cap, (size_t) 1, &synced); diffusion_scratch_reserve(stream, &scratch.entropy, &scratch.entropy_cap, (size_t) n_tokens, &synced); diffusion_scratch_reserve(stream, &scratch.sc_ids, &scratch.sc_ids_cap, (size_t) n_tokens * sc_k, &synced); diffusion_scratch_reserve(stream, &scratch.sc_probs, &scratch.sc_probs_cap, (size_t) n_tokens * sc_k, &synced); @@ -357,6 +363,51 @@ static __global__ void diffusion_update_canvas_kernel( } } +static __global__ void diffusion_stop_state_kernel( + const float * __restrict__ entropy, + const int * __restrict__ argmax, + int * __restrict__ prev_argmax, + int * __restrict__ stop, + const int n_tokens, + const float confidence_threshold, + const int stability_threshold, + const int check_stop, + const int reset_state) { + const int tid = threadIdx.x; + + __shared__ float s_entropy[1024]; + __shared__ int s_diff[1024]; + + if (tid < n_tokens) { + s_entropy[tid] = entropy[tid]; + s_diff[tid] = reset_state ? 1 : (prev_argmax[tid] != argmax[tid]); + } else { + s_entropy[tid] = 0.0f; + s_diff[tid] = 0; + } + __syncthreads(); + + for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) { + if (tid < stride) { + s_entropy[tid] += s_entropy[tid + stride]; + s_diff[tid] += s_diff[tid + stride]; + } + __syncthreads(); + } + + if (tid == 0 && check_stop) { + const bool stable = !reset_state && (stability_threshold == 0 || s_diff[0] == 0); + const bool confident = confidence_threshold > 0.0f && + (s_entropy[0] / (float) n_tokens) < confidence_threshold; + stop[0] = (stable && confident) ? 1 : 0; + } + __syncthreads(); + + if (tid < n_tokens) { + prev_argmax[tid] = argmax[tid]; + } +} + static __global__ void diffusion_sample_kernel( const float * __restrict__ logits, const int * __restrict__ top_ids, @@ -550,6 +601,11 @@ bool ggml_cuda_diffusion_sample_topk( return false; } } + if (result->update_stop_state_on_device || result->check_stop_on_device || result->reset_stop_state) { + if (n_tokens > 1024 || (result->check_stop_on_device && result->stop == nullptr)) { + return false; + } + } const bool have_self_cond_host = result->self_cond_ids != nullptr && result->self_cond_probs != nullptr; const bool have_self_cond_tensor = result->self_cond_ids_tensor != nullptr && result->self_cond_probs_tensor != nullptr; @@ -651,6 +707,13 @@ bool ggml_cuda_diffusion_sample_topk( scratch->entropy, scratch->sampled, (int *) result->canvas_tokens_tensor->data, n_tokens, n_vocab, result->entropy_bound, params->seed, params->step); } + if (result->update_stop_state_on_device || result->check_stop_on_device || result->reset_stop_state) { + const int stop_threads = next_power_of_2_host(n_tokens); + diffusion_stop_state_kernel<<<1, stop_threads, 0, stream>>>( + scratch->entropy, scratch->argmax, scratch->prev_argmax, scratch->stop, + n_tokens, result->confidence_threshold, result->stability_threshold, + result->check_stop_on_device ? 1 : 0, result->reset_stop_state ? 1 : 0); + } if (result->sampled) { CUDA_CHECK(cudaMemcpyAsync(result->sampled, scratch->sampled, (size_t) n_tokens * sizeof(int), @@ -668,6 +731,10 @@ bool ggml_cuda_diffusion_sample_topk( CUDA_CHECK(cudaMemcpyAsync(result->final_tokens, scratch->argmax, (size_t) n_tokens * sizeof(int), cudaMemcpyDeviceToHost, stream)); } + if (result->stop) { + CUDA_CHECK(cudaMemcpyAsync(result->stop, scratch->stop, sizeof(int), + cudaMemcpyDeviceToHost, stream)); + } if (have_self_cond_host) { CUDA_CHECK(cudaMemcpyAsync(result->self_cond_ids, scratch->sc_ids, (size_t) n_tokens * sc_k * sizeof(int), cudaMemcpyDeviceToHost, stream)); @@ -682,7 +749,7 @@ bool ggml_cuda_diffusion_sample_topk( } const bool host_outputs_requested = result->sampled || result->argmax || result->entropy || - result->final_tokens || have_self_cond_host; + result->final_tokens || result->stop || have_self_cond_host; CUDA_CHECK(cudaGetLastError()); if (sync_required || host_outputs_requested) { CUDA_CHECK(cudaStreamSynchronize(stream)); diff --git a/include/llama.h b/include/llama.h index 0e39b71a388e..089dd2747a8b 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1038,8 +1038,16 @@ extern "C" { // decoder token input tensor. final_tokens is copied only when requested, typically on // the final denoising step. llama_token * final_tokens; - bool update_canvas_on_device; + // Optional host stop flag for device-loop early stopping. When requested, CUDA computes + // the stable+confident condition on device and copies only this flag plus final_tokens. + int32_t * stop; float entropy_bound; + float confidence_threshold; + int32_t stability_threshold; + bool update_canvas_on_device; + bool update_stop_state_on_device; + bool check_stop_on_device; + bool reset_stop_state; }; // CUDA-only fast path for block-diffusion sampling. When enabled, llama_decode() diff --git a/src/llama-context.cpp b/src/llama-context.cpp index fd6c8bc99a36..cf3b9760a546 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1294,8 +1294,14 @@ bool llama_context::diffusion_sample_topk( /* .self_cond_probs_tensor = */ t_self_cond_probs, /* .canvas_tokens_tensor = */ t_canvas_tokens, /* .final_tokens = */ (int32_t *) result->final_tokens, + /* .stop = */ result->stop, /* .entropy_bound = */ result->entropy_bound, + /* .confidence_threshold = */ result->confidence_threshold, + /* .stability_threshold = */ result->stability_threshold, /* .update_canvas_on_device = */ result->update_canvas_on_device, + /* .update_stop_state_on_device = */ result->update_stop_state_on_device, + /* .check_stop_on_device = */ result->check_stop_on_device, + /* .reset_stop_state = */ result->reset_stop_state, }; const bool ok = sample_proc(backend, t_logits, &cuda_params, &cuda_result); From 4823ee09e91f2914234e4ecba68ad32e05088225 Mon Sep 17 00:00:00 2001 From: Loveneet Nigam Date: Tue, 9 Jun 2026 17:51:29 +0530 Subject: [PATCH 07/13] diffusion-gemma: make device early stop every step --- examples/diffusion-gemma/diffusion-gemma-cli.cpp | 6 ++---- examples/diffusion-gemma/diffusion-gemma-server.cpp | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/examples/diffusion-gemma/diffusion-gemma-cli.cpp b/examples/diffusion-gemma/diffusion-gemma-cli.cpp index 6556df737402..87b99b0aa31b 100644 --- a/examples/diffusion-gemma/diffusion-gemma-cli.cpp +++ b/examples/diffusion-gemma/diffusion-gemma-cli.cpp @@ -209,8 +209,7 @@ int main(int argc, char ** argv) { llama_diffusion_sample_topk_supported(ctx); const bool use_device_self_cond = use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0; const bool use_device_loop = use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0; - const int device_early_stop_interval = use_device_loop ? - std::max(env_int("DG_DEVICE_EARLY_STOP_INTERVAL", 0), 0) : 0; + const int device_early_stop_interval = use_device_loop ? 1 : 0; llama_set_diffusion_gpu_sampling(ctx, use_gpu_sampling); LOG_INF("diffusion-gemma: prefix=%d canvas=%d max_canvases=%d steps=%d entropy_bound=%.3f temp=[%.2f,%.2f] n_ctx=%d mm=%d\n", @@ -362,8 +361,7 @@ int main(int argc, char ** argv) { const int block_step = n_steps - cur_step + 1; const bool use_device_early_stop = device_early_stop_interval > 0; const bool reset_stop_state = use_device_early_stop && block_step == 1; - const bool check_stop = use_device_early_stop && - (cur_step == 1 || (block_step % device_early_stop_interval) == 0); + const bool check_stop = use_device_early_stop; int32_t stop_flag = 0; const auto t_sample_start = std::chrono::steady_clock::now(); llama_diffusion_sample_params sample_params = { diff --git a/examples/diffusion-gemma/diffusion-gemma-server.cpp b/examples/diffusion-gemma/diffusion-gemma-server.cpp index d3e8c169eaf4..e14920644966 100644 --- a/examples/diffusion-gemma/diffusion-gemma-server.cpp +++ b/examples/diffusion-gemma/diffusion-gemma-server.cpp @@ -295,8 +295,7 @@ struct diffusion_server { const int block_step = n_steps - cur_step + 1; const bool use_device_early_stop = device_early_stop_interval > 0; const bool reset_stop_state = use_device_early_stop && block_step == 1; - const bool check_stop = use_device_early_stop && - (cur_step == 1 || (block_step % device_early_stop_interval) == 0); + const bool check_stop = use_device_early_stop; int32_t stop_flag = 0; llama_diffusion_sample_params sample_params = { /* .n_tokens = */ canvas_length, @@ -736,8 +735,7 @@ int main(int argc, char ** argv) { llama_diffusion_sample_topk_supported(srv.ctx); srv.use_device_self_cond = srv.use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0; srv.use_device_loop = srv.use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0; - srv.device_early_stop_interval = srv.use_device_loop ? - std::max(env_int("DG_DEVICE_EARLY_STOP_INTERVAL", 0), 0) : 0; + srv.device_early_stop_interval = srv.use_device_loop ? 1 : 0; llama_set_diffusion_gpu_sampling(srv.ctx, srv.use_gpu_sampling); const bool default_topk_gpu_ok = topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K; From 2a3dcf5764eea43299ba2be1b9015b4944ecabe0 Mon Sep 17 00:00:00 2001 From: Loveneet Nigam Date: Tue, 9 Jun 2026 23:38:45 +0530 Subject: [PATCH 08/13] examples: update diffusion gemma mtmd bitmap loading --- examples/diffusion-gemma/diffusion-gemma-cli.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/diffusion-gemma/diffusion-gemma-cli.cpp b/examples/diffusion-gemma/diffusion-gemma-cli.cpp index 87b99b0aa31b..8a3e5cbbcd97 100644 --- a/examples/diffusion-gemma/diffusion-gemma-cli.cpp +++ b/examples/diffusion-gemma/diffusion-gemma-cli.cpp @@ -140,13 +140,13 @@ int main(int argc, char ** argv) { mtmd::bitmaps bitmaps; std::string markers; for (const auto & img : params.image) { - mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(mctx_vision.get(), img.c_str())); - if (!bmp.ptr) { + auto bmp_res = mtmd_helper_bitmap_init_from_file(mctx_vision.get(), img.c_str(), false); + if (!bmp_res.bitmap) { LOG_ERR("error: failed to load image '%s'\n", img.c_str()); llama_model_free(model); return 1; } - bitmaps.entries.push_back(std::move(bmp)); + bitmaps.entries.emplace_back(bmp_res.bitmap); markers += mtmd_default_marker(); markers += "\n"; } From 1347c3d5fa93a9c363326f66609f3aaa967392d8 Mon Sep 17 00:00:00 2001 From: Loveneet Nigam Date: Wed, 10 Jun 2026 01:11:44 +0530 Subject: [PATCH 09/13] diffusion: add gated kernel optimization experiments --- examples/diffusion-gemma/diffusion-gemma-cli.cpp | 6 ++++-- examples/diffusion-gemma/diffusion-gemma-server.cpp | 6 ++++-- ggml/src/ggml-cuda/diffusion-sampling.cu | 13 +++++++++++-- ggml/src/ggml-cuda/mmq.cu | 13 ++++++++++--- 4 files changed, 29 insertions(+), 9 deletions(-) diff --git a/examples/diffusion-gemma/diffusion-gemma-cli.cpp b/examples/diffusion-gemma/diffusion-gemma-cli.cpp index 8a3e5cbbcd97..ee703087b7bd 100644 --- a/examples/diffusion-gemma/diffusion-gemma-cli.cpp +++ b/examples/diffusion-gemma/diffusion-gemma-cli.cpp @@ -209,7 +209,8 @@ int main(int argc, char ** argv) { llama_diffusion_sample_topk_supported(ctx); const bool use_device_self_cond = use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0; const bool use_device_loop = use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0; - const int device_early_stop_interval = use_device_loop ? 1 : 0; + const int device_early_stop_interval = use_device_loop ? + std::max(env_int("DG_DEVICE_EARLY_STOP_INTERVAL", 1), 0) : 0; llama_set_diffusion_gpu_sampling(ctx, use_gpu_sampling); LOG_INF("diffusion-gemma: prefix=%d canvas=%d max_canvases=%d steps=%d entropy_bound=%.3f temp=[%.2f,%.2f] n_ctx=%d mm=%d\n", @@ -361,7 +362,8 @@ int main(int argc, char ** argv) { const int block_step = n_steps - cur_step + 1; const bool use_device_early_stop = device_early_stop_interval > 0; const bool reset_stop_state = use_device_early_stop && block_step == 1; - const bool check_stop = use_device_early_stop; + const bool check_stop = use_device_early_stop && + (cur_step == 1 || (block_step % device_early_stop_interval) == 0); int32_t stop_flag = 0; const auto t_sample_start = std::chrono::steady_clock::now(); llama_diffusion_sample_params sample_params = { diff --git a/examples/diffusion-gemma/diffusion-gemma-server.cpp b/examples/diffusion-gemma/diffusion-gemma-server.cpp index e14920644966..708e2e0af48f 100644 --- a/examples/diffusion-gemma/diffusion-gemma-server.cpp +++ b/examples/diffusion-gemma/diffusion-gemma-server.cpp @@ -295,7 +295,8 @@ struct diffusion_server { const int block_step = n_steps - cur_step + 1; const bool use_device_early_stop = device_early_stop_interval > 0; const bool reset_stop_state = use_device_early_stop && block_step == 1; - const bool check_stop = use_device_early_stop; + const bool check_stop = use_device_early_stop && + (cur_step == 1 || (block_step % device_early_stop_interval) == 0); int32_t stop_flag = 0; llama_diffusion_sample_params sample_params = { /* .n_tokens = */ canvas_length, @@ -735,7 +736,8 @@ int main(int argc, char ** argv) { llama_diffusion_sample_topk_supported(srv.ctx); srv.use_device_self_cond = srv.use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0; srv.use_device_loop = srv.use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0; - srv.device_early_stop_interval = srv.use_device_loop ? 1 : 0; + srv.device_early_stop_interval = srv.use_device_loop ? + std::max(env_int("DG_DEVICE_EARLY_STOP_INTERVAL", 1), 0) : 0; llama_set_diffusion_gpu_sampling(srv.ctx, srv.use_gpu_sampling); const bool default_topk_gpu_ok = topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K; diff --git a/ggml/src/ggml-cuda/diffusion-sampling.cu b/ggml/src/ggml-cuda/diffusion-sampling.cu index 81cb0056d352..da8d05680c55 100644 --- a/ggml/src/ggml-cuda/diffusion-sampling.cu +++ b/ggml/src/ggml-cuda/diffusion-sampling.cu @@ -31,6 +31,11 @@ static bool diffusion_fast_topk_enabled() { return !v || std::strcmp(v, "0") != 0; } +static bool diffusion_direct_selfcond_enabled() { + const char * v = std::getenv("GGML_CUDA_DIFFUSION_DIRECT_SELFCOND"); + return v && std::strcmp(v, "0") != 0; +} + struct diffusion_sample_scratch { int * top_ids = nullptr; int * sampled = nullptr; @@ -694,12 +699,16 @@ bool ggml_cuda_diffusion_sample_topk( logits_d, top_ids, n_vocab, heap_k, heap_k_pad, inv_temp); } + const bool direct_self_cond_tensor = have_self_cond_tensor && !have_self_cond_host && diffusion_direct_selfcond_enabled(); + int * sc_ids_out = direct_self_cond_tensor ? (int *) result->self_cond_ids_tensor->data : scratch->sc_ids; + float * sc_probs_out = direct_self_cond_tensor ? (float *) result->self_cond_probs_tensor->data : scratch->sc_probs; + constexpr int block_size = 256; diffusion_sample_kernel<<>>( logits_d, top_ids, n_vocab, n_tokens, top_k, heap_k, sc_k, inv_temp, params->seed, params->step, params->top_k_tail_correction, scratch->sampled, scratch->argmax, scratch->entropy, - scratch->sc_ids, scratch->sc_probs); + sc_ids_out, sc_probs_out); if (result->update_canvas_on_device) { const int update_threads = next_power_of_2_host(n_tokens); @@ -741,7 +750,7 @@ bool ggml_cuda_diffusion_sample_topk( CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs, scratch->sc_probs, (size_t) n_tokens * sc_k * sizeof(float), cudaMemcpyDeviceToHost, stream)); } - if (have_self_cond_tensor) { + if (have_self_cond_tensor && !direct_self_cond_tensor) { CUDA_CHECK(cudaMemcpyAsync(result->self_cond_ids_tensor->data, scratch->sc_ids, (size_t) n_tokens * sc_k * sizeof(int), cudaMemcpyDeviceToDevice, stream)); CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs_tensor->data, scratch->sc_probs, (size_t) n_tokens * sc_k * sizeof(float), diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index e1add5e03316..a07e49493e06 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -3,6 +3,13 @@ #include "quantize.cuh" #include "mmid.cuh" +#include + +static bool ggml_cuda_mmq_stream_k_enabled() { + const char * v = std::getenv("GGML_CUDA_MMQ_STREAM_K"); + return !v || std::atoi(v) != 0; +} + static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { switch (args.type_x) { case GGML_TYPE_Q1_0: @@ -118,8 +125,8 @@ void ggml_cuda_mul_mat_q( const int64_t s03 = src0->nb[3] / ts_src0; const int64_t s3 = dst->nb[3] / ts_dst; - const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) - || GGML_CUDA_CC_IS_CDNA(cc); + const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) + || GGML_CUDA_CC_IS_CDNA(cc)) && ggml_cuda_mmq_stream_k_enabled(); // TODO: tighter pool buffer size vs q8 path const bool use_native_fp4 = blackwell_mma_available(cc) && (src0->type == GGML_TYPE_MXFP4 || src0->type == GGML_TYPE_NVFP4); @@ -251,7 +258,7 @@ void ggml_cuda_op_mul_mat_q( // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer. const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_CDNA(cc)) - && src1_ncols == ne11; + && src1_ncols == ne11 && ggml_cuda_mmq_stream_k_enabled(); const mmq_args args = { src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst, From a83d05d8110f29f6c343b9df6c0d0a9283ea1036 Mon Sep 17 00:00:00 2001 From: Loveneet Nigam Date: Wed, 10 Jun 2026 01:19:47 +0530 Subject: [PATCH 10/13] diffusion: poll device stop state every denoise step --- .../diffusion-gemma/diffusion-gemma-cli.cpp | 9 ++++----- .../diffusion-gemma/diffusion-gemma-server.cpp | 9 ++++----- ggml/src/ggml-cuda/diffusion-sampling.cu | 18 ++++++++++++++++-- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/examples/diffusion-gemma/diffusion-gemma-cli.cpp b/examples/diffusion-gemma/diffusion-gemma-cli.cpp index ee703087b7bd..ad7059805a34 100644 --- a/examples/diffusion-gemma/diffusion-gemma-cli.cpp +++ b/examples/diffusion-gemma/diffusion-gemma-cli.cpp @@ -209,8 +209,7 @@ int main(int argc, char ** argv) { llama_diffusion_sample_topk_supported(ctx); const bool use_device_self_cond = use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0; const bool use_device_loop = use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0; - const int device_early_stop_interval = use_device_loop ? - std::max(env_int("DG_DEVICE_EARLY_STOP_INTERVAL", 1), 0) : 0; + const int device_early_stop_interval = use_device_loop ? 1 : 0; llama_set_diffusion_gpu_sampling(ctx, use_gpu_sampling); LOG_INF("diffusion-gemma: prefix=%d canvas=%d max_canvases=%d steps=%d entropy_bound=%.3f temp=[%.2f,%.2f] n_ctx=%d mm=%d\n", @@ -360,10 +359,10 @@ int main(int argc, char ** argv) { if (use_device_loop) { bool sampled_ok = true; const int block_step = n_steps - cur_step + 1; + const bool final_step = cur_step == 1; const bool use_device_early_stop = device_early_stop_interval > 0; const bool reset_stop_state = use_device_early_stop && block_step == 1; - const bool check_stop = use_device_early_stop && - (cur_step == 1 || (block_step % device_early_stop_interval) == 0); + const bool check_stop = use_device_early_stop && !final_step; int32_t stop_flag = 0; const auto t_sample_start = std::chrono::steady_clock::now(); llama_diffusion_sample_params sample_params = { @@ -381,7 +380,7 @@ int main(int argc, char ** argv) { /* .entropy = */ nullptr, /* .self_cond_ids = */ nullptr, /* .self_cond_probs = */ nullptr, - /* .final_tokens = */ (cur_step == 1 || check_stop) ? argmax_canvas.data() : nullptr, + /* .final_tokens = */ (final_step || check_stop) ? argmax_canvas.data() : nullptr, /* .stop = */ check_stop ? &stop_flag : nullptr, /* .entropy_bound = */ entropy_bound, /* .confidence_threshold = */ CONFIDENCE_THRESHOLD, diff --git a/examples/diffusion-gemma/diffusion-gemma-server.cpp b/examples/diffusion-gemma/diffusion-gemma-server.cpp index 708e2e0af48f..f651e32a98bf 100644 --- a/examples/diffusion-gemma/diffusion-gemma-server.cpp +++ b/examples/diffusion-gemma/diffusion-gemma-server.cpp @@ -293,10 +293,10 @@ struct diffusion_server { if (use_device_loop_step) { const int block_step = n_steps - cur_step + 1; + const bool final_step = cur_step == 1; const bool use_device_early_stop = device_early_stop_interval > 0; const bool reset_stop_state = use_device_early_stop && block_step == 1; - const bool check_stop = use_device_early_stop && - (cur_step == 1 || (block_step % device_early_stop_interval) == 0); + const bool check_stop = use_device_early_stop && !final_step; int32_t stop_flag = 0; llama_diffusion_sample_params sample_params = { /* .n_tokens = */ canvas_length, @@ -313,7 +313,7 @@ struct diffusion_server { /* .entropy = */ nullptr, /* .self_cond_ids = */ nullptr, /* .self_cond_probs = */ nullptr, - /* .final_tokens = */ (cur_step == 1 || check_stop) ? argmax_canvas.data() : nullptr, + /* .final_tokens = */ (final_step || check_stop) ? argmax_canvas.data() : nullptr, /* .stop = */ check_stop ? &stop_flag : nullptr, /* .entropy_bound = */ ENTROPY_BOUND, /* .confidence_threshold = */ CONFIDENCE_THRESHOLD, @@ -736,8 +736,7 @@ int main(int argc, char ** argv) { llama_diffusion_sample_topk_supported(srv.ctx); srv.use_device_self_cond = srv.use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0; srv.use_device_loop = srv.use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0; - srv.device_early_stop_interval = srv.use_device_loop ? - std::max(env_int("DG_DEVICE_EARLY_STOP_INTERVAL", 1), 0) : 0; + srv.device_early_stop_interval = srv.use_device_loop ? 1 : 0; llama_set_diffusion_gpu_sampling(srv.ctx, srv.use_gpu_sampling); const bool default_topk_gpu_ok = topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K; diff --git a/ggml/src/ggml-cuda/diffusion-sampling.cu b/ggml/src/ggml-cuda/diffusion-sampling.cu index da8d05680c55..73e1472119ea 100644 --- a/ggml/src/ggml-cuda/diffusion-sampling.cu +++ b/ggml/src/ggml-cuda/diffusion-sampling.cu @@ -36,6 +36,11 @@ static bool diffusion_direct_selfcond_enabled() { return v && std::strcmp(v, "0") != 0; } +static bool diffusion_final_tokens_on_stop_enabled() { + const char * v = std::getenv("GGML_CUDA_DIFFUSION_FINAL_TOKENS_ON_STOP"); + return v && std::strcmp(v, "0") != 0; +} + struct diffusion_sample_scratch { int * top_ids = nullptr; int * sampled = nullptr; @@ -736,7 +741,11 @@ bool ggml_cuda_diffusion_sample_topk( CUDA_CHECK(cudaMemcpyAsync(result->entropy, scratch->entropy, (size_t) n_tokens * sizeof(float), cudaMemcpyDeviceToHost, stream)); } - if (result->final_tokens) { + const bool final_tokens_after_stop = + result->final_tokens && result->stop && result->check_stop_on_device && + diffusion_final_tokens_on_stop_enabled(); + + if (result->final_tokens && !final_tokens_after_stop) { CUDA_CHECK(cudaMemcpyAsync(result->final_tokens, scratch->argmax, (size_t) n_tokens * sizeof(int), cudaMemcpyDeviceToHost, stream)); } @@ -758,10 +767,15 @@ bool ggml_cuda_diffusion_sample_topk( } const bool host_outputs_requested = result->sampled || result->argmax || result->entropy || - result->final_tokens || result->stop || have_self_cond_host; + (result->final_tokens && !final_tokens_after_stop) || result->stop || have_self_cond_host; CUDA_CHECK(cudaGetLastError()); if (sync_required || host_outputs_requested) { CUDA_CHECK(cudaStreamSynchronize(stream)); } + if (final_tokens_after_stop && *result->stop != 0) { + CUDA_CHECK(cudaMemcpyAsync(result->final_tokens, scratch->argmax, (size_t) n_tokens * sizeof(int), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } return true; } From 1f1e91c826f1708ae511087b1c0c7c4e9d3c9cef Mon Sep 17 00:00:00 2001 From: Loveneet Nigam Date: Wed, 10 Jun 2026 01:33:53 +0530 Subject: [PATCH 11/13] diffusion: add gated topk and mmq kernel experiments --- ggml/src/ggml-cuda/diffusion-sampling.cu | 381 +++++++++++++++++++---- ggml/src/ggml-cuda/mmq.cuh | 54 +++- 2 files changed, 375 insertions(+), 60 deletions(-) diff --git a/ggml/src/ggml-cuda/diffusion-sampling.cu b/ggml/src/ggml-cuda/diffusion-sampling.cu index 73e1472119ea..d28a397fad7b 100644 --- a/ggml/src/ggml-cuda/diffusion-sampling.cu +++ b/ggml/src/ggml-cuda/diffusion-sampling.cu @@ -41,6 +41,41 @@ static bool diffusion_final_tokens_on_stop_enabled() { return v && std::strcmp(v, "0") != 0; } +static bool diffusion_fused_topk_sample_enabled() { + const char * v = std::getenv("GGML_CUDA_DIFFUSION_FUSED_TOPK_SAMPLE"); + return v && std::strcmp(v, "0") != 0; +} + +static bool diffusion_tight_topk_enabled() { + const char * v = std::getenv("GGML_CUDA_DIFFUSION_TIGHT_TOPK"); + return v && std::strcmp(v, "0") != 0; +} + +static int diffusion_topk_local_k(const int heap_k) { + int requested = heap_k <= 256 ? 8 : 16; + if (const char * v = std::getenv("GGML_CUDA_DIFFUSION_TOPK_LOCAL_K")) { + requested = std::atoi(v); + } + + const int min_local_k = std::max(1, (heap_k + 255) / 256); + if (requested <= 1) { + requested = 1; + } else if (requested <= 2) { + requested = 2; + } else if (requested <= 4) { + requested = 4; + } else if (requested <= 8) { + requested = 8; + } else { + requested = 16; + } + + while (requested < min_local_k && requested < 16) { + requested <<= 1; + } + return requested; +} + struct diffusion_sample_scratch { int * top_ids = nullptr; int * sampled = nullptr; @@ -194,17 +229,39 @@ static void diffusion_select_topk_local( const int heap_k, cudaStream_t stream) { constexpr int block_size = 256; - - if (heap_k <= 256) { - constexpr int local_k = 8; - const size_t smem = (size_t) local_k * block_size * (sizeof(float) + sizeof(int)); - diffusion_select_topk_local_kernel<<>>( - logits, top_ids, n_vocab, heap_k); - } else { - constexpr int local_k = 16; - const size_t smem = (size_t) local_k * block_size * (sizeof(float) + sizeof(int)); - diffusion_select_topk_local_kernel<<>>( - logits, top_ids, n_vocab, heap_k); + const int local_k = diffusion_topk_local_k(heap_k); + + switch (local_k) { + case 1: { + constexpr int local_k_t = 1; + const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)); + diffusion_select_topk_local_kernel<<>>( + logits, top_ids, n_vocab, heap_k); + } break; + case 2: { + constexpr int local_k_t = 2; + const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)); + diffusion_select_topk_local_kernel<<>>( + logits, top_ids, n_vocab, heap_k); + } break; + case 4: { + constexpr int local_k_t = 4; + const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)); + diffusion_select_topk_local_kernel<<>>( + logits, top_ids, n_vocab, heap_k); + } break; + case 8: { + constexpr int local_k_t = 8; + const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)); + diffusion_select_topk_local_kernel<<>>( + logits, top_ids, n_vocab, heap_k); + } break; + default: { + constexpr int local_k_t = 16; + const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)); + diffusion_select_topk_local_kernel<<>>( + logits, top_ids, n_vocab, heap_k); + } break; } } @@ -297,6 +354,213 @@ static __device__ __forceinline__ float diffusion_rng_uniform(uint32_t seed, uin return ((x >> 8) + 0.5f) * (1.0f / 16777216.0f); } +template +static __global__ void diffusion_sample_topk_fused_local_kernel( + const float * __restrict__ logits, + const int n_vocab, + const int n_tokens, + const int top_k, + const int sc_k, + const float inv_temp, + const uint32_t seed, + const uint32_t step, + int * __restrict__ sampled, + int * __restrict__ argmax, + float * __restrict__ entropy, + int * __restrict__ sc_ids, + float * __restrict__ sc_probs) { + constexpr int block_size = 256; + constexpr int candidate_count = LOCAL_K * block_size; + + const int row = blockIdx.x; + const int tid = threadIdx.x; + if (row >= n_tokens) { + return; + } + + float vals[LOCAL_K]; + int ids[LOCAL_K]; +#pragma unroll + for (int i = 0; i < LOCAL_K; ++i) { + vals[i] = -FLT_MAX; + ids[i] = 0; + } + + const float * row_logits = logits + (size_t) row * n_vocab; + for (int v = tid; v < n_vocab; v += blockDim.x) { + const float x = row_logits[v]; + if (diffusion_should_swap_desc(vals[LOCAL_K - 1], ids[LOCAL_K - 1], x, v)) { + int pos = LOCAL_K - 1; +#pragma unroll + for (int i = LOCAL_K - 1; i > 0; --i) { + if (pos == i && diffusion_should_swap_desc(vals[i - 1], ids[i - 1], x, v)) { + vals[i] = vals[i - 1]; + ids[i] = ids[i - 1]; + --pos; + } + } + vals[pos] = x; + ids[pos] = v; + } + } + + extern __shared__ unsigned char smem[]; + float * s_vals = (float *) smem; + int * s_ids = (int *) (s_vals + candidate_count); + float * s_sum = (float *) (s_ids + candidate_count); + float * s_t = s_sum + block_size; + +#pragma unroll + for (int i = 0; i < LOCAL_K; ++i) { + const int dst = tid * LOCAL_K + i; + s_vals[dst] = vals[i]; + s_ids[dst] = ids[i]; + } + __syncthreads(); + + for (int k = 2; k <= candidate_count; k <<= 1) { + for (int j = k >> 1; j > 0; j >>= 1) { + for (int i = tid; i < candidate_count; i += blockDim.x) { + const int ixj = i ^ j; + if (ixj > i) { + const bool descending = (i & k) == 0; + const bool swap = descending + ? diffusion_should_swap_desc(s_vals[i], s_ids[i], s_vals[ixj], s_ids[ixj]) + : diffusion_should_swap_desc(s_vals[ixj], s_ids[ixj], s_vals[i], s_ids[i]); + if (swap) { + const float tv = s_vals[i]; + s_vals[i] = s_vals[ixj]; + s_vals[ixj] = tv; + const int ti = s_ids[i]; + s_ids[i] = s_ids[ixj]; + s_ids[ixj] = ti; + } + } + } + __syncthreads(); + } + } + + const float max_l = s_vals[0] * inv_temp; + const int amax = s_ids[0]; + + float local_sum = 0.0f; + float local_t = 0.0f; + if (tid < top_k) { + const float d = s_vals[tid] * inv_temp - max_l; + const float e = expf(d); + local_sum = e; + local_t = d * e; + } + + s_sum[tid] = local_sum; + s_t[tid] = local_t; + __syncthreads(); + + for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) { + if (tid < stride) { + s_sum[tid] += s_sum[tid + stride]; + s_t[tid] += s_t[tid + stride]; + } + __syncthreads(); + } + + const float z = s_sum[0]; + const float t = s_t[0]; + + if (tid == 0) { + argmax[row] = amax; + entropy[row] = logf(z) - t / z; + + const float r = diffusion_rng_uniform(seed, step, row) * z; + float cum = 0.0f; + int tok = amax; + for (int i = 0; i < top_k; ++i) { + const float d = s_vals[i] * inv_temp - max_l; + cum += expf(d); + if (cum >= r) { + tok = s_ids[i]; + break; + } + } + sampled[row] = tok; + + const int n_sc = min(sc_k, top_k); + for (int i = 0; i < sc_k; ++i) { + const int out = row * sc_k + i; + if (i < n_sc) { + sc_ids[out] = s_ids[i]; + sc_probs[out] = expf(s_vals[i] * inv_temp - max_l) / z; + } else { + sc_ids[out] = 0; + sc_probs[out] = 0.0f; + } + } + } +} + +static void diffusion_sample_topk_fused_local( + const float * logits, + const int n_vocab, + const int n_tokens, + const int top_k, + const int sc_k, + const float inv_temp, + const uint32_t seed, + const uint32_t step, + int * sampled, + int * argmax, + float * entropy, + int * sc_ids, + float * sc_probs, + cudaStream_t stream) { + constexpr int block_size = 256; + const int local_k = diffusion_topk_local_k(top_k); + + switch (local_k) { + case 1: { + constexpr int local_k_t = 1; + const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) + + 2 * block_size * sizeof(float); + diffusion_sample_topk_fused_local_kernel<<>>( + logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, seed, step, + sampled, argmax, entropy, sc_ids, sc_probs); + } break; + case 2: { + constexpr int local_k_t = 2; + const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) + + 2 * block_size * sizeof(float); + diffusion_sample_topk_fused_local_kernel<<>>( + logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, seed, step, + sampled, argmax, entropy, sc_ids, sc_probs); + } break; + case 4: { + constexpr int local_k_t = 4; + const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) + + 2 * block_size * sizeof(float); + diffusion_sample_topk_fused_local_kernel<<>>( + logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, seed, step, + sampled, argmax, entropy, sc_ids, sc_probs); + } break; + case 8: { + constexpr int local_k_t = 8; + const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) + + 2 * block_size * sizeof(float); + diffusion_sample_topk_fused_local_kernel<<>>( + logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, seed, step, + sampled, argmax, entropy, sc_ids, sc_probs); + } break; + default: { + constexpr int local_k_t = 16; + const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) + + 2 * block_size * sizeof(float); + diffusion_sample_topk_fused_local_kernel<<>>( + logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, seed, step, + sampled, argmax, entropy, sc_ids, sc_probs); + } break; + } +} + static __device__ __forceinline__ bool diffusion_pair_gt(float a_val, int a_id, float b_val, int b_id) { return a_val > b_val || (a_val == b_val && a_id > b_id); } @@ -647,7 +911,7 @@ bool ggml_cuda_diffusion_sample_topk( } } - const int heap_k = top_k == 0 ? sc_k : std::max(top_k, sc_k); + const int heap_k = top_k == 0 || !diffusion_tight_topk_enabled() ? std::max(top_k, sc_k) : top_k; if (heap_k <= 0 || heap_k > 1024 || heap_k > n_vocab) { return false; } @@ -664,56 +928,65 @@ bool ggml_cuda_diffusion_sample_topk( diffusion_sample_scratch * scratch = diffusion_get_scratch(stream, n_tokens, heap_k, sc_k); int * top_ids = scratch->top_ids; - bool top_ids_sorted = false; + const bool direct_self_cond_tensor = have_self_cond_tensor && !have_self_cond_host && diffusion_direct_selfcond_enabled(); + int * sc_ids_out = direct_self_cond_tensor ? (int *) result->self_cond_ids_tensor->data : scratch->sc_ids; + float * sc_probs_out = direct_self_cond_tensor ? (float *) result->self_cond_probs_tensor->data : scratch->sc_probs; - const bool use_fast_topk = diffusion_fast_topk_enabled() && heap_k <= 1024 && n_vocab >= heap_k; - bool sync_required = !use_fast_topk; - if (use_fast_topk) { - diffusion_select_topk_local(logits_d, top_ids, n_vocab, n_tokens, heap_k, stream); - top_ids_sorted = true; + constexpr int block_size = 256; + bool sync_required = false; + const bool use_fused_topk_sample = diffusion_fused_topk_sample_enabled() && + top_k > 0 && top_k <= block_size && !params->top_k_tail_correction && n_vocab >= top_k; + + if (use_fused_topk_sample) { + diffusion_sample_topk_fused_local(logits_d, n_vocab, n_tokens, top_k, sc_k, inv_temp, + params->seed, params->step, scratch->sampled, scratch->argmax, scratch->entropy, + sc_ids_out, sc_probs_out, stream); } else { + bool top_ids_sorted = false; + const bool use_fast_topk = diffusion_fast_topk_enabled() && heap_k <= 1024 && n_vocab >= heap_k; + sync_required = !use_fast_topk; + if (use_fast_topk) { + diffusion_select_topk_local(logits_d, top_ids, n_vocab, n_tokens, heap_k, stream); + top_ids_sorted = true; + } else { #ifdef CUB_DIFFUSION_TOP_K_AVAILABLE - for (int row = 0; row < n_tokens; ++row) { - diffusion_top_k_cub(pool, logits_d + (size_t) row * n_vocab, top_ids + (size_t) row * heap_k, n_vocab, heap_k, stream); - } + for (int row = 0; row < n_tokens; ++row) { + diffusion_top_k_cub(pool, logits_d + (size_t) row * n_vocab, top_ids + (size_t) row * heap_k, n_vocab, heap_k, stream); + } #elif defined(GGML_CUDA_USE_CUB) - ggml_cuda_pool_alloc sorted_ids_alloc(pool, (size_t) n_tokens * n_vocab); - int * sorted_ids = sorted_ids_alloc.get(); - argsort_f32_i32_cuda_cub(pool, logits_d, sorted_ids, n_vocab, n_tokens, GGML_SORT_ORDER_DESC, stream); - CUDA_CHECK(cudaMemcpy2DAsync(top_ids, heap_k * sizeof(int), sorted_ids, n_vocab * sizeof(int), - heap_k * sizeof(int), n_tokens, cudaMemcpyDeviceToDevice, stream)); - top_ids_sorted = true; + ggml_cuda_pool_alloc sorted_ids_alloc(pool, (size_t) n_tokens * n_vocab); + int * sorted_ids = sorted_ids_alloc.get(); + argsort_f32_i32_cuda_cub(pool, logits_d, sorted_ids, n_vocab, n_tokens, GGML_SORT_ORDER_DESC, stream); + CUDA_CHECK(cudaMemcpy2DAsync(top_ids, heap_k * sizeof(int), sorted_ids, n_vocab * sizeof(int), + heap_k * sizeof(int), n_tokens, cudaMemcpyDeviceToDevice, stream)); + top_ids_sorted = true; #else - if (n_vocab > 1024) { - return false; - } - ggml_cuda_pool_alloc sorted_ids_alloc(pool, (size_t) n_tokens * n_vocab); - int * sorted_ids = sorted_ids_alloc.get(); - argsort_f32_i32_cuda_bitonic(logits_d, sorted_ids, n_vocab, n_tokens, GGML_SORT_ORDER_DESC, stream); - CUDA_CHECK(cudaMemcpy2DAsync(top_ids, heap_k * sizeof(int), sorted_ids, n_vocab * sizeof(int), - heap_k * sizeof(int), n_tokens, cudaMemcpyDeviceToDevice, stream)); - top_ids_sorted = true; + if (n_vocab > 1024) { + return false; + } + ggml_cuda_pool_alloc sorted_ids_alloc(pool, (size_t) n_tokens * n_vocab); + int * sorted_ids = sorted_ids_alloc.get(); + argsort_f32_i32_cuda_bitonic(logits_d, sorted_ids, n_vocab, n_tokens, GGML_SORT_ORDER_DESC, stream); + CUDA_CHECK(cudaMemcpy2DAsync(top_ids, heap_k * sizeof(int), sorted_ids, n_vocab * sizeof(int), + heap_k * sizeof(int), n_tokens, cudaMemcpyDeviceToDevice, stream)); + top_ids_sorted = true; #endif - } - - if (!top_ids_sorted) { - const int heap_k_pad = next_power_of_2_host(heap_k); - const int sort_threads = std::max(32, heap_k_pad); - const size_t sort_smem = (size_t) heap_k_pad * (sizeof(int) + sizeof(float)); - diffusion_sort_top_ids_kernel<<>>( - logits_d, top_ids, n_vocab, heap_k, heap_k_pad, inv_temp); - } + } - const bool direct_self_cond_tensor = have_self_cond_tensor && !have_self_cond_host && diffusion_direct_selfcond_enabled(); - int * sc_ids_out = direct_self_cond_tensor ? (int *) result->self_cond_ids_tensor->data : scratch->sc_ids; - float * sc_probs_out = direct_self_cond_tensor ? (float *) result->self_cond_probs_tensor->data : scratch->sc_probs; + if (!top_ids_sorted) { + const int heap_k_pad = next_power_of_2_host(heap_k); + const int sort_threads = std::max(32, heap_k_pad); + const size_t sort_smem = (size_t) heap_k_pad * (sizeof(int) + sizeof(float)); + diffusion_sort_top_ids_kernel<<>>( + logits_d, top_ids, n_vocab, heap_k, heap_k_pad, inv_temp); + } - constexpr int block_size = 256; - diffusion_sample_kernel<<>>( - logits_d, top_ids, n_vocab, n_tokens, top_k, heap_k, sc_k, inv_temp, - params->seed, params->step, params->top_k_tail_correction, - scratch->sampled, scratch->argmax, scratch->entropy, - sc_ids_out, sc_probs_out); + diffusion_sample_kernel<<>>( + logits_d, top_ids, n_vocab, n_tokens, top_k, heap_k, sc_k, inv_temp, + params->seed, params->step, params->top_k_tail_correction, + scratch->sampled, scratch->argmax, scratch->entropy, + sc_ids_out, sc_probs_out); + } if (result->update_canvas_on_device) { const int update_threads = next_power_of_2_host(n_tokens); diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index edf546d8f1e2..0986ece35844 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -4,7 +4,9 @@ #include "vecdotq.cuh" #include "mma.cuh" +#include #include +#include #include using namespace ggml_cuda_mma; @@ -14,6 +16,25 @@ using namespace ggml_cuda_mma; #define MMQ_ITER_K_FP4 512 #define MMQ_NWARPS 8 +static bool mmq_env_enabled(const char * name, const bool default_value = false) { + const char * v = std::getenv(name); + return v ? std::atoi(v) != 0 : default_value; +} + +static int mmq_env_int(const char * name, const int default_value) { + const char * v = std::getenv(name); + return v ? std::atoi(v) : default_value; +} + +static int mmq_largest_divisor_leq(const int value, const int limit) { + for (int d = std::min(value, limit); d > 0; --d) { + if (value % d == 0) { + return d; + } + } + return 1; +} + typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int kbx0, const int i_max, const int stride); typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00); typedef void (*mmq_write_back_t)(const float * __restrict__ sum, const int32_t * __restrict__ get_rows_to_sorted, @@ -3972,7 +3993,31 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a const uint3 channel_ratio_fd = init_fastdiv_values(channel_ratio); const uint3 sample_ratio_fd = init_fastdiv_values(sample_ratio); - if (!args.use_stream_k) { + const int ntiles_dst = ntx * nty * ntzw; + const int tiles_nwaves = (ntiles_dst + nsm - 1) / nsm; + const int tiles_efficiency_percent = 100 * ntiles_dst / (nsm*tiles_nwaves); + + bool use_stream_k = args.use_stream_k; + int nblocks_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && tiles_efficiency_percent >= 90 ? ntiles_dst : nsm; + + if (use_stream_k && mmq_env_enabled("GGML_CUDA_MMQ_STREAM_K_DIVISOR")) { + const int divisor_min_pct = std::max(0, std::min(100, mmq_env_int("GGML_CUDA_MMQ_STREAM_K_DIVISOR_MIN_PCT", 80))); + const int divisor_min_blocks = std::max(1, nsm * divisor_min_pct / 100); + const int divisor_blocks = mmq_largest_divisor_leq(ntiles_dst, nsm); + if (divisor_blocks >= divisor_min_blocks) { + nblocks_stream_k = divisor_blocks; + } + } + + const bool stream_k_fixup_needed = ntiles_dst % nblocks_stream_k != 0; + if (use_stream_k && stream_k_fixup_needed && mmq_env_enabled("GGML_CUDA_MMQ_AVOID_FIXUP")) { + const int avoid_min_eff = std::max(0, std::min(100, mmq_env_int("GGML_CUDA_MMQ_AVOID_FIXUP_MIN_EFF", 80))); + if (tiles_efficiency_percent >= avoid_min_eff) { + use_stream_k = false; + } + } + + if (!use_stream_k) { if (args.nrows_x % mmq_y == 0) { constexpr bool need_check = false; mul_mat_q<<>> @@ -3995,14 +4040,11 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a // For the stream-k kernel it is possible to run it with tiling by setting the number of CUDA blocks equal to the number of tiles. // This is worthwhile if the efficiency of tiling is high and skipping the fixup kernel is more important. - const int ntiles_dst = ntx * nty * ntzw; - const int tiles_nwaves = (ntiles_dst + nsm - 1) / nsm; - const int tiles_efficiency_percent = 100 * ntiles_dst / (nsm*tiles_nwaves); - const dim3 block_nums_stream_k(GGML_CUDA_CC_IS_NVIDIA(cc) && tiles_efficiency_percent >= 90 ? ntiles_dst : nsm, 1, 1); + const dim3 block_nums_stream_k(nblocks_stream_k, 1, 1); GGML_ASSERT(ntiles_dst * blocks_per_ne00_fd.z < (1 << 30)); // Assert that variable kbc will not overflow. - const bool fixup_needed = ntiles_dst % block_nums_stream_k.x != 0; + const bool fixup_needed = stream_k_fixup_needed; ggml_cuda_pool & pool = ctx.pool(id); ggml_cuda_pool_alloc tmp_fixup(pool); From 027b998fc62238d9907917e07daee70942abd16e Mon Sep 17 00:00:00 2001 From: Loveneet Nigam Date: Wed, 10 Jun 2026 12:27:44 +0530 Subject: [PATCH 12/13] diffusion: add kernel optimization controls --- .../diffusion-gemma/diffusion-gemma-cli.cpp | 14 +- .../diffusion-gemma-server.cpp | 15 +- ggml/include/ggml-cuda.h | 3 + ggml/src/ggml-cuda/diffusion-sampling.cu | 170 ++++++++++++++++-- ggml/src/ggml-cuda/mmq.cuh | 8 +- src/llama-context.cpp | 87 +++++++-- src/llama-graph.cpp | 49 +++++ src/llama-graph.h | 22 +++ src/models/diffusion-gemma.cpp | 52 +++++- 9 files changed, 383 insertions(+), 37 deletions(-) diff --git a/examples/diffusion-gemma/diffusion-gemma-cli.cpp b/examples/diffusion-gemma/diffusion-gemma-cli.cpp index ad7059805a34..cd320009f7fa 100644 --- a/examples/diffusion-gemma/diffusion-gemma-cli.cpp +++ b/examples/diffusion-gemma/diffusion-gemma-cli.cpp @@ -47,6 +47,15 @@ static int env_int(const char * name, int def) { return v ? atoi(v) : def; } +static int diffusion_self_cond_top_k() { + constexpr int def = 256; + const int k = env_int("GGML_CUDA_DIFFUSION_SC_TOPK", def); + if (k <= 0) { + return def; + } + return std::min(k, def); +} + // apply the model's chat template to the user prompt (this is a chat-trained model) static std::string format_chat(llama_model * model, const std::string & prompt) { auto tmpls = common_chat_templates_init(model, ""); @@ -287,8 +296,9 @@ int main(int argc, char ** argv) { // (renormalized) softmax probabilities per position. Fed to the next decode (Option-2 graph // gather: the decoder gathers just these SC_K embedding rows and blends them, instead of a // dense full-vocab probs @ token_embd matmul). Zero probs => no self-conditioning (step 1). - // SC_K must match llama_model_diffusion_gemma::N_SC_TOPK (the graph's fixed gather width). - const int SC_K = 256; + // SC_K must match the graph's fixed gather width. Default is 256; use + // GGML_CUDA_DIFFUSION_SC_TOPK for gated experiments with a smaller sparse blend. + const int SC_K = diffusion_self_cond_top_k(); std::vector sc_ids ((size_t) SC_K * canvas_length, 0); std::vector sc_probs((size_t) SC_K * canvas_length, 0.0f); diff --git a/examples/diffusion-gemma/diffusion-gemma-server.cpp b/examples/diffusion-gemma/diffusion-gemma-server.cpp index f651e32a98bf..9b49f55b8d25 100644 --- a/examples/diffusion-gemma/diffusion-gemma-server.cpp +++ b/examples/diffusion-gemma/diffusion-gemma-server.cpp @@ -54,7 +54,7 @@ static constexpr float TEMP_MIN = 0.4f; static constexpr float TEMP_MAX = 0.8f; static constexpr float CONFIDENCE_THRESHOLD = 0.005f; static constexpr int STABILITY_THRESHOLD = 1; -static constexpr int SC_K = 256; // must match llama_model_diffusion_gemma::N_SC_TOPK +static constexpr int DEF_SC_K = 256; static constexpr int GPU_SAMPLING_MAX_TOP_K = 1024; // CUDA diffusion sampler limit static int env_int(const char * name, int def) { @@ -62,6 +62,14 @@ static int env_int(const char * name, int def) { return v ? atoi(v) : def; } +static int diffusion_self_cond_top_k() { + const int k = env_int("GGML_CUDA_DIFFUSION_SC_TOPK", DEF_SC_K); + if (k <= 0) { + return DEF_SC_K; + } + return std::min(k, DEF_SC_K); +} + // ------------------------------------------------------------------------------------------------ // per-request generation parameters and result // ------------------------------------------------------------------------------------------------ @@ -236,6 +244,7 @@ struct diffusion_server { std::vector argmax_canvas(canvas_length, -1); std::vector prev_argmax(canvas_length, -1); std::vector accepted(canvas_length); + const int SC_K = diffusion_self_cond_top_k(); std::vector sc_ids ((size_t) SC_K * canvas_length, 0); std::vector sc_probs((size_t) SC_K * canvas_length, 0.0f); std::vector generated; @@ -822,7 +831,7 @@ int main(int argc, char ** argv) { {"entropy_bound", ENTROPY_BOUND}, {"confidence_threshold", CONFIDENCE_THRESHOLD}, {"stability_threshold", STABILITY_THRESHOLD}, - {"self_cond_topk", SC_K}, + {"self_cond_topk", diffusion_self_cond_top_k()}, {"gpu_sampling", srv.use_gpu_sampling}, {"device_self_cond", srv.use_device_self_cond}, {"device_loop", srv.use_device_loop}, @@ -931,7 +940,7 @@ int main(int argc, char ** argv) { {"params", { {"canvas_length", srv.canvas_length}, {"n_denoise_steps", srv.n_steps}, - {"self_cond_topk", SC_K}, + {"self_cond_topk", diffusion_self_cond_top_k()}, {"gpu_sampling", srv.use_gpu_sampling}, {"device_self_cond", srv.use_device_self_cond}, {"device_loop", srv.use_device_loop}, diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h index cb1136ba856d..f8990bea4c10 100644 --- a/ggml/include/ggml-cuda.h +++ b/ggml/include/ggml-cuda.h @@ -52,6 +52,7 @@ struct ggml_cuda_diffusion_sample_params { uint32_t seed; uint32_t step; bool top_k_tail_correction; + float logit_softcap; }; struct ggml_cuda_diffusion_sample_result { @@ -62,6 +63,8 @@ struct ggml_cuda_diffusion_sample_result { float * self_cond_probs; struct ggml_tensor * self_cond_ids_tensor; struct ggml_tensor * self_cond_probs_tensor; + struct ggml_tensor * self_cond_embd_tensor; + const struct ggml_tensor * token_embd_tensor; struct ggml_tensor * canvas_tokens_tensor; int32_t * final_tokens; int32_t * stop; diff --git a/ggml/src/ggml-cuda/diffusion-sampling.cu b/ggml/src/ggml-cuda/diffusion-sampling.cu index d28a397fad7b..2306e68558a0 100644 --- a/ggml/src/ggml-cuda/diffusion-sampling.cu +++ b/ggml/src/ggml-cuda/diffusion-sampling.cu @@ -354,6 +354,107 @@ static __device__ __forceinline__ float diffusion_rng_uniform(uint32_t seed, uin return ((x >> 8) + 0.5f) * (1.0f / 16777216.0f); } +static __device__ __forceinline__ float diffusion_apply_logit_softcap(const float x, const float softcap) { + return softcap > 0.0f ? softcap * tanhf(x / softcap) : x; +} + +template +static __device__ __forceinline__ float diffusion_embd_to_float(const T x) { + return (float) x; +} + +template<> +__device__ __forceinline__ float diffusion_embd_to_float(const half x) { + return __half2float(x); +} + +template +static __global__ void diffusion_build_selfcond_embd_kernel( + const T * __restrict__ token_embd, + const int64_t embd_stride, + const int n_embd, + const int n_tokens, + const int sc_k, + const int * __restrict__ sc_ids, + const float * __restrict__ sc_probs, + float * __restrict__ dst) { + extern __shared__ unsigned char smem[]; + int * s_ids = (int *) smem; + float * s_probs = (float *) (s_ids + sc_k); + + const int token = blockIdx.y; + const int dim = blockIdx.x * blockDim.x + threadIdx.x; + + for (int i = threadIdx.x; i < sc_k; i += blockDim.x) { + const int off = token * sc_k + i; + s_ids[i] = sc_ids[off]; + s_probs[i] = sc_probs[off]; + } + __syncthreads(); + + if (token >= n_tokens || dim >= n_embd) { + return; + } + + float sum = 0.0f; + for (int i = 0; i < sc_k; ++i) { + const float p = s_probs[i]; + if (p != 0.0f) { + const int id = s_ids[i]; + sum += p * diffusion_embd_to_float(token_embd[(int64_t) id * embd_stride + dim]); + } + } + dst[(int64_t) token * n_embd + dim] = sum; +} + +static bool diffusion_build_selfcond_embd( + const ggml_tensor * token_embd, + const int * sc_ids, + const float * sc_probs, + const int n_tokens, + const int sc_k, + ggml_tensor * dst, + cudaStream_t stream) { + if (!token_embd || !dst || !sc_ids || !sc_probs) { + return false; + } + if (!ggml_is_contiguous(token_embd) || !ggml_is_contiguous(dst) || + token_embd->data == nullptr || dst->data == nullptr || + token_embd->buffer == nullptr || dst->buffer == nullptr || + ggml_backend_buffer_is_host(token_embd->buffer) || + ggml_backend_buffer_is_host(dst->buffer) || + dst->type != GGML_TYPE_F32 || + token_embd->ne[1] <= 0 || + dst->ne[0] != token_embd->ne[0] || + dst->ne[1] < n_tokens) { + return false; + } + + const int n_embd = (int) token_embd->ne[0]; + const int block_size = 256; + const dim3 block(block_size, 1, 1); + const dim3 grid((n_embd + block_size - 1) / block_size, n_tokens, 1); + const size_t smem = (size_t) sc_k * (sizeof(int) + sizeof(float)); + + switch (token_embd->type) { + case GGML_TYPE_F16: { + const int64_t embd_stride = token_embd->nb[1] / (int64_t) sizeof(half); + diffusion_build_selfcond_embd_kernel<<>>( + (const half *) token_embd->data, embd_stride, n_embd, n_tokens, sc_k, + sc_ids, sc_probs, (float *) dst->data); + } break; + case GGML_TYPE_F32: { + const int64_t embd_stride = token_embd->nb[1] / (int64_t) sizeof(float); + diffusion_build_selfcond_embd_kernel<<>>( + (const float *) token_embd->data, embd_stride, n_embd, n_tokens, sc_k, + sc_ids, sc_probs, (float *) dst->data); + } break; + default: + return false; + } + return true; +} + template static __global__ void diffusion_sample_topk_fused_local_kernel( const float * __restrict__ logits, @@ -362,6 +463,7 @@ static __global__ void diffusion_sample_topk_fused_local_kernel( const int top_k, const int sc_k, const float inv_temp, + const float logit_softcap, const uint32_t seed, const uint32_t step, int * __restrict__ sampled, @@ -441,6 +543,11 @@ static __global__ void diffusion_sample_topk_fused_local_kernel( } } + if (tid < top_k) { + s_vals[tid] = diffusion_apply_logit_softcap(s_vals[tid], logit_softcap); + } + __syncthreads(); + const float max_l = s_vals[0] * inv_temp; const int amax = s_ids[0]; @@ -506,6 +613,7 @@ static void diffusion_sample_topk_fused_local( const int top_k, const int sc_k, const float inv_temp, + const float logit_softcap, const uint32_t seed, const uint32_t step, int * sampled, @@ -523,7 +631,7 @@ static void diffusion_sample_topk_fused_local( const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) + 2 * block_size * sizeof(float); diffusion_sample_topk_fused_local_kernel<<>>( - logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, seed, step, + logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, logit_softcap, seed, step, sampled, argmax, entropy, sc_ids, sc_probs); } break; case 2: { @@ -531,7 +639,7 @@ static void diffusion_sample_topk_fused_local( const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) + 2 * block_size * sizeof(float); diffusion_sample_topk_fused_local_kernel<<>>( - logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, seed, step, + logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, logit_softcap, seed, step, sampled, argmax, entropy, sc_ids, sc_probs); } break; case 4: { @@ -539,7 +647,7 @@ static void diffusion_sample_topk_fused_local( const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) + 2 * block_size * sizeof(float); diffusion_sample_topk_fused_local_kernel<<>>( - logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, seed, step, + logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, logit_softcap, seed, step, sampled, argmax, entropy, sc_ids, sc_probs); } break; case 8: { @@ -547,7 +655,7 @@ static void diffusion_sample_topk_fused_local( const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) + 2 * block_size * sizeof(float); diffusion_sample_topk_fused_local_kernel<<>>( - logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, seed, step, + logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, logit_softcap, seed, step, sampled, argmax, entropy, sc_ids, sc_probs); } break; default: { @@ -555,7 +663,7 @@ static void diffusion_sample_topk_fused_local( const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) + 2 * block_size * sizeof(float); diffusion_sample_topk_fused_local_kernel<<>>( - logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, seed, step, + logits, n_vocab, n_tokens, top_k, sc_k, inv_temp, logit_softcap, seed, step, sampled, argmax, entropy, sc_ids, sc_probs); } break; } @@ -691,6 +799,7 @@ static __global__ void diffusion_sample_kernel( const int heap_k, const int sc_k, const float inv_temp, + const float logit_softcap, const uint32_t seed, const uint32_t step, const bool tail_correction, @@ -717,7 +826,7 @@ static __global__ void diffusion_sample_kernel( if (top_k == 0 || tail_correction) { for (int v = tid; v < n_vocab; v += blockDim.x) { - const float x = row_logits[v] * inv_temp; + const float x = diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp; if (x > local_max) { local_max = x; local_idx = v; @@ -726,7 +835,7 @@ static __global__ void diffusion_sample_kernel( } else { for (int i = tid; i < top_k; i += blockDim.x) { const int v = row_top[i]; - const float x = row_logits[v] * inv_temp; + const float x = diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp; if (x > local_max) { local_max = x; local_idx = v; @@ -754,7 +863,7 @@ static __global__ void diffusion_sample_kernel( if (top_k == 0 || tail_correction) { for (int v = tid; v < n_vocab; v += blockDim.x) { - const float d = row_logits[v] * inv_temp - max_l; + const float d = diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp - max_l; const float e = expf(d); local_sum += e; local_t += d * e; @@ -762,7 +871,7 @@ static __global__ void diffusion_sample_kernel( } else { for (int i = tid; i < top_k; i += blockDim.x) { const int v = row_top[i]; - const float d = row_logits[v] * inv_temp - max_l; + const float d = diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp - max_l; const float e = expf(d); local_sum += e; local_t += d * e; @@ -793,7 +902,7 @@ static __global__ void diffusion_sample_kernel( sample_z = 0.0f; for (int i = 0; i < top_k; ++i) { const int v = row_top[i]; - sample_z += expf(row_logits[v] * inv_temp - max_l); + sample_z += expf(diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp - max_l); } } @@ -803,7 +912,7 @@ static __global__ void diffusion_sample_kernel( if (top_k == 0) { for (int v = 0; v < n_vocab; ++v) { - cum += expf(row_logits[v] * inv_temp - max_l); + cum += expf(diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp - max_l); if (cum >= r) { tok = v; break; @@ -812,7 +921,7 @@ static __global__ void diffusion_sample_kernel( } else { for (int i = 0; i < top_k; ++i) { const int v = row_top[i]; - cum += expf(row_logits[v] * inv_temp - max_l); + cum += expf(diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp - max_l); if (cum >= r) { tok = v; break; @@ -827,7 +936,7 @@ static __global__ void diffusion_sample_kernel( if (i < n_sc) { const int v = row_top[i]; sc_ids[out] = v; - sc_probs[out] = expf(row_logits[v] * inv_temp - max_l) / sample_z; + sc_probs[out] = expf(diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp - max_l) / sample_z; } else { sc_ids[out] = 0; sc_probs[out] = 0.0f; @@ -883,7 +992,8 @@ bool ggml_cuda_diffusion_sample_topk( const bool have_self_cond_host = result->self_cond_ids != nullptr && result->self_cond_probs != nullptr; const bool have_self_cond_tensor = result->self_cond_ids_tensor != nullptr && result->self_cond_probs_tensor != nullptr; - if (!have_self_cond_host && !have_self_cond_tensor) { + const bool have_self_cond_embd_tensor = result->self_cond_embd_tensor != nullptr; + if (!have_self_cond_host && !have_self_cond_tensor && !have_self_cond_embd_tensor) { return false; } if ((result->self_cond_ids == nullptr) != (result->self_cond_probs == nullptr)) { @@ -910,6 +1020,27 @@ bool ggml_cuda_diffusion_sample_topk( return false; } } + if (have_self_cond_embd_tensor) { + if (result->token_embd_tensor == nullptr || + result->token_embd_tensor->data == nullptr || + result->token_embd_tensor->buffer == nullptr || + result->self_cond_embd_tensor->type != GGML_TYPE_F32 || + result->self_cond_embd_tensor->data == nullptr || + result->self_cond_embd_tensor->buffer == nullptr || + ggml_backend_buffer_is_host(result->token_embd_tensor->buffer) || + ggml_backend_buffer_is_host(result->self_cond_embd_tensor->buffer) || + !ggml_is_contiguous(result->token_embd_tensor) || + !ggml_is_contiguous(result->self_cond_embd_tensor) || + result->token_embd_tensor->ne[1] < n_vocab || + result->self_cond_embd_tensor->ne[0] != result->token_embd_tensor->ne[0] || + result->self_cond_embd_tensor->ne[1] < n_tokens) { + return false; + } + if (result->token_embd_tensor->type != GGML_TYPE_F16 && + result->token_embd_tensor->type != GGML_TYPE_F32) { + return false; + } + } const int heap_k = top_k == 0 || !diffusion_tight_topk_enabled() ? std::max(top_k, sc_k) : top_k; if (heap_k <= 0 || heap_k > 1024 || heap_k > n_vocab) { @@ -918,6 +1049,7 @@ bool ggml_cuda_diffusion_sample_topk( const float temp = params->temperature > 0.0f ? params->temperature : 1.0f; const float inv_temp = 1.0f / temp; + const float logit_softcap = params->logit_softcap > 0.0f ? params->logit_softcap : 0.0f; ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *) backend->context; ggml_cuda_set_device(ctx->device); @@ -938,7 +1070,7 @@ bool ggml_cuda_diffusion_sample_topk( top_k > 0 && top_k <= block_size && !params->top_k_tail_correction && n_vocab >= top_k; if (use_fused_topk_sample) { - diffusion_sample_topk_fused_local(logits_d, n_vocab, n_tokens, top_k, sc_k, inv_temp, + diffusion_sample_topk_fused_local(logits_d, n_vocab, n_tokens, top_k, sc_k, inv_temp, logit_softcap, params->seed, params->step, scratch->sampled, scratch->argmax, scratch->entropy, sc_ids_out, sc_probs_out, stream); } else { @@ -983,7 +1115,7 @@ bool ggml_cuda_diffusion_sample_topk( diffusion_sample_kernel<<>>( logits_d, top_ids, n_vocab, n_tokens, top_k, heap_k, sc_k, inv_temp, - params->seed, params->step, params->top_k_tail_correction, + logit_softcap, params->seed, params->step, params->top_k_tail_correction, scratch->sampled, scratch->argmax, scratch->entropy, sc_ids_out, sc_probs_out); } @@ -1038,6 +1170,12 @@ bool ggml_cuda_diffusion_sample_topk( CUDA_CHECK(cudaMemcpyAsync(result->self_cond_probs_tensor->data, scratch->sc_probs, (size_t) n_tokens * sc_k * sizeof(float), cudaMemcpyDeviceToDevice, stream)); } + if (have_self_cond_embd_tensor) { + if (!diffusion_build_selfcond_embd(result->token_embd_tensor, sc_ids_out, sc_probs_out, n_tokens, sc_k, + result->self_cond_embd_tensor, stream)) { + return false; + } + } const bool host_outputs_requested = result->sampled || result->argmax || result->entropy || (result->final_tokens && !final_tokens_after_stop) || result->stop || have_self_cond_host; diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 0986ece35844..cf2c2d2f26c2 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -4102,7 +4102,13 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda const int warp_size = ggml_cuda_info().devices[id].warp_size; const int nwarps = mmq_get_nwarps_host(cc, warp_size); - const int mmq_x_max = get_mmq_x_max_host(cc); + int mmq_x_max = get_mmq_x_max_host(cc); + if (mmq_env_enabled("GGML_CUDA_MMQ_MAX_X")) { + const int mmq_x_max_env = mmq_env_int("GGML_CUDA_MMQ_MAX_X", mmq_x_max); + if (mmq_x_max_env > 0) { + mmq_x_max = std::min(mmq_x_max, std::max(8, 8 * (mmq_x_max_env / 8))); + } + } const int mmq_y = get_mmq_y_host(cc); int mmq_x_best = 0; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index cf3b9760a546..575b647f733d 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1127,6 +1127,18 @@ void llama_context::set_embeddings_nextn(bool value, bool masked) { } void llama_context::set_diffusion_self_cond(const float * probs, int64_t n_vocab, int64_t n_tokens) { + diffusion_cond.sc_topk_device_ready = false; + diffusion_cond.sc_topk_device_ids_data = nullptr; + diffusion_cond.sc_topk_device_probs_data = nullptr; + diffusion_cond.sc_topk_device_ids_bytes = 0; + diffusion_cond.sc_topk_device_probs_bytes = 0; + diffusion_cond.sc_embd_device_ready = false; + diffusion_cond.sc_embd_device_data = nullptr; + diffusion_cond.sc_embd_device_bytes = 0; + diffusion_cond.canvas_tokens_device_ready = false; + diffusion_cond.canvas_tokens_device_data = nullptr; + diffusion_cond.canvas_tokens_device_bytes = 0; + if (probs == nullptr || n_vocab <= 0 || n_tokens <= 0) { diffusion_cond.probs.clear(); diffusion_cond.n_vocab = 0; @@ -1154,6 +1166,9 @@ void llama_context::set_diffusion_self_cond_topk(const int32_t * ids, const floa diffusion_cond.sc_topk_device_probs_data = nullptr; diffusion_cond.sc_topk_device_ids_bytes = 0; diffusion_cond.sc_topk_device_probs_bytes = 0; + diffusion_cond.sc_embd_device_ready = false; + diffusion_cond.sc_embd_device_data = nullptr; + diffusion_cond.sc_embd_device_bytes = 0; diffusion_cond.canvas_tokens_device_ready = false; diffusion_cond.canvas_tokens_device_data = nullptr; diffusion_cond.canvas_tokens_device_bytes = 0; @@ -1189,12 +1204,21 @@ static ggml_backend_cuda_diffusion_sample_topk_t get_cuda_diffusion_sample_topk_ ggml_backend_reg_get_proc_address(reg, "ggml_backend_cuda_diffusion_sample_topk"); } +static bool diffusion_cuda_fuse_final_logit_softcap_enabled() { + const char * v = std::getenv("GGML_CUDA_DIFFUSION_FUSE_FINAL_SOFTCAP"); + return v && std::strcmp(v, "0") != 0; +} + +static bool diffusion_cuda_fused_selfcond_embd_enabled() { + const char * v = std::getenv("GGML_CUDA_DIFFUSION_FUSED_SELFCOND_EMBD"); + return v && std::strcmp(v, "0") != 0; +} + static bool diffusion_decoder_inputs_device_ready( const llama_diffusion_cond & diffusion_cond, const llm_graph_result * res) { if (!diffusion_cond.decoder_phase || !diffusion_cond.canvas_tokens_device_ready || - !diffusion_cond.sc_topk_device_ready || !res) { return false; } @@ -1206,8 +1230,17 @@ static bool diffusion_decoder_inputs_device_ready( return false; } + auto * inp_sc_embd = res->get_inp_diffusion_self_cond_embd(); + if (inp_sc_embd) { + return inp_sc_embd->embd && + diffusion_cond.sc_embd_device_ready && + diffusion_cond.sc_embd_device_data == inp_sc_embd->embd->data && + diffusion_cond.sc_embd_device_bytes == ggml_nbytes(inp_sc_embd->embd); + } + auto * inp_sc = res->get_inp_diffusion_self_cond_topk(); if (!inp_sc || !inp_sc->ids || !inp_sc->probs || + !diffusion_cond.sc_topk_device_ready || diffusion_cond.sc_topk_device_ids_data != inp_sc->ids->data || diffusion_cond.sc_topk_device_probs_data != inp_sc->probs->data || diffusion_cond.sc_topk_device_ids_bytes != ggml_nbytes(inp_sc->ids) || @@ -1253,15 +1286,26 @@ bool llama_context::diffusion_sample_topk( ggml_tensor * t_self_cond_ids = nullptr; ggml_tensor * t_self_cond_probs = nullptr; + ggml_tensor * t_self_cond_embd = nullptr; + const ggml_tensor * t_token_embd = nullptr; ggml_tensor * t_canvas_tokens = nullptr; const bool device_self_cond = result->self_cond_ids == nullptr && result->self_cond_probs == nullptr; if (device_self_cond) { - auto * inp = gf_res_prev->get_inp_diffusion_self_cond_topk(); - if (!inp || !inp->ids || !inp->probs) { - return false; + auto * inp_embd = gf_res_prev->get_inp_diffusion_self_cond_embd(); + if (inp_embd && diffusion_cuda_fused_selfcond_embd_enabled()) { + t_self_cond_embd = inp_embd->embd; + t_token_embd = gf_res_prev->get_diffusion_token_embd(); + if (!t_self_cond_embd || !t_token_embd) { + return false; + } + } else { + auto * inp = gf_res_prev->get_inp_diffusion_self_cond_topk(); + if (!inp || !inp->ids || !inp->probs) { + return false; + } + t_self_cond_ids = inp->ids; + t_self_cond_probs = inp->probs; } - t_self_cond_ids = inp->ids; - t_self_cond_probs = inp->probs; } else if (result->self_cond_ids == nullptr || result->self_cond_probs == nullptr) { return false; } @@ -1273,6 +1317,10 @@ bool llama_context::diffusion_sample_topk( } } + const float fused_logit_softcap = diffusion_cuda_fuse_final_logit_softcap_enabled() + ? model.hparams.f_final_logit_softcapping + : 0.0f; + ggml_cuda_diffusion_sample_params cuda_params = { /* .n_vocab = */ (int32_t) model.vocab.n_tokens(), /* .n_tokens = */ params->n_tokens, @@ -1282,6 +1330,7 @@ bool llama_context::diffusion_sample_topk( /* .seed = */ params->seed, /* .step = */ params->step, /* .top_k_tail_correction = */ params->top_k_tail_correction, + /* .logit_softcap = */ fused_logit_softcap, }; ggml_cuda_diffusion_sample_result cuda_result = { @@ -1292,6 +1341,8 @@ bool llama_context::diffusion_sample_topk( /* .self_cond_probs = */ result->self_cond_probs, /* .self_cond_ids_tensor = */ t_self_cond_ids, /* .self_cond_probs_tensor = */ t_self_cond_probs, + /* .self_cond_embd_tensor = */ t_self_cond_embd, + /* .token_embd_tensor = */ t_token_embd, /* .canvas_tokens_tensor = */ t_canvas_tokens, /* .final_tokens = */ (int32_t *) result->final_tokens, /* .stop = */ result->stop, @@ -1309,11 +1360,25 @@ bool llama_context::diffusion_sample_topk( diffusion_cond.sc_topk = params->self_cond_top_k; diffusion_cond.sc_topk_ids.clear(); diffusion_cond.sc_topk_probs.clear(); - diffusion_cond.sc_topk_device_ready = true; - diffusion_cond.sc_topk_device_ids_data = t_self_cond_ids->data; - diffusion_cond.sc_topk_device_probs_data = t_self_cond_probs->data; - diffusion_cond.sc_topk_device_ids_bytes = ggml_nbytes(t_self_cond_ids); - diffusion_cond.sc_topk_device_probs_bytes = ggml_nbytes(t_self_cond_probs); + diffusion_cond.sc_topk_device_ready = false; + diffusion_cond.sc_topk_device_ids_data = nullptr; + diffusion_cond.sc_topk_device_probs_data = nullptr; + diffusion_cond.sc_topk_device_ids_bytes = 0; + diffusion_cond.sc_topk_device_probs_bytes = 0; + diffusion_cond.sc_embd_device_ready = false; + diffusion_cond.sc_embd_device_data = nullptr; + diffusion_cond.sc_embd_device_bytes = 0; + if (t_self_cond_embd) { + diffusion_cond.sc_embd_device_ready = true; + diffusion_cond.sc_embd_device_data = t_self_cond_embd->data; + diffusion_cond.sc_embd_device_bytes = ggml_nbytes(t_self_cond_embd); + } else { + diffusion_cond.sc_topk_device_ready = true; + diffusion_cond.sc_topk_device_ids_data = t_self_cond_ids->data; + diffusion_cond.sc_topk_device_probs_data = t_self_cond_probs->data; + diffusion_cond.sc_topk_device_ids_bytes = ggml_nbytes(t_self_cond_ids); + diffusion_cond.sc_topk_device_probs_bytes = ggml_nbytes(t_self_cond_probs); + } } if (ok && result->update_canvas_on_device) { diffusion_cond.canvas_tokens_device_ready = true; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index eee04a30de2e..7b21baf75d6c 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -453,6 +453,28 @@ void llm_graph_input_diffusion_self_cond_topk::set_input(const llama_ubatch * ub } } +void llm_graph_input_diffusion_self_cond_embd::set_input(const llama_ubatch * ubatch) { + GGML_UNUSED(ubatch); + + if (!embd) { + return; + } + assert(embd->type == GGML_TYPE_F32); + + const size_t n_bytes = ggml_nbytes(embd); + const bool have_device = diffusion + && diffusion->sc_embd_device_ready + && diffusion->sc_embd_device_data == embd->data + && diffusion->sc_embd_device_bytes == n_bytes; + + if (have_device) { + return; + } + + std::vector zeros(n_bytes, 0); + ggml_backend_tensor_set(embd, zeros.data(), 0, n_bytes); +} + template static void print_mask(const T * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) { LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__); @@ -1021,6 +1043,9 @@ void llm_graph_result::reset() { t_logits = nullptr; t_embd = nullptr; t_embd_pooled = nullptr; + t_h_nextn = nullptr; + t_h_pre_norm = nullptr; + t_diffusion_token_embd = nullptr; t_sampled.clear(); t_sampled_probs.clear(); t_sampled_logits.clear(); @@ -1059,6 +1084,16 @@ llm_graph_input_diffusion_self_cond_topk * llm_graph_result::get_inp_diffusion_s return nullptr; } +llm_graph_input_diffusion_self_cond_embd * llm_graph_result::get_inp_diffusion_self_cond_embd() const { + for (auto & input : inputs) { + if (auto * embd = dynamic_cast(input.get())) { + return embd; + } + } + + return nullptr; +} + void llm_graph_result::set_outputs() { if (t_logits != nullptr) { ggml_set_output(t_logits); @@ -2193,6 +2228,20 @@ llm_graph_input_diffusion_self_cond_topk * llm_graph_context::build_inp_diffusio return ptr; } +ggml_tensor * llm_graph_context::build_inp_diffusion_self_cond_embd(int64_t n_embd) const { + auto inp = std::make_unique(diffusion); + + auto & cur = inp->embd; + + cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); + ggml_set_input(cur); + set_diffusion_input_backend(cur); + + res->add_input(std::move(inp)); + + return cur; +} + ggml_tensor * llm_graph_context::build_inp_pos_bucket_enc() const { auto inp = std::make_unique(hparams); diff --git a/src/llama-graph.h b/src/llama-graph.h index 838bbce24b86..446e8162cf3c 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -99,6 +99,10 @@ struct llama_diffusion_cond { size_t sc_topk_device_ids_bytes = 0; size_t sc_topk_device_probs_bytes = 0; + bool sc_embd_device_ready = false; + void * sc_embd_device_data = nullptr; + size_t sc_embd_device_bytes = 0; + bool canvas_tokens_device_ready = false; void * canvas_tokens_device_data = nullptr; size_t canvas_tokens_device_bytes = 0; @@ -351,6 +355,20 @@ class llm_graph_input_diffusion_self_cond_topk : public llm_graph_input_i { const llama_diffusion_cond * diffusion; }; +// Dense diffusion self-conditioning embedding input. This is filled by the CUDA sampler from +// top-k ids/probs when GGML_CUDA_DIFFUSION_FUSED_SELFCOND_EMBD is enabled. +class llm_graph_input_diffusion_self_cond_embd : public llm_graph_input_i { +public: + llm_graph_input_diffusion_self_cond_embd(const llama_diffusion_cond * diffusion) : diffusion(diffusion) {} + virtual ~llm_graph_input_diffusion_self_cond_embd() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * embd = nullptr; // F32 [n_embd, n_tokens] + + const llama_diffusion_cond * diffusion; +}; + class llm_graph_input_attn_no_cache : public llm_graph_input_i { public: llm_graph_input_attn_no_cache(const llama_hparams & hparams, const llama_cparams & cparams) : @@ -796,7 +814,9 @@ class llm_graph_result { ggml_tensor * get_embd_pooled() const { return t_embd_pooled; } ggml_tensor * get_h_nextn() const { return t_h_nextn; } ggml_tensor * get_h_pre_norm() const { return t_h_pre_norm; } + ggml_tensor * get_diffusion_token_embd() const { return t_diffusion_token_embd; } llm_graph_input_diffusion_self_cond_topk * get_inp_diffusion_self_cond_topk() const; + llm_graph_input_diffusion_self_cond_embd * get_inp_diffusion_self_cond_embd() const; ggml_cgraph * get_gf() const { return gf; } ggml_context * get_ctx() const { return ctx_compute.get(); } @@ -827,6 +847,7 @@ class llm_graph_result { ggml_tensor * t_embd_pooled = nullptr; ggml_tensor * t_h_nextn = nullptr; // [n_embd, n_outputs] hidden state before final output norm ggml_tensor * t_h_pre_norm = nullptr; // [n_embd, n_outputs] hidden state before final output norm + ggml_tensor * t_diffusion_token_embd = nullptr; // on-device F16 token embedding used by diffusion CUDA helpers std::map t_sampled_logits; std::map t_candidates; @@ -1039,6 +1060,7 @@ struct llm_graph_context { ggml_tensor * build_inp_diffusion_self_cond(int64_t n_vocab) const; // F32 [n_vocab, n_tokens] // sparse self-cond: returns the input object exposing ->ids (I32 [k,n_tokens]) and ->probs (F32 [k,n_tokens]) llm_graph_input_diffusion_self_cond_topk * build_inp_diffusion_self_cond_topk(int64_t k) const; + ggml_tensor * build_inp_diffusion_self_cond_embd(int64_t n_embd) const; // F32 [n_embd, n_tokens] ggml_tensor * build_inp_pos() const; ggml_tensor * build_inp_attn_scale() const; ggml_tensor * build_inp_out_ids() const; diff --git a/src/models/diffusion-gemma.cpp b/src/models/diffusion-gemma.cpp index fe521082e542..b8b1091cbb26 100644 --- a/src/models/diffusion-gemma.cpp +++ b/src/models/diffusion-gemma.cpp @@ -3,6 +3,42 @@ #include "ggml-backend.h" #include "ggml-alloc.h" +#include +#include +#include + +static bool diffusion_gemma_fuse_final_logit_softcap_enabled() { + const char * v = std::getenv("GGML_CUDA_DIFFUSION_FUSE_FINAL_SOFTCAP"); + return v && std::strcmp(v, "0") != 0; +} + +static bool diffusion_gemma_env_enabled_default(const char * name, bool def) { + const char * v = std::getenv(name); + if (!v) { + return def; + } + return std::strcmp(v, "0") != 0; +} + +static bool diffusion_gemma_fused_selfcond_embd_enabled() { + return diffusion_gemma_env_enabled_default("GGML_CUDA_DIFFUSION_FUSED_SELFCOND_EMBD", false) && + diffusion_gemma_env_enabled_default("DG_DEVICE_SELFCOND", true); +} + +static int64_t diffusion_gemma_self_cond_top_k() { + const int64_t def = llama_model_diffusion_gemma::N_SC_TOPK; + const char * v = std::getenv("GGML_CUDA_DIFFUSION_SC_TOPK"); + if (!v) { + return def; + } + + const int64_t k = std::atoll(v); + if (k <= 0) { + return def; + } + return std::min(k, def); +} + // diffusion_gemma reuses the gemma4 decoder block (tensor layout + per-layer math) but runs // as a bidirectional (non-causal) block-diffusion denoiser over a canvas, with KV-cache reuse: // the prompt / previously-finalized canvases form a causal, read-only prefix in the unified @@ -123,7 +159,7 @@ void llama_model_diffusion_gemma::load_arch_post(llama_model_loader & ml) { __func__, (long long) n_embd_t, (long long) n_vocab_t, ggml_nbytes(tok_embd_gpu) / (1024.0 * 1024.0 * 1024.0), ggml_backend_buffer_name(tok_embd_gpu->buffer), - (long long) N_SC_TOPK); + (long long) diffusion_gemma_self_cond_top_k()); return; } LLAMA_LOG_WARN("%s: failed to allocate on-device gather embedding; falling back to dense matmul\n", __func__); @@ -186,6 +222,9 @@ ggml_tensor * llama_model_diffusion_gemma::graph_base::build_input(bool is_decod const auto & dmodel = static_cast(model); ggml_tensor * inpL = build_inp_embd(dmodel.tok_embd_gpu ? dmodel.tok_embd_gpu : model.tok_embd); + if (dmodel.tok_embd_gpu) { + res->t_diffusion_token_embd = dmodel.tok_embd_gpu; + } // scaled word embeddings (sqrt(hidden_size)); raw embeddings input is not scaled inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f); @@ -194,12 +233,14 @@ ggml_tensor * llama_model_diffusion_gemma::graph_base::build_input(bool is_decod if (is_decoder) { ggml_tensor * soft; // soft-embedding {n_embd, n_tokens}: blend of the previous step's // predicted token embeddings, scaled by sqrt(n_embd) - if (dmodel.tok_embd_gpu) { + if (dmodel.tok_embd_gpu && diffusion_gemma_fused_selfcond_embd_enabled()) { + soft = build_inp_diffusion_self_cond_embd(n_embd); + } else if (dmodel.tok_embd_gpu) { // Sparse gather path (Option-2): the previous step's top-k token ids+probs are fed per // position; gather just those k embedding rows and blend them, instead of the dense // full-vocab `probs @ token_embd` matmul. Gather width is fixed (N_SC_TOPK) so the // graph shape is constant; unused slots carry prob 0 (the CLI zero-pads). - const int64_t k = llama_model_diffusion_gemma::N_SC_TOPK; + const int64_t k = diffusion_gemma_self_cond_top_k(); auto * inp = build_inp_diffusion_self_cond_topk(k); ggml_tensor * ids = inp->ids; // I32 {k*n_tokens} ggml_tensor * probs = inp->probs; // F32 {k, n_tokens} @@ -419,7 +460,10 @@ void llama_model_diffusion_gemma::graph_base::build_transformer(ggml_tensor * in cur = build_lora_mm(model.output, cur, model.output_s); - if (hparams.f_final_logit_softcapping) { + const bool fuse_final_softcap = + diffusion && diffusion->decoder_phase && diffusion_gemma_fuse_final_logit_softcap_enabled(); + + if (hparams.f_final_logit_softcapping && !fuse_final_softcap) { cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); cur = ggml_tanh(ctx0, cur); cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping); From 89856ffdfcfe339356ea81277dd51cfd7f2dabac Mon Sep 17 00:00:00 2001 From: Loveneet Nigam Date: Thu, 11 Jun 2026 10:51:19 +0530 Subject: [PATCH 13/13] diffusion: expose tuning switches as options --- common/arg.cpp | 95 +++++ common/common.cpp | 5 + common/common.h | 23 ++ .../diffusion-gemma/diffusion-gemma-cli.cpp | 116 ++++-- .../diffusion-gemma-server.cpp | 104 +++++- ggml/include/ggml-cuda.h | 8 + ggml/src/ggml-backend.cpp | 2 + ggml/src/ggml-cuda/diffusion-sampling.cu | 340 +++++++++++++++--- ggml/src/ggml-cuda/ggml-cuda.cu | 10 +- ggml/src/ggml-impl.h | 18 + ggml/src/ggml.c | 3 + include/llama.h | 13 + scripts/collect_diffusion_ncu_admin.ps1 | 186 ++++++++++ src/llama-context.cpp | 33 +- src/llama-graph.cpp | 19 +- src/llama-graph.h | 8 +- src/models/diffusion-gemma.cpp | 48 +-- 17 files changed, 885 insertions(+), 146 deletions(-) create mode 100644 scripts/collect_diffusion_ncu_admin.ps1 diff --git a/common/arg.cpp b/common/arg.cpp index f5d8fb4acf8b..895a9eaadc69 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3865,6 +3865,101 @@ common_params_context common_params_parser_init(common_params & params, llama_ex string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.diffusion.visual_mode ? "true" : "false"), [](common_params & params) { params.diffusion.visual_mode = true; } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--no-diffusion-gpu-sampling"}, + "disable CUDA block-diffusion sampling fast path", + [](common_params & params) { params.diffusion.gpu_sampling = false; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--no-diffusion-device-selfcond"}, + "disable device-resident block-diffusion self-conditioning", + [](common_params & params) { params.diffusion.device_self_cond = false; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--no-diffusion-device-denoise-loop"}, + "disable device-side block-diffusion canvas and stop-state updates", + [](common_params & params) { params.diffusion.device_denoise_loop = false; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--diffusion-pin-host-outputs"}, + "register compact diffusion output buffers as pinned host memory", + [](common_params & params) { params.diffusion.pin_host_outputs = true; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--diffusion-self-cond-top-k"}, "N", + string_format("block-diffusion sparse self-conditioning width (default: %d)", params.diffusion.self_cond_top_k), + [](common_params & params, int value) { params.diffusion.self_cond_top_k = value; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--diffusion-input-gpu-groups"}, "N", + string_format("bitmask of block-diffusion decoder input groups assigned to GPU backend (default: %u)", params.diffusion.input_gpu_groups), + [](common_params & params, int value) { params.diffusion.input_gpu_groups = (uint32_t) std::max(value, 0); } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--diffusion-default-top-k"}, "N", + "block-diffusion top-k used when --top-k is not explicitly provided", + [](common_params & params, int value) { params.diffusion.default_top_k = value; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--diffusion-force-top-k"}, "N", + "block-diffusion server: override per-request top_k when N > 0", + [](common_params & params, int value) { params.diffusion.force_top_k = value; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--diffusion-fused-self-cond-embd"}, + "use fused device self-conditioning embedding input for block diffusion", + [](common_params & params) { params.diffusion.fused_self_cond_embd = true; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--diffusion-fuse-final-softcap"}, + "move final logit softcap into the CUDA diffusion sampling kernel", + [](common_params & params) { params.diffusion.fuse_final_logit_softcap = true; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--diffusion-separate-encoder-decoder"}, + "build separate block-diffusion encoder and decoder graph variants", + [](common_params & params) { params.diffusion.separate_encoder_decoder = true; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--diffusion-cuda-direct-self-cond"}, + "write CUDA diffusion self-conditioning directly into decoder graph inputs", + [](common_params & params) { params.diffusion.cuda_direct_self_cond = true; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--diffusion-cuda-final-tokens-on-stop"}, + "copy final diffusion tokens only when the device stop condition is reached", + [](common_params & params) { params.diffusion.cuda_final_tokens_on_stop = true; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--diffusion-cuda-fused-top-k-sample"}, + "fuse CUDA diffusion top-k selection and sampling", + [](common_params & params) { params.diffusion.cuda_fused_top_k_sample = true; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--diffusion-cuda-tight-top-k"}, + "avoid extra CUDA diffusion top-k scratch width when possible", + [](common_params & params) { params.diffusion.cuda_tight_top_k = true; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--diffusion-cuda-parallel-full-softmax"}, + "parallelize CUDA diffusion full-vocab sampling when top-k is 0", + [](common_params & params) { params.diffusion.cuda_parallel_full_softmax = true; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--diffusion-cuda-fused-full-softmax"}, + "fuse CUDA diffusion full-vocab softmax sampling and self-conditioning", + [](common_params & params) { params.diffusion.cuda_fused_full_softmax = true; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--diffusion-cuda-top-k-local-k"}, "N", + "CUDA diffusion local top-k candidates per thread (0 = backend default)", + [](common_params & params, int value) { params.diffusion.cuda_top_k_local_k = value; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--no-diffusion-cuda-fast-top-k"}, + "disable CUDA diffusion CUB/fast top-k selection path", + [](common_params & params) { params.diffusion.cuda_fast_top_k = false; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); add_opt(common_arg( {"--top-k-start"}, "N", "block-diffusion: anneal top-k from N at the first (high-entropy) denoising step (with --top-k-end)", diff --git a/common/common.cpp b/common/common.cpp index b01772e1cbfe..23209395dbef 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1585,6 +1585,11 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.op_offload = !params.no_op_offload; cparams.swa_full = params.swa_full; cparams.kv_unified = params.kv_unified; + cparams.diffusion_self_cond_top_k = params.diffusion.self_cond_top_k; + cparams.diffusion_input_gpu_groups = params.diffusion.input_gpu_groups; + cparams.diffusion_fused_self_cond_embd = params.diffusion.fused_self_cond_embd; + cparams.diffusion_fuse_final_logit_softcap = params.diffusion.fuse_final_logit_softcap; + cparams.diffusion_separate_encoder_decoder = params.diffusion.separate_encoder_decoder; cparams.type_k = params.cache_type_k; cparams.type_v = params.cache_type_v; diff --git a/common/common.h b/common/common.h index b85ac5701e63..d702febf2137 100644 --- a/common/common.h +++ b/common/common.h @@ -381,6 +381,10 @@ struct common_params_vocoder { struct common_params_diffusion { int32_t steps = 128; bool visual_mode = false; + bool gpu_sampling = true; // use CUDA diffusion sampling fast path when available + bool device_self_cond = true; // keep diffusion self-conditioning state on device + bool device_denoise_loop = true; // update diffusion canvas/stop state on device + bool pin_host_outputs = false; // register compact D2H output buffers as pinned host memory float eps = 0; // epsilon for timesteps int32_t block_length = 0; // block length for generation @@ -395,6 +399,25 @@ struct common_params_diffusion { int32_t top_k_start = 0; // anneal top-k from this (first/high-entropy step) ... int32_t top_k_end = 0; // ... to this (last step); both > 0 enables annealing bool top_k_tail_correction = false; // use exact full-vocab entropy for accept/stop + int32_t default_top_k = 0; // top-k used when --top-k is not explicitly provided + int32_t force_top_k = 0; // server: override per-request top_k when > 0 + int32_t self_cond_top_k = 256; // sparse self-conditioning gather width + uint32_t input_gpu_groups = 63; // decoder input tensor groups assigned to GPU backend + + // CUDA diffusion sampling fast-path knobs. Defaults preserve behavior when no tuning flags are passed. + bool cuda_fast_top_k = true; + bool cuda_direct_self_cond = false; + bool cuda_final_tokens_on_stop = false; + bool cuda_fused_top_k_sample = false; + bool cuda_tight_top_k = false; + bool cuda_parallel_full_softmax = false; + bool cuda_fused_full_softmax = false; + int32_t cuda_top_k_local_k = 0; // 0 = backend default + + // Diffusion graph-shape knobs. + bool fused_self_cond_embd = false; + bool fuse_final_logit_softcap = false; + bool separate_encoder_decoder = false; }; // reasoning API response format (not to be confused as chat template's reasoning format) diff --git a/examples/diffusion-gemma/diffusion-gemma-cli.cpp b/examples/diffusion-gemma/diffusion-gemma-cli.cpp index cd320009f7fa..20aff7e7b6e1 100644 --- a/examples/diffusion-gemma/diffusion-gemma-cli.cpp +++ b/examples/diffusion-gemma/diffusion-gemma-cli.cpp @@ -21,11 +21,15 @@ #include "log.h" #include "mtmd.h" #include "mtmd-helper.h" +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif #include #include #include #include +#include #include #include #include @@ -42,14 +46,34 @@ static constexpr float CONFIDENCE_THRESHOLD = 0.005f; // StableAndConfident.co static constexpr int STABILITY_THRESHOLD = 1; // StableAndConfident.stability_threshold static constexpr int GPU_SAMPLING_MAX_TOP_K = 1024; // small top-k sort kernel limit -static int env_int(const char * name, int def) { - const char * v = getenv(name); - return v ? atoi(v) : def; -} +#ifdef GGML_USE_CUDA +struct diffusion_cuda_host_pin { + diffusion_cuda_host_pin(void * ptr, size_t size, bool enabled) : ptr(ptr), size(size) { + registered = enabled && ptr && size && ggml_backend_cuda_register_host_buffer(ptr, size); + } + + ~diffusion_cuda_host_pin() { + if (registered) { + ggml_backend_cuda_unregister_host_buffer(ptr); + } + } + + diffusion_cuda_host_pin(const diffusion_cuda_host_pin &) = delete; + diffusion_cuda_host_pin & operator=(const diffusion_cuda_host_pin &) = delete; -static int diffusion_self_cond_top_k() { + void * ptr = nullptr; + size_t size = 0; + bool registered = false; +}; +#else +struct diffusion_cuda_host_pin { + diffusion_cuda_host_pin(void *, size_t, bool) {} +}; +#endif + +static int diffusion_self_cond_top_k(const common_params & params) { constexpr int def = 256; - const int k = env_int("GGML_CUDA_DIFFUSION_SC_TOPK", def); + const int k = params.diffusion.self_cond_top_k; if (k <= 0) { return def; } @@ -68,22 +92,44 @@ static std::string format_chat(llama_model * model, const std::string & prompt) return common_chat_templates_apply(tmpls.get(), inputs).prompt; } +static void diffusion_gemma_print_usage(int, char **) { + printf("\nDiffusion-Gemma options:\n"); + printf(" --diffusion-timing print diffusion decode/sample timing breakdown\n"); +} + int main(int argc, char ** argv) { + bool log_step_timing = false; + std::vector fwd; + fwd.push_back(argv[0]); + for (int i = 1; i < argc; ++i) { + const std::string arg = argv[i]; + if (arg == "--diffusion-timing") { + log_step_timing = true; + } else { + fwd.push_back(argv[i]); + } + } + common_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DIFFUSION)) { + params.diffusion.steps = DEF_MAX_DENOISE_STEPS; + if (!common_params_parse((int) fwd.size(), fwd.data(), params, LLAMA_EXAMPLE_DIFFUSION, diffusion_gemma_print_usage)) { return 1; } common_init(); + if (!params.diffusion.device_self_cond && params.diffusion.fused_self_cond_embd) { + LOG_ERR("--no-diffusion-device-selfcond cannot be used with --diffusion-fused-self-cond-embd\n"); + return 1; + } - // diffusion config (env-overridable for quick CPU testing) - // canvas_length is fixed at the trained block size (256); overriding it is for experiments only. - const int canvas_length = env_int("DG_CANVAS", DEF_CANVAS_LENGTH); - const int n_steps = env_int("DG_STEPS", DEF_MAX_DENOISE_STEPS); + // diffusion config + // canvas_length is fixed at the trained block size (256). + const int canvas_length = DEF_CANVAS_LENGTH; + const int n_steps = std::max(params.diffusion.steps, 1); // number of autoregressive canvas blocks = ceil(-n / canvas_length), i.e. -n is the total // number of tokens to generate (e.g. -n 256 -> 1 canvas, -n 512 -> 2, -n 1024 -> 4). The - // model may stop earlier on an EOG token. DG_MAX_CANVASES overrides; default (no -n) is 1. + // model may stop earlier on an EOG token; default (no -n) is 1 canvas. const int blocks_from_n = params.n_predict > 0 ? (params.n_predict + canvas_length - 1) / canvas_length : 1; - const int max_canvases = std::max(env_int("DG_MAX_CANVASES", blocks_from_n), 1); // autoregressive blocks + const int max_canvases = std::max(blocks_from_n, 1); // autoregressive blocks const float entropy_bound = ENTROPY_BOUND; // top-k host sampling (CLI flags; default = full softmax over the whole vocab): @@ -94,10 +140,11 @@ int main(int argc, char ** argv) { // --top-k uses its own "0 = disabled" convention and is applied only when explicitly passed. const int topk_fixed = (params.sampling.user_sampling_config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K) ? params.sampling.top_k - : 0; + : std::max(params.diffusion.default_top_k, 0); const int topk_start = params.diffusion.top_k_start; const int topk_end = params.diffusion.top_k_end; const int topk_tail = params.diffusion.top_k_tail_correction ? 1 : 0; + const int SC_K = diffusion_self_cond_top_k(params); llama_backend_init(); @@ -198,6 +245,11 @@ int main(int argc, char ** argv) { ctx_params.n_batch = n_ub; ctx_params.n_ubatch = n_ub; ctx_params.no_perf = params.no_perf; + ctx_params.diffusion_self_cond_top_k = SC_K; + ctx_params.diffusion_input_gpu_groups = params.diffusion.input_gpu_groups; + ctx_params.diffusion_fused_self_cond_embd = params.diffusion.fused_self_cond_embd; + ctx_params.diffusion_fuse_final_logit_softcap = params.diffusion.fuse_final_logit_softcap; + ctx_params.diffusion_separate_encoder_decoder = params.diffusion.separate_encoder_decoder; llama_context * ctx = llama_init_from_model(model, ctx_params); if (!ctx) { @@ -211,13 +263,13 @@ int main(int argc, char ** argv) { const int topk_max_requested = (topk_start > 0 && topk_end > 0) ? std::max(topk_start, topk_end) : topk_fixed; - const bool gpu_sampling_requested = env_int("DG_GPU_SAMPLING", 1) != 0; + const bool gpu_sampling_requested = params.diffusion.gpu_sampling; const bool gpu_sampling_topk_ok = topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K; const bool use_gpu_sampling = gpu_sampling_requested && gpu_sampling_topk_ok && llama_diffusion_sample_topk_supported(ctx); - const bool use_device_self_cond = use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0; - const bool use_device_loop = use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0; + const bool use_device_self_cond = use_gpu_sampling && params.diffusion.device_self_cond; + const bool use_device_loop = use_device_self_cond && params.diffusion.device_denoise_loop; const int device_early_stop_interval = use_device_loop ? 1 : 0; llama_set_diffusion_gpu_sampling(ctx, use_gpu_sampling); @@ -226,7 +278,7 @@ int main(int argc, char ** argv) { LOG_INF("diffusion-gemma: gpu sampling: %s%s%s%s\n", use_gpu_sampling ? "on" : "off", (!gpu_sampling_topk_ok ? " (top-k exceeds CUDA fast-path limit)" : - (!gpu_sampling_requested ? " (disabled by DG_GPU_SAMPLING=0)" : "")), + (!gpu_sampling_requested ? " (disabled by --no-diffusion-gpu-sampling)" : "")), use_device_self_cond ? " | device self-cond: on" : "", use_device_loop ? " | device loop: on" : ""); if (device_early_stop_interval > 0) { @@ -292,16 +344,21 @@ int main(int argc, char ** argv) { std::vector argmax_canvas(canvas_length, -1); std::vector prev_argmax(canvas_length, -1); std::vector accepted(canvas_length); + int32_t stop_flag = 0; // sparse self-conditioning over the canvas: the previous step's top-SC_K token ids + their // (renormalized) softmax probabilities per position. Fed to the next decode (Option-2 graph // gather: the decoder gathers just these SC_K embedding rows and blends them, instead of a // dense full-vocab probs @ token_embd matmul). Zero probs => no self-conditioning (step 1). // SC_K must match the graph's fixed gather width. Default is 256; use - // GGML_CUDA_DIFFUSION_SC_TOPK for gated experiments with a smaller sparse blend. - const int SC_K = diffusion_self_cond_top_k(); + // --diffusion-self-cond-top-k can be used for gated experiments with a smaller sparse blend. std::vector sc_ids ((size_t) SC_K * canvas_length, 0); std::vector sc_probs((size_t) SC_K * canvas_length, 0.0f); + const bool pin_host_outputs = params.diffusion.pin_host_outputs; + diffusion_cuda_host_pin pin_argmax_canvas( + argmax_canvas.data(), argmax_canvas.size() * sizeof(argmax_canvas[0]), pin_host_outputs); + diffusion_cuda_host_pin pin_stop_flag(&stop_flag, sizeof(stop_flag), pin_host_outputs); + // all generated tokens across the autoregressive canvas blocks std::vector generated; @@ -310,7 +367,6 @@ int main(int argc, char ** argv) { int n_blocks_run = 0; int n_steps_total = 0; const auto t_gen_start = std::chrono::steady_clock::now(); - const bool log_step_timing = env_int("DG_TIMING", 0) != 0; double decode_enqueue_s = 0.0; double sample_sync_s = 0.0; double host_loop_s = 0.0; @@ -373,7 +429,7 @@ int main(int argc, char ** argv) { const bool use_device_early_stop = device_early_stop_interval > 0; const bool reset_stop_state = use_device_early_stop && block_step == 1; const bool check_stop = use_device_early_stop && !final_step; - int32_t stop_flag = 0; + stop_flag = 0; const auto t_sample_start = std::chrono::steady_clock::now(); llama_diffusion_sample_params sample_params = { /* .n_tokens = */ canvas_length, @@ -383,6 +439,14 @@ int main(int argc, char ** argv) { /* .seed = */ params.sampling.seed == LLAMA_DEFAULT_SEED ? 1234u : params.sampling.seed, /* .step = */ (uint32_t) n_steps_total, /* .top_k_tail_correction = */ topk_tail != 0, + /* .cuda_fast_top_k = */ params.diffusion.cuda_fast_top_k, + /* .cuda_direct_self_cond = */ params.diffusion.cuda_direct_self_cond, + /* .cuda_final_tokens_on_stop = */ params.diffusion.cuda_final_tokens_on_stop, + /* .cuda_fused_top_k_sample = */ params.diffusion.cuda_fused_top_k_sample, + /* .cuda_tight_top_k = */ params.diffusion.cuda_tight_top_k, + /* .cuda_parallel_full_softmax = */ params.diffusion.cuda_parallel_full_softmax, + /* .cuda_fused_full_softmax = */ params.diffusion.cuda_fused_full_softmax, + /* .cuda_top_k_local_k = */ params.diffusion.cuda_top_k_local_k, }; llama_diffusion_sample_result sample_result = { /* .sampled = */ nullptr, @@ -433,6 +497,14 @@ int main(int argc, char ** argv) { /* .seed = */ params.sampling.seed == LLAMA_DEFAULT_SEED ? 1234u : params.sampling.seed, /* .step = */ (uint32_t) n_steps_total, /* .top_k_tail_correction = */ topk_tail != 0, + /* .cuda_fast_top_k = */ params.diffusion.cuda_fast_top_k, + /* .cuda_direct_self_cond = */ params.diffusion.cuda_direct_self_cond, + /* .cuda_final_tokens_on_stop = */ params.diffusion.cuda_final_tokens_on_stop, + /* .cuda_fused_top_k_sample = */ params.diffusion.cuda_fused_top_k_sample, + /* .cuda_tight_top_k = */ params.diffusion.cuda_tight_top_k, + /* .cuda_parallel_full_softmax = */ params.diffusion.cuda_parallel_full_softmax, + /* .cuda_fused_full_softmax = */ params.diffusion.cuda_fused_full_softmax, + /* .cuda_top_k_local_k = */ params.diffusion.cuda_top_k_local_k, }; llama_diffusion_sample_result sample_result = { /* .sampled = */ sampled.data(), diff --git a/examples/diffusion-gemma/diffusion-gemma-server.cpp b/examples/diffusion-gemma/diffusion-gemma-server.cpp index 9b49f55b8d25..97f5b99ce65b 100644 --- a/examples/diffusion-gemma/diffusion-gemma-server.cpp +++ b/examples/diffusion-gemma/diffusion-gemma-server.cpp @@ -18,6 +18,9 @@ #include "common.h" #include "llama.h" #include "log.h" +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif #include #include @@ -57,13 +60,33 @@ static constexpr int STABILITY_THRESHOLD = 1; static constexpr int DEF_SC_K = 256; static constexpr int GPU_SAMPLING_MAX_TOP_K = 1024; // CUDA diffusion sampler limit -static int env_int(const char * name, int def) { - const char * v = getenv(name); - return v ? atoi(v) : def; -} +#ifdef GGML_USE_CUDA +struct diffusion_cuda_host_pin { + diffusion_cuda_host_pin(void * ptr, size_t size, bool enabled) : ptr(ptr), size(size) { + registered = enabled && ptr && size && ggml_backend_cuda_register_host_buffer(ptr, size); + } + + ~diffusion_cuda_host_pin() { + if (registered) { + ggml_backend_cuda_unregister_host_buffer(ptr); + } + } + + diffusion_cuda_host_pin(const diffusion_cuda_host_pin &) = delete; + diffusion_cuda_host_pin & operator=(const diffusion_cuda_host_pin &) = delete; + + void * ptr = nullptr; + size_t size = 0; + bool registered = false; +}; +#else +struct diffusion_cuda_host_pin { + diffusion_cuda_host_pin(void *, size_t, bool) {} +}; +#endif -static int diffusion_self_cond_top_k() { - const int k = env_int("GGML_CUDA_DIFFUSION_SC_TOPK", DEF_SC_K); +static int diffusion_self_cond_top_k(const common_params & params) { + const int k = params.diffusion.self_cond_top_k; if (k <= 0) { return DEF_SC_K; } @@ -170,6 +193,7 @@ struct diffusion_server { bool use_device_self_cond = false; bool use_device_loop = false; int device_early_stop_interval = 0; + common_params_diffusion diffusion; std::string model_id; std::string model_path; std::string build_info = "llama.cpp diffusion-gemma-server"; @@ -244,11 +268,17 @@ struct diffusion_server { std::vector argmax_canvas(canvas_length, -1); std::vector prev_argmax(canvas_length, -1); std::vector accepted(canvas_length); - const int SC_K = diffusion_self_cond_top_k(); + int32_t stop_flag = 0; + const int SC_K = diffusion.self_cond_top_k; std::vector sc_ids ((size_t) SC_K * canvas_length, 0); std::vector sc_probs((size_t) SC_K * canvas_length, 0.0f); std::vector generated; + const bool pin_host_outputs = diffusion.pin_host_outputs; + diffusion_cuda_host_pin pin_argmax_canvas( + argmax_canvas.data(), argmax_canvas.size() * sizeof(argmax_canvas[0]), pin_host_outputs); + diffusion_cuda_host_pin pin_stop_flag(&stop_flag, sizeof(stop_flag), pin_host_outputs); + int n_blocks_run = 0, n_steps_total = 0; bool done = false; const int topk_max_requested = @@ -306,7 +336,7 @@ struct diffusion_server { const bool use_device_early_stop = device_early_stop_interval > 0; const bool reset_stop_state = use_device_early_stop && block_step == 1; const bool check_stop = use_device_early_stop && !final_step; - int32_t stop_flag = 0; + stop_flag = 0; llama_diffusion_sample_params sample_params = { /* .n_tokens = */ canvas_length, /* .top_k = */ k_step, @@ -315,6 +345,14 @@ struct diffusion_server { /* .seed = */ rq.seed, /* .step = */ (uint32_t) n_steps_total, /* .top_k_tail_correction = */ rq.topk_tail, + /* .cuda_fast_top_k = */ diffusion.cuda_fast_top_k, + /* .cuda_direct_self_cond = */ diffusion.cuda_direct_self_cond, + /* .cuda_final_tokens_on_stop = */ diffusion.cuda_final_tokens_on_stop, + /* .cuda_fused_top_k_sample = */ diffusion.cuda_fused_top_k_sample, + /* .cuda_tight_top_k = */ diffusion.cuda_tight_top_k, + /* .cuda_parallel_full_softmax = */ diffusion.cuda_parallel_full_softmax, + /* .cuda_fused_full_softmax = */ diffusion.cuda_fused_full_softmax, + /* .cuda_top_k_local_k = */ diffusion.cuda_top_k_local_k, }; llama_diffusion_sample_result sample_result = { /* .sampled = */ nullptr, @@ -359,6 +397,14 @@ struct diffusion_server { /* .seed = */ rq.seed, /* .step = */ (uint32_t) n_steps_total, /* .top_k_tail_correction = */ rq.topk_tail, + /* .cuda_fast_top_k = */ diffusion.cuda_fast_top_k, + /* .cuda_direct_self_cond = */ diffusion.cuda_direct_self_cond, + /* .cuda_final_tokens_on_stop = */ diffusion.cuda_final_tokens_on_stop, + /* .cuda_fused_top_k_sample = */ diffusion.cuda_fused_top_k_sample, + /* .cuda_tight_top_k = */ diffusion.cuda_tight_top_k, + /* .cuda_parallel_full_softmax = */ diffusion.cuda_parallel_full_softmax, + /* .cuda_fused_full_softmax = */ diffusion.cuda_fused_full_softmax, + /* .cuda_top_k_local_k = */ diffusion.cuda_top_k_local_k, }; llama_diffusion_sample_result sample_result = { /* .sampled = */ sampled.data(), @@ -627,6 +673,11 @@ static diffusion_request request_from_body(const json & body, const diffusion_se if (body.contains("max_tokens") && body["max_tokens"].is_number_integer()) max_tokens = body["max_tokens"].get(); else if (body.contains("max_completion_tokens") && body["max_completion_tokens"].is_number_integer()) max_tokens = body["max_completion_tokens"].get(); rq.max_canvases = max_tokens > 0 ? (max_tokens + rq.canvas_length - 1) / rq.canvas_length : 1; + if (body.contains("diffusion_steps") && body["diffusion_steps"].is_number_integer()) { + rq.n_steps = std::max(body["diffusion_steps"].get(), 1); + } else if (body.contains("n_denoise_steps") && body["n_denoise_steps"].is_number_integer()) { + rq.n_steps = std::max(body["n_denoise_steps"].get(), 1); + } if (body.contains("top_k") && body["top_k"].is_number_integer()) { rq.topk_fixed = body["top_k"].get(); rq.topk_start = 0; @@ -637,6 +688,12 @@ static diffusion_request request_from_body(const json & body, const diffusion_se if (body.contains("top_k_tail_correction") && body["top_k_tail_correction"].is_boolean()) { rq.topk_tail = body["top_k_tail_correction"].get(); } + const int forced_top_k = std::max(srv.diffusion.force_top_k, 0); + if (forced_top_k > 0) { + rq.topk_fixed = forced_top_k; + rq.topk_start = 0; + rq.topk_end = 0; + } if (body.contains("seed") && body["seed"].is_number_integer()) rq.seed = (uint32_t) body["seed"].get(); if (body.contains("ignore_eos") && body["ignore_eos"].is_boolean()) rq.ignore_eos = body["ignore_eos"].get(); return rq; @@ -669,10 +726,16 @@ int main(int argc, char ** argv) { } common_params params; + params.diffusion.steps = DEF_MAX_DENOISE_STEPS; if (!common_params_parse((int) fwd.size(), fwd.data(), params, LLAMA_EXAMPLE_DIFFUSION)) { return 1; } common_init(); + if (!params.diffusion.device_self_cond && params.diffusion.fused_self_cond_embd) { + SRV_ERR("%s\n", "--no-diffusion-device-selfcond cannot be used with --diffusion-fused-self-cond-embd"); + return 1; + } + params.diffusion.self_cond_top_k = diffusion_self_cond_top_k(params); llama_backend_init(); llama_numa_init(params.numa); @@ -699,13 +762,14 @@ int main(int argc, char ** argv) { srv.vocab = llama_model_get_vocab(srv.model); srv.n_vocab = llama_vocab_n_tokens(srv.vocab); srv.canvas_length = DEF_CANVAS_LENGTH; - srv.n_steps = DEF_MAX_DENOISE_STEPS; + srv.n_steps = std::max(params.diffusion.steps, 1); + srv.diffusion = params.diffusion; srv.n_ub = srv.canvas_length; srv.n_ctx = params.n_ctx > 0 ? (int) params.n_ctx : 4096; if (srv.n_ctx < 2 * srv.canvas_length) srv.n_ctx = 2 * srv.canvas_length; srv.topk_fixed = (params.sampling.user_sampling_config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K) ? params.sampling.top_k - : 0; + : std::max(params.diffusion.default_top_k, 0); srv.topk_start = params.diffusion.top_k_start; srv.topk_end = params.diffusion.top_k_end; srv.topk_tail = params.diffusion.top_k_tail_correction; @@ -726,6 +790,11 @@ int main(int argc, char ** argv) { ctx_params.n_batch = srv.n_ub; ctx_params.n_ubatch = srv.n_ub; ctx_params.no_perf = params.no_perf; + ctx_params.diffusion_self_cond_top_k = srv.diffusion.self_cond_top_k; + ctx_params.diffusion_input_gpu_groups = srv.diffusion.input_gpu_groups; + ctx_params.diffusion_fused_self_cond_embd = srv.diffusion.fused_self_cond_embd; + ctx_params.diffusion_fuse_final_logit_softcap = srv.diffusion.fuse_final_logit_softcap; + ctx_params.diffusion_separate_encoder_decoder = srv.diffusion.separate_encoder_decoder; srv.ctx = llama_init_from_model(srv.model, ctx_params); if (!srv.ctx) { @@ -740,11 +809,11 @@ int main(int argc, char ** argv) { const int topk_max_requested = (srv.topk_start > 0 && srv.topk_end > 0) ? std::max(srv.topk_start, srv.topk_end) : srv.topk_fixed; - const bool gpu_sampling_requested = env_int("DG_GPU_SAMPLING", 1) != 0; + const bool gpu_sampling_requested = params.diffusion.gpu_sampling; srv.use_gpu_sampling = gpu_sampling_requested && llama_diffusion_sample_topk_supported(srv.ctx); - srv.use_device_self_cond = srv.use_gpu_sampling && env_int("DG_DEVICE_SELFCOND", 1) != 0; - srv.use_device_loop = srv.use_device_self_cond && env_int("DG_DEVICE_LOOP", 1) != 0; + srv.use_device_self_cond = srv.use_gpu_sampling && params.diffusion.device_self_cond; + srv.use_device_loop = srv.use_device_self_cond && params.diffusion.device_denoise_loop; srv.device_early_stop_interval = srv.use_device_loop ? 1 : 0; llama_set_diffusion_gpu_sampling(srv.ctx, srv.use_gpu_sampling); const bool default_topk_gpu_ok = topk_max_requested <= 0 || topk_max_requested <= GPU_SAMPLING_MAX_TOP_K; @@ -757,13 +826,16 @@ int main(int argc, char ** argv) { SRV_INF("gpu sampling: %s%s%s%s | top-k fixed=%d anneal=[%d->%d] tail_correction=%d\n", srv.use_gpu_sampling ? "on" : "off", (!default_topk_gpu_ok ? " (default top-k will use CPU fallback until k <= CUDA limit)" : - (!gpu_sampling_requested ? " (disabled by DG_GPU_SAMPLING=0)" : "")), + (!gpu_sampling_requested ? " (disabled by --no-diffusion-gpu-sampling)" : "")), srv.use_device_self_cond ? " | device self-cond: on" : "", srv.use_device_loop ? " | device loop: on" : "", srv.topk_fixed, srv.topk_start, srv.topk_end, srv.topk_tail ? 1 : 0); if (srv.device_early_stop_interval > 0) { SRV_INF("device early-stop interval: %d\n", srv.device_early_stop_interval); } + if (srv.diffusion.force_top_k > 0) { + SRV_INF("forcing request top-k to %d via --diffusion-force-top-k\n", srv.diffusion.force_top_k); + } // ------------------------------------------------------------------------------------------ // HTTP server @@ -831,7 +903,7 @@ int main(int argc, char ** argv) { {"entropy_bound", ENTROPY_BOUND}, {"confidence_threshold", CONFIDENCE_THRESHOLD}, {"stability_threshold", STABILITY_THRESHOLD}, - {"self_cond_topk", diffusion_self_cond_top_k()}, + {"self_cond_topk", srv.diffusion.self_cond_top_k}, {"gpu_sampling", srv.use_gpu_sampling}, {"device_self_cond", srv.use_device_self_cond}, {"device_loop", srv.use_device_loop}, @@ -940,7 +1012,7 @@ int main(int argc, char ** argv) { {"params", { {"canvas_length", srv.canvas_length}, {"n_denoise_steps", srv.n_steps}, - {"self_cond_topk", diffusion_self_cond_top_k()}, + {"self_cond_topk", srv.diffusion.self_cond_top_k}, {"gpu_sampling", srv.use_gpu_sampling}, {"device_self_cond", srv.use_device_self_cond}, {"device_loop", srv.use_device_loop}, diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h index f8990bea4c10..0a72747c3838 100644 --- a/ggml/include/ggml-cuda.h +++ b/ggml/include/ggml-cuda.h @@ -53,6 +53,14 @@ struct ggml_cuda_diffusion_sample_params { uint32_t step; bool top_k_tail_correction; float logit_softcap; + bool fast_top_k; + bool direct_self_cond; + bool final_tokens_on_stop; + bool fused_top_k_sample; + bool tight_top_k; + bool parallel_full_softmax; + bool fused_full_softmax; + int32_t top_k_local_k; }; struct ggml_cuda_diffusion_sample_result { diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 87615921c09b..bbdbc40a0513 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1405,6 +1405,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra } sched->graph.n_nodes = 0; sched->graph.n_leafs = 0; + sched->graph.flags = graph->flags; struct ggml_cgraph * graph_copy = &sched->graph; @@ -2134,6 +2135,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s graph_copy->nodes[i] = node_copy; } graph_copy->n_nodes = graph->n_nodes; + graph_copy->flags = graph->flags; ggml_hash_set_free(&hash_set); free(node_copies); diff --git a/ggml/src/ggml-cuda/diffusion-sampling.cu b/ggml/src/ggml-cuda/diffusion-sampling.cu index 2306e68558a0..93a66ad3aac2 100644 --- a/ggml/src/ggml-cuda/diffusion-sampling.cu +++ b/ggml/src/ggml-cuda/diffusion-sampling.cu @@ -4,8 +4,6 @@ #include #include -#include -#include #include #include @@ -26,35 +24,9 @@ static int next_power_of_2_host(int x) { return n; } -static bool diffusion_fast_topk_enabled() { - const char * v = std::getenv("GGML_CUDA_DIFFUSION_FAST_TOPK"); - return !v || std::strcmp(v, "0") != 0; -} - -static bool diffusion_direct_selfcond_enabled() { - const char * v = std::getenv("GGML_CUDA_DIFFUSION_DIRECT_SELFCOND"); - return v && std::strcmp(v, "0") != 0; -} - -static bool diffusion_final_tokens_on_stop_enabled() { - const char * v = std::getenv("GGML_CUDA_DIFFUSION_FINAL_TOKENS_ON_STOP"); - return v && std::strcmp(v, "0") != 0; -} - -static bool diffusion_fused_topk_sample_enabled() { - const char * v = std::getenv("GGML_CUDA_DIFFUSION_FUSED_TOPK_SAMPLE"); - return v && std::strcmp(v, "0") != 0; -} - -static bool diffusion_tight_topk_enabled() { - const char * v = std::getenv("GGML_CUDA_DIFFUSION_TIGHT_TOPK"); - return v && std::strcmp(v, "0") != 0; -} - -static int diffusion_topk_local_k(const int heap_k) { - int requested = heap_k <= 256 ? 8 : 16; - if (const char * v = std::getenv("GGML_CUDA_DIFFUSION_TOPK_LOCAL_K")) { - requested = std::atoi(v); +static int diffusion_topk_local_k(const int heap_k, int requested) { + if (requested <= 0) { + requested = heap_k <= 256 ? 8 : 16; } const int min_local_k = std::max(1, (heap_k + 255) / 256); @@ -227,9 +199,10 @@ static void diffusion_select_topk_local( const int n_vocab, const int n_tokens, const int heap_k, + const int top_k_local_k, cudaStream_t stream) { constexpr int block_size = 256; - const int local_k = diffusion_topk_local_k(heap_k); + const int local_k = diffusion_topk_local_k(heap_k, top_k_local_k); switch (local_k) { case 1: { @@ -612,6 +585,7 @@ static void diffusion_sample_topk_fused_local( const int n_tokens, const int top_k, const int sc_k, + const int top_k_local_k, const float inv_temp, const float logit_softcap, const uint32_t seed, @@ -623,7 +597,7 @@ static void diffusion_sample_topk_fused_local( float * sc_probs, cudaStream_t stream) { constexpr int block_size = 256; - const int local_k = diffusion_topk_local_k(top_k); + const int local_k = diffusion_topk_local_k(top_k, top_k_local_k); switch (local_k) { case 1: { @@ -669,6 +643,243 @@ static void diffusion_sample_topk_fused_local( } } +template +static __global__ void diffusion_sample_full_softmax_fused_local_kernel( + const float * __restrict__ logits, + const int n_vocab, + const int n_tokens, + const int sc_k, + const float inv_temp, + const float logit_softcap, + const uint32_t seed, + const uint32_t step, + int * __restrict__ sampled, + int * __restrict__ argmax, + float * __restrict__ entropy, + int * __restrict__ sc_ids, + float * __restrict__ sc_probs) { + constexpr int block_size = 256; + constexpr int candidate_count = LOCAL_K * block_size; + + const int row = blockIdx.x; + const int tid = threadIdx.x; + if (row >= n_tokens) { + return; + } + + float vals[LOCAL_K]; + int ids[LOCAL_K]; +#pragma unroll + for (int i = 0; i < LOCAL_K; ++i) { + vals[i] = -FLT_MAX; + ids[i] = 0; + } + + const float * row_logits = logits + (size_t) row * n_vocab; + float local_max = -FLT_MAX; + int local_idx = 0; + for (int v = tid; v < n_vocab; v += blockDim.x) { + const float raw = row_logits[v]; + const float x = diffusion_apply_logit_softcap(raw, logit_softcap) * inv_temp; + if (x > local_max) { + local_max = x; + local_idx = v; + } + if (diffusion_should_swap_desc(vals[LOCAL_K - 1], ids[LOCAL_K - 1], raw, v)) { + int pos = LOCAL_K - 1; +#pragma unroll + for (int i = LOCAL_K - 1; i > 0; --i) { + if (pos == i && diffusion_should_swap_desc(vals[i - 1], ids[i - 1], raw, v)) { + vals[i] = vals[i - 1]; + ids[i] = ids[i - 1]; + --pos; + } + } + vals[pos] = raw; + ids[pos] = v; + } + } + + extern __shared__ unsigned char smem[]; + float * s_vals = (float *) smem; + int * s_ids = (int *) (s_vals + candidate_count); + int * s_max_ids = s_ids + candidate_count; + float * s_sum = (float *) (s_max_ids + block_size); + float * s_t = s_sum + block_size; + float * s_cdf = s_t + block_size; + __shared__ int s_sampled; + +#pragma unroll + for (int i = 0; i < LOCAL_K; ++i) { + const int dst = tid * LOCAL_K + i; + s_vals[dst] = vals[i]; + s_ids[dst] = ids[i]; + } + s_sum[tid] = local_max; + s_max_ids[tid] = local_idx; + __syncthreads(); + + for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) { + if (tid < stride && s_sum[tid + stride] > s_sum[tid]) { + s_sum[tid] = s_sum[tid + stride]; + s_max_ids[tid] = s_max_ids[tid + stride]; + } + __syncthreads(); + } + + const float max_l = s_sum[0]; + const int amax = s_max_ids[0]; + + for (int k = 2; k <= candidate_count; k <<= 1) { + for (int j = k >> 1; j > 0; j >>= 1) { + for (int i = tid; i < candidate_count; i += blockDim.x) { + const int ixj = i ^ j; + if (ixj > i) { + const bool descending = (i & k) == 0; + const bool swap = descending + ? diffusion_should_swap_desc(s_vals[i], s_ids[i], s_vals[ixj], s_ids[ixj]) + : diffusion_should_swap_desc(s_vals[ixj], s_ids[ixj], s_vals[i], s_ids[i]); + if (swap) { + const float tv = s_vals[i]; + s_vals[i] = s_vals[ixj]; + s_vals[ixj] = tv; + const int ti = s_ids[i]; + s_ids[i] = s_ids[ixj]; + s_ids[ixj] = ti; + } + } + } + __syncthreads(); + } + } + + float local_sum = 0.0f; + float local_t = 0.0f; + for (int v = tid; v < n_vocab; v += blockDim.x) { + const float d = diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp - max_l; + const float e = expf(d); + local_sum += e; + local_t += d * e; + } + + s_sum[tid] = local_sum; + s_t[tid] = local_t; + s_cdf[tid] = local_sum; + __syncthreads(); + + for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) { + if (tid < stride) { + s_sum[tid] += s_sum[tid + stride]; + s_t[tid] += s_t[tid + stride]; + } + __syncthreads(); + } + + const float z = s_sum[0]; + const float t = s_t[0]; + + if (tid == 0) { + argmax[row] = amax; + entropy[row] = logf(z) - t / z; + s_sampled = amax; + } + __syncthreads(); + + for (int stride = 1; stride < blockDim.x; stride <<= 1) { + const float add = tid >= stride ? s_cdf[tid - stride] : 0.0f; + __syncthreads(); + s_cdf[tid] += add; + __syncthreads(); + } + + const float r = diffusion_rng_uniform(seed, step, row) * z; + const float chunk_begin = tid == 0 ? 0.0f : s_cdf[tid - 1]; + const float chunk_end = s_cdf[tid]; + if (r > chunk_begin && r <= chunk_end) { + float thread_cum = chunk_begin; + for (int v = tid; v < n_vocab; v += blockDim.x) { + thread_cum += expf(diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp - max_l); + if (thread_cum >= r) { + s_sampled = v; + break; + } + } + } + __syncthreads(); + + if (tid == 0) { + sampled[row] = s_sampled; + for (int i = 0; i < sc_k; ++i) { + const int out = row * sc_k + i; + sc_ids[out] = s_ids[i]; + sc_probs[out] = expf(diffusion_apply_logit_softcap(s_vals[i], logit_softcap) * inv_temp - max_l) / z; + } + } +} + +static void diffusion_sample_full_softmax_fused_local( + const float * logits, + const int n_vocab, + const int n_tokens, + const int sc_k, + const int top_k_local_k, + const float inv_temp, + const float logit_softcap, + const uint32_t seed, + const uint32_t step, + int * sampled, + int * argmax, + float * entropy, + int * sc_ids, + float * sc_probs, + cudaStream_t stream) { + constexpr int block_size = 256; + const int local_k = diffusion_topk_local_k(sc_k, top_k_local_k); + + switch (local_k) { + case 1: { + constexpr int local_k_t = 1; + const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) + + 3 * block_size * sizeof(float) + block_size * sizeof(int); + diffusion_sample_full_softmax_fused_local_kernel<<>>( + logits, n_vocab, n_tokens, sc_k, inv_temp, logit_softcap, seed, step, + sampled, argmax, entropy, sc_ids, sc_probs); + } break; + case 2: { + constexpr int local_k_t = 2; + const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) + + 3 * block_size * sizeof(float) + block_size * sizeof(int); + diffusion_sample_full_softmax_fused_local_kernel<<>>( + logits, n_vocab, n_tokens, sc_k, inv_temp, logit_softcap, seed, step, + sampled, argmax, entropy, sc_ids, sc_probs); + } break; + case 4: { + constexpr int local_k_t = 4; + const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) + + 3 * block_size * sizeof(float) + block_size * sizeof(int); + diffusion_sample_full_softmax_fused_local_kernel<<>>( + logits, n_vocab, n_tokens, sc_k, inv_temp, logit_softcap, seed, step, + sampled, argmax, entropy, sc_ids, sc_probs); + } break; + case 8: { + constexpr int local_k_t = 8; + const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) + + 3 * block_size * sizeof(float) + block_size * sizeof(int); + diffusion_sample_full_softmax_fused_local_kernel<<>>( + logits, n_vocab, n_tokens, sc_k, inv_temp, logit_softcap, seed, step, + sampled, argmax, entropy, sc_ids, sc_probs); + } break; + default: { + constexpr int local_k_t = 16; + const size_t smem = (size_t) local_k_t * block_size * (sizeof(float) + sizeof(int)) + + 3 * block_size * sizeof(float) + block_size * sizeof(int); + diffusion_sample_full_softmax_fused_local_kernel<<>>( + logits, n_vocab, n_tokens, sc_k, inv_temp, logit_softcap, seed, step, + sampled, argmax, entropy, sc_ids, sc_probs); + } break; + } +} + static __device__ __forceinline__ bool diffusion_pair_gt(float a_val, int a_id, float b_val, int b_id) { return a_val > b_val || (a_val == b_val && a_id > b_id); } @@ -803,6 +1014,7 @@ static __global__ void diffusion_sample_kernel( const uint32_t seed, const uint32_t step, const bool tail_correction, + const bool parallel_full_softmax_sample, int * __restrict__ sampled, int * __restrict__ argmax, float * __restrict__ entropy, @@ -816,7 +1028,9 @@ static __global__ void diffusion_sample_kernel( __shared__ float s_val[1024]; __shared__ float s_sum[1024]; + __shared__ float s_cdf[1024]; __shared__ int s_idx[1024]; + __shared__ int s_sampled; const float * row_logits = logits + (size_t) row * n_vocab; const int * row_top = top_ids + (size_t) row * heap_k; @@ -880,6 +1094,7 @@ static __global__ void diffusion_sample_kernel( s_sum[tid] = local_sum; s_val[tid] = local_t; + s_cdf[tid] = local_sum; __syncthreads(); for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) { @@ -910,7 +1125,7 @@ static __global__ void diffusion_sample_kernel( float cum = 0.0f; int tok = amax; - if (top_k == 0) { + if (top_k == 0 && !parallel_full_softmax_sample) { for (int v = 0; v < n_vocab; ++v) { cum += expf(diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp - max_l); if (cum >= r) { @@ -943,6 +1158,40 @@ static __global__ void diffusion_sample_kernel( } } } + + if (top_k == 0 && parallel_full_softmax_sample) { + for (int stride = 1; stride < blockDim.x; stride <<= 1) { + const float add = tid >= stride ? s_cdf[tid - stride] : 0.0f; + __syncthreads(); + s_cdf[tid] += add; + __syncthreads(); + } + + const float z = s_sum[0]; + const float r = diffusion_rng_uniform(seed, step, row) * z; + if (tid == 0) { + s_sampled = amax; + } + __syncthreads(); + + const float chunk_begin = tid == 0 ? 0.0f : s_cdf[tid - 1]; + const float chunk_end = s_cdf[tid]; + if (r > chunk_begin && r <= chunk_end) { + float thread_cum = chunk_begin; + for (int v = tid; v < n_vocab; v += blockDim.x) { + thread_cum += expf(diffusion_apply_logit_softcap(row_logits[v], logit_softcap) * inv_temp - max_l); + if (thread_cum >= r) { + s_sampled = v; + break; + } + } + } + __syncthreads(); + + if (tid == 0) { + sampled[row] = s_sampled; + } + } } bool ggml_cuda_diffusion_sample_topk( @@ -1042,7 +1291,7 @@ bool ggml_cuda_diffusion_sample_topk( } } - const int heap_k = top_k == 0 || !diffusion_tight_topk_enabled() ? std::max(top_k, sc_k) : top_k; + const int heap_k = top_k == 0 || !params->tight_top_k ? std::max(top_k, sc_k) : top_k; if (heap_k <= 0 || heap_k > 1024 || heap_k > n_vocab) { return false; } @@ -1060,25 +1309,33 @@ bool ggml_cuda_diffusion_sample_topk( diffusion_sample_scratch * scratch = diffusion_get_scratch(stream, n_tokens, heap_k, sc_k); int * top_ids = scratch->top_ids; - const bool direct_self_cond_tensor = have_self_cond_tensor && !have_self_cond_host && diffusion_direct_selfcond_enabled(); + const bool direct_self_cond_tensor = have_self_cond_tensor && !have_self_cond_host && params->direct_self_cond; int * sc_ids_out = direct_self_cond_tensor ? (int *) result->self_cond_ids_tensor->data : scratch->sc_ids; float * sc_probs_out = direct_self_cond_tensor ? (float *) result->self_cond_probs_tensor->data : scratch->sc_probs; constexpr int block_size = 256; bool sync_required = false; - const bool use_fused_topk_sample = diffusion_fused_topk_sample_enabled() && + const bool use_fused_topk_sample = params->fused_top_k_sample && top_k > 0 && top_k <= block_size && !params->top_k_tail_correction && n_vocab >= top_k; + const bool use_fused_full_softmax_sample = + top_k == 0 && !params->top_k_tail_correction && params->fused_full_softmax; + const bool use_parallel_full_softmax_sample = + top_k == 0 && !use_fused_full_softmax_sample && params->parallel_full_softmax; if (use_fused_topk_sample) { - diffusion_sample_topk_fused_local(logits_d, n_vocab, n_tokens, top_k, sc_k, inv_temp, logit_softcap, + diffusion_sample_topk_fused_local(logits_d, n_vocab, n_tokens, top_k, sc_k, params->top_k_local_k, inv_temp, logit_softcap, + params->seed, params->step, scratch->sampled, scratch->argmax, scratch->entropy, + sc_ids_out, sc_probs_out, stream); + } else if (use_fused_full_softmax_sample) { + diffusion_sample_full_softmax_fused_local(logits_d, n_vocab, n_tokens, sc_k, params->top_k_local_k, inv_temp, logit_softcap, params->seed, params->step, scratch->sampled, scratch->argmax, scratch->entropy, sc_ids_out, sc_probs_out, stream); } else { bool top_ids_sorted = false; - const bool use_fast_topk = diffusion_fast_topk_enabled() && heap_k <= 1024 && n_vocab >= heap_k; + const bool use_fast_topk = params->fast_top_k && heap_k <= 1024 && n_vocab >= heap_k; sync_required = !use_fast_topk; if (use_fast_topk) { - diffusion_select_topk_local(logits_d, top_ids, n_vocab, n_tokens, heap_k, stream); + diffusion_select_topk_local(logits_d, top_ids, n_vocab, n_tokens, heap_k, params->top_k_local_k, stream); top_ids_sorted = true; } else { #ifdef CUB_DIFFUSION_TOP_K_AVAILABLE @@ -1116,6 +1373,7 @@ bool ggml_cuda_diffusion_sample_topk( diffusion_sample_kernel<<>>( logits_d, top_ids, n_vocab, n_tokens, top_k, heap_k, sc_k, inv_temp, logit_softcap, params->seed, params->step, params->top_k_tail_correction, + use_parallel_full_softmax_sample, scratch->sampled, scratch->argmax, scratch->entropy, sc_ids_out, sc_probs_out); } @@ -1148,7 +1406,7 @@ bool ggml_cuda_diffusion_sample_topk( } const bool final_tokens_after_stop = result->final_tokens && result->stop && result->check_stop_on_device && - diffusion_final_tokens_on_stop_enabled(); + params->final_tokens_on_stop; if (result->final_tokens && !final_tokens_after_stop) { CUDA_CHECK(cudaMemcpyAsync(result->final_tokens, scratch->argmax, (size_t) n_tokens * sizeof(int), diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 17e2c10f2b9c..1b3ba4216227 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3301,6 +3301,8 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) { const char * env = getenv("GGML_CUDA_GRAPH_DIAG"); return env != nullptr && atoi(env) != 0; }(); + const bool is_diffusion_decoder_graph = ggml_graph_has_flag(cgraph, GGML_CGRAPH_FLAG_DIFFUSION_DECODER); + // Loop over nodes in GGML graph to obtain info needed for CUDA graph for (int i = 0; i < cgraph->n_nodes; i++) { @@ -3328,18 +3330,18 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) { const ggml_tensor * src1 = node->src[1]; const int mmvq_mmid_max = get_mmvq_mmid_max_batch(src0->type, cc); const bool use_mmvq = ggml_is_quantized(src0->type) && node->ne[2] <= mmvq_mmid_max; - const bool use_mmq = ggml_is_quantized(src0->type) && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[2], /*n_experts=*/src0->ne[2]); - const bool use_mmf = ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true); + const bool use_mmq = is_diffusion_decoder_graph && ggml_is_quantized(src0->type) && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[2], /*n_experts=*/src0->ne[2]); + const bool use_mmf = is_diffusion_decoder_graph && ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true); if (!use_mmvq && !use_mmq && !use_mmf) { // under these conditions, the mul_mat_id operation will need to synchronize the stream, so we cannot use CUDA graphs // TODO: figure out a way to enable for larger batch sizes, without hurting performance // ref: https://github.com/ggml-org/llama.cpp/pull/18958 use_cuda_graph = false; if (log_graph_incompat) { - GGML_LOG_INFO("%s: disabling CUDA graphs due to MUL_MAT_ID at node %d %s: src0=%s ne2=%lld src1_ne2=%lld n_experts=%lld mmvq_mmid_max=%d quantized=%d mmq=%d mmf=%d\n", + GGML_LOG_INFO("%s: disabling CUDA graphs due to MUL_MAT_ID at node %d %s: src0=%s ne2=%lld src1_ne2=%lld n_experts=%lld mmvq_mmid_max=%d quantized=%d diffusion_decoder=%d mmq=%d mmf=%d\n", __func__, i, node->name, ggml_type_name(src0->type), (long long) node->ne[2], (long long) src1->ne[2], (long long) src0->ne[2], - mmvq_mmid_max, (int) ggml_is_quantized(src0->type), (int) use_mmq, (int) use_mmf); + mmvq_mmid_max, (int) ggml_is_quantized(src0->type), (int) is_diffusion_decoder_graph, (int) use_mmq, (int) use_mmf); } #ifndef NDEBUG GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__); diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 62b76abbcec9..81d083be30da 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -326,6 +326,10 @@ enum ggml_cgraph_eval_order { GGML_CGRAPH_EVAL_ORDER_COUNT }; +enum ggml_cgraph_flag { + GGML_CGRAPH_FLAG_DIFFUSION_DECODER = 1u << 0, +}; + struct ggml_cgraph { int size; // maximum number of nodes/leafs/grads/grad_accs int n_nodes; // number of nodes currently in use @@ -341,11 +345,25 @@ struct ggml_cgraph { enum ggml_cgraph_eval_order order; + uint32_t flags; + // an optional identifier that can be utilized to recognize same graphs if two non-zero values match // a value of 0 means it is not set and should be ignored uint64_t uid; }; +static inline void ggml_graph_set_flag(struct ggml_cgraph * cgraph, enum ggml_cgraph_flag flag, bool enabled) { + if (enabled) { + cgraph->flags |= (uint32_t) flag; + } else { + cgraph->flags &= ~((uint32_t) flag); + } +} + +static inline bool ggml_graph_has_flag(const struct ggml_cgraph * cgraph, enum ggml_cgraph_flag flag) { + return (cgraph->flags & (uint32_t) flag) != 0; +} + // returns a slice of cgraph with nodes [i0, i1) // the slice does not have leafs or gradients // if you need the gradients, get them from the original graph diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 18a5ebd2ab0c..9a841c200aa9 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -7162,6 +7162,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz /*.use_counts =*/ use_counts_ptr, /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr }, /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, + /*.flags =*/ 0, /*.uid =*/ 0, }; @@ -7190,6 +7191,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) /*.use_counts =*/ cgraph0->use_counts, /*.visited_hash_set =*/ cgraph0->visited_hash_set, /*.order =*/ cgraph0->order, + /*.flags =*/ cgraph0->flags, /*.uid =*/ 0 }; @@ -7204,6 +7206,7 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) { dst->n_leafs = src->n_leafs; dst->n_nodes = src->n_nodes; dst->order = src->order; + dst->flags = src->flags; for (int i = 0; i < src->n_leafs; ++i) { dst->leafs[i] = src->leafs[i]; diff --git a/include/llama.h b/include/llama.h index 089dd2747a8b..1fef35a215cc 100644 --- a/include/llama.h +++ b/include/llama.h @@ -340,6 +340,8 @@ extern "C" { uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models) uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL] uint32_t n_outputs_max; // max outputs in a ubatch (0 = n_batch) + int32_t diffusion_self_cond_top_k; // sparse self-conditioning width for diffusion models, 0 = model default [EXPERIMENTAL] + uint32_t diffusion_input_gpu_groups; // bitmask of diffusion decoder inputs assigned to GPU backend [EXPERIMENTAL] int32_t n_threads; // number of threads to use for generation int32_t n_threads_batch; // number of threads to use for batch processing @@ -382,6 +384,9 @@ extern "C" { bool kv_unified; // use a unified buffer across the input sequences when computing the attention // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix // ref: https://github.com/ggml-org/llama.cpp/pull/14363 + bool diffusion_fused_self_cond_embd; // use fused diffusion self-conditioning embedding input [EXPERIMENTAL] + bool diffusion_fuse_final_logit_softcap; // move diffusion final softcap into sampling [EXPERIMENTAL] + bool diffusion_separate_encoder_decoder; // build separate diffusion encoder/decoder graph variants [EXPERIMENTAL] // [EXPERIMENTAL] // backend sampler chain configuration (make sure the caller keeps the sampler chains alive) @@ -1023,6 +1028,14 @@ extern "C" { uint32_t seed; uint32_t step; bool top_k_tail_correction; + bool cuda_fast_top_k; + bool cuda_direct_self_cond; + bool cuda_final_tokens_on_stop; + bool cuda_fused_top_k_sample; + bool cuda_tight_top_k; + bool cuda_parallel_full_softmax; + bool cuda_fused_full_softmax; + int32_t cuda_top_k_local_k; }; struct llama_diffusion_sample_result { diff --git a/scripts/collect_diffusion_ncu_admin.ps1 b/scripts/collect_diffusion_ncu_admin.ps1 new file mode 100644 index 000000000000..f42a6b9e1864 --- /dev/null +++ b/scripts/collect_diffusion_ncu_admin.ps1 @@ -0,0 +1,186 @@ +param( + [string] $Model = "D:\Day0-1\northbloom\diffusion-gemma-26b-v10-q4_k_m.gguf", + [string] $Exe = "D:\Day0-1\northbloom\llama.cpp-codex-sampling\pr-changes\llama.cpp\build-review\bin\Release\llama-diffusion-gemma-cli.exe", + [string] $OutDir = "D:\Day0-1\northbloom\llama.cpp-codex-sampling\pr-changes\llama.cpp\profiles\diffusion-kernel-efficiency\ncu-admin", + [string] $Prompt = "Write one short sentence about CUDA graphs.", + [int] $TopK = 64 +) + +$ErrorActionPreference = "Stop" + +function Resolve-Ncu { + $cmd = Get-Command ncu.exe -ErrorAction SilentlyContinue + if ($cmd) { + return $cmd.Source + } + + $candidates = @( + "C:\Program Files\NVIDIA Corporation\Nsight Compute 2026.2.0\target\windows-desktop-win7-x64\ncu.exe", + "C:\Program Files\NVIDIA Corporation\Nsight Compute 2026.2.0\host\target-windows-x64\ncu.exe", + "C:\Program Files\NVIDIA Corporation\Nsight Compute 2026.1.0\target\windows-desktop-win7-x64\ncu.exe", + "C:\Program Files\NVIDIA Corporation\Nsight Compute 2026.1.0\host\target-windows-x64\ncu.exe", + "C:\Program Files\NVIDIA Corporation\Nsight Compute 2025.3.1\target\windows-desktop-win7-x64\ncu.exe", + "C:\Program Files\NVIDIA Corporation\Nsight Compute 2025.3.1\host\target-windows-x64\ncu.exe" + ) + foreach ($candidate in $candidates) { + if (Test-Path $candidate) { + return $candidate + } + } + + $found = Get-ChildItem "C:\Program Files\NVIDIA Corporation" -Recurse -Filter ncu.exe -ErrorAction SilentlyContinue | + Select-Object -First 1 + if ($found) { + return $found.FullName + } + + throw "Could not find ncu.exe. Add Nsight Compute to PATH or edit Resolve-Ncu in this script." +} + +if (-not (Test-Path $Exe)) { + throw "Executable not found: $Exe" +} +if (-not (Test-Path $Model)) { + throw "Model not found: $Model" +} + +New-Item -ItemType Directory -Force -Path $OutDir | Out-Null + +$ncu = Resolve-Ncu +Write-Host "Using NCU: $ncu" +Write-Host "Output : $OutDir" + +$runArgs = @( + $Exe, + "-m", $Model, + "-p", $Prompt, + "-n", "256", + "--diffusion-steps", "48", + "--top-k", "$TopK", + "-ngl", "all", + "-sm", "none", + "-mg", "0", + "-fa", "on", + "--ctx-size", "4096", + "--seed", "42", + "--no-mmproj", + "--diffusion-cuda-direct-self-cond", + "--diffusion-cuda-final-tokens-on-stop", + "--diffusion-cuda-tight-top-k", + "--diffusion-cuda-top-k-local-k", "4" +) + +function Set-EnvMap($map) { + foreach ($kv in $map.GetEnumerator()) { + [Environment]::SetEnvironmentVariable($kv.Key, $kv.Value, "Process") + } +} + +function Clear-ExperimentEnv { + $names = @( + "GGML_CUDA_MMQ_STREAM_K", + "GGML_CUDA_MMQ_STREAM_K_DIVISOR", + "GGML_CUDA_MMQ_STREAM_K_DIVISOR_MIN_PCT", + "GGML_CUDA_MMQ_AVOID_FIXUP", + "GGML_CUDA_MMQ_AVOID_FIXUP_MIN_EFF", + "GGML_CUDA_DISABLE_GRAPHS" + ) + foreach ($name in $names) { + [Environment]::SetEnvironmentVariable($name, $null, "Process") + } +} + +function Invoke-NcuProfile { + param( + [string] $Name, + [string] $KernelRegex, + [string] $MetricSet = "basic", + [int] $LaunchSkip = 0, + [int] $LaunchCount = 4, + [string[]] $ExtraArgs = @(), + [hashtable] $ExtraEnv = @{} + ) + + Clear-ExperimentEnv + Set-EnvMap $ExtraEnv + + $export = Join-Path $OutDir $Name + $args = @( + "--force-overwrite", + "--target-processes", "all", + "--graph-profiling", "node", + "--replay-mode", "application", + "--set", $MetricSet, + "--kernel-name", "regex:$KernelRegex", + "--launch-skip", "$LaunchSkip", + "--launch-count", "$LaunchCount", + "--export", $export, + "--" + ) + $runArgs + $ExtraArgs + + Write-Host "" + Write-Host "== $Name ==" + Write-Host "kernel regex: $KernelRegex" + Write-Host "set=$MetricSet skip=$LaunchSkip count=$LaunchCount" + & $ncu @args + if ($LASTEXITCODE -ne 0) { + throw "NCU failed for $Name with exit code $LASTEXITCODE" + } +} + +Invoke-NcuProfile ` + -Name "01_topk_fused_local4_detailed" ` + -KernelRegex "diffusion_sample_topk_fused_local_kernel" ` + -MetricSet "detailed" ` + -LaunchSkip 0 ` + -LaunchCount 4 ` + -ExtraArgs @("--diffusion-cuda-fused-top-k-sample") + +Invoke-NcuProfile ` + -Name "02_topk_baseline_select_detailed" ` + -KernelRegex "diffusion_select_topk_local_kernel" ` + -MetricSet "detailed" ` + -LaunchSkip 0 ` + -LaunchCount 4 ` + -ExtraArgs @("--diffusion-cuda-top-k-local-k", "8") + +Invoke-NcuProfile ` + -Name "03_mmq_denoise_basic" ` + -KernelRegex "mul_mat_q|mul_mat_q_stream_k_fixup|quantize_mmq_q8_1" ` + -MetricSet "basic" ` + -LaunchSkip 900 ` + -LaunchCount 16 ` + -ExtraArgs @("--diffusion-cuda-fused-top-k-sample") + +Invoke-NcuProfile ` + -Name "04_mmq_no_fixup_aggressive_basic" ` + -KernelRegex "mul_mat_q|mul_mat_q_stream_k_fixup|quantize_mmq_q8_1" ` + -MetricSet "basic" ` + -LaunchSkip 900 ` + -LaunchCount 16 ` + -ExtraArgs @("--diffusion-cuda-fused-top-k-sample") ` + -ExtraEnv @{ + GGML_CUDA_MMQ_STREAM_K_DIVISOR = "1" + GGML_CUDA_MMQ_AVOID_FIXUP = "1" + GGML_CUDA_MMQ_AVOID_FIXUP_MIN_EFF = "0" + } + +Invoke-NcuProfile ` + -Name "05_attention_basic" ` + -KernelRegex "flash_attn_ext_f16|flash_attn_stream_k_fixup" ` + -MetricSet "basic" ` + -LaunchSkip 80 ` + -LaunchCount 16 ` + -ExtraArgs @("--diffusion-cuda-fused-top-k-sample") + +Invoke-NcuProfile ` + -Name "06_misc_hotspots_basic" ` + -KernelRegex "cpy_scalar_transpose|reduce_rows_f32|k_get_rows_float|softcap_f32" ` + -MetricSet "basic" ` + -LaunchSkip 0 ` + -LaunchCount 16 ` + -ExtraArgs @("--diffusion-cuda-fused-top-k-sample") + +Write-Host "" +Write-Host "NCU collection complete. Reports are in:" +Write-Host $OutDir diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 575b647f733d..45f966ca1935 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -197,6 +197,12 @@ llama_context::llama_context( cparams.n_outputs_max = params.n_outputs_max == 0 ? cparams.n_batch : params.n_outputs_max; + diffusion_cond.self_cond_top_k = params.diffusion_self_cond_top_k > 0 ? params.diffusion_self_cond_top_k : 256; + diffusion_cond.input_gpu_groups = params.diffusion_input_gpu_groups; + diffusion_cond.fused_self_cond_embd = params.diffusion_fused_self_cond_embd; + diffusion_cond.fuse_final_logit_softcap = params.diffusion_fuse_final_logit_softcap; + diffusion_cond.separate_encoder_decoder = params.diffusion_separate_encoder_decoder; + cparams.op_offload = params.op_offload; cparams.kv_unified = params.kv_unified; @@ -1204,16 +1210,6 @@ static ggml_backend_cuda_diffusion_sample_topk_t get_cuda_diffusion_sample_topk_ ggml_backend_reg_get_proc_address(reg, "ggml_backend_cuda_diffusion_sample_topk"); } -static bool diffusion_cuda_fuse_final_logit_softcap_enabled() { - const char * v = std::getenv("GGML_CUDA_DIFFUSION_FUSE_FINAL_SOFTCAP"); - return v && std::strcmp(v, "0") != 0; -} - -static bool diffusion_cuda_fused_selfcond_embd_enabled() { - const char * v = std::getenv("GGML_CUDA_DIFFUSION_FUSED_SELFCOND_EMBD"); - return v && std::strcmp(v, "0") != 0; -} - static bool diffusion_decoder_inputs_device_ready( const llama_diffusion_cond & diffusion_cond, const llm_graph_result * res) { @@ -1292,7 +1288,7 @@ bool llama_context::diffusion_sample_topk( const bool device_self_cond = result->self_cond_ids == nullptr && result->self_cond_probs == nullptr; if (device_self_cond) { auto * inp_embd = gf_res_prev->get_inp_diffusion_self_cond_embd(); - if (inp_embd && diffusion_cuda_fused_selfcond_embd_enabled()) { + if (inp_embd && diffusion_cond.fused_self_cond_embd) { t_self_cond_embd = inp_embd->embd; t_token_embd = gf_res_prev->get_diffusion_token_embd(); if (!t_self_cond_embd || !t_token_embd) { @@ -1317,7 +1313,7 @@ bool llama_context::diffusion_sample_topk( } } - const float fused_logit_softcap = diffusion_cuda_fuse_final_logit_softcap_enabled() + const float fused_logit_softcap = diffusion_cond.fuse_final_logit_softcap ? model.hparams.f_final_logit_softcapping : 0.0f; @@ -1331,6 +1327,14 @@ bool llama_context::diffusion_sample_topk( /* .step = */ params->step, /* .top_k_tail_correction = */ params->top_k_tail_correction, /* .logit_softcap = */ fused_logit_softcap, + /* .fast_top_k = */ params->cuda_fast_top_k, + /* .direct_self_cond = */ params->cuda_direct_self_cond, + /* .final_tokens_on_stop = */ params->cuda_final_tokens_on_stop, + /* .fused_top_k_sample = */ params->cuda_fused_top_k_sample, + /* .tight_top_k = */ params->cuda_tight_top_k, + /* .parallel_full_softmax = */ params->cuda_parallel_full_softmax, + /* .fused_full_softmax = */ params->cuda_fused_full_softmax, + /* .top_k_local_k = */ params->cuda_top_k_local_k, }; ggml_cuda_diffusion_sample_result cuda_result = { @@ -3628,6 +3632,8 @@ llama_context_params llama_context_default_params() { /*.n_seq_max =*/ 1, /*.n_rs_seq =*/ 0, /*.n_outputs_max =*/ 0, + /*.diffusion_self_cond_top_k =*/ 256, + /*.diffusion_input_gpu_groups =*/ 63, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, /*.ctx_type =*/ LLAMA_CONTEXT_TYPE_DEFAULT, @@ -3655,6 +3661,9 @@ llama_context_params llama_context_default_params() { /*.op_offload =*/ true, /*.swa_full =*/ true, /*.kv_unified =*/ false, + /*.diffusion_fused_self_cond_embd =*/ false, + /*.diffusion_fuse_final_logit_softcap =*/ false, + /*.diffusion_separate_encoder_decoder =*/ false, /*.sampler =*/ nullptr, /*.n_sampler =*/ 0, /*.ctx_other =*/ nullptr, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 7b21baf75d6c..5883c5a9ea9b 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -12,6 +12,8 @@ #include "llama-memory-hybrid-iswa.h" #include "llama-memory-recurrent.h" +#include "../ggml/src/ggml-impl.h" + #include #include #include @@ -1221,6 +1223,8 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : ctx0 (res->get_ctx()), gf (res->get_gf()) { res->set_params(params); + ggml_graph_set_flag(gf, GGML_CGRAPH_FLAG_DIFFUSION_DECODER, + llm_arch_is_diffusion(arch) && diffusion && diffusion->decoder_phase); } void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const { @@ -1234,15 +1238,12 @@ void llm_graph_context::set_diffusion_input_backend(ggml_tensor * tensor, uint32 return; } - static const uint32_t enabled_groups = [] { - const char * env = getenv("DG_GPU_INPUT_GROUPS"); - // Keep the default to inputs used by the fixed diffusion decoder graph: - // canvas/self-cond, positions, attention scale, KV indices, masks and - // rotary helpers. Mark them as outputs too, matching ggml-backend's - // existing copy-tensor convention, so the allocator will not overwrite - // them between denoising replays. - return env ? (uint32_t) strtoul(env, nullptr, 0) : 63u; // 1|2|4|8|16|32 - }(); + // Keep the default to inputs used by the fixed diffusion decoder graph: + // canvas/self-cond, positions, attention scale, KV indices, masks and + // rotary helpers. Mark them as outputs too, matching ggml-backend's + // existing copy-tensor convention, so the allocator will not overwrite + // them between denoising replays. + const uint32_t enabled_groups = diffusion->input_gpu_groups; if ((enabled_groups & group) == 0) { return; } diff --git a/src/llama-graph.h b/src/llama-graph.h index 446e8162cf3c..ce98d241836d 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -119,6 +119,12 @@ struct llama_diffusion_cond { // superset graph (the decoder adds the self-conditioning input + block on top of the // encoder graph). The caller sets the actual phase before every decode. bool decoder_phase = true; + + int64_t self_cond_top_k = 256; + uint32_t input_gpu_groups = 63; + bool fused_self_cond_embd = false; + bool fuse_final_logit_softcap = false; + bool separate_encoder_decoder = false; }; struct llm_graph_params; @@ -356,7 +362,7 @@ class llm_graph_input_diffusion_self_cond_topk : public llm_graph_input_i { }; // Dense diffusion self-conditioning embedding input. This is filled by the CUDA sampler from -// top-k ids/probs when GGML_CUDA_DIFFUSION_FUSED_SELFCOND_EMBD is enabled. +// top-k ids/probs when the fused diffusion self-conditioning embedding path is enabled. class llm_graph_input_diffusion_self_cond_embd : public llm_graph_input_i { public: llm_graph_input_diffusion_self_cond_embd(const llama_diffusion_cond * diffusion) : diffusion(diffusion) {} diff --git a/src/models/diffusion-gemma.cpp b/src/models/diffusion-gemma.cpp index b8b1091cbb26..e7a2d6e01727 100644 --- a/src/models/diffusion-gemma.cpp +++ b/src/models/diffusion-gemma.cpp @@ -4,40 +4,6 @@ #include "ggml-alloc.h" #include -#include -#include - -static bool diffusion_gemma_fuse_final_logit_softcap_enabled() { - const char * v = std::getenv("GGML_CUDA_DIFFUSION_FUSE_FINAL_SOFTCAP"); - return v && std::strcmp(v, "0") != 0; -} - -static bool diffusion_gemma_env_enabled_default(const char * name, bool def) { - const char * v = std::getenv(name); - if (!v) { - return def; - } - return std::strcmp(v, "0") != 0; -} - -static bool diffusion_gemma_fused_selfcond_embd_enabled() { - return diffusion_gemma_env_enabled_default("GGML_CUDA_DIFFUSION_FUSED_SELFCOND_EMBD", false) && - diffusion_gemma_env_enabled_default("DG_DEVICE_SELFCOND", true); -} - -static int64_t diffusion_gemma_self_cond_top_k() { - const int64_t def = llama_model_diffusion_gemma::N_SC_TOPK; - const char * v = std::getenv("GGML_CUDA_DIFFUSION_SC_TOPK"); - if (!v) { - return def; - } - - const int64_t k = std::atoll(v); - if (k <= 0) { - return def; - } - return std::min(k, def); -} // diffusion_gemma reuses the gemma4 decoder block (tensor layout + per-layer math) but runs // as a bidirectional (non-causal) block-diffusion denoiser over a canvas, with KV-cache reuse: @@ -46,7 +12,7 @@ static int64_t diffusion_gemma_self_cond_top_k() { // (self-conditioned, bidirectional), rolling back its own K/V afterwards. // // Two graph variants are provided (see build_arch_graph): a single phase-branching graph, and -// a separate encoder/decoder pair (DG_SEPARATE_ENC_DEC). Both reuse the gemma4 transformer +// a separate encoder/decoder pair (--diffusion-separate-encoder-decoder). Both reuse the gemma4 transformer // body and differ only in input-embedding handling (encoder: plain; decoder: self-conditioned). void llama_model_diffusion_gemma::load_arch_hparams(llama_model_loader & ml) { @@ -159,7 +125,7 @@ void llama_model_diffusion_gemma::load_arch_post(llama_model_loader & ml) { __func__, (long long) n_embd_t, (long long) n_vocab_t, ggml_nbytes(tok_embd_gpu) / (1024.0 * 1024.0 * 1024.0), ggml_backend_buffer_name(tok_embd_gpu->buffer), - (long long) diffusion_gemma_self_cond_top_k()); + (long long) llama_model_diffusion_gemma::N_SC_TOPK); return; } LLAMA_LOG_WARN("%s: failed to allocate on-device gather embedding; falling back to dense matmul\n", __func__); @@ -201,10 +167,10 @@ std::unique_ptr llama_model_diffusion_gemma::build_arch_graph const bool is_decoder = params.diffusion && params.diffusion->decoder_phase; // Variant B ("separate encoder and decoder block", shared weights): opt-in via - // DG_SEPARATE_ENC_DEC. Two distinct graphs are built per phase. Functionally identical + // --diffusion-separate-encoder-decoder. Two distinct graphs are built per phase. Functionally identical // to Variant A here (the checkpoint shares encoder/decoder weights); the split mirrors // the HF two-stack structure and generalizes to a checkpoint with distinct weights. - if (getenv("DG_SEPARATE_ENC_DEC")) { + if (params.diffusion && params.diffusion->separate_encoder_decoder) { if (is_decoder) { return std::make_unique(*this, params); } @@ -233,14 +199,14 @@ ggml_tensor * llama_model_diffusion_gemma::graph_base::build_input(bool is_decod if (is_decoder) { ggml_tensor * soft; // soft-embedding {n_embd, n_tokens}: blend of the previous step's // predicted token embeddings, scaled by sqrt(n_embd) - if (dmodel.tok_embd_gpu && diffusion_gemma_fused_selfcond_embd_enabled()) { + if (dmodel.tok_embd_gpu && diffusion && diffusion->fused_self_cond_embd) { soft = build_inp_diffusion_self_cond_embd(n_embd); } else if (dmodel.tok_embd_gpu) { // Sparse gather path (Option-2): the previous step's top-k token ids+probs are fed per // position; gather just those k embedding rows and blend them, instead of the dense // full-vocab `probs @ token_embd` matmul. Gather width is fixed (N_SC_TOPK) so the // graph shape is constant; unused slots carry prob 0 (the CLI zero-pads). - const int64_t k = diffusion_gemma_self_cond_top_k(); + const int64_t k = diffusion ? std::min(std::max(diffusion->self_cond_top_k, 1), llama_model_diffusion_gemma::N_SC_TOPK) : llama_model_diffusion_gemma::N_SC_TOPK; auto * inp = build_inp_diffusion_self_cond_topk(k); ggml_tensor * ids = inp->ids; // I32 {k*n_tokens} ggml_tensor * probs = inp->probs; // F32 {k, n_tokens} @@ -461,7 +427,7 @@ void llama_model_diffusion_gemma::graph_base::build_transformer(ggml_tensor * in cur = build_lora_mm(model.output, cur, model.output_s); const bool fuse_final_softcap = - diffusion && diffusion->decoder_phase && diffusion_gemma_fuse_final_logit_softcap_enabled(); + diffusion && diffusion->decoder_phase && diffusion->fuse_final_logit_softcap; if (hparams.f_final_logit_softcapping && !fuse_final_softcap) { cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);