Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3865,6 +3865,101 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.diffusion.visual_mode ? "true" : "false"),
[](common_params & params) { params.diffusion.visual_mode = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--no-diffusion-gpu-sampling"},
"disable CUDA block-diffusion sampling fast path",
[](common_params & params) { params.diffusion.gpu_sampling = false; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--no-diffusion-device-selfcond"},
"disable device-resident block-diffusion self-conditioning",
[](common_params & params) { params.diffusion.device_self_cond = false; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--no-diffusion-device-denoise-loop"},
"disable device-side block-diffusion canvas and stop-state updates",
[](common_params & params) { params.diffusion.device_denoise_loop = false; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-pin-host-outputs"},
"register compact diffusion output buffers as pinned host memory",
[](common_params & params) { params.diffusion.pin_host_outputs = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-self-cond-top-k"}, "N",
string_format("block-diffusion sparse self-conditioning width (default: %d)", params.diffusion.self_cond_top_k),
[](common_params & params, int value) { params.diffusion.self_cond_top_k = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-input-gpu-groups"}, "N",
string_format("bitmask of block-diffusion decoder input groups assigned to GPU backend (default: %u)", params.diffusion.input_gpu_groups),
[](common_params & params, int value) { params.diffusion.input_gpu_groups = (uint32_t) std::max(value, 0); }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-default-top-k"}, "N",
"block-diffusion top-k used when --top-k is not explicitly provided",
[](common_params & params, int value) { params.diffusion.default_top_k = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-force-top-k"}, "N",
"block-diffusion server: override per-request top_k when N > 0",
[](common_params & params, int value) { params.diffusion.force_top_k = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-fused-self-cond-embd"},
"use fused device self-conditioning embedding input for block diffusion",
[](common_params & params) { params.diffusion.fused_self_cond_embd = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-fuse-final-softcap"},
"move final logit softcap into the CUDA diffusion sampling kernel",
[](common_params & params) { params.diffusion.fuse_final_logit_softcap = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-separate-encoder-decoder"},
"build separate block-diffusion encoder and decoder graph variants",
[](common_params & params) { params.diffusion.separate_encoder_decoder = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-cuda-direct-self-cond"},
"write CUDA diffusion self-conditioning directly into decoder graph inputs",
[](common_params & params) { params.diffusion.cuda_direct_self_cond = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-cuda-final-tokens-on-stop"},
"copy final diffusion tokens only when the device stop condition is reached",
[](common_params & params) { params.diffusion.cuda_final_tokens_on_stop = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-cuda-fused-top-k-sample"},
"fuse CUDA diffusion top-k selection and sampling",
[](common_params & params) { params.diffusion.cuda_fused_top_k_sample = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-cuda-tight-top-k"},
"avoid extra CUDA diffusion top-k scratch width when possible",
[](common_params & params) { params.diffusion.cuda_tight_top_k = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-cuda-parallel-full-softmax"},
"parallelize CUDA diffusion full-vocab sampling when top-k is 0",
[](common_params & params) { params.diffusion.cuda_parallel_full_softmax = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-cuda-fused-full-softmax"},
"fuse CUDA diffusion full-vocab softmax sampling and self-conditioning",
[](common_params & params) { params.diffusion.cuda_fused_full_softmax = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-cuda-top-k-local-k"}, "N",
"CUDA diffusion local top-k candidates per thread (0 = backend default)",
[](common_params & params, int value) { params.diffusion.cuda_top_k_local_k = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--no-diffusion-cuda-fast-top-k"},
"disable CUDA diffusion CUB/fast top-k selection path",
[](common_params & params) { params.diffusion.cuda_fast_top_k = false; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--top-k-start"}, "N",
"block-diffusion: anneal top-k from N at the first (high-entropy) denoising step (with --top-k-end)",
Expand Down
5 changes: 5 additions & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1585,6 +1585,11 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full;
cparams.kv_unified = params.kv_unified;
cparams.diffusion_self_cond_top_k = params.diffusion.self_cond_top_k;
cparams.diffusion_input_gpu_groups = params.diffusion.input_gpu_groups;
cparams.diffusion_fused_self_cond_embd = params.diffusion.fused_self_cond_embd;
cparams.diffusion_fuse_final_logit_softcap = params.diffusion.fuse_final_logit_softcap;
cparams.diffusion_separate_encoder_decoder = params.diffusion.separate_encoder_decoder;

cparams.type_k = params.cache_type_k;
cparams.type_v = params.cache_type_v;
Expand Down
23 changes: 23 additions & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,10 @@ struct common_params_vocoder {
struct common_params_diffusion {
int32_t steps = 128;
bool visual_mode = false;
bool gpu_sampling = true; // use CUDA diffusion sampling fast path when available
bool device_self_cond = true; // keep diffusion self-conditioning state on device
bool device_denoise_loop = true; // update diffusion canvas/stop state on device
bool pin_host_outputs = false; // register compact D2H output buffers as pinned host memory

float eps = 0; // epsilon for timesteps
int32_t block_length = 0; // block length for generation
Expand All @@ -395,6 +399,25 @@ struct common_params_diffusion {
int32_t top_k_start = 0; // anneal top-k from this (first/high-entropy step) ...
int32_t top_k_end = 0; // ... to this (last step); both > 0 enables annealing
bool top_k_tail_correction = false; // use exact full-vocab entropy for accept/stop
int32_t default_top_k = 0; // top-k used when --top-k is not explicitly provided
int32_t force_top_k = 0; // server: override per-request top_k when > 0
int32_t self_cond_top_k = 256; // sparse self-conditioning gather width
uint32_t input_gpu_groups = 63; // decoder input tensor groups assigned to GPU backend

// CUDA diffusion sampling fast-path knobs. Defaults preserve behavior when no tuning flags are passed.
bool cuda_fast_top_k = true; // CUB/fast top-k selection path; cleared by --no-diffusion-cuda-fast-top-k
bool cuda_direct_self_cond = false; // write self-conditioning directly into decoder graph inputs
bool cuda_final_tokens_on_stop = false; // copy final tokens only when the device stop condition is reached
bool cuda_fused_top_k_sample = false; // fuse top-k selection and sampling
bool cuda_tight_top_k = false; // avoid extra top-k scratch width when possible
bool cuda_parallel_full_softmax = false; // parallelize full-vocab sampling when top-k is 0
bool cuda_fused_full_softmax = false; // fuse full-vocab softmax sampling and self-conditioning
int32_t cuda_top_k_local_k = 0; // local top-k candidates per thread; 0 = backend default

// Diffusion graph-shape knobs (copied into llama_context_params by common_context_params_to_llama).
bool fused_self_cond_embd = false; // use fused device self-conditioning embedding input
bool fuse_final_logit_softcap = false; // move final logit softcap into the CUDA diffusion sampling kernel
bool separate_encoder_decoder = false; // build separate encoder and decoder graph variants
};

// reasoning API response format (not to be confused as chat template's reasoning format)
Expand Down
10 changes: 10 additions & 0 deletions examples/diffusion-gemma/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,13 @@ target_link_libraries(${TARGET} PRIVATE llama llama-common mtmd ${CMAKE_THREAD_L
# mtmd (tools/) is added after examples/, so add its include dir explicitly for the headers
target_include_directories(${TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../tools/mtmd)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

# OpenAI-compatible HTTP server for the block-diffusion models (llama-server analogue)
set(TARGET llama-diffusion-gemma-server)
add_executable(${TARGET} diffusion-gemma-server.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama llama-common mtmd cpp-httplib ${CMAKE_THREAD_LIBS_INIT})
# Anchor both include roots at this directory (examples/diffusion-gemma) instead of
# CMAKE_SOURCE_DIR: the latter is the outermost project's root, so it would point
# outside this repository when the tree is pulled in via add_subdirectory().
target_include_directories(${TARGET} PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/../../tools/mtmd  # mtmd headers (tools/ is added after examples/)
    ${CMAKE_CURRENT_SOURCE_DIR}/../../vendor)     # cpp-httplib/httplib.h, nlohmann/json.hpp
target_compile_features(${TARGET} PRIVATE cxx_std_17)
Loading