From 6cd1324d2677a787a899d25a65e70995ffcf5813 Mon Sep 17 00:00:00 2001
From: James Dumay
Date: Fri, 17 Apr 2026 18:25:59 +1000
Subject: [PATCH] Fix multi-shard moe-split tensor counting

---
 tools/moe-split/moe-split.cpp | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/tools/moe-split/moe-split.cpp b/tools/moe-split/moe-split.cpp
index 4ac2c6240010..913a4725a3a0 100644
--- a/tools/moe-split/moe-split.cpp
+++ b/tools/moe-split/moe-split.cpp
@@ -496,12 +496,13 @@ static void write_group(
         }
     }
 
-    // Create ggml context for ALL tensors (trunk + experts).
-    // Previously used n_expert_tensors which underallocated for models where the
-    // total tensor count far exceeds the expert tensor count (e.g. GLM-5.1 has
-    // 1809 total tensors but only ~237 expert tensors), causing SIGABRT at the
-    // last block when the context overflows.
-    size_t ctx_size = (size_t)(n_tensors + 64) * ggml_tensor_overhead() + 4096;
+    // Create ggml context for ALL tensors (trunk + experts) across every input
+    // shard. The multi-shard path no longer has a single ctx_in tensor count.
+    int n_tensors_total = 0;
+    for (const auto & input_shard : input_shards) {
+        n_tensors_total += gguf_get_n_tensors(input_shard.ctx_gguf);
+    }
+    size_t ctx_size = (size_t)(n_tensors_total + 64) * ggml_tensor_overhead() + 4096;
     struct ggml_init_params ctx_params = {
         /*.mem_size =*/ ctx_size,
         /*.mem_buffer =*/ nullptr,