From 1fe6d496a80d4bce8945df65b4987762deb66865 Mon Sep 17 00:00:00 2001 From: co-seven Date: Mon, 8 Jun 2026 07:44:07 +0000 Subject: [PATCH 1/4] server: add LingBot-MAP SMT reconstruction pipeline. Add LingBot-MAP as an SMT vision model implementation, including GGUF metadata/model registration, quantization support, aggregator/camera_head GGML runtime, ViT/DPT ONNX integration, reconstruction postprocess, and /reconstruct server routing. --- convert_hf_to_gguf.py | 205 ++- src/llama-arch.cpp | 1 + src/llama-arch.h | 1 + src/llama-model.cpp | 11 + src/llama-quant.cpp | 6 + src/models/lingbot-map.cpp | 68 + src/models/models.h | 9 + tools/mtmd/CMakeLists.txt | 5 + tools/mtmd/lingbot-map-wrapper.cpp | 1893 ++++++++++++++++++++++++++ tools/mtmd/lingbot-map-wrapper.h | 172 +++ tools/mtmd/smt-vision-preprocess.cpp | 270 ++++ tools/mtmd/smt-vision-preprocess.h | 16 + tools/server/server-common.cpp | 4 +- tools/server/server-context.cpp | 287 +++- tools/server/server-context.h | 2 + tools/server/server-smt-vision.cpp | 836 +++++++++++- tools/server/server-smt-vision.h | 130 ++ tools/server/server.cpp | 19 +- 18 files changed, 3894 insertions(+), 41 deletions(-) create mode 100644 src/models/lingbot-map.cpp create mode 100644 tools/mtmd/lingbot-map-wrapper.cpp create mode 100644 tools/mtmd/lingbot-map-wrapper.h diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 85527553563d..e39f95d1f116 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -7,8 +7,11 @@ import logging import os import sys +from collections import OrderedDict from pathlib import Path +from typing import Iterable +import numpy as np import torch if 'NO_LOCAL_GGUF' not in os.environ: @@ -27,6 +30,184 @@ ) +LINGBOT_MAP_ARCH = "lingbot-map" +LINGBOT_MAP_DEFAULT_CHECKPOINT = Path("/home/cailinxi/modelzoo/lingbot-map/hf_model/lingbot-map.pt") +LINGBOT_MAP_DEFAULT_OUTFILE = Path("/home/cailinxi/modelzoo/lingbot-map/mtmd_model/lingbot-map-agg-camera-f32.gguf") + + +def 
unwrap_lingbot_map_state_dict(obj: object) -> OrderedDict[str, torch.Tensor]: + if isinstance(obj, dict) and "model" in obj and isinstance(obj["model"], dict): + obj = obj["model"] + if not isinstance(obj, dict): + raise TypeError(f"checkpoint must contain a state dict, got {type(obj)!r}") + + state = OrderedDict() + for key, value in obj.items(): + if isinstance(value, torch.Tensor): + state[str(key)] = value.detach().cpu() + if not state: + raise ValueError("checkpoint does not contain tensor entries") + return state + + +def lingbot_map_selected_tensor_names( + state: OrderedDict[str, torch.Tensor], + include_patch_embed: bool, + include_depth_head: bool) -> list[str]: + names: list[str] = [] + for name in state: + if name.startswith("aggregator."): + if not include_patch_embed and name.startswith("aggregator.patch_embed."): + continue + names.append(name) + elif name.startswith("camera_head."): + names.append(name) + elif include_depth_head and name.startswith("depth_head."): + names.append(name) + return names + + +def lingbot_map_count_indexed_modules(names: Iterable[str], prefix: str) -> int: + indices: set[int] = set() + needle = prefix + "." 
+ for name in names: + if not name.startswith(needle): + continue + rest = name[len(needle):] + first = rest.split(".", 1)[0] + if first.isdigit(): + indices.add(int(first)) + return max(indices) + 1 if indices else 0 + + +def lingbot_map_infer_metadata( + state: OrderedDict[str, torch.Tensor], + selected: list[str], + include_patch_embed: bool, + include_depth_head: bool) -> dict[str, object]: + camera_token = state.get("aggregator.camera_token") + if camera_token is None: + raise KeyError("missing required tensor: aggregator.camera_token") + + embed_dim = int(camera_token.shape[-1]) + num_camera_token_variants = int(camera_token.shape[1]) + num_register_tokens = int(state["aggregator.register_token"].shape[2]) if "aggregator.register_token" in state else 0 + has_scale_token = "aggregator.scale_token" in state + num_special_tokens = 1 + num_register_tokens + (1 if has_scale_token else 0) + frame_blocks = lingbot_map_count_indexed_modules(selected, "aggregator.frame_blocks") + global_blocks = lingbot_map_count_indexed_modules(selected, "aggregator.global_blocks") + camera_blocks = lingbot_map_count_indexed_modules(selected, "camera_head.trunk") + + patch_proj = state.get("aggregator.patch_embed.patch_embed.proj.weight") + patch_size = int(patch_proj.shape[-1]) if patch_proj is not None else 14 + + camera_qkv = state.get("camera_head.trunk.0.attn.qkv.weight") + camera_dim = int(camera_qkv.shape[1]) if camera_qkv is not None else embed_dim * 2 + camera_pose_dim = int(state["camera_head.empty_pose_tokens"].shape[-1]) if "camera_head.empty_pose_tokens" in state else 9 + + return { + "schema_version": 1, + "component": "aggregator_camera_head", + "includes_patch_embed": bool(include_patch_embed), + "includes_depth_head": bool(include_depth_head), + "embed_dim": embed_dim, + "camera_dim": camera_dim, + "camera_pose_dim": camera_pose_dim, + "patch_size": patch_size, + "num_register_tokens": num_register_tokens, + "num_special_tokens": num_special_tokens, + 
"num_camera_token_variants": num_camera_token_variants, + "has_scale_token": has_scale_token, + "aggregator_frame_block_count": frame_blocks, + "aggregator_global_block_count": global_blocks, + "camera_trunk_block_count": camera_blocks, + "aa_order": ["frame", "global"], + "aa_block_size": 1, + "rope_freq": 100.0, + "resnet_mean": [0.485, 0.456, 0.406], + "resnet_std": [0.229, 0.224, 0.225], + } + + +def lingbot_map_add_metadata(writer: gguf.GGUFWriter, meta: dict[str, object], outtype: str) -> None: + writer.add_name("LingBot-MAP aggregator + camera head") + writer.add_type("model") + writer.add_description("LingBot-MAP non-LLM GGUF containing aggregator and camera head tensors.") + writer.add_file_type(int(gguf.LlamaFileType.MOSTLY_F16 if outtype == "f16" else gguf.LlamaFileType.ALL_F32)) + writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + + for key, value in meta.items(): + full_key = f"{LINGBOT_MAP_ARCH}.{key}" + if isinstance(value, bool): + writer.add_bool(full_key, value) + elif isinstance(value, int): + writer.add_uint32(full_key, value) + elif isinstance(value, float): + writer.add_float32(full_key, value) + elif isinstance(value, str): + writer.add_string(full_key, value) + elif isinstance(value, list): + writer.add_array(full_key, value) + else: + raise TypeError(f"unsupported metadata value for {key}: {type(value)!r}") + + +def lingbot_map_tensor_to_numpy(tensor: torch.Tensor, outtype: str) -> np.ndarray: + if tensor.dtype.is_floating_point: + if outtype == "f16": + return tensor.to(torch.float16).numpy() + return tensor.to(torch.float32).numpy() + + if tensor.dtype in (torch.int8, torch.int16, torch.int32, torch.int64): + return tensor.numpy() + + raise TypeError(f"unsupported tensor dtype: {tensor.dtype}") + + +def write_lingbot_map_gguf(args: argparse.Namespace) -> None: + outtype = "f32" if args.outtype == "auto" else args.outtype + if outtype not in ("f32", "f16"): + raise ValueError("LingBot-MAP GGUF conversion only supports --outtype f32 
or f16") + + checkpoint_path = args.checkpoint or LINGBOT_MAP_DEFAULT_CHECKPOINT + outfile = args.outfile or LINGBOT_MAP_DEFAULT_OUTFILE + + logger.info("Loading LingBot-MAP checkpoint: %s", checkpoint_path) + checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False) + state = unwrap_lingbot_map_state_dict(checkpoint) + selected = lingbot_map_selected_tensor_names(state, args.include_patch_embed, args.include_depth_head) + if not selected: + raise ValueError("no LingBot-MAP tensors selected for conversion") + + meta = lingbot_map_infer_metadata(state, selected, args.include_patch_embed, args.include_depth_head) + total_params = sum(state[name].numel() for name in selected) + total_bytes = sum(lingbot_map_tensor_to_numpy(state[name], outtype).nbytes for name in selected) + + logger.info("Selected LingBot-MAP tensors: %d", len(selected)) + logger.info("Selected LingBot-MAP parameters: %.3f M", total_params / 1e6) + logger.info("Selected LingBot-MAP tensor bytes: %.3f MiB", total_bytes / (1024 * 1024)) + for key, value in meta.items(): + logger.info("LingBot-MAP meta %s = %s", key, value) + + if args.dry_run: + return + + outfile.parent.mkdir(parents=True, exist_ok=True) + writer = gguf.GGUFWriter(outfile, LINGBOT_MAP_ARCH) + lingbot_map_add_metadata(writer, meta, outtype) + + for name in selected: + arr = lingbot_map_tensor_to_numpy(state[name].contiguous(), outtype) + writer.add_tensor(name, arr) + + logger.info("Writing LingBot-MAP GGUF: %s", outfile) + writer.write_header_to_file() + writer.write_kv_data_to_file() + writer.write_tensors_to_file(progress=True) + writer.close() + logger.info("LingBot-MAP GGUF conversion done") + + def split_str_to_n_bytes(split_str: str) -> int: if split_str.endswith("K"): n = int(split_str[:-1]) * 1000 @@ -60,6 +241,22 @@ def parse_args() -> argparse.Namespace: "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="auto", help="output format - use f32 for float32, 
f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type", ) + parser.add_argument( + "--lingbot-map", action="store_true", + help="Export LingBot-MAP aggregator and camera_head tensors from a PyTorch checkpoint to GGUF.", + ) + parser.add_argument( + "--checkpoint", type=Path, + help="Path to LingBot-MAP .pt checkpoint. Only used with --lingbot-map.", + ) + parser.add_argument( + "--include-patch-embed", action="store_true", + help="Also include aggregator.patch_embed.* tensors when converting LingBot-MAP.", + ) + parser.add_argument( + "--include-depth-head", action="store_true", + help="Also include depth_head.* tensors when converting LingBot-MAP.", + ) parser.add_argument( "--bigendian", action="store_true", help="model is executed on big endian machine", @@ -95,7 +292,7 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument( "--dry-run", action="store_true", - help="only print out a split plan and exit, without writing any new files", + help="only print out a split plan and exit, without writing any new files. 
In --lingbot-map mode, print selected tensors and inferred metadata without writing GGUF.", ) parser.add_argument( "--no-tensor-first-split", action="store_true", @@ -154,7 +351,7 @@ def parse_args() -> argparse.Namespace: ) args = parser.parse_args() - if not args.print_supported_models and args.model is None: + if not args.print_supported_models and not args.lingbot_map and args.model is None: parser.error("the following arguments are required: model") return args @@ -172,6 +369,10 @@ def main() -> None: else: logging.basicConfig(level=logging.INFO) + if args.lingbot_map: + write_lingbot_map_gguf(args) + return + if args.remote: hf_repo_id = args.model from huggingface_hub import snapshot_download diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index be8f73cc1edd..0ca1d20a4a3b 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -8,6 +8,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_CLIP, "clip" }, // dummy, only used by llama-quantize + { LLM_ARCH_LINGBOT_MAP, "lingbot-map" }, { LLM_ARCH_LLAMA, "llama" }, { LLM_ARCH_LLAMA4, "llama4" }, { LLM_ARCH_DECI, "deci" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 2c71bbe81562..a7a21b2ef606 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -12,6 +12,7 @@ enum llm_arch { LLM_ARCH_CLIP, + LLM_ARCH_LINGBOT_MAP, LLM_ARCH_LLAMA, LLM_ARCH_LLAMA4, LLM_ARCH_DECI, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 3e236f8c17d2..b8056cac3496 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -37,6 +37,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params & params) { switch (arch) { + case LLM_ARCH_LINGBOT_MAP: + return new llama_model_lingbot_map(params); case LLM_ARCH_LLAMA: return new llama_model_llama(params); case LLM_ARCH_LLAMA4: @@ -1004,6 +1006,14 @@ void llama_model_base::load_hparams(llama_model_loader & ml) { return; } + if (ml.get_arch() == LLM_ARCH_LINGBOT_MAP) { + load_arch_hparams(ml); + pimpl->n_bytes = ml.n_bytes; + 
pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name(); + hparams.rope_type = LLAMA_ROPE_TYPE_NONE; + return; + } + ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false); @@ -2258,6 +2268,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { switch (model->arch) { // these models do not use RoPE case LLM_ARCH_CLIP: + case LLM_ARCH_LINGBOT_MAP: case LLM_ARCH_GPT2: case LLM_ARCH_GPTJ: case LLM_ARCH_MPT: diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 43e05c3d56fe..d2955a846237 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -351,6 +351,12 @@ static bool tensor_allows_quantization(const llama_model_quantize_params * param quantize &= name.find(".patch_embd") == std::string::npos; quantize &= name.find(".patch_merger") == std::string::npos; + if (arch == LLM_ARCH_LINGBOT_MAP) { + // Pose input projection has ne[0] = pose_dim = 9. Legacy block quantizers such as Q4_0 + // require the first dimension to be divisible by 32, so keep such tiny projection tensors in F32. 
+ quantize &= tensor->ne[0] % 32 == 0; + } + return quantize; } diff --git a/src/models/lingbot-map.cpp b/src/models/lingbot-map.cpp new file mode 100644 index 000000000000..877402d7ee4a --- /dev/null +++ b/src/models/lingbot-map.cpp @@ -0,0 +1,68 @@ +#include "models.h" + +#include +#include +#include + +void llama_model_lingbot_map::load_arch_hparams(llama_model_loader & ml) { + std::string component; + uint32_t embed_dim = 0; + uint32_t camera_dim = 0; + uint32_t frame_blocks = 0; + uint32_t global_blocks = 0; + uint32_t camera_blocks = 0; + + ml.get_key("lingbot-map.component", component); + ml.get_key("lingbot-map.embed_dim", embed_dim); + ml.get_key("lingbot-map.camera_dim", camera_dim); + ml.get_key("lingbot-map.aggregator_frame_block_count", frame_blocks); + ml.get_key("lingbot-map.aggregator_global_block_count", global_blocks); + ml.get_key("lingbot-map.camera_trunk_block_count", camera_blocks); + + if (component != "aggregator_camera_head") { + throw std::runtime_error("unsupported LingBot-MAP GGUF component: " + component); + } + if (embed_dim == 0 || camera_dim == 0 || frame_blocks == 0 || global_blocks == 0 || camera_blocks == 0) { + throw std::runtime_error("invalid LingBot-MAP GGUF metadata"); + } + + type = LLM_TYPE_UNKNOWN; + hparams.n_ctx_train = 0; + hparams.n_embd = std::max(embed_dim, camera_dim); + hparams.n_layer = frame_blocks + global_blocks + camera_blocks; + hparams.n_expert = 0; + hparams.n_expert_used = 0; + hparams.causal_attn = false; + hparams.f_norm_eps = 1e-6f; + hparams.f_norm_rms_eps = 0.0f; + hparams.rope_freq_base_train = 0.0f; + hparams.rope_freq_scale_train = 1.0f; + hparams.rope_type = LLAMA_ROPE_TYPE_NONE; + + const uint32_t n_heads = 16; + const uint32_t n_layers = std::min(hparams.n_layer, LLAMA_MAX_LAYERS); + std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); + std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); + std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); + 
std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0); + std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0); + for (uint32_t il = 0; il < n_layers; ++il) { + hparams.n_head_arr[il] = n_heads; + hparams.n_head_kv_arr[il] = n_heads; + hparams.n_ff_arr[il] = hparams.n_embd * 4; + } + hparams.n_embd_head_k_full = hparams.n_embd / n_heads; + hparams.n_embd_head_v_full = hparams.n_embd / n_heads; + hparams.n_embd_head_k_swa = hparams.n_embd_head_k_full; + hparams.n_embd_head_v_swa = hparams.n_embd_head_v_full; + hparams.n_rot_full = 0; + hparams.n_rot_swa = 0; +} + +void llama_model_lingbot_map::load_arch_tensors(llama_model_loader &) { + throw std::runtime_error("LingBot-MAP GGUF tensors are loaded by the mtmd SMT wrapper, not llama_model"); +} + +std::unique_ptr llama_model_lingbot_map::build_arch_graph(const llm_graph_params &) const { + throw std::runtime_error("LingBot-MAP does not support llama_model text graph execution"); +} diff --git a/src/models/models.h b/src/models/models.h index 5251e2d82802..47c099a76d9a 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -173,6 +173,15 @@ struct llama_model_llama_embed : public llama_model_llama { }; +struct llama_model_lingbot_map : public llama_model_base { + llama_model_lingbot_map(const struct llama_model_params & params) : llama_model_base(params) {} + void load_arch_hparams(llama_model_loader & ml) override; + void load_arch_tensors(llama_model_loader & ml) override; + + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; +}; + + struct llama_model_maincoder : public llama_model_base { llama_model_maincoder(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 542a18b5cbca..0f46d9adfead 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -145,6 +145,11 @@ 
if(LLAMA_SERVER_SMT_VISION) message(FATAL_ERROR "Could not find spine_llm_argparser.cc in SPACEMIT_ORT_DIR='${SPACEMIT_ORT_DIR}'") endif() + target_sources(mtmd PRIVATE + lingbot-map-wrapper.cpp + lingbot-map-wrapper.h + ) + if(EXISTS "${SPACEMIT_ORT_LIB_DIR}/libonnxruntime.so") set(ONNXRUNTIME_LIB "${SPACEMIT_ORT_LIB_DIR}/libonnxruntime.so") elseif(EXISTS "${SPACEMIT_ORT_LIB_DIR}/libonnxruntime.a") diff --git a/tools/mtmd/lingbot-map-wrapper.cpp b/tools/mtmd/lingbot-map-wrapper.cpp new file mode 100644 index 000000000000..46e2a87fe473 --- /dev/null +++ b/tools/mtmd/lingbot-map-wrapper.cpp @@ -0,0 +1,1893 @@ +#include "lingbot-map-wrapper.h" + +#include "gguf.h" +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "ggml-cpu.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +struct gguf_deleter { + void operator()(gguf_context * ctx) const { + if (ctx != nullptr) { + gguf_free(ctx); + } + } +}; + +struct ggml_deleter { + void operator()(ggml_context * ctx) const { + if (ctx != nullptr) { + ggml_free(ctx); + } + } +}; + +struct ggml_backend_deleter { + void operator()(ggml_backend * backend) const { + if (backend != nullptr) { + ggml_backend_free(backend); + } + } +}; + +struct ggml_backend_buffer_deleter { + void operator()(ggml_backend_buffer * buffer) const { + if (buffer != nullptr) { + ggml_backend_buffer_free(buffer); + } + } +}; + +struct ggml_backend_sched_deleter { + void operator()(ggml_backend_sched * sched) const { + if (sched != nullptr) { + ggml_backend_sched_free(sched); + } + } +}; + +using gguf_context_ptr = std::unique_ptr; +using ggml_context_ptr = std::unique_ptr; +using ggml_backend_ptr = std::unique_ptr; +using ggml_backend_buffer_ptr = std::unique_ptr; +using ggml_backend_sched_ptr = std::unique_ptr; + +struct lingbot_map_loaded_gguf { + gguf_context_ptr gguf; + ggml_context_ptr ggml; +}; + +struct lingbot_map_runtime_weights { + 
gguf_context_ptr gguf; + ggml_context_ptr ggml; + ggml_backend_buffer_ptr buffer; +}; + +struct lingbot_map_runtime_graph { + ggml_tensor * input_tokens = nullptr; + ggml_tensor * camera_head_input = nullptr; + ggml_tensor * final_pose = nullptr; + ggml_cgraph * graph = nullptr; + std::vector selected_outputs; + std::vector iteration_poses; +}; + + + + +static int64_t lingbot_elapsed_ms(std::chrono::steady_clock::time_point start) { + return std::chrono::duration_cast(std::chrono::steady_clock::now() - start).count(); +} + +static bool lingbot_graph_supported_by_backend(ggml_backend_t backend, + ggml_backend_buffer_type_t buft, + ggml_cgraph * graph, + bool log_summary) { + if (backend == nullptr || buft == nullptr || graph == nullptr) { + return false; + } + bool ok = true; + int unsupported_nodes = 0; + if (!ggml_backend_supports_buft(backend, buft)) { + ok = false; + } + const int n_nodes = ggml_graph_n_nodes(graph); + for (int i = 0; i < n_nodes; ++i) { + const ggml_tensor * node = ggml_graph_node(graph, i); + if (node == nullptr) { + continue; + } + if (!ggml_backend_supports_op(backend, node)) { + ++unsupported_nodes; + ok = false; + } + } + if (!ok && log_summary) { + std::cerr << "[LingBot-MAP] GGML graph support check failed on backend=" << ggml_backend_name(backend) + << ", buffer_type=" << ggml_backend_buft_name(buft) + << ", unsupported_nodes=" << unsupported_nodes << "/" << n_nodes << "\n"; + } + return ok; +} + +static std::string read_file_to_string(const std::string & path) { + std::ifstream file(path); + if (!file.is_open()) { + return {}; + } + return std::string((std::istreambuf_iterator(file)), std::istreambuf_iterator()); +} + +static bool file_exists(const std::string & path) { + struct stat st; + return stat(path.c_str(), &st) == 0 && S_ISREG(st.st_mode); +} + +static size_t find_closing_brace(const std::string & text, size_t start_pos) { + if (start_pos == std::string::npos) { + return std::string::npos; + } + int depth = 0; + for (size_t i 
= start_pos; i < text.size(); ++i) { + if (text[i] == '{') { + ++depth; + } else if (text[i] == '}') { + --depth; + if (depth == 0) { + return i; + } + } + } + return std::string::npos; +} + +static std::string trim_ascii(std::string value) { + while (!value.empty() && std::isspace(static_cast(value.front()))) { + value.erase(value.begin()); + } + while (!value.empty() && std::isspace(static_cast(value.back()))) { + value.pop_back(); + } + return value; +} + +static std::string normalize_path(const std::string & base_dir, const std::string & path) { + const std::string trimmed = trim_ascii(path); + if (trimmed.empty()) { + return {}; + } + if (trimmed.front() == '/') { + return trimmed; + } + return base_dir + "/" + trimmed; +} + +static std::string extract_object_block(const std::string & text, const std::string & key) { + const std::string marker = "\"" + key + "\""; + const size_t key_pos = text.find(marker); + if (key_pos == std::string::npos) { + return {}; + } + const size_t brace_start = text.find('{', key_pos + marker.size()); + const size_t brace_end = find_closing_brace(text, brace_start); + if (brace_start == std::string::npos || brace_end == std::string::npos || brace_end <= brace_start) { + return {}; + } + return text.substr(brace_start, brace_end - brace_start + 1); +} + +static std::string extract_string_value(const std::string & text, const std::string & key) { + const std::string marker = "\"" + key + "\""; + const size_t key_pos = text.find(marker); + if (key_pos == std::string::npos) { + return {}; + } + const size_t colon_pos = text.find(':', key_pos + marker.size()); + if (colon_pos == std::string::npos) { + return {}; + } + const size_t first_quote = text.find('"', colon_pos + 1); + if (first_quote == std::string::npos) { + return {}; + } + const size_t second_quote = text.find('"', first_quote + 1); + if (second_quote == std::string::npos) { + return {}; + } + return text.substr(first_quote + 1, second_quote - first_quote - 1); +} + +static 
int32_t extract_int32_value(const std::string & text, const std::string & key, int32_t default_value) { + const std::string marker = "\"" + key + "\""; + const size_t key_pos = text.find(marker); + if (key_pos == std::string::npos) { + return default_value; + } + const size_t colon_pos = text.find(':', key_pos + marker.size()); + if (colon_pos == std::string::npos) { + return default_value; + } + size_t pos = colon_pos + 1; + while (pos < text.size() && std::isspace(static_cast(text[pos]))) { + ++pos; + } + size_t end = pos; + if (end < text.size() && (text[end] == '-' || text[end] == '+')) { + ++end; + } + while (end < text.size() && std::isdigit(static_cast(text[end]))) { + ++end; + } + if (end == pos) { + return default_value; + } + try { + return std::stoi(text.substr(pos, end - pos)); + } catch (...) { + return default_value; + } +} + +static void extract_float_array3(const std::string & text, const std::string & key, float values[3]) { + const std::string marker = "\"" + key + "\""; + const size_t key_pos = text.find(marker); + if (key_pos == std::string::npos) { + return; + } + const size_t bracket_start = text.find('[', key_pos + marker.size()); + const size_t bracket_end = text.find(']', bracket_start == std::string::npos ? key_pos : bracket_start + 1); + if (bracket_start == std::string::npos || bracket_end == std::string::npos || bracket_end <= bracket_start) { + return; + } + + size_t pos = bracket_start + 1; + for (int i = 0; i < 3 && pos < bracket_end; ++i) { + while (pos < bracket_end && (std::isspace(static_cast(text[pos])) || text[pos] == ',')) { + ++pos; + } + size_t end = pos; + while (end < bracket_end && text[end] != ',') { + ++end; + } + try { + values[i] = std::stof(text.substr(pos, end - pos)); + } catch (...) 
{ + return; + } + pos = end + 1; + } +} + +static bool extract_bool_value(const std::string & text, const std::string & key, bool default_value) { + const std::string marker = "\"" + key + "\""; + const size_t key_pos = text.find(marker); + if (key_pos == std::string::npos) { + return default_value; + } + const size_t colon_pos = text.find(':', key_pos + marker.size()); + if (colon_pos == std::string::npos) { + return default_value; + } + size_t pos = colon_pos + 1; + while (pos < text.size() && std::isspace(static_cast(text[pos]))) { + ++pos; + } + if (text.compare(pos, 4, "true") == 0) { + return true; + } + if (text.compare(pos, 5, "false") == 0) { + return false; + } + return default_value; +} + + +static std::unordered_map extract_string_map(const std::string & text, const std::string & key) { + std::unordered_map values; + const std::string marker = "\"" + key + "\""; + const size_t key_pos = text.find(marker); + if (key_pos == std::string::npos) { + return values; + } + const size_t brace_start = text.find('{', key_pos + marker.size()); + const size_t brace_end = find_closing_brace(text, brace_start); + if (brace_start == std::string::npos || brace_end == std::string::npos || brace_end <= brace_start) { + return values; + } + const std::string content = text.substr(brace_start + 1, brace_end - brace_start - 1); + size_t pos = 0; + while (pos < content.size()) { + while (pos < content.size() && (std::isspace(static_cast(content[pos])) || content[pos] == ',')) { + ++pos; + } + if (pos >= content.size() || content[pos] != '"') { + break; + } + const size_t key_start = pos + 1; + const size_t key_end = content.find('"', key_start); + if (key_end == std::string::npos) { + break; + } + const size_t colon = content.find(':', key_end + 1); + const size_t value_quote = content.find('"', colon == std::string::npos ? 
key_end + 1 : colon + 1); + if (colon == std::string::npos || value_quote == std::string::npos) { + break; + } + const size_t value_end = content.find('"', value_quote + 1); + if (value_end == std::string::npos) { + break; + } + values[content.substr(key_start, key_end - key_start)] = content.substr(value_quote + 1, value_end - value_quote - 1); + pos = value_end + 1; + } + return values; +} + +static void merge_missing_ep_config(std::unordered_map & dst, + const std::unordered_map & src) { + for (const auto & kv : src) { + if (dst.find(kv.first) == dst.end()) { + dst[kv.first] = kv.second; + } + } +} + +static void apply_legacy_lingbot_ep_config(const std::string & text, + std::unordered_map & ep_config) { + if (ep_config.find("SPACEMIT_EP_INTRA_THREAD_NUM") == ep_config.end()) { + ep_config["SPACEMIT_EP_INTRA_THREAD_NUM"] = std::to_string(extract_int32_value(text, "spacemit_ep_intra_thread_num", 4)); + } + if (ep_config.find("SPACEMIT_EP_INTER_THREAD_NUM") == ep_config.end()) { + ep_config["SPACEMIT_EP_INTER_THREAD_NUM"] = std::to_string(extract_int32_value(text, "spacemit_ep_inter_thread_num", 1)); + } + const std::string affinity = extract_string_value(text, "spacemit_ep_intra_thread_affinity"); + if (!affinity.empty() && ep_config.find("SPACEMIT_EP_INTRA_THREAD_AFFINITY") == ep_config.end()) { + ep_config["SPACEMIT_EP_INTRA_THREAD_AFFINITY"] = affinity; + } +} + +static std::vector extract_string_array(const std::string & text, const std::string & key) { + std::vector values; + const std::string marker = "\"" + key + "\""; + const size_t key_pos = text.find(marker); + if (key_pos == std::string::npos) { + return values; + } + const size_t bracket_start = text.find('[', key_pos + marker.size()); + const size_t bracket_end = text.find(']', bracket_start == std::string::npos ? 
key_pos : bracket_start + 1); + if (bracket_start == std::string::npos || bracket_end == std::string::npos || bracket_end <= bracket_start) { + return values; + } + size_t pos = bracket_start + 1; + while (pos < bracket_end) { + const size_t first_quote = text.find('"', pos); + if (first_quote == std::string::npos || first_quote >= bracket_end) { + break; + } + const size_t second_quote = text.find('"', first_quote + 1); + if (second_quote == std::string::npos || second_quote > bracket_end) { + break; + } + values.push_back(text.substr(first_quote + 1, second_quote - first_quote - 1)); + pos = second_quote + 1; + } + return values; +} + +static std::vector extract_int32_array(const std::string & text, const std::string & key) { + std::vector values; + const std::string marker = "\"" + key + "\""; + const size_t key_pos = text.find(marker); + if (key_pos == std::string::npos) { + return values; + } + const size_t bracket_start = text.find('[', key_pos + marker.size()); + const size_t bracket_end = text.find(']', bracket_start == std::string::npos ? 
key_pos : bracket_start + 1); + if (bracket_start == std::string::npos || bracket_end == std::string::npos || bracket_end <= bracket_start) { + return values; + } + + size_t pos = bracket_start + 1; + while (pos < bracket_end) { + while (pos < bracket_end && (std::isspace(static_cast(text[pos])) || text[pos] == ',')) { + ++pos; + } + if (pos >= bracket_end) { + break; + } + size_t end = pos; + if (end < bracket_end && (text[end] == '-' || text[end] == '+')) { + ++end; + } + while (end < bracket_end && std::isdigit(static_cast(text[end]))) { + ++end; + } + if (end == pos) { + break; + } + values.push_back((int32_t) std::stoi(text.substr(pos, end - pos))); + pos = end; + } + return values; +} + +static uint32_t require_gguf_u32(const gguf_context * ctx, const char * key) { + const int64_t id = gguf_find_key(ctx, key); + if (id < 0 || gguf_get_kv_type(ctx, id) != GGUF_TYPE_UINT32) { + throw std::runtime_error(std::string("missing GGUF uint32 metadata: ") + key); + } + return gguf_get_val_u32(ctx, id); +} + + +static bool lingbot_tensor_type_is_supported_matrix_weight(ggml_type type) { + return type == GGML_TYPE_F32 || type == GGML_TYPE_F16 || type == GGML_TYPE_BF16 || ggml_is_quantized(type); +} + +static void require_ggml_tensor_shape( + ggml_context * ctx, + const std::string & name, + int64_t ne0, + int64_t ne1 = 1, + int64_t ne2 = 1, + int64_t ne3 = 1, + bool allow_quantized_matrix_weight = false) { + const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); + if (tensor == nullptr) { + throw std::runtime_error("missing LingBot-MAP tensor: " + name); + } + const bool shape_ok = tensor->ne[0] == ne0 && tensor->ne[1] == ne1 && tensor->ne[2] == ne2 && tensor->ne[3] == ne3; + const bool type_ok = allow_quantized_matrix_weight ? 
+ lingbot_tensor_type_is_supported_matrix_weight(tensor->type) : + tensor->type == GGML_TYPE_F32; + if (!shape_ok || !type_ok) { + throw std::runtime_error( + "unexpected LingBot-MAP tensor shape/type: " + name + + " type=" + ggml_type_name(tensor->type)); + } +} + +static void validate_lingbot_map_aggregator_block_shapes(ggml_context * ctx, const lingbot_map_config & cfg) { + const int64_t c = cfg.hidden_size; + const int64_t mlp = c * 4; + const int64_t head_dim = c / 16; + if (c <= 0 || c % 16 != 0) { + throw std::runtime_error("LingBot-MAP hidden_size must be divisible by 16 attention heads"); + } + for (const auto & prefix : { + std::string("aggregator.frame_blocks.0"), + std::string("aggregator.frame_blocks.") + std::to_string(cfg.frame_block_count - 1), + std::string("aggregator.global_blocks.0"), + std::string("aggregator.global_blocks.") + std::to_string(cfg.global_block_count - 1), + }) { + require_ggml_tensor_shape(ctx, prefix + ".norm1.weight", c); + require_ggml_tensor_shape(ctx, prefix + ".norm1.bias", c); + require_ggml_tensor_shape(ctx, prefix + ".attn.qkv.weight", c, c * 3, 1, 1, true); + require_ggml_tensor_shape(ctx, prefix + ".attn.qkv.bias", c * 3); + require_ggml_tensor_shape(ctx, prefix + ".attn.q_norm.weight", head_dim); + require_ggml_tensor_shape(ctx, prefix + ".attn.q_norm.bias", head_dim); + require_ggml_tensor_shape(ctx, prefix + ".attn.k_norm.weight", head_dim); + require_ggml_tensor_shape(ctx, prefix + ".attn.k_norm.bias", head_dim); + require_ggml_tensor_shape(ctx, prefix + ".attn.proj.weight", c, c, 1, 1, true); + require_ggml_tensor_shape(ctx, prefix + ".attn.proj.bias", c); + require_ggml_tensor_shape(ctx, prefix + ".ls1.gamma", c); + require_ggml_tensor_shape(ctx, prefix + ".norm2.weight", c); + require_ggml_tensor_shape(ctx, prefix + ".norm2.bias", c); + require_ggml_tensor_shape(ctx, prefix + ".mlp.fc1.weight", c, mlp, 1, 1, true); + require_ggml_tensor_shape(ctx, prefix + ".mlp.fc1.bias", mlp); + require_ggml_tensor_shape(ctx, 
prefix + ".mlp.fc2.weight", mlp, c, 1, 1, true); + require_ggml_tensor_shape(ctx, prefix + ".mlp.fc2.bias", c); + require_ggml_tensor_shape(ctx, prefix + ".ls2.gamma", c); + } +} + + +static void validate_lingbot_map_camera_head_shapes(ggml_context * ctx, const lingbot_map_config & cfg) { + const int64_t c = cfg.camera_hidden_size; + const int64_t pose_dim = 9; + const int64_t mlp = c * 4; + if (c <= 0 || c % 16 != 0) { + throw std::runtime_error("LingBot-MAP camera_hidden_size must be divisible by 16 attention heads"); + } + if (cfg.camera_trunk_block_count <= 0 || cfg.camera_num_iterations <= 0) { + throw std::runtime_error("LingBot-MAP camera_head requires positive trunk block and iteration counts"); + } + + require_ggml_tensor_shape(ctx, "camera_head.empty_pose_tokens", pose_dim, 1, 1); + require_ggml_tensor_shape(ctx, "camera_head.token_norm.weight", c); + require_ggml_tensor_shape(ctx, "camera_head.token_norm.bias", c); + require_ggml_tensor_shape(ctx, "camera_head.trunk_norm.weight", c); + require_ggml_tensor_shape(ctx, "camera_head.trunk_norm.bias", c); + require_ggml_tensor_shape(ctx, "camera_head.embed_pose.weight", pose_dim, c); + require_ggml_tensor_shape(ctx, "camera_head.embed_pose.bias", c); + require_ggml_tensor_shape(ctx, "camera_head.poseLN_modulation.1.weight", c, c * 3, 1, 1, true); + require_ggml_tensor_shape(ctx, "camera_head.poseLN_modulation.1.bias", c * 3); + require_ggml_tensor_shape(ctx, "camera_head.pose_branch.fc1.weight", c, c / 2, 1, 1, true); + require_ggml_tensor_shape(ctx, "camera_head.pose_branch.fc1.bias", c / 2); + require_ggml_tensor_shape(ctx, "camera_head.pose_branch.fc2.weight", c / 2, pose_dim, 1, 1, true); + require_ggml_tensor_shape(ctx, "camera_head.pose_branch.fc2.bias", pose_dim); + + for (int32_t i = 0; i < cfg.camera_trunk_block_count; ++i) { + const std::string prefix = "camera_head.trunk." 
+ std::to_string(i); + require_ggml_tensor_shape(ctx, prefix + ".norm1.weight", c); + require_ggml_tensor_shape(ctx, prefix + ".norm1.bias", c); + require_ggml_tensor_shape(ctx, prefix + ".attn.qkv.weight", c, c * 3, 1, 1, true); + require_ggml_tensor_shape(ctx, prefix + ".attn.qkv.bias", c * 3); + require_ggml_tensor_shape(ctx, prefix + ".attn.proj.weight", c, c, 1, 1, true); + require_ggml_tensor_shape(ctx, prefix + ".attn.proj.bias", c); + require_ggml_tensor_shape(ctx, prefix + ".ls1.gamma", c); + require_ggml_tensor_shape(ctx, prefix + ".norm2.weight", c); + require_ggml_tensor_shape(ctx, prefix + ".norm2.bias", c); + require_ggml_tensor_shape(ctx, prefix + ".mlp.fc1.weight", c, mlp, 1, 1, true); + require_ggml_tensor_shape(ctx, prefix + ".mlp.fc1.bias", mlp); + require_ggml_tensor_shape(ctx, prefix + ".mlp.fc2.weight", mlp, c, 1, 1, true); + require_ggml_tensor_shape(ctx, prefix + ".mlp.fc2.bias", c); + require_ggml_tensor_shape(ctx, prefix + ".ls2.gamma", c); + } +} + +static std::string require_gguf_string(const gguf_context * ctx, const char * key) { + const int64_t id = gguf_find_key(ctx, key); + if (id < 0 || gguf_get_kv_type(ctx, id) != GGUF_TYPE_STRING) { + throw std::runtime_error(std::string("missing GGUF string metadata: ") + key); + } + return gguf_get_val_str(ctx, id); +} + +static lingbot_map_config load_lingbot_map_config(const std::string & config_dir) { + const std::string config_path = config_dir + "/config.json"; + const std::string content = read_file_to_string(config_path); + if (content.empty()) { + throw std::runtime_error("failed to read LingBot-MAP config: " + config_path); + } + + const std::string vision_block = extract_object_block(content, "vision_model"); + const std::string agg_block = extract_object_block(content, "aggregator_camera_model"); + const std::string depth_block = extract_object_block(content, "depth_model"); + const std::string post_block = extract_object_block(content, "postprocess"); + if (vision_block.empty() || 
agg_block.empty() || depth_block.empty()) { + throw std::runtime_error("LingBot-MAP config requires vision_model, aggregator_camera_model, and depth_model blocks"); + } + + lingbot_map_config cfg; + cfg.architectures = extract_string_array(content, "architectures"); + cfg.vision_model_path = normalize_path(config_dir, extract_string_value(vision_block, "model_path")); + cfg.aggregator_camera_model_path = normalize_path(config_dir, extract_string_value(agg_block, "model_path")); + cfg.depth_model_path = normalize_path(config_dir, extract_string_value(depth_block, "model_path")); + cfg.ep_config = extract_string_map(vision_block, "ep_config"); + merge_missing_ep_config(cfg.ep_config, extract_string_map(content, "ep_config")); + apply_legacy_lingbot_ep_config(vision_block, cfg.ep_config); + apply_legacy_lingbot_ep_config(content, cfg.ep_config); + + cfg.image_size = extract_int32_value(vision_block, "image_size", 518); + cfg.patch_size = extract_int32_value(vision_block, "patch_size", 14); + extract_float_array3(vision_block, "image_mean", cfg.image_mean); + extract_float_array3(vision_block, "image_std", cfg.image_std); + cfg.hidden_size = extract_int32_value(agg_block, "hidden_size", 0); + cfg.camera_hidden_size = extract_int32_value(agg_block, "camera_hidden_size", 0); + cfg.num_special_tokens = extract_int32_value(agg_block, "num_special_tokens", 0); + cfg.num_register_tokens = extract_int32_value(agg_block, "num_register_tokens", 0); + cfg.frame_block_count = extract_int32_value(agg_block, "frame_block_count", 0); + cfg.global_block_count = extract_int32_value(agg_block, "global_block_count", 0); + cfg.camera_trunk_block_count = extract_int32_value(agg_block, "camera_trunk_block_count", 0); + cfg.camera_num_iterations = extract_int32_value(agg_block, "camera_num_iterations", 4); + cfg.ggml_threads = extract_int32_value(agg_block, "ggml_threads", 8); + cfg.aggregator_selected_layers = extract_int32_array(agg_block, "selected_layers"); + if 
(cfg.aggregator_selected_layers.empty()) { + cfg.aggregator_selected_layers = { 4, 11, 17, 23 }; + } + + cfg.output_pose = extract_bool_value(post_block, "output_pose", true); + cfg.output_depth = extract_bool_value(post_block, "output_depth", true); + cfg.output_point_cloud = extract_bool_value(post_block, "output_point_cloud", true); + + if (cfg.architectures.empty()) { + throw std::runtime_error("LingBot-MAP config requires architectures"); + } + if (cfg.vision_model_path.empty() || cfg.aggregator_camera_model_path.empty() || cfg.depth_model_path.empty()) { + throw std::runtime_error("LingBot-MAP config contains empty model_path"); + } + for (const int32_t layer_idx : cfg.aggregator_selected_layers) { + if (layer_idx < 0 || layer_idx >= cfg.frame_block_count) { + throw std::runtime_error("LingBot-MAP aggregator selected_layers contains an invalid layer index"); + } + } + for (const auto & path : { cfg.vision_model_path, cfg.aggregator_camera_model_path, cfg.depth_model_path }) { + if (!file_exists(path)) { + throw std::runtime_error("LingBot-MAP model file not found: " + path); + } + } + return cfg; +} + +static lingbot_map_loaded_gguf load_and_validate_gguf(const lingbot_map_config & cfg) { + ggml_context * ggml_raw = nullptr; + gguf_init_params params = { + /*.no_alloc =*/ false, + /*.ctx =*/ &ggml_raw, + }; + lingbot_map_loaded_gguf loaded; + loaded.gguf.reset(gguf_init_from_file(cfg.aggregator_camera_model_path.c_str(), params)); + loaded.ggml.reset(ggml_raw); + if (!loaded.gguf || !loaded.ggml) { + throw std::runtime_error("failed to open LingBot-MAP GGUF: " + cfg.aggregator_camera_model_path); + } + + const gguf_context * gguf = loaded.gguf.get(); + const std::string arch = require_gguf_string(gguf, "general.architecture"); + if (arch != "lingbot-map") { + throw std::runtime_error("expected LingBot-MAP GGUF architecture 'lingbot-map', got '" + arch + "'"); + } + const std::string component = require_gguf_string(gguf, "lingbot-map.component"); + if 
(component != "aggregator_camera_head") { + throw std::runtime_error("unsupported LingBot-MAP GGUF component: " + component); + } + + const uint32_t file_type = require_gguf_u32(gguf, "general.file_type"); + (void) file_type; + + const uint32_t embed_dim = require_gguf_u32(gguf, "lingbot-map.embed_dim"); + const uint32_t camera_dim = require_gguf_u32(gguf, "lingbot-map.camera_dim"); + const uint32_t special_tokens = require_gguf_u32(gguf, "lingbot-map.num_special_tokens"); + const uint32_t frame_blocks = require_gguf_u32(gguf, "lingbot-map.aggregator_frame_block_count"); + const uint32_t global_blocks = require_gguf_u32(gguf, "lingbot-map.aggregator_global_block_count"); + const uint32_t camera_blocks = require_gguf_u32(gguf, "lingbot-map.camera_trunk_block_count"); + + if ((uint32_t) cfg.hidden_size != embed_dim || (uint32_t) cfg.camera_hidden_size != camera_dim || + (uint32_t) cfg.num_special_tokens != special_tokens || (uint32_t) cfg.frame_block_count != frame_blocks || + (uint32_t) cfg.global_block_count != global_blocks || (uint32_t) cfg.camera_trunk_block_count != camera_blocks) { + throw std::runtime_error("LingBot-MAP config.json does not match GGUF metadata"); + } + + if (gguf_find_tensor(gguf, "aggregator.camera_token") < 0 || + gguf_find_tensor(gguf, "camera_head.pose_branch.fc2.bias") < 0 || + ggml_get_tensor(loaded.ggml.get(), "aggregator.camera_token") == nullptr || + ggml_get_tensor(loaded.ggml.get(), "camera_head.pose_branch.fc2.bias") == nullptr) { + throw std::runtime_error("LingBot-MAP GGUF is missing required boundary tensors"); + } + validate_lingbot_map_aggregator_block_shapes(loaded.ggml.get(), cfg); + validate_lingbot_map_camera_head_shapes(loaded.ggml.get(), cfg); + + return loaded; +} + +static const float * lingbot_tensor_f32_data(const ggml_tensor * tensor, const std::string & name) { + if (tensor == nullptr) { + throw std::runtime_error("missing LingBot-MAP tensor: " + name); + } + if (tensor->type != GGML_TYPE_F32 || tensor->data == 
nullptr) { + throw std::runtime_error("LingBot-MAP tensor must be loaded as F32: " + name); + } + return static_cast(tensor->data); +} + +} // namespace + +struct lingbot_map_context::impl { + lingbot_map_config config; + gguf_context_ptr gguf; + ggml_context_ptr ggml; + std::string arch_name; + + ggml_backend_ptr runtime_backend; + ggml_backend_buffer_type_t runtime_buft = nullptr; + lingbot_map_runtime_weights runtime_weights; + bool runtime_initialized = false; + bool runtime_prefer_smt = true; +}; + +lingbot_map_context::~lingbot_map_context() = default; + +std::unique_ptr lingbot_map_context::create(const std::string & config_dir) { + auto ctx = std::unique_ptr(new lingbot_map_context()); + ctx->pimpl_ = std::make_unique(); + ctx->pimpl_->config = load_lingbot_map_config(config_dir); + auto loaded = load_and_validate_gguf(ctx->pimpl_->config); + ctx->pimpl_->gguf = std::move(loaded.gguf); + ctx->pimpl_->ggml = std::move(loaded.ggml); + ctx->pimpl_->arch_name = ctx->pimpl_->config.architectures.empty() ? 
std::string() : ctx->pimpl_->config.architectures[0]; + + std::cerr << "[LingBot-MAP] loaded config and GGUF: " << ctx->pimpl_->config.aggregator_camera_model_path + << ", tensors=" << gguf_get_n_tensors(ctx->pimpl_->gguf.get()) << "\n"; + return ctx; +} + +const lingbot_map_config & lingbot_map_context::config() const { + return pimpl_->config; +} + +const std::string & lingbot_map_context::architecture() const { + return pimpl_->arch_name; +} + +int64_t lingbot_map_context::tensor_count() const { + return gguf_get_n_tensors(pimpl_->gguf.get()); +} + +ggml_context * lingbot_map_context::ggml_ctx() const { + return pimpl_->ggml.get(); +} + +const ggml_tensor * lingbot_map_context::tensor(const std::string & name) const { + if (pimpl_->ggml == nullptr) { + return nullptr; + } + return ggml_get_tensor(pimpl_->ggml.get(), name.c_str()); +} + + +lingbot_map_aggregator_input lingbot_map_context::build_aggregator_input( + const float * vit_tokens, + int32_t n_frames, + int32_t vit_tokens_per_frame, + int32_t hidden_size, + int32_t image_h, + int32_t image_w, + int32_t num_frame_for_scale) const { + if (vit_tokens == nullptr) { + throw std::invalid_argument("LingBot-MAP aggregator input requires ViT tokens"); + } + const auto & cfg = config(); + if (n_frames <= 0 || vit_tokens_per_frame <= 0 || hidden_size != cfg.hidden_size) { + throw std::invalid_argument("Invalid LingBot-MAP ViT token shape for aggregator"); + } + if (image_h <= 0 || image_w <= 0 || cfg.patch_size <= 0) { + throw std::invalid_argument("Invalid LingBot-MAP image dimensions for aggregator"); + } + + const int32_t patch_h = image_h / cfg.patch_size; + const int32_t patch_w = image_w / cfg.patch_size; + const int32_t patch_tokens = patch_h * patch_w; + if (patch_tokens <= 0 || vit_tokens_per_frame < patch_tokens) { + throw std::invalid_argument("LingBot-MAP ViT output does not contain enough patch tokens for aggregator"); + } + + const int32_t vit_prefix_tokens = vit_tokens_per_frame - patch_tokens; + 
const int32_t patch_start_idx = 1 + cfg.num_register_tokens + 1; + if (patch_start_idx != cfg.num_special_tokens) { + throw std::runtime_error("LingBot-MAP special token metadata is inconsistent"); + } + + const ggml_tensor * camera_tensor = tensor("aggregator.camera_token"); + const ggml_tensor * register_tensor = tensor("aggregator.register_token"); + const ggml_tensor * scale_tensor = tensor("aggregator.scale_token"); + const float * camera_token = lingbot_tensor_f32_data(camera_tensor, "aggregator.camera_token"); + const float * register_token = lingbot_tensor_f32_data(register_tensor, "aggregator.register_token"); + const float * scale_token = lingbot_tensor_f32_data(scale_tensor, "aggregator.scale_token"); + + if (camera_tensor->ne[0] != hidden_size || camera_tensor->ne[1] != 1 || camera_tensor->ne[2] != 2 || + register_tensor->ne[0] != hidden_size || register_tensor->ne[1] != cfg.num_register_tokens || register_tensor->ne[2] != 2 || + scale_tensor->ne[0] != hidden_size || scale_tensor->ne[1] != 1 || scale_tensor->ne[2] != 2) { + throw std::runtime_error("LingBot-MAP special token tensor shapes do not match config"); + } + + lingbot_map_aggregator_input out; + out.n_frames = n_frames; + out.hidden_size = hidden_size; + out.vit_tokens_per_frame = vit_tokens_per_frame; + out.vit_prefix_tokens = vit_prefix_tokens; + out.patch_tokens = patch_tokens; + out.patch_start_idx = patch_start_idx; + out.tokens_per_frame = patch_start_idx + patch_tokens; + out.tokens.resize((size_t) n_frames * (size_t) out.tokens_per_frame * (size_t) hidden_size); + + const int32_t scale_frames = std::max(1, std::min(num_frame_for_scale, n_frames)); + auto copy_token_variant = [&](const float * token_base, int32_t variant, int32_t n_token, int32_t frame, int32_t dst_token) { + const size_t src_base = ((size_t) variant * (size_t) n_token) * (size_t) hidden_size; + const size_t dst_base = ((size_t) frame * (size_t) out.tokens_per_frame + (size_t) dst_token) * (size_t) hidden_size; + 
std::copy(token_base + src_base, token_base + src_base + (size_t) n_token * (size_t) hidden_size, + out.tokens.data() + dst_base); + }; + + for (int32_t f = 0; f < n_frames; ++f) { + const int32_t camera_variant = f == 0 ? 0 : 1; + const int32_t register_variant = f == 0 ? 0 : 1; + const int32_t scale_variant = f < scale_frames ? 0 : 1; + + copy_token_variant(camera_token, camera_variant, 1, f, 0); + copy_token_variant(register_token, register_variant, cfg.num_register_tokens, f, 1); + copy_token_variant(scale_token, scale_variant, 1, f, 1 + cfg.num_register_tokens); + + const float * vit_frame = vit_tokens + (size_t) f * (size_t) vit_tokens_per_frame * (size_t) hidden_size; + const float * patch_src = vit_frame + (size_t) vit_prefix_tokens * (size_t) hidden_size; + float * patch_dst = out.tokens.data() + ((size_t) f * (size_t) out.tokens_per_frame + (size_t) patch_start_idx) * (size_t) hidden_size; + std::copy(patch_src, patch_src + (size_t) patch_tokens * (size_t) hidden_size, patch_dst); + } + + return out; +} + + +static ggml_tensor * lingbot_require_tensor(ggml_context * ctx, const std::string & name) { + ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); + if (tensor == nullptr) { + throw std::runtime_error("missing LingBot-MAP tensor: " + name); + } + return tensor; +} + +static ggml_tensor * lingbot_layer_norm( + ggml_context * ctx, + ggml_tensor * input, + ggml_tensor * weight, + ggml_tensor * bias, + float eps) { + ggml_tensor * cur = ggml_norm(ctx, input, eps); + cur = ggml_mul(ctx, cur, weight); + cur = ggml_add(ctx, cur, bias); + return cur; +} + +static ggml_tensor * lingbot_linear( + ggml_context * ctx, + ggml_tensor * input, + ggml_tensor * weight, + ggml_tensor * bias) { + ggml_tensor * cur = ggml_mul_mat(ctx, weight, input); + if (bias != nullptr) { + cur = ggml_add(ctx, cur, bias); + } + return cur; +} + +static ggml_tensor * lingbot_mlp_gelu( + ggml_context * ctx, + ggml_tensor * input, + ggml_tensor * fc1_w, + ggml_tensor * fc1_b, + 
// Multi-head self-attention with per-head LayerNorm on Q and K.
//
// qkv:         fused projection output; Q, K and V are consecutive
//              hidden_size-wide slices of dimension 0.
// q_norm_*/k_norm_*: LayerNorm weight/bias applied to each Q/K head; their
//              length defines head_dim, and hidden_size / head_dim gives the
//              number of heads.
// proj_w/proj_b: output projection applied to the attention result.
//
// Throws std::runtime_error when the norm tensors disagree with each other or
// head_dim does not divide hidden_size.
static ggml_tensor * lingbot_frame_self_attention(
        ggml_context * ctx,
        ggml_tensor * qkv,
        ggml_tensor * q_norm_w,
        ggml_tensor * q_norm_b,
        ggml_tensor * k_norm_w,
        ggml_tensor * k_norm_b,
        ggml_tensor * proj_w,
        ggml_tensor * proj_b,
        int64_t hidden_size) {
    const int64_t head_dim = q_norm_w->ne[0];
    const bool norms_consistent =
        head_dim > 0 &&
        hidden_size % head_dim == 0 &&
        k_norm_w->ne[0] == head_dim &&
        q_norm_b->ne[0] == head_dim &&
        k_norm_b->ne[0] == head_dim;
    if (!norms_consistent) {
        throw std::runtime_error("LingBot-MAP q/k norm shapes do not match hidden size");
    }
    const int64_t n_heads = hidden_size / head_dim;

    // Index 0 = Q, 1 = K, 2 = V.
    ggml_tensor * parts[3];
    for (int i = 0; i < 3; ++i) {
        parts[i] = lingbot_qkv_view(ctx, qkv, hidden_size, i);
    }
    for (ggml_tensor *& part : parts) {
        part = lingbot_head_view(ctx, part, head_dim, n_heads);
    }

    // Only Q and K are normalised; V is used as-is.
    parts[0] = lingbot_layer_norm(ctx, parts[0], q_norm_w, q_norm_b, 1e-6f);
    parts[1] = lingbot_layer_norm(ctx, parts[1], k_norm_w, k_norm_b, 1e-6f);

    // Swap dimensions 1 and 2 of each head view before attention.
    for (ggml_tensor *& part : parts) {
        part = ggml_permute(ctx, part, 0, 2, 1, 3);
    }

    const float softmax_scale = 1.0f / std::sqrt((float) head_dim);
    // No mask; the two trailing 0.0f arguments are max_bias and logit_softcap.
    ggml_tensor * attended = ggml_flash_attn_ext(ctx, parts[0], parts[1], parts[2], nullptr,
                                                 softmax_scale, 0.0f, 0.0f);
    ggml_flash_attn_ext_set_prec(attended, GGML_PREC_F32);
    attended = ggml_cont_3d(ctx, attended, hidden_size, qkv->ne[1], qkv->ne[2]);

    return lingbot_linear(ctx, attended, proj_w, proj_b);
}
* fc2_b = lingbot_require_tensor(weights_ctx, prefix + ".mlp.fc2.bias"); + ggml_tensor * ls2 = lingbot_require_tensor(weights_ctx, prefix + ".ls2.gamma"); + + ggml_tensor * normed = lingbot_layer_norm(ctx, x, norm1_w, norm1_b, 1e-6f); + ggml_tensor * qkv = lingbot_linear(ctx, normed, qkv_w, qkv_b); + ggml_set_name(qkv, (graph_name + ".qkv").c_str()); + if (qkv_out != nullptr) { + *qkv_out = qkv; + } + + ggml_tensor * attn = lingbot_frame_self_attention(ctx, qkv, q_norm_w, q_norm_b, k_norm_w, k_norm_b, + proj_w, proj_b, cfg.hidden_size); + attn = ggml_mul(ctx, attn, ls1); + ggml_tensor * attn_out = ggml_add(ctx, x, attn); + ggml_set_name(attn_out, (graph_name + ".attn_output").c_str()); + + ggml_tensor * ffn_inp = lingbot_layer_norm(ctx, attn_out, norm2_w, norm2_b, 1e-6f); + ggml_tensor * ffn = lingbot_mlp_gelu(ctx, ffn_inp, fc1_w, fc1_b, fc2_w, fc2_b); + ffn = ggml_mul(ctx, ffn, ls2); + ggml_tensor * out = ggml_add(ctx, attn_out, ffn); + ggml_set_name(out, (graph_name + ".output").c_str()); + return out; +} + + +static ggml_tensor * lingbot_camera_self_attention( + ggml_context * ctx, + ggml_tensor * qkv, + ggml_tensor * proj_w, + ggml_tensor * proj_b, + int64_t hidden_size) { + const int64_t n_heads = 16; + if (hidden_size <= 0 || hidden_size % n_heads != 0) { + throw std::runtime_error("LingBot-MAP camera hidden size must be divisible by 16 attention heads"); + } + const int64_t head_dim = hidden_size / n_heads; + + ggml_tensor * q = lingbot_qkv_view(ctx, qkv, hidden_size, 0); + ggml_tensor * k = lingbot_qkv_view(ctx, qkv, hidden_size, 1); + ggml_tensor * v = lingbot_qkv_view(ctx, qkv, hidden_size, 2); + + q = lingbot_head_view(ctx, q, head_dim, n_heads); + k = lingbot_head_view(ctx, k, head_dim, n_heads); + v = lingbot_head_view(ctx, v, head_dim, n_heads); + + q = ggml_permute(ctx, q, 0, 2, 1, 3); + k = ggml_permute(ctx, k, 0, 2, 1, 3); + v = ggml_permute(ctx, v, 0, 2, 1, 3); + + ggml_tensor * attn = ggml_flash_attn_ext(ctx, q, k, v, nullptr, 1.0f / 
std::sqrt((float) head_dim), 0.0f, 0.0f); + ggml_flash_attn_ext_set_prec(attn, GGML_PREC_F32); + attn = ggml_cont_3d(ctx, attn, hidden_size, qkv->ne[1], qkv->ne[2]); + + return lingbot_linear(ctx, attn, proj_w, proj_b); +} + +static ggml_tensor * lingbot_apply_camera_trunk_block( + ggml_context * ctx, + ggml_context * weights_ctx, + const lingbot_map_config & cfg, + ggml_tensor * x, + const std::string & prefix, + const std::string & graph_name) { + ggml_tensor * norm1_w = lingbot_require_tensor(weights_ctx, prefix + ".norm1.weight"); + ggml_tensor * norm1_b = lingbot_require_tensor(weights_ctx, prefix + ".norm1.bias"); + ggml_tensor * qkv_w = lingbot_require_tensor(weights_ctx, prefix + ".attn.qkv.weight"); + ggml_tensor * qkv_b = lingbot_require_tensor(weights_ctx, prefix + ".attn.qkv.bias"); + ggml_tensor * proj_w = lingbot_require_tensor(weights_ctx, prefix + ".attn.proj.weight"); + ggml_tensor * proj_b = lingbot_require_tensor(weights_ctx, prefix + ".attn.proj.bias"); + ggml_tensor * ls1 = lingbot_require_tensor(weights_ctx, prefix + ".ls1.gamma"); + ggml_tensor * norm2_w = lingbot_require_tensor(weights_ctx, prefix + ".norm2.weight"); + ggml_tensor * norm2_b = lingbot_require_tensor(weights_ctx, prefix + ".norm2.bias"); + ggml_tensor * fc1_w = lingbot_require_tensor(weights_ctx, prefix + ".mlp.fc1.weight"); + ggml_tensor * fc1_b = lingbot_require_tensor(weights_ctx, prefix + ".mlp.fc1.bias"); + ggml_tensor * fc2_w = lingbot_require_tensor(weights_ctx, prefix + ".mlp.fc2.weight"); + ggml_tensor * fc2_b = lingbot_require_tensor(weights_ctx, prefix + ".mlp.fc2.bias"); + ggml_tensor * ls2 = lingbot_require_tensor(weights_ctx, prefix + ".ls2.gamma"); + + ggml_tensor * normed = lingbot_layer_norm(ctx, x, norm1_w, norm1_b, 1e-6f); + ggml_tensor * qkv = lingbot_linear(ctx, normed, qkv_w, qkv_b); + ggml_set_name(qkv, (graph_name + ".qkv").c_str()); + + ggml_tensor * attn = lingbot_camera_self_attention(ctx, qkv, proj_w, proj_b, cfg.camera_hidden_size); + attn = 
ggml_mul(ctx, attn, ls1); + ggml_tensor * attn_out = ggml_add(ctx, x, attn); + ggml_set_name(attn_out, (graph_name + ".attn_output").c_str()); + + ggml_tensor * ffn_inp = lingbot_layer_norm(ctx, attn_out, norm2_w, norm2_b, 1e-6f); + ggml_tensor * ffn = lingbot_mlp_gelu(ctx, ffn_inp, fc1_w, fc1_b, fc2_w, fc2_b); + ffn = ggml_mul(ctx, ffn, ls2); + ggml_tensor * out = ggml_add(ctx, attn_out, ffn); + ggml_set_name(out, (graph_name + ".output").c_str()); + return out; +} + +static ggml_tensor * lingbot_pose_branch( + ggml_context * ctx, + ggml_context * weights_ctx, + ggml_tensor * x) { + ggml_tensor * fc1_w = lingbot_require_tensor(weights_ctx, "camera_head.pose_branch.fc1.weight"); + ggml_tensor * fc1_b = lingbot_require_tensor(weights_ctx, "camera_head.pose_branch.fc1.bias"); + ggml_tensor * fc2_w = lingbot_require_tensor(weights_ctx, "camera_head.pose_branch.fc2.weight"); + ggml_tensor * fc2_b = lingbot_require_tensor(weights_ctx, "camera_head.pose_branch.fc2.bias"); + return lingbot_mlp_gelu(ctx, x, fc1_w, fc1_b, fc2_w, fc2_b); +} + + +static ggml_tensor * lingbot_activate_pose( + ggml_context * ctx, + ggml_tensor * pred_pose) { + if (pred_pose->ne[0] != 9) { + throw std::runtime_error("LingBot-MAP camera_head pose activation expects 9 pose channels"); + } + ggml_tensor * trans_quat = ggml_view_3d(ctx, pred_pose, 7, pred_pose->ne[1], pred_pose->ne[2], + pred_pose->nb[1], pred_pose->nb[2], 0); + ggml_tensor * fov = ggml_view_3d(ctx, pred_pose, 2, pred_pose->ne[1], pred_pose->ne[2], + pred_pose->nb[1], pred_pose->nb[2], + 7 * ggml_type_size(pred_pose->type)); + fov = ggml_relu(ctx, fov); + return ggml_concat(ctx, trans_quat, fov, 0); +} + +static ggml_backend_buffer_type_t lingbot_select_cpu_buffer_type( + ggml_backend_t backend, + bool prefer_smt) { + ggml_backend_buffer_type_t default_buft = ggml_backend_get_default_buffer_type(backend); + if (!prefer_smt) { + return default_buft; + } + + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + 
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + auto * get_extra_bufts = (ggml_backend_dev_get_extra_bufts_t) + ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts"); + if (get_extra_bufts == nullptr) { + return default_buft; + } + + ggml_backend_buffer_type_t * extra_bufts = get_extra_bufts(dev); + if (extra_bufts == nullptr) { + return default_buft; + } + for (int i = 0; extra_bufts[i] != nullptr; ++i) { + const char * name = ggml_backend_buft_name(extra_bufts[i]); + if (name != nullptr && std::strstr(name, "SPACEMIT") != nullptr && + ggml_backend_supports_buft(backend, extra_bufts[i])) { + return extra_bufts[i]; + } + } + return default_buft; +} + +static lingbot_map_runtime_weights lingbot_load_runtime_weights( + const lingbot_map_config & cfg, + ggml_backend_buffer_type_t buft) { + ggml_context * ggml_raw = nullptr; + gguf_init_params params = { + /*.no_alloc =*/ true, + /*.ctx =*/ &ggml_raw, + }; + + lingbot_map_runtime_weights weights; + weights.gguf.reset(gguf_init_from_file(cfg.aggregator_camera_model_path.c_str(), params)); + weights.ggml.reset(ggml_raw); + if (!weights.gguf || !weights.ggml) { + throw std::runtime_error("failed to open LingBot-MAP GGUF for runtime: " + cfg.aggregator_camera_model_path); + } + validate_lingbot_map_aggregator_block_shapes(weights.ggml.get(), cfg); + validate_lingbot_map_camera_head_shapes(weights.ggml.get(), cfg); + + weights.buffer.reset(ggml_backend_alloc_ctx_tensors_from_buft(weights.ggml.get(), buft)); + if (!weights.buffer) { + throw std::runtime_error(std::string("failed to allocate LingBot-MAP runtime weights on buffer type: ") + + ggml_backend_buft_name(buft)); + } + ggml_backend_buffer_set_usage(weights.buffer.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + + std::ifstream fin(cfg.aggregator_camera_model_path, std::ios::binary); + if (!fin.is_open()) { + throw std::runtime_error("failed to read LingBot-MAP GGUF weights: " + cfg.aggregator_camera_model_path); + } + + std::vector 
read_buf; + const int64_t n_tensors = gguf_get_n_tensors(weights.gguf.get()); + for (int64_t i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(weights.gguf.get(), i); + ggml_tensor * tensor = ggml_get_tensor(weights.ggml.get(), name); + if (tensor == nullptr) { + throw std::runtime_error(std::string("missing LingBot-MAP runtime tensor: ") + name); + } + const size_t offset = gguf_get_data_offset(weights.gguf.get()) + gguf_get_tensor_offset(weights.gguf.get(), i); + const size_t nbytes = ggml_nbytes(tensor); + fin.seekg((std::streamoff) offset, std::ios::beg); + if (!fin) { + throw std::runtime_error(std::string("failed to seek LingBot-MAP runtime tensor: ") + name); + } + if (ggml_backend_buft_is_host(buft)) { + fin.read(reinterpret_cast(tensor->data), (std::streamsize) nbytes); + } else { + read_buf.resize(nbytes); + fin.read(reinterpret_cast(read_buf.data()), (std::streamsize) nbytes); + ggml_backend_tensor_set(tensor, read_buf.data(), 0, nbytes); + } + if (!fin) { + throw std::runtime_error(std::string("failed to load LingBot-MAP runtime tensor: ") + name); + } + } + return weights; +} + +static lingbot_map_runtime_graph lingbot_build_aggregator_camera_runtime_graph( + ggml_context * ctx, + ggml_context * weights_ctx, + const lingbot_map_config & cfg, + const lingbot_map_aggregator_input & input) { + ggml_tensor * tokens = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, cfg.hidden_size, + input.tokens_per_frame, input.n_frames); + ggml_set_name(tokens, "lingbot_map.runtime.aggregator.input"); + ggml_set_input(tokens); + + lingbot_map_runtime_graph built; + built.input_tokens = tokens; + built.selected_outputs.reserve(cfg.aggregator_selected_layers.size()); + built.iteration_poses.reserve(cfg.camera_num_iterations); + + ggml_tensor * frame_tokens = nullptr; + ggml_tensor * global_tokens = nullptr; + ggml_tensor * global_as_frame = nullptr; + + for (int32_t i = 0; i < cfg.frame_block_count; ++i) { + frame_tokens = lingbot_apply_aggregator_block(ctx, 
weights_ctx, cfg, tokens, + "aggregator.frame_blocks." + std::to_string(i), + "lingbot_map.runtime.aggregator.frame." + std::to_string(i), + nullptr); + global_tokens = ggml_reshape_3d(ctx, frame_tokens, cfg.hidden_size, + (int64_t) input.tokens_per_frame * input.n_frames, 1); + global_tokens = lingbot_apply_aggregator_block(ctx, weights_ctx, cfg, global_tokens, + "aggregator.global_blocks." + std::to_string(i), + "lingbot_map.runtime.aggregator.global." + std::to_string(i), + nullptr); + global_as_frame = ggml_reshape_3d(ctx, global_tokens, cfg.hidden_size, input.tokens_per_frame, input.n_frames); + + if (std::find(cfg.aggregator_selected_layers.begin(), cfg.aggregator_selected_layers.end(), i) != + cfg.aggregator_selected_layers.end()) { + ggml_tensor * selected = ggml_concat(ctx, frame_tokens, global_as_frame, 0); + ggml_set_name(selected, ("lingbot_map.runtime.aggregator.selected." + std::to_string(i)).c_str()); + built.selected_outputs.push_back(selected); + if (i == cfg.frame_block_count - 1) { + built.camera_head_input = selected; + } + } + + tokens = global_as_frame; + } + + if (built.camera_head_input == nullptr) { + built.camera_head_input = ggml_concat(ctx, frame_tokens, global_as_frame, 0); + ggml_set_name(built.camera_head_input, "lingbot_map.runtime.aggregator.camera_head_input"); + } + if (built.camera_head_input->ne[0] != cfg.camera_hidden_size) { + throw std::runtime_error("LingBot-MAP runtime camera_head input width does not match camera_hidden_size"); + } + + const int64_t pose_dim = 9; + ggml_tensor * pose_tokens = ggml_view_3d(ctx, built.camera_head_input, + cfg.camera_hidden_size, input.n_frames, 1, + built.camera_head_input->nb[2], + (size_t) built.camera_head_input->nb[2] * (size_t) input.n_frames, + 0); + pose_tokens = lingbot_layer_norm(ctx, pose_tokens, + lingbot_require_tensor(weights_ctx, "camera_head.token_norm.weight"), + lingbot_require_tensor(weights_ctx, "camera_head.token_norm.bias"), + 1e-6f); + ggml_set_name(pose_tokens, 
"lingbot_map.runtime.camera_head.pose_tokens"); + + ggml_tensor * empty_pose = lingbot_require_tensor(weights_ctx, "camera_head.empty_pose_tokens"); + ggml_tensor * pred_pose = nullptr; + + for (int32_t iter = 0; iter < cfg.camera_num_iterations; ++iter) { + ggml_tensor * module_input = nullptr; + if (pred_pose == nullptr) { + ggml_tensor * empty_pose_target = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, pose_dim, input.n_frames, 1); + module_input = ggml_repeat(ctx, empty_pose, empty_pose_target); + } else { + module_input = pred_pose; + } + + module_input = lingbot_linear(ctx, module_input, + lingbot_require_tensor(weights_ctx, "camera_head.embed_pose.weight"), + lingbot_require_tensor(weights_ctx, "camera_head.embed_pose.bias")); + + ggml_tensor * modulation = lingbot_linear(ctx, + ggml_silu(ctx, module_input), + lingbot_require_tensor(weights_ctx, "camera_head.poseLN_modulation.1.weight"), + lingbot_require_tensor(weights_ctx, "camera_head.poseLN_modulation.1.bias")); + ggml_tensor * shift = ggml_view_3d(ctx, modulation, cfg.camera_hidden_size, input.n_frames, 1, + modulation->nb[1], modulation->nb[2], 0); + ggml_tensor * scale = ggml_view_3d(ctx, modulation, cfg.camera_hidden_size, input.n_frames, 1, + modulation->nb[1], modulation->nb[2], + (size_t) cfg.camera_hidden_size * ggml_type_size(modulation->type)); + ggml_tensor * gate = ggml_view_3d(ctx, modulation, cfg.camera_hidden_size, input.n_frames, 1, + modulation->nb[1], modulation->nb[2], + (size_t) cfg.camera_hidden_size * 2 * ggml_type_size(modulation->type)); + + ggml_tensor * adaln = ggml_norm(ctx, pose_tokens, 1e-6f); + ggml_tensor * scale_cont = ggml_cont(ctx, scale); + ggml_tensor * modulated = ggml_mul(ctx, adaln, ggml_scale_bias(ctx, scale_cont, 1.0f, 1.0f)); + modulated = ggml_add(ctx, modulated, shift); + modulated = ggml_mul(ctx, modulated, gate); + modulated = ggml_add(ctx, modulated, pose_tokens); + + for (int32_t block = 0; block < cfg.camera_trunk_block_count; ++block) { + modulated = 
lingbot_apply_camera_trunk_block(ctx, weights_ctx, cfg, modulated, + "camera_head.trunk." + std::to_string(block), + "lingbot_map.runtime.camera_head.iter." + std::to_string(iter) + + ".trunk." + std::to_string(block)); + } + + ggml_tensor * trunk_norm = lingbot_layer_norm(ctx, modulated, + lingbot_require_tensor(weights_ctx, "camera_head.trunk_norm.weight"), + lingbot_require_tensor(weights_ctx, "camera_head.trunk_norm.bias"), + 1e-6f); + ggml_tensor * delta = lingbot_pose_branch(ctx, weights_ctx, trunk_norm); + pred_pose = pred_pose == nullptr ? delta : ggml_add(ctx, pred_pose, delta); + ggml_set_name(pred_pose, ("lingbot_map.runtime.camera_head.pose_iter." + std::to_string(iter)).c_str()); + ggml_tensor * activated_pose = lingbot_activate_pose(ctx, pred_pose); + ggml_set_name(activated_pose, ("lingbot_map.runtime.camera_head.activated_pose_iter." + std::to_string(iter)).c_str()); + built.iteration_poses.push_back(activated_pose); + built.final_pose = activated_pose; + } + + built.graph = ggml_new_graph_custom(ctx, 32768, false); + for (size_t i = 0; i < built.selected_outputs.size(); ++i) { + ggml_tensor * selected = ggml_cont(ctx, built.selected_outputs[i]); + ggml_set_name(selected, ("lingbot_map.runtime.aggregator.selected_output." 
+ std::to_string(i)).c_str()); + built.selected_outputs[i] = selected; + ggml_build_forward_expand(built.graph, selected); + } + built.final_pose = ggml_cont(ctx, built.final_pose); + ggml_set_name(built.final_pose, "lingbot_map.runtime.camera_head.final_pose_output"); + ggml_build_forward_expand(built.graph, built.final_pose); + return built; +} + +static lingbot_map_graph_probe_result lingbot_build_aggregator_block_probe( + ggml_context * weights_ctx, + const lingbot_map_config & cfg, + const lingbot_map_aggregator_input & input, + const std::string & block_prefix, + const std::string & graph_name, + bool flatten_frames) { + if (input.n_frames <= 0 || input.tokens_per_frame <= 0 || input.hidden_size != cfg.hidden_size) { + throw std::invalid_argument("Invalid LingBot-MAP aggregator input for block probe"); + } + + const size_t mem_size = 64ull * 1024ull * 1024ull; + ggml_init_params params = { + /*.mem_size =*/ mem_size, + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + ggml_context_ptr compute_ctx(ggml_init(params)); + if (!compute_ctx) { + throw std::runtime_error("failed to create LingBot-MAP aggregator probe ggml context"); + } + + const int64_t n_seq_tokens = flatten_frames ? (int64_t) input.tokens_per_frame * input.n_frames : input.tokens_per_frame; + const int64_t n_batches = flatten_frames ? 
1 : input.n_frames; + + ggml_context * ctx = compute_ctx.get(); + ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, cfg.hidden_size, n_seq_tokens, n_batches); + ggml_set_name(x, (graph_name + ".input").c_str()); + ggml_set_input(x); + + ggml_tensor * qkv = nullptr; + ggml_tensor * out = lingbot_apply_aggregator_block(ctx, weights_ctx, cfg, x, block_prefix, graph_name, &qkv); + + ggml_cgraph * graph = ggml_new_graph_custom(ctx, 256, false); + ggml_build_forward_expand(graph, qkv); + ggml_build_forward_expand(graph, out); + + lingbot_map_graph_probe_result result; + result.graph_nodes = ggml_graph_n_nodes(graph); + result.input_tokens_per_frame = (int32_t) n_seq_tokens; + for (int i = 0; i < 4; ++i) { + result.output_ne[i] = (int32_t) out->ne[i]; + result.qkv_ne[i] = (int32_t) qkv->ne[i]; + } + return result; +} + +lingbot_map_graph_probe_result lingbot_map_context::build_aggregator_frame_block_probe( + const lingbot_map_aggregator_input & input, + int32_t block_index) const { + const auto & cfg = config(); + if (block_index < 0 || block_index >= cfg.frame_block_count) { + throw std::invalid_argument("Invalid LingBot-MAP frame block index"); + } + return lingbot_build_aggregator_block_probe(pimpl_->ggml.get(), cfg, input, + "aggregator.frame_blocks." + std::to_string(block_index), + "lingbot_map.aggregator.frame_probe", + /* flatten_frames */ false); +} + +lingbot_map_graph_probe_result lingbot_map_context::build_aggregator_global_block_probe( + const lingbot_map_aggregator_input & input, + int32_t block_index) const { + const auto & cfg = config(); + if (block_index < 0 || block_index >= cfg.global_block_count) { + throw std::invalid_argument("Invalid LingBot-MAP global block index"); + } + return lingbot_build_aggregator_block_probe(pimpl_->ggml.get(), cfg, input, + "aggregator.global_blocks." 
+ std::to_string(block_index), + "lingbot_map.aggregator.global_probe", + /* flatten_frames */ true); +} + +lingbot_map_aggregator_probe_result lingbot_map_context::build_aggregator_block_probes( + const lingbot_map_aggregator_input & input, + int32_t block_index) const { + lingbot_map_aggregator_probe_result result; + result.frame = build_aggregator_frame_block_probe(input, block_index); + result.global = build_aggregator_global_block_probe(input, block_index); + return result; +} + +lingbot_map_full_aggregator_probe_result lingbot_map_context::build_full_aggregator_probe( + const lingbot_map_aggregator_input & input) const { + const auto & cfg = config(); + if (input.n_frames <= 0 || input.tokens_per_frame <= 0 || input.hidden_size != cfg.hidden_size) { + throw std::invalid_argument("Invalid LingBot-MAP aggregator input for full probe"); + } + if (cfg.frame_block_count <= 0 || cfg.global_block_count <= 0 || cfg.frame_block_count != cfg.global_block_count) { + throw std::runtime_error("LingBot-MAP full aggregator probe requires matching frame/global block counts"); + } + + const size_t mem_size = 256ull * 1024ull * 1024ull; + ggml_init_params params = { + /*.mem_size =*/ mem_size, + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + ggml_context_ptr compute_ctx(ggml_init(params)); + if (!compute_ctx) { + throw std::runtime_error("failed to create LingBot-MAP full aggregator probe ggml context"); + } + + ggml_context * ctx = compute_ctx.get(); + ggml_tensor * tokens = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, cfg.hidden_size, input.tokens_per_frame, input.n_frames); + ggml_set_name(tokens, "lingbot_map.aggregator.full_probe.input"); + ggml_set_input(tokens); + + ggml_tensor * frame_tokens = nullptr; + ggml_tensor * global_tokens = nullptr; + int32_t selected_outputs = 0; + for (int32_t i = 0; i < cfg.frame_block_count; ++i) { + frame_tokens = lingbot_apply_aggregator_block(ctx, pimpl_->ggml.get(), cfg, tokens, + "aggregator.frame_blocks." 
+ std::to_string(i), + "lingbot_map.aggregator.full_probe.frame." + std::to_string(i), + nullptr); + global_tokens = ggml_reshape_3d(ctx, frame_tokens, cfg.hidden_size, + (int64_t) input.tokens_per_frame * input.n_frames, 1); + global_tokens = lingbot_apply_aggregator_block(ctx, pimpl_->ggml.get(), cfg, global_tokens, + "aggregator.global_blocks." + std::to_string(i), + "lingbot_map.aggregator.full_probe.global." + std::to_string(i), + nullptr); + tokens = ggml_reshape_3d(ctx, global_tokens, cfg.hidden_size, input.tokens_per_frame, input.n_frames); + if (std::find(cfg.aggregator_selected_layers.begin(), cfg.aggregator_selected_layers.end(), i) != + cfg.aggregator_selected_layers.end()) { + selected_outputs += 1; + } + } + + ggml_cgraph * graph = ggml_new_graph_custom(ctx, 8192, false); + ggml_build_forward_expand(graph, frame_tokens); + ggml_build_forward_expand(graph, global_tokens); + + lingbot_map_full_aggregator_probe_result result; + result.graph_nodes = ggml_graph_n_nodes(graph); + result.selected_output_count = selected_outputs; + result.frame_block_count = cfg.frame_block_count; + result.global_block_count = cfg.global_block_count; + for (int i = 0; i < 4; ++i) { + result.final_frame_ne[i] = (int32_t) frame_tokens->ne[i]; + result.final_global_ne[i] = (int32_t) global_tokens->ne[i]; + } + return result; +} + +lingbot_map_aggregator_graph_result lingbot_map_context::build_aggregator_graph( + const lingbot_map_aggregator_input & input) const { + const auto & cfg = config(); + if (input.n_frames <= 0 || input.tokens_per_frame <= 0 || input.hidden_size != cfg.hidden_size) { + throw std::invalid_argument("Invalid LingBot-MAP aggregator input for graph build"); + } + if (cfg.frame_block_count <= 0 || cfg.global_block_count <= 0 || cfg.frame_block_count != cfg.global_block_count) { + throw std::runtime_error("LingBot-MAP aggregator graph requires matching frame/global block counts"); + } + + const size_t mem_size = 256ull * 1024ull * 1024ull; + ggml_init_params 
params = { + /*.mem_size =*/ mem_size, + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + ggml_context_ptr compute_ctx(ggml_init(params)); + if (!compute_ctx) { + throw std::runtime_error("failed to create LingBot-MAP aggregator graph ggml context"); + } + + ggml_context * ctx = compute_ctx.get(); + ggml_tensor * tokens = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, cfg.hidden_size, input.tokens_per_frame, input.n_frames); + ggml_set_name(tokens, "lingbot_map.aggregator.graph.input"); + ggml_set_input(tokens); + + std::vector selected_outputs; + selected_outputs.reserve(cfg.aggregator_selected_layers.size()); + ggml_tensor * frame_tokens = nullptr; + ggml_tensor * global_tokens = nullptr; + + for (int32_t i = 0; i < cfg.frame_block_count; ++i) { + frame_tokens = lingbot_apply_aggregator_block(ctx, pimpl_->ggml.get(), cfg, tokens, + "aggregator.frame_blocks." + std::to_string(i), + "lingbot_map.aggregator.graph.frame." + std::to_string(i), + nullptr); + global_tokens = ggml_reshape_3d(ctx, frame_tokens, cfg.hidden_size, + (int64_t) input.tokens_per_frame * input.n_frames, 1); + global_tokens = lingbot_apply_aggregator_block(ctx, pimpl_->ggml.get(), cfg, global_tokens, + "aggregator.global_blocks." + std::to_string(i), + "lingbot_map.aggregator.graph.global." + std::to_string(i), + nullptr); + ggml_tensor * global_as_frame = ggml_reshape_3d(ctx, global_tokens, cfg.hidden_size, input.tokens_per_frame, input.n_frames); + + if (std::find(cfg.aggregator_selected_layers.begin(), cfg.aggregator_selected_layers.end(), i) != + cfg.aggregator_selected_layers.end()) { + ggml_tensor * selected = ggml_concat(ctx, frame_tokens, global_as_frame, 0); + ggml_set_name(selected, ("lingbot_map.aggregator.graph.selected." 
+ std::to_string(i)).c_str()); + selected_outputs.push_back(selected); + } + + tokens = global_as_frame; + } + + ggml_cgraph * graph = ggml_new_graph_custom(ctx, 8192, false); + ggml_build_forward_expand(graph, frame_tokens); + ggml_build_forward_expand(graph, global_tokens); + for (ggml_tensor * selected : selected_outputs) { + ggml_build_forward_expand(graph, selected); + } + + lingbot_map_aggregator_graph_result result; + result.graph_nodes = ggml_graph_n_nodes(graph); + result.selected_output_count = (int32_t) selected_outputs.size(); + result.frame_block_count = cfg.frame_block_count; + result.global_block_count = cfg.global_block_count; + result.tokens_per_frame = input.tokens_per_frame; + result.patch_start_idx = input.patch_start_idx; + result.selected_layers = cfg.aggregator_selected_layers; + for (int i = 0; i < 4; ++i) { + result.final_frame_ne[i] = (int32_t) frame_tokens->ne[i]; + result.final_global_ne[i] = (int32_t) global_tokens->ne[i]; + } + result.selected_output_shapes.reserve(selected_outputs.size()); + for (const ggml_tensor * selected : selected_outputs) { + result.selected_output_shapes.push_back({ + (int32_t) selected->ne[0], + (int32_t) selected->ne[1], + (int32_t) selected->ne[2], + (int32_t) selected->ne[3], + }); + } + return result; +} + +lingbot_map_camera_head_graph_result lingbot_map_context::build_camera_head_graph( + const lingbot_map_aggregator_input & input) const { + const auto & cfg = config(); + if (input.n_frames <= 0 || input.tokens_per_frame <= 0 || input.hidden_size != cfg.hidden_size || + cfg.camera_hidden_size <= 0) { + throw std::invalid_argument("Invalid LingBot-MAP camera_head graph input"); + } + if (cfg.camera_trunk_block_count <= 0 || cfg.camera_num_iterations <= 0) { + throw std::runtime_error("LingBot-MAP camera_head graph requires trunk blocks and iterations"); + } + + const size_t mem_size = 256ull * 1024ull * 1024ull; + ggml_init_params params = { + /*.mem_size =*/ mem_size, + /*.mem_buffer =*/ nullptr, + 
/*.no_alloc =*/ true, + }; + ggml_context_ptr compute_ctx(ggml_init(params)); + if (!compute_ctx) { + throw std::runtime_error("failed to create LingBot-MAP camera_head graph ggml context"); + } + + ggml_context * weights_ctx = pimpl_->ggml.get(); + ggml_context * ctx = compute_ctx.get(); + const int64_t pose_dim = 9; + ggml_tensor * aggregated_tokens = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, cfg.camera_hidden_size, + input.tokens_per_frame, input.n_frames); + ggml_set_name(aggregated_tokens, "lingbot_map.camera_head.graph.aggregated_tokens"); + ggml_set_input(aggregated_tokens); + + ggml_tensor * pose_tokens = ggml_view_3d(ctx, aggregated_tokens, + cfg.camera_hidden_size, input.n_frames, 1, + aggregated_tokens->nb[2], + (size_t) aggregated_tokens->nb[2] * (size_t) input.n_frames, + 0); + pose_tokens = lingbot_layer_norm(ctx, pose_tokens, + lingbot_require_tensor(weights_ctx, "camera_head.token_norm.weight"), + lingbot_require_tensor(weights_ctx, "camera_head.token_norm.bias"), + 1e-6f); + ggml_set_name(pose_tokens, "lingbot_map.camera_head.graph.pose_tokens"); + + ggml_tensor * empty_pose = lingbot_require_tensor(weights_ctx, "camera_head.empty_pose_tokens"); + ggml_tensor * pred_pose = nullptr; + std::vector iteration_outputs; + iteration_outputs.reserve(cfg.camera_num_iterations); + + for (int32_t iter = 0; iter < cfg.camera_num_iterations; ++iter) { + ggml_tensor * module_input = nullptr; + if (pred_pose == nullptr) { + ggml_tensor * empty_pose_target = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, pose_dim, input.n_frames, 1); + module_input = ggml_repeat(ctx, empty_pose, empty_pose_target); + } else { + module_input = pred_pose; + } + + module_input = lingbot_linear(ctx, module_input, + lingbot_require_tensor(weights_ctx, "camera_head.embed_pose.weight"), + lingbot_require_tensor(weights_ctx, "camera_head.embed_pose.bias")); + + ggml_tensor * modulation = lingbot_linear(ctx, + ggml_silu(ctx, module_input), + lingbot_require_tensor(weights_ctx, 
"camera_head.poseLN_modulation.1.weight"), + lingbot_require_tensor(weights_ctx, "camera_head.poseLN_modulation.1.bias")); + ggml_tensor * shift = ggml_view_3d(ctx, modulation, cfg.camera_hidden_size, input.n_frames, 1, + modulation->nb[1], modulation->nb[2], 0); + ggml_tensor * scale = ggml_view_3d(ctx, modulation, cfg.camera_hidden_size, input.n_frames, 1, + modulation->nb[1], modulation->nb[2], + (size_t) cfg.camera_hidden_size * ggml_type_size(modulation->type)); + ggml_tensor * gate = ggml_view_3d(ctx, modulation, cfg.camera_hidden_size, input.n_frames, 1, + modulation->nb[1], modulation->nb[2], + (size_t) cfg.camera_hidden_size * 2 * ggml_type_size(modulation->type)); + + ggml_tensor * adaln = ggml_norm(ctx, pose_tokens, 1e-6f); + ggml_tensor * scale_cont = ggml_cont(ctx, scale); + ggml_tensor * modulated = ggml_mul(ctx, adaln, ggml_scale_bias(ctx, scale_cont, 1.0f, 1.0f)); + modulated = ggml_add(ctx, modulated, shift); + modulated = ggml_mul(ctx, modulated, gate); + modulated = ggml_add(ctx, modulated, pose_tokens); + + for (int32_t block = 0; block < cfg.camera_trunk_block_count; ++block) { + modulated = lingbot_apply_camera_trunk_block(ctx, weights_ctx, cfg, modulated, + "camera_head.trunk." + std::to_string(block), + "lingbot_map.camera_head.graph.iter." + std::to_string(iter) + ".trunk." + std::to_string(block)); + } + + ggml_tensor * trunk_norm = lingbot_layer_norm(ctx, modulated, + lingbot_require_tensor(weights_ctx, "camera_head.trunk_norm.weight"), + lingbot_require_tensor(weights_ctx, "camera_head.trunk_norm.bias"), + 1e-6f); + ggml_tensor * delta = lingbot_pose_branch(ctx, weights_ctx, trunk_norm); + pred_pose = pred_pose == nullptr ? delta : ggml_add(ctx, pred_pose, delta); + ggml_set_name(pred_pose, ("lingbot_map.camera_head.graph.pose_iter." + std::to_string(iter)).c_str()); + ggml_tensor * activated_pose = lingbot_activate_pose(ctx, pred_pose); + ggml_set_name(activated_pose, ("lingbot_map.camera_head.graph.activated_pose_iter." 
+ std::to_string(iter)).c_str()); + iteration_outputs.push_back(activated_pose); + } + + ggml_cgraph * graph = ggml_new_graph_custom(ctx, 8192, false); + for (ggml_tensor * out : iteration_outputs) { + ggml_build_forward_expand(graph, out); + } + + lingbot_map_camera_head_graph_result result; + result.graph_nodes = ggml_graph_n_nodes(graph); + result.trunk_block_count = cfg.camera_trunk_block_count; + result.iteration_count = cfg.camera_num_iterations; + result.pose_dim = (int32_t) pose_dim; + for (int i = 0; i < 4; ++i) { + result.input_ne[i] = (int32_t) aggregated_tokens->ne[i]; + result.final_pose_ne[i] = (int32_t) iteration_outputs.back()->ne[i]; + } + result.iteration_pose_shapes.reserve(iteration_outputs.size()); + for (const ggml_tensor * out : iteration_outputs) { + result.iteration_pose_shapes.push_back({ + (int32_t) out->ne[0], + (int32_t) out->ne[1], + (int32_t) out->ne[2], + (int32_t) out->ne[3], + }); + } + return result; +} + + +lingbot_map_runtime_result lingbot_map_context::run_aggregator_camera_head( + const lingbot_map_aggregator_input & input, + bool prefer_smt) const { + const auto & cfg = config(); + if (input.n_frames <= 0 || input.tokens_per_frame <= 0 || input.hidden_size != cfg.hidden_size) { + throw std::invalid_argument("Invalid LingBot-MAP runtime input"); + } + if (cfg.frame_block_count <= 0 || cfg.global_block_count <= 0 || cfg.frame_block_count != cfg.global_block_count) { + throw std::runtime_error("LingBot-MAP runtime requires matching frame/global block counts"); + } + if (cfg.camera_trunk_block_count <= 0 || cfg.camera_num_iterations <= 0) { + throw std::runtime_error("LingBot-MAP runtime requires camera_head trunk blocks and iterations"); + } + + if (!pimpl_->runtime_initialized || pimpl_->runtime_prefer_smt != prefer_smt) { + ggml_backend_ptr backend(ggml_backend_cpu_init()); + if (!backend) { + throw std::runtime_error("failed to initialize LingBot-MAP GGML CPU/SMT backend"); + } + ggml_backend_cpu_set_n_threads(backend.get(), 
cfg.ggml_threads); + std::cerr << "[LingBot-MAP] GGML CPU backend threads=" << cfg.ggml_threads << "\n"; + + ggml_backend_buffer_type_t buft = lingbot_select_cpu_buffer_type(backend.get(), prefer_smt); + lingbot_map_runtime_weights weights; + try { + weights = lingbot_load_runtime_weights(cfg, buft); + } catch (const std::exception & e) { + ggml_backend_buffer_type_t default_buft = ggml_backend_get_default_buffer_type(backend.get()); + if (!prefer_smt || buft == default_buft) { + throw; + } + std::cerr << "[LingBot-MAP] failed to allocate/load runtime weights on " << ggml_backend_buft_name(buft) + << ", falling back to " << ggml_backend_buft_name(default_buft) << ": " << e.what() << "\n"; + buft = default_buft; + weights = lingbot_load_runtime_weights(cfg, buft); + } + + pimpl_->runtime_backend = std::move(backend); + pimpl_->runtime_buft = buft; + pimpl_->runtime_weights = std::move(weights); + pimpl_->runtime_prefer_smt = prefer_smt; + pimpl_->runtime_initialized = true; + std::cerr << "[LingBot-MAP] initialized GGML runtime backend=" << ggml_backend_name(pimpl_->runtime_backend.get()) + << ", buffer_type=" << ggml_backend_buft_name(pimpl_->runtime_buft) << "\n"; + } + + ggml_backend_t backend = pimpl_->runtime_backend.get(); + ggml_backend_buffer_type_t buft = pimpl_->runtime_buft; + ggml_context * weights_ctx = pimpl_->runtime_weights.ggml.get(); + + const size_t mem_size = 512ull * 1024ull * 1024ull; + ggml_init_params params = { + /*.mem_size =*/ mem_size, + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + ggml_context_ptr compute_ctx(ggml_init(params)); + if (!compute_ctx) { + throw std::runtime_error("failed to create LingBot-MAP runtime ggml context"); + } + + auto stage_start = std::chrono::steady_clock::now(); + lingbot_map_runtime_graph runtime_graph = lingbot_build_aggregator_camera_runtime_graph( + compute_ctx.get(), weights_ctx, cfg, input); + std::cerr << "[LingBot-MAP][time] ggml_build_graph_ms=" << lingbot_elapsed_ms(stage_start) + << ", 
nodes=" << (runtime_graph.graph ? ggml_graph_n_nodes(runtime_graph.graph) : 0) << "\n"; + if (runtime_graph.selected_outputs.empty() || runtime_graph.final_pose == nullptr || runtime_graph.graph == nullptr) { + throw std::runtime_error("LingBot-MAP runtime graph did not produce required outputs"); + } + + ggml_backend_buffer_type_t default_buft = ggml_backend_get_default_buffer_type(backend); + const bool primary_graph_supported = lingbot_graph_supported_by_backend( + backend, buft, runtime_graph.graph, /* log_summary */ buft != default_buft); + + ggml_backend_ptr fallback_backend; + std::vector backend_ptrs; + std::vector backend_bufts; + backend_ptrs.push_back(backend); + backend_bufts.push_back(buft); + + bool using_hybrid_cpu_fallback = false; + if (!primary_graph_supported && buft != default_buft) { + if (!ggml_backend_supports_buft(backend, default_buft)) { + std::cerr << "[LingBot-MAP] primary backend cannot use " << ggml_backend_buft_name(default_buft) + << ", falling back to plain CPU scheduler\n"; + buft = default_buft; + pimpl_->runtime_buft = buft; + pimpl_->runtime_weights = lingbot_load_runtime_weights(cfg, buft); + weights_ctx = pimpl_->runtime_weights.ggml.get(); + stage_start = std::chrono::steady_clock::now(); + runtime_graph = lingbot_build_aggregator_camera_runtime_graph(compute_ctx.get(), weights_ctx, cfg, input); + std::cerr << "[LingBot-MAP][time] ggml_rebuild_graph_after_cpu_fallback_ms=" << lingbot_elapsed_ms(stage_start) + << ", nodes=" << (runtime_graph.graph ? 
ggml_graph_n_nodes(runtime_graph.graph) : 0) << "\n"; + backend_ptrs.clear(); + backend_bufts.clear(); + backend_ptrs.push_back(backend); + backend_bufts.push_back(default_buft); + } else { + fallback_backend.reset(ggml_backend_cpu_init()); + if (!fallback_backend) { + throw std::runtime_error("failed to initialize LingBot-MAP GGML CPU fallback backend"); + } + ggml_backend_cpu_set_n_threads(fallback_backend.get(), cfg.ggml_threads); + + pimpl_->runtime_buft = default_buft; + pimpl_->runtime_weights = lingbot_load_runtime_weights(cfg, default_buft); + weights_ctx = pimpl_->runtime_weights.ggml.get(); + stage_start = std::chrono::steady_clock::now(); + runtime_graph = lingbot_build_aggregator_camera_runtime_graph(compute_ctx.get(), weights_ctx, cfg, input); + std::cerr << "[LingBot-MAP][time] ggml_rebuild_graph_for_hybrid_ms=" << lingbot_elapsed_ms(stage_start) + << ", nodes=" << (runtime_graph.graph ? ggml_graph_n_nodes(runtime_graph.graph) : 0) << "\n"; + if (runtime_graph.selected_outputs.empty() || runtime_graph.final_pose == nullptr || runtime_graph.graph == nullptr) { + throw std::runtime_error("LingBot-MAP runtime graph did not produce required outputs after hybrid rebuild"); + } + + backend_ptrs.push_back(fallback_backend.get()); + backend_bufts.push_back(default_buft); + using_hybrid_cpu_fallback = true; + std::cerr << "[LingBot-MAP] using hybrid GGML scheduler with CPU-host weights: primary=" << ggml_backend_buft_name(buft) + << ", fallback=" << ggml_backend_buft_name(default_buft) + << ", threads=" << cfg.ggml_threads << "\n"; + } + } else if (!primary_graph_supported) { + throw std::runtime_error("LingBot-MAP GGML runtime graph contains ops unsupported by the selected backend"); + } + + ggml_backend_sched_ptr sched( + ggml_backend_sched_new(backend_ptrs.data(), backend_bufts.data(), (int) backend_ptrs.size(), + 32768, false, true)); + if (!sched) { + throw std::runtime_error("failed to create LingBot-MAP GGML scheduler"); + } + + 
ggml_backend_sched_reset(sched.get()); + ggml_backend_t output_backend = fallback_backend ? fallback_backend.get() : backend; + for (ggml_tensor * selected : runtime_graph.selected_outputs) { + ggml_backend_sched_set_tensor_backend(sched.get(), selected, output_backend); + } + ggml_backend_sched_set_tensor_backend(sched.get(), runtime_graph.final_pose, output_backend); + + stage_start = std::chrono::steady_clock::now(); + if (!ggml_backend_sched_alloc_graph(sched.get(), runtime_graph.graph)) { + throw std::runtime_error("failed to allocate LingBot-MAP GGML runtime graph"); + } + std::cerr << "[LingBot-MAP][time] ggml_alloc_graph_ms=" << lingbot_elapsed_ms(stage_start) << "\n"; + + const size_t input_nbytes = ggml_nbytes(runtime_graph.input_tokens); + if (input_nbytes != input.tokens.size() * sizeof(float)) { + throw std::runtime_error("LingBot-MAP runtime input byte size mismatch"); + } + ggml_backend_tensor_set(runtime_graph.input_tokens, input.tokens.data(), 0, input_nbytes); + + stage_start = std::chrono::steady_clock::now(); + std::cerr << "[LingBot-MAP][time] ggml_compute_start backend=" << ggml_backend_name(backend) + << ", buffer_type=" << ggml_backend_buft_name(buft) + << ", nodes=" << ggml_graph_n_nodes(runtime_graph.graph) << "\n"; + const enum ggml_status status = ggml_backend_sched_graph_compute(sched.get(), runtime_graph.graph); + std::cerr << "[LingBot-MAP][time] ggml_compute_ms=" << lingbot_elapsed_ms(stage_start) + << ", status=" << ggml_status_to_string(status) << "\n"; + if (status != GGML_STATUS_SUCCESS) { + throw std::runtime_error(std::string("LingBot-MAP GGML runtime compute failed: ") + ggml_status_to_string(status)); + } + + lingbot_map_runtime_result result; + result.backend_name = ggml_backend_name(backend); + result.buffer_type_name = using_hybrid_cpu_fallback ? 
+ std::string(ggml_backend_buft_name(buft)) + "+" + ggml_backend_buft_name(default_buft) : + ggml_backend_buft_name(buft); + result.graph_nodes = ggml_graph_n_nodes(runtime_graph.graph); + result.selected_output_count = (int32_t) runtime_graph.selected_outputs.size(); + result.tokens_per_frame = input.tokens_per_frame; + result.patch_start_idx = input.patch_start_idx; + result.frame_block_count = cfg.frame_block_count; + result.global_block_count = cfg.global_block_count; + result.camera_trunk_block_count = cfg.camera_trunk_block_count; + result.camera_iteration_count = cfg.camera_num_iterations; + result.camera_pose_dim = 9; + result.selected_layers = cfg.aggregator_selected_layers; + + result.selected_output_shapes.reserve(runtime_graph.selected_outputs.size()); + result.selected_outputs.reserve(runtime_graph.selected_outputs.size()); + for (const ggml_tensor * selected : runtime_graph.selected_outputs) { + if (selected->type != GGML_TYPE_F32) { + throw std::runtime_error("LingBot-MAP runtime selected output is not F32"); + } + result.selected_output_shapes.push_back({ + (int32_t) selected->ne[0], + (int32_t) selected->ne[1], + (int32_t) selected->ne[2], + (int32_t) selected->ne[3], + }); + std::vector output(ggml_nbytes(selected) / sizeof(float)); + ggml_backend_t selected_backend = ggml_backend_sched_get_tensor_backend(sched.get(), const_cast(selected)); + if (selected_backend == nullptr) { + throw std::runtime_error("LingBot-MAP selected output has no scheduled backend"); + } + std::cerr << "[LingBot-MAP][time] reading selected_output index=" << result.selected_outputs.size() + << ", bytes=" << ggml_nbytes(selected) + << ", backend=" << ggml_backend_name(selected_backend) + << "\n"; + ggml_backend_tensor_get_async(selected_backend, selected, output.data(), 0, ggml_nbytes(selected)); + ggml_backend_synchronize(selected_backend); + result.selected_outputs.push_back(std::move(output)); + } + + result.camera_head_input_shape = { + (int32_t) 
runtime_graph.camera_head_input->ne[0], + (int32_t) runtime_graph.camera_head_input->ne[1], + (int32_t) runtime_graph.camera_head_input->ne[2], + (int32_t) runtime_graph.camera_head_input->ne[3], + }; + result.camera_head_final_pose_shape = { + (int32_t) runtime_graph.final_pose->ne[0], + (int32_t) runtime_graph.final_pose->ne[1], + (int32_t) runtime_graph.final_pose->ne[2], + (int32_t) runtime_graph.final_pose->ne[3], + }; + result.camera_head_iteration_pose_shapes.reserve(runtime_graph.iteration_poses.size()); + for (const ggml_tensor * pose : runtime_graph.iteration_poses) { + result.camera_head_iteration_pose_shapes.push_back({ + (int32_t) pose->ne[0], + (int32_t) pose->ne[1], + (int32_t) pose->ne[2], + (int32_t) pose->ne[3], + }); + } + + if (runtime_graph.final_pose->type != GGML_TYPE_F32) { + throw std::runtime_error("LingBot-MAP runtime final pose is not F32"); + } + result.pose_encoding.resize(ggml_nbytes(runtime_graph.final_pose) / sizeof(float)); + ggml_backend_t pose_backend = ggml_backend_sched_get_tensor_backend(sched.get(), runtime_graph.final_pose); + if (pose_backend == nullptr) { + throw std::runtime_error("LingBot-MAP final pose has no scheduled backend"); + } + std::cerr << "[LingBot-MAP][time] reading final_pose bytes=" << ggml_nbytes(runtime_graph.final_pose) + << ", backend=" << ggml_backend_name(pose_backend) + << "\n"; + ggml_backend_tensor_get_async(pose_backend, runtime_graph.final_pose, result.pose_encoding.data(), 0, ggml_nbytes(runtime_graph.final_pose)); + ggml_backend_synchronize(pose_backend); + return result; +} + diff --git a/tools/mtmd/lingbot-map-wrapper.h b/tools/mtmd/lingbot-map-wrapper.h new file mode 100644 index 000000000000..ad3eaeddbc46 --- /dev/null +++ b/tools/mtmd/lingbot-map-wrapper.h @@ -0,0 +1,172 @@ +// LingBot-MAP multimodal wrapper. +// Loads mtmd_model/config.json and validates the ONNX/GGUF artifact set. 
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+struct ggml_context;
+struct ggml_tensor;
+
+struct lingbot_map_aggregator_input { // token batch uploaded as an F32 tensor of shape [hidden_size, tokens_per_frame, n_frames]
+    int32_t n_frames = 0; // graph builders and the runtime reject values <= 0
+    int32_t hidden_size = 0; // must equal lingbot_map_config::hidden_size or the builders throw std::invalid_argument
+    int32_t vit_tokens_per_frame = 0; // presumably the per-frame ViT token count before aggregator tokens are added - TODO confirm in build_aggregator_input
+    int32_t vit_prefix_tokens = 0; // presumably the number of leading non-patch ViT tokens - TODO confirm in build_aggregator_input
+    int32_t patch_tokens = 0; // presumably the number of patch tokens per frame - TODO confirm in build_aggregator_input
+    int32_t patch_start_idx = 0; // copied unchanged into the aggregator graph result and the runtime result
+    int32_t tokens_per_frame = 0; // middle dimension of the input tensor; must be > 0
+    std::vector tokens; // flat host buffer; the runtime requires size() * sizeof(float) to equal the input tensor's byte size
+};
+
+
+struct lingbot_map_graph_probe_result { // shape report for one aggregator block graph that is built but never computed
+    int32_t graph_nodes = 0; // ggml_graph_n_nodes() of the probe graph
+    int32_t input_tokens_per_frame = 0; // sequence length used: tokens_per_frame, or tokens_per_frame * n_frames when frames are flattened
+    int32_t output_ne[4] = { 0, 0, 0, 0 }; // ne[0..3] of the block output tensor
+    int32_t qkv_ne[4] = { 0, 0, 0, 0 }; // ne[0..3] of the qkv tensor handed back by the block helper
+};
+
+struct lingbot_map_aggregator_probe_result { // frame and global probes for the same block index
+    lingbot_map_graph_probe_result frame; // probe of aggregator.frame_blocks.<index> with one batch entry per frame
+    lingbot_map_graph_probe_result global; // probe of aggregator.global_blocks.<index> with all frames flattened into one sequence
+};
+
+struct lingbot_map_full_aggregator_probe_result { // summary of the full alternating frame/global block stack (build only)
+    int32_t graph_nodes = 0; // ggml_graph_n_nodes() of the built graph
+    int32_t selected_output_count = 0; // number of block indices found in aggregator_selected_layers
+    int32_t frame_block_count = 0; // copied from the config
+    int32_t global_block_count = 0; // copied from the config
+    int32_t final_frame_ne[4] = { 0, 0, 0, 0 }; // ne[0..3] of the last frame block output
+    int32_t final_global_ne[4] = { 0, 0, 0, 0 }; // ne[0..3] of the last global block output (flattened layout)
+};
+
+struct lingbot_map_aggregator_graph_result { // like the full probe, but also reports the selected (frame + global) outputs
+    int32_t graph_nodes = 0; // ggml_graph_n_nodes() of the built graph
+    int32_t selected_output_count = 0; // number of selected outputs added to the graph
+    int32_t frame_block_count = 0; // copied from the config
+    int32_t global_block_count = 0; // copied from the config
+    int32_t tokens_per_frame = 0; // copied from the input
+    int32_t patch_start_idx = 0; // copied from the input
+    int32_t final_frame_ne[4] = { 0, 0, 0, 0 }; // ne[0..3] of the last frame block output
+    int32_t final_global_ne[4] = { 0, 0, 0, 0 }; // ne[0..3] of the last global block output (flattened layout)
+    std::vector selected_layers; // copy of lingbot_map_config::aggregator_selected_layers
+    std::vector> selected_output_shapes; // ne[0..3] of each selected output (frame and global tokens concatenated along dim 0)
+};
+
+struct lingbot_map_camera_head_graph_result { // summary of the iterative camera head graph (build only)
+    int32_t graph_nodes = 0; // ggml_graph_n_nodes() of the built graph
+    int32_t trunk_block_count = 0; // copied from camera_trunk_block_count
+    int32_t iteration_count = 0; // copied from camera_num_iterations
+    int32_t pose_dim = 0; // set to 9 by build_camera_head_graph
+    int32_t input_ne[4] = { 0, 0, 0, 0 }; // ne[0..3] of the aggregated token input tensor
+    int32_t final_pose_ne[4] = { 0, 0, 0, 0 }; // ne[0..3] of the last iteration's activated pose
+    std::vector> iteration_pose_shapes; // ne[0..3] of the activated pose of every iteration, in order
+};
+
+struct lingbot_map_runtime_result { // host-side outputs of run_aggregator_camera_head
+    std::string backend_name; // ggml_backend_name() of the primary backend
+    std::string buffer_type_name; // buffer type name, or "<primary>+<fallback>" when the hybrid CPU fallback was used
+    int32_t graph_nodes = 0; // ggml_graph_n_nodes() of the computed graph
+    int32_t selected_output_count = 0; // number of selected aggregator outputs
+    int32_t tokens_per_frame = 0; // copied from the input
+    int32_t patch_start_idx = 0; // copied from the input
+    int32_t frame_block_count = 0; // copied from the config
+    int32_t global_block_count = 0; // copied from the config
+    int32_t camera_trunk_block_count = 0; // copied from the config
+    int32_t camera_iteration_count = 0; // copied from camera_num_iterations
+    int32_t camera_pose_dim = 0; // set to 9 by the runtime
+    std::vector selected_layers; // copy of lingbot_map_config::aggregator_selected_layers
+    std::vector> selected_output_shapes; // ne[0..3] of each selected output tensor
+    std::vector> selected_outputs; // F32 contents of each selected output, read back from its scheduled backend
+    std::vector camera_head_input_shape; // ne[0..3] of the tensor fed to the camera head
+    std::vector camera_head_final_pose_shape; // ne[0..3] of the final pose tensor
+    std::vector> camera_head_iteration_pose_shapes; // ne[0..3] of every iteration's activated pose
+    std::vector pose_encoding; // F32 contents of the final pose tensor
+};
+
+struct lingbot_map_config { // model configuration consumed by the graph builders and the runtime
+    std::vector architectures;
+
+    std::string vision_model_path;
+    std::string aggregator_camera_model_path;
+    std::string depth_model_path;
+    std::unordered_map ep_config;
+
+    int32_t image_size = 0;
+    int32_t patch_size = 0;
+    float image_mean[3] = { 0.485f, 0.456f, 0.406f };
+    float image_std[3] = { 0.229f, 0.224f, 0.225f };
+    int32_t hidden_size = 0; // width (ne[0]) of the aggregator input tokens
+    int32_t camera_hidden_size = 0; // must equal ne[0] of the camera head input, else the runtime graph builder throws
+    int32_t num_special_tokens = 0;
+    int32_t num_register_tokens = 0;
+    int32_t frame_block_count = 0; // must be > 0 and equal to global_block_count for the full graphs and the runtime
+    int32_t global_block_count = 0; // must be > 0 and equal to frame_block_count for the full graphs and the runtime
+    int32_t camera_trunk_block_count = 0; // must be > 0 for the camera head graph and the runtime
+    int32_t camera_num_iterations = 0; // number of camera head iterations; must be > 0
+    int32_t ggml_threads = 8; // thread count passed to ggml_backend_cpu_set_n_threads
+    std::vector aggregator_selected_layers; // block indices whose concatenated frame + global tokens are exported
+
+    bool output_pose = true;
+    bool output_depth = true;
+    bool output_point_cloud = true;
+};
+
+struct lingbot_map_context { // non-copyable handle; all state lives behind pimpl_
+    lingbot_map_context(const lingbot_map_context &) = delete;
+    lingbot_map_context & operator=(const lingbot_map_context &) = delete;
+    ~lingbot_map_context();
+
+    static std::unique_ptr create(const std::string & config_dir); // only factory (default ctor is private); presumably loads the config found in config_dir - TODO confirm
+
+    const lingbot_map_config & config() const;
+    const std::string & architecture() const;
+    int64_t tensor_count() const;
+
+    ggml_context * ggml_ctx() const;
+    const ggml_tensor * tensor(const std::string & name) const;
+
+    lingbot_map_aggregator_input build_aggregator_input( // NOTE(review): implementation not visible here; presumably converts ViT tokens into the aggregator layout
+        const float * vit_tokens,
+        int32_t n_frames,
+        int32_t vit_tokens_per_frame,
+        int32_t hidden_size,
+        int32_t image_h,
+        int32_t image_w,
+        int32_t num_frame_for_scale = 1) const;
+
+    lingbot_map_graph_probe_result build_aggregator_frame_block_probe( // builds one frame block graph without computing it
+        const lingbot_map_aggregator_input & input,
+        int32_t block_index = 0) const; // throws std::invalid_argument unless 0 <= block_index < frame_block_count
+
+    lingbot_map_graph_probe_result build_aggregator_global_block_probe( // builds one global block graph over the flattened frames without computing it
+        const lingbot_map_aggregator_input & input,
+        int32_t block_index = 0) const; // throws std::invalid_argument unless 0 <= block_index < global_block_count
+
+    lingbot_map_aggregator_probe_result build_aggregator_block_probes( // runs the frame probe and the global probe for the same block_index
+        const lingbot_map_aggregator_input & input,
+        int32_t block_index = 0) const;
+
+    lingbot_map_full_aggregator_probe_result build_full_aggregator_probe( // builds the whole frame/global stack (no compute) and reports final shapes
+        const lingbot_map_aggregator_input & input) const;
+
+    lingbot_map_aggregator_graph_result build_aggregator_graph( // as the full probe, plus the shapes of the selected layer outputs
+        const lingbot_map_aggregator_input & input) const;
+
+    lingbot_map_camera_head_graph_result build_camera_head_graph( // builds the iterative camera head graph (no compute) and reports pose shapes
+        const lingbot_map_aggregator_input & input) const;
+
+    lingbot_map_runtime_result run_aggregator_camera_head( // builds and computes aggregator + camera head on the GGML CPU backend, copying outputs to host
+        const lingbot_map_aggregator_input & input,
+        bool prefer_smt = true) const; // prefer_smt steers buffer type selection; although const, runtime weights are loaded into pimpl_ on first use or when prefer_smt changes
+
+  private:
+    lingbot_map_context() = default;
+    struct impl;
+    std::unique_ptr pimpl_;
+};
diff --git a/tools/mtmd/smt-vision-preprocess.cpp b/tools/mtmd/smt-vision-preprocess.cpp
index 88780a1ab2b2..4e4741b9d118 100644
--- a/tools/mtmd/smt-vision-preprocess.cpp
+++ b/tools/mtmd/smt-vision-preprocess.cpp
@@ -161,6 +161,197 @@ static std::vector resize_rgb_u8_antialias(const uint8_t * src,
     return out;
 }
 
+
+static std::vector rgba_u8_to_rgb_u8_white(const uint8_t * src, int32_t w, int32_t h) { // converts interleaved RGBA8 to RGB8 by alpha-blending every pixel onto white
+    if (src == nullptr || w <= 0 || h <= 0) {
+        throw std::runtime_error("Invalid RGBA image dimensions");
+    }
+
+    std::vector out((size_t) w * (size_t) h * 3u, 255); // w * h * 3 output bytes, pre-filled with 255
+    for (int32_t y = 0; y < h; ++y) {
+        for (int32_t x = 0; x < w; ++x) {
+            const size_t src_idx = ((size_t) y * (size_t) w + (size_t) x) * 4u; // 4 bytes per source pixel
+            const size_t dst_idx = ((size_t) y * (size_t) w + (size_t) x) * 3u; // 3 bytes per destination pixel
+            const uint32_t a = src[src_idx + 3]; // alpha is the fourth byte of the source pixel
+            for (int32_t c = 0; c < 3; ++c) {
+                const uint32_t v = (uint32_t) src[src_idx + (size_t) c];
+                out[dst_idx + (size_t) c] = (uint8_t) ((v * a + 255u * (255u - a) + 127u) / 255u); // v*a/255 + 255*(255-a)/255, with +127 for round-to-nearest
+            }
+        }
+    }
+    return out;
+}
+ +static std::vector resize_rgb_u8_pillow_bicubic(const std::vector & src, + int32_t src_w, + int32_t src_h, + int32_t dst_w, + int32_t dst_h) { + if (src.size() != (size_t) src_w * (size_t) src_h * 3u || src_w <= 0 || src_h <= 0 || dst_w <= 0 || dst_h <= 0) { + throw std::runtime_error("Invalid Pillow bicubic resize dimensions"); + } + if (src_w == dst_w && src_h == dst_h) { + return src; + } + + constexpr int precision_bits = 32 - 8 - 2; + constexpr double filter_support = 2.0; + + auto bicubic_filter = [](double x) -> double { + constexpr double a = -0.5; + if (x < 0.0) { + x = -x; + } + if (x < 1.0) { + return ((a + 2.0) * x - (a + 3.0)) * x * x + 1.0; + } + if (x < 2.0) { + return (((x - 5.0) * x + 8.0) * x - 4.0) * a; + } + return 0.0; + }; + + auto clip8 = [](int32_t value) -> uint8_t { + if (value < 0) { + return 0; + } + if (value > 255) { + return 255; + } + return (uint8_t) value; + }; + + auto precompute_weights = [&](int32_t in_size, int32_t out_size, + std::vector & bounds, + std::vector & weights) -> int32_t { + const double scale = (double) in_size / (double) out_size; + const double filterscale = std::max(1.0, scale); + const double support = filter_support * filterscale; + const int32_t ksize = (int32_t) std::ceil(support) * 2 + 1; + const double ss = 1.0 / filterscale; + const double fxp_scale = std::ldexp(1.0, precision_bits); + + bounds.resize((size_t) out_size * 2u); + weights.assign((size_t) out_size * (size_t) ksize, 0); + + for (int32_t out = 0; out < out_size; ++out) { + const double center = ((double) out + 0.5) * scale; + int32_t xmin = (int32_t) (center - support + 0.5); + int32_t xmax = (int32_t) (center + support + 0.5); + xmin = std::max(0, xmin); + xmax = std::min(in_size, xmax); + const int32_t count = xmax - xmin; + + bounds[(size_t) out * 2u + 0u] = xmin; + bounds[(size_t) out * 2u + 1u] = count; + + double weight_sum = 0.0; + std::vector tmp((size_t) ksize, 0.0); + for (int32_t k = 0; k < count; ++k) { + const double w = 
bicubic_filter(((double) k + (double) xmin - center + 0.5) * ss); + tmp[(size_t) k] = w; + weight_sum += w; + } + if (weight_sum != 0.0) { + for (int32_t k = 0; k < count; ++k) { + tmp[(size_t) k] /= weight_sum; + } + } + for (int32_t k = 0; k < ksize; ++k) { + double v = tmp[(size_t) k] * fxp_scale; + v += tmp[(size_t) k] < 0.0 ? -0.5 : 0.5; + v = std::round(v); + v = std::clamp(v, (double) std::numeric_limits::min(), (double) std::numeric_limits::max()); + weights[(size_t) out * (size_t) ksize + (size_t) k] = (int32_t) v; + } + } + return ksize; + }; + + auto resample_horizontal = [&](const std::vector & input, + std::vector & output, + int32_t in_w, + int32_t in_h, + int32_t out_w, + int32_t ksize, + const std::vector & bounds, + const std::vector & weights) { + output.resize((size_t) out_w * (size_t) in_h * 3u); + for (int32_t y = 0; y < in_h; ++y) { + for (int32_t x = 0; x < out_w; ++x) { + const int32_t xmin = bounds[(size_t) x * 2u + 0u]; + const int32_t count = bounds[(size_t) x * 2u + 1u]; + int32_t acc[3] = { 1 << (precision_bits - 1), 1 << (precision_bits - 1), 1 << (precision_bits - 1) }; + for (int32_t k = 0; k < count; ++k) { + const size_t src_idx = ((size_t) y * (size_t) in_w + (size_t) (xmin + k)) * 3u; + const int32_t w = weights[(size_t) x * (size_t) ksize + (size_t) k]; + acc[0] += (int32_t) input[src_idx + 0u] * w; + acc[1] += (int32_t) input[src_idx + 1u] * w; + acc[2] += (int32_t) input[src_idx + 2u] * w; + } + const size_t dst_idx = ((size_t) y * (size_t) out_w + (size_t) x) * 3u; + output[dst_idx + 0u] = clip8(acc[0] >> precision_bits); + output[dst_idx + 1u] = clip8(acc[1] >> precision_bits); + output[dst_idx + 2u] = clip8(acc[2] >> precision_bits); + } + } + }; + + auto resample_vertical = [&](const std::vector & input, + std::vector & output, + int32_t in_w, + int32_t in_h, + int32_t out_h, + int32_t ksize, + const std::vector & bounds, + const std::vector & weights) { + output.resize((size_t) in_w * (size_t) out_h * 3u); + for (int32_t 
y = 0; y < out_h; ++y) { + const int32_t ymin = bounds[(size_t) y * 2u + 0u]; + const int32_t count = bounds[(size_t) y * 2u + 1u]; + for (int32_t x = 0; x < in_w; ++x) { + int32_t acc[3] = { 1 << (precision_bits - 1), 1 << (precision_bits - 1), 1 << (precision_bits - 1) }; + for (int32_t k = 0; k < count; ++k) { + const size_t src_idx = ((size_t) (ymin + k) * (size_t) in_w + (size_t) x) * 3u; + const int32_t w = weights[(size_t) y * (size_t) ksize + (size_t) k]; + acc[0] += (int32_t) input[src_idx + 0u] * w; + acc[1] += (int32_t) input[src_idx + 1u] * w; + acc[2] += (int32_t) input[src_idx + 2u] * w; + } + const size_t dst_idx = ((size_t) y * (size_t) in_w + (size_t) x) * 3u; + output[dst_idx + 0u] = clip8(acc[0] >> precision_bits); + output[dst_idx + 1u] = clip8(acc[1] >> precision_bits); + output[dst_idx + 2u] = clip8(acc[2] >> precision_bits); + } + } + }; + + std::vector bounds_x; + std::vector bounds_y; + std::vector weights_x; + std::vector weights_y; + const bool need_x = src_w != dst_w; + const bool need_y = src_h != dst_h; + const int32_t ksize_x = need_x ? precompute_weights(src_w, dst_w, bounds_x, weights_x) : 0; + const int32_t ksize_y = need_y ? 
precompute_weights(src_h, dst_h, bounds_y, weights_y) : 0; + + if (need_x && need_y) { + std::vector tmp; + resample_horizontal(src, tmp, src_w, src_h, dst_w, ksize_x, bounds_x, weights_x); + std::vector out; + resample_vertical(tmp, out, dst_w, src_h, dst_h, ksize_y, bounds_y, weights_y); + return out; + } + if (need_x) { + std::vector out; + resample_horizontal(src, out, src_w, src_h, dst_w, ksize_x, bounds_x, weights_x); + return out; + } + std::vector out; + resample_vertical(src, out, src_w, src_h, dst_h, ksize_y, bounds_y, weights_y); + return out; +} + static std::vector rgb_u8_to_chw_f32(const std::vector & src, int32_t w, int32_t h, @@ -279,3 +470,82 @@ smt_vision_preprocess_result smt_vision_preprocess_if_image(const std::vector> & images, + int32_t target_w, + int32_t target_h, + int32_t patch_size, + const float mean[3], + const float std_values[3]) { + if (images.empty()) { + throw std::invalid_argument("LingBot-MAP preprocessing requires at least one image"); + } + if (target_w <= 0 || target_h <= 0 || patch_size <= 0) { + throw std::invalid_argument("Invalid LingBot-MAP preprocessing dimensions"); + } + + smt_lingbot_map_preprocess_result out; + out.target_w = target_w; + out.target_h = target_h; + out.n_images = (int32_t) images.size(); + out.tensor_nchw.resize((size_t) out.n_images * 3u * (size_t) target_h * (size_t) target_w); + out.resized_heights.reserve(images.size()); + + const size_t image_plane = (size_t) target_h * (size_t) target_w; + for (size_t i = 0; i < images.size(); ++i) { + const auto & input = images[i]; + if (input.empty() || input.size() > (size_t) std::numeric_limits::max()) { + throw std::invalid_argument("Invalid LingBot-MAP image payload"); + } + + int src_w = 0, src_h = 0, src_c = 0; + uint8_t * pixels = stbi_load_from_memory(input.data(), (int) input.size(), &src_w, &src_h, &src_c, + /* desired_channels */ 4); + if (pixels == nullptr || src_w <= 0 || src_h <= 0) { + if (pixels != nullptr) { + stbi_image_free(pixels); + } + 
throw std::invalid_argument("LingBot-MAP input is not a supported image"); + } + + try { + int32_t resized_h = (int32_t) std::round(((double) src_h * (double) target_w / (double) src_w) / + (double) patch_size) * patch_size; + resized_h = std::max(patch_size, resized_h); + out.resized_heights.push_back(resized_h); + + const auto rgb = rgba_u8_to_rgb_u8_white(pixels, src_w, src_h); + stbi_image_free(pixels); + pixels = nullptr; + + const auto resized = resize_rgb_u8_pillow_bicubic(rgb, src_w, src_h, target_w, resized_h); + + const int32_t crop_y = resized_h > target_h ? (resized_h - target_h) / 2 : 0; + const int32_t pad_y = resized_h < target_h ? (target_h - resized_h) / 2 : 0; + + for (int32_t c = 0; c < 3; ++c) { + const float denom = std_values[c] == 0.0f ? 1.0f : std_values[c]; + float * dst = out.tensor_nchw.data() + ((i * 3u + (size_t) c) * image_plane); + for (int32_t y = 0; y < target_h; ++y) { + const int32_t src_y = y + crop_y - pad_y; + for (int32_t x = 0; x < target_w; ++x) { + uint8_t v = 255; + if (src_y >= 0 && src_y < resized_h) { + v = resized[((size_t) src_y * (size_t) target_w + (size_t) x) * 3u + (size_t) c]; + } + dst[(size_t) y * (size_t) target_w + (size_t) x] = (((float) v / 255.0f) - mean[c]) / denom; + } + } + } + } catch (...) { + if (pixels != nullptr) { + stbi_image_free(pixels); + } + throw; + } + } + + return out; +} diff --git a/tools/mtmd/smt-vision-preprocess.h b/tools/mtmd/smt-vision-preprocess.h index b4cc6a1a69bb..7aae24b8a4be 100644 --- a/tools/mtmd/smt-vision-preprocess.h +++ b/tools/mtmd/smt-vision-preprocess.h @@ -19,6 +19,14 @@ struct smt_vision_preprocess_config { bool has_normalize_config = false; }; +struct smt_lingbot_map_preprocess_result { + int32_t target_w = 0; + int32_t target_h = 0; + int32_t n_images = 0; + std::vector tensor_nchw; + std::vector resized_heights; +}; + // If input bytes decode as image (jpg/png/webp/...), preprocess them into // float32 NCHW bytes for SMT vision ONNX input. 
Otherwise returns was_image=false. smt_vision_preprocess_result smt_vision_preprocess_if_image(const std::vector & input, @@ -26,3 +34,11 @@ smt_vision_preprocess_result smt_vision_preprocess_if_image(const std::vector> & images, + int32_t target_w, + int32_t target_h, + int32_t patch_size, + const float mean[3], + const float std_values[3]); diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 4d23f33f1350..7c5420a5658f 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1202,7 +1202,7 @@ static server_tokens tokenize_input_subprompt(const llama_vocab * vocab, bool parse_special) { constexpr char JSON_STRING_PROMPT_KEY[] = "prompt_string"; constexpr char JSON_MTMD_DATA_KEY[] = "multimodal_data"; - const bool has_mtmd = mctx != nullptr || smt_ctx != nullptr; + const bool has_mtmd = mctx != nullptr || server_smt_vision_supports_prompt_embeddings(smt_ctx); if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) { // string or mixed llama_tokens tmp = tokenize_mixed(vocab, json_prompt, add_special, parse_special); @@ -1223,7 +1223,7 @@ static server_tokens tokenize_input_subprompt(const llama_vocab * vocab, for (const auto & entry : json_prompt.at(JSON_MTMD_DATA_KEY)) { files.push_back(base64_decode(entry)); } - if (smt_ctx != nullptr) { + if (server_smt_vision_supports_prompt_embeddings(smt_ctx)) { return process_smt_prompt(smt_ctx, vocab, json_prompt.at(JSON_STRING_PROMPT_KEY), files, add_special, parse_special); } diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 1372da3135e4..eec80696ad3f 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -715,6 +715,7 @@ struct server_context_impl { common_context_seq_rm_type ctx_tgt_seq_rm_type = COMMON_CONTEXT_SEQ_RM_TYPE_NO; common_context_seq_rm_type ctx_dft_seq_rm_type = COMMON_CONTEXT_SEQ_RM_TYPE_NO; + bool model_less_reconstruction = false; common_speculative_ptr 
spec; @@ -749,7 +750,7 @@ struct server_context_impl { bool sleeping = false; - bool has_multimodal() const { return mctx != nullptr || smt_ctx != nullptr; } + bool has_multimodal() const { return mctx != nullptr || server_smt_vision_supports_prompt_embeddings(smt_ctx); } const char * vision_backend_name() const { switch (vision_backend) { @@ -819,6 +820,57 @@ struct server_context_impl { params_base = params; params_base.n_outputs_max = server_n_outputs_max(params_base); +#if defined(LLAMA_SERVER_SMT_VISION) + const std::string & lingbot_backend_config_dir = params_base.smt_config_dir; + const bool is_lingbot_map_reconstruct = + (params_base.media_backend == "smt" || params_base.media_backend == "auto") && + server_smt_vision_config_is_lingbot_map(lingbot_backend_config_dir); + + if (is_lingbot_map_reconstruct) { + try { + smt_ctx = server_smt_vision_init(nullptr, lingbot_backend_config_dir, params_base.warmup); + } catch (const std::exception & e) { + SRV_ERR("failed to load LingBot-MAP SMT backend from '%s': %s\n", lingbot_backend_config_dir.c_str(), e.what()); + return false; + } + + vision_backend = SERVER_VISION_BACKEND_SMT; + model_less_reconstruction = true; + chat_params = { + /* use_jinja */ params_base.use_jinja, + /* prefill_assistant */ params_base.prefill_assistant, + /* reasoning_format */ params_base.reasoning_format, + /* chat_template_kwargs */ params_base.default_template_kwargs, + /* tmpls */ nullptr, + /* allow_image */ false, + /* allow_audio */ false, + /* image_bin_only */ false, + /* media_backend */ vision_backend_name(), + /* enable_thinking */ false, + /* reasoning_budget */ params_base.sampling.reasoning_budget_tokens, + /* reasoning_budget_msg */ params_base.sampling.reasoning_budget_message, + /* media_path */ params_base.media_path, + /* force_pure_content */ params_base.force_pure_content_parser + }; + + if (!params_base.model_alias.empty()) { + model_name = *params_base.model_alias.begin(); + } else if 
(!params_base.model.name.empty()) { + model_name = params_base.model.name; + } else { + model_name = "lingbot-map"; + } + model_aliases = params_base.model_alias; + model_tags = params_base.model_tags; + + params = params_base; + if (!is_resume) { + return init(); + } + return true; + } +#endif + std::string & mmproj_path = params_base.mmproj.path; bool has_mmproj = !mmproj_path.empty(); mtmd_context_params mparams = mtmd_context_params_default(); @@ -1029,9 +1081,9 @@ struct server_context_impl { server_vision_backend_mode selected_backend = SERVER_VISION_BACKEND_NONE; #if defined(LLAMA_SERVER_SMT_VISION) const std::string & backend_pref = params_base.media_backend; + const std::string & backend_config_dir = params_base.smt_config_dir; if (backend_pref == "auto") { - const std::string & smt_config_dir = params_base.smt_config_dir; - if (!smt_config_dir.empty()) { + if (!backend_config_dir.empty()) { selected_backend = SERVER_VISION_BACKEND_SMT; } else if (!mmproj_path.empty()) { selected_backend = SERVER_VISION_BACKEND_MTMD; @@ -1070,9 +1122,9 @@ struct server_context_impl { #if defined(LLAMA_SERVER_SMT_VISION) } else if (selected_backend == SERVER_VISION_BACKEND_SMT) { - const std::string & smt_config_dir = params_base.smt_config_dir; + const std::string & smt_config_dir = backend_config_dir; if (smt_config_dir.empty()) { - SRV_ERR("%s", "media backend 'smt' selected but --smt-config-dir is empty\n"); + SRV_ERR("%s", "media backend 'smt' selected but --smt-config-dir is not set\n"); return false; } try { @@ -1259,8 +1311,10 @@ struct server_context_impl { // unlike load_model(), this is only called once during initialization bool init() { - GGML_ASSERT(ctx_tgt != nullptr); - GGML_ASSERT(model_tgt != nullptr); + if (!model_less_reconstruction) { + GGML_ASSERT(ctx_tgt != nullptr); + GGML_ASSERT(model_tgt != nullptr); + } GGML_ASSERT(!sleeping); @@ -1301,6 +1355,10 @@ struct server_context_impl { } } + if (model_less_reconstruction) { + return true; + } + // 
populate chat template params { common_chat_templates_ptr chat_templates; @@ -2076,12 +2134,12 @@ struct server_context_impl { bool tokenize_cli_input(server_task & task) { try { auto & prompt = task.cli_prompt; - if (smt_ctx != nullptr) { + if (server_smt_vision_supports_prompt_embeddings(smt_ctx)) { task.tokens = process_smt_prompt(smt_ctx, vocab, prompt, task.cli_files); } else if (mctx != nullptr) { task.tokens = process_mtmd_prompt(mctx, prompt, task.cli_files); } else { - task.tokens = std::move(tokenize_input_prompts(vocab, mctx, smt_ctx, prompt, true, true)[0]); + task.tokens = std::move(tokenize_input_prompts(vocab, mctx, server_smt_vision_supports_prompt_embeddings(smt_ctx) ? smt_ctx : nullptr, prompt, true, true)[0]); } task.cli_prompt.clear(); task.cli_files.clear(); @@ -3734,7 +3792,7 @@ struct server_context_impl { SRV_DBG("%s", "run slots completed\n"); } - int get_slot_n_ctx() { return slots.back().n_ctx; } + int get_slot_n_ctx() { return slots.empty() ? 0 : slots.back().n_ctx; } server_response_reader get_response_reader() { return server_response_reader(queue_tasks, queue_results, HTTP_POLLING_SECONDS); @@ -3771,8 +3829,9 @@ server_response_reader server_context::get_response_reader() { } server_context_meta server_context::get_meta() const { - auto bos_id = llama_vocab_bos(impl->vocab); - auto eos_id = llama_vocab_eos(impl->vocab); + const bool has_vocab = impl->vocab != nullptr && impl->ctx_tgt != nullptr; + auto bos_id = has_vocab ? llama_vocab_bos(impl->vocab) : LLAMA_TOKEN_NULL; + auto eos_id = has_vocab ? llama_vocab_eos(impl->vocab) : LLAMA_TOKEN_NULL; auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx_tgt, bos_id, true) : ""; auto eos_token_str = eos_id != LLAMA_TOKEN_NULL ? 
common_token_to_piece(impl->ctx_tgt, eos_id, true) : ""; @@ -3786,31 +3845,32 @@ server_context_meta server_context::get_meta() const { /* has_mtmd */ impl->has_multimodal(), /* has_inp_image */ impl->chat_params.allow_image, /* has_inp_audio */ impl->chat_params.allow_audio, + /* has_reconstruction */ server_smt_vision_is_lingbot_map(impl->smt_ctx), /* json_ui_settings */ impl->json_ui_settings, /* json_webui_settings */ impl->json_webui_settings, // Deprecated /* slot_n_ctx */ impl->get_slot_n_ctx(), - /* pooling_type */ llama_pooling_type(impl->ctx_tgt), + /* pooling_type */ impl->ctx_tgt ? llama_pooling_type(impl->ctx_tgt) : LLAMA_POOLING_TYPE_NONE, /* chat_params */ impl->chat_params, - /* chat_template_caps */ common_chat_templates_get_caps(impl->chat_params.tmpls.get()), + /* chat_template_caps */ impl->chat_params.tmpls ? common_chat_templates_get_caps(impl->chat_params.tmpls.get()) : std::map{}, /* bos_token_str */ bos_token_str, /* eos_token_str */ eos_token_str, - /* fim_pre_token */ llama_vocab_fim_pre(impl->vocab), - /* fim_sub_token */ llama_vocab_fim_suf(impl->vocab), - /* fim_mid_token */ llama_vocab_fim_mid(impl->vocab), - /* fim_pad_token */ llama_vocab_fim_pad(impl->vocab), - /* fim_rep_token */ llama_vocab_fim_rep(impl->vocab), - /* fim_sep_token */ llama_vocab_fim_sep(impl->vocab), + /* fim_pre_token */ has_vocab ? llama_vocab_fim_pre(impl->vocab) : LLAMA_TOKEN_NULL, + /* fim_sub_token */ has_vocab ? llama_vocab_fim_suf(impl->vocab) : LLAMA_TOKEN_NULL, + /* fim_mid_token */ has_vocab ? llama_vocab_fim_mid(impl->vocab) : LLAMA_TOKEN_NULL, + /* fim_pad_token */ has_vocab ? llama_vocab_fim_pad(impl->vocab) : LLAMA_TOKEN_NULL, + /* fim_rep_token */ has_vocab ? llama_vocab_fim_rep(impl->vocab) : LLAMA_TOKEN_NULL, + /* fim_sep_token */ has_vocab ? 
llama_vocab_fim_sep(impl->vocab) : LLAMA_TOKEN_NULL, /* logit_bias_eog */ impl->params_base.sampling.logit_bias_eog, - /* model_vocab_type */ llama_vocab_type(impl->vocab), - /* model_vocab_n_tokens */ llama_vocab_n_tokens(impl->vocab), - /* model_n_ctx_train */ llama_model_n_ctx_train(impl->model_tgt), - /* model_n_embd_inp */ llama_model_n_embd(impl->model_tgt), - /* model_n_params */ llama_model_n_params(impl->model_tgt), - /* model_size */ llama_model_size(impl->model_tgt), + /* model_vocab_type */ has_vocab ? llama_vocab_type(impl->vocab) : LLAMA_VOCAB_TYPE_NONE, + /* model_vocab_n_tokens */ has_vocab ? llama_vocab_n_tokens(impl->vocab) : 0, + /* model_n_ctx_train */ impl->model_tgt ? llama_model_n_ctx_train(impl->model_tgt) : 0, + /* model_n_embd_inp */ impl->model_tgt ? llama_model_n_embd(impl->model_tgt) : 0, + /* model_n_params */ impl->model_tgt ? llama_model_n_params(impl->model_tgt) : 0, + /* model_size */ impl->model_tgt ? llama_model_size(impl->model_tgt) : 0, }; } @@ -3886,7 +3946,7 @@ static int32_t prompt_get_n_before_user( if (mctx != nullptr) { result = (int32_t) process_mtmd_prompt(mctx, prefix, prefix_files).size(); - } else if (smt_ctx != nullptr) { + } else if (server_smt_vision_supports_prompt_embeddings(smt_ctx)) { result = (int32_t) process_smt_prompt(smt_ctx, vocab, prefix, prefix_files).size(); } else { result = (int32_t) tokenize_input_prompts(vocab, nullptr, nullptr, prefix, true, true)[0].size(); @@ -3928,9 +3988,10 @@ std::unique_ptr server_routes::handle_completions_impl(con // process prompt std::vector inputs; - if (res_type != TASK_RESPONSE_TYPE_NONE && (ctx_server.mctx != nullptr || ctx_server.smt_ctx != nullptr)) { + if (res_type != TASK_RESPONSE_TYPE_NONE && + (ctx_server.mctx != nullptr || server_smt_vision_supports_prompt_embeddings(ctx_server.smt_ctx))) { // OAI-compatible chat path with multimodal backend. 
- if (ctx_server.smt_ctx != nullptr) { + if (server_smt_vision_supports_prompt_embeddings(ctx_server.smt_ctx)) { inputs.push_back( process_smt_prompt(ctx_server.smt_ctx, ctx_server.vocab, prompt.get(), files)); } else { @@ -3938,7 +3999,7 @@ std::unique_ptr server_routes::handle_completions_impl(con } } else { // Everything else, including multimodal completions. - inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, ctx_server.smt_ctx, prompt, true, true); + inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, server_smt_vision_supports_prompt_embeddings(ctx_server.smt_ctx) ? ctx_server.smt_ctx : nullptr, prompt, true, true); } // tasks.reserve(inputs.size()); // TODO: this is inaccurate due to child tasks @@ -4383,6 +4444,7 @@ void server_routes::init_routes() { { "modalities", json { {"vision", meta->has_inp_image}, {"audio", meta->has_inp_audio}, + {"reconstruction", meta->has_reconstruction}, } }, { "media_marker", get_media_marker() }, { "endpoint_slots", params.endpoint_slots }, @@ -4489,7 +4551,7 @@ void server_routes::init_routes() { std::string prompt = json_value(data, "prompt", std::string()); std::vector tokenized_prompts = - tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, ctx_server.smt_ctx, prompt, false, true); + tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, server_smt_vision_supports_prompt_embeddings(ctx_server.smt_ctx) ? 
ctx_server.smt_ctx : nullptr, prompt, false, true); SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); data["prompt"] = format_prompt_infill( ctx_server.vocab, data.at("input_prefix"), data.at("input_suffix"), data.at("input_extra"), params.n_batch, @@ -4631,6 +4693,167 @@ void server_routes::init_routes() { return res; }; + this->post_reconstruct = [this](const server_http_req & req) { + auto res = create_response(); +#if defined(LLAMA_SERVER_SMT_VISION) + if (!server_smt_vision_is_lingbot_map(ctx_server.smt_ctx)) { + res->error(format_error_response("The current SMT model is not LingBot-MAP.", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + server_smt_lingbot_map_reconstruct_options options; + if (!req.body.empty() && req.files.empty()) { + const json body = json::parse(req.body); + options.output_pose = json_value(body, "output_pose", options.output_pose); + options.output_depth = json_value(body, "output_depth", options.output_depth); + options.output_point_cloud = json_value(body, "output_point_cloud", options.output_point_cloud); + options.max_frames = json_value(body, "max_frames", options.max_frames); + } + + std::vector> images; + images.reserve(req.files.size()); + for (const auto & item : req.files) { + images.push_back(item.second.data); + } + if (images.empty()) { + res->error(format_error_response("LingBot-MAP reconstruction requires multipart image uploads.", + ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + try { + const auto result = server_smt_vision_lingbot_map_reconstruct(ctx_server.smt_ctx, images, options); + res->ok({ + {"object", "lingbot_map.reconstruction"}, + {"status", result.inference_ready ? 
"ok" : "not_implemented"}, + {"message", result.message}, + {"architecture", result.architecture}, + {"n_images", result.n_images}, + {"tensor_count", result.tensor_count}, + {"image_size", result.image_size}, + {"patch_size", result.patch_size}, + {"hidden_size", result.hidden_size}, + {"camera_hidden_size", result.camera_hidden_size}, + {"preprocess", { + {"width", result.preprocess_width}, + {"height", result.preprocess_height}, + {"resized_heights", result.resized_heights}, + }}, + {"vision_input_shape", result.vision_input_shape}, + {"vision_output_shape", result.vision_output_shape}, + {"vision_input_float_count", result.vision_input_float_count}, + {"vision_output_float_count", result.vision_output_float_count}, + {"vision_output_frames", result.vision_output_frames}, + {"vision_output_tokens", result.vision_output_tokens}, + {"vision_output_hidden", result.vision_output_hidden}, + {"aggregator_input", { + {"tokens_per_frame", result.aggregator_tokens_per_frame}, + {"patch_start_idx", result.aggregator_patch_start_idx}, + {"patch_tokens", result.aggregator_patch_tokens}, + {"vit_prefix_tokens", result.aggregator_vit_prefix_tokens}, + }}, + {"ggml_runtime", { + {"backend", result.ggml_runtime_backend}, + {"buffer_type", result.ggml_runtime_buffer_type}, + {"graph_nodes", result.ggml_runtime_graph_nodes}, + }}, + {"aggregator_probe", { + {"graph_nodes", result.aggregator_probe_graph_nodes}, + {"qkv_shape", result.aggregator_probe_qkv_shape}, + {"output_shape", result.aggregator_probe_output_shape}, + {"global_graph_nodes", result.aggregator_global_probe_graph_nodes}, + {"global_input_tokens", result.aggregator_global_probe_input_tokens}, + {"global_qkv_shape", result.aggregator_global_probe_qkv_shape}, + {"global_output_shape", result.aggregator_global_probe_output_shape}, + {"full_graph_nodes", result.aggregator_full_probe_graph_nodes}, + {"full_selected_outputs", result.aggregator_full_probe_selected_outputs}, + {"full_frame_blocks", 
result.aggregator_full_probe_frame_blocks}, + {"full_global_blocks", result.aggregator_full_probe_global_blocks}, + {"full_final_frame_shape", result.aggregator_full_probe_final_frame_shape}, + {"full_final_global_shape", result.aggregator_full_probe_final_global_shape}, + {"aggregator_graph_nodes", result.aggregator_graph_nodes}, + {"aggregator_graph_selected_outputs", result.aggregator_graph_selected_outputs}, + {"aggregator_graph_frame_blocks", result.aggregator_graph_frame_blocks}, + {"aggregator_graph_global_blocks", result.aggregator_graph_global_blocks}, + {"aggregator_graph_tokens_per_frame", result.aggregator_graph_tokens_per_frame}, + {"aggregator_graph_patch_start_idx", result.aggregator_graph_patch_start_idx}, + {"aggregator_graph_final_frame_shape", result.aggregator_graph_final_frame_shape}, + {"aggregator_graph_final_global_shape", result.aggregator_graph_final_global_shape}, + {"aggregator_graph_selected_output_shapes", result.aggregator_graph_selected_output_shapes}, + {"selected_layers", result.aggregator_selected_layers}, + }}, + {"camera_head", { + {"graph_nodes", result.camera_head_graph_nodes}, + {"trunk_blocks", result.camera_head_trunk_blocks}, + {"iterations", result.camera_head_iterations}, + {"pose_dim", result.camera_head_pose_dim}, + {"input_shape", result.camera_head_input_shape}, + {"final_pose_shape", result.camera_head_final_pose_shape}, + {"iteration_pose_shapes", result.camera_head_iteration_pose_shapes}, + }}, + {"depth_onnx", { + {"input_count", result.depth_onnx_input_count}, + {"output_count", result.depth_onnx_output_count}, + {"input_float_count", result.depth_onnx_input_float_count}, + {"input_source", result.depth_input_source}, + {"input_names", result.depth_input_names}, + {"output_names", result.depth_output_names}, + {"input_shapes", result.depth_input_shapes}, + {"output_shapes", result.depth_output_shapes}, + {"output_float_counts", result.depth_output_float_counts}, + }}, + {"postprocess", { + {"pose_source", 
result.pose_output_source}, + {"pose_encoding_shape", result.pose_encoding_shape}, + {"extrinsic_shape", result.extrinsic_shape}, + {"intrinsic_shape", result.intrinsic_shape}, + {"world_points_shape", result.world_points_shape}, + {"world_points_conf_shape", result.world_points_conf_shape}, + {"world_points_bin", { + {"path", result.world_points_path}, + {"dtype", "float32"}, + {"layout", "xyz"}, + {"shape", result.world_points_shape}, + {"bytes", result.world_points_bytes}, + }}, + {"point_count", result.postprocess_point_count}, + {"sample_count", result.postprocess_sample_count}, + {"pose_encoding_sample", result.pose_encoding_sample}, + {"extrinsic_first", result.extrinsic_first}, + {"intrinsic_first", result.intrinsic_first}, + {"world_points_sample", result.world_points_sample}, + {"depth_stats", { + {"min", result.depth_min}, + {"max", result.depth_max}, + {"mean", result.depth_mean}, + }}, + {"depth_conf_stats", { + {"min", result.depth_conf_min}, + {"max", result.depth_conf_max}, + {"mean", result.depth_conf_mean}, + }}, + }}, + {"onnx_sessions_loaded", result.onnx_sessions_loaded}, + {"outputs", { + {"pose", result.output_pose}, + {"depth", result.output_depth}, + {"point_cloud", result.output_point_cloud}, + }}, + {"stages", result.stages}, + }); + } catch (const std::invalid_argument & e) { + res->error(format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST)); + } catch (const std::exception & e) { + res->error(format_error_response(e.what(), ERROR_TYPE_SERVER)); + } +#else + GGML_UNUSED(req); + res->error(format_error_response("LingBot-MAP reconstruction requires llama-server built with SMT vision support.", + ERROR_TYPE_NOT_SUPPORTED)); +#endif + return res; + }; + this->get_models = [this](const server_http_req &) { auto res = create_response(true); @@ -5043,7 +5266,7 @@ std::unique_ptr server_routes::handle_embeddings_impl(cons } auto tokenized_prompts = - tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, ctx_server.smt_ctx, prompt, 
true, true); + tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, server_smt_vision_supports_prompt_embeddings(ctx_server.smt_ctx) ? ctx_server.smt_ctx : nullptr, prompt, true, true); for (const auto & tokens : tokenized_prompts) { // this check is necessary for models that do not add BOS token to the input if (tokens.empty()) { diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 77a935d47b91..3083b733c5e0 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -22,6 +22,7 @@ struct server_context_meta { bool has_mtmd; bool has_inp_image; bool has_inp_audio; + bool has_reconstruction; json json_ui_settings; // Primary: new name json json_webui_settings; // Deprecated: use json_ui_settings instead (kept for backward compat) int slot_n_ctx; @@ -117,6 +118,7 @@ struct server_routes { server_http_context::handler_t post_anthropic_messages; server_http_context::handler_t post_anthropic_count_tokens; server_http_context::handler_t post_apply_template; + server_http_context::handler_t post_reconstruct; server_http_context::handler_t get_models; server_http_context::handler_t post_tokenize; server_http_context::handler_t post_detokenize; diff --git a/tools/server/server-smt-vision.cpp b/tools/server/server-smt-vision.cpp index bd6631511a0e..38bbc40d0362 100644 --- a/tools/server/server-smt-vision.cpp +++ b/tools/server/server-smt-vision.cpp @@ -1,18 +1,32 @@ #include "server-smt-vision.h" +#include "onnxruntime_cxx_api.h" + #include "common.h" #include "log.h" +#include + #include #include +#include +#include #include +#include +#include #include #include #include #include +#include +#include +#include #include #include +#include #include +#include +#include #include #include @@ -20,20 +34,76 @@ # include "../mtmd/smt-audio-wrapper.h" # include "../mtmd/smt-vision-preprocess.h" # include "../mtmd/smt-vision-wrapper.h" +# include "../mtmd/lingbot-map-wrapper.h" +#endif + +#if defined(LLAMA_SERVER_SMT_VISION) 
+namespace onnxruntime { +extern const OrtApi * g_ort; +} #endif + +struct lingbot_map_postprocess_result { + std::vector pose_encoding_shape; + std::vector extrinsic_shape; + std::vector intrinsic_shape; + std::vector world_points_shape; + std::vector world_points_conf_shape; + std::vector pose_encoding_sample; + std::vector extrinsic_first; + std::vector intrinsic_first; + std::vector world_points_sample; + std::string world_points_path; + int64_t point_count = 0; + int64_t world_points_bytes = 0; + int32_t sample_count = 0; + double depth_min = 0.0; + double depth_max = 0.0; + double depth_mean = 0.0; + double depth_conf_min = 0.0; + double depth_conf_max = 0.0; + double depth_conf_mean = 0.0; + std::string pose_source; +}; + +struct lingbot_map_onnx_context { + Ort::Env env{ ORT_LOGGING_LEVEL_WARNING, "lingbot-map" }; + Ort::Session vision_session{ nullptr }; + Ort::Session depth_session{ nullptr }; + + std::vector vision_input_names; + std::vector vision_output_names; + std::vector vision_input_names_raw; + std::vector vision_output_names_raw; + + std::vector depth_input_names; + std::vector depth_output_names; + std::vector depth_input_names_raw; + std::vector depth_output_names_raw; + + std::vector vision_input_shape; + std::vector> depth_input_shapes; + std::vector> depth_output_shapes; + int32_t vision_input_h = 0; + int32_t vision_input_w = 0; +}; + #if defined(_WIN32) # include # include #else # include # include +# include #endif struct server_smt_vision_context { #if defined(LLAMA_SERVER_SMT_VISION) - std::unique_ptr smt_vision; - std::unique_ptr smt_audio; + std::unique_ptr smt_vision; + std::unique_ptr smt_audio; + std::unique_ptr lingbot_map; + std::unique_ptr lingbot_onnx; #endif std::mutex mu; int32_t hidden_size = 0; @@ -43,8 +113,478 @@ struct server_smt_vision_context { std::vector tok_audio_beg; std::vector tok_audio_end; std::string architecture; + std::string config_dir; }; + +static int64_t lingbot_current_rss_mb() { +#if defined(_WIN32) + 
return -1; +#else + std::ifstream statm("/proc/self/statm"); + int64_t pages_total = 0; + int64_t pages_rss = 0; + if (!(statm >> pages_total >> pages_rss)) { + return -1; + } + const long page_size = sysconf(_SC_PAGESIZE); + if (page_size <= 0) { + return -1; + } + return (pages_rss * (int64_t) page_size) / (1024 * 1024); +#endif +} + + +static int64_t lingbot_elapsed_ms(std::chrono::steady_clock::time_point start) { + return std::chrono::duration_cast(std::chrono::steady_clock::now() - start).count(); +} + +static void lingbot_log_rss(const char * stage) { + const int64_t rss_mb = lingbot_current_rss_mb(); + if (rss_mb >= 0) { + std::cerr << "[LingBot-MAP][mem] " << stage << " rss=" << rss_mb << " MiB\n"; + } +} + + +static std::pair lingbot_make_world_points_paths(const std::string & config_dir) { + namespace fs = std::filesystem; + + std::error_code ec; + fs::path root = config_dir.empty() ? fs::temp_directory_path(ec) : fs::path(config_dir); + if (ec) { + throw std::runtime_error("failed to resolve temporary directory for LingBot-MAP point cloud output"); + } + + fs::path out_dir = root / "lingbot_map_outputs"; + fs::create_directories(out_dir, ec); + if (ec) { + throw std::runtime_error("failed to create LingBot-MAP point cloud output directory: " + out_dir.string()); + } + + static std::atomic counter{ 0 }; + const uint64_t stamp = (uint64_t) std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); + const uint64_t seq = counter.fetch_add(1, std::memory_order_relaxed); + const std::string file_name = "world_points_" + std::to_string(stamp) + "_" + std::to_string(seq) + ".f32.bin"; + const fs::path relative_path = fs::path("lingbot_map_outputs") / file_name; + const fs::path write_path = fs::absolute(root / relative_path); + return { write_path.string(), relative_path.generic_string() }; +} + +static std::vector lingbot_make_name_ptrs(const std::vector & names) { + std::vector ptrs; + ptrs.reserve(names.size()); + for (const 
auto & name : names) { + ptrs.push_back(name.c_str()); + } + return ptrs; +} + +static std::vector lingbot_get_io_shape(Ort::Session & session, bool inputs, size_t index) { + Ort::TypeInfo type_info = inputs ? session.GetInputTypeInfo(index) : session.GetOutputTypeInfo(index); + auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); + return tensor_info.GetShape(); +} + +static std::vector lingbot_get_io_names(Ort::Session & session, bool inputs) { + Ort::AllocatorWithDefaultOptions allocator; + const size_t count = inputs ? session.GetInputCount() : session.GetOutputCount(); + std::vector names; + names.reserve(count); + for (size_t i = 0; i < count; ++i) { + auto allocated = inputs ? session.GetInputNameAllocated(i, allocator) : session.GetOutputNameAllocated(i, allocator); + names.emplace_back(allocated.get()); + } + return names; +} + + +static bool lingbot_init_spacemit_execution_provider( + Ort::SessionOptions & options, + const std::unordered_map & provider_options, + std::string & error_message) { + std::vector keys; + std::vector values; + keys.reserve(provider_options.size()); + values.reserve(provider_options.size()); + for (const auto & entry : provider_options) { + keys.push_back(entry.first.c_str()); + values.push_back(entry.second.c_str()); + } + +#if defined(_WIN32) + GGML_UNUSED(options); + GGML_UNUSED(keys); + GGML_UNUSED(values); + error_message = "Spacemit EP dynamic initialization is not implemented on Windows"; + return false; +#else + void * handle = dlopen("libspacemit_ep.so", RTLD_NOW); + if (!handle) { + error_message = std::string("failed to load libspacemit_ep.so: ") + dlerror(); + return false; + } + + auto * ep_init = reinterpret_cast( + dlsym(handle, "OrtSessionOptionsSpaceMITEnvInit")); + if (!ep_init) { + error_message = std::string("failed to find OrtSessionOptionsSpaceMITEnvInit: ") + dlerror(); + return false; + } + + if (OrtStatus * status = ep_init(options, keys.data(), values.data(), keys.size())) { + error_message = 
Ort::GetApi().GetErrorMessage(status); + Ort::GetApi().ReleaseStatus(status); + return false; + } + return true; +#endif +} + +static void lingbot_append_spacemit_ep(Ort::SessionOptions & session_options, + const char * session_name, + const lingbot_map_config & cfg) { + std::unordered_map provider_options = cfg.ep_config; + if (provider_options.find("SPACEMIT_EP_INTRA_THREAD_NUM") == provider_options.end()) { + provider_options["SPACEMIT_EP_INTRA_THREAD_NUM"] = "4"; + } + if (provider_options.find("SPACEMIT_EP_INTER_THREAD_NUM") == provider_options.end()) { + provider_options["SPACEMIT_EP_INTER_THREAD_NUM"] = "1"; + } + + std::string error_message; + if (!lingbot_init_spacemit_execution_provider(session_options, provider_options, error_message)) { + throw std::runtime_error(std::string("[LingBot-MAP] failed to initialize Spacemit EP for ") + session_name + ": " + error_message); + } + + std::cerr << "[LingBot-MAP] Spacemit EP enabled for " << session_name << " ("; + for (const auto & pair : provider_options) { + std::cerr << ", " << pair.first << "=" << pair.second; + } + std::cerr << ")\n"; +} + +static std::unique_ptr create_lingbot_map_onnx_context(const lingbot_map_config & cfg) { + auto ctx = std::make_unique(); + Ort::SessionOptions vision_options; + Ort::SessionOptions depth_options; + vision_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); + depth_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); + lingbot_append_spacemit_ep(vision_options, "vit_encoder", cfg); + lingbot_append_spacemit_ep(depth_options, "dpt_head", cfg); + + ctx->vision_session = Ort::Session(ctx->env, cfg.vision_model_path.c_str(), vision_options); + ctx->depth_session = Ort::Session(ctx->env, cfg.depth_model_path.c_str(), depth_options); + + ctx->vision_input_names = lingbot_get_io_names(ctx->vision_session, true); + ctx->vision_output_names = lingbot_get_io_names(ctx->vision_session, false); + ctx->depth_input_names = 
lingbot_get_io_names(ctx->depth_session, true); + ctx->depth_output_names = lingbot_get_io_names(ctx->depth_session, false); + + ctx->vision_input_shape = lingbot_get_io_shape(ctx->vision_session, true, 0); + ctx->depth_input_shapes.reserve(ctx->depth_input_names.size()); + for (size_t i = 0; i < ctx->depth_input_names.size(); ++i) { + ctx->depth_input_shapes.push_back(lingbot_get_io_shape(ctx->depth_session, true, i)); + } + ctx->depth_output_shapes.reserve(ctx->depth_output_names.size()); + for (size_t i = 0; i < ctx->depth_output_names.size(); ++i) { + ctx->depth_output_shapes.push_back(lingbot_get_io_shape(ctx->depth_session, false, i)); + } + + if (ctx->vision_input_shape.size() == 5) { + if (ctx->vision_input_shape[3] > 0) { + ctx->vision_input_h = (int32_t) ctx->vision_input_shape[3]; + } + if (ctx->vision_input_shape[4] > 0) { + ctx->vision_input_w = (int32_t) ctx->vision_input_shape[4]; + } + } + + if (ctx->vision_input_names.empty() || ctx->vision_output_names.empty()) { + throw std::runtime_error("LingBot-MAP ViT ONNX session has empty IO signature"); + } + if (ctx->depth_input_names.empty() || ctx->depth_output_names.empty()) { + throw std::runtime_error("LingBot-MAP DPT ONNX session has empty IO signature"); + } + + ctx->vision_input_names_raw = lingbot_make_name_ptrs(ctx->vision_input_names); + ctx->vision_output_names_raw = lingbot_make_name_ptrs(ctx->vision_output_names); + ctx->depth_input_names_raw = lingbot_make_name_ptrs(ctx->depth_input_names); + ctx->depth_output_names_raw = lingbot_make_name_ptrs(ctx->depth_output_names); + return ctx; +} + + +static int64_t lingbot_numel(const std::vector & shape) { + if (shape.empty()) { + return 0; + } + int64_t count = 1; + for (const int64_t dim : shape) { + if (dim <= 0) { + return 0; + } + count *= dim; + } + return count; +} + +static std::vector lingbot_make_depth_input_shape( + const std::vector & onnx_shape, + int32_t n_frames, + int32_t tokens_per_frame, + int32_t camera_hidden_size) { + if 
(onnx_shape.size() != 4) { + throw std::runtime_error("LingBot-MAP DPT input must be rank-4 [1, frames, tokens, hidden]"); + } + std::vector shape = onnx_shape; + const int64_t expected[4] = { 1, (int64_t) n_frames, (int64_t) tokens_per_frame, (int64_t) camera_hidden_size }; + for (size_t i = 0; i < 4; ++i) { + if (shape[i] < 0) { + shape[i] = expected[i]; + } + if (shape[i] != expected[i]) { + throw std::runtime_error("LingBot-MAP DPT input shape does not match aggregator selected output boundary"); + } + } + return shape; +} + +static void lingbot_validate_depth_outputs(const std::vector & outputs) { + if (outputs.size() != 2 || !outputs[0].IsTensor() || !outputs[1].IsTensor()) { + throw std::runtime_error("LingBot-MAP DPT ONNX must return depth and depth_conf tensors"); + } + const auto depth_shape = outputs[0].GetTensorTypeAndShapeInfo().GetShape(); + const auto conf_shape = outputs[1].GetTensorTypeAndShapeInfo().GetShape(); + if (depth_shape.size() != 5 || conf_shape.size() != 4) { + throw std::runtime_error("LingBot-MAP DPT ONNX returned unexpected output ranks"); + } + if (depth_shape[0] != conf_shape[0] || depth_shape[1] != conf_shape[1] || + depth_shape[2] != conf_shape[2] || depth_shape[3] != conf_shape[3] || depth_shape[4] != 1) { + throw std::runtime_error("LingBot-MAP DPT depth/depth_conf output shapes are inconsistent"); + } +} + + + +static void lingbot_quat_xyzw_to_mat(const float * q, float r[9]) { + const double x = q[0]; + const double y = q[1]; + const double z = q[2]; + const double w = q[3]; + double denom = x*x + y*y + z*z + w*w; + if (denom <= 1e-12) { + denom = 1.0; + } + const double two_s = 2.0 / denom; + r[0] = (float) (1.0 - two_s * (y*y + z*z)); + r[1] = (float) (two_s * (x*y - z*w)); + r[2] = (float) (two_s * (x*z + y*w)); + r[3] = (float) (two_s * (x*y + z*w)); + r[4] = (float) (1.0 - two_s * (x*x + z*z)); + r[5] = (float) (two_s * (y*z - x*w)); + r[6] = (float) (two_s * (x*z - y*w)); + r[7] = (float) (two_s * (y*z + x*w)); + r[8] = 
(float) (1.0 - two_s * (x*x + y*y)); +} + +static lingbot_map_postprocess_result lingbot_postprocess_reconstruction( + const float * pose_encoding, + const std::string & pose_source, + const float * depth, + const float * depth_conf, + const std::vector & depth_shape, + const std::vector & depth_conf_shape, + int32_t n_frames, + const std::string & world_points_write_path, + const std::string & world_points_response_path) { + if (pose_encoding == nullptr || depth == nullptr || depth_conf == nullptr) { + throw std::runtime_error("LingBot-MAP postprocess requires pose, depth and depth_conf data"); + } + if (depth_shape.size() != 5 || depth_conf_shape.size() != 4 || depth_shape[0] != 1 || + depth_shape[1] != n_frames || depth_shape[4] != 1 || depth_conf_shape[0] != 1 || + depth_conf_shape[1] != n_frames || depth_conf_shape[2] != depth_shape[2] || + depth_conf_shape[3] != depth_shape[3]) { + throw std::runtime_error("LingBot-MAP postprocess received incompatible depth output shapes"); + } + + const int64_t h = depth_shape[2]; + const int64_t w = depth_shape[3]; + const int64_t point_count = (int64_t) n_frames * h * w; + if (point_count <= 0) { + throw std::runtime_error("LingBot-MAP postprocess requires non-empty depth outputs"); + } + + lingbot_map_postprocess_result result; + result.pose_source = pose_source; + result.pose_encoding_shape = { 1, n_frames, 9 }; + result.extrinsic_shape = { 1, n_frames, 3, 4 }; + result.intrinsic_shape = { 1, n_frames, 3, 3 }; + result.world_points_shape = { 1, n_frames, h, w, 3 }; + result.world_points_conf_shape = { 1, n_frames, h, w }; + result.point_count = point_count; + result.world_points_path = world_points_response_path; + + std::ofstream world_points_file; + if (!world_points_write_path.empty()) { + world_points_file.open(world_points_write_path, std::ios::binary | std::ios::trunc); + if (!world_points_file.is_open()) { + throw std::runtime_error("failed to open LingBot-MAP world points output: " + world_points_write_path); + 
} + } + + std::vector extrinsics_w2c((size_t) n_frames * 12, 0.0f); + std::vector extrinsics_c2w((size_t) n_frames * 12, 0.0f); + std::vector intrinsics((size_t) n_frames * 9, 0.0f); + std::vector c2w_rot((size_t) n_frames * 9, 0.0f); + std::vector c2w_trans((size_t) n_frames * 3, 0.0f); + + for (int32_t f = 0; f < n_frames; ++f) { + const float * p = pose_encoding + (size_t) f * 9; + float r[9]; + lingbot_quat_xyzw_to_mat(p + 3, r); + + float * e = extrinsics_w2c.data() + (size_t) f * 12; + e[0] = r[0]; e[1] = r[1]; e[2] = r[2]; e[3] = p[0]; + e[4] = r[3]; e[5] = r[4]; e[6] = r[5]; e[7] = p[1]; + e[8] = r[6]; e[9] = r[7]; e[10] = r[8]; e[11] = p[2]; + + float fov_h = p[7]; + float fov_w = p[8]; + if (fov_h <= 1e-6f) { + fov_h = 1.0471975511965977f; + } + if (fov_w <= 1e-6f) { + fov_w = 1.0471975511965977f; + } + const float fy = (float) ((double) h / 2.0 / std::tan((double) fov_h / 2.0)); + const float fx = (float) ((double) w / 2.0 / std::tan((double) fov_w / 2.0)); + float * k = intrinsics.data() + (size_t) f * 9; + k[0] = fx; + k[4] = fy; + k[2] = (float) w / 2.0f; + k[5] = (float) h / 2.0f; + k[8] = 1.0f; + + float * cr = c2w_rot.data() + (size_t) f * 9; + cr[0] = r[0]; cr[1] = r[3]; cr[2] = r[6]; + cr[3] = r[1]; cr[4] = r[4]; cr[5] = r[7]; + cr[6] = r[2]; cr[7] = r[5]; cr[8] = r[8]; + + float * ct = c2w_trans.data() + (size_t) f * 3; + ct[0] = -(cr[0] * p[0] + cr[1] * p[1] + cr[2] * p[2]); + ct[1] = -(cr[3] * p[0] + cr[4] * p[1] + cr[5] * p[2]); + ct[2] = -(cr[6] * p[0] + cr[7] * p[1] + cr[8] * p[2]); + + float * c2w = extrinsics_c2w.data() + (size_t) f * 12; + c2w[0] = cr[0]; c2w[1] = cr[1]; c2w[2] = cr[2]; c2w[3] = ct[0]; + c2w[4] = cr[3]; c2w[5] = cr[4]; c2w[6] = cr[5]; c2w[7] = ct[1]; + c2w[8] = cr[6]; c2w[9] = cr[7]; c2w[10] = cr[8]; c2w[11] = ct[2]; + } + + result.pose_encoding_sample.assign(pose_encoding, pose_encoding + std::min((int64_t) n_frames * 9, 9)); + result.extrinsic_first.assign(extrinsics_c2w.begin(), extrinsics_c2w.begin() + 
std::min(extrinsics_c2w.size(), 12)); + result.intrinsic_first.assign(intrinsics.begin(), intrinsics.begin() + std::min(intrinsics.size(), 9)); + + const int32_t sample_limit = 64; + const int64_t sample_stride = std::max(1, point_count / sample_limit); + result.world_points_sample.reserve((size_t) sample_limit * 3); + + double depth_sum = 0.0; + double conf_sum = 0.0; + result.depth_min = depth[0]; + result.depth_max = depth[0]; + result.depth_conf_min = depth_conf[0]; + result.depth_conf_max = depth_conf[0]; + + int32_t sample_count = 0; + for (int32_t f = 0; f < n_frames; ++f) { + const float * k = intrinsics.data() + (size_t) f * 9; + const float * cr = c2w_rot.data() + (size_t) f * 9; + const float * ct = c2w_trans.data() + (size_t) f * 3; + const float fx_cur = k[0]; + const float fy_cur = k[4]; + const float cx = k[2]; + const float cy = k[5]; + for (int64_t y = 0; y < h; ++y) { + for (int64_t x = 0; x < w; ++x) { + const int64_t idx = ((int64_t) f * h + y) * w + x; + const float d = depth[idx]; + const float c = depth_conf[idx]; + result.depth_min = std::min(result.depth_min, (double) d); + result.depth_max = std::max(result.depth_max, (double) d); + result.depth_conf_min = std::min(result.depth_conf_min, (double) c); + result.depth_conf_max = std::max(result.depth_conf_max, (double) c); + depth_sum += d; + conf_sum += c; + + const float cam_x = ((float) x - cx) * d / fx_cur; + const float cam_y = ((float) y - cy) * d / fy_cur; + const float cam_z = d; + const float world_xyz[3] = { + cr[0] * cam_x + cr[1] * cam_y + cr[2] * cam_z + ct[0], + cr[3] * cam_x + cr[4] * cam_y + cr[5] * cam_z + ct[1], + cr[6] * cam_x + cr[7] * cam_y + cr[8] * cam_z + ct[2], + }; + + if (world_points_file.is_open()) { + world_points_file.write(reinterpret_cast(world_xyz), sizeof(world_xyz)); + if (!world_points_file) { + throw std::runtime_error("failed to write LingBot-MAP world points output: " + world_points_write_path); + } + } + + if (idx % sample_stride == 0 && sample_count < 
sample_limit) { + result.world_points_sample.insert(result.world_points_sample.end(), world_xyz, world_xyz + 3); + sample_count += 1; + } + } + } + } + if (world_points_file.is_open()) { + world_points_file.close(); + result.world_points_bytes = point_count * 3 * (int64_t) sizeof(float); + } + result.depth_mean = depth_sum / (double) point_count; + result.depth_conf_mean = conf_sum / (double) point_count; + result.sample_count = sample_count; + return result; +} + +bool server_smt_vision_config_is_lingbot_map(const std::string & config_dir) { + if (config_dir.empty()) { + return false; + } + + const std::string config_path = config_dir + "/config.json"; + std::ifstream file(config_path); + if (!file.is_open()) { + return false; + } + + try { + nlohmann::json config = nlohmann::json::parse(file); + if (!config.contains("architectures")) { + return false; + } + const auto & arch = config.at("architectures"); + if (arch.is_array()) { + for (const auto & value : arch) { + if (value.is_string() && value.get() == "LingBotMapFor3DReconstruction") { + return true; + } + } + } else if (arch.is_string()) { + return arch.get() == "LingBotMapFor3DReconstruction"; + } + } catch (...) 
{ + return false; + } + return false; +} + static std::string fnv_hash(const uint8_t * data, size_t len) { const uint64_t fnv_prime = 0x100000001b3ULL; uint64_t hash = 0xcbf29ce484222325ULL; @@ -406,8 +946,24 @@ static int decode_embd(llama_context * lctx, server_smt_vision_context * server_smt_vision_init(llama_context * lctx, const std::string & config_dir, bool warmup) { #if defined(LLAMA_SERVER_SMT_VISION) auto ctx = std::make_unique(); + ctx->config_dir = config_dir; std::string primary_architecture; + if (server_smt_vision_config_is_lingbot_map(config_dir)) { + GGML_UNUSED(lctx); + GGML_UNUSED(warmup); + onnxruntime::g_ort = OrtGetApiBase()->GetApi(ORT_API_VERSION); + ctx->lingbot_map = lingbot_map_context::create(config_dir); + ctx->lingbot_onnx = create_lingbot_map_onnx_context(ctx->lingbot_map->config()); + ctx->architecture = ctx->lingbot_map->architecture(); + LOG_INF("[server-smt] loaded LingBot-MAP model from '%s', architecture=%s, tensors=%" PRId64 "\n", + config_dir.c_str(), ctx->architecture.c_str(), ctx->lingbot_map->tensor_count()); + LOG_INF("[server-smt] loaded LingBot-MAP ONNX sessions: vit_inputs=%zu, vit_outputs=%zu, dpt_inputs=%zu, dpt_outputs=%zu\n", + ctx->lingbot_onnx->vision_input_names.size(), ctx->lingbot_onnx->vision_output_names.size(), + ctx->lingbot_onnx->depth_input_names.size(), ctx->lingbot_onnx->depth_output_names.size()); + return ctx.release(); + } + try { ctx->smt_vision = smt_vision_context::create(config_dir, warmup); ctx->hidden_size = (int32_t) ctx->smt_vision->hidden_size(); @@ -472,6 +1028,282 @@ bool server_smt_vision_supports_audio(const server_smt_vision_context * ctx) { ; } +bool server_smt_vision_supports_prompt_embeddings(const server_smt_vision_context * ctx) { + return ctx != nullptr +#if defined(LLAMA_SERVER_SMT_VISION) + && (ctx->smt_vision != nullptr || ctx->smt_audio != nullptr) +#endif + ; +} + +bool server_smt_vision_is_lingbot_map(const server_smt_vision_context * ctx) { + return ctx != nullptr +#if 
defined(LLAMA_SERVER_SMT_VISION) + && ctx->lingbot_map != nullptr +#endif + ; +} + +server_smt_lingbot_map_reconstruct_result server_smt_vision_lingbot_map_reconstruct( + server_smt_vision_context * ctx, + const std::vector> & images, + const server_smt_lingbot_map_reconstruct_options & options) { +#if defined(LLAMA_SERVER_SMT_VISION) + if (ctx == nullptr || ctx->lingbot_map == nullptr) { + throw std::runtime_error("SMT context does not contain a LingBot-MAP model"); + } + if (images.empty()) { + throw std::invalid_argument("LingBot-MAP reconstruction requires at least one image"); + } + if (options.max_frames > 0 && (int32_t) images.size() > options.max_frames) { + throw std::invalid_argument("LingBot-MAP reconstruction request exceeds max_frames"); + } + + std::lock_guard lock(ctx->mu); + lingbot_log_rss("request_start"); + + const auto & cfg = ctx->lingbot_map->config(); + if (ctx->lingbot_onnx == nullptr) { + throw std::runtime_error("LingBot-MAP ONNX sessions are not loaded"); + } + + const int32_t input_w = ctx->lingbot_onnx->vision_input_w > 0 ? ctx->lingbot_onnx->vision_input_w : cfg.image_size; + const int32_t input_h = ctx->lingbot_onnx->vision_input_h > 0 ? 
ctx->lingbot_onnx->vision_input_h : cfg.image_size; + + auto preproc = smt_lingbot_map_preprocess_images(images, input_w, input_h, cfg.patch_size, + cfg.image_mean, cfg.image_std); + lingbot_log_rss("after_preprocess"); + + std::vector input_shape = { 1, (int64_t) images.size(), 3, input_h, input_w }; + auto memory_info = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); + Ort::Value input_tensor = Ort::Value::CreateTensor(memory_info, preproc.tensor_nchw.data(), + preproc.tensor_nchw.size(), + input_shape.data(), input_shape.size()); + + auto stage_start = std::chrono::steady_clock::now(); + auto vit_outputs = ctx->lingbot_onnx->vision_session.Run( + Ort::RunOptions{ nullptr }, + ctx->lingbot_onnx->vision_input_names_raw.data(), + &input_tensor, + 1, + ctx->lingbot_onnx->vision_output_names_raw.data(), + ctx->lingbot_onnx->vision_output_names_raw.size()); + std::cerr << "[LingBot-MAP][time] vit_onnx_ms=" << lingbot_elapsed_ms(stage_start) << "\n"; + lingbot_log_rss("after_vit_onnx"); + + if (vit_outputs.empty() || !vit_outputs[0].IsTensor()) { + throw std::runtime_error("LingBot-MAP ViT ONNX did not return a tensor output"); + } + + auto tensor_info = vit_outputs[0].GetTensorTypeAndShapeInfo(); + std::vector vision_output_shape = tensor_info.GetShape(); + const int64_t vision_output_float_count = (int64_t) tensor_info.GetElementCount(); + if (vision_output_shape.size() != 3) { + throw std::runtime_error("LingBot-MAP ViT output must be rank-3 [frames, tokens, hidden]"); + } + if (vision_output_shape[0] != (int64_t) images.size()) { + throw std::runtime_error("LingBot-MAP ViT output frame count does not match input image count"); + } + if (vision_output_shape[2] != cfg.hidden_size) { + throw std::runtime_error("LingBot-MAP ViT output hidden size does not match config hidden_size"); + } + if (vision_output_shape[1] <= 0) { + throw std::runtime_error("LingBot-MAP ViT output token count must be positive"); + } + + const 
float * vit_output_data = vit_outputs[0].GetTensorData(); + const auto aggregator_input = ctx->lingbot_map->build_aggregator_input( + vit_output_data, + (int32_t) vision_output_shape[0], + (int32_t) vision_output_shape[1], + (int32_t) vision_output_shape[2], + input_h, + input_w, + /* num_frame_for_scale */ 1); + lingbot_log_rss("after_aggregator_input"); + stage_start = std::chrono::steady_clock::now(); + const auto runtime = ctx->lingbot_map->run_aggregator_camera_head(aggregator_input, /* prefer_smt */ true); + std::cerr << "[LingBot-MAP][time] aggregator_camera_total_ms=" << lingbot_elapsed_ms(stage_start) << "\n"; + lingbot_log_rss("after_aggregator_camera_ggml"); + + if (ctx->lingbot_onnx->depth_input_names.size() != runtime.selected_output_shapes.size() || + runtime.selected_output_shapes.size() != runtime.selected_outputs.size()) { + throw std::runtime_error("LingBot-MAP DPT input count does not match aggregator runtime selected output count"); + } + + std::vector> depth_input_shapes; + depth_input_shapes.reserve(ctx->lingbot_onnx->depth_input_names.size()); + int64_t depth_input_float_count = 0; + for (size_t i = 0; i < ctx->lingbot_onnx->depth_input_names.size(); ++i) { + const auto & selected_shape = runtime.selected_output_shapes[i]; + if (selected_shape.size() != 4 || selected_shape[0] != cfg.camera_hidden_size || + selected_shape[1] != aggregator_input.tokens_per_frame || selected_shape[2] != aggregator_input.n_frames) { + throw std::runtime_error("LingBot-MAP aggregator selected output shape is not compatible with DPT input"); + } + const auto input_shape = lingbot_make_depth_input_shape(ctx->lingbot_onnx->depth_input_shapes[i], + aggregator_input.n_frames, + aggregator_input.tokens_per_frame, + cfg.camera_hidden_size); + depth_input_float_count += lingbot_numel(input_shape); + depth_input_shapes.push_back(input_shape); + } + + std::vector depth_input_storage((size_t) depth_input_float_count, 0.0f); + std::vector depth_input_tensors; + 
depth_input_tensors.reserve(depth_input_shapes.size()); + size_t depth_input_offset = 0; + for (size_t i = 0; i < depth_input_shapes.size(); ++i) { + const auto & shape = depth_input_shapes[i]; + const int64_t n_elem = lingbot_numel(shape); + if ((size_t) n_elem != runtime.selected_outputs[i].size()) { + throw std::runtime_error("LingBot-MAP runtime selected output size does not match DPT input shape"); + } + std::copy(runtime.selected_outputs[i].begin(), runtime.selected_outputs[i].end(), + depth_input_storage.begin() + (std::ptrdiff_t) depth_input_offset); + depth_input_tensors.push_back(Ort::Value::CreateTensor(memory_info, + depth_input_storage.data() + depth_input_offset, + (size_t) n_elem, + shape.data(), shape.size())); + depth_input_offset += (size_t) n_elem; + } + lingbot_log_rss("after_dpt_input_pack"); + + stage_start = std::chrono::steady_clock::now(); + auto depth_outputs = ctx->lingbot_onnx->depth_session.Run( + Ort::RunOptions{ nullptr }, + ctx->lingbot_onnx->depth_input_names_raw.data(), + depth_input_tensors.data(), + depth_input_tensors.size(), + ctx->lingbot_onnx->depth_output_names_raw.data(), + ctx->lingbot_onnx->depth_output_names_raw.size()); + std::cerr << "[LingBot-MAP][time] dpt_onnx_ms=" << lingbot_elapsed_ms(stage_start) << "\n"; + lingbot_log_rss("after_dpt_onnx"); + lingbot_validate_depth_outputs(depth_outputs); + + std::vector> depth_output_shapes; + std::vector depth_output_float_counts; + depth_output_shapes.reserve(depth_outputs.size()); + depth_output_float_counts.reserve(depth_outputs.size()); + for (const auto & output : depth_outputs) { + auto output_info = output.GetTensorTypeAndShapeInfo(); + depth_output_shapes.push_back(output_info.GetShape()); + depth_output_float_counts.push_back((int64_t) output_info.GetElementCount()); + } + + if (runtime.pose_encoding.size() != (size_t) aggregator_input.n_frames * 9) { + throw std::runtime_error("LingBot-MAP runtime pose output shape does not match frame count"); + } + + const bool 
save_point_cloud = options.output_point_cloud && cfg.output_point_cloud; + const auto world_points_paths = save_point_cloud ? lingbot_make_world_points_paths(ctx->config_dir) : std::pair{}; + const auto postprocess = lingbot_postprocess_reconstruction( + runtime.pose_encoding.data(), + "camera_head_ggml_runtime", + depth_outputs[0].GetTensorData(), + depth_outputs[1].GetTensorData(), + depth_output_shapes[0], + depth_output_shapes[1], + aggregator_input.n_frames, + world_points_paths.first, + world_points_paths.second); + lingbot_log_rss("after_postprocess"); + + server_smt_lingbot_map_reconstruct_result result; + result.architecture = ctx->lingbot_map->architecture(); + result.message = "LingBot-MAP ViT ONNX inference completed; aggregator/camera_head GGML runtime ran on SMT; DPT ONNX ran; postprocess completed"; + result.stages = { + "config_loaded", + "images_preprocessed", + "vit_onnx_ran", + "aggregator_input_prepared", + "aggregator_camera_head_ggml_runtime_ran", + "depth_onnx_ran", + "postprocess_completed", + }; + if (!postprocess.world_points_path.empty()) { + result.stages.push_back("point_cloud_bin_saved"); + } + result.tensor_count = ctx->lingbot_map->tensor_count(); + result.n_images = (int32_t) images.size(); + result.image_size = cfg.image_size; + result.patch_size = cfg.patch_size; + result.hidden_size = cfg.hidden_size; + result.camera_hidden_size = cfg.camera_hidden_size; + result.preprocess_width = preproc.target_w; + result.preprocess_height = preproc.target_h; + result.vision_input_float_count = (int64_t) preproc.tensor_nchw.size(); + result.vision_output_float_count = vision_output_float_count; + result.vision_output_frames = (int32_t) vision_output_shape[0]; + result.vision_output_tokens = (int32_t) vision_output_shape[1]; + result.vision_output_hidden = (int32_t) vision_output_shape[2]; + result.aggregator_tokens_per_frame = aggregator_input.tokens_per_frame; + result.aggregator_patch_start_idx = aggregator_input.patch_start_idx; + 
result.aggregator_patch_tokens = aggregator_input.patch_tokens; + result.aggregator_vit_prefix_tokens = aggregator_input.vit_prefix_tokens; + result.aggregator_graph_nodes = runtime.graph_nodes; + result.aggregator_graph_selected_outputs = runtime.selected_output_count; + result.aggregator_graph_frame_blocks = runtime.frame_block_count; + result.aggregator_graph_global_blocks = runtime.global_block_count; + result.aggregator_graph_tokens_per_frame = runtime.tokens_per_frame; + result.aggregator_graph_patch_start_idx = runtime.patch_start_idx; + result.aggregator_graph_selected_output_shapes = runtime.selected_output_shapes; + result.aggregator_selected_layers = runtime.selected_layers; + result.camera_head_graph_nodes = runtime.graph_nodes; + result.camera_head_trunk_blocks = runtime.camera_trunk_block_count; + result.camera_head_iterations = runtime.camera_iteration_count; + result.camera_head_pose_dim = runtime.camera_pose_dim; + result.camera_head_input_shape = runtime.camera_head_input_shape; + result.camera_head_final_pose_shape = runtime.camera_head_final_pose_shape; + result.camera_head_iteration_pose_shapes = runtime.camera_head_iteration_pose_shapes; + result.ggml_runtime_graph_nodes = runtime.graph_nodes; + result.ggml_runtime_backend = runtime.backend_name; + result.ggml_runtime_buffer_type = runtime.buffer_type_name; + result.depth_onnx_input_count = (int32_t) ctx->lingbot_onnx->depth_input_names.size(); + result.depth_onnx_output_count = (int32_t) depth_outputs.size(); + result.depth_onnx_input_float_count = depth_input_float_count; + result.depth_input_source = "aggregator_ggml_runtime_selected_outputs"; + result.depth_input_names = ctx->lingbot_onnx->depth_input_names; + result.depth_output_names = ctx->lingbot_onnx->depth_output_names; + result.depth_input_shapes = std::move(depth_input_shapes); + result.depth_output_shapes = std::move(depth_output_shapes); + result.depth_output_float_counts = std::move(depth_output_float_counts); + 
result.pose_output_source = postprocess.pose_source; + result.pose_encoding_shape = postprocess.pose_encoding_shape; + result.extrinsic_shape = postprocess.extrinsic_shape; + result.intrinsic_shape = postprocess.intrinsic_shape; + result.world_points_shape = postprocess.world_points_shape; + result.world_points_conf_shape = postprocess.world_points_conf_shape; + result.pose_encoding_sample = postprocess.pose_encoding_sample; + result.extrinsic_first = postprocess.extrinsic_first; + result.intrinsic_first = postprocess.intrinsic_first; + result.world_points_sample = postprocess.world_points_sample; + result.world_points_path = postprocess.world_points_path; + result.world_points_bytes = postprocess.world_points_bytes; + result.postprocess_point_count = postprocess.point_count; + result.postprocess_sample_count = postprocess.sample_count; + result.depth_min = postprocess.depth_min; + result.depth_max = postprocess.depth_max; + result.depth_mean = postprocess.depth_mean; + result.depth_conf_min = postprocess.depth_conf_min; + result.depth_conf_max = postprocess.depth_conf_max; + result.depth_conf_mean = postprocess.depth_conf_mean; + result.vision_input_shape = std::move(input_shape); + result.vision_output_shape = std::move(vision_output_shape); + result.resized_heights = std::move(preproc.resized_heights); + result.output_pose = options.output_pose && cfg.output_pose; + result.output_depth = options.output_depth && cfg.output_depth; + result.output_point_cloud = options.output_point_cloud && cfg.output_point_cloud; + result.onnx_sessions_loaded = true; + result.inference_ready = true; + return result; +#else + GGML_UNUSED(ctx); + GGML_UNUSED(images); + GGML_UNUSED(options); + throw std::runtime_error("SMT media backend is not compiled"); +#endif +} + server_smt_image_chunk server_smt_vision_encode_image_bin(server_smt_vision_context * ctx, const std::vector & data) { if (ctx == nullptr) { diff --git a/tools/server/server-smt-vision.h 
b/tools/server/server-smt-vision.h index 44081107bdec..b02b7af0f56b 100644 --- a/tools/server/server-smt-vision.h +++ b/tools/server/server-smt-vision.h @@ -27,7 +27,111 @@ struct server_smt_image_chunk { struct server_smt_vision_context; +struct server_smt_lingbot_map_reconstruct_options { + bool output_pose = true; + bool output_depth = true; + bool output_point_cloud = true; + int32_t max_frames = -1; +}; + +struct server_smt_lingbot_map_reconstruct_result { + std::string architecture; + std::string message; + std::vector stages; + + int64_t tensor_count = 0; + int32_t n_images = 0; + int32_t image_size = 0; + int32_t patch_size = 0; + int32_t hidden_size = 0; + int32_t camera_hidden_size = 0; + int32_t preprocess_width = 0; + int32_t preprocess_height = 0; + int64_t vision_input_float_count = 0; + int64_t vision_output_float_count = 0; + int32_t vision_output_frames = 0; + int32_t vision_output_tokens = 0; + int32_t vision_output_hidden = 0; + int32_t aggregator_tokens_per_frame = 0; + int32_t aggregator_patch_start_idx = 0; + int32_t aggregator_patch_tokens = 0; + int32_t aggregator_vit_prefix_tokens = 0; + int32_t aggregator_probe_graph_nodes = 0; + int32_t aggregator_global_probe_graph_nodes = 0; + int32_t aggregator_global_probe_input_tokens = 0; + int32_t aggregator_full_probe_graph_nodes = 0; + int32_t aggregator_full_probe_selected_outputs = 0; + int32_t aggregator_full_probe_frame_blocks = 0; + int32_t aggregator_full_probe_global_blocks = 0; + int32_t aggregator_graph_nodes = 0; + int32_t aggregator_graph_selected_outputs = 0; + int32_t aggregator_graph_frame_blocks = 0; + int32_t aggregator_graph_global_blocks = 0; + int32_t aggregator_graph_tokens_per_frame = 0; + int32_t aggregator_graph_patch_start_idx = 0; + int32_t camera_head_graph_nodes = 0; + int32_t camera_head_trunk_blocks = 0; + int32_t camera_head_iterations = 0; + int32_t camera_head_pose_dim = 0; + int32_t ggml_runtime_graph_nodes = 0; + int32_t depth_onnx_input_count = 0; + int32_t 
depth_onnx_output_count = 0; + int64_t depth_onnx_input_float_count = 0; + int64_t postprocess_point_count = 0; + int64_t world_points_bytes = 0; + int32_t postprocess_sample_count = 0; + double depth_min = 0.0; + double depth_max = 0.0; + double depth_mean = 0.0; + double depth_conf_min = 0.0; + double depth_conf_max = 0.0; + double depth_conf_mean = 0.0; + std::string depth_input_source; + std::string pose_output_source; + std::string ggml_runtime_backend; + std::string ggml_runtime_buffer_type; + std::string world_points_path; + std::vector aggregator_probe_qkv_shape; + std::vector aggregator_probe_output_shape; + std::vector aggregator_global_probe_qkv_shape; + std::vector aggregator_global_probe_output_shape; + std::vector aggregator_full_probe_final_frame_shape; + std::vector aggregator_full_probe_final_global_shape; + std::vector aggregator_graph_final_frame_shape; + std::vector aggregator_graph_final_global_shape; + std::vector> aggregator_graph_selected_output_shapes; + std::vector aggregator_selected_layers; + std::vector camera_head_input_shape; + std::vector camera_head_final_pose_shape; + std::vector> camera_head_iteration_pose_shapes; + std::vector depth_input_names; + std::vector depth_output_names; + std::vector> depth_input_shapes; + std::vector> depth_output_shapes; + std::vector depth_output_float_counts; + std::vector pose_encoding_shape; + std::vector extrinsic_shape; + std::vector intrinsic_shape; + std::vector world_points_shape; + std::vector world_points_conf_shape; + std::vector pose_encoding_sample; + std::vector extrinsic_first; + std::vector intrinsic_first; + std::vector world_points_sample; + std::vector vision_input_shape; + std::vector vision_output_shape; + std::vector resized_heights; + + bool output_pose = true; + bool output_depth = true; + bool output_point_cloud = true; + bool onnx_sessions_loaded = false; + bool inference_ready = false; +}; + #if defined(LLAMA_SERVER_SMT_VISION) +bool 
server_smt_vision_config_is_lingbot_map(const std::string & config_dir); + server_smt_vision_context * server_smt_vision_init( llama_context * lctx, const std::string & config_dir, @@ -37,6 +141,13 @@ void server_smt_vision_free(server_smt_vision_context * ctx); bool server_smt_vision_supports_image(const server_smt_vision_context * ctx); bool server_smt_vision_supports_audio(const server_smt_vision_context * ctx); +bool server_smt_vision_supports_prompt_embeddings(const server_smt_vision_context * ctx); +bool server_smt_vision_is_lingbot_map(const server_smt_vision_context * ctx); + +server_smt_lingbot_map_reconstruct_result server_smt_vision_lingbot_map_reconstruct( + server_smt_vision_context * ctx, + const std::vector> & images, + const server_smt_lingbot_map_reconstruct_options & options); server_smt_image_chunk server_smt_vision_encode_media_bin( server_smt_vision_context * ctx, @@ -55,6 +166,10 @@ int32_t server_smt_vision_decode_chunk( int32_t n_batch, bool logits_last); #else +inline bool server_smt_vision_config_is_lingbot_map(const std::string & /* config_dir */) { + return false; +} + inline server_smt_vision_context * server_smt_vision_init( llama_context * /* lctx */, const std::string & /* config_dir */, @@ -73,6 +188,21 @@ inline bool server_smt_vision_supports_audio(const server_smt_vision_context * / return false; } +inline bool server_smt_vision_supports_prompt_embeddings(const server_smt_vision_context * /* ctx */) { + return false; +} + +inline bool server_smt_vision_is_lingbot_map(const server_smt_vision_context * /* ctx */) { + return false; +} + +inline server_smt_lingbot_map_reconstruct_result server_smt_vision_lingbot_map_reconstruct( + server_smt_vision_context * /* ctx */, + const std::vector> & /* images */, + const server_smt_lingbot_map_reconstruct_options & /* options */) { + throw std::runtime_error("SMT media backend is not compiled"); +} + inline server_smt_image_chunk server_smt_vision_encode_media_bin( server_smt_vision_context 
* /* ctx */, const std::vector & /* data */) { diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 769e80a802f3..71d9efaa49d7 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3,6 +3,7 @@ #include "server-models.h" #include "server-cors-proxy.h" #include "server-tools.h" +#include "server-smt-vision.h" #include "arg.h" #include "build-info.h" @@ -89,9 +90,18 @@ int llama_server(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - // router server never loads a model and must not touch the GPU - // skip device enumeration so the CUDA primary context stays uncreated - const bool is_router_server = params.model.path.empty(); +#if defined(LLAMA_SERVER_SMT_VISION) + const bool is_lingbot_map_reconstruct_server = + (params.media_backend == "smt" || params.media_backend == "auto") && + server_smt_vision_config_is_lingbot_map(params.smt_config_dir); +#else + const bool is_lingbot_map_reconstruct_server = false; +#endif + + // router server never loads a model and must not touch the GPU. A LingBot-MAP + // reconstruction server is model-less from llama's text-LLM perspective, but still + // needs local SMT initialization instead of router proxying. 
+ const bool is_router_server = params.model.path.empty() && !is_lingbot_map_reconstruct_server; common_params_print_info(params, !is_router_server); // validate batch size for embeddings @@ -161,6 +171,7 @@ int llama_server(int argc, char ** argv) { routes.post_tokenize = models_routes->proxy_post; routes.post_detokenize = models_routes->proxy_post; routes.post_apply_template = models_routes->proxy_post; + routes.post_reconstruct = models_routes->proxy_post; routes.get_lora_adapters = models_routes->proxy_get; routes.post_lora_adapters = models_routes->proxy_post; routes.get_slots = models_routes->proxy_get; @@ -204,6 +215,8 @@ int llama_server(int argc, char ** argv) { ctx_http.post("/tokenize", ex_wrapper(routes.post_tokenize)); ctx_http.post("/detokenize", ex_wrapper(routes.post_detokenize)); ctx_http.post("/apply-template", ex_wrapper(routes.post_apply_template)); + ctx_http.post("/reconstruct", ex_wrapper(routes.post_reconstruct)); + ctx_http.post("/v1/reconstruct", ex_wrapper(routes.post_reconstruct)); // LoRA adapters hotswap ctx_http.get ("/lora-adapters", ex_wrapper(routes.get_lora_adapters)); ctx_http.post("/lora-adapters", ex_wrapper(routes.post_lora_adapters)); From 1ddff89fd44db98e04753cc2bc39acf2e7f067cd Mon Sep 17 00:00:00 2001 From: co-seven Date: Mon, 8 Jun 2026 08:02:36 +0000 Subject: [PATCH 2/4] ci: shorten SpacemiT MTMD release tags --- .github/variables.env | 2 +- .github/workflows/build-spacemit-mtmd.yml | 22 ++++++++++++++-------- VERSION_NUMBER | 2 +- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/.github/variables.env b/.github/variables.env index 18bf7bf5b9ce..405225b1947e 100644 --- a/.github/variables.env +++ b/.github/variables.env @@ -1,6 +1,6 @@ SPACEMIT_TOOLCHAIN_URL=https://github.com/spacemit-com/toolchain/releases/download/v1.1.2/spacemit-toolchain-linux-glibc-x86_64-v1.1.2.tar.xz SPACEMIT_ORT_URL=https://github.com/spacemit-com/onnxruntime/releases/download/2.0.2/spacemit-ort.riscv64.2.0.2.tar.gz 
-SPACEMIT_MTMD_RELEASE_TAG=spacemit-llama.cpp.riscv64 +SPACEMIT_MTMD_PACKAGE_PREFIX=spacemit-llama.cpp.riscv64 SPACEMIT_TOOLCHAIN_ARCHIVE=.cache/spacemit-toolchain.tar.xz SPACEMIT_ORT_ARCHIVE=.cache/spacemit-ort.tar.gz SPACEMIT_TOOLCHAIN_DIR=spacemit_toolchain diff --git a/.github/workflows/build-spacemit-mtmd.yml b/.github/workflows/build-spacemit-mtmd.yml index 57bce4b3206c..621cfcfb3f76 100644 --- a/.github/workflows/build-spacemit-mtmd.yml +++ b/.github/workflows/build-spacemit-mtmd.yml @@ -42,12 +42,14 @@ jobs: exit 1 fi - SPACEMIT_MTMD_RELEASE_TAG="${SPACEMIT_MTMD_RELEASE_TAG}.${VERSION_NUMBER}" + SPACEMIT_MTMD_RELEASE_TAG="v${VERSION_NUMBER}" + SPACEMIT_MTMD_PACKAGE_NAME="${SPACEMIT_MTMD_PACKAGE_PREFIX}.${VERSION_NUMBER}" { echo "SPACEMIT_TOOLCHAIN_URL=${SPACEMIT_TOOLCHAIN_URL}" echo "SPACEMIT_ORT_URL=${SPACEMIT_ORT_URL}" echo "SPACEMIT_MTMD_RELEASE_TAG=${SPACEMIT_MTMD_RELEASE_TAG}" + echo "SPACEMIT_MTMD_PACKAGE_NAME=${SPACEMIT_MTMD_PACKAGE_NAME}" echo "SPACEMIT_TOOLCHAIN_ARCHIVE=${SPACEMIT_TOOLCHAIN_ARCHIVE}" echo "SPACEMIT_ORT_ARCHIVE=${SPACEMIT_ORT_ARCHIVE}" echo "SPACEMIT_TOOLCHAIN_DIR=${SPACEMIT_TOOLCHAIN_DIR}" @@ -59,7 +61,9 @@ jobs: { echo "toolchain_url=${SPACEMIT_TOOLCHAIN_URL}" echo "ort_url=${SPACEMIT_ORT_URL}" + echo "version_number=${VERSION_NUMBER}" echo "release_tag=${SPACEMIT_MTMD_RELEASE_TAG}" + echo "package_name=${SPACEMIT_MTMD_PACKAGE_NAME}" echo "toolchain_archive=${SPACEMIT_TOOLCHAIN_ARCHIVE}" echo "ort_archive=${SPACEMIT_ORT_ARCHIVE}" echo "toolchain_dir=${SPACEMIT_TOOLCHAIN_DIR}" @@ -228,8 +232,8 @@ jobs: shell: bash run: | set -euo pipefail - PACKAGE_DIR="release/${SPACEMIT_MTMD_RELEASE_TAG}" - ASSET_NAME="${SPACEMIT_MTMD_RELEASE_TAG}.tar.gz" + PACKAGE_DIR="release/${SPACEMIT_MTMD_PACKAGE_NAME}" + ASSET_NAME="${SPACEMIT_MTMD_PACKAGE_NAME}.tar.gz" rm -rf "$PACKAGE_DIR" mkdir -p "$PACKAGE_DIR" @@ -240,7 +244,7 @@ jobs: find "$PACKAGE_DIR/bin" -maxdepth 1 \( -type f -o -type l \) \( -name 'test*' -o -name 'export-graph-ops*' \) -exec rm 
-f {} + fi - tar -czf "release/${ASSET_NAME}" -C release "${SPACEMIT_MTMD_RELEASE_TAG}" + tar -czf "release/${ASSET_NAME}" -C release "${SPACEMIT_MTMD_PACKAGE_NAME}" - name: Inspect package if: ${{ github.event_name == 'pull_request' || (github.event_name == 'push' && steps.release_guard.outputs.should_publish == 'true') }} @@ -248,9 +252,9 @@ jobs: run: | set -euo pipefail - ASSET_NAME="${SPACEMIT_MTMD_RELEASE_TAG}.tar.gz" + ASSET_NAME="${SPACEMIT_MTMD_PACKAGE_NAME}.tar.gz" echo "Package tree:" - find "release/${SPACEMIT_MTMD_RELEASE_TAG}" -maxdepth 2 -print | sort + find "release/${SPACEMIT_MTMD_PACKAGE_NAME}" -maxdepth 2 -print | sort echo "Package archive:" tar -tzf "release/${ASSET_NAME}" @@ -259,7 +263,7 @@ jobs: uses: actions/upload-artifact@v6 with: name: spacemit-mtmd-package - path: release/${{ steps.vars.outputs.release_tag }}.tar.gz + path: release/${{ steps.vars.outputs.package_name }}.tar.gz if-no-files-found: error retention-days: 7 @@ -271,9 +275,11 @@ jobs: tag_name: ${{ steps.vars.outputs.release_tag }} name: ${{ steps.vars.outputs.release_tag }} target_commitish: ${{ github.sha }} - files: release/${{ steps.vars.outputs.release_tag }}.tar.gz + files: release/${{ steps.vars.outputs.package_name }}.tar.gz body: | SpacemiT MTMD build for `spacemit-mtmd`. 
+ Version: `${{ steps.vars.outputs.version_number }}` + Package: `${{ steps.vars.outputs.package_name }}.tar.gz` Commit: `${{ github.sha }}` make_latest: false overwrite_files: true diff --git a/VERSION_NUMBER b/VERSION_NUMBER index d917d3e26adc..b1e80bb2480a 100644 --- a/VERSION_NUMBER +++ b/VERSION_NUMBER @@ -1 +1 @@ -0.1.2 +0.1.3 From 7bf649e0f00dcb97dcb835ac50bff58dc4d4e9a1 Mon Sep 17 00:00:00 2001 From: co-seven Date: Mon, 8 Jun 2026 08:56:56 +0000 Subject: [PATCH 3/4] ci: comment merged PR commit summary --- .../workflows/merge-pr-summary-comment.yml | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 .github/workflows/merge-pr-summary-comment.yml diff --git a/.github/workflows/merge-pr-summary-comment.yml b/.github/workflows/merge-pr-summary-comment.yml new file mode 100644 index 000000000000..dd78314aecfd --- /dev/null +++ b/.github/workflows/merge-pr-summary-comment.yml @@ -0,0 +1,87 @@ +name: Comment merged PR summary + +on: + pull_request: + branches: + - spacemit-mtmd + types: + - closed + +permissions: + contents: read + pull-requests: read + issues: write + +jobs: + comment: + name: Comment merged PR summary + if: ${{ github.event.pull_request.merged == true }} + runs-on: ubuntu-24.04 + + steps: + - name: Comment commit summary + uses: actions/github-script@v8 + with: + script: | + const pr = context.payload.pull_request; + const commits = await github.paginate(github.rest.pulls.listCommits, { + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: pr.number, + per_page: 100, + }); + + const marker = ''; + const commitLines = commits.map((commit, index) => { + const shortSha = commit.sha.substring(0, 7); + const title = commit.commit.message.split('\n')[0]; + return `${index + 1}. \`${shortSha}\` ${title}`; + }); + + const mergedBy = pr.merged_by ? 
pr.merged_by.login : context.actor; + const body = [ + marker, + `### Merge PR Summary`, + ``, + `Merged PR #${pr.number}: ${pr.title}`, + ``, + `- Target branch: \`${pr.base.ref}\``, + `- Source branch: \`${pr.head.label}\``, + `- Merged by: @${mergedBy}`, + `- Merge commit: ${pr.merge_commit_sha ? `\`${pr.merge_commit_sha}\`` : 'unknown'}`, + `- Commit count: **${commits.length}**`, + ``, + `Commits:`, + ...commitLines, + ``, + `PR: ${pr.html_url}`, + ].join('\n'); + + const comments = await github.paginate(github.rest.issues.listComments, { + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: pr.number, + per_page: 100, + }); + const existing = comments.find(comment => comment.body && comment.body.includes(marker)); + + if (existing) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: existing.id, + body, + }); + } else { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: pr.number, + body, + }); + } + + await core.summary + .addHeading(`Merge PR #${pr.number} summary`, 2) + .addCodeBlock(body.replace(marker + '\n', ''), 'text') + .write(); From 848725832d76774075d21213d8e9f0c73235ea09 Mon Sep 17 00:00:00 2001 From: co-seven Date: Mon, 8 Jun 2026 09:50:44 +0000 Subject: [PATCH 4/4] ci: test bugfix --- src/llama-model-saver.cpp | 1 + src/models/lingbot-map.cpp | 4 ++-- src/models/models.h | 4 ++-- tests/test-llama-archs.cpp | 4 ++-- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index 528e4c9c069f..572ca768fa0f 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -14,6 +14,7 @@ bool llama_model_saver_supports_arch(llm_arch arch) { switch (arch) { + case LLM_ARCH_LINGBOT_MAP: case LLM_ARCH_QWEN3NEXT: case LLM_ARCH_QWEN35: case LLM_ARCH_QWEN35MOE: diff --git a/src/models/lingbot-map.cpp b/src/models/lingbot-map.cpp index 
877402d7ee4a..2934fd8b3cd5 100644 --- a/src/models/lingbot-map.cpp +++ b/src/models/lingbot-map.cpp @@ -59,10 +59,10 @@ void llama_model_lingbot_map::load_arch_hparams(llama_model_loader & ml) { hparams.n_rot_swa = 0; } -void llama_model_lingbot_map::load_arch_tensors(llama_model_loader &) { +[[noreturn]] void llama_model_lingbot_map::load_arch_tensors(llama_model_loader &) { throw std::runtime_error("LingBot-MAP GGUF tensors are loaded by the mtmd SMT wrapper, not llama_model"); } -std::unique_ptr llama_model_lingbot_map::build_arch_graph(const llm_graph_params &) const { +[[noreturn]] std::unique_ptr llama_model_lingbot_map::build_arch_graph(const llm_graph_params &) const { throw std::runtime_error("LingBot-MAP does not support llama_model text graph execution"); } diff --git a/src/models/models.h b/src/models/models.h index 47c099a76d9a..4609a0d063ce 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -176,9 +176,9 @@ struct llama_model_llama_embed : public llama_model_llama { struct llama_model_lingbot_map : public llama_model_base { llama_model_lingbot_map(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; - void load_arch_tensors(llama_model_loader & ml) override; + [[noreturn]] void load_arch_tensors(llama_model_loader & ml) override; - std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; + [[noreturn]] std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index 1def7faff605..120e635ec141 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -382,8 +382,8 @@ static bool moe_implemented(const llm_arch arch) { } static bool arch_supported(const llm_arch arch) { - if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) { - return false; // These models don't have usable implementations. 
+ if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_LINGBOT_MAP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) { + return false; // These models don't have usable llama_model text implementations. } if (arch == LLM_ARCH_CHAMELEON) { return false; // Only half-implemented and to be removed in the future.