From 1fe6d496a80d4bce8945df65b4987762deb66865 Mon Sep 17 00:00:00 2001
From: co-seven <linxi.cai@spacemit.com>
Date: Mon, 8 Jun 2026 07:44:07 +0000
Subject: [PATCH 1/4] server: add LingBot-MAP SMT reconstruction pipeline

Add LingBot-MAP as an SMT vision model implementation, including GGUF
metadata/model registration, quantization support, aggregator/camera_head
GGML runtime, ViT/DPT ONNX integration, reconstruction postprocess, and
/reconstruct server routing.

---
 convert_hf_to_gguf.py                |  205 ++-
 src/llama-arch.cpp                   |    1 +
 src/llama-arch.h                     |    1 +
 src/llama-model.cpp                  |   11 +
 src/llama-quant.cpp                  |    6 +
 src/models/lingbot-map.cpp           |   68 +
 src/models/models.h                  |    9 +
 tools/mtmd/CMakeLists.txt            |    5 +
 tools/mtmd/lingbot-map-wrapper.cpp   | 1893 ++++++++++++++++++++++++++
 tools/mtmd/lingbot-map-wrapper.h     |  172 +++
 tools/mtmd/smt-vision-preprocess.cpp |  270 ++++
 tools/mtmd/smt-vision-preprocess.h   |   16 +
 tools/server/server-common.cpp       |    4 +-
 tools/server/server-context.cpp      |  287 +++-
 tools/server/server-context.h        |    2 +
 tools/server/server-smt-vision.cpp   |  836 +++++++++++-
 tools/server/server-smt-vision.h     |  130 ++
 tools/server/server.cpp              |   19 +-
 18 files changed, 3894 insertions(+), 41 deletions(-)
 create mode 100644 src/models/lingbot-map.cpp
 create mode 100644 tools/mtmd/lingbot-map-wrapper.cpp
 create mode 100644 tools/mtmd/lingbot-map-wrapper.h

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 85527553563d..e39f95d1f116 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -7,8 +7,11 @@
 import logging
 import os
 import sys
+from collections import OrderedDict
 from pathlib import Path
+from typing import Iterable
 
+import numpy as np
 import torch
 
 if 'NO_LOCAL_GGUF' not in os.environ:
@@ -27,6 +30,184 @@
 )
 
 
+LINGBOT_MAP_ARCH = "lingbot-map"  # architecture string given to GGUFWriter; also the prefix of every metadata key
+LINGBOT_MAP_DEFAULT_CHECKPOINT = Path("/home/cailinxi/modelzoo/lingbot-map/hf_model/lingbot-map.pt")  # NOTE(review): developer-local absolute path, used when --checkpoint is omitted
+LINGBOT_MAP_DEFAULT_OUTFILE = Path("/home/cailinxi/modelzoo/lingbot-map/mtmd_model/lingbot-map-agg-camera-f32.gguf")  # NOTE(review): developer-local absolute path, used when --outfile is omitted
+
+
+def unwrap_lingbot_map_state_dict(obj: object) -> OrderedDict[str, torch.Tensor]:  # Return the checkpoint's tensor entries as an ordered state dict of detached CPU tensors.
+    if isinstance(obj, dict) and "model" in obj and isinstance(obj["model"], dict):  # the state dict may be nested under a "model" key
+        obj = obj["model"]
+    if not isinstance(obj, dict):
+        raise TypeError(f"checkpoint must contain a state dict, got {type(obj)!r}")
+
+    state = OrderedDict()
+    for key, value in obj.items():
+        if isinstance(value, torch.Tensor):  # entries that are not tensors are dropped silently
+            state[str(key)] = value.detach().cpu()
+    if not state:  # raises ValueError when no tensor survived the filter
+        raise ValueError("checkpoint does not contain tensor entries")
+    return state
+
+
+def lingbot_map_selected_tensor_names(  # List the tensor names to export, in the iteration order of `state`.
+        state: OrderedDict[str, torch.Tensor],
+        include_patch_embed: bool,
+        include_depth_head: bool) -> list[str]:
+    names: list[str] = []
+    for name in state:
+        if name.startswith("aggregator."):  # aggregator tensors are always exported ...
+            if not include_patch_embed and name.startswith("aggregator.patch_embed."):  # ... except patch_embed, which is opt-in
+                continue
+            names.append(name)
+        elif name.startswith("camera_head."):  # camera head tensors are always exported
+            names.append(name)
+        elif include_depth_head and name.startswith("depth_head."):  # depth head tensors are opt-in
+            names.append(name)
+    return names  # names with any other prefix are never selected
+
+
+def lingbot_map_count_indexed_modules(names: Iterable[str], prefix: str) -> int:  # Return (highest numeric index found under "<prefix>.") + 1, or 0 when there is none.
+    indices: set[int] = set()
+    needle = prefix + "."  # the trailing dot keeps "foo" from matching "foobar.0"
+    for name in names:
+        if not name.startswith(needle):
+            continue
+        rest = name[len(needle):]
+        first = rest.split(".", 1)[0]  # path component immediately after the prefix
+        if first.isdigit():
+            indices.add(int(first))
+    return max(indices) + 1 if indices else 0  # max-based, so gaps in the numbering are not detected
+
+
+def lingbot_map_infer_metadata(  # Build the GGUF metadata dict from tensor shapes, tensor names and fixed constants.
+        state: OrderedDict[str, torch.Tensor],
+        selected: list[str],
+        include_patch_embed: bool,
+        include_depth_head: bool) -> dict[str, object]:
+    camera_token = state.get("aggregator.camera_token")
+    if camera_token is None:  # the only tensor this function treats as mandatory
+        raise KeyError("missing required tensor: aggregator.camera_token")
+
+    embed_dim = int(camera_token.shape[-1])
+    num_camera_token_variants = int(camera_token.shape[1])  # assumes camera_token has at least 2 dims - TODO confirm layout
+    num_register_tokens = int(state["aggregator.register_token"].shape[2]) if "aggregator.register_token" in state else 0  # assumes register_token has at least 3 dims - TODO confirm layout
+    has_scale_token = "aggregator.scale_token" in state
+    num_special_tokens = 1 + num_register_tokens + (1 if has_scale_token else 0)  # 1 + register tokens + optional scale token
+    frame_blocks = lingbot_map_count_indexed_modules(selected, "aggregator.frame_blocks")  # block counts come from `selected`, not from the full state dict
+    global_blocks = lingbot_map_count_indexed_modules(selected, "aggregator.global_blocks")
+    camera_blocks = lingbot_map_count_indexed_modules(selected, "camera_head.trunk")
+
+    patch_proj = state.get("aggregator.patch_embed.patch_embed.proj.weight")
+    patch_size = int(patch_proj.shape[-1]) if patch_proj is not None else 14  # falls back to 14 when the projection weight is absent
+
+    camera_qkv = state.get("camera_head.trunk.0.attn.qkv.weight")
+    camera_dim = int(camera_qkv.shape[1]) if camera_qkv is not None else embed_dim * 2  # falls back to 2 * embed_dim
+    camera_pose_dim = int(state["camera_head.empty_pose_tokens"].shape[-1]) if "camera_head.empty_pose_tokens" in state else 9  # falls back to 9
+
+    return {
+        "schema_version": 1,
+        "component": "aggregator_camera_head",  # written regardless of the include_* flags
+        "includes_patch_embed": bool(include_patch_embed),
+        "includes_depth_head": bool(include_depth_head),
+        "embed_dim": embed_dim,
+        "camera_dim": camera_dim,
+        "camera_pose_dim": camera_pose_dim,
+        "patch_size": patch_size,
+        "num_register_tokens": num_register_tokens,
+        "num_special_tokens": num_special_tokens,
+        "num_camera_token_variants": num_camera_token_variants,
+        "has_scale_token": has_scale_token,
+        "aggregator_frame_block_count": frame_blocks,
+        "aggregator_global_block_count": global_blocks,
+        "camera_trunk_block_count": camera_blocks,
+        "aa_order": ["frame", "global"],  # this and the entries below are fixed constants, not read from the checkpoint
+        "aa_block_size": 1,
+        "rope_freq": 100.0,
+        "resnet_mean": [0.485, 0.456, 0.406],
+        "resnet_std": [0.229, 0.224, 0.225],
+    }
+
+
+def lingbot_map_add_metadata(writer: gguf.GGUFWriter, meta: dict[str, object], outtype: str) -> None:  # Write general GGUF fields, then every `meta` entry under the "lingbot-map." prefix.
+    writer.add_name("LingBot-MAP aggregator + camera head")
+    writer.add_type("model")
+    writer.add_description("LingBot-MAP non-LLM GGUF containing aggregator and camera head tensors.")
+    writer.add_file_type(int(gguf.LlamaFileType.MOSTLY_F16 if outtype == "f16" else gguf.LlamaFileType.ALL_F32))  # any outtype other than "f16" is recorded as ALL_F32
+    writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
+
+    for key, value in meta.items():
+        full_key = f"{LINGBOT_MAP_ARCH}.{key}"
+        if isinstance(value, bool):  # must precede the int test: bool is a subclass of int
+            writer.add_bool(full_key, value)
+        elif isinstance(value, int):
+            writer.add_uint32(full_key, value)  # ints are written as uint32
+        elif isinstance(value, float):
+            writer.add_float32(full_key, value)
+        elif isinstance(value, str):
+            writer.add_string(full_key, value)
+        elif isinstance(value, list):
+            writer.add_array(full_key, value)
+        else:
+            raise TypeError(f"unsupported metadata value for {key}: {type(value)!r}")  # any other value type is rejected
+
+
+def lingbot_map_tensor_to_numpy(tensor: torch.Tensor, outtype: str) -> np.ndarray:  # Convert one tensor to the numpy array that is written to the GGUF file.
+    if tensor.dtype.is_floating_point:  # every floating-point dtype is cast to the requested output type
+        if outtype == "f16":
+            return tensor.to(torch.float16).numpy()
+        return tensor.to(torch.float32).numpy()  # any outtype other than "f16" yields float32
+
+    if tensor.dtype in (torch.int8, torch.int16, torch.int32, torch.int64):  # signed integer tensors keep their dtype
+        return tensor.numpy()
+
+    raise TypeError(f"unsupported tensor dtype: {tensor.dtype}")  # every dtype not handled above is rejected
+
+
+def write_lingbot_map_gguf(args: argparse.Namespace) -> None:  # Entry point of --lingbot-map mode: load checkpoint, select tensors, log a summary, write the GGUF.
+    outtype = "f32" if args.outtype == "auto" else args.outtype  # "auto" means f32 in this mode
+    if outtype not in ("f32", "f16"):
+        raise ValueError("LingBot-MAP GGUF conversion only supports --outtype f32 or f16")
+
+    checkpoint_path = args.checkpoint or LINGBOT_MAP_DEFAULT_CHECKPOINT  # falls back to the module-level default path
+    outfile = args.outfile or LINGBOT_MAP_DEFAULT_OUTFILE  # falls back to the module-level default path
+
+    logger.info("Loading LingBot-MAP checkpoint: %s", checkpoint_path)
+    checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)  # SECURITY NOTE(review): weights_only=False unpickles arbitrary objects; only load trusted checkpoints
+    state = unwrap_lingbot_map_state_dict(checkpoint)
+    selected = lingbot_map_selected_tensor_names(state, args.include_patch_embed, args.include_depth_head)
+    if not selected:
+        raise ValueError("no LingBot-MAP tensors selected for conversion")
+
+    meta = lingbot_map_infer_metadata(state, selected, args.include_patch_embed, args.include_depth_head)
+    total_params = sum(state[name].numel() for name in selected)
+    total_bytes = sum(lingbot_map_tensor_to_numpy(state[name], outtype).nbytes for name in selected)  # converts every tensor just to measure it; also raises early on an unsupported dtype
+
+    logger.info("Selected LingBot-MAP tensors: %d", len(selected))
+    logger.info("Selected LingBot-MAP parameters: %.3f M", total_params / 1e6)
+    logger.info("Selected LingBot-MAP tensor bytes: %.3f MiB", total_bytes / (1024 * 1024))
+    for key, value in meta.items():
+        logger.info("LingBot-MAP meta %s = %s", key, value)
+
+    if args.dry_run:  # dry run stops here, before any directory or file is created
+        return
+
+    outfile.parent.mkdir(parents=True, exist_ok=True)
+    writer = gguf.GGUFWriter(outfile, LINGBOT_MAP_ARCH)
+    lingbot_map_add_metadata(writer, meta, outtype)
+
+    for name in selected:
+        arr = lingbot_map_tensor_to_numpy(state[name].contiguous(), outtype)  # second conversion of each tensor, this time on a contiguous copy
+        writer.add_tensor(name, arr)  # tensors keep their original PyTorch names
+
+    logger.info("Writing LingBot-MAP GGUF: %s", outfile)
+    writer.write_header_to_file()
+    writer.write_kv_data_to_file()
+    writer.write_tensors_to_file(progress=True)
+    writer.close()  # NOTE(review): not reached if one of the writes above raises
+    logger.info("LingBot-MAP GGUF conversion done")
+
+
 def split_str_to_n_bytes(split_str: str) -> int:
     if split_str.endswith("K"):
         n = int(split_str[:-1]) * 1000
@@ -60,6 +241,22 @@ def parse_args() -> argparse.Namespace:
         "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="auto",
         help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type",
     )
+    parser.add_argument(
+        "--lingbot-map", action="store_true",
+        help="Export LingBot-MAP aggregator and camera_head tensors from a PyTorch checkpoint to GGUF.",
+    )
+    parser.add_argument(
+        "--checkpoint", type=Path,
+        help="Path to LingBot-MAP .pt checkpoint. Only used with --lingbot-map.",
+    )
+    parser.add_argument(
+        "--include-patch-embed", action="store_true",
+        help="Also include aggregator.patch_embed.* tensors when converting LingBot-MAP.",
+    )
+    parser.add_argument(
+        "--include-depth-head", action="store_true",
+        help="Also include depth_head.* tensors when converting LingBot-MAP.",
+    )
     parser.add_argument(
         "--bigendian", action="store_true",
         help="model is executed on big endian machine",
@@ -95,7 +292,7 @@ def parse_args() -> argparse.Namespace:
     )
     parser.add_argument(
         "--dry-run", action="store_true",
-        help="only print out a split plan and exit, without writing any new files",
+        help="only print out a split plan and exit, without writing any new files. In --lingbot-map mode, print selected tensors and inferred metadata without writing GGUF.",
     )
     parser.add_argument(
         "--no-tensor-first-split", action="store_true",
@@ -154,7 +351,7 @@ def parse_args() -> argparse.Namespace:
     )
 
     args = parser.parse_args()
-    if not args.print_supported_models and args.model is None:
+    if not args.print_supported_models and not args.lingbot_map and args.model is None:
         parser.error("the following arguments are required: model")
     return args
 
@@ -172,6 +369,10 @@ def main() -> None:
     else:
         logging.basicConfig(level=logging.INFO)
 
+    if args.lingbot_map:
+        write_lingbot_map_gguf(args)
+        return
+
     if args.remote:
         hf_repo_id = args.model
         from huggingface_hub import snapshot_download
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index be8f73cc1edd..0ca1d20a4a3b 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -8,6 +8,7 @@
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_CLIP,             "clip"             }, // dummy, only used by llama-quantize
+    { LLM_ARCH_LINGBOT_MAP,      "lingbot-map"      },
     { LLM_ARCH_LLAMA,            "llama"            },
     { LLM_ARCH_LLAMA4,           "llama4"           },
     { LLM_ARCH_DECI,             "deci"             },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 2c71bbe81562..a7a21b2ef606 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -12,6 +12,7 @@
 
 enum llm_arch {
     LLM_ARCH_CLIP,
+    LLM_ARCH_LINGBOT_MAP,
     LLM_ARCH_LLAMA,
     LLM_ARCH_LLAMA4,
     LLM_ARCH_DECI,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 3e236f8c17d2..b8056cac3496 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -37,6 +37,8 @@
 
 static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params & params) {
     switch (arch) {
+        case LLM_ARCH_LINGBOT_MAP:
+            return new llama_model_lingbot_map(params);
         case LLM_ARCH_LLAMA:
             return new llama_model_llama(params);
         case LLM_ARCH_LLAMA4:
@@ -1004,6 +1006,14 @@ void llama_model_base::load_hparams(llama_model_loader & ml) {
         return;
     }
 
+    if (ml.get_arch() == LLM_ARCH_LINGBOT_MAP) {
+        load_arch_hparams(ml);
+        pimpl->n_bytes = ml.n_bytes;
+        pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
+        hparams.rope_type = LLAMA_ROPE_TYPE_NONE;
+        return;
+    }
+
     ml.get_key(LLM_KV_CONTEXT_LENGTH,          hparams.n_ctx_train);
     ml.get_key(LLM_KV_EMBEDDING_LENGTH,        hparams.n_embd);
     ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT,    hparams.n_embd_out_impl, false);
@@ -2258,6 +2268,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
     switch (model->arch) {
         // these models do not use RoPE
         case LLM_ARCH_CLIP:
+        case LLM_ARCH_LINGBOT_MAP:
         case LLM_ARCH_GPT2:
         case LLM_ARCH_GPTJ:
         case LLM_ARCH_MPT:
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 43e05c3d56fe..d2955a846237 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -351,6 +351,12 @@ static bool tensor_allows_quantization(const llama_model_quantize_params * param
     quantize &= name.find(".patch_embd")    == std::string::npos;
     quantize &= name.find(".patch_merger")  == std::string::npos;
 
+    if (arch == LLM_ARCH_LINGBOT_MAP) {
+        // Pose input projection has ne[0] = pose_dim = 9. Legacy block quantizers such as Q4_0
+        // require the first dimension to be divisible by 32, so keep such tiny projection tensors in F32.
+        quantize &= tensor->ne[0] % 32 == 0;
+    }
+
     return quantize;
 }
 
diff --git a/src/models/lingbot-map.cpp b/src/models/lingbot-map.cpp
new file mode 100644
index 000000000000..877402d7ee4a
--- /dev/null
+++ b/src/models/lingbot-map.cpp
@@ -0,0 +1,68 @@
+#include "models.h"
+
+#include <algorithm>
+#include <stdexcept>
+#include <string>
+
+void llama_model_lingbot_map::load_arch_hparams(llama_model_loader & ml) { // read "lingbot-map.*" GGUF keys and fill hparams with values derived from them
+    std::string component;
+    uint32_t embed_dim = 0;
+    uint32_t camera_dim = 0;
+    uint32_t frame_blocks = 0;
+    uint32_t global_blocks = 0;
+    uint32_t camera_blocks = 0;
+
+    ml.get_key("lingbot-map.component", component); // presumably get_key treats these keys as required - TODO confirm its default
+    ml.get_key("lingbot-map.embed_dim", embed_dim);
+    ml.get_key("lingbot-map.camera_dim", camera_dim);
+    ml.get_key("lingbot-map.aggregator_frame_block_count", frame_blocks);
+    ml.get_key("lingbot-map.aggregator_global_block_count", global_blocks);
+    ml.get_key("lingbot-map.camera_trunk_block_count", camera_blocks);
+
+    if (component != "aggregator_camera_head") { // only this component layout is accepted
+        throw std::runtime_error("unsupported LingBot-MAP GGUF component: " + component);
+    }
+    if (embed_dim == 0 || camera_dim == 0 || frame_blocks == 0 || global_blocks == 0 || camera_blocks == 0) { // every dimension and block count must be non-zero
+        throw std::runtime_error("invalid LingBot-MAP GGUF metadata");
+    }
+
+    type = LLM_TYPE_UNKNOWN;
+    hparams.n_ctx_train = 0;
+    hparams.n_embd = std::max(embed_dim, camera_dim); // larger of the two widths
+    hparams.n_layer = frame_blocks + global_blocks + camera_blocks; // sum of all three block stacks
+    hparams.n_expert = 0;
+    hparams.n_expert_used = 0;
+    hparams.causal_attn = false;
+    hparams.f_norm_eps = 1e-6f;
+    hparams.f_norm_rms_eps = 0.0f;
+    hparams.rope_freq_base_train = 0.0f;
+    hparams.rope_freq_scale_train = 1.0f;
+    hparams.rope_type = LLAMA_ROPE_TYPE_NONE;
+
+    const uint32_t n_heads = 16; // hard-coded; not read from the GGUF metadata
+    const uint32_t n_layers = std::min<uint32_t>(hparams.n_layer, LLAMA_MAX_LAYERS); // bounds the loop below; hparams.n_layer itself stays unclamped
+    std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); // zero all per-layer arrays before filling the used prefix
+    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
+    std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+    std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
+    std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
+    for (uint32_t il = 0; il < n_layers; ++il) { // the same head count and FFN width are recorded for every layer
+        hparams.n_head_arr[il] = n_heads;
+        hparams.n_head_kv_arr[il] = n_heads;
+        hparams.n_ff_arr[il] = hparams.n_embd * 4;
+    }
+    hparams.n_embd_head_k_full = hparams.n_embd / n_heads; // integer division of the max() width above
+    hparams.n_embd_head_v_full = hparams.n_embd / n_heads;
+    hparams.n_embd_head_k_swa = hparams.n_embd_head_k_full;
+    hparams.n_embd_head_v_swa = hparams.n_embd_head_v_full;
+    hparams.n_rot_full = 0;
+    hparams.n_rot_swa = 0;
+}
+
+void llama_model_lingbot_map::load_arch_tensors(llama_model_loader &) { // always throws: this class does not load tensors
+    throw std::runtime_error("LingBot-MAP GGUF tensors are loaded by the mtmd SMT wrapper, not llama_model");
+}
+
+std::unique_ptr<llm_graph_context> llama_model_lingbot_map::build_arch_graph(const llm_graph_params &) const { // always throws: no graph is built for this architecture
+    throw std::runtime_error("LingBot-MAP does not support llama_model text graph execution");
+}
diff --git a/src/models/models.h b/src/models/models.h
index 5251e2d82802..47c099a76d9a 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -173,6 +173,15 @@ struct llama_model_llama_embed : public llama_model_llama {
 };
 
 
+struct llama_model_lingbot_map : public llama_model_base { // model class for the "lingbot-map" architecture
+    llama_model_lingbot_map(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override; // reads the "lingbot-map.*" metadata keys
+    void load_arch_tensors(llama_model_loader & ml) override; // implementation in lingbot-map.cpp throws unconditionally
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; // implementation in lingbot-map.cpp throws unconditionally
+};
+
+
 struct llama_model_maincoder : public llama_model_base {
     llama_model_maincoder(const struct llama_model_params & params) : llama_model_base(params) {}
     void load_arch_hparams(llama_model_loader & ml) override;
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index 542a18b5cbca..0f46d9adfead 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -145,6 +145,11 @@ if(LLAMA_SERVER_SMT_VISION)
         message(FATAL_ERROR "Could not find spine_llm_argparser.cc in SPACEMIT_ORT_DIR='${SPACEMIT_ORT_DIR}'")
     endif()
 
+    target_sources(mtmd PRIVATE
+        lingbot-map-wrapper.cpp
+        lingbot-map-wrapper.h
+    )
+
     if(EXISTS "${SPACEMIT_ORT_LIB_DIR}/libonnxruntime.so")
         set(ONNXRUNTIME_LIB "${SPACEMIT_ORT_LIB_DIR}/libonnxruntime.so")
     elseif(EXISTS "${SPACEMIT_ORT_LIB_DIR}/libonnxruntime.a")
diff --git a/tools/mtmd/lingbot-map-wrapper.cpp b/tools/mtmd/lingbot-map-wrapper.cpp
new file mode 100644
index 000000000000..46e2a87fe473
--- /dev/null
+++ b/tools/mtmd/lingbot-map-wrapper.cpp
@@ -0,0 +1,1893 @@
+#include "lingbot-map-wrapper.h"
+
+#include "gguf.h"
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "ggml-cpu.h"
+
+#include <cctype>
+#include <cerrno>
+#include <chrono>
+#include <cmath>
+#include <cstring>
+#include <fstream>
+#include <algorithm>
+#include <iostream>
+#include <stdexcept>
+#include <unordered_map>
+#include <sys/stat.h>
+
+namespace {
+
+struct gguf_deleter { // unique_ptr deleter: releases a gguf_context with gguf_free
+    void operator()(gguf_context * ctx) const {
+        if (ctx != nullptr) {
+            gguf_free(ctx);
+        }
+    }
+};
+
+struct ggml_deleter { // unique_ptr deleter: releases a ggml_context with ggml_free
+    void operator()(ggml_context * ctx) const {
+        if (ctx != nullptr) {
+            ggml_free(ctx);
+        }
+    }
+};
+
+struct ggml_backend_deleter { // unique_ptr deleter: releases a backend with ggml_backend_free
+    void operator()(ggml_backend * backend) const {
+        if (backend != nullptr) {
+            ggml_backend_free(backend);
+        }
+    }
+};
+
+struct ggml_backend_buffer_deleter { // unique_ptr deleter: releases a backend buffer with ggml_backend_buffer_free
+    void operator()(ggml_backend_buffer * buffer) const {
+        if (buffer != nullptr) {
+            ggml_backend_buffer_free(buffer);
+        }
+    }
+};
+
+struct ggml_backend_sched_deleter { // unique_ptr deleter: releases a scheduler with ggml_backend_sched_free
+    void operator()(ggml_backend_sched * sched) const {
+        if (sched != nullptr) {
+            ggml_backend_sched_free(sched);
+        }
+    }
+};
+
+using gguf_context_ptr = std::unique_ptr<gguf_context, gguf_deleter>; // owning handles built on the deleters above
+using ggml_context_ptr = std::unique_ptr<ggml_context, ggml_deleter>;
+using ggml_backend_ptr = std::unique_ptr<ggml_backend, ggml_backend_deleter>;
+using ggml_backend_buffer_ptr = std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter>;
+using ggml_backend_sched_ptr = std::unique_ptr<ggml_backend_sched, ggml_backend_sched_deleter>;
+
+struct lingbot_map_loaded_gguf { // owning pair: a gguf_context and a ggml_context
+    gguf_context_ptr gguf;
+    ggml_context_ptr ggml;
+};
+
+struct lingbot_map_runtime_weights { // same pair plus an owned backend buffer
+    gguf_context_ptr gguf;
+    ggml_context_ptr ggml;
+    ggml_backend_buffer_ptr buffer; // declared last, so it is destroyed first (members are destroyed in reverse order)
+};
+
+struct lingbot_map_runtime_graph { // raw, non-owning pointers to a graph and selected tensors
+    ggml_tensor * input_tokens = nullptr;
+    ggml_tensor * camera_head_input = nullptr;
+    ggml_tensor * final_pose = nullptr;
+    ggml_cgraph * graph = nullptr;
+    std::vector<ggml_tensor *> selected_outputs;
+    std::vector<ggml_tensor *> iteration_poses;
+};
+
+
+
+
+static int64_t lingbot_elapsed_ms(std::chrono::steady_clock::time_point start) { // whole milliseconds from `start` to now on steady_clock
+    return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count();
+}
+
+static bool lingbot_graph_supported_by_backend(ggml_backend_t backend, // true only if the backend supports `buft` and every non-null graph node
+                                               ggml_backend_buffer_type_t buft,
+                                               ggml_cgraph * graph,
+                                               bool log_summary) {
+    if (backend == nullptr || buft == nullptr || graph == nullptr) { // null arguments: unsupported, nothing is logged
+        return false;
+    }
+    bool ok = true;
+    int unsupported_nodes = 0;
+    if (!ggml_backend_supports_buft(backend, buft)) {
+        ok = false;
+    }
+    const int n_nodes = ggml_graph_n_nodes(graph);
+    for (int i = 0; i < n_nodes; ++i) { // no early exit, so the unsupported-node count in the log is complete
+        const ggml_tensor * node = ggml_graph_node(graph, i);
+        if (node == nullptr) {
+            continue;
+        }
+        if (!ggml_backend_supports_op(backend, node)) {
+            ++unsupported_nodes;
+            ok = false;
+        }
+    }
+    if (!ok && log_summary) { // one summary line on failure; per-node details are not printed
+        std::cerr << "[LingBot-MAP] GGML graph support check failed on backend=" << ggml_backend_name(backend)
+                  << ", buffer_type=" << ggml_backend_buft_name(buft)
+                  << ", unsupported_nodes=" << unsupported_nodes << "/" << n_nodes << "\n";
+    }
+    return ok;
+}
+
+static std::string read_file_to_string(const std::string & path) { // read the whole file into a string
+    std::ifstream file(path);
+    if (!file.is_open()) { // open failure yields "", indistinguishable from an empty file
+        return {};
+    }
+    return std::string((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+}
+
+static bool file_exists(const std::string & path) { // true only when stat() succeeds and reports a regular file
+    struct stat st;
+    return stat(path.c_str(), &st) == 0 && S_ISREG(st.st_mode); // directories and other non-regular entries count as missing
+}
+
+static size_t find_closing_brace(const std::string & text, size_t start_pos) { // index of the '}' that returns brace depth to 0, else npos
+    if (start_pos == std::string::npos) { // lets callers pass a failed find() result straight through
+        return std::string::npos;
+    }
+    int depth = 0;
+    for (size_t i = start_pos; i < text.size(); ++i) { // counts every brace, including any inside quoted strings
+        if (text[i] == '{') {
+            ++depth;
+        } else if (text[i] == '}') {
+            --depth;
+            if (depth == 0) {
+                return i;
+            }
+        }
+    }
+    return std::string::npos; // no balancing '}' found
+}
+
+static std::string trim_ascii(std::string value) { // return `value` without leading and trailing std::isspace characters
+    while (!value.empty() && std::isspace(static_cast<unsigned char>(value.front()))) { // unsigned char cast keeps isspace defined for bytes >= 0x80
+        value.erase(value.begin()); // erases one leading character per iteration
+    }
+    while (!value.empty() && std::isspace(static_cast<unsigned char>(value.back()))) {
+        value.pop_back();
+    }
+    return value;
+}
+
+static std::string normalize_path(const std::string & base_dir, const std::string & path) { // resolve `path` against `base_dir`; "" when `path` is blank
+    const std::string trimmed = trim_ascii(path);
+    if (trimmed.empty()) {
+        return {};
+    }
+    if (trimmed.front() == '/' || base_dir.empty()) { // absolute path, or no base: an empty base_dir must not turn "x" into "/x"
+        return trimmed;
+    }
+    return base_dir + "/" + trimmed;
+}
+
+static std::string extract_object_block(const std::string & text, const std::string & key) { // text of the {...} following the first "key", braces included; "" on failure
+    const std::string marker = "\"" + key + "\"";
+    const size_t key_pos = text.find(marker); // first textual occurrence of the quoted key; nesting level is not considered
+    if (key_pos == std::string::npos) {
+        return {};
+    }
+    const size_t brace_start = text.find('{', key_pos + marker.size());
+    const size_t brace_end = find_closing_brace(text, brace_start);
+    if (brace_start == std::string::npos || brace_end == std::string::npos || brace_end <= brace_start) {
+        return {};
+    }
+    return text.substr(brace_start, brace_end - brace_start + 1);
+}
+
+static std::string extract_string_value(const std::string & text, const std::string & key) { // first quoted string after "key":, or "" on any failure
+    const std::string marker = "\"" + key + "\"";
+    const size_t key_pos = text.find(marker); // first textual occurrence of the quoted key
+    if (key_pos == std::string::npos) {
+        return {};
+    }
+    const size_t colon_pos = text.find(':', key_pos + marker.size());
+    if (colon_pos == std::string::npos) {
+        return {};
+    }
+    const size_t first_quote = text.find('"', colon_pos + 1); // next quote anywhere after the colon, whatever lies in between
+    if (first_quote == std::string::npos) {
+        return {};
+    }
+    const size_t second_quote = text.find('"', first_quote + 1); // the value ends at the next quote; escaped quotes are not handled
+    if (second_quote == std::string::npos) {
+        return {};
+    }
+    return text.substr(first_quote + 1, second_quote - first_quote - 1);
+}
+
+static int32_t extract_int32_value(const std::string & text, const std::string & key, int32_t default_value) { // integer after "key":, else default_value
+    const std::string marker = "\"" + key + "\"";
+    const size_t key_pos = text.find(marker); // first textual occurrence of the quoted key
+    if (key_pos == std::string::npos) {
+        return default_value;
+    }
+    const size_t colon_pos = text.find(':', key_pos + marker.size());
+    if (colon_pos == std::string::npos) {
+        return default_value;
+    }
+    size_t pos = colon_pos + 1;
+    while (pos < text.size() && std::isspace(static_cast<unsigned char>(text[pos]))) { // skip whitespace after the colon
+        ++pos;
+    }
+    size_t end = pos;
+    if (end < text.size() && (text[end] == '-' || text[end] == '+')) { // optional sign
+        ++end;
+    }
+    while (end < text.size() && std::isdigit(static_cast<unsigned char>(text[end]))) { // digits only; a fractional part is not consumed
+        ++end;
+    }
+    if (end == pos) { // nothing numeric found (e.g. the value is a quoted string)
+        return default_value;
+    }
+    try {
+        return std::stoi(text.substr(pos, end - pos));
+    } catch (...) { // a lone sign or an out-of-range number falls back to the default
+        return default_value;
+    }
+}
+
+static void extract_float_array3(const std::string & text, const std::string & key, float values[3]) { // overwrite up to 3 entries of `values` from the [...] after "key"
+    const std::string marker = "\"" + key + "\"";
+    const size_t key_pos = text.find(marker);
+    if (key_pos == std::string::npos) { // key missing: `values` is left untouched
+        return;
+    }
+    const size_t bracket_start = text.find('[', key_pos + marker.size());
+    const size_t bracket_end = text.find(']', bracket_start == std::string::npos ? key_pos : bracket_start + 1);
+    if (bracket_start == std::string::npos || bracket_end == std::string::npos || bracket_end <= bracket_start) {
+        return;
+    }
+
+    size_t pos = bracket_start + 1;
+    for (int i = 0; i < 3 && pos < bracket_end; ++i) { // fewer than 3 elements leaves the remaining entries unchanged
+        while (pos < bracket_end && (std::isspace(static_cast<unsigned char>(text[pos])) || text[pos] == ',')) {
+            ++pos;
+        }
+        size_t end = pos;
+        while (end < bracket_end && text[end] != ',') { // element runs to the next comma or the closing bracket
+            ++end;
+        }
+        try {
+            values[i] = std::stof(text.substr(pos, end - pos));
+        } catch (...) { // parse failure stops here; entries already written stay written
+            return;
+        }
+        pos = end + 1;
+    }
+}
+
+static bool extract_bool_value(const std::string & text, const std::string & key, bool default_value) { // true/false literal after "key":, else default_value
+    const std::string marker = "\"" + key + "\"";
+    const size_t key_pos = text.find(marker); // first textual occurrence of the quoted key
+    if (key_pos == std::string::npos) {
+        return default_value;
+    }
+    const size_t colon_pos = text.find(':', key_pos + marker.size());
+    if (colon_pos == std::string::npos) {
+        return default_value;
+    }
+    size_t pos = colon_pos + 1;
+    while (pos < text.size() && std::isspace(static_cast<unsigned char>(text[pos]))) { // skip whitespace after the colon
+        ++pos;
+    }
+    if (text.compare(pos, 4, "true") == 0) { // prefix match only; the character after the literal is not checked
+        return true;
+    }
+    if (text.compare(pos, 5, "false") == 0) {
+        return false;
+    }
+    return default_value; // anything else (numbers, quoted "true", ...) yields the default
+}
+
+
+static std::unordered_map<std::string, std::string> extract_string_map(const std::string & text, const std::string & key) { // "k": "v" pairs of the {...} after "key"
+    std::unordered_map<std::string, std::string> values;
+    const std::string marker = "\"" + key + "\"";
+    const size_t key_pos = text.find(marker);
+    if (key_pos == std::string::npos) { // key missing: empty map
+        return values;
+    }
+    const size_t brace_start = text.find('{', key_pos + marker.size());
+    const size_t brace_end = find_closing_brace(text, brace_start);
+    if (brace_start == std::string::npos || brace_end == std::string::npos || brace_end <= brace_start) {
+        return values;
+    }
+    const std::string content = text.substr(brace_start + 1, brace_end - brace_start - 1); // object body without the outer braces
+    size_t pos = 0;
+    while (pos < content.size()) {
+        while (pos < content.size() && (std::isspace(static_cast<unsigned char>(content[pos])) || content[pos] == ',')) { // skip separators between entries
+            ++pos;
+        }
+        if (pos >= content.size() || content[pos] != '"') { // stop at the first thing that is not a quoted key
+            break;
+        }
+        const size_t key_start = pos + 1;
+        const size_t key_end = content.find('"', key_start);
+        if (key_end == std::string::npos) {
+            break;
+        }
+        const size_t colon = content.find(':', key_end + 1);
+        const size_t value_quote = content.find('"', colon == std::string::npos ? key_end + 1 : colon + 1); // next quote after the colon; a non-string value is not detected
+        if (colon == std::string::npos || value_quote == std::string::npos) {
+            break;
+        }
+        const size_t value_end = content.find('"', value_quote + 1); // escaped quotes are not handled
+        if (value_end == std::string::npos) {
+            break;
+        }
+        values[content.substr(key_start, key_end - key_start)] = content.substr(value_quote + 1, value_end - value_quote - 1); // a repeated key overwrites the earlier value
+        pos = value_end + 1;
+    }
+    return values; // entries parsed before an early break are kept
+}
+
+static void merge_missing_ep_config(std::unordered_map<std::string, std::string> & dst, // copy into dst every src entry whose key dst lacks
+                                    const std::unordered_map<std::string, std::string> & src) {
+    for (const auto & kv : src) {
+        if (dst.find(kv.first) == dst.end()) { // existing dst values are never overwritten
+            dst[kv.first] = kv.second;
+        }
+    }
+}
+
+static void apply_legacy_lingbot_ep_config(const std::string & text, // fill SPACEMIT_EP_* entries absent from ep_config using lower-case keys in `text`
+                                           std::unordered_map<std::string, std::string> & ep_config) {
+    if (ep_config.find("SPACEMIT_EP_INTRA_THREAD_NUM") == ep_config.end()) {
+        ep_config["SPACEMIT_EP_INTRA_THREAD_NUM"] = std::to_string(extract_int32_value(text, "spacemit_ep_intra_thread_num", 4)); // 4 when the legacy key is missing or unparsable
+    }
+    if (ep_config.find("SPACEMIT_EP_INTER_THREAD_NUM") == ep_config.end()) {
+        ep_config["SPACEMIT_EP_INTER_THREAD_NUM"] = std::to_string(extract_int32_value(text, "spacemit_ep_inter_thread_num", 1)); // 1 when the legacy key is missing or unparsable
+    }
+    const std::string affinity = extract_string_value(text, "spacemit_ep_intra_thread_affinity");
+    if (!affinity.empty() && ep_config.find("SPACEMIT_EP_INTRA_THREAD_AFFINITY") == ep_config.end()) { // affinity has no default: set only when the legacy value is non-empty
+        ep_config["SPACEMIT_EP_INTRA_THREAD_AFFINITY"] = affinity;
+    }
+}
+
+// Collect the double-quoted strings inside the first '[' ... ']' pair that follows "key" in text.
+// Returns an empty vector when the key or either bracket is missing; escaped quotes are not interpreted.
+static std::vector<std::string> extract_string_array(const std::string & text, const std::string & key) {
+    std::vector<std::string> items;
+    const std::string quoted_key = "\"" + key + "\"";
+    const size_t key_at = text.find(quoted_key);
+    if (key_at == std::string::npos) {
+        return items;
+    }
+    // The closing bracket is only searched for once an opening bracket has been found.
+    const size_t open = text.find('[', key_at + quoted_key.size());
+    const size_t close = open == std::string::npos ? open : text.find(']', open + 1);
+    if (close == std::string::npos) {
+        return items;
+    }
+    // Each pass consumes one "..." pair that ends before the closing bracket.
+    for (size_t cursor = open + 1; cursor < close;) {
+        const size_t quote_open = text.find('"', cursor);
+        const size_t quote_close = quote_open == std::string::npos ? quote_open : text.find('"', quote_open + 1);
+        if (quote_open >= close || quote_close > close) {
+            break;
+        }
+        items.push_back(text.substr(quote_open + 1, quote_close - quote_open - 1));
+        cursor = quote_close + 1;
+    }
+    return items;
+}
+
+// Parse the integers inside the first '[' ... ']' pair that follows "key" in text. Parsing stops at the first
+// token that is not an optionally signed decimal integer (e.g. a quoted string or a lone sign) and the values
+// read so far are returned. std::stoi still throws std::out_of_range for values that do not fit in int.
+static std::vector<int32_t> extract_int32_array(const std::string & text, const std::string & key) {
+    std::vector<int32_t> values;
+    const std::string marker = "\"" + key + "\"";
+    const size_t key_pos = text.find(marker);
+    if (key_pos == std::string::npos) {
+        return values;
+    }
+    const size_t bracket_start = text.find('[', key_pos + marker.size());
+    const size_t bracket_end = bracket_start == std::string::npos ? bracket_start : text.find(']', bracket_start + 1);
+    if (bracket_end == std::string::npos) {
+        return values;
+    }
+    size_t pos = bracket_start + 1;
+    while (pos < bracket_end) {
+        while (pos < bracket_end && (std::isspace(static_cast<unsigned char>(text[pos])) || text[pos] == ',')) {
+            ++pos;
+        }
+        const bool has_sign = pos < bracket_end && (text[pos] == '-' || text[pos] == '+');
+        const size_t digits_start = has_sign ? pos + 1 : pos;
+        size_t end = digits_start;
+        while (end < bracket_end && std::isdigit(static_cast<unsigned char>(text[end]))) {
+            ++end;
+        }
+        // No digits (end of array, foreign token, or a sign on its own): stop here. Passing a bare sign to
+        // std::stoi would throw std::invalid_argument instead of ending the parse like other malformed input.
+        if (end == digits_start) {
+            break;
+        }
+        values.push_back((int32_t) std::stoi(text.substr(pos, end - pos)));
+        pos = end;
+    }
+    return values;
+}
+
+// Return the metadata value stored under key; throws std::runtime_error when the key is absent or not UINT32.
+static uint32_t require_gguf_u32(const gguf_context * ctx, const char * key) {
+    const int64_t key_id = gguf_find_key(ctx, key);
+    const bool usable = key_id >= 0 && gguf_get_kv_type(ctx, key_id) == GGUF_TYPE_UINT32;
+    if (!usable) { throw std::runtime_error(std::string("missing GGUF uint32 metadata: ") + key); }
+    return gguf_get_val_u32(ctx, key_id);
+}
+
+// True for the storage types accepted for matrix weights: F32, F16, BF16, or any ggml quantized type.
+static bool lingbot_tensor_type_is_supported_matrix_weight(ggml_type type) {
+    return ggml_is_quantized(type) || type == GGML_TYPE_BF16 || type == GGML_TYPE_F16 || type == GGML_TYPE_F32;
+}
+
+// Throw std::runtime_error unless tensor `name` exists in ctx with dims (ne0, ne1, ne2, ne3) and an allowed
+// type: F32 only by default, or any supported matrix-weight type when allow_quantized_matrix_weight is true.
+static void require_ggml_tensor_shape(
+        ggml_context *       ctx,
+        const std::string &  name,
+        int64_t              ne0,
+        int64_t              ne1 = 1,
+        int64_t              ne2 = 1,
+        int64_t              ne3 = 1,
+        bool                 allow_quantized_matrix_weight = false) {
+    const ggml_tensor * found = ggml_get_tensor(ctx, name.c_str());
+    if (found == nullptr) {
+        throw std::runtime_error("missing LingBot-MAP tensor: " + name);
+    }
+    const bool dims_match = found->ne[0] == ne0 && found->ne[1] == ne1 && found->ne[2] == ne2 && found->ne[3] == ne3;
+    const bool type_allowed = allow_quantized_matrix_weight ? lingbot_tensor_type_is_supported_matrix_weight(found->type)
+                                                            : found->type == GGML_TYPE_F32;
+    if (dims_match && type_allowed) {
+        return;
+    }
+    throw std::runtime_error("unexpected LingBot-MAP tensor shape/type: " + name + " type=" + ggml_type_name(found->type));
+}
+
+// Check shape and type of the per-block tensors for the first and last frame block and the first and last global block.
+static void validate_lingbot_map_aggregator_block_shapes(ggml_context * ctx, const lingbot_map_config & cfg) {
+    const int64_t c = cfg.hidden_size;
+    const int64_t mlp = c * 4;  // expected fc1 output width / fc2 input width
+    const int64_t head_dim = c / 16;  // expected q_norm / k_norm length; only meaningful once the check below passed
+    if (c <= 0 || c % 16 != 0) {
+        throw std::runtime_error("LingBot-MAP hidden_size must be divisible by 16 attention heads");
+    }
+    for (const auto & prefix : {  // blocks between the first and last index are not inspected here
+            std::string("aggregator.frame_blocks.0"),
+            std::string("aggregator.frame_blocks.") + std::to_string(cfg.frame_block_count - 1),
+            std::string("aggregator.global_blocks.0"),
+            std::string("aggregator.global_blocks.") + std::to_string(cfg.global_block_count - 1),
+        }) {
+        require_ggml_tensor_shape(ctx, prefix + ".norm1.weight", c);  // F32 is required unless the last argument is true
+        require_ggml_tensor_shape(ctx, prefix + ".norm1.bias", c);
+        require_ggml_tensor_shape(ctx, prefix + ".attn.qkv.weight", c, c * 3, 1, 1, true);  // true: F16/BF16/quantized allowed
+        require_ggml_tensor_shape(ctx, prefix + ".attn.qkv.bias", c * 3);
+        require_ggml_tensor_shape(ctx, prefix + ".attn.q_norm.weight", head_dim);
+        require_ggml_tensor_shape(ctx, prefix + ".attn.q_norm.bias", head_dim);
+        require_ggml_tensor_shape(ctx, prefix + ".attn.k_norm.weight", head_dim);
+        require_ggml_tensor_shape(ctx, prefix + ".attn.k_norm.bias", head_dim);
+        require_ggml_tensor_shape(ctx, prefix + ".attn.proj.weight", c, c, 1, 1, true);
+        require_ggml_tensor_shape(ctx, prefix + ".attn.proj.bias", c);
+        require_ggml_tensor_shape(ctx, prefix + ".ls1.gamma", c);
+        require_ggml_tensor_shape(ctx, prefix + ".norm2.weight", c);
+        require_ggml_tensor_shape(ctx, prefix + ".norm2.bias", c);
+        require_ggml_tensor_shape(ctx, prefix + ".mlp.fc1.weight", c, mlp, 1, 1, true);
+        require_ggml_tensor_shape(ctx, prefix + ".mlp.fc1.bias", mlp);
+        require_ggml_tensor_shape(ctx, prefix + ".mlp.fc2.weight", mlp, c, 1, 1, true);
+        require_ggml_tensor_shape(ctx, prefix + ".mlp.fc2.bias", c);
+        require_ggml_tensor_shape(ctx, prefix + ".ls2.gamma", c);
+    }
+}
+
+// Check shape and type of the camera_head tensors: the tensors outside the trunk, then every trunk block.
+static void validate_lingbot_map_camera_head_shapes(ggml_context * ctx, const lingbot_map_config & cfg) {
+    const int64_t c = cfg.camera_hidden_size;
+    const int64_t pose_dim = 9;  // expected length of the pose vectors (empty_pose_tokens, embed_pose input, fc2 output)
+    const int64_t mlp = c * 4;  // expected fc1 output width / fc2 input width
+    if (c <= 0 || c % 16 != 0) {
+        throw std::runtime_error("LingBot-MAP camera_hidden_size must be divisible by 16 attention heads");
+    }
+    if (cfg.camera_trunk_block_count <= 0 || cfg.camera_num_iterations <= 0) {
+        throw std::runtime_error("LingBot-MAP camera_head requires positive trunk block and iteration counts");
+    }
+    require_ggml_tensor_shape(ctx, "camera_head.empty_pose_tokens", pose_dim, 1, 1);
+    require_ggml_tensor_shape(ctx, "camera_head.token_norm.weight", c);
+    require_ggml_tensor_shape(ctx, "camera_head.token_norm.bias", c);
+    require_ggml_tensor_shape(ctx, "camera_head.trunk_norm.weight", c);
+    require_ggml_tensor_shape(ctx, "camera_head.trunk_norm.bias", c);
+    require_ggml_tensor_shape(ctx, "camera_head.embed_pose.weight", pose_dim, c);  // must be F32: no quantized flag passed
+    require_ggml_tensor_shape(ctx, "camera_head.embed_pose.bias", c);
+    require_ggml_tensor_shape(ctx, "camera_head.poseLN_modulation.1.weight", c, c * 3, 1, 1, true);
+    require_ggml_tensor_shape(ctx, "camera_head.poseLN_modulation.1.bias", c * 3);
+    require_ggml_tensor_shape(ctx, "camera_head.pose_branch.fc1.weight", c, c / 2, 1, 1, true);
+    require_ggml_tensor_shape(ctx, "camera_head.pose_branch.fc1.bias", c / 2);
+    require_ggml_tensor_shape(ctx, "camera_head.pose_branch.fc2.weight", c / 2, pose_dim, 1, 1, true);
+    require_ggml_tensor_shape(ctx, "camera_head.pose_branch.fc2.bias", pose_dim);
+    // Every trunk block index in [0, camera_trunk_block_count) is checked; no q_norm/k_norm tensors are expected here.
+    for (int32_t i = 0; i < cfg.camera_trunk_block_count; ++i) {
+        const std::string prefix = "camera_head.trunk." + std::to_string(i);
+        require_ggml_tensor_shape(ctx, prefix + ".norm1.weight", c);
+        require_ggml_tensor_shape(ctx, prefix + ".norm1.bias", c);
+        require_ggml_tensor_shape(ctx, prefix + ".attn.qkv.weight", c, c * 3, 1, 1, true);
+        require_ggml_tensor_shape(ctx, prefix + ".attn.qkv.bias", c * 3);
+        require_ggml_tensor_shape(ctx, prefix + ".attn.proj.weight", c, c, 1, 1, true);
+        require_ggml_tensor_shape(ctx, prefix + ".attn.proj.bias", c);
+        require_ggml_tensor_shape(ctx, prefix + ".ls1.gamma", c);
+        require_ggml_tensor_shape(ctx, prefix + ".norm2.weight", c);
+        require_ggml_tensor_shape(ctx, prefix + ".norm2.bias", c);
+        require_ggml_tensor_shape(ctx, prefix + ".mlp.fc1.weight", c, mlp, 1, 1, true);
+        require_ggml_tensor_shape(ctx, prefix + ".mlp.fc1.bias", mlp);
+        require_ggml_tensor_shape(ctx, prefix + ".mlp.fc2.weight", mlp, c, 1, 1, true);
+        require_ggml_tensor_shape(ctx, prefix + ".mlp.fc2.bias", c);
+        require_ggml_tensor_shape(ctx, prefix + ".ls2.gamma", c);
+    }
+}
+
+// Return the metadata value stored under key; throws std::runtime_error when the key is absent or not a STRING.
+static std::string require_gguf_string(const gguf_context * ctx, const char * key) {
+    const int64_t key_id = gguf_find_key(ctx, key);
+    const bool usable = key_id >= 0 && gguf_get_kv_type(ctx, key_id) == GGUF_TYPE_STRING;
+    if (!usable) { throw std::runtime_error(std::string("missing GGUF string metadata: ") + key); }
+    return gguf_get_val_str(ctx, key_id);
+}
+
+// Load and validate <config_dir>/config.json; throws std::runtime_error when the file, a required block or field, or a model file is missing.
+static lingbot_map_config load_lingbot_map_config(const std::string & config_dir) {
+    const std::string config_path = config_dir + "/config.json";
+    const std::string content = read_file_to_string(config_path);
+    if (content.empty()) {
+        throw std::runtime_error("failed to read LingBot-MAP config: " + config_path);
+    }
+    const std::string vision_block = extract_object_block(content, "vision_model");
+    const std::string agg_block = extract_object_block(content, "aggregator_camera_model");
+    const std::string depth_block = extract_object_block(content, "depth_model");
+    const std::string post_block = extract_object_block(content, "postprocess");
+    if (vision_block.empty() || agg_block.empty() || depth_block.empty()) {
+        throw std::runtime_error("LingBot-MAP config requires vision_model, aggregator_camera_model, and depth_model blocks");
+    }
+    lingbot_map_config cfg;
+    cfg.architectures = extract_string_array(content, "architectures");
+    cfg.vision_model_path = normalize_path(config_dir, extract_string_value(vision_block, "model_path"));
+    cfg.aggregator_camera_model_path = normalize_path(config_dir, extract_string_value(agg_block, "model_path"));
+    cfg.depth_model_path = normalize_path(config_dir, extract_string_value(depth_block, "model_path"));
+    cfg.ep_config = extract_string_map(vision_block, "ep_config");
+    merge_missing_ep_config(cfg.ep_config, extract_string_map(content, "ep_config"));
+    // Legacy thread counts: vision_model key first, then the top-level key, then the default. Seed them here because
+    // apply_legacy_lingbot_ep_config() stores extract_int32_value()'s fallback when its text lacks the key, masking the 2nd call.
+    cfg.ep_config.emplace("SPACEMIT_EP_INTRA_THREAD_NUM", std::to_string(extract_int32_value(
+        vision_block, "spacemit_ep_intra_thread_num", extract_int32_value(content, "spacemit_ep_intra_thread_num", 4))));
+    cfg.ep_config.emplace("SPACEMIT_EP_INTER_THREAD_NUM", std::to_string(extract_int32_value(
+        vision_block, "spacemit_ep_inter_thread_num", extract_int32_value(content, "spacemit_ep_inter_thread_num", 1))));
+    apply_legacy_lingbot_ep_config(vision_block, cfg.ep_config);
+    apply_legacy_lingbot_ep_config(content, cfg.ep_config);
+    cfg.image_size = extract_int32_value(vision_block, "image_size", 518);
+    cfg.patch_size = extract_int32_value(vision_block, "patch_size", 14);
+    extract_float_array3(vision_block, "image_mean", cfg.image_mean);
+    extract_float_array3(vision_block, "image_std", cfg.image_std);
+    cfg.hidden_size = extract_int32_value(agg_block, "hidden_size", 0);
+    cfg.camera_hidden_size = extract_int32_value(agg_block, "camera_hidden_size", 0);
+    cfg.num_special_tokens = extract_int32_value(agg_block, "num_special_tokens", 0);
+    cfg.num_register_tokens = extract_int32_value(agg_block, "num_register_tokens", 0);
+    cfg.frame_block_count = extract_int32_value(agg_block, "frame_block_count", 0);
+    cfg.global_block_count = extract_int32_value(agg_block, "global_block_count", 0);
+    cfg.camera_trunk_block_count = extract_int32_value(agg_block, "camera_trunk_block_count", 0);
+    cfg.camera_num_iterations = extract_int32_value(agg_block, "camera_num_iterations", 4);
+    cfg.ggml_threads = extract_int32_value(agg_block, "ggml_threads", 8);
+    cfg.aggregator_selected_layers = extract_int32_array(agg_block, "selected_layers");
+    if (cfg.aggregator_selected_layers.empty()) { cfg.aggregator_selected_layers = { 4, 11, 17, 23 }; }
+    cfg.output_pose = extract_bool_value(post_block, "output_pose", true);
+    cfg.output_depth = extract_bool_value(post_block, "output_depth", true);
+    cfg.output_point_cloud = extract_bool_value(post_block, "output_point_cloud", true);
+    if (cfg.architectures.empty()) {
+        throw std::runtime_error("LingBot-MAP config requires architectures");
+    }
+    if (cfg.vision_model_path.empty() || cfg.aggregator_camera_model_path.empty() || cfg.depth_model_path.empty()) {
+        throw std::runtime_error("LingBot-MAP config contains empty model_path");
+    }
+    for (const int32_t layer_idx : cfg.aggregator_selected_layers) {
+        if (layer_idx < 0 || layer_idx >= cfg.frame_block_count) {
+            throw std::runtime_error("LingBot-MAP aggregator selected_layers contains an invalid layer index");
+        }
+    }
+    for (const auto & path : { cfg.vision_model_path, cfg.aggregator_camera_model_path, cfg.depth_model_path }) {
+        if (!file_exists(path)) {
+            throw std::runtime_error("LingBot-MAP model file not found: " + path);
+        }
+    }
+    return cfg;
+}
+
+// Open the GGUF at cfg.aggregator_camera_model_path and check it against cfg; throws std::runtime_error on any
+// failed check. The returned struct holds both the gguf metadata context and the ggml tensor context.
+static lingbot_map_loaded_gguf load_and_validate_gguf(const lingbot_map_config & cfg) {
+    ggml_context * ggml_raw = nullptr;
+    gguf_init_params params = {
+        /*.no_alloc =*/ false,
+        /*.ctx      =*/ &ggml_raw,
+    };
+    lingbot_map_loaded_gguf loaded;
+    loaded.gguf.reset(gguf_init_from_file(cfg.aggregator_camera_model_path.c_str(), params));
+    loaded.ggml.reset(ggml_raw);  // ggml_raw is the context handed to gguf_init_from_file through params.ctx
+    if (!loaded.gguf || !loaded.ggml) {
+        throw std::runtime_error("failed to open LingBot-MAP GGUF: " + cfg.aggregator_camera_model_path);
+    }
+    // Identity checks: the architecture and component strings must match exactly.
+    const gguf_context * gguf = loaded.gguf.get();
+    const std::string arch = require_gguf_string(gguf, "general.architecture");
+    if (arch != "lingbot-map") {
+        throw std::runtime_error("expected LingBot-MAP GGUF architecture 'lingbot-map', got '" + arch + "'");
+    }
+    const std::string component = require_gguf_string(gguf, "lingbot-map.component");
+    if (component != "aggregator_camera_head") {
+        throw std::runtime_error("unsupported LingBot-MAP GGUF component: " + component);
+    }
+    // general.file_type must be present as uint32; its value is not used.
+    const uint32_t file_type = require_gguf_u32(gguf, "general.file_type");
+    (void) file_type;
+    // Dimensions and block counts stored in the GGUF must equal the values parsed from config.json.
+    const uint32_t embed_dim = require_gguf_u32(gguf, "lingbot-map.embed_dim");
+    const uint32_t camera_dim = require_gguf_u32(gguf, "lingbot-map.camera_dim");
+    const uint32_t special_tokens = require_gguf_u32(gguf, "lingbot-map.num_special_tokens");
+    const uint32_t frame_blocks = require_gguf_u32(gguf, "lingbot-map.aggregator_frame_block_count");
+    const uint32_t global_blocks = require_gguf_u32(gguf, "lingbot-map.aggregator_global_block_count");
+    const uint32_t camera_blocks = require_gguf_u32(gguf, "lingbot-map.camera_trunk_block_count");
+    if ((uint32_t) cfg.hidden_size != embed_dim || (uint32_t) cfg.camera_hidden_size != camera_dim ||
+        (uint32_t) cfg.num_special_tokens != special_tokens || (uint32_t) cfg.frame_block_count != frame_blocks ||
+        (uint32_t) cfg.global_block_count != global_blocks || (uint32_t) cfg.camera_trunk_block_count != camera_blocks) {
+        throw std::runtime_error("LingBot-MAP config.json does not match GGUF metadata");
+    }
+    // Two boundary tensors must be present both in the GGUF tensor table and in the ggml context.
+    if (gguf_find_tensor(gguf, "aggregator.camera_token") < 0 ||
+        gguf_find_tensor(gguf, "camera_head.pose_branch.fc2.bias") < 0 ||
+        ggml_get_tensor(loaded.ggml.get(), "aggregator.camera_token") == nullptr ||
+        ggml_get_tensor(loaded.ggml.get(), "camera_head.pose_branch.fc2.bias") == nullptr) {
+        throw std::runtime_error("LingBot-MAP GGUF is missing required boundary tensors");
+    }
+    validate_lingbot_map_aggregator_block_shapes(loaded.ggml.get(), cfg);
+    validate_lingbot_map_camera_head_shapes(loaded.ggml.get(), cfg);
+    return loaded;
+}
+
+// Return tensor->data as const float *; throws std::runtime_error when tensor is null, not F32, or has no data.
+static const float * lingbot_tensor_f32_data(const ggml_tensor * tensor, const std::string & name) {
+    if (tensor == nullptr) {
+        throw std::runtime_error("missing LingBot-MAP tensor: " + name);
+    }
+    const bool loaded_as_f32 = tensor->type == GGML_TYPE_F32 && tensor->data != nullptr;
+    if (!loaded_as_f32) { throw std::runtime_error("LingBot-MAP tensor must be loaded as F32: " + name); }
+    return static_cast<const float *>(tensor->data);
+}
+
+} // namespace
+
+struct lingbot_map_context::impl {  // private state reached through lingbot_map_context::pimpl_
+    lingbot_map_config config;  // result of load_lingbot_map_config()
+    gguf_context_ptr gguf;  // GGUF metadata handle moved in by create()
+    ggml_context_ptr ggml;  // ggml context that tensor() and ggml_ctx() look tensors up in
+    std::string arch_name;  // first entry of config.architectures, or empty
+    // Runtime backend state; create() leaves these members at their defaults.
+    ggml_backend_ptr runtime_backend;
+    ggml_backend_buffer_type_t runtime_buft = nullptr;
+    lingbot_map_runtime_weights runtime_weights;
+    bool runtime_initialized = false;
+    bool runtime_prefer_smt = true;
+};
+
+lingbot_map_context::~lingbot_map_context() = default;  // defaulted here, after the definition of impl above
+
+// Build a context from config_dir: parse config.json, load and validate the GGUF, then log a summary to stderr.
+std::unique_ptr<lingbot_map_context> lingbot_map_context::create(const std::string & config_dir) {
+    std::unique_ptr<lingbot_map_context> ctx(new lingbot_map_context());
+    ctx->pimpl_ = std::make_unique<impl>();
+    impl & state = *ctx->pimpl_;
+    state.config = load_lingbot_map_config(config_dir);
+    auto loaded = load_and_validate_gguf(state.config);
+    state.gguf = std::move(loaded.gguf);
+    state.ggml = std::move(loaded.ggml);
+    state.arch_name = state.config.architectures.empty() ? std::string() : state.config.architectures[0];
+    std::cerr << "[LingBot-MAP] loaded config and GGUF: " << state.config.aggregator_camera_model_path << ", tensors=" << gguf_get_n_tensors(state.gguf.get()) << "\n";
+    return ctx;
+}
+
+const lingbot_map_config & lingbot_map_context::config() const {  // settings parsed from config.json by create()
+    return pimpl_->config;
+}
+
+const std::string & lingbot_map_context::architecture() const {  // first "architectures" entry stored by create()
+    return pimpl_->arch_name;
+}
+
+int64_t lingbot_map_context::tensor_count() const {  // tensor count reported by the loaded GGUF
+    return gguf_get_n_tensors(pimpl_->gguf.get());
+}
+
+ggml_context * lingbot_map_context::ggml_ctx() const {  // non-owning pointer; the context stays owned by the pimpl
+    return pimpl_->ggml.get();
+}
+
+// Look up a weight tensor by name. Returns nullptr when no ggml context is loaded or when
+// ggml_get_tensor() finds no tensor with that name.
+const ggml_tensor * lingbot_map_context::tensor(const std::string & name) const {
+    ggml_context * weights = pimpl_->ggml.get();
+    return weights == nullptr ? nullptr : ggml_get_tensor(weights, name.c_str());
+}
+
+
+// Assemble the aggregator token buffer. For every frame the output holds, in order: the camera token, the
+// register tokens, the scale token, and the frame's patch tokens; all tokens are hidden_size floats wide.
+// Special tokens are copied from the loaded F32 tensors, patch tokens from vit_tokens.
+// Throws std::invalid_argument for bad arguments and std::runtime_error for inconsistent config or tensors.
+lingbot_map_aggregator_input lingbot_map_context::build_aggregator_input(
+        const float * vit_tokens,
+        int32_t       n_frames,
+        int32_t       vit_tokens_per_frame,
+        int32_t       hidden_size,
+        int32_t       image_h,
+        int32_t       image_w,
+        int32_t       num_frame_for_scale) const {
+    if (vit_tokens == nullptr) {
+        throw std::invalid_argument("LingBot-MAP aggregator input requires ViT tokens");
+    }
+    const auto & cfg = config();
+    if (n_frames <= 0 || vit_tokens_per_frame <= 0 || hidden_size != cfg.hidden_size) {
+        throw std::invalid_argument("Invalid LingBot-MAP ViT token shape for aggregator");
+    }
+    if (image_h <= 0 || image_w <= 0 || cfg.patch_size <= 0) {
+        throw std::invalid_argument("Invalid LingBot-MAP image dimensions for aggregator");
+    }
+    // Patch grid from integer division of the image size; extra ViT tokens per frame are treated as a prefix.
+    const int32_t patch_h = image_h / cfg.patch_size;
+    const int32_t patch_w = image_w / cfg.patch_size;
+    const int32_t patch_tokens = patch_h * patch_w;
+    if (patch_tokens <= 0 || vit_tokens_per_frame < patch_tokens) {
+        throw std::invalid_argument("LingBot-MAP ViT output does not contain enough patch tokens for aggregator");
+    }
+    // One camera token + num_register_tokens + one scale token precede the patches in the output.
+    const int32_t vit_prefix_tokens = vit_tokens_per_frame - patch_tokens;
+    const int32_t patch_start_idx = 1 + cfg.num_register_tokens + 1;
+    if (patch_start_idx != cfg.num_special_tokens) {
+        throw std::runtime_error("LingBot-MAP special token metadata is inconsistent");
+    }
+    const ggml_tensor * camera_tensor = tensor("aggregator.camera_token");
+    const ggml_tensor * register_tensor = tensor("aggregator.register_token");
+    const ggml_tensor * scale_tensor = tensor("aggregator.scale_token");
+    const float * camera_token = lingbot_tensor_f32_data(camera_tensor, "aggregator.camera_token");
+    const float * register_token = lingbot_tensor_f32_data(register_tensor, "aggregator.register_token");
+    const float * scale_token = lingbot_tensor_f32_data(scale_tensor, "aggregator.scale_token");
+    // Each special-token tensor must be (hidden_size, token count, 2): two variants per token group.
+    if (camera_tensor->ne[0] != hidden_size || camera_tensor->ne[1] != 1 || camera_tensor->ne[2] != 2 ||
+        register_tensor->ne[0] != hidden_size || register_tensor->ne[1] != cfg.num_register_tokens || register_tensor->ne[2] != 2 ||
+        scale_tensor->ne[0] != hidden_size || scale_tensor->ne[1] != 1 || scale_tensor->ne[2] != 2) {
+        throw std::runtime_error("LingBot-MAP special token tensor shapes do not match config");
+    }
+    lingbot_map_aggregator_input out;
+    out.n_frames = n_frames;
+    out.hidden_size = hidden_size;
+    out.vit_tokens_per_frame = vit_tokens_per_frame;
+    out.vit_prefix_tokens = vit_prefix_tokens;
+    out.patch_tokens = patch_tokens;
+    out.patch_start_idx = patch_start_idx;
+    out.tokens_per_frame = patch_start_idx + patch_tokens;
+    out.tokens.resize((size_t) n_frames * (size_t) out.tokens_per_frame * (size_t) hidden_size);
+    // copy_token_variant copies n_token tokens of the chosen variant into frame `frame`, starting at slot dst_token.
+    const int32_t scale_frames = std::max(1, std::min(num_frame_for_scale, n_frames));  // clamped to [1, n_frames]
+    auto copy_token_variant = [&](const float * token_base, int32_t variant, int32_t n_token, int32_t frame, int32_t dst_token) {
+        const size_t src_base = ((size_t) variant * (size_t) n_token) * (size_t) hidden_size;
+        const size_t dst_base = ((size_t) frame * (size_t) out.tokens_per_frame + (size_t) dst_token) * (size_t) hidden_size;
+        std::copy(token_base + src_base, token_base + src_base + (size_t) n_token * (size_t) hidden_size,
+                  out.tokens.data() + dst_base);
+    };
+
+    for (int32_t f = 0; f < n_frames; ++f) {
+        const int32_t camera_variant = f == 0 ? 0 : 1;  // frame 0 uses variant 0, all later frames variant 1
+        const int32_t register_variant = f == 0 ? 0 : 1;
+        const int32_t scale_variant = f < scale_frames ? 0 : 1;  // the first scale_frames frames use variant 0
+
+        copy_token_variant(camera_token, camera_variant, 1, f, 0);
+        copy_token_variant(register_token, register_variant, cfg.num_register_tokens, f, 1);
+        copy_token_variant(scale_token, scale_variant, 1, f, 1 + cfg.num_register_tokens);
+        // Patch tokens are the last patch_tokens tokens of this frame's ViT output; the prefix tokens are skipped.
+        const float * vit_frame = vit_tokens + (size_t) f * (size_t) vit_tokens_per_frame * (size_t) hidden_size;
+        const float * patch_src = vit_frame + (size_t) vit_prefix_tokens * (size_t) hidden_size;
+        float * patch_dst = out.tokens.data() + ((size_t) f * (size_t) out.tokens_per_frame + (size_t) patch_start_idx) * (size_t) hidden_size;
+        std::copy(patch_src, patch_src + (size_t) patch_tokens * (size_t) hidden_size, patch_dst);
+    }
+    return out;
+}
+
+// Fetch tensor `name` from ctx; throws std::runtime_error instead of returning nullptr when it is absent.
+static ggml_tensor * lingbot_require_tensor(ggml_context * ctx, const std::string & name) {
+    if (ggml_tensor * found = ggml_get_tensor(ctx, name.c_str())) {
+        return found;
+    }
+    throw std::runtime_error("missing LingBot-MAP tensor: " + name);
+}
+
+// Append ggml_norm(input, eps) * weight + bias to the graph and return the final node.
+static ggml_tensor * lingbot_layer_norm(
+        ggml_context *      ctx,
+        ggml_tensor *       input,
+        ggml_tensor *       weight,
+        ggml_tensor *       bias,
+        float               eps) {
+    ggml_tensor * normalized = ggml_norm(ctx, input, eps);
+    ggml_tensor * scaled = ggml_mul(ctx, normalized, weight);
+    return ggml_add(ctx, scaled, bias);
+}
+
+// Linear layer node: ggml_mul_mat(weight, input), followed by a bias add when a bias is given.
+// Returns the last node that was created.
+static ggml_tensor * lingbot_linear(
+        ggml_context *      ctx,
+        ggml_tensor *       input,
+        ggml_tensor *       weight,
+        ggml_tensor *       bias) {
+    ggml_tensor * product = ggml_mul_mat(ctx, weight, input);
+    // A null bias means the layer has no bias term; no add node is created in that case.
+    return bias == nullptr ? product : ggml_add(ctx, product, bias);
+}
+
+// Two-layer MLP graph: fc2(gelu(fc1(input))), with both layers built by lingbot_linear().
+static ggml_tensor * lingbot_mlp_gelu(
+        ggml_context *      ctx,
+        ggml_tensor *       input,
+        ggml_tensor *       fc1_w,
+        ggml_tensor *       fc1_b,
+        ggml_tensor *       fc2_w,
+        ggml_tensor *       fc2_b) {
+    ggml_tensor * hidden = lingbot_linear(ctx, input, fc1_w, fc1_b);
+    ggml_tensor * activated = ggml_gelu(ctx, hidden);
+    return lingbot_linear(ctx, activated, fc2_w, fc2_b);
+}
+
+// View slice `index` of a fused qkv tensor (callers use 0 = q, 1 = k, 2 = v): hidden_size elements along dim 0,
+// starting index * hidden_size * ggml_type_size(type) bytes in, with qkv's own dim-1/dim-2 extents and strides.
+static ggml_tensor * lingbot_qkv_view(
+        ggml_context * ctx,
+        ggml_tensor *  qkv,
+        int64_t        hidden_size,
+        int            index) {
+    const size_t slice_bytes = (size_t) hidden_size * ggml_type_size(qkv->type);
+    const size_t byte_offset = (size_t) index * slice_bytes;
+    return ggml_view_3d(ctx, qkv,
+                        hidden_size, qkv->ne[1], qkv->ne[2],
+                        qkv->nb[1], qkv->nb[2], byte_offset);
+}
+
+// Split x's dim 0 into (head_dim, n_heads): the 4-D view has extents (head_dim, n_heads, x->ne[1], x->ne[2]),
+// reuses x's dim-1/dim-2 byte strides for its last two dims, and starts at byte offset 0.
+static ggml_tensor * lingbot_head_view(
+        ggml_context * ctx,
+        ggml_tensor *  x,
+        int64_t        head_dim,
+        int64_t        n_heads) {
+    const size_t head_stride = (size_t) head_dim * ggml_type_size(x->type);
+    return ggml_view_4d(ctx, x,
+                        head_dim, n_heads,      // ne0, ne1
+                        x->ne[1], x->ne[2],     // ne2, ne3
+                        head_stride,            // nb1: bytes from one head to the next
+                        x->nb[1], x->nb[2],     // nb2, nb3
+                        0);
+}
+
+// Self-attention over a fused qkv tensor with per-head layer norm on q and k, followed by the output projection.
+// head_dim is q_norm_w->ne[0]; throws std::runtime_error when the norm tensors disagree with hidden_size.
+static ggml_tensor * lingbot_frame_self_attention(
+        ggml_context * ctx,
+        ggml_tensor *  qkv,
+        ggml_tensor *  q_norm_w,
+        ggml_tensor *  q_norm_b,
+        ggml_tensor *  k_norm_w,
+        ggml_tensor *  k_norm_b,
+        ggml_tensor *  proj_w,
+        ggml_tensor *  proj_b,
+        int64_t        hidden_size) {
+    const int64_t head_dim = q_norm_w->ne[0];
+    if (head_dim <= 0 || hidden_size % head_dim != 0 || k_norm_w->ne[0] != head_dim ||
+        q_norm_b->ne[0] != head_dim || k_norm_b->ne[0] != head_dim) {
+        throw std::runtime_error("LingBot-MAP q/k norm shapes do not match hidden size");
+    }
+    const int64_t n_heads = hidden_size / head_dim;
+    ggml_tensor * q = lingbot_qkv_view(ctx, qkv, hidden_size, 0);
+    ggml_tensor * k = lingbot_qkv_view(ctx, qkv, hidden_size, 1);
+    ggml_tensor * v = lingbot_qkv_view(ctx, qkv, hidden_size, 2);
+    // Split each slice into heads: extents become (head_dim, n_heads, ne[1], ne[2]).
+    q = lingbot_head_view(ctx, q, head_dim, n_heads);
+    k = lingbot_head_view(ctx, k, head_dim, n_heads);
+    v = lingbot_head_view(ctx, v, head_dim, n_heads);
+    // Only q and k are normalized (over head_dim, eps 1e-6); v is used as is.
+    q = lingbot_layer_norm(ctx, q, q_norm_w, q_norm_b, 1e-6f);
+    k = lingbot_layer_norm(ctx, k, k_norm_w, k_norm_b, 1e-6f);
+    // Swap dims 1 and 2 of every view before handing them to the attention op.
+    q = ggml_permute(ctx, q, 0, 2, 1, 3);
+    k = ggml_permute(ctx, k, 0, 2, 1, 3);
+    v = ggml_permute(ctx, v, 0, 2, 1, 3);
+    // No mask; scale 1/sqrt(head_dim). NOTE(review): trailing 0.0f args look like max_bias and logit_softcap - confirm in ggml.h.
+    ggml_tensor * attn = ggml_flash_attn_ext(ctx, q, k, v, nullptr, 1.0f / std::sqrt((float) head_dim), 0.0f, 0.0f);
+    ggml_flash_attn_ext_set_prec(attn, GGML_PREC_F32);
+    attn = ggml_cont_3d(ctx, attn, hidden_size, qkv->ne[1], qkv->ne[2]);
+    return lingbot_linear(ctx, attn, proj_w, proj_b);
+}
+
+// Append one aggregator block to the graph: x + ls1 * attn(norm1(x)), then + ls2 * mlp(norm2(...)).
+// Weights are looked up as "<prefix>.*" in weights_ctx (throws if one is missing); nodes are named "<graph_name>.*".
+static ggml_tensor * lingbot_apply_aggregator_block(
+        ggml_context *      ctx,
+        ggml_context *      weights_ctx,
+        const lingbot_map_config & cfg,
+        ggml_tensor *       x,
+        const std::string & prefix,
+        const std::string & graph_name,
+        ggml_tensor **      qkv_out) {
+    ggml_tensor * norm1_w = lingbot_require_tensor(weights_ctx, prefix + ".norm1.weight");
+    ggml_tensor * norm1_b = lingbot_require_tensor(weights_ctx, prefix + ".norm1.bias");
+    ggml_tensor * qkv_w   = lingbot_require_tensor(weights_ctx, prefix + ".attn.qkv.weight");
+    ggml_tensor * qkv_b   = lingbot_require_tensor(weights_ctx, prefix + ".attn.qkv.bias");
+    ggml_tensor * q_norm_w = lingbot_require_tensor(weights_ctx, prefix + ".attn.q_norm.weight");
+    ggml_tensor * q_norm_b = lingbot_require_tensor(weights_ctx, prefix + ".attn.q_norm.bias");
+    ggml_tensor * k_norm_w = lingbot_require_tensor(weights_ctx, prefix + ".attn.k_norm.weight");
+    ggml_tensor * k_norm_b = lingbot_require_tensor(weights_ctx, prefix + ".attn.k_norm.bias");
+    ggml_tensor * proj_w  = lingbot_require_tensor(weights_ctx, prefix + ".attn.proj.weight");
+    ggml_tensor * proj_b  = lingbot_require_tensor(weights_ctx, prefix + ".attn.proj.bias");
+    ggml_tensor * ls1     = lingbot_require_tensor(weights_ctx, prefix + ".ls1.gamma");
+    ggml_tensor * norm2_w = lingbot_require_tensor(weights_ctx, prefix + ".norm2.weight");
+    ggml_tensor * norm2_b = lingbot_require_tensor(weights_ctx, prefix + ".norm2.bias");
+    ggml_tensor * fc1_w   = lingbot_require_tensor(weights_ctx, prefix + ".mlp.fc1.weight");
+    ggml_tensor * fc1_b   = lingbot_require_tensor(weights_ctx, prefix + ".mlp.fc1.bias");
+    ggml_tensor * fc2_w   = lingbot_require_tensor(weights_ctx, prefix + ".mlp.fc2.weight");
+    ggml_tensor * fc2_b   = lingbot_require_tensor(weights_ctx, prefix + ".mlp.fc2.bias");
+    ggml_tensor * ls2     = lingbot_require_tensor(weights_ctx, prefix + ".ls2.gamma");
+    // Attention branch: norm1 -> fused qkv projection (optionally handed back through qkv_out) -> attention.
+    ggml_tensor * normed = lingbot_layer_norm(ctx, x, norm1_w, norm1_b, 1e-6f);
+    ggml_tensor * qkv = lingbot_linear(ctx, normed, qkv_w, qkv_b);
+    ggml_set_name(qkv, (graph_name + ".qkv").c_str());
+    if (qkv_out != nullptr) {
+        *qkv_out = qkv;
+    }
+    ggml_tensor * attn = lingbot_frame_self_attention(ctx, qkv, q_norm_w, q_norm_b, k_norm_w, k_norm_b,
+                                                      proj_w, proj_b, cfg.hidden_size);
+    attn = ggml_mul(ctx, attn, ls1);  // scale the branch by ls1.gamma before the residual add
+    ggml_tensor * attn_out = ggml_add(ctx, x, attn);
+    ggml_set_name(attn_out, (graph_name + ".attn_output").c_str());
+    // MLP branch: norm2 -> fc1 -> GELU -> fc2, scaled by ls2.gamma and added to the attention output.
+    ggml_tensor * ffn_inp = lingbot_layer_norm(ctx, attn_out, norm2_w, norm2_b, 1e-6f);
+    ggml_tensor * ffn = lingbot_mlp_gelu(ctx, ffn_inp, fc1_w, fc1_b, fc2_w, fc2_b);
+    ffn = ggml_mul(ctx, ffn, ls2);
+    ggml_tensor * out = ggml_add(ctx, attn_out, ffn);
+    ggml_set_name(out, (graph_name + ".output").c_str());
+    return out;
+}
+
+// Self-attention for the camera head over a fused qkv tensor with a fixed 16 heads and no q/k normalization,
+// followed by the output projection. Throws std::runtime_error when hidden_size is not a positive multiple of 16.
+static ggml_tensor * lingbot_camera_self_attention(
+        ggml_context * ctx,
+        ggml_tensor *  qkv,
+        ggml_tensor *  proj_w,
+        ggml_tensor *  proj_b,
+        int64_t        hidden_size) {
+    const int64_t n_heads = 16;
+    if (hidden_size <= 0 || hidden_size % n_heads != 0) {
+        throw std::runtime_error("LingBot-MAP camera hidden size must be divisible by 16 attention heads");
+    }
+    const int64_t head_dim = hidden_size / n_heads;
+    ggml_tensor * q = lingbot_qkv_view(ctx, qkv, hidden_size, 0);
+    ggml_tensor * k = lingbot_qkv_view(ctx, qkv, hidden_size, 1);
+    ggml_tensor * v = lingbot_qkv_view(ctx, qkv, hidden_size, 2);
+    // Split each slice into heads: extents become (head_dim, n_heads, ne[1], ne[2]).
+    q = lingbot_head_view(ctx, q, head_dim, n_heads);
+    k = lingbot_head_view(ctx, k, head_dim, n_heads);
+    v = lingbot_head_view(ctx, v, head_dim, n_heads);
+    // Swap dims 1 and 2 of every view before handing them to the attention op.
+    q = ggml_permute(ctx, q, 0, 2, 1, 3);
+    k = ggml_permute(ctx, k, 0, 2, 1, 3);
+    v = ggml_permute(ctx, v, 0, 2, 1, 3);
+    // No mask; scale 1/sqrt(head_dim). NOTE(review): trailing 0.0f args look like max_bias and logit_softcap - confirm in ggml.h.
+    ggml_tensor * attn = ggml_flash_attn_ext(ctx, q, k, v, nullptr, 1.0f / std::sqrt((float) head_dim), 0.0f, 0.0f);
+    ggml_flash_attn_ext_set_prec(attn, GGML_PREC_F32);
+    attn = ggml_cont_3d(ctx, attn, hidden_size, qkv->ne[1], qkv->ne[2]);
+    return lingbot_linear(ctx, attn, proj_w, proj_b);
+}
+
+// Pre-norm transformer block of the camera_head trunk. With weights resolved under `prefix`:
+//   h = x + ls1 * attention(layer_norm1(x));  out = h + ls2 * mlp(layer_norm2(h))
+static ggml_tensor * lingbot_apply_camera_trunk_block(
+        ggml_context *      ctx,
+        ggml_context *      weights_ctx,
+        const lingbot_map_config & cfg,
+        ggml_tensor *       x,
+        const std::string & prefix,
+        const std::string & graph_name) {
+    const auto weight = [&](const char * suffix) { return lingbot_require_tensor(weights_ctx, prefix + suffix); };
+    ggml_tensor * ln1_w      = weight(".norm1.weight");
+    ggml_tensor * ln1_b      = weight(".norm1.bias");
+    ggml_tensor * attn_in_w  = weight(".attn.qkv.weight");
+    ggml_tensor * attn_in_b  = weight(".attn.qkv.bias");
+    ggml_tensor * attn_out_w = weight(".attn.proj.weight");
+    ggml_tensor * attn_out_b = weight(".attn.proj.bias");
+    ggml_tensor * ls1_gamma  = weight(".ls1.gamma");
+    ggml_tensor * ln2_w      = weight(".norm2.weight");
+    ggml_tensor * ln2_b      = weight(".norm2.bias");
+    ggml_tensor * mlp_in_w   = weight(".mlp.fc1.weight");
+    ggml_tensor * mlp_in_b   = weight(".mlp.fc1.bias");
+    ggml_tensor * mlp_out_w  = weight(".mlp.fc2.weight");
+    ggml_tensor * mlp_out_b  = weight(".mlp.fc2.bias");
+    ggml_tensor * ls2_gamma  = weight(".ls2.gamma");
+    // Attention branch: the fused QKV projection is named so it can be found in the graph.
+    ggml_tensor * qkv = lingbot_linear(ctx, lingbot_layer_norm(ctx, x, ln1_w, ln1_b, 1e-6f), attn_in_w, attn_in_b);
+    ggml_set_name(qkv, (graph_name + ".qkv").c_str());
+    ggml_tensor * attn_branch = lingbot_camera_self_attention(ctx, qkv, attn_out_w, attn_out_b, cfg.camera_hidden_size);
+    ggml_tensor * after_attn = ggml_add(ctx, x, ggml_mul(ctx, attn_branch, ls1_gamma));
+    ggml_set_name(after_attn, (graph_name + ".attn_output").c_str());
+
+    // Feed-forward branch, scaled by ls2 and added back onto the attention residual.
+    ggml_tensor * mlp_in = lingbot_layer_norm(ctx, after_attn, ln2_w, ln2_b, 1e-6f);
+    ggml_tensor * mlp_branch = lingbot_mlp_gelu(ctx, mlp_in, mlp_in_w, mlp_in_b, mlp_out_w, mlp_out_b);
+    ggml_tensor * result = ggml_add(ctx, after_attn, ggml_mul(ctx, mlp_branch, ls2_gamma));
+    ggml_set_name(result, (graph_name + ".output").c_str());
+    return result;
+}
+
+// Pose branch of the camera head: passes `x` through lingbot_mlp_gelu using the
+// camera_head.pose_branch.fc1 / fc2 weights and biases.
+static ggml_tensor * lingbot_pose_branch(ggml_context * ctx, ggml_context * weights_ctx, ggml_tensor * x) {
+    // Weights are looked up in the order fc1 weight/bias, then fc2 weight/bias.
+    ggml_tensor * w_in  = lingbot_require_tensor(weights_ctx, "camera_head.pose_branch.fc1.weight");
+    ggml_tensor * b_in  = lingbot_require_tensor(weights_ctx, "camera_head.pose_branch.fc1.bias");
+    ggml_tensor * w_out = lingbot_require_tensor(weights_ctx, "camera_head.pose_branch.fc2.weight");
+    ggml_tensor * b_out = lingbot_require_tensor(weights_ctx, "camera_head.pose_branch.fc2.bias");
+    return lingbot_mlp_gelu(ctx, x, w_in, b_in, w_out, b_out);
+}
+
+
+// Applies the output activation to a 9-channel pose: channels 0..6 pass through unchanged,
+// channels 7..8 go through ReLU, and both parts are joined again along dim 0.
+static ggml_tensor * lingbot_activate_pose(ggml_context * ctx, ggml_tensor * pred_pose) {
+    if (pred_pose->ne[0] != 9) {
+        throw std::runtime_error("LingBot-MAP camera_head pose activation expects 9 pose channels");
+    }
+    const int64_t rows   = pred_pose->ne[1];
+    const int64_t planes = pred_pose->ne[2];
+    const size_t  tail_offset = 7 * ggml_type_size(pred_pose->type);  // bytes to channel 7 (unit element stride assumed)
+    // Both views reuse the parent's nb[1]/nb[2] strides, so they alias pred_pose instead of copying it.
+    ggml_tensor * head = ggml_view_3d(ctx, pred_pose, 7, rows, planes, pred_pose->nb[1], pred_pose->nb[2], 0);
+    ggml_tensor * tail = ggml_view_3d(ctx, pred_pose, 2, rows, planes, pred_pose->nb[1], pred_pose->nb[2], tail_offset);
+    return ggml_concat(ctx, head, ggml_relu(ctx, tail), 0);
+}
+
+// Picks the buffer type for runtime weights. Without `prefer_smt` this is the backend default.
+// Otherwise the first extra buffer type whose name contains "SPACEMIT" and that the backend
+// supports is returned; the default is used when no such buffer type is found.
+static ggml_backend_buffer_type_t lingbot_select_cpu_buffer_type(
+        ggml_backend_t backend,
+        bool           prefer_smt) {
+    ggml_backend_buffer_type_t fallback = ggml_backend_get_default_buffer_type(backend);
+    if (!prefer_smt) {
+        return fallback;
+    }
+
+    // The extra-buffer-type query is looked up by name in the backend registry and may be absent.
+    ggml_backend_dev_t device = ggml_backend_get_device(backend);
+    const auto query = reinterpret_cast<ggml_backend_dev_get_extra_bufts_t>(
+            ggml_backend_reg_get_proc_address(ggml_backend_dev_backend_reg(device), "ggml_backend_dev_get_extra_bufts"));
+    ggml_backend_buffer_type_t * candidates = query != nullptr ? query(device) : nullptr;
+    if (candidates == nullptr) {
+        return fallback;
+    }
+    // The candidate array is terminated by a nullptr entry.
+    for (ggml_backend_buffer_type_t * it = candidates; *it != nullptr; ++it) {
+        const char * label = ggml_backend_buft_name(*it);
+        const bool is_spacemit = label != nullptr && std::strstr(label, "SPACEMIT") != nullptr;
+        if (is_spacemit && ggml_backend_supports_buft(backend, *it)) {
+            return *it;
+        }
+    }
+    return fallback;
+}
+
+// Opens the aggregator/camera_head GGUF metadata-only, validates tensor shapes against `cfg`,
+// allocates all tensors in a buffer of type `buft` and streams each payload from the file.
+static lingbot_map_runtime_weights lingbot_load_runtime_weights(
+        const lingbot_map_config & cfg,
+        ggml_backend_buffer_type_t buft) {
+    ggml_context * ggml_raw = nullptr;
+    gguf_init_params params = {
+        /*.no_alloc =*/ true,
+        /*.ctx      =*/ &ggml_raw,
+    };
+    lingbot_map_runtime_weights weights;
+    weights.gguf.reset(gguf_init_from_file(cfg.aggregator_camera_model_path.c_str(), params));
+    weights.ggml.reset(ggml_raw);
+    if (!weights.gguf || !weights.ggml) {
+        throw std::runtime_error("failed to open LingBot-MAP GGUF for runtime: " + cfg.aggregator_camera_model_path);
+    }
+    validate_lingbot_map_aggregator_block_shapes(weights.ggml.get(), cfg);
+    validate_lingbot_map_camera_head_shapes(weights.ggml.get(), cfg);
+
+    weights.buffer.reset(ggml_backend_alloc_ctx_tensors_from_buft(weights.ggml.get(), buft));
+    if (!weights.buffer) {
+        throw std::runtime_error(std::string("failed to allocate LingBot-MAP runtime weights on buffer type: ") +
+                                 ggml_backend_buft_name(buft));
+    }
+    ggml_backend_buffer_set_usage(weights.buffer.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+    std::ifstream fin(cfg.aggregator_camera_model_path, std::ios::binary);
+    if (!fin.is_open()) {
+        throw std::runtime_error("failed to read LingBot-MAP GGUF weights: " + cfg.aggregator_camera_model_path);
+    }
+
+    const size_t data_start = gguf_get_data_offset(weights.gguf.get());  // loop invariant: start of the tensor data section
+    const bool   host_buft  = ggml_backend_buft_is_host(buft);           // loop invariant: tensor->data is written directly
+    std::vector<uint8_t> staging;  // only used when the buffer type is not host-accessible
+    const int64_t n_tensors = gguf_get_n_tensors(weights.gguf.get());
+    for (int64_t i = 0; i < n_tensors; ++i) {
+        const char * name = gguf_get_tensor_name(weights.gguf.get(), i);
+        ggml_tensor * tensor = ggml_get_tensor(weights.ggml.get(), name);
+        if (tensor == nullptr) {
+            throw std::runtime_error(std::string("missing LingBot-MAP runtime tensor: ") + name);
+        }
+        const size_t nbytes = ggml_nbytes(tensor);
+        fin.seekg((std::streamoff) (data_start + gguf_get_tensor_offset(weights.gguf.get(), i)), std::ios::beg);
+        if (!fin) {
+            throw std::runtime_error(std::string("failed to seek LingBot-MAP runtime tensor: ") + name);
+        }
+        staging.resize(host_buft ? 0 : nbytes);
+        char * dst = host_buft ? reinterpret_cast<char *>(tensor->data) : reinterpret_cast<char *>(staging.data());
+        // Check the read before uploading so a short or failed read never reaches the backend buffer.
+        if (!fin.read(dst, (std::streamsize) nbytes)) {
+            throw std::runtime_error(std::string("failed to load LingBot-MAP runtime tensor: ") + name);
+        }
+        if (!host_buft) {
+            ggml_backend_tensor_set(tensor, staging.data(), 0, nbytes);
+        }
+    }
+    return weights;
+}
+// Builds the runtime graph: alternating frame/global aggregator blocks followed by the iterative camera_head pose refinement.
+static lingbot_map_runtime_graph lingbot_build_aggregator_camera_runtime_graph(
+        ggml_context *                       ctx,
+        ggml_context *                       weights_ctx,
+        const lingbot_map_config &           cfg,
+        const lingbot_map_aggregator_input & input) {
+    ggml_tensor * tokens = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, cfg.hidden_size,
+                                              input.tokens_per_frame, input.n_frames);
+    ggml_set_name(tokens, "lingbot_map.runtime.aggregator.input");
+    ggml_set_input(tokens);
+    // NOTE(review): block/iteration counts are not validated here; with frame_block_count <= 0 frame_tokens stays nullptr at the ggml_concat below - confirm callers check.
+    lingbot_map_runtime_graph built;
+    built.input_tokens = tokens;
+    built.selected_outputs.reserve(cfg.aggregator_selected_layers.size());
+    built.iteration_poses.reserve(cfg.camera_num_iterations);
+
+    ggml_tensor * frame_tokens = nullptr;
+    ggml_tensor * global_tokens = nullptr;
+    ggml_tensor * global_as_frame = nullptr;
+    // Each iteration runs frame block i on the per-frame layout, then global block i on all frames flattened into one sequence.
+    for (int32_t i = 0; i < cfg.frame_block_count; ++i) {
+        frame_tokens = lingbot_apply_aggregator_block(ctx, weights_ctx, cfg, tokens,
+                                                      "aggregator.frame_blocks." + std::to_string(i),
+                                                      "lingbot_map.runtime.aggregator.frame." + std::to_string(i),
+                                                      nullptr);
+        global_tokens = ggml_reshape_3d(ctx, frame_tokens, cfg.hidden_size,
+                                        (int64_t) input.tokens_per_frame * input.n_frames, 1);
+        global_tokens = lingbot_apply_aggregator_block(ctx, weights_ctx, cfg, global_tokens,
+                                                       "aggregator.global_blocks." + std::to_string(i),
+                                                       "lingbot_map.runtime.aggregator.global." + std::to_string(i),
+                                                       nullptr);
+        global_as_frame = ggml_reshape_3d(ctx, global_tokens, cfg.hidden_size, input.tokens_per_frame, input.n_frames);
+        // Selected layers export concat(frame, global) along dim 0, i.e. twice the channel width.
+        if (std::find(cfg.aggregator_selected_layers.begin(), cfg.aggregator_selected_layers.end(), i) !=
+            cfg.aggregator_selected_layers.end()) {
+            ggml_tensor * selected = ggml_concat(ctx, frame_tokens, global_as_frame, 0);
+            ggml_set_name(selected, ("lingbot_map.runtime.aggregator.selected." + std::to_string(i)).c_str());
+            built.selected_outputs.push_back(selected);
+            if (i == cfg.frame_block_count - 1) {
+                built.camera_head_input = selected;
+            }
+        }
+        // The global output, back in per-frame layout, is the input of the next frame block.
+        tokens = global_as_frame;
+    }
+    // Fall back to the last block's concat when the final layer was not among the selected ones.
+    if (built.camera_head_input == nullptr) {
+        built.camera_head_input = ggml_concat(ctx, frame_tokens, global_as_frame, 0);
+        ggml_set_name(built.camera_head_input, "lingbot_map.runtime.aggregator.camera_head_input");
+    }
+    if (built.camera_head_input->ne[0] != cfg.camera_hidden_size) {
+        throw std::runtime_error("LingBot-MAP runtime camera_head input width does not match camera_hidden_size");
+    }
+    // Token 0 of every frame is used as that frame's pose token: a row stride of nb[2] steps one frame at a time.
+    const int64_t pose_dim = 9;
+    ggml_tensor * pose_tokens = ggml_view_3d(ctx, built.camera_head_input,
+                                             cfg.camera_hidden_size, input.n_frames, 1,
+                                             built.camera_head_input->nb[2],
+                                             (size_t) built.camera_head_input->nb[2] * (size_t) input.n_frames,
+                                             0);
+    pose_tokens = lingbot_layer_norm(ctx, pose_tokens,
+                                     lingbot_require_tensor(weights_ctx, "camera_head.token_norm.weight"),
+                                     lingbot_require_tensor(weights_ctx, "camera_head.token_norm.bias"),
+                                     1e-6f);
+    ggml_set_name(pose_tokens, "lingbot_map.runtime.camera_head.pose_tokens");
+
+    ggml_tensor * empty_pose = lingbot_require_tensor(weights_ctx, "camera_head.empty_pose_tokens");
+    ggml_tensor * pred_pose = nullptr;
+    // Iterative refinement: every pass is conditioned on the previous pose estimate and adds a predicted delta to it.
+    for (int32_t iter = 0; iter < cfg.camera_num_iterations; ++iter) {
+        ggml_tensor * module_input = nullptr;
+        if (pred_pose == nullptr) {
+            ggml_tensor * empty_pose_target = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, pose_dim, input.n_frames, 1);
+            module_input = ggml_repeat(ctx, empty_pose, empty_pose_target);  // first pass: repeat the empty-pose token for every frame
+        } else {
+            module_input = pred_pose;
+        }
+
+        module_input = lingbot_linear(ctx, module_input,
+                                      lingbot_require_tensor(weights_ctx, "camera_head.embed_pose.weight"),
+                                      lingbot_require_tensor(weights_ctx, "camera_head.embed_pose.bias"));
+        // The modulation output is split along dim 0 into shift | scale | gate, each camera_hidden_size wide.
+        ggml_tensor * modulation = lingbot_linear(ctx,
+                                                  ggml_silu(ctx, module_input),
+                                                  lingbot_require_tensor(weights_ctx, "camera_head.poseLN_modulation.1.weight"),
+                                                  lingbot_require_tensor(weights_ctx, "camera_head.poseLN_modulation.1.bias"));
+        ggml_tensor * shift = ggml_view_3d(ctx, modulation, cfg.camera_hidden_size, input.n_frames, 1,
+                                           modulation->nb[1], modulation->nb[2], 0);
+        ggml_tensor * scale = ggml_view_3d(ctx, modulation, cfg.camera_hidden_size, input.n_frames, 1,
+                                           modulation->nb[1], modulation->nb[2],
+                                           (size_t) cfg.camera_hidden_size * ggml_type_size(modulation->type));
+        ggml_tensor * gate = ggml_view_3d(ctx, modulation, cfg.camera_hidden_size, input.n_frames, 1,
+                                          modulation->nb[1], modulation->nb[2],
+                                          (size_t) cfg.camera_hidden_size * 2 * ggml_type_size(modulation->type));
+        // modulated = gate * (norm(pose_tokens) * (1 + scale) + shift) + pose_tokens
+        ggml_tensor * adaln = ggml_norm(ctx, pose_tokens, 1e-6f);
+        ggml_tensor * scale_cont = ggml_cont(ctx, scale);
+        ggml_tensor * modulated = ggml_mul(ctx, adaln, ggml_scale_bias(ctx, scale_cont, 1.0f, 1.0f));
+        modulated = ggml_add(ctx, modulated, shift);
+        modulated = ggml_mul(ctx, modulated, gate);
+        modulated = ggml_add(ctx, modulated, pose_tokens);
+
+        for (int32_t block = 0; block < cfg.camera_trunk_block_count; ++block) {
+            modulated = lingbot_apply_camera_trunk_block(ctx, weights_ctx, cfg, modulated,
+                                                         "camera_head.trunk." + std::to_string(block),
+                                                         "lingbot_map.runtime.camera_head.iter." + std::to_string(iter) +
+                                                         ".trunk." + std::to_string(block));
+        }
+
+        ggml_tensor * trunk_norm = lingbot_layer_norm(ctx, modulated,
+                                                      lingbot_require_tensor(weights_ctx, "camera_head.trunk_norm.weight"),
+                                                      lingbot_require_tensor(weights_ctx, "camera_head.trunk_norm.bias"),
+                                                      1e-6f);
+        ggml_tensor * delta = lingbot_pose_branch(ctx, weights_ctx, trunk_norm);
+        pred_pose = pred_pose == nullptr ? delta : ggml_add(ctx, pred_pose, delta);
+        ggml_set_name(pred_pose, ("lingbot_map.runtime.camera_head.pose_iter." + std::to_string(iter)).c_str());
+        ggml_tensor * activated_pose = lingbot_activate_pose(ctx, pred_pose);
+        ggml_set_name(activated_pose, ("lingbot_map.runtime.camera_head.activated_pose_iter." + std::to_string(iter)).c_str());
+        built.iteration_poses.push_back(activated_pose);
+        built.final_pose = activated_pose;  // overwritten each pass, so the last iteration wins
+    }
+    // Outputs are made contiguous and registered as graph roots; 32768 is the size passed to ggml_new_graph_custom.
+    built.graph = ggml_new_graph_custom(ctx, 32768, false);
+    for (size_t i = 0; i < built.selected_outputs.size(); ++i) {
+        ggml_tensor * selected = ggml_cont(ctx, built.selected_outputs[i]);
+        ggml_set_name(selected, ("lingbot_map.runtime.aggregator.selected_output." + std::to_string(i)).c_str());
+        built.selected_outputs[i] = selected;
+        ggml_build_forward_expand(built.graph, selected);
+    }
+    built.final_pose = ggml_cont(ctx, built.final_pose);
+    ggml_set_name(built.final_pose, "lingbot_map.runtime.camera_head.final_pose_output");
+    ggml_build_forward_expand(built.graph, built.final_pose);
+    return built;
+}
+
+// Builds, without allocating or running it, the graph of a single aggregator block and reports its
+// node count plus the shapes of the QKV tensor and of the block output. With `flatten_frames` all
+// frames are merged into one sequence; otherwise every frame is a separate batch entry.
+static lingbot_map_graph_probe_result lingbot_build_aggregator_block_probe(
+        ggml_context *                       weights_ctx,
+        const lingbot_map_config &           cfg,
+        const lingbot_map_aggregator_input & input,
+        const std::string &                  block_prefix,
+        const std::string &                  graph_name,
+        bool                                 flatten_frames) {
+    const bool input_ok = input.n_frames > 0 && input.tokens_per_frame > 0 && input.hidden_size == cfg.hidden_size;
+    if (!input_ok) {
+        throw std::invalid_argument("Invalid LingBot-MAP aggregator input for block probe");
+    }
+
+    // Metadata-only context: with no_alloc set, no tensor data is allocated inside it.
+    const size_t arena_bytes = 64ull * 1024ull * 1024ull;
+    ggml_init_params init = { /*.mem_size =*/ arena_bytes, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true };
+    ggml_context_ptr scratch(ggml_init(init));
+    if (!scratch) {
+        throw std::runtime_error("failed to create LingBot-MAP aggregator probe ggml context");
+    }
+    ggml_context * ctx = scratch.get();
+
+    const int64_t batch   = flatten_frames ? 1 : input.n_frames;
+    const int64_t seq_len = (int64_t) input.tokens_per_frame * (flatten_frames ? (int64_t) input.n_frames : 1);
+    ggml_tensor * probe_in = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, cfg.hidden_size, seq_len, batch);
+    ggml_set_name(probe_in, (graph_name + ".input").c_str());
+    ggml_set_input(probe_in);
+
+    ggml_tensor * qkv = nullptr;
+    ggml_tensor * block_out = lingbot_apply_aggregator_block(ctx, weights_ctx, cfg, probe_in, block_prefix, graph_name, &qkv);
+
+    ggml_cgraph * graph = ggml_new_graph_custom(ctx, 256, false);
+    ggml_build_forward_expand(graph, qkv);
+    ggml_build_forward_expand(graph, block_out);
+
+    lingbot_map_graph_probe_result report;
+    report.graph_nodes = ggml_graph_n_nodes(graph);
+    report.input_tokens_per_frame = (int32_t) seq_len;
+    for (int d = 0; d < 4; ++d) {
+        report.output_ne[d] = (int32_t) block_out->ne[d];
+        report.qkv_ne[d] = (int32_t) qkv->ne[d];
+    }
+    return report;
+}
+
+// Probes a single frame block; frames stay separate batch entries (flatten_frames = false).
+lingbot_map_graph_probe_result lingbot_map_context::build_aggregator_frame_block_probe(
+        const lingbot_map_aggregator_input & input, int32_t block_index) const {
+    const auto & settings = config();
+    const bool in_range = block_index >= 0 && block_index < settings.frame_block_count;
+    if (!in_range) {
+        throw std::invalid_argument("Invalid LingBot-MAP frame block index");
+    }
+    const std::string weights_prefix = "aggregator.frame_blocks." + std::to_string(block_index);
+    return lingbot_build_aggregator_block_probe(pimpl_->ggml.get(), settings, input, weights_prefix,
+                                                "lingbot_map.aggregator.frame_probe", /* flatten_frames */ false);
+}
+
+// Probes a single global block; all frames are flattened into one sequence (flatten_frames = true).
+lingbot_map_graph_probe_result lingbot_map_context::build_aggregator_global_block_probe(
+        const lingbot_map_aggregator_input & input, int32_t block_index) const {
+    const auto & settings = config();
+    const bool in_range = block_index >= 0 && block_index < settings.global_block_count;
+    if (!in_range) {
+        throw std::invalid_argument("Invalid LingBot-MAP global block index");
+    }
+    const std::string weights_prefix = "aggregator.global_blocks." + std::to_string(block_index);
+    return lingbot_build_aggregator_block_probe(pimpl_->ggml.get(), settings, input, weights_prefix,
+                                                "lingbot_map.aggregator.global_probe", /* flatten_frames */ true);
+}
+
+// Runs the frame-block probe and then the global-block probe for the same `block_index`.
+lingbot_map_aggregator_probe_result lingbot_map_context::build_aggregator_block_probes(
+        const lingbot_map_aggregator_input & input, int32_t block_index) const {
+    lingbot_map_aggregator_probe_result both;
+    both.frame  = build_aggregator_frame_block_probe(input, block_index);
+    both.global = build_aggregator_global_block_probe(input, block_index);
+    return both;
+}
+// Builds (without executing) the full stack of alternating frame/global aggregator blocks and reports graph statistics.
+lingbot_map_full_aggregator_probe_result lingbot_map_context::build_full_aggregator_probe(
+        const lingbot_map_aggregator_input & input) const {
+    const auto & cfg = config();
+    if (input.n_frames <= 0 || input.tokens_per_frame <= 0 || input.hidden_size != cfg.hidden_size) {
+        throw std::invalid_argument("Invalid LingBot-MAP aggregator input for full probe");
+    }
+    if (cfg.frame_block_count <= 0 || cfg.global_block_count <= 0 || cfg.frame_block_count != cfg.global_block_count) {
+        throw std::runtime_error("LingBot-MAP full aggregator probe requires matching frame/global block counts");
+    }
+    // Metadata-only context: with no_alloc set, no tensor data is allocated inside it.
+    const size_t mem_size = 256ull * 1024ull * 1024ull;
+    ggml_init_params params = {
+        /*.mem_size   =*/ mem_size,
+        /*.mem_buffer =*/ nullptr,
+        /*.no_alloc   =*/ true,
+    };
+    ggml_context_ptr compute_ctx(ggml_init(params));
+    if (!compute_ctx) {
+        throw std::runtime_error("failed to create LingBot-MAP full aggregator probe ggml context");
+    }
+
+    ggml_context * ctx = compute_ctx.get();
+    ggml_tensor * tokens = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, cfg.hidden_size, input.tokens_per_frame, input.n_frames);
+    ggml_set_name(tokens, "lingbot_map.aggregator.full_probe.input");
+    ggml_set_input(tokens);
+    // Each iteration runs frame block i on the per-frame layout, then global block i on all frames flattened into one sequence.
+    ggml_tensor * frame_tokens = nullptr;
+    ggml_tensor * global_tokens = nullptr;
+    int32_t selected_outputs = 0;
+    for (int32_t i = 0; i < cfg.frame_block_count; ++i) {
+        frame_tokens = lingbot_apply_aggregator_block(ctx, pimpl_->ggml.get(), cfg, tokens,
+                                                      "aggregator.frame_blocks." + std::to_string(i),
+                                                      "lingbot_map.aggregator.full_probe.frame." + std::to_string(i),
+                                                      nullptr);
+        global_tokens = ggml_reshape_3d(ctx, frame_tokens, cfg.hidden_size,
+                                        (int64_t) input.tokens_per_frame * input.n_frames, 1);
+        global_tokens = lingbot_apply_aggregator_block(ctx, pimpl_->ggml.get(), cfg, global_tokens,
+                                                       "aggregator.global_blocks." + std::to_string(i),
+                                                       "lingbot_map.aggregator.full_probe.global." + std::to_string(i),
+                                                       nullptr);
+        tokens = ggml_reshape_3d(ctx, global_tokens, cfg.hidden_size, input.tokens_per_frame, input.n_frames);
+        if (std::find(cfg.aggregator_selected_layers.begin(), cfg.aggregator_selected_layers.end(), i) !=
+            cfg.aggregator_selected_layers.end()) {
+            selected_outputs += 1;  // only counted: this probe does not build the concatenated selected outputs
+        }
+    }
+    // The last frame/global outputs are the graph roots; earlier blocks are reached through their dependencies.
+    ggml_cgraph * graph = ggml_new_graph_custom(ctx, 8192, false);
+    ggml_build_forward_expand(graph, frame_tokens);
+    ggml_build_forward_expand(graph, global_tokens);
+
+    lingbot_map_full_aggregator_probe_result result;
+    result.graph_nodes = ggml_graph_n_nodes(graph);
+    result.selected_output_count = selected_outputs;
+    result.frame_block_count = cfg.frame_block_count;
+    result.global_block_count = cfg.global_block_count;
+    for (int i = 0; i < 4; ++i) {
+        result.final_frame_ne[i] = (int32_t) frame_tokens->ne[i];
+        result.final_global_ne[i] = (int32_t) global_tokens->ne[i];
+    }
+    return result;
+}
+// Builds (without executing) the aggregator graph including the selected layers' concatenated outputs, and reports its shapes.
+lingbot_map_aggregator_graph_result lingbot_map_context::build_aggregator_graph(
+        const lingbot_map_aggregator_input & input) const {
+    const auto & cfg = config();
+    if (input.n_frames <= 0 || input.tokens_per_frame <= 0 || input.hidden_size != cfg.hidden_size) {
+        throw std::invalid_argument("Invalid LingBot-MAP aggregator input for graph build");
+    }
+    if (cfg.frame_block_count <= 0 || cfg.global_block_count <= 0 || cfg.frame_block_count != cfg.global_block_count) {
+        throw std::runtime_error("LingBot-MAP aggregator graph requires matching frame/global block counts");
+    }
+    // Metadata-only context: with no_alloc set, no tensor data is allocated inside it.
+    const size_t mem_size = 256ull * 1024ull * 1024ull;
+    ggml_init_params params = {
+        /*.mem_size   =*/ mem_size,
+        /*.mem_buffer =*/ nullptr,
+        /*.no_alloc   =*/ true,
+    };
+    ggml_context_ptr compute_ctx(ggml_init(params));
+    if (!compute_ctx) {
+        throw std::runtime_error("failed to create LingBot-MAP aggregator graph ggml context");
+    }
+
+    ggml_context * ctx = compute_ctx.get();
+    ggml_tensor * tokens = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, cfg.hidden_size, input.tokens_per_frame, input.n_frames);
+    ggml_set_name(tokens, "lingbot_map.aggregator.graph.input");
+    ggml_set_input(tokens);
+
+    std::vector<ggml_tensor *> selected_outputs;
+    selected_outputs.reserve(cfg.aggregator_selected_layers.size());
+    ggml_tensor * frame_tokens = nullptr;
+    ggml_tensor * global_tokens = nullptr;
+    // Each iteration runs frame block i on the per-frame layout, then global block i on all frames flattened into one sequence.
+    for (int32_t i = 0; i < cfg.frame_block_count; ++i) {
+        frame_tokens = lingbot_apply_aggregator_block(ctx, pimpl_->ggml.get(), cfg, tokens,
+                                                      "aggregator.frame_blocks." + std::to_string(i),
+                                                      "lingbot_map.aggregator.graph.frame." + std::to_string(i),
+                                                      nullptr);
+        global_tokens = ggml_reshape_3d(ctx, frame_tokens, cfg.hidden_size,
+                                        (int64_t) input.tokens_per_frame * input.n_frames, 1);
+        global_tokens = lingbot_apply_aggregator_block(ctx, pimpl_->ggml.get(), cfg, global_tokens,
+                                                       "aggregator.global_blocks." + std::to_string(i),
+                                                       "lingbot_map.aggregator.graph.global." + std::to_string(i),
+                                                       nullptr);
+        ggml_tensor * global_as_frame = ggml_reshape_3d(ctx, global_tokens, cfg.hidden_size, input.tokens_per_frame, input.n_frames);
+        // Selected layers export concat(frame, global) along dim 0, i.e. twice the channel width.
+        if (std::find(cfg.aggregator_selected_layers.begin(), cfg.aggregator_selected_layers.end(), i) !=
+            cfg.aggregator_selected_layers.end()) {
+            ggml_tensor * selected = ggml_concat(ctx, frame_tokens, global_as_frame, 0);
+            ggml_set_name(selected, ("lingbot_map.aggregator.graph.selected." + std::to_string(i)).c_str());
+            selected_outputs.push_back(selected);
+        }
+        // The global output, back in per-frame layout, is the input of the next frame block.
+        tokens = global_as_frame;
+    }
+    // Graph roots: the last frame/global outputs plus every selected concat.
+    ggml_cgraph * graph = ggml_new_graph_custom(ctx, 8192, false);
+    ggml_build_forward_expand(graph, frame_tokens);
+    ggml_build_forward_expand(graph, global_tokens);
+    for (ggml_tensor * selected : selected_outputs) {
+        ggml_build_forward_expand(graph, selected);
+    }
+
+    lingbot_map_aggregator_graph_result result;
+    result.graph_nodes = ggml_graph_n_nodes(graph);
+    result.selected_output_count = (int32_t) selected_outputs.size();
+    result.frame_block_count = cfg.frame_block_count;
+    result.global_block_count = cfg.global_block_count;
+    result.tokens_per_frame = input.tokens_per_frame;
+    result.patch_start_idx = input.patch_start_idx;
+    result.selected_layers = cfg.aggregator_selected_layers;
+    for (int i = 0; i < 4; ++i) {
+        result.final_frame_ne[i] = (int32_t) frame_tokens->ne[i];
+        result.final_global_ne[i] = (int32_t) global_tokens->ne[i];
+    }
+    result.selected_output_shapes.reserve(selected_outputs.size());
+    for (const ggml_tensor * selected : selected_outputs) {
+        result.selected_output_shapes.push_back({
+            (int32_t) selected->ne[0],
+            (int32_t) selected->ne[1],
+            (int32_t) selected->ne[2],
+            (int32_t) selected->ne[3],
+        });
+    }
+    return result;
+}
+
+lingbot_map_camera_head_graph_result lingbot_map_context::build_camera_head_graph(
+        const lingbot_map_aggregator_input & input) const {
+    const auto & cfg = config();
+    if (input.n_frames <= 0 || input.tokens_per_frame <= 0 || input.hidden_size != cfg.hidden_size ||
+        cfg.camera_hidden_size <= 0) {
+        throw std::invalid_argument("Invalid LingBot-MAP camera_head graph input");
+    }
+    if (cfg.camera_trunk_block_count <= 0 || cfg.camera_num_iterations <= 0) {
+        throw std::runtime_error("LingBot-MAP camera_head graph requires trunk blocks and iterations");
+    }
+
+    const size_t mem_size = 256ull * 1024ull * 1024ull;
+    ggml_init_params params = {
+        /*.mem_size   =*/ mem_size,
+        /*.mem_buffer =*/ nullptr,
+        /*.no_alloc   =*/ true,
+    };
+    ggml_context_ptr compute_ctx(ggml_init(params));
+    if (!compute_ctx) {
+        throw std::runtime_error("failed to create LingBot-MAP camera_head graph ggml context");
+    }
+
+    ggml_context * weights_ctx = pimpl_->ggml.get();
+    ggml_context * ctx = compute_ctx.get();
+    const int64_t pose_dim = 9;
+    ggml_tensor * aggregated_tokens = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, cfg.camera_hidden_size,
+                                                         input.tokens_per_frame, input.n_frames);
+    ggml_set_name(aggregated_tokens, "lingbot_map.camera_head.graph.aggregated_tokens");
+    ggml_set_input(aggregated_tokens);
+
+    ggml_tensor * pose_tokens = ggml_view_3d(ctx, aggregated_tokens,
+                                             cfg.camera_hidden_size, input.n_frames, 1,
+                                             aggregated_tokens->nb[2],
+                                             (size_t) aggregated_tokens->nb[2] * (size_t) input.n_frames,
+                                             0);
+    pose_tokens = lingbot_layer_norm(ctx, pose_tokens,
+                                     lingbot_require_tensor(weights_ctx, "camera_head.token_norm.weight"),
+                                     lingbot_require_tensor(weights_ctx, "camera_head.token_norm.bias"),
+                                     1e-6f);
+    ggml_set_name(pose_tokens, "lingbot_map.camera_head.graph.pose_tokens");
+
+    ggml_tensor * empty_pose = lingbot_require_tensor(weights_ctx, "camera_head.empty_pose_tokens");
+    ggml_tensor * pred_pose = nullptr;
+    std::vector<ggml_tensor *> iteration_outputs;
+    iteration_outputs.reserve(cfg.camera_num_iterations);
+
+    for (int32_t iter = 0; iter < cfg.camera_num_iterations; ++iter) {
+        ggml_tensor * module_input = nullptr;
+        if (pred_pose == nullptr) {
+            ggml_tensor * empty_pose_target = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, pose_dim, input.n_frames, 1);
+            module_input = ggml_repeat(ctx, empty_pose, empty_pose_target);
+        } else {
+            module_input = pred_pose;
+        }
+
+        module_input = lingbot_linear(ctx, module_input,
+                                      lingbot_require_tensor(weights_ctx, "camera_head.embed_pose.weight"),
+                                      lingbot_require_tensor(weights_ctx, "camera_head.embed_pose.bias"));
+
+        ggml_tensor * modulation = lingbot_linear(ctx,
+                                                  ggml_silu(ctx, module_input),
+                                                  lingbot_require_tensor(weights_ctx, "camera_head.poseLN_modulation.1.weight"),
+                                                  lingbot_require_tensor(weights_ctx, "camera_head.poseLN_modulation.1.bias"));
+        ggml_tensor * shift = ggml_view_3d(ctx, modulation, cfg.camera_hidden_size, input.n_frames, 1,
+                                           modulation->nb[1], modulation->nb[2], 0);
+        ggml_tensor * scale = ggml_view_3d(ctx, modulation, cfg.camera_hidden_size, input.n_frames, 1,
+                                           modulation->nb[1], modulation->nb[2],
+                                           (size_t) cfg.camera_hidden_size * ggml_type_size(modulation->type));
+        ggml_tensor * gate = ggml_view_3d(ctx, modulation, cfg.camera_hidden_size, input.n_frames, 1,
+                                          modulation->nb[1], modulation->nb[2],
+                                          (size_t) cfg.camera_hidden_size * 2 * ggml_type_size(modulation->type));
+
+        ggml_tensor * adaln = ggml_norm(ctx, pose_tokens, 1e-6f);
+        ggml_tensor * scale_cont = ggml_cont(ctx, scale);
+        ggml_tensor * modulated = ggml_mul(ctx, adaln, ggml_scale_bias(ctx, scale_cont, 1.0f, 1.0f));
+        modulated = ggml_add(ctx, modulated, shift);
+        modulated = ggml_mul(ctx, modulated, gate);
+        modulated = ggml_add(ctx, modulated, pose_tokens);
+
+        for (int32_t block = 0; block < cfg.camera_trunk_block_count; ++block) {
+            modulated = lingbot_apply_camera_trunk_block(ctx, weights_ctx, cfg, modulated,
+                                                         "camera_head.trunk." + std::to_string(block),
+                                                         "lingbot_map.camera_head.graph.iter." + std::to_string(iter) + ".trunk." + std::to_string(block));
+        }
+
+        ggml_tensor * trunk_norm = lingbot_layer_norm(ctx, modulated,
+                                                      lingbot_require_tensor(weights_ctx, "camera_head.trunk_norm.weight"),
+                                                      lingbot_require_tensor(weights_ctx, "camera_head.trunk_norm.bias"),
+                                                      1e-6f);
+        ggml_tensor * delta = lingbot_pose_branch(ctx, weights_ctx, trunk_norm);
+        pred_pose = pred_pose == nullptr ? delta : ggml_add(ctx, pred_pose, delta);
+        ggml_set_name(pred_pose, ("lingbot_map.camera_head.graph.pose_iter." + std::to_string(iter)).c_str());
+        ggml_tensor * activated_pose = lingbot_activate_pose(ctx, pred_pose);
+        ggml_set_name(activated_pose, ("lingbot_map.camera_head.graph.activated_pose_iter." + std::to_string(iter)).c_str());
+        iteration_outputs.push_back(activated_pose);
+    }
+
+    ggml_cgraph * graph = ggml_new_graph_custom(ctx, 8192, false);
+    for (ggml_tensor * out : iteration_outputs) {
+        ggml_build_forward_expand(graph, out);
+    }
+
+    lingbot_map_camera_head_graph_result result;
+    result.graph_nodes = ggml_graph_n_nodes(graph);
+    result.trunk_block_count = cfg.camera_trunk_block_count;
+    result.iteration_count = cfg.camera_num_iterations;
+    result.pose_dim = (int32_t) pose_dim;
+    for (int i = 0; i < 4; ++i) {
+        result.input_ne[i] = (int32_t) aggregated_tokens->ne[i];
+        result.final_pose_ne[i] = (int32_t) iteration_outputs.back()->ne[i];
+    }
+    result.iteration_pose_shapes.reserve(iteration_outputs.size());
+    for (const ggml_tensor * out : iteration_outputs) {
+        result.iteration_pose_shapes.push_back({
+            (int32_t) out->ne[0],
+            (int32_t) out->ne[1],
+            (int32_t) out->ne[2],
+            (int32_t) out->ne[3],
+        });
+    }
+    return result;
+}
+
+// Builds and runs the aggregator + camera_head GGML graph for `input`; returns host copies of the selected outputs and the final pose.
+lingbot_map_runtime_result lingbot_map_context::run_aggregator_camera_head(
+        const lingbot_map_aggregator_input & input,
+        bool                                 prefer_smt) const {  // prefer_smt is forwarded to lingbot_select_cpu_buffer_type()
+    const auto & cfg = config();
+    if (input.n_frames <= 0 || input.tokens_per_frame <= 0 || input.hidden_size != cfg.hidden_size) {
+        throw std::invalid_argument("Invalid LingBot-MAP runtime input");
+    }
+    if (cfg.frame_block_count <= 0 || cfg.global_block_count <= 0 || cfg.frame_block_count != cfg.global_block_count) {
+        throw std::runtime_error("LingBot-MAP runtime requires matching frame/global block counts");
+    }
+    if (cfg.camera_trunk_block_count <= 0 || cfg.camera_num_iterations <= 0) {
+        throw std::runtime_error("LingBot-MAP runtime requires camera_head trunk blocks and iterations");
+    }
+    // (Re)initialise the cached backend and weights on first use or whenever prefer_smt changes.
+    if (!pimpl_->runtime_initialized || pimpl_->runtime_prefer_smt != prefer_smt) {
+        ggml_backend_ptr backend(ggml_backend_cpu_init());
+        if (!backend) {
+            throw std::runtime_error("failed to initialize LingBot-MAP GGML CPU/SMT backend");
+        }
+        ggml_backend_cpu_set_n_threads(backend.get(), cfg.ggml_threads);
+        std::cerr << "[LingBot-MAP] GGML CPU backend threads=" << cfg.ggml_threads << "\n";
+
+        ggml_backend_buffer_type_t buft = lingbot_select_cpu_buffer_type(backend.get(), prefer_smt);
+        lingbot_map_runtime_weights weights;
+        try {
+            weights = lingbot_load_runtime_weights(cfg, buft);
+        } catch (const std::exception & e) {
+            ggml_backend_buffer_type_t default_buft = ggml_backend_get_default_buffer_type(backend.get());
+            if (!prefer_smt || buft == default_buft) {
+                throw;  // nothing left to fall back to
+            }
+            std::cerr << "[LingBot-MAP] failed to allocate/load runtime weights on " << ggml_backend_buft_name(buft)
+                      << ", falling back to " << ggml_backend_buft_name(default_buft) << ": " << e.what() << "\n";
+            buft = default_buft;
+            weights = lingbot_load_runtime_weights(cfg, buft);
+        }
+
+        pimpl_->runtime_backend = std::move(backend);
+        pimpl_->runtime_buft = buft;
+        pimpl_->runtime_weights = std::move(weights);
+        pimpl_->runtime_prefer_smt = prefer_smt;
+        pimpl_->runtime_initialized = true;
+        std::cerr << "[LingBot-MAP] initialized GGML runtime backend=" << ggml_backend_name(pimpl_->runtime_backend.get())
+                  << ", buffer_type=" << ggml_backend_buft_name(pimpl_->runtime_buft) << "\n";
+    }
+
+    ggml_backend_t backend = pimpl_->runtime_backend.get();
+    ggml_backend_buffer_type_t buft = pimpl_->runtime_buft;
+    ggml_context * weights_ctx = pimpl_->runtime_weights.ggml.get();
+    // Per-call graph context created with no_alloc: tensor data is allocated later by the scheduler.
+    const size_t mem_size = 512ull * 1024ull * 1024ull;
+    ggml_init_params params = {
+        /*.mem_size   =*/ mem_size,
+        /*.mem_buffer =*/ nullptr,
+        /*.no_alloc   =*/ true,
+    };
+    ggml_context_ptr compute_ctx(ggml_init(params));
+    if (!compute_ctx) {
+        throw std::runtime_error("failed to create LingBot-MAP runtime ggml context");
+    }
+
+    auto stage_start = std::chrono::steady_clock::now();
+    lingbot_map_runtime_graph runtime_graph = lingbot_build_aggregator_camera_runtime_graph(
+            compute_ctx.get(), weights_ctx, cfg, input);
+    std::cerr << "[LingBot-MAP][time] ggml_build_graph_ms=" << lingbot_elapsed_ms(stage_start)
+              << ", nodes=" << (runtime_graph.graph ? ggml_graph_n_nodes(runtime_graph.graph) : 0) << "\n";
+    if (runtime_graph.selected_outputs.empty() || runtime_graph.final_pose == nullptr || runtime_graph.graph == nullptr) {
+        throw std::runtime_error("LingBot-MAP runtime graph did not produce required outputs");
+    }
+
+    ggml_backend_buffer_type_t default_buft = ggml_backend_get_default_buffer_type(backend);
+    const bool primary_graph_supported = lingbot_graph_supported_by_backend(
+            backend, buft, runtime_graph.graph, /* log_summary */ buft != default_buft);
+
+    ggml_backend_ptr fallback_backend;
+    std::vector<ggml_backend_t> backend_ptrs;
+    std::vector<ggml_backend_buffer_type_t> backend_bufts;
+    backend_ptrs.push_back(backend);
+    backend_bufts.push_back(buft);
+    // A fallback is only attempted when a non-default buffer type was selected and the graph check failed.
+    bool using_hybrid_cpu_fallback = false;
+    if (!primary_graph_supported && buft != default_buft) {
+        if (!ggml_backend_supports_buft(backend, default_buft)) {
+            std::cerr << "[LingBot-MAP] primary backend cannot use " << ggml_backend_buft_name(default_buft)
+                      << ", falling back to plain CPU scheduler\n";
+            buft = default_buft;
+            pimpl_->runtime_weights = lingbot_load_runtime_weights(cfg, buft);  // may throw: load before publishing the new buffer type
+            pimpl_->runtime_buft = buft;
+            weights_ctx = pimpl_->runtime_weights.ggml.get();
+            stage_start = std::chrono::steady_clock::now();
+            runtime_graph = lingbot_build_aggregator_camera_runtime_graph(compute_ctx.get(), weights_ctx, cfg, input);
+            std::cerr << "[LingBot-MAP][time] ggml_rebuild_graph_after_cpu_fallback_ms=" << lingbot_elapsed_ms(stage_start)
+                      << ", nodes=" << (runtime_graph.graph ? ggml_graph_n_nodes(runtime_graph.graph) : 0) << "\n";
+            if (runtime_graph.selected_outputs.empty() || runtime_graph.final_pose == nullptr || runtime_graph.graph == nullptr) {
+                throw std::runtime_error("LingBot-MAP runtime graph did not produce required outputs after CPU fallback rebuild");
+            }
+            backend_bufts.back() = default_buft;  // backend_ptrs still holds only the primary backend
+        } else {
+            fallback_backend.reset(ggml_backend_cpu_init());
+            if (!fallback_backend) {
+                throw std::runtime_error("failed to initialize LingBot-MAP GGML CPU fallback backend");
+            }
+            ggml_backend_cpu_set_n_threads(fallback_backend.get(), cfg.ggml_threads);
+            // Load the replacement weights first so a failed load leaves the cached state consistent.
+            pimpl_->runtime_weights = lingbot_load_runtime_weights(cfg, default_buft);
+            pimpl_->runtime_buft = default_buft;
+            weights_ctx = pimpl_->runtime_weights.ggml.get();
+            stage_start = std::chrono::steady_clock::now();
+            runtime_graph = lingbot_build_aggregator_camera_runtime_graph(compute_ctx.get(), weights_ctx, cfg, input);
+            std::cerr << "[LingBot-MAP][time] ggml_rebuild_graph_for_hybrid_ms=" << lingbot_elapsed_ms(stage_start)
+                      << ", nodes=" << (runtime_graph.graph ? ggml_graph_n_nodes(runtime_graph.graph) : 0) << "\n";
+            if (runtime_graph.selected_outputs.empty() || runtime_graph.final_pose == nullptr || runtime_graph.graph == nullptr) {
+                throw std::runtime_error("LingBot-MAP runtime graph did not produce required outputs after hybrid rebuild");
+            }
+
+            backend_ptrs.push_back(fallback_backend.get());
+            backend_bufts.push_back(default_buft);
+            using_hybrid_cpu_fallback = true;
+            std::cerr << "[LingBot-MAP] using hybrid GGML scheduler with CPU-host weights: primary=" << ggml_backend_buft_name(buft)
+                      << ", fallback=" << ggml_backend_buft_name(default_buft)
+                      << ", threads=" << cfg.ggml_threads << "\n";
+        }
+    } else if (!primary_graph_supported) {
+        throw std::runtime_error("LingBot-MAP GGML runtime graph contains ops unsupported by the selected backend");
+    }
+
+    ggml_backend_sched_ptr sched(
+            ggml_backend_sched_new(backend_ptrs.data(), backend_bufts.data(), (int) backend_ptrs.size(),
+                                   32768, false, true));
+    if (!sched) {
+        throw std::runtime_error("failed to create LingBot-MAP GGML scheduler");
+    }
+
+    ggml_backend_sched_reset(sched.get());
+    ggml_backend_t output_backend = fallback_backend ? fallback_backend.get() : backend;  // pin outputs to the fallback backend in hybrid mode
+    for (ggml_tensor * selected : runtime_graph.selected_outputs) {
+        ggml_backend_sched_set_tensor_backend(sched.get(), selected, output_backend);
+    }
+    ggml_backend_sched_set_tensor_backend(sched.get(), runtime_graph.final_pose, output_backend);
+
+    stage_start = std::chrono::steady_clock::now();
+    if (!ggml_backend_sched_alloc_graph(sched.get(), runtime_graph.graph)) {
+        throw std::runtime_error("failed to allocate LingBot-MAP GGML runtime graph");
+    }
+    std::cerr << "[LingBot-MAP][time] ggml_alloc_graph_ms=" << lingbot_elapsed_ms(stage_start) << "\n";
+    // Upload the host tokens; their byte size must match the graph input tensor exactly.
+    const size_t input_nbytes = ggml_nbytes(runtime_graph.input_tokens);
+    if (input_nbytes != input.tokens.size() * sizeof(float)) {
+        throw std::runtime_error("LingBot-MAP runtime input byte size mismatch");
+    }
+    ggml_backend_tensor_set(runtime_graph.input_tokens, input.tokens.data(), 0, input_nbytes);
+
+    stage_start = std::chrono::steady_clock::now();
+    std::cerr << "[LingBot-MAP][time] ggml_compute_start backend=" << ggml_backend_name(backend)
+              << ", buffer_type=" << ggml_backend_buft_name(buft)
+              << ", nodes=" << ggml_graph_n_nodes(runtime_graph.graph) << "\n";
+    const enum ggml_status status = ggml_backend_sched_graph_compute(sched.get(), runtime_graph.graph);
+    std::cerr << "[LingBot-MAP][time] ggml_compute_ms=" << lingbot_elapsed_ms(stage_start)
+              << ", status=" << ggml_status_to_string(status) << "\n";
+    if (status != GGML_STATUS_SUCCESS) {
+        throw std::runtime_error(std::string("LingBot-MAP GGML runtime compute failed: ") + ggml_status_to_string(status));
+    }
+
+    lingbot_map_runtime_result result;
+    result.backend_name = ggml_backend_name(backend);
+    result.buffer_type_name = using_hybrid_cpu_fallback ?
+        std::string(ggml_backend_buft_name(buft)) + "+" + ggml_backend_buft_name(default_buft) :
+        ggml_backend_buft_name(buft);
+    result.graph_nodes = ggml_graph_n_nodes(runtime_graph.graph);
+    result.selected_output_count = (int32_t) runtime_graph.selected_outputs.size();
+    result.tokens_per_frame = input.tokens_per_frame;
+    result.patch_start_idx = input.patch_start_idx;
+    result.frame_block_count = cfg.frame_block_count;
+    result.global_block_count = cfg.global_block_count;
+    result.camera_trunk_block_count = cfg.camera_trunk_block_count;
+    result.camera_iteration_count = cfg.camera_num_iterations;
+    result.camera_pose_dim = 9;
+    result.selected_layers = cfg.aggregator_selected_layers;
+    // Copy every selected output back to host memory from the backend the scheduler assigned it to.
+    result.selected_output_shapes.reserve(runtime_graph.selected_outputs.size());
+    result.selected_outputs.reserve(runtime_graph.selected_outputs.size());
+    for (const ggml_tensor * selected : runtime_graph.selected_outputs) {
+        if (selected->type != GGML_TYPE_F32) {
+            throw std::runtime_error("LingBot-MAP runtime selected output is not F32");
+        }
+        result.selected_output_shapes.push_back({
+            (int32_t) selected->ne[0],
+            (int32_t) selected->ne[1],
+            (int32_t) selected->ne[2],
+            (int32_t) selected->ne[3],
+        });
+        std::vector<float> output(ggml_nbytes(selected) / sizeof(float));
+        ggml_backend_t selected_backend = ggml_backend_sched_get_tensor_backend(sched.get(), const_cast<ggml_tensor *>(selected));
+        if (selected_backend == nullptr) {
+            throw std::runtime_error("LingBot-MAP selected output has no scheduled backend");
+        }
+        std::cerr << "[LingBot-MAP][time] reading selected_output index=" << result.selected_outputs.size()
+                  << ", bytes=" << ggml_nbytes(selected)
+                  << ", backend=" << ggml_backend_name(selected_backend)
+                  << "\n";
+        ggml_backend_tensor_get_async(selected_backend, selected, output.data(), 0, ggml_nbytes(selected));
+        ggml_backend_synchronize(selected_backend);  // wait for the async copy before handing the buffer over
+        result.selected_outputs.push_back(std::move(output));
+    }
+
+    result.camera_head_input_shape = {
+        (int32_t) runtime_graph.camera_head_input->ne[0],
+        (int32_t) runtime_graph.camera_head_input->ne[1],
+        (int32_t) runtime_graph.camera_head_input->ne[2],
+        (int32_t) runtime_graph.camera_head_input->ne[3],
+    };
+    result.camera_head_final_pose_shape = {
+        (int32_t) runtime_graph.final_pose->ne[0],
+        (int32_t) runtime_graph.final_pose->ne[1],
+        (int32_t) runtime_graph.final_pose->ne[2],
+        (int32_t) runtime_graph.final_pose->ne[3],
+    };
+    result.camera_head_iteration_pose_shapes.reserve(runtime_graph.iteration_poses.size());
+    for (const ggml_tensor * pose : runtime_graph.iteration_poses) {
+        result.camera_head_iteration_pose_shapes.push_back({
+            (int32_t) pose->ne[0],
+            (int32_t) pose->ne[1],
+            (int32_t) pose->ne[2],
+            (int32_t) pose->ne[3],
+        });
+    }
+    // The final pose is read back the same way and must also be F32.
+    if (runtime_graph.final_pose->type != GGML_TYPE_F32) {
+        throw std::runtime_error("LingBot-MAP runtime final pose is not F32");
+    }
+    result.pose_encoding.resize(ggml_nbytes(runtime_graph.final_pose) / sizeof(float));
+    ggml_backend_t pose_backend = ggml_backend_sched_get_tensor_backend(sched.get(), runtime_graph.final_pose);
+    if (pose_backend == nullptr) {
+        throw std::runtime_error("LingBot-MAP final pose has no scheduled backend");
+    }
+    std::cerr << "[LingBot-MAP][time] reading final_pose bytes=" << ggml_nbytes(runtime_graph.final_pose)
+              << ", backend=" << ggml_backend_name(pose_backend)
+              << "\n";
+    ggml_backend_tensor_get_async(pose_backend, runtime_graph.final_pose, result.pose_encoding.data(), 0, ggml_nbytes(runtime_graph.final_pose));
+    ggml_backend_synchronize(pose_backend);
+    return result;
+}
+
diff --git a/tools/mtmd/lingbot-map-wrapper.h b/tools/mtmd/lingbot-map-wrapper.h
new file mode 100644
index 000000000000..ad3eaeddbc46
--- /dev/null
+++ b/tools/mtmd/lingbot-map-wrapper.h
@@ -0,0 +1,172 @@
+// LingBot-MAP multimodal wrapper.
+// Loads mtmd_model/config.json and validates the ONNX/GGUF artifact set.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+struct ggml_context;
+struct ggml_tensor;
+
+struct lingbot_map_aggregator_input {  // token batch passed to the graph builders and to run_aggregator_camera_head()
+    int32_t n_frames = 0;              // frame count; the runtime rejects values <= 0
+    int32_t hidden_size = 0;           // token width; the runtime requires it to equal lingbot_map_config::hidden_size
+    int32_t vit_tokens_per_frame = 0;  // presumably the per-frame token count as produced by the ViT - TODO confirm
+    int32_t vit_prefix_tokens = 0;     // presumably the number of non-patch ViT tokens per frame - TODO confirm
+    int32_t patch_tokens = 0;          // presumably the number of patch tokens per frame - TODO confirm
+    int32_t patch_start_idx = 0;       // copied verbatim into lingbot_map_runtime_result::patch_start_idx
+    int32_t tokens_per_frame = 0;      // per-frame token count fed to the aggregator; the runtime rejects values <= 0
+    std::vector<float> tokens;         // F32 payload; its byte size must equal the runtime graph's input tensor size
+};
+
+
+struct lingbot_map_graph_probe_result {  // summary returned by the single-block frame/global probe builders
+    int32_t graph_nodes = 0;             // presumably the node count of the probed ggml graph - TODO confirm
+    int32_t input_tokens_per_frame = 0;  // presumably the tokens-per-frame value the probe was built with - TODO confirm
+    int32_t output_ne[4] = { 0, 0, 0, 0 };  // four extents, assumed to be the block output's ggml ne[0..3] - TODO confirm
+    int32_t qkv_ne[4] = { 0, 0, 0, 0 };     // four extents, assumed to be the fused QKV tensor's ggml ne[0..3] - TODO confirm
+};
+
+struct lingbot_map_aggregator_probe_result {  // pair of probe summaries returned by build_aggregator_block_probes()
+    lingbot_map_graph_probe_result frame;   // presumably the frame-attention block probe - TODO confirm
+    lingbot_map_graph_probe_result global;  // presumably the global-attention block probe - TODO confirm
+};
+
+struct lingbot_map_full_aggregator_probe_result {  // summary returned by build_full_aggregator_probe()
+    int32_t graph_nodes = 0;            // presumably the node count of the full aggregator graph - TODO confirm
+    int32_t selected_output_count = 0;  // presumably how many layer outputs the graph exposes - TODO confirm
+    int32_t frame_block_count = 0;      // presumably mirrors lingbot_map_config::frame_block_count - TODO confirm
+    int32_t global_block_count = 0;     // presumably mirrors lingbot_map_config::global_block_count - TODO confirm
+    int32_t final_frame_ne[4] = { 0, 0, 0, 0 };   // four extents, assumed ggml ne[0..3] of the last frame-block output
+    int32_t final_global_ne[4] = { 0, 0, 0, 0 };  // four extents, assumed ggml ne[0..3] of the last global-block output
+};
+
+struct lingbot_map_aggregator_graph_result {  // summary returned by build_aggregator_graph()
+    int32_t graph_nodes = 0;            // presumably the node count of the built ggml graph - TODO confirm
+    int32_t selected_output_count = 0;  // presumably the number of entries in selected_output_shapes - TODO confirm
+    int32_t frame_block_count = 0;      // presumably mirrors lingbot_map_config::frame_block_count - TODO confirm
+    int32_t global_block_count = 0;     // presumably mirrors lingbot_map_config::global_block_count - TODO confirm
+    int32_t tokens_per_frame = 0;       // presumably copied from the aggregator input - TODO confirm
+    int32_t patch_start_idx = 0;        // presumably copied from the aggregator input - TODO confirm
+    int32_t final_frame_ne[4] = { 0, 0, 0, 0 };   // four extents, assumed ggml ne[0..3] of the last frame-block output
+    int32_t final_global_ne[4] = { 0, 0, 0, 0 };  // four extents, assumed ggml ne[0..3] of the last global-block output
+    std::vector<int32_t> selected_layers;                      // presumably the layer indices whose outputs are exposed
+    std::vector<std::vector<int32_t>> selected_output_shapes;  // presumably one {ne0, ne1, ne2, ne3} entry per selected output
+};
+
+struct lingbot_map_camera_head_graph_result {  // shape-only summary of the camera_head graph (no tensor data)
+    int32_t graph_nodes = 0;        // ggml_graph_n_nodes() of the built graph
+    int32_t trunk_block_count = 0;  // copy of lingbot_map_config::camera_trunk_block_count
+    int32_t iteration_count = 0;    // copy of lingbot_map_config::camera_num_iterations
+    int32_t pose_dim = 0;           // pose vector length the graph was built with
+    int32_t input_ne[4] = { 0, 0, 0, 0 };       // ggml ne[0..3] of the aggregated-token input tensor
+    int32_t final_pose_ne[4] = { 0, 0, 0, 0 };  // ggml ne[0..3] of the last iteration's activated pose
+    std::vector<std::vector<int32_t>> iteration_pose_shapes;  // one {ne0, ne1, ne2, ne3} entry per refinement iteration
+};
+
+struct lingbot_map_runtime_result {  // everything run_aggregator_camera_head() hands back to the caller
+    std::string backend_name;      // ggml_backend_name() of the primary backend
+    std::string buffer_type_name;  // buffer type name; "<primary>+<default>" when the hybrid CPU fallback was used
+    int32_t graph_nodes = 0;            // node count of the executed graph
+    int32_t selected_output_count = 0;  // number of selected outputs produced by the graph
+    int32_t tokens_per_frame = 0;       // copied from the aggregator input
+    int32_t patch_start_idx = 0;        // copied from the aggregator input
+    int32_t frame_block_count = 0;         // copy of lingbot_map_config::frame_block_count
+    int32_t global_block_count = 0;        // copy of lingbot_map_config::global_block_count
+    int32_t camera_trunk_block_count = 0;  // copy of lingbot_map_config::camera_trunk_block_count
+    int32_t camera_iteration_count = 0;    // copy of lingbot_map_config::camera_num_iterations
+    int32_t camera_pose_dim = 0;           // the runtime currently writes the constant 9 here
+    std::vector<int32_t> selected_layers;                      // copy of lingbot_map_config::aggregator_selected_layers
+    std::vector<std::vector<int32_t>> selected_output_shapes;  // {ne0, ne1, ne2, ne3} per selected output
+    std::vector<std::vector<float>> selected_outputs;          // F32 host copy of each selected output, same order as the shapes
+    std::vector<int32_t> camera_head_input_shape;       // {ne0, ne1, ne2, ne3} of the camera_head input tensor
+    std::vector<int32_t> camera_head_final_pose_shape;  // {ne0, ne1, ne2, ne3} of the final pose tensor
+    std::vector<std::vector<int32_t>> camera_head_iteration_pose_shapes;  // {ne0, ne1, ne2, ne3} per iteration pose
+    std::vector<float> pose_encoding;  // F32 host copy of the final pose tensor
+};
+
+struct lingbot_map_config {  // model/runtime settings exposed through lingbot_map_context::config()
+    std::vector<std::string> architectures;  // presumably the architecture names listed in the model config - TODO confirm
+
+    std::string vision_model_path;             // presumably the ViT model artifact path - TODO confirm
+    std::string aggregator_camera_model_path;  // presumably the aggregator/camera_head weights path - TODO confirm
+    std::string depth_model_path;              // presumably the depth model artifact path - TODO confirm
+    std::unordered_map<std::string, std::string> ep_config;  // string key/value options; consumer not visible here
+
+    int32_t image_size = 0;
+    int32_t patch_size = 0;
+    float   image_mean[3] = { 0.485f, 0.456f, 0.406f };  // per-channel defaults; presumably normalisation means in RGB order
+    float   image_std[3]  = { 0.229f, 0.224f, 0.225f };  // per-channel defaults; presumably normalisation std-devs in RGB order
+    int32_t hidden_size = 0;         // the runtime requires aggregator inputs to carry exactly this token width
+    int32_t camera_hidden_size = 0;  // width of each shift/scale/gate slice taken from the camera_head modulation tensor
+    int32_t num_special_tokens = 0;
+    int32_t num_register_tokens = 0;
+    int32_t frame_block_count = 0;         // must be > 0 and equal to global_block_count for the runtime
+    int32_t global_block_count = 0;        // must be > 0 and equal to frame_block_count for the runtime
+    int32_t camera_trunk_block_count = 0;  // camera_head trunk blocks applied per iteration; must be > 0 for the runtime
+    int32_t camera_num_iterations = 0;     // camera_head refinement iterations; must be > 0 for the runtime
+    int32_t ggml_threads = 8;              // thread count passed to ggml_backend_cpu_set_n_threads()
+    std::vector<int32_t> aggregator_selected_layers;  // copied into lingbot_map_runtime_result::selected_layers
+
+    bool output_pose = true;         // output toggles; their consumers are not visible in this header
+    bool output_depth = true;
+    bool output_point_cloud = true;
+};
+
+struct lingbot_map_context {  // non-copyable handle with a pimpl; instances come only from create()
+    lingbot_map_context(const lingbot_map_context &) = delete;
+    lingbot_map_context & operator=(const lingbot_map_context &) = delete;
+    ~lingbot_map_context();
+
+    static std::unique_ptr<lingbot_map_context> create(const std::string & config_dir);  // factory; the default constructor is private
+
+    const lingbot_map_config & config() const;  // settings used by the graph builders and the runtime
+    const std::string & architecture() const;
+    int64_t tensor_count() const;
+
+    ggml_context * ggml_ctx() const;
+    const ggml_tensor * tensor(const std::string & name) const;  // lookup by tensor name; missing-name behaviour not visible here
+
+    lingbot_map_aggregator_input build_aggregator_input(  // packs ViT tokens into the aggregator input structure
+            const float * vit_tokens,
+            int32_t       n_frames,
+            int32_t       vit_tokens_per_frame,
+            int32_t       hidden_size,
+            int32_t       image_h,
+            int32_t       image_w,
+            int32_t       num_frame_for_scale = 1) const;
+
+    lingbot_map_graph_probe_result build_aggregator_frame_block_probe(  // probes one frame block (block_index defaults to 0)
+            const lingbot_map_aggregator_input & input,
+            int32_t                              block_index = 0) const;
+
+    lingbot_map_graph_probe_result build_aggregator_global_block_probe(  // probes one global block (block_index defaults to 0)
+            const lingbot_map_aggregator_input & input,
+            int32_t                              block_index = 0) const;
+
+    lingbot_map_aggregator_probe_result build_aggregator_block_probes(  // returns a frame + global probe pair
+            const lingbot_map_aggregator_input & input,
+            int32_t                              block_index = 0) const;
+
+    lingbot_map_full_aggregator_probe_result build_full_aggregator_probe(
+            const lingbot_map_aggregator_input & input) const;
+
+    lingbot_map_aggregator_graph_result build_aggregator_graph(
+            const lingbot_map_aggregator_input & input) const;
+
+    lingbot_map_camera_head_graph_result build_camera_head_graph(  // presumably builds the graph without running it - TODO confirm
+            const lingbot_map_aggregator_input & input) const;
+
+    lingbot_map_runtime_result run_aggregator_camera_head(  // executes the graph; throws std::invalid_argument / std::runtime_error
+            const lingbot_map_aggregator_input & input,
+            bool                                 prefer_smt = true) const;  // const, yet it updates runtime state cached behind pimpl_
+
+  private:
+    lingbot_map_context() = default;
+    struct impl;
+    std::unique_ptr<impl> pimpl_;
+};
diff --git a/tools/mtmd/smt-vision-preprocess.cpp b/tools/mtmd/smt-vision-preprocess.cpp
index 88780a1ab2b2..4e4741b9d118 100644
--- a/tools/mtmd/smt-vision-preprocess.cpp
+++ b/tools/mtmd/smt-vision-preprocess.cpp
@@ -161,6 +161,197 @@ static std::vector<uint8_t> resize_rgb_u8_antialias(const uint8_t * src,
     return out;
 }
 
+// Composites a tightly packed RGBA8 image over an opaque white background and returns packed RGB8.
+static std::vector<uint8_t> rgba_u8_to_rgb_u8_white(const uint8_t * src, int32_t w, int32_t h) {
+    const bool valid = src != nullptr && w > 0 && h > 0;
+    if (!valid) {
+        throw std::runtime_error("Invalid RGBA image dimensions");
+    }
+
+    const size_t n_pixels = (size_t) w * (size_t) h;
+    std::vector<uint8_t> out(n_pixels * 3u, 255);
+    const uint8_t * in_px  = src;
+    uint8_t *       out_px = out.data();
+    for (size_t i = 0; i < n_pixels; ++i, in_px += 4, out_px += 3) {  // linear walk == row-major (y, x) scan
+        const uint32_t alpha = in_px[3];
+        const uint32_t white = 255u * (255u - alpha) + 127u;  // white contribution plus the round-to-nearest bias
+        out_px[0] = (uint8_t) ((in_px[0] * alpha + white) / 255u);
+        out_px[1] = (uint8_t) ((in_px[1] * alpha + white) / 255u);
+        out_px[2] = (uint8_t) ((in_px[2] * alpha + white) / 255u);
+    }
+    return out;
+}
+
+// Resizes a tightly packed RGB8 image with a separable bicubic filter (a = -0.5) evaluated in
+// 22-bit fixed point, mirroring Pillow's 8-bit resampler: a horizontal pass, then a vertical
+// pass, skipping whichever axis keeps its size. Returns the dst_w x dst_h RGB8 buffer.
+// Throws std::runtime_error when src.size() != src_w * src_h * 3 or any dimension is <= 0.
+static std::vector<uint8_t> resize_rgb_u8_pillow_bicubic(const std::vector<uint8_t> & src,
+                                                         int32_t                      src_w,
+                                                         int32_t                      src_h,
+                                                         int32_t                      dst_w,
+                                                         int32_t                      dst_h) {
+    if (src.size() != (size_t) src_w * (size_t) src_h * 3u || src_w <= 0 || src_h <= 0 || dst_w <= 0 || dst_h <= 0) {
+        throw std::runtime_error("Invalid Pillow bicubic resize dimensions");
+    }
+    if (src_w == dst_w && src_h == dst_h) {
+        return src;  // nothing to resample
+    }
+
+    constexpr int    precision_bits = 32 - 8 - 2;  // fixed-point fraction bits for 8-bit samples in an int32 accumulator
+    constexpr double filter_support = 2.0;         // the bicubic kernel is zero for |x| >= 2
+
+    auto bicubic_filter = [](double x) -> double {
+        constexpr double a = -0.5;
+        if (x < 0.0) {
+            x = -x;
+        }
+        if (x < 1.0) {
+            return ((a + 2.0) * x - (a + 3.0)) * x * x + 1.0;
+        }
+        if (x < 2.0) {
+            return (((x - 5.0) * x + 8.0) * x - 4.0) * a;
+        }
+        return 0.0;
+    };
+
+    auto clip8 = [](int32_t value) -> uint8_t {
+        return (uint8_t) std::clamp<int32_t>(value, 0, 255);  // saturate to the 8-bit range
+    };
+
+    auto precompute_weights = [&](int32_t in_size, int32_t out_size,
+                                  std::vector<int32_t> & bounds,
+                                  std::vector<int32_t> & weights) -> int32_t {
+        const double scale = (double) in_size / (double) out_size;
+        const double filterscale = std::max(1.0, scale);  // widen the kernel only when downscaling
+        const double support = filter_support * filterscale;
+        const int32_t ksize = (int32_t) std::ceil(support) * 2 + 1;
+        const double ss = 1.0 / filterscale;
+        const double fxp_scale = std::ldexp(1.0, precision_bits);
+
+        bounds.resize((size_t) out_size * 2u);  // per output sample: {first input index, tap count}
+        weights.assign((size_t) out_size * (size_t) ksize, 0);
+
+        for (int32_t out = 0; out < out_size; ++out) {
+            const double center = ((double) out + 0.5) * scale;
+            int32_t xmin = (int32_t) (center - support + 0.5);
+            int32_t xmax = (int32_t) (center + support + 0.5);
+            xmin = std::max(0, xmin);
+            xmax = std::min(in_size, xmax);
+            const int32_t count = xmax - xmin;
+
+            bounds[(size_t) out * 2u + 0u] = xmin;
+            bounds[(size_t) out * 2u + 1u] = count;
+
+            double weight_sum = 0.0;
+            std::vector<double> tmp((size_t) ksize, 0.0);
+            for (int32_t k = 0; k < count; ++k) {
+                const double w = bicubic_filter(((double) k + (double) xmin - center + 0.5) * ss);
+                tmp[(size_t) k] = w;
+                weight_sum += w;
+            }
+            if (weight_sum != 0.0) {
+                for (int32_t k = 0; k < count; ++k) {
+                    tmp[(size_t) k] /= weight_sum;  // normalise so the taps sum to 1
+                }
+            }
+            for (int32_t k = 0; k < ksize; ++k) {
+                double v = tmp[(size_t) k] * fxp_scale;
+                // Round half away from zero exactly once: bias by +/-0.5, then truncate (as Pillow does).
+                v += tmp[(size_t) k] < 0.0 ? -0.5 : 0.5;
+                v = std::trunc(v);
+                v = std::clamp(v, (double) std::numeric_limits<int32_t>::min(), (double) std::numeric_limits<int32_t>::max());
+                weights[(size_t) out * (size_t) ksize + (size_t) k] = (int32_t) v;
+            }
+        }
+        return ksize;
+    };
+
+    auto resample_horizontal = [&](const std::vector<uint8_t> & input,
+                                   std::vector<uint8_t> &       output,
+                                   int32_t                      in_w,
+                                   int32_t                      in_h,
+                                   int32_t                      out_w,
+                                   int32_t                      ksize,
+                                   const std::vector<int32_t> & bounds,
+                                   const std::vector<int32_t> & weights) {
+        output.resize((size_t) out_w * (size_t) in_h * 3u);
+        for (int32_t y = 0; y < in_h; ++y) {
+            for (int32_t x = 0; x < out_w; ++x) {
+                const int32_t xmin = bounds[(size_t) x * 2u + 0u];
+                const int32_t count = bounds[(size_t) x * 2u + 1u];
+                // Seed with half an output LSB so the final right shift rounds to nearest.
+                int32_t acc[3] = { 1 << (precision_bits - 1), 1 << (precision_bits - 1), 1 << (precision_bits - 1) };
+                for (int32_t k = 0; k < count; ++k) {
+                    const size_t src_idx = ((size_t) y * (size_t) in_w + (size_t) (xmin + k)) * 3u;
+                    const int32_t w = weights[(size_t) x * (size_t) ksize + (size_t) k];
+                    acc[0] += (int32_t) input[src_idx + 0u] * w;
+                    acc[1] += (int32_t) input[src_idx + 1u] * w;
+                    acc[2] += (int32_t) input[src_idx + 2u] * w;
+                }
+                const size_t dst_idx = ((size_t) y * (size_t) out_w + (size_t) x) * 3u;
+                output[dst_idx + 0u] = clip8(acc[0] >> precision_bits);
+                output[dst_idx + 1u] = clip8(acc[1] >> precision_bits);
+                output[dst_idx + 2u] = clip8(acc[2] >> precision_bits);
+            }
+        }
+    };
+
+    auto resample_vertical = [&](const std::vector<uint8_t> & input,
+                                 std::vector<uint8_t> &       output,
+                                 int32_t                      in_w,
+                                 int32_t                      in_h,
+                                 int32_t                      out_h,
+                                 int32_t                      ksize,
+                                 const std::vector<int32_t> & bounds,
+                                 const std::vector<int32_t> & weights) {
+        output.resize((size_t) in_w * (size_t) out_h * 3u);
+        for (int32_t y = 0; y < out_h; ++y) {
+            const int32_t ymin = bounds[(size_t) y * 2u + 0u];
+            const int32_t count = bounds[(size_t) y * 2u + 1u];
+            for (int32_t x = 0; x < in_w; ++x) {
+                int32_t acc[3] = { 1 << (precision_bits - 1), 1 << (precision_bits - 1), 1 << (precision_bits - 1) };
+                for (int32_t k = 0; k < count; ++k) {
+                    const size_t src_idx = ((size_t) (ymin + k) * (size_t) in_w + (size_t) x) * 3u;
+                    const int32_t w = weights[(size_t) y * (size_t) ksize + (size_t) k];
+                    acc[0] += (int32_t) input[src_idx + 0u] * w;
+                    acc[1] += (int32_t) input[src_idx + 1u] * w;
+                    acc[2] += (int32_t) input[src_idx + 2u] * w;
+                }
+                const size_t dst_idx = ((size_t) y * (size_t) in_w + (size_t) x) * 3u;
+                output[dst_idx + 0u] = clip8(acc[0] >> precision_bits);
+                output[dst_idx + 1u] = clip8(acc[1] >> precision_bits);
+                output[dst_idx + 2u] = clip8(acc[2] >> precision_bits);
+            }
+        }
+    };
+
+    std::vector<int32_t> bounds_x;
+    std::vector<int32_t> bounds_y;
+    std::vector<int32_t> weights_x;
+    std::vector<int32_t> weights_y;
+    const bool need_x = src_w != dst_w;
+    const bool need_y = src_h != dst_h;
+    const int32_t ksize_x = need_x ? precompute_weights(src_w, dst_w, bounds_x, weights_x) : 0;
+    const int32_t ksize_y = need_y ? precompute_weights(src_h, dst_h, bounds_y, weights_y) : 0;
+
+    if (need_x && need_y) {
+        std::vector<uint8_t> tmp;  // intermediate image: dst_w x src_h
+        resample_horizontal(src, tmp, src_w, src_h, dst_w, ksize_x, bounds_x, weights_x);
+        std::vector<uint8_t> out;
+        resample_vertical(tmp, out, dst_w, src_h, dst_h, ksize_y, bounds_y, weights_y);
+        return out;
+    }
+    if (need_x) {
+        std::vector<uint8_t> out;
+        resample_horizontal(src, out, src_w, src_h, dst_w, ksize_x, bounds_x, weights_x);
+        return out;
+    }
+    std::vector<uint8_t> out;  // only the height changes
+    resample_vertical(src, out, src_w, src_h, dst_h, ksize_y, bounds_y, weights_y);
+    return out;
+}
+
 static std::vector<float> rgb_u8_to_chw_f32(const std::vector<uint8_t> & src,
                                             int32_t                      w,
                                             int32_t                      h,
@@ -279,3 +470,107 @@ smt_vision_preprocess_result smt_vision_preprocess_if_image(const std::vector<ui
         throw;
     }
 }
+
+
+// Decodes a batch of encoded images into one normalized float tensor for LingBot-MAP.
+//
+// Each image is decoded as RGBA, converted to RGB by rgba_u8_to_rgb_u8_white(), and resized to
+// target_w columns; the height is scaled by the same factor and rounded to a multiple of
+// patch_size (never less than patch_size). Taller results are center-cropped to target_h rows,
+// shorter ones are centered and the missing rows filled with 255. Every sample is stored as
+// (v / 255 - mean[c]) / std_values[c], where a std of exactly 0 is replaced by 1.
+//
+// The tensor is laid out image-major, then channel, then row, then column. resized_heights holds
+// each image's height after resizing and before crop/pad.
+//
+// Throws std::invalid_argument for an empty batch, non-positive dimensions, null mean/std
+// pointers, an empty/oversized/undecodable payload, or an image whose resized height would not
+// fit in int32_t.
+smt_lingbot_map_preprocess_result smt_lingbot_map_preprocess_images(
+        const std::vector<std::vector<uint8_t>> & images,
+        int32_t                                   target_w,
+        int32_t                                   target_h,
+        int32_t                                   patch_size,
+        const float                               mean[3],
+        const float                               std_values[3]) {
+    if (images.empty()) {
+        throw std::invalid_argument("LingBot-MAP preprocessing requires at least one image");
+    }
+    if (target_w <= 0 || target_h <= 0 || patch_size <= 0) {
+        throw std::invalid_argument("Invalid LingBot-MAP preprocessing dimensions");
+    }
+    if (mean == nullptr || std_values == nullptr) {
+        throw std::invalid_argument("LingBot-MAP preprocessing requires mean and std values");
+    }
+
+    smt_lingbot_map_preprocess_result out;
+    out.target_w = target_w;
+    out.target_h = target_h;
+    out.n_images = (int32_t) images.size();
+    out.tensor_nchw.resize((size_t) out.n_images * 3u * (size_t) target_h * (size_t) target_w);
+    out.resized_heights.reserve(images.size());
+
+    const size_t image_plane = (size_t) target_h * (size_t) target_w;
+    for (size_t i = 0; i < images.size(); ++i) {
+        const auto & input = images[i];
+        if (input.empty() || input.size() > (size_t) std::numeric_limits<int>::max()) {
+            throw std::invalid_argument("Invalid LingBot-MAP image payload");
+        }
+
+        int       src_w = 0, src_h = 0, src_c = 0;
+        uint8_t * pixels = stbi_load_from_memory(input.data(), (int) input.size(), &src_w, &src_h, &src_c,
+                                                 /* desired_channels */ 4);
+        if (pixels == nullptr || src_w <= 0 || src_h <= 0) {
+            if (pixels != nullptr) {
+                stbi_image_free(pixels);
+            }
+            throw std::invalid_argument("LingBot-MAP input is not a supported image");
+        }
+
+        try {
+            // Round in double and range-check before narrowing: an extreme aspect ratio in an
+            // uploaded image could otherwise overflow int32_t, which is undefined behaviour.
+            const double scaled_h  = (double) src_h * (double) target_w / (double) src_w;
+            const double rounded_h = std::round(scaled_h / (double) patch_size) * (double) patch_size;
+            if (rounded_h > (double) std::numeric_limits<int32_t>::max()) {
+                throw std::invalid_argument("LingBot-MAP image aspect ratio is not supported");
+            }
+            const int32_t resized_h = std::max(patch_size, (int32_t) rounded_h);
+            out.resized_heights.push_back(resized_h);
+
+            const auto rgb = rgba_u8_to_rgb_u8_white(pixels, src_w, src_h);
+            // The RGBA buffer is no longer needed; release it now and null the pointer so the
+            // catch block below does not free it twice.
+            stbi_image_free(pixels);
+            pixels = nullptr;
+
+            const auto resized = resize_rgb_u8_pillow_bicubic(rgb, src_w, src_h, target_w, resized_h);
+
+            // At most one of crop_y / pad_y is non-zero.
+            const int32_t crop_y = resized_h > target_h ? (resized_h - target_h) / 2 : 0;
+            const int32_t pad_y  = resized_h < target_h ? (target_h - resized_h) / 2 : 0;
+
+            for (int32_t c = 0; c < 3; ++c) {
+                const float denom = std_values[c] == 0.0f ? 1.0f : std_values[c];
+                float * dst = out.tensor_nchw.data() + ((i * 3u + (size_t) c) * image_plane);
+                for (int32_t y = 0; y < target_h; ++y) {
+                    const int32_t src_y = y + crop_y - pad_y;
+                    for (int32_t x = 0; x < target_w; ++x) {
+                        uint8_t v = 255;  // rows outside the resized image are padding
+                        if (src_y >= 0 && src_y < resized_h) {
+                            v = resized[((size_t) src_y * (size_t) target_w + (size_t) x) * 3u + (size_t) c];
+                        }
+                        dst[(size_t) y * (size_t) target_w + (size_t) x] = (((float) v / 255.0f) - mean[c]) / denom;
+                    }
+                }
+            }
+        } catch (...) {
+            if (pixels != nullptr) {
+                stbi_image_free(pixels);
+            }
+            throw;
+        }
+    }
+
+    return out;
+}
diff --git a/tools/mtmd/smt-vision-preprocess.h b/tools/mtmd/smt-vision-preprocess.h
index b4cc6a1a69bb..7aae24b8a4be 100644
--- a/tools/mtmd/smt-vision-preprocess.h
+++ b/tools/mtmd/smt-vision-preprocess.h
@@ -19,6 +19,14 @@ struct smt_vision_preprocess_config {
     bool  has_normalize_config = false;
 };
 
+struct smt_lingbot_map_preprocess_result {  // batch output of smt_lingbot_map_preprocess_images()
+    int32_t              target_w = 0;     // width of every image plane, copied from the target_w argument
+    int32_t              target_h = 0;     // height of every image plane, copied from the target_h argument
+    int32_t              n_images = 0;     // number of images packed into tensor_nchw
+    std::vector<float>   tensor_nchw;      // normalized samples: n_images * 3 * target_h * target_w floats
+    std::vector<int32_t> resized_heights;  // per-image height after resizing, before crop/pad to target_h
+};
+
 // If input bytes decode as image (jpg/png/webp/...), preprocess them into
 // float32 NCHW bytes for SMT vision ONNX input. Otherwise returns was_image=false.
 smt_vision_preprocess_result smt_vision_preprocess_if_image(const std::vector<uint8_t> &         input,
@@ -26,3 +34,11 @@ smt_vision_preprocess_result smt_vision_preprocess_if_image(const std::vector<ui
                                                             int32_t                              input_width  = 0,
                                                             int32_t                              input_height = 0,
                                                             const smt_vision_preprocess_config * config = nullptr);
+
+smt_lingbot_map_preprocess_result smt_lingbot_map_preprocess_images(
+        const std::vector<std::vector<uint8_t>> & images,         // encoded image bytes, one buffer per image; must not be empty
+        int32_t                                   target_w,       // output width in pixels; must be > 0
+        int32_t                                   target_h,       // output height in pixels; must be > 0
+        int32_t                                   patch_size,     // resized height is rounded to a multiple of this; must be > 0
+        const float                               mean[3],        // per-channel value subtracted after scaling samples by 1/255
+        const float                               std_values[3]); // per-channel divisor; a value of 0 is treated as 1
diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index 4d23f33f1350..7c5420a5658f 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -1202,7 +1202,7 @@ static server_tokens tokenize_input_subprompt(const llama_vocab *         vocab,
                                               bool                        parse_special) {
     constexpr char JSON_STRING_PROMPT_KEY[] = "prompt_string";
     constexpr char JSON_MTMD_DATA_KEY[]     = "multimodal_data";
-    const bool     has_mtmd                 = mctx != nullptr || smt_ctx != nullptr;
+    const bool     has_mtmd                 = mctx != nullptr || server_smt_vision_supports_prompt_embeddings(smt_ctx);
     if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
         // string or mixed
         llama_tokens tmp = tokenize_mixed(vocab, json_prompt, add_special, parse_special);
@@ -1223,7 +1223,7 @@ static server_tokens tokenize_input_subprompt(const llama_vocab *         vocab,
             for (const auto & entry : json_prompt.at(JSON_MTMD_DATA_KEY)) {
                 files.push_back(base64_decode(entry));
             }
-            if (smt_ctx != nullptr) {
+            if (server_smt_vision_supports_prompt_embeddings(smt_ctx)) {
                 return process_smt_prompt(smt_ctx, vocab, json_prompt.at(JSON_STRING_PROMPT_KEY), files, add_special,
                                           parse_special);
             }
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 1372da3135e4..eec80696ad3f 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -715,6 +715,7 @@ struct server_context_impl {
 
     common_context_seq_rm_type ctx_tgt_seq_rm_type = COMMON_CONTEXT_SEQ_RM_TYPE_NO;
     common_context_seq_rm_type ctx_dft_seq_rm_type = COMMON_CONTEXT_SEQ_RM_TYPE_NO;
+    bool model_less_reconstruction = false;
 
     common_speculative_ptr spec;
 
@@ -749,7 +750,7 @@ struct server_context_impl {
 
     bool sleeping = false;
 
-    bool has_multimodal() const { return mctx != nullptr || smt_ctx != nullptr; }
+    bool has_multimodal() const { return mctx != nullptr || server_smt_vision_supports_prompt_embeddings(smt_ctx); }  // an SMT context counts only when it reports prompt-embedding support
 
     const char * vision_backend_name() const {
         switch (vision_backend) {
@@ -819,6 +820,57 @@ struct server_context_impl {
         params_base = params;
         params_base.n_outputs_max = server_n_outputs_max(params_base);
 
+#if defined(LLAMA_SERVER_SMT_VISION)
+        const std::string & lingbot_backend_config_dir = params_base.smt_config_dir;
+        const bool is_lingbot_map_reconstruct =
+            (params_base.media_backend == "smt" || params_base.media_backend == "auto") &&
+            server_smt_vision_config_is_lingbot_map(lingbot_backend_config_dir);
+
+        if (is_lingbot_map_reconstruct) {
+            try {
+                smt_ctx = server_smt_vision_init(nullptr, lingbot_backend_config_dir, params_base.warmup);
+            } catch (const std::exception & e) {
+                SRV_ERR("failed to load LingBot-MAP SMT backend from '%s': %s\n", lingbot_backend_config_dir.c_str(), e.what());
+                return false;
+            }
+
+            vision_backend = SERVER_VISION_BACKEND_SMT;
+            model_less_reconstruction = true;
+            chat_params = {
+                /* use_jinja             */ params_base.use_jinja,
+                /* prefill_assistant     */ params_base.prefill_assistant,
+                /* reasoning_format      */ params_base.reasoning_format,
+                /* chat_template_kwargs  */ params_base.default_template_kwargs,
+                /* tmpls                 */ nullptr,
+                /* allow_image           */ false,
+                /* allow_audio           */ false,
+                /* image_bin_only        */ false,
+                /* media_backend         */ vision_backend_name(),
+                /* enable_thinking       */ false,
+                /* reasoning_budget      */ params_base.sampling.reasoning_budget_tokens,
+                /* reasoning_budget_msg  */ params_base.sampling.reasoning_budget_message,
+                /* media_path            */ params_base.media_path,
+                /* force_pure_content    */ params_base.force_pure_content_parser
+            };
+
+            if (!params_base.model_alias.empty()) {
+                model_name = *params_base.model_alias.begin();
+            } else if (!params_base.model.name.empty()) {
+                model_name = params_base.model.name;
+            } else {
+                model_name = "lingbot-map";
+            }
+            model_aliases = params_base.model_alias;
+            model_tags    = params_base.model_tags;
+
+            params = params_base;
+            if (!is_resume) {
+                return init();
+            }
+            return true;
+        }
+#endif
+
         std::string & mmproj_path = params_base.mmproj.path;
         bool has_mmproj = !mmproj_path.empty();
         mtmd_context_params mparams = mtmd_context_params_default();
@@ -1029,9 +1081,9 @@ struct server_context_impl {
         server_vision_backend_mode selected_backend = SERVER_VISION_BACKEND_NONE;
 #if defined(LLAMA_SERVER_SMT_VISION)
         const std::string & backend_pref = params_base.media_backend;
+        const std::string & backend_config_dir = params_base.smt_config_dir;
         if (backend_pref == "auto") {
-            const std::string & smt_config_dir = params_base.smt_config_dir;
-            if (!smt_config_dir.empty()) {
+            if (!backend_config_dir.empty()) {
                 selected_backend = SERVER_VISION_BACKEND_SMT;
             } else if (!mmproj_path.empty()) {
                 selected_backend = SERVER_VISION_BACKEND_MTMD;
@@ -1070,9 +1122,9 @@ struct server_context_impl {
 
 #if defined(LLAMA_SERVER_SMT_VISION)
         } else if (selected_backend == SERVER_VISION_BACKEND_SMT) {
-            const std::string & smt_config_dir = params_base.smt_config_dir;
+            const std::string & smt_config_dir = backend_config_dir;
             if (smt_config_dir.empty()) {
-                SRV_ERR("%s", "media backend 'smt' selected but --smt-config-dir is empty\n");
+                SRV_ERR("%s", "media backend 'smt' selected but --smt-config-dir is not set\n");
                 return false;
             }
             try {
@@ -1259,8 +1311,10 @@ struct server_context_impl {
 
     // unlike load_model(), this is only called once during initialization
     bool init() {
-        GGML_ASSERT(ctx_tgt != nullptr);
-        GGML_ASSERT(model_tgt != nullptr);
+        if (!model_less_reconstruction) {
+            GGML_ASSERT(ctx_tgt != nullptr);
+            GGML_ASSERT(model_tgt != nullptr);
+        }
 
         GGML_ASSERT(!sleeping);
 
@@ -1301,6 +1355,10 @@ struct server_context_impl {
             }
         }
 
+        if (model_less_reconstruction) {
+            return true;
+        }
+
         // populate chat template params
         {
             common_chat_templates_ptr chat_templates;
@@ -2076,12 +2134,12 @@ struct server_context_impl {
     bool tokenize_cli_input(server_task & task) {
         try {
             auto & prompt = task.cli_prompt;
-            if (smt_ctx != nullptr) {
+            if (server_smt_vision_supports_prompt_embeddings(smt_ctx)) {
                 task.tokens = process_smt_prompt(smt_ctx, vocab, prompt, task.cli_files);
             } else if (mctx != nullptr) {
                 task.tokens = process_mtmd_prompt(mctx, prompt, task.cli_files);
             } else {
-                task.tokens = std::move(tokenize_input_prompts(vocab, mctx, smt_ctx, prompt, true, true)[0]);
+                task.tokens = std::move(tokenize_input_prompts(vocab, mctx, server_smt_vision_supports_prompt_embeddings(smt_ctx) ? smt_ctx : nullptr, prompt, true, true)[0]);
             }
             task.cli_prompt.clear();
             task.cli_files.clear();
@@ -3734,7 +3792,7 @@ struct server_context_impl {
         SRV_DBG("%s", "run slots completed\n");
     }
 
-    int get_slot_n_ctx() { return slots.back().n_ctx; }
+    int get_slot_n_ctx() { return slots.empty() ? 0 : slots.back().n_ctx; }  // n_ctx of the last slot, or 0 when no slots exist
 
     server_response_reader get_response_reader() {
         return server_response_reader(queue_tasks, queue_results, HTTP_POLLING_SECONDS);
@@ -3771,8 +3829,9 @@ server_response_reader server_context::get_response_reader() {
 }
 
 server_context_meta server_context::get_meta() const {
-    auto bos_id        = llama_vocab_bos(impl->vocab);
-    auto eos_id        = llama_vocab_eos(impl->vocab);
+    const bool has_vocab = impl->vocab != nullptr && impl->ctx_tgt != nullptr;
+    auto bos_id = has_vocab ? llama_vocab_bos(impl->vocab) : LLAMA_TOKEN_NULL;
+    auto eos_id = has_vocab ? llama_vocab_eos(impl->vocab) : LLAMA_TOKEN_NULL;
     auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx_tgt, bos_id, true) : "";
     auto eos_token_str = eos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx_tgt, eos_id, true) : "";
 
@@ -3786,31 +3845,32 @@ server_context_meta server_context::get_meta() const {
         /* has_mtmd               */ impl->has_multimodal(),
         /* has_inp_image          */ impl->chat_params.allow_image,
         /* has_inp_audio          */ impl->chat_params.allow_audio,
+        /* has_reconstruction     */ server_smt_vision_is_lingbot_map(impl->smt_ctx),
         /* json_ui_settings       */ impl->json_ui_settings,
         /* json_webui_settings    */ impl->json_webui_settings,  // Deprecated
         /* slot_n_ctx             */ impl->get_slot_n_ctx(),
-        /* pooling_type           */ llama_pooling_type(impl->ctx_tgt),
+        /* pooling_type           */ impl->ctx_tgt ? llama_pooling_type(impl->ctx_tgt) : LLAMA_POOLING_TYPE_NONE,
 
         /* chat_params            */ impl->chat_params,
-        /* chat_template_caps     */ common_chat_templates_get_caps(impl->chat_params.tmpls.get()),
+        /* chat_template_caps     */ impl->chat_params.tmpls ? common_chat_templates_get_caps(impl->chat_params.tmpls.get()) : std::map<std::string, bool>{},
 
         /* bos_token_str          */ bos_token_str,
         /* eos_token_str          */ eos_token_str,
-        /* fim_pre_token          */ llama_vocab_fim_pre(impl->vocab),
-        /* fim_sub_token          */ llama_vocab_fim_suf(impl->vocab),
-        /* fim_mid_token          */ llama_vocab_fim_mid(impl->vocab),
-        /* fim_pad_token          */ llama_vocab_fim_pad(impl->vocab),
-        /* fim_rep_token          */ llama_vocab_fim_rep(impl->vocab),
-        /* fim_sep_token          */ llama_vocab_fim_sep(impl->vocab),
+        /* fim_pre_token          */ has_vocab ? llama_vocab_fim_pre(impl->vocab) : LLAMA_TOKEN_NULL,
+        /* fim_sub_token          */ has_vocab ? llama_vocab_fim_suf(impl->vocab) : LLAMA_TOKEN_NULL,
+        /* fim_mid_token          */ has_vocab ? llama_vocab_fim_mid(impl->vocab) : LLAMA_TOKEN_NULL,
+        /* fim_pad_token          */ has_vocab ? llama_vocab_fim_pad(impl->vocab) : LLAMA_TOKEN_NULL,
+        /* fim_rep_token          */ has_vocab ? llama_vocab_fim_rep(impl->vocab) : LLAMA_TOKEN_NULL,
+        /* fim_sep_token          */ has_vocab ? llama_vocab_fim_sep(impl->vocab) : LLAMA_TOKEN_NULL,
 
         /* logit_bias_eog         */ impl->params_base.sampling.logit_bias_eog,
 
-        /* model_vocab_type       */ llama_vocab_type(impl->vocab),
-        /* model_vocab_n_tokens   */ llama_vocab_n_tokens(impl->vocab),
-        /* model_n_ctx_train      */ llama_model_n_ctx_train(impl->model_tgt),
-        /* model_n_embd_inp       */ llama_model_n_embd(impl->model_tgt),
-        /* model_n_params         */ llama_model_n_params(impl->model_tgt),
-        /* model_size             */ llama_model_size(impl->model_tgt),
+        /* model_vocab_type       */ has_vocab ? llama_vocab_type(impl->vocab) : LLAMA_VOCAB_TYPE_NONE,
+        /* model_vocab_n_tokens   */ has_vocab ? llama_vocab_n_tokens(impl->vocab) : 0,
+        /* model_n_ctx_train      */ impl->model_tgt ? llama_model_n_ctx_train(impl->model_tgt) : 0,
+        /* model_n_embd_inp       */ impl->model_tgt ? llama_model_n_embd(impl->model_tgt) : 0,
+        /* model_n_params         */ impl->model_tgt ? llama_model_n_params(impl->model_tgt) : 0,
+        /* model_size             */ impl->model_tgt ? llama_model_size(impl->model_tgt) : 0,
     };
 }
 
@@ -3886,7 +3946,7 @@ static int32_t prompt_get_n_before_user(
 
             if (mctx != nullptr) {
                 result = (int32_t) process_mtmd_prompt(mctx, prefix, prefix_files).size();
-            } else if (smt_ctx != nullptr) {
+            } else if (server_smt_vision_supports_prompt_embeddings(smt_ctx)) {
                 result = (int32_t) process_smt_prompt(smt_ctx, vocab, prefix, prefix_files).size();
             } else {
                 result = (int32_t) tokenize_input_prompts(vocab, nullptr, nullptr, prefix, true, true)[0].size();
@@ -3928,9 +3988,10 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(con
         // process prompt
         std::vector<server_tokens> inputs;
 
-        if (res_type != TASK_RESPONSE_TYPE_NONE && (ctx_server.mctx != nullptr || ctx_server.smt_ctx != nullptr)) {
+        if (res_type != TASK_RESPONSE_TYPE_NONE &&
+            (ctx_server.mctx != nullptr || server_smt_vision_supports_prompt_embeddings(ctx_server.smt_ctx))) {
             // OAI-compatible chat path with multimodal backend.
-            if (ctx_server.smt_ctx != nullptr) {
+            if (server_smt_vision_supports_prompt_embeddings(ctx_server.smt_ctx)) {
                 inputs.push_back(
                     process_smt_prompt(ctx_server.smt_ctx, ctx_server.vocab, prompt.get<std::string>(), files));
             } else {
@@ -3938,7 +3999,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(con
             }
         } else {
             // Everything else, including multimodal completions.
-            inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, ctx_server.smt_ctx, prompt, true, true);
+            inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, server_smt_vision_supports_prompt_embeddings(ctx_server.smt_ctx) ? ctx_server.smt_ctx : nullptr, prompt, true, true);
         }
 
         // tasks.reserve(inputs.size()); // TODO: this is inaccurate due to child tasks
@@ -4383,6 +4444,7 @@ void server_routes::init_routes() {
             { "modalities",                  json {
                 {"vision", meta->has_inp_image},
                 {"audio",  meta->has_inp_audio},
+                {"reconstruction", meta->has_reconstruction},
             } },
             { "media_marker",                get_media_marker() },
             { "endpoint_slots",              params.endpoint_slots },
@@ -4489,7 +4551,7 @@ void server_routes::init_routes() {
 
         std::string                prompt = json_value(data, "prompt", std::string());
         std::vector<server_tokens> tokenized_prompts =
-            tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, ctx_server.smt_ctx, prompt, false, true);
+            tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, server_smt_vision_supports_prompt_embeddings(ctx_server.smt_ctx) ? ctx_server.smt_ctx : nullptr, prompt, false, true);
         SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
         data["prompt"] = format_prompt_infill(
             ctx_server.vocab, data.at("input_prefix"), data.at("input_suffix"), data.at("input_extra"), params.n_batch,
@@ -4631,6 +4693,167 @@ void server_routes::init_routes() {
         return res;
     };
 
+    this->post_reconstruct = [this](const server_http_req & req) {  // handler: runs LingBot-MAP reconstruction on the uploaded images and reports the result fields as JSON
+        auto res = create_response();
+#if defined(LLAMA_SERVER_SMT_VISION)
+        if (!server_smt_vision_is_lingbot_map(ctx_server.smt_ctx)) {  // refuse unless the loaded SMT context is LingBot-MAP
+            res->error(format_error_response("The current SMT model is not LingBot-MAP.", ERROR_TYPE_NOT_SUPPORTED));
+            return res;
+        }
+
+        server_smt_lingbot_map_reconstruct_options options;
+        if (!req.body.empty() && req.files.empty()) {  // NOTE(review): options are read only when no files were uploaded, yet such requests are rejected below for having no images - confirm how options should accompany an upload
+            const json body = json::parse(req.body);  // NOTE(review): runs outside the try block below, so any exception it throws propagates out of this handler
+            options.output_pose = json_value(body, "output_pose", options.output_pose);
+            options.output_depth = json_value(body, "output_depth", options.output_depth);
+            options.output_point_cloud = json_value(body, "output_point_cloud", options.output_point_cloud);
+            options.max_frames = json_value(body, "max_frames", options.max_frames);
+        }
+
+        std::vector<std::vector<uint8_t>> images;
+        images.reserve(req.files.size());
+        for (const auto & item : req.files) {
+            images.push_back(item.second.data);  // copies the bytes of each uploaded file
+        }
+        if (images.empty()) {
+            res->error(format_error_response("LingBot-MAP reconstruction requires multipart image uploads.",
+                                             ERROR_TYPE_INVALID_REQUEST));
+            return res;
+        }
+
+        try {
+            const auto result = server_smt_vision_lingbot_map_reconstruct(ctx_server.smt_ctx, images, options);
+            res->ok({
+                {"object", "lingbot_map.reconstruction"},
+                {"status", result.inference_ready ? "ok" : "not_implemented"},
+                {"message", result.message},
+                {"architecture", result.architecture},
+                {"n_images", result.n_images},
+                {"tensor_count", result.tensor_count},
+                {"image_size", result.image_size},
+                {"patch_size", result.patch_size},
+                {"hidden_size", result.hidden_size},
+                {"camera_hidden_size", result.camera_hidden_size},
+                {"preprocess", {
+                    {"width", result.preprocess_width},
+                    {"height", result.preprocess_height},
+                    {"resized_heights", result.resized_heights},
+                }},
+                {"vision_input_shape", result.vision_input_shape},
+                {"vision_output_shape", result.vision_output_shape},
+                {"vision_input_float_count", result.vision_input_float_count},
+                {"vision_output_float_count", result.vision_output_float_count},
+                {"vision_output_frames", result.vision_output_frames},
+                {"vision_output_tokens", result.vision_output_tokens},
+                {"vision_output_hidden", result.vision_output_hidden},
+                {"aggregator_input", {
+                    {"tokens_per_frame", result.aggregator_tokens_per_frame},
+                    {"patch_start_idx", result.aggregator_patch_start_idx},
+                    {"patch_tokens", result.aggregator_patch_tokens},
+                    {"vit_prefix_tokens", result.aggregator_vit_prefix_tokens},
+                }},
+                {"ggml_runtime", {
+                    {"backend", result.ggml_runtime_backend},
+                    {"buffer_type", result.ggml_runtime_buffer_type},
+                    {"graph_nodes", result.ggml_runtime_graph_nodes},
+                }},
+                {"aggregator_probe", {
+                    {"graph_nodes", result.aggregator_probe_graph_nodes},
+                    {"qkv_shape", result.aggregator_probe_qkv_shape},
+                    {"output_shape", result.aggregator_probe_output_shape},
+                    {"global_graph_nodes", result.aggregator_global_probe_graph_nodes},
+                    {"global_input_tokens", result.aggregator_global_probe_input_tokens},
+                    {"global_qkv_shape", result.aggregator_global_probe_qkv_shape},
+                    {"global_output_shape", result.aggregator_global_probe_output_shape},
+                    {"full_graph_nodes", result.aggregator_full_probe_graph_nodes},
+                    {"full_selected_outputs", result.aggregator_full_probe_selected_outputs},
+                    {"full_frame_blocks", result.aggregator_full_probe_frame_blocks},
+                    {"full_global_blocks", result.aggregator_full_probe_global_blocks},
+                    {"full_final_frame_shape", result.aggregator_full_probe_final_frame_shape},
+                    {"full_final_global_shape", result.aggregator_full_probe_final_global_shape},
+                    {"aggregator_graph_nodes", result.aggregator_graph_nodes},
+                    {"aggregator_graph_selected_outputs", result.aggregator_graph_selected_outputs},
+                    {"aggregator_graph_frame_blocks", result.aggregator_graph_frame_blocks},
+                    {"aggregator_graph_global_blocks", result.aggregator_graph_global_blocks},
+                    {"aggregator_graph_tokens_per_frame", result.aggregator_graph_tokens_per_frame},
+                    {"aggregator_graph_patch_start_idx", result.aggregator_graph_patch_start_idx},
+                    {"aggregator_graph_final_frame_shape", result.aggregator_graph_final_frame_shape},
+                    {"aggregator_graph_final_global_shape", result.aggregator_graph_final_global_shape},
+                    {"aggregator_graph_selected_output_shapes", result.aggregator_graph_selected_output_shapes},
+                    {"selected_layers", result.aggregator_selected_layers},
+                }},
+                {"camera_head", {
+                    {"graph_nodes", result.camera_head_graph_nodes},
+                    {"trunk_blocks", result.camera_head_trunk_blocks},
+                    {"iterations", result.camera_head_iterations},
+                    {"pose_dim", result.camera_head_pose_dim},
+                    {"input_shape", result.camera_head_input_shape},
+                    {"final_pose_shape", result.camera_head_final_pose_shape},
+                    {"iteration_pose_shapes", result.camera_head_iteration_pose_shapes},
+                }},
+                {"depth_onnx", {
+                    {"input_count", result.depth_onnx_input_count},
+                    {"output_count", result.depth_onnx_output_count},
+                    {"input_float_count", result.depth_onnx_input_float_count},
+                    {"input_source", result.depth_input_source},
+                    {"input_names", result.depth_input_names},
+                    {"output_names", result.depth_output_names},
+                    {"input_shapes", result.depth_input_shapes},
+                    {"output_shapes", result.depth_output_shapes},
+                    {"output_float_counts", result.depth_output_float_counts},
+                }},
+                {"postprocess", {
+                    {"pose_source", result.pose_output_source},
+                    {"pose_encoding_shape", result.pose_encoding_shape},
+                    {"extrinsic_shape", result.extrinsic_shape},
+                    {"intrinsic_shape", result.intrinsic_shape},
+                    {"world_points_shape", result.world_points_shape},
+                    {"world_points_conf_shape", result.world_points_conf_shape},
+                    {"world_points_bin", {
+                        {"path", result.world_points_path},
+                        {"dtype", "float32"},
+                        {"layout", "xyz"},
+                        {"shape", result.world_points_shape},
+                        {"bytes", result.world_points_bytes},
+                    }},
+                    {"point_count", result.postprocess_point_count},
+                    {"sample_count", result.postprocess_sample_count},
+                    {"pose_encoding_sample", result.pose_encoding_sample},
+                    {"extrinsic_first", result.extrinsic_first},
+                    {"intrinsic_first", result.intrinsic_first},
+                    {"world_points_sample", result.world_points_sample},
+                    {"depth_stats", {
+                        {"min", result.depth_min},
+                        {"max", result.depth_max},
+                        {"mean", result.depth_mean},
+                    }},
+                    {"depth_conf_stats", {
+                        {"min", result.depth_conf_min},
+                        {"max", result.depth_conf_max},
+                        {"mean", result.depth_conf_mean},
+                    }},
+                }},
+                {"onnx_sessions_loaded", result.onnx_sessions_loaded},
+                {"outputs", {
+                    {"pose", result.output_pose},
+                    {"depth", result.output_depth},
+                    {"point_cloud", result.output_point_cloud},
+                }},
+                {"stages", result.stages},
+            });
+        } catch (const std::invalid_argument & e) {  // reported as an invalid-request error
+            res->error(format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST));
+        } catch (const std::exception & e) {  // any other failure is reported as a server error
+            res->error(format_error_response(e.what(), ERROR_TYPE_SERVER));
+        }
+#else
+        GGML_UNUSED(req);
+        res->error(format_error_response("LingBot-MAP reconstruction requires llama-server built with SMT vision support.",
+                                         ERROR_TYPE_NOT_SUPPORTED));
+#endif
+        return res;
+    };
+
     this->get_models = [this](const server_http_req &) {
         auto res = create_response(true);
 
@@ -5043,7 +5266,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_embeddings_impl(cons
     }
 
     auto tokenized_prompts =
-        tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, ctx_server.smt_ctx, prompt, true, true);
+        tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, server_smt_vision_supports_prompt_embeddings(ctx_server.smt_ctx) ? ctx_server.smt_ctx : nullptr, prompt, true, true);
     for (const auto & tokens : tokenized_prompts) {
         // this check is necessary for models that do not add BOS token to the input
         if (tokens.empty()) {
diff --git a/tools/server/server-context.h b/tools/server/server-context.h
index 77a935d47b91..3083b733c5e0 100644
--- a/tools/server/server-context.h
+++ b/tools/server/server-context.h
@@ -22,6 +22,7 @@ struct server_context_meta {
     bool has_mtmd;
     bool has_inp_image;
     bool has_inp_audio;
+    bool has_reconstruction;
     json json_ui_settings;            // Primary: new name
     json json_webui_settings;            // Deprecated: use json_ui_settings instead (kept for backward compat)
     int slot_n_ctx;
@@ -117,6 +118,7 @@ struct server_routes {
     server_http_context::handler_t post_anthropic_messages;
     server_http_context::handler_t post_anthropic_count_tokens;
     server_http_context::handler_t post_apply_template;
+    server_http_context::handler_t post_reconstruct;
     server_http_context::handler_t get_models;
     server_http_context::handler_t post_tokenize;
     server_http_context::handler_t post_detokenize;
diff --git a/tools/server/server-smt-vision.cpp b/tools/server/server-smt-vision.cpp
index bd6631511a0e..38bbc40d0362 100644
--- a/tools/server/server-smt-vision.cpp
+++ b/tools/server/server-smt-vision.cpp
@@ -1,18 +1,32 @@
 #include "server-smt-vision.h"
 
+#include "onnxruntime_cxx_api.h"
+
 #include "common.h"
 #include "log.h"
 
+#include <nlohmann/json.hpp>
+
 #include <algorithm>
 #include <array>
+#include <atomic>
+#include <chrono>
 #include <cctype>
+#include <cinttypes>
+#include <cstddef>
 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
 #include <memory>
 #include <mutex>
+#include <numeric>
 #include <stdexcept>
+#include <system_error>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -20,20 +34,76 @@
 #    include "../mtmd/smt-audio-wrapper.h"
 #    include "../mtmd/smt-vision-preprocess.h"
 #    include "../mtmd/smt-vision-wrapper.h"
+#    include "../mtmd/lingbot-map-wrapper.h"
+#endif
+
+#if defined(LLAMA_SERVER_SMT_VISION)
+namespace onnxruntime {
+extern const OrtApi * g_ort;
+}
 #endif
 
+// Summary of one LingBot-MAP postprocess pass; populated by lingbot_postprocess_reconstruction().
+struct lingbot_map_postprocess_result {
+    std::vector<int64_t> pose_encoding_shape;      // set to { 1, n_frames, 9 }
+    std::vector<int64_t> extrinsic_shape;          // set to { 1, n_frames, 3, 4 }
+    std::vector<int64_t> intrinsic_shape;          // set to { 1, n_frames, 3, 3 }
+    std::vector<int64_t> world_points_shape;       // set to { 1, n_frames, h, w, 3 }
+    std::vector<int64_t> world_points_conf_shape;  // set to { 1, n_frames, h, w }
+    std::vector<float> pose_encoding_sample;       // the first 9 pose values (first frame)
+    std::vector<float> extrinsic_first;            // first frame's 3x4 camera-to-world matrix, row-major
+    std::vector<float> intrinsic_first;            // first frame's 3x3 intrinsic matrix, row-major
+    std::vector<float> world_points_sample;        // xyz triples of at most 64 strided sample points
+    std::string world_points_path;                 // path handed back to the caller (the "response" path)
+    int64_t point_count = 0;                       // n_frames * h * w
+    int64_t world_points_bytes = 0;                // bytes written to the point file; stays 0 when no file is written
+    int32_t sample_count = 0;                      // number of points stored in world_points_sample
+    double depth_min = 0.0;                        // minimum over all depth values
+    double depth_max = 0.0;                        // maximum over all depth values
+    double depth_mean = 0.0;                       // arithmetic mean over all depth values
+    double depth_conf_min = 0.0;                   // minimum over all depth confidence values
+    double depth_conf_max = 0.0;                   // maximum over all depth confidence values
+    double depth_conf_mean = 0.0;                  // arithmetic mean over all depth confidence values
+    std::string pose_source;                       // copied verbatim from the caller-supplied label
+};
+
+struct lingbot_map_onnx_context {  // ONNX Runtime state built by create_lingbot_map_onnx_context()
+    Ort::Env            env{ ORT_LOGGING_LEVEL_WARNING, "lingbot-map" };
+    Ort::Session        vision_session{ nullptr };  // created from cfg.vision_model_path ("vit_encoder")
+    Ort::Session        depth_session{ nullptr };   // created from cfg.depth_model_path ("dpt_head")
+
+    std::vector<std::string>  vision_input_names;       // owned copies of the ViT session's input names
+    std::vector<std::string>  vision_output_names;      // owned copies of the ViT session's output names
+    std::vector<const char *> vision_input_names_raw;   // c_str() pointers into vision_input_names
+    std::vector<const char *> vision_output_names_raw;  // c_str() pointers into vision_output_names
+
+    std::vector<std::string>  depth_input_names;        // owned copies of the DPT session's input names
+    std::vector<std::string>  depth_output_names;       // owned copies of the DPT session's output names
+    std::vector<const char *> depth_input_names_raw;    // c_str() pointers into depth_input_names
+    std::vector<const char *> depth_output_names_raw;   // c_str() pointers into depth_output_names
+
+    std::vector<int64_t> vision_input_shape;                // declared shape of ViT input 0
+    std::vector<std::vector<int64_t>> depth_input_shapes;   // declared shape of every DPT input
+    std::vector<std::vector<int64_t>> depth_output_shapes;  // declared shape of every DPT output
+    int32_t              vision_input_h = 0;  // dim 3 of a rank-5 ViT input when positive, else 0
+    int32_t              vision_input_w = 0;  // dim 4 of a rank-5 ViT input when positive, else 0
+};
+
 #if defined(_WIN32)
 #    include <io.h>
 #    include <windows.h>
 #else
 #    include <fcntl.h>
 #    include <unistd.h>
+#    include <dlfcn.h>
 #endif
 
 struct server_smt_vision_context {
 #if defined(LLAMA_SERVER_SMT_VISION)
-    std::unique_ptr<smt_vision_context> smt_vision;
-    std::unique_ptr<smt_audio_context>  smt_audio;
+    std::unique_ptr<smt_vision_context>      smt_vision;
+    std::unique_ptr<smt_audio_context>       smt_audio;
+    std::unique_ptr<lingbot_map_context>     lingbot_map;
+    std::unique_ptr<lingbot_map_onnx_context> lingbot_onnx;
 #endif
     std::mutex               mu;
     int32_t                  hidden_size   = 0;
@@ -43,8 +113,478 @@ struct server_smt_vision_context {
     std::vector<llama_token> tok_audio_beg;
     std::vector<llama_token> tok_audio_end;
     std::string              architecture;
+    std::string              config_dir;
 };
 
+// Resident set size of this process in MiB, or -1 when it cannot be determined.
+static int64_t lingbot_current_rss_mb() {
+#if defined(_WIN32)
+    return -1;
+#else
+    // The first two fields read from /proc/self/statm are page counts: total, then resident.
+    std::ifstream statm_stream("/proc/self/statm");
+    int64_t       total_pages    = 0;
+    int64_t       resident_pages = 0;
+    statm_stream >> total_pages >> resident_pages;
+    const long    bytes_per_page = sysconf(_SC_PAGESIZE);
+    if (statm_stream.fail() || bytes_per_page <= 0) {
+        return -1;
+    }
+    constexpr int64_t bytes_per_mib = 1024 * 1024;
+    return (resident_pages * (int64_t) bytes_per_page) / bytes_per_mib;
+#endif
+}
+
+// Milliseconds elapsed on std::chrono::steady_clock since `start`, truncated to a whole number by duration_cast.
+static int64_t lingbot_elapsed_ms(std::chrono::steady_clock::time_point start) {
+    return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count();
+}
+
+static void lingbot_log_rss(const char * stage) {
+    // Print the current resident set size to stderr tagged with `stage`; stay silent when it is unavailable (negative).
+    if (const int64_t resident_mib = lingbot_current_rss_mb(); resident_mib >= 0) {
+        std::cerr << "[LingBot-MAP][mem] " << stage << " rss=" << resident_mib << " MiB\n";
+    }
+}
+
+// Picks a unique point-cloud file under "<root>/lingbot_map_outputs"; returns { absolute write path, relative path }.
+static std::pair<std::string, std::string> lingbot_make_world_points_paths(const std::string & config_dir) {
+    namespace fs = std::filesystem;
+
+    // The root is `config_dir`, or the system temporary directory when `config_dir` is empty.
+    std::error_code ec;
+    const fs::path root = config_dir.empty() ? fs::temp_directory_path(ec) : fs::path(config_dir);
+    if (ec) {
+        throw std::runtime_error("failed to resolve temporary directory for LingBot-MAP point cloud output");
+    }
+
+    const fs::path subdir  = "lingbot_map_outputs";
+    const fs::path out_dir = root / subdir;
+    if (fs::create_directories(out_dir, ec); ec) {
+        throw std::runtime_error("failed to create LingBot-MAP point cloud output directory: " + out_dir.string());
+    }
+
+    // Wall-clock microseconds plus a process-wide sequence number keep the file names distinct.
+    static std::atomic<uint64_t> next_seq{ 0 };
+    const auto     since_epoch = std::chrono::system_clock::now().time_since_epoch();
+    const uint64_t micros      = (uint64_t) std::chrono::duration_cast<std::chrono::microseconds>(since_epoch).count();
+    const uint64_t seq         = next_seq.fetch_add(1, std::memory_order_relaxed);
+    const fs::path rel_path = subdir / ("world_points_" + std::to_string(micros) + "_" + std::to_string(seq) + ".f32.bin");
+    return { fs::absolute(root / rel_path).string(), rel_path.generic_string() };
+}
+
+static std::vector<const char *> lingbot_make_name_ptrs(const std::vector<std::string> & names) {
+    // Borrowed C-string pointers, one per name; they are valid only while `names` is alive and unmodified.
+    std::vector<const char *> raw(names.size(), nullptr);
+    for (size_t i = 0; i < names.size(); ++i) {
+        raw[i] = names[i].c_str();
+    }
+    return raw;
+}
+
+static std::vector<int64_t> lingbot_get_io_shape(Ort::Session & session, bool inputs, size_t index) {
+    // Declared tensor shape of the index-th model input (inputs == true) or output (inputs == false).
+    Ort::TypeInfo io_info = inputs ? session.GetInputTypeInfo(index) : session.GetOutputTypeInfo(index);
+    return io_info.GetTensorTypeAndShapeInfo().GetShape();
+}
+
+static std::vector<std::string> lingbot_get_io_names(Ort::Session & session, bool inputs) {
+    Ort::AllocatorWithDefaultOptions ort_alloc;
+    const size_t n_names = inputs ? session.GetInputCount() : session.GetOutputCount();
+    std::vector<std::string> result;
+    result.reserve(n_names);
+    for (size_t idx = 0; idx != n_names; ++idx) {  // each name is copied into an owned std::string
+        const auto name = inputs ? session.GetInputNameAllocated(idx, ort_alloc) : session.GetOutputNameAllocated(idx, ort_alloc);
+        result.emplace_back(name.get());
+    }
+    return result;
+}
+
+// Loads libspacemit_ep.so and registers the Spacemit EP on `options`; returns false and sets `error_message` on failure.
+static bool lingbot_init_spacemit_execution_provider(
+        Ort::SessionOptions & options,
+        const std::unordered_map<std::string, std::string> & provider_options,
+        std::string & error_message) {
+    std::vector<const char *> keys;
+    std::vector<const char *> values;
+    keys.reserve(provider_options.size());
+    values.reserve(provider_options.size());
+    for (const auto & entry : provider_options) {
+        keys.push_back(entry.first.c_str());
+        values.push_back(entry.second.c_str());
+    }
+
+#if defined(_WIN32)
+    GGML_UNUSED(options);
+    GGML_UNUSED(keys);
+    GGML_UNUSED(values);
+    error_message = "Spacemit EP dynamic initialization is not implemented on Windows";
+    return false;
+#else
+    const auto dl_error_text = [] { const char * msg = dlerror(); return std::string(msg ? msg : "unknown error"); };  // dlerror() may return NULL
+    void * handle = dlopen("libspacemit_ep.so", RTLD_NOW);
+    if (!handle) {
+        error_message = "failed to load libspacemit_ep.so: " + dl_error_text();
+        return false;
+    }
+    auto * ep_init = reinterpret_cast<OrtStatus * (*) (OrtSessionOptions *, const char * const *, const char * const *, size_t)>(
+            dlsym(handle, "OrtSessionOptionsSpaceMITEnvInit"));
+    if (!ep_init) {
+        error_message = "failed to find OrtSessionOptionsSpaceMITEnvInit: " + dl_error_text();
+        dlclose(handle);  // nothing from the library is in use yet, so release the handle instead of leaking it
+        return false;
+    }
+    if (OrtStatus * status = ep_init(options, keys.data(), values.data(), keys.size())) {
+        error_message = Ort::GetApi().GetErrorMessage(status);
+        Ort::GetApi().ReleaseStatus(status);
+        return false;
+    }
+    return true;  // the handle is left open from here on; presumably the EP code must stay loaded - TODO confirm
+#endif
+}
+
+static void lingbot_append_spacemit_ep(Ort::SessionOptions & session_options,
+                                       const char * session_name,
+                                       const lingbot_map_config & cfg) {
+    // Registers the Spacemit EP on `session_options` from cfg.ep_config; throws std::runtime_error when that fails.
+    std::unordered_map<std::string, std::string> provider_options = cfg.ep_config;
+    // emplace() leaves keys that are already configured untouched, so these values act only as defaults.
+    provider_options.emplace("SPACEMIT_EP_INTRA_THREAD_NUM", "4");
+    provider_options.emplace("SPACEMIT_EP_INTER_THREAD_NUM", "1");
+
+    std::string error_message;
+    if (!lingbot_init_spacemit_execution_provider(session_options, provider_options, error_message)) {
+        throw std::runtime_error(std::string("[LingBot-MAP] failed to initialize Spacemit EP for ") + session_name + ": " + error_message);
+    }
+
+    std::cerr << "[LingBot-MAP] Spacemit EP enabled for " << session_name << " (";
+    const char * separator = "";  // empty before the first pair so the list no longer starts with ", "
+    for (const auto & pair : provider_options) {
+        std::cerr << separator << pair.first << "=" << pair.second;
+        separator = ", ";
+    }
+    std::cerr << ")\n";
+}
+
+static std::unique_ptr<lingbot_map_onnx_context> create_lingbot_map_onnx_context(const lingbot_map_config & cfg) {
+    // Opens the ViT encoder and DPT head ONNX sessions with the Spacemit EP and caches their IO names and shapes.
+    auto onnx = std::make_unique<lingbot_map_onnx_context>();
+    Ort::SessionOptions vit_opts;
+    Ort::SessionOptions dpt_opts;
+    vit_opts.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
+    dpt_opts.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
+    lingbot_append_spacemit_ep(vit_opts, "vit_encoder", cfg);
+    lingbot_append_spacemit_ep(dpt_opts, "dpt_head", cfg);
+
+    onnx->vision_session = Ort::Session(onnx->env, cfg.vision_model_path.c_str(), vit_opts);
+    onnx->depth_session  = Ort::Session(onnx->env, cfg.depth_model_path.c_str(), dpt_opts);
+    onnx->vision_input_names  = lingbot_get_io_names(onnx->vision_session, true);
+    onnx->vision_output_names = lingbot_get_io_names(onnx->vision_session, false);
+    onnx->depth_input_names   = lingbot_get_io_names(onnx->depth_session, true);
+    onnx->depth_output_names  = lingbot_get_io_names(onnx->depth_session, false);
+
+    onnx->vision_input_shape = lingbot_get_io_shape(onnx->vision_session, true, 0);
+    const auto collect_dpt_shapes = [&onnx](bool is_input, size_t count, std::vector<std::vector<int64_t>> & dst) {
+        dst.reserve(count);
+        for (size_t i = 0; i < count; ++i) {
+            dst.push_back(lingbot_get_io_shape(onnx->depth_session, is_input, i));
+        }
+    };
+    collect_dpt_shapes(true, onnx->depth_input_names.size(), onnx->depth_input_shapes);
+    collect_dpt_shapes(false, onnx->depth_output_names.size(), onnx->depth_output_shapes);
+
+    // Only a rank-5 ViT input with positive dims 3 and 4 pins the image height and width; otherwise they stay 0.
+    const std::vector<int64_t> & vit_shape = onnx->vision_input_shape;
+    if (vit_shape.size() == 5 && vit_shape[3] > 0) {
+        onnx->vision_input_h = (int32_t) vit_shape[3];
+    }
+    if (vit_shape.size() == 5 && vit_shape[4] > 0) {
+        onnx->vision_input_w = (int32_t) vit_shape[4];
+    }
+
+    if (onnx->vision_input_names.empty() || onnx->vision_output_names.empty()) {
+        throw std::runtime_error("LingBot-MAP ViT ONNX session has empty IO signature");
+    }
+    if (onnx->depth_input_names.empty() || onnx->depth_output_names.empty()) {
+        throw std::runtime_error("LingBot-MAP DPT ONNX session has empty IO signature");
+    }
+
+    onnx->vision_input_names_raw  = lingbot_make_name_ptrs(onnx->vision_input_names);
+    onnx->vision_output_names_raw = lingbot_make_name_ptrs(onnx->vision_output_names);
+    onnx->depth_input_names_raw   = lingbot_make_name_ptrs(onnx->depth_input_names);
+    onnx->depth_output_names_raw  = lingbot_make_name_ptrs(onnx->depth_output_names);
+    return onnx;
+}
+
+// Number of elements described by `shape`.
+// Returns 0 for an empty shape or when any dimension is zero or negative.
+static int64_t lingbot_numel(const std::vector<int64_t> & shape) {
+    // Reject unusable shapes first so the product below only ever sees dims >= 1.
+    const bool has_non_positive_dim =
+        std::any_of(shape.begin(), shape.end(), [](int64_t dim) { return dim <= 0; });
+    if (shape.empty() || has_non_positive_dim) {
+        return 0;
+    }
+
+    // Multiply all dimensions together, starting from 1.
+    return std::accumulate(shape.begin(), shape.end(), int64_t{ 1 },
+                           [](int64_t acc, int64_t dim) { return acc * dim; });
+}
+
+static std::vector<int64_t> lingbot_make_depth_input_shape(
+        const std::vector<int64_t> & onnx_shape,
+        int32_t                      n_frames,
+        int32_t                      tokens_per_frame,
+        int32_t                      camera_hidden_size) {
+    // Resolves the DPT input shape: negative dims take the runtime value, all other dims must already equal it.
+    if (onnx_shape.size() != 4) {
+        throw std::runtime_error("LingBot-MAP DPT input must be rank-4 [1, frames, tokens, hidden]");
+    }
+    const std::array<int64_t, 4> wanted = { 1, n_frames, tokens_per_frame, camera_hidden_size };
+    std::vector<int64_t> resolved(4);
+    for (size_t axis = 0; axis < wanted.size(); ++axis) {
+        const int64_t declared = onnx_shape[axis];
+        if (declared >= 0 && declared != wanted[axis]) {
+            throw std::runtime_error("LingBot-MAP DPT input shape does not match aggregator selected output boundary");
+        }
+        resolved[axis] = wanted[axis];
+    }
+    return resolved;
+}
+
+static void lingbot_validate_depth_outputs(const std::vector<Ort::Value> & outputs) {
+    // Expects exactly two tensors: rank-5 depth ending in a size-1 axis, and rank-4 confidence matching its first four dims.
+    if (outputs.size() != 2 || !outputs[0].IsTensor() || !outputs[1].IsTensor()) {
+        throw std::runtime_error("LingBot-MAP DPT ONNX must return depth and depth_conf tensors");
+    }
+    const std::vector<int64_t> depth_dims = outputs[0].GetTensorTypeAndShapeInfo().GetShape();
+    const std::vector<int64_t> conf_dims  = outputs[1].GetTensorTypeAndShapeInfo().GetShape();
+    if (depth_dims.size() != 5 || conf_dims.size() != 4) {
+        throw std::runtime_error("LingBot-MAP DPT ONNX returned unexpected output ranks");
+    }
+    if (depth_dims.back() != 1 || !std::equal(conf_dims.begin(), conf_dims.end(), depth_dims.begin())) {
+        throw std::runtime_error("LingBot-MAP DPT depth/depth_conf output shapes are inconsistent");
+    }
+}
+
+
+// Converts quaternion q = (x, y, z, w) into a row-major 3x3 rotation matrix r.
+// The 2 / |q|^2 scale makes the result independent of the quaternion's length, so q need not be normalized.
+static void lingbot_quat_xyzw_to_mat(const float * q, float r[9]) {
+    const double qx = q[0];
+    const double qy = q[1];
+    const double qz = q[2];
+    const double qw = q[3];
+    const double norm_sq = qx*qx + qy*qy + qz*qz + qw*qw;
+    // A (near-)zero squared norm is replaced by 1 so the division below stays finite.
+    const double s = 2.0 / (norm_sq <= 1e-12 ? 1.0 : norm_sq);
+
+    // Row-major entries, computed in double and narrowed to float on store.
+    const double m[9] = {
+        1.0 - s * (qy*qy + qz*qz), s * (qx*qy - qz*qw),       s * (qx*qz + qy*qw),
+        s * (qx*qy + qz*qw),       1.0 - s * (qx*qx + qz*qz), s * (qy*qz - qx*qw),
+        s * (qx*qz - qy*qw),       s * (qy*qz + qx*qw),       1.0 - s * (qx*qx + qy*qy),
+    };
+    for (int i = 0; i < 9; ++i) {
+        r[i] = (float) m[i];
+    }
+}
+
+// Turns per-frame poses (9 floats each: translation xyz, quaternion xyzw, fov along height, fov along width) and depth
+// maps into camera matrices, depth statistics and world-space points. Points are streamed to `world_points_write_path`
+// as raw float xyz triples (skipped when it is empty). Throws std::runtime_error on bad input or any file failure.
+static lingbot_map_postprocess_result lingbot_postprocess_reconstruction(
+        const float *                       pose_encoding,
+        const std::string &                 pose_source,
+        const float *                       depth,
+        const float *                       depth_conf,
+        const std::vector<int64_t> &        depth_shape,
+        const std::vector<int64_t> &        depth_conf_shape,
+        int32_t                             n_frames,
+        const std::string &                 world_points_write_path,
+        const std::string &                 world_points_response_path) {
+    if (pose_encoding == nullptr || depth == nullptr || depth_conf == nullptr) {
+        throw std::runtime_error("LingBot-MAP postprocess requires pose, depth and depth_conf data");
+    }
+    if (depth_shape.size() != 5 || depth_conf_shape.size() != 4 || depth_shape[0] != 1 ||
+        depth_shape[1] != n_frames || depth_shape[4] != 1 || depth_conf_shape[0] != 1 ||
+        depth_conf_shape[1] != n_frames || depth_conf_shape[2] != depth_shape[2] ||
+        depth_conf_shape[3] != depth_shape[3]) {
+        throw std::runtime_error("LingBot-MAP postprocess received incompatible depth output shapes");
+    }
+
+    const int64_t h = depth_shape[2];
+    const int64_t w = depth_shape[3];
+    const int64_t point_count = (int64_t) n_frames * h * w;
+    if (point_count <= 0) {
+        throw std::runtime_error("LingBot-MAP postprocess requires non-empty depth outputs");
+    }
+
+    lingbot_map_postprocess_result result;
+    result.pose_source = pose_source;
+    result.pose_encoding_shape = { 1, n_frames, 9 };
+    result.extrinsic_shape = { 1, n_frames, 3, 4 };
+    result.intrinsic_shape = { 1, n_frames, 3, 3 };
+    result.world_points_shape = { 1, n_frames, h, w, 3 };
+    result.world_points_conf_shape = { 1, n_frames, h, w };
+    result.point_count = point_count;
+    result.world_points_path = world_points_response_path;
+
+    std::ofstream world_points_file;
+    if (!world_points_write_path.empty()) {
+        world_points_file.open(world_points_write_path, std::ios::binary | std::ios::trunc);
+        if (!world_points_file.is_open()) {
+            throw std::runtime_error("failed to open LingBot-MAP world points output: " + world_points_write_path);
+        }
+    }
+
+    std::vector<float> extrinsics_c2w((size_t) n_frames * 12, 0.0f);
+    std::vector<float> intrinsics((size_t) n_frames * 9, 0.0f);
+    std::vector<float> c2w_rot((size_t) n_frames * 9, 0.0f);
+    std::vector<float> c2w_trans((size_t) n_frames * 3, 0.0f);
+
+    for (int32_t f = 0; f < n_frames; ++f) {
+        const float * p = pose_encoding + (size_t) f * 9;
+        float r[9];
+        lingbot_quat_xyzw_to_mat(p + 3, r);
+        // A non-positive field of view falls back to pi/3 rad (60 degrees) before deriving the focal lengths.
+        float fov_h = p[7];
+        float fov_w = p[8];
+        if (fov_h <= 1e-6f) {
+            fov_h = 1.0471975511965977f;
+        }
+        if (fov_w <= 1e-6f) {
+            fov_w = 1.0471975511965977f;
+        }
+        const float fy = (float) ((double) h / 2.0 / std::tan((double) fov_h / 2.0));
+        const float fx = (float) ((double) w / 2.0 / std::tan((double) fov_w / 2.0));
+        float * k = intrinsics.data() + (size_t) f * 9;
+        k[0] = fx;
+        k[4] = fy;
+        k[2] = (float) w / 2.0f;
+        k[5] = (float) h / 2.0f;
+        k[8] = 1.0f;
+        // Camera-to-world rotation: the transpose of r.
+        float * cr = c2w_rot.data() + (size_t) f * 9;
+        cr[0] = r[0]; cr[1] = r[3]; cr[2] = r[6];
+        cr[3] = r[1]; cr[4] = r[4]; cr[5] = r[7];
+        cr[6] = r[2]; cr[7] = r[5]; cr[8] = r[8];
+        // Camera-to-world translation: -(r^T * t), with t = p[0..2].
+        float * ct = c2w_trans.data() + (size_t) f * 3;
+        ct[0] = -(cr[0] * p[0] + cr[1] * p[1] + cr[2] * p[2]);
+        ct[1] = -(cr[3] * p[0] + cr[4] * p[1] + cr[5] * p[2]);
+        ct[2] = -(cr[6] * p[0] + cr[7] * p[1] + cr[8] * p[2]);
+
+        float * c2w = extrinsics_c2w.data() + (size_t) f * 12;
+        c2w[0] = cr[0]; c2w[1] = cr[1]; c2w[2] = cr[2]; c2w[3] = ct[0];
+        c2w[4] = cr[3]; c2w[5] = cr[4]; c2w[6] = cr[5]; c2w[7] = ct[1];
+        c2w[8] = cr[6]; c2w[9] = cr[7]; c2w[10] = cr[8]; c2w[11] = ct[2];
+    }
+
+    result.pose_encoding_sample.assign(pose_encoding, pose_encoding + std::min<int64_t>((int64_t) n_frames * 9, 9));
+    result.extrinsic_first.assign(extrinsics_c2w.begin(), extrinsics_c2w.begin() + std::min<size_t>(extrinsics_c2w.size(), 12));
+    result.intrinsic_first.assign(intrinsics.begin(), intrinsics.begin() + std::min<size_t>(intrinsics.size(), 9));
+
+    const int32_t sample_limit = 64;
+    const int64_t sample_stride = std::max<int64_t>(1, point_count / sample_limit);
+    result.world_points_sample.reserve((size_t) sample_limit * 3);
+
+    double depth_sum = 0.0;
+    double conf_sum = 0.0;
+    result.depth_min = depth[0];
+    result.depth_max = depth[0];
+    result.depth_conf_min = depth_conf[0];
+    result.depth_conf_max = depth_conf[0];
+
+    int32_t sample_count = 0;
+    for (int32_t f = 0; f < n_frames; ++f) {
+        const float * k = intrinsics.data() + (size_t) f * 9;
+        const float * cr = c2w_rot.data() + (size_t) f * 9;
+        const float * ct = c2w_trans.data() + (size_t) f * 3;
+        const float fx_cur = k[0];
+        const float fy_cur = k[4];
+        const float cx = k[2];
+        const float cy = k[5];
+        for (int64_t y = 0; y < h; ++y) {
+            for (int64_t x = 0; x < w; ++x) {
+                const int64_t idx = ((int64_t) f * h + y) * w + x;
+                const float d = depth[idx];
+                const float c = depth_conf[idx];
+                result.depth_min = std::min(result.depth_min, (double) d);
+                result.depth_max = std::max(result.depth_max, (double) d);
+                result.depth_conf_min = std::min(result.depth_conf_min, (double) c);
+                result.depth_conf_max = std::max(result.depth_conf_max, (double) c);
+                depth_sum += d;
+                conf_sum += c;
+                // Back-project the pixel through the pinhole intrinsics, then move it into world space.
+                const float cam_x = ((float) x - cx) * d / fx_cur;
+                const float cam_y = ((float) y - cy) * d / fy_cur;
+                const float cam_z = d;
+                const float world_xyz[3] = {
+                    cr[0] * cam_x + cr[1] * cam_y + cr[2] * cam_z + ct[0],
+                    cr[3] * cam_x + cr[4] * cam_y + cr[5] * cam_z + ct[1],
+                    cr[6] * cam_x + cr[7] * cam_y + cr[8] * cam_z + ct[2],
+                };
+
+                if (world_points_file.is_open()) {
+                    world_points_file.write(reinterpret_cast<const char *>(world_xyz), sizeof(world_xyz));
+                    if (!world_points_file) {
+                        throw std::runtime_error("failed to write LingBot-MAP world points output: " + world_points_write_path);
+                    }
+                }
+
+                if (idx % sample_stride == 0 && sample_count < sample_limit) {
+                    result.world_points_sample.insert(result.world_points_sample.end(), world_xyz, world_xyz + 3);
+                    sample_count += 1;
+                }
+            }
+        }
+    }
+    if (world_points_file.is_open()) {
+        world_points_file.close();  // flushes buffered data; a failed flush or close sets failbit
+        if (!world_points_file) {
+            throw std::runtime_error("failed to write LingBot-MAP world points output: " + world_points_write_path);
+        }
+        result.world_points_bytes = point_count * 3 * (int64_t) sizeof(float);
+    }
+    result.depth_mean = depth_sum / (double) point_count;
+    result.depth_conf_mean = conf_sum / (double) point_count;
+    result.sample_count = sample_count;
+    return result;
+}
+
+bool server_smt_vision_config_is_lingbot_map(const std::string & config_dir) {
+    // True only when <config_dir>/config.json names the LingBot-MAP class under "architectures"; any failure yields false.
+    static const char * const k_lingbot_arch = "LingBotMapFor3DReconstruction";
+    if (config_dir.empty()) {
+        return false;
+    }
+
+    std::ifstream config_file(config_dir + "/config.json");
+    if (!config_file.is_open()) {
+        return false;
+    }
+
+    try {
+        const nlohmann::json config = nlohmann::json::parse(config_file);
+        const auto it = config.find("architectures");
+        if (it == config.end()) {
+            return false;
+        }
+        if (it->is_string()) {
+            return it->get<std::string>() == k_lingbot_arch;
+        }
+        if (it->is_array()) {
+            return std::any_of(it->begin(), it->end(), [](const nlohmann::json & entry) {
+                return entry.is_string() && entry.get<std::string>() == k_lingbot_arch;
+            });
+        }
+    } catch (...) {
+        return false;
+    }
+    return false;
+}
+
 static std::string fnv_hash(const uint8_t * data, size_t len) {
     const uint64_t fnv_prime = 0x100000001b3ULL;
     uint64_t       hash      = 0xcbf29ce484222325ULL;
@@ -406,8 +946,24 @@ static int decode_embd(llama_context * lctx,
 server_smt_vision_context * server_smt_vision_init(llama_context * lctx, const std::string & config_dir, bool warmup) {
 #if defined(LLAMA_SERVER_SMT_VISION)
     auto        ctx = std::make_unique<server_smt_vision_context>();
+    ctx->config_dir = config_dir;
     std::string primary_architecture;
 
+    if (server_smt_vision_config_is_lingbot_map(config_dir)) {
+        GGML_UNUSED(lctx);
+        GGML_UNUSED(warmup);
+        onnxruntime::g_ort = OrtGetApiBase()->GetApi(ORT_API_VERSION);
+        ctx->lingbot_map = lingbot_map_context::create(config_dir);
+        ctx->lingbot_onnx = create_lingbot_map_onnx_context(ctx->lingbot_map->config());
+        ctx->architecture = ctx->lingbot_map->architecture();
+        LOG_INF("[server-smt] loaded LingBot-MAP model from '%s', architecture=%s, tensors=%" PRId64 "\n",
+                config_dir.c_str(), ctx->architecture.c_str(), ctx->lingbot_map->tensor_count());
+        LOG_INF("[server-smt] loaded LingBot-MAP ONNX sessions: vit_inputs=%zu, vit_outputs=%zu, dpt_inputs=%zu, dpt_outputs=%zu\n",
+                ctx->lingbot_onnx->vision_input_names.size(), ctx->lingbot_onnx->vision_output_names.size(),
+                ctx->lingbot_onnx->depth_input_names.size(), ctx->lingbot_onnx->depth_output_names.size());
+        return ctx.release();
+    }
+
     try {
         ctx->smt_vision      = smt_vision_context::create(config_dir, warmup);
         ctx->hidden_size     = (int32_t) ctx->smt_vision->hidden_size();
@@ -472,6 +1028,282 @@ bool server_smt_vision_supports_audio(const server_smt_vision_context * ctx) {
         ;
 }
 
+bool server_smt_vision_supports_prompt_embeddings(const server_smt_vision_context * ctx) {
+#if defined(LLAMA_SERVER_SMT_VISION)
+    return ctx != nullptr && (ctx->smt_vision != nullptr || ctx->smt_audio != nullptr);  // needs a vision or audio encoder
+#else
+    return ctx != nullptr;
+#endif
+}
+
+bool server_smt_vision_is_lingbot_map(const server_smt_vision_context * ctx) {
+#if defined(LLAMA_SERVER_SMT_VISION)
+    return ctx != nullptr && ctx->lingbot_map != nullptr;  // true only when a LingBot-MAP model is attached
+#else
+    return ctx != nullptr;
+#endif
+}
+
+// Runs the LingBot-MAP pipeline (serialized on ctx->mu): image preprocess -> ViT ONNX ->
+// aggregator/camera-head GGML runtime -> DPT ONNX -> postprocess, and returns a result
+// describing every stage. Throws std::invalid_argument for an empty or over-limit image
+// list and std::runtime_error for a missing model/session or a malformed stage output.
+server_smt_lingbot_map_reconstruct_result server_smt_vision_lingbot_map_reconstruct(
+        server_smt_vision_context * ctx,
+        const std::vector<std::vector<uint8_t>> & images,
+        const server_smt_lingbot_map_reconstruct_options & options) {
+#if defined(LLAMA_SERVER_SMT_VISION)
+    if (ctx == nullptr || ctx->lingbot_map == nullptr) {
+        throw std::runtime_error("SMT context does not contain a LingBot-MAP model");
+    }
+    if (images.empty()) {
+        throw std::invalid_argument("LingBot-MAP reconstruction requires at least one image");
+    }
+    if (options.max_frames > 0 && (int32_t) images.size() > options.max_frames) {
+        throw std::invalid_argument("LingBot-MAP reconstruction request exceeds max_frames");
+    }
+
+    std::lock_guard<std::mutex> lock(ctx->mu);
+    lingbot_log_rss("request_start");
+
+    const auto & cfg = ctx->lingbot_map->config();
+    if (ctx->lingbot_onnx == nullptr) {
+        throw std::runtime_error("LingBot-MAP ONNX sessions are not loaded");
+    }
+
+    const int32_t input_w = ctx->lingbot_onnx->vision_input_w > 0 ? ctx->lingbot_onnx->vision_input_w : cfg.image_size;
+    const int32_t input_h = ctx->lingbot_onnx->vision_input_h > 0 ? ctx->lingbot_onnx->vision_input_h : cfg.image_size;
+
+    auto preproc = smt_lingbot_map_preprocess_images(images, input_w, input_h, cfg.patch_size,
+                                                     cfg.image_mean, cfg.image_std);
+    lingbot_log_rss("after_preprocess");
+
+    std::vector<int64_t> input_shape = { 1, (int64_t) images.size(), 3, input_h, input_w };
+    auto memory_info = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
+    Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, preproc.tensor_nchw.data(),
+                                                              preproc.tensor_nchw.size(),
+                                                              input_shape.data(), input_shape.size());
+
+    auto stage_start = std::chrono::steady_clock::now();
+    auto vit_outputs = ctx->lingbot_onnx->vision_session.Run(
+            Ort::RunOptions{ nullptr },
+            ctx->lingbot_onnx->vision_input_names_raw.data(),
+            &input_tensor, 1,
+            ctx->lingbot_onnx->vision_output_names_raw.data(),
+            ctx->lingbot_onnx->vision_output_names_raw.size());
+    std::cerr << "[LingBot-MAP][time] vit_onnx_ms=" << lingbot_elapsed_ms(stage_start) << "\n";
+    lingbot_log_rss("after_vit_onnx");
+
+    if (vit_outputs.empty() || !vit_outputs[0].IsTensor()) {
+        throw std::runtime_error("LingBot-MAP ViT ONNX did not return a tensor output");
+    }
+
+    auto tensor_info = vit_outputs[0].GetTensorTypeAndShapeInfo();
+    std::vector<int64_t> vision_output_shape = tensor_info.GetShape();
+    const int64_t vision_output_float_count = (int64_t) tensor_info.GetElementCount();
+    if (tensor_info.GetElementType() != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {  // GetTensorData<float>() below does not verify this
+        throw std::runtime_error("LingBot-MAP ViT output must be a float32 tensor");
+    }
+    if (vision_output_shape.size() != 3) {
+        throw std::runtime_error("LingBot-MAP ViT output must be rank-3 [frames, tokens, hidden]");
+    }
+    if (vision_output_shape[0] != (int64_t) images.size()) {
+        throw std::runtime_error("LingBot-MAP ViT output frame count does not match input image count");
+    }
+    if (vision_output_shape[2] != cfg.hidden_size) {
+        throw std::runtime_error("LingBot-MAP ViT output hidden size does not match config hidden_size");
+    }
+    if (vision_output_shape[1] <= 0) {
+        throw std::runtime_error("LingBot-MAP ViT output token count must be positive");
+    }
+
+    const float * vit_output_data = vit_outputs[0].GetTensorData<float>();
+    const auto aggregator_input = ctx->lingbot_map->build_aggregator_input(
+            vit_output_data,
+            (int32_t) vision_output_shape[0], (int32_t) vision_output_shape[1], (int32_t) vision_output_shape[2],
+            input_h, input_w,
+            /* num_frame_for_scale */ 1);
+    lingbot_log_rss("after_aggregator_input");
+    stage_start = std::chrono::steady_clock::now();
+    const auto runtime = ctx->lingbot_map->run_aggregator_camera_head(aggregator_input, /* prefer_smt */ true);
+    std::cerr << "[LingBot-MAP][time] aggregator_camera_total_ms=" << lingbot_elapsed_ms(stage_start) << "\n";
+    lingbot_log_rss("after_aggregator_camera_ggml");
+
+    if (ctx->lingbot_onnx->depth_input_names.size() != runtime.selected_output_shapes.size() ||
+        runtime.selected_output_shapes.size() != runtime.selected_outputs.size()) {
+        throw std::runtime_error("LingBot-MAP DPT input count does not match aggregator runtime selected output count");
+    }
+
+    std::vector<std::vector<int64_t>> depth_input_shapes;
+    depth_input_shapes.reserve(ctx->lingbot_onnx->depth_input_names.size());
+    int64_t depth_input_float_count = 0;
+    for (size_t i = 0; i < ctx->lingbot_onnx->depth_input_names.size(); ++i) {
+        const auto & selected_shape = runtime.selected_output_shapes[i];
+        if (selected_shape.size() != 4 || selected_shape[0] != cfg.camera_hidden_size ||
+            selected_shape[1] != aggregator_input.tokens_per_frame || selected_shape[2] != aggregator_input.n_frames) {
+            throw std::runtime_error("LingBot-MAP aggregator selected output shape is not compatible with DPT input");
+        }
+        const auto dpt_shape = lingbot_make_depth_input_shape(ctx->lingbot_onnx->depth_input_shapes[i],  // named to avoid shadowing the ViT input_shape
+                                                              aggregator_input.n_frames,
+                                                              aggregator_input.tokens_per_frame,
+                                                              cfg.camera_hidden_size);
+        depth_input_float_count += lingbot_numel(dpt_shape);
+        depth_input_shapes.push_back(dpt_shape);
+    }
+
+    std::vector<float> depth_input_storage((size_t) depth_input_float_count, 0.0f);  // one contiguous buffer backing every DPT input tensor
+    std::vector<Ort::Value> depth_input_tensors;
+    depth_input_tensors.reserve(depth_input_shapes.size());
+    size_t depth_input_offset = 0;
+    for (size_t i = 0; i < depth_input_shapes.size(); ++i) {
+        const auto & shape = depth_input_shapes[i];
+        const int64_t n_elem = lingbot_numel(shape);
+        if ((size_t) n_elem != runtime.selected_outputs[i].size()) {
+            throw std::runtime_error("LingBot-MAP runtime selected output size does not match DPT input shape");
+        }
+        std::copy(runtime.selected_outputs[i].begin(), runtime.selected_outputs[i].end(),
+                  depth_input_storage.begin() + (std::ptrdiff_t) depth_input_offset);
+        depth_input_tensors.push_back(Ort::Value::CreateTensor<float>(memory_info,
+                                                                      depth_input_storage.data() + depth_input_offset,
+                                                                      (size_t) n_elem,
+                                                                      shape.data(), shape.size()));
+        depth_input_offset += (size_t) n_elem;
+    }
+    lingbot_log_rss("after_dpt_input_pack");
+
+    stage_start = std::chrono::steady_clock::now();
+    auto depth_outputs = ctx->lingbot_onnx->depth_session.Run(
+            Ort::RunOptions{ nullptr },
+            ctx->lingbot_onnx->depth_input_names_raw.data(),
+            depth_input_tensors.data(), depth_input_tensors.size(),
+            ctx->lingbot_onnx->depth_output_names_raw.data(),
+            ctx->lingbot_onnx->depth_output_names_raw.size());
+    std::cerr << "[LingBot-MAP][time] dpt_onnx_ms=" << lingbot_elapsed_ms(stage_start) << "\n";
+    lingbot_log_rss("after_dpt_onnx");
+    lingbot_validate_depth_outputs(depth_outputs);
+
+    std::vector<std::vector<int64_t>> depth_output_shapes;
+    std::vector<int64_t> depth_output_float_counts;
+    depth_output_shapes.reserve(depth_outputs.size());
+    depth_output_float_counts.reserve(depth_outputs.size());
+    for (const auto & output : depth_outputs) {
+        auto output_info = output.GetTensorTypeAndShapeInfo();
+        depth_output_shapes.push_back(output_info.GetShape());
+        depth_output_float_counts.push_back((int64_t) output_info.GetElementCount());
+    }
+
+    if (runtime.pose_encoding.size() != (size_t) aggregator_input.n_frames * 9) {
+        throw std::runtime_error("LingBot-MAP runtime pose output shape does not match frame count");
+    }
+
+    const bool save_point_cloud = options.output_point_cloud && cfg.output_point_cloud;
+    const auto world_points_paths = save_point_cloud ? lingbot_make_world_points_paths(ctx->config_dir) : std::pair<std::string, std::string>{};
+    const auto postprocess = lingbot_postprocess_reconstruction(
+            runtime.pose_encoding.data(),
+            "camera_head_ggml_runtime",
+            depth_outputs[0].GetTensorData<float>(),
+            depth_outputs[1].GetTensorData<float>(),
+            depth_output_shapes[0], depth_output_shapes[1],
+            aggregator_input.n_frames,
+            world_points_paths.first, world_points_paths.second);
+    lingbot_log_rss("after_postprocess");
+
+    server_smt_lingbot_map_reconstruct_result result;
+    result.architecture = ctx->lingbot_map->architecture();
+    result.message = "LingBot-MAP ViT ONNX inference completed; aggregator/camera_head GGML runtime ran on SMT; DPT ONNX ran; postprocess completed";
+    result.stages = {
+        "config_loaded",
+        "images_preprocessed",
+        "vit_onnx_ran",
+        "aggregator_input_prepared",
+        "aggregator_camera_head_ggml_runtime_ran",
+        "depth_onnx_ran",
+        "postprocess_completed",
+    };
+    if (!postprocess.world_points_path.empty()) {
+        result.stages.push_back("point_cloud_bin_saved");
+    }
+    result.tensor_count = ctx->lingbot_map->tensor_count();
+    result.n_images = (int32_t) images.size();
+    result.image_size = cfg.image_size;
+    result.patch_size = cfg.patch_size;
+    result.hidden_size = cfg.hidden_size;
+    result.camera_hidden_size = cfg.camera_hidden_size;
+    result.preprocess_width = preproc.target_w;
+    result.preprocess_height = preproc.target_h;
+    result.vision_input_float_count = (int64_t) preproc.tensor_nchw.size();
+    result.vision_output_float_count = vision_output_float_count;
+    result.vision_output_frames = (int32_t) vision_output_shape[0];
+    result.vision_output_tokens = (int32_t) vision_output_shape[1];
+    result.vision_output_hidden = (int32_t) vision_output_shape[2];
+    result.aggregator_tokens_per_frame = aggregator_input.tokens_per_frame;
+    result.aggregator_patch_start_idx = aggregator_input.patch_start_idx;
+    result.aggregator_patch_tokens = aggregator_input.patch_tokens;
+    result.aggregator_vit_prefix_tokens = aggregator_input.vit_prefix_tokens;
+    result.aggregator_graph_nodes = runtime.graph_nodes;
+    result.aggregator_graph_selected_outputs = runtime.selected_output_count;
+    result.aggregator_graph_frame_blocks = runtime.frame_block_count;
+    result.aggregator_graph_global_blocks = runtime.global_block_count;
+    result.aggregator_graph_tokens_per_frame = runtime.tokens_per_frame;
+    result.aggregator_graph_patch_start_idx = runtime.patch_start_idx;
+    result.aggregator_graph_selected_output_shapes = runtime.selected_output_shapes;
+    result.aggregator_selected_layers = runtime.selected_layers;
+    result.camera_head_graph_nodes = runtime.graph_nodes;
+    result.camera_head_trunk_blocks = runtime.camera_trunk_block_count;
+    result.camera_head_iterations = runtime.camera_iteration_count;
+    result.camera_head_pose_dim = runtime.camera_pose_dim;
+    result.camera_head_input_shape = runtime.camera_head_input_shape;
+    result.camera_head_final_pose_shape = runtime.camera_head_final_pose_shape;
+    result.camera_head_iteration_pose_shapes = runtime.camera_head_iteration_pose_shapes;
+    result.ggml_runtime_graph_nodes = runtime.graph_nodes;
+    result.ggml_runtime_backend = runtime.backend_name;
+    result.ggml_runtime_buffer_type = runtime.buffer_type_name;
+    result.depth_onnx_input_count = (int32_t) ctx->lingbot_onnx->depth_input_names.size();
+    result.depth_onnx_output_count = (int32_t) depth_outputs.size();
+    result.depth_onnx_input_float_count = depth_input_float_count;
+    result.depth_input_source = "aggregator_ggml_runtime_selected_outputs";
+    result.depth_input_names = ctx->lingbot_onnx->depth_input_names;
+    result.depth_output_names = ctx->lingbot_onnx->depth_output_names;
+    result.depth_input_shapes = std::move(depth_input_shapes);
+    result.depth_output_shapes = std::move(depth_output_shapes);
+    result.depth_output_float_counts = std::move(depth_output_float_counts);
+    result.pose_output_source = postprocess.pose_source;
+    result.pose_encoding_shape = postprocess.pose_encoding_shape;
+    result.extrinsic_shape = postprocess.extrinsic_shape;
+    result.intrinsic_shape = postprocess.intrinsic_shape;
+    result.world_points_shape = postprocess.world_points_shape;
+    result.world_points_conf_shape = postprocess.world_points_conf_shape;
+    result.pose_encoding_sample = postprocess.pose_encoding_sample;
+    result.extrinsic_first = postprocess.extrinsic_first;
+    result.intrinsic_first = postprocess.intrinsic_first;
+    result.world_points_sample = postprocess.world_points_sample;
+    result.world_points_path = postprocess.world_points_path;
+    result.world_points_bytes = postprocess.world_points_bytes;
+    result.postprocess_point_count = postprocess.point_count;
+    result.postprocess_sample_count = postprocess.sample_count;
+    result.depth_min = postprocess.depth_min;
+    result.depth_max = postprocess.depth_max;
+    result.depth_mean = postprocess.depth_mean;
+    result.depth_conf_min = postprocess.depth_conf_min;
+    result.depth_conf_max = postprocess.depth_conf_max;
+    result.depth_conf_mean = postprocess.depth_conf_mean;
+    result.vision_input_shape = std::move(input_shape);
+    result.vision_output_shape = std::move(vision_output_shape);
+    result.resized_heights = std::move(preproc.resized_heights);
+    result.output_pose = options.output_pose && cfg.output_pose;
+    result.output_depth = options.output_depth && cfg.output_depth;
+    result.output_point_cloud = save_point_cloud;
+    result.onnx_sessions_loaded = true;
+    result.inference_ready = true;
+    return result;
+#else
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(images);
+    GGML_UNUSED(options);
+    throw std::runtime_error("SMT media backend is not compiled");
+#endif
+}
+
 server_smt_image_chunk server_smt_vision_encode_image_bin(server_smt_vision_context *  ctx,
                                                           const std::vector<uint8_t> & data) {
     if (ctx == nullptr) {
diff --git a/tools/server/server-smt-vision.h b/tools/server/server-smt-vision.h
index 44081107bdec..b02b7af0f56b 100644
--- a/tools/server/server-smt-vision.h
+++ b/tools/server/server-smt-vision.h
@@ -27,7 +27,111 @@ struct server_smt_image_chunk {
 
 struct server_smt_vision_context;
 
+struct server_smt_lingbot_map_reconstruct_options {
+    bool output_pose = true;         // reported as (this && model config output_pose)
+    bool output_depth = true;        // reported as (this && model config output_depth)
+    bool output_point_cloud = true;  // ANDed with the config flag; when false no world-points paths are generated
+    int32_t max_frames = -1;         // only enforced when > 0; otherwise the image count is unlimited
+};
+
+struct server_smt_lingbot_map_reconstruct_result {  // filled by server_smt_vision_lingbot_map_reconstruct()
+    std::string architecture;  // architecture string reported by the loaded LingBot-MAP model
+    std::string message;  // fixed human-readable summary of the completed run
+    std::vector<std::string> stages;  // completed stage labels; "point_cloud_bin_saved" is appended only when a world-points path was produced
+
+    int64_t tensor_count = 0;
+    int32_t n_images = 0;  // number of input images in the request
+    int32_t image_size = 0;  // image_size .. camera_hidden_size are copied from the model config
+    int32_t patch_size = 0;
+    int32_t hidden_size = 0;
+    int32_t camera_hidden_size = 0;
+    int32_t preprocess_width = 0;  // target width/height reported by the preprocessing step
+    int32_t preprocess_height = 0;
+    int64_t vision_input_float_count = 0;
+    int64_t vision_output_float_count = 0;
+    int32_t vision_output_frames = 0;  // dims 0/1/2 of the rank-3 ViT output
+    int32_t vision_output_tokens = 0;
+    int32_t vision_output_hidden = 0;
+    int32_t aggregator_tokens_per_frame = 0;
+    int32_t aggregator_patch_start_idx = 0;
+    int32_t aggregator_patch_tokens = 0;
+    int32_t aggregator_vit_prefix_tokens = 0;
+    int32_t aggregator_probe_graph_nodes = 0;  // NOTE: the *_probe_* counters below are not assigned by server_smt_vision_lingbot_map_reconstruct (stay 0)
+    int32_t aggregator_global_probe_graph_nodes = 0;
+    int32_t aggregator_global_probe_input_tokens = 0;
+    int32_t aggregator_full_probe_graph_nodes = 0;
+    int32_t aggregator_full_probe_selected_outputs = 0;
+    int32_t aggregator_full_probe_frame_blocks = 0;
+    int32_t aggregator_full_probe_global_blocks = 0;
+    int32_t aggregator_graph_nodes = 0;  // same runtime graph_nodes value as camera_head_graph_nodes and ggml_runtime_graph_nodes
+    int32_t aggregator_graph_selected_outputs = 0;
+    int32_t aggregator_graph_frame_blocks = 0;
+    int32_t aggregator_graph_global_blocks = 0;
+    int32_t aggregator_graph_tokens_per_frame = 0;
+    int32_t aggregator_graph_patch_start_idx = 0;
+    int32_t camera_head_graph_nodes = 0;  // same runtime graph_nodes value as aggregator_graph_nodes
+    int32_t camera_head_trunk_blocks = 0;
+    int32_t camera_head_iterations = 0;
+    int32_t camera_head_pose_dim = 0;
+    int32_t ggml_runtime_graph_nodes = 0;  // same runtime graph_nodes value as aggregator_graph_nodes
+    int32_t depth_onnx_input_count = 0;
+    int32_t depth_onnx_output_count = 0;
+    int64_t depth_onnx_input_float_count = 0;  // total floats packed across all DPT input tensors
+    int64_t postprocess_point_count = 0;
+    int64_t world_points_bytes = 0;
+    int32_t postprocess_sample_count = 0;
+    double depth_min = 0.0;  // depth_* and depth_conf_* statistics are copied from the postprocess result
+    double depth_max = 0.0;
+    double depth_mean = 0.0;
+    double depth_conf_min = 0.0;
+    double depth_conf_max = 0.0;
+    double depth_conf_mean = 0.0;
+    std::string depth_input_source;  // set to "aggregator_ggml_runtime_selected_outputs"
+    std::string pose_output_source;
+    std::string ggml_runtime_backend;
+    std::string ggml_runtime_buffer_type;
+    std::string world_points_path;  // empty when no point-cloud file was written
+    std::vector<int32_t> aggregator_probe_qkv_shape;  // NOTE: the *_probe_* shapes below are not assigned by server_smt_vision_lingbot_map_reconstruct (stay empty)
+    std::vector<int32_t> aggregator_probe_output_shape;
+    std::vector<int32_t> aggregator_global_probe_qkv_shape;
+    std::vector<int32_t> aggregator_global_probe_output_shape;
+    std::vector<int32_t> aggregator_full_probe_final_frame_shape;
+    std::vector<int32_t> aggregator_full_probe_final_global_shape;
+    std::vector<int32_t> aggregator_graph_final_frame_shape;  // not assigned by server_smt_vision_lingbot_map_reconstruct
+    std::vector<int32_t> aggregator_graph_final_global_shape;  // not assigned by server_smt_vision_lingbot_map_reconstruct
+    std::vector<std::vector<int32_t>> aggregator_graph_selected_output_shapes;
+    std::vector<int32_t> aggregator_selected_layers;
+    std::vector<int32_t> camera_head_input_shape;
+    std::vector<int32_t> camera_head_final_pose_shape;
+    std::vector<std::vector<int32_t>> camera_head_iteration_pose_shapes;
+    std::vector<std::string> depth_input_names;
+    std::vector<std::string> depth_output_names;
+    std::vector<std::vector<int64_t>> depth_input_shapes;
+    std::vector<std::vector<int64_t>> depth_output_shapes;
+    std::vector<int64_t> depth_output_float_counts;
+    std::vector<int64_t> pose_encoding_shape;
+    std::vector<int64_t> extrinsic_shape;
+    std::vector<int64_t> intrinsic_shape;
+    std::vector<int64_t> world_points_shape;
+    std::vector<int64_t> world_points_conf_shape;
+    std::vector<float> pose_encoding_sample;
+    std::vector<float> extrinsic_first;
+    std::vector<float> intrinsic_first;
+    std::vector<float> world_points_sample;
+    std::vector<int64_t> vision_input_shape;  // {1, n_images, 3, input_h, input_w} as passed to the ViT session
+    std::vector<int64_t> vision_output_shape;
+    std::vector<int32_t> resized_heights;
+
+    bool output_pose = true;  // each output_* flag is (request option && model config flag)
+    bool output_depth = true;
+    bool output_point_cloud = true;
+    bool onnx_sessions_loaded = false;  // set to true by a successful reconstruct call
+    bool inference_ready = false;  // set to true by a successful reconstruct call
+};
+
 #if defined(LLAMA_SERVER_SMT_VISION)
+bool server_smt_vision_config_is_lingbot_map(const std::string & config_dir);
+
 server_smt_vision_context * server_smt_vision_init(
         llama_context * lctx,
         const std::string & config_dir,
@@ -37,6 +141,13 @@ void server_smt_vision_free(server_smt_vision_context * ctx);
 
 bool server_smt_vision_supports_image(const server_smt_vision_context * ctx);
 bool server_smt_vision_supports_audio(const server_smt_vision_context * ctx);
+bool server_smt_vision_supports_prompt_embeddings(const server_smt_vision_context * ctx);
+bool server_smt_vision_is_lingbot_map(const server_smt_vision_context * ctx);
+
+server_smt_lingbot_map_reconstruct_result server_smt_vision_lingbot_map_reconstruct(
+        server_smt_vision_context * ctx,
+        const std::vector<std::vector<uint8_t>> & images,
+        const server_smt_lingbot_map_reconstruct_options & options);
 
 server_smt_image_chunk server_smt_vision_encode_media_bin(
         server_smt_vision_context * ctx,
@@ -55,6 +166,10 @@ int32_t server_smt_vision_decode_chunk(
         int32_t n_batch,
         bool logits_last);
 #else
+inline bool server_smt_vision_config_is_lingbot_map(const std::string & /* config_dir */) {
+    return false;  // stub for builds without LLAMA_SERVER_SMT_VISION: no config is treated as LingBot-MAP
+}
+
 inline server_smt_vision_context * server_smt_vision_init(
         llama_context * /* lctx */,
         const std::string & /* config_dir */,
@@ -73,6 +188,21 @@ inline bool server_smt_vision_supports_audio(const server_smt_vision_context * /
     return false;
 }
 
+inline bool server_smt_vision_supports_prompt_embeddings(const server_smt_vision_context * /* ctx */) {
+    return false;  // stub for builds without LLAMA_SERVER_SMT_VISION
+}
+
+inline bool server_smt_vision_is_lingbot_map(const server_smt_vision_context * /* ctx */) {
+    return false;  // stub for builds without LLAMA_SERVER_SMT_VISION
+}
+
+inline server_smt_lingbot_map_reconstruct_result server_smt_vision_lingbot_map_reconstruct(
+        server_smt_vision_context * /* ctx */,
+        const std::vector<std::vector<uint8_t>> & /* images */,
+        const server_smt_lingbot_map_reconstruct_options & /* options */) {
+    throw std::runtime_error("SMT media backend is not compiled");  // stub for builds without LLAMA_SERVER_SMT_VISION: always throws
+}
+
 inline server_smt_image_chunk server_smt_vision_encode_media_bin(
         server_smt_vision_context * /* ctx */,
         const std::vector<uint8_t> & /* data */) {
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 769e80a802f3..71d9efaa49d7 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -3,6 +3,7 @@
 #include "server-models.h"
 #include "server-cors-proxy.h"
 #include "server-tools.h"
+#include "server-smt-vision.h"
 
 #include "arg.h"
 #include "build-info.h"
@@ -89,9 +90,18 @@ int llama_server(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    // router server never loads a model and must not touch the GPU
-    // skip device enumeration so the CUDA primary context stays uncreated
-    const bool is_router_server = params.model.path.empty();
+#if defined(LLAMA_SERVER_SMT_VISION)
+    const bool is_lingbot_map_reconstruct_server =
+        (params.media_backend == "smt" || params.media_backend == "auto") &&
+        server_smt_vision_config_is_lingbot_map(params.smt_config_dir);
+#else
+    const bool is_lingbot_map_reconstruct_server = false;
+#endif
+
+    // router server never loads a model and must not touch the GPU. A LingBot-MAP
+    // reconstruction server is model-less from llama's text-LLM perspective, but still
+    // needs local SMT initialization instead of router proxying.
+    const bool is_router_server = params.model.path.empty() && !is_lingbot_map_reconstruct_server;
     common_params_print_info(params, !is_router_server);
 
     // validate batch size for embeddings
@@ -161,6 +171,7 @@ int llama_server(int argc, char ** argv) {
         routes.post_tokenize               = models_routes->proxy_post;
         routes.post_detokenize             = models_routes->proxy_post;
         routes.post_apply_template         = models_routes->proxy_post;
+        routes.post_reconstruct            = models_routes->proxy_post;
         routes.get_lora_adapters           = models_routes->proxy_get;
         routes.post_lora_adapters          = models_routes->proxy_post;
         routes.get_slots                   = models_routes->proxy_get;
@@ -204,6 +215,8 @@ int llama_server(int argc, char ** argv) {
     ctx_http.post("/tokenize",                 ex_wrapper(routes.post_tokenize));
     ctx_http.post("/detokenize",               ex_wrapper(routes.post_detokenize));
     ctx_http.post("/apply-template",           ex_wrapper(routes.post_apply_template));
+    ctx_http.post("/reconstruct",              ex_wrapper(routes.post_reconstruct));
+    ctx_http.post("/v1/reconstruct",           ex_wrapper(routes.post_reconstruct));
     // LoRA adapters hotswap
     ctx_http.get ("/lora-adapters",            ex_wrapper(routes.get_lora_adapters));
     ctx_http.post("/lora-adapters",            ex_wrapper(routes.post_lora_adapters));

From 1ddff89fd44db98e04753cc2bc39acf2e7f067cd Mon Sep 17 00:00:00 2001
From: co-seven <linxi.cai@spacemit.com>
Date: Mon, 8 Jun 2026 08:02:36 +0000
Subject: [PATCH 2/4] ci: shorten SpacemiT MTMD release tags

---
 .github/variables.env                     |  2 +-
 .github/workflows/build-spacemit-mtmd.yml | 22 ++++++++++++++--------
 VERSION_NUMBER                            |  2 +-
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/.github/variables.env b/.github/variables.env
index 18bf7bf5b9ce..405225b1947e 100644
--- a/.github/variables.env
+++ b/.github/variables.env
@@ -1,6 +1,6 @@
 SPACEMIT_TOOLCHAIN_URL=https://github.com/spacemit-com/toolchain/releases/download/v1.1.2/spacemit-toolchain-linux-glibc-x86_64-v1.1.2.tar.xz
 SPACEMIT_ORT_URL=https://github.com/spacemit-com/onnxruntime/releases/download/2.0.2/spacemit-ort.riscv64.2.0.2.tar.gz
-SPACEMIT_MTMD_RELEASE_TAG=spacemit-llama.cpp.riscv64
+SPACEMIT_MTMD_PACKAGE_PREFIX=spacemit-llama.cpp.riscv64
 SPACEMIT_TOOLCHAIN_ARCHIVE=.cache/spacemit-toolchain.tar.xz
 SPACEMIT_ORT_ARCHIVE=.cache/spacemit-ort.tar.gz
 SPACEMIT_TOOLCHAIN_DIR=spacemit_toolchain
diff --git a/.github/workflows/build-spacemit-mtmd.yml b/.github/workflows/build-spacemit-mtmd.yml
index 57bce4b3206c..621cfcfb3f76 100644
--- a/.github/workflows/build-spacemit-mtmd.yml
+++ b/.github/workflows/build-spacemit-mtmd.yml
@@ -42,12 +42,14 @@ jobs:
             exit 1
           fi
 
-          SPACEMIT_MTMD_RELEASE_TAG="${SPACEMIT_MTMD_RELEASE_TAG}.${VERSION_NUMBER}"
+          SPACEMIT_MTMD_RELEASE_TAG="v${VERSION_NUMBER}"
+          SPACEMIT_MTMD_PACKAGE_NAME="${SPACEMIT_MTMD_PACKAGE_PREFIX}.${VERSION_NUMBER}"
 
           {
             echo "SPACEMIT_TOOLCHAIN_URL=${SPACEMIT_TOOLCHAIN_URL}"
             echo "SPACEMIT_ORT_URL=${SPACEMIT_ORT_URL}"
             echo "SPACEMIT_MTMD_RELEASE_TAG=${SPACEMIT_MTMD_RELEASE_TAG}"
+            echo "SPACEMIT_MTMD_PACKAGE_NAME=${SPACEMIT_MTMD_PACKAGE_NAME}"
             echo "SPACEMIT_TOOLCHAIN_ARCHIVE=${SPACEMIT_TOOLCHAIN_ARCHIVE}"
             echo "SPACEMIT_ORT_ARCHIVE=${SPACEMIT_ORT_ARCHIVE}"
             echo "SPACEMIT_TOOLCHAIN_DIR=${SPACEMIT_TOOLCHAIN_DIR}"
@@ -59,7 +61,9 @@ jobs:
           {
             echo "toolchain_url=${SPACEMIT_TOOLCHAIN_URL}"
             echo "ort_url=${SPACEMIT_ORT_URL}"
+            echo "version_number=${VERSION_NUMBER}"
             echo "release_tag=${SPACEMIT_MTMD_RELEASE_TAG}"
+            echo "package_name=${SPACEMIT_MTMD_PACKAGE_NAME}"
             echo "toolchain_archive=${SPACEMIT_TOOLCHAIN_ARCHIVE}"
             echo "ort_archive=${SPACEMIT_ORT_ARCHIVE}"
             echo "toolchain_dir=${SPACEMIT_TOOLCHAIN_DIR}"
@@ -228,8 +232,8 @@ jobs:
         shell: bash
         run: |
           set -euo pipefail
-          PACKAGE_DIR="release/${SPACEMIT_MTMD_RELEASE_TAG}"
-          ASSET_NAME="${SPACEMIT_MTMD_RELEASE_TAG}.tar.gz"
+          PACKAGE_DIR="release/${SPACEMIT_MTMD_PACKAGE_NAME}"
+          ASSET_NAME="${SPACEMIT_MTMD_PACKAGE_NAME}.tar.gz"
 
           rm -rf "$PACKAGE_DIR"
           mkdir -p "$PACKAGE_DIR"
@@ -240,7 +244,7 @@ jobs:
             find "$PACKAGE_DIR/bin" -maxdepth 1 \( -type f -o -type l \) \( -name 'test*' -o -name 'export-graph-ops*' \) -exec rm -f {} +
           fi
 
-          tar -czf "release/${ASSET_NAME}" -C release "${SPACEMIT_MTMD_RELEASE_TAG}"
+          tar -czf "release/${ASSET_NAME}" -C release "${SPACEMIT_MTMD_PACKAGE_NAME}"
 
       - name: Inspect package
         if: ${{ github.event_name == 'pull_request' || (github.event_name == 'push' && steps.release_guard.outputs.should_publish == 'true') }}
@@ -248,9 +252,9 @@ jobs:
         run: |
           set -euo pipefail
 
-          ASSET_NAME="${SPACEMIT_MTMD_RELEASE_TAG}.tar.gz"
+          ASSET_NAME="${SPACEMIT_MTMD_PACKAGE_NAME}.tar.gz"
           echo "Package tree:"
-          find "release/${SPACEMIT_MTMD_RELEASE_TAG}" -maxdepth 2 -print | sort
+          find "release/${SPACEMIT_MTMD_PACKAGE_NAME}" -maxdepth 2 -print | sort
           echo "Package archive:"
           tar -tzf "release/${ASSET_NAME}"
 
@@ -259,7 +263,7 @@ jobs:
         uses: actions/upload-artifact@v6
         with:
           name: spacemit-mtmd-package
-          path: release/${{ steps.vars.outputs.release_tag }}.tar.gz
+          path: release/${{ steps.vars.outputs.package_name }}.tar.gz
           if-no-files-found: error
           retention-days: 7
 
@@ -271,9 +275,11 @@ jobs:
           tag_name: ${{ steps.vars.outputs.release_tag }}
           name: ${{ steps.vars.outputs.release_tag }}
           target_commitish: ${{ github.sha }}
-          files: release/${{ steps.vars.outputs.release_tag }}.tar.gz
+          files: release/${{ steps.vars.outputs.package_name }}.tar.gz
           body: |
             SpacemiT MTMD build for `spacemit-mtmd`.
+            Version: `${{ steps.vars.outputs.version_number }}`
+            Package: `${{ steps.vars.outputs.package_name }}.tar.gz`
             Commit: `${{ github.sha }}`
           make_latest: false
           overwrite_files: true
diff --git a/VERSION_NUMBER b/VERSION_NUMBER
index d917d3e26adc..b1e80bb2480a 100644
--- a/VERSION_NUMBER
+++ b/VERSION_NUMBER
@@ -1 +1 @@
-0.1.2
+0.1.3

From 7bf649e0f00dcb97dcb835ac50bff58dc4d4e9a1 Mon Sep 17 00:00:00 2001
From: co-seven <linxi.cai@spacemit.com>
Date: Mon, 8 Jun 2026 08:56:56 +0000
Subject: [PATCH 3/4] ci: comment merged PR commit summary

---
 .../workflows/merge-pr-summary-comment.yml    | 87 +++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 .github/workflows/merge-pr-summary-comment.yml

diff --git a/.github/workflows/merge-pr-summary-comment.yml b/.github/workflows/merge-pr-summary-comment.yml
new file mode 100644
index 000000000000..dd78314aecfd
--- /dev/null
+++ b/.github/workflows/merge-pr-summary-comment.yml
@@ -0,0 +1,87 @@
+name: Comment merged PR summary
+
+on:
+  pull_request:
+    branches:
+      - spacemit-mtmd
+    types:
+      - closed
+
+permissions:
+  contents: read
+  pull-requests: write
+  issues: write
+
+jobs:
+  comment:
+    name: Comment merged PR summary
+    if: ${{ github.event.pull_request.merged == true }}
+    runs-on: ubuntu-24.04
+
+    steps:
+      - name: Comment commit summary
+        uses: actions/github-script@v8
+        with:
+          script: |
+            const pr = context.payload.pull_request;
+            const commits = await github.paginate(github.rest.pulls.listCommits, {
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              pull_number: pr.number,
+              per_page: 100,
+            });
+
+            const marker = '<!-- merge-pr-summary-comment -->';
+            const commitLines = commits.map((commit, index) => {
+              const shortSha = commit.sha.substring(0, 7);
+              const title = commit.commit.message.split('\n')[0];
+              return `${index + 1}. \`${shortSha}\` ${title}`;
+            });
+
+            const mergedBy = pr.merged_by ? pr.merged_by.login : context.actor;
+            const body = [
+              marker,
+              `### Merge PR Summary`,
+              ``,
+              `Merged PR #${pr.number}: ${pr.title}`,
+              ``,
+              `- Target branch: \`${pr.base.ref}\``,
+              `- Source branch: \`${pr.head.label}\``,
+              `- Merged by: @${mergedBy}`,
+              `- Merge commit: ${pr.merge_commit_sha ? `\`${pr.merge_commit_sha}\`` : 'unknown'}`,
+              `- Commit count: **${commits.length}**`,
+              ``,
+              `Commits:`,
+              ...commitLines,
+              ``,
+              `PR: ${pr.html_url}`,
+            ].join('\n');
+
+            const comments = await github.paginate(github.rest.issues.listComments, {
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: pr.number,
+              per_page: 100,
+            });
+            const existing = comments.find(comment => comment.body && comment.body.includes(marker));
+
+            if (existing) {
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: existing.id,
+                body,
+              });
+            } else {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: pr.number,
+                body,
+              });
+            }
+
+            await core.summary
+              .addHeading(`Merge PR #${pr.number} summary`, 2)
+              .addCodeBlock(body.replace(marker + '\n', ''), 'text')
+              .write();

From 848725832d76774075d21213d8e9f0c73235ea09 Mon Sep 17 00:00:00 2001
From: co-seven <linxi.cai@spacemit.com>
Date: Mon, 8 Jun 2026 09:50:44 +0000
Subject: [PATCH 4/4] ci: test bugfix

---
 src/llama-model-saver.cpp  | 1 +
 src/models/lingbot-map.cpp | 4 ++--
 src/models/models.h        | 4 ++--
 tests/test-llama-archs.cpp | 4 ++--
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp
index 528e4c9c069f..572ca768fa0f 100644
--- a/src/llama-model-saver.cpp
+++ b/src/llama-model-saver.cpp
@@ -14,6 +14,7 @@
 
 bool llama_model_saver_supports_arch(llm_arch arch) {
     switch (arch) {
+        case LLM_ARCH_LINGBOT_MAP:
         case LLM_ARCH_QWEN3NEXT:
         case LLM_ARCH_QWEN35:
         case LLM_ARCH_QWEN35MOE:
diff --git a/src/models/lingbot-map.cpp b/src/models/lingbot-map.cpp
index 877402d7ee4a..2934fd8b3cd5 100644
--- a/src/models/lingbot-map.cpp
+++ b/src/models/lingbot-map.cpp
@@ -59,10 +59,10 @@ void llama_model_lingbot_map::load_arch_hparams(llama_model_loader & ml) {
     hparams.n_rot_swa = 0;
 }
 
-void llama_model_lingbot_map::load_arch_tensors(llama_model_loader &) {
+[[noreturn]] void llama_model_lingbot_map::load_arch_tensors(llama_model_loader &) {
     throw std::runtime_error("LingBot-MAP GGUF tensors are loaded by the mtmd SMT wrapper, not llama_model");
 }
 
-std::unique_ptr<llm_graph_context> llama_model_lingbot_map::build_arch_graph(const llm_graph_params &) const {
+[[noreturn]] std::unique_ptr<llm_graph_context> llama_model_lingbot_map::build_arch_graph(const llm_graph_params &) const {
     throw std::runtime_error("LingBot-MAP does not support llama_model text graph execution");
 }
diff --git a/src/models/models.h b/src/models/models.h
index 47c099a76d9a..4609a0d063ce 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -176,9 +176,9 @@ struct llama_model_llama_embed : public llama_model_llama {
 struct llama_model_lingbot_map : public llama_model_base {
     llama_model_lingbot_map(const struct llama_model_params & params) : llama_model_base(params) {}
     void load_arch_hparams(llama_model_loader & ml) override;
-    void load_arch_tensors(llama_model_loader & ml) override;
+    [[noreturn]] void load_arch_tensors(llama_model_loader & ml) override;
 
-    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+    [[noreturn]] std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
 
diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp
index 1def7faff605..120e635ec141 100644
--- a/tests/test-llama-archs.cpp
+++ b/tests/test-llama-archs.cpp
@@ -382,8 +382,8 @@ static bool moe_implemented(const llm_arch arch) {
 }
 
 static bool arch_supported(const llm_arch arch) {
-    if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
-        return false; // These models don't have usable implementations.
+    if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_LINGBOT_MAP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
+        return false; // These models don't have usable llama_model text implementations.
     }
     if (arch == LLM_ARCH_CHAMELEON) {
         return false; // Only half-implemented and to be removed in the future.