diff --git a/.gitignore b/.gitignore
index ccd784f..4a54642 100644
--- a/.gitignore
+++ b/.gitignore
@@ -209,3 +209,12 @@ cython_debug/
marimo/_static/
marimo/_lsp/
__marimo__/
+
+# model files
+*.pt
+*.pth
+*.onnx
+*.engine
+*.ts
+
+output/
\ No newline at end of file
diff --git a/README.md b/README.md
index 49392b8..33bff36 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,8 @@ ROS2 perception stack for generalist robotics.
## Packages
- **track_anything** - EdgeTAM tracking + 3D segmentation with RGBD
+- **detect_anything** - YOLO-E detection node and utilities
+- **semantic_mapping** - semantic 3D mapping with VLM query hooks
- **vector_perception_utils** - Image and point cloud utilities
## Installation
@@ -18,9 +20,16 @@ source ~/vector_venv/bin/activate
cd /home/alex-lin/dev/vector_perception_ros
pip install -r requirements.txt
+# System dependencies
+sudo apt install -y ros-dev-tools ros-jazzy-pcl-ros libpcl-dev git cmake libgoogle-glog-dev libgflags-dev libatlas-base-dev libeigen3-dev libsuitesparse-dev nlohmann-json3-dev ros-jazzy-backward-ros
+
# Build
source /opt/ros/jazzy/setup.bash
colcon build
+```
@@ -110,4 +119,5 @@ for det in detections:
See package READMEs:
- [track_anything/README.md](track_anything/README.md)
+- [semantic_mapping/README.md](semantic_mapping/README.md)
- [vector_perception_utils/README.md](vector_perception_utils/README.md)
diff --git a/detect_anything/CMakeLists.txt b/detect_anything/CMakeLists.txt
new file mode 100644
index 0000000..ea8c416
--- /dev/null
+++ b/detect_anything/CMakeLists.txt
@@ -0,0 +1,54 @@
+cmake_minimum_required(VERSION 3.10)
+project(detect_anything)
+
+find_package(ament_cmake REQUIRED)
+find_package(rclpy REQUIRED)
+find_package(sensor_msgs REQUIRED)
+find_package(std_msgs REQUIRED)
+find_package(vision_msgs REQUIRED)
+find_package(cv_bridge REQUIRED)
+find_package(rosidl_default_generators REQUIRED)
+find_package(Python3 REQUIRED COMPONENTS Interpreter)
+
+rosidl_generate_interfaces(${PROJECT_NAME}
+ "msg/DetectionResult.msg"
+ DEPENDENCIES std_msgs sensor_msgs
+)
+
+set(PYTHON_INSTALL_DIR "lib/python${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}/site-packages")
+
+install(
+ DIRECTORY detect_anything
+ DESTINATION ${PYTHON_INSTALL_DIR}
+)
+
+install(
+ PROGRAMS
+ scripts/detection_node
+ DESTINATION lib/${PROJECT_NAME}
+)
+
+install(
+ FILES resource/detect_anything
+ DESTINATION share/${PROJECT_NAME}/resource
+)
+
+install(
+ DIRECTORY config
+ DESTINATION share/${PROJECT_NAME}
+)
+
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/models")
+ install(
+ DIRECTORY models
+ DESTINATION share/${PROJECT_NAME}
+ )
+endif()
+
+set(ENV_HOOK "${CMAKE_CURRENT_SOURCE_DIR}/env-hooks/venv_pythonpath.sh.in")
+set(ENV_HOOK_OUT "${CMAKE_CURRENT_BINARY_DIR}/ament_cmake_environment_hooks/venv_pythonpath.sh")
+configure_file(${ENV_HOOK} ${ENV_HOOK_OUT} @ONLY)
+ament_environment_hooks(${ENV_HOOK_OUT})
+
+ament_export_dependencies(rosidl_default_runtime)
+ament_package()
diff --git a/detect_anything/README.md b/detect_anything/README.md
new file mode 100644
index 0000000..1e94a06
--- /dev/null
+++ b/detect_anything/README.md
@@ -0,0 +1,28 @@
+# detect_anything
+
+YOLO-E detection node that publishes `DetectionResult` messages (with bbox-cropped masks) and an annotated overlay image topic.
+
+## What’s inside
+- `detect_anything/detection.py`: detection results container and Ultralytics parsing helpers.
+- `detect_anything/yoloe.py`: YOLO-E wrapper with prompt support and basic filtering.
+- `detect_anything/detection_node.py`: ROS2 node wiring the detector to `DetectionResult`.
+- `msg/DetectionResult.msg`: compressed image + cropped mask array.
+
+## Quick start
+```bash
+source ~/vector_venv/bin/activate
+source /opt/ros/jazzy/setup.bash
+colcon build --packages-select detect_anything
+source install/setup.bash
+
+ros2 run detect_anything detection_node \
+ --ros-args -p model_path:=/path/to/yoloe/models \
+ -p model_name:=yoloe-11s-seg-pf.pt \
+ -p conf:=0.6 \
+ -p max_area_ratio:=0.3 \
+ -p image_topic:=/camera/image
+```
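+
+The wrapper can also be used directly from Python, without ROS, once the workspace is built and sourced. A minimal sketch (the model path, prompt list, and file names below are illustrative):
+
+```python
+import cv2
+
+from detect_anything.yoloe import Yoloe2DDetector, YoloePromptMode
+
+# Text-prompted detection; weights are resolved (and downloaded if missing) under model_path.
+detector = Yoloe2DDetector(
+    model_path="/path/to/yoloe/models",  # illustrative path
+    model_name="yoloe-11s-seg.pt",
+    prompt_mode=YoloePromptMode.PROMPT,
+    conf=0.5,
+)
+detector.set_prompts(text=["chair", "desk", "plant"])
+
+image = cv2.imread("frame.png")  # any BGR image
+results = detector.process_image(image)
+cv2.imwrite("overlay.png", results.overlay())
+detector.stop()
+```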
+
+Topics:
+- Publishes `/detection_result` (`detect_anything/DetectionResult`) and `/annotated_image_detection` (`sensor_msgs/Image`).
+- Subscribes to `/camera/image` (or `/camera/image/compressed` if `use_compressed:=true`).
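+
+A minimal consumer sketch for the detection topic (illustrative, not part of the package); `DetectionResults.from_msg` rebuilds full-size masks from the bbox-cropped transport format:
+
+```python
+import rclpy
+from rclpy.node import Node
+
+from detect_anything.detection import DetectionResults
+from detect_anything.msg import DetectionResult
+
+
+class DetectionConsumer(Node):
+    def __init__(self) -> None:
+        super().__init__("detection_consumer")
+        # Same topic the detection node publishes on.
+        self.create_subscription(DetectionResult, "/detection_result", self.on_detections, 10)
+
+    def on_detections(self, msg: DetectionResult) -> None:
+        results = DetectionResults.from_msg(msg)  # masks restored at their bbox locations
+        for det in results.detections:
+            self.get_logger().info(f"{det.label} ({det.confidence:.2f}) id={det.track_id}")
+
+
+def main() -> None:
+    rclpy.init()
+    rclpy.spin(DetectionConsumer())
+
+
+if __name__ == "__main__":
+    main()
+```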
diff --git a/detect_anything/config/objects.yaml b/detect_anything/config/objects.yaml
new file mode 100644
index 0000000..371f186
--- /dev/null
+++ b/detect_anything/config/objects.yaml
@@ -0,0 +1,46 @@
+# Simple list of object names for prompting YOLO-E
+- chair
+- desk
+- tv_monitor
+- sofa
+- unknown
+- printer
+- coffee machine
+- refrigerator
+- trash can
+- shoe
+- sink
+- table
+- oven
+- bed
+- painting
+- bulletin board
+- plant
+- vase
+- cabinet
+- shelf
+- book
+- cup
+- sculpture
+- keyboard
+- mouse
+- clock
+- phone
+- toilet
+- bathtub
+- microwave oven
+- pan
+- suitcase
+- light
+- curtain
+- whiteboard
+- shower knob
+- bottle
+- water dispenser
+- vending machine
+- laptop
+- bag
+- locker
+- picture
+- cardboard
+- extinguisher
diff --git a/detect_anything/detect_anything/__init__.py b/detect_anything/detect_anything/__init__.py
new file mode 100644
index 0000000..9c61aab
--- /dev/null
+++ b/detect_anything/detect_anything/__init__.py
@@ -0,0 +1 @@
+"""detect_anything package."""
diff --git a/detect_anything/detect_anything/detection.py b/detect_anything/detect_anything/detection.py
new file mode 100644
index 0000000..1d64d76
--- /dev/null
+++ b/detect_anything/detect_anything/detection.py
@@ -0,0 +1,256 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import List, Optional, Sequence
+
+from cv_bridge import CvBridge
+import cv2
+import numpy as np
+
+from detect_anything.msg import DetectionResult
+
+_BRIDGE = CvBridge()
+
+
+@dataclass
+class SingleDetectionResult:
+ """Simple container for a YOLO/Ultralytics detection with a segmentation mask."""
+
+ bbox: tuple[float, float, float, float]
+ track_id: int
+ class_id: int
+ label: str
+ confidence: float
+ mask: np.ndarray
+
+ def area(self) -> float:
+ x1, y1, x2, y2 = self.bbox
+ return max(0.0, (x2 - x1)) * max(0.0, (y2 - y1))
+
+
+@dataclass
+class DetectionResults:
+ """Container for detections plus the source image."""
+
+ image: np.ndarray
+ detections: List[SingleDetectionResult]
+
+ def overlay(self, alpha: float = 0.4, thickness: int = 2) -> np.ndarray:
+ """Draw bboxes and masks on the image."""
+ output = self.image.copy()
+ for det in self.detections:
+ rng = np.random.default_rng(det.track_id)
+ color = tuple(int(c) for c in rng.integers(50, 255, size=3))
+
+ if det.mask.size:
+ mask_bool = det.mask.astype(bool)
+ if mask_bool.shape[:2] == output.shape[:2]:
+ colored_mask = np.zeros_like(output)
+ colored_mask[mask_bool] = color
+ output[mask_bool] = cv2.addWeighted(
+ output[mask_bool], 1 - alpha, colored_mask[mask_bool], alpha, 0
+ )
+ contours, _ = cv2.findContours(det.mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+ cv2.drawContours(output, contours, -1, color, thickness)
+
+ x1, y1, x2, y2 = map(int, det.bbox)
+ cv2.rectangle(output, (x1, y1), (x2, y2), color, thickness)
+
+ label_text = f"{det.label} {det.confidence:.2f}"
+ label_text = f"{label_text} | id {det.track_id}"
+
+ (text_w, text_h), baseline = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
+ cv2.rectangle(
+ output,
+ (x1, max(0, y1 - text_h - baseline - 2)),
+ (x1 + text_w, max(0, y1)),
+ color,
+ -1,
+ )
+ cv2.putText(
+ output,
+ label_text,
+ (x1, max(0, y1 - baseline - 2)),
+ cv2.FONT_HERSHEY_SIMPLEX,
+ 0.5,
+ (255, 255, 255),
+ 1,
+ cv2.LINE_AA,
+ )
+ return output
+
+ def to_msg(self, header) -> DetectionResult:
+ """
+ Convert detections into a DetectionResult ROS message.
+ """
+ msg = DetectionResult()
+ msg.header = header
+ msg.image = _BRIDGE.cv2_to_compressed_imgmsg(self.image, dst_format="jpeg")
+ msg.image.header = header
+
+ for det in self.detections:
+ msg.track_id.append(int(det.track_id))
+ msg.x1.append(float(det.bbox[0]))
+ msg.y1.append(float(det.bbox[1]))
+ msg.x2.append(float(det.bbox[2]))
+ msg.y2.append(float(det.bbox[3]))
+ msg.label.append(str(det.label))
+ msg.confidence.append(float(det.confidence))
+
+ mask = det.mask
+ # Crop mask to bbox and align origin to bbox top-left for transport.
+ x1, y1, x2, y2 = [int(v) for v in det.bbox]
+ h, w = self.image.shape[:2]
+ x1 = max(0, min(x1, w))
+ x2 = max(0, min(x2, w))
+ y1 = max(0, min(y1, h))
+ y2 = max(0, min(y2, h))
+ crop_h = max(1, y2 - y1)
+ crop_w = max(1, x2 - x1)
+ if mask.size == 0:
+ mask = np.zeros((crop_h, crop_w), dtype=np.uint8)
+ else:
+ mask = mask[y1:y2, x1:x2]
+ if mask.shape != (crop_h, crop_w):
+ mask = cv2.resize(
+ mask.astype(np.uint8),
+ (crop_w, crop_h),
+ interpolation=cv2.INTER_NEAREST,
+ )
+
+ mask_msg = _BRIDGE.cv2_to_compressed_imgmsg(mask, dst_format="png")
+ mask_msg.header = header
+ msg.masks.append(mask_msg)
+
+ return msg
+
+ @classmethod
+ def from_msg(cls, msg: DetectionResult) -> DetectionResults:
+ """
+ Construct DetectionResults from a DetectionResult ROS message.
+ Masks are converted back to full-size overlays at bbox locations.
+ """
+ image = _BRIDGE.compressed_imgmsg_to_cv2(msg.image)
+ detections: List[SingleDetectionResult] = []
+
+ for i in range(len(msg.track_id)):
+ bbox = (
+ float(msg.x1[i]),
+ float(msg.y1[i]),
+ float(msg.x2[i]),
+ float(msg.y2[i]),
+ )
+ mask_img = _BRIDGE.compressed_imgmsg_to_cv2(msg.masks[i], desired_encoding="mono8")
+
+ full_mask = np.zeros(image.shape[:2], dtype=np.uint8)
+ x1, y1, x2, y2 = [int(v) for v in bbox]
+ h, w = image.shape[:2]
+ x1 = max(0, min(x1, w))
+ x2 = max(0, min(x2, w))
+ y1 = max(0, min(y1, h))
+ y2 = max(0, min(y2, h))
+ crop_h = max(1, y2 - y1)
+ crop_w = max(1, x2 - x1)
+ if mask_img.shape[:2] != (crop_h, crop_w):
+ mask_img = cv2.resize(mask_img, (crop_w, crop_h), interpolation=cv2.INTER_NEAREST)
+ full_mask[y1:y2, x1:x2] = mask_img
+
+ detections.append(
+ SingleDetectionResult(
+ bbox=bbox,
+ track_id=int(msg.track_id[i]),
+ class_id=0, # class_id is not transmitted in DetectionResult
+ label=str(msg.label[i]),
+ confidence=float(msg.confidence[i]),
+ mask=full_mask,
+ )
+ )
+
+ return cls(image=image, detections=detections)
+
+
+def _from_ultralytics_result(
+ result,
+ idx: int,
+ image_shape: Optional[tuple[int, int]] = None,
+ names: Optional[Sequence[str]] = None,
+) -> SingleDetectionResult:
+ """Convert a single Ultralytics Results entry into a SingleDetectionResult."""
+ from ultralytics.engine.results import Results # local import
+
+ if not isinstance(result, Results):
+ raise TypeError("result must be an ultralytics Results object")
+ if result.boxes is None:
+ raise ValueError("Result has no boxes")
+
+ bbox_array = result.boxes.xyxy[idx].cpu().numpy()
+ bbox = (
+ float(bbox_array[0]),
+ float(bbox_array[1]),
+ float(bbox_array[2]),
+ float(bbox_array[3]),
+ )
+
+ confidence = float(result.boxes.conf[idx].cpu())
+ class_id = int(result.boxes.cls[idx].cpu())
+
+ resolved_names = names
+ if resolved_names is None and hasattr(result, "names"):
+ resolved_names = result.names # type: ignore[assignment]
+
+ label = f"class_{class_id}"
+ if resolved_names is not None:
+ if isinstance(resolved_names, dict):
+ label = resolved_names.get(class_id, label)
+ elif isinstance(resolved_names, (list, tuple)) and class_id < len(resolved_names):
+ label = resolved_names[class_id]
+
+ track_id = idx
+ if hasattr(result.boxes, "id") and result.boxes.id is not None:
+ track_id = int(result.boxes.id[idx].cpu())
+
+ target_h, target_w = (image_shape or (None, None))
+ if target_h is None or target_w is None:
+ if hasattr(result, "orig_shape") and result.orig_shape:
+ target_h, target_w = result.orig_shape[:2]
+
+ mask = np.zeros((target_h, target_w), dtype=np.uint8) if target_h and target_w else np.zeros((0, 0), dtype=np.uint8)
+ if result.masks is not None and idx < len(result.masks.data):
+ mask_tensor = result.masks.data[idx]
+ mask_np = mask_tensor.cpu().numpy()
+
+ if mask_np.ndim == 3:
+ mask_np = mask_np.squeeze()
+
+ if target_h and target_w and mask_np.shape != (target_h, target_w):
+ mask_np = cv2.resize(mask_np.astype(np.float32), (target_w, target_h), interpolation=cv2.INTER_LINEAR)
+
+ mask = (mask_np > 0.5).astype(np.uint8) * 255 # type: ignore[assignment]
+
+ return SingleDetectionResult(
+ bbox=bbox,
+ track_id=track_id,
+ class_id=class_id,
+ label=label,
+ confidence=confidence,
+ mask=mask,
+ )
+
+
+def from_ultralytics_results(
+ results,
+ image_shape: Optional[tuple[int, int]] = None,
+ names: Optional[Sequence[str]] = None,
+) -> List[SingleDetectionResult]:
+ """Convert a list of Ultralytics Results objects into detections with masks."""
+ detections: List[SingleDetectionResult] = []
+ for result in results:
+ if result.boxes is None:
+ continue
+ num_detections = len(result.boxes.xyxy)
+ for i in range(num_detections):
+ detections.append(_from_ultralytics_result(result, i, image_shape=image_shape, names=names))
+ return detections
+
+
+__all__ = ["DetectionResults", "SingleDetectionResult", "from_ultralytics_results"]
diff --git a/detect_anything/detect_anything/detection_node.py b/detect_anything/detect_anything/detection_node.py
new file mode 100644
index 0000000..7aa0870
--- /dev/null
+++ b/detect_anything/detect_anything/detection_node.py
@@ -0,0 +1,177 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import List, Optional
+
+import yaml
+import rclpy
+from cv_bridge import CvBridge
+from ament_index_python.packages import get_package_share_directory
+from sensor_msgs.msg import CompressedImage, Image
+from rclpy.node import Node
+from rclpy.qos import DurabilityPolicy, QoSProfile, ReliabilityPolicy
+
+from detect_anything.detection import DetectionResults
+from detect_anything.yoloe import Yoloe2DDetector, YoloePromptMode
+from detect_anything.msg import DetectionResult
+
+
+class DetectionNode(Node):
+ """ROS2 node that runs YOLO-E detection and publishes DetectionResult."""
+
+ def __init__(self) -> None:
+ super().__init__("detect_anything_detection_node")
+ self.bridge = CvBridge()
+
+ # Parameters
+ defaults = self._default_params()
+ self.declare_parameter("image_topic", defaults["image_topic"])
+ self.declare_parameter("compressed_image_topic", defaults["compressed_image_topic"])
+ self.declare_parameter("use_compressed", defaults["use_compressed"])
+ self.declare_parameter("model_path", defaults["model_path"])
+ self.declare_parameter("model_name", defaults["model_name"])
+ self.declare_parameter("device", defaults["device"])
+ self.declare_parameter("prompt_mode", defaults["prompt_mode"])
+ self.declare_parameter("objects_file", defaults["objects_file"])
+ self.declare_parameter("conf", defaults["conf"])
+ self.declare_parameter("max_area_ratio", defaults["max_area_ratio"])
+
+ params = self._default_params()
+ image_topic = str(self.get_parameter("image_topic").value)
+ compressed_image_topic = str(self.get_parameter("compressed_image_topic").value)
+ self.use_compressed = bool(self.get_parameter("use_compressed").value)
+
+ base_share = Path(get_package_share_directory("detect_anything"))
+ model_path_val = self.get_parameter("model_path").value
+ model_path = str(model_path_val) if model_path_val not in (None, "") else params["model_path"]
+ if str(model_path).lower() == "none":
+ model_path = params["model_path"]
+ model_path = Path(model_path).expanduser()
+ if not model_path.is_absolute():
+ model_path = (base_share / model_path).resolve()
+
+ model_name = str(self.get_parameter("model_name").value or params["model_name"])
+ device = str(self.get_parameter("device").value or params["device"])
+ prompt_mode_str = str(self.get_parameter("prompt_mode").value or params["prompt_mode"]).lower()
+ objects_file = Path(str(self.get_parameter("objects_file").value or params["objects_file"]))
+ if not objects_file.is_absolute():
+ objects_file = (base_share / objects_file).resolve()
+ conf = float(self.get_parameter("conf").value)
+ max_area_ratio = float(self.get_parameter("max_area_ratio").value)
+
+ prompt_mode = YoloePromptMode(prompt_mode_str)
+
+ # Auto-pick the right default weight if user didn't override
+ if prompt_mode == YoloePromptMode.LRPC and ("seg.pt" in model_name and "seg-pf.pt" not in model_name):
+ model_name = "yoloe-11l-seg-pf.pt"
+ self.get_logger().info(f"Prompt-free mode: switching model_name to {model_name}")
+ if prompt_mode == YoloePromptMode.PROMPT and "seg-pf.pt" in model_name:
+ model_name = "yoloe-11l-seg.pt"
+ self.get_logger().info(f"Prompt mode: switching model_name to {model_name}")
+
+ self.detector = Yoloe2DDetector(
+ model_path=str(model_path),
+ model_name=model_name,
+ device=device or None,
+ prompt_mode=prompt_mode,
+ max_area_ratio=max_area_ratio,
+ conf=conf,
+ )
+
+ prompts = self._load_prompts(objects_file)
+ if prompts and prompt_mode == YoloePromptMode.PROMPT:
+ self.detector.set_prompts(text=prompts)
+ self.get_logger().info(f"Loaded {len(prompts)} prompts from {objects_file}")
+
+ qos = QoSProfile(
+ depth=10,
+ reliability=ReliabilityPolicy.RELIABLE,
+ durability=DurabilityPolicy.VOLATILE,
+ )
+ if self.use_compressed:
+ self.create_subscription(
+ CompressedImage,
+ compressed_image_topic,
+ self._compressed_image_callback,
+ qos,
+ )
+ else:
+ self.create_subscription(
+ Image,
+ image_topic,
+ self._image_callback,
+ qos,
+ )
+
+ self.detection_pub = self.create_publisher(DetectionResult, "/detection_result", qos)
+ self.overlay_pub = self.create_publisher(Image, "/annotated_image_detection", qos)
+
+ self.get_logger().info("detect_anything detection node started")
+
+ def _default_params(self) -> dict:
+ base_share = Path(get_package_share_directory("detect_anything"))
+ model_dir = Path.home() / ".ultralytics" / "weights"
+ return {
+ "image_topic": "/camera/image",
+ "compressed_image_topic": "/camera/image/compressed",
+ "use_compressed": True,
+ "model_path": str(model_dir),
+ "model_name": "yoloe-11l-seg.pt",
+ "device": "cuda",
+ "prompt_mode": YoloePromptMode.PROMPT.value,
+ "objects_file": str(base_share / "config" / "objects.yaml"),
+ "conf": 0.25,
+ "max_area_ratio": 0.3,
+ }
+
+ def _load_prompts(self, objects_file: Path) -> List[str]:
+ if not objects_file.exists():
+ return []
+ try:
+ with open(objects_file, "r", encoding="utf-8") as f:
+ data = yaml.safe_load(f) or []
+ if isinstance(data, list):
+ return [str(p) for p in data]
+ except Exception as exc:
+ self.get_logger().warn(f"Failed to read prompts from {objects_file}: {exc}")
+ return []
+
+ def _image_callback(self, msg: Image) -> None:
+ cv_image = self.bridge.imgmsg_to_cv2(msg, desired_encoding="bgr8")
+ self._process_image(cv_image, msg.header)
+
+ def _compressed_image_callback(self, msg: CompressedImage) -> None:
+ cv_image = self.bridge.compressed_imgmsg_to_cv2(msg, desired_encoding="bgr8")
+ self._process_image(cv_image, msg.header)
+
+ def _process_image(self, cv_image, header) -> None:
+ detections: DetectionResults = self.detector.process_image(cv_image)
+
+ detection_msg = detections.to_msg(header)
+ self.detection_pub.publish(detection_msg)
+
+ overlay = detections.overlay()
+ overlay_msg = self.bridge.cv2_to_imgmsg(overlay, encoding="bgr8")
+ overlay_msg.header = header
+ self.overlay_pub.publish(overlay_msg)
+
+ def destroy_node(self) -> bool:
+ if hasattr(self, "detector") and self.detector:
+ self.detector.stop()
+ return super().destroy_node()
+
+
+def main(args: Optional[list[str]] = None) -> None:
+ rclpy.init(args=args)
+ node = DetectionNode()
+ try:
+ rclpy.spin(node)
+ except KeyboardInterrupt:
+ pass
+ finally:
+ node.destroy_node()
+ rclpy.shutdown()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/detect_anything/detect_anything/yoloe.py b/detect_anything/detect_anything/yoloe.py
new file mode 100644
index 0000000..056387a
--- /dev/null
+++ b/detect_anything/detect_anything/yoloe.py
@@ -0,0 +1,157 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+import threading
+import zipfile
+from typing import Any, List, Optional, Sequence
+
+import numpy as np
+import torch
+from ultralytics import YOLOE # type: ignore[attr-defined, import-not-found]
+from ultralytics.utils import SETTINGS
+from ultralytics.utils.downloads import attempt_download_asset
+
+from detect_anything.detection import (
+ DetectionResults,
+ SingleDetectionResult,
+ from_ultralytics_results,
+)
+
+
+class YoloePromptMode(Enum):
+ """YOLO-E prompt modes."""
+
+ LRPC = "lrpc"
+ PROMPT = "prompt"
+
+
+class Yoloe2DDetector:
+ """Thin wrapper around YOLO-E for 2D detection + tracking with masks."""
+
+ def __init__(
+ self,
+ model_path: str | None = None,
+ model_name: str | None = None,
+ device: str | None = None,
+ prompt_mode: YoloePromptMode = YoloePromptMode.LRPC,
+ max_area_ratio: Optional[float] = 0.3,
+ conf: float = 0.6,
+ iou: float = 0.6,
+ ) -> None:
+ if model_name is None:
+ model_name = "yoloe-11s-seg-pf.pt" if prompt_mode == YoloePromptMode.LRPC else "yoloe-11s-seg.pt"
+
+ model_dir = Path(model_path) if model_path else Path.home() / ".ultralytics" / "weights"
+ model_dir = model_dir.expanduser().resolve()
+ model_dir.mkdir(parents=True, exist_ok=True)
+ SETTINGS["weights_dir"] = str(model_dir)
+ weights_path = attempt_download_asset(model_dir / model_name)
+ if prompt_mode == YoloePromptMode.PROMPT:
+ self._ensure_mobileclip(model_dir)
+ self.model = YOLOE(weights_path)
+ self.prompt_mode = prompt_mode
+ self._visual_prompts: dict[str, np.ndarray] | None = None
+ self.max_area_ratio = max_area_ratio
+ self._lock = threading.Lock()
+ self.conf = conf
+ self.iou = iou
+ self._names_override: Sequence[str] | None = None
+
+ if prompt_mode == YoloePromptMode.PROMPT:
+ self.set_prompts(text=["nothing"])
+
+ if self.max_area_ratio is not None and not (0.0 < self.max_area_ratio <= 1.0):
+ raise ValueError("max_area_ratio must be in the range (0, 1].")
+
+ if device:
+ self.device = device
+ elif torch.cuda.is_available():
+ self.device = "cuda"
+ else:
+ self.device = "cpu"
+
+ def set_prompts(
+ self,
+ text: Optional[List[str]] = None,
+ bboxes: Optional[np.ndarray] = None,
+ ) -> None:
+ """Set prompts for detection. Provide either text or bboxes, not both."""
+ if self.prompt_mode == YoloePromptMode.LRPC:
+ # Prompt-free model cannot accept text/visual prompts.
+ return
+ if text is not None and bboxes is not None:
+ raise ValueError("Provide either text or bboxes, not both.")
+ if text is None and bboxes is None:
+ raise ValueError("Must provide either text or bboxes.")
+
+ with self._lock:
+ self.model.predictor = None
+ if text is not None:
+ self.model.set_classes(text, self.model.get_text_pe(text)) # type: ignore[no-untyped-call]
+ self._visual_prompts = None
+ self._names_override = list(text)
+ else:
+ cls = np.arange(len(bboxes), dtype=np.int16) # type: ignore[arg-type]
+ self._visual_prompts = {"bboxes": bboxes, "cls": cls} # type: ignore[dict-item]
+ self._names_override = None
+
+ def process_image(self, image: np.ndarray) -> DetectionResults:
+ """Process an image and return detection results."""
+ track_kwargs: dict[str, Any] = {
+ "source": image,
+ "device": self.device,
+ "conf": self.conf,
+ "iou": self.iou,
+ "persist": True,
+ "verbose": False,
+ }
+
+ with self._lock:
+ if self._visual_prompts is not None:
+ track_kwargs["visual_prompts"] = self._visual_prompts
+
+ results = self.model.track(**track_kwargs) # type: ignore[arg-type]
+
+ detections = from_ultralytics_results(results, image_shape=image.shape[:2], names=self._names_override)
+ filtered = self._apply_filters(image, detections)
+ return DetectionResults(image=image, detections=filtered)
+
+ def _apply_filters(
+ self,
+ image: np.ndarray,
+ detections: List[SingleDetectionResult],
+ ) -> List[SingleDetectionResult]:
+ if self.max_area_ratio is None:
+ return detections
+
+ image_area = float(image.shape[0] * image.shape[1])
+ if image_area <= 0:
+ return detections
+
+ return [det for det in detections if (det.area() / image_area) <= self.max_area_ratio]
+
+ def stop(self) -> None:
+ """Clean up resources used by the detector."""
+ if hasattr(self.model, "predictor") and self.model.predictor is not None:
+ predictor = self.model.predictor
+ if hasattr(predictor, "trackers") and predictor.trackers:
+ for tracker in predictor.trackers:
+ if hasattr(tracker, "tracker") and hasattr(tracker.tracker, "gmc"):
+ gmc = tracker.tracker.gmc
+ if hasattr(gmc, "executor") and gmc.executor is not None:
+ gmc.executor.shutdown(wait=True)
+ self.model.predictor = None
+
+ @staticmethod
+ def _ensure_mobileclip(model_dir: Path) -> None:
+ target = model_dir / "mobileclip_blt.ts"
+ if target.exists() and zipfile.is_zipfile(target):
+ return
+ if target.exists() or target.is_symlink():
+ target.unlink()
+ attempt_download_asset(target)
+
+
+__all__ = ["Yoloe2DDetector", "YoloePromptMode"]
diff --git a/detect_anything/env-hooks/venv_pythonpath.sh.in b/detect_anything/env-hooks/venv_pythonpath.sh.in
new file mode 100644
index 0000000..9212bac
--- /dev/null
+++ b/detect_anything/env-hooks/venv_pythonpath.sh.in
@@ -0,0 +1 @@
+ament_prepend_unique_value PYTHONPATH "@PYTHON_INSTALL_DIR@"
diff --git a/detect_anything/msg/DetectionResult.msg b/detect_anything/msg/DetectionResult.msg
new file mode 100644
index 0000000..766d5f1
--- /dev/null
+++ b/detect_anything/msg/DetectionResult.msg
@@ -0,0 +1,10 @@
+std_msgs/Header header
+int32[] track_id
+float32[] x1
+float32[] y1
+float32[] x2
+float32[] y2
+string[] label
+float32[] confidence
+sensor_msgs/CompressedImage image
+sensor_msgs/CompressedImage[] masks # cropped to bbox; aligned to bbox top-left; encoding: mono8
diff --git a/detect_anything/package.xml b/detect_anything/package.xml
new file mode 100644
index 0000000..a7699ae
--- /dev/null
+++ b/detect_anything/package.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0"?>
+<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
+<package format="3">
+  <name>detect_anything</name>
+  <version>0.0.1</version>
+  <description>YOLO-E detection node and utilities.</description>
+  <maintainer>Alex Lin</maintainer>
+  <license>MIT</license>
+
+  <buildtool_depend>ament_cmake</buildtool_depend>
+
+  <buildtool_depend>rosidl_default_generators</buildtool_depend>
+  <exec_depend>rosidl_default_runtime</exec_depend>
+  <member_of_group>rosidl_interface_packages</member_of_group>
+
+  <depend>rclpy</depend>
+  <depend>std_msgs</depend>
+  <depend>sensor_msgs</depend>
+  <depend>vision_msgs</depend>
+  <depend>cv_bridge</depend>
+  <depend>vector_perception_utils</depend>
+
+  <export>
+    <build_type>ament_cmake</build_type>
+  </export>
+</package>
diff --git a/detect_anything/resource/detect_anything b/detect_anything/resource/detect_anything
new file mode 100644
index 0000000..eac574f
--- /dev/null
+++ b/detect_anything/resource/detect_anything
@@ -0,0 +1 @@
+detect_anything
diff --git a/detect_anything/scripts/detection_node b/detect_anything/scripts/detection_node
new file mode 100755
index 0000000..822cacf
--- /dev/null
+++ b/detect_anything/scripts/detection_node
@@ -0,0 +1,5 @@
+#!/usr/bin/env python3
+from detect_anything.detection_node import main
+
+if __name__ == "__main__":
+ main()
diff --git a/requirements.txt b/requirements.txt
index 7b6c110..573b311 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,16 +1,21 @@
# basic dependencies
numpy==1.26.4
opencv-python
+colcon-common-extensions
scipy
pillow
tqdm
+pyyaml
+lark
+pytest
+moondream
+openai
# 3D processing
open3d
-# deep learning
-torch<=2.4.0
-torchvision<=1.19.0
+# vision
+ultralytics
# SAM2 and EdgeTAM dependencies
sam2
diff --git a/rviz/semantic_mapping.rviz b/rviz/semantic_mapping.rviz
new file mode 100644
index 0000000..86a2272
--- /dev/null
+++ b/rviz/semantic_mapping.rviz
@@ -0,0 +1,415 @@
+Panels:
+ - Class: rviz_common/Displays
+ Help Height: 0
+ Name: Displays
+ Property Tree Widget:
+ Expanded:
+ - /Global Options1
+ - /Status1
+ - /RoomSegmentation1/room_boundaries1/Topic1
+ - /RoomSegmentation1/room_type_viz1
+ Splitter Ratio: 0.5
+ Tree Height: 715
+ - Class: rviz_common/Selection
+ Name: Selection
+ - Class: rviz_common/Tool Properties
+ Expanded:
+ - /2D Goal Pose1
+ - /Publish Point1
+ Name: Tool Properties
+ Splitter Ratio: 0.5886790156364441
+ - Class: rviz_common/Views
+ Expanded:
+ - /Current View1
+ Name: Views
+ Splitter Ratio: 0.5
+ - Class: rviz_common/Time
+ Experimental: false
+ Name: Time
+ SyncMode: 0
+ SyncSource: PointCloud2
+Visualization Manager:
+ Class: ""
+ Displays:
+ - Alpha: 0.5
+ Cell Size: 1
+ Class: rviz_default_plugins/Grid
+ Color: 160; 160; 164
+ Enabled: true
+ Line Style:
+ Line Width: 0.029999999329447746
+ Value: Lines
+ Name: Grid
+ Normal Cell Count: 0
+ Offset:
+ X: 0
+ Y: 0
+ Z: 0
+ Plane: XY
+ Plane Cell Count: 10
+ Reference Frame:
+ Value: true
+ - Class: rviz_default_plugins/Image
+ Enabled: true
+ Max Value: 1
+ Median window: 5
+ Min Value: 0
+ Name: Image
+ Normalize Range: true
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /annotated_image_detection
+ Value: true
+ - Angle Tolerance: 0.10000000149011612
+ Class: rviz_default_plugins/Odometry
+ Covariance:
+ Orientation:
+ Alpha: 0.5
+ Color: 255; 255; 127
+ Color Style: Unique
+ Frame: Local
+ Offset: 1
+ Scale: 1
+ Value: true
+ Position:
+ Alpha: 0.30000001192092896
+ Color: 204; 51; 204
+ Scale: 1
+ Value: true
+ Value: false
+ Enabled: true
+ Keep: 5
+ Name: Odometry
+ Position Tolerance: 0.10000000149011612
+ Shape:
+ Alpha: 1
+ Axes Length: 1
+ Axes Radius: 0.10000000149011612
+ Color: 255; 25; 0
+ Head Length: 0.30000001192092896
+ Head Radius: 0.10000000149011612
+ Shaft Length: 1
+ Shaft Radius: 0.05000000074505806
+ Value: Arrow
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ Filter size: 10
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /state_estimation
+ Value: true
+ - Alpha: 0.05000000074505806
+ Autocompute Intensity Bounds: false
+ Autocompute Value Bounds:
+ Max Value: 10
+ Min Value: -10
+ Value: true
+ Axis: Z
+ Channel Name: intensity
+ Class: rviz_default_plugins/PointCloud2
+ Color: 255; 255; 255
+ Color Transformer: FlatColor
+ Decay Time: 5
+ Enabled: true
+ Invert Rainbow: false
+ Max Color: 255; 255; 255
+ Max Intensity: 161
+ Min Color: 0; 0; 0
+ Min Intensity: 0
+ Name: PointCloud2
+ Position Transformer: XYZ
+ Selectable: false
+ Size (Pixels): 3
+ Size (m): 0.009999999776482582
+ Style: Flat Squares
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /registered_scan
+ Use Fixed Frame: false
+ Use rainbow: true
+ Value: true
+ - Class: rviz_common/Group
+ Displays:
+ - Alpha: 1
+ Autocompute Intensity Bounds: true
+ Autocompute Value Bounds:
+ Max Value: 0.7398463487625122
+ Min Value: 0.7398463487625122
+ Value: true
+ Axis: Z
+ Channel Name: intensity
+ Class: rviz_default_plugins/PointCloud2
+ Color: 224; 27; 36
+ Color Transformer: AxisColor
+ Decay Time: 0
+ Enabled: true
+ Invert Rainbow: false
+ Max Color: 255; 255; 255
+ Max Intensity: 4096
+ Min Color: 0; 0; 0
+ Min Intensity: 0
+ Name: door_cloud
+ Position Transformer: XYZ
+ Selectable: true
+ Size (Pixels): 3
+ Size (m): 0.10000000149011612
+ Style: Flat Squares
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /door_cloud
+ Use Fixed Frame: true
+ Use rainbow: true
+ Value: true
+ - Class: rviz_default_plugins/MarkerArray
+ Enabled: true
+ Name: room_viewpoints
+ Namespaces:
+ viewpoint_room_ids: true
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /viewpoint_room_ids
+ Value: true
+ - Class: rviz_default_plugins/MarkerArray
+ Enabled: true
+ Name: room_boundaries
+ Namespaces:
+ polygon_group: true
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /room_boundaries
+ Value: true
+ - Class: rviz_default_plugins/Marker
+ Enabled: true
+ Name: chosen_room
+ Namespaces:
+ {}
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ Filter size: 10
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /chosen_room_boundary
+ Value: true
+ - Alpha: 0.800000011920929
+ Autocompute Intensity Bounds: true
+ Autocompute Value Bounds:
+ Max Value: 10
+ Min Value: -10
+ Value: true
+ Axis: Z
+ Channel Name: intensity
+ Class: rviz_default_plugins/PointCloud2
+ Color: 255; 255; 255
+ Color Transformer: RGB8
+ Decay Time: 0
+ Enabled: true
+ Invert Rainbow: false
+ Max Color: 255; 255; 255
+ Max Intensity: 4096
+ Min Color: 0; 0; 0
+ Min Intensity: 0
+ Name: PointCloud2
+ Position Transformer: XYZ
+ Selectable: true
+ Size (Pixels): 5
+ Size (m): 0.019999999552965164
+ Style: Flat Squares
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /room_map_cloud
+ Use Fixed Frame: true
+ Use rainbow: true
+ Value: true
+ - Class: rviz_default_plugins/MarkerArray
+ Enabled: true
+ Name: room_type_viz
+ Namespaces:
+ room_type: true
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /room_type_vis
+ Value: true
+ Enabled: true
+ Name: RoomSegmentation
+ - Class: rviz_common/Group
+ Displays:
+ - Alpha: 1
+ Autocompute Intensity Bounds: true
+ Autocompute Value Bounds:
+ Max Value: 10
+ Min Value: -10
+ Value: true
+ Axis: Z
+ Channel Name: intensity
+ Class: rviz_default_plugins/PointCloud2
+ Color: 255; 255; 255
+ Color Transformer: RGB8
+ Decay Time: 0
+ Enabled: true
+ Invert Rainbow: false
+ Max Color: 255; 255; 255
+ Max Intensity: 4096
+ Min Color: 0; 0; 0
+ Min Intensity: 0
+ Name: object_points
+ Position Transformer: XYZ
+ Selectable: false
+ Size (Pixels): 3
+ Size (m): 0.009999999776482582
+ Style: Flat Squares
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /obj_points
+ Use Fixed Frame: true
+ Use rainbow: true
+ Value: true
+ - Class: rviz_default_plugins/MarkerArray
+ Enabled: true
+ Name: object_bboxes
+ Namespaces:
+ bbox: true
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /obj_boxes
+ Value: true
+ - Class: rviz_default_plugins/MarkerArray
+ Enabled: true
+ Name: object_labels
+ Namespaces:
+ labels: true
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /obj_labels
+ Value: true
+ - Class: rviz_default_plugins/MarkerArray
+ Enabled: false
+ Name: object_visibility
+ Namespaces:
+ {}
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /object_visibility_connections
+ Value: false
+ Enabled: true
+ Name: object_mapping
+ Enabled: true
+ Global Options:
+ Background Color: 48; 48; 48
+ Fixed Frame: map
+ Frame Rate: 30
+ Name: root
+ Tools:
+ - Class: rviz_default_plugins/Interact
+ Hide Inactive Objects: true
+ - Class: rviz_default_plugins/MoveCamera
+ - Class: rviz_default_plugins/Select
+ - Class: rviz_default_plugins/FocusCamera
+ - Class: rviz_default_plugins/Measure
+ Line color: 128; 128; 0
+ - Class: rviz_default_plugins/SetInitialPose
+ Covariance x: 0.25
+ Covariance y: 0.25
+ Covariance yaw: 0.06853891909122467
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /initialpose
+ - Class: rviz_default_plugins/SetGoal
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /goal_pose
+ - Class: rviz_default_plugins/PublishPoint
+ Single click: true
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /clicked_point
+ Transformation:
+ Current:
+ Class: rviz_default_plugins/TF
+ Value: true
+ Views:
+ Current:
+ Class: rviz_default_plugins/Orbit
+ Distance: 37.76976013183594
+ Enable Stereo Rendering:
+ Stereo Eye Separation: 0.05999999865889549
+ Stereo Focal Distance: 1
+ Swap Stereo Eyes: false
+ Value: false
+ Focal Point:
+ X: 4.479973316192627
+ Y: 11.129703521728516
+ Z: -8.338903427124023
+ Focal Shape Fixed Size: false
+ Focal Shape Size: 0.05000000074505806
+ Invert Z Axis: false
+ Name: Current View
+ Near Clip Distance: 0.009999999776482582
+ Pitch: 0.6847962737083435
+ Target Frame: map
+ Value: Orbit (rviz_default_plugins)
+ Yaw: 3.7372934818267822
+ Saved: ~
+Window Geometry:
+ Displays:
+ collapsed: false
+ Height: 2197
+ Hide Left Dock: false
+ Hide Right Dock: true
+ Image:
+ collapsed: false
+ QMainWindow State: 000000ff00000000fd00000004000000000000070c0000078bfc020000000bfb0000001200530065006c0065006300740069006f006e00000001e10000009b000000b200fffffffb0000001e0054006f006f006c002000500072006f007000650072007400690065007302000001ed000001df00000185000000a3fb000000120056006900650077007300200054006f006f02000001df000002110000018500000122fb000000200054006f006f006c002000500072006f0070006500720074006900650073003203000002880000011d000002210000017afb000000100044006900730070006c00610079007301000000700000033d0000018600fffffffb0000002000730065006c0065006300740069006f006e00200062007500660066006500720200000138000000aa0000023a00000294fb00000014005700690064006500530074006500720065006f02000000e6000000d2000003ee0000030bfb0000000c004b0069006e0065006300740200000186000001060000030c00000261fb0000000a0049006d00610067006501000002020000007e0000000000000000fb0000000a0049006d00610067006501000003b9000004420000002800fffffffb0000000a0049006d00610067006500000002bb000000c60000000000000000000000010000010f00000342fc0200000003fb0000001e0054006f006f006c002000500072006f00700065007200740069006500730100000041000000780000000000000000fb0000000a00560069006500770073000000003f000003420000013800fffffffb0000001200530065006c0065006300740069006f006e010000025a000000b200000000000000000000000200000490000000a9fc0100000001fb0000000a00560069006500770073030000004e00000080000002e10000019700000003000012140000005efc0100000002fb0000000800540069006d00650100000000000012140000047a00fffffffb0000000800540069006d0065010000000000000450000000000000000000000afc0000078b00000004000000040000000800000008fc0000000100000002000000010000000a0054006f006f006c00730100000000ffffffff0000000000000000
+ Selection:
+ collapsed: false
+ Time:
+ collapsed: false
+ Tool Properties:
+ collapsed: false
+ Views:
+ collapsed: true
+ Width: 4628
+ X: 4656
+ Y: 418
diff --git a/semantic_mapping/CMakeLists.txt b/semantic_mapping/CMakeLists.txt
new file mode 100644
index 0000000..71da435
--- /dev/null
+++ b/semantic_mapping/CMakeLists.txt
@@ -0,0 +1,62 @@
+cmake_minimum_required(VERSION 3.10)
+project(semantic_mapping)
+
+find_package(ament_cmake REQUIRED)
+find_package(rclpy REQUIRED)
+find_package(std_msgs REQUIRED)
+find_package(sensor_msgs REQUIRED)
+find_package(geometry_msgs REQUIRED)
+find_package(nav_msgs REQUIRED)
+find_package(visualization_msgs REQUIRED)
+find_package(cv_bridge REQUIRED)
+find_package(rosidl_default_generators REQUIRED)
+find_package(Python3 REQUIRED COMPONENTS Interpreter)
+
+rosidl_generate_interfaces(${PROJECT_NAME}
+ "msg/DetectionResult.msg"
+ "msg/NavigationQuery.msg"
+ "msg/ObjectNode.msg"
+ "msg/ObjectType.msg"
+ "msg/RoomEarlyStop1.msg"
+ "msg/RoomNode.msg"
+ "msg/RoomNodeList.msg"
+ "msg/RoomType.msg"
+ "msg/TargetObject.msg"
+ "msg/TargetObjectInstruction.msg"
+ "msg/TargetObjectWithSpatial.msg"
+ "msg/VlmAnswer.msg"
+ "msg/ViewpointRep.msg"
+ DEPENDENCIES std_msgs sensor_msgs geometry_msgs
+)
+
+set(PYTHON_INSTALL_DIR "lib/python${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}/site-packages")
+
+install(
+ DIRECTORY semantic_mapping
+ DESTINATION ${PYTHON_INSTALL_DIR}
+)
+
+install(
+ PROGRAMS
+ scripts/semantic_mapping_node
+ scripts/vlm_reasoning_node
+ DESTINATION lib/${PROJECT_NAME}
+)
+
+install(
+ FILES resource/semantic_mapping
+ DESTINATION share/${PROJECT_NAME}/resource
+)
+
+install(
+ DIRECTORY config
+ DESTINATION share/${PROJECT_NAME}
+)
+
+install(
+ DIRECTORY launch
+ DESTINATION share/${PROJECT_NAME}
+)
+
+ament_export_dependencies(rosidl_default_runtime)
+ament_package()
diff --git a/semantic_mapping/README.md b/semantic_mapping/README.md
new file mode 100644
index 0000000..6663558
--- /dev/null
+++ b/semantic_mapping/README.md
@@ -0,0 +1,192 @@
+# Semantic Mapping
+
+ROS2 package for semantic 3D object mapping with VLM integration.
+Includes `vlm_reasoning_node` (Moondream-backed) with unified
+`semantic_mapping` message types across planner/VLM nodes.
+
+## Overview
+
+This package builds a persistent semantic map of objects detected by `detect_anything`, fusing 2D detections with LiDAR point clouds to create 3D object representations. It publishes `ObjectNode` messages compatible with the tare_planner exploration system.
+The pipeline stores voxelized geometry, merges observations, and exports planner‑friendly ROS messages.
+
+## Status: Work in Progress
+
+| Component | File | Status |
+|-----------|------|--------|
+| Messages | `msg/*.msg` | Complete |
+| Data Models | `map_object.py` | Complete |
+| Matching | `utils/matching.py` | Complete |
+| Object Mapper | `mapper.py` | Complete |
+| Cloud Projection | `vector_perception_utils/pointcloud_utils.py` | Complete |
+| Publishers | `utils/map_object_utils.py` | Complete |
+| Image Storage | `utils/storage.py` | Complete |
+| Mapping Node | `mapping_node.py` | Complete |
+| VLM Reasoning Node | `vlm_reasoning_node.py` | Complete |
+| Launch Files | `launch/` | Complete |
+
+## Messages
+
+### ObjectNode.msg
+Main output message for planner integration:
+- `object_id[]`: Track IDs (may contain multiple merged IDs)
+- `label`: Dominant class label
+- `position`: Object centroid in world frame
+- `bbox3d[8]`: 3D bounding box corners
+- `cloud`: Object point cloud
+- `status`: Lifecycle status ("new", "updated", "unchanged")
+- `img_path`: Path to best saved image crop
+- `is_asked_vlm`: VLM query flag
+- `viewpoint_id`: Associated viewpoint
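+
+A minimal consumer sketch for this message (illustrative only; assumes the workspace is built and sourced):
+
+```python
+import rclpy
+from rclpy.node import Node
+
+from semantic_mapping.msg import ObjectNode
+
+
+class ObjectNodeListener(Node):
+    def __init__(self) -> None:
+        super().__init__("object_node_listener")
+        # One ObjectNode message is published per tracked object on /object_nodes.
+        self.create_subscription(ObjectNode, "/object_nodes", self.on_object, 10)
+
+    def on_object(self, msg: ObjectNode) -> None:
+        p = msg.position
+        self.get_logger().info(
+            f"{msg.label} [{msg.status}] at ({p.x:.2f}, {p.y:.2f}, {p.z:.2f}) ids={list(msg.object_id)}"
+        )
+
+
+def main() -> None:
+    rclpy.init()
+    rclpy.spin(ObjectNodeListener())
+
+
+if __name__ == "__main__":
+    main()
+```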
+
+### ObjectType.msg
+VLM query/answer for object relabeling.
+
+### RoomType.msg
+VLM query/answer for room classification.
+
+### TargetObjectInstruction.msg
+User instruction for target object search.
+
+### TargetObjectWithSpatial.msg
+Target object query with spatial/viewpoint context.
+
+### ViewpointRep.msg
+Viewpoint trigger for image saving.
+
+Note: `detect_anything/DetectionResult` masks are cropped to the bbox and aligned to
+the bbox top-left (as documented in `detect_anything/msg/DetectionResult.msg`).
+The mapping node reconstructs full-image masks before projection.
+
+## Architecture
+
+```
+detect_anything/DetectionResult
+ │
+ ▼
+┌───────────────────────────────────────────────┐
+│ semantic_mapping_node │
+│ │
+│ ┌────────────────────┐ ┌──────────────┐ │
+│ │ pointcloud_utils │ │ ObjectMapper │ │
+│ │ (projection + mask)├──▶│ (pending/ │ │
+│ │ │ │ permanent) │ │
+│ └────────────────────┘ └──────────────┘ │
+│ │ │ │
+│ ▼ ▼ │
+│ ┌────────────────┐ ┌──────────────┐ │
+│ │ MapObject │ │ map_object_ │ │
+│ │ (voxels/votes) ├─▶│ utils │ │
+│ └────────────────┘ └──────────────┘ │
+│ │ │ │
+│ ▼ ▼ │
+│ utils/storage.py mapping_node.py │
+└───────────────────────────────────────────────┘
+ │
+ ▼
+/object_nodes, /obj_points, /obj_boxes, /obj_labels
+```
+
+Key components:
+- `map_object.py`: Object state, voxel voting, pose de‑duplication, cached geometry.
+- `utils/matching.py`: Track‑id and spatial matching with IoU/ratio heuristics.
+- `mapper.py`: Pending/permanent tiers, merge rules, and pruning.
+- `vector_perception_utils/pointcloud_utils.py`: Projection + mask segmentation utilities.
+- `utils/map_object_utils.py`: ROS message construction helpers.
+- `utils/storage.py`: Async image save queue.
+- `mapping_node.py`: ROS2 orchestration (subscriptions, sync, publish).
+
+## Nodes
+
+| Node | Purpose |
+|------|---------|
+| `semantic_mapping_node` | Builds the 3D object map and publishes planner object topics. |
+| `vlm_reasoning_node` | Answers room/object queries via Moondream and publishes VLM answers. |
+
+## Dependencies
+
+- `detect_anything`: Detection results with masks
+- `vector_perception_utils`: Point cloud and image utilities
+- ROS2 messages: `sensor_msgs`, `geometry_msgs`, `nav_msgs`, `visualization_msgs`
+
+## Usage
+
+```bash
+# Build
+colcon build --packages-select semantic_mapping
+
+# Run
+ros2 launch semantic_mapping semantic_mapping.launch.py
+```
+
+## Configuration
+
+- `config/params.yaml`: Default node parameters for `semantic_mapping_node`.
+- `config/camera_param.yaml`: Camera parameters + static sensor→camera transform for projection.
+- `config/objects.yaml`: Object taxonomy for detection labels.
+
+Key runtime params:
+- `regularize_voxel_size`: Connectivity voxel size used in shape regularization.
+- `ground_filter`, `ground_radius`, `ground_z_clearance`: Simple ground clamp around robot.
+- `visualization_max_objects`: Limits RViz markers/pointcloud to most recent objects.
+- `object_type_query_mode`: `target_only`, `all`, or `target_or_all` for VLM queries.
+- `object_type_query_max_per_cycle`: Max number of object queries per mapping cycle.
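+
+Defaults live in `config/params.yaml`; individual values can be overridden with a small params file passed via `--ros-args --params-file`. A sketch in the same layout (values below are illustrative, not recommendations):
+
+```yaml
+semantic_mapping_node:
+  ros__parameters:
+    # Connectivity voxel size used in shape regularization (see Handoff Notes).
+    regularize_voxel_size: 0.2
+    # Cap RViz markers/pointcloud to the most recent N objects.
+    visualization_max_objects: 50
+    ground_filter: true
+    ground_radius: 2.5
+```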
+
+## Topics
+
+### Subscriptions
+| Topic | Type | Description |
+|-------|------|-------------|
+| `/detection_result` | `detect_anything/DetectionResult` | 2D detections with masks |
+| `/registered_scan` | `sensor_msgs/PointCloud2` | LiDAR point cloud |
+| `/state_estimation` | `nav_msgs/Odometry` | Robot odometry |
+| `/object_type_answer` | `ObjectType` | VLM relabeling response |
+| `/target_object_instruction` | `TargetObjectInstruction` | Target object search |
+| `/viewpoint_rep_header` | `ViewpointRep` | Viewpoint trigger |
+
+### Publications
+| Topic | Type | Description |
+|-------|------|-------------|
+| `/object_nodes` | `ObjectNode` | Semantic object map |
+| `/obj_points` | `sensor_msgs/PointCloud2` | Colored object point cloud |
+| `/obj_boxes` | `visualization_msgs/MarkerArray` | 3D bounding box wireframes |
+| `/obj_labels` | `visualization_msgs/MarkerArray` | Object label text |
+| `/object_type_query` | `ObjectType` | VLM query for relabeling |
+| `/target_object_spatial_query` | `TargetObjectWithSpatial` | Target object + viewpoints |
+
+Note: planner/VLM nodes now share `semantic_mapping/msg/*` directly (no bridge).
+
+## VLM Reasoning Topics
+
+### Subscriptions
+| Topic | Type | Description |
+|-------|------|-------------|
+| `/object_type_query` | `ObjectType` | Object relabeling query |
+| `/target_object_spatial_query` | `TargetObjectWithSpatial` | Spatial target query |
+| `/room_type_query` | `RoomType` | Room classification query |
+| `/keyboard_input` | `std_msgs/String` | Instruction text |
+
+### Publications
+| Topic | Type | Description |
+|-------|------|-------------|
+| `/object_type_answer` | `ObjectType` | Object relabeling answer |
+| `/target_object_answer` | `TargetObjectWithSpatial` | Spatial target answer |
+| `/target_object_instruction` | `TargetObjectInstruction` | Parsed instruction fields |
+| `/room_type_answer` | `RoomType` | Room classification answer |
+| `/room_type_vis` | `visualization_msgs/MarkerArray` | Room label markers |
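+
+For quick manual testing, an instruction can be injected on the keyboard topic from the command line (instruction text is illustrative):
+
+```bash
+ros2 topic pub --once /keyboard_input std_msgs/msg/String "{data: 'find the refrigerator'}"
+```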
+
+## References
+
+- Full migration plan: [`docs/SEMANTIC_MAPPING_MIGRATION.md`](docs/SEMANTIC_MAPPING_MIGRATION.md)
+- Original implementation: `VLM_ROS/src/semantic_mapping/` (main branch)
+ - SAM2 for segmentation
+ - External Gemini API for VLM reasoning
+- Detection: `detect_anything` package (decoupled)
+- dimos patterns: `dimos/perception/detection/objectDB.py`
+
+## Handoff Notes
+
+- Regularization uses vote-weighted filtering + voxel connected components; adjust
+ `regularize_voxel_size` and the regularization percentile (in `MapObject`) if
+ thin structures are dropped.
+- Mapping node logs summary stats and a single mapping loop time per cycle.
+- Permanent objects persist; only pending objects time out via `pending_ttl_s`.
diff --git a/semantic_mapping/config/camera_param.yaml b/semantic_mapping/config/camera_param.yaml
new file mode 100644
index 0000000..8206adb
--- /dev/null
+++ b/semantic_mapping/config/camera_param.yaml
@@ -0,0 +1,16 @@
+image_width: 1920
+image_height: 640
+hfov: 360.0
+vfov: 120.0
+
+sensor_to_camera:
+ parent_frame: sensor
+ child_frame: camera_link
+ translation:
+ x: -0.12
+ y: -0.075
+ z: 0.265
+ rotation_rpy:
+ roll: -1.5707963
+ pitch: 0.0
+ yaw: -1.5707963
diff --git a/semantic_mapping/config/params.yaml b/semantic_mapping/config/params.yaml
new file mode 100644
index 0000000..f373fb6
--- /dev/null
+++ b/semantic_mapping/config/params.yaml
@@ -0,0 +1,57 @@
+semantic_mapping_node:
+ ros__parameters:
+ detection_topic: /detection_result
+ cloud_topic: /registered_scan
+ odom_topic: /state_estimation
+ viewpoint_topic: /viewpoint_rep_header
+ object_type_answer_topic: /object_type_answer
+ object_type_query_topic: /object_type_query
+ target_object_spatial_query_topic: /target_object_spatial_query
+ target_object_instruction_topic: /target_object_instruction
+ object_nodes_topic: /object_nodes
+ obj_points_topic: /obj_points
+ obj_boxes_topic: /obj_boxes
+ obj_labels_topic: /obj_labels
+ map_frame: map
+
+ confidence_threshold: 0.35
+ voxel_size: 0.05
+ regularize_voxel_size: 0.15
+ distance_threshold: 0.3
+ iou_threshold: 0.3
+ min_detections_for_permanent: 3
+ pending_ttl_s: 5.0
+ num_angle_bins: 15
+
+ detection_linear_state_time_bias: 0.0
+ detection_angular_state_time_bias: 0.0
+ cloud_window_before: 0.5
+ cloud_window_after: 0.1
+
+ camera_param_path: config/camera_param.yaml
+ depth_filter: true
+ depth_filter_margin: 0.15
+ depth_jump_threshold: 0.5
+ mask_erode_kernel: 3
+ mask_erode_iterations: 1
+ max_cloud_distance: 8.0
+ visualization_max_objects: 100
+ ground_filter: true
+ ground_radius: 2.5
+ ground_z_clearance: 0.0
+
+ target_object: refrigerator
+ enable_object_type_queries: False
+ save_png: True
+ output_dir: output/object_images
+ save_viewpoint_images: True
+ viewpoint_output_dir: output/viewpoint_images
+ viewpoint_use_timestamp_dir: True
+ viewpoint_save_png: True
+ viewpoint_save_transform: True
+
+vlm_reasoning_node:
+ ros__parameters:
+ dump_room_type_images: True
+ room_type_dump_dir: output/room_type_images
+ room_type_cache_s: 10.0
\ No newline at end of file
diff --git a/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md b/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md
new file mode 100644
index 0000000..2c341fb
--- /dev/null
+++ b/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md
@@ -0,0 +1,543 @@
+# Semantic Mapping Migration Plan
+
+## Overview
+
+This document outlines the comprehensive plan for porting the `semantic_mapping` package from `VLM_ROS` into `vector_perception_ros/semantic_mapping`, following the coding standards of `vector_perception_ros` while preserving functional compatibility with downstream consumers (particularly the tare_planner).
+
+> **Note**: Detection is decoupled into `detect_anything` package. VLM reasoning uses external Gemini API.
+
+### Current Status (Feb 2026)
+- `semantic_mapping` and `detect_anything` are fully migrated and buildable.
+- `detect_anything` installs `models/` into the package share; defaults resolve
+ `model_path` via package share to avoid cwd dependence.
+- Room segmentation is now in `vector_perception_ros/sensor_coverage`.
+- `/navigation_boundary` publication from room segmentation has been removed;
+ the dedicated `navigationBoundary` node should own that topic.
+
+### Handoff Notes (Next Agent)
+- Validate that bagfile/real/sim launch files in autonomy-stack match VLM_ROS.
+- Ensure `detect_anything` downloads MobileCLIP into the resolved model directory
+ (if weights still go to `~/.ultralytics/weights`, confirm `model_path` param).
+- After changes, rebuild `detect_anything`, `semantic_mapping`, `sensor_coverage`.
+
+## Design Principles
+
+1. **Logic cleanup, not restructure** - Preserve the core algorithms (voxel voting, observation angles, shape regularization, IoU/distance matching) while cleaning up dead code and improving organization.
+2. **Same I/O contract** - Maintain the same ROS topic names, message types, and semantics so existing planner integration continues to work.
+3. **vector_perception_ros coding standards** - Full type hints, Google-style docstrings, dataclass containers, proper ROS2 patterns.
+4. **ObjectDB-style patterns** - Adopt the two-tier (pending/permanent) architecture and dual matching (track_id + distance) from dimos where it improves clarity.
+
+---
+
+## Implementation Progress
+
+| Step | Component | Status |
+|------|-----------|--------|
+| 1 | Package Setup & Messages | **COMPLETED** |
+| 2 | Data Models (`map_object.py`) | **COMPLETED** |
+| 2.1 | Refactor to use shared utilities | **COMPLETED** |
+| 3 | Matching Utilities (`utils/matching.py`) | **COMPLETED** |
+| 4 | Object Mapper (`mapper.py`) | **COMPLETED** |
+| 5 | Cloud-Image Projection (`pointcloud_utils.py`) | **COMPLETED** |
+| 6 | Message Publishers (`utils/map_object_utils.py`) | **COMPLETED** |
+| 7 | Image Storage (`utils/storage.py`) | **COMPLETED** |
+| 8 | Mapping Node (`mapping_node.py`) | **COMPLETED** |
+| 9 | Launch Files & Config | **COMPLETED** |
+| 10 | Integration Testing | **COMPLETED** |
+| 11 | VLM Reasoning Node + Room Type Viz | **COMPLETED** |
+| 12 | Tare Message Bridge (/semantic topics) | **COMPLETED** |
+
+### Step 2.1: Refactored map_object.py to use shared utilities
+
+Added to `vector_perception_utils/transform_utils.py`:
+- `normalize_angle()` - now supports both scalar and array input
+- `discretize_angles()` - discretize angles into bin indices
+
+Added to `vector_perception_utils/pointcloud_utils.py`:
+- `get_oriented_bbox()` - returns (center, corners) tuple from minimal OBB
+
+Refactored in `map_object.py`:
+- Removed duplicate `_normalize_angles_to_pi` and `_discretize_angles` functions
+- Simplified `_compute_geometry()` from 43 lines to 15 using `get_oriented_bbox()`
+- Use `create_pointcloud()` in `regularize_shape()` instead of manual o3d creation
+- Use `compute_centroid()` instead of `np.mean(..., axis=0)`
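+
+For orientation, a minimal sketch of what the two angle helpers do (illustrative; the actual implementations live in `vector_perception_utils/transform_utils.py` and may differ in detail):
+
+```python
+import numpy as np
+
+
+def normalize_angle(angle):
+    """Wrap a scalar or array of angles into [-pi, pi)."""
+    return (np.asarray(angle) + np.pi) % (2 * np.pi) - np.pi
+
+
+def discretize_angles(angles, num_bins: int = 20) -> np.ndarray:
+    """Map angles to integer bin indices in [0, num_bins)."""
+    normalized = normalize_angle(angles)  # [-pi, pi)
+    return (((normalized + np.pi) / (2 * np.pi)) * num_bins).astype(int) % num_bins
+```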
+
+---
+
+## VLM_ROS Reference (Source Implementation)
+
+Key files in `VLM_ROS/src/semantic_mapping/`:
+- `semantic_map_new.py`: `ObjMapper` manages the persistent map
+- `single_object_new.py`: `SingleObject` holds per-object state
+- `semantic_mapping_node.py`: `MappingNode` orchestrates SAM2 masks, LiDAR projection, odom transforms
+
+### ObjMapper Core Flow
+- Input: dict with `bboxes`, `labels`, `confidences`, `ids`, `masks` (from SAM2), plus LiDAR cloud and odom
+- Memory fix: Uses `.copy()` on detection data extraction to prevent memory leaks
+- VLM query threshold: Objects need `best_image_mask_area > 500` before querying VLM
+- `update_map()` steps:
+ 1. Transform incoming cloud into body/world frame using odom
+ 2. Filter detections by confidence threshold
+ 3. For each detection: reconstruct full-image masks (bbox-cropped in `detect_anything/msg/DetectionResult.msg`), project masked cloud, build per-object voxels, then either merge or create a `SingleObject`
+ 4. Merge heuristics: match by tracker `obj_id`, 3D IoU, distance thresholds, and optional primitive groups (e.g., chair/sofa)
+ 5. Maintain delete lists, save queues (best image/mask per object)
+ 6. Publish ROS messages: `ObjectNode`, bbox markers, label markers, colored point cloud
+
+### SingleObject Highlights
+- Keeps class_id→counts/confidence, track_id list, voxel grid (`VoxelFeatureManager`), robot poses, masks, best image saving
+- Computes 3D bbox (axis-aligned + oriented), centroids, clustering flags, and validity caches
+- Manages persistence (`life`, `inactive_frame`, `publish_status`) and VLM interaction state
+
+### dimos Patterns to Adopt
+- `ObjMapper` ⇄ `ObjectDB`: explicit pending/permanent tiers, track_id + spatial matching, promotion thresholds
+- `SingleObject` ⇄ `Object`: cleaner update patterns, separation of core geometry vs VLM metadata
+
+---
+
+## Phase 1: Package Structure & Messages (COMPLETED)
+
+### 1.1 Package Structure
+
+```
+semantic_mapping/
+├── CMakeLists.txt # Hybrid CMake for message generation + Python install
+├── package.xml # Dependencies: detect_anything, vector_perception_utils, ROS msgs
+├── msg/
+│ ├── ObjectNode.msg
+│ ├── ObjectType.msg
+│ ├── TargetObjectInstruction.msg
+│ ├── TargetObjectWithSpatial.msg
+│ └── ViewpointRep.msg
+├── config/
+│ ├── objects.yaml # Object taxonomy (to be copied from VLM_ROS)
+│ └── params.yaml # Default node parameters
+├── launch/
+│ └── semantic_mapping.launch.py
+├── semantic_mapping/
+│ ├── __init__.py
+│ ├── mapping_node.py # ROS2 node (orchestration)
+│ ├── mapper.py # ObjectMapper class (core logic)
+│ ├── map_object.py # Data classes: MapObject, Observation, VoxelManager
+│ ├── utils/
+│ │ ├── matching.py # Object matching utilities (IoU, distance, track_id)
+│ │ ├── map_object_utils.py # ROS message conversion helpers
+│ │ └── storage.py # Async image saving queue
+│   └── (moved to vector_perception_utils)  # Cloud-image projection helpers
+├── docs/
+│ └── SEMANTIC_MAPPING_MIGRATION.md
+└── resource/
+ └── semantic_mapping # ament resource marker
+```
+
+### 1.2 Message Definitions
+
+**ObjectNode.msg** (preserves exact schema for planner compatibility):
+```
+std_msgs/Header header
+int32[] object_id
+string label
+geometry_msgs/Point position
+geometry_msgs/Point[8] bbox3d # 8 corner points in world frame
+sensor_msgs/PointCloud2 cloud
+string status # "new", "updated", "unchanged"
+string img_path
+bool is_asked_vlm
+int32 viewpoint_id
+```
+
+- **RoomType.msg**: Room classification query/answer for VLM reasoning
+- **ObjectType.msg**: VLM query/answer for object relabeling
+- **TargetObjectInstruction.msg**: User-set target/anchor objects
+- **TargetObjectWithSpatial.msg**: Target object query with viewpoint context
+- **ViewpointRep.msg**: Viewpoint trigger message
+
+---
+
+## Phase 2: Data Models (COMPLETED)
+
+### 2.1 ObjectStatus Enum
+
+```python
+class ObjectStatus(Enum):
+ """Object lifecycle status (ObjectDB-style tiering)."""
+ PENDING = "pending" # < min_detections threshold
+ PERMANENT = "permanent" # >= threshold, confirmed object
+ DELETED = "deleted" # Marked for removal
+```
+
+### 2.2 Observation
+
+```python
+@dataclass
+class Observation:
+ """A single detection observation to update a MapObject."""
+ track_id: int
+ label: str
+ confidence: float
+ points: np.ndarray # (N, 3) world-frame points
+ robot_pose: dict # {R: (3,3), t: (3,)}
+ mask: np.ndarray | None = None # 2D segmentation mask
+ image: np.ndarray | None = None # RGB crop for best-image selection
+ timestamp: float = 0.0
+```
+
+### 2.3 VoxelFeatureManager
+
+```python
+@dataclass
+class VoxelFeatureManager:
+ """Manages voxelized point cloud with observation angle voting."""
+ voxels: np.ndarray # (N, 3) world coords
+ votes: np.ndarray # (N,) detection count per voxel
+ observation_angles: np.ndarray # (N, num_bins) angle histogram
+ regularized_mask: np.ndarray | None # (N,) valid voxel mask after clustering
+ voxel_size: float = 0.03
+ num_angle_bins: int = 20
+
+ def update(self, new_points: np.ndarray, robot_position: np.ndarray) -> None:
+ """Add new points with KDTree-based merge and angle voting."""
+
+ def get_valid_voxels(self, min_votes: int = 1) -> np.ndarray:
+ """Return voxels passing vote threshold and regularization mask."""
+```
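+
+A simplified sketch of the KDTree merge, assuming SciPy is available and a
+half-voxel match radius; the observation-angle histogram is omitted for brevity,
+so this is an illustration rather than the actual `update()` implementation:
+
+```python
+import numpy as np
+from scipy.spatial import cKDTree
+
+
+def merge_voxels(voxels: np.ndarray, votes: np.ndarray,
+                 new_points: np.ndarray, voxel_size: float = 0.03):
+    """Quantize new points to the voxel grid and merge them into existing voxels."""
+    new_voxels = np.unique(np.round(new_points / voxel_size) * voxel_size, axis=0)
+    if len(voxels) == 0:
+        return new_voxels, np.ones(len(new_voxels), dtype=int)
+    # A new voxel within half a voxel of an existing one counts as a re-observation.
+    dist, idx = cKDTree(voxels).query(new_voxels, k=1)
+    hit = dist < voxel_size * 0.5
+    votes = votes.copy()
+    np.add.at(votes, idx[hit], 1)
+    # Everything else is appended as a fresh voxel with a single vote.
+    voxels = np.vstack([voxels, new_voxels[~hit]])
+    votes = np.concatenate([votes, np.ones(int((~hit).sum()), dtype=int)])
+    return voxels, votes
+```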
+
+### 2.4 MapObject
+
+```python
+@dataclass
+class MapObject:
+ """Represents a tracked object in the semantic map."""
+ # Identity
+ object_id: str # UUID for this object
+ track_ids: list[int] # Associated tracker IDs (can merge)
+
+ # Classification (voted)
+ class_votes: dict[str, int] # label -> observation count
+ confidence_scores: dict[str, float] # label -> best confidence
+
+ # Geometry
+ voxel_manager: VoxelFeatureManager
+ robot_poses: list[dict] # [{R, t}, ...] observation poses
+
+ # Lifecycle (ObjectDB-style)
+ detection_count: int = 0
+ inactive_frames: int = 0
+ status: ObjectStatus = ObjectStatus.PENDING
+
+ # VLM interaction
+ is_asked_vlm: bool = False
+ updated_by_vlm: bool = False
+ best_image_path: str | None = None
+ best_image_mask_area: float = 0.0
+
+ @property
+ def label(self) -> str: ...
+ @property
+ def centroid(self) -> np.ndarray: ...
+ @property
+ def bbox3d(self) -> np.ndarray: ...
+ def update(self, observation: Observation) -> None: ...
+ def merge(self, other: MapObject) -> None: ...
+ def regularize_shape(self, percentile: float = 0.85) -> None: ...
+```
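+
+As one illustration of the voted-classification idea, the `label` property might
+look like the following inside `MapObject` (tie-breaking by best confidence is an
+assumption, not confirmed behavior):
+
+```python
+    @property
+    def label(self) -> str:
+        """Dominant class by observation count; best confidence breaks ties."""
+        if not self.class_votes:
+            return "unknown"
+        return max(
+            self.class_votes,
+            key=lambda k: (self.class_votes[k], self.confidence_scores.get(k, 0.0)),
+        )
+```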
+
+---
+
+## Phase 3: Matching Utilities (utils/matching.py)
+
+```python
+def compute_iou_3d(bbox1: np.ndarray, bbox2: np.ndarray) -> float:
+ """Compute 3D IoU between two axis-aligned bounding boxes."""
+
+def compute_center_distance(obj1: MapObject, obj2: MapObject) -> float:
+ """Compute Euclidean distance between object centroids."""
+
+def are_merge_compatible(label1: str, label2: str, merge_groups: list[set[str]]) -> bool:
+ """Check if two labels can be merged (same class or in same merge group)."""
+
+def find_best_match(
+ obs: Observation,
+ candidates: list[MapObject],
+ distance_threshold: float,
+ iou_threshold: float,
+) -> MapObject | None:
+ """Find best matching object by distance and IoU criteria."""
+```
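+
+A minimal axis-aligned IoU sketch, assuming each box arrives as an `(8, 3)` array
+of corner points (the `bbox3d` convention used elsewhere in this document):
+
+```python
+import numpy as np
+
+
+def compute_iou_3d(bbox1: np.ndarray, bbox2: np.ndarray) -> float:
+    """Axis-aligned 3D IoU from two (8, 3) corner arrays."""
+    min1, max1 = bbox1.min(axis=0), bbox1.max(axis=0)
+    min2, max2 = bbox2.min(axis=0), bbox2.max(axis=0)
+    overlap = np.minimum(max1, max2) - np.maximum(min1, min2)
+    if np.any(overlap <= 0):
+        return 0.0  # boxes do not intersect along at least one axis
+    intersection = float(np.prod(overlap))
+    vol1 = float(np.prod(max1 - min1))
+    vol2 = float(np.prod(max2 - min2))
+    return intersection / (vol1 + vol2 - intersection)
+```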
+
+---
+
+## Phase 4: Object Mapper (mapper.py)
+
+```python
+class ObjectMapper:
+ """
+ Manages the semantic object map with two-tier state management.
+ Pending objects are promoted to permanent after detection_count >= threshold.
+ """
+
+ def __init__(
+ self,
+ voxel_size: float = 0.03,
+ confidence_threshold: float = 0.30,
+ distance_threshold: float = 0.5,
+ iou_threshold: float = 0.2,
+ min_detections_for_permanent: int = 3,
+ pending_ttl_s: float = 10.0,
+ max_inactive_frames: int = 20,
+ ) -> None:
+ self._pending_objects: dict[str, MapObject] = {}
+ self._permanent_objects: dict[str, MapObject] = {}
+ self._track_id_map: dict[int, str] = {}
+ self._lock = threading.RLock()
+ self._merge_groups: list[set[str]] = [
+ {"chair", "sofa", "couch", "armchair"},
+ {"table", "desk"},
+ ]
+
+ def update(self, observations: list[Observation], target_object: str | None = None) -> UpdateStats: ...
+ def _match_observation(self, obs: Observation) -> MapObject | None: ...
+ def _create_object(self, obs: Observation) -> MapObject: ...
+ def _check_promotion(self, obj: MapObject) -> None: ...
+ def _try_merge_objects(self) -> None: ...
+ def _prune_inactive(self, target_object: str | None) -> list[MapObject]: ...
+ def get_objects(self, include_pending: bool = False) -> list[MapObject]: ...
+```
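+
+The pending-to-permanent promotion can be sketched as below; the attribute name
+`min_detections_for_permanent` on the mapper is assumed from the constructor
+parameter above, not taken from the actual source:
+
+```python
+    def _check_promotion(self, obj: MapObject) -> None:
+        """Promote a pending object once it has accumulated enough detections."""
+        if (obj.status is ObjectStatus.PENDING
+                and obj.detection_count >= self.min_detections_for_permanent):
+            obj.status = ObjectStatus.PERMANENT
+            self._permanent_objects[obj.object_id] = self._pending_objects.pop(obj.object_id)
+```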
+
+---
+
+## Phase 5: Mapping Node (mapping_node.py)
+
+### 5.1 Node Structure
+
+```python
+class SemanticMappingNode(Node):
+ """ROS2 node for semantic 3D object mapping."""
+
+ def __init__(self) -> None:
+ super().__init__("semantic_mapping_node")
+ self._declare_parameters()
+ self.mapper = ObjectMapper(...)
+ self.projection = pointcloud_utils
+ self.image_saver = ImageSaveQueue(...)
+
+ # Data buffers (with locks)
+ self._cloud_buffer: deque[tuple[float, np.ndarray]] = deque(maxlen=50)
+ self._odom_buffer: deque[tuple[float, Pose]] = deque(maxlen=100)
+ self._lock = threading.Lock()
+
+ self._create_subscriptions()
+ self._create_publishers()
+ self.create_timer(0.5, self._mapping_callback)
+```
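+
+One way such buffers are typically consumed is a nearest-timestamp lookup when a
+detection arrives; a small sketch under the assumption that entries are appended
+in time order and a fixed tolerance is acceptable (not the node's exact sync logic):
+
+```python
+import bisect
+
+
+def find_nearest(buffer, stamp: float, tol: float = 0.05):
+    """Return the buffered (time, msg) entry closest to `stamp`, or None if beyond `tol`."""
+    times = [t for t, _ in buffer]
+    i = bisect.bisect_left(times, stamp)
+    candidates = [j for j in (i - 1, i) if 0 <= j < len(times)]
+    if not candidates:
+        return None
+    best = min(candidates, key=lambda j: abs(times[j] - stamp))
+    return buffer[best] if abs(times[best] - stamp) <= tol else None
+```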
+
+### 5.2 Subscriptions
+
+| Topic | Type | QoS |
+|-------|------|-----|
+| `/detection_result` | `detect_anything/DetectionResult` | Reliable |
+| `/registered_scan` | `PointCloud2` | Best Effort |
+| `/state_estimation` | `Odometry` | Best Effort |
+| `/object_type_answer` | `ObjectType` | Reliable |
+| `/target_object_instruction` | `TargetObjectInstruction` | Reliable |
+| `/viewpoint_rep_header` | `ViewpointRep` | Reliable |
+
+### 5.3 Publishers
+
+| Topic | Type | Description |
+|-------|------|-------------|
+| `/object_nodes` | `ObjectNode` | Main output for planner |
+| `/obj_points` | `PointCloud2` | Colored object cloud |
+| `/obj_boxes` | `MarkerArray` | Bbox wireframes |
+| `/obj_labels` | `MarkerArray` | Text labels |
+| `/object_type_query` | `ObjectType` | VLM query |
+| `/target_object_spatial_query` | `TargetObjectWithSpatial` | Target object + viewpoints |
+
+Notes:
+- `semantic_mapping` topics are now published without a `/semantic` prefix and no
+  bridge is required. Annotated debug images are published by `detect_anything`.
+- `semantic_mapping.launch.py` includes a temporary image republisher that bridges
+  `compressed_image_topic` (e.g., `/camera/image/compressed`) to `image_topic`
+  (default `/camera/image`) so legacy `tare_planner` room-type queries can still
+  consume raw `sensor_msgs/Image`.
+
+---
+
+## Phase 6: Cloud-Image Projection (pointcloud_utils.py)
+
+```python
+def project_pointcloud_to_image(...): ...
+def segment_pointcloud_by_mask(...): ...
+def segment_pointclouds_from_projection(...): ...
+def transform_segmented_clouds_to_world(...): ...
+```
+
+---
+
+## Phase 7: Message Publishers (utils/map_object_utils.py)
+
+```python
+def map_object_to_object_node(obj: MapObject, header: Header, status: str, target_object: str | None = None) -> ObjectNode | None: ...
+def create_bbox_markers(objects: list[MapObject], header: Header) -> MarkerArray: ...
+def create_label_markers(objects: list[MapObject], header: Header) -> MarkerArray: ...
+def create_object_pointcloud(objects: list[MapObject], header: Header) -> PointCloud2 | None: ...
+def create_delete_markers(deleted_ids: list[int], header: Header, namespace: str) -> MarkerArray: ...
+```
+
+Publisher helpers:
+- `map_object_to_object_node` converts a `MapObject` into an `ObjectNode` with centroid, bbox corners, and point cloud.
+- Marker helpers emit bbox wireframes and text labels for RViz (corner-to-edge indexing sketched below).
+- `create_object_pointcloud` builds a colored aggregate cloud from object voxels.
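+
+With the documented corner ordering (bottom face 0-3, top face 4-7), the wireframe
+reduces to a fixed edge-index table; a small sketch of how LINE_LIST point pairs
+could be generated (illustrative, not the actual helper):
+
+```python
+# 12 edges of a box under the ObjectNode.bbox3d corner ordering:
+# bottom face 0-1-2-3, top face 4-5-6-7, verticals i -> i + 4.
+BOX_EDGES = [
+    (0, 1), (1, 2), (2, 3), (3, 0),   # bottom rectangle
+    (4, 5), (5, 6), (6, 7), (7, 4),   # top rectangle
+    (0, 4), (1, 5), (2, 6), (3, 7),   # vertical edges
+]
+
+
+def wireframe_points(corners):
+    """Flatten 8 corners into the point pairs a LINE_LIST marker consumes."""
+    return [corners[i] for edge in BOX_EDGES for i in edge]
+```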
+
+---
+
+## Phase 8: Image Storage (utils/storage.py)
+
+```python
+class ImageSaveQueue:
+ """Background worker for saving object images to disk."""
+
+ def __init__(self, output_dir: Path, max_queue_size: int = 100) -> None: ...
+ def start(self) -> None: ...
+ def stop(self) -> None: ...
+ def queue_save(self, object_id: str, image: np.ndarray, mask: np.ndarray | None = None) -> str: ...
+```
+
+Image storage:
+- Async worker writes `.npy` (and optional `.png`) crops to disk.
+- Queue is bounded to avoid blocking the main mapping loop (worker pattern sketched below).
+- Viewpoint snapshots are saved as `viewpoint_{id}.png` and
+ `viewpoint_{id}_transform.npy` in `viewpoint_output_dir` (timestamped per run).
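+
+A minimal sketch of the bounded background-writer pattern (filenames, the
+drop-on-full policy, and the omission of mask handling are assumptions for
+illustration only):
+
+```python
+import queue
+import threading
+from pathlib import Path
+
+import numpy as np
+
+
+class BoundedImageWriter:
+    """Background writer that drops work instead of blocking the mapping loop."""
+
+    def __init__(self, output_dir: Path, max_queue_size: int = 100) -> None:
+        self._dir = Path(output_dir)
+        self._dir.mkdir(parents=True, exist_ok=True)
+        self._queue: queue.Queue = queue.Queue(maxsize=max_queue_size)
+        self._worker = threading.Thread(target=self._run, daemon=True)
+
+    def start(self) -> None:
+        self._worker.start()
+
+    def stop(self) -> None:
+        self._queue.put(None)  # sentinel shuts the worker down
+        self._worker.join()
+
+    def queue_save(self, object_id: str, image: np.ndarray) -> str:
+        path = self._dir / f"{object_id}.npy"
+        try:
+            self._queue.put_nowait((path, image))
+        except queue.Full:
+            pass  # prefer dropping a frame over stalling the mapping callback
+        return str(path)
+
+    def _run(self) -> None:
+        while (item := self._queue.get()) is not None:
+            path, image = item
+            np.save(path, image)
+```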
+
+---
+
+## Phase 9: Launch Files
+
+```python
+# semantic_mapping.launch.py
+DeclareLaunchArgument('platform', default_value='mecanum_sim')
+DeclareLaunchArgument('detection_topic', default_value='/detection_result')
+DeclareLaunchArgument('cloud_topic', default_value='/registered_scan')
+DeclareLaunchArgument('odom_topic', default_value='/state_estimation')
+DeclareLaunchArgument('voxel_size', default_value='0.03')
+DeclareLaunchArgument('confidence_threshold', default_value='0.30')
+DeclareLaunchArgument('target_object', default_value='')
+```
+
+---
+
+## Implementation Checklist
+
+### Completed
+- [x] Create `CMakeLists.txt` for message generation
+- [x] Port message definitions from tare_planner
+- [x] Set up `package.xml` with dependencies
+- [x] Implement `map_object.py` with VoxelFeatureManager, MapObject, Observation
+- [x] Port lazy computation patterns (centroid, bbox3d, regularization)
+- [x] Refactor `map_object.py` to use shared utilities from `vector_perception_utils`
+- [x] Add `normalize_angle`, `discretize_angles` to `transform_utils.py`
+- [x] Add `get_oriented_bbox` to `pointcloud_utils.py`
+- [x] Simplify `_compute_geometry()` using Open3D oriented bbox
+- [x] Implement `utils/matching.py` with IoU, distance, merge-compatibility functions
+- [x] Implement `mapper.py` with ObjectDB-style tiering
+- [x] Implement cloud-image projection helpers in `vector_perception_utils/pointcloud_utils.py`
+- [x] Implement `utils/map_object_utils.py` message conversion functions
+- [x] Implement `utils/storage.py` with async queue
+- [x] Implement `mapping_node.py` following detect_anything patterns
+- [x] Add `RoomType.msg` and room-type VLM flow + visualization
+- [x] Add `vlm_reasoning_node.py` with Moondream-backed reasoning
+- [x] Remove `tare_message_bridge` and `/semantic` topic namespace
+
+### Remaining
+- [x] Create launch files and params.yaml
+- [x] Integration testing with bag files
+
+---
+
+## Code Cleanup Items
+
+Items from VLM_ROS to preserve:
+- Memory fixes with `.copy()` on detection data extraction
+- VLM query threshold (`best_image_mask_area > 500`)
+
+Items to remove:
+1. Unused publishers: `/cloud_image`
+2. Unused methods: `serialize_objs_to_bag()`, `is_merge_allowed()`, `handle_object_query()`
+3. Review constants: `BACKGROUND_OBJECTS`, `VERTICAL_OBJECTS`, `BIG_OBJECTS`, `SMALL_OBJECTS`
+4. Don't port: `byte_track`, `Grounded-SAM-2`, `mmconfig` directories
+
+---
+
+## Compatibility Checklist
+
+- [x] `/object_nodes` publishes ObjectNode with exact schema (no `/semantic` prefix or bridge)
+- [x] `/obj_points` publishes colored PointCloud2
+- [x] `/obj_boxes` publishes MarkerArray wireframes
+- [x] `/obj_labels` publishes MarkerArray text
+- [x] `/object_type_query` publishes ObjectType for VLM
+- [x] `/room_type_query` publishes RoomType for VLM
+- [x] Subscribes to `/detection_result` from detect_anything
+- [x] Subscribes to `/registered_scan` PointCloud2
+- [x] Subscribes to `/state_estimation` Odometry
+- [x] Subscribes to `/target_object_instruction`
+- [x] Subscribes to `/viewpoint_rep_header`
+- [x] Subscribes to `/object_type_answer`
+- [x] Subscribes to `/room_type_answer`
+- [x] Object positions, bbox corners, status flags match expected semantics
+- [x] Delete markers properly remove objects from RViz
+
+---
+
+## Message Type Alignment (Planner Integration)
+
+Planner/VLM traffic now uses `semantic_mapping/msg/*` end-to-end. The
+previous package-level mismatch has been removed, so no bridge is required.
+
+### Impact
+
+- `semantic_mapping` publishes and subscribes to `semantic_mapping/*` messages
+ on planner topic names (no `/semantic` prefix required).
+
+---
+
+## Recent Updates (Jan 2026)
+
+- Unified launch to `semantic_mapping.launch.py` with mode handling, detect_anything integration, and bagfile defaults.
+- Fixed cloud/image projection transform to match legacy `CloudImageFusion` math.
+- Added mask erosion, depth-jump pruning, distance gating, and light outlier filtering for cleaner object clouds.
+- Added reprojected observation-angle voting for better multi-view aggregation.
+- Fixed bbox ordering and switched `/obj_boxes` to axis-aligned markers (via `get_oriented_bbox(axis_aligned=True)`).
+- Replaced DBSCAN regularization with vote-weighted filtering + voxel connected components.
+- Added `regularize_voxel_size` to grow connectivity for thin structures.
+- Simplified ground filtering to radius-based Z clamp around robot.
+- Added `visualization_max_objects` to keep recent objects visible in RViz.
+- Logging: mapping stats (total/total_deleted/total_permanent) + mapping loop time.
+- Fixed image saving color order and ensured output dir is cleared on start.
+- Set MobileCLIP weights cache to `detect_anything/models` and removed corrupted caches.
+- Viewpoint snapshots are saved via `ImageSaveQueue` with timestamped run folders.
+- Added `RoomType.msg` and VLM room-type query/answer with room label visualization.
+- Removed `/semantic` topic namespace and bridge in favor of unified message types.
+- Moved `project_bbox3d_equirect` into `vector_perception_utils` to drop `dimos` dependency.
+- Added Moondream request timeouts and escaped JSON prompt templates to avoid hangs.
+- Added object-type query fallback when target labels are absent.
+- Stabilized room label colors and trimmed non-essential logging to debug level.
+
+## Temporary Compatibility Notes
+
+- Planner and mapping nodes now share `semantic_mapping/msg/*`; no bridge is required.
+
+## Temporary Handoff Notes
+
+- Regularization is now connected-components; tune `regularize_voxel_size` and keep ratio
+ (`MapObject.regularize_shape` percentile) if legs/sparse parts are dropped.
+- Ground filter uses `ground_filter`, `ground_radius`, `ground_z_clearance`.
+- If bagfile sync issues persist, consider switching mapping inputs to `/sensor_scan` + `/state_estimation_at_scan`.
+- Verify VLM objects.yaml path in VLM_ROS (`vlm_reasoning_node` still expects old path).
+
+---
+
+## Planner Dependencies
+
+The exploration planner (`sensor_coverage_planner_ground.cpp` in tare_planner) is tightly coupled to `ObjectNode`:
+- Subscribes to `/object_nodes` and stores per-object reps
+- Uses object positions, bbox corners, clouds, status, `is_asked_vlm`, and `img_path` for navigation decisions
+
+**Implication**: Keep the `ObjectNode` schema stable. Any new fields from the `MapObject` model can be mapped into extensions, but existing fields and semantics must remain for planner compatibility.
diff --git a/semantic_mapping/docs/TARE_PLANNER_VLM_DIFF_AND_MIGRATION.md b/semantic_mapping/docs/TARE_PLANNER_VLM_DIFF_AND_MIGRATION.md
new file mode 100644
index 0000000..471e99c
--- /dev/null
+++ b/semantic_mapping/docs/TARE_PLANNER_VLM_DIFF_AND_MIGRATION.md
@@ -0,0 +1,733 @@
+# TARE Planner (VLM_ROS vs Autonomy Stack) Diff + VLM Logic Deep Dive
+
+## 0) Purpose and Scope
+This document compares the two `tare_planner/src` trees and provides a detailed
+breakdown of the VLM/room logic inside
+`VLM_ROS/src/exploration_planner/tare_planner/src/sensor_coverage_planner/sensor_coverage_planner_ground.cpp`.
+It also proposes a migration path into a new `vector_perception_ros`
+package named `sensor_coverage_planner`, including room segmentation.
+
+### 0.1 Migration Status (Feb 2026)
+Completed:
+- `tare_planner` in autonomy-stack now uses `semantic_mapping/msg/*` in
+ `representation` and `sensor_coverage_planner_ground` (headers + source).
+- `tare_planner` CMake and package dependencies updated for `semantic_mapping`.
+- `navigationBoundary` publisher code matches VLM_ROS.
+- Added `boundary_sim.ply` to `tare_planner/data` for sim launch.
+- `explore_world.launch` and `explore_world_sim.launch` default
+ `use_boundary:=false` (no boundary published unless explicitly enabled).
+- Room segmentation node moved to `vector_perception_ros/sensor_coverage`.
+
+Open items / verification:
+- Re-check `system_bagfile_with_exploration_planner.launch.py` alignment with
+ VLM_ROS (file was reverted during edits).
+- Ensure `tare_planner` install includes `boundary_sim.ply` after rebuild.
+- Remove `pub_room_boundary_tmp_` declaration in room segmentation header
+ if no longer used.
+
+Scope:
+- Compare all package/file differences between:
+ - `autonomy_stack_mecanum_wheel_platform/src/exploration_planner/tare_planner/src`
+ - `VLM_ROS/src/exploration_planner/tare_planner/src`
+- Map all ROS inputs/outputs (topics + message types) for the nodes in scope.
+- Deep dive on the main VLM logic file (SensorCoveragePlanner3D).
+- Feasibility + cleanup/migration plan into `vector_perception_ros`.
+
+## 1) Context From Existing Migration Docs (Short)
+Key context pulled from:
+- `vector_perception_ros/vlm/docs/VLM_INPUTS_AND_SELECTION.md`
+- `vector_perception_ros/vlm/docs/VLM_NODE_MIGRATION.md`
+- `vector_perception_ros/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md`
+
+Highlights relevant to `tare_planner`:
+- Room-type VLM images are produced inside `sensor_coverage_planner_ground.cpp`
+ via `project_pcl_to_image(...)` (cropped panorama + room mask). Viewpoint
+ snapshots are **not** used for room type.
+- Object-type VLM uses the "best image" crop from semantic mapping, not planner.
+- Target-object spatial queries use viewpoint snapshots and 3D bbox projection.
+- Message types are now unified under `semantic_mapping/msg/*`; no bridge is
+ required.
+- Real-time constraints: avoid heavy per-parameter checks and excessive
+ try/except patterns in hot loops.
+
+## 2) Directory and File Differences
+
+### 2.1 Directory-Level Diff
+Only in `VLM_ROS/src/exploration_planner/tare_planner/src`:
+- `representation/`
+ - `representation.cpp`
+- `room_segmentation/`
+ - `room_segmentation.cpp`
+ - `room_segmentation_incremental_normal.cpp`
+ - `room_segmentation_backup.cpp`
+
+All other directories are present in both trees.
+
+### 2.2 File-Level Diff (Common Directories)
+Files that differ (content changes):
+- `grid_world/grid_world.cpp`
+- `local_coverage_planner/local_coverage_planner.cpp`
+- `planning_env/planning_env.cpp`
+- `rolling_occupancy_grid/rolling_occupancy_grid.cpp`
+- `sensor_coverage_planner/sensor_coverage_planner_ground.cpp`
+- `utils/misc_utils.cpp`
+- `viewpoint/viewpoint.cpp`
+- `viewpoint_manager/viewpoint_manager.cpp`
+
+Summary of differences by file:
+
+**`grid_world.cpp`**
+- Adds room-level state and navigation gates:
+ - `current_room_id_`, `transit_across_room_`, `room_finished_`,
+ `room_near_finished_`, `door_position_`.
+- Adds object/anchor state gates:
+ - `object_found_`, `anchor_object_found_`, `found_object_position_`.
+- Reads room resolution + voxel dimensions (`room_x/y/z`) and computes `shift_`.
+- Restricts cell status transitions to cells that are "in current room".
+- Adds explicit pathing for:
+ - transit across rooms (door path),
+ - found object path,
+ - anchor object TSP path (dynamic mode).
+- Adds `InLocalPlanningHorizonWithoutRoom` gating for roadmap connections.
+
+**`local_coverage_planner.cpp`**
+- Introduces room-aware viewpoint selection:
+ - `viewpoint_roomcandidate_indices` and
+ `ViewPointRoomCandidate` sampling.
+- Lookahead and boundary viewpoint logic now bypasses room gating when needed:
+ - `InLocalPlanningHorizonWithoutRoom(...)`.
+- Adds `transit_across_room_`, `object_found_`, `anchor_object_found_` gates.
+- When transiting/target found: local planning becomes a constrained path
+ using navigation viewpoints rather than exploration sampling.
+
+**`planning_env.cpp`**
+- Adds door collision checks and parameters:
+ - `kDoorCollisionCheckRadius`, `kDoorCollisionCheckPointNumThr`.
+- Adds occupancy-grid line-of-sight hooks:
+ - `CheckLineOfSightInOccupancyGrid(...)`
+ - `Pos2Sub(...)`, `InRange(...)`.
+- Adds `occupied_cloud` storage for room segmentation inputs.
+- Improves safety on coverage-boundary usage (warn when empty).
+
+**`rolling_occupancy_grid.cpp`**
+- Adds sensor-range-limited ray tracing and keeps "COVERED" cells.
+- Adds visibility / line-of-sight check for ray casting:
+ - `CheckLineOfSight(...)`, `PublishVoxels(...)`.
+- Adds helper conversions `Pos2Sub(...)`, `InRange(...)`.
+- Adds parameters for viewpoint collision margin Z.
+- Publishes ray voxels for debugging.
+
+**`utils/misc_utils.cpp`**
+- Adds `SetIntersection(...)` (used for viewpoint rep updates).
+- Adds BFS path helper `find_path_bfs(...)` (room transitions).
+- Adds `idToColor(...)` and HSV conversion (stable room coloring).
+- Adds voxel mapping helpers:
+ - `point_to_voxel(...)`, `voxel_to_point(...)`,
+ cropped variants for room masks.
+
+**`viewpoint.cpp`**
+- Adds room flags to each viewpoint:
+ - `in_room_`, `is_room_candidate_`.
+
+**`viewpoint_manager.cpp`**
+- Reads room parameters and maintains `room_mask_`.
+- Adds `CheckViewPointRoomBoundaryCollision()` to set `in_room_`.
+- Adds room candidate flagging and exposure in candidate selection.
+- Adds `InLocalPlanningHorizonWithoutRoom(...)` to allow pathing near doors.
+- Adds freespace cloud publication and room gates on candidate viewpoints.
+
+**`sensor_coverage_planner_ground.cpp`**
+- Huge expansion to integrate:
+ - `representation` (viewpoint, room, object).
+ - `room_segmentation` inputs.
+ - VLM room/object query + navigation logic.
+ - Object visibility graph and viewpoint snapshots.
+ - Room navigation state machine (finish, change, transit).
+- Full deep dive in section 4.
+
+### 2.3 Common Core Between the Two `sensor_coverage_planner_ground.cpp` Versions
+These are shared in both the autonomy-stack and VLM_ROS files (the VLM version
+layers additional room/object logic on top).
+
+**Common node lifecycle**
+- `SensorCoveragePlanner3D` class, 1 Hz `execute()` timer.
+- Same init chain: `ReadParameters()` → `InitializeData()` → `initialize()`.
+- Shared managers and core members: `keypose_graph_`, `planning_env_`,
+ `grid_world_`, `local_coverage_planner_`, `viewpoint_manager_`,
+ `visualizer_`, plus the same PCL debug clouds.
+
+**Common subscriptions**
+- `/exploration_start`, `/state_estimation_at_scan`, `/registered_scan`
+- `/terrain_map`, `/terrain_map_ext`
+- `/coverage_boundary`, `/navigation_boundary`, `/nogo_boundary`
+- `/joy`, `/reset_waypoint`
+
+**Common publishers**
+- `global_path_full`, `global_path`, `local_path`, `exploration_path`
+- `/way_point`, `exploration_finish`
+- `runtime_breakdown`, `/runtime`, `momentum_activation_count`
+
+**Common planning pipeline in `execute()`**
+- Start gate on `/exploration_start` unless `kAutoStart`.
+- First-cycle bootstrap: `SendInitialWaypoint()` + start-time init.
+- On `keypose_cloud_update_`:
+ - `CountDirectionChange()`
+ - `UpdateGlobalRepresentation()`
+ - `UpdateViewPoints()` and `UpdateKeyposeGraph()`
+ - `UpdateViewPointCoverage()` + `UpdateCoveredAreas()`
+ - `GlobalPlanning()` + `LocalPlanning()`
+ - `ConcatenateGlobalLocalPath()` + `GetLookAheadPoint()`
+ - `PublishWaypoint()` + visualization + runtime publishing
+
+**Common helper functions**
+- `SendInitialWaypoint`, `UpdateKeyposeGraph`, `UpdateViewPoints`,
+ `UpdateViewPointCoverage`, `UpdateCoveredAreas`, `UpdateGlobalRepresentation`
+- `GlobalPlanning`, `LocalPlanning`
+- `PublishGlobalPlanningVisualization`, `PublishLocalPlanningVisualization`
+- `ConcatenateGlobalLocalPath`, `GetLookAheadPoint`, `PublishWaypoint`,
+ `PublishRuntime`, `CountDirectionChange`
+
+## 3) ROS I/O Mapping
+
+### 3.1 `SensorCoveragePlanner3D` (VLM_ROS)
+Node: `tare_planner_node` creates and spins `SensorCoveragePlanner3D`.
+
+Subscriptions (inputs):
+- `/exploration_start` (`std_msgs/Bool`)
+- `/state_estimation_at_scan` (`nav_msgs/Odometry`)
+- `/registered_scan` (`sensor_msgs/PointCloud2`)
+- `/terrain_map` (`sensor_msgs/PointCloud2`)
+- `/terrain_map_ext` (`sensor_msgs/PointCloud2`)
+- `/coverage_boundary` (`geometry_msgs/PolygonStamped`)
+- `/navigation_boundary` (`geometry_msgs/PolygonStamped`)
+- `/current_room_boundary` (`geometry_msgs/PolygonStamped`) [declared param, used in room manager]
+- `/nogo_boundary` (`geometry_msgs/PolygonStamped`)
+- `/joy` (`sensor_msgs/Joy`)
+- `/reset_waypoint` (`std_msgs/Empty`)
+- `/object_nodes` (`tare_planner/ObjectNode`)
+- `/door_cloud` (`sensor_msgs/PointCloud2`)
+- `/room_nodes_list` (`tare_planner/RoomNodeList`)
+- `/goal_point` (`geometry_msgs/PointStamped`)
+- `/room_mask` (`sensor_msgs/Image`)
+- `/camera/image` (`sensor_msgs/Image`)
+- `/room_type_answer` (`tare_planner/RoomType`)
+- `/room_navigation_answer` (`tare_planner/VlmAnswer`)
+- `/keyboard_input` (`std_msgs/String`)
+- `/target_object_instruction` (`tare_planner/TargetObjectInstruction`)
+- `/target_object_answer` (`tare_planner/TargetObject`)
+- `/anchor_object_answer` (`tare_planner/TargetObject`)
+
+Publishers (outputs):
+- `global_path_full` (`nav_msgs/Path`)
+- `global_path` (`nav_msgs/Path`)
+- `old_global_path` (`nav_msgs/Path`)
+- `to_nearest_global_subspace_path` (`nav_msgs/Path`)
+- `local_path` (`nav_msgs/Path`)
+- `exploration_path` (`nav_msgs/Path`)
+- `/way_point` (`geometry_msgs/PointStamped`)
+- `exploration_finish` (`std_msgs/Bool`)
+- `runtime_breakdown` (`std_msgs/Int32MultiArray`)
+- `/runtime` (`std_msgs/Float32`)
+- `momentum_activation_count` (`std_msgs/Int32`)
+- `pointcloud_manager_neighbor_cells_origin` (`geometry_msgs/PointStamped`)
+- `viewpoint_rep_header` (`tare_planner/ViewpointRep`)
+- `object_visibility_connections` (`visualization_msgs/MarkerArray`)
+- `viewpoint_object_visibility` (`std_msgs/String`) [currently unused]
+- `/door_position` (`geometry_msgs/PointStamped`)
+- `/door_normal` (`visualization_msgs/Marker`)
+- `/room_cloud` (`sensor_msgs/PointCloud2`)
+- `/room_type_query` (`tare_planner/RoomType`)
+- `/room_early_stop_1` (`tare_planner/RoomEarlyStop1`)
+- `/room_type_vis` (`visualization_msgs/MarkerArray`)
+- `viewpoint_room_ids` (`visualization_msgs/MarkerArray`)
+- `/room_navigation_query` (`tare_planner/NavigationQuery`)
+- `/object_node_markers` (`visualization_msgs/MarkerArray`)
+- `/chosen_room_boundary` (`visualization_msgs/Marker`)
+- `/target_object_query` (`tare_planner/TargetObject`)
+- `/anchor_object_query` (`tare_planner/TargetObject`)
+- `/target_object_spatial_query` (`tare_planner/TargetObjectWithSpatial`)
+- `/room_anchor_point` (`geometry_msgs/PointStamped`)
+
+Key parameters (VLM additions):
+- `room_resolution`, `room_x`, `room_y`, `room_z`
+- `rep_threshold_`, `kRepSensorRange`
+- `kRushRoomDist_1`, `kRushRoomDist_2`
+- `kCollisionGrid*`, `kCoverage*` (inherited)
+
+I/O delta vs autonomy_stack:
+- Autonomy version uses only exploration, map, boundary, joystick, and path
+ topics. VLM_ROS adds: room segmentation, object nodes, VLM queries/answers,
+ viewpoint reps, visibility markers, navigation queries, door clouds, room
+ masks, camera image pipeline.
+
+### 3.2 `RoomSegmentationNode` (VLM_ROS)
+Node: `room_segmentation_node` (standalone main in `room_segmentation.cpp`).
+
+Subscriptions:
+- `/registered_scan` (`sensor_msgs/PointCloud2`)
+- `/state_estimation` (`nav_msgs/Odometry`)
+- `/occupied_cloud` (`sensor_msgs/PointCloud2`)
+- `/freespace_cloud` (`sensor_msgs/PointCloud2`)
+
+Publishers:
+- `/explore_areas_new` (`sensor_msgs/PointCloud2`)
+- `/room_mask_vis` (`sensor_msgs/Image`)
+- `/room_mask` (`sensor_msgs/Image`)
+- `/door_cloud` (`sensor_msgs/PointCloud2`)
+- `/debug_cloud` (`sensor_msgs/PointCloud2`)
+- `/free_cloud_1` (`sensor_msgs/PointCloud2`)
+- `/current_room_boundary` (`geometry_msgs/PolygonStamped`)
+- `/navigation_boundary` (`geometry_msgs/PolygonStamped`)
+- `/room_boundaries` (`visualization_msgs/MarkerArray`)
+- `/walls` (`sensor_msgs/PointCloud2`)
+- `/room_nodes` (`tare_planner/RoomNode`)
+- `/room_nodes_list` (`tare_planner/RoomNodeList`)
+- `/room_map_cloud` (`sensor_msgs/PointCloud2`)
+
+Key parameters:
+- `room_resolution`, `room_x/y/z`, `rolling_occupancy_grid.resolution_x`
+- `exploredAreaVoxelSize`, `ceilingHeight_`, `wall_thres_height_`
+- `region_growing_radius`, `min_room_size`, `dilation_iteration`
+- `normal_search_num`, `normal_search_radius`
+- `kViewPointCollisionMarginZPlus`, `kViewPointCollisionMarginZMinus`
+
+### 3.3 `navigationBoundary` Node
+Node: `navigationBoundary` (file-based boundary publisher).
+
+Publishers:
+- `/navigation_boundary` (`geometry_msgs/PolygonStamped`)
+
+Inputs:
+- File on disk via `boundary_file_dir` param
+- Params: `sendBoundary`, `sendBoundaryInterval`
+
+### 3.4 Legacy `SensorCoveragePlanner3D` Outputs (Autonomy Stack)
+These are the outputs of the **old** planner that must be preserved/relayed
+so navigation keeps working unchanged.
+
+**Primary outputs (used by navigation stack + tooling)**
+- `global_path_full` (`nav_msgs/Path`)
+- `global_path` (`nav_msgs/Path`)
+- `old_global_path` (`nav_msgs/Path`)
+- `to_nearest_global_subspace_path` (`nav_msgs/Path`)
+- `local_path` (`nav_msgs/Path`)
+- `exploration_path` (`nav_msgs/Path`)
+- `/way_point` (`geometry_msgs/PointStamped`)
+- `exploration_finish` (`std_msgs/Bool`)
+- `runtime_breakdown` (`std_msgs/Int32MultiArray`)
+- `/runtime` (`std_msgs/Float32`)
+- `momentum_activation_count` (`std_msgs/Int32`)
+- `pointcloud_manager_neighbor_cells_origin` (`geometry_msgs/PointStamped`)
+
+**Debug/visualization outputs (PCLCloud wrappers)**
+These are published via `PCLCloud::Publish()` in the old file and appear as
+topics with the names passed at construction time:
+- `keypose_cloud`, `registered_scan_stack`, `registered_cloud`
+- `terrain_cloud_large`, `terrain_collision_cloud`, `terrain_ext_collision_cloud`
+- `viewpoint_vis_cloud`, `viewpoint_in_collision_cloud_`
+- `grid_world_vis_cloud`, `bspline_path_cloud`, `selected_viewpoint_vis_cloud`
+- `collision_cloud`, `lookahead_point_cloud`, `keypose_graph_cloud`
+- `pointcloud_manager_cloud`, `reordered_global_subspace_cloud`
+
+If you only want to keep strict navigation compatibility, the **primary**
+outputs above are the ones that must be relayed; debug topics can be optional.
+
+## 4) SensorCoveragePlanner3D VLM Logic: Deep Dive
+
+This section breaks down the VLM/room logic inside
+`sensor_coverage_planner_ground.cpp` with explicit loops, data dependencies,
+and I/O.
+
+### 4.0 Legacy Core Data Structures Reused by VLM Logic
+These are **old planner data structures** that the VLM logic depends on and
+assumes will keep behaving the same way. They should stay intact when migrating
+the VLM logic into `vector_perception_ros`.
+
+**Core managers (shared across old + new logic)**
+- `planning_env_`: provides coverage, collision cloud, occupancy-grid access,
+ and diff/stacked clouds used by room updates and visibility checks.
+- `grid_world_`: global exploration state, cell status, and return-home logic.
+- `viewpoint_manager_`: viewpoint candidates, LOS/connectivity, and local horizon.
+- `keypose_graph_`: shortest-path computations for objects and room navigation.
+- `local_coverage_planner_`: local path TSP and waypoint ordering.
+- `visualizer_`: markers for local/global planning, paths, and debug views.
+
+**State gating and control**
+- `keypose_cloud_update_`: the main update latch that drives each cycle.
+- `robot_position_`, `robot_yaw_`, `moving_direction_`, `moving_forward_`:
+ pose and motion state used throughout VLM + navigation logic.
+- `lookahead_point_`, `lookahead_point_direction_`:
+ main input to waypoint publishing.
+- `visited_positions_`: used for coverage heuristics and candidate pruning.
+
+**Cloud buffers and derived products**
+- `keypose_cloud_`, `registered_scan_stack_`, `registered_cloud_`:
+ keypose + scan accumulation and downsampling.
+- `terrain_*` clouds: terrain height and collision data for viewpoint filtering.
+- `collision_cloud_`, `viewpoint_vis_cloud_`, `viewpoint_in_collision_cloud_`:
+ inputs to viewpoint candidate selection and visualization.
+- `grid_world_vis_cloud_`, `exploration_path_cloud_`,
+ `selected_viewpoint_vis_cloud_`, `lookahead_point_cloud_`,
+ `keypose_graph_vis_cloud_`, `pointcloud_manager_neighbor_cloud_`:
+ shared debug channels for planners and RViz.
+
+**Planning state**
+- `exploration_path_` and intermediate global/local paths.
+- `robot_viewpoint_`: robot coverage proxy for occupancy updates.
+
+These structures are **not VLM-specific**, but the new VLM logic reads from
+them heavily (coverage clouds, keypose graph distances, and viewpoint manager
+state), so they must be preserved in the new node.
+
+### 4.1 Core Data Structures and State
+**Representation layer**
+- `representation_` (new in VLM_ROS)
+ - `ViewPointRep`: viewpoint snapshots + visibility lists.
+ - `RoomNodeRep`: room labels, anchor point, polygon, room mask.
+ - `ObjectNodeRep`: object attributes + visible viewpoints.
+
+**Room grid**
+- `room_mask_`, `room_mask_old_`: `cv::Mat` integer room id maps.
+- `room_voxel_dimension_`, `room_resolution_`, `shift_`: map voxel grid.
+- `current_room_id_`, `target_room_id_`, `start_room_id_`, `end_room_id_`.
+- `prev_room_id_`, `previous_room_id_`: used for split/merge handling.
+- `adjacency_matrix`: room connectivity from door clouds.
+
+**Door / transition state**
+- `door_cloud_`, `door_cloud_final_`, `door_cloud_in_range_`.
+- `door_position_`, `door_normal_`.
+- `transit_across_room_`, `at_room_`, `near_room_1_`, `near_room_2_`.
+- `enter_wrong_room_`, `ask_vlm_change_room_`.
+
+**VLM query state**
+- `ask_vlm_near_room_`, `ask_vlm_finish_room_`, `asked_in_advance_`.
+- `room_navigation_query_counter_`, `room_finished_counter_`.
+- `candidate_room_position_`, `has_candidate_room_position_`.
+
+**Object state**
+- `found_object_`, `found_object_id_`, `found_object_distance_`, `ask_found_object_`.
+- `found_anchor_object_`, `anchor_object_`, `found_anchor_object_viewpoint_positions_`.
+- `obj_score_`, `considered_object_ids_`.
+
+**Viewpoint representation**
+- `viewpoint_reps_`, `curr_viewpoint_rep_node_ind`.
+- `add_viewpoint_rep_`, `rep_threshold_`, `rep_sensor_range`, `rep_threshold_voxel_num_`.
+- `current_obs_voxel_inds_`, `previous_obs_voxel_inds_`.
+
+**Camera + odom alignment**
+- `camera_image_`, `imageTime` (from `/camera/image`).
+- Odom stacks for interpolation: `odomTimeStack`, `lidarXStack`, etc.
+
+### 4.2 Input Callbacks (Data Ingestion)
+**StateEstimationCallback**
+- Updates robot position, yaw, motion direction.
+- Maintains odom ring buffers for time-aligned projection in `GetPoseAtTime`.
+- Updates `viewpoint_rep_msg_` header stamp/frame.
+
+**RegisteredScanCallback**
+- Aggregates scans, downsamples, updates planning environment.
+- Every N frames: pushes keypose node into graph and sets `keypose_cloud_update_`.
+
+**RoomMaskCallback**
+- Converts `/room_mask` image to `cv::Mat` (int32), resizes to room voxel grid.
+- Updates:
+ - `room_mask_`
+ - `viewpoint_manager_` room mask
+ - `grid_world_` room mask
+ - `representation_` viewpoint room ids (`UpdateViewpointRoomIdsFromMask`)
+
+**RoomNodeListCallback**
+- Ingests `/room_nodes_list` and updates representation room map.
+- Marks nodes as alive, removes dead nodes, validates anchor points against mask.
+
+**ObjectNodeCallback**
+- Updates representation object node from `/object_nodes`.
+- Maintains `latest_object_node_indices_` for visibility updates.
+
+**DoorCloudCallback**
+- Ingests `/door_cloud`, filters by collision, builds adjacency matrix.
+- Colors door point cloud for visualization.
+
+**CameraImageCallback**
+- Stores panoramic image for room crop pipeline.
+- Updates `imageTime` for projection alignment.
+
+**RoomTypeCallback**
+- Ingests `/room_type_answer`:
+ - Locates room id from anchor point and room mask.
+ - Accumulates voxel counts per label.
+ - If room label changes, marks objects in that room as unconsidered.
+
+**RoomNavigationAnswerCallback**
+- Ingests `/room_navigation_answer`.
+- `answer_type == 1`: early stop; immediately sets goal via anchor point.
+- `answer_type == 0`: room navigation answer; stores candidate room anchor.
+
+**TargetObjectInstructionCallback**
+- Resets target/anchor object context.
+- Clears considered flags on objects and resets room asked flags.
+
+**TargetObjectCallback / AnchorObjectCallback**
+- Marks object or anchor as found if VLM confirms.
+- Updates distances and stores object position + room id.
+
+### 4.3 Object Visibility Graph (Major Loop)
+**`UpdateObjectVisibility()`**
+- Input: `representation_->GetLatestObjectNodeIndicesMutable()` (new objects).
+- For each object id:
+ - Fetch object position + cloud, then convert cloud to voxel indices
+ via `Convert2Voxels(...)` (uses `planning_env_->Pos2Sub` + `InRange`).
+ - For each `ViewPointRep`:
+ - If viewpoint already references the object, mark direct visibility and
+ continue.
+ - If object has the viewpoint in its visible list, connect immediately.
+ - Else do ray-casting visibility:
+ - Compute viewpoint camera position (+0.265 z).
+ - Skip if outside `rep_sensor_range`.
+ - For each object voxel, call
+ `CheckRayVisibilityInOccupancyGrid(...)` which defers to
+ `rolling_occupancy_grid_->CheckLineOfSight(...)`.
+ - On first visible voxel:
+ - Add object index to viewpoint.
+ - Add viewpoint id to object.
+ - Mark object as not considered.
+ - Object-room relation:
+ - Map object centroid to room voxel -> room id.
+ - If room id is 0 (unknown), dilate search by 2 voxels to find a valid id.
+ - Call `representation_->SetObjectRoomRelation(...)`.
+- After processing all new objects:
+ - Compute `obj_score_` by ray-casting from current robot position to object
+ voxels, with higher weight if object has no visible viewpoints.
+ - This score drives `UpdateViewpointRep()` (adds viewpoint reps when high).
+
+**`UpdateViewpointObjectVisibility()`**
+- Runs when a new viewpoint rep is created.
+- For each object in `ObjectNodeRepMap`:
+ - Skip if already in current viewpoint visibility.
+ - Ray-cast from current viewpoint position to object voxels.
+ - On visibility: add to viewpoint + add viewpoint to object + clear considered.
+
+**Visualization**
+- `CreateVisibilityMarkers()`: publishes `MarkerArray` lines between viewpoints
+ and visible objects; color indicates direct vs inferred visibility.
+- `PublishViewpointRoomIdMarkers()`: publishes text markers `VR`.
+
+### 4.4 Viewpoint Representation Update
+**`UpdateViewpointRep()`**
+- Pulls `current_obs_voxel_inds_` from `planning_env_`.
+- Computes intersection with `previous_obs_voxel_inds_` via
+ `misc_utils_ns::SetIntersection`.
+- Adds a new viewpoint rep when (see the sketch after this list):
+ - `intersection_voxel_num < rep_threshold_voxel_num_` **and**
+ - `intersection_ratio < rep_threshold_`,
+ - or `obj_score_ > 4.0`.
+- On add:
+ - `representation_->AddViewPointRep(...)`.
+ - Set room id to `current_room_id_`.
+ - Publish `viewpoint_rep_header` only if a new rep was actually inserted.
+ - Update covered voxels and reset `latest_object_node_indices_`.
+- Publishes viewpoint rep cloud + all covered points for visualization.
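+
+A hedged Python paraphrase of that trigger (the source is C++ and uses
+`misc_utils_ns::SetIntersection` on voxel index vectors; the ratio's denominator
+here is an assumption):
+
+```python
+def should_add_viewpoint_rep(current_inds, previous_inds,
+                             rep_threshold, rep_threshold_voxel_num, obj_score):
+    """New rep when coverage overlap with the previous rep is low, or object pressure is high."""
+    intersection = len(set(current_inds) & set(previous_inds))
+    ratio = intersection / max(len(current_inds), 1)
+    return (intersection < rep_threshold_voxel_num and ratio < rep_threshold) or obj_score > 4.0
+```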
+
+### 4.5 Room and Door Transition Logic
+**`SetCurrentRoomId()`** (room transitions)
+- Uses current and previous robot voxels in `room_mask_`/`room_mask_old_`.
+- If `transit_across_room_` or `found_object_`: room id updates immediately.
+- If room id unchanged or uninitialized: update and return.
+- If room id changes:
+ - Case 1: `room_mask_old_` at new position already differs from current room
+ -> robot already knew it was a new room -> set `enter_wrong_room_ = true`.
+ - Case 2: overlap between old/new masks determines:
+  - Case 2: overlap between old/new masks determines (overlap test sketched below):
+ - split: otherwise -> set `ask_vlm_change_room_ = true` and update.
+
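+
+An illustrative numpy sketch of the overlap test (the source operates on `cv::Mat`
+room masks in C++; the direction of the ratio is an assumption, only the 80%
+threshold is taken from the description above):
+
+```python
+import numpy as np
+
+
+def old_room_overlap(room_mask_old: np.ndarray, room_mask_new: np.ndarray,
+                     old_id: int, new_id: int) -> float:
+    """Fraction of the old room's cells that now carry the new room id."""
+    old_cells = room_mask_old == old_id
+    if not old_cells.any():
+        return 0.0
+    return float(np.count_nonzero(room_mask_new[old_cells] == new_id)) / float(old_cells.sum())
+
+
+# merge if old_room_overlap(...) > 0.8, otherwise treat as a split and ask the VLM
+```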
+**`GoalPointCallback()`**
+- Resolves `target_room_id_` from room mask at goal point.
+- If goal room == current room and not transiting: reset room info.
+- Else sets `transit_across_room_ = true`, stores goal, publishes `/room_anchor_point`.
+
+**`SetStartAndEndRoomId()`**
+- Uses `misc_utils_ns::find_path_bfs` on `adjacency_matrix` to find a room path.
+- Chooses the last two rooms in the path as `start_room_id_`/`end_room_id_`.
+- Calls `SetRoomPosition(start_room_id_, end_room_id_)`.
+
+**`SetRoomPosition()`**
+- Filters `door_cloud_` for door points between `start_room_id_` and `end_room_id_`.
+- Uses door centroid as transition goal (`door_position_`).
+- Computes `door_normal_` via `GetDoorNormal(...)`:
+ - Sweeps angles, validates by checking room ids along normal and reverse.
+ - Averages valid normals for stability.
+- Publishes `/door_position` and `/door_normal` markers.
+- Publishes chosen room boundary marker.
+
+**`CheckDoorCloudInRange()`**
+- Finds door points within local planning horizon.
+- For unlabeled rooms:
+ - Computes candidate viewpoint near the door.
+ - Marks viewpoint as room candidate
+ (`viewpoint_manager_->SetViewPointRoomCandidate`).
+ - Publishes `door_cloud_in_range_`.
+
+### 4.6 Room Type Query Pipeline
+**`UpdateRoomLabel()`**
+- Pulls covered points in range from `planning_env_`.
+- For each room id:
+ - Accumulate voxel counts + per-room cloud + center estimate.
+- For **unlabeled rooms**:
+ - Update room node voxel count and mark as labeled.
+ - Build a room cloud (points + room center).
+ - Project to panorama via `project_pcl_to_image(...)`.
+ - Set anchor point and save image in `RoomNodeRep`.
+ - Publish `/room_type_query` (RoomType image + mask + anchor).
+ - If room is large enough and unvisited, may call `ChangeRoomQuery(...)`.
+- For **labeled rooms**:
+ - If voxel count increase or area growth threshold exceeded:
+ - Optionally re-project image.
+ - Publish updated `/room_type_query`.
+ - May call `ChangeRoomQuery(...)`.
+- If `ask_vlm_change_room_` set -> `ChangeRoomQuery(prev_room_id_, current_room_id_, true)`.
+
+**`project_pcl_to_image(...)`**
+- Transforms 3D points from world -> lidar -> camera frames.
+- Projects to equirectangular panorama.
+- Tracks horizontal pixel range for room points.
+- Adds special projection for room center and rotates the panorama so the center
+ is in the middle.
+- Crops the panorama to `[min_u, max_u]` with padding.
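+
+The equirectangular mapping itself is standard; a sketch in camera-frame
+coordinates (axis conventions and image size are assumptions, not the exact
+`project_pcl_to_image` math):
+
+```python
+import numpy as np
+
+
+def equirect_project(points_cam: np.ndarray, width: int, height: int) -> np.ndarray:
+    """Map camera-frame 3D points to (u, v) pixels on an equirectangular panorama."""
+    x, y, z = points_cam[:, 0], points_cam[:, 1], points_cam[:, 2]
+    r = np.maximum(np.linalg.norm(points_cam, axis=1), 1e-9)
+    yaw = np.arctan2(y, x)                        # azimuth in [-pi, pi]
+    pitch = np.arcsin(np.clip(z / r, -1.0, 1.0))  # elevation in [-pi/2, pi/2]
+    u = (0.5 - yaw / (2.0 * np.pi)) * width
+    v = (0.5 - pitch / np.pi) * height
+    return np.stack([u, v], axis=1)
+```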
+
+**`RoomTypeCallback()`**
+- Maps anchor point to room id via `room_mask_`.
+- Accumulates label votes based on `voxel_num`.
+- If label changes: clears `is_considered` on objects in the room.
+
+**`PublishRoomTypeVisualization()`**
+- Renders labeled rooms with stable color (via `idToColor`) and room id text.
+
+### 4.7 Object and Anchor VLM Queries
+**Target object flow**
+- `CheckObjectFound()`:
+ - If target found: update distance + room id and set `ask_found_object_` if
+ within thresholds.
+ - Sets path goal to found object's room anchor point.
+ - If found in current room: `SetFoundTargetObject()`.
+- If not found, iterate all objects:
+ - Must match target label, not considered, `is_asked_vlm_ == true`.
+ - Must have valid image path.
+ - Publish either:
+ - `/target_object_query` (no spatial condition), or
+ - `/target_object_spatial_query` (with viewpoint ids + bbox3d).
+ - Mark object as considered.
+
+**Anchor object flow** (dynamic mode)
+- `CheckAnchorObjectFound()`:
+ - Similar gating as target, but uses room condition and anchor label.
+ - Publishes `/anchor_object_query`.
+ - If anchor found:
+ - Collects visible viewpoint positions to build search tour.
+ - If anchor in current room: `SetFoundAnchorObject()`.
+
+### 4.8 Main `execute()` Loop (High-Level Flow)
+1. **Start gate**: waits for `/exploration_start` unless `kAutoStart`.
+2. **First init**:
+ - Send initial waypoint.
+ - Set `start_time_`, `global_direction_switch_time_`.
+ - `initialized_ = true`.
+3. **Every 1 Hz tick**:
+ - `ProcessObjectNodes()` (remove deleted objects).
+ - `CheckObjectFound()` and `CheckAnchorObjectFound()` if dynamic.
+ - Handle reset flag to clear coverage.
+4. **On `keypose_cloud_update_`**:
+ - `UpdateRoomLabel()`.
+ - `SetCurrentRoomId()`.
+ - If transiting: re-run `GoalPointCallback()` and `SetStartAndEndRoomId()`.
+ - If `at_room_`: reset room info and resume local coverage.
+ - `CountDirectionChange()`.
+ - `UpdateObjectVisibility()`.
+ - `UpdateGlobalRepresentation()` (grid world + keypose graph).
+ - `UpdateViewpointRep()` and `PublishViewpointRoomIdMarkers()`.
+ - If new viewpoint rep: `UpdateViewpointObjectVisibility()`.
+ - `CreateVisibilityMarkers()`.
+ - `UpdateViewPoints()` -> candidate count check.
+ - `CheckDoorCloudInRange()`, `UpdateKeyposeGraph()`.
+ - Coverage updates + uncovered area.
+ - Global + local TSP planning.
+ - Room finish handling:
+ - publish navigation query in advance if nearly finished.
+ - call `GetAnswer()` and mark room covered if finished.
+ - Build exploration path; compute lookahead; publish waypoint.
+ - Publish markers, runtime, room labels, object markers.
+
+### 4.9 Data Flow Summary (Text Graph)
+- `/room_mask` + `/room_nodes_list` -> `RoomNodeRep` + room id grid ->
+ room labels + anchor points -> `/room_type_query` + `/room_navigation_query`.
+- `/object_nodes` -> `ObjectNodeRep` -> visibility graph -> target/anchor queries.
+- `/camera/image` + odom stacks -> `project_pcl_to_image` -> `RoomType` image.
+- `/door_cloud` + room mask -> adjacency -> door/transition planning.
+- `rolling_occupancy_grid` -> line-of-sight -> visibility + viewpoint rep updates.
+
+## 5) Migration Feasibility and Cleanup Plan
+**Feasibility**: Yes, but the logic must be modularized to preserve real-time
+performance and avoid tight coupling between perception and exploration.
+
+### 5.1 Proposed Package Layout
+```
+vector_perception_ros/
+└── sensor_coverage_planner/
+ ├── include/sensor_coverage_planner/
+ │ ├── planner_core.h
+ │ ├── room_mask_manager.h
+ │ ├── room_query_builder.h
+ │ ├── object_visibility_manager.h
+ │ ├── room_transition_manager.h
+ │ └── representation/representation.h
+ ├── src/
+ │ ├── planner_core.cpp
+ │ ├── room_mask_manager.cpp
+ │ ├── room_query_builder.cpp
+ │ ├── object_visibility_manager.cpp
+ │ ├── room_transition_manager.cpp
+ │ ├── representation/representation.cpp
+ │ └── sensor_coverage_planner_node.cpp
+ ├── room_segmentation/
+ │ ├── room_segmentation_node.cpp
+ │ └── room_segmentation_utils.cpp
+ └── launch/
+```
+
+### 5.2 Migration Steps (Incremental)
+1. **Create new package** with same topic contracts as VLM_ROS.
+2. **Port `representation/` first**, keep interfaces stable.
+3. **Port room segmentation node** with exact topics:
+ - `/room_mask`, `/room_nodes_list`, `/door_cloud`.
+4. **Split planner into managers**:
+ - `RoomMaskManager`: room id updates + split/merge logic.
+ - `RoomQueryBuilder`: room-type query generation + image projection.
+ - `RoomTransitionManager`: door traversal, BFS, pathing.
+ - `ObjectVisibilityManager`: visibility graph + ray-casting.
+5. **Maintain message compatibility**:
+ - Use `semantic_mapping/msg/*` across planner and VLM nodes.
+6. **Integrate with semantic mapping**:
+ - `/object_nodes` and `/room_type_answer` are consumed directly (no prefix).
+7. **Validate on bag files** with topic parity checks.
+
+### 5.3 Cleanup Opportunities (Low Risk)
+- Extract `project_pcl_to_image` into a standalone utility.
+- Cache voxel indices for viewpoints and objects across frames.
+- Gate ray-casting by range + coarse visibility before full line-of-sight.
+- Centralize room-id lookup in one helper to avoid duplication.
+
+## 6) Risks and Compatibility Notes
+- **Message package mismatch**:
+  - Planner and VLM nodes now share `semantic_mapping/msg/*`; any leftover
+    `tare_planner/msg` consumers would need explicit bridging.
+- **Time alignment**:
+ - Image timestamp and odom interpolation must stay consistent.
+- **Room resolution mismatch**:
+ - All room-related components must share `room_resolution` and voxel dims.
+- **Visibility differences**:
+ - Changes in occupancy grid or ray tracing will affect VLM query triggers.
+- **Dynamic mode**:
+ - Anchor-object TSP logic must be preserved if used in deployment.
+
+## 7) Suggested Next Actions
+- Isolate `project_pcl_to_image` and add unit tests (math + crop bounds).
+- Build `RoomMaskManager` from existing code.
+- Add integration test for room-type query pipeline.
+- Verify planner/VLM topics use `semantic_mapping/msg/*` directly (no bridge).
diff --git a/semantic_mapping/launch/semantic_mapping.launch.py b/semantic_mapping/launch/semantic_mapping.launch.py
new file mode 100644
index 0000000..01bff4b
--- /dev/null
+++ b/semantic_mapping/launch/semantic_mapping.launch.py
@@ -0,0 +1,220 @@
+"""Launch semantic mapping and detect_anything with a mode selector."""
+
+import os
+
+from ament_index_python.packages import get_package_share_directory
+from launch import LaunchDescription
+from launch.actions import DeclareLaunchArgument, OpaqueFunction
+from launch.substitutions import LaunchConfiguration
+from launch_ros.actions import Node
+
+
+def _parse_bool(value: str, default: bool) -> bool:
+ if value == "":
+ return default
+ return value.lower() in ("true", "1", "yes")
+
+
+def _launch_nodes(context):
+ mode = LaunchConfiguration("mode").perform(context).lower()
+ use_compressed_arg = LaunchConfiguration("use_compressed").perform(context)
+ odom_topic_arg = LaunchConfiguration("odom_topic").perform(context)
+
+ use_sim_time = mode == "bagfile"
+ use_compressed = _parse_bool(use_compressed_arg, default=mode in ("bagfile", "sim"))
+
+ if odom_topic_arg:
+ odom_topic = odom_topic_arg
+ else:
+ odom_topic = "/aft_mapped_to_init_incremental" if mode == "bagfile" else "/state_estimation"
+
+ params_file = LaunchConfiguration("params_file").perform(context)
+ camera_param_path = LaunchConfiguration("camera_param_path").perform(context)
+ detection_topic = LaunchConfiguration("detection_topic").perform(context)
+ cloud_topic = LaunchConfiguration("cloud_topic").perform(context)
+ viewpoint_topic = LaunchConfiguration("viewpoint_topic").perform(context)
+ object_type_answer_topic = LaunchConfiguration("object_type_answer_topic").perform(context)
+ object_type_query_topic = LaunchConfiguration("object_type_query_topic").perform(context)
+ target_object_spatial_query_topic = LaunchConfiguration("target_object_spatial_query_topic").perform(
+ context
+ )
+ target_object_instruction_topic = LaunchConfiguration("target_object_instruction_topic").perform(
+ context
+ )
+ target_object_answer_topic = LaunchConfiguration("target_object_answer_topic").perform(context)
+ instruction_topic = LaunchConfiguration("instruction_topic").perform(context)
+ room_type_query_topic = LaunchConfiguration("room_type_query_topic").perform(context)
+ room_type_answer_topic = LaunchConfiguration("room_type_answer_topic").perform(context)
+ room_segmentation_params = LaunchConfiguration("room_segmentation_params").perform(context)
+ objects_yaml_path = LaunchConfiguration("objects_yaml_path").perform(context)
+ object_nodes_topic = LaunchConfiguration("object_nodes_topic").perform(context)
+ obj_points_topic = LaunchConfiguration("obj_points_topic").perform(context)
+ obj_boxes_topic = LaunchConfiguration("obj_boxes_topic").perform(context)
+ obj_labels_topic = LaunchConfiguration("obj_labels_topic").perform(context)
+ map_frame = LaunchConfiguration("map_frame").perform(context)
+ target_object = LaunchConfiguration("target_object").perform(context)
+ save_png = _parse_bool(LaunchConfiguration("save_png").perform(context), default=False)
+ output_dir = LaunchConfiguration("output_dir").perform(context)
+ image_topic = LaunchConfiguration("image_topic").perform(context)
+ compressed_image_topic = LaunchConfiguration("compressed_image_topic").perform(context)
+ conf = float(LaunchConfiguration("conf").perform(context))
+ max_area_ratio = float(LaunchConfiguration("max_area_ratio").perform(context))
+
+ detect_anything_node = Node(
+ package="detect_anything",
+ executable="detection_node",
+ name="detect_anything_detection_node",
+ output="screen",
+ parameters=[
+ {
+ "use_sim_time": use_sim_time,
+ "use_compressed": use_compressed,
+ "image_topic": image_topic,
+ "compressed_image_topic": compressed_image_topic,
+ "conf": conf,
+ "max_area_ratio": max_area_ratio,
+ }
+ ],
+ )
+
+ image_republisher_node = Node(
+ package="image_transport",
+ executable="republish",
+ name="camera_image_republisher",
+ output="screen",
+ remappings=[
+ ("in/compressed", compressed_image_topic),
+ ("out", image_topic),
+ ],
+ parameters=[
+ {
+ "use_sim_time": use_sim_time,
+ "in_transport": "compressed",
+ "out_transport": "raw",
+ }
+ ],
+ )
+
+ room_seg_params = []
+ if room_segmentation_params:
+ room_seg_params.append(room_segmentation_params)
+ room_seg_params.append({"use_sim_time": use_sim_time})
+
+ room_segmentation_node = Node(
+ package="sensor_coverage",
+ executable="room_segmentation_node",
+ name="room_segmentation_node",
+ output="screen",
+ parameters=room_seg_params,
+ )
+
+ semantic_mapping_node = Node(
+ package="semantic_mapping",
+ executable="semantic_mapping_node",
+ name="semantic_mapping_node",
+ output="screen",
+ parameters=[
+ params_file,
+ {
+ "use_sim_time": use_sim_time,
+ "camera_param_path": camera_param_path,
+ "detection_topic": detection_topic,
+ "cloud_topic": cloud_topic,
+ "odom_topic": odom_topic,
+ "viewpoint_topic": viewpoint_topic,
+ "object_type_answer_topic": object_type_answer_topic,
+ "object_type_query_topic": object_type_query_topic,
+ "target_object_spatial_query_topic": target_object_spatial_query_topic,
+ "target_object_instruction_topic": target_object_instruction_topic,
+ "object_nodes_topic": object_nodes_topic,
+ "obj_points_topic": obj_points_topic,
+ "obj_boxes_topic": obj_boxes_topic,
+ "obj_labels_topic": obj_labels_topic,
+ "map_frame": map_frame,
+ "target_object": target_object,
+ "save_png": save_png,
+ "output_dir": output_dir,
+ },
+ ],
+ )
+
+ vlm_reasoning_node = Node(
+ package="semantic_mapping",
+ executable="vlm_reasoning_node",
+ name="vlm_reasoning_node",
+ output="screen",
+ parameters=[
+ params_file,
+ {
+ "use_sim_time": use_sim_time,
+ "camera_param_path": camera_param_path,
+ "objects_yaml_path": objects_yaml_path,
+ "object_type_query_topic": object_type_query_topic,
+ "object_type_answer_topic": object_type_answer_topic,
+ "target_object_spatial_query_topic": target_object_spatial_query_topic,
+ "target_object_answer_topic": target_object_answer_topic,
+ "target_object_instruction_topic": target_object_instruction_topic,
+ "instruction_topic": instruction_topic,
+ "room_type_query_topic": room_type_query_topic,
+ "room_type_answer_topic": room_type_answer_topic,
+ }
+ ],
+ )
+
+ return [
+ detect_anything_node,
+ image_republisher_node,
+ room_segmentation_node,
+ semantic_mapping_node,
+ vlm_reasoning_node,
+ ]
+
+
+def generate_launch_description() -> LaunchDescription:
+ pkg_share = get_package_share_directory("semantic_mapping")
+ detect_anything_share = get_package_share_directory("detect_anything")
+ sensor_coverage_share = get_package_share_directory("sensor_coverage")
+ default_params = os.path.join(pkg_share, "config", "params.yaml")
+ default_camera_param = os.path.join(pkg_share, "config", "camera_param.yaml")
+ default_objects_yaml = os.path.join(detect_anything_share, "config", "objects.yaml")
+ default_room_seg_params = os.path.join(
+ sensor_coverage_share, "config", "room_segmentation_config.yaml"
+ )
+
+ return LaunchDescription(
+ [
+ DeclareLaunchArgument("mode", default_value="sim"),
+ DeclareLaunchArgument("use_compressed", default_value=""),
+ DeclareLaunchArgument("params_file", default_value=default_params),
+ DeclareLaunchArgument("camera_param_path", default_value=default_camera_param),
+ DeclareLaunchArgument("detection_topic", default_value="/detection_result"),
+ DeclareLaunchArgument("cloud_topic", default_value="/registered_scan"),
+ DeclareLaunchArgument("odom_topic", default_value=""),
+ DeclareLaunchArgument("viewpoint_topic", default_value="/viewpoint_rep_header"),
+ DeclareLaunchArgument("object_type_answer_topic", default_value="/object_type_answer"),
+ DeclareLaunchArgument("object_type_query_topic", default_value="/object_type_query"),
+ DeclareLaunchArgument(
+ "target_object_spatial_query_topic", default_value="/target_object_spatial_query"
+ ),
+ DeclareLaunchArgument("target_object_instruction_topic", default_value="/target_object_instruction"),
+ DeclareLaunchArgument("target_object_answer_topic", default_value="/target_object_answer"),
+ DeclareLaunchArgument("instruction_topic", default_value="/keyboard_input"),
+ DeclareLaunchArgument("room_type_query_topic", default_value="/room_type_query"),
+ DeclareLaunchArgument("room_type_answer_topic", default_value="/room_type_answer"),
+ DeclareLaunchArgument("room_segmentation_params", default_value=default_room_seg_params),
+ DeclareLaunchArgument("objects_yaml_path", default_value=default_objects_yaml),
+ DeclareLaunchArgument("object_nodes_topic", default_value="/object_nodes"),
+ DeclareLaunchArgument("obj_points_topic", default_value="/obj_points"),
+ DeclareLaunchArgument("obj_boxes_topic", default_value="/obj_boxes"),
+ DeclareLaunchArgument("obj_labels_topic", default_value="/obj_labels"),
+ DeclareLaunchArgument("map_frame", default_value="map"),
+ DeclareLaunchArgument("target_object", default_value="refrigerator"),
+ DeclareLaunchArgument("save_png", default_value="false"),
+ DeclareLaunchArgument("output_dir", default_value="output/object_images"),
+ DeclareLaunchArgument("image_topic", default_value="/camera/image"),
+ DeclareLaunchArgument("compressed_image_topic", default_value="/camera/image/compressed"),
+ DeclareLaunchArgument("conf", default_value="0.35"),
+ DeclareLaunchArgument("max_area_ratio", default_value="0.3"),
+ OpaqueFunction(function=_launch_nodes),
+ ]
+ )
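+
+
+# Example invocation (illustrative; substitute the file name this launch script is
+# installed under):
+#   ros2 launch semantic_mapping <this_launch_file> target_object:=chair save_png:=true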
diff --git a/semantic_mapping/msg/DetectionResult.msg b/semantic_mapping/msg/DetectionResult.msg
new file mode 100644
index 0000000..0f324ad
--- /dev/null
+++ b/semantic_mapping/msg/DetectionResult.msg
@@ -0,0 +1,9 @@
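+# Per-detection fields are parallel arrays: index i of track_id, x1, y1, x2, y2,
+# label, and confidence all describe the same detection.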
+std_msgs/Header header
+int32[] track_id
+float32[] x1
+float32[] y1
+float32[] x2
+float32[] y2
+string[] label
+float32[] confidence
+sensor_msgs/Image image
\ No newline at end of file
diff --git a/semantic_mapping/msg/NavigationQuery.msg b/semantic_mapping/msg/NavigationQuery.msg
new file mode 100644
index 0000000..5744672
--- /dev/null
+++ b/semantic_mapping/msg/NavigationQuery.msg
@@ -0,0 +1,3 @@
+string json
+sensor_msgs/Image[] images
+geometry_msgs/Point[] anchor_points
\ No newline at end of file
diff --git a/semantic_mapping/msg/ObjectNode.msg b/semantic_mapping/msg/ObjectNode.msg
new file mode 100644
index 0000000..b2d7686
--- /dev/null
+++ b/semantic_mapping/msg/ObjectNode.msg
@@ -0,0 +1,35 @@
+# Semantic object representation for planner integration.
+#
+# Each message represents a single object in the semantic map with its
+# 3D bounding box, point cloud, and metadata for VLM interaction.
+
+std_msgs/Header header
+
+# Object identity - may contain multiple merged track IDs
+int32[] object_id
+
+# Dominant class label (by weighted vote)
+string label
+
+# Object centroid in world frame
+geometry_msgs/Point position
+
+# 3D bounding box corners (8 points in world frame)
+# Order: [min_x,min_y,min_z], [max_x,min_y,min_z], [max_x,max_y,min_z], [min_x,max_y,min_z],
+# [min_x,min_y,max_z], [max_x,min_y,max_z], [max_x,max_y,max_z], [min_x,max_y,max_z]
+geometry_msgs/Point[8] bbox3d
+
+# Object point cloud in world frame
+sensor_msgs/PointCloud2 cloud
+
+# Lifecycle status: "new", "updated", "unchanged"
+string status
+
+# Path to best saved image crop of this object
+string img_path
+
+# VLM interaction flags
+bool is_asked_vlm
+
+# Associated viewpoint ID (for viewpoint-triggered saves)
+int32 viewpoint_id
diff --git a/semantic_mapping/msg/ObjectType.msg b/semantic_mapping/msg/ObjectType.msg
new file mode 100644
index 0000000..ab02ea6
--- /dev/null
+++ b/semantic_mapping/msg/ObjectType.msg
@@ -0,0 +1,17 @@
+# VLM query/answer message for object relabeling.
+#
+# Used bidirectionally:
+# - Query (mapping -> VLM): object_id, img_path, labels[] populated
+# - Answer (VLM -> mapping): final_label populated with VLM decision
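+#
+# Example (illustrative values):
+#   query:  object_id=12, img_path="output/object_images/<crop>.png", labels=["chair", "sofa"]
+#   answer: object_id=12, final_label="chair"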
+
+# Object ID to query/update
+int32 object_id
+
+# Path to object image for VLM analysis
+string img_path
+
+# VLM-determined final label (populated in answer)
+string final_label
+
+# Candidate labels from detection (populated in query)
+string[] labels
diff --git a/semantic_mapping/msg/RoomEarlyStop1.msg b/semantic_mapping/msg/RoomEarlyStop1.msg
new file mode 100644
index 0000000..473c601
--- /dev/null
+++ b/semantic_mapping/msg/RoomEarlyStop1.msg
@@ -0,0 +1,10 @@
+int32 room_id_1
+int32 room_id_2
+geometry_msgs/Point anchor_point_1
+geometry_msgs/Point anchor_point_2
+sensor_msgs/Image image_1
+sensor_msgs/Image image_2
+string room_type_1
+string room_type_2
+bool early_stop_1
+bool enter_wrong_room
\ No newline at end of file
diff --git a/semantic_mapping/msg/RoomNode.msg b/semantic_mapping/msg/RoomNode.msg
new file mode 100644
index 0000000..e406bdc
--- /dev/null
+++ b/semantic_mapping/msg/RoomNode.msg
@@ -0,0 +1,8 @@
+int32 id
+int32 show_id
+geometry_msgs/PolygonStamped polygon
+geometry_msgs/Point centroid
+int32[] neighbors
+bool is_connected
+float32 area
+sensor_msgs/Image room_mask
diff --git a/semantic_mapping/msg/RoomNodeList.msg b/semantic_mapping/msg/RoomNodeList.msg
new file mode 100644
index 0000000..8a9a713
--- /dev/null
+++ b/semantic_mapping/msg/RoomNodeList.msg
@@ -0,0 +1,2 @@
+std_msgs/Header header
+semantic_mapping/RoomNode[] nodes
diff --git a/semantic_mapping/msg/RoomType.msg b/semantic_mapping/msg/RoomType.msg
new file mode 100644
index 0000000..aa6ff3c
--- /dev/null
+++ b/semantic_mapping/msg/RoomType.msg
@@ -0,0 +1,8 @@
+std_msgs/Header header
+geometry_msgs/Point anchor_point
+sensor_msgs/Image image
+sensor_msgs/Image room_mask
+string room_type
+int32 room_id
+int32 voxel_num
+bool in_room
diff --git a/semantic_mapping/msg/TargetObject.msg b/semantic_mapping/msg/TargetObject.msg
new file mode 100644
index 0000000..55c60e4
--- /dev/null
+++ b/semantic_mapping/msg/TargetObject.msg
@@ -0,0 +1,6 @@
+std_msgs/Header header
+int32 object_id
+string object_label
+string img_path
+string room_label
+bool is_target
\ No newline at end of file
diff --git a/semantic_mapping/msg/TargetObjectInstruction.msg b/semantic_mapping/msg/TargetObjectInstruction.msg
new file mode 100644
index 0000000..1a88005
--- /dev/null
+++ b/semantic_mapping/msg/TargetObjectInstruction.msg
@@ -0,0 +1,22 @@
+# User instruction for target object search.
+#
+# Specifies the target object to find and optional spatial/attribute
+# conditions relative to an anchor object.
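+#
+# Example: "find the red cup next to the microwave in the kitchen" would map to
+#   target_object="cup", attribute_condition="red", spatial_condition="next to",
+#   anchor_object="microwave", room_condition="in the kitchen".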
+
+# Primary target object label to search for
+string target_object
+
+# Room condition (e.g., "in the kitchen")
+string room_condition
+
+# Spatial condition relative to anchor (e.g., "next to", "on top of")
+string spatial_condition
+
+# Attribute condition for target (e.g., "red", "large")
+string attribute_condition
+
+# Anchor object for spatial reference
+string anchor_object
+
+# Attribute condition for anchor object
+string attribute_condition_anchor
diff --git a/semantic_mapping/msg/TargetObjectWithSpatial.msg b/semantic_mapping/msg/TargetObjectWithSpatial.msg
new file mode 100644
index 0000000..cc0c8d7
--- /dev/null
+++ b/semantic_mapping/msg/TargetObjectWithSpatial.msg
@@ -0,0 +1,8 @@
+std_msgs/Header header
+int32 object_id
+string object_label
+string img_path
+int32[] viewpoint_ids
+string room_label
+bool is_target
+geometry_msgs/Point[8] bbox3d
diff --git a/semantic_mapping/msg/ViewpointRep.msg b/semantic_mapping/msg/ViewpointRep.msg
new file mode 100644
index 0000000..5bb1bfe
--- /dev/null
+++ b/semantic_mapping/msg/ViewpointRep.msg
@@ -0,0 +1,8 @@
+# Viewpoint trigger message for image saving.
+#
+# Published by planner to request saving images at specific viewpoints.
+
+std_msgs/Header header
+
+# Unique viewpoint identifier
+int32 viewpoint_id
diff --git a/semantic_mapping/msg/VlmAnswer.msg b/semantic_mapping/msg/VlmAnswer.msg
new file mode 100644
index 0000000..8ec7a30
--- /dev/null
+++ b/semantic_mapping/msg/VlmAnswer.msg
@@ -0,0 +1,4 @@
+int32 room_id
+geometry_msgs/Point anchor_point
+# 0 for vlm navigation query, 1 for early stop 1
+int32 answer_type
\ No newline at end of file
diff --git a/semantic_mapping/package.xml b/semantic_mapping/package.xml
new file mode 100644
index 0000000..4804d1a
--- /dev/null
+++ b/semantic_mapping/package.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0"?>
+<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
+<package format="3">
+  <name>semantic_mapping</name>
+  <version>0.0.1</version>
+  <description>Semantic 3D object mapping node with VLM integration.</description>
+  <maintainer>Alex Lin</maintainer>
+  <license>MIT</license>
+
+  <buildtool_depend>ament_cmake</buildtool_depend>
+
+  <buildtool_depend>rosidl_default_generators</buildtool_depend>
+  <exec_depend>rosidl_default_runtime</exec_depend>
+  <member_of_group>rosidl_interface_packages</member_of_group>
+
+  <depend>rclpy</depend>
+  <depend>std_msgs</depend>
+  <depend>sensor_msgs</depend>
+  <depend>geometry_msgs</depend>
+  <depend>nav_msgs</depend>
+  <depend>visualization_msgs</depend>
+  <depend>cv_bridge</depend>
+  <depend>detect_anything</depend>
+  <depend>vector_perception_utils</depend>
+
+  <export>
+    <build_type>ament_cmake</build_type>
+  </export>
+</package>
diff --git a/semantic_mapping/resource/semantic_mapping b/semantic_mapping/resource/semantic_mapping
new file mode 100644
index 0000000..e69de29
diff --git a/semantic_mapping/scripts/semantic_mapping_node b/semantic_mapping/scripts/semantic_mapping_node
new file mode 100755
index 0000000..0312e88
--- /dev/null
+++ b/semantic_mapping/scripts/semantic_mapping_node
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+"""Entry point for semantic_mapping_node."""
+
+from semantic_mapping.mapping_node import main
+
+if __name__ == "__main__":
+ main()
diff --git a/semantic_mapping/scripts/vlm_reasoning_node b/semantic_mapping/scripts/vlm_reasoning_node
new file mode 100755
index 0000000..645540f
--- /dev/null
+++ b/semantic_mapping/scripts/vlm_reasoning_node
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+"""Entry point for vlm_reasoning_node."""
+
+from semantic_mapping.vlm_reasoning_node import main
+
+if __name__ == "__main__":
+ main()
diff --git a/semantic_mapping/semantic_mapping/__init__.py b/semantic_mapping/semantic_mapping/__init__.py
new file mode 100644
index 0000000..8df6f20
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/__init__.py
@@ -0,0 +1 @@
+"""Semantic 3D object mapping for vector_perception_ros."""
diff --git a/semantic_mapping/semantic_mapping/map_object.py b/semantic_mapping/semantic_mapping/map_object.py
new file mode 100644
index 0000000..da143a9
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/map_object.py
@@ -0,0 +1,713 @@
+"""Data models for semantic mapping.
+
+This module defines the core data structures for the semantic mapping system:
+- VoxelFeatureManager: Manages voxelized point clouds with observation angle voting
+- MapObject: Represents a tracked object in the semantic map
+- Observation: A single detection observation
+- ObjectStatus: Enum for object lifecycle status
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Optional
+
+import numpy as np
+from scipy.spatial import cKDTree
+
+from vector_perception_utils.transform_utils import discretize_angles, normalize_angle
+from vector_perception_utils.pointcloud_utils import (
+ compute_centroid,
+ create_pointcloud,
+)
+
+
+class ObjectStatus(Enum):
+ """Object lifecycle status (ObjectDB-style tiering)."""
+
+ PENDING = "pending" # < min_detections threshold
+ PERMANENT = "permanent" # >= threshold, confirmed object
+ DELETED = "deleted" # Marked for removal
+
+
+@dataclass
+class Observation:
+ """A single detection observation to update a MapObject.
+
+ Attributes:
+ track_id: Tracker ID from detection.
+ label: Class label string.
+ confidence: Detection confidence score.
+ points: World-frame point cloud (N, 3).
+ robot_R: Robot rotation matrix (3, 3) at observation time.
+ robot_t: Robot translation vector (3,) at observation time.
+ mask: Optional 2D segmentation mask.
+ image: Optional RGB image crop for best-image selection.
+ timestamp: Observation timestamp in seconds.
+ """
+
+ track_id: int
+ label: str
+ confidence: float
+ points: np.ndarray
+ robot_R: np.ndarray
+ robot_t: np.ndarray
+ mask: Optional[np.ndarray] = None
+ image: Optional[np.ndarray] = None
+ timestamp: float = 0.0
+
+
+class VoxelFeatureManager:
+ """Manages voxelized point cloud with observation angle voting.
+
+ This class tracks 3D voxels representing an object, along with:
+ - Vote counts per voxel (how many times each voxel was observed)
+ - Observation angles (which directions each voxel was seen from)
+ - Regularization mask (which voxels pass shape filtering)
+
+ The observation angle voting helps distinguish between reliable points
+ (seen from multiple angles) and spurious points (seen from only one angle).
+
+ Attributes:
+ voxels: Point coordinates (N, 3) in world frame.
+ votes: Detection count per voxel (N,).
+ observation_angles: Binary angle histogram per voxel (N, num_bins).
+ regularized_mask: Boolean mask for valid voxels after clustering (N,).
+ voxel_size: Size of voxel grid cells in meters.
+ num_angle_bins: Number of discretized angle bins.
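+
+    Example (illustrative sketch; exact bin assignment depends on
+    ``discretize_angles`` from vector_perception_utils)::
+
+        # points: (N, 3) world-frame array from a first detection
+        mgr = VoxelFeatureManager(points, voxel_size=0.03,
+                                  robot_R=np.eye(3), robot_t=np.zeros(3))
+        # A second view of the same region merges nearby points, increments
+        # their votes, and records the new observation angle.
+        mgr.update(points, robot_R=np.eye(3), robot_t=np.array([1.0, 0.0, 0.0]))
+        stable = mgr.get_valid_voxels(diversity_percentile=0.3, use_regularized=False)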
+ """
+
+ def __init__(
+ self,
+ points: np.ndarray,
+ voxel_size: float,
+ robot_R: np.ndarray,
+ robot_t: np.ndarray,
+ num_angle_bins: int = 15,
+ ) -> None:
+ """Initialize VoxelFeatureManager with initial points.
+
+ Args:
+ points: Initial point cloud (N, 3) in world frame.
+ voxel_size: Voxel grid cell size in meters.
+ robot_R: Robot rotation matrix (3, 3) at observation time.
+ robot_t: Robot translation vector (3,) at observation time.
+ num_angle_bins: Number of angle bins for observation tracking.
+ """
+ num_points = points.shape[0]
+
+ # Compute observation angles from robot position
+ vectors = points - robot_t
+ obs_angles = np.arctan2(vectors[:, 1], vectors[:, 0])
+ obs_bins = discretize_angles(obs_angles, num_angle_bins)
+
+ self.voxels = points.copy()
+ self.voxel_size = voxel_size
+ self.num_angle_bins = num_angle_bins
+ self._tree = cKDTree(points)
+
+ # Initialize vote counts
+ self.votes = np.ones(num_points, dtype=np.float32)
+
+ # Initialize observation angle histogram (binary: seen/not seen from each angle)
+ self.observation_angles = np.zeros((num_points, num_angle_bins), dtype=np.uint8)
+ self.observation_angles[np.arange(num_points), obs_bins] = 1
+
+ # Regularization mask (all False until regularize_shape is called)
+ self.regularized_mask = np.zeros(num_points, dtype=bool)
+
+        # Removal votes used when clearing dynamic objects (not commonly used)
+ self._remove_votes = np.zeros(num_points, dtype=np.int32)
+
+ def update(
+ self,
+ new_points: np.ndarray,
+ robot_R: np.ndarray,
+ robot_t: np.ndarray,
+ ) -> None:
+ """Add new points with KDTree-based merge and angle voting.
+
+ Points within voxel_size of existing points are merged (votes incremented).
+ Points farther away are added as new voxels.
+
+ Args:
+ new_points: New point cloud (M, 3) in world frame.
+ robot_R: Robot rotation matrix (3, 3) at observation time.
+ robot_t: Robot translation vector (3,) at observation time.
+ """
+ if new_points.shape[0] == 0:
+ return
+
+ # Compute observation angles for new points
+ vectors = new_points - robot_t
+ obs_angles = np.arctan2(vectors[:, 1], vectors[:, 0])
+ obs_bins = discretize_angles(obs_angles, self.num_angle_bins)
+
+ # Find nearest existing voxels
+ distances, indices = self._tree.query(new_points)
+
+ # Merge points within voxel_size
+ merge_mask = distances < self.voxel_size
+ merge_indices = indices[merge_mask]
+ merge_bins = obs_bins[merge_mask]
+
+ self.votes[merge_indices] += 1
+ self.observation_angles[merge_indices, merge_bins] = 1
+
+ # Add new points that don't merge
+ new_mask = ~merge_mask
+ if np.any(new_mask):
+ new_voxels = new_points[new_mask]
+ new_bins = obs_bins[new_mask]
+ num_new = new_voxels.shape[0]
+
+ # Extend arrays
+ self.voxels = np.vstack([self.voxels, new_voxels])
+ self.votes = np.concatenate([self.votes, np.ones(num_new, dtype=np.float32)])
+
+ new_obs = np.zeros((num_new, self.num_angle_bins), dtype=np.uint8)
+ new_obs[np.arange(num_new), new_bins] = 1
+ self.observation_angles = np.vstack([self.observation_angles, new_obs])
+
+ self.regularized_mask = np.concatenate(
+ [self.regularized_mask, np.zeros(num_new, dtype=bool)]
+ )
+ self._remove_votes = np.concatenate(
+ [self._remove_votes, np.zeros(num_new, dtype=np.int32)]
+ )
+
+ # Rebuild KDTree
+ self._tree = cKDTree(self.voxels)
+
+ def update_observation_angles(self, voxel_indices: np.ndarray, obs_bins: np.ndarray) -> None:
+ """Update observation angle histogram for selected voxels."""
+ if voxel_indices.size == 0:
+ return
+ self.observation_angles[voxel_indices, obs_bins] = 1
+
+ def merge_from(self, other: VoxelFeatureManager) -> None:
+ """Merge voxels from another VoxelFeatureManager.
+
+ Args:
+ other: Another VoxelFeatureManager to merge from.
+ """
+ if other.voxels.shape[0] == 0:
+ return
+
+ distances, indices = self._tree.query(other.voxels)
+
+ # Merge overlapping voxels
+ merge_mask = distances < self.voxel_size
+ merge_indices = indices[merge_mask]
+
+ self.votes[merge_indices] += other.votes[merge_mask]
+ self.observation_angles[merge_indices] = np.logical_or(
+ self.observation_angles[merge_indices],
+ other.observation_angles[merge_mask],
+ ).astype(np.uint8)
+
+ # Add non-overlapping voxels
+ new_mask = ~merge_mask
+ if np.any(new_mask):
+ self.voxels = np.vstack([self.voxels, other.voxels[new_mask]])
+ self.votes = np.concatenate([self.votes, other.votes[new_mask]])
+ self.observation_angles = np.vstack(
+ [self.observation_angles, other.observation_angles[new_mask]]
+ )
+ self.regularized_mask = np.concatenate(
+ [self.regularized_mask, other.regularized_mask[new_mask]]
+ )
+ self._remove_votes = np.concatenate(
+ [self._remove_votes, np.zeros(np.sum(new_mask), dtype=np.int32)]
+ )
+ self._tree = cKDTree(self.voxels)
+
+ def get_valid_indices(
+ self,
+ diversity_percentile: float = 0.3,
+ use_regularized: bool = True,
+ ) -> np.ndarray:
+ """Get indices of valid voxels based on observation angle diversity.
+
+ Voxels are ranked by their observation angle diversity (number of angles
+ from which they were observed), weighted by vote count. The top percentile
+ of voxels are returned.
+
+ Args:
+ diversity_percentile: Fraction of total weight to include (0-1).
+ use_regularized: If True, only consider voxels in regularized_mask.
+
+ Returns:
+ Indices of valid voxels.
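+
+        Example (illustrative): with per-voxel diversities [1, 1, 3] and unit
+        votes, diversity_percentile=0.6 gives a weighted total of 5 and a cutoff
+        of (1 - 0.6) * 5 = 2, so the last two voxels in the ascending sort
+        (covering 4/5 of the total weight) are returned.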
+ """
+ if use_regularized:
+ mask = self.regularized_mask
+ if not np.any(mask):
+ return np.array([], dtype=int)
+ obs_angles = self.observation_angles[mask]
+ votes = self.votes[mask]
+ else:
+ obs_angles = self.observation_angles
+ votes = self.votes
+
+ if len(obs_angles) == 0:
+ return np.array([], dtype=int)
+
+ # Compute diversity score (number of angles observed)
+ angle_diversity = np.sum(obs_angles, axis=1)
+
+ # Sort by diversity (ascending)
+ sorted_indices = np.argsort(angle_diversity)
+ sorted_diversity = angle_diversity[sorted_indices]
+ sorted_weights = votes[sorted_indices]
+
+ # Weight by diversity * votes
+ weighted = sorted_diversity * sorted_weights
+ total_weight = np.sum(weighted)
+
+ if total_weight == 0:
+ return np.array([], dtype=int)
+
+ # Find cutoff index for percentile
+ target_weight = (1 - diversity_percentile) * total_weight
+ cumsum = np.cumsum(weighted)
+ cutoff_idx = np.searchsorted(cumsum, target_weight)
+
+ # Return indices above cutoff (higher diversity)
+ return sorted_indices[cutoff_idx:]
+
+ def get_valid_voxels(
+ self,
+ diversity_percentile: float = 0.3,
+ use_regularized: bool = True,
+ ) -> np.ndarray:
+ """Get valid voxel coordinates.
+
+ Args:
+ diversity_percentile: Fraction of total weight to include (0-1).
+ use_regularized: If True, only consider voxels in regularized_mask.
+
+ Returns:
+ Valid voxel coordinates (M, 3).
+ """
+ indices = self.get_valid_indices(diversity_percentile, use_regularized)
+ if use_regularized and np.any(self.regularized_mask):
+ voxels = self.voxels[self.regularized_mask]
+ else:
+ voxels = self.voxels
+
+ if len(indices) == 0:
+ return np.empty((0, 3), dtype=np.float32)
+ return voxels[indices]
+
+ @property
+ def num_voxels(self) -> int:
+ """Total number of voxels."""
+ return self.voxels.shape[0]
+
+ @property
+ def num_regularized(self) -> int:
+ """Number of voxels passing regularization."""
+ return int(np.sum(self.regularized_mask))
+
+
+@dataclass
+class MapObject:
+ """Represents a tracked object in the semantic map.
+
+ This class combines patterns from VLM_ROS SingleObject and dimos Object,
+ with ObjectDB-style lifecycle management.
+
+ Attributes:
+ object_id: Unique identifier for this object (UUID string).
+ track_ids: List of associated tracker IDs (can merge multiple).
+ voxel_manager: Manages the object's voxelized point cloud.
+ class_votes: Label -> observation count mapping.
+ confidence_scores: Label -> best confidence score mapping.
+ points_count: Label -> total points count mapping.
+ robot_poses: List of robot poses when object was observed.
+ detection_count: Total number of detections.
+ inactive_frames: Frames since last update.
+ last_update_time: Timestamp of last update.
+ status: Object lifecycle status.
+ is_asked_vlm: Whether VLM has been queried for this object.
+ updated_by_vlm: Whether object was relabeled by VLM.
+ best_image_path: Path to best saved image crop.
+ best_image_mask_area: Mask area of the best saved image crop.
+ centroid: Cached centroid position (None if needs recompute).
+ bbox3d_corners: Cached 3D bbox corners (None if needs recompute).
+ """
+
+ object_id: str
+ track_ids: list[int]
+ voxel_manager: VoxelFeatureManager
+
+ # Classification (voted)
+ class_votes: dict[str, int] = field(default_factory=dict)
+ confidence_scores: dict[str, float] = field(default_factory=dict)
+ points_count: dict[str, int] = field(default_factory=dict)
+
+ # Robot poses at observation times
+ robot_poses: list[dict] = field(default_factory=list)
+
+ # Lifecycle (ObjectDB-style)
+ detection_count: int = 1
+ inactive_frames: int = 0
+ last_update_time: float = 0.0
+ status: ObjectStatus = ObjectStatus.PENDING
+
+ # VLM interaction
+ is_asked_vlm: bool = False
+ updated_by_vlm: bool = False
+
+ # Image storage
+ best_image_path: Optional[str] = None
+ best_image_mask_area: float = 0.0
+
+ # Cached geometry (None means needs recompute)
+ _centroid: Optional[np.ndarray] = field(default=None, repr=False)
+ _bbox3d_corners: Optional[np.ndarray] = field(default=None, repr=False)
+ _needs_recompute: bool = field(default=True, repr=False)
+
+ # Configuration
+ _diversity_percentile: float = field(default=0.85, repr=False)
+ _angle_threshold: float = field(default=np.deg2rad(5), repr=False)
+ _distance_threshold: float = field(default=0.3, repr=False)
+ _needs_regularization: bool = field(default=True, repr=False)
+ _regularize_voxel_size: float = field(default=0.06, repr=False)
+
+ @classmethod
+ def from_observation(
+ cls,
+ object_id: str,
+ obs: Observation,
+ voxel_size: float = 0.03,
+ regularize_voxel_size: float = 0.06,
+ num_angle_bins: int = 15,
+ ) -> MapObject:
+ """Create a new MapObject from an initial observation.
+
+ Args:
+ object_id: Unique identifier for this object.
+ obs: Initial observation.
+ voxel_size: Voxel grid cell size in meters.
+ regularize_voxel_size: Voxel size for regularization clustering.
+ num_angle_bins: Number of angle bins for observation tracking.
+
+ Returns:
+ New MapObject instance.
+ """
+ voxel_manager = VoxelFeatureManager(
+ points=obs.points,
+ voxel_size=voxel_size,
+ robot_R=obs.robot_R,
+ robot_t=obs.robot_t,
+ num_angle_bins=num_angle_bins,
+ )
+
+ obj = cls(
+ object_id=object_id,
+ track_ids=[obs.track_id],
+ voxel_manager=voxel_manager,
+ class_votes={obs.label: 1},
+ confidence_scores={obs.label: obs.confidence},
+ points_count={obs.label: len(obs.points)},
+ robot_poses=[{"R": obs.robot_R.copy(), "t": obs.robot_t.copy()}],
+ last_update_time=obs.timestamp,
+ _regularize_voxel_size=regularize_voxel_size,
+ )
+ return obj
+
+ def _invalidate_cache(self) -> None:
+ """Mark cached geometry as needing recompute."""
+ self._needs_recompute = True
+ self._centroid = None
+ self._bbox3d_corners = None
+
+ def _is_similar_pose(self, new_R: np.ndarray, new_t: np.ndarray) -> bool:
+ """Check if a new pose is similar to existing poses.
+
+ Used to avoid redundant updates from the same viewpoint.
+ Compares observation angle and distance to object center.
+
+ Args:
+            new_R: New robot rotation matrix (3, 3). Not used, but kept for API consistency.
+ new_t: New robot translation vector (3,).
+
+ Returns:
+ True if pose is similar to an existing pose.
+ """
+ _ = new_R # Unused but kept for API consistency
+ obj_center = self.centroid
+ if obj_center is None:
+ obj_center = compute_centroid(self.voxel_manager.voxels)
+
+ for pose in self.robot_poses:
+ old_t = pose["t"] # R not used for similarity check
+
+ old_to_obj = obj_center - old_t
+ new_to_obj = obj_center - new_t
+
+ old_angle = np.arctan2(old_to_obj[1], old_to_obj[0])
+ new_angle = np.arctan2(new_to_obj[1], new_to_obj[0])
+ angle_diff = abs(normalize_angle(np.array([new_angle - old_angle]))[0])
+
+ old_dist = np.linalg.norm(old_to_obj)
+ new_dist = np.linalg.norm(new_to_obj)
+ dist_diff = abs(new_dist - old_dist)
+
+ if angle_diff < self._angle_threshold and dist_diff < self._distance_threshold:
+ return True
+
+ return False
+
+ def update(self, obs: Observation) -> None:
+ """Update object with a new observation.
+
+ Args:
+ obs: New observation to incorporate.
+ """
+ # Update voxel manager
+ self.voxel_manager.update(obs.points, obs.robot_R, obs.robot_t)
+ self.detection_count += 1
+ self.inactive_frames = 0
+ self.last_update_time = obs.timestamp
+ self._needs_regularization = True
+
+ # Check if pose is new (not similar to existing)
+ is_similar = self._is_similar_pose(obs.robot_R, obs.robot_t)
+
+ if not is_similar:
+ # Update class statistics
+ self.class_votes[obs.label] = self.class_votes.get(obs.label, 0) + 1
+ self.confidence_scores[obs.label] = max(
+ self.confidence_scores.get(obs.label, 0), obs.confidence
+ )
+ self.points_count[obs.label] = (
+ self.points_count.get(obs.label, 0) + len(obs.points)
+ )
+
+ # Add pose
+ self.robot_poses.append({"R": obs.robot_R.copy(), "t": obs.robot_t.copy()})
+
+ # Add track ID if new
+ if obs.track_id not in self.track_ids:
+ self.track_ids.append(obs.track_id)
+
+ self._invalidate_cache()
+
+ def merge(self, other: MapObject) -> None:
+ """Merge another MapObject into this one.
+
+ Args:
+ other: Object to merge from.
+ """
+ # Merge track IDs
+ for tid in other.track_ids:
+ if tid not in self.track_ids:
+ self.track_ids.append(tid)
+
+ # Merge voxel managers
+ self.voxel_manager.merge_from(other.voxel_manager)
+ self._needs_regularization = True
+
+ # Merge class statistics
+ for label, count in other.class_votes.items():
+ self.class_votes[label] = self.class_votes.get(label, 0) + count
+ self.confidence_scores[label] = max(
+ self.confidence_scores.get(label, 0),
+ other.confidence_scores.get(label, 0),
+ )
+ self.points_count[label] = (
+ self.points_count.get(label, 0) + other.points_count.get(label, 0)
+ )
+
+ # Merge poses (only unique ones)
+ for pose in other.robot_poses:
+ if not self._is_similar_pose(pose["R"], pose["t"]):
+ self.robot_poses.append(pose)
+
+ # Update lifecycle
+ self.detection_count += other.detection_count
+ self.last_update_time = max(self.last_update_time, other.last_update_time)
+ self.is_asked_vlm = self.is_asked_vlm or other.is_asked_vlm
+
+ self._invalidate_cache()
+
+ @property
+ def label(self) -> str:
+ """Get dominant class label by weighted vote.
+
+ Returns:
+ Most likely class label based on points_count * confidence.
+ """
+ if not self.class_votes:
+ return "unknown"
+
+ # Compute weighted scores
+ weighted_scores = {}
+ for lbl in self.class_votes:
+ pts = self.points_count.get(lbl, 0)
+ conf = self.confidence_scores.get(lbl, 0)
+ weighted_scores[lbl] = pts * conf
+
+ return max(weighted_scores, key=weighted_scores.get)
+
+ @property
+ def confidence(self) -> float:
+ """Get confidence of the dominant label."""
+ return self.confidence_scores.get(self.label, 0.0)
+
+ @property
+ def centroid(self) -> Optional[np.ndarray]:
+ """Compute centroid weighted by observation angles.
+
+ Returns:
+ Centroid position (3,) or None if no valid voxels.
+ """
+ if self._needs_recompute or self._centroid is None:
+ self._compute_geometry()
+ return self._centroid
+
+ @property
+ def bbox3d_corners(self) -> Optional[np.ndarray]:
+ """Compute axis-aligned 3D bounding box corners.
+
+ Returns:
+ 8 corner points (8, 3) or None if no valid voxels.
+ """
+ if self._needs_recompute or self._bbox3d_corners is None:
+ self._compute_geometry()
+ return self._bbox3d_corners
+
+ def _compute_geometry(self) -> None:
+ """Compute cached centroid and bbox from valid voxels using Open3D.
+
+ Returns:
+ None. Updates cached centroid and bbox in-place.
+ """
+ # Get valid voxels (regularized first, then fallback to all)
+ valid_voxels = self.voxel_manager.get_valid_voxels(
+ diversity_percentile=self._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(valid_voxels) == 0:
+ valid_voxels = self.voxel_manager.get_valid_voxels(
+ diversity_percentile=self._diversity_percentile,
+ use_regularized=False,
+ )
+
+ if len(valid_voxels) < 4:
+ self._centroid = compute_centroid(valid_voxels) if len(valid_voxels) > 0 else None
+ self._bbox3d_corners = None
+ else:
+ pcd = create_pointcloud(valid_voxels)
+ try:
+ obb = pcd.get_minimal_oriented_bounding_box()
+ except RuntimeError:
+ # Points are coplanar / degenerate — fall back to axis-aligned bbox.
+ obb = pcd.get_axis_aligned_bounding_box()
+ self._centroid = np.asarray(obb.get_center())
+ self._bbox3d_corners = np.asarray(obb.get_box_points())
+ self._needs_recompute = False
+ return
+ self._centroid = np.asarray(obb.center)
+ half_extent = 0.5 * np.asarray(obb.extent)
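+            # Corners in the OBB's local frame, ordered to match the bbox3d layout
+            # documented in ObjectNode.msg (bottom face, then top face); rotated and
+            # translated into the world frame below.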
+ local_corners = np.array(
+ [
+ [-half_extent[0], -half_extent[1], -half_extent[2]],
+ [half_extent[0], -half_extent[1], -half_extent[2]],
+ [half_extent[0], half_extent[1], -half_extent[2]],
+ [-half_extent[0], half_extent[1], -half_extent[2]],
+ [-half_extent[0], -half_extent[1], half_extent[2]],
+ [half_extent[0], -half_extent[1], half_extent[2]],
+ [half_extent[0], half_extent[1], half_extent[2]],
+ [-half_extent[0], half_extent[1], half_extent[2]],
+ ],
+ dtype=float,
+ )
+ self._bbox3d_corners = local_corners @ np.asarray(obb.R).T + np.asarray(obb.center)
+
+ self._needs_recompute = False
+
+ def regularize_shape(self, percentile: float = 0.85) -> None:
+ """Apply vote-weighted regularization with light outlier removal."""
+ if not self._needs_regularization:
+ return
+ voxels = self.voxel_manager.voxels
+ if len(voxels) < 3:
+ self.voxel_manager.regularized_mask = np.ones(len(voxels), dtype=bool)
+ self._needs_regularization = False
+ self._invalidate_cache()
+ return
+
+ weights = np.sum(self.voxel_manager.observation_angles, axis=1) * self.voxel_manager.votes
+ keep_ratio = percentile if percentile is not None else 1.0
+ if keep_ratio >= 1.0:
+ keep_mask = np.ones(voxels.shape[0], dtype=bool)
+ else:
+ cutoff = float(np.quantile(weights, 1.0 - keep_ratio))
+ keep_mask = weights >= cutoff
+
+ if not np.any(keep_mask):
+ keep_mask = np.ones(voxels.shape[0], dtype=bool)
+
+ selected_indices = np.nonzero(keep_mask)[0]
+ points_sel = voxels[selected_indices]
+ weights_sel = weights[selected_indices]
+ if points_sel.shape[0] >= 3:
+ voxel_size = self._regularize_voxel_size
+ keys = np.floor(points_sel / voxel_size).astype(np.int32)
+ voxel_map: dict[tuple[int, int, int], list[int]] = {}
+ for idx, key in enumerate(map(tuple, keys)):
+ voxel_map.setdefault(key, []).append(idx)
+
+ neighbor_offsets = [
+ (dx, dy, dz)
+ for dx in (-1, 0, 1)
+ for dy in (-1, 0, 1)
+ for dz in (-1, 0, 1)
+ if not (dx == 0 and dy == 0 and dz == 0)
+ ]
+ visited: set[tuple[int, int, int]] = set()
+ components: list[tuple[np.ndarray, float, np.ndarray]] = []
+ for key in voxel_map:
+ if key in visited:
+ continue
+ stack = [key]
+ visited.add(key)
+ comp_indices: list[int] = []
+ while stack:
+ current = stack.pop()
+ comp_indices.extend(voxel_map[current])
+ for dx, dy, dz in neighbor_offsets:
+ neighbor = (current[0] + dx, current[1] + dy, current[2] + dz)
+ if neighbor in voxel_map and neighbor not in visited:
+ visited.add(neighbor)
+ stack.append(neighbor)
+
+ comp_idx = np.array(comp_indices, dtype=int)
+ comp_weight = float(np.sum(weights_sel[comp_idx]))
+ comp_centroid = compute_centroid(points_sel[comp_idx])
+ components.append((comp_idx, comp_weight, comp_centroid))
+
+ if components:
+ main_idx = int(np.argmax([c[1] for c in components]))
+ main_points = points_sel[components[main_idx][0]]
+ main_centroid = components[main_idx][2]
+ extent = np.max(main_points, axis=0) - np.min(main_points, axis=0)
+ dist_thresh = max(0.3, 0.5 * float(np.max(extent)))
+
+ keep_sel_mask = np.zeros(points_sel.shape[0], dtype=bool)
+ for comp_idx, _, comp_centroid in components:
+ if np.linalg.norm(comp_centroid - main_centroid) <= dist_thresh:
+ keep_sel_mask[comp_idx] = True
+ if np.any(keep_sel_mask):
+ keep_mask = np.zeros(voxels.shape[0], dtype=bool)
+ keep_mask[selected_indices[keep_sel_mask]] = True
+
+ self.voxel_manager.regularized_mask = keep_mask
+
+ self._needs_regularization = False
+ self._invalidate_cache()
diff --git a/semantic_mapping/semantic_mapping/mapper.py b/semantic_mapping/semantic_mapping/mapper.py
new file mode 100644
index 0000000..cc67cf1
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/mapper.py
@@ -0,0 +1,275 @@
+"""Object map manager for semantic mapping."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+import threading
+import time
+import uuid
+
+from .utils.matching import compute_bbox, compute_iou_3d_with_ratios, find_best_match
+from .map_object import MapObject, ObjectStatus, Observation
+
+
+@dataclass
+class UpdateStats:
+ """Summary of updates applied in a mapper step."""
+
+ created: int = 0
+ updated: int = 0
+ merged: int = 0
+ deleted: int = 0
+ promoted: int = 0
+ total: int = 0
+
+
+class ObjectMapper:
+ """Manages the semantic object map with two-tier state management."""
+
+ def __init__(
+ self,
+ voxel_size: float = 0.03,
+ regularize_voxel_size: float = 0.06,
+ confidence_threshold: float = 0.30,
+ distance_threshold: float = 0.5,
+ iou_threshold: float = 0.2,
+ min_detections_for_permanent: int = 3,
+ pending_ttl_s: float = 10.0,
+ num_angle_bins: int = 15,
+ ) -> None:
+ """Initialize the object mapper and its thresholds.
+
+ Args:
+ voxel_size: Voxel grid cell size in meters.
+ regularize_voxel_size: Voxel size for regularization clustering.
+ confidence_threshold: Minimum detection confidence to accept.
+ distance_threshold: Max centroid distance for spatial matching.
+ iou_threshold: IoU threshold for merge checks.
+ min_detections_for_permanent: Detections before promotion.
+ pending_ttl_s: Time-to-live for pending objects.
+ num_angle_bins: Observation angle bins for voxel manager.
+ """
+ self._pending_objects: dict[str, MapObject] = {}
+ self._permanent_objects: dict[str, MapObject] = {}
+ self._track_id_map: dict[int, str] = {}
+ self._lock = threading.RLock()
+
+ self._voxel_size = voxel_size
+ self._regularize_voxel_size = regularize_voxel_size
+ self._confidence_threshold = confidence_threshold
+ self._distance_threshold = distance_threshold
+ self._iou_threshold = iou_threshold
+ self._min_detections_for_permanent = min_detections_for_permanent
+ self._pending_ttl_s = pending_ttl_s
+ self._num_angle_bins = num_angle_bins
+
+ def update(self, observations: list[Observation], target_object: str | None = None) -> UpdateStats:
+ """Update the map with new observations.
+
+ Args:
+ observations: List of new observations to integrate.
+ target_object: Optional label to protect from pruning.
+
+ Returns:
+ UpdateStats summarizing changes applied.
+ """
+ stats = UpdateStats()
+ now = max((obs.timestamp for obs in observations), default=time.time())
+ updated_ids: set[str] = set()
+ regularized_ids: set[str] = set()
+
+ with self._lock:
+ for obs in observations:
+ if obs.confidence < self._confidence_threshold:
+ continue
+
+ obj = self._match_observation(obs)
+ if obj is None:
+ obj = self._create_object(obs)
+ self._pending_objects[obj.object_id] = obj
+ stats.created += 1
+ else:
+ obj.update(obs)
+ stats.updated += 1
+
+ updated_ids.add(obj.object_id)
+ for track_id in obj.track_ids:
+ self._track_id_map[track_id] = obj.object_id
+
+ if obj.status == ObjectStatus.PENDING:
+ if self._check_promotion(obj):
+ stats.promoted += 1
+
+ if obj.object_id not in regularized_ids and obj.inactive_frames == 0:
+ obj.regularize_shape(percentile=obj._diversity_percentile)
+ regularized_ids.add(obj.object_id)
+
+ for obj in list(self._pending_objects.values()) + list(self._permanent_objects.values()):
+ if obj.object_id not in updated_ids:
+ obj.inactive_frames += 1
+
+ deleted = self._prune_inactive(target_object, now)
+ stats.deleted = len(deleted)
+ stats.merged += self.merge_objects()
+ stats.total = len(self._permanent_objects)
+
+ return stats
+
+ def _match_observation(self, obs: Observation) -> MapObject | None:
+ """Match an observation to an existing object."""
+ if obs.track_id in self._track_id_map:
+ obj_id = self._track_id_map[obs.track_id]
+ if obj_id in self._permanent_objects:
+ return self._permanent_objects[obj_id]
+ if obj_id in self._pending_objects:
+ return self._pending_objects[obj_id]
+
+ candidates = list(self._permanent_objects.values()) + list(self._pending_objects.values())
+ return find_best_match(
+ obs=obs,
+ candidates=candidates,
+ distance_threshold=self._distance_threshold,
+ iou_threshold=self._iou_threshold,
+ )
+
+ def _create_object(self, obs: Observation) -> MapObject:
+ """Create a new MapObject from an observation."""
+ obj = MapObject.from_observation(
+ object_id=str(uuid.uuid4()),
+ obs=obs,
+ voxel_size=self._voxel_size,
+ regularize_voxel_size=self._regularize_voxel_size,
+ num_angle_bins=self._num_angle_bins,
+ )
+ self._track_id_map[obs.track_id] = obj.object_id
+ return obj
+
+ def _check_promotion(self, obj: MapObject) -> bool:
+ """Promote object from pending to permanent if threshold met."""
+ if obj.status != ObjectStatus.PENDING:
+ return False
+ if obj.detection_count < self._min_detections_for_permanent:
+ return False
+ if obj.object_id in self._pending_objects:
+ self._pending_objects.pop(obj.object_id)
+ obj.status = ObjectStatus.PERMANENT
+ self._permanent_objects[obj.object_id] = obj
+ return True
+
+ def merge_objects(self) -> int:
+ """Merge overlapping permanent objects.
+
+ Returns:
+ Number of merges performed.
+ """
+ merged = 0
+ objs = list(self._permanent_objects.values())
+ i = 0
+ while i < len(objs):
+ obj = objs[i]
+ j = i + 1
+ while j < len(objs):
+ other = objs[j]
+ should_merge = False
+
+ if set(obj.track_ids) & set(other.track_ids):
+ should_merge = True
+ else:
+ if obj.centroid is None or other.centroid is None:
+ j += 1
+ continue
+ dist = float(((obj.centroid - other.centroid) ** 2).sum() ** 0.5)
+ if dist > self._distance_threshold:
+ j += 1
+ continue
+
+ obj_points = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(obj_points) == 0:
+ obj_points = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=False,
+ )
+ other_points = other.voxel_manager.get_valid_voxels(
+ diversity_percentile=other._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(other_points) == 0:
+ other_points = other.voxel_manager.get_valid_voxels(
+ diversity_percentile=other._diversity_percentile,
+ use_regularized=False,
+ )
+
+ obj_bbox = compute_bbox(obj_points) if len(obj_points) >= 3 else None
+ other_bbox = compute_bbox(other_points) if len(other_points) >= 3 else None
+ obj_corners = obj_bbox[0] if obj_bbox is not None else None
+ other_corners = other_bbox[0] if other_bbox is not None else None
+ obj_extent = obj_bbox[1] if obj_bbox is not None else None
+ other_extent = other_bbox[1] if other_bbox is not None else None
+
+ dist_merge = dist < 0.25
+ if obj_extent is not None and other_extent is not None:
+ dist_thresh = (((obj_extent / 2 + other_extent / 2) / 2) ** 2).sum() ** 0.5
+ dist_merge = dist_merge or dist < dist_thresh * 0.5
+
+ if dist_merge:
+ should_merge = True
+ elif obj_corners is not None and other_corners is not None:
+ iou, ratio_obj, ratio_other = compute_iou_3d_with_ratios(
+ obj_corners, other_corners
+ )
+ if (
+ iou > self._iou_threshold
+ or ratio_obj > 0.4
+ or ratio_other > 0.4
+ ):
+ should_merge = True
+
+ if should_merge:
+ obj.merge(other)
+ for track_id in other.track_ids:
+ self._track_id_map[track_id] = obj.object_id
+ self._permanent_objects.pop(other.object_id, None)
+ objs.pop(j)
+ merged += 1
+ else:
+ j += 1
+ i += 1
+
+ return merged
+
+ def _prune_inactive(self, target_object: str | None, now: float) -> list[MapObject]:
+ """Remove inactive pending/permanent objects."""
+ deleted: list[MapObject] = []
+
+ for obj_id, obj in list(self._pending_objects.items()):
+ if (now - obj.last_update_time) > self._pending_ttl_s:
+ deleted.append(self._pending_objects.pop(obj_id))
+
+ for obj_id, obj in list(self._permanent_objects.items()):
+ if target_object and obj.label == target_object:
+ continue
+ if obj.status == ObjectStatus.DELETED:
+ deleted.append(self._permanent_objects.pop(obj_id))
+
+ for obj in deleted:
+ for track_id in obj.track_ids:
+ if self._track_id_map.get(track_id) == obj.object_id:
+ self._track_id_map.pop(track_id, None)
+
+ return deleted
+
+ def get_objects(self, include_pending: bool = False) -> list[MapObject]:
+ """Return current objects.
+
+ Args:
+ include_pending: If True, include pending objects.
+
+ Returns:
+ List of MapObject instances.
+ """
+ if include_pending:
+ return list(self._permanent_objects.values()) + list(self._pending_objects.values())
+ return list(self._permanent_objects.values())
diff --git a/semantic_mapping/semantic_mapping/mapping_node.py b/semantic_mapping/semantic_mapping/mapping_node.py
new file mode 100644
index 0000000..760ad6d
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/mapping_node.py
@@ -0,0 +1,993 @@
+"""ROS2 mapping node for semantic 3D object mapping."""
+
+from __future__ import annotations
+
+from collections import deque
+from pathlib import Path
+import time
+
+import numpy as np
+from cv_bridge import CvBridge
+import cv2
+import rclpy
+from rclpy.node import Node
+from rclpy.qos import QoSProfile, ReliabilityPolicy
+from sensor_msgs.msg import CompressedImage, PointCloud2
+from sensor_msgs_py import point_cloud2
+from nav_msgs.msg import Odometry
+from std_msgs.msg import Header
+from visualization_msgs.msg import MarkerArray
+from scipy.spatial.transform import Rotation
+
+from detect_anything.msg import DetectionResult
+from semantic_mapping.msg import (
+ ObjectType,
+ ObjectNode,
+ TargetObjectInstruction,
+ TargetObjectWithSpatial,
+ ViewpointRep,
+)
+
+from vector_perception_utils.pointcloud_utils import (
+ project_pointcloud_to_image,
+ segment_pointclouds_from_projection,
+ transform_segmented_clouds_to_world,
+)
+from vector_perception_utils.transform_utils import discretize_angles
+
+from .map_object import Observation, MapObject
+from .utils.map_object_utils import (
+ create_bbox_markers,
+ create_delete_markers,
+ create_label_markers,
+ create_object_pointcloud,
+ map_object_to_object_node,
+)
+from .mapper import ObjectMapper
+from .utils.storage import ImageSaveQueue
+from .utils.utils import collect_cloud_window, compute_ground_z_threshold, interpolate_odom, load_camera_params
+
+
+class SemanticMappingNode(Node):
+ """ROS2 node for semantic object mapping.
+
+ This node fuses 2D detections and LiDAR point clouds into persistent 3D objects:
+ - Buffers detections, clouds, and odometry with timestamps
+ - Interpolates odometry at detection time (position + orientation)
+ - Segments point clouds by detection masks and projects to world frame
+ - Updates the object mapper and publishes ROS messages
+ - Queues cropped images for VLM queries
+ """
+
+ def __init__(self) -> None:
+ """Initialize the mapping node, parameters, buffers, and ROS I/O."""
+ super().__init__("semantic_mapping_node")
+ self._bridge = CvBridge()
+
+ self.declare_parameter("detection_topic", "/detection_result")
+ self.declare_parameter("cloud_topic", "/registered_scan")
+ self.declare_parameter("odom_topic", "/state_estimation")
+ self.declare_parameter("viewpoint_topic", "/viewpoint_rep_header")
+ self.declare_parameter("object_type_answer_topic", "/object_type_answer")
+ self.declare_parameter("object_type_query_topic", "/object_type_query")
+ self.declare_parameter("target_object_spatial_query_topic", "/target_object_spatial_query")
+ self.declare_parameter("target_object_instruction_topic", "/target_object_instruction")
+ self.declare_parameter("object_nodes_topic", "/object_nodes")
+ self.declare_parameter("obj_points_topic", "/obj_points")
+ self.declare_parameter("obj_boxes_topic", "/obj_boxes")
+ self.declare_parameter("obj_labels_topic", "/obj_labels")
+ self.declare_parameter("map_frame", "map")
+
+ self.declare_parameter("confidence_threshold", 0.3)
+ self.declare_parameter("voxel_size", 0.03)
+ self.declare_parameter("regularize_voxel_size", 0.06)
+ self.declare_parameter("distance_threshold", 0.5)
+ self.declare_parameter("iou_threshold", 0.2)
+ self.declare_parameter("min_detections_for_permanent", 3)
+ self.declare_parameter("pending_ttl_s", 10.0)
+ self.declare_parameter("num_angle_bins", 15)
+
+ self.declare_parameter("detection_linear_state_time_bias", 0.0)
+ self.declare_parameter("detection_angular_state_time_bias", 0.0)
+ self.declare_parameter("cloud_window_before", 0.5)
+ self.declare_parameter("cloud_window_after", 0.1)
+
+ self.declare_parameter("camera_param_path", "config/camera_param.yaml")
+ self.declare_parameter("depth_filter", False)
+ self.declare_parameter("depth_filter_margin", 0.15)
+ self.declare_parameter("depth_jump_threshold", 0.3)
+ self.declare_parameter("mask_erode_kernel", 3)
+ self.declare_parameter("mask_erode_iterations", 2)
+ self.declare_parameter("max_cloud_distance", 8.0)
+ self.declare_parameter("visualization_max_objects", 50)
+ self.declare_parameter("ground_filter", True)
+ self.declare_parameter("ground_radius", 2.0)
+ self.declare_parameter("ground_z_clearance", 0.0)
+
+ self.declare_parameter("target_object", "")
+ self.declare_parameter("save_png", False)
+ self.declare_parameter("output_dir", "output/object_images")
+ self.declare_parameter("save_viewpoint_images", True)
+ self.declare_parameter("viewpoint_output_dir", "output/viewpoint_images")
+ self.declare_parameter("viewpoint_use_timestamp_dir", True)
+ self.declare_parameter("viewpoint_save_png", True)
+ self.declare_parameter("viewpoint_save_transform", True)
+ self.declare_parameter("object_type_query_mode", "target_or_all")
+ self.declare_parameter("object_type_query_max_per_cycle", 3)
+ self.declare_parameter("enable_object_type_queries", True)
+
+ detection_topic = str(self.get_parameter("detection_topic").value)
+ cloud_topic = str(self.get_parameter("cloud_topic").value)
+ odom_topic = str(self.get_parameter("odom_topic").value)
+ viewpoint_topic = str(self.get_parameter("viewpoint_topic").value)
+ object_type_answer_topic = str(self.get_parameter("object_type_answer_topic").value)
+ object_type_query_topic = str(self.get_parameter("object_type_query_topic").value)
+ target_object_spatial_query_topic = str(
+ self.get_parameter("target_object_spatial_query_topic").value
+ )
+ target_object_instruction_topic = str(self.get_parameter("target_object_instruction_topic").value)
+ object_nodes_topic = str(self.get_parameter("object_nodes_topic").value)
+ obj_points_topic = str(self.get_parameter("obj_points_topic").value)
+ obj_boxes_topic = str(self.get_parameter("obj_boxes_topic").value)
+ obj_labels_topic = str(self.get_parameter("obj_labels_topic").value)
+ self._map_frame = str(self.get_parameter("map_frame").value)
+
+ self._confidence_threshold = float(self.get_parameter("confidence_threshold").value)
+ self._detection_linear_bias = float(self.get_parameter("detection_linear_state_time_bias").value)
+ self._detection_angular_bias = float(self.get_parameter("detection_angular_state_time_bias").value)
+ self._cloud_window_before = float(self.get_parameter("cloud_window_before").value)
+ self._cloud_window_after = float(self.get_parameter("cloud_window_after").value)
+
+ camera_param_path = str(self.get_parameter("camera_param_path").value)
+ camera_params = load_camera_params(
+ camera_param_path, base_dir=Path(__file__).resolve().parent.parent
+ )
+ self._projection = "equirect"
+ self._axis_mode = "xz"
+ self._image_width = 1920
+ self._image_height = 640
+ self._depth_filter = bool(self.get_parameter("depth_filter").value)
+ self._depth_filter_margin = float(self.get_parameter("depth_filter_margin").value)
+ self._depth_jump_threshold = float(self.get_parameter("depth_jump_threshold").value)
+ self._mask_erode_kernel_size = max(1, int(self.get_parameter("mask_erode_kernel").value))
+ self._mask_erode_iterations = max(0, int(self.get_parameter("mask_erode_iterations").value))
+ self._max_cloud_distance = float(self.get_parameter("max_cloud_distance").value)
+ self._sensor_to_camera = None
+ if camera_params is not None:
+ self._image_width = camera_params.image_width
+ self._image_height = camera_params.image_height
+ self._sensor_to_camera = camera_params.sensor_to_camera
+ self._mask_erode_kernel = cv2.getStructuringElement(
+ cv2.MORPH_RECT,
+ (self._mask_erode_kernel_size, self._mask_erode_kernel_size),
+ )
+ self._visualization_max_objects = int(
+ self.get_parameter("visualization_max_objects").value
+ )
+ self._ground_filter = bool(self.get_parameter("ground_filter").value)
+ self._ground_radius = float(self.get_parameter("ground_radius").value)
+ self._ground_z_clearance = float(self.get_parameter("ground_z_clearance").value)
+ self._ground_percentile = 0.05
+ self._ground_min_points = 80
+
+ self._target_object = str(self.get_parameter("target_object").value)
+ self._room_condition = ""
+ self._spatial_condition = ""
+ self._attribute_condition = ""
+ self._anchor_object = ""
+ self._attribute_condition_anchor = ""
+ self._target_spatial_query_ids: set[int] = set()
+ self._object_viewpoint_ids: dict[int, set[int]] = {}
+ self._save_png = bool(self.get_parameter("save_png").value)
+ self._output_dir = Path(str(self.get_parameter("output_dir").value))
+ self._save_viewpoint_images = bool(self.get_parameter("save_viewpoint_images").value)
+ self._viewpoint_output_dir = Path(str(self.get_parameter("viewpoint_output_dir").value))
+ self._viewpoint_use_timestamp_dir = bool(
+ self.get_parameter("viewpoint_use_timestamp_dir").value
+ )
+ self._viewpoint_save_png = bool(self.get_parameter("viewpoint_save_png").value)
+ self._viewpoint_save_transform = bool(
+ self.get_parameter("viewpoint_save_transform").value
+ )
+ self._object_type_query_mode = str(
+ self.get_parameter("object_type_query_mode").value
+ ).lower()
+ self._object_type_query_max = int(
+ self.get_parameter("object_type_query_max_per_cycle").value
+ )
+ self._enable_object_type_queries = bool(
+ self.get_parameter("enable_object_type_queries").value
+ )
+ self._viewpoint_dir = self._viewpoint_output_dir if self._save_viewpoint_images else None
+
+ self._mapper = ObjectMapper(
+ voxel_size=float(self.get_parameter("voxel_size").value),
+ regularize_voxel_size=float(self.get_parameter("regularize_voxel_size").value),
+ confidence_threshold=self._confidence_threshold,
+ distance_threshold=float(self.get_parameter("distance_threshold").value),
+ iou_threshold=float(self.get_parameter("iou_threshold").value),
+ min_detections_for_permanent=int(self.get_parameter("min_detections_for_permanent").value),
+ pending_ttl_s=float(self.get_parameter("pending_ttl_s").value),
+ num_angle_bins=int(self.get_parameter("num_angle_bins").value),
+ )
+
+ self._image_saver = ImageSaveQueue(
+ output_dir=self._output_dir,
+ max_queue_size=100,
+ save_png=self._save_png,
+ )
+ self._image_saver.start()
+
+ self._viewpoint_saver: ImageSaveQueue | None = None
+ if self._save_viewpoint_images and self._viewpoint_dir is not None:
+ self._viewpoint_saver = ImageSaveQueue(
+ output_dir=self._viewpoint_output_dir,
+ max_queue_size=50,
+ save_png=self._viewpoint_save_png,
+ save_transform=self._viewpoint_save_transform,
+ clear_output_dir=True,
+ use_timestamp_dir=self._viewpoint_use_timestamp_dir,
+ )
+ self._viewpoint_saver.start()
+
+ self._cloud_buffer: deque[tuple[float, np.ndarray]] = deque(maxlen=200)
+ self._odom_buffer: deque[tuple[float, dict]] = deque(maxlen=400)
+ self._detection_buffer: deque[DetectionResult] = deque(maxlen=100)
+ self._viewpoint_queue: deque[tuple[float, int]] = deque(maxlen=50)
+ self._processed_viewpoints: set[float] = set()
+ self._published_ids: set[int] = set()
+ self._mapping_busy = False
+ self._last_mapping_stats_log = 0.0
+ self._last_mapping_time_log = 0.0
+ self._mapping_log_throttle_s = 5.0
+ self._last_query_log = 0.0
+ self._query_log_throttle_s = 5.0
+ self._last_answer_log = 0.0
+
+ qos_cloud = QoSProfile(depth=10, reliability=ReliabilityPolicy.BEST_EFFORT)
+ qos_default = QoSProfile(depth=10, reliability=ReliabilityPolicy.RELIABLE)
+
+ self.create_subscription(DetectionResult, detection_topic, self._detection_callback, qos_default)
+ self.create_subscription(PointCloud2, cloud_topic, self._cloud_callback, qos_cloud)
+ self.create_subscription(Odometry, odom_topic, self._odom_callback, qos_cloud)
+ self.create_subscription(ViewpointRep, viewpoint_topic, self._viewpoint_callback, qos_default)
+ self.create_subscription(ObjectType, object_type_answer_topic, self._object_type_answer_callback, qos_default)
+ self.create_subscription(
+ TargetObjectInstruction, target_object_instruction_topic, self._target_object_callback, qos_default
+ )
+
+ self._object_type_query_pub = self.create_publisher(ObjectType, object_type_query_topic, qos_default)
+ self._target_object_spatial_query_pub = self.create_publisher(
+ TargetObjectWithSpatial, target_object_spatial_query_topic, qos_default
+ )
+ self._object_nodes_pub = self.create_publisher(ObjectNode, object_nodes_topic, qos_default)
+ self._obj_points_pub = self.create_publisher(PointCloud2, obj_points_topic, qos_default)
+ self._obj_boxes_pub = self.create_publisher(MarkerArray, obj_boxes_topic, qos_default)
+ self._obj_labels_pub = self.create_publisher(MarkerArray, obj_labels_topic, qos_default)
+
+ self.create_timer(0.5, self._mapping_callback)
+
+ def destroy_node(self) -> bool:
+ """Stop background workers and destroy the ROS node."""
+ self._image_saver.stop()
+ if self._viewpoint_saver is not None:
+ self._viewpoint_saver.stop()
+ return super().destroy_node()
+
+ def _detection_callback(self, msg: DetectionResult) -> None:
+ """Buffer detection results for later synchronization.
+
+ Args:
+ msg: DetectionResult message with bboxes, labels, and masks.
+ """
+ self._detection_buffer.append(msg)
+
+ def _cloud_callback(self, msg: PointCloud2) -> None:
+ """Buffer incoming point clouds with timestamps.
+
+ Args:
+ msg: PointCloud2 message in the sensor frame.
+ """
+ stamp = msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9
+ points = point_cloud2.read_points_numpy(msg, field_names=("x", "y", "z"))
+ self._cloud_buffer.append((stamp, points))
+
+ def _odom_callback(self, msg: Odometry) -> None:
+ """Buffer odometry data for interpolation.
+
+ Args:
+ msg: Odometry message containing pose and twist.
+ """
+ stamp = msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9
+ odom = {
+ "position": np.array(
+ [msg.pose.pose.position.x, msg.pose.pose.position.y, msg.pose.pose.position.z]
+ ),
+ "orientation": np.array(
+ [
+ msg.pose.pose.orientation.x,
+ msg.pose.pose.orientation.y,
+ msg.pose.pose.orientation.z,
+ msg.pose.pose.orientation.w,
+ ]
+ ),
+ "linear_velocity": np.array(
+ [msg.twist.twist.linear.x, msg.twist.twist.linear.y, msg.twist.twist.linear.z]
+ ),
+ "angular_velocity": np.array(
+ [msg.twist.twist.angular.x, msg.twist.twist.angular.y, msg.twist.twist.angular.z]
+ ),
+ }
+ self._odom_buffer.append((stamp, odom))
+
+ def _viewpoint_callback(self, msg: ViewpointRep) -> None:
+ """Queue viewpoint trigger requests.
+
+ Args:
+ msg: ViewpointRep message with timestamp and viewpoint id.
+ """
+ stamp = msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9
+ if stamp in self._processed_viewpoints:
+ return
+ self._viewpoint_queue.append((stamp, msg.viewpoint_id))
+
+ def _target_object_callback(self, msg: TargetObjectInstruction) -> None:
+ """Update target object label used for VLM query decisions.
+
+ Args:
+ msg: TargetObjectInstruction message.
+ """
+ self._target_object = msg.target_object
+ self._room_condition = msg.room_condition
+ self._spatial_condition = msg.spatial_condition
+ self._attribute_condition = msg.attribute_condition
+ self._anchor_object = msg.anchor_object
+ self._attribute_condition_anchor = msg.attribute_condition_anchor
+ self._target_spatial_query_ids.clear()
+ if msg.target_object:
+ self.get_logger().debug(
+ f"Target instruction: target={msg.target_object} "
+ f"room={msg.room_condition} spatial={msg.spatial_condition}"
+ )
+
+ def _object_type_answer_callback(self, msg: ObjectType) -> None:
+ """Apply VLM relabeling updates to tracked objects.
+
+ Args:
+ msg: ObjectType message with final label for a track id.
+ """
+ matched = False
+ for obj in self._mapper.get_objects(include_pending=True):
+ if msg.object_id in obj.track_ids:
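+                # Heavily boost the VLM-provided label (+50 votes, confidence pinned to 1.0) so it dominates the class vote.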
+ obj.class_votes[msg.final_label] = obj.class_votes.get(msg.final_label, 0) + 50
+ obj.confidence_scores[msg.final_label] = max(
+ obj.confidence_scores.get(msg.final_label, 0.0), 1.0
+ )
+ obj.points_count[msg.final_label] = obj.points_count.get(msg.final_label, 0)
+ obj.updated_by_vlm = True
+ obj._invalidate_cache()
+ self.get_logger().debug(
+ f"Object type answer applied: object_id={msg.object_id} "
+ f"label={msg.final_label}"
+ )
+ matched = True
+ break
+ if not matched:
+ now = time.time()
+ if now - self._last_answer_log > self._query_log_throttle_s:
+ track_ids = [
+ track_id
+ for obj in self._mapper.get_objects(include_pending=True)
+ for track_id in obj.track_ids
+ ]
+ preview = track_ids[:10]
+ suffix = "..." if len(track_ids) > 10 else ""
+ self.get_logger().warning(
+ f"Object type answer ignored: object_id={msg.object_id} "
+ f"label={msg.final_label} known_ids={preview}{suffix}"
+ )
+ self._last_answer_log = now
+
+ def _mapping_callback(self) -> None:
+ """Main mapping loop: synchronize inputs, update map, and publish outputs."""
+ if self._mapping_busy:
+ return
+ if len(self._detection_buffer) < 1 or len(self._cloud_buffer) < 1 or len(self._odom_buffer) < 2:
+ return
+
+ self._mapping_busy = True
+ start_time = time.perf_counter()
+ try:
+ detection_msg, viewpoint_stamp, viewpoint_id = self._select_detection()
+ if detection_msg is None:
+ return
+
+ detection_stamp = detection_msg.header.stamp.sec + detection_msg.header.stamp.nanosec / 1e9
+ image = self._bridge.compressed_imgmsg_to_cv2(detection_msg.image, desired_encoding="bgr8")
+
+ odom = interpolate_odom(
+ list(self._odom_buffer),
+ detection_stamp,
+ self._detection_linear_bias,
+ self._detection_angular_bias,
+ )
+ if odom is None:
+ return
+
+ if viewpoint_stamp is not None:
+ self._save_viewpoint_snapshot(
+ viewpoint_id=viewpoint_id,
+ viewpoint_stamp=viewpoint_stamp,
+ image=image,
+ odom=odom,
+ )
+
+ neighbor_cloud = collect_cloud_window(
+ list(self._cloud_buffer),
+ detection_stamp,
+ self._cloud_window_before,
+ self._cloud_window_after,
+ )
+ if neighbor_cloud is None:
+ return
+
+ observations = self._build_observations(
+ detection_msg=detection_msg,
+ image=image,
+ odom=odom,
+ cloud=neighbor_cloud,
+ )
+ if len(observations) == 0:
+ return
+
+ stats = self._mapper.update(observations, target_object=self._target_object)
+ now = time.time()
+ if now - self._last_mapping_stats_log > self._mapping_log_throttle_s:
+ self.get_logger().debug(
+ "Mapping stats:\n"
+ f" total: {stats.total}\n"
+ f" deleted: {stats.deleted}"
+ )
+ self._last_mapping_stats_log = now
+ self._update_viewpoint_ids(observations, viewpoint_id)
+ self._update_observation_angles(observations)
+ self._update_best_images(observations, image)
+ self._publish_map(detection_stamp, viewpoint_id)
+ self._publish_object_type_queries()
+ self._publish_target_object_spatial_queries()
+
+ if viewpoint_stamp is not None:
+ self._processed_viewpoints.add(viewpoint_stamp)
+ finally:
+ elapsed_ms = (time.perf_counter() - start_time) * 1000.0
+ now = time.time()
+ if now - self._last_mapping_time_log > self._mapping_log_throttle_s:
+ self.get_logger().debug(f"Mapping loop time: {elapsed_ms:.2f} ms")
+ self._last_mapping_time_log = now
+ self._mapping_busy = False
+
+ def _select_detection(self) -> tuple[DetectionResult | None, float | None, int]:
+ """Pick the detection to process, optionally aligned to a viewpoint.
+
+ Returns:
+ Tuple of (detection_msg, viewpoint_stamp, viewpoint_id).
+ """
+ viewpoint_stamp = None
+ viewpoint_id = -1
+
+ if len(self._viewpoint_queue) > 0:
+ stamp, vid = self._viewpoint_queue[0]
+ detections = list(self._detection_buffer)
+ stamps = [msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9 for msg in detections]
+ diffs = np.abs(np.array(stamps) - stamp)
+ idx = int(np.argmin(diffs))
+ diff = float(diffs[idx])
+ if diff > 1.0:
+ self.get_logger().debug(
+ f"Viewpoint stamp {stamp:.3f} is {diff:.3f}s away from closest detection"
+ )
+ detection_msg = detections[idx]
+ viewpoint_stamp = stamp
+ viewpoint_id = vid
+ self._viewpoint_queue.popleft()
+ return detection_msg, viewpoint_stamp, viewpoint_id
+
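+        # No pending viewpoint: fall back to the second-newest detection when more than one is buffered.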
+ idx = -2 if len(self._detection_buffer) > 1 else -1
+ return self._detection_buffer[idx], viewpoint_stamp, viewpoint_id
+
+ def _update_viewpoint_ids(self, observations: list[Observation], viewpoint_id: int) -> None:
+ """Track viewpoint IDs associated with observations."""
+ if viewpoint_id < 0:
+ return
+ for obs in observations:
+ if obs.track_id not in self._object_viewpoint_ids:
+ self._object_viewpoint_ids[obs.track_id] = set()
+ self._object_viewpoint_ids[obs.track_id].add(viewpoint_id)
+
+ def _collect_viewpoint_ids(self, obj: MapObject) -> list[int]:
+ """Collect viewpoint IDs observed for a MapObject."""
+ ids: set[int] = set()
+ for track_id in obj.track_ids:
+ ids.update(self._object_viewpoint_ids.get(track_id, set()))
+ return sorted(ids)
+
+ def _save_viewpoint_snapshot(
+ self,
+ viewpoint_id: int,
+ viewpoint_stamp: float,
+ image: np.ndarray,
+ odom: dict,
+ ) -> None:
+ """Save viewpoint image and pose transform for spatial queries."""
+ if not self._save_viewpoint_images or self._viewpoint_dir is None:
+ return
+ if viewpoint_id < 0:
+ return
+
+ _ = viewpoint_stamp # Reserved for future indexing/logging
+
+ transform = None
+ if self._viewpoint_save_transform:
+ R_b2w = Rotation.from_quat(odom["orientation"]).as_matrix()
+ t_b2w = np.array(odom["position"], dtype=float)
+ transform = np.eye(4, dtype=float)
+ transform[:3, :3] = R_b2w
+ transform[:3, 3] = t_b2w
+
+ if self._viewpoint_saver is not None:
+ self._viewpoint_saver.queue_viewpoint_save(
+ viewpoint_id=viewpoint_id, image=image, transform=transform
+ )
+
+ # helper methods moved to semantic_mapping.utils.utils
+
+ def _build_observations(
+ self,
+ detection_msg: DetectionResult,
+ image: np.ndarray,
+ odom: dict,
+ cloud: np.ndarray,
+ ) -> list[Observation]:
+ """Build Observation objects from detections and segmented point clouds.
+
+ Args:
+ detection_msg: DetectionResult with bboxes, labels, and masks.
+ image: RGB image used for masks and cropping.
+ odom: Interpolated odometry dict (position + orientation).
+            cloud: Aggregated point cloud in the odometry (world) frame.
+
+ Returns:
+ List of Observation instances for object updates.
+ """
+ det_stamp = detection_msg.header.stamp.sec + detection_msg.header.stamp.nanosec / 1e9
+ image_shape = image.shape[:2]
+
+ masks = []
+ labels = []
+ confidences = []
+ track_ids = []
+
+ for i in range(len(detection_msg.track_id)):
+ conf = float(detection_msg.confidence[i])
+ if conf < self._confidence_threshold:
+ continue
+ x1 = int(detection_msg.x1[i])
+ y1 = int(detection_msg.y1[i])
+ x2 = int(detection_msg.x2[i])
+ y2 = int(detection_msg.y2[i])
+ x1 = max(0, min(x1, image_shape[1] - 1))
+ x2 = max(0, min(x2, image_shape[1]))
+ y1 = max(0, min(y1, image_shape[0] - 1))
+ y2 = max(0, min(y2, image_shape[0]))
+ if x2 <= x1 or y2 <= y1:
+ continue
+
+ full_mask = np.zeros(image_shape, dtype=bool)
+ if i < len(detection_msg.masks):
+ mask_msg: CompressedImage = detection_msg.masks[i]
+ mask_crop = self._bridge.compressed_imgmsg_to_cv2(mask_msg)
+ if mask_crop.ndim == 3:
+ mask_crop = cv2.cvtColor(mask_crop, cv2.COLOR_BGR2GRAY)
+ mask_bool = mask_crop.astype(bool)
+ if self._mask_erode_iterations > 0:
+ mask_eroded = cv2.erode(
+ mask_bool.astype(np.uint8),
+ self._mask_erode_kernel,
+ iterations=self._mask_erode_iterations,
+ )
+ mask_bool = mask_eroded.astype(bool)
+ crop_h = min(mask_bool.shape[0], y2 - y1)
+ crop_w = min(mask_bool.shape[1], x2 - x1)
+ full_mask[y1 : y1 + crop_h, x1 : x1 + crop_w] = mask_bool[:crop_h, :crop_w]
+
+ masks.append(full_mask)
+ labels.append(detection_msg.label[i])
+ confidences.append(conf)
+ track_ids.append(int(detection_msg.track_id[i]))
+
+ if len(masks) == 0:
+ return []
+
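+        # Bring the aggregated cloud (odometry frame) into the body frame at the interpolated detection pose.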
+ R_b2w = Rotation.from_quat(odom["orientation"]).as_matrix()
+ t_b2w = np.array(odom["position"])
+ R_w2b = R_b2w.T
+ t_w2b = -R_w2b @ t_b2w
+ cloud_body = cloud @ R_w2b.T + t_w2b
+
+ _, pixel_idx = project_pointcloud_to_image(
+ points=cloud_body,
+ projection=self._projection,
+ image_width=self._image_width,
+ image_height=self._image_height,
+ axis_mode=self._axis_mode,
+ intrinsics=None,
+ depth_filter=self._depth_filter,
+ depth_filter_margin=self._depth_filter_margin,
+ transform=self._sensor_to_camera,
+ )
+ seg_clouds, seg_depths = segment_pointclouds_from_projection(
+ points=cloud_body,
+ masks=masks,
+ pixel_idx=pixel_idx,
+ )
+ if self._depth_filter:
+ filtered_clouds = []
+ filtered_depths = []
+            for seg_cloud, seg_depth in zip(seg_clouds, seg_depths):
+                if seg_depth.shape[0] == 0:
+                    filtered_clouds.append(seg_cloud)
+                    filtered_depths.append(seg_depth)
+                    continue
+                min_depth = float(np.min(seg_depth))
+                keep = seg_depth[:, 0] <= (min_depth + self._depth_filter_margin)
+                filtered_clouds.append(seg_cloud[keep])
+                filtered_depths.append(seg_depth[keep])
+ seg_clouds = filtered_clouds
+ seg_depths = filtered_depths
+ world_clouds = transform_segmented_clouds_to_world(
+ clouds=seg_clouds,
+ depths=seg_depths,
+ R_b2w=R_b2w,
+ t_b2w=t_b2w,
+ depth_jump_threshold=self._depth_jump_threshold,
+ )
+ cloud_world = cloud_body @ R_b2w.T + t_b2w
+ ground_z_threshold = compute_ground_z_threshold(
+ cloud_world,
+ t_b2w[:2],
+ self._ground_filter,
+ self._ground_min_points,
+ self._ground_radius,
+ self._ground_percentile,
+ self._ground_z_clearance,
+ )
+
+ observations = []
+ for idx, world_cloud in enumerate(world_clouds):
+ if self._max_cloud_distance > 0:
+ dist = np.linalg.norm(world_cloud - t_b2w, axis=1)
+ world_cloud = world_cloud[dist < self._max_cloud_distance]
+ if ground_z_threshold is not None:
+ world_cloud = world_cloud[world_cloud[:, 2] > ground_z_threshold]
+ if world_cloud.shape[0] == 0:
+ continue
+ observations.append(
+ Observation(
+ track_id=track_ids[idx],
+ label=labels[idx],
+ confidence=confidences[idx],
+ points=world_cloud,
+ robot_R=R_b2w,
+ robot_t=t_b2w,
+ mask=masks[idx],
+ image=image,
+ timestamp=det_stamp,
+ )
+ )
+
+ return observations
+
+ def _update_observation_angles(self, observations: list[Observation]) -> None:
+ """Reproject object voxels onto the current mask to update angle votes."""
+ if len(observations) == 0:
+ return
+
+ objects = self._mapper.get_objects(include_pending=True)
+ track_to_obj = {}
+ for obj in objects:
+ for track_id in obj.track_ids:
+ track_to_obj[track_id] = obj
+
+ for obs in observations:
+ obj = track_to_obj.get(obs.track_id)
+ if obj is None or obs.mask is None:
+ continue
+ voxels = obj.voxel_manager.voxels
+ if voxels.shape[0] == 0:
+ continue
+
+ R_b2w = obs.robot_R
+ t_b2w = obs.robot_t
+ R_w2b = R_b2w.T
+ t_w2b = -R_w2b @ t_b2w
+ voxels_body = voxels @ R_w2b.T + t_w2b
+ _, pixel_idx = project_pointcloud_to_image(
+ points=voxels_body,
+ projection=self._projection,
+ image_width=self._image_width,
+ image_height=self._image_height,
+ axis_mode=self._axis_mode,
+ intrinsics=None,
+ depth_filter=self._depth_filter,
+ depth_filter_margin=self._depth_filter_margin,
+ transform=self._sensor_to_camera,
+ )
+ pix = pixel_idx.astype(int)
+ valid = (
+ (pix[:, 0] >= 0)
+ & (pix[:, 0] < obs.mask.shape[1])
+ & (pix[:, 1] >= 0)
+ & (pix[:, 1] < obs.mask.shape[0])
+ )
+ if not np.any(valid):
+ continue
+ pix = pix[valid]
+ voxel_indices = np.nonzero(valid)[0]
+ on_mask = obs.mask[pix[:, 1], pix[:, 0]].astype(bool)
+ if not np.any(on_mask):
+ continue
+ voxel_indices = voxel_indices[on_mask]
+ vectors = voxels[voxel_indices] - t_b2w
+ obs_angles = np.arctan2(vectors[:, 1], vectors[:, 0])
+ obs_bins = discretize_angles(obs_angles, obj.voxel_manager.num_angle_bins)
+ obj.voxel_manager.update_observation_angles(voxel_indices, obs_bins)
+ obj._invalidate_cache()
+
+ def _update_best_images(self, observations: list[Observation], image: np.ndarray) -> None:
+ """Update best image crops for objects based on mask area.
+
+ Args:
+ observations: Observations created for the current frame.
+ image: Full RGB image associated with the detections.
+ """
+ for obs in observations:
+ obj = None
+ for candidate in self._mapper.get_objects(include_pending=True):
+ if obs.track_id in candidate.track_ids:
+ obj = candidate
+ break
+ if obj is None or obs.mask is None:
+ continue
+
+ coords = np.argwhere(obs.mask)
+ if coords.size == 0:
+ continue
+ mask_area = coords.shape[0]
+ if mask_area <= obj.best_image_mask_area:
+ continue
+
+ y0, x0 = coords.min(axis=0)
+ y1, x1 = coords.max(axis=0) + 1
+ padding = max(int(y1 - y0), int(x1 - x0), 80)
+ y0 = max(0, y0 - padding)
+ x0 = max(0, x0 - padding)
+ y1 = min(obs.mask.shape[0], y1 + padding)
+ x1 = min(obs.mask.shape[1], x1 + padding)
+
+ if obj.best_image_path:
+ image_path = Path(obj.best_image_path)
+ mask_path = image_path.with_name(image_path.stem + "_mask.npy")
+ if image_path.exists():
+ image_path.unlink()
+ if mask_path.exists():
+ mask_path.unlink()
+ if self._save_png:
+ png_path = image_path.with_suffix(".png")
+ if png_path.exists():
+ png_path.unlink()
+
+ cropped_image = image[y0:y1, x0:x1]
+ cropped_mask = obs.mask[y0:y1, x0:x1]
+
+ obj.best_image_path = self._image_saver.queue_save(
+ object_id=str(obs.track_id),
+ image=cropped_image,
+ mask=cropped_mask,
+ )
+ obj.best_image_mask_area = float(mask_area)
+ obj.is_asked_vlm = False
+
+ def _publish_map(self, stamp: float, viewpoint_id: int) -> None:
+ """Publish object nodes, markers, and aggregated point clouds.
+
+ Args:
+ stamp: Timestamp (seconds) for message headers.
+ viewpoint_id: Viewpoint id for ObjectNode messages.
+ """
+ seconds = int(stamp)
+ nanoseconds = int((stamp - seconds) * 1e9)
+ header = Header()
+ header.stamp.sec = seconds
+ header.stamp.nanosec = nanoseconds
+ header.frame_id = self._map_frame
+
+ objects = self._mapper.get_objects(include_pending=True)
+ for obj in objects:
+ if obj.detection_count == 1:
+ status = "new"
+ elif obj.inactive_frames == 0:
+ status = "updated"
+ else:
+ status = "unchanged"
+ msg = map_object_to_object_node(
+ obj,
+ header,
+ status=status,
+ viewpoint_id=viewpoint_id,
+ target_object=self._target_object,
+ )
+ if msg is not None:
+ self._object_nodes_pub.publish(msg)
+
+ viz_objects = objects
+ if self._visualization_max_objects > 0 and len(objects) > self._visualization_max_objects:
+ viz_objects = sorted(
+ objects, key=lambda obj: obj.last_update_time, reverse=True
+ )[: self._visualization_max_objects]
+
+ bbox_markers = create_bbox_markers(viz_objects, header)
+ label_markers = create_label_markers(viz_objects, header)
+ self._obj_boxes_pub.publish(bbox_markers)
+ self._obj_labels_pub.publish(label_markers)
+
+ obj_cloud = create_object_pointcloud(viz_objects, header)
+ if obj_cloud is not None:
+ self._obj_points_pub.publish(obj_cloud)
+
+ current_ids = {int(obj.track_ids[0]) if obj.track_ids else 0 for obj in viz_objects}
+ deleted_ids = list(self._published_ids - current_ids)
+ if len(deleted_ids) > 0:
+ delete_markers = create_delete_markers(deleted_ids, header, namespace="bbox")
+ self._obj_boxes_pub.publish(delete_markers)
+ delete_labels = create_delete_markers(deleted_ids, header, namespace="labels")
+ self._obj_labels_pub.publish(delete_labels)
+ self._published_ids = current_ids
+
+ def _publish_object_type_queries(self) -> None:
+ """Publish VLM queries for target objects with good image crops."""
+ if not self._enable_object_type_queries:
+ return
+ published = 0
+ objects = self._mapper.get_objects(include_pending=False)
+ eligible = []
+ for obj in objects:
+ if obj.is_asked_vlm or obj.best_image_mask_area <= 500:
+ continue
+ if not obj.best_image_path:
+ continue
+ image_path = Path(obj.best_image_path)
+ if not image_path.exists():
+ continue
+ eligible.append(obj)
+ target = (self._target_object or "").strip().lower()
+ target_matches = [obj for obj in eligible if obj.label == target] if target else []
+
+ mode = self._object_type_query_mode
+ if mode not in ("target_only", "all", "target_or_all"):
+ mode = "target_or_all"
+
+ fallback_used = False
+ if mode == "target_only":
+ candidates = target_matches
+ elif mode == "all":
+ candidates = eligible
+ else:
+ if target_matches:
+ candidates = target_matches
+ else:
+ candidates = eligible
+ fallback_used = True
+
+ if candidates:
+ candidates = sorted(
+ candidates,
+ key=lambda obj: obj.best_image_mask_area,
+ reverse=True,
+ )
+ max_per_cycle = max(1, self._object_type_query_max)
+ for obj in candidates[:max_per_cycle]:
+ msg = ObjectType()
+ msg.object_id = int(obj.track_ids[0]) if obj.track_ids else -1
+ msg.img_path = obj.best_image_path or ""
+ msg.labels = list(obj.class_votes.keys())
+ self._object_type_query_pub.publish(msg)
+ obj.is_asked_vlm = True
+ published += 1
+ now = time.time()
+ if published > 0 or now - self._last_query_log > self._query_log_throttle_s:
+ subscriber_count = self._object_type_query_pub.get_subscription_count()
+ self.get_logger().debug(
+ f"ObjectType queries: published={published} eligible={len(eligible)} "
+ f"target_matches={len(target_matches)} fallback_all={fallback_used} "
+ f"mode={mode} target={self._target_object} subs={subscriber_count}"
+ )
+ self._last_query_log = now
+
+ def _publish_target_object_spatial_queries(self) -> None:
+ """Publish spatial target-object queries when spatial condition is set."""
+ if not self._spatial_condition:
+ return
+
+ published = 0
+ skipped_label = 0
+ skipped_mask = 0
+ skipped_bbox = 0
+ skipped_viewpoints = 0
+ skipped_asked = 0
+ for obj in self._mapper.get_objects(include_pending=False):
+ if obj.label != self._target_object:
+ skipped_label += 1
+ continue
+ if obj.best_image_mask_area <= 500:
+ skipped_mask += 1
+ continue
+ if obj.bbox3d_corners is None:
+ skipped_bbox += 1
+ continue
+
+ object_id = int(obj.track_ids[0]) if obj.track_ids else -1
+ if object_id < 0:
+ continue
+ if object_id in self._target_spatial_query_ids:
+ skipped_asked += 1
+ continue
+
+ viewpoint_ids = self._collect_viewpoint_ids(obj)
+ if len(viewpoint_ids) == 0:
+ skipped_viewpoints += 1
+ continue
+
+ msg = TargetObjectWithSpatial()
+ msg.header.stamp = self.get_clock().now().to_msg()
+ msg.header.frame_id = self._map_frame
+ msg.object_id = object_id
+ msg.object_label = obj.label
+ msg.img_path = obj.best_image_path or ""
+ msg.viewpoint_ids = viewpoint_ids
+ msg.room_label = ""
+ msg.is_target = False
+ for i in range(8):
+ msg.bbox3d[i].x = float(obj.bbox3d_corners[i, 0])
+ msg.bbox3d[i].y = float(obj.bbox3d_corners[i, 1])
+ msg.bbox3d[i].z = float(obj.bbox3d_corners[i, 2])
+
+ self._target_object_spatial_query_pub.publish(msg)
+ self._target_spatial_query_ids.add(object_id)
+ published += 1
+ now = time.time()
+ if published > 0 or now - self._last_query_log > self._query_log_throttle_s:
+ subscriber_count = self._target_object_spatial_query_pub.get_subscription_count()
+ self.get_logger().debug(
+ f"TargetSpatial queries: published={published} skipped_label={skipped_label} "
+ f"skipped_mask={skipped_mask} skipped_bbox={skipped_bbox} "
+ f"skipped_viewpoints={skipped_viewpoints} skipped_asked={skipped_asked} "
+ f"subs={subscriber_count}"
+ )
+ self._last_query_log = now
+
+
+def main(args: list[str] | None = None) -> None:
+ """Entry point for the semantic mapping node."""
+ rclpy.init(args=args)
+ node = SemanticMappingNode()
+ try:
+ rclpy.spin(node)
+ except KeyboardInterrupt:
+ pass
+ finally:
+ node.destroy_node()
+ rclpy.shutdown()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/semantic_mapping/semantic_mapping/prompts/__init__.py b/semantic_mapping/semantic_mapping/prompts/__init__.py
new file mode 100644
index 0000000..acbdca0
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/prompts/__init__.py
@@ -0,0 +1,17 @@
+"""Prompt templates for VLM reasoning tasks."""
+
+from .vlm_prompts import (
+ build_object_type_prompt,
+ build_room_type_prompt,
+ INSTRUCTION_DECOMPOSITION_PROMPT,
+ TARGET_OBJECT_PROMPT,
+ TARGET_OBJECT_SPATIAL_PROMPT,
+)
+
+__all__ = [
+ "build_object_type_prompt",
+ "build_room_type_prompt",
+ "INSTRUCTION_DECOMPOSITION_PROMPT",
+ "TARGET_OBJECT_PROMPT",
+ "TARGET_OBJECT_SPATIAL_PROMPT",
+]
diff --git a/semantic_mapping/semantic_mapping/prompts/vlm_prompts.py b/semantic_mapping/semantic_mapping/prompts/vlm_prompts.py
new file mode 100644
index 0000000..441f452
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/prompts/vlm_prompts.py
@@ -0,0 +1,78 @@
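+"""Prompt templates used by the VLM reasoning node."""
+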
+from typing import Sequence
+
+
+OBJECT_TYPE_PROMPT_TEMPLATE = """You are provided with an RGB image containing an object.
+Select the correct label from the candidate list and respond strictly in JSON.
+
+Return ONLY valid JSON with this schema:
+{{"label": "", "reason": ""}}
+
+Example:
+{{"label": "chair", "reason": "The object has a seat, backrest, and four legs."}}
+
+Candidate labels:
+{labels}
+"""
+
+ROOM_TYPE_PROMPT_TEMPLATE = """You are given an RGB image of a room and a top-down room layout mask.
+Identify the room type and respond strictly in valid JSON.
+
+Return ONLY valid JSON with this schema:
+{{"room_type": ""}}
+
+If you are not confident, respond with:
+{{"room_type": "unknown"}}
+
+Room types:
+{room_types}
+"""
+
+
+INSTRUCTION_DECOMPOSITION_PROMPT = """You are given a user instruction for finding an object.
+Decompose it into the following fields and return ONLY valid JSON:
+
+{"target_object": "", "room_condition": "", "spatial_condition": "", "attribute_condition": "", "anchor_object": "", "attribute_condition_anchor": ""}
+
+If a field is not specified, return it as an empty string.
+
+Example:
+Instruction: "Find a silver refrigerator next to the white cabinet in the student lounge."
+{"target_object": "refrigerator", "room_condition": "in the student lounge", "spatial_condition": "next to the cabinet", "attribute_condition": "silver", "anchor_object": "cabinet", "attribute_condition_anchor": "white"}
+"""
+
+
+TARGET_OBJECT_PROMPT = """You are given an instruction and a candidate object image.
+Determine whether the object matches the target described in the instruction.
+Consider object type, room condition, and attribute condition.
+
+Return ONLY valid JSON:
+{"is_target": true/false, "reason": ""}
+
+Example:
+{"is_target": true, "reason": "The object is a silver refrigerator in the lounge."}
+"""
+
+
+TARGET_OBJECT_SPATIAL_PROMPT = """You are given an instruction, a candidate object image, and
+additional views of the surrounding scene. Determine whether the object matches
+the instruction including any spatial or attribute constraints.
+
+Return ONLY valid JSON:
+{"is_target": true/false, "reason": ""}
+
+Example:
+{"is_target": false, "reason": "The object is a fridge but not next to the cabinet."}
+"""
+
+
+def build_object_type_prompt(labels: Sequence[str]) -> str:
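+    """Fill the object-type prompt template with a comma-separated candidate label list."""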
+ label_block = ", ".join(labels) if labels else "unknown"
+ return OBJECT_TYPE_PROMPT_TEMPLATE.format(labels=label_block)
+
+
+def build_room_type_prompt(room_types: Sequence[str]) -> str:
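+    """Fill the room-type prompt template with a bulleted list of room type names."""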
+ if room_types:
+ room_block = "\n".join(f"- {name}" for name in room_types)
+ else:
+ room_block = "- unknown"
+ return ROOM_TYPE_PROMPT_TEMPLATE.format(room_types=room_block)
diff --git a/semantic_mapping/semantic_mapping/utils/__init__.py b/semantic_mapping/semantic_mapping/utils/__init__.py
new file mode 100644
index 0000000..4a5efc8
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/utils/__init__.py
@@ -0,0 +1,25 @@
+"""Utility modules for semantic mapping."""
+
+from .utils import (
+ CameraParams,
+ annotate_mask,
+ collect_cloud_window,
+ compute_ground_z_threshold,
+ concat_images,
+ interpolate_odom,
+ load_camera_params,
+ load_npy_image,
+ parse_json_dict,
+)
+
+__all__ = [
+ "CameraParams",
+ "annotate_mask",
+ "collect_cloud_window",
+ "compute_ground_z_threshold",
+ "concat_images",
+ "interpolate_odom",
+ "load_camera_params",
+ "load_npy_image",
+ "parse_json_dict",
+]
diff --git a/semantic_mapping/semantic_mapping/utils/map_object_utils.py b/semantic_mapping/semantic_mapping/utils/map_object_utils.py
new file mode 100644
index 0000000..5ad4163
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/utils/map_object_utils.py
@@ -0,0 +1,264 @@
+"""ROS message helpers for MapObject publishers."""
+
+from __future__ import annotations
+
+import zlib
+from typing import Optional
+
+import numpy as np
+from geometry_msgs.msg import Point
+from sensor_msgs.msg import PointCloud2
+from std_msgs.msg import Header
+from visualization_msgs.msg import Marker, MarkerArray
+
+from semantic_mapping.msg import ObjectNode
+from vector_perception_utils.pointcloud_utils import create_pointcloud2_msg
+
+from ..map_object import MapObject
+
+
+def _axis_aligned_corners(points: np.ndarray) -> Optional[np.ndarray]:
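+    """Return the 8 corners of the axis-aligned bounding box, or None for an empty cloud."""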
+ if points.size == 0:
+ return None
+ min_bound = np.min(points, axis=0)
+ max_bound = np.max(points, axis=0)
+ return np.array(
+ [
+ [min_bound[0], min_bound[1], min_bound[2]],
+ [max_bound[0], min_bound[1], min_bound[2]],
+ [max_bound[0], max_bound[1], min_bound[2]],
+ [min_bound[0], max_bound[1], min_bound[2]],
+ [min_bound[0], min_bound[1], max_bound[2]],
+ [max_bound[0], min_bound[1], max_bound[2]],
+ [max_bound[0], max_bound[1], max_bound[2]],
+ [min_bound[0], max_bound[1], max_bound[2]],
+ ],
+ dtype=float,
+ )
+
+
+def get_object_seed(obj: MapObject) -> int:
+ """Generate a deterministic seed from a MapObject for color generation.
+
+ Args:
+ obj: MapObject instance.
+
+ Returns:
+ CRC32 hash of the object's identifier.
+ """
+ seed_key = obj.object_id or (str(obj.track_ids[0]) if obj.track_ids else obj.label)
+ return zlib.crc32(seed_key.encode("utf-8"))
+
+
+def map_object_to_object_node(
+ obj: MapObject,
+ header: Header,
+ status: str = "new",
+ viewpoint_id: int = -1,
+ target_object: str | None = None,
+) -> Optional[ObjectNode]:
+ """Convert a MapObject into an ObjectNode message.
+
+ Args:
+ obj: MapObject instance.
+ header: ROS header to attach to the message.
+ status: Lifecycle status string ("new", "updated", "unchanged").
+        viewpoint_id: Viewpoint identifier attached to the message (overridden to -1 when the label came from the VLM).
+        target_object: Target label; matching objects not yet sent to the VLM are published as "Potential Target".
+
+ Returns:
+ ObjectNode message or None if required geometry is missing.
+ """
+ if obj.centroid is None or obj.bbox3d_corners is None:
+ return None
+
+ msg = ObjectNode()
+ msg.header = header
+ msg.object_id = obj.track_ids
+ if target_object and obj.label == target_object and not obj.is_asked_vlm:
+ msg.label = "Potential Target"
+ else:
+ msg.label = obj.label
+ msg.status = status
+ msg.position.x = float(obj.centroid[0])
+ msg.position.y = float(obj.centroid[1])
+ msg.position.z = float(obj.centroid[2])
+
+ for i in range(8):
+ msg.bbox3d[i].x = float(obj.bbox3d_corners[i, 0])
+ msg.bbox3d[i].y = float(obj.bbox3d_corners[i, 1])
+ msg.bbox3d[i].z = float(obj.bbox3d_corners[i, 2])
+
+ voxels = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(voxels) == 0:
+ voxels = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=False,
+ )
+ if len(voxels) > 0:
+ seed = get_object_seed(obj)
+ color = np.array(
+ [
+ (seed & 0xFF) / 255.0,
+ ((seed >> 8) & 0xFF) / 255.0,
+ ((seed >> 16) & 0xFF) / 255.0,
+ ],
+ dtype=np.float32,
+ )
+ colors = np.tile(color, (voxels.shape[0], 1))
+ msg.cloud = create_pointcloud2_msg(points=voxels, colors=colors, header=header)
+
+ msg.img_path = obj.best_image_path or ""
+ msg.is_asked_vlm = obj.is_asked_vlm
+ msg.viewpoint_id = -1 if obj.updated_by_vlm else viewpoint_id
+ return msg
+
+
+def create_bbox_markers(
+ objects: list[MapObject], header: Header, namespace: str = "bbox"
+) -> MarkerArray:
+ """Create wireframe bounding box markers for MapObjects."""
+ markers = MarkerArray()
+ edges = [
+ (0, 1),
+ (1, 2),
+ (2, 3),
+ (3, 0),
+ (4, 5),
+ (5, 6),
+ (6, 7),
+ (7, 4),
+ (0, 4),
+ (1, 5),
+ (2, 6),
+ (3, 7),
+ ]
+
+ for obj in objects:
+ voxels = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(voxels) == 0:
+ voxels = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=False,
+ )
+ corners = _axis_aligned_corners(voxels) if len(voxels) > 0 else None
+ if corners is None:
+ continue
+ marker = Marker()
+ marker.header = header
+ marker.ns = namespace
+ marker.id = int(obj.track_ids[0]) if obj.track_ids else 0
+ marker.type = Marker.LINE_LIST
+ marker.action = Marker.ADD
+ marker.scale.x = 0.02
+
+ seed = get_object_seed(obj)
+ marker.color.r = (seed & 0xFF) / 255.0
+ marker.color.g = ((seed >> 8) & 0xFF) / 255.0
+ marker.color.b = ((seed >> 16) & 0xFF) / 255.0
+ marker.color.a = 1.0
+
+ for i, j in edges:
+ marker.points.append(Point(x=float(corners[i, 0]), y=float(corners[i, 1]), z=float(corners[i, 2])))
+ marker.points.append(Point(x=float(corners[j, 0]), y=float(corners[j, 1]), z=float(corners[j, 2])))
+
+ markers.markers.append(marker)
+
+ return markers
+
+
+def create_label_markers(
+ objects: list[MapObject],
+ header: Header,
+ namespace: str = "labels",
+ text_height: float = 0.4,
+) -> MarkerArray:
+ """Create text label markers for MapObjects."""
+ markers = MarkerArray()
+
+ for obj in objects:
+ if obj.centroid is None:
+ continue
+ marker = Marker()
+ marker.header = header
+ marker.ns = namespace
+ marker.id = int(obj.track_ids[0]) if obj.track_ids else 0
+ marker.type = Marker.TEXT_VIEW_FACING
+ marker.action = Marker.ADD
+ marker.scale.z = text_height
+ marker.text = obj.label
+ marker.pose.position.x = float(obj.centroid[0])
+ marker.pose.position.y = float(obj.centroid[1])
+ marker.pose.position.z = float(obj.centroid[2]) + 0.3
+
+ seed = get_object_seed(obj)
+ marker.color.r = (seed & 0xFF) / 255.0
+ marker.color.g = ((seed >> 8) & 0xFF) / 255.0
+ marker.color.b = ((seed >> 16) & 0xFF) / 255.0
+ marker.color.a = 1.0
+
+ markers.markers.append(marker)
+
+ return markers
+
+
+def create_object_pointcloud(objects: list[MapObject], header: Header) -> Optional[PointCloud2]:
+ """Create a colored point cloud for a list of MapObjects."""
+ points_list = []
+ colors_list = []
+
+ for obj in objects:
+ voxels = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(voxels) == 0:
+ voxels = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=False,
+ )
+ if len(voxels) == 0:
+ continue
+
+ seed = get_object_seed(obj)
+ color = np.array(
+ [
+ (seed & 0xFF) / 255.0,
+ ((seed >> 8) & 0xFF) / 255.0,
+ ((seed >> 16) & 0xFF) / 255.0,
+ ],
+ dtype=np.float32,
+ )
+ colors = np.tile(color, (voxels.shape[0], 1))
+ points_list.append(voxels)
+ colors_list.append(colors)
+
+ if len(points_list) == 0:
+ return None
+
+ points = np.concatenate(points_list, axis=0)
+ colors = np.concatenate(colors_list, axis=0)
+ return create_pointcloud2_msg(points=points, colors=colors, header=header)
+
+
+def create_delete_markers(
+ deleted_ids: list[int],
+ header: Header,
+ namespace: str = "bbox",
+) -> MarkerArray:
+ """Create deletion markers for a list of marker IDs."""
+ markers = MarkerArray()
+ for marker_id in deleted_ids:
+ marker = Marker()
+ marker.header = header
+ marker.ns = namespace
+ marker.id = int(marker_id)
+ marker.action = Marker.DELETE
+ markers.markers.append(marker)
+ return markers
diff --git a/semantic_mapping/semantic_mapping/utils/matching.py b/semantic_mapping/semantic_mapping/utils/matching.py
new file mode 100644
index 0000000..e5aaa19
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/utils/matching.py
@@ -0,0 +1,256 @@
+"""Object matching utilities for semantic mapping."""
+
+from __future__ import annotations
+
+from typing import Optional
+
+import cv2
+import numpy as np
+
+from vector_perception_utils.pointcloud_utils import compute_centroid
+
+from ..map_object import MapObject, Observation
+
+
+def _polygon_area(poly: np.ndarray) -> float:
+ """Compute polygon area using the shoelace formula.
+
+ Args:
+ poly: Polygon vertices in order (N, 2).
+
+ Returns:
+ Polygon area.
+ """
+ if poly.shape[0] < 3:
+ return 0.0
+ x = poly[:, 0]
+ y = poly[:, 1]
+ return 0.5 * abs(float(np.dot(x, np.roll(y, -1)) - np.dot(y, np.roll(x, -1))))
+
+
+def _ensure_ccw(poly: np.ndarray) -> np.ndarray:
+ """Ensure polygon vertices are ordered counter-clockwise."""
+ if poly.shape[0] < 3:
+ return poly
+ x = poly[:, 0]
+ y = poly[:, 1]
+ signed = float(np.dot(x, np.roll(y, -1)) - np.dot(y, np.roll(x, -1)))
+ if signed < 0:
+ return poly[::-1]
+ return poly
+
+
+def compute_bbox(points: np.ndarray) -> Optional[tuple[np.ndarray, np.ndarray]]:
+    """Compute a yaw-only oriented 3D bounding box for a point cloud.
+
+    Args:
+        points: Point cloud (N, 3).
+
+    Returns:
+        Tuple of (corner points (8, 3), extents [width, height, z-range]) or None if degenerate.
+    """
+ if points.shape[0] < 3:
+ return None
+
+ xy = points[:, :2].astype(np.float32)
+ unique_xy = np.unique(xy, axis=0)
+ if unique_xy.shape[0] < 3:
+ return None
+
+ rect = cv2.minAreaRect(unique_xy)
+ (width, height) = rect[1]
+ if width == 0.0 or height == 0.0:
+ return None
+
+ box = cv2.boxPoints(rect).astype(np.float64)
+ rect_xy = _ensure_ccw(box)
+
+ zmin = float(np.min(points[:, 2]))
+ zmax = float(np.max(points[:, 2]))
+ extent = np.array([width, height, zmax - zmin], dtype=float)
+ z_center = zmax - extent[2] / 2.0
+ half_z = extent[2] / 2.0
+
+ bottom = np.hstack([rect_xy, np.full((4, 1), z_center - half_z)])
+ top = np.hstack([rect_xy, np.full((4, 1), z_center + half_z)])
+ corners = np.vstack([bottom, top])
+ return corners, extent
+
+
+def _line_intersection(p1: np.ndarray, p2: np.ndarray, cp1: np.ndarray, cp2: np.ndarray) -> np.ndarray:
+ """Intersection of line p1->p2 with line cp1->cp2.
+
+ Args:
+ p1: Line segment start (2,).
+ p2: Line segment end (2,).
+ cp1: Clip edge start (2,).
+ cp2: Clip edge end (2,).
+
+ Returns:
+ Intersection point (2,).
+ """
+ dc = cp1 - cp2
+ dp = p1 - p2
+ denom = dc[0] * dp[1] - dc[1] * dp[0]
+ if denom == 0:
+ return p2
+ n1 = cp1[0] * cp2[1] - cp1[1] * cp2[0]
+ n2 = p1[0] * p2[1] - p1[1] * p2[0]
+ x = (n1 * dp[0] - n2 * dc[0]) / denom
+ y = (n1 * dp[1] - n2 * dc[1]) / denom
+ return np.array([x, y], dtype=np.float64)
+
+
+def _polygon_clip(subject: np.ndarray, clip: np.ndarray) -> np.ndarray:
+ """Clip a polygon with a convex clip polygon (Sutherland–Hodgman).
+
+ Args:
+ subject: Subject polygon vertices (N, 2).
+ clip: Clip polygon vertices (M, 2).
+
+ Returns:
+ Clipped polygon vertices (K, 2).
+ """
+ if subject.shape[0] == 0:
+ return subject
+
+    def inside(p: np.ndarray, cp1: np.ndarray, cp2: np.ndarray) -> bool:
+        cross = (cp2[0] - cp1[0]) * (p[1] - cp1[1]) - (cp2[1] - cp1[1]) * (p[0] - cp1[0])
+        return cross >= 0
+
+ output = subject
+ for i in range(len(clip)):
+ cp1 = clip[i]
+ cp2 = clip[(i + 1) % len(clip)]
+ input_list = output
+ output = []
+ if len(input_list) == 0:
+ return np.empty((0, 2), dtype=np.float64)
+ s = input_list[-1]
+ for e in input_list:
+ if inside(e, cp1, cp2):
+ if not inside(s, cp1, cp2):
+ output.append(_line_intersection(s, e, cp1, cp2))
+ output.append(e)
+ elif inside(s, cp1, cp2):
+ output.append(_line_intersection(s, e, cp1, cp2))
+ s = e
+ output = np.array(output, dtype=np.float64)
+ return np.array(output, dtype=np.float64)
+
+
+def compute_iou_3d_with_ratios(
+ bbox1: np.ndarray, bbox2: np.ndarray
+) -> tuple[float, float, float]:
+ """Compute IoU and intersection ratios using yaw-only bbox overlap.
+
+ Args:
+ bbox1: Corner points for bbox1 (N, 3).
+ bbox2: Corner points for bbox2 (N, 3).
+
+ Returns:
+ Tuple of (iou, ratio_bbox1, ratio_bbox2).
+ """
+ if bbox1.shape[0] < 4 or bbox2.shape[0] < 4:
+ return 0.0, 0.0, 0.0
+
+ rect1 = _ensure_ccw(bbox1[:4, :2])
+ rect2 = _ensure_ccw(bbox2[:4, :2])
+ if rect1.shape[0] < 3 or rect2.shape[0] < 3:
+ return 0.0, 0.0, 0.0
+
+ poly_inter = _polygon_clip(rect1, rect2)
+ inter_area = _polygon_area(poly_inter) if poly_inter.shape[0] >= 3 else 0.0
+
+ zmin1, zmax1 = float(np.min(bbox1[:, 2])), float(np.max(bbox1[:, 2]))
+ zmin2, zmax2 = float(np.min(bbox2[:, 2])), float(np.max(bbox2[:, 2]))
+ height = max(0.0, min(zmax1, zmax2) - max(zmin1, zmin2))
+ inter_vol = inter_area * height
+
+ area1 = _polygon_area(rect1)
+ area2 = _polygon_area(rect2)
+ vol1 = area1 * (zmax1 - zmin1)
+ vol2 = area2 * (zmax2 - zmin2)
+ denom = vol1 + vol2 - inter_vol
+
+ iou = inter_vol / denom if denom > 0 else 0.0
+ ratio1 = inter_vol / vol1 if vol1 > 0 else 0.0
+ ratio2 = inter_vol / vol2 if vol2 > 0 else 0.0
+ return iou, ratio1, ratio2
+
+
+def find_best_match(
+ obs: Observation,
+ candidates: list[MapObject],
+ distance_threshold: float,
+ iou_threshold: float,
+ ratio_threshold: float = 0.4,
+ min_distance_merge: float = 0.25,
+) -> Optional[MapObject]:
+ """Find best matching object by distance/IoU heuristics.
+
+ Args:
+ obs: Observation to match.
+ candidates: Candidate map objects.
+ distance_threshold: Max centroid distance to consider.
+ iou_threshold: Min IoU to accept a merge.
+ ratio_threshold: Min overlap ratio to accept a merge.
+ min_distance_merge: Absolute distance threshold for merge.
+
+ Returns:
+ Best matching object or None if no match passes thresholds.
+ """
+ if obs.points.shape[0] == 0:
+ return None
+
+ obs_center = compute_centroid(obs.points)
+ obs_bbox = compute_bbox(obs.points)
+ obs_corners = obs_bbox[0] if obs_bbox is not None else None
+ obs_extent = obs_bbox[1] if obs_bbox is not None else None
+
+ best_candidate: Optional[MapObject] = None
+ best_dist = float("inf")
+
+ for candidate in candidates:
+ if obs.track_id in candidate.track_ids:
+ return candidate
+
+ cand_center = candidate.centroid
+ if cand_center is None:
+ continue
+
+ dist = float(np.linalg.norm(obs_center - cand_center))
+ if dist > distance_threshold:
+ continue
+
+ cand_points = candidate.voxel_manager.get_valid_voxels(
+ diversity_percentile=candidate._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(cand_points) == 0:
+ cand_points = candidate.voxel_manager.get_valid_voxels(
+ diversity_percentile=candidate._diversity_percentile,
+ use_regularized=False,
+ )
+ cand_bbox = compute_bbox(cand_points) if len(cand_points) >= 3 else None
+ cand_corners = cand_bbox[0] if cand_bbox is not None else None
+ cand_extent = cand_bbox[1] if cand_bbox is not None else None
+
+ dist_merge = dist < min_distance_merge
+ if obs_extent is not None and cand_extent is not None:
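+            # Adaptive merge distance: half the norm of the averaged half-extents.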
+ dist_thresh = np.linalg.norm((obs_extent / 2 + cand_extent / 2) / 2) * 0.5
+ dist_merge = dist_merge or dist < dist_thresh
+
+ if dist_merge:
+ if dist < best_dist:
+ best_candidate = candidate
+ best_dist = dist
+ continue
+
+ if obs_corners is None or cand_corners is None:
+ continue
+
+ iou, ratio_obs, ratio_cand = compute_iou_3d_with_ratios(
+ obs_corners, cand_corners
+ )
+
+ if iou > iou_threshold or ratio_obs > ratio_threshold or ratio_cand > ratio_threshold:
+ if dist < best_dist:
+ best_candidate = candidate
+ best_dist = dist
+
+ return best_candidate
diff --git a/semantic_mapping/semantic_mapping/utils/storage.py b/semantic_mapping/semantic_mapping/utils/storage.py
new file mode 100644
index 0000000..a3089f4
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/utils/storage.py
@@ -0,0 +1,157 @@
+"""Async image saving queue for semantic mapping."""
+
+from __future__ import annotations
+
+from pathlib import Path
+import os
+import shutil
+import queue
+import threading
+import time
+
+import cv2
+import numpy as np
+
+
+def _reset_output_dir(output_dir: Path) -> None:
+ """Clear an output directory if it contains existing files."""
+ if output_dir.exists() and any(output_dir.iterdir()):
+ shutil.rmtree(output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+
+class ImageSaveQueue:
+ """Background worker for saving images to disk."""
+
+ def __init__(
+ self,
+ output_dir: Path,
+ max_queue_size: int = 100,
+ save_png: bool = False,
+ save_transform: bool = False,
+ clear_output_dir: bool = True,
+ use_timestamp_dir: bool = False,
+ ) -> None:
+ """Initialize the image save queue.
+
+ Args:
+ output_dir: Base directory where images are stored.
+ max_queue_size: Maximum number of queued save jobs.
+ save_png: If True, also save a .png copy of the image.
+ save_transform: If True, save a viewpoint transform when provided.
+ clear_output_dir: If True, clear existing contents on startup.
+ use_timestamp_dir: If True, write into a timestamped subdirectory.
+ """
+ base_dir = Path(output_dir)
+ if clear_output_dir:
+ _reset_output_dir(base_dir)
+ else:
+ base_dir.mkdir(parents=True, exist_ok=True)
+ if use_timestamp_dir:
+ run_dir = base_dir / f"{time.time():.2f}"
+ run_dir.mkdir(parents=True, exist_ok=True)
+ self._output_dir = run_dir
+ else:
+ self._output_dir = base_dir
+ self._save_png = save_png
+ self._save_transform = save_transform
+ self._queue: queue.Queue = queue.Queue(maxsize=max_queue_size)
+ self._stop_event = threading.Event()
+ self._thread: threading.Thread | None = None
+
+ def start(self) -> None:
+ """Start the background worker."""
+ if self._thread is not None and self._thread.is_alive():
+ return
+ self._stop_event.clear()
+ self._thread = threading.Thread(target=self._run, daemon=True)
+ self._thread.start()
+
+ def stop(self) -> None:
+ """Stop the background worker."""
+ self._stop_event.set()
+ try:
+ self._queue.put_nowait(None)
+ except queue.Full:
+ pass
+ if self._thread is not None:
+ self._thread.join(timeout=2.0)
+ self._thread = None
+
+ def queue_save(
+ self, object_id: str, image: np.ndarray, mask: np.ndarray | None = None
+ ) -> str:
+ """Queue an image (and optional mask) for saving.
+
+ Args:
+ object_id: Object identifier to embed in filename.
+ image: Image array to save.
+ mask: Optional mask array to save alongside the image.
+
+ Returns:
+ Path to the saved image (.npy).
+ """
+ timestamp_ms = int(time.time() * 1000)
+ image_path = self._output_dir / f"{object_id}_{timestamp_ms}.npy"
+ mask_path = self._output_dir / f"{object_id}_{timestamp_ms}_mask.npy"
+ self._queue.put(("save_object", str(image_path), str(mask_path), image, mask))
+ return str(image_path)
+
+ def queue_viewpoint_save(
+ self, viewpoint_id: int, image: np.ndarray, transform: np.ndarray | None
+ ) -> None:
+ """Queue a viewpoint snapshot for saving."""
+ if not self._save_png and not self._save_transform:
+ return
+ image_path = self._output_dir / f"viewpoint_{viewpoint_id}.png"
+ transform_path = self._output_dir / f"viewpoint_{viewpoint_id}_transform.npy"
+ try:
+ self._queue.put_nowait(
+ ("save_viewpoint", str(image_path), str(transform_path), image, transform)
+ )
+ except queue.Full:
+ return
+
+ def _run(self) -> None:
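+        """Drain the queue and write images, masks, and transforms atomically via temp files and os.replace."""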
+ while not self._stop_event.is_set():
+ item = self._queue.get()
+ if item is None:
+ self._queue.task_done()
+ break
+ op, image_path, aux_path, image, aux = item
+ if op == "save_object":
+ try:
+ image_target = Path(image_path)
+ image_tmp = image_target.with_suffix(image_target.suffix + ".tmp")
+ with image_tmp.open("wb") as handle:
+ np.save(handle, image)
+ os.replace(image_tmp, image_target)
+ if aux is not None:
+ aux_target = Path(aux_path)
+ aux_tmp = aux_target.with_suffix(aux_target.suffix + ".tmp")
+ with aux_tmp.open("wb") as handle:
+ np.save(handle, aux)
+ os.replace(aux_tmp, aux_target)
+ if self._save_png:
+ png_path = image_target.with_suffix(".png")
+ png_tmp = png_path.with_name(png_path.stem + ".tmp.png")
+ cv2.imwrite(str(png_tmp), image)
+ os.replace(png_tmp, png_path)
+ except Exception:
+ pass
+ elif op == "save_viewpoint":
+ try:
+ if self._save_transform and aux is not None:
+ aux_target = Path(aux_path)
+ aux_tmp = aux_target.with_suffix(aux_target.suffix + ".tmp")
+ with aux_tmp.open("wb") as handle:
+ np.save(handle, aux)
+ os.replace(aux_tmp, aux_target)
+ if self._save_png:
+ image_target = Path(image_path)
+ png_tmp = image_target.with_name(image_target.stem + ".tmp.png")
+ cv2.imwrite(str(png_tmp), image)
+ os.replace(png_tmp, image_target)
+ except Exception:
+ pass
+ self._queue.task_done()
diff --git a/semantic_mapping/semantic_mapping/utils/utils.py b/semantic_mapping/semantic_mapping/utils/utils.py
new file mode 100644
index 0000000..2a4e980
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/utils/utils.py
@@ -0,0 +1,234 @@
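+"""General-purpose helpers shared by the semantic mapping nodes."""
+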
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+import json
+import time
+from typing import Any
+
+import cv2
+import numpy as np
+import yaml
+
+
+@dataclass(frozen=True)
+class CameraParams:
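+    """Camera projection parameters loaded from the camera parameter YAML."""
+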
+ image_width: int = 1920
+ image_height: int = 640
+ hfov: float = 360.0
+ vfov: float = 120.0
+ sensor_to_camera: np.ndarray | None = None
+
+
+def load_camera_params(path: str, base_dir: Path | None = None) -> CameraParams | None:
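+    """Load CameraParams from a YAML file, resolving relative paths against base_dir; returns None if the file is missing."""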
+ config_path = Path(path)
+ if not config_path.is_absolute() and base_dir is not None:
+ config_path = base_dir / config_path
+ if not config_path.exists():
+ return None
+ data = yaml.safe_load(config_path.read_text()) or {}
+ sensor_to_camera = _load_sensor_to_camera_matrix(data)
+ return CameraParams(
+ image_width=int(data.get("image_width", 1920)),
+ image_height=int(data.get("image_height", 640)),
+ hfov=float(data.get("hfov", 360.0)),
+ vfov=float(data.get("vfov", 120.0)),
+ sensor_to_camera=sensor_to_camera,
+ )
+
+
+def _load_sensor_to_camera_matrix(data: dict[str, Any]) -> np.ndarray:
+    """Build the 4x4 sensor-to-camera transform by inverting the configured camera pose (translation + RPY)."""
+ tf_data = data.get("sensor_to_camera", {})
+ trans = tf_data.get("translation", {})
+ rpy = tf_data.get("rotation_rpy", {})
+ cam_offset = np.array(
+ [
+ float(trans.get("x", 0.0)),
+ float(trans.get("y", 0.0)),
+ float(trans.get("z", 0.0)),
+ ]
+ )
+ roll = float(rpy.get("roll", 0.0))
+ pitch = float(rpy.get("pitch", 0.0))
+ yaw = float(rpy.get("yaw", 0.0))
+
+ r_x = np.array(
+ [[1, 0, 0], [0, np.cos(roll), -np.sin(roll)], [0, np.sin(roll), np.cos(roll)]]
+ )
+ r_y = np.array(
+ [[np.cos(pitch), 0, np.sin(pitch)], [0, 1, 0], [-np.sin(pitch), 0, np.cos(pitch)]]
+ )
+ r_z = np.array(
+ [[np.cos(yaw), -np.sin(yaw), 0], [np.sin(yaw), np.cos(yaw), 0], [0, 0, 1]]
+ )
+ cam_rotation = r_z @ r_y @ r_x
+ rotation = cam_rotation.T
+ translation = -rotation @ cam_offset
+ transform = np.eye(4)
+ transform[:3, :3] = rotation
+ transform[:3, 3] = translation
+ return transform
+
+
+def parse_json_dict(text: str) -> dict[str, Any]:
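+    """Extract and parse the first JSON object from a (possibly code-fenced) VLM response; empty dict on failure."""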
+ if not text:
+ return {}
+ cleaned = text.strip()
+ if "```" in cleaned:
+ parts = cleaned.split("```")
+ if len(parts) >= 3:
+ cleaned = parts[1].strip()
+ start = cleaned.find("{")
+ end = cleaned.rfind("}")
+ if start != -1 and end != -1:
+ cleaned = cleaned[start : end + 1]
+ try:
+ parsed = json.loads(cleaned)
+ except json.JSONDecodeError:
+ return {}
+ return parsed if isinstance(parsed, dict) else {}
+
+
+def load_npy_image(path_str: str, retries: int = 3, retry_delay_s: float = 0.05) -> np.ndarray | None:
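+    """Load a .npy image, retrying briefly in case a writer is mid-save; returns None on failure."""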
+ path = Path(path_str)
+ if not path.exists():
+ return None
+ for attempt in range(max(1, retries)):
+ try:
+ return np.load(str(path))
+ except Exception:
+ if attempt >= retries - 1:
+ return None
+ time.sleep(retry_delay_s)
+
+
+def annotate_mask(image: np.ndarray, image_path: str, iterations: int = 2) -> np.ndarray:
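+    """Draw the saved object mask outline (from the sibling *_mask.npy) onto a copy of the image."""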
+ mask_path = image_path.replace(".npy", "_mask.npy")
+ if not Path(mask_path).exists():
+ return image
+ mask = np.load(mask_path)
+ mask = cv2.dilate(mask.astype(np.uint8), np.ones((3, 3), np.uint8), iterations=iterations)
+ mask_uint8 = mask.astype(np.uint8)
+ contours, _ = cv2.findContours(mask_uint8, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+ annotated = image.copy()
+ cv2.drawContours(annotated, contours, -1, (0, 255, 0), 2)
+ return annotated
+
+
+def concat_images(images: list[np.ndarray]) -> np.ndarray:
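+    """Resize images to a common height, concatenate them horizontally, and convert BGR to RGB."""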
+ if not images:
+ return np.zeros((32, 32, 3), dtype=np.uint8)
+ heights = [img.shape[0] for img in images]
+ target_h = min(heights)
+ resized = []
+ for img in images:
+ if img.shape[0] != target_h:
+ scale = target_h / img.shape[0]
+ width = max(1, int(img.shape[1] * scale))
+ img = cv2.resize(img, (width, target_h), interpolation=cv2.INTER_AREA)
+ resized.append(img)
+ return cv2.cvtColor(cv2.hconcat(resized), cv2.COLOR_BGR2RGB)
+
+
+def interpolate_odom(
+ odom_buffer: list[tuple[float, dict]],
+ detection_stamp: float,
+ linear_bias: float,
+ angular_bias: float,
+) -> dict | None:
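+    """Interpolate odometry at the (bias-shifted) detection time.
+
+    Positions and velocities are linearly interpolated; orientation uses quaternion slerp.
+    Returns None when fewer than two samples are buffered or the requested stamps fall
+    outside the buffered odometry range.
+    """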
+ target_stamp = detection_stamp + linear_bias
+ angular_stamp = detection_stamp + angular_bias
+ stamps = [s for s, _ in odom_buffer]
+ if len(stamps) < 2:
+ return None
+ if target_stamp < stamps[0] or target_stamp > stamps[-1]:
+ return None
+ if angular_stamp < stamps[0] or angular_stamp > stamps[-1]:
+ return None
+
+ right_idx = next(i for i, s in enumerate(stamps) if s >= target_stamp)
+ left_idx = max(right_idx - 1, 0)
+ angular_right_idx = next(i for i, s in enumerate(stamps) if s >= angular_stamp)
+ angular_left_idx = max(angular_right_idx - 1, 0)
+
+ left_stamp, left_odom = odom_buffer[left_idx]
+ right_stamp, right_odom = odom_buffer[right_idx]
+ angular_left_stamp, angular_left_odom = odom_buffer[angular_left_idx]
+ angular_right_stamp, angular_right_odom = odom_buffer[angular_right_idx]
+ denom = right_stamp - left_stamp
+ ratio = (target_stamp - left_stamp) / denom if denom > 0 else 0.5
+ angular_denom = angular_right_stamp - angular_left_stamp
+ angular_ratio = (angular_stamp - angular_left_stamp) / angular_denom if angular_denom > 0 else 0.5
+
+ return {
+ "position": right_odom["position"] * ratio + left_odom["position"] * (1 - ratio),
+ "linear_velocity": right_odom["linear_velocity"] * ratio
+ + left_odom["linear_velocity"] * (1 - ratio),
+ "orientation": _slerp_quat(angular_left_odom["orientation"], angular_right_odom["orientation"], angular_ratio),
+ "angular_velocity": angular_right_odom["angular_velocity"] * angular_ratio
+ + angular_left_odom["angular_velocity"] * (1 - angular_ratio),
+ }
+
+
+def collect_cloud_window(
+ cloud_buffer: list[tuple[float, np.ndarray]],
+ detection_stamp: float,
+ window_before: float,
+ window_after: float,
+) -> np.ndarray | None:
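+    """Concatenate buffered clouds whose stamps fall within [stamp - window_before, stamp + window_after]."""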
+ start = detection_stamp - window_before
+ end = detection_stamp + window_after
+ clouds = [pts for stamp, pts in cloud_buffer if start <= stamp <= end]
+ if len(clouds) == 0:
+ return None
+ return np.concatenate(clouds, axis=0)
+
+
+def compute_ground_z_threshold(
+ points: np.ndarray,
+ robot_xy: np.ndarray,
+ ground_filter: bool,
+ ground_min_points: int,
+ ground_radius: float,
+ ground_percentile: float,
+ ground_z_clearance: float,
+) -> float | None:
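+    """Estimate a ground-height cutoff from points within ground_radius of the robot.
+
+    Returns the configured z percentile plus ground_z_clearance, or None when filtering is
+    disabled, too few points are available, or the radius/percentile parameters are invalid.
+    """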
+ if not ground_filter:
+ return None
+ if points.shape[0] < ground_min_points:
+ return None
+ if ground_radius <= 0.0:
+ return None
+ if not 0.0 < ground_percentile < 1.0:
+ return None
+
+ diffs = points[:, :2] - robot_xy
+ dist2 = diffs[:, 0] ** 2 + diffs[:, 1] ** 2
+ radius2 = ground_radius**2
+ mask = dist2 <= radius2
+ if np.count_nonzero(mask) < ground_min_points:
+ return None
+
+ z_vals = points[mask, 2]
+ z_thresh = float(np.percentile(z_vals, ground_percentile * 100.0))
+ return z_thresh + ground_z_clearance
+
+
+def _slerp_quat(q0: np.ndarray, q1: np.ndarray, ratio: float) -> np.ndarray:
+ """Slerp between two quaternions without scipy dependency."""
+ q0 = np.asarray(q0, dtype=float)
+ q1 = np.asarray(q1, dtype=float)
+ dot = float(np.dot(q0, q1))
+ if dot < 0.0:
+ q1 = -q1
+ dot = -dot
+ if dot > 0.9995:
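+        # Nearly parallel quaternions: fall back to normalized linear interpolation.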
+ result = q0 + ratio * (q1 - q0)
+ return result / np.linalg.norm(result)
+ theta_0 = np.arccos(dot)
+ sin_theta_0 = np.sin(theta_0)
+ theta = theta_0 * ratio
+ sin_theta = np.sin(theta)
+ s0 = np.cos(theta) - dot * sin_theta / sin_theta_0
+ s1 = sin_theta / sin_theta_0
+ return s0 * q0 + s1 * q1
diff --git a/semantic_mapping/semantic_mapping/vlm_reasoning_node.py b/semantic_mapping/semantic_mapping/vlm_reasoning_node.py
new file mode 100644
index 0000000..9d717f8
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/vlm_reasoning_node.py
@@ -0,0 +1,556 @@
+"""ROS2 node for VLM reasoning tasks in semantic mapping."""
+
+from __future__ import annotations
+
+from collections import deque
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+from pathlib import Path
+import threading
+import time
+
+import cv2
+import numpy as np
+import rclpy
+from cv_bridge import CvBridge
+from rclpy.node import Node
+from std_msgs.msg import String
+import yaml
+
+from semantic_mapping.msg import ObjectType, RoomType, TargetObjectInstruction, TargetObjectWithSpatial
+from vector_perception_utils.pointcloud_utils import project_bbox3d_equirect
+from vlm.models import MoondreamConfig, MoondreamVlm, QwenConfig, QwenVlm, VlmModelBase
+
+from .prompts import (
+ build_object_type_prompt,
+ build_room_type_prompt,
+ INSTRUCTION_DECOMPOSITION_PROMPT,
+ TARGET_OBJECT_SPATIAL_PROMPT,
+)
+from .utils.utils import (
+ annotate_mask,
+ concat_images,
+ load_camera_params,
+ load_npy_image,
+ parse_json_dict,
+)
+
+
+@dataclass
+class InstructionState:
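+    """Decomposed fields of the most recent user instruction."""
+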
+ instruction: str = ""
+ target_object: str = ""
+ room_condition: str = ""
+ spatial_condition: str = ""
+ attribute_condition: str = ""
+ anchor_object: str = ""
+ attribute_condition_anchor: str = ""
+
+
+class VlmReasoningNode(Node):
+ """VLM reasoning node that answers object queries for semantic mapping."""
+
+ def __init__(self) -> None:
+ super().__init__("vlm_reasoning_node")
+
+ self.declare_parameter("vlm_provider", "qwen")
+ self.declare_parameter("vlm_model", "qwen2.5-vl-72b-instruct")
+ self.declare_parameter("objects_yaml_path", "detect_anything/config/objects.yaml")
+ self.declare_parameter("object_type_query_topic", "/object_type_query")
+ self.declare_parameter("object_type_answer_topic", "/object_type_answer")
+ self.declare_parameter("target_object_spatial_query_topic", "/target_object_spatial_query")
+ self.declare_parameter("target_object_answer_topic", "/target_object_answer")
+ self.declare_parameter("target_object_instruction_topic", "/target_object_instruction")
+ self.declare_parameter("instruction_topic", "/keyboard_input")
+ self.declare_parameter("room_type_query_topic", "/room_type_query")
+ self.declare_parameter("room_type_answer_topic", "/room_type_answer")
+ self.declare_parameter("room_type_cache_s", 10.0)
+ self.declare_parameter("viewpoint_output_dir", "output/viewpoint_images")
+ self.declare_parameter("dump_object_type_images", False)
+ self.declare_parameter("object_type_dump_dir", "output/vlm_object_type")
+ self.declare_parameter("dump_room_type_images", False)
+ self.declare_parameter("room_type_dump_dir", "output/vlm_room_type")
+ self.declare_parameter("camera_param_path", "config/camera_param.yaml")
+ self.declare_parameter("platform", "mecanum")
+ self.declare_parameter("max_viewpoints", 3)
+ self.declare_parameter(
+ "room_types",
+ [
+ "Classroom",
+ "Laboratory",
+ "Office Room",
+ "Meeting Room",
+ "Restroom",
+ "Storage Room",
+ "Copy Room",
+ "Lounge",
+ "Reception",
+ "Hallway",
+ ],
+ )
+
+ self._vlm = self._build_vlm()
+ self._objects = self._load_object_labels()
+ self._viewpoint_dir = Path(str(self.get_parameter("viewpoint_output_dir").value))
+ self._platform = str(self.get_parameter("platform").value)
+ self._max_viewpoints = int(self.get_parameter("max_viewpoints").value)
+ camera_param_path = str(self.get_parameter("camera_param_path").value)
+ self._camera_params = load_camera_params(
+ camera_param_path, base_dir=Path(__file__).resolve().parent.parent
+ )
+ room_types_value = self.get_parameter("room_types").value
+ if isinstance(room_types_value, (list, tuple)):
+ self._room_types = [str(item) for item in room_types_value]
+ else:
+ self._room_types = []
+ self._instruction_state = InstructionState()
+ self._bridge = CvBridge()
+ self._dump_object_type_images = bool(
+ self.get_parameter("dump_object_type_images").value
+ )
+ self._object_type_dump_dir = Path(
+ str(self.get_parameter("object_type_dump_dir").value)
+ )
+ if self._dump_object_type_images:
+ self._object_type_dump_dir.mkdir(parents=True, exist_ok=True)
+ self._dump_room_type_images = bool(self.get_parameter("dump_room_type_images").value)
+ self._room_type_dump_dir = Path(str(self.get_parameter("room_type_dump_dir").value))
+ if self._dump_room_type_images:
+ self._room_type_dump_dir.mkdir(parents=True, exist_ok=True)
+
+ self._lock = threading.Lock()
+ self._instruction_queue: deque[str] = deque(maxlen=20)
+ self._object_type_queries: dict[int, ObjectType] = {}
+ self._room_type_queries: dict[int, RoomType] = {}
+ self._spatial_queries: dict[int, TargetObjectWithSpatial] = {}
+
+ object_type_query_topic = str(self.get_parameter("object_type_query_topic").value)
+ object_type_answer_topic = str(self.get_parameter("object_type_answer_topic").value)
+ target_object_spatial_query_topic = str(
+ self.get_parameter("target_object_spatial_query_topic").value
+ )
+ target_object_answer_topic = str(self.get_parameter("target_object_answer_topic").value)
+ target_object_instruction_topic = str(
+ self.get_parameter("target_object_instruction_topic").value
+ )
+ instruction_topic = str(self.get_parameter("instruction_topic").value)
+ room_type_query_topic = str(self.get_parameter("room_type_query_topic").value)
+ room_type_answer_topic = str(self.get_parameter("room_type_answer_topic").value)
+
+ self.create_subscription(ObjectType, object_type_query_topic, self._object_type_cb, 50)
+ self.create_subscription(
+ TargetObjectWithSpatial,
+ target_object_spatial_query_topic,
+ self._target_object_spatial_cb,
+ 10,
+ )
+ self.create_subscription(String, instruction_topic, self._instruction_cb, 10)
+ self.create_subscription(RoomType, room_type_query_topic, self._room_type_cb, 10)
+
+ self._object_type_pub = self.create_publisher(ObjectType, object_type_answer_topic, 50)
+ self._target_object_pub = self.create_publisher(
+ TargetObjectWithSpatial, target_object_answer_topic, 10
+ )
+ self._target_object_instruction_pub = self.create_publisher(
+ TargetObjectInstruction, target_object_instruction_topic, 10
+ )
+ self._room_type_pub = self.create_publisher(RoomType, room_type_answer_topic, 10)
+
+ self._executor = ThreadPoolExecutor(max_workers=4)
+ self.create_timer(0.1, self._process_queues)
+
+ self._room_type_labels: dict[int, RoomType] = {}
+ self._room_type_cache_s = float(self.get_parameter("room_type_cache_s").value)
+ self._room_type_last_query_ts: dict[int, float] = {}
+ self._last_log_times: dict[str, float] = {}
+ self.get_logger().debug(
+ f"VLM reasoning node ready (room_type_query={room_type_query_topic}, "
+ f"room_type_answer={room_type_answer_topic}, "
+ f"object_type_query={object_type_query_topic}, "
+ f"object_type_answer={object_type_answer_topic}, "
+ f"target_object_spatial_query={target_object_spatial_query_topic}, "
+ f"target_object_answer={target_object_answer_topic})"
+ )
+
+ def destroy_node(self) -> bool:
+ self._executor.shutdown(wait=False)
+ return super().destroy_node()
+
+ def _build_vlm(self) -> VlmModelBase:
+ provider = str(self.get_parameter("vlm_provider").value).lower()
+ model_name = str(self.get_parameter("vlm_model").value)
+ if provider == "qwen":
+ return QwenVlm(QwenConfig(model_name=model_name))
+ if provider == "moondream":
+ return MoondreamVlm(MoondreamConfig())
+ raise ValueError(f"Unknown VLM provider: {provider}")
+
+ def _load_object_labels(self) -> list[str]:
+ path = Path(str(self.get_parameter("objects_yaml_path").value))
+ if not path.exists():
+ self.get_logger().warning(f"objects.yaml not found: {path}")
+ return []
+ with path.open("r", encoding="utf-8") as handle:
+ data = yaml.safe_load(handle)
+ if isinstance(data, list):
+ return [str(item) for item in data]
+ return []
+
+ def _instruction_cb(self, msg: String) -> None:
+ self._instruction_queue.append(msg.data)
+
+ def _room_type_cb(self, msg: RoomType) -> None:
+ self._room_type_queries[msg.room_id] = msg
+
+ def _object_type_cb(self, msg: ObjectType) -> None:
+ self._object_type_queries[msg.object_id] = msg
+
+ def _target_object_spatial_cb(self, msg: TargetObjectWithSpatial) -> None:
+ self._spatial_queries[msg.object_id] = msg
+
+ def _process_queues(self) -> None:
+ if self._instruction_queue:
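+ # Only the newest instruction is handled; any older queued entries are dropped.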
+ instruction = self._instruction_queue.pop()
+ self._instruction_queue.clear()
+ self._submit_task("instruction", self._process_instruction, instruction)
+
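+ # The query dicts keep only the latest message per id, so repeated queries for the same id collapse into one task.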
+ if self._room_type_queries:
+ items = list(self._room_type_queries.values())
+ self._room_type_queries.clear()
+ for item in items:
+ self._submit_task("room_type", self._process_room_type, item)
+
+ if self._object_type_queries:
+ items = list(self._object_type_queries.values())
+ self._object_type_queries.clear()
+ for item in items:
+ self._submit_task("object_type", self._process_object_type, item)
+
+ if self._spatial_queries:
+ items = list(self._spatial_queries.values())
+ self._spatial_queries.clear()
+ for item in items:
+ self._submit_task("target_spatial", self._process_target_object_spatial, item)
+
+ def _process_instruction(self, instruction: str) -> None:
+ prompt = INSTRUCTION_DECOMPOSITION_PROMPT + f"\nInstruction: {instruction}"
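+ # query() takes an image argument, so a small blank frame stands in for this text-only prompt.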
+ blank = np.zeros((32, 32, 3), dtype=np.uint8)
+ response = self._vlm.query(blank, prompt)
+ parsed = parse_json_dict(response)
+ if not parsed:
+ self.get_logger().warning(f"Failed to parse instruction JSON: {response}")
+ return
+
+ state = InstructionState(
+ instruction=instruction,
+ target_object=str(parsed.get("target_object", "")).lower(),
+ room_condition=str(parsed.get("room_condition", "")).lower(),
+ spatial_condition=str(parsed.get("spatial_condition", "")).lower(),
+ attribute_condition=str(parsed.get("attribute_condition", "")).lower(),
+ anchor_object=str(parsed.get("anchor_object", "")).lower(),
+ attribute_condition_anchor=str(parsed.get("attribute_condition_anchor", "")).lower(),
+ )
+ with self._lock:
+ self._instruction_state = state
+
+ msg = TargetObjectInstruction()
+ msg.target_object = state.target_object
+ msg.room_condition = state.room_condition
+ msg.spatial_condition = state.spatial_condition
+ msg.attribute_condition = state.attribute_condition
+ msg.anchor_object = state.anchor_object
+ msg.attribute_condition_anchor = state.attribute_condition_anchor
+ self._target_object_instruction_pub.publish(msg)
+
+ def _process_room_type(self, msg: RoomType) -> None:
+ if not msg.image.data or not msg.room_mask.data:
+ self.get_logger().warning(f"Room type query missing image or mask: {msg.room_id}")
+ return
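+ # Serve recently answered rooms from the local cache instead of issuing another VLM query.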
+ cached = self._room_type_labels.get(msg.room_id)
+ if cached and self._room_type_cache_s > 0:
+ last_query = self._room_type_last_query_ts.get(msg.room_id, 0.0)
+ if time.time() - last_query < self._room_type_cache_s:
+ answer = RoomType()
+ answer.header = msg.header
+ answer.anchor_point = msg.anchor_point
+ answer.image = msg.image
+ answer.room_mask = msg.room_mask
+ answer.room_type = cached.room_type
+ answer.room_id = msg.room_id
+ answer.voxel_num = msg.voxel_num
+ answer.in_room = msg.in_room
+ self._room_type_pub.publish(answer)
+ with self._lock:
+ self._room_type_labels[answer.room_id] = answer
+ self._log_throttled(
+ "room_type_cache",
+ "Room type cached: room_id=%s type=%s",
+ answer.room_id,
+ answer.room_type,
+ interval_s=5.0,
+ )
+ return
+ image = self._bridge.imgmsg_to_cv2(msg.image, desired_encoding="bgr8")
+ mask = self._bridge.imgmsg_to_cv2(msg.room_mask, desired_encoding="mono8")
+ mosaic = self._build_room_type_mosaic(image, mask)
+ if self._dump_room_type_images:
+ timestamp_ms = int(time.time() * 1000)
+ base = self._room_type_dump_dir / f"room_{msg.room_id}_{timestamp_ms}"
+ mask_uint8 = mask.astype(np.uint8)
+ if mask_uint8.max() <= 1:
+ mask_uint8 = (mask_uint8 * 255).astype(np.uint8)
+ cv2.imwrite(str(base.parent / f"{base.name}_image.png"), image)
+ cv2.imwrite(str(base.parent / f"{base.name}_mask.png"), mask_uint8)
+ cv2.imwrite(
+ str(base.parent / f"{base.name}_mosaic.png"),
+ cv2.cvtColor(mosaic, cv2.COLOR_RGB2BGR),
+ )
+
+ room_types = list(self._room_types)
+ if not msg.in_room and "Corridor" in room_types:
+ room_types = [room for room in room_types if room != "Corridor"]
+ prompt = build_room_type_prompt(room_types)
+ start_time = time.time()
+ try:
+ response = self._vlm.query(mosaic, prompt)
+ except Exception as exc:
+ self.get_logger().error(f"Room type query failed: {exc}")
+ return
+ self._room_type_last_query_ts[msg.room_id] = time.time()
+ duration_s = time.time() - start_time
+ parsed = parse_json_dict(response)
+ if parsed:
+ room_type = str(parsed.get("room_type", "")).strip()
+ else:
+ room_type = response.strip().strip('"')
+ self._log_throttled(
+ "room_type_parse",
+ "Room type parse failed: room_id=%s response=%s",
+ msg.room_id,
+ self._preview_response(response),
+ interval_s=5.0,
+ )
+ answer = RoomType()
+ answer.header = msg.header
+ answer.anchor_point = msg.anchor_point
+ answer.image = msg.image
+ answer.room_mask = msg.room_mask
+ answer.room_type = room_type.lower()
+ answer.room_id = msg.room_id
+ answer.voxel_num = msg.voxel_num
+ answer.in_room = msg.in_room
+ self._room_type_pub.publish(answer)
+ with self._lock:
+ self._room_type_labels[answer.room_id] = answer
+ self._log_throttled(
+ "room_type_answer",
+ "Room type answer: room_id=%s type=%s (%.2fs)",
+ answer.room_id,
+ answer.room_type,
+ duration_s,
+ interval_s=2.0,
+ )
+
+ def _process_object_type(self, msg: ObjectType) -> None:
+ image = load_npy_image(msg.img_path)
+ if image is None:
+ self.get_logger().warning(f"Failed to load image: {msg.img_path}")
+ return
+ image = annotate_mask(image, msg.img_path)
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+ if self._dump_object_type_images:
+ timestamp_ms = int(time.time() * 1000)
+ dump_path = self._object_type_dump_dir / f"{msg.object_id}_{timestamp_ms}.png"
+ cv2.imwrite(str(dump_path), cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR))
+
+ labels = list(msg.labels) if msg.labels else self._objects
+ prompt = build_object_type_prompt(labels)
+ start_time = time.time()
+ try:
+ response = self._vlm.query(image_rgb, prompt)
+ except Exception as exc:
+ self.get_logger().error(f"Object type query failed: {exc}")
+ return
+ duration_s = time.time() - start_time
+ parsed = parse_json_dict(response)
+ if parsed:
+ final_label = str(parsed.get("label", labels[0] if labels else "unknown")).lower()
+ else:
+ final_label = str(labels[0] if labels else "unknown").lower()
+ self._log_throttled(
+ "object_type_parse",
+ "Object type parse failed: object_id=%s response=%s",
+ msg.object_id,
+ self._preview_response(response),
+ interval_s=5.0,
+ )
+
+ answer = ObjectType()
+ answer.object_id = msg.object_id
+ answer.img_path = msg.img_path
+ answer.final_label = final_label
+ answer.labels = msg.labels
+ self._object_type_pub.publish(answer)
+ self._log_throttled(
+ "object_type_answer",
+ "Object type answer: object_id=%s label=%s (%.2fs)",
+ answer.object_id,
+ answer.final_label,
+ duration_s,
+ interval_s=2.0,
+ )
+
+ @staticmethod
+ def _build_room_type_mosaic(image: np.ndarray, mask: np.ndarray) -> np.ndarray:
+ if image.ndim == 2:
+ image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
+ if mask.ndim == 3 and mask.shape[2] == 1:
+ mask = mask[:, :, 0]
+ if mask.ndim == 2:
+ mask_uint8 = mask.astype(np.uint8)
+ if mask_uint8.max() <= 1:
+ mask_uint8 = (mask_uint8 * 255).astype(np.uint8)
+ else:
+ mask_uint8 = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)
+
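+ # If the mask is essentially empty (< 0.5% non-zero pixels), send the plain image without a mask panel.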
+ nonzero_ratio = float(np.count_nonzero(mask_uint8)) / float(mask_uint8.size)
+ if nonzero_ratio < 0.005:
+ return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
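+ # Otherwise place the image and the room mask side by side so the VLM sees both appearance and extent.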
+ mask_bgr = cv2.cvtColor(mask_uint8, cv2.COLOR_GRAY2BGR)
+ target_h = image.shape[0]
+ if mask_bgr.shape[0] != target_h:
+ scale = target_h / mask_bgr.shape[0]
+ new_w = max(1, int(mask_bgr.shape[1] * scale))
+ mask_bgr = cv2.resize(mask_bgr, (new_w, target_h), interpolation=cv2.INTER_NEAREST)
+ mosaic = cv2.hconcat([image, mask_bgr])
+ return cv2.cvtColor(mosaic, cv2.COLOR_BGR2RGB)
+
+ def _process_target_object_spatial(self, msg: TargetObjectWithSpatial) -> None:
+ image = load_npy_image(msg.img_path)
+ if image is None:
+ self.get_logger().warning(f"Failed to load image: {msg.img_path}")
+ return
+ image = annotate_mask(image, msg.img_path)
+
+ prompt = self._build_target_spatial_prompt(msg)
+ mosaic = self._build_spatial_mosaic(image, msg)
+ try:
+ response = self._vlm.query(mosaic, prompt)
+ except Exception as exc:
+ self.get_logger().error(f"Target spatial query failed: {exc}")
+ return
+ parsed = parse_json_dict(response)
+ # Treat an unparseable VLM response as a negative answer rather than failing on a missing dict.
+ is_target = bool(parsed.get("is_target", False)) if parsed else False
+
+ answer = TargetObjectWithSpatial()
+ answer.header = msg.header
+ answer.object_id = msg.object_id
+ answer.object_label = msg.object_label
+ answer.img_path = msg.img_path
+ answer.viewpoint_ids = msg.viewpoint_ids
+ answer.room_label = msg.room_label
+ answer.is_target = is_target
+ answer.bbox3d = msg.bbox3d
+ self._target_object_pub.publish(answer)
+ self._log_throttled(
+ "target_spatial_answer",
+ "Target spatial answer: object_id=%s is_target=%s",
+ answer.object_id,
+ answer.is_target,
+ )
+
+ def _log_throttled(self, key: str, msg: str, *args: object, interval_s: float = 5.0) -> None:
+ now = time.time()
+ last = self._last_log_times.get(key, 0.0)
+ if now - last < interval_s:
+ return
+ self._last_log_times[key] = now
+ formatted = msg % args if args else msg
+ self.get_logger().debug(formatted)
+
+ @staticmethod
+ def _preview_response(response: object, limit: int = 160) -> str:
+ text = response if isinstance(response, str) else str(response)
+ text = text.replace("\n", " ").strip()
+ if len(text) > limit:
+ text = f"{text[:limit]}..."
+ return text
+
+ def _submit_task(self, name: str, func: callable, *args: object) -> None:
+ future = self._executor.submit(self._run_task, name, func, *args)
+ future.add_done_callback(self._task_done)
+
+ def _run_task(self, name: str, func: callable, *args: object) -> None:
+ try:
+ func(*args)
+ except Exception as exc:
+ self.get_logger().error(f"VLM task {name} failed: {exc}")
+
+ def _task_done(self, future) -> None:
+ _ = future
+
+ def _build_target_spatial_prompt(self, msg: TargetObjectWithSpatial) -> str:
+ with self._lock:
+ state = self._instruction_state
+ instruction = state.instruction or ""
+ description = f"Object: {msg.object_label}, Room: {msg.room_label or 'unknown'}"
+ detail = (
+ f"Target: {state.target_object}, Room Condition: {state.room_condition or 'none'}, "
+ f"Spatial Condition: {state.spatial_condition or 'none'}, "
+ f"Attribute Condition: {state.attribute_condition or 'none'}"
+ )
+ return f"{TARGET_OBJECT_SPATIAL_PROMPT}\n{instruction}\n{detail}\n{description}"
+
+ def _build_spatial_mosaic(self, image: np.ndarray, msg: TargetObjectWithSpatial) -> np.ndarray:
+ images = [image]
+ if len(msg.viewpoint_ids) > 0:
+ bbox3d = np.array([[p.x, p.y, p.z] for p in msg.bbox3d], dtype=float)
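+ # Project the object's 3D bounding box onto each equirectangular viewpoint image before adding it to the mosaic.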
+ for vp_id in list(msg.viewpoint_ids)[: self._max_viewpoints]:
+ vp_image = self._load_viewpoint_image(vp_id)
+ if vp_image is None:
+ continue
+ transform = self._load_viewpoint_transform(vp_id)
+ if transform is not None and self._camera_params is not None:
+ vp_image = project_bbox3d_equirect(
+ image=vp_image,
+ bbox3d=bbox3d,
+ body_to_world=transform,
+ sensor_to_camera=self._camera_params.sensor_to_camera,
+ image_width=self._camera_params.image_width,
+ image_height=self._camera_params.image_height,
+ hfov=self._camera_params.hfov,
+ vfov=self._camera_params.vfov,
+ )
+ images.append(vp_image)
+ return concat_images(images)
+
+ def _load_viewpoint_image(self, viewpoint_id: int) -> np.ndarray | None:
+ image_path = self._viewpoint_dir / f"viewpoint_{viewpoint_id}.png"
+ if not image_path.exists():
+ return None
+ return cv2.imread(str(image_path))
+
+ def _load_viewpoint_transform(self, viewpoint_id: int) -> np.ndarray | None:
+ transform_path = self._viewpoint_dir / f"viewpoint_{viewpoint_id}_transform.npy"
+ if not transform_path.exists():
+ return None
+ return np.load(str(transform_path))
+
+
+
+def main() -> None:
+ rclpy.init()
+ node = VlmReasoningNode()
+ try:
+ rclpy.spin(node)
+ finally:
+ node.destroy_node()
+ rclpy.shutdown()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/sensor_coverage/CMakeLists.txt b/sensor_coverage/CMakeLists.txt
new file mode 100644
index 0000000..ed2cab6
--- /dev/null
+++ b/sensor_coverage/CMakeLists.txt
@@ -0,0 +1,68 @@
+cmake_minimum_required(VERSION 3.10)
+project(sensor_coverage)
+
+find_package(ament_cmake REQUIRED)
+find_package(rclcpp REQUIRED)
+find_package(sensor_msgs REQUIRED)
+find_package(nav_msgs REQUIRED)
+find_package(geometry_msgs REQUIRED)
+find_package(visualization_msgs REQUIRED)
+find_package(std_msgs REQUIRED)
+find_package(tf2 REQUIRED)
+find_package(tf2_ros REQUIRED)
+find_package(cv_bridge REQUIRED)
+find_package(OpenCV REQUIRED)
+find_package(Eigen3 REQUIRED)
+find_package(PCL REQUIRED COMPONENTS common io filters kdtree segmentation features)
+find_package(pcl_conversions REQUIRED)
+find_package(semantic_mapping REQUIRED)
+
+add_executable(room_segmentation_node
+ src/room_segmentation/room_segmentation.cpp
+ src/room_segmentation/room_node_rep.cpp
+ src/utils/misc_utils.cpp
+)
+
+target_include_directories(room_segmentation_node
+ PUBLIC
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+ $<INSTALL_INTERFACE:include>
+ ${PCL_INCLUDE_DIRS}
+)
+
+target_link_libraries(room_segmentation_node
+ ${PCL_LIBRARIES}
+ ${OpenCV_LIBRARIES}
+)
+
+ament_target_dependencies(room_segmentation_node
+ rclcpp
+ sensor_msgs
+ nav_msgs
+ geometry_msgs
+ visualization_msgs
+ std_msgs
+ tf2
+ tf2_ros
+ cv_bridge
+ pcl_conversions
+ semantic_mapping
+)
+
+install(TARGETS room_segmentation_node
+ DESTINATION lib/${PROJECT_NAME}
+)
+
+install(DIRECTORY include/
+ DESTINATION include
+)
+
+install(DIRECTORY config
+ DESTINATION share/${PROJECT_NAME}
+)
+
+install(FILES resource/${PROJECT_NAME}
+ DESTINATION share/${PROJECT_NAME}/resource
+)
+
+ament_package()
diff --git a/sensor_coverage/config/room_segmentation_config.yaml b/sensor_coverage/config/room_segmentation_config.yaml
new file mode 100644
index 0000000..89c4ba4
--- /dev/null
+++ b/sensor_coverage/config/room_segmentation_config.yaml
@@ -0,0 +1,24 @@
+room_segmentation_node:
+ ros__parameters:
+ exploredAreaDisplayInterval: 1
+ exploredAreaVoxelSize: 0.1
+ dilation_iteration: 3
+ room_resolution: 0.1
+ room_x: 3000
+ room_y: 3000
+ room_z: 80
+ rolling_occupancy_grid.resolution_x: 0.2
+ ceilingHeight_: 2.3
+ wall_thres_height_: 0.1
+ outward_distance_0: 0.5
+ outward_distance_1: 0.3
+ distance_threshold: 2.0
+ distance_angel_threshold: 0.3
+ angle_threshold_deg: 6.0
+ region_growing_radius: 15.0
+ min_room_size: 40
+ normal_search_num: 50
+ normal_search_radius: 0.5
+ kViewPointCollisionMarginZPlus: 0.75
+ kViewPointCollisionMarginZMinus: 0.4
+ isDebug: false
diff --git a/sensor_coverage/include/sensor_coverage/room_segmentation/room_node_rep.h b/sensor_coverage/include/sensor_coverage/room_segmentation/room_node_rep.h
new file mode 100644
index 0000000..3613017
--- /dev/null
+++ b/sensor_coverage/include/sensor_coverage/room_segmentation/room_node_rep.h
@@ -0,0 +1,113 @@
+/**
+ * @file room_node_rep.h
+ * @author Haokun Zhu (haokunz@andrew.cmu.edu)
+ * @brief Room node representation for room segmentation
+ * @version 0.1
+ * @date 2026-01-04
+ *
+ * @copyright Copyright (c) 2026
+ *
+ */
+
+#pragma once
+
+#include
+#include