diff --git a/.gitignore b/.gitignore
index ccd784f..4a54642 100644
--- a/.gitignore
+++ b/.gitignore
@@ -209,3 +209,12 @@ cython_debug/
marimo/_static/
marimo/_lsp/
__marimo__/
+
+# model files
+*.pt
+*.pth
+*.onnx
+*.engine
+*.ts
+
+output/
\ No newline at end of file
diff --git a/README.md b/README.md
index 49392b8..33bff36 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,8 @@ ROS2 perception stack for generalist robotics.
## Packages
- **track_anything** - EdgeTAM tracking + 3D segmentation with RGBD
+- **detect_anything** - YOLO-E detection node and utilities
+- **semantic_mapping** - semantic 3D mapping with VLM query hooks
- **vector_perception_utils** - Image and point cloud utilities
## Installation
@@ -18,9 +20,16 @@ source ~/vector_venv/bin/activate
cd /home/alex-lin/dev/vector_perception_ros
pip install -r requirements.txt
+# System dependencies
+sudo apt install -y ros-dev-tools ros-jazzy-pcl-ros libpcl-dev git cmake libgoogle-glog-dev libgflags-dev libatlas-base-dev libeigen3-dev libsuitesparse-dev nlohmann-json3-dev ros-jazzy-backward-ros
+
# Build
source /opt/ros/jazzy/setup.bash
colcon build
+```
@@ -110,4 +119,5 @@ for det in detections:
See package READMEs:
- [track_anything/README.md](track_anything/README.md)
+- [semantic_mapping/README.md](semantic_mapping/README.md)
- [vector_perception_utils/README.md](vector_perception_utils/README.md)
diff --git a/detect_anything/CMakeLists.txt b/detect_anything/CMakeLists.txt
new file mode 100644
index 0000000..ea8c416
--- /dev/null
+++ b/detect_anything/CMakeLists.txt
@@ -0,0 +1,54 @@
+cmake_minimum_required(VERSION 3.10)
+project(detect_anything)
+
+find_package(ament_cmake REQUIRED)
+find_package(rclpy REQUIRED)
+find_package(sensor_msgs REQUIRED)
+find_package(std_msgs REQUIRED)
+find_package(vision_msgs REQUIRED)
+find_package(cv_bridge REQUIRED)
+find_package(rosidl_default_generators REQUIRED)
+find_package(Python3 REQUIRED COMPONENTS Interpreter)
+
+rosidl_generate_interfaces(${PROJECT_NAME}
+ "msg/DetectionResult.msg"
+ DEPENDENCIES std_msgs sensor_msgs
+)
+
+set(PYTHON_INSTALL_DIR "lib/python${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}/site-packages")
+
+install(
+ DIRECTORY detect_anything
+ DESTINATION ${PYTHON_INSTALL_DIR}
+)
+
+install(
+ PROGRAMS
+ scripts/detection_node
+ DESTINATION lib/${PROJECT_NAME}
+)
+
+install(
+ FILES resource/detect_anything
+ DESTINATION share/${PROJECT_NAME}/resource
+)
+
+install(
+ DIRECTORY config
+ DESTINATION share/${PROJECT_NAME}
+)
+
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/models")
+ install(
+ DIRECTORY models
+ DESTINATION share/${PROJECT_NAME}
+ )
+endif()
+
+set(ENV_HOOK "${CMAKE_CURRENT_SOURCE_DIR}/env-hooks/venv_pythonpath.sh.in")
+set(ENV_HOOK_OUT "${CMAKE_CURRENT_BINARY_DIR}/ament_cmake_environment_hooks/venv_pythonpath.sh")
+configure_file(${ENV_HOOK} ${ENV_HOOK_OUT} @ONLY)
+ament_environment_hooks(${ENV_HOOK_OUT})
+
+ament_export_dependencies(rosidl_default_runtime)
+ament_package()
diff --git a/detect_anything/README.md b/detect_anything/README.md
new file mode 100644
index 0000000..1e94a06
--- /dev/null
+++ b/detect_anything/README.md
@@ -0,0 +1,28 @@
+# detect_anything
+
+YOLO-E detection node that publishes `DetectionResult` messages (with bbox-cropped masks) and an annotated overlay image topic.
+
+## What’s inside
+- `detect_anything/detection.py`: detection results container and Ultralytics parsing helpers.
+- `detect_anything/yoloe.py`: YOLO-E wrapper with prompt support and basic filtering.
+- `detect_anything/detection_node.py`: ROS2 node wiring the detector to `DetectionResult`.
+- `msg/DetectionResult.msg`: compressed image + cropped mask array.
+
+## Quick start
+```bash
+source ~/vector_venv/bin/activate
+source /opt/ros/jazzy/setup.bash
+colcon build --packages-select detect_anything
+source install/setup.bash
+
+ros2 run detect_anything detection_node \
+ --ros-args -p model_path:=/path/to/yoloe/models \
+ -p model_name:=yoloe-11s-seg-pf.pt \
+ -p conf:=0.6 \
+ -p max_area_ratio:=0.3 \
+ -p image_topic:=/camera/image
+```
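+
+The wrapper can also be used directly from Python, without ROS, once the workspace is built and sourced. A minimal sketch (the model path, prompt list, and file names below are illustrative):
+
+```python
+import cv2
+
+from detect_anything.yoloe import Yoloe2DDetector, YoloePromptMode
+
+# Text-prompted detection; weights are resolved (and downloaded if missing) under model_path.
+detector = Yoloe2DDetector(
+    model_path="/path/to/yoloe/models",  # illustrative path
+    model_name="yoloe-11s-seg.pt",
+    prompt_mode=YoloePromptMode.PROMPT,
+    conf=0.5,
+)
+detector.set_prompts(text=["chair", "desk", "plant"])
+
+image = cv2.imread("frame.png")  # any BGR image
+results = detector.process_image(image)
+cv2.imwrite("overlay.png", results.overlay())
+detector.stop()
+```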
+
+Topics:
+- Publishes `/detection_result` (`detect_anything/DetectionResult`) and `/annotated_image_detection` (`sensor_msgs/Image`).
+- Subscribes to `/camera/image` (or `/camera/image/compressed` if `use_compressed:=true`).
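+
+A minimal consumer sketch for the detection topic (illustrative, not part of the package); `DetectionResults.from_msg` rebuilds full-size masks from the bbox-cropped transport format:
+
+```python
+import rclpy
+from rclpy.node import Node
+
+from detect_anything.detection import DetectionResults
+from detect_anything.msg import DetectionResult
+
+
+class DetectionConsumer(Node):
+    def __init__(self) -> None:
+        super().__init__("detection_consumer")
+        # Same topic the detection node publishes on.
+        self.create_subscription(DetectionResult, "/detection_result", self.on_detections, 10)
+
+    def on_detections(self, msg: DetectionResult) -> None:
+        results = DetectionResults.from_msg(msg)  # masks restored at their bbox locations
+        for det in results.detections:
+            self.get_logger().info(f"{det.label} ({det.confidence:.2f}) id={det.track_id}")
+
+
+def main() -> None:
+    rclpy.init()
+    rclpy.spin(DetectionConsumer())
+
+
+if __name__ == "__main__":
+    main()
+```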
diff --git a/detect_anything/config/objects.yaml b/detect_anything/config/objects.yaml
new file mode 100644
index 0000000..371f186
--- /dev/null
+++ b/detect_anything/config/objects.yaml
@@ -0,0 +1,46 @@
+# Simple list of object names for prompting YOLO-E
+- chair
+- desk
+- tv_monitor
+- sofa
+- unknown
+- printer
+- coffee machine
+- refrigerator
+- trash can
+- shoe
+- sink
+- table
+- oven
+- bed
+- painting
+- bulletin board
+- plant
+- vase
+- cabinet
+- shelf
+- book
+- cup
+- sculpture
+- keyboard
+- mouse
+- clock
+- phone
+- toilet
+- bathtub
+- microwave oven
+- pan
+- suitcase
+- light
+- curtain
+- whiteboard
+- shower knob
+- bottle
+- water dispenser
+- vending machine
+- laptop
+- bag
+- locker
+- picture
+- cardboard
+- extinguisher
diff --git a/detect_anything/detect_anything/__init__.py b/detect_anything/detect_anything/__init__.py
new file mode 100644
index 0000000..9c61aab
--- /dev/null
+++ b/detect_anything/detect_anything/__init__.py
@@ -0,0 +1 @@
+"""detect_anything package."""
diff --git a/detect_anything/detect_anything/detection.py b/detect_anything/detect_anything/detection.py
new file mode 100644
index 0000000..1d64d76
--- /dev/null
+++ b/detect_anything/detect_anything/detection.py
@@ -0,0 +1,256 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import List, Optional, Sequence
+
+from cv_bridge import CvBridge
+import cv2
+import numpy as np
+
+from detect_anything.msg import DetectionResult
+
+_BRIDGE = CvBridge()
+
+
+@dataclass
+class SingleDetectionResult:
+ """Simple container for a YOLO/Ultralytics detection with a segmentation mask."""
+
+ bbox: tuple[float, float, float, float]
+ track_id: int
+ class_id: int
+ label: str
+ confidence: float
+ mask: np.ndarray
+
+ def area(self) -> float:
+ x1, y1, x2, y2 = self.bbox
+ return max(0.0, (x2 - x1)) * max(0.0, (y2 - y1))
+
+
+@dataclass
+class DetectionResults:
+ """Container for detections plus the source image."""
+
+ image: np.ndarray
+ detections: List[SingleDetectionResult]
+
+ def overlay(self, alpha: float = 0.4, thickness: int = 2) -> np.ndarray:
+ """Draw bboxes and masks on the image."""
+ output = self.image.copy()
+ for det in self.detections:
+ rng = np.random.default_rng(det.track_id)
+ color = tuple(int(c) for c in rng.integers(50, 255, size=3))
+
+ if det.mask.size:
+ mask_bool = det.mask.astype(bool)
+ if mask_bool.shape[:2] == output.shape[:2]:
+ colored_mask = np.zeros_like(output)
+ colored_mask[mask_bool] = color
+ output[mask_bool] = cv2.addWeighted(
+ output[mask_bool], 1 - alpha, colored_mask[mask_bool], alpha, 0
+ )
+ contours, _ = cv2.findContours(det.mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+ cv2.drawContours(output, contours, -1, color, thickness)
+
+ x1, y1, x2, y2 = map(int, det.bbox)
+ cv2.rectangle(output, (x1, y1), (x2, y2), color, thickness)
+
+ label_text = f"{det.label} {det.confidence:.2f}"
+ label_text = f"{label_text} | id {det.track_id}"
+
+ (text_w, text_h), baseline = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
+ cv2.rectangle(
+ output,
+ (x1, max(0, y1 - text_h - baseline - 2)),
+ (x1 + text_w, max(0, y1)),
+ color,
+ -1,
+ )
+ cv2.putText(
+ output,
+ label_text,
+ (x1, max(0, y1 - baseline - 2)),
+ cv2.FONT_HERSHEY_SIMPLEX,
+ 0.5,
+ (255, 255, 255),
+ 1,
+ cv2.LINE_AA,
+ )
+ return output
+
+ def to_msg(self, header) -> DetectionResult:
+ """
+ Convert detections into a DetectionResult ROS message.
+ """
+ msg = DetectionResult()
+ msg.header = header
+ msg.image = _BRIDGE.cv2_to_compressed_imgmsg(self.image, dst_format="jpeg")
+ msg.image.header = header
+
+ for det in self.detections:
+ msg.track_id.append(int(det.track_id))
+ msg.x1.append(float(det.bbox[0]))
+ msg.y1.append(float(det.bbox[1]))
+ msg.x2.append(float(det.bbox[2]))
+ msg.y2.append(float(det.bbox[3]))
+ msg.label.append(str(det.label))
+ msg.confidence.append(float(det.confidence))
+
+ mask = det.mask
+ # Crop mask to bbox and align origin to bbox top-left for transport.
+ x1, y1, x2, y2 = [int(v) for v in det.bbox]
+ h, w = self.image.shape[:2]
+ x1 = max(0, min(x1, w))
+ x2 = max(0, min(x2, w))
+ y1 = max(0, min(y1, h))
+ y2 = max(0, min(y2, h))
+ crop_h = max(1, y2 - y1)
+ crop_w = max(1, x2 - x1)
+ if mask.size == 0:
+ mask = np.zeros((crop_h, crop_w), dtype=np.uint8)
+ else:
+ mask = mask[y1:y2, x1:x2]
+ if mask.shape != (crop_h, crop_w):
+ mask = cv2.resize(
+ mask.astype(np.uint8),
+ (crop_w, crop_h),
+ interpolation=cv2.INTER_NEAREST,
+ )
+
+ mask_msg = _BRIDGE.cv2_to_compressed_imgmsg(mask, dst_format="png")
+ mask_msg.header = header
+ msg.masks.append(mask_msg)
+
+ return msg
+
+ @classmethod
+ def from_msg(cls, msg: DetectionResult) -> DetectionResults:
+ """
+ Construct DetectionResults from a DetectionResult ROS message.
+ Masks are converted back to full-size overlays at bbox locations.
+ """
+ image = _BRIDGE.compressed_imgmsg_to_cv2(msg.image)
+ detections: List[SingleDetectionResult] = []
+
+ for i in range(len(msg.track_id)):
+ bbox = (
+ float(msg.x1[i]),
+ float(msg.y1[i]),
+ float(msg.x2[i]),
+ float(msg.y2[i]),
+ )
+ mask_img = _BRIDGE.compressed_imgmsg_to_cv2(msg.masks[i], desired_encoding="mono8")
+
+ full_mask = np.zeros(image.shape[:2], dtype=np.uint8)
+ x1, y1, x2, y2 = [int(v) for v in bbox]
+ h, w = image.shape[:2]
+ x1 = max(0, min(x1, w))
+ x2 = max(0, min(x2, w))
+ y1 = max(0, min(y1, h))
+ y2 = max(0, min(y2, h))
+ crop_h = max(1, y2 - y1)
+ crop_w = max(1, x2 - x1)
+ if mask_img.shape[:2] != (crop_h, crop_w):
+ mask_img = cv2.resize(mask_img, (crop_w, crop_h), interpolation=cv2.INTER_NEAREST)
+ full_mask[y1:y2, x1:x2] = mask_img
+
+ detections.append(
+ SingleDetectionResult(
+ bbox=bbox,
+ track_id=int(msg.track_id[i]),
+ class_id=0, # class_id is not transmitted in DetectionResult
+ label=str(msg.label[i]),
+ confidence=float(msg.confidence[i]),
+ mask=full_mask,
+ )
+ )
+
+ return cls(image=image, detections=detections)
+
+
+def _from_ultralytics_result(
+ result,
+ idx: int,
+ image_shape: Optional[tuple[int, int]] = None,
+ names: Optional[Sequence[str]] = None,
+) -> SingleDetectionResult:
+ """Convert a single Ultralytics Results entry into a SingleDetectionResult."""
+ from ultralytics.engine.results import Results # local import
+
+ if not isinstance(result, Results):
+ raise TypeError("result must be an ultralytics Results object")
+ if result.boxes is None:
+ raise ValueError("Result has no boxes")
+
+ bbox_array = result.boxes.xyxy[idx].cpu().numpy()
+ bbox = (
+ float(bbox_array[0]),
+ float(bbox_array[1]),
+ float(bbox_array[2]),
+ float(bbox_array[3]),
+ )
+
+ confidence = float(result.boxes.conf[idx].cpu())
+ class_id = int(result.boxes.cls[idx].cpu())
+
+ resolved_names = names
+ if resolved_names is None and hasattr(result, "names"):
+ resolved_names = result.names # type: ignore[assignment]
+
+ label = f"class_{class_id}"
+ if resolved_names is not None:
+ if isinstance(resolved_names, dict):
+ label = resolved_names.get(class_id, label)
+ elif isinstance(resolved_names, (list, tuple)) and class_id < len(resolved_names):
+ label = resolved_names[class_id]
+
+ track_id = idx
+ if hasattr(result.boxes, "id") and result.boxes.id is not None:
+ track_id = int(result.boxes.id[idx].cpu())
+
+ target_h, target_w = (image_shape or (None, None))
+ if target_h is None or target_w is None:
+ if hasattr(result, "orig_shape") and result.orig_shape:
+ target_h, target_w = result.orig_shape[:2]
+
+ mask = np.zeros((target_h, target_w), dtype=np.uint8) if target_h and target_w else np.zeros((0, 0), dtype=np.uint8)
+ if result.masks is not None and idx < len(result.masks.data):
+ mask_tensor = result.masks.data[idx]
+ mask_np = mask_tensor.cpu().numpy()
+
+ if mask_np.ndim == 3:
+ mask_np = mask_np.squeeze()
+
+ if target_h and target_w and mask_np.shape != (target_h, target_w):
+ mask_np = cv2.resize(mask_np.astype(np.float32), (target_w, target_h), interpolation=cv2.INTER_LINEAR)
+
+ mask = (mask_np > 0.5).astype(np.uint8) * 255 # type: ignore[assignment]
+
+ return SingleDetectionResult(
+ bbox=bbox,
+ track_id=track_id,
+ class_id=class_id,
+ label=label,
+ confidence=confidence,
+ mask=mask,
+ )
+
+
+def from_ultralytics_results(
+ results,
+ image_shape: Optional[tuple[int, int]] = None,
+ names: Optional[Sequence[str]] = None,
+) -> List[SingleDetectionResult]:
+ """Convert a list of Ultralytics Results objects into detections with masks."""
+ detections: List[SingleDetectionResult] = []
+ for result in results:
+ if result.boxes is None:
+ continue
+ num_detections = len(result.boxes.xyxy)
+ for i in range(num_detections):
+ detections.append(_from_ultralytics_result(result, i, image_shape=image_shape, names=names))
+ return detections
+
+
+__all__ = ["DetectionResults", "SingleDetectionResult", "from_ultralytics_results"]
diff --git a/detect_anything/detect_anything/detection_node.py b/detect_anything/detect_anything/detection_node.py
new file mode 100644
index 0000000..7aa0870
--- /dev/null
+++ b/detect_anything/detect_anything/detection_node.py
@@ -0,0 +1,177 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import List, Optional
+
+import yaml
+import rclpy
+from cv_bridge import CvBridge
+from ament_index_python.packages import get_package_share_directory
+from sensor_msgs.msg import CompressedImage, Image
+from rclpy.node import Node
+from rclpy.qos import DurabilityPolicy, QoSProfile, ReliabilityPolicy
+
+from detect_anything.detection import DetectionResults
+from detect_anything.yoloe import Yoloe2DDetector, YoloePromptMode
+from detect_anything.msg import DetectionResult
+
+
+class DetectionNode(Node):
+ """ROS2 node that runs YOLO-E detection and publishes DetectionResult."""
+
+ def __init__(self) -> None:
+ super().__init__("detect_anything_detection_node")
+ self.bridge = CvBridge()
+
+ # Parameters
+ defaults = self._default_params()
+ self.declare_parameter("image_topic", defaults["image_topic"])
+ self.declare_parameter("compressed_image_topic", defaults["compressed_image_topic"])
+ self.declare_parameter("use_compressed", defaults["use_compressed"])
+ self.declare_parameter("model_path", defaults["model_path"])
+ self.declare_parameter("model_name", defaults["model_name"])
+ self.declare_parameter("device", defaults["device"])
+ self.declare_parameter("prompt_mode", defaults["prompt_mode"])
+ self.declare_parameter("objects_file", defaults["objects_file"])
+ self.declare_parameter("conf", defaults["conf"])
+ self.declare_parameter("max_area_ratio", defaults["max_area_ratio"])
+
+ params = self._default_params()
+ image_topic = str(self.get_parameter("image_topic").value)
+ compressed_image_topic = str(self.get_parameter("compressed_image_topic").value)
+ self.use_compressed = bool(self.get_parameter("use_compressed").value)
+
+ base_share = Path(get_package_share_directory("detect_anything"))
+ model_path_val = self.get_parameter("model_path").value
+ model_path = str(model_path_val) if model_path_val not in (None, "") else params["model_path"]
+ if str(model_path).lower() == "none":
+ model_path = params["model_path"]
+ model_path = Path(model_path).expanduser()
+ if not model_path.is_absolute():
+ model_path = (base_share / model_path).resolve()
+
+ model_name = str(self.get_parameter("model_name").value or params["model_name"])
+ device = str(self.get_parameter("device").value or params["device"])
+ prompt_mode_str = str(self.get_parameter("prompt_mode").value or params["prompt_mode"]).lower()
+ objects_file = Path(str(self.get_parameter("objects_file").value or params["objects_file"]))
+ if not objects_file.is_absolute():
+ objects_file = (base_share / objects_file).resolve()
+ conf = float(self.get_parameter("conf").value)
+ max_area_ratio = float(self.get_parameter("max_area_ratio").value)
+
+ prompt_mode = YoloePromptMode(prompt_mode_str)
+
+ # Auto-pick the right default weight if user didn't override
+ if prompt_mode == YoloePromptMode.LRPC and ("seg.pt" in model_name and "seg-pf.pt" not in model_name):
+ model_name = "yoloe-11l-seg-pf.pt"
+ self.get_logger().info(f"Prompt-free mode: switching model_name to {model_name}")
+ if prompt_mode == YoloePromptMode.PROMPT and "seg-pf.pt" in model_name:
+ model_name = "yoloe-11l-seg.pt"
+ self.get_logger().info(f"Prompt mode: switching model_name to {model_name}")
+
+ self.detector = Yoloe2DDetector(
+ model_path=str(model_path),
+ model_name=model_name,
+ device=device or None,
+ prompt_mode=prompt_mode,
+ max_area_ratio=max_area_ratio,
+ conf=conf,
+ )
+
+ prompts = self._load_prompts(objects_file)
+ if prompts and prompt_mode == YoloePromptMode.PROMPT:
+ self.detector.set_prompts(text=prompts)
+ self.get_logger().info(f"Loaded {len(prompts)} prompts from {objects_file}")
+
+ qos = QoSProfile(
+ depth=10,
+ reliability=ReliabilityPolicy.RELIABLE,
+ durability=DurabilityPolicy.VOLATILE,
+ )
+ if self.use_compressed:
+ self.create_subscription(
+ CompressedImage,
+ compressed_image_topic,
+ self._compressed_image_callback,
+ qos,
+ )
+ else:
+ self.create_subscription(
+ Image,
+ image_topic,
+ self._image_callback,
+ qos,
+ )
+
+ self.detection_pub = self.create_publisher(DetectionResult, "/detection_result", qos)
+ self.overlay_pub = self.create_publisher(Image, "/annotated_image_detection", qos)
+
+ self.get_logger().info("detect_anything detection node started")
+
+ def _default_params(self) -> dict:
+ base_share = Path(get_package_share_directory("detect_anything"))
+ model_dir = Path.home() / ".ultralytics" / "weights"
+ return {
+ "image_topic": "/camera/image",
+ "compressed_image_topic": "/camera/image/compressed",
+ "use_compressed": True,
+ "model_path": str(model_dir),
+ "model_name": "yoloe-11l-seg.pt",
+ "device": "cuda",
+ "prompt_mode": YoloePromptMode.PROMPT.value,
+ "objects_file": str(base_share / "config" / "objects.yaml"),
+ "conf": 0.25,
+ "max_area_ratio": 0.3,
+ }
+
+ def _load_prompts(self, objects_file: Path) -> List[str]:
+ if not objects_file.exists():
+ return []
+ try:
+ with open(objects_file, "r", encoding="utf-8") as f:
+ data = yaml.safe_load(f) or []
+ if isinstance(data, list):
+ return [str(p) for p in data]
+ except Exception as exc:
+ self.get_logger().warn(f"Failed to read prompts from {objects_file}: {exc}")
+ return []
+
+ def _image_callback(self, msg: Image) -> None:
+ cv_image = self.bridge.imgmsg_to_cv2(msg, desired_encoding="bgr8")
+ self._process_image(cv_image, msg.header)
+
+ def _compressed_image_callback(self, msg: CompressedImage) -> None:
+ cv_image = self.bridge.compressed_imgmsg_to_cv2(msg, desired_encoding="bgr8")
+ self._process_image(cv_image, msg.header)
+
+ def _process_image(self, cv_image, header) -> None:
+ detections: DetectionResults = self.detector.process_image(cv_image)
+
+ detection_msg = detections.to_msg(header)
+ self.detection_pub.publish(detection_msg)
+
+ overlay = detections.overlay()
+ overlay_msg = self.bridge.cv2_to_imgmsg(overlay, encoding="bgr8")
+ overlay_msg.header = header
+ self.overlay_pub.publish(overlay_msg)
+
+ def destroy_node(self) -> bool:
+ if hasattr(self, "detector") and self.detector:
+ self.detector.stop()
+ return super().destroy_node()
+
+
+def main(args: Optional[list[str]] = None) -> None:
+ rclpy.init(args=args)
+ node = DetectionNode()
+ try:
+ rclpy.spin(node)
+ except KeyboardInterrupt:
+ pass
+ finally:
+ node.destroy_node()
+ rclpy.shutdown()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/detect_anything/detect_anything/yoloe.py b/detect_anything/detect_anything/yoloe.py
new file mode 100644
index 0000000..056387a
--- /dev/null
+++ b/detect_anything/detect_anything/yoloe.py
@@ -0,0 +1,157 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+import threading
+import zipfile
+from typing import Any, List, Optional, Sequence
+
+import numpy as np
+import torch
+from ultralytics import YOLOE # type: ignore[attr-defined, import-not-found]
+from ultralytics.utils import SETTINGS
+from ultralytics.utils.downloads import attempt_download_asset
+
+from detect_anything.detection import (
+ DetectionResults,
+ SingleDetectionResult,
+ from_ultralytics_results,
+)
+
+
+class YoloePromptMode(Enum):
+ """YOLO-E prompt modes."""
+
+ LRPC = "lrpc"
+ PROMPT = "prompt"
+
+
+class Yoloe2DDetector:
+ """Thin wrapper around YOLO-E for 2D detection + tracking with masks."""
+
+ def __init__(
+ self,
+ model_path: str | None = None,
+ model_name: str | None = None,
+ device: str | None = None,
+ prompt_mode: YoloePromptMode = YoloePromptMode.LRPC,
+ max_area_ratio: Optional[float] = 0.3,
+ conf: float = 0.6,
+ iou: float = 0.6,
+ ) -> None:
+ if model_name is None:
+ model_name = "yoloe-11s-seg-pf.pt" if prompt_mode == YoloePromptMode.LRPC else "yoloe-11s-seg.pt"
+
+ model_dir = Path(model_path) if model_path else Path.home() / ".ultralytics" / "weights"
+ model_dir = model_dir.expanduser().resolve()
+ model_dir.mkdir(parents=True, exist_ok=True)
+ SETTINGS["weights_dir"] = str(model_dir)
+ weights_path = attempt_download_asset(model_dir / model_name)
+ if prompt_mode == YoloePromptMode.PROMPT:
+ self._ensure_mobileclip(model_dir)
+ self.model = YOLOE(weights_path)
+ self.prompt_mode = prompt_mode
+ self._visual_prompts: dict[str, np.ndarray] | None = None
+ self.max_area_ratio = max_area_ratio
+ self._lock = threading.Lock()
+ self.conf = conf
+ self.iou = iou
+ self._names_override: Sequence[str] | None = None
+
+ if prompt_mode == YoloePromptMode.PROMPT:
+ self.set_prompts(text=["nothing"])
+
+ if self.max_area_ratio is not None and not (0.0 < self.max_area_ratio <= 1.0):
+ raise ValueError("max_area_ratio must be in the range (0, 1].")
+
+ if device:
+ self.device = device
+ elif torch.cuda.is_available():
+ self.device = "cuda"
+ else:
+ self.device = "cpu"
+
+ def set_prompts(
+ self,
+ text: Optional[List[str]] = None,
+ bboxes: Optional[np.ndarray] = None,
+ ) -> None:
+ """Set prompts for detection. Provide either text or bboxes, not both."""
+ if self.prompt_mode == YoloePromptMode.LRPC:
+ # Prompt-free model cannot accept text/visual prompts.
+ return
+ if text is not None and bboxes is not None:
+ raise ValueError("Provide either text or bboxes, not both.")
+ if text is None and bboxes is None:
+ raise ValueError("Must provide either text or bboxes.")
+
+ with self._lock:
+ self.model.predictor = None
+ if text is not None:
+ self.model.set_classes(text, self.model.get_text_pe(text)) # type: ignore[no-untyped-call]
+ self._visual_prompts = None
+ self._names_override = list(text)
+ else:
+ cls = np.arange(len(bboxes), dtype=np.int16) # type: ignore[arg-type]
+ self._visual_prompts = {"bboxes": bboxes, "cls": cls} # type: ignore[dict-item]
+ self._names_override = None
+
+ def process_image(self, image: np.ndarray) -> DetectionResults:
+ """Process an image and return detection results."""
+ track_kwargs: dict[str, Any] = {
+ "source": image,
+ "device": self.device,
+ "conf": self.conf,
+ "iou": self.iou,
+ "persist": True,
+ "verbose": False,
+ }
+
+ with self._lock:
+ if self._visual_prompts is not None:
+ track_kwargs["visual_prompts"] = self._visual_prompts
+
+ results = self.model.track(**track_kwargs) # type: ignore[arg-type]
+
+ detections = from_ultralytics_results(results, image_shape=image.shape[:2], names=self._names_override)
+ filtered = self._apply_filters(image, detections)
+ return DetectionResults(image=image, detections=filtered)
+
+ def _apply_filters(
+ self,
+ image: np.ndarray,
+ detections: List[SingleDetectionResult],
+ ) -> List[SingleDetectionResult]:
+ if self.max_area_ratio is None:
+ return detections
+
+ image_area = float(image.shape[0] * image.shape[1])
+ if image_area <= 0:
+ return detections
+
+ return [det for det in detections if (det.area() / image_area) <= self.max_area_ratio]
+
+ def stop(self) -> None:
+ """Clean up resources used by the detector."""
+ if hasattr(self.model, "predictor") and self.model.predictor is not None:
+ predictor = self.model.predictor
+ if hasattr(predictor, "trackers") and predictor.trackers:
+ for tracker in predictor.trackers:
+ if hasattr(tracker, "tracker") and hasattr(tracker.tracker, "gmc"):
+ gmc = tracker.tracker.gmc
+ if hasattr(gmc, "executor") and gmc.executor is not None:
+ gmc.executor.shutdown(wait=True)
+ self.model.predictor = None
+
+ @staticmethod
+ def _ensure_mobileclip(model_dir: Path) -> None:
+ target = model_dir / "mobileclip_blt.ts"
+ if target.exists() and zipfile.is_zipfile(target):
+ return
+ if target.exists() or target.is_symlink():
+ target.unlink()
+ attempt_download_asset(target)
+
+
+__all__ = ["Yoloe2DDetector", "YoloePromptMode"]
diff --git a/detect_anything/env-hooks/venv_pythonpath.sh.in b/detect_anything/env-hooks/venv_pythonpath.sh.in
new file mode 100644
index 0000000..9212bac
--- /dev/null
+++ b/detect_anything/env-hooks/venv_pythonpath.sh.in
@@ -0,0 +1 @@
+ament_prepend_unique_value PYTHONPATH "@PYTHON_INSTALL_DIR@"
diff --git a/detect_anything/msg/DetectionResult.msg b/detect_anything/msg/DetectionResult.msg
new file mode 100644
index 0000000..766d5f1
--- /dev/null
+++ b/detect_anything/msg/DetectionResult.msg
@@ -0,0 +1,10 @@
+std_msgs/Header header
+int32[] track_id
+float32[] x1
+float32[] y1
+float32[] x2
+float32[] y2
+string[] label
+float32[] confidence
+sensor_msgs/CompressedImage image
+sensor_msgs/CompressedImage[] masks # cropped to bbox; aligned to bbox top-left; encoding: mono8
diff --git a/detect_anything/package.xml b/detect_anything/package.xml
new file mode 100644
index 0000000..a7699ae
--- /dev/null
+++ b/detect_anything/package.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0"?>
+<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
+<package format="3">
+  <name>detect_anything</name>
+  <version>0.0.1</version>
+  <description>YOLO-E detection node and utilities.</description>
+  <maintainer>Alex Lin</maintainer>
+  <license>MIT</license>
+
+  <buildtool_depend>ament_cmake</buildtool_depend>
+
+  <buildtool_depend>rosidl_default_generators</buildtool_depend>
+  <exec_depend>rosidl_default_runtime</exec_depend>
+  <member_of_group>rosidl_interface_packages</member_of_group>
+
+  <depend>rclpy</depend>
+  <depend>std_msgs</depend>
+  <depend>sensor_msgs</depend>
+  <depend>vision_msgs</depend>
+  <depend>cv_bridge</depend>
+  <depend>vector_perception_utils</depend>
+
+  <export>
+    <build_type>ament_cmake</build_type>
+  </export>
+</package>
diff --git a/detect_anything/resource/detect_anything b/detect_anything/resource/detect_anything
new file mode 100644
index 0000000..eac574f
--- /dev/null
+++ b/detect_anything/resource/detect_anything
@@ -0,0 +1 @@
+detect_anything
diff --git a/detect_anything/scripts/detection_node b/detect_anything/scripts/detection_node
new file mode 100755
index 0000000..822cacf
--- /dev/null
+++ b/detect_anything/scripts/detection_node
@@ -0,0 +1,5 @@
+#!/usr/bin/env python3
+from detect_anything.detection_node import main
+
+if __name__ == "__main__":
+ main()
diff --git a/requirements.txt b/requirements.txt
index 7b6c110..573b311 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,16 +1,21 @@
# basic dependencies
numpy==1.26.4
opencv-python
+colcon-common-extensions
scipy
pillow
tqdm
+pyyaml
+lark
+pytest
+moondream
+openai
# 3D processing
open3d
-# deep learning
-torch<=2.4.0
-torchvision<=1.19.0
+# vision
+ultralytics
# SAM2 and EdgeTAM dependencies
sam2
diff --git a/rviz/semantic_mapping.rviz b/rviz/semantic_mapping.rviz
new file mode 100644
index 0000000..86a2272
--- /dev/null
+++ b/rviz/semantic_mapping.rviz
@@ -0,0 +1,415 @@
+Panels:
+ - Class: rviz_common/Displays
+ Help Height: 0
+ Name: Displays
+ Property Tree Widget:
+ Expanded:
+ - /Global Options1
+ - /Status1
+ - /RoomSegmentation1/room_boundaries1/Topic1
+ - /RoomSegmentation1/room_type_viz1
+ Splitter Ratio: 0.5
+ Tree Height: 715
+ - Class: rviz_common/Selection
+ Name: Selection
+ - Class: rviz_common/Tool Properties
+ Expanded:
+ - /2D Goal Pose1
+ - /Publish Point1
+ Name: Tool Properties
+ Splitter Ratio: 0.5886790156364441
+ - Class: rviz_common/Views
+ Expanded:
+ - /Current View1
+ Name: Views
+ Splitter Ratio: 0.5
+ - Class: rviz_common/Time
+ Experimental: false
+ Name: Time
+ SyncMode: 0
+ SyncSource: PointCloud2
+Visualization Manager:
+ Class: ""
+ Displays:
+ - Alpha: 0.5
+ Cell Size: 1
+ Class: rviz_default_plugins/Grid
+ Color: 160; 160; 164
+ Enabled: true
+ Line Style:
+ Line Width: 0.029999999329447746
+ Value: Lines
+ Name: Grid
+ Normal Cell Count: 0
+ Offset:
+ X: 0
+ Y: 0
+ Z: 0
+ Plane: XY
+ Plane Cell Count: 10
+ Reference Frame:
+ Value: true
+ - Class: rviz_default_plugins/Image
+ Enabled: true
+ Max Value: 1
+ Median window: 5
+ Min Value: 0
+ Name: Image
+ Normalize Range: true
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /annotated_image_detection
+ Value: true
+ - Angle Tolerance: 0.10000000149011612
+ Class: rviz_default_plugins/Odometry
+ Covariance:
+ Orientation:
+ Alpha: 0.5
+ Color: 255; 255; 127
+ Color Style: Unique
+ Frame: Local
+ Offset: 1
+ Scale: 1
+ Value: true
+ Position:
+ Alpha: 0.30000001192092896
+ Color: 204; 51; 204
+ Scale: 1
+ Value: true
+ Value: false
+ Enabled: true
+ Keep: 5
+ Name: Odometry
+ Position Tolerance: 0.10000000149011612
+ Shape:
+ Alpha: 1
+ Axes Length: 1
+ Axes Radius: 0.10000000149011612
+ Color: 255; 25; 0
+ Head Length: 0.30000001192092896
+ Head Radius: 0.10000000149011612
+ Shaft Length: 1
+ Shaft Radius: 0.05000000074505806
+ Value: Arrow
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ Filter size: 10
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /state_estimation
+ Value: true
+ - Alpha: 0.05000000074505806
+ Autocompute Intensity Bounds: false
+ Autocompute Value Bounds:
+ Max Value: 10
+ Min Value: -10
+ Value: true
+ Axis: Z
+ Channel Name: intensity
+ Class: rviz_default_plugins/PointCloud2
+ Color: 255; 255; 255
+ Color Transformer: FlatColor
+ Decay Time: 5
+ Enabled: true
+ Invert Rainbow: false
+ Max Color: 255; 255; 255
+ Max Intensity: 161
+ Min Color: 0; 0; 0
+ Min Intensity: 0
+ Name: PointCloud2
+ Position Transformer: XYZ
+ Selectable: false
+ Size (Pixels): 3
+ Size (m): 0.009999999776482582
+ Style: Flat Squares
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /registered_scan
+ Use Fixed Frame: false
+ Use rainbow: true
+ Value: true
+ - Class: rviz_common/Group
+ Displays:
+ - Alpha: 1
+ Autocompute Intensity Bounds: true
+ Autocompute Value Bounds:
+ Max Value: 0.7398463487625122
+ Min Value: 0.7398463487625122
+ Value: true
+ Axis: Z
+ Channel Name: intensity
+ Class: rviz_default_plugins/PointCloud2
+ Color: 224; 27; 36
+ Color Transformer: AxisColor
+ Decay Time: 0
+ Enabled: true
+ Invert Rainbow: false
+ Max Color: 255; 255; 255
+ Max Intensity: 4096
+ Min Color: 0; 0; 0
+ Min Intensity: 0
+ Name: door_cloud
+ Position Transformer: XYZ
+ Selectable: true
+ Size (Pixels): 3
+ Size (m): 0.10000000149011612
+ Style: Flat Squares
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /door_cloud
+ Use Fixed Frame: true
+ Use rainbow: true
+ Value: true
+ - Class: rviz_default_plugins/MarkerArray
+ Enabled: true
+ Name: room_viewpoints
+ Namespaces:
+ viewpoint_room_ids: true
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /viewpoint_room_ids
+ Value: true
+ - Class: rviz_default_plugins/MarkerArray
+ Enabled: true
+ Name: room_boundaries
+ Namespaces:
+ polygon_group: true
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /room_boundaries
+ Value: true
+ - Class: rviz_default_plugins/Marker
+ Enabled: true
+ Name: chosen_room
+ Namespaces:
+ {}
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ Filter size: 10
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /chosen_room_boundary
+ Value: true
+ - Alpha: 0.800000011920929
+ Autocompute Intensity Bounds: true
+ Autocompute Value Bounds:
+ Max Value: 10
+ Min Value: -10
+ Value: true
+ Axis: Z
+ Channel Name: intensity
+ Class: rviz_default_plugins/PointCloud2
+ Color: 255; 255; 255
+ Color Transformer: RGB8
+ Decay Time: 0
+ Enabled: true
+ Invert Rainbow: false
+ Max Color: 255; 255; 255
+ Max Intensity: 4096
+ Min Color: 0; 0; 0
+ Min Intensity: 0
+ Name: PointCloud2
+ Position Transformer: XYZ
+ Selectable: true
+ Size (Pixels): 5
+ Size (m): 0.019999999552965164
+ Style: Flat Squares
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /room_map_cloud
+ Use Fixed Frame: true
+ Use rainbow: true
+ Value: true
+ - Class: rviz_default_plugins/MarkerArray
+ Enabled: true
+ Name: room_type_viz
+ Namespaces:
+ room_type: true
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /room_type_vis
+ Value: true
+ Enabled: true
+ Name: RoomSegmentation
+ - Class: rviz_common/Group
+ Displays:
+ - Alpha: 1
+ Autocompute Intensity Bounds: true
+ Autocompute Value Bounds:
+ Max Value: 10
+ Min Value: -10
+ Value: true
+ Axis: Z
+ Channel Name: intensity
+ Class: rviz_default_plugins/PointCloud2
+ Color: 255; 255; 255
+ Color Transformer: RGB8
+ Decay Time: 0
+ Enabled: true
+ Invert Rainbow: false
+ Max Color: 255; 255; 255
+ Max Intensity: 4096
+ Min Color: 0; 0; 0
+ Min Intensity: 0
+ Name: object_points
+ Position Transformer: XYZ
+ Selectable: false
+ Size (Pixels): 3
+ Size (m): 0.009999999776482582
+ Style: Flat Squares
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /obj_points
+ Use Fixed Frame: true
+ Use rainbow: true
+ Value: true
+ - Class: rviz_default_plugins/MarkerArray
+ Enabled: true
+ Name: object_bboxes
+ Namespaces:
+ bbox: true
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /obj_boxes
+ Value: true
+ - Class: rviz_default_plugins/MarkerArray
+ Enabled: true
+ Name: object_labels
+ Namespaces:
+ labels: true
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /obj_labels
+ Value: true
+ - Class: rviz_default_plugins/MarkerArray
+ Enabled: false
+ Name: object_visibility
+ Namespaces:
+ {}
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /object_visibility_connections
+ Value: false
+ Enabled: true
+ Name: object_mapping
+ Enabled: true
+ Global Options:
+ Background Color: 48; 48; 48
+ Fixed Frame: map
+ Frame Rate: 30
+ Name: root
+ Tools:
+ - Class: rviz_default_plugins/Interact
+ Hide Inactive Objects: true
+ - Class: rviz_default_plugins/MoveCamera
+ - Class: rviz_default_plugins/Select
+ - Class: rviz_default_plugins/FocusCamera
+ - Class: rviz_default_plugins/Measure
+ Line color: 128; 128; 0
+ - Class: rviz_default_plugins/SetInitialPose
+ Covariance x: 0.25
+ Covariance y: 0.25
+ Covariance yaw: 0.06853891909122467
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /initialpose
+ - Class: rviz_default_plugins/SetGoal
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /goal_pose
+ - Class: rviz_default_plugins/PublishPoint
+ Single click: true
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /clicked_point
+ Transformation:
+ Current:
+ Class: rviz_default_plugins/TF
+ Value: true
+ Views:
+ Current:
+ Class: rviz_default_plugins/Orbit
+ Distance: 37.76976013183594
+ Enable Stereo Rendering:
+ Stereo Eye Separation: 0.05999999865889549
+ Stereo Focal Distance: 1
+ Swap Stereo Eyes: false
+ Value: false
+ Focal Point:
+ X: 4.479973316192627
+ Y: 11.129703521728516
+ Z: -8.338903427124023
+ Focal Shape Fixed Size: false
+ Focal Shape Size: 0.05000000074505806
+ Invert Z Axis: false
+ Name: Current View
+ Near Clip Distance: 0.009999999776482582
+ Pitch: 0.6847962737083435
+ Target Frame: map
+ Value: Orbit (rviz_default_plugins)
+ Yaw: 3.7372934818267822
+ Saved: ~
+Window Geometry:
+ Displays:
+ collapsed: false
+ Height: 2197
+ Hide Left Dock: false
+ Hide Right Dock: true
+ Image:
+ collapsed: false
+ QMainWindow State: 000000ff00000000fd00000004000000000000070c0000078bfc020000000bfb0000001200530065006c0065006300740069006f006e00000001e10000009b000000b200fffffffb0000001e0054006f006f006c002000500072006f007000650072007400690065007302000001ed000001df00000185000000a3fb000000120056006900650077007300200054006f006f02000001df000002110000018500000122fb000000200054006f006f006c002000500072006f0070006500720074006900650073003203000002880000011d000002210000017afb000000100044006900730070006c00610079007301000000700000033d0000018600fffffffb0000002000730065006c0065006300740069006f006e00200062007500660066006500720200000138000000aa0000023a00000294fb00000014005700690064006500530074006500720065006f02000000e6000000d2000003ee0000030bfb0000000c004b0069006e0065006300740200000186000001060000030c00000261fb0000000a0049006d00610067006501000002020000007e0000000000000000fb0000000a0049006d00610067006501000003b9000004420000002800fffffffb0000000a0049006d00610067006500000002bb000000c60000000000000000000000010000010f00000342fc0200000003fb0000001e0054006f006f006c002000500072006f00700065007200740069006500730100000041000000780000000000000000fb0000000a00560069006500770073000000003f000003420000013800fffffffb0000001200530065006c0065006300740069006f006e010000025a000000b200000000000000000000000200000490000000a9fc0100000001fb0000000a00560069006500770073030000004e00000080000002e10000019700000003000012140000005efc0100000002fb0000000800540069006d00650100000000000012140000047a00fffffffb0000000800540069006d0065010000000000000450000000000000000000000afc0000078b00000004000000040000000800000008fc0000000100000002000000010000000a0054006f006f006c00730100000000ffffffff0000000000000000
+ Selection:
+ collapsed: false
+ Time:
+ collapsed: false
+ Tool Properties:
+ collapsed: false
+ Views:
+ collapsed: true
+ Width: 4628
+ X: 4656
+ Y: 418
diff --git a/semantic_mapping/CMakeLists.txt b/semantic_mapping/CMakeLists.txt
new file mode 100644
index 0000000..71da435
--- /dev/null
+++ b/semantic_mapping/CMakeLists.txt
@@ -0,0 +1,62 @@
+cmake_minimum_required(VERSION 3.10)
+project(semantic_mapping)
+
+find_package(ament_cmake REQUIRED)
+find_package(rclpy REQUIRED)
+find_package(std_msgs REQUIRED)
+find_package(sensor_msgs REQUIRED)
+find_package(geometry_msgs REQUIRED)
+find_package(nav_msgs REQUIRED)
+find_package(visualization_msgs REQUIRED)
+find_package(cv_bridge REQUIRED)
+find_package(rosidl_default_generators REQUIRED)
+find_package(Python3 REQUIRED COMPONENTS Interpreter)
+
+rosidl_generate_interfaces(${PROJECT_NAME}
+ "msg/DetectionResult.msg"
+ "msg/NavigationQuery.msg"
+ "msg/ObjectNode.msg"
+ "msg/ObjectType.msg"
+ "msg/RoomEarlyStop1.msg"
+ "msg/RoomNode.msg"
+ "msg/RoomNodeList.msg"
+ "msg/RoomType.msg"
+ "msg/TargetObject.msg"
+ "msg/TargetObjectInstruction.msg"
+ "msg/TargetObjectWithSpatial.msg"
+ "msg/VlmAnswer.msg"
+ "msg/ViewpointRep.msg"
+ DEPENDENCIES std_msgs sensor_msgs geometry_msgs
+)
+
+set(PYTHON_INSTALL_DIR "lib/python${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}/site-packages")
+
+install(
+ DIRECTORY semantic_mapping
+ DESTINATION ${PYTHON_INSTALL_DIR}
+)
+
+install(
+ PROGRAMS
+ scripts/semantic_mapping_node
+ scripts/vlm_reasoning_node
+ DESTINATION lib/${PROJECT_NAME}
+)
+
+install(
+ FILES resource/semantic_mapping
+ DESTINATION share/${PROJECT_NAME}/resource
+)
+
+install(
+ DIRECTORY config
+ DESTINATION share/${PROJECT_NAME}
+)
+
+install(
+ DIRECTORY launch
+ DESTINATION share/${PROJECT_NAME}
+)
+
+ament_export_dependencies(rosidl_default_runtime)
+ament_package()
diff --git a/semantic_mapping/README.md b/semantic_mapping/README.md
new file mode 100644
index 0000000..6663558
--- /dev/null
+++ b/semantic_mapping/README.md
@@ -0,0 +1,192 @@
+# Semantic Mapping
+
+ROS2 package for semantic 3D object mapping with VLM integration.
+Includes `vlm_reasoning_node` (Moondream-backed) with unified
+`semantic_mapping` message types across planner/VLM nodes.
+
+## Overview
+
+This package builds a persistent semantic map of objects detected by `detect_anything`, fusing 2D detections with LiDAR point clouds to create 3D object representations. It publishes `ObjectNode` messages compatible with the tare_planner exploration system.
+The pipeline stores voxelized geometry, merges observations, and exports planner‑friendly ROS messages.
+
+## Status: Work in Progress
+
+| Component | File | Status |
+|-----------|------|--------|
+| Messages | `msg/*.msg` | Complete |
+| Data Models | `map_object.py` | Complete |
+| Matching | `utils/matching.py` | Complete |
+| Object Mapper | `mapper.py` | Complete |
+| Cloud Projection | `vector_perception_utils/pointcloud_utils.py` | Complete |
+| Publishers | `utils/map_object_utils.py` | Complete |
+| Image Storage | `utils/storage.py` | Complete |
+| Mapping Node | `mapping_node.py` | Complete |
+| VLM Reasoning Node | `vlm_reasoning_node.py` | Complete |
+| Launch Files | `launch/` | Complete |
+
+## Messages
+
+### ObjectNode.msg
+Main output message for planner integration:
+- `object_id[]`: Track IDs (may contain multiple merged IDs)
+- `label`: Dominant class label
+- `position`: Object centroid in world frame
+- `bbox3d[8]`: 3D bounding box corners
+- `cloud`: Object point cloud
+- `status`: Lifecycle status ("new", "updated", "unchanged")
+- `img_path`: Path to best saved image crop
+- `is_asked_vlm`: VLM query flag
+- `viewpoint_id`: Associated viewpoint
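+
+A minimal consumer sketch for this message (illustrative only; assumes the workspace is built and sourced):
+
+```python
+import rclpy
+from rclpy.node import Node
+
+from semantic_mapping.msg import ObjectNode
+
+
+class ObjectNodeListener(Node):
+    def __init__(self) -> None:
+        super().__init__("object_node_listener")
+        # One ObjectNode message is published per tracked object on /object_nodes.
+        self.create_subscription(ObjectNode, "/object_nodes", self.on_object, 10)
+
+    def on_object(self, msg: ObjectNode) -> None:
+        p = msg.position
+        self.get_logger().info(
+            f"{msg.label} [{msg.status}] at ({p.x:.2f}, {p.y:.2f}, {p.z:.2f}) ids={list(msg.object_id)}"
+        )
+
+
+def main() -> None:
+    rclpy.init()
+    rclpy.spin(ObjectNodeListener())
+
+
+if __name__ == "__main__":
+    main()
+```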
+
+### ObjectType.msg
+VLM query/answer for object relabeling.
+
+### RoomType.msg
+VLM query/answer for room classification.
+
+### TargetObjectInstruction.msg
+User instruction for target object search.
+
+### TargetObjectWithSpatial.msg
+Target object query with spatial/viewpoint context.
+
+### ViewpointRep.msg
+Viewpoint trigger for image saving.
+
+Note: `detect_anything/DetectionResult` masks are cropped to the bbox and aligned to
+the bbox top-left (as documented in `detect_anything/msg/DetectionResult.msg`).
+The mapping node reconstructs full-image masks before projection.
+
+## Architecture
+
+```
+detect_anything/DetectionResult
+ │
+ ▼
+┌───────────────────────────────────────────────┐
+│ semantic_mapping_node │
+│ │
+│ ┌────────────────────┐ ┌──────────────┐ │
+│ │ pointcloud_utils │ │ ObjectMapper │ │
+│ │ (projection + mask)├──▶│ (pending/ │ │
+│ │ │ │ permanent) │ │
+│ └────────────────────┘ └──────────────┘ │
+│ │ │ │
+│ ▼ ▼ │
+│ ┌────────────────┐ ┌──────────────┐ │
+│ │ MapObject │ │ map_object_ │ │
+│ │ (voxels/votes) ├─▶│ utils │ │
+│ └────────────────┘ └──────────────┘ │
+│ │ │ │
+│ ▼ ▼ │
+│ utils/storage.py mapping_node.py │
+└───────────────────────────────────────────────┘
+ │
+ ▼
+/object_nodes, /obj_points, /obj_boxes, /obj_labels
+```
+
+Key components:
+- `map_object.py`: Object state, voxel voting, pose de‑duplication, cached geometry.
+- `utils/matching.py`: Track‑id and spatial matching with IoU/ratio heuristics.
+- `mapper.py`: Pending/permanent tiers, merge rules, and pruning.
+- `vector_perception_utils/pointcloud_utils.py`: Projection + mask segmentation utilities.
+- `utils/map_object_utils.py`: ROS message construction helpers.
+- `utils/storage.py`: Async image save queue.
+- `mapping_node.py`: ROS2 orchestration (subscriptions, sync, publish).
+
+## Nodes
+
+| Node | Purpose |
+|------|---------|
+| `semantic_mapping_node` | Builds the 3D object map and publishes planner object topics. |
+| `vlm_reasoning_node` | Answers room/object queries via Moondream and publishes VLM answers. |
+
+## Dependencies
+
+- `detect_anything`: Detection results with masks
+- `vector_perception_utils`: Point cloud and image utilities
+- ROS2 messages: `sensor_msgs`, `geometry_msgs`, `nav_msgs`, `visualization_msgs`
+
+## Usage
+
+```bash
+# Build
+colcon build --packages-select semantic_mapping
+
+# Run
+ros2 launch semantic_mapping semantic_mapping.launch.py
+```
+
+## Configuration
+
+- `config/params.yaml`: Default node parameters for `semantic_mapping_node`.
+- `config/camera_param.yaml`: Camera parameters + static sensor→camera transform for projection.
+- `config/objects.yaml`: Object taxonomy for detection labels.
+
+Key runtime params:
+- `regularize_voxel_size`: Connectivity voxel size used in shape regularization.
+- `ground_filter`, `ground_radius`, `ground_z_clearance`: Simple ground clamp around robot.
+- `visualization_max_objects`: Limits RViz markers/pointcloud to most recent objects.
+- `object_type_query_mode`: `target_only`, `all`, or `target_or_all` for VLM queries.
+- `object_type_query_max_per_cycle`: Max number of object queries per mapping cycle.
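+
+Defaults live in `config/params.yaml`; individual values can be overridden with a small params file passed via `--ros-args --params-file`. A sketch in the same layout (values below are illustrative, not recommendations):
+
+```yaml
+semantic_mapping_node:
+  ros__parameters:
+    # Connectivity voxel size used in shape regularization (see Handoff Notes).
+    regularize_voxel_size: 0.2
+    # Cap RViz markers/pointcloud to the most recent N objects.
+    visualization_max_objects: 50
+    ground_filter: true
+    ground_radius: 2.5
+```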
+
+## Topics
+
+### Subscriptions
+| Topic | Type | Description |
+|-------|------|-------------|
+| `/detection_result` | `detect_anything/DetectionResult` | 2D detections with masks |
+| `/registered_scan` | `sensor_msgs/PointCloud2` | LiDAR point cloud |
+| `/state_estimation` | `nav_msgs/Odometry` | Robot odometry |
+| `/object_type_answer` | `ObjectType` | VLM relabeling response |
+| `/target_object_instruction` | `TargetObjectInstruction` | Target object search |
+| `/viewpoint_rep_header` | `ViewpointRep` | Viewpoint trigger |
+
+### Publications
+| Topic | Type | Description |
+|-------|------|-------------|
+| `/object_nodes` | `ObjectNode` | Semantic object map |
+| `/obj_points` | `sensor_msgs/PointCloud2` | Colored object point cloud |
+| `/obj_boxes` | `visualization_msgs/MarkerArray` | 3D bounding box wireframes |
+| `/obj_labels` | `visualization_msgs/MarkerArray` | Object label text |
+| `/object_type_query` | `ObjectType` | VLM query for relabeling |
+| `/target_object_spatial_query` | `TargetObjectWithSpatial` | Target object + viewpoints |
+
+Note: planner/VLM nodes now share `semantic_mapping/msg/*` directly (no bridge).
+
+## VLM Reasoning Topics
+
+### Subscriptions
+| Topic | Type | Description |
+|-------|------|-------------|
+| `/object_type_query` | `ObjectType` | Object relabeling query |
+| `/target_object_spatial_query` | `TargetObjectWithSpatial` | Spatial target query |
+| `/room_type_query` | `RoomType` | Room classification query |
+| `/keyboard_input` | `std_msgs/String` | Instruction text |
+
+### Publications
+| Topic | Type | Description |
+|-------|------|-------------|
+| `/object_type_answer` | `ObjectType` | Object relabeling answer |
+| `/target_object_answer` | `TargetObjectWithSpatial` | Spatial target answer |
+| `/target_object_instruction` | `TargetObjectInstruction` | Parsed instruction fields |
+| `/room_type_answer` | `RoomType` | Room classification answer |
+| `/room_type_vis` | `visualization_msgs/MarkerArray` | Room label markers |
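+
+For quick manual testing, an instruction can be injected on the keyboard topic from the command line (instruction text is illustrative):
+
+```bash
+ros2 topic pub --once /keyboard_input std_msgs/msg/String "{data: 'find the refrigerator'}"
+```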
+
+## References
+
+- Full migration plan: [`docs/SEMANTIC_MAPPING_MIGRATION.md`](docs/SEMANTIC_MAPPING_MIGRATION.md)
+- Original implementation: `VLM_ROS/src/semantic_mapping/` (main branch)
+ - SAM2 for segmentation
+ - External Gemini API for VLM reasoning
+- Detection: `detect_anything` package (decoupled)
+- dimos patterns: `dimos/perception/detection/objectDB.py`
+
+## Handoff Notes
+
+- Regularization uses vote-weighted filtering + voxel connected components; adjust
+ `regularize_voxel_size` and the regularization percentile (in `MapObject`) if
+ thin structures are dropped.
+- Mapping node logs summary stats and a single mapping loop time per cycle.
+- Permanent objects persist; only pending objects time out via `pending_ttl_s`.
diff --git a/semantic_mapping/config/camera_param.yaml b/semantic_mapping/config/camera_param.yaml
new file mode 100644
index 0000000..8206adb
--- /dev/null
+++ b/semantic_mapping/config/camera_param.yaml
@@ -0,0 +1,16 @@
+image_width: 1920
+image_height: 640
+hfov: 360.0
+vfov: 120.0
+
+sensor_to_camera:
+ parent_frame: sensor
+ child_frame: camera_link
+ translation:
+ x: -0.12
+ y: -0.075
+ z: 0.265
+ rotation_rpy:
+ roll: -1.5707963
+ pitch: 0.0
+ yaw: -1.5707963
diff --git a/semantic_mapping/config/params.yaml b/semantic_mapping/config/params.yaml
new file mode 100644
index 0000000..f373fb6
--- /dev/null
+++ b/semantic_mapping/config/params.yaml
@@ -0,0 +1,57 @@
+semantic_mapping_node:
+ ros__parameters:
+ detection_topic: /detection_result
+ cloud_topic: /registered_scan
+ odom_topic: /state_estimation
+ viewpoint_topic: /viewpoint_rep_header
+ object_type_answer_topic: /object_type_answer
+ object_type_query_topic: /object_type_query
+ target_object_spatial_query_topic: /target_object_spatial_query
+ target_object_instruction_topic: /target_object_instruction
+ object_nodes_topic: /object_nodes
+ obj_points_topic: /obj_points
+ obj_boxes_topic: /obj_boxes
+ obj_labels_topic: /obj_labels
+ map_frame: map
+
+ confidence_threshold: 0.35
+ voxel_size: 0.05
+ regularize_voxel_size: 0.15
+ distance_threshold: 0.3
+ iou_threshold: 0.3
+ min_detections_for_permanent: 3
+ pending_ttl_s: 5.0
+ num_angle_bins: 15
+
+ detection_linear_state_time_bias: 0.0
+ detection_angular_state_time_bias: 0.0
+ cloud_window_before: 0.5
+ cloud_window_after: 0.1
+
+ camera_param_path: config/camera_param.yaml
+ depth_filter: true
+ depth_filter_margin: 0.15
+ depth_jump_threshold: 0.5
+ mask_erode_kernel: 3
+ mask_erode_iterations: 1
+ max_cloud_distance: 8.0
+ visualization_max_objects: 100
+ ground_filter: true
+ ground_radius: 2.5
+ ground_z_clearance: 0.0
+
+ target_object: refrigerator
+ enable_object_type_queries: False
+ save_png: True
+ output_dir: output/object_images
+ save_viewpoint_images: True
+ viewpoint_output_dir: output/viewpoint_images
+ viewpoint_use_timestamp_dir: True
+ viewpoint_save_png: True
+ viewpoint_save_transform: True
+
+vlm_reasoning_node:
+ ros__parameters:
+ dump_room_type_images: True
+ room_type_dump_dir: output/room_type_images
+ room_type_cache_s: 10.0
\ No newline at end of file
diff --git a/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md b/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md
new file mode 100644
index 0000000..2c341fb
--- /dev/null
+++ b/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md
@@ -0,0 +1,543 @@
+# Semantic Mapping Migration Plan
+
+## Overview
+
+This document outlines the comprehensive plan for porting the `semantic_mapping` package from `VLM_ROS` into `vector_perception_ros/semantic_mapping`, following the coding standards of `vector_perception_ros` while preserving functional compatibility with downstream consumers (particularly the tare_planner).
+
+> **Note**: Detection is decoupled into `detect_anything` package. VLM reasoning uses external Gemini API.
+
+### Current Status (Feb 2026)
+- `semantic_mapping` and `detect_anything` are fully migrated and buildable.
+- `detect_anything` installs `models/` into the package share; defaults resolve
+ `model_path` via package share to avoid cwd dependence.
+- Room segmentation is now in `vector_perception_ros/sensor_coverage`.
+- `/navigation_boundary` publication from room segmentation has been removed;
+ the dedicated `navigationBoundary` node should own that topic.
+
+### Handoff Notes (Next Agent)
+- Validate that bagfile/real/sim launch files in autonomy-stack match VLM_ROS.
+- Ensure `detect_anything` downloads MobileCLIP into the resolved model directory
+ (if weights still go to `~/.ultralytics/weights`, confirm `model_path` param).
+- After changes, rebuild `detect_anything`, `semantic_mapping`, `sensor_coverage`.
+
+## Design Principles
+
+1. **Logic cleanup, not restructure** - Preserve the core algorithms (voxel voting, observation angles, shape regularization, IoU/distance matching) while cleaning up dead code and improving organization.
+2. **Same I/O contract** - Maintain the same ROS topic names, message types, and semantics so existing planner integration continues to work.
+3. **vector_perception_ros coding standards** - Full type hints, Google-style docstrings, dataclass containers, proper ROS2 patterns.
+4. **ObjectDB-style patterns** - Adopt the two-tier (pending/permanent) architecture and dual matching (track_id + distance) from dimos where it improves clarity.
+
+---
+
+## Implementation Progress
+
+| Step | Component | Status |
+|------|-----------|--------|
+| 1 | Package Setup & Messages | **COMPLETED** |
+| 2 | Data Models (`map_object.py`) | **COMPLETED** |
+| 2.1 | Refactor to use shared utilities | **COMPLETED** |
+| 3 | Matching Utilities (`utils/matching.py`) | **COMPLETED** |
+| 4 | Object Mapper (`mapper.py`) | **COMPLETED** |
+| 5 | Cloud-Image Projection (`pointcloud_utils.py`) | **COMPLETED** |
+| 6 | Message Publishers (`utils/map_object_utils.py`) | **COMPLETED** |
+| 7 | Image Storage (`utils/storage.py`) | **COMPLETED** |
+| 8 | Mapping Node (`mapping_node.py`) | **COMPLETED** |
+| 9 | Launch Files & Config | **COMPLETED** |
+| 10 | Integration Testing | **COMPLETED** |
+| 11 | VLM Reasoning Node + Room Type Viz | **COMPLETED** |
+| 12 | Tare Message Bridge (/semantic topics) | **COMPLETED** |
+
+### Step 2.1: Refactored map_object.py to use shared utilities
+
+Added to `vector_perception_utils/transform_utils.py`:
+- `normalize_angle()` - now supports both scalar and array input
+- `discretize_angles()` - discretize angles into bin indices
+
+Added to `vector_perception_utils/pointcloud_utils.py`:
+- `get_oriented_bbox()` - returns (center, corners) tuple from minimal OBB
+
+Refactored in `map_object.py`:
+- Removed duplicate `_normalize_angles_to_pi` and `_discretize_angles` functions
+- Simplified `_compute_geometry()` from 43 lines to 15 using `get_oriented_bbox()`
+- Use `create_pointcloud()` in `regularize_shape()` instead of manual o3d creation
+- Use `compute_centroid()` instead of `np.mean(..., axis=0)`
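+
+For orientation, a minimal sketch of what the two angle helpers do (illustrative; the actual implementations live in `vector_perception_utils/transform_utils.py` and may differ in detail):
+
+```python
+import numpy as np
+
+
+def normalize_angle(angle):
+    """Wrap a scalar or array of angles into [-pi, pi)."""
+    return (np.asarray(angle) + np.pi) % (2 * np.pi) - np.pi
+
+
+def discretize_angles(angles, num_bins: int = 20) -> np.ndarray:
+    """Map angles to integer bin indices in [0, num_bins)."""
+    normalized = normalize_angle(angles)  # [-pi, pi)
+    return (((normalized + np.pi) / (2 * np.pi)) * num_bins).astype(int) % num_bins
+```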
+
+---
+
+## VLM_ROS Reference (Source Implementation)
+
+Key files in `VLM_ROS/src/semantic_mapping/`:
+- `semantic_map_new.py`: `ObjMapper` manages the persistent map
+- `single_object_new.py`: `SingleObject` holds per-object state
+- `semantic_mapping_node.py`: `MappingNode` orchestrates SAM2 masks, LiDAR projection, odom transforms
+
+### ObjMapper Core Flow
+- Input: dict with `bboxes`, `labels`, `confidences`, `ids`, `masks` (from SAM2), plus LiDAR cloud and odom
+- Memory fix: Uses `.copy()` on detection data extraction to prevent memory leaks
+- VLM query threshold: Objects need `best_image_mask_area > 500` before querying VLM
+- `update_map()` steps:
+ 1. Transform incoming cloud into body/world frame using odom
+ 2. Filter detections by confidence threshold
+ 3. For each detection: reconstruct full-image masks (bbox-cropped in `detect_anything/msg/DetectionResult.msg`), project masked cloud, build per-object voxels, then either merge or create a `SingleObject`
+ 4. Merge heuristics: match by tracker `obj_id`, 3D IoU, distance thresholds, and optional primitive groups (e.g., chair/sofa)
+ 5. Maintain delete lists, save queues (best image/mask per object)
+ 6. Publish ROS messages: `ObjectNode`, bbox markers, label markers, colored point cloud
+
+### SingleObject Highlights
+- Keeps class_id→counts/confidence, track_id list, voxel grid (`VoxelFeatureManager`), robot poses, masks, best image saving
+- Computes 3D bbox (axis-aligned + oriented), centroids, clustering flags, and validity caches
+- Manages persistence (`life`, `inactive_frame`, `publish_status`) and VLM interaction state
+
+### dimos Patterns to Adopt
+- `ObjMapper` ⇄ `ObjectDB`: explicit pending/permanent tiers, track_id + spatial matching, promotion thresholds
+- `SingleObject` ⇄ `Object`: cleaner update patterns, separation of core geometry vs VLM metadata
+
+---
+
+## Phase 1: Package Structure & Messages (COMPLETED)
+
+### 1.1 Package Structure
+
+```
+semantic_mapping/
+├── CMakeLists.txt # Hybrid CMake for message generation + Python install
+├── package.xml # Dependencies: detect_anything, vector_perception_utils, ROS msgs
+├── msg/
+│ ├── ObjectNode.msg
+│ ├── ObjectType.msg
+│ ├── TargetObjectInstruction.msg
+│ ├── TargetObjectWithSpatial.msg
+│ └── ViewpointRep.msg
+├── config/
+│ ├── objects.yaml # Object taxonomy (to be copied from VLM_ROS)
+│ └── params.yaml # Default node parameters
+├── launch/
+│ └── semantic_mapping.launch.py
+├── semantic_mapping/
+│ ├── __init__.py
+│ ├── mapping_node.py # ROS2 node (orchestration)
+│ ├── mapper.py # ObjectMapper class (core logic)
+│ ├── map_object.py # Data classes: MapObject, Observation, VoxelManager
+│ ├── utils/
+│ │ ├── matching.py # Object matching utilities (IoU, distance, track_id)
+│ │ ├── map_object_utils.py # ROS message conversion helpers
+│ │ └── storage.py # Async image saving queue
+│   └── (moved to vector_perception_utils)  # Cloud-image projection helpers
+├── docs/
+│ └── SEMANTIC_MAPPING_MIGRATION.md
+└── resource/
+ └── semantic_mapping # ament resource marker
+```
+
+### 1.2 Message Definitions
+
+**ObjectNode.msg** (preserves exact schema for planner compatibility):
+```
+std_msgs/Header header
+int32[] object_id
+string label
+geometry_msgs/Point position
+geometry_msgs/Point[8] bbox3d # 8 corner points in world frame
+sensor_msgs/PointCloud2 cloud
+string status # "new", "updated", "unchanged"
+string img_path
+bool is_asked_vlm
+int32 viewpoint_id
+```
+
+- **RoomType.msg**: Room classification query/answer for VLM reasoning
+- **ObjectType.msg**: VLM query/answer for object relabeling
+- **TargetObjectInstruction.msg**: User-set target/anchor objects
+- **TargetObjectWithSpatial.msg**: Target object query with viewpoint context
+- **ViewpointRep.msg**: Viewpoint trigger message
+
+---
+
+## Phase 2: Data Models (COMPLETED)
+
+### 2.1 ObjectStatus Enum
+
+```python
+class ObjectStatus(Enum):
+ """Object lifecycle status (ObjectDB-style tiering)."""
+ PENDING = "pending" # < min_detections threshold
+ PERMANENT = "permanent" # >= threshold, confirmed object
+ DELETED = "deleted" # Marked for removal
+```
+
+### 2.2 Observation
+
+```python
+@dataclass
+class Observation:
+ """A single detection observation to update a MapObject."""
+ track_id: int
+ label: str
+ confidence: float
+ points: np.ndarray # (N, 3) world-frame points
+ robot_pose: dict # {R: (3,3), t: (3,)}
+ mask: np.ndarray | None = None # 2D segmentation mask
+ image: np.ndarray | None = None # RGB crop for best-image selection
+ timestamp: float = 0.0
+```
+
+### 2.3 VoxelFeatureManager
+
+```python
+@dataclass
+class VoxelFeatureManager:
+ """Manages voxelized point cloud with observation angle voting."""
+ voxels: np.ndarray # (N, 3) world coords
+ votes: np.ndarray # (N,) detection count per voxel
+ observation_angles: np.ndarray # (N, num_bins) angle histogram
+ regularized_mask: np.ndarray | None # (N,) valid voxel mask after clustering
+ voxel_size: float = 0.03
+ num_angle_bins: int = 20
+
+ def update(self, new_points: np.ndarray, robot_position: np.ndarray) -> None:
+ """Add new points with KDTree-based merge and angle voting."""
+
+ def get_valid_voxels(self, min_votes: int = 1) -> np.ndarray:
+ """Return voxels passing vote threshold and regularization mask."""
+```
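+
+A simplified sketch of the KDTree merge, assuming SciPy is available and a
+half-voxel match radius; the observation-angle histogram is omitted for brevity,
+so this is an illustration rather than the actual `update()` implementation:
+
+```python
+import numpy as np
+from scipy.spatial import cKDTree
+
+
+def merge_voxels(voxels: np.ndarray, votes: np.ndarray,
+                 new_points: np.ndarray, voxel_size: float = 0.03):
+    """Quantize new points to the voxel grid and merge them into existing voxels."""
+    new_voxels = np.unique(np.round(new_points / voxel_size) * voxel_size, axis=0)
+    if len(voxels) == 0:
+        return new_voxels, np.ones(len(new_voxels), dtype=int)
+    # A new voxel within half a voxel of an existing one counts as a re-observation.
+    dist, idx = cKDTree(voxels).query(new_voxels, k=1)
+    hit = dist < voxel_size * 0.5
+    votes = votes.copy()
+    np.add.at(votes, idx[hit], 1)
+    # Everything else is appended as a fresh voxel with a single vote.
+    voxels = np.vstack([voxels, new_voxels[~hit]])
+    votes = np.concatenate([votes, np.ones(int((~hit).sum()), dtype=int)])
+    return voxels, votes
+```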
+
+### 2.4 MapObject
+
+```python
+@dataclass
+class MapObject:
+ """Represents a tracked object in the semantic map."""
+ # Identity
+ object_id: str # UUID for this object
+ track_ids: list[int] # Associated tracker IDs (can merge)
+
+ # Classification (voted)
+ class_votes: dict[str, int] # label -> observation count
+ confidence_scores: dict[str, float] # label -> best confidence
+
+ # Geometry
+ voxel_manager: VoxelFeatureManager
+ robot_poses: list[dict] # [{R, t}, ...] observation poses
+
+ # Lifecycle (ObjectDB-style)
+ detection_count: int = 0
+ inactive_frames: int = 0
+ status: ObjectStatus = ObjectStatus.PENDING
+
+ # VLM interaction
+ is_asked_vlm: bool = False
+ updated_by_vlm: bool = False
+ best_image_path: str | None = None
+ best_image_mask_area: float = 0.0
+
+ @property
+ def label(self) -> str: ...
+ @property
+ def centroid(self) -> np.ndarray: ...
+ @property
+ def bbox3d(self) -> np.ndarray: ...
+ def update(self, observation: Observation) -> None: ...
+ def merge(self, other: MapObject) -> None: ...
+ def regularize_shape(self, percentile: float = 0.85) -> None: ...
+```
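+
+As one illustration of the voted-classification idea, the `label` property might
+look like the following inside `MapObject` (tie-breaking by best confidence is an
+assumption, not confirmed behavior):
+
+```python
+    @property
+    def label(self) -> str:
+        """Dominant class by observation count; best confidence breaks ties."""
+        if not self.class_votes:
+            return "unknown"
+        return max(
+            self.class_votes,
+            key=lambda k: (self.class_votes[k], self.confidence_scores.get(k, 0.0)),
+        )
+```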
+
+---
+
+## Phase 3: Matching Utilities (utils/matching.py)
+
+```python
+def compute_iou_3d(bbox1: np.ndarray, bbox2: np.ndarray) -> float:
+ """Compute 3D IoU between two axis-aligned bounding boxes."""
+
+def compute_center_distance(obj1: MapObject, obj2: MapObject) -> float:
+ """Compute Euclidean distance between object centroids."""
+
+def are_merge_compatible(label1: str, label2: str, merge_groups: list[set[str]]) -> bool:
+ """Check if two labels can be merged (same class or in same merge group)."""
+
+def find_best_match(
+ obs: Observation,
+ candidates: list[MapObject],
+ distance_threshold: float,
+ iou_threshold: float,
+) -> MapObject | None:
+ """Find best matching object by distance and IoU criteria."""
+```
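+
+A minimal axis-aligned IoU sketch, assuming each box arrives as an `(8, 3)` array
+of corner points (the `bbox3d` convention used elsewhere in this document):
+
+```python
+import numpy as np
+
+
+def compute_iou_3d(bbox1: np.ndarray, bbox2: np.ndarray) -> float:
+    """Axis-aligned 3D IoU from two (8, 3) corner arrays."""
+    min1, max1 = bbox1.min(axis=0), bbox1.max(axis=0)
+    min2, max2 = bbox2.min(axis=0), bbox2.max(axis=0)
+    overlap = np.minimum(max1, max2) - np.maximum(min1, min2)
+    if np.any(overlap <= 0):
+        return 0.0  # boxes do not intersect along at least one axis
+    intersection = float(np.prod(overlap))
+    vol1 = float(np.prod(max1 - min1))
+    vol2 = float(np.prod(max2 - min2))
+    return intersection / (vol1 + vol2 - intersection)
+```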
+
+---
+
+## Phase 4: Object Mapper (mapper.py)
+
+```python
+class ObjectMapper:
+ """
+ Manages the semantic object map with two-tier state management.
+ Pending objects are promoted to permanent after detection_count >= threshold.
+ """
+
+ def __init__(
+ self,
+ voxel_size: float = 0.03,
+ confidence_threshold: float = 0.30,
+ distance_threshold: float = 0.5,
+ iou_threshold: float = 0.2,
+ min_detections_for_permanent: int = 3,
+ pending_ttl_s: float = 10.0,
+ max_inactive_frames: int = 20,
+ ) -> None:
+ self._pending_objects: dict[str, MapObject] = {}
+ self._permanent_objects: dict[str, MapObject] = {}
+ self._track_id_map: dict[int, str] = {}
+ self._lock = threading.RLock()
+ self._merge_groups: list[set[str]] = [
+ {"chair", "sofa", "couch", "armchair"},
+ {"table", "desk"},
+ ]
+
+ def update(self, observations: list[Observation], target_object: str | None = None) -> UpdateStats: ...
+ def _match_observation(self, obs: Observation) -> MapObject | None: ...
+ def _create_object(self, obs: Observation) -> MapObject: ...
+ def _check_promotion(self, obj: MapObject) -> None: ...
+ def _try_merge_objects(self) -> None: ...
+ def _prune_inactive(self, target_object: str | None) -> list[MapObject]: ...
+ def get_objects(self, include_pending: bool = False) -> list[MapObject]: ...
+```
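+
+The pending-to-permanent promotion can be sketched as below; the attribute name
+`min_detections_for_permanent` on the mapper is assumed from the constructor
+parameter above, not taken from the actual source:
+
+```python
+    def _check_promotion(self, obj: MapObject) -> None:
+        """Promote a pending object once it has accumulated enough detections."""
+        if (obj.status is ObjectStatus.PENDING
+                and obj.detection_count >= self.min_detections_for_permanent):
+            obj.status = ObjectStatus.PERMANENT
+            self._permanent_objects[obj.object_id] = self._pending_objects.pop(obj.object_id)
+```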
+
+---
+
+## Phase 5: Mapping Node (mapping_node.py)
+
+### 5.1 Node Structure
+
+```python
+class SemanticMappingNode(Node):
+ """ROS2 node for semantic 3D object mapping."""
+
+ def __init__(self) -> None:
+ super().__init__("semantic_mapping_node")
+ self._declare_parameters()
+ self.mapper = ObjectMapper(...)
+ self.projection = pointcloud_utils
+ self.image_saver = ImageSaveQueue(...)
+
+ # Data buffers (with locks)
+ self._cloud_buffer: deque[tuple[float, np.ndarray]] = deque(maxlen=50)
+ self._odom_buffer: deque[tuple[float, Pose]] = deque(maxlen=100)
+ self._lock = threading.Lock()
+
+ self._create_subscriptions()
+ self._create_publishers()
+ self.create_timer(0.5, self._mapping_callback)
+```
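+
+One way such buffers are typically consumed is a nearest-timestamp lookup when a
+detection arrives; a small sketch under the assumption that entries are appended
+in time order and a fixed tolerance is acceptable (not the node's exact sync logic):
+
+```python
+import bisect
+
+
+def find_nearest(buffer, stamp: float, tol: float = 0.05):
+    """Return the buffered (time, msg) entry closest to `stamp`, or None if beyond `tol`."""
+    times = [t for t, _ in buffer]
+    i = bisect.bisect_left(times, stamp)
+    candidates = [j for j in (i - 1, i) if 0 <= j < len(times)]
+    if not candidates:
+        return None
+    best = min(candidates, key=lambda j: abs(times[j] - stamp))
+    return buffer[best] if abs(times[best] - stamp) <= tol else None
+```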
+
+### 5.2 Subscriptions
+
+| Topic | Type | QoS |
+|-------|------|-----|
+| `/detection_result` | `detect_anything/DetectionResult` | Reliable |
+| `/registered_scan` | `PointCloud2` | Best Effort |
+| `/state_estimation` | `Odometry` | Best Effort |
+| `/object_type_answer` | `ObjectType` | Reliable |
+| `/target_object_instruction` | `TargetObjectInstruction` | Reliable |
+| `/viewpoint_rep_header` | `ViewpointRep` | Reliable |
+
+### 5.3 Publishers
+
+| Topic | Type | Description |
+|-------|------|-------------|
+| `/object_nodes` | `ObjectNode` | Main output for planner |
+| `/obj_points` | `PointCloud2` | Colored object cloud |
+| `/obj_boxes` | `MarkerArray` | Bbox wireframes |
+| `/obj_labels` | `MarkerArray` | Text labels |
+| `/object_type_query` | `ObjectType` | VLM query |
+| `/target_object_spatial_query` | `TargetObjectWithSpatial` | Target object + viewpoints |
+
+Notes:
+- `semantic_mapping` topics are now published without a `/semantic` prefix and no
+  bridge is required. Annotated debug images are published by `detect_anything`.
+- `semantic_mapping.launch.py` includes a temporary image republisher that bridges
+  `compressed_image_topic` (e.g., `/camera/image/compressed`) to `image_topic`
+  (default `/camera/image`) so legacy `tare_planner` room-type queries can still
+  consume raw `sensor_msgs/Image`.
+
+---
+
+## Phase 6: Cloud-Image Projection (pointcloud_utils.py)
+
+```python
+def project_pointcloud_to_image(...): ...
+def segment_pointcloud_by_mask(...): ...
+def segment_pointclouds_from_projection(...): ...
+def transform_segmented_clouds_to_world(...): ...
+```
+
+---
+
+## Phase 7: Message Publishers (utils/map_object_utils.py)
+
+```python
+def map_object_to_object_node(obj: MapObject, header: Header, status: str, target_object: str | None = None) -> ObjectNode | None: ...
+def create_bbox_markers(objects: list[MapObject], header: Header) -> MarkerArray: ...
+def create_label_markers(objects: list[MapObject], header: Header) -> MarkerArray: ...
+def create_object_pointcloud(objects: list[MapObject], header: Header) -> PointCloud2 | None: ...
+def create_delete_markers(deleted_ids: list[int], header: Header, namespace: str) -> MarkerArray: ...
+```
+
+Publisher helpers:
+- `map_object_to_object_node` converts a `MapObject` into an `ObjectNode` with centroid, bbox corners, and point cloud.
+- Marker helpers emit bbox wireframes and text labels for RViz (corner-to-edge indexing sketched below).
+- `create_object_pointcloud` builds a colored aggregate cloud from object voxels.
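+
+With the documented corner ordering (bottom face 0-3, top face 4-7), the wireframe
+reduces to a fixed edge-index table; a small sketch of how LINE_LIST point pairs
+could be generated (illustrative, not the actual helper):
+
+```python
+# 12 edges of a box under the ObjectNode.bbox3d corner ordering:
+# bottom face 0-1-2-3, top face 4-5-6-7, verticals i -> i + 4.
+BOX_EDGES = [
+    (0, 1), (1, 2), (2, 3), (3, 0),   # bottom rectangle
+    (4, 5), (5, 6), (6, 7), (7, 4),   # top rectangle
+    (0, 4), (1, 5), (2, 6), (3, 7),   # vertical edges
+]
+
+
+def wireframe_points(corners):
+    """Flatten 8 corners into the point pairs a LINE_LIST marker consumes."""
+    return [corners[i] for edge in BOX_EDGES for i in edge]
+```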
+
+---
+
+## Phase 8: Image Storage (utils/storage.py)
+
+```python
+class ImageSaveQueue:
+ """Background worker for saving object images to disk."""
+
+ def __init__(self, output_dir: Path, max_queue_size: int = 100) -> None: ...
+ def start(self) -> None: ...
+ def stop(self) -> None: ...
+ def queue_save(self, object_id: str, image: np.ndarray, mask: np.ndarray | None = None) -> str: ...
+```
+
+Image storage:
+- Async worker writes `.npy` (and optional `.png`) crops to disk.
+- Queue is bounded to avoid blocking the main mapping loop (worker pattern sketched below).
+- Viewpoint snapshots are saved as `viewpoint_{id}.png` and
+ `viewpoint_{id}_transform.npy` in `viewpoint_output_dir` (timestamped per run).
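+
+A minimal sketch of the bounded background-writer pattern (filenames, the
+drop-on-full policy, and the omission of mask handling are assumptions for
+illustration only):
+
+```python
+import queue
+import threading
+from pathlib import Path
+
+import numpy as np
+
+
+class BoundedImageWriter:
+    """Background writer that drops work instead of blocking the mapping loop."""
+
+    def __init__(self, output_dir: Path, max_queue_size: int = 100) -> None:
+        self._dir = Path(output_dir)
+        self._dir.mkdir(parents=True, exist_ok=True)
+        self._queue: queue.Queue = queue.Queue(maxsize=max_queue_size)
+        self._worker = threading.Thread(target=self._run, daemon=True)
+
+    def start(self) -> None:
+        self._worker.start()
+
+    def stop(self) -> None:
+        self._queue.put(None)  # sentinel shuts the worker down
+        self._worker.join()
+
+    def queue_save(self, object_id: str, image: np.ndarray) -> str:
+        path = self._dir / f"{object_id}.npy"
+        try:
+            self._queue.put_nowait((path, image))
+        except queue.Full:
+            pass  # prefer dropping a frame over stalling the mapping callback
+        return str(path)
+
+    def _run(self) -> None:
+        while (item := self._queue.get()) is not None:
+            path, image = item
+            np.save(path, image)
+```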
+
+---
+
+## Phase 9: Launch Files
+
+```python
+# semantic_mapping.launch.py
+DeclareLaunchArgument('platform', default_value='mecanum_sim')
+DeclareLaunchArgument('detection_topic', default_value='/detection_result')
+DeclareLaunchArgument('cloud_topic', default_value='/registered_scan')
+DeclareLaunchArgument('odom_topic', default_value='/state_estimation')
+DeclareLaunchArgument('voxel_size', default_value='0.03')
+DeclareLaunchArgument('confidence_threshold', default_value='0.30')
+DeclareLaunchArgument('target_object', default_value='')
+```
+
+---
+
+## Implementation Checklist
+
+### Completed
+- [x] Create `CMakeLists.txt` for message generation
+- [x] Port message definitions from tare_planner
+- [x] Set up `package.xml` with dependencies
+- [x] Implement `map_object.py` with VoxelFeatureManager, MapObject, Observation
+- [x] Port lazy computation patterns (centroid, bbox3d, regularization)
+- [x] Refactor `map_object.py` to use shared utilities from `vector_perception_utils`
+- [x] Add `normalize_angle`, `discretize_angles` to `transform_utils.py`
+- [x] Add `get_oriented_bbox` to `pointcloud_utils.py`
+- [x] Simplify `_compute_geometry()` using Open3D oriented bbox
+- [x] Implement `utils/matching.py` with IoU, distance, merge-compatibility functions
+- [x] Implement `mapper.py` with ObjectDB-style tiering
+- [x] Implement cloud-image projection helpers in `vector_perception_utils/pointcloud_utils.py`
+- [x] Implement `utils/map_object_utils.py` message conversion functions
+- [x] Implement `utils/storage.py` with async queue
+- [x] Implement `mapping_node.py` following detect_anything patterns
+- [x] Add `RoomType.msg` and room-type VLM flow + visualization
+- [x] Add `vlm_reasoning_node.py` with Moondream-backed reasoning
+- [x] Remove `tare_message_bridge` and `/semantic` topic namespace
+
+### Remaining
+- [x] Create launch files and params.yaml
+- [x] Integration testing with bag files
+
+---
+
+## Code Cleanup Items
+
+Items from VLM_ROS to preserve:
+- Memory fixes with `.copy()` on detection data extraction
+- VLM query threshold (`best_image_mask_area > 500`)
+
+Items to remove:
+1. Unused publishers: `/cloud_image`
+2. Unused methods: `serialize_objs_to_bag()`, `is_merge_allowed()`, `handle_object_query()`
+3. Review constants: `BACKGROUND_OBJECTS`, `VERTICAL_OBJECTS`, `BIG_OBJECTS`, `SMALL_OBJECTS`
+4. Don't port: `byte_track`, `Grounded-SAM-2`, `mmconfig` directories
+
+---
+
+## Compatibility Checklist
+
+- [x] `/object_nodes` publishes ObjectNode with exact schema (no `/semantic` prefix or bridge)
+- [x] `/obj_points` publishes colored PointCloud2
+- [x] `/obj_boxes` publishes MarkerArray wireframes
+- [x] `/obj_labels` publishes MarkerArray text
+- [x] `/object_type_query` publishes ObjectType for VLM
+- [x] `/room_type_query` publishes RoomType for VLM
+- [x] Subscribes to `/detection_result` from detect_anything
+- [x] Subscribes to `/registered_scan` PointCloud2
+- [x] Subscribes to `/state_estimation` Odometry
+- [x] Subscribes to `/target_object_instruction`
+- [x] Subscribes to `/viewpoint_rep_header`
+- [x] Subscribes to `/object_type_answer`
+- [x] Subscribes to `/room_type_answer`
+- [x] Object positions, bbox corners, status flags match expected semantics
+- [x] Delete markers properly remove objects from RViz
+
+---
+
+## Message Type Alignment (Planner Integration)
+
+Planner/VLM traffic now uses `semantic_mapping/msg/*` end-to-end. The
+previous package-level mismatch has been removed, so no bridge is required.
+
+### Impact
+
+- `semantic_mapping` publishes and subscribes to `semantic_mapping/*` messages
+ on planner topic names (no `/semantic` prefix required).
+
+---
+
+## Recent Updates (Jan 2026)
+
+- Unified launch to `semantic_mapping.launch.py` with mode handling, detect_anything integration, and bagfile defaults.
+- Fixed cloud/image projection transform to match legacy `CloudImageFusion` math.
+- Added mask erosion, depth-jump pruning, distance gating, and light outlier filtering for cleaner object clouds.
+- Added reprojected observation-angle voting for better multi-view aggregation.
+- Fixed bbox ordering and switched `/obj_boxes` to axis-aligned markers (via `get_oriented_bbox(axis_aligned=True)`).
+- Replaced DBSCAN regularization with vote-weighted filtering + voxel connected components.
+- Added `regularize_voxel_size` to grow connectivity for thin structures.
+- Simplified ground filtering to radius-based Z clamp around robot.
+- Added `visualization_max_objects` to keep recent objects visible in RViz.
+- Logging: mapping stats (total/total_deleted/total_permanent) + mapping loop time.
+- Fixed image saving color order and ensured output dir is cleared on start.
+- Set MobileCLIP weights cache to `detect_anything/models` and removed corrupted caches.
+- Viewpoint snapshots are saved via `ImageSaveQueue` with timestamped run folders.
+- Added `RoomType.msg` and VLM room-type query/answer with room label visualization.
+- Removed `/semantic` topic namespace and bridge in favor of unified message types.
+- Moved `project_bbox3d_equirect` into `vector_perception_utils` to drop `dimos` dependency.
+- Added Moondream request timeouts and escaped JSON prompt templates to avoid hangs.
+- Added object-type query fallback when target labels are absent.
+- Stabilized room label colors and trimmed non-essential logging to debug level.
+
+## Temporary Compatibility Notes
+
+- Planner and mapping nodes now share `semantic_mapping/msg/*`; no bridge is required.
+
+## Temporary Handoff Notes
+
+- Regularization is now connected-components; tune `regularize_voxel_size` and keep ratio
+ (`MapObject.regularize_shape` percentile) if legs/sparse parts are dropped.
+- Ground filter uses `ground_filter`, `ground_radius`, `ground_z_clearance`.
+- If bagfile sync issues persist, consider switching mapping inputs to `/sensor_scan` + `/state_estimation_at_scan`.
+- Verify VLM objects.yaml path in VLM_ROS (`vlm_reasoning_node` still expects old path).
+
+---
+
+## Planner Dependencies
+
+The exploration planner (`sensor_coverage_planner_ground.cpp` in tare_planner) is tightly coupled to `ObjectNode`:
+- Subscribes to `/object_nodes` and stores per-object reps
+- Uses object positions, bbox corners, clouds, status, `is_asked_vlm`, and `img_path` for navigation decisions
+
+**Implication**: Keep the `ObjectNode` schema stable. Any new fields from the `MapObject` model can be mapped into extensions, but existing fields and semantics must remain for planner compatibility.
diff --git a/semantic_mapping/docs/TARE_PLANNER_VLM_DIFF_AND_MIGRATION.md b/semantic_mapping/docs/TARE_PLANNER_VLM_DIFF_AND_MIGRATION.md
new file mode 100644
index 0000000..471e99c
--- /dev/null
+++ b/semantic_mapping/docs/TARE_PLANNER_VLM_DIFF_AND_MIGRATION.md
@@ -0,0 +1,733 @@
+# TARE Planner (VLM_ROS vs Autonomy Stack) Diff + VLM Logic Deep Dive
+
+## 0) Purpose and Scope
+This document compares the two `tare_planner/src` trees and provides a detailed
+breakdown of the VLM/room logic inside
+`VLM_ROS/src/exploration_planner/tare_planner/src/sensor_coverage_planner/sensor_coverage_planner_ground.cpp`.
+It also proposes a migration path into a new `vector_perception_ros`
+package named `sensor_coverage_planner`, including room segmentation.
+
+### 0.1 Migration Status (Feb 2026)
+Completed:
+- `tare_planner` in autonomy-stack now uses `semantic_mapping/msg/*` in
+ `representation` and `sensor_coverage_planner_ground` (headers + source).
+- `tare_planner` CMake and package dependencies updated for `semantic_mapping`.
+- `navigationBoundary` publisher code matches VLM_ROS.
+- Added `boundary_sim.ply` to `tare_planner/data` for sim launch.
+- `explore_world.launch` and `explore_world_sim.launch` default
+ `use_boundary:=false` (no boundary published unless explicitly enabled).
+- Room segmentation node moved to `vector_perception_ros/sensor_coverage`.
+
+Open items / verification:
+- Re-check `system_bagfile_with_exploration_planner.launch.py` alignment with
+ VLM_ROS (file was reverted during edits).
+- Ensure `tare_planner` install includes `boundary_sim.ply` after rebuild.
+- Remove `pub_room_boundary_tmp_` declaration in room segmentation header
+ if no longer used.
+
+Scope:
+- Compare all package/file differences between:
+ - `autonomy_stack_mecanum_wheel_platform/src/exploration_planner/tare_planner/src`
+ - `VLM_ROS/src/exploration_planner/tare_planner/src`
+- Map all ROS inputs/outputs (topics + message types) for the nodes in scope.
+- Deep dive on the main VLM logic file (SensorCoveragePlanner3D).
+- Feasibility + cleanup/migration plan into `vector_perception_ros`.
+
+## 1) Context From Existing Migration Docs (Short)
+Key context pulled from:
+- `vector_perception_ros/vlm/docs/VLM_INPUTS_AND_SELECTION.md`
+- `vector_perception_ros/vlm/docs/VLM_NODE_MIGRATION.md`
+- `vector_perception_ros/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md`
+
+Highlights relevant to `tare_planner`:
+- Room-type VLM images are produced inside `sensor_coverage_planner_ground.cpp`
+ via `project_pcl_to_image(...)` (cropped panorama + room mask). Viewpoint
+ snapshots are **not** used for room type.
+- Object-type VLM uses the "best image" crop from semantic mapping, not planner.
+- Target-object spatial queries use viewpoint snapshots and 3D bbox projection.
+- Message types are now unified under `semantic_mapping/msg/*`; no bridge is
+ required.
+- Real-time constraints: avoid heavy per-parameter checks and excessive
+ try/except patterns in hot loops.
+
+## 2) Directory and File Differences
+
+### 2.1 Directory-Level Diff
+Only in `VLM_ROS/src/exploration_planner/tare_planner/src`:
+- `representation/`
+ - `representation.cpp`
+- `room_segmentation/`
+ - `room_segmentation.cpp`
+ - `room_segmentation_incremental_normal.cpp`
+ - `room_segmentation_backup.cpp`
+
+All other directories are present in both trees.
+
+### 2.2 File-Level Diff (Common Directories)
+Files that differ (content changes):
+- `grid_world/grid_world.cpp`
+- `local_coverage_planner/local_coverage_planner.cpp`
+- `planning_env/planning_env.cpp`
+- `rolling_occupancy_grid/rolling_occupancy_grid.cpp`
+- `sensor_coverage_planner/sensor_coverage_planner_ground.cpp`
+- `utils/misc_utils.cpp`
+- `viewpoint/viewpoint.cpp`
+- `viewpoint_manager/viewpoint_manager.cpp`
+
+Summary of differences by file:
+
+**`grid_world.cpp`**
+- Adds room-level state and navigation gates:
+ - `current_room_id_`, `transit_across_room_`, `room_finished_`,
+ `room_near_finished_`, `door_position_`.
+- Adds object/anchor state gates:
+ - `object_found_`, `anchor_object_found_`, `found_object_position_`.
+- Reads room resolution + voxel dimensions (`room_x/y/z`) and computes `shift_`.
+- Restricts cell status transitions to cells that are "in current room".
+- Adds explicit pathing for:
+ - transit across rooms (door path),
+ - found object path,
+ - anchor object TSP path (dynamic mode).
+- Adds `InLocalPlanningHorizonWithoutRoom` gating for roadmap connections.
+
+**`local_coverage_planner.cpp`**
+- Introduces room-aware viewpoint selection:
+ - `viewpoint_roomcandidate_indices` and
+ `ViewPointRoomCandidate` sampling.
+- Lookahead and boundary viewpoint logic now bypasses room gating when needed:
+ - `InLocalPlanningHorizonWithoutRoom(...)`.
+- Adds `transit_across_room_`, `object_found_`, `anchor_object_found_` gates.
+- When transiting/target found: local planning becomes a constrained path
+ using navigation viewpoints rather than exploration sampling.
+
+**`planning_env.cpp`**
+- Adds door collision checks and parameters:
+ - `kDoorCollisionCheckRadius`, `kDoorCollisionCheckPointNumThr`.
+- Adds occupancy-grid line-of-sight hooks:
+ - `CheckLineOfSightInOccupancyGrid(...)`
+ - `Pos2Sub(...)`, `InRange(...)`.
+- Adds `occupied_cloud` storage for room segmentation inputs.
+- Improves safety on coverage-boundary usage (warn when empty).
+
+**`rolling_occupancy_grid.cpp`**
+- Adds sensor-range-limited ray tracing and keeps "COVERED" cells.
+- Adds visibility / line-of-sight check for ray casting:
+ - `CheckLineOfSight(...)`, `PublishVoxels(...)`.
+- Adds helper conversions `Pos2Sub(...)`, `InRange(...)`.
+- Adds parameters for viewpoint collision margin Z.
+- Publishes ray voxels for debugging.
+
+**`utils/misc_utils.cpp`**
+- Adds `SetIntersection(...)` (used for viewpoint rep updates).
+- Adds BFS path helper `find_path_bfs(...)` (room transitions).
+- Adds `idToColor(...)` and HSV conversion (stable room coloring).
+- Adds voxel mapping helpers:
+ - `point_to_voxel(...)`, `voxel_to_point(...)`,
+ cropped variants for room masks.
+
+**`viewpoint.cpp`**
+- Adds room flags to each viewpoint:
+ - `in_room_`, `is_room_candidate_`.
+
+**`viewpoint_manager.cpp`**
+- Reads room parameters and maintains `room_mask_`.
+- Adds `CheckViewPointRoomBoundaryCollision()` to set `in_room_`.
+- Adds room candidate flagging and exposure in candidate selection.
+- Adds `InLocalPlanningHorizonWithoutRoom(...)` to allow pathing near doors.
+- Adds freespace cloud publication and room gates on candidate viewpoints.
+
+**`sensor_coverage_planner_ground.cpp`**
+- Huge expansion to integrate:
+ - `representation` (viewpoint, room, object).
+ - `room_segmentation` inputs.
+ - VLM room/object query + navigation logic.
+ - Object visibility graph and viewpoint snapshots.
+ - Room navigation state machine (finish, change, transit).
+- Full deep dive in section 4.
+
+### 2.3 Common Core Between the Two `sensor_coverage_planner_ground.cpp` Versions
+These are shared in both the autonomy-stack and VLM_ROS files (the VLM version
+layers additional room/object logic on top).
+
+**Common node lifecycle**
+- `SensorCoveragePlanner3D` class, 1 Hz `execute()` timer.
+- Same init chain: `ReadParameters()` → `InitializeData()` → `initialize()`.
+- Shared managers and core members: `keypose_graph_`, `planning_env_`,
+ `grid_world_`, `local_coverage_planner_`, `viewpoint_manager_`,
+ `visualizer_`, plus the same PCL debug clouds.
+
+**Common subscriptions**
+- `/exploration_start`, `/state_estimation_at_scan`, `/registered_scan`
+- `/terrain_map`, `/terrain_map_ext`
+- `/coverage_boundary`, `/navigation_boundary`, `/nogo_boundary`
+- `/joy`, `/reset_waypoint`
+
+**Common publishers**
+- `global_path_full`, `global_path`, `local_path`, `exploration_path`
+- `/way_point`, `exploration_finish`
+- `runtime_breakdown`, `/runtime`, `momentum_activation_count`
+
+**Common planning pipeline in `execute()`**
+- Start gate on `/exploration_start` unless `kAutoStart`.
+- First-cycle bootstrap: `SendInitialWaypoint()` + start-time init.
+- On `keypose_cloud_update_`:
+ - `CountDirectionChange()`
+ - `UpdateGlobalRepresentation()`
+ - `UpdateViewPoints()` and `UpdateKeyposeGraph()`
+ - `UpdateViewPointCoverage()` + `UpdateCoveredAreas()`
+ - `GlobalPlanning()` + `LocalPlanning()`
+ - `ConcatenateGlobalLocalPath()` + `GetLookAheadPoint()`
+ - `PublishWaypoint()` + visualization + runtime publishing
+
+**Common helper functions**
+- `SendInitialWaypoint`, `UpdateKeyposeGraph`, `UpdateViewPoints`,
+ `UpdateViewPointCoverage`, `UpdateCoveredAreas`, `UpdateGlobalRepresentation`
+- `GlobalPlanning`, `LocalPlanning`
+- `PublishGlobalPlanningVisualization`, `PublishLocalPlanningVisualization`
+- `ConcatenateGlobalLocalPath`, `GetLookAheadPoint`, `PublishWaypoint`,
+ `PublishRuntime`, `CountDirectionChange`
+
+## 3) ROS I/O Mapping
+
+### 3.1 `SensorCoveragePlanner3D` (VLM_ROS)
+Node: `tare_planner_node` creates and spins `SensorCoveragePlanner3D`.
+
+Subscriptions (inputs):
+- `/exploration_start` (`std_msgs/Bool`)
+- `/state_estimation_at_scan` (`nav_msgs/Odometry`)
+- `/registered_scan` (`sensor_msgs/PointCloud2`)
+- `/terrain_map` (`sensor_msgs/PointCloud2`)
+- `/terrain_map_ext` (`sensor_msgs/PointCloud2`)
+- `/coverage_boundary` (`geometry_msgs/PolygonStamped`)
+- `/navigation_boundary` (`geometry_msgs/PolygonStamped`)
+- `/current_room_boundary` (`geometry_msgs/PolygonStamped`) [declared param, used in room manager]
+- `/nogo_boundary` (`geometry_msgs/PolygonStamped`)
+- `/joy` (`sensor_msgs/Joy`)
+- `/reset_waypoint` (`std_msgs/Empty`)
+- `/object_nodes` (`tare_planner/ObjectNode`)
+- `/door_cloud` (`sensor_msgs/PointCloud2`)
+- `/room_nodes_list` (`tare_planner/RoomNodeList`)
+- `/goal_point` (`geometry_msgs/PointStamped`)
+- `/room_mask` (`sensor_msgs/Image`)
+- `/camera/image` (`sensor_msgs/Image`)
+- `/room_type_answer` (`tare_planner/RoomType`)
+- `/room_navigation_answer` (`tare_planner/VlmAnswer`)
+- `/keyboard_input` (`std_msgs/String`)
+- `/target_object_instruction` (`tare_planner/TargetObjectInstruction`)
+- `/target_object_answer` (`tare_planner/TargetObject`)
+- `/anchor_object_answer` (`tare_planner/TargetObject`)
+
+Publishers (outputs):
+- `global_path_full` (`nav_msgs/Path`)
+- `global_path` (`nav_msgs/Path`)
+- `old_global_path` (`nav_msgs/Path`)
+- `to_nearest_global_subspace_path` (`nav_msgs/Path`)
+- `local_path` (`nav_msgs/Path`)
+- `exploration_path` (`nav_msgs/Path`)
+- `/way_point` (`geometry_msgs/PointStamped`)
+- `exploration_finish` (`std_msgs/Bool`)
+- `runtime_breakdown` (`std_msgs/Int32MultiArray`)
+- `/runtime` (`std_msgs/Float32`)
+- `momentum_activation_count` (`std_msgs/Int32`)
+- `pointcloud_manager_neighbor_cells_origin` (`geometry_msgs/PointStamped`)
+- `viewpoint_rep_header` (`tare_planner/ViewpointRep`)
+- `object_visibility_connections` (`visualization_msgs/MarkerArray`)
+- `viewpoint_object_visibility` (`std_msgs/String`) [currently unused]
+- `/door_position` (`geometry_msgs/PointStamped`)
+- `/door_normal` (`visualization_msgs/Marker`)
+- `/room_cloud` (`sensor_msgs/PointCloud2`)
+- `/room_type_query` (`tare_planner/RoomType`)
+- `/room_early_stop_1` (`tare_planner/RoomEarlyStop1`)
+- `/room_type_vis` (`visualization_msgs/MarkerArray`)
+- `viewpoint_room_ids` (`visualization_msgs/MarkerArray`)
+- `/room_navigation_query` (`tare_planner/NavigationQuery`)
+- `/object_node_markers` (`visualization_msgs/MarkerArray`)
+- `/chosen_room_boundary` (`visualization_msgs/Marker`)
+- `/target_object_query` (`tare_planner/TargetObject`)
+- `/anchor_object_query` (`tare_planner/TargetObject`)
+- `/target_object_spatial_query` (`tare_planner/TargetObjectWithSpatial`)
+- `/room_anchor_point` (`geometry_msgs/PointStamped`)
+
+Key parameters (VLM additions):
+- `room_resolution`, `room_x`, `room_y`, `room_z`
+- `rep_threshold_`, `kRepSensorRange`
+- `kRushRoomDist_1`, `kRushRoomDist_2`
+- `kCollisionGrid*`, `kCoverage*` (inherited)
+
+I/O delta vs autonomy_stack:
+- Autonomy version uses only exploration, map, boundary, joystick, and path
+ topics. VLM_ROS adds: room segmentation, object nodes, VLM queries/answers,
+ viewpoint reps, visibility markers, navigation queries, door clouds, room
+ masks, camera image pipeline.
+
+### 3.2 `RoomSegmentationNode` (VLM_ROS)
+Node: `room_segmentation_node` (standalone main in `room_segmentation.cpp`).
+
+Subscriptions:
+- `/registered_scan` (`sensor_msgs/PointCloud2`)
+- `/state_estimation` (`nav_msgs/Odometry`)
+- `/occupied_cloud` (`sensor_msgs/PointCloud2`)
+- `/freespace_cloud` (`sensor_msgs/PointCloud2`)
+
+Publishers:
+- `/explore_areas_new` (`sensor_msgs/PointCloud2`)
+- `/room_mask_vis` (`sensor_msgs/Image`)
+- `/room_mask` (`sensor_msgs/Image`)
+- `/door_cloud` (`sensor_msgs/PointCloud2`)
+- `/debug_cloud` (`sensor_msgs/PointCloud2`)
+- `/free_cloud_1` (`sensor_msgs/PointCloud2`)
+- `/current_room_boundary` (`geometry_msgs/PolygonStamped`)
+- `/navigation_boundary` (`geometry_msgs/PolygonStamped`)
+- `/room_boundaries` (`visualization_msgs/MarkerArray`)
+- `/walls` (`sensor_msgs/PointCloud2`)
+- `/room_nodes` (`tare_planner/RoomNode`)
+- `/room_nodes_list` (`tare_planner/RoomNodeList`)
+- `/room_map_cloud` (`sensor_msgs/PointCloud2`)
+
+Key parameters:
+- `room_resolution`, `room_x/y/z`, `rolling_occupancy_grid.resolution_x`
+- `exploredAreaVoxelSize`, `ceilingHeight_`, `wall_thres_height_`
+- `region_growing_radius`, `min_room_size`, `dilation_iteration`
+- `normal_search_num`, `normal_search_radius`
+- `kViewPointCollisionMarginZPlus`, `kViewPointCollisionMarginZMinus`
+
+### 3.3 `navigationBoundary` Node
+Node: `navigationBoundary` (file-based boundary publisher).
+
+Publishers:
+- `/navigation_boundary` (`geometry_msgs/PolygonStamped`)
+
+Inputs:
+- File on disk via `boundary_file_dir` param
+- Params: `sendBoundary`, `sendBoundaryInterval`
+
+### 3.4 Legacy `SensorCoveragePlanner3D` Outputs (Autonomy Stack)
+These are the outputs of the **old** planner that must be preserved/relayed
+so navigation keeps working unchanged.
+
+**Primary outputs (used by navigation stack + tooling)**
+- `global_path_full` (`nav_msgs/Path`)
+- `global_path` (`nav_msgs/Path`)
+- `old_global_path` (`nav_msgs/Path`)
+- `to_nearest_global_subspace_path` (`nav_msgs/Path`)
+- `local_path` (`nav_msgs/Path`)
+- `exploration_path` (`nav_msgs/Path`)
+- `/way_point` (`geometry_msgs/PointStamped`)
+- `exploration_finish` (`std_msgs/Bool`)
+- `runtime_breakdown` (`std_msgs/Int32MultiArray`)
+- `/runtime` (`std_msgs/Float32`)
+- `momentum_activation_count` (`std_msgs/Int32`)
+- `pointcloud_manager_neighbor_cells_origin` (`geometry_msgs/PointStamped`)
+
+**Debug/visualization outputs (PCLCloud wrappers)**
+These are published via `PCLCloud::Publish()` in the old file and appear as
+topics with the names passed at construction time:
+- `keypose_cloud`, `registered_scan_stack`, `registered_cloud`
+- `terrain_cloud_large`, `terrain_collision_cloud`, `terrain_ext_collision_cloud`
+- `viewpoint_vis_cloud`, `viewpoint_in_collision_cloud_`
+- `grid_world_vis_cloud`, `bspline_path_cloud`, `selected_viewpoint_vis_cloud`
+- `collision_cloud`, `lookahead_point_cloud`, `keypose_graph_cloud`
+- `pointcloud_manager_cloud`, `reordered_global_subspace_cloud`
+
+If you only want to keep strict navigation compatibility, the **primary**
+outputs above are the ones that must be relayed; debug topics can be optional.
+
+## 4) SensorCoveragePlanner3D VLM Logic: Deep Dive
+
+This section breaks down the VLM/room logic inside
+`sensor_coverage_planner_ground.cpp` with explicit loops, data dependencies,
+and I/O.
+
+### 4.0 Legacy Core Data Structures Reused by VLM Logic
+These are **old planner data structures** that the VLM logic depends on and
+assumes will keep behaving the same way. They should stay intact when migrating
+the VLM logic into `vector_perception_ros`.
+
+**Core managers (shared across old + new logic)**
+- `planning_env_`: provides coverage, collision cloud, occupancy-grid access,
+ and diff/stacked clouds used by room updates and visibility checks.
+- `grid_world_`: global exploration state, cell status, and return-home logic.
+- `viewpoint_manager_`: viewpoint candidates, LOS/connectivity, and local horizon.
+- `keypose_graph_`: shortest-path computations for objects and room navigation.
+- `local_coverage_planner_`: local path TSP and waypoint ordering.
+- `visualizer_`: markers for local/global planning, paths, and debug views.
+
+**State gating and control**
+- `keypose_cloud_update_`: the main update latch that drives each cycle.
+- `robot_position_`, `robot_yaw_`, `moving_direction_`, `moving_forward_`:
+ pose and motion state used throughout VLM + navigation logic.
+- `lookahead_point_`, `lookahead_point_direction_`:
+ main input to waypoint publishing.
+- `visited_positions_`: used for coverage heuristics and candidate pruning.
+
+**Cloud buffers and derived products**
+- `keypose_cloud_`, `registered_scan_stack_`, `registered_cloud_`:
+ keypose + scan accumulation and downsampling.
+- `terrain_*` clouds: terrain height and collision data for viewpoint filtering.
+- `collision_cloud_`, `viewpoint_vis_cloud_`, `viewpoint_in_collision_cloud_`:
+ inputs to viewpoint candidate selection and visualization.
+- `grid_world_vis_cloud_`, `exploration_path_cloud_`,
+ `selected_viewpoint_vis_cloud_`, `lookahead_point_cloud_`,
+ `keypose_graph_vis_cloud_`, `pointcloud_manager_neighbor_cloud_`:
+ shared debug channels for planners and RViz.
+
+**Planning state**
+- `exploration_path_` and intermediate global/local paths.
+- `robot_viewpoint_`: robot coverage proxy for occupancy updates.
+
+These structures are **not VLM-specific**, but the new VLM logic reads from
+them heavily (coverage clouds, keypose graph distances, and viewpoint manager
+state), so they must be preserved in the new node.
+
+### 4.1 Core Data Structures and State
+**Representation layer**
+- `representation_` (new in VLM_ROS)
+ - `ViewPointRep`: viewpoint snapshots + visibility lists.
+ - `RoomNodeRep`: room labels, anchor point, polygon, room mask.
+ - `ObjectNodeRep`: object attributes + visible viewpoints.
+
+**Room grid**
+- `room_mask_`, `room_mask_old_`: `cv::Mat` integer room id maps.
+- `room_voxel_dimension_`, `room_resolution_`, `shift_`: map voxel grid.
+- `current_room_id_`, `target_room_id_`, `start_room_id_`, `end_room_id_`.
+- `prev_room_id_`, `previous_room_id_`: used for split/merge handling.
+- `adjacency_matrix`: room connectivity from door clouds.
+
+**Door / transition state**
+- `door_cloud_`, `door_cloud_final_`, `door_cloud_in_range_`.
+- `door_position_`, `door_normal_`.
+- `transit_across_room_`, `at_room_`, `near_room_1_`, `near_room_2_`.
+- `enter_wrong_room_`, `ask_vlm_change_room_`.
+
+**VLM query state**
+- `ask_vlm_near_room_`, `ask_vlm_finish_room_`, `asked_in_advance_`.
+- `room_navigation_query_counter_`, `room_finished_counter_`.
+- `candidate_room_position_`, `has_candidate_room_position_`.
+
+**Object state**
+- `found_object_`, `found_object_id_`, `found_object_distance_`, `ask_found_object_`.
+- `found_anchor_object_`, `anchor_object_`, `found_anchor_object_viewpoint_positions_`.
+- `obj_score_`, `considered_object_ids_`.
+
+**Viewpoint representation**
+- `viewpoint_reps_`, `curr_viewpoint_rep_node_ind`.
+- `add_viewpoint_rep_`, `rep_threshold_`, `rep_sensor_range`, `rep_threshold_voxel_num_`.
+- `current_obs_voxel_inds_`, `previous_obs_voxel_inds_`.
+
+**Camera + odom alignment**
+- `camera_image_`, `imageTime` (from `/camera/image`).
+- Odom stacks for interpolation: `odomTimeStack`, `lidarXStack`, etc.
+
+### 4.2 Input Callbacks (Data Ingestion)
+**StateEstimationCallback**
+- Updates robot position, yaw, motion direction.
+- Maintains odom ring buffers for time-aligned projection in `GetPoseAtTime`.
+- Updates `viewpoint_rep_msg_` header stamp/frame.
+
+**RegisteredScanCallback**
+- Aggregates scans, downsamples, updates planning environment.
+- Every N frames: pushes keypose node into graph and sets `keypose_cloud_update_`.
+
+**RoomMaskCallback**
+- Converts `/room_mask` image to `cv::Mat` (int32), resizes to room voxel grid.
+- Updates:
+ - `room_mask_`
+ - `viewpoint_manager_` room mask
+ - `grid_world_` room mask
+ - `representation_` viewpoint room ids (`UpdateViewpointRoomIdsFromMask`)
+
+**RoomNodeListCallback**
+- Ingests `/room_nodes_list` and updates representation room map.
+- Marks nodes as alive, removes dead nodes, validates anchor points against mask.
+
+**ObjectNodeCallback**
+- Updates representation object node from `/object_nodes`.
+- Maintains `latest_object_node_indices_` for visibility updates.
+
+**DoorCloudCallback**
+- Ingests `/door_cloud`, filters by collision, builds adjacency matrix.
+- Colors door point cloud for visualization.
+
+**CameraImageCallback**
+- Stores panoramic image for room crop pipeline.
+- Updates `imageTime` for projection alignment.
+
+**RoomTypeCallback**
+- Ingests `/room_type_answer`:
+ - Locates room id from anchor point and room mask.
+ - Accumulates voxel counts per label.
+ - If room label changes, marks objects in that room as unconsidered.
+
+**RoomNavigationAnswerCallback**
+- Ingests `/room_navigation_answer`.
+- `answer_type == 1`: early stop; immediately sets goal via anchor point.
+- `answer_type == 0`: room navigation answer; stores candidate room anchor.
+
+**TargetObjectInstructionCallback**
+- Resets target/anchor object context.
+- Clears considered flags on objects and resets room asked flags.
+
+**TargetObjectCallback / AnchorObjectCallback**
+- Marks object or anchor as found if VLM confirms.
+- Updates distances and stores object position + room id.
+
+### 4.3 Object Visibility Graph (Major Loop)
+**`UpdateObjectVisibility()`**
+- Input: `representation_->GetLatestObjectNodeIndicesMutable()` (new objects).
+- For each object id:
+ - Fetch object position + cloud, then convert cloud to voxel indices
+ via `Convert2Voxels(...)` (uses `planning_env_->Pos2Sub` + `InRange`).
+ - For each `ViewPointRep`:
+ - If viewpoint already references the object, mark direct visibility and
+ continue.
+ - If object has the viewpoint in its visible list, connect immediately.
+ - Else do ray-casting visibility:
+ - Compute viewpoint camera position (+0.265 z).
+ - Skip if outside `rep_sensor_range`.
+ - For each object voxel, call
+ `CheckRayVisibilityInOccupancyGrid(...)` which defers to
+ `rolling_occupancy_grid_->CheckLineOfSight(...)`.
+ - On first visible voxel:
+ - Add object index to viewpoint.
+ - Add viewpoint id to object.
+ - Mark object as not considered.
+ - Object-room relation:
+ - Map object centroid to room voxel -> room id.
+ - If room id is 0 (unknown), dilate search by 2 voxels to find a valid id.
+ - Call `representation_->SetObjectRoomRelation(...)`.
+- After processing all new objects:
+ - Compute `obj_score_` by ray-casting from current robot position to object
+ voxels, with higher weight if object has no visible viewpoints.
+ - This score drives `UpdateViewpointRep()` (adds viewpoint reps when high).
+
+**`UpdateViewpointObjectVisibility()`**
+- Runs when a new viewpoint rep is created.
+- For each object in `ObjectNodeRepMap`:
+ - Skip if already in current viewpoint visibility.
+ - Ray-cast from current viewpoint position to object voxels.
+ - On visibility: add to viewpoint + add viewpoint to object + clear considered.
+
+**Visualization**
+- `CreateVisibilityMarkers()`: publishes `MarkerArray` lines between viewpoints
+ and visible objects; color indicates direct vs inferred visibility.
+- `PublishViewpointRoomIdMarkers()`: publishes text markers `VR`.
+
+### 4.4 Viewpoint Representation Update
+**`UpdateViewpointRep()`**
+- Pulls `current_obs_voxel_inds_` from `planning_env_`.
+- Computes intersection with `previous_obs_voxel_inds_` via
+ `misc_utils_ns::SetIntersection`.
+- Adds a new viewpoint rep when (see the sketch after this list):
+ - `intersection_voxel_num < rep_threshold_voxel_num_` **and**
+ - `intersection_ratio < rep_threshold_`,
+ - or `obj_score_ > 4.0`.
+- On add:
+ - `representation_->AddViewPointRep(...)`.
+ - Set room id to `current_room_id_`.
+ - Publish `viewpoint_rep_header` only if a new rep was actually inserted.
+ - Update covered voxels and reset `latest_object_node_indices_`.
+- Publishes viewpoint rep cloud + all covered points for visualization.
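+
+A hedged Python paraphrase of that trigger (the source is C++ and uses
+`misc_utils_ns::SetIntersection` on voxel index vectors; the ratio's denominator
+here is an assumption):
+
+```python
+def should_add_viewpoint_rep(current_inds, previous_inds,
+                             rep_threshold, rep_threshold_voxel_num, obj_score):
+    """New rep when coverage overlap with the previous rep is low, or object pressure is high."""
+    intersection = len(set(current_inds) & set(previous_inds))
+    ratio = intersection / max(len(current_inds), 1)
+    return (intersection < rep_threshold_voxel_num and ratio < rep_threshold) or obj_score > 4.0
+```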
+
+### 4.5 Room and Door Transition Logic
+**`SetCurrentRoomId()`** (room transitions)
+- Uses current and previous robot voxels in `room_mask_`/`room_mask_old_`.
+- If `transit_across_room_` or `found_object_`: room id updates immediately.
+- If room id unchanged or uninitialized: update and return.
+- If room id changes:
+ - Case 1: `room_mask_old_` at new position already differs from current room
+ -> robot already knew it was a new room -> set `enter_wrong_room_ = true`.
+ - Case 2: overlap between old/new masks determines:
+  - Case 2: overlap between old/new masks determines (overlap test sketched below):
+ - split: otherwise -> set `ask_vlm_change_room_ = true` and update.
+
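+
+An illustrative numpy sketch of the overlap test (the source operates on `cv::Mat`
+room masks in C++; the direction of the ratio is an assumption, only the 80%
+threshold is taken from the description above):
+
+```python
+import numpy as np
+
+
+def old_room_overlap(room_mask_old: np.ndarray, room_mask_new: np.ndarray,
+                     old_id: int, new_id: int) -> float:
+    """Fraction of the old room's cells that now carry the new room id."""
+    old_cells = room_mask_old == old_id
+    if not old_cells.any():
+        return 0.0
+    return float(np.count_nonzero(room_mask_new[old_cells] == new_id)) / float(old_cells.sum())
+
+
+# merge if old_room_overlap(...) > 0.8, otherwise treat as a split and ask the VLM
+```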
+**`GoalPointCallback()`**
+- Resolves `target_room_id_` from room mask at goal point.
+- If goal room == current room and not transiting: reset room info.
+- Else sets `transit_across_room_ = true`, stores goal, publishes `/room_anchor_point`.
+
+**`SetStartAndEndRoomId()`**
+- Uses `misc_utils_ns::find_path_bfs` on `adjacency_matrix` to find a room path.
+- Chooses the last two rooms in the path as `start_room_id_`/`end_room_id_`.
+- Calls `SetRoomPosition(start_room_id_, end_room_id_)`.
+
+**`SetRoomPosition()`**
+- Filters `door_cloud_` for door points between `start_room_id_` and `end_room_id_`.
+- Uses door centroid as transition goal (`door_position_`).
+- Computes `door_normal_` via `GetDoorNormal(...)`:
+ - Sweeps angles, validates by checking room ids along normal and reverse.
+ - Averages valid normals for stability.
+- Publishes `/door_position` and `/door_normal` markers.
+- Publishes chosen room boundary marker.
+
+**`CheckDoorCloudInRange()`**
+- Finds door points within local planning horizon.
+- For unlabeled rooms:
+ - Computes candidate viewpoint near the door.
+ - Marks viewpoint as room candidate
+ (`viewpoint_manager_->SetViewPointRoomCandidate`).
+ - Publishes `door_cloud_in_range_`.
+
+### 4.6 Room Type Query Pipeline
+**`UpdateRoomLabel()`**
+- Pulls covered points in range from `planning_env_`.
+- For each room id:
+ - Accumulate voxel counts + per-room cloud + center estimate.
+- For **unlabeled rooms**:
+ - Update room node voxel count and mark as labeled.
+ - Build a room cloud (points + room center).
+ - Project to panorama via `project_pcl_to_image(...)`.
+ - Set anchor point and save image in `RoomNodeRep`.
+ - Publish `/room_type_query` (RoomType image + mask + anchor).
+ - If room is large enough and unvisited, may call `ChangeRoomQuery(...)`.
+- For **labeled rooms**:
+ - If voxel count increase or area growth threshold exceeded:
+ - Optionally re-project image.
+ - Publish updated `/room_type_query`.
+ - May call `ChangeRoomQuery(...)`.
+- If `ask_vlm_change_room_` set -> `ChangeRoomQuery(prev_room_id_, current_room_id_, true)`.
+
+**`project_pcl_to_image(...)`**
+- Transforms 3D points from world -> lidar -> camera frames.
+- Projects to equirectangular panorama.
+- Tracks horizontal pixel range for room points.
+- Adds special projection for room center and rotates the panorama so the center
+ is in the middle.
+- Crops the panorama to `[min_u, max_u]` with padding.
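+
+The equirectangular mapping itself is standard; a sketch in camera-frame
+coordinates (axis conventions and image size are assumptions, not the exact
+`project_pcl_to_image` math):
+
+```python
+import numpy as np
+
+
+def equirect_project(points_cam: np.ndarray, width: int, height: int) -> np.ndarray:
+    """Map camera-frame 3D points to (u, v) pixels on an equirectangular panorama."""
+    x, y, z = points_cam[:, 0], points_cam[:, 1], points_cam[:, 2]
+    r = np.maximum(np.linalg.norm(points_cam, axis=1), 1e-9)
+    yaw = np.arctan2(y, x)                        # azimuth in [-pi, pi]
+    pitch = np.arcsin(np.clip(z / r, -1.0, 1.0))  # elevation in [-pi/2, pi/2]
+    u = (0.5 - yaw / (2.0 * np.pi)) * width
+    v = (0.5 - pitch / np.pi) * height
+    return np.stack([u, v], axis=1)
+```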
+
+**`RoomTypeCallback()`**
+- Maps anchor point to room id via `room_mask_`.
+- Accumulates label votes based on `voxel_num`.
+- If label changes: clears `is_considered` on objects in the room.
+
+**`PublishRoomTypeVisualization()`**
+- Renders labeled rooms with stable color (via `idToColor`) and room id text.
+
+### 4.7 Object and Anchor VLM Queries
+**Target object flow**
+- `CheckObjectFound()`:
+ - If target found: update distance + room id and set `ask_found_object_` if
+ within thresholds.
+ - Sets path goal to found object's room anchor point.
+ - If found in current room: `SetFoundTargetObject()`.
+- If not found, iterate all objects:
+ - Must match target label, not considered, `is_asked_vlm_ == true`.
+ - Must have valid image path.
+ - Publish either:
+ - `/target_object_query` (no spatial condition), or
+ - `/target_object_spatial_query` (with viewpoint ids + bbox3d).
+ - Mark object as considered.
+
+**Anchor object flow** (dynamic mode)
+- `CheckAnchorObjectFound()`:
+ - Similar gating as target, but uses room condition and anchor label.
+ - Publishes `/anchor_object_query`.
+ - If anchor found:
+ - Collects visible viewpoint positions to build search tour.
+ - If anchor in current room: `SetFoundAnchorObject()`.
+
+### 4.8 Main `execute()` Loop (High-Level Flow)
+1. **Start gate**: waits for `/exploration_start` unless `kAutoStart`.
+2. **First init**:
+ - Send initial waypoint.
+ - Set `start_time_`, `global_direction_switch_time_`.
+ - `initialized_ = true`.
+3. **Every 1 Hz tick**:
+ - `ProcessObjectNodes()` (remove deleted objects).
+ - `CheckObjectFound()` and `CheckAnchorObjectFound()` if dynamic.
+ - Handle reset flag to clear coverage.
+4. **On `keypose_cloud_update_`**:
+ - `UpdateRoomLabel()`.
+ - `SetCurrentRoomId()`.
+ - If transiting: re-run `GoalPointCallback()` and `SetStartAndEndRoomId()`.
+ - If `at_room_`: reset room info and resume local coverage.
+ - `CountDirectionChange()`.
+ - `UpdateObjectVisibility()`.
+ - `UpdateGlobalRepresentation()` (grid world + keypose graph).
+ - `UpdateViewpointRep()` and `PublishViewpointRoomIdMarkers()`.
+ - If new viewpoint rep: `UpdateViewpointObjectVisibility()`.
+ - `CreateVisibilityMarkers()`.
+ - `UpdateViewPoints()` -> candidate count check.
+ - `CheckDoorCloudInRange()`, `UpdateKeyposeGraph()`.
+ - Coverage updates + uncovered area.
+ - Global + local TSP planning.
+ - Room finish handling:
+ - publish navigation query in advance if nearly finished.
+ - call `GetAnswer()` and mark room covered if finished.
+ - Build exploration path; compute lookahead; publish waypoint.
+ - Publish markers, runtime, room labels, object markers.
+
+### 4.9 Data Flow Summary (Text Graph)
+- `/room_mask` + `/room_nodes_list` -> `RoomNodeRep` + room id grid ->
+ room labels + anchor points -> `/room_type_query` + `/room_navigation_query`.
+- `/object_nodes` -> `ObjectNodeRep` -> visibility graph -> target/anchor queries.
+- `/camera/image` + odom stacks -> `project_pcl_to_image` -> `RoomType` image.
+- `/door_cloud` + room mask -> adjacency -> door/transition planning.
+- `rolling_occupancy_grid` -> line-of-sight -> visibility + viewpoint rep updates.
+
+## 5) Migration Feasibility and Cleanup Plan
+**Feasibility**: Yes, but the logic must be modularized to preserve real-time
+performance and avoid tight coupling between perception and exploration.
+
+### 5.1 Proposed Package Layout
+```
+vector_perception_ros/
+└── sensor_coverage_planner/
+ ├── include/sensor_coverage_planner/
+ │ ├── planner_core.h
+ │ ├── room_mask_manager.h
+ │ ├── room_query_builder.h
+ │ ├── object_visibility_manager.h
+ │ ├── room_transition_manager.h
+ │ └── representation/representation.h
+ ├── src/
+ │ ├── planner_core.cpp
+ │ ├── room_mask_manager.cpp
+ │ ├── room_query_builder.cpp
+ │ ├── object_visibility_manager.cpp
+ │ ├── room_transition_manager.cpp
+ │ ├── representation/representation.cpp
+ │ └── sensor_coverage_planner_node.cpp
+ ├── room_segmentation/
+ │ ├── room_segmentation_node.cpp
+ │ └── room_segmentation_utils.cpp
+ └── launch/
+```
+
+### 5.2 Migration Steps (Incremental)
+1. **Create new package** with same topic contracts as VLM_ROS.
+2. **Port `representation/` first**, keep interfaces stable.
+3. **Port room segmentation node** with exact topics:
+ - `/room_mask`, `/room_nodes_list`, `/door_cloud`.
+4. **Split planner into managers**:
+ - `RoomMaskManager`: room id updates + split/merge logic.
+ - `RoomQueryBuilder`: room-type query generation + image projection.
+ - `RoomTransitionManager`: door traversal, BFS, pathing.
+ - `ObjectVisibilityManager`: visibility graph + ray-casting.
+5. **Maintain message compatibility**:
+ - Use `semantic_mapping/msg/*` across planner and VLM nodes.
+6. **Integrate with semantic mapping**:
+ - `/object_nodes` and `/room_type_answer` are consumed directly (no prefix).
+7. **Validate on bag files** with topic parity checks.
+
+### 5.3 Cleanup Opportunities (Low Risk)
+- Extract `project_pcl_to_image` into a standalone utility.
+- Cache voxel indices for viewpoints and objects across frames.
+- Gate ray-casting by range + coarse visibility before full line-of-sight.
+- Centralize room-id lookup in one helper to avoid duplication.
+
+## 6) Risks and Compatibility Notes
+- **Message package mismatch**:
+  - Planner and VLM nodes now share `semantic_mapping/msg/*`; any leftover
+    `tare_planner/msg` consumers would need explicit bridging.
+- **Time alignment**:
+ - Image timestamp and odom interpolation must stay consistent.
+- **Room resolution mismatch**:
+ - All room-related components must share `room_resolution` and voxel dims.
+- **Visibility differences**:
+ - Changes in occupancy grid or ray tracing will affect VLM query triggers.
+- **Dynamic mode**:
+ - Anchor-object TSP logic must be preserved if used in deployment.
+
+## 7) Suggested Next Actions
+- Isolate `project_pcl_to_image` and add unit tests (math + crop bounds).
+- Build `RoomMaskManager` from existing code.
+- Add integration test for room-type query pipeline.
+- Verify planner/VLM topics use `semantic_mapping/msg/*` directly (no bridge).
diff --git a/semantic_mapping/launch/semantic_mapping.launch.py b/semantic_mapping/launch/semantic_mapping.launch.py
new file mode 100644
index 0000000..01bff4b
--- /dev/null
+++ b/semantic_mapping/launch/semantic_mapping.launch.py
@@ -0,0 +1,220 @@
+"""Launch semantic mapping and detect_anything with a mode selector."""
+
+import os
+
+from ament_index_python.packages import get_package_share_directory
+from launch import LaunchDescription
+from launch.actions import DeclareLaunchArgument, OpaqueFunction
+from launch.substitutions import LaunchConfiguration
+from launch_ros.actions import Node
+
+
+def _parse_bool(value: str, default: bool) -> bool:
+ if value == "":
+ return default
+ return value.lower() in ("true", "1", "yes")
+
+
+def _launch_nodes(context):
+ mode = LaunchConfiguration("mode").perform(context).lower()
+ use_compressed_arg = LaunchConfiguration("use_compressed").perform(context)
+ odom_topic_arg = LaunchConfiguration("odom_topic").perform(context)
+
+ use_sim_time = mode == "bagfile"
+ use_compressed = _parse_bool(use_compressed_arg, default=mode in ("bagfile", "sim"))
+
+ if odom_topic_arg:
+ odom_topic = odom_topic_arg
+ else:
+ odom_topic = "/aft_mapped_to_init_incremental" if mode == "bagfile" else "/state_estimation"
+
+ params_file = LaunchConfiguration("params_file").perform(context)
+ camera_param_path = LaunchConfiguration("camera_param_path").perform(context)
+ detection_topic = LaunchConfiguration("detection_topic").perform(context)
+ cloud_topic = LaunchConfiguration("cloud_topic").perform(context)
+ viewpoint_topic = LaunchConfiguration("viewpoint_topic").perform(context)
+ object_type_answer_topic = LaunchConfiguration("object_type_answer_topic").perform(context)
+ object_type_query_topic = LaunchConfiguration("object_type_query_topic").perform(context)
+ target_object_spatial_query_topic = LaunchConfiguration("target_object_spatial_query_topic").perform(
+ context
+ )
+ target_object_instruction_topic = LaunchConfiguration("target_object_instruction_topic").perform(
+ context
+ )
+ target_object_answer_topic = LaunchConfiguration("target_object_answer_topic").perform(context)
+ instruction_topic = LaunchConfiguration("instruction_topic").perform(context)
+ room_type_query_topic = LaunchConfiguration("room_type_query_topic").perform(context)
+ room_type_answer_topic = LaunchConfiguration("room_type_answer_topic").perform(context)
+ room_segmentation_params = LaunchConfiguration("room_segmentation_params").perform(context)
+ objects_yaml_path = LaunchConfiguration("objects_yaml_path").perform(context)
+ object_nodes_topic = LaunchConfiguration("object_nodes_topic").perform(context)
+ obj_points_topic = LaunchConfiguration("obj_points_topic").perform(context)
+ obj_boxes_topic = LaunchConfiguration("obj_boxes_topic").perform(context)
+ obj_labels_topic = LaunchConfiguration("obj_labels_topic").perform(context)
+ map_frame = LaunchConfiguration("map_frame").perform(context)
+ target_object = LaunchConfiguration("target_object").perform(context)
+ save_png = _parse_bool(LaunchConfiguration("save_png").perform(context), default=False)
+ output_dir = LaunchConfiguration("output_dir").perform(context)
+ image_topic = LaunchConfiguration("image_topic").perform(context)
+ compressed_image_topic = LaunchConfiguration("compressed_image_topic").perform(context)
+ conf = float(LaunchConfiguration("conf").perform(context))
+ max_area_ratio = float(LaunchConfiguration("max_area_ratio").perform(context))
+
+ detect_anything_node = Node(
+ package="detect_anything",
+ executable="detection_node",
+ name="detect_anything_detection_node",
+ output="screen",
+ parameters=[
+ {
+ "use_sim_time": use_sim_time,
+ "use_compressed": use_compressed,
+ "image_topic": image_topic,
+ "compressed_image_topic": compressed_image_topic,
+ "conf": conf,
+ "max_area_ratio": max_area_ratio,
+ }
+ ],
+ )
+
+ image_republisher_node = Node(
+ package="image_transport",
+ executable="republish",
+ name="camera_image_republisher",
+ output="screen",
+ remappings=[
+ ("in/compressed", compressed_image_topic),
+ ("out", image_topic),
+ ],
+ parameters=[
+ {
+ "use_sim_time": use_sim_time,
+ "in_transport": "compressed",
+ "out_transport": "raw",
+ }
+ ],
+ )
+
+ room_seg_params = []
+ if room_segmentation_params:
+ room_seg_params.append(room_segmentation_params)
+ room_seg_params.append({"use_sim_time": use_sim_time})
+
+ room_segmentation_node = Node(
+ package="sensor_coverage",
+ executable="room_segmentation_node",
+ name="room_segmentation_node",
+ output="screen",
+ parameters=room_seg_params,
+ )
+
+ semantic_mapping_node = Node(
+ package="semantic_mapping",
+ executable="semantic_mapping_node",
+ name="semantic_mapping_node",
+ output="screen",
+ parameters=[
+ params_file,
+ {
+ "use_sim_time": use_sim_time,
+ "camera_param_path": camera_param_path,
+ "detection_topic": detection_topic,
+ "cloud_topic": cloud_topic,
+ "odom_topic": odom_topic,
+ "viewpoint_topic": viewpoint_topic,
+ "object_type_answer_topic": object_type_answer_topic,
+ "object_type_query_topic": object_type_query_topic,
+ "target_object_spatial_query_topic": target_object_spatial_query_topic,
+ "target_object_instruction_topic": target_object_instruction_topic,
+ "object_nodes_topic": object_nodes_topic,
+ "obj_points_topic": obj_points_topic,
+ "obj_boxes_topic": obj_boxes_topic,
+ "obj_labels_topic": obj_labels_topic,
+ "map_frame": map_frame,
+ "target_object": target_object,
+ "save_png": save_png,
+ "output_dir": output_dir,
+ },
+ ],
+ )
+
+ vlm_reasoning_node = Node(
+ package="semantic_mapping",
+ executable="vlm_reasoning_node",
+ name="vlm_reasoning_node",
+ output="screen",
+ parameters=[
+ params_file,
+ {
+ "use_sim_time": use_sim_time,
+ "camera_param_path": camera_param_path,
+ "objects_yaml_path": objects_yaml_path,
+ "object_type_query_topic": object_type_query_topic,
+ "object_type_answer_topic": object_type_answer_topic,
+ "target_object_spatial_query_topic": target_object_spatial_query_topic,
+ "target_object_answer_topic": target_object_answer_topic,
+ "target_object_instruction_topic": target_object_instruction_topic,
+ "instruction_topic": instruction_topic,
+ "room_type_query_topic": room_type_query_topic,
+ "room_type_answer_topic": room_type_answer_topic,
+ }
+ ],
+ )
+
+ return [
+ detect_anything_node,
+ image_republisher_node,
+ room_segmentation_node,
+ semantic_mapping_node,
+ vlm_reasoning_node,
+ ]
+
+
+def generate_launch_description() -> LaunchDescription:
+ pkg_share = get_package_share_directory("semantic_mapping")
+ detect_anything_share = get_package_share_directory("detect_anything")
+ sensor_coverage_share = get_package_share_directory("sensor_coverage")
+ default_params = os.path.join(pkg_share, "config", "params.yaml")
+ default_camera_param = os.path.join(pkg_share, "config", "camera_param.yaml")
+ default_objects_yaml = os.path.join(detect_anything_share, "config", "objects.yaml")
+ default_room_seg_params = os.path.join(
+ sensor_coverage_share, "config", "room_segmentation_config.yaml"
+ )
+
+ return LaunchDescription(
+ [
+ DeclareLaunchArgument("mode", default_value="sim"),
+ DeclareLaunchArgument("use_compressed", default_value=""),
+ DeclareLaunchArgument("params_file", default_value=default_params),
+ DeclareLaunchArgument("camera_param_path", default_value=default_camera_param),
+ DeclareLaunchArgument("detection_topic", default_value="/detection_result"),
+ DeclareLaunchArgument("cloud_topic", default_value="/registered_scan"),
+ DeclareLaunchArgument("odom_topic", default_value=""),
+ DeclareLaunchArgument("viewpoint_topic", default_value="/viewpoint_rep_header"),
+ DeclareLaunchArgument("object_type_answer_topic", default_value="/object_type_answer"),
+ DeclareLaunchArgument("object_type_query_topic", default_value="/object_type_query"),
+ DeclareLaunchArgument(
+ "target_object_spatial_query_topic", default_value="/target_object_spatial_query"
+ ),
+ DeclareLaunchArgument("target_object_instruction_topic", default_value="/target_object_instruction"),
+ DeclareLaunchArgument("target_object_answer_topic", default_value="/target_object_answer"),
+ DeclareLaunchArgument("instruction_topic", default_value="/keyboard_input"),
+ DeclareLaunchArgument("room_type_query_topic", default_value="/room_type_query"),
+ DeclareLaunchArgument("room_type_answer_topic", default_value="/room_type_answer"),
+ DeclareLaunchArgument("room_segmentation_params", default_value=default_room_seg_params),
+ DeclareLaunchArgument("objects_yaml_path", default_value=default_objects_yaml),
+ DeclareLaunchArgument("object_nodes_topic", default_value="/object_nodes"),
+ DeclareLaunchArgument("obj_points_topic", default_value="/obj_points"),
+ DeclareLaunchArgument("obj_boxes_topic", default_value="/obj_boxes"),
+ DeclareLaunchArgument("obj_labels_topic", default_value="/obj_labels"),
+ DeclareLaunchArgument("map_frame", default_value="map"),
+ DeclareLaunchArgument("target_object", default_value="refrigerator"),
+ DeclareLaunchArgument("save_png", default_value="false"),
+ DeclareLaunchArgument("output_dir", default_value="output/object_images"),
+ DeclareLaunchArgument("image_topic", default_value="/camera/image"),
+ DeclareLaunchArgument("compressed_image_topic", default_value="/camera/image/compressed"),
+ DeclareLaunchArgument("conf", default_value="0.35"),
+ DeclareLaunchArgument("max_area_ratio", default_value="0.3"),
+ OpaqueFunction(function=_launch_nodes),
+ ]
+ )
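+
+
+# Example invocation (illustrative; substitute the file name this launch script is
+# installed under):
+#   ros2 launch semantic_mapping <this_launch_file> target_object:=chair save_png:=true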
diff --git a/semantic_mapping/msg/DetectionResult.msg b/semantic_mapping/msg/DetectionResult.msg
new file mode 100644
index 0000000..0f324ad
--- /dev/null
+++ b/semantic_mapping/msg/DetectionResult.msg
@@ -0,0 +1,9 @@
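+# Per-detection fields are parallel arrays: index i of track_id, x1, y1, x2, y2,
+# label, and confidence all describe the same detection.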
+std_msgs/Header header
+int32[] track_id
+float32[] x1
+float32[] y1
+float32[] x2
+float32[] y2
+string[] label
+float32[] confidence
+sensor_msgs/Image image
\ No newline at end of file
diff --git a/semantic_mapping/msg/NavigationQuery.msg b/semantic_mapping/msg/NavigationQuery.msg
new file mode 100644
index 0000000..5744672
--- /dev/null
+++ b/semantic_mapping/msg/NavigationQuery.msg
@@ -0,0 +1,3 @@
+string json
+sensor_msgs/Image[] images
+geometry_msgs/Point[] anchor_points
\ No newline at end of file
diff --git a/semantic_mapping/msg/ObjectNode.msg b/semantic_mapping/msg/ObjectNode.msg
new file mode 100644
index 0000000..b2d7686
--- /dev/null
+++ b/semantic_mapping/msg/ObjectNode.msg
@@ -0,0 +1,35 @@
+# Semantic object representation for planner integration.
+#
+# Each message represents a single object in the semantic map with its
+# 3D bounding box, point cloud, and metadata for VLM interaction.
+
+std_msgs/Header header
+
+# Object identity - may contain multiple merged track IDs
+int32[] object_id
+
+# Dominant class label (by weighted vote)
+string label
+
+# Object centroid in world frame
+geometry_msgs/Point position
+
+# 3D bounding box corners (8 points in world frame)
+# Order: [min_x,min_y,min_z], [max_x,min_y,min_z], [max_x,max_y,min_z], [min_x,max_y,min_z],
+# [min_x,min_y,max_z], [max_x,min_y,max_z], [max_x,max_y,max_z], [min_x,max_y,max_z]
+geometry_msgs/Point[8] bbox3d
+
+# Object point cloud in world frame
+sensor_msgs/PointCloud2 cloud
+
+# Lifecycle status: "new", "updated", "unchanged"
+string status
+
+# Path to best saved image crop of this object
+string img_path
+
+# VLM interaction flags
+bool is_asked_vlm
+
+# Associated viewpoint ID (for viewpoint-triggered saves)
+int32 viewpoint_id
diff --git a/semantic_mapping/msg/ObjectType.msg b/semantic_mapping/msg/ObjectType.msg
new file mode 100644
index 0000000..ab02ea6
--- /dev/null
+++ b/semantic_mapping/msg/ObjectType.msg
@@ -0,0 +1,17 @@
+# VLM query/answer message for object relabeling.
+#
+# Used bidirectionally:
+# - Query (mapping -> VLM): object_id, img_path, labels[] populated
+# - Answer (VLM -> mapping): final_label populated with VLM decision
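+#
+# Example (illustrative values):
+#   query:  object_id=12, img_path="output/object_images/<crop>.png", labels=["chair", "sofa"]
+#   answer: object_id=12, final_label="chair"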
+
+# Object ID to query/update
+int32 object_id
+
+# Path to object image for VLM analysis
+string img_path
+
+# VLM-determined final label (populated in answer)
+string final_label
+
+# Candidate labels from detection (populated in query)
+string[] labels
diff --git a/semantic_mapping/msg/RoomEarlyStop1.msg b/semantic_mapping/msg/RoomEarlyStop1.msg
new file mode 100644
index 0000000..473c601
--- /dev/null
+++ b/semantic_mapping/msg/RoomEarlyStop1.msg
@@ -0,0 +1,10 @@
+int32 room_id_1
+int32 room_id_2
+geometry_msgs/Point anchor_point_1
+geometry_msgs/Point anchor_point_2
+sensor_msgs/Image image_1
+sensor_msgs/Image image_2
+string room_type_1
+string room_type_2
+bool early_stop_1
+bool enter_wrong_room
\ No newline at end of file
diff --git a/semantic_mapping/msg/RoomNode.msg b/semantic_mapping/msg/RoomNode.msg
new file mode 100644
index 0000000..e406bdc
--- /dev/null
+++ b/semantic_mapping/msg/RoomNode.msg
@@ -0,0 +1,8 @@
+int32 id
+int32 show_id
+geometry_msgs/PolygonStamped polygon
+geometry_msgs/Point centroid
+int32[] neighbors
+bool is_connected
+float32 area
+sensor_msgs/Image room_mask
diff --git a/semantic_mapping/msg/RoomNodeList.msg b/semantic_mapping/msg/RoomNodeList.msg
new file mode 100644
index 0000000..8a9a713
--- /dev/null
+++ b/semantic_mapping/msg/RoomNodeList.msg
@@ -0,0 +1,2 @@
+std_msgs/Header header
+semantic_mapping/RoomNode[] nodes
diff --git a/semantic_mapping/msg/RoomType.msg b/semantic_mapping/msg/RoomType.msg
new file mode 100644
index 0000000..aa6ff3c
--- /dev/null
+++ b/semantic_mapping/msg/RoomType.msg
@@ -0,0 +1,8 @@
+std_msgs/Header header
+geometry_msgs/Point anchor_point
+sensor_msgs/Image image
+sensor_msgs/Image room_mask
+string room_type
+int32 room_id
+int32 voxel_num
+bool in_room
diff --git a/semantic_mapping/msg/TargetObject.msg b/semantic_mapping/msg/TargetObject.msg
new file mode 100644
index 0000000..55c60e4
--- /dev/null
+++ b/semantic_mapping/msg/TargetObject.msg
@@ -0,0 +1,6 @@
+std_msgs/Header header
+int32 object_id
+string object_label
+string img_path
+string room_label
+bool is_target
\ No newline at end of file
diff --git a/semantic_mapping/msg/TargetObjectInstruction.msg b/semantic_mapping/msg/TargetObjectInstruction.msg
new file mode 100644
index 0000000..1a88005
--- /dev/null
+++ b/semantic_mapping/msg/TargetObjectInstruction.msg
@@ -0,0 +1,22 @@
+# User instruction for target object search.
+#
+# Specifies the target object to find and optional spatial/attribute
+# conditions relative to an anchor object.
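+#
+# Example: "find the red cup next to the microwave in the kitchen" would map to
+#   target_object="cup", attribute_condition="red", spatial_condition="next to",
+#   anchor_object="microwave", room_condition="in the kitchen".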
+
+# Primary target object label to search for
+string target_object
+
+# Room condition (e.g., "in the kitchen")
+string room_condition
+
+# Spatial condition relative to anchor (e.g., "next to", "on top of")
+string spatial_condition
+
+# Attribute condition for target (e.g., "red", "large")
+string attribute_condition
+
+# Anchor object for spatial reference
+string anchor_object
+
+# Attribute condition for anchor object
+string attribute_condition_anchor
diff --git a/semantic_mapping/msg/TargetObjectWithSpatial.msg b/semantic_mapping/msg/TargetObjectWithSpatial.msg
new file mode 100644
index 0000000..cc0c8d7
--- /dev/null
+++ b/semantic_mapping/msg/TargetObjectWithSpatial.msg
@@ -0,0 +1,8 @@
+std_msgs/Header header
+int32 object_id
+string object_label
+string img_path
+int32[] viewpoint_ids
+string room_label
+bool is_target
+geometry_msgs/Point[8] bbox3d
diff --git a/semantic_mapping/msg/ViewpointRep.msg b/semantic_mapping/msg/ViewpointRep.msg
new file mode 100644
index 0000000..5bb1bfe
--- /dev/null
+++ b/semantic_mapping/msg/ViewpointRep.msg
@@ -0,0 +1,8 @@
+# Viewpoint trigger message for image saving.
+#
+# Published by planner to request saving images at specific viewpoints.
+
+std_msgs/Header header
+
+# Unique viewpoint identifier
+int32 viewpoint_id
diff --git a/semantic_mapping/msg/VlmAnswer.msg b/semantic_mapping/msg/VlmAnswer.msg
new file mode 100644
index 0000000..8ec7a30
--- /dev/null
+++ b/semantic_mapping/msg/VlmAnswer.msg
@@ -0,0 +1,4 @@
+int32 room_id
+geometry_msgs/Point anchor_point
+# 0 for vlm navigation query, 1 for early stop 1
+int32 answer_type
\ No newline at end of file
diff --git a/semantic_mapping/package.xml b/semantic_mapping/package.xml
new file mode 100644
index 0000000..4804d1a
--- /dev/null
+++ b/semantic_mapping/package.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0"?>
+<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
+<package format="3">
+  <name>semantic_mapping</name>
+  <version>0.0.1</version>
+  <description>Semantic 3D object mapping node with VLM integration.</description>
+  <maintainer>Alex Lin</maintainer>
+  <license>MIT</license>
+
+  <buildtool_depend>ament_cmake</buildtool_depend>
+
+  <buildtool_depend>rosidl_default_generators</buildtool_depend>
+  <exec_depend>rosidl_default_runtime</exec_depend>
+  <member_of_group>rosidl_interface_packages</member_of_group>
+
+  <depend>rclpy</depend>
+  <depend>std_msgs</depend>
+  <depend>sensor_msgs</depend>
+  <depend>geometry_msgs</depend>
+  <depend>nav_msgs</depend>
+  <depend>visualization_msgs</depend>
+  <depend>cv_bridge</depend>
+  <depend>detect_anything</depend>
+  <depend>vector_perception_utils</depend>
+
+  <export>
+    <build_type>ament_cmake</build_type>
+  </export>
+</package>
diff --git a/semantic_mapping/resource/semantic_mapping b/semantic_mapping/resource/semantic_mapping
new file mode 100644
index 0000000..e69de29
diff --git a/semantic_mapping/scripts/semantic_mapping_node b/semantic_mapping/scripts/semantic_mapping_node
new file mode 100755
index 0000000..0312e88
--- /dev/null
+++ b/semantic_mapping/scripts/semantic_mapping_node
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+"""Entry point for semantic_mapping_node."""
+
+from semantic_mapping.mapping_node import main
+
+if __name__ == "__main__":
+ main()
diff --git a/semantic_mapping/scripts/vlm_reasoning_node b/semantic_mapping/scripts/vlm_reasoning_node
new file mode 100755
index 0000000..645540f
--- /dev/null
+++ b/semantic_mapping/scripts/vlm_reasoning_node
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+"""Entry point for vlm_reasoning_node."""
+
+from semantic_mapping.vlm_reasoning_node import main
+
+if __name__ == "__main__":
+ main()
diff --git a/semantic_mapping/semantic_mapping/__init__.py b/semantic_mapping/semantic_mapping/__init__.py
new file mode 100644
index 0000000..8df6f20
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/__init__.py
@@ -0,0 +1 @@
+"""Semantic 3D object mapping for vector_perception_ros."""
diff --git a/semantic_mapping/semantic_mapping/map_object.py b/semantic_mapping/semantic_mapping/map_object.py
new file mode 100644
index 0000000..da143a9
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/map_object.py
@@ -0,0 +1,713 @@
+"""Data models for semantic mapping.
+
+This module defines the core data structures for the semantic mapping system:
+- VoxelFeatureManager: Manages voxelized point clouds with observation angle voting
+- MapObject: Represents a tracked object in the semantic map
+- Observation: A single detection observation
+- ObjectStatus: Enum for object lifecycle status
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Optional
+
+import numpy as np
+from scipy.spatial import cKDTree
+
+from vector_perception_utils.transform_utils import discretize_angles, normalize_angle
+from vector_perception_utils.pointcloud_utils import (
+ compute_centroid,
+ create_pointcloud,
+)
+
+
+class ObjectStatus(Enum):
+ """Object lifecycle status (ObjectDB-style tiering)."""
+
+ PENDING = "pending" # < min_detections threshold
+ PERMANENT = "permanent" # >= threshold, confirmed object
+ DELETED = "deleted" # Marked for removal
+
+
+@dataclass
+class Observation:
+ """A single detection observation to update a MapObject.
+
+ Attributes:
+ track_id: Tracker ID from detection.
+ label: Class label string.
+ confidence: Detection confidence score.
+ points: World-frame point cloud (N, 3).
+ robot_R: Robot rotation matrix (3, 3) at observation time.
+ robot_t: Robot translation vector (3,) at observation time.
+ mask: Optional 2D segmentation mask.
+ image: Optional RGB image crop for best-image selection.
+ timestamp: Observation timestamp in seconds.
+ """
+
+ track_id: int
+ label: str
+ confidence: float
+ points: np.ndarray
+ robot_R: np.ndarray
+ robot_t: np.ndarray
+ mask: Optional[np.ndarray] = None
+ image: Optional[np.ndarray] = None
+ timestamp: float = 0.0
+
+
+class VoxelFeatureManager:
+ """Manages voxelized point cloud with observation angle voting.
+
+ This class tracks 3D voxels representing an object, along with:
+ - Vote counts per voxel (how many times each voxel was observed)
+ - Observation angles (which directions each voxel was seen from)
+ - Regularization mask (which voxels pass shape filtering)
+
+ The observation angle voting helps distinguish between reliable points
+ (seen from multiple angles) and spurious points (seen from only one angle).
+
+ Attributes:
+ voxels: Point coordinates (N, 3) in world frame.
+ votes: Detection count per voxel (N,).
+ observation_angles: Binary angle histogram per voxel (N, num_bins).
+ regularized_mask: Boolean mask for valid voxels after clustering (N,).
+ voxel_size: Size of voxel grid cells in meters.
+ num_angle_bins: Number of discretized angle bins.
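+
+    Example (illustrative sketch; exact bin assignment depends on
+    ``discretize_angles`` from vector_perception_utils)::
+
+        # points: (N, 3) world-frame array from a first detection
+        mgr = VoxelFeatureManager(points, voxel_size=0.03,
+                                  robot_R=np.eye(3), robot_t=np.zeros(3))
+        # A second view of the same region merges nearby points, increments
+        # their votes, and records the new observation angle.
+        mgr.update(points, robot_R=np.eye(3), robot_t=np.array([1.0, 0.0, 0.0]))
+        stable = mgr.get_valid_voxels(diversity_percentile=0.3, use_regularized=False)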
+ """
+
+ def __init__(
+ self,
+ points: np.ndarray,
+ voxel_size: float,
+ robot_R: np.ndarray,
+ robot_t: np.ndarray,
+ num_angle_bins: int = 15,
+ ) -> None:
+ """Initialize VoxelFeatureManager with initial points.
+
+ Args:
+ points: Initial point cloud (N, 3) in world frame.
+ voxel_size: Voxel grid cell size in meters.
+ robot_R: Robot rotation matrix (3, 3) at observation time.
+ robot_t: Robot translation vector (3,) at observation time.
+ num_angle_bins: Number of angle bins for observation tracking.
+ """
+ num_points = points.shape[0]
+
+ # Compute observation angles from robot position
+ vectors = points - robot_t
+ obs_angles = np.arctan2(vectors[:, 1], vectors[:, 0])
+ obs_bins = discretize_angles(obs_angles, num_angle_bins)
+
+ self.voxels = points.copy()
+ self.voxel_size = voxel_size
+ self.num_angle_bins = num_angle_bins
+ self._tree = cKDTree(points)
+
+ # Initialize vote counts
+ self.votes = np.ones(num_points, dtype=np.float32)
+
+ # Initialize observation angle histogram (binary: seen/not seen from each angle)
+ self.observation_angles = np.zeros((num_points, num_angle_bins), dtype=np.uint8)
+ self.observation_angles[np.arange(num_points), obs_bins] = 1
+
+ # Regularization mask (all False until regularize_shape is called)
+ self.regularized_mask = np.zeros(num_points, dtype=bool)
+
+        # Removal votes used when clearing dynamic objects (not commonly used)
+ self._remove_votes = np.zeros(num_points, dtype=np.int32)
+
+ def update(
+ self,
+ new_points: np.ndarray,
+ robot_R: np.ndarray,
+ robot_t: np.ndarray,
+ ) -> None:
+ """Add new points with KDTree-based merge and angle voting.
+
+ Points within voxel_size of existing points are merged (votes incremented).
+ Points farther away are added as new voxels.
+
+ Args:
+ new_points: New point cloud (M, 3) in world frame.
+ robot_R: Robot rotation matrix (3, 3) at observation time.
+ robot_t: Robot translation vector (3,) at observation time.
+ """
+ if new_points.shape[0] == 0:
+ return
+
+ # Compute observation angles for new points
+ vectors = new_points - robot_t
+ obs_angles = np.arctan2(vectors[:, 1], vectors[:, 0])
+ obs_bins = discretize_angles(obs_angles, self.num_angle_bins)
+
+ # Find nearest existing voxels
+ distances, indices = self._tree.query(new_points)
+
+ # Merge points within voxel_size
+ merge_mask = distances < self.voxel_size
+ merge_indices = indices[merge_mask]
+ merge_bins = obs_bins[merge_mask]
+
+ self.votes[merge_indices] += 1
+ self.observation_angles[merge_indices, merge_bins] = 1
+
+ # Add new points that don't merge
+ new_mask = ~merge_mask
+ if np.any(new_mask):
+ new_voxels = new_points[new_mask]
+ new_bins = obs_bins[new_mask]
+ num_new = new_voxels.shape[0]
+
+ # Extend arrays
+ self.voxels = np.vstack([self.voxels, new_voxels])
+ self.votes = np.concatenate([self.votes, np.ones(num_new, dtype=np.float32)])
+
+ new_obs = np.zeros((num_new, self.num_angle_bins), dtype=np.uint8)
+ new_obs[np.arange(num_new), new_bins] = 1
+ self.observation_angles = np.vstack([self.observation_angles, new_obs])
+
+ self.regularized_mask = np.concatenate(
+ [self.regularized_mask, np.zeros(num_new, dtype=bool)]
+ )
+ self._remove_votes = np.concatenate(
+ [self._remove_votes, np.zeros(num_new, dtype=np.int32)]
+ )
+
+ # Rebuild KDTree
+ self._tree = cKDTree(self.voxels)
+
+ def update_observation_angles(self, voxel_indices: np.ndarray, obs_bins: np.ndarray) -> None:
+ """Update observation angle histogram for selected voxels."""
+ if voxel_indices.size == 0:
+ return
+ self.observation_angles[voxel_indices, obs_bins] = 1
+
+ def merge_from(self, other: VoxelFeatureManager) -> None:
+ """Merge voxels from another VoxelFeatureManager.
+
+ Args:
+ other: Another VoxelFeatureManager to merge from.
+ """
+ if other.voxels.shape[0] == 0:
+ return
+
+ distances, indices = self._tree.query(other.voxels)
+
+ # Merge overlapping voxels
+ merge_mask = distances < self.voxel_size
+ merge_indices = indices[merge_mask]
+
+ self.votes[merge_indices] += other.votes[merge_mask]
+ self.observation_angles[merge_indices] = np.logical_or(
+ self.observation_angles[merge_indices],
+ other.observation_angles[merge_mask],
+ ).astype(np.uint8)
+
+ # Add non-overlapping voxels
+ new_mask = ~merge_mask
+ if np.any(new_mask):
+ self.voxels = np.vstack([self.voxels, other.voxels[new_mask]])
+ self.votes = np.concatenate([self.votes, other.votes[new_mask]])
+ self.observation_angles = np.vstack(
+ [self.observation_angles, other.observation_angles[new_mask]]
+ )
+ self.regularized_mask = np.concatenate(
+ [self.regularized_mask, other.regularized_mask[new_mask]]
+ )
+ self._remove_votes = np.concatenate(
+ [self._remove_votes, np.zeros(np.sum(new_mask), dtype=np.int32)]
+ )
+ self._tree = cKDTree(self.voxels)
+
+ def get_valid_indices(
+ self,
+ diversity_percentile: float = 0.3,
+ use_regularized: bool = True,
+ ) -> np.ndarray:
+ """Get indices of valid voxels based on observation angle diversity.
+
+ Voxels are ranked by their observation angle diversity (number of angles
+ from which they were observed), weighted by vote count. The top percentile
+ of voxels are returned.
+
+ Args:
+ diversity_percentile: Fraction of total weight to include (0-1).
+ use_regularized: If True, only consider voxels in regularized_mask.
+
+ Returns:
+ Indices of valid voxels.
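+
+        Example (illustrative): with per-voxel diversities [1, 1, 3] and unit
+        votes, diversity_percentile=0.6 gives a weighted total of 5 and a cutoff
+        of (1 - 0.6) * 5 = 2, so the last two voxels in the ascending sort
+        (covering 4/5 of the total weight) are returned.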
+ """
+ if use_regularized:
+ mask = self.regularized_mask
+ if not np.any(mask):
+ return np.array([], dtype=int)
+ obs_angles = self.observation_angles[mask]
+ votes = self.votes[mask]
+ else:
+ obs_angles = self.observation_angles
+ votes = self.votes
+
+ if len(obs_angles) == 0:
+ return np.array([], dtype=int)
+
+ # Compute diversity score (number of angles observed)
+ angle_diversity = np.sum(obs_angles, axis=1)
+
+ # Sort by diversity (ascending)
+ sorted_indices = np.argsort(angle_diversity)
+ sorted_diversity = angle_diversity[sorted_indices]
+ sorted_weights = votes[sorted_indices]
+
+ # Weight by diversity * votes
+ weighted = sorted_diversity * sorted_weights
+ total_weight = np.sum(weighted)
+
+ if total_weight == 0:
+ return np.array([], dtype=int)
+
+ # Find cutoff index for percentile
+ target_weight = (1 - diversity_percentile) * total_weight
+ cumsum = np.cumsum(weighted)
+ cutoff_idx = np.searchsorted(cumsum, target_weight)
+
+ # Return indices above cutoff (higher diversity)
+ return sorted_indices[cutoff_idx:]
+
+ def get_valid_voxels(
+ self,
+ diversity_percentile: float = 0.3,
+ use_regularized: bool = True,
+ ) -> np.ndarray:
+ """Get valid voxel coordinates.
+
+ Args:
+ diversity_percentile: Fraction of total weight to include (0-1).
+ use_regularized: If True, only consider voxels in regularized_mask.
+
+ Returns:
+ Valid voxel coordinates (M, 3).
+ """
+ indices = self.get_valid_indices(diversity_percentile, use_regularized)
+ if use_regularized and np.any(self.regularized_mask):
+ voxels = self.voxels[self.regularized_mask]
+ else:
+ voxels = self.voxels
+
+ if len(indices) == 0:
+ return np.empty((0, 3), dtype=np.float32)
+ return voxels[indices]
+
+ @property
+ def num_voxels(self) -> int:
+ """Total number of voxels."""
+ return self.voxels.shape[0]
+
+ @property
+ def num_regularized(self) -> int:
+ """Number of voxels passing regularization."""
+ return int(np.sum(self.regularized_mask))
+
+
+@dataclass
+class MapObject:
+ """Represents a tracked object in the semantic map.
+
+ This class combines patterns from VLM_ROS SingleObject and dimos Object,
+ with ObjectDB-style lifecycle management.
+
+ Attributes:
+ object_id: Unique identifier for this object (UUID string).
+ track_ids: List of associated tracker IDs (can merge multiple).
+ voxel_manager: Manages the object's voxelized point cloud.
+ class_votes: Label -> observation count mapping.
+ confidence_scores: Label -> best confidence score mapping.
+ points_count: Label -> total points count mapping.
+ robot_poses: List of robot poses when object was observed.
+ detection_count: Total number of detections.
+ inactive_frames: Frames since last update.
+ last_update_time: Timestamp of last update.
+ status: Object lifecycle status.
+ is_asked_vlm: Whether VLM has been queried for this object.
+ updated_by_vlm: Whether object was relabeled by VLM.
+ best_image_path: Path to best saved image crop.
+ best_image_mask_area: Mask area of the best saved image crop.
+ centroid: Cached centroid position (None if needs recompute).
+ bbox3d_corners: Cached 3D bbox corners (None if needs recompute).
+ """
+
+ object_id: str
+ track_ids: list[int]
+ voxel_manager: VoxelFeatureManager
+
+ # Classification (voted)
+ class_votes: dict[str, int] = field(default_factory=dict)
+ confidence_scores: dict[str, float] = field(default_factory=dict)
+ points_count: dict[str, int] = field(default_factory=dict)
+
+ # Robot poses at observation times
+ robot_poses: list[dict] = field(default_factory=list)
+
+ # Lifecycle (ObjectDB-style)
+ detection_count: int = 1
+ inactive_frames: int = 0
+ last_update_time: float = 0.0
+ status: ObjectStatus = ObjectStatus.PENDING
+
+ # VLM interaction
+ is_asked_vlm: bool = False
+ updated_by_vlm: bool = False
+
+ # Image storage
+ best_image_path: Optional[str] = None
+ best_image_mask_area: float = 0.0
+
+ # Cached geometry (None means needs recompute)
+ _centroid: Optional[np.ndarray] = field(default=None, repr=False)
+ _bbox3d_corners: Optional[np.ndarray] = field(default=None, repr=False)
+ _needs_recompute: bool = field(default=True, repr=False)
+
+ # Configuration
+ _diversity_percentile: float = field(default=0.85, repr=False)
+ _angle_threshold: float = field(default=np.deg2rad(5), repr=False)
+ _distance_threshold: float = field(default=0.3, repr=False)
+ _needs_regularization: bool = field(default=True, repr=False)
+ _regularize_voxel_size: float = field(default=0.06, repr=False)
+
+ @classmethod
+ def from_observation(
+ cls,
+ object_id: str,
+ obs: Observation,
+ voxel_size: float = 0.03,
+ regularize_voxel_size: float = 0.06,
+ num_angle_bins: int = 15,
+ ) -> MapObject:
+ """Create a new MapObject from an initial observation.
+
+ Args:
+ object_id: Unique identifier for this object.
+ obs: Initial observation.
+ voxel_size: Voxel grid cell size in meters.
+ regularize_voxel_size: Voxel size for regularization clustering.
+ num_angle_bins: Number of angle bins for observation tracking.
+
+ Returns:
+ New MapObject instance.
+ """
+ voxel_manager = VoxelFeatureManager(
+ points=obs.points,
+ voxel_size=voxel_size,
+ robot_R=obs.robot_R,
+ robot_t=obs.robot_t,
+ num_angle_bins=num_angle_bins,
+ )
+
+ obj = cls(
+ object_id=object_id,
+ track_ids=[obs.track_id],
+ voxel_manager=voxel_manager,
+ class_votes={obs.label: 1},
+ confidence_scores={obs.label: obs.confidence},
+ points_count={obs.label: len(obs.points)},
+ robot_poses=[{"R": obs.robot_R.copy(), "t": obs.robot_t.copy()}],
+ last_update_time=obs.timestamp,
+ _regularize_voxel_size=regularize_voxel_size,
+ )
+ return obj
+
+ def _invalidate_cache(self) -> None:
+ """Mark cached geometry as needing recompute."""
+ self._needs_recompute = True
+ self._centroid = None
+ self._bbox3d_corners = None
+
+ def _is_similar_pose(self, new_R: np.ndarray, new_t: np.ndarray) -> bool:
+ """Check if a new pose is similar to existing poses.
+
+ Used to avoid redundant updates from the same viewpoint.
+ Compares observation angle and distance to object center.
+
+ Args:
+            new_R: New robot rotation matrix (3, 3). Not used, but kept for API consistency.
+ new_t: New robot translation vector (3,).
+
+ Returns:
+ True if pose is similar to an existing pose.
+ """
+ _ = new_R # Unused but kept for API consistency
+ obj_center = self.centroid
+ if obj_center is None:
+ obj_center = compute_centroid(self.voxel_manager.voxels)
+
+ for pose in self.robot_poses:
+ old_t = pose["t"] # R not used for similarity check
+
+ old_to_obj = obj_center - old_t
+ new_to_obj = obj_center - new_t
+
+ old_angle = np.arctan2(old_to_obj[1], old_to_obj[0])
+ new_angle = np.arctan2(new_to_obj[1], new_to_obj[0])
+ angle_diff = abs(normalize_angle(np.array([new_angle - old_angle]))[0])
+
+ old_dist = np.linalg.norm(old_to_obj)
+ new_dist = np.linalg.norm(new_to_obj)
+ dist_diff = abs(new_dist - old_dist)
+
+ if angle_diff < self._angle_threshold and dist_diff < self._distance_threshold:
+ return True
+
+ return False
+
+ def update(self, obs: Observation) -> None:
+ """Update object with a new observation.
+
+ Args:
+ obs: New observation to incorporate.
+ """
+ # Update voxel manager
+ self.voxel_manager.update(obs.points, obs.robot_R, obs.robot_t)
+ self.detection_count += 1
+ self.inactive_frames = 0
+ self.last_update_time = obs.timestamp
+ self._needs_regularization = True
+
+ # Check if pose is new (not similar to existing)
+ is_similar = self._is_similar_pose(obs.robot_R, obs.robot_t)
+
+ if not is_similar:
+ # Update class statistics
+ self.class_votes[obs.label] = self.class_votes.get(obs.label, 0) + 1
+ self.confidence_scores[obs.label] = max(
+ self.confidence_scores.get(obs.label, 0), obs.confidence
+ )
+ self.points_count[obs.label] = (
+ self.points_count.get(obs.label, 0) + len(obs.points)
+ )
+
+ # Add pose
+ self.robot_poses.append({"R": obs.robot_R.copy(), "t": obs.robot_t.copy()})
+
+ # Add track ID if new
+ if obs.track_id not in self.track_ids:
+ self.track_ids.append(obs.track_id)
+
+ self._invalidate_cache()
+
+ def merge(self, other: MapObject) -> None:
+ """Merge another MapObject into this one.
+
+ Args:
+ other: Object to merge from.
+ """
+ # Merge track IDs
+ for tid in other.track_ids:
+ if tid not in self.track_ids:
+ self.track_ids.append(tid)
+
+ # Merge voxel managers
+ self.voxel_manager.merge_from(other.voxel_manager)
+ self._needs_regularization = True
+
+ # Merge class statistics
+ for label, count in other.class_votes.items():
+ self.class_votes[label] = self.class_votes.get(label, 0) + count
+ self.confidence_scores[label] = max(
+ self.confidence_scores.get(label, 0),
+ other.confidence_scores.get(label, 0),
+ )
+ self.points_count[label] = (
+ self.points_count.get(label, 0) + other.points_count.get(label, 0)
+ )
+
+ # Merge poses (only unique ones)
+ for pose in other.robot_poses:
+ if not self._is_similar_pose(pose["R"], pose["t"]):
+ self.robot_poses.append(pose)
+
+ # Update lifecycle
+ self.detection_count += other.detection_count
+ self.last_update_time = max(self.last_update_time, other.last_update_time)
+ self.is_asked_vlm = self.is_asked_vlm or other.is_asked_vlm
+
+ self._invalidate_cache()
+
+ @property
+ def label(self) -> str:
+ """Get dominant class label by weighted vote.
+
+ Returns:
+ Most likely class label based on points_count * confidence.
+ """
+ if not self.class_votes:
+ return "unknown"
+
+ # Compute weighted scores
+ weighted_scores = {}
+ for lbl in self.class_votes:
+ pts = self.points_count.get(lbl, 0)
+ conf = self.confidence_scores.get(lbl, 0)
+ weighted_scores[lbl] = pts * conf
+
+ return max(weighted_scores, key=weighted_scores.get)
+
+ @property
+ def confidence(self) -> float:
+ """Get confidence of the dominant label."""
+ return self.confidence_scores.get(self.label, 0.0)
+
+ @property
+ def centroid(self) -> Optional[np.ndarray]:
+ """Compute centroid weighted by observation angles.
+
+ Returns:
+ Centroid position (3,) or None if no valid voxels.
+ """
+ if self._needs_recompute or self._centroid is None:
+ self._compute_geometry()
+ return self._centroid
+
+ @property
+ def bbox3d_corners(self) -> Optional[np.ndarray]:
+ """Compute axis-aligned 3D bounding box corners.
+
+ Returns:
+ 8 corner points (8, 3) or None if no valid voxels.
+ """
+ if self._needs_recompute or self._bbox3d_corners is None:
+ self._compute_geometry()
+ return self._bbox3d_corners
+
+ def _compute_geometry(self) -> None:
+ """Compute cached centroid and bbox from valid voxels using Open3D.
+
+ Returns:
+ None. Updates cached centroid and bbox in-place.
+ """
+ # Get valid voxels (regularized first, then fallback to all)
+ valid_voxels = self.voxel_manager.get_valid_voxels(
+ diversity_percentile=self._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(valid_voxels) == 0:
+ valid_voxels = self.voxel_manager.get_valid_voxels(
+ diversity_percentile=self._diversity_percentile,
+ use_regularized=False,
+ )
+
+ if len(valid_voxels) < 4:
+ self._centroid = compute_centroid(valid_voxels) if len(valid_voxels) > 0 else None
+ self._bbox3d_corners = None
+ else:
+ pcd = create_pointcloud(valid_voxels)
+ try:
+ obb = pcd.get_minimal_oriented_bounding_box()
+ except RuntimeError:
+ # Points are coplanar / degenerate — fall back to axis-aligned bbox.
+ obb = pcd.get_axis_aligned_bounding_box()
+ self._centroid = np.asarray(obb.get_center())
+ self._bbox3d_corners = np.asarray(obb.get_box_points())
+ self._needs_recompute = False
+ return
+ self._centroid = np.asarray(obb.center)
+ half_extent = 0.5 * np.asarray(obb.extent)
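+            # Corners in the OBB's local frame, ordered to match the bbox3d layout
+            # documented in ObjectNode.msg (bottom face, then top face); rotated and
+            # translated into the world frame below.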
+ local_corners = np.array(
+ [
+ [-half_extent[0], -half_extent[1], -half_extent[2]],
+ [half_extent[0], -half_extent[1], -half_extent[2]],
+ [half_extent[0], half_extent[1], -half_extent[2]],
+ [-half_extent[0], half_extent[1], -half_extent[2]],
+ [-half_extent[0], -half_extent[1], half_extent[2]],
+ [half_extent[0], -half_extent[1], half_extent[2]],
+ [half_extent[0], half_extent[1], half_extent[2]],
+ [-half_extent[0], half_extent[1], half_extent[2]],
+ ],
+ dtype=float,
+ )
+ self._bbox3d_corners = local_corners @ np.asarray(obb.R).T + np.asarray(obb.center)
+
+ self._needs_recompute = False
+
+ def regularize_shape(self, percentile: float = 0.85) -> None:
+ """Apply vote-weighted regularization with light outlier removal."""
+ if not self._needs_regularization:
+ return
+ voxels = self.voxel_manager.voxels
+ if len(voxels) < 3:
+ self.voxel_manager.regularized_mask = np.ones(len(voxels), dtype=bool)
+ self._needs_regularization = False
+ self._invalidate_cache()
+ return
+
+ weights = np.sum(self.voxel_manager.observation_angles, axis=1) * self.voxel_manager.votes
+ keep_ratio = percentile if percentile is not None else 1.0
+ if keep_ratio >= 1.0:
+ keep_mask = np.ones(voxels.shape[0], dtype=bool)
+ else:
+ cutoff = float(np.quantile(weights, 1.0 - keep_ratio))
+ keep_mask = weights >= cutoff
+
+ if not np.any(keep_mask):
+ keep_mask = np.ones(voxels.shape[0], dtype=bool)
+
+ selected_indices = np.nonzero(keep_mask)[0]
+ points_sel = voxels[selected_indices]
+ weights_sel = weights[selected_indices]
+ if points_sel.shape[0] >= 3:
+ voxel_size = self._regularize_voxel_size
+ keys = np.floor(points_sel / voxel_size).astype(np.int32)
+ voxel_map: dict[tuple[int, int, int], list[int]] = {}
+ for idx, key in enumerate(map(tuple, keys)):
+ voxel_map.setdefault(key, []).append(idx)
+
+ neighbor_offsets = [
+ (dx, dy, dz)
+ for dx in (-1, 0, 1)
+ for dy in (-1, 0, 1)
+ for dz in (-1, 0, 1)
+ if not (dx == 0 and dy == 0 and dz == 0)
+ ]
+ visited: set[tuple[int, int, int]] = set()
+ components: list[tuple[np.ndarray, float, np.ndarray]] = []
+ for key in voxel_map:
+ if key in visited:
+ continue
+ stack = [key]
+ visited.add(key)
+ comp_indices: list[int] = []
+ while stack:
+ current = stack.pop()
+ comp_indices.extend(voxel_map[current])
+ for dx, dy, dz in neighbor_offsets:
+ neighbor = (current[0] + dx, current[1] + dy, current[2] + dz)
+ if neighbor in voxel_map and neighbor not in visited:
+ visited.add(neighbor)
+ stack.append(neighbor)
+
+ comp_idx = np.array(comp_indices, dtype=int)
+ comp_weight = float(np.sum(weights_sel[comp_idx]))
+ comp_centroid = compute_centroid(points_sel[comp_idx])
+ components.append((comp_idx, comp_weight, comp_centroid))
+
+ if components:
+ main_idx = int(np.argmax([c[1] for c in components]))
+ main_points = points_sel[components[main_idx][0]]
+ main_centroid = components[main_idx][2]
+ extent = np.max(main_points, axis=0) - np.min(main_points, axis=0)
+ dist_thresh = max(0.3, 0.5 * float(np.max(extent)))
+
+ keep_sel_mask = np.zeros(points_sel.shape[0], dtype=bool)
+ for comp_idx, _, comp_centroid in components:
+ if np.linalg.norm(comp_centroid - main_centroid) <= dist_thresh:
+ keep_sel_mask[comp_idx] = True
+ if np.any(keep_sel_mask):
+ keep_mask = np.zeros(voxels.shape[0], dtype=bool)
+ keep_mask[selected_indices[keep_sel_mask]] = True
+
+ self.voxel_manager.regularized_mask = keep_mask
+
+ self._needs_regularization = False
+ self._invalidate_cache()
diff --git a/semantic_mapping/semantic_mapping/mapper.py b/semantic_mapping/semantic_mapping/mapper.py
new file mode 100644
index 0000000..cc67cf1
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/mapper.py
@@ -0,0 +1,275 @@
+"""Object map manager for semantic mapping."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+import threading
+import time
+import uuid
+
+from .utils.matching import compute_bbox, compute_iou_3d_with_ratios, find_best_match
+from .map_object import MapObject, ObjectStatus, Observation
+
+
+@dataclass
+class UpdateStats:
+ """Summary of updates applied in a mapper step."""
+
+ created: int = 0
+ updated: int = 0
+ merged: int = 0
+ deleted: int = 0
+ promoted: int = 0
+ total: int = 0
+
+
+class ObjectMapper:
+ """Manages the semantic object map with two-tier state management."""
+
+ def __init__(
+ self,
+ voxel_size: float = 0.03,
+ regularize_voxel_size: float = 0.06,
+ confidence_threshold: float = 0.30,
+ distance_threshold: float = 0.5,
+ iou_threshold: float = 0.2,
+ min_detections_for_permanent: int = 3,
+ pending_ttl_s: float = 10.0,
+ num_angle_bins: int = 15,
+ ) -> None:
+ """Initialize the object mapper and its thresholds.
+
+ Args:
+ voxel_size: Voxel grid cell size in meters.
+ regularize_voxel_size: Voxel size for regularization clustering.
+ confidence_threshold: Minimum detection confidence to accept.
+ distance_threshold: Max centroid distance for spatial matching.
+ iou_threshold: IoU threshold for merge checks.
+ min_detections_for_permanent: Detections before promotion.
+ pending_ttl_s: Time-to-live for pending objects.
+ num_angle_bins: Observation angle bins for voxel manager.
+ """
+ self._pending_objects: dict[str, MapObject] = {}
+ self._permanent_objects: dict[str, MapObject] = {}
+ self._track_id_map: dict[int, str] = {}
+ self._lock = threading.RLock()
+
+ self._voxel_size = voxel_size
+ self._regularize_voxel_size = regularize_voxel_size
+ self._confidence_threshold = confidence_threshold
+ self._distance_threshold = distance_threshold
+ self._iou_threshold = iou_threshold
+ self._min_detections_for_permanent = min_detections_for_permanent
+ self._pending_ttl_s = pending_ttl_s
+ self._num_angle_bins = num_angle_bins
+
+ def update(self, observations: list[Observation], target_object: str | None = None) -> UpdateStats:
+ """Update the map with new observations.
+
+ Args:
+ observations: List of new observations to integrate.
+ target_object: Optional label to protect from pruning.
+
+ Returns:
+ UpdateStats summarizing changes applied.
+ """
+ stats = UpdateStats()
+ now = max((obs.timestamp for obs in observations), default=time.time())
+ updated_ids: set[str] = set()
+ regularized_ids: set[str] = set()
+
+ with self._lock:
+ for obs in observations:
+ if obs.confidence < self._confidence_threshold:
+ continue
+
+ obj = self._match_observation(obs)
+ if obj is None:
+ obj = self._create_object(obs)
+ self._pending_objects[obj.object_id] = obj
+ stats.created += 1
+ else:
+ obj.update(obs)
+ stats.updated += 1
+
+ updated_ids.add(obj.object_id)
+ for track_id in obj.track_ids:
+ self._track_id_map[track_id] = obj.object_id
+
+ if obj.status == ObjectStatus.PENDING:
+ if self._check_promotion(obj):
+ stats.promoted += 1
+
+ if obj.object_id not in regularized_ids and obj.inactive_frames == 0:
+ obj.regularize_shape(percentile=obj._diversity_percentile)
+ regularized_ids.add(obj.object_id)
+
+ for obj in list(self._pending_objects.values()) + list(self._permanent_objects.values()):
+ if obj.object_id not in updated_ids:
+ obj.inactive_frames += 1
+
+ deleted = self._prune_inactive(target_object, now)
+ stats.deleted = len(deleted)
+ stats.merged += self.merge_objects()
+ stats.total = len(self._permanent_objects)
+
+ return stats
+
+ def _match_observation(self, obs: Observation) -> MapObject | None:
+ """Match an observation to an existing object."""
+ if obs.track_id in self._track_id_map:
+ obj_id = self._track_id_map[obs.track_id]
+ if obj_id in self._permanent_objects:
+ return self._permanent_objects[obj_id]
+ if obj_id in self._pending_objects:
+ return self._pending_objects[obj_id]
+
+ candidates = list(self._permanent_objects.values()) + list(self._pending_objects.values())
+ return find_best_match(
+ obs=obs,
+ candidates=candidates,
+ distance_threshold=self._distance_threshold,
+ iou_threshold=self._iou_threshold,
+ )
+
+ def _create_object(self, obs: Observation) -> MapObject:
+ """Create a new MapObject from an observation."""
+ obj = MapObject.from_observation(
+ object_id=str(uuid.uuid4()),
+ obs=obs,
+ voxel_size=self._voxel_size,
+ regularize_voxel_size=self._regularize_voxel_size,
+ num_angle_bins=self._num_angle_bins,
+ )
+ self._track_id_map[obs.track_id] = obj.object_id
+ return obj
+
+ def _check_promotion(self, obj: MapObject) -> bool:
+ """Promote object from pending to permanent if threshold met."""
+ if obj.status != ObjectStatus.PENDING:
+ return False
+ if obj.detection_count < self._min_detections_for_permanent:
+ return False
+ if obj.object_id in self._pending_objects:
+ self._pending_objects.pop(obj.object_id)
+ obj.status = ObjectStatus.PERMANENT
+ self._permanent_objects[obj.object_id] = obj
+ return True
+
+ def merge_objects(self) -> int:
+ """Merge overlapping permanent objects.
+
+ Returns:
+ Number of merges performed.
+ """
+ merged = 0
+ objs = list(self._permanent_objects.values())
+ i = 0
+ while i < len(objs):
+ obj = objs[i]
+ j = i + 1
+ while j < len(objs):
+ other = objs[j]
+ should_merge = False
+
+ if set(obj.track_ids) & set(other.track_ids):
+ should_merge = True
+ else:
+ if obj.centroid is None or other.centroid is None:
+ j += 1
+ continue
+ dist = float(((obj.centroid - other.centroid) ** 2).sum() ** 0.5)
+ if dist > self._distance_threshold:
+ j += 1
+ continue
+
+ obj_points = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(obj_points) == 0:
+ obj_points = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=False,
+ )
+ other_points = other.voxel_manager.get_valid_voxels(
+ diversity_percentile=other._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(other_points) == 0:
+ other_points = other.voxel_manager.get_valid_voxels(
+ diversity_percentile=other._diversity_percentile,
+ use_regularized=False,
+ )
+
+ obj_bbox = compute_bbox(obj_points) if len(obj_points) >= 3 else None
+ other_bbox = compute_bbox(other_points) if len(other_points) >= 3 else None
+ obj_corners = obj_bbox[0] if obj_bbox is not None else None
+ other_corners = other_bbox[0] if other_bbox is not None else None
+ obj_extent = obj_bbox[1] if obj_bbox is not None else None
+ other_extent = other_bbox[1] if other_bbox is not None else None
+
+ dist_merge = dist < 0.25
+ if obj_extent is not None and other_extent is not None:
+ dist_thresh = (((obj_extent / 2 + other_extent / 2) / 2) ** 2).sum() ** 0.5
+ dist_merge = dist_merge or dist < dist_thresh * 0.5
+
+ if dist_merge:
+ should_merge = True
+ elif obj_corners is not None and other_corners is not None:
+ iou, ratio_obj, ratio_other = compute_iou_3d_with_ratios(
+ obj_corners, other_corners
+ )
+ if (
+ iou > self._iou_threshold
+ or ratio_obj > 0.4
+ or ratio_other > 0.4
+ ):
+ should_merge = True
+
+ if should_merge:
+ obj.merge(other)
+ for track_id in other.track_ids:
+ self._track_id_map[track_id] = obj.object_id
+ self._permanent_objects.pop(other.object_id, None)
+ objs.pop(j)
+ merged += 1
+ else:
+ j += 1
+ i += 1
+
+ return merged
+
+ def _prune_inactive(self, target_object: str | None, now: float) -> list[MapObject]:
+ """Remove inactive pending/permanent objects."""
+ deleted: list[MapObject] = []
+
+ for obj_id, obj in list(self._pending_objects.items()):
+ if (now - obj.last_update_time) > self._pending_ttl_s:
+ deleted.append(self._pending_objects.pop(obj_id))
+
+ for obj_id, obj in list(self._permanent_objects.items()):
+ if target_object and obj.label == target_object:
+ continue
+ if obj.status == ObjectStatus.DELETED:
+ deleted.append(self._permanent_objects.pop(obj_id))
+
+ for obj in deleted:
+ for track_id in obj.track_ids:
+ if self._track_id_map.get(track_id) == obj.object_id:
+ self._track_id_map.pop(track_id, None)
+
+ return deleted
+
+ def get_objects(self, include_pending: bool = False) -> list[MapObject]:
+ """Return current objects.
+
+ Args:
+ include_pending: If True, include pending objects.
+
+ Returns:
+ List of MapObject instances.
+ """
+ if include_pending:
+ return list(self._permanent_objects.values()) + list(self._pending_objects.values())
+ return list(self._permanent_objects.values())
diff --git a/semantic_mapping/semantic_mapping/mapping_node.py b/semantic_mapping/semantic_mapping/mapping_node.py
new file mode 100644
index 0000000..760ad6d
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/mapping_node.py
@@ -0,0 +1,993 @@
+"""ROS2 mapping node for semantic 3D object mapping."""
+
+from __future__ import annotations
+
+from collections import deque
+from pathlib import Path
+import time
+
+import numpy as np
+from cv_bridge import CvBridge
+import cv2
+import rclpy
+from rclpy.node import Node
+from rclpy.qos import QoSProfile, ReliabilityPolicy
+from sensor_msgs.msg import CompressedImage, PointCloud2
+from sensor_msgs_py import point_cloud2
+from nav_msgs.msg import Odometry
+from std_msgs.msg import Header
+from visualization_msgs.msg import MarkerArray
+from scipy.spatial.transform import Rotation
+
+from detect_anything.msg import DetectionResult
+from semantic_mapping.msg import (
+ ObjectType,
+ ObjectNode,
+ TargetObjectInstruction,
+ TargetObjectWithSpatial,
+ ViewpointRep,
+)
+
+from vector_perception_utils.pointcloud_utils import (
+ project_pointcloud_to_image,
+ segment_pointclouds_from_projection,
+ transform_segmented_clouds_to_world,
+)
+from vector_perception_utils.transform_utils import discretize_angles
+
+from .map_object import Observation, MapObject
+from .utils.map_object_utils import (
+ create_bbox_markers,
+ create_delete_markers,
+ create_label_markers,
+ create_object_pointcloud,
+ map_object_to_object_node,
+)
+from .mapper import ObjectMapper
+from .utils.storage import ImageSaveQueue
+from .utils.utils import collect_cloud_window, compute_ground_z_threshold, interpolate_odom, load_camera_params
+
+
+class SemanticMappingNode(Node):
+ """ROS2 node for semantic object mapping.
+
+ This node fuses 2D detections and LiDAR point clouds into persistent 3D objects:
+ - Buffers detections, clouds, and odometry with timestamps
+ - Interpolates odometry at detection time (position + orientation)
+ - Segments point clouds by detection masks and projects to world frame
+ - Updates the object mapper and publishes ROS messages
+ - Queues cropped images for VLM queries
+ """
+
+ def __init__(self) -> None:
+ """Initialize the mapping node, parameters, buffers, and ROS I/O."""
+ super().__init__("semantic_mapping_node")
+ self._bridge = CvBridge()
+
+ self.declare_parameter("detection_topic", "/detection_result")
+ self.declare_parameter("cloud_topic", "/registered_scan")
+ self.declare_parameter("odom_topic", "/state_estimation")
+ self.declare_parameter("viewpoint_topic", "/viewpoint_rep_header")
+ self.declare_parameter("object_type_answer_topic", "/object_type_answer")
+ self.declare_parameter("object_type_query_topic", "/object_type_query")
+ self.declare_parameter("target_object_spatial_query_topic", "/target_object_spatial_query")
+ self.declare_parameter("target_object_instruction_topic", "/target_object_instruction")
+ self.declare_parameter("object_nodes_topic", "/object_nodes")
+ self.declare_parameter("obj_points_topic", "/obj_points")
+ self.declare_parameter("obj_boxes_topic", "/obj_boxes")
+ self.declare_parameter("obj_labels_topic", "/obj_labels")
+ self.declare_parameter("map_frame", "map")
+
+ self.declare_parameter("confidence_threshold", 0.3)
+ self.declare_parameter("voxel_size", 0.03)
+ self.declare_parameter("regularize_voxel_size", 0.06)
+ self.declare_parameter("distance_threshold", 0.5)
+ self.declare_parameter("iou_threshold", 0.2)
+ self.declare_parameter("min_detections_for_permanent", 3)
+ self.declare_parameter("pending_ttl_s", 10.0)
+ self.declare_parameter("num_angle_bins", 15)
+
+ self.declare_parameter("detection_linear_state_time_bias", 0.0)
+ self.declare_parameter("detection_angular_state_time_bias", 0.0)
+ self.declare_parameter("cloud_window_before", 0.5)
+ self.declare_parameter("cloud_window_after", 0.1)
+
+ self.declare_parameter("camera_param_path", "config/camera_param.yaml")
+ self.declare_parameter("depth_filter", False)
+ self.declare_parameter("depth_filter_margin", 0.15)
+ self.declare_parameter("depth_jump_threshold", 0.3)
+ self.declare_parameter("mask_erode_kernel", 3)
+ self.declare_parameter("mask_erode_iterations", 2)
+ self.declare_parameter("max_cloud_distance", 8.0)
+ self.declare_parameter("visualization_max_objects", 50)
+ self.declare_parameter("ground_filter", True)
+ self.declare_parameter("ground_radius", 2.0)
+ self.declare_parameter("ground_z_clearance", 0.0)
+
+ self.declare_parameter("target_object", "")
+ self.declare_parameter("save_png", False)
+ self.declare_parameter("output_dir", "output/object_images")
+ self.declare_parameter("save_viewpoint_images", True)
+ self.declare_parameter("viewpoint_output_dir", "output/viewpoint_images")
+ self.declare_parameter("viewpoint_use_timestamp_dir", True)
+ self.declare_parameter("viewpoint_save_png", True)
+ self.declare_parameter("viewpoint_save_transform", True)
+ self.declare_parameter("object_type_query_mode", "target_or_all")
+ self.declare_parameter("object_type_query_max_per_cycle", 3)
+ self.declare_parameter("enable_object_type_queries", True)
+
+ detection_topic = str(self.get_parameter("detection_topic").value)
+ cloud_topic = str(self.get_parameter("cloud_topic").value)
+ odom_topic = str(self.get_parameter("odom_topic").value)
+ viewpoint_topic = str(self.get_parameter("viewpoint_topic").value)
+ object_type_answer_topic = str(self.get_parameter("object_type_answer_topic").value)
+ object_type_query_topic = str(self.get_parameter("object_type_query_topic").value)
+ target_object_spatial_query_topic = str(
+ self.get_parameter("target_object_spatial_query_topic").value
+ )
+ target_object_instruction_topic = str(self.get_parameter("target_object_instruction_topic").value)
+ object_nodes_topic = str(self.get_parameter("object_nodes_topic").value)
+ obj_points_topic = str(self.get_parameter("obj_points_topic").value)
+ obj_boxes_topic = str(self.get_parameter("obj_boxes_topic").value)
+ obj_labels_topic = str(self.get_parameter("obj_labels_topic").value)
+ self._map_frame = str(self.get_parameter("map_frame").value)
+
+ self._confidence_threshold = float(self.get_parameter("confidence_threshold").value)
+ self._detection_linear_bias = float(self.get_parameter("detection_linear_state_time_bias").value)
+ self._detection_angular_bias = float(self.get_parameter("detection_angular_state_time_bias").value)
+ self._cloud_window_before = float(self.get_parameter("cloud_window_before").value)
+ self._cloud_window_after = float(self.get_parameter("cloud_window_after").value)
+
+ camera_param_path = str(self.get_parameter("camera_param_path").value)
+ camera_params = load_camera_params(
+ camera_param_path, base_dir=Path(__file__).resolve().parent.parent
+ )
+ self._projection = "equirect"
+ self._axis_mode = "xz"
+ self._image_width = 1920
+ self._image_height = 640
+ self._depth_filter = bool(self.get_parameter("depth_filter").value)
+ self._depth_filter_margin = float(self.get_parameter("depth_filter_margin").value)
+ self._depth_jump_threshold = float(self.get_parameter("depth_jump_threshold").value)
+ self._mask_erode_kernel_size = max(1, int(self.get_parameter("mask_erode_kernel").value))
+ self._mask_erode_iterations = max(0, int(self.get_parameter("mask_erode_iterations").value))
+ self._max_cloud_distance = float(self.get_parameter("max_cloud_distance").value)
+ self._sensor_to_camera = None
+ if camera_params is not None:
+ self._image_width = camera_params.image_width
+ self._image_height = camera_params.image_height
+ self._sensor_to_camera = camera_params.sensor_to_camera
+ self._mask_erode_kernel = cv2.getStructuringElement(
+ cv2.MORPH_RECT,
+ (self._mask_erode_kernel_size, self._mask_erode_kernel_size),
+ )
+ self._visualization_max_objects = int(
+ self.get_parameter("visualization_max_objects").value
+ )
+ self._ground_filter = bool(self.get_parameter("ground_filter").value)
+ self._ground_radius = float(self.get_parameter("ground_radius").value)
+ self._ground_z_clearance = float(self.get_parameter("ground_z_clearance").value)
+ self._ground_percentile = 0.05
+ self._ground_min_points = 80
+
+ self._target_object = str(self.get_parameter("target_object").value)
+ self._room_condition = ""
+ self._spatial_condition = ""
+ self._attribute_condition = ""
+ self._anchor_object = ""
+ self._attribute_condition_anchor = ""
+ self._target_spatial_query_ids: set[int] = set()
+ self._object_viewpoint_ids: dict[int, set[int]] = {}
+ self._save_png = bool(self.get_parameter("save_png").value)
+ self._output_dir = Path(str(self.get_parameter("output_dir").value))
+ self._save_viewpoint_images = bool(self.get_parameter("save_viewpoint_images").value)
+ self._viewpoint_output_dir = Path(str(self.get_parameter("viewpoint_output_dir").value))
+ self._viewpoint_use_timestamp_dir = bool(
+ self.get_parameter("viewpoint_use_timestamp_dir").value
+ )
+ self._viewpoint_save_png = bool(self.get_parameter("viewpoint_save_png").value)
+ self._viewpoint_save_transform = bool(
+ self.get_parameter("viewpoint_save_transform").value
+ )
+ self._object_type_query_mode = str(
+ self.get_parameter("object_type_query_mode").value
+ ).lower()
+ self._object_type_query_max = int(
+ self.get_parameter("object_type_query_max_per_cycle").value
+ )
+ self._enable_object_type_queries = bool(
+ self.get_parameter("enable_object_type_queries").value
+ )
+ self._viewpoint_dir = self._viewpoint_output_dir if self._save_viewpoint_images else None
+
+ self._mapper = ObjectMapper(
+ voxel_size=float(self.get_parameter("voxel_size").value),
+ regularize_voxel_size=float(self.get_parameter("regularize_voxel_size").value),
+ confidence_threshold=self._confidence_threshold,
+ distance_threshold=float(self.get_parameter("distance_threshold").value),
+ iou_threshold=float(self.get_parameter("iou_threshold").value),
+ min_detections_for_permanent=int(self.get_parameter("min_detections_for_permanent").value),
+ pending_ttl_s=float(self.get_parameter("pending_ttl_s").value),
+ num_angle_bins=int(self.get_parameter("num_angle_bins").value),
+ )
+
+ self._image_saver = ImageSaveQueue(
+ output_dir=self._output_dir,
+ max_queue_size=100,
+ save_png=self._save_png,
+ )
+ self._image_saver.start()
+
+ self._viewpoint_saver: ImageSaveQueue | None = None
+ if self._save_viewpoint_images and self._viewpoint_dir is not None:
+ self._viewpoint_saver = ImageSaveQueue(
+ output_dir=self._viewpoint_output_dir,
+ max_queue_size=50,
+ save_png=self._viewpoint_save_png,
+ save_transform=self._viewpoint_save_transform,
+ clear_output_dir=True,
+ use_timestamp_dir=self._viewpoint_use_timestamp_dir,
+ )
+ self._viewpoint_saver.start()
+
+ self._cloud_buffer: deque[tuple[float, np.ndarray]] = deque(maxlen=200)
+ self._odom_buffer: deque[tuple[float, dict]] = deque(maxlen=400)
+ self._detection_buffer: deque[DetectionResult] = deque(maxlen=100)
+ self._viewpoint_queue: deque[tuple[float, int]] = deque(maxlen=50)
+ self._processed_viewpoints: set[float] = set()
+ self._published_ids: set[int] = set()
+ self._mapping_busy = False
+ self._last_mapping_stats_log = 0.0
+ self._last_mapping_time_log = 0.0
+ self._mapping_log_throttle_s = 5.0
+ self._last_query_log = 0.0
+ self._query_log_throttle_s = 5.0
+ self._last_answer_log = 0.0
+
+ qos_cloud = QoSProfile(depth=10, reliability=ReliabilityPolicy.BEST_EFFORT)
+ qos_default = QoSProfile(depth=10, reliability=ReliabilityPolicy.RELIABLE)
+
+ self.create_subscription(DetectionResult, detection_topic, self._detection_callback, qos_default)
+ self.create_subscription(PointCloud2, cloud_topic, self._cloud_callback, qos_cloud)
+ self.create_subscription(Odometry, odom_topic, self._odom_callback, qos_cloud)
+ self.create_subscription(ViewpointRep, viewpoint_topic, self._viewpoint_callback, qos_default)
+ self.create_subscription(ObjectType, object_type_answer_topic, self._object_type_answer_callback, qos_default)
+ self.create_subscription(
+ TargetObjectInstruction, target_object_instruction_topic, self._target_object_callback, qos_default
+ )
+
+ self._object_type_query_pub = self.create_publisher(ObjectType, object_type_query_topic, qos_default)
+ self._target_object_spatial_query_pub = self.create_publisher(
+ TargetObjectWithSpatial, target_object_spatial_query_topic, qos_default
+ )
+ self._object_nodes_pub = self.create_publisher(ObjectNode, object_nodes_topic, qos_default)
+ self._obj_points_pub = self.create_publisher(PointCloud2, obj_points_topic, qos_default)
+ self._obj_boxes_pub = self.create_publisher(MarkerArray, obj_boxes_topic, qos_default)
+ self._obj_labels_pub = self.create_publisher(MarkerArray, obj_labels_topic, qos_default)
+
+ self.create_timer(0.5, self._mapping_callback)
+
+ def destroy_node(self) -> bool:
+ """Stop background workers and destroy the ROS node."""
+ self._image_saver.stop()
+ if self._viewpoint_saver is not None:
+ self._viewpoint_saver.stop()
+ return super().destroy_node()
+
+ def _detection_callback(self, msg: DetectionResult) -> None:
+ """Buffer detection results for later synchronization.
+
+ Args:
+ msg: DetectionResult message with bboxes, labels, and masks.
+ """
+ self._detection_buffer.append(msg)
+
+ def _cloud_callback(self, msg: PointCloud2) -> None:
+ """Buffer incoming point clouds with timestamps.
+
+ Args:
+ msg: PointCloud2 message in the sensor frame.
+ """
+ stamp = msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9
+ points = point_cloud2.read_points_numpy(msg, field_names=("x", "y", "z"))
+ self._cloud_buffer.append((stamp, points))
+
+ def _odom_callback(self, msg: Odometry) -> None:
+ """Buffer odometry data for interpolation.
+
+ Args:
+ msg: Odometry message containing pose and twist.
+ """
+ stamp = msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9
+ odom = {
+ "position": np.array(
+ [msg.pose.pose.position.x, msg.pose.pose.position.y, msg.pose.pose.position.z]
+ ),
+ "orientation": np.array(
+ [
+ msg.pose.pose.orientation.x,
+ msg.pose.pose.orientation.y,
+ msg.pose.pose.orientation.z,
+ msg.pose.pose.orientation.w,
+ ]
+ ),
+ "linear_velocity": np.array(
+ [msg.twist.twist.linear.x, msg.twist.twist.linear.y, msg.twist.twist.linear.z]
+ ),
+ "angular_velocity": np.array(
+ [msg.twist.twist.angular.x, msg.twist.twist.angular.y, msg.twist.twist.angular.z]
+ ),
+ }
+ self._odom_buffer.append((stamp, odom))
+
+ def _viewpoint_callback(self, msg: ViewpointRep) -> None:
+ """Queue viewpoint trigger requests.
+
+ Args:
+ msg: ViewpointRep message with timestamp and viewpoint id.
+ """
+ stamp = msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9
+ if stamp in self._processed_viewpoints:
+ return
+ self._viewpoint_queue.append((stamp, msg.viewpoint_id))
+
+ def _target_object_callback(self, msg: TargetObjectInstruction) -> None:
+ """Update target object label used for VLM query decisions.
+
+ Args:
+ msg: TargetObjectInstruction message.
+ """
+ self._target_object = msg.target_object
+ self._room_condition = msg.room_condition
+ self._spatial_condition = msg.spatial_condition
+ self._attribute_condition = msg.attribute_condition
+ self._anchor_object = msg.anchor_object
+ self._attribute_condition_anchor = msg.attribute_condition_anchor
+ self._target_spatial_query_ids.clear()
+ if msg.target_object:
+ self.get_logger().debug(
+ f"Target instruction: target={msg.target_object} "
+ f"room={msg.room_condition} spatial={msg.spatial_condition}"
+ )
+
+ def _object_type_answer_callback(self, msg: ObjectType) -> None:
+ """Apply VLM relabeling updates to tracked objects.
+
+ Args:
+ msg: ObjectType message with final label for a track id.
+ """
+ matched = False
+ for obj in self._mapper.get_objects(include_pending=True):
+ if msg.object_id in obj.track_ids:
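+                # Heavily boost the VLM-provided label (+50 votes, confidence pinned to 1.0) so it dominates the class vote.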
+ obj.class_votes[msg.final_label] = obj.class_votes.get(msg.final_label, 0) + 50
+ obj.confidence_scores[msg.final_label] = max(
+ obj.confidence_scores.get(msg.final_label, 0.0), 1.0
+ )
+ obj.points_count[msg.final_label] = obj.points_count.get(msg.final_label, 0)
+ obj.updated_by_vlm = True
+ obj._invalidate_cache()
+ self.get_logger().debug(
+ f"Object type answer applied: object_id={msg.object_id} "
+ f"label={msg.final_label}"
+ )
+ matched = True
+ break
+ if not matched:
+ now = time.time()
+ if now - self._last_answer_log > self._query_log_throttle_s:
+ track_ids = [
+ track_id
+ for obj in self._mapper.get_objects(include_pending=True)
+ for track_id in obj.track_ids
+ ]
+ preview = track_ids[:10]
+ suffix = "..." if len(track_ids) > 10 else ""
+ self.get_logger().warning(
+ f"Object type answer ignored: object_id={msg.object_id} "
+ f"label={msg.final_label} known_ids={preview}{suffix}"
+ )
+ self._last_answer_log = now
+
+ def _mapping_callback(self) -> None:
+ """Main mapping loop: synchronize inputs, update map, and publish outputs."""
+ if self._mapping_busy:
+ return
+ if len(self._detection_buffer) < 1 or len(self._cloud_buffer) < 1 or len(self._odom_buffer) < 2:
+ return
+
+ self._mapping_busy = True
+ start_time = time.perf_counter()
+ try:
+ detection_msg, viewpoint_stamp, viewpoint_id = self._select_detection()
+ if detection_msg is None:
+ return
+
+ detection_stamp = detection_msg.header.stamp.sec + detection_msg.header.stamp.nanosec / 1e9
+ image = self._bridge.compressed_imgmsg_to_cv2(detection_msg.image, desired_encoding="bgr8")
+
+ odom = interpolate_odom(
+ list(self._odom_buffer),
+ detection_stamp,
+ self._detection_linear_bias,
+ self._detection_angular_bias,
+ )
+ if odom is None:
+ return
+
+ if viewpoint_stamp is not None:
+ self._save_viewpoint_snapshot(
+ viewpoint_id=viewpoint_id,
+ viewpoint_stamp=viewpoint_stamp,
+ image=image,
+ odom=odom,
+ )
+
+ neighbor_cloud = collect_cloud_window(
+ list(self._cloud_buffer),
+ detection_stamp,
+ self._cloud_window_before,
+ self._cloud_window_after,
+ )
+ if neighbor_cloud is None:
+ return
+
+ observations = self._build_observations(
+ detection_msg=detection_msg,
+ image=image,
+ odom=odom,
+ cloud=neighbor_cloud,
+ )
+ if len(observations) == 0:
+ return
+
+ stats = self._mapper.update(observations, target_object=self._target_object)
+ now = time.time()
+ if now - self._last_mapping_stats_log > self._mapping_log_throttle_s:
+ self.get_logger().debug(
+ "Mapping stats:\n"
+ f" total: {stats.total}\n"
+ f" deleted: {stats.deleted}"
+ )
+ self._last_mapping_stats_log = now
+ self._update_viewpoint_ids(observations, viewpoint_id)
+ self._update_observation_angles(observations)
+ self._update_best_images(observations, image)
+ self._publish_map(detection_stamp, viewpoint_id)
+ self._publish_object_type_queries()
+ self._publish_target_object_spatial_queries()
+
+ if viewpoint_stamp is not None:
+ self._processed_viewpoints.add(viewpoint_stamp)
+ finally:
+ elapsed_ms = (time.perf_counter() - start_time) * 1000.0
+ now = time.time()
+ if now - self._last_mapping_time_log > self._mapping_log_throttle_s:
+ self.get_logger().debug(f"Mapping loop time: {elapsed_ms:.2f} ms")
+ self._last_mapping_time_log = now
+ self._mapping_busy = False
+
+ def _select_detection(self) -> tuple[DetectionResult | None, float | None, int]:
+ """Pick the detection to process, optionally aligned to a viewpoint.
+
+ Returns:
+ Tuple of (detection_msg, viewpoint_stamp, viewpoint_id).
+ """
+ viewpoint_stamp = None
+ viewpoint_id = -1
+
+ if len(self._viewpoint_queue) > 0:
+ stamp, vid = self._viewpoint_queue[0]
+ detections = list(self._detection_buffer)
+ stamps = [msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9 for msg in detections]
+ diffs = np.abs(np.array(stamps) - stamp)
+ idx = int(np.argmin(diffs))
+ diff = float(diffs[idx])
+ if diff > 1.0:
+ self.get_logger().debug(
+ f"Viewpoint stamp {stamp:.3f} is {diff:.3f}s away from closest detection"
+ )
+ detection_msg = detections[idx]
+ viewpoint_stamp = stamp
+ viewpoint_id = vid
+ self._viewpoint_queue.popleft()
+ return detection_msg, viewpoint_stamp, viewpoint_id
+
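+        # No pending viewpoint: fall back to the second-newest detection when more than one is buffered.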
+ idx = -2 if len(self._detection_buffer) > 1 else -1
+ return self._detection_buffer[idx], viewpoint_stamp, viewpoint_id
+
+ def _update_viewpoint_ids(self, observations: list[Observation], viewpoint_id: int) -> None:
+ """Track viewpoint IDs associated with observations."""
+ if viewpoint_id < 0:
+ return
+ for obs in observations:
+ if obs.track_id not in self._object_viewpoint_ids:
+ self._object_viewpoint_ids[obs.track_id] = set()
+ self._object_viewpoint_ids[obs.track_id].add(viewpoint_id)
+
+ def _collect_viewpoint_ids(self, obj: MapObject) -> list[int]:
+ """Collect viewpoint IDs observed for a MapObject."""
+ ids: set[int] = set()
+ for track_id in obj.track_ids:
+ ids.update(self._object_viewpoint_ids.get(track_id, set()))
+ return sorted(ids)
+
+ def _save_viewpoint_snapshot(
+ self,
+ viewpoint_id: int,
+ viewpoint_stamp: float,
+ image: np.ndarray,
+ odom: dict,
+ ) -> None:
+ """Save viewpoint image and pose transform for spatial queries."""
+ if not self._save_viewpoint_images or self._viewpoint_dir is None:
+ return
+ if viewpoint_id < 0:
+ return
+
+ _ = viewpoint_stamp # Reserved for future indexing/logging
+
+ transform = None
+ if self._viewpoint_save_transform:
+ R_b2w = Rotation.from_quat(odom["orientation"]).as_matrix()
+ t_b2w = np.array(odom["position"], dtype=float)
+ transform = np.eye(4, dtype=float)
+ transform[:3, :3] = R_b2w
+ transform[:3, 3] = t_b2w
+
+ if self._viewpoint_saver is not None:
+ self._viewpoint_saver.queue_viewpoint_save(
+ viewpoint_id=viewpoint_id, image=image, transform=transform
+ )
+
+ # helper methods moved to semantic_mapping.utils.utils
+
+ def _build_observations(
+ self,
+ detection_msg: DetectionResult,
+ image: np.ndarray,
+ odom: dict,
+ cloud: np.ndarray,
+ ) -> list[Observation]:
+ """Build Observation objects from detections and segmented point clouds.
+
+ Args:
+ detection_msg: DetectionResult with bboxes, labels, and masks.
+ image: RGB image used for masks and cropping.
+ odom: Interpolated odometry dict (position + orientation).
+            cloud: Aggregated point cloud in the odometry (world) frame.
+
+ Returns:
+ List of Observation instances for object updates.
+ """
+ det_stamp = detection_msg.header.stamp.sec + detection_msg.header.stamp.nanosec / 1e9
+ image_shape = image.shape[:2]
+
+ masks = []
+ labels = []
+ confidences = []
+ track_ids = []
+
+ for i in range(len(detection_msg.track_id)):
+ conf = float(detection_msg.confidence[i])
+ if conf < self._confidence_threshold:
+ continue
+ x1 = int(detection_msg.x1[i])
+ y1 = int(detection_msg.y1[i])
+ x2 = int(detection_msg.x2[i])
+ y2 = int(detection_msg.y2[i])
+ x1 = max(0, min(x1, image_shape[1] - 1))
+ x2 = max(0, min(x2, image_shape[1]))
+ y1 = max(0, min(y1, image_shape[0] - 1))
+ y2 = max(0, min(y2, image_shape[0]))
+ if x2 <= x1 or y2 <= y1:
+ continue
+
+ full_mask = np.zeros(image_shape, dtype=bool)
+ if i < len(detection_msg.masks):
+ mask_msg: CompressedImage = detection_msg.masks[i]
+ mask_crop = self._bridge.compressed_imgmsg_to_cv2(mask_msg)
+ if mask_crop.ndim == 3:
+ mask_crop = cv2.cvtColor(mask_crop, cv2.COLOR_BGR2GRAY)
+ mask_bool = mask_crop.astype(bool)
+ if self._mask_erode_iterations > 0:
+ mask_eroded = cv2.erode(
+ mask_bool.astype(np.uint8),
+ self._mask_erode_kernel,
+ iterations=self._mask_erode_iterations,
+ )
+ mask_bool = mask_eroded.astype(bool)
+ crop_h = min(mask_bool.shape[0], y2 - y1)
+ crop_w = min(mask_bool.shape[1], x2 - x1)
+ full_mask[y1 : y1 + crop_h, x1 : x1 + crop_w] = mask_bool[:crop_h, :crop_w]
+
+ masks.append(full_mask)
+ labels.append(detection_msg.label[i])
+ confidences.append(conf)
+ track_ids.append(int(detection_msg.track_id[i]))
+
+ if len(masks) == 0:
+ return []
+
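+        # Bring the aggregated cloud (odometry frame) into the body frame at the interpolated detection pose.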
+ R_b2w = Rotation.from_quat(odom["orientation"]).as_matrix()
+ t_b2w = np.array(odom["position"])
+ R_w2b = R_b2w.T
+ t_w2b = -R_w2b @ t_b2w
+ cloud_body = cloud @ R_w2b.T + t_w2b
+
+ _, pixel_idx = project_pointcloud_to_image(
+ points=cloud_body,
+ projection=self._projection,
+ image_width=self._image_width,
+ image_height=self._image_height,
+ axis_mode=self._axis_mode,
+ intrinsics=None,
+ depth_filter=self._depth_filter,
+ depth_filter_margin=self._depth_filter_margin,
+ transform=self._sensor_to_camera,
+ )
+ seg_clouds, seg_depths = segment_pointclouds_from_projection(
+ points=cloud_body,
+ masks=masks,
+ pixel_idx=pixel_idx,
+ )
+ if self._depth_filter:
+ filtered_clouds = []
+ filtered_depths = []
+            for seg_cloud, seg_depth in zip(seg_clouds, seg_depths):
+                if seg_depth.shape[0] == 0:
+                    filtered_clouds.append(seg_cloud)
+                    filtered_depths.append(seg_depth)
+                    continue
+                min_depth = float(np.min(seg_depth))
+                keep = seg_depth[:, 0] <= (min_depth + self._depth_filter_margin)
+                filtered_clouds.append(seg_cloud[keep])
+                filtered_depths.append(seg_depth[keep])
+ seg_clouds = filtered_clouds
+ seg_depths = filtered_depths
+ world_clouds = transform_segmented_clouds_to_world(
+ clouds=seg_clouds,
+ depths=seg_depths,
+ R_b2w=R_b2w,
+ t_b2w=t_b2w,
+ depth_jump_threshold=self._depth_jump_threshold,
+ )
+ cloud_world = cloud_body @ R_b2w.T + t_b2w
+ ground_z_threshold = compute_ground_z_threshold(
+ cloud_world,
+ t_b2w[:2],
+ self._ground_filter,
+ self._ground_min_points,
+ self._ground_radius,
+ self._ground_percentile,
+ self._ground_z_clearance,
+ )
+
+ observations = []
+ for idx, world_cloud in enumerate(world_clouds):
+ if self._max_cloud_distance > 0:
+ dist = np.linalg.norm(world_cloud - t_b2w, axis=1)
+ world_cloud = world_cloud[dist < self._max_cloud_distance]
+ if ground_z_threshold is not None:
+ world_cloud = world_cloud[world_cloud[:, 2] > ground_z_threshold]
+ if world_cloud.shape[0] == 0:
+ continue
+ observations.append(
+ Observation(
+ track_id=track_ids[idx],
+ label=labels[idx],
+ confidence=confidences[idx],
+ points=world_cloud,
+ robot_R=R_b2w,
+ robot_t=t_b2w,
+ mask=masks[idx],
+ image=image,
+ timestamp=det_stamp,
+ )
+ )
+
+ return observations
+
+ def _update_observation_angles(self, observations: list[Observation]) -> None:
+ """Reproject object voxels onto the current mask to update angle votes."""
+ if len(observations) == 0:
+ return
+
+ objects = self._mapper.get_objects(include_pending=True)
+ track_to_obj = {}
+ for obj in objects:
+ for track_id in obj.track_ids:
+ track_to_obj[track_id] = obj
+
+ for obs in observations:
+ obj = track_to_obj.get(obs.track_id)
+ if obj is None or obs.mask is None:
+ continue
+ voxels = obj.voxel_manager.voxels
+ if voxels.shape[0] == 0:
+ continue
+
+ R_b2w = obs.robot_R
+ t_b2w = obs.robot_t
+ R_w2b = R_b2w.T
+ t_w2b = -R_w2b @ t_b2w
+ voxels_body = voxels @ R_w2b.T + t_w2b
+ _, pixel_idx = project_pointcloud_to_image(
+ points=voxels_body,
+ projection=self._projection,
+ image_width=self._image_width,
+ image_height=self._image_height,
+ axis_mode=self._axis_mode,
+ intrinsics=None,
+ depth_filter=self._depth_filter,
+ depth_filter_margin=self._depth_filter_margin,
+ transform=self._sensor_to_camera,
+ )
+ pix = pixel_idx.astype(int)
+ valid = (
+ (pix[:, 0] >= 0)
+ & (pix[:, 0] < obs.mask.shape[1])
+ & (pix[:, 1] >= 0)
+ & (pix[:, 1] < obs.mask.shape[0])
+ )
+ if not np.any(valid):
+ continue
+ pix = pix[valid]
+ voxel_indices = np.nonzero(valid)[0]
+ on_mask = obs.mask[pix[:, 1], pix[:, 0]].astype(bool)
+ if not np.any(on_mask):
+ continue
+ voxel_indices = voxel_indices[on_mask]
+ vectors = voxels[voxel_indices] - t_b2w
+ obs_angles = np.arctan2(vectors[:, 1], vectors[:, 0])
+ obs_bins = discretize_angles(obs_angles, obj.voxel_manager.num_angle_bins)
+ obj.voxel_manager.update_observation_angles(voxel_indices, obs_bins)
+ obj._invalidate_cache()
+
+ def _update_best_images(self, observations: list[Observation], image: np.ndarray) -> None:
+ """Update best image crops for objects based on mask area.
+
+ Args:
+ observations: Observations created for the current frame.
+ image: Full RGB image associated with the detections.
+ """
+ for obs in observations:
+ obj = None
+ for candidate in self._mapper.get_objects(include_pending=True):
+ if obs.track_id in candidate.track_ids:
+ obj = candidate
+ break
+ if obj is None or obs.mask is None:
+ continue
+
+ coords = np.argwhere(obs.mask)
+ if coords.size == 0:
+ continue
+ mask_area = coords.shape[0]
+ if mask_area <= obj.best_image_mask_area:
+ continue
+
+ y0, x0 = coords.min(axis=0)
+ y1, x1 = coords.max(axis=0) + 1
+ padding = max(int(y1 - y0), int(x1 - x0), 80)
+ y0 = max(0, y0 - padding)
+ x0 = max(0, x0 - padding)
+ y1 = min(obs.mask.shape[0], y1 + padding)
+ x1 = min(obs.mask.shape[1], x1 + padding)
+
+ if obj.best_image_path:
+ image_path = Path(obj.best_image_path)
+ mask_path = image_path.with_name(image_path.stem + "_mask.npy")
+ if image_path.exists():
+ image_path.unlink()
+ if mask_path.exists():
+ mask_path.unlink()
+ if self._save_png:
+ png_path = image_path.with_suffix(".png")
+ if png_path.exists():
+ png_path.unlink()
+
+ cropped_image = image[y0:y1, x0:x1]
+ cropped_mask = obs.mask[y0:y1, x0:x1]
+
+ obj.best_image_path = self._image_saver.queue_save(
+ object_id=str(obs.track_id),
+ image=cropped_image,
+ mask=cropped_mask,
+ )
+ obj.best_image_mask_area = float(mask_area)
+ obj.is_asked_vlm = False
+
+ def _publish_map(self, stamp: float, viewpoint_id: int) -> None:
+ """Publish object nodes, markers, and aggregated point clouds.
+
+ Args:
+ stamp: Timestamp (seconds) for message headers.
+ viewpoint_id: Viewpoint id for ObjectNode messages.
+ """
+ seconds = int(stamp)
+ nanoseconds = int((stamp - seconds) * 1e9)
+ header = Header()
+ header.stamp.sec = seconds
+ header.stamp.nanosec = nanoseconds
+ header.frame_id = self._map_frame
+
+ objects = self._mapper.get_objects(include_pending=True)
+ for obj in objects:
+ if obj.detection_count == 1:
+ status = "new"
+ elif obj.inactive_frames == 0:
+ status = "updated"
+ else:
+ status = "unchanged"
+ msg = map_object_to_object_node(
+ obj,
+ header,
+ status=status,
+ viewpoint_id=viewpoint_id,
+ target_object=self._target_object,
+ )
+ if msg is not None:
+ self._object_nodes_pub.publish(msg)
+
+ viz_objects = objects
+ if self._visualization_max_objects > 0 and len(objects) > self._visualization_max_objects:
+ viz_objects = sorted(
+ objects, key=lambda obj: obj.last_update_time, reverse=True
+ )[: self._visualization_max_objects]
+
+ bbox_markers = create_bbox_markers(viz_objects, header)
+ label_markers = create_label_markers(viz_objects, header)
+ self._obj_boxes_pub.publish(bbox_markers)
+ self._obj_labels_pub.publish(label_markers)
+
+ obj_cloud = create_object_pointcloud(viz_objects, header)
+ if obj_cloud is not None:
+ self._obj_points_pub.publish(obj_cloud)
+
+ current_ids = {int(obj.track_ids[0]) if obj.track_ids else 0 for obj in viz_objects}
+ deleted_ids = list(self._published_ids - current_ids)
+ if len(deleted_ids) > 0:
+ delete_markers = create_delete_markers(deleted_ids, header, namespace="bbox")
+ self._obj_boxes_pub.publish(delete_markers)
+ delete_labels = create_delete_markers(deleted_ids, header, namespace="labels")
+ self._obj_labels_pub.publish(delete_labels)
+ self._published_ids = current_ids
+
+ def _publish_object_type_queries(self) -> None:
+ """Publish VLM queries for target objects with good image crops."""
+ if not self._enable_object_type_queries:
+ return
+ published = 0
+ objects = self._mapper.get_objects(include_pending=False)
+ eligible = []
+ for obj in objects:
+ if obj.is_asked_vlm or obj.best_image_mask_area <= 500:
+ continue
+ if not obj.best_image_path:
+ continue
+ image_path = Path(obj.best_image_path)
+ if not image_path.exists():
+ continue
+ eligible.append(obj)
+ target = (self._target_object or "").strip().lower()
+ target_matches = [obj for obj in eligible if obj.label == target] if target else []
+
+ mode = self._object_type_query_mode
+ if mode not in ("target_only", "all", "target_or_all"):
+ mode = "target_or_all"
+
+ fallback_used = False
+ if mode == "target_only":
+ candidates = target_matches
+ elif mode == "all":
+ candidates = eligible
+ else:
+ if target_matches:
+ candidates = target_matches
+ else:
+ candidates = eligible
+ fallback_used = True
+
+ if candidates:
+ candidates = sorted(
+ candidates,
+ key=lambda obj: obj.best_image_mask_area,
+ reverse=True,
+ )
+ max_per_cycle = max(1, self._object_type_query_max)
+ for obj in candidates[:max_per_cycle]:
+ msg = ObjectType()
+ msg.object_id = int(obj.track_ids[0]) if obj.track_ids else -1
+ msg.img_path = obj.best_image_path or ""
+ msg.labels = list(obj.class_votes.keys())
+ self._object_type_query_pub.publish(msg)
+ obj.is_asked_vlm = True
+ published += 1
+ now = time.time()
+ if published > 0 or now - self._last_query_log > self._query_log_throttle_s:
+ subscriber_count = self._object_type_query_pub.get_subscription_count()
+ self.get_logger().debug(
+ f"ObjectType queries: published={published} eligible={len(eligible)} "
+ f"target_matches={len(target_matches)} fallback_all={fallback_used} "
+ f"mode={mode} target={self._target_object} subs={subscriber_count}"
+ )
+ self._last_query_log = now
+
+ def _publish_target_object_spatial_queries(self) -> None:
+ """Publish spatial target-object queries when spatial condition is set."""
+ if not self._spatial_condition:
+ return
+
+ published = 0
+ skipped_label = 0
+ skipped_mask = 0
+ skipped_bbox = 0
+ skipped_viewpoints = 0
+ skipped_asked = 0
+ for obj in self._mapper.get_objects(include_pending=False):
+ if obj.label != self._target_object:
+ skipped_label += 1
+ continue
+ if obj.best_image_mask_area <= 500:
+ skipped_mask += 1
+ continue
+ if obj.bbox3d_corners is None:
+ skipped_bbox += 1
+ continue
+
+ object_id = int(obj.track_ids[0]) if obj.track_ids else -1
+ if object_id < 0:
+ continue
+ if object_id in self._target_spatial_query_ids:
+ skipped_asked += 1
+ continue
+
+ viewpoint_ids = self._collect_viewpoint_ids(obj)
+ if len(viewpoint_ids) == 0:
+ skipped_viewpoints += 1
+ continue
+
+ msg = TargetObjectWithSpatial()
+ msg.header.stamp = self.get_clock().now().to_msg()
+ msg.header.frame_id = self._map_frame
+ msg.object_id = object_id
+ msg.object_label = obj.label
+ msg.img_path = obj.best_image_path or ""
+ msg.viewpoint_ids = viewpoint_ids
+ msg.room_label = ""
+ msg.is_target = False
+ for i in range(8):
+ msg.bbox3d[i].x = float(obj.bbox3d_corners[i, 0])
+ msg.bbox3d[i].y = float(obj.bbox3d_corners[i, 1])
+ msg.bbox3d[i].z = float(obj.bbox3d_corners[i, 2])
+
+ self._target_object_spatial_query_pub.publish(msg)
+ self._target_spatial_query_ids.add(object_id)
+ published += 1
+ now = time.time()
+ if published > 0 or now - self._last_query_log > self._query_log_throttle_s:
+ subscriber_count = self._target_object_spatial_query_pub.get_subscription_count()
+ self.get_logger().debug(
+ f"TargetSpatial queries: published={published} skipped_label={skipped_label} "
+ f"skipped_mask={skipped_mask} skipped_bbox={skipped_bbox} "
+ f"skipped_viewpoints={skipped_viewpoints} skipped_asked={skipped_asked} "
+ f"subs={subscriber_count}"
+ )
+ self._last_query_log = now
+
+
+def main(args: list[str] | None = None) -> None:
+ """Entry point for the semantic mapping node."""
+ rclpy.init(args=args)
+ node = SemanticMappingNode()
+ try:
+ rclpy.spin(node)
+ except KeyboardInterrupt:
+ pass
+ finally:
+ node.destroy_node()
+ rclpy.shutdown()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/semantic_mapping/semantic_mapping/prompts/__init__.py b/semantic_mapping/semantic_mapping/prompts/__init__.py
new file mode 100644
index 0000000..acbdca0
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/prompts/__init__.py
@@ -0,0 +1,17 @@
+"""Prompt templates for VLM reasoning tasks."""
+
+from .vlm_prompts import (
+ build_object_type_prompt,
+ build_room_type_prompt,
+ INSTRUCTION_DECOMPOSITION_PROMPT,
+ TARGET_OBJECT_PROMPT,
+ TARGET_OBJECT_SPATIAL_PROMPT,
+)
+
+__all__ = [
+ "build_object_type_prompt",
+ "build_room_type_prompt",
+ "INSTRUCTION_DECOMPOSITION_PROMPT",
+ "TARGET_OBJECT_PROMPT",
+ "TARGET_OBJECT_SPATIAL_PROMPT",
+]
diff --git a/semantic_mapping/semantic_mapping/prompts/vlm_prompts.py b/semantic_mapping/semantic_mapping/prompts/vlm_prompts.py
new file mode 100644
index 0000000..441f452
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/prompts/vlm_prompts.py
@@ -0,0 +1,78 @@
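+"""Prompt templates used by the VLM reasoning node."""
+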
+from typing import Sequence
+
+
+OBJECT_TYPE_PROMPT_TEMPLATE = """You are provided with an RGB image containing an object.
+Select the correct label from the candidate list and respond strictly in JSON.
+
+Return ONLY valid JSON with this schema:
+{{"label": "", "reason": ""}}
+
+Example:
+{{"label": "chair", "reason": "The object has a seat, backrest, and four legs."}}
+
+Candidate labels:
+{labels}
+"""
+
+ROOM_TYPE_PROMPT_TEMPLATE = """You are given an RGB image of a room and a top-down room layout mask.
+Identify the room type and respond strictly in valid JSON.
+
+Return ONLY valid JSON with this schema:
+{{"room_type": ""}}
+
+If you are not confident, respond with:
+{{"room_type": "unknown"}}
+
+Room types:
+{room_types}
+"""
+
+
+INSTRUCTION_DECOMPOSITION_PROMPT = """You are given a user instruction for finding an object.
+Decompose it into the following fields and return ONLY valid JSON:
+
+{"target_object": "", "room_condition": "", "spatial_condition": "", "attribute_condition": "", "anchor_object": "", "attribute_condition_anchor": ""}
+
+If a field is not specified, return it as an empty string.
+
+Example:
+Instruction: "Find a silver refrigerator next to the white cabinet in the student lounge."
+{"target_object": "refrigerator", "room_condition": "in the student lounge", "spatial_condition": "next to the cabinet", "attribute_condition": "silver", "anchor_object": "cabinet", "attribute_condition_anchor": "white"}
+"""
+
+
+TARGET_OBJECT_PROMPT = """You are given an instruction and a candidate object image.
+Determine whether the object matches the target described in the instruction.
+Consider object type, room condition, and attribute condition.
+
+Return ONLY valid JSON:
+{"is_target": true/false, "reason": ""}
+
+Example:
+{"is_target": true, "reason": "The object is a silver refrigerator in the lounge."}
+"""
+
+
+TARGET_OBJECT_SPATIAL_PROMPT = """You are given an instruction, a candidate object image, and
+additional views of the surrounding scene. Determine whether the object matches
+the instruction including any spatial or attribute constraints.
+
+Return ONLY valid JSON:
+{"is_target": true/false, "reason": ""}
+
+Example:
+{"is_target": false, "reason": "The object is a fridge but not next to the cabinet."}
+"""
+
+
+def build_object_type_prompt(labels: Sequence[str]) -> str:
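+    """Fill the object-type prompt template with a comma-separated candidate label list."""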
+ label_block = ", ".join(labels) if labels else "unknown"
+ return OBJECT_TYPE_PROMPT_TEMPLATE.format(labels=label_block)
+
+
+def build_room_type_prompt(room_types: Sequence[str]) -> str:
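+    """Fill the room-type prompt template with a bulleted list of room type names."""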
+ if room_types:
+ room_block = "\n".join(f"- {name}" for name in room_types)
+ else:
+ room_block = "- unknown"
+ return ROOM_TYPE_PROMPT_TEMPLATE.format(room_types=room_block)
diff --git a/semantic_mapping/semantic_mapping/utils/__init__.py b/semantic_mapping/semantic_mapping/utils/__init__.py
new file mode 100644
index 0000000..4a5efc8
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/utils/__init__.py
@@ -0,0 +1,25 @@
+"""Utility modules for semantic mapping."""
+
+from .utils import (
+ CameraParams,
+ annotate_mask,
+ collect_cloud_window,
+ compute_ground_z_threshold,
+ concat_images,
+ interpolate_odom,
+ load_camera_params,
+ load_npy_image,
+ parse_json_dict,
+)
+
+__all__ = [
+ "CameraParams",
+ "annotate_mask",
+ "collect_cloud_window",
+ "compute_ground_z_threshold",
+ "concat_images",
+ "interpolate_odom",
+ "load_camera_params",
+ "load_npy_image",
+ "parse_json_dict",
+]
diff --git a/semantic_mapping/semantic_mapping/utils/map_object_utils.py b/semantic_mapping/semantic_mapping/utils/map_object_utils.py
new file mode 100644
index 0000000..5ad4163
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/utils/map_object_utils.py
@@ -0,0 +1,264 @@
+"""ROS message helpers for MapObject publishers."""
+
+from __future__ import annotations
+
+import zlib
+from typing import Optional
+
+import numpy as np
+from geometry_msgs.msg import Point
+from sensor_msgs.msg import PointCloud2
+from std_msgs.msg import Header
+from visualization_msgs.msg import Marker, MarkerArray
+
+from semantic_mapping.msg import ObjectNode
+from vector_perception_utils.pointcloud_utils import create_pointcloud2_msg
+
+from ..map_object import MapObject
+
+
+def _axis_aligned_corners(points: np.ndarray) -> Optional[np.ndarray]:
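+    """Return the 8 corners of the axis-aligned bounding box, or None for an empty cloud."""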
+ if points.size == 0:
+ return None
+ min_bound = np.min(points, axis=0)
+ max_bound = np.max(points, axis=0)
+ return np.array(
+ [
+ [min_bound[0], min_bound[1], min_bound[2]],
+ [max_bound[0], min_bound[1], min_bound[2]],
+ [max_bound[0], max_bound[1], min_bound[2]],
+ [min_bound[0], max_bound[1], min_bound[2]],
+ [min_bound[0], min_bound[1], max_bound[2]],
+ [max_bound[0], min_bound[1], max_bound[2]],
+ [max_bound[0], max_bound[1], max_bound[2]],
+ [min_bound[0], max_bound[1], max_bound[2]],
+ ],
+ dtype=float,
+ )
+
+
+def get_object_seed(obj: MapObject) -> int:
+ """Generate a deterministic seed from a MapObject for color generation.
+
+ Args:
+ obj: MapObject instance.
+
+ Returns:
+ CRC32 hash of the object's identifier.
+ """
+ seed_key = obj.object_id or (str(obj.track_ids[0]) if obj.track_ids else obj.label)
+ return zlib.crc32(seed_key.encode("utf-8"))
+
+
+def map_object_to_object_node(
+ obj: MapObject,
+ header: Header,
+ status: str = "new",
+ viewpoint_id: int = -1,
+ target_object: str | None = None,
+) -> Optional[ObjectNode]:
+ """Convert a MapObject into an ObjectNode message.
+
+ Args:
+ obj: MapObject instance.
+ header: ROS header to attach to the message.
+ status: Lifecycle status string ("new", "updated", "unchanged").
+        viewpoint_id: Viewpoint identifier attached to the message (overridden to -1 when the label came from the VLM).
+        target_object: Target label; matching objects not yet sent to the VLM are published as "Potential Target".
+
+ Returns:
+ ObjectNode message or None if required geometry is missing.
+ """
+ if obj.centroid is None or obj.bbox3d_corners is None:
+ return None
+
+ msg = ObjectNode()
+ msg.header = header
+ msg.object_id = obj.track_ids
+ if target_object and obj.label == target_object and not obj.is_asked_vlm:
+ msg.label = "Potential Target"
+ else:
+ msg.label = obj.label
+ msg.status = status
+ msg.position.x = float(obj.centroid[0])
+ msg.position.y = float(obj.centroid[1])
+ msg.position.z = float(obj.centroid[2])
+
+ for i in range(8):
+ msg.bbox3d[i].x = float(obj.bbox3d_corners[i, 0])
+ msg.bbox3d[i].y = float(obj.bbox3d_corners[i, 1])
+ msg.bbox3d[i].z = float(obj.bbox3d_corners[i, 2])
+
+ voxels = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(voxels) == 0:
+ voxels = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=False,
+ )
+ if len(voxels) > 0:
+ seed = get_object_seed(obj)
+ color = np.array(
+ [
+ (seed & 0xFF) / 255.0,
+ ((seed >> 8) & 0xFF) / 255.0,
+ ((seed >> 16) & 0xFF) / 255.0,
+ ],
+ dtype=np.float32,
+ )
+ colors = np.tile(color, (voxels.shape[0], 1))
+ msg.cloud = create_pointcloud2_msg(points=voxels, colors=colors, header=header)
+
+ msg.img_path = obj.best_image_path or ""
+ msg.is_asked_vlm = obj.is_asked_vlm
+ msg.viewpoint_id = -1 if obj.updated_by_vlm else viewpoint_id
+ return msg
+
+
+def create_bbox_markers(
+ objects: list[MapObject], header: Header, namespace: str = "bbox"
+) -> MarkerArray:
+ """Create wireframe bounding box markers for MapObjects."""
+ markers = MarkerArray()
+ edges = [
+ (0, 1),
+ (1, 2),
+ (2, 3),
+ (3, 0),
+ (4, 5),
+ (5, 6),
+ (6, 7),
+ (7, 4),
+ (0, 4),
+ (1, 5),
+ (2, 6),
+ (3, 7),
+ ]
+
+ for obj in objects:
+ voxels = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(voxels) == 0:
+ voxels = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=False,
+ )
+ corners = _axis_aligned_corners(voxels) if len(voxels) > 0 else None
+ if corners is None:
+ continue
+ marker = Marker()
+ marker.header = header
+ marker.ns = namespace
+ marker.id = int(obj.track_ids[0]) if obj.track_ids else 0
+ marker.type = Marker.LINE_LIST
+ marker.action = Marker.ADD
+ marker.scale.x = 0.02
+
+ seed = get_object_seed(obj)
+ marker.color.r = (seed & 0xFF) / 255.0
+ marker.color.g = ((seed >> 8) & 0xFF) / 255.0
+ marker.color.b = ((seed >> 16) & 0xFF) / 255.0
+ marker.color.a = 1.0
+
+ for i, j in edges:
+ marker.points.append(Point(x=float(corners[i, 0]), y=float(corners[i, 1]), z=float(corners[i, 2])))
+ marker.points.append(Point(x=float(corners[j, 0]), y=float(corners[j, 1]), z=float(corners[j, 2])))
+
+ markers.markers.append(marker)
+
+ return markers
+
+
+def create_label_markers(
+ objects: list[MapObject],
+ header: Header,
+ namespace: str = "labels",
+ text_height: float = 0.4,
+) -> MarkerArray:
+ """Create text label markers for MapObjects."""
+ markers = MarkerArray()
+
+ for obj in objects:
+ if obj.centroid is None:
+ continue
+ marker = Marker()
+ marker.header = header
+ marker.ns = namespace
+ marker.id = int(obj.track_ids[0]) if obj.track_ids else 0
+ marker.type = Marker.TEXT_VIEW_FACING
+ marker.action = Marker.ADD
+ marker.scale.z = text_height
+ marker.text = obj.label
+ marker.pose.position.x = float(obj.centroid[0])
+ marker.pose.position.y = float(obj.centroid[1])
+ marker.pose.position.z = float(obj.centroid[2]) + 0.3
+
+ seed = get_object_seed(obj)
+ marker.color.r = (seed & 0xFF) / 255.0
+ marker.color.g = ((seed >> 8) & 0xFF) / 255.0
+ marker.color.b = ((seed >> 16) & 0xFF) / 255.0
+ marker.color.a = 1.0
+
+ markers.markers.append(marker)
+
+ return markers
+
+
+def create_object_pointcloud(objects: list[MapObject], header: Header) -> Optional[PointCloud2]:
+ """Create a colored point cloud for a list of MapObjects."""
+ points_list = []
+ colors_list = []
+
+ for obj in objects:
+ voxels = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(voxels) == 0:
+ voxels = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=False,
+ )
+ if len(voxels) == 0:
+ continue
+
+ seed = get_object_seed(obj)
+ color = np.array(
+ [
+ (seed & 0xFF) / 255.0,
+ ((seed >> 8) & 0xFF) / 255.0,
+ ((seed >> 16) & 0xFF) / 255.0,
+ ],
+ dtype=np.float32,
+ )
+ colors = np.tile(color, (voxels.shape[0], 1))
+ points_list.append(voxels)
+ colors_list.append(colors)
+
+ if len(points_list) == 0:
+ return None
+
+ points = np.concatenate(points_list, axis=0)
+ colors = np.concatenate(colors_list, axis=0)
+ return create_pointcloud2_msg(points=points, colors=colors, header=header)
+
+
+def create_delete_markers(
+ deleted_ids: list[int],
+ header: Header,
+ namespace: str = "bbox",
+) -> MarkerArray:
+ """Create deletion markers for a list of marker IDs."""
+ markers = MarkerArray()
+ for marker_id in deleted_ids:
+ marker = Marker()
+ marker.header = header
+ marker.ns = namespace
+ marker.id = int(marker_id)
+ marker.action = Marker.DELETE
+ markers.markers.append(marker)
+ return markers
diff --git a/semantic_mapping/semantic_mapping/utils/matching.py b/semantic_mapping/semantic_mapping/utils/matching.py
new file mode 100644
index 0000000..e5aaa19
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/utils/matching.py
@@ -0,0 +1,256 @@
+"""Object matching utilities for semantic mapping."""
+
+from __future__ import annotations
+
+from typing import Optional
+
+import cv2
+import numpy as np
+
+from vector_perception_utils.pointcloud_utils import compute_centroid
+
+from ..map_object import MapObject, Observation
+
+
+def _polygon_area(poly: np.ndarray) -> float:
+ """Compute polygon area using the shoelace formula.
+
+ Args:
+ poly: Polygon vertices in order (N, 2).
+
+ Returns:
+ Polygon area.
+ """
+ if poly.shape[0] < 3:
+ return 0.0
+ x = poly[:, 0]
+ y = poly[:, 1]
+ return 0.5 * abs(float(np.dot(x, np.roll(y, -1)) - np.dot(y, np.roll(x, -1))))
+
+
+def _ensure_ccw(poly: np.ndarray) -> np.ndarray:
+ """Ensure polygon vertices are ordered counter-clockwise."""
+ if poly.shape[0] < 3:
+ return poly
+ x = poly[:, 0]
+ y = poly[:, 1]
+ signed = float(np.dot(x, np.roll(y, -1)) - np.dot(y, np.roll(x, -1)))
+ if signed < 0:
+ return poly[::-1]
+ return poly
+
+
+def compute_bbox(points: np.ndarray) -> Optional[tuple[np.ndarray, np.ndarray]]:
+    """Compute a yaw-only oriented 3D bounding box for a point cloud.
+
+    Args:
+        points: Point cloud (N, 3).
+
+    Returns:
+        Tuple of (corner points (8, 3), extents [width, height, z-range]) or None if degenerate.
+    """
+ if points.shape[0] < 3:
+ return None
+
+ xy = points[:, :2].astype(np.float32)
+ unique_xy = np.unique(xy, axis=0)
+ if unique_xy.shape[0] < 3:
+ return None
+
+ rect = cv2.minAreaRect(unique_xy)
+ (width, height) = rect[1]
+ if width == 0.0 or height == 0.0:
+ return None
+
+ box = cv2.boxPoints(rect).astype(np.float64)
+ rect_xy = _ensure_ccw(box)
+
+ zmin = float(np.min(points[:, 2]))
+ zmax = float(np.max(points[:, 2]))
+ extent = np.array([width, height, zmax - zmin], dtype=float)
+ z_center = zmax - extent[2] / 2.0
+ half_z = extent[2] / 2.0
+
+ bottom = np.hstack([rect_xy, np.full((4, 1), z_center - half_z)])
+ top = np.hstack([rect_xy, np.full((4, 1), z_center + half_z)])
+ corners = np.vstack([bottom, top])
+ return corners, extent
+
+
+def _line_intersection(p1: np.ndarray, p2: np.ndarray, cp1: np.ndarray, cp2: np.ndarray) -> np.ndarray:
+ """Intersection of line p1->p2 with line cp1->cp2.
+
+ Args:
+ p1: Line segment start (2,).
+ p2: Line segment end (2,).
+ cp1: Clip edge start (2,).
+ cp2: Clip edge end (2,).
+
+ Returns:
+ Intersection point (2,).
+ """
+ dc = cp1 - cp2
+ dp = p1 - p2
+ denom = dc[0] * dp[1] - dc[1] * dp[0]
+ if denom == 0:
+ return p2
+ n1 = cp1[0] * cp2[1] - cp1[1] * cp2[0]
+ n2 = p1[0] * p2[1] - p1[1] * p2[0]
+ x = (n1 * dp[0] - n2 * dc[0]) / denom
+ y = (n1 * dp[1] - n2 * dc[1]) / denom
+ return np.array([x, y], dtype=np.float64)
+
+
+def _polygon_clip(subject: np.ndarray, clip: np.ndarray) -> np.ndarray:
+ """Clip a polygon with a convex clip polygon (Sutherland–Hodgman).
+
+ Args:
+ subject: Subject polygon vertices (N, 2).
+ clip: Clip polygon vertices (M, 2).
+
+ Returns:
+ Clipped polygon vertices (K, 2).
+ """
+ if subject.shape[0] == 0:
+ return subject
+
+    def inside(p: np.ndarray, cp1: np.ndarray, cp2: np.ndarray) -> bool:
+        cross = (cp2[0] - cp1[0]) * (p[1] - cp1[1]) - (cp2[1] - cp1[1]) * (p[0] - cp1[0])
+        return cross >= 0
+
+ output = subject
+ for i in range(len(clip)):
+ cp1 = clip[i]
+ cp2 = clip[(i + 1) % len(clip)]
+ input_list = output
+ output = []
+ if len(input_list) == 0:
+ return np.empty((0, 2), dtype=np.float64)
+ s = input_list[-1]
+ for e in input_list:
+ if inside(e, cp1, cp2):
+ if not inside(s, cp1, cp2):
+ output.append(_line_intersection(s, e, cp1, cp2))
+ output.append(e)
+ elif inside(s, cp1, cp2):
+ output.append(_line_intersection(s, e, cp1, cp2))
+ s = e
+ output = np.array(output, dtype=np.float64)
+ return np.array(output, dtype=np.float64)
+
+
+def compute_iou_3d_with_ratios(
+ bbox1: np.ndarray, bbox2: np.ndarray
+) -> tuple[float, float, float]:
+ """Compute IoU and intersection ratios using yaw-only bbox overlap.
+
+ Args:
+ bbox1: Corner points for bbox1 (N, 3).
+ bbox2: Corner points for bbox2 (N, 3).
+
+ Returns:
+ Tuple of (iou, ratio_bbox1, ratio_bbox2).
+ """
+ if bbox1.shape[0] < 4 or bbox2.shape[0] < 4:
+ return 0.0, 0.0, 0.0
+
+ rect1 = _ensure_ccw(bbox1[:4, :2])
+ rect2 = _ensure_ccw(bbox2[:4, :2])
+ if rect1.shape[0] < 3 or rect2.shape[0] < 3:
+ return 0.0, 0.0, 0.0
+
+ poly_inter = _polygon_clip(rect1, rect2)
+ inter_area = _polygon_area(poly_inter) if poly_inter.shape[0] >= 3 else 0.0
+
+ zmin1, zmax1 = float(np.min(bbox1[:, 2])), float(np.max(bbox1[:, 2]))
+ zmin2, zmax2 = float(np.min(bbox2[:, 2])), float(np.max(bbox2[:, 2]))
+ height = max(0.0, min(zmax1, zmax2) - max(zmin1, zmin2))
+ inter_vol = inter_area * height
+
+ area1 = _polygon_area(rect1)
+ area2 = _polygon_area(rect2)
+ vol1 = area1 * (zmax1 - zmin1)
+ vol2 = area2 * (zmax2 - zmin2)
+ denom = vol1 + vol2 - inter_vol
+
+ iou = inter_vol / denom if denom > 0 else 0.0
+ ratio1 = inter_vol / vol1 if vol1 > 0 else 0.0
+ ratio2 = inter_vol / vol2 if vol2 > 0 else 0.0
+ return iou, ratio1, ratio2
+
+
+def find_best_match(
+ obs: Observation,
+ candidates: list[MapObject],
+ distance_threshold: float,
+ iou_threshold: float,
+ ratio_threshold: float = 0.4,
+ min_distance_merge: float = 0.25,
+) -> Optional[MapObject]:
+ """Find best matching object by distance/IoU heuristics.
+
+ Args:
+ obs: Observation to match.
+ candidates: Candidate map objects.
+ distance_threshold: Max centroid distance to consider.
+ iou_threshold: Min IoU to accept a merge.
+ ratio_threshold: Min overlap ratio to accept a merge.
+ min_distance_merge: Absolute distance threshold for merge.
+
+ Returns:
+ Best matching object or None if no match passes thresholds.
+ """
+ if obs.points.shape[0] == 0:
+ return None
+
+ obs_center = compute_centroid(obs.points)
+ obs_bbox = compute_bbox(obs.points)
+ obs_corners = obs_bbox[0] if obs_bbox is not None else None
+ obs_extent = obs_bbox[1] if obs_bbox is not None else None
+
+ best_candidate: Optional[MapObject] = None
+ best_dist = float("inf")
+
+ for candidate in candidates:
+ if obs.track_id in candidate.track_ids:
+ return candidate
+
+ cand_center = candidate.centroid
+ if cand_center is None:
+ continue
+
+ dist = float(np.linalg.norm(obs_center - cand_center))
+ if dist > distance_threshold:
+ continue
+
+ cand_points = candidate.voxel_manager.get_valid_voxels(
+ diversity_percentile=candidate._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(cand_points) == 0:
+ cand_points = candidate.voxel_manager.get_valid_voxels(
+ diversity_percentile=candidate._diversity_percentile,
+ use_regularized=False,
+ )
+ cand_bbox = compute_bbox(cand_points) if len(cand_points) >= 3 else None
+ cand_corners = cand_bbox[0] if cand_bbox is not None else None
+ cand_extent = cand_bbox[1] if cand_bbox is not None else None
+
+ dist_merge = dist < min_distance_merge
+ if obs_extent is not None and cand_extent is not None:
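+            # Adaptive merge distance: half the norm of the averaged half-extents.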
+ dist_thresh = np.linalg.norm((obs_extent / 2 + cand_extent / 2) / 2) * 0.5
+ dist_merge = dist_merge or dist < dist_thresh
+
+ if dist_merge:
+ if dist < best_dist:
+ best_candidate = candidate
+ best_dist = dist
+ continue
+
+ if obs_corners is None or cand_corners is None:
+ continue
+
+ iou, ratio_obs, ratio_cand = compute_iou_3d_with_ratios(
+ obs_corners, cand_corners
+ )
+
+ if iou > iou_threshold or ratio_obs > ratio_threshold or ratio_cand > ratio_threshold:
+ if dist < best_dist:
+ best_candidate = candidate
+ best_dist = dist
+
+ return best_candidate
diff --git a/semantic_mapping/semantic_mapping/utils/storage.py b/semantic_mapping/semantic_mapping/utils/storage.py
new file mode 100644
index 0000000..a3089f4
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/utils/storage.py
@@ -0,0 +1,157 @@
+"""Async image saving queue for semantic mapping."""
+
+from __future__ import annotations
+
+from pathlib import Path
+import os
+import shutil
+import queue
+import threading
+import time
+
+import cv2
+import numpy as np
+
+
+def _reset_output_dir(output_dir: Path) -> None:
+ """Clear an output directory if it contains existing files."""
+ if output_dir.exists() and any(output_dir.iterdir()):
+ shutil.rmtree(output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+
+class ImageSaveQueue:
+ """Background worker for saving images to disk."""
+
+ def __init__(
+ self,
+ output_dir: Path,
+ max_queue_size: int = 100,
+ save_png: bool = False,
+ save_transform: bool = False,
+ clear_output_dir: bool = True,
+ use_timestamp_dir: bool = False,
+ ) -> None:
+ """Initialize the image save queue.
+
+ Args:
+ output_dir: Base directory where images are stored.
+ max_queue_size: Maximum number of queued save jobs.
+ save_png: If True, also save a .png copy of the image.
+ save_transform: If True, save a viewpoint transform when provided.
+ clear_output_dir: If True, clear existing contents on startup.
+ use_timestamp_dir: If True, write into a timestamped subdirectory.
+ """
+ base_dir = Path(output_dir)
+ if clear_output_dir:
+ _reset_output_dir(base_dir)
+ else:
+ base_dir.mkdir(parents=True, exist_ok=True)
+ if use_timestamp_dir:
+ run_dir = base_dir / f"{time.time():.2f}"
+ run_dir.mkdir(parents=True, exist_ok=True)
+ self._output_dir = run_dir
+ else:
+ self._output_dir = base_dir
+ self._save_png = save_png
+ self._save_transform = save_transform
+ self._queue: queue.Queue = queue.Queue(maxsize=max_queue_size)
+ self._stop_event = threading.Event()
+ self._thread: threading.Thread | None = None
+
+ def start(self) -> None:
+ """Start the background worker."""
+ if self._thread is not None and self._thread.is_alive():
+ return
+ self._stop_event.clear()
+ self._thread = threading.Thread(target=self._run, daemon=True)
+ self._thread.start()
+
+ def stop(self) -> None:
+ """Stop the background worker."""
+ self._stop_event.set()
+ try:
+ self._queue.put_nowait(None)
+ except queue.Full:
+ pass
+ if self._thread is not None:
+ self._thread.join(timeout=2.0)
+ self._thread = None
+
+ def queue_save(
+ self, object_id: str, image: np.ndarray, mask: np.ndarray | None = None
+ ) -> str:
+ """Queue an image (and optional mask) for saving.
+
+ Args:
+ object_id: Object identifier to embed in filename.
+ image: Image array to save.
+ mask: Optional mask array to save alongside the image.
+
+ Returns:
+ Path to the saved image (.npy).
+ """
+ timestamp_ms = int(time.time() * 1000)
+ image_path = self._output_dir / f"{object_id}_{timestamp_ms}.npy"
+ mask_path = self._output_dir / f"{object_id}_{timestamp_ms}_mask.npy"
+ self._queue.put(("save_object", str(image_path), str(mask_path), image, mask))
+ return str(image_path)
+
+ def queue_viewpoint_save(
+ self, viewpoint_id: int, image: np.ndarray, transform: np.ndarray | None
+ ) -> None:
+ """Queue a viewpoint snapshot for saving."""
+ if not self._save_png and not self._save_transform:
+ return
+ image_path = self._output_dir / f"viewpoint_{viewpoint_id}.png"
+ transform_path = self._output_dir / f"viewpoint_{viewpoint_id}_transform.npy"
+ try:
+ self._queue.put_nowait(
+ ("save_viewpoint", str(image_path), str(transform_path), image, transform)
+ )
+ except queue.Full:
+ return
+
+ def _run(self) -> None:
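+        """Drain the queue and write images, masks, and transforms atomically via temp files and os.replace."""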
+ while not self._stop_event.is_set():
+ item = self._queue.get()
+ if item is None:
+ self._queue.task_done()
+ break
+ op, image_path, aux_path, image, aux = item
+ if op == "save_object":
+ try:
+ image_target = Path(image_path)
+ image_tmp = image_target.with_suffix(image_target.suffix + ".tmp")
+ with image_tmp.open("wb") as handle:
+ np.save(handle, image)
+ os.replace(image_tmp, image_target)
+ if aux is not None:
+ aux_target = Path(aux_path)
+ aux_tmp = aux_target.with_suffix(aux_target.suffix + ".tmp")
+ with aux_tmp.open("wb") as handle:
+ np.save(handle, aux)
+ os.replace(aux_tmp, aux_target)
+ if self._save_png:
+ png_path = image_target.with_suffix(".png")
+ png_tmp = png_path.with_name(png_path.stem + ".tmp.png")
+ cv2.imwrite(str(png_tmp), image)
+ os.replace(png_tmp, png_path)
+ except Exception:
+ pass
+ elif op == "save_viewpoint":
+ try:
+ if self._save_transform and aux is not None:
+ aux_target = Path(aux_path)
+ aux_tmp = aux_target.with_suffix(aux_target.suffix + ".tmp")
+ with aux_tmp.open("wb") as handle:
+ np.save(handle, aux)
+ os.replace(aux_tmp, aux_target)
+ if self._save_png:
+ image_target = Path(image_path)
+ png_tmp = image_target.with_name(image_target.stem + ".tmp.png")
+ cv2.imwrite(str(png_tmp), image)
+ os.replace(png_tmp, image_target)
+ except Exception:
+ pass
+ self._queue.task_done()
diff --git a/semantic_mapping/semantic_mapping/utils/utils.py b/semantic_mapping/semantic_mapping/utils/utils.py
new file mode 100644
index 0000000..2a4e980
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/utils/utils.py
@@ -0,0 +1,234 @@
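+"""General-purpose helpers shared by the semantic mapping nodes."""
+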
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+import json
+import time
+from typing import Any
+
+import cv2
+import numpy as np
+import yaml
+
+
+@dataclass(frozen=True)
+class CameraParams:
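+    """Camera projection parameters loaded from the camera parameter YAML."""
+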
+ image_width: int = 1920
+ image_height: int = 640
+ hfov: float = 360.0
+ vfov: float = 120.0
+ sensor_to_camera: np.ndarray | None = None
+
+
+def load_camera_params(path: str, base_dir: Path | None = None) -> CameraParams | None:
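+    """Load CameraParams from a YAML file, resolving relative paths against base_dir; returns None if the file is missing."""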
+ config_path = Path(path)
+ if not config_path.is_absolute() and base_dir is not None:
+ config_path = base_dir / config_path
+ if not config_path.exists():
+ return None
+ data = yaml.safe_load(config_path.read_text()) or {}
+ sensor_to_camera = _load_sensor_to_camera_matrix(data)
+ return CameraParams(
+ image_width=int(data.get("image_width", 1920)),
+ image_height=int(data.get("image_height", 640)),
+ hfov=float(data.get("hfov", 360.0)),
+ vfov=float(data.get("vfov", 120.0)),
+ sensor_to_camera=sensor_to_camera,
+ )
+
+
+def _load_sensor_to_camera_matrix(data: dict[str, Any]) -> np.ndarray:
+    """Build the 4x4 sensor-to-camera transform by inverting the configured camera pose (translation + RPY)."""
+ tf_data = data.get("sensor_to_camera", {})
+ trans = tf_data.get("translation", {})
+ rpy = tf_data.get("rotation_rpy", {})
+ cam_offset = np.array(
+ [
+ float(trans.get("x", 0.0)),
+ float(trans.get("y", 0.0)),
+ float(trans.get("z", 0.0)),
+ ]
+ )
+ roll = float(rpy.get("roll", 0.0))
+ pitch = float(rpy.get("pitch", 0.0))
+ yaw = float(rpy.get("yaw", 0.0))
+
+ r_x = np.array(
+ [[1, 0, 0], [0, np.cos(roll), -np.sin(roll)], [0, np.sin(roll), np.cos(roll)]]
+ )
+ r_y = np.array(
+ [[np.cos(pitch), 0, np.sin(pitch)], [0, 1, 0], [-np.sin(pitch), 0, np.cos(pitch)]]
+ )
+ r_z = np.array(
+ [[np.cos(yaw), -np.sin(yaw), 0], [np.sin(yaw), np.cos(yaw), 0], [0, 0, 1]]
+ )
+ cam_rotation = r_z @ r_y @ r_x
+ rotation = cam_rotation.T
+ translation = -rotation @ cam_offset
+ transform = np.eye(4)
+ transform[:3, :3] = rotation
+ transform[:3, 3] = translation
+ return transform
+
+
+def parse_json_dict(text: str) -> dict[str, Any]:
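+    """Extract and parse the first JSON object from a (possibly code-fenced) VLM response; empty dict on failure."""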
+ if not text:
+ return {}
+ cleaned = text.strip()
+ if "```" in cleaned:
+ parts = cleaned.split("```")
+ if len(parts) >= 3:
+ cleaned = parts[1].strip()
+ start = cleaned.find("{")
+ end = cleaned.rfind("}")
+ if start != -1 and end != -1:
+ cleaned = cleaned[start : end + 1]
+ try:
+ parsed = json.loads(cleaned)
+ except json.JSONDecodeError:
+ return {}
+ return parsed if isinstance(parsed, dict) else {}
+
+
+def load_npy_image(path_str: str, retries: int = 3, retry_delay_s: float = 0.05) -> np.ndarray | None:
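+    """Load a .npy image, retrying briefly in case a writer is mid-save; returns None on failure."""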
+ path = Path(path_str)
+ if not path.exists():
+ return None
+ for attempt in range(max(1, retries)):
+ try:
+ return np.load(str(path))
+ except Exception:
+ if attempt >= retries - 1:
+ return None
+ time.sleep(retry_delay_s)
+
+
+def annotate_mask(image: np.ndarray, image_path: str, iterations: int = 2) -> np.ndarray:
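+    """Draw the saved object mask outline (from the sibling *_mask.npy) onto a copy of the image."""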
+ mask_path = image_path.replace(".npy", "_mask.npy")
+ if not Path(mask_path).exists():
+ return image
+ mask = np.load(mask_path)
+ mask = cv2.dilate(mask.astype(np.uint8), np.ones((3, 3), np.uint8), iterations=iterations)
+ mask_uint8 = mask.astype(np.uint8)
+ contours, _ = cv2.findContours(mask_uint8, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+ annotated = image.copy()
+ cv2.drawContours(annotated, contours, -1, (0, 255, 0), 2)
+ return annotated
+
+
+def concat_images(images: list[np.ndarray]) -> np.ndarray:
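+    """Resize images to a common height, concatenate them horizontally, and convert BGR to RGB."""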
+ if not images:
+ return np.zeros((32, 32, 3), dtype=np.uint8)
+ heights = [img.shape[0] for img in images]
+ target_h = min(heights)
+ resized = []
+ for img in images:
+ if img.shape[0] != target_h:
+ scale = target_h / img.shape[0]
+ width = max(1, int(img.shape[1] * scale))
+ img = cv2.resize(img, (width, target_h), interpolation=cv2.INTER_AREA)
+ resized.append(img)
+ return cv2.cvtColor(cv2.hconcat(resized), cv2.COLOR_BGR2RGB)
+
+
+def interpolate_odom(
+ odom_buffer: list[tuple[float, dict]],
+ detection_stamp: float,
+ linear_bias: float,
+ angular_bias: float,
+) -> dict | None:
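+    """Interpolate odometry at the (bias-shifted) detection time.
+
+    Positions and velocities are linearly interpolated; orientation uses quaternion slerp.
+    Returns None when fewer than two samples are buffered or the requested stamps fall
+    outside the buffered odometry range.
+    """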
+ target_stamp = detection_stamp + linear_bias
+ angular_stamp = detection_stamp + angular_bias
+ stamps = [s for s, _ in odom_buffer]
+ if len(stamps) < 2:
+ return None
+ if target_stamp < stamps[0] or target_stamp > stamps[-1]:
+ return None
+ if angular_stamp < stamps[0] or angular_stamp > stamps[-1]:
+ return None
+
+ right_idx = next(i for i, s in enumerate(stamps) if s >= target_stamp)
+ left_idx = max(right_idx - 1, 0)
+ angular_right_idx = next(i for i, s in enumerate(stamps) if s >= angular_stamp)
+ angular_left_idx = max(angular_right_idx - 1, 0)
+
+ left_stamp, left_odom = odom_buffer[left_idx]
+ right_stamp, right_odom = odom_buffer[right_idx]
+ angular_left_stamp, angular_left_odom = odom_buffer[angular_left_idx]
+ angular_right_stamp, angular_right_odom = odom_buffer[angular_right_idx]
+ denom = right_stamp - left_stamp
+ ratio = (target_stamp - left_stamp) / denom if denom > 0 else 0.5
+ angular_denom = angular_right_stamp - angular_left_stamp
+ angular_ratio = (angular_stamp - angular_left_stamp) / angular_denom if angular_denom > 0 else 0.5
+
+ return {
+ "position": right_odom["position"] * ratio + left_odom["position"] * (1 - ratio),
+ "linear_velocity": right_odom["linear_velocity"] * ratio
+ + left_odom["linear_velocity"] * (1 - ratio),
+ "orientation": _slerp_quat(angular_left_odom["orientation"], angular_right_odom["orientation"], angular_ratio),
+ "angular_velocity": angular_right_odom["angular_velocity"] * angular_ratio
+ + angular_left_odom["angular_velocity"] * (1 - angular_ratio),
+ }
+
+
+def collect_cloud_window(
+ cloud_buffer: list[tuple[float, np.ndarray]],
+ detection_stamp: float,
+ window_before: float,
+ window_after: float,
+) -> np.ndarray | None:
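+    """Concatenate buffered clouds whose stamps fall within [stamp - window_before, stamp + window_after]."""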
+ start = detection_stamp - window_before
+ end = detection_stamp + window_after
+ clouds = [pts for stamp, pts in cloud_buffer if start <= stamp <= end]
+ if len(clouds) == 0:
+ return None
+ return np.concatenate(clouds, axis=0)
+
+
+def compute_ground_z_threshold(
+ points: np.ndarray,
+ robot_xy: np.ndarray,
+ ground_filter: bool,
+ ground_min_points: int,
+ ground_radius: float,
+ ground_percentile: float,
+ ground_z_clearance: float,
+) -> float | None:
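+    """Estimate a ground-height cutoff from points within ground_radius of the robot.
+
+    Returns the configured z percentile plus ground_z_clearance, or None when filtering is
+    disabled, too few points are available, or the radius/percentile parameters are invalid.
+    """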
+ if not ground_filter:
+ return None
+ if points.shape[0] < ground_min_points:
+ return None
+ if ground_radius <= 0.0:
+ return None
+ if not 0.0 < ground_percentile < 1.0:
+ return None
+
+ diffs = points[:, :2] - robot_xy
+ dist2 = diffs[:, 0] ** 2 + diffs[:, 1] ** 2
+ radius2 = ground_radius**2
+ mask = dist2 <= radius2
+ if np.count_nonzero(mask) < ground_min_points:
+ return None
+
+ z_vals = points[mask, 2]
+ z_thresh = float(np.percentile(z_vals, ground_percentile * 100.0))
+ return z_thresh + ground_z_clearance
+
+
+def _slerp_quat(q0: np.ndarray, q1: np.ndarray, ratio: float) -> np.ndarray:
+ """Slerp between two quaternions without scipy dependency."""
+ q0 = np.asarray(q0, dtype=float)
+ q1 = np.asarray(q1, dtype=float)
+ dot = float(np.dot(q0, q1))
+ if dot < 0.0:
+ q1 = -q1
+ dot = -dot
+ if dot > 0.9995:
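+        # Nearly parallel quaternions: fall back to normalized linear interpolation.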
+ result = q0 + ratio * (q1 - q0)
+ return result / np.linalg.norm(result)
+ theta_0 = np.arccos(dot)
+ sin_theta_0 = np.sin(theta_0)
+ theta = theta_0 * ratio
+ sin_theta = np.sin(theta)
+ s0 = np.cos(theta) - dot * sin_theta / sin_theta_0
+ s1 = sin_theta / sin_theta_0
+ return s0 * q0 + s1 * q1
diff --git a/semantic_mapping/semantic_mapping/vlm_reasoning_node.py b/semantic_mapping/semantic_mapping/vlm_reasoning_node.py
new file mode 100644
index 0000000..9d717f8
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/vlm_reasoning_node.py
@@ -0,0 +1,556 @@
+"""ROS2 node for VLM reasoning tasks in semantic mapping."""
+
+from __future__ import annotations
+
+from collections import deque
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+from pathlib import Path
+import threading
+import time
+
+import cv2
+import numpy as np
+import rclpy
+from cv_bridge import CvBridge
+from rclpy.node import Node
+from std_msgs.msg import String
+import yaml
+
+from semantic_mapping.msg import ObjectType, RoomType, TargetObjectInstruction, TargetObjectWithSpatial
+from vector_perception_utils.pointcloud_utils import project_bbox3d_equirect
+from vlm.models import MoondreamConfig, MoondreamVlm, QwenConfig, QwenVlm, VlmModelBase
+
+from .prompts import (
+ build_object_type_prompt,
+ build_room_type_prompt,
+ INSTRUCTION_DECOMPOSITION_PROMPT,
+ TARGET_OBJECT_SPATIAL_PROMPT,
+)
+from .utils.utils import (
+ annotate_mask,
+ concat_images,
+ load_camera_params,
+ load_npy_image,
+ parse_json_dict,
+)
+
+
+@dataclass
+class InstructionState:
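+    """Decomposed fields of the most recent user instruction."""
+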
+ instruction: str = ""
+ target_object: str = ""
+ room_condition: str = ""
+ spatial_condition: str = ""
+ attribute_condition: str = ""
+ anchor_object: str = ""
+ attribute_condition_anchor: str = ""
+
+
+class VlmReasoningNode(Node):
+ """VLM reasoning node that answers object queries for semantic mapping."""
+
+ def __init__(self) -> None:
+ super().__init__("vlm_reasoning_node")
+
+ self.declare_parameter("vlm_provider", "qwen")
+ self.declare_parameter("vlm_model", "qwen2.5-vl-72b-instruct")
+ self.declare_parameter("objects_yaml_path", "detect_anything/config/objects.yaml")
+ self.declare_parameter("object_type_query_topic", "/object_type_query")
+ self.declare_parameter("object_type_answer_topic", "/object_type_answer")
+ self.declare_parameter("target_object_spatial_query_topic", "/target_object_spatial_query")
+ self.declare_parameter("target_object_answer_topic", "/target_object_answer")
+ self.declare_parameter("target_object_instruction_topic", "/target_object_instruction")
+ self.declare_parameter("instruction_topic", "/keyboard_input")
+ self.declare_parameter("room_type_query_topic", "/room_type_query")
+ self.declare_parameter("room_type_answer_topic", "/room_type_answer")
+ self.declare_parameter("room_type_cache_s", 10.0)
+ self.declare_parameter("viewpoint_output_dir", "output/viewpoint_images")
+ self.declare_parameter("dump_object_type_images", False)
+ self.declare_parameter("object_type_dump_dir", "output/vlm_object_type")
+ self.declare_parameter("dump_room_type_images", False)
+ self.declare_parameter("room_type_dump_dir", "output/vlm_room_type")
+ self.declare_parameter("camera_param_path", "config/camera_param.yaml")
+ self.declare_parameter("platform", "mecanum")
+ self.declare_parameter("max_viewpoints", 3)
+ self.declare_parameter(
+ "room_types",
+ [
+ "Classroom",
+ "Laboratory",
+ "Office Room",
+ "Meeting Room",
+ "Restroom",
+ "Storage Room",
+ "Copy Room",
+ "Lounge",
+ "Reception",
+ "Hallway",
+ ],
+ )
+
+ self._vlm = self._build_vlm()
+ self._objects = self._load_object_labels()
+ self._viewpoint_dir = Path(str(self.get_parameter("viewpoint_output_dir").value))
+ self._platform = str(self.get_parameter("platform").value)
+ self._max_viewpoints = int(self.get_parameter("max_viewpoints").value)
+ camera_param_path = str(self.get_parameter("camera_param_path").value)
+ self._camera_params = load_camera_params(
+ camera_param_path, base_dir=Path(__file__).resolve().parent.parent
+ )
+ room_types_value = self.get_parameter("room_types").value
+ if isinstance(room_types_value, (list, tuple)):
+ self._room_types = [str(item) for item in room_types_value]
+ else:
+ self._room_types = []
+ self._instruction_state = InstructionState()
+ self._bridge = CvBridge()
+ self._dump_object_type_images = bool(
+ self.get_parameter("dump_object_type_images").value
+ )
+ self._object_type_dump_dir = Path(
+ str(self.get_parameter("object_type_dump_dir").value)
+ )
+ if self._dump_object_type_images:
+ self._object_type_dump_dir.mkdir(parents=True, exist_ok=True)
+ self._dump_room_type_images = bool(self.get_parameter("dump_room_type_images").value)
+ self._room_type_dump_dir = Path(str(self.get_parameter("room_type_dump_dir").value))
+ if self._dump_room_type_images:
+ self._room_type_dump_dir.mkdir(parents=True, exist_ok=True)
+
+ self._lock = threading.Lock()
+ self._instruction_queue: deque[str] = deque(maxlen=20)
+ self._object_type_queries: dict[int, ObjectType] = {}
+ self._room_type_queries: dict[int, RoomType] = {}
+ self._spatial_queries: dict[int, TargetObjectWithSpatial] = {}
+
+ object_type_query_topic = str(self.get_parameter("object_type_query_topic").value)
+ object_type_answer_topic = str(self.get_parameter("object_type_answer_topic").value)
+ target_object_spatial_query_topic = str(
+ self.get_parameter("target_object_spatial_query_topic").value
+ )
+ target_object_answer_topic = str(self.get_parameter("target_object_answer_topic").value)
+ target_object_instruction_topic = str(
+ self.get_parameter("target_object_instruction_topic").value
+ )
+ instruction_topic = str(self.get_parameter("instruction_topic").value)
+ room_type_query_topic = str(self.get_parameter("room_type_query_topic").value)
+ room_type_answer_topic = str(self.get_parameter("room_type_answer_topic").value)
+
+ self.create_subscription(ObjectType, object_type_query_topic, self._object_type_cb, 50)
+ self.create_subscription(
+ TargetObjectWithSpatial,
+ target_object_spatial_query_topic,
+ self._target_object_spatial_cb,
+ 10,
+ )
+ self.create_subscription(String, instruction_topic, self._instruction_cb, 10)
+ self.create_subscription(RoomType, room_type_query_topic, self._room_type_cb, 10)
+
+ self._object_type_pub = self.create_publisher(ObjectType, object_type_answer_topic, 50)
+ self._target_object_pub = self.create_publisher(
+ TargetObjectWithSpatial, target_object_answer_topic, 10
+ )
+ self._target_object_instruction_pub = self.create_publisher(
+ TargetObjectInstruction, target_object_instruction_topic, 10
+ )
+ self._room_type_pub = self.create_publisher(RoomType, room_type_answer_topic, 10)
+
+ self._executor = ThreadPoolExecutor(max_workers=4)
+ self.create_timer(0.1, self._process_queues)
+
+ self._room_type_labels: dict[int, RoomType] = {}
+ self._room_type_cache_s = float(self.get_parameter("room_type_cache_s").value)
+ self._room_type_last_query_ts: dict[int, float] = {}
+ self._last_log_times: dict[str, float] = {}
+ self.get_logger().debug(
+ f"VLM reasoning node ready (room_type_query={room_type_query_topic}, "
+ f"room_type_answer={room_type_answer_topic}, "
+ f"object_type_query={object_type_query_topic}, "
+ f"object_type_answer={object_type_answer_topic}, "
+ f"target_object_spatial_query={target_object_spatial_query_topic}, "
+ f"target_object_answer={target_object_answer_topic})"
+ )
+
+ def destroy_node(self) -> bool:
+ self._executor.shutdown(wait=False)
+ return super().destroy_node()
+
+ def _build_vlm(self) -> VlmModelBase:
+ provider = str(self.get_parameter("vlm_provider").value).lower()
+ model_name = str(self.get_parameter("vlm_model").value)
+ if provider == "qwen":
+ return QwenVlm(QwenConfig(model_name=model_name))
+ if provider == "moondream":
+ return MoondreamVlm(MoondreamConfig())
+ raise ValueError(f"Unknown VLM provider: {provider}")
+
+ def _load_object_labels(self) -> list[str]:
+ path = Path(str(self.get_parameter("objects_yaml_path").value))
+ if not path.exists():
+ self.get_logger().warning(f"objects.yaml not found: {path}")
+ return []
+ with path.open("r", encoding="utf-8") as handle:
+ data = yaml.safe_load(handle)
+ if isinstance(data, list):
+ return [str(item) for item in data]
+ return []
+
+ def _instruction_cb(self, msg: String) -> None:
+ self._instruction_queue.append(msg.data)
+
+ def _room_type_cb(self, msg: RoomType) -> None:
+ self._room_type_queries[msg.room_id] = msg
+
+ def _object_type_cb(self, msg: ObjectType) -> None:
+ self._object_type_queries[msg.object_id] = msg
+
+ def _target_object_spatial_cb(self, msg: TargetObjectWithSpatial) -> None:
+ self._spatial_queries[msg.object_id] = msg
+
+ def _process_queues(self) -> None:
+ if self._instruction_queue:
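+ # Only the newest instruction is handled; any older queued entries are dropped.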
+ instruction = self._instruction_queue.pop()
+ self._instruction_queue.clear()
+ self._submit_task("instruction", self._process_instruction, instruction)
+
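+ # The query dicts keep only the latest message per id, so repeated queries for the same id collapse into one task.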
+ if self._room_type_queries:
+ items = list(self._room_type_queries.values())
+ self._room_type_queries.clear()
+ for item in items:
+ self._submit_task("room_type", self._process_room_type, item)
+
+ if self._object_type_queries:
+ items = list(self._object_type_queries.values())
+ self._object_type_queries.clear()
+ for item in items:
+ self._submit_task("object_type", self._process_object_type, item)
+
+ if self._spatial_queries:
+ items = list(self._spatial_queries.values())
+ self._spatial_queries.clear()
+ for item in items:
+ self._submit_task("target_spatial", self._process_target_object_spatial, item)
+
+ def _process_instruction(self, instruction: str) -> None:
+ prompt = INSTRUCTION_DECOMPOSITION_PROMPT + f"\nInstruction: {instruction}"
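+ # query() takes an image argument, so a small blank frame stands in for this text-only prompt.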
+ blank = np.zeros((32, 32, 3), dtype=np.uint8)
+ response = self._vlm.query(blank, prompt)
+ parsed = parse_json_dict(response)
+ if not parsed:
+ self.get_logger().warning(f"Failed to parse instruction JSON: {response}")
+ return
+
+ state = InstructionState(
+ instruction=instruction,
+ target_object=str(parsed.get("target_object", "")).lower(),
+ room_condition=str(parsed.get("room_condition", "")).lower(),
+ spatial_condition=str(parsed.get("spatial_condition", "")).lower(),
+ attribute_condition=str(parsed.get("attribute_condition", "")).lower(),
+ anchor_object=str(parsed.get("anchor_object", "")).lower(),
+ attribute_condition_anchor=str(parsed.get("attribute_condition_anchor", "")).lower(),
+ )
+ with self._lock:
+ self._instruction_state = state
+
+ msg = TargetObjectInstruction()
+ msg.target_object = state.target_object
+ msg.room_condition = state.room_condition
+ msg.spatial_condition = state.spatial_condition
+ msg.attribute_condition = state.attribute_condition
+ msg.anchor_object = state.anchor_object
+ msg.attribute_condition_anchor = state.attribute_condition_anchor
+ self._target_object_instruction_pub.publish(msg)
+
+ def _process_room_type(self, msg: RoomType) -> None:
+ if not msg.image.data or not msg.room_mask.data:
+ self.get_logger().warning(f"Room type query missing image or mask: {msg.room_id}")
+ return
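+ # Serve recently answered rooms from the local cache instead of issuing another VLM query.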
+ cached = self._room_type_labels.get(msg.room_id)
+ if cached and self._room_type_cache_s > 0:
+ last_query = self._room_type_last_query_ts.get(msg.room_id, 0.0)
+ if time.time() - last_query < self._room_type_cache_s:
+ answer = RoomType()
+ answer.header = msg.header
+ answer.anchor_point = msg.anchor_point
+ answer.image = msg.image
+ answer.room_mask = msg.room_mask
+ answer.room_type = cached.room_type
+ answer.room_id = msg.room_id
+ answer.voxel_num = msg.voxel_num
+ answer.in_room = msg.in_room
+ self._room_type_pub.publish(answer)
+ with self._lock:
+ self._room_type_labels[answer.room_id] = answer
+ self._log_throttled(
+ "room_type_cache",
+ "Room type cached: room_id=%s type=%s",
+ answer.room_id,
+ answer.room_type,
+ interval_s=5.0,
+ )
+ return
+ image = self._bridge.imgmsg_to_cv2(msg.image, desired_encoding="bgr8")
+ mask = self._bridge.imgmsg_to_cv2(msg.room_mask, desired_encoding="mono8")
+ mosaic = self._build_room_type_mosaic(image, mask)
+ if self._dump_room_type_images:
+ timestamp_ms = int(time.time() * 1000)
+ base = self._room_type_dump_dir / f"room_{msg.room_id}_{timestamp_ms}"
+ mask_uint8 = mask.astype(np.uint8)
+ if mask_uint8.max() <= 1:
+ mask_uint8 = (mask_uint8 * 255).astype(np.uint8)
+ cv2.imwrite(str(base.parent / f"{base.name}_image.png"), image)
+ cv2.imwrite(str(base.parent / f"{base.name}_mask.png"), mask_uint8)
+ cv2.imwrite(
+ str(base.parent / f"{base.name}_mosaic.png"),
+ cv2.cvtColor(mosaic, cv2.COLOR_RGB2BGR),
+ )
+
+ room_types = list(self._room_types)
+ if not msg.in_room and "Corridor" in room_types:
+ room_types = [room for room in room_types if room != "Corridor"]
+ prompt = build_room_type_prompt(room_types)
+ start_time = time.time()
+ try:
+ response = self._vlm.query(mosaic, prompt)
+ except Exception as exc:
+ self.get_logger().error(f"Room type query failed: {exc}")
+ return
+ self._room_type_last_query_ts[msg.room_id] = time.time()
+ duration_s = time.time() - start_time
+ parsed = parse_json_dict(response)
+ if parsed:
+ room_type = str(parsed.get("room_type", "")).strip()
+ else:
+ room_type = response.strip().strip('"')
+ self._log_throttled(
+ "room_type_parse",
+ "Room type parse failed: room_id=%s response=%s",
+ msg.room_id,
+ self._preview_response(response),
+ interval_s=5.0,
+ )
+ answer = RoomType()
+ answer.header = msg.header
+ answer.anchor_point = msg.anchor_point
+ answer.image = msg.image
+ answer.room_mask = msg.room_mask
+ answer.room_type = room_type.lower()
+ answer.room_id = msg.room_id
+ answer.voxel_num = msg.voxel_num
+ answer.in_room = msg.in_room
+ self._room_type_pub.publish(answer)
+ with self._lock:
+ self._room_type_labels[answer.room_id] = answer
+ self._log_throttled(
+ "room_type_answer",
+ "Room type answer: room_id=%s type=%s (%.2fs)",
+ answer.room_id,
+ answer.room_type,
+ duration_s,
+ interval_s=2.0,
+ )
+
+ def _process_object_type(self, msg: ObjectType) -> None:
+ image = load_npy_image(msg.img_path)
+ if image is None:
+ self.get_logger().warning(f"Failed to load image: {msg.img_path}")
+ return
+ image = annotate_mask(image, msg.img_path)
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+ if self._dump_object_type_images:
+ timestamp_ms = int(time.time() * 1000)
+ dump_path = self._object_type_dump_dir / f"{msg.object_id}_{timestamp_ms}.png"
+ cv2.imwrite(str(dump_path), cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR))
+
+ labels = list(msg.labels) if msg.labels else self._objects
+ prompt = build_object_type_prompt(labels)
+ start_time = time.time()
+ try:
+ response = self._vlm.query(image_rgb, prompt)
+ except Exception as exc:
+ self.get_logger().error(f"Object type query failed: {exc}")
+ return
+ duration_s = time.time() - start_time
+ parsed = parse_json_dict(response)
+ if parsed:
+ final_label = str(parsed.get("label", labels[0] if labels else "unknown")).lower()
+ else:
+ final_label = str(labels[0] if labels else "unknown").lower()
+ self._log_throttled(
+ "object_type_parse",
+ "Object type parse failed: object_id=%s response=%s",
+ msg.object_id,
+ self._preview_response(response),
+ interval_s=5.0,
+ )
+
+ answer = ObjectType()
+ answer.object_id = msg.object_id
+ answer.img_path = msg.img_path
+ answer.final_label = final_label
+ answer.labels = msg.labels
+ self._object_type_pub.publish(answer)
+ self._log_throttled(
+ "object_type_answer",
+ "Object type answer: object_id=%s label=%s (%.2fs)",
+ answer.object_id,
+ answer.final_label,
+ duration_s,
+ interval_s=2.0,
+ )
+
+ @staticmethod
+ def _build_room_type_mosaic(image: np.ndarray, mask: np.ndarray) -> np.ndarray:
+ if image.ndim == 2:
+ image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
+ if mask.ndim == 3 and mask.shape[2] == 1:
+ mask = mask[:, :, 0]
+ if mask.ndim == 2:
+ mask_uint8 = mask.astype(np.uint8)
+ if mask_uint8.max() <= 1:
+ mask_uint8 = (mask_uint8 * 255).astype(np.uint8)
+ else:
+ mask_uint8 = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)
+
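+ # If the mask is essentially empty (< 0.5% non-zero pixels), send the plain image without a mask panel.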
+ nonzero_ratio = float(np.count_nonzero(mask_uint8)) / float(mask_uint8.size)
+ if nonzero_ratio < 0.005:
+ return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
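+ # Otherwise place the image and the room mask side by side so the VLM sees both appearance and extent.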
+ mask_bgr = cv2.cvtColor(mask_uint8, cv2.COLOR_GRAY2BGR)
+ target_h = image.shape[0]
+ if mask_bgr.shape[0] != target_h:
+ scale = target_h / mask_bgr.shape[0]
+ new_w = max(1, int(mask_bgr.shape[1] * scale))
+ mask_bgr = cv2.resize(mask_bgr, (new_w, target_h), interpolation=cv2.INTER_NEAREST)
+ mosaic = cv2.hconcat([image, mask_bgr])
+ return cv2.cvtColor(mosaic, cv2.COLOR_BGR2RGB)
+
+ def _process_target_object_spatial(self, msg: TargetObjectWithSpatial) -> None:
+ image = load_npy_image(msg.img_path)
+ if image is None:
+ self.get_logger().warning(f"Failed to load image: {msg.img_path}")
+ return
+ image = annotate_mask(image, msg.img_path)
+
+ prompt = self._build_target_spatial_prompt(msg)
+ mosaic = self._build_spatial_mosaic(image, msg)
+ try:
+ response = self._vlm.query(mosaic, prompt)
+ except Exception as exc:
+ self.get_logger().error(f"Target spatial query failed: {exc}")
+ return
+ parsed = parse_json_dict(response)
+ # Treat an unparseable VLM response as a negative answer rather than failing on a missing dict.
+ is_target = bool(parsed.get("is_target", False)) if parsed else False
+
+ answer = TargetObjectWithSpatial()
+ answer.header = msg.header
+ answer.object_id = msg.object_id
+ answer.object_label = msg.object_label
+ answer.img_path = msg.img_path
+ answer.viewpoint_ids = msg.viewpoint_ids
+ answer.room_label = msg.room_label
+ answer.is_target = is_target
+ answer.bbox3d = msg.bbox3d
+ self._target_object_pub.publish(answer)
+ self._log_throttled(
+ "target_spatial_answer",
+ "Target spatial answer: object_id=%s is_target=%s",
+ answer.object_id,
+ answer.is_target,
+ )
+
+ def _log_throttled(self, key: str, msg: str, *args: object, interval_s: float = 5.0) -> None:
+ now = time.time()
+ last = self._last_log_times.get(key, 0.0)
+ if now - last < interval_s:
+ return
+ self._last_log_times[key] = now
+ formatted = msg % args if args else msg
+ self.get_logger().debug(formatted)
+
+ @staticmethod
+ def _preview_response(response: object, limit: int = 160) -> str:
+ text = response if isinstance(response, str) else str(response)
+ text = text.replace("\n", " ").strip()
+ if len(text) > limit:
+ text = f"{text[:limit]}..."
+ return text
+
+ def _submit_task(self, name: str, func: callable, *args: object) -> None:
+ future = self._executor.submit(self._run_task, name, func, *args)
+ future.add_done_callback(self._task_done)
+
+ def _run_task(self, name: str, func: callable, *args: object) -> None:
+ try:
+ func(*args)
+ except Exception as exc:
+ self.get_logger().error(f"VLM task {name} failed: {exc}")
+
+ def _task_done(self, future) -> None:
+ _ = future
+
+ def _build_target_spatial_prompt(self, msg: TargetObjectWithSpatial) -> str:
+ with self._lock:
+ state = self._instruction_state
+ instruction = state.instruction or ""
+ description = f"Object: {msg.object_label}, Room: {msg.room_label or 'unknown'}"
+ detail = (
+ f"Target: {state.target_object}, Room Condition: {state.room_condition or 'none'}, "
+ f"Spatial Condition: {state.spatial_condition or 'none'}, "
+ f"Attribute Condition: {state.attribute_condition or 'none'}"
+ )
+ return f"{TARGET_OBJECT_SPATIAL_PROMPT}\n{instruction}\n{detail}\n{description}"
+
+ def _build_spatial_mosaic(self, image: np.ndarray, msg: TargetObjectWithSpatial) -> np.ndarray:
+ images = [image]
+ if len(msg.viewpoint_ids) > 0:
+ bbox3d = np.array([[p.x, p.y, p.z] for p in msg.bbox3d], dtype=float)
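+ # Project the object's 3D bounding box onto each equirectangular viewpoint image before adding it to the mosaic.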
+ for vp_id in list(msg.viewpoint_ids)[: self._max_viewpoints]:
+ vp_image = self._load_viewpoint_image(vp_id)
+ if vp_image is None:
+ continue
+ transform = self._load_viewpoint_transform(vp_id)
+ if transform is not None and self._camera_params is not None:
+ vp_image = project_bbox3d_equirect(
+ image=vp_image,
+ bbox3d=bbox3d,
+ body_to_world=transform,
+ sensor_to_camera=self._camera_params.sensor_to_camera,
+ image_width=self._camera_params.image_width,
+ image_height=self._camera_params.image_height,
+ hfov=self._camera_params.hfov,
+ vfov=self._camera_params.vfov,
+ )
+ images.append(vp_image)
+ return concat_images(images)
+
+ def _load_viewpoint_image(self, viewpoint_id: int) -> np.ndarray | None:
+ image_path = self._viewpoint_dir / f"viewpoint_{viewpoint_id}.png"
+ if not image_path.exists():
+ return None
+ return cv2.imread(str(image_path))
+
+ def _load_viewpoint_transform(self, viewpoint_id: int) -> np.ndarray | None:
+ transform_path = self._viewpoint_dir / f"viewpoint_{viewpoint_id}_transform.npy"
+ if not transform_path.exists():
+ return None
+ return np.load(str(transform_path))
+
+
+
+def main() -> None:
+ rclpy.init()
+ node = VlmReasoningNode()
+ try:
+ rclpy.spin(node)
+ finally:
+ node.destroy_node()
+ rclpy.shutdown()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/sensor_coverage/CMakeLists.txt b/sensor_coverage/CMakeLists.txt
new file mode 100644
index 0000000..ed2cab6
--- /dev/null
+++ b/sensor_coverage/CMakeLists.txt
@@ -0,0 +1,68 @@
+cmake_minimum_required(VERSION 3.10)
+project(sensor_coverage)
+
+find_package(ament_cmake REQUIRED)
+find_package(rclcpp REQUIRED)
+find_package(sensor_msgs REQUIRED)
+find_package(nav_msgs REQUIRED)
+find_package(geometry_msgs REQUIRED)
+find_package(visualization_msgs REQUIRED)
+find_package(std_msgs REQUIRED)
+find_package(tf2 REQUIRED)
+find_package(tf2_ros REQUIRED)
+find_package(cv_bridge REQUIRED)
+find_package(OpenCV REQUIRED)
+find_package(Eigen3 REQUIRED)
+find_package(PCL REQUIRED COMPONENTS common io filters kdtree segmentation features)
+find_package(pcl_conversions REQUIRED)
+find_package(semantic_mapping REQUIRED)
+
+add_executable(room_segmentation_node
+ src/room_segmentation/room_segmentation.cpp
+ src/room_segmentation/room_node_rep.cpp
+ src/utils/misc_utils.cpp
+)
+
+target_include_directories(room_segmentation_node
+ PUBLIC
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+ $<INSTALL_INTERFACE:include>
+ ${PCL_INCLUDE_DIRS}
+)
+
+target_link_libraries(room_segmentation_node
+ ${PCL_LIBRARIES}
+ ${OpenCV_LIBRARIES}
+)
+
+ament_target_dependencies(room_segmentation_node
+ rclcpp
+ sensor_msgs
+ nav_msgs
+ geometry_msgs
+ visualization_msgs
+ std_msgs
+ tf2
+ tf2_ros
+ cv_bridge
+ pcl_conversions
+ semantic_mapping
+)
+
+install(TARGETS room_segmentation_node
+ DESTINATION lib/${PROJECT_NAME}
+)
+
+install(DIRECTORY include/
+ DESTINATION include
+)
+
+install(DIRECTORY config
+ DESTINATION share/${PROJECT_NAME}
+)
+
+install(FILES resource/${PROJECT_NAME}
+ DESTINATION share/${PROJECT_NAME}/resource
+)
+
+ament_package()
diff --git a/sensor_coverage/config/room_segmentation_config.yaml b/sensor_coverage/config/room_segmentation_config.yaml
new file mode 100644
index 0000000..89c4ba4
--- /dev/null
+++ b/sensor_coverage/config/room_segmentation_config.yaml
@@ -0,0 +1,24 @@
+room_segmentation_node:
+ ros__parameters:
+ exploredAreaDisplayInterval: 1
+ exploredAreaVoxelSize: 0.1
+ dilation_iteration: 3
+ room_resolution: 0.1
+ room_x: 3000
+ room_y: 3000
+ room_z: 80
+ rolling_occupancy_grid.resolution_x: 0.2
+ ceilingHeight_: 2.3
+ wall_thres_height_: 0.1
+ outward_distance_0: 0.5
+ outward_distance_1: 0.3
+ distance_threshold: 2.0
+ distance_angel_threshold: 0.3
+ angle_threshold_deg: 6.0
+ region_growing_radius: 15.0
+ min_room_size: 40
+ normal_search_num: 50
+ normal_search_radius: 0.5
+ kViewPointCollisionMarginZPlus: 0.75
+ kViewPointCollisionMarginZMinus: 0.4
+ isDebug: false
diff --git a/sensor_coverage/include/sensor_coverage/room_segmentation/room_node_rep.h b/sensor_coverage/include/sensor_coverage/room_segmentation/room_node_rep.h
new file mode 100644
index 0000000..3613017
--- /dev/null
+++ b/sensor_coverage/include/sensor_coverage/room_segmentation/room_node_rep.h
@@ -0,0 +1,113 @@
+/**
+ * @file room_node_rep.h
+ * @author Haokun Zhu (haokunz@andrew.cmu.edu)
+ * @brief Room node representation for room segmentation
+ * @version 0.1
+ * @date 2026-01-04
+ *
+ * @copyright Copyright (c) 2026
+ *
+ */
+
+#pragma once
+
+#include
+#include