diff --git a/.gitignore b/.gitignore index ccd784f..4a54642 100644 --- a/.gitignore +++ b/.gitignore @@ -209,3 +209,12 @@ cython_debug/ marimo/_static/ marimo/_lsp/ __marimo__/ + +# model files +*.pt +*.pth +*.onnx +*.engine +*.ts + +output/ \ No newline at end of file diff --git a/README.md b/README.md index 49392b8..33bff36 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,8 @@ ROS2 perception stack for generalist robotics. ## Packages - **track_anything** - EdgeTAM tracking + 3D segmentation with RGBD +- **detect_anything** - YOLO-E detection node and utilities +- **semantic_mapping** - semantic 3D mapping with VLM query hooks - **vector_perception_utils** - Image and point cloud utilities ## Installation @@ -18,9 +20,16 @@ source ~/vector_venv/bin/activate cd /home/alex-lin/dev/vector_perception_ros pip install -r requirements.txt +sudo apt-get install ros-dev-tools ros-jazzy-pcl-ros libpcl-dev git cmake libgoogle-glog-dev libgflags-dev libatlas-base-dev libeigen3-dev libsuitesparse-dev + +sudo apt install -y nlohmann-json3-dev +sudo apt install ros-jazzy-backward-ros + + # Build source /opt/ros/jazzy/setup.bash colcon build +``` @@ -110,4 +119,5 @@ for det in detections: See package READMEs: - [track_anything/README.md](track_anything/README.md) +- [semantic_mapping/README.md](semantic_mapping/README.md) - [vector_perception_utils/README.md](vector_perception_utils/README.md) diff --git a/detect_anything/CMakeLists.txt b/detect_anything/CMakeLists.txt new file mode 100644 index 0000000..ea8c416 --- /dev/null +++ b/detect_anything/CMakeLists.txt @@ -0,0 +1,54 @@ +cmake_minimum_required(VERSION 3.10) +project(detect_anything) + +find_package(ament_cmake REQUIRED) +find_package(rclpy REQUIRED) +find_package(sensor_msgs REQUIRED) +find_package(std_msgs REQUIRED) +find_package(vision_msgs REQUIRED) +find_package(cv_bridge REQUIRED) +find_package(rosidl_default_generators REQUIRED) +find_package(Python3 REQUIRED COMPONENTS Interpreter) + +rosidl_generate_interfaces(${PROJECT_NAME} + "msg/DetectionResult.msg" + DEPENDENCIES std_msgs sensor_msgs +) + +set(PYTHON_INSTALL_DIR "lib/python${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}/site-packages") + +install( + DIRECTORY detect_anything + DESTINATION ${PYTHON_INSTALL_DIR} +) + +install( + PROGRAMS + scripts/detection_node + DESTINATION lib/${PROJECT_NAME} +) + +install( + FILES resource/detect_anything + DESTINATION share/${PROJECT_NAME}/resource +) + +install( + DIRECTORY config + DESTINATION share/${PROJECT_NAME} +) + +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/models") + install( + DIRECTORY models + DESTINATION share/${PROJECT_NAME} + ) +endif() + +set(ENV_HOOK "${CMAKE_CURRENT_SOURCE_DIR}/env-hooks/venv_pythonpath.sh.in") +set(ENV_HOOK_OUT "${CMAKE_CURRENT_BINARY_DIR}/ament_cmake_environment_hooks/venv_pythonpath.sh") +configure_file(${ENV_HOOK} ${ENV_HOOK_OUT} @ONLY) +ament_environment_hooks(${ENV_HOOK_OUT}) + +ament_export_dependencies(rosidl_default_runtime) +ament_package() diff --git a/detect_anything/README.md b/detect_anything/README.md new file mode 100644 index 0000000..1e94a06 --- /dev/null +++ b/detect_anything/README.md @@ -0,0 +1,28 @@ +# detect_anything + +YOLO-E detection node that publishes `DetectionResult` with cropped masks and an annotated overlay topic. + +## What’s inside +- `detect_anything/detection.py`: detection results container and Ultralytics parsing helpers. +- `detect_anything/yoloe.py`: YOLO-E wrapper with prompt support and basic filtering. 
+- `detect_anything/detection_node.py`: ROS2 node wiring the detector to `DetectionResult`. +- `msg/DetectionResult.msg`: compressed image + cropped mask array. + +## Quick start +```bash +source ~/vector_venv/bin/activate +source /opt/ros/jazzy/setup.bash +colcon build --packages-select detect_anything +source install/setup.bash + +ros2 run detect_anything detection_node \ + --ros-args -p model_path:=/path/to/yoloe/models \ + -p model_name:=yoloe-11s-seg-pf.pt \ + -p conf:=0.6 \ + -p max_area_ratio:=0.3 \ + -p image_topic:=/camera/image +``` + +Topics: +- Publishes `/detection_result` (`detect_anything/DetectionResult`) and `/annotated_image_detection` (`sensor_msgs/Image`). +- Subscribes to `/camera/image` (or `/camera/image/compressed` if `use_compressed:=true`). diff --git a/detect_anything/config/objects.yaml b/detect_anything/config/objects.yaml new file mode 100644 index 0000000..371f186 --- /dev/null +++ b/detect_anything/config/objects.yaml @@ -0,0 +1,46 @@ +# Simple list of object names for prompting YOLO-E +- chair +- desk +- tv_monitor +- sofa +- unknown +- printer +- coffee machine +- refrigerator +- trash can +- shoe +- sink +- table +- oven +- bed +- painting +- bulletin board +- plant +- vase +- cabinet +- shelf +- book +- cup +- sculpture +- keyboard +- mouse +- clock +- phone +- toilet +- bathtub +- microwave oven +- pan +- suitcase +- light +- curtain +- whiteboard +- shower knob +- bottle +- water dispenser +- vending machine +- laptop +- bag +- locker +- picture +- cardboard +- extinguisher diff --git a/detect_anything/detect_anything/__init__.py b/detect_anything/detect_anything/__init__.py new file mode 100644 index 0000000..9c61aab --- /dev/null +++ b/detect_anything/detect_anything/__init__.py @@ -0,0 +1 @@ +"""detect_anything package.""" diff --git a/detect_anything/detect_anything/detection.py b/detect_anything/detect_anything/detection.py new file mode 100644 index 0000000..1d64d76 --- /dev/null +++ b/detect_anything/detect_anything/detection.py @@ -0,0 +1,256 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import List, Optional, Sequence + +from cv_bridge import CvBridge +import cv2 +import numpy as np + +from detect_anything.msg import DetectionResult + +_BRIDGE = CvBridge() + + +@dataclass +class SingleDetectionResult: + """Simple container for a YOLO/Ultralytics detection with a segmentation mask.""" + + bbox: tuple[float, float, float, float] + track_id: int + class_id: int + label: str + confidence: float + mask: np.ndarray + + def area(self) -> float: + x1, y1, x2, y2 = self.bbox + return max(0.0, (x2 - x1)) * max(0.0, (y2 - y1)) + + +@dataclass +class DetectionResults: + """Container for detections plus the source image.""" + + image: np.ndarray + detections: List[SingleDetectionResult] + + def overlay(self, alpha: float = 0.4, thickness: int = 2) -> np.ndarray: + """Draw bboxes and masks on the image.""" + output = self.image.copy() + for det in self.detections: + rng = np.random.default_rng(det.track_id) + color = tuple(int(c) for c in rng.integers(50, 255, size=3)) + + if det.mask.size: + mask_bool = det.mask.astype(bool) + if mask_bool.shape[:2] == output.shape[:2]: + colored_mask = np.zeros_like(output) + colored_mask[mask_bool] = color + output[mask_bool] = cv2.addWeighted( + output[mask_bool], 1 - alpha, colored_mask[mask_bool], alpha, 0 + ) + contours, _ = cv2.findContours(det.mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(output, contours, -1, color, thickness) + + x1, y1, x2, y2 = 
map(int, det.bbox) + cv2.rectangle(output, (x1, y1), (x2, y2), color, thickness) + + label_text = f"{det.label} {det.confidence:.2f}" + label_text = f"{label_text} | id {det.track_id}" + + (text_w, text_h), baseline = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) + cv2.rectangle( + output, + (x1, max(0, y1 - text_h - baseline - 2)), + (x1 + text_w, max(0, y1)), + color, + -1, + ) + cv2.putText( + output, + label_text, + (x1, max(0, y1 - baseline - 2)), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 255, 255), + 1, + cv2.LINE_AA, + ) + return output + + def to_msg(self, header) -> DetectionResult: + """ + Convert detections into a DetectionResult ROS message. + """ + msg = DetectionResult() + msg.header = header + msg.image = _BRIDGE.cv2_to_compressed_imgmsg(self.image, dst_format="jpeg") + msg.image.header = header + + for det in self.detections: + msg.track_id.append(int(det.track_id)) + msg.x1.append(float(det.bbox[0])) + msg.y1.append(float(det.bbox[1])) + msg.x2.append(float(det.bbox[2])) + msg.y2.append(float(det.bbox[3])) + msg.label.append(str(det.label)) + msg.confidence.append(float(det.confidence)) + + mask = det.mask + # Crop mask to bbox and align origin to bbox top-left for transport. + x1, y1, x2, y2 = [int(v) for v in det.bbox] + h, w = self.image.shape[:2] + x1 = max(0, min(x1, w)) + x2 = max(0, min(x2, w)) + y1 = max(0, min(y1, h)) + y2 = max(0, min(y2, h)) + crop_h = max(1, y2 - y1) + crop_w = max(1, x2 - x1) + if mask.size == 0: + mask = np.zeros((crop_h, crop_w), dtype=np.uint8) + else: + mask = mask[y1:y2, x1:x2] + if mask.shape != (crop_h, crop_w): + mask = cv2.resize( + mask.astype(np.uint8), + (crop_w, crop_h), + interpolation=cv2.INTER_NEAREST, + ) + + mask_msg = _BRIDGE.cv2_to_compressed_imgmsg(mask, dst_format="png") + mask_msg.header = header + msg.masks.append(mask_msg) + + return msg + + @classmethod + def from_msg(cls, msg: DetectionResult) -> DetectionResults: + """ + Construct DetectionResults from a DetectionResult ROS message. + Masks are converted back to full-size overlays at bbox locations. 
+ """ + image = _BRIDGE.compressed_imgmsg_to_cv2(msg.image) + detections: List[SingleDetectionResult] = [] + + for i in range(len(msg.track_id)): + bbox = ( + float(msg.x1[i]), + float(msg.y1[i]), + float(msg.x2[i]), + float(msg.y2[i]), + ) + mask_img = _BRIDGE.compressed_imgmsg_to_cv2(msg.masks[i], desired_encoding="mono8") + + full_mask = np.zeros(image.shape[:2], dtype=np.uint8) + x1, y1, x2, y2 = [int(v) for v in bbox] + h, w = image.shape[:2] + x1 = max(0, min(x1, w)) + x2 = max(0, min(x2, w)) + y1 = max(0, min(y1, h)) + y2 = max(0, min(y2, h)) + crop_h = max(1, y2 - y1) + crop_w = max(1, x2 - x1) + if mask_img.shape[:2] != (crop_h, crop_w): + mask_img = cv2.resize(mask_img, (crop_w, crop_h), interpolation=cv2.INTER_NEAREST) + full_mask[y1:y2, x1:x2] = mask_img + + detections.append( + SingleDetectionResult( + bbox=bbox, + track_id=int(msg.track_id[i]), + class_id=0, # class_id is not transmitted in DetectionResult + label=str(msg.label[i]), + confidence=float(msg.confidence[i]), + mask=full_mask, + ) + ) + + return cls(image=image, detections=detections) + + +def _from_ultralytics_result( + result, + idx: int, + image_shape: Optional[tuple[int, int]] = None, + names: Optional[Sequence[str]] = None, +) -> SingleDetectionResult: + """Convert a single Ultralytics Results entry into a SingleDetectionResult.""" + from ultralytics.engine.results import Results # local import + + if not isinstance(result, Results): + raise TypeError("result must be an ultralytics Results object") + if result.boxes is None: + raise ValueError("Result has no boxes") + + bbox_array = result.boxes.xyxy[idx].cpu().numpy() + bbox = ( + float(bbox_array[0]), + float(bbox_array[1]), + float(bbox_array[2]), + float(bbox_array[3]), + ) + + confidence = float(result.boxes.conf[idx].cpu()) + class_id = int(result.boxes.cls[idx].cpu()) + + resolved_names = names + if resolved_names is None and hasattr(result, "names"): + resolved_names = result.names # type: ignore[assignment] + + label = f"class_{class_id}" + if resolved_names is not None: + if isinstance(resolved_names, dict): + label = resolved_names.get(class_id, label) + elif isinstance(resolved_names, (list, tuple)) and class_id < len(resolved_names): + label = resolved_names[class_id] + + track_id = idx + if hasattr(result.boxes, "id") and result.boxes.id is not None: + track_id = int(result.boxes.id[idx].cpu()) + + target_h, target_w = (image_shape or (None, None)) + if target_h is None or target_w is None: + if hasattr(result, "orig_shape") and result.orig_shape: + target_h, target_w = result.orig_shape[:2] + + mask = np.zeros((target_h, target_w), dtype=np.uint8) if target_h and target_w else np.zeros((0, 0), dtype=np.uint8) + if result.masks is not None and idx < len(result.masks.data): + mask_tensor = result.masks.data[idx] + mask_np = mask_tensor.cpu().numpy() + + if mask_np.ndim == 3: + mask_np = mask_np.squeeze() + + if target_h and target_w and mask_np.shape != (target_h, target_w): + mask_np = cv2.resize(mask_np.astype(np.float32), (target_w, target_h), interpolation=cv2.INTER_LINEAR) + + mask = (mask_np > 0.5).astype(np.uint8) * 255 # type: ignore[assignment] + + return SingleDetectionResult( + bbox=bbox, + track_id=track_id, + class_id=class_id, + label=label, + confidence=confidence, + mask=mask, + ) + + +def from_ultralytics_results( + results, + image_shape: Optional[tuple[int, int]] = None, + names: Optional[Sequence[str]] = None, +) -> List[SingleDetectionResult]: + """Convert a list of Ultralytics Results objects into detections with masks.""" + 
detections: List[SingleDetectionResult] = [] + for result in results: + if result.boxes is None: + continue + num_detections = len(result.boxes.xyxy) + for i in range(num_detections): + detections.append(_from_ultralytics_result(result, i, image_shape=image_shape, names=names)) + return detections + + +__all__ = ["DetectionResults", "SingleDetectionResult", "from_ultralytics_results"] diff --git a/detect_anything/detect_anything/detection_node.py b/detect_anything/detect_anything/detection_node.py new file mode 100644 index 0000000..7aa0870 --- /dev/null +++ b/detect_anything/detect_anything/detection_node.py @@ -0,0 +1,177 @@ +from __future__ import annotations + +from pathlib import Path +from typing import List, Optional + +import yaml +import rclpy +from cv_bridge import CvBridge +from ament_index_python.packages import get_package_share_directory +from sensor_msgs.msg import CompressedImage, Image +from rclpy.node import Node +from rclpy.qos import DurabilityPolicy, QoSProfile, ReliabilityPolicy + +from detect_anything.detection import DetectionResults +from detect_anything.yoloe import Yoloe2DDetector, YoloePromptMode +from detect_anything.msg import DetectionResult + + +class DetectionNode(Node): + """ROS2 node that runs YOLO-E detection and publishes DetectionResult.""" + + def __init__(self) -> None: + super().__init__("detect_anything_detection_node") + self.bridge = CvBridge() + + # Parameters + defaults = self._default_params() + self.declare_parameter("image_topic", defaults["image_topic"]) + self.declare_parameter("compressed_image_topic", defaults["compressed_image_topic"]) + self.declare_parameter("use_compressed", defaults["use_compressed"]) + self.declare_parameter("model_path", defaults["model_path"]) + self.declare_parameter("model_name", defaults["model_name"]) + self.declare_parameter("device", defaults["device"]) + self.declare_parameter("prompt_mode", defaults["prompt_mode"]) + self.declare_parameter("objects_file", defaults["objects_file"]) + self.declare_parameter("conf", defaults["conf"]) + self.declare_parameter("max_area_ratio", defaults["max_area_ratio"]) + + params = self._default_params() + image_topic = str(self.get_parameter("image_topic").value) + compressed_image_topic = str(self.get_parameter("compressed_image_topic").value) + self.use_compressed = bool(self.get_parameter("use_compressed").value) + + base_share = Path(get_package_share_directory("detect_anything")) + model_path_val = self.get_parameter("model_path").value + model_path = str(model_path_val) if model_path_val not in (None, "") else params["model_path"] + if str(model_path).lower() == "none": + model_path = params["model_path"] + model_path = Path(model_path).expanduser() + if not model_path.is_absolute(): + model_path = (base_share / model_path).resolve() + + model_name = str(self.get_parameter("model_name").value or params["model_name"]) + device = str(self.get_parameter("device").value or params["device"]) + prompt_mode_str = str(self.get_parameter("prompt_mode").value or params["prompt_mode"]).lower() + objects_file = Path(str(self.get_parameter("objects_file").value or params["objects_file"])) + if not objects_file.is_absolute(): + objects_file = (base_share / objects_file).resolve() + conf = float(self.get_parameter("conf").value) + max_area_ratio = float(self.get_parameter("max_area_ratio").value) + + prompt_mode = YoloePromptMode(prompt_mode_str) + + # Auto-pick the right default weight if user didn't override + if prompt_mode == YoloePromptMode.LRPC and ("seg.pt" in model_name and 
"seg-pf.pt" not in model_name): + model_name = "yoloe-11l-seg-pf.pt" + self.get_logger().info(f"Prompt-free mode: switching model_name to {model_name}") + if prompt_mode == YoloePromptMode.PROMPT and "seg-pf.pt" in model_name: + model_name = "yoloe-11l-seg.pt" + self.get_logger().info(f"Prompt mode: switching model_name to {model_name}") + + self.detector = Yoloe2DDetector( + model_path=str(model_path), + model_name=model_name, + device=device or None, + prompt_mode=prompt_mode, + max_area_ratio=max_area_ratio, + conf=conf, + ) + + prompts = self._load_prompts(objects_file) + if prompts and prompt_mode == YoloePromptMode.PROMPT: + self.detector.set_prompts(text=prompts) + self.get_logger().info(f"Loaded {len(prompts)} prompts from {objects_file}") + + qos = QoSProfile( + depth=10, + reliability=ReliabilityPolicy.RELIABLE, + durability=DurabilityPolicy.VOLATILE, + ) + if self.use_compressed: + self.create_subscription( + CompressedImage, + compressed_image_topic, + self._compressed_image_callback, + qos, + ) + else: + self.create_subscription( + Image, + image_topic, + self._image_callback, + qos, + ) + + self.detection_pub = self.create_publisher(DetectionResult, "/detection_result", qos) + self.overlay_pub = self.create_publisher(Image, "/annotated_image_detection", qos) + + self.get_logger().info("detect_anything detection node started") + + def _default_params(self) -> dict: + base_share = Path(get_package_share_directory("detect_anything")) + model_dir = Path.home() / ".ultralytics" / "weights" + return { + "image_topic": "/camera/image", + "compressed_image_topic": "/camera/image/compressed", + "use_compressed": True, + "model_path": str(model_dir), + "model_name": "yoloe-11l-seg.pt", + "device": "cuda", + "prompt_mode": YoloePromptMode.PROMPT.value, + "objects_file": str(base_share / "config" / "objects.yaml"), + "conf": 0.25, + "max_area_ratio": 0.3, + } + + def _load_prompts(self, objects_file: Path) -> List[str]: + if not objects_file.exists(): + return [] + try: + with open(objects_file, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) or [] + if isinstance(data, list): + return [str(p) for p in data] + except Exception as exc: + self.get_logger().warn(f"Failed to read prompts from {objects_file}: {exc}") + return [] + + def _image_callback(self, msg: Image) -> None: + cv_image = self.bridge.imgmsg_to_cv2(msg, desired_encoding="bgr8") + self._process_image(cv_image, msg.header) + + def _compressed_image_callback(self, msg: CompressedImage) -> None: + cv_image = self.bridge.compressed_imgmsg_to_cv2(msg, desired_encoding="bgr8") + self._process_image(cv_image, msg.header) + + def _process_image(self, cv_image, header) -> None: + detections: DetectionResults = self.detector.process_image(cv_image) + + detection_msg = detections.to_msg(header) + self.detection_pub.publish(detection_msg) + + overlay = detections.overlay() + overlay_msg = self.bridge.cv2_to_imgmsg(overlay, encoding="bgr8") + overlay_msg.header = header + self.overlay_pub.publish(overlay_msg) + + def destroy_node(self) -> bool: + if hasattr(self, "detector") and self.detector: + self.detector.stop() + return super().destroy_node() + + +def main(args: Optional[list[str]] = None) -> None: + rclpy.init(args=args) + node = DetectionNode() + try: + rclpy.spin(node) + except KeyboardInterrupt: + pass + finally: + node.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/detect_anything/detect_anything/yoloe.py b/detect_anything/detect_anything/yoloe.py new file mode 100644 index 
0000000..056387a --- /dev/null +++ b/detect_anything/detect_anything/yoloe.py @@ -0,0 +1,157 @@ +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +import threading +import zipfile +from typing import Any, List, Optional, Sequence + +import numpy as np +import torch +from ultralytics import YOLOE # type: ignore[attr-defined, import-not-found] +from ultralytics.utils import SETTINGS +from ultralytics.utils.downloads import attempt_download_asset + +from detect_anything.detection import ( + DetectionResults, + SingleDetectionResult, + from_ultralytics_results, +) + + +class YoloePromptMode(Enum): + """YOLO-E prompt modes.""" + + LRPC = "lrpc" + PROMPT = "prompt" + + +class Yoloe2DDetector: + """Thin wrapper around YOLO-E for 2D detection + tracking with masks.""" + + def __init__( + self, + model_path: str | None = None, + model_name: str | None = None, + device: str | None = None, + prompt_mode: YoloePromptMode = YoloePromptMode.LRPC, + max_area_ratio: Optional[float] = 0.3, + conf: float = 0.6, + iou: float = 0.6, + ) -> None: + if model_name is None: + model_name = "yoloe-11s-seg-pf.pt" if prompt_mode == YoloePromptMode.LRPC else "yoloe-11s-seg.pt" + + model_dir = Path(model_path) if model_path else Path.home() / ".ultralytics" / "weights" + model_dir = model_dir.expanduser().resolve() + model_dir.mkdir(parents=True, exist_ok=True) + SETTINGS["weights_dir"] = str(model_dir) + weights_path = attempt_download_asset(model_dir / model_name) + if prompt_mode == YoloePromptMode.PROMPT: + self._ensure_mobileclip(model_dir) + self.model = YOLOE(weights_path) + self.prompt_mode = prompt_mode + self._visual_prompts: dict[str, np.ndarray] | None = None + self.max_area_ratio = max_area_ratio + self._lock = threading.Lock() + self.conf = conf + self.iou = iou + self._names_override: Sequence[str] | None = None + + if prompt_mode == YoloePromptMode.PROMPT: + self.set_prompts(text=["nothing"]) + + if self.max_area_ratio is not None and not (0.0 < self.max_area_ratio <= 1.0): + raise ValueError("max_area_ratio must be in the range (0, 1].") + + if device: + self.device = device + elif torch.cuda.is_available(): + self.device = "cuda" + else: + self.device = "cpu" + + def set_prompts( + self, + text: Optional[List[str]] = None, + bboxes: Optional[np.ndarray] = None, + ) -> None: + """Set prompts for detection. Provide either text or bboxes, not both.""" + if self.prompt_mode == YoloePromptMode.LRPC: + # Prompt-free model cannot accept text/visual prompts. 
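+            # The prompt-free weights rely on their built-in vocabulary, so this
+            # call is a silent no-op rather than an error.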
+ return + if text is not None and bboxes is not None: + raise ValueError("Provide either text or bboxes, not both.") + if text is None and bboxes is None: + raise ValueError("Must provide either text or bboxes.") + + with self._lock: + self.model.predictor = None + if text is not None: + self.model.set_classes(text, self.model.get_text_pe(text)) # type: ignore[no-untyped-call] + self._visual_prompts = None + self._names_override = list(text) + else: + cls = np.arange(len(bboxes), dtype=np.int16) # type: ignore[arg-type] + self._visual_prompts = {"bboxes": bboxes, "cls": cls} # type: ignore[dict-item] + self._names_override = None + + def process_image(self, image: np.ndarray) -> DetectionResults: + """Process an image and return detection results.""" + track_kwargs: dict[str, Any] = { + "source": image, + "device": self.device, + "conf": self.conf, + "iou": self.iou, + "persist": True, + "verbose": False, + } + + with self._lock: + if self._visual_prompts is not None: + track_kwargs["visual_prompts"] = self._visual_prompts + + results = self.model.track(**track_kwargs) # type: ignore[arg-type] + + detections = from_ultralytics_results(results, image_shape=image.shape[:2], names=self._names_override) + filtered = self._apply_filters(image, detections) + return DetectionResults(image=image, detections=filtered) + + def _apply_filters( + self, + image: np.ndarray, + detections: List[SingleDetectionResult], + ) -> List[SingleDetectionResult]: + if self.max_area_ratio is None: + return detections + + image_area = float(image.shape[0] * image.shape[1]) + if image_area <= 0: + return detections + + return [det for det in detections if (det.area() / image_area) <= self.max_area_ratio] + + def stop(self) -> None: + """Clean up resources used by the detector.""" + if hasattr(self.model, "predictor") and self.model.predictor is not None: + predictor = self.model.predictor + if hasattr(predictor, "trackers") and predictor.trackers: + for tracker in predictor.trackers: + if hasattr(tracker, "tracker") and hasattr(tracker.tracker, "gmc"): + gmc = tracker.tracker.gmc + if hasattr(gmc, "executor") and gmc.executor is not None: + gmc.executor.shutdown(wait=True) + self.model.predictor = None + + @staticmethod + def _ensure_mobileclip(model_dir: Path) -> None: + target = model_dir / "mobileclip_blt.ts" + if target.exists() and zipfile.is_zipfile(target): + return + if target.exists() or target.is_symlink(): + target.unlink() + attempt_download_asset(target) + + +__all__ = ["Yoloe2DDetector", "YoloePromptMode"] diff --git a/detect_anything/env-hooks/venv_pythonpath.sh.in b/detect_anything/env-hooks/venv_pythonpath.sh.in new file mode 100644 index 0000000..9212bac --- /dev/null +++ b/detect_anything/env-hooks/venv_pythonpath.sh.in @@ -0,0 +1 @@ +ament_prepend_unique_value PYTHONPATH "@PYTHON_INSTALL_DIR@" diff --git a/detect_anything/msg/DetectionResult.msg b/detect_anything/msg/DetectionResult.msg new file mode 100644 index 0000000..766d5f1 --- /dev/null +++ b/detect_anything/msg/DetectionResult.msg @@ -0,0 +1,10 @@ +std_msgs/Header header +int32[] track_id +float32[] x1 +float32[] y1 +float32[] x2 +float32[] y2 +string[] label +float32[] confidence +sensor_msgs/CompressedImage image +sensor_msgs/CompressedImage[] masks # cropped to bbox; aligned to bbox top-left; encoding: mono8 diff --git a/detect_anything/package.xml b/detect_anything/package.xml new file mode 100644 index 0000000..a7699ae --- /dev/null +++ b/detect_anything/package.xml @@ -0,0 +1,26 @@ + + + + detect_anything + 0.0.1 + YOLO-E detection 
node and utilities. + Alex Lin + MIT + + ament_cmake + + rosidl_default_generators + rosidl_default_runtime + rosidl_interface_packages + + rclpy + std_msgs + sensor_msgs + vision_msgs + cv_bridge + vector_perception_utils + + + ament_cmake + + diff --git a/detect_anything/resource/detect_anything b/detect_anything/resource/detect_anything new file mode 100644 index 0000000..eac574f --- /dev/null +++ b/detect_anything/resource/detect_anything @@ -0,0 +1 @@ +detect_anything diff --git a/detect_anything/scripts/detection_node b/detect_anything/scripts/detection_node new file mode 100755 index 0000000..822cacf --- /dev/null +++ b/detect_anything/scripts/detection_node @@ -0,0 +1,5 @@ +#!/usr/bin/env python3 +from detect_anything.detection_node import main + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt index 7b6c110..573b311 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,21 @@ # basic dependencies numpy==1.26.4 opencv-python +colcon-common-extensions scipy pillow tqdm +pyyaml +lark +pytest +moondream +openai # 3D processing open3d -# deep learning -torch<=2.4.0 -torchvision<=1.19.0 +# vision +ultralytics # SAM2 and EdgeTAM dependencies sam2 diff --git a/rviz/semantic_mapping.rviz b/rviz/semantic_mapping.rviz new file mode 100644 index 0000000..86a2272 --- /dev/null +++ b/rviz/semantic_mapping.rviz @@ -0,0 +1,415 @@ +Panels: + - Class: rviz_common/Displays + Help Height: 0 + Name: Displays + Property Tree Widget: + Expanded: + - /Global Options1 + - /Status1 + - /RoomSegmentation1/room_boundaries1/Topic1 + - /RoomSegmentation1/room_type_viz1 + Splitter Ratio: 0.5 + Tree Height: 715 + - Class: rviz_common/Selection + Name: Selection + - Class: rviz_common/Tool Properties + Expanded: + - /2D Goal Pose1 + - /Publish Point1 + Name: Tool Properties + Splitter Ratio: 0.5886790156364441 + - Class: rviz_common/Views + Expanded: + - /Current View1 + Name: Views + Splitter Ratio: 0.5 + - Class: rviz_common/Time + Experimental: false + Name: Time + SyncMode: 0 + SyncSource: PointCloud2 +Visualization Manager: + Class: "" + Displays: + - Alpha: 0.5 + Cell Size: 1 + Class: rviz_default_plugins/Grid + Color: 160; 160; 164 + Enabled: true + Line Style: + Line Width: 0.029999999329447746 + Value: Lines + Name: Grid + Normal Cell Count: 0 + Offset: + X: 0 + Y: 0 + Z: 0 + Plane: XY + Plane Cell Count: 10 + Reference Frame: + Value: true + - Class: rviz_default_plugins/Image + Enabled: true + Max Value: 1 + Median window: 5 + Min Value: 0 + Name: Image + Normalize Range: true + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /annotated_image_detection + Value: true + - Angle Tolerance: 0.10000000149011612 + Class: rviz_default_plugins/Odometry + Covariance: + Orientation: + Alpha: 0.5 + Color: 255; 255; 127 + Color Style: Unique + Frame: Local + Offset: 1 + Scale: 1 + Value: true + Position: + Alpha: 0.30000001192092896 + Color: 204; 51; 204 + Scale: 1 + Value: true + Value: false + Enabled: true + Keep: 5 + Name: Odometry + Position Tolerance: 0.10000000149011612 + Shape: + Alpha: 1 + Axes Length: 1 + Axes Radius: 0.10000000149011612 + Color: 255; 25; 0 + Head Length: 0.30000001192092896 + Head Radius: 0.10000000149011612 + Shaft Length: 1 + Shaft Radius: 0.05000000074505806 + Value: Arrow + Topic: + Depth: 5 + Durability Policy: Volatile + Filter size: 10 + History Policy: Keep Last + Reliability Policy: Reliable + Value: /state_estimation + Value: true + - Alpha: 0.05000000074505806 + 
Autocompute Intensity Bounds: false + Autocompute Value Bounds: + Max Value: 10 + Min Value: -10 + Value: true + Axis: Z + Channel Name: intensity + Class: rviz_default_plugins/PointCloud2 + Color: 255; 255; 255 + Color Transformer: FlatColor + Decay Time: 5 + Enabled: true + Invert Rainbow: false + Max Color: 255; 255; 255 + Max Intensity: 161 + Min Color: 0; 0; 0 + Min Intensity: 0 + Name: PointCloud2 + Position Transformer: XYZ + Selectable: false + Size (Pixels): 3 + Size (m): 0.009999999776482582 + Style: Flat Squares + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /registered_scan + Use Fixed Frame: false + Use rainbow: true + Value: true + - Class: rviz_common/Group + Displays: + - Alpha: 1 + Autocompute Intensity Bounds: true + Autocompute Value Bounds: + Max Value: 0.7398463487625122 + Min Value: 0.7398463487625122 + Value: true + Axis: Z + Channel Name: intensity + Class: rviz_default_plugins/PointCloud2 + Color: 224; 27; 36 + Color Transformer: AxisColor + Decay Time: 0 + Enabled: true + Invert Rainbow: false + Max Color: 255; 255; 255 + Max Intensity: 4096 + Min Color: 0; 0; 0 + Min Intensity: 0 + Name: door_cloud + Position Transformer: XYZ + Selectable: true + Size (Pixels): 3 + Size (m): 0.10000000149011612 + Style: Flat Squares + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /door_cloud + Use Fixed Frame: true + Use rainbow: true + Value: true + - Class: rviz_default_plugins/MarkerArray + Enabled: true + Name: room_viewpoints + Namespaces: + viewpoint_room_ids: true + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /viewpoint_room_ids + Value: true + - Class: rviz_default_plugins/MarkerArray + Enabled: true + Name: room_boundaries + Namespaces: + polygon_group: true + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /room_boundaries + Value: true + - Class: rviz_default_plugins/Marker + Enabled: true + Name: chosen_room + Namespaces: + {} + Topic: + Depth: 5 + Durability Policy: Volatile + Filter size: 10 + History Policy: Keep Last + Reliability Policy: Reliable + Value: /chosen_room_boundary + Value: true + - Alpha: 0.800000011920929 + Autocompute Intensity Bounds: true + Autocompute Value Bounds: + Max Value: 10 + Min Value: -10 + Value: true + Axis: Z + Channel Name: intensity + Class: rviz_default_plugins/PointCloud2 + Color: 255; 255; 255 + Color Transformer: RGB8 + Decay Time: 0 + Enabled: true + Invert Rainbow: false + Max Color: 255; 255; 255 + Max Intensity: 4096 + Min Color: 0; 0; 0 + Min Intensity: 0 + Name: PointCloud2 + Position Transformer: XYZ + Selectable: true + Size (Pixels): 5 + Size (m): 0.019999999552965164 + Style: Flat Squares + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /room_map_cloud + Use Fixed Frame: true + Use rainbow: true + Value: true + - Class: rviz_default_plugins/MarkerArray + Enabled: true + Name: room_type_viz + Namespaces: + room_type: true + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /room_type_vis + Value: true + Enabled: true + Name: RoomSegmentation + - Class: rviz_common/Group + Displays: + - Alpha: 1 + Autocompute Intensity Bounds: true + Autocompute Value Bounds: + Max Value: 10 + Min Value: -10 + Value: true + 
Axis: Z + Channel Name: intensity + Class: rviz_default_plugins/PointCloud2 + Color: 255; 255; 255 + Color Transformer: RGB8 + Decay Time: 0 + Enabled: true + Invert Rainbow: false + Max Color: 255; 255; 255 + Max Intensity: 4096 + Min Color: 0; 0; 0 + Min Intensity: 0 + Name: object_points + Position Transformer: XYZ + Selectable: false + Size (Pixels): 3 + Size (m): 0.009999999776482582 + Style: Flat Squares + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /obj_points + Use Fixed Frame: true + Use rainbow: true + Value: true + - Class: rviz_default_plugins/MarkerArray + Enabled: true + Name: object_bboxes + Namespaces: + bbox: true + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /obj_boxes + Value: true + - Class: rviz_default_plugins/MarkerArray + Enabled: true + Name: object_labels + Namespaces: + labels: true + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /obj_labels + Value: true + - Class: rviz_default_plugins/MarkerArray + Enabled: false + Name: object_visibility + Namespaces: + {} + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /object_visibility_connections + Value: false + Enabled: true + Name: object_mapping + Enabled: true + Global Options: + Background Color: 48; 48; 48 + Fixed Frame: map + Frame Rate: 30 + Name: root + Tools: + - Class: rviz_default_plugins/Interact + Hide Inactive Objects: true + - Class: rviz_default_plugins/MoveCamera + - Class: rviz_default_plugins/Select + - Class: rviz_default_plugins/FocusCamera + - Class: rviz_default_plugins/Measure + Line color: 128; 128; 0 + - Class: rviz_default_plugins/SetInitialPose + Covariance x: 0.25 + Covariance y: 0.25 + Covariance yaw: 0.06853891909122467 + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /initialpose + - Class: rviz_default_plugins/SetGoal + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /goal_pose + - Class: rviz_default_plugins/PublishPoint + Single click: true + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /clicked_point + Transformation: + Current: + Class: rviz_default_plugins/TF + Value: true + Views: + Current: + Class: rviz_default_plugins/Orbit + Distance: 37.76976013183594 + Enable Stereo Rendering: + Stereo Eye Separation: 0.05999999865889549 + Stereo Focal Distance: 1 + Swap Stereo Eyes: false + Value: false + Focal Point: + X: 4.479973316192627 + Y: 11.129703521728516 + Z: -8.338903427124023 + Focal Shape Fixed Size: false + Focal Shape Size: 0.05000000074505806 + Invert Z Axis: false + Name: Current View + Near Clip Distance: 0.009999999776482582 + Pitch: 0.6847962737083435 + Target Frame: map + Value: Orbit (rviz_default_plugins) + Yaw: 3.7372934818267822 + Saved: ~ +Window Geometry: + Displays: + collapsed: false + Height: 2197 + Hide Left Dock: false + Hide Right Dock: true + Image: + collapsed: false + QMainWindow State: 
000000ff00000000fd00000004000000000000070c0000078bfc020000000bfb0000001200530065006c0065006300740069006f006e00000001e10000009b000000b200fffffffb0000001e0054006f006f006c002000500072006f007000650072007400690065007302000001ed000001df00000185000000a3fb000000120056006900650077007300200054006f006f02000001df000002110000018500000122fb000000200054006f006f006c002000500072006f0070006500720074006900650073003203000002880000011d000002210000017afb000000100044006900730070006c00610079007301000000700000033d0000018600fffffffb0000002000730065006c0065006300740069006f006e00200062007500660066006500720200000138000000aa0000023a00000294fb00000014005700690064006500530074006500720065006f02000000e6000000d2000003ee0000030bfb0000000c004b0069006e0065006300740200000186000001060000030c00000261fb0000000a0049006d00610067006501000002020000007e0000000000000000fb0000000a0049006d00610067006501000003b9000004420000002800fffffffb0000000a0049006d00610067006500000002bb000000c60000000000000000000000010000010f00000342fc0200000003fb0000001e0054006f006f006c002000500072006f00700065007200740069006500730100000041000000780000000000000000fb0000000a00560069006500770073000000003f000003420000013800fffffffb0000001200530065006c0065006300740069006f006e010000025a000000b200000000000000000000000200000490000000a9fc0100000001fb0000000a00560069006500770073030000004e00000080000002e10000019700000003000012140000005efc0100000002fb0000000800540069006d00650100000000000012140000047a00fffffffb0000000800540069006d0065010000000000000450000000000000000000000afc0000078b00000004000000040000000800000008fc0000000100000002000000010000000a0054006f006f006c00730100000000ffffffff0000000000000000 + Selection: + collapsed: false + Time: + collapsed: false + Tool Properties: + collapsed: false + Views: + collapsed: true + Width: 4628 + X: 4656 + Y: 418 diff --git a/semantic_mapping/CMakeLists.txt b/semantic_mapping/CMakeLists.txt new file mode 100644 index 0000000..71da435 --- /dev/null +++ b/semantic_mapping/CMakeLists.txt @@ -0,0 +1,62 @@ +cmake_minimum_required(VERSION 3.10) +project(semantic_mapping) + +find_package(ament_cmake REQUIRED) +find_package(rclpy REQUIRED) +find_package(std_msgs REQUIRED) +find_package(sensor_msgs REQUIRED) +find_package(geometry_msgs REQUIRED) +find_package(nav_msgs REQUIRED) +find_package(visualization_msgs REQUIRED) +find_package(cv_bridge REQUIRED) +find_package(rosidl_default_generators REQUIRED) +find_package(Python3 REQUIRED COMPONENTS Interpreter) + +rosidl_generate_interfaces(${PROJECT_NAME} + "msg/DetectionResult.msg" + "msg/NavigationQuery.msg" + "msg/ObjectNode.msg" + "msg/ObjectType.msg" + "msg/RoomEarlyStop1.msg" + "msg/RoomNode.msg" + "msg/RoomNodeList.msg" + "msg/RoomType.msg" + "msg/TargetObject.msg" + "msg/TargetObjectInstruction.msg" + "msg/TargetObjectWithSpatial.msg" + "msg/VlmAnswer.msg" + "msg/ViewpointRep.msg" + DEPENDENCIES std_msgs sensor_msgs geometry_msgs +) + +set(PYTHON_INSTALL_DIR "lib/python${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}/site-packages") + +install( + DIRECTORY semantic_mapping + DESTINATION ${PYTHON_INSTALL_DIR} +) + +install( + PROGRAMS + scripts/semantic_mapping_node + scripts/vlm_reasoning_node + DESTINATION lib/${PROJECT_NAME} +) + +install( + FILES resource/semantic_mapping + DESTINATION share/${PROJECT_NAME}/resource +) + +install( + DIRECTORY config + DESTINATION share/${PROJECT_NAME} +) + +install( + DIRECTORY launch + DESTINATION share/${PROJECT_NAME} +) + +ament_export_dependencies(rosidl_default_runtime) +ament_package() diff --git a/semantic_mapping/README.md 
b/semantic_mapping/README.md new file mode 100644 index 0000000..6663558 --- /dev/null +++ b/semantic_mapping/README.md @@ -0,0 +1,192 @@ +# Semantic Mapping + +ROS2 package for semantic 3D object mapping with VLM integration. +Includes `vlm_reasoning_node` (Moondream-backed) with unified +`semantic_mapping` message types across planner/VLM nodes. + +## Overview + +This package builds a persistent semantic map of objects detected by `detect_anything`, fusing 2D detections with LiDAR point clouds to create 3D object representations. It publishes `ObjectNode` messages compatible with the tare_planner exploration system. +The pipeline stores voxelized geometry, merges observations, and exports planner‑friendly ROS messages. + +## Status: Work in Progress + +| Component | File | Status | +|-----------|------|--------| +| Messages | `msg/*.msg` | Complete | +| Data Models | `map_object.py` | Complete | +| Matching | `utils/matching.py` | Complete | +| Object Mapper | `mapper.py` | Complete | +| Cloud Projection | `vector_perception_utils/pointcloud_utils.py` | Complete | +| Publishers | `utils/map_object_utils.py` | Complete | +| Image Storage | `utils/storage.py` | Complete | +| Mapping Node | `mapping_node.py` | Complete | +| VLM Reasoning Node | `vlm_reasoning_node.py` | Complete | +| Launch Files | `launch/` | Complete | + +## Messages + +### ObjectNode.msg +Main output message for planner integration: +- `object_id[]`: Track IDs (may contain multiple merged IDs) +- `label`: Dominant class label +- `position`: Object centroid in world frame +- `bbox3d[8]`: 3D bounding box corners +- `cloud`: Object point cloud +- `status`: Lifecycle status ("new", "updated", "unchanged") +- `img_path`: Path to best saved image crop +- `is_asked_vlm`: VLM query flag +- `viewpoint_id`: Associated viewpoint + +### ObjectType.msg +VLM query/answer for object relabeling. + +### RoomType.msg +VLM query/answer for room classification. + +### TargetObjectInstruction.msg +User instruction for target object search. + +### TargetObjectWithSpatial.msg +Target object query with spatial/viewpoint context. + +### ViewpointRep.msg +Viewpoint trigger for image saving. + +Note: `detect_anything/DetectionResult` masks are cropped to the bbox and aligned to +the bbox top-left (as documented in `detect_anything/msg/DetectionResult.msg`). +The mapping node reconstructs full-image masks before projection. + +## Architecture + +``` +detect_anything/DetectionResult + │ + ▼ +┌───────────────────────────────────────────────┐ +│ semantic_mapping_node │ +│ │ +│ ┌────────────────────┐ ┌──────────────┐ │ +│ │ pointcloud_utils │ │ ObjectMapper │ │ +│ │ (projection + mask)├──▶│ (pending/ │ │ +│ │ │ │ permanent) │ │ +│ └────────────────────┘ └──────────────┘ │ +│ │ │ │ +│ ▼ ▼ │ +│ ┌────────────────┐ ┌──────────────┐ │ +│ │ MapObject │ │ map_object_ │ │ +│ │ (voxels/votes) ├─▶│ utils │ │ +│ └────────────────┘ └──────────────┘ │ +│ │ │ │ +│ ▼ ▼ │ +│ utils/storage.py mapping_node.py │ +└───────────────────────────────────────────────┘ + │ + ▼ +/object_nodes, /obj_points, /obj_boxes, /obj_labels +``` + +Key components: +- `map_object.py`: Object state, voxel voting, pose de‑duplication, cached geometry. +- `utils/matching.py`: Track‑id and spatial matching with IoU/ratio heuristics. +- `mapper.py`: Pending/permanent tiers, merge rules, and pruning. +- `vector_perception_utils/pointcloud_utils.py`: Projection + mask segmentation utilities. +- `utils/map_object_utils.py`: ROS message construction helpers. 
+- `utils/storage.py`: Async image save queue. +- `mapping_node.py`: ROS2 orchestration (subscriptions, sync, publish). + +## Nodes + +| Node | Purpose | +|------|---------| +| `semantic_mapping_node` | Builds the 3D object map and publishes planner object topics. | +| `vlm_reasoning_node` | Answers room/object queries via Moondream and publishes VLM answers. | + +## Dependencies + +- `detect_anything`: Detection results with masks +- `vector_perception_utils`: Point cloud and image utilities +- ROS2 messages: `sensor_msgs`, `geometry_msgs`, `nav_msgs`, `visualization_msgs` + +## Usage + +```bash +# Build +colcon build --packages-select semantic_mapping + +# Run (after implementation complete) +ros2 launch semantic_mapping semantic_mapping.launch.py +``` + +## Configuration + +- `config/params.yaml`: Default node parameters for `semantic_mapping_node`. +- `config/camera_param.yaml`: Camera parameters + static sensor→camera transform for projection. +- `config/objects.yaml`: Object taxonomy for detection labels. + +Key runtime params: +- `regularize_voxel_size`: Connectivity voxel size used in shape regularization. +- `ground_filter`, `ground_radius`, `ground_z_clearance`: Simple ground clamp around robot. +- `visualization_max_objects`: Limits RViz markers/pointcloud to most recent objects. +- `object_type_query_mode`: `target_only`, `all`, or `target_or_all` for VLM queries. +- `object_type_query_max_per_cycle`: Max number of object queries per mapping cycle. + +## Topics + +### Subscriptions +| Topic | Type | Description | +|-------|------|-------------| +| `/detection_result` | `detect_anything/DetectionResult` | 2D detections with masks | +| `/registered_scan` | `sensor_msgs/PointCloud2` | LiDAR point cloud | +| `/state_estimation` | `nav_msgs/Odometry` | Robot odometry | +| `/object_type_answer` | `ObjectType` | VLM relabeling response | +| `/target_object_instruction` | `TargetObjectInstruction` | Target object search | +| `/viewpoint_rep_header` | `ViewpointRep` | Viewpoint trigger | + +### Publications +| Topic | Type | Description | +|-------|------|-------------| +| `/object_nodes` | `ObjectNode` | Semantic object map | +| `/obj_points` | `sensor_msgs/PointCloud2` | Colored object point cloud | +| `/obj_boxes` | `visualization_msgs/MarkerArray` | 3D bounding box wireframes | +| `/obj_labels` | `visualization_msgs/MarkerArray` | Object label text | +| `/object_type_query` | `ObjectType` | VLM query for relabeling | +| `/target_object_spatial_query` | `TargetObjectWithSpatial` | Target object + viewpoints | + +Note: planner/VLM nodes now share `semantic_mapping/msg/*` directly (no bridge). 
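+
+How a downstream consumer might read the map (a minimal sketch; the `ObjectNodeListener` node is hypothetical, and only the topic name and the `ObjectNode` fields listed above are taken from this package):
+
+```python
+import rclpy
+from rclpy.node import Node
+
+from semantic_mapping.msg import ObjectNode
+
+
+class ObjectNodeListener(Node):
+    """Minimal consumer of /object_nodes for logging and debugging."""
+
+    def __init__(self) -> None:
+        super().__init__("object_node_listener")
+        self.create_subscription(ObjectNode, "/object_nodes", self._on_object, 10)
+
+    def _on_object(self, msg: ObjectNode) -> None:
+        # One message per object; status is "new", "updated", or "unchanged".
+        p = msg.position
+        self.get_logger().info(f"{msg.label} ({msg.status}) at ({p.x:.2f}, {p.y:.2f}, {p.z:.2f})")
+
+
+def main() -> None:
+    rclpy.init()
+    rclpy.spin(ObjectNodeListener())
+    rclpy.shutdown()
+```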
+ +## VLM Reasoning Topics + +### Subscriptions +| Topic | Type | Description | +|-------|------|-------------| +| `/object_type_query` | `ObjectType` | Object relabeling query | +| `/target_object_spatial_query` | `TargetObjectWithSpatial` | Spatial target query | +| `/room_type_query` | `RoomType` | Room classification query | +| `/keyboard_input` | `std_msgs/String` | Instruction text | + +### Publications +| Topic | Type | Description | +|-------|------|-------------| +| `/object_type_answer` | `ObjectType` | Object relabeling answer | +| `/target_object_answer` | `TargetObjectWithSpatial` | Spatial target answer | +| `/target_object_instruction` | `TargetObjectInstruction` | Parsed instruction fields | +| `/room_type_answer` | `RoomType` | Room classification answer | +| `/room_type_vis` | `visualization_msgs/MarkerArray` | Room label markers | + +## References + +- Full migration plan: [`docs/SEMANTIC_MAPPING_MIGRATION.md`](docs/SEMANTIC_MAPPING_MIGRATION.md) +- Original implementation: `VLM_ROS/src/semantic_mapping/` (main branch) + - SAM2 for segmentation + - External Gemini API for VLM reasoning +- Detection: `detect_anything` package (decoupled) +- dimos patterns: `dimos/perception/detection/objectDB.py` + +## Handoff Notes + +- Regularization uses vote-weighted filtering + voxel connected components; adjust + `regularize_voxel_size` and the regularization percentile (in `MapObject`) if + thin structures are dropped. +- Mapping node logs summary stats and a single mapping loop time per cycle. +- Permanent objects persist; only pending objects time out via `pending_ttl_s`. diff --git a/semantic_mapping/config/camera_param.yaml b/semantic_mapping/config/camera_param.yaml new file mode 100644 index 0000000..8206adb --- /dev/null +++ b/semantic_mapping/config/camera_param.yaml @@ -0,0 +1,16 @@ +image_width: 1920 +image_height: 640 +hfov: 360.0 +vfov: 120.0 + +sensor_to_camera: + parent_frame: sensor + child_frame: camera_link + translation: + x: -0.12 + y: -0.075 + z: 0.265 + rotation_rpy: + roll: -1.5707963 + pitch: 0.0 + yaw: -1.5707963 diff --git a/semantic_mapping/config/params.yaml b/semantic_mapping/config/params.yaml new file mode 100644 index 0000000..f373fb6 --- /dev/null +++ b/semantic_mapping/config/params.yaml @@ -0,0 +1,57 @@ +semantic_mapping_node: + ros__parameters: + detection_topic: /detection_result + cloud_topic: /registered_scan + odom_topic: /state_estimation + viewpoint_topic: /viewpoint_rep_header + object_type_answer_topic: /object_type_answer + object_type_query_topic: /object_type_query + target_object_spatial_query_topic: /target_object_spatial_query + target_object_instruction_topic: /target_object_instruction + object_nodes_topic: /object_nodes + obj_points_topic: /obj_points + obj_boxes_topic: /obj_boxes + obj_labels_topic: /obj_labels + map_frame: map + + confidence_threshold: 0.35 + voxel_size: 0.05 + regularize_voxel_size: 0.15 + distance_threshold: 0.3 + iou_threshold: 0.3 + min_detections_for_permanent: 3 + pending_ttl_s: 5.0 + num_angle_bins: 15 + + detection_linear_state_time_bias: 0.0 + detection_angular_state_time_bias: 0.0 + cloud_window_before: 0.5 + cloud_window_after: 0.1 + + camera_param_path: config/camera_param.yaml + depth_filter: true + depth_filter_margin: 0.15 + depth_jump_threshold: 0.5 + mask_erode_kernel: 3 + mask_erode_iterations: 1 + max_cloud_distance: 8.0 + visualization_max_objects: 100 + ground_filter: true + ground_radius: 2.5 + ground_z_clearance: 0.0 + + target_object: refrigerator + enable_object_type_queries: False + 
save_png: True + output_dir: output/object_images + save_viewpoint_images: True + viewpoint_output_dir: output/viewpoint_images + viewpoint_use_timestamp_dir: True + viewpoint_save_png: True + viewpoint_save_transform: True + +vlm_reasoning_node: + ros__parameters: + dump_room_type_images: True + room_type_dump_dir: output/room_type_images + room_type_cache_s: 10.0 \ No newline at end of file diff --git a/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md b/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md new file mode 100644 index 0000000..2c341fb --- /dev/null +++ b/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md @@ -0,0 +1,543 @@ +# Semantic Mapping Migration Plan + +## Overview + +This document outlines the comprehensive plan for porting the `semantic_mapping` package from `VLM_ROS` into `vector_perception_ros/semantic_mapping`, following the coding standards of `vector_perception_ros` while preserving functional compatibility with downstream consumers (particularly the tare_planner). + +> **Note**: Detection is decoupled into `detect_anything` package. VLM reasoning uses external Gemini API. + +### Current Status (Feb 2026) +- `semantic_mapping` and `detect_anything` are fully migrated and buildable. +- `detect_anything` installs `models/` into the package share; defaults resolve + `model_path` via package share to avoid cwd dependence. +- Room segmentation is now in `vector_perception_ros/sensor_coverage`. +- `/navigation_boundary` publication from room segmentation has been removed; + the dedicated `navigationBoundary` node should own that topic. + +### Handoff Notes (Next Agent) +- Validate that bagfile/real/sim launch files in autonomy-stack match VLM_ROS. +- Ensure `detect_anything` downloads MobileCLIP into the resolved model directory + (if weights still go to `~/.ultralytics/weights`, confirm `model_path` param). +- After changes, rebuild `detect_anything`, `semantic_mapping`, `sensor_coverage`. + +## Design Principles + +1. **Logic cleanup, not restructure** - Preserve the core algorithms (voxel voting, observation angles, shape regularization, IoU/distance matching) while cleaning up dead code and improving organization. +2. **Same I/O contract** - Maintain the same ROS topic names, message types, and semantics so existing planner integration continues to work. +3. **vector_perception_ros coding standards** - Full type hints, Google-style docstrings, dataclass containers, proper ROS2 patterns. +4. **ObjectDB-style patterns** - Adopt the two-tier (pending/permanent) architecture and dual matching (track_id + distance) from dimos where it improves clarity. 
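+
+For concreteness, the IoU/distance matching preserved by principle 1 reduces to an
+axis-aligned 3D IoU plus a centroid-distance gate. A minimal sketch, assuming boxes
+are passed as `(2, 3)` min/max corner arrays (the box representation used by the
+actual `utils/matching.py` may differ):
+
+```python
+import numpy as np
+
+
+def iou_3d_aabb(box1: np.ndarray, box2: np.ndarray) -> float:
+    """IoU of two axis-aligned 3D boxes given as (2, 3) [min_xyz, max_xyz] arrays."""
+    lo = np.maximum(box1[0], box2[0])
+    hi = np.minimum(box1[1], box2[1])
+    inter = float(np.prod(np.clip(hi - lo, 0.0, None)))
+    vol1 = float(np.prod(box1[1] - box1[0]))
+    vol2 = float(np.prod(box2[1] - box2[0]))
+    union = vol1 + vol2 - inter
+    return inter / union if union > 0.0 else 0.0
+```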
+ +--- + +## Implementation Progress + +| Step | Component | Status | +|------|-----------|--------| +| 1 | Package Setup & Messages | **COMPLETED** | +| 2 | Data Models (`map_object.py`) | **COMPLETED** | +| 2.1 | Refactor to use shared utilities | **COMPLETED** | +| 3 | Matching Utilities (`utils/matching.py`) | **COMPLETED** | +| 4 | Object Mapper (`mapper.py`) | **COMPLETED** | +| 5 | Cloud-Image Projection (`pointcloud_utils.py`) | **COMPLETED** | +| 6 | Message Publishers (`utils/map_object_utils.py`) | **COMPLETED** | +| 7 | Image Storage (`utils/storage.py`) | **COMPLETED** | +| 8 | Mapping Node (`mapping_node.py`) | **COMPLETED** | +| 9 | Launch Files & Config | **COMPLETED** | +| 10 | Integration Testing | **COMPLETED** | +| 11 | VLM Reasoning Node + Room Type Viz | **COMPLETED** | +| 12 | Tare Message Bridge (/semantic topics) | **COMPLETED** | + +### Step 2.1: Refactored map_object.py to use shared utilities + +Added to `vector_perception_utils/transform_utils.py`: +- `normalize_angle()` - now supports both scalar and array input +- `discretize_angles()` - discretize angles into bin indices + +Added to `vector_perception_utils/pointcloud_utils.py`: +- `get_oriented_bbox()` - returns (center, corners) tuple from minimal OBB + +Refactored in `map_object.py`: +- Removed duplicate `_normalize_angles_to_pi` and `_discretize_angles` functions +- Simplified `_compute_geometry()` from 43 lines to 15 using `get_oriented_bbox()` +- Use `create_pointcloud()` in `regularize_shape()` instead of manual o3d creation +- Use `compute_centroid()` instead of `np.mean(..., axis=0)` + +--- + +## VLM_ROS Reference (Source Implementation) + +Key files in `VLM_ROS/src/semantic_mapping/`: +- `semantic_map_new.py`: `ObjMapper` manages the persistent map +- `single_object_new.py`: `SingleObject` holds per-object state +- `semantic_mapping_node.py`: `MappingNode` orchestrates SAM2 masks, LiDAR projection, odom transforms + +### ObjMapper Core Flow +- Input: dict with `bboxes`, `labels`, `confidences`, `ids`, `masks` (from SAM2), plus LiDAR cloud and odom +- Memory fix: Uses `.copy()` on detection data extraction to prevent memory leaks +- VLM query threshold: Objects need `best_image_mask_area > 500` before querying VLM +- `update_map()` steps: + 1. Transform incoming cloud into body/world frame using odom + 2. Filter detections by confidence threshold + 3. For each detection: reconstruct full-image masks (bbox-cropped in `detect_anything/msg/DetectionResult.msg`), project masked cloud, build per-object voxels, then either merge or create a `SingleObject` + 4. Merge heuristics: match by tracker `obj_id`, 3D IoU, distance thresholds, and optional primitive groups (e.g., chair/sofa) + 5. Maintain delete lists, save queues (best image/mask per object) + 6. 
Publish ROS messages: `ObjectNode`, bbox markers, label markers, colored point cloud + +### SingleObject Highlights +- Keeps class_id→counts/confidence, track_id list, voxel grid (`VoxelFeatureManager`), robot poses, masks, best image saving +- Computes 3D bbox (axis-aligned + oriented), centroids, clustering flags, and validity caches +- Manages persistence (`life`, `inactive_frame`, `publish_status`) and VLM interaction state + +### dimos Patterns to Adopt +- `ObjMapper` ⇄ `ObjectDB`: explicit pending/permanent tiers, track_id + spatial matching, promotion thresholds +- `SingleObject` ⇄ `Object`: cleaner update patterns, separation of core geometry vs VLM metadata + +--- + +## Phase 1: Package Structure & Messages (COMPLETED) + +### 1.1 Package Structure + +``` +semantic_mapping/ +├── CMakeLists.txt # Hybrid CMake for message generation + Python install +├── package.xml # Dependencies: detect_anything, vector_perception_utils, ROS msgs +├── msg/ +│ ├── ObjectNode.msg +│ ├── ObjectType.msg +│ ├── TargetObjectInstruction.msg +│ ├── TargetObjectWithSpatial.msg +│ └── ViewpointRep.msg +├── config/ +│ ├── objects.yaml # Object taxonomy (to be copied from VLM_ROS) +│ └── params.yaml # Default node parameters +├── launch/ +│ └── semantic_mapping.launch.py +├── semantic_mapping/ +│ ├── __init__.py +│ ├── mapping_node.py # ROS2 node (orchestration) +│ ├── mapper.py # ObjectMapper class (core logic) +│ ├── map_object.py # Data classes: MapObject, Observation, VoxelManager +│ ├── utils/ +│ │ ├── matching.py # Object matching utilities (IoU, distance, track_id) +│ │ ├── map_object_utils.py # ROS message conversion helpers +│ │ └── storage.py # Async image saving queue +│ ├── (moved to vector_perception_utils) # Cloud-image projection helpers +├── docs/ +│ └── SEMANTIC_MAPPING_MIGRATION.md +└── resource/ + └── semantic_mapping # ament resource marker +``` + +### 1.2 Message Definitions + +**ObjectNode.msg** (preserves exact schema for planner compatibility): +``` +std_msgs/Header header +int32[] object_id +string label +geometry_msgs/Point position +geometry_msgs/Point[8] bbox3d # 8 corner points in world frame +sensor_msgs/PointCloud2 cloud +string status # "new", "updated", "unchanged" +string img_path +bool is_asked_vlm +int32 viewpoint_id +``` + +**RoomType.msg**: Room classification query/answer for VLM reasoning +**ObjectType.msg**: VLM query/answer for object relabeling +**TargetObjectInstruction.msg**: User-set target/anchor objects +**TargetObjectWithSpatial.msg**: Target object query with viewpoint context +**ViewpointRep.msg**: Viewpoint trigger message + +--- + +## Phase 2: Data Models (COMPLETED) + +### 2.1 ObjectStatus Enum + +```python +class ObjectStatus(Enum): + """Object lifecycle status (ObjectDB-style tiering).""" + PENDING = "pending" # < min_detections threshold + PERMANENT = "permanent" # >= threshold, confirmed object + DELETED = "deleted" # Marked for removal +``` + +### 2.2 Observation + +```python +@dataclass +class Observation: + """A single detection observation to update a MapObject.""" + track_id: int + label: str + confidence: float + points: np.ndarray # (N, 3) world-frame points + robot_pose: dict # {R: (3,3), t: (3,)} + mask: np.ndarray | None = None # 2D segmentation mask + image: np.ndarray | None = None # RGB crop for best-image selection + timestamp: float = 0.0 +``` + +### 2.3 VoxelFeatureManager + +```python +@dataclass +class VoxelFeatureManager: + """Manages voxelized point cloud with observation angle voting.""" + voxels: np.ndarray # (N, 3) world coords + 
votes: np.ndarray # (N,) detection count per voxel + observation_angles: np.ndarray # (N, num_bins) angle histogram + regularized_mask: np.ndarray | None # (N,) valid voxel mask after clustering + voxel_size: float = 0.03 + num_angle_bins: int = 20 + + def update(self, new_points: np.ndarray, robot_position: np.ndarray) -> None: + """Add new points with KDTree-based merge and angle voting.""" + + def get_valid_voxels(self, min_votes: int = 1) -> np.ndarray: + """Return voxels passing vote threshold and regularization mask.""" +``` + +### 2.4 MapObject + +```python +@dataclass +class MapObject: + """Represents a tracked object in the semantic map.""" + # Identity + object_id: str # UUID for this object + track_ids: list[int] # Associated tracker IDs (can merge) + + # Classification (voted) + class_votes: dict[str, int] # label -> observation count + confidence_scores: dict[str, float] # label -> best confidence + + # Geometry + voxel_manager: VoxelFeatureManager + robot_poses: list[dict] # [{R, t}, ...] observation poses + + # Lifecycle (ObjectDB-style) + detection_count: int = 0 + inactive_frames: int = 0 + status: ObjectStatus = ObjectStatus.PENDING + + # VLM interaction + is_asked_vlm: bool = False + updated_by_vlm: bool = False + best_image_path: str | None = None + best_image_mask_area: float = 0.0 + + @property + def label(self) -> str: ... + @property + def centroid(self) -> np.ndarray: ... + @property + def bbox3d(self) -> np.ndarray: ... + def update(self, observation: Observation) -> None: ... + def merge(self, other: MapObject) -> None: ... + def regularize_shape(self, percentile: float = 0.85) -> None: ... +``` + +--- + +## Phase 3: Matching Utilities (utils/matching.py) + +```python +def compute_iou_3d(bbox1: np.ndarray, bbox2: np.ndarray) -> float: + """Compute 3D IoU between two axis-aligned bounding boxes.""" + +def compute_center_distance(obj1: MapObject, obj2: MapObject) -> float: + """Compute Euclidean distance between object centroids.""" + +def are_merge_compatible(label1: str, label2: str, merge_groups: list[set[str]]) -> bool: + """Check if two labels can be merged (same class or in same merge group).""" + +def find_best_match( + obs: Observation, + candidates: list[MapObject], + distance_threshold: float, + iou_threshold: float, +) -> MapObject | None: + """Find best matching object by distance and IoU criteria.""" +``` + +--- + +## Phase 4: Object Mapper (mapper.py) + +```python +class ObjectMapper: + """ + Manages the semantic object map with two-tier state management. + Pending objects are promoted to permanent after detection_count >= threshold. + """ + + def __init__( + self, + voxel_size: float = 0.03, + confidence_threshold: float = 0.30, + distance_threshold: float = 0.5, + iou_threshold: float = 0.2, + min_detections_for_permanent: int = 3, + pending_ttl_s: float = 10.0, + max_inactive_frames: int = 20, + ) -> None: + self._pending_objects: dict[str, MapObject] = {} + self._permanent_objects: dict[str, MapObject] = {} + self._track_id_map: dict[int, str] = {} + self._lock = threading.RLock() + self._merge_groups: list[set[str]] = [ + {"chair", "sofa", "couch", "armchair"}, + {"table", "desk"}, + ] + + def update(self, observations: list[Observation], target_object: str | None = None) -> UpdateStats: ... + def _match_observation(self, obs: Observation) -> MapObject | None: ... + def _create_object(self, obs: Observation) -> MapObject: ... + def _check_promotion(self, obj: MapObject) -> None: ... + def _try_merge_objects(self) -> None: ... 
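    # (assumption) pruning removes pending objects whose pending_ttl_s has expired
    # and permanent objects past max_inactive_frames, returning the dropped
    # objects so delete markers can be published downstream.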
+ def _prune_inactive(self, target_object: str | None) -> list[MapObject]: ... + def get_objects(self, include_pending: bool = False) -> list[MapObject]: ... +``` + +--- + +## Phase 5: Mapping Node (mapping_node.py) + +### 5.1 Node Structure + +```python +class SemanticMappingNode(Node): + """ROS2 node for semantic 3D object mapping.""" + + def __init__(self) -> None: + super().__init__("semantic_mapping_node") + self._declare_parameters() + self.mapper = ObjectMapper(...) + self.projection = pointcloud_utils + self.image_saver = ImageSaveQueue(...) + + # Data buffers (with locks) + self._cloud_buffer: deque[tuple[float, np.ndarray]] = deque(maxlen=50) + self._odom_buffer: deque[tuple[float, Pose]] = deque(maxlen=100) + self._lock = threading.Lock() + + self._create_subscriptions() + self._create_publishers() + self.create_timer(0.5, self._mapping_callback) +``` + +### 5.2 Subscriptions + +| Topic | Type | QoS | +|-------|------|-----| +| `/detection_result` | `detect_anything/DetectionResult` | Reliable | +| `/registered_scan` | `PointCloud2` | Best Effort | +| `/state_estimation` | `Odometry` | Best Effort | +| `/semantic/object_type_answer` | `ObjectType` | Reliable | +| `/semantic/target_object_instruction` | `TargetObjectInstruction` | Reliable | +| `/semantic/viewpoint_rep_header` | `ViewpointRep` | Reliable | + +### 5.3 Publishers + +| Topic | Type | Description | +|-------|------|-------------| +| `/object_nodes` | `ObjectNode` | Main output for planner | +| `/obj_points` | `PointCloud2` | Colored object cloud | +| `/obj_boxes` | `MarkerArray` | Bbox wireframes | +| `/obj_labels` | `MarkerArray` | Text labels | +| `/object_type_query` | `ObjectType` | VLM query | +| `/target_object_spatial_query` | `TargetObjectWithSpatial` | Target object + viewpoints | + +Note: semantic_mapping topics are now published without a `/semantic` prefix and +no bridge is required. Annotated debug images are published by `detect_anything`. +Note: `semantic_mapping.launch.py` now includes a temporary image republisher +to bridge `compressed_image_topic` (e.g., `/camera/image/compressed`) to +`image_topic` (default `/camera/image`) so legacy `tare_planner` room-type +queries can still consume raw `sensor_msgs/Image`. + +--- + +## Phase 6: Cloud-Image Projection (pointcloud_utils.py) + +```python +def project_pointcloud_to_image(...): ... +def segment_pointcloud_by_mask(...): ... +def segment_pointclouds_from_projection(...): ... +def transform_segmented_clouds_to_world(...): ... +``` + +--- + +## Phase 7: Message Publishers (utils/map_object_utils.py) + +```python +def map_object_to_object_node(obj: MapObject, header: Header, status: str, target_object: str | None = None) -> ObjectNode | None: ... +def create_bbox_markers(objects: list[MapObject], header: Header) -> MarkerArray: ... +def create_label_markers(objects: list[MapObject], header: Header) -> MarkerArray: ... +def create_object_pointcloud(objects: list[MapObject], header: Header) -> PointCloud2 | None: ... +def create_delete_markers(deleted_ids: list[int], header: Header, namespace: str) -> MarkerArray: ... +``` + +Publisher helpers: +- `map_object_to_object_node` converts a `MapObject` into an `ObjectNode` with centroid, bbox corners, and point cloud. +- Marker helpers emit bbox wireframes and text labels for RViz. +- `create_object_pointcloud` builds a colored aggregate cloud from object voxels. 
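To make the conversion concrete, here is a minimal sketch of how a `MapObject` could be turned into an `ObjectNode`. It assumes `MapObject.bbox3d` returns the 8 corners as an `(8, 3)` array, that `get_valid_voxels()` yields world-frame `(N, 3)` points, and it uses `sensor_msgs_py.point_cloud2.create_cloud_xyz32`; the actual helper in `map_object_utils.py` may differ in details.

```python
# Illustrative sketch only; field names follow the MapObject/ObjectNode
# definitions above, but the real map_object_to_object_node may differ.
import numpy as np
from geometry_msgs.msg import Point
from sensor_msgs_py import point_cloud2
from std_msgs.msg import Header

from semantic_mapping.msg import ObjectNode


def map_object_to_object_node_sketch(obj, header: Header, status: str) -> ObjectNode:
    msg = ObjectNode()
    msg.header = header
    msg.object_id = [int(tid) for tid in obj.track_ids]  # merged tracker IDs
    msg.label = obj.label                                 # dominant voted label
    msg.status = status                                   # "new" / "updated" / "unchanged"
    msg.img_path = obj.best_image_path or ""
    msg.is_asked_vlm = obj.is_asked_vlm

    # Centroid in the world frame.
    cx, cy, cz = (float(v) for v in obj.centroid)
    msg.position = Point(x=cx, y=cy, z=cz)

    # Fixed-size Point[8] field; keep the corner ordering documented in the msg.
    msg.bbox3d = [Point(x=float(x), y=float(y), z=float(z)) for x, y, z in obj.bbox3d]

    # Publish the surviving voxels as the object cloud.
    points = obj.voxel_manager.get_valid_voxels().astype(np.float32)
    msg.cloud = point_cloud2.create_cloud_xyz32(header, points)
    return msg
```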
+ +--- + +## Phase 8: Image Storage (utils/storage.py) + +```python +class ImageSaveQueue: + """Background worker for saving object images to disk.""" + + def __init__(self, output_dir: Path, max_queue_size: int = 100) -> None: ... + def start(self) -> None: ... + def stop(self) -> None: ... + def queue_save(self, object_id: str, image: np.ndarray, mask: np.ndarray | None = None) -> str: ... +``` + +Image storage: +- Async worker writes `.npy` (and optional `.png`) crops to disk. +- Queue is bounded to avoid blocking the main mapping loop. +- Viewpoint snapshots are saved as `viewpoint_{id}.png` and + `viewpoint_{id}_transform.npy` in `viewpoint_output_dir` (timestamped per run). + +--- + +## Phase 9: Launch Files + +```python +# semantic_mapping.launch.py +DeclareLaunchArgument('platform', default_value='mecanum_sim') +DeclareLaunchArgument('detection_topic', default_value='/detection_result') +DeclareLaunchArgument('cloud_topic', default_value='/registered_scan') +DeclareLaunchArgument('odom_topic', default_value='/state_estimation') +DeclareLaunchArgument('voxel_size', default_value='0.03') +DeclareLaunchArgument('confidence_threshold', default_value='0.30') +DeclareLaunchArgument('target_object', default_value='') +``` + +--- + +## Implementation Checklist + +### Completed +- [x] Create `CMakeLists.txt` for message generation +- [x] Port message definitions from tare_planner +- [x] Set up `package.xml` with dependencies +- [x] Implement `map_object.py` with VoxelFeatureManager, MapObject, Observation +- [x] Port lazy computation patterns (centroid, bbox3d, regularization) +- [x] Refactor `map_object.py` to use shared utilities from `vector_perception_utils` +- [x] Add `normalize_angle`, `discretize_angles` to `transform_utils.py` +- [x] Add `get_oriented_bbox` to `pointcloud_utils.py` +- [x] Simplify `_compute_geometry()` using Open3D oriented bbox +- [x] Implement `utils/matching.py` with IoU, distance, merge-compatibility functions +- [x] Implement `mapper.py` with ObjectDB-style tiering +- [x] Implement cloud-image projection helpers in `vector_perception_utils/pointcloud_utils.py` +- [x] Implement `utils/map_object_utils.py` message conversion functions +- [x] Implement `utils/storage.py` with async queue +- [x] Implement `mapping_node.py` following detect_anything patterns +- [x] Add `RoomType.msg` and room-type VLM flow + visualization +- [x] Add `vlm_reasoning_node.py` with Moondream-backed reasoning +- [x] Remove `tare_message_bridge` and `/semantic` topic namespace + +### Remaining +- [x] Create launch files and params.yaml +- [x] Integration testing with bag files + +--- + +## Code Cleanup Items + +Items from VLM_ROS to preserve: +- Memory fixes with `.copy()` on detection data extraction +- VLM query threshold (`best_image_mask_area > 500`) + +Items to remove: +1. Unused publishers: `/cloud_image` +2. Unused methods: `serialize_objs_to_bag()`, `is_merge_allowed()`, `handle_object_query()` +3. Review constants: `BACKGROUND_OBJECTS`, `VERTICAL_OBJECTS`, `BIG_OBJECTS`, `SMALL_OBJECTS` +4. 
Don't port: `byte_track`, `Grounded-SAM-2`, `mmconfig` directories + +--- + +## Compatibility Checklist + +- [x] `/semantic/object_nodes` publishes ObjectNode with exact schema (bridged to `/object_nodes`) +- [x] `/obj_points` publishes colored PointCloud2 +- [x] `/obj_boxes` publishes MarkerArray wireframes +- [x] `/obj_labels` publishes MarkerArray text +- [x] `/semantic/object_type_query` publishes ObjectType for VLM +- [x] `/semantic/room_type_query` publishes RoomType for VLM +- [x] Subscribes to `/detection_result` from detect_anything +- [x] Subscribes to `/registered_scan` PointCloud2 +- [x] Subscribes to `/state_estimation` Odometry +- [x] Subscribes to `/semantic/target_object_instruction` +- [x] Subscribes to `/semantic/viewpoint_rep_header` +- [x] Subscribes to `/semantic/object_type_answer` +- [x] Subscribes to `/semantic/room_type_answer` +- [x] Object positions, bbox corners, status flags match expected semantics +- [x] Delete markers properly remove objects from RViz + +--- + +## Message Type Alignment (Planner Integration) + +Planner/VLM traffic now uses `semantic_mapping/msg/*` end-to-end. The +previous package-level mismatch has been removed, so no bridge is required. + +### Impact + +- `semantic_mapping` publishes and subscribes to `semantic_mapping/*` messages + on planner topic names (no `/semantic` prefix required). + +--- + +## Recent Updates (Jan 2026) + +- Unified launch to `semantic_mapping.launch.py` with mode handling, detect_anything integration, and bagfile defaults. +- Fixed cloud/image projection transform to match legacy `CloudImageFusion` math. +- Added mask erosion, depth-jump pruning, distance gating, and light outlier filtering for cleaner object clouds. +- Added reprojected observation-angle voting for better multi-view aggregation. +- Fixed bbox ordering and switched `/obj_boxes` to axis-aligned markers (via `get_oriented_bbox(axis_aligned=True)`). +- Replaced DBSCAN regularization with vote-weighted filtering + voxel connected components. +- Added `regularize_voxel_size` to grow connectivity for thin structures. +- Simplified ground filtering to radius-based Z clamp around robot. +- Added `visualization_max_objects` to keep recent objects visible in RViz. +- Logging: mapping stats (total/total_deleted/total_permanent) + mapping loop time. +- Fixed image saving color order and ensured output dir is cleared on start. +- Set MobileCLIP weights cache to `detect_anything/models` and removed corrupted caches. +- Viewpoint snapshots are saved via `ImageSaveQueue` with timestamped run folders. +- Added `RoomType.msg` and VLM room-type query/answer with room label visualization. +- Removed `/semantic` topic namespace and bridge in favor of unified message types. +- Moved `project_bbox3d_equirect` into `vector_perception_utils` to drop `dimos` dependency. +- Added Moondream request timeouts and escaped JSON prompt templates to avoid hangs. +- Added object-type query fallback when target labels are absent. +- Stabilized room label colors and trimmed non-essential logging to debug level. + +## Temporary Compatibility Notes + +- Planner and mapping nodes now share `semantic_mapping/msg/*`; no bridge is required. + +## Temporary Handoff Notes + +- Regularization is now connected-components; tune `regularize_voxel_size` and keep ratio + (`MapObject.regularize_shape` percentile) if legs/sparse parts are dropped. +- Ground filter uses `ground_filter`, `ground_radius`, `ground_z_clearance`. 
+- If bagfile sync issues persist, consider switching mapping inputs to `/sensor_scan` + `/state_estimation_at_scan`. +- Verify VLM objects.yaml path in VLM_ROS (`vlm_reasoning_node` still expects old path). + +--- + +## Planner Dependencies + +The exploration planner (`sensor_coverage_planner_ground.cpp` in tare_planner) is tightly coupled to `ObjectNode`: +- Subscribes to `/object_nodes` and stores per-object reps +- Uses object positions, bbox corners, clouds, status, `is_asked_vlm`, and `img_path` for navigation decisions + +**Implication**: Keep the `ObjectNode` schema stable. Any new fields from the `MapObject` model can be mapped into extensions, but existing fields and semantics must remain for planner compatibility. diff --git a/semantic_mapping/docs/TARE_PLANNER_VLM_DIFF_AND_MIGRATION.md b/semantic_mapping/docs/TARE_PLANNER_VLM_DIFF_AND_MIGRATION.md new file mode 100644 index 0000000..471e99c --- /dev/null +++ b/semantic_mapping/docs/TARE_PLANNER_VLM_DIFF_AND_MIGRATION.md @@ -0,0 +1,733 @@ +# TARE Planner (VLM_ROS vs Autonomy Stack) Diff + VLM Logic Deep Dive + +## 0) Purpose and Scope +This document compares the two `tare_planner/src` trees and provides a detailed +breakdown of the VLM/room logic inside +`VLM_ROS/src/exploration_planner/tare_planner/src/sensor_coverage_planner/sensor_coverage_planner_ground.cpp`. +It also proposes a migration path into a new `vector_perception_ros` +package named `sensor_coverage_planner`, including room segmentation. + +### 0.1 Migration Status (Feb 2026) +Completed: +- `tare_planner` in autonomy-stack now uses `semantic_mapping/msg/*` in + `representation` and `sensor_coverage_planner_ground` (headers + source). +- `tare_planner` CMake and package dependencies updated for `semantic_mapping`. +- `navigationBoundary` publisher code matches VLM_ROS. +- Added `boundary_sim.ply` to `tare_planner/data` for sim launch. +- `explore_world.launch` and `explore_world_sim.launch` default + `use_boundary:=false` (no boundary published unless explicitly enabled). +- Room segmentation node moved to `vector_perception_ros/sensor_coverage`. + +Open items / verification: +- Re-check `system_bagfile_with_exploration_planner.launch.py` alignment with + VLM_ROS (file was reverted during edits). +- Ensure `tare_planner` install includes `boundary_sim.ply` after rebuild. +- Remove `pub_room_boundary_tmp_` declaration in room segmentation header + if no longer used. + +Scope: +- Compare all package/file differences between: + - `autonomy_stack_mecanum_wheel_platform/src/exploration_planner/tare_planner/src` + - `VLM_ROS/src/exploration_planner/tare_planner/src` +- Map all ROS inputs/outputs (topics + message types) for the nodes in scope. +- Deep dive on the main VLM logic file (SensorCoveragePlanner3D). +- Feasibility + cleanup/migration plan into `vector_perception_ros`. + +## 1) Context From Existing Migration Docs (Short) +Key context pulled from: +- `vector_perception_ros/vlm/docs/VLM_INPUTS_AND_SELECTION.md` +- `vector_perception_ros/vlm/docs/VLM_NODE_MIGRATION.md` +- `vector_perception_ros/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md` + +Highlights relevant to `tare_planner`: +- Room-type VLM images are produced inside `sensor_coverage_planner_ground.cpp` + via `project_pcl_to_image(...)` (cropped panorama + room mask). Viewpoint + snapshots are **not** used for room type. +- Object-type VLM uses the "best image" crop from semantic mapping, not planner. +- Target-object spatial queries use viewpoint snapshots and 3D bbox projection. 
+- Message types are now unified under `semantic_mapping/msg/*`; no bridge is + required. +- Real-time constraints: avoid heavy per-parameter checks and excessive + try/except patterns in hot loops. + +## 2) Directory and File Differences + +### 2.1 Directory-Level Diff +Only in `VLM_ROS/src/exploration_planner/tare_planner/src`: +- `representation/` + - `representation.cpp` +- `room_segmentation/` + - `room_segmentation.cpp` + - `room_segmentation_incremental_normal.cpp` + - `room_segmentation_backup.cpp` + +All other directories are present in both trees. + +### 2.2 File-Level Diff (Common Directories) +Files that differ (content changes): +- `grid_world/grid_world.cpp` +- `local_coverage_planner/local_coverage_planner.cpp` +- `planning_env/planning_env.cpp` +- `rolling_occupancy_grid/rolling_occupancy_grid.cpp` +- `sensor_coverage_planner/sensor_coverage_planner_ground.cpp` +- `utils/misc_utils.cpp` +- `viewpoint/viewpoint.cpp` +- `viewpoint_manager/viewpoint_manager.cpp` + +Summary of differences by file: + +**`grid_world.cpp`** +- Adds room-level state and navigation gates: + - `current_room_id_`, `transit_across_room_`, `room_finished_`, + `room_near_finished_`, `door_position_`. +- Adds object/anchor state gates: + - `object_found_`, `anchor_object_found_`, `found_object_position_`. +- Reads room resolution + voxel dimensions (`room_x/y/z`) and computes `shift_`. +- Restricts cell status transitions to cells that are "in current room". +- Adds explicit pathing for: + - transit across rooms (door path), + - found object path, + - anchor object TSP path (dynamic mode). +- Adds `InLocalPlanningHorizonWithoutRoom` gating for roadmap connections. + +**`local_coverage_planner.cpp`** +- Introduces room-aware viewpoint selection: + - `viewpoint_roomcandidate_indices` and + `ViewPointRoomCandidate` sampling. +- Lookahead and boundary viewpoint logic now bypasses room gating when needed: + - `InLocalPlanningHorizonWithoutRoom(...)`. +- Adds `transit_across_room_`, `object_found_`, `anchor_object_found_` gates. +- When transiting/target found: local planning becomes a constrained path + using navigation viewpoints rather than exploration sampling. + +**`planning_env.cpp`** +- Adds door collision checks and parameters: + - `kDoorCollisionCheckRadius`, `kDoorCollisionCheckPointNumThr`. +- Adds occupancy-grid line-of-sight hooks: + - `CheckLineOfSightInOccupancyGrid(...)` + - `Pos2Sub(...)`, `InRange(...)`. +- Adds `occupied_cloud` storage for room segmentation inputs. +- Improves safety on coverage-boundary usage (warn when empty). + +**`rolling_occupancy_grid.cpp`** +- Adds sensor-range-limited ray tracing and keeps "COVERED" cells. +- Adds visibility / line-of-sight check for ray casting: + - `CheckLineOfSight(...)`, `PublishVoxels(...)`. +- Adds helper conversions `Pos2Sub(...)`, `InRange(...)`. +- Adds parameters for viewpoint collision margin Z. +- Publishes ray voxels for debugging. + +**`utils/misc_utils.cpp`** +- Adds `SetIntersection(...)` (used for viewpoint rep updates). +- Adds BFS path helper `find_path_bfs(...)` (room transitions). +- Adds `idToColor(...)` and HSV conversion (stable room coloring). +- Adds voxel mapping helpers: + - `point_to_voxel(...)`, `voxel_to_point(...)`, + cropped variants for room masks. + +**`viewpoint.cpp`** +- Adds room flags to each viewpoint: + - `in_room_`, `is_room_candidate_`. + +**`viewpoint_manager.cpp`** +- Reads room parameters and maintains `room_mask_`. +- Adds `CheckViewPointRoomBoundaryCollision()` to set `in_room_`. 
+- Adds room candidate flagging and exposure in candidate selection. +- Adds `InLocalPlanningHorizonWithoutRoom(...)` to allow pathing near doors. +- Adds freespace cloud publication and room gates on candidate viewpoints. + +**`sensor_coverage_planner_ground.cpp`** +- Huge expansion to integrate: + - `representation` (viewpoint, room, object). + - `room_segmentation` inputs. + - VLM room/object query + navigation logic. + - Object visibility graph and viewpoint snapshots. + - Room navigation state machine (finish, change, transit). +- Full deep dive in section 4. + +### 2.3 Common Core Between the Two `sensor_coverage_planner_ground.cpp` Versions +These are shared in both the autonomy-stack and VLM_ROS files (the VLM version +layers additional room/object logic on top). + +**Common node lifecycle** +- `SensorCoveragePlanner3D` class, 1 Hz `execute()` timer. +- Same init chain: `ReadParameters()` → `InitializeData()` → `initialize()`. +- Shared managers and core members: `keypose_graph_`, `planning_env_`, + `grid_world_`, `local_coverage_planner_`, `viewpoint_manager_`, + `visualizer_`, plus the same PCL debug clouds. + +**Common subscriptions** +- `/exploration_start`, `/state_estimation_at_scan`, `/registered_scan` +- `/terrain_map`, `/terrain_map_ext` +- `/coverage_boundary`, `/navigation_boundary`, `/nogo_boundary` +- `/joy`, `/reset_waypoint` + +**Common publishers** +- `global_path_full`, `global_path`, `local_path`, `exploration_path` +- `/way_point`, `exploration_finish` +- `runtime_breakdown`, `/runtime`, `momentum_activation_count` + +**Common planning pipeline in `execute()`** +- Start gate on `/exploration_start` unless `kAutoStart`. +- First-cycle bootstrap: `SendInitialWaypoint()` + start-time init. +- On `keypose_cloud_update_`: + - `CountDirectionChange()` + - `UpdateGlobalRepresentation()` + - `UpdateViewPoints()` and `UpdateKeyposeGraph()` + - `UpdateViewPointCoverage()` + `UpdateCoveredAreas()` + - `GlobalPlanning()` + `LocalPlanning()` + - `ConcatenateGlobalLocalPath()` + `GetLookAheadPoint()` + - `PublishWaypoint()` + visualization + runtime publishing + +**Common helper functions** +- `SendInitialWaypoint`, `UpdateKeyposeGraph`, `UpdateViewPoints`, + `UpdateViewPointCoverage`, `UpdateCoveredAreas`, `UpdateGlobalRepresentation` +- `GlobalPlanning`, `LocalPlanning` +- `PublishGlobalPlanningVisualization`, `PublishLocalPlanningVisualization` +- `ConcatenateGlobalLocalPath`, `GetLookAheadPoint`, `PublishWaypoint`, + `PublishRuntime`, `CountDirectionChange` + +## 3) ROS I/O Mapping + +### 3.1 `SensorCoveragePlanner3D` (VLM_ROS) +Node: `tare_planner_node` creates and spins `SensorCoveragePlanner3D`. 
+ +Subscriptions (inputs): +- `/exploration_start` (`std_msgs/Bool`) +- `/state_estimation_at_scan` (`nav_msgs/Odometry`) +- `/registered_scan` (`sensor_msgs/PointCloud2`) +- `/terrain_map` (`sensor_msgs/PointCloud2`) +- `/terrain_map_ext` (`sensor_msgs/PointCloud2`) +- `/coverage_boundary` (`geometry_msgs/PolygonStamped`) +- `/navigation_boundary` (`geometry_msgs/PolygonStamped`) +- `/current_room_boundary` (`geometry_msgs/PolygonStamped`) [declared param, used in room manager] +- `/nogo_boundary` (`geometry_msgs/PolygonStamped`) +- `/joy` (`sensor_msgs/Joy`) +- `/reset_waypoint` (`std_msgs/Empty`) +- `/object_nodes` (`tare_planner/ObjectNode`) +- `/door_cloud` (`sensor_msgs/PointCloud2`) +- `/room_nodes_list` (`tare_planner/RoomNodeList`) +- `/goal_point` (`geometry_msgs/PointStamped`) +- `/room_mask` (`sensor_msgs/Image`) +- `/camera/image` (`sensor_msgs/Image`) +- `/room_type_answer` (`tare_planner/RoomType`) +- `/room_navigation_answer` (`tare_planner/VlmAnswer`) +- `/keyboard_input` (`std_msgs/String`) +- `/target_object_instruction` (`tare_planner/TargetObjectInstruction`) +- `/target_object_answer` (`tare_planner/TargetObject`) +- `/anchor_object_answer` (`tare_planner/TargetObject`) + +Publishers (outputs): +- `global_path_full` (`nav_msgs/Path`) +- `global_path` (`nav_msgs/Path`) +- `old_global_path` (`nav_msgs/Path`) +- `to_nearest_global_subspace_path` (`nav_msgs/Path`) +- `local_path` (`nav_msgs/Path`) +- `exploration_path` (`nav_msgs/Path`) +- `/way_point` (`geometry_msgs/PointStamped`) +- `exploration_finish` (`std_msgs/Bool`) +- `runtime_breakdown` (`std_msgs/Int32MultiArray`) +- `/runtime` (`std_msgs/Float32`) +- `momentum_activation_count` (`std_msgs/Int32`) +- `pointcloud_manager_neighbor_cells_origin` (`geometry_msgs/PointStamped`) +- `viewpoint_rep_header` (`tare_planner/ViewpointRep`) +- `object_visibility_connections` (`visualization_msgs/MarkerArray`) +- `viewpoint_object_visibility` (`std_msgs/String`) [currently unused] +- `/door_position` (`geometry_msgs/PointStamped`) +- `/door_normal` (`visualization_msgs/Marker`) +- `/room_cloud` (`sensor_msgs/PointCloud2`) +- `/room_type_query` (`tare_planner/RoomType`) +- `/room_early_stop_1` (`tare_planner/RoomEarlyStop1`) +- `/room_type_vis` (`visualization_msgs/MarkerArray`) +- `viewpoint_room_ids` (`visualization_msgs/MarkerArray`) +- `/room_navigation_query` (`tare_planner/NavigationQuery`) +- `/object_node_markers` (`visualization_msgs/MarkerArray`) +- `/chosen_room_boundary` (`visualization_msgs/Marker`) +- `/target_object_query` (`tare_planner/TargetObject`) +- `/anchor_object_query` (`tare_planner/TargetObject`) +- `/target_object_spatial_query` (`tare_planner/TargetObjectWithSpatial`) +- `/room_anchor_point` (`geometry_msgs/PointStamped`) + +Key parameters (VLM additions): +- `room_resolution`, `room_x`, `room_y`, `room_z` +- `rep_threshold_`, `kRepSensorRange` +- `kRushRoomDist_1`, `kRushRoomDist_2` +- `kCollisionGrid*`, `kCoverage*` (inherited) + +I/O delta vs autonomy_stack: +- Autonomy version uses only exploration, map, boundary, joystick, and path + topics. VLM_ROS adds: room segmentation, object nodes, VLM queries/answers, + viewpoint reps, visibility markers, navigation queries, door clouds, room + masks, camera image pipeline. + +### 3.2 `RoomSegmentationNode` (VLM_ROS) +Node: `room_segmentation_node` (standalone main in `room_segmentation.cpp`). 
+ +Subscriptions: +- `/registered_scan` (`sensor_msgs/PointCloud2`) +- `/state_estimation` (`nav_msgs/Odometry`) +- `/occupied_cloud` (`sensor_msgs/PointCloud2`) +- `/freespace_cloud` (`sensor_msgs/PointCloud2`) + +Publishers: +- `/explore_areas_new` (`sensor_msgs/PointCloud2`) +- `/room_mask_vis` (`sensor_msgs/Image`) +- `/room_mask` (`sensor_msgs/Image`) +- `/door_cloud` (`sensor_msgs/PointCloud2`) +- `/debug_cloud` (`sensor_msgs/PointCloud2`) +- `/free_cloud_1` (`sensor_msgs/PointCloud2`) +- `/current_room_boundary` (`geometry_msgs/PolygonStamped`) +- `/navigation_boundary` (`geometry_msgs/PolygonStamped`) +- `/room_boundaries` (`visualization_msgs/MarkerArray`) +- `/walls` (`sensor_msgs/PointCloud2`) +- `/room_nodes` (`tare_planner/RoomNode`) +- `/room_nodes_list` (`tare_planner/RoomNodeList`) +- `/room_map_cloud` (`sensor_msgs/PointCloud2`) + +Key parameters: +- `room_resolution`, `room_x/y/z`, `rolling_occupancy_grid.resolution_x` +- `exploredAreaVoxelSize`, `ceilingHeight_`, `wall_thres_height_` +- `region_growing_radius`, `min_room_size`, `dilation_iteration` +- `normal_search_num`, `normal_search_radius` +- `kViewPointCollisionMarginZPlus`, `kViewPointCollisionMarginZMinus` + +### 3.3 `navigationBoundary` Node +Node: `navigationBoundary` (file-based boundary publisher). + +Publishers: +- `/navigation_boundary` (`geometry_msgs/PolygonStamped`) + +Inputs: +- File on disk via `boundary_file_dir` param +- Params: `sendBoundary`, `sendBoundaryInterval` + +### 3.4 Legacy `SensorCoveragePlanner3D` Outputs (Autonomy Stack) +These are the outputs of the **old** planner that must be preserved/relayed +so navigation keeps working unchanged. + +**Primary outputs (used by navigation stack + tooling)** +- `global_path_full` (`nav_msgs/Path`) +- `global_path` (`nav_msgs/Path`) +- `old_global_path` (`nav_msgs/Path`) +- `to_nearest_global_subspace_path` (`nav_msgs/Path`) +- `local_path` (`nav_msgs/Path`) +- `exploration_path` (`nav_msgs/Path`) +- `/way_point` (`geometry_msgs/PointStamped`) +- `exploration_finish` (`std_msgs/Bool`) +- `runtime_breakdown` (`std_msgs/Int32MultiArray`) +- `/runtime` (`std_msgs/Float32`) +- `momentum_activation_count` (`std_msgs/Int32`) +- `pointcloud_manager_neighbor_cells_origin` (`geometry_msgs/PointStamped`) + +**Debug/visualization outputs (PCLCloud wrappers)** +These are published via `PCLCloud::Publish()` in the old file and appear as +topics with the names passed at construction time: +- `keypose_cloud`, `registered_scan_stack`, `registered_cloud` +- `terrain_cloud_large`, `terrain_collision_cloud`, `terrain_ext_collision_cloud` +- `viewpoint_vis_cloud`, `viewpoint_in_collision_cloud_` +- `grid_world_vis_cloud`, `bspline_path_cloud`, `selected_viewpoint_vis_cloud` +- `collision_cloud`, `lookahead_point_cloud`, `keypose_graph_cloud` +- `pointcloud_manager_cloud`, `reordered_global_subspace_cloud` + +If you only want to keep strict navigation compatibility, the **primary** +outputs above are the ones that must be relayed; debug topics can be optional. + +## 4) SensorCoveragePlanner3D VLM Logic: Deep Dive + +This section breaks down the VLM/room logic inside +`sensor_coverage_planner_ground.cpp` with explicit loops, data dependencies, +and I/O. + +### 4.0 Legacy Core Data Structures Reused by VLM Logic +These are **old planner data structures** that the VLM logic depends on and +assumes to behave the same way. They should stay intact when migrating the +VLM logic into `vector_perception_ros`. 
+ +**Core managers (shared across old + new logic)** +- `planning_env_`: provides coverage, collision cloud, occupancy-grid access, + and diff/stacked clouds used by room updates and visibility checks. +- `grid_world_`: global exploration state, cell status, and return-home logic. +- `viewpoint_manager_`: viewpoint candidates, LOS/connectivity, and local horizon. +- `keypose_graph_`: shortest-path computations for objects and room navigation. +- `local_coverage_planner_`: local path TSP and waypoint ordering. +- `visualizer_`: markers for local/global planning, paths, and debug views. + +**State gating and control** +- `keypose_cloud_update_`: the main update latch that drives each cycle. +- `robot_position_`, `robot_yaw_`, `moving_direction_`, `moving_forward_`: + pose and motion state used throughout VLM + navigation logic. +- `lookahead_point_`, `lookahead_point_direction_`: + main input to waypoint publishing. +- `visited_positions_`: used for coverage heuristics and candidate pruning. + +**Cloud buffers and derived products** +- `keypose_cloud_`, `registered_scan_stack_`, `registered_cloud_`: + keypose + scan accumulation and downsampling. +- `terrain_*` clouds: terrain height and collision data for viewpoint filtering. +- `collision_cloud_`, `viewpoint_vis_cloud_`, `viewpoint_in_collision_cloud_`: + inputs to viewpoint candidate selection and visualization. +- `grid_world_vis_cloud_`, `exploration_path_cloud_`, + `selected_viewpoint_vis_cloud_`, `lookahead_point_cloud_`, + `keypose_graph_vis_cloud_`, `pointcloud_manager_neighbor_cloud_`: + shared debug channels for planners and RViz. + +**Planning state** +- `exploration_path_` and intermediate global/local paths. +- `robot_viewpoint_`: robot coverage proxy for occupancy updates. + +These structures are **not VLM-specific**, but the new VLM logic reads from +them heavily (coverage clouds, keypose graph distances, and viewpoint manager +state), so they must be preserved in the new node. +### 4.1 Core Data Structures and State +**Representation layer** +- `representation_` (new in VLM_ROS) + - `ViewPointRep`: viewpoint snapshots + visibility lists. + - `RoomNodeRep`: room labels, anchor point, polygon, room mask. + - `ObjectNodeRep`: object attributes + visible viewpoints. + +**Room grid** +- `room_mask_`, `room_mask_old_`: `cv::Mat` integer room id maps. +- `room_voxel_dimension_`, `room_resolution_`, `shift_`: map voxel grid. +- `current_room_id_`, `target_room_id_`, `start_room_id_`, `end_room_id_`. +- `prev_room_id_`, `previous_room_id_`: used for split/merge handling. +- `adjacency_matrix`: room connectivity from door clouds. + +**Door / transition state** +- `door_cloud_`, `door_cloud_final_`, `door_cloud_in_range_`. +- `door_position_`, `door_normal_`. +- `transit_across_room_`, `at_room_`, `near_room_1_`, `near_room_2_`. +- `enter_wrong_room_`, `ask_vlm_change_room_`. + +**VLM query state** +- `ask_vlm_near_room_`, `ask_vlm_finish_room_`, `asked_in_advance_`. +- `room_navigation_query_counter_`, `room_finished_counter_`. +- `candidate_room_position_`, `has_candidate_room_position_`. + +**Object state** +- `found_object_`, `found_object_id_`, `found_object_distance_`, `ask_found_object_`. +- `found_anchor_object_`, `anchor_object_`, `found_anchor_object_viewpoint_positions_`. +- `obj_score_`, `considered_object_ids_`. + +**Viewpoint representation** +- `viewpoint_reps_`, `curr_viewpoint_rep_node_ind`. +- `add_viewpoint_rep_`, `rep_threshold_`, `rep_sensor_range`, `rep_threshold_voxel_num_`. 
+- `current_obs_voxel_inds_`, `previous_obs_voxel_inds_`. + +**Camera + odom alignment** +- `camera_image_`, `imageTime` (from `/camera/image`). +- Odom stacks for interpolation: `odomTimeStack`, `lidarXStack`, etc. + +### 4.2 Input Callbacks (Data Ingestion) +**StateEstimationCallback** +- Updates robot position, yaw, motion direction. +- Maintains odom ring buffers for time-aligned projection in `GetPoseAtTime`. +- Updates `viewpoint_rep_msg_` header stamp/frame. + +**RegisteredScanCallback** +- Aggregates scans, downsamples, updates planning environment. +- Every N frames: pushes keypose node into graph and sets `keypose_cloud_update_`. + +**RoomMaskCallback** +- Converts `/room_mask` image to `cv::Mat` (int32), resizes to room voxel grid. +- Updates: + - `room_mask_` + - `viewpoint_manager_` room mask + - `grid_world_` room mask + - `representation_` viewpoint room ids (`UpdateViewpointRoomIdsFromMask`) + +**RoomNodeListCallback** +- Ingests `/room_nodes_list` and updates representation room map. +- Marks nodes as alive, removes dead nodes, validates anchor points against mask. + +**ObjectNodeCallback** +- Updates representation object node from `/object_nodes`. +- Maintains `latest_object_node_indices_` for visibility updates. + +**DoorCloudCallback** +- Ingests `/door_cloud`, filters by collision, builds adjacency matrix. +- Colors door point cloud for visualization. + +**CameraImageCallback** +- Stores panoramic image for room crop pipeline. +- Updates `imageTime` for projection alignment. + +**RoomTypeCallback** +- Ingests `/room_type_answer`: + - Locates room id from anchor point and room mask. + - Accumulates voxel counts per label. + - If room label changes, marks objects in that room as unconsidered. + +**RoomNavigationAnswerCallback** +- Ingests `/room_navigation_answer`. +- `answer_type == 1`: early stop; immediately sets goal via anchor point. +- `answer_type == 0`: room navigation answer; stores candidate room anchor. + +**TargetObjectInstructionCallback** +- Resets target/anchor object context. +- Clears considered flags on objects and resets room asked flags. + +**TargetObjectCallback / AnchorObjectCallback** +- Marks object or anchor as found if VLM confirms. +- Updates distances and stores object position + room id. + +### 4.3 Object Visibility Graph (Major Loop) +**`UpdateObjectVisibility()`** +- Input: `representation_->GetLatestObjectNodeIndicesMutable()` (new objects). +- For each object id: + - Fetch object position + cloud, then convert cloud to voxel indices + via `Convert2Voxels(...)` (uses `planning_env_->Pos2Sub` + `InRange`). + - For each `ViewPointRep`: + - If viewpoint already references the object, mark direct visibility and + continue. + - If object has the viewpoint in its visible list, connect immediately. + - Else do ray-casting visibility: + - Compute viewpoint camera position (+0.265 z). + - Skip if outside `rep_sensor_range`. + - For each object voxel, call + `CheckRayVisibilityInOccupancyGrid(...)` which defers to + `rolling_occupancy_grid_->CheckLineOfSight(...)`. + - On first visible voxel: + - Add object index to viewpoint. + - Add viewpoint id to object. + - Mark object as not considered. + - Object-room relation: + - Map object centroid to room voxel -> room id. + - If room id is 0 (unknown), dilate search by 2 voxels to find a valid id. + - Call `representation_->SetObjectRoomRelation(...)`. 
+- After processing all new objects: + - Compute `obj_score_` by ray-casting from current robot position to object + voxels, with higher weight if object has no visible viewpoints. + - This score drives `UpdateViewpointRep()` (adds viewpoint reps when high). + +**`UpdateViewpointObjectVisibility()`** +- Runs when a new viewpoint rep is created. +- For each object in `ObjectNodeRepMap`: + - Skip if already in current viewpoint visibility. + - Ray-cast from current viewpoint position to object voxels. + - On visibility: add to viewpoint + add viewpoint to object + clear considered. + +**Visualization** +- `CreateVisibilityMarkers()`: publishes `MarkerArray` lines between viewpoints + and visible objects; color indicates direct vs inferred visibility. +- `PublishViewpointRoomIdMarkers()`: publishes text markers `VR`. + +### 4.4 Viewpoint Representation Update +**`UpdateViewpointRep()`** +- Pulls `current_obs_voxel_inds_` from `planning_env_`. +- Computes intersection with `previous_obs_voxel_inds_` via + `misc_utils_ns::SetIntersection`. +- Adds a new viewpoint rep when: + - `intersection_voxel_num < rep_threshold_voxel_num_` **and** + - `intersection_ratio < rep_threshold_`, + - or `obj_score_ > 4.0`. +- On add: + - `representation_->AddViewPointRep(...)`. + - Set room id to `current_room_id_`. + - Publish `viewpoint_rep_header` only if a new rep was actually inserted. + - Update covered voxels and reset `latest_object_node_indices_`. +- Publishes viewpoint rep cloud + all covered points for visualization. + +### 4.5 Room and Door Transition Logic +**`SetCurrentRoomId()`** (room transitions) +- Uses current and previous robot voxels in `room_mask_`/`room_mask_old_`. +- If `transit_across_room_` or `found_object_`: room id updates immediately. +- If room id unchanged or uninitialized: update and return. +- If room id changes: + - Case 1: `room_mask_old_` at new position already differs from current room + -> robot already knew it was a new room -> set `enter_wrong_room_ = true`. + - Case 2: overlap between old/new masks determines: + - merge: if overlap > 80% -> update current room id. + - split: otherwise -> set `ask_vlm_change_room_ = true` and update. + +**`GoalPointCallback()`** +- Resolves `target_room_id_` from room mask at goal point. +- If goal room == current room and not transiting: reset room info. +- Else sets `transit_across_room_ = true`, stores goal, publishes `/room_anchor_point`. + +**`SetStartAndEndRoomId()`** +- Uses `misc_utils_ns::find_path_bfs` on `adjacency_matrix` to find a room path. +- Chooses the last two rooms in the path as `start_room_id_`/`end_room_id_`. +- Calls `SetRoomPosition(start_room_id_, end_room_id_)`. + +**`SetRoomPosition()`** +- Filters `door_cloud_` for door points between `start_room_id_` and `end_room_id_`. +- Uses door centroid as transition goal (`door_position_`). +- Computes `door_normal_` via `GetDoorNormal(...)`: + - Sweeps angles, validates by checking room ids along normal and reverse. + - Averages valid normals for stability. +- Publishes `/door_position` and `/door_normal` markers. +- Publishes chosen room boundary marker. + +**`CheckDoorCloudInRange()`** +- Finds door points within local planning horizon. +- For unlabeled rooms: + - Computes candidate viewpoint near the door. + - Marks viewpoint as room candidate + (`viewpoint_manager_->SetViewPointRoomCandidate`). + - Publishes `door_cloud_in_range_`. + +### 4.6 Room Type Query Pipeline +**`UpdateRoomLabel()`** +- Pulls covered points in range from `planning_env_`. 
+- For each room id: + - Accumulate voxel counts + per-room cloud + center estimate. +- For **unlabeled rooms**: + - Update room node voxel count and mark as labeled. + - Build a room cloud (points + room center). + - Project to panorama via `project_pcl_to_image(...)`. + - Set anchor point and save image in `RoomNodeRep`. + - Publish `/room_type_query` (RoomType image + mask + anchor). + - If room is large enough and unvisited, may call `ChangeRoomQuery(...)`. +- For **labeled rooms**: + - If voxel count increase or area growth threshold exceeded: + - Optionally re-project image. + - Publish updated `/room_type_query`. + - May call `ChangeRoomQuery(...)`. +- If `ask_vlm_change_room_` set -> `ChangeRoomQuery(prev_room_id_, current_room_id_, true)`. + +**`project_pcl_to_image(...)`** +- Transforms 3D points from world -> lidar -> camera frames. +- Projects to equirectangular panorama. +- Tracks horizontal pixel range for room points. +- Adds special projection for room center and rotates the panorama so the center + is in the middle. +- Crops the panorama to `[min_u, max_u]` with padding. + +**`RoomTypeCallback()`** +- Maps anchor point to room id via `room_mask_`. +- Accumulates label votes based on `voxel_num`. +- If label changes: clears `is_considered` on objects in the room. + +**`PublishRoomTypeVisualization()`** +- Renders labeled rooms with stable color (via `idToColor`) and room id text. + +### 4.7 Object and Anchor VLM Queries +**Target object flow** +- `CheckObjectFound()`: + - If target found: update distance + room id and set `ask_found_object_` if + within thresholds. + - Sets path goal to found object's room anchor point. + - If found in current room: `SetFoundTargetObject()`. +- If not found, iterate all objects: + - Must match target label, not considered, `is_asked_vlm_ == true`. + - Must have valid image path. + - Publish either: + - `/target_object_query` (no spatial condition), or + - `/target_object_spatial_query` (with viewpoint ids + bbox3d). + - Mark object as considered. + +**Anchor object flow** (dynamic mode) +- `CheckAnchorObjectFound()`: + - Similar gating as target, but uses room condition and anchor label. + - Publishes `/anchor_object_query`. + - If anchor found: + - Collects visible viewpoint positions to build search tour. + - If anchor in current room: `SetFoundAnchorObject()`. + +### 4.8 Main `execute()` Loop (High-Level Flow) +1. **Start gate**: waits for `/exploration_start` unless `kAutoStart`. +2. **First init**: + - Send initial waypoint. + - Set `start_time_`, `global_direction_switch_time_`. + - `initialized_ = true`. +3. **Every 1 Hz tick**: + - `ProcessObjectNodes()` (remove deleted objects). + - `CheckObjectFound()` and `CheckAnchorObjectFound()` if dynamic. + - Handle reset flag to clear coverage. +4. **On `keypose_cloud_update_`**: + - `UpdateRoomLabel()`. + - `SetCurrentRoomId()`. + - If transiting: re-run `GoalPointCallback()` and `SetStartAndEndRoomId()`. + - If `at_room_`: reset room info and resume local coverage. + - `CountDirectionChange()`. + - `UpdateObjectVisibility()`. + - `UpdateGlobalRepresentation()` (grid world + keypose graph). + - `UpdateViewpointRep()` and `PublishViewpointRoomIdMarkers()`. + - If new viewpoint rep: `UpdateViewpointObjectVisibility()`. + - `CreateVisibilityMarkers()`. + - `UpdateViewPoints()` -> candidate count check. + - `CheckDoorCloudInRange()`, `UpdateKeyposeGraph()`. + - Coverage updates + uncovered area. + - Global + local TSP planning. 
+ - Room finish handling: + - publish navigation query in advance if nearly finished. + - call `GetAnswer()` and mark room covered if finished. + - Build exploration path; compute lookahead; publish waypoint. + - Publish markers, runtime, room labels, object markers. + +### 4.9 Data Flow Summary (Text Graph) +- `/room_mask` + `/room_nodes_list` -> `RoomNodeRep` + room id grid -> + room labels + anchor points -> `/room_type_query` + `/room_navigation_query`. +- `/object_nodes` -> `ObjectNodeRep` -> visibility graph -> target/anchor queries. +- `/camera/image` + odom stacks -> `project_pcl_to_image` -> `RoomType` image. +- `/door_cloud` + room mask -> adjacency -> door/transition planning. +- `rolling_occupancy_grid` -> line-of-sight -> visibility + viewpoint rep updates. + +## 5) Migration Feasibility and Cleanup Plan +**Feasibility**: Yes, but the logic must be modularized to preserve real-time +performance and avoid tight coupling between perception and exploration. + +### 5.1 Proposed Package Layout +``` +vector_perception_ros/ +└── sensor_coverage_planner/ + ├── include/sensor_coverage_planner/ + │ ├── planner_core.h + │ ├── room_mask_manager.h + │ ├── room_query_builder.h + │ ├── object_visibility_manager.h + │ ├── room_transition_manager.h + │ └── representation/representation.h + ├── src/ + │ ├── planner_core.cpp + │ ├── room_mask_manager.cpp + │ ├── room_query_builder.cpp + │ ├── object_visibility_manager.cpp + │ ├── room_transition_manager.cpp + │ ├── representation/representation.cpp + │ └── sensor_coverage_planner_node.cpp + ├── room_segmentation/ + │ ├── room_segmentation_node.cpp + │ └── room_segmentation_utils.cpp + └── launch/ +``` + +### 5.2 Migration Steps (Incremental) +1. **Create new package** with same topic contracts as VLM_ROS. +2. **Port `representation/` first**, keep interfaces stable. +3. **Port room segmentation node** with exact topics: + - `/room_mask`, `/room_nodes_list`, `/door_cloud`. +4. **Split planner into managers**: + - `RoomMaskManager`: room id updates + split/merge logic. + - `RoomQueryBuilder`: room-type query generation + image projection. + - `RoomTransitionManager`: door traversal, BFS, pathing. + - `ObjectVisibilityManager`: visibility graph + ray-casting. +5. **Maintain message compatibility**: + - Use `semantic_mapping/msg/*` across planner and VLM nodes. +6. **Integrate with semantic mapping**: + - `/object_nodes` and `/room_type_answer` are consumed directly (no prefix). +7. **Validate on bag files** with topic parity checks. + +### 5.3 Cleanup Opportunities (Low Risk) +- Extract `project_pcl_to_image` into a standalone utility. +- Cache voxel indices for viewpoints and objects across frames. +- Gate ray-casting by range + coarse visibility before full line-of-sight. +- Centralize room-id lookup in one helper to avoid duplication. + +## 6) Risks and Compatibility Notes +- **Message package mismatch**: + - `tare_planner` vs `semantic_mapping` need explicit bridging. +- **Time alignment**: + - Image timestamp and odom interpolation must stay consistent. +- **Room resolution mismatch**: + - All room-related components must share `room_resolution` and voxel dims. +- **Visibility differences**: + - Changes in occupancy grid or ray tracing will affect VLM query triggers. +- **Dynamic mode**: + - Anchor-object TSP logic must be preserved if used in deployment. + +## 7) Suggested Next Actions +- Isolate `project_pcl_to_image` and add unit tests (math + crop bounds). +- Build `RoomMaskManager` from existing code. 
+- Add integration test for room-type query pipeline. +- Verify planner/VLM topics use `semantic_mapping/msg/*` directly (no bridge). diff --git a/semantic_mapping/launch/semantic_mapping.launch.py b/semantic_mapping/launch/semantic_mapping.launch.py new file mode 100644 index 0000000..01bff4b --- /dev/null +++ b/semantic_mapping/launch/semantic_mapping.launch.py @@ -0,0 +1,220 @@ +"""Launch semantic mapping and detect_anything with a mode selector.""" + +import os + +from ament_index_python.packages import get_package_share_directory +from launch import LaunchDescription +from launch.actions import DeclareLaunchArgument, OpaqueFunction +from launch.substitutions import LaunchConfiguration +from launch_ros.actions import Node + + +def _parse_bool(value: str, default: bool) -> bool: + if value == "": + return default + return value.lower() in ("true", "1", "yes") + + +def _launch_nodes(context): + mode = LaunchConfiguration("mode").perform(context).lower() + use_compressed_arg = LaunchConfiguration("use_compressed").perform(context) + odom_topic_arg = LaunchConfiguration("odom_topic").perform(context) + + use_sim_time = mode == "bagfile" + use_compressed = _parse_bool(use_compressed_arg, default=mode in ("bagfile", "sim")) + + if odom_topic_arg: + odom_topic = odom_topic_arg + else: + odom_topic = "/aft_mapped_to_init_incremental" if mode == "bagfile" else "/state_estimation" + + params_file = LaunchConfiguration("params_file").perform(context) + camera_param_path = LaunchConfiguration("camera_param_path").perform(context) + detection_topic = LaunchConfiguration("detection_topic").perform(context) + cloud_topic = LaunchConfiguration("cloud_topic").perform(context) + viewpoint_topic = LaunchConfiguration("viewpoint_topic").perform(context) + object_type_answer_topic = LaunchConfiguration("object_type_answer_topic").perform(context) + object_type_query_topic = LaunchConfiguration("object_type_query_topic").perform(context) + target_object_spatial_query_topic = LaunchConfiguration("target_object_spatial_query_topic").perform( + context + ) + target_object_instruction_topic = LaunchConfiguration("target_object_instruction_topic").perform( + context + ) + target_object_answer_topic = LaunchConfiguration("target_object_answer_topic").perform(context) + instruction_topic = LaunchConfiguration("instruction_topic").perform(context) + room_type_query_topic = LaunchConfiguration("room_type_query_topic").perform(context) + room_type_answer_topic = LaunchConfiguration("room_type_answer_topic").perform(context) + room_segmentation_params = LaunchConfiguration("room_segmentation_params").perform(context) + objects_yaml_path = LaunchConfiguration("objects_yaml_path").perform(context) + object_nodes_topic = LaunchConfiguration("object_nodes_topic").perform(context) + obj_points_topic = LaunchConfiguration("obj_points_topic").perform(context) + obj_boxes_topic = LaunchConfiguration("obj_boxes_topic").perform(context) + obj_labels_topic = LaunchConfiguration("obj_labels_topic").perform(context) + map_frame = LaunchConfiguration("map_frame").perform(context) + target_object = LaunchConfiguration("target_object").perform(context) + save_png = _parse_bool(LaunchConfiguration("save_png").perform(context), default=False) + output_dir = LaunchConfiguration("output_dir").perform(context) + image_topic = LaunchConfiguration("image_topic").perform(context) + compressed_image_topic = LaunchConfiguration("compressed_image_topic").perform(context) + conf = float(LaunchConfiguration("conf").perform(context)) + 
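    # Launch arguments are resolved to strings, so numeric values are cast to
    # float before being passed to the nodes as parameters.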
max_area_ratio = float(LaunchConfiguration("max_area_ratio").perform(context)) + + detect_anything_node = Node( + package="detect_anything", + executable="detection_node", + name="detect_anything_detection_node", + output="screen", + parameters=[ + { + "use_sim_time": use_sim_time, + "use_compressed": use_compressed, + "image_topic": image_topic, + "compressed_image_topic": compressed_image_topic, + "conf": conf, + "max_area_ratio": max_area_ratio, + } + ], + ) + + image_republisher_node = Node( + package="image_transport", + executable="republish", + name="camera_image_republisher", + output="screen", + remappings=[ + ("in/compressed", compressed_image_topic), + ("out", image_topic), + ], + parameters=[ + { + "use_sim_time": use_sim_time, + "in_transport": "compressed", + "out_transport": "raw", + } + ], + ) + + room_seg_params = [] + if room_segmentation_params: + room_seg_params.append(room_segmentation_params) + room_seg_params.append({"use_sim_time": use_sim_time}) + + room_segmentation_node = Node( + package="sensor_coverage", + executable="room_segmentation_node", + name="room_segmentation_node", + output="screen", + parameters=room_seg_params, + ) + + semantic_mapping_node = Node( + package="semantic_mapping", + executable="semantic_mapping_node", + name="semantic_mapping_node", + output="screen", + parameters=[ + params_file, + { + "use_sim_time": use_sim_time, + "camera_param_path": camera_param_path, + "detection_topic": detection_topic, + "cloud_topic": cloud_topic, + "odom_topic": odom_topic, + "viewpoint_topic": viewpoint_topic, + "object_type_answer_topic": object_type_answer_topic, + "object_type_query_topic": object_type_query_topic, + "target_object_spatial_query_topic": target_object_spatial_query_topic, + "target_object_instruction_topic": target_object_instruction_topic, + "object_nodes_topic": object_nodes_topic, + "obj_points_topic": obj_points_topic, + "obj_boxes_topic": obj_boxes_topic, + "obj_labels_topic": obj_labels_topic, + "map_frame": map_frame, + "target_object": target_object, + "save_png": save_png, + "output_dir": output_dir, + }, + ], + ) + + vlm_reasoning_node = Node( + package="semantic_mapping", + executable="vlm_reasoning_node", + name="vlm_reasoning_node", + output="screen", + parameters=[ + params_file, + { + "use_sim_time": use_sim_time, + "camera_param_path": camera_param_path, + "objects_yaml_path": objects_yaml_path, + "object_type_query_topic": object_type_query_topic, + "object_type_answer_topic": object_type_answer_topic, + "target_object_spatial_query_topic": target_object_spatial_query_topic, + "target_object_answer_topic": target_object_answer_topic, + "target_object_instruction_topic": target_object_instruction_topic, + "instruction_topic": instruction_topic, + "room_type_query_topic": room_type_query_topic, + "room_type_answer_topic": room_type_answer_topic, + } + ], + ) + + return [ + detect_anything_node, + image_republisher_node, + room_segmentation_node, + semantic_mapping_node, + vlm_reasoning_node, + ] + + +def generate_launch_description() -> LaunchDescription: + pkg_share = get_package_share_directory("semantic_mapping") + detect_anything_share = get_package_share_directory("detect_anything") + sensor_coverage_share = get_package_share_directory("sensor_coverage") + default_params = os.path.join(pkg_share, "config", "params.yaml") + default_camera_param = os.path.join(pkg_share, "config", "camera_param.yaml") + default_objects_yaml = os.path.join(detect_anything_share, "config", "objects.yaml") + default_room_seg_params = 
os.path.join( + sensor_coverage_share, "config", "room_segmentation_config.yaml" + ) + + return LaunchDescription( + [ + DeclareLaunchArgument("mode", default_value="sim"), + DeclareLaunchArgument("use_compressed", default_value=""), + DeclareLaunchArgument("params_file", default_value=default_params), + DeclareLaunchArgument("camera_param_path", default_value=default_camera_param), + DeclareLaunchArgument("detection_topic", default_value="/detection_result"), + DeclareLaunchArgument("cloud_topic", default_value="/registered_scan"), + DeclareLaunchArgument("odom_topic", default_value=""), + DeclareLaunchArgument("viewpoint_topic", default_value="/viewpoint_rep_header"), + DeclareLaunchArgument("object_type_answer_topic", default_value="/object_type_answer"), + DeclareLaunchArgument("object_type_query_topic", default_value="/object_type_query"), + DeclareLaunchArgument( + "target_object_spatial_query_topic", default_value="/target_object_spatial_query" + ), + DeclareLaunchArgument("target_object_instruction_topic", default_value="/target_object_instruction"), + DeclareLaunchArgument("target_object_answer_topic", default_value="/target_object_answer"), + DeclareLaunchArgument("instruction_topic", default_value="/keyboard_input"), + DeclareLaunchArgument("room_type_query_topic", default_value="/room_type_query"), + DeclareLaunchArgument("room_type_answer_topic", default_value="/room_type_answer"), + DeclareLaunchArgument("room_segmentation_params", default_value=default_room_seg_params), + DeclareLaunchArgument("objects_yaml_path", default_value=default_objects_yaml), + DeclareLaunchArgument("object_nodes_topic", default_value="/object_nodes"), + DeclareLaunchArgument("obj_points_topic", default_value="/obj_points"), + DeclareLaunchArgument("obj_boxes_topic", default_value="/obj_boxes"), + DeclareLaunchArgument("obj_labels_topic", default_value="/obj_labels"), + DeclareLaunchArgument("map_frame", default_value="map"), + DeclareLaunchArgument("target_object", default_value="refrigerator"), + DeclareLaunchArgument("save_png", default_value="false"), + DeclareLaunchArgument("output_dir", default_value="output/object_images"), + DeclareLaunchArgument("image_topic", default_value="/camera/image"), + DeclareLaunchArgument("compressed_image_topic", default_value="/camera/image/compressed"), + DeclareLaunchArgument("conf", default_value="0.35"), + DeclareLaunchArgument("max_area_ratio", default_value="0.3"), + OpaqueFunction(function=_launch_nodes), + ] + ) diff --git a/semantic_mapping/msg/DetectionResult.msg b/semantic_mapping/msg/DetectionResult.msg new file mode 100644 index 0000000..0f324ad --- /dev/null +++ b/semantic_mapping/msg/DetectionResult.msg @@ -0,0 +1,9 @@ +std_msgs/Header header +int32[] track_id +float32[] x1 +float32[] y1 +float32[] x2 +float32[] y2 +string[] label +float32[] confidence +sensor_msgs/Image image \ No newline at end of file diff --git a/semantic_mapping/msg/NavigationQuery.msg b/semantic_mapping/msg/NavigationQuery.msg new file mode 100644 index 0000000..5744672 --- /dev/null +++ b/semantic_mapping/msg/NavigationQuery.msg @@ -0,0 +1,3 @@ +string json +sensor_msgs/Image[] images +geometry_msgs/Point[] anchor_points \ No newline at end of file diff --git a/semantic_mapping/msg/ObjectNode.msg b/semantic_mapping/msg/ObjectNode.msg new file mode 100644 index 0000000..b2d7686 --- /dev/null +++ b/semantic_mapping/msg/ObjectNode.msg @@ -0,0 +1,35 @@ +# Semantic object representation for planner integration. 
+# +# Each message represents a single object in the semantic map with its +# 3D bounding box, point cloud, and metadata for VLM interaction. + +std_msgs/Header header + +# Object identity - may contain multiple merged track IDs +int32[] object_id + +# Dominant class label (by weighted vote) +string label + +# Object centroid in world frame +geometry_msgs/Point position + +# 3D bounding box corners (8 points in world frame) +# Order: [min_x,min_y,min_z], [max_x,min_y,min_z], [max_x,max_y,min_z], [min_x,max_y,min_z], +# [min_x,min_y,max_z], [max_x,min_y,max_z], [max_x,max_y,max_z], [min_x,max_y,max_z] +geometry_msgs/Point[8] bbox3d + +# Object point cloud in world frame +sensor_msgs/PointCloud2 cloud + +# Lifecycle status: "new", "updated", "unchanged" +string status + +# Path to best saved image crop of this object +string img_path + +# VLM interaction flags +bool is_asked_vlm + +# Associated viewpoint ID (for viewpoint-triggered saves) +int32 viewpoint_id diff --git a/semantic_mapping/msg/ObjectType.msg b/semantic_mapping/msg/ObjectType.msg new file mode 100644 index 0000000..ab02ea6 --- /dev/null +++ b/semantic_mapping/msg/ObjectType.msg @@ -0,0 +1,17 @@ +# VLM query/answer message for object relabeling. +# +# Used bidirectionally: +# - Query (mapping -> VLM): object_id, img_path, labels[] populated +# - Answer (VLM -> mapping): final_label populated with VLM decision + +# Object ID to query/update +int32 object_id + +# Path to object image for VLM analysis +string img_path + +# VLM-determined final label (populated in answer) +string final_label + +# Candidate labels from detection (populated in query) +string[] labels diff --git a/semantic_mapping/msg/RoomEarlyStop1.msg b/semantic_mapping/msg/RoomEarlyStop1.msg new file mode 100644 index 0000000..473c601 --- /dev/null +++ b/semantic_mapping/msg/RoomEarlyStop1.msg @@ -0,0 +1,10 @@ +int32 room_id_1 +int32 room_id_2 +geometry_msgs/Point anchor_point_1 +geometry_msgs/Point anchor_point_2 +sensor_msgs/Image image_1 +sensor_msgs/Image image_2 +string room_type_1 +string room_type_2 +bool early_stop_1 +bool enter_wrong_room \ No newline at end of file diff --git a/semantic_mapping/msg/RoomNode.msg b/semantic_mapping/msg/RoomNode.msg new file mode 100644 index 0000000..e406bdc --- /dev/null +++ b/semantic_mapping/msg/RoomNode.msg @@ -0,0 +1,8 @@ +int32 id +int32 show_id +geometry_msgs/PolygonStamped polygon +geometry_msgs/Point centroid +int32[] neighbors +bool is_connected +float32 area +sensor_msgs/Image room_mask diff --git a/semantic_mapping/msg/RoomNodeList.msg b/semantic_mapping/msg/RoomNodeList.msg new file mode 100644 index 0000000..8a9a713 --- /dev/null +++ b/semantic_mapping/msg/RoomNodeList.msg @@ -0,0 +1,2 @@ +std_msgs/Header header +semantic_mapping/RoomNode[] nodes diff --git a/semantic_mapping/msg/RoomType.msg b/semantic_mapping/msg/RoomType.msg new file mode 100644 index 0000000..aa6ff3c --- /dev/null +++ b/semantic_mapping/msg/RoomType.msg @@ -0,0 +1,8 @@ +std_msgs/Header header +geometry_msgs/Point anchor_point +sensor_msgs/Image image +sensor_msgs/Image room_mask +string room_type +int32 room_id +int32 voxel_num +bool in_room diff --git a/semantic_mapping/msg/TargetObject.msg b/semantic_mapping/msg/TargetObject.msg new file mode 100644 index 0000000..55c60e4 --- /dev/null +++ b/semantic_mapping/msg/TargetObject.msg @@ -0,0 +1,6 @@ +std_msgs/Header header +int32 object_id +string object_label +string img_path +string room_label +bool is_target \ No newline at end of file diff --git 
a/semantic_mapping/msg/TargetObjectInstruction.msg b/semantic_mapping/msg/TargetObjectInstruction.msg new file mode 100644 index 0000000..1a88005 --- /dev/null +++ b/semantic_mapping/msg/TargetObjectInstruction.msg @@ -0,0 +1,22 @@ +# User instruction for target object search. +# +# Specifies the target object to find and optional spatial/attribute +# conditions relative to an anchor object. + +# Primary target object label to search for +string target_object + +# Room condition (e.g., "in the kitchen") +string room_condition + +# Spatial condition relative to anchor (e.g., "next to", "on top of") +string spatial_condition + +# Attribute condition for target (e.g., "red", "large") +string attribute_condition + +# Anchor object for spatial reference +string anchor_object + +# Attribute condition for anchor object +string attribute_condition_anchor diff --git a/semantic_mapping/msg/TargetObjectWithSpatial.msg b/semantic_mapping/msg/TargetObjectWithSpatial.msg new file mode 100644 index 0000000..cc0c8d7 --- /dev/null +++ b/semantic_mapping/msg/TargetObjectWithSpatial.msg @@ -0,0 +1,8 @@ +std_msgs/Header header +int32 object_id +string object_label +string img_path +int32[] viewpoint_ids +string room_label +bool is_target +geometry_msgs/Point[8] bbox3d diff --git a/semantic_mapping/msg/ViewpointRep.msg b/semantic_mapping/msg/ViewpointRep.msg new file mode 100644 index 0000000..5bb1bfe --- /dev/null +++ b/semantic_mapping/msg/ViewpointRep.msg @@ -0,0 +1,8 @@ +# Viewpoint trigger message for image saving. +# +# Published by planner to request saving images at specific viewpoints. + +std_msgs/Header header + +# Unique viewpoint identifier +int32 viewpoint_id diff --git a/semantic_mapping/msg/VlmAnswer.msg b/semantic_mapping/msg/VlmAnswer.msg new file mode 100644 index 0000000..8ec7a30 --- /dev/null +++ b/semantic_mapping/msg/VlmAnswer.msg @@ -0,0 +1,4 @@ +int32 room_id +geometry_msgs/Point anchor_point +# 0 for vlm navigation query, 1 for early stop 1 +int32 answer_type \ No newline at end of file diff --git a/semantic_mapping/package.xml b/semantic_mapping/package.xml new file mode 100644 index 0000000..4804d1a --- /dev/null +++ b/semantic_mapping/package.xml @@ -0,0 +1,29 @@ + + + + semantic_mapping + 0.0.1 + Semantic 3D object mapping node with VLM integration. 
+ Alex Lin + MIT + + ament_cmake + + rosidl_default_generators + rosidl_default_runtime + rosidl_interface_packages + + rclpy + std_msgs + sensor_msgs + geometry_msgs + nav_msgs + visualization_msgs + cv_bridge + detect_anything + vector_perception_utils + + + ament_cmake + + diff --git a/semantic_mapping/resource/semantic_mapping b/semantic_mapping/resource/semantic_mapping new file mode 100644 index 0000000..e69de29 diff --git a/semantic_mapping/scripts/semantic_mapping_node b/semantic_mapping/scripts/semantic_mapping_node new file mode 100755 index 0000000..0312e88 --- /dev/null +++ b/semantic_mapping/scripts/semantic_mapping_node @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 +"""Entry point for semantic_mapping_node.""" + +from semantic_mapping.mapping_node import main + +if __name__ == "__main__": + main() diff --git a/semantic_mapping/scripts/vlm_reasoning_node b/semantic_mapping/scripts/vlm_reasoning_node new file mode 100755 index 0000000..645540f --- /dev/null +++ b/semantic_mapping/scripts/vlm_reasoning_node @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 +"""Entry point for vlm_reasoning_node.""" + +from semantic_mapping.vlm_reasoning_node import main + +if __name__ == "__main__": + main() diff --git a/semantic_mapping/semantic_mapping/__init__.py b/semantic_mapping/semantic_mapping/__init__.py new file mode 100644 index 0000000..8df6f20 --- /dev/null +++ b/semantic_mapping/semantic_mapping/__init__.py @@ -0,0 +1 @@ +"""Semantic 3D object mapping for vector_perception_ros.""" diff --git a/semantic_mapping/semantic_mapping/map_object.py b/semantic_mapping/semantic_mapping/map_object.py new file mode 100644 index 0000000..da143a9 --- /dev/null +++ b/semantic_mapping/semantic_mapping/map_object.py @@ -0,0 +1,713 @@ +"""Data models for semantic mapping. + +This module defines the core data structures for the semantic mapping system: +- VoxelFeatureManager: Manages voxelized point clouds with observation angle voting +- MapObject: Represents a tracked object in the semantic map +- Observation: A single detection observation +- ObjectStatus: Enum for object lifecycle status +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import Enum +from typing import Optional + +import numpy as np +from scipy.spatial import cKDTree + +from vector_perception_utils.transform_utils import discretize_angles, normalize_angle +from vector_perception_utils.pointcloud_utils import ( + compute_centroid, + create_pointcloud, +) + + +class ObjectStatus(Enum): + """Object lifecycle status (ObjectDB-style tiering).""" + + PENDING = "pending" # < min_detections threshold + PERMANENT = "permanent" # >= threshold, confirmed object + DELETED = "deleted" # Marked for removal + + +@dataclass +class Observation: + """A single detection observation to update a MapObject. + + Attributes: + track_id: Tracker ID from detection. + label: Class label string. + confidence: Detection confidence score. + points: World-frame point cloud (N, 3). + robot_R: Robot rotation matrix (3, 3) at observation time. + robot_t: Robot translation vector (3,) at observation time. + mask: Optional 2D segmentation mask. + image: Optional RGB image crop for best-image selection. + timestamp: Observation timestamp in seconds. 
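    Example:
        A minimal construction sketch with synthetic values (all numbers
        here are illustrative, not taken from the mapping pipeline)::

            import numpy as np

            obs = Observation(
                track_id=7,
                label="chair",
                confidence=0.82,
                points=np.random.rand(500, 3),      # world-frame points (N, 3)
                robot_R=np.eye(3),                   # robot orientation at observation time
                robot_t=np.array([1.0, 0.5, 0.0]),   # robot position at observation time
                timestamp=1234.5,
            )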
+ """ + + track_id: int + label: str + confidence: float + points: np.ndarray + robot_R: np.ndarray + robot_t: np.ndarray + mask: Optional[np.ndarray] = None + image: Optional[np.ndarray] = None + timestamp: float = 0.0 + + +class VoxelFeatureManager: + """Manages voxelized point cloud with observation angle voting. + + This class tracks 3D voxels representing an object, along with: + - Vote counts per voxel (how many times each voxel was observed) + - Observation angles (which directions each voxel was seen from) + - Regularization mask (which voxels pass shape filtering) + + The observation angle voting helps distinguish between reliable points + (seen from multiple angles) and spurious points (seen from only one angle). + + Attributes: + voxels: Point coordinates (N, 3) in world frame. + votes: Detection count per voxel (N,). + observation_angles: Binary angle histogram per voxel (N, num_bins). + regularized_mask: Boolean mask for valid voxels after clustering (N,). + voxel_size: Size of voxel grid cells in meters. + num_angle_bins: Number of discretized angle bins. + """ + + def __init__( + self, + points: np.ndarray, + voxel_size: float, + robot_R: np.ndarray, + robot_t: np.ndarray, + num_angle_bins: int = 15, + ) -> None: + """Initialize VoxelFeatureManager with initial points. + + Args: + points: Initial point cloud (N, 3) in world frame. + voxel_size: Voxel grid cell size in meters. + robot_R: Robot rotation matrix (3, 3) at observation time. + robot_t: Robot translation vector (3,) at observation time. + num_angle_bins: Number of angle bins for observation tracking. + """ + num_points = points.shape[0] + + # Compute observation angles from robot position + vectors = points - robot_t + obs_angles = np.arctan2(vectors[:, 1], vectors[:, 0]) + obs_bins = discretize_angles(obs_angles, num_angle_bins) + + self.voxels = points.copy() + self.voxel_size = voxel_size + self.num_angle_bins = num_angle_bins + self._tree = cKDTree(points) + + # Initialize vote counts + self.votes = np.ones(num_points, dtype=np.float32) + + # Initialize observation angle histogram (binary: seen/not seen from each angle) + self.observation_angles = np.zeros((num_points, num_angle_bins), dtype=np.uint8) + self.observation_angles[np.arange(num_points), obs_bins] = 1 + + # Regularization mask (all False until regularize_shape is called) + self.regularized_mask = np.zeros(num_points, dtype=bool) + + # Remove vote for dynamic object clearing (not commonly used) + self._remove_votes = np.zeros(num_points, dtype=np.int32) + + def update( + self, + new_points: np.ndarray, + robot_R: np.ndarray, + robot_t: np.ndarray, + ) -> None: + """Add new points with KDTree-based merge and angle voting. + + Points within voxel_size of existing points are merged (votes incremented). + Points farther away are added as new voxels. + + Args: + new_points: New point cloud (M, 3) in world frame. + robot_R: Robot rotation matrix (3, 3) at observation time. + robot_t: Robot translation vector (3,) at observation time. 
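        Example:
            A small sketch of the merge behavior with made-up coordinates:
            a point closer than ``voxel_size`` to an existing voxel only
            increments that voxel's vote, while a farther point is appended
            as a new voxel::

                import numpy as np

                mgr = VoxelFeatureManager(
                    points=np.array([[0.0, 0.0, 0.0]]),
                    voxel_size=0.05,
                    robot_R=np.eye(3),
                    robot_t=np.array([-1.0, 0.0, 0.0]),
                )
                mgr.update(
                    np.array([[0.01, 0.0, 0.0],    # merges: distance 0.01 < 0.05
                              [1.00, 0.0, 0.0]]),  # appended as a new voxel
                    robot_R=np.eye(3),
                    robot_t=np.array([-1.0, 0.0, 0.0]),
                )
                # mgr.num_voxels == 2 and mgr.votes[0] == 2.0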
+ """ + if new_points.shape[0] == 0: + return + + # Compute observation angles for new points + vectors = new_points - robot_t + obs_angles = np.arctan2(vectors[:, 1], vectors[:, 0]) + obs_bins = discretize_angles(obs_angles, self.num_angle_bins) + + # Find nearest existing voxels + distances, indices = self._tree.query(new_points) + + # Merge points within voxel_size + merge_mask = distances < self.voxel_size + merge_indices = indices[merge_mask] + merge_bins = obs_bins[merge_mask] + + self.votes[merge_indices] += 1 + self.observation_angles[merge_indices, merge_bins] = 1 + + # Add new points that don't merge + new_mask = ~merge_mask + if np.any(new_mask): + new_voxels = new_points[new_mask] + new_bins = obs_bins[new_mask] + num_new = new_voxels.shape[0] + + # Extend arrays + self.voxels = np.vstack([self.voxels, new_voxels]) + self.votes = np.concatenate([self.votes, np.ones(num_new, dtype=np.float32)]) + + new_obs = np.zeros((num_new, self.num_angle_bins), dtype=np.uint8) + new_obs[np.arange(num_new), new_bins] = 1 + self.observation_angles = np.vstack([self.observation_angles, new_obs]) + + self.regularized_mask = np.concatenate( + [self.regularized_mask, np.zeros(num_new, dtype=bool)] + ) + self._remove_votes = np.concatenate( + [self._remove_votes, np.zeros(num_new, dtype=np.int32)] + ) + + # Rebuild KDTree + self._tree = cKDTree(self.voxels) + + def update_observation_angles(self, voxel_indices: np.ndarray, obs_bins: np.ndarray) -> None: + """Update observation angle histogram for selected voxels.""" + if voxel_indices.size == 0: + return + self.observation_angles[voxel_indices, obs_bins] = 1 + + def merge_from(self, other: VoxelFeatureManager) -> None: + """Merge voxels from another VoxelFeatureManager. + + Args: + other: Another VoxelFeatureManager to merge from. + """ + if other.voxels.shape[0] == 0: + return + + distances, indices = self._tree.query(other.voxels) + + # Merge overlapping voxels + merge_mask = distances < self.voxel_size + merge_indices = indices[merge_mask] + + self.votes[merge_indices] += other.votes[merge_mask] + self.observation_angles[merge_indices] = np.logical_or( + self.observation_angles[merge_indices], + other.observation_angles[merge_mask], + ).astype(np.uint8) + + # Add non-overlapping voxels + new_mask = ~merge_mask + if np.any(new_mask): + self.voxels = np.vstack([self.voxels, other.voxels[new_mask]]) + self.votes = np.concatenate([self.votes, other.votes[new_mask]]) + self.observation_angles = np.vstack( + [self.observation_angles, other.observation_angles[new_mask]] + ) + self.regularized_mask = np.concatenate( + [self.regularized_mask, other.regularized_mask[new_mask]] + ) + self._remove_votes = np.concatenate( + [self._remove_votes, np.zeros(np.sum(new_mask), dtype=np.int32)] + ) + self._tree = cKDTree(self.voxels) + + def get_valid_indices( + self, + diversity_percentile: float = 0.3, + use_regularized: bool = True, + ) -> np.ndarray: + """Get indices of valid voxels based on observation angle diversity. + + Voxels are ranked by their observation angle diversity (number of angles + from which they were observed), weighted by vote count. The top percentile + of voxels are returned. + + Args: + diversity_percentile: Fraction of total weight to include (0-1). + use_regularized: If True, only consider voxels in regularized_mask. + + Returns: + Indices of valid voxels. 
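        Example:
            A worked sketch of the cutoff with illustrative numbers: for
            weights ``diversity * votes`` of [1, 2, 3, 4] (sorted ascending)
            and ``diversity_percentile=0.3``, the target weight is
            0.7 * 10 = 7, the cumulative sums are [1, 3, 6, 10], and
            ``np.searchsorted`` lands at index 3, so only the single
            highest-weight voxel is returned::

                import numpy as np

                weighted = np.array([1.0, 2.0, 3.0, 4.0])
                cumsum = np.cumsum(weighted)                  # [1, 3, 6, 10]
                cutoff_idx = np.searchsorted(cumsum, 0.7 * weighted.sum())
                kept_positions = np.arange(len(weighted))[cutoff_idx:]  # [3]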
+ """ + if use_regularized: + mask = self.regularized_mask + if not np.any(mask): + return np.array([], dtype=int) + obs_angles = self.observation_angles[mask] + votes = self.votes[mask] + else: + obs_angles = self.observation_angles + votes = self.votes + + if len(obs_angles) == 0: + return np.array([], dtype=int) + + # Compute diversity score (number of angles observed) + angle_diversity = np.sum(obs_angles, axis=1) + + # Sort by diversity (ascending) + sorted_indices = np.argsort(angle_diversity) + sorted_diversity = angle_diversity[sorted_indices] + sorted_weights = votes[sorted_indices] + + # Weight by diversity * votes + weighted = sorted_diversity * sorted_weights + total_weight = np.sum(weighted) + + if total_weight == 0: + return np.array([], dtype=int) + + # Find cutoff index for percentile + target_weight = (1 - diversity_percentile) * total_weight + cumsum = np.cumsum(weighted) + cutoff_idx = np.searchsorted(cumsum, target_weight) + + # Return indices above cutoff (higher diversity) + return sorted_indices[cutoff_idx:] + + def get_valid_voxels( + self, + diversity_percentile: float = 0.3, + use_regularized: bool = True, + ) -> np.ndarray: + """Get valid voxel coordinates. + + Args: + diversity_percentile: Fraction of total weight to include (0-1). + use_regularized: If True, only consider voxels in regularized_mask. + + Returns: + Valid voxel coordinates (M, 3). + """ + indices = self.get_valid_indices(diversity_percentile, use_regularized) + if use_regularized and np.any(self.regularized_mask): + voxels = self.voxels[self.regularized_mask] + else: + voxels = self.voxels + + if len(indices) == 0: + return np.empty((0, 3), dtype=np.float32) + return voxels[indices] + + @property + def num_voxels(self) -> int: + """Total number of voxels.""" + return self.voxels.shape[0] + + @property + def num_regularized(self) -> int: + """Number of voxels passing regularization.""" + return int(np.sum(self.regularized_mask)) + + +@dataclass +class MapObject: + """Represents a tracked object in the semantic map. + + This class combines patterns from VLM_ROS SingleObject and dimos Object, + with ObjectDB-style lifecycle management. + + Attributes: + object_id: Unique identifier for this object (UUID string). + track_ids: List of associated tracker IDs (can merge multiple). + voxel_manager: Manages the object's voxelized point cloud. + class_votes: Label -> observation count mapping. + confidence_scores: Label -> best confidence score mapping. + points_count: Label -> total points count mapping. + robot_poses: List of robot poses when object was observed. + detection_count: Total number of detections. + inactive_frames: Frames since last update. + last_update_time: Timestamp of last update. + status: Object lifecycle status. + is_asked_vlm: Whether VLM has been queried for this object. + updated_by_vlm: Whether object was relabeled by VLM. + best_image_path: Path to best saved image crop. + best_image_mask_area: Mask area of the best saved image crop. + centroid: Cached centroid position (None if needs recompute). + bbox3d_corners: Cached 3D bbox corners (None if needs recompute). 
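    Example:
        A minimal lifecycle sketch with synthetic data (the object id and
        point cloud below are illustrative only)::

            import numpy as np

            obs = Observation(
                track_id=3,
                label="chair",
                confidence=0.9,
                points=np.random.rand(200, 3),
                robot_R=np.eye(3),
                robot_t=np.zeros(3),
                timestamp=0.0,
            )
            obj = MapObject.from_observation(object_id="obj-0001", obs=obs)
            obj.regularize_shape()
            print(obj.label, obj.detection_count, obj.centroid)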
+ """ + + object_id: str + track_ids: list[int] + voxel_manager: VoxelFeatureManager + + # Classification (voted) + class_votes: dict[str, int] = field(default_factory=dict) + confidence_scores: dict[str, float] = field(default_factory=dict) + points_count: dict[str, int] = field(default_factory=dict) + + # Robot poses at observation times + robot_poses: list[dict] = field(default_factory=list) + + # Lifecycle (ObjectDB-style) + detection_count: int = 1 + inactive_frames: int = 0 + last_update_time: float = 0.0 + status: ObjectStatus = ObjectStatus.PENDING + + # VLM interaction + is_asked_vlm: bool = False + updated_by_vlm: bool = False + + # Image storage + best_image_path: Optional[str] = None + best_image_mask_area: float = 0.0 + + # Cached geometry (None means needs recompute) + _centroid: Optional[np.ndarray] = field(default=None, repr=False) + _bbox3d_corners: Optional[np.ndarray] = field(default=None, repr=False) + _needs_recompute: bool = field(default=True, repr=False) + + # Configuration + _diversity_percentile: float = field(default=0.85, repr=False) + _angle_threshold: float = field(default=np.deg2rad(5), repr=False) + _distance_threshold: float = field(default=0.3, repr=False) + _needs_regularization: bool = field(default=True, repr=False) + _regularize_voxel_size: float = field(default=0.06, repr=False) + + @classmethod + def from_observation( + cls, + object_id: str, + obs: Observation, + voxel_size: float = 0.03, + regularize_voxel_size: float = 0.06, + num_angle_bins: int = 15, + ) -> MapObject: + """Create a new MapObject from an initial observation. + + Args: + object_id: Unique identifier for this object. + obs: Initial observation. + voxel_size: Voxel grid cell size in meters. + regularize_voxel_size: Voxel size for regularization clustering. + num_angle_bins: Number of angle bins for observation tracking. + + Returns: + New MapObject instance. + """ + voxel_manager = VoxelFeatureManager( + points=obs.points, + voxel_size=voxel_size, + robot_R=obs.robot_R, + robot_t=obs.robot_t, + num_angle_bins=num_angle_bins, + ) + + obj = cls( + object_id=object_id, + track_ids=[obs.track_id], + voxel_manager=voxel_manager, + class_votes={obs.label: 1}, + confidence_scores={obs.label: obs.confidence}, + points_count={obs.label: len(obs.points)}, + robot_poses=[{"R": obs.robot_R.copy(), "t": obs.robot_t.copy()}], + last_update_time=obs.timestamp, + _regularize_voxel_size=regularize_voxel_size, + ) + return obj + + def _invalidate_cache(self) -> None: + """Mark cached geometry as needing recompute.""" + self._needs_recompute = True + self._centroid = None + self._bbox3d_corners = None + + def _is_similar_pose(self, new_R: np.ndarray, new_t: np.ndarray) -> bool: + """Check if a new pose is similar to existing poses. + + Used to avoid redundant updates from the same viewpoint. + Compares observation angle and distance to object center. + + Args: + new_R: New robot rotation matrix (3, 3). Not used but kept for API. + new_t: New robot translation vector (3,). + + Returns: + True if pose is similar to an existing pose. 
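        Example:
            A numeric sketch of the check with the default thresholds
            (5 degrees and 0.3 m) and made-up positions: a small lateral
            shift barely changes the bearing and range to the centroid, so
            the pose counts as already seen::

                import numpy as np

                centroid = np.array([2.0, 0.0, 0.5])
                old_t = np.array([0.0, 0.0, 0.0])
                new_t = np.array([0.0, 0.05, 0.0])
                old_vec, new_vec = centroid - old_t, centroid - new_t
                angle_diff = abs(
                    np.arctan2(new_vec[1], new_vec[0]) - np.arctan2(old_vec[1], old_vec[0])
                )                                                         # ~1.4 degrees
                dist_diff = abs(np.linalg.norm(new_vec) - np.linalg.norm(old_vec))
                similar = angle_diff < np.deg2rad(5) and dist_diff < 0.3  # True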
+ """ + _ = new_R # Unused but kept for API consistency + obj_center = self.centroid + if obj_center is None: + obj_center = compute_centroid(self.voxel_manager.voxels) + + for pose in self.robot_poses: + old_t = pose["t"] # R not used for similarity check + + old_to_obj = obj_center - old_t + new_to_obj = obj_center - new_t + + old_angle = np.arctan2(old_to_obj[1], old_to_obj[0]) + new_angle = np.arctan2(new_to_obj[1], new_to_obj[0]) + angle_diff = abs(normalize_angle(np.array([new_angle - old_angle]))[0]) + + old_dist = np.linalg.norm(old_to_obj) + new_dist = np.linalg.norm(new_to_obj) + dist_diff = abs(new_dist - old_dist) + + if angle_diff < self._angle_threshold and dist_diff < self._distance_threshold: + return True + + return False + + def update(self, obs: Observation) -> None: + """Update object with a new observation. + + Args: + obs: New observation to incorporate. + """ + # Update voxel manager + self.voxel_manager.update(obs.points, obs.robot_R, obs.robot_t) + self.detection_count += 1 + self.inactive_frames = 0 + self.last_update_time = obs.timestamp + self._needs_regularization = True + + # Check if pose is new (not similar to existing) + is_similar = self._is_similar_pose(obs.robot_R, obs.robot_t) + + if not is_similar: + # Update class statistics + self.class_votes[obs.label] = self.class_votes.get(obs.label, 0) + 1 + self.confidence_scores[obs.label] = max( + self.confidence_scores.get(obs.label, 0), obs.confidence + ) + self.points_count[obs.label] = ( + self.points_count.get(obs.label, 0) + len(obs.points) + ) + + # Add pose + self.robot_poses.append({"R": obs.robot_R.copy(), "t": obs.robot_t.copy()}) + + # Add track ID if new + if obs.track_id not in self.track_ids: + self.track_ids.append(obs.track_id) + + self._invalidate_cache() + + def merge(self, other: MapObject) -> None: + """Merge another MapObject into this one. + + Args: + other: Object to merge from. + """ + # Merge track IDs + for tid in other.track_ids: + if tid not in self.track_ids: + self.track_ids.append(tid) + + # Merge voxel managers + self.voxel_manager.merge_from(other.voxel_manager) + self._needs_regularization = True + + # Merge class statistics + for label, count in other.class_votes.items(): + self.class_votes[label] = self.class_votes.get(label, 0) + count + self.confidence_scores[label] = max( + self.confidence_scores.get(label, 0), + other.confidence_scores.get(label, 0), + ) + self.points_count[label] = ( + self.points_count.get(label, 0) + other.points_count.get(label, 0) + ) + + # Merge poses (only unique ones) + for pose in other.robot_poses: + if not self._is_similar_pose(pose["R"], pose["t"]): + self.robot_poses.append(pose) + + # Update lifecycle + self.detection_count += other.detection_count + self.last_update_time = max(self.last_update_time, other.last_update_time) + self.is_asked_vlm = self.is_asked_vlm or other.is_asked_vlm + + self._invalidate_cache() + + @property + def label(self) -> str: + """Get dominant class label by weighted vote. + + Returns: + Most likely class label based on points_count * confidence. 
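        Example:
            A tiny worked case with illustrative counts: a label with fewer
            points but higher confidence can still win the weighted vote::

                points_count = {"chair": 800, "sofa": 1500}
                confidence_scores = {"chair": 0.9, "sofa": 0.4}
                scores = {k: points_count[k] * confidence_scores[k] for k in points_count}
                # {"chair": 720.0, "sofa": 600.0} -> dominant label is "chair"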
+ """ + if not self.class_votes: + return "unknown" + + # Compute weighted scores + weighted_scores = {} + for lbl in self.class_votes: + pts = self.points_count.get(lbl, 0) + conf = self.confidence_scores.get(lbl, 0) + weighted_scores[lbl] = pts * conf + + return max(weighted_scores, key=weighted_scores.get) + + @property + def confidence(self) -> float: + """Get confidence of the dominant label.""" + return self.confidence_scores.get(self.label, 0.0) + + @property + def centroid(self) -> Optional[np.ndarray]: + """Compute centroid weighted by observation angles. + + Returns: + Centroid position (3,) or None if no valid voxels. + """ + if self._needs_recompute or self._centroid is None: + self._compute_geometry() + return self._centroid + + @property + def bbox3d_corners(self) -> Optional[np.ndarray]: + """Compute axis-aligned 3D bounding box corners. + + Returns: + 8 corner points (8, 3) or None if no valid voxels. + """ + if self._needs_recompute or self._bbox3d_corners is None: + self._compute_geometry() + return self._bbox3d_corners + + def _compute_geometry(self) -> None: + """Compute cached centroid and bbox from valid voxels using Open3D. + + Returns: + None. Updates cached centroid and bbox in-place. + """ + # Get valid voxels (regularized first, then fallback to all) + valid_voxels = self.voxel_manager.get_valid_voxels( + diversity_percentile=self._diversity_percentile, + use_regularized=True, + ) + if len(valid_voxels) == 0: + valid_voxels = self.voxel_manager.get_valid_voxels( + diversity_percentile=self._diversity_percentile, + use_regularized=False, + ) + + if len(valid_voxels) < 4: + self._centroid = compute_centroid(valid_voxels) if len(valid_voxels) > 0 else None + self._bbox3d_corners = None + else: + pcd = create_pointcloud(valid_voxels) + try: + obb = pcd.get_minimal_oriented_bounding_box() + except RuntimeError: + # Points are coplanar / degenerate — fall back to axis-aligned bbox. 
+ obb = pcd.get_axis_aligned_bounding_box() + self._centroid = np.asarray(obb.get_center()) + self._bbox3d_corners = np.asarray(obb.get_box_points()) + self._needs_recompute = False + return + self._centroid = np.asarray(obb.center) + half_extent = 0.5 * np.asarray(obb.extent) + local_corners = np.array( + [ + [-half_extent[0], -half_extent[1], -half_extent[2]], + [half_extent[0], -half_extent[1], -half_extent[2]], + [half_extent[0], half_extent[1], -half_extent[2]], + [-half_extent[0], half_extent[1], -half_extent[2]], + [-half_extent[0], -half_extent[1], half_extent[2]], + [half_extent[0], -half_extent[1], half_extent[2]], + [half_extent[0], half_extent[1], half_extent[2]], + [-half_extent[0], half_extent[1], half_extent[2]], + ], + dtype=float, + ) + self._bbox3d_corners = local_corners @ np.asarray(obb.R).T + np.asarray(obb.center) + + self._needs_recompute = False + + def regularize_shape(self, percentile: float = 0.85) -> None: + """Apply vote-weighted regularization with light outlier removal.""" + if not self._needs_regularization: + return + voxels = self.voxel_manager.voxels + if len(voxels) < 3: + self.voxel_manager.regularized_mask = np.ones(len(voxels), dtype=bool) + self._needs_regularization = False + self._invalidate_cache() + return + + weights = np.sum(self.voxel_manager.observation_angles, axis=1) * self.voxel_manager.votes + keep_ratio = percentile if percentile is not None else 1.0 + if keep_ratio >= 1.0: + keep_mask = np.ones(voxels.shape[0], dtype=bool) + else: + cutoff = float(np.quantile(weights, 1.0 - keep_ratio)) + keep_mask = weights >= cutoff + + if not np.any(keep_mask): + keep_mask = np.ones(voxels.shape[0], dtype=bool) + + selected_indices = np.nonzero(keep_mask)[0] + points_sel = voxels[selected_indices] + weights_sel = weights[selected_indices] + if points_sel.shape[0] >= 3: + voxel_size = self._regularize_voxel_size + keys = np.floor(points_sel / voxel_size).astype(np.int32) + voxel_map: dict[tuple[int, int, int], list[int]] = {} + for idx, key in enumerate(map(tuple, keys)): + voxel_map.setdefault(key, []).append(idx) + + neighbor_offsets = [ + (dx, dy, dz) + for dx in (-1, 0, 1) + for dy in (-1, 0, 1) + for dz in (-1, 0, 1) + if not (dx == 0 and dy == 0 and dz == 0) + ] + visited: set[tuple[int, int, int]] = set() + components: list[tuple[np.ndarray, float, np.ndarray]] = [] + for key in voxel_map: + if key in visited: + continue + stack = [key] + visited.add(key) + comp_indices: list[int] = [] + while stack: + current = stack.pop() + comp_indices.extend(voxel_map[current]) + for dx, dy, dz in neighbor_offsets: + neighbor = (current[0] + dx, current[1] + dy, current[2] + dz) + if neighbor in voxel_map and neighbor not in visited: + visited.add(neighbor) + stack.append(neighbor) + + comp_idx = np.array(comp_indices, dtype=int) + comp_weight = float(np.sum(weights_sel[comp_idx])) + comp_centroid = compute_centroid(points_sel[comp_idx]) + components.append((comp_idx, comp_weight, comp_centroid)) + + if components: + main_idx = int(np.argmax([c[1] for c in components])) + main_points = points_sel[components[main_idx][0]] + main_centroid = components[main_idx][2] + extent = np.max(main_points, axis=0) - np.min(main_points, axis=0) + dist_thresh = max(0.3, 0.5 * float(np.max(extent))) + + keep_sel_mask = np.zeros(points_sel.shape[0], dtype=bool) + for comp_idx, _, comp_centroid in components: + if np.linalg.norm(comp_centroid - main_centroid) <= dist_thresh: + keep_sel_mask[comp_idx] = True + if np.any(keep_sel_mask): + keep_mask = np.zeros(voxels.shape[0], 
dtype=bool) + keep_mask[selected_indices[keep_sel_mask]] = True + + self.voxel_manager.regularized_mask = keep_mask + + self._needs_regularization = False + self._invalidate_cache() diff --git a/semantic_mapping/semantic_mapping/mapper.py b/semantic_mapping/semantic_mapping/mapper.py new file mode 100644 index 0000000..cc67cf1 --- /dev/null +++ b/semantic_mapping/semantic_mapping/mapper.py @@ -0,0 +1,275 @@ +"""Object map manager for semantic mapping.""" + +from __future__ import annotations + +from dataclasses import dataclass +import threading +import time +import uuid + +from .utils.matching import compute_bbox, compute_iou_3d_with_ratios, find_best_match +from .map_object import MapObject, ObjectStatus, Observation + + +@dataclass +class UpdateStats: + """Summary of updates applied in a mapper step.""" + + created: int = 0 + updated: int = 0 + merged: int = 0 + deleted: int = 0 + promoted: int = 0 + total: int = 0 + + +class ObjectMapper: + """Manages the semantic object map with two-tier state management.""" + + def __init__( + self, + voxel_size: float = 0.03, + regularize_voxel_size: float = 0.06, + confidence_threshold: float = 0.30, + distance_threshold: float = 0.5, + iou_threshold: float = 0.2, + min_detections_for_permanent: int = 3, + pending_ttl_s: float = 10.0, + num_angle_bins: int = 15, + ) -> None: + """Initialize the object mapper and its thresholds. + + Args: + voxel_size: Voxel grid cell size in meters. + regularize_voxel_size: Voxel size for regularization clustering. + confidence_threshold: Minimum detection confidence to accept. + distance_threshold: Max centroid distance for spatial matching. + iou_threshold: IoU threshold for merge checks. + min_detections_for_permanent: Detections before promotion. + pending_ttl_s: Time-to-live for pending objects. + num_angle_bins: Observation angle bins for voxel manager. + """ + self._pending_objects: dict[str, MapObject] = {} + self._permanent_objects: dict[str, MapObject] = {} + self._track_id_map: dict[int, str] = {} + self._lock = threading.RLock() + + self._voxel_size = voxel_size + self._regularize_voxel_size = regularize_voxel_size + self._confidence_threshold = confidence_threshold + self._distance_threshold = distance_threshold + self._iou_threshold = iou_threshold + self._min_detections_for_permanent = min_detections_for_permanent + self._pending_ttl_s = pending_ttl_s + self._num_angle_bins = num_angle_bins + + def update(self, observations: list[Observation], target_object: str | None = None) -> UpdateStats: + """Update the map with new observations. + + Args: + observations: List of new observations to integrate. + target_object: Optional label to protect from pruning. + + Returns: + UpdateStats summarizing changes applied. 
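        Example:
            A minimal end-to-end sketch with synthetic data (thresholds other
            than ``min_detections_for_permanent`` left at their defaults)::

                import numpy as np
                from semantic_mapping.map_object import Observation

                mapper = ObjectMapper(min_detections_for_permanent=2)
                obs = Observation(
                    track_id=1,
                    label="table",
                    confidence=0.8,
                    points=np.random.rand(300, 3),
                    robot_R=np.eye(3),
                    robot_t=np.zeros(3),
                    timestamp=0.0,
                )
                stats = mapper.update([obs])   # created=1, object still pending
                stats = mapper.update([obs])   # updated=1, promoted=1
                print(stats.total, [o.label for o in mapper.get_objects()])  # 1 ['table']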
+ """ + stats = UpdateStats() + now = max((obs.timestamp for obs in observations), default=time.time()) + updated_ids: set[str] = set() + regularized_ids: set[str] = set() + + with self._lock: + for obs in observations: + if obs.confidence < self._confidence_threshold: + continue + + obj = self._match_observation(obs) + if obj is None: + obj = self._create_object(obs) + self._pending_objects[obj.object_id] = obj + stats.created += 1 + else: + obj.update(obs) + stats.updated += 1 + + updated_ids.add(obj.object_id) + for track_id in obj.track_ids: + self._track_id_map[track_id] = obj.object_id + + if obj.status == ObjectStatus.PENDING: + if self._check_promotion(obj): + stats.promoted += 1 + + if obj.object_id not in regularized_ids and obj.inactive_frames == 0: + obj.regularize_shape(percentile=obj._diversity_percentile) + regularized_ids.add(obj.object_id) + + for obj in list(self._pending_objects.values()) + list(self._permanent_objects.values()): + if obj.object_id not in updated_ids: + obj.inactive_frames += 1 + + deleted = self._prune_inactive(target_object, now) + stats.deleted = len(deleted) + stats.merged += self.merge_objects() + stats.total = len(self._permanent_objects) + + return stats + + def _match_observation(self, obs: Observation) -> MapObject | None: + """Match an observation to an existing object.""" + if obs.track_id in self._track_id_map: + obj_id = self._track_id_map[obs.track_id] + if obj_id in self._permanent_objects: + return self._permanent_objects[obj_id] + if obj_id in self._pending_objects: + return self._pending_objects[obj_id] + + candidates = list(self._permanent_objects.values()) + list(self._pending_objects.values()) + return find_best_match( + obs=obs, + candidates=candidates, + distance_threshold=self._distance_threshold, + iou_threshold=self._iou_threshold, + ) + + def _create_object(self, obs: Observation) -> MapObject: + """Create a new MapObject from an observation.""" + obj = MapObject.from_observation( + object_id=str(uuid.uuid4()), + obs=obs, + voxel_size=self._voxel_size, + regularize_voxel_size=self._regularize_voxel_size, + num_angle_bins=self._num_angle_bins, + ) + self._track_id_map[obs.track_id] = obj.object_id + return obj + + def _check_promotion(self, obj: MapObject) -> bool: + """Promote object from pending to permanent if threshold met.""" + if obj.status != ObjectStatus.PENDING: + return False + if obj.detection_count < self._min_detections_for_permanent: + return False + if obj.object_id in self._pending_objects: + self._pending_objects.pop(obj.object_id) + obj.status = ObjectStatus.PERMANENT + self._permanent_objects[obj.object_id] = obj + return True + + def merge_objects(self) -> int: + """Merge overlapping permanent objects. + + Returns: + Number of merges performed. 
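        Example:
            A numeric sketch of the distance heuristic used in this method,
            with illustrative extents: two permanent objects 0.4 m apart
            whose combined half-extents are large enough are merged without
            an IoU check::

                import numpy as np

                obj_extent = np.array([1.2, 1.2, 1.2])
                other_extent = np.array([0.8, 0.8, 0.8])
                dist = 0.40
                dist_thresh = np.linalg.norm((obj_extent / 2 + other_extent / 2) / 2)  # ~0.87
                merge_by_distance = dist < 0.25 or dist < 0.5 * dist_thresh            # True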
+ """ + merged = 0 + objs = list(self._permanent_objects.values()) + i = 0 + while i < len(objs): + obj = objs[i] + j = i + 1 + while j < len(objs): + other = objs[j] + should_merge = False + + if set(obj.track_ids) & set(other.track_ids): + should_merge = True + else: + if obj.centroid is None or other.centroid is None: + j += 1 + continue + dist = float(((obj.centroid - other.centroid) ** 2).sum() ** 0.5) + if dist > self._distance_threshold: + j += 1 + continue + + obj_points = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=True, + ) + if len(obj_points) == 0: + obj_points = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=False, + ) + other_points = other.voxel_manager.get_valid_voxels( + diversity_percentile=other._diversity_percentile, + use_regularized=True, + ) + if len(other_points) == 0: + other_points = other.voxel_manager.get_valid_voxels( + diversity_percentile=other._diversity_percentile, + use_regularized=False, + ) + + obj_bbox = compute_bbox(obj_points) if len(obj_points) >= 3 else None + other_bbox = compute_bbox(other_points) if len(other_points) >= 3 else None + obj_corners = obj_bbox[0] if obj_bbox is not None else None + other_corners = other_bbox[0] if other_bbox is not None else None + obj_extent = obj_bbox[1] if obj_bbox is not None else None + other_extent = other_bbox[1] if other_bbox is not None else None + + dist_merge = dist < 0.25 + if obj_extent is not None and other_extent is not None: + dist_thresh = (((obj_extent / 2 + other_extent / 2) / 2) ** 2).sum() ** 0.5 + dist_merge = dist_merge or dist < dist_thresh * 0.5 + + if dist_merge: + should_merge = True + elif obj_corners is not None and other_corners is not None: + iou, ratio_obj, ratio_other = compute_iou_3d_with_ratios( + obj_corners, other_corners + ) + if ( + iou > self._iou_threshold + or ratio_obj > 0.4 + or ratio_other > 0.4 + ): + should_merge = True + + if should_merge: + obj.merge(other) + for track_id in other.track_ids: + self._track_id_map[track_id] = obj.object_id + self._permanent_objects.pop(other.object_id, None) + objs.pop(j) + merged += 1 + else: + j += 1 + i += 1 + + return merged + + def _prune_inactive(self, target_object: str | None, now: float) -> list[MapObject]: + """Remove inactive pending/permanent objects.""" + deleted: list[MapObject] = [] + + for obj_id, obj in list(self._pending_objects.items()): + if (now - obj.last_update_time) > self._pending_ttl_s: + deleted.append(self._pending_objects.pop(obj_id)) + + for obj_id, obj in list(self._permanent_objects.items()): + if target_object and obj.label == target_object: + continue + if obj.status == ObjectStatus.DELETED: + deleted.append(self._permanent_objects.pop(obj_id)) + + for obj in deleted: + for track_id in obj.track_ids: + if self._track_id_map.get(track_id) == obj.object_id: + self._track_id_map.pop(track_id, None) + + return deleted + + def get_objects(self, include_pending: bool = False) -> list[MapObject]: + """Return current objects. + + Args: + include_pending: If True, include pending objects. + + Returns: + List of MapObject instances. 
+ """ + if include_pending: + return list(self._permanent_objects.values()) + list(self._pending_objects.values()) + return list(self._permanent_objects.values()) diff --git a/semantic_mapping/semantic_mapping/mapping_node.py b/semantic_mapping/semantic_mapping/mapping_node.py new file mode 100644 index 0000000..760ad6d --- /dev/null +++ b/semantic_mapping/semantic_mapping/mapping_node.py @@ -0,0 +1,993 @@ +"""ROS2 mapping node for semantic 3D object mapping.""" + +from __future__ import annotations + +from collections import deque +from pathlib import Path +import time + +import numpy as np +from cv_bridge import CvBridge +import cv2 +import rclpy +from rclpy.node import Node +from rclpy.qos import QoSProfile, ReliabilityPolicy +from sensor_msgs.msg import CompressedImage, PointCloud2 +from sensor_msgs_py import point_cloud2 +from nav_msgs.msg import Odometry +from std_msgs.msg import Header +from visualization_msgs.msg import MarkerArray +from scipy.spatial.transform import Rotation + +from detect_anything.msg import DetectionResult +from semantic_mapping.msg import ( + ObjectType, + ObjectNode, + TargetObjectInstruction, + TargetObjectWithSpatial, + ViewpointRep, +) + +from vector_perception_utils.pointcloud_utils import ( + project_pointcloud_to_image, + segment_pointclouds_from_projection, + transform_segmented_clouds_to_world, +) +from vector_perception_utils.transform_utils import discretize_angles + +from .map_object import Observation, MapObject +from .utils.map_object_utils import ( + create_bbox_markers, + create_delete_markers, + create_label_markers, + create_object_pointcloud, + map_object_to_object_node, +) +from .mapper import ObjectMapper +from .utils.storage import ImageSaveQueue +from .utils.utils import collect_cloud_window, compute_ground_z_threshold, interpolate_odom, load_camera_params + + +class SemanticMappingNode(Node): + """ROS2 node for semantic object mapping. 
+ + This node fuses 2D detections and LiDAR point clouds into persistent 3D objects: + - Buffers detections, clouds, and odometry with timestamps + - Interpolates odometry at detection time (position + orientation) + - Segments point clouds by detection masks and projects to world frame + - Updates the object mapper and publishes ROS messages + - Queues cropped images for VLM queries + """ + + def __init__(self) -> None: + """Initialize the mapping node, parameters, buffers, and ROS I/O.""" + super().__init__("semantic_mapping_node") + self._bridge = CvBridge() + + self.declare_parameter("detection_topic", "/detection_result") + self.declare_parameter("cloud_topic", "/registered_scan") + self.declare_parameter("odom_topic", "/state_estimation") + self.declare_parameter("viewpoint_topic", "/viewpoint_rep_header") + self.declare_parameter("object_type_answer_topic", "/object_type_answer") + self.declare_parameter("object_type_query_topic", "/object_type_query") + self.declare_parameter("target_object_spatial_query_topic", "/target_object_spatial_query") + self.declare_parameter("target_object_instruction_topic", "/target_object_instruction") + self.declare_parameter("object_nodes_topic", "/object_nodes") + self.declare_parameter("obj_points_topic", "/obj_points") + self.declare_parameter("obj_boxes_topic", "/obj_boxes") + self.declare_parameter("obj_labels_topic", "/obj_labels") + self.declare_parameter("map_frame", "map") + + self.declare_parameter("confidence_threshold", 0.3) + self.declare_parameter("voxel_size", 0.03) + self.declare_parameter("regularize_voxel_size", 0.06) + self.declare_parameter("distance_threshold", 0.5) + self.declare_parameter("iou_threshold", 0.2) + self.declare_parameter("min_detections_for_permanent", 3) + self.declare_parameter("pending_ttl_s", 10.0) + self.declare_parameter("num_angle_bins", 15) + + self.declare_parameter("detection_linear_state_time_bias", 0.0) + self.declare_parameter("detection_angular_state_time_bias", 0.0) + self.declare_parameter("cloud_window_before", 0.5) + self.declare_parameter("cloud_window_after", 0.1) + + self.declare_parameter("camera_param_path", "config/camera_param.yaml") + self.declare_parameter("depth_filter", False) + self.declare_parameter("depth_filter_margin", 0.15) + self.declare_parameter("depth_jump_threshold", 0.3) + self.declare_parameter("mask_erode_kernel", 3) + self.declare_parameter("mask_erode_iterations", 2) + self.declare_parameter("max_cloud_distance", 8.0) + self.declare_parameter("visualization_max_objects", 50) + self.declare_parameter("ground_filter", True) + self.declare_parameter("ground_radius", 2.0) + self.declare_parameter("ground_z_clearance", 0.0) + + self.declare_parameter("target_object", "") + self.declare_parameter("save_png", False) + self.declare_parameter("output_dir", "output/object_images") + self.declare_parameter("save_viewpoint_images", True) + self.declare_parameter("viewpoint_output_dir", "output/viewpoint_images") + self.declare_parameter("viewpoint_use_timestamp_dir", True) + self.declare_parameter("viewpoint_save_png", True) + self.declare_parameter("viewpoint_save_transform", True) + self.declare_parameter("object_type_query_mode", "target_or_all") + self.declare_parameter("object_type_query_max_per_cycle", 3) + self.declare_parameter("enable_object_type_queries", True) + + detection_topic = str(self.get_parameter("detection_topic").value) + cloud_topic = str(self.get_parameter("cloud_topic").value) + odom_topic = str(self.get_parameter("odom_topic").value) + viewpoint_topic = 
str(self.get_parameter("viewpoint_topic").value) + object_type_answer_topic = str(self.get_parameter("object_type_answer_topic").value) + object_type_query_topic = str(self.get_parameter("object_type_query_topic").value) + target_object_spatial_query_topic = str( + self.get_parameter("target_object_spatial_query_topic").value + ) + target_object_instruction_topic = str(self.get_parameter("target_object_instruction_topic").value) + object_nodes_topic = str(self.get_parameter("object_nodes_topic").value) + obj_points_topic = str(self.get_parameter("obj_points_topic").value) + obj_boxes_topic = str(self.get_parameter("obj_boxes_topic").value) + obj_labels_topic = str(self.get_parameter("obj_labels_topic").value) + self._map_frame = str(self.get_parameter("map_frame").value) + + self._confidence_threshold = float(self.get_parameter("confidence_threshold").value) + self._detection_linear_bias = float(self.get_parameter("detection_linear_state_time_bias").value) + self._detection_angular_bias = float(self.get_parameter("detection_angular_state_time_bias").value) + self._cloud_window_before = float(self.get_parameter("cloud_window_before").value) + self._cloud_window_after = float(self.get_parameter("cloud_window_after").value) + + camera_param_path = str(self.get_parameter("camera_param_path").value) + camera_params = load_camera_params( + camera_param_path, base_dir=Path(__file__).resolve().parent.parent + ) + self._projection = "equirect" + self._axis_mode = "xz" + self._image_width = 1920 + self._image_height = 640 + self._depth_filter = bool(self.get_parameter("depth_filter").value) + self._depth_filter_margin = float(self.get_parameter("depth_filter_margin").value) + self._depth_jump_threshold = float(self.get_parameter("depth_jump_threshold").value) + self._mask_erode_kernel_size = max(1, int(self.get_parameter("mask_erode_kernel").value)) + self._mask_erode_iterations = max(0, int(self.get_parameter("mask_erode_iterations").value)) + self._max_cloud_distance = float(self.get_parameter("max_cloud_distance").value) + self._sensor_to_camera = None + if camera_params is not None: + self._image_width = camera_params.image_width + self._image_height = camera_params.image_height + self._sensor_to_camera = camera_params.sensor_to_camera + self._mask_erode_kernel = cv2.getStructuringElement( + cv2.MORPH_RECT, + (self._mask_erode_kernel_size, self._mask_erode_kernel_size), + ) + self._visualization_max_objects = int( + self.get_parameter("visualization_max_objects").value + ) + self._ground_filter = bool(self.get_parameter("ground_filter").value) + self._ground_radius = float(self.get_parameter("ground_radius").value) + self._ground_z_clearance = float(self.get_parameter("ground_z_clearance").value) + self._ground_percentile = 0.05 + self._ground_min_points = 80 + + self._target_object = str(self.get_parameter("target_object").value) + self._room_condition = "" + self._spatial_condition = "" + self._attribute_condition = "" + self._anchor_object = "" + self._attribute_condition_anchor = "" + self._target_spatial_query_ids: set[int] = set() + self._object_viewpoint_ids: dict[int, set[int]] = {} + self._save_png = bool(self.get_parameter("save_png").value) + self._output_dir = Path(str(self.get_parameter("output_dir").value)) + self._save_viewpoint_images = bool(self.get_parameter("save_viewpoint_images").value) + self._viewpoint_output_dir = Path(str(self.get_parameter("viewpoint_output_dir").value)) + self._viewpoint_use_timestamp_dir = bool( + 
self.get_parameter("viewpoint_use_timestamp_dir").value + ) + self._viewpoint_save_png = bool(self.get_parameter("viewpoint_save_png").value) + self._viewpoint_save_transform = bool( + self.get_parameter("viewpoint_save_transform").value + ) + self._object_type_query_mode = str( + self.get_parameter("object_type_query_mode").value + ).lower() + self._object_type_query_max = int( + self.get_parameter("object_type_query_max_per_cycle").value + ) + self._enable_object_type_queries = bool( + self.get_parameter("enable_object_type_queries").value + ) + self._viewpoint_dir = self._viewpoint_output_dir if self._save_viewpoint_images else None + + self._mapper = ObjectMapper( + voxel_size=float(self.get_parameter("voxel_size").value), + regularize_voxel_size=float(self.get_parameter("regularize_voxel_size").value), + confidence_threshold=self._confidence_threshold, + distance_threshold=float(self.get_parameter("distance_threshold").value), + iou_threshold=float(self.get_parameter("iou_threshold").value), + min_detections_for_permanent=int(self.get_parameter("min_detections_for_permanent").value), + pending_ttl_s=float(self.get_parameter("pending_ttl_s").value), + num_angle_bins=int(self.get_parameter("num_angle_bins").value), + ) + + self._image_saver = ImageSaveQueue( + output_dir=self._output_dir, + max_queue_size=100, + save_png=self._save_png, + ) + self._image_saver.start() + + self._viewpoint_saver: ImageSaveQueue | None = None + if self._save_viewpoint_images and self._viewpoint_dir is not None: + self._viewpoint_saver = ImageSaveQueue( + output_dir=self._viewpoint_output_dir, + max_queue_size=50, + save_png=self._viewpoint_save_png, + save_transform=self._viewpoint_save_transform, + clear_output_dir=True, + use_timestamp_dir=self._viewpoint_use_timestamp_dir, + ) + self._viewpoint_saver.start() + + self._cloud_buffer: deque[tuple[float, np.ndarray]] = deque(maxlen=200) + self._odom_buffer: deque[tuple[float, dict]] = deque(maxlen=400) + self._detection_buffer: deque[DetectionResult] = deque(maxlen=100) + self._viewpoint_queue: deque[tuple[float, int]] = deque(maxlen=50) + self._processed_viewpoints: set[float] = set() + self._published_ids: set[int] = set() + self._mapping_busy = False + self._last_mapping_stats_log = 0.0 + self._last_mapping_time_log = 0.0 + self._mapping_log_throttle_s = 5.0 + self._last_query_log = 0.0 + self._query_log_throttle_s = 5.0 + self._last_answer_log = 0.0 + + qos_cloud = QoSProfile(depth=10, reliability=ReliabilityPolicy.BEST_EFFORT) + qos_default = QoSProfile(depth=10, reliability=ReliabilityPolicy.RELIABLE) + + self.create_subscription(DetectionResult, detection_topic, self._detection_callback, qos_default) + self.create_subscription(PointCloud2, cloud_topic, self._cloud_callback, qos_cloud) + self.create_subscription(Odometry, odom_topic, self._odom_callback, qos_cloud) + self.create_subscription(ViewpointRep, viewpoint_topic, self._viewpoint_callback, qos_default) + self.create_subscription(ObjectType, object_type_answer_topic, self._object_type_answer_callback, qos_default) + self.create_subscription( + TargetObjectInstruction, target_object_instruction_topic, self._target_object_callback, qos_default + ) + + self._object_type_query_pub = self.create_publisher(ObjectType, object_type_query_topic, qos_default) + self._target_object_spatial_query_pub = self.create_publisher( + TargetObjectWithSpatial, target_object_spatial_query_topic, qos_default + ) + self._object_nodes_pub = self.create_publisher(ObjectNode, object_nodes_topic, qos_default) + 
self._obj_points_pub = self.create_publisher(PointCloud2, obj_points_topic, qos_default) + self._obj_boxes_pub = self.create_publisher(MarkerArray, obj_boxes_topic, qos_default) + self._obj_labels_pub = self.create_publisher(MarkerArray, obj_labels_topic, qos_default) + + self.create_timer(0.5, self._mapping_callback) + + def destroy_node(self) -> bool: + """Stop background workers and destroy the ROS node.""" + self._image_saver.stop() + if self._viewpoint_saver is not None: + self._viewpoint_saver.stop() + return super().destroy_node() + + def _detection_callback(self, msg: DetectionResult) -> None: + """Buffer detection results for later synchronization. + + Args: + msg: DetectionResult message with bboxes, labels, and masks. + """ + self._detection_buffer.append(msg) + + def _cloud_callback(self, msg: PointCloud2) -> None: + """Buffer incoming point clouds with timestamps. + + Args: + msg: PointCloud2 message in the sensor frame. + """ + stamp = msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9 + points = point_cloud2.read_points_numpy(msg, field_names=("x", "y", "z")) + self._cloud_buffer.append((stamp, points)) + + def _odom_callback(self, msg: Odometry) -> None: + """Buffer odometry data for interpolation. + + Args: + msg: Odometry message containing pose and twist. + """ + stamp = msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9 + odom = { + "position": np.array( + [msg.pose.pose.position.x, msg.pose.pose.position.y, msg.pose.pose.position.z] + ), + "orientation": np.array( + [ + msg.pose.pose.orientation.x, + msg.pose.pose.orientation.y, + msg.pose.pose.orientation.z, + msg.pose.pose.orientation.w, + ] + ), + "linear_velocity": np.array( + [msg.twist.twist.linear.x, msg.twist.twist.linear.y, msg.twist.twist.linear.z] + ), + "angular_velocity": np.array( + [msg.twist.twist.angular.x, msg.twist.twist.angular.y, msg.twist.twist.angular.z] + ), + } + self._odom_buffer.append((stamp, odom)) + + def _viewpoint_callback(self, msg: ViewpointRep) -> None: + """Queue viewpoint trigger requests. + + Args: + msg: ViewpointRep message with timestamp and viewpoint id. + """ + stamp = msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9 + if stamp in self._processed_viewpoints: + return + self._viewpoint_queue.append((stamp, msg.viewpoint_id)) + + def _target_object_callback(self, msg: TargetObjectInstruction) -> None: + """Update target object label used for VLM query decisions. + + Args: + msg: TargetObjectInstruction message. + """ + self._target_object = msg.target_object + self._room_condition = msg.room_condition + self._spatial_condition = msg.spatial_condition + self._attribute_condition = msg.attribute_condition + self._anchor_object = msg.anchor_object + self._attribute_condition_anchor = msg.attribute_condition_anchor + self._target_spatial_query_ids.clear() + if msg.target_object: + self.get_logger().debug( + f"Target instruction: target={msg.target_object} " + f"room={msg.room_condition} spatial={msg.spatial_condition}" + ) + + def _object_type_answer_callback(self, msg: ObjectType) -> None: + """Apply VLM relabeling updates to tracked objects. + + Args: + msg: ObjectType message with final label for a track id. 
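        Example:
            A minimal sketch of producing an answer this callback consumes,
            assuming an existing rclpy node handle ``node`` (hypothetical)
            and the default ``/object_type_answer`` topic::

                from semantic_mapping.msg import ObjectType

                answer = ObjectType()
                answer.object_id = 12                 # a track id known to the mapper
                answer.final_label = "coffee machine"
                pub = node.create_publisher(ObjectType, "/object_type_answer", 10)
                pub.publish(answer)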
+ """ + matched = False + for obj in self._mapper.get_objects(include_pending=True): + if msg.object_id in obj.track_ids: + obj.class_votes[msg.final_label] = obj.class_votes.get(msg.final_label, 0) + 50 + obj.confidence_scores[msg.final_label] = max( + obj.confidence_scores.get(msg.final_label, 0.0), 1.0 + ) + obj.points_count[msg.final_label] = obj.points_count.get(msg.final_label, 0) + obj.updated_by_vlm = True + obj._invalidate_cache() + self.get_logger().debug( + f"Object type answer applied: object_id={msg.object_id} " + f"label={msg.final_label}" + ) + matched = True + break + if not matched: + now = time.time() + if now - self._last_answer_log > self._query_log_throttle_s: + track_ids = [ + track_id + for obj in self._mapper.get_objects(include_pending=True) + for track_id in obj.track_ids + ] + preview = track_ids[:10] + suffix = "..." if len(track_ids) > 10 else "" + self.get_logger().warning( + f"Object type answer ignored: object_id={msg.object_id} " + f"label={msg.final_label} known_ids={preview}{suffix}" + ) + self._last_answer_log = now + + def _mapping_callback(self) -> None: + """Main mapping loop: synchronize inputs, update map, and publish outputs.""" + if self._mapping_busy: + return + if len(self._detection_buffer) < 1 or len(self._cloud_buffer) < 1 or len(self._odom_buffer) < 2: + return + + self._mapping_busy = True + start_time = time.perf_counter() + try: + detection_msg, viewpoint_stamp, viewpoint_id = self._select_detection() + if detection_msg is None: + return + + detection_stamp = detection_msg.header.stamp.sec + detection_msg.header.stamp.nanosec / 1e9 + image = self._bridge.compressed_imgmsg_to_cv2(detection_msg.image, desired_encoding="bgr8") + + odom = interpolate_odom( + list(self._odom_buffer), + detection_stamp, + self._detection_linear_bias, + self._detection_angular_bias, + ) + if odom is None: + return + + if viewpoint_stamp is not None: + self._save_viewpoint_snapshot( + viewpoint_id=viewpoint_id, + viewpoint_stamp=viewpoint_stamp, + image=image, + odom=odom, + ) + + neighbor_cloud = collect_cloud_window( + list(self._cloud_buffer), + detection_stamp, + self._cloud_window_before, + self._cloud_window_after, + ) + if neighbor_cloud is None: + return + + observations = self._build_observations( + detection_msg=detection_msg, + image=image, + odom=odom, + cloud=neighbor_cloud, + ) + if len(observations) == 0: + return + + stats = self._mapper.update(observations, target_object=self._target_object) + now = time.time() + if now - self._last_mapping_stats_log > self._mapping_log_throttle_s: + self.get_logger().debug( + "Mapping stats:\n" + f" total: {stats.total}\n" + f" deleted: {stats.deleted}" + ) + self._last_mapping_stats_log = now + self._update_viewpoint_ids(observations, viewpoint_id) + self._update_observation_angles(observations) + self._update_best_images(observations, image) + self._publish_map(detection_stamp, viewpoint_id) + self._publish_object_type_queries() + self._publish_target_object_spatial_queries() + + if viewpoint_stamp is not None: + self._processed_viewpoints.add(viewpoint_stamp) + finally: + elapsed_ms = (time.perf_counter() - start_time) * 1000.0 + now = time.time() + if now - self._last_mapping_time_log > self._mapping_log_throttle_s: + self.get_logger().debug(f"Mapping loop time: {elapsed_ms:.2f} ms") + self._last_mapping_time_log = now + self._mapping_busy = False + + def _select_detection(self) -> tuple[DetectionResult | None, float | None, int]: + """Pick the detection to process, optionally aligned to a viewpoint. 
+ + Returns: + Tuple of (detection_msg, viewpoint_stamp, viewpoint_id). + """ + viewpoint_stamp = None + viewpoint_id = -1 + + if len(self._viewpoint_queue) > 0: + stamp, vid = self._viewpoint_queue[0] + detections = list(self._detection_buffer) + stamps = [msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9 for msg in detections] + diffs = np.abs(np.array(stamps) - stamp) + idx = int(np.argmin(diffs)) + diff = float(diffs[idx]) + if diff > 1.0: + self.get_logger().debug( + f"Viewpoint stamp {stamp:.3f} is {diff:.3f}s away from closest detection" + ) + detection_msg = detections[idx] + viewpoint_stamp = stamp + viewpoint_id = vid + self._viewpoint_queue.popleft() + return detection_msg, viewpoint_stamp, viewpoint_id + + idx = -2 if len(self._detection_buffer) > 1 else -1 + return self._detection_buffer[idx], viewpoint_stamp, viewpoint_id + + def _update_viewpoint_ids(self, observations: list[Observation], viewpoint_id: int) -> None: + """Track viewpoint IDs associated with observations.""" + if viewpoint_id < 0: + return + for obs in observations: + if obs.track_id not in self._object_viewpoint_ids: + self._object_viewpoint_ids[obs.track_id] = set() + self._object_viewpoint_ids[obs.track_id].add(viewpoint_id) + + def _collect_viewpoint_ids(self, obj: MapObject) -> list[int]: + """Collect viewpoint IDs observed for a MapObject.""" + ids: set[int] = set() + for track_id in obj.track_ids: + ids.update(self._object_viewpoint_ids.get(track_id, set())) + return sorted(ids) + + def _save_viewpoint_snapshot( + self, + viewpoint_id: int, + viewpoint_stamp: float, + image: np.ndarray, + odom: dict, + ) -> None: + """Save viewpoint image and pose transform for spatial queries.""" + if not self._save_viewpoint_images or self._viewpoint_dir is None: + return + if viewpoint_id < 0: + return + + _ = viewpoint_stamp # Reserved for future indexing/logging + + transform = None + if self._viewpoint_save_transform: + R_b2w = Rotation.from_quat(odom["orientation"]).as_matrix() + t_b2w = np.array(odom["position"], dtype=float) + transform = np.eye(4, dtype=float) + transform[:3, :3] = R_b2w + transform[:3, 3] = t_b2w + + if self._viewpoint_saver is not None: + self._viewpoint_saver.queue_viewpoint_save( + viewpoint_id=viewpoint_id, image=image, transform=transform + ) + + # helper methods moved to semantic_mapping.utils.utils + + def _build_observations( + self, + detection_msg: DetectionResult, + image: np.ndarray, + odom: dict, + cloud: np.ndarray, + ) -> list[Observation]: + """Build Observation objects from detections and segmented point clouds. + + Args: + detection_msg: DetectionResult with bboxes, labels, and masks. + image: RGB image used for masks and cropping. + odom: Interpolated odometry dict (position + orientation). + cloud: Aggregated point cloud in sensor frame. + + Returns: + List of Observation instances for object updates. 
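+
+        Example (hypothetical numbers mirroring the mask placement below):
+            For a 480x640 image and a detection bbox (x1, y1, x2, y2) =
+            (100, 50, 180, 130) with an 80x80 cropped mask, the crop is
+            pasted back as ``full_mask[50:130, 100:180]`` so the projected
+            point cloud can be masked in full-image coordinates.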
+ """ + det_stamp = detection_msg.header.stamp.sec + detection_msg.header.stamp.nanosec / 1e9 + image_shape = image.shape[:2] + + masks = [] + labels = [] + confidences = [] + track_ids = [] + + for i in range(len(detection_msg.track_id)): + conf = float(detection_msg.confidence[i]) + if conf < self._confidence_threshold: + continue + x1 = int(detection_msg.x1[i]) + y1 = int(detection_msg.y1[i]) + x2 = int(detection_msg.x2[i]) + y2 = int(detection_msg.y2[i]) + x1 = max(0, min(x1, image_shape[1] - 1)) + x2 = max(0, min(x2, image_shape[1])) + y1 = max(0, min(y1, image_shape[0] - 1)) + y2 = max(0, min(y2, image_shape[0])) + if x2 <= x1 or y2 <= y1: + continue + + full_mask = np.zeros(image_shape, dtype=bool) + if i < len(detection_msg.masks): + mask_msg: CompressedImage = detection_msg.masks[i] + mask_crop = self._bridge.compressed_imgmsg_to_cv2(mask_msg) + if mask_crop.ndim == 3: + mask_crop = cv2.cvtColor(mask_crop, cv2.COLOR_BGR2GRAY) + mask_bool = mask_crop.astype(bool) + if self._mask_erode_iterations > 0: + mask_eroded = cv2.erode( + mask_bool.astype(np.uint8), + self._mask_erode_kernel, + iterations=self._mask_erode_iterations, + ) + mask_bool = mask_eroded.astype(bool) + crop_h = min(mask_bool.shape[0], y2 - y1) + crop_w = min(mask_bool.shape[1], x2 - x1) + full_mask[y1 : y1 + crop_h, x1 : x1 + crop_w] = mask_bool[:crop_h, :crop_w] + + masks.append(full_mask) + labels.append(detection_msg.label[i]) + confidences.append(conf) + track_ids.append(int(detection_msg.track_id[i])) + + if len(masks) == 0: + return [] + + R_b2w = Rotation.from_quat(odom["orientation"]).as_matrix() + t_b2w = np.array(odom["position"]) + R_w2b = R_b2w.T + t_w2b = -R_w2b @ t_b2w + cloud_body = cloud @ R_w2b.T + t_w2b + + _, pixel_idx = project_pointcloud_to_image( + points=cloud_body, + projection=self._projection, + image_width=self._image_width, + image_height=self._image_height, + axis_mode=self._axis_mode, + intrinsics=None, + depth_filter=self._depth_filter, + depth_filter_margin=self._depth_filter_margin, + transform=self._sensor_to_camera, + ) + seg_clouds, seg_depths = segment_pointclouds_from_projection( + points=cloud_body, + masks=masks, + pixel_idx=pixel_idx, + ) + if self._depth_filter: + filtered_clouds = [] + filtered_depths = [] + for cloud, depth in zip(seg_clouds, seg_depths): + if depth.shape[0] == 0: + filtered_clouds.append(cloud) + filtered_depths.append(depth) + continue + min_depth = float(np.min(depth)) + keep = depth[:, 0] <= (min_depth + self._depth_filter_margin) + filtered_clouds.append(cloud[keep]) + filtered_depths.append(depth[keep]) + seg_clouds = filtered_clouds + seg_depths = filtered_depths + world_clouds = transform_segmented_clouds_to_world( + clouds=seg_clouds, + depths=seg_depths, + R_b2w=R_b2w, + t_b2w=t_b2w, + depth_jump_threshold=self._depth_jump_threshold, + ) + cloud_world = cloud_body @ R_b2w.T + t_b2w + ground_z_threshold = compute_ground_z_threshold( + cloud_world, + t_b2w[:2], + self._ground_filter, + self._ground_min_points, + self._ground_radius, + self._ground_percentile, + self._ground_z_clearance, + ) + + observations = [] + for idx, world_cloud in enumerate(world_clouds): + if self._max_cloud_distance > 0: + dist = np.linalg.norm(world_cloud - t_b2w, axis=1) + world_cloud = world_cloud[dist < self._max_cloud_distance] + if ground_z_threshold is not None: + world_cloud = world_cloud[world_cloud[:, 2] > ground_z_threshold] + if world_cloud.shape[0] == 0: + continue + observations.append( + Observation( + track_id=track_ids[idx], + label=labels[idx], + 
confidence=confidences[idx], + points=world_cloud, + robot_R=R_b2w, + robot_t=t_b2w, + mask=masks[idx], + image=image, + timestamp=det_stamp, + ) + ) + + return observations + + def _update_observation_angles(self, observations: list[Observation]) -> None: + """Reproject object voxels onto the current mask to update angle votes.""" + if len(observations) == 0: + return + + objects = self._mapper.get_objects(include_pending=True) + track_to_obj = {} + for obj in objects: + for track_id in obj.track_ids: + track_to_obj[track_id] = obj + + for obs in observations: + obj = track_to_obj.get(obs.track_id) + if obj is None or obs.mask is None: + continue + voxels = obj.voxel_manager.voxels + if voxels.shape[0] == 0: + continue + + R_b2w = obs.robot_R + t_b2w = obs.robot_t + R_w2b = R_b2w.T + t_w2b = -R_w2b @ t_b2w + voxels_body = voxels @ R_w2b.T + t_w2b + _, pixel_idx = project_pointcloud_to_image( + points=voxels_body, + projection=self._projection, + image_width=self._image_width, + image_height=self._image_height, + axis_mode=self._axis_mode, + intrinsics=None, + depth_filter=self._depth_filter, + depth_filter_margin=self._depth_filter_margin, + transform=self._sensor_to_camera, + ) + pix = pixel_idx.astype(int) + valid = ( + (pix[:, 0] >= 0) + & (pix[:, 0] < obs.mask.shape[1]) + & (pix[:, 1] >= 0) + & (pix[:, 1] < obs.mask.shape[0]) + ) + if not np.any(valid): + continue + pix = pix[valid] + voxel_indices = np.nonzero(valid)[0] + on_mask = obs.mask[pix[:, 1], pix[:, 0]].astype(bool) + if not np.any(on_mask): + continue + voxel_indices = voxel_indices[on_mask] + vectors = voxels[voxel_indices] - t_b2w + obs_angles = np.arctan2(vectors[:, 1], vectors[:, 0]) + obs_bins = discretize_angles(obs_angles, obj.voxel_manager.num_angle_bins) + obj.voxel_manager.update_observation_angles(voxel_indices, obs_bins) + obj._invalidate_cache() + + def _update_best_images(self, observations: list[Observation], image: np.ndarray) -> None: + """Update best image crops for objects based on mask area. + + Args: + observations: Observations created for the current frame. + image: Full RGB image associated with the detections. + """ + for obs in observations: + obj = None + for candidate in self._mapper.get_objects(include_pending=True): + if obs.track_id in candidate.track_ids: + obj = candidate + break + if obj is None or obs.mask is None: + continue + + coords = np.argwhere(obs.mask) + if coords.size == 0: + continue + mask_area = coords.shape[0] + if mask_area <= obj.best_image_mask_area: + continue + + y0, x0 = coords.min(axis=0) + y1, x1 = coords.max(axis=0) + 1 + padding = max(int(y1 - y0), int(x1 - x0), 80) + y0 = max(0, y0 - padding) + x0 = max(0, x0 - padding) + y1 = min(obs.mask.shape[0], y1 + padding) + x1 = min(obs.mask.shape[1], x1 + padding) + + if obj.best_image_path: + image_path = Path(obj.best_image_path) + mask_path = image_path.with_name(image_path.stem + "_mask.npy") + if image_path.exists(): + image_path.unlink() + if mask_path.exists(): + mask_path.unlink() + if self._save_png: + png_path = image_path.with_suffix(".png") + if png_path.exists(): + png_path.unlink() + + cropped_image = image[y0:y1, x0:x1] + cropped_mask = obs.mask[y0:y1, x0:x1] + + obj.best_image_path = self._image_saver.queue_save( + object_id=str(obs.track_id), + image=cropped_image, + mask=cropped_mask, + ) + obj.best_image_mask_area = float(mask_area) + obj.is_asked_vlm = False + + def _publish_map(self, stamp: float, viewpoint_id: int) -> None: + """Publish object nodes, markers, and aggregated point clouds. 
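+
+        Each object becomes an ObjectNode whose status is "new" on first
+        detection, "updated" when seen this cycle, and "unchanged" otherwise.
+        Visualization topics are limited to the most recently updated objects
+        when a positive visualization_max_objects is configured, and markers
+        for objects that dropped out of that set are explicitly deleted.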
+ + Args: + stamp: Timestamp (seconds) for message headers. + viewpoint_id: Viewpoint id for ObjectNode messages. + """ + seconds = int(stamp) + nanoseconds = int((stamp - seconds) * 1e9) + header = Header() + header.stamp.sec = seconds + header.stamp.nanosec = nanoseconds + header.frame_id = self._map_frame + + objects = self._mapper.get_objects(include_pending=True) + for obj in objects: + if obj.detection_count == 1: + status = "new" + elif obj.inactive_frames == 0: + status = "updated" + else: + status = "unchanged" + msg = map_object_to_object_node( + obj, + header, + status=status, + viewpoint_id=viewpoint_id, + target_object=self._target_object, + ) + if msg is not None: + self._object_nodes_pub.publish(msg) + + viz_objects = objects + if self._visualization_max_objects > 0 and len(objects) > self._visualization_max_objects: + viz_objects = sorted( + objects, key=lambda obj: obj.last_update_time, reverse=True + )[: self._visualization_max_objects] + + bbox_markers = create_bbox_markers(viz_objects, header) + label_markers = create_label_markers(viz_objects, header) + self._obj_boxes_pub.publish(bbox_markers) + self._obj_labels_pub.publish(label_markers) + + obj_cloud = create_object_pointcloud(viz_objects, header) + if obj_cloud is not None: + self._obj_points_pub.publish(obj_cloud) + + current_ids = {int(obj.track_ids[0]) if obj.track_ids else 0 for obj in viz_objects} + deleted_ids = list(self._published_ids - current_ids) + if len(deleted_ids) > 0: + delete_markers = create_delete_markers(deleted_ids, header, namespace="bbox") + self._obj_boxes_pub.publish(delete_markers) + delete_labels = create_delete_markers(deleted_ids, header, namespace="labels") + self._obj_labels_pub.publish(delete_labels) + self._published_ids = current_ids + + def _publish_object_type_queries(self) -> None: + """Publish VLM queries for target objects with good image crops.""" + if not self._enable_object_type_queries: + return + published = 0 + objects = self._mapper.get_objects(include_pending=False) + eligible = [] + for obj in objects: + if obj.is_asked_vlm or obj.best_image_mask_area <= 500: + continue + if not obj.best_image_path: + continue + image_path = Path(obj.best_image_path) + if not image_path.exists(): + continue + eligible.append(obj) + target = (self._target_object or "").strip().lower() + target_matches = [obj for obj in eligible if obj.label == target] if target else [] + + mode = self._object_type_query_mode + if mode not in ("target_only", "all", "target_or_all"): + mode = "target_or_all" + + fallback_used = False + if mode == "target_only": + candidates = target_matches + elif mode == "all": + candidates = eligible + else: + if target_matches: + candidates = target_matches + else: + candidates = eligible + fallback_used = True + + if candidates: + candidates = sorted( + candidates, + key=lambda obj: obj.best_image_mask_area, + reverse=True, + ) + max_per_cycle = max(1, self._object_type_query_max) + for obj in candidates[:max_per_cycle]: + msg = ObjectType() + msg.object_id = int(obj.track_ids[0]) if obj.track_ids else -1 + msg.img_path = obj.best_image_path or "" + msg.labels = list(obj.class_votes.keys()) + self._object_type_query_pub.publish(msg) + obj.is_asked_vlm = True + published += 1 + now = time.time() + if published > 0 or now - self._last_query_log > self._query_log_throttle_s: + subscriber_count = self._object_type_query_pub.get_subscription_count() + self.get_logger().debug( + f"ObjectType queries: published={published} eligible={len(eligible)} " + 
f"target_matches={len(target_matches)} fallback_all={fallback_used} " + f"mode={mode} target={self._target_object} subs={subscriber_count}" + ) + self._last_query_log = now + + def _publish_target_object_spatial_queries(self) -> None: + """Publish spatial target-object queries when spatial condition is set.""" + if not self._spatial_condition: + return + + published = 0 + skipped_label = 0 + skipped_mask = 0 + skipped_bbox = 0 + skipped_viewpoints = 0 + skipped_asked = 0 + for obj in self._mapper.get_objects(include_pending=False): + if obj.label != self._target_object: + skipped_label += 1 + continue + if obj.best_image_mask_area <= 500: + skipped_mask += 1 + continue + if obj.bbox3d_corners is None: + skipped_bbox += 1 + continue + + object_id = int(obj.track_ids[0]) if obj.track_ids else -1 + if object_id < 0: + continue + if object_id in self._target_spatial_query_ids: + skipped_asked += 1 + continue + + viewpoint_ids = self._collect_viewpoint_ids(obj) + if len(viewpoint_ids) == 0: + skipped_viewpoints += 1 + continue + + msg = TargetObjectWithSpatial() + msg.header.stamp = self.get_clock().now().to_msg() + msg.header.frame_id = self._map_frame + msg.object_id = object_id + msg.object_label = obj.label + msg.img_path = obj.best_image_path or "" + msg.viewpoint_ids = viewpoint_ids + msg.room_label = "" + msg.is_target = False + for i in range(8): + msg.bbox3d[i].x = float(obj.bbox3d_corners[i, 0]) + msg.bbox3d[i].y = float(obj.bbox3d_corners[i, 1]) + msg.bbox3d[i].z = float(obj.bbox3d_corners[i, 2]) + + self._target_object_spatial_query_pub.publish(msg) + self._target_spatial_query_ids.add(object_id) + published += 1 + now = time.time() + if published > 0 or now - self._last_query_log > self._query_log_throttle_s: + subscriber_count = self._target_object_spatial_query_pub.get_subscription_count() + self.get_logger().debug( + f"TargetSpatial queries: published={published} skipped_label={skipped_label} " + f"skipped_mask={skipped_mask} skipped_bbox={skipped_bbox} " + f"skipped_viewpoints={skipped_viewpoints} skipped_asked={skipped_asked} " + f"subs={subscriber_count}" + ) + self._last_query_log = now + + +def main(args: list[str] | None = None) -> None: + """Entry point for the semantic mapping node.""" + rclpy.init(args=args) + node = SemanticMappingNode() + try: + rclpy.spin(node) + except KeyboardInterrupt: + pass + finally: + node.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/semantic_mapping/semantic_mapping/prompts/__init__.py b/semantic_mapping/semantic_mapping/prompts/__init__.py new file mode 100644 index 0000000..acbdca0 --- /dev/null +++ b/semantic_mapping/semantic_mapping/prompts/__init__.py @@ -0,0 +1,17 @@ +"""Prompt templates for VLM reasoning tasks.""" + +from .vlm_prompts import ( + build_object_type_prompt, + build_room_type_prompt, + INSTRUCTION_DECOMPOSITION_PROMPT, + TARGET_OBJECT_PROMPT, + TARGET_OBJECT_SPATIAL_PROMPT, +) + +__all__ = [ + "build_object_type_prompt", + "build_room_type_prompt", + "INSTRUCTION_DECOMPOSITION_PROMPT", + "TARGET_OBJECT_PROMPT", + "TARGET_OBJECT_SPATIAL_PROMPT", +] diff --git a/semantic_mapping/semantic_mapping/prompts/vlm_prompts.py b/semantic_mapping/semantic_mapping/prompts/vlm_prompts.py new file mode 100644 index 0000000..441f452 --- /dev/null +++ b/semantic_mapping/semantic_mapping/prompts/vlm_prompts.py @@ -0,0 +1,78 @@ +from typing import Sequence + + +OBJECT_TYPE_PROMPT_TEMPLATE = """You are provided with an RGB image containing an object. 
+Select the correct label from the candidate list and respond strictly in JSON. + +Return ONLY valid JSON with this schema: +{{"label": "", "reason": ""}} + +Example: +{{"label": "chair", "reason": "The object has a seat, backrest, and four legs."}} + +Candidate labels: +{labels} +""" + +ROOM_TYPE_PROMPT_TEMPLATE = """You are given an RGB image of a room and a top-down room layout mask. +Identify the room type and respond strictly in valid JSON. + +Return ONLY valid JSON with this schema: +{{"room_type": ""}} + +If you are not confident, respond with: +{{"room_type": "unknown"}} + +Room types: +{room_types} +""" + + +INSTRUCTION_DECOMPOSITION_PROMPT = """You are given a user instruction for finding an object. +Decompose it into the following fields and return ONLY valid JSON: + +{"target_object": "", "room_condition": "", "spatial_condition": "", "attribute_condition": "", "anchor_object": "", "attribute_condition_anchor": ""} + +If a field is not specified, return it as an empty string. + +Example: +Instruction: "Find a silver refrigerator next to the white cabinet in the student lounge." +{"target_object": "refrigerator", "room_condition": "in the student lounge", "spatial_condition": "next to the cabinet", "attribute_condition": "silver", "anchor_object": "cabinet", "attribute_condition_anchor": "white"} +""" + + +TARGET_OBJECT_PROMPT = """You are given an instruction and a candidate object image. +Determine whether the object matches the target described in the instruction. +Consider object type, room condition, and attribute condition. + +Return ONLY valid JSON: +{"is_target": true/false, "reason": ""} + +Example: +{"is_target": true, "reason": "The object is a silver refrigerator in the lounge."} +""" + + +TARGET_OBJECT_SPATIAL_PROMPT = """You are given an instruction, a candidate object image, and +additional views of the surrounding scene. Determine whether the object matches +the instruction including any spatial or attribute constraints. 
+ +Return ONLY valid JSON: +{"is_target": true/false, "reason": ""} + +Example: +{"is_target": false, "reason": "The object is a fridge but not next to the cabinet."} +""" + + +def build_object_type_prompt(labels: Sequence[str]) -> str: + label_block = ", ".join(labels) if labels else "unknown" + return OBJECT_TYPE_PROMPT_TEMPLATE.format(labels=label_block) + + +def build_room_type_prompt(room_types: Sequence[str]) -> str: + if room_types: + room_block = "\n".join(f"- {name}" for name in room_types) + else: + room_block = "- unknown" + return ROOM_TYPE_PROMPT_TEMPLATE.format(room_types=room_block) diff --git a/semantic_mapping/semantic_mapping/utils/__init__.py b/semantic_mapping/semantic_mapping/utils/__init__.py new file mode 100644 index 0000000..4a5efc8 --- /dev/null +++ b/semantic_mapping/semantic_mapping/utils/__init__.py @@ -0,0 +1,25 @@ +"""Utility modules for semantic mapping.""" + +from .utils import ( + CameraParams, + annotate_mask, + collect_cloud_window, + compute_ground_z_threshold, + concat_images, + interpolate_odom, + load_camera_params, + load_npy_image, + parse_json_dict, +) + +__all__ = [ + "CameraParams", + "annotate_mask", + "collect_cloud_window", + "compute_ground_z_threshold", + "concat_images", + "interpolate_odom", + "load_camera_params", + "load_npy_image", + "parse_json_dict", +] diff --git a/semantic_mapping/semantic_mapping/utils/map_object_utils.py b/semantic_mapping/semantic_mapping/utils/map_object_utils.py new file mode 100644 index 0000000..5ad4163 --- /dev/null +++ b/semantic_mapping/semantic_mapping/utils/map_object_utils.py @@ -0,0 +1,264 @@ +"""ROS message helpers for MapObject publishers.""" + +from __future__ import annotations + +import zlib +from typing import Optional + +import numpy as np +from geometry_msgs.msg import Point +from sensor_msgs.msg import PointCloud2 +from std_msgs.msg import Header +from visualization_msgs.msg import Marker, MarkerArray + +from semantic_mapping.msg import ObjectNode +from vector_perception_utils.pointcloud_utils import create_pointcloud2_msg + +from ..map_object import MapObject + + +def _axis_aligned_corners(points: np.ndarray) -> Optional[np.ndarray]: + if points.size == 0: + return None + min_bound = np.min(points, axis=0) + max_bound = np.max(points, axis=0) + return np.array( + [ + [min_bound[0], min_bound[1], min_bound[2]], + [max_bound[0], min_bound[1], min_bound[2]], + [max_bound[0], max_bound[1], min_bound[2]], + [min_bound[0], max_bound[1], min_bound[2]], + [min_bound[0], min_bound[1], max_bound[2]], + [max_bound[0], min_bound[1], max_bound[2]], + [max_bound[0], max_bound[1], max_bound[2]], + [min_bound[0], max_bound[1], max_bound[2]], + ], + dtype=float, + ) + + +def get_object_seed(obj: MapObject) -> int: + """Generate a deterministic seed from a MapObject for color generation. + + Args: + obj: MapObject instance. + + Returns: + CRC32 hash of the object's identifier. + """ + seed_key = obj.object_id or (str(obj.track_ids[0]) if obj.track_ids else obj.label) + return zlib.crc32(seed_key.encode("utf-8")) + + +def map_object_to_object_node( + obj: MapObject, + header: Header, + status: str = "new", + viewpoint_id: int = -1, + target_object: str | None = None, +) -> Optional[ObjectNode]: + """Convert a MapObject into an ObjectNode message. + + Args: + obj: MapObject instance. + header: ROS header to attach to the message. + status: Lifecycle status string ("new", "updated", "unchanged"). + viewpoint_id: Viewpoint identifier for image saving. + target_object: Target object label for potential labeling. 
+ + Returns: + ObjectNode message or None if required geometry is missing. + """ + if obj.centroid is None or obj.bbox3d_corners is None: + return None + + msg = ObjectNode() + msg.header = header + msg.object_id = obj.track_ids + if target_object and obj.label == target_object and not obj.is_asked_vlm: + msg.label = "Potential Target" + else: + msg.label = obj.label + msg.status = status + msg.position.x = float(obj.centroid[0]) + msg.position.y = float(obj.centroid[1]) + msg.position.z = float(obj.centroid[2]) + + for i in range(8): + msg.bbox3d[i].x = float(obj.bbox3d_corners[i, 0]) + msg.bbox3d[i].y = float(obj.bbox3d_corners[i, 1]) + msg.bbox3d[i].z = float(obj.bbox3d_corners[i, 2]) + + voxels = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=True, + ) + if len(voxels) == 0: + voxels = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=False, + ) + if len(voxels) > 0: + seed = get_object_seed(obj) + color = np.array( + [ + (seed & 0xFF) / 255.0, + ((seed >> 8) & 0xFF) / 255.0, + ((seed >> 16) & 0xFF) / 255.0, + ], + dtype=np.float32, + ) + colors = np.tile(color, (voxels.shape[0], 1)) + msg.cloud = create_pointcloud2_msg(points=voxels, colors=colors, header=header) + + msg.img_path = obj.best_image_path or "" + msg.is_asked_vlm = obj.is_asked_vlm + msg.viewpoint_id = -1 if obj.updated_by_vlm else viewpoint_id + return msg + + +def create_bbox_markers( + objects: list[MapObject], header: Header, namespace: str = "bbox" +) -> MarkerArray: + """Create wireframe bounding box markers for MapObjects.""" + markers = MarkerArray() + edges = [ + (0, 1), + (1, 2), + (2, 3), + (3, 0), + (4, 5), + (5, 6), + (6, 7), + (7, 4), + (0, 4), + (1, 5), + (2, 6), + (3, 7), + ] + + for obj in objects: + voxels = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=True, + ) + if len(voxels) == 0: + voxels = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=False, + ) + corners = _axis_aligned_corners(voxels) if len(voxels) > 0 else None + if corners is None: + continue + marker = Marker() + marker.header = header + marker.ns = namespace + marker.id = int(obj.track_ids[0]) if obj.track_ids else 0 + marker.type = Marker.LINE_LIST + marker.action = Marker.ADD + marker.scale.x = 0.02 + + seed = get_object_seed(obj) + marker.color.r = (seed & 0xFF) / 255.0 + marker.color.g = ((seed >> 8) & 0xFF) / 255.0 + marker.color.b = ((seed >> 16) & 0xFF) / 255.0 + marker.color.a = 1.0 + + for i, j in edges: + marker.points.append(Point(x=float(corners[i, 0]), y=float(corners[i, 1]), z=float(corners[i, 2]))) + marker.points.append(Point(x=float(corners[j, 0]), y=float(corners[j, 1]), z=float(corners[j, 2]))) + + markers.markers.append(marker) + + return markers + + +def create_label_markers( + objects: list[MapObject], + header: Header, + namespace: str = "labels", + text_height: float = 0.4, +) -> MarkerArray: + """Create text label markers for MapObjects.""" + markers = MarkerArray() + + for obj in objects: + if obj.centroid is None: + continue + marker = Marker() + marker.header = header + marker.ns = namespace + marker.id = int(obj.track_ids[0]) if obj.track_ids else 0 + marker.type = Marker.TEXT_VIEW_FACING + marker.action = Marker.ADD + marker.scale.z = text_height + marker.text = obj.label + marker.pose.position.x = float(obj.centroid[0]) + marker.pose.position.y = float(obj.centroid[1]) + 
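+        # Lift the label slightly above the centroid so the text clears the bbox wireframe.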
marker.pose.position.z = float(obj.centroid[2]) + 0.3 + + seed = get_object_seed(obj) + marker.color.r = (seed & 0xFF) / 255.0 + marker.color.g = ((seed >> 8) & 0xFF) / 255.0 + marker.color.b = ((seed >> 16) & 0xFF) / 255.0 + marker.color.a = 1.0 + + markers.markers.append(marker) + + return markers + + +def create_object_pointcloud(objects: list[MapObject], header: Header) -> Optional[PointCloud2]: + """Create a colored point cloud for a list of MapObjects.""" + points_list = [] + colors_list = [] + + for obj in objects: + voxels = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=True, + ) + if len(voxels) == 0: + voxels = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=False, + ) + if len(voxels) == 0: + continue + + seed = get_object_seed(obj) + color = np.array( + [ + (seed & 0xFF) / 255.0, + ((seed >> 8) & 0xFF) / 255.0, + ((seed >> 16) & 0xFF) / 255.0, + ], + dtype=np.float32, + ) + colors = np.tile(color, (voxels.shape[0], 1)) + points_list.append(voxels) + colors_list.append(colors) + + if len(points_list) == 0: + return None + + points = np.concatenate(points_list, axis=0) + colors = np.concatenate(colors_list, axis=0) + return create_pointcloud2_msg(points=points, colors=colors, header=header) + + +def create_delete_markers( + deleted_ids: list[int], + header: Header, + namespace: str = "bbox", +) -> MarkerArray: + """Create deletion markers for a list of marker IDs.""" + markers = MarkerArray() + for marker_id in deleted_ids: + marker = Marker() + marker.header = header + marker.ns = namespace + marker.id = int(marker_id) + marker.action = Marker.DELETE + markers.markers.append(marker) + return markers diff --git a/semantic_mapping/semantic_mapping/utils/matching.py b/semantic_mapping/semantic_mapping/utils/matching.py new file mode 100644 index 0000000..e5aaa19 --- /dev/null +++ b/semantic_mapping/semantic_mapping/utils/matching.py @@ -0,0 +1,256 @@ +"""Object matching utilities for semantic mapping.""" + +from __future__ import annotations + +from typing import Optional + +import cv2 +import numpy as np + +from vector_perception_utils.pointcloud_utils import compute_centroid + +from ..map_object import MapObject, Observation + + +def _polygon_area(poly: np.ndarray) -> float: + """Compute polygon area using the shoelace formula. + + Args: + poly: Polygon vertices in order (N, 2). + + Returns: + Polygon area. 
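+
+    Example (illustrative; a unit square has area 1.0):
+
+        >>> import numpy as np
+        >>> _polygon_area(np.array([[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0]]))
+        1.0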
+ """ + if poly.shape[0] < 3: + return 0.0 + x = poly[:, 0] + y = poly[:, 1] + return 0.5 * abs(float(np.dot(x, np.roll(y, -1)) - np.dot(y, np.roll(x, -1)))) + + +def _ensure_ccw(poly: np.ndarray) -> np.ndarray: + """Ensure polygon vertices are ordered counter-clockwise.""" + if poly.shape[0] < 3: + return poly + x = poly[:, 0] + y = poly[:, 1] + signed = float(np.dot(x, np.roll(y, -1)) - np.dot(y, np.roll(x, -1))) + if signed < 0: + return poly[::-1] + return poly + + +def compute_bbox(points: np.ndarray) -> Optional[tuple[np.ndarray, np.ndarray]]: + """Compute yaw-only oriented bbox corners + extents.""" + if points.shape[0] < 3: + return None + + xy = points[:, :2].astype(np.float32) + unique_xy = np.unique(xy, axis=0) + if unique_xy.shape[0] < 3: + return None + + rect = cv2.minAreaRect(unique_xy) + (width, height) = rect[1] + if width == 0.0 or height == 0.0: + return None + + box = cv2.boxPoints(rect).astype(np.float64) + rect_xy = _ensure_ccw(box) + + zmin = float(np.min(points[:, 2])) + zmax = float(np.max(points[:, 2])) + extent = np.array([width, height, zmax - zmin], dtype=float) + z_center = zmax - extent[2] / 2.0 + half_z = extent[2] / 2.0 + + bottom = np.hstack([rect_xy, np.full((4, 1), z_center - half_z)]) + top = np.hstack([rect_xy, np.full((4, 1), z_center + half_z)]) + corners = np.vstack([bottom, top]) + return corners, extent + + +def _line_intersection(p1: np.ndarray, p2: np.ndarray, cp1: np.ndarray, cp2: np.ndarray) -> np.ndarray: + """Intersection of line p1->p2 with line cp1->cp2. + + Args: + p1: Line segment start (2,). + p2: Line segment end (2,). + cp1: Clip edge start (2,). + cp2: Clip edge end (2,). + + Returns: + Intersection point (2,). + """ + dc = cp1 - cp2 + dp = p1 - p2 + denom = dc[0] * dp[1] - dc[1] * dp[0] + if denom == 0: + return p2 + n1 = cp1[0] * cp2[1] - cp1[1] * cp2[0] + n2 = p1[0] * p2[1] - p1[1] * p2[0] + x = (n1 * dp[0] - n2 * dc[0]) / denom + y = (n1 * dp[1] - n2 * dc[1]) / denom + return np.array([x, y], dtype=np.float64) + + +def _polygon_clip(subject: np.ndarray, clip: np.ndarray) -> np.ndarray: + """Clip a polygon with a convex clip polygon (Sutherland–Hodgman). + + Args: + subject: Subject polygon vertices (N, 2). + clip: Clip polygon vertices (M, 2). + + Returns: + Clipped polygon vertices (K, 2). + """ + if subject.shape[0] == 0: + return subject + + def inside(p: np.ndarray, cp1: np.ndarray, cp2: np.ndarray) -> bool: + return (cp2[0] - cp1[0]) * (p[1] - cp1[1]) - (cp2[1] - cp1[1]) * ( + p[0] - cp1[0] + ) >= 0 + + output = subject + for i in range(len(clip)): + cp1 = clip[i] + cp2 = clip[(i + 1) % len(clip)] + input_list = output + output = [] + if len(input_list) == 0: + return np.empty((0, 2), dtype=np.float64) + s = input_list[-1] + for e in input_list: + if inside(e, cp1, cp2): + if not inside(s, cp1, cp2): + output.append(_line_intersection(s, e, cp1, cp2)) + output.append(e) + elif inside(s, cp1, cp2): + output.append(_line_intersection(s, e, cp1, cp2)) + s = e + output = np.array(output, dtype=np.float64) + return np.array(output, dtype=np.float64) + + +def compute_iou_3d_with_ratios( + bbox1: np.ndarray, bbox2: np.ndarray +) -> tuple[float, float, float]: + """Compute IoU and intersection ratios using yaw-only bbox overlap. + + Args: + bbox1: Corner points for bbox1 (N, 3). + bbox2: Corner points for bbox2 (N, 3). + + Returns: + Tuple of (iou, ratio_bbox1, ratio_bbox2). 
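+
+    Example (illustrative sketch; corners follow the bottom-then-top layout
+    produced by compute_bbox, values are hypothetical):
+
+        c1 = np.array([[0, 0, 0], [1, 0, 0], [1, 1, 0], [0, 1, 0],
+                       [0, 0, 1], [1, 0, 1], [1, 1, 1], [0, 1, 1]], dtype=float)
+        c2 = c1 + np.array([0.5, 0.0, 0.0])
+        iou, r1, r2 = compute_iou_3d_with_ratios(c1, c2)  # ~ (0.33, 0.5, 0.5)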
+ """ + if bbox1.shape[0] < 4 or bbox2.shape[0] < 4: + return 0.0, 0.0, 0.0 + + rect1 = _ensure_ccw(bbox1[:4, :2]) + rect2 = _ensure_ccw(bbox2[:4, :2]) + if rect1.shape[0] < 3 or rect2.shape[0] < 3: + return 0.0, 0.0, 0.0 + + poly_inter = _polygon_clip(rect1, rect2) + inter_area = _polygon_area(poly_inter) if poly_inter.shape[0] >= 3 else 0.0 + + zmin1, zmax1 = float(np.min(bbox1[:, 2])), float(np.max(bbox1[:, 2])) + zmin2, zmax2 = float(np.min(bbox2[:, 2])), float(np.max(bbox2[:, 2])) + height = max(0.0, min(zmax1, zmax2) - max(zmin1, zmin2)) + inter_vol = inter_area * height + + area1 = _polygon_area(rect1) + area2 = _polygon_area(rect2) + vol1 = area1 * (zmax1 - zmin1) + vol2 = area2 * (zmax2 - zmin2) + denom = vol1 + vol2 - inter_vol + + iou = inter_vol / denom if denom > 0 else 0.0 + ratio1 = inter_vol / vol1 if vol1 > 0 else 0.0 + ratio2 = inter_vol / vol2 if vol2 > 0 else 0.0 + return iou, ratio1, ratio2 + + +def find_best_match( + obs: Observation, + candidates: list[MapObject], + distance_threshold: float, + iou_threshold: float, + ratio_threshold: float = 0.4, + min_distance_merge: float = 0.25, +) -> Optional[MapObject]: + """Find best matching object by distance/IoU heuristics. + + Args: + obs: Observation to match. + candidates: Candidate map objects. + distance_threshold: Max centroid distance to consider. + iou_threshold: Min IoU to accept a merge. + ratio_threshold: Min overlap ratio to accept a merge. + min_distance_merge: Absolute distance threshold for merge. + + Returns: + Best matching object or None if no match passes thresholds. + """ + if obs.points.shape[0] == 0: + return None + + obs_center = compute_centroid(obs.points) + obs_bbox = compute_bbox(obs.points) + obs_corners = obs_bbox[0] if obs_bbox is not None else None + obs_extent = obs_bbox[1] if obs_bbox is not None else None + + best_candidate: Optional[MapObject] = None + best_dist = float("inf") + + for candidate in candidates: + if obs.track_id in candidate.track_ids: + return candidate + + cand_center = candidate.centroid + if cand_center is None: + continue + + dist = float(np.linalg.norm(obs_center - cand_center)) + if dist > distance_threshold: + continue + + cand_points = candidate.voxel_manager.get_valid_voxels( + diversity_percentile=candidate._diversity_percentile, + use_regularized=True, + ) + if len(cand_points) == 0: + cand_points = candidate.voxel_manager.get_valid_voxels( + diversity_percentile=candidate._diversity_percentile, + use_regularized=False, + ) + cand_bbox = compute_bbox(cand_points) if len(cand_points) >= 3 else None + cand_corners = cand_bbox[0] if cand_bbox is not None else None + cand_extent = cand_bbox[1] if cand_bbox is not None else None + + dist_merge = dist < min_distance_merge + if obs_extent is not None and cand_extent is not None: + dist_thresh = np.linalg.norm((obs_extent / 2 + cand_extent / 2) / 2) * 0.5 + dist_merge = dist_merge or dist < dist_thresh + + if dist_merge: + if dist < best_dist: + best_candidate = candidate + best_dist = dist + continue + + if obs_corners is None or cand_corners is None: + continue + + iou, ratio_obs, ratio_cand = compute_iou_3d_with_ratios( + obs_corners, cand_corners + ) + + if iou > iou_threshold or ratio_obs > ratio_threshold or ratio_cand > ratio_threshold: + if dist < best_dist: + best_candidate = candidate + best_dist = dist + + return best_candidate diff --git a/semantic_mapping/semantic_mapping/utils/storage.py b/semantic_mapping/semantic_mapping/utils/storage.py new file mode 100644 index 0000000..a3089f4 --- /dev/null +++ 
b/semantic_mapping/semantic_mapping/utils/storage.py @@ -0,0 +1,157 @@ +"""Async image saving queue for semantic mapping.""" + +from __future__ import annotations + +from pathlib import Path +import os +import shutil +import queue +import threading +import time + +import cv2 +import numpy as np + + +def _reset_output_dir(output_dir: Path) -> None: + """Clear an output directory if it contains existing files.""" + if output_dir.exists() and any(output_dir.iterdir()): + shutil.rmtree(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + +class ImageSaveQueue: + """Background worker for saving images to disk.""" + + def __init__( + self, + output_dir: Path, + max_queue_size: int = 100, + save_png: bool = False, + save_transform: bool = False, + clear_output_dir: bool = True, + use_timestamp_dir: bool = False, + ) -> None: + """Initialize the image save queue. + + Args: + output_dir: Base directory where images are stored. + max_queue_size: Maximum number of queued save jobs. + save_png: If True, also save a .png copy of the image. + save_transform: If True, save a viewpoint transform when provided. + clear_output_dir: If True, clear existing contents on startup. + use_timestamp_dir: If True, write into a timestamped subdirectory. + """ + base_dir = Path(output_dir) + if clear_output_dir: + _reset_output_dir(base_dir) + else: + base_dir.mkdir(parents=True, exist_ok=True) + if use_timestamp_dir: + run_dir = base_dir / f"{time.time():.2f}" + run_dir.mkdir(parents=True, exist_ok=True) + self._output_dir = run_dir + else: + self._output_dir = base_dir + self._save_png = save_png + self._save_transform = save_transform + self._queue: queue.Queue = queue.Queue(maxsize=max_queue_size) + self._stop_event = threading.Event() + self._thread: threading.Thread | None = None + + def start(self) -> None: + """Start the background worker.""" + if self._thread is not None and self._thread.is_alive(): + return + self._stop_event.clear() + self._thread = threading.Thread(target=self._run, daemon=True) + self._thread.start() + + def stop(self) -> None: + """Stop the background worker.""" + self._stop_event.set() + try: + self._queue.put_nowait(None) + except queue.Full: + pass + if self._thread is not None: + self._thread.join(timeout=2.0) + self._thread = None + + def queue_save( + self, object_id: str, image: np.ndarray, mask: np.ndarray | None = None + ) -> str: + """Queue an image (and optional mask) for saving. + + Args: + object_id: Object identifier to embed in filename. + image: Image array to save. + mask: Optional mask array to save alongside the image. + + Returns: + Path to the saved image (.npy). 
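+
+        Example (illustrative usage; the directory and object id are hypothetical):
+
+            saver = ImageSaveQueue(Path("output/object_images"))
+            saver.start()
+            path = saver.queue_save("42", image=np.zeros((64, 64, 3), np.uint8))
+            saver.stop()  # signals the worker to exit and joins it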
+ """ + timestamp_ms = int(time.time() * 1000) + image_path = self._output_dir / f"{object_id}_{timestamp_ms}.npy" + mask_path = self._output_dir / f"{object_id}_{timestamp_ms}_mask.npy" + self._queue.put(("save_object", str(image_path), str(mask_path), image, mask)) + return str(image_path) + + def queue_viewpoint_save( + self, viewpoint_id: int, image: np.ndarray, transform: np.ndarray | None + ) -> None: + """Queue a viewpoint snapshot for saving.""" + if not self._save_png and not self._save_transform: + return + image_path = self._output_dir / f"viewpoint_{viewpoint_id}.png" + transform_path = self._output_dir / f"viewpoint_{viewpoint_id}_transform.npy" + try: + self._queue.put_nowait( + ("save_viewpoint", str(image_path), str(transform_path), image, transform) + ) + except queue.Full: + return + + def _run(self) -> None: + while not self._stop_event.is_set(): + item = self._queue.get() + if item is None: + self._queue.task_done() + break + op, image_path, aux_path, image, aux = item + if op == "save_object": + try: + image_target = Path(image_path) + image_tmp = image_target.with_suffix(image_target.suffix + ".tmp") + with image_tmp.open("wb") as handle: + np.save(handle, image) + os.replace(image_tmp, image_target) + if aux is not None: + aux_target = Path(aux_path) + aux_tmp = aux_target.with_suffix(aux_target.suffix + ".tmp") + with aux_tmp.open("wb") as handle: + np.save(handle, aux) + os.replace(aux_tmp, aux_target) + if self._save_png: + png_path = image_target.with_suffix(".png") + png_tmp = png_path.with_name(png_path.stem + ".tmp.png") + cv2.imwrite(str(png_tmp), image) + os.replace(png_tmp, png_path) + except Exception: + pass + elif op == "save_viewpoint": + try: + if self._save_transform and aux is not None: + aux_target = Path(aux_path) + aux_tmp = aux_target.with_suffix(aux_target.suffix + ".tmp") + with aux_tmp.open("wb") as handle: + np.save(handle, aux) + os.replace(aux_tmp, aux_target) + if self._save_png: + image_target = Path(image_path) + png_tmp = image_target.with_name(image_target.stem + ".tmp.png") + cv2.imwrite(str(png_tmp), image) + os.replace(png_tmp, image_target) + except Exception: + pass + self._queue.task_done() diff --git a/semantic_mapping/semantic_mapping/utils/utils.py b/semantic_mapping/semantic_mapping/utils/utils.py new file mode 100644 index 0000000..2a4e980 --- /dev/null +++ b/semantic_mapping/semantic_mapping/utils/utils.py @@ -0,0 +1,234 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +import json +import time +from typing import Any + +import cv2 +import numpy as np +import yaml + + +@dataclass(frozen=True) +class CameraParams: + image_width: int = 1920 + image_height: int = 640 + hfov: float = 360.0 + vfov: float = 120.0 + sensor_to_camera: np.ndarray | None = None + + +def load_camera_params(path: str, base_dir: Path | None = None) -> CameraParams | None: + config_path = Path(path) + if not config_path.is_absolute() and base_dir is not None: + config_path = base_dir / config_path + if not config_path.exists(): + return None + data = yaml.safe_load(config_path.read_text()) or {} + sensor_to_camera = _load_sensor_to_camera_matrix(data) + return CameraParams( + image_width=int(data.get("image_width", 1920)), + image_height=int(data.get("image_height", 640)), + hfov=float(data.get("hfov", 360.0)), + vfov=float(data.get("vfov", 120.0)), + sensor_to_camera=sensor_to_camera, + ) + + +def _load_sensor_to_camera_matrix(data: dict[str, Any]) -> np.ndarray | None: + tf_data = 
data.get("sensor_to_camera", {}) + trans = tf_data.get("translation", {}) + rpy = tf_data.get("rotation_rpy", {}) + cam_offset = np.array( + [ + float(trans.get("x", 0.0)), + float(trans.get("y", 0.0)), + float(trans.get("z", 0.0)), + ] + ) + roll = float(rpy.get("roll", 0.0)) + pitch = float(rpy.get("pitch", 0.0)) + yaw = float(rpy.get("yaw", 0.0)) + + r_x = np.array( + [[1, 0, 0], [0, np.cos(roll), -np.sin(roll)], [0, np.sin(roll), np.cos(roll)]] + ) + r_y = np.array( + [[np.cos(pitch), 0, np.sin(pitch)], [0, 1, 0], [-np.sin(pitch), 0, np.cos(pitch)]] + ) + r_z = np.array( + [[np.cos(yaw), -np.sin(yaw), 0], [np.sin(yaw), np.cos(yaw), 0], [0, 0, 1]] + ) + cam_rotation = r_z @ r_y @ r_x + rotation = cam_rotation.T + translation = -rotation @ cam_offset + transform = np.eye(4) + transform[:3, :3] = rotation + transform[:3, 3] = translation + return transform + + +def parse_json_dict(text: str) -> dict[str, Any]: + if not text: + return {} + cleaned = text.strip() + if "```" in cleaned: + parts = cleaned.split("```") + if len(parts) >= 3: + cleaned = parts[1].strip() + start = cleaned.find("{") + end = cleaned.rfind("}") + if start != -1 and end != -1: + cleaned = cleaned[start : end + 1] + try: + parsed = json.loads(cleaned) + except json.JSONDecodeError: + return {} + return parsed if isinstance(parsed, dict) else {} + + +def load_npy_image(path_str: str, retries: int = 3, retry_delay_s: float = 0.05) -> np.ndarray | None: + path = Path(path_str) + if not path.exists(): + return None + for attempt in range(max(1, retries)): + try: + return np.load(str(path)) + except Exception: + if attempt >= retries - 1: + return None + time.sleep(retry_delay_s) + + +def annotate_mask(image: np.ndarray, image_path: str, iterations: int = 2) -> np.ndarray: + mask_path = image_path.replace(".npy", "_mask.npy") + if not Path(mask_path).exists(): + return image + mask = np.load(mask_path) + mask = cv2.dilate(mask.astype(np.uint8), np.ones((3, 3), np.uint8), iterations=iterations) + mask_uint8 = mask.astype(np.uint8) + contours, _ = cv2.findContours(mask_uint8, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + annotated = image.copy() + cv2.drawContours(annotated, contours, -1, (0, 255, 0), 2) + return annotated + + +def concat_images(images: list[np.ndarray]) -> np.ndarray: + if not images: + return np.zeros((32, 32, 3), dtype=np.uint8) + heights = [img.shape[0] for img in images] + target_h = min(heights) + resized = [] + for img in images: + if img.shape[0] != target_h: + scale = target_h / img.shape[0] + width = max(1, int(img.shape[1] * scale)) + img = cv2.resize(img, (width, target_h), interpolation=cv2.INTER_AREA) + resized.append(img) + return cv2.cvtColor(cv2.hconcat(resized), cv2.COLOR_BGR2RGB) + + +def interpolate_odom( + odom_buffer: list[tuple[float, dict]], + detection_stamp: float, + linear_bias: float, + angular_bias: float, +) -> dict | None: + target_stamp = detection_stamp + linear_bias + angular_stamp = detection_stamp + angular_bias + stamps = [s for s, _ in odom_buffer] + if len(stamps) < 2: + return None + if target_stamp < stamps[0] or target_stamp > stamps[-1]: + return None + if angular_stamp < stamps[0] or angular_stamp > stamps[-1]: + return None + + right_idx = next(i for i, s in enumerate(stamps) if s >= target_stamp) + left_idx = max(right_idx - 1, 0) + angular_right_idx = next(i for i, s in enumerate(stamps) if s >= angular_stamp) + angular_left_idx = max(angular_right_idx - 1, 0) + + left_stamp, left_odom = odom_buffer[left_idx] + right_stamp, right_odom = odom_buffer[right_idx] + 
angular_left_stamp, angular_left_odom = odom_buffer[angular_left_idx] + angular_right_stamp, angular_right_odom = odom_buffer[angular_right_idx] + denom = right_stamp - left_stamp + ratio = (target_stamp - left_stamp) / denom if denom > 0 else 0.5 + angular_denom = angular_right_stamp - angular_left_stamp + angular_ratio = (angular_stamp - angular_left_stamp) / angular_denom if angular_denom > 0 else 0.5 + + return { + "position": right_odom["position"] * ratio + left_odom["position"] * (1 - ratio), + "linear_velocity": right_odom["linear_velocity"] * ratio + + left_odom["linear_velocity"] * (1 - ratio), + "orientation": _slerp_quat(angular_left_odom["orientation"], angular_right_odom["orientation"], angular_ratio), + "angular_velocity": angular_right_odom["angular_velocity"] * angular_ratio + + angular_left_odom["angular_velocity"] * (1 - angular_ratio), + } + + +def collect_cloud_window( + cloud_buffer: list[tuple[float, np.ndarray]], + detection_stamp: float, + window_before: float, + window_after: float, +) -> np.ndarray | None: + start = detection_stamp - window_before + end = detection_stamp + window_after + clouds = [pts for stamp, pts in cloud_buffer if start <= stamp <= end] + if len(clouds) == 0: + return None + return np.concatenate(clouds, axis=0) + + +def compute_ground_z_threshold( + points: np.ndarray, + robot_xy: np.ndarray, + ground_filter: bool, + ground_min_points: int, + ground_radius: float, + ground_percentile: float, + ground_z_clearance: float, +) -> float | None: + if not ground_filter: + return None + if points.shape[0] < ground_min_points: + return None + if ground_radius <= 0.0: + return None + if not 0.0 < ground_percentile < 1.0: + return None + + diffs = points[:, :2] - robot_xy + dist2 = diffs[:, 0] ** 2 + diffs[:, 1] ** 2 + radius2 = ground_radius**2 + mask = dist2 <= radius2 + if np.count_nonzero(mask) < ground_min_points: + return None + + z_vals = points[mask, 2] + z_thresh = float(np.percentile(z_vals, ground_percentile * 100.0)) + return z_thresh + ground_z_clearance + + +def _slerp_quat(q0: np.ndarray, q1: np.ndarray, ratio: float) -> np.ndarray: + """Slerp between two quaternions without scipy dependency.""" + q0 = np.asarray(q0, dtype=float) + q1 = np.asarray(q1, dtype=float) + dot = float(np.dot(q0, q1)) + if dot < 0.0: + q1 = -q1 + dot = -dot + if dot > 0.9995: + result = q0 + ratio * (q1 - q0) + return result / np.linalg.norm(result) + theta_0 = np.arccos(dot) + sin_theta_0 = np.sin(theta_0) + theta = theta_0 * ratio + sin_theta = np.sin(theta) + s0 = np.cos(theta) - dot * sin_theta / sin_theta_0 + s1 = sin_theta / sin_theta_0 + return s0 * q0 + s1 * q1 diff --git a/semantic_mapping/semantic_mapping/vlm_reasoning_node.py b/semantic_mapping/semantic_mapping/vlm_reasoning_node.py new file mode 100644 index 0000000..9d717f8 --- /dev/null +++ b/semantic_mapping/semantic_mapping/vlm_reasoning_node.py @@ -0,0 +1,556 @@ +"""ROS2 node for VLM reasoning tasks in semantic mapping.""" + +from __future__ import annotations + +from collections import deque +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass +from pathlib import Path +import threading +import time + +import cv2 +import numpy as np +import rclpy +from cv_bridge import CvBridge +from rclpy.node import Node +from std_msgs.msg import String +import yaml + +from semantic_mapping.msg import ObjectType, RoomType, TargetObjectInstruction, TargetObjectWithSpatial +from vector_perception_utils.pointcloud_utils import project_bbox3d_equirect +from vlm.models import 
MoondreamConfig, MoondreamVlm, QwenConfig, QwenVlm, VlmModelBase + +from .prompts import ( + build_object_type_prompt, + build_room_type_prompt, + INSTRUCTION_DECOMPOSITION_PROMPT, + TARGET_OBJECT_SPATIAL_PROMPT, +) +from .utils.utils import ( + annotate_mask, + concat_images, + load_camera_params, + load_npy_image, + parse_json_dict, +) + + +@dataclass +class InstructionState: + instruction: str = "" + target_object: str = "" + room_condition: str = "" + spatial_condition: str = "" + attribute_condition: str = "" + anchor_object: str = "" + attribute_condition_anchor: str = "" + + +class VlmReasoningNode(Node): + """VLM reasoning node that answers object queries for semantic mapping.""" + + def __init__(self) -> None: + super().__init__("vlm_reasoning_node") + + self.declare_parameter("vlm_provider", "qwen") + self.declare_parameter("vlm_model", "qwen2.5-vl-72b-instruct") + self.declare_parameter("objects_yaml_path", "detect_anything/config/objects.yaml") + self.declare_parameter("object_type_query_topic", "/object_type_query") + self.declare_parameter("object_type_answer_topic", "/object_type_answer") + self.declare_parameter("target_object_spatial_query_topic", "/target_object_spatial_query") + self.declare_parameter("target_object_answer_topic", "/target_object_answer") + self.declare_parameter("target_object_instruction_topic", "/target_object_instruction") + self.declare_parameter("instruction_topic", "/keyboard_input") + self.declare_parameter("room_type_query_topic", "/room_type_query") + self.declare_parameter("room_type_answer_topic", "/room_type_answer") + self.declare_parameter("room_type_cache_s", 10.0) + self.declare_parameter("viewpoint_output_dir", "output/viewpoint_images") + self.declare_parameter("dump_object_type_images", False) + self.declare_parameter("object_type_dump_dir", "output/vlm_object_type") + self.declare_parameter("dump_room_type_images", False) + self.declare_parameter("room_type_dump_dir", "output/vlm_room_type") + self.declare_parameter("camera_param_path", "config/camera_param.yaml") + self.declare_parameter("platform", "mecanum") + self.declare_parameter("max_viewpoints", 3) + self.declare_parameter( + "room_types", + [ + "Classroom", + "Laboratory", + "Office Room", + "Meeting Room", + "Restroom", + "Storage Room", + "Copy Room", + "Lounge", + "Reception", + "Hallway", + ], + ) + + self._vlm = self._build_vlm() + self._objects = self._load_object_labels() + self._viewpoint_dir = Path(str(self.get_parameter("viewpoint_output_dir").value)) + self._platform = str(self.get_parameter("platform").value) + self._max_viewpoints = int(self.get_parameter("max_viewpoints").value) + camera_param_path = str(self.get_parameter("camera_param_path").value) + self._camera_params = load_camera_params( + camera_param_path, base_dir=Path(__file__).resolve().parent.parent + ) + room_types_value = self.get_parameter("room_types").value + if isinstance(room_types_value, (list, tuple)): + self._room_types = [str(item) for item in room_types_value] + else: + self._room_types = [] + self._instruction_state = InstructionState() + self._bridge = CvBridge() + self._dump_object_type_images = bool( + self.get_parameter("dump_object_type_images").value + ) + self._object_type_dump_dir = Path( + str(self.get_parameter("object_type_dump_dir").value) + ) + if self._dump_object_type_images: + self._object_type_dump_dir.mkdir(parents=True, exist_ok=True) + self._dump_room_type_images = bool(self.get_parameter("dump_room_type_images").value) + self._room_type_dump_dir = 
Path(str(self.get_parameter("room_type_dump_dir").value)) + if self._dump_room_type_images: + self._room_type_dump_dir.mkdir(parents=True, exist_ok=True) + + self._lock = threading.Lock() + self._instruction_queue: deque[str] = deque(maxlen=20) + self._object_type_queries: dict[int, ObjectType] = {} + self._room_type_queries: dict[int, RoomType] = {} + self._spatial_queries: dict[int, TargetObjectWithSpatial] = {} + + object_type_query_topic = str(self.get_parameter("object_type_query_topic").value) + object_type_answer_topic = str(self.get_parameter("object_type_answer_topic").value) + target_object_spatial_query_topic = str( + self.get_parameter("target_object_spatial_query_topic").value + ) + target_object_answer_topic = str(self.get_parameter("target_object_answer_topic").value) + target_object_instruction_topic = str( + self.get_parameter("target_object_instruction_topic").value + ) + instruction_topic = str(self.get_parameter("instruction_topic").value) + room_type_query_topic = str(self.get_parameter("room_type_query_topic").value) + room_type_answer_topic = str(self.get_parameter("room_type_answer_topic").value) + + self.create_subscription(ObjectType, object_type_query_topic, self._object_type_cb, 50) + self.create_subscription( + TargetObjectWithSpatial, + target_object_spatial_query_topic, + self._target_object_spatial_cb, + 10, + ) + self.create_subscription(String, instruction_topic, self._instruction_cb, 10) + self.create_subscription(RoomType, room_type_query_topic, self._room_type_cb, 10) + + self._object_type_pub = self.create_publisher(ObjectType, object_type_answer_topic, 50) + self._target_object_pub = self.create_publisher( + TargetObjectWithSpatial, target_object_answer_topic, 10 + ) + self._target_object_instruction_pub = self.create_publisher( + TargetObjectInstruction, target_object_instruction_topic, 10 + ) + self._room_type_pub = self.create_publisher(RoomType, room_type_answer_topic, 10) + + self._executor = ThreadPoolExecutor(max_workers=4) + self.create_timer(0.1, self._process_queues) + + self._room_type_labels: dict[int, RoomType] = {} + self._room_type_cache_s = float(self.get_parameter("room_type_cache_s").value) + self._room_type_last_query_ts: dict[int, float] = {} + self._last_log_times: dict[str, float] = {} + self.get_logger().debug( + f"VLM reasoning node ready (room_type_query={room_type_query_topic}, " + f"room_type_answer={room_type_answer_topic}, " + f"object_type_query={object_type_query_topic}, " + f"object_type_answer={object_type_answer_topic}, " + f"target_object_spatial_query={target_object_spatial_query_topic}, " + f"target_object_answer={target_object_answer_topic})" + ) + + def destroy_node(self) -> bool: + self._executor.shutdown(wait=False) + return super().destroy_node() + + def _build_vlm(self) -> VlmModelBase: + provider = str(self.get_parameter("vlm_provider").value).lower() + model_name = str(self.get_parameter("vlm_model").value) + if provider == "qwen": + return QwenVlm(QwenConfig(model_name=model_name)) + if provider == "moondream": + return MoondreamVlm(MoondreamConfig()) + raise ValueError(f"Unknown VLM provider: {provider}") + + def _load_object_labels(self) -> list[str]: + path = Path(str(self.get_parameter("objects_yaml_path").value)) + if not path.exists(): + self.get_logger().warning(f"objects.yaml not found: {path}") + return [] + with path.open("r", encoding="utf-8") as handle: + data = yaml.safe_load(handle) + if isinstance(data, list): + return [str(item) for item in data] + return [] + + def _instruction_cb(self, 
msg: String) -> None: + self._instruction_queue.append(msg.data) + + def _room_type_cb(self, msg: RoomType) -> None: + self._room_type_queries[msg.room_id] = msg + + def _object_type_cb(self, msg: ObjectType) -> None: + self._object_type_queries[msg.object_id] = msg + + def _target_object_spatial_cb(self, msg: TargetObjectWithSpatial) -> None: + self._spatial_queries[msg.object_id] = msg + + def _process_queues(self) -> None: + if self._instruction_queue: + instruction = self._instruction_queue.pop() + self._instruction_queue.clear() + self._submit_task("instruction", self._process_instruction, instruction) + + if self._room_type_queries: + items = list(self._room_type_queries.values()) + self._room_type_queries.clear() + for item in items: + self._submit_task("room_type", self._process_room_type, item) + + if self._object_type_queries: + items = list(self._object_type_queries.values()) + self._object_type_queries.clear() + for item in items: + self._submit_task("object_type", self._process_object_type, item) + + if self._spatial_queries: + items = list(self._spatial_queries.values()) + self._spatial_queries.clear() + for item in items: + self._submit_task("target_spatial", self._process_target_object_spatial, item) + + def _process_instruction(self, instruction: str) -> None: + prompt = INSTRUCTION_DECOMPOSITION_PROMPT + f"\nInstruction: {instruction}" + blank = np.zeros((32, 32, 3), dtype=np.uint8) + response = self._vlm.query(blank, prompt) + parsed = parse_json_dict(response) + if not parsed: + self.get_logger().warning(f"Failed to parse instruction JSON: {response}") + return + + state = InstructionState( + instruction=instruction, + target_object=str(parsed.get("target_object", "")).lower(), + room_condition=str(parsed.get("room_condition", "")).lower(), + spatial_condition=str(parsed.get("spatial_condition", "")).lower(), + attribute_condition=str(parsed.get("attribute_condition", "")).lower(), + anchor_object=str(parsed.get("anchor_object", "")).lower(), + attribute_condition_anchor=str(parsed.get("attribute_condition_anchor", "")).lower(), + ) + with self._lock: + self._instruction_state = state + + msg = TargetObjectInstruction() + msg.target_object = state.target_object + msg.room_condition = state.room_condition + msg.spatial_condition = state.spatial_condition + msg.attribute_condition = state.attribute_condition + msg.anchor_object = state.anchor_object + msg.attribute_condition_anchor = state.attribute_condition_anchor + self._target_object_instruction_pub.publish(msg) + + def _process_room_type(self, msg: RoomType) -> None: + if not msg.image.data or not msg.room_mask.data: + self.get_logger().warning(f"Room type query missing image or mask: {msg.room_id}") + return + cached = self._room_type_labels.get(msg.room_id) + if cached and self._room_type_cache_s > 0: + last_query = self._room_type_last_query_ts.get(msg.room_id, 0.0) + if time.time() - last_query < self._room_type_cache_s: + answer = RoomType() + answer.header = msg.header + answer.anchor_point = msg.anchor_point + answer.image = msg.image + answer.room_mask = msg.room_mask + answer.room_type = cached.room_type + answer.room_id = msg.room_id + answer.voxel_num = msg.voxel_num + answer.in_room = msg.in_room + self._room_type_pub.publish(answer) + with self._lock: + self._room_type_labels[answer.room_id] = answer + self._log_throttled( + "room_type_cache", + "Room type cached: room_id=%s type=%s", + answer.room_id, + answer.room_type, + interval_s=5.0, + ) + return + image = self._bridge.imgmsg_to_cv2(msg.image, 
desired_encoding="bgr8") + mask = self._bridge.imgmsg_to_cv2(msg.room_mask, desired_encoding="mono8") + mosaic = self._build_room_type_mosaic(image, mask) + if self._dump_room_type_images: + timestamp_ms = int(time.time() * 1000) + base = self._room_type_dump_dir / f"room_{msg.room_id}_{timestamp_ms}" + mask_uint8 = mask.astype(np.uint8) + if mask_uint8.max() <= 1: + mask_uint8 = (mask_uint8 * 255).astype(np.uint8) + cv2.imwrite(str(base.parent / f"{base.name}_image.png"), image) + cv2.imwrite(str(base.parent / f"{base.name}_mask.png"), mask_uint8) + cv2.imwrite( + str(base.parent / f"{base.name}_mosaic.png"), + cv2.cvtColor(mosaic, cv2.COLOR_RGB2BGR), + ) + + room_types = list(self._room_types) + if not msg.in_room and "Corridor" in room_types: + room_types = [room for room in room_types if room != "Corridor"] + prompt = build_room_type_prompt(room_types) + start_time = time.time() + try: + response = self._vlm.query(mosaic, prompt) + except Exception as exc: + self.get_logger().error(f"Room type query failed: {exc}") + return + self._room_type_last_query_ts[msg.room_id] = time.time() + duration_s = time.time() - start_time + parsed = parse_json_dict(response) + if parsed: + room_type = str(parsed.get("room_type", "")).strip() + else: + room_type = response.strip().strip('"') + self._log_throttled( + "room_type_parse", + "Room type parse failed: room_id=%s response=%s", + msg.room_id, + self._preview_response(response), + interval_s=5.0, + ) + answer = RoomType() + answer.header = msg.header + answer.anchor_point = msg.anchor_point + answer.image = msg.image + answer.room_mask = msg.room_mask + answer.room_type = room_type.lower() + answer.room_id = msg.room_id + answer.voxel_num = msg.voxel_num + answer.in_room = msg.in_room + self._room_type_pub.publish(answer) + with self._lock: + self._room_type_labels[answer.room_id] = answer + self._log_throttled( + "room_type_answer", + "Room type answer: room_id=%s type=%s (%.2fs)", + answer.room_id, + answer.room_type, + duration_s, + interval_s=2.0, + ) + + def _process_object_type(self, msg: ObjectType) -> None: + image = load_npy_image(msg.img_path) + if image is None: + self.get_logger().warning(f"Failed to load image: {msg.img_path}") + return + image = annotate_mask(image, msg.img_path) + image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + if self._dump_object_type_images: + timestamp_ms = int(time.time() * 1000) + dump_path = self._object_type_dump_dir / f"{msg.object_id}_{timestamp_ms}.png" + cv2.imwrite(str(dump_path), cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR)) + + labels = list(msg.labels) if msg.labels else self._objects + prompt = build_object_type_prompt(labels) + start_time = time.time() + try: + response = self._vlm.query(image_rgb, prompt) + except Exception as exc: + self.get_logger().error(f"Object type query failed: {exc}") + return + duration_s = time.time() - start_time + parsed = parse_json_dict(response) + if parsed: + final_label = str(parsed.get("label", labels[0] if labels else "unknown")).lower() + else: + final_label = str(labels[0] if labels else "unknown").lower() + self._log_throttled( + "object_type_parse", + "Object type parse failed: object_id=%s response=%s", + msg.object_id, + self._preview_response(response), + interval_s=5.0, + ) + + answer = ObjectType() + answer.object_id = msg.object_id + answer.img_path = msg.img_path + answer.final_label = final_label + answer.labels = msg.labels + self._object_type_pub.publish(answer) + self._log_throttled( + "object_type_answer", + "Object type answer: object_id=%s 
label=%s (%.2fs)", + answer.object_id, + answer.final_label, + duration_s, + interval_s=2.0, + ) + + @staticmethod + def _build_room_type_mosaic(image: np.ndarray, mask: np.ndarray) -> np.ndarray: + if image.ndim == 2: + image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR) + if mask.ndim == 3 and mask.shape[2] == 1: + mask = mask[:, :, 0] + if mask.ndim == 2: + mask_uint8 = mask.astype(np.uint8) + if mask_uint8.max() <= 1: + mask_uint8 = (mask_uint8 * 255).astype(np.uint8) + else: + mask_uint8 = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY) + + nonzero_ratio = float(np.count_nonzero(mask_uint8)) / float(mask_uint8.size) + if nonzero_ratio < 0.005: + return cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + + mask_bgr = cv2.cvtColor(mask_uint8, cv2.COLOR_GRAY2BGR) + target_h = image.shape[0] + if mask_bgr.shape[0] != target_h: + scale = target_h / mask_bgr.shape[0] + new_w = max(1, int(mask_bgr.shape[1] * scale)) + mask_bgr = cv2.resize(mask_bgr, (new_w, target_h), interpolation=cv2.INTER_NEAREST) + mosaic = cv2.hconcat([image, mask_bgr]) + return cv2.cvtColor(mosaic, cv2.COLOR_BGR2RGB) + + def _process_target_object_spatial(self, msg: TargetObjectWithSpatial) -> None: + image = load_npy_image(msg.img_path) + if image is None: + self.get_logger().warning(f"Failed to load image: {msg.img_path}") + return + image = annotate_mask(image, msg.img_path) + + prompt = self._build_target_spatial_prompt(msg) + mosaic = self._build_spatial_mosaic(image, msg) + try: + response = self._vlm.query(mosaic, prompt) + except Exception as exc: + self.get_logger().error(f"Target spatial query failed: {exc}") + return + parsed = parse_json_dict(response) + is_target = bool(parsed.get("is_target", False)) + + answer = TargetObjectWithSpatial() + answer.header = msg.header + answer.object_id = msg.object_id + answer.object_label = msg.object_label + answer.img_path = msg.img_path + answer.viewpoint_ids = msg.viewpoint_ids + answer.room_label = msg.room_label + answer.is_target = is_target + answer.bbox3d = msg.bbox3d + self._target_object_pub.publish(answer) + self._log_throttled( + "target_spatial_answer", + "Target spatial answer: object_id=%s is_target=%s", + answer.object_id, + answer.is_target, + ) + + def _log_throttled(self, key: str, msg: str, *args: object, interval_s: float = 5.0) -> None: + now = time.time() + last = self._last_log_times.get(key, 0.0) + if now - last < interval_s: + return + self._last_log_times[key] = now + formatted = msg % args if args else msg + self.get_logger().debug(formatted) + + @staticmethod + def _preview_response(response: object, limit: int = 160) -> str: + text = response if isinstance(response, str) else str(response) + text = text.replace("\n", " ").strip() + if len(text) > limit: + text = f"{text[:limit]}..." 
+ return text + + def _submit_task(self, name: str, func: callable, *args: object) -> None: + future = self._executor.submit(self._run_task, name, func, *args) + future.add_done_callback(self._task_done) + + def _run_task(self, name: str, func: callable, *args: object) -> None: + try: + func(*args) + except Exception as exc: + self.get_logger().error(f"VLM task {name} failed: {exc}") + + def _task_done(self, future) -> None: + _ = future + + def _build_target_spatial_prompt(self, msg: TargetObjectWithSpatial) -> str: + with self._lock: + state = self._instruction_state + instruction = state.instruction or "" + description = f"Object: {msg.object_label}, Room: {msg.room_label or 'unknown'}" + detail = ( + f"Target: {state.target_object}, Room Condition: {state.room_condition or 'none'}, " + f"Spatial Condition: {state.spatial_condition or 'none'}, " + f"Attribute Condition: {state.attribute_condition or 'none'}" + ) + return f"{TARGET_OBJECT_SPATIAL_PROMPT}\n{instruction}\n{detail}\n{description}" + + def _build_spatial_mosaic(self, image: np.ndarray, msg: TargetObjectWithSpatial) -> np.ndarray: + images = [image] + if len(msg.viewpoint_ids) > 0: + bbox3d = np.array([[p.x, p.y, p.z] for p in msg.bbox3d], dtype=float) + for vp_id in list(msg.viewpoint_ids)[: self._max_viewpoints]: + vp_image = self._load_viewpoint_image(vp_id) + if vp_image is None: + continue + transform = self._load_viewpoint_transform(vp_id) + if transform is not None and self._camera_params is not None: + vp_image = project_bbox3d_equirect( + image=vp_image, + bbox3d=bbox3d, + body_to_world=transform, + sensor_to_camera=self._camera_params.sensor_to_camera, + image_width=self._camera_params.image_width, + image_height=self._camera_params.image_height, + hfov=self._camera_params.hfov, + vfov=self._camera_params.vfov, + ) + images.append(vp_image) + return concat_images(images) + + def _load_viewpoint_image(self, viewpoint_id: int) -> np.ndarray | None: + image_path = self._viewpoint_dir / f"viewpoint_{viewpoint_id}.png" + if not image_path.exists(): + return None + return cv2.imread(str(image_path)) + + def _load_viewpoint_transform(self, viewpoint_id: int) -> np.ndarray | None: + transform_path = self._viewpoint_dir / f"viewpoint_{viewpoint_id}_transform.npy" + if not transform_path.exists(): + return None + return np.load(str(transform_path)) + + + + + + +def main() -> None: + rclpy.init() + node = VlmReasoningNode() + try: + rclpy.spin(node) + finally: + node.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/sensor_coverage/CMakeLists.txt b/sensor_coverage/CMakeLists.txt new file mode 100644 index 0000000..ed2cab6 --- /dev/null +++ b/sensor_coverage/CMakeLists.txt @@ -0,0 +1,68 @@ +cmake_minimum_required(VERSION 3.10) +project(sensor_coverage) + +find_package(ament_cmake REQUIRED) +find_package(rclcpp REQUIRED) +find_package(sensor_msgs REQUIRED) +find_package(nav_msgs REQUIRED) +find_package(geometry_msgs REQUIRED) +find_package(visualization_msgs REQUIRED) +find_package(std_msgs REQUIRED) +find_package(tf2 REQUIRED) +find_package(tf2_ros REQUIRED) +find_package(cv_bridge REQUIRED) +find_package(OpenCV REQUIRED) +find_package(Eigen3 REQUIRED) +find_package(PCL REQUIRED COMPONENTS common io filters kdtree segmentation features) +find_package(pcl_conversions REQUIRED) +find_package(semantic_mapping REQUIRED) + +add_executable(room_segmentation_node + src/room_segmentation/room_segmentation.cpp + src/room_segmentation/room_node_rep.cpp + src/utils/misc_utils.cpp +) + 
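+# Include paths for the room segmentation target: this package's headers plus PCL.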
+target_include_directories(room_segmentation_node + PUBLIC + $ + $ + ${PCL_INCLUDE_DIRS} +) + +target_link_libraries(room_segmentation_node + ${PCL_LIBRARIES} + ${OpenCV_LIBRARIES} +) + +ament_target_dependencies(room_segmentation_node + rclcpp + sensor_msgs + nav_msgs + geometry_msgs + visualization_msgs + std_msgs + tf2 + tf2_ros + cv_bridge + pcl_conversions + semantic_mapping +) + +install(TARGETS room_segmentation_node + DESTINATION lib/${PROJECT_NAME} +) + +install(DIRECTORY include/ + DESTINATION include +) + +install(DIRECTORY config + DESTINATION share/${PROJECT_NAME} +) + +install(FILES resource/${PROJECT_NAME} + DESTINATION share/${PROJECT_NAME}/resource +) + +ament_package() diff --git a/sensor_coverage/config/room_segmentation_config.yaml b/sensor_coverage/config/room_segmentation_config.yaml new file mode 100644 index 0000000..89c4ba4 --- /dev/null +++ b/sensor_coverage/config/room_segmentation_config.yaml @@ -0,0 +1,24 @@ +room_segmentation_node: + ros__parameters: + exploredAreaDisplayInterval: 1 + exploredAreaVoxelSize: 0.1 + dilation_iteration: 3 + room_resolution: 0.1 + room_x: 3000 + room_y: 3000 + room_z: 80 + rolling_occupancy_grid.resolution_x: 0.2 + ceilingHeight_: 2.3 + wall_thres_height_: 0.1 + outward_distance_0: 0.5 + outward_distance_1: 0.3 + distance_threshold: 2.0 + distance_angel_threshold: 0.3 + angle_threshold_deg: 6.0 + region_growing_radius: 15.0 + min_room_size: 40 + normal_search_num: 50 + normal_search_radius: 0.5 + kViewPointCollisionMarginZPlus: 0.75 + kViewPointCollisionMarginZMinus: 0.4 + isDebug: false diff --git a/sensor_coverage/include/sensor_coverage/room_segmentation/room_node_rep.h b/sensor_coverage/include/sensor_coverage/room_segmentation/room_node_rep.h new file mode 100644 index 0000000..3613017 --- /dev/null +++ b/sensor_coverage/include/sensor_coverage/room_segmentation/room_node_rep.h @@ -0,0 +1,113 @@ +/** + * @file room_node_rep.h + * @author Haokun Zhu (haokunz@andrew.cmu.edu) + * @brief Room node representation for room segmentation + * @version 0.1 + * @date 2026-01-04 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +#include "geometry_msgs/msg/point.hpp" +#include "geometry_msgs/msg/polygon_stamped.hpp" +#include "semantic_mapping/msg/room_node.hpp" + +namespace representation_ns { + +class RoomNodeRep { +public: + explicit RoomNodeRep(const int id, const std::vector &points); + RoomNodeRep() = default; + ~RoomNodeRep() = default; + + void UpdateRoomNode(const std::vector &points, const geometry_msgs::msg::PolygonStamped &polygon); + void UpdateRoomNode(const std::vector &points); + void UpdatePolygon(const geometry_msgs::msg::PolygonStamped &polygon); + void UpdateCentroid(Eigen::Vector3f ¢roid); + void RemovePoint(const cv::Point &point); + bool InRoom(const geometry_msgs::msg::Point &point); + void UpdateRoomNode(const semantic_mapping::msg::RoomNode msg); + + void AddViewpointId(int viewpoint_id) { viewpoint_indices_.insert(viewpoint_id); } + void DeleteViewpointId(int viewpoint_id) { viewpoint_indices_.erase(viewpoint_id); } + void AddObjectIndex(int object_index) { object_indices_.insert(object_index); } + void DeleteObjectIndex(int object_index) { object_indices_.erase(object_index); } + + void SetAnchorPoint(const geometry_msgs::msg::Point &anchor_point) { anchor_point_ = anchor_point; } + const geometry_msgs::msg::Point &GetAnchorPoint() const { return anchor_point_; } + + void SetImage(const cv::Mat &image) { image_ = 
image.clone(); } + const cv::Mat &GetImage() const { return image_; } + + void SetRoomMask(const cv::Mat &room_mask); + const cv::Mat &GetRoomMask() const { return room_mask_; } + + void SetLastArea(float last_area) { last_area_ = last_area; } + float GetLastArea() const { return last_area_; } + + void SetIsAsked() { is_asked_ = std::max(0, is_asked_ - 1); } + void ClearRoomLabels(); + + std::map &GetLabelsMutable() { return labels_; } + const std::map &GetLabels() const { return labels_; } + + bool IsLabeled() const { return is_labeled_; } + void SetIsLabeled(bool is_labeled) { is_labeled_ = is_labeled; } + + const std::set &GetObjectIndices() const { return object_indices_; } + std::set &GetObjectIndicesMutable() { return object_indices_; } + + bool IsVisited() const { return is_visited_; } + void SetIsVisited(bool is_visited) { is_visited_ = is_visited; } + + bool IsCovered() const { return is_covered_; } + void SetIsCovered(bool is_covered) { is_covered_ = is_covered; } + + int GetIsAsked() const { return is_asked_; } + void SetIsAskedValue(int is_asked) { is_asked_ = is_asked; } + + int GetVoxelNum() const { return voxel_num_; } + void SetVoxelNum(int voxel_num) { voxel_num_ = voxel_num; } + + bool IsAlive() const { return alive; } + void SetAlive(bool is_alive) { alive = is_alive; } + + const geometry_msgs::msg::PolygonStamped &GetPolygon() const { return polygon_; } + int GetId() const { return id_; } + + int id_; + int show_id_; + geometry_msgs::msg::PolygonStamped polygon_; + std::vector points_; + Eigen::Vector3f centroid_; + float area_; + float last_area_; + bool alive; + std::set neighbors_; + std::set viewpoint_indices_; + std::set object_indices_; + bool is_visited_ = false; + bool is_covered_ = false; + bool is_labeled_ = false; + int is_asked_ = 2; + int voxel_num_; + std::map labels_; + geometry_msgs::msg::Point anchor_point_; + cv::Mat image_; + cv::Mat room_mask_; + bool is_connected_; +}; + +} // namespace representation_ns diff --git a/sensor_coverage/include/sensor_coverage/room_segmentation/room_segmentation_node.h b/sensor_coverage/include/sensor_coverage/room_segmentation/room_segmentation_node.h new file mode 100644 index 0000000..5ffb496 --- /dev/null +++ b/sensor_coverage/include/sensor_coverage/room_segmentation/room_segmentation_node.h @@ -0,0 +1,216 @@ +/** + * @file room_segmentation_node.h + * @author Haokun Zhu (haokunz@andrew.cmu.edu) + * @brief ROS 2 Node class header for room segmentation + * @version 0.2 + * @date 2026-01-04 + * @copyright Copyright (c) 2026 + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ROS 2 +#include "rclcpp/rclcpp.hpp" +#include "sensor_msgs/msg/point_cloud2.hpp" +#include "sensor_msgs/msg/image.hpp" +#include "nav_msgs/msg/odometry.hpp" +#include "geometry_msgs/msg/polygon_stamped.hpp" +#include "geometry_msgs/msg/point.hpp" +#include "visualization_msgs/msg/marker_array.hpp" + +// PCL +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// OpenCV +#include +#include + +// Eigen +#include + +// Custom messages +#include "semantic_mapping/msg/room_node.hpp" +#include "semantic_mapping/msg/room_node_list.hpp" +#include "sensor_coverage/room_segmentation/room_node_rep.h" + + +namespace room_segmentation { + +// Helper structures +struct PlaneInfo { + int id; + pcl::PointCloud::Ptr cloud; + std::vector voxel_indices; + Eigen::Vector3f normal; + Eigen::Vector3f centroid; + Eigen::Vector3f u_dir; 
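+  // In-plane direction vectors of the fitted plane.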
+ Eigen::Vector3f v_dir; + float width; + float height; + std::array corners; + bool alive = true; + bool merged = false; +}; + +/** + * @brief ROS 2 Node for room segmentation + * + * This node processes laser scan data to segment rooms in an indoor environment. + * It maintains maps of navigable space, detects walls using plane fitting, + * and segments the environment into discrete rooms connected by doors. + */ +class RoomSegmentationNode : public rclcpp::Node { +public: + explicit RoomSegmentationNode(); + ~RoomSegmentationNode() = default; + +private: + // ==================== Callback Functions ==================== + void laserCloudCallback(const sensor_msgs::msg::PointCloud2::ConstSharedPtr msg); + void occupiedCloudCallback(const sensor_msgs::msg::PointCloud2::ConstSharedPtr msg); + void freespaceCloudCallback(const sensor_msgs::msg::PointCloud2::ConstSharedPtr msg); + void stateEstimationCallback(const nav_msgs::msg::Odometry::ConstSharedPtr msg); + void timerCallback(); + + // ==================== Core Processing Functions ==================== + void roomSegmentation(); + cv::Mat getWall(const pcl::PointCloud::Ptr &cloud); + void updateVoxelMap(const std::vector &navigable_points); + void updateStateVoxel(); + void updateFreespace(pcl::PointCloud::Ptr &freespace_cloud_tmp); + void updateRooms(cv::Mat &room_mask_cropped, cv::Mat &room_mask_new, cv::Mat &room_mask_vis_cropped, int &room_number); + + // ==================== Helper Functions ==================== + void mergePlanes(std::vector &plane_infos, int idx_0, PlaneInfo &compare); + bool isPlaneSame(const PlaneInfo &a, const PlaneInfo &b); + bool isRoomConnected(const int &room_id, const int ¤t_room_id, + const Eigen::MatrixXi &adjacency_matrix); + geometry_msgs::msg::PolygonStamped computePolygonFromMaskCropped(const cv::Mat &mask); + void saveImageToFile(const cv::Mat &image, const std::string &filename, bool force_save = false); + int toIndex(int x, int y, int z); + int toIndex(int x, int y); + + // ==================== Publishing Functions ==================== + void publishRoomNodes(); + void publishRoomPolygon(); + void publishDoorCloud(); + + // ==================== ROS2 Interfaces ==================== + + // Subscriptions + rclcpp::Subscription::SharedPtr sub_laser_cloud_; + rclcpp::Subscription::SharedPtr sub_occupied_cloud_; + rclcpp::Subscription::SharedPtr sub_freespace_cloud_; + rclcpp::Subscription::SharedPtr sub_state_estimation_; + + // Publishers + rclcpp::Publisher::SharedPtr pub_explored_area_; + rclcpp::Publisher::SharedPtr pub_room_mask_vis_; + rclcpp::Publisher::SharedPtr pub_room_mask_; + rclcpp::Publisher::SharedPtr pub_door_cloud_; + rclcpp::Publisher::SharedPtr pub_debug_; + rclcpp::Publisher::SharedPtr pub_debug_1_; + rclcpp::Publisher::SharedPtr pub_room_boundary_; + rclcpp::Publisher::SharedPtr pub_room_boundary_tmp_; + rclcpp::Publisher::SharedPtr pub_polygon_; + rclcpp::Publisher::SharedPtr pub_wall_cloud_; + rclcpp::Publisher::SharedPtr pub_room_node_; + rclcpp::Publisher::SharedPtr pub_room_node_list_; + rclcpp::Publisher::SharedPtr pub_room_map_cloud_; + + // Timer + rclcpp::TimerBase::SharedPtr timer_; + + // ==================== Parameters ==================== + float explored_area_voxel_size_; + float room_resolution_; + float room_resolution_inv_; + float occupancy_grid_resolution_; + float ceiling_height_; + float ceiling_height_base_; // this is the ceiling height relative to robot base height + float wall_thres_height_; + float wall_thres_height_base_; // this is the wall threshold 
height relative to robot base height + float outward_distance_0_; // this controls the extent to which the wall dilates along its own direction. + float outward_distance_1_; // this controls the extent to which the wall dilates along its normal direction. + float distance_threshold_; + float distance_angel_threshold_; + float angle_threshold_deg_; + float region_growing_radius_; // radius for region growing segmentation + int dilation_iteration_; // number of dilation iterations when pre-processing for watershed + int min_room_size_; // minimum room size in number of pixels after dilation + int exploredAreaDisplayInterval_; // interval for publishing explored area (x10) + int normal_search_num_; // Number of nearest neighbors for normal estimation + float normal_search_radius_; // Search radius for normal estimation + float kViewPointCollisionMarginZPlus_; + float kViewPointCollisionMarginZMinus_; + bool is_debug_; // whether to save debug images + std::vector room_voxel_dimension_; + + // ==================== Point Clouds ==================== + pcl::PointCloud::Ptr laser_cloud_; + pcl::PointCloud::Ptr laser_cloud_tmp_; + pcl::PointCloud::Ptr explored_area_cloud_tmp_; + pcl::PointCloud::Ptr downsampled_explored_area_cloud_; + pcl::PointCloud::Ptr downsampled_explored_area_cloud_tmp_; + pcl::PointCloud::Ptr downsampled_ceiling_cloud_; + pcl::PointCloud::Ptr downsampled_ceiling_cloud_tmp_; + pcl::PointCloud::Ptr in_range_cloud_; + pcl::PointCloud::Ptr occupied_cloud_; + pcl::PointCloud::Ptr updated_voxel_cloud_; + pcl::PointCloud::Ptr freespace_cloud_; + pcl::PointCloud::Ptr door_cloud_; + + // ==================== Filters ==================== + pcl::VoxelGrid explored_area_dwz_filter_; + pcl::VoxelGrid ceiling_cloud_dwz_filter_; + pcl::PassThrough ceiling_pass_filter_; // Reusable filter for ceiling height + pcl::PassThrough occupied_pass_filter_; // Reusable filter for occupied cloud + pcl::search::KdTree::Ptr tree_; + + // ==================== Maps and Data Structures ==================== + std::vector navigable_voxels_; // voxels that save the point cloud number in each voxel + std::vector state_voxels_; // voxels that save the freespace state in each voxel (used to tackle lidar reflection issue) + std::vector freespace_indices_; // indices of free voxels + cv::Mat navigable_map_all_; // the whole top-down navigable map that aggregate along the z axis + cv::Mat navigable_map_; // the cropped navigable_map_all_ + cv::Mat wall_hist_all_; // the whole top-down wall histogram map + cv::Mat wall_hist_; // the cropped wall_hist_all_ + cv::Mat state_map_all_; + cv::Mat state_map_; // cropped state_map_all_ + cv::Mat room_mask_; // the int-value room_mask (each room has a unique integer id, 0 means background and -1 means edges between rooms) + cv::Mat room_mask_vis_; // the visualizable room mask (each room has a unique color) + std::vector bbox_; // bounding box of the current navigable map crop + std::vector ceiling_points_; + std::vector plane_infos_; // detected plane information + std::map room_nodes_map_; // room id to RoomNodeRep + int room_node_counter_; // counter to assign unique ids to room nodes + + // ==================== State Variables ==================== + Eigen::Vector3f shift_; // shift from the left-down or right-up corner to the center of the _all_ maps + geometry_msgs::msg::Point robot_position_; // robot position in world frame + int explored_area_display_count_; // counter for publishing explored area + bool segment_flag_; // flag to trigger room segmentation +}; + +} // 
namespace room_segmentation diff --git a/sensor_coverage/include/sensor_coverage/utils/misc_utils.h b/sensor_coverage/include/sensor_coverage/utils/misc_utils.h new file mode 100644 index 0000000..9ac6432 --- /dev/null +++ b/sensor_coverage/include/sensor_coverage/utils/misc_utils.h @@ -0,0 +1,414 @@ +/** + * @file utils.h + * @author Chao Cao (ccao1@andrew.cmu.edu) + * @brief Miscellaneous utility functions + * @version 0.1 + * @date 2019-06-05 + * + * @copyright Copyright (c) 2021 + * + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +// uncomment to disable assert() +// #define NDEBUG +#include +#include +#include +#include +#include + +#define MY_ASSERT(val) \ + if (!(val)) \ + { \ + std::cout << "\033[31m Error at [File: " << __FILE__ << "][Line: " << __LINE__ << "]" \ + << "[Function: " << __FUNCTION__ << "] \033[0m\n" \ + << std::endl; \ + exit(1); \ + } + +namespace misc_utils_ns +{ +typedef pcl::PointXYZI PCLPointType; +typedef pcl::PointCloud PCLPointCloudType; + +template +void PointToPoint(const FromPointType& from_point, ToPointType& to_point) +{ + to_point.x = from_point.x; + to_point.y = from_point.y; + to_point.z = from_point.z; +} +geometry_msgs::msg::Point PCL2GeoMsgPnt(const PCLPointType& pnt); +PCLPointType GeoMsgPnt2PCL(const geometry_msgs::msg::Point& pnt); +geometry_msgs::msg::Point GeoMsgPoint(double x, double y, double z); +PCLPointType PCLPoint(float x, float y, float z); +// TODO: make these template functions +void LeftRotatePoint(PCLPointType& pnt); +void RightRotatePoint(PCLPointType& pnt); +void LeftRotatePoint(geometry_msgs::msg::Point& pnt); +void RightRotatePoint(geometry_msgs::msg::Point& pnt); +template +void KeyposeToMap(CloudType& cloud, const nav_msgs::msg::Odometry::ConstPtr& keypose); +double PointXYDist(const geometry_msgs::msg::Point& pnt1, const geometry_msgs::msg::Point& pnt2); +double PointXYDist(const PCLPointType& pnt1, const PCLPointType& pnt2); +template +double PointXYDist(const P1& pnt1, const P2& pnt2) +{ + return sqrt(pow((pnt1.x - pnt2.x), 2) + pow((pnt1.y - pnt2.y), 2)); +} +template +double PointXYZDist(const P1& pnt1, const P2& pnt2) +{ + return sqrt(pow((pnt1.x - pnt2.x), 2) + pow((pnt1.y - pnt2.y), 2) + pow((pnt1.z - pnt2.z), 2)); +} +double VectorXYAngle(const Eigen::Vector3d& v1, const Eigen::Vector3d& v2); +double PointAngle(const geometry_msgs::msg::Point& pnt, const geometry_msgs::msg::Point& robot_pos); +double PointAngle(const PCLPointType& pnt, const geometry_msgs::msg::Point& robot_pos); +bool CollinearXY(const geometry_msgs::msg::Point& p1, const geometry_msgs::msg::Point& p2, + const geometry_msgs::msg::Point& p3, double threshold = 0.1); +bool LineSegIntersect(const geometry_msgs::msg::Point& p1, const geometry_msgs::msg::Point& q1, + const geometry_msgs::msg::Point& p2, const geometry_msgs::msg::Point& q2); +bool LineSegIntersectWithTolerance(const geometry_msgs::msg::Point& p1, const geometry_msgs::msg::Point& q1, + const geometry_msgs::msg::Point& p2, const geometry_msgs::msg::Point& q2, + const double tolerance); +int ThreePointOrientation(const geometry_msgs::msg::Point& p, const geometry_msgs::msg::Point& q, + const geometry_msgs::msg::Point& r); +bool PointOnLineSeg(const geometry_msgs::msg::Point& p, const geometry_msgs::msg::Point& q, + const geometry_msgs::msg::Point& r); +double AngleOverlap(double s1, double e1, double s2, double e2); +double 
AngleDiff(double source_angle, double target_angle); +bool PointInPolygon(const geometry_msgs::msg::Point& point, const geometry_msgs::msg::Polygon& polygon); +double LineSegDistance2D(const geometry_msgs::msg::Point& point, const geometry_msgs::msg::Point& line_segment_start, + const geometry_msgs::msg::Point& line_segment_end); +double LineSegDistance3D(const geometry_msgs::msg::Point& point, const geometry_msgs::msg::Point& line_segment_start, + const geometry_msgs::msg::Point& line_segment_end); +double DistancePoint2DToPolygon(const geometry_msgs::msg::Point& point, const geometry_msgs::msg::Polygon& polygon); +void LinInterpPoints(const Eigen::Vector3d& p1, const Eigen::Vector3d& p2, double resolution, + std::vector& interp_points); +template +void LinInterpPoints(const geometry_msgs::msg::Point& p1, const geometry_msgs::msg::Point& p2, double resolution, + typename pcl::PointCloud::Ptr& cloud) +{ + double point_dist = PointXYZDist(p1, p2); + if (point_dist < 0.01) + return; + PCLPointType point1; + point1.x = p1.x; + point1.y = p1.y; + point1.z = p1.z; + PCLPointType point2; + point2.x = p1.x; + point2.y = p1.y; + point2.z = p1.z; + if (point_dist < resolution) + { + cloud->points.push_back(point1); + cloud->points.push_back(point2); + } + else + { + int num_points = static_cast(point_dist / resolution); + cloud->points.push_back(point1); + for (int i = 0; i < num_points; i++) + { + PCLPointType point; + point.x = static_cast((p2.x - p1.x) / num_points * i + p1.x); + point.y = static_cast((p2.y - p1.y) / num_points * i + p1.y); + point.z = static_cast((p2.z - p1.z) / num_points * i + p1.z); + cloud->points.push_back(point); + } + cloud->points.push_back(point2); + } +} +double DegreeToRadian(double degree); +double RadianToDegree(double radian); +/** + * Function to append part of path2 ([from_ind, to_ind]) to path1 + * @param path1 + * @param path2 + * @param from_ind + * @param to_ind + */ +void ConcatenatePath(nav_msgs::msg::Path& path1, const nav_msgs::msg::Path& path2, int from_ind = -1, int to_ind = -1); +/** + * Function to check if an index is within the range of a list + * @tparam T + * @param list + * @param index + * @return + */ +template +bool InRange(const std::vector& list, int index) +{ + return index >= 0 && index < list.size(); +} +// template +// T getParam(rclcpp::Node::SharedPtr nh, const std::string& name, const T default_val) +// { +// T val; +// bool success = nh->getParam(name, val); +// if (!success) +// { +// ROS_ERROR_STREAM("Cannot read parameter: " << name); +// return default_val; +// } +// return val; +// } +// template +// T getParam(rclcpp::Node::SharedPtr nh, const std::string& name, const T default_val) +// { +// T val; +// bool success = nh.getParam(name, val); +// if (!success) +// { +// ROS_ERROR_STREAM("Cannot read parameter: " << name); +// return default_val; +// } +// return val; +// } +/** + * Function to publish clouds + * @tparam T PCL PointCloud type + * @param cloud_publisher + * @param cloud + * @param frame_id + */ +template +void PublishCloud(rclcpp::Node::SharedPtr node, + typename rclcpp::Publisher::SharedPtr cloud_publisher, + const PCLPointCloudType& cloud, const std::string& frame_id) +{ + sensor_msgs::msg::PointCloud2 cloud_msg; + pcl::toROSMsg(cloud, cloud_msg); + cloud_msg.header.frame_id = frame_id; + cloud_msg.header.stamp = node->now(); + cloud_publisher->publish(cloud_msg); +} + +template +void Publish(rclcpp::Node::SharedPtr node, typename rclcpp::Publisher::SharedPtr publisher, ROSMsgType& msg, + const std::string& 
frame_id) +{ + msg.header.frame_id = frame_id; + msg.header.stamp = node->now(); + publisher->publish(msg); +} + +void SetDifference(std::vector& v1, std::vector& v2, std::vector& diff); +void SetIntersection(std::vector& v1, std::vector& v2, std::vector& intersection); + +class Timer +{ +private: + std::string name_; + std::chrono::time_point timer_start_; + std::chrono::time_point timer_stop_; + std::chrono::duration duration_; + +public: + Timer(std::string name) : name_(name) + { + } + ~Timer() = default; + void Start() + { + timer_start_ = std::chrono::high_resolution_clock::now(); + } + void Stop(bool show, std::string unit = "ms") + { + timer_stop_ = std::chrono::high_resolution_clock::now(); + duration_ = timer_stop_ - timer_start_; + if (show) + { + if (unit == "ms") + { + std::cout << name_ << " takes " << GetDuration() << " ms" << std::endl; + } + else if (unit == "us") + { + std::cout << name_ << " takes " << GetDuration() << " us" << std::endl; + } + else if (unit == "ns") + { + std::cout << name_ << " takes " << GetDuration() << " ns" << std::endl; + } + else if (unit == "s") + { + std::cout << name_ << " takes " << GetDuration() << " s" << std::endl; + } + else + { + std::cout << "timer unit error!" << std::endl; + } + } + } + template + int GetDuration() + { + return std::chrono::duration_cast(duration_).count(); + } + int GetDuration(std::string unit = "ms") + { + if (unit == "ms") + { + return std::chrono::duration_cast(duration_).count(); + } + else if (unit == "us") + { + return std::chrono::duration_cast(duration_).count(); + } + else if (unit == "ns") + { + return std::chrono::duration_cast(duration_).count(); + } + else if (unit == "s") + { + return std::chrono::duration_cast(duration_).count(); + } + else + { + std::cout << "timer unit error!" 
<< std::endl; + return 0; + } + } +}; + +class Marker +{ +private: + std::string pub_topic_; + std::string frame_id_; + rclcpp::Publisher::SharedPtr marker_pub_; + rclcpp::Node::SharedPtr nh_; + +public: + static int id_; + visualization_msgs::msg::Marker marker_; + explicit Marker(rclcpp::Node::SharedPtr nh, std::string pub_topic, std::string frame_id) + : nh_(nh), pub_topic_(pub_topic), frame_id_(frame_id) + { + marker_pub_ = nh_->create_publisher(pub_topic_, 2); + id_++; + marker_.id = id_; + marker_.action = visualization_msgs::msg::Marker::ADD; + marker_.pose.orientation.w = 1.0; + } + ~Marker() = default; + void SetColorRGBA(const std_msgs::msg::ColorRGBA& color) + { + marker_.color = color; + } + void SetColorRGBA(double r, double g, double b, double a) + { + marker_.color.r = r; + marker_.color.g = g; + marker_.color.b = b; + marker_.color.a = a; + } + void SetScale(double x, double y, double z) + { + marker_.scale.x = x; + marker_.scale.y = y; + marker_.scale.z = z; + } + void SetType(int type) + { + marker_.type = type; + } + void SetAction(visualization_msgs::msg::Marker::_action_type action) + { + marker_.action = action; + } + void Publish() + { + misc_utils_ns::Publish(nh_, marker_pub_, marker_, frame_id_); + } + + typedef std::shared_ptr Ptr; +}; +int signum(int x); +double mod(double value, double modulus); +double intbound(double s, double ds); +bool InRange(const Eigen::Vector3i& sub, const Eigen::Vector3i& max_sub = Eigen::Vector3i(INT_MAX, INT_MAX, INT_MAX), + const Eigen::Vector3i& min_sub = Eigen::Vector3i(INT_MIN, INT_MIN, INT_MIN)); +void RayCast(const Eigen::Vector3i& start_sub, const Eigen::Vector3i& end_sub, const Eigen::Vector3i& max_sub, + const Eigen::Vector3i& min_sub, std::vector& output); +bool InFOV(const Eigen::Vector3d& point_position, const Eigen::Vector3d& viewpoint_position, double vertical_half_fov, + double range); +bool InFOVSimple(const Eigen::Vector3d& point_position, const Eigen::Vector3d& viewpoint_position, + double vertical_fov_ratio, double range, double xy_dist_threshold = 0, double z_diff_threshold = 0, + bool print = false); +float ApproxAtan(float z); +float ApproxAtan2(float y, float x); +double GetPathLength(const nav_msgs::msg::Path& path); +double GetPathLength(const std::vector& path); +double AStarSearch(const std::vector>& graph, const std::vector>& node_dist, + const std::vector& node_positions, int from_idx, int to_idx, + bool get_path, std::vector& path_indices); +bool AStarSearchWithMaxPathLength(const std::vector>& graph, + const std::vector>& node_dist, + const std::vector& node_positions, int from_idx, + int to_idx, bool get_path, std::vector& path_indices, double& shortest_dist, + double max_path_length = DBL_MAX); +nav_msgs::msg::Path SimplifyPath(const nav_msgs::msg::Path& path); +nav_msgs::msg::Path DeduplicatePath(const nav_msgs::msg::Path& path, double min_dist); +void SampleLineSegments(const std::vector& initial_points, double sample_resol, + std::vector& sample_points); +void UniquifyIntVector(std::vector& list); + +std::vector find_path_bfs(int start, int goal, const Eigen::MatrixXi &adj_matrix); +Eigen::Vector3d idToColor(int id); + +// 将点坐标转换为体素网格索引(未裁剪版本) +Eigen::Vector3i point_to_voxel( + const Eigen::Vector3f &pt, + const Eigen::Vector3f &origin_shift, + float inv_resolution); + +Eigen::Vector3i point_to_voxel( + geometry_msgs::msg::Point pt, + const Eigen::Vector3f &origin_shift, + float inv_resolution); + +// 将体素网格索引转换为点坐标(未裁剪版本) +Eigen::Vector3f voxel_to_point( + const Eigen::Vector3i &voxel_index, + const 
Eigen::Vector3f &origin_shift, + float resolution); + +// 将点坐标转换为体素网格索引(考虑裁剪偏移) +Eigen::Vector3i point_to_voxel_cropped( + const Eigen::Vector3f &pt, + const Eigen::Vector3f &origin_shift, + float inv_resolution, + std::vector &bbox); + +// 将体素网格索引转换为点坐标(考虑裁剪偏移) +Eigen::Vector3f voxel_to_point_cropped( + const Eigen::Vector3i &voxel_index, + const Eigen::Vector3f &origin_shift, + float resolution, + std::vector &bbox); + + +} // namespace misc_utils_ns diff --git a/sensor_coverage/package.xml b/sensor_coverage/package.xml new file mode 100644 index 0000000..ff64d9f --- /dev/null +++ b/sensor_coverage/package.xml @@ -0,0 +1,34 @@ + + + + sensor_coverage + 0.0.1 + Room segmentation node for sensor coverage planning. + Alex Lin + MIT + + ament_cmake + + rclcpp + sensor_msgs + nav_msgs + geometry_msgs + visualization_msgs + std_msgs + tf2 + tf2_ros + cv_bridge + pcl_conversions + semantic_mapping + + Eigen3 + PCL + OpenCV + Eigen3 + PCL + OpenCV + + + ament_cmake + + diff --git a/sensor_coverage/resource/sensor_coverage b/sensor_coverage/resource/sensor_coverage new file mode 100644 index 0000000..a340768 --- /dev/null +++ b/sensor_coverage/resource/sensor_coverage @@ -0,0 +1 @@ +sensor_coverage diff --git a/sensor_coverage/src/room_segmentation/room_node_rep.cpp b/sensor_coverage/src/room_segmentation/room_node_rep.cpp new file mode 100644 index 0000000..6545192 --- /dev/null +++ b/sensor_coverage/src/room_segmentation/room_node_rep.cpp @@ -0,0 +1,117 @@ +/** + * @file room_node_rep.cpp + * @author Haokun Zhu (haokunz@andrew.cmu.edu) + * @brief Room node representation for room segmentation + * @version 0.1 + * @date 2026-01-04 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "sensor_coverage/room_segmentation/room_node_rep.h" + +#include + +#include + +#include "sensor_coverage/utils/misc_utils.h" + +namespace representation_ns { + +RoomNodeRep::RoomNodeRep(const int id, const std::vector &points) + : id_(id), + show_id_(0), + points_(points), + centroid_(0.0, 0.0, 0.0), + area_(0.0), + last_area_(0.0), + alive(true), + is_visited_(false), + is_covered_(false), + is_labeled_(false), + is_asked_(2), + voxel_num_(0), + is_connected_(false) { + viewpoint_indices_ = std::set(); + object_indices_ = std::set(); + anchor_point_ = geometry_msgs::msg::Point(); +} + +void RoomNodeRep::UpdateRoomNode(const std::vector &points, + const geometry_msgs::msg::PolygonStamped &polygon) { + points_ = points; + polygon_ = polygon; + alive = true; +} + +void RoomNodeRep::UpdateRoomNode(const std::vector &points) { + points_ = points; + alive = true; +} + +void RoomNodeRep::UpdateCentroid(Eigen::Vector3f ¢roid) { centroid_ = centroid; } + +void RoomNodeRep::UpdatePolygon(const geometry_msgs::msg::PolygonStamped &polygon) { + polygon_ = polygon; + alive = true; +} + +bool RoomNodeRep::InRoom(const geometry_msgs::msg::Point &point) { + return misc_utils_ns::PointInPolygon(point, polygon_.polygon); +} + +void RoomNodeRep::RemovePoint(const cv::Point &point) { + auto it = std::remove(points_.begin(), points_.end(), point); + if (it != points_.end()) { + points_.erase(it, points_.end()); + } +} + +void RoomNodeRep::UpdateRoomNode(const semantic_mapping::msg::RoomNode msg) { + id_ = msg.id; + show_id_ = msg.show_id; + polygon_ = msg.polygon; + centroid_ = Eigen::Vector3f(msg.centroid.x, msg.centroid.y, msg.centroid.z); + neighbors_.clear(); + for (const auto &neighbor : msg.neighbors) { + neighbors_.insert(neighbor); + } + alive = true; + is_connected_ = msg.is_connected; + area_ = msg.area; + room_mask_ = 
cv_bridge::toCvCopy(msg.room_mask, "mono8")->image; +} + +void RoomNodeRep::SetRoomMask(const cv::Mat &room_mask) { + cv::Mat room_mask_tmp = room_mask.clone(); + room_mask_tmp.convertTo(room_mask_tmp, CV_8UC1); + std::vector non_zero_points; + cv::findNonZero(room_mask_tmp, non_zero_points); + if (non_zero_points.empty()) { + return; + } + cv::Rect rect = cv::boundingRect(non_zero_points); + std::vector bbox(2); + bbox[0] = {rect.tl().y, rect.tl().x}; + bbox[1] = {rect.br().y, rect.br().x}; + int margin = 10; + bbox[0] = (bbox[0] - Eigen::Vector2i(margin, margin)).cwiseMax(Eigen::Vector2i(0, 0)); + bbox[1] = (bbox[1] + Eigen::Vector2i(margin, margin)) + .cwiseMin(Eigen::Vector2i(room_mask_tmp.rows - 1, room_mask_tmp.cols - 1)); + + room_mask_ = room_mask_tmp.rowRange(bbox[0][0], bbox[1][0] + 1) + .colRange(bbox[0][1], bbox[1][1] + 1); +} + +void RoomNodeRep::ClearRoomLabels() { + labels_.clear(); + is_labeled_ = false; + is_asked_ = 2; + last_area_ = 0.0f; + voxel_num_ = 0; + anchor_point_ = geometry_msgs::msg::Point(); + image_ = cv::Mat(); +} + +} // namespace representation_ns diff --git a/sensor_coverage/src/room_segmentation/room_segmentation.cpp b/sensor_coverage/src/room_segmentation/room_segmentation.cpp new file mode 100644 index 0000000..ef749c6 --- /dev/null +++ b/sensor_coverage/src/room_segmentation/room_segmentation.cpp @@ -0,0 +1,2052 @@ +/** + * @file room_segmentation.cpp + * @author Haokun Zhu (haokunz@andrew.cmu.edu) + * @brief ROS 2 Node implementation for room segmentation + * @version 0.2 + * @date 2026-01-04 + * @copyright Copyright (c) 2026 + */ + +#include "sensor_coverage/room_segmentation/room_segmentation_node.h" +#include "sensor_coverage/utils/misc_utils.h" + +namespace room_segmentation { + +// ==================== Constructor ==================== +RoomSegmentationNode::RoomSegmentationNode() + : Node("room_segmentation_node"), + explored_area_voxel_size_(0.1f), + room_resolution_(0.1f), + occupancy_grid_resolution_(0.2f), + ceiling_height_base_(2.0f), + wall_thres_height_base_(0.1f), + outward_distance_0_(0.5f), + outward_distance_1_(0.3f), + distance_threshold_(2.5f), + distance_angel_threshold_(0.3f), + angle_threshold_deg_(6.0f), + region_growing_radius_(15.0f), + dilation_iteration_(4), + min_room_size_(40), + exploredAreaDisplayInterval_(1), + normal_search_num_(50), + normal_search_radius_(0.5f), + kViewPointCollisionMarginZPlus_(0.5f), + kViewPointCollisionMarginZMinus_(0.5f), + is_debug_(false), + room_voxel_dimension_({200, 200, 50}), + explored_area_display_count_(0), + room_node_counter_(0), + segment_flag_(false) +{ + RCLCPP_INFO(this->get_logger(), "Initializing Room Segmentation Node..."); + + // ==================== Declare and Get Parameters ==================== + this->declare_parameter("exploredAreaVoxelSize", 0.1f); + this->declare_parameter("room_resolution", 0.1f); + this->declare_parameter("rolling_occupancy_grid.resolution_x", 0.2f); + this->declare_parameter("room_x", 200); + this->declare_parameter("room_y", 200); + this->declare_parameter("room_z", 50); + this->declare_parameter("ceilingHeight_", 2.0f); + this->declare_parameter("wall_thres_height_", 0.1f); + this->declare_parameter("exploredAreaDisplayInterval", 1); + this->declare_parameter("dilation_iteration", 4); + this->declare_parameter("outward_distance_0", 0.5f); + this->declare_parameter("outward_distance_1", 0.3f); + this->declare_parameter("distance_threshold", 2.5f); + this->declare_parameter("distance_angel_threshold", 0.3f); + 
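+  // Remaining parameters tune plane merging, region growing, normal estimation, collision margins, and debug output.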
this->declare_parameter("angle_threshold_deg", 6.0f); + this->declare_parameter("region_growing_radius", 15.0f); + this->declare_parameter("min_room_size", 40); + this->declare_parameter("normal_search_num", 50); + this->declare_parameter("normal_search_radius", 0.5f); + this->declare_parameter("kViewPointCollisionMarginZPlus", 0.5f); + this->declare_parameter("kViewPointCollisionMarginZMinus", 0.5f); + this->declare_parameter("isDebug", false); + + this->get_parameter("exploredAreaVoxelSize", explored_area_voxel_size_); + this->get_parameter("room_resolution", room_resolution_); + this->get_parameter("rolling_occupancy_grid.resolution_x", occupancy_grid_resolution_); + this->get_parameter("room_x", room_voxel_dimension_[0]); + this->get_parameter("room_y", room_voxel_dimension_[1]); + this->get_parameter("room_z", room_voxel_dimension_[2]); + this->get_parameter("ceilingHeight_", ceiling_height_base_); + this->get_parameter("wall_thres_height_", wall_thres_height_base_); + this->get_parameter("exploredAreaDisplayInterval", exploredAreaDisplayInterval_); + this->get_parameter("dilation_iteration", dilation_iteration_); + this->get_parameter("outward_distance_0", outward_distance_0_); + this->get_parameter("outward_distance_1", outward_distance_1_); + this->get_parameter("distance_threshold", distance_threshold_); + this->get_parameter("distance_angel_threshold", distance_angel_threshold_); + this->get_parameter("angle_threshold_deg", angle_threshold_deg_); + this->get_parameter("region_growing_radius", region_growing_radius_); + this->get_parameter("min_room_size", min_room_size_); + this->get_parameter("normal_search_num", normal_search_num_); + this->get_parameter("normal_search_radius", normal_search_radius_); + this->get_parameter("kViewPointCollisionMarginZPlus", kViewPointCollisionMarginZPlus_); + this->get_parameter("kViewPointCollisionMarginZMinus", kViewPointCollisionMarginZMinus_); + this->get_parameter("isDebug", is_debug_); + + room_resolution_inv_ = 1.0f / room_resolution_; + ceiling_height_ = ceiling_height_base_; + wall_thres_height_ = wall_thres_height_base_; + + // ==================== Initialize Point Clouds ==================== + laser_cloud_.reset(new pcl::PointCloud()); + laser_cloud_tmp_.reset(new pcl::PointCloud()); + explored_area_cloud_tmp_.reset(new pcl::PointCloud()); + downsampled_explored_area_cloud_.reset(new pcl::PointCloud()); + downsampled_explored_area_cloud_tmp_.reset(new pcl::PointCloud()); + downsampled_ceiling_cloud_.reset(new pcl::PointCloud()); + downsampled_ceiling_cloud_tmp_.reset(new pcl::PointCloud()); + in_range_cloud_.reset(new pcl::PointCloud()); + occupied_cloud_.reset(new pcl::PointCloud()); + updated_voxel_cloud_.reset(new pcl::PointCloud()); + freespace_cloud_.reset(new pcl::PointCloud()); + door_cloud_.reset(new pcl::PointCloud()); + + // ==================== Initialize Filters ==================== + explored_area_dwz_filter_.setLeafSize(explored_area_voxel_size_, + explored_area_voxel_size_, + explored_area_voxel_size_); + ceiling_cloud_dwz_filter_.setLeafSize(explored_area_voxel_size_, + explored_area_voxel_size_, + explored_area_voxel_size_); + ceiling_pass_filter_.setFilterFieldName("z"); // Set filter field once + occupied_pass_filter_.setFilterFieldName("z"); // Set filter field once + tree_.reset(new pcl::search::KdTree()); + + // ==================== Initialize Maps ==================== + shift_ = Eigen::Vector3f(room_voxel_dimension_[0] / 2.0f, + room_voxel_dimension_[1] / 2.0f, + room_voxel_dimension_[2] / 2.0f); + + 
navigable_voxels_.resize(room_voxel_dimension_[0] * + room_voxel_dimension_[1] * + room_voxel_dimension_[2], 0); + state_voxels_.resize(room_voxel_dimension_[0] * + room_voxel_dimension_[1] * + room_voxel_dimension_[2], -1); + + navigable_map_all_ = cv::Mat::zeros(room_voxel_dimension_[0], room_voxel_dimension_[1], CV_32F); + wall_hist_all_ = cv::Mat::zeros(room_voxel_dimension_[0], room_voxel_dimension_[1], CV_32F); + state_map_all_ = cv::Mat::zeros(room_voxel_dimension_[0], room_voxel_dimension_[1], CV_8U); + room_mask_ = cv::Mat::zeros(room_voxel_dimension_[0], room_voxel_dimension_[1], CV_32S); + room_mask_vis_ = cv::Mat::zeros(room_voxel_dimension_[0], room_voxel_dimension_[1], CV_8UC3); + room_mask_vis_.setTo(cv::Scalar(255, 255, 255)); + + bbox_.emplace_back(Eigen::Vector2i(0, 0)); + bbox_.emplace_back(Eigen::Vector2i(room_voxel_dimension_[0] - 1, room_voxel_dimension_[1] - 1)); + + robot_position_ = geometry_msgs::msg::Point(); + robot_position_.x = 0.0; + robot_position_.y = 0.0; + robot_position_.z = 0.0; + + // ==================== Create Subscriptions ==================== + sub_laser_cloud_ = this->create_subscription( + "/registered_scan", 20, + std::bind(&RoomSegmentationNode::laserCloudCallback, this, std::placeholders::_1)); + + sub_state_estimation_ = this->create_subscription( + "/state_estimation", 20, + std::bind(&RoomSegmentationNode::stateEstimationCallback, this, std::placeholders::_1)); + + sub_occupied_cloud_ = this->create_subscription( + "/occupied_cloud", 20, + std::bind(&RoomSegmentationNode::occupiedCloudCallback, this, std::placeholders::_1)); + + sub_freespace_cloud_ = this->create_subscription( + "/freespace_cloud", 20, + std::bind(&RoomSegmentationNode::freespaceCloudCallback, this, std::placeholders::_1)); + + // ==================== Create Publishers ==================== + pub_explored_area_ = this->create_publisher("/explore_areas_new", 5); + pub_room_mask_vis_ = this->create_publisher("/room_mask_vis", 5); + pub_room_mask_ = this->create_publisher("/room_mask", 5); + pub_door_cloud_ = this->create_publisher("/door_cloud", 5); + pub_debug_ = this->create_publisher("/debug_cloud", 5); + pub_debug_1_ = this->create_publisher("/free_cloud_1", 5); + pub_room_boundary_ = this->create_publisher("/current_room_boundary", 5); + pub_polygon_ = this->create_publisher("/room_boundaries", 50); + pub_wall_cloud_ = this->create_publisher("/walls", 5); + pub_room_node_ = this->create_publisher("/room_nodes", 50); + pub_room_node_list_ = this->create_publisher("/room_nodes_list", 5); + pub_room_map_cloud_ = this->create_publisher("/room_map_cloud", 5); + + // ==================== Create Timer ==================== + timer_ = this->create_wall_timer( + std::chrono::milliseconds(100), + std::bind(&RoomSegmentationNode::timerCallback, this)); + + RCLCPP_INFO(this->get_logger(), "Room Segmentation Node initialized successfully!"); +} + +// ==================== Timer Callback ==================== +void RoomSegmentationNode::timerCallback() { + if (segment_flag_) { + segment_flag_ = false; + roomSegmentation(); + publishRoomNodes(); + publishDoorCloud(); + publishRoomPolygon(); + } +} + +// ==================== Helper Functions ==================== +void RoomSegmentationNode::saveImageToFile(const cv::Mat &image, const std::string &filename, bool force_save) { + if (is_debug_ || force_save) { + cv::Mat flipped_image; + cv::transpose(image, flipped_image); + cv::flip(flipped_image, flipped_image, 0); + if (!flipped_image.empty()) { + if (cv::imwrite(filename, flipped_image)) { 
+ return; + } else { + RCLCPP_ERROR(this->get_logger(), "Failed to save image: %s", filename.c_str()); + } + } else { + RCLCPP_ERROR(this->get_logger(), "Image is empty, cannot save: %s", filename.c_str()); + } + } +} + +int RoomSegmentationNode::toIndex(int x, int y, int z) { + return x * room_voxel_dimension_[1] * room_voxel_dimension_[2] + + y * room_voxel_dimension_[2] + z; +} + +int RoomSegmentationNode::toIndex(int x, int y) { + return x * room_voxel_dimension_[1] + y; +} + +bool RoomSegmentationNode::isPlaneSame(const PlaneInfo &a, const PlaneInfo &b) { + float angle = std::acos(std::abs(a.normal.dot(b.normal))) * 180.0f / M_PI; + if (angle > angle_threshold_deg_) + return false; + + Eigen::Vector3f centroid_diff = b.centroid - a.centroid; + float angel_distance = centroid_diff.dot(a.normal); + if (std::abs(angel_distance) > distance_angel_threshold_) + return false; + + float center_dist = (a.centroid - b.centroid).norm(); + float actual_dist = center_dist - (a.width + b.width) / 2.0f; + if (actual_dist > distance_threshold_) + return false; + + return true; +} + +bool RoomSegmentationNode::isRoomConnected(const int &room_id, const int ¤t_room_id, + const Eigen::MatrixXi &adjacency_matrix) { + int n = adjacency_matrix.rows(); + if (room_id < 0 || current_room_id < 0 || room_id >= n || current_room_id >= n) { + return false; + } + + std::vector visited(n, false); + std::queue q; + q.push(current_room_id); + visited[current_room_id] = true; + + while (!q.empty()) { + int u = q.front(); + q.pop(); + + if (u == room_id) + return true; + + for (int v = 0; v < n; v++) { + if (!visited[v] && adjacency_matrix(u, v) > 0) { + visited[v] = true; + q.push(v); + } + } + } + return false; +} + +geometry_msgs::msg::PolygonStamped RoomSegmentationNode::computePolygonFromMaskCropped(const cv::Mat &mask) { + std::vector> current_room_contours; + cv::findContours(mask, current_room_contours, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_SIMPLE); + + std::vector largest_contour; + if (!current_room_contours.empty()) { + largest_contour = current_room_contours[0]; + for (const auto &contour : current_room_contours) { + if (cv::contourArea(contour) > cv::contourArea(largest_contour)) { + largest_contour = contour; + } + } + } + + geometry_msgs::msg::PolygonStamped boundary_polygon; + boundary_polygon.header.frame_id = "map"; + boundary_polygon.polygon.points.clear(); + + for (const auto &pt : largest_contour) { + Eigen::Vector3i pt_voxel(pt.y, pt.x, 0); + Eigen::Vector3f pt_position = misc_utils_ns::voxel_to_point_cropped(pt_voxel, shift_, room_resolution_, bbox_); + + geometry_msgs::msg::Point32 point; + point.x = pt_position.x(); + point.y = pt_position.y(); + point.z = 0.0; + boundary_polygon.polygon.points.push_back(point); + } + return boundary_polygon; +} + +// ==================== State Estimation Callback ==================== +void RoomSegmentationNode::stateEstimationCallback(const nav_msgs::msg::Odometry::ConstSharedPtr msg) { + robot_position_ = msg->pose.pose.position; +} + +// ==================== Callback Functions ==================== +void RoomSegmentationNode::laserCloudCallback(const sensor_msgs::msg::PointCloud2::ConstSharedPtr msg) { + laser_cloud_->clear(); + laser_cloud_tmp_->clear(); + pcl::fromROSMsg(*msg, *laser_cloud_tmp_); + // Transform PointXYZI to PointXYZINormal + pcl::copyPointCloud(*laser_cloud_tmp_, *laser_cloud_); + + *explored_area_cloud_tmp_ += *laser_cloud_; + + explored_area_display_count_++; + + if (explored_area_display_count_ >= 5 * exploredAreaDisplayInterval_) + { + // 
downsample explored area cloud + downsampled_explored_area_cloud_tmp_->clear(); + explored_area_dwz_filter_.setInputCloud(explored_area_cloud_tmp_); + explored_area_dwz_filter_.filter(*downsampled_explored_area_cloud_tmp_); + *downsampled_explored_area_cloud_ += *downsampled_explored_area_cloud_tmp_; // accumulate downsampled cloud + + ceiling_height_ = ceiling_height_base_ + robot_position_.z; + wall_thres_height_ = wall_thres_height_base_ + robot_position_.z; + + // filter the pointcloud below the ceiling height + ceiling_pass_filter_.setInputCloud(downsampled_explored_area_cloud_tmp_); + ceiling_pass_filter_.setFilterLimits(-std::numeric_limits::max(), ceiling_height_); + ceiling_pass_filter_.filter(*downsampled_ceiling_cloud_tmp_); + + // before adding the new point, record the index size + int start_idx = downsampled_ceiling_cloud_->size(); + *downsampled_ceiling_cloud_ += *downsampled_explored_area_cloud_tmp_; + int end_idx = downsampled_ceiling_cloud_->size(); + + // find the old points that will be affected by the new points + tree_->setInputCloud(downsampled_ceiling_cloud_); + std::unordered_set affected_old_indices; + + for (const auto &pt : downsampled_ceiling_cloud_tmp_->points) + { + std::vector indices; + std::vector dists; + // tree_->radiusSearch(pt, normal_search_radius_, indices, dists); + tree_->nearestKSearch(pt, normal_search_num_, indices, dists); + for (int idx : indices) + { + if (idx < start_idx) // only consider old points + affected_old_indices.insert(idx); + } + } + + // create the points that need to be updated normals + pcl::PointCloud::Ptr points_to_update(new pcl::PointCloud); + points_to_update->points.reserve(affected_old_indices.size() + (end_idx - start_idx)); + std::vector update_indices; + + for (int idx : affected_old_indices) + { + points_to_update->points.push_back(downsampled_ceiling_cloud_->points[idx]); + update_indices.push_back(idx); + } + for (int i = start_idx; i < end_idx; ++i) + { + points_to_update->points.push_back(downsampled_ceiling_cloud_->points[i]); + update_indices.push_back(i); + } + + // use the kdtree to compute normals + pcl::NormalEstimation ne; + ne.setInputCloud(points_to_update); + ne.setSearchMethod(tree_); + ne.setKSearch(normal_search_num_); + pcl::PointCloud::Ptr updated_normals(new pcl::PointCloud); + ne.compute(*updated_normals); + + // write back the updated normals + for (size_t i = 0; i < update_indices.size(); ++i) + { + int idx = update_indices[i]; + downsampled_ceiling_cloud_->points[idx].normal_x = updated_normals->points[i].normal_x; + downsampled_ceiling_cloud_->points[idx].normal_y = updated_normals->points[i].normal_y; + downsampled_ceiling_cloud_->points[idx].normal_z = updated_normals->points[i].normal_z; + downsampled_ceiling_cloud_->points[idx].curvature = updated_normals->points[i].curvature; + } + + // convert to Eigen::Vector3f for further processing + std::vector ceilingPoint_tmp; + ceilingPoint_tmp.reserve(downsampled_ceiling_cloud_tmp_->points.size()); + for (const auto &pt : downsampled_ceiling_cloud_tmp_->points) + { + ceilingPoint_tmp.emplace_back(pt.x, pt.y, pt.z); + } + + updateVoxelMap(ceilingPoint_tmp); + + explored_area_dwz_filter_.setInputCloud(downsampled_explored_area_cloud_); + explored_area_dwz_filter_.filter(*downsampled_explored_area_cloud_); + + ceiling_cloud_dwz_filter_.setInputCloud(downsampled_ceiling_cloud_); + ceiling_cloud_dwz_filter_.filter(*downsampled_ceiling_cloud_); + + sensor_msgs::msg::PointCloud2 explored_area_msg; + pcl::toROSMsg(*downsampled_ceiling_cloud_, 
explored_area_msg); + explored_area_msg.header.stamp = msg->header.stamp; + explored_area_msg.header.frame_id = "map"; + pub_explored_area_->publish(explored_area_msg); + + // get the point cloud that is within the robot's range + in_range_cloud_->clear(); + in_range_cloud_->points.reserve(downsampled_ceiling_cloud_->points.size() / 4); // 预估容量 + float radius_squared = region_growing_radius_ * region_growing_radius_; + for (const auto &pt : downsampled_ceiling_cloud_->points) + { + float dx = pt.x - robot_position_.x; + float dy = pt.y - robot_position_.y; + if (dx * dx + dy * dy <= radius_squared) + { + in_range_cloud_->points.push_back(pt); + } + } + in_range_cloud_->width = in_range_cloud_->points.size(); + in_range_cloud_->height = 1; + in_range_cloud_->is_dense = true; + + // publish the in range cloud for debugging + sensor_msgs::msg::PointCloud2 in_range_cloud_msg; + pcl::toROSMsg(*in_range_cloud_, in_range_cloud_msg); + in_range_cloud_msg.header.stamp = msg->header.stamp; + in_range_cloud_msg.header.frame_id = "map"; + pub_debug_->publish(in_range_cloud_msg); + + explored_area_display_count_ = 0; + explored_area_cloud_tmp_->clear(); + downsampled_ceiling_cloud_tmp_->clear(); + + if (!navigable_map_.empty() && !wall_hist_.empty() && !state_map_.empty()) + { + segment_flag_ = true; + } + else + { + RCLCPP_WARN_THROTTLE( + this->get_logger(), + *this->get_clock(), + 2000, + "Room segmentation skipped: maps not ready yet."); + } + } +} + +void RoomSegmentationNode::occupiedCloudCallback(const sensor_msgs::msg::PointCloud2::ConstSharedPtr msg) { + occupied_cloud_->clear(); + pcl::fromROSMsg(*msg, *occupied_cloud_); + + float height_1 = robot_position_.z - kViewPointCollisionMarginZMinus_; + float height_2 = robot_position_.z + kViewPointCollisionMarginZPlus_; + // filter the occupied cloud within the height range + occupied_pass_filter_.setInputCloud(occupied_cloud_); + occupied_pass_filter_.setFilterLimits(height_1, height_2); + occupied_pass_filter_.filter(*occupied_cloud_); + + if (occupied_cloud_->empty()) + { + RCLCPP_WARN(this->get_logger(), "Occupied cloud is empty after height filtering."); + return; + } + + updateStateVoxel(); +} + +void RoomSegmentationNode::freespaceCloudCallback(const sensor_msgs::msg::PointCloud2::ConstSharedPtr msg) +{ + freespace_cloud_->clear(); + pcl::fromROSMsg(*msg, *freespace_cloud_); + if (freespace_cloud_->empty()) + { + RCLCPP_WARN(this->get_logger(), "Freespace cloud is empty."); + return; + } + + pcl::PointCloud::Ptr freespace_cloud_tmp(new pcl::PointCloud); + updateFreespace(freespace_cloud_tmp); +} + +// ==================== Core Processing Functions ==================== +cv::Mat RoomSegmentationNode::getWall(const pcl::PointCloud::Ptr &cloud) { + auto t_start = std::chrono::high_resolution_clock::now(); + + // Extract normals from PointXYZINormal + auto t0 = std::chrono::high_resolution_clock::now(); + pcl::PointCloud::Ptr normals(new pcl::PointCloud); + normals->resize(cloud->size()); + for (size_t i = 0; i < cloud->size(); ++i) + { + const pcl::PointXYZINormal &pt = cloud->points[i]; + (*normals)[i].normal_x = pt.normal_x; + (*normals)[i].normal_y = pt.normal_y; + (*normals)[i].normal_z = pt.normal_z; + (*normals)[i].curvature = pt.curvature; + } + + auto t1 = std::chrono::high_resolution_clock::now(); + + // Region Growing clustering + t0 = std::chrono::high_resolution_clock::now(); + pcl::IndicesPtr indices(new std::vector); + pcl::removeNaNFromPointCloud(*cloud, *indices); + + pcl::RegionGrowing reg; + reg.setMinClusterSize(300); + 
reg.setMaxClusterSize(1000000); + reg.setSearchMethod(tree_); + reg.setNumberOfNeighbours(50); + reg.setInputCloud(cloud); + reg.setIndices(indices); + reg.setInputNormals(normals); + reg.setSmoothnessThreshold(3 / 180.0 * M_PI); + reg.setCurvatureThreshold(1.0); + reg.setSmoothModeFlag(true); + + std::vector clusters; + reg.extract(clusters); + t1 = std::chrono::high_resolution_clock::now(); + + t0 = std::chrono::high_resolution_clock::now(); + // Process clusters to extract vertical planes + std::vector plane_infos_new; + plane_infos_new.reserve(clusters.size() / 4); // Reserve estimated space + + for (size_t i = 0; i < clusters.size(); ++i) + { + const auto &cluster = clusters[i]; + + // Compute centroid and normal sum directly from indices without copying cloud + Eigen::Vector3f centroid(0, 0, 0); + Eigen::Vector3f normal_sum(0, 0, 0); + int valid_points = 0; + + for (int idx : cluster.indices) + { + const pcl::PointXYZINormal &pt = cloud->points[idx]; + centroid += Eigen::Vector3f(pt.x, pt.y, pt.z); + + const pcl::Normal &n = normals->points[idx]; + if (std::isfinite(n.normal_x) && std::isfinite(n.normal_y) && std::isfinite(n.normal_z)) + { + normal_sum += Eigen::Vector3f(n.normal_x, n.normal_y, n.normal_z); + valid_points++; + } + } + + if (valid_points == 0 || normal_sum.norm() < 1e-3) + continue; + + centroid /= static_cast(cluster.indices.size()); + Eigen::Vector3f avg_normal = normal_sum.normalized(); + + // Early filtering: Remove horizontal planes + float dot = std::abs(avg_normal.dot(Eigen::Vector3f::UnitZ())); + if (dot > std::cos(80.0f * M_PI / 180.0f)) + continue; + + // Compute variance using online algorithm (single pass) + float mean_dist = 0.0f; + float m2 = 0.0f; + int n = 0; + + for (int idx : cluster.indices) + { + const pcl::PointXYZINormal &pt = cloud->points[idx]; + Eigen::Vector3f p(pt.x - centroid.x(), pt.y - centroid.y(), pt.z - centroid.z()); + float dist = p.dot(avg_normal); + + n++; + float delta = dist - mean_dist; + mean_dist += delta / n; + float delta2 = dist - mean_dist; + m2 += delta * delta2; + } + + float variance = (n > 1) ? 
(m2 / n) : 0.0f; + if (variance > 0.1f) + continue; + + avg_normal = (avg_normal - avg_normal.dot(Eigen::Vector3f::UnitZ()) * Eigen::Vector3f::UnitZ()).normalized(); + + std::vector voxel_indices; + voxel_indices.reserve(cluster.indices.size()); + + Eigen::Vector3f u_dir = avg_normal.cross(Eigen::Vector3f::UnitZ()).normalized(); + Eigen::Vector3f v_dir = avg_normal.cross(u_dir).normalized(); + + // Compute bounding rectangle directly from original cloud using indices + float u_min = FLT_MAX, u_max = -FLT_MAX; + float v_min = FLT_MAX, v_max = -FLT_MAX; + + for (int idx : cluster.indices) + { + const pcl::PointXYZINormal &point = cloud->points[idx]; + Eigen::Vector3f p(point.x, point.y, point.z); + Eigen::Vector3f relative = p - centroid; + float u = relative.dot(u_dir); + float v = relative.dot(v_dir); + + u_min = std::min(u_min, u); + u_max = std::max(u_max, u); + v_min = std::min(v_min, v); + v_max = std::max(v_max, v); + + voxel_indices.emplace_back(misc_utils_ns::point_to_voxel(p, shift_, room_resolution_inv_)); + } + + float height = v_max - v_min; + // Early filtering: check height before further computation + if (height < 1.5f) + continue; + + // Now create cluster_cloud only for planes that passed all filters + pcl::PointCloud::Ptr cluster_cloud(new pcl::PointCloud); + cluster_cloud->points.reserve(cluster.indices.size()); + for (int idx : cluster.indices) + { + cluster_cloud->points.push_back(cloud->points[idx]); + } + cluster_cloud->width = cluster_cloud->points.size(); + cluster_cloud->height = 1; + cluster_cloud->is_dense = true; + + // Compute corners of the bounding rectangle + std::array corners = { + centroid + u_dir * u_min + v_dir * v_min, + centroid + u_dir * u_max + v_dir * v_min, + centroid + u_dir * u_max + v_dir * v_max, + centroid + u_dir * u_min + v_dir * v_max}; + + centroid = (corners[0] + corners[1] + corners[2] + corners[3]) / 4.0f; + float width = u_max - u_min; + + plane_infos_new.push_back({static_cast(i), + cluster_cloud, + std::move(voxel_indices), + avg_normal, + centroid, + u_dir, + v_dir, + width, + height, + corners, + true, + false}); + } + + t1 = std::chrono::high_resolution_clock::now(); + + // Extract in-range planes from plane_infos_ + std::vector in_range_plane_indices; + for (size_t i = 0; i < plane_infos_.size(); ++i) + { + const auto &plane = plane_infos_[i]; + if (plane.cloud->size() > 1000) + continue; + + float dist = std::hypot(plane.centroid.x() - robot_position_.x, plane.centroid.y() - robot_position_.y); + if (dist < region_growing_radius_) + { + in_range_plane_indices.push_back(i); + plane_infos_[i].alive = false; + } + } + + // Merge new planes with existing planes + for (size_t i = 0; i < plane_infos_new.size(); ++i) + { + bool found = false; + int found_idx = -1; + for (size_t j = 0; j < plane_infos_.size(); ++j) + { + if (isPlaneSame(plane_infos_new[i], plane_infos_[j])) + { + found = true; + found_idx = static_cast(j); + break; + } + } + if (!found) + { + // if no similar plane found, add as a new plane + plane_infos_new[i].id = static_cast(plane_infos_.size()); + plane_infos_.push_back(plane_infos_new[i]); + } + else + { + // merge with the found plane + mergePlanes(plane_infos_, found_idx, plane_infos_new[i]); + plane_infos_[found_idx].alive = true; + } + } + + // Merge similar planes + for (size_t i = 0; i < plane_infos_.size(); ++i) + { + if (!plane_infos_[i].alive) + continue; + for (size_t j = i + 1; j < plane_infos_.size(); ++j) + { + if (!plane_infos_[j].alive) + continue; + if (isPlaneSame(plane_infos_[i], 
plane_infos_[j])) + { + mergePlanes(plane_infos_, static_cast(i), plane_infos_[j]); + plane_infos_[j].alive = false; + plane_infos_[i].alive = true; + } + } + } + + // Remove planes where most voxels are free + for (auto &plane : plane_infos_) + { + if (!plane.alive) + continue; + int free_count = 0; + int total_count = 0; + for (const auto &voxel_index : plane.voxel_indices) + { + if (voxel_index[0] < 0 || voxel_index[0] >= room_voxel_dimension_[0] || + voxel_index[1] < 0 || voxel_index[1] >= room_voxel_dimension_[1] || + voxel_index[2] < 0 || voxel_index[2] >= room_voxel_dimension_[2]) + continue; + if (state_map_all_.at(voxel_index[0], voxel_index[1]) == 1) + { + free_count++; + } + total_count++; + } + if (free_count > total_count * 0.33f) + { + plane.alive = false; + } + } + + // Remove non-alive planes + plane_infos_.erase(std::remove_if(plane_infos_.begin(), plane_infos_.end(), + [](const PlaneInfo &plane) + { return (!plane.alive); }), + plane_infos_.end()); + + // Visualize merged planes + t0 = std::chrono::high_resolution_clock::now(); + pcl::PointCloud::Ptr merged_cloud(new pcl::PointCloud); + + // Calculate total points for reservation + size_t total_points = 0; + for (const auto &plane : plane_infos_) + total_points += plane.cloud->points.size(); + merged_cloud->points.reserve(total_points); + + for (const auto &plane : plane_infos_) + { + uint8_t r = static_cast(rand() % 256); + uint8_t g = static_cast(rand() % 256); + uint8_t b = static_cast(rand() % 256); + + for (const auto& pt : plane.cloud->points) + { + merged_cloud->points.emplace_back(pt.x, pt.y, pt.z, r, g, b); + } + } + merged_cloud->width = merged_cloud->points.size(); + merged_cloud->height = 1; + merged_cloud->is_dense = true; + + sensor_msgs::msg::PointCloud2 merged_cloud_msg; + pcl::toROSMsg(*merged_cloud, merged_cloud_msg); + merged_cloud_msg.header.frame_id = "map"; + pub_wall_cloud_->publish(merged_cloud_msg); + + t1 = std::chrono::high_resolution_clock::now(); + + auto t_end = std::chrono::high_resolution_clock::now(); + + // Project walls to 2D map (combined loop for both operations) + t0 = std::chrono::high_resolution_clock::now(); + cv::Mat wall_mask(room_voxel_dimension_[0], room_voxel_dimension_[1], CV_8U, cv::Scalar(0)); + + for (const auto &plane : plane_infos_) + { + if (plane.merged) + continue; + + // First pass: basic polygon + std::vector polygon_2d; + polygon_2d.reserve(4); + Eigen::Vector3f outward_1 = plane.normal * outward_distance_1_; + + for (int i = 0; i < 4; i++) + { + Eigen::Vector3f corner = plane.corners[i]; + if (i==0) + corner = corner - outward_1; + else if (i==1) + corner = corner - outward_1; + else if (i==2) + corner = corner + outward_1; + else + corner = corner + outward_1; + + Eigen::Vector3i idx = misc_utils_ns::point_to_voxel(corner, shift_, room_resolution_inv_); + polygon_2d.emplace_back(idx[1], idx[0]); + } + cv::fillPoly(wall_mask, std::vector>{polygon_2d}, cv::Scalar(255)); + + // Second pass: extension along u_dir + polygon_2d.clear(); + polygon_2d.reserve(4); + + for (int i = 0; i<2; ++i) + { + Eigen::Vector3f corner = plane.corners[i]; + Eigen::Vector3f pt_0 = corner; + Eigen::Vector3f pt_1; + if (i == 0) { + pt_1 = corner - plane.u_dir * outward_distance_0_; + } else { + pt_1 = corner + plane.u_dir * outward_distance_0_; + } + + Eigen::Vector3i idx_0 = misc_utils_ns::point_to_voxel(pt_0, shift_, room_resolution_inv_); + Eigen::Vector3i idx_1 = misc_utils_ns::point_to_voxel(pt_1, shift_, room_resolution_inv_); + cv::Point pt_2d_0(idx_0[1], idx_0[0]); + cv::Point 
pt_2d_1(idx_1[1], idx_1[0]); + + cv::LineIterator it(wall_mask, pt_2d_0, pt_2d_1, 8); + + cv::Point pt_found = pt_2d_1; + for (int j = 0; j < it.count; ++j, ++it) + { + cv::Point pt = it.pos(); + if (pt.x <= 0 || pt.x >= wall_mask.cols - 1 || pt.y <= 0 || pt.y >= wall_mask.rows - 1) + { + pt_found = pt; + break; + } + if (wall_mask.at(pt) == 255) + { + pt_found = pt; + break; + } + } + if (pt_found.x >= 0 && pt_found.y >= 0) + { + Eigen::Vector3i idx_found(pt_found.y, pt_found.x, 0); + Eigen::Vector3f pt_found_world = misc_utils_ns::voxel_to_point(idx_found, shift_, room_resolution_); + Eigen::Vector3f pt_outward_0 = pt_found_world + plane.normal * outward_distance_1_; + Eigen::Vector3f pt_outward_1 = pt_found_world - plane.normal * outward_distance_1_; + Eigen::Vector3i idx_outward_0 = misc_utils_ns::point_to_voxel(pt_outward_0, shift_, room_resolution_inv_); + Eigen::Vector3i idx_outward_1 = misc_utils_ns::point_to_voxel(pt_outward_1, shift_, room_resolution_inv_); + polygon_2d.emplace_back(idx_outward_0[1], idx_outward_0[0]); + polygon_2d.emplace_back(idx_outward_1[1], idx_outward_1[0]); + } + } + if (!polygon_2d.empty()) + cv::fillPoly(wall_mask, std::vector>{polygon_2d}, cv::Scalar(255)); + } + + t1 = std::chrono::high_resolution_clock::now(); + + // Crop wall_mask using bbox_ + wall_mask = wall_mask.rowRange(bbox_[0][0], bbox_[1][0] + 1) + .colRange(bbox_[0][1], bbox_[1][1] + 1); + saveImageToFile(wall_mask, "wall_mask_from_planes.png"); + + return wall_mask; +} + +void RoomSegmentationNode::updateVoxelMap(const std::vector &navigable_points) { + // Traverse all points and accumulate to voxels + for (const auto &pt : navigable_points) + { + auto idx = misc_utils_ns::point_to_voxel(pt, shift_, room_resolution_inv_); + // Boundary clipping + idx[0] = std::clamp(idx[0], 0, room_voxel_dimension_[0] - 1); + idx[1] = std::clamp(idx[1], 0, room_voxel_dimension_[1] - 1); + idx[2] = std::clamp(idx[2], 0, room_voxel_dimension_[2] - 1); + + if (navigable_voxels_[toIndex(idx[0], idx[1], idx[2])] == 0 && + state_map_all_.at(idx[0], idx[1]) != 1) // Don't update free space again + { + navigable_voxels_[toIndex(idx[0], idx[1], idx[2])] = 1; + navigable_map_all_.at(idx[0], idx[1]) += 1.0f; // Accumulate to 2D map + + if (wall_thres_height_ < pt.z() && pt.z() < ceiling_height_) + { + // If point height is within wall range, mark corresponding voxel as wall + wall_hist_all_.at(idx[0], idx[1]) += 1.0f; + } + } + } + + // Get the bounding box of non-zero area in navigable_map + std::vector non_zero_points; + cv::findNonZero(navigable_map_all_, non_zero_points); + if (non_zero_points.empty()) + { + return; + } + + cv::Rect rect = cv::boundingRect(non_zero_points); + bbox_[0] = Eigen::Vector2i(rect.tl().y, rect.tl().x); // Top-left corner + bbox_[1] = Eigen::Vector2i(rect.br().y, rect.br().x); // Bottom-right corner + + // Add some margin to the bbox + int margin = 20; + bbox_[0] = (bbox_[0] - Eigen::Vector2i(margin, margin)).cwiseMax(Eigen::Vector2i(0, 0)); + bbox_[1] = (bbox_[1] + Eigen::Vector2i(margin, margin)).cwiseMin(Eigen::Vector2i(room_voxel_dimension_[0] - 1, room_voxel_dimension_[1] - 1)); + + // crop the navigable_map_all using the bbox to get the navigable_map + navigable_map_ = navigable_map_all_.rowRange(bbox_[0][0], bbox_[1][0] + 1).colRange(bbox_[0][1], bbox_[1][1] + 1); + wall_hist_ = wall_hist_all_.rowRange(bbox_[0][0], bbox_[1][0] + 1).colRange(bbox_[0][1], bbox_[1][1] + 1); + state_map_ = state_map_all_.rowRange(bbox_[0][0], bbox_[1][0] + 1).colRange(bbox_[0][1], bbox_[1][1] + 1); +} 
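+// Conventions the voxel bookkeeping in this file relies on (summarized here as a note;
+// see toIndex() and the state-update callbacks for the authoritative definitions):
+//  * toIndex() flattens voxel coordinates row-major, i.e. toIndex(x, y, z) =
+//    x * room_voxel_dimension_[1] * room_voxel_dimension_[2] + y * room_voxel_dimension_[2] + z,
+//    and toIndex(x, y) = x * room_voxel_dimension_[1] + y.
+//  * state_map_all_ marks cells known to be free space with 1; updateVoxelMap() does not
+//    re-accumulate evidence in such cells, and getWall() discards planes whose voxels
+//    mostly fall inside them.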
+ +void RoomSegmentationNode::updateStateVoxel() { + // use the idx store in freespace_indices_ to update the state_map_all_ + for (const auto &pt : occupied_cloud_->points) + { + if (pt.intensity != 0) // Occupied + { + continue; // Only process occupied space + } + auto idx = misc_utils_ns::point_to_voxel( + Eigen::Vector3f(pt.x - room_resolution_ / 2.0 - 1e-4, + pt.y - room_resolution_ / 2.0 - 1e-4, + pt.z - room_resolution_ / 2.0 - 1e-4), + shift_, room_resolution_inv_); + // Boundary clipping + idx[0] = std::clamp(idx[0], 0, room_voxel_dimension_[0] - 1); + idx[1] = std::clamp(idx[1], 0, room_voxel_dimension_[1] - 1); + idx[2] = std::clamp(idx[2], 0, room_voxel_dimension_[2] - 1); + + // Check if all surrounding cells are in freespace_indices + bool flag = true; + for (int dx = 0; dx <= 1; ++dx) + { + for (int dy = 0; dy <= 1; ++dy) + { + int nx = idx[0] + dx; + int ny = idx[1] + dy; + if (nx >= 0 && nx < room_voxel_dimension_[0] && + ny >= 0 && ny < room_voxel_dimension_[1]) + { + if (std::find(freespace_indices_.begin(), freespace_indices_.end(), + toIndex(nx, ny)) == freespace_indices_.end()) + { + flag = false; // If any point is not in freespace_indices, don't mark as free space + break; + } + } + } + } + + if (flag) + { + // Mark surrounding area as free space + for (int dx = -1; dx <= 2; ++dx) + { + for (int dy = -1; dy <= 2; ++dy) + { + int nx = idx[0] + dx; + int ny = idx[1] + dy; + if (nx >= 0 && nx < room_voxel_dimension_[0] && + ny >= 0 && ny < room_voxel_dimension_[1]) + { + state_map_all_.at(nx, ny) = 1; // Mark as free space + navigable_map_all_.at(nx, ny) = 1.0f; + wall_hist_all_.at(nx, ny) = 1.0f; // Clear wall history + + for (int z = 0; z < room_voxel_dimension_[2]; ++z) + { + int index = toIndex(nx, ny, z); + navigable_voxels_[index] = 0; + } + + Eigen::Vector3f pt_position = misc_utils_ns::voxel_to_point( + Eigen::Vector3i(nx, ny, 0), shift_, room_resolution_); + pt_position.z() = robot_position_.z; // Keep z-axis height constant + pcl::PointXYZI pt_new; + pt_new.x = pt_position.x(); + pt_new.y = pt_position.y(); + pt_new.z = pt_position.z(); + pt_new.intensity = 0; // Mark as free space + updated_voxel_cloud_->points.push_back(pt_new); + } + } + } + updated_voxel_cloud_->points.push_back(pt); + } + } + + if (!updated_voxel_cloud_->empty()) + { + // Publish the updated_voxel_cloud using pub_debug_ + sensor_msgs::msg::PointCloud2 occupied_cloud_msg; + pcl::toROSMsg(*updated_voxel_cloud_, occupied_cloud_msg); + occupied_cloud_msg.header.stamp = this->now(); + occupied_cloud_msg.header.frame_id = "map"; + pub_debug_->publish(occupied_cloud_msg); + } +} + +void RoomSegmentationNode::updateFreespace(pcl::PointCloud::Ptr &freespace_cloud_tmp) +{ + // this function only updates the navigable_voxels_, navigable_map_all_, wall_hist_all_ based on the freespace_cloud_ + // this function will store the freespace voxel indices in freespace_indices_, and it is used in updateStateVoxel() to update the state_map_all_ + freespace_indices_.clear(); + for (auto &pt : freespace_cloud_->points) + { + auto idx = misc_utils_ns::point_to_voxel(Eigen::Vector3f(pt.x + 1e-4, pt.y + 1e-4, pt.z + 1e-4), shift_, room_resolution_inv_); + // check if the point is within bounds + idx[0] = std::clamp(idx[0], 0, room_voxel_dimension_[0] - 1); + idx[1] = std::clamp(idx[1], 0, room_voxel_dimension_[1] - 1); + idx[2] = std::clamp(idx[2], 0, room_voxel_dimension_[2] - 1); + + for (int dx = 0; dx <= 1; ++dx) + { + for (int dy = 0; dy <= 1; ++dy) + { + int nx = idx[0] + dx; + int ny = idx[1] + dy; + if 
(nx >= 0 && nx < room_voxel_dimension_[0] && + ny >= 0 && ny < room_voxel_dimension_[1] && + state_map_all_.at(nx, ny) != 1) + { + Eigen::Vector3f pt_position = misc_utils_ns::voxel_to_point(Eigen::Vector3i(nx, ny, idx[2]), shift_, room_resolution_); + pcl::PointXYZI pt_xyz; + pt_xyz.x = pt_position.x(); + pt_xyz.y = pt_position.y(); + pt_xyz.z = robot_position_.z; // 使用机器人的z坐标作为高度 + pt_xyz.intensity = 1.0f; // 设置强度为1.0f + freespace_cloud_tmp->points.push_back(pt_xyz); + + freespace_indices_.push_back(toIndex(nx, ny)); // 记录free space的体素索引 + for (int dz = -2; dz <= 5; ++dz) // 只在z轴上扩展 + { + int nz = idx[2] + dz; + if (nz >= 0 && nz < room_voxel_dimension_[2]) + { + if (navigable_voxels_[toIndex(nx, ny, nz)] == 1) + { + navigable_voxels_[toIndex(nx, ny, nz)] = 0; + navigable_map_all_.at(nx, ny) -= 1.0f; + float pt_z = pt.z + dz * room_resolution_; // 计算实际的z坐标 + if (wall_thres_height_ < pt_z && pt_z < ceiling_height_) + wall_hist_all_.at(nx, ny) -= 1.0f; // 更新墙体地图 + } + } + } + } + } + } + } + + // use pub_debug_1_ to publish the freespace cloud + sensor_msgs::msg::PointCloud2 freespace_cloud_msg; + pcl::toROSMsg(*freespace_cloud_tmp, freespace_cloud_msg); + freespace_cloud_msg.header.stamp = this->now(); + freespace_cloud_msg.header.frame_id = "map"; + pub_debug_1_->publish(freespace_cloud_msg); +} + +void RoomSegmentationNode::mergePlanes(std::vector &plane_infos, int idx_0, PlaneInfo &compare) { + // Merge planes + PlaneInfo &base = plane_infos[idx_0]; + compare.merged = true; + + size_t size_0 = base.cloud->size(); + size_t size_1 = compare.cloud->size(); + base.centroid = (base.centroid * size_0 + compare.centroid * size_1) / (size_0 + size_1); + base.cloud->insert(base.cloud->end(), compare.cloud->begin(), compare.cloud->end()); + + // Downsample the merged cloud + explored_area_dwz_filter_.setInputCloud(base.cloud); + explored_area_dwz_filter_.filter(*base.cloud); + + // Use RANSAC to fit the normal + pcl::PointCloud::Ptr merged_cloud(new pcl::PointCloud); + pcl::copyPointCloud(*base.cloud, *merged_cloud); + + pcl::SACSegmentation seg; + seg.setOptimizeCoefficients(true); + seg.setModelType(pcl::SACMODEL_PLANE); + seg.setMethodType(pcl::SAC_RANSAC); + seg.setDistanceThreshold(0.2); + seg.setInputCloud(merged_cloud); + + pcl::ModelCoefficients coefficients; + pcl::PointIndices inliers; + seg.segment(inliers, coefficients); + + if (inliers.indices.size() > 0) + { + Eigen::Vector3f normal(coefficients.values[0], coefficients.values[1], coefficients.values[2]); + normal = (normal - normal.dot(Eigen::Vector3f::UnitZ()) * Eigen::Vector3f::UnitZ()).normalized(); // Keep only component perpendicular to z-axis + base.normal = normal; + + // Store the inliers as the point cloud + base.cloud->clear(); + base.voxel_indices.clear(); + for (const auto &index : inliers.indices) + { + pcl::PointXYZINormal pt = merged_cloud->points[index]; + base.cloud->push_back(pt); + Eigen::Vector3f pt_f(pt.x, pt.y, pt.z); + Eigen::Vector3i voxel_index = misc_utils_ns::point_to_voxel(pt_f, shift_, room_resolution_inv_); + base.voxel_indices.push_back(voxel_index); + } + } + + base.u_dir = base.normal.cross(Eigen::Vector3f::UnitZ()).normalized(); + base.v_dir = base.normal.cross(base.u_dir).normalized(); + + // Update bounding box + // Treat point cloud as points on a plane, compute the four corner points + Eigen::Vector4f centroid4f; + pcl::compute3DCentroid(*base.cloud, centroid4f); + base.centroid = centroid4f.head<3>(); + + float u_min = FLT_MAX, u_max = -FLT_MAX; + float v_min = FLT_MAX, v_max = -FLT_MAX; + for 
(const auto &point : base.cloud->points) + { + Eigen::Vector3f p(point.x, point.y, point.z); + Eigen::Vector3f relative = p - base.centroid; + float u = relative.dot(base.u_dir); + float v = relative.dot(base.v_dir); + + u_min = std::min(u_min, u); + u_max = std::max(u_max, u); + v_min = std::min(v_min, v); + v_max = std::max(v_max, v); + } + + // Four corner points (counter-clockwise, starting from bottom-left) + std::array corners = { + base.centroid + base.u_dir * u_min + base.v_dir * v_min, + base.centroid + base.u_dir * u_max + base.v_dir * v_min, + base.centroid + base.u_dir * u_max + base.v_dir * v_max, + base.centroid + base.u_dir * u_min + base.v_dir * v_max}; + + base.centroid = (corners[0] + corners[1] + corners[2] + corners[3]) / 4.0f; + base.corners = corners; + + // Compute plane width and height + base.width = u_max - u_min; + base.height = v_max - v_min; +} + +void RoomSegmentationNode::updateRooms(cv::Mat &room_mask_cropped, cv::Mat &room_mask_new, + cv::Mat &room_mask_vis_cropped, int &room_number) { + // Step 1: Only keep regions in room_mask_cropped that are non-zero in room_mask_new + cv::Mat mask = (room_mask_new != 0); + room_mask_cropped.setTo(0, mask == 0); + + // Create a set of all room IDs from 1 to room_number that need processing + std::set room_need_process_ids; + for (int i = 1; i <= room_number; ++i) + { + room_need_process_ids.insert(i); + } + + Eigen::Vector3d bg_color_ = misc_utils_ns::idToColor(0); // Background color is white + cv::Vec3b bg_color = cv::Vec3b(bg_color_[0], bg_color_[1], bg_color_[2]); + Eigen::Vector3d color_; + cv::Vec3b color; + + room_mask_vis_cropped.setTo(bg_color, mask == 0); + + // Check each room node, three cases: + // 1. If most of room's mask in room_mask_new is 0, the room is invalid or merged, should be deleted + // 2. If room's mask in room_mask_new has only 1 non-zero value, compare old and new masks + // 3. 
If room's mask in room_mask_new has multiple non-zero values, the old room was split + + if (room_mask_new.size() != room_mask_cropped.size()) + { + throw std::runtime_error("room_mask_new and room_mask must have the same size"); + } + + std::set room_ids_to_remove; + std::vector room_nodes_new; + + for (auto &id_room_node_pair : room_nodes_map_) + { + int room_id = id_room_node_pair.first; + representation_ns::RoomNodeRep &room_node = id_room_node_pair.second; + std::vector non_zero_points_old = room_node.points_; + + if (non_zero_points_old.empty()) + { + room_node.SetAlive(false); + room_ids_to_remove.insert(room_id); + std::cout << "Room " << room_id << " erase 1" << std::endl; + continue; + } + + std::vector non_zero_points_old_cropped; + non_zero_points_old_cropped.reserve(non_zero_points_old.size()); + cv::Mat mask_old = cv::Mat::zeros(room_mask_cropped.size(), CV_8U); + + for (const auto &pt : non_zero_points_old) + { + mask_old.at(pt.y - bbox_[0][0], pt.x - bbox_[0][1]) = 1; + cv::Point pt_cropped(pt.x - bbox_[0][1], pt.y - bbox_[0][0]); + non_zero_points_old_cropped.push_back(pt_cropped); + } + int value_old = room_node.GetId(); + + bool all_zero = true; + int zero_count = 0; + std::set values_set = {}; + std::unordered_map value_count_map = {}; + std::set values_set_old; + + for (const auto &pt : non_zero_points_old_cropped) + { + int value = room_mask_new.at(pt.y, pt.x); + int value_old_tmp = room_mask_cropped.at(pt.y, pt.x); + if (value > 0) + { + values_set.insert(value); + value_count_map[value]++; + all_zero = false; + } + else + { + zero_count++; + } + if (value_old_tmp > 0) + { + values_set_old.insert(value_old_tmp); + } + } + + // Case 1: Room is mostly zero or invalid + if (all_zero || zero_count > non_zero_points_old.size() * 0.8 || + values_set_old.find(value_old) == values_set_old.end()) + { + room_mask_cropped.setTo(0, mask_old); + room_node.SetAlive(false); + room_ids_to_remove.insert(room_id); + std::cout << "Room " << room_id << " erase 2" << std::endl; + RCLCPP_INFO(this->get_logger(), + "Room %d: all_zero = %d, zero_count = %d, value_found = %d.", + room_id, all_zero, (zero_count > non_zero_points_old.size() * 0.8), + (values_set_old.find(value_old) == values_set_old.end())); + } + // Case 2: Room has single non-zero value + else if (values_set.size() == 1 && *values_set.begin() != 0) + { + int value_new = *values_set.begin(); + cv::Mat mask_new = (room_mask_new == value_new); + + if (cv::countNonZero(mask_new ^ mask_old) == 0) + { + room_mask_new.setTo(0, mask_old); + room_need_process_ids.erase(value_new); + continue; + } + else + { + room_mask_cropped.setTo(0, mask_old); + room_mask_vis_cropped.setTo(bg_color, mask_old); + + std::vector non_zero_points_new; + cv::findNonZero(mask_new, non_zero_points_new); + + room_mask_cropped.setTo(value_old, mask_new); + color_ = misc_utils_ns::idToColor(value_old); + color = cv::Vec3b(color_[0], color_[1], color_[2]); + room_mask_vis_cropped.setTo(color, mask_new); + room_mask_new.setTo(0, mask_new); + + for (auto &pt : non_zero_points_new) + { + pt.x += bbox_[0][1]; + pt.y += bbox_[0][0]; + } + room_node.UpdateRoomNode(non_zero_points_new); + room_need_process_ids.erase(value_new); + } + } + // Case 3: Room has multiple non-zero values (split) + else if (values_set.size() > 1) + { + std::vector non_zero_values; + for (const auto &value : values_set) + { + if (value != 0) + non_zero_values.push_back(value); + } + + // Find value with maximum count + int max_area_value = 0; + int max_area = 0; + for (const auto &pair : 
value_count_map) + { + if (pair.second > max_area) + { + max_area_value = pair.first; + max_area = pair.second; + } + } + + cv::Mat mask_new = (room_mask_new == max_area_value); + room_mask_cropped.setTo(0, mask_old); + room_mask_vis_cropped.setTo(bg_color, mask_old); + + room_mask_cropped.setTo(value_old, mask_new); + color_ = misc_utils_ns::idToColor(value_old); + color = cv::Vec3b(color_[0], color_[1], color_[2]); + room_mask_vis_cropped.setTo(color, mask_new); + room_mask_new.setTo(0, mask_new); + + std::vector non_zero_points_new; + cv::findNonZero(mask_new, non_zero_points_new); + for (auto &pt : non_zero_points_new) + { + pt.x += bbox_[0][1]; + pt.y += bbox_[0][0]; + } + room_node.UpdateRoomNode(non_zero_points_new); + room_need_process_ids.erase(max_area_value); + + // Process other values + for (const auto &value : non_zero_values) + { + if (value == max_area_value) + continue; + + cv::Mat mask_other; + cv::bitwise_and((room_mask_new == value), (room_mask_cropped == 0), mask_other); + std::vector non_zero_points_other; + cv::findNonZero(mask_other, non_zero_points_other); + + cv::Mat mask_other_tmp = (room_mask_new == value); + std::vector non_zero_points_other_tmp; + cv::findNonZero(mask_other_tmp, non_zero_points_other_tmp); + + if (((float)non_zero_points_other.size() / (non_zero_points_other_tmp.size() + 1e-5) > 0.5) && + non_zero_points_other.size() > 0) + { + room_node_counter_++; + int new_room_id = room_node_counter_; + room_mask_cropped.setTo(new_room_id, mask_other_tmp); + Eigen::Vector3d new_color_ = misc_utils_ns::idToColor(new_room_id); + cv::Vec3b new_color = cv::Vec3b(new_color_[0], new_color_[1], new_color_[2]); + room_mask_vis_cropped.setTo(new_color, mask_other_tmp); + room_mask_new.setTo(0, mask_other_tmp); + + std::vector non_zero_points_new; + cv::findNonZero(mask_other_tmp, non_zero_points_new); + for (auto &pt : non_zero_points_new) + { + pt.x += bbox_[0][1]; + pt.y += bbox_[0][0]; + } + + representation_ns::RoomNodeRep new_room_node(new_room_id, non_zero_points_new); + room_nodes_new.push_back(new_room_node); + room_need_process_ids.erase(value); + } + } + } + } + + // Add new room nodes + for (auto &new_room_node : room_nodes_new) + { + int new_room_id = new_room_node.GetId(); + if (room_nodes_map_.find(new_room_id) == room_nodes_map_.end()) + { + room_nodes_map_[new_room_id] = new_room_node; + } + else + { + throw std::runtime_error("New room id already exists in room_nodes_map: " + std::to_string(new_room_id)); + } + } + + // Process remaining room IDs + for (const auto &room_id : room_need_process_ids) + { + cv::Mat mask_new = (room_mask_new == room_id); + if (cv::countNonZero(mask_new) > 0) + { + room_node_counter_++; + int new_room_id = room_node_counter_; + room_mask_cropped.setTo(new_room_id, mask_new); + Eigen::Vector3d new_color_ = misc_utils_ns::idToColor(new_room_id); + cv::Vec3b new_color = cv::Vec3b(new_color_[0], new_color_[1], new_color_[2]); + room_mask_vis_cropped.setTo(new_color, mask_new); + room_mask_new.setTo(0, mask_new); + + std::vector non_zero_points_new; + cv::findNonZero(mask_new, non_zero_points_new); + for (auto &pt : non_zero_points_new) + { + pt.x += bbox_[0][1]; + pt.y += bbox_[0][0]; + } + + representation_ns::RoomNodeRep new_room_node(new_room_id, non_zero_points_new); + room_nodes_map_[new_room_id] = new_room_node; + } + } + + // Remove all inactive room nodes + for (auto it = room_nodes_map_.begin(); it != room_nodes_map_.end();) + { + assert(it->second.id_ == it->first); + if (!it->second.alive) { + it = 
room_nodes_map_.erase(it); + } else { + ++it; + } + } + + // Verify all room IDs processed + int non_zero_count = cv::countNonZero(room_mask_new); + room_mask_new *= 255; + saveImageToFile(room_mask_new, "room_mask_new.png"); + assert(non_zero_count == 0); + + // Renumber rooms and update visualization + cv::Mat room_mask_vis_cropped_new = room_mask_vis_.rowRange(bbox_[0][0], bbox_[1][0] + 1) + .colRange(bbox_[0][1], bbox_[1][1] + 1); + pcl::PointCloud::Ptr room_cloud(new pcl::PointCloud); + + int counter = 1; + for (auto &id_room_node_pair : room_nodes_map_) + { + representation_ns::RoomNodeRep &room_node = id_room_node_pair.second; + cv::Mat mask_new = (room_mask_cropped == room_node.GetId()); + room_node.SetRoomMask(mask_new); + room_node.show_id_ = counter; + counter++; + + geometry_msgs::msg::PolygonStamped new_room_polygon = computePolygonFromMaskCropped(mask_new); + room_node.UpdatePolygon(new_room_polygon); + room_node.area_ = cv::countNonZero(mask_new) * room_resolution_ * room_resolution_; + + Eigen::Vector3f room_centroid(0.0, 0.0, 0.0); + if (!room_node.points_.empty()) + { + for (const auto &pt : room_node.points_) + { + Eigen::Vector3i pt_voxel(pt.y, pt.x, 0); + Eigen::Vector3f pt_position = misc_utils_ns::voxel_to_point(pt_voxel, shift_, room_resolution_); + pcl::PointXYZRGB point; + point.x = pt_position.x(); + point.y = pt_position.y(); + point.z = robot_position_.z; + Eigen::Vector3d color_ = misc_utils_ns::idToColor(room_node.GetId()); + point.b = static_cast(color_[0]); + point.g = static_cast(color_[1]); + point.r = static_cast(color_[2]); + room_cloud->points.push_back(point); + + room_centroid.x() += pt_position.x(); + room_centroid.y() += pt_position.y(); + } + } + room_centroid.x() /= room_node.points_.size(); + room_centroid.y() /= room_node.points_.size(); + room_centroid.z() = robot_position_.z; + room_node.centroid_ = room_centroid; + room_node.neighbors_.clear(); + } + + sensor_msgs::msg::PointCloud2 room_cloud_msg; + pcl::toROSMsg(*room_cloud, room_cloud_msg); + room_cloud_msg.header.stamp = this->now(); + room_cloud_msg.header.frame_id = "map"; + pub_room_map_cloud_->publish(room_cloud_msg); +} + +// TODO: Copy implementation from original file lines 1600-2100 +void RoomSegmentationNode::roomSegmentation() +{ + auto t_all_0 = std::chrono::high_resolution_clock::now(); + auto t_0 = std::chrono::high_resolution_clock::now(); + + if (navigable_map_.empty() || wall_hist_.empty() || state_map_.empty()) + { + RCLCPP_WARN_THROTTLE( + this->get_logger(), + *this->get_clock(), + 2000, + "Room segmentation skipped: empty map buffers."); + return; + } + + // Extract the room boundary from the navigable map + cv::Mat hist_full = navigable_map_.clone(); + cv::normalize(hist_full, hist_full, 0, 255, cv::NORM_MINMAX); + cv::Mat outside_boundary = cv::Mat::zeros(hist_full.size(), CV_8U); + cv::threshold(hist_full, outside_boundary, 0, 255, cv::THRESH_BINARY); + saveImageToFile(outside_boundary, "full_map_1.png"); + + if (outside_boundary.type() != CV_8U) + outside_boundary.convertTo(outside_boundary, CV_8U); + + auto t_1 = std::chrono::high_resolution_clock::now(); + // std::cout << "[Time] Extract Boundary: " << std::chrono::duration(t_1 - t_0).count() << " s" << std::endl; + + // --------------------------- Pre-Processing: Exacting the wall --------------------------- + t_0 = std::chrono::high_resolution_clock::now(); + cv::Mat wall_from_plane = getWall(in_range_cloud_); + t_1 = std::chrono::high_resolution_clock::now(); + // std::cout << "[Time] Get Wall from Plane: " << 
std::chrono::duration(t_1 - t_0).count() << " s" << std::endl; + + // --------------------------- Extract walls from histogram --------------------------- + t_0 = std::chrono::high_resolution_clock::now(); + cv::Mat wall_from_hist = wall_hist_.clone(); + cv::normalize(wall_from_hist, wall_from_hist, 0, 1, cv::NORM_MINMAX); + wall_from_hist.convertTo(wall_from_hist, CV_8U); + saveImageToFile(wall_from_hist, "walls_skeleton_hist_1_raw.png"); + double max_val_wall_from_hist = 0; + cv::minMaxLoc(wall_from_hist, nullptr, &max_val_wall_from_hist); + double threshold_wall_from_hist = max_val_wall_from_hist * 0.5; + cv::threshold(wall_from_hist, wall_from_hist, threshold_wall_from_hist, 255, cv::THRESH_BINARY); + cv::normalize(wall_from_plane, wall_from_plane, 0, 255, cv::NORM_MINMAX); + wall_from_plane.convertTo(wall_from_plane, CV_8U); + cv::threshold(wall_from_plane, wall_from_plane, 0, 255, cv::THRESH_BINARY); + // save 2 histograms speparately + saveImageToFile(wall_from_plane, "wall_from_plane.png"); + saveImageToFile(wall_from_hist, "wall_from_hist.png"); + saveImageToFile(state_map_, "state_map.png"); + + // Combine the two histograms + cv::Mat walls_skeleton_hist_connected = wall_from_hist.clone(); + wall_from_plane = wall_from_plane | wall_from_hist; + + wall_from_plane.setTo(0, state_map_); + + t_1 = std::chrono::high_resolution_clock::now(); + // std::cout << "[Time] Extract Wall from Histogram: " << std::chrono::duration(t_1 - t_0).count() << " s" << std::endl; + + // --------------------------- Contour Processing --------------------------- + t_0 = std::chrono::high_resolution_clock::now(); + std::vector> contours; + std::vector hierarchy; + cv::findContours(outside_boundary.clone(), contours, hierarchy, cv::RETR_CCOMP, cv::CHAIN_APPROX_SIMPLE); + + double min_hole_area = 400.0; + outside_boundary = cv::Mat::zeros(outside_boundary.size(), CV_8U); + + for (int i = 0; i < contours.size(); ++i) + { + // 外轮廓 + if (hierarchy[i][3] == -1) + { + cv::drawContours(outside_boundary, contours, i, cv::Scalar(255), cv::FILLED); + + int child = hierarchy[i][2]; + while (child != -1) + { + double area = cv::contourArea(contours[child]); + if (area >= min_hole_area) + { + // 仅当内轮廓面积足够大时才绘制 + cv::drawContours(outside_boundary, contours, child, cv::Scalar(0), cv::FILLED); + } + child = hierarchy[child][0]; // 下一个兄弟 + } + } + } + + saveImageToFile(wall_from_plane, "wall_all.png"); + + cv::Mat outside_boundary_connected = outside_boundary.clone(); + + // Remove wall positions from boundary + outside_boundary.setTo(0, wall_from_plane); + outside_boundary_connected.setTo(0, walls_skeleton_hist_connected); + + // Filter connected components by area + cv::Mat labels, stats, centroids; + int num_labels = cv::connectedComponentsWithStats(outside_boundary, labels, stats, centroids, 8); + + cv::Mat area_mask = cv::Mat::zeros(outside_boundary.size(), CV_8U); + for (int i = 1; i < num_labels; ++i) + { + if (stats.at(i, cv::CC_STAT_AREA) > 100) + { + cv::Mat label_mask = (labels == i); + area_mask.setTo(255, label_mask); + } + } + outside_boundary = area_mask.clone(); + + // filter small connected components in outside_boundary_connected + num_labels = cv::connectedComponentsWithStats(outside_boundary_connected, labels, stats, centroids, 8); + area_mask = cv::Mat::zeros(outside_boundary_connected.size(), CV_8U); + for (int i = 1; i < num_labels; ++i) + { + if (stats.at(i, cv::CC_STAT_AREA) > 100) + { + cv::Mat label_mask = (labels == i); + area_mask.setTo(255, label_mask); + } + } + outside_boundary_connected = 
area_mask.clone(); + + cv::Mat full_map, full_map_connected; + cv::bitwise_not(outside_boundary, full_map); + cv::bitwise_not(outside_boundary_connected, full_map_connected); + cv::Mat boundary_mask = full_map.clone(); + + saveImageToFile(full_map, "full_map.png"); + saveImageToFile(full_map_connected, "full_map_connected.png"); + + t_1 = std::chrono::high_resolution_clock::now(); + // std::cout << "[Time] Contour Processing: " << std::chrono::duration(t_1 - t_0).count() << " s" << std::endl; + + // --------------------------- Boundary Dilation --------------------------- + t_0 = std::chrono::high_resolution_clock::now(); + + std::vector found_region_centers; + std::vector found_region_masks; + + cv::Mat kernel_ = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3)); + + cv::Mat new_boundary; + cv::dilate(boundary_mask, new_boundary, kernel_, cv::Point(-1, -1), dilation_iteration_); + + cv::Mat new_boundary_inv; + cv::bitwise_not(new_boundary, new_boundary_inv); + + saveImageToFile(new_boundary, "new_boundary.png"); + + cv::Mat new_labels, new_stats, centers; + int num_rooms = cv::connectedComponentsWithStats(new_boundary_inv, new_labels, new_stats, centers, 8); + + for (int label = 1; label < num_rooms; ++label) + { + cv::Mat mask = (new_labels == label); + int area = new_stats.at(label, cv::CC_STAT_AREA); + if (area > min_room_size_) // TODO + { + double cy = centers.at(label, 0); + double cx = centers.at(label, 1); + // add the bbox back to cx and cy + cx = cx + bbox_[0][0]; + cy = cy + bbox_[0][1]; + found_region_centers.emplace_back(cx, cy); + // 用mask直接重写new_boundary + new_boundary.setTo(255, mask); + + mask.convertTo(mask, CV_8U, 255); + cv::Mat mask_dilated; + // cv::dilate(mask, mask_dilated, kernel_, cv::Point(-1, -1), dilation_iteration / 2); + found_region_masks.push_back(mask); + } + } + + // --------------------------- Watershed Segmentation --------------------------- + cv::Mat markers = cv::Mat::zeros(boundary_mask.size(), CV_32S); + // Set background markers + cv::Mat bg_mask = (full_map_connected != 0); + markers.setTo(found_region_masks.size() + 1, bg_mask); + + // Set foreground markers + for (size_t i = 0; i < found_region_masks.size(); ++i) + { + cv::Mat mask = (found_region_masks[i] == 255); + markers.setTo(static_cast(i + 1), mask); + } + + cv::Mat full_map_color; + cv::cvtColor(full_map_connected, full_map_color, cv::COLOR_GRAY2BGR); + saveImageToFile(full_map_color, "full_map_color.png"); + cv::watershed(full_map_color, markers); + + // --------------------------- Merge Rooms --------------------------- + auto t0_merge_room = std::chrono::high_resolution_clock::now(); + // Convert cropped markers to original image size + cv::Mat door_mask = (markers == -1); + + // Mark all doors on image boundaries as 0 + for (int c = 0; c < door_mask.cols; ++c) + { + if (door_mask.at(0, c) != 0) + door_mask.at(0, c) = 0; + if (door_mask.at(door_mask.rows - 1, c) != 0) + door_mask.at(door_mask.rows - 1, c) = 0; + } + + // Left and right columns + for (int r = 0; r < door_mask.rows; ++r) + { + if (door_mask.at(r, 0) != 0) + door_mask.at(r, 0) = 0; + if (door_mask.at(r, door_mask.cols - 1) != 0) + door_mask.at(r, door_mask.cols - 1) = 0; + } + + // Set all markers with value found_region_masks.size() + 1 and -1 to 0 + markers.setTo(0, markers == found_region_masks.size() + 1); + markers.setTo(0, markers == -1); + + cv::Mat room_mask_cropped = room_mask_.rowRange(bbox_[0][0], bbox_[1][0] + 1) + .colRange(bbox_[0][1], bbox_[1][1] + 1); + cv::Mat room_mask_vis_cropped = 
room_mask_vis_.rowRange(bbox_[0][0], bbox_[1][0] + 1) + .colRange(bbox_[0][1], bbox_[1][1] + 1); + int room_number = found_region_masks.size(); + updateRooms(room_mask_cropped, markers, room_mask_vis_cropped, room_number); + + auto t1_merge_room = std::chrono::high_resolution_clock::now(); + // std::cout << "[Time] Merge Rooms: " << std::chrono::duration(t1_merge_room - t0_merge_room).count() << " s" << std::endl; + + t_1 = std::chrono::high_resolution_clock::now(); + // std::cout << "[Time] Dilation: " << std::chrono::duration(t_1 - t_0).count() << " s" << std::endl; + + // --------------------------- Door Detection and Room Connectivity --------------------------- + t_0 = std::chrono::high_resolution_clock::now(); + + Eigen::Vector3i robot_idx = misc_utils_ns::point_to_voxel_cropped( + Eigen::Vector3f(robot_position_.x, robot_position_.y, robot_position_.z), + shift_, + room_resolution_inv_, + bbox_); + int current_room_label = room_mask_cropped.at(robot_idx[0], robot_idx[1]); + + if ((current_room_label > 0 && current_room_label <= static_cast(found_region_masks.size()))) + { + cv::Mat current_room_mask = (room_mask_cropped == current_room_label); + geometry_msgs::msg::PolygonStamped boundary_polygon = computePolygonFromMaskCropped(current_room_mask); + pub_room_boundary_->publish(boundary_polygon); + } + + door_cloud_->clear(); + // Assert room_mask_cropped has the same size as door_mask + assert(room_mask_cropped.size() == door_mask.size()); + + // Filter door points + for (int r = 1; r < door_mask.rows - 1; ++r) + { + for (int c = 1; c < door_mask.cols - 1; ++c) + { + if (door_mask.at(r, c) != 0) + { + bool not_door = false; + std::set neighborLabels; + neighborLabels.clear(); + + for (int dr = -1; dr <= 1; ++dr) + { + for (int dc = -1; dc <= 1; ++dc) + { + if (r + dr < 0 || r + dr >= door_mask.rows || + c + dc < 0 || c + dc >= door_mask.cols || (dr == 0 && dc == 0)) + { + continue; + } + int label = room_mask_cropped.at(r + dr, c + dc); + if (label > 0) + { + neighborLabels.insert(label); + } + } + } + + if (neighborLabels.size() == 1) + { + not_door = true; + door_mask.at(r, c) = 0; + } + if (neighborLabels.size() > 2) + { + not_door = true; + for (int dr = -1; dr <= 1; ++dr) + { + for (int dc = -1; dc <= 1; ++dc) + { + if (r + dr < 0 || r + dr >= door_mask.rows || + c + dc < 0 || c + dc >= door_mask.cols) + { + continue; + } + door_mask.at(r + dr, c + dc) = 0; + } + } + } + } + } + } + + // Find the largest key in the room_nodes_map + int max_room_label = 0; + for (const auto &id_room_node_pair : room_nodes_map_) + { + if (id_room_node_pair.first > max_room_label) + { + max_room_label = id_room_node_pair.first; + } + } + assert(max_room_label == room_node_counter_); + + // Initialize adjacency matrix + Eigen::MatrixXi adjacency_matrix = Eigen::MatrixXi::Zero(max_room_label, max_room_label); + + // Find connected components in door_mask + num_labels = cv::connectedComponentsWithStats(door_mask, labels, stats, centroids, 8); + + std::vector non_zero_points; + for (int label = 1; label < num_labels; ++label) + { + cv::Mat label_mask = (labels == label); + non_zero_points.clear(); + cv::findNonZero(label_mask, non_zero_points); + + std::set neighborLabels; + neighborLabels.clear(); + + for (const auto &point : non_zero_points) + { + int r = point.y; + int c = point.x; + + for (int dr = -1; dr <= 1; ++dr) + { + for (int dc = -1; dc <= 1; ++dc) + { + if (r + dr < 0 || r + dr >= door_mask.rows || + c + dc < 0 || c + dc >= door_mask.cols || (dr == 0 && dc == 0)) + { + continue; + } + int 
label = room_mask_cropped.at(r + dr, c + dc); + neighborLabels.insert(label); + } + } + } + + neighborLabels.erase(0); + if (neighborLabels.empty()) + { + continue; + } + if (neighborLabels.size() != 2) + { + RCLCPP_ERROR(this->get_logger(), "Door mask has more than two labels or less than two labels, skipping this door."); + continue; + } + + int room_label_1 = *neighborLabels.begin(); + int room_label_2 = *std::next(neighborLabels.begin()); + + if (room_label_1 > room_label_2) + { + std::swap(room_label_1, room_label_2); + } + + // Add neighbors to room nodes + room_nodes_map_[room_label_1].neighbors_.insert(room_label_2); + room_nodes_map_[room_label_2].neighbors_.insert(room_label_1); + + int door_id = adjacency_matrix(room_label_1 - 1, room_label_2 - 1); + adjacency_matrix(room_label_1 - 1, room_label_2 - 1) += 1; + adjacency_matrix(room_label_2 - 1, room_label_1 - 1) += 1; + + for (const auto &point : non_zero_points) + { + pcl::PointXYZRGBL door_point; + Eigen::Vector3i door_idx(point.y, point.x, 0); + Eigen::Vector3f door_position = misc_utils_ns::voxel_to_point_cropped(door_idx, shift_, room_resolution_, bbox_); + door_point.x = door_position.x(); + door_point.y = door_position.y(); + door_point.z = robot_position_.z; + door_point.r = room_label_1; + door_point.g = room_label_2; + door_point.b = 0; + door_point.label = door_id; + door_cloud_->points.push_back(door_point); + + room_mask_vis_cropped.at(point.y, point.x) = cv::Vec3b(0, 0, 255); + } + } + + // Update room connectivity + if (current_room_label > 0) + { + for (auto &id_room_node_pair : room_nodes_map_) + { + int room_id = id_room_node_pair.first; + auto &room_node = id_room_node_pair.second; + bool is_connected = isRoomConnected(room_id - 1, current_room_label - 1, adjacency_matrix); + room_node.is_connected_ = is_connected; + } + } + + cv::Mat output_image = room_mask_vis_cropped.clone(); + + // Draw robot position + cv::circle( + output_image, + cv::Point(robot_idx[1], robot_idx[0]), + 3, + cv::Scalar(0, 0, 0), + -1); + + // Publish room mask + sensor_msgs::msg::Image markers_msg; + markers_msg.header.stamp = this->get_clock()->now(); + markers_msg.header.frame_id = "map"; + markers_msg.height = room_mask_.rows; + markers_msg.width = room_mask_.cols; + markers_msg.encoding = "32SC1"; + markers_msg.is_bigendian = false; + markers_msg.step = room_mask_.cols * sizeof(int); + markers_msg.data.resize(room_mask_.total() * room_mask_.elemSize()); + std::memcpy(markers_msg.data.data(), room_mask_.data, markers_msg.data.size()); + pub_room_mask_->publish(markers_msg); + + saveImageToFile(output_image, "room_segmentation_visualization.png"); + + // Flip and transpose for visualization + cv::Mat flipped_image; + cv::transpose(output_image, flipped_image); + cv::flip(flipped_image, flipped_image, 0); + output_image = flipped_image; + + // Convert to ROS image message + sensor_msgs::msg::Image output_image_msg; + output_image_msg.header.stamp = this->get_clock()->now(); + output_image_msg.header.frame_id = "map"; + output_image_msg.height = output_image.rows; + output_image_msg.width = output_image.cols; + output_image_msg.encoding = "bgr8"; + output_image_msg.is_bigendian = false; + output_image_msg.step = output_image.cols * output_image.elemSize(); + output_image_msg.data.resize(output_image.total() * output_image.elemSize()); + std::memcpy(output_image_msg.data.data(), output_image.data, output_image_msg.data.size()); + pub_room_mask_vis_->publish(output_image_msg); + + t_1 = std::chrono::high_resolution_clock::now(); + // 
std::cout << "[Time] Post Processing: " << std::chrono::duration(t_1 - t_0).count() << " s" << std::endl; + + auto t_all_1 = std::chrono::high_resolution_clock::now(); + // std::cout << "[Time] Room segmentation: " << std::chrono::duration(t_all_1 - t_all_0).count() << " s" << std::endl; +} + + +// ==================== Publishing Functions ==================== +void RoomSegmentationNode::publishRoomNodes() { + semantic_mapping::msg::RoomNodeList room_node_list_msg; + room_node_list_msg.header.stamp = this->now(); + room_node_list_msg.header.frame_id = "map"; + room_node_list_msg.nodes.reserve(room_nodes_map_.size()); + + for (const auto &id_room_node_pair : room_nodes_map_) { + const representation_ns::RoomNodeRep &room_node = id_room_node_pair.second; + semantic_mapping::msg::RoomNode room_node_msg; + room_node_msg.id = room_node.GetId(); + room_node_msg.show_id = room_node.show_id_; + room_node_msg.polygon = room_node.GetPolygon(); + room_node_msg.centroid.x = room_node.centroid_.x(); + room_node_msg.centroid.y = room_node.centroid_.y(); + room_node_msg.centroid.z = room_node.centroid_.z(); + + for (const auto &neighbor : room_node.neighbors_) { + room_node_msg.neighbors.push_back(neighbor); + } + room_node_msg.is_connected = room_node.is_connected_; + room_node_msg.area = room_node.area_; + room_node_msg.room_mask = *cv_bridge::CvImage(std_msgs::msg::Header(), "mono8", room_node.room_mask_).toImageMsg(); + + room_node_list_msg.nodes.push_back(room_node_msg); + } + pub_room_node_list_->publish(room_node_list_msg); +} + +void RoomSegmentationNode::publishRoomPolygon() { + visualization_msgs::msg::MarkerArray marker_array; + visualization_msgs::msg::Marker delete_marker; + delete_marker.ns = "polygon_group"; + delete_marker.id = 0; + delete_marker.action = visualization_msgs::msg::Marker::DELETEALL; + marker_array.markers.push_back(delete_marker); + + for (auto &id_room_node_pair : room_nodes_map_) { + const auto &room_node = id_room_node_pair.second; + const auto &poly = room_node.GetPolygon(); + + visualization_msgs::msg::Marker marker; + marker.header = poly.header; + marker.ns = "polygon_group"; + marker.id = static_cast(room_node.GetId()); + marker.type = visualization_msgs::msg::Marker::LINE_STRIP; + marker.action = visualization_msgs::msg::Marker::ADD; + + Eigen::Vector3d color_ = misc_utils_ns::idToColor(room_node.GetId()); + marker.scale.x = 0.13; + marker.color.b = color_[0] / 255.0; + marker.color.g = color_[1] / 255.0; + marker.color.r = color_[2] / 255.0; + marker.color.a = 1.0; + + for (const auto &pt : poly.polygon.points) { + geometry_msgs::msg::Point p; + p.x = pt.x; + p.y = pt.y; + p.z = robot_position_.z; + marker.points.push_back(p); + } + + if (!marker.points.empty()) { + marker.points.push_back(marker.points.front()); + } + marker_array.markers.push_back(marker); + } + pub_polygon_->publish(marker_array); +} + +void RoomSegmentationNode::publishDoorCloud() { + sensor_msgs::msg::PointCloud2 door_msg; + pcl::toROSMsg(*door_cloud_, door_msg); + door_msg.header.stamp = this->now(); + door_msg.header.frame_id = "map"; + pub_door_cloud_->publish(door_msg); +} + +} // namespace room_segmentation + +// ==================== Main Function ==================== +int main(int argc, char **argv) { + rclcpp::init(argc, argv); + + auto node = std::make_shared(); + + RCLCPP_INFO(node->get_logger(), "Room Segmentation Node is running..."); + + rclcpp::spin(node); + + rclcpp::shutdown(); + return 0; +} \ No newline at end of file diff --git a/sensor_coverage/src/utils/misc_utils.cpp 
b/sensor_coverage/src/utils/misc_utils.cpp new file mode 100644 index 0000000..bb2a79b --- /dev/null +++ b/sensor_coverage/src/utils/misc_utils.cpp @@ -0,0 +1,1308 @@ +// +// Created by caochao on 6/5/19. +// + +#include "sensor_coverage/utils/misc_utils.h" +#include +#include + +namespace misc_utils_ns +{ +/// Function for converting a PointType to a geometry_msgs::msg::Point +/// \param pnt A PointType +/// \return A geometry_msgs::msg::Point +geometry_msgs::msg::Point PCL2GeoMsgPnt(const PCLPointType& pnt) +{ + return GeoMsgPoint(pnt.x, pnt.y, pnt.z); +} + +/// Function for converting a geometry_msgs::msg::Point to a PointType +/// \param pnt A geometry_msgs::msg::Point +/// \return A PointType +PCLPointType GeoMsgPnt2PCL(const geometry_msgs::msg::Point& pnt) +{ + PCLPointType point_o; + point_o.x = (float)pnt.x; + point_o.y = (float)pnt.y; + point_o.z = (float)pnt.z; + return point_o; +} + +geometry_msgs::msg::Point GeoMsgPoint(double x, double y, double z) +{ + geometry_msgs::msg::Point p; + p.x = x; + p.y = y; + p.z = z; + return p; +} + +PCLPointType PCLPoint(float x, float y, float z) +{ + PCLPointType p; + p.x = x; + p.y = y; + p.z = z; + return p; +} + +void LeftRotatePoint(PCLPointType& pnt) +{ + float tmp_z = pnt.z; + pnt.z = pnt.y; + pnt.y = pnt.x; + pnt.x = tmp_z; +} + +void RightRotatePoint(PCLPointType& pnt) +{ + float tmp_x = pnt.x; + pnt.x = pnt.y; + pnt.y = pnt.z; + pnt.z = tmp_x; +} + +void LeftRotatePoint(geometry_msgs::msg::Point& pnt) +{ + double tmp_z = pnt.z; + pnt.z = pnt.y; + pnt.y = pnt.x; + pnt.x = tmp_z; +} + +void RightRotatePoint(geometry_msgs::msg::Point& pnt) +{ + double tmp_x = pnt.x; + pnt.x = pnt.y; + pnt.y = pnt.z; + pnt.z = tmp_x; +} + +template +void KeyposeToMap(CloudType& cloud, const nav_msgs::msg::Odometry::ConstPtr& keypose) +{ + float tx = (float)keypose->pose.pose.position.x; + float ty = (float)keypose->pose.pose.position.y; + float tz = (float)keypose->pose.pose.position.z; + + tf2::Quaternion tf_q(keypose->pose.pose.orientation.x, keypose->pose.pose.orientation.y, + keypose->pose.pose.orientation.z, keypose->pose.pose.orientation.w); + tf2::Matrix3x3 tf_m(tf_q); + double roll, pitch, yaw; + tf_m.getRPY(roll, pitch, yaw); + + float sin_roll = (float)sin(roll); + float cos_roll = (float)cos(roll); + float sin_pitch = (float)sin(pitch); + float cos_pitch = (float)cos(pitch); + float sin_yaw = (float)sin(yaw); + float cos_yaw = (float)cos(yaw); + + for (auto& point : cloud->points) + { + // To map_rot frame + float x1 = point.x; + float y1 = point.y; + float z1 = point.z; + + float x2 = x1; + float y2 = y1 * cos_roll - z1 * sin_roll; + float z2 = y1 * sin_roll + z1 * cos_roll; + + float x3 = x2 * cos_pitch + z2 * sin_pitch; + float y3 = y2; + float z3 = -x2 * sin_pitch + z2 * cos_pitch; + + float x4 = x3 * cos_yaw - y3 * sin_yaw; + float y4 = x3 * sin_yaw + y3 * cos_yaw; + float z4 = z3; + + float x5 = x4 + tx; + float y5 = y4 + ty; + float z5 = z4 + tz; + + // To map frame + point.x = z5; + point.y = x5; + point.z = y5; + } +} + +/// Function to compute the distance between two geometry_msgs::msg::Point +/// \param pnt1 The first point +/// \param pnt2 The second point +/// \return Distance between the two points +double PointXYDist(const geometry_msgs::msg::Point& pnt1, const geometry_msgs::msg::Point& pnt2) +{ + return sqrt(pow((pnt1.x - pnt2.x), 2) + pow((pnt1.y - pnt2.y), 2)); +} + +/// Function to compute the distance between two PointType +/// \param pnt1 The first point +/// \param pnt2 The second point +/// \return Distance between the 
two points +double PointXYDist(const PCLPointType& pnt1, const PCLPointType& pnt2) +{ + return sqrt(pow((pnt1.x - pnt2.x), 2) + pow((pnt1.y - pnt2.y), 2)); +} + +/// Function to compute the angle between two vectors in the xy-plane, the z component is omitted +/// \param v1 The first (starting) vector +/// \param v2 The second (end) vector +/// \return Angle from v1 to v2, positive if counterclockwise, negative if clockwise. The output is within [-pi, pi] +double VectorXYAngle(const Eigen::Vector3d& v1, const Eigen::Vector3d& v2) +{ + return atan2(v1.x() * v2.y() - v1.y() * v2.x(), v1.x() * v2.x() + v1.y() * v2.y()); +} + +/// Function to compute the direction (angle) of a geometry_msgs::msg::Point +/// \param pnt Input point +/// \param robot_pos Robot position +/// \return Direction (angle) +double PointAngle(const geometry_msgs::msg::Point& pnt, const geometry_msgs::msg::Point& robot_pos) +{ + return atan2((pnt.y - robot_pos.y), (pnt.x - robot_pos.x)); +} + +/// Function to compute the direction (angle) of a PointType +/// \param pnt Intput point +/// \param robot_pos Robot position +/// \return Direction (angle) +double PointAngle(const PCLPointType& pnt, const geometry_msgs::msg::Point& robot_pos) +{ + return atan2((pnt.y - robot_pos.y), (pnt.x - robot_pos.x)); +} + +/// Function to check if three points are collinear in the xy-plane +/// \param p1 First point +/// \param p2 Second point +/// \param p3 Third point +/// \return Collinear +bool CollinearXY(const geometry_msgs::msg::Point& p1, const geometry_msgs::msg::Point& p2, + const geometry_msgs::msg::Point& p3, double threshold) +{ + // https://math.stackexchange.com/questions/405966/if-i-have-three-points-is-there-an-easy-way-to-tell-if-they-are-collinear + double val = (p2.y - p1.y) * (p3.x - p2.x) - (p3.y - p2.y) * (p2.x - p1.x); + if (std::abs(val) < threshold) + { + return true; + } + else + { + return false; + } +} + +/// Function to calculate the overlap of two angle intervals [s1, e1] and [s2, e2]. s1(2)->e1(2) counter-clockwise and +/// all angles shoud be in [-pi, pi] \param s1 starting angle of the first interval \param e1 end angle of the second +/// interval \param s2 starting angle of the first interval \param e2 end angle of the second interval \return overlap +/// the overlap of the two intervals. > 0 if overlapped, < 0 otherwise. 
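+/// Example: AngleOverlap(-M_PI / 4, M_PI / 4, 0.0, M_PI / 2) returns M_PI / 4, the 45-degree overlap of the two arcs.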
+double AngleOverlap(double s1, double e1, double s2, double e2) +{ + double overlap = 0.0; + // TODO: normalize angles to [-pi, pi] + // The first interval crosses the evil branch + if (e1 < s1) + { + // Recursively compute the overlaps + double sub_overlap1 = AngleOverlap(s1, M_PI, s2, e2); + double sub_overlap2 = AngleOverlap(-M_PI, e1, s2, e2); + // If both sub-overlaps are negative (no overlap) or there is only one positive sub-overlap + if ((sub_overlap1 < 0 && sub_overlap2 < 0) || sub_overlap1 * sub_overlap2 < 0) + { + overlap = std::max(sub_overlap1, sub_overlap2); + } + else + { + overlap = sub_overlap1 + sub_overlap2; + } + } + else if (e2 < s2) + { + // Similar to the case above + double sub_overlap1 = AngleOverlap(s1, e1, s2, M_PI); + double sub_overlap2 = AngleOverlap(s1, e1, -M_PI, e2); + if ((sub_overlap1 < 0 && sub_overlap2 < 0) || sub_overlap1 * sub_overlap2 < 0) + { + overlap = std::max(sub_overlap1, sub_overlap2); + } + else + { + overlap = sub_overlap1 + sub_overlap2; + } + } + else + { + if (e1 > e2) + { + if (s1 > e2) + { + // No overlap + overlap = e2 - s1; + } + else if (s1 > s2) + { + overlap = e2 - s1; + } + else + { + overlap = e2 - s2; + } + } + else + { + if (s2 > e1) + { + // No overlap + overlap = e1 - s2; + } + else if (s2 > s1) + { + overlap = e1 - s2; + } + else + { + overlap = e1 - s1; + } + } + } + return overlap; +} + +double AngleDiff(double source_angle, double target_angle) +{ + double angle_diff = target_angle - source_angle; + if (angle_diff > M_PI) + { + angle_diff -= 2 * M_PI; + } + if (angle_diff < -M_PI) + { + angle_diff += 2 * M_PI; + } + return angle_diff; +} + +/// Function to determine if a point is on a line segment. +/// Given three colinear points p, q, r, the function checks if point q lies on line segment 'pr' +/// \param p End point of line segment pr +/// \param q Point to be examined +/// \param r End point of line segment pr +/// \return If q is on pr +bool PointOnLineSeg(const geometry_msgs::msg::Point& p, const geometry_msgs::msg::Point& q, + const geometry_msgs::msg::Point& r) +{ + if (q.x <= std::max(p.x, r.x) && q.x >= std::min(p.x, r.x) && q.y <= std::max(p.y, r.y) && q.y >= std::min(p.y, r.y)) + { + return true; + } + else + { + return false; + } +} + +/// Function to find orientation of ordered triplet (p, q, r). +/// \param p The first point +/// \param q The second point +/// \param r The third point +/// \return 0 --> p, q and r are colinear, 1 --> Clockwise, 2 --> Counterclockwise +int ThreePointOrientation(const geometry_msgs::msg::Point& p, const geometry_msgs::msg::Point& q, + const geometry_msgs::msg::Point& r) +{ + // See https://www.geeksforgeeks.org/orientation-3-ordered-points/ + // for details of below formula. + double val = (q.y - p.y) * (r.x - q.x) - (q.x - p.x) * (r.y - q.y); + + if (fabs(val) < 0.1) + return 0; // colinear + + return (val > 0) ? 
1 : 2; // clock or counterclock wise +} + +/// Function to check if two line segments intersect, returns +/// \param p1 end point of 'p1q1' +/// \param q1 end point of 'p1q1' +/// \param p2 end point of 'p2q2' +/// \param q2 end point of 'p2q2' +/// \return true if line segment 'p1q1' and 'p2q2' intersect, false otherwise +bool LineSegIntersect(const geometry_msgs::msg::Point& p1, const geometry_msgs::msg::Point& q1, + const geometry_msgs::msg::Point& p2, const geometry_msgs::msg::Point& q2) +{ + // Find the four orientations needed for general and + // special cases + int o1 = ThreePointOrientation(p1, q1, p2); + int o2 = ThreePointOrientation(p1, q1, q2); + int o3 = ThreePointOrientation(p2, q2, p1); + int o4 = ThreePointOrientation(p2, q2, q1); + + // General case + if (o1 != o2 && o3 != o4) + return true; + + // Special Cases + // p1, q1 and p2 are colinear and p2 lies on segment p1q1 + if (o1 == 0 && PointOnLineSeg(p1, p2, q1)) + return true; + + // p1, q1 and q2 are colinear and q2 lies on segment p1q1 + if (o2 == 0 && PointOnLineSeg(p1, q2, q1)) + return true; + + // p2, q2 and p1 are colinear and p1 lies on segment p2q2 + if (o3 == 0 && PointOnLineSeg(p2, p1, q2)) + return true; + + // p2, q2 and q1 are colinear and q1 lies on segment p2q2 + if (o4 == 0 && PointOnLineSeg(p2, q1, q2)) + return true; + + return false; // Doesn't fall in any of the above cases +} + +/// Similar to LineSegIntersect(), except adds a "tolerance" +/// such that each line segment is extended at both ends by this distance +/// i.e. this will return true more often than LineSegIntersect() +/// \param p1 end point of 'p1q1' +/// \param q1 end point of 'p1q1' +/// \param p2 end point of 'p2q2' +/// \param q2 end point of 'p2q2' +/// \param tolerance distance to be added at both ends of both lines +/// \return true if line segment 'p1q1' and 'p2q2' intersect, false otherwise +bool LineSegIntersectWithTolerance(const geometry_msgs::msg::Point& p1, const geometry_msgs::msg::Point& q1, + const geometry_msgs::msg::Point& p2, const geometry_msgs::msg::Point& q2, + const double tolerance) +{ + // Make a copy + geometry_msgs::msg::Point p1_extend = p1; + geometry_msgs::msg::Point q1_extend = q1; + geometry_msgs::msg::Point p2_extend = p2; + geometry_msgs::msg::Point q2_extend = q2; + + // Extend line segment 1 + double dist1 = PointXYDist(p1, q1); + if (dist1 == 0) + return false; // trivial + double dir1_x = (q1.x - p1.x) / dist1; + double dir1_y = (q1.y - p1.y) / dist1; + p1_extend.x -= tolerance * dir1_x; + p1_extend.y -= tolerance * dir1_y; + q1_extend.x += tolerance * dir1_x; + q1_extend.y += tolerance * dir1_y; + + // Extend line segment 2 + double dist2 = PointXYDist(p2, q2); + if (dist2 == 0) + return false; // trivial + double dir2_x = (q2.x - p2.x) / dist2; + double dir2_y = (q2.y - p2.y) / dist2; + p2_extend.x -= tolerance * dir2_x; + p2_extend.y -= tolerance * dir2_y; + q2_extend.x += tolerance * dir2_x; + q2_extend.y += tolerance * dir2_y; + + // Call the standard function with these extended line segments + return LineSegIntersect(p1_extend, q1_extend, p2_extend, q2_extend); +} + +/// Function to check if a point is inside a polygon +/// \param p point +/// \param polygon polygon +/// \return true if the point is inside the polygon +bool PointInPolygon(const geometry_msgs::msg::Point& point, const geometry_msgs::msg::Polygon& polygon) +{ + int polygon_pnt_num = polygon.points.size(); + if (polygon_pnt_num < 3) + return false; + + geometry_msgs::msg::Point inf_point; + inf_point.x = 
std::numeric_limits::max(); + inf_point.y = point.y; + int count = 0; + int cur_idx = 0; + do + { + int next_idx = (cur_idx + 1) % polygon_pnt_num; + // Check if the line segment from 'point' to 'inf_point' intersects + // with the line segment from 'polygon[cur_idx]' to 'polygon[next_idx]' + geometry_msgs::msg::Point polygon_cur_pnt = + GeoMsgPoint(polygon.points[cur_idx].x, polygon.points[cur_idx].y, polygon.points[cur_idx].z); + geometry_msgs::msg::Point polygon_next_pnt = + GeoMsgPoint(polygon.points[next_idx].x, polygon.points[next_idx].y, polygon.points[next_idx].z); + if (LineSegIntersect(polygon_cur_pnt, polygon_next_pnt, point, inf_point)) + { + // If the point 'point' is colinear with line segment 'cur-next', + // then check if it lies on segment. If it lies, return true, + // otherwise false + if (ThreePointOrientation(polygon_cur_pnt, point, polygon_next_pnt) == 0) + { + return PointOnLineSeg(polygon_cur_pnt, point, polygon_next_pnt); + } + count++; + } + cur_idx = next_idx; + } while (cur_idx != 0); + + // return true if count is odd, false othterwise + return count % 2 == 1; +} + +/// Function to get distance from a point to a line segment +/// assumes z components is 0! +/// \param p point +/// \param line_segment_start +/// \param line_segment_end +/// \return distance +double LineSegDistance2D(const geometry_msgs::msg::Point& point, const geometry_msgs::msg::Point& line_segment_start, + const geometry_msgs::msg::Point& line_segment_end) +{ + // code adapted from http://geomalgorithms.com/a02-_lines.html + + // vector end to start + double v_x = line_segment_end.x - line_segment_start.x; + double v_y = line_segment_end.y - line_segment_start.y; + + // vector start to point + double w_x = point.x - line_segment_start.x; + double w_y = point.y - line_segment_start.y; + + // if outside one boundary, get point to point distance + double c1 = v_x * w_x + v_y * w_y; // dot product + + if (c1 <= 0) + return PointXYDist(point, line_segment_start); + + // if outside other boundary, get point to point distance + double c2 = v_x * v_x + v_y * v_y; // dot product + if (c2 <= c1) + return PointXYDist(point, line_segment_end); + + // otherwise project point and get distance (seems inefficient?) 
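+  // Projection parameter b = (w . v) / (v . v) lies in (0, 1) here; the closest point on the segment is start + b * v.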
+ double b = c1 / c2; + geometry_msgs::msg::Point point_projected; + point_projected.x = line_segment_start.x + b * v_x; + point_projected.y = line_segment_start.y + b * v_y; + return PointXYDist(point, point_projected); +} + +/// Function to get distance from a point to a line segment +/// \param p point +/// \param line_segment_start +/// \param line_segment_end +/// \return distance +double LineSegDistance3D(const geometry_msgs::msg::Point& point, const geometry_msgs::msg::Point& line_segment_start, + const geometry_msgs::msg::Point& line_segment_end) +{ + // code adapted from http://geomalgorithms.com/a02-_lines.html + + // vector end to start + double v_x = line_segment_end.x - line_segment_start.x; + double v_y = line_segment_end.y - line_segment_start.y; + double v_z = line_segment_end.z - line_segment_start.z; + + // vector start to point + double w_x = point.x - line_segment_start.x; + double w_y = point.y - line_segment_start.y; + double w_z = point.z - line_segment_start.z; + + // if outside one boundary, get point to point distance + double c1 = v_x * w_x + v_y * w_y + v_z * w_z; // dot product + + if (c1 <= 0) + return PointXYZDist(point, line_segment_start); + + // if outside other boundary, get point to point distance + double c2 = v_x * v_x + v_y * v_y + v_z * v_z; // dot product + if (c2 <= c1) + return PointXYZDist(point, line_segment_end); + + // otherwise project point and get distance (seems inefficient?) + double b = c1 / c2; + geometry_msgs::msg::Point point_projected; + point_projected.x = line_segment_start.x + b * v_x; + point_projected.y = line_segment_start.y + b * v_y; + point_projected.z = line_segment_start.z + b * v_z; + return PointXYZDist(point, point_projected); +} + +/// Function to get distance from a point to the closest point on boundary of a polygon +/// \param p point +/// \param polygon polygon +/// \return distance +double DistancePoint2DToPolygon(const geometry_msgs::msg::Point& point, const geometry_msgs::msg::Polygon& polygon) +{ + int polygon_pnt_num = polygon.points.size(); + if (polygon_pnt_num < 1) + return 0; + if (polygon_pnt_num == 1) + { + geometry_msgs::msg::Point poly_point = GeoMsgPoint(polygon.points[0].x, polygon.points[0].y, 0); + return PointXYDist(point, poly_point); + } + + double distance_return = INFINITY; + int cur_idx = 0; + + // iterate through points in polygon + do + { + int next_idx = (cur_idx + 1) % polygon_pnt_num; + + // get point to line segment distance + geometry_msgs::msg::Point polygon_cur_pnt = GeoMsgPoint(polygon.points[cur_idx].x, polygon.points[cur_idx].y, 0); + geometry_msgs::msg::Point polygon_next_pnt = GeoMsgPoint(polygon.points[next_idx].x, polygon.points[next_idx].y, 0); + double distance = LineSegDistance2D(point, polygon_cur_pnt, polygon_next_pnt); + if (distance < distance_return) + { + distance_return = distance; + } + + cur_idx = next_idx; + } while (cur_idx != 0); + + return distance_return; +} + +void LinInterpPoints(const Eigen::Vector3d& p1, const Eigen::Vector3d& p2, double resolution, + std::vector& interp_points) +{ + interp_points.clear(); + double point_dist = (p1 - p2).norm(); + if (point_dist < resolution) + { + interp_points.push_back(p1); + interp_points.push_back(p2); + return; + } + int point_num = ceil(point_dist / resolution); + for (int i = 0; i < point_num; i++) + { + Eigen::Vector3d interp_point = (p2 - p1) / point_num * i + p1; + interp_points.push_back(interp_point); + } +} + +double DegreeToRadian(double degree) +{ + return degree / 180.0 * M_PI; +} +double 
RadianToDegree(double radian) +{ + return radian * 180.0 / M_PI; +} + +void ConcatenatePath(nav_msgs::msg::Path& path1, const nav_msgs::msg::Path& path2, int from_ind, int to_ind) +{ + if (path2.poses.empty()) + { + return; + } + if (from_ind == -1 && to_ind == -1) + { + from_ind = 0; + to_ind = path2.poses.size() - 1; + } + MY_ASSERT(from_ind >= 0 && from_ind < path2.poses.size()); + MY_ASSERT(to_ind >= 0 && to_ind < path2.poses.size()); + if (from_ind <= to_ind) + { + for (int i = from_ind; i <= to_ind; i++) + { + path1.poses.push_back(path2.poses[i]); + } + } + else + { + for (int i = from_ind; i >= to_ind; i--) + { + path1.poses.push_back(path2.poses[i]); + } + } +} + +void SetDifference(std::vector& v1, std::vector& v2, std::vector& diff) +{ + std::sort(v1.begin(), v1.end()); + std::sort(v2.begin(), v2.end()); + diff.clear(); + std::set_difference(v1.begin(), v1.end(), v2.begin(), v2.end(), std::inserter(diff, diff.begin())); +} + +void SetIntersection(std::vector &v1, std::vector &v2, std::vector &intersection) +{ + std::sort(v1.begin(), v1.end()); + std::sort(v2.begin(), v2.end()); + intersection.clear(); + std::set_intersection( + v1.begin(), v1.end(), + v2.begin(), v2.end(), + std::inserter(intersection, intersection.begin())); +} + +int Marker::id_ = 0; + +int signum(int x) +{ + return x == 0 ? 0 : x < 0 ? -1 : 1; +} + +double mod(double value, double modulus) +{ + return fmod(fmod(value, modulus) + modulus, modulus); +} + +double intbound(double s, double ds) +{ + // Find the smallest positive t such that s+t*ds is an integer. + if (ds < 0) + { + return intbound(-s, -ds); + } + else + { + s = mod(s, 1); + // problem is now s+t*ds = 1 + return (1 - s) / ds; + } +} + +bool InRange(const Eigen::Vector3i& sub, const Eigen::Vector3i& max_sub, const Eigen::Vector3i& min_sub) +{ + return sub.x() >= min_sub.x() && sub.x() <= max_sub.x() && sub.y() >= min_sub.y() && sub.y() <= max_sub.y() && + sub.z() >= min_sub.z() && sub.z() <= max_sub.z(); +} +void RayCast(const Eigen::Vector3i& start_sub, const Eigen::Vector3i& end_sub, const Eigen::Vector3i& max_sub, + const Eigen::Vector3i& min_sub, std::vector& output) +{ + output.clear(); + MY_ASSERT(InRange(start_sub, max_sub, min_sub)); + MY_ASSERT(InRange(end_sub, max_sub, min_sub)); + + if (start_sub == end_sub) + { + output.push_back(start_sub); + return; + } + Eigen::Vector3i diff_sub = end_sub - start_sub; + double max_dist = diff_sub.squaredNorm(); + int step_x = signum(diff_sub.x()); + int step_y = signum(diff_sub.y()); + int step_z = signum(diff_sub.z()); + double t_max_x = step_x == 0 ? DBL_MAX : intbound(start_sub.x(), diff_sub.x()); + double t_max_y = step_y == 0 ? DBL_MAX : intbound(start_sub.y(), diff_sub.y()); + double t_max_z = step_z == 0 ? DBL_MAX : intbound(start_sub.z(), diff_sub.z()); + double t_delta_x = step_x == 0 ? DBL_MAX : (double)step_x / (double)diff_sub.x(); + double t_delta_y = step_y == 0 ? DBL_MAX : (double)step_y / (double)diff_sub.y(); + double t_delta_z = step_z == 0 ? 
DBL_MAX : (double)step_z / (double)diff_sub.z(); + double dist = 0; + Eigen::Vector3i cur_sub = start_sub; + + while (true) + { + MY_ASSERT(InRange(cur_sub, max_sub, min_sub)); + output.push_back(cur_sub); + dist = (cur_sub - start_sub).squaredNorm(); + if (cur_sub == end_sub || dist > max_dist) + { + return; + } + if (t_max_x < t_max_y) + { + if (t_max_x < t_max_z) + { + cur_sub.x() += step_x; + t_max_x += t_delta_x; + } + else + { + cur_sub.z() += step_z; + t_max_z += t_delta_z; + } + } + else + { + if (t_max_y < t_max_z) + { + cur_sub.y() += step_y; + t_max_y += t_delta_y; + } + else + { + cur_sub.z() += step_z; + t_max_z += t_delta_z; + } + } + } +} + +bool InFOV(const Eigen::Vector3d& point_position, const Eigen::Vector3d& viewpoint_position, double vertical_half_fov, + double range) +{ + Eigen::Vector3d diff = point_position - viewpoint_position; + double dist = diff.norm(); + if (dist > range) + { + return false; + } + double theta = acos(diff.z() / dist); + double vertical_fov_max = M_PI / 2 + vertical_half_fov; + double vertical_fov_min = M_PI / 2 - vertical_half_fov; + return theta >= vertical_fov_min && theta <= vertical_fov_max; +} + +bool InFOVSimple(const Eigen::Vector3d& point_position, const Eigen::Vector3d& viewpoint_position, + double vertical_fov_ratio, double range, double xy_dist_threshold, double z_diff_threshold, bool print) +{ + Eigen::Vector3d diff = point_position - viewpoint_position; + double xy_dist = sqrt(diff.x() * diff.x() + diff.y() * diff.y()); + if (xy_dist < xy_dist_threshold && std::abs(diff.z()) < z_diff_threshold) + { + if (print) + { + std::cout << "nearby point approximation" << std::endl; + } + return true; + } + if (xy_dist > range) + { + if (print) + { + std::cout << "xy_dist: " << xy_dist << " > range: " << range << std::endl; + } + return false; + } + if (std::abs(diff.z()) > vertical_fov_ratio * xy_dist) + { + if (print) + { + std::cout << "diff z: " << std::abs(diff.z()) << " > threshold : " << vertical_fov_ratio << " * " << xy_dist + << " = " << vertical_fov_ratio * xy_dist << std::endl; + } + return false; + } + return true; +} + +float ApproxAtan(float z) +{ + const float n1 = 0.97239411f; + const float n2 = -0.19194795f; + return (n1 + n2 * z * z) * z; +} + +float ApproxAtan2(float y, float x) +{ + float ay = std::abs(y), ax = std::abs(x); + int invert = ay > ax; + float z = invert ? 
ax / ay : ay / ax; // [0,1] + float th = ApproxAtan(z); // [0,π/4] + if (invert) + th = M_PI_2 - th; // [0,π/2] + if (x < 0) + th = M_PI - th; // [0,π] + th = copysign(th, y); // [-π,π] + return th; +} + +double GetPathLength(const nav_msgs::msg::Path& path) +{ + double path_length = 0.0; + if (path.poses.size() < 2) + { + return path_length; + } + for (int i = 0; i < path.poses.size() - 1; i++) + { + int cur_pose_idx = i; + int next_pose_idx = i + 1; + path_length += misc_utils_ns::PointXYZDist( + path.poses[cur_pose_idx].pose.position, path.poses[next_pose_idx].pose.position); + } + return path_length; +} + +double GetPathLength(const std::vector& path) +{ + double path_length = 0.0; + if (path.size() < 2) + { + return path_length; + } + for (int i = 0; i < path.size() - 1; i++) + { + int cur_idx = i; + int next_idx = i + 1; + path_length += (path[cur_idx] - path[next_idx]).norm(); + } + return path_length; +} + +double AStarSearch(const std::vector>& graph, const std::vector>& node_dist, + const std::vector& node_positions, int from_idx, int to_idx, + bool get_path, std::vector& path_indices) +{ + MY_ASSERT(graph.size() == node_dist.size()); + MY_ASSERT(graph.size() == node_positions.size()); + MY_ASSERT(misc_utils_ns::InRange>(graph, from_idx)); + MY_ASSERT(misc_utils_ns::InRange>(graph, to_idx)); + double INF = 9999.0; + typedef std::pair iPair; + double shortest_dist = 0; + std::priority_queue, std::greater> pq; + std::vector g(graph.size(), INF); + std::vector f(graph.size(), INF); + std::vector prev(graph.size(), -1); + std::vector in_pg(graph.size(), false); + + g[from_idx] = 0; + f[from_idx] = misc_utils_ns::PointXYZDist( + node_positions[from_idx], node_positions[to_idx]); + pq.push(std::make_pair(f[from_idx], from_idx)); + in_pg[from_idx] = true; + + while (!pq.empty()) + { + int u = pq.top().second; + pq.pop(); + in_pg[u] = false; + if (u == to_idx) + { + shortest_dist = g[u]; + break; + } + + for (int i = 0; i < graph[u].size(); i++) + { + int v = graph[u][i]; + MY_ASSERT(misc_utils_ns::InRange>(graph, v)); + double d = node_dist[u][i]; + if (g[v] > g[u] + d) + { + prev[v] = u; + g[v] = g[u] + d; + f[v] = g[v] + misc_utils_ns::PointXYZDist( + node_positions[v], node_positions[to_idx]); + if (!in_pg[v]) + { + pq.push(std::make_pair(f[v], v)); + in_pg[v] = true; + } + } + } + } + + if (get_path) + { + path_indices.clear(); + int u = to_idx; + if (prev[u] != -1 || u == from_idx) + { + while (u != -1) + { + path_indices.push_back(u); + u = prev[u]; + } + } + } + std::reverse(path_indices.begin(), path_indices.end()); + + return shortest_dist; +} + +bool AStarSearchWithMaxPathLength(const std::vector>& graph, + const std::vector>& node_dist, + const std::vector& node_positions, int from_idx, + int to_idx, bool get_path, std::vector& path_indices, double& shortest_dist, + double max_path_length) +{ + MY_ASSERT(graph.size() == node_dist.size()); + MY_ASSERT(graph.size() == node_positions.size()); + MY_ASSERT(misc_utils_ns::InRange>(graph, from_idx)); + MY_ASSERT(misc_utils_ns::InRange>(graph, to_idx)); + double INF = 9999.0; + typedef std::pair iPair; + std::priority_queue, std::greater> pq; + std::vector g(graph.size(), INF); + std::vector f(graph.size(), INF); + std::vector prev(graph.size(), -1); + std::vector in_pg(graph.size(), false); + + g[from_idx] = 0; + f[from_idx] = misc_utils_ns::PointXYZDist( + node_positions[from_idx], node_positions[to_idx]); + pq.push(std::make_pair(f[from_idx], from_idx)); + in_pg[from_idx] = true; + + bool found_path = false; + + while 
(!pq.empty()) + { + int u = pq.top().second; + pq.pop(); + in_pg[u] = false; + if (u == to_idx) + { + shortest_dist = g[u]; + found_path = true; + break; + } + if (g[u] > max_path_length) + { + shortest_dist = g[u]; + found_path = false; + break; + } + + for (int i = 0; i < graph[u].size(); i++) + { + int v = graph[u][i]; + MY_ASSERT(misc_utils_ns::InRange>(graph, v)); + double d = node_dist[u][i]; + if (g[v] > g[u] + d) + { + prev[v] = u; + g[v] = g[u] + d; + f[v] = g[v] + misc_utils_ns::PointXYZDist( + node_positions[v], node_positions[to_idx]); + if (!in_pg[v]) + { + pq.push(std::make_pair(f[v], v)); + in_pg[v] = true; + } + } + } + } + + if (get_path && found_path) + { + path_indices.clear(); + int u = to_idx; + if (prev[u] != -1 || u == from_idx) + { + while (u != -1) + { + path_indices.push_back(u); + u = prev[u]; + } + } + } + std::reverse(path_indices.begin(), path_indices.end()); + + return found_path; +} + +nav_msgs::msg::Path SimplifyPath(const nav_msgs::msg::Path& path) +{ + nav_msgs::msg::Path simplified_path; + if (path.poses.size() <= 2) + { + simplified_path = path; + return simplified_path; + } + + simplified_path.poses.push_back(path.poses.front()); + for (int i = 1; i < path.poses.size() - 1; i++) + { + geometry_msgs::msg::Point prev_point = path.poses[i - 1].pose.position; + geometry_msgs::msg::Point cur_point = path.poses[i].pose.position; + geometry_msgs::msg::Point next_point = path.poses[i + 1].pose.position; + if (!CollinearXY(prev_point, cur_point, next_point)) // The three points are not colinear in the xy plane + { + simplified_path.poses.push_back(path.poses[i]); + } + } + + simplified_path.poses.push_back(path.poses.back()); + + return simplified_path; +} + +nav_msgs::msg::Path DeduplicatePath(const nav_msgs::msg::Path& path, double min_dist) +{ + nav_msgs::msg::Path deduplicated_path; + if (path.poses.size() <= 2) + { + deduplicated_path = path; + return deduplicated_path; + } + deduplicated_path.poses.push_back(path.poses.front()); + geometry_msgs::msg::Point prev_point = path.poses[0].pose.position; + for (int i = 1; i < path.poses.size() - 1; i++) + { + double dist_to_prev = misc_utils_ns::PointXYZDist( + prev_point, path.poses[i].pose.position); + if (dist_to_prev > min_dist) + { + deduplicated_path.poses.push_back(path.poses[i]); + prev_point = path.poses[i].pose.position; + } + } + double dist_to_prev = misc_utils_ns::PointXYZDist( + prev_point, path.poses.back().pose.position); + double dist_to_start = misc_utils_ns::PointXYZDist( + path.poses.front().pose.position, path.poses.back().pose.position); + if (dist_to_prev > min_dist && dist_to_start > min_dist) + { + deduplicated_path.poses.push_back(path.poses.back()); + } + return deduplicated_path; +} + +void SampleLineSegments(const std::vector& initial_points, double sample_resol, + std::vector& sample_points) + +{ + if (initial_points.size() < 2) + { + sample_points = initial_points; + return; + } + sample_points.clear(); + int cur_idx = 0; + int next_idx = 1; + Eigen::Vector3d cur_point = initial_points[cur_idx]; + Eigen::Vector3d next_point = initial_points[next_idx]; + sample_points.push_back(cur_point); + double sample_dist = sample_resol; + while (next_idx < initial_points.size()) + { + next_point = initial_points[next_idx]; + double dist_to_next = (cur_point - next_point).norm(); + if (dist_to_next > sample_dist) + { + Eigen::Vector3d sample_point = (next_point - cur_point).normalized() * sample_dist + cur_point; + sample_points.push_back(sample_point); + cur_point = sample_point; + sample_dist 
= sample_resol; + } + else + { + next_idx++; + cur_point = next_point; + sample_dist = sample_dist - dist_to_next; + } + } + sample_points.push_back(next_point); +} + +void UniquifyIntVector(std::vector& list) +{ + std::sort(list.begin(), list.end()); + std::vector::iterator it; + it = std::unique(list.begin(), list.end()); + list.resize(std::distance(list.begin(), it)); +} + +#include +#include +#include +#include +#include // 可选:用于输出调试信息 + +std::vector find_path_bfs( + int start, + int goal, + const Eigen::MatrixXi &adj_matrix) +{ + int n = adj_matrix.rows(); + + if (start < 0 || start >= n || goal < 0 || goal >= n) + { + std::cerr << "[find_path_bfs] Invalid start or goal index: " + << "start = " << start << ", goal = " << goal << ", size = " << n << std::endl; + return {}; + } + + if (adj_matrix.rows() != adj_matrix.cols()) + { + std::cerr << "[find_path_bfs] Error: adj_matrix is not square!" << std::endl; + return {}; + } + + std::vector visited(n, false); + std::unordered_map parent; + std::queue q; + + visited[start] = true; + q.push(start); + + while (!q.empty()) + { + int curr = q.front(); + q.pop(); + + if (curr == goal) + break; + + for (int neighbor = 0; neighbor < n; ++neighbor) + { + if (curr < 0 || curr >= n || neighbor < 0 || neighbor >= n) + continue; + + if (adj_matrix(curr, neighbor) != 0 && !visited[neighbor]) + { + visited[neighbor] = true; + parent[neighbor] = curr; + q.push(neighbor); + } + } + } + + if (!visited[goal]) + { + std::cerr << "[find_path_bfs] No path found from " << start << " to " << goal << std::endl; + return {}; + } + + std::vector path; + int at = goal; + while (at != start) + { + path.push_back(at); + if (parent.find(at) == parent.end()) + { + std::cerr << "[find_path_bfs] Error: broken path during backtracking at " << at << std::endl; + return {}; + } + at = parent[at]; + } + path.push_back(start); + std::reverse(path.begin(), path.end()); + return path; +} + +// 辅助函数:将 HSV 映射为 RGB,范围都是 [0, 255] +Eigen::Vector3d HSVtoRGB(double h, double s, double v) +{ + double c = v * s; + double x = c * (1 - std::fabs(fmod(h / 60.0, 2) - 1)); + double m = v - c; + double r_, g_, b_; + + if (h < 60) + { + r_ = c; + g_ = x; + b_ = 0; + } + else if (h < 120) + { + r_ = x; + g_ = c; + b_ = 0; + } + else if (h < 180) + { + r_ = 0; + g_ = c; + b_ = x; + } + else if (h < 240) + { + r_ = 0; + g_ = x; + b_ = c; + } + else if (h < 300) + { + r_ = x; + g_ = 0; + b_ = c; + } + else + { + r_ = c; + g_ = 0; + b_ = x; + } + + int r = static_cast((r_ + m) * 255.0); + int g = static_cast((g_ + m) * 255.0); + int b = static_cast((b_ + m) * 255.0); + + return Eigen::Vector3d(b, g, r); // 注意:OpenCV 中 BGR 顺序 +} + +// 主函数:ID 映射为鲜艳 RGB 颜色 +Eigen::Vector3d idToColor(int id) +{ + if (id == 0) + { + return Eigen::Vector3d(255, 255, 255); // 白色表示无效或未分配的 ID + } + + double hue = (id * 57) % 360; // 色相:分布在 0~360 + double saturation = 0.85; // 高饱和度 + double value = 0.95; // 高亮度 + + return HSVtoRGB(hue, saturation, value); +} + +// 将点坐标转换为体素网格索引 +Eigen::Vector3i point_to_voxel( + const Eigen::Vector3f &pt, + const Eigen::Vector3f &origin_shift, // e.g., (-150.0, -150.0, -1.0) + float inv_resolution) // precomputed 1.0 / grid_resolution +{ + Eigen::Vector3f shifted = pt * inv_resolution + origin_shift; + return shifted.array().floor().cast(); +} + +Eigen::Vector3i point_to_voxel( + geometry_msgs::msg::Point pt, + const Eigen::Vector3f &origin_shift, // e.g., (-150.0, -150.0, -1.0) + float inv_resolution) // precomputed 1.0 / grid_resolution +{ + Eigen::Vector3f pt_eigen(pt.x, pt.y, pt.z); + 
Eigen::Vector3f shifted = pt_eigen * inv_resolution + origin_shift; + return shifted.array().floor().cast(); +} + +// 将体素网格索引转换为点坐标 +Eigen::Vector3f voxel_to_point( + const Eigen::Vector3i &voxel_index, + const Eigen::Vector3f &origin_shift, // e.g., (-150.0, -150.0, -1.0) + float resolution) // grid_resolution +{ + Eigen::Vector3f pt = voxel_index.cast() - origin_shift; + return pt * resolution; +} + +Eigen::Vector3i point_to_voxel_cropped( + const Eigen::Vector3f &pt, + const Eigen::Vector3f &origin_shift, // e.g., (-150.0, -150.0, -1.0) + float inv_resolution, + std::vector &bbox) // crop region bbox +{ + Eigen::Vector3f shifted = pt * inv_resolution + origin_shift; + shifted.x() = shifted.x() - bbox[0].x(); + shifted.y() = shifted.y() - bbox[0].y(); + shifted.z() = shifted.z(); + + return shifted.array().floor().cast(); +} + +// 将体素网格索引转换为点坐标 +Eigen::Vector3f voxel_to_point_cropped( + const Eigen::Vector3i &voxel_index, + const Eigen::Vector3f &origin_shift, // e.g., (-150.0, -150.0, -1.0) + float resolution, + std::vector &bbox) // crop region bbox +{ + Eigen::Vector3f voxel_idx = voxel_index.cast(); + voxel_idx.x() += bbox[0].x(); + voxel_idx.y() += bbox[0].y(); + Eigen::Vector3f pt = voxel_idx - origin_shift; + return pt * resolution; +} + +} // namespace misc_utils_ns + +template void misc_utils_ns::KeyposeToMap::Ptr>( + pcl::PointCloud::Ptr& cloud, const nav_msgs::msg::Odometry::ConstPtr& keypose); +template void misc_utils_ns::KeyposeToMap::Ptr>( + pcl::PointCloud::Ptr& cloud, const nav_msgs::msg::Odometry::ConstPtr& keypose); +template void misc_utils_ns::KeyposeToMap::Ptr>( + pcl::PointCloud::Ptr& cloud, const nav_msgs::msg::Odometry::ConstPtr& keypose); +template void misc_utils_ns::KeyposeToMap::Ptr>( + pcl::PointCloud::Ptr& cloud, const nav_msgs::msg::Odometry::ConstPtr& keypose); diff --git a/track_anything/setup.cfg b/track_anything/setup.cfg index 356c3c5..f5d61c5 100644 --- a/track_anything/setup.cfg +++ b/track_anything/setup.cfg @@ -1,5 +1,3 @@ -[develop] -script_dir=$base/lib/track_anything [install] install_scripts=$base/lib/track_anything diff --git a/track_anything/setup.py b/track_anything/setup.py index 8069b21..ce62bac 100644 --- a/track_anything/setup.py +++ b/track_anything/setup.py @@ -1,9 +1,30 @@ from setuptools import find_packages, setup import os +import sys from glob import glob package_name = 'track_anything' +# colcon --symlink-install may pass args setuptools doesn't understand +def _strip_colcon_args(argv: list[str]) -> list[str]: + cleaned: list[str] = [] + skip_next = False + for i, arg in enumerate(argv): + if skip_next: + skip_next = False + continue + if arg == "--editable": + continue + if arg == "--build-directory": + skip_next = True + continue + if arg.startswith("--build-directory="): + continue + cleaned.append(arg) + return cleaned + +sys.argv = _strip_colcon_args(sys.argv) + setup( name=package_name, version='0.0.1', @@ -26,7 +47,6 @@ maintainer_email='alex.lin416@outlook.com', description='ROS2 package for tracking objects using Track-Anything', license='MIT', - tests_require=['pytest'], entry_points={ 'console_scripts': [ ], diff --git a/vector_perception_utils/setup.cfg b/vector_perception_utils/setup.cfg deleted file mode 100644 index 1ce0c22..0000000 --- a/vector_perception_utils/setup.cfg +++ /dev/null @@ -1,5 +0,0 @@ -[develop] -script_dir=$base/lib/vector_perception_utils -[install] -install_scripts=$base/lib/vector_perception_utils - diff --git a/vector_perception_utils/setup.py b/vector_perception_utils/setup.py index 
f1b383f..84834ac 100644 --- a/vector_perception_utils/setup.py +++ b/vector_perception_utils/setup.py @@ -1,9 +1,30 @@ from setuptools import find_packages, setup +import sys import os from glob import glob package_name = 'vector_perception_utils' +# colcon --symlink-install may pass args setuptools doesn't understand +def _strip_colcon_args(argv: list[str]) -> list[str]: + cleaned: list[str] = [] + skip_next = False + for i, arg in enumerate(argv): + if skip_next: + skip_next = False + continue + if arg == "--editable": + continue + if arg == "--build-directory": + skip_next = True + continue + if arg.startswith("--build-directory="): + continue + cleaned.append(arg) + return cleaned + +sys.argv = _strip_colcon_args(sys.argv) + setup( name=package_name, version='0.0.1', @@ -21,7 +42,6 @@ maintainer_email='alex.lin416@outlook.com', description='Utility functions for image and point cloud processing', license='MIT', - tests_require=['pytest'], entry_points={ 'console_scripts': [ 'detection_to_waypoint = vector_perception_utils.detection_to_waypoint:main', diff --git a/vector_perception_utils/vector_perception_utils/pointcloud_utils.py b/vector_perception_utils/vector_perception_utils/pointcloud_utils.py index 835db76..40e7141 100644 --- a/vector_perception_utils/vector_perception_utils/pointcloud_utils.py +++ b/vector_perception_utils/vector_perception_utils/pointcloud_utils.py @@ -2,6 +2,7 @@ from typing import Tuple, Optional +import cv2 import numpy as np import open3d as o3d from scipy.spatial.transform import Rotation @@ -148,6 +149,75 @@ def remove_radius_outliers( return points_filtered, colors_filtered, inlier_indices +def fit_plane( + points: np.ndarray, + distance_threshold: float = 0.05, + ransac_n: int = 3, + num_iterations: int = 50, + max_points: int = 8000, + min_inlier_ratio: float = 0.0, + min_normal_z: float = 0.0, + min_above_ratio: float = 0.0, + prefer_up: bool = True, +) -> Optional[np.ndarray]: + """Fit a plane to a point cloud using RANSAC. + + Args: + points: Point coordinates (N, 3). + distance_threshold: Max distance for inliers during RANSAC. + ransac_n: Number of points to sample per RANSAC iteration. + num_iterations: RANSAC iterations. + max_points: Max points to sample for fitting (0 keeps all). + min_inlier_ratio: Minimum inlier ratio to accept the plane. + min_normal_z: Minimum absolute Z component of the plane normal. + min_above_ratio: Minimum ratio of points above the plane. + prefer_up: If True, flip normal to have positive Z. + + Returns: + Plane coefficients [a, b, c, d] for ax + by + cz + d = 0, or None. 
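+        None is also returned when fewer than 3 points are provided or any of the acceptance checks above fail.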
+ """ + if points.shape[0] < 3: + return None + + sample_points = points + if max_points > 0 and points.shape[0] > max_points: + indices = np.random.choice(points.shape[0], max_points, replace=False) + sample_points = points[indices] + + pcd = create_pointcloud(sample_points) + plane_model, inliers = pcd.segment_plane( + distance_threshold=distance_threshold, + ransac_n=ransac_n, + num_iterations=num_iterations, + ) + if len(inliers) == 0: + return None + if min_inlier_ratio > 0.0 and len(inliers) / sample_points.shape[0] < min_inlier_ratio: + return None + + normal = np.array(plane_model[:3], dtype=float) + norm = float(np.linalg.norm(normal)) + if norm == 0.0: + return None + normal /= norm + d = float(plane_model[3]) / norm + + if min_normal_z > 0.0 and abs(normal[2]) < min_normal_z: + return None + if prefer_up and normal[2] < 0: + normal = -normal + d = -d + + if min_above_ratio > 0.0: + check_points = sample_points + distances = check_points @ normal + d + above_ratio = float(np.mean(distances > 0.0)) + if above_ratio < min_above_ratio: + return None + + return np.array([normal[0], normal[1], normal[2], d], dtype=float) + + def transform_pointcloud( points: np.ndarray, transform: np.ndarray @@ -199,42 +269,126 @@ def crop_pointcloud_bbox( def compute_centroid(points: np.ndarray) -> np.ndarray: """ Compute centroid of point cloud. - + Args: points: Point coordinates (N, 3) - + Returns: Centroid as (3,) array """ return np.mean(points, axis=0) +def get_bbox_center(bbox) -> np.ndarray: + """Return bounding box center for Open3D AABB/OBB variants.""" + if hasattr(bbox, "center"): + return np.asarray(bbox.center) + if hasattr(bbox, "get_center"): + return np.asarray(bbox.get_center()) + if hasattr(bbox, "get_min_bound") and hasattr(bbox, "get_max_bound"): + min_bound = np.asarray(bbox.get_min_bound()) + max_bound = np.asarray(bbox.get_max_bound()) + return (min_bound + max_bound) * 0.5 + return np.zeros(3, dtype=float) + + +def get_bbox_extent(bbox) -> np.ndarray: + """Return bounding box extent for Open3D AABB/OBB variants.""" + if hasattr(bbox, "extent"): + return np.asarray(bbox.extent) + if hasattr(bbox, "get_extent"): + return np.asarray(bbox.get_extent()) + if hasattr(bbox, "get_min_bound") and hasattr(bbox, "get_max_bound"): + min_bound = np.asarray(bbox.get_min_bound()) + max_bound = np.asarray(bbox.get_max_bound()) + return max_bound - min_bound + return np.zeros(3, dtype=float) + + +def get_bbox_corners(bbox) -> np.ndarray: + """Return 8 bbox corners for Open3D AABB/OBB variants.""" + if hasattr(bbox, "get_box_points"): + return np.asarray(bbox.get_box_points()) + if hasattr(bbox, "get_min_bound") and hasattr(bbox, "get_max_bound"): + min_bound = np.asarray(bbox.get_min_bound()) + max_bound = np.asarray(bbox.get_max_bound()) + return np.array( + [ + [min_bound[0], min_bound[1], min_bound[2]], + [max_bound[0], min_bound[1], min_bound[2]], + [max_bound[0], max_bound[1], min_bound[2]], + [min_bound[0], max_bound[1], min_bound[2]], + [min_bound[0], min_bound[1], max_bound[2]], + [max_bound[0], min_bound[1], max_bound[2]], + [max_bound[0], max_bound[1], max_bound[2]], + [min_bound[0], max_bound[1], max_bound[2]], + ], + dtype=float, + ) + return np.zeros((0, 3), dtype=float) + + +def get_oriented_bbox( + points: np.ndarray, + axis_aligned: bool = True, +) -> Optional[Tuple[np.ndarray, np.ndarray]]: + """ + Compute bounding box from point cloud. + + Args: + points: Point coordinates (N, 3) + axis_aligned: If True, compute axis-aligned bounding box. 
+ If False, compute minimal oriented bounding box. + + Returns: + Tuple of (center, corners) where center is (3,) and corners is (8, 3), + or None if insufficient points. + """ + if len(points) < 4: + return None + + pcd = create_pointcloud(points) + + try: + if axis_aligned: + bbox = pcd.get_axis_aligned_bounding_box() + else: + bbox = pcd.get_minimal_oriented_bounding_box() + except Exception: + return None + + center = get_bbox_center(bbox) + corners = get_bbox_corners(bbox) + return center, corners + def pointcloud_to_bbox3d(points: np.ndarray, header=None): """ Convert point cloud to minimal oriented 3D bounding box (ROS2 message). - + Args: points: Point coordinates (N, 3) header: Optional ROS2 header for the message - + Returns: vision_msgs/BoundingBox3D message or None if insufficient points """ - + if len(points) < 4: return None pcd = create_pointcloud(points) - + try: obb = pcd.get_minimal_oriented_bounding_box() except: return None - - center = obb.center - extent = obb.extent - quat = Rotation.from_matrix(obb.R).as_quat() - + center = get_bbox_center(obb) + extent = get_bbox_extent(obb) + if hasattr(obb, "R"): + quat = Rotation.from_matrix(obb.R).as_quat() + else: + quat = np.array([0.0, 0.0, 0.0, 1.0], dtype=float) + bbox = BoundingBox3D() bbox.center.position = Point( x=float(center[0]), y=float(center[1]), z=float(center[2]) @@ -243,7 +397,7 @@ def pointcloud_to_bbox3d(points: np.ndarray, header=None): x=float(quat[0]), y=float(quat[1]), z=float(quat[2]), w=float(quat[3]) ) bbox.size = Vector3(x=float(extent[0]), y=float(extent[1]), z=float(extent[2])) - + return bbox @@ -328,6 +482,255 @@ def pointcloud_to_depth( return depth_image +def project_pointcloud_to_image( + points: np.ndarray, + projection: str = "equirect", + image_width: int = 1920, + image_height: int = 640, + axis_mode: str = "xz", + intrinsics: Optional[np.ndarray] = None, + depth_filter: bool = False, + depth_filter_margin: float = 0.15, + transform: Optional[np.ndarray] = None, +) -> Tuple[np.ndarray, np.ndarray]: + """ + Project a point cloud into image pixel coordinates. + + Args: + points: Point coordinates (N, 3). + projection: Projection model ("equirect" or "pinhole"). + image_width: Output image width in pixels. + image_height: Output image height in pixels. + axis_mode: Equirect axis convention ("xz" or "xy"). + intrinsics: Pinhole intrinsics (3, 3) if using "pinhole". + depth_filter: Enable depth-based occlusion filtering. + depth_filter_margin: Depth margin for occlusion filtering (meters). + transform: Optional 4x4 transform applied before projection. + + Returns: + Tuple of (camera_points, pixel_idx) where pixel_idx is (N, 3) with (u, v, depth). 
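+        For the equirectangular model the depth channel is the horizontal (xz or xy) distance; for the pinhole model it is the camera-frame z.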
+ """ + camera_points = transform_pointcloud(points, transform) if transform is not None else points + + if projection == "pinhole": + if intrinsics is None: + raise ValueError("intrinsics required for pinhole projection") + u = camera_points[:, 0] * intrinsics[0, 0] / (camera_points[:, 2] + 1e-6) + intrinsics[0, 2] + v = camera_points[:, 1] * intrinsics[1, 1] / (camera_points[:, 2] + 1e-6) + intrinsics[1, 2] + return camera_points, np.stack([u, v, camera_points[:, 2]], axis=-1) + + if axis_mode == "xz": + hori_dis = np.sqrt(camera_points[:, 0] ** 2 + camera_points[:, 2] ** 2) + u = ( + image_width / (2 * np.pi) * np.arctan2(camera_points[:, 0], camera_points[:, 2]) + + image_width / 2 + + 1 + ) + v = ( + image_width / (2 * np.pi) * np.arctan(camera_points[:, 1] / hori_dis) + + image_height / 2 + + 1 + ) + else: + hori_dis = np.sqrt(camera_points[:, 0] ** 2 + camera_points[:, 1] ** 2) + u = ( + -image_width / (2 * np.pi) * np.arctan2(camera_points[:, 1], camera_points[:, 0]) + + image_width / 2 + + 1 + ) + v = ( + -image_width / (2 * np.pi) * np.arctan2(camera_points[:, 2], hori_dis) + + image_height / 2 + + 1 + ) + + u = np.clip(u, 0, image_width - 1) + v = np.clip(v, 0, image_height - 1) + + if depth_filter: + depth_map = np.full((image_height, image_width), np.inf) + idx = v.astype(int) * image_width + u.astype(int) + np.minimum.at(depth_map.ravel(), idx, hori_dis) + remove_mask = hori_dis >= depth_map[v.astype(int), u.astype(int)] + depth_filter_margin + u = u.copy() + v = v.copy() + u[remove_mask] = -1 + v[remove_mask] = -1 + + return camera_points, np.stack([u, v, hori_dis], axis=-1) + + +def project_bbox3d_equirect( + image: np.ndarray, + bbox3d: np.ndarray, + body_to_world: np.ndarray, + sensor_to_camera: np.ndarray | None, + image_width: int, + image_height: int, + hfov: float = 360.0, + vfov: float = 120.0, + color: tuple[int, int, int] = (0, 255, 0), + thickness: int = 2, +) -> np.ndarray: + """Project a 3D bounding box onto an equirectangular image.""" + if bbox3d.shape != (8, 3): + return image + world_to_body = np.linalg.inv(body_to_world) + if sensor_to_camera is None: + sensor_to_camera = np.eye(4) + world_to_camera = sensor_to_camera @ world_to_body + points_h = np.concatenate([bbox3d, np.ones((8, 1), dtype=float)], axis=1) + cam_points = (world_to_camera @ points_h.T).T[:, :3] + + hfov_rad = max(1e-3, np.deg2rad(hfov)) + vfov_rad = max(1e-3, np.deg2rad(vfov)) + + coords = [] + for x, y, z in cam_points: + yaw = np.arctan2(x, z) + dist = np.sqrt(x * x + z * z) + pitch = np.arctan2(y, dist) + u = (yaw / hfov_rad + 0.5) * image_width + v = (0.5 - pitch / vfov_rad) * image_height + coords.append((int(round(u)), int(round(v)))) + + overlay = image.copy() + edges = [ + (0, 1), + (1, 2), + (2, 3), + (3, 0), + (4, 5), + (5, 6), + (6, 7), + (7, 4), + (0, 4), + (1, 5), + (2, 6), + (3, 7), + ] + for i, j in edges: + pt1 = coords[i] + pt2 = coords[j] + if _point_in_image(pt1, image_width, image_height) or _point_in_image( + pt2, image_width, image_height + ): + cv2.line(overlay, pt1, pt2, color, thickness) + return overlay + + +def _point_in_image(point: tuple[int, int], width: int, height: int) -> bool: + return 0 <= point[0] < width and 0 <= point[1] < height + + +def segment_pointcloud_by_mask( + points: np.ndarray, + mask: np.ndarray, + pixel_idx: np.ndarray, +) -> np.ndarray: + """ + Segment a point cloud using a binary mask and projected pixels. + + Args: + points: Point coordinates (N, 3). + mask: Binary mask (H, W). + pixel_idx: Pixel coordinates (N, 3) as (u, v, depth). 
+ + Returns: + Masked point cloud (M, 3). + """ + h, w = mask.shape[:2] + pixel_idx = pixel_idx.astype(int) + valid = ( + (pixel_idx[:, 0] >= 0) + & (pixel_idx[:, 0] < w) + & (pixel_idx[:, 1] >= 0) + & (pixel_idx[:, 1] < h) + ) + pixel_idx = pixel_idx[valid] + points = points[valid] + keep = mask[pixel_idx[:, 1], pixel_idx[:, 0]].astype(bool) + return points[keep] + + +def segment_pointclouds_from_projection( + points: np.ndarray, + masks: list[np.ndarray], + pixel_idx: np.ndarray, +) -> Tuple[list[np.ndarray], list[np.ndarray]]: + """ + Segment point clouds from projected pixels. + + Args: + points: Point coordinates (N, 3). + masks: List of binary masks (H, W). + pixel_idx: Pixel coordinates (N, 3) as (u, v, depth). + + Returns: + Tuple of (clouds, depths) where each entry corresponds to a mask. + """ + if masks is None or len(masks) == 0: + return [], [] + + image_shape = masks[0].shape + valid = ( + (pixel_idx[:, 0] >= 0) + & (pixel_idx[:, 0] < image_shape[1]) + & (pixel_idx[:, 1] >= 0) + & (pixel_idx[:, 1] < image_shape[0]) + ) + pixel_idx = pixel_idx[valid] + points = points[valid] + + depths = pixel_idx[:, 2] + pixel_idx = pixel_idx.astype(int) + + obj_clouds = [] + obj_depths = [] + for obj_mask in masks: + cloud_mask = obj_mask[pixel_idx[:, 1], pixel_idx[:, 0]].astype(bool) + obj_clouds.append(points[cloud_mask]) + obj_depths.append(depths[cloud_mask].reshape(-1, 1)) + + return obj_clouds, obj_depths + + +def transform_segmented_clouds_to_world( + clouds: list[np.ndarray], + depths: list[np.ndarray], + R_b2w: np.ndarray, + t_b2w: np.ndarray, + depth_jump_threshold: float = 0.3, +) -> list[np.ndarray]: + """ + Prune depth jumps and transform segmented clouds to world frame. + + Args: + clouds: List of segmented point clouds (sensor frame). + depths: List of depth arrays aligned with clouds. + R_b2w: Body-to-world rotation matrix (3, 3). + t_b2w: Body-to-world translation vector (3,). + depth_jump_threshold: Depth jump threshold for pruning (meters). + + Returns: + List of segmented object clouds in world frame. + """ + obj_cloud_world_list = [] + for obj_cloud, obj_depth in zip(clouds, depths): + if obj_depth.shape[0] > 1: + obj_depth_diff = (obj_depth[1:] - obj_depth[:-1]).squeeze() + if obj_depth_diff.size > 0 and np.max(obj_depth_diff) > depth_jump_threshold: + idx_tmp = np.ones(len(obj_depth), dtype=bool) + jump_idx = int(np.argmax(obj_depth_diff)) + idx_tmp[jump_idx + 1 :] = False + obj_cloud = obj_cloud[idx_tmp] + + obj_cloud_world = obj_cloud[:, :3] @ R_b2w.T + t_b2w + obj_cloud_world_list.append(obj_cloud_world) + + return obj_cloud_world_list + + def create_pointcloud2_msg(points: np.ndarray, colors: np.ndarray, header): """ Create PointCloud2 ROS message from points and colors. diff --git a/vector_perception_utils/vector_perception_utils/transform_utils.py b/vector_perception_utils/vector_perception_utils/transform_utils.py index 534d8d2..ef3238e 100644 --- a/vector_perception_utils/vector_perception_utils/transform_utils.py +++ b/vector_perception_utils/vector_perception_utils/transform_utils.py @@ -8,7 +8,7 @@ from geometry_msgs.msg import Pose, Quaternion -def normalize_angle(angle: float) -> float: +def normalize_angle(angle: float | np.ndarray) -> float | np.ndarray: """ Normalize angle to [-pi, pi] range. @@ -21,6 +21,26 @@ def normalize_angle(angle: float) -> float: return np.arctan2(np.sin(angle), np.cos(angle)) +def discretize_angles(angles: np.ndarray, num_bins: int) -> np.ndarray: + """ + Discretize angles into bin indices. 
+ + Normalizes angles to [-pi, pi] then maps to bin indices [0, num_bins-1]. + + Args: + angles: Angles in radians (any range). + num_bins: Number of angle bins. + + Returns: + Array of bin indices (0 to num_bins-1), same shape as input. + """ + angles = normalize_angle(angles) + # Map [-pi, pi] to [0, 2*pi] then to [0, num_bins) + normalized = (angles + np.pi) / (2 * np.pi) + bins = (normalized * num_bins).astype(int) + return np.clip(bins, 0, num_bins - 1) + + def pose_to_matrix(pose: Pose) -> np.ndarray: """ Convert pose to 4x4 homogeneous transform matrix. diff --git a/vlm/README.md b/vlm/README.md new file mode 100644 index 0000000..2bd9fbf --- /dev/null +++ b/vlm/README.md @@ -0,0 +1,67 @@ +# VLM Package + +Provider-agnostic vision-language model (VLM) adapters and shared interfaces for +`vector_perception_ros`. + +## Quick Start + +Install deps and run tests: + +```bash +source /home/alex-lin/dev/vector_venv/bin/activate +pip install -r /home/alex-lin/dev/vector_perception_ros/requirements.txt +pytest -s /home/alex-lin/dev/vector_perception_ros/vlm/test/test_vlm.py +``` + +Set API keys: + +```bash +export MOONDREAM_API_KEY=... +# or +export QWEN_API_KEY=... +``` + +## API Overview + +Core interface: +- `VlmModelBase` (`vlm/models/base.py`) + - `query(image, prompt)` + - `query_batch(images, prompt)` + - `caption(image)` + - `detect(image, query)` → list of `VlmDetection` + - `point(image, query)` → list of `VlmPoint` + - `segment(image, query)` → list of `VlmMask` + +Adapters: +- `MoondreamVlm` (`vlm/models/moondream.py`) + - Uses Moondream hosted API + - Native detect/point/segment via skills + - Segment returns SVG path in `VlmMask.svg_path` +- `QwenVlm` (`vlm/models/qwen.py`) + - Uses OpenAI-compatible DashScope endpoint + - Prompt fallback for detect/point (JSON parsing) + +## Package Structure + +``` +vlm/ +├── docs/ +│ └── VLM_NODE_MIGRATION.md +├── test/ +│ └── test_vlm.py +├── vlm/ +│ ├── __init__.py +│ └── models/ +│ ├── base.py +│ ├── types.py +│ ├── prompts.py +│ ├── prompt_fallback.py +│ ├── moondream.py +│ └── qwen.py +``` + +## Notes + +- Images are numpy arrays in RGB unless otherwise stated. +- `VlmMask` supports `mask` (mono8) and/or `svg_path` for segmentation outputs. +- The VLM reasoning ROS node lives under `semantic_mapping/semantic_mapping/`. diff --git a/vlm/data/kitchen.jpg b/vlm/data/kitchen.jpg new file mode 100644 index 0000000..f143693 Binary files /dev/null and b/vlm/data/kitchen.jpg differ diff --git a/vlm/docs/VLM_INPUTS_AND_SELECTION.md b/vlm/docs/VLM_INPUTS_AND_SELECTION.md new file mode 100644 index 0000000..6f1a471 --- /dev/null +++ b/vlm/docs/VLM_INPUTS_AND_SELECTION.md @@ -0,0 +1,271 @@ +# VLM Inputs and "Best Image" Selection + +This note documents how room-type and object-type VLM inputs are produced in the +current migration, and what "best image" actually means in the pipeline. It is +intended to clarify why viewpoint snapshots exist and how they relate (or do +not relate) to room-type queries. + +## 1) Room segmentation and room-type image crop (legacy planner) + +Room segmentation produces a 2D integer room mask (room ids) over a grid. The +planner (`sensor_coverage_planner_ground.cpp`) maintains this mask and assigns +room ids to points in the local covered cloud. When a room is newly labeled or +has grown sufficiently, the planner emits a `RoomType` query that contains: + +- `image`: a **cropped panorama** derived from the current camera image. +- `room_mask`: the **top-down room mask** (grid ids) for context. 
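+
+As a rough sketch of how a consumer could stitch these two inputs into a single
+image for a room-type prompt (the helper name, colormap choice, and prompt text
+below are illustrative, not the node's actual code; a 3-channel panorama crop is
+assumed):
+
+```python
+import cv2
+import numpy as np
+
+def build_room_type_mosaic(panorama_crop: np.ndarray, room_mask: np.ndarray) -> np.ndarray:
+    """Stack the cropped panorama above a colorized top-down room mask (sketch only)."""
+    h, w = panorama_crop.shape[:2]
+    # Spread the integer room ids over 0-255 so distinct rooms get distinct colors.
+    mask_u8 = cv2.normalize(room_mask.astype(np.float32), None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
+    mask_vis = cv2.applyColorMap(mask_u8, cv2.COLORMAP_JET)
+    mask_vis = cv2.resize(mask_vis, (w, h), interpolation=cv2.INTER_NEAREST)
+    return np.vstack([panorama_crop, mask_vis])
+
+# mosaic = build_room_type_mosaic(query_image, query_room_mask)
+# answer = vlm.query(mosaic, "What type of room is shown? Answer with one word.")
+```
+
+The `vlm.query(image, prompt)` call in the comment matches the `VlmModelBase`
+interface listed in the package README; everything else is a placeholder.
+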
+ +How the crop is computed: + +1. Collect points in the room (covered cloud) and add the room center. +2. Use the robot pose at the camera timestamp to project those 3D points into + the equirectangular camera image (`project_pcl_to_image`). +3. Track the horizontal pixel range occupied by the projected points. +4. Rotate the image so the room center projection is at the horizontal center. +5. Crop the rotated panorama to `[min_u, max_u]` (plus padding). + +The crop logic is in: + +- `tare_planner/src/sensor_coverage_planner/sensor_coverage_planner_ground.cpp` + - `UpdateRoomLabel()` → `project_pcl_to_image(...)` + +Important detail: this crop is **not** based on viewpoint snapshots. It is +computed directly from the panorama and the room's 3D points. + +## 2) Do viewpoint images drive room-type VLM? + +**No.** Viewpoint images are saved for **object spatial queries**, not room +type. Room-type VLM uses the `RoomType` message (cropped panorama + room mask) +coming from the planner. + +Room-type flow in the new stack: + +1. `tare_planner` publishes `RoomType` query with `image` + `room_mask`. +2. `vlm_reasoning_node` builds a room-type prompt and sends the image+mask + mosaic to the VLM. +3. Answer is published back as `RoomType`. + +There is no "best view" selection from viewpoint snapshots for room-type. + +## 3) What are the viewpoint snapshots? + +Viewpoint snapshots are saved in `semantic_mapping` when `ViewpointRep` triggers +arrive. They capture the **full detection image** (no crop) and a transform. + +Storage: + +- `output/viewpoint_images//viewpoint_{id}.png` +- `output/viewpoint_images//viewpoint_{id}_transform.npy` + +Purpose: + +These are used only for **target-object spatial queries**. The VLM builds a +multi-view mosaic by loading viewpoint images and projecting the 3D bbox onto +each view (`project_bbox3d_equirect`). This is independent of room-type logic. + +Relevant code: + +- `semantic_mapping/mapping_node.py` + - `_save_viewpoint_snapshot(...)` +- `semantic_mapping/vlm_reasoning_node.py` + - `_build_spatial_mosaic(...)` + +## 4) "Best image" selection for object-type VLM + +"Best image" is only used for **object-type relabeling queries**, not room type. + +Selection logic (`semantic_mapping/mapping_node.py`): + +- For each observation, compute mask area. +- If this area is larger than the current best for that object: + - Crop the **full image** to the mask bounding box plus padding. + - Save the crop as `.npy` (and optional `.png`). + - Store `best_image_path` and `best_image_mask_area`. + +Details: + +- Padding is `max(height, width, 80)` of the mask bbox. +- A crop is considered eligible for VLM if `best_image_mask_area > 500`. +- When a new best crop is written, the object is marked `is_asked_vlm = False` + to allow a new object-type query. + +This is the only "best image" decision in the current stack. + +## 5) How viewpoints and objects relate to room labels + +This is handled entirely in the legacy planner (`tare_planner`) today. The +new `semantic_mapping` pipeline does **not** assign room ids or labels; it only +passes `viewpoint_id` through for spatial VLM queries. + +### Viewpoint → room association (legacy planner) + +Viewpoints are associated with rooms by **room mask lookup**: + +- When the room mask updates, `representation::UpdateViewpointRoomIdsFromMask` + converts each viewpoint position into a voxel index (using `shift_` and + `room_resolution_`) and looks up the room id in the mask. 
It then calls + `SetViewpointRoomRelation`, which: + - updates the viewpoint's `room_id_` + - removes the viewpoint from the old room's list (if it changed) + - adds the viewpoint id to the new room's `viewpoint_indices_` +- When a new viewpoint is created, the planner also sets its room id to the + **current room id** (derived from the room mask under the robot), ensuring + it is immediately attached to a room. + +Key locations: + +- `representation/representation.cpp` + - `UpdateViewpointRoomIdsFromMask(...)` + - `SetViewpointRoomRelation(...)` +- `sensor_coverage_planner_ground.cpp` + - `UpdateViewpointRep()` (assigns `current_room_id_` to new viewpoints) + +This is the only place where viewpoints are tied to room labels. The label is +stored on the room node, so "viewpoint → room label" is simply: + +`viewpoint.room_id_` → `room_nodes_map_[room_id].label` + +### Object → room association (legacy planner) + +Objects are associated with rooms via **object position lookup** in the room +mask: + +- For each object, the planner projects the object centroid to a voxel index. +- If the mask returns `0` (unknown), it **dilates** the search by a small + radius to find a nearby non-zero room id. +- It then calls `representation::SetObjectRoomRelation`, which updates the + object's `room_id_` and updates the room's `object_indices_` set. + +Key locations: + +- `sensor_coverage_planner_ground.cpp` + - `UpdateObjectVisibility()` → object centroid → `room_mask` lookup → + `SetObjectRoomRelation(...)` with dilation fallback +- `representation/representation.cpp` + - `SetObjectRoomRelation(...)` + +This provides the implicit "object → room label" mapping: + +`object.room_id_` → `room_nodes_map_[room_id].label` + +### Object ↔ viewpoint visibility (legacy + new stack) + +There is a separate **object ↔ viewpoint visibility** link: + +- The planner checks line-of-sight visibility between each object and the + current viewpoint. If visible, it: + - adds the object id to the viewpoint's `object_indices_` + - adds the viewpoint id to the object's `visible_viewpoint_indices_` +- When the new stack publishes `ObjectNode` messages, it includes + `viewpoint_id`. The representation updates the visibility lists when this + field is present. + +Key locations: + +- `sensor_coverage_planner_ground.cpp` + - visibility check loop around `viewpoint.AddObjectIndex(...)` +- `representation/representation.cpp` + - `UpdateObjectNode(...)` (uses `msg->viewpoint_id`) + +### Current migration caveat + +In `vector_perception_ros/semantic_mapping`: + +- `ObjectNode.msg` does **not** include `room_id`. +- `ViewpointRep.msg` contains only `viewpoint_id`. +- Room labels live in the planner (`RoomType` messages), not in the mapper. + +So today, **room labels are still owned by the legacy planner**. If you want +room label data inside `semantic_mapping`, you would need: + +- a `room_id` field on `ObjectNode` and/or `ViewpointRep`, and +- a room-segmentation-aware mapper (or a bridge back from `tare_planner`). + +## 6) Summary: what is used where? + +| VLM task | Image source | Crop method | Uses viewpoint images? | +|---|---|---|---| +| Room type | `RoomType.image` + `RoomType.room_mask` | `project_pcl_to_image` in planner | No | +| Object type | Best object crop from detection image | Mask bbox + padding | No | +| Target object spatial | Base object image + viewpoint snapshots | Project 3D bbox into each view | Yes | + +## 7) Handoff: improvements for next agent + +Please pass these to the next agent: + +1. 
Consider a room-type "best view" strategy: + - Use a stable viewpoint near the room anchor. + - Pick the viewpoint with max overlap between room mask and projected room + polygon, or highest room voxel coverage. +2. Evaluate whether the planner should throttle room-type queries (e.g. only + when room label is unknown or confidence is low). +3. Add a small diagram to this doc showing the three image pathways: + room type (cropped panorama + mask), object type (best crop), spatial query + (viewpoint mosaic). + +## 8) Room segmentation + planner coupling (legacy) + +Room segmentation publishes a `sensor_msgs/Image` on `/room_mask`. The planner +subscribes to it and uses the mask as the **authoritative room-id grid** for +multiple responsibilities: + +- Assigning room ids to voxelized points (room_id = room_mask at voxel index). +- Determining current room id and transitions between rooms. +- Validating anchor points for each room. +- Generating room-type queries when rooms appear or grow. + +Core interactions in the legacy planner: + +- `RoomMaskCallback`: resizes the room mask to the voxel grid, then updates + `viewpoint_manager` + `grid_world` and synchronizes room ids. +- `UpdateRoomLabel`: maps covered point cloud voxels to room ids via the mask, + aggregates per-room counts, and triggers room-type queries with cropped + panorama images. +- `SetCurrentRoomId`: uses the room mask to detect room transitions and split/ + merge events. + +This is why `sensor_coverage_planner_ground.cpp` has a large, intertwined loop: +room segmentation, exploration logic, and VLM query generation all live in one +control flow. + +## 9) Refactor direction: move room segmentation loop out of the 5k file + +Suggested approach to reduce coupling and move room segmentation processing out +of the 5k-line planner file: + +1. **Extract a RoomSegmentationManager class** (new file): + - Owns `room_mask_`, `room_mask_old_`, `room_resolution_`, and `shift_`. + - Provides APIs: + - `update_mask(room_mask_msg)` (handles resize + update) + - `room_id_at(point)` (voxel lookup + bounds check) + - `validate_anchor(room_id, anchor_point)` + - `detect_split_merge(current_room_id, robot_position, room_mask_old_)` + +2. **Extract a RoomTypeQueryBuilder class**: + - Encapsulates the `UpdateRoomLabel` logic: + - Collect room clouds. + - Build room centers and voxel counts. + - Compute panorama crop via `project_pcl_to_image`. + - Publish `RoomType` query messages when thresholds are met. + +3. **Move `project_pcl_to_image` into a dedicated utility**: + - The function is already self-contained; place it in a + `room_image_projection.{h,cpp}` module with unit-testable inputs. + +4. **Create a room segmentation node (optional)**: + - A new ROS2 node that consumes `/room_mask` + pose + point cloud and + publishes `RoomType` queries directly. + - This would let the planner subscribe to room-type answers without hosting + the segmentation loop at all. + +5. **Keep planner orchestration minimal**: + - Planner should only consume: + - `RoomType` answers + - room-node updates (if needed) + - room navigation queries / decisions + - This isolates exploration from perception and VLM-specific logic. + +This refactor would make room segmentation and room-type queries testable in +isolation, reduce `sensor_coverage_planner_ground.cpp` size, and allow the +compressed-image migration to happen in one place rather than scattered +throughout planner code. 
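+
+As a rough illustration of the `RoomSegmentationManager` surface proposed above (a
+sketch only: the real class would live in C++ next to the planner, and every name
+and coordinate convention below is a placeholder rather than current behavior):
+
+```python
+import numpy as np
+
+class RoomSegmentationManager:
+    """Owns the room-id grid and answers room lookups for the planner (sketch)."""
+
+    def __init__(self, room_resolution, shift):
+        self.room_resolution = room_resolution
+        self.shift = np.asarray(shift)   # world-to-grid offset (mirrors shift_)
+        self.room_mask = None            # 2D int grid of room ids, 0 = unknown
+        self.room_mask_old = None        # previous mask, kept for split/merge checks
+
+    def update_mask(self, room_mask):
+        # Keep the previous mask so detect_split_merge() can compare against it.
+        self.room_mask_old = self.room_mask
+        self.room_mask = room_mask
+
+    def room_id_at(self, point):
+        # Voxel lookup with a bounds check; 0 means "no room assigned".
+        if self.room_mask is None:
+            return 0
+        col, row = np.floor(
+            (np.asarray(point)[:2] - self.shift[:2]) / self.room_resolution
+        ).astype(int)
+        if not (0 <= row < self.room_mask.shape[0] and 0 <= col < self.room_mask.shape[1]):
+            return 0
+        return int(self.room_mask[row, col])
+```
+
+The remaining responsibilities (`validate_anchor`, `detect_split_merge`, and the
+`RoomTypeQueryBuilder`) would hang off the same grid lookup, which is what makes
+them unit-testable once extracted from the planner loop.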
diff --git a/vlm/docs/VLM_NODE_MIGRATION.md b/vlm/docs/VLM_NODE_MIGRATION.md new file mode 100644 index 0000000..4ccdd3e --- /dev/null +++ b/vlm/docs/VLM_NODE_MIGRATION.md @@ -0,0 +1,308 @@ +# VLM Node Migration Plan + +## Overview + +This document outlines the plan to refactor `VLM_ROS/src/vlm_node` into a clean, +provider-agnostic VLM layer inspired by `dimos` patterns, while removing Gemini +and prioritizing Moondream 3 as the first integrated backend. The goal is to +standardize skill APIs (query, caption, detect, point, segment) across models, +support both native skill endpoints (e.g., Moondream) and prompt-based JSON +extraction (e.g., Qwen), and preserve existing ROS I/O contracts. + +## Design Principles + +1. **Same ROS I/O contract** - Preserve topic names, message types, and semantics. +2. **Abstract, capability-driven VLMs** - One interface for all models, with + explicit capabilities to select native vs prompt-based skills. +3. **Prompt-based fallbacks** - When native skills are missing, use strict JSON + prompts and normalized output parsing. +4. **Real-time friendly** - Avoid excessive try/except and per-parameter checks; + fail fast with clear errors and keep hot paths lightweight. +5. **Secrets out of repo** - Move API keys to environment variables/config only. + +--- + +## Implementation Progress + +| Step | Component | Status | +|------|-----------|--------| +| 1 | Package layout & new docs | **COMPLETED** | +| 2 | Core abstraction (`base.py`) | **COMPLETED** | +| 3 | Capability + prompt fallback layer | **COMPLETED** | +| 4 | Moondream 3 adapter | **COMPLETED** | +| 5 | Prompt-based adapter (Qwen) | **COMPLETED** | +| 6 | VLM node integration | **IN PROGRESS** | +| 7 | Tests + config updates | **IN PROGRESS** | + +--- + +## Phase 1: Package Structure (COMPLETED) + +Target layout within `vector_perception_ros/vlm`: + +``` +vlm/ +├── docs/ +│ └── VLM_NODE_MIGRATION.md +├── vlm/ +│ ├── __init__.py +│ └── models/ +│ ├── __init__.py +│ ├── base.py # abstract VLM interface +│ ├── types.py # shared types + capabilities +│ ├── prompts.py # prompt templates for JSON outputs +│ ├── prompt_fallback.py # prompt-based detect/point fallback +│ ├── moondream.py # Moondream 3 adapter +│ ├── qwen.py # prompt-based adapter +│ ├── openai.py # optional future adapter +│ └── registry.py # model factory from config +``` + +--- + +## Phase 2: Core Abstraction (COMPLETED) + +### 2.1 Abstract Interface + +Create `VlmModelBase` that enforces the skill surface: + +```python +class VlmModelBase(ABC): + @abstractmethod + def query(self, image: Image, prompt: str, **kwargs) -> str: ... + + @abstractmethod + def query_batch(self, images: list[Image], prompt: str, **kwargs) -> list[str]: ... + + @abstractmethod + def caption(self, image: Image, **kwargs) -> str: ... + + @abstractmethod + def detect(self, image: Image, query: str, **kwargs) -> list[VlmDetection]: ... + + @abstractmethod + def point(self, image: Image, query: str, **kwargs) -> list[VlmPoint]: ... + + @abstractmethod + def segment(self, image: Image, query: str, **kwargs) -> list[VlmMask]: ... 
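+
+    # Backends also declare a VlmCapabilities instance (see 2.2 below) so callers
+    # can check for native detect/point/segment support before dispatching; this
+    # mirrors the `capabilities` attribute defined in vlm/models/base.py.
+    capabilities = VlmCapabilities()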
+``` + +### 2.2 Types and Capabilities + +Add lightweight dataclasses for consistent outputs: + +- `VlmDetection`: label + bbox (pixel coordinates) +- `VlmPoint`: label + (x, y) in pixels +- `VlmMask`: label + mono8 mask +- `VlmCapabilities`: flags for native `detect/point/segment/caption` + +### 2.3 Prompt-Based Skill Fallbacks + +Introduce a `PromptSkillMixin` or helper functions that implement: + +- `detect()` via JSON list of `[label, x1, y1, x2, y2]` +- `point()` via JSON list of `[label, x, y]` +- `segment()` via JSON list of SVG path or RLE mask format + +Models that lack native skills (e.g., Qwen) use these defaults. Models with +native skills override the method and set capability flags. + +--- + +## Phase 3: Moondream 3 Adapter (IN PROGRESS) + +### 3.1 Native Skill Mapping + +Implement `Moondream3Model` using the Moondream skills API: + +- `query()` → `query` +- `caption()` → `caption` +- `detect()` → `detect` (native bboxes) +- `point()` → `point` (native points) +- `segment()` → `segment` (native SVG path, stored in VlmMask) + +Moondream's skill support is documented here: +https://docs.moondream.ai/skills/ + +### 3.2 Output Normalization + +Convert Moondream outputs to `VlmDetection`, `VlmPoint`, `VlmMask` and normalize +coordinates to pixels using the input image shape. SVG paths are kept in +`VlmMask.svg_path` for now; mono8 rasterization can be added when needed. + +--- + +## Phase 4: Prompt-Based Adapter (Qwen) (COMPLETED) + +### 4.1 JSON-Only Prompt Templates + +Define canonical prompts in `models/prompts.py` for: + +- `detect`: JSON array of `[label, x1, y1, x2, y2]` +- `point`: JSON array of `[label, x, y]` +- `segment`: JSON array of SVG path strings (or mask encoding) + +### 4.2 Parsing + Validation + +Use strict JSON extraction. If parse fails, return empty detections and log +once per model to avoid spam. No deep per-field validation beyond shape checks. 
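+
+A minimal sketch of that parsing path (the helper name is illustrative; the shipped
+helpers in `vlm/models/prompt_fallback.py` follow the same overall shape):
+
+```python
+import json
+
+def parse_json_list(text: str) -> list:
+    """Strict extraction: strip an optional Markdown fence, then parse or give up."""
+    if "```" in text:
+        parts = text.split("```")
+        if len(parts) >= 3:
+            text = parts[1].removeprefix("json")  # tolerate ```json fences
+    try:
+        parsed = json.loads(text.strip())
+    except json.JSONDecodeError:
+        return []  # parse failure -> empty result; log once per model upstream
+    return parsed if isinstance(parsed, list) else []
+```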
+ +--- + +## VLM_ROS Reference (Source Implementation) + +Key files in `VLM_ROS/src/vlm_node/vlm_node/`: +- `vlm_reasoning_node.py`: main VLM node, prompt handling, queueing, threading +- `constants.py`: model name and default instruction fields +- `utils.py`: bbox projection helper for spatial queries + +### Topics (Inputs) + +| Topic | Type | Description | +|-------|------|-------------| +| `/room_type_query` | `RoomType` | Room image + mask for classification | +| `/room_navigation_query` | `NavigationQuery` | JSON + candidate room images | +| `/room_early_stop_1` | `RoomEarlyStop1` | Compare nearby vs current room | +| `/object_type_query` | `ObjectType` | Object crop + candidate labels | +| `/keyboard_input` | `std_msgs/String` | Natural language instruction | +| `/target_object_query` | `TargetObject` | Object crop for target validation | +| `/target_object_spatial_query` | `TargetObjectWithSpatial` | Object + viewpoints + bbox3d | +| `/anchor_object_query` | `TargetObject` | Object crop for anchor validation | + +### Topics (Outputs) + +| Topic | Type | Description | +|-------|------|-------------| +| `/room_type_answer` | `RoomType` | Room type classification | +| `/room_navigation_answer` | `VlmAnswer` | Selected room + reason | +| `/vlm_answer` | `OverlayText` | Debug overlay text in RViz | +| `/object_type_answer` | `ObjectType` | Verified label for object | +| `/target_object_instruction` | `TargetObjectInstruction` | Parsed instruction fields | +| `/target_object_answer` | `TargetObject` | Target match boolean | +| `/anchor_object_answer` | `TargetObject` | Anchor match boolean | + +### Object Query Flow + +1. **Instruction Decomposition** (`/keyboard_input`): + - Skips simple control tokens in `skip_set` (e.g., `next`, `dynamic`). + - Uses `instruction_decomposition_prompt` to extract: + `target_object`, `room_condition`, `spatial_condition`, + `attribute_condition`, `anchor_object`, `attribute_condition_anchor`. + - Publishes `TargetObjectInstruction` to `/target_object_instruction`. + +2. **Object Type Verification** (`/object_type_query`): + - Loads `img_path` `.npy` crop; if mask exists, dilates and draws contours. + - Uses `object_type_query_prompt` with candidate labels from + `semantic_mapping/config/objects.yaml`. + - Publishes `ObjectType` with `final_label` to `/object_type_answer`. + +3. **Target Object Match** (`/target_object_query`): + - Builds description using `object_label` and `room_label`. + - Uses `target_object_prompt` to return JSON `{is_target, reason}`. + - Publishes `TargetObject` with `is_target` to `/target_object_answer`. + +4. **Target Object + Spatial Match** (`/target_object_spatial_query`): + - Loads viewpoint images and projects `bbox3d` into each view using + `project_bbox3d` and stored transforms. + - Uses `target_object_spatial_prompt` with surrounding images. + - Publishes `TargetObject` with `is_target` to `/target_object_answer`. + +5. **Anchor Object Match** (`/anchor_object_query`): + - Similar to target object match, but uses the anchor instruction fields. + - Publishes `TargetObject` to `/anchor_object_answer`. + +Note: The node uses per-topic queues and spawns a thread per request, keeping +only the latest query per `room_id` or `object_id` when applicable. + +--- + +## Message Type Alignment + +`vector_perception_ros` now standardizes on `semantic_mapping/msg/*` for all +planner/VLM traffic. Legacy planner message packages are no longer used in the +migrated stack, so no bridge is required. 
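+
+In code this is simply an import change (illustrative snippet; the exact legacy
+module path on the VLM_ROS side is shown only for contrast):
+
+```python
+# Legacy (VLM_ROS / tare_planner stack) -- package path illustrative:
+# from tare_planner.msg import ObjectType, RoomType
+
+# Migrated stack (vector_perception_ros):
+from semantic_mapping.msg import ObjectType, RoomType
+```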
+ +### Duplicate Message Names (same fields, different package) + +| VLM_ROS (tare_planner) | vector_perception_ros (semantic_mapping) | +|---|---| +| `ObjectNode` | `ObjectNode` | +| `ObjectType` | `ObjectType` | +| `TargetObjectInstruction` | `TargetObjectInstruction` | +| `TargetObjectWithSpatial` | `TargetObjectWithSpatial` | +| `ViewpointRep` | `ViewpointRep` | +| `RoomType` | `RoomType` | + +### Additional messages migrated into `semantic_mapping` + +| Message | Used for | +|---|---| +| `RoomEarlyStop1` | Early room stopping decision | +| `NavigationQuery` | Room navigation request | +| `VlmAnswer` | Room navigation answer | +| `TargetObject` | Non‑spatial target/anchor verification | + +### Current Status in `vector_perception_ros` + +- `semantic_mapping/mapping_node.py` publishes planner topics using + `semantic_mapping/*` message types (no `/semantic` prefix required). +- `semantic_mapping/vlm_reasoning_node.py` subscribes to planner queries and + publishes answers using the same `semantic_mapping/*` types. +- The `tare_message_bridge` is removed; the planner consumes + `semantic_mapping/msg/*` directly. + +--- + +## Phase 5: VLM Reasoning Node Integration (PENDING) + +### 5.1 Model Selection (current) + +The ROS node now selects models via parameters: + +```yaml +vlm_reasoning_node: + ros__parameters: + vlm_provider: "moondream" + vlm_model: "moondream3" +``` + +`vlm_reasoning_node.py` instantiates `MoondreamVlm` or `QwenVlm` directly. The +registry-based factory (`registry.py`) remains a follow‑up item. + +### 5.2 `vlm_reasoning_node.py` Integration (current) + +Implemented: + +- Room type query/answer via `semantic_mapping/RoomType` +- Object type verification via `semantic_mapping/ObjectType` +- Instruction decomposition → `TargetObjectInstruction` +- Target object spatial queries → `TargetObjectWithSpatial` +- Room label visualization on `/room_type_vis` +- Unified `semantic_mapping/*` messages across planner and VLM nodes +- `project_bbox3d_equirect` moved into `vector_perception_utils` +- Moondream timeouts + prompt fixes to prevent hung calls + +**Not yet ported in current node:** +- `RoomEarlyStop1` callbacks +- `NavigationQuery` callbacks (room selection) +- `TargetObject` (non‑spatial) query/answer and anchor verification + +--- + +## Phase 6: Tests and Validation (IN PROGRESS) + +- Unit tests for prompt-based parsing and fallback behavior. +- Adapter tests for Moondream 3 response normalization. +- Minimal integration test to ensure ROS messages remain unchanged. +- Updated `test_vlm.py` to default to Moondream-only runs with opt-in Qwen and + strict network mode. + +--- + +## Compatibility Checklist + +- [x] Planner uses `semantic_mapping/msg/*` directly (no bridge). +- [x] Prompts for room/object tasks produce same JSON schema. +- [ ] `VLMNode` behavior unchanged from downstream consumer perspective. +- [x] Moondream 3 integrated as default backend. + diff --git a/vlm/package.xml b/vlm/package.xml new file mode 100644 index 0000000..74b9052 --- /dev/null +++ b/vlm/package.xml @@ -0,0 +1,20 @@ + + + + vlm + 0.0.1 + Provider-agnostic VLM interfaces and adapters. 
+ Alex Lin + MIT + + ament_cmake + + ament_copyright + ament_flake8 + ament_pep257 + python3-pytest + + + ament_python + + diff --git a/vlm/resource/vlm b/vlm/resource/vlm new file mode 100644 index 0000000..02cef39 --- /dev/null +++ b/vlm/resource/vlm @@ -0,0 +1 @@ +vlm diff --git a/vlm/setup.py b/vlm/setup.py new file mode 100644 index 0000000..30c9dd3 --- /dev/null +++ b/vlm/setup.py @@ -0,0 +1,43 @@ +from setuptools import find_packages, setup +import os +import sys +from glob import glob + +package_name = "vlm" + +# colcon --symlink-install may pass args setuptools doesn't understand +def _strip_colcon_args(argv: list[str]) -> list[str]: + cleaned: list[str] = [] + skip_next = False + for i, arg in enumerate(argv): + if skip_next: + skip_next = False + continue + if arg == "--editable": + continue + if arg == "--build-directory": + skip_next = True + continue + if arg.startswith("--build-directory="): + continue + cleaned.append(arg) + return cleaned + +sys.argv = _strip_colcon_args(sys.argv) + +setup( + name=package_name, + version="0.0.1", + packages=find_packages(exclude=["test"]), + data_files=[ + ("share/ament_index/resource_index/packages", ["resource/" + package_name]), + ("share/" + package_name, ["package.xml"]), + (os.path.join("share", package_name, "docs"), glob(os.path.join("docs", "*.md"))), + ], + install_requires=["setuptools"], + zip_safe=True, + maintainer="Alex Lin", + maintainer_email="alex.lin416@outlook.com", + description="Provider-agnostic VLM interfaces and adapters.", + license="MIT", +) diff --git a/vlm/test/test_vlm.py b/vlm/test/test_vlm.py new file mode 100644 index 0000000..71215c1 --- /dev/null +++ b/vlm/test/test_vlm.py @@ -0,0 +1,193 @@ +import importlib +import os +import ssl +import sys +import urllib.error +from pathlib import Path + +import numpy as np +import pytest + +ROOT_DIR = Path(__file__).resolve().parents[1] +if str(ROOT_DIR) not in sys.path: + sys.path.insert(0, str(ROOT_DIR)) + + +def _log(message: str) -> None: + print(message) + +PILImage = pytest.importorskip("PIL.Image", reason="Pillow is required for VLM tests") + + +def _skip_or_raise_network_error(exc: Exception, context: str) -> None: + strict = os.getenv("VLM_TEST_STRICT", "").lower() in ("1", "true", "yes") + if strict: + raise exc + pytest.skip(f"{context} failed due to network error: {exc}") + + +def _is_network_error(exc: Exception) -> bool: + return isinstance( + exc, + ( + urllib.error.URLError, + ssl.SSLError, + TimeoutError, + ConnectionError, + ), + ) + + +def _load_image() -> np.ndarray: + image_path = Path(__file__).resolve().parents[1] / "data" / "kitchen.jpg" + with PILImage.open(image_path) as image: + return np.array(image.convert("RGB")) + + +@pytest.fixture(scope="module") +def models() -> list[tuple[str, object]]: + models: list[tuple[str, object]] = [] + if "MOONDREAM_API_KEY" in os.environ: + try: + moondream_mod = importlib.import_module("vlm.models.moondream") + except ModuleNotFoundError: + moondream_mod = None + if moondream_mod is not None: + models.append(("moondream", moondream_mod.MoondreamVlm())) + if ( + os.getenv("VLM_TEST_QWEN", "").lower() in ("1", "true", "yes") + and ("ALIBABA_API_KEY" in os.environ or "QWEN_API_KEY" in os.environ) + ): + try: + qwen_mod = importlib.import_module("vlm.models.qwen") + except ModuleNotFoundError: + qwen_mod = None + if qwen_mod is not None: + models.append(("qwen", qwen_mod.QwenVlm())) + if not models: + pytest.skip( + "No VLM backends available; set MOONDREAM_API_KEY or enable Qwen with VLM_TEST_QWEN=1." 
+ ) + return models + + +def _validate_bbox(bbox: tuple[float, float, float, float], width: int, height: int) -> None: + x1, y1, x2, y2 = bbox + assert 0.0 <= x1 <= width + assert 0.0 <= x2 <= width + assert 0.0 <= y1 <= height + assert 0.0 <= y2 <= height + assert x2 >= x1 + assert y2 >= y1 + + +def _validate_point(point: tuple[float, float], width: int, height: int) -> None: + x, y = point + assert 0.0 <= x <= width + assert 0.0 <= y <= height + + +@pytest.fixture(scope="module") +def image_data() -> tuple[np.ndarray, int, int]: + image = _load_image() + height, width = image.shape[:2] + return image, height, width + + +def test_vlm_query(models: list[tuple[str, object]], image_data: tuple[np.ndarray, int, int]) -> None: + image, _, _ = image_data + for model_name, model in models: + _log(f"\nModel: {model_name} (query)") + room_answer = model.query(image, "What room is this?") + _log(f"query(room): {room_answer}") + assert isinstance(room_answer, str) + + +def test_vlm_caption(models: list[tuple[str, object]], image_data: tuple[np.ndarray, int, int]) -> None: + image, _, _ = image_data + for model_name, model in models: + _log(f"\nModel: {model_name} (caption)") + caption = model.caption(image) + _log(f"caption: {caption}") + assert isinstance(caption, str) + + +def test_vlm_query_batch(models: list[tuple[str, object]], image_data: tuple[np.ndarray, int, int]) -> None: + image, _, _ = image_data + for model_name, model in models: + _log(f"\nModel: {model_name} (query_batch)") + batch = model.query_batch([image, image], "Describe this image in one sentence.") + _log(f"query_batch: {[text[:80] for text in batch]}") + assert isinstance(batch, list) + assert len(batch) == 2 + assert all(isinstance(item, str) for item in batch) + + +def test_vlm_detect(models: list[tuple[str, object]], image_data: tuple[np.ndarray, int, int]) -> None: + image, height, width = image_data + queries = ["fridge", "microwave"] + for model_name, model in models: + if not model.capabilities.detect: + _log(f"\nModel: {model_name} (detect) skipped") + continue + _log(f"\nModel: {model_name} (detect)") + for query in queries: + try: + detections = model.detect(image, query) + except Exception as exc: + if _is_network_error(exc): + _skip_or_raise_network_error(exc, f"{model_name} detect") + raise + _log(f"detect({query}): {len(detections)}") + assert isinstance(detections, list) + for detection in detections: + assert isinstance(detection.label, str) + _validate_bbox(detection.bbox, width, height) + + +def test_vlm_point(models: list[tuple[str, object]], image_data: tuple[np.ndarray, int, int]) -> None: + image, height, width = image_data + queries = ["fridge", "microwave"] + for model_name, model in models: + if not model.capabilities.point: + _log(f"\nModel: {model_name} (point) skipped") + continue + _log(f"\nModel: {model_name} (point)") + for query in queries: + try: + points = model.point(image, query) + except Exception as exc: + if _is_network_error(exc): + _skip_or_raise_network_error(exc, f"{model_name} point") + raise + _log(f"point({query}): {len(points)}") + assert isinstance(points, list) + for point in points: + assert isinstance(point.label, str) + _validate_point(point.point, width, height) + + +def test_vlm_segment(models: list[tuple[str, object]], image_data: tuple[np.ndarray, int, int]) -> None: + image, height, width = image_data + queries = ["fridge", "microwave"] + for model_name, model in models: + if not model.capabilities.segment: + _log(f"\nModel: {model_name} (segment) skipped") + continue + 
_log(f"\nModel: {model_name} (segment)") + for query in queries: + try: + masks = model.segment(image, query) + except Exception as exc: + if _is_network_error(exc): + _skip_or_raise_network_error(exc, f"{model_name} segment") + raise + _log(f"segment({query}): {len(masks)}") + assert isinstance(masks, list) + for mask in masks: + assert isinstance(mask.label, str) + assert mask.format in ("mono8", "svg_path") + if mask.mask is not None: + assert mask.mask.shape[:2] == (height, width) + if mask.svg_path is not None: + assert isinstance(mask.svg_path, str) diff --git a/vlm/vlm/__init__.py b/vlm/vlm/__init__.py new file mode 100644 index 0000000..1926a1a --- /dev/null +++ b/vlm/vlm/__init__.py @@ -0,0 +1,27 @@ +"""VLM package with provider-agnostic interfaces and adapters.""" + +from vlm.models import ( + MoondreamConfig, + MoondreamVlm, + PromptFallbackVlm, + QwenConfig, + QwenVlm, + VlmCapabilities, + VlmDetection, + VlmMask, + VlmModelBase, + VlmPoint, +) + +__all__ = [ + "VlmCapabilities", + "VlmDetection", + "VlmMask", + "VlmModelBase", + "VlmPoint", + "PromptFallbackVlm", + "MoondreamVlm", + "MoondreamConfig", + "QwenVlm", + "QwenConfig", +] diff --git a/vlm/vlm/models/__init__.py b/vlm/vlm/models/__init__.py new file mode 100644 index 0000000..07ab808 --- /dev/null +++ b/vlm/vlm/models/__init__.py @@ -0,0 +1,27 @@ +"""Model interfaces and shared types for VLM backends.""" + +from vlm.models.base import VlmModelBase +from vlm.models.moondream import MoondreamVlm, MoondreamConfig +from vlm.models.prompt_fallback import PromptFallbackVlm +from vlm.models.qwen import QwenVlm, QwenConfig +from vlm.models.types import ( + ImageType, + VlmCapabilities, + VlmDetection, + VlmMask, + VlmPoint, +) + +__all__ = [ + "ImageType", + "VlmCapabilities", + "VlmDetection", + "VlmMask", + "VlmModelBase", + "VlmPoint", + "PromptFallbackVlm", + "MoondreamVlm", + "MoondreamConfig", + "QwenVlm", + "QwenConfig", +] diff --git a/vlm/vlm/models/base.py b/vlm/vlm/models/base.py new file mode 100644 index 0000000..536432f --- /dev/null +++ b/vlm/vlm/models/base.py @@ -0,0 +1,33 @@ +from abc import ABC, abstractmethod + +from vlm.models.types import ImageType, VlmCapabilities, VlmDetection, VlmMask, VlmPoint + + +class VlmModelBase(ABC): + """Abstract base class for VLM backends.""" + + capabilities = VlmCapabilities() + + @abstractmethod + def query(self, image: ImageType, prompt: str, **kwargs: object) -> str: + """Answer a free-form question about the image.""" + + @abstractmethod + def query_batch(self, images: list[ImageType], prompt: str, **kwargs: object) -> list[str]: + """Answer the same question for multiple images.""" + + @abstractmethod + def caption(self, image: ImageType, **kwargs: object) -> str: + """Generate a natural language description of the image.""" + + @abstractmethod + def detect(self, image: ImageType, query: str, **kwargs: object) -> list[VlmDetection]: + """Return 2D bounding boxes that match the query.""" + + @abstractmethod + def point(self, image: ImageType, query: str, **kwargs: object) -> list[VlmPoint]: + """Return 2D points that match the query.""" + + @abstractmethod + def segment(self, image: ImageType, query: str, **kwargs: object) -> list[VlmMask]: + """Return segmentation masks that match the query.""" diff --git a/vlm/vlm/models/moondream.py b/vlm/vlm/models/moondream.py new file mode 100644 index 0000000..e032c4d --- /dev/null +++ b/vlm/vlm/models/moondream.py @@ -0,0 +1,132 @@ +from contextlib import contextmanager +from dataclasses import dataclass +import logging 
+import os +import socket + +import numpy as np +from PIL import Image as PILImage +import moondream as md # type: ignore[import-untyped] + +from vlm.models.base import VlmModelBase +from vlm.models.types import VlmCapabilities, VlmDetection, VlmMask, VlmPoint + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class MoondreamConfig: + api_key: str | None = None + bgr_input: bool = False + caption_length: str = "normal" + timeout_s: float | None = 8.0 + + +class MoondreamVlm(VlmModelBase): + capabilities = VlmCapabilities(detect=True, point=True, segment=True, caption=True) + + def __init__(self, config: MoondreamConfig | None = None) -> None: + self._config = config or MoondreamConfig() + api_key = self._config.api_key or os.getenv("MOONDREAM_API_KEY") + if not api_key: + raise ValueError("Moondream API key must be provided or set in MOONDREAM_API_KEY") + self._client = md.vl(api_key=api_key) + if self._config.timeout_s is not None: + logger.info("Moondream timeout set to %.1fs", self._config.timeout_s) + + def _to_pil(self, image: np.ndarray) -> PILImage.Image: + if self._config.bgr_input: + image = image[..., ::-1] + return PILImage.fromarray(image) + + def query(self, image: np.ndarray, prompt: str, **kwargs: object) -> str: + pil_image = self._to_pil(image) + with self._timeout(): + result = self._client.query(pil_image, prompt) + return result.get("answer", str(result)) + + def query_batch(self, images: list[np.ndarray], prompt: str, **kwargs: object) -> list[str]: + if not images: + return [] + return [self.query(image, prompt, **kwargs) for image in images] + + def caption(self, image: np.ndarray, **kwargs: object) -> str: + pil_image = self._to_pil(image) + length = kwargs.get("length", self._config.caption_length) + with self._timeout(): + result = self._client.caption(pil_image, length=length) + return result.get("caption", str(result)) + + def detect(self, image: np.ndarray, query: str, **kwargs: object) -> list[VlmDetection]: + pil_image = self._to_pil(image) + with self._timeout(): + result = self._client.detect(pil_image, query) + objects = result.get("objects", []) + + height, width = image.shape[:2] + detections = [] + for obj in objects: + label = obj.get("label", query) + x1 = float(obj.get("x_min", 0.0)) * width + y1 = float(obj.get("y_min", 0.0)) * height + x2 = float(obj.get("x_max", 0.0)) * width + y2 = float(obj.get("y_max", 0.0)) * height + detections.append(VlmDetection(label=label, bbox=(x1, y1, x2, y2))) + return detections + + def point(self, image: np.ndarray, query: str, **kwargs: object) -> list[VlmPoint]: + pil_image = self._to_pil(image) + with self._timeout(): + result = self._client.point(pil_image, query) + points = result.get("points", []) + + height, width = image.shape[:2] + detections = [] + for point in points: + label = point.get("label", query) + x = float(point.get("x", 0.0)) * width + y = float(point.get("y", 0.0)) * height + detections.append(VlmPoint(label=label, point=(x, y))) + return detections + + def segment(self, image: np.ndarray, query: str, **kwargs: object) -> list[VlmMask]: + pil_image = self._to_pil(image) + with self._timeout(): + result = self._client.segment(pil_image, query) + segments = ( + result.get("segments") + or result.get("masks") + or result.get("objects") + or result.get("data") + or [] + ) + if isinstance(segments, dict): + segments = [segments] + + masks = [] + for segment in segments: + if isinstance(segment, str): + masks.append(VlmMask(label=query, mask=None, svg_path=segment, 
format="svg_path")) + continue + if not isinstance(segment, dict): + continue + label = segment.get("label", query) + svg_path = ( + segment.get("mask") + or segment.get("svg_path") + or segment.get("path") + or segment.get("svg") + or "" + ) + masks.append(VlmMask(label=label, mask=None, svg_path=svg_path, format="svg_path")) + return masks + + @contextmanager + def _timeout(self): + prev = socket.getdefaulttimeout() + if self._config.timeout_s is not None: + socket.setdefaulttimeout(self._config.timeout_s) + try: + yield + finally: + socket.setdefaulttimeout(prev) diff --git a/vlm/vlm/models/prompt_fallback.py b/vlm/vlm/models/prompt_fallback.py new file mode 100644 index 0000000..2216215 --- /dev/null +++ b/vlm/vlm/models/prompt_fallback.py @@ -0,0 +1,70 @@ +import json +import logging +from typing import Any + +from vlm.models.base import VlmModelBase +from vlm.models.prompts import detect_prompt, point_prompt, segment_prompt +from vlm.models.types import VlmCapabilities, VlmDetection, VlmMask, VlmPoint + +logger = logging.getLogger(__name__) + + +def _extract_json_payload(text: str) -> str: + if "```" in text: + parts = text.split("```") + if len(parts) >= 3: + return parts[1].strip() + return text.strip() + + +def _parse_json_list(text: str) -> list[Any]: + payload = _extract_json_payload(text) + try: + parsed = json.loads(payload) + except json.JSONDecodeError: + return [] + if isinstance(parsed, list): + return parsed + return [] + + +class PromptFallbackVlm(VlmModelBase): + """Prompt-based skill fallback for models without native detect/point/segment.""" + + capabilities = VlmCapabilities(detect=True, point=True, segment=False) + _segment_warned = False + + def detect(self, image, query: str, **kwargs: object) -> list[VlmDetection]: + response = self.query(image, detect_prompt(query), **kwargs) + detections = [] + for item in _parse_json_list(response): + if not isinstance(item, (list, tuple)) or len(item) < 5: + continue + try: + label = str(item[0]) + x1, y1, x2, y2 = (float(item[1]), float(item[2]), float(item[3]), float(item[4])) + except (TypeError, ValueError): + continue + detections.append(VlmDetection(label=label, bbox=(x1, y1, x2, y2))) + return detections + + def point(self, image, query: str, **kwargs: object) -> list[VlmPoint]: + response = self.query(image, point_prompt(query), **kwargs) + points = [] + for item in _parse_json_list(response): + if not isinstance(item, (list, tuple)) or len(item) < 3: + continue + try: + label = str(item[0]) + x, y = (float(item[1]), float(item[2])) + except (TypeError, ValueError): + continue + points.append(VlmPoint(label=label, point=(x, y))) + return points + + def segment(self, image, query: str, **kwargs: object) -> list[VlmMask]: + if not self._segment_warned: + logger.info("Prompt-based segment is not implemented yet; returning empty list.") + self._segment_warned = True + _ = segment_prompt(query) + return [] diff --git a/vlm/vlm/models/prompts.py b/vlm/vlm/models/prompts.py new file mode 100644 index 0000000..8d6503e --- /dev/null +++ b/vlm/vlm/models/prompts.py @@ -0,0 +1,37 @@ +def detect_prompt(query: str) -> str: + return f"""Return 2D bounding boxes in pixels for: "{query}" + +Output only JSON in this exact format: +[ + ["label1", x1, y1, x2, y2], + ["label2", x1, y1, x2, y2] +] + +If no matches, return []. 
+""" + + +def point_prompt(query: str) -> str: + return f"""Return point coordinates in pixels for: "{query}" + +Output only JSON in this exact format: +[ + ["label1", x, y], + ["label2", x, y] +] + +If no matches, return []. +""" + + +def segment_prompt(query: str) -> str: + return f"""Return segmentation masks for: "{query}" + +Output only JSON in this exact format: +[ + ["label1", "svg_path_string"], + ["label2", "svg_path_string"] +] + +If no matches, return []. +""" diff --git a/vlm/vlm/models/qwen.py b/vlm/vlm/models/qwen.py new file mode 100644 index 0000000..34ff715 --- /dev/null +++ b/vlm/vlm/models/qwen.py @@ -0,0 +1,88 @@ +from dataclasses import dataclass +import base64 +from io import BytesIO +import os + +import numpy as np +from openai import OpenAI +from PIL import Image as PILImage + +from vlm.models.prompt_fallback import PromptFallbackVlm +from vlm.models.types import VlmCapabilities + + +@dataclass(frozen=True) +class QwenConfig: + model_name: str = "qwen2.5-vl-72b-instruct" + base_url: str = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1" + api_key: str | None = None + bgr_input: bool = False + + +class QwenVlm(PromptFallbackVlm): + capabilities = VlmCapabilities(detect=True, point=True, segment=False, caption=True) + + def __init__(self, config: QwenConfig | None = None) -> None: + self._config = config or QwenConfig() + api_key = ( + self._config.api_key + or os.getenv("QWEN_API_KEY") + or os.getenv("ALIBABA_API_KEY") + ) + if not api_key: + raise ValueError("Qwen API key must be set in QWEN_API_KEY or ALIBABA_API_KEY") + self._client = OpenAI(base_url=self._config.base_url, api_key=api_key) + + def _image_to_base64(self, image: np.ndarray) -> str: + if self._config.bgr_input: + image = image[..., ::-1] + pil_image = PILImage.fromarray(image) + buffer = BytesIO() + pil_image.save(buffer, format="PNG") + return base64.b64encode(buffer.getvalue()).decode("utf-8") + + def query(self, image: np.ndarray, prompt: str, **kwargs: object) -> str: + img_base64 = self._image_to_base64(image) + response = self._client.chat.completions.create( + model=self._config.model_name, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{img_base64}"}, + }, + {"type": "text", "text": prompt}, + ], + } + ], + ) + return response.choices[0].message.content or "" + + def query_batch(self, images: list[np.ndarray], prompt: str, **kwargs: object) -> list[str]: + if not images: + return [] + content = [ + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{self._image_to_base64(img)}"}, + } + for img in images + ] + content.append({"type": "text", "text": prompt}) + response = self._client.chat.completions.create( + model=self._config.model_name, + messages=[{"role": "user", "content": content}], + ) + response_text = response.choices[0].message.content or "" + return [response_text] * len(images) + + def caption(self, image: np.ndarray, **kwargs: object) -> str: + return self.query(image, "Describe this image concisely.", **kwargs) + + def detect(self, image: np.ndarray, query: str, **kwargs: object): + return super().detect(image, query, **kwargs) + + def point(self, image: np.ndarray, query: str, **kwargs: object): + return super().point(image, query, **kwargs) diff --git a/vlm/vlm/models/types.py b/vlm/vlm/models/types.py new file mode 100644 index 0000000..34178e8 --- /dev/null +++ b/vlm/vlm/models/types.py @@ -0,0 +1,44 @@ +from dataclasses import dataclass +from typing import TypeAlias + 
+import numpy as np + +ImageType: TypeAlias = np.ndarray + + +@dataclass(frozen=True) +class VlmDetection: + """Single 2D detection output in pixel coordinates.""" + + label: str + bbox: tuple[float, float, float, float] # (x1, y1, x2, y2) + + +@dataclass(frozen=True) +class VlmPoint: + """Single 2D point output in pixel coordinates.""" + + label: str + point: tuple[float, float] # (x, y) + + +@dataclass(frozen=True) +class VlmMask: + """Segmentation output as a binary mask with optional SVG path.""" + + label: str + mask: np.ndarray | None = None + svg_path: str | None = None + format: str = "mono8" + + +@dataclass(frozen=True) +class VlmCapabilities: + """Capability flags for a VLM backend.""" + + query: bool = True + query_batch: bool = True + caption: bool = True + detect: bool = False + point: bool = False + segment: bool = False
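+
+
+# Illustrative usage (not part of the dataclass API): callers inspect these flags
+# before dispatching a skill, e.g.
+#
+#     if model.capabilities.detect:
+#         detections = model.detect(image, "fridge")
+#     else:
+#         detections = []  # or route through a prompt-based fallback
+#
+# which is how test_vlm.py decides whether to exercise detect/point/segment.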