From bceadd33cf2c772175cf3e8552c9be0eabf94b3a Mon Sep 17 00:00:00 2001
From: y-veys <yveys@mit.edu>
Date: Wed, 25 Mar 2026 18:07:46 -0400
Subject: [PATCH 1/8] Change YOLOWorld to use YOLOE-seg detector

---
 spot_tools/src/spot_skills/detection_utils.py     | 4 ++--
 spot_tools/src/spot_skills/grasp_utils.py         | 5 ++---
 spot_tools_ros/examples/test_spot_executor_ros.py | 7 +++++--
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/spot_tools/src/spot_skills/detection_utils.py b/spot_tools/src/spot_skills/detection_utils.py
index 392fb63..59e6259 100644
--- a/spot_tools/src/spot_skills/detection_utils.py
+++ b/spot_tools/src/spot_skills/detection_utils.py
@@ -2,7 +2,7 @@
 
 import cv2
 import numpy as np
-from ultralytics import YOLOWorld
+from ultralytics import YOLOE
 
 
 class Detector:
@@ -18,7 +18,7 @@ def __init__(self, spot, yolo_world_path):
         if not yolo_world_path:
             raise ValueError("YOLOWorld model path must be provided.")
 
-        self.yolo_model = YOLOWorld(yolo_world_path)
+        self.yolo_model = YOLOE(yolo_world_path)
         custom_classes = ["", "bag", "cone", "pipe"]
         self.yolo_model.set_classes(custom_classes)
         print("Set classes for YOLOWorld model.")
diff --git a/spot_tools/src/spot_skills/grasp_utils.py b/spot_tools/src/spot_skills/grasp_utils.py
index aa31737..3376f51 100644
--- a/spot_tools/src/spot_skills/grasp_utils.py
+++ b/spot_tools/src/spot_skills/grasp_utils.py
@@ -26,7 +26,6 @@
 from bosdyn.client.robot_command import RobotCommandBuilder, block_until_arm_arrives
 
 from spot_skills.arm_utils import (
-    arm_to_drop,
     close_gripper,
     open_gripper,
     stow_arm,
@@ -116,7 +115,7 @@ def object_place(spot, semantic_class="bag", position=None):
     time.sleep(0.25)
     # arm_to_carry(spot)
     # stow_arm(spot)
-    arm_to_drop(spot)
+    # arm_to_drop(spot)
 
     # odom_T_task = get_root_T_ground_body(
     #    robot_state=spot.get_state(), root_frame_name=ODOM_FRAME_NAME
@@ -153,7 +152,7 @@ def object_place(spot, semantic_class="bag", position=None):
     execute_recovery_action(
         spot,
         recover_arm=False,
-        relative_pose=math_helpers.SE2Pose(x=0.0, y=1.0, angle=0),
+        relative_pose=math_helpers.SE2Pose(x=0.0, y=-1.0, angle=0),
     )
     time.sleep(1)
     return True
diff --git a/spot_tools_ros/examples/test_spot_executor_ros.py b/spot_tools_ros/examples/test_spot_executor_ros.py
index d6f54c9..9782c97 100644
--- a/spot_tools_ros/examples/test_spot_executor_ros.py
+++ b/spot_tools_ros/examples/test_spot_executor_ros.py
@@ -47,7 +47,7 @@ def __init__(self):
         # )
 
         pick_cmd = Pick(
-            "hamilton/odom", "bag", np.array([5.0, 5, 0]), np.array([7.0, 7, 0])
+            "hamilton/odom", "cone", np.array([5.0, 5, 0]), np.array([7.0, 7, 0]), ""
         )
 
         path = np.array(
@@ -60,12 +60,15 @@ def __init__(self):
         second_follow_cmd = Follow("hamilton/odom", path)
 
         place_cmd = Place(
-            "hamilton/odom", "bag", np.array([5.0, 5, 0]), np.array([7.0, 7, 0])
+            "hamilton/odom", "cone", np.array([5.0, 5, 0]), np.array([7.0, 7, 0]), ""
         )
 
         seq = ActionSequence(
             "id0", "spot", [follow_cmd, pick_cmd, second_follow_cmd, place_cmd]
         )
+        # seq = ActionSequence(
+        #     "id0", "spot", [follow_cmd, pick_cmd]
+        # )
 
         publisher.publish(to_msg(seq))
         viz_publisher.publish(to_viz_msg(seq, "planner_ns"))

From 5da3d1fedfaa3b821e2b55ed9ff341f4a8f32a72 Mon Sep 17 00:00:00 2001
From: Jake Arkin <jarkin@mit.edu>
Date: Fri, 27 Mar 2026 14:07:19 -0400
Subject: [PATCH 2/8] Update manipulation approval request process

---
 spot_tools/src/spot_skills/detection_utils.py | 38 ++++++----
 spot_tools/src/spot_skills/grasp_utils.py     | 46 +++++++------
 .../src/spot_tools_ros/spot_executor_ros.py   | 69 ++++++++++++++-----
 3 files changed, 100 insertions(+), 53 deletions(-)

diff --git a/spot_tools/src/spot_skills/detection_utils.py b/spot_tools/src/spot_skills/detection_utils.py
index 59e6259..be19eb3 100644
--- a/spot_tools/src/spot_skills/detection_utils.py
+++ b/spot_tools/src/spot_skills/detection_utils.py
@@ -39,19 +39,25 @@ def set_up_detector(self, semantic_class):
             print(f"Updated recognized classes: {updated_classes}")
 
     def return_centroid(self, img_source, semantic_class, debug):
-        image, img = self.spot.get_image_RGB(view=img_source)
+        # Primary image
+        primary_image, primary_img = self.spot.get_image_RGB(view=img_source)
+        primary_xy = self._get_centroid(primary_img, semantic_class, rotate=0, debug=debug)
 
-        xy = self._get_centroid(img, semantic_class, rotate=0, debug=debug)
-        if xy is None:
-            print("Object not found in first image. Looking around!")
-            xy, image, img, image_source = self._look_for_object(
-                semantic_class, debug=debug
-            )
+        # Always scan all other cameras
+        other_candidates, other_detection_index = self._look_for_object(semantic_class, debug=debug)
 
-            if xy is None:
-                print("Object not found near robot")
+        # Combine: primary first
+        candidates = [(primary_image, primary_img, primary_xy)] + other_candidates
 
-        return xy, image, img
+        if primary_xy is not None:
+            detection_index = 0
+        elif other_detection_index is not None:
+            detection_index = other_detection_index + 1  # offset for prepended primary
+        else:
+            detection_index = None
+            print("Object not found in any camera")
+
+        return detection_index, candidates
 
     def _get_centroid(self, img, semantic_class, rotate, debug):
         if rotate == 0:
@@ -118,6 +124,9 @@ def _get_centroid(self, img, semantic_class, rotate, debug):
     def _look_for_object(self, semantic_class, debug):
         sources = self.spot.image_client.list_image_sources()
 
+        candidates = []  # list of (bosdyn_image, cv2_img, xy_or_None)
+        detection_index = None
+
         for source in sources:
             if (
                 "depth" in source.name or source.name == "hand_image"
@@ -143,11 +152,12 @@ def _look_for_object(self, semantic_class, debug):
             print("Found object centroid:", xy)
             if xy is None:
                 print(f"Object not found in {image_source}.")
-                continue
-            else:
-                return xy, image, img, image_source
 
-        return None, None, None, None
+            candidates.append((image, img, xy))
+            if xy is not None and detection_index is None:
+                detection_index = len(candidates) - 1
+
+        return candidates, detection_index
 
 
 class SemanticDetector(Detector):
diff --git a/spot_tools/src/spot_skills/grasp_utils.py b/spot_tools/src/spot_skills/grasp_utils.py
index 3376f51..cc13f4f 100644
--- a/spot_tools/src/spot_skills/grasp_utils.py
+++ b/spot_tools/src/spot_skills/grasp_utils.py
@@ -187,17 +187,18 @@ def object_grasp(
     # Set up the detector (e.g., for YOLOWorld, this may mean updating recognized classes)
     detector.set_up_detector(semantic_class)
 
+    candidates = None
     while attempts < 2 and not success:
         attempts += 1
 
         if not user_input:
             # Try to get the centroid using the detector passed into the function.
-            xy, image, img = detector.return_centroid(
+            detection_index, candidates = detector.return_centroid(
                 image_source, semantic_class, debug=debug
             )
 
             # If the detector fails to return the centroid, then try again until max_attempts
-            if xy is None:
+            if detection_index is None:
                 continue
 
             else:
@@ -206,9 +207,11 @@ def object_grasp(
         else:
             image, img = spot.get_image_RGB(view=image_source)
             xy = get_user_grasp_input(spot, img)
+            candidates = [(image, img, xy)]
+            detection_index = 0
             print("Found object centroid:", xy)
 
-    if xy is None:
+    if detection_index is None or candidates is None:
         if feedback is not None:
             feedback.print(
                 "INFO",
@@ -223,27 +226,28 @@ def object_grasp(
         )
         time.sleep(1)
         return False
-        # execute_recovery_action(spot, recover_arm=True)
-        # spot.sit()
-        # raise Exception(
-        #     "Failed to find an object in any cameras after 2 attempts. Please check the detector or user input."
-        # )
 
-    # If xy is not None, then display the annotated image
-    else:
-        if feedback is not None:
-            annotated_img = copy(img)
+    image, _, xy = candidates[detection_index]
 
-            response = feedback.bounding_box_detection_feedback(
-                annotated_img,
-                xy[0],
-                xy[1],
-                semantic_class,
-            )
+    # Display all candidate images in the approval panel
+    if feedback is not None:
+        annotated_imgs = [copy(img) for (_, img, _) in candidates]
+
+        approved, updated_xy, selected_index = feedback.bounding_box_detection_feedback(
+            annotated_imgs,
+            detection_index,
+            xy[0] if xy else None,
+            xy[1] if xy else None,
+            semantic_class,
+        )
+
+        if approved is not None and not approved:
+            feedback.print("INFO", "User requested abort.")
+            return False
 
-            if response is not None and not response:
-                feedback.print("INFO", "User requested abort.")
-                return False
+        # Use selected camera image and pixel (panel always sends the correct selection)
+        image, _, _ = candidates[selected_index]
+        xy = updated_xy
 
     pick_vec = geometry_pb2.Vec2(x=xy[0], y=xy[1])
     stow_arm(spot)
diff --git a/spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py b/spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py
index 8b7dc0b..5981ea9 100755
--- a/spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py
+++ b/spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py
@@ -36,6 +36,8 @@
 from spot_tools_ros.occupancy_grid_ros_updater import OccupancyGridROSUpdater
 from spot_tools_ros.utils import get_tf_pose, waypoints_to_path
 
+from nlu_interface_rviz.msg import ManipulationApprovalRequest, ManipulationApprovalResponse
+
 
 def load_inverse_semantic_id_map_from_label_space(fn):
     with open(fn, "r") as fo:
@@ -77,7 +79,11 @@ def build_markers(pts, namespaces, frames, colors):
 class RosFeedbackCollector:
     def __init__(self, odom_frame: str, output_dir: str):
         self.pick_confirmation_event = threading.Event()
-        self.pick_confirmation_response = False
+        #self.pick_confirmation_response = False
+
+        self.pick_confirmation_approved = False
+        self.pick_confirmation_xy = [0,0]
+        self.pick_confirmation_image_index = 0
 
         self.break_out_of_waiting_loop = False
         self.odom_frame = odom_frame
@@ -85,11 +91,13 @@ def __init__(self, odom_frame: str, output_dir: str):
         self.output_dir = output_dir
 
     def bounding_box_detection_feedback(
-        self, annotated_img, centroid_x, centroid_y, semantic_class
+        self, annotated_imgs, detection_index, centroid_x, centroid_y, semantic_class
     ):
         bridge = CvBridge()
 
-        if centroid_x is not None and centroid_y is not None:
+        # Annotate only the detection image
+        if detection_index is not None and centroid_x is not None and centroid_y is not None:
+            annotated_img = annotated_imgs[detection_index]
             label = f"{semantic_class}"
             cv2.putText(
                 annotated_img,
@@ -100,8 +108,6 @@ def bounding_box_detection_feedback(
                 (0, 0, 225),
                 20,
             )
-
-            # Label the centroid
             cv2.drawMarker(
                 annotated_img,
                 (centroid_x, centroid_y),
@@ -111,8 +117,12 @@ def bounding_box_detection_feedback(
                 thickness=30,
             )
 
-        annotated_img_msg = bridge.cv2_to_imgmsg(annotated_img, encoding="passthrough")
-        self.annotated_img_pub.publish(annotated_img_msg)
+        request_msg = ManipulationApprovalRequest()
+        request_msg.images = [bridge.cv2_to_imgmsg(img, encoding="passthrough") for img in annotated_imgs]
+        request_msg.detection_image_index = detection_index if detection_index is not None else 0
+        request_msg.image_x = centroid_x if centroid_x is not None else 0
+        request_msg.image_y = centroid_y if centroid_y is not None else 0
+        self.annotated_img_pub.publish(request_msg)
 
         self.pick_confirmation_event.clear()
 
@@ -126,10 +136,13 @@ def bounding_box_detection_feedback(
 
         if self.break_out_of_waiting_loop:
             self.logger.info("ROBOT WAS PREEMPTED")
-            self.pick_confirmation_response = False
+            self.pick_confirmation_approved = False
+        else:
+            self.logger.info(f"Pick Confirmation Response Received: approved ({self.pick_confirmation_approved}), xy ({self.pick_confirmation_xy}), image_index ({self.pick_confirmation_image_index})")
 
-        # This boolean determines whether the executor keeps going
-        return self.pick_confirmation_response
+        return (self.pick_confirmation_approved,
+                self.pick_confirmation_xy,
+                self.pick_confirmation_image_index)
 
     def pick_image_feedback(self, semantic_image, mask_image):
         bridge = CvBridge()
@@ -218,13 +231,20 @@ def register_publishers(self, node):
         )
 
         self.annotated_img_pub = node.create_publisher(
-            Image, "~/annotated_image", qos_profile=latching_qos
+            ManipulationApprovalRequest, "~/annotated_image", qos_profile=latching_qos
         )
 
         self.lease_takeover_publisher = node.create_publisher(String, "~/takeover", 10)
 
+        #node.create_subscription(
+        #    Bool,
+        #    "~/pick_confirmation",
+        #    self.pick_confirmation_callback,
+        #    10,
+        #)
+
         node.create_subscription(
-            Bool,
+            ManipulationApprovalResponse,
             "~/pick_confirmation",
             self.pick_confirmation_callback,
             10,
@@ -259,13 +279,26 @@ def set_robot_holding_state(self, is_holding: bool, object_id: str, timeout=5):
         return future.result().success
 
     def pick_confirmation_callback(self, msg):
-        if msg.data:
-            self.logger.info("Detection is valid. Continuing pick action!")
-            self.pick_confirmation_response = True
-        else:
+        #if msg.data:
+        #    self.logger.info("Detection is valid. Continuing pick action!")
+        #    self.pick_confirmation_response = True
+        #else:
+        #    self.logger.warn("Detection is invalid. Discontinuing pick action.")
+        #    self.pick_confirmation_response = False
+
+        #self.pick_confirmation_event.set()
+
+        # If not approved, discontinue
+        # If approved, check whether the detection is overwritten
+        if not msg.approve:
             self.logger.warn("Detection is invalid. Discontinuing pick action.")
-            self.pick_confirmation_response = False
-
+            self.pick_confirmation_approved = False
+        else:
+          self.pick_confirmation_approved = True
+          self.pick_confirmation_image_index = msg.image_index
+          self.pick_confirmation_xy[0] = msg.image_x
+          self.pick_confirmation_xy[1] = msg.image_y
+          self.logger.warn("Detection is valid. Continuing pick action!")
         self.pick_confirmation_event.set()
 
     def log_lease_takeover(self, event: str):

From 4154c798d13324bdfe59409d6bf9f16943b1bc3c Mon Sep 17 00:00:00 2001
From: Jake Arkin <jarkin@mit.edu>
Date: Mon, 30 Mar 2026 17:05:36 -0400
Subject: [PATCH 3/8] Remove annotation from image before publishing; Request
 approval even when no detections are found

---
 spot_tools/src/spot_skills/grasp_utils.py     | 30 +++++++++++++++----
 .../src/spot_tools_ros/spot_executor_ros.py   | 23 +-------------
 2 files changed, 25 insertions(+), 28 deletions(-)

diff --git a/spot_tools/src/spot_skills/grasp_utils.py b/spot_tools/src/spot_skills/grasp_utils.py
index cc13f4f..569c679 100644
--- a/spot_tools/src/spot_skills/grasp_utils.py
+++ b/spot_tools/src/spot_skills/grasp_utils.py
@@ -211,11 +211,11 @@ def object_grasp(
             detection_index = 0
             print("Found object centroid:", xy)
 
-    if detection_index is None or candidates is None:
+    if candidates is None:
         if feedback is not None:
             feedback.print(
                 "INFO",
-                "Failed to find an object in any cameras after 2 attempts. Please check the detector or user input.",
+                "Failed to capture any camera images. Please check the detector or user input.",
             )
         execute_recovery_action(
             spot,
@@ -227,17 +227,23 @@ def object_grasp(
         time.sleep(1)
         return False
 
-    image, _, xy = candidates[detection_index]
-
     # Display all candidate images in the approval panel
     if feedback is not None:
         annotated_imgs = [copy(img) for (_, img, _) in candidates]
 
+        if detection_index is not None:
+            _, _, xy = candidates[detection_index]
+            det_x = xy[0] if xy else None
+            det_y = xy[1] if xy else None
+        else:
+            det_x = None
+            det_y = None
+
         approved, updated_xy, selected_index = feedback.bounding_box_detection_feedback(
             annotated_imgs,
             detection_index,
-            xy[0] if xy else None,
-            xy[1] if xy else None,
+            det_x,
+            det_y,
             semantic_class,
         )
 
@@ -248,6 +254,18 @@ def object_grasp(
         # Use selected camera image and pixel (panel always sends the correct selection)
         image, _, _ = candidates[selected_index]
         xy = updated_xy
+    else:
+        if detection_index is None:
+            execute_recovery_action(
+                spot,
+                recover_arm=False,
+                relative_pose=math_helpers.SE2Pose(
+                    x=0.0, y=0.0, angle=np.random.choice([-0.5, 0.5])
+                ),
+            )
+            time.sleep(1)
+            return False
+        image, _, xy = candidates[detection_index]
 
     pick_vec = geometry_pb2.Vec2(x=xy[0], y=xy[1])
     stow_arm(spot)
diff --git a/spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py b/spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py
index 5981ea9..2784b49 100755
--- a/spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py
+++ b/spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py
@@ -95,30 +95,9 @@ def bounding_box_detection_feedback(
     ):
         bridge = CvBridge()
 
-        # Annotate only the detection image
-        if detection_index is not None and centroid_x is not None and centroid_y is not None:
-            annotated_img = annotated_imgs[detection_index]
-            label = f"{semantic_class}"
-            cv2.putText(
-                annotated_img,
-                label,
-                (centroid_x - 100, centroid_y - 200),
-                cv2.FONT_HERSHEY_SIMPLEX,
-                5,
-                (0, 0, 225),
-                20,
-            )
-            cv2.drawMarker(
-                annotated_img,
-                (centroid_x, centroid_y),
-                (0, 0, 255),
-                markerType=cv2.MARKER_TILTED_CROSS,
-                markerSize=200,
-                thickness=30,
-            )
-
         request_msg = ManipulationApprovalRequest()
         request_msg.images = [bridge.cv2_to_imgmsg(img, encoding="passthrough") for img in annotated_imgs]
+        request_msg.has_detection = detection_index is not None
         request_msg.detection_image_index = detection_index if detection_index is not None else 0
         request_msg.image_x = centroid_x if centroid_x is not None else 0
         request_msg.image_y = centroid_y if centroid_y is not None else 0

From 7f7c940d464a86cf17e78d7bc7b128551f264b74 Mon Sep 17 00:00:00 2001
From: Jake Arkin <jarkin@mit.edu>
Date: Tue, 31 Mar 2026 17:44:59 -0400
Subject: [PATCH 4/8] Only check secondary image sources if no primary
 detection found

---
 spot_tools/src/spot_skills/detection_utils.py | 54 +++++++++++++------
 spot_tools/src/spot_skills/grasp_utils.py     | 15 +++---
 2 files changed, 45 insertions(+), 24 deletions(-)

diff --git a/spot_tools/src/spot_skills/detection_utils.py b/spot_tools/src/spot_skills/detection_utils.py
index be19eb3..29b52bb 100644
--- a/spot_tools/src/spot_skills/detection_utils.py
+++ b/spot_tools/src/spot_skills/detection_utils.py
@@ -1,10 +1,20 @@
 from copy import copy
+from dataclasses import dataclass
+from typing import Optional
 
 import cv2
 import numpy as np
+from bosdyn.api import image_pb2
 from ultralytics import YOLOE
 
 
+@dataclass
+class DetectionCandidate:
+    """A candidate image from a single camera source for object detection."""
+    bosdyn_image: image_pb2.ImageResponse
+    cv2_image: np.ndarray
+    detection_xy: Optional[tuple[int, int]]
+
 class Detector:
     def __init__(self, spot):
         self.spot = spot
@@ -39,25 +49,26 @@ def set_up_detector(self, semantic_class):
             print(f"Updated recognized classes: {updated_classes}")
 
     def return_centroid(self, img_source, semantic_class, debug):
-        # Primary image
-        primary_image, primary_img = self.spot.get_image_RGB(view=img_source)
-        primary_xy = self._get_centroid(primary_img, semantic_class, rotate=0, debug=debug)
-
-        # Always scan all other cameras
-        other_candidates, other_detection_index = self._look_for_object(semantic_class, debug=debug)
+        # List of DetectionCandidates
+        detection_candidates = [] # List of DetectionCandidates -- one per camera source on Spot
+        detection_index = None
 
-        # Combine: primary first
-        candidates = [(primary_image, primary_img, primary_xy)] + other_candidates
+        # Get the primary image source & run the detector; Add to detection_candidates
+        primary_bosdyn_image, primary_cv2_image = self.spot.get_image_RGB(view=img_source)
+        primary_xy = self._get_centroid(primary_cv2_image, semantic_class, rotate=0, debug=debug)
+        detection_candidates.append(DetectionCandidate(primary_bosdyn_image, primary_cv2_image, primary_xy))
 
+        # If primary image has a detection, just get the images from the other image sources; Otherwise, look for detections
         if primary_xy is not None:
-            detection_index = 0
-        elif other_detection_index is not None:
-            detection_index = other_detection_index + 1  # offset for prepended primary
+          detection_index = 0
+          secondary_candidates = self._get_candidate_detection_images()
         else:
-            detection_index = None
-            print("Object not found in any camera")
-
-        return detection_index, candidates
+          secondary_candidates, secondary_detection_index = self._look_for_object(semantic_class, debug=debug)
+          if secondary_detection_index is not None:
+            detection_index = secondary_detection_index + 1 # offset of the primary candidate
+        # Collect all of the detection candidates
+        detection_candidates.extend(secondary_candidates)
+        return detection_index, detection_candidates
 
     def _get_centroid(self, img, semantic_class, rotate, debug):
         if rotate == 0:
@@ -121,10 +132,19 @@ def _get_centroid(self, img, semantic_class, rotate, debug):
         else:
             return None
 
+    def _get_candidate_detection_images(self):
+      detection_candidates = []
+      sources = self.spot.image_client.list_image_sources()
+      sources = [s for s in sources if not "depth" in s.name and "hand_image" != s.name]
+      for source in sources:
+        bosdyn_image, cv2_image = self.spot.get_image_RGB(view=source.name)
+        detection_candidates.append(DetectionCandidate(bosdyn_image, cv2_image, None))
+      return detection_candidates
+
     def _look_for_object(self, semantic_class, debug):
         sources = self.spot.image_client.list_image_sources()
 
-        candidates = []  # list of (bosdyn_image, cv2_img, xy_or_None)
+        candidates = []
         detection_index = None
 
         for source in sources:
@@ -153,7 +173,7 @@ def _look_for_object(self, semantic_class, debug):
             if xy is None:
                 print(f"Object not found in {image_source}.")
 
-            candidates.append((image, img, xy))
+            candidates.append(DetectionCandidate(image, img, xy))
             if xy is not None and detection_index is None:
                 detection_index = len(candidates) - 1
 
diff --git a/spot_tools/src/spot_skills/grasp_utils.py b/spot_tools/src/spot_skills/grasp_utils.py
index 569c679..ba26a60 100644
--- a/spot_tools/src/spot_skills/grasp_utils.py
+++ b/spot_tools/src/spot_skills/grasp_utils.py
@@ -30,7 +30,7 @@
     open_gripper,
     stow_arm,
 )
-from spot_skills.detection_utils import Detector
+from spot_skills.detection_utils import DetectionCandidate, Detector
 from spot_skills.primitives import execute_recovery_action
 
 g_image_click = None
@@ -207,7 +207,7 @@ def object_grasp(
         else:
             image, img = spot.get_image_RGB(view=image_source)
             xy = get_user_grasp_input(spot, img)
-            candidates = [(image, img, xy)]
+            candidates = [DetectionCandidate(image, img, xy)]
             detection_index = 0
             print("Found object centroid:", xy)
 
@@ -229,10 +229,10 @@ def object_grasp(
 
     # Display all candidate images in the approval panel
     if feedback is not None:
-        annotated_imgs = [copy(img) for (_, img, _) in candidates]
+        cv2_images = [copy(c.cv2_image) for c in candidates]
 
         if detection_index is not None:
-            _, _, xy = candidates[detection_index]
+            xy = candidates[detection_index].detection_xy
             det_x = xy[0] if xy else None
             det_y = xy[1] if xy else None
         else:
@@ -240,7 +240,7 @@ def object_grasp(
             det_y = None
 
         approved, updated_xy, selected_index = feedback.bounding_box_detection_feedback(
-            annotated_imgs,
+            cv2_images,
             detection_index,
             det_x,
             det_y,
@@ -252,7 +252,7 @@ def object_grasp(
             return False
 
         # Use selected camera image and pixel (panel always sends the correct selection)
-        image, _, _ = candidates[selected_index]
+        image = candidates[selected_index].bosdyn_image
         xy = updated_xy
     else:
         if detection_index is None:
@@ -265,7 +265,8 @@ def object_grasp(
             )
             time.sleep(1)
             return False
-        image, _, xy = candidates[detection_index]
+        image = candidates[detection_index].bosdyn_image
+        xy = candidates[detection_index].detection_xy
 
     pick_vec = geometry_pb2.Vec2(x=xy[0], y=xy[1])
     stow_arm(spot)

From f7e639d9dc63b675b9a8fd8467c3305b33a1eb7c Mon Sep 17 00:00:00 2001
From: Jake Arkin <jarkin@mit.edu>
Date: Tue, 31 Mar 2026 17:44:59 -0400
Subject: [PATCH 5/8] Only check secondary image sources if no primary
 detection found

---
 spot_tools/src/spot_skills/detection_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spot_tools/src/spot_skills/detection_utils.py b/spot_tools/src/spot_skills/detection_utils.py
index 29b52bb..2c44f58 100644
--- a/spot_tools/src/spot_skills/detection_utils.py
+++ b/spot_tools/src/spot_skills/detection_utils.py
@@ -49,7 +49,6 @@ def set_up_detector(self, semantic_class):
             print(f"Updated recognized classes: {updated_classes}")
 
     def return_centroid(self, img_source, semantic_class, debug):
-        # List of DetectionCandidates
         detection_candidates = [] # List of DetectionCandidates -- one per camera source on Spot
         detection_index = None
 

From c6a0530f1431abe8929c035e71c5bb71b06e56c9 Mon Sep 17 00:00:00 2001
From: Jake Arkin <jarkin@mit.edu>
Date: Tue, 31 Mar 2026 17:59:47 -0400
Subject: [PATCH 6/8] Update topic name for manipulation approval

---
 spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py b/spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py
index 32f8466..c2fe588 100755
--- a/spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py
+++ b/spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py
@@ -91,17 +91,17 @@ def __init__(self, odom_frame: str, output_dir: str):
         self.output_dir = output_dir
 
     def bounding_box_detection_feedback(
-        self, annotated_imgs, detection_index, centroid_x, centroid_y, semantic_class
+        self, detection_imgs, detection_index, centroid_x, centroid_y, semantic_class
     ):
         bridge = CvBridge()
 
         request_msg = ManipulationApprovalRequest()
-        request_msg.images = [bridge.cv2_to_imgmsg(img, encoding="passthrough") for img in annotated_imgs]
+        request_msg.images = [bridge.cv2_to_imgmsg(img, encoding="passthrough") for img in detection_imgs]
         request_msg.has_detection = detection_index is not None
         request_msg.detection_image_index = detection_index if detection_index is not None else 0
         request_msg.image_x = centroid_x if centroid_x is not None else 0
         request_msg.image_y = centroid_y if centroid_y is not None else 0
-        self.annotated_img_pub.publish(request_msg)
+        self.detection_img_pub.publish(request_msg)
 
         self.pick_confirmation_event.clear()
 
@@ -209,8 +209,8 @@ def register_publishers(self, node):
             MarkerArray, "~/mlp_target_publisher", qos_profile=latching_qos
         )
 
-        self.annotated_img_pub = node.create_publisher(
-            ManipulationApprovalRequest, "~/annotated_image", qos_profile=latching_qos
+        self.detection_img_pub = node.create_publisher(
+            ManipulationApprovalRequest, "~/manipulation_request", qos_profile=latching_qos
         )
 
         self.lease_takeover_publisher = node.create_publisher(String, "~/takeover", 10)

From b55efd3b2f7e14d9ca7a0ced386aa42a6bdaa6e1 Mon Sep 17 00:00:00 2001
From: Jake Arkin <jarkin@mit.edu>
Date: Wed, 1 Apr 2026 10:50:42 -0400
Subject: [PATCH 7/8] Apply formatting

---
 spot_tools/src/spot_skills/detection_utils.py | 48 ++++++++++------
 .../src/spot_tools_ros/spot_executor_ros.py   | 57 +++++++++++--------
 2 files changed, 66 insertions(+), 39 deletions(-)

diff --git a/spot_tools/src/spot_skills/detection_utils.py b/spot_tools/src/spot_skills/detection_utils.py
index 2c44f58..5e84744 100644
--- a/spot_tools/src/spot_skills/detection_utils.py
+++ b/spot_tools/src/spot_skills/detection_utils.py
@@ -11,10 +11,12 @@
 @dataclass
 class DetectionCandidate:
     """A candidate image from a single camera source for object detection."""
+
     bosdyn_image: image_pb2.ImageResponse
     cv2_image: np.ndarray
     detection_xy: Optional[tuple[int, int]]
 
+
 class Detector:
     def __init__(self, spot):
         self.spot = spot
@@ -49,22 +51,32 @@ def set_up_detector(self, semantic_class):
             print(f"Updated recognized classes: {updated_classes}")
 
     def return_centroid(self, img_source, semantic_class, debug):
-        detection_candidates = [] # List of DetectionCandidates -- one per camera source on Spot
+        detection_candidates = []  # List of DetectionCandidates -- one per camera source on Spot
         detection_index = None
 
         # Get the primary image source & run the detector; Add to detection_candidates
-        primary_bosdyn_image, primary_cv2_image = self.spot.get_image_RGB(view=img_source)
-        primary_xy = self._get_centroid(primary_cv2_image, semantic_class, rotate=0, debug=debug)
-        detection_candidates.append(DetectionCandidate(primary_bosdyn_image, primary_cv2_image, primary_xy))
+        primary_bosdyn_image, primary_cv2_image = self.spot.get_image_RGB(
+            view=img_source
+        )
+        primary_xy = self._get_centroid(
+            primary_cv2_image, semantic_class, rotate=0, debug=debug
+        )
+        detection_candidates.append(
+            DetectionCandidate(primary_bosdyn_image, primary_cv2_image, primary_xy)
+        )
 
         # If primary image has a detection, just get the images from the other image sources; Otherwise, look for detections
         if primary_xy is not None:
-          detection_index = 0
-          secondary_candidates = self._get_candidate_detection_images()
+            detection_index = 0
+            secondary_candidates = self._get_candidate_detection_images()
         else:
-          secondary_candidates, secondary_detection_index = self._look_for_object(semantic_class, debug=debug)
-          if secondary_detection_index is not None:
-            detection_index = secondary_detection_index + 1 # offset of the primary candidate
+            secondary_candidates, secondary_detection_index = self._look_for_object(
+                semantic_class, debug=debug
+            )
+            if secondary_detection_index is not None:
+                detection_index = (
+                    secondary_detection_index + 1
+                )  # offset of the primary candidate
         # Collect all of the detection candidates
         detection_candidates.extend(secondary_candidates)
         return detection_index, detection_candidates
@@ -132,13 +144,17 @@ def _get_centroid(self, img, semantic_class, rotate, debug):
             return None
 
     def _get_candidate_detection_images(self):
-      detection_candidates = []
-      sources = self.spot.image_client.list_image_sources()
-      sources = [s for s in sources if not "depth" in s.name and "hand_image" != s.name]
-      for source in sources:
-        bosdyn_image, cv2_image = self.spot.get_image_RGB(view=source.name)
-        detection_candidates.append(DetectionCandidate(bosdyn_image, cv2_image, None))
-      return detection_candidates
+        detection_candidates = []
+        sources = self.spot.image_client.list_image_sources()
+        sources = [
+            s for s in sources if "depth" not in s.name and "hand_image" != s.name
+        ]
+        for source in sources:
+            bosdyn_image, cv2_image = self.spot.get_image_RGB(view=source.name)
+            detection_candidates.append(
+                DetectionCandidate(bosdyn_image, cv2_image, None)
+            )
+        return detection_candidates
 
     def _look_for_object(self, semantic_class, debug):
         sources = self.spot.image_client.list_image_sources()
diff --git a/spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py b/spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py
index c2fe588..de4ee8c 100755
--- a/spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py
+++ b/spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py
@@ -2,7 +2,6 @@
 import threading
 import time
 
-import cv2
 import numpy as np
 import rclpy
 import rclpy.time
@@ -12,6 +11,10 @@
 from cv_bridge import CvBridge
 from heracles_ros_interfaces.srv import UpdateHoldingState
 from nav_msgs.msg import Path
+from nlu_interface_rviz.msg import (
+    ManipulationApprovalRequest,
+    ManipulationApprovalResponse,
+)
 from rclpy.callback_groups import MutuallyExclusiveCallbackGroup
 from rclpy.executors import MultiThreadedExecutor
 from rclpy.node import Node
@@ -24,7 +27,7 @@
 from spot_executor.fake_spot import FakeSpot
 from spot_executor.spot import Spot
 from spot_skills.detection_utils import YOLODetector
-from std_msgs.msg import Bool, String
+from std_msgs.msg import String
 from visualization_msgs.msg import Marker, MarkerArray
 
 from robot_executor_interface.mid_level_planner import (
@@ -36,8 +39,6 @@
 from spot_tools_ros.occupancy_grid_ros_updater import OccupancyGridROSUpdater
 from spot_tools_ros.utils import get_tf_pose, waypoints_to_path
 
-from nlu_interface_rviz.msg import ManipulationApprovalRequest, ManipulationApprovalResponse
-
 
 def load_inverse_semantic_id_map_from_label_space(fn):
     with open(fn, "r") as fo:
@@ -79,10 +80,10 @@ def build_markers(pts, namespaces, frames, colors):
 class RosFeedbackCollector:
     def __init__(self, odom_frame: str, output_dir: str):
         self.pick_confirmation_event = threading.Event()
-        #self.pick_confirmation_response = False
+        # self.pick_confirmation_response = False
 
         self.pick_confirmation_approved = False
-        self.pick_confirmation_xy = [0,0]
+        self.pick_confirmation_xy = [0, 0]
         self.pick_confirmation_image_index = 0
 
         self.break_out_of_waiting_loop = False
@@ -96,9 +97,13 @@ def bounding_box_detection_feedback(
         bridge = CvBridge()
 
         request_msg = ManipulationApprovalRequest()
-        request_msg.images = [bridge.cv2_to_imgmsg(img, encoding="passthrough") for img in detection_imgs]
+        request_msg.images = [
+            bridge.cv2_to_imgmsg(img, encoding="passthrough") for img in detection_imgs
+        ]
         request_msg.has_detection = detection_index is not None
-        request_msg.detection_image_index = detection_index if detection_index is not None else 0
+        request_msg.detection_image_index = (
+            detection_index if detection_index is not None else 0
+        )
         request_msg.image_x = centroid_x if centroid_x is not None else 0
         request_msg.image_y = centroid_y if centroid_y is not None else 0
         self.detection_img_pub.publish(request_msg)
@@ -117,11 +122,15 @@ def bounding_box_detection_feedback(
             self.logger.info("ROBOT WAS PREEMPTED")
             self.pick_confirmation_approved = False
         else:
-            self.logger.info(f"Pick Confirmation Response Received: approved ({self.pick_confirmation_approved}), xy ({self.pick_confirmation_xy}), image_index ({self.pick_confirmation_image_index})")
+            self.logger.info(
+                f"Pick Confirmation Response Received: approved ({self.pick_confirmation_approved}), xy ({self.pick_confirmation_xy}), image_index ({self.pick_confirmation_image_index})"
+            )
 
-        return (self.pick_confirmation_approved,
-                self.pick_confirmation_xy,
-                self.pick_confirmation_image_index)
+        return (
+            self.pick_confirmation_approved,
+            self.pick_confirmation_xy,
+            self.pick_confirmation_image_index,
+        )
 
     def pick_image_feedback(self, semantic_image, mask_image):
         bridge = CvBridge()
@@ -210,17 +219,19 @@ def register_publishers(self, node):
         )
 
         self.detection_img_pub = node.create_publisher(
-            ManipulationApprovalRequest, "~/manipulation_request", qos_profile=latching_qos
+            ManipulationApprovalRequest,
+            "~/manipulation_request",
+            qos_profile=latching_qos,
         )
 
         self.lease_takeover_publisher = node.create_publisher(String, "~/takeover", 10)
 
-        #node.create_subscription(
+        # node.create_subscription(
         #    Bool,
         #    "~/pick_confirmation",
         #    self.pick_confirmation_callback,
         #    10,
-        #)
+        # )
 
         node.create_subscription(
             ManipulationApprovalResponse,
@@ -258,14 +269,14 @@ def set_robot_holding_state(self, is_holding: bool, object_id: str, timeout=5):
         return future.result().success
 
     def pick_confirmation_callback(self, msg):
-        #if msg.data:
+        # if msg.data:
         #    self.logger.info("Detection is valid. Continuing pick action!")
         #    self.pick_confirmation_response = True
-        #else:
+        # else:
         #    self.logger.warn("Detection is invalid. Discontinuing pick action.")
         #    self.pick_confirmation_response = False
 
-        #self.pick_confirmation_event.set()
+        # self.pick_confirmation_event.set()
 
         # If not approved, discontinue
         # If approved, check whether the detection is overwritten
@@ -273,11 +284,11 @@ def pick_confirmation_callback(self, msg):
             self.logger.warn("Detection is invalid. Discontinuing pick action.")
             self.pick_confirmation_approved = False
         else:
-          self.pick_confirmation_approved = True
-          self.pick_confirmation_image_index = msg.image_index
-          self.pick_confirmation_xy[0] = msg.image_x
-          self.pick_confirmation_xy[1] = msg.image_y
-          self.logger.warn("Detection is valid. Continuing pick action!")
+            self.pick_confirmation_approved = True
+            self.pick_confirmation_image_index = msg.image_index
+            self.pick_confirmation_xy[0] = msg.image_x
+            self.pick_confirmation_xy[1] = msg.image_y
+            self.logger.warn("Detection is valid. Continuing pick action!")
         self.pick_confirmation_event.set()
 
     def log_lease_takeover(self, event: str):

From e480aaff5277122df091beef90770ab170b2183c Mon Sep 17 00:00:00 2001
From: Jake Arkin <jarkin@mit.edu>
Date: Thu, 2 Apr 2026 11:53:25 -0400
Subject: [PATCH 8/8] Remove code comment block

---
 spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py b/spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py
index de4ee8c..36df3d8 100755
--- a/spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py
+++ b/spot_tools_ros/src/spot_tools_ros/spot_executor_ros.py
@@ -226,13 +226,6 @@ def register_publishers(self, node):
 
         self.lease_takeover_publisher = node.create_publisher(String, "~/takeover", 10)
 
-        # node.create_subscription(
-        #    Bool,
-        #    "~/pick_confirmation",
-        #    self.pick_confirmation_callback,
-        #    10,
-        # )
-
         node.create_subscription(
             ManipulationApprovalResponse,
             "~/pick_confirmation",